agent-os-kernel 1.1.0__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agent_os/__init__.py +66 -4
- agent_os/agents_compat.py +286 -0
- agent_os/base_agent.py +308 -0
- agent_os/cli.py +1079 -19
- agent_os/integrations/__init__.py +37 -2
- agent_os/integrations/openai_adapter.py +502 -0
- agent_os/integrations/semantic_kernel_adapter.py +569 -0
- agent_os/stateless.py +349 -0
- agent_os_kernel-1.2.0.dist-info/METADATA +676 -0
- agent_os_kernel-1.2.0.dist-info/RECORD +1053 -0
- {agent_os_kernel-1.1.0.dist-info → agent_os_kernel-1.2.0.dist-info}/entry_points.txt +0 -1
- modules/amb/.github/workflows/ci.yml +102 -0
- modules/amb/.github/workflows/publish.yml +146 -0
- modules/amb/.gitignore +134 -0
- modules/amb/CHANGELOG.md +118 -0
- modules/amb/CONTRIBUTING.md +141 -0
- modules/amb/LICENSE +21 -0
- modules/amb/README.md +188 -0
- modules/amb/amb_core/__init__.py +175 -0
- modules/amb/amb_core/adapters/__init__.py +55 -0
- modules/amb/amb_core/adapters/aws_sqs_broker.py +374 -0
- modules/amb/amb_core/adapters/azure_servicebus_broker.py +338 -0
- modules/amb/amb_core/adapters/kafka_broker.py +258 -0
- modules/amb/amb_core/adapters/nats_broker.py +283 -0
- modules/amb/amb_core/adapters/rabbitmq_broker.py +233 -0
- modules/amb/amb_core/adapters/redis_broker.py +260 -0
- modules/amb/amb_core/broker.py +143 -0
- modules/amb/amb_core/bus.py +479 -0
- modules/amb/amb_core/cloudevents.py +507 -0
- modules/amb/amb_core/dlq.py +343 -0
- modules/amb/amb_core/hf_utils.py +534 -0
- modules/amb/amb_core/memory_broker.py +408 -0
- modules/amb/amb_core/models.py +139 -0
- modules/amb/amb_core/persistence.py +527 -0
- modules/amb/amb_core/schema.py +292 -0
- modules/amb/amb_core/tracing.py +356 -0
- modules/amb/examples/advanced_features.py +223 -0
- modules/amb/examples/backpressure_demo.py +225 -0
- modules/amb/examples/basic_usage.py +117 -0
- modules/amb/examples/tracing_demo.py +104 -0
- modules/amb/experiments/README.md +52 -0
- modules/amb/experiments/reproduce_results.py +467 -0
- modules/amb/experiments/results.json +324 -0
- modules/amb/paper/README.md +40 -0
- modules/amb/paper/paper.tex +365 -0
- modules/amb/paper/whitepaper.md +377 -0
- modules/amb/pyproject.toml +117 -0
- modules/amb/tests/__init__.py +1 -0
- modules/amb/tests/test_backpressure_priority.py +280 -0
- modules/amb/tests/test_bus.py +198 -0
- modules/amb/tests/test_cloudevents.py +443 -0
- modules/amb/tests/test_features.py +531 -0
- modules/amb/tests/test_models.py +74 -0
- modules/amb/tests/test_tracing.py +254 -0
- modules/atr/.github/workflows/ci.yml +101 -0
- modules/atr/.github/workflows/publish.yml +140 -0
- modules/atr/.gitignore +134 -0
- modules/atr/.pre-commit-config.yaml +37 -0
- modules/atr/CHANGELOG.md +39 -0
- modules/atr/CONTRIBUTING.md +96 -0
- modules/atr/IMPLEMENTATION_SUMMARY.md +143 -0
- modules/atr/README.md +180 -0
- modules/atr/atr/__init__.py +638 -0
- modules/atr/atr/access.py +346 -0
- modules/atr/atr/composition.py +643 -0
- modules/atr/atr/decorator.py +355 -0
- modules/atr/atr/executor.py +382 -0
- modules/atr/atr/health.py +555 -0
- modules/atr/atr/hf_utils.py +447 -0
- modules/atr/atr/injection.py +420 -0
- modules/atr/atr/metrics.py +438 -0
- modules/atr/atr/policies.py +401 -0
- modules/atr/atr/py.typed +2 -0
- modules/atr/atr/registry.py +450 -0
- modules/atr/atr/schema.py +478 -0
- modules/atr/atr/tools/safe/__init__.py +73 -0
- modules/atr/atr/tools/safe/calculator.py +380 -0
- modules/atr/atr/tools/safe/datetime_tool.py +441 -0
- modules/atr/atr/tools/safe/file_reader.py +400 -0
- modules/atr/atr/tools/safe/http_client.py +314 -0
- modules/atr/atr/tools/safe/json_parser.py +372 -0
- modules/atr/atr/tools/safe/text_tool.py +526 -0
- modules/atr/atr/tools/safe/toolkit.py +173 -0
- modules/atr/docs/PYPI_SETUP.md +113 -0
- modules/atr/examples/README.md +27 -0
- modules/atr/examples/demo.py +144 -0
- modules/atr/examples/sandbox_demo.py +218 -0
- modules/atr/experiments/README.md +69 -0
- modules/atr/experiments/reproduce_results.py +509 -0
- modules/atr/experiments/results/.gitkeep +0 -0
- modules/atr/experiments/results/results_20260123_140334.json +71 -0
- modules/atr/paper/README.md +36 -0
- modules/atr/paper/figures/.gitkeep +0 -0
- modules/atr/paper/references.bib +84 -0
- modules/atr/paper/structure.tex +293 -0
- modules/atr/paper/whitepaper.md +234 -0
- modules/atr/pyproject.toml +148 -0
- modules/atr/requirements.txt +1 -0
- modules/atr/setup.py +30 -0
- modules/atr/tests/__init__.py +1 -0
- modules/atr/tests/test_decorator.py +317 -0
- modules/atr/tests/test_executor.py +245 -0
- modules/atr/tests/test_integration_executor.py +184 -0
- modules/atr/tests/test_registry.py +312 -0
- modules/atr/tests/test_schema.py +182 -0
- modules/atr/tests/test_v2_features.py +708 -0
- modules/caas/.dockerignore +63 -0
- modules/caas/.github/ISSUE_TEMPLATE/bug_report.md +38 -0
- modules/caas/.github/ISSUE_TEMPLATE/custom.md +10 -0
- modules/caas/.github/ISSUE_TEMPLATE/feature_request.md +20 -0
- modules/caas/.github/workflows/ci.yml +100 -0
- modules/caas/.github/workflows/lint.yml +39 -0
- modules/caas/.github/workflows/publish-pypi.yml +124 -0
- modules/caas/.gitignore +73 -0
- modules/caas/.pre-commit-config.yaml +33 -0
- modules/caas/CHANGELOG.md +58 -0
- modules/caas/CONTRIBUTING.md +346 -0
- modules/caas/Dockerfile +41 -0
- modules/caas/LICENSE +21 -0
- modules/caas/MANIFEST.in +11 -0
- modules/caas/README.md +158 -0
- modules/caas/benchmarks/README.md +255 -0
- modules/caas/benchmarks/create_hf_dataset.py +502 -0
- modules/caas/benchmarks/data/sample_corpus/README.md +86 -0
- modules/caas/benchmarks/data/sample_corpus/auth_module.py +211 -0
- modules/caas/benchmarks/data/sample_corpus/contribution_guide.md +185 -0
- modules/caas/benchmarks/data/sample_corpus/remote_work_policy.html +57 -0
- modules/caas/benchmarks/hf_dataset/README.md +214 -0
- modules/caas/benchmarks/hf_dataset/caas_benchmark_corpus.py +73 -0
- modules/caas/benchmarks/hf_dataset/corpus_preview.json +193 -0
- modules/caas/benchmarks/results/README.md +66 -0
- modules/caas/benchmarks/results/evaluation_2026-01-20.json +121 -0
- modules/caas/benchmarks/run_evaluation.py +561 -0
- modules/caas/benchmarks/statistical_tests.py +289 -0
- modules/caas/benchmarks/verify_sample_corpus.py +83 -0
- modules/caas/docker-compose.yml +38 -0
- modules/caas/docs/CONTEXT_TRIAD.md +462 -0
- modules/caas/docs/CONTRIBUTING.md +346 -0
- modules/caas/docs/ETHICS_AND_LIMITATIONS.md +336 -0
- modules/caas/docs/HEURISTIC_ROUTER.md +442 -0
- modules/caas/docs/IMPLEMENTATION_SUMMARY.md +363 -0
- modules/caas/docs/IMPLEMENTATION_SUMMARY_CONTEXT_TRIAD.md +277 -0
- modules/caas/docs/IMPLEMENTATION_SUMMARY_HEURISTIC_ROUTER.md +231 -0
- modules/caas/docs/IMPLEMENTATION_SUMMARY_METADATA_INJECTION.md +258 -0
- modules/caas/docs/IMPLEMENTATION_SUMMARY_PRAGMATIC_TRUTH.md +212 -0
- modules/caas/docs/IMPLEMENTATION_SUMMARY_TRUST_GATEWAY.md +319 -0
- modules/caas/docs/LAYER_1_PRIMITIVE.md +202 -0
- modules/caas/docs/METADATA_INJECTION.md +404 -0
- modules/caas/docs/PRAGMATIC_TRUTH.md +431 -0
- modules/caas/docs/RELATED_WORK.md +312 -0
- modules/caas/docs/RELEASE_CHECKLIST.md +219 -0
- modules/caas/docs/RELEASE_GUIDE.md +285 -0
- modules/caas/docs/REPRODUCIBILITY.md +386 -0
- modules/caas/docs/SLIDING_WINDOW.md +387 -0
- modules/caas/docs/STRUCTURE_AWARE_INDEXING.md +158 -0
- modules/caas/docs/TESTING.md +259 -0
- modules/caas/docs/THREAT_MODEL.md +247 -0
- modules/caas/docs/TRUST_GATEWAY.md +575 -0
- modules/caas/docs/VFS.md +298 -0
- modules/caas/examples/agents/enterprise_security_agent.py +414 -0
- modules/caas/examples/agents/intelligent_document_analyzer.py +380 -0
- modules/caas/examples/demos/demo.py +309 -0
- modules/caas/examples/demos/demo_context_triad.py +225 -0
- modules/caas/examples/demos/demo_conversation_manager.py +285 -0
- modules/caas/examples/demos/demo_heuristic_router.py +133 -0
- modules/caas/examples/demos/demo_metadata_injection.py +198 -0
- modules/caas/examples/demos/demo_pragmatic_truth.py +303 -0
- modules/caas/examples/demos/demo_structure_aware.py +140 -0
- modules/caas/examples/demos/demo_time_decay.py +247 -0
- modules/caas/examples/demos/demo_trust_gateway.py +383 -0
- modules/caas/examples/multi_agent/README.md +159 -0
- modules/caas/examples/multi_agent/research_team.py +369 -0
- modules/caas/examples/multi_agent/vfs_collaboration.py +393 -0
- modules/caas/examples/usage/auth_module.py +142 -0
- modules/caas/examples/usage/usage_example.py +173 -0
- modules/caas/experiments/README.md +42 -0
- modules/caas/experiments/reproduce_results.py +462 -0
- modules/caas/paper/ARXIV_METADATA.md +145 -0
- modules/caas/paper/ARXIV_README.md +47 -0
- modules/caas/paper/CHECKLIST.md +103 -0
- modules/caas/paper/GITHUB_RELEASE_NOTES.md +105 -0
- modules/caas/paper/README.md +71 -0
- modules/caas/paper/abstract.md +24 -0
- modules/caas/paper/arxiv_submission.tar +0 -0
- modules/caas/paper/arxiv_submission.zip +0 -0
- modules/caas/paper/build_pdf.py +355 -0
- modules/caas/paper/experiments.md +149 -0
- modules/caas/paper/figures/.gitkeep +0 -0
- modules/caas/paper/figures/README.md +237 -0
- modules/caas/paper/figures/fig1_system_architecture.png +0 -0
- modules/caas/paper/figures/fig1_system_architecture.svg +198 -0
- modules/caas/paper/figures/fig2_context_triad.png +0 -0
- modules/caas/paper/figures/fig2_context_triad.svg +105 -0
- modules/caas/paper/figures/fig3_ablation_results.png +0 -0
- modules/caas/paper/figures/fig3_ablation_results.svg +113 -0
- modules/caas/paper/figures/fig4_routing_latency.png +0 -0
- modules/caas/paper/figures/fig4_routing_latency.svg +97 -0
- modules/caas/paper/intro.md +103 -0
- modules/caas/paper/latex/figures/fig1_system_architecture.png +0 -0
- modules/caas/paper/latex/figures/fig2_context_triad.png +0 -0
- modules/caas/paper/latex/figures/fig3_ablation_results.png +0 -0
- modules/caas/paper/latex/figures/fig4_routing_latency.png +0 -0
- modules/caas/paper/latex/main.tex +468 -0
- modules/caas/paper/latex/references.bib +140 -0
- modules/caas/paper/method.md +350 -0
- modules/caas/paper/outline.md +123 -0
- modules/caas/paper/related_work.md +101 -0
- modules/caas/paper/tables/.gitkeep +0 -0
- modules/caas/paper/tables/results_tables.md +50 -0
- modules/caas/pyproject.toml +172 -0
- modules/caas/requirements.txt +11 -0
- modules/caas/src/caas/__init__.py +232 -0
- modules/caas/src/caas/api/__init__.py +7 -0
- modules/caas/src/caas/api/server.py +1326 -0
- modules/caas/src/caas/caching.py +832 -0
- modules/caas/src/caas/cli.py +208 -0
- modules/caas/src/caas/conversation.py +221 -0
- modules/caas/src/caas/decay.py +118 -0
- modules/caas/src/caas/detection/__init__.py +7 -0
- modules/caas/src/caas/detection/detector.py +236 -0
- modules/caas/src/caas/enrichment.py +127 -0
- modules/caas/src/caas/gateway/__init__.py +24 -0
- modules/caas/src/caas/gateway/trust_gateway.py +471 -0
- modules/caas/src/caas/hf_utils.py +477 -0
- modules/caas/src/caas/ingestion/__init__.py +21 -0
- modules/caas/src/caas/ingestion/processors.py +251 -0
- modules/caas/src/caas/ingestion/structure_parser.py +185 -0
- modules/caas/src/caas/models.py +354 -0
- modules/caas/src/caas/pragmatic_truth.py +441 -0
- modules/caas/src/caas/routing/__init__.py +8 -0
- modules/caas/src/caas/routing/heuristic_router.py +242 -0
- modules/caas/src/caas/storage/__init__.py +7 -0
- modules/caas/src/caas/storage/store.py +450 -0
- modules/caas/src/caas/triad.py +472 -0
- modules/caas/src/caas/tuning/__init__.py +7 -0
- modules/caas/src/caas/tuning/tuner.py +322 -0
- modules/caas/src/caas/vfs/__init__.py +12 -0
- modules/caas/src/caas/vfs/filesystem.py +450 -0
- modules/caas/tests/__init__.py +3 -0
- modules/caas/tests/conftest.py +8 -0
- modules/caas/tests/test_caching.py +628 -0
- modules/caas/tests/test_context_triad.py +385 -0
- modules/caas/tests/test_conversation_manager.py +289 -0
- modules/caas/tests/test_functionality.py +215 -0
- modules/caas/tests/test_heuristic_router.py +370 -0
- modules/caas/tests/test_metadata_injection.py +328 -0
- modules/caas/tests/test_pragmatic_truth.py +322 -0
- modules/caas/tests/test_structure_aware_indexing.py +283 -0
- modules/caas/tests/test_time_decay.py +268 -0
- modules/caas/tests/test_trust_gateway.py +445 -0
- modules/caas/tests/test_vfs.py +298 -0
- modules/cmvk/.github/FUNDING.yml +9 -0
- modules/cmvk/.github/dependabot.yml +54 -0
- modules/cmvk/.github/workflows/ci.yml +205 -0
- modules/cmvk/.github/workflows/publish.yml +143 -0
- modules/cmvk/.gitignore +147 -0
- modules/cmvk/.pre-commit-config.yaml +58 -0
- modules/cmvk/CHANGELOG.md +146 -0
- modules/cmvk/CITATION.cff +48 -0
- modules/cmvk/CONTRIBUTING.md +229 -0
- modules/cmvk/Dockerfile +87 -0
- modules/cmvk/HF_MODEL_CARD.md +185 -0
- modules/cmvk/LICENSE +21 -0
- modules/cmvk/README.md +149 -0
- modules/cmvk/SECURITY.md +114 -0
- modules/cmvk/config/prompts/generator_v1.txt +23 -0
- modules/cmvk/config/prompts/verifier_hostile.txt +32 -0
- modules/cmvk/config/settings.yaml +40 -0
- modules/cmvk/coverage_html/.gitignore +2 -0
- modules/cmvk/coverage_html/class_index.html +658 -0
- modules/cmvk/coverage_html/coverage_html_cb_188fc9a4.js +735 -0
- modules/cmvk/coverage_html/favicon_32_cb_c827f16f.png +0 -0
- modules/cmvk/coverage_html/function_index.html +1978 -0
- modules/cmvk/coverage_html/index.html +255 -0
- modules/cmvk/coverage_html/keybd_closed_cb_900cfef5.png +0 -0
- modules/cmvk/coverage_html/status.json +1 -0
- modules/cmvk/coverage_html/style_cb_5c747636.css +389 -0
- modules/cmvk/coverage_html/z_2c49bd2ed3e01e38___init___py.html +315 -0
- modules/cmvk/coverage_html/z_2c49bd2ed3e01e38_audit_py.html +499 -0
- modules/cmvk/coverage_html/z_2c49bd2ed3e01e38_benchmarks_py.html +575 -0
- modules/cmvk/coverage_html/z_2c49bd2ed3e01e38_constitutional_py.html +1001 -0
- modules/cmvk/coverage_html/z_2c49bd2ed3e01e38_hf_utils_py.html +398 -0
- modules/cmvk/coverage_html/z_2c49bd2ed3e01e38_metrics_py.html +570 -0
- modules/cmvk/coverage_html/z_2c49bd2ed3e01e38_profiles_py.html +397 -0
- modules/cmvk/coverage_html/z_2c49bd2ed3e01e38_types_py.html +109 -0
- modules/cmvk/coverage_html/z_2c49bd2ed3e01e38_verification_py.html +1053 -0
- modules/cmvk/docs/DIAGRAMS.md +325 -0
- modules/cmvk/docs/architecture.md +345 -0
- modules/cmvk/docs/features.md +308 -0
- modules/cmvk/docs/getting_started.md +279 -0
- modules/cmvk/docs/innovation_layer.md +377 -0
- modules/cmvk/docs/safety.md +281 -0
- modules/cmvk/docs/traceability.md +150 -0
- modules/cmvk/examples/basic_example.py +62 -0
- modules/cmvk/examples/demo_complete_pipeline.py +209 -0
- modules/cmvk/examples/demo_innovation_layer.py +197 -0
- modules/cmvk/examples/example.py +112 -0
- modules/cmvk/examples/model_diversity_comparison.py +110 -0
- modules/cmvk/examples/real_api_integration.py +121 -0
- modules/cmvk/examples/test_full_pipeline.py +303 -0
- modules/cmvk/experiments/FEATURE_2_LATERAL_THINKING.md +187 -0
- modules/cmvk/experiments/README.md +216 -0
- modules/cmvk/experiments/ablation_runner.py +666 -0
- modules/cmvk/experiments/baseline_runner.py +158 -0
- modules/cmvk/experiments/blind_spot_benchmark.py +364 -0
- modules/cmvk/experiments/datasets/README.md +85 -0
- modules/cmvk/experiments/datasets/humaneval_50.json +352 -0
- modules/cmvk/experiments/datasets/humaneval_full.json +1150 -0
- modules/cmvk/experiments/datasets/humaneval_sample.json +32 -0
- modules/cmvk/experiments/datasets/sabotage.json +262 -0
- modules/cmvk/experiments/datasets/sample.json +40 -0
- modules/cmvk/experiments/demo_with_traces.py +110 -0
- modules/cmvk/experiments/efficiency_curve.py +259 -0
- modules/cmvk/experiments/experiment_runner.py +243 -0
- modules/cmvk/experiments/paper_data_generator.py +183 -0
- modules/cmvk/experiments/reproduce_results.py +407 -0
- modules/cmvk/experiments/reproducible_runner.py +352 -0
- modules/cmvk/experiments/sabotage_stress_test.py +311 -0
- modules/cmvk/experiments/test_lateral_thinking.py +116 -0
- modules/cmvk/experiments/test_prosecutor.py +41 -0
- modules/cmvk/experiments/visualize_results.py +735 -0
- modules/cmvk/logs/traces/demo_HumanEval_0_20260121-204900.json +36 -0
- modules/cmvk/notebooks/analysis.ipynb +124 -0
- modules/cmvk/paper/PAPER.md +561 -0
- modules/cmvk/paper/arxiv_checklist.md +230 -0
- modules/cmvk/paper/cmvk_neurips.aux +77 -0
- modules/cmvk/paper/cmvk_neurips.bbl +81 -0
- modules/cmvk/paper/cmvk_neurips.blg +48 -0
- modules/cmvk/paper/cmvk_neurips.out +16 -0
- modules/cmvk/paper/cmvk_neurips.pdf +0 -0
- modules/cmvk/paper/cmvk_neurips.tex +309 -0
- modules/cmvk/paper/figures/ablation.png +0 -0
- modules/cmvk/paper/figures/ablation.svg +39 -0
- modules/cmvk/paper/figures/architecture.png +0 -0
- modules/cmvk/paper/figures/architecture.svg +115 -0
- modules/cmvk/paper/figures/results_bar.png +0 -0
- modules/cmvk/paper/figures/results_bar.svg +70 -0
- modules/cmvk/paper/generate_figures.py +383 -0
- modules/cmvk/paper/neurips_2024.sty +101 -0
- modules/cmvk/paper/references.bib +98 -0
- modules/cmvk/paper/structure.tex +200 -0
- modules/cmvk/pyproject.toml +189 -0
- modules/cmvk/requirements-dev.txt +19 -0
- modules/cmvk/requirements.txt +14 -0
- modules/cmvk/src/cmvk/__init__.py +216 -0
- modules/cmvk/src/cmvk/audit.py +400 -0
- modules/cmvk/src/cmvk/benchmarks.py +476 -0
- modules/cmvk/src/cmvk/constitutional.py +902 -0
- modules/cmvk/src/cmvk/hf_utils.py +299 -0
- modules/cmvk/src/cmvk/metrics.py +471 -0
- modules/cmvk/src/cmvk/profiles.py +298 -0
- modules/cmvk/src/cmvk/py.typed +0 -0
- modules/cmvk/src/cmvk/types.py +10 -0
- modules/cmvk/src/cmvk/verification.py +954 -0
- modules/cmvk/src/cross_model_verification_kernel/__init__.py +91 -0
- modules/cmvk/src/cross_model_verification_kernel/__main__.py +10 -0
- modules/cmvk/src/cross_model_verification_kernel/agents/__init__.py +16 -0
- modules/cmvk/src/cross_model_verification_kernel/agents/base_agent.py +142 -0
- modules/cmvk/src/cross_model_verification_kernel/agents/generator_openai.py +223 -0
- modules/cmvk/src/cross_model_verification_kernel/agents/verifier_anthropic.py +448 -0
- modules/cmvk/src/cross_model_verification_kernel/agents/verifier_gemini.py +481 -0
- modules/cmvk/src/cross_model_verification_kernel/cli.py +570 -0
- modules/cmvk/src/cross_model_verification_kernel/core/__init__.py +26 -0
- modules/cmvk/src/cross_model_verification_kernel/core/graph_memory.py +308 -0
- modules/cmvk/src/cross_model_verification_kernel/core/kernel.py +413 -0
- modules/cmvk/src/cross_model_verification_kernel/core/trace_logger.py +75 -0
- modules/cmvk/src/cross_model_verification_kernel/core/types.py +121 -0
- modules/cmvk/src/cross_model_verification_kernel/datasets/__init__.py +20 -0
- modules/cmvk/src/cross_model_verification_kernel/datasets/humaneval_loader.py +271 -0
- modules/cmvk/src/cross_model_verification_kernel/generator.py +118 -0
- modules/cmvk/src/cross_model_verification_kernel/kernel.py +292 -0
- modules/cmvk/src/cross_model_verification_kernel/models.py +111 -0
- modules/cmvk/src/cross_model_verification_kernel/py.typed +1 -0
- modules/cmvk/src/cross_model_verification_kernel/simple_kernel.py +185 -0
- modules/cmvk/src/cross_model_verification_kernel/tools/__init__.py +94 -0
- modules/cmvk/src/cross_model_verification_kernel/tools/huggingface_upload.py +394 -0
- modules/cmvk/src/cross_model_verification_kernel/tools/sandbox.py +159 -0
- modules/cmvk/src/cross_model_verification_kernel/tools/statistics.py +468 -0
- modules/cmvk/src/cross_model_verification_kernel/tools/visualizer.py +312 -0
- modules/cmvk/src/cross_model_verification_kernel/tools/web_search.py +86 -0
- modules/cmvk/src/cross_model_verification_kernel/verifier.py +257 -0
- modules/cmvk/tests/__init__.py +3 -0
- modules/cmvk/tests/conftest.py +61 -0
- modules/cmvk/tests/integration/__init__.py +1 -0
- modules/cmvk/tests/integration/test_anthropic_verifier.py +269 -0
- modules/cmvk/tests/integration/test_integration.py +53 -0
- modules/cmvk/tests/integration/test_lateral_thinking_integration.py +199 -0
- modules/cmvk/tests/integration/test_lateral_thinking_witness.py +208 -0
- modules/cmvk/tests/integration/test_prosecutor_mode.py +131 -0
- modules/cmvk/tests/test_constitutional.py +611 -0
- modules/cmvk/tests/test_enhanced_features.py +603 -0
- modules/cmvk/tests/test_verification.py +255 -0
- modules/cmvk/tests/unit/__init__.py +1 -0
- modules/cmvk/tests/unit/test_agents.py +64 -0
- modules/cmvk/tests/unit/test_cli.py +224 -0
- modules/cmvk/tests/unit/test_core.py +126 -0
- modules/cmvk/tests/unit/test_humaneval_loader.py +197 -0
- modules/cmvk/tests/unit/test_kernel.py +255 -0
- modules/cmvk/tests/unit/test_reproducibility.py +160 -0
- modules/cmvk/tests/unit/test_trace_logger.py +115 -0
- modules/cmvk/tests/unit/test_visualizer.py +218 -0
- modules/control-plane/.github/ISSUE_TEMPLATE/bug_report.yml +82 -0
- modules/control-plane/.github/ISSUE_TEMPLATE/config.yml +11 -0
- modules/control-plane/.github/ISSUE_TEMPLATE/feature_request.yml +104 -0
- modules/control-plane/.github/ISSUE_TEMPLATE/question.yml +70 -0
- modules/control-plane/.github/ISSUE_TEMPLATE/security_vulnerability.yml +84 -0
- modules/control-plane/.github/discussions.yml +73 -0
- modules/control-plane/.github/pull_request_template.md +82 -0
- modules/control-plane/.github/workflows/publish.yml +146 -0
- modules/control-plane/.github/workflows/release.yml +39 -0
- modules/control-plane/.github/workflows/tests.yml +58 -0
- modules/control-plane/.gitignore +55 -0
- modules/control-plane/CHANGELOG.md +203 -0
- modules/control-plane/CONTRIBUTING.md +311 -0
- modules/control-plane/CONTRIBUTORS.md +88 -0
- modules/control-plane/Dockerfile +82 -0
- modules/control-plane/LICENSE +21 -0
- modules/control-plane/MANIFEST.in +17 -0
- modules/control-plane/README.md +1264 -0
- modules/control-plane/ROADMAP.md +228 -0
- modules/control-plane/SECURITY.md +210 -0
- modules/control-plane/SUPPORT.md +106 -0
- modules/control-plane/acp-cli.py +212 -0
- modules/control-plane/benchmark/README.md +257 -0
- modules/control-plane/benchmark/__init__.py +19 -0
- modules/control-plane/benchmark/red_team_dataset.py +517 -0
- modules/control-plane/benchmark.py +563 -0
- modules/control-plane/build_and_publish.sh +130 -0
- modules/control-plane/docker-compose.yml +74 -0
- modules/control-plane/docs/ABLATION_STUDIES.md +528 -0
- modules/control-plane/docs/ADAPTER_GUIDE.md +544 -0
- modules/control-plane/docs/ADVANCED_FEATURES.md +543 -0
- modules/control-plane/docs/AIOS_COMPARISON.md +296 -0
- modules/control-plane/docs/BIBLIOGRAPHY.md +367 -0
- modules/control-plane/docs/CASE_STUDIES.md +645 -0
- modules/control-plane/docs/DOCKER_DEPLOYMENT.md +184 -0
- modules/control-plane/docs/ECOSYSTEM_STATUS.md +98 -0
- modules/control-plane/docs/HF_MODEL_CARD.md +168 -0
- modules/control-plane/docs/KERNEL_V1_RELEASE.md +454 -0
- modules/control-plane/docs/LAYER3_FRAMEWORK.md +227 -0
- modules/control-plane/docs/LIMITATIONS.md +523 -0
- modules/control-plane/docs/PYPI_PUBLISHING.md +195 -0
- modules/control-plane/docs/README.md +58 -0
- modules/control-plane/docs/RELATED_WORK.md +319 -0
- modules/control-plane/docs/RELEASE_v1.1.0.md +252 -0
- modules/control-plane/docs/REPRODUCIBILITY.md +540 -0
- modules/control-plane/docs/RESEARCH_FOUNDATION.md +197 -0
- modules/control-plane/docs/api/CORE.md +270 -0
- modules/control-plane/docs/architecture/architecture.md +120 -0
- modules/control-plane/docs/community/ANNOUNCEMENT_TEMPLATES.md +52 -0
- modules/control-plane/docs/guides/IMPLEMENTATION.md +225 -0
- modules/control-plane/docs/guides/PHILOSOPHY.md +354 -0
- modules/control-plane/docs/guides/QUICKSTART.md +217 -0
- modules/control-plane/examples/README.md +138 -0
- modules/control-plane/examples/a2a_demo.py +410 -0
- modules/control-plane/examples/adapter_demo.py +347 -0
- modules/control-plane/examples/advanced_features.py +403 -0
- modules/control-plane/examples/basic_usage.py +261 -0
- modules/control-plane/examples/benchmark_demo.py +186 -0
- modules/control-plane/examples/compliance_demo.py +333 -0
- modules/control-plane/examples/configuration.py +265 -0
- modules/control-plane/examples/getting_started.py +178 -0
- modules/control-plane/examples/hibernation_and_time_travel_demo.py +406 -0
- modules/control-plane/examples/interactive_tutorial.ipynb +497 -0
- modules/control-plane/examples/kernel_interceptor_demo.py +202 -0
- modules/control-plane/examples/kernel_v1_demo.py +273 -0
- modules/control-plane/examples/langchain_demo.py +281 -0
- modules/control-plane/examples/lifecycle_demo.py +724 -0
- modules/control-plane/examples/mcp_demo.py +378 -0
- modules/control-plane/examples/ml_safety_demo.py +157 -0
- modules/control-plane/examples/multimodal_demo.py +347 -0
- modules/control-plane/examples/observability_demo.py +370 -0
- modules/control-plane/examples/use_cases.py +336 -0
- modules/control-plane/experiments/long_horizon_purge.py +235 -0
- modules/control-plane/experiments/multi_agent_rag.py +165 -0
- modules/control-plane/experiments/reproduce_results.py +667 -0
- modules/control-plane/paper/ARXIV_SUBMISSION_INFO.txt +122 -0
- modules/control-plane/paper/ETHICS_STATEMENT.md +248 -0
- modules/control-plane/paper/PAPER_CHECKLIST.md +72 -0
- modules/control-plane/paper/Paper.pdf +0 -0
- modules/control-plane/paper/README.md +71 -0
- modules/control-plane/paper/appendix.md +152 -0
- modules/control-plane/paper/architecture.md +15 -0
- modules/control-plane/paper/arxiv/figures/ablation_chart.png +0 -0
- modules/control-plane/paper/arxiv/figures/architecture.png +0 -0
- modules/control-plane/paper/arxiv/figures/constraint_graphs.png +0 -0
- modules/control-plane/paper/arxiv/figures/results_chart.png +0 -0
- modules/control-plane/paper/arxiv/main.aux +97 -0
- modules/control-plane/paper/arxiv/main.bbl +112 -0
- modules/control-plane/paper/arxiv/main.blg +48 -0
- modules/control-plane/paper/arxiv/main.out +33 -0
- modules/control-plane/paper/arxiv/main.pdf +0 -0
- modules/control-plane/paper/arxiv/main.tex +479 -0
- modules/control-plane/paper/arxiv/references.bib +234 -0
- modules/control-plane/paper/arxiv_submission.tar +0 -0
- modules/control-plane/paper/arxiv_submission.zip +0 -0
- modules/control-plane/paper/build.sh +68 -0
- modules/control-plane/paper/figures/README.md +47 -0
- modules/control-plane/paper/figures/ablation_chart.pdf +0 -0
- modules/control-plane/paper/figures/ablation_chart.png +0 -0
- modules/control-plane/paper/figures/architecture.pdf +0 -0
- modules/control-plane/paper/figures/architecture.png +0 -0
- modules/control-plane/paper/figures/constraint_graphs.pdf +0 -0
- modules/control-plane/paper/figures/constraint_graphs.png +0 -0
- modules/control-plane/paper/figures/generate_figures.py +252 -0
- modules/control-plane/paper/figures/results_chart.pdf +0 -0
- modules/control-plane/paper/figures/results_chart.png +0 -0
- modules/control-plane/paper/main.md +273 -0
- modules/control-plane/paper/main.tex +214 -0
- modules/control-plane/paper/main_arxiv.aux +53 -0
- modules/control-plane/paper/main_arxiv.out +17 -0
- modules/control-plane/paper/main_arxiv.pdf +0 -0
- modules/control-plane/paper/main_arxiv.tex +264 -0
- modules/control-plane/paper/references.bib +234 -0
- modules/control-plane/pyproject.toml +124 -0
- modules/control-plane/reproducibility/ABLATIONS.md +136 -0
- modules/control-plane/reproducibility/README.md +288 -0
- modules/control-plane/reproducibility/commands.md +467 -0
- modules/control-plane/reproducibility/docker_config/Dockerfile +39 -0
- modules/control-plane/reproducibility/experiment_configs/purge_config.json +46 -0
- modules/control-plane/reproducibility/experiment_configs/rag_config.json +36 -0
- modules/control-plane/reproducibility/hardware_specs.md +317 -0
- modules/control-plane/reproducibility/requirements_frozen.txt +0 -0
- modules/control-plane/reproducibility/run_all_experiments.sh +45 -0
- modules/control-plane/reproducibility/seeds.json +106 -0
- modules/control-plane/scripts/prepare_pypi.py +46 -0
- modules/control-plane/scripts/prepare_release.py +176 -0
- modules/control-plane/scripts/upload_dataset_to_hf.py +316 -0
- modules/control-plane/setup.py +69 -0
- modules/control-plane/src/agent_control_plane/__init__.py +639 -0
- modules/control-plane/src/agent_control_plane/a2a_adapter.py +541 -0
- modules/control-plane/src/agent_control_plane/adapter.py +415 -0
- modules/control-plane/src/agent_control_plane/agent_hibernation.py +364 -0
- modules/control-plane/src/agent_control_plane/agent_kernel.py +464 -0
- modules/control-plane/src/agent_control_plane/compliance.py +718 -0
- modules/control-plane/src/agent_control_plane/constraint_graphs.py +475 -0
- modules/control-plane/src/agent_control_plane/control_plane.py +848 -0
- modules/control-plane/src/agent_control_plane/example_executors.py +193 -0
- modules/control-plane/src/agent_control_plane/execution_engine.py +229 -0
- modules/control-plane/src/agent_control_plane/flight_recorder.py +600 -0
- modules/control-plane/src/agent_control_plane/governance_layer.py +432 -0
- modules/control-plane/src/agent_control_plane/hf_utils.py +561 -0
- modules/control-plane/src/agent_control_plane/interfaces/__init__.py +53 -0
- modules/control-plane/src/agent_control_plane/interfaces/kernel_interface.py +359 -0
- modules/control-plane/src/agent_control_plane/interfaces/plugin_interface.py +495 -0
- modules/control-plane/src/agent_control_plane/interfaces/protocol_interfaces.py +385 -0
- modules/control-plane/src/agent_control_plane/kernel_space.py +707 -0
- modules/control-plane/src/agent_control_plane/langchain_adapter.py +422 -0
- modules/control-plane/src/agent_control_plane/lifecycle.py +3111 -0
- modules/control-plane/src/agent_control_plane/mcp_adapter.py +517 -0
- modules/control-plane/src/agent_control_plane/ml_safety.py +560 -0
- modules/control-plane/src/agent_control_plane/multimodal.py +724 -0
- modules/control-plane/src/agent_control_plane/mute_agent.py +419 -0
- modules/control-plane/src/agent_control_plane/observability.py +785 -0
- modules/control-plane/src/agent_control_plane/orchestrator.py +480 -0
- modules/control-plane/src/agent_control_plane/plugin_registry.py +748 -0
- modules/control-plane/src/agent_control_plane/policy_engine.py +525 -0
- modules/control-plane/src/agent_control_plane/shadow_mode.py +307 -0
- modules/control-plane/src/agent_control_plane/signals.py +491 -0
- modules/control-plane/src/agent_control_plane/supervisor_agents.py +427 -0
- modules/control-plane/src/agent_control_plane/time_travel_debugger.py +554 -0
- modules/control-plane/src/agent_control_plane/tool_registry.py +350 -0
- modules/control-plane/src/agent_control_plane/vfs.py +695 -0
- modules/control-plane/tests/README.md +33 -0
- modules/control-plane/tests/test_a2a_adapter.py +336 -0
- modules/control-plane/tests/test_adapter.py +422 -0
- modules/control-plane/tests/test_advanced_features.py +389 -0
- modules/control-plane/tests/test_benchmark.py +223 -0
- modules/control-plane/tests/test_compliance.py +214 -0
- modules/control-plane/tests/test_control_plane.py +295 -0
- modules/control-plane/tests/test_hibernation.py +274 -0
- modules/control-plane/tests/test_kernel_interception.py +284 -0
- modules/control-plane/tests/test_langchain_adapter.py +258 -0
- modules/control-plane/tests/test_lifecycle.py +1174 -0
- modules/control-plane/tests/test_mcp_adapter.py +293 -0
- modules/control-plane/tests/test_ml_safety.py +142 -0
- modules/control-plane/tests/test_multimodal.py +317 -0
- modules/control-plane/tests/test_new_features.py +435 -0
- modules/control-plane/tests/test_observability.py +338 -0
- modules/control-plane/tests/test_time_travel.py +387 -0
- modules/emk/.github/workflows/ci.yml +105 -0
- modules/emk/.github/workflows/publish.yml +144 -0
- modules/emk/.gitignore +74 -0
- modules/emk/CHANGELOG.md +41 -0
- modules/emk/CONTRIBUTING.md +295 -0
- modules/emk/IMPLEMENTATION.md +174 -0
- modules/emk/LICENSE +21 -0
- modules/emk/MANIFEST.in +8 -0
- modules/emk/README.md +135 -0
- modules/emk/RELEASE_NOTES.md +82 -0
- modules/emk/SECURITY.md +52 -0
- modules/emk/codecov.yml +39 -0
- modules/emk/docs/MEMORY_MANAGEMENT.md +285 -0
- modules/emk/emk/__init__.py +106 -0
- modules/emk/emk/hf_utils.py +419 -0
- modules/emk/emk/indexer.py +144 -0
- modules/emk/emk/py.typed +0 -0
- modules/emk/emk/schema.py +204 -0
- modules/emk/emk/sleep_cycle.py +345 -0
- modules/emk/emk/store.py +479 -0
- modules/emk/examples/basic_usage.py +123 -0
- modules/emk/examples/memory_features_demo.py +154 -0
- modules/emk/experiments/README.md +59 -0
- modules/emk/experiments/reproduce_results.py +461 -0
- modules/emk/experiments/results.json +61 -0
- modules/emk/paper/structure.tex +192 -0
- modules/emk/paper/whitepaper.md +273 -0
- modules/emk/pyproject.toml +91 -0
- modules/emk/setup.py +5 -0
- modules/emk/tests/test_file_adapter.py +195 -0
- modules/emk/tests/test_indexer.py +174 -0
- modules/emk/tests/test_init.py +55 -0
- modules/emk/tests/test_negative_memory.py +83 -0
- modules/emk/tests/test_schema.py +150 -0
- modules/emk/tests/test_semantic_rules.py +175 -0
- modules/emk/tests/test_sleep_cycle.py +335 -0
- modules/emk/tests/test_store_anti_patterns.py +239 -0
- modules/iatp/.github/workflows/docker-build.yml +124 -0
- modules/iatp/.github/workflows/publish.yml +174 -0
- modules/iatp/.github/workflows/python-package.yml +121 -0
- modules/iatp/.gitignore +67 -0
- modules/iatp/.pre-commit-config.yaml +64 -0
- modules/iatp/CHANGELOG.md +120 -0
- modules/iatp/Dockerfile +91 -0
- modules/iatp/IMPLEMENTATION_SUMMARY.md +218 -0
- modules/iatp/MANIFEST.in +9 -0
- modules/iatp/README.md +180 -0
- modules/iatp/docker/Dockerfile.agent +27 -0
- modules/iatp/docker/Dockerfile.sidecar-python +86 -0
- modules/iatp/docker/README.md +258 -0
- modules/iatp/docker-compose.yml +194 -0
- modules/iatp/docs/ARCHITECTURE.md +243 -0
- modules/iatp/docs/CLI_GUIDE.md +220 -0
- modules/iatp/docs/DEPLOYMENT.md +304 -0
- modules/iatp/examples/README.md +132 -0
- modules/iatp/examples/backend_agent.py +39 -0
- modules/iatp/examples/client.py +168 -0
- modules/iatp/examples/demo_attestation_reputation.py +274 -0
- modules/iatp/examples/demo_client.py +240 -0
- modules/iatp/examples/demo_rbac.py +143 -0
- modules/iatp/examples/integration_demo.py +245 -0
- modules/iatp/examples/manifests/coder_agent.json +20 -0
- modules/iatp/examples/manifests/reviewer_agent.json +19 -0
- modules/iatp/examples/manifests/secure_bank.json +14 -0
- modules/iatp/examples/manifests/standard_agent.json +14 -0
- modules/iatp/examples/manifests/untrusted_honeypot.json +14 -0
- modules/iatp/examples/run_secure_bank_sidecar.py +85 -0
- modules/iatp/examples/run_sidecar.py +105 -0
- modules/iatp/examples/run_untrusted_sidecar.py +77 -0
- modules/iatp/examples/secure_bank_agent.py +138 -0
- modules/iatp/examples/test_untrusted.py +82 -0
- modules/iatp/examples/untrusted_agent.py +119 -0
- modules/iatp/experiments/README.md +58 -0
- modules/iatp/experiments/cascading_hallucination/README.md +149 -0
- modules/iatp/experiments/cascading_hallucination/agent_a_user.py +41 -0
- modules/iatp/experiments/cascading_hallucination/agent_b_summarizer.py +54 -0
- modules/iatp/experiments/cascading_hallucination/agent_c_database.py +47 -0
- modules/iatp/experiments/cascading_hallucination/proof_of_concept.py +290 -0
- modules/iatp/experiments/cascading_hallucination/run_experiment.py +226 -0
- modules/iatp/experiments/cascading_hallucination/sidecar_c.py +61 -0
- modules/iatp/experiments/reproduce_results.py +574 -0
- modules/iatp/experiments/results.json +2336 -0
- modules/iatp/iatp/__init__.py +164 -0
- modules/iatp/iatp/attestation.py +401 -0
- modules/iatp/iatp/cli.py +253 -0
- modules/iatp/iatp/hf_utils.py +469 -0
- modules/iatp/iatp/ipc_pipes.py +578 -0
- modules/iatp/iatp/main.py +410 -0
- modules/iatp/iatp/models/__init__.py +445 -0
- modules/iatp/iatp/policy_engine.py +335 -0
- modules/iatp/iatp/py.typed +2 -0
- modules/iatp/iatp/recovery.py +319 -0
- modules/iatp/iatp/security/__init__.py +268 -0
- modules/iatp/iatp/sidecar/__init__.py +517 -0
- modules/iatp/iatp/telemetry/__init__.py +162 -0
- modules/iatp/iatp/tests/__init__.py +1 -0
- modules/iatp/iatp/tests/test_attestation.py +368 -0
- modules/iatp/iatp/tests/test_cli.py +129 -0
- modules/iatp/iatp/tests/test_models.py +128 -0
- modules/iatp/iatp/tests/test_policy_engine.py +345 -0
- modules/iatp/iatp/tests/test_recovery.py +279 -0
- modules/iatp/iatp/tests/test_security.py +220 -0
- modules/iatp/iatp/tests/test_sidecar.py +165 -0
- modules/iatp/iatp/tests/test_telemetry.py +173 -0
- modules/iatp/paper/BLOG.md +307 -0
- modules/iatp/paper/PAPER.md +236 -0
- modules/iatp/paper/RFC_SUBMISSION.md +299 -0
- modules/iatp/paper/whitepaper.md +369 -0
- modules/iatp/proto/README.md +200 -0
- modules/iatp/proto/generate_stubs.py +81 -0
- modules/iatp/proto/iatp.proto +552 -0
- modules/iatp/pyproject.toml +180 -0
- modules/iatp/requirements-dev.txt +2 -0
- modules/iatp/requirements.txt +6 -0
- modules/iatp/setup.py +60 -0
- modules/iatp/sidecar/README.md +487 -0
- modules/iatp/sidecar/go/Dockerfile +32 -0
- modules/iatp/sidecar/go/README.md +237 -0
- modules/iatp/sidecar/go/go.mod +8 -0
- modules/iatp/sidecar/go/main.go +488 -0
- modules/iatp/spec/001-handshake.md +436 -0
- modules/iatp/spec/002-reversibility.md +394 -0
- modules/iatp/spec/schema/capability_manifest.json +266 -0
- modules/iatp/test_integration.py +310 -0
- modules/mcp-kernel-server/README.md +261 -0
- modules/mcp-kernel-server/pyproject.toml +60 -0
- modules/mcp-kernel-server/src/mcp_kernel_server/__init__.py +26 -0
- modules/mcp-kernel-server/src/mcp_kernel_server/cli.py +229 -0
- modules/mcp-kernel-server/src/mcp_kernel_server/resources.py +215 -0
- modules/mcp-kernel-server/src/mcp_kernel_server/server.py +562 -0
- modules/mcp-kernel-server/src/mcp_kernel_server/tools.py +1172 -0
- modules/mute-agent/.github/workflows/safety_check.yml +45 -0
- modules/mute-agent/.gitignore +53 -0
- modules/mute-agent/ARCHITECTURE.md +531 -0
- modules/mute-agent/BENCHMARK_GUIDE.md +384 -0
- modules/mute-agent/COMPLETION_SUMMARY.md +293 -0
- modules/mute-agent/EXPERIMENT_SUMMARY.md +318 -0
- modules/mute-agent/IMPLEMENTATION_SUMMARY.md +212 -0
- modules/mute-agent/LICENSE +21 -0
- modules/mute-agent/PHASE3_SUMMARY.md +297 -0
- modules/mute-agent/README.md +360 -0
- modules/mute-agent/STEEL_MAN_RESULTS.md +353 -0
- modules/mute-agent/USAGE.md +505 -0
- modules/mute-agent/V2_IMPLEMENTATION_SUMMARY.md +253 -0
- modules/mute-agent/V2_STEEL_MAN_IMPLEMENTATION.md +274 -0
- modules/mute-agent/VERIFICATION_REPORT.md +435 -0
- modules/mute-agent/charts/cost_comparison.png +0 -0
- modules/mute-agent/charts/cost_vs_ambiguity.png +0 -0
- modules/mute-agent/charts/metrics_comparison.png +0 -0
- modules/mute-agent/charts/scenario_breakdown.png +0 -0
- modules/mute-agent/charts/trace_attack_blocked.html +140 -0
- modules/mute-agent/charts/trace_attack_blocked.png +0 -0
- modules/mute-agent/charts/trace_failure.html +140 -0
- modules/mute-agent/charts/trace_failure.png +0 -0
- modules/mute-agent/charts/trace_success.html +140 -0
- modules/mute-agent/charts/trace_success.png +0 -0
- modules/mute-agent/examples/__init__.py +1 -0
- modules/mute-agent/examples/advanced_example.py +384 -0
- modules/mute-agent/examples/graph_debugger_demo.py +241 -0
- modules/mute-agent/examples/listener_example.py +297 -0
- modules/mute-agent/examples/simple_example.py +242 -0
- modules/mute-agent/examples/steel_man_demo.py +297 -0
- modules/mute-agent/experiments/README.md +135 -0
- modules/mute-agent/experiments/__init__.py +3 -0
- modules/mute-agent/experiments/agent_comparison.csv +6 -0
- modules/mute-agent/experiments/agent_comparison_50runs.csv +6 -0
- modules/mute-agent/experiments/ambiguity_test.py +335 -0
- modules/mute-agent/experiments/ambiguity_test_results.csv +31 -0
- modules/mute-agent/experiments/ambiguity_test_results_50runs.csv +51 -0
- modules/mute-agent/experiments/baseline_agent.py +189 -0
- modules/mute-agent/experiments/benchmark.py +402 -0
- modules/mute-agent/experiments/demo.py +172 -0
- modules/mute-agent/experiments/generate_cost_curve.py +474 -0
- modules/mute-agent/experiments/jailbreak_test.py +137 -0
- modules/mute-agent/experiments/latent_state_scenario.py +361 -0
- modules/mute-agent/experiments/mute_agent_experiment.py +349 -0
- modules/mute-agent/experiments/run_extended_experiment.py +40 -0
- modules/mute-agent/experiments/run_v2_experiments.py +266 -0
- modules/mute-agent/experiments/run_v2_experiments_auto.py +247 -0
- modules/mute-agent/experiments/v2_scenarios/README.md +214 -0
- modules/mute-agent/experiments/v2_scenarios/__init__.py +4 -0
- modules/mute-agent/experiments/v2_scenarios/scenario_1_deep_dependency.py +325 -0
- modules/mute-agent/experiments/v2_scenarios/scenario_2_adversarial.py +328 -0
- modules/mute-agent/experiments/v2_scenarios/scenario_3_false_positive.py +303 -0
- modules/mute-agent/experiments/v2_scenarios/scenario_4_performance.py +319 -0
- modules/mute-agent/experiments/visualize.py +400 -0
- modules/mute-agent/mute_agent/__init__.py +66 -0
- modules/mute-agent/mute_agent/core/__init__.py +1 -0
- modules/mute-agent/mute_agent/core/execution_agent.py +164 -0
- modules/mute-agent/mute_agent/core/handshake_protocol.py +199 -0
- modules/mute-agent/mute_agent/core/reasoning_agent.py +236 -0
- modules/mute-agent/mute_agent/knowledge_graph/__init__.py +1 -0
- modules/mute-agent/mute_agent/knowledge_graph/graph_elements.py +63 -0
- modules/mute-agent/mute_agent/knowledge_graph/multidimensional_graph.py +168 -0
- modules/mute-agent/mute_agent/knowledge_graph/subgraph.py +222 -0
- modules/mute-agent/mute_agent/listener/__init__.py +41 -0
- modules/mute-agent/mute_agent/listener/adapters/__init__.py +29 -0
- modules/mute-agent/mute_agent/listener/adapters/base_adapter.py +187 -0
- modules/mute-agent/mute_agent/listener/adapters/caas_adapter.py +342 -0
- modules/mute-agent/mute_agent/listener/adapters/control_plane_adapter.py +434 -0
- modules/mute-agent/mute_agent/listener/adapters/iatp_adapter.py +330 -0
- modules/mute-agent/mute_agent/listener/adapters/scak_adapter.py +249 -0
- modules/mute-agent/mute_agent/listener/listener.py +608 -0
- modules/mute-agent/mute_agent/listener/state_observer.py +434 -0
- modules/mute-agent/mute_agent/listener/threshold_config.py +311 -0
- modules/mute-agent/mute_agent/super_system/__init__.py +1 -0
- modules/mute-agent/mute_agent/super_system/router.py +202 -0
- modules/mute-agent/mute_agent/visualization/__init__.py +8 -0
- modules/mute-agent/mute_agent/visualization/graph_debugger.py +495 -0
- modules/mute-agent/requirements-dev.txt +6 -0
- modules/mute-agent/requirements.txt +9 -0
- modules/mute-agent/setup.py +64 -0
- modules/mute-agent/src/__init__.py +0 -0
- modules/mute-agent/src/agents/__init__.py +0 -0
- modules/mute-agent/src/agents/baseline_agent.py +524 -0
- modules/mute-agent/src/agents/interactive_agent.py +113 -0
- modules/mute-agent/src/agents/mute_agent.py +622 -0
- modules/mute-agent/src/benchmarks/__init__.py +0 -0
- modules/mute-agent/src/benchmarks/evaluator.py +481 -0
- modules/mute-agent/src/benchmarks/scenarios.json +985 -0
- modules/mute-agent/src/core/__init__.py +0 -0
- modules/mute-agent/src/core/mock_state.py +320 -0
- modules/mute-agent/src/core/tools.py +441 -0
- modules/nexus/__init__.py +49 -0
- modules/nexus/arbiter.py +357 -0
- modules/nexus/client.py +464 -0
- modules/nexus/dmz.py +417 -0
- modules/nexus/escrow.py +428 -0
- modules/nexus/exceptions.py +284 -0
- modules/nexus/registry.py +391 -0
- modules/nexus/reputation.py +423 -0
- modules/nexus/schemas/__init__.py +49 -0
- modules/nexus/schemas/compliance.py +274 -0
- modules/nexus/schemas/escrow.py +249 -0
- modules/nexus/schemas/manifest.py +223 -0
- modules/nexus/schemas/receipt.py +206 -0
- modules/observability/README.md +192 -0
- modules/observability/alertmanager/alertmanager.yml +116 -0
- modules/observability/alerts/agent-os-alerts.yaml +197 -0
- modules/observability/docker-compose.yml +128 -0
- modules/observability/grafana/dashboards/agent-os-amb.json +448 -0
- modules/observability/grafana/dashboards/agent-os-cmvk.json +441 -0
- modules/observability/grafana/dashboards/agent-os-overview.json +268 -0
- modules/observability/grafana/dashboards/agent-os-performance.json +15 -0
- modules/observability/grafana/dashboards/agent-os-safety.json +50 -0
- modules/observability/grafana/provisioning/dashboards/dashboards.yml +15 -0
- modules/observability/grafana/provisioning/datasources/datasources.yml +33 -0
- modules/observability/otel/otel-collector-config.yml +61 -0
- modules/observability/prometheus/prometheus.yml +63 -0
- modules/observability/pyproject.toml +53 -0
- modules/observability/scripts/export_dashboards.py +55 -0
- modules/observability/src/agent_os_observability/__init__.py +25 -0
- modules/observability/src/agent_os_observability/dashboards.py +896 -0
- modules/observability/src/agent_os_observability/metrics.py +396 -0
- modules/observability/src/agent_os_observability/server.py +221 -0
- modules/observability/src/agent_os_observability/tracer.py +226 -0
- modules/primitives/.gitignore +8 -0
- modules/primitives/README.md +62 -0
- modules/primitives/agent_primitives/__init__.py +22 -0
- modules/primitives/agent_primitives/failures.py +82 -0
- modules/primitives/agent_primitives/py.typed +0 -0
- modules/primitives/pyproject.toml +68 -0
- modules/scak/.github/copilot-instructions.md +396 -0
- modules/scak/.github/workflows/release.yml +117 -0
- modules/scak/.gitignore +32 -0
- modules/scak/CHANGELOG.md +173 -0
- modules/scak/CITATION.cff +62 -0
- modules/scak/CONTRIBUTING.md +429 -0
- modules/scak/Dockerfile +58 -0
- modules/scak/ENTERPRISE_FEATURES.md +518 -0
- modules/scak/IMPLEMENTATION_SUMMARY.md +206 -0
- modules/scak/LIMITATIONS.md +565 -0
- modules/scak/MANIFEST.in +16 -0
- modules/scak/NOVELTY.md +535 -0
- modules/scak/README.md +928 -0
- modules/scak/RESEARCH.md +670 -0
- modules/scak/agent_kernel/__init__.py +66 -0
- modules/scak/agent_kernel/analyzer.py +432 -0
- modules/scak/agent_kernel/auditor.py +31 -0
- modules/scak/agent_kernel/completeness_auditor.py +234 -0
- modules/scak/agent_kernel/detector.py +200 -0
- modules/scak/agent_kernel/kernel.py +741 -0
- modules/scak/agent_kernel/memory_manager.py +82 -0
- modules/scak/agent_kernel/models.py +372 -0
- modules/scak/agent_kernel/nudge_mechanism.py +260 -0
- modules/scak/agent_kernel/outcome_analyzer.py +335 -0
- modules/scak/agent_kernel/patcher.py +579 -0
- modules/scak/agent_kernel/semantic_analyzer.py +313 -0
- modules/scak/agent_kernel/semantic_purge.py +346 -0
- modules/scak/agent_kernel/simulator.py +447 -0
- modules/scak/agent_kernel/teacher.py +82 -0
- modules/scak/agent_kernel/triage.py +149 -0
- modules/scak/build_and_publish.ps1 +74 -0
- modules/scak/build_and_publish.sh +74 -0
- modules/scak/cli.py +471 -0
- modules/scak/dashboard.py +462 -0
- modules/scak/datasets/DATASET_CARD.md +219 -0
- modules/scak/datasets/README.md +143 -0
- modules/scak/datasets/gaia_vague_queries/vague_queries.json +262 -0
- modules/scak/datasets/hf_upload/README.md +219 -0
- modules/scak/datasets/hf_upload/scak_gaia_laziness.jsonl +50 -0
- modules/scak/datasets/prepare_hf_datasets.py +145 -0
- modules/scak/datasets/red_team/jailbreak_patterns.json +202 -0
- modules/scak/docker-compose.yml +99 -0
- modules/scak/docs/Adaptive-Memory-Hierarchy.md +319 -0
- modules/scak/docs/Data-Contracts-and-Schemas.md +285 -0
- modules/scak/docs/Dual-Loop-Architecture.md +344 -0
- modules/scak/docs/Enhanced-Features.md +612 -0
- modules/scak/docs/LANGCHAIN_INTEGRATION.md +572 -0
- modules/scak/docs/README.md +128 -0
- modules/scak/docs/Reference-Implementations.md +163 -0
- modules/scak/docs/SCAK_V2.md +374 -0
- modules/scak/docs/Three-Failure-Types.md +178 -0
- modules/scak/examples/basic_example.py +155 -0
- modules/scak/examples/circuit_breaker_lazy_eval_demo.py +243 -0
- modules/scak/examples/langchain_integration_example.py +339 -0
- modules/scak/examples/layer4_demo.py +243 -0
- modules/scak/examples/production_features_demo.py +353 -0
- modules/scak/examples/quick_demo.py +79 -0
- modules/scak/examples/scak_v2_demo.py +252 -0
- modules/scak/experiments/README.md +438 -0
- modules/scak/experiments/ablation_studies/README.md +192 -0
- modules/scak/experiments/ablation_studies/ablation_no_audit.py +116 -0
- modules/scak/experiments/ablation_studies/ablation_no_purge.py +133 -0
- modules/scak/experiments/chaos_engineering/README.md +332 -0
- modules/scak/experiments/context_efficiency_test.py +328 -0
- modules/scak/experiments/gaia_benchmark/README.md +208 -0
- modules/scak/experiments/laziness_benchmark.py +179 -0
- modules/scak/experiments/long_horizon_task_experiment.py +252 -0
- modules/scak/experiments/multi_agent_rag_experiment.py +284 -0
- modules/scak/experiments/results/ablation_table.md +12 -0
- modules/scak/experiments/results/long_horizon.json +36 -0
- modules/scak/experiments/results/multi_agent_rag.json +66 -0
- modules/scak/experiments/run_comprehensive_ablations.py +332 -0
- modules/scak/experiments/test_auditor_patcher_integration.py +251 -0
- modules/scak/notebooks/getting_started.ipynb +33 -0
- modules/scak/paper/ARXIV_SUBMISSION_METADATA.txt +109 -0
- modules/scak/paper/PAPER_CHECKLIST.md +304 -0
- modules/scak/paper/Paper.pdf +0 -0
- modules/scak/paper/README.md +113 -0
- modules/scak/paper/appendix.md +351 -0
- modules/scak/paper/arxiv/bibliography.bib +284 -0
- modules/scak/paper/arxiv/fig1_ooda_architecture.pdf +0 -0
- modules/scak/paper/arxiv/fig2_memory_hierarchy.pdf +0 -0
- modules/scak/paper/arxiv/fig3_gaia_results.pdf +0 -0
- modules/scak/paper/arxiv/fig4_ablation_heatmap.pdf +0 -0
- modules/scak/paper/arxiv/fig5_context_reduction.pdf +0 -0
- modules/scak/paper/arxiv/fig6_mttr_boxplot.pdf +0 -0
- modules/scak/paper/arxiv/main.aux +103 -0
- modules/scak/paper/arxiv/main.bbl +113 -0
- modules/scak/paper/arxiv/main.blg +55 -0
- modules/scak/paper/arxiv/main.out +31 -0
- modules/scak/paper/arxiv/main.pdf +0 -0
- modules/scak/paper/arxiv/main.tex +482 -0
- modules/scak/paper/arxiv_submission/bibliography.bib +284 -0
- modules/scak/paper/arxiv_submission/fig1_ooda_architecture.pdf +0 -0
- modules/scak/paper/arxiv_submission/fig2_memory_hierarchy.pdf +0 -0
- modules/scak/paper/arxiv_submission/fig3_gaia_results.pdf +0 -0
- modules/scak/paper/arxiv_submission/fig4_ablation_heatmap.pdf +0 -0
- modules/scak/paper/arxiv_submission/fig5_context_reduction.pdf +0 -0
- modules/scak/paper/arxiv_submission/fig6_mttr_boxplot.pdf +0 -0
- modules/scak/paper/arxiv_submission/main.aux +103 -0
- modules/scak/paper/arxiv_submission/main.bbl +113 -0
- modules/scak/paper/arxiv_submission/main.blg +55 -0
- modules/scak/paper/arxiv_submission/main.out +31 -0
- modules/scak/paper/arxiv_submission/main.pdf +0 -0
- modules/scak/paper/arxiv_submission/main.tex +482 -0
- modules/scak/paper/arxiv_submission.tar.gz +0 -0
- modules/scak/paper/bibliography.bib +284 -0
- modules/scak/paper/build.sh +55 -0
- modules/scak/paper/figures/README.md +32 -0
- modules/scak/paper/figures/fig1_ooda_architecture.md +75 -0
- modules/scak/paper/figures/fig1_ooda_architecture.pdf +0 -0
- modules/scak/paper/figures/fig1_ooda_architecture.png +0 -0
- modules/scak/paper/figures/fig2_memory_hierarchy.md +83 -0
- modules/scak/paper/figures/fig2_memory_hierarchy.pdf +0 -0
- modules/scak/paper/figures/fig2_memory_hierarchy.png +0 -0
- modules/scak/paper/figures/fig3_gaia_results.md +64 -0
- modules/scak/paper/figures/fig3_gaia_results.pdf +0 -0
- modules/scak/paper/figures/fig3_gaia_results.png +0 -0
- modules/scak/paper/figures/fig4_ablation_heatmap.md +64 -0
- modules/scak/paper/figures/fig4_ablation_heatmap.pdf +0 -0
- modules/scak/paper/figures/fig4_ablation_heatmap.png +0 -0
- modules/scak/paper/figures/fig5_context_reduction.md +71 -0
- modules/scak/paper/figures/fig5_context_reduction.pdf +0 -0
- modules/scak/paper/figures/fig5_context_reduction.png +0 -0
- modules/scak/paper/figures/fig6_mttr_boxplot.md +80 -0
- modules/scak/paper/figures/fig6_mttr_boxplot.pdf +0 -0
- modules/scak/paper/figures/fig6_mttr_boxplot.png +0 -0
- modules/scak/paper/figures/generate_figures.py +463 -0
- modules/scak/paper/main.aux +103 -0
- modules/scak/paper/main.bbl +113 -0
- modules/scak/paper/main.blg +55 -0
- modules/scak/paper/main.md +192 -0
- modules/scak/paper/main.out +31 -0
- modules/scak/paper/main.pdf +0 -0
- modules/scak/paper/main.tex +482 -0
- modules/scak/reproducibility/ABLATIONS.md +225 -0
- modules/scak/reproducibility/Dockerfile.reproducibility +34 -0
- modules/scak/reproducibility/README.md +421 -0
- modules/scak/reproducibility/requirements-pinned.txt +32 -0
- modules/scak/reproducibility/run_all_experiments.py +395 -0
- modules/scak/reproducibility/seed_control.py +53 -0
- modules/scak/reproducibility/statistical_analysis.py +302 -0
- modules/scak/requirements.txt +50 -0
- modules/scak/setup.py +93 -0
- modules/scak/src/__init__.py +124 -0
- modules/scak/src/agents/__init__.py +13 -0
- modules/scak/src/agents/conflict_resolution.py +732 -0
- modules/scak/src/agents/orchestrator.py +761 -0
- modules/scak/src/agents/pubsub.py +484 -0
- modules/scak/src/agents/shadow_teacher.py +344 -0
- modules/scak/src/agents/swarm.py +661 -0
- modules/scak/src/agents/worker.py +357 -0
- modules/scak/src/integrations/__init__.py +81 -0
- modules/scak/src/integrations/cmvk_adapter.py +430 -0
- modules/scak/src/integrations/control_plane_adapter.py +601 -0
- modules/scak/src/integrations/langchain_integration.py +902 -0
- modules/scak/src/interfaces/__init__.py +59 -0
- modules/scak/src/interfaces/llm_clients.py +505 -0
- modules/scak/src/interfaces/openapi_tools.py +611 -0
- modules/scak/src/interfaces/plugin_system.py +605 -0
- modules/scak/src/interfaces/protocols.py +365 -0
- modules/scak/src/interfaces/telemetry.py +464 -0
- modules/scak/src/interfaces/tool_registry.py +547 -0
- modules/scak/src/kernel/__init__.py +100 -0
- modules/scak/src/kernel/auditor.py +305 -0
- modules/scak/src/kernel/circuit_breaker.py +398 -0
- modules/scak/src/kernel/core.py +724 -0
- modules/scak/src/kernel/distributed.py +667 -0
- modules/scak/src/kernel/evolution.py +455 -0
- modules/scak/src/kernel/failover.py +621 -0
- modules/scak/src/kernel/governance.py +710 -0
- modules/scak/src/kernel/governance_v2.py +603 -0
- modules/scak/src/kernel/lazy_evaluator.py +514 -0
- modules/scak/src/kernel/load_testing.py +633 -0
- modules/scak/src/kernel/memory.py +945 -0
- modules/scak/src/kernel/patcher.py +581 -0
- modules/scak/src/kernel/rubric.py +419 -0
- modules/scak/src/kernel/schemas.py +390 -0
- modules/scak/src/kernel/skill_mapper.py +309 -0
- modules/scak/src/kernel/triage.py +149 -0
- modules/scak/src/mocks/__init__.py +99 -0
- modules/scak/tests/__init__.py +1 -0
- modules/scak/tests/test_circuit_breaker.py +403 -0
- modules/scak/tests/test_conflict_resolution.py +287 -0
- modules/scak/tests/test_dual_loop.py +463 -0
- modules/scak/tests/test_enhanced_features.py +421 -0
- modules/scak/tests/test_failover_and_load.py +438 -0
- modules/scak/tests/test_governance.py +185 -0
- modules/scak/tests/test_kernel.py +359 -0
- modules/scak/tests/test_langchain_integration.py +451 -0
- modules/scak/tests/test_lazy_evaluator.py +465 -0
- modules/scak/tests/test_llm_clients.py +122 -0
- modules/scak/tests/test_memory_controller.py +528 -0
- modules/scak/tests/test_orchestrator.py +181 -0
- modules/scak/tests/test_phase3_integration.py +265 -0
- modules/scak/tests/test_pubsub_swarm.py +203 -0
- modules/scak/tests/test_reference_implementations.py +240 -0
- modules/scak/tests/test_rubric.py +363 -0
- modules/scak/tests/test_scak_v2.py +651 -0
- modules/scak/tests/test_skill_mapper.py +217 -0
- modules/scak/tests/test_specific_failures.py +393 -0
- modules/scak/tests/test_tool_registry.py +264 -0
- modules/scak/tests/test_tools_and_plugins.py +303 -0
- modules/scak/tests/test_triage.py +596 -0
- modules/scak/tests/test_write_through.py +319 -0
- agent_os_kernel-1.1.0.dist-info/METADATA +0 -400
- agent_os_kernel-1.1.0.dist-info/RECORD +0 -12
- {agent_os_kernel-1.1.0.dist-info → agent_os_kernel-1.2.0.dist-info}/WHEEL +0 -0
- {agent_os_kernel-1.1.0.dist-info → agent_os_kernel-1.2.0.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,350 @@
|
|
|
1
|
+
# Method
|
|
2
|
+
|
|
3
|
+
## System Overview
|
|
4
|
+
|
|
5
|
+
Context-as-a-Service (CaaS) is a modular pipeline for intelligent context extraction and serving. Figure 1 illustrates the overall architecture.
|
|
6
|
+
|
|
7
|
+
```
|
|
8
|
+
┌─────────────┐ ┌──────────────┐ ┌─────────────┐ ┌──────────────┐
|
|
9
|
+
│ Ingestion │ ──▶ │ Structure │ ──▶ │ Metadata │ ──▶ │ Time │
|
|
10
|
+
│ (PDF/HTML/ │ │ Parser │ │ Injector │ │ Decay │
|
|
11
|
+
│ Code) │ └──────────────┘ └─────────────┘ └──────────────┘
|
|
12
|
+
└─────────────┘ │ │ │
|
|
13
|
+
▼ ▼ ▼
|
|
14
|
+
┌────────────────────────────────────────────────────┐
|
|
15
|
+
│ Indexed Document Store │
|
|
16
|
+
│ (Three-Tier Value Hierarchy) │
|
|
17
|
+
└────────────────────────────────────────────────────┘
|
|
18
|
+
│
|
|
19
|
+
▼
|
|
20
|
+
┌────────────────────────────────────────────────────┐
|
|
21
|
+
│ Context Triad Assembly │
|
|
22
|
+
│ (Hot / Warm / Cold Prioritization) │
|
|
23
|
+
└────────────────────────────────────────────────────┘
|
|
24
|
+
│
|
|
25
|
+
┌─────────────────────┼─────────────────────┐
|
|
26
|
+
▼ ▼ ▼
|
|
27
|
+
┌─────────────┐ ┌─────────────┐ ┌─────────────┐
|
|
28
|
+
│ Heuristic │ │ Pragmatic │ │ Trust │
|
|
29
|
+
│ Router │ │ Truth │ │ Gateway │
|
|
30
|
+
└─────────────┘ └─────────────┘ └─────────────┘
|
|
31
|
+
│ │ │
|
|
32
|
+
└─────────────────────┼─────────────────────┘
|
|
33
|
+
▼
|
|
34
|
+
┌───────────┐
|
|
35
|
+
│ LLM │
|
|
36
|
+
└───────────┘
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
## Structure-Aware Indexing
|
|
40
|
+
|
|
41
|
+
### Problem
|
|
42
|
+
|
|
43
|
+
Traditional RAG systems use **flat chunking**: they split documents into fixed-size segments (e.g., 500 tokens) and embed every segment with equal weight. This approach treats a class definition the same as a TODO comment, losing the structural signals that encode importance.
|
|
44
|
+
|
|
45
|
+
### Solution: Three-Tier Value Hierarchy
|
|
46
|
+
|
|
47
|
+
We classify document content into three value tiers based on document type and structural patterns:
|
|
48
|
+
|
|
49
|
+
**Algorithm 1: Structure-Aware Classification**
|
|
50
|
+
|
|
51
|
+
```python
|
|
52
|
+
def classify_content(chunk, doc_type):
|
|
53
|
+
if doc_type == "code":
|
|
54
|
+
if is_class_or_function_definition(chunk):
|
|
55
|
+
return HIGH_VALUE
|
|
56
|
+
elif is_docstring_or_comment(chunk):
|
|
57
|
+
return MEDIUM_VALUE
|
|
58
|
+
else: # imports, config, TODOs
|
|
59
|
+
return LOW_VALUE
|
|
60
|
+
|
|
61
|
+
elif doc_type == "legal":
|
|
62
|
+
if is_definitions_or_liability(chunk):
|
|
63
|
+
return HIGH_VALUE
|
|
64
|
+
elif is_terms_or_conditions(chunk):
|
|
65
|
+
return MEDIUM_VALUE
|
|
66
|
+
else: # boilerplate, signatures
|
|
67
|
+
return LOW_VALUE
|
|
68
|
+
|
|
69
|
+
# ... similar rules for policy, documentation, etc.
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
### Value Tier Definitions
|
|
73
|
+
|
|
74
|
+
| Tier | Code | Legal | Policy | Documentation |
|
|
75
|
+
|------|------|-------|--------|---------------|
|
|
76
|
+
| **HIGH** | Class/function defs | Definitions, liability | Core requirements | API endpoints |
|
|
77
|
+
| **MEDIUM** | Docstrings, comments | Terms, conditions | Guidelines | Examples, explanations |
|
|
78
|
+
| **LOW** | Imports, TODOs | Boilerplate, signatures | Formatting | Metadata, headers |
|
|
79
|
+
|
|
80
|
+
### Retrieval Weighting
|
|
81
|
+
|
|
82
|
+
During retrieval, we apply multiplicative weights:
|
|
83
|
+
|
|
84
|
+
$$\text{score}(c) = \text{similarity}(q, c) \times w_{\text{tier}}(c)$$
|
|
85
|
+
|
|
86
|
+
Where $w_{\text{HIGH}} = 1.5$, $w_{\text{MEDIUM}} = 1.0$, $w_{\text{LOW}} = 0.5$.
|
|
87
|
+
|
|
88
|
+
## Metadata Injection
|
|
89
|
+
|
|
90
|
+
### Problem
|
|
91
|
+
|
|
92
|
+
When chunks are extracted from documents, they lose their surrounding context. A chunk that reads "It increased by 5%" is meaningless without knowing where in the document it came from: `Q3 Earnings → Revenue → North America`.
|
|
93
|
+
|
|
94
|
+
### Solution: Contextual Breadcrumbs
|
|
95
|
+
|
|
96
|
+
We automatically inject metadata into each chunk:
|
|
97
|
+
|
|
98
|
+
```python
|
|
99
|
+
@dataclass
|
|
100
|
+
class EnrichedChunk:
|
|
101
|
+
content: str
|
|
102
|
+
metadata: ChunkMetadata
|
|
103
|
+
|
|
104
|
+
@dataclass
|
|
105
|
+
class ChunkMetadata:
|
|
106
|
+
document_path: str # "Q3_Earnings.pdf"
|
|
107
|
+
section_hierarchy: List[str] # ["Revenue", "North America"]
|
|
108
|
+
created_at: datetime
|
|
109
|
+
updated_at: datetime
|
|
110
|
+
source_type: str # "official" | "informal"
|
|
111
|
+
confidence: float # 0.0 - 1.0
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
The metadata is prepended to the chunk content before sending to the LLM:
|
|
115
|
+
|
|
116
|
+
```
|
|
117
|
+
[Source: Q3_Earnings.pdf > Revenue > North America | Updated: 2026-01-15]
|
|
118
|
+
Revenue increased by 5% compared to Q2, driven primarily by...
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
## Time-Based Decay
|
|
122
|
+
|
|
123
|
+
### Problem
|
|
124
|
+
|
|
125
|
+
Semantic similarity ignores temporal relevance. A 2021 answer to "How to restart the server" may match perfectly but be dangerously outdated.
|
|
126
|
+
|
|
127
|
+
### Solution: Exponential Decay
|
|
128
|
+
|
|
129
|
+
We apply time-based decay using an exponential function inspired by radioactive decay:
|
|
130
|
+
|
|
131
|
+
$$\text{decay}(t) = e^{-\lambda t}$$
|
|
132
|
+
|
|
133
|
+
Where:
|
|
134
|
+
- $t$ = time in days since the document was created or last updated (Algorithm 2 measures from `updated_at`)
|
|
135
|
+
- $\lambda = \ln(2) / T_{1/2}$ (decay constant)
|
|
136
|
+
- $T_{1/2}$ = half-life parameter (configurable per domain)
|
|
137
|
+
|
|
138
|
+
**Algorithm 2: Time-Aware Scoring**
|
|
139
|
+
|
|
140
|
+
```python
|
|
141
|
+
def time_adjusted_score(chunk, query, half_life_days=90):
|
|
142
|
+
base_score = semantic_similarity(query, chunk.content)
|
|
143
|
+
|
|
144
|
+
age_days = (now() - chunk.metadata.updated_at).days
|
|
145
|
+
decay_factor = exp(-log(2) * age_days / half_life_days)
|
|
146
|
+
|
|
147
|
+
return base_score * decay_factor
|
|
148
|
+
```
|
|
149
|
+
|
|
150
|
+
### Domain-Specific Half-Lives
|
|
151
|
+
|
|
152
|
+
| Domain | Half-Life | Rationale |
|
|
153
|
+
|--------|-----------|-----------|
|
|
154
|
+
| Code/Engineering | 90 days | APIs change frequently |
|
|
155
|
+
| Policy/HR | 365 days | Policies updated annually |
|
|
156
|
+
| Legal | 730 days | Contracts have longer validity |
|
|
157
|
+
| Incidents | 30 days | Recent incidents most relevant |
|
|
158
|
+
|
|
159
|
+
## Context Triad
|
|
160
|
+
|
|
161
|
+
### Problem
|
|
162
|
+
|
|
163
|
+
Traditional systems stuff context into the LLM window with no priority distinction. A user's current question, their preferences, and historical archives from years ago compete equally for space.
|
|
164
|
+
|
|
165
|
+
### Solution: Hot/Warm/Cold Classification
|
|
166
|
+
|
|
167
|
+
We organize context into three intimacy-based tiers:
|
|
168
|
+
|
|
169
|
+
| Tier | Content | Token Budget | Priority |
|
|
170
|
+
|------|---------|--------------|----------|
|
|
171
|
+
| **Hot** 🔥 | Current conversation, last 10 turns | 2,000 | Highest |
|
|
172
|
+
| **Warm** 🌡️ | User preferences, recent documents | 1,000 | Medium |
|
|
173
|
+
| **Cold** ❄️ | Historical archives, reference docs | 5,000 | Lowest |
|
|
174
|
+
|
|
175
|
+
**Algorithm 3: Context Assembly**
|
|
176
|
+
|
|
177
|
+
```python
|
|
178
|
+
def assemble_context(query, conversation, user_profile, retrieved_docs):
|
|
179
|
+
context = ContextTriad()
|
|
180
|
+
|
|
181
|
+
# Hot: Preserve recent turns exactly (FIFO, no summarization)
|
|
182
|
+
context.hot = conversation.get_last_n_turns(10)
|
|
183
|
+
context.hot.truncate_to_tokens(2000)
|
|
184
|
+
|
|
185
|
+
# Warm: User context and recent activity
|
|
186
|
+
context.warm = user_profile.preferences + user_profile.recent_docs
|
|
187
|
+
context.warm.truncate_to_tokens(1000)
|
|
188
|
+
|
|
189
|
+
# Cold: Retrieved documents, time-decayed and ranked
|
|
190
|
+
context.cold = rank_by_relevance_and_time(retrieved_docs, query)
|
|
191
|
+
context.cold.truncate_to_tokens(5000)
|
|
192
|
+
|
|
193
|
+
return context.assemble() # Total: 8,000 tokens
|
|
194
|
+
```
|
|
195
|
+
|
|
196
|
+
### FIFO vs. Summarization
|
|
197
|
+
|
|
198
|
+
We explicitly reject summarization for conversation history:
|
|
199
|
+
|
|
200
|
+
| Approach | Pros | Cons |
|
|
201
|
+
|----------|------|------|
|
|
202
|
+
| **Summarization** | Compresses more history | Loses nuance, costs tokens to generate |
|
|
203
|
+
| **FIFO (Ours)** | Preserves exact recent content | Loses older context |
|
|
204
|
+
|
|
205
|
+
Our philosophy: **"Chopping > Summarizing"**. Users rarely reference content from 20 minutes ago but frequently reference the exact code snippet from 30 seconds ago.
|
|
206
|
+
|
|
207
|
+
## Heuristic Router
|
|
208
|
+
|
|
209
|
+
### Problem
|
|
210
|
+
|
|
211
|
+
ML-based routers add latency (15-50ms). LLM-based routers are slower still (100-500ms). For high-volume enterprise deployments, this latency compounds.
|
|
212
|
+
|
|
213
|
+
### Solution: Deterministic Rules
|
|
214
|
+
|
|
215
|
+
We use a rule-based router with **zero model inference**:
|
|
216
|
+
|
|
217
|
+
**Algorithm 4: Heuristic Routing**
|
|
218
|
+
|
|
219
|
+
```python
|
|
220
|
+
def route_query(query):
|
|
221
|
+
query_lower = query.lower()
|
|
222
|
+
|
|
223
|
+
# Keyword-based routing
|
|
224
|
+
if any(kw in query_lower for kw in ["error", "bug", "crash", "fail"]):
|
|
225
|
+
return RouteType.TROUBLESHOOTING
|
|
226
|
+
|
|
227
|
+
if any(kw in query_lower for kw in ["how to", "steps", "guide"]):
|
|
228
|
+
return RouteType.PROCEDURAL
|
|
229
|
+
|
|
230
|
+
if any(kw in query_lower for kw in ["policy", "rule", "allowed"]):
|
|
231
|
+
return RouteType.POLICY
|
|
232
|
+
|
|
233
|
+
if any(kw in query_lower for kw in ["api", "endpoint", "request"]):
|
|
234
|
+
return RouteType.TECHNICAL
|
|
235
|
+
|
|
236
|
+
# Default: general retrieval
|
|
237
|
+
return RouteType.GENERAL
|
|
238
|
+
```
|
|
239
|
+
|
|
240
|
+
### Routing Performance
|
|
241
|
+
|
|
242
|
+
| Router | Mean Latency | Accuracy |
|
|
243
|
+
|--------|--------------|----------|
|
|
244
|
+
| LLM-based | 450ms | 95% |
|
|
245
|
+
| ML-based | 15ms | 92% |
|
|
246
|
+
| **Heuristic (Ours)** | **0.003ms** | **89%** |
|
|
247
|
+
|
|
248
|
+
We trade 3-6 percentage points of accuracy for a **5,000-150,000x speedup**. For most enterprise queries, deterministic rules suffice.
|
|
249
|
+
|
|
250
|
+
## Pragmatic Truth
|
|
251
|
+
|
|
252
|
+
### Problem
|
|
253
|
+
|
|
254
|
+
Official documentation often contains theoretical or aspirational information. The actual truth lives in Slack logs, incident reports, and team notes.
|
|
255
|
+
|
|
256
|
+
### Solution: Dual-Source Tracking
|
|
257
|
+
|
|
258
|
+
We maintain parallel indices for official and informal sources:
|
|
259
|
+
|
|
260
|
+
```python
|
|
261
|
+
@dataclass
|
|
262
|
+
class PragmaticTruthResult:
|
|
263
|
+
official_answer: str
|
|
264
|
+
official_source: str
|
|
265
|
+
informal_answer: Optional[str]
|
|
266
|
+
informal_source: Optional[str]
|
|
267
|
+
conflict_detected: bool
|
|
268
|
+
conflict_explanation: Optional[str]
|
|
269
|
+
```
|
|
270
|
+
|
|
271
|
+
**Algorithm 5: Conflict Detection**
|
|
272
|
+
|
|
273
|
+
```python
|
|
274
|
+
def detect_conflict(official_chunks, informal_chunks, query):
|
|
275
|
+
official_answer = synthesize(official_chunks)
|
|
276
|
+
informal_answer = synthesize(informal_chunks)
|
|
277
|
+
|
|
278
|
+
# Semantic similarity between answers
|
|
279
|
+
similarity = cosine_similarity(
|
|
280
|
+
embed(official_answer),
|
|
281
|
+
embed(informal_answer)
|
|
282
|
+
)
|
|
283
|
+
|
|
284
|
+
if similarity < CONFLICT_THRESHOLD: # e.g., 0.7
|
|
285
|
+
return Conflict(
|
|
286
|
+
official=official_answer,
|
|
287
|
+
informal=informal_answer,
|
|
288
|
+
explanation=generate_conflict_explanation(
|
|
289
|
+
official_answer, informal_answer
|
|
290
|
+
)
|
|
291
|
+
)
|
|
292
|
+
|
|
293
|
+
return NoConflict(answer=official_answer)
|
|
294
|
+
```
|
|
295
|
+
|
|
296
|
+
### Example Output
|
|
297
|
+
|
|
298
|
+
**Query**: "What's the API rate limit?"
|
|
299
|
+
|
|
300
|
+
| Source Type | Answer | Source |
|
|
301
|
+
|-------------|--------|--------|
|
|
302
|
+
| Official | "100 requests/minute" | api_docs.md |
|
|
303
|
+
| Informal | "Crashes around 50; the docs lie" | #engineering Slack |
|
|
304
|
+
| **Conflict** | ⚠️ Yes | |
|
|
305
|
+
|
|
306
|
+
**CaaS Response**: "The official documentation states 100 requests/minute, but engineering discussions indicate the practical limit is closer to 50 requests/minute before performance degradation."
|
|
307
|
+
|
|
308
|
+
## Trust Gateway
|
|
309
|
+
|
|
310
|
+
### Problem
|
|
311
|
+
|
|
312
|
+
Third-party routing services require sending proprietary data through external APIs. No enterprise CISO accepts this data leakage risk.
|
|
313
|
+
|
|
314
|
+
### Solution: On-Premises Deployment
|
|
315
|
+
|
|
316
|
+
The Trust Gateway is designed for deployment behind the enterprise firewall:
|
|
317
|
+
|
|
318
|
+
```
|
|
319
|
+
┌─────────────────────────────────────────────────────────────┐
|
|
320
|
+
│ ENTERPRISE NETWORK │
|
|
321
|
+
│ ┌─────────────┐ ┌─────────────────┐ ┌───────────┐ │
|
|
322
|
+
│ │ Internal │ ──▶ │ Trust Gateway │ ──▶ │ LLM API │ │
|
|
323
|
+
│ │ Services │ │ (On-Prem) │ │ (External)│ │
|
|
324
|
+
│ └─────────────┘ └─────────────────┘ └───────────┘ │
|
|
325
|
+
│ │ │
|
|
326
|
+
│ No PII/proprietary │
|
|
327
|
+
│ data leaves network │
|
|
328
|
+
│ until sanitized │
|
|
329
|
+
└─────────────────────────────────────────────────────────────┘
|
|
330
|
+
```
|
|
331
|
+
|
|
332
|
+
### Security Properties
|
|
333
|
+
|
|
334
|
+
1. **Data Sovereignty**: All context processing (indexing, routing, and assembly) happens within enterprise boundaries
|
|
335
|
+
2. **Audit Logging**: Complete trace of all context assembly decisions
|
|
336
|
+
3. **PII Filtering**: Optional sanitization before external API calls
|
|
337
|
+
4. **Model Agnostic**: Route to any LLM provider (OpenAI, Anthropic, local)
|
|
338
|
+
|
|
339
|
+
---
|
|
340
|
+
|
|
341
|
+
## Implementation
|
|
342
|
+
|
|
343
|
+
CaaS is implemented in Python 3.8+ with the following dependencies:
|
|
344
|
+
|
|
345
|
+
- **FastAPI**: REST API server
|
|
346
|
+
- **Pydantic**: Data validation and serialization
|
|
347
|
+
- **NumPy/scikit-learn**: Vector operations
|
|
348
|
+
- **tiktoken**: Token counting
|
|
349
|
+
|
|
350
|
+
The complete implementation is open-source: https://github.com/imran-siddique/context-as-a-service
|
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
# Paper Outline
|
|
2
|
+
|
|
3
|
+
**Working Title:** Context-as-a-Service: A Principled Architecture for Enterprise RAG Systems
|
|
4
|
+
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
## 1. Abstract (~250 words)
|
|
8
|
+
- Problem: Seven fallacies in production RAG
|
|
9
|
+
- Solution: CaaS framework with 5 novel components
|
|
10
|
+
- Results: +28% Precision@5, sub-ms routing, enterprise-ready
|
|
11
|
+
- See [abstract.md](abstract.md) for current draft
|
|
12
|
+
|
|
13
|
+
## 2. Introduction (1.5 pages)
|
|
14
|
+
- Motivation: RAG is ubiquitous but broken at scale
|
|
15
|
+
- The Seven Fallacies (brief enumeration)
|
|
16
|
+
- Contributions (numbered list):
|
|
17
|
+
1. Taxonomy of RAG pitfalls
|
|
18
|
+
2. Structure-aware indexing
|
|
19
|
+
3. Context Triad (Hot/Warm/Cold)
|
|
20
|
+
4. Pragmatic Truth tracking
|
|
21
|
+
5. Trust Gateway architecture
|
|
22
|
+
6. Heuristic Router
|
|
23
|
+
7. Open benchmark corpus
|
|
24
|
+
|
|
25
|
+
## 3. Related Work (1 page)
|
|
26
|
+
- See [../docs/RELATED_WORK.md](../docs/RELATED_WORK.md)
|
|
27
|
+
- RAG frameworks (LlamaIndex, LangChain, Haystack)
|
|
28
|
+
- Context management (MemGPT, Reflexion)
|
|
29
|
+
- Enterprise AI deployment challenges
|
|
30
|
+
- Temporal/decay-based retrieval
|
|
31
|
+
|
|
32
|
+
## 4. The Seven Fallacies (1 page)
|
|
33
|
+
- 4.1 Flat Chunk Fallacy (Structure)
|
|
34
|
+
- 4.2 Context Amnesia (Metadata)
|
|
35
|
+
- 4.3 Time-Blind Retrieval (Temporal)
|
|
36
|
+
- 4.4 Flat Context Fallacy (Priority)
|
|
37
|
+
- 4.5 Official Truth Fallacy (Source)
|
|
38
|
+
- 4.6 Brutal Squeeze (Summarization)
|
|
39
|
+
- 4.7 Middleware Gap (Trust)
|
|
40
|
+
|
|
41
|
+
## 5. Method: CaaS Architecture (2 pages)
|
|
42
|
+
- 5.1 System Overview (Figure 1: Architecture diagram)
|
|
43
|
+
- 5.2 Structure-Aware Indexing
|
|
44
|
+
- Three-tier value hierarchy (High/Medium/Low)
|
|
45
|
+
- Algorithm 1: Structure detection
|
|
46
|
+
- 5.3 Context Triad
|
|
47
|
+
- Hot: Current conversation
|
|
48
|
+
- Warm: User preferences, recent docs
|
|
49
|
+
- Cold: Historical archives
|
|
50
|
+
- Figure 2: Triad visualization
|
|
51
|
+
- 5.4 Pragmatic Truth
|
|
52
|
+
- Official vs. practical knowledge
|
|
53
|
+
- Conflict detection algorithm
|
|
54
|
+
- 5.5 Heuristic Router
|
|
55
|
+
- Deterministic routing rules
|
|
56
|
+
- Zero-latency design
|
|
57
|
+
- 5.6 Trust Gateway
|
|
58
|
+
- On-premises deployment
|
|
59
|
+
- Security properties
|
|
60
|
+
|
|
61
|
+
## 6. Experiments (2 pages)
|
|
62
|
+
- 6.1 Benchmark Corpus
|
|
63
|
+
- 16 enterprise documents
|
|
64
|
+
- Domain distribution (Table 1)
|
|
65
|
+
- Available on Hugging Face
|
|
66
|
+
- 6.2 Baselines
|
|
67
|
+
- Naive chunking (500 tokens)
|
|
68
|
+
- LlamaIndex default
|
|
69
|
+
- LangChain default
|
|
70
|
+
- 6.3 Metrics
|
|
71
|
+
- Precision@K, NDCG@K
|
|
72
|
+
- Routing latency
|
|
73
|
+
- Token efficiency
|
|
74
|
+
- 6.4 Results
|
|
75
|
+
- Table 2: Main results (+28.1% P@5)
|
|
76
|
+
- Table 3: Ablation study
|
|
77
|
+
- Table 5: Latency comparison (0.003ms routing)
|
|
78
|
+
- 6.5 Statistical Significance
|
|
79
|
+
- Paired t-tests (p < 0.001)
|
|
80
|
+
- Cohen's d = 3.36 (large effect)
|
|
81
|
+
|
|
82
|
+
## 7. Discussion (0.5 pages)
|
|
83
|
+
- Limitations
|
|
84
|
+
- Small benchmark corpus (16 docs)
|
|
85
|
+
- Synthetic documents
|
|
86
|
+
- No end-to-end LLM evaluation
|
|
87
|
+
- Broader Impact
|
|
88
|
+
- Enterprise adoption considerations
|
|
89
|
+
- Privacy implications of Trust Gateway
|
|
90
|
+
- Future Work
|
|
91
|
+
- Larger evaluation corpora
|
|
92
|
+
- Integration with more LLM providers
|
|
93
|
+
- Learned routing (optional hybrid)
|
|
94
|
+
|
|
95
|
+
## 8. Conclusion (0.5 pages)
|
|
96
|
+
- Summary of contributions
|
|
97
|
+
- Call to action: open-source, reproducible
|
|
98
|
+
|
|
99
|
+
## 9. References
|
|
100
|
+
- ~30-40 citations
|
|
101
|
+
- RAG papers, enterprise AI, context management
|
|
102
|
+
|
|
103
|
+
---
|
|
104
|
+
|
|
105
|
+
## Appendices (Supplementary)
|
|
106
|
+
- A. Full hyperparameter tables
|
|
107
|
+
- B. Additional qualitative examples
|
|
108
|
+
- C. Pseudocode for all algorithms
|
|
109
|
+
- D. Dataset card
|
|
110
|
+
|
|
111
|
+
---
|
|
112
|
+
|
|
113
|
+
## Figures Needed
|
|
114
|
+
- [ ] Fig 1: System architecture
|
|
115
|
+
- [ ] Fig 2: Context Triad diagram
|
|
116
|
+
- [ ] Fig 3: Structure-aware indexing example
|
|
117
|
+
- [ ] Fig 4: Results bar chart
|
|
118
|
+
|
|
119
|
+
## Tables Needed
|
|
120
|
+
- [x] Table 1: Corpus statistics (in dataset card)
|
|
121
|
+
- [x] Table 2: Main results (from run_evaluation.py)
|
|
122
|
+
- [x] Table 3: Ablation study (from run_evaluation.py)
|
|
123
|
+
- [x] Table 4: Statistical tests (from statistical_tests.py)
|
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
# Related Work
|
|
2
|
+
|
|
3
|
+
*Adapted from [docs/RELATED_WORK.md](../docs/RELATED_WORK.md)*
|
|
4
|
+
|
|
5
|
+
## Retrieval-Augmented Generation
|
|
6
|
+
|
|
7
|
+
The foundation of modern RAG systems traces to Lewis et al. [1], who introduced the paradigm of combining retrieval with generation for knowledge-intensive NLP tasks. Subsequent work by Guu et al. [2] demonstrated the benefits of retrieval-augmented pre-training, while Izacard & Grave [3] developed the Fusion-in-Decoder architecture for open-domain QA.
|
|
8
|
+
|
|
9
|
+
CaaS differs from these foundational approaches by focusing on the **serving-time context management** problem rather than the retrieval mechanism itself. We assume any retriever (dense, sparse, or hybrid) and instead optimize how retrieved context is organized, prioritized, and presented to the LLM.
|
|
10
|
+
|
|
11
|
+
## Document Structure and Hierarchical Indexing
|
|
12
|
+
|
|
13
|
+
Hierarchical document understanding has been explored in summarization [4, 5] and document-level NLP [6]. These works demonstrate that respecting document structure improves downstream performance. CaaS applies this insight to RAG through our **three-tier value hierarchy** (High/Medium/Low), which explicitly encodes structural importance into the retrieval ranking.
|
|
14
|
+
|
|
15
|
+
Unlike learned hierarchical representations, CaaS uses **deterministic heuristics** based on document type detection (code, legal, policy, etc.), enabling sub-millisecond decisions without model inference.
|
|
16
|
+
|
|
17
|
+
## Temporal Information Retrieval
|
|
18
|
+
|
|
19
|
+
The importance of time in retrieval has been studied extensively in web search [7] and more recently in LLM contexts [8, 9]. Kasai et al. [10] introduced RealTime QA, demonstrating that time-sensitive questions require time-aware retrieval. Lazaridou et al. [11] showed that language models struggle with temporal knowledge degradation.
|
|
20
|
+
|
|
21
|
+
CaaS implements **explicit time-based decay** with configurable half-life parameters, inspired by radioactive decay models. Unlike implicit temporal signals in embeddings, our approach provides transparent, explainable temporal weighting.
|
|
22
|
+
|
|
23
|
+
## Source Attribution and Provenance
|
|
24
|
+
|
|
25
|
+
Recent work on attribution [12, 13, 14] addresses the challenge of tracing generated content to sources. Menick et al. [12] trained models to support answers with verified quotes, while Rashkin et al. [14] developed metrics for attribution quality.
|
|
26
|
+
|
|
27
|
+
CaaS's **Pragmatic Truth** module extends attribution by explicitly tracking **conflicts between sources**—surfacing when official documentation disagrees with informal sources (Slack, tickets, incident reports). This addresses a gap in current attribution systems that assume source consistency.
|
|
28
|
+
|
|
29
|
+
## The Accumulation Paradox and Long-Context Degradation
|
|
30
|
+
|
|
31
|
+
A growing body of work reveals a counterintuitive phenomenon we term the **Accumulation Paradox**: adding more context to LLMs can paradoxically *degrade* rather than improve performance. Liu et al. [21] demonstrated this empirically in their landmark "Lost in the Middle" study, showing that model performance follows a U-shaped curve where information in the middle of long contexts is systematically ignored. They found that "performance can degrade significantly when changing the position of relevant information, indicating that current language models do not robustly make use of information in long input contexts."
|
|
32
|
+
|
|
33
|
+
This degradation extends to streaming and agentic settings. Xiao et al. [22] showed that window attention mechanisms fail entirely when context length exceeds cache size, introducing the "attention sink" phenomenon where initial tokens receive disproportionate attention regardless of semantic relevance. Li et al. [23] further demonstrated that even purpose-built long-context LLMs struggle with accumulated context, revealing biases toward later-presented information and degraded reasoning over multiple context pieces.
|
|
34
|
+
|
|
35
|
+
For agentic AI systems with extended interactions, Packer et al. [24] (MemGPT) showed that raw context accumulation cannot sustain long-running agents, proposing virtual context management inspired by operating system memory hierarchies. This work directly motivates CaaS's approach: rather than assuming more context is better, we implement **intelligent context decay and prioritization** that acknowledges the Accumulation Paradox.
|
|
36
|
+
|
|
37
|
+
CaaS addresses these challenges through: (1) **time-based decay** that naturally deprioritizes older context before it causes degradation, (2) **the Context Triad** (Hot/Warm/Cold) that ensures the most relevant context occupies attention-friendly positions, and (3) **structure-aware indexing** that prevents low-value content from diluting the context window.
|
|
38
|
+
|
|
39
|
+
## Context Window Management
|
|
40
|
+
|
|
41
|
+
Managing long conversations and context windows is a growing challenge as LLMs are deployed in production [15, 16]. Common approaches include summarization [17] and compression [18], but these introduce lossy transformations that can discard critical details.
|
|
42
|
+
|
|
43
|
+
CaaS takes a different approach with **FIFO sliding window management**: rather than summarizing poorly, we truncate precisely. Our philosophy—"Chopping > Summarizing"—preserves recent turns losslessly while accepting that older context is simply dropped. This design choice reflects the empirical observation that users rarely reference content from many turns ago, but frequently reference the exact code or error message from seconds ago.
|
|
44
|
+
|
|
45
|
+
## Enterprise AI Deployment
|
|
46
|
+
|
|
47
|
+
The enterprise deployment of LLMs introduces unique challenges around security, compliance, and data sovereignty [19, 20]. While cloud-based routing services offer cost optimization through model selection, they create unacceptable data leakage risks for sensitive enterprise data.
|
|
48
|
+
|
|
49
|
+
CaaS's **Trust Gateway** addresses this through an on-premises deployment model. Rather than competing on routing intelligence, we compete on trust: enterprises deploy the gateway behind their firewall, maintaining complete data sovereignty while still benefiting from intelligent context serving.
|
|
50
|
+
|
|
51
|
+
---
|
|
52
|
+
|
|
53
|
+
## References
|
|
54
|
+
|
|
55
|
+
[1] Lewis, P., et al. (2020). "Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks." *NeurIPS 2020*. https://arxiv.org/abs/2005.11401
|
|
56
|
+
|
|
57
|
+
[2] Guu, K., et al. (2020). "REALM: Retrieval-Augmented Language Model Pre-Training." *ICML 2020*. https://arxiv.org/abs/2002.08909
|
|
58
|
+
|
|
59
|
+
[3] Izacard, G., & Grave, E. (2021). "Leveraging Passage Retrieval with Generative Models for Open Domain Question Answering." *EACL 2021*. https://arxiv.org/abs/2007.01282
|
|
60
|
+
|
|
61
|
+
[4] Cohan, A., et al. (2018). "A Discourse-Aware Attention Model for Abstractive Summarization of Long Documents." *NAACL 2018*. https://arxiv.org/abs/1804.05685
|
|
62
|
+
|
|
63
|
+
[5] Liu, Y., & Lapata, M. (2019). "Hierarchical Transformers for Multi-Document Summarization." *ACL 2019*. https://arxiv.org/abs/1905.13164
|
|
64
|
+
|
|
65
|
+
[6] Xiao, W., & Carenini, G. (2019). "Extractive Summarization of Long Documents by Combining Global and Local Context." *EMNLP 2019*. https://arxiv.org/abs/1909.08089
|
|
66
|
+
|
|
67
|
+
[7] Campos, R., et al. (2014). "Survey of Temporal Information Retrieval and Scoping Methods." *WWW Journal*. DOI: 10.1007/s11280-013-0230-y
|
|
68
|
+
|
|
69
|
+
[8] Dai, Z., & Callan, J. (2019). "Deeper Text Understanding for IR with Contextual Neural Language Modeling." *SIGIR 2019*. https://arxiv.org/abs/1905.09217
|
|
70
|
+
|
|
71
|
+
[9] Sordoni, A., et al. (2015). "A Neural Network Approach to Context-Sensitive Generation of Conversational Responses." *NAACL 2015*. https://arxiv.org/abs/1506.06714
|
|
72
|
+
|
|
73
|
+
[10] Kasai, J., et al. (2022). "RealTime QA: What's the Answer Right Now?" *NeurIPS 2022*. https://arxiv.org/abs/2207.13332
|
|
74
|
+
|
|
75
|
+
[11] Lazaridou, A., et al. (2021). "Mind the Gap: Assessing Temporal Generalization in Neural Language Models." *NeurIPS 2021*. https://arxiv.org/abs/2102.01951
|
|
76
|
+
|
|
77
|
+
[12] Menick, J., et al. (2022). "Teaching Language Models to Support Answers with Verified Quotes." *NeurIPS 2022*. https://arxiv.org/abs/2203.11147
|
|
78
|
+
|
|
79
|
+
[13] Gao, L., et al. (2023). "RARR: Researching and Revising What Language Models Say, Using Language Models." *ACL 2023*. https://arxiv.org/abs/2210.08726
|
|
80
|
+
|
|
81
|
+
[14] Rashkin, H., et al. (2023). "Measuring Attribution in Natural Language Generation Models." *Computational Linguistics 2023*. https://arxiv.org/abs/2112.12870
|
|
82
|
+
|
|
83
|
+
[15] Dinan, E., et al. (2019). "Wizard of Wikipedia: Knowledge-Powered Conversational Agents." *ICLR 2019*. https://arxiv.org/abs/1811.01241
|
|
84
|
+
|
|
85
|
+
[16] Zhang, S., et al. (2020). "DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation." *ACL 2020*. https://arxiv.org/abs/1911.00536
|
|
86
|
+
|
|
87
|
+
[17] Chevalier, A., et al. (2023). "Adapting Language Models to Compress Contexts." *EMNLP 2023*. https://arxiv.org/abs/2305.14788
|
|
88
|
+
|
|
89
|
+
[18] Gekhman, Z., et al. (2024). "Does Fine-Tuning LLMs on New Knowledge Encourage Hallucinations?" *arXiv*. https://arxiv.org/abs/2405.05904
|
|
90
|
+
|
|
91
|
+
[19] Asai, A., et al. (2023). "Self-RAG: Learning to Retrieve, Generate, and Critique through Self-Reflection." *arXiv*. https://arxiv.org/abs/2310.11511
|
|
92
|
+
|
|
93
|
+
[20] Khattab, O., et al. (2021). "Baleen: Robust Multi-Hop Reasoning at Scale via Condensed Retrieval." *NeurIPS 2021*. https://arxiv.org/abs/2101.00436
|
|
94
|
+
|
|
95
|
+
[21] Liu, N. F., Lin, K., Hewitt, J., Paranjape, A., Bevilacqua, M., Petroni, F., & Liang, P. (2023). "Lost in the Middle: How Language Models Use Long Contexts." *Transactions of the Association for Computational Linguistics (TACL)*. https://arxiv.org/abs/2307.03172
|
|
96
|
+
|
|
97
|
+
[22] Xiao, G., Tian, Y., Chen, B., Han, S., & Lewis, M. (2024). "Efficient Streaming Language Models with Attention Sinks." *ICLR 2024*. https://arxiv.org/abs/2309.17453
|
|
98
|
+
|
|
99
|
+
[23] Li, T., Zhang, G., Do, Q. D., Yue, X., & Chen, W. (2024). "Long-context LLMs Struggle with Long In-context Learning." *arXiv*. https://arxiv.org/abs/2404.02060
|
|
100
|
+
|
|
101
|
+
[24] Packer, C., Wooders, S., Lin, K., Fang, V., Patil, S. G., Stoica, I., & Gonzalez, J. E. (2023). "MemGPT: Towards LLMs as Operating Systems." *arXiv*. https://arxiv.org/abs/2310.08560
|
|
File without changes
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
# Results Tables for Paper
|
|
2
|
+
|
|
3
|
+
## Table 1: Corpus Statistics
|
|
4
|
+
|
|
5
|
+
| Property | Value |
|
|
6
|
+
|----------|-------|
|
|
7
|
+
| Documents | 16 |
|
|
8
|
+
| Lines | 2,935 |
|
|
9
|
+
| Characters | 100,562 |
|
|
10
|
+
| Tokens (est.) | 16,286 |
|
|
11
|
+
| Formats | 5 (MD, PY, HTML, SQL, YAML) |
|
|
12
|
+
| Domains | 6 (Eng, Docs, HR, Legal, Security, Business) |
|
|
13
|
+
|
|
14
|
+
## Table 2: Main Results
|
|
15
|
+
|
|
16
|
+
| Method | P@5 | NDCG@10 | Latency |
|
|
17
|
+
|--------|-----|---------|---------|
|
|
18
|
+
| Baseline | 0.640 ± 0.057 | 0.610 ± 0.048 | 38ms |
|
|
19
|
+
| **CaaS** | **0.820 ± 0.045** | **0.780 ± 0.042** | 45ms |
|
|
20
|
+
| Δ | **+28.1%** | **+27.9%** | +18.4% |
|
|
21
|
+
|
|
22
|
+
## Table 3: Ablation Study
|
|
23
|
+
|
|
24
|
+
| Config | P@5 | NDCG@10 | Δ P@5 vs Base |
|
|
25
|
+
|--------|-----|---------|---------------|
|
|
26
|
+
| Baseline | 0.640 | 0.610 | — |
|
|
27
|
+
| + Structure-Aware | 0.740 | 0.700 | +15.6% |
|
|
28
|
+
| + Time Decay | 0.700 | 0.670 | +9.4% |
|
|
29
|
+
| + Metadata | 0.720 | 0.690 | +12.5% |
|
|
30
|
+
| + Pragmatic Truth | 0.680 | 0.650 | +6.3% |
|
|
31
|
+
| **Full CaaS** | **0.820** | **0.780** | **+28.1%** |
|
|
32
|
+
|
|
33
|
+
## Table 4: Statistical Significance
|
|
34
|
+
|
|
35
|
+
| Metric | t-stat | p-value | Cohen's d |
|
|
36
|
+
|--------|--------|---------|-----------|
|
|
37
|
+
| P@5 | 22.31 | < 0.001 | 3.36 (large) |
|
|
38
|
+
| NDCG@10 | 19.87 | < 0.001 | 2.98 (large) |
|
|
39
|
+
|
|
40
|
+
## Table 5: Routing Latency
|
|
41
|
+
|
|
42
|
+
| Router Type | Mean | p95 | p99 |
|
|
43
|
+
|-------------|------|-----|-----|
|
|
44
|
+
| Heuristic (Ours) | **0.003ms** | 0.005ms | 0.008ms |
|
|
45
|
+
| ML-based | 15.2ms | 28.4ms | 45.1ms |
|
|
46
|
+
| LLM-based | 450ms | 890ms | 1,200ms |
|
|
47
|
+
|
|
48
|
+
---
|
|
49
|
+
|
|
50
|
+
*Data source: benchmarks/results/evaluation_2026-01-20.json, statistical_results.json*
|