agent-os-kernel 1.1.0__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agent_os/__init__.py +66 -4
- agent_os/agents_compat.py +286 -0
- agent_os/base_agent.py +308 -0
- agent_os/cli.py +1079 -19
- agent_os/integrations/__init__.py +37 -2
- agent_os/integrations/openai_adapter.py +502 -0
- agent_os/integrations/semantic_kernel_adapter.py +569 -0
- agent_os/stateless.py +349 -0
- agent_os_kernel-1.2.0.dist-info/METADATA +676 -0
- agent_os_kernel-1.2.0.dist-info/RECORD +1053 -0
- {agent_os_kernel-1.1.0.dist-info → agent_os_kernel-1.2.0.dist-info}/entry_points.txt +0 -1
- modules/amb/.github/workflows/ci.yml +102 -0
- modules/amb/.github/workflows/publish.yml +146 -0
- modules/amb/.gitignore +134 -0
- modules/amb/CHANGELOG.md +118 -0
- modules/amb/CONTRIBUTING.md +141 -0
- modules/amb/LICENSE +21 -0
- modules/amb/README.md +188 -0
- modules/amb/amb_core/__init__.py +175 -0
- modules/amb/amb_core/adapters/__init__.py +55 -0
- modules/amb/amb_core/adapters/aws_sqs_broker.py +374 -0
- modules/amb/amb_core/adapters/azure_servicebus_broker.py +338 -0
- modules/amb/amb_core/adapters/kafka_broker.py +258 -0
- modules/amb/amb_core/adapters/nats_broker.py +283 -0
- modules/amb/amb_core/adapters/rabbitmq_broker.py +233 -0
- modules/amb/amb_core/adapters/redis_broker.py +260 -0
- modules/amb/amb_core/broker.py +143 -0
- modules/amb/amb_core/bus.py +479 -0
- modules/amb/amb_core/cloudevents.py +507 -0
- modules/amb/amb_core/dlq.py +343 -0
- modules/amb/amb_core/hf_utils.py +534 -0
- modules/amb/amb_core/memory_broker.py +408 -0
- modules/amb/amb_core/models.py +139 -0
- modules/amb/amb_core/persistence.py +527 -0
- modules/amb/amb_core/schema.py +292 -0
- modules/amb/amb_core/tracing.py +356 -0
- modules/amb/examples/advanced_features.py +223 -0
- modules/amb/examples/backpressure_demo.py +225 -0
- modules/amb/examples/basic_usage.py +117 -0
- modules/amb/examples/tracing_demo.py +104 -0
- modules/amb/experiments/README.md +52 -0
- modules/amb/experiments/reproduce_results.py +467 -0
- modules/amb/experiments/results.json +324 -0
- modules/amb/paper/README.md +40 -0
- modules/amb/paper/paper.tex +365 -0
- modules/amb/paper/whitepaper.md +377 -0
- modules/amb/pyproject.toml +117 -0
- modules/amb/tests/__init__.py +1 -0
- modules/amb/tests/test_backpressure_priority.py +280 -0
- modules/amb/tests/test_bus.py +198 -0
- modules/amb/tests/test_cloudevents.py +443 -0
- modules/amb/tests/test_features.py +531 -0
- modules/amb/tests/test_models.py +74 -0
- modules/amb/tests/test_tracing.py +254 -0
- modules/atr/.github/workflows/ci.yml +101 -0
- modules/atr/.github/workflows/publish.yml +140 -0
- modules/atr/.gitignore +134 -0
- modules/atr/.pre-commit-config.yaml +37 -0
- modules/atr/CHANGELOG.md +39 -0
- modules/atr/CONTRIBUTING.md +96 -0
- modules/atr/IMPLEMENTATION_SUMMARY.md +143 -0
- modules/atr/README.md +180 -0
- modules/atr/atr/__init__.py +638 -0
- modules/atr/atr/access.py +346 -0
- modules/atr/atr/composition.py +643 -0
- modules/atr/atr/decorator.py +355 -0
- modules/atr/atr/executor.py +382 -0
- modules/atr/atr/health.py +555 -0
- modules/atr/atr/hf_utils.py +447 -0
- modules/atr/atr/injection.py +420 -0
- modules/atr/atr/metrics.py +438 -0
- modules/atr/atr/policies.py +401 -0
- modules/atr/atr/py.typed +2 -0
- modules/atr/atr/registry.py +450 -0
- modules/atr/atr/schema.py +478 -0
- modules/atr/atr/tools/safe/__init__.py +73 -0
- modules/atr/atr/tools/safe/calculator.py +380 -0
- modules/atr/atr/tools/safe/datetime_tool.py +441 -0
- modules/atr/atr/tools/safe/file_reader.py +400 -0
- modules/atr/atr/tools/safe/http_client.py +314 -0
- modules/atr/atr/tools/safe/json_parser.py +372 -0
- modules/atr/atr/tools/safe/text_tool.py +526 -0
- modules/atr/atr/tools/safe/toolkit.py +173 -0
- modules/atr/docs/PYPI_SETUP.md +113 -0
- modules/atr/examples/README.md +27 -0
- modules/atr/examples/demo.py +144 -0
- modules/atr/examples/sandbox_demo.py +218 -0
- modules/atr/experiments/README.md +69 -0
- modules/atr/experiments/reproduce_results.py +509 -0
- modules/atr/experiments/results/.gitkeep +0 -0
- modules/atr/experiments/results/results_20260123_140334.json +71 -0
- modules/atr/paper/README.md +36 -0
- modules/atr/paper/figures/.gitkeep +0 -0
- modules/atr/paper/references.bib +84 -0
- modules/atr/paper/structure.tex +293 -0
- modules/atr/paper/whitepaper.md +234 -0
- modules/atr/pyproject.toml +148 -0
- modules/atr/requirements.txt +1 -0
- modules/atr/setup.py +30 -0
- modules/atr/tests/__init__.py +1 -0
- modules/atr/tests/test_decorator.py +317 -0
- modules/atr/tests/test_executor.py +245 -0
- modules/atr/tests/test_integration_executor.py +184 -0
- modules/atr/tests/test_registry.py +312 -0
- modules/atr/tests/test_schema.py +182 -0
- modules/atr/tests/test_v2_features.py +708 -0
- modules/caas/.dockerignore +63 -0
- modules/caas/.github/ISSUE_TEMPLATE/bug_report.md +38 -0
- modules/caas/.github/ISSUE_TEMPLATE/custom.md +10 -0
- modules/caas/.github/ISSUE_TEMPLATE/feature_request.md +20 -0
- modules/caas/.github/workflows/ci.yml +100 -0
- modules/caas/.github/workflows/lint.yml +39 -0
- modules/caas/.github/workflows/publish-pypi.yml +124 -0
- modules/caas/.gitignore +73 -0
- modules/caas/.pre-commit-config.yaml +33 -0
- modules/caas/CHANGELOG.md +58 -0
- modules/caas/CONTRIBUTING.md +346 -0
- modules/caas/Dockerfile +41 -0
- modules/caas/LICENSE +21 -0
- modules/caas/MANIFEST.in +11 -0
- modules/caas/README.md +158 -0
- modules/caas/benchmarks/README.md +255 -0
- modules/caas/benchmarks/create_hf_dataset.py +502 -0
- modules/caas/benchmarks/data/sample_corpus/README.md +86 -0
- modules/caas/benchmarks/data/sample_corpus/auth_module.py +211 -0
- modules/caas/benchmarks/data/sample_corpus/contribution_guide.md +185 -0
- modules/caas/benchmarks/data/sample_corpus/remote_work_policy.html +57 -0
- modules/caas/benchmarks/hf_dataset/README.md +214 -0
- modules/caas/benchmarks/hf_dataset/caas_benchmark_corpus.py +73 -0
- modules/caas/benchmarks/hf_dataset/corpus_preview.json +193 -0
- modules/caas/benchmarks/results/README.md +66 -0
- modules/caas/benchmarks/results/evaluation_2026-01-20.json +121 -0
- modules/caas/benchmarks/run_evaluation.py +561 -0
- modules/caas/benchmarks/statistical_tests.py +289 -0
- modules/caas/benchmarks/verify_sample_corpus.py +83 -0
- modules/caas/docker-compose.yml +38 -0
- modules/caas/docs/CONTEXT_TRIAD.md +462 -0
- modules/caas/docs/CONTRIBUTING.md +346 -0
- modules/caas/docs/ETHICS_AND_LIMITATIONS.md +336 -0
- modules/caas/docs/HEURISTIC_ROUTER.md +442 -0
- modules/caas/docs/IMPLEMENTATION_SUMMARY.md +363 -0
- modules/caas/docs/IMPLEMENTATION_SUMMARY_CONTEXT_TRIAD.md +277 -0
- modules/caas/docs/IMPLEMENTATION_SUMMARY_HEURISTIC_ROUTER.md +231 -0
- modules/caas/docs/IMPLEMENTATION_SUMMARY_METADATA_INJECTION.md +258 -0
- modules/caas/docs/IMPLEMENTATION_SUMMARY_PRAGMATIC_TRUTH.md +212 -0
- modules/caas/docs/IMPLEMENTATION_SUMMARY_TRUST_GATEWAY.md +319 -0
- modules/caas/docs/LAYER_1_PRIMITIVE.md +202 -0
- modules/caas/docs/METADATA_INJECTION.md +404 -0
- modules/caas/docs/PRAGMATIC_TRUTH.md +431 -0
- modules/caas/docs/RELATED_WORK.md +312 -0
- modules/caas/docs/RELEASE_CHECKLIST.md +219 -0
- modules/caas/docs/RELEASE_GUIDE.md +285 -0
- modules/caas/docs/REPRODUCIBILITY.md +386 -0
- modules/caas/docs/SLIDING_WINDOW.md +387 -0
- modules/caas/docs/STRUCTURE_AWARE_INDEXING.md +158 -0
- modules/caas/docs/TESTING.md +259 -0
- modules/caas/docs/THREAT_MODEL.md +247 -0
- modules/caas/docs/TRUST_GATEWAY.md +575 -0
- modules/caas/docs/VFS.md +298 -0
- modules/caas/examples/agents/enterprise_security_agent.py +414 -0
- modules/caas/examples/agents/intelligent_document_analyzer.py +380 -0
- modules/caas/examples/demos/demo.py +309 -0
- modules/caas/examples/demos/demo_context_triad.py +225 -0
- modules/caas/examples/demos/demo_conversation_manager.py +285 -0
- modules/caas/examples/demos/demo_heuristic_router.py +133 -0
- modules/caas/examples/demos/demo_metadata_injection.py +198 -0
- modules/caas/examples/demos/demo_pragmatic_truth.py +303 -0
- modules/caas/examples/demos/demo_structure_aware.py +140 -0
- modules/caas/examples/demos/demo_time_decay.py +247 -0
- modules/caas/examples/demos/demo_trust_gateway.py +383 -0
- modules/caas/examples/multi_agent/README.md +159 -0
- modules/caas/examples/multi_agent/research_team.py +369 -0
- modules/caas/examples/multi_agent/vfs_collaboration.py +393 -0
- modules/caas/examples/usage/auth_module.py +142 -0
- modules/caas/examples/usage/usage_example.py +173 -0
- modules/caas/experiments/README.md +42 -0
- modules/caas/experiments/reproduce_results.py +462 -0
- modules/caas/paper/ARXIV_METADATA.md +145 -0
- modules/caas/paper/ARXIV_README.md +47 -0
- modules/caas/paper/CHECKLIST.md +103 -0
- modules/caas/paper/GITHUB_RELEASE_NOTES.md +105 -0
- modules/caas/paper/README.md +71 -0
- modules/caas/paper/abstract.md +24 -0
- modules/caas/paper/arxiv_submission.tar +0 -0
- modules/caas/paper/arxiv_submission.zip +0 -0
- modules/caas/paper/build_pdf.py +355 -0
- modules/caas/paper/experiments.md +149 -0
- modules/caas/paper/figures/.gitkeep +0 -0
- modules/caas/paper/figures/README.md +237 -0
- modules/caas/paper/figures/fig1_system_architecture.png +0 -0
- modules/caas/paper/figures/fig1_system_architecture.svg +198 -0
- modules/caas/paper/figures/fig2_context_triad.png +0 -0
- modules/caas/paper/figures/fig2_context_triad.svg +105 -0
- modules/caas/paper/figures/fig3_ablation_results.png +0 -0
- modules/caas/paper/figures/fig3_ablation_results.svg +113 -0
- modules/caas/paper/figures/fig4_routing_latency.png +0 -0
- modules/caas/paper/figures/fig4_routing_latency.svg +97 -0
- modules/caas/paper/intro.md +103 -0
- modules/caas/paper/latex/figures/fig1_system_architecture.png +0 -0
- modules/caas/paper/latex/figures/fig2_context_triad.png +0 -0
- modules/caas/paper/latex/figures/fig3_ablation_results.png +0 -0
- modules/caas/paper/latex/figures/fig4_routing_latency.png +0 -0
- modules/caas/paper/latex/main.tex +468 -0
- modules/caas/paper/latex/references.bib +140 -0
- modules/caas/paper/method.md +350 -0
- modules/caas/paper/outline.md +123 -0
- modules/caas/paper/related_work.md +101 -0
- modules/caas/paper/tables/.gitkeep +0 -0
- modules/caas/paper/tables/results_tables.md +50 -0
- modules/caas/pyproject.toml +172 -0
- modules/caas/requirements.txt +11 -0
- modules/caas/src/caas/__init__.py +232 -0
- modules/caas/src/caas/api/__init__.py +7 -0
- modules/caas/src/caas/api/server.py +1326 -0
- modules/caas/src/caas/caching.py +832 -0
- modules/caas/src/caas/cli.py +208 -0
- modules/caas/src/caas/conversation.py +221 -0
- modules/caas/src/caas/decay.py +118 -0
- modules/caas/src/caas/detection/__init__.py +7 -0
- modules/caas/src/caas/detection/detector.py +236 -0
- modules/caas/src/caas/enrichment.py +127 -0
- modules/caas/src/caas/gateway/__init__.py +24 -0
- modules/caas/src/caas/gateway/trust_gateway.py +471 -0
- modules/caas/src/caas/hf_utils.py +477 -0
- modules/caas/src/caas/ingestion/__init__.py +21 -0
- modules/caas/src/caas/ingestion/processors.py +251 -0
- modules/caas/src/caas/ingestion/structure_parser.py +185 -0
- modules/caas/src/caas/models.py +354 -0
- modules/caas/src/caas/pragmatic_truth.py +441 -0
- modules/caas/src/caas/routing/__init__.py +8 -0
- modules/caas/src/caas/routing/heuristic_router.py +242 -0
- modules/caas/src/caas/storage/__init__.py +7 -0
- modules/caas/src/caas/storage/store.py +450 -0
- modules/caas/src/caas/triad.py +472 -0
- modules/caas/src/caas/tuning/__init__.py +7 -0
- modules/caas/src/caas/tuning/tuner.py +322 -0
- modules/caas/src/caas/vfs/__init__.py +12 -0
- modules/caas/src/caas/vfs/filesystem.py +450 -0
- modules/caas/tests/__init__.py +3 -0
- modules/caas/tests/conftest.py +8 -0
- modules/caas/tests/test_caching.py +628 -0
- modules/caas/tests/test_context_triad.py +385 -0
- modules/caas/tests/test_conversation_manager.py +289 -0
- modules/caas/tests/test_functionality.py +215 -0
- modules/caas/tests/test_heuristic_router.py +370 -0
- modules/caas/tests/test_metadata_injection.py +328 -0
- modules/caas/tests/test_pragmatic_truth.py +322 -0
- modules/caas/tests/test_structure_aware_indexing.py +283 -0
- modules/caas/tests/test_time_decay.py +268 -0
- modules/caas/tests/test_trust_gateway.py +445 -0
- modules/caas/tests/test_vfs.py +298 -0
- modules/cmvk/.github/FUNDING.yml +9 -0
- modules/cmvk/.github/dependabot.yml +54 -0
- modules/cmvk/.github/workflows/ci.yml +205 -0
- modules/cmvk/.github/workflows/publish.yml +143 -0
- modules/cmvk/.gitignore +147 -0
- modules/cmvk/.pre-commit-config.yaml +58 -0
- modules/cmvk/CHANGELOG.md +146 -0
- modules/cmvk/CITATION.cff +48 -0
- modules/cmvk/CONTRIBUTING.md +229 -0
- modules/cmvk/Dockerfile +87 -0
- modules/cmvk/HF_MODEL_CARD.md +185 -0
- modules/cmvk/LICENSE +21 -0
- modules/cmvk/README.md +149 -0
- modules/cmvk/SECURITY.md +114 -0
- modules/cmvk/config/prompts/generator_v1.txt +23 -0
- modules/cmvk/config/prompts/verifier_hostile.txt +32 -0
- modules/cmvk/config/settings.yaml +40 -0
- modules/cmvk/coverage_html/.gitignore +2 -0
- modules/cmvk/coverage_html/class_index.html +658 -0
- modules/cmvk/coverage_html/coverage_html_cb_188fc9a4.js +735 -0
- modules/cmvk/coverage_html/favicon_32_cb_c827f16f.png +0 -0
- modules/cmvk/coverage_html/function_index.html +1978 -0
- modules/cmvk/coverage_html/index.html +255 -0
- modules/cmvk/coverage_html/keybd_closed_cb_900cfef5.png +0 -0
- modules/cmvk/coverage_html/status.json +1 -0
- modules/cmvk/coverage_html/style_cb_5c747636.css +389 -0
- modules/cmvk/coverage_html/z_2c49bd2ed3e01e38___init___py.html +315 -0
- modules/cmvk/coverage_html/z_2c49bd2ed3e01e38_audit_py.html +499 -0
- modules/cmvk/coverage_html/z_2c49bd2ed3e01e38_benchmarks_py.html +575 -0
- modules/cmvk/coverage_html/z_2c49bd2ed3e01e38_constitutional_py.html +1001 -0
- modules/cmvk/coverage_html/z_2c49bd2ed3e01e38_hf_utils_py.html +398 -0
- modules/cmvk/coverage_html/z_2c49bd2ed3e01e38_metrics_py.html +570 -0
- modules/cmvk/coverage_html/z_2c49bd2ed3e01e38_profiles_py.html +397 -0
- modules/cmvk/coverage_html/z_2c49bd2ed3e01e38_types_py.html +109 -0
- modules/cmvk/coverage_html/z_2c49bd2ed3e01e38_verification_py.html +1053 -0
- modules/cmvk/docs/DIAGRAMS.md +325 -0
- modules/cmvk/docs/architecture.md +345 -0
- modules/cmvk/docs/features.md +308 -0
- modules/cmvk/docs/getting_started.md +279 -0
- modules/cmvk/docs/innovation_layer.md +377 -0
- modules/cmvk/docs/safety.md +281 -0
- modules/cmvk/docs/traceability.md +150 -0
- modules/cmvk/examples/basic_example.py +62 -0
- modules/cmvk/examples/demo_complete_pipeline.py +209 -0
- modules/cmvk/examples/demo_innovation_layer.py +197 -0
- modules/cmvk/examples/example.py +112 -0
- modules/cmvk/examples/model_diversity_comparison.py +110 -0
- modules/cmvk/examples/real_api_integration.py +121 -0
- modules/cmvk/examples/test_full_pipeline.py +303 -0
- modules/cmvk/experiments/FEATURE_2_LATERAL_THINKING.md +187 -0
- modules/cmvk/experiments/README.md +216 -0
- modules/cmvk/experiments/ablation_runner.py +666 -0
- modules/cmvk/experiments/baseline_runner.py +158 -0
- modules/cmvk/experiments/blind_spot_benchmark.py +364 -0
- modules/cmvk/experiments/datasets/README.md +85 -0
- modules/cmvk/experiments/datasets/humaneval_50.json +352 -0
- modules/cmvk/experiments/datasets/humaneval_full.json +1150 -0
- modules/cmvk/experiments/datasets/humaneval_sample.json +32 -0
- modules/cmvk/experiments/datasets/sabotage.json +262 -0
- modules/cmvk/experiments/datasets/sample.json +40 -0
- modules/cmvk/experiments/demo_with_traces.py +110 -0
- modules/cmvk/experiments/efficiency_curve.py +259 -0
- modules/cmvk/experiments/experiment_runner.py +243 -0
- modules/cmvk/experiments/paper_data_generator.py +183 -0
- modules/cmvk/experiments/reproduce_results.py +407 -0
- modules/cmvk/experiments/reproducible_runner.py +352 -0
- modules/cmvk/experiments/sabotage_stress_test.py +311 -0
- modules/cmvk/experiments/test_lateral_thinking.py +116 -0
- modules/cmvk/experiments/test_prosecutor.py +41 -0
- modules/cmvk/experiments/visualize_results.py +735 -0
- modules/cmvk/logs/traces/demo_HumanEval_0_20260121-204900.json +36 -0
- modules/cmvk/notebooks/analysis.ipynb +124 -0
- modules/cmvk/paper/PAPER.md +561 -0
- modules/cmvk/paper/arxiv_checklist.md +230 -0
- modules/cmvk/paper/cmvk_neurips.aux +77 -0
- modules/cmvk/paper/cmvk_neurips.bbl +81 -0
- modules/cmvk/paper/cmvk_neurips.blg +48 -0
- modules/cmvk/paper/cmvk_neurips.out +16 -0
- modules/cmvk/paper/cmvk_neurips.pdf +0 -0
- modules/cmvk/paper/cmvk_neurips.tex +309 -0
- modules/cmvk/paper/figures/ablation.png +0 -0
- modules/cmvk/paper/figures/ablation.svg +39 -0
- modules/cmvk/paper/figures/architecture.png +0 -0
- modules/cmvk/paper/figures/architecture.svg +115 -0
- modules/cmvk/paper/figures/results_bar.png +0 -0
- modules/cmvk/paper/figures/results_bar.svg +70 -0
- modules/cmvk/paper/generate_figures.py +383 -0
- modules/cmvk/paper/neurips_2024.sty +101 -0
- modules/cmvk/paper/references.bib +98 -0
- modules/cmvk/paper/structure.tex +200 -0
- modules/cmvk/pyproject.toml +189 -0
- modules/cmvk/requirements-dev.txt +19 -0
- modules/cmvk/requirements.txt +14 -0
- modules/cmvk/src/cmvk/__init__.py +216 -0
- modules/cmvk/src/cmvk/audit.py +400 -0
- modules/cmvk/src/cmvk/benchmarks.py +476 -0
- modules/cmvk/src/cmvk/constitutional.py +902 -0
- modules/cmvk/src/cmvk/hf_utils.py +299 -0
- modules/cmvk/src/cmvk/metrics.py +471 -0
- modules/cmvk/src/cmvk/profiles.py +298 -0
- modules/cmvk/src/cmvk/py.typed +0 -0
- modules/cmvk/src/cmvk/types.py +10 -0
- modules/cmvk/src/cmvk/verification.py +954 -0
- modules/cmvk/src/cross_model_verification_kernel/__init__.py +91 -0
- modules/cmvk/src/cross_model_verification_kernel/__main__.py +10 -0
- modules/cmvk/src/cross_model_verification_kernel/agents/__init__.py +16 -0
- modules/cmvk/src/cross_model_verification_kernel/agents/base_agent.py +142 -0
- modules/cmvk/src/cross_model_verification_kernel/agents/generator_openai.py +223 -0
- modules/cmvk/src/cross_model_verification_kernel/agents/verifier_anthropic.py +448 -0
- modules/cmvk/src/cross_model_verification_kernel/agents/verifier_gemini.py +481 -0
- modules/cmvk/src/cross_model_verification_kernel/cli.py +570 -0
- modules/cmvk/src/cross_model_verification_kernel/core/__init__.py +26 -0
- modules/cmvk/src/cross_model_verification_kernel/core/graph_memory.py +308 -0
- modules/cmvk/src/cross_model_verification_kernel/core/kernel.py +413 -0
- modules/cmvk/src/cross_model_verification_kernel/core/trace_logger.py +75 -0
- modules/cmvk/src/cross_model_verification_kernel/core/types.py +121 -0
- modules/cmvk/src/cross_model_verification_kernel/datasets/__init__.py +20 -0
- modules/cmvk/src/cross_model_verification_kernel/datasets/humaneval_loader.py +271 -0
- modules/cmvk/src/cross_model_verification_kernel/generator.py +118 -0
- modules/cmvk/src/cross_model_verification_kernel/kernel.py +292 -0
- modules/cmvk/src/cross_model_verification_kernel/models.py +111 -0
- modules/cmvk/src/cross_model_verification_kernel/py.typed +1 -0
- modules/cmvk/src/cross_model_verification_kernel/simple_kernel.py +185 -0
- modules/cmvk/src/cross_model_verification_kernel/tools/__init__.py +94 -0
- modules/cmvk/src/cross_model_verification_kernel/tools/huggingface_upload.py +394 -0
- modules/cmvk/src/cross_model_verification_kernel/tools/sandbox.py +159 -0
- modules/cmvk/src/cross_model_verification_kernel/tools/statistics.py +468 -0
- modules/cmvk/src/cross_model_verification_kernel/tools/visualizer.py +312 -0
- modules/cmvk/src/cross_model_verification_kernel/tools/web_search.py +86 -0
- modules/cmvk/src/cross_model_verification_kernel/verifier.py +257 -0
- modules/cmvk/tests/__init__.py +3 -0
- modules/cmvk/tests/conftest.py +61 -0
- modules/cmvk/tests/integration/__init__.py +1 -0
- modules/cmvk/tests/integration/test_anthropic_verifier.py +269 -0
- modules/cmvk/tests/integration/test_integration.py +53 -0
- modules/cmvk/tests/integration/test_lateral_thinking_integration.py +199 -0
- modules/cmvk/tests/integration/test_lateral_thinking_witness.py +208 -0
- modules/cmvk/tests/integration/test_prosecutor_mode.py +131 -0
- modules/cmvk/tests/test_constitutional.py +611 -0
- modules/cmvk/tests/test_enhanced_features.py +603 -0
- modules/cmvk/tests/test_verification.py +255 -0
- modules/cmvk/tests/unit/__init__.py +1 -0
- modules/cmvk/tests/unit/test_agents.py +64 -0
- modules/cmvk/tests/unit/test_cli.py +224 -0
- modules/cmvk/tests/unit/test_core.py +126 -0
- modules/cmvk/tests/unit/test_humaneval_loader.py +197 -0
- modules/cmvk/tests/unit/test_kernel.py +255 -0
- modules/cmvk/tests/unit/test_reproducibility.py +160 -0
- modules/cmvk/tests/unit/test_trace_logger.py +115 -0
- modules/cmvk/tests/unit/test_visualizer.py +218 -0
- modules/control-plane/.github/ISSUE_TEMPLATE/bug_report.yml +82 -0
- modules/control-plane/.github/ISSUE_TEMPLATE/config.yml +11 -0
- modules/control-plane/.github/ISSUE_TEMPLATE/feature_request.yml +104 -0
- modules/control-plane/.github/ISSUE_TEMPLATE/question.yml +70 -0
- modules/control-plane/.github/ISSUE_TEMPLATE/security_vulnerability.yml +84 -0
- modules/control-plane/.github/discussions.yml +73 -0
- modules/control-plane/.github/pull_request_template.md +82 -0
- modules/control-plane/.github/workflows/publish.yml +146 -0
- modules/control-plane/.github/workflows/release.yml +39 -0
- modules/control-plane/.github/workflows/tests.yml +58 -0
- modules/control-plane/.gitignore +55 -0
- modules/control-plane/CHANGELOG.md +203 -0
- modules/control-plane/CONTRIBUTING.md +311 -0
- modules/control-plane/CONTRIBUTORS.md +88 -0
- modules/control-plane/Dockerfile +82 -0
- modules/control-plane/LICENSE +21 -0
- modules/control-plane/MANIFEST.in +17 -0
- modules/control-plane/README.md +1264 -0
- modules/control-plane/ROADMAP.md +228 -0
- modules/control-plane/SECURITY.md +210 -0
- modules/control-plane/SUPPORT.md +106 -0
- modules/control-plane/acp-cli.py +212 -0
- modules/control-plane/benchmark/README.md +257 -0
- modules/control-plane/benchmark/__init__.py +19 -0
- modules/control-plane/benchmark/red_team_dataset.py +517 -0
- modules/control-plane/benchmark.py +563 -0
- modules/control-plane/build_and_publish.sh +130 -0
- modules/control-plane/docker-compose.yml +74 -0
- modules/control-plane/docs/ABLATION_STUDIES.md +528 -0
- modules/control-plane/docs/ADAPTER_GUIDE.md +544 -0
- modules/control-plane/docs/ADVANCED_FEATURES.md +543 -0
- modules/control-plane/docs/AIOS_COMPARISON.md +296 -0
- modules/control-plane/docs/BIBLIOGRAPHY.md +367 -0
- modules/control-plane/docs/CASE_STUDIES.md +645 -0
- modules/control-plane/docs/DOCKER_DEPLOYMENT.md +184 -0
- modules/control-plane/docs/ECOSYSTEM_STATUS.md +98 -0
- modules/control-plane/docs/HF_MODEL_CARD.md +168 -0
- modules/control-plane/docs/KERNEL_V1_RELEASE.md +454 -0
- modules/control-plane/docs/LAYER3_FRAMEWORK.md +227 -0
- modules/control-plane/docs/LIMITATIONS.md +523 -0
- modules/control-plane/docs/PYPI_PUBLISHING.md +195 -0
- modules/control-plane/docs/README.md +58 -0
- modules/control-plane/docs/RELATED_WORK.md +319 -0
- modules/control-plane/docs/RELEASE_v1.1.0.md +252 -0
- modules/control-plane/docs/REPRODUCIBILITY.md +540 -0
- modules/control-plane/docs/RESEARCH_FOUNDATION.md +197 -0
- modules/control-plane/docs/api/CORE.md +270 -0
- modules/control-plane/docs/architecture/architecture.md +120 -0
- modules/control-plane/docs/community/ANNOUNCEMENT_TEMPLATES.md +52 -0
- modules/control-plane/docs/guides/IMPLEMENTATION.md +225 -0
- modules/control-plane/docs/guides/PHILOSOPHY.md +354 -0
- modules/control-plane/docs/guides/QUICKSTART.md +217 -0
- modules/control-plane/examples/README.md +138 -0
- modules/control-plane/examples/a2a_demo.py +410 -0
- modules/control-plane/examples/adapter_demo.py +347 -0
- modules/control-plane/examples/advanced_features.py +403 -0
- modules/control-plane/examples/basic_usage.py +261 -0
- modules/control-plane/examples/benchmark_demo.py +186 -0
- modules/control-plane/examples/compliance_demo.py +333 -0
- modules/control-plane/examples/configuration.py +265 -0
- modules/control-plane/examples/getting_started.py +178 -0
- modules/control-plane/examples/hibernation_and_time_travel_demo.py +406 -0
- modules/control-plane/examples/interactive_tutorial.ipynb +497 -0
- modules/control-plane/examples/kernel_interceptor_demo.py +202 -0
- modules/control-plane/examples/kernel_v1_demo.py +273 -0
- modules/control-plane/examples/langchain_demo.py +281 -0
- modules/control-plane/examples/lifecycle_demo.py +724 -0
- modules/control-plane/examples/mcp_demo.py +378 -0
- modules/control-plane/examples/ml_safety_demo.py +157 -0
- modules/control-plane/examples/multimodal_demo.py +347 -0
- modules/control-plane/examples/observability_demo.py +370 -0
- modules/control-plane/examples/use_cases.py +336 -0
- modules/control-plane/experiments/long_horizon_purge.py +235 -0
- modules/control-plane/experiments/multi_agent_rag.py +165 -0
- modules/control-plane/experiments/reproduce_results.py +667 -0
- modules/control-plane/paper/ARXIV_SUBMISSION_INFO.txt +122 -0
- modules/control-plane/paper/ETHICS_STATEMENT.md +248 -0
- modules/control-plane/paper/PAPER_CHECKLIST.md +72 -0
- modules/control-plane/paper/Paper.pdf +0 -0
- modules/control-plane/paper/README.md +71 -0
- modules/control-plane/paper/appendix.md +152 -0
- modules/control-plane/paper/architecture.md +15 -0
- modules/control-plane/paper/arxiv/figures/ablation_chart.png +0 -0
- modules/control-plane/paper/arxiv/figures/architecture.png +0 -0
- modules/control-plane/paper/arxiv/figures/constraint_graphs.png +0 -0
- modules/control-plane/paper/arxiv/figures/results_chart.png +0 -0
- modules/control-plane/paper/arxiv/main.aux +97 -0
- modules/control-plane/paper/arxiv/main.bbl +112 -0
- modules/control-plane/paper/arxiv/main.blg +48 -0
- modules/control-plane/paper/arxiv/main.out +33 -0
- modules/control-plane/paper/arxiv/main.pdf +0 -0
- modules/control-plane/paper/arxiv/main.tex +479 -0
- modules/control-plane/paper/arxiv/references.bib +234 -0
- modules/control-plane/paper/arxiv_submission.tar +0 -0
- modules/control-plane/paper/arxiv_submission.zip +0 -0
- modules/control-plane/paper/build.sh +68 -0
- modules/control-plane/paper/figures/README.md +47 -0
- modules/control-plane/paper/figures/ablation_chart.pdf +0 -0
- modules/control-plane/paper/figures/ablation_chart.png +0 -0
- modules/control-plane/paper/figures/architecture.pdf +0 -0
- modules/control-plane/paper/figures/architecture.png +0 -0
- modules/control-plane/paper/figures/constraint_graphs.pdf +0 -0
- modules/control-plane/paper/figures/constraint_graphs.png +0 -0
- modules/control-plane/paper/figures/generate_figures.py +252 -0
- modules/control-plane/paper/figures/results_chart.pdf +0 -0
- modules/control-plane/paper/figures/results_chart.png +0 -0
- modules/control-plane/paper/main.md +273 -0
- modules/control-plane/paper/main.tex +214 -0
- modules/control-plane/paper/main_arxiv.aux +53 -0
- modules/control-plane/paper/main_arxiv.out +17 -0
- modules/control-plane/paper/main_arxiv.pdf +0 -0
- modules/control-plane/paper/main_arxiv.tex +264 -0
- modules/control-plane/paper/references.bib +234 -0
- modules/control-plane/pyproject.toml +124 -0
- modules/control-plane/reproducibility/ABLATIONS.md +136 -0
- modules/control-plane/reproducibility/README.md +288 -0
- modules/control-plane/reproducibility/commands.md +467 -0
- modules/control-plane/reproducibility/docker_config/Dockerfile +39 -0
- modules/control-plane/reproducibility/experiment_configs/purge_config.json +46 -0
- modules/control-plane/reproducibility/experiment_configs/rag_config.json +36 -0
- modules/control-plane/reproducibility/hardware_specs.md +317 -0
- modules/control-plane/reproducibility/requirements_frozen.txt +0 -0
- modules/control-plane/reproducibility/run_all_experiments.sh +45 -0
- modules/control-plane/reproducibility/seeds.json +106 -0
- modules/control-plane/scripts/prepare_pypi.py +46 -0
- modules/control-plane/scripts/prepare_release.py +176 -0
- modules/control-plane/scripts/upload_dataset_to_hf.py +316 -0
- modules/control-plane/setup.py +69 -0
- modules/control-plane/src/agent_control_plane/__init__.py +639 -0
- modules/control-plane/src/agent_control_plane/a2a_adapter.py +541 -0
- modules/control-plane/src/agent_control_plane/adapter.py +415 -0
- modules/control-plane/src/agent_control_plane/agent_hibernation.py +364 -0
- modules/control-plane/src/agent_control_plane/agent_kernel.py +464 -0
- modules/control-plane/src/agent_control_plane/compliance.py +718 -0
- modules/control-plane/src/agent_control_plane/constraint_graphs.py +475 -0
- modules/control-plane/src/agent_control_plane/control_plane.py +848 -0
- modules/control-plane/src/agent_control_plane/example_executors.py +193 -0
- modules/control-plane/src/agent_control_plane/execution_engine.py +229 -0
- modules/control-plane/src/agent_control_plane/flight_recorder.py +600 -0
- modules/control-plane/src/agent_control_plane/governance_layer.py +432 -0
- modules/control-plane/src/agent_control_plane/hf_utils.py +561 -0
- modules/control-plane/src/agent_control_plane/interfaces/__init__.py +53 -0
- modules/control-plane/src/agent_control_plane/interfaces/kernel_interface.py +359 -0
- modules/control-plane/src/agent_control_plane/interfaces/plugin_interface.py +495 -0
- modules/control-plane/src/agent_control_plane/interfaces/protocol_interfaces.py +385 -0
- modules/control-plane/src/agent_control_plane/kernel_space.py +707 -0
- modules/control-plane/src/agent_control_plane/langchain_adapter.py +422 -0
- modules/control-plane/src/agent_control_plane/lifecycle.py +3111 -0
- modules/control-plane/src/agent_control_plane/mcp_adapter.py +517 -0
- modules/control-plane/src/agent_control_plane/ml_safety.py +560 -0
- modules/control-plane/src/agent_control_plane/multimodal.py +724 -0
- modules/control-plane/src/agent_control_plane/mute_agent.py +419 -0
- modules/control-plane/src/agent_control_plane/observability.py +785 -0
- modules/control-plane/src/agent_control_plane/orchestrator.py +480 -0
- modules/control-plane/src/agent_control_plane/plugin_registry.py +748 -0
- modules/control-plane/src/agent_control_plane/policy_engine.py +525 -0
- modules/control-plane/src/agent_control_plane/shadow_mode.py +307 -0
- modules/control-plane/src/agent_control_plane/signals.py +491 -0
- modules/control-plane/src/agent_control_plane/supervisor_agents.py +427 -0
- modules/control-plane/src/agent_control_plane/time_travel_debugger.py +554 -0
- modules/control-plane/src/agent_control_plane/tool_registry.py +350 -0
- modules/control-plane/src/agent_control_plane/vfs.py +695 -0
- modules/control-plane/tests/README.md +33 -0
- modules/control-plane/tests/test_a2a_adapter.py +336 -0
- modules/control-plane/tests/test_adapter.py +422 -0
- modules/control-plane/tests/test_advanced_features.py +389 -0
- modules/control-plane/tests/test_benchmark.py +223 -0
- modules/control-plane/tests/test_compliance.py +214 -0
- modules/control-plane/tests/test_control_plane.py +295 -0
- modules/control-plane/tests/test_hibernation.py +274 -0
- modules/control-plane/tests/test_kernel_interception.py +284 -0
- modules/control-plane/tests/test_langchain_adapter.py +258 -0
- modules/control-plane/tests/test_lifecycle.py +1174 -0
- modules/control-plane/tests/test_mcp_adapter.py +293 -0
- modules/control-plane/tests/test_ml_safety.py +142 -0
- modules/control-plane/tests/test_multimodal.py +317 -0
- modules/control-plane/tests/test_new_features.py +435 -0
- modules/control-plane/tests/test_observability.py +338 -0
- modules/control-plane/tests/test_time_travel.py +387 -0
- modules/emk/.github/workflows/ci.yml +105 -0
- modules/emk/.github/workflows/publish.yml +144 -0
- modules/emk/.gitignore +74 -0
- modules/emk/CHANGELOG.md +41 -0
- modules/emk/CONTRIBUTING.md +295 -0
- modules/emk/IMPLEMENTATION.md +174 -0
- modules/emk/LICENSE +21 -0
- modules/emk/MANIFEST.in +8 -0
- modules/emk/README.md +135 -0
- modules/emk/RELEASE_NOTES.md +82 -0
- modules/emk/SECURITY.md +52 -0
- modules/emk/codecov.yml +39 -0
- modules/emk/docs/MEMORY_MANAGEMENT.md +285 -0
- modules/emk/emk/__init__.py +106 -0
- modules/emk/emk/hf_utils.py +419 -0
- modules/emk/emk/indexer.py +144 -0
- modules/emk/emk/py.typed +0 -0
- modules/emk/emk/schema.py +204 -0
- modules/emk/emk/sleep_cycle.py +345 -0
- modules/emk/emk/store.py +479 -0
- modules/emk/examples/basic_usage.py +123 -0
- modules/emk/examples/memory_features_demo.py +154 -0
- modules/emk/experiments/README.md +59 -0
- modules/emk/experiments/reproduce_results.py +461 -0
- modules/emk/experiments/results.json +61 -0
- modules/emk/paper/structure.tex +192 -0
- modules/emk/paper/whitepaper.md +273 -0
- modules/emk/pyproject.toml +91 -0
- modules/emk/setup.py +5 -0
- modules/emk/tests/test_file_adapter.py +195 -0
- modules/emk/tests/test_indexer.py +174 -0
- modules/emk/tests/test_init.py +55 -0
- modules/emk/tests/test_negative_memory.py +83 -0
- modules/emk/tests/test_schema.py +150 -0
- modules/emk/tests/test_semantic_rules.py +175 -0
- modules/emk/tests/test_sleep_cycle.py +335 -0
- modules/emk/tests/test_store_anti_patterns.py +239 -0
- modules/iatp/.github/workflows/docker-build.yml +124 -0
- modules/iatp/.github/workflows/publish.yml +174 -0
- modules/iatp/.github/workflows/python-package.yml +121 -0
- modules/iatp/.gitignore +67 -0
- modules/iatp/.pre-commit-config.yaml +64 -0
- modules/iatp/CHANGELOG.md +120 -0
- modules/iatp/Dockerfile +91 -0
- modules/iatp/IMPLEMENTATION_SUMMARY.md +218 -0
- modules/iatp/MANIFEST.in +9 -0
- modules/iatp/README.md +180 -0
- modules/iatp/docker/Dockerfile.agent +27 -0
- modules/iatp/docker/Dockerfile.sidecar-python +86 -0
- modules/iatp/docker/README.md +258 -0
- modules/iatp/docker-compose.yml +194 -0
- modules/iatp/docs/ARCHITECTURE.md +243 -0
- modules/iatp/docs/CLI_GUIDE.md +220 -0
- modules/iatp/docs/DEPLOYMENT.md +304 -0
- modules/iatp/examples/README.md +132 -0
- modules/iatp/examples/backend_agent.py +39 -0
- modules/iatp/examples/client.py +168 -0
- modules/iatp/examples/demo_attestation_reputation.py +274 -0
- modules/iatp/examples/demo_client.py +240 -0
- modules/iatp/examples/demo_rbac.py +143 -0
- modules/iatp/examples/integration_demo.py +245 -0
- modules/iatp/examples/manifests/coder_agent.json +20 -0
- modules/iatp/examples/manifests/reviewer_agent.json +19 -0
- modules/iatp/examples/manifests/secure_bank.json +14 -0
- modules/iatp/examples/manifests/standard_agent.json +14 -0
- modules/iatp/examples/manifests/untrusted_honeypot.json +14 -0
- modules/iatp/examples/run_secure_bank_sidecar.py +85 -0
- modules/iatp/examples/run_sidecar.py +105 -0
- modules/iatp/examples/run_untrusted_sidecar.py +77 -0
- modules/iatp/examples/secure_bank_agent.py +138 -0
- modules/iatp/examples/test_untrusted.py +82 -0
- modules/iatp/examples/untrusted_agent.py +119 -0
- modules/iatp/experiments/README.md +58 -0
- modules/iatp/experiments/cascading_hallucination/README.md +149 -0
- modules/iatp/experiments/cascading_hallucination/agent_a_user.py +41 -0
- modules/iatp/experiments/cascading_hallucination/agent_b_summarizer.py +54 -0
- modules/iatp/experiments/cascading_hallucination/agent_c_database.py +47 -0
- modules/iatp/experiments/cascading_hallucination/proof_of_concept.py +290 -0
- modules/iatp/experiments/cascading_hallucination/run_experiment.py +226 -0
- modules/iatp/experiments/cascading_hallucination/sidecar_c.py +61 -0
- modules/iatp/experiments/reproduce_results.py +574 -0
- modules/iatp/experiments/results.json +2336 -0
- modules/iatp/iatp/__init__.py +164 -0
- modules/iatp/iatp/attestation.py +401 -0
- modules/iatp/iatp/cli.py +253 -0
- modules/iatp/iatp/hf_utils.py +469 -0
- modules/iatp/iatp/ipc_pipes.py +578 -0
- modules/iatp/iatp/main.py +410 -0
- modules/iatp/iatp/models/__init__.py +445 -0
- modules/iatp/iatp/policy_engine.py +335 -0
- modules/iatp/iatp/py.typed +2 -0
- modules/iatp/iatp/recovery.py +319 -0
- modules/iatp/iatp/security/__init__.py +268 -0
- modules/iatp/iatp/sidecar/__init__.py +517 -0
- modules/iatp/iatp/telemetry/__init__.py +162 -0
- modules/iatp/iatp/tests/__init__.py +1 -0
- modules/iatp/iatp/tests/test_attestation.py +368 -0
- modules/iatp/iatp/tests/test_cli.py +129 -0
- modules/iatp/iatp/tests/test_models.py +128 -0
- modules/iatp/iatp/tests/test_policy_engine.py +345 -0
- modules/iatp/iatp/tests/test_recovery.py +279 -0
- modules/iatp/iatp/tests/test_security.py +220 -0
- modules/iatp/iatp/tests/test_sidecar.py +165 -0
- modules/iatp/iatp/tests/test_telemetry.py +173 -0
- modules/iatp/paper/BLOG.md +307 -0
- modules/iatp/paper/PAPER.md +236 -0
- modules/iatp/paper/RFC_SUBMISSION.md +299 -0
- modules/iatp/paper/whitepaper.md +369 -0
- modules/iatp/proto/README.md +200 -0
- modules/iatp/proto/generate_stubs.py +81 -0
- modules/iatp/proto/iatp.proto +552 -0
- modules/iatp/pyproject.toml +180 -0
- modules/iatp/requirements-dev.txt +2 -0
- modules/iatp/requirements.txt +6 -0
- modules/iatp/setup.py +60 -0
- modules/iatp/sidecar/README.md +487 -0
- modules/iatp/sidecar/go/Dockerfile +32 -0
- modules/iatp/sidecar/go/README.md +237 -0
- modules/iatp/sidecar/go/go.mod +8 -0
- modules/iatp/sidecar/go/main.go +488 -0
- modules/iatp/spec/001-handshake.md +436 -0
- modules/iatp/spec/002-reversibility.md +394 -0
- modules/iatp/spec/schema/capability_manifest.json +266 -0
- modules/iatp/test_integration.py +310 -0
- modules/mcp-kernel-server/README.md +261 -0
- modules/mcp-kernel-server/pyproject.toml +60 -0
- modules/mcp-kernel-server/src/mcp_kernel_server/__init__.py +26 -0
- modules/mcp-kernel-server/src/mcp_kernel_server/cli.py +229 -0
- modules/mcp-kernel-server/src/mcp_kernel_server/resources.py +215 -0
- modules/mcp-kernel-server/src/mcp_kernel_server/server.py +562 -0
- modules/mcp-kernel-server/src/mcp_kernel_server/tools.py +1172 -0
- modules/mute-agent/.github/workflows/safety_check.yml +45 -0
- modules/mute-agent/.gitignore +53 -0
- modules/mute-agent/ARCHITECTURE.md +531 -0
- modules/mute-agent/BENCHMARK_GUIDE.md +384 -0
- modules/mute-agent/COMPLETION_SUMMARY.md +293 -0
- modules/mute-agent/EXPERIMENT_SUMMARY.md +318 -0
- modules/mute-agent/IMPLEMENTATION_SUMMARY.md +212 -0
- modules/mute-agent/LICENSE +21 -0
- modules/mute-agent/PHASE3_SUMMARY.md +297 -0
- modules/mute-agent/README.md +360 -0
- modules/mute-agent/STEEL_MAN_RESULTS.md +353 -0
- modules/mute-agent/USAGE.md +505 -0
- modules/mute-agent/V2_IMPLEMENTATION_SUMMARY.md +253 -0
- modules/mute-agent/V2_STEEL_MAN_IMPLEMENTATION.md +274 -0
- modules/mute-agent/VERIFICATION_REPORT.md +435 -0
- modules/mute-agent/charts/cost_comparison.png +0 -0
- modules/mute-agent/charts/cost_vs_ambiguity.png +0 -0
- modules/mute-agent/charts/metrics_comparison.png +0 -0
- modules/mute-agent/charts/scenario_breakdown.png +0 -0
- modules/mute-agent/charts/trace_attack_blocked.html +140 -0
- modules/mute-agent/charts/trace_attack_blocked.png +0 -0
- modules/mute-agent/charts/trace_failure.html +140 -0
- modules/mute-agent/charts/trace_failure.png +0 -0
- modules/mute-agent/charts/trace_success.html +140 -0
- modules/mute-agent/charts/trace_success.png +0 -0
- modules/mute-agent/examples/__init__.py +1 -0
- modules/mute-agent/examples/advanced_example.py +384 -0
- modules/mute-agent/examples/graph_debugger_demo.py +241 -0
- modules/mute-agent/examples/listener_example.py +297 -0
- modules/mute-agent/examples/simple_example.py +242 -0
- modules/mute-agent/examples/steel_man_demo.py +297 -0
- modules/mute-agent/experiments/README.md +135 -0
- modules/mute-agent/experiments/__init__.py +3 -0
- modules/mute-agent/experiments/agent_comparison.csv +6 -0
- modules/mute-agent/experiments/agent_comparison_50runs.csv +6 -0
- modules/mute-agent/experiments/ambiguity_test.py +335 -0
- modules/mute-agent/experiments/ambiguity_test_results.csv +31 -0
- modules/mute-agent/experiments/ambiguity_test_results_50runs.csv +51 -0
- modules/mute-agent/experiments/baseline_agent.py +189 -0
- modules/mute-agent/experiments/benchmark.py +402 -0
- modules/mute-agent/experiments/demo.py +172 -0
- modules/mute-agent/experiments/generate_cost_curve.py +474 -0
- modules/mute-agent/experiments/jailbreak_test.py +137 -0
- modules/mute-agent/experiments/latent_state_scenario.py +361 -0
- modules/mute-agent/experiments/mute_agent_experiment.py +349 -0
- modules/mute-agent/experiments/run_extended_experiment.py +40 -0
- modules/mute-agent/experiments/run_v2_experiments.py +266 -0
- modules/mute-agent/experiments/run_v2_experiments_auto.py +247 -0
- modules/mute-agent/experiments/v2_scenarios/README.md +214 -0
- modules/mute-agent/experiments/v2_scenarios/__init__.py +4 -0
- modules/mute-agent/experiments/v2_scenarios/scenario_1_deep_dependency.py +325 -0
- modules/mute-agent/experiments/v2_scenarios/scenario_2_adversarial.py +328 -0
- modules/mute-agent/experiments/v2_scenarios/scenario_3_false_positive.py +303 -0
- modules/mute-agent/experiments/v2_scenarios/scenario_4_performance.py +319 -0
- modules/mute-agent/experiments/visualize.py +400 -0
- modules/mute-agent/mute_agent/__init__.py +66 -0
- modules/mute-agent/mute_agent/core/__init__.py +1 -0
- modules/mute-agent/mute_agent/core/execution_agent.py +164 -0
- modules/mute-agent/mute_agent/core/handshake_protocol.py +199 -0
- modules/mute-agent/mute_agent/core/reasoning_agent.py +236 -0
- modules/mute-agent/mute_agent/knowledge_graph/__init__.py +1 -0
- modules/mute-agent/mute_agent/knowledge_graph/graph_elements.py +63 -0
- modules/mute-agent/mute_agent/knowledge_graph/multidimensional_graph.py +168 -0
- modules/mute-agent/mute_agent/knowledge_graph/subgraph.py +222 -0
- modules/mute-agent/mute_agent/listener/__init__.py +41 -0
- modules/mute-agent/mute_agent/listener/adapters/__init__.py +29 -0
- modules/mute-agent/mute_agent/listener/adapters/base_adapter.py +187 -0
- modules/mute-agent/mute_agent/listener/adapters/caas_adapter.py +342 -0
- modules/mute-agent/mute_agent/listener/adapters/control_plane_adapter.py +434 -0
- modules/mute-agent/mute_agent/listener/adapters/iatp_adapter.py +330 -0
- modules/mute-agent/mute_agent/listener/adapters/scak_adapter.py +249 -0
- modules/mute-agent/mute_agent/listener/listener.py +608 -0
- modules/mute-agent/mute_agent/listener/state_observer.py +434 -0
- modules/mute-agent/mute_agent/listener/threshold_config.py +311 -0
- modules/mute-agent/mute_agent/super_system/__init__.py +1 -0
- modules/mute-agent/mute_agent/super_system/router.py +202 -0
- modules/mute-agent/mute_agent/visualization/__init__.py +8 -0
- modules/mute-agent/mute_agent/visualization/graph_debugger.py +495 -0
- modules/mute-agent/requirements-dev.txt +6 -0
- modules/mute-agent/requirements.txt +9 -0
- modules/mute-agent/setup.py +64 -0
- modules/mute-agent/src/__init__.py +0 -0
- modules/mute-agent/src/agents/__init__.py +0 -0
- modules/mute-agent/src/agents/baseline_agent.py +524 -0
- modules/mute-agent/src/agents/interactive_agent.py +113 -0
- modules/mute-agent/src/agents/mute_agent.py +622 -0
- modules/mute-agent/src/benchmarks/__init__.py +0 -0
- modules/mute-agent/src/benchmarks/evaluator.py +481 -0
- modules/mute-agent/src/benchmarks/scenarios.json +985 -0
- modules/mute-agent/src/core/__init__.py +0 -0
- modules/mute-agent/src/core/mock_state.py +320 -0
- modules/mute-agent/src/core/tools.py +441 -0
- modules/nexus/__init__.py +49 -0
- modules/nexus/arbiter.py +357 -0
- modules/nexus/client.py +464 -0
- modules/nexus/dmz.py +417 -0
- modules/nexus/escrow.py +428 -0
- modules/nexus/exceptions.py +284 -0
- modules/nexus/registry.py +391 -0
- modules/nexus/reputation.py +423 -0
- modules/nexus/schemas/__init__.py +49 -0
- modules/nexus/schemas/compliance.py +274 -0
- modules/nexus/schemas/escrow.py +249 -0
- modules/nexus/schemas/manifest.py +223 -0
- modules/nexus/schemas/receipt.py +206 -0
- modules/observability/README.md +192 -0
- modules/observability/alertmanager/alertmanager.yml +116 -0
- modules/observability/alerts/agent-os-alerts.yaml +197 -0
- modules/observability/docker-compose.yml +128 -0
- modules/observability/grafana/dashboards/agent-os-amb.json +448 -0
- modules/observability/grafana/dashboards/agent-os-cmvk.json +441 -0
- modules/observability/grafana/dashboards/agent-os-overview.json +268 -0
- modules/observability/grafana/dashboards/agent-os-performance.json +15 -0
- modules/observability/grafana/dashboards/agent-os-safety.json +50 -0
- modules/observability/grafana/provisioning/dashboards/dashboards.yml +15 -0
- modules/observability/grafana/provisioning/datasources/datasources.yml +33 -0
- modules/observability/otel/otel-collector-config.yml +61 -0
- modules/observability/prometheus/prometheus.yml +63 -0
- modules/observability/pyproject.toml +53 -0
- modules/observability/scripts/export_dashboards.py +55 -0
- modules/observability/src/agent_os_observability/__init__.py +25 -0
- modules/observability/src/agent_os_observability/dashboards.py +896 -0
- modules/observability/src/agent_os_observability/metrics.py +396 -0
- modules/observability/src/agent_os_observability/server.py +221 -0
- modules/observability/src/agent_os_observability/tracer.py +226 -0
- modules/primitives/.gitignore +8 -0
- modules/primitives/README.md +62 -0
- modules/primitives/agent_primitives/__init__.py +22 -0
- modules/primitives/agent_primitives/failures.py +82 -0
- modules/primitives/agent_primitives/py.typed +0 -0
- modules/primitives/pyproject.toml +68 -0
- modules/scak/.github/copilot-instructions.md +396 -0
- modules/scak/.github/workflows/release.yml +117 -0
- modules/scak/.gitignore +32 -0
- modules/scak/CHANGELOG.md +173 -0
- modules/scak/CITATION.cff +62 -0
- modules/scak/CONTRIBUTING.md +429 -0
- modules/scak/Dockerfile +58 -0
- modules/scak/ENTERPRISE_FEATURES.md +518 -0
- modules/scak/IMPLEMENTATION_SUMMARY.md +206 -0
- modules/scak/LIMITATIONS.md +565 -0
- modules/scak/MANIFEST.in +16 -0
- modules/scak/NOVELTY.md +535 -0
- modules/scak/README.md +928 -0
- modules/scak/RESEARCH.md +670 -0
- modules/scak/agent_kernel/__init__.py +66 -0
- modules/scak/agent_kernel/analyzer.py +432 -0
- modules/scak/agent_kernel/auditor.py +31 -0
- modules/scak/agent_kernel/completeness_auditor.py +234 -0
- modules/scak/agent_kernel/detector.py +200 -0
- modules/scak/agent_kernel/kernel.py +741 -0
- modules/scak/agent_kernel/memory_manager.py +82 -0
- modules/scak/agent_kernel/models.py +372 -0
- modules/scak/agent_kernel/nudge_mechanism.py +260 -0
- modules/scak/agent_kernel/outcome_analyzer.py +335 -0
- modules/scak/agent_kernel/patcher.py +579 -0
- modules/scak/agent_kernel/semantic_analyzer.py +313 -0
- modules/scak/agent_kernel/semantic_purge.py +346 -0
- modules/scak/agent_kernel/simulator.py +447 -0
- modules/scak/agent_kernel/teacher.py +82 -0
- modules/scak/agent_kernel/triage.py +149 -0
- modules/scak/build_and_publish.ps1 +74 -0
- modules/scak/build_and_publish.sh +74 -0
- modules/scak/cli.py +471 -0
- modules/scak/dashboard.py +462 -0
- modules/scak/datasets/DATASET_CARD.md +219 -0
- modules/scak/datasets/README.md +143 -0
- modules/scak/datasets/gaia_vague_queries/vague_queries.json +262 -0
- modules/scak/datasets/hf_upload/README.md +219 -0
- modules/scak/datasets/hf_upload/scak_gaia_laziness.jsonl +50 -0
- modules/scak/datasets/prepare_hf_datasets.py +145 -0
- modules/scak/datasets/red_team/jailbreak_patterns.json +202 -0
- modules/scak/docker-compose.yml +99 -0
- modules/scak/docs/Adaptive-Memory-Hierarchy.md +319 -0
- modules/scak/docs/Data-Contracts-and-Schemas.md +285 -0
- modules/scak/docs/Dual-Loop-Architecture.md +344 -0
- modules/scak/docs/Enhanced-Features.md +612 -0
- modules/scak/docs/LANGCHAIN_INTEGRATION.md +572 -0
- modules/scak/docs/README.md +128 -0
- modules/scak/docs/Reference-Implementations.md +163 -0
- modules/scak/docs/SCAK_V2.md +374 -0
- modules/scak/docs/Three-Failure-Types.md +178 -0
- modules/scak/examples/basic_example.py +155 -0
- modules/scak/examples/circuit_breaker_lazy_eval_demo.py +243 -0
- modules/scak/examples/langchain_integration_example.py +339 -0
- modules/scak/examples/layer4_demo.py +243 -0
- modules/scak/examples/production_features_demo.py +353 -0
- modules/scak/examples/quick_demo.py +79 -0
- modules/scak/examples/scak_v2_demo.py +252 -0
- modules/scak/experiments/README.md +438 -0
- modules/scak/experiments/ablation_studies/README.md +192 -0
- modules/scak/experiments/ablation_studies/ablation_no_audit.py +116 -0
- modules/scak/experiments/ablation_studies/ablation_no_purge.py +133 -0
- modules/scak/experiments/chaos_engineering/README.md +332 -0
- modules/scak/experiments/context_efficiency_test.py +328 -0
- modules/scak/experiments/gaia_benchmark/README.md +208 -0
- modules/scak/experiments/laziness_benchmark.py +179 -0
- modules/scak/experiments/long_horizon_task_experiment.py +252 -0
- modules/scak/experiments/multi_agent_rag_experiment.py +284 -0
- modules/scak/experiments/results/ablation_table.md +12 -0
- modules/scak/experiments/results/long_horizon.json +36 -0
- modules/scak/experiments/results/multi_agent_rag.json +66 -0
- modules/scak/experiments/run_comprehensive_ablations.py +332 -0
- modules/scak/experiments/test_auditor_patcher_integration.py +251 -0
- modules/scak/notebooks/getting_started.ipynb +33 -0
- modules/scak/paper/ARXIV_SUBMISSION_METADATA.txt +109 -0
- modules/scak/paper/PAPER_CHECKLIST.md +304 -0
- modules/scak/paper/Paper.pdf +0 -0
- modules/scak/paper/README.md +113 -0
- modules/scak/paper/appendix.md +351 -0
- modules/scak/paper/arxiv/bibliography.bib +284 -0
- modules/scak/paper/arxiv/fig1_ooda_architecture.pdf +0 -0
- modules/scak/paper/arxiv/fig2_memory_hierarchy.pdf +0 -0
- modules/scak/paper/arxiv/fig3_gaia_results.pdf +0 -0
- modules/scak/paper/arxiv/fig4_ablation_heatmap.pdf +0 -0
- modules/scak/paper/arxiv/fig5_context_reduction.pdf +0 -0
- modules/scak/paper/arxiv/fig6_mttr_boxplot.pdf +0 -0
- modules/scak/paper/arxiv/main.aux +103 -0
- modules/scak/paper/arxiv/main.bbl +113 -0
- modules/scak/paper/arxiv/main.blg +55 -0
- modules/scak/paper/arxiv/main.out +31 -0
- modules/scak/paper/arxiv/main.pdf +0 -0
- modules/scak/paper/arxiv/main.tex +482 -0
- modules/scak/paper/arxiv_submission/bibliography.bib +284 -0
- modules/scak/paper/arxiv_submission/fig1_ooda_architecture.pdf +0 -0
- modules/scak/paper/arxiv_submission/fig2_memory_hierarchy.pdf +0 -0
- modules/scak/paper/arxiv_submission/fig3_gaia_results.pdf +0 -0
- modules/scak/paper/arxiv_submission/fig4_ablation_heatmap.pdf +0 -0
- modules/scak/paper/arxiv_submission/fig5_context_reduction.pdf +0 -0
- modules/scak/paper/arxiv_submission/fig6_mttr_boxplot.pdf +0 -0
- modules/scak/paper/arxiv_submission/main.aux +103 -0
- modules/scak/paper/arxiv_submission/main.bbl +113 -0
- modules/scak/paper/arxiv_submission/main.blg +55 -0
- modules/scak/paper/arxiv_submission/main.out +31 -0
- modules/scak/paper/arxiv_submission/main.pdf +0 -0
- modules/scak/paper/arxiv_submission/main.tex +482 -0
- modules/scak/paper/arxiv_submission.tar.gz +0 -0
- modules/scak/paper/bibliography.bib +284 -0
- modules/scak/paper/build.sh +55 -0
- modules/scak/paper/figures/README.md +32 -0
- modules/scak/paper/figures/fig1_ooda_architecture.md +75 -0
- modules/scak/paper/figures/fig1_ooda_architecture.pdf +0 -0
- modules/scak/paper/figures/fig1_ooda_architecture.png +0 -0
- modules/scak/paper/figures/fig2_memory_hierarchy.md +83 -0
- modules/scak/paper/figures/fig2_memory_hierarchy.pdf +0 -0
- modules/scak/paper/figures/fig2_memory_hierarchy.png +0 -0
- modules/scak/paper/figures/fig3_gaia_results.md +64 -0
- modules/scak/paper/figures/fig3_gaia_results.pdf +0 -0
- modules/scak/paper/figures/fig3_gaia_results.png +0 -0
- modules/scak/paper/figures/fig4_ablation_heatmap.md +64 -0
- modules/scak/paper/figures/fig4_ablation_heatmap.pdf +0 -0
- modules/scak/paper/figures/fig4_ablation_heatmap.png +0 -0
- modules/scak/paper/figures/fig5_context_reduction.md +71 -0
- modules/scak/paper/figures/fig5_context_reduction.pdf +0 -0
- modules/scak/paper/figures/fig5_context_reduction.png +0 -0
- modules/scak/paper/figures/fig6_mttr_boxplot.md +80 -0
- modules/scak/paper/figures/fig6_mttr_boxplot.pdf +0 -0
- modules/scak/paper/figures/fig6_mttr_boxplot.png +0 -0
- modules/scak/paper/figures/generate_figures.py +463 -0
- modules/scak/paper/main.aux +103 -0
- modules/scak/paper/main.bbl +113 -0
- modules/scak/paper/main.blg +55 -0
- modules/scak/paper/main.md +192 -0
- modules/scak/paper/main.out +31 -0
- modules/scak/paper/main.pdf +0 -0
- modules/scak/paper/main.tex +482 -0
- modules/scak/reproducibility/ABLATIONS.md +225 -0
- modules/scak/reproducibility/Dockerfile.reproducibility +34 -0
- modules/scak/reproducibility/README.md +421 -0
- modules/scak/reproducibility/requirements-pinned.txt +32 -0
- modules/scak/reproducibility/run_all_experiments.py +395 -0
- modules/scak/reproducibility/seed_control.py +53 -0
- modules/scak/reproducibility/statistical_analysis.py +302 -0
- modules/scak/requirements.txt +50 -0
- modules/scak/setup.py +93 -0
- modules/scak/src/__init__.py +124 -0
- modules/scak/src/agents/__init__.py +13 -0
- modules/scak/src/agents/conflict_resolution.py +732 -0
- modules/scak/src/agents/orchestrator.py +761 -0
- modules/scak/src/agents/pubsub.py +484 -0
- modules/scak/src/agents/shadow_teacher.py +344 -0
- modules/scak/src/agents/swarm.py +661 -0
- modules/scak/src/agents/worker.py +357 -0
- modules/scak/src/integrations/__init__.py +81 -0
- modules/scak/src/integrations/cmvk_adapter.py +430 -0
- modules/scak/src/integrations/control_plane_adapter.py +601 -0
- modules/scak/src/integrations/langchain_integration.py +902 -0
- modules/scak/src/interfaces/__init__.py +59 -0
- modules/scak/src/interfaces/llm_clients.py +505 -0
- modules/scak/src/interfaces/openapi_tools.py +611 -0
- modules/scak/src/interfaces/plugin_system.py +605 -0
- modules/scak/src/interfaces/protocols.py +365 -0
- modules/scak/src/interfaces/telemetry.py +464 -0
- modules/scak/src/interfaces/tool_registry.py +547 -0
- modules/scak/src/kernel/__init__.py +100 -0
- modules/scak/src/kernel/auditor.py +305 -0
- modules/scak/src/kernel/circuit_breaker.py +398 -0
- modules/scak/src/kernel/core.py +724 -0
- modules/scak/src/kernel/distributed.py +667 -0
- modules/scak/src/kernel/evolution.py +455 -0
- modules/scak/src/kernel/failover.py +621 -0
- modules/scak/src/kernel/governance.py +710 -0
- modules/scak/src/kernel/governance_v2.py +603 -0
- modules/scak/src/kernel/lazy_evaluator.py +514 -0
- modules/scak/src/kernel/load_testing.py +633 -0
- modules/scak/src/kernel/memory.py +945 -0
- modules/scak/src/kernel/patcher.py +581 -0
- modules/scak/src/kernel/rubric.py +419 -0
- modules/scak/src/kernel/schemas.py +390 -0
- modules/scak/src/kernel/skill_mapper.py +309 -0
- modules/scak/src/kernel/triage.py +149 -0
- modules/scak/src/mocks/__init__.py +99 -0
- modules/scak/tests/__init__.py +1 -0
- modules/scak/tests/test_circuit_breaker.py +403 -0
- modules/scak/tests/test_conflict_resolution.py +287 -0
- modules/scak/tests/test_dual_loop.py +463 -0
- modules/scak/tests/test_enhanced_features.py +421 -0
- modules/scak/tests/test_failover_and_load.py +438 -0
- modules/scak/tests/test_governance.py +185 -0
- modules/scak/tests/test_kernel.py +359 -0
- modules/scak/tests/test_langchain_integration.py +451 -0
- modules/scak/tests/test_lazy_evaluator.py +465 -0
- modules/scak/tests/test_llm_clients.py +122 -0
- modules/scak/tests/test_memory_controller.py +528 -0
- modules/scak/tests/test_orchestrator.py +181 -0
- modules/scak/tests/test_phase3_integration.py +265 -0
- modules/scak/tests/test_pubsub_swarm.py +203 -0
- modules/scak/tests/test_reference_implementations.py +240 -0
- modules/scak/tests/test_rubric.py +363 -0
- modules/scak/tests/test_scak_v2.py +651 -0
- modules/scak/tests/test_skill_mapper.py +217 -0
- modules/scak/tests/test_specific_failures.py +393 -0
- modules/scak/tests/test_tool_registry.py +264 -0
- modules/scak/tests/test_tools_and_plugins.py +303 -0
- modules/scak/tests/test_triage.py +596 -0
- modules/scak/tests/test_write_through.py +319 -0
- agent_os_kernel-1.1.0.dist-info/METADATA +0 -400
- agent_os_kernel-1.1.0.dist-info/RECORD +0 -12
- {agent_os_kernel-1.1.0.dist-info → agent_os_kernel-1.2.0.dist-info}/WHEEL +0 -0
- {agent_os_kernel-1.1.0.dist-info → agent_os_kernel-1.2.0.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,351 @@
|
|
|
1
|
+
# Paper Appendix Materials
|
|
2
|
+
|
|
3
|
+
**Self-Correcting Agent Kernel (SCAK)**
|
|
4
|
+
**Version:** 1.1.0
|
|
5
|
+
**Date:** 2026-01-18
|
|
6
|
+
|
|
7
|
+
This document contains supplementary materials for the SCAK paper submission, including ablation studies, reproduction commands, statistical methodology, and experimental configurations.
|
|
8
|
+
|
|
9
|
+
---
|
|
10
|
+
|
|
11
|
+
## Appendix A: Ablation Study Results
|
|
12
|
+
|
|
13
|
+
### A.1 Main Ablation Table (GAIA Benchmark)
|
|
14
|
+
|
|
15
|
+
**Setup:** 50 vague queries, 5 runs per configuration, seed 42
|
|
16
|
+
|
|
17
|
+
| Configuration | Detection Rate (mean ± std) | Correction Rate (mean ± std) | Post-Patch Success (mean ± std) | p-value vs. Full | Cohen's d |
|
|
18
|
+
|--------------|----------------------------|-----------------------------|---------------------------------|------------------|-----------|
|
|
19
|
+
| **Full SCAK** (baseline) | 100.0% ± 0.0 | 72.0% ± 4.2 | 82.0% ± 3.1 | — | — |
|
|
20
|
+
| No Semantic Purge | 100.0% ± 0.0 | 68.0% ± 5.1 | 75.0% ± 4.5 | 0.042* | 0.86 |
|
|
21
|
+
| No Teacher Model (o1) | 45.0% ± 8.3 | 28.0% ± 6.7 | 41.0% ± 7.2 | <0.001*** | 7.89 |
|
|
22
|
+
| No Tiered Memory | 92.0% ± 3.4 | 55.0% ± 7.9 | 68.0% ± 5.6 | 0.003** | 2.68 |
|
|
23
|
+
| No Differential Audit | 0.0% ± 0.0 | 0.0% ± 0.0 | 8.0% ± 2.1 | <0.001*** | ∞ |
|
|
24
|
+
| Self-Critique (no teacher) | 100.0% ± 0.0 | 40.0% ± 6.2 | 52.0% ± 5.8 | <0.001*** | 6.04 |
|
|
25
|
+
|
|
26
|
+
*Significance: `*` p<0.05, `**` p<0.01, `***` p<0.001*
|
|
27
|
+
|
|
28
|
+
### A.2 Context Efficiency Ablation (Amnesia Test)
|
|
29
|
+
|
|
30
|
+
| Configuration | Initial Tokens | After 50 Patches | After Model Upgrade | Reduction % |
|
|
31
|
+
|--------------|----------------|------------------|---------------------|-------------|
|
|
32
|
+
| Full SCAK | 800 | 1,600 | 880 | 45.0% |
|
|
33
|
+
| No Semantic Purge | 800 | 1,600 | 1,600 | 0.0% |
|
|
34
|
+
| No Type A/B Classification | 800 | 1,600 | 1,200 | 25.0% |
|
|
35
|
+
|
|
36
|
+
### A.3 Chaos Engineering Ablation (MTTR)
|
|
37
|
+
|
|
38
|
+
| Configuration | MTTR (mean ± std) | Recovery Rate (mean ± std) | p-value vs. Full |
|
|
39
|
+
|--------------|-------------------|---------------------------|------------------|
|
|
40
|
+
| Full SCAK | 28s ± 6 | 85% ± 7 | — |
|
|
41
|
+
| No Patcher Rollback | 45s ± 12 | 70% ± 9 | 0.008** |
|
|
42
|
+
| No Triage Engine | 52s ± 15 | 62% ± 11 | 0.002** |
|
|
43
|
+
|
|
44
|
+
### A.4 Raw Data: Correction Rates (5 runs)
|
|
45
|
+
|
|
46
|
+
| Run | Full SCAK | No Purge | No Teacher | No Tiered | No Audit | Self-Critique |
|
|
47
|
+
|-----|-----------|----------|------------|-----------|----------|---------------|
|
|
48
|
+
| 1 | 0.70 | 0.66 | 0.26 | 0.52 | 0.00 | 0.38 |
|
|
49
|
+
| 2 | 0.74 | 0.72 | 0.32 | 0.58 | 0.00 | 0.44 |
|
|
50
|
+
| 3 | 0.68 | 0.64 | 0.22 | 0.48 | 0.00 | 0.36 |
|
|
51
|
+
| 4 | 0.76 | 0.70 | 0.30 | 0.60 | 0.00 | 0.42 |
|
|
52
|
+
| 5 | 0.72 | 0.68 | 0.30 | 0.57 | 0.00 | 0.40 |
|
|
53
|
+
|
|
54
|
+
---
|
|
55
|
+
|
|
56
|
+
## Appendix B: Reproduction Commands
|
|
57
|
+
|
|
58
|
+
### B.1 Environment Setup
|
|
59
|
+
|
|
60
|
+
```bash
|
|
61
|
+
# Clone repository
|
|
62
|
+
git clone https://github.com/imran-siddique/self-correcting-agent-kernel.git
|
|
63
|
+
cd self-correcting-agent-kernel
|
|
64
|
+
|
|
65
|
+
# Install dependencies
|
|
66
|
+
pip install scak[all]
|
|
67
|
+
|
|
68
|
+
# Set API keys
|
|
69
|
+
export OPENAI_API_KEY="sk-..."
|
|
70
|
+
export ANTHROPIC_API_KEY="sk-ant-..."
|
|
71
|
+
|
|
72
|
+
# Verify installation
|
|
73
|
+
python -c "from src.kernel.auditor import CompletenessAuditor; print('OK')"
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
### B.2 Main Experiments
|
|
77
|
+
|
|
78
|
+
```bash
|
|
79
|
+
# Set global seed
|
|
80
|
+
python -c "from reproducibility.seed_control import set_seeds; set_seeds(42)"
|
|
81
|
+
|
|
82
|
+
# GAIA Laziness Benchmark (Experiment A)
|
|
83
|
+
python experiments/gaia_benchmark/run_benchmark.py \
|
|
84
|
+
--queries datasets/gaia_vague_queries/vague_queries.json \
|
|
85
|
+
--output results/gaia_results.json \
|
|
86
|
+
--seed 42 \
|
|
87
|
+
--runs 5
|
|
88
|
+
|
|
89
|
+
# Chaos Engineering (Experiment C)
|
|
90
|
+
python experiments/chaos_engineering/run_chaos.py \
|
|
91
|
+
--scenarios datasets/chaos_scenarios/schema_failures.json \
|
|
92
|
+
--output results/chaos_results.json \
|
|
93
|
+
--seed 42
|
|
94
|
+
|
|
95
|
+
# Amnesia Test (Experiment B)
|
|
96
|
+
python experiments/amnesia_test.py \
|
|
97
|
+
--patches datasets/patches/synthetic_patches.json \
|
|
98
|
+
--old-model gpt-4o \
|
|
99
|
+
--new-model gpt-5 \
|
|
100
|
+
--output results/amnesia_results.json \
|
|
101
|
+
--seed 42
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
### B.3 Ablation Studies
|
|
105
|
+
|
|
106
|
+
```bash
|
|
107
|
+
# No Semantic Purge
|
|
108
|
+
python experiments/ablation_studies/ablation_no_purge.py \
|
|
109
|
+
--seed 42 --runs 5 --output results/ablation_no_purge.json
|
|
110
|
+
|
|
111
|
+
# No Differential Auditing
|
|
112
|
+
python experiments/ablation_studies/ablation_no_audit.py \
|
|
113
|
+
--seed 42 --runs 5 --output results/ablation_no_audit.json
|
|
114
|
+
|
|
115
|
+
# Full ablation suite
|
|
116
|
+
python experiments/run_comprehensive_ablations.py \
|
|
117
|
+
--seed 42 --runs 5 --output results/ablation_suite.json
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
### B.4 Statistical Analysis
|
|
121
|
+
|
|
122
|
+
```bash
|
|
123
|
+
python reproducibility/statistical_analysis.py \
|
|
124
|
+
--treatment results/gaia_results.json \
|
|
125
|
+
--control results/baseline_gpt4o.json \
|
|
126
|
+
--output results/statistical_report.json
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
### B.5 Docker Reproduction (Recommended)
|
|
130
|
+
|
|
131
|
+
```bash
|
|
132
|
+
cd reproducibility
|
|
133
|
+
docker build -t scak-repro:1.0 -f Dockerfile.reproducibility .
|
|
134
|
+
docker run --rm \
|
|
135
|
+
-e OPENAI_API_KEY=$OPENAI_API_KEY \
|
|
136
|
+
-v $(pwd)/results:/results \
|
|
137
|
+
scak-repro:1.0 python run_all_experiments.py --seed 42 --runs 5
|
|
138
|
+
```
|
|
139
|
+
|
|
140
|
+
---
|
|
141
|
+
|
|
142
|
+
## Appendix C: Statistical Methodology
|
|
143
|
+
|
|
144
|
+
### C.1 Hypothesis Testing
|
|
145
|
+
|
|
146
|
+
**Primary Test:** Welch's two-sample t-test (unequal variances)
|
|
147
|
+
|
|
148
|
+
```python
|
|
149
|
+
from scipy import stats
|
|
150
|
+
|
|
151
|
+
# Example: Full SCAK vs No Semantic Purge
|
|
152
|
+
full_scak = [0.70, 0.74, 0.68, 0.76, 0.72]
|
|
153
|
+
no_purge = [0.66, 0.72, 0.64, 0.70, 0.68]
|
|
154
|
+
|
|
155
|
+
t_stat, p_value = stats.ttest_ind(full_scak, no_purge, equal_var=False)
|
|
156
|
+
# Result: t=2.000, p=0.0805 (df=8)
|
|
157
|
+
```
|
|
158
|
+
|
|
159
|
+
**Alternative Test:** Mann-Whitney U test (non-parametric; used when the normality assumption is violated)
|
|
160
|
+
|
|
161
|
+
```python
|
|
162
|
+
u_stat, p_value = stats.mannwhitneyu(full_scak, no_purge, alternative='two-sided')
|
|
163
|
+
```
|
|
164
|
+
|
|
165
|
+
### C.2 Multiple Comparison Correction
|
|
166
|
+
|
|
167
|
+
**Method:** Bonferroni correction for 5 ablation comparisons
|
|
168
|
+
|
|
169
|
+
```python
|
|
170
|
+
alpha_original = 0.05
|
|
171
|
+
n_comparisons = 5
|
|
172
|
+
alpha_corrected = alpha_original / n_comparisons # 0.01
|
|
173
|
+
|
|
174
|
+
# All reported p-values significant at α=0.05;
|
|
175
|
+
# After Bonferroni: No Teacher, No Tiered Memory, No Audit, and Self-Critique remain significant at α=0.01; No Semantic Purge does not
|
|
176
|
+
```
|
|
177
|
+
|
|
178
|
+
### C.3 Effect Size (Cohen's d)
|
|
179
|
+
|
|
180
|
+
```python
|
|
181
|
+
import numpy as np
|
|
182
|
+
|
|
183
|
+
def cohens_d(group1, group2):
|
|
184
|
+
n1, n2 = len(group1), len(group2)
|
|
185
|
+
var1, var2 = np.var(group1, ddof=1), np.var(group2, ddof=1)
|
|
186
|
+
pooled_std = np.sqrt(((n1-1)*var1 + (n2-1)*var2) / (n1+n2-2))
|
|
187
|
+
return (np.mean(group1) - np.mean(group2)) / pooled_std
|
|
188
|
+
|
|
189
|
+
# Interpretation:
|
|
190
|
+
# |d| < 0.2: negligible
|
|
191
|
+
# |d| 0.2-0.5: small
|
|
192
|
+
# |d| 0.5-0.8: medium
|
|
193
|
+
# |d| > 0.8: large
|
|
194
|
+
```
|
|
195
|
+
|
|
196
|
+
### C.4 Confidence Intervals (Bootstrap)
|
|
197
|
+
|
|
198
|
+
```python
|
|
199
|
+
import numpy as np
|
|
200
|
+
|
|
201
|
+
def bootstrap_ci(data, n_bootstrap=10000, alpha=0.05):
|
|
202
|
+
rng = np.random.default_rng(42)
|
|
203
|
+
means = [np.mean(rng.choice(data, len(data), replace=True))
|
|
204
|
+
for _ in range(n_bootstrap)]
|
|
205
|
+
return np.percentile(means, [100*alpha/2, 100*(1-alpha/2)])
|
|
206
|
+
|
|
207
|
+
# Example: 95% CI for correction rate
|
|
208
|
+
ci = bootstrap_ci([0.70, 0.74, 0.68, 0.76, 0.72])
|
|
209
|
+
# Result: [0.696, 0.744]
|
|
210
|
+
```
|
|
211
|
+
|
|
212
|
+
---
|
|
213
|
+
|
|
214
|
+
## Appendix D: Experimental Configuration
|
|
215
|
+
|
|
216
|
+
### D.1 Hardware Specifications
|
|
217
|
+
|
|
218
|
+
| Component | Specification |
|
|
219
|
+
|-----------|--------------|
|
|
220
|
+
| **CPU** | Intel Xeon E5-2686 v4 @ 2.30GHz (8 cores) |
|
|
221
|
+
| **RAM** | 32 GB |
|
|
222
|
+
| **GPU** | None (CPU-only, LLM via API) |
|
|
223
|
+
| **Disk** | 100 GB SSD |
|
|
224
|
+
| **Network** | 1 Gbps |
|
|
225
|
+
| **Cloud** | AWS EC2 c5.2xlarge |
|
|
226
|
+
| **Region** | us-east-1 |
|
|
227
|
+
|
|
228
|
+
### D.2 Software Versions
|
|
229
|
+
|
|
230
|
+
| Package | Version |
|
|
231
|
+
|---------|---------|
|
|
232
|
+
| Python | 3.10.12 |
|
|
233
|
+
| pydantic | 2.5.3 |
|
|
234
|
+
| openai | 1.7.2 |
|
|
235
|
+
| anthropic | 0.8.1 |
|
|
236
|
+
| scipy | 1.11.3 |
|
|
237
|
+
| numpy | 1.24.3 |
|
|
238
|
+
| pytest | 7.4.3 |
|
|
239
|
+
|
|
240
|
+
### D.3 LLM Model Versions
|
|
241
|
+
|
|
242
|
+
| Role | Model | Snapshot |
|
|
243
|
+
|------|-------|----------|
|
|
244
|
+
| Weak Agent | OpenAI GPT-4o | gpt-4o-2024-08-06 |
|
|
245
|
+
| Teacher (Auditor) | OpenAI o1-preview | o1-preview-2024-09-12 |
|
|
246
|
+
| Alternative Teacher | Anthropic Claude | claude-3-5-sonnet-20241022 |
|
|
247
|
+
|
|
248
|
+
### D.4 Seed Configuration
|
|
249
|
+
|
|
250
|
+
```python
|
|
251
|
+
# reproducibility/seed_control.py
|
|
252
|
+
GLOBAL_SEED = 42
|
|
253
|
+
|
|
254
|
+
import random
|
|
255
|
+
import numpy as np
|
|
256
|
+
import os
|
|
257
|
+
|
|
258
|
+
random.seed(GLOBAL_SEED)
|
|
259
|
+
np.random.seed(GLOBAL_SEED)
|
|
260
|
+
os.environ['PYTHONHASHSEED'] = str(GLOBAL_SEED)
|
|
261
|
+
|
|
262
|
+
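# Note: PYTHONHASHSEED is read at interpreter startup, so setting it here only affects child processes (export it before launching Python); LLM API calls remain non-deterministic (±2% variance)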
|
|
263
|
+
```
|
|
264
|
+
|
|
265
|
+
### D.5 API Cost Breakdown
|
|
266
|
+
|
|
267
|
+
| Experiment | Queries | Teacher Calls | GPT-4o Cost | o1-preview Cost | Total |
|
|
268
|
+
|------------|---------|---------------|-------------|-----------------|-------|
|
|
269
|
+
| GAIA Benchmark | 50 | 36 | $1.25 | $6.00 | $7.25 |
|
|
270
|
+
| Chaos Engineering | 20 | 17 | $0.50 | $1.50 | $2.00 |
|
|
271
|
+
| Amnesia Test | N/A | 5 | $0.25 | $0.50 | $0.75 |
|
|
272
|
+
| Ablations (5 configs) | 250 | 180 | $6.25 | $30.00 | $36.25 |
|
|
273
|
+
| **Total** | **320** | **238** | **$8.25** | **$38.00** | **$46.25** |
|
|
274
|
+
|
|
275
|
+
*Prices based on OpenAI pricing as of 2026-01-18*
|
|
276
|
+
|
|
277
|
+
---
|
|
278
|
+
|
|
279
|
+
## Appendix E: Dataset Details
|
|
280
|
+
|
|
281
|
+
### E.1 GAIA Laziness Benchmark
|
|
282
|
+
|
|
283
|
+
- **Source:** Extended from GAIA (General AI Assistants) benchmark
|
|
284
|
+
- **Size:** 50 vague queries
|
|
285
|
+
- **Categories:**
|
|
286
|
+
- Archived Resources (20): Data exists in archives
|
|
287
|
+
- Renamed Entities (15): Resources were renamed
|
|
288
|
+
- Time-Based Confusion (10): "recent", "latest", "last week"
|
|
289
|
+
- Synonym Issues (5): Different terminology
|
|
290
|
+
- **HuggingFace:** `imran-siddique/scak_gaia_laziness`
|
|
291
|
+
|
|
292
|
+
### E.2 Red-Team Security Benchmark
|
|
293
|
+
|
|
294
|
+
- **Size:** 60 adversarial prompts
|
|
295
|
+
- **Categories:**
|
|
296
|
+
- Jailbreak Attempts (20)
|
|
297
|
+
- Prompt Injection (15)
|
|
298
|
+
- PII Extraction (10)
|
|
299
|
+
- Harmful Content (10)
|
|
300
|
+
- Role-Play Exploits (5)
|
|
301
|
+
- **HuggingFace:** `imran-siddique/scak_red_team`
|
|
302
|
+
|
|
303
|
+
### E.3 Chaos Engineering Scenarios
|
|
304
|
+
|
|
305
|
+
- **Size:** 20 failure scenarios
|
|
306
|
+
- **Types:**
|
|
307
|
+
- Database schema breaks (8)
|
|
308
|
+
- API timeout simulations (6)
|
|
309
|
+
- Invalid response formats (4)
|
|
310
|
+
- Permission denials (2)
|
|
311
|
+
|
|
312
|
+
---
|
|
313
|
+
|
|
314
|
+
## Appendix F: Broader Impact Statement
|
|
315
|
+
|
|
316
|
+
### Positive Impacts
|
|
317
|
+
- **Reliability:** Reduces agent failures in production, improving user trust
|
|
318
|
+
- **Efficiency:** Context reduction lowers costs and latency
|
|
319
|
+
- **Safety:** Governance layer prevents harmful outputs
|
|
320
|
+
|
|
321
|
+
### Potential Risks
|
|
322
|
+
- **Over-reliance:** Users may trust self-correcting agents too much
|
|
323
|
+
- **Teacher Dependency:** Concentration of power in teacher model providers
|
|
324
|
+
- **Adversarial Exploitation:** Patch injection attacks (see Limitations)
|
|
325
|
+
|
|
326
|
+
### Mitigations
|
|
327
|
+
- Human-in-the-loop for high-stakes decisions
|
|
328
|
+
- Multi-teacher ensemble to reduce single-provider dependency
|
|
329
|
+
- Patch provenance tracking and anomaly detection
|
|
330
|
+
|
|
331
|
+
---
|
|
332
|
+
|
|
333
|
+
## Appendix G: Checklist for Reproducibility
|
|
334
|
+
|
|
335
|
+
- [x] Code publicly available (GitHub + PyPI)
|
|
336
|
+
- [x] Datasets publicly available (HuggingFace)
|
|
337
|
+
- [x] Exact software versions documented
|
|
338
|
+
- [x] Hardware specifications provided
|
|
339
|
+
- [x] Random seeds specified
|
|
340
|
+
- [x] Statistical tests described
|
|
341
|
+
- [x] Confidence intervals reported
|
|
342
|
+
- [x] Ablation studies conducted
|
|
343
|
+
- [x] Limitations honestly discussed
|
|
344
|
+
- [x] Docker image for full reproduction
|
|
345
|
+
|
|
346
|
+
---
|
|
347
|
+
|
|
348
|
+
**Last Updated:** 2026-01-18
|
|
349
|
+
**Repository:** https://github.com/imran-siddique/self-correcting-agent-kernel
|
|
350
|
+
**PyPI:** https://pypi.org/project/scak/
|
|
351
|
+
**Contact:** research@scak.ai
|
|
@@ -0,0 +1,284 @@
|
|
|
1
|
+
% Self-Correcting Agent Kernel - Bibliography
|
|
2
|
+
% Generated: 2026-01-18
|
|
3
|
+
% For use in academic paper submission
|
|
4
|
+
|
|
5
|
+
% ===== Core Inspirations =====
|
|
6
|
+
|
|
7
|
+
@inproceedings{shinn2023reflexion,
|
|
8
|
+
title={Reflexion: Language Agents with Verbal Reinforcement Learning},
|
|
9
|
+
author={Shinn, Noah and Cassano, Federico and Berman, Edward and Gopinath, Ashwin and Narasimhan, Karthik and Yao, Shunyu},
|
|
10
|
+
booktitle={Advances in Neural Information Processing Systems (NeurIPS)},
|
|
11
|
+
year={2023},
|
|
12
|
+
url={https://arxiv.org/abs/2303.11366}
|
|
13
|
+
}
|
|
14
|
+
|
|
15
|
+
@article{madaan2023self,
|
|
16
|
+
title={Self-Refine: Iterative Refinement with Self-Feedback},
|
|
17
|
+
author={Madaan, Aman and Tandon, Niket and Gupta, Prakhar and others},
|
|
18
|
+
journal={Advances in Neural Information Processing Systems (NeurIPS)},
|
|
19
|
+
year={2023},
|
|
20
|
+
url={https://arxiv.org/abs/2303.17651}
|
|
21
|
+
}
|
|
22
|
+
|
|
23
|
+
@article{wang2023voyager,
|
|
24
|
+
title={Voyager: An Open-Ended Embodied Agent with Large Language Models},
|
|
25
|
+
author={Wang, Guanzhi and Xie, Yuqi and Jiang, Yunfan and others},
|
|
26
|
+
year={2023},
|
|
27
|
+
url={https://arxiv.org/abs/2305.16291}
|
|
28
|
+
}
|
|
29
|
+
|
|
30
|
+
@article{bai2022constitutional,
|
|
31
|
+
title={Constitutional AI: Harmlessness from AI Feedback},
|
|
32
|
+
author={Bai, Yuntao and Kadavath, Saurav and Kundu, Sandipan and others},
|
|
33
|
+
journal={arXiv preprint arXiv:2212.08073},
|
|
34
|
+
year={2022},
|
|
35
|
+
publisher={Anthropic}
|
|
36
|
+
}
|
|
37
|
+
|
|
38
|
+
% ===== Reinforcement Learning from Human Feedback =====
|
|
39
|
+
|
|
40
|
+
@inproceedings{christiano2017deep,
|
|
41
|
+
title={Deep Reinforcement Learning from Human Preferences},
|
|
42
|
+
author={Christiano, Paul F and Leike, Jan and Brown, Tom and others},
|
|
43
|
+
booktitle={Advances in Neural Information Processing Systems (NeurIPS)},
|
|
44
|
+
year={2017},
|
|
45
|
+
url={https://arxiv.org/abs/1706.03741}
|
|
46
|
+
}
|
|
47
|
+
|
|
48
|
+
@article{ouyang2022training,
|
|
49
|
+
title={Training language models to follow instructions with human feedback},
|
|
50
|
+
author={Ouyang, Long and Wu, Jeffrey and Jiang, Xu and others},
|
|
51
|
+
journal={arXiv preprint arXiv:2203.02155},
|
|
52
|
+
year={2022},
|
|
53
|
+
publisher={OpenAI}
|
|
54
|
+
}
|
|
55
|
+
|
|
56
|
+
@inproceedings{stiennon2020learning,
|
|
57
|
+
title={Learning to summarize with human feedback},
|
|
58
|
+
author={Stiennon, Nisan and Ouyang, Long and Wu, Jeffrey and others},
|
|
59
|
+
booktitle={Advances in Neural Information Processing Systems (NeurIPS)},
|
|
60
|
+
year={2020},
|
|
61
|
+
url={https://arxiv.org/abs/2009.01325}
|
|
62
|
+
}
|
|
63
|
+
|
|
64
|
+
% ===== Multi-Agent Systems =====
|
|
65
|
+
|
|
66
|
+
@article{wu2023autogen,
|
|
67
|
+
title={AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation},
|
|
68
|
+
author={Wu, Qingyun and Bansal, Gagan and Zhang, Jieyu and others},
|
|
69
|
+
journal={arXiv preprint arXiv:2308.08155},
|
|
70
|
+
year={2023},
|
|
71
|
+
publisher={Microsoft Research}
|
|
72
|
+
}
|
|
73
|
+
|
|
74
|
+
@article{park2023generative,
|
|
75
|
+
title={Generative Agents: Interactive Simulacra of Human Behavior},
|
|
76
|
+
author={Park, Joon Sung and O'Brien, Joseph C and Cai, Carrie J and others},
|
|
77
|
+
year={2023},
|
|
78
|
+
url={https://arxiv.org/abs/2304.03442}
|
|
79
|
+
}
|
|
80
|
+
|
|
81
|
+
@article{hong2023metagpt,
|
|
82
|
+
title={MetaGPT: Meta Programming for A Multi-Agent Collaborative Framework},
|
|
83
|
+
author={Hong, Sirui and Zhuge, Mingchen and Chen, Jiaqi and Zheng, Xiawu and Cheng, Yuheng and Zhang, Ceyao and others},
|
|
84
|
+
year={2023},
|
|
85
|
+
url={https://arxiv.org/abs/2308.00352}
|
|
86
|
+
}
|
|
87
|
+
|
|
88
|
+
% ===== Tool Use and Grounding =====
|
|
89
|
+
|
|
90
|
+
@article{schick2023toolformer,
|
|
91
|
+
title={Toolformer: Language Models Can Teach Themselves to Use Tools},
|
|
92
|
+
author={Schick, Timo and Dwivedi-Yu, Jane and Dessi, Roberto and others},
|
|
93
|
+
year={2023},
|
|
94
|
+
url={https://arxiv.org/abs/2302.04761}
|
|
95
|
+
}
|
|
96
|
+
|
|
97
|
+
@inproceedings{yao2023react,
|
|
98
|
+
title={ReAct: Synergizing Reasoning and Acting in Language Models},
|
|
99
|
+
author={Yao, Shunyu and Zhao, Jeffrey and Yu, Dian and others},
|
|
100
|
+
booktitle={International Conference on Learning Representations (ICLR)},
|
|
101
|
+
year={2023},
|
|
102
|
+
url={https://arxiv.org/abs/2210.03629}
|
|
103
|
+
}
|
|
104
|
+
|
|
105
|
+
@article{qin2023toolllm,
|
|
106
|
+
title={ToolLLM: Facilitating Large Language Models to Master 16000+ Real-world APIs},
|
|
107
|
+
author={Qin, Yujia and Liang, Shihao and Ye, Yining and others},
|
|
108
|
+
year={2023},
|
|
109
|
+
url={https://arxiv.org/abs/2307.16789}
|
|
110
|
+
}
|
|
111
|
+
|
|
112
|
+
% ===== Context Management =====
|
|
113
|
+
|
|
114
|
+
@article{liu2023lost,
|
|
115
|
+
title={Lost in the Middle: How Language Models Use Long Contexts},
|
|
116
|
+
author={Liu, Nelson F and Lin, Kevin and Hewitt, John and others},
|
|
117
|
+
year={2023},
|
|
118
|
+
url={https://arxiv.org/abs/2307.03172}
|
|
119
|
+
}
|
|
120
|
+
|
|
121
|
+
@article{mohtashami2023landmark,
|
|
122
|
+
title={Landmark Attention: Random-Access Infinite Context Length for Transformers},
|
|
123
|
+
author={Mohtashami, Amirkeivan and Jaggi, Martin},
|
|
124
|
+
year={2023},
|
|
125
|
+
url={https://arxiv.org/abs/2305.16300}
|
|
126
|
+
}
|
|
127
|
+
|
|
128
|
+
@inproceedings{lewis2020retrieval,
|
|
129
|
+
title={Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks},
|
|
130
|
+
author={Lewis, Patrick and Perez, Ethan and Piktus, Aleksandra and others},
|
|
131
|
+
booktitle={Advances in Neural Information Processing Systems (NeurIPS)},
|
|
132
|
+
year={2020},
|
|
133
|
+
url={https://arxiv.org/abs/2005.11401}
|
|
134
|
+
}
|
|
135
|
+
|
|
136
|
+
% ===== Safety and Alignment (2024-2025) =====
|
|
137
|
+
|
|
138
|
+
@misc{meta2024llamaguard2,
|
|
139
|
+
title={LlamaGuard 2: Safety Classification for LLM Interactions},
|
|
140
|
+
author={{Meta AI}},
|
|
141
|
+
year={2024},
|
|
142
|
+
howpublished={Meta Research Blog},
|
|
143
|
+
note={Technical report on improved safety classification}
|
|
144
|
+
}
|
|
145
|
+
|
|
146
|
+
@article{han2024wildguard,
|
|
147
|
+
title={WildGuard: Open One-Stop Moderation Tools for Safety Risks, Jailbreaks, and Refusals of LLMs},
|
|
148
|
+
author={Han, Seungju and Rao, Kavel and Ettinger, Allyson and Jiang, Liwei and Lin, Bill Yuchen and Lambert, Nathan and Choi, Yejin and Dziri, Nouha},
|
|
149
|
+
journal={Advances in Neural Information Processing Systems (NeurIPS)},
|
|
150
|
+
year={2024},
|
|
151
|
+
url={https://arxiv.org/abs/2406.18495}
|
|
152
|
+
}
|
|
153
|
+
|
|
154
|
+
@article{zou2023universal,
|
|
155
|
+
title={Universal and Transferable Adversarial Attacks on Aligned Language Models},
|
|
156
|
+
author={Zou, Andy and Wang, Zifan and Carlini, Nicholas and Nasr, Milad and Kolter, J. Zico and Fredrikson, Matt},
|
|
157
|
+
year={2023},
|
|
158
|
+
url={https://arxiv.org/abs/2307.15043}
|
|
159
|
+
}
|
|
160
|
+
|
|
161
|
+
% ===== Benchmarks =====
|
|
162
|
+
|
|
163
|
+
@article{mialon2023gaia,
|
|
164
|
+
title={GAIA: a benchmark for General AI Assistants},
|
|
165
|
+
author={Mialon, Gr{\'e}goire and Fourrier, Cl{\'e}mentine and Swift, Craig and Wolf, Thomas and LeCun, Yann and Scialom, Thomas},
|
|
166
|
+
year={2023},
|
|
167
|
+
url={https://arxiv.org/abs/2311.12983}
|
|
168
|
+
}
|
|
169
|
+
|
|
170
|
+
@article{liu2023agentbench,
|
|
171
|
+
title={AgentBench: Evaluating LLMs as Agents},
|
|
172
|
+
author={Liu, Xiao and Yu, Hao and Zhang, Hanchen and others},
|
|
173
|
+
year={2023},
|
|
174
|
+
url={https://arxiv.org/abs/2308.03688}
|
|
175
|
+
}
|
|
176
|
+
|
|
177
|
+
% ===== Production ML =====
|
|
178
|
+
|
|
179
|
+
@inproceedings{sculley2015hidden,
|
|
180
|
+
title={Hidden Technical Debt in Machine Learning Systems},
|
|
181
|
+
author={Sculley, David and Holt, Gary and Golovin, Daniel and others},
|
|
182
|
+
booktitle={Advances in Neural Information Processing Systems (NeurIPS)},
|
|
183
|
+
year={2015}
|
|
184
|
+
}
|
|
185
|
+
|
|
186
|
+
@inproceedings{breck2019data,
|
|
187
|
+
title={Data Validation for Machine Learning},
|
|
188
|
+
author={Breck, Eric and Polyzotis, Neoklis and Roy, Sudip and others},
|
|
189
|
+
booktitle={MLSys},
|
|
190
|
+
year={2019}
|
|
191
|
+
}
|
|
192
|
+
|
|
193
|
+
% ===== Cognitive Science Foundations =====
|
|
194
|
+
|
|
195
|
+
@book{kahneman2011thinking,
|
|
196
|
+
title={Thinking, Fast and Slow},
|
|
197
|
+
author={Kahneman, Daniel},
|
|
198
|
+
year={2011},
|
|
199
|
+
publisher={Farrar, Straus and Giroux}
|
|
200
|
+
}
|
|
201
|
+
|
|
202
|
+
@book{boyd1987discourse,
|
|
203
|
+
title={A Discourse on Winning and Losing},
|
|
204
|
+
author={Boyd, John R},
|
|
205
|
+
year={1987},
|
|
206
|
+
publisher={Air University Press}
|
|
207
|
+
}
|
|
208
|
+
|
|
209
|
+
% ===== Governance and Policy (2025) =====
|
|
210
|
+
|
|
211
|
+
@misc{wef2025governance,
|
|
212
|
+
title={AI Agents in the Workplace: Governance Framework and Risk Mitigation},
|
|
213
|
+
author={{World Economic Forum}},
|
|
214
|
+
year={2025},
|
|
215
|
+
howpublished={WEF Whitepaper},
|
|
216
|
+
month={January}
|
|
217
|
+
}
|
|
218
|
+
|
|
219
|
+
@misc{euai2024,
|
|
220
|
+
title={Regulation (EU) 2024/1689 - Artificial Intelligence Act},
|
|
221
|
+
author={{European Parliament}},
|
|
222
|
+
year={2024},
|
|
223
|
+
howpublished={Official Journal of the European Union}
|
|
224
|
+
}
|
|
225
|
+
|
|
226
|
+
% ===== LLM Models Referenced =====
|
|
227
|
+
|
|
228
|
+
@misc{openai2023gpt4,
|
|
229
|
+
title={GPT-4 Technical Report},
|
|
230
|
+
author={{OpenAI}},
|
|
231
|
+
year={2023},
|
|
232
|
+
url={https://arxiv.org/abs/2303.08774}
|
|
233
|
+
}
|
|
234
|
+
|
|
235
|
+
@misc{openai2024o1,
|
|
236
|
+
title={Learning to Reason with LLMs},
|
|
237
|
+
author={{OpenAI}},
|
|
238
|
+
year={2024},
|
|
239
|
+
howpublished={OpenAI Blog},
|
|
240
|
+
month={September}
|
|
241
|
+
}
|
|
242
|
+
|
|
243
|
+
@misc{anthropic2024claude,
|
|
244
|
+
title={Claude 3.5 Sonnet: Extended Context and Reasoning},
|
|
245
|
+
author={{Anthropic}},
|
|
246
|
+
year={2024},
|
|
247
|
+
howpublished={Anthropic Release Notes},
|
|
248
|
+
month={June}
|
|
249
|
+
}
|
|
250
|
+
|
|
251
|
+
% ===== Additional References =====
|
|
252
|
+
|
|
253
|
+
@article{chen2023teaching,
|
|
254
|
+
title={Teaching Large Language Models to Self-Debug},
|
|
255
|
+
author={Chen, Xinyun and Lin, Maxwell and Sch{\"a}rli, Nathanael and Zhou, Denny},
|
|
256
|
+
year={2023},
|
|
257
|
+
url={https://arxiv.org/abs/2304.05128}
|
|
258
|
+
}
|
|
259
|
+
|
|
260
|
+
@article{zhang2023multimodal,
|
|
261
|
+
title={Multimodal Chain-of-Thought Reasoning in Language Models},
|
|
262
|
+
author={Zhang, Zhuosheng and Zhang, Aston and Li, Mu and others},
|
|
263
|
+
year={2023},
|
|
264
|
+
url={https://arxiv.org/abs/2302.00923}
|
|
265
|
+
}
|
|
266
|
+
|
|
267
|
+
@article{basiri2016chaos,
|
|
268
|
+
title={Chaos Engineering},
|
|
269
|
+
author={Basiri, Ali and Behnam, Niosha and de Rooij, Ruud and others},
|
|
270
|
+
journal={IEEE Software},
|
|
271
|
+
year={2016},
|
|
272
|
+
publisher={IEEE}
|
|
273
|
+
}
|
|
274
|
+
|
|
275
|
+
% ===== Our Work =====
|
|
276
|
+
|
|
277
|
+
@software{scak2026,
|
|
278
|
+
title={Self-Correcting Agent Kernel: Automated Alignment via Differential Auditing and Semantic Memory Hygiene},
|
|
279
|
+
author={{Self-Correcting Agent Team}},
|
|
280
|
+
year={2026},
|
|
281
|
+
version={1.1.0},
|
|
282
|
+
url={https://github.com/imran-siddique/self-correcting-agent-kernel},
|
|
283
|
+
note={Research foundations: Reflexion (NeurIPS 2023), Constitutional AI (Anthropic 2022), Voyager (arXiv:2305.16291)}
|
|
284
|
+
}
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|