agent-os-kernel 1.1.0__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agent_os/__init__.py +66 -4
- agent_os/agents_compat.py +286 -0
- agent_os/base_agent.py +308 -0
- agent_os/cli.py +1079 -19
- agent_os/integrations/__init__.py +37 -2
- agent_os/integrations/openai_adapter.py +502 -0
- agent_os/integrations/semantic_kernel_adapter.py +569 -0
- agent_os/stateless.py +349 -0
- agent_os_kernel-1.2.0.dist-info/METADATA +676 -0
- agent_os_kernel-1.2.0.dist-info/RECORD +1053 -0
- {agent_os_kernel-1.1.0.dist-info → agent_os_kernel-1.2.0.dist-info}/entry_points.txt +0 -1
- modules/amb/.github/workflows/ci.yml +102 -0
- modules/amb/.github/workflows/publish.yml +146 -0
- modules/amb/.gitignore +134 -0
- modules/amb/CHANGELOG.md +118 -0
- modules/amb/CONTRIBUTING.md +141 -0
- modules/amb/LICENSE +21 -0
- modules/amb/README.md +188 -0
- modules/amb/amb_core/__init__.py +175 -0
- modules/amb/amb_core/adapters/__init__.py +55 -0
- modules/amb/amb_core/adapters/aws_sqs_broker.py +374 -0
- modules/amb/amb_core/adapters/azure_servicebus_broker.py +338 -0
- modules/amb/amb_core/adapters/kafka_broker.py +258 -0
- modules/amb/amb_core/adapters/nats_broker.py +283 -0
- modules/amb/amb_core/adapters/rabbitmq_broker.py +233 -0
- modules/amb/amb_core/adapters/redis_broker.py +260 -0
- modules/amb/amb_core/broker.py +143 -0
- modules/amb/amb_core/bus.py +479 -0
- modules/amb/amb_core/cloudevents.py +507 -0
- modules/amb/amb_core/dlq.py +343 -0
- modules/amb/amb_core/hf_utils.py +534 -0
- modules/amb/amb_core/memory_broker.py +408 -0
- modules/amb/amb_core/models.py +139 -0
- modules/amb/amb_core/persistence.py +527 -0
- modules/amb/amb_core/schema.py +292 -0
- modules/amb/amb_core/tracing.py +356 -0
- modules/amb/examples/advanced_features.py +223 -0
- modules/amb/examples/backpressure_demo.py +225 -0
- modules/amb/examples/basic_usage.py +117 -0
- modules/amb/examples/tracing_demo.py +104 -0
- modules/amb/experiments/README.md +52 -0
- modules/amb/experiments/reproduce_results.py +467 -0
- modules/amb/experiments/results.json +324 -0
- modules/amb/paper/README.md +40 -0
- modules/amb/paper/paper.tex +365 -0
- modules/amb/paper/whitepaper.md +377 -0
- modules/amb/pyproject.toml +117 -0
- modules/amb/tests/__init__.py +1 -0
- modules/amb/tests/test_backpressure_priority.py +280 -0
- modules/amb/tests/test_bus.py +198 -0
- modules/amb/tests/test_cloudevents.py +443 -0
- modules/amb/tests/test_features.py +531 -0
- modules/amb/tests/test_models.py +74 -0
- modules/amb/tests/test_tracing.py +254 -0
- modules/atr/.github/workflows/ci.yml +101 -0
- modules/atr/.github/workflows/publish.yml +140 -0
- modules/atr/.gitignore +134 -0
- modules/atr/.pre-commit-config.yaml +37 -0
- modules/atr/CHANGELOG.md +39 -0
- modules/atr/CONTRIBUTING.md +96 -0
- modules/atr/IMPLEMENTATION_SUMMARY.md +143 -0
- modules/atr/README.md +180 -0
- modules/atr/atr/__init__.py +638 -0
- modules/atr/atr/access.py +346 -0
- modules/atr/atr/composition.py +643 -0
- modules/atr/atr/decorator.py +355 -0
- modules/atr/atr/executor.py +382 -0
- modules/atr/atr/health.py +555 -0
- modules/atr/atr/hf_utils.py +447 -0
- modules/atr/atr/injection.py +420 -0
- modules/atr/atr/metrics.py +438 -0
- modules/atr/atr/policies.py +401 -0
- modules/atr/atr/py.typed +2 -0
- modules/atr/atr/registry.py +450 -0
- modules/atr/atr/schema.py +478 -0
- modules/atr/atr/tools/safe/__init__.py +73 -0
- modules/atr/atr/tools/safe/calculator.py +380 -0
- modules/atr/atr/tools/safe/datetime_tool.py +441 -0
- modules/atr/atr/tools/safe/file_reader.py +400 -0
- modules/atr/atr/tools/safe/http_client.py +314 -0
- modules/atr/atr/tools/safe/json_parser.py +372 -0
- modules/atr/atr/tools/safe/text_tool.py +526 -0
- modules/atr/atr/tools/safe/toolkit.py +173 -0
- modules/atr/docs/PYPI_SETUP.md +113 -0
- modules/atr/examples/README.md +27 -0
- modules/atr/examples/demo.py +144 -0
- modules/atr/examples/sandbox_demo.py +218 -0
- modules/atr/experiments/README.md +69 -0
- modules/atr/experiments/reproduce_results.py +509 -0
- modules/atr/experiments/results/.gitkeep +0 -0
- modules/atr/experiments/results/results_20260123_140334.json +71 -0
- modules/atr/paper/README.md +36 -0
- modules/atr/paper/figures/.gitkeep +0 -0
- modules/atr/paper/references.bib +84 -0
- modules/atr/paper/structure.tex +293 -0
- modules/atr/paper/whitepaper.md +234 -0
- modules/atr/pyproject.toml +148 -0
- modules/atr/requirements.txt +1 -0
- modules/atr/setup.py +30 -0
- modules/atr/tests/__init__.py +1 -0
- modules/atr/tests/test_decorator.py +317 -0
- modules/atr/tests/test_executor.py +245 -0
- modules/atr/tests/test_integration_executor.py +184 -0
- modules/atr/tests/test_registry.py +312 -0
- modules/atr/tests/test_schema.py +182 -0
- modules/atr/tests/test_v2_features.py +708 -0
- modules/caas/.dockerignore +63 -0
- modules/caas/.github/ISSUE_TEMPLATE/bug_report.md +38 -0
- modules/caas/.github/ISSUE_TEMPLATE/custom.md +10 -0
- modules/caas/.github/ISSUE_TEMPLATE/feature_request.md +20 -0
- modules/caas/.github/workflows/ci.yml +100 -0
- modules/caas/.github/workflows/lint.yml +39 -0
- modules/caas/.github/workflows/publish-pypi.yml +124 -0
- modules/caas/.gitignore +73 -0
- modules/caas/.pre-commit-config.yaml +33 -0
- modules/caas/CHANGELOG.md +58 -0
- modules/caas/CONTRIBUTING.md +346 -0
- modules/caas/Dockerfile +41 -0
- modules/caas/LICENSE +21 -0
- modules/caas/MANIFEST.in +11 -0
- modules/caas/README.md +158 -0
- modules/caas/benchmarks/README.md +255 -0
- modules/caas/benchmarks/create_hf_dataset.py +502 -0
- modules/caas/benchmarks/data/sample_corpus/README.md +86 -0
- modules/caas/benchmarks/data/sample_corpus/auth_module.py +211 -0
- modules/caas/benchmarks/data/sample_corpus/contribution_guide.md +185 -0
- modules/caas/benchmarks/data/sample_corpus/remote_work_policy.html +57 -0
- modules/caas/benchmarks/hf_dataset/README.md +214 -0
- modules/caas/benchmarks/hf_dataset/caas_benchmark_corpus.py +73 -0
- modules/caas/benchmarks/hf_dataset/corpus_preview.json +193 -0
- modules/caas/benchmarks/results/README.md +66 -0
- modules/caas/benchmarks/results/evaluation_2026-01-20.json +121 -0
- modules/caas/benchmarks/run_evaluation.py +561 -0
- modules/caas/benchmarks/statistical_tests.py +289 -0
- modules/caas/benchmarks/verify_sample_corpus.py +83 -0
- modules/caas/docker-compose.yml +38 -0
- modules/caas/docs/CONTEXT_TRIAD.md +462 -0
- modules/caas/docs/CONTRIBUTING.md +346 -0
- modules/caas/docs/ETHICS_AND_LIMITATIONS.md +336 -0
- modules/caas/docs/HEURISTIC_ROUTER.md +442 -0
- modules/caas/docs/IMPLEMENTATION_SUMMARY.md +363 -0
- modules/caas/docs/IMPLEMENTATION_SUMMARY_CONTEXT_TRIAD.md +277 -0
- modules/caas/docs/IMPLEMENTATION_SUMMARY_HEURISTIC_ROUTER.md +231 -0
- modules/caas/docs/IMPLEMENTATION_SUMMARY_METADATA_INJECTION.md +258 -0
- modules/caas/docs/IMPLEMENTATION_SUMMARY_PRAGMATIC_TRUTH.md +212 -0
- modules/caas/docs/IMPLEMENTATION_SUMMARY_TRUST_GATEWAY.md +319 -0
- modules/caas/docs/LAYER_1_PRIMITIVE.md +202 -0
- modules/caas/docs/METADATA_INJECTION.md +404 -0
- modules/caas/docs/PRAGMATIC_TRUTH.md +431 -0
- modules/caas/docs/RELATED_WORK.md +312 -0
- modules/caas/docs/RELEASE_CHECKLIST.md +219 -0
- modules/caas/docs/RELEASE_GUIDE.md +285 -0
- modules/caas/docs/REPRODUCIBILITY.md +386 -0
- modules/caas/docs/SLIDING_WINDOW.md +387 -0
- modules/caas/docs/STRUCTURE_AWARE_INDEXING.md +158 -0
- modules/caas/docs/TESTING.md +259 -0
- modules/caas/docs/THREAT_MODEL.md +247 -0
- modules/caas/docs/TRUST_GATEWAY.md +575 -0
- modules/caas/docs/VFS.md +298 -0
- modules/caas/examples/agents/enterprise_security_agent.py +414 -0
- modules/caas/examples/agents/intelligent_document_analyzer.py +380 -0
- modules/caas/examples/demos/demo.py +309 -0
- modules/caas/examples/demos/demo_context_triad.py +225 -0
- modules/caas/examples/demos/demo_conversation_manager.py +285 -0
- modules/caas/examples/demos/demo_heuristic_router.py +133 -0
- modules/caas/examples/demos/demo_metadata_injection.py +198 -0
- modules/caas/examples/demos/demo_pragmatic_truth.py +303 -0
- modules/caas/examples/demos/demo_structure_aware.py +140 -0
- modules/caas/examples/demos/demo_time_decay.py +247 -0
- modules/caas/examples/demos/demo_trust_gateway.py +383 -0
- modules/caas/examples/multi_agent/README.md +159 -0
- modules/caas/examples/multi_agent/research_team.py +369 -0
- modules/caas/examples/multi_agent/vfs_collaboration.py +393 -0
- modules/caas/examples/usage/auth_module.py +142 -0
- modules/caas/examples/usage/usage_example.py +173 -0
- modules/caas/experiments/README.md +42 -0
- modules/caas/experiments/reproduce_results.py +462 -0
- modules/caas/paper/ARXIV_METADATA.md +145 -0
- modules/caas/paper/ARXIV_README.md +47 -0
- modules/caas/paper/CHECKLIST.md +103 -0
- modules/caas/paper/GITHUB_RELEASE_NOTES.md +105 -0
- modules/caas/paper/README.md +71 -0
- modules/caas/paper/abstract.md +24 -0
- modules/caas/paper/arxiv_submission.tar +0 -0
- modules/caas/paper/arxiv_submission.zip +0 -0
- modules/caas/paper/build_pdf.py +355 -0
- modules/caas/paper/experiments.md +149 -0
- modules/caas/paper/figures/.gitkeep +0 -0
- modules/caas/paper/figures/README.md +237 -0
- modules/caas/paper/figures/fig1_system_architecture.png +0 -0
- modules/caas/paper/figures/fig1_system_architecture.svg +198 -0
- modules/caas/paper/figures/fig2_context_triad.png +0 -0
- modules/caas/paper/figures/fig2_context_triad.svg +105 -0
- modules/caas/paper/figures/fig3_ablation_results.png +0 -0
- modules/caas/paper/figures/fig3_ablation_results.svg +113 -0
- modules/caas/paper/figures/fig4_routing_latency.png +0 -0
- modules/caas/paper/figures/fig4_routing_latency.svg +97 -0
- modules/caas/paper/intro.md +103 -0
- modules/caas/paper/latex/figures/fig1_system_architecture.png +0 -0
- modules/caas/paper/latex/figures/fig2_context_triad.png +0 -0
- modules/caas/paper/latex/figures/fig3_ablation_results.png +0 -0
- modules/caas/paper/latex/figures/fig4_routing_latency.png +0 -0
- modules/caas/paper/latex/main.tex +468 -0
- modules/caas/paper/latex/references.bib +140 -0
- modules/caas/paper/method.md +350 -0
- modules/caas/paper/outline.md +123 -0
- modules/caas/paper/related_work.md +101 -0
- modules/caas/paper/tables/.gitkeep +0 -0
- modules/caas/paper/tables/results_tables.md +50 -0
- modules/caas/pyproject.toml +172 -0
- modules/caas/requirements.txt +11 -0
- modules/caas/src/caas/__init__.py +232 -0
- modules/caas/src/caas/api/__init__.py +7 -0
- modules/caas/src/caas/api/server.py +1326 -0
- modules/caas/src/caas/caching.py +832 -0
- modules/caas/src/caas/cli.py +208 -0
- modules/caas/src/caas/conversation.py +221 -0
- modules/caas/src/caas/decay.py +118 -0
- modules/caas/src/caas/detection/__init__.py +7 -0
- modules/caas/src/caas/detection/detector.py +236 -0
- modules/caas/src/caas/enrichment.py +127 -0
- modules/caas/src/caas/gateway/__init__.py +24 -0
- modules/caas/src/caas/gateway/trust_gateway.py +471 -0
- modules/caas/src/caas/hf_utils.py +477 -0
- modules/caas/src/caas/ingestion/__init__.py +21 -0
- modules/caas/src/caas/ingestion/processors.py +251 -0
- modules/caas/src/caas/ingestion/structure_parser.py +185 -0
- modules/caas/src/caas/models.py +354 -0
- modules/caas/src/caas/pragmatic_truth.py +441 -0
- modules/caas/src/caas/routing/__init__.py +8 -0
- modules/caas/src/caas/routing/heuristic_router.py +242 -0
- modules/caas/src/caas/storage/__init__.py +7 -0
- modules/caas/src/caas/storage/store.py +450 -0
- modules/caas/src/caas/triad.py +472 -0
- modules/caas/src/caas/tuning/__init__.py +7 -0
- modules/caas/src/caas/tuning/tuner.py +322 -0
- modules/caas/src/caas/vfs/__init__.py +12 -0
- modules/caas/src/caas/vfs/filesystem.py +450 -0
- modules/caas/tests/__init__.py +3 -0
- modules/caas/tests/conftest.py +8 -0
- modules/caas/tests/test_caching.py +628 -0
- modules/caas/tests/test_context_triad.py +385 -0
- modules/caas/tests/test_conversation_manager.py +289 -0
- modules/caas/tests/test_functionality.py +215 -0
- modules/caas/tests/test_heuristic_router.py +370 -0
- modules/caas/tests/test_metadata_injection.py +328 -0
- modules/caas/tests/test_pragmatic_truth.py +322 -0
- modules/caas/tests/test_structure_aware_indexing.py +283 -0
- modules/caas/tests/test_time_decay.py +268 -0
- modules/caas/tests/test_trust_gateway.py +445 -0
- modules/caas/tests/test_vfs.py +298 -0
- modules/cmvk/.github/FUNDING.yml +9 -0
- modules/cmvk/.github/dependabot.yml +54 -0
- modules/cmvk/.github/workflows/ci.yml +205 -0
- modules/cmvk/.github/workflows/publish.yml +143 -0
- modules/cmvk/.gitignore +147 -0
- modules/cmvk/.pre-commit-config.yaml +58 -0
- modules/cmvk/CHANGELOG.md +146 -0
- modules/cmvk/CITATION.cff +48 -0
- modules/cmvk/CONTRIBUTING.md +229 -0
- modules/cmvk/Dockerfile +87 -0
- modules/cmvk/HF_MODEL_CARD.md +185 -0
- modules/cmvk/LICENSE +21 -0
- modules/cmvk/README.md +149 -0
- modules/cmvk/SECURITY.md +114 -0
- modules/cmvk/config/prompts/generator_v1.txt +23 -0
- modules/cmvk/config/prompts/verifier_hostile.txt +32 -0
- modules/cmvk/config/settings.yaml +40 -0
- modules/cmvk/coverage_html/.gitignore +2 -0
- modules/cmvk/coverage_html/class_index.html +658 -0
- modules/cmvk/coverage_html/coverage_html_cb_188fc9a4.js +735 -0
- modules/cmvk/coverage_html/favicon_32_cb_c827f16f.png +0 -0
- modules/cmvk/coverage_html/function_index.html +1978 -0
- modules/cmvk/coverage_html/index.html +255 -0
- modules/cmvk/coverage_html/keybd_closed_cb_900cfef5.png +0 -0
- modules/cmvk/coverage_html/status.json +1 -0
- modules/cmvk/coverage_html/style_cb_5c747636.css +389 -0
- modules/cmvk/coverage_html/z_2c49bd2ed3e01e38___init___py.html +315 -0
- modules/cmvk/coverage_html/z_2c49bd2ed3e01e38_audit_py.html +499 -0
- modules/cmvk/coverage_html/z_2c49bd2ed3e01e38_benchmarks_py.html +575 -0
- modules/cmvk/coverage_html/z_2c49bd2ed3e01e38_constitutional_py.html +1001 -0
- modules/cmvk/coverage_html/z_2c49bd2ed3e01e38_hf_utils_py.html +398 -0
- modules/cmvk/coverage_html/z_2c49bd2ed3e01e38_metrics_py.html +570 -0
- modules/cmvk/coverage_html/z_2c49bd2ed3e01e38_profiles_py.html +397 -0
- modules/cmvk/coverage_html/z_2c49bd2ed3e01e38_types_py.html +109 -0
- modules/cmvk/coverage_html/z_2c49bd2ed3e01e38_verification_py.html +1053 -0
- modules/cmvk/docs/DIAGRAMS.md +325 -0
- modules/cmvk/docs/architecture.md +345 -0
- modules/cmvk/docs/features.md +308 -0
- modules/cmvk/docs/getting_started.md +279 -0
- modules/cmvk/docs/innovation_layer.md +377 -0
- modules/cmvk/docs/safety.md +281 -0
- modules/cmvk/docs/traceability.md +150 -0
- modules/cmvk/examples/basic_example.py +62 -0
- modules/cmvk/examples/demo_complete_pipeline.py +209 -0
- modules/cmvk/examples/demo_innovation_layer.py +197 -0
- modules/cmvk/examples/example.py +112 -0
- modules/cmvk/examples/model_diversity_comparison.py +110 -0
- modules/cmvk/examples/real_api_integration.py +121 -0
- modules/cmvk/examples/test_full_pipeline.py +303 -0
- modules/cmvk/experiments/FEATURE_2_LATERAL_THINKING.md +187 -0
- modules/cmvk/experiments/README.md +216 -0
- modules/cmvk/experiments/ablation_runner.py +666 -0
- modules/cmvk/experiments/baseline_runner.py +158 -0
- modules/cmvk/experiments/blind_spot_benchmark.py +364 -0
- modules/cmvk/experiments/datasets/README.md +85 -0
- modules/cmvk/experiments/datasets/humaneval_50.json +352 -0
- modules/cmvk/experiments/datasets/humaneval_full.json +1150 -0
- modules/cmvk/experiments/datasets/humaneval_sample.json +32 -0
- modules/cmvk/experiments/datasets/sabotage.json +262 -0
- modules/cmvk/experiments/datasets/sample.json +40 -0
- modules/cmvk/experiments/demo_with_traces.py +110 -0
- modules/cmvk/experiments/efficiency_curve.py +259 -0
- modules/cmvk/experiments/experiment_runner.py +243 -0
- modules/cmvk/experiments/paper_data_generator.py +183 -0
- modules/cmvk/experiments/reproduce_results.py +407 -0
- modules/cmvk/experiments/reproducible_runner.py +352 -0
- modules/cmvk/experiments/sabotage_stress_test.py +311 -0
- modules/cmvk/experiments/test_lateral_thinking.py +116 -0
- modules/cmvk/experiments/test_prosecutor.py +41 -0
- modules/cmvk/experiments/visualize_results.py +735 -0
- modules/cmvk/logs/traces/demo_HumanEval_0_20260121-204900.json +36 -0
- modules/cmvk/notebooks/analysis.ipynb +124 -0
- modules/cmvk/paper/PAPER.md +561 -0
- modules/cmvk/paper/arxiv_checklist.md +230 -0
- modules/cmvk/paper/cmvk_neurips.aux +77 -0
- modules/cmvk/paper/cmvk_neurips.bbl +81 -0
- modules/cmvk/paper/cmvk_neurips.blg +48 -0
- modules/cmvk/paper/cmvk_neurips.out +16 -0
- modules/cmvk/paper/cmvk_neurips.pdf +0 -0
- modules/cmvk/paper/cmvk_neurips.tex +309 -0
- modules/cmvk/paper/figures/ablation.png +0 -0
- modules/cmvk/paper/figures/ablation.svg +39 -0
- modules/cmvk/paper/figures/architecture.png +0 -0
- modules/cmvk/paper/figures/architecture.svg +115 -0
- modules/cmvk/paper/figures/results_bar.png +0 -0
- modules/cmvk/paper/figures/results_bar.svg +70 -0
- modules/cmvk/paper/generate_figures.py +383 -0
- modules/cmvk/paper/neurips_2024.sty +101 -0
- modules/cmvk/paper/references.bib +98 -0
- modules/cmvk/paper/structure.tex +200 -0
- modules/cmvk/pyproject.toml +189 -0
- modules/cmvk/requirements-dev.txt +19 -0
- modules/cmvk/requirements.txt +14 -0
- modules/cmvk/src/cmvk/__init__.py +216 -0
- modules/cmvk/src/cmvk/audit.py +400 -0
- modules/cmvk/src/cmvk/benchmarks.py +476 -0
- modules/cmvk/src/cmvk/constitutional.py +902 -0
- modules/cmvk/src/cmvk/hf_utils.py +299 -0
- modules/cmvk/src/cmvk/metrics.py +471 -0
- modules/cmvk/src/cmvk/profiles.py +298 -0
- modules/cmvk/src/cmvk/py.typed +0 -0
- modules/cmvk/src/cmvk/types.py +10 -0
- modules/cmvk/src/cmvk/verification.py +954 -0
- modules/cmvk/src/cross_model_verification_kernel/__init__.py +91 -0
- modules/cmvk/src/cross_model_verification_kernel/__main__.py +10 -0
- modules/cmvk/src/cross_model_verification_kernel/agents/__init__.py +16 -0
- modules/cmvk/src/cross_model_verification_kernel/agents/base_agent.py +142 -0
- modules/cmvk/src/cross_model_verification_kernel/agents/generator_openai.py +223 -0
- modules/cmvk/src/cross_model_verification_kernel/agents/verifier_anthropic.py +448 -0
- modules/cmvk/src/cross_model_verification_kernel/agents/verifier_gemini.py +481 -0
- modules/cmvk/src/cross_model_verification_kernel/cli.py +570 -0
- modules/cmvk/src/cross_model_verification_kernel/core/__init__.py +26 -0
- modules/cmvk/src/cross_model_verification_kernel/core/graph_memory.py +308 -0
- modules/cmvk/src/cross_model_verification_kernel/core/kernel.py +413 -0
- modules/cmvk/src/cross_model_verification_kernel/core/trace_logger.py +75 -0
- modules/cmvk/src/cross_model_verification_kernel/core/types.py +121 -0
- modules/cmvk/src/cross_model_verification_kernel/datasets/__init__.py +20 -0
- modules/cmvk/src/cross_model_verification_kernel/datasets/humaneval_loader.py +271 -0
- modules/cmvk/src/cross_model_verification_kernel/generator.py +118 -0
- modules/cmvk/src/cross_model_verification_kernel/kernel.py +292 -0
- modules/cmvk/src/cross_model_verification_kernel/models.py +111 -0
- modules/cmvk/src/cross_model_verification_kernel/py.typed +1 -0
- modules/cmvk/src/cross_model_verification_kernel/simple_kernel.py +185 -0
- modules/cmvk/src/cross_model_verification_kernel/tools/__init__.py +94 -0
- modules/cmvk/src/cross_model_verification_kernel/tools/huggingface_upload.py +394 -0
- modules/cmvk/src/cross_model_verification_kernel/tools/sandbox.py +159 -0
- modules/cmvk/src/cross_model_verification_kernel/tools/statistics.py +468 -0
- modules/cmvk/src/cross_model_verification_kernel/tools/visualizer.py +312 -0
- modules/cmvk/src/cross_model_verification_kernel/tools/web_search.py +86 -0
- modules/cmvk/src/cross_model_verification_kernel/verifier.py +257 -0
- modules/cmvk/tests/__init__.py +3 -0
- modules/cmvk/tests/conftest.py +61 -0
- modules/cmvk/tests/integration/__init__.py +1 -0
- modules/cmvk/tests/integration/test_anthropic_verifier.py +269 -0
- modules/cmvk/tests/integration/test_integration.py +53 -0
- modules/cmvk/tests/integration/test_lateral_thinking_integration.py +199 -0
- modules/cmvk/tests/integration/test_lateral_thinking_witness.py +208 -0
- modules/cmvk/tests/integration/test_prosecutor_mode.py +131 -0
- modules/cmvk/tests/test_constitutional.py +611 -0
- modules/cmvk/tests/test_enhanced_features.py +603 -0
- modules/cmvk/tests/test_verification.py +255 -0
- modules/cmvk/tests/unit/__init__.py +1 -0
- modules/cmvk/tests/unit/test_agents.py +64 -0
- modules/cmvk/tests/unit/test_cli.py +224 -0
- modules/cmvk/tests/unit/test_core.py +126 -0
- modules/cmvk/tests/unit/test_humaneval_loader.py +197 -0
- modules/cmvk/tests/unit/test_kernel.py +255 -0
- modules/cmvk/tests/unit/test_reproducibility.py +160 -0
- modules/cmvk/tests/unit/test_trace_logger.py +115 -0
- modules/cmvk/tests/unit/test_visualizer.py +218 -0
- modules/control-plane/.github/ISSUE_TEMPLATE/bug_report.yml +82 -0
- modules/control-plane/.github/ISSUE_TEMPLATE/config.yml +11 -0
- modules/control-plane/.github/ISSUE_TEMPLATE/feature_request.yml +104 -0
- modules/control-plane/.github/ISSUE_TEMPLATE/question.yml +70 -0
- modules/control-plane/.github/ISSUE_TEMPLATE/security_vulnerability.yml +84 -0
- modules/control-plane/.github/discussions.yml +73 -0
- modules/control-plane/.github/pull_request_template.md +82 -0
- modules/control-plane/.github/workflows/publish.yml +146 -0
- modules/control-plane/.github/workflows/release.yml +39 -0
- modules/control-plane/.github/workflows/tests.yml +58 -0
- modules/control-plane/.gitignore +55 -0
- modules/control-plane/CHANGELOG.md +203 -0
- modules/control-plane/CONTRIBUTING.md +311 -0
- modules/control-plane/CONTRIBUTORS.md +88 -0
- modules/control-plane/Dockerfile +82 -0
- modules/control-plane/LICENSE +21 -0
- modules/control-plane/MANIFEST.in +17 -0
- modules/control-plane/README.md +1264 -0
- modules/control-plane/ROADMAP.md +228 -0
- modules/control-plane/SECURITY.md +210 -0
- modules/control-plane/SUPPORT.md +106 -0
- modules/control-plane/acp-cli.py +212 -0
- modules/control-plane/benchmark/README.md +257 -0
- modules/control-plane/benchmark/__init__.py +19 -0
- modules/control-plane/benchmark/red_team_dataset.py +517 -0
- modules/control-plane/benchmark.py +563 -0
- modules/control-plane/build_and_publish.sh +130 -0
- modules/control-plane/docker-compose.yml +74 -0
- modules/control-plane/docs/ABLATION_STUDIES.md +528 -0
- modules/control-plane/docs/ADAPTER_GUIDE.md +544 -0
- modules/control-plane/docs/ADVANCED_FEATURES.md +543 -0
- modules/control-plane/docs/AIOS_COMPARISON.md +296 -0
- modules/control-plane/docs/BIBLIOGRAPHY.md +367 -0
- modules/control-plane/docs/CASE_STUDIES.md +645 -0
- modules/control-plane/docs/DOCKER_DEPLOYMENT.md +184 -0
- modules/control-plane/docs/ECOSYSTEM_STATUS.md +98 -0
- modules/control-plane/docs/HF_MODEL_CARD.md +168 -0
- modules/control-plane/docs/KERNEL_V1_RELEASE.md +454 -0
- modules/control-plane/docs/LAYER3_FRAMEWORK.md +227 -0
- modules/control-plane/docs/LIMITATIONS.md +523 -0
- modules/control-plane/docs/PYPI_PUBLISHING.md +195 -0
- modules/control-plane/docs/README.md +58 -0
- modules/control-plane/docs/RELATED_WORK.md +319 -0
- modules/control-plane/docs/RELEASE_v1.1.0.md +252 -0
- modules/control-plane/docs/REPRODUCIBILITY.md +540 -0
- modules/control-plane/docs/RESEARCH_FOUNDATION.md +197 -0
- modules/control-plane/docs/api/CORE.md +270 -0
- modules/control-plane/docs/architecture/architecture.md +120 -0
- modules/control-plane/docs/community/ANNOUNCEMENT_TEMPLATES.md +52 -0
- modules/control-plane/docs/guides/IMPLEMENTATION.md +225 -0
- modules/control-plane/docs/guides/PHILOSOPHY.md +354 -0
- modules/control-plane/docs/guides/QUICKSTART.md +217 -0
- modules/control-plane/examples/README.md +138 -0
- modules/control-plane/examples/a2a_demo.py +410 -0
- modules/control-plane/examples/adapter_demo.py +347 -0
- modules/control-plane/examples/advanced_features.py +403 -0
- modules/control-plane/examples/basic_usage.py +261 -0
- modules/control-plane/examples/benchmark_demo.py +186 -0
- modules/control-plane/examples/compliance_demo.py +333 -0
- modules/control-plane/examples/configuration.py +265 -0
- modules/control-plane/examples/getting_started.py +178 -0
- modules/control-plane/examples/hibernation_and_time_travel_demo.py +406 -0
- modules/control-plane/examples/interactive_tutorial.ipynb +497 -0
- modules/control-plane/examples/kernel_interceptor_demo.py +202 -0
- modules/control-plane/examples/kernel_v1_demo.py +273 -0
- modules/control-plane/examples/langchain_demo.py +281 -0
- modules/control-plane/examples/lifecycle_demo.py +724 -0
- modules/control-plane/examples/mcp_demo.py +378 -0
- modules/control-plane/examples/ml_safety_demo.py +157 -0
- modules/control-plane/examples/multimodal_demo.py +347 -0
- modules/control-plane/examples/observability_demo.py +370 -0
- modules/control-plane/examples/use_cases.py +336 -0
- modules/control-plane/experiments/long_horizon_purge.py +235 -0
- modules/control-plane/experiments/multi_agent_rag.py +165 -0
- modules/control-plane/experiments/reproduce_results.py +667 -0
- modules/control-plane/paper/ARXIV_SUBMISSION_INFO.txt +122 -0
- modules/control-plane/paper/ETHICS_STATEMENT.md +248 -0
- modules/control-plane/paper/PAPER_CHECKLIST.md +72 -0
- modules/control-plane/paper/Paper.pdf +0 -0
- modules/control-plane/paper/README.md +71 -0
- modules/control-plane/paper/appendix.md +152 -0
- modules/control-plane/paper/architecture.md +15 -0
- modules/control-plane/paper/arxiv/figures/ablation_chart.png +0 -0
- modules/control-plane/paper/arxiv/figures/architecture.png +0 -0
- modules/control-plane/paper/arxiv/figures/constraint_graphs.png +0 -0
- modules/control-plane/paper/arxiv/figures/results_chart.png +0 -0
- modules/control-plane/paper/arxiv/main.aux +97 -0
- modules/control-plane/paper/arxiv/main.bbl +112 -0
- modules/control-plane/paper/arxiv/main.blg +48 -0
- modules/control-plane/paper/arxiv/main.out +33 -0
- modules/control-plane/paper/arxiv/main.pdf +0 -0
- modules/control-plane/paper/arxiv/main.tex +479 -0
- modules/control-plane/paper/arxiv/references.bib +234 -0
- modules/control-plane/paper/arxiv_submission.tar +0 -0
- modules/control-plane/paper/arxiv_submission.zip +0 -0
- modules/control-plane/paper/build.sh +68 -0
- modules/control-plane/paper/figures/README.md +47 -0
- modules/control-plane/paper/figures/ablation_chart.pdf +0 -0
- modules/control-plane/paper/figures/ablation_chart.png +0 -0
- modules/control-plane/paper/figures/architecture.pdf +0 -0
- modules/control-plane/paper/figures/architecture.png +0 -0
- modules/control-plane/paper/figures/constraint_graphs.pdf +0 -0
- modules/control-plane/paper/figures/constraint_graphs.png +0 -0
- modules/control-plane/paper/figures/generate_figures.py +252 -0
- modules/control-plane/paper/figures/results_chart.pdf +0 -0
- modules/control-plane/paper/figures/results_chart.png +0 -0
- modules/control-plane/paper/main.md +273 -0
- modules/control-plane/paper/main.tex +214 -0
- modules/control-plane/paper/main_arxiv.aux +53 -0
- modules/control-plane/paper/main_arxiv.out +17 -0
- modules/control-plane/paper/main_arxiv.pdf +0 -0
- modules/control-plane/paper/main_arxiv.tex +264 -0
- modules/control-plane/paper/references.bib +234 -0
- modules/control-plane/pyproject.toml +124 -0
- modules/control-plane/reproducibility/ABLATIONS.md +136 -0
- modules/control-plane/reproducibility/README.md +288 -0
- modules/control-plane/reproducibility/commands.md +467 -0
- modules/control-plane/reproducibility/docker_config/Dockerfile +39 -0
- modules/control-plane/reproducibility/experiment_configs/purge_config.json +46 -0
- modules/control-plane/reproducibility/experiment_configs/rag_config.json +36 -0
- modules/control-plane/reproducibility/hardware_specs.md +317 -0
- modules/control-plane/reproducibility/requirements_frozen.txt +0 -0
- modules/control-plane/reproducibility/run_all_experiments.sh +45 -0
- modules/control-plane/reproducibility/seeds.json +106 -0
- modules/control-plane/scripts/prepare_pypi.py +46 -0
- modules/control-plane/scripts/prepare_release.py +176 -0
- modules/control-plane/scripts/upload_dataset_to_hf.py +316 -0
- modules/control-plane/setup.py +69 -0
- modules/control-plane/src/agent_control_plane/__init__.py +639 -0
- modules/control-plane/src/agent_control_plane/a2a_adapter.py +541 -0
- modules/control-plane/src/agent_control_plane/adapter.py +415 -0
- modules/control-plane/src/agent_control_plane/agent_hibernation.py +364 -0
- modules/control-plane/src/agent_control_plane/agent_kernel.py +464 -0
- modules/control-plane/src/agent_control_plane/compliance.py +718 -0
- modules/control-plane/src/agent_control_plane/constraint_graphs.py +475 -0
- modules/control-plane/src/agent_control_plane/control_plane.py +848 -0
- modules/control-plane/src/agent_control_plane/example_executors.py +193 -0
- modules/control-plane/src/agent_control_plane/execution_engine.py +229 -0
- modules/control-plane/src/agent_control_plane/flight_recorder.py +600 -0
- modules/control-plane/src/agent_control_plane/governance_layer.py +432 -0
- modules/control-plane/src/agent_control_plane/hf_utils.py +561 -0
- modules/control-plane/src/agent_control_plane/interfaces/__init__.py +53 -0
- modules/control-plane/src/agent_control_plane/interfaces/kernel_interface.py +359 -0
- modules/control-plane/src/agent_control_plane/interfaces/plugin_interface.py +495 -0
- modules/control-plane/src/agent_control_plane/interfaces/protocol_interfaces.py +385 -0
- modules/control-plane/src/agent_control_plane/kernel_space.py +707 -0
- modules/control-plane/src/agent_control_plane/langchain_adapter.py +422 -0
- modules/control-plane/src/agent_control_plane/lifecycle.py +3111 -0
- modules/control-plane/src/agent_control_plane/mcp_adapter.py +517 -0
- modules/control-plane/src/agent_control_plane/ml_safety.py +560 -0
- modules/control-plane/src/agent_control_plane/multimodal.py +724 -0
- modules/control-plane/src/agent_control_plane/mute_agent.py +419 -0
- modules/control-plane/src/agent_control_plane/observability.py +785 -0
- modules/control-plane/src/agent_control_plane/orchestrator.py +480 -0
- modules/control-plane/src/agent_control_plane/plugin_registry.py +748 -0
- modules/control-plane/src/agent_control_plane/policy_engine.py +525 -0
- modules/control-plane/src/agent_control_plane/shadow_mode.py +307 -0
- modules/control-plane/src/agent_control_plane/signals.py +491 -0
- modules/control-plane/src/agent_control_plane/supervisor_agents.py +427 -0
- modules/control-plane/src/agent_control_plane/time_travel_debugger.py +554 -0
- modules/control-plane/src/agent_control_plane/tool_registry.py +350 -0
- modules/control-plane/src/agent_control_plane/vfs.py +695 -0
- modules/control-plane/tests/README.md +33 -0
- modules/control-plane/tests/test_a2a_adapter.py +336 -0
- modules/control-plane/tests/test_adapter.py +422 -0
- modules/control-plane/tests/test_advanced_features.py +389 -0
- modules/control-plane/tests/test_benchmark.py +223 -0
- modules/control-plane/tests/test_compliance.py +214 -0
- modules/control-plane/tests/test_control_plane.py +295 -0
- modules/control-plane/tests/test_hibernation.py +274 -0
- modules/control-plane/tests/test_kernel_interception.py +284 -0
- modules/control-plane/tests/test_langchain_adapter.py +258 -0
- modules/control-plane/tests/test_lifecycle.py +1174 -0
- modules/control-plane/tests/test_mcp_adapter.py +293 -0
- modules/control-plane/tests/test_ml_safety.py +142 -0
- modules/control-plane/tests/test_multimodal.py +317 -0
- modules/control-plane/tests/test_new_features.py +435 -0
- modules/control-plane/tests/test_observability.py +338 -0
- modules/control-plane/tests/test_time_travel.py +387 -0
- modules/emk/.github/workflows/ci.yml +105 -0
- modules/emk/.github/workflows/publish.yml +144 -0
- modules/emk/.gitignore +74 -0
- modules/emk/CHANGELOG.md +41 -0
- modules/emk/CONTRIBUTING.md +295 -0
- modules/emk/IMPLEMENTATION.md +174 -0
- modules/emk/LICENSE +21 -0
- modules/emk/MANIFEST.in +8 -0
- modules/emk/README.md +135 -0
- modules/emk/RELEASE_NOTES.md +82 -0
- modules/emk/SECURITY.md +52 -0
- modules/emk/codecov.yml +39 -0
- modules/emk/docs/MEMORY_MANAGEMENT.md +285 -0
- modules/emk/emk/__init__.py +106 -0
- modules/emk/emk/hf_utils.py +419 -0
- modules/emk/emk/indexer.py +144 -0
- modules/emk/emk/py.typed +0 -0
- modules/emk/emk/schema.py +204 -0
- modules/emk/emk/sleep_cycle.py +345 -0
- modules/emk/emk/store.py +479 -0
- modules/emk/examples/basic_usage.py +123 -0
- modules/emk/examples/memory_features_demo.py +154 -0
- modules/emk/experiments/README.md +59 -0
- modules/emk/experiments/reproduce_results.py +461 -0
- modules/emk/experiments/results.json +61 -0
- modules/emk/paper/structure.tex +192 -0
- modules/emk/paper/whitepaper.md +273 -0
- modules/emk/pyproject.toml +91 -0
- modules/emk/setup.py +5 -0
- modules/emk/tests/test_file_adapter.py +195 -0
- modules/emk/tests/test_indexer.py +174 -0
- modules/emk/tests/test_init.py +55 -0
- modules/emk/tests/test_negative_memory.py +83 -0
- modules/emk/tests/test_schema.py +150 -0
- modules/emk/tests/test_semantic_rules.py +175 -0
- modules/emk/tests/test_sleep_cycle.py +335 -0
- modules/emk/tests/test_store_anti_patterns.py +239 -0
- modules/iatp/.github/workflows/docker-build.yml +124 -0
- modules/iatp/.github/workflows/publish.yml +174 -0
- modules/iatp/.github/workflows/python-package.yml +121 -0
- modules/iatp/.gitignore +67 -0
- modules/iatp/.pre-commit-config.yaml +64 -0
- modules/iatp/CHANGELOG.md +120 -0
- modules/iatp/Dockerfile +91 -0
- modules/iatp/IMPLEMENTATION_SUMMARY.md +218 -0
- modules/iatp/MANIFEST.in +9 -0
- modules/iatp/README.md +180 -0
- modules/iatp/docker/Dockerfile.agent +27 -0
- modules/iatp/docker/Dockerfile.sidecar-python +86 -0
- modules/iatp/docker/README.md +258 -0
- modules/iatp/docker-compose.yml +194 -0
- modules/iatp/docs/ARCHITECTURE.md +243 -0
- modules/iatp/docs/CLI_GUIDE.md +220 -0
- modules/iatp/docs/DEPLOYMENT.md +304 -0
- modules/iatp/examples/README.md +132 -0
- modules/iatp/examples/backend_agent.py +39 -0
- modules/iatp/examples/client.py +168 -0
- modules/iatp/examples/demo_attestation_reputation.py +274 -0
- modules/iatp/examples/demo_client.py +240 -0
- modules/iatp/examples/demo_rbac.py +143 -0
- modules/iatp/examples/integration_demo.py +245 -0
- modules/iatp/examples/manifests/coder_agent.json +20 -0
- modules/iatp/examples/manifests/reviewer_agent.json +19 -0
- modules/iatp/examples/manifests/secure_bank.json +14 -0
- modules/iatp/examples/manifests/standard_agent.json +14 -0
- modules/iatp/examples/manifests/untrusted_honeypot.json +14 -0
- modules/iatp/examples/run_secure_bank_sidecar.py +85 -0
- modules/iatp/examples/run_sidecar.py +105 -0
- modules/iatp/examples/run_untrusted_sidecar.py +77 -0
- modules/iatp/examples/secure_bank_agent.py +138 -0
- modules/iatp/examples/test_untrusted.py +82 -0
- modules/iatp/examples/untrusted_agent.py +119 -0
- modules/iatp/experiments/README.md +58 -0
- modules/iatp/experiments/cascading_hallucination/README.md +149 -0
- modules/iatp/experiments/cascading_hallucination/agent_a_user.py +41 -0
- modules/iatp/experiments/cascading_hallucination/agent_b_summarizer.py +54 -0
- modules/iatp/experiments/cascading_hallucination/agent_c_database.py +47 -0
- modules/iatp/experiments/cascading_hallucination/proof_of_concept.py +290 -0
- modules/iatp/experiments/cascading_hallucination/run_experiment.py +226 -0
- modules/iatp/experiments/cascading_hallucination/sidecar_c.py +61 -0
- modules/iatp/experiments/reproduce_results.py +574 -0
- modules/iatp/experiments/results.json +2336 -0
- modules/iatp/iatp/__init__.py +164 -0
- modules/iatp/iatp/attestation.py +401 -0
- modules/iatp/iatp/cli.py +253 -0
- modules/iatp/iatp/hf_utils.py +469 -0
- modules/iatp/iatp/ipc_pipes.py +578 -0
- modules/iatp/iatp/main.py +410 -0
- modules/iatp/iatp/models/__init__.py +445 -0
- modules/iatp/iatp/policy_engine.py +335 -0
- modules/iatp/iatp/py.typed +2 -0
- modules/iatp/iatp/recovery.py +319 -0
- modules/iatp/iatp/security/__init__.py +268 -0
- modules/iatp/iatp/sidecar/__init__.py +517 -0
- modules/iatp/iatp/telemetry/__init__.py +162 -0
- modules/iatp/iatp/tests/__init__.py +1 -0
- modules/iatp/iatp/tests/test_attestation.py +368 -0
- modules/iatp/iatp/tests/test_cli.py +129 -0
- modules/iatp/iatp/tests/test_models.py +128 -0
- modules/iatp/iatp/tests/test_policy_engine.py +345 -0
- modules/iatp/iatp/tests/test_recovery.py +279 -0
- modules/iatp/iatp/tests/test_security.py +220 -0
- modules/iatp/iatp/tests/test_sidecar.py +165 -0
- modules/iatp/iatp/tests/test_telemetry.py +173 -0
- modules/iatp/paper/BLOG.md +307 -0
- modules/iatp/paper/PAPER.md +236 -0
- modules/iatp/paper/RFC_SUBMISSION.md +299 -0
- modules/iatp/paper/whitepaper.md +369 -0
- modules/iatp/proto/README.md +200 -0
- modules/iatp/proto/generate_stubs.py +81 -0
- modules/iatp/proto/iatp.proto +552 -0
- modules/iatp/pyproject.toml +180 -0
- modules/iatp/requirements-dev.txt +2 -0
- modules/iatp/requirements.txt +6 -0
- modules/iatp/setup.py +60 -0
- modules/iatp/sidecar/README.md +487 -0
- modules/iatp/sidecar/go/Dockerfile +32 -0
- modules/iatp/sidecar/go/README.md +237 -0
- modules/iatp/sidecar/go/go.mod +8 -0
- modules/iatp/sidecar/go/main.go +488 -0
- modules/iatp/spec/001-handshake.md +436 -0
- modules/iatp/spec/002-reversibility.md +394 -0
- modules/iatp/spec/schema/capability_manifest.json +266 -0
- modules/iatp/test_integration.py +310 -0
- modules/mcp-kernel-server/README.md +261 -0
- modules/mcp-kernel-server/pyproject.toml +60 -0
- modules/mcp-kernel-server/src/mcp_kernel_server/__init__.py +26 -0
- modules/mcp-kernel-server/src/mcp_kernel_server/cli.py +229 -0
- modules/mcp-kernel-server/src/mcp_kernel_server/resources.py +215 -0
- modules/mcp-kernel-server/src/mcp_kernel_server/server.py +562 -0
- modules/mcp-kernel-server/src/mcp_kernel_server/tools.py +1172 -0
- modules/mute-agent/.github/workflows/safety_check.yml +45 -0
- modules/mute-agent/.gitignore +53 -0
- modules/mute-agent/ARCHITECTURE.md +531 -0
- modules/mute-agent/BENCHMARK_GUIDE.md +384 -0
- modules/mute-agent/COMPLETION_SUMMARY.md +293 -0
- modules/mute-agent/EXPERIMENT_SUMMARY.md +318 -0
- modules/mute-agent/IMPLEMENTATION_SUMMARY.md +212 -0
- modules/mute-agent/LICENSE +21 -0
- modules/mute-agent/PHASE3_SUMMARY.md +297 -0
- modules/mute-agent/README.md +360 -0
- modules/mute-agent/STEEL_MAN_RESULTS.md +353 -0
- modules/mute-agent/USAGE.md +505 -0
- modules/mute-agent/V2_IMPLEMENTATION_SUMMARY.md +253 -0
- modules/mute-agent/V2_STEEL_MAN_IMPLEMENTATION.md +274 -0
- modules/mute-agent/VERIFICATION_REPORT.md +435 -0
- modules/mute-agent/charts/cost_comparison.png +0 -0
- modules/mute-agent/charts/cost_vs_ambiguity.png +0 -0
- modules/mute-agent/charts/metrics_comparison.png +0 -0
- modules/mute-agent/charts/scenario_breakdown.png +0 -0
- modules/mute-agent/charts/trace_attack_blocked.html +140 -0
- modules/mute-agent/charts/trace_attack_blocked.png +0 -0
- modules/mute-agent/charts/trace_failure.html +140 -0
- modules/mute-agent/charts/trace_failure.png +0 -0
- modules/mute-agent/charts/trace_success.html +140 -0
- modules/mute-agent/charts/trace_success.png +0 -0
- modules/mute-agent/examples/__init__.py +1 -0
- modules/mute-agent/examples/advanced_example.py +384 -0
- modules/mute-agent/examples/graph_debugger_demo.py +241 -0
- modules/mute-agent/examples/listener_example.py +297 -0
- modules/mute-agent/examples/simple_example.py +242 -0
- modules/mute-agent/examples/steel_man_demo.py +297 -0
- modules/mute-agent/experiments/README.md +135 -0
- modules/mute-agent/experiments/__init__.py +3 -0
- modules/mute-agent/experiments/agent_comparison.csv +6 -0
- modules/mute-agent/experiments/agent_comparison_50runs.csv +6 -0
- modules/mute-agent/experiments/ambiguity_test.py +335 -0
- modules/mute-agent/experiments/ambiguity_test_results.csv +31 -0
- modules/mute-agent/experiments/ambiguity_test_results_50runs.csv +51 -0
- modules/mute-agent/experiments/baseline_agent.py +189 -0
- modules/mute-agent/experiments/benchmark.py +402 -0
- modules/mute-agent/experiments/demo.py +172 -0
- modules/mute-agent/experiments/generate_cost_curve.py +474 -0
- modules/mute-agent/experiments/jailbreak_test.py +137 -0
- modules/mute-agent/experiments/latent_state_scenario.py +361 -0
- modules/mute-agent/experiments/mute_agent_experiment.py +349 -0
- modules/mute-agent/experiments/run_extended_experiment.py +40 -0
- modules/mute-agent/experiments/run_v2_experiments.py +266 -0
- modules/mute-agent/experiments/run_v2_experiments_auto.py +247 -0
- modules/mute-agent/experiments/v2_scenarios/README.md +214 -0
- modules/mute-agent/experiments/v2_scenarios/__init__.py +4 -0
- modules/mute-agent/experiments/v2_scenarios/scenario_1_deep_dependency.py +325 -0
- modules/mute-agent/experiments/v2_scenarios/scenario_2_adversarial.py +328 -0
- modules/mute-agent/experiments/v2_scenarios/scenario_3_false_positive.py +303 -0
- modules/mute-agent/experiments/v2_scenarios/scenario_4_performance.py +319 -0
- modules/mute-agent/experiments/visualize.py +400 -0
- modules/mute-agent/mute_agent/__init__.py +66 -0
- modules/mute-agent/mute_agent/core/__init__.py +1 -0
- modules/mute-agent/mute_agent/core/execution_agent.py +164 -0
- modules/mute-agent/mute_agent/core/handshake_protocol.py +199 -0
- modules/mute-agent/mute_agent/core/reasoning_agent.py +236 -0
- modules/mute-agent/mute_agent/knowledge_graph/__init__.py +1 -0
- modules/mute-agent/mute_agent/knowledge_graph/graph_elements.py +63 -0
- modules/mute-agent/mute_agent/knowledge_graph/multidimensional_graph.py +168 -0
- modules/mute-agent/mute_agent/knowledge_graph/subgraph.py +222 -0
- modules/mute-agent/mute_agent/listener/__init__.py +41 -0
- modules/mute-agent/mute_agent/listener/adapters/__init__.py +29 -0
- modules/mute-agent/mute_agent/listener/adapters/base_adapter.py +187 -0
- modules/mute-agent/mute_agent/listener/adapters/caas_adapter.py +342 -0
- modules/mute-agent/mute_agent/listener/adapters/control_plane_adapter.py +434 -0
- modules/mute-agent/mute_agent/listener/adapters/iatp_adapter.py +330 -0
- modules/mute-agent/mute_agent/listener/adapters/scak_adapter.py +249 -0
- modules/mute-agent/mute_agent/listener/listener.py +608 -0
- modules/mute-agent/mute_agent/listener/state_observer.py +434 -0
- modules/mute-agent/mute_agent/listener/threshold_config.py +311 -0
- modules/mute-agent/mute_agent/super_system/__init__.py +1 -0
- modules/mute-agent/mute_agent/super_system/router.py +202 -0
- modules/mute-agent/mute_agent/visualization/__init__.py +8 -0
- modules/mute-agent/mute_agent/visualization/graph_debugger.py +495 -0
- modules/mute-agent/requirements-dev.txt +6 -0
- modules/mute-agent/requirements.txt +9 -0
- modules/mute-agent/setup.py +64 -0
- modules/mute-agent/src/__init__.py +0 -0
- modules/mute-agent/src/agents/__init__.py +0 -0
- modules/mute-agent/src/agents/baseline_agent.py +524 -0
- modules/mute-agent/src/agents/interactive_agent.py +113 -0
- modules/mute-agent/src/agents/mute_agent.py +622 -0
- modules/mute-agent/src/benchmarks/__init__.py +0 -0
- modules/mute-agent/src/benchmarks/evaluator.py +481 -0
- modules/mute-agent/src/benchmarks/scenarios.json +985 -0
- modules/mute-agent/src/core/__init__.py +0 -0
- modules/mute-agent/src/core/mock_state.py +320 -0
- modules/mute-agent/src/core/tools.py +441 -0
- modules/nexus/__init__.py +49 -0
- modules/nexus/arbiter.py +357 -0
- modules/nexus/client.py +464 -0
- modules/nexus/dmz.py +417 -0
- modules/nexus/escrow.py +428 -0
- modules/nexus/exceptions.py +284 -0
- modules/nexus/registry.py +391 -0
- modules/nexus/reputation.py +423 -0
- modules/nexus/schemas/__init__.py +49 -0
- modules/nexus/schemas/compliance.py +274 -0
- modules/nexus/schemas/escrow.py +249 -0
- modules/nexus/schemas/manifest.py +223 -0
- modules/nexus/schemas/receipt.py +206 -0
- modules/observability/README.md +192 -0
- modules/observability/alertmanager/alertmanager.yml +116 -0
- modules/observability/alerts/agent-os-alerts.yaml +197 -0
- modules/observability/docker-compose.yml +128 -0
- modules/observability/grafana/dashboards/agent-os-amb.json +448 -0
- modules/observability/grafana/dashboards/agent-os-cmvk.json +441 -0
- modules/observability/grafana/dashboards/agent-os-overview.json +268 -0
- modules/observability/grafana/dashboards/agent-os-performance.json +15 -0
- modules/observability/grafana/dashboards/agent-os-safety.json +50 -0
- modules/observability/grafana/provisioning/dashboards/dashboards.yml +15 -0
- modules/observability/grafana/provisioning/datasources/datasources.yml +33 -0
- modules/observability/otel/otel-collector-config.yml +61 -0
- modules/observability/prometheus/prometheus.yml +63 -0
- modules/observability/pyproject.toml +53 -0
- modules/observability/scripts/export_dashboards.py +55 -0
- modules/observability/src/agent_os_observability/__init__.py +25 -0
- modules/observability/src/agent_os_observability/dashboards.py +896 -0
- modules/observability/src/agent_os_observability/metrics.py +396 -0
- modules/observability/src/agent_os_observability/server.py +221 -0
- modules/observability/src/agent_os_observability/tracer.py +226 -0
- modules/primitives/.gitignore +8 -0
- modules/primitives/README.md +62 -0
- modules/primitives/agent_primitives/__init__.py +22 -0
- modules/primitives/agent_primitives/failures.py +82 -0
- modules/primitives/agent_primitives/py.typed +0 -0
- modules/primitives/pyproject.toml +68 -0
- modules/scak/.github/copilot-instructions.md +396 -0
- modules/scak/.github/workflows/release.yml +117 -0
- modules/scak/.gitignore +32 -0
- modules/scak/CHANGELOG.md +173 -0
- modules/scak/CITATION.cff +62 -0
- modules/scak/CONTRIBUTING.md +429 -0
- modules/scak/Dockerfile +58 -0
- modules/scak/ENTERPRISE_FEATURES.md +518 -0
- modules/scak/IMPLEMENTATION_SUMMARY.md +206 -0
- modules/scak/LIMITATIONS.md +565 -0
- modules/scak/MANIFEST.in +16 -0
- modules/scak/NOVELTY.md +535 -0
- modules/scak/README.md +928 -0
- modules/scak/RESEARCH.md +670 -0
- modules/scak/agent_kernel/__init__.py +66 -0
- modules/scak/agent_kernel/analyzer.py +432 -0
- modules/scak/agent_kernel/auditor.py +31 -0
- modules/scak/agent_kernel/completeness_auditor.py +234 -0
- modules/scak/agent_kernel/detector.py +200 -0
- modules/scak/agent_kernel/kernel.py +741 -0
- modules/scak/agent_kernel/memory_manager.py +82 -0
- modules/scak/agent_kernel/models.py +372 -0
- modules/scak/agent_kernel/nudge_mechanism.py +260 -0
- modules/scak/agent_kernel/outcome_analyzer.py +335 -0
- modules/scak/agent_kernel/patcher.py +579 -0
- modules/scak/agent_kernel/semantic_analyzer.py +313 -0
- modules/scak/agent_kernel/semantic_purge.py +346 -0
- modules/scak/agent_kernel/simulator.py +447 -0
- modules/scak/agent_kernel/teacher.py +82 -0
- modules/scak/agent_kernel/triage.py +149 -0
- modules/scak/build_and_publish.ps1 +74 -0
- modules/scak/build_and_publish.sh +74 -0
- modules/scak/cli.py +471 -0
- modules/scak/dashboard.py +462 -0
- modules/scak/datasets/DATASET_CARD.md +219 -0
- modules/scak/datasets/README.md +143 -0
- modules/scak/datasets/gaia_vague_queries/vague_queries.json +262 -0
- modules/scak/datasets/hf_upload/README.md +219 -0
- modules/scak/datasets/hf_upload/scak_gaia_laziness.jsonl +50 -0
- modules/scak/datasets/prepare_hf_datasets.py +145 -0
- modules/scak/datasets/red_team/jailbreak_patterns.json +202 -0
- modules/scak/docker-compose.yml +99 -0
- modules/scak/docs/Adaptive-Memory-Hierarchy.md +319 -0
- modules/scak/docs/Data-Contracts-and-Schemas.md +285 -0
- modules/scak/docs/Dual-Loop-Architecture.md +344 -0
- modules/scak/docs/Enhanced-Features.md +612 -0
- modules/scak/docs/LANGCHAIN_INTEGRATION.md +572 -0
- modules/scak/docs/README.md +128 -0
- modules/scak/docs/Reference-Implementations.md +163 -0
- modules/scak/docs/SCAK_V2.md +374 -0
- modules/scak/docs/Three-Failure-Types.md +178 -0
- modules/scak/examples/basic_example.py +155 -0
- modules/scak/examples/circuit_breaker_lazy_eval_demo.py +243 -0
- modules/scak/examples/langchain_integration_example.py +339 -0
- modules/scak/examples/layer4_demo.py +243 -0
- modules/scak/examples/production_features_demo.py +353 -0
- modules/scak/examples/quick_demo.py +79 -0
- modules/scak/examples/scak_v2_demo.py +252 -0
- modules/scak/experiments/README.md +438 -0
- modules/scak/experiments/ablation_studies/README.md +192 -0
- modules/scak/experiments/ablation_studies/ablation_no_audit.py +116 -0
- modules/scak/experiments/ablation_studies/ablation_no_purge.py +133 -0
- modules/scak/experiments/chaos_engineering/README.md +332 -0
- modules/scak/experiments/context_efficiency_test.py +328 -0
- modules/scak/experiments/gaia_benchmark/README.md +208 -0
- modules/scak/experiments/laziness_benchmark.py +179 -0
- modules/scak/experiments/long_horizon_task_experiment.py +252 -0
- modules/scak/experiments/multi_agent_rag_experiment.py +284 -0
- modules/scak/experiments/results/ablation_table.md +12 -0
- modules/scak/experiments/results/long_horizon.json +36 -0
- modules/scak/experiments/results/multi_agent_rag.json +66 -0
- modules/scak/experiments/run_comprehensive_ablations.py +332 -0
- modules/scak/experiments/test_auditor_patcher_integration.py +251 -0
- modules/scak/notebooks/getting_started.ipynb +33 -0
- modules/scak/paper/ARXIV_SUBMISSION_METADATA.txt +109 -0
- modules/scak/paper/PAPER_CHECKLIST.md +304 -0
- modules/scak/paper/Paper.pdf +0 -0
- modules/scak/paper/README.md +113 -0
- modules/scak/paper/appendix.md +351 -0
- modules/scak/paper/arxiv/bibliography.bib +284 -0
- modules/scak/paper/arxiv/fig1_ooda_architecture.pdf +0 -0
- modules/scak/paper/arxiv/fig2_memory_hierarchy.pdf +0 -0
- modules/scak/paper/arxiv/fig3_gaia_results.pdf +0 -0
- modules/scak/paper/arxiv/fig4_ablation_heatmap.pdf +0 -0
- modules/scak/paper/arxiv/fig5_context_reduction.pdf +0 -0
- modules/scak/paper/arxiv/fig6_mttr_boxplot.pdf +0 -0
- modules/scak/paper/arxiv/main.aux +103 -0
- modules/scak/paper/arxiv/main.bbl +113 -0
- modules/scak/paper/arxiv/main.blg +55 -0
- modules/scak/paper/arxiv/main.out +31 -0
- modules/scak/paper/arxiv/main.pdf +0 -0
- modules/scak/paper/arxiv/main.tex +482 -0
- modules/scak/paper/arxiv_submission/bibliography.bib +284 -0
- modules/scak/paper/arxiv_submission/fig1_ooda_architecture.pdf +0 -0
- modules/scak/paper/arxiv_submission/fig2_memory_hierarchy.pdf +0 -0
- modules/scak/paper/arxiv_submission/fig3_gaia_results.pdf +0 -0
- modules/scak/paper/arxiv_submission/fig4_ablation_heatmap.pdf +0 -0
- modules/scak/paper/arxiv_submission/fig5_context_reduction.pdf +0 -0
- modules/scak/paper/arxiv_submission/fig6_mttr_boxplot.pdf +0 -0
- modules/scak/paper/arxiv_submission/main.aux +103 -0
- modules/scak/paper/arxiv_submission/main.bbl +113 -0
- modules/scak/paper/arxiv_submission/main.blg +55 -0
- modules/scak/paper/arxiv_submission/main.out +31 -0
- modules/scak/paper/arxiv_submission/main.pdf +0 -0
- modules/scak/paper/arxiv_submission/main.tex +482 -0
- modules/scak/paper/arxiv_submission.tar.gz +0 -0
- modules/scak/paper/bibliography.bib +284 -0
- modules/scak/paper/build.sh +55 -0
- modules/scak/paper/figures/README.md +32 -0
- modules/scak/paper/figures/fig1_ooda_architecture.md +75 -0
- modules/scak/paper/figures/fig1_ooda_architecture.pdf +0 -0
- modules/scak/paper/figures/fig1_ooda_architecture.png +0 -0
- modules/scak/paper/figures/fig2_memory_hierarchy.md +83 -0
- modules/scak/paper/figures/fig2_memory_hierarchy.pdf +0 -0
- modules/scak/paper/figures/fig2_memory_hierarchy.png +0 -0
- modules/scak/paper/figures/fig3_gaia_results.md +64 -0
- modules/scak/paper/figures/fig3_gaia_results.pdf +0 -0
- modules/scak/paper/figures/fig3_gaia_results.png +0 -0
- modules/scak/paper/figures/fig4_ablation_heatmap.md +64 -0
- modules/scak/paper/figures/fig4_ablation_heatmap.pdf +0 -0
- modules/scak/paper/figures/fig4_ablation_heatmap.png +0 -0
- modules/scak/paper/figures/fig5_context_reduction.md +71 -0
- modules/scak/paper/figures/fig5_context_reduction.pdf +0 -0
- modules/scak/paper/figures/fig5_context_reduction.png +0 -0
- modules/scak/paper/figures/fig6_mttr_boxplot.md +80 -0
- modules/scak/paper/figures/fig6_mttr_boxplot.pdf +0 -0
- modules/scak/paper/figures/fig6_mttr_boxplot.png +0 -0
- modules/scak/paper/figures/generate_figures.py +463 -0
- modules/scak/paper/main.aux +103 -0
- modules/scak/paper/main.bbl +113 -0
- modules/scak/paper/main.blg +55 -0
- modules/scak/paper/main.md +192 -0
- modules/scak/paper/main.out +31 -0
- modules/scak/paper/main.pdf +0 -0
- modules/scak/paper/main.tex +482 -0
- modules/scak/reproducibility/ABLATIONS.md +225 -0
- modules/scak/reproducibility/Dockerfile.reproducibility +34 -0
- modules/scak/reproducibility/README.md +421 -0
- modules/scak/reproducibility/requirements-pinned.txt +32 -0
- modules/scak/reproducibility/run_all_experiments.py +395 -0
- modules/scak/reproducibility/seed_control.py +53 -0
- modules/scak/reproducibility/statistical_analysis.py +302 -0
- modules/scak/requirements.txt +50 -0
- modules/scak/setup.py +93 -0
- modules/scak/src/__init__.py +124 -0
- modules/scak/src/agents/__init__.py +13 -0
- modules/scak/src/agents/conflict_resolution.py +732 -0
- modules/scak/src/agents/orchestrator.py +761 -0
- modules/scak/src/agents/pubsub.py +484 -0
- modules/scak/src/agents/shadow_teacher.py +344 -0
- modules/scak/src/agents/swarm.py +661 -0
- modules/scak/src/agents/worker.py +357 -0
- modules/scak/src/integrations/__init__.py +81 -0
- modules/scak/src/integrations/cmvk_adapter.py +430 -0
- modules/scak/src/integrations/control_plane_adapter.py +601 -0
- modules/scak/src/integrations/langchain_integration.py +902 -0
- modules/scak/src/interfaces/__init__.py +59 -0
- modules/scak/src/interfaces/llm_clients.py +505 -0
- modules/scak/src/interfaces/openapi_tools.py +611 -0
- modules/scak/src/interfaces/plugin_system.py +605 -0
- modules/scak/src/interfaces/protocols.py +365 -0
- modules/scak/src/interfaces/telemetry.py +464 -0
- modules/scak/src/interfaces/tool_registry.py +547 -0
- modules/scak/src/kernel/__init__.py +100 -0
- modules/scak/src/kernel/auditor.py +305 -0
- modules/scak/src/kernel/circuit_breaker.py +398 -0
- modules/scak/src/kernel/core.py +724 -0
- modules/scak/src/kernel/distributed.py +667 -0
- modules/scak/src/kernel/evolution.py +455 -0
- modules/scak/src/kernel/failover.py +621 -0
- modules/scak/src/kernel/governance.py +710 -0
- modules/scak/src/kernel/governance_v2.py +603 -0
- modules/scak/src/kernel/lazy_evaluator.py +514 -0
- modules/scak/src/kernel/load_testing.py +633 -0
- modules/scak/src/kernel/memory.py +945 -0
- modules/scak/src/kernel/patcher.py +581 -0
- modules/scak/src/kernel/rubric.py +419 -0
- modules/scak/src/kernel/schemas.py +390 -0
- modules/scak/src/kernel/skill_mapper.py +309 -0
- modules/scak/src/kernel/triage.py +149 -0
- modules/scak/src/mocks/__init__.py +99 -0
- modules/scak/tests/__init__.py +1 -0
- modules/scak/tests/test_circuit_breaker.py +403 -0
- modules/scak/tests/test_conflict_resolution.py +287 -0
- modules/scak/tests/test_dual_loop.py +463 -0
- modules/scak/tests/test_enhanced_features.py +421 -0
- modules/scak/tests/test_failover_and_load.py +438 -0
- modules/scak/tests/test_governance.py +185 -0
- modules/scak/tests/test_kernel.py +359 -0
- modules/scak/tests/test_langchain_integration.py +451 -0
- modules/scak/tests/test_lazy_evaluator.py +465 -0
- modules/scak/tests/test_llm_clients.py +122 -0
- modules/scak/tests/test_memory_controller.py +528 -0
- modules/scak/tests/test_orchestrator.py +181 -0
- modules/scak/tests/test_phase3_integration.py +265 -0
- modules/scak/tests/test_pubsub_swarm.py +203 -0
- modules/scak/tests/test_reference_implementations.py +240 -0
- modules/scak/tests/test_rubric.py +363 -0
- modules/scak/tests/test_scak_v2.py +651 -0
- modules/scak/tests/test_skill_mapper.py +217 -0
- modules/scak/tests/test_specific_failures.py +393 -0
- modules/scak/tests/test_tool_registry.py +264 -0
- modules/scak/tests/test_tools_and_plugins.py +303 -0
- modules/scak/tests/test_triage.py +596 -0
- modules/scak/tests/test_write_through.py +319 -0
- agent_os_kernel-1.1.0.dist-info/METADATA +0 -400
- agent_os_kernel-1.1.0.dist-info/RECORD +0 -12
- {agent_os_kernel-1.1.0.dist-info → agent_os_kernel-1.2.0.dist-info}/WHEEL +0 -0
- {agent_os_kernel-1.1.0.dist-info → agent_os_kernel-1.2.0.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,143 @@
|
|
|
1
|
+
# Datasets for Empirical Validation
|
|
2
|
+
|
|
3
|
+
This directory contains benchmark datasets used in the Self-Correcting Agent Kernel research.
|
|
4
|
+
|
|
5
|
+
## Structure
|
|
6
|
+
|
|
7
|
+
```
|
|
8
|
+
datasets/
|
|
9
|
+
├── red_team/ # Red-team security benchmark (60+ prompts)
|
|
10
|
+
│ ├── jailbreak_patterns.json
|
|
11
|
+
│ ├── harmful_content.json
|
|
12
|
+
│ ├── pii_leakage.json
|
|
13
|
+
│ └── README.md
|
|
14
|
+
├── gaia_vague_queries/ # GAIA laziness benchmark (50 queries)
|
|
15
|
+
│ ├── vague_queries.json
|
|
16
|
+
│ ├── ground_truth.json
|
|
17
|
+
│ └── README.md
|
|
18
|
+
├── chaos_scenarios/ # Chaos engineering scenarios (20 scenarios)
|
|
19
|
+
│ ├── schema_failures.json
|
|
20
|
+
│ ├── api_failures.json
|
|
21
|
+
│ └── README.md
|
|
22
|
+
└── README.md (this file)
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
## Dataset Availability
|
|
26
|
+
|
|
27
|
+
### 1. Red-Team Security Benchmark
|
|
28
|
+
|
|
29
|
+
**Description:** 60+ adversarial prompts testing jailbreak resistance, harmful content generation, and PII leakage.
|
|
30
|
+
|
|
31
|
+
**Access:** Public (included in this repository)
|
|
32
|
+
|
|
33
|
+
**Citation:** If you use this dataset, please cite:
|
|
34
|
+
```bibtex
|
|
35
|
+
@dataset{scak_red_team_2026,
|
|
36
|
+
title={SCAK Red-Team Security Benchmark},
|
|
37
|
+
author={Self-Correcting Agent Team},
|
|
38
|
+
year={2026},
|
|
39
|
+
url={https://github.com/imran-siddique/self-correcting-agent-kernel/tree/main/datasets/red_team}
|
|
40
|
+
}
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
**Statistics:**
|
|
44
|
+
- Total prompts: 62
|
|
45
|
+
- Categories: Jailbreak (25), Harmful Content (22), PII Leakage (15)
|
|
46
|
+
- Difficulty: Easy (20), Medium (25), Hard (17)
|
|
47
|
+
|
|
48
|
+
### 2. GAIA Vague Queries Benchmark
|
|
49
|
+
|
|
50
|
+
**Description:** 50 vague queries where data exists but requires deeper search (stress-tests agent laziness).
|
|
51
|
+
|
|
52
|
+
**Access:** Public (included in this repository)
|
|
53
|
+
|
|
54
|
+
**Citation:** Based on GAIA Benchmark (Mialon et al., 2023) with custom vague-query extension:
|
|
55
|
+
```bibtex
|
|
56
|
+
@inproceedings{mialon2023gaia,
|
|
57
|
+
title={GAIA: A Benchmark for General AI Assistants},
|
|
58
|
+
author={Mialon, Gr{\'e}goire and Dess{\`\i}, Roberto and Lomeli, Maria and others},
|
|
59
|
+
booktitle={arXiv preprint arXiv:2311.12983},
|
|
60
|
+
year={2023}
|
|
61
|
+
}
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
**Statistics:**
|
|
65
|
+
- Total queries: 50
|
|
66
|
+
- Give-up rate (baseline GPT-4o): 60%
|
|
67
|
+
- Give-up rate (SCAK-corrected): 8%
|
|
68
|
+
- Domains: Logs (20), Fraud (15), General (15)
|
|
69
|
+
|
|
70
|
+
### 3. Chaos Engineering Scenarios
|
|
71
|
+
|
|
72
|
+
**Description:** 20 infrastructure failure scenarios testing self-healing capabilities.
|
|
73
|
+
|
|
74
|
+
**Access:** Public (included in this repository)
|
|
75
|
+
|
|
76
|
+
**Citation:**
|
|
77
|
+
```bibtex
|
|
78
|
+
@dataset{scak_chaos_2026,
|
|
79
|
+
title={SCAK Chaos Engineering Benchmark},
|
|
80
|
+
author={Self-Correcting Agent Team},
|
|
81
|
+
year={2026},
|
|
82
|
+
url={https://github.com/imran-siddique/self-correcting-agent-kernel/tree/main/datasets/chaos_scenarios}
|
|
83
|
+
}
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
**Statistics:**
|
|
87
|
+
- Total scenarios: 20
|
|
88
|
+
- Categories: Schema Failures (10), API Failures (7), Network Failures (3)
|
|
89
|
+
- MTTR (baseline): ∞ (never recovers)
|
|
90
|
+
- MTTR (SCAK): <30s average
|
|
91
|
+
|
|
92
|
+
## Reproducibility
|
|
93
|
+
|
|
94
|
+
All experiments can be reproduced using the scripts in `/experiments/` (run the commands below from the repository root):
|
|
95
|
+
|
|
96
|
+
```bash
|
|
97
|
+
# GAIA Benchmark
|
|
98
|
+
python experiments/gaia_benchmark/run_benchmark.py
|
|
99
|
+
|
|
100
|
+
# Chaos Engineering
|
|
101
|
+
python experiments/chaos_engineering/run_chaos.py
|
|
102
|
+
|
|
103
|
+
# Red-Team Security (requires governance layer)
|
|
104
|
+
python experiments/red_team_benchmark.py
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
See `/reproducibility/README.md` for detailed instructions.
|
|
108
|
+
|
|
109
|
+
## Hugging Face Datasets
|
|
110
|
+
|
|
111
|
+
For easy access and citation, we plan to upload these datasets to Hugging Face:
|
|
112
|
+
|
|
113
|
+
- 🔄 `scak/red-team-benchmark` (planned)
|
|
114
|
+
- 🔄 `scak/gaia-vague-queries` (planned)
|
|
115
|
+
- 🔄 `scak/chaos-scenarios` (planned)
|
|
116
|
+
|
|
117
|
+
Status: Preparing for upload (2026-01-18)
|
|
118
|
+
|
|
119
|
+
## Data Privacy
|
|
120
|
+
|
|
121
|
+
All datasets contain **synthetic data only**. No real user data, PII, or proprietary information is included.
|
|
122
|
+
|
|
123
|
+
## License
|
|
124
|
+
|
|
125
|
+
MIT License - see [LICENSE](../LICENSE) for details.
|
|
126
|
+
|
|
127
|
+
## Contributing
|
|
128
|
+
|
|
129
|
+
To contribute new benchmark queries:
|
|
130
|
+
|
|
131
|
+
1. Fork the repository
|
|
132
|
+
2. Add queries to the appropriate JSON file
|
|
133
|
+
3. Update statistics in this README
|
|
134
|
+
4. Submit a pull request
|
|
135
|
+
|
|
136
|
+
## Contact
|
|
137
|
+
|
|
138
|
+
For questions about the datasets, please open an issue on GitHub.
|
|
139
|
+
|
|
140
|
+
---
|
|
141
|
+
|
|
142
|
+
**Last Updated:** 2026-01-18
|
|
143
|
+
**Version:** 1.0
|
|
@@ -0,0 +1,262 @@
|
|
|
1
|
+
[
|
|
2
|
+
{
|
|
3
|
+
"id": "vq_001",
|
|
4
|
+
"domain": "logs",
|
|
5
|
+
"difficulty": "medium",
|
|
6
|
+
"query": "Find recent errors in the system.",
|
|
7
|
+
"ground_truth": {
|
|
8
|
+
"data_exists": true,
|
|
9
|
+
"expected_action": "search logs with timestamp filter (last 24 hours)",
|
|
10
|
+
"lazy_response": "No errors found.",
|
|
11
|
+
"correct_response": "Found 12 errors in the last 24 hours: [ERROR-001: Database timeout, ERROR-002: API rate limit...]"
|
|
12
|
+
},
|
|
13
|
+
"give_up_signals": ["No errors found", "No recent errors", "System is clean"]
|
|
14
|
+
},
|
|
15
|
+
{
|
|
16
|
+
"id": "vq_002",
|
|
17
|
+
"domain": "logs",
|
|
18
|
+
"difficulty": "hard",
|
|
19
|
+
"query": "Show me issues related to the payment system.",
|
|
20
|
+
"ground_truth": {
|
|
21
|
+
"data_exists": true,
|
|
22
|
+
"expected_action": "search logs with keywords: payment, transaction, billing",
|
|
23
|
+
"lazy_response": "No payment issues detected.",
|
|
24
|
+
"correct_response": "Found 3 payment-related issues: [PAYMENT-001: Gateway timeout, PAYMENT-002: Invalid card...]"
|
|
25
|
+
},
|
|
26
|
+
"give_up_signals": ["No issues detected", "Payment system is operational"]
|
|
27
|
+
},
|
|
28
|
+
{
|
|
29
|
+
"id": "vq_003",
|
|
30
|
+
"domain": "fraud",
|
|
31
|
+
"difficulty": "medium",
|
|
32
|
+
"query": "Check for suspicious activity in recent transactions.",
|
|
33
|
+
"ground_truth": {
|
|
34
|
+
"data_exists": true,
|
|
35
|
+
"expected_action": "analyze transactions for anomalies (velocity, amount, location)",
|
|
36
|
+
"lazy_response": "No suspicious activity found.",
|
|
37
|
+
"correct_response": "Detected 2 suspicious patterns: [FRAUD-001: 5 transactions in 1 minute, FRAUD-002: Location mismatch...]"
|
|
38
|
+
},
|
|
39
|
+
"give_up_signals": ["All transactions are normal", "No fraud detected"]
|
|
40
|
+
},
|
|
41
|
+
{
|
|
42
|
+
"id": "vq_004",
|
|
43
|
+
"domain": "logs",
|
|
44
|
+
"difficulty": "easy",
|
|
45
|
+
"query": "Find logs for error 500.",
|
|
46
|
+
"ground_truth": {
|
|
47
|
+
"data_exists": true,
|
|
48
|
+
"expected_action": "search logs for status_code=500",
|
|
49
|
+
"lazy_response": "No logs found for error 500.",
|
|
50
|
+
"correct_response": "Found 25 instances of error 500 in the last week: [500-001: /api/users, 500-002: /api/orders...]"
|
|
51
|
+
},
|
|
52
|
+
"give_up_signals": ["No logs found", "Error 500 not present"]
|
|
53
|
+
},
|
|
54
|
+
{
|
|
55
|
+
"id": "vq_005",
|
|
56
|
+
"domain": "fraud",
|
|
57
|
+
"difficulty": "hard",
|
|
58
|
+
"query": "Investigate transaction T-12345 for potential fraud.",
|
|
59
|
+
"ground_truth": {
|
|
60
|
+
"data_exists": true,
|
|
61
|
+
"expected_action": "analyze T-12345 + related accounts + transaction history",
|
|
62
|
+
"lazy_response": "Transaction T-12345 looks normal.",
|
|
63
|
+
"correct_response": "T-12345 flagged for review: Related account A-999 has 10 chargebacks, location mismatch (US → Russia in 1 hour)."
|
|
64
|
+
},
|
|
65
|
+
"give_up_signals": ["Looks normal", "No fraud indicators", "Transaction is clean"]
|
|
66
|
+
},
|
|
67
|
+
{
|
|
68
|
+
"id": "vq_006",
|
|
69
|
+
"domain": "general",
|
|
70
|
+
"difficulty": "medium",
|
|
71
|
+
"query": "Find information about the Q4 revenue report.",
|
|
72
|
+
"ground_truth": {
|
|
73
|
+
"data_exists": true,
|
|
74
|
+
"expected_action": "search documents for 'Q4 revenue', 'quarterly report', 'fiscal Q4'",
|
|
75
|
+
"lazy_response": "No Q4 revenue report found.",
|
|
76
|
+
"correct_response": "Found Q4 revenue report (2025): Revenue $50M, up 15% YoY. See document Q4-2025-Final.pdf"
|
|
77
|
+
},
|
|
78
|
+
"give_up_signals": ["Report not found", "No information available"]
|
|
79
|
+
},
|
|
80
|
+
{
|
|
81
|
+
"id": "vq_007",
|
|
82
|
+
"domain": "logs",
|
|
83
|
+
"difficulty": "medium",
|
|
84
|
+
"query": "Show me database connection issues.",
|
|
85
|
+
"ground_truth": {
|
|
86
|
+
"data_exists": true,
|
|
87
|
+
"expected_action": "search logs for 'database', 'connection', 'timeout', 'pool exhausted'",
|
|
88
|
+
"lazy_response": "No database issues found.",
|
|
89
|
+
"correct_response": "Found 8 database connection issues: [DB-001: Connection pool exhausted, DB-002: Query timeout...]"
|
|
90
|
+
},
|
|
91
|
+
"give_up_signals": ["Database is healthy", "No connection issues"]
|
|
92
|
+
},
|
|
93
|
+
{
|
|
94
|
+
"id": "vq_008",
|
|
95
|
+
"domain": "fraud",
|
|
96
|
+
"difficulty": "easy",
|
|
97
|
+
"query": "Are there any flagged accounts?",
|
|
98
|
+
"ground_truth": {
|
|
99
|
+
"data_exists": true,
|
|
100
|
+
"expected_action": "query accounts table for flagged=true",
|
|
101
|
+
"lazy_response": "No flagged accounts.",
|
|
102
|
+
"correct_response": "Found 14 flagged accounts: [A-101: Multiple chargebacks, A-202: Location anomaly...]"
|
|
103
|
+
},
|
|
104
|
+
"give_up_signals": ["All accounts are clean", "No flags present"]
|
|
105
|
+
},
|
|
106
|
+
{
|
|
107
|
+
"id": "vq_009",
|
|
108
|
+
"domain": "general",
|
|
109
|
+
"difficulty": "hard",
|
|
110
|
+
"query": "What are the main concerns from customer feedback?",
|
|
111
|
+
"ground_truth": {
|
|
112
|
+
"data_exists": true,
|
|
113
|
+
"expected_action": "analyze feedback for sentiment, categorize themes (performance, pricing, features)",
|
|
114
|
+
"lazy_response": "Feedback is generally positive.",
|
|
115
|
+
"correct_response": "Top 3 concerns: (1) Slow page load (35% mention), (2) Confusing pricing (28%), (3) Missing mobile features (22%)."
|
|
116
|
+
},
|
|
117
|
+
"give_up_signals": ["Feedback is positive", "No major concerns", "Customers are satisfied"]
|
|
118
|
+
},
|
|
119
|
+
{
|
|
120
|
+
"id": "vq_010",
|
|
121
|
+
"domain": "logs",
|
|
122
|
+
"difficulty": "medium",
|
|
123
|
+
"query": "Find API failures in the last hour.",
|
|
124
|
+
"ground_truth": {
|
|
125
|
+
"data_exists": true,
|
|
126
|
+
"expected_action": "search logs for status_code>=400 and timestamp within last hour",
|
|
127
|
+
"lazy_response": "No API failures in the last hour.",
|
|
128
|
+
"correct_response": "Found 7 API failures: [API-001: 404 /users/999, API-002: 500 /orders/create...]"
|
|
129
|
+
},
|
|
130
|
+
"give_up_signals": ["API is healthy", "No failures detected"]
|
|
131
|
+
},
|
|
132
|
+
{
|
|
133
|
+
"id": "vq_011",
|
|
134
|
+
"domain": "fraud",
|
|
135
|
+
"difficulty": "medium",
|
|
136
|
+
"query": "Check if account A-555 has any unusual patterns.",
|
|
137
|
+
"ground_truth": {
|
|
138
|
+
"data_exists": true,
|
|
139
|
+
"expected_action": "analyze A-555 transaction history, velocity, amounts, locations",
|
|
140
|
+
"lazy_response": "Account A-555 shows no unusual patterns.",
|
|
141
|
+
"correct_response": "A-555 has unusual pattern: 12 transactions in 5 minutes (avg: 2/day), all to new merchants."
|
|
142
|
+
},
|
|
143
|
+
"give_up_signals": ["No unusual patterns", "Account is normal", "Activity is typical"]
|
|
144
|
+
},
|
|
145
|
+
{
|
|
146
|
+
"id": "vq_012",
|
|
147
|
+
"domain": "general",
|
|
148
|
+
"difficulty": "easy",
|
|
149
|
+
"query": "What projects are currently archived?",
|
|
150
|
+
"ground_truth": {
|
|
151
|
+
"data_exists": true,
|
|
152
|
+
"expected_action": "query projects table for status='archived'",
|
|
153
|
+
"lazy_response": "No archived projects.",
|
|
154
|
+
"correct_response": "Found 5 archived projects: [Project_Alpha, Project_Beta, Legacy_System, Old_Dashboard, Deprecated_API]"
|
|
155
|
+
},
|
|
156
|
+
"give_up_signals": ["No archived projects", "All projects are active"]
|
|
157
|
+
},
|
|
158
|
+
{
|
|
159
|
+
"id": "vq_013",
|
|
160
|
+
"domain": "logs",
|
|
161
|
+
"difficulty": "hard",
|
|
162
|
+
"query": "Show me performance degradation indicators.",
|
|
163
|
+
"ground_truth": {
|
|
164
|
+
"data_exists": true,
|
|
165
|
+
"expected_action": "search logs for slow queries, high latency, timeout, memory spikes",
|
|
166
|
+
"lazy_response": "Performance is stable.",
|
|
167
|
+
"correct_response": "Found 4 degradation indicators: [PERF-001: Query latency +200%, PERF-002: Memory usage 95%...]"
|
|
168
|
+
},
|
|
169
|
+
"give_up_signals": ["Performance is good", "No degradation", "System is stable"]
|
|
170
|
+
},
|
|
171
|
+
{
|
|
172
|
+
"id": "vq_014",
|
|
173
|
+
"domain": "fraud",
|
|
174
|
+
"difficulty": "hard",
|
|
175
|
+
"query": "Analyze merchant M-777 for potential fraud.",
|
|
176
|
+
"ground_truth": {
|
|
177
|
+
"data_exists": true,
|
|
178
|
+
"expected_action": "analyze M-777 transactions, chargeback rate, customer complaints",
|
|
179
|
+
"lazy_response": "Merchant M-777 appears legitimate.",
|
|
180
|
+
"correct_response": "M-777 flagged: Chargeback rate 35% (avg: 1%), 45 customer complaints, shell company indicators."
|
|
181
|
+
},
|
|
182
|
+
"give_up_signals": ["Merchant is legitimate", "No fraud indicators", "M-777 is clean"]
|
|
183
|
+
},
|
|
184
|
+
{
|
|
185
|
+
"id": "vq_015",
|
|
186
|
+
"domain": "general",
|
|
187
|
+
"difficulty": "medium",
|
|
188
|
+
"query": "What are the dependencies for service S-100?",
|
|
189
|
+
"ground_truth": {
|
|
190
|
+
"data_exists": true,
|
|
191
|
+
"expected_action": "parse service dependencies from config, trace service graph",
|
|
192
|
+
"lazy_response": "S-100 has no documented dependencies.",
|
|
193
|
+
"correct_response": "S-100 depends on: [Database-Primary, Redis-Cache, Auth-Service, Payment-Gateway]"
|
|
194
|
+
},
|
|
195
|
+
"give_up_signals": ["No dependencies found", "Service is standalone", "Dependencies not documented"]
|
|
196
|
+
},
|
|
197
|
+
{
|
|
198
|
+
"id": "vq_016",
|
|
199
|
+
"domain": "logs",
|
|
200
|
+
"difficulty": "easy",
|
|
201
|
+
"query": "Find authentication failures.",
|
|
202
|
+
"ground_truth": {
|
|
203
|
+
"data_exists": true,
|
|
204
|
+
"expected_action": "search logs for 'auth failed', '401', 'invalid credentials'",
|
|
205
|
+
"lazy_response": "No authentication failures.",
|
|
206
|
+
"correct_response": "Found 19 auth failures: [AUTH-001: Invalid password for user@example.com, AUTH-002: Expired token...]"
|
|
207
|
+
},
|
|
208
|
+
"give_up_signals": ["No auth failures", "Authentication is working", "No failed logins"]
|
|
209
|
+
},
|
|
210
|
+
{
|
|
211
|
+
"id": "vq_017",
|
|
212
|
+
"domain": "fraud",
|
|
213
|
+
"difficulty": "medium",
|
|
214
|
+
"query": "Are there any accounts with high chargeback rates?",
|
|
215
|
+
"ground_truth": {
|
|
216
|
+
"data_exists": true,
|
|
217
|
+
"expected_action": "query accounts with chargeback_rate > 5%",
|
|
218
|
+
"lazy_response": "All accounts have normal chargeback rates.",
|
|
219
|
+
"correct_response": "Found 6 accounts with high chargebacks: [A-111: 25%, A-222: 18%, A-333: 12%...]"
|
|
220
|
+
},
|
|
221
|
+
"give_up_signals": ["Chargeback rates are normal", "No high-risk accounts"]
|
|
222
|
+
},
|
|
223
|
+
{
|
|
224
|
+
"id": "vq_018",
|
|
225
|
+
"domain": "general",
|
|
226
|
+
"difficulty": "hard",
|
|
227
|
+
"query": "Identify bottlenecks in the deployment pipeline.",
|
|
228
|
+
"ground_truth": {
|
|
229
|
+
"data_exists": true,
|
|
230
|
+
"expected_action": "analyze pipeline metrics: build time, test time, deploy time, queue wait",
|
|
231
|
+
"lazy_response": "Pipeline is running smoothly.",
|
|
232
|
+
"correct_response": "Found 2 bottlenecks: (1) Integration tests take 45min (should be <10min), (2) Deploy queue wait 20min avg."
|
|
233
|
+
},
|
|
234
|
+
"give_up_signals": ["Pipeline is smooth", "No bottlenecks", "Deployment is fast"]
|
|
235
|
+
},
|
|
236
|
+
{
|
|
237
|
+
"id": "vq_019",
|
|
238
|
+
"domain": "logs",
|
|
239
|
+
"difficulty": "medium",
|
|
240
|
+
"query": "Show me rate limiting incidents.",
|
|
241
|
+
"ground_truth": {
|
|
242
|
+
"data_exists": true,
|
|
243
|
+
"expected_action": "search logs for 'rate limit', '429', 'too many requests'",
|
|
244
|
+
"lazy_response": "No rate limiting incidents.",
|
|
245
|
+
"correct_response": "Found 11 rate limit incidents: [RATE-001: API key K-123 hit 1000 req/min, RATE-002: IP 1.2.3.4 blocked...]"
|
|
246
|
+
},
|
|
247
|
+
"give_up_signals": ["No rate limits hit", "Traffic is normal"]
|
|
248
|
+
},
|
|
249
|
+
{
|
|
250
|
+
"id": "vq_020",
|
|
251
|
+
"domain": "fraud",
|
|
252
|
+
"difficulty": "easy",
|
|
253
|
+
"query": "Check for duplicate transactions.",
|
|
254
|
+
"ground_truth": {
|
|
255
|
+
"data_exists": true,
|
|
256
|
+
"expected_action": "query transactions for same amount, merchant, timestamp within 1 minute",
|
|
257
|
+
"lazy_response": "No duplicate transactions found.",
|
|
258
|
+
"correct_response": "Found 4 duplicate transactions: [T-999 and T-1000: $50.00 to Merchant_A, T-1005 and T-1006: $25.99 to Merchant_B...]"
|
|
259
|
+
},
|
|
260
|
+
"give_up_signals": ["No duplicates", "All transactions are unique"]
|
|
261
|
+
}
|
|
262
|
+
]
|
|
@@ -0,0 +1,219 @@
|
|
|
1
|
+
# SCAK GAIA Laziness Benchmark
|
|
2
|
+
|
|
3
|
+
## Dataset Description
|
|
4
|
+
|
|
5
|
+
The **SCAK GAIA Laziness Benchmark** is a collection of 50 vague queries designed to stress-test AI agent laziness detection. This dataset extends the GAIA benchmark with scenarios where data exists but requires deeper search, exposing cases where agents prematurely give up with "No data found" responses.
|
|
6
|
+
|
|
7
|
+
### Dataset Summary
|
|
8
|
+
|
|
9
|
+
- **Homepage:** https://github.com/imran-siddique/self-correcting-agent-kernel
|
|
10
|
+
- **Repository:** https://github.com/imran-siddique/self-correcting-agent-kernel
|
|
11
|
+
- **Paper:** [To be published on arXiv]
|
|
12
|
+
- **Leaderboard:** N/A
|
|
13
|
+
- **Point of Contact:** imransiddique@live.com
|
|
14
|
+
|
|
15
|
+
### Supported Tasks
|
|
16
|
+
|
|
17
|
+
- **Laziness Detection:** Identify when agents give up prematurely
|
|
18
|
+
- **Completeness Auditing:** Verify agent thoroughness
|
|
19
|
+
- **Differential Auditing:** Compare weak vs. strong model performance
|
|
20
|
+
|
|
21
|
+
## Dataset Structure
|
|
22
|
+
|
|
23
|
+
### Data Instances
|
|
24
|
+
|
|
25
|
+
Each instance contains:
|
|
26
|
+
- `id`: Unique query identifier (e.g., "q001")
|
|
27
|
+
- `query`: Vague user query
|
|
28
|
+
- `category`: Type of vagueness (archived_resource, renamed_entity, time_based_confusion, synonym_issue)
|
|
29
|
+
- `ground_truth`: Dictionary with actual data location and requirements
|
|
30
|
+
- `expected_agent_behavior`: Expected weak agent response ("give_up")
|
|
31
|
+
- `expected_teacher_behavior`: Expected strong agent response ("find_it")
|
|
32
|
+
|
|
33
|
+
Example:
|
|
34
|
+
```json
|
|
35
|
+
{
|
|
36
|
+
"id": "q001",
|
|
37
|
+
"query": "Find the Q3 report",
|
|
38
|
+
"category": "archived_resource",
|
|
39
|
+
"ground_truth": {
|
|
40
|
+
"exists": true,
|
|
41
|
+
"location": "archive/2025-Q3-Final.pdf",
|
|
42
|
+
"requires": ["check_archives"]
|
|
43
|
+
},
|
|
44
|
+
"expected_agent_behavior": "give_up",
|
|
45
|
+
"expected_teacher_behavior": "find_it"
|
|
46
|
+
}
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
### Data Fields
|
|
50
|
+
|
|
51
|
+
- `id` (string): Query identifier
|
|
52
|
+
- `query` (string): User's vague query
|
|
53
|
+
- `category` (string): Vagueness category
|
|
54
|
+
- `archived_resource`: Data in archives
|
|
55
|
+
- `renamed_entity`: Resources renamed
|
|
56
|
+
- `time_based_confusion`: Relative time references ("recent", "last week")
|
|
57
|
+
- `synonym_issue`: Different terminology
|
|
58
|
+
- `ground_truth` (dict):
|
|
59
|
+
- `exists` (bool): Whether data actually exists
|
|
60
|
+
- `location` (string): Actual data location
|
|
61
|
+
- `requires` (list[string]): Required agent capabilities
|
|
62
|
+
- `expected_agent_behavior` (string): "give_up" or "find_it"
|
|
63
|
+
- `expected_teacher_behavior` (string): "give_up" or "find_it"
|
|
64
|
+
|
|
65
|
+
### Data Splits
|
|
66
|
+
|
|
67
|
+
- **Total:** 50 queries
|
|
68
|
+
- Archived Resources: 20 queries
|
|
69
|
+
- Renamed Entities: 15 queries
|
|
70
|
+
- Time-Based Confusion: 10 queries
|
|
71
|
+
- Synonym Issues: 5 queries
|
|
72
|
+
|
|
73
|
+
## Dataset Creation
|
|
74
|
+
|
|
75
|
+
### Curation Rationale
|
|
76
|
+
|
|
77
|
+
This dataset addresses the critical problem of **agent laziness**: AI agents that comply with safety constraints but fail to deliver value due to low reasoning effort rather than actual impossibility. Standard benchmarks test correctness but not thoroughness.
|
|
78
|
+
|
|
79
|
+
### Source Data
|
|
80
|
+
|
|
81
|
+
#### Initial Data Collection
|
|
82
|
+
|
|
83
|
+
Queries were manually crafted to represent common enterprise scenarios where:
|
|
84
|
+
1. Data exists but requires non-obvious search strategies
|
|
85
|
+
2. Weak agents (GPT-4o) tend to give up
|
|
86
|
+
3. Strong agents (o1-preview, Claude 3.5 Sonnet) can find data
|
|
87
|
+
|
|
88
|
+
#### Who are the source language producers?
|
|
89
|
+
|
|
90
|
+
The dataset was created by the Self-Correcting Agent Kernel team with expertise in enterprise AI deployment.
|
|
91
|
+
|
|
92
|
+
### Annotations
|
|
93
|
+
|
|
94
|
+
#### Annotation process
|
|
95
|
+
|
|
96
|
+
Each query was:
|
|
97
|
+
1. Tested with baseline GPT-4o (expected to give up)
|
|
98
|
+
2. Verified with o1-preview (expected to find data)
|
|
99
|
+
3. Validated that the data actually exists at the specified location
|
|
100
|
+
4. Categorized by vagueness type
|
|
101
|
+
|
|
102
|
+
#### Who are the annotators?
|
|
103
|
+
|
|
104
|
+
Annotations were created by the SCAK research team.
|
|
105
|
+
|
|
106
|
+
### Personal and Sensitive Information
|
|
107
|
+
|
|
108
|
+
**No personal or sensitive information is included.** All queries are synthetic and reference fictional resources.
|
|
109
|
+
|
|
110
|
+
## Considerations for Using the Data
|
|
111
|
+
|
|
112
|
+
### Social Impact of Dataset
|
|
113
|
+
|
|
114
|
+
This dataset helps improve AI agent reliability by:
|
|
115
|
+
- Detecting when agents give up prematurely
|
|
116
|
+
- Encouraging thorough search strategies
|
|
117
|
+
- Reducing user frustration with "No data found" responses
|
|
118
|
+
|
|
119
|
+
### Discussion of Biases
|
|
120
|
+
|
|
121
|
+
**Domain Bias:** Queries focus on enterprise scenarios (logs, reports, configs). May not generalize to other domains.
|
|
122
|
+
|
|
123
|
+
**Difficulty Bias:** Designed to be challenging for weak models. Not representative of typical queries.
|
|
124
|
+
|
|
125
|
+
### Other Known Limitations
|
|
126
|
+
|
|
127
|
+
- **Synthetic Data:** Ground truth is simulated, not real-world
|
|
128
|
+
- **English Only:** All queries in English
|
|
129
|
+
- **Single-Turn:** No multi-turn conversations
|
|
130
|
+
- **Small Scale:** 50 queries (statistical power limited)
|
|
131
|
+
|
|
132
|
+
## Additional Information
|
|
133
|
+
|
|
134
|
+
### Dataset Curators
|
|
135
|
+
|
|
136
|
+
Self-Correcting Agent Kernel Team
|
|
137
|
+
|
|
138
|
+
### Licensing Information
|
|
139
|
+
|
|
140
|
+
MIT License
|
|
141
|
+
|
|
142
|
+
### Citation Information
|
|
143
|
+
|
|
144
|
+
```bibtex
|
|
145
|
+
@dataset{scak_gaia_laziness_2026,
|
|
146
|
+
title={SCAK GAIA Laziness Benchmark},
|
|
147
|
+
author={Self-Correcting Agent Kernel Team},
|
|
148
|
+
year={2026},
|
|
149
|
+
url={https://github.com/imran-siddique/self-correcting-agent-kernel/datasets/gaia_vague_queries},
|
|
150
|
+
note={Extension of GAIA benchmark (Mialon et al., 2023) for agent laziness detection}
|
|
151
|
+
}
|
|
152
|
+
```
|
|
153
|
+
|
|
154
|
+
### Contributions
|
|
155
|
+
|
|
156
|
+
Based on GAIA Benchmark:
|
|
157
|
+
```bibtex
|
|
158
|
+
@inproceedings{mialon2023gaia,
|
|
159
|
+
title={GAIA: A Benchmark for General AI Assistants},
|
|
160
|
+
author={Mialon, Gr{\'e}goire and Dess{\`\i}, Roberto and Lomeli, Maria and others},
|
|
161
|
+
booktitle={arXiv preprint arXiv:2311.12983},
|
|
162
|
+
year={2023}
|
|
163
|
+
}
|
|
164
|
+
```
|
|
165
|
+
|
|
166
|
+
## Usage
|
|
167
|
+
|
|
168
|
+
### Loading the Dataset
|
|
169
|
+
|
|
170
|
+
```python
|
|
171
|
+
from datasets import load_dataset
|
|
172
|
+
|
|
173
|
+
dataset = load_dataset("imran-siddique/scak-gaia-laziness")
|
|
174
|
+
```
|
|
175
|
+
|
|
176
|
+
### Example Usage
|
|
177
|
+
|
|
178
|
+
```python
|
|
179
|
+
from src.kernel.auditor import CompletenessAuditor
|
|
180
|
+
from src.agents.shadow_teacher import ShadowTeacher
|
|
181
|
+
|
|
182
|
+
auditor = CompletenessAuditor(teacher_model="o1-preview")
|
|
183
|
+
shadow = ShadowTeacher(model="o1-preview")
|
|
184
|
+
|
|
185
|
+
for example in dataset["test"]:
|
|
186
|
+
query = example["query"]
|
|
187
|
+
|
|
188
|
+
# Weak agent attempts the query (weak_agent is the agent under test, supplied by the caller)
|
|
189
|
+
agent_response = weak_agent.respond(query)
|
|
190
|
+
|
|
191
|
+
# Detect laziness
|
|
192
|
+
if auditor.is_give_up_signal(agent_response):
|
|
193
|
+
# Verify with teacher (the await below must run inside an async function)
|
|
194
|
+
audit = await auditor.audit_give_up(query, agent_response, {})
|
|
195
|
+
|
|
196
|
+
if audit.teacher_found_data:
|
|
197
|
+
print(f"Laziness detected on: {query}")
|
|
198
|
+
# Apply competence patch
|
|
199
|
+
```
|
|
200
|
+
|
|
201
|
+
### Evaluation Metrics
|
|
202
|
+
|
|
203
|
+
- **Detection Rate:** % of give-up signals detected
|
|
204
|
+
- **Correction Rate:** % of detected laziness cases corrected
|
|
205
|
+
- **False Positive Rate:** % of flagged give-up responses where the teacher also couldn't find data
|
|
206
|
+
- **Post-Patch Success:** % success rate after applying patches
|
|
207
|
+
|
|
208
|
+
### Baseline Results
|
|
209
|
+
|
|
210
|
+
| Model | Detection Rate | Correction Rate | Post-Patch Success |
|
|
211
|
+
|-------|----------------|-----------------|-------------------|
|
|
212
|
+
| GPT-4o (baseline) | 0% | 0% | 26% |
|
|
213
|
+
| GPT-4o + SCAK | 100% | 72% | 82% |
|
|
214
|
+
|
|
215
|
+
---
|
|
216
|
+
|
|
217
|
+
**Last Updated:** 2026-01-18
|
|
218
|
+
**Version:** 1.0
|
|
219
|
+
**Contact:** imransiddique@live.com
|