agent-os-kernel 1.1.0__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agent_os/__init__.py +66 -4
- agent_os/agents_compat.py +286 -0
- agent_os/base_agent.py +308 -0
- agent_os/cli.py +1079 -19
- agent_os/integrations/__init__.py +37 -2
- agent_os/integrations/openai_adapter.py +502 -0
- agent_os/integrations/semantic_kernel_adapter.py +569 -0
- agent_os/stateless.py +349 -0
- agent_os_kernel-1.2.0.dist-info/METADATA +676 -0
- agent_os_kernel-1.2.0.dist-info/RECORD +1053 -0
- {agent_os_kernel-1.1.0.dist-info → agent_os_kernel-1.2.0.dist-info}/entry_points.txt +0 -1
- modules/amb/.github/workflows/ci.yml +102 -0
- modules/amb/.github/workflows/publish.yml +146 -0
- modules/amb/.gitignore +134 -0
- modules/amb/CHANGELOG.md +118 -0
- modules/amb/CONTRIBUTING.md +141 -0
- modules/amb/LICENSE +21 -0
- modules/amb/README.md +188 -0
- modules/amb/amb_core/__init__.py +175 -0
- modules/amb/amb_core/adapters/__init__.py +55 -0
- modules/amb/amb_core/adapters/aws_sqs_broker.py +374 -0
- modules/amb/amb_core/adapters/azure_servicebus_broker.py +338 -0
- modules/amb/amb_core/adapters/kafka_broker.py +258 -0
- modules/amb/amb_core/adapters/nats_broker.py +283 -0
- modules/amb/amb_core/adapters/rabbitmq_broker.py +233 -0
- modules/amb/amb_core/adapters/redis_broker.py +260 -0
- modules/amb/amb_core/broker.py +143 -0
- modules/amb/amb_core/bus.py +479 -0
- modules/amb/amb_core/cloudevents.py +507 -0
- modules/amb/amb_core/dlq.py +343 -0
- modules/amb/amb_core/hf_utils.py +534 -0
- modules/amb/amb_core/memory_broker.py +408 -0
- modules/amb/amb_core/models.py +139 -0
- modules/amb/amb_core/persistence.py +527 -0
- modules/amb/amb_core/schema.py +292 -0
- modules/amb/amb_core/tracing.py +356 -0
- modules/amb/examples/advanced_features.py +223 -0
- modules/amb/examples/backpressure_demo.py +225 -0
- modules/amb/examples/basic_usage.py +117 -0
- modules/amb/examples/tracing_demo.py +104 -0
- modules/amb/experiments/README.md +52 -0
- modules/amb/experiments/reproduce_results.py +467 -0
- modules/amb/experiments/results.json +324 -0
- modules/amb/paper/README.md +40 -0
- modules/amb/paper/paper.tex +365 -0
- modules/amb/paper/whitepaper.md +377 -0
- modules/amb/pyproject.toml +117 -0
- modules/amb/tests/__init__.py +1 -0
- modules/amb/tests/test_backpressure_priority.py +280 -0
- modules/amb/tests/test_bus.py +198 -0
- modules/amb/tests/test_cloudevents.py +443 -0
- modules/amb/tests/test_features.py +531 -0
- modules/amb/tests/test_models.py +74 -0
- modules/amb/tests/test_tracing.py +254 -0
- modules/atr/.github/workflows/ci.yml +101 -0
- modules/atr/.github/workflows/publish.yml +140 -0
- modules/atr/.gitignore +134 -0
- modules/atr/.pre-commit-config.yaml +37 -0
- modules/atr/CHANGELOG.md +39 -0
- modules/atr/CONTRIBUTING.md +96 -0
- modules/atr/IMPLEMENTATION_SUMMARY.md +143 -0
- modules/atr/README.md +180 -0
- modules/atr/atr/__init__.py +638 -0
- modules/atr/atr/access.py +346 -0
- modules/atr/atr/composition.py +643 -0
- modules/atr/atr/decorator.py +355 -0
- modules/atr/atr/executor.py +382 -0
- modules/atr/atr/health.py +555 -0
- modules/atr/atr/hf_utils.py +447 -0
- modules/atr/atr/injection.py +420 -0
- modules/atr/atr/metrics.py +438 -0
- modules/atr/atr/policies.py +401 -0
- modules/atr/atr/py.typed +2 -0
- modules/atr/atr/registry.py +450 -0
- modules/atr/atr/schema.py +478 -0
- modules/atr/atr/tools/safe/__init__.py +73 -0
- modules/atr/atr/tools/safe/calculator.py +380 -0
- modules/atr/atr/tools/safe/datetime_tool.py +441 -0
- modules/atr/atr/tools/safe/file_reader.py +400 -0
- modules/atr/atr/tools/safe/http_client.py +314 -0
- modules/atr/atr/tools/safe/json_parser.py +372 -0
- modules/atr/atr/tools/safe/text_tool.py +526 -0
- modules/atr/atr/tools/safe/toolkit.py +173 -0
- modules/atr/docs/PYPI_SETUP.md +113 -0
- modules/atr/examples/README.md +27 -0
- modules/atr/examples/demo.py +144 -0
- modules/atr/examples/sandbox_demo.py +218 -0
- modules/atr/experiments/README.md +69 -0
- modules/atr/experiments/reproduce_results.py +509 -0
- modules/atr/experiments/results/.gitkeep +0 -0
- modules/atr/experiments/results/results_20260123_140334.json +71 -0
- modules/atr/paper/README.md +36 -0
- modules/atr/paper/figures/.gitkeep +0 -0
- modules/atr/paper/references.bib +84 -0
- modules/atr/paper/structure.tex +293 -0
- modules/atr/paper/whitepaper.md +234 -0
- modules/atr/pyproject.toml +148 -0
- modules/atr/requirements.txt +1 -0
- modules/atr/setup.py +30 -0
- modules/atr/tests/__init__.py +1 -0
- modules/atr/tests/test_decorator.py +317 -0
- modules/atr/tests/test_executor.py +245 -0
- modules/atr/tests/test_integration_executor.py +184 -0
- modules/atr/tests/test_registry.py +312 -0
- modules/atr/tests/test_schema.py +182 -0
- modules/atr/tests/test_v2_features.py +708 -0
- modules/caas/.dockerignore +63 -0
- modules/caas/.github/ISSUE_TEMPLATE/bug_report.md +38 -0
- modules/caas/.github/ISSUE_TEMPLATE/custom.md +10 -0
- modules/caas/.github/ISSUE_TEMPLATE/feature_request.md +20 -0
- modules/caas/.github/workflows/ci.yml +100 -0
- modules/caas/.github/workflows/lint.yml +39 -0
- modules/caas/.github/workflows/publish-pypi.yml +124 -0
- modules/caas/.gitignore +73 -0
- modules/caas/.pre-commit-config.yaml +33 -0
- modules/caas/CHANGELOG.md +58 -0
- modules/caas/CONTRIBUTING.md +346 -0
- modules/caas/Dockerfile +41 -0
- modules/caas/LICENSE +21 -0
- modules/caas/MANIFEST.in +11 -0
- modules/caas/README.md +158 -0
- modules/caas/benchmarks/README.md +255 -0
- modules/caas/benchmarks/create_hf_dataset.py +502 -0
- modules/caas/benchmarks/data/sample_corpus/README.md +86 -0
- modules/caas/benchmarks/data/sample_corpus/auth_module.py +211 -0
- modules/caas/benchmarks/data/sample_corpus/contribution_guide.md +185 -0
- modules/caas/benchmarks/data/sample_corpus/remote_work_policy.html +57 -0
- modules/caas/benchmarks/hf_dataset/README.md +214 -0
- modules/caas/benchmarks/hf_dataset/caas_benchmark_corpus.py +73 -0
- modules/caas/benchmarks/hf_dataset/corpus_preview.json +193 -0
- modules/caas/benchmarks/results/README.md +66 -0
- modules/caas/benchmarks/results/evaluation_2026-01-20.json +121 -0
- modules/caas/benchmarks/run_evaluation.py +561 -0
- modules/caas/benchmarks/statistical_tests.py +289 -0
- modules/caas/benchmarks/verify_sample_corpus.py +83 -0
- modules/caas/docker-compose.yml +38 -0
- modules/caas/docs/CONTEXT_TRIAD.md +462 -0
- modules/caas/docs/CONTRIBUTING.md +346 -0
- modules/caas/docs/ETHICS_AND_LIMITATIONS.md +336 -0
- modules/caas/docs/HEURISTIC_ROUTER.md +442 -0
- modules/caas/docs/IMPLEMENTATION_SUMMARY.md +363 -0
- modules/caas/docs/IMPLEMENTATION_SUMMARY_CONTEXT_TRIAD.md +277 -0
- modules/caas/docs/IMPLEMENTATION_SUMMARY_HEURISTIC_ROUTER.md +231 -0
- modules/caas/docs/IMPLEMENTATION_SUMMARY_METADATA_INJECTION.md +258 -0
- modules/caas/docs/IMPLEMENTATION_SUMMARY_PRAGMATIC_TRUTH.md +212 -0
- modules/caas/docs/IMPLEMENTATION_SUMMARY_TRUST_GATEWAY.md +319 -0
- modules/caas/docs/LAYER_1_PRIMITIVE.md +202 -0
- modules/caas/docs/METADATA_INJECTION.md +404 -0
- modules/caas/docs/PRAGMATIC_TRUTH.md +431 -0
- modules/caas/docs/RELATED_WORK.md +312 -0
- modules/caas/docs/RELEASE_CHECKLIST.md +219 -0
- modules/caas/docs/RELEASE_GUIDE.md +285 -0
- modules/caas/docs/REPRODUCIBILITY.md +386 -0
- modules/caas/docs/SLIDING_WINDOW.md +387 -0
- modules/caas/docs/STRUCTURE_AWARE_INDEXING.md +158 -0
- modules/caas/docs/TESTING.md +259 -0
- modules/caas/docs/THREAT_MODEL.md +247 -0
- modules/caas/docs/TRUST_GATEWAY.md +575 -0
- modules/caas/docs/VFS.md +298 -0
- modules/caas/examples/agents/enterprise_security_agent.py +414 -0
- modules/caas/examples/agents/intelligent_document_analyzer.py +380 -0
- modules/caas/examples/demos/demo.py +309 -0
- modules/caas/examples/demos/demo_context_triad.py +225 -0
- modules/caas/examples/demos/demo_conversation_manager.py +285 -0
- modules/caas/examples/demos/demo_heuristic_router.py +133 -0
- modules/caas/examples/demos/demo_metadata_injection.py +198 -0
- modules/caas/examples/demos/demo_pragmatic_truth.py +303 -0
- modules/caas/examples/demos/demo_structure_aware.py +140 -0
- modules/caas/examples/demos/demo_time_decay.py +247 -0
- modules/caas/examples/demos/demo_trust_gateway.py +383 -0
- modules/caas/examples/multi_agent/README.md +159 -0
- modules/caas/examples/multi_agent/research_team.py +369 -0
- modules/caas/examples/multi_agent/vfs_collaboration.py +393 -0
- modules/caas/examples/usage/auth_module.py +142 -0
- modules/caas/examples/usage/usage_example.py +173 -0
- modules/caas/experiments/README.md +42 -0
- modules/caas/experiments/reproduce_results.py +462 -0
- modules/caas/paper/ARXIV_METADATA.md +145 -0
- modules/caas/paper/ARXIV_README.md +47 -0
- modules/caas/paper/CHECKLIST.md +103 -0
- modules/caas/paper/GITHUB_RELEASE_NOTES.md +105 -0
- modules/caas/paper/README.md +71 -0
- modules/caas/paper/abstract.md +24 -0
- modules/caas/paper/arxiv_submission.tar +0 -0
- modules/caas/paper/arxiv_submission.zip +0 -0
- modules/caas/paper/build_pdf.py +355 -0
- modules/caas/paper/experiments.md +149 -0
- modules/caas/paper/figures/.gitkeep +0 -0
- modules/caas/paper/figures/README.md +237 -0
- modules/caas/paper/figures/fig1_system_architecture.png +0 -0
- modules/caas/paper/figures/fig1_system_architecture.svg +198 -0
- modules/caas/paper/figures/fig2_context_triad.png +0 -0
- modules/caas/paper/figures/fig2_context_triad.svg +105 -0
- modules/caas/paper/figures/fig3_ablation_results.png +0 -0
- modules/caas/paper/figures/fig3_ablation_results.svg +113 -0
- modules/caas/paper/figures/fig4_routing_latency.png +0 -0
- modules/caas/paper/figures/fig4_routing_latency.svg +97 -0
- modules/caas/paper/intro.md +103 -0
- modules/caas/paper/latex/figures/fig1_system_architecture.png +0 -0
- modules/caas/paper/latex/figures/fig2_context_triad.png +0 -0
- modules/caas/paper/latex/figures/fig3_ablation_results.png +0 -0
- modules/caas/paper/latex/figures/fig4_routing_latency.png +0 -0
- modules/caas/paper/latex/main.tex +468 -0
- modules/caas/paper/latex/references.bib +140 -0
- modules/caas/paper/method.md +350 -0
- modules/caas/paper/outline.md +123 -0
- modules/caas/paper/related_work.md +101 -0
- modules/caas/paper/tables/.gitkeep +0 -0
- modules/caas/paper/tables/results_tables.md +50 -0
- modules/caas/pyproject.toml +172 -0
- modules/caas/requirements.txt +11 -0
- modules/caas/src/caas/__init__.py +232 -0
- modules/caas/src/caas/api/__init__.py +7 -0
- modules/caas/src/caas/api/server.py +1326 -0
- modules/caas/src/caas/caching.py +832 -0
- modules/caas/src/caas/cli.py +208 -0
- modules/caas/src/caas/conversation.py +221 -0
- modules/caas/src/caas/decay.py +118 -0
- modules/caas/src/caas/detection/__init__.py +7 -0
- modules/caas/src/caas/detection/detector.py +236 -0
- modules/caas/src/caas/enrichment.py +127 -0
- modules/caas/src/caas/gateway/__init__.py +24 -0
- modules/caas/src/caas/gateway/trust_gateway.py +471 -0
- modules/caas/src/caas/hf_utils.py +477 -0
- modules/caas/src/caas/ingestion/__init__.py +21 -0
- modules/caas/src/caas/ingestion/processors.py +251 -0
- modules/caas/src/caas/ingestion/structure_parser.py +185 -0
- modules/caas/src/caas/models.py +354 -0
- modules/caas/src/caas/pragmatic_truth.py +441 -0
- modules/caas/src/caas/routing/__init__.py +8 -0
- modules/caas/src/caas/routing/heuristic_router.py +242 -0
- modules/caas/src/caas/storage/__init__.py +7 -0
- modules/caas/src/caas/storage/store.py +450 -0
- modules/caas/src/caas/triad.py +472 -0
- modules/caas/src/caas/tuning/__init__.py +7 -0
- modules/caas/src/caas/tuning/tuner.py +322 -0
- modules/caas/src/caas/vfs/__init__.py +12 -0
- modules/caas/src/caas/vfs/filesystem.py +450 -0
- modules/caas/tests/__init__.py +3 -0
- modules/caas/tests/conftest.py +8 -0
- modules/caas/tests/test_caching.py +628 -0
- modules/caas/tests/test_context_triad.py +385 -0
- modules/caas/tests/test_conversation_manager.py +289 -0
- modules/caas/tests/test_functionality.py +215 -0
- modules/caas/tests/test_heuristic_router.py +370 -0
- modules/caas/tests/test_metadata_injection.py +328 -0
- modules/caas/tests/test_pragmatic_truth.py +322 -0
- modules/caas/tests/test_structure_aware_indexing.py +283 -0
- modules/caas/tests/test_time_decay.py +268 -0
- modules/caas/tests/test_trust_gateway.py +445 -0
- modules/caas/tests/test_vfs.py +298 -0
- modules/cmvk/.github/FUNDING.yml +9 -0
- modules/cmvk/.github/dependabot.yml +54 -0
- modules/cmvk/.github/workflows/ci.yml +205 -0
- modules/cmvk/.github/workflows/publish.yml +143 -0
- modules/cmvk/.gitignore +147 -0
- modules/cmvk/.pre-commit-config.yaml +58 -0
- modules/cmvk/CHANGELOG.md +146 -0
- modules/cmvk/CITATION.cff +48 -0
- modules/cmvk/CONTRIBUTING.md +229 -0
- modules/cmvk/Dockerfile +87 -0
- modules/cmvk/HF_MODEL_CARD.md +185 -0
- modules/cmvk/LICENSE +21 -0
- modules/cmvk/README.md +149 -0
- modules/cmvk/SECURITY.md +114 -0
- modules/cmvk/config/prompts/generator_v1.txt +23 -0
- modules/cmvk/config/prompts/verifier_hostile.txt +32 -0
- modules/cmvk/config/settings.yaml +40 -0
- modules/cmvk/coverage_html/.gitignore +2 -0
- modules/cmvk/coverage_html/class_index.html +658 -0
- modules/cmvk/coverage_html/coverage_html_cb_188fc9a4.js +735 -0
- modules/cmvk/coverage_html/favicon_32_cb_c827f16f.png +0 -0
- modules/cmvk/coverage_html/function_index.html +1978 -0
- modules/cmvk/coverage_html/index.html +255 -0
- modules/cmvk/coverage_html/keybd_closed_cb_900cfef5.png +0 -0
- modules/cmvk/coverage_html/status.json +1 -0
- modules/cmvk/coverage_html/style_cb_5c747636.css +389 -0
- modules/cmvk/coverage_html/z_2c49bd2ed3e01e38___init___py.html +315 -0
- modules/cmvk/coverage_html/z_2c49bd2ed3e01e38_audit_py.html +499 -0
- modules/cmvk/coverage_html/z_2c49bd2ed3e01e38_benchmarks_py.html +575 -0
- modules/cmvk/coverage_html/z_2c49bd2ed3e01e38_constitutional_py.html +1001 -0
- modules/cmvk/coverage_html/z_2c49bd2ed3e01e38_hf_utils_py.html +398 -0
- modules/cmvk/coverage_html/z_2c49bd2ed3e01e38_metrics_py.html +570 -0
- modules/cmvk/coverage_html/z_2c49bd2ed3e01e38_profiles_py.html +397 -0
- modules/cmvk/coverage_html/z_2c49bd2ed3e01e38_types_py.html +109 -0
- modules/cmvk/coverage_html/z_2c49bd2ed3e01e38_verification_py.html +1053 -0
- modules/cmvk/docs/DIAGRAMS.md +325 -0
- modules/cmvk/docs/architecture.md +345 -0
- modules/cmvk/docs/features.md +308 -0
- modules/cmvk/docs/getting_started.md +279 -0
- modules/cmvk/docs/innovation_layer.md +377 -0
- modules/cmvk/docs/safety.md +281 -0
- modules/cmvk/docs/traceability.md +150 -0
- modules/cmvk/examples/basic_example.py +62 -0
- modules/cmvk/examples/demo_complete_pipeline.py +209 -0
- modules/cmvk/examples/demo_innovation_layer.py +197 -0
- modules/cmvk/examples/example.py +112 -0
- modules/cmvk/examples/model_diversity_comparison.py +110 -0
- modules/cmvk/examples/real_api_integration.py +121 -0
- modules/cmvk/examples/test_full_pipeline.py +303 -0
- modules/cmvk/experiments/FEATURE_2_LATERAL_THINKING.md +187 -0
- modules/cmvk/experiments/README.md +216 -0
- modules/cmvk/experiments/ablation_runner.py +666 -0
- modules/cmvk/experiments/baseline_runner.py +158 -0
- modules/cmvk/experiments/blind_spot_benchmark.py +364 -0
- modules/cmvk/experiments/datasets/README.md +85 -0
- modules/cmvk/experiments/datasets/humaneval_50.json +352 -0
- modules/cmvk/experiments/datasets/humaneval_full.json +1150 -0
- modules/cmvk/experiments/datasets/humaneval_sample.json +32 -0
- modules/cmvk/experiments/datasets/sabotage.json +262 -0
- modules/cmvk/experiments/datasets/sample.json +40 -0
- modules/cmvk/experiments/demo_with_traces.py +110 -0
- modules/cmvk/experiments/efficiency_curve.py +259 -0
- modules/cmvk/experiments/experiment_runner.py +243 -0
- modules/cmvk/experiments/paper_data_generator.py +183 -0
- modules/cmvk/experiments/reproduce_results.py +407 -0
- modules/cmvk/experiments/reproducible_runner.py +352 -0
- modules/cmvk/experiments/sabotage_stress_test.py +311 -0
- modules/cmvk/experiments/test_lateral_thinking.py +116 -0
- modules/cmvk/experiments/test_prosecutor.py +41 -0
- modules/cmvk/experiments/visualize_results.py +735 -0
- modules/cmvk/logs/traces/demo_HumanEval_0_20260121-204900.json +36 -0
- modules/cmvk/notebooks/analysis.ipynb +124 -0
- modules/cmvk/paper/PAPER.md +561 -0
- modules/cmvk/paper/arxiv_checklist.md +230 -0
- modules/cmvk/paper/cmvk_neurips.aux +77 -0
- modules/cmvk/paper/cmvk_neurips.bbl +81 -0
- modules/cmvk/paper/cmvk_neurips.blg +48 -0
- modules/cmvk/paper/cmvk_neurips.out +16 -0
- modules/cmvk/paper/cmvk_neurips.pdf +0 -0
- modules/cmvk/paper/cmvk_neurips.tex +309 -0
- modules/cmvk/paper/figures/ablation.png +0 -0
- modules/cmvk/paper/figures/ablation.svg +39 -0
- modules/cmvk/paper/figures/architecture.png +0 -0
- modules/cmvk/paper/figures/architecture.svg +115 -0
- modules/cmvk/paper/figures/results_bar.png +0 -0
- modules/cmvk/paper/figures/results_bar.svg +70 -0
- modules/cmvk/paper/generate_figures.py +383 -0
- modules/cmvk/paper/neurips_2024.sty +101 -0
- modules/cmvk/paper/references.bib +98 -0
- modules/cmvk/paper/structure.tex +200 -0
- modules/cmvk/pyproject.toml +189 -0
- modules/cmvk/requirements-dev.txt +19 -0
- modules/cmvk/requirements.txt +14 -0
- modules/cmvk/src/cmvk/__init__.py +216 -0
- modules/cmvk/src/cmvk/audit.py +400 -0
- modules/cmvk/src/cmvk/benchmarks.py +476 -0
- modules/cmvk/src/cmvk/constitutional.py +902 -0
- modules/cmvk/src/cmvk/hf_utils.py +299 -0
- modules/cmvk/src/cmvk/metrics.py +471 -0
- modules/cmvk/src/cmvk/profiles.py +298 -0
- modules/cmvk/src/cmvk/py.typed +0 -0
- modules/cmvk/src/cmvk/types.py +10 -0
- modules/cmvk/src/cmvk/verification.py +954 -0
- modules/cmvk/src/cross_model_verification_kernel/__init__.py +91 -0
- modules/cmvk/src/cross_model_verification_kernel/__main__.py +10 -0
- modules/cmvk/src/cross_model_verification_kernel/agents/__init__.py +16 -0
- modules/cmvk/src/cross_model_verification_kernel/agents/base_agent.py +142 -0
- modules/cmvk/src/cross_model_verification_kernel/agents/generator_openai.py +223 -0
- modules/cmvk/src/cross_model_verification_kernel/agents/verifier_anthropic.py +448 -0
- modules/cmvk/src/cross_model_verification_kernel/agents/verifier_gemini.py +481 -0
- modules/cmvk/src/cross_model_verification_kernel/cli.py +570 -0
- modules/cmvk/src/cross_model_verification_kernel/core/__init__.py +26 -0
- modules/cmvk/src/cross_model_verification_kernel/core/graph_memory.py +308 -0
- modules/cmvk/src/cross_model_verification_kernel/core/kernel.py +413 -0
- modules/cmvk/src/cross_model_verification_kernel/core/trace_logger.py +75 -0
- modules/cmvk/src/cross_model_verification_kernel/core/types.py +121 -0
- modules/cmvk/src/cross_model_verification_kernel/datasets/__init__.py +20 -0
- modules/cmvk/src/cross_model_verification_kernel/datasets/humaneval_loader.py +271 -0
- modules/cmvk/src/cross_model_verification_kernel/generator.py +118 -0
- modules/cmvk/src/cross_model_verification_kernel/kernel.py +292 -0
- modules/cmvk/src/cross_model_verification_kernel/models.py +111 -0
- modules/cmvk/src/cross_model_verification_kernel/py.typed +1 -0
- modules/cmvk/src/cross_model_verification_kernel/simple_kernel.py +185 -0
- modules/cmvk/src/cross_model_verification_kernel/tools/__init__.py +94 -0
- modules/cmvk/src/cross_model_verification_kernel/tools/huggingface_upload.py +394 -0
- modules/cmvk/src/cross_model_verification_kernel/tools/sandbox.py +159 -0
- modules/cmvk/src/cross_model_verification_kernel/tools/statistics.py +468 -0
- modules/cmvk/src/cross_model_verification_kernel/tools/visualizer.py +312 -0
- modules/cmvk/src/cross_model_verification_kernel/tools/web_search.py +86 -0
- modules/cmvk/src/cross_model_verification_kernel/verifier.py +257 -0
- modules/cmvk/tests/__init__.py +3 -0
- modules/cmvk/tests/conftest.py +61 -0
- modules/cmvk/tests/integration/__init__.py +1 -0
- modules/cmvk/tests/integration/test_anthropic_verifier.py +269 -0
- modules/cmvk/tests/integration/test_integration.py +53 -0
- modules/cmvk/tests/integration/test_lateral_thinking_integration.py +199 -0
- modules/cmvk/tests/integration/test_lateral_thinking_witness.py +208 -0
- modules/cmvk/tests/integration/test_prosecutor_mode.py +131 -0
- modules/cmvk/tests/test_constitutional.py +611 -0
- modules/cmvk/tests/test_enhanced_features.py +603 -0
- modules/cmvk/tests/test_verification.py +255 -0
- modules/cmvk/tests/unit/__init__.py +1 -0
- modules/cmvk/tests/unit/test_agents.py +64 -0
- modules/cmvk/tests/unit/test_cli.py +224 -0
- modules/cmvk/tests/unit/test_core.py +126 -0
- modules/cmvk/tests/unit/test_humaneval_loader.py +197 -0
- modules/cmvk/tests/unit/test_kernel.py +255 -0
- modules/cmvk/tests/unit/test_reproducibility.py +160 -0
- modules/cmvk/tests/unit/test_trace_logger.py +115 -0
- modules/cmvk/tests/unit/test_visualizer.py +218 -0
- modules/control-plane/.github/ISSUE_TEMPLATE/bug_report.yml +82 -0
- modules/control-plane/.github/ISSUE_TEMPLATE/config.yml +11 -0
- modules/control-plane/.github/ISSUE_TEMPLATE/feature_request.yml +104 -0
- modules/control-plane/.github/ISSUE_TEMPLATE/question.yml +70 -0
- modules/control-plane/.github/ISSUE_TEMPLATE/security_vulnerability.yml +84 -0
- modules/control-plane/.github/discussions.yml +73 -0
- modules/control-plane/.github/pull_request_template.md +82 -0
- modules/control-plane/.github/workflows/publish.yml +146 -0
- modules/control-plane/.github/workflows/release.yml +39 -0
- modules/control-plane/.github/workflows/tests.yml +58 -0
- modules/control-plane/.gitignore +55 -0
- modules/control-plane/CHANGELOG.md +203 -0
- modules/control-plane/CONTRIBUTING.md +311 -0
- modules/control-plane/CONTRIBUTORS.md +88 -0
- modules/control-plane/Dockerfile +82 -0
- modules/control-plane/LICENSE +21 -0
- modules/control-plane/MANIFEST.in +17 -0
- modules/control-plane/README.md +1264 -0
- modules/control-plane/ROADMAP.md +228 -0
- modules/control-plane/SECURITY.md +210 -0
- modules/control-plane/SUPPORT.md +106 -0
- modules/control-plane/acp-cli.py +212 -0
- modules/control-plane/benchmark/README.md +257 -0
- modules/control-plane/benchmark/__init__.py +19 -0
- modules/control-plane/benchmark/red_team_dataset.py +517 -0
- modules/control-plane/benchmark.py +563 -0
- modules/control-plane/build_and_publish.sh +130 -0
- modules/control-plane/docker-compose.yml +74 -0
- modules/control-plane/docs/ABLATION_STUDIES.md +528 -0
- modules/control-plane/docs/ADAPTER_GUIDE.md +544 -0
- modules/control-plane/docs/ADVANCED_FEATURES.md +543 -0
- modules/control-plane/docs/AIOS_COMPARISON.md +296 -0
- modules/control-plane/docs/BIBLIOGRAPHY.md +367 -0
- modules/control-plane/docs/CASE_STUDIES.md +645 -0
- modules/control-plane/docs/DOCKER_DEPLOYMENT.md +184 -0
- modules/control-plane/docs/ECOSYSTEM_STATUS.md +98 -0
- modules/control-plane/docs/HF_MODEL_CARD.md +168 -0
- modules/control-plane/docs/KERNEL_V1_RELEASE.md +454 -0
- modules/control-plane/docs/LAYER3_FRAMEWORK.md +227 -0
- modules/control-plane/docs/LIMITATIONS.md +523 -0
- modules/control-plane/docs/PYPI_PUBLISHING.md +195 -0
- modules/control-plane/docs/README.md +58 -0
- modules/control-plane/docs/RELATED_WORK.md +319 -0
- modules/control-plane/docs/RELEASE_v1.1.0.md +252 -0
- modules/control-plane/docs/REPRODUCIBILITY.md +540 -0
- modules/control-plane/docs/RESEARCH_FOUNDATION.md +197 -0
- modules/control-plane/docs/api/CORE.md +270 -0
- modules/control-plane/docs/architecture/architecture.md +120 -0
- modules/control-plane/docs/community/ANNOUNCEMENT_TEMPLATES.md +52 -0
- modules/control-plane/docs/guides/IMPLEMENTATION.md +225 -0
- modules/control-plane/docs/guides/PHILOSOPHY.md +354 -0
- modules/control-plane/docs/guides/QUICKSTART.md +217 -0
- modules/control-plane/examples/README.md +138 -0
- modules/control-plane/examples/a2a_demo.py +410 -0
- modules/control-plane/examples/adapter_demo.py +347 -0
- modules/control-plane/examples/advanced_features.py +403 -0
- modules/control-plane/examples/basic_usage.py +261 -0
- modules/control-plane/examples/benchmark_demo.py +186 -0
- modules/control-plane/examples/compliance_demo.py +333 -0
- modules/control-plane/examples/configuration.py +265 -0
- modules/control-plane/examples/getting_started.py +178 -0
- modules/control-plane/examples/hibernation_and_time_travel_demo.py +406 -0
- modules/control-plane/examples/interactive_tutorial.ipynb +497 -0
- modules/control-plane/examples/kernel_interceptor_demo.py +202 -0
- modules/control-plane/examples/kernel_v1_demo.py +273 -0
- modules/control-plane/examples/langchain_demo.py +281 -0
- modules/control-plane/examples/lifecycle_demo.py +724 -0
- modules/control-plane/examples/mcp_demo.py +378 -0
- modules/control-plane/examples/ml_safety_demo.py +157 -0
- modules/control-plane/examples/multimodal_demo.py +347 -0
- modules/control-plane/examples/observability_demo.py +370 -0
- modules/control-plane/examples/use_cases.py +336 -0
- modules/control-plane/experiments/long_horizon_purge.py +235 -0
- modules/control-plane/experiments/multi_agent_rag.py +165 -0
- modules/control-plane/experiments/reproduce_results.py +667 -0
- modules/control-plane/paper/ARXIV_SUBMISSION_INFO.txt +122 -0
- modules/control-plane/paper/ETHICS_STATEMENT.md +248 -0
- modules/control-plane/paper/PAPER_CHECKLIST.md +72 -0
- modules/control-plane/paper/Paper.pdf +0 -0
- modules/control-plane/paper/README.md +71 -0
- modules/control-plane/paper/appendix.md +152 -0
- modules/control-plane/paper/architecture.md +15 -0
- modules/control-plane/paper/arxiv/figures/ablation_chart.png +0 -0
- modules/control-plane/paper/arxiv/figures/architecture.png +0 -0
- modules/control-plane/paper/arxiv/figures/constraint_graphs.png +0 -0
- modules/control-plane/paper/arxiv/figures/results_chart.png +0 -0
- modules/control-plane/paper/arxiv/main.aux +97 -0
- modules/control-plane/paper/arxiv/main.bbl +112 -0
- modules/control-plane/paper/arxiv/main.blg +48 -0
- modules/control-plane/paper/arxiv/main.out +33 -0
- modules/control-plane/paper/arxiv/main.pdf +0 -0
- modules/control-plane/paper/arxiv/main.tex +479 -0
- modules/control-plane/paper/arxiv/references.bib +234 -0
- modules/control-plane/paper/arxiv_submission.tar +0 -0
- modules/control-plane/paper/arxiv_submission.zip +0 -0
- modules/control-plane/paper/build.sh +68 -0
- modules/control-plane/paper/figures/README.md +47 -0
- modules/control-plane/paper/figures/ablation_chart.pdf +0 -0
- modules/control-plane/paper/figures/ablation_chart.png +0 -0
- modules/control-plane/paper/figures/architecture.pdf +0 -0
- modules/control-plane/paper/figures/architecture.png +0 -0
- modules/control-plane/paper/figures/constraint_graphs.pdf +0 -0
- modules/control-plane/paper/figures/constraint_graphs.png +0 -0
- modules/control-plane/paper/figures/generate_figures.py +252 -0
- modules/control-plane/paper/figures/results_chart.pdf +0 -0
- modules/control-plane/paper/figures/results_chart.png +0 -0
- modules/control-plane/paper/main.md +273 -0
- modules/control-plane/paper/main.tex +214 -0
- modules/control-plane/paper/main_arxiv.aux +53 -0
- modules/control-plane/paper/main_arxiv.out +17 -0
- modules/control-plane/paper/main_arxiv.pdf +0 -0
- modules/control-plane/paper/main_arxiv.tex +264 -0
- modules/control-plane/paper/references.bib +234 -0
- modules/control-plane/pyproject.toml +124 -0
- modules/control-plane/reproducibility/ABLATIONS.md +136 -0
- modules/control-plane/reproducibility/README.md +288 -0
- modules/control-plane/reproducibility/commands.md +467 -0
- modules/control-plane/reproducibility/docker_config/Dockerfile +39 -0
- modules/control-plane/reproducibility/experiment_configs/purge_config.json +46 -0
- modules/control-plane/reproducibility/experiment_configs/rag_config.json +36 -0
- modules/control-plane/reproducibility/hardware_specs.md +317 -0
- modules/control-plane/reproducibility/requirements_frozen.txt +0 -0
- modules/control-plane/reproducibility/run_all_experiments.sh +45 -0
- modules/control-plane/reproducibility/seeds.json +106 -0
- modules/control-plane/scripts/prepare_pypi.py +46 -0
- modules/control-plane/scripts/prepare_release.py +176 -0
- modules/control-plane/scripts/upload_dataset_to_hf.py +316 -0
- modules/control-plane/setup.py +69 -0
- modules/control-plane/src/agent_control_plane/__init__.py +639 -0
- modules/control-plane/src/agent_control_plane/a2a_adapter.py +541 -0
- modules/control-plane/src/agent_control_plane/adapter.py +415 -0
- modules/control-plane/src/agent_control_plane/agent_hibernation.py +364 -0
- modules/control-plane/src/agent_control_plane/agent_kernel.py +464 -0
- modules/control-plane/src/agent_control_plane/compliance.py +718 -0
- modules/control-plane/src/agent_control_plane/constraint_graphs.py +475 -0
- modules/control-plane/src/agent_control_plane/control_plane.py +848 -0
- modules/control-plane/src/agent_control_plane/example_executors.py +193 -0
- modules/control-plane/src/agent_control_plane/execution_engine.py +229 -0
- modules/control-plane/src/agent_control_plane/flight_recorder.py +600 -0
- modules/control-plane/src/agent_control_plane/governance_layer.py +432 -0
- modules/control-plane/src/agent_control_plane/hf_utils.py +561 -0
- modules/control-plane/src/agent_control_plane/interfaces/__init__.py +53 -0
- modules/control-plane/src/agent_control_plane/interfaces/kernel_interface.py +359 -0
- modules/control-plane/src/agent_control_plane/interfaces/plugin_interface.py +495 -0
- modules/control-plane/src/agent_control_plane/interfaces/protocol_interfaces.py +385 -0
- modules/control-plane/src/agent_control_plane/kernel_space.py +707 -0
- modules/control-plane/src/agent_control_plane/langchain_adapter.py +422 -0
- modules/control-plane/src/agent_control_plane/lifecycle.py +3111 -0
- modules/control-plane/src/agent_control_plane/mcp_adapter.py +517 -0
- modules/control-plane/src/agent_control_plane/ml_safety.py +560 -0
- modules/control-plane/src/agent_control_plane/multimodal.py +724 -0
- modules/control-plane/src/agent_control_plane/mute_agent.py +419 -0
- modules/control-plane/src/agent_control_plane/observability.py +785 -0
- modules/control-plane/src/agent_control_plane/orchestrator.py +480 -0
- modules/control-plane/src/agent_control_plane/plugin_registry.py +748 -0
- modules/control-plane/src/agent_control_plane/policy_engine.py +525 -0
- modules/control-plane/src/agent_control_plane/shadow_mode.py +307 -0
- modules/control-plane/src/agent_control_plane/signals.py +491 -0
- modules/control-plane/src/agent_control_plane/supervisor_agents.py +427 -0
- modules/control-plane/src/agent_control_plane/time_travel_debugger.py +554 -0
- modules/control-plane/src/agent_control_plane/tool_registry.py +350 -0
- modules/control-plane/src/agent_control_plane/vfs.py +695 -0
- modules/control-plane/tests/README.md +33 -0
- modules/control-plane/tests/test_a2a_adapter.py +336 -0
- modules/control-plane/tests/test_adapter.py +422 -0
- modules/control-plane/tests/test_advanced_features.py +389 -0
- modules/control-plane/tests/test_benchmark.py +223 -0
- modules/control-plane/tests/test_compliance.py +214 -0
- modules/control-plane/tests/test_control_plane.py +295 -0
- modules/control-plane/tests/test_hibernation.py +274 -0
- modules/control-plane/tests/test_kernel_interception.py +284 -0
- modules/control-plane/tests/test_langchain_adapter.py +258 -0
- modules/control-plane/tests/test_lifecycle.py +1174 -0
- modules/control-plane/tests/test_mcp_adapter.py +293 -0
- modules/control-plane/tests/test_ml_safety.py +142 -0
- modules/control-plane/tests/test_multimodal.py +317 -0
- modules/control-plane/tests/test_new_features.py +435 -0
- modules/control-plane/tests/test_observability.py +338 -0
- modules/control-plane/tests/test_time_travel.py +387 -0
- modules/emk/.github/workflows/ci.yml +105 -0
- modules/emk/.github/workflows/publish.yml +144 -0
- modules/emk/.gitignore +74 -0
- modules/emk/CHANGELOG.md +41 -0
- modules/emk/CONTRIBUTING.md +295 -0
- modules/emk/IMPLEMENTATION.md +174 -0
- modules/emk/LICENSE +21 -0
- modules/emk/MANIFEST.in +8 -0
- modules/emk/README.md +135 -0
- modules/emk/RELEASE_NOTES.md +82 -0
- modules/emk/SECURITY.md +52 -0
- modules/emk/codecov.yml +39 -0
- modules/emk/docs/MEMORY_MANAGEMENT.md +285 -0
- modules/emk/emk/__init__.py +106 -0
- modules/emk/emk/hf_utils.py +419 -0
- modules/emk/emk/indexer.py +144 -0
- modules/emk/emk/py.typed +0 -0
- modules/emk/emk/schema.py +204 -0
- modules/emk/emk/sleep_cycle.py +345 -0
- modules/emk/emk/store.py +479 -0
- modules/emk/examples/basic_usage.py +123 -0
- modules/emk/examples/memory_features_demo.py +154 -0
- modules/emk/experiments/README.md +59 -0
- modules/emk/experiments/reproduce_results.py +461 -0
- modules/emk/experiments/results.json +61 -0
- modules/emk/paper/structure.tex +192 -0
- modules/emk/paper/whitepaper.md +273 -0
- modules/emk/pyproject.toml +91 -0
- modules/emk/setup.py +5 -0
- modules/emk/tests/test_file_adapter.py +195 -0
- modules/emk/tests/test_indexer.py +174 -0
- modules/emk/tests/test_init.py +55 -0
- modules/emk/tests/test_negative_memory.py +83 -0
- modules/emk/tests/test_schema.py +150 -0
- modules/emk/tests/test_semantic_rules.py +175 -0
- modules/emk/tests/test_sleep_cycle.py +335 -0
- modules/emk/tests/test_store_anti_patterns.py +239 -0
- modules/iatp/.github/workflows/docker-build.yml +124 -0
- modules/iatp/.github/workflows/publish.yml +174 -0
- modules/iatp/.github/workflows/python-package.yml +121 -0
- modules/iatp/.gitignore +67 -0
- modules/iatp/.pre-commit-config.yaml +64 -0
- modules/iatp/CHANGELOG.md +120 -0
- modules/iatp/Dockerfile +91 -0
- modules/iatp/IMPLEMENTATION_SUMMARY.md +218 -0
- modules/iatp/MANIFEST.in +9 -0
- modules/iatp/README.md +180 -0
- modules/iatp/docker/Dockerfile.agent +27 -0
- modules/iatp/docker/Dockerfile.sidecar-python +86 -0
- modules/iatp/docker/README.md +258 -0
- modules/iatp/docker-compose.yml +194 -0
- modules/iatp/docs/ARCHITECTURE.md +243 -0
- modules/iatp/docs/CLI_GUIDE.md +220 -0
- modules/iatp/docs/DEPLOYMENT.md +304 -0
- modules/iatp/examples/README.md +132 -0
- modules/iatp/examples/backend_agent.py +39 -0
- modules/iatp/examples/client.py +168 -0
- modules/iatp/examples/demo_attestation_reputation.py +274 -0
- modules/iatp/examples/demo_client.py +240 -0
- modules/iatp/examples/demo_rbac.py +143 -0
- modules/iatp/examples/integration_demo.py +245 -0
- modules/iatp/examples/manifests/coder_agent.json +20 -0
- modules/iatp/examples/manifests/reviewer_agent.json +19 -0
- modules/iatp/examples/manifests/secure_bank.json +14 -0
- modules/iatp/examples/manifests/standard_agent.json +14 -0
- modules/iatp/examples/manifests/untrusted_honeypot.json +14 -0
- modules/iatp/examples/run_secure_bank_sidecar.py +85 -0
- modules/iatp/examples/run_sidecar.py +105 -0
- modules/iatp/examples/run_untrusted_sidecar.py +77 -0
- modules/iatp/examples/secure_bank_agent.py +138 -0
- modules/iatp/examples/test_untrusted.py +82 -0
- modules/iatp/examples/untrusted_agent.py +119 -0
- modules/iatp/experiments/README.md +58 -0
- modules/iatp/experiments/cascading_hallucination/README.md +149 -0
- modules/iatp/experiments/cascading_hallucination/agent_a_user.py +41 -0
- modules/iatp/experiments/cascading_hallucination/agent_b_summarizer.py +54 -0
- modules/iatp/experiments/cascading_hallucination/agent_c_database.py +47 -0
- modules/iatp/experiments/cascading_hallucination/proof_of_concept.py +290 -0
- modules/iatp/experiments/cascading_hallucination/run_experiment.py +226 -0
- modules/iatp/experiments/cascading_hallucination/sidecar_c.py +61 -0
- modules/iatp/experiments/reproduce_results.py +574 -0
- modules/iatp/experiments/results.json +2336 -0
- modules/iatp/iatp/__init__.py +164 -0
- modules/iatp/iatp/attestation.py +401 -0
- modules/iatp/iatp/cli.py +253 -0
- modules/iatp/iatp/hf_utils.py +469 -0
- modules/iatp/iatp/ipc_pipes.py +578 -0
- modules/iatp/iatp/main.py +410 -0
- modules/iatp/iatp/models/__init__.py +445 -0
- modules/iatp/iatp/policy_engine.py +335 -0
- modules/iatp/iatp/py.typed +2 -0
- modules/iatp/iatp/recovery.py +319 -0
- modules/iatp/iatp/security/__init__.py +268 -0
- modules/iatp/iatp/sidecar/__init__.py +517 -0
- modules/iatp/iatp/telemetry/__init__.py +162 -0
- modules/iatp/iatp/tests/__init__.py +1 -0
- modules/iatp/iatp/tests/test_attestation.py +368 -0
- modules/iatp/iatp/tests/test_cli.py +129 -0
- modules/iatp/iatp/tests/test_models.py +128 -0
- modules/iatp/iatp/tests/test_policy_engine.py +345 -0
- modules/iatp/iatp/tests/test_recovery.py +279 -0
- modules/iatp/iatp/tests/test_security.py +220 -0
- modules/iatp/iatp/tests/test_sidecar.py +165 -0
- modules/iatp/iatp/tests/test_telemetry.py +173 -0
- modules/iatp/paper/BLOG.md +307 -0
- modules/iatp/paper/PAPER.md +236 -0
- modules/iatp/paper/RFC_SUBMISSION.md +299 -0
- modules/iatp/paper/whitepaper.md +369 -0
- modules/iatp/proto/README.md +200 -0
- modules/iatp/proto/generate_stubs.py +81 -0
- modules/iatp/proto/iatp.proto +552 -0
- modules/iatp/pyproject.toml +180 -0
- modules/iatp/requirements-dev.txt +2 -0
- modules/iatp/requirements.txt +6 -0
- modules/iatp/setup.py +60 -0
- modules/iatp/sidecar/README.md +487 -0
- modules/iatp/sidecar/go/Dockerfile +32 -0
- modules/iatp/sidecar/go/README.md +237 -0
- modules/iatp/sidecar/go/go.mod +8 -0
- modules/iatp/sidecar/go/main.go +488 -0
- modules/iatp/spec/001-handshake.md +436 -0
- modules/iatp/spec/002-reversibility.md +394 -0
- modules/iatp/spec/schema/capability_manifest.json +266 -0
- modules/iatp/test_integration.py +310 -0
- modules/mcp-kernel-server/README.md +261 -0
- modules/mcp-kernel-server/pyproject.toml +60 -0
- modules/mcp-kernel-server/src/mcp_kernel_server/__init__.py +26 -0
- modules/mcp-kernel-server/src/mcp_kernel_server/cli.py +229 -0
- modules/mcp-kernel-server/src/mcp_kernel_server/resources.py +215 -0
- modules/mcp-kernel-server/src/mcp_kernel_server/server.py +562 -0
- modules/mcp-kernel-server/src/mcp_kernel_server/tools.py +1172 -0
- modules/mute-agent/.github/workflows/safety_check.yml +45 -0
- modules/mute-agent/.gitignore +53 -0
- modules/mute-agent/ARCHITECTURE.md +531 -0
- modules/mute-agent/BENCHMARK_GUIDE.md +384 -0
- modules/mute-agent/COMPLETION_SUMMARY.md +293 -0
- modules/mute-agent/EXPERIMENT_SUMMARY.md +318 -0
- modules/mute-agent/IMPLEMENTATION_SUMMARY.md +212 -0
- modules/mute-agent/LICENSE +21 -0
- modules/mute-agent/PHASE3_SUMMARY.md +297 -0
- modules/mute-agent/README.md +360 -0
- modules/mute-agent/STEEL_MAN_RESULTS.md +353 -0
- modules/mute-agent/USAGE.md +505 -0
- modules/mute-agent/V2_IMPLEMENTATION_SUMMARY.md +253 -0
- modules/mute-agent/V2_STEEL_MAN_IMPLEMENTATION.md +274 -0
- modules/mute-agent/VERIFICATION_REPORT.md +435 -0
- modules/mute-agent/charts/cost_comparison.png +0 -0
- modules/mute-agent/charts/cost_vs_ambiguity.png +0 -0
- modules/mute-agent/charts/metrics_comparison.png +0 -0
- modules/mute-agent/charts/scenario_breakdown.png +0 -0
- modules/mute-agent/charts/trace_attack_blocked.html +140 -0
- modules/mute-agent/charts/trace_attack_blocked.png +0 -0
- modules/mute-agent/charts/trace_failure.html +140 -0
- modules/mute-agent/charts/trace_failure.png +0 -0
- modules/mute-agent/charts/trace_success.html +140 -0
- modules/mute-agent/charts/trace_success.png +0 -0
- modules/mute-agent/examples/__init__.py +1 -0
- modules/mute-agent/examples/advanced_example.py +384 -0
- modules/mute-agent/examples/graph_debugger_demo.py +241 -0
- modules/mute-agent/examples/listener_example.py +297 -0
- modules/mute-agent/examples/simple_example.py +242 -0
- modules/mute-agent/examples/steel_man_demo.py +297 -0
- modules/mute-agent/experiments/README.md +135 -0
- modules/mute-agent/experiments/__init__.py +3 -0
- modules/mute-agent/experiments/agent_comparison.csv +6 -0
- modules/mute-agent/experiments/agent_comparison_50runs.csv +6 -0
- modules/mute-agent/experiments/ambiguity_test.py +335 -0
- modules/mute-agent/experiments/ambiguity_test_results.csv +31 -0
- modules/mute-agent/experiments/ambiguity_test_results_50runs.csv +51 -0
- modules/mute-agent/experiments/baseline_agent.py +189 -0
- modules/mute-agent/experiments/benchmark.py +402 -0
- modules/mute-agent/experiments/demo.py +172 -0
- modules/mute-agent/experiments/generate_cost_curve.py +474 -0
- modules/mute-agent/experiments/jailbreak_test.py +137 -0
- modules/mute-agent/experiments/latent_state_scenario.py +361 -0
- modules/mute-agent/experiments/mute_agent_experiment.py +349 -0
- modules/mute-agent/experiments/run_extended_experiment.py +40 -0
- modules/mute-agent/experiments/run_v2_experiments.py +266 -0
- modules/mute-agent/experiments/run_v2_experiments_auto.py +247 -0
- modules/mute-agent/experiments/v2_scenarios/README.md +214 -0
- modules/mute-agent/experiments/v2_scenarios/__init__.py +4 -0
- modules/mute-agent/experiments/v2_scenarios/scenario_1_deep_dependency.py +325 -0
- modules/mute-agent/experiments/v2_scenarios/scenario_2_adversarial.py +328 -0
- modules/mute-agent/experiments/v2_scenarios/scenario_3_false_positive.py +303 -0
- modules/mute-agent/experiments/v2_scenarios/scenario_4_performance.py +319 -0
- modules/mute-agent/experiments/visualize.py +400 -0
- modules/mute-agent/mute_agent/__init__.py +66 -0
- modules/mute-agent/mute_agent/core/__init__.py +1 -0
- modules/mute-agent/mute_agent/core/execution_agent.py +164 -0
- modules/mute-agent/mute_agent/core/handshake_protocol.py +199 -0
- modules/mute-agent/mute_agent/core/reasoning_agent.py +236 -0
- modules/mute-agent/mute_agent/knowledge_graph/__init__.py +1 -0
- modules/mute-agent/mute_agent/knowledge_graph/graph_elements.py +63 -0
- modules/mute-agent/mute_agent/knowledge_graph/multidimensional_graph.py +168 -0
- modules/mute-agent/mute_agent/knowledge_graph/subgraph.py +222 -0
- modules/mute-agent/mute_agent/listener/__init__.py +41 -0
- modules/mute-agent/mute_agent/listener/adapters/__init__.py +29 -0
- modules/mute-agent/mute_agent/listener/adapters/base_adapter.py +187 -0
- modules/mute-agent/mute_agent/listener/adapters/caas_adapter.py +342 -0
- modules/mute-agent/mute_agent/listener/adapters/control_plane_adapter.py +434 -0
- modules/mute-agent/mute_agent/listener/adapters/iatp_adapter.py +330 -0
- modules/mute-agent/mute_agent/listener/adapters/scak_adapter.py +249 -0
- modules/mute-agent/mute_agent/listener/listener.py +608 -0
- modules/mute-agent/mute_agent/listener/state_observer.py +434 -0
- modules/mute-agent/mute_agent/listener/threshold_config.py +311 -0
- modules/mute-agent/mute_agent/super_system/__init__.py +1 -0
- modules/mute-agent/mute_agent/super_system/router.py +202 -0
- modules/mute-agent/mute_agent/visualization/__init__.py +8 -0
- modules/mute-agent/mute_agent/visualization/graph_debugger.py +495 -0
- modules/mute-agent/requirements-dev.txt +6 -0
- modules/mute-agent/requirements.txt +9 -0
- modules/mute-agent/setup.py +64 -0
- modules/mute-agent/src/__init__.py +0 -0
- modules/mute-agent/src/agents/__init__.py +0 -0
- modules/mute-agent/src/agents/baseline_agent.py +524 -0
- modules/mute-agent/src/agents/interactive_agent.py +113 -0
- modules/mute-agent/src/agents/mute_agent.py +622 -0
- modules/mute-agent/src/benchmarks/__init__.py +0 -0
- modules/mute-agent/src/benchmarks/evaluator.py +481 -0
- modules/mute-agent/src/benchmarks/scenarios.json +985 -0
- modules/mute-agent/src/core/__init__.py +0 -0
- modules/mute-agent/src/core/mock_state.py +320 -0
- modules/mute-agent/src/core/tools.py +441 -0
- modules/nexus/__init__.py +49 -0
- modules/nexus/arbiter.py +357 -0
- modules/nexus/client.py +464 -0
- modules/nexus/dmz.py +417 -0
- modules/nexus/escrow.py +428 -0
- modules/nexus/exceptions.py +284 -0
- modules/nexus/registry.py +391 -0
- modules/nexus/reputation.py +423 -0
- modules/nexus/schemas/__init__.py +49 -0
- modules/nexus/schemas/compliance.py +274 -0
- modules/nexus/schemas/escrow.py +249 -0
- modules/nexus/schemas/manifest.py +223 -0
- modules/nexus/schemas/receipt.py +206 -0
- modules/observability/README.md +192 -0
- modules/observability/alertmanager/alertmanager.yml +116 -0
- modules/observability/alerts/agent-os-alerts.yaml +197 -0
- modules/observability/docker-compose.yml +128 -0
- modules/observability/grafana/dashboards/agent-os-amb.json +448 -0
- modules/observability/grafana/dashboards/agent-os-cmvk.json +441 -0
- modules/observability/grafana/dashboards/agent-os-overview.json +268 -0
- modules/observability/grafana/dashboards/agent-os-performance.json +15 -0
- modules/observability/grafana/dashboards/agent-os-safety.json +50 -0
- modules/observability/grafana/provisioning/dashboards/dashboards.yml +15 -0
- modules/observability/grafana/provisioning/datasources/datasources.yml +33 -0
- modules/observability/otel/otel-collector-config.yml +61 -0
- modules/observability/prometheus/prometheus.yml +63 -0
- modules/observability/pyproject.toml +53 -0
- modules/observability/scripts/export_dashboards.py +55 -0
- modules/observability/src/agent_os_observability/__init__.py +25 -0
- modules/observability/src/agent_os_observability/dashboards.py +896 -0
- modules/observability/src/agent_os_observability/metrics.py +396 -0
- modules/observability/src/agent_os_observability/server.py +221 -0
- modules/observability/src/agent_os_observability/tracer.py +226 -0
- modules/primitives/.gitignore +8 -0
- modules/primitives/README.md +62 -0
- modules/primitives/agent_primitives/__init__.py +22 -0
- modules/primitives/agent_primitives/failures.py +82 -0
- modules/primitives/agent_primitives/py.typed +0 -0
- modules/primitives/pyproject.toml +68 -0
- modules/scak/.github/copilot-instructions.md +396 -0
- modules/scak/.github/workflows/release.yml +117 -0
- modules/scak/.gitignore +32 -0
- modules/scak/CHANGELOG.md +173 -0
- modules/scak/CITATION.cff +62 -0
- modules/scak/CONTRIBUTING.md +429 -0
- modules/scak/Dockerfile +58 -0
- modules/scak/ENTERPRISE_FEATURES.md +518 -0
- modules/scak/IMPLEMENTATION_SUMMARY.md +206 -0
- modules/scak/LIMITATIONS.md +565 -0
- modules/scak/MANIFEST.in +16 -0
- modules/scak/NOVELTY.md +535 -0
- modules/scak/README.md +928 -0
- modules/scak/RESEARCH.md +670 -0
- modules/scak/agent_kernel/__init__.py +66 -0
- modules/scak/agent_kernel/analyzer.py +432 -0
- modules/scak/agent_kernel/auditor.py +31 -0
- modules/scak/agent_kernel/completeness_auditor.py +234 -0
- modules/scak/agent_kernel/detector.py +200 -0
- modules/scak/agent_kernel/kernel.py +741 -0
- modules/scak/agent_kernel/memory_manager.py +82 -0
- modules/scak/agent_kernel/models.py +372 -0
- modules/scak/agent_kernel/nudge_mechanism.py +260 -0
- modules/scak/agent_kernel/outcome_analyzer.py +335 -0
- modules/scak/agent_kernel/patcher.py +579 -0
- modules/scak/agent_kernel/semantic_analyzer.py +313 -0
- modules/scak/agent_kernel/semantic_purge.py +346 -0
- modules/scak/agent_kernel/simulator.py +447 -0
- modules/scak/agent_kernel/teacher.py +82 -0
- modules/scak/agent_kernel/triage.py +149 -0
- modules/scak/build_and_publish.ps1 +74 -0
- modules/scak/build_and_publish.sh +74 -0
- modules/scak/cli.py +471 -0
- modules/scak/dashboard.py +462 -0
- modules/scak/datasets/DATASET_CARD.md +219 -0
- modules/scak/datasets/README.md +143 -0
- modules/scak/datasets/gaia_vague_queries/vague_queries.json +262 -0
- modules/scak/datasets/hf_upload/README.md +219 -0
- modules/scak/datasets/hf_upload/scak_gaia_laziness.jsonl +50 -0
- modules/scak/datasets/prepare_hf_datasets.py +145 -0
- modules/scak/datasets/red_team/jailbreak_patterns.json +202 -0
- modules/scak/docker-compose.yml +99 -0
- modules/scak/docs/Adaptive-Memory-Hierarchy.md +319 -0
- modules/scak/docs/Data-Contracts-and-Schemas.md +285 -0
- modules/scak/docs/Dual-Loop-Architecture.md +344 -0
- modules/scak/docs/Enhanced-Features.md +612 -0
- modules/scak/docs/LANGCHAIN_INTEGRATION.md +572 -0
- modules/scak/docs/README.md +128 -0
- modules/scak/docs/Reference-Implementations.md +163 -0
- modules/scak/docs/SCAK_V2.md +374 -0
- modules/scak/docs/Three-Failure-Types.md +178 -0
- modules/scak/examples/basic_example.py +155 -0
- modules/scak/examples/circuit_breaker_lazy_eval_demo.py +243 -0
- modules/scak/examples/langchain_integration_example.py +339 -0
- modules/scak/examples/layer4_demo.py +243 -0
- modules/scak/examples/production_features_demo.py +353 -0
- modules/scak/examples/quick_demo.py +79 -0
- modules/scak/examples/scak_v2_demo.py +252 -0
- modules/scak/experiments/README.md +438 -0
- modules/scak/experiments/ablation_studies/README.md +192 -0
- modules/scak/experiments/ablation_studies/ablation_no_audit.py +116 -0
- modules/scak/experiments/ablation_studies/ablation_no_purge.py +133 -0
- modules/scak/experiments/chaos_engineering/README.md +332 -0
- modules/scak/experiments/context_efficiency_test.py +328 -0
- modules/scak/experiments/gaia_benchmark/README.md +208 -0
- modules/scak/experiments/laziness_benchmark.py +179 -0
- modules/scak/experiments/long_horizon_task_experiment.py +252 -0
- modules/scak/experiments/multi_agent_rag_experiment.py +284 -0
- modules/scak/experiments/results/ablation_table.md +12 -0
- modules/scak/experiments/results/long_horizon.json +36 -0
- modules/scak/experiments/results/multi_agent_rag.json +66 -0
- modules/scak/experiments/run_comprehensive_ablations.py +332 -0
- modules/scak/experiments/test_auditor_patcher_integration.py +251 -0
- modules/scak/notebooks/getting_started.ipynb +33 -0
- modules/scak/paper/ARXIV_SUBMISSION_METADATA.txt +109 -0
- modules/scak/paper/PAPER_CHECKLIST.md +304 -0
- modules/scak/paper/Paper.pdf +0 -0
- modules/scak/paper/README.md +113 -0
- modules/scak/paper/appendix.md +351 -0
- modules/scak/paper/arxiv/bibliography.bib +284 -0
- modules/scak/paper/arxiv/fig1_ooda_architecture.pdf +0 -0
- modules/scak/paper/arxiv/fig2_memory_hierarchy.pdf +0 -0
- modules/scak/paper/arxiv/fig3_gaia_results.pdf +0 -0
- modules/scak/paper/arxiv/fig4_ablation_heatmap.pdf +0 -0
- modules/scak/paper/arxiv/fig5_context_reduction.pdf +0 -0
- modules/scak/paper/arxiv/fig6_mttr_boxplot.pdf +0 -0
- modules/scak/paper/arxiv/main.aux +103 -0
- modules/scak/paper/arxiv/main.bbl +113 -0
- modules/scak/paper/arxiv/main.blg +55 -0
- modules/scak/paper/arxiv/main.out +31 -0
- modules/scak/paper/arxiv/main.pdf +0 -0
- modules/scak/paper/arxiv/main.tex +482 -0
- modules/scak/paper/arxiv_submission/bibliography.bib +284 -0
- modules/scak/paper/arxiv_submission/fig1_ooda_architecture.pdf +0 -0
- modules/scak/paper/arxiv_submission/fig2_memory_hierarchy.pdf +0 -0
- modules/scak/paper/arxiv_submission/fig3_gaia_results.pdf +0 -0
- modules/scak/paper/arxiv_submission/fig4_ablation_heatmap.pdf +0 -0
- modules/scak/paper/arxiv_submission/fig5_context_reduction.pdf +0 -0
- modules/scak/paper/arxiv_submission/fig6_mttr_boxplot.pdf +0 -0
- modules/scak/paper/arxiv_submission/main.aux +103 -0
- modules/scak/paper/arxiv_submission/main.bbl +113 -0
- modules/scak/paper/arxiv_submission/main.blg +55 -0
- modules/scak/paper/arxiv_submission/main.out +31 -0
- modules/scak/paper/arxiv_submission/main.pdf +0 -0
- modules/scak/paper/arxiv_submission/main.tex +482 -0
- modules/scak/paper/arxiv_submission.tar.gz +0 -0
- modules/scak/paper/bibliography.bib +284 -0
- modules/scak/paper/build.sh +55 -0
- modules/scak/paper/figures/README.md +32 -0
- modules/scak/paper/figures/fig1_ooda_architecture.md +75 -0
- modules/scak/paper/figures/fig1_ooda_architecture.pdf +0 -0
- modules/scak/paper/figures/fig1_ooda_architecture.png +0 -0
- modules/scak/paper/figures/fig2_memory_hierarchy.md +83 -0
- modules/scak/paper/figures/fig2_memory_hierarchy.pdf +0 -0
- modules/scak/paper/figures/fig2_memory_hierarchy.png +0 -0
- modules/scak/paper/figures/fig3_gaia_results.md +64 -0
- modules/scak/paper/figures/fig3_gaia_results.pdf +0 -0
- modules/scak/paper/figures/fig3_gaia_results.png +0 -0
- modules/scak/paper/figures/fig4_ablation_heatmap.md +64 -0
- modules/scak/paper/figures/fig4_ablation_heatmap.pdf +0 -0
- modules/scak/paper/figures/fig4_ablation_heatmap.png +0 -0
- modules/scak/paper/figures/fig5_context_reduction.md +71 -0
- modules/scak/paper/figures/fig5_context_reduction.pdf +0 -0
- modules/scak/paper/figures/fig5_context_reduction.png +0 -0
- modules/scak/paper/figures/fig6_mttr_boxplot.md +80 -0
- modules/scak/paper/figures/fig6_mttr_boxplot.pdf +0 -0
- modules/scak/paper/figures/fig6_mttr_boxplot.png +0 -0
- modules/scak/paper/figures/generate_figures.py +463 -0
- modules/scak/paper/main.aux +103 -0
- modules/scak/paper/main.bbl +113 -0
- modules/scak/paper/main.blg +55 -0
- modules/scak/paper/main.md +192 -0
- modules/scak/paper/main.out +31 -0
- modules/scak/paper/main.pdf +0 -0
- modules/scak/paper/main.tex +482 -0
- modules/scak/reproducibility/ABLATIONS.md +225 -0
- modules/scak/reproducibility/Dockerfile.reproducibility +34 -0
- modules/scak/reproducibility/README.md +421 -0
- modules/scak/reproducibility/requirements-pinned.txt +32 -0
- modules/scak/reproducibility/run_all_experiments.py +395 -0
- modules/scak/reproducibility/seed_control.py +53 -0
- modules/scak/reproducibility/statistical_analysis.py +302 -0
- modules/scak/requirements.txt +50 -0
- modules/scak/setup.py +93 -0
- modules/scak/src/__init__.py +124 -0
- modules/scak/src/agents/__init__.py +13 -0
- modules/scak/src/agents/conflict_resolution.py +732 -0
- modules/scak/src/agents/orchestrator.py +761 -0
- modules/scak/src/agents/pubsub.py +484 -0
- modules/scak/src/agents/shadow_teacher.py +344 -0
- modules/scak/src/agents/swarm.py +661 -0
- modules/scak/src/agents/worker.py +357 -0
- modules/scak/src/integrations/__init__.py +81 -0
- modules/scak/src/integrations/cmvk_adapter.py +430 -0
- modules/scak/src/integrations/control_plane_adapter.py +601 -0
- modules/scak/src/integrations/langchain_integration.py +902 -0
- modules/scak/src/interfaces/__init__.py +59 -0
- modules/scak/src/interfaces/llm_clients.py +505 -0
- modules/scak/src/interfaces/openapi_tools.py +611 -0
- modules/scak/src/interfaces/plugin_system.py +605 -0
- modules/scak/src/interfaces/protocols.py +365 -0
- modules/scak/src/interfaces/telemetry.py +464 -0
- modules/scak/src/interfaces/tool_registry.py +547 -0
- modules/scak/src/kernel/__init__.py +100 -0
- modules/scak/src/kernel/auditor.py +305 -0
- modules/scak/src/kernel/circuit_breaker.py +398 -0
- modules/scak/src/kernel/core.py +724 -0
- modules/scak/src/kernel/distributed.py +667 -0
- modules/scak/src/kernel/evolution.py +455 -0
- modules/scak/src/kernel/failover.py +621 -0
- modules/scak/src/kernel/governance.py +710 -0
- modules/scak/src/kernel/governance_v2.py +603 -0
- modules/scak/src/kernel/lazy_evaluator.py +514 -0
- modules/scak/src/kernel/load_testing.py +633 -0
- modules/scak/src/kernel/memory.py +945 -0
- modules/scak/src/kernel/patcher.py +581 -0
- modules/scak/src/kernel/rubric.py +419 -0
- modules/scak/src/kernel/schemas.py +390 -0
- modules/scak/src/kernel/skill_mapper.py +309 -0
- modules/scak/src/kernel/triage.py +149 -0
- modules/scak/src/mocks/__init__.py +99 -0
- modules/scak/tests/__init__.py +1 -0
- modules/scak/tests/test_circuit_breaker.py +403 -0
- modules/scak/tests/test_conflict_resolution.py +287 -0
- modules/scak/tests/test_dual_loop.py +463 -0
- modules/scak/tests/test_enhanced_features.py +421 -0
- modules/scak/tests/test_failover_and_load.py +438 -0
- modules/scak/tests/test_governance.py +185 -0
- modules/scak/tests/test_kernel.py +359 -0
- modules/scak/tests/test_langchain_integration.py +451 -0
- modules/scak/tests/test_lazy_evaluator.py +465 -0
- modules/scak/tests/test_llm_clients.py +122 -0
- modules/scak/tests/test_memory_controller.py +528 -0
- modules/scak/tests/test_orchestrator.py +181 -0
- modules/scak/tests/test_phase3_integration.py +265 -0
- modules/scak/tests/test_pubsub_swarm.py +203 -0
- modules/scak/tests/test_reference_implementations.py +240 -0
- modules/scak/tests/test_rubric.py +363 -0
- modules/scak/tests/test_scak_v2.py +651 -0
- modules/scak/tests/test_skill_mapper.py +217 -0
- modules/scak/tests/test_specific_failures.py +393 -0
- modules/scak/tests/test_tool_registry.py +264 -0
- modules/scak/tests/test_tools_and_plugins.py +303 -0
- modules/scak/tests/test_triage.py +596 -0
- modules/scak/tests/test_write_through.py +319 -0
- agent_os_kernel-1.1.0.dist-info/METADATA +0 -400
- agent_os_kernel-1.1.0.dist-info/RECORD +0 -12
- {agent_os_kernel-1.1.0.dist-info → agent_os_kernel-1.2.0.dist-info}/WHEEL +0 -0
- {agent_os_kernel-1.1.0.dist-info → agent_os_kernel-1.2.0.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,271 @@
|
|
|
1
|
+
"""
|
|
2
|
+
HumanEval Dataset Loader
|
|
3
|
+
|
|
4
|
+
This module provides utilities for loading and processing the HumanEval dataset,
|
|
5
|
+
the industry standard benchmark for code generation models.
|
|
6
|
+
|
|
7
|
+
The HumanEval dataset contains 164 hand-written programming problems with function
|
|
8
|
+
signatures, docstrings, and unit tests. It's widely used to evaluate code generation
|
|
9
|
+
capabilities of language models.
|
|
10
|
+
|
|
11
|
+
Reference: "Evaluating Large Language Models Trained on Code" (Chen et al., 2021)
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
import json
|
|
15
|
+
from pathlib import Path
|
|
16
|
+
from typing import Any
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class HumanEvalLoader:
|
|
20
|
+
"""
|
|
21
|
+
Loader for the HumanEval dataset.
|
|
22
|
+
|
|
23
|
+
This class handles loading and formatting HumanEval problems for use
|
|
24
|
+
with the Cross-Model Verification Kernel.
|
|
25
|
+
"""
|
|
26
|
+
|
|
27
|
+
def __init__(self, dataset_path: str | None = None):
|
|
28
|
+
"""
|
|
29
|
+
Initialize the HumanEval loader.
|
|
30
|
+
|
|
31
|
+
Args:
|
|
32
|
+
dataset_path: Path to the HumanEval JSON file. If None, uses the
|
|
33
|
+
sample dataset in experiments/datasets/humaneval_sample.json
|
|
34
|
+
"""
|
|
35
|
+
if dataset_path is None:
|
|
36
|
+
# Use default sample dataset
|
|
37
|
+
# Go up from datasets -> cross_model_verification_kernel -> src -> repo_root
|
|
38
|
+
base_dir = Path(__file__).parent.parent.parent.parent
|
|
39
|
+
dataset_path = base_dir / "experiments" / "datasets" / "humaneval_sample.json"
|
|
40
|
+
|
|
41
|
+
self.dataset_path = Path(dataset_path)
|
|
42
|
+
self.problems = []
|
|
43
|
+
self._load_dataset()
|
|
44
|
+
|
|
45
|
+
def _load_dataset(self):
|
|
46
|
+
"""Load the dataset from JSON file."""
|
|
47
|
+
if not self.dataset_path.exists():
|
|
48
|
+
raise FileNotFoundError(
|
|
49
|
+
f"HumanEval dataset not found at {self.dataset_path}. "
|
|
50
|
+
"Please provide a valid path or ensure the sample dataset exists."
|
|
51
|
+
)
|
|
52
|
+
|
|
53
|
+
with open(self.dataset_path, encoding="utf-8") as f:
|
|
54
|
+
self.problems = json.load(f)
|
|
55
|
+
|
|
56
|
+
print(f"✅ Loaded {len(self.problems)} problems from HumanEval dataset")
|
|
57
|
+
|
|
58
|
+
def get_all_problems(self) -> list[dict[str, Any]]:
|
|
59
|
+
"""
|
|
60
|
+
Get all problems from the dataset.
|
|
61
|
+
|
|
62
|
+
Returns:
|
|
63
|
+
List of problem dictionaries
|
|
64
|
+
"""
|
|
65
|
+
return self.problems
|
|
66
|
+
|
|
67
|
+
def get_problem(self, task_id: str) -> dict[str, Any] | None:
|
|
68
|
+
"""
|
|
69
|
+
Get a specific problem by task ID.
|
|
70
|
+
|
|
71
|
+
Args:
|
|
72
|
+
task_id: The task ID (e.g., "HumanEval/0")
|
|
73
|
+
|
|
74
|
+
Returns:
|
|
75
|
+
Problem dictionary or None if not found
|
|
76
|
+
"""
|
|
77
|
+
for problem in self.problems:
|
|
78
|
+
if problem.get("task_id") == task_id:
|
|
79
|
+
return problem
|
|
80
|
+
return None
|
|
81
|
+
|
|
82
|
+
def get_problem_by_index(self, index: int) -> dict[str, Any] | None:
|
|
83
|
+
"""
|
|
84
|
+
Get a problem by its index in the dataset.
|
|
85
|
+
|
|
86
|
+
Args:
|
|
87
|
+
index: Zero-based index
|
|
88
|
+
|
|
89
|
+
Returns:
|
|
90
|
+
Problem dictionary or None if index out of range
|
|
91
|
+
"""
|
|
92
|
+
if 0 <= index < len(self.problems):
|
|
93
|
+
return self.problems[index]
|
|
94
|
+
return None
|
|
95
|
+
|
|
96
|
+
def format_for_kernel(self, problem: dict[str, Any]) -> dict[str, Any]:
|
|
97
|
+
"""
|
|
98
|
+
Format a HumanEval problem for use with the Verification Kernel.
|
|
99
|
+
|
|
100
|
+
The kernel expects problems in a specific format with 'id' and 'query' keys.
|
|
101
|
+
|
|
102
|
+
Args:
|
|
103
|
+
problem: Raw HumanEval problem dictionary
|
|
104
|
+
|
|
105
|
+
Returns:
|
|
106
|
+
Formatted problem dictionary with 'id', 'query', and metadata
|
|
107
|
+
"""
|
|
108
|
+
task_id = problem.get("task_id", "unknown")
|
|
109
|
+
prompt = problem.get("prompt", "")
|
|
110
|
+
test = problem.get("test", "")
|
|
111
|
+
entry_point = problem.get("entry_point", "")
|
|
112
|
+
|
|
113
|
+
# Create a detailed query that includes the function signature and docstring
|
|
114
|
+
query = (
|
|
115
|
+
f"Complete the following Python function:\n\n"
|
|
116
|
+
f"{prompt}\n\n"
|
|
117
|
+
f"Requirements:\n"
|
|
118
|
+
f"- The function must pass all provided test cases\n"
|
|
119
|
+
f"- Follow the exact function signature provided\n"
|
|
120
|
+
f"- Entry point: {entry_point}"
|
|
121
|
+
)
|
|
122
|
+
|
|
123
|
+
return {
|
|
124
|
+
"id": task_id.replace("/", "_"), # Make filesystem-safe
|
|
125
|
+
"query": query,
|
|
126
|
+
"metadata": {
|
|
127
|
+
"task_id": task_id,
|
|
128
|
+
"entry_point": entry_point,
|
|
129
|
+
"test_code": test,
|
|
130
|
+
"original_prompt": prompt,
|
|
131
|
+
},
|
|
132
|
+
}
|
|
133
|
+
|
|
134
|
+
def get_problem_subset(self, start: int = 0, count: int = 10) -> list[dict[str, Any]]:
|
|
135
|
+
"""
|
|
136
|
+
Get a subset of problems from the dataset.
|
|
137
|
+
|
|
138
|
+
This is useful for running experiments on a smaller scale before
|
|
139
|
+
scaling up to the full dataset.
|
|
140
|
+
|
|
141
|
+
Args:
|
|
142
|
+
start: Starting index (default: 0)
|
|
143
|
+
count: Number of problems to return (default: 10)
|
|
144
|
+
|
|
145
|
+
Returns:
|
|
146
|
+
List of problem dictionaries
|
|
147
|
+
"""
|
|
148
|
+
end = min(start + count, len(self.problems))
|
|
149
|
+
return self.problems[start:end]
|
|
150
|
+
|
|
151
|
+
def format_all_for_kernel(
|
|
152
|
+
self, start: int = 0, count: int | None = None
|
|
153
|
+
) -> list[dict[str, Any]]:
|
|
154
|
+
"""
|
|
155
|
+
Format multiple problems for the kernel.
|
|
156
|
+
|
|
157
|
+
Args:
|
|
158
|
+
start: Starting index (default: 0)
|
|
159
|
+
count: Number of problems to format (default: all remaining)
|
|
160
|
+
|
|
161
|
+
Returns:
|
|
162
|
+
List of formatted problem dictionaries
|
|
163
|
+
"""
|
|
164
|
+
if count is None:
|
|
165
|
+
count = len(self.problems) - start
|
|
166
|
+
|
|
167
|
+
subset = self.get_problem_subset(start, count)
|
|
168
|
+
return [self.format_for_kernel(p) for p in subset]
|
|
169
|
+
|
|
170
|
+
def __len__(self) -> int:
|
|
171
|
+
"""Return the number of problems in the dataset."""
|
|
172
|
+
return len(self.problems)
|
|
173
|
+
|
|
174
|
+
def __iter__(self):
|
|
175
|
+
"""Iterate over all problems."""
|
|
176
|
+
return iter(self.problems)
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
def download_full_humaneval(
    output_path: str = "experiments/datasets/humaneval_full.jsonl",
    timeout: float = 60.0,
) -> str | None:
    """
    Download the full HumanEval dataset from the official source.

    Note: This requires internet access and the 'requests' library.
    The full dataset is available at:
    https://github.com/openai/human-eval

    The downloaded gzip-compressed JSON Lines file is decompressed and
    re-saved as a single JSON array, so the file actually written is
    ``output_path`` with its suffix replaced by ``.json``.

    Args:
        output_path: Where to save the downloaded dataset (the suffix is
            replaced by ``.json``; missing parent directories are created)
        timeout: Seconds to wait for the server before giving up, passed to
            ``requests.get`` so a stalled connection cannot hang forever

    Returns:
        Path of the written JSON file as a string, or None if 'requests'
        is not installed or the download/conversion failed (the reason is
        printed in both cases)
    """
    try:
        import requests
    except ImportError:
        print("❌ Error: 'requests' library is required to download the dataset.")
        print("Install it with: pip install requests")
        return None

    import gzip
    import io

    url = "https://github.com/openai/human-eval/raw/master/data/HumanEval.jsonl.gz"

    print(f"Downloading HumanEval dataset from {url}...")

    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()

        # Decompress in memory; each non-blank line is one JSON-encoded problem.
        with gzip.open(io.BytesIO(response.content), "rt", encoding="utf-8") as f:
            problems = [json.loads(line) for line in f if line.strip()]

        # Save as regular JSON for easier handling
        json_path = Path(output_path).with_suffix(".json")
        json_path.parent.mkdir(parents=True, exist_ok=True)

        with open(json_path, "w", encoding="utf-8") as f:
            json.dump(problems, f, indent=2)

        print(f"✅ Downloaded {len(problems)} problems to {json_path}")
        return str(json_path)

    except Exception as e:
        # Deliberate best-effort boundary: report any network, decoding or
        # filesystem failure to the user and signal it with None.
        print(f"❌ Failed to download HumanEval dataset: {e}")
        print("You can manually download it from: https://github.com/openai/human-eval")
        return None
|
|
232
|
+
|
|
233
|
+
|
|
234
|
+
if __name__ == "__main__":
    # Demo walkthrough of the loader: statistics, one problem, kernel
    # formatting, and a small formatted subset.
    banner = "=" * 80

    print(banner)
    print("HumanEval Dataset Loader - Demo")
    print(banner)

    # Load the sample dataset
    loader = HumanEvalLoader()

    print("\n📊 Dataset Statistics:")
    print(f" Total problems: {len(loader)}")

    # Show the first problem, when there is one
    print("\n📝 Example Problem:")
    first_problem = loader.get_problem_by_index(0)
    if first_problem:
        print(f" Task ID: {first_problem['task_id']}")
        print(f" Entry Point: {first_problem['entry_point']}")
        print(f" Prompt Preview: {first_problem['prompt'][:100]}...")

        # NOTE(review): nesting of this step under the guard is reconstructed;
        # confirm against the packaged file.
        print("\n🔧 Formatted for Kernel:")
        kernel_view = loader.format_for_kernel(first_problem)
        print(f" ID: {kernel_view['id']}")
        print(f" Query (first 150 chars): {kernel_view['query'][:150]}...")

    # Format a small slice of the dataset and list the resulting ids
    print("\n📦 Getting subset (5 problems):")
    formatted_subset = loader.format_all_for_kernel(start=0, count=5)
    for position, entry in enumerate(formatted_subset, 1):
        print(f" {position}. {entry['id']}")

    print("\n" + banner)
    print("To download the full HumanEval dataset:")
    print(
        " python -c 'from cross_model_verification_kernel.datasets.humaneval_loader import download_full_humaneval; download_full_humaneval()'"
    )
    print(banner)
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Generator Module
|
|
3
|
+
|
|
4
|
+
Implements the Generator component that creates code using one model.
|
|
5
|
+
The Generator is intentionally decoupled from the Verifier to enable adversarial verification.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from dataclasses import dataclass
|
|
9
|
+
from typing import Any
|
|
10
|
+
|
|
11
|
+
from .models import BaseModelInterface, MockModelInterface, ModelProvider
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
@dataclass
class GeneratorConfig:
    """Settings that control how a Generator calls its model.

    Generator reads these values when building its default model interface
    and on every generate_code() call.
    """

    # Model the generator is configured to use; also handed to the default
    # MockModelInterface when no custom interface is supplied.
    model: ModelProvider
    # Temperature passed to the model interface's generate() call.
    temperature: float = 0.7
    # max_tokens value passed to the model interface's generate() call.
    max_tokens: int = 2000
    # API key handed to the default MockModelInterface.
    api_key: str | None = None
    # Optional text placed before the generation prompt, separated by a blank line.
    custom_instructions: str | None = None
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
@dataclass
class GeneratedCode:
    """Container for the outcome of one Generator.generate_code() call."""

    # Text returned by the model (the response's content).
    code: str
    # Language that was requested for the generated code.
    language: str
    # Task description the code was generated for.
    description: str
    # Model identifier reported by the model response.
    model_used: str
    # Extra details; Generator fills in "generation_count", "provider" and
    # "response_metadata".
    metadata: dict[str, Any] | None = None
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class Generator:
    """
    Generator component that creates code using a specified model.

    This component is intentionally decoupled from verification to enable
    adversarial testing with different models.
    """

    def __init__(self, config: GeneratorConfig, model_interface: BaseModelInterface | None = None):
        """
        Initialize the Generator

        Args:
            config: Generator configuration
            model_interface: Optional custom model interface (uses mock if not provided)
        """
        self.config = config
        self.model_interface = model_interface or MockModelInterface(
            model=config.model, api_key=config.api_key
        )
        # Number of generate_code() calls started on this instance.
        self.generation_count = 0

    def generate_code(
        self, task_description: str, language: str = "python", **kwargs
    ) -> GeneratedCode:
        """
        Generate code based on a task description

        Args:
            task_description: Description of what the code should do
            language: Programming language (default: python)
            **kwargs: Additional generation parameters. A "temperature" or
                "max_tokens" entry overrides the configured value for this
                call only.

        Returns:
            GeneratedCode object containing the generated code and metadata
        """
        self.generation_count += 1

        # Build the prompt for code generation
        prompt = self._build_generation_prompt(task_description, language)

        # Add custom instructions if provided
        if self.config.custom_instructions:
            prompt = f"{self.config.custom_instructions}\n\n{prompt}"

        # Generate using the model. Defaults and overrides are merged first so
        # a caller-supplied temperature/max_tokens cannot collide with the
        # configured keyword and raise TypeError.
        response = self.model_interface.generate(prompt, **self._generation_params(kwargs))

        return GeneratedCode(
            code=response.content,
            language=language,
            description=task_description,
            model_used=response.model,
            metadata={
                "generation_count": self.generation_count,
                "provider": response.provider.value,
                "response_metadata": response.metadata,
            },
        )

    def _generation_params(self, overrides: dict[str, Any]) -> dict[str, Any]:
        """Merge the configured sampling defaults with per-call overrides (overrides win)."""
        return {
            "temperature": self.config.temperature,
            "max_tokens": self.config.max_tokens,
            **overrides,
        }

    def _build_generation_prompt(self, task_description: str, language: str) -> str:
        """Build the prompt for code generation"""
        return f"""Generate {language} code for the following task:

Task: {task_description}

Requirements:
- Write clean, functional code
- Include necessary imports
- Focus on correctness and readability

Generate the code:"""

    def get_stats(self) -> dict[str, Any]:
        """Get generator statistics"""
        return {
            "model": self.config.model.value,
            "generation_count": self.generation_count,
            "temperature": self.config.temperature,
            "max_tokens": self.config.max_tokens,
        }
|
|
@@ -0,0 +1,292 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Verification Kernel
|
|
3
|
+
|
|
4
|
+
The main orchestrator for the adversarial architecture.
|
|
5
|
+
Coordinates Generator and Verifier to provide model-diverse verification.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from dataclasses import dataclass
|
|
9
|
+
from typing import Any
|
|
10
|
+
|
|
11
|
+
from .generator import GeneratedCode, Generator, GeneratorConfig
|
|
12
|
+
from .models import ModelProvider
|
|
13
|
+
from .verifier import VerificationReport, Verifier, VerifierConfig
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
@dataclass
class BlindSpotAnalysis:
    """Figures describing how model diversity changes the shared blind-spot risk."""

    # Probability that one model makes an error.
    single_model_error_prob: float
    # Joint error probability if the two models were independent.
    independent_error_prob: float
    # Correlation coefficient used for the model pair.
    correlation_coefficient: float
    # Joint error probability once the correlation is taken into account.
    combined_error_prob: float
    # single_model_error_prob divided by combined_error_prob.
    risk_reduction_factor: float

    def __str__(self) -> str:
        """Render the analysis as a titled, one-figure-per-line text block."""
        report_lines = [
            "Blind Spot Analysis:",
            f"- Single model error probability: {self.single_model_error_prob:.4f}",
            f"- Independent error probability: {self.independent_error_prob:.4f}",
            f"- Model correlation coefficient: {self.correlation_coefficient:.4f}",
            f"- Combined error probability: {self.combined_error_prob:.4f}",
            f"- Risk reduction factor: {self.risk_reduction_factor:.2f}x",
        ]
        return "\n".join(report_lines)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
@dataclass
class VerificationResult:
    """Complete result of one adversarial verification run (built by VerificationKernel.verify_task)."""

    # Output of the generator step.
    generated_code: GeneratedCode
    # Report produced by the verifier for that code.
    verification_report: VerificationReport
    # Blind-spot figures computed for the generator/verifier model pair.
    blind_spot_analysis: BlindSpotAnalysis
    # model_used value taken from the generated code.
    generator_model: str
    # model_used value taken from the verification report.
    verifier_model: str
    # Mirrors verification_report.passed.
    success: bool
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
class VerificationKernel:
    """
    Orchestrator that pairs a Generator with a Verifier on a different model.

    Each verify_task() call generates code, has the verifier review it, attaches
    an estimate of how much the model pairing reduces shared blind spots, and
    records the outcome in ``verification_history``.
    """

    def __init__(self, generator_config: GeneratorConfig, verifier_config: VerifierConfig):
        """
        Build the generator and verifier from their configurations.

        Args:
            generator_config: Configuration for the generator
            verifier_config: Configuration for the verifier

        Raises:
            ValueError: If generator and verifier use the same model
        """
        # Model diversity is a hard requirement, so reject identical models up front.
        same_model = generator_config.model == verifier_config.model
        if same_model:
            message = (
                f"Generator and Verifier must use DIFFERENT models for adversarial verification. "
                f"Both are configured to use {generator_config.model.value}. "
                f"This defeats the purpose of model diversity."
            )
            raise ValueError(message)

        self.generator = Generator(generator_config)
        self.verifier = Verifier(verifier_config)
        self.verification_history = []

    def verify_task(
        self, task_description: str, language: str = "python", **kwargs
    ) -> VerificationResult:
        """
        Run generation, verification and blind-spot analysis for one task.

        Args:
            task_description: Description of the task to generate code for
            language: Programming language
            **kwargs: Extra parameters, forwarded to both the generator and the verifier

        Returns:
            VerificationResult with complete analysis; it is also appended to
            ``verification_history``.
        """
        # Generation step (generator model)
        candidate = self.generator.generate_code(
            task_description=task_description, language=language, **kwargs
        )

        # Review step (verifier model)
        report = self.verifier.verify_code(
            code=candidate.code, description=task_description, language=language, **kwargs
        )

        # Blind-spot estimate for this model pairing
        analysis = self._calculate_blind_spot_reduction(
            generator_model=self.generator.config.model,
            verifier_model=self.verifier.config.model,
        )

        outcome = VerificationResult(
            generated_code=candidate,
            verification_report=report,
            blind_spot_analysis=analysis,
            generator_model=candidate.model_used,
            verifier_model=report.model_used,
            success=report.passed,
        )
        self.verification_history.append(outcome)
        return outcome

    def _calculate_blind_spot_reduction(
        self, generator_model: ModelProvider, verifier_model: ModelProvider
    ) -> BlindSpotAnalysis:
        """
        Estimate how much the model pairing lowers the shared-error probability.

        With p the single-model error probability and ρ the correlation
        between the two models:
        - independent models: P(both_error) = p²
        - correlated models:  P(both_error) = p² + ρ*p*(1-p)

        The risk reduction factor is p / P(both_error). A lower ρ gives a
        lower P(both_error) and therefore a higher factor.
        """
        # Fixed estimate: 15% chance of missing a bug (can be calibrated from real data)
        p_error = 0.15

        rho = self._estimate_model_correlation(generator_model, verifier_model)

        # Theoretical minimum: fully independent models
        p_independent = p_error**2

        # Joint error probability including the correlation term
        p_combined = p_error**2 + rho * p_error * (1 - p_error)

        reduction = p_error / p_combined

        return BlindSpotAnalysis(
            single_model_error_prob=p_error,
            independent_error_prob=p_independent,
            correlation_coefficient=rho,
            combined_error_prob=p_combined,
            risk_reduction_factor=reduction,
        )

    def _estimate_model_correlation(self, model1: ModelProvider, model2: ModelProvider) -> float:
        """
        Estimate the correlation coefficient between two models.

        Returns:
            0.5 when both models map to the same provider family, 0.2 otherwise.
        """
        family_a = self._get_provider_family(model1)
        family_b = self._get_provider_family(model2)

        if family_a == family_b:
            # Same provider: higher correlation (0.4-0.6)
            return 0.5
        # Different providers: low correlation (0.1-0.3)
        return 0.2

    def _get_provider_family(self, model: ModelProvider) -> str:
        """Map a model to its provider family by the prefix of ``model.value``."""
        prefix_to_family = (
            ("gpt", "openai"),
            ("gemini", "google"),
            ("claude", "anthropic"),
        )
        model_name = model.value
        for prefix, family in prefix_to_family:
            if model_name.startswith(prefix):
                return family
        return "unknown"

    def get_statistics(self) -> dict[str, Any]:
        """Summarise the recorded verification history and both components' stats."""
        history = self.verification_history
        total = len(history)
        passed_count = len([entry for entry in history if entry.success])
        failed_count = total - passed_count

        if total > 0:
            reduction_sum = sum(
                entry.blind_spot_analysis.risk_reduction_factor for entry in history
            )
            mean_reduction = reduction_sum / total
            pass_rate = passed_count / total
        else:
            # Nothing recorded yet: report zeros rather than dividing by zero.
            mean_reduction = 0
            pass_rate = 0

        generator_model = self.generator.config.model
        verifier_model = self.verifier.config.model

        return {
            "generator": self.generator.get_stats(),
            "verifier": self.verifier.get_stats(),
            "total_verifications": total,
            "passed": passed_count,
            "failed": failed_count,
            "success_rate": pass_rate,
            "average_risk_reduction_factor": mean_reduction,
            "model_diversity": {
                "generator": generator_model.value,
                "verifier": verifier_model.value,
                "are_different": generator_model != verifier_model,
            },
        }

    def print_verification_summary(self, result: VerificationResult):
        """Write a human-readable report for one verification result to stdout."""
        heavy_rule = "=" * 80
        light_rule = "-" * 80

        print(heavy_rule)
        print("ADVERSARIAL VERIFICATION REPORT")
        print(heavy_rule)
        print(f"\nGenerator Model: {result.generator_model}")
        print(f"Verifier Model: {result.verifier_model}")
        print(f"\nTask: {result.generated_code.description}")
        print(f"\nGenerated Code ({result.generated_code.language}):")
        print(light_rule)
        print(result.generated_code.code)
        print(light_rule)

        status_label = "✓ PASSED" if result.success else "✗ FAILED"
        print(f"\nVerification Status: {status_label}")
        print(f"Summary: {result.verification_report.summary}")
        print(f"\nIssues Found: {len(result.verification_report.issues)}")

        # One numbered entry per issue, with the suggestion line only when present.
        for number, finding in enumerate(result.verification_report.issues, 1):
            print(f"\n {number}. [{finding.severity.value.upper()}] {finding.category}")
            print(f" {finding.description}")
            if finding.suggestion:
                print(f" Suggestion: {finding.suggestion}")

        print(f"\n{result.blind_spot_analysis}")
        print(heavy_rule)
|
|
243
|
+
|
|
244
|
+
|
|
245
|
+
def prosecutor_check(kernel, code_snippet: str) -> bool:
    """
    Run the Prosecutor workflow: generate a hostile test and execute it against the code.

    A GeminiVerifier produces an attack script for ``code_snippet``; the snippet
    and the attack are concatenated and executed through a Sandbox. A sandbox
    status of "success" counts as the code surviving the attack.

    Args:
        kernel: Accepted for interface compatibility; this function never reads
            it and constructs its own GeminiVerifier and Sandbox instead.
        code_snippet: The code to test

    Returns:
        bool: True if code survived the attack, False if it was broken

    Security Note:
        Both ``code_snippet`` and the generated attack are executed via
        ``Sandbox.execute``. NOTE(review): the original author states that the
        sandbox enforces a timeout and resource limits; that is not visible
        here, so confirm against the Sandbox implementation. Intended for
        trusted code verification workflows, not for arbitrary untrusted code.
    """
    from .agents.verifier_gemini import GeminiVerifier
    from .tools.sandbox import Sandbox

    prosecutor = GeminiVerifier()
    runner = Sandbox()

    print("🕵️ Prosecutor (Gemini) is analyzing code...")

    # Step 1: have the prosecutor write the attack
    hostile_test = prosecutor.generate_hostile_test(code_snippet)
    print(f"⚔️ Generated Hostile Test:\n{hostile_test}\n")

    # Step 2: target first, attack second, in a single script
    combined_script = f"{code_snippet}\n\n{hostile_test}"

    # Step 3: execute the combined script through the sandbox
    print("RUNNING IN SANDBOX...")
    outcome = runner.execute(combined_script)

    # Step 4: verdict. Any status other than "success" means the attack landed.
    if outcome["status"] != "success":
        print(f"❌ FAILED: The Prosecutor broke the code.\nError: {outcome['error']}")
        return False

    # A clean run means the code SURVIVED the attack
    # (assuming the attack was meant to assert/crash on a bug).
    print("✅ PASSED: The code survived the hostile test.")
    return True
|