agent-os-kernel 1.1.0__py3-none-any.whl → 1.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agent_os/__init__.py +66 -4
- agent_os/agents_compat.py +286 -0
- agent_os/base_agent.py +308 -0
- agent_os/cli.py +1079 -19
- agent_os/integrations/__init__.py +37 -2
- agent_os/integrations/openai_adapter.py +502 -0
- agent_os/integrations/semantic_kernel_adapter.py +569 -0
- agent_os/stateless.py +349 -0
- agent_os_kernel-1.3.0.dist-info/METADATA +676 -0
- agent_os_kernel-1.3.0.dist-info/RECORD +1053 -0
- {agent_os_kernel-1.1.0.dist-info → agent_os_kernel-1.3.0.dist-info}/entry_points.txt +0 -1
- modules/amb/.github/workflows/ci.yml +102 -0
- modules/amb/.github/workflows/publish.yml +146 -0
- modules/amb/.gitignore +134 -0
- modules/amb/CHANGELOG.md +118 -0
- modules/amb/CONTRIBUTING.md +141 -0
- modules/amb/LICENSE +21 -0
- modules/amb/README.md +188 -0
- modules/amb/amb_core/__init__.py +175 -0
- modules/amb/amb_core/adapters/__init__.py +55 -0
- modules/amb/amb_core/adapters/aws_sqs_broker.py +374 -0
- modules/amb/amb_core/adapters/azure_servicebus_broker.py +338 -0
- modules/amb/amb_core/adapters/kafka_broker.py +258 -0
- modules/amb/amb_core/adapters/nats_broker.py +283 -0
- modules/amb/amb_core/adapters/rabbitmq_broker.py +233 -0
- modules/amb/amb_core/adapters/redis_broker.py +260 -0
- modules/amb/amb_core/broker.py +143 -0
- modules/amb/amb_core/bus.py +479 -0
- modules/amb/amb_core/cloudevents.py +507 -0
- modules/amb/amb_core/dlq.py +343 -0
- modules/amb/amb_core/hf_utils.py +534 -0
- modules/amb/amb_core/memory_broker.py +408 -0
- modules/amb/amb_core/models.py +139 -0
- modules/amb/amb_core/persistence.py +527 -0
- modules/amb/amb_core/schema.py +292 -0
- modules/amb/amb_core/tracing.py +356 -0
- modules/amb/examples/advanced_features.py +223 -0
- modules/amb/examples/backpressure_demo.py +225 -0
- modules/amb/examples/basic_usage.py +117 -0
- modules/amb/examples/tracing_demo.py +104 -0
- modules/amb/experiments/README.md +52 -0
- modules/amb/experiments/reproduce_results.py +467 -0
- modules/amb/experiments/results.json +324 -0
- modules/amb/paper/README.md +40 -0
- modules/amb/paper/paper.tex +365 -0
- modules/amb/paper/whitepaper.md +377 -0
- modules/amb/pyproject.toml +117 -0
- modules/amb/tests/__init__.py +1 -0
- modules/amb/tests/test_backpressure_priority.py +280 -0
- modules/amb/tests/test_bus.py +198 -0
- modules/amb/tests/test_cloudevents.py +443 -0
- modules/amb/tests/test_features.py +531 -0
- modules/amb/tests/test_models.py +74 -0
- modules/amb/tests/test_tracing.py +254 -0
- modules/atr/.github/workflows/ci.yml +101 -0
- modules/atr/.github/workflows/publish.yml +140 -0
- modules/atr/.gitignore +134 -0
- modules/atr/.pre-commit-config.yaml +37 -0
- modules/atr/CHANGELOG.md +39 -0
- modules/atr/CONTRIBUTING.md +96 -0
- modules/atr/IMPLEMENTATION_SUMMARY.md +143 -0
- modules/atr/README.md +180 -0
- modules/atr/atr/__init__.py +638 -0
- modules/atr/atr/access.py +346 -0
- modules/atr/atr/composition.py +643 -0
- modules/atr/atr/decorator.py +355 -0
- modules/atr/atr/executor.py +382 -0
- modules/atr/atr/health.py +555 -0
- modules/atr/atr/hf_utils.py +447 -0
- modules/atr/atr/injection.py +420 -0
- modules/atr/atr/metrics.py +438 -0
- modules/atr/atr/policies.py +401 -0
- modules/atr/atr/py.typed +2 -0
- modules/atr/atr/registry.py +450 -0
- modules/atr/atr/schema.py +478 -0
- modules/atr/atr/tools/safe/__init__.py +73 -0
- modules/atr/atr/tools/safe/calculator.py +380 -0
- modules/atr/atr/tools/safe/datetime_tool.py +441 -0
- modules/atr/atr/tools/safe/file_reader.py +400 -0
- modules/atr/atr/tools/safe/http_client.py +314 -0
- modules/atr/atr/tools/safe/json_parser.py +372 -0
- modules/atr/atr/tools/safe/text_tool.py +526 -0
- modules/atr/atr/tools/safe/toolkit.py +173 -0
- modules/atr/docs/PYPI_SETUP.md +113 -0
- modules/atr/examples/README.md +27 -0
- modules/atr/examples/demo.py +144 -0
- modules/atr/examples/sandbox_demo.py +218 -0
- modules/atr/experiments/README.md +69 -0
- modules/atr/experiments/reproduce_results.py +509 -0
- modules/atr/experiments/results/.gitkeep +0 -0
- modules/atr/experiments/results/results_20260123_140334.json +71 -0
- modules/atr/paper/README.md +36 -0
- modules/atr/paper/figures/.gitkeep +0 -0
- modules/atr/paper/references.bib +84 -0
- modules/atr/paper/structure.tex +293 -0
- modules/atr/paper/whitepaper.md +234 -0
- modules/atr/pyproject.toml +148 -0
- modules/atr/requirements.txt +1 -0
- modules/atr/setup.py +30 -0
- modules/atr/tests/__init__.py +1 -0
- modules/atr/tests/test_decorator.py +317 -0
- modules/atr/tests/test_executor.py +245 -0
- modules/atr/tests/test_integration_executor.py +184 -0
- modules/atr/tests/test_registry.py +312 -0
- modules/atr/tests/test_schema.py +182 -0
- modules/atr/tests/test_v2_features.py +708 -0
- modules/caas/.dockerignore +63 -0
- modules/caas/.github/ISSUE_TEMPLATE/bug_report.md +38 -0
- modules/caas/.github/ISSUE_TEMPLATE/custom.md +10 -0
- modules/caas/.github/ISSUE_TEMPLATE/feature_request.md +20 -0
- modules/caas/.github/workflows/ci.yml +100 -0
- modules/caas/.github/workflows/lint.yml +39 -0
- modules/caas/.github/workflows/publish-pypi.yml +124 -0
- modules/caas/.gitignore +73 -0
- modules/caas/.pre-commit-config.yaml +33 -0
- modules/caas/CHANGELOG.md +58 -0
- modules/caas/CONTRIBUTING.md +346 -0
- modules/caas/Dockerfile +41 -0
- modules/caas/LICENSE +21 -0
- modules/caas/MANIFEST.in +11 -0
- modules/caas/README.md +158 -0
- modules/caas/benchmarks/README.md +255 -0
- modules/caas/benchmarks/create_hf_dataset.py +502 -0
- modules/caas/benchmarks/data/sample_corpus/README.md +86 -0
- modules/caas/benchmarks/data/sample_corpus/auth_module.py +211 -0
- modules/caas/benchmarks/data/sample_corpus/contribution_guide.md +185 -0
- modules/caas/benchmarks/data/sample_corpus/remote_work_policy.html +57 -0
- modules/caas/benchmarks/hf_dataset/README.md +214 -0
- modules/caas/benchmarks/hf_dataset/caas_benchmark_corpus.py +73 -0
- modules/caas/benchmarks/hf_dataset/corpus_preview.json +193 -0
- modules/caas/benchmarks/results/README.md +66 -0
- modules/caas/benchmarks/results/evaluation_2026-01-20.json +121 -0
- modules/caas/benchmarks/run_evaluation.py +561 -0
- modules/caas/benchmarks/statistical_tests.py +289 -0
- modules/caas/benchmarks/verify_sample_corpus.py +83 -0
- modules/caas/docker-compose.yml +38 -0
- modules/caas/docs/CONTEXT_TRIAD.md +462 -0
- modules/caas/docs/CONTRIBUTING.md +346 -0
- modules/caas/docs/ETHICS_AND_LIMITATIONS.md +336 -0
- modules/caas/docs/HEURISTIC_ROUTER.md +442 -0
- modules/caas/docs/IMPLEMENTATION_SUMMARY.md +363 -0
- modules/caas/docs/IMPLEMENTATION_SUMMARY_CONTEXT_TRIAD.md +277 -0
- modules/caas/docs/IMPLEMENTATION_SUMMARY_HEURISTIC_ROUTER.md +231 -0
- modules/caas/docs/IMPLEMENTATION_SUMMARY_METADATA_INJECTION.md +258 -0
- modules/caas/docs/IMPLEMENTATION_SUMMARY_PRAGMATIC_TRUTH.md +212 -0
- modules/caas/docs/IMPLEMENTATION_SUMMARY_TRUST_GATEWAY.md +319 -0
- modules/caas/docs/LAYER_1_PRIMITIVE.md +202 -0
- modules/caas/docs/METADATA_INJECTION.md +404 -0
- modules/caas/docs/PRAGMATIC_TRUTH.md +431 -0
- modules/caas/docs/RELATED_WORK.md +312 -0
- modules/caas/docs/RELEASE_CHECKLIST.md +219 -0
- modules/caas/docs/RELEASE_GUIDE.md +285 -0
- modules/caas/docs/REPRODUCIBILITY.md +386 -0
- modules/caas/docs/SLIDING_WINDOW.md +387 -0
- modules/caas/docs/STRUCTURE_AWARE_INDEXING.md +158 -0
- modules/caas/docs/TESTING.md +259 -0
- modules/caas/docs/THREAT_MODEL.md +247 -0
- modules/caas/docs/TRUST_GATEWAY.md +575 -0
- modules/caas/docs/VFS.md +298 -0
- modules/caas/examples/agents/enterprise_security_agent.py +414 -0
- modules/caas/examples/agents/intelligent_document_analyzer.py +380 -0
- modules/caas/examples/demos/demo.py +309 -0
- modules/caas/examples/demos/demo_context_triad.py +225 -0
- modules/caas/examples/demos/demo_conversation_manager.py +285 -0
- modules/caas/examples/demos/demo_heuristic_router.py +133 -0
- modules/caas/examples/demos/demo_metadata_injection.py +198 -0
- modules/caas/examples/demos/demo_pragmatic_truth.py +303 -0
- modules/caas/examples/demos/demo_structure_aware.py +140 -0
- modules/caas/examples/demos/demo_time_decay.py +247 -0
- modules/caas/examples/demos/demo_trust_gateway.py +383 -0
- modules/caas/examples/multi_agent/README.md +159 -0
- modules/caas/examples/multi_agent/research_team.py +369 -0
- modules/caas/examples/multi_agent/vfs_collaboration.py +393 -0
- modules/caas/examples/usage/auth_module.py +142 -0
- modules/caas/examples/usage/usage_example.py +173 -0
- modules/caas/experiments/README.md +42 -0
- modules/caas/experiments/reproduce_results.py +462 -0
- modules/caas/paper/ARXIV_METADATA.md +145 -0
- modules/caas/paper/ARXIV_README.md +47 -0
- modules/caas/paper/CHECKLIST.md +103 -0
- modules/caas/paper/GITHUB_RELEASE_NOTES.md +105 -0
- modules/caas/paper/README.md +71 -0
- modules/caas/paper/abstract.md +24 -0
- modules/caas/paper/arxiv_submission.tar +0 -0
- modules/caas/paper/arxiv_submission.zip +0 -0
- modules/caas/paper/build_pdf.py +355 -0
- modules/caas/paper/experiments.md +149 -0
- modules/caas/paper/figures/.gitkeep +0 -0
- modules/caas/paper/figures/README.md +237 -0
- modules/caas/paper/figures/fig1_system_architecture.png +0 -0
- modules/caas/paper/figures/fig1_system_architecture.svg +198 -0
- modules/caas/paper/figures/fig2_context_triad.png +0 -0
- modules/caas/paper/figures/fig2_context_triad.svg +105 -0
- modules/caas/paper/figures/fig3_ablation_results.png +0 -0
- modules/caas/paper/figures/fig3_ablation_results.svg +113 -0
- modules/caas/paper/figures/fig4_routing_latency.png +0 -0
- modules/caas/paper/figures/fig4_routing_latency.svg +97 -0
- modules/caas/paper/intro.md +103 -0
- modules/caas/paper/latex/figures/fig1_system_architecture.png +0 -0
- modules/caas/paper/latex/figures/fig2_context_triad.png +0 -0
- modules/caas/paper/latex/figures/fig3_ablation_results.png +0 -0
- modules/caas/paper/latex/figures/fig4_routing_latency.png +0 -0
- modules/caas/paper/latex/main.tex +468 -0
- modules/caas/paper/latex/references.bib +140 -0
- modules/caas/paper/method.md +350 -0
- modules/caas/paper/outline.md +123 -0
- modules/caas/paper/related_work.md +101 -0
- modules/caas/paper/tables/.gitkeep +0 -0
- modules/caas/paper/tables/results_tables.md +50 -0
- modules/caas/pyproject.toml +172 -0
- modules/caas/requirements.txt +11 -0
- modules/caas/src/caas/__init__.py +232 -0
- modules/caas/src/caas/api/__init__.py +7 -0
- modules/caas/src/caas/api/server.py +1326 -0
- modules/caas/src/caas/caching.py +832 -0
- modules/caas/src/caas/cli.py +208 -0
- modules/caas/src/caas/conversation.py +221 -0
- modules/caas/src/caas/decay.py +118 -0
- modules/caas/src/caas/detection/__init__.py +7 -0
- modules/caas/src/caas/detection/detector.py +236 -0
- modules/caas/src/caas/enrichment.py +127 -0
- modules/caas/src/caas/gateway/__init__.py +24 -0
- modules/caas/src/caas/gateway/trust_gateway.py +471 -0
- modules/caas/src/caas/hf_utils.py +477 -0
- modules/caas/src/caas/ingestion/__init__.py +21 -0
- modules/caas/src/caas/ingestion/processors.py +251 -0
- modules/caas/src/caas/ingestion/structure_parser.py +185 -0
- modules/caas/src/caas/models.py +354 -0
- modules/caas/src/caas/pragmatic_truth.py +441 -0
- modules/caas/src/caas/routing/__init__.py +8 -0
- modules/caas/src/caas/routing/heuristic_router.py +242 -0
- modules/caas/src/caas/storage/__init__.py +7 -0
- modules/caas/src/caas/storage/store.py +450 -0
- modules/caas/src/caas/triad.py +472 -0
- modules/caas/src/caas/tuning/__init__.py +7 -0
- modules/caas/src/caas/tuning/tuner.py +322 -0
- modules/caas/src/caas/vfs/__init__.py +12 -0
- modules/caas/src/caas/vfs/filesystem.py +450 -0
- modules/caas/tests/__init__.py +3 -0
- modules/caas/tests/conftest.py +8 -0
- modules/caas/tests/test_caching.py +628 -0
- modules/caas/tests/test_context_triad.py +385 -0
- modules/caas/tests/test_conversation_manager.py +289 -0
- modules/caas/tests/test_functionality.py +215 -0
- modules/caas/tests/test_heuristic_router.py +370 -0
- modules/caas/tests/test_metadata_injection.py +328 -0
- modules/caas/tests/test_pragmatic_truth.py +322 -0
- modules/caas/tests/test_structure_aware_indexing.py +283 -0
- modules/caas/tests/test_time_decay.py +268 -0
- modules/caas/tests/test_trust_gateway.py +445 -0
- modules/caas/tests/test_vfs.py +298 -0
- modules/cmvk/.github/FUNDING.yml +9 -0
- modules/cmvk/.github/dependabot.yml +54 -0
- modules/cmvk/.github/workflows/ci.yml +205 -0
- modules/cmvk/.github/workflows/publish.yml +143 -0
- modules/cmvk/.gitignore +147 -0
- modules/cmvk/.pre-commit-config.yaml +58 -0
- modules/cmvk/CHANGELOG.md +146 -0
- modules/cmvk/CITATION.cff +48 -0
- modules/cmvk/CONTRIBUTING.md +229 -0
- modules/cmvk/Dockerfile +87 -0
- modules/cmvk/HF_MODEL_CARD.md +185 -0
- modules/cmvk/LICENSE +21 -0
- modules/cmvk/README.md +149 -0
- modules/cmvk/SECURITY.md +114 -0
- modules/cmvk/config/prompts/generator_v1.txt +23 -0
- modules/cmvk/config/prompts/verifier_hostile.txt +32 -0
- modules/cmvk/config/settings.yaml +40 -0
- modules/cmvk/coverage_html/.gitignore +2 -0
- modules/cmvk/coverage_html/class_index.html +658 -0
- modules/cmvk/coverage_html/coverage_html_cb_188fc9a4.js +735 -0
- modules/cmvk/coverage_html/favicon_32_cb_c827f16f.png +0 -0
- modules/cmvk/coverage_html/function_index.html +1978 -0
- modules/cmvk/coverage_html/index.html +255 -0
- modules/cmvk/coverage_html/keybd_closed_cb_900cfef5.png +0 -0
- modules/cmvk/coverage_html/status.json +1 -0
- modules/cmvk/coverage_html/style_cb_5c747636.css +389 -0
- modules/cmvk/coverage_html/z_2c49bd2ed3e01e38___init___py.html +315 -0
- modules/cmvk/coverage_html/z_2c49bd2ed3e01e38_audit_py.html +499 -0
- modules/cmvk/coverage_html/z_2c49bd2ed3e01e38_benchmarks_py.html +575 -0
- modules/cmvk/coverage_html/z_2c49bd2ed3e01e38_constitutional_py.html +1001 -0
- modules/cmvk/coverage_html/z_2c49bd2ed3e01e38_hf_utils_py.html +398 -0
- modules/cmvk/coverage_html/z_2c49bd2ed3e01e38_metrics_py.html +570 -0
- modules/cmvk/coverage_html/z_2c49bd2ed3e01e38_profiles_py.html +397 -0
- modules/cmvk/coverage_html/z_2c49bd2ed3e01e38_types_py.html +109 -0
- modules/cmvk/coverage_html/z_2c49bd2ed3e01e38_verification_py.html +1053 -0
- modules/cmvk/docs/DIAGRAMS.md +325 -0
- modules/cmvk/docs/architecture.md +345 -0
- modules/cmvk/docs/features.md +308 -0
- modules/cmvk/docs/getting_started.md +279 -0
- modules/cmvk/docs/innovation_layer.md +377 -0
- modules/cmvk/docs/safety.md +281 -0
- modules/cmvk/docs/traceability.md +150 -0
- modules/cmvk/examples/basic_example.py +62 -0
- modules/cmvk/examples/demo_complete_pipeline.py +209 -0
- modules/cmvk/examples/demo_innovation_layer.py +197 -0
- modules/cmvk/examples/example.py +112 -0
- modules/cmvk/examples/model_diversity_comparison.py +110 -0
- modules/cmvk/examples/real_api_integration.py +121 -0
- modules/cmvk/examples/test_full_pipeline.py +303 -0
- modules/cmvk/experiments/FEATURE_2_LATERAL_THINKING.md +187 -0
- modules/cmvk/experiments/README.md +216 -0
- modules/cmvk/experiments/ablation_runner.py +666 -0
- modules/cmvk/experiments/baseline_runner.py +158 -0
- modules/cmvk/experiments/blind_spot_benchmark.py +364 -0
- modules/cmvk/experiments/datasets/README.md +85 -0
- modules/cmvk/experiments/datasets/humaneval_50.json +352 -0
- modules/cmvk/experiments/datasets/humaneval_full.json +1150 -0
- modules/cmvk/experiments/datasets/humaneval_sample.json +32 -0
- modules/cmvk/experiments/datasets/sabotage.json +262 -0
- modules/cmvk/experiments/datasets/sample.json +40 -0
- modules/cmvk/experiments/demo_with_traces.py +110 -0
- modules/cmvk/experiments/efficiency_curve.py +259 -0
- modules/cmvk/experiments/experiment_runner.py +243 -0
- modules/cmvk/experiments/paper_data_generator.py +183 -0
- modules/cmvk/experiments/reproduce_results.py +407 -0
- modules/cmvk/experiments/reproducible_runner.py +352 -0
- modules/cmvk/experiments/sabotage_stress_test.py +311 -0
- modules/cmvk/experiments/test_lateral_thinking.py +116 -0
- modules/cmvk/experiments/test_prosecutor.py +41 -0
- modules/cmvk/experiments/visualize_results.py +735 -0
- modules/cmvk/logs/traces/demo_HumanEval_0_20260121-204900.json +36 -0
- modules/cmvk/notebooks/analysis.ipynb +124 -0
- modules/cmvk/paper/PAPER.md +561 -0
- modules/cmvk/paper/arxiv_checklist.md +230 -0
- modules/cmvk/paper/cmvk_neurips.aux +77 -0
- modules/cmvk/paper/cmvk_neurips.bbl +81 -0
- modules/cmvk/paper/cmvk_neurips.blg +48 -0
- modules/cmvk/paper/cmvk_neurips.out +16 -0
- modules/cmvk/paper/cmvk_neurips.pdf +0 -0
- modules/cmvk/paper/cmvk_neurips.tex +309 -0
- modules/cmvk/paper/figures/ablation.png +0 -0
- modules/cmvk/paper/figures/ablation.svg +39 -0
- modules/cmvk/paper/figures/architecture.png +0 -0
- modules/cmvk/paper/figures/architecture.svg +115 -0
- modules/cmvk/paper/figures/results_bar.png +0 -0
- modules/cmvk/paper/figures/results_bar.svg +70 -0
- modules/cmvk/paper/generate_figures.py +383 -0
- modules/cmvk/paper/neurips_2024.sty +101 -0
- modules/cmvk/paper/references.bib +98 -0
- modules/cmvk/paper/structure.tex +200 -0
- modules/cmvk/pyproject.toml +189 -0
- modules/cmvk/requirements-dev.txt +19 -0
- modules/cmvk/requirements.txt +14 -0
- modules/cmvk/src/cmvk/__init__.py +216 -0
- modules/cmvk/src/cmvk/audit.py +400 -0
- modules/cmvk/src/cmvk/benchmarks.py +476 -0
- modules/cmvk/src/cmvk/constitutional.py +902 -0
- modules/cmvk/src/cmvk/hf_utils.py +299 -0
- modules/cmvk/src/cmvk/metrics.py +471 -0
- modules/cmvk/src/cmvk/profiles.py +298 -0
- modules/cmvk/src/cmvk/py.typed +0 -0
- modules/cmvk/src/cmvk/types.py +10 -0
- modules/cmvk/src/cmvk/verification.py +954 -0
- modules/cmvk/src/cross_model_verification_kernel/__init__.py +91 -0
- modules/cmvk/src/cross_model_verification_kernel/__main__.py +10 -0
- modules/cmvk/src/cross_model_verification_kernel/agents/__init__.py +16 -0
- modules/cmvk/src/cross_model_verification_kernel/agents/base_agent.py +142 -0
- modules/cmvk/src/cross_model_verification_kernel/agents/generator_openai.py +223 -0
- modules/cmvk/src/cross_model_verification_kernel/agents/verifier_anthropic.py +448 -0
- modules/cmvk/src/cross_model_verification_kernel/agents/verifier_gemini.py +481 -0
- modules/cmvk/src/cross_model_verification_kernel/cli.py +570 -0
- modules/cmvk/src/cross_model_verification_kernel/core/__init__.py +26 -0
- modules/cmvk/src/cross_model_verification_kernel/core/graph_memory.py +308 -0
- modules/cmvk/src/cross_model_verification_kernel/core/kernel.py +413 -0
- modules/cmvk/src/cross_model_verification_kernel/core/trace_logger.py +75 -0
- modules/cmvk/src/cross_model_verification_kernel/core/types.py +121 -0
- modules/cmvk/src/cross_model_verification_kernel/datasets/__init__.py +20 -0
- modules/cmvk/src/cross_model_verification_kernel/datasets/humaneval_loader.py +271 -0
- modules/cmvk/src/cross_model_verification_kernel/generator.py +118 -0
- modules/cmvk/src/cross_model_verification_kernel/kernel.py +292 -0
- modules/cmvk/src/cross_model_verification_kernel/models.py +111 -0
- modules/cmvk/src/cross_model_verification_kernel/py.typed +1 -0
- modules/cmvk/src/cross_model_verification_kernel/simple_kernel.py +185 -0
- modules/cmvk/src/cross_model_verification_kernel/tools/__init__.py +94 -0
- modules/cmvk/src/cross_model_verification_kernel/tools/huggingface_upload.py +394 -0
- modules/cmvk/src/cross_model_verification_kernel/tools/sandbox.py +159 -0
- modules/cmvk/src/cross_model_verification_kernel/tools/statistics.py +468 -0
- modules/cmvk/src/cross_model_verification_kernel/tools/visualizer.py +312 -0
- modules/cmvk/src/cross_model_verification_kernel/tools/web_search.py +86 -0
- modules/cmvk/src/cross_model_verification_kernel/verifier.py +257 -0
- modules/cmvk/tests/__init__.py +3 -0
- modules/cmvk/tests/conftest.py +61 -0
- modules/cmvk/tests/integration/__init__.py +1 -0
- modules/cmvk/tests/integration/test_anthropic_verifier.py +269 -0
- modules/cmvk/tests/integration/test_integration.py +53 -0
- modules/cmvk/tests/integration/test_lateral_thinking_integration.py +199 -0
- modules/cmvk/tests/integration/test_lateral_thinking_witness.py +208 -0
- modules/cmvk/tests/integration/test_prosecutor_mode.py +131 -0
- modules/cmvk/tests/test_constitutional.py +611 -0
- modules/cmvk/tests/test_enhanced_features.py +603 -0
- modules/cmvk/tests/test_verification.py +255 -0
- modules/cmvk/tests/unit/__init__.py +1 -0
- modules/cmvk/tests/unit/test_agents.py +64 -0
- modules/cmvk/tests/unit/test_cli.py +224 -0
- modules/cmvk/tests/unit/test_core.py +126 -0
- modules/cmvk/tests/unit/test_humaneval_loader.py +197 -0
- modules/cmvk/tests/unit/test_kernel.py +255 -0
- modules/cmvk/tests/unit/test_reproducibility.py +160 -0
- modules/cmvk/tests/unit/test_trace_logger.py +115 -0
- modules/cmvk/tests/unit/test_visualizer.py +218 -0
- modules/control-plane/.github/ISSUE_TEMPLATE/bug_report.yml +82 -0
- modules/control-plane/.github/ISSUE_TEMPLATE/config.yml +11 -0
- modules/control-plane/.github/ISSUE_TEMPLATE/feature_request.yml +104 -0
- modules/control-plane/.github/ISSUE_TEMPLATE/question.yml +70 -0
- modules/control-plane/.github/ISSUE_TEMPLATE/security_vulnerability.yml +84 -0
- modules/control-plane/.github/discussions.yml +73 -0
- modules/control-plane/.github/pull_request_template.md +82 -0
- modules/control-plane/.github/workflows/publish.yml +146 -0
- modules/control-plane/.github/workflows/release.yml +39 -0
- modules/control-plane/.github/workflows/tests.yml +58 -0
- modules/control-plane/.gitignore +55 -0
- modules/control-plane/CHANGELOG.md +203 -0
- modules/control-plane/CONTRIBUTING.md +311 -0
- modules/control-plane/CONTRIBUTORS.md +88 -0
- modules/control-plane/Dockerfile +82 -0
- modules/control-plane/LICENSE +21 -0
- modules/control-plane/MANIFEST.in +17 -0
- modules/control-plane/README.md +1264 -0
- modules/control-plane/ROADMAP.md +228 -0
- modules/control-plane/SECURITY.md +210 -0
- modules/control-plane/SUPPORT.md +106 -0
- modules/control-plane/acp-cli.py +212 -0
- modules/control-plane/benchmark/README.md +257 -0
- modules/control-plane/benchmark/__init__.py +19 -0
- modules/control-plane/benchmark/red_team_dataset.py +517 -0
- modules/control-plane/benchmark.py +563 -0
- modules/control-plane/build_and_publish.sh +130 -0
- modules/control-plane/docker-compose.yml +74 -0
- modules/control-plane/docs/ABLATION_STUDIES.md +528 -0
- modules/control-plane/docs/ADAPTER_GUIDE.md +544 -0
- modules/control-plane/docs/ADVANCED_FEATURES.md +543 -0
- modules/control-plane/docs/AIOS_COMPARISON.md +296 -0
- modules/control-plane/docs/BIBLIOGRAPHY.md +367 -0
- modules/control-plane/docs/CASE_STUDIES.md +645 -0
- modules/control-plane/docs/DOCKER_DEPLOYMENT.md +184 -0
- modules/control-plane/docs/ECOSYSTEM_STATUS.md +98 -0
- modules/control-plane/docs/HF_MODEL_CARD.md +168 -0
- modules/control-plane/docs/KERNEL_V1_RELEASE.md +454 -0
- modules/control-plane/docs/LAYER3_FRAMEWORK.md +227 -0
- modules/control-plane/docs/LIMITATIONS.md +523 -0
- modules/control-plane/docs/PYPI_PUBLISHING.md +195 -0
- modules/control-plane/docs/README.md +58 -0
- modules/control-plane/docs/RELATED_WORK.md +319 -0
- modules/control-plane/docs/RELEASE_v1.1.0.md +252 -0
- modules/control-plane/docs/REPRODUCIBILITY.md +540 -0
- modules/control-plane/docs/RESEARCH_FOUNDATION.md +197 -0
- modules/control-plane/docs/api/CORE.md +270 -0
- modules/control-plane/docs/architecture/architecture.md +120 -0
- modules/control-plane/docs/community/ANNOUNCEMENT_TEMPLATES.md +52 -0
- modules/control-plane/docs/guides/IMPLEMENTATION.md +225 -0
- modules/control-plane/docs/guides/PHILOSOPHY.md +354 -0
- modules/control-plane/docs/guides/QUICKSTART.md +217 -0
- modules/control-plane/examples/README.md +138 -0
- modules/control-plane/examples/a2a_demo.py +410 -0
- modules/control-plane/examples/adapter_demo.py +347 -0
- modules/control-plane/examples/advanced_features.py +403 -0
- modules/control-plane/examples/basic_usage.py +261 -0
- modules/control-plane/examples/benchmark_demo.py +186 -0
- modules/control-plane/examples/compliance_demo.py +333 -0
- modules/control-plane/examples/configuration.py +265 -0
- modules/control-plane/examples/getting_started.py +178 -0
- modules/control-plane/examples/hibernation_and_time_travel_demo.py +406 -0
- modules/control-plane/examples/interactive_tutorial.ipynb +497 -0
- modules/control-plane/examples/kernel_interceptor_demo.py +202 -0
- modules/control-plane/examples/kernel_v1_demo.py +273 -0
- modules/control-plane/examples/langchain_demo.py +281 -0
- modules/control-plane/examples/lifecycle_demo.py +724 -0
- modules/control-plane/examples/mcp_demo.py +378 -0
- modules/control-plane/examples/ml_safety_demo.py +157 -0
- modules/control-plane/examples/multimodal_demo.py +347 -0
- modules/control-plane/examples/observability_demo.py +370 -0
- modules/control-plane/examples/use_cases.py +336 -0
- modules/control-plane/experiments/long_horizon_purge.py +235 -0
- modules/control-plane/experiments/multi_agent_rag.py +165 -0
- modules/control-plane/experiments/reproduce_results.py +667 -0
- modules/control-plane/paper/ARXIV_SUBMISSION_INFO.txt +122 -0
- modules/control-plane/paper/ETHICS_STATEMENT.md +248 -0
- modules/control-plane/paper/PAPER_CHECKLIST.md +72 -0
- modules/control-plane/paper/Paper.pdf +0 -0
- modules/control-plane/paper/README.md +71 -0
- modules/control-plane/paper/appendix.md +152 -0
- modules/control-plane/paper/architecture.md +15 -0
- modules/control-plane/paper/arxiv/figures/ablation_chart.png +0 -0
- modules/control-plane/paper/arxiv/figures/architecture.png +0 -0
- modules/control-plane/paper/arxiv/figures/constraint_graphs.png +0 -0
- modules/control-plane/paper/arxiv/figures/results_chart.png +0 -0
- modules/control-plane/paper/arxiv/main.aux +97 -0
- modules/control-plane/paper/arxiv/main.bbl +112 -0
- modules/control-plane/paper/arxiv/main.blg +48 -0
- modules/control-plane/paper/arxiv/main.out +33 -0
- modules/control-plane/paper/arxiv/main.pdf +0 -0
- modules/control-plane/paper/arxiv/main.tex +479 -0
- modules/control-plane/paper/arxiv/references.bib +234 -0
- modules/control-plane/paper/arxiv_submission.tar +0 -0
- modules/control-plane/paper/arxiv_submission.zip +0 -0
- modules/control-plane/paper/build.sh +68 -0
- modules/control-plane/paper/figures/README.md +47 -0
- modules/control-plane/paper/figures/ablation_chart.pdf +0 -0
- modules/control-plane/paper/figures/ablation_chart.png +0 -0
- modules/control-plane/paper/figures/architecture.pdf +0 -0
- modules/control-plane/paper/figures/architecture.png +0 -0
- modules/control-plane/paper/figures/constraint_graphs.pdf +0 -0
- modules/control-plane/paper/figures/constraint_graphs.png +0 -0
- modules/control-plane/paper/figures/generate_figures.py +252 -0
- modules/control-plane/paper/figures/results_chart.pdf +0 -0
- modules/control-plane/paper/figures/results_chart.png +0 -0
- modules/control-plane/paper/main.md +273 -0
- modules/control-plane/paper/main.tex +214 -0
- modules/control-plane/paper/main_arxiv.aux +53 -0
- modules/control-plane/paper/main_arxiv.out +17 -0
- modules/control-plane/paper/main_arxiv.pdf +0 -0
- modules/control-plane/paper/main_arxiv.tex +264 -0
- modules/control-plane/paper/references.bib +234 -0
- modules/control-plane/pyproject.toml +124 -0
- modules/control-plane/reproducibility/ABLATIONS.md +136 -0
- modules/control-plane/reproducibility/README.md +288 -0
- modules/control-plane/reproducibility/commands.md +467 -0
- modules/control-plane/reproducibility/docker_config/Dockerfile +39 -0
- modules/control-plane/reproducibility/experiment_configs/purge_config.json +46 -0
- modules/control-plane/reproducibility/experiment_configs/rag_config.json +36 -0
- modules/control-plane/reproducibility/hardware_specs.md +317 -0
- modules/control-plane/reproducibility/requirements_frozen.txt +0 -0
- modules/control-plane/reproducibility/run_all_experiments.sh +45 -0
- modules/control-plane/reproducibility/seeds.json +106 -0
- modules/control-plane/scripts/prepare_pypi.py +46 -0
- modules/control-plane/scripts/prepare_release.py +176 -0
- modules/control-plane/scripts/upload_dataset_to_hf.py +316 -0
- modules/control-plane/setup.py +69 -0
- modules/control-plane/src/agent_control_plane/__init__.py +639 -0
- modules/control-plane/src/agent_control_plane/a2a_adapter.py +541 -0
- modules/control-plane/src/agent_control_plane/adapter.py +415 -0
- modules/control-plane/src/agent_control_plane/agent_hibernation.py +364 -0
- modules/control-plane/src/agent_control_plane/agent_kernel.py +464 -0
- modules/control-plane/src/agent_control_plane/compliance.py +718 -0
- modules/control-plane/src/agent_control_plane/constraint_graphs.py +475 -0
- modules/control-plane/src/agent_control_plane/control_plane.py +848 -0
- modules/control-plane/src/agent_control_plane/example_executors.py +193 -0
- modules/control-plane/src/agent_control_plane/execution_engine.py +229 -0
- modules/control-plane/src/agent_control_plane/flight_recorder.py +600 -0
- modules/control-plane/src/agent_control_plane/governance_layer.py +432 -0
- modules/control-plane/src/agent_control_plane/hf_utils.py +561 -0
- modules/control-plane/src/agent_control_plane/interfaces/__init__.py +53 -0
- modules/control-plane/src/agent_control_plane/interfaces/kernel_interface.py +359 -0
- modules/control-plane/src/agent_control_plane/interfaces/plugin_interface.py +495 -0
- modules/control-plane/src/agent_control_plane/interfaces/protocol_interfaces.py +385 -0
- modules/control-plane/src/agent_control_plane/kernel_space.py +707 -0
- modules/control-plane/src/agent_control_plane/langchain_adapter.py +422 -0
- modules/control-plane/src/agent_control_plane/lifecycle.py +3111 -0
- modules/control-plane/src/agent_control_plane/mcp_adapter.py +517 -0
- modules/control-plane/src/agent_control_plane/ml_safety.py +560 -0
- modules/control-plane/src/agent_control_plane/multimodal.py +724 -0
- modules/control-plane/src/agent_control_plane/mute_agent.py +419 -0
- modules/control-plane/src/agent_control_plane/observability.py +785 -0
- modules/control-plane/src/agent_control_plane/orchestrator.py +480 -0
- modules/control-plane/src/agent_control_plane/plugin_registry.py +748 -0
- modules/control-plane/src/agent_control_plane/policy_engine.py +525 -0
- modules/control-plane/src/agent_control_plane/shadow_mode.py +307 -0
- modules/control-plane/src/agent_control_plane/signals.py +491 -0
- modules/control-plane/src/agent_control_plane/supervisor_agents.py +427 -0
- modules/control-plane/src/agent_control_plane/time_travel_debugger.py +554 -0
- modules/control-plane/src/agent_control_plane/tool_registry.py +350 -0
- modules/control-plane/src/agent_control_plane/vfs.py +695 -0
- modules/control-plane/tests/README.md +33 -0
- modules/control-plane/tests/test_a2a_adapter.py +336 -0
- modules/control-plane/tests/test_adapter.py +422 -0
- modules/control-plane/tests/test_advanced_features.py +389 -0
- modules/control-plane/tests/test_benchmark.py +223 -0
- modules/control-plane/tests/test_compliance.py +214 -0
- modules/control-plane/tests/test_control_plane.py +295 -0
- modules/control-plane/tests/test_hibernation.py +274 -0
- modules/control-plane/tests/test_kernel_interception.py +284 -0
- modules/control-plane/tests/test_langchain_adapter.py +258 -0
- modules/control-plane/tests/test_lifecycle.py +1174 -0
- modules/control-plane/tests/test_mcp_adapter.py +293 -0
- modules/control-plane/tests/test_ml_safety.py +142 -0
- modules/control-plane/tests/test_multimodal.py +317 -0
- modules/control-plane/tests/test_new_features.py +435 -0
- modules/control-plane/tests/test_observability.py +338 -0
- modules/control-plane/tests/test_time_travel.py +387 -0
- modules/emk/.github/workflows/ci.yml +105 -0
- modules/emk/.github/workflows/publish.yml +144 -0
- modules/emk/.gitignore +74 -0
- modules/emk/CHANGELOG.md +41 -0
- modules/emk/CONTRIBUTING.md +295 -0
- modules/emk/IMPLEMENTATION.md +174 -0
- modules/emk/LICENSE +21 -0
- modules/emk/MANIFEST.in +8 -0
- modules/emk/README.md +135 -0
- modules/emk/RELEASE_NOTES.md +82 -0
- modules/emk/SECURITY.md +52 -0
- modules/emk/codecov.yml +39 -0
- modules/emk/docs/MEMORY_MANAGEMENT.md +285 -0
- modules/emk/emk/__init__.py +106 -0
- modules/emk/emk/hf_utils.py +419 -0
- modules/emk/emk/indexer.py +144 -0
- modules/emk/emk/py.typed +0 -0
- modules/emk/emk/schema.py +204 -0
- modules/emk/emk/sleep_cycle.py +345 -0
- modules/emk/emk/store.py +479 -0
- modules/emk/examples/basic_usage.py +123 -0
- modules/emk/examples/memory_features_demo.py +154 -0
- modules/emk/experiments/README.md +59 -0
- modules/emk/experiments/reproduce_results.py +461 -0
- modules/emk/experiments/results.json +61 -0
- modules/emk/paper/structure.tex +192 -0
- modules/emk/paper/whitepaper.md +273 -0
- modules/emk/pyproject.toml +91 -0
- modules/emk/setup.py +5 -0
- modules/emk/tests/test_file_adapter.py +195 -0
- modules/emk/tests/test_indexer.py +174 -0
- modules/emk/tests/test_init.py +55 -0
- modules/emk/tests/test_negative_memory.py +83 -0
- modules/emk/tests/test_schema.py +150 -0
- modules/emk/tests/test_semantic_rules.py +175 -0
- modules/emk/tests/test_sleep_cycle.py +335 -0
- modules/emk/tests/test_store_anti_patterns.py +239 -0
- modules/iatp/.github/workflows/docker-build.yml +124 -0
- modules/iatp/.github/workflows/publish.yml +174 -0
- modules/iatp/.github/workflows/python-package.yml +121 -0
- modules/iatp/.gitignore +67 -0
- modules/iatp/.pre-commit-config.yaml +64 -0
- modules/iatp/CHANGELOG.md +120 -0
- modules/iatp/Dockerfile +91 -0
- modules/iatp/IMPLEMENTATION_SUMMARY.md +218 -0
- modules/iatp/MANIFEST.in +9 -0
- modules/iatp/README.md +180 -0
- modules/iatp/docker/Dockerfile.agent +27 -0
- modules/iatp/docker/Dockerfile.sidecar-python +86 -0
- modules/iatp/docker/README.md +258 -0
- modules/iatp/docker-compose.yml +194 -0
- modules/iatp/docs/ARCHITECTURE.md +243 -0
- modules/iatp/docs/CLI_GUIDE.md +220 -0
- modules/iatp/docs/DEPLOYMENT.md +304 -0
- modules/iatp/examples/README.md +132 -0
- modules/iatp/examples/backend_agent.py +39 -0
- modules/iatp/examples/client.py +168 -0
- modules/iatp/examples/demo_attestation_reputation.py +274 -0
- modules/iatp/examples/demo_client.py +240 -0
- modules/iatp/examples/demo_rbac.py +143 -0
- modules/iatp/examples/integration_demo.py +245 -0
- modules/iatp/examples/manifests/coder_agent.json +20 -0
- modules/iatp/examples/manifests/reviewer_agent.json +19 -0
- modules/iatp/examples/manifests/secure_bank.json +14 -0
- modules/iatp/examples/manifests/standard_agent.json +14 -0
- modules/iatp/examples/manifests/untrusted_honeypot.json +14 -0
- modules/iatp/examples/run_secure_bank_sidecar.py +85 -0
- modules/iatp/examples/run_sidecar.py +105 -0
- modules/iatp/examples/run_untrusted_sidecar.py +77 -0
- modules/iatp/examples/secure_bank_agent.py +138 -0
- modules/iatp/examples/test_untrusted.py +82 -0
- modules/iatp/examples/untrusted_agent.py +119 -0
- modules/iatp/experiments/README.md +58 -0
- modules/iatp/experiments/cascading_hallucination/README.md +149 -0
- modules/iatp/experiments/cascading_hallucination/agent_a_user.py +41 -0
- modules/iatp/experiments/cascading_hallucination/agent_b_summarizer.py +54 -0
- modules/iatp/experiments/cascading_hallucination/agent_c_database.py +47 -0
- modules/iatp/experiments/cascading_hallucination/proof_of_concept.py +290 -0
- modules/iatp/experiments/cascading_hallucination/run_experiment.py +226 -0
- modules/iatp/experiments/cascading_hallucination/sidecar_c.py +61 -0
- modules/iatp/experiments/reproduce_results.py +574 -0
- modules/iatp/experiments/results.json +2336 -0
- modules/iatp/iatp/__init__.py +164 -0
- modules/iatp/iatp/attestation.py +401 -0
- modules/iatp/iatp/cli.py +253 -0
- modules/iatp/iatp/hf_utils.py +469 -0
- modules/iatp/iatp/ipc_pipes.py +578 -0
- modules/iatp/iatp/main.py +410 -0
- modules/iatp/iatp/models/__init__.py +445 -0
- modules/iatp/iatp/policy_engine.py +335 -0
- modules/iatp/iatp/py.typed +2 -0
- modules/iatp/iatp/recovery.py +319 -0
- modules/iatp/iatp/security/__init__.py +268 -0
- modules/iatp/iatp/sidecar/__init__.py +517 -0
- modules/iatp/iatp/telemetry/__init__.py +162 -0
- modules/iatp/iatp/tests/__init__.py +1 -0
- modules/iatp/iatp/tests/test_attestation.py +368 -0
- modules/iatp/iatp/tests/test_cli.py +129 -0
- modules/iatp/iatp/tests/test_models.py +128 -0
- modules/iatp/iatp/tests/test_policy_engine.py +345 -0
- modules/iatp/iatp/tests/test_recovery.py +279 -0
- modules/iatp/iatp/tests/test_security.py +220 -0
- modules/iatp/iatp/tests/test_sidecar.py +165 -0
- modules/iatp/iatp/tests/test_telemetry.py +173 -0
- modules/iatp/paper/BLOG.md +307 -0
- modules/iatp/paper/PAPER.md +236 -0
- modules/iatp/paper/RFC_SUBMISSION.md +299 -0
- modules/iatp/paper/whitepaper.md +369 -0
- modules/iatp/proto/README.md +200 -0
- modules/iatp/proto/generate_stubs.py +81 -0
- modules/iatp/proto/iatp.proto +552 -0
- modules/iatp/pyproject.toml +180 -0
- modules/iatp/requirements-dev.txt +2 -0
- modules/iatp/requirements.txt +6 -0
- modules/iatp/setup.py +60 -0
- modules/iatp/sidecar/README.md +487 -0
- modules/iatp/sidecar/go/Dockerfile +32 -0
- modules/iatp/sidecar/go/README.md +237 -0
- modules/iatp/sidecar/go/go.mod +8 -0
- modules/iatp/sidecar/go/main.go +488 -0
- modules/iatp/spec/001-handshake.md +436 -0
- modules/iatp/spec/002-reversibility.md +394 -0
- modules/iatp/spec/schema/capability_manifest.json +266 -0
- modules/iatp/test_integration.py +310 -0
- modules/mcp-kernel-server/README.md +261 -0
- modules/mcp-kernel-server/pyproject.toml +60 -0
- modules/mcp-kernel-server/src/mcp_kernel_server/__init__.py +26 -0
- modules/mcp-kernel-server/src/mcp_kernel_server/cli.py +229 -0
- modules/mcp-kernel-server/src/mcp_kernel_server/resources.py +215 -0
- modules/mcp-kernel-server/src/mcp_kernel_server/server.py +562 -0
- modules/mcp-kernel-server/src/mcp_kernel_server/tools.py +1172 -0
- modules/mute-agent/.github/workflows/safety_check.yml +45 -0
- modules/mute-agent/.gitignore +53 -0
- modules/mute-agent/ARCHITECTURE.md +531 -0
- modules/mute-agent/BENCHMARK_GUIDE.md +384 -0
- modules/mute-agent/COMPLETION_SUMMARY.md +293 -0
- modules/mute-agent/EXPERIMENT_SUMMARY.md +318 -0
- modules/mute-agent/IMPLEMENTATION_SUMMARY.md +212 -0
- modules/mute-agent/LICENSE +21 -0
- modules/mute-agent/PHASE3_SUMMARY.md +297 -0
- modules/mute-agent/README.md +360 -0
- modules/mute-agent/STEEL_MAN_RESULTS.md +353 -0
- modules/mute-agent/USAGE.md +505 -0
- modules/mute-agent/V2_IMPLEMENTATION_SUMMARY.md +253 -0
- modules/mute-agent/V2_STEEL_MAN_IMPLEMENTATION.md +274 -0
- modules/mute-agent/VERIFICATION_REPORT.md +435 -0
- modules/mute-agent/charts/cost_comparison.png +0 -0
- modules/mute-agent/charts/cost_vs_ambiguity.png +0 -0
- modules/mute-agent/charts/metrics_comparison.png +0 -0
- modules/mute-agent/charts/scenario_breakdown.png +0 -0
- modules/mute-agent/charts/trace_attack_blocked.html +140 -0
- modules/mute-agent/charts/trace_attack_blocked.png +0 -0
- modules/mute-agent/charts/trace_failure.html +140 -0
- modules/mute-agent/charts/trace_failure.png +0 -0
- modules/mute-agent/charts/trace_success.html +140 -0
- modules/mute-agent/charts/trace_success.png +0 -0
- modules/mute-agent/examples/__init__.py +1 -0
- modules/mute-agent/examples/advanced_example.py +384 -0
- modules/mute-agent/examples/graph_debugger_demo.py +241 -0
- modules/mute-agent/examples/listener_example.py +297 -0
- modules/mute-agent/examples/simple_example.py +242 -0
- modules/mute-agent/examples/steel_man_demo.py +297 -0
- modules/mute-agent/experiments/README.md +135 -0
- modules/mute-agent/experiments/__init__.py +3 -0
- modules/mute-agent/experiments/agent_comparison.csv +6 -0
- modules/mute-agent/experiments/agent_comparison_50runs.csv +6 -0
- modules/mute-agent/experiments/ambiguity_test.py +335 -0
- modules/mute-agent/experiments/ambiguity_test_results.csv +31 -0
- modules/mute-agent/experiments/ambiguity_test_results_50runs.csv +51 -0
- modules/mute-agent/experiments/baseline_agent.py +189 -0
- modules/mute-agent/experiments/benchmark.py +402 -0
- modules/mute-agent/experiments/demo.py +172 -0
- modules/mute-agent/experiments/generate_cost_curve.py +474 -0
- modules/mute-agent/experiments/jailbreak_test.py +137 -0
- modules/mute-agent/experiments/latent_state_scenario.py +361 -0
- modules/mute-agent/experiments/mute_agent_experiment.py +349 -0
- modules/mute-agent/experiments/run_extended_experiment.py +40 -0
- modules/mute-agent/experiments/run_v2_experiments.py +266 -0
- modules/mute-agent/experiments/run_v2_experiments_auto.py +247 -0
- modules/mute-agent/experiments/v2_scenarios/README.md +214 -0
- modules/mute-agent/experiments/v2_scenarios/__init__.py +4 -0
- modules/mute-agent/experiments/v2_scenarios/scenario_1_deep_dependency.py +325 -0
- modules/mute-agent/experiments/v2_scenarios/scenario_2_adversarial.py +328 -0
- modules/mute-agent/experiments/v2_scenarios/scenario_3_false_positive.py +303 -0
- modules/mute-agent/experiments/v2_scenarios/scenario_4_performance.py +319 -0
- modules/mute-agent/experiments/visualize.py +400 -0
- modules/mute-agent/mute_agent/__init__.py +66 -0
- modules/mute-agent/mute_agent/core/__init__.py +1 -0
- modules/mute-agent/mute_agent/core/execution_agent.py +164 -0
- modules/mute-agent/mute_agent/core/handshake_protocol.py +199 -0
- modules/mute-agent/mute_agent/core/reasoning_agent.py +236 -0
- modules/mute-agent/mute_agent/knowledge_graph/__init__.py +1 -0
- modules/mute-agent/mute_agent/knowledge_graph/graph_elements.py +63 -0
- modules/mute-agent/mute_agent/knowledge_graph/multidimensional_graph.py +168 -0
- modules/mute-agent/mute_agent/knowledge_graph/subgraph.py +222 -0
- modules/mute-agent/mute_agent/listener/__init__.py +41 -0
- modules/mute-agent/mute_agent/listener/adapters/__init__.py +29 -0
- modules/mute-agent/mute_agent/listener/adapters/base_adapter.py +187 -0
- modules/mute-agent/mute_agent/listener/adapters/caas_adapter.py +342 -0
- modules/mute-agent/mute_agent/listener/adapters/control_plane_adapter.py +434 -0
- modules/mute-agent/mute_agent/listener/adapters/iatp_adapter.py +330 -0
- modules/mute-agent/mute_agent/listener/adapters/scak_adapter.py +249 -0
- modules/mute-agent/mute_agent/listener/listener.py +608 -0
- modules/mute-agent/mute_agent/listener/state_observer.py +434 -0
- modules/mute-agent/mute_agent/listener/threshold_config.py +311 -0
- modules/mute-agent/mute_agent/super_system/__init__.py +1 -0
- modules/mute-agent/mute_agent/super_system/router.py +202 -0
- modules/mute-agent/mute_agent/visualization/__init__.py +8 -0
- modules/mute-agent/mute_agent/visualization/graph_debugger.py +495 -0
- modules/mute-agent/requirements-dev.txt +6 -0
- modules/mute-agent/requirements.txt +9 -0
- modules/mute-agent/setup.py +64 -0
- modules/mute-agent/src/__init__.py +0 -0
- modules/mute-agent/src/agents/__init__.py +0 -0
- modules/mute-agent/src/agents/baseline_agent.py +524 -0
- modules/mute-agent/src/agents/interactive_agent.py +113 -0
- modules/mute-agent/src/agents/mute_agent.py +622 -0
- modules/mute-agent/src/benchmarks/__init__.py +0 -0
- modules/mute-agent/src/benchmarks/evaluator.py +481 -0
- modules/mute-agent/src/benchmarks/scenarios.json +985 -0
- modules/mute-agent/src/core/__init__.py +0 -0
- modules/mute-agent/src/core/mock_state.py +320 -0
- modules/mute-agent/src/core/tools.py +441 -0
- modules/nexus/__init__.py +49 -0
- modules/nexus/arbiter.py +357 -0
- modules/nexus/client.py +464 -0
- modules/nexus/dmz.py +417 -0
- modules/nexus/escrow.py +428 -0
- modules/nexus/exceptions.py +284 -0
- modules/nexus/registry.py +391 -0
- modules/nexus/reputation.py +423 -0
- modules/nexus/schemas/__init__.py +49 -0
- modules/nexus/schemas/compliance.py +274 -0
- modules/nexus/schemas/escrow.py +249 -0
- modules/nexus/schemas/manifest.py +223 -0
- modules/nexus/schemas/receipt.py +206 -0
- modules/observability/README.md +192 -0
- modules/observability/alertmanager/alertmanager.yml +116 -0
- modules/observability/alerts/agent-os-alerts.yaml +197 -0
- modules/observability/docker-compose.yml +128 -0
- modules/observability/grafana/dashboards/agent-os-amb.json +448 -0
- modules/observability/grafana/dashboards/agent-os-cmvk.json +441 -0
- modules/observability/grafana/dashboards/agent-os-overview.json +268 -0
- modules/observability/grafana/dashboards/agent-os-performance.json +15 -0
- modules/observability/grafana/dashboards/agent-os-safety.json +50 -0
- modules/observability/grafana/provisioning/dashboards/dashboards.yml +15 -0
- modules/observability/grafana/provisioning/datasources/datasources.yml +33 -0
- modules/observability/otel/otel-collector-config.yml +61 -0
- modules/observability/prometheus/prometheus.yml +63 -0
- modules/observability/pyproject.toml +53 -0
- modules/observability/scripts/export_dashboards.py +55 -0
- modules/observability/src/agent_os_observability/__init__.py +25 -0
- modules/observability/src/agent_os_observability/dashboards.py +896 -0
- modules/observability/src/agent_os_observability/metrics.py +396 -0
- modules/observability/src/agent_os_observability/server.py +221 -0
- modules/observability/src/agent_os_observability/tracer.py +226 -0
- modules/primitives/.gitignore +8 -0
- modules/primitives/README.md +62 -0
- modules/primitives/agent_primitives/__init__.py +22 -0
- modules/primitives/agent_primitives/failures.py +82 -0
- modules/primitives/agent_primitives/py.typed +0 -0
- modules/primitives/pyproject.toml +68 -0
- modules/scak/.github/copilot-instructions.md +396 -0
- modules/scak/.github/workflows/release.yml +117 -0
- modules/scak/.gitignore +32 -0
- modules/scak/CHANGELOG.md +173 -0
- modules/scak/CITATION.cff +62 -0
- modules/scak/CONTRIBUTING.md +429 -0
- modules/scak/Dockerfile +58 -0
- modules/scak/ENTERPRISE_FEATURES.md +518 -0
- modules/scak/IMPLEMENTATION_SUMMARY.md +206 -0
- modules/scak/LIMITATIONS.md +565 -0
- modules/scak/MANIFEST.in +16 -0
- modules/scak/NOVELTY.md +535 -0
- modules/scak/README.md +928 -0
- modules/scak/RESEARCH.md +670 -0
- modules/scak/agent_kernel/__init__.py +66 -0
- modules/scak/agent_kernel/analyzer.py +432 -0
- modules/scak/agent_kernel/auditor.py +31 -0
- modules/scak/agent_kernel/completeness_auditor.py +234 -0
- modules/scak/agent_kernel/detector.py +200 -0
- modules/scak/agent_kernel/kernel.py +741 -0
- modules/scak/agent_kernel/memory_manager.py +82 -0
- modules/scak/agent_kernel/models.py +372 -0
- modules/scak/agent_kernel/nudge_mechanism.py +260 -0
- modules/scak/agent_kernel/outcome_analyzer.py +335 -0
- modules/scak/agent_kernel/patcher.py +579 -0
- modules/scak/agent_kernel/semantic_analyzer.py +313 -0
- modules/scak/agent_kernel/semantic_purge.py +346 -0
- modules/scak/agent_kernel/simulator.py +447 -0
- modules/scak/agent_kernel/teacher.py +82 -0
- modules/scak/agent_kernel/triage.py +149 -0
- modules/scak/build_and_publish.ps1 +74 -0
- modules/scak/build_and_publish.sh +74 -0
- modules/scak/cli.py +471 -0
- modules/scak/dashboard.py +462 -0
- modules/scak/datasets/DATASET_CARD.md +219 -0
- modules/scak/datasets/README.md +143 -0
- modules/scak/datasets/gaia_vague_queries/vague_queries.json +262 -0
- modules/scak/datasets/hf_upload/README.md +219 -0
- modules/scak/datasets/hf_upload/scak_gaia_laziness.jsonl +50 -0
- modules/scak/datasets/prepare_hf_datasets.py +145 -0
- modules/scak/datasets/red_team/jailbreak_patterns.json +202 -0
- modules/scak/docker-compose.yml +99 -0
- modules/scak/docs/Adaptive-Memory-Hierarchy.md +319 -0
- modules/scak/docs/Data-Contracts-and-Schemas.md +285 -0
- modules/scak/docs/Dual-Loop-Architecture.md +344 -0
- modules/scak/docs/Enhanced-Features.md +612 -0
- modules/scak/docs/LANGCHAIN_INTEGRATION.md +572 -0
- modules/scak/docs/README.md +128 -0
- modules/scak/docs/Reference-Implementations.md +163 -0
- modules/scak/docs/SCAK_V2.md +374 -0
- modules/scak/docs/Three-Failure-Types.md +178 -0
- modules/scak/examples/basic_example.py +155 -0
- modules/scak/examples/circuit_breaker_lazy_eval_demo.py +243 -0
- modules/scak/examples/langchain_integration_example.py +339 -0
- modules/scak/examples/layer4_demo.py +243 -0
- modules/scak/examples/production_features_demo.py +353 -0
- modules/scak/examples/quick_demo.py +79 -0
- modules/scak/examples/scak_v2_demo.py +252 -0
- modules/scak/experiments/README.md +438 -0
- modules/scak/experiments/ablation_studies/README.md +192 -0
- modules/scak/experiments/ablation_studies/ablation_no_audit.py +116 -0
- modules/scak/experiments/ablation_studies/ablation_no_purge.py +133 -0
- modules/scak/experiments/chaos_engineering/README.md +332 -0
- modules/scak/experiments/context_efficiency_test.py +328 -0
- modules/scak/experiments/gaia_benchmark/README.md +208 -0
- modules/scak/experiments/laziness_benchmark.py +179 -0
- modules/scak/experiments/long_horizon_task_experiment.py +252 -0
- modules/scak/experiments/multi_agent_rag_experiment.py +284 -0
- modules/scak/experiments/results/ablation_table.md +12 -0
- modules/scak/experiments/results/long_horizon.json +36 -0
- modules/scak/experiments/results/multi_agent_rag.json +66 -0
- modules/scak/experiments/run_comprehensive_ablations.py +332 -0
- modules/scak/experiments/test_auditor_patcher_integration.py +251 -0
- modules/scak/notebooks/getting_started.ipynb +33 -0
- modules/scak/paper/ARXIV_SUBMISSION_METADATA.txt +109 -0
- modules/scak/paper/PAPER_CHECKLIST.md +304 -0
- modules/scak/paper/Paper.pdf +0 -0
- modules/scak/paper/README.md +113 -0
- modules/scak/paper/appendix.md +351 -0
- modules/scak/paper/arxiv/bibliography.bib +284 -0
- modules/scak/paper/arxiv/fig1_ooda_architecture.pdf +0 -0
- modules/scak/paper/arxiv/fig2_memory_hierarchy.pdf +0 -0
- modules/scak/paper/arxiv/fig3_gaia_results.pdf +0 -0
- modules/scak/paper/arxiv/fig4_ablation_heatmap.pdf +0 -0
- modules/scak/paper/arxiv/fig5_context_reduction.pdf +0 -0
- modules/scak/paper/arxiv/fig6_mttr_boxplot.pdf +0 -0
- modules/scak/paper/arxiv/main.aux +103 -0
- modules/scak/paper/arxiv/main.bbl +113 -0
- modules/scak/paper/arxiv/main.blg +55 -0
- modules/scak/paper/arxiv/main.out +31 -0
- modules/scak/paper/arxiv/main.pdf +0 -0
- modules/scak/paper/arxiv/main.tex +482 -0
- modules/scak/paper/arxiv_submission/bibliography.bib +284 -0
- modules/scak/paper/arxiv_submission/fig1_ooda_architecture.pdf +0 -0
- modules/scak/paper/arxiv_submission/fig2_memory_hierarchy.pdf +0 -0
- modules/scak/paper/arxiv_submission/fig3_gaia_results.pdf +0 -0
- modules/scak/paper/arxiv_submission/fig4_ablation_heatmap.pdf +0 -0
- modules/scak/paper/arxiv_submission/fig5_context_reduction.pdf +0 -0
- modules/scak/paper/arxiv_submission/fig6_mttr_boxplot.pdf +0 -0
- modules/scak/paper/arxiv_submission/main.aux +103 -0
- modules/scak/paper/arxiv_submission/main.bbl +113 -0
- modules/scak/paper/arxiv_submission/main.blg +55 -0
- modules/scak/paper/arxiv_submission/main.out +31 -0
- modules/scak/paper/arxiv_submission/main.pdf +0 -0
- modules/scak/paper/arxiv_submission/main.tex +482 -0
- modules/scak/paper/arxiv_submission.tar.gz +0 -0
- modules/scak/paper/bibliography.bib +284 -0
- modules/scak/paper/build.sh +55 -0
- modules/scak/paper/figures/README.md +32 -0
- modules/scak/paper/figures/fig1_ooda_architecture.md +75 -0
- modules/scak/paper/figures/fig1_ooda_architecture.pdf +0 -0
- modules/scak/paper/figures/fig1_ooda_architecture.png +0 -0
- modules/scak/paper/figures/fig2_memory_hierarchy.md +83 -0
- modules/scak/paper/figures/fig2_memory_hierarchy.pdf +0 -0
- modules/scak/paper/figures/fig2_memory_hierarchy.png +0 -0
- modules/scak/paper/figures/fig3_gaia_results.md +64 -0
- modules/scak/paper/figures/fig3_gaia_results.pdf +0 -0
- modules/scak/paper/figures/fig3_gaia_results.png +0 -0
- modules/scak/paper/figures/fig4_ablation_heatmap.md +64 -0
- modules/scak/paper/figures/fig4_ablation_heatmap.pdf +0 -0
- modules/scak/paper/figures/fig4_ablation_heatmap.png +0 -0
- modules/scak/paper/figures/fig5_context_reduction.md +71 -0
- modules/scak/paper/figures/fig5_context_reduction.pdf +0 -0
- modules/scak/paper/figures/fig5_context_reduction.png +0 -0
- modules/scak/paper/figures/fig6_mttr_boxplot.md +80 -0
- modules/scak/paper/figures/fig6_mttr_boxplot.pdf +0 -0
- modules/scak/paper/figures/fig6_mttr_boxplot.png +0 -0
- modules/scak/paper/figures/generate_figures.py +463 -0
- modules/scak/paper/main.aux +103 -0
- modules/scak/paper/main.bbl +113 -0
- modules/scak/paper/main.blg +55 -0
- modules/scak/paper/main.md +192 -0
- modules/scak/paper/main.out +31 -0
- modules/scak/paper/main.pdf +0 -0
- modules/scak/paper/main.tex +482 -0
- modules/scak/reproducibility/ABLATIONS.md +225 -0
- modules/scak/reproducibility/Dockerfile.reproducibility +34 -0
- modules/scak/reproducibility/README.md +421 -0
- modules/scak/reproducibility/requirements-pinned.txt +32 -0
- modules/scak/reproducibility/run_all_experiments.py +395 -0
- modules/scak/reproducibility/seed_control.py +53 -0
- modules/scak/reproducibility/statistical_analysis.py +302 -0
- modules/scak/requirements.txt +50 -0
- modules/scak/setup.py +93 -0
- modules/scak/src/__init__.py +124 -0
- modules/scak/src/agents/__init__.py +13 -0
- modules/scak/src/agents/conflict_resolution.py +732 -0
- modules/scak/src/agents/orchestrator.py +761 -0
- modules/scak/src/agents/pubsub.py +484 -0
- modules/scak/src/agents/shadow_teacher.py +344 -0
- modules/scak/src/agents/swarm.py +661 -0
- modules/scak/src/agents/worker.py +357 -0
- modules/scak/src/integrations/__init__.py +81 -0
- modules/scak/src/integrations/cmvk_adapter.py +430 -0
- modules/scak/src/integrations/control_plane_adapter.py +601 -0
- modules/scak/src/integrations/langchain_integration.py +902 -0
- modules/scak/src/interfaces/__init__.py +59 -0
- modules/scak/src/interfaces/llm_clients.py +505 -0
- modules/scak/src/interfaces/openapi_tools.py +611 -0
- modules/scak/src/interfaces/plugin_system.py +605 -0
- modules/scak/src/interfaces/protocols.py +365 -0
- modules/scak/src/interfaces/telemetry.py +464 -0
- modules/scak/src/interfaces/tool_registry.py +547 -0
- modules/scak/src/kernel/__init__.py +100 -0
- modules/scak/src/kernel/auditor.py +305 -0
- modules/scak/src/kernel/circuit_breaker.py +398 -0
- modules/scak/src/kernel/core.py +724 -0
- modules/scak/src/kernel/distributed.py +667 -0
- modules/scak/src/kernel/evolution.py +455 -0
- modules/scak/src/kernel/failover.py +621 -0
- modules/scak/src/kernel/governance.py +710 -0
- modules/scak/src/kernel/governance_v2.py +603 -0
- modules/scak/src/kernel/lazy_evaluator.py +514 -0
- modules/scak/src/kernel/load_testing.py +633 -0
- modules/scak/src/kernel/memory.py +945 -0
- modules/scak/src/kernel/patcher.py +581 -0
- modules/scak/src/kernel/rubric.py +419 -0
- modules/scak/src/kernel/schemas.py +390 -0
- modules/scak/src/kernel/skill_mapper.py +309 -0
- modules/scak/src/kernel/triage.py +149 -0
- modules/scak/src/mocks/__init__.py +99 -0
- modules/scak/tests/__init__.py +1 -0
- modules/scak/tests/test_circuit_breaker.py +403 -0
- modules/scak/tests/test_conflict_resolution.py +287 -0
- modules/scak/tests/test_dual_loop.py +463 -0
- modules/scak/tests/test_enhanced_features.py +421 -0
- modules/scak/tests/test_failover_and_load.py +438 -0
- modules/scak/tests/test_governance.py +185 -0
- modules/scak/tests/test_kernel.py +359 -0
- modules/scak/tests/test_langchain_integration.py +451 -0
- modules/scak/tests/test_lazy_evaluator.py +465 -0
- modules/scak/tests/test_llm_clients.py +122 -0
- modules/scak/tests/test_memory_controller.py +528 -0
- modules/scak/tests/test_orchestrator.py +181 -0
- modules/scak/tests/test_phase3_integration.py +265 -0
- modules/scak/tests/test_pubsub_swarm.py +203 -0
- modules/scak/tests/test_reference_implementations.py +240 -0
- modules/scak/tests/test_rubric.py +363 -0
- modules/scak/tests/test_scak_v2.py +651 -0
- modules/scak/tests/test_skill_mapper.py +217 -0
- modules/scak/tests/test_specific_failures.py +393 -0
- modules/scak/tests/test_tool_registry.py +264 -0
- modules/scak/tests/test_tools_and_plugins.py +303 -0
- modules/scak/tests/test_triage.py +596 -0
- modules/scak/tests/test_write_through.py +319 -0
- agent_os_kernel-1.1.0.dist-info/METADATA +0 -400
- agent_os_kernel-1.1.0.dist-info/RECORD +0 -12
- {agent_os_kernel-1.1.0.dist-info → agent_os_kernel-1.3.0.dist-info}/WHEEL +0 -0
- {agent_os_kernel-1.1.0.dist-info → agent_os_kernel-1.3.0.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,3111 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Agent Lifecycle Management - v0.2.0
|
|
3
|
+
|
|
4
|
+
This module provides comprehensive lifecycle management for autonomous AI agents,
|
|
5
|
+
including health monitoring, auto-recovery, circuit breakers, scaling, distributed
|
|
6
|
+
coordination, dependency management, graceful shutdown, resource quotas, observability,
|
|
7
|
+
and hot reload capabilities.
|
|
8
|
+
|
|
9
|
+
Features:
|
|
10
|
+
- ACP-001: Agent Health Checks (liveness/readiness probes)
|
|
11
|
+
- ACP-002: Agent Auto-Recovery (automatic restart of crashed agents)
|
|
12
|
+
- ACP-003: Circuit Breaker (prevent cascading failures)
|
|
13
|
+
- ACP-004: Agent Scaling (horizontal scaling for high-throughput)
|
|
14
|
+
- ACP-005: Distributed Coordination (leader election, consensus)
|
|
15
|
+
- ACP-006: Agent Dependency Graph (enforced start order)
|
|
16
|
+
- ACP-007: Graceful Shutdown (preserve in-flight verifications)
|
|
17
|
+
- ACP-008: Resource Quotas (memory/CPU limits per agent)
|
|
18
|
+
- ACP-009: Agent Observability (metrics/logging integration)
|
|
19
|
+
- ACP-010: Hot Reload (code changes without full restart)
|
|
20
|
+
|
|
21
|
+
Research Foundations:
|
|
22
|
+
- Circuit Breaker pattern (Michael Nygard, "Release It!")
|
|
23
|
+
- Kubernetes probe patterns (liveness, readiness, startup)
|
|
24
|
+
- Raft consensus algorithm (Ongaro & Ousterhout, 2014)
|
|
25
|
+
- Actor model supervision (Erlang/OTP, Akka)
|
|
26
|
+
"""
|
|
27
|
+
|
|
28
|
+
from typing import (
|
|
29
|
+
Dict, List, Optional, Any, Union, Callable, Type, Set, Awaitable,
|
|
30
|
+
TypeVar, Generic, Protocol, runtime_checkable
|
|
31
|
+
)
|
|
32
|
+
from dataclasses import dataclass, field
|
|
33
|
+
from enum import Enum, auto
|
|
34
|
+
from datetime import datetime, timedelta
|
|
35
|
+
from collections import defaultdict, deque
|
|
36
|
+
from abc import ABC, abstractmethod
|
|
37
|
+
import asyncio
|
|
38
|
+
import time
|
|
39
|
+
import uuid
|
|
40
|
+
import logging
|
|
41
|
+
import threading
|
|
42
|
+
import weakref
|
|
43
|
+
import traceback
|
|
44
|
+
import hashlib
|
|
45
|
+
import importlib
|
|
46
|
+
import sys
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
# Configure module logger
|
|
50
|
+
logger = logging.getLogger(__name__)
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
# ============================================================================
|
|
54
|
+
# Enums and Constants
|
|
55
|
+
# ============================================================================
|
|
56
|
+
|
|
57
|
+
class HealthStatus(Enum):
    """Health states that can be reported for an agent.

    Each member's value is the lower-case string form of its name.
    """

    # Status HealthMonitor.register_agent assigns to a newly registered agent.
    UNKNOWN = "unknown"
    HEALTHY = "healthy"
    UNHEALTHY = "unhealthy"
    DEGRADED = "degraded"

    # Transitional and terminal states.
    STARTING = "starting"
    STOPPING = "stopping"
    STOPPED = "stopped"
    FAILED = "failed"
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
class AgentState(Enum):
    """Lifecycle states an agent can occupy.

    Each member's value is the lower-case string form of its name.
    """

    REGISTERED = "registered"
    PENDING = "pending"
    STARTING = "starting"
    RUNNING = "running"
    STOPPING = "stopping"
    STOPPED = "stopped"
    FAILED = "failed"
    RECOVERING = "recovering"
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
class CircuitState(Enum):
    """States a circuit breaker can be in."""

    # Normal operation.
    CLOSED = "closed"
    # Failing: requests are rejected.
    OPEN = "open"
    # Recovery is being tested.
    HALF_OPEN = "half_open"
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
class CoordinationRole(Enum):
    """Roles a participant can hold in distributed coordination.

    Each member's value is the lower-case string form of its name.
    """

    LEADER = "leader"
    FOLLOWER = "follower"
    CANDIDATE = "candidate"
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
class ShutdownPhase(Enum):
    """Phases of a graceful shutdown, listed in declaration order.

    Each member's value is the lower-case string form of its name.
    """

    RUNNING = "running"
    DRAINING = "draining"
    STOPPING = "stopping"
    TERMINATED = "terminated"
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
# ============================================================================
|
|
104
|
+
# ACP-001: Agent Health Checks
|
|
105
|
+
# ============================================================================
|
|
106
|
+
|
|
107
|
+
@dataclass
class HealthCheckResult:
    """Outcome of a single health check probe.

    ``healthy`` and ``status`` are required; every other field has a default.
    ``timestamp`` and ``details`` use default factories, so each instance gets
    its own ``datetime.now()`` value and its own empty dict.
    """

    # Required fields.
    healthy: bool
    status: HealthStatus

    # Optional description of the outcome.
    message: str = ""
    # Probe latency; milliseconds according to the field name.
    latency_ms: float = 0.0
    # Local (naive) time at which the result object was created.
    timestamp: datetime = field(default_factory=datetime.now)
    # Free-form extra data attached to the result.
    details: Dict[str, Any] = field(default_factory=dict)
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
@dataclass
class HealthCheckConfig:
    """Tunable settings for the liveness, readiness and startup probes.

    All fields have defaults, so ``HealthCheckConfig()`` yields a usable
    configuration. Interval, timeout and period values are expressed in
    seconds, as their names state.
    """

    # --- Liveness probe ---
    liveness_interval_seconds: float = 10.0
    liveness_timeout_seconds: float = 5.0
    liveness_failure_threshold: int = 3

    # --- Readiness probe ---
    readiness_interval_seconds: float = 5.0
    readiness_timeout_seconds: float = 3.0
    readiness_failure_threshold: int = 1

    # --- Startup probe (intended for slow-starting agents) ---
    startup_probe_enabled: bool = True
    startup_timeout_seconds: float = 60.0
    startup_period_seconds: float = 5.0

    # --- Optional user-supplied check: an argument-less async callable
    # returning a bool; None means no custom check is configured ---
    custom_health_check: Optional[Callable[[], Awaitable[bool]]] = None
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
@runtime_checkable
class HealthCheckable(Protocol):
    """Structural interface for agents that expose health probes.

    The protocol is ``runtime_checkable``, so ``isinstance(obj, HealthCheckable)``
    is true for any object that provides both methods below. Only the
    presence of the methods is checked at runtime, not their signatures.
    """

    async def liveness_check(self) -> bool:
        """Report whether the agent is alive (not deadlocked or crashed)."""
        ...

    async def readiness_check(self) -> bool:
        """Report whether the agent is ready to accept requests."""
        ...
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
class HealthMonitor:
|
|
154
|
+
"""
|
|
155
|
+
Monitors agent health via liveness and readiness probes.
|
|
156
|
+
|
|
157
|
+
Implements Kubernetes-style health checking patterns:
|
|
158
|
+
- Liveness: Is the agent alive? If not, restart it.
|
|
159
|
+
- Readiness: Is the agent ready to accept requests?
|
|
160
|
+
- Startup: Has the agent finished starting up?
|
|
161
|
+
|
|
162
|
+
Usage:
|
|
163
|
+
monitor = HealthMonitor(config=HealthCheckConfig())
|
|
164
|
+
|
|
165
|
+
# Register an agent
|
|
166
|
+
monitor.register_agent(agent_id, agent_instance)
|
|
167
|
+
|
|
168
|
+
# Start monitoring
|
|
169
|
+
await monitor.start()
|
|
170
|
+
|
|
171
|
+
# Check status
|
|
172
|
+
status = monitor.get_agent_health(agent_id)
|
|
173
|
+
"""
|
|
174
|
+
|
|
175
|
+
def __init__(self, config: Optional[HealthCheckConfig] = None):
    """Create an idle monitor with no registered agents.

    Args:
        config: Probe settings. When omitted (or falsy) a default
            ``HealthCheckConfig`` is created.
    """
    if config:
        self.config = config
    else:
        self.config = HealthCheckConfig()

    # agent_id -> registration record / current HealthStatus.
    self._agents: Dict[str, Any] = {}
    self._health_status: Dict[str, HealthStatus] = {}

    # Failure counters; a missing key reads as 0.
    self._liveness_failures: Dict[str, int] = defaultdict(int)
    self._readiness_failures: Dict[str, int] = defaultdict(int)

    # Check bookkeeping; each history is a deque capped at 100 entries.
    self._last_check: Dict[str, datetime] = {}
    self._check_history: Dict[str, deque] = defaultdict(lambda: deque(maxlen=100))

    # Background-loop state: start() sets the flag and fills the task list.
    self._running = False
    self._tasks: List[asyncio.Task] = []

    # Lists of callables keyed by a string (key meaning not visible here).
    self._callbacks: Dict[str, List[Callable]] = defaultdict(list)
    self._lock = asyncio.Lock()
|
|
187
|
+
|
|
188
|
+
def register_agent(
|
|
189
|
+
self,
|
|
190
|
+
agent_id: str,
|
|
191
|
+
agent: Any,
|
|
192
|
+
custom_liveness: Optional[Callable[[], Awaitable[bool]]] = None,
|
|
193
|
+
custom_readiness: Optional[Callable[[], Awaitable[bool]]] = None
|
|
194
|
+
) -> None:
|
|
195
|
+
"""Register an agent for health monitoring"""
|
|
196
|
+
self._agents[agent_id] = {
|
|
197
|
+
"agent": agent,
|
|
198
|
+
"custom_liveness": custom_liveness,
|
|
199
|
+
"custom_readiness": custom_readiness,
|
|
200
|
+
"registered_at": datetime.now()
|
|
201
|
+
}
|
|
202
|
+
self._health_status[agent_id] = HealthStatus.UNKNOWN
|
|
203
|
+
logger.info(f"Registered agent {agent_id} for health monitoring")
|
|
204
|
+
|
|
205
|
+
def unregister_agent(self, agent_id: str) -> None:
|
|
206
|
+
"""Unregister an agent from health monitoring"""
|
|
207
|
+
if agent_id in self._agents:
|
|
208
|
+
del self._agents[agent_id]
|
|
209
|
+
self._health_status.pop(agent_id, None)
|
|
210
|
+
self._liveness_failures.pop(agent_id, None)
|
|
211
|
+
self._readiness_failures.pop(agent_id, None)
|
|
212
|
+
logger.info(f"Unregistered agent {agent_id} from health monitoring")
|
|
213
|
+
|
|
214
|
+
async def start(self) -> None:
|
|
215
|
+
"""Start the health monitoring loop"""
|
|
216
|
+
if self._running:
|
|
217
|
+
return
|
|
218
|
+
|
|
219
|
+
self._running = True
|
|
220
|
+
self._tasks.append(asyncio.create_task(self._liveness_loop()))
|
|
221
|
+
self._tasks.append(asyncio.create_task(self._readiness_loop()))
|
|
222
|
+
logger.info("Health monitor started")
|
|
223
|
+
|
|
224
|
+
async def stop(self) -> None:
|
|
225
|
+
"""Stop the health monitoring loop"""
|
|
226
|
+
self._running = False
|
|
227
|
+
for task in self._tasks:
|
|
228
|
+
task.cancel()
|
|
229
|
+
try:
|
|
230
|
+
await task
|
|
231
|
+
except asyncio.CancelledError:
|
|
232
|
+
pass
|
|
233
|
+
self._tasks.clear()
|
|
234
|
+
logger.info("Health monitor stopped")
|
|
235
|
+
|
|
236
|
+
async def _liveness_loop(self) -> None:
|
|
237
|
+
"""Main loop for liveness checks"""
|
|
238
|
+
while self._running:
|
|
239
|
+
for agent_id in list(self._agents.keys()):
|
|
240
|
+
try:
|
|
241
|
+
result = await self._check_liveness(agent_id)
|
|
242
|
+
self._check_history[agent_id].append(result)
|
|
243
|
+
|
|
244
|
+
if not result.healthy:
|
|
245
|
+
self._liveness_failures[agent_id] += 1
|
|
246
|
+
if self._liveness_failures[agent_id] >= self.config.liveness_failure_threshold:
|
|
247
|
+
self._health_status[agent_id] = HealthStatus.FAILED
|
|
248
|
+
await self._trigger_callbacks("liveness_failed", agent_id)
|
|
249
|
+
else:
|
|
250
|
+
self._liveness_failures[agent_id] = 0
|
|
251
|
+
if self._health_status[agent_id] == HealthStatus.FAILED:
|
|
252
|
+
self._health_status[agent_id] = HealthStatus.HEALTHY
|
|
253
|
+
await self._trigger_callbacks("liveness_restored", agent_id)
|
|
254
|
+
|
|
255
|
+
except Exception as e:
|
|
256
|
+
logger.error(f"Liveness check failed for {agent_id}: {e}")
|
|
257
|
+
self._liveness_failures[agent_id] += 1
|
|
258
|
+
|
|
259
|
+
await asyncio.sleep(self.config.liveness_interval_seconds)
|
|
260
|
+
|
|
261
|
+
async def _readiness_loop(self) -> None:
|
|
262
|
+
"""Main loop for readiness checks"""
|
|
263
|
+
while self._running:
|
|
264
|
+
for agent_id in list(self._agents.keys()):
|
|
265
|
+
try:
|
|
266
|
+
result = await self._check_readiness(agent_id)
|
|
267
|
+
|
|
268
|
+
if not result.healthy:
|
|
269
|
+
self._readiness_failures[agent_id] += 1
|
|
270
|
+
if self._readiness_failures[agent_id] >= self.config.readiness_failure_threshold:
|
|
271
|
+
if self._health_status[agent_id] == HealthStatus.HEALTHY:
|
|
272
|
+
self._health_status[agent_id] = HealthStatus.DEGRADED
|
|
273
|
+
await self._trigger_callbacks("readiness_failed", agent_id)
|
|
274
|
+
else:
|
|
275
|
+
self._readiness_failures[agent_id] = 0
|
|
276
|
+
if self._health_status[agent_id] == HealthStatus.DEGRADED:
|
|
277
|
+
self._health_status[agent_id] = HealthStatus.HEALTHY
|
|
278
|
+
await self._trigger_callbacks("readiness_restored", agent_id)
|
|
279
|
+
|
|
280
|
+
except Exception as e:
|
|
281
|
+
logger.error(f"Readiness check failed for {agent_id}: {e}")
|
|
282
|
+
self._readiness_failures[agent_id] += 1
|
|
283
|
+
|
|
284
|
+
await asyncio.sleep(self.config.readiness_interval_seconds)
|
|
285
|
+
|
|
286
|
+
async def _check_liveness(self, agent_id: str) -> HealthCheckResult:
|
|
287
|
+
"""Perform liveness check for an agent"""
|
|
288
|
+
start_time = time.time()
|
|
289
|
+
agent_info = self._agents.get(agent_id)
|
|
290
|
+
|
|
291
|
+
if not agent_info:
|
|
292
|
+
return HealthCheckResult(
|
|
293
|
+
healthy=False,
|
|
294
|
+
status=HealthStatus.UNKNOWN,
|
|
295
|
+
message="Agent not found"
|
|
296
|
+
)
|
|
297
|
+
|
|
298
|
+
agent = agent_info["agent"]
|
|
299
|
+
custom_check = agent_info.get("custom_liveness")
|
|
300
|
+
|
|
301
|
+
try:
|
|
302
|
+
# Try custom liveness check first
|
|
303
|
+
if custom_check:
|
|
304
|
+
healthy = await asyncio.wait_for(
|
|
305
|
+
custom_check(),
|
|
306
|
+
timeout=self.config.liveness_timeout_seconds
|
|
307
|
+
)
|
|
308
|
+
# Try protocol method
|
|
309
|
+
elif isinstance(agent, HealthCheckable):
|
|
310
|
+
healthy = await asyncio.wait_for(
|
|
311
|
+
agent.liveness_check(),
|
|
312
|
+
timeout=self.config.liveness_timeout_seconds
|
|
313
|
+
)
|
|
314
|
+
# Fallback: check if agent has is_alive method
|
|
315
|
+
elif hasattr(agent, 'is_alive'):
|
|
316
|
+
if asyncio.iscoroutinefunction(agent.is_alive):
|
|
317
|
+
healthy = await asyncio.wait_for(
|
|
318
|
+
agent.is_alive(),
|
|
319
|
+
timeout=self.config.liveness_timeout_seconds
|
|
320
|
+
)
|
|
321
|
+
else:
|
|
322
|
+
healthy = agent.is_alive()
|
|
323
|
+
else:
|
|
324
|
+
# Default: assume healthy if agent exists
|
|
325
|
+
healthy = True
|
|
326
|
+
|
|
327
|
+
latency_ms = (time.time() - start_time) * 1000
|
|
328
|
+
self._last_check[agent_id] = datetime.now()
|
|
329
|
+
|
|
330
|
+
return HealthCheckResult(
|
|
331
|
+
healthy=healthy,
|
|
332
|
+
status=HealthStatus.HEALTHY if healthy else HealthStatus.UNHEALTHY,
|
|
333
|
+
latency_ms=latency_ms
|
|
334
|
+
)
|
|
335
|
+
|
|
336
|
+
except asyncio.TimeoutError:
|
|
337
|
+
return HealthCheckResult(
|
|
338
|
+
healthy=False,
|
|
339
|
+
status=HealthStatus.UNHEALTHY,
|
|
340
|
+
message="Liveness check timed out",
|
|
341
|
+
latency_ms=self.config.liveness_timeout_seconds * 1000
|
|
342
|
+
)
|
|
343
|
+
except Exception as e:
|
|
344
|
+
return HealthCheckResult(
|
|
345
|
+
healthy=False,
|
|
346
|
+
status=HealthStatus.FAILED,
|
|
347
|
+
message=str(e),
|
|
348
|
+
latency_ms=(time.time() - start_time) * 1000
|
|
349
|
+
)
|
|
350
|
+
|
|
351
|
+
async def _check_readiness(self, agent_id: str) -> HealthCheckResult:
|
|
352
|
+
"""Perform readiness check for an agent"""
|
|
353
|
+
start_time = time.time()
|
|
354
|
+
agent_info = self._agents.get(agent_id)
|
|
355
|
+
|
|
356
|
+
if not agent_info:
|
|
357
|
+
return HealthCheckResult(
|
|
358
|
+
healthy=False,
|
|
359
|
+
status=HealthStatus.UNKNOWN,
|
|
360
|
+
message="Agent not found"
|
|
361
|
+
)
|
|
362
|
+
|
|
363
|
+
agent = agent_info["agent"]
|
|
364
|
+
custom_check = agent_info.get("custom_readiness")
|
|
365
|
+
|
|
366
|
+
try:
|
|
367
|
+
if custom_check:
|
|
368
|
+
ready = await asyncio.wait_for(
|
|
369
|
+
custom_check(),
|
|
370
|
+
timeout=self.config.readiness_timeout_seconds
|
|
371
|
+
)
|
|
372
|
+
elif isinstance(agent, HealthCheckable):
|
|
373
|
+
ready = await asyncio.wait_for(
|
|
374
|
+
agent.readiness_check(),
|
|
375
|
+
timeout=self.config.readiness_timeout_seconds
|
|
376
|
+
)
|
|
377
|
+
elif hasattr(agent, 'is_ready'):
|
|
378
|
+
if asyncio.iscoroutinefunction(agent.is_ready):
|
|
379
|
+
ready = await asyncio.wait_for(
|
|
380
|
+
agent.is_ready(),
|
|
381
|
+
timeout=self.config.readiness_timeout_seconds
|
|
382
|
+
)
|
|
383
|
+
else:
|
|
384
|
+
ready = agent.is_ready()
|
|
385
|
+
else:
|
|
386
|
+
ready = True
|
|
387
|
+
|
|
388
|
+
latency_ms = (time.time() - start_time) * 1000
|
|
389
|
+
|
|
390
|
+
return HealthCheckResult(
|
|
391
|
+
healthy=ready,
|
|
392
|
+
status=HealthStatus.HEALTHY if ready else HealthStatus.DEGRADED,
|
|
393
|
+
latency_ms=latency_ms
|
|
394
|
+
)
|
|
395
|
+
|
|
396
|
+
except asyncio.TimeoutError:
|
|
397
|
+
return HealthCheckResult(
|
|
398
|
+
healthy=False,
|
|
399
|
+
status=HealthStatus.DEGRADED,
|
|
400
|
+
message="Readiness check timed out",
|
|
401
|
+
latency_ms=self.config.readiness_timeout_seconds * 1000
|
|
402
|
+
)
|
|
403
|
+
except Exception as e:
|
|
404
|
+
return HealthCheckResult(
|
|
405
|
+
healthy=False,
|
|
406
|
+
status=HealthStatus.DEGRADED,
|
|
407
|
+
message=str(e),
|
|
408
|
+
latency_ms=(time.time() - start_time) * 1000
|
|
409
|
+
)
|
|
410
|
+
|
|
411
|
+
def on_event(self, event: str, callback: Callable[[str], Awaitable[None]]) -> None:
|
|
412
|
+
"""Register a callback for health events"""
|
|
413
|
+
self._callbacks[event].append(callback)
|
|
414
|
+
|
|
415
|
+
async def _trigger_callbacks(self, event: str, agent_id: str) -> None:
|
|
416
|
+
"""Trigger all callbacks for an event"""
|
|
417
|
+
for callback in self._callbacks.get(event, []):
|
|
418
|
+
try:
|
|
419
|
+
await callback(agent_id)
|
|
420
|
+
except Exception as e:
|
|
421
|
+
logger.error(f"Callback error for {event}: {e}")
|
|
422
|
+
|
|
423
|
+
def get_agent_health(self, agent_id: str) -> HealthStatus:
|
|
424
|
+
"""Get the current health status of an agent"""
|
|
425
|
+
return self._health_status.get(agent_id, HealthStatus.UNKNOWN)
|
|
426
|
+
|
|
427
|
+
def get_all_health_status(self) -> Dict[str, HealthStatus]:
|
|
428
|
+
"""Get health status for all agents"""
|
|
429
|
+
return dict(self._health_status)
|
|
430
|
+
|
|
431
|
+
def get_health_history(self, agent_id: str) -> List[HealthCheckResult]:
|
|
432
|
+
"""Get health check history for an agent"""
|
|
433
|
+
return list(self._check_history.get(agent_id, []))
|
|
434
|
+
|
|
435
|
+
|
|
436
|
+
# ============================================================================
|
|
437
|
+
# ACP-002: Agent Auto-Recovery
|
|
438
|
+
# ============================================================================
|
|
439
|
+
|
|
440
|
+
@dataclass
class RecoveryConfig:
    """Tunable settings consumed by ``AutoRecoveryManager``."""

    # Master switch: when False, failures are not recovered.
    enabled: bool = True
    # Restart budget before the ``on_max_restarts`` policy applies.
    max_restarts: int = 5
    # Initial delay before a restart attempt.
    restart_delay_seconds: float = 1.0
    # Upper bound for the backoff delay.
    restart_delay_max_seconds: float = 60.0
    # Factor applied to the delay after each successful restart.
    restart_delay_multiplier: float = 2.0
    # Quiet period after which the restart count and delay start over.
    reset_restart_count_after_seconds: float = 300.0
    # Policy once the budget is exhausted: "stop", "alert", "continue"
    on_max_restarts: str = "stop"
|
|
450
|
+
|
|
451
|
+
|
|
452
|
+
@dataclass
class RecoveryEvent:
    """One entry in the recovery history of an agent."""

    # Agent the event belongs to.
    agent_id: str
    # One of: "restart", "failure", "recovery_success", "max_restarts"
    event_type: str
    # Creation time of the event (naive local time from ``datetime.now``).
    timestamp: datetime = field(default_factory=datetime.now)
    # Restart counter value at the time the event was recorded.
    attempt: int = 0
    # String form of the triggering error, if any.
    error: Optional[str] = None
    # Free-form extra data; each event gets its own dict.
    details: Dict[str, Any] = field(default_factory=dict)
|
|
461
|
+
|
|
462
|
+
|
|
463
|
+
class AutoRecoveryManager:
    """
    Manages automatic recovery of failed agents.

    Implements exponential backoff for restart attempts and tracks
    recovery history for analysis.

    Features:
    - Automatic restart with exponential backoff
    - Maximum restart limit with configurable behavior
    - Recovery event logging
    - Callbacks for recovery events

    Events delivered to callbacks registered with :meth:`on_event`:
    ``failure``, ``recovery_success``, ``max_restarts`` and ``alert``. Each
    callback is awaited with ``(agent_id, RecoveryEvent)``.

    Usage:
        recovery = AutoRecoveryManager(config=RecoveryConfig())
        recovery.register_agent(agent_id, agent_factory)

        # When agent fails
        await recovery.handle_failure(agent_id, error)
    """

    def __init__(self, config: Optional[RecoveryConfig] = None):
        """Create a manager; a default ``RecoveryConfig`` is used if none is given."""
        self.config = config or RecoveryConfig()
        # agent_id -> zero-argument factory (sync, or returning a coroutine).
        self._agent_factories: Dict[str, Callable[[], Any]] = {}
        # Restart attempts (successful or not) since the last reset.
        self._restart_counts: Dict[str, int] = defaultdict(int)
        # Time of the last successful restart per agent.
        self._last_restart: Dict[str, datetime] = {}
        # Delay to wait before the next restart attempt per agent.
        self._current_delay: Dict[str, float] = {}
        # Most recent 1000 recovery events across all agents.
        self._recovery_history: deque = deque(maxlen=1000)
        self._callbacks: Dict[str, List[Callable]] = defaultdict(list)
        # agent_id -> current live instance.
        self._agents: Dict[str, Any] = {}
        # Serializes recovery runs; not reentrant.
        self._lock = asyncio.Lock()

    def register_agent(
        self,
        agent_id: str,
        factory: Callable[[], Any],
        initial_instance: Optional[Any] = None
    ) -> None:
        """Register an agent with its factory function for recovery.

        Args:
            agent_id: Key under which the agent is tracked.
            factory: Zero-argument callable that builds a replacement agent;
                it may return the agent directly or a coroutine resolving to it.
            initial_instance: The currently running instance, if one exists.
        """
        self._agent_factories[agent_id] = factory
        # Compare against None so an instance that happens to be falsy
        # (e.g. defines __len__ or __bool__) is still recorded.
        if initial_instance is not None:
            self._agents[agent_id] = initial_instance
        self._restart_counts[agent_id] = 0
        self._current_delay[agent_id] = self.config.restart_delay_seconds
        logger.info(f"Registered agent {agent_id} for auto-recovery")

    def unregister_agent(self, agent_id: str) -> None:
        """Unregister an agent from auto-recovery (history entries are kept)."""
        self._agent_factories.pop(agent_id, None)
        self._agents.pop(agent_id, None)
        self._restart_counts.pop(agent_id, None)
        self._last_restart.pop(agent_id, None)
        self._current_delay.pop(agent_id, None)

    def _reset_budget_if_stable(self, agent_id: str) -> None:
        """Reset count and delay if the last restart is older than the reset window."""
        last_restart = self._last_restart.get(agent_id)
        if last_restart is None:
            return
        time_since_last = (datetime.now() - last_restart).total_seconds()
        if time_since_last > self.config.reset_restart_count_after_seconds:
            self._restart_counts[agent_id] = 0
            self._current_delay[agent_id] = self.config.restart_delay_seconds

    async def handle_failure(
        self,
        agent_id: str,
        error: Optional[Exception] = None
    ) -> Optional[Any]:
        """
        Handle an agent failure and attempt recovery.

        Each attempt records a ``failure`` event, waits the current backoff
        delay, builds a new instance through the registered factory and, if
        the instance has a ``start`` method, calls it. A failed attempt counts
        against the restart budget and is retried until it succeeds or the
        ``on_max_restarts`` policy stops recovery.

        Args:
            agent_id: Agent that failed.
            error: The exception that caused the failure, if known.

        Returns the new agent instance if recovery succeeds, None otherwise
        (recovery disabled, no factory registered, or budget exhausted with
        the "stop" policy).
        """
        if not self.config.enabled:
            logger.info(f"Auto-recovery disabled, not recovering {agent_id}")
            return None

        async with self._lock:
            # Retry in a loop rather than by calling handle_failure again:
            # asyncio.Lock is not reentrant, so a nested call made while this
            # lock is held would wait forever, and recursion depth would grow
            # without bound under the "continue"/"alert" policies.
            # NOTE(review): the lock is held across the backoff sleep, so
            # recoveries of different agents are serialized.
            while True:
                self._reset_budget_if_stable(agent_id)

                # Check if max restarts reached
                if self._restart_counts[agent_id] >= self.config.max_restarts:
                    event = RecoveryEvent(
                        agent_id=agent_id,
                        event_type="max_restarts",
                        attempt=self._restart_counts[agent_id],
                        error=str(error) if error else None
                    )
                    self._recovery_history.append(event)
                    await self._trigger_callbacks("max_restarts", agent_id, event)

                    if self.config.on_max_restarts == "stop":
                        logger.error(f"Max restarts reached for {agent_id}, stopping")
                        return None
                    elif self.config.on_max_restarts == "alert":
                        logger.warning(f"Max restarts reached for {agent_id}, alerting")
                        await self._trigger_callbacks("alert", agent_id, event)
                    # "continue" falls through to attempt restart anyway

                # Calculate delay with exponential backoff
                delay = self._current_delay.get(agent_id, self.config.restart_delay_seconds)

                # Log failure event
                failure_event = RecoveryEvent(
                    agent_id=agent_id,
                    event_type="failure",
                    attempt=self._restart_counts[agent_id],
                    error=str(error) if error else None
                )
                self._recovery_history.append(failure_event)
                await self._trigger_callbacks("failure", agent_id, failure_event)

                logger.info(f"Attempting recovery for {agent_id} after {delay:.1f}s delay")
                await asyncio.sleep(delay)

                # Attempt restart
                try:
                    factory = self._agent_factories.get(agent_id)
                    if factory is None:
                        logger.error(f"No factory registered for {agent_id}")
                        return None

                    new_agent = factory()
                    if asyncio.iscoroutine(new_agent):
                        new_agent = await new_agent

                    # Start the agent if it has a start method
                    if hasattr(new_agent, 'start'):
                        if asyncio.iscoroutinefunction(new_agent.start):
                            await new_agent.start()
                        else:
                            new_agent.start()
                except Exception as e:
                    logger.error(f"Failed to recover agent {agent_id}: {e}")
                    self._restart_counts[agent_id] += 1
                    # The restart error becomes the cause of the next attempt.
                    error = e
                    continue

                self._agents[agent_id] = new_agent
                self._restart_counts[agent_id] += 1
                self._last_restart[agent_id] = datetime.now()

                # Increase delay for next potential failure (exponential backoff)
                self._current_delay[agent_id] = min(
                    delay * self.config.restart_delay_multiplier,
                    self.config.restart_delay_max_seconds
                )

                success_event = RecoveryEvent(
                    agent_id=agent_id,
                    event_type="recovery_success",
                    attempt=self._restart_counts[agent_id]
                )
                self._recovery_history.append(success_event)
                await self._trigger_callbacks("recovery_success", agent_id, success_event)

                logger.info(f"Successfully recovered agent {agent_id}")
                return new_agent

    def on_event(
        self,
        event: str,
        callback: Callable[[str, RecoveryEvent], Awaitable[None]]
    ) -> None:
        """Register an async callback for a recovery event (see class docstring)."""
        self._callbacks[event].append(callback)

    async def _trigger_callbacks(
        self,
        event: str,
        agent_id: str,
        recovery_event: RecoveryEvent
    ) -> None:
        """Await every callback for ``event``; callback errors are logged, not raised."""
        for callback in self._callbacks.get(event, []):
            try:
                await callback(agent_id, recovery_event)
            except Exception as e:
                logger.error(f"Callback error for {event}: {e}")

    def get_agent(self, agent_id: str) -> Optional[Any]:
        """Get the current agent instance, or None if there is none."""
        return self._agents.get(agent_id)

    def get_restart_count(self, agent_id: str) -> int:
        """Get the restart count for an agent (0 if unknown)."""
        return self._restart_counts.get(agent_id, 0)

    def get_recovery_history(
        self,
        agent_id: Optional[str] = None
    ) -> List[RecoveryEvent]:
        """Get recovery history, optionally filtered by agent.

        With ``agent_id=None`` every recorded event is returned, oldest first.
        """
        # ``is not None`` so that an empty-string id still filters.
        if agent_id is not None:
            return [e for e in self._recovery_history if e.agent_id == agent_id]
        return list(self._recovery_history)

    def reset_restart_count(self, agent_id: str) -> None:
        """Manually reset the restart count and backoff delay for an agent."""
        self._restart_counts[agent_id] = 0
        self._current_delay[agent_id] = self.config.restart_delay_seconds
|
|
660
|
+
|
|
661
|
+
|
|
662
|
+
# ============================================================================
|
|
663
|
+
# ACP-003: Circuit Breaker
|
|
664
|
+
# ============================================================================
|
|
665
|
+
|
|
666
|
+
@dataclass
class CircuitBreakerConfig:
    """Tunable settings consumed by ``CircuitBreaker``."""

    # Counted failures in CLOSED state that open the circuit.
    failure_threshold: int = 5
    # Successes in HALF_OPEN state that close the circuit again.
    success_threshold: int = 3
    # Time after the last failure before an OPEN circuit allows a trial call.
    recovery_timeout_seconds: float = 60.0
    # Cap on calls admitted while the circuit is HALF_OPEN.
    half_open_max_calls: int = 3
    # Exception types (and subclasses) that never count as failures.
    exclude_exceptions: List[Type[Exception]] = field(default_factory=list)
    # If set, only these exception types (and subclasses) count as failures.
    include_exceptions: Optional[List[Type[Exception]]] = None
|
|
675
|
+
|
|
676
|
+
|
|
677
|
+
@dataclass
class CircuitBreakerMetrics:
    """Point-in-time snapshot of a circuit breaker's counters and state."""

    # State of the circuit when the snapshot was taken.
    state: CircuitState
    # Failure counter used against the failure threshold.
    failure_count: int
    # Success counter used against the success threshold.
    success_count: int
    # Lifetime totals since the breaker was created.
    total_calls: int
    total_failures: int
    total_successes: int
    # Time of the latest counted failure / success; None if there was none.
    last_failure_time: Optional[datetime]
    last_success_time: Optional[datetime]
    # Time of the most recent state transition or reset.
    state_changed_at: datetime
|
|
689
|
+
|
|
690
|
+
|
|
691
|
+
class CircuitBreaker:
    """
    Circuit breaker for preventing cascading failures.

    Implements the circuit breaker pattern to protect against cascading
    failures when an agent or service becomes unavailable.

    States:
    - CLOSED: Normal operation, requests pass through
    - OPEN: Failing, requests are rejected immediately
    - HALF_OPEN: Testing recovery, limited requests allowed

    Features:
    - Configurable failure/success thresholds
    - Automatic recovery timeout
    - Exception filtering
    - Metrics collection

    Usage:
        breaker = CircuitBreaker(
            config=CircuitBreakerConfig(
                failure_threshold=5,
                recovery_timeout_seconds=60
            )
        )

        # Use as decorator
        @breaker
        async def call_agent():
            ...

        # Or use context manager
        async with breaker:
            await call_agent()
    """

    def __init__(
        self,
        name: str = "default",
        config: Optional[CircuitBreakerConfig] = None,
        failure_threshold: Optional[int] = None,
        recovery_timeout: Optional[float] = None
    ):
        """Create a circuit breaker in the CLOSED state.

        Args:
            name: Label used in log messages, errors and callbacks.
            config: Settings; a default ``CircuitBreakerConfig`` if omitted.
            failure_threshold: Convenience override for
                ``config.failure_threshold``.
            recovery_timeout: Convenience override for
                ``config.recovery_timeout_seconds``.
        """
        self.name = name

        # Allow direct parameter override for convenience API
        overrides = {}
        if failure_threshold is not None:
            overrides["failure_threshold"] = failure_threshold
        if recovery_timeout is not None:
            overrides["recovery_timeout_seconds"] = recovery_timeout

        if not config:
            config = CircuitBreakerConfig()
        elif overrides:
            # Apply overrides to a private copy: the caller's config object may
            # be shared with other breakers and must not be modified here.
            import copy
            config = copy.copy(config)
        for option, value in overrides.items():
            setattr(config, option, value)
        self.config = config

        self._state = CircuitState.CLOSED
        self._failure_count = 0
        self._success_count = 0
        self._half_open_calls = 0
        self._last_failure_time: Optional[datetime] = None
        self._last_success_time: Optional[datetime] = None
        self._state_changed_at = datetime.now()
        self._total_calls = 0
        self._total_failures = 0
        self._total_successes = 0
        self._lock = asyncio.Lock()
        self._callbacks: Dict[str, List[Callable]] = defaultdict(list)
        # Strong references to in-flight callback tasks (see _transition_to).
        self._callback_tasks: set = set()

    @property
    def state(self) -> CircuitState:
        """Get the current circuit state"""
        return self._state

    @property
    def is_closed(self) -> bool:
        """Check if circuit is closed (normal operation)"""
        return self._state == CircuitState.CLOSED

    @property
    def is_open(self) -> bool:
        """Check if circuit is open (rejecting requests)"""
        return self._state == CircuitState.OPEN

    async def __aenter__(self):
        """Async context manager entry.

        Raises:
            CircuitBreakerOpenError: If the circuit rejects the call.
        """
        await self._before_call()
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Async context manager exit: record the outcome, never suppress errors."""
        if exc_type is None:
            await self._on_success()
        # Only ``Exception`` subclasses can count as failures, matching the
        # decorator path; cancellation and interpreter-exit signals say nothing
        # about the health of the protected call.
        elif issubclass(exc_type, Exception) and self._should_count_exception(exc_type):
            await self._on_failure(exc_val)
        return False

    def __call__(self, func: Callable) -> Callable:
        """Decorator for wrapping functions with circuit breaker.

        The returned wrapper is always a coroutine function, whether ``func``
        is sync or async. It re-raises any exception from ``func`` after
        recording it, and raises ``CircuitBreakerOpenError`` when the circuit
        rejects the call.
        """
        from functools import wraps  # local import keeps the block self-contained

        # Decide once at decoration time instead of on every call.
        is_coroutine = asyncio.iscoroutinefunction(func)

        @wraps(func)
        async def wrapper(*args, **kwargs):
            await self._before_call()
            try:
                if is_coroutine:
                    result = await func(*args, **kwargs)
                else:
                    result = func(*args, **kwargs)
                await self._on_success()
                return result
            except Exception as e:
                if self._should_count_exception(type(e)):
                    await self._on_failure(e)
                raise
        return wrapper

    async def _before_call(self) -> None:
        """Check circuit state before a call.

        Raises:
            CircuitBreakerOpenError: If the circuit is OPEN and the recovery
                timeout has not elapsed, or it is HALF_OPEN and the trial-call
                limit has been reached.
        """
        async with self._lock:
            # Rejected calls are included in the total.
            self._total_calls += 1

            if self._state == CircuitState.OPEN:
                # Check if recovery timeout has elapsed
                if self._last_failure_time:
                    elapsed = (datetime.now() - self._last_failure_time).total_seconds()
                    if elapsed >= self.config.recovery_timeout_seconds:
                        # NOTE(review): the call that triggers this transition
                        # is admitted without counting toward
                        # half_open_max_calls.
                        self._transition_to(CircuitState.HALF_OPEN)
                        self._half_open_calls = 0
                    else:
                        raise CircuitBreakerOpenError(
                            f"Circuit {self.name} is open, retry after "
                            f"{self.config.recovery_timeout_seconds - elapsed:.1f}s"
                        )
                else:
                    raise CircuitBreakerOpenError(f"Circuit {self.name} is open")

            elif self._state == CircuitState.HALF_OPEN:
                if self._half_open_calls >= self.config.half_open_max_calls:
                    raise CircuitBreakerOpenError(
                        f"Circuit {self.name} is half-open, max test calls reached"
                    )
                self._half_open_calls += 1

    async def _on_success(self) -> None:
        """Handle a successful call"""
        async with self._lock:
            self._total_successes += 1
            self._last_success_time = datetime.now()

            if self._state == CircuitState.HALF_OPEN:
                self._success_count += 1
                if self._success_count >= self.config.success_threshold:
                    self._transition_to(CircuitState.CLOSED)
            elif self._state == CircuitState.CLOSED:
                # A success resets the failure counter while CLOSED.
                self._failure_count = 0

    async def _on_failure(self, error: Exception) -> None:
        """Handle a failed call"""
        async with self._lock:
            self._total_failures += 1
            self._last_failure_time = datetime.now()
            self._failure_count += 1

            if self._state == CircuitState.HALF_OPEN:
                # Any failure in half-open state opens the circuit
                self._transition_to(CircuitState.OPEN)
            elif self._state == CircuitState.CLOSED:
                if self._failure_count >= self.config.failure_threshold:
                    self._transition_to(CircuitState.OPEN)

    def _transition_to(self, new_state: CircuitState) -> None:
        """Transition to a new circuit state and notify callbacks.

        Must be called from code running inside an event loop, because
        callbacks are dispatched as a background task.
        """
        old_state = self._state
        self._state = new_state
        self._state_changed_at = datetime.now()

        if new_state == CircuitState.CLOSED:
            self._failure_count = 0
            self._success_count = 0
        elif new_state == CircuitState.HALF_OPEN:
            self._success_count = 0
            self._half_open_calls = 0

        logger.info(f"Circuit {self.name} transitioned from {old_state.value} to {new_state.value}")

        # Trigger callbacks asynchronously. The event loop keeps only weak
        # references to tasks, so hold a strong one until the task finishes;
        # otherwise it may be garbage-collected before the callbacks run.
        if self._callbacks.get("state_change"):
            task = asyncio.create_task(self._trigger_state_change(old_state, new_state))
            self._callback_tasks.add(task)
            task.add_done_callback(self._callback_tasks.discard)

    async def _trigger_state_change(
        self,
        old_state: CircuitState,
        new_state: CircuitState
    ) -> None:
        """Await every state-change callback; callback errors are logged, not raised."""
        for callback in self._callbacks.get("state_change", []):
            try:
                await callback(self.name, old_state, new_state)
            except Exception as e:
                logger.error(f"Circuit breaker callback error: {e}")

    def _should_count_exception(self, exc_type: Type[Exception]) -> bool:
        """Determine if an exception should be counted as a failure.

        The exclude list wins over the include list. When an include list is
        configured, only its members (and their subclasses) count.
        """
        # Check exclude list
        if any(issubclass(exc_type, excluded) for excluded in self.config.exclude_exceptions):
            return False

        # Check include list if specified
        if self.config.include_exceptions is not None:
            return any(
                issubclass(exc_type, included)
                for included in self.config.include_exceptions
            )

        return True

    def on_state_change(
        self,
        callback: Callable[[str, CircuitState, CircuitState], Awaitable[None]]
    ) -> None:
        """Register an async callback awaited with ``(name, old_state, new_state)``."""
        self._callbacks["state_change"].append(callback)

    def get_metrics(self) -> CircuitBreakerMetrics:
        """Get current circuit breaker metrics"""
        return CircuitBreakerMetrics(
            state=self._state,
            failure_count=self._failure_count,
            success_count=self._success_count,
            total_calls=self._total_calls,
            total_failures=self._total_failures,
            total_successes=self._total_successes,
            last_failure_time=self._last_failure_time,
            last_success_time=self._last_success_time,
            state_changed_at=self._state_changed_at
        )

    def reset(self) -> None:
        """Manually reset the circuit breaker to closed state.

        Lifetime totals are kept and no state-change callbacks are fired.
        """
        self._state = CircuitState.CLOSED
        self._failure_count = 0
        self._success_count = 0
        self._half_open_calls = 0
        self._state_changed_at = datetime.now()
        logger.info(f"Circuit {self.name} manually reset to CLOSED")
|
|
931
|
+
|
|
932
|
+
|
|
933
|
+
class CircuitBreakerOpenError(Exception):
    """Signals that a call was rejected because the circuit breaker is not admitting calls."""
|
|
936
|
+
|
|
937
|
+
|
|
938
|
+
class CircuitBreakerRegistry:
    """Keeps named ``CircuitBreaker`` instances so they can be shared by name."""

    def __init__(self):
        # name -> breaker
        self._breakers: Dict[str, CircuitBreaker] = {}

    def get_or_create(
        self,
        name: str,
        config: Optional[CircuitBreakerConfig] = None
    ) -> CircuitBreaker:
        """Return the breaker registered under ``name``, creating it on first use.

        ``config`` is only used when the breaker is created; it is ignored if
        a breaker with that name already exists.
        """
        existing = self._breakers.get(name)
        if existing is not None:
            return existing

        created = CircuitBreaker(name=name, config=config)
        self._breakers[name] = created
        return created

    def get(self, name: str) -> Optional[CircuitBreaker]:
        """Return the breaker registered under ``name``, or None."""
        return self._breakers.get(name)

    def get_all_metrics(self) -> Dict[str, CircuitBreakerMetrics]:
        """Return a metrics snapshot for every registered breaker, keyed by name."""
        snapshot = {}
        for breaker_name, breaker in self._breakers.items():
            snapshot[breaker_name] = breaker.get_metrics()
        return snapshot
|
|
961
|
+
|
|
962
|
+
|
|
963
|
+
# ============================================================================
|
|
964
|
+
# ACP-004: Agent Scaling
|
|
965
|
+
# ============================================================================
|
|
966
|
+
|
|
967
|
+
@dataclass
class ScalingConfig:
    """Settings for horizontal agent scaling."""

    # Bounds on the number of replicas.
    min_replicas: int = 1
    max_replicas: int = 10
    # Utilization targets, expressed as fractions (0.7 == 70%).
    target_cpu_utilization: float = 0.7
    target_memory_utilization: float = 0.8
    # Utilization fractions used as scale-up / scale-down thresholds.
    scale_up_threshold: float = 0.8
    scale_down_threshold: float = 0.3
    # Cooldown periods, in seconds, for each scaling direction.
    scale_up_cooldown_seconds: float = 60.0
    scale_down_cooldown_seconds: float = 300.0
    # Step sizes, in replicas, for each scaling direction.
    scale_up_increment: int = 1
    scale_down_increment: int = 1
|
|
980
|
+
|
|
981
|
+
|
|
982
|
+
@dataclass
class AgentReplica:
    """Bookkeeping record for one running copy of an agent."""

    # Unique id of this replica.
    replica_id: str
    # Identifier of the agent this replica belongs to.
    agent_id: str
    # The live agent object itself; its concrete type is not constrained here.
    instance: Any
    # Creation time (naive local time), filled in automatically.
    created_at: datetime = field(default_factory=datetime.now)
    # Lifecycle state; a record starts as PENDING unless told otherwise.
    status: AgentState = AgentState.PENDING
    # Latest reported metrics by name; every record gets its own dict.
    metrics: Dict[str, float] = field(default_factory=dict)
|
|
991
|
+
|
|
992
|
+
|
|
993
|
+
class AgentScaler:
|
|
994
|
+
"""
|
|
995
|
+
Horizontal scaling manager for agents.
|
|
996
|
+
|
|
997
|
+
Provides automatic scaling based on load metrics, supporting both
|
|
998
|
+
scale-up and scale-down with configurable thresholds and cooldowns.
|
|
999
|
+
|
|
1000
|
+
Features:
|
|
1001
|
+
- Automatic scale-up/scale-down based on utilization
|
|
1002
|
+
- Configurable min/max replicas
|
|
1003
|
+
- Load balancing across replicas
|
|
1004
|
+
- Cooldown periods to prevent thrashing
|
|
1005
|
+
|
|
1006
|
+
Usage:
|
|
1007
|
+
scaler = AgentScaler()
|
|
1008
|
+
|
|
1009
|
+
# Register agent type with factory
|
|
1010
|
+
scaler.register_agent_type(
|
|
1011
|
+
agent_type="claims_agent",
|
|
1012
|
+
factory=create_claims_agent,
|
|
1013
|
+
config=ScalingConfig(min_replicas=2, max_replicas=10)
|
|
1014
|
+
)
|
|
1015
|
+
|
|
1016
|
+
# Get available replica
|
|
1017
|
+
agent = await scaler.get_replica("claims_agent")
|
|
1018
|
+
|
|
1019
|
+
# Manual scaling
|
|
1020
|
+
await scaler.scale_to("claims_agent", replicas=5)
|
|
1021
|
+
"""
|
|
1022
|
+
|
|
1023
|
+
def __init__(self):
|
|
1024
|
+
self._agent_types: Dict[str, Dict[str, Any]] = {}
|
|
1025
|
+
self._replicas: Dict[str, Dict[str, AgentReplica]] = defaultdict(dict)
|
|
1026
|
+
self._last_scale_up: Dict[str, datetime] = {}
|
|
1027
|
+
self._last_scale_down: Dict[str, datetime] = {}
|
|
1028
|
+
self._load_balancer_index: Dict[str, int] = defaultdict(int)
|
|
1029
|
+
self._lock = asyncio.Lock()
|
|
1030
|
+
self._running = False
|
|
1031
|
+
self._scaling_task: Optional[asyncio.Task] = None
|
|
1032
|
+
|
|
1033
|
+
def register_agent_type(
|
|
1034
|
+
self,
|
|
1035
|
+
agent_type: str,
|
|
1036
|
+
factory: Callable[[], Any],
|
|
1037
|
+
config: Optional[ScalingConfig] = None,
|
|
1038
|
+
replicas: int = 1
|
|
1039
|
+
) -> None:
|
|
1040
|
+
"""Register an agent type for scaling"""
|
|
1041
|
+
config = config or ScalingConfig()
|
|
1042
|
+
self._agent_types[agent_type] = {
|
|
1043
|
+
"factory": factory,
|
|
1044
|
+
"config": config,
|
|
1045
|
+
"target_replicas": max(config.min_replicas, replicas)
|
|
1046
|
+
}
|
|
1047
|
+
logger.info(f"Registered agent type {agent_type} for scaling")
|
|
1048
|
+
|
|
1049
|
+
async def start(self) -> None:
|
|
1050
|
+
"""Start the scaling manager"""
|
|
1051
|
+
if self._running:
|
|
1052
|
+
return
|
|
1053
|
+
|
|
1054
|
+
self._running = True
|
|
1055
|
+
|
|
1056
|
+
# Initialize replicas for all registered types
|
|
1057
|
+
for agent_type, info in self._agent_types.items():
|
|
1058
|
+
await self.scale_to(agent_type, info["target_replicas"])
|
|
1059
|
+
|
|
1060
|
+
# Start autoscaling loop
|
|
1061
|
+
self._scaling_task = asyncio.create_task(self._autoscaling_loop())
|
|
1062
|
+
logger.info("Agent scaler started")
|
|
1063
|
+
|
|
1064
|
+
async def stop(self) -> None:
|
|
1065
|
+
"""Stop the scaling manager"""
|
|
1066
|
+
self._running = False
|
|
1067
|
+
if self._scaling_task:
|
|
1068
|
+
self._scaling_task.cancel()
|
|
1069
|
+
try:
|
|
1070
|
+
await self._scaling_task
|
|
1071
|
+
except asyncio.CancelledError:
|
|
1072
|
+
pass
|
|
1073
|
+
|
|
1074
|
+
# Stop all replicas
|
|
1075
|
+
for agent_type in list(self._replicas.keys()):
|
|
1076
|
+
await self.scale_to(agent_type, 0)
|
|
1077
|
+
|
|
1078
|
+
logger.info("Agent scaler stopped")
|
|
1079
|
+
|
|
1080
|
+
async def scale_to(self, agent_type: str, replicas: int) -> None:
|
|
1081
|
+
"""Scale an agent type to a specific number of replicas"""
|
|
1082
|
+
if agent_type not in self._agent_types:
|
|
1083
|
+
raise ValueError(f"Unknown agent type: {agent_type}")
|
|
1084
|
+
|
|
1085
|
+
async with self._lock:
|
|
1086
|
+
config = self._agent_types[agent_type]["config"]
|
|
1087
|
+
replicas = max(0, min(replicas, config.max_replicas))
|
|
1088
|
+
|
|
1089
|
+
current_count = len(self._replicas[agent_type])
|
|
1090
|
+
|
|
1091
|
+
if replicas > current_count:
|
|
1092
|
+
# Scale up
|
|
1093
|
+
for _ in range(replicas - current_count):
|
|
1094
|
+
await self._create_replica(agent_type)
|
|
1095
|
+
elif replicas < current_count:
|
|
1096
|
+
# Scale down
|
|
1097
|
+
to_remove = current_count - replicas
|
|
1098
|
+
replica_ids = list(self._replicas[agent_type].keys())[:to_remove]
|
|
1099
|
+
for replica_id in replica_ids:
|
|
1100
|
+
await self._remove_replica(agent_type, replica_id)
|
|
1101
|
+
|
|
1102
|
+
self._agent_types[agent_type]["target_replicas"] = replicas
|
|
1103
|
+
logger.info(f"Scaled {agent_type} to {replicas} replicas")
|
|
1104
|
+
|
|
1105
|
+
async def scale_up(self, agent_type: str, count: int = 1) -> None:
|
|
1106
|
+
"""Scale up an agent type by adding replicas"""
|
|
1107
|
+
current = len(self._replicas.get(agent_type, {}))
|
|
1108
|
+
await self.scale_to(agent_type, current + count)
|
|
1109
|
+
|
|
1110
|
+
async def scale_down(self, agent_type: str, count: int = 1) -> None:
|
|
1111
|
+
"""Scale down an agent type by removing replicas"""
|
|
1112
|
+
current = len(self._replicas.get(agent_type, {}))
|
|
1113
|
+
await self.scale_to(agent_type, max(0, current - count))
|
|
1114
|
+
|
|
1115
|
+
async def _create_replica(self, agent_type: str) -> AgentReplica:
|
|
1116
|
+
"""Create a new replica for an agent type"""
|
|
1117
|
+
factory = self._agent_types[agent_type]["factory"]
|
|
1118
|
+
replica_id = f"{agent_type}-{uuid.uuid4().hex[:8]}"
|
|
1119
|
+
|
|
1120
|
+
instance = factory()
|
|
1121
|
+
if asyncio.iscoroutine(instance):
|
|
1122
|
+
instance = await instance
|
|
1123
|
+
|
|
1124
|
+
# Start the agent if it has a start method
|
|
1125
|
+
if hasattr(instance, 'start'):
|
|
1126
|
+
if asyncio.iscoroutinefunction(instance.start):
|
|
1127
|
+
await instance.start()
|
|
1128
|
+
else:
|
|
1129
|
+
instance.start()
|
|
1130
|
+
|
|
1131
|
+
replica = AgentReplica(
|
|
1132
|
+
replica_id=replica_id,
|
|
1133
|
+
agent_id=agent_type,
|
|
1134
|
+
instance=instance,
|
|
1135
|
+
status=AgentState.RUNNING
|
|
1136
|
+
)
|
|
1137
|
+
|
|
1138
|
+
self._replicas[agent_type][replica_id] = replica
|
|
1139
|
+
logger.info(f"Created replica {replica_id} for {agent_type}")
|
|
1140
|
+
return replica
|
|
1141
|
+
|
|
1142
|
+
async def _remove_replica(self, agent_type: str, replica_id: str) -> None:
|
|
1143
|
+
"""Remove a replica"""
|
|
1144
|
+
replica = self._replicas[agent_type].pop(replica_id, None)
|
|
1145
|
+
if replica and replica.instance:
|
|
1146
|
+
# Stop the agent if it has a stop method
|
|
1147
|
+
if hasattr(replica.instance, 'stop'):
|
|
1148
|
+
if asyncio.iscoroutinefunction(replica.instance.stop):
|
|
1149
|
+
await replica.instance.stop()
|
|
1150
|
+
else:
|
|
1151
|
+
replica.instance.stop()
|
|
1152
|
+
logger.info(f"Removed replica {replica_id} from {agent_type}")
|
|
1153
|
+
|
|
1154
|
+
async def get_replica(self, agent_type: str) -> Optional[Any]:
|
|
1155
|
+
"""Get an available replica using round-robin load balancing"""
|
|
1156
|
+
replicas = self._replicas.get(agent_type, {})
|
|
1157
|
+
if not replicas:
|
|
1158
|
+
return None
|
|
1159
|
+
|
|
1160
|
+
# Round-robin selection
|
|
1161
|
+
replica_list = list(replicas.values())
|
|
1162
|
+
running_replicas = [r for r in replica_list if r.status == AgentState.RUNNING]
|
|
1163
|
+
|
|
1164
|
+
if not running_replicas:
|
|
1165
|
+
return None
|
|
1166
|
+
|
|
1167
|
+
index = self._load_balancer_index[agent_type] % len(running_replicas)
|
|
1168
|
+
self._load_balancer_index[agent_type] += 1
|
|
1169
|
+
|
|
1170
|
+
return running_replicas[index].instance
|
|
1171
|
+
|
|
1172
|
+
async def _autoscaling_loop(self) -> None:
|
|
1173
|
+
"""Background loop for automatic scaling"""
|
|
1174
|
+
while self._running:
|
|
1175
|
+
try:
|
|
1176
|
+
for agent_type, info in self._agent_types.items():
|
|
1177
|
+
config = info["config"]
|
|
1178
|
+
replicas = self._replicas.get(agent_type, {})
|
|
1179
|
+
|
|
1180
|
+
if not replicas:
|
|
1181
|
+
continue
|
|
1182
|
+
|
|
1183
|
+
# Calculate average utilization
|
|
1184
|
+
total_cpu = sum(r.metrics.get("cpu", 0) for r in replicas.values())
|
|
1185
|
+
avg_cpu = total_cpu / len(replicas) if replicas else 0
|
|
1186
|
+
|
|
1187
|
+
now = datetime.now()
|
|
1188
|
+
|
|
1189
|
+
# Check scale up
|
|
1190
|
+
if avg_cpu > config.scale_up_threshold:
|
|
1191
|
+
last_scale = self._last_scale_up.get(agent_type, datetime.min)
|
|
1192
|
+
if (now - last_scale).total_seconds() > config.scale_up_cooldown_seconds:
|
|
1193
|
+
if len(replicas) < config.max_replicas:
|
|
1194
|
+
await self.scale_up(agent_type, config.scale_up_increment)
|
|
1195
|
+
self._last_scale_up[agent_type] = now
|
|
1196
|
+
|
|
1197
|
+
# Check scale down
|
|
1198
|
+
elif avg_cpu < config.scale_down_threshold:
|
|
1199
|
+
last_scale = self._last_scale_down.get(agent_type, datetime.min)
|
|
1200
|
+
if (now - last_scale).total_seconds() > config.scale_down_cooldown_seconds:
|
|
1201
|
+
if len(replicas) > config.min_replicas:
|
|
1202
|
+
await self.scale_down(agent_type, config.scale_down_increment)
|
|
1203
|
+
self._last_scale_down[agent_type] = now
|
|
1204
|
+
|
|
1205
|
+
except Exception as e:
|
|
1206
|
+
logger.error(f"Autoscaling loop error: {e}")
|
|
1207
|
+
|
|
1208
|
+
await asyncio.sleep(10) # Check every 10 seconds
|
|
1209
|
+
|
|
1210
|
+
def update_replica_metrics(
|
|
1211
|
+
self,
|
|
1212
|
+
agent_type: str,
|
|
1213
|
+
replica_id: str,
|
|
1214
|
+
metrics: Dict[str, float]
|
|
1215
|
+
) -> None:
|
|
1216
|
+
"""Update metrics for a replica"""
|
|
1217
|
+
if agent_type in self._replicas and replica_id in self._replicas[agent_type]:
|
|
1218
|
+
self._replicas[agent_type][replica_id].metrics.update(metrics)
|
|
1219
|
+
|
|
1220
|
+
def get_replica_count(self, agent_type: str) -> int:
|
|
1221
|
+
"""Get the current replica count for an agent type"""
|
|
1222
|
+
return len(self._replicas.get(agent_type, {}))
|
|
1223
|
+
|
|
1224
|
+
def get_all_replicas(self, agent_type: str) -> List[AgentReplica]:
|
|
1225
|
+
"""Get all replicas for an agent type"""
|
|
1226
|
+
return list(self._replicas.get(agent_type, {}).values())
|
|
1227
|
+
|
|
1228
|
+
|
|
1229
|
+
# ============================================================================
|
|
1230
|
+
# ACP-005: Distributed Coordination
|
|
1231
|
+
# ============================================================================
|
|
1232
|
+
|
|
1233
|
+
@dataclass
class LeaderElectionConfig:
    """Timing parameters for leader election (all values in seconds)."""

    # Pause between two heartbeat rounds while this node is the leader.
    heartbeat_interval_seconds: float = 1.0
    # Lower / upper bound of the randomly drawn election timeout.
    election_timeout_min_seconds: float = 3.0
    election_timeout_max_seconds: float = 5.0
    # How far into the future the leader lease is pushed on each renewal.
    lease_duration_seconds: float = 15.0
|
|
1240
|
+
|
|
1241
|
+
|
|
1242
|
+
@dataclass
class LeaderInfo:
    """Snapshot describing the leader as seen by one node."""

    # Node id of the leader.
    leader_id: str
    # Timestamp reported as the moment of election.
    elected_at: datetime
    # Moment at which the leader's lease runs out.
    lease_expires_at: datetime
    # Election term the snapshot belongs to.
    term: int
|
|
1249
|
+
|
|
1250
|
+
|
|
1251
|
+
class DistributedCoordinator:
    """
    Distributed coordination for stateful operations.

    Implements leader election and basic consensus for coordinating
    multiple agent instances.

    Features:
    - Leader election using Raft-like protocol
    - Distributed locks
    - Heartbeat-based failure detection
    - Automatic leader failover

    Note: no network transport is implemented here. Heartbeats and vote
    requests are not actually sent to peers, and locks are in-process
    ``asyncio.Lock`` objects.

    Usage:
        coordinator = DistributedCoordinator(node_id="node-1")

        # Start coordination
        await coordinator.start()

        # Check if leader
        if coordinator.is_leader:
            # Perform leader-only operations
            ...

        # Acquire distributed lock
        async with coordinator.lock("resource-1"):
            # Critical section
            ...
    """

    def __init__(
        self,
        node_id: str,
        config: Optional["LeaderElectionConfig"] = None,
        peers: Optional[List[str]] = None
    ):
        self.node_id = node_id
        self.config = config or LeaderElectionConfig()
        self.peers = peers or []

        self._role = CoordinationRole.FOLLOWER
        self._current_term = 0
        self._voted_for: Optional[str] = None
        self._leader_id: Optional[str] = None
        self._leader_lease_expires: Optional[datetime] = None
        # When the current leader was elected (or first observed by this node).
        self._leader_elected_at: Optional[datetime] = None

        # Doubles as the election timer: reset on heartbeat and on election start.
        self._last_heartbeat = datetime.now()
        self._election_timeout = self._random_election_timeout()

        # resource_id -> lock / id of the node currently holding it
        self._locks: Dict[str, asyncio.Lock] = {}
        self._lock_holders: Dict[str, str] = {}

        self._running = False
        self._tasks: List[asyncio.Task] = []
        self._callbacks: Dict[str, List[Callable]] = defaultdict(list)
        # Guards role/term changes made while starting an election.
        self._lock = asyncio.Lock()

    def _random_election_timeout(self) -> float:
        """Draw an election timeout uniformly between the configured bounds."""
        import random
        return random.uniform(
            self.config.election_timeout_min_seconds,
            self.config.election_timeout_max_seconds
        )

    @property
    def is_leader(self) -> bool:
        """Check if this node is the leader"""
        return self._role == CoordinationRole.LEADER

    @property
    def role(self) -> "CoordinationRole":
        """Get current role"""
        return self._role

    @property
    def leader_id(self) -> Optional[str]:
        """Get the current leader ID"""
        return self._leader_id

    async def start(self) -> None:
        """Start the coordinator.

        Launches the election loop; a node without peers becomes leader
        immediately. Does nothing if already running.
        """
        if self._running:
            return

        self._running = True
        # Time spent between construction and start() must not count
        # towards the election timeout.
        self._last_heartbeat = datetime.now()
        self._tasks.append(asyncio.create_task(self._election_loop()))

        # If no peers, become leader immediately
        if not self.peers:
            await self._become_leader()

        logger.info(f"Distributed coordinator started for node {self.node_id}")

    async def stop(self) -> None:
        """Stop the coordinator by cancelling its background tasks."""
        self._running = False
        for task in self._tasks:
            task.cancel()
            try:
                await task
            except asyncio.CancelledError:
                pass
        self._tasks.clear()
        logger.info(f"Distributed coordinator stopped for node {self.node_id}")

    async def _election_loop(self) -> None:
        """Main election and heartbeat loop"""
        while self._running:
            try:
                if self._role == CoordinationRole.LEADER:
                    # Leaders renew their lease once per heartbeat interval.
                    await self._send_heartbeats()
                    await asyncio.sleep(self.config.heartbeat_interval_seconds)
                else:
                    # Followers/candidates poll for an expired election timer.
                    elapsed = (datetime.now() - self._last_heartbeat).total_seconds()
                    if elapsed > self._election_timeout:
                        await self._start_election()
                    await asyncio.sleep(0.1)

            except Exception as e:
                logger.error(f"Election loop error: {e}")
                await asyncio.sleep(1)

    async def _start_election(self) -> None:
        """Start a leader election for the next term."""
        async with self._lock:
            self._role = CoordinationRole.CANDIDATE
            self._current_term += 1
            self._voted_for = self.node_id
            self._election_timeout = self._random_election_timeout()
            # A candidate restarts its election timer. Without this, a node
            # that cannot win would begin a new election (and bump the term)
            # on every 0.1 s poll of the election loop.
            self._last_heartbeat = datetime.now()

            logger.info(f"Node {self.node_id} starting election for term {self._current_term}")

            # In a real implementation, request votes from peers.
            if not self.peers:
                await self._become_leader()
            else:
                votes_received = 1  # Vote for self
                votes_needed = (len(self.peers) + 1) // 2 + 1

                # No RequestVote RPCs are sent, so with peers configured the
                # self-vote alone never reaches a majority and the node
                # stays a candidate until the next timeout.
                if votes_received >= votes_needed:
                    await self._become_leader()

    async def _become_leader(self) -> None:
        """Transition to leader role and fire the ``leader_elected`` callbacks."""
        now = datetime.now()
        self._role = CoordinationRole.LEADER
        self._leader_id = self.node_id
        self._leader_elected_at = now
        self._leader_lease_expires = now + timedelta(
            seconds=self.config.lease_duration_seconds
        )

        logger.info(f"Node {self.node_id} became leader for term {self._current_term}")
        await self._trigger_callbacks("leader_elected", self.node_id)

    async def _send_heartbeats(self) -> None:
        """Renew the leader lease; nothing is sent to peers here."""
        self._leader_lease_expires = datetime.now() + timedelta(
            seconds=self.config.lease_duration_seconds
        )
        # In real implementation: send AppendEntries RPCs to peers

    def receive_heartbeat(self, leader_id: str, term: int) -> None:
        """Receive a heartbeat from the leader.

        Heartbeats from an older term are ignored. Otherwise this node
        adopts the term, becomes a follower of ``leader_id`` and resets its
        election timer.
        """
        if term >= self._current_term:
            now = datetime.now()
            if leader_id != self._leader_id or term != self._current_term:
                # New leader or new term: remember when we first saw it.
                self._leader_elected_at = now
            self._current_term = term
            self._role = CoordinationRole.FOLLOWER
            self._leader_id = leader_id
            self._last_heartbeat = now
            self._voted_for = None

    async def acquire_lock(self, resource_id: str, timeout: float = 30.0) -> bool:
        """Acquire a distributed lock.

        Returns:
            ``True`` once the lock is held, ``False`` if it could not be
            acquired within ``timeout`` seconds.
        """
        if resource_id not in self._locks:
            self._locks[resource_id] = asyncio.Lock()

        try:
            acquired = await asyncio.wait_for(
                self._locks[resource_id].acquire(),
                timeout=timeout
            )
        except asyncio.TimeoutError:
            return False

        if acquired:
            self._lock_holders[resource_id] = self.node_id
            logger.debug(f"Node {self.node_id} acquired lock on {resource_id}")
        return acquired

    def release_lock(self, resource_id: str) -> None:
        """Release a distributed lock; a no-op if it is not currently held."""
        if resource_id in self._locks and self._locks[resource_id].locked():
            self._locks[resource_id].release()
            self._lock_holders.pop(resource_id, None)
            logger.debug(f"Node {self.node_id} released lock on {resource_id}")

    def lock(self, resource_id: str, timeout: float = 30.0):
        """Context manager for distributed lock"""
        return DistributedLockContext(self, resource_id, timeout)

    def on_event(
        self,
        event: str,
        callback: Callable[[str], Awaitable[None]]
    ) -> None:
        """Register a callback for coordination events.

        The callback may be a coroutine function or a plain function.
        """
        self._callbacks[event].append(callback)

    async def _trigger_callbacks(self, event: str, *args) -> None:
        """Run every callback registered for ``event``; failures are logged."""
        import inspect

        # Iterate over a copy so a callback may register further callbacks.
        for callback in list(self._callbacks.get(event, [])):
            try:
                outcome = callback(*args)
                # Plain functions return a non-awaitable; awaiting that
                # would only produce a spurious error log.
                if inspect.isawaitable(outcome):
                    await outcome
            except Exception as e:
                logger.error(f"Coordination callback error: {e}")

    def get_leader_info(self) -> Optional["LeaderInfo"]:
        """Get information about the current leader, or ``None`` if unknown.

        ``elected_at`` is the time this node became leader or, on a
        follower, the time it first saw the current leader/term.
        """
        if not self._leader_id:
            return None

        now = datetime.now()
        return LeaderInfo(
            leader_id=self._leader_id,
            elected_at=self._leader_elected_at or now,
            lease_expires_at=self._leader_lease_expires or now,
            term=self._current_term
        )
|
|
1481
|
+
|
|
1482
|
+
|
|
1483
|
+
class DistributedLockContext:
    """Async context manager that holds one coordinator lock for its body."""

    def __init__(
        self,
        coordinator: DistributedCoordinator,
        resource_id: str,
        timeout: float
    ):
        self._coordinator, self._resource_id, self._timeout = (
            coordinator,
            resource_id,
            timeout,
        )

    async def __aenter__(self):
        """Acquire the lock; raise ``TimeoutError`` if the coordinator reports failure."""
        if await self._coordinator.acquire_lock(self._resource_id, self._timeout):
            return self
        raise TimeoutError(f"Failed to acquire lock on {self._resource_id}")

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Release the lock; exceptions from the body are never suppressed."""
        self._coordinator.release_lock(self._resource_id)
        return False
|
|
1505
|
+
|
|
1506
|
+
|
|
1507
|
+
# ============================================================================
|
|
1508
|
+
# ACP-006: Agent Dependency Graph
|
|
1509
|
+
# ============================================================================
|
|
1510
|
+
|
|
1511
|
+
@dataclass
class AgentDependency:
    """Dependency declaration for a single agent."""

    # The agent being described.
    agent_id: str
    # Required dependencies (ids of other agents).
    depends_on: List[str]
    # Optional dependencies; every instance gets its own empty list by default.
    optional_depends_on: List[str] = field(default_factory=list)
    # NOTE(review): presumably the time budget for this agent's startup, in
    # seconds - no consumer of this value is visible here; confirm.
    startup_timeout_seconds: float = 60.0
|
|
1518
|
+
|
|
1519
|
+
|
|
1520
|
+
class DependencyGraph:
    """
    Manages agent startup order based on dependencies.

    Ensures agents start in the correct order, respecting dependencies
    and detecting circular dependencies.

    Features:
    - Topological sorting for startup order
    - Circular dependency detection
    - Optional vs required dependencies
    - Parallel startup where possible

    Only required dependencies (``depends_on``) take part in ordering and
    cycle detection; optional ones are recorded but do not add edges.

    Usage:
        graph = DependencyGraph()

        graph.add_agent("api-server", depends_on=["database", "cache"])
        graph.add_agent("database", depends_on=[])
        graph.add_agent("cache", depends_on=[])

        # Get startup order
        order = graph.get_startup_order()
        # Returns: ["database", "cache", "api-server"]
    """

    def __init__(self):
        self._agents: Dict[str, AgentDependency] = {}
        self._graph: Dict[str, Set[str]] = defaultdict(set)  # agent -> depends_on
        self._reverse_graph: Dict[str, Set[str]] = defaultdict(set)  # agent -> depended_by

    def add_agent(
        self,
        agent_id: str,
        depends_on: Optional[List[str]] = None,
        optional_depends_on: Optional[List[str]] = None,
        startup_timeout: float = 60.0
    ) -> None:
        """Add an agent with its dependencies.

        Registering an ``agent_id`` that already exists replaces its previous
        declaration, including the edges it contributed to the graph.
        """
        # Copy so later mutation of the caller's lists cannot desync the graph.
        depends_on = list(depends_on or [])
        optional_depends_on = list(optional_depends_on or [])

        # Drop edges left over from an earlier registration; otherwise stale
        # dependencies would linger and could even produce false cycles.
        self._unlink(agent_id)

        self._agents[agent_id] = AgentDependency(
            agent_id=agent_id,
            depends_on=depends_on,
            optional_depends_on=optional_depends_on,
            startup_timeout_seconds=startup_timeout
        )

        for dep in depends_on:
            self._graph[agent_id].add(dep)
            self._reverse_graph[dep].add(agent_id)

        logger.debug(f"Added agent {agent_id} with dependencies: {depends_on}")

    def _unlink(self, agent_id: str) -> None:
        """Remove every outgoing dependency edge of ``agent_id`` from both graphs."""
        for dep in self._graph.pop(agent_id, ()):
            dependents = self._reverse_graph.get(dep)
            if dependents is None:
                continue
            dependents.discard(agent_id)
            if not dependents:
                del self._reverse_graph[dep]

    def remove_agent(self, agent_id: str) -> None:
        """Remove an agent from the dependency graph.

        Agents that still list ``agent_id`` as a dependency keep doing so;
        ``validate()`` will then report it as missing.
        """
        if agent_id in self._agents:
            self._unlink(agent_id)
            del self._agents[agent_id]

    def get_dependencies(self, agent_id: str) -> List[str]:
        """Get all dependencies (required, then optional) for an agent."""
        agent = self._agents.get(agent_id)
        if agent:
            return agent.depends_on + agent.optional_depends_on
        return []

    def get_dependents(self, agent_id: str) -> List[str]:
        """Get all agents that depend on this agent"""
        return list(self._reverse_graph.get(agent_id, set()))

    def has_circular_dependency(self) -> bool:
        """Check if there are any circular dependencies.

        Uses an explicit-stack depth-first search so that long dependency
        chains cannot exhaust Python's recursion limit.
        """
        visited: Set[str] = set()
        on_path: Set[str] = set()  # nodes on the current DFS path

        for root in self._agents:
            if root in visited:
                continue

            visited.add(root)
            on_path.add(root)
            stack = [(root, iter(self._graph.get(root, ())))]

            while stack:
                node, neighbors = stack[-1]
                for neighbor in neighbors:
                    if neighbor in on_path:
                        return True  # back edge -> cycle
                    if neighbor not in visited:
                        visited.add(neighbor)
                        on_path.add(neighbor)
                        stack.append((neighbor, iter(self._graph.get(neighbor, ()))))
                        break
                else:
                    # All neighbours explored: leave the current path.
                    on_path.discard(node)
                    stack.pop()

        return False

    def get_startup_order(self) -> List[str]:
        """
        Get the startup order using topological sort.

        Returns agents in order such that dependencies are started first.
        Dependencies on agents that were never registered are ignored.
        The result is deterministic for a given registration sequence.
        Raises ValueError if there are circular dependencies.
        """
        if self.has_circular_dependency():
            raise ValueError("Circular dependency detected in agent graph")

        # Kahn's algorithm; only edges to registered agents count.
        in_degree = {
            agent_id: sum(1 for dep in self._graph.get(agent_id, ()) if dep in self._agents)
            for agent_id in self._agents
        }

        # Start with agents that have no (registered) dependencies.
        queue = deque(a for a, degree in in_degree.items() if degree == 0)
        result = []

        while queue:
            agent_id = queue.popleft()
            result.append(agent_id)

            # Sorted so the order does not depend on set iteration order.
            for dependent in sorted(self._reverse_graph.get(agent_id, ())):
                if dependent in in_degree:
                    in_degree[dependent] -= 1
                    if in_degree[dependent] == 0:
                        queue.append(dependent)

        return result

    def get_parallel_startup_groups(self) -> List[List[str]]:
        """
        Get groups of agents that can be started in parallel.

        Returns a list of groups, where agents within a group can start
        simultaneously, but groups must be started in order. Agents appear
        within a group in registration order.
        Raises ValueError if there are circular dependencies.
        """
        if self.has_circular_dependency():
            raise ValueError("Circular dependency detected")

        result = []
        pending = list(self._agents)  # registration order
        started: Set[str] = set()

        while pending:
            # Agents whose registered dependencies have all been started.
            group = [
                agent_id
                for agent_id in pending
                if all(
                    dep in started or dep not in self._agents
                    for dep in self._graph.get(agent_id, ())
                )
            ]

            if not group:
                raise ValueError("Unable to resolve dependencies")

            result.append(group)
            started.update(group)
            pending = [agent_id for agent_id in pending if agent_id not in started]

        return result

    def get_shutdown_order(self) -> List[str]:
        """Get the shutdown order (reverse of startup order)"""
        return list(reversed(self.get_startup_order()))

    def validate(self) -> List[str]:
        """
        Validate the dependency graph.

        Returns a list of validation errors, or empty list if valid.
        """
        errors = []

        if self.has_circular_dependency():
            errors.append("Circular dependency detected")

        # Required dependencies must themselves be registered.
        for agent_id, agent in self._agents.items():
            for dep in agent.depends_on:
                if dep not in self._agents:
                    errors.append(f"Agent {agent_id} depends on missing agent {dep}")

        return errors
|
|
1709
|
+
|
|
1710
|
+
|
|
1711
|
+
# ============================================================================
|
|
1712
|
+
# ACP-007: Graceful Shutdown
|
|
1713
|
+
# ============================================================================
|
|
1714
|
+
|
|
1715
|
+
@dataclass
class ShutdownConfig:
    """Settings controlling a graceful shutdown."""

    # NOTE(review): judging by the names, these are the time budgets (in
    # seconds) for draining in-flight work and for forcing termination - the
    # code consuming them is not visible here; confirm.
    drain_timeout_seconds: float = 30.0
    force_timeout_seconds: float = 60.0

    # NOTE(review): presumably toggles for checkpointing and for persisting
    # in-flight operations during shutdown - confirm against the manager.
    checkpoint_enabled: bool = True
    save_in_flight: bool = True
|
|
1722
|
+
|
|
1723
|
+
|
|
1724
|
+
@dataclass
class InFlightOperation:
    """Record of an operation that has started but not yet completed."""

    # Unique identifier of the operation.
    operation_id: str
    # Agent performing the operation.
    agent_id: str
    # Free-form label for the kind of work (e.g. "verification").
    operation_type: str
    # When the operation was registered.
    started_at: datetime
    # Arbitrary payload attached by the caller; defaults to a fresh dict.
    data: Dict[str, Any] = field(default_factory=dict)
|
|
1732
|
+
|
|
1733
|
+
|
|
1734
|
+
class GracefulShutdownManager:
    """
    Coordinate an orderly shutdown while tracking in-flight operations.

    A shutdown walks through the ShutdownPhase values
    RUNNING -> DRAINING -> STOPPING -> TERMINATED:

    - DRAINING: wait up to ``config.drain_timeout_seconds`` for all
      registered operations to be completed. If the wait times out the
      leftovers are reported and, when ``config.save_in_flight`` is set,
      checkpointed.
    - STOPPING: run every registered shutdown hook, each bounded by
      ``config.force_timeout_seconds``.
    - TERMINATED: final phase; the summary dict is returned.

    Usage:
        shutdown_manager = GracefulShutdownManager(
            config=ShutdownConfig(drain_timeout_seconds=30)
        )

        # Register in-flight operation
        op_id = shutdown_manager.register_operation(
            agent_id="claims-agent",
            operation_type="verification",
            data={"claim_id": "123"}
        )

        # Complete operation
        shutdown_manager.complete_operation(op_id)

        # Initiate graceful shutdown
        await shutdown_manager.shutdown()
    """

    def __init__(self, config: Optional[ShutdownConfig] = None):
        """Create a manager in the RUNNING phase with no tracked operations."""
        self.config = config or ShutdownConfig()
        self._phase = ShutdownPhase.RUNNING
        # operation_id -> operation record for everything not yet completed.
        self._in_flight: Dict[str, InFlightOperation] = {}
        self._shutdown_hooks: List[Callable[[], Awaitable[None]]] = []
        # operation_id -> serialisable snapshot taken on drain timeout.
        self._checkpoint_data: Dict[str, Any] = {}
        self._lock = asyncio.Lock()
        # Set by complete_operation() once the last operation drains.
        self._shutdown_event = asyncio.Event()

    @property
    def phase(self) -> ShutdownPhase:
        """Current shutdown phase."""
        return self._phase

    @property
    def is_shutting_down(self) -> bool:
        """True as soon as the manager has left the RUNNING phase."""
        return self._phase != ShutdownPhase.RUNNING

    def register_operation(
        self,
        agent_id: str,
        operation_type: str,
        data: Optional[Dict[str, Any]] = None
    ) -> str:
        """
        Start tracking a new in-flight operation.

        Returns the generated operation id (a UUID4 string).

        Raises:
            RuntimeError: if the manager is no longer in the RUNNING phase.
        """
        if self._phase != ShutdownPhase.RUNNING:
            raise RuntimeError("Cannot register new operations during shutdown")

        new_id = str(uuid.uuid4())
        record = InFlightOperation(
            operation_id=new_id,
            agent_id=agent_id,
            operation_type=operation_type,
            started_at=datetime.now(),
            data=data or {},
        )
        self._in_flight[new_id] = record
        return new_id

    def complete_operation(self, operation_id: str) -> None:
        """Stop tracking an operation; unknown ids are ignored."""
        self._in_flight.pop(operation_id, None)

        # Wake a draining shutdown() once nothing is left in flight.
        nothing_pending = not self._in_flight
        if nothing_pending and self._phase == ShutdownPhase.DRAINING:
            self._shutdown_event.set()

    def get_in_flight_count(self) -> int:
        """Number of operations currently tracked as in flight."""
        return len(self._in_flight)

    def get_in_flight_operations(self) -> List[InFlightOperation]:
        """Snapshot list of the operations currently in flight."""
        return list(self._in_flight.values())

    def add_shutdown_hook(
        self,
        hook: Callable[[], Awaitable[None]]
    ) -> None:
        """Append a hook; hooks run in registration order during shutdown."""
        self._shutdown_hooks.append(hook)

    async def shutdown(self) -> Dict[str, Any]:
        """
        Run the graceful shutdown sequence.

        Returns a summary dict. If a shutdown was already started, the
        summary only carries ``status`` and the current ``phase`` value.
        Otherwise it carries ``started_at``, ``in_flight_at_start``,
        ``checkpointed``, ``timed_out``, ``completed_at`` and
        ``checkpoint_data``.
        """
        async with self._lock:
            if self._phase != ShutdownPhase.RUNNING:
                return {"status": "already_shutting_down", "phase": self._phase.value}

            logger.info("Initiating graceful shutdown")
            summary: Dict[str, Any] = {
                "started_at": datetime.now().isoformat(),
                "in_flight_at_start": len(self._in_flight),
                "checkpointed": [],
                "timed_out": [],
            }

            # Phase 1: Draining
            self._phase = ShutdownPhase.DRAINING
            logger.info(f"Draining {len(self._in_flight)} in-flight operations")
            if self._in_flight:
                await self._drain(summary)

            # Phase 2: Stopping
            self._phase = ShutdownPhase.STOPPING
            logger.info("Running shutdown hooks")
            await self._run_shutdown_hooks()

            # Phase 3: Terminated
            self._phase = ShutdownPhase.TERMINATED
            summary["completed_at"] = datetime.now().isoformat()
            summary["checkpoint_data"] = self._checkpoint_data

            logger.info("Graceful shutdown complete")
            return summary

    async def _drain(self, summary: Dict[str, Any]) -> None:
        """Wait for in-flight work; on timeout record and checkpoint leftovers."""
        self._shutdown_event.clear()
        try:
            await asyncio.wait_for(
                self._shutdown_event.wait(),
                timeout=self.config.drain_timeout_seconds,
            )
        except asyncio.TimeoutError:
            logger.warning("Drain timeout reached, saving remaining operations")

            # Checkpoint whatever did not finish in time.
            if self.config.save_in_flight:
                for pending_id, pending in list(self._in_flight.items()):
                    self._checkpoint_data[pending_id] = {
                        "agent_id": pending.agent_id,
                        "operation_type": pending.operation_type,
                        "data": pending.data,
                        "started_at": pending.started_at.isoformat(),
                    }
                    summary["checkpointed"].append(pending_id)

            summary["timed_out"] = list(self._in_flight)

    async def _run_shutdown_hooks(self) -> None:
        """Await each hook with a timeout; failures are logged, not raised."""
        for hook in self._shutdown_hooks:
            try:
                await asyncio.wait_for(
                    hook(),
                    timeout=self.config.force_timeout_seconds,
                )
            except asyncio.TimeoutError:
                logger.warning("Shutdown hook timed out")
            except Exception as e:
                logger.error(f"Shutdown hook error: {e}")

    def get_checkpoint_data(self) -> Dict[str, Any]:
        """Shallow copy of the checkpoint data gathered during shutdown."""
        return dict(self._checkpoint_data)

    async def restore_from_checkpoint(
        self,
        checkpoint_data: Dict[str, Any]
    ) -> List[InFlightOperation]:
        """
        Rebuild in-flight operations from previously checkpointed data.

        Each entry must provide ``agent_id``, ``operation_type`` and an
        ISO-format ``started_at``; ``data`` is optional. The rebuilt
        operations are re-registered as in flight and returned.
        """
        recovered: List[InFlightOperation] = []
        for saved_id, saved in checkpoint_data.items():
            operation = InFlightOperation(
                operation_id=saved_id,
                agent_id=saved["agent_id"],
                operation_type=saved["operation_type"],
                started_at=datetime.fromisoformat(saved["started_at"]),
                data=saved.get("data", {}),
            )
            self._in_flight[saved_id] = operation
            recovered.append(operation)

        logger.info(f"Restored {len(recovered)} operations from checkpoint")
        return recovered
|
|
1916
|
+
|
|
1917
|
+
|
|
1918
|
+
# ============================================================================
|
|
1919
|
+
# ACP-008: Resource Quotas
|
|
1920
|
+
# ============================================================================
|
|
1921
|
+
|
|
1922
|
+
@dataclass
class AgentResourceQuota:
    """Upper bounds on the resources a single agent may consume."""

    # Memory ceiling in megabytes.
    memory_mb: int = 512
    # CPU ceiling as a percentage.
    cpu_percent: float = 25.0
    # Maximum number of operations allowed to run at the same time.
    max_concurrent_operations: int = 10
    # Maximum number of operations allowed per minute.
    max_operations_per_minute: int = 100
    # Optional network bandwidth ceiling in Mbps; None leaves it unset.
    # NOTE(review): not read by the code visible in this section.
    network_bandwidth_mbps: Optional[float] = None
    # Optional storage ceiling in megabytes; None leaves it unset.
    # NOTE(review): not read by the code visible in this section.
    storage_mb: Optional[int] = None
|
|
1931
|
+
|
|
1932
|
+
|
|
1933
|
+
@dataclass
class ResourceUsage:
    """Snapshot of the resources an agent is currently using."""

    # Agent this usage record belongs to.
    agent_id: str
    # Last reported memory consumption in megabytes.
    memory_mb: float = 0.0
    # Last reported CPU utilisation as a percentage.
    cpu_percent: float = 0.0
    # Operations that have started but not yet ended.
    concurrent_operations: int = 0
    # Per-minute operation counter; kept up to date by whoever owns the record.
    operations_this_minute: int = 0
    # Creation time of the record unless overwritten later.
    timestamp: datetime = field(default_factory=datetime.now)
|
|
1942
|
+
|
|
1943
|
+
|
|
1944
|
+
class ResourceQuotaManager:
    """
    Manages resource quotas and limits per agent.

    Features:
    - Memory and CPU limits
    - Concurrent operation limits
    - Rate limiting (operations per minute)
    - Usage tracking and reporting

    Usage:
        quota_manager = ResourceQuotaManager()

        quota_manager.set_quota("claims-agent", AgentResourceQuota(
            memory_mb=512,
            cpu_percent=25,
            max_concurrent_operations=10
        ))

        # Check before operation
        if quota_manager.can_execute("claims-agent"):
            quota_manager.record_operation_start("claims-agent")
            try:
                ...  # Execute operation
            finally:
                quota_manager.record_operation_end("claims-agent")
    """

    # Width, in seconds, of the sliding window behind the per-minute limit.
    _RATE_WINDOW_SECONDS = 60

    def __init__(self):
        """Create a manager with no quotas and no recorded usage."""
        self._quotas: Dict[str, AgentResourceQuota] = {}
        self._usage: Dict[str, ResourceUsage] = {}
        # agent_id -> start timestamps inside the rate window. The deques are
        # deliberately unbounded: a fixed maxlen would make any quota above
        # that cap impossible to reach. They are pruned on every recorded
        # start, so they never hold more than one window of history.
        self._operation_counts: Dict[str, deque] = defaultdict(deque)
        self._lock = asyncio.Lock()

    def set_quota(self, agent_id: str, quota: AgentResourceQuota) -> None:
        """Install (or replace) the quota for an agent and ensure it has a usage record."""
        self._quotas[agent_id] = quota
        if agent_id not in self._usage:
            self._usage[agent_id] = ResourceUsage(agent_id=agent_id)
        logger.info(f"Set quota for {agent_id}: memory={quota.memory_mb}MB, cpu={quota.cpu_percent}%")

    def get_quota(self, agent_id: str) -> Optional[AgentResourceQuota]:
        """Return the quota configured for an agent, or None."""
        return self._quotas.get(agent_id)

    def can_execute(self, agent_id: str) -> bool:
        """
        Check whether an agent may start a new operation.

        Returns True when the agent has no quota or no usage record.
        Otherwise returns False (and logs a warning) if the agent is at its
        concurrency limit, at its per-minute rate limit, or above its memory
        or CPU quota.
        """
        quota = self._quotas.get(agent_id)
        if not quota:
            return True  # No quota means no limits

        usage = self._usage.get(agent_id)
        if not usage:
            return True

        # Check concurrent operations
        if usage.concurrent_operations >= quota.max_concurrent_operations:
            logger.warning(f"Agent {agent_id} at max concurrent operations")
            return False

        # Check rate limit
        ops_this_minute = self._count_recent_operations(
            agent_id, seconds=self._RATE_WINDOW_SECONDS
        )
        if ops_this_minute >= quota.max_operations_per_minute:
            logger.warning(f"Agent {agent_id} at rate limit")
            return False

        # Check memory
        if usage.memory_mb > quota.memory_mb:
            logger.warning(f"Agent {agent_id} over memory quota")
            return False

        # Check CPU
        if usage.cpu_percent > quota.cpu_percent:
            logger.warning(f"Agent {agent_id} over CPU quota")
            return False

        return True

    def record_operation_start(self, agent_id: str) -> None:
        """
        Record that an agent started an operation.

        Increments the concurrency counter, stores the start timestamp for
        rate limiting, and refreshes ``operations_this_minute`` with the
        number of starts inside the rate window as of this call.
        """
        if agent_id not in self._usage:
            self._usage[agent_id] = ResourceUsage(agent_id=agent_id)
        usage = self._usage[agent_id]

        now = datetime.now()
        timestamps = self._operation_counts[agent_id]
        timestamps.append(now)

        # Drop starts that have fallen out of the window so the history
        # stays bounded without capping the enforceable rate.
        cutoff = now - timedelta(seconds=self._RATE_WINDOW_SECONDS)
        while timestamps and timestamps[0] <= cutoff:
            timestamps.popleft()

        usage.concurrent_operations += 1
        usage.operations_this_minute = len(timestamps)

    def record_operation_end(self, agent_id: str) -> None:
        """Record that an operation finished; the counter never drops below zero."""
        if agent_id in self._usage:
            self._usage[agent_id].concurrent_operations = max(
                0, self._usage[agent_id].concurrent_operations - 1
            )

    def update_resource_usage(
        self,
        agent_id: str,
        memory_mb: Optional[float] = None,
        cpu_percent: Optional[float] = None
    ) -> None:
        """
        Store the latest memory and/or CPU readings for an agent.

        Arguments left as None keep their previous value; the usage
        timestamp is refreshed on every call.
        """
        if agent_id not in self._usage:
            self._usage[agent_id] = ResourceUsage(agent_id=agent_id)

        usage = self._usage[agent_id]
        if memory_mb is not None:
            usage.memory_mb = memory_mb
        if cpu_percent is not None:
            usage.cpu_percent = cpu_percent
        usage.timestamp = datetime.now()

    def _count_recent_operations(self, agent_id: str, seconds: int) -> int:
        """
        Count operation starts in the last ``seconds`` seconds.

        History is only retained for the rate window, so values of
        ``seconds`` larger than that window cannot see older starts.
        """
        cutoff = datetime.now() - timedelta(seconds=seconds)
        # .get() avoids creating an empty deque for unknown agents.
        return sum(
            1 for ts in self._operation_counts.get(agent_id, ()) if ts > cutoff
        )

    def get_usage(self, agent_id: str) -> Optional[ResourceUsage]:
        """Return the usage record for an agent, or None."""
        return self._usage.get(agent_id)

    def get_all_usage(self) -> Dict[str, ResourceUsage]:
        """Return a shallow copy of the usage records keyed by agent id."""
        return dict(self._usage)

    def check_quota_violations(self) -> Dict[str, List[str]]:
        """
        Report agents that currently exceed their quota.

        Returns a mapping of agent id to human-readable violation messages
        covering memory, CPU and concurrent operations. Agents without
        violations (or without a usage record) are omitted.
        """
        violations: Dict[str, List[str]] = {}

        for agent_id, quota in self._quotas.items():
            usage = self._usage.get(agent_id)
            if not usage:
                continue

            agent_violations = []

            if usage.memory_mb > quota.memory_mb:
                agent_violations.append(
                    f"Memory: {usage.memory_mb:.1f}MB > {quota.memory_mb}MB"
                )

            if usage.cpu_percent > quota.cpu_percent:
                agent_violations.append(
                    f"CPU: {usage.cpu_percent:.1f}% > {quota.cpu_percent}%"
                )

            if usage.concurrent_operations > quota.max_concurrent_operations:
                agent_violations.append(
                    f"Concurrent ops: {usage.concurrent_operations} > {quota.max_concurrent_operations}"
                )

            if agent_violations:
                violations[agent_id] = agent_violations

        return violations
|
|
2098
|
+
|
|
2099
|
+
|
|
2100
|
+
# ============================================================================
|
|
2101
|
+
# ACP-009: Agent Observability
|
|
2102
|
+
# ============================================================================
|
|
2103
|
+
|
|
2104
|
+
@dataclass
class AgentMetric:
    """Single metric sample reported for an agent."""

    # Metric name.
    name: str
    # Measured value.
    value: float
    # Label name -> label value pairs attached to the sample.
    labels: Dict[str, str] = field(default_factory=dict)
    # When the sample object was created.
    timestamp: datetime = field(default_factory=datetime.now)
    # One of: gauge, counter, histogram.
    metric_type: str = "gauge"
|
|
2112
|
+
|
|
2113
|
+
|
|
2114
|
+
@dataclass
class AgentLogEntry:
    """Structured log record emitted on behalf of an agent."""

    # Agent the entry belongs to.
    agent_id: str
    # Severity: debug, info, warning, error, critical.
    level: str
    # Human-readable log message.
    message: str
    # When the entry object was created.
    timestamp: datetime = field(default_factory=datetime.now)
    # Extra structured key/value context; a fresh dict per entry.
    context: Dict[str, Any] = field(default_factory=dict)
|
|
2122
|
+
|
|
2123
|
+
|
|
2124
|
+
class AgentObservabilityProvider:
    """
    Built-in observability for agents (metrics, logging, tracing).

    Features:
    - Structured logging with context
    - Metrics collection (counters, gauges, histograms)
    - Distributed tracing support
    - Prometheus-compatible export

    Usage:
        observability = AgentObservabilityProvider()

        # Record metric
        observability.record_metric(
            agent_id="claims-agent",
            name="verification_latency_ms",
            value=150.5,
            labels={"claim_type": "auto"}
        )

        # Log with context
        observability.log(
            agent_id="claims-agent",
            level="info",
            message="Verification completed",
            context={"claim_id": "123", "result": "approved"}
        )

        # Export metrics
        metrics = observability.export_prometheus()
    """

    # Logger method names log() is allowed to dispatch to. Any other level
    # string falls back to logger.info instead of being looked up blindly
    # (e.g. "name" would resolve to a non-callable logger attribute).
    _LOG_METHODS = frozenset(
        {"debug", "info", "warning", "warn", "error", "exception", "critical", "fatal"}
    )

    def __init__(self, max_log_entries: int = 10000, max_metrics: int = 10000):
        """
        Create an empty provider.

        Args:
            max_log_entries: capacity of the log ring buffer.
            max_metrics: capacity of each agent's raw metric ring buffer.
        """
        # agent_id -> bounded history of raw metric samples.
        self._metrics: Dict[str, deque] = defaultdict(lambda: deque(maxlen=max_metrics))
        self._logs: deque = deque(maxlen=max_log_entries)
        # Aggregates keyed by "<name>:<label key>".
        self._counters: Dict[str, float] = defaultdict(float)
        self._gauges: Dict[str, float] = {}
        # Histogram observations keyed by metric name only (labels ignored).
        # NOTE(review): these lists grow without bound.
        self._histograms: Dict[str, List[float]] = defaultdict(list)
        self._metric_metadata: Dict[str, Dict[str, Any]] = {}
        self._lock = asyncio.Lock()

    def record_metric(
        self,
        agent_id: str,
        name: str,
        value: float,
        labels: Optional[Dict[str, str]] = None,
        metric_type: str = "gauge"
    ) -> None:
        """
        Record a metric sample for an agent and update its aggregate.

        The sample is labelled with ``agent_id`` (overriding any caller
        supplied ``agent_id`` label). Counters accumulate, gauges keep the
        latest value, histograms collect observations per metric name.
        """
        # Work on a copy so the caller's dict is never modified.
        sample_labels = dict(labels) if labels else {}
        sample_labels["agent_id"] = agent_id

        metric = AgentMetric(
            name=name,
            value=value,
            labels=sample_labels,
            metric_type=metric_type
        )

        full_name = f"{name}:{self._make_label_key(sample_labels)}"
        self._metrics[agent_id].append(metric)

        # Update aggregates
        if metric_type == "counter":
            self._counters[full_name] += value
        elif metric_type == "gauge":
            self._gauges[full_name] = value
        elif metric_type == "histogram":
            self._histograms[name].append(value)

    def increment_counter(
        self,
        agent_id: str,
        name: str,
        value: float = 1.0,
        labels: Optional[Dict[str, str]] = None
    ) -> None:
        """Add ``value`` to a counter metric."""
        self.record_metric(agent_id, name, value, labels, metric_type="counter")

    def set_gauge(
        self,
        agent_id: str,
        name: str,
        value: float,
        labels: Optional[Dict[str, str]] = None
    ) -> None:
        """Set a gauge metric to ``value``."""
        self.record_metric(agent_id, name, value, labels, metric_type="gauge")

    def observe_histogram(
        self,
        agent_id: str,
        name: str,
        value: float,
        labels: Optional[Dict[str, str]] = None
    ) -> None:
        """Add one observation to a histogram metric."""
        self.record_metric(agent_id, name, value, labels, metric_type="histogram")

    def log(
        self,
        agent_id: str,
        level: str,
        message: str,
        context: Optional[Dict[str, Any]] = None
    ) -> None:
        """
        Store a structured log entry and forward it to the module logger.

        ``level`` selects the logger method case-insensitively; unknown
        levels are forwarded at info level.
        """
        entry = AgentLogEntry(
            agent_id=agent_id,
            level=level,
            message=message,
            context=context or {}
        )
        self._logs.append(entry)

        # Also log to Python logger
        method_name = level.lower()
        log_func = (
            getattr(logger, method_name, None)
            if method_name in self._LOG_METHODS
            else None
        )
        if log_func is None:
            log_func = logger.info
        log_func(f"[{agent_id}] {message}", extra={"context": context})

    def get_metrics(
        self,
        agent_id: Optional[str] = None,
        name: Optional[str] = None
    ) -> List[AgentMetric]:
        """Return recorded samples, optionally filtered by agent and/or name."""
        if agent_id:
            metrics = list(self._metrics.get(agent_id, []))
        else:
            metrics = []
            for agent_metrics in self._metrics.values():
                metrics.extend(agent_metrics)

        if name:
            metrics = [m for m in metrics if m.name == name]

        return metrics

    def get_logs(
        self,
        agent_id: Optional[str] = None,
        level: Optional[str] = None,
        limit: int = 100
    ) -> List[AgentLogEntry]:
        """
        Return the most recent log entries, oldest first.

        Entries can be filtered by agent and/or exact level string. At most
        ``limit`` entries are returned; a non-positive limit yields an
        empty list.
        """
        # Guard explicitly: a slice of [-0:] would return every entry.
        if limit <= 0:
            return []

        entries = list(self._logs)

        if agent_id:
            entries = [entry for entry in entries if entry.agent_id == agent_id]
        if level:
            entries = [entry for entry in entries if entry.level == level]

        return entries[-limit:]

    def export_prometheus(self) -> str:
        """
        Export aggregates in Prometheus text format.

        Counters and gauges are written as ``name{labels} value`` with one
        ``# TYPE`` line per metric name. Histograms are written as a count,
        a sum and the 0.5 / 0.9 / 0.99 quantiles.
        """
        lines: List[str] = []
        lines.extend(self._render_samples(self._counters, "counter"))
        lines.extend(self._render_samples(self._gauges, "gauge"))

        # Export histogram summaries
        # NOTE(review): quantile samples are the summary layout in the
        # exposition format, yet the TYPE is declared as histogram - confirm
        # what consumers expect before changing it.
        for name, values in self._histograms.items():
            if not values:
                continue
            lines.append(f"# TYPE {name} histogram")
            lines.append(f"{name}_count {len(values)}")
            lines.append(f"{name}_sum {sum(values)}")

            # p < 1, so int(len * p) is always a valid index.
            sorted_vals = sorted(values)
            for p in (0.5, 0.9, 0.99):
                idx = int(len(sorted_vals) * p)
                lines.append(f'{name}{{quantile="{p}"}} {sorted_vals[idx]}')

        return "\n".join(lines)

    @staticmethod
    def _render_samples(store: Dict[str, float], metric_type: str) -> List[str]:
        """Render a "<name>:<label key>" -> value store as exposition lines."""
        grouped: Dict[str, List[str]] = {}
        for full_name, value in store.items():
            # Split on the first colon only so colons inside label values
            # (URLs, timestamps, ...) survive untouched.
            name, _, label_key = full_name.partition(":")
            if label_key:
                sample = f"{name}{{{label_key}}} {value}"
            else:
                sample = f"{name} {value}"
            grouped.setdefault(name, []).append(sample)

        rendered: List[str] = []
        for name, samples in grouped.items():
            rendered.append(f"# TYPE {name} {metric_type}")
            rendered.extend(samples)
        return rendered

    def _make_label_key(self, labels: Dict[str, str]) -> str:
        """Build a deterministic ``k="v",...`` key from labels sorted by name."""
        return ",".join(f'{k}="{v}"' for k, v in sorted(labels.items()))

    def get_agent_summary(self, agent_id: str) -> Dict[str, Any]:
        """
        Summarise what has been recorded for an agent.

        Totals and level counts cover every buffered log entry of the
        agent; ``recent_metrics`` and ``recent_logs`` hold the last ten.
        """
        metrics = self.get_metrics(agent_id)
        # Read the buffer directly: get_logs() caps its result at `limit`,
        # which would under-report the totals.
        logs = [entry for entry in self._logs if entry.agent_id == agent_id]

        return {
            "agent_id": agent_id,
            "total_metrics": len(metrics),
            "total_logs": len(logs),
            "recent_metrics": metrics[-10:] if metrics else [],
            "recent_logs": logs[-10:] if logs else [],
            "log_level_counts": self._count_log_levels(logs)
        }

    def _count_log_levels(self, logs: List[AgentLogEntry]) -> Dict[str, int]:
        """Count log entries per level string."""
        counts: Dict[str, int] = defaultdict(int)
        for entry in logs:
            counts[entry.level] += 1
        return dict(counts)
|
|
2335
|
+
|
|
2336
|
+
|
|
2337
|
+
# ============================================================================
|
|
2338
|
+
# ACP-010: Hot Reload
|
|
2339
|
+
# ============================================================================
|
|
2340
|
+
|
|
2341
|
+
@dataclass
class HotReloadConfig:
    """Settings that control hot reloading of agents."""

    # Master switch for hot reload.
    # NOTE(review): not read by the code visible in this section.
    enabled: bool = True
    # Paths to watch for changes; a fresh list per instance.
    # NOTE(review): not read by the code visible in this section.
    watch_paths: List[str] = field(default_factory=list)
    # Delay, in seconds, associated with a reload.
    # NOTE(review): not read by the code visible in this section.
    reload_delay_seconds: float = 1.0
    # When true, state is extracted from the old instance before a reload
    # so it can be handed to the new one.
    preserve_state: bool = True
|
|
2348
|
+
|
|
2349
|
+
|
|
2350
|
+
@dataclass
class ReloadEvent:
    """Outcome record of one hot reload attempt."""

    # Agent the reload was attempted for.
    agent_id: str
    # Version string before the attempt.
    old_version: str
    # Version string after the attempt (equals old_version when nothing changed).
    new_version: str
    # When the event object was created.
    timestamp: datetime = field(default_factory=datetime.now)
    # Whether the attempt is considered successful.
    success: bool = True
    # Optional explanatory or error message.
    error: Optional[str] = None
    # State carried over from the previous instance; a fresh dict per event.
    preserved_state: Dict[str, Any] = field(default_factory=dict)
|
|
2360
|
+
|
|
2361
|
+
|
|
2362
|
+
class HotReloadManager:
|
|
2363
|
+
"""
|
|
2364
|
+
Manages hot reload of agent code without full restart.
|
|
2365
|
+
|
|
2366
|
+
Features:
|
|
2367
|
+
- Code change detection
|
|
2368
|
+
- Graceful reload with state preservation
|
|
2369
|
+
- Version tracking
|
|
2370
|
+
- Rollback support
|
|
2371
|
+
|
|
2372
|
+
Usage:
|
|
2373
|
+
hot_reload = HotReloadManager(
|
|
2374
|
+
config=HotReloadConfig(
|
|
2375
|
+
watch_paths=["./agents"],
|
|
2376
|
+
preserve_state=True
|
|
2377
|
+
)
|
|
2378
|
+
)
|
|
2379
|
+
|
|
2380
|
+
# Register agent
|
|
2381
|
+
hot_reload.register_agent(
|
|
2382
|
+
agent_id="claims-agent",
|
|
2383
|
+
module_name="agents.claims",
|
|
2384
|
+
class_name="ClaimsAgent"
|
|
2385
|
+
)
|
|
2386
|
+
|
|
2387
|
+
# Trigger reload
|
|
2388
|
+
await hot_reload.reload_agent("claims-agent")
|
|
2389
|
+
"""
|
|
2390
|
+
|
|
2391
|
+
def __init__(self, config: Optional[HotReloadConfig] = None):
    """Set up empty registries; a default HotReloadConfig is used when none is given."""
    self.config = config if config else HotReloadConfig()
    # agent_id -> registration details (module, class, factory, instance, ...).
    self._agents: Dict[str, Dict[str, Any]] = {}
    # agent_id -> current version hash.
    self._versions: Dict[str, str] = {}
    # agent_id -> previous instance/version kept for rollback.
    self._previous_versions: Dict[str, Any] = {}
    # Only the 100 most recent reload events are retained.
    self._reload_history: deque = deque(maxlen=100)
    # event name -> registered callbacks.
    self._callbacks: Dict[str, List[Callable]] = defaultdict(list)
    self._lock = asyncio.Lock()
|
|
2399
|
+
|
|
2400
|
+
def register_agent(
    self,
    agent_id: str,
    module_name: str,
    class_name: str,
    factory: Optional[Callable[[], Any]] = None,
    instance: Optional[Any] = None,
    state_extractor: Optional[Callable[[Any], Dict[str, Any]]] = None,
    state_injector: Optional[Callable[[Any, Dict[str, Any]], None]] = None
) -> None:
    """
    Make an agent known to the hot reload manager.

    Stores the registration details under ``agent_id`` (replacing any
    earlier registration) and records the current version hash of
    ``module_name``.
    """
    registration: Dict[str, Any] = dict(
        module_name=module_name,
        class_name=class_name,
        factory=factory,
        instance=instance,
        state_extractor=state_extractor,
        state_injector=state_injector,
    )
    self._agents[agent_id] = registration

    version = self._compute_version(module_name)
    self._versions[agent_id] = version
    # Only the first eight characters of the hash are logged.
    logger.info(f"Registered agent {agent_id} for hot reload (version: {self._versions[agent_id][:8]})")
|
|
2421
|
+
|
|
2422
|
+
def _compute_version(self, module_name: str) -> str:
    """
    Derive a version string for a module.

    Returns the MD5 hex digest of the module's source file when the module
    is already imported and has a file path. Otherwise - or when reading
    fails, which is logged as a warning - returns the MD5 hex digest of the
    module name itself.
    """
    try:
        loaded = sys.modules.get(module_name)
        source_path = getattr(loaded, '__file__', None) if loaded else None
        if source_path:
            with open(source_path, 'rb') as handle:
                digest = hashlib.md5(handle.read())
            return digest.hexdigest()
    except Exception as e:
        logger.warning(f"Could not compute version for {module_name}: {e}")

    # Fallback: a stable hash that depends only on the module name.
    return hashlib.md5(module_name.encode()).hexdigest()
|
|
2433
|
+
|
|
2434
|
+
async def check_for_changes(self, agent_id: str) -> bool:
    """
    Tell whether an agent's module version differs from the recorded one.

    Returns False for agents that are not registered.
    """
    if agent_id not in self._agents:
        return False

    registered_module = self._agents[agent_id]["module_name"]
    current_version = self._compute_version(registered_module)
    # An agent without a recorded version compares against "".
    recorded_version = self._versions.get(agent_id, "")
    return current_version != recorded_version
|
|
2444
|
+
|
|
2445
|
+
async def reload_agent(
    self,
    agent_id: str,
    force: bool = False
) -> ReloadEvent:
    """
    Reload an agent's module and swap in a fresh instance.

    Unless ``force`` is set, nothing happens when the module version is
    unchanged and a successful event carrying "No changes detected" is
    returned. Otherwise the old instance is stopped and remembered for
    rollback, the module is reloaded (or imported), a new instance is
    created, preserved state is injected and the new instance is started.
    Any exception along the way is turned into a failed ReloadEvent.
    Events from actual reload attempts are appended to the history and
    passed to the "reload_success" / "reload_failed" callbacks.

    Raises:
        ValueError: if the agent was never registered.
    """
    if agent_id not in self._agents:
        raise ValueError(f"Agent {agent_id} not registered for hot reload")

    async def _call_lifecycle(target: Any, method_name: str) -> None:
        # Invoke target.<method_name>(), awaiting it when it is a coroutine function.
        method = getattr(target, method_name)
        if asyncio.iscoroutinefunction(method):
            await method()
        else:
            method()

    async with self._lock:
        agent_info = self._agents[agent_id]
        module_name = agent_info["module_name"]
        class_name = agent_info["class_name"]
        old_version = self._versions.get(agent_id, "unknown")

        # Check if reload needed (the check is skipped entirely when forced)
        reload_needed = force or await self.check_for_changes(agent_id)
        if not reload_needed:
            return ReloadEvent(
                agent_id=agent_id,
                old_version=old_version,
                new_version=old_version,
                success=True,
                error="No changes detected"
            )

        try:
            old_instance = agent_info.get("instance")

            # Extract state from current instance
            preserved_state = {}
            if self.config.preserve_state and old_instance:
                extractor = agent_info.get("state_extractor")
                if extractor:
                    preserved_state = extractor(old_instance)
                elif hasattr(old_instance, 'get_state'):
                    preserved_state = old_instance.get_state()

            # Stop old instance
            if old_instance and hasattr(old_instance, 'stop'):
                await _call_lifecycle(old_instance, 'stop')

            # Store for potential rollback
            self._previous_versions[agent_id] = {
                "instance": old_instance,
                "version": old_version
            }

            # Reload the module
            already_loaded = sys.modules.get(module_name) if module_name in sys.modules else None
            if module_name in sys.modules:
                module = importlib.reload(already_loaded)
            else:
                module = importlib.import_module(module_name)

            # Create new instance; the class is resolved even when a
            # factory is used, so a missing class fails the reload.
            agent_class = getattr(module, class_name)
            factory = agent_info.get("factory")
            new_instance = factory() if factory else agent_class()

            if asyncio.iscoroutine(new_instance):
                new_instance = await new_instance

            # Inject preserved state
            if preserved_state:
                injector = agent_info.get("state_injector")
                if injector:
                    injector(new_instance, preserved_state)
                elif hasattr(new_instance, 'set_state'):
                    new_instance.set_state(preserved_state)

            # Start new instance
            if hasattr(new_instance, 'start'):
                await _call_lifecycle(new_instance, 'start')

            # Update registry
            agent_info["instance"] = new_instance
            new_version = self._compute_version(module_name)
            self._versions[agent_id] = new_version

            event = ReloadEvent(
                agent_id=agent_id,
                old_version=old_version,
                new_version=new_version,
                success=True,
                preserved_state=preserved_state
            )

            self._reload_history.append(event)
            await self._trigger_callbacks("reload_success", agent_id, event)

            logger.info(f"Hot reloaded agent {agent_id}: {old_version[:8]} -> {new_version[:8]}")
            return event

        except Exception as e:
            event = ReloadEvent(
                agent_id=agent_id,
                old_version=old_version,
                new_version=old_version,
                success=False,
                error=str(e)
            )

            self._reload_history.append(event)
            await self._trigger_callbacks("reload_failed", agent_id, event)

            logger.error(f"Hot reload failed for {agent_id}: {e}")
            return event
|
|
2559
|
+
|
|
2560
|
+
async def rollback_agent(self, agent_id: str) -> bool:
    """
    Put the previously stored instance of an agent back in service.

    Stops the current instance, starts the stored one and restores its
    version string. Returns True on success, and False when no previous
    version is stored or when any step raises (the error is logged).
    """
    if agent_id not in self._previous_versions:
        logger.warning(f"No previous version available for {agent_id}")
        return False

    async def _call_lifecycle(target: Any, method_name: str) -> None:
        # Invoke target.<method_name>(), awaiting it when it is a coroutine function.
        method = getattr(target, method_name)
        if asyncio.iscoroutinefunction(method):
            await method()
        else:
            method()

    async with self._lock:
        try:
            previous = self._previous_versions[agent_id]
            agent_info = self._agents[agent_id]

            # Stop current instance
            active = agent_info.get("instance")
            if active and hasattr(active, 'stop'):
                await _call_lifecycle(active, 'stop')

            # Restore previous instance
            restored = previous["instance"]
            if restored and hasattr(restored, 'start'):
                await _call_lifecycle(restored, 'start')

            agent_info["instance"] = restored
            self._versions[agent_id] = previous["version"]

            logger.info(f"Rolled back agent {agent_id} to version {previous['version'][:8]}")
            return True

        except Exception as e:
            logger.error(f"Rollback failed for {agent_id}: {e}")
            return False
|
|
2596
|
+
|
|
2597
|
+
def get_agent_version(self, agent_id: str) -> Optional[str]:
    """Return the version currently recorded for ``agent_id``.

    Args:
        agent_id: Identifier of the agent to look up.

    Returns:
        The entry stored in ``self._versions`` for the agent, or None when
        the agent has no recorded version.
    """
    return self._versions.get(agent_id)
|
|
2600
|
+
|
|
2601
|
+
def get_agent_instance(self, agent_id: str) -> Optional[Any]:
    """Return the instance currently registered for ``agent_id``.

    Args:
        agent_id: Identifier of the agent to look up.

    Returns:
        The ``"instance"`` value of the agent's registry entry, or None when
        the agent is unknown or its entry carries no instance.
    """
    # Guard clause: unknown agents have nothing to return.
    if agent_id not in self._agents:
        return None
    registry_entry = self._agents[agent_id]
    return registry_entry.get("instance")
|
|
2606
|
+
|
|
2607
|
+
def get_reload_history(
    self,
    agent_id: Optional[str] = None
) -> List[ReloadEvent]:
    """Return recorded reload events, optionally limited to one agent.

    Args:
        agent_id: When truthy, only events whose ``agent_id`` equals this
            value are returned; otherwise every recorded event is returned.

    Returns:
        A new list, so callers can mutate it without touching the
        manager's own history.
    """
    if not agent_id:
        return list(self._reload_history)
    return [entry for entry in self._reload_history if entry.agent_id == agent_id]
|
|
2616
|
+
|
|
2617
|
+
def on_event(
    self,
    event: str,
    callback: Callable[[str, ReloadEvent], Awaitable[None]]
) -> None:
    """Register a callback for reload events.

    Args:
        event: Event name to subscribe to. This class triggers
            ``"reload_success"`` and ``"reload_failed"``.
        callback: Callable invoked as ``callback(agent_id, reload_event)``
            when the event fires.
    """
    # NOTE(review): indexing assumes `_callbacks` is a defaultdict or already
    # holds a list for `event` — confirm against the constructor.
    self._callbacks[event].append(callback)
|
|
2624
|
+
|
|
2625
|
+
async def _trigger_callbacks(
    self,
    event: str,
    agent_id: str,
    reload_event: ReloadEvent
) -> None:
    """Invoke every callback registered for ``event``.

    Callbacks may be coroutine functions or plain functions: each one is
    called and its result awaited only if it is awaitable. Previously a
    plain function ran and then failed on ``await None``, producing a
    misleading error log. A failing callback is logged and does not stop
    the remaining callbacks.

    Args:
        event: Name of the event being dispatched.
        agent_id: Identifier of the agent the event concerns.
        reload_event: Event payload handed to each callback.
    """
    import inspect  # local import keeps this block self-contained

    # Iterate a snapshot so a callback that (un)registers handlers for the
    # same event cannot disturb the dispatch that is already in progress.
    for callback in list(self._callbacks.get(event, [])):
        try:
            outcome = callback(agent_id, reload_event)
            if inspect.isawaitable(outcome):
                await outcome
        except Exception as e:
            logger.error(f"Hot reload callback error: {e}")
|
|
2637
|
+
|
|
2638
|
+
|
|
2639
|
+
# ============================================================================
|
|
2640
|
+
# Main Agent Registration
|
|
2641
|
+
# ============================================================================
|
|
2642
|
+
|
|
2643
|
+
@dataclass
class AgentRegistration:
    """Registration details for an agent in the control plane.

    ``EnhancedAgentControlPlane.register`` builds one record per call and
    stores it under the agent's ID.
    """
    # Class that is called with no arguments to build each replica.
    agent_type: Type
    # Number of replicas created when the agent is started.
    replicas: int = 1
    # IDs of agents this agent depends on (fed to the dependency graph).
    dependencies: List[str] = field(default_factory=list)
    # Optional resource quota applied through the quota manager.
    resources: Optional[AgentResourceQuota] = None
    # Optional per-agent health check configuration.
    health_config: Optional[HealthCheckConfig] = None
    # Optional per-agent auto-recovery configuration.
    recovery_config: Optional[RecoveryConfig] = None
    # Circuit breaker for this agent; `register` falls back to the control
    # plane's default breaker when none is given.
    circuit_breaker: Optional[CircuitBreaker] = None
    # Free-form extra keyword arguments captured at registration time.
    metadata: Dict[str, Any] = field(default_factory=dict)
|
|
2654
|
+
|
|
2655
|
+
|
|
2656
|
+
# ============================================================================
|
|
2657
|
+
# Enhanced Agent Control Plane
|
|
2658
|
+
# ============================================================================
|
|
2659
|
+
|
|
2660
|
+
class EnhancedAgentControlPlane:
    """
    Enhanced Agent Control Plane with full lifecycle management.

    This is the main interface for managing autonomous AI agents with
    comprehensive lifecycle features including health monitoring,
    auto-recovery, circuit breakers, scaling, distributed coordination,
    dependency management, graceful shutdown, resource quotas,
    observability, and hot reload.

    Each feature is backed by a component created in the constructor and
    exposed as a public attribute (``health_monitor``, ``recovery_manager``,
    ``circuit_breaker_registry``, ``scaler``, ``coordinator``,
    ``dependency_graph``, ``shutdown_manager``, ``quota_manager``,
    ``observability``, ``hot_reload``).

    Usage (``start_all`` is a coroutine, so call it from async code):
        control_plane = EnhancedAgentControlPlane(
            health_check_interval=30,
            auto_recovery=True,
            circuit_breaker=CircuitBreaker(
                failure_threshold=5,
                recovery_timeout=60
            )
        )

        control_plane.register(
            ClaimsAgent,
            replicas=3,
            dependencies=["message-bus"],
            resources=AgentResourceQuota(
                memory_mb=512,
                cpu_percent=25
            )
        )

        await control_plane.start_all()
    """
|
|
2692
|
+
|
|
2693
|
+
def __init__(
    self,
    health_check_interval: float = 30.0,
    auto_recovery: bool = True,
    circuit_breaker: Optional[CircuitBreaker] = None,
    node_id: Optional[str] = None,
    health_config: Optional[HealthCheckConfig] = None,
    recovery_config: Optional[RecoveryConfig] = None,
    scaling_config: Optional[ScalingConfig] = None,
    shutdown_config: Optional[ShutdownConfig] = None,
    hot_reload_config: Optional[HotReloadConfig] = None
):
    """
    Initialize the Enhanced Agent Control Plane.

    Args:
        health_check_interval: Interval between health checks (seconds).
            Always written to ``liveness_interval_seconds`` of the health
            configuration in use, including one passed via ``health_config``.
        auto_recovery: Enable automatic recovery of failed agents. Always
            written to ``enabled`` of the recovery configuration in use.
        circuit_breaker: Default circuit breaker configuration
        node_id: Node ID for distributed coordination (a random
            ``node-<8 hex chars>`` ID is generated when omitted)
        health_config: Health check configuration
        recovery_config: Auto-recovery configuration
        scaling_config: Agent scaling configuration
        shutdown_config: Graceful shutdown configuration
        hot_reload_config: Hot reload configuration
    """
    import copy  # local import keeps this block self-contained

    self.node_id = node_id or f"node-{uuid.uuid4().hex[:8]}"

    # Configure health monitoring.
    # A caller-supplied config is shallow-copied before the interval is
    # applied, so an object shared with other code (or another control
    # plane) is not modified behind the caller's back.
    health_config = copy.copy(health_config) if health_config else HealthCheckConfig()
    health_config.liveness_interval_seconds = health_check_interval
    self.health_monitor = HealthMonitor(config=health_config)

    # Configure auto-recovery (same copy-before-override rule as above)
    recovery_config = copy.copy(recovery_config) if recovery_config else RecoveryConfig()
    recovery_config.enabled = auto_recovery
    self.recovery_manager = AutoRecoveryManager(config=recovery_config)

    # Configure circuit breakers
    self.default_circuit_breaker = circuit_breaker
    self.circuit_breaker_registry = CircuitBreakerRegistry()

    # Configure scaling
    self.scaler = AgentScaler()
    self.default_scaling_config = scaling_config or ScalingConfig()

    # Configure distributed coordination
    self.coordinator = DistributedCoordinator(node_id=self.node_id)

    # Configure dependency graph
    self.dependency_graph = DependencyGraph()

    # Configure graceful shutdown
    self.shutdown_manager = GracefulShutdownManager(
        config=shutdown_config or ShutdownConfig()
    )

    # Configure resource quotas
    self.quota_manager = ResourceQuotaManager()

    # Configure observability
    self.observability = AgentObservabilityProvider()

    # Configure hot reload
    self.hot_reload = HotReloadManager(
        config=hot_reload_config or HotReloadConfig()
    )

    # Agent registrations
    self._registrations: Dict[str, AgentRegistration] = {}
    self._instances: Dict[str, List[Any]] = defaultdict(list)
    self._running = False

    # Wire up callbacks
    self._setup_callbacks()
|
|
2768
|
+
|
|
2769
|
+
def _setup_callbacks(self) -> None:
    """Connect the health monitor and the recovery manager to each other.

    Two handlers are registered:

    * ``"liveness_failed"`` on the health monitor: records the failure in
      observability and asks the recovery manager to handle it.
    * ``"recovery_success"`` on the recovery manager: records the recovery
      and re-registers the recovered agent with the health monitor.
    """
    async def _handle_liveness_failure(agent_id: str):
        # Health -> Recovery: a failed liveness check triggers recovery.
        telemetry = self.observability
        telemetry.log(agent_id, "error", "Liveness check failed")
        telemetry.increment_counter(agent_id, "health_failures_total")
        await self.recovery_manager.handle_failure(agent_id)

    self.health_monitor.on_event("liveness_failed", _handle_liveness_failure)

    async def _handle_recovery_success(agent_id: str, event: RecoveryEvent):
        # Recovery -> Health: put the recovered agent back under monitoring.
        message = f"Agent recovered (attempt {event.attempt})"
        self.observability.log(agent_id, "info", message)
        self.observability.increment_counter(agent_id, "recoveries_total")
        recovered = self.recovery_manager.get_agent(agent_id)
        if not recovered:
            return
        self.health_monitor.register_agent(agent_id, recovered)

    self.recovery_manager.on_event("recovery_success", _handle_recovery_success)
|
|
2788
|
+
|
|
2789
|
+
def register(
    self,
    agent_type: Type,
    agent_id: Optional[str] = None,
    replicas: int = 1,
    dependencies: Optional[List[str]] = None,
    resources: Optional[AgentResourceQuota] = None,
    health_config: Optional[HealthCheckConfig] = None,
    recovery_config: Optional[RecoveryConfig] = None,
    circuit_breaker: Optional[CircuitBreaker] = None,
    **metadata
) -> str:
    """
    Register an agent type with the control plane.

    The registration is stored and then announced to the dependency graph,
    the scaler, the quota manager (when a quota is given) and the circuit
    breaker registry (when a breaker is given explicitly).

    Args:
        agent_type: The agent class to register; it is called with no
            arguments to build instances
        agent_id: Optional agent ID (defaults to class name)
        replicas: Number of replicas to create
        dependencies: List of agent IDs this agent depends on
        resources: Resource quota for this agent
        health_config: Health check configuration
        recovery_config: Auto-recovery configuration
        circuit_breaker: Circuit breaker for this agent
        **metadata: Additional metadata

    Returns:
        The agent ID
    """
    if not agent_id:
        agent_id = agent_type.__name__
    if not dependencies:
        dependencies = []

    # The stored record falls back to the control plane's default breaker.
    effective_breaker = circuit_breaker or self.default_circuit_breaker
    self._registrations[agent_id] = AgentRegistration(
        agent_type=agent_type,
        replicas=replicas,
        dependencies=dependencies,
        resources=resources,
        health_config=health_config,
        recovery_config=recovery_config,
        circuit_breaker=effective_breaker,
        metadata=metadata
    )

    # Dependency graph
    self.dependency_graph.add_agent(agent_id, depends_on=dependencies)

    # Scaler: the class is bound as a default argument so the factory keeps
    # building this agent type.
    def build_replica(at=agent_type):
        return at()

    self.scaler.register_agent_type(
        agent_type=agent_id,
        factory=build_replica,
        config=self.default_scaling_config,
        replicas=replicas
    )

    # Resource quota, only when one was supplied
    if resources:
        self.quota_manager.set_quota(agent_id, resources)

    # Only an explicitly passed breaker is placed in the registry; the
    # default breaker is recorded on the registration but not here.
    if circuit_breaker:
        self.circuit_breaker_registry._breakers[agent_id] = circuit_breaker

    summary = f"Registered agent with {replicas} replicas, dependencies: {dependencies}"
    self.observability.log(agent_id, "info", summary)

    logger.info(f"Registered agent {agent_id}: replicas={replicas}, dependencies={dependencies}")
    return agent_id
|
|
2860
|
+
|
|
2861
|
+
async def start_all(self) -> Dict[str, Any]:
    """
    Start all registered agents in dependency order.

    The dependency graph is validated first. Then the coordinator and the
    health monitor are started, agents are started group by group (agents
    within one group concurrently), and finally the scaler is started.

    Returns:
        Summary of startup results. ``"status"`` is one of
        ``"already_running"``, ``"started"`` or ``"failed"``; ``"agents"``
        maps agent IDs to their individual results and ``"errors"``
        collects error messages. A dependency-graph validation failure is
        reported as ``"failed"`` (it previously returned no status at all).
    """
    if self._running:
        return {"status": "already_running"}

    result = {
        "started_at": datetime.now().isoformat(),
        "agents": {},
        "errors": []
    }

    try:
        # Validate dependency graph
        errors = self.dependency_graph.validate()
        if errors:
            result["status"] = "failed"
            result["errors"] = errors
            return result

        # Get startup order
        startup_groups = self.dependency_graph.get_parallel_startup_groups()

        # Start coordinator
        await self.coordinator.start()

        # Start health monitor
        await self.health_monitor.start()

        # Start agents in dependency order
        for group in startup_groups:
            # Start agents in this group in parallel
            tasks = [self._start_agent(agent_id) for agent_id in group]
            group_results = await asyncio.gather(*tasks, return_exceptions=True)

            for agent_id, res in zip(group, group_results):
                # gather() can hand back BaseException subclasses such as
                # CancelledError; those are failures too, not results.
                if isinstance(res, BaseException):
                    result["agents"][agent_id] = {
                        "status": "failed",
                        "error": str(res)
                    }
                    result["errors"].append(f"{agent_id}: {res}")
                else:
                    result["agents"][agent_id] = res

        # Start scaler
        await self.scaler.start()

        self._running = True
        result["status"] = "started"

    except Exception as e:
        result["status"] = "failed"
        result["errors"].append(str(e))
        logger.error(f"Failed to start control plane: {e}")

    return result
|
|
2924
|
+
|
|
2925
|
+
async def _start_agent(self, agent_id: str) -> Dict[str, Any]:
    """Start every replica of one registered agent.

    For each replica an instance is built from the registered agent type,
    started if it has a ``start`` attribute, recorded in
    ``self._instances`` and registered with the health monitor under
    ``"<agent_id>-<index>"``. A failing replica is reported in the result
    and does not stop the remaining replicas.

    Args:
        agent_id: ID of a previously registered agent.

    Returns:
        Dict with ``"agent_id"``, a per-replica ``"replicas"`` list and an
        overall ``"status"``: ``"failed"`` when replicas were requested but
        none of them started, otherwise ``"running"``.

    Raises:
        ValueError: If ``agent_id`` has not been registered.
    """
    import inspect  # local import keeps this block self-contained

    registration = self._registrations.get(agent_id)
    if not registration:
        raise ValueError(f"Agent {agent_id} not registered")

    result = {
        "agent_id": agent_id,
        "status": "starting",
        "replicas": []
    }

    # Check resource quota
    if registration.resources:
        self.quota_manager.set_quota(agent_id, registration.resources)

    # Create factory for recovery manager
    def create_agent():
        return registration.agent_type()

    # Register with recovery manager
    self.recovery_manager.register_agent(agent_id, create_agent)

    # Create replicas
    for i in range(registration.replicas):
        replica_id = f"{agent_id}-{i}"
        try:
            instance = create_agent()

            # Call start() and await the result only when it is awaitable;
            # this also covers sync wrappers that return a coroutine.
            if hasattr(instance, 'start'):
                outcome = instance.start()
                if inspect.isawaitable(outcome):
                    await outcome

            self._instances[agent_id].append(instance)

            # Register with health monitor
            self.health_monitor.register_agent(replica_id, instance)

            result["replicas"].append({
                "replica_id": replica_id,
                "status": "running"
            })

            self.observability.log(agent_id, "info", f"Started replica {replica_id}")

        except Exception as e:
            result["replicas"].append({
                "replica_id": replica_id,
                "status": "failed",
                "error": str(e)
            })
            self.observability.log(agent_id, "error", f"Failed to start replica {replica_id}: {e}")

    # Do not report "running" when every requested replica failed to start.
    any_running = any(entry["status"] == "running" for entry in result["replicas"])
    if result["replicas"] and not any_running:
        result["status"] = "failed"
    else:
        result["status"] = "running"
    return result
|
|
2983
|
+
|
|
2984
|
+
async def stop_all(self) -> Dict[str, Any]:
    """
    Stop all agents gracefully.

    The shutdown manager runs first, then the scaler, health monitor and
    coordinator are stopped, and finally every recorded agent instance is
    stopped following the dependency graph's shutdown order. An instance
    whose ``stop`` raises is logged and skipped.

    Returns:
        ``{"status": "not_running"}`` when the control plane is not
        running, otherwise ``{"status": "stopped", "shutdown_result": ...}``
        carrying whatever the shutdown manager returned.
    """
    import inspect  # local import keeps this block self-contained

    if not self._running:
        return {"status": "not_running"}

    # Initiate graceful shutdown
    shutdown_result = await self.shutdown_manager.shutdown()

    # Stop components in reverse order
    await self.scaler.stop()
    await self.health_monitor.stop()
    await self.coordinator.stop()

    # Stop agents in reverse dependency order
    shutdown_order = self.dependency_graph.get_shutdown_order()

    for agent_id in shutdown_order:
        for instance in self._instances.get(agent_id, []):
            try:
                if hasattr(instance, 'stop'):
                    # Await whatever stop() returns if it is awaitable, so a
                    # sync wrapper returning a coroutine is not left pending.
                    outcome = instance.stop()
                    if inspect.isawaitable(outcome):
                        await outcome
            except Exception as e:
                logger.error(f"Error stopping {agent_id}: {e}")

        self._instances[agent_id].clear()

    self._running = False

    return {
        "status": "stopped",
        "shutdown_result": shutdown_result
    }
|
|
3024
|
+
|
|
3025
|
+
def get_agent(self, agent_id: str, replica_index: int = 0) -> Optional[Any]:
    """Return one replica instance of an agent.

    Args:
        agent_id: ID of the agent whose replica is wanted.
        replica_index: Zero-based position in the agent's instance list.

    Returns:
        The instance at ``replica_index``, or None when the agent has no
        instances or the index is negative or past the end.
    """
    replicas = self._instances.get(agent_id, [])
    in_range = 0 <= replica_index < len(replicas)
    return replicas[replica_index] if in_range else None
|
|
3031
|
+
|
|
3032
|
+
async def get_available_agent(self, agent_id: str) -> Optional[Any]:
    """Return a replica that may take work, as chosen by the scaler.

    Args:
        agent_id: ID of the agent a replica is wanted for.

    Returns:
        None when the agent's circuit breaker is open or the quota manager
        refuses execution; otherwise whatever the scaler's ``get_replica``
        yields for the agent.
    """
    # Gate 1: an open circuit breaker blocks the agent.
    breaker = self.circuit_breaker_registry.get(agent_id)
    if breaker and breaker.is_open:
        return None

    # Gate 2: the quota must allow execution before a replica is fetched.
    if self.quota_manager.can_execute(agent_id):
        return await self.scaler.get_replica(agent_id)
    return None
|
|
3045
|
+
|
|
3046
|
+
def get_health_status(self, agent_id: str) -> HealthStatus:
    """Return the health monitor's status for ``agent_id``.

    Args:
        agent_id: ID passed straight to the health monitor.

    Returns:
        The result of ``health_monitor.get_agent_health``.
    """
    # NOTE(review): replicas are registered with the monitor under
    # "<agent_id>-<index>" when started, so callers presumably need to pass
    # a replica ID here — confirm against HealthMonitor.
    return self.health_monitor.get_agent_health(agent_id)
|
|
3049
|
+
|
|
3050
|
+
def get_all_health_status(self) -> Dict[str, HealthStatus]:
    """Return the health monitor's status map for every monitored agent.

    Returns:
        The result of ``health_monitor.get_all_health_status``, unchanged.
    """
    return self.health_monitor.get_all_health_status()
|
|
3053
|
+
|
|
3054
|
+
def get_circuit_breaker(self, agent_id: str) -> Optional[CircuitBreaker]:
    """Return the circuit breaker the registry holds for ``agent_id``.

    Args:
        agent_id: ID of the agent to look up.

    Returns:
        The result of ``circuit_breaker_registry.get``.
    """
    # NOTE(review): `register` only adds a breaker to the registry when one
    # is passed explicitly, so agents relying on the default breaker are
    # presumably not found here — confirm against CircuitBreakerRegistry.
    return self.circuit_breaker_registry.get(agent_id)
|
|
3057
|
+
|
|
3058
|
+
def get_metrics(self) -> str:
    """Return metrics as Prometheus-formatted text.

    Returns:
        The result of the observability provider's ``export_prometheus``.
    """
    return self.observability.export_prometheus()
|
|
3061
|
+
|
|
3062
|
+
def get_status(self) -> Dict[str, Any]:
    """Build a snapshot of the control plane's current state.

    Returns:
        Dict with the keys ``"running"``, ``"node_id"``, ``"is_leader"``,
        ``"registered_agents"``, ``"health_status"`` (status ``.value`` per
        monitored ID), ``"circuit_breakers"`` (metrics attributes per
        registered breaker), ``"resource_violations"`` and
        ``"in_flight_operations"``.
    """
    snapshot: Dict[str, Any] = {}
    snapshot["running"] = self._running
    snapshot["node_id"] = self.node_id
    snapshot["is_leader"] = self.coordinator.is_leader
    snapshot["registered_agents"] = list(self._registrations)

    # Reduce each health status to its plain `.value`.
    health_values = {}
    for monitored_id, status in self.health_monitor.get_all_health_status().items():
        health_values[monitored_id] = status.value
    snapshot["health_status"] = health_values

    # Expose every breaker's metrics object as its attribute dict.
    breaker_metrics = {}
    for name, cb in self.circuit_breaker_registry._breakers.items():
        breaker_metrics[name] = cb.get_metrics().__dict__
    snapshot["circuit_breakers"] = breaker_metrics

    snapshot["resource_violations"] = self.quota_manager.check_quota_violations()
    snapshot["in_flight_operations"] = self.shutdown_manager.get_in_flight_count()
    return snapshot
|
|
3079
|
+
|
|
3080
|
+
|
|
3081
|
+
# Convenience factory function
|
|
3082
|
+
def create_control_plane(
    health_check_interval: float = 30.0,
    auto_recovery: bool = True,
    circuit_breaker: Optional[CircuitBreaker] = None,
    **kwargs
) -> EnhancedAgentControlPlane:
    """
    Create an enhanced agent control plane.

    This is the recommended way to create a control plane instance. It is
    a thin wrapper: every argument is forwarded unchanged to the
    ``EnhancedAgentControlPlane`` constructor.

    Args:
        health_check_interval: Interval between health checks (seconds)
        auto_recovery: Enable automatic recovery
        circuit_breaker: Default circuit breaker
        **kwargs: Additional constructor keyword arguments (``node_id``,
            ``health_config``, ``recovery_config``, ``scaling_config``,
            ``shutdown_config``, ``hot_reload_config``)

    Returns:
        Configured EnhancedAgentControlPlane instance
    """
    return EnhancedAgentControlPlane(
        health_check_interval=health_check_interval,
        auto_recovery=auto_recovery,
        circuit_breaker=circuit_breaker,
        **kwargs
    )
|
|
3108
|
+
|
|
3109
|
+
|
|
3110
|
+
# Backwards compatibility alias: `AgentControlPlaneV2` is the same class
# object as `EnhancedAgentControlPlane`, not a subclass or wrapper.
AgentControlPlaneV2 = EnhancedAgentControlPlane
|