agent-os-kernel 1.1.0__py3-none-any.whl → 1.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (1051) hide show
  1. agent_os/__init__.py +66 -4
  2. agent_os/agents_compat.py +286 -0
  3. agent_os/base_agent.py +308 -0
  4. agent_os/cli.py +1079 -19
  5. agent_os/integrations/__init__.py +37 -2
  6. agent_os/integrations/openai_adapter.py +502 -0
  7. agent_os/integrations/semantic_kernel_adapter.py +569 -0
  8. agent_os/stateless.py +349 -0
  9. agent_os_kernel-1.3.0.dist-info/METADATA +676 -0
  10. agent_os_kernel-1.3.0.dist-info/RECORD +1053 -0
  11. {agent_os_kernel-1.1.0.dist-info → agent_os_kernel-1.3.0.dist-info}/entry_points.txt +0 -1
  12. modules/amb/.github/workflows/ci.yml +102 -0
  13. modules/amb/.github/workflows/publish.yml +146 -0
  14. modules/amb/.gitignore +134 -0
  15. modules/amb/CHANGELOG.md +118 -0
  16. modules/amb/CONTRIBUTING.md +141 -0
  17. modules/amb/LICENSE +21 -0
  18. modules/amb/README.md +188 -0
  19. modules/amb/amb_core/__init__.py +175 -0
  20. modules/amb/amb_core/adapters/__init__.py +55 -0
  21. modules/amb/amb_core/adapters/aws_sqs_broker.py +374 -0
  22. modules/amb/amb_core/adapters/azure_servicebus_broker.py +338 -0
  23. modules/amb/amb_core/adapters/kafka_broker.py +258 -0
  24. modules/amb/amb_core/adapters/nats_broker.py +283 -0
  25. modules/amb/amb_core/adapters/rabbitmq_broker.py +233 -0
  26. modules/amb/amb_core/adapters/redis_broker.py +260 -0
  27. modules/amb/amb_core/broker.py +143 -0
  28. modules/amb/amb_core/bus.py +479 -0
  29. modules/amb/amb_core/cloudevents.py +507 -0
  30. modules/amb/amb_core/dlq.py +343 -0
  31. modules/amb/amb_core/hf_utils.py +534 -0
  32. modules/amb/amb_core/memory_broker.py +408 -0
  33. modules/amb/amb_core/models.py +139 -0
  34. modules/amb/amb_core/persistence.py +527 -0
  35. modules/amb/amb_core/schema.py +292 -0
  36. modules/amb/amb_core/tracing.py +356 -0
  37. modules/amb/examples/advanced_features.py +223 -0
  38. modules/amb/examples/backpressure_demo.py +225 -0
  39. modules/amb/examples/basic_usage.py +117 -0
  40. modules/amb/examples/tracing_demo.py +104 -0
  41. modules/amb/experiments/README.md +52 -0
  42. modules/amb/experiments/reproduce_results.py +467 -0
  43. modules/amb/experiments/results.json +324 -0
  44. modules/amb/paper/README.md +40 -0
  45. modules/amb/paper/paper.tex +365 -0
  46. modules/amb/paper/whitepaper.md +377 -0
  47. modules/amb/pyproject.toml +117 -0
  48. modules/amb/tests/__init__.py +1 -0
  49. modules/amb/tests/test_backpressure_priority.py +280 -0
  50. modules/amb/tests/test_bus.py +198 -0
  51. modules/amb/tests/test_cloudevents.py +443 -0
  52. modules/amb/tests/test_features.py +531 -0
  53. modules/amb/tests/test_models.py +74 -0
  54. modules/amb/tests/test_tracing.py +254 -0
  55. modules/atr/.github/workflows/ci.yml +101 -0
  56. modules/atr/.github/workflows/publish.yml +140 -0
  57. modules/atr/.gitignore +134 -0
  58. modules/atr/.pre-commit-config.yaml +37 -0
  59. modules/atr/CHANGELOG.md +39 -0
  60. modules/atr/CONTRIBUTING.md +96 -0
  61. modules/atr/IMPLEMENTATION_SUMMARY.md +143 -0
  62. modules/atr/README.md +180 -0
  63. modules/atr/atr/__init__.py +638 -0
  64. modules/atr/atr/access.py +346 -0
  65. modules/atr/atr/composition.py +643 -0
  66. modules/atr/atr/decorator.py +355 -0
  67. modules/atr/atr/executor.py +382 -0
  68. modules/atr/atr/health.py +555 -0
  69. modules/atr/atr/hf_utils.py +447 -0
  70. modules/atr/atr/injection.py +420 -0
  71. modules/atr/atr/metrics.py +438 -0
  72. modules/atr/atr/policies.py +401 -0
  73. modules/atr/atr/py.typed +2 -0
  74. modules/atr/atr/registry.py +450 -0
  75. modules/atr/atr/schema.py +478 -0
  76. modules/atr/atr/tools/safe/__init__.py +73 -0
  77. modules/atr/atr/tools/safe/calculator.py +380 -0
  78. modules/atr/atr/tools/safe/datetime_tool.py +441 -0
  79. modules/atr/atr/tools/safe/file_reader.py +400 -0
  80. modules/atr/atr/tools/safe/http_client.py +314 -0
  81. modules/atr/atr/tools/safe/json_parser.py +372 -0
  82. modules/atr/atr/tools/safe/text_tool.py +526 -0
  83. modules/atr/atr/tools/safe/toolkit.py +173 -0
  84. modules/atr/docs/PYPI_SETUP.md +113 -0
  85. modules/atr/examples/README.md +27 -0
  86. modules/atr/examples/demo.py +144 -0
  87. modules/atr/examples/sandbox_demo.py +218 -0
  88. modules/atr/experiments/README.md +69 -0
  89. modules/atr/experiments/reproduce_results.py +509 -0
  90. modules/atr/experiments/results/.gitkeep +0 -0
  91. modules/atr/experiments/results/results_20260123_140334.json +71 -0
  92. modules/atr/paper/README.md +36 -0
  93. modules/atr/paper/figures/.gitkeep +0 -0
  94. modules/atr/paper/references.bib +84 -0
  95. modules/atr/paper/structure.tex +293 -0
  96. modules/atr/paper/whitepaper.md +234 -0
  97. modules/atr/pyproject.toml +148 -0
  98. modules/atr/requirements.txt +1 -0
  99. modules/atr/setup.py +30 -0
  100. modules/atr/tests/__init__.py +1 -0
  101. modules/atr/tests/test_decorator.py +317 -0
  102. modules/atr/tests/test_executor.py +245 -0
  103. modules/atr/tests/test_integration_executor.py +184 -0
  104. modules/atr/tests/test_registry.py +312 -0
  105. modules/atr/tests/test_schema.py +182 -0
  106. modules/atr/tests/test_v2_features.py +708 -0
  107. modules/caas/.dockerignore +63 -0
  108. modules/caas/.github/ISSUE_TEMPLATE/bug_report.md +38 -0
  109. modules/caas/.github/ISSUE_TEMPLATE/custom.md +10 -0
  110. modules/caas/.github/ISSUE_TEMPLATE/feature_request.md +20 -0
  111. modules/caas/.github/workflows/ci.yml +100 -0
  112. modules/caas/.github/workflows/lint.yml +39 -0
  113. modules/caas/.github/workflows/publish-pypi.yml +124 -0
  114. modules/caas/.gitignore +73 -0
  115. modules/caas/.pre-commit-config.yaml +33 -0
  116. modules/caas/CHANGELOG.md +58 -0
  117. modules/caas/CONTRIBUTING.md +346 -0
  118. modules/caas/Dockerfile +41 -0
  119. modules/caas/LICENSE +21 -0
  120. modules/caas/MANIFEST.in +11 -0
  121. modules/caas/README.md +158 -0
  122. modules/caas/benchmarks/README.md +255 -0
  123. modules/caas/benchmarks/create_hf_dataset.py +502 -0
  124. modules/caas/benchmarks/data/sample_corpus/README.md +86 -0
  125. modules/caas/benchmarks/data/sample_corpus/auth_module.py +211 -0
  126. modules/caas/benchmarks/data/sample_corpus/contribution_guide.md +185 -0
  127. modules/caas/benchmarks/data/sample_corpus/remote_work_policy.html +57 -0
  128. modules/caas/benchmarks/hf_dataset/README.md +214 -0
  129. modules/caas/benchmarks/hf_dataset/caas_benchmark_corpus.py +73 -0
  130. modules/caas/benchmarks/hf_dataset/corpus_preview.json +193 -0
  131. modules/caas/benchmarks/results/README.md +66 -0
  132. modules/caas/benchmarks/results/evaluation_2026-01-20.json +121 -0
  133. modules/caas/benchmarks/run_evaluation.py +561 -0
  134. modules/caas/benchmarks/statistical_tests.py +289 -0
  135. modules/caas/benchmarks/verify_sample_corpus.py +83 -0
  136. modules/caas/docker-compose.yml +38 -0
  137. modules/caas/docs/CONTEXT_TRIAD.md +462 -0
  138. modules/caas/docs/CONTRIBUTING.md +346 -0
  139. modules/caas/docs/ETHICS_AND_LIMITATIONS.md +336 -0
  140. modules/caas/docs/HEURISTIC_ROUTER.md +442 -0
  141. modules/caas/docs/IMPLEMENTATION_SUMMARY.md +363 -0
  142. modules/caas/docs/IMPLEMENTATION_SUMMARY_CONTEXT_TRIAD.md +277 -0
  143. modules/caas/docs/IMPLEMENTATION_SUMMARY_HEURISTIC_ROUTER.md +231 -0
  144. modules/caas/docs/IMPLEMENTATION_SUMMARY_METADATA_INJECTION.md +258 -0
  145. modules/caas/docs/IMPLEMENTATION_SUMMARY_PRAGMATIC_TRUTH.md +212 -0
  146. modules/caas/docs/IMPLEMENTATION_SUMMARY_TRUST_GATEWAY.md +319 -0
  147. modules/caas/docs/LAYER_1_PRIMITIVE.md +202 -0
  148. modules/caas/docs/METADATA_INJECTION.md +404 -0
  149. modules/caas/docs/PRAGMATIC_TRUTH.md +431 -0
  150. modules/caas/docs/RELATED_WORK.md +312 -0
  151. modules/caas/docs/RELEASE_CHECKLIST.md +219 -0
  152. modules/caas/docs/RELEASE_GUIDE.md +285 -0
  153. modules/caas/docs/REPRODUCIBILITY.md +386 -0
  154. modules/caas/docs/SLIDING_WINDOW.md +387 -0
  155. modules/caas/docs/STRUCTURE_AWARE_INDEXING.md +158 -0
  156. modules/caas/docs/TESTING.md +259 -0
  157. modules/caas/docs/THREAT_MODEL.md +247 -0
  158. modules/caas/docs/TRUST_GATEWAY.md +575 -0
  159. modules/caas/docs/VFS.md +298 -0
  160. modules/caas/examples/agents/enterprise_security_agent.py +414 -0
  161. modules/caas/examples/agents/intelligent_document_analyzer.py +380 -0
  162. modules/caas/examples/demos/demo.py +309 -0
  163. modules/caas/examples/demos/demo_context_triad.py +225 -0
  164. modules/caas/examples/demos/demo_conversation_manager.py +285 -0
  165. modules/caas/examples/demos/demo_heuristic_router.py +133 -0
  166. modules/caas/examples/demos/demo_metadata_injection.py +198 -0
  167. modules/caas/examples/demos/demo_pragmatic_truth.py +303 -0
  168. modules/caas/examples/demos/demo_structure_aware.py +140 -0
  169. modules/caas/examples/demos/demo_time_decay.py +247 -0
  170. modules/caas/examples/demos/demo_trust_gateway.py +383 -0
  171. modules/caas/examples/multi_agent/README.md +159 -0
  172. modules/caas/examples/multi_agent/research_team.py +369 -0
  173. modules/caas/examples/multi_agent/vfs_collaboration.py +393 -0
  174. modules/caas/examples/usage/auth_module.py +142 -0
  175. modules/caas/examples/usage/usage_example.py +173 -0
  176. modules/caas/experiments/README.md +42 -0
  177. modules/caas/experiments/reproduce_results.py +462 -0
  178. modules/caas/paper/ARXIV_METADATA.md +145 -0
  179. modules/caas/paper/ARXIV_README.md +47 -0
  180. modules/caas/paper/CHECKLIST.md +103 -0
  181. modules/caas/paper/GITHUB_RELEASE_NOTES.md +105 -0
  182. modules/caas/paper/README.md +71 -0
  183. modules/caas/paper/abstract.md +24 -0
  184. modules/caas/paper/arxiv_submission.tar +0 -0
  185. modules/caas/paper/arxiv_submission.zip +0 -0
  186. modules/caas/paper/build_pdf.py +355 -0
  187. modules/caas/paper/experiments.md +149 -0
  188. modules/caas/paper/figures/.gitkeep +0 -0
  189. modules/caas/paper/figures/README.md +237 -0
  190. modules/caas/paper/figures/fig1_system_architecture.png +0 -0
  191. modules/caas/paper/figures/fig1_system_architecture.svg +198 -0
  192. modules/caas/paper/figures/fig2_context_triad.png +0 -0
  193. modules/caas/paper/figures/fig2_context_triad.svg +105 -0
  194. modules/caas/paper/figures/fig3_ablation_results.png +0 -0
  195. modules/caas/paper/figures/fig3_ablation_results.svg +113 -0
  196. modules/caas/paper/figures/fig4_routing_latency.png +0 -0
  197. modules/caas/paper/figures/fig4_routing_latency.svg +97 -0
  198. modules/caas/paper/intro.md +103 -0
  199. modules/caas/paper/latex/figures/fig1_system_architecture.png +0 -0
  200. modules/caas/paper/latex/figures/fig2_context_triad.png +0 -0
  201. modules/caas/paper/latex/figures/fig3_ablation_results.png +0 -0
  202. modules/caas/paper/latex/figures/fig4_routing_latency.png +0 -0
  203. modules/caas/paper/latex/main.tex +468 -0
  204. modules/caas/paper/latex/references.bib +140 -0
  205. modules/caas/paper/method.md +350 -0
  206. modules/caas/paper/outline.md +123 -0
  207. modules/caas/paper/related_work.md +101 -0
  208. modules/caas/paper/tables/.gitkeep +0 -0
  209. modules/caas/paper/tables/results_tables.md +50 -0
  210. modules/caas/pyproject.toml +172 -0
  211. modules/caas/requirements.txt +11 -0
  212. modules/caas/src/caas/__init__.py +232 -0
  213. modules/caas/src/caas/api/__init__.py +7 -0
  214. modules/caas/src/caas/api/server.py +1326 -0
  215. modules/caas/src/caas/caching.py +832 -0
  216. modules/caas/src/caas/cli.py +208 -0
  217. modules/caas/src/caas/conversation.py +221 -0
  218. modules/caas/src/caas/decay.py +118 -0
  219. modules/caas/src/caas/detection/__init__.py +7 -0
  220. modules/caas/src/caas/detection/detector.py +236 -0
  221. modules/caas/src/caas/enrichment.py +127 -0
  222. modules/caas/src/caas/gateway/__init__.py +24 -0
  223. modules/caas/src/caas/gateway/trust_gateway.py +471 -0
  224. modules/caas/src/caas/hf_utils.py +477 -0
  225. modules/caas/src/caas/ingestion/__init__.py +21 -0
  226. modules/caas/src/caas/ingestion/processors.py +251 -0
  227. modules/caas/src/caas/ingestion/structure_parser.py +185 -0
  228. modules/caas/src/caas/models.py +354 -0
  229. modules/caas/src/caas/pragmatic_truth.py +441 -0
  230. modules/caas/src/caas/routing/__init__.py +8 -0
  231. modules/caas/src/caas/routing/heuristic_router.py +242 -0
  232. modules/caas/src/caas/storage/__init__.py +7 -0
  233. modules/caas/src/caas/storage/store.py +450 -0
  234. modules/caas/src/caas/triad.py +472 -0
  235. modules/caas/src/caas/tuning/__init__.py +7 -0
  236. modules/caas/src/caas/tuning/tuner.py +322 -0
  237. modules/caas/src/caas/vfs/__init__.py +12 -0
  238. modules/caas/src/caas/vfs/filesystem.py +450 -0
  239. modules/caas/tests/__init__.py +3 -0
  240. modules/caas/tests/conftest.py +8 -0
  241. modules/caas/tests/test_caching.py +628 -0
  242. modules/caas/tests/test_context_triad.py +385 -0
  243. modules/caas/tests/test_conversation_manager.py +289 -0
  244. modules/caas/tests/test_functionality.py +215 -0
  245. modules/caas/tests/test_heuristic_router.py +370 -0
  246. modules/caas/tests/test_metadata_injection.py +328 -0
  247. modules/caas/tests/test_pragmatic_truth.py +322 -0
  248. modules/caas/tests/test_structure_aware_indexing.py +283 -0
  249. modules/caas/tests/test_time_decay.py +268 -0
  250. modules/caas/tests/test_trust_gateway.py +445 -0
  251. modules/caas/tests/test_vfs.py +298 -0
  252. modules/cmvk/.github/FUNDING.yml +9 -0
  253. modules/cmvk/.github/dependabot.yml +54 -0
  254. modules/cmvk/.github/workflows/ci.yml +205 -0
  255. modules/cmvk/.github/workflows/publish.yml +143 -0
  256. modules/cmvk/.gitignore +147 -0
  257. modules/cmvk/.pre-commit-config.yaml +58 -0
  258. modules/cmvk/CHANGELOG.md +146 -0
  259. modules/cmvk/CITATION.cff +48 -0
  260. modules/cmvk/CONTRIBUTING.md +229 -0
  261. modules/cmvk/Dockerfile +87 -0
  262. modules/cmvk/HF_MODEL_CARD.md +185 -0
  263. modules/cmvk/LICENSE +21 -0
  264. modules/cmvk/README.md +149 -0
  265. modules/cmvk/SECURITY.md +114 -0
  266. modules/cmvk/config/prompts/generator_v1.txt +23 -0
  267. modules/cmvk/config/prompts/verifier_hostile.txt +32 -0
  268. modules/cmvk/config/settings.yaml +40 -0
  269. modules/cmvk/coverage_html/.gitignore +2 -0
  270. modules/cmvk/coverage_html/class_index.html +658 -0
  271. modules/cmvk/coverage_html/coverage_html_cb_188fc9a4.js +735 -0
  272. modules/cmvk/coverage_html/favicon_32_cb_c827f16f.png +0 -0
  273. modules/cmvk/coverage_html/function_index.html +1978 -0
  274. modules/cmvk/coverage_html/index.html +255 -0
  275. modules/cmvk/coverage_html/keybd_closed_cb_900cfef5.png +0 -0
  276. modules/cmvk/coverage_html/status.json +1 -0
  277. modules/cmvk/coverage_html/style_cb_5c747636.css +389 -0
  278. modules/cmvk/coverage_html/z_2c49bd2ed3e01e38___init___py.html +315 -0
  279. modules/cmvk/coverage_html/z_2c49bd2ed3e01e38_audit_py.html +499 -0
  280. modules/cmvk/coverage_html/z_2c49bd2ed3e01e38_benchmarks_py.html +575 -0
  281. modules/cmvk/coverage_html/z_2c49bd2ed3e01e38_constitutional_py.html +1001 -0
  282. modules/cmvk/coverage_html/z_2c49bd2ed3e01e38_hf_utils_py.html +398 -0
  283. modules/cmvk/coverage_html/z_2c49bd2ed3e01e38_metrics_py.html +570 -0
  284. modules/cmvk/coverage_html/z_2c49bd2ed3e01e38_profiles_py.html +397 -0
  285. modules/cmvk/coverage_html/z_2c49bd2ed3e01e38_types_py.html +109 -0
  286. modules/cmvk/coverage_html/z_2c49bd2ed3e01e38_verification_py.html +1053 -0
  287. modules/cmvk/docs/DIAGRAMS.md +325 -0
  288. modules/cmvk/docs/architecture.md +345 -0
  289. modules/cmvk/docs/features.md +308 -0
  290. modules/cmvk/docs/getting_started.md +279 -0
  291. modules/cmvk/docs/innovation_layer.md +377 -0
  292. modules/cmvk/docs/safety.md +281 -0
  293. modules/cmvk/docs/traceability.md +150 -0
  294. modules/cmvk/examples/basic_example.py +62 -0
  295. modules/cmvk/examples/demo_complete_pipeline.py +209 -0
  296. modules/cmvk/examples/demo_innovation_layer.py +197 -0
  297. modules/cmvk/examples/example.py +112 -0
  298. modules/cmvk/examples/model_diversity_comparison.py +110 -0
  299. modules/cmvk/examples/real_api_integration.py +121 -0
  300. modules/cmvk/examples/test_full_pipeline.py +303 -0
  301. modules/cmvk/experiments/FEATURE_2_LATERAL_THINKING.md +187 -0
  302. modules/cmvk/experiments/README.md +216 -0
  303. modules/cmvk/experiments/ablation_runner.py +666 -0
  304. modules/cmvk/experiments/baseline_runner.py +158 -0
  305. modules/cmvk/experiments/blind_spot_benchmark.py +364 -0
  306. modules/cmvk/experiments/datasets/README.md +85 -0
  307. modules/cmvk/experiments/datasets/humaneval_50.json +352 -0
  308. modules/cmvk/experiments/datasets/humaneval_full.json +1150 -0
  309. modules/cmvk/experiments/datasets/humaneval_sample.json +32 -0
  310. modules/cmvk/experiments/datasets/sabotage.json +262 -0
  311. modules/cmvk/experiments/datasets/sample.json +40 -0
  312. modules/cmvk/experiments/demo_with_traces.py +110 -0
  313. modules/cmvk/experiments/efficiency_curve.py +259 -0
  314. modules/cmvk/experiments/experiment_runner.py +243 -0
  315. modules/cmvk/experiments/paper_data_generator.py +183 -0
  316. modules/cmvk/experiments/reproduce_results.py +407 -0
  317. modules/cmvk/experiments/reproducible_runner.py +352 -0
  318. modules/cmvk/experiments/sabotage_stress_test.py +311 -0
  319. modules/cmvk/experiments/test_lateral_thinking.py +116 -0
  320. modules/cmvk/experiments/test_prosecutor.py +41 -0
  321. modules/cmvk/experiments/visualize_results.py +735 -0
  322. modules/cmvk/logs/traces/demo_HumanEval_0_20260121-204900.json +36 -0
  323. modules/cmvk/notebooks/analysis.ipynb +124 -0
  324. modules/cmvk/paper/PAPER.md +561 -0
  325. modules/cmvk/paper/arxiv_checklist.md +230 -0
  326. modules/cmvk/paper/cmvk_neurips.aux +77 -0
  327. modules/cmvk/paper/cmvk_neurips.bbl +81 -0
  328. modules/cmvk/paper/cmvk_neurips.blg +48 -0
  329. modules/cmvk/paper/cmvk_neurips.out +16 -0
  330. modules/cmvk/paper/cmvk_neurips.pdf +0 -0
  331. modules/cmvk/paper/cmvk_neurips.tex +309 -0
  332. modules/cmvk/paper/figures/ablation.png +0 -0
  333. modules/cmvk/paper/figures/ablation.svg +39 -0
  334. modules/cmvk/paper/figures/architecture.png +0 -0
  335. modules/cmvk/paper/figures/architecture.svg +115 -0
  336. modules/cmvk/paper/figures/results_bar.png +0 -0
  337. modules/cmvk/paper/figures/results_bar.svg +70 -0
  338. modules/cmvk/paper/generate_figures.py +383 -0
  339. modules/cmvk/paper/neurips_2024.sty +101 -0
  340. modules/cmvk/paper/references.bib +98 -0
  341. modules/cmvk/paper/structure.tex +200 -0
  342. modules/cmvk/pyproject.toml +189 -0
  343. modules/cmvk/requirements-dev.txt +19 -0
  344. modules/cmvk/requirements.txt +14 -0
  345. modules/cmvk/src/cmvk/__init__.py +216 -0
  346. modules/cmvk/src/cmvk/audit.py +400 -0
  347. modules/cmvk/src/cmvk/benchmarks.py +476 -0
  348. modules/cmvk/src/cmvk/constitutional.py +902 -0
  349. modules/cmvk/src/cmvk/hf_utils.py +299 -0
  350. modules/cmvk/src/cmvk/metrics.py +471 -0
  351. modules/cmvk/src/cmvk/profiles.py +298 -0
  352. modules/cmvk/src/cmvk/py.typed +0 -0
  353. modules/cmvk/src/cmvk/types.py +10 -0
  354. modules/cmvk/src/cmvk/verification.py +954 -0
  355. modules/cmvk/src/cross_model_verification_kernel/__init__.py +91 -0
  356. modules/cmvk/src/cross_model_verification_kernel/__main__.py +10 -0
  357. modules/cmvk/src/cross_model_verification_kernel/agents/__init__.py +16 -0
  358. modules/cmvk/src/cross_model_verification_kernel/agents/base_agent.py +142 -0
  359. modules/cmvk/src/cross_model_verification_kernel/agents/generator_openai.py +223 -0
  360. modules/cmvk/src/cross_model_verification_kernel/agents/verifier_anthropic.py +448 -0
  361. modules/cmvk/src/cross_model_verification_kernel/agents/verifier_gemini.py +481 -0
  362. modules/cmvk/src/cross_model_verification_kernel/cli.py +570 -0
  363. modules/cmvk/src/cross_model_verification_kernel/core/__init__.py +26 -0
  364. modules/cmvk/src/cross_model_verification_kernel/core/graph_memory.py +308 -0
  365. modules/cmvk/src/cross_model_verification_kernel/core/kernel.py +413 -0
  366. modules/cmvk/src/cross_model_verification_kernel/core/trace_logger.py +75 -0
  367. modules/cmvk/src/cross_model_verification_kernel/core/types.py +121 -0
  368. modules/cmvk/src/cross_model_verification_kernel/datasets/__init__.py +20 -0
  369. modules/cmvk/src/cross_model_verification_kernel/datasets/humaneval_loader.py +271 -0
  370. modules/cmvk/src/cross_model_verification_kernel/generator.py +118 -0
  371. modules/cmvk/src/cross_model_verification_kernel/kernel.py +292 -0
  372. modules/cmvk/src/cross_model_verification_kernel/models.py +111 -0
  373. modules/cmvk/src/cross_model_verification_kernel/py.typed +1 -0
  374. modules/cmvk/src/cross_model_verification_kernel/simple_kernel.py +185 -0
  375. modules/cmvk/src/cross_model_verification_kernel/tools/__init__.py +94 -0
  376. modules/cmvk/src/cross_model_verification_kernel/tools/huggingface_upload.py +394 -0
  377. modules/cmvk/src/cross_model_verification_kernel/tools/sandbox.py +159 -0
  378. modules/cmvk/src/cross_model_verification_kernel/tools/statistics.py +468 -0
  379. modules/cmvk/src/cross_model_verification_kernel/tools/visualizer.py +312 -0
  380. modules/cmvk/src/cross_model_verification_kernel/tools/web_search.py +86 -0
  381. modules/cmvk/src/cross_model_verification_kernel/verifier.py +257 -0
  382. modules/cmvk/tests/__init__.py +3 -0
  383. modules/cmvk/tests/conftest.py +61 -0
  384. modules/cmvk/tests/integration/__init__.py +1 -0
  385. modules/cmvk/tests/integration/test_anthropic_verifier.py +269 -0
  386. modules/cmvk/tests/integration/test_integration.py +53 -0
  387. modules/cmvk/tests/integration/test_lateral_thinking_integration.py +199 -0
  388. modules/cmvk/tests/integration/test_lateral_thinking_witness.py +208 -0
  389. modules/cmvk/tests/integration/test_prosecutor_mode.py +131 -0
  390. modules/cmvk/tests/test_constitutional.py +611 -0
  391. modules/cmvk/tests/test_enhanced_features.py +603 -0
  392. modules/cmvk/tests/test_verification.py +255 -0
  393. modules/cmvk/tests/unit/__init__.py +1 -0
  394. modules/cmvk/tests/unit/test_agents.py +64 -0
  395. modules/cmvk/tests/unit/test_cli.py +224 -0
  396. modules/cmvk/tests/unit/test_core.py +126 -0
  397. modules/cmvk/tests/unit/test_humaneval_loader.py +197 -0
  398. modules/cmvk/tests/unit/test_kernel.py +255 -0
  399. modules/cmvk/tests/unit/test_reproducibility.py +160 -0
  400. modules/cmvk/tests/unit/test_trace_logger.py +115 -0
  401. modules/cmvk/tests/unit/test_visualizer.py +218 -0
  402. modules/control-plane/.github/ISSUE_TEMPLATE/bug_report.yml +82 -0
  403. modules/control-plane/.github/ISSUE_TEMPLATE/config.yml +11 -0
  404. modules/control-plane/.github/ISSUE_TEMPLATE/feature_request.yml +104 -0
  405. modules/control-plane/.github/ISSUE_TEMPLATE/question.yml +70 -0
  406. modules/control-plane/.github/ISSUE_TEMPLATE/security_vulnerability.yml +84 -0
  407. modules/control-plane/.github/discussions.yml +73 -0
  408. modules/control-plane/.github/pull_request_template.md +82 -0
  409. modules/control-plane/.github/workflows/publish.yml +146 -0
  410. modules/control-plane/.github/workflows/release.yml +39 -0
  411. modules/control-plane/.github/workflows/tests.yml +58 -0
  412. modules/control-plane/.gitignore +55 -0
  413. modules/control-plane/CHANGELOG.md +203 -0
  414. modules/control-plane/CONTRIBUTING.md +311 -0
  415. modules/control-plane/CONTRIBUTORS.md +88 -0
  416. modules/control-plane/Dockerfile +82 -0
  417. modules/control-plane/LICENSE +21 -0
  418. modules/control-plane/MANIFEST.in +17 -0
  419. modules/control-plane/README.md +1264 -0
  420. modules/control-plane/ROADMAP.md +228 -0
  421. modules/control-plane/SECURITY.md +210 -0
  422. modules/control-plane/SUPPORT.md +106 -0
  423. modules/control-plane/acp-cli.py +212 -0
  424. modules/control-plane/benchmark/README.md +257 -0
  425. modules/control-plane/benchmark/__init__.py +19 -0
  426. modules/control-plane/benchmark/red_team_dataset.py +517 -0
  427. modules/control-plane/benchmark.py +563 -0
  428. modules/control-plane/build_and_publish.sh +130 -0
  429. modules/control-plane/docker-compose.yml +74 -0
  430. modules/control-plane/docs/ABLATION_STUDIES.md +528 -0
  431. modules/control-plane/docs/ADAPTER_GUIDE.md +544 -0
  432. modules/control-plane/docs/ADVANCED_FEATURES.md +543 -0
  433. modules/control-plane/docs/AIOS_COMPARISON.md +296 -0
  434. modules/control-plane/docs/BIBLIOGRAPHY.md +367 -0
  435. modules/control-plane/docs/CASE_STUDIES.md +645 -0
  436. modules/control-plane/docs/DOCKER_DEPLOYMENT.md +184 -0
  437. modules/control-plane/docs/ECOSYSTEM_STATUS.md +98 -0
  438. modules/control-plane/docs/HF_MODEL_CARD.md +168 -0
  439. modules/control-plane/docs/KERNEL_V1_RELEASE.md +454 -0
  440. modules/control-plane/docs/LAYER3_FRAMEWORK.md +227 -0
  441. modules/control-plane/docs/LIMITATIONS.md +523 -0
  442. modules/control-plane/docs/PYPI_PUBLISHING.md +195 -0
  443. modules/control-plane/docs/README.md +58 -0
  444. modules/control-plane/docs/RELATED_WORK.md +319 -0
  445. modules/control-plane/docs/RELEASE_v1.1.0.md +252 -0
  446. modules/control-plane/docs/REPRODUCIBILITY.md +540 -0
  447. modules/control-plane/docs/RESEARCH_FOUNDATION.md +197 -0
  448. modules/control-plane/docs/api/CORE.md +270 -0
  449. modules/control-plane/docs/architecture/architecture.md +120 -0
  450. modules/control-plane/docs/community/ANNOUNCEMENT_TEMPLATES.md +52 -0
  451. modules/control-plane/docs/guides/IMPLEMENTATION.md +225 -0
  452. modules/control-plane/docs/guides/PHILOSOPHY.md +354 -0
  453. modules/control-plane/docs/guides/QUICKSTART.md +217 -0
  454. modules/control-plane/examples/README.md +138 -0
  455. modules/control-plane/examples/a2a_demo.py +410 -0
  456. modules/control-plane/examples/adapter_demo.py +347 -0
  457. modules/control-plane/examples/advanced_features.py +403 -0
  458. modules/control-plane/examples/basic_usage.py +261 -0
  459. modules/control-plane/examples/benchmark_demo.py +186 -0
  460. modules/control-plane/examples/compliance_demo.py +333 -0
  461. modules/control-plane/examples/configuration.py +265 -0
  462. modules/control-plane/examples/getting_started.py +178 -0
  463. modules/control-plane/examples/hibernation_and_time_travel_demo.py +406 -0
  464. modules/control-plane/examples/interactive_tutorial.ipynb +497 -0
  465. modules/control-plane/examples/kernel_interceptor_demo.py +202 -0
  466. modules/control-plane/examples/kernel_v1_demo.py +273 -0
  467. modules/control-plane/examples/langchain_demo.py +281 -0
  468. modules/control-plane/examples/lifecycle_demo.py +724 -0
  469. modules/control-plane/examples/mcp_demo.py +378 -0
  470. modules/control-plane/examples/ml_safety_demo.py +157 -0
  471. modules/control-plane/examples/multimodal_demo.py +347 -0
  472. modules/control-plane/examples/observability_demo.py +370 -0
  473. modules/control-plane/examples/use_cases.py +336 -0
  474. modules/control-plane/experiments/long_horizon_purge.py +235 -0
  475. modules/control-plane/experiments/multi_agent_rag.py +165 -0
  476. modules/control-plane/experiments/reproduce_results.py +667 -0
  477. modules/control-plane/paper/ARXIV_SUBMISSION_INFO.txt +122 -0
  478. modules/control-plane/paper/ETHICS_STATEMENT.md +248 -0
  479. modules/control-plane/paper/PAPER_CHECKLIST.md +72 -0
  480. modules/control-plane/paper/Paper.pdf +0 -0
  481. modules/control-plane/paper/README.md +71 -0
  482. modules/control-plane/paper/appendix.md +152 -0
  483. modules/control-plane/paper/architecture.md +15 -0
  484. modules/control-plane/paper/arxiv/figures/ablation_chart.png +0 -0
  485. modules/control-plane/paper/arxiv/figures/architecture.png +0 -0
  486. modules/control-plane/paper/arxiv/figures/constraint_graphs.png +0 -0
  487. modules/control-plane/paper/arxiv/figures/results_chart.png +0 -0
  488. modules/control-plane/paper/arxiv/main.aux +97 -0
  489. modules/control-plane/paper/arxiv/main.bbl +112 -0
  490. modules/control-plane/paper/arxiv/main.blg +48 -0
  491. modules/control-plane/paper/arxiv/main.out +33 -0
  492. modules/control-plane/paper/arxiv/main.pdf +0 -0
  493. modules/control-plane/paper/arxiv/main.tex +479 -0
  494. modules/control-plane/paper/arxiv/references.bib +234 -0
  495. modules/control-plane/paper/arxiv_submission.tar +0 -0
  496. modules/control-plane/paper/arxiv_submission.zip +0 -0
  497. modules/control-plane/paper/build.sh +68 -0
  498. modules/control-plane/paper/figures/README.md +47 -0
  499. modules/control-plane/paper/figures/ablation_chart.pdf +0 -0
  500. modules/control-plane/paper/figures/ablation_chart.png +0 -0
  501. modules/control-plane/paper/figures/architecture.pdf +0 -0
  502. modules/control-plane/paper/figures/architecture.png +0 -0
  503. modules/control-plane/paper/figures/constraint_graphs.pdf +0 -0
  504. modules/control-plane/paper/figures/constraint_graphs.png +0 -0
  505. modules/control-plane/paper/figures/generate_figures.py +252 -0
  506. modules/control-plane/paper/figures/results_chart.pdf +0 -0
  507. modules/control-plane/paper/figures/results_chart.png +0 -0
  508. modules/control-plane/paper/main.md +273 -0
  509. modules/control-plane/paper/main.tex +214 -0
  510. modules/control-plane/paper/main_arxiv.aux +53 -0
  511. modules/control-plane/paper/main_arxiv.out +17 -0
  512. modules/control-plane/paper/main_arxiv.pdf +0 -0
  513. modules/control-plane/paper/main_arxiv.tex +264 -0
  514. modules/control-plane/paper/references.bib +234 -0
  515. modules/control-plane/pyproject.toml +124 -0
  516. modules/control-plane/reproducibility/ABLATIONS.md +136 -0
  517. modules/control-plane/reproducibility/README.md +288 -0
  518. modules/control-plane/reproducibility/commands.md +467 -0
  519. modules/control-plane/reproducibility/docker_config/Dockerfile +39 -0
  520. modules/control-plane/reproducibility/experiment_configs/purge_config.json +46 -0
  521. modules/control-plane/reproducibility/experiment_configs/rag_config.json +36 -0
  522. modules/control-plane/reproducibility/hardware_specs.md +317 -0
  523. modules/control-plane/reproducibility/requirements_frozen.txt +0 -0
  524. modules/control-plane/reproducibility/run_all_experiments.sh +45 -0
  525. modules/control-plane/reproducibility/seeds.json +106 -0
  526. modules/control-plane/scripts/prepare_pypi.py +46 -0
  527. modules/control-plane/scripts/prepare_release.py +176 -0
  528. modules/control-plane/scripts/upload_dataset_to_hf.py +316 -0
  529. modules/control-plane/setup.py +69 -0
  530. modules/control-plane/src/agent_control_plane/__init__.py +639 -0
  531. modules/control-plane/src/agent_control_plane/a2a_adapter.py +541 -0
  532. modules/control-plane/src/agent_control_plane/adapter.py +415 -0
  533. modules/control-plane/src/agent_control_plane/agent_hibernation.py +364 -0
  534. modules/control-plane/src/agent_control_plane/agent_kernel.py +464 -0
  535. modules/control-plane/src/agent_control_plane/compliance.py +718 -0
  536. modules/control-plane/src/agent_control_plane/constraint_graphs.py +475 -0
  537. modules/control-plane/src/agent_control_plane/control_plane.py +848 -0
  538. modules/control-plane/src/agent_control_plane/example_executors.py +193 -0
  539. modules/control-plane/src/agent_control_plane/execution_engine.py +229 -0
  540. modules/control-plane/src/agent_control_plane/flight_recorder.py +600 -0
  541. modules/control-plane/src/agent_control_plane/governance_layer.py +432 -0
  542. modules/control-plane/src/agent_control_plane/hf_utils.py +561 -0
  543. modules/control-plane/src/agent_control_plane/interfaces/__init__.py +53 -0
  544. modules/control-plane/src/agent_control_plane/interfaces/kernel_interface.py +359 -0
  545. modules/control-plane/src/agent_control_plane/interfaces/plugin_interface.py +495 -0
  546. modules/control-plane/src/agent_control_plane/interfaces/protocol_interfaces.py +385 -0
  547. modules/control-plane/src/agent_control_plane/kernel_space.py +707 -0
  548. modules/control-plane/src/agent_control_plane/langchain_adapter.py +422 -0
  549. modules/control-plane/src/agent_control_plane/lifecycle.py +3111 -0
  550. modules/control-plane/src/agent_control_plane/mcp_adapter.py +517 -0
  551. modules/control-plane/src/agent_control_plane/ml_safety.py +560 -0
  552. modules/control-plane/src/agent_control_plane/multimodal.py +724 -0
  553. modules/control-plane/src/agent_control_plane/mute_agent.py +419 -0
  554. modules/control-plane/src/agent_control_plane/observability.py +785 -0
  555. modules/control-plane/src/agent_control_plane/orchestrator.py +480 -0
  556. modules/control-plane/src/agent_control_plane/plugin_registry.py +748 -0
  557. modules/control-plane/src/agent_control_plane/policy_engine.py +525 -0
  558. modules/control-plane/src/agent_control_plane/shadow_mode.py +307 -0
  559. modules/control-plane/src/agent_control_plane/signals.py +491 -0
  560. modules/control-plane/src/agent_control_plane/supervisor_agents.py +427 -0
  561. modules/control-plane/src/agent_control_plane/time_travel_debugger.py +554 -0
  562. modules/control-plane/src/agent_control_plane/tool_registry.py +350 -0
  563. modules/control-plane/src/agent_control_plane/vfs.py +695 -0
  564. modules/control-plane/tests/README.md +33 -0
  565. modules/control-plane/tests/test_a2a_adapter.py +336 -0
  566. modules/control-plane/tests/test_adapter.py +422 -0
  567. modules/control-plane/tests/test_advanced_features.py +389 -0
  568. modules/control-plane/tests/test_benchmark.py +223 -0
  569. modules/control-plane/tests/test_compliance.py +214 -0
  570. modules/control-plane/tests/test_control_plane.py +295 -0
  571. modules/control-plane/tests/test_hibernation.py +274 -0
  572. modules/control-plane/tests/test_kernel_interception.py +284 -0
  573. modules/control-plane/tests/test_langchain_adapter.py +258 -0
  574. modules/control-plane/tests/test_lifecycle.py +1174 -0
  575. modules/control-plane/tests/test_mcp_adapter.py +293 -0
  576. modules/control-plane/tests/test_ml_safety.py +142 -0
  577. modules/control-plane/tests/test_multimodal.py +317 -0
  578. modules/control-plane/tests/test_new_features.py +435 -0
  579. modules/control-plane/tests/test_observability.py +338 -0
  580. modules/control-plane/tests/test_time_travel.py +387 -0
  581. modules/emk/.github/workflows/ci.yml +105 -0
  582. modules/emk/.github/workflows/publish.yml +144 -0
  583. modules/emk/.gitignore +74 -0
  584. modules/emk/CHANGELOG.md +41 -0
  585. modules/emk/CONTRIBUTING.md +295 -0
  586. modules/emk/IMPLEMENTATION.md +174 -0
  587. modules/emk/LICENSE +21 -0
  588. modules/emk/MANIFEST.in +8 -0
  589. modules/emk/README.md +135 -0
  590. modules/emk/RELEASE_NOTES.md +82 -0
  591. modules/emk/SECURITY.md +52 -0
  592. modules/emk/codecov.yml +39 -0
  593. modules/emk/docs/MEMORY_MANAGEMENT.md +285 -0
  594. modules/emk/emk/__init__.py +106 -0
  595. modules/emk/emk/hf_utils.py +419 -0
  596. modules/emk/emk/indexer.py +144 -0
  597. modules/emk/emk/py.typed +0 -0
  598. modules/emk/emk/schema.py +204 -0
  599. modules/emk/emk/sleep_cycle.py +345 -0
  600. modules/emk/emk/store.py +479 -0
  601. modules/emk/examples/basic_usage.py +123 -0
  602. modules/emk/examples/memory_features_demo.py +154 -0
  603. modules/emk/experiments/README.md +59 -0
  604. modules/emk/experiments/reproduce_results.py +461 -0
  605. modules/emk/experiments/results.json +61 -0
  606. modules/emk/paper/structure.tex +192 -0
  607. modules/emk/paper/whitepaper.md +273 -0
  608. modules/emk/pyproject.toml +91 -0
  609. modules/emk/setup.py +5 -0
  610. modules/emk/tests/test_file_adapter.py +195 -0
  611. modules/emk/tests/test_indexer.py +174 -0
  612. modules/emk/tests/test_init.py +55 -0
  613. modules/emk/tests/test_negative_memory.py +83 -0
  614. modules/emk/tests/test_schema.py +150 -0
  615. modules/emk/tests/test_semantic_rules.py +175 -0
  616. modules/emk/tests/test_sleep_cycle.py +335 -0
  617. modules/emk/tests/test_store_anti_patterns.py +239 -0
  618. modules/iatp/.github/workflows/docker-build.yml +124 -0
  619. modules/iatp/.github/workflows/publish.yml +174 -0
  620. modules/iatp/.github/workflows/python-package.yml +121 -0
  621. modules/iatp/.gitignore +67 -0
  622. modules/iatp/.pre-commit-config.yaml +64 -0
  623. modules/iatp/CHANGELOG.md +120 -0
  624. modules/iatp/Dockerfile +91 -0
  625. modules/iatp/IMPLEMENTATION_SUMMARY.md +218 -0
  626. modules/iatp/MANIFEST.in +9 -0
  627. modules/iatp/README.md +180 -0
  628. modules/iatp/docker/Dockerfile.agent +27 -0
  629. modules/iatp/docker/Dockerfile.sidecar-python +86 -0
  630. modules/iatp/docker/README.md +258 -0
  631. modules/iatp/docker-compose.yml +194 -0
  632. modules/iatp/docs/ARCHITECTURE.md +243 -0
  633. modules/iatp/docs/CLI_GUIDE.md +220 -0
  634. modules/iatp/docs/DEPLOYMENT.md +304 -0
  635. modules/iatp/examples/README.md +132 -0
  636. modules/iatp/examples/backend_agent.py +39 -0
  637. modules/iatp/examples/client.py +168 -0
  638. modules/iatp/examples/demo_attestation_reputation.py +274 -0
  639. modules/iatp/examples/demo_client.py +240 -0
  640. modules/iatp/examples/demo_rbac.py +143 -0
  641. modules/iatp/examples/integration_demo.py +245 -0
  642. modules/iatp/examples/manifests/coder_agent.json +20 -0
  643. modules/iatp/examples/manifests/reviewer_agent.json +19 -0
  644. modules/iatp/examples/manifests/secure_bank.json +14 -0
  645. modules/iatp/examples/manifests/standard_agent.json +14 -0
  646. modules/iatp/examples/manifests/untrusted_honeypot.json +14 -0
  647. modules/iatp/examples/run_secure_bank_sidecar.py +85 -0
  648. modules/iatp/examples/run_sidecar.py +105 -0
  649. modules/iatp/examples/run_untrusted_sidecar.py +77 -0
  650. modules/iatp/examples/secure_bank_agent.py +138 -0
  651. modules/iatp/examples/test_untrusted.py +82 -0
  652. modules/iatp/examples/untrusted_agent.py +119 -0
  653. modules/iatp/experiments/README.md +58 -0
  654. modules/iatp/experiments/cascading_hallucination/README.md +149 -0
  655. modules/iatp/experiments/cascading_hallucination/agent_a_user.py +41 -0
  656. modules/iatp/experiments/cascading_hallucination/agent_b_summarizer.py +54 -0
  657. modules/iatp/experiments/cascading_hallucination/agent_c_database.py +47 -0
  658. modules/iatp/experiments/cascading_hallucination/proof_of_concept.py +290 -0
  659. modules/iatp/experiments/cascading_hallucination/run_experiment.py +226 -0
  660. modules/iatp/experiments/cascading_hallucination/sidecar_c.py +61 -0
  661. modules/iatp/experiments/reproduce_results.py +574 -0
  662. modules/iatp/experiments/results.json +2336 -0
  663. modules/iatp/iatp/__init__.py +164 -0
  664. modules/iatp/iatp/attestation.py +401 -0
  665. modules/iatp/iatp/cli.py +253 -0
  666. modules/iatp/iatp/hf_utils.py +469 -0
  667. modules/iatp/iatp/ipc_pipes.py +578 -0
  668. modules/iatp/iatp/main.py +410 -0
  669. modules/iatp/iatp/models/__init__.py +445 -0
  670. modules/iatp/iatp/policy_engine.py +335 -0
  671. modules/iatp/iatp/py.typed +2 -0
  672. modules/iatp/iatp/recovery.py +319 -0
  673. modules/iatp/iatp/security/__init__.py +268 -0
  674. modules/iatp/iatp/sidecar/__init__.py +517 -0
  675. modules/iatp/iatp/telemetry/__init__.py +162 -0
  676. modules/iatp/iatp/tests/__init__.py +1 -0
  677. modules/iatp/iatp/tests/test_attestation.py +368 -0
  678. modules/iatp/iatp/tests/test_cli.py +129 -0
  679. modules/iatp/iatp/tests/test_models.py +128 -0
  680. modules/iatp/iatp/tests/test_policy_engine.py +345 -0
  681. modules/iatp/iatp/tests/test_recovery.py +279 -0
  682. modules/iatp/iatp/tests/test_security.py +220 -0
  683. modules/iatp/iatp/tests/test_sidecar.py +165 -0
  684. modules/iatp/iatp/tests/test_telemetry.py +173 -0
  685. modules/iatp/paper/BLOG.md +307 -0
  686. modules/iatp/paper/PAPER.md +236 -0
  687. modules/iatp/paper/RFC_SUBMISSION.md +299 -0
  688. modules/iatp/paper/whitepaper.md +369 -0
  689. modules/iatp/proto/README.md +200 -0
  690. modules/iatp/proto/generate_stubs.py +81 -0
  691. modules/iatp/proto/iatp.proto +552 -0
  692. modules/iatp/pyproject.toml +180 -0
  693. modules/iatp/requirements-dev.txt +2 -0
  694. modules/iatp/requirements.txt +6 -0
  695. modules/iatp/setup.py +60 -0
  696. modules/iatp/sidecar/README.md +487 -0
  697. modules/iatp/sidecar/go/Dockerfile +32 -0
  698. modules/iatp/sidecar/go/README.md +237 -0
  699. modules/iatp/sidecar/go/go.mod +8 -0
  700. modules/iatp/sidecar/go/main.go +488 -0
  701. modules/iatp/spec/001-handshake.md +436 -0
  702. modules/iatp/spec/002-reversibility.md +394 -0
  703. modules/iatp/spec/schema/capability_manifest.json +266 -0
  704. modules/iatp/test_integration.py +310 -0
  705. modules/mcp-kernel-server/README.md +261 -0
  706. modules/mcp-kernel-server/pyproject.toml +60 -0
  707. modules/mcp-kernel-server/src/mcp_kernel_server/__init__.py +26 -0
  708. modules/mcp-kernel-server/src/mcp_kernel_server/cli.py +229 -0
  709. modules/mcp-kernel-server/src/mcp_kernel_server/resources.py +215 -0
  710. modules/mcp-kernel-server/src/mcp_kernel_server/server.py +562 -0
  711. modules/mcp-kernel-server/src/mcp_kernel_server/tools.py +1172 -0
  712. modules/mute-agent/.github/workflows/safety_check.yml +45 -0
  713. modules/mute-agent/.gitignore +53 -0
  714. modules/mute-agent/ARCHITECTURE.md +531 -0
  715. modules/mute-agent/BENCHMARK_GUIDE.md +384 -0
  716. modules/mute-agent/COMPLETION_SUMMARY.md +293 -0
  717. modules/mute-agent/EXPERIMENT_SUMMARY.md +318 -0
  718. modules/mute-agent/IMPLEMENTATION_SUMMARY.md +212 -0
  719. modules/mute-agent/LICENSE +21 -0
  720. modules/mute-agent/PHASE3_SUMMARY.md +297 -0
  721. modules/mute-agent/README.md +360 -0
  722. modules/mute-agent/STEEL_MAN_RESULTS.md +353 -0
  723. modules/mute-agent/USAGE.md +505 -0
  724. modules/mute-agent/V2_IMPLEMENTATION_SUMMARY.md +253 -0
  725. modules/mute-agent/V2_STEEL_MAN_IMPLEMENTATION.md +274 -0
  726. modules/mute-agent/VERIFICATION_REPORT.md +435 -0
  727. modules/mute-agent/charts/cost_comparison.png +0 -0
  728. modules/mute-agent/charts/cost_vs_ambiguity.png +0 -0
  729. modules/mute-agent/charts/metrics_comparison.png +0 -0
  730. modules/mute-agent/charts/scenario_breakdown.png +0 -0
  731. modules/mute-agent/charts/trace_attack_blocked.html +140 -0
  732. modules/mute-agent/charts/trace_attack_blocked.png +0 -0
  733. modules/mute-agent/charts/trace_failure.html +140 -0
  734. modules/mute-agent/charts/trace_failure.png +0 -0
  735. modules/mute-agent/charts/trace_success.html +140 -0
  736. modules/mute-agent/charts/trace_success.png +0 -0
  737. modules/mute-agent/examples/__init__.py +1 -0
  738. modules/mute-agent/examples/advanced_example.py +384 -0
  739. modules/mute-agent/examples/graph_debugger_demo.py +241 -0
  740. modules/mute-agent/examples/listener_example.py +297 -0
  741. modules/mute-agent/examples/simple_example.py +242 -0
  742. modules/mute-agent/examples/steel_man_demo.py +297 -0
  743. modules/mute-agent/experiments/README.md +135 -0
  744. modules/mute-agent/experiments/__init__.py +3 -0
  745. modules/mute-agent/experiments/agent_comparison.csv +6 -0
  746. modules/mute-agent/experiments/agent_comparison_50runs.csv +6 -0
  747. modules/mute-agent/experiments/ambiguity_test.py +335 -0
  748. modules/mute-agent/experiments/ambiguity_test_results.csv +31 -0
  749. modules/mute-agent/experiments/ambiguity_test_results_50runs.csv +51 -0
  750. modules/mute-agent/experiments/baseline_agent.py +189 -0
  751. modules/mute-agent/experiments/benchmark.py +402 -0
  752. modules/mute-agent/experiments/demo.py +172 -0
  753. modules/mute-agent/experiments/generate_cost_curve.py +474 -0
  754. modules/mute-agent/experiments/jailbreak_test.py +137 -0
  755. modules/mute-agent/experiments/latent_state_scenario.py +361 -0
  756. modules/mute-agent/experiments/mute_agent_experiment.py +349 -0
  757. modules/mute-agent/experiments/run_extended_experiment.py +40 -0
  758. modules/mute-agent/experiments/run_v2_experiments.py +266 -0
  759. modules/mute-agent/experiments/run_v2_experiments_auto.py +247 -0
  760. modules/mute-agent/experiments/v2_scenarios/README.md +214 -0
  761. modules/mute-agent/experiments/v2_scenarios/__init__.py +4 -0
  762. modules/mute-agent/experiments/v2_scenarios/scenario_1_deep_dependency.py +325 -0
  763. modules/mute-agent/experiments/v2_scenarios/scenario_2_adversarial.py +328 -0
  764. modules/mute-agent/experiments/v2_scenarios/scenario_3_false_positive.py +303 -0
  765. modules/mute-agent/experiments/v2_scenarios/scenario_4_performance.py +319 -0
  766. modules/mute-agent/experiments/visualize.py +400 -0
  767. modules/mute-agent/mute_agent/__init__.py +66 -0
  768. modules/mute-agent/mute_agent/core/__init__.py +1 -0
  769. modules/mute-agent/mute_agent/core/execution_agent.py +164 -0
  770. modules/mute-agent/mute_agent/core/handshake_protocol.py +199 -0
  771. modules/mute-agent/mute_agent/core/reasoning_agent.py +236 -0
  772. modules/mute-agent/mute_agent/knowledge_graph/__init__.py +1 -0
  773. modules/mute-agent/mute_agent/knowledge_graph/graph_elements.py +63 -0
  774. modules/mute-agent/mute_agent/knowledge_graph/multidimensional_graph.py +168 -0
  775. modules/mute-agent/mute_agent/knowledge_graph/subgraph.py +222 -0
  776. modules/mute-agent/mute_agent/listener/__init__.py +41 -0
  777. modules/mute-agent/mute_agent/listener/adapters/__init__.py +29 -0
  778. modules/mute-agent/mute_agent/listener/adapters/base_adapter.py +187 -0
  779. modules/mute-agent/mute_agent/listener/adapters/caas_adapter.py +342 -0
  780. modules/mute-agent/mute_agent/listener/adapters/control_plane_adapter.py +434 -0
  781. modules/mute-agent/mute_agent/listener/adapters/iatp_adapter.py +330 -0
  782. modules/mute-agent/mute_agent/listener/adapters/scak_adapter.py +249 -0
  783. modules/mute-agent/mute_agent/listener/listener.py +608 -0
  784. modules/mute-agent/mute_agent/listener/state_observer.py +434 -0
  785. modules/mute-agent/mute_agent/listener/threshold_config.py +311 -0
  786. modules/mute-agent/mute_agent/super_system/__init__.py +1 -0
  787. modules/mute-agent/mute_agent/super_system/router.py +202 -0
  788. modules/mute-agent/mute_agent/visualization/__init__.py +8 -0
  789. modules/mute-agent/mute_agent/visualization/graph_debugger.py +495 -0
  790. modules/mute-agent/requirements-dev.txt +6 -0
  791. modules/mute-agent/requirements.txt +9 -0
  792. modules/mute-agent/setup.py +64 -0
  793. modules/mute-agent/src/__init__.py +0 -0
  794. modules/mute-agent/src/agents/__init__.py +0 -0
  795. modules/mute-agent/src/agents/baseline_agent.py +524 -0
  796. modules/mute-agent/src/agents/interactive_agent.py +113 -0
  797. modules/mute-agent/src/agents/mute_agent.py +622 -0
  798. modules/mute-agent/src/benchmarks/__init__.py +0 -0
  799. modules/mute-agent/src/benchmarks/evaluator.py +481 -0
  800. modules/mute-agent/src/benchmarks/scenarios.json +985 -0
  801. modules/mute-agent/src/core/__init__.py +0 -0
  802. modules/mute-agent/src/core/mock_state.py +320 -0
  803. modules/mute-agent/src/core/tools.py +441 -0
  804. modules/nexus/__init__.py +49 -0
  805. modules/nexus/arbiter.py +357 -0
  806. modules/nexus/client.py +464 -0
  807. modules/nexus/dmz.py +417 -0
  808. modules/nexus/escrow.py +428 -0
  809. modules/nexus/exceptions.py +284 -0
  810. modules/nexus/registry.py +391 -0
  811. modules/nexus/reputation.py +423 -0
  812. modules/nexus/schemas/__init__.py +49 -0
  813. modules/nexus/schemas/compliance.py +274 -0
  814. modules/nexus/schemas/escrow.py +249 -0
  815. modules/nexus/schemas/manifest.py +223 -0
  816. modules/nexus/schemas/receipt.py +206 -0
  817. modules/observability/README.md +192 -0
  818. modules/observability/alertmanager/alertmanager.yml +116 -0
  819. modules/observability/alerts/agent-os-alerts.yaml +197 -0
  820. modules/observability/docker-compose.yml +128 -0
  821. modules/observability/grafana/dashboards/agent-os-amb.json +448 -0
  822. modules/observability/grafana/dashboards/agent-os-cmvk.json +441 -0
  823. modules/observability/grafana/dashboards/agent-os-overview.json +268 -0
  824. modules/observability/grafana/dashboards/agent-os-performance.json +15 -0
  825. modules/observability/grafana/dashboards/agent-os-safety.json +50 -0
  826. modules/observability/grafana/provisioning/dashboards/dashboards.yml +15 -0
  827. modules/observability/grafana/provisioning/datasources/datasources.yml +33 -0
  828. modules/observability/otel/otel-collector-config.yml +61 -0
  829. modules/observability/prometheus/prometheus.yml +63 -0
  830. modules/observability/pyproject.toml +53 -0
  831. modules/observability/scripts/export_dashboards.py +55 -0
  832. modules/observability/src/agent_os_observability/__init__.py +25 -0
  833. modules/observability/src/agent_os_observability/dashboards.py +896 -0
  834. modules/observability/src/agent_os_observability/metrics.py +396 -0
  835. modules/observability/src/agent_os_observability/server.py +221 -0
  836. modules/observability/src/agent_os_observability/tracer.py +226 -0
  837. modules/primitives/.gitignore +8 -0
  838. modules/primitives/README.md +62 -0
  839. modules/primitives/agent_primitives/__init__.py +22 -0
  840. modules/primitives/agent_primitives/failures.py +82 -0
  841. modules/primitives/agent_primitives/py.typed +0 -0
  842. modules/primitives/pyproject.toml +68 -0
  843. modules/scak/.github/copilot-instructions.md +396 -0
  844. modules/scak/.github/workflows/release.yml +117 -0
  845. modules/scak/.gitignore +32 -0
  846. modules/scak/CHANGELOG.md +173 -0
  847. modules/scak/CITATION.cff +62 -0
  848. modules/scak/CONTRIBUTING.md +429 -0
  849. modules/scak/Dockerfile +58 -0
  850. modules/scak/ENTERPRISE_FEATURES.md +518 -0
  851. modules/scak/IMPLEMENTATION_SUMMARY.md +206 -0
  852. modules/scak/LIMITATIONS.md +565 -0
  853. modules/scak/MANIFEST.in +16 -0
  854. modules/scak/NOVELTY.md +535 -0
  855. modules/scak/README.md +928 -0
  856. modules/scak/RESEARCH.md +670 -0
  857. modules/scak/agent_kernel/__init__.py +66 -0
  858. modules/scak/agent_kernel/analyzer.py +432 -0
  859. modules/scak/agent_kernel/auditor.py +31 -0
  860. modules/scak/agent_kernel/completeness_auditor.py +234 -0
  861. modules/scak/agent_kernel/detector.py +200 -0
  862. modules/scak/agent_kernel/kernel.py +741 -0
  863. modules/scak/agent_kernel/memory_manager.py +82 -0
  864. modules/scak/agent_kernel/models.py +372 -0
  865. modules/scak/agent_kernel/nudge_mechanism.py +260 -0
  866. modules/scak/agent_kernel/outcome_analyzer.py +335 -0
  867. modules/scak/agent_kernel/patcher.py +579 -0
  868. modules/scak/agent_kernel/semantic_analyzer.py +313 -0
  869. modules/scak/agent_kernel/semantic_purge.py +346 -0
  870. modules/scak/agent_kernel/simulator.py +447 -0
  871. modules/scak/agent_kernel/teacher.py +82 -0
  872. modules/scak/agent_kernel/triage.py +149 -0
  873. modules/scak/build_and_publish.ps1 +74 -0
  874. modules/scak/build_and_publish.sh +74 -0
  875. modules/scak/cli.py +471 -0
  876. modules/scak/dashboard.py +462 -0
  877. modules/scak/datasets/DATASET_CARD.md +219 -0
  878. modules/scak/datasets/README.md +143 -0
  879. modules/scak/datasets/gaia_vague_queries/vague_queries.json +262 -0
  880. modules/scak/datasets/hf_upload/README.md +219 -0
  881. modules/scak/datasets/hf_upload/scak_gaia_laziness.jsonl +50 -0
  882. modules/scak/datasets/prepare_hf_datasets.py +145 -0
  883. modules/scak/datasets/red_team/jailbreak_patterns.json +202 -0
  884. modules/scak/docker-compose.yml +99 -0
  885. modules/scak/docs/Adaptive-Memory-Hierarchy.md +319 -0
  886. modules/scak/docs/Data-Contracts-and-Schemas.md +285 -0
  887. modules/scak/docs/Dual-Loop-Architecture.md +344 -0
  888. modules/scak/docs/Enhanced-Features.md +612 -0
  889. modules/scak/docs/LANGCHAIN_INTEGRATION.md +572 -0
  890. modules/scak/docs/README.md +128 -0
  891. modules/scak/docs/Reference-Implementations.md +163 -0
  892. modules/scak/docs/SCAK_V2.md +374 -0
  893. modules/scak/docs/Three-Failure-Types.md +178 -0
  894. modules/scak/examples/basic_example.py +155 -0
  895. modules/scak/examples/circuit_breaker_lazy_eval_demo.py +243 -0
  896. modules/scak/examples/langchain_integration_example.py +339 -0
  897. modules/scak/examples/layer4_demo.py +243 -0
  898. modules/scak/examples/production_features_demo.py +353 -0
  899. modules/scak/examples/quick_demo.py +79 -0
  900. modules/scak/examples/scak_v2_demo.py +252 -0
  901. modules/scak/experiments/README.md +438 -0
  902. modules/scak/experiments/ablation_studies/README.md +192 -0
  903. modules/scak/experiments/ablation_studies/ablation_no_audit.py +116 -0
  904. modules/scak/experiments/ablation_studies/ablation_no_purge.py +133 -0
  905. modules/scak/experiments/chaos_engineering/README.md +332 -0
  906. modules/scak/experiments/context_efficiency_test.py +328 -0
  907. modules/scak/experiments/gaia_benchmark/README.md +208 -0
  908. modules/scak/experiments/laziness_benchmark.py +179 -0
  909. modules/scak/experiments/long_horizon_task_experiment.py +252 -0
  910. modules/scak/experiments/multi_agent_rag_experiment.py +284 -0
  911. modules/scak/experiments/results/ablation_table.md +12 -0
  912. modules/scak/experiments/results/long_horizon.json +36 -0
  913. modules/scak/experiments/results/multi_agent_rag.json +66 -0
  914. modules/scak/experiments/run_comprehensive_ablations.py +332 -0
  915. modules/scak/experiments/test_auditor_patcher_integration.py +251 -0
  916. modules/scak/notebooks/getting_started.ipynb +33 -0
  917. modules/scak/paper/ARXIV_SUBMISSION_METADATA.txt +109 -0
  918. modules/scak/paper/PAPER_CHECKLIST.md +304 -0
  919. modules/scak/paper/Paper.pdf +0 -0
  920. modules/scak/paper/README.md +113 -0
  921. modules/scak/paper/appendix.md +351 -0
  922. modules/scak/paper/arxiv/bibliography.bib +284 -0
  923. modules/scak/paper/arxiv/fig1_ooda_architecture.pdf +0 -0
  924. modules/scak/paper/arxiv/fig2_memory_hierarchy.pdf +0 -0
  925. modules/scak/paper/arxiv/fig3_gaia_results.pdf +0 -0
  926. modules/scak/paper/arxiv/fig4_ablation_heatmap.pdf +0 -0
  927. modules/scak/paper/arxiv/fig5_context_reduction.pdf +0 -0
  928. modules/scak/paper/arxiv/fig6_mttr_boxplot.pdf +0 -0
  929. modules/scak/paper/arxiv/main.aux +103 -0
  930. modules/scak/paper/arxiv/main.bbl +113 -0
  931. modules/scak/paper/arxiv/main.blg +55 -0
  932. modules/scak/paper/arxiv/main.out +31 -0
  933. modules/scak/paper/arxiv/main.pdf +0 -0
  934. modules/scak/paper/arxiv/main.tex +482 -0
  935. modules/scak/paper/arxiv_submission/bibliography.bib +284 -0
  936. modules/scak/paper/arxiv_submission/fig1_ooda_architecture.pdf +0 -0
  937. modules/scak/paper/arxiv_submission/fig2_memory_hierarchy.pdf +0 -0
  938. modules/scak/paper/arxiv_submission/fig3_gaia_results.pdf +0 -0
  939. modules/scak/paper/arxiv_submission/fig4_ablation_heatmap.pdf +0 -0
  940. modules/scak/paper/arxiv_submission/fig5_context_reduction.pdf +0 -0
  941. modules/scak/paper/arxiv_submission/fig6_mttr_boxplot.pdf +0 -0
  942. modules/scak/paper/arxiv_submission/main.aux +103 -0
  943. modules/scak/paper/arxiv_submission/main.bbl +113 -0
  944. modules/scak/paper/arxiv_submission/main.blg +55 -0
  945. modules/scak/paper/arxiv_submission/main.out +31 -0
  946. modules/scak/paper/arxiv_submission/main.pdf +0 -0
  947. modules/scak/paper/arxiv_submission/main.tex +482 -0
  948. modules/scak/paper/arxiv_submission.tar.gz +0 -0
  949. modules/scak/paper/bibliography.bib +284 -0
  950. modules/scak/paper/build.sh +55 -0
  951. modules/scak/paper/figures/README.md +32 -0
  952. modules/scak/paper/figures/fig1_ooda_architecture.md +75 -0
  953. modules/scak/paper/figures/fig1_ooda_architecture.pdf +0 -0
  954. modules/scak/paper/figures/fig1_ooda_architecture.png +0 -0
  955. modules/scak/paper/figures/fig2_memory_hierarchy.md +83 -0
  956. modules/scak/paper/figures/fig2_memory_hierarchy.pdf +0 -0
  957. modules/scak/paper/figures/fig2_memory_hierarchy.png +0 -0
  958. modules/scak/paper/figures/fig3_gaia_results.md +64 -0
  959. modules/scak/paper/figures/fig3_gaia_results.pdf +0 -0
  960. modules/scak/paper/figures/fig3_gaia_results.png +0 -0
  961. modules/scak/paper/figures/fig4_ablation_heatmap.md +64 -0
  962. modules/scak/paper/figures/fig4_ablation_heatmap.pdf +0 -0
  963. modules/scak/paper/figures/fig4_ablation_heatmap.png +0 -0
  964. modules/scak/paper/figures/fig5_context_reduction.md +71 -0
  965. modules/scak/paper/figures/fig5_context_reduction.pdf +0 -0
  966. modules/scak/paper/figures/fig5_context_reduction.png +0 -0
  967. modules/scak/paper/figures/fig6_mttr_boxplot.md +80 -0
  968. modules/scak/paper/figures/fig6_mttr_boxplot.pdf +0 -0
  969. modules/scak/paper/figures/fig6_mttr_boxplot.png +0 -0
  970. modules/scak/paper/figures/generate_figures.py +463 -0
  971. modules/scak/paper/main.aux +103 -0
  972. modules/scak/paper/main.bbl +113 -0
  973. modules/scak/paper/main.blg +55 -0
  974. modules/scak/paper/main.md +192 -0
  975. modules/scak/paper/main.out +31 -0
  976. modules/scak/paper/main.pdf +0 -0
  977. modules/scak/paper/main.tex +482 -0
  978. modules/scak/reproducibility/ABLATIONS.md +225 -0
  979. modules/scak/reproducibility/Dockerfile.reproducibility +34 -0
  980. modules/scak/reproducibility/README.md +421 -0
  981. modules/scak/reproducibility/requirements-pinned.txt +32 -0
  982. modules/scak/reproducibility/run_all_experiments.py +395 -0
  983. modules/scak/reproducibility/seed_control.py +53 -0
  984. modules/scak/reproducibility/statistical_analysis.py +302 -0
  985. modules/scak/requirements.txt +50 -0
  986. modules/scak/setup.py +93 -0
  987. modules/scak/src/__init__.py +124 -0
  988. modules/scak/src/agents/__init__.py +13 -0
  989. modules/scak/src/agents/conflict_resolution.py +732 -0
  990. modules/scak/src/agents/orchestrator.py +761 -0
  991. modules/scak/src/agents/pubsub.py +484 -0
  992. modules/scak/src/agents/shadow_teacher.py +344 -0
  993. modules/scak/src/agents/swarm.py +661 -0
  994. modules/scak/src/agents/worker.py +357 -0
  995. modules/scak/src/integrations/__init__.py +81 -0
  996. modules/scak/src/integrations/cmvk_adapter.py +430 -0
  997. modules/scak/src/integrations/control_plane_adapter.py +601 -0
  998. modules/scak/src/integrations/langchain_integration.py +902 -0
  999. modules/scak/src/interfaces/__init__.py +59 -0
  1000. modules/scak/src/interfaces/llm_clients.py +505 -0
  1001. modules/scak/src/interfaces/openapi_tools.py +611 -0
  1002. modules/scak/src/interfaces/plugin_system.py +605 -0
  1003. modules/scak/src/interfaces/protocols.py +365 -0
  1004. modules/scak/src/interfaces/telemetry.py +464 -0
  1005. modules/scak/src/interfaces/tool_registry.py +547 -0
  1006. modules/scak/src/kernel/__init__.py +100 -0
  1007. modules/scak/src/kernel/auditor.py +305 -0
  1008. modules/scak/src/kernel/circuit_breaker.py +398 -0
  1009. modules/scak/src/kernel/core.py +724 -0
  1010. modules/scak/src/kernel/distributed.py +667 -0
  1011. modules/scak/src/kernel/evolution.py +455 -0
  1012. modules/scak/src/kernel/failover.py +621 -0
  1013. modules/scak/src/kernel/governance.py +710 -0
  1014. modules/scak/src/kernel/governance_v2.py +603 -0
  1015. modules/scak/src/kernel/lazy_evaluator.py +514 -0
  1016. modules/scak/src/kernel/load_testing.py +633 -0
  1017. modules/scak/src/kernel/memory.py +945 -0
  1018. modules/scak/src/kernel/patcher.py +581 -0
  1019. modules/scak/src/kernel/rubric.py +419 -0
  1020. modules/scak/src/kernel/schemas.py +390 -0
  1021. modules/scak/src/kernel/skill_mapper.py +309 -0
  1022. modules/scak/src/kernel/triage.py +149 -0
  1023. modules/scak/src/mocks/__init__.py +99 -0
  1024. modules/scak/tests/__init__.py +1 -0
  1025. modules/scak/tests/test_circuit_breaker.py +403 -0
  1026. modules/scak/tests/test_conflict_resolution.py +287 -0
  1027. modules/scak/tests/test_dual_loop.py +463 -0
  1028. modules/scak/tests/test_enhanced_features.py +421 -0
  1029. modules/scak/tests/test_failover_and_load.py +438 -0
  1030. modules/scak/tests/test_governance.py +185 -0
  1031. modules/scak/tests/test_kernel.py +359 -0
  1032. modules/scak/tests/test_langchain_integration.py +451 -0
  1033. modules/scak/tests/test_lazy_evaluator.py +465 -0
  1034. modules/scak/tests/test_llm_clients.py +122 -0
  1035. modules/scak/tests/test_memory_controller.py +528 -0
  1036. modules/scak/tests/test_orchestrator.py +181 -0
  1037. modules/scak/tests/test_phase3_integration.py +265 -0
  1038. modules/scak/tests/test_pubsub_swarm.py +203 -0
  1039. modules/scak/tests/test_reference_implementations.py +240 -0
  1040. modules/scak/tests/test_rubric.py +363 -0
  1041. modules/scak/tests/test_scak_v2.py +651 -0
  1042. modules/scak/tests/test_skill_mapper.py +217 -0
  1043. modules/scak/tests/test_specific_failures.py +393 -0
  1044. modules/scak/tests/test_tool_registry.py +264 -0
  1045. modules/scak/tests/test_tools_and_plugins.py +303 -0
  1046. modules/scak/tests/test_triage.py +596 -0
  1047. modules/scak/tests/test_write_through.py +319 -0
  1048. agent_os_kernel-1.1.0.dist-info/METADATA +0 -400
  1049. agent_os_kernel-1.1.0.dist-info/RECORD +0 -12
  1050. {agent_os_kernel-1.1.0.dist-info → agent_os_kernel-1.3.0.dist-info}/WHEEL +0 -0
  1051. {agent_os_kernel-1.1.0.dist-info → agent_os_kernel-1.3.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,3111 @@
1
+ """
2
+ Agent Lifecycle Management - v0.2.0
3
+
4
+ This module provides comprehensive lifecycle management for autonomous AI agents,
5
+ including health monitoring, auto-recovery, circuit breakers, scaling, distributed
6
+ coordination, dependency management, graceful shutdown, resource quotas, observability,
7
+ and hot reload capabilities.
8
+
9
+ Features:
10
+ - ACP-001: Agent Health Checks (liveness/readiness probes)
11
+ - ACP-002: Agent Auto-Recovery (automatic restart of crashed agents)
12
+ - ACP-003: Circuit Breaker (prevent cascading failures)
13
+ - ACP-004: Agent Scaling (horizontal scaling for high-throughput)
14
+ - ACP-005: Distributed Coordination (leader election, consensus)
15
+ - ACP-006: Agent Dependency Graph (enforced start order)
16
+ - ACP-007: Graceful Shutdown (preserve in-flight verifications)
17
+ - ACP-008: Resource Quotas (memory/CPU limits per agent)
18
+ - ACP-009: Agent Observability (metrics/logging integration)
19
+ - ACP-010: Hot Reload (code changes without full restart)
20
+
21
+ Research Foundations:
22
+ - Circuit Breaker pattern (Michael Nygard, "Release It!")
23
+ - Kubernetes probe patterns (liveness, readiness, startup)
24
+ - Raft consensus algorithm (Ongaro & Ousterhout, 2014)
25
+ - Actor model supervision (Erlang/OTP, Akka)
26
+ """
27
+
28
+ from typing import (
29
+ Dict, List, Optional, Any, Union, Callable, Type, Set, Awaitable,
30
+ TypeVar, Generic, Protocol, runtime_checkable
31
+ )
32
+ from dataclasses import dataclass, field
33
+ from enum import Enum, auto
34
+ from datetime import datetime, timedelta
35
+ from collections import defaultdict, deque
36
+ from abc import ABC, abstractmethod
37
+ import asyncio
38
+ import time
39
+ import uuid
40
+ import logging
41
+ import threading
42
+ import weakref
43
+ import traceback
44
+ import hashlib
45
+ import importlib
46
+ import sys
47
+
48
+
49
+ # Configure module logger
50
+ logger = logging.getLogger(__name__)
51
+
52
+
53
+ # ============================================================================
54
+ # Enums and Constants
55
+ # ============================================================================
56
+
57
class HealthStatus(Enum):
    """Health classification reported for a monitored agent."""

    # No verdict yet.
    UNKNOWN = "unknown"

    # Probe-derived verdicts.
    HEALTHY = "healthy"
    UNHEALTHY = "unhealthy"
    DEGRADED = "degraded"

    # Lifecycle-related values.
    STARTING = "starting"
    STOPPING = "stopping"
    STOPPED = "stopped"
    FAILED = "failed"
67
+
68
+
69
class AgentState(Enum):
    """Lifecycle state an agent can be in."""

    REGISTERED = "registered"
    PENDING = "pending"
    STARTING = "starting"
    RUNNING = "running"
    STOPPING = "stopping"
    STOPPED = "stopped"
    FAILED = "failed"
    RECOVERING = "recovering"
79
+
80
+
81
class CircuitState(Enum):
    """The three states a circuit breaker can be in."""

    # Normal operation
    CLOSED = "closed"
    # Failing, reject requests
    OPEN = "open"
    # Testing recovery
    HALF_OPEN = "half_open"
86
+
87
+
88
class CoordinationRole(Enum):
    """Role a participant holds in distributed coordination."""

    LEADER = "leader"
    FOLLOWER = "follower"
    CANDIDATE = "candidate"
93
+
94
+
95
class ShutdownPhase(Enum):
    """Phases of a graceful shutdown, in declaration order."""

    RUNNING = "running"
    DRAINING = "draining"
    STOPPING = "stopping"
    TERMINATED = "terminated"
101
+
102
+
103
+ # ============================================================================
104
+ # ACP-001: Agent Health Checks
105
+ # ============================================================================
106
+
107
@dataclass
class HealthCheckResult:
    """Outcome of one health probe."""

    # Whether the probe passed.
    healthy: bool
    # Status classification attached to this outcome.
    status: HealthStatus
    # Optional human-readable explanation (e.g. an error or timeout note).
    message: str = ""
    # Probe duration in milliseconds.
    latency_ms: float = 0.0
    # Creation time of the result; defaults to the local wall clock.
    timestamp: datetime = field(default_factory=datetime.now)
    # Free-form extra data; a fresh dict per instance.
    details: Dict[str, Any] = field(default_factory=dict)
116
+
117
+
118
@dataclass
class HealthCheckConfig:
    """Tunable intervals, timeouts and thresholds for health probes."""

    # --- Liveness probe ---
    liveness_interval_seconds: float = 10.0
    liveness_timeout_seconds: float = 5.0
    liveness_failure_threshold: int = 3

    # --- Readiness probe ---
    readiness_interval_seconds: float = 5.0
    readiness_timeout_seconds: float = 3.0
    readiness_failure_threshold: int = 1

    # --- Startup probe (for slow-starting agents) ---
    startup_probe_enabled: bool = True
    startup_timeout_seconds: float = 60.0
    startup_period_seconds: float = 5.0

    # --- Optional user-supplied async health check ---
    custom_health_check: Optional[Callable[[], Awaitable[bool]]] = None
138
+
139
+
140
@runtime_checkable
class HealthCheckable(Protocol):
    """Structural interface for agents that expose their own health probes.

    Because the protocol is runtime-checkable, ``isinstance(obj,
    HealthCheckable)`` is true for any object that has both methods.
    """

    async def liveness_check(self) -> bool:
        """Return True when the agent is alive (not deadlocked/crashed)."""
        ...

    async def readiness_check(self) -> bool:
        """Return True when the agent is ready to accept requests."""
        ...
151
+
152
+
153
class HealthMonitor:
    """
    Monitors agent health via liveness and readiness probes.

    Implements Kubernetes-style health checking patterns:
    - Liveness: Is the agent alive? If not, restart it.
    - Readiness: Is the agent ready to accept requests?

    Status transitions driven by this class:
    - A newly registered agent starts as ``UNKNOWN``.
    - A passing liveness probe promotes ``UNKNOWN`` or ``FAILED`` to
      ``HEALTHY`` (``"liveness_restored"`` fires only when leaving ``FAILED``).
    - ``liveness_failure_threshold`` consecutive liveness failures set
      ``FAILED`` and fire ``"liveness_failed"`` on every failing cycle from
      then on.
    - Readiness failures move ``HEALTHY`` to ``DEGRADED``; a passing readiness
      probe moves ``DEGRADED`` back to ``HEALTHY``.

    Usage:
        monitor = HealthMonitor(config=HealthCheckConfig())

        # Register an agent
        monitor.register_agent(agent_id, agent_instance)

        # Start monitoring
        await monitor.start()

        # Check status
        status = monitor.get_agent_health(agent_id)
    """

    def __init__(self, config: Optional[HealthCheckConfig] = None):
        """Create a monitor; ``config`` defaults to ``HealthCheckConfig()``."""
        self.config = config or HealthCheckConfig()
        self._agents: Dict[str, Any] = {}
        self._health_status: Dict[str, HealthStatus] = {}
        self._liveness_failures: Dict[str, int] = defaultdict(int)
        self._readiness_failures: Dict[str, int] = defaultdict(int)
        self._last_check: Dict[str, datetime] = {}
        # Per-agent ring buffer of the most recent 100 liveness results.
        self._check_history: Dict[str, deque] = defaultdict(lambda: deque(maxlen=100))
        self._running = False
        self._tasks: List[asyncio.Task] = []
        self._callbacks: Dict[str, List[Callable]] = defaultdict(list)
        self._lock = asyncio.Lock()

    def register_agent(
        self,
        agent_id: str,
        agent: Any,
        custom_liveness: Optional[Callable[[], Awaitable[bool]]] = None,
        custom_readiness: Optional[Callable[[], Awaitable[bool]]] = None
    ) -> None:
        """Register an agent for health monitoring.

        Args:
            agent_id: Key under which the agent is tracked.
            agent: The agent object to probe.
            custom_liveness: Optional async callable used instead of the
                agent's own liveness probe.
            custom_readiness: Optional async callable used instead of the
                agent's own readiness probe.
        """
        self._agents[agent_id] = {
            "agent": agent,
            "custom_liveness": custom_liveness,
            "custom_readiness": custom_readiness,
            "registered_at": datetime.now()
        }
        self._health_status[agent_id] = HealthStatus.UNKNOWN
        logger.info(f"Registered agent {agent_id} for health monitoring")

    def unregister_agent(self, agent_id: str) -> None:
        """Stop monitoring an agent and drop all state kept for it.

        Unknown ids are ignored. Failure counters, the last-check timestamp
        and the probe history are removed as well so that repeatedly
        registering and unregistering agents does not accumulate stale state.
        """
        if agent_id not in self._agents:
            return

        del self._agents[agent_id]
        self._health_status.pop(agent_id, None)
        self._liveness_failures.pop(agent_id, None)
        self._readiness_failures.pop(agent_id, None)
        self._last_check.pop(agent_id, None)
        self._check_history.pop(agent_id, None)
        logger.info(f"Unregistered agent {agent_id} from health monitoring")

    async def start(self) -> None:
        """Start the liveness and readiness loops (no-op if already running)."""
        if self._running:
            return

        self._running = True
        self._tasks.append(asyncio.create_task(self._liveness_loop()))
        self._tasks.append(asyncio.create_task(self._readiness_loop()))
        logger.info("Health monitor started")

    async def stop(self) -> None:
        """Cancel the monitoring loops and wait for them to finish."""
        self._running = False
        for task in self._tasks:
            task.cancel()
        # return_exceptions=True absorbs the expected CancelledError and also
        # any error a loop task died with, so the task list is always cleared.
        await asyncio.gather(*self._tasks, return_exceptions=True)
        self._tasks.clear()
        logger.info("Health monitor stopped")

    async def _liveness_loop(self) -> None:
        """Probe liveness of every registered agent once per interval."""
        while self._running:
            # Snapshot the ids: agents may be (un)registered while we await.
            for agent_id in list(self._agents.keys()):
                try:
                    result = await self._check_liveness(agent_id)
                    if agent_id not in self._agents:
                        # Unregistered while the probe was in flight; do not
                        # resurrect its counters or history.
                        continue
                    self._check_history[agent_id].append(result)

                    if not result.healthy:
                        self._liveness_failures[agent_id] += 1
                        if self._liveness_failures[agent_id] >= self.config.liveness_failure_threshold:
                            self._health_status[agent_id] = HealthStatus.FAILED
                            await self._trigger_callbacks("liveness_failed", agent_id)
                    else:
                        self._liveness_failures[agent_id] = 0
                        previous = self._health_status.get(agent_id, HealthStatus.UNKNOWN)
                        # UNKNOWN must be promoted too, otherwise an agent that
                        # never fails would stay UNKNOWN forever.
                        if previous in (HealthStatus.UNKNOWN, HealthStatus.FAILED):
                            self._health_status[agent_id] = HealthStatus.HEALTHY
                            if previous == HealthStatus.FAILED:
                                await self._trigger_callbacks("liveness_restored", agent_id)

                except Exception as e:
                    logger.error(f"Liveness check failed for {agent_id}: {e}")
                    if agent_id in self._agents:
                        self._liveness_failures[agent_id] += 1

            await asyncio.sleep(self.config.liveness_interval_seconds)

    async def _readiness_loop(self) -> None:
        """Probe readiness of every registered agent once per interval."""
        while self._running:
            for agent_id in list(self._agents.keys()):
                try:
                    result = await self._check_readiness(agent_id)
                    if agent_id not in self._agents:
                        # Unregistered while the probe was in flight.
                        continue
                    status = self._health_status.get(agent_id, HealthStatus.UNKNOWN)

                    if not result.healthy:
                        self._readiness_failures[agent_id] += 1
                        if self._readiness_failures[agent_id] >= self.config.readiness_failure_threshold:
                            if status == HealthStatus.HEALTHY:
                                self._health_status[agent_id] = HealthStatus.DEGRADED
                            # NOTE(review): fired on every failing cycle past
                            # the threshold, mirroring "liveness_failed" —
                            # confirm against the upstream indentation.
                            await self._trigger_callbacks("readiness_failed", agent_id)
                    else:
                        self._readiness_failures[agent_id] = 0
                        if status == HealthStatus.DEGRADED:
                            self._health_status[agent_id] = HealthStatus.HEALTHY
                            await self._trigger_callbacks("readiness_restored", agent_id)

                except Exception as e:
                    logger.error(f"Readiness check failed for {agent_id}: {e}")
                    if agent_id in self._agents:
                        self._readiness_failures[agent_id] += 1

            await asyncio.sleep(self.config.readiness_interval_seconds)

    async def _run_probe(
        self,
        agent: Any,
        custom_check: Optional[Callable[[], Awaitable[bool]]],
        protocol_method: str,
        fallback_method: str,
        timeout: float
    ) -> bool:
        """Run one probe, picking the first available source.

        Order: the custom callable, then the ``HealthCheckable`` protocol
        method, then a plain ``fallback_method`` attribute (sync or async).
        If none exists the agent is assumed healthy. Awaited probes are
        bounded by ``timeout``; a synchronous fallback is called directly.
        """
        if custom_check:
            return await asyncio.wait_for(custom_check(), timeout=timeout)
        if isinstance(agent, HealthCheckable):
            return await asyncio.wait_for(getattr(agent, protocol_method)(), timeout=timeout)
        if hasattr(agent, fallback_method):
            probe = getattr(agent, fallback_method)
            if asyncio.iscoroutinefunction(probe):
                return await asyncio.wait_for(probe(), timeout=timeout)
            return probe()
        # Default: assume healthy if the agent exists.
        return True

    async def _check_liveness(self, agent_id: str) -> HealthCheckResult:
        """Perform a liveness check and wrap the outcome in a result object.

        Never raises: timeouts yield an ``UNHEALTHY`` result and any other
        exception yields a ``FAILED`` result carrying the error text.
        """
        start_time = time.time()
        agent_info = self._agents.get(agent_id)

        if not agent_info:
            return HealthCheckResult(
                healthy=False,
                status=HealthStatus.UNKNOWN,
                message="Agent not found"
            )

        try:
            healthy = await self._run_probe(
                agent_info["agent"],
                agent_info.get("custom_liveness"),
                "liveness_check",
                "is_alive",
                self.config.liveness_timeout_seconds
            )

            latency_ms = (time.time() - start_time) * 1000
            self._last_check[agent_id] = datetime.now()

            return HealthCheckResult(
                healthy=healthy,
                status=HealthStatus.HEALTHY if healthy else HealthStatus.UNHEALTHY,
                latency_ms=latency_ms
            )

        except asyncio.TimeoutError:
            return HealthCheckResult(
                healthy=False,
                status=HealthStatus.UNHEALTHY,
                message="Liveness check timed out",
                latency_ms=self.config.liveness_timeout_seconds * 1000
            )
        except Exception as e:
            return HealthCheckResult(
                healthy=False,
                status=HealthStatus.FAILED,
                message=str(e),
                latency_ms=(time.time() - start_time) * 1000
            )

    async def _check_readiness(self, agent_id: str) -> HealthCheckResult:
        """Perform a readiness check and wrap the outcome in a result object.

        Never raises: timeouts and other exceptions both yield a ``DEGRADED``
        result.
        """
        start_time = time.time()
        agent_info = self._agents.get(agent_id)

        if not agent_info:
            return HealthCheckResult(
                healthy=False,
                status=HealthStatus.UNKNOWN,
                message="Agent not found"
            )

        try:
            ready = await self._run_probe(
                agent_info["agent"],
                agent_info.get("custom_readiness"),
                "readiness_check",
                "is_ready",
                self.config.readiness_timeout_seconds
            )

            latency_ms = (time.time() - start_time) * 1000

            return HealthCheckResult(
                healthy=ready,
                status=HealthStatus.HEALTHY if ready else HealthStatus.DEGRADED,
                latency_ms=latency_ms
            )

        except asyncio.TimeoutError:
            return HealthCheckResult(
                healthy=False,
                status=HealthStatus.DEGRADED,
                message="Readiness check timed out",
                latency_ms=self.config.readiness_timeout_seconds * 1000
            )
        except Exception as e:
            return HealthCheckResult(
                healthy=False,
                status=HealthStatus.DEGRADED,
                message=str(e),
                latency_ms=(time.time() - start_time) * 1000
            )

    def on_event(self, event: str, callback: Callable[[str], Awaitable[None]]) -> None:
        """Register an async callback for a health event.

        Events emitted by this class: ``"liveness_failed"``,
        ``"liveness_restored"``, ``"readiness_failed"`` and
        ``"readiness_restored"``. The callback receives the agent id.
        """
        self._callbacks[event].append(callback)

    async def _trigger_callbacks(self, event: str, agent_id: str) -> None:
        """Await every callback for ``event``; callback errors are logged."""
        for callback in self._callbacks.get(event, []):
            try:
                await callback(agent_id)
            except Exception as e:
                logger.error(f"Callback error for {event}: {e}")

    def get_agent_health(self, agent_id: str) -> HealthStatus:
        """Return the current status of an agent (``UNKNOWN`` if untracked)."""
        return self._health_status.get(agent_id, HealthStatus.UNKNOWN)

    def get_all_health_status(self) -> Dict[str, HealthStatus]:
        """Return a copy of the status map for all agents."""
        return dict(self._health_status)

    def get_health_history(self, agent_id: str) -> List[HealthCheckResult]:
        """Return the recorded liveness results for an agent, oldest first."""
        return list(self._check_history.get(agent_id, []))
434
+
435
+
436
+ # ============================================================================
437
+ # ACP-002: Agent Auto-Recovery
438
+ # ============================================================================
439
+
440
@dataclass
class RecoveryConfig:
    """Settings that control automatic agent recovery."""

    # Master switch for auto-recovery.
    enabled: bool = True
    # Restart budget.
    max_restarts: int = 5
    # Backoff: initial delay, upper bound, and growth factor.
    restart_delay_seconds: float = 1.0
    restart_delay_max_seconds: float = 60.0
    restart_delay_multiplier: float = 2.0
    # Quiet period after which the restart count is reset.
    reset_restart_count_after_seconds: float = 300.0
    # Policy once the budget is used up: "stop", "alert", "continue"
    on_max_restarts: str = "stop"
450
+
451
+
452
@dataclass
class RecoveryEvent:
    """One entry in the recovery history."""

    # Agent the event belongs to.
    agent_id: str
    # One of: "restart", "failure", "recovery_success", "max_restarts"
    event_type: str
    # When the event was created (local wall clock).
    timestamp: datetime = field(default_factory=datetime.now)
    # Restart counter value associated with the event.
    attempt: int = 0
    # Error text, if any.
    error: Optional[str] = None
    # Free-form extra data; a fresh dict per instance.
    details: Dict[str, Any] = field(default_factory=dict)
461
+
462
+
463
class AutoRecoveryManager:
    """
    Manages automatic recovery of failed agents.

    Implements exponential backoff for restart attempts and tracks
    recovery history for analysis.

    Features:
    - Automatic restart with exponential backoff
    - Maximum restart limit with configurable behavior
    - Recovery event logging
    - Callbacks for recovery events

    Usage:
        recovery = AutoRecoveryManager(config=RecoveryConfig())
        recovery.register_agent(agent_id, agent_factory)

        # When agent fails
        await recovery.handle_failure(agent_id, error)
    """

    def __init__(self, config: Optional[RecoveryConfig] = None):
        """Create a manager; ``config`` defaults to ``RecoveryConfig()``."""
        self.config = config or RecoveryConfig()
        self._agent_factories: Dict[str, Callable[[], Any]] = {}
        self._restart_counts: Dict[str, int] = defaultdict(int)
        self._last_restart: Dict[str, datetime] = {}
        self._current_delay: Dict[str, float] = {}
        # Bounded history: only the most recent 1000 events are kept.
        self._recovery_history: deque = deque(maxlen=1000)
        self._callbacks: Dict[str, List[Callable]] = defaultdict(list)
        self._agents: Dict[str, Any] = {}
        self._lock = asyncio.Lock()

    def register_agent(
        self,
        agent_id: str,
        factory: Callable[[], Any],
        initial_instance: Optional[Any] = None
    ) -> None:
        """Register an agent with its factory function for recovery.

        Args:
            agent_id: Key under which the agent is tracked.
            factory: Callable producing a new agent (or a coroutine that
                resolves to one) when a restart is needed.
            initial_instance: The currently running instance, if any.
        """
        self._agent_factories[agent_id] = factory
        # Identity check: an agent object that happens to be falsy (e.g. it
        # defines __len__ or __bool__) must still be stored.
        if initial_instance is not None:
            self._agents[agent_id] = initial_instance
        self._restart_counts[agent_id] = 0
        self._current_delay[agent_id] = self.config.restart_delay_seconds
        logger.info(f"Registered agent {agent_id} for auto-recovery")

    def unregister_agent(self, agent_id: str) -> None:
        """Unregister an agent and drop its recovery bookkeeping."""
        self._agent_factories.pop(agent_id, None)
        self._agents.pop(agent_id, None)
        self._restart_counts.pop(agent_id, None)
        self._last_restart.pop(agent_id, None)
        self._current_delay.pop(agent_id, None)

    async def handle_failure(
        self,
        agent_id: str,
        error: Optional[Exception] = None
    ) -> Optional[Any]:
        """
        Handle an agent failure and attempt recovery.

        Restart attempts are retried in a loop (not by re-entering this
        method) so the non-reentrant lock is acquired exactly once and the
        call depth stays constant no matter how many attempts are needed.
        The backoff delay grows after every attempt, successful or not.

        With ``on_max_restarts`` set to ``"alert"`` or ``"continue"`` the
        loop keeps retrying past the limit until a restart succeeds.

        Args:
            agent_id: The failed agent.
            error: The exception that caused the failure, if known.

        Returns:
            The new agent instance if recovery succeeds, None otherwise
            (recovery disabled, restart limit reached with the ``"stop"``
            policy, or no factory registered).
        """
        if not self.config.enabled:
            logger.info(f"Auto-recovery disabled, not recovering {agent_id}")
            return None

        # NOTE(review): the lock is held for the whole recovery, including the
        # backoff sleep, which serializes recoveries across agents. The
        # original lock scope could not be recovered from the source listing —
        # confirm this matches the intended scope.
        async with self._lock:
            last_error = error
            while True:
                self._reset_budget_if_stable(agent_id)

                # Check if max restarts reached
                if self._restart_counts[agent_id] >= self.config.max_restarts:
                    event = RecoveryEvent(
                        agent_id=agent_id,
                        event_type="max_restarts",
                        attempt=self._restart_counts[agent_id],
                        error=str(last_error) if last_error else None
                    )
                    self._recovery_history.append(event)
                    await self._trigger_callbacks("max_restarts", agent_id, event)

                    if self.config.on_max_restarts == "stop":
                        logger.error(f"Max restarts reached for {agent_id}, stopping")
                        return None
                    elif self.config.on_max_restarts == "alert":
                        logger.warning(f"Max restarts reached for {agent_id}, alerting")
                        await self._trigger_callbacks("alert", agent_id, event)
                    # "alert" and "continue" fall through to attempt a restart anyway

                delay = self._current_delay.get(agent_id, self.config.restart_delay_seconds)

                # Log failure event
                failure_event = RecoveryEvent(
                    agent_id=agent_id,
                    event_type="failure",
                    attempt=self._restart_counts[agent_id],
                    error=str(last_error) if last_error else None
                )
                self._recovery_history.append(failure_event)
                await self._trigger_callbacks("failure", agent_id, failure_event)

                logger.info(f"Attempting recovery for {agent_id} after {delay:.1f}s delay")
                await asyncio.sleep(delay)

                factory = self._agent_factories.get(agent_id)
                if not factory:
                    logger.error(f"No factory registered for {agent_id}")
                    return None

                try:
                    new_agent = await self._build_and_start(factory)
                except Exception as e:
                    logger.error(f"Failed to recover agent {agent_id}: {e}")
                    self._restart_counts[agent_id] += 1
                    # Back off before the next attempt as well.
                    self._current_delay[agent_id] = self._grow_delay(delay)
                    last_error = e
                    continue

                self._agents[agent_id] = new_agent
                self._restart_counts[agent_id] += 1
                self._last_restart[agent_id] = datetime.now()

                # Increase delay for next potential failure (exponential backoff)
                self._current_delay[agent_id] = self._grow_delay(delay)

                success_event = RecoveryEvent(
                    agent_id=agent_id,
                    event_type="recovery_success",
                    attempt=self._restart_counts[agent_id]
                )
                self._recovery_history.append(success_event)
                await self._trigger_callbacks("recovery_success", agent_id, success_event)

                logger.info(f"Successfully recovered agent {agent_id}")
                return new_agent

    def _reset_budget_if_stable(self, agent_id: str) -> None:
        """Reset count and delay if the last restart is old enough."""
        if agent_id not in self._last_restart:
            return
        time_since_last = (datetime.now() - self._last_restart[agent_id]).total_seconds()
        if time_since_last > self.config.reset_restart_count_after_seconds:
            self._restart_counts[agent_id] = 0
            self._current_delay[agent_id] = self.config.restart_delay_seconds

    def _grow_delay(self, delay: float) -> float:
        """Return the next backoff delay, capped at the configured maximum."""
        return min(
            delay * self.config.restart_delay_multiplier,
            self.config.restart_delay_max_seconds
        )

    @staticmethod
    async def _build_and_start(factory: Callable[[], Any]) -> Any:
        """Create an agent via ``factory`` and call its ``start`` if present."""
        new_agent = factory()
        if asyncio.iscoroutine(new_agent):
            new_agent = await new_agent

        # Start the agent if it has a start method (sync or async).
        if hasattr(new_agent, 'start'):
            if asyncio.iscoroutinefunction(new_agent.start):
                await new_agent.start()
            else:
                new_agent.start()
        return new_agent

    def on_event(
        self,
        event: str,
        callback: Callable[[str, RecoveryEvent], Awaitable[None]]
    ) -> None:
        """Register an async callback for a recovery event.

        Events emitted by this class: ``"failure"``, ``"recovery_success"``,
        ``"max_restarts"`` and ``"alert"``.
        """
        self._callbacks[event].append(callback)

    async def _trigger_callbacks(
        self,
        event: str,
        agent_id: str,
        recovery_event: RecoveryEvent
    ) -> None:
        """Await every callback for ``event``; callback errors are logged."""
        for callback in self._callbacks.get(event, []):
            try:
                await callback(agent_id, recovery_event)
            except Exception as e:
                logger.error(f"Callback error for {event}: {e}")

    def get_agent(self, agent_id: str) -> Optional[Any]:
        """Return the current agent instance, or None if there is none."""
        return self._agents.get(agent_id)

    def get_restart_count(self, agent_id: str) -> int:
        """Return the restart count for an agent (0 if untracked)."""
        return self._restart_counts.get(agent_id, 0)

    def get_recovery_history(
        self,
        agent_id: Optional[str] = None
    ) -> List[RecoveryEvent]:
        """Return recorded recovery events, optionally for one agent only."""
        if agent_id:
            return [e for e in self._recovery_history if e.agent_id == agent_id]
        return list(self._recovery_history)

    def reset_restart_count(self, agent_id: str) -> None:
        """Manually reset the restart count and backoff delay for an agent."""
        self._restart_counts[agent_id] = 0
        self._current_delay[agent_id] = self.config.restart_delay_seconds
660
+
661
+
662
+ # ============================================================================
663
+ # ACP-003: Circuit Breaker
664
+ # ============================================================================
665
+
666
@dataclass
class CircuitBreakerConfig:
    """Thresholds, timeout and exception filters for a circuit breaker."""

    # Failure/success counts used for state transitions.
    failure_threshold: int = 5
    success_threshold: int = 3
    # Seconds to stay open before recovery is tested.
    recovery_timeout_seconds: float = 60.0
    # Cap on trial calls while half-open.
    half_open_max_calls: int = 3
    # Exception types never counted as failures (fresh list per instance).
    exclude_exceptions: List[Type[Exception]] = field(default_factory=list)
    # When set, only these exception types count as failures.
    include_exceptions: Optional[List[Type[Exception]]] = None
675
+
676
+
677
@dataclass
class CircuitBreakerMetrics:
    """Point-in-time snapshot of a circuit breaker's counters."""

    # Current state and the counters tied to it.
    state: CircuitState
    failure_count: int
    success_count: int

    # Lifetime totals.
    total_calls: int
    total_failures: int
    total_successes: int

    # Timestamps (None until the corresponding event has happened).
    last_failure_time: Optional[datetime]
    last_success_time: Optional[datetime]
    state_changed_at: datetime
689
+
690
+
691
class CircuitBreaker:
    """
    Circuit breaker for preventing cascading failures.

    Implements the circuit breaker pattern to protect against cascading
    failures when an agent or service becomes unavailable.

    States:
    - CLOSED: Normal operation, requests pass through
    - OPEN: Failing, requests are rejected immediately
    - HALF_OPEN: Testing recovery, limited requests allowed

    Features:
    - Configurable failure/success thresholds
    - Automatic recovery timeout
    - Exception filtering
    - Metrics collection

    Usage:
        breaker = CircuitBreaker(
            config=CircuitBreakerConfig(
                failure_threshold=5,
                recovery_timeout_seconds=60
            )
        )

        # Use as decorator (the wrapped callable is always awaited,
        # even when the original function is synchronous)
        @breaker
        async def call_agent():
            ...

        # Or use context manager
        async with breaker:
            await call_agent()
    """

    def __init__(
        self,
        name: str = "default",
        config: Optional[CircuitBreakerConfig] = None,
        failure_threshold: Optional[int] = None,
        recovery_timeout: Optional[float] = None
    ):
        """Create a breaker in the CLOSED state.

        Args:
            name: Label used in log and error messages.
            config: Base configuration; defaults to ``CircuitBreakerConfig()``.
            failure_threshold: Convenience override for
                ``config.failure_threshold``.
            recovery_timeout: Convenience override for
                ``config.recovery_timeout_seconds``.
        """
        import copy

        self.name = name
        self.config = config or CircuitBreakerConfig()

        # Apply overrides to a private copy so a config object shared with
        # other breakers (or reused by the caller) is never modified.
        if failure_threshold is not None or recovery_timeout is not None:
            self.config = copy.copy(self.config)
            if failure_threshold is not None:
                self.config.failure_threshold = failure_threshold
            if recovery_timeout is not None:
                self.config.recovery_timeout_seconds = recovery_timeout

        self._state = CircuitState.CLOSED
        self._failure_count = 0
        self._success_count = 0
        self._half_open_calls = 0
        self._last_failure_time: Optional[datetime] = None
        self._last_success_time: Optional[datetime] = None
        self._state_changed_at = datetime.now()
        self._total_calls = 0
        self._total_failures = 0
        self._total_successes = 0
        self._lock = asyncio.Lock()
        self._callbacks: Dict[str, List[Callable]] = defaultdict(list)
        # Strong references to in-flight callback tasks; the event loop only
        # keeps weak references, so unreferenced tasks may be collected early.
        self._callback_tasks: Set[asyncio.Task] = set()

    @property
    def state(self) -> CircuitState:
        """Get the current circuit state"""
        return self._state

    @property
    def is_closed(self) -> bool:
        """Check if circuit is closed (normal operation)"""
        return self._state == CircuitState.CLOSED

    @property
    def is_open(self) -> bool:
        """Check if circuit is open (rejecting requests)"""
        return self._state == CircuitState.OPEN

    async def __aenter__(self):
        """Admit the call or raise ``CircuitBreakerOpenError``."""
        await self._before_call()
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Record the outcome of the guarded block; never suppresses errors."""
        if exc_type is None:
            await self._on_success()
        # Only ordinary exceptions are failures; BaseException subclasses such
        # as task cancellation are ignored, matching the decorator path.
        elif issubclass(exc_type, Exception) and self._should_count_exception(exc_type):
            await self._on_failure(exc_val)
        return False

    def __call__(self, func: Callable) -> Callable:
        """Decorate ``func`` so every call goes through the breaker.

        The returned wrapper is a coroutine function regardless of whether
        ``func`` is sync or async. Exceptions from ``func`` are re-raised
        after being recorded.
        """
        import functools

        is_async = asyncio.iscoroutinefunction(func)

        @functools.wraps(func)
        async def wrapper(*args, **kwargs):
            await self._before_call()
            try:
                if is_async:
                    result = await func(*args, **kwargs)
                else:
                    result = func(*args, **kwargs)
                await self._on_success()
                return result
            except Exception as e:
                if self._should_count_exception(type(e)):
                    await self._on_failure(e)
                raise
        return wrapper

    async def _before_call(self) -> None:
        """Check circuit state before a call.

        Raises:
            CircuitBreakerOpenError: If the circuit is open and the recovery
                timeout has not elapsed, or it is half-open and the trial
                call quota is used up.
        """
        async with self._lock:
            # Rejected calls are counted as well.
            self._total_calls += 1

            if self._state == CircuitState.OPEN:
                if not self._last_failure_time:
                    raise CircuitBreakerOpenError(f"Circuit {self.name} is open")

                elapsed = (datetime.now() - self._last_failure_time).total_seconds()
                if elapsed < self.config.recovery_timeout_seconds:
                    raise CircuitBreakerOpenError(
                        f"Circuit {self.name} is open, retry after "
                        f"{self.config.recovery_timeout_seconds - elapsed:.1f}s"
                    )

                self._transition_to(CircuitState.HALF_OPEN)
                # This call is itself the first trial call, so it counts
                # against half_open_max_calls.
                self._half_open_calls = 1

            elif self._state == CircuitState.HALF_OPEN:
                if self._half_open_calls >= self.config.half_open_max_calls:
                    raise CircuitBreakerOpenError(
                        f"Circuit {self.name} is half-open, max test calls reached"
                    )
                self._half_open_calls += 1

    async def _on_success(self) -> None:
        """Record a successful call and close the circuit when warranted."""
        async with self._lock:
            self._total_successes += 1
            self._last_success_time = datetime.now()

            if self._state == CircuitState.HALF_OPEN:
                self._success_count += 1
                if self._success_count >= self.config.success_threshold:
                    self._transition_to(CircuitState.CLOSED)
            elif self._state == CircuitState.CLOSED:
                # A success while closed clears the failure streak.
                self._failure_count = 0

    async def _on_failure(self, error: Exception) -> None:
        """Record a failed call and open the circuit when warranted."""
        async with self._lock:
            self._total_failures += 1
            self._last_failure_time = datetime.now()
            self._failure_count += 1

            if self._state == CircuitState.HALF_OPEN:
                # Any failure in half-open state opens the circuit
                self._transition_to(CircuitState.OPEN)
            elif self._state == CircuitState.CLOSED:
                if self._failure_count >= self.config.failure_threshold:
                    self._transition_to(CircuitState.OPEN)

    def _transition_to(self, new_state: CircuitState) -> None:
        """Switch state, reset the relevant counters and notify listeners.

        Must be called from code running inside an event loop because the
        listeners are scheduled as a task.
        """
        old_state = self._state
        self._state = new_state
        self._state_changed_at = datetime.now()

        if new_state == CircuitState.CLOSED:
            self._failure_count = 0
            self._success_count = 0
        elif new_state == CircuitState.HALF_OPEN:
            self._success_count = 0
            self._half_open_calls = 0

        logger.info(f"Circuit {self.name} transitioned from {old_state.value} to {new_state.value}")

        # Notify listeners without blocking the caller, which holds the lock.
        if self._callbacks.get("state_change"):
            task = asyncio.create_task(self._trigger_state_change(old_state, new_state))
            self._callback_tasks.add(task)
            task.add_done_callback(self._callback_tasks.discard)

    async def _trigger_state_change(
        self,
        old_state: CircuitState,
        new_state: CircuitState
    ) -> None:
        """Await every state-change callback; callback errors are logged."""
        for callback in self._callbacks.get("state_change", []):
            try:
                await callback(self.name, old_state, new_state)
            except Exception as e:
                logger.error(f"Circuit breaker callback error: {e}")

    def _should_count_exception(self, exc_type: Type[Exception]) -> bool:
        """Determine if an exception type should be counted as a failure.

        The exclude list wins over the include list. When an include list is
        configured, only its types (and subclasses) count.
        """
        if any(issubclass(exc_type, excluded) for excluded in self.config.exclude_exceptions):
            return False

        if self.config.include_exceptions is not None:
            return any(
                issubclass(exc_type, included)
                for included in self.config.include_exceptions
            )

        return True

    def on_state_change(
        self,
        callback: Callable[[str, CircuitState, CircuitState], Awaitable[None]]
    ) -> None:
        """Register an async callback ``(name, old_state, new_state)``."""
        self._callbacks["state_change"].append(callback)

    def get_metrics(self) -> CircuitBreakerMetrics:
        """Return a snapshot of the breaker's current counters."""
        return CircuitBreakerMetrics(
            state=self._state,
            failure_count=self._failure_count,
            success_count=self._success_count,
            total_calls=self._total_calls,
            total_failures=self._total_failures,
            total_successes=self._total_successes,
            last_failure_time=self._last_failure_time,
            last_success_time=self._last_success_time,
            state_changed_at=self._state_changed_at
        )

    def reset(self) -> None:
        """Manually reset the circuit breaker to closed state.

        Lifetime totals and last failure/success timestamps are kept, and
        state-change callbacks are not invoked.
        """
        self._state = CircuitState.CLOSED
        self._failure_count = 0
        self._success_count = 0
        self._half_open_calls = 0
        self._state_changed_at = datetime.now()
        logger.info(f"Circuit {self.name} manually reset to CLOSED")
931
+
932
+
933
class CircuitBreakerOpenError(Exception):
    """Error raised when a circuit breaker is open."""
936
+
937
+
938
class CircuitBreakerRegistry:
    """Name-indexed collection of circuit breakers."""

    def __init__(self):
        # Breakers keyed by their name.
        self._breakers: Dict[str, CircuitBreaker] = {}

    def get_or_create(
        self,
        name: str,
        config: Optional[CircuitBreakerConfig] = None
    ) -> CircuitBreaker:
        """Return the breaker registered under ``name``, creating it if absent.

        ``config`` is only used when a new breaker is created; an already
        registered breaker is returned as it is.
        """
        if name in self._breakers:
            return self._breakers[name]

        self._breakers[name] = CircuitBreaker(name=name, config=config)
        return self._breakers[name]

    def get(self, name: str) -> Optional[CircuitBreaker]:
        """Return the breaker registered under ``name``, or ``None``."""
        return self._breakers.get(name)

    def get_all_metrics(self) -> Dict[str, CircuitBreakerMetrics]:
        """Collect ``get_metrics()`` from every breaker, keyed by name."""
        collected = {}
        for breaker_name, breaker in self._breakers.items():
            collected[breaker_name] = breaker.get_metrics()
        return collected
961
+
962
+
963
+ # ============================================================================
964
+ # ACP-004: Agent Scaling
965
+ # ============================================================================
966
+
967
@dataclass
class ScalingConfig:
    """Configuration for agent scaling.

    Raises:
        ValueError: If ``min_replicas`` is negative, ``max_replicas`` is
            smaller than ``min_replicas``, or a scale increment is below 1.
            Such settings would otherwise be accepted silently and only
            misbehave later inside the scaler.
    """
    # Bounds on the number of replicas kept per agent type.
    min_replicas: int = 1
    max_replicas: int = 10
    # Utilization targets (fractions).
    # NOTE(review): not read by the scaler code visible here - confirm consumers.
    target_cpu_utilization: float = 0.7
    target_memory_utilization: float = 0.8
    # Average "cpu" metric above/below which the autoscaler adds/removes replicas.
    scale_up_threshold: float = 0.8
    scale_down_threshold: float = 0.3
    # Minimum time between two automatic scale-ups / scale-downs.
    scale_up_cooldown_seconds: float = 60.0
    scale_down_cooldown_seconds: float = 300.0
    # Number of replicas added / removed per automatic scaling step.
    scale_up_increment: int = 1
    scale_down_increment: int = 1

    def __post_init__(self) -> None:
        """Reject contradictory replica bounds and non-positive increments."""
        if self.min_replicas < 0:
            raise ValueError("min_replicas must be >= 0")
        if self.max_replicas < self.min_replicas:
            raise ValueError("max_replicas must be >= min_replicas")
        if self.scale_up_increment < 1 or self.scale_down_increment < 1:
            raise ValueError("scale increments must be >= 1")
980
+
981
+
982
@dataclass
class AgentReplica:
    """One running copy of an agent, as tracked by the scaler."""
    # Unique identifier of this replica.
    replica_id: str
    # Identifier of the agent this replica belongs to.
    agent_id: str
    # The agent object itself.
    instance: Any
    # Creation timestamp (local time, taken when the record is built).
    created_at: datetime = field(default_factory=datetime.now)
    # Lifecycle state; starts out as PENDING.
    status: AgentState = AgentState.PENDING
    # Latest reported metrics, keyed by metric name.
    metrics: Dict[str, float] = field(default_factory=dict)
991
+
992
+
993
class AgentScaler:
    """
    Horizontal scaling manager for agents.

    Keeps a pool of replicas per registered agent type, hands them out
    round-robin, and runs a background loop that grows or shrinks each pool
    based on the average ``"cpu"`` metric reported for its replicas.

    Features:
    - Automatic scale-up/scale-down based on utilization
    - Configurable min/max replicas
    - Load balancing across replicas
    - Cooldown periods to prevent thrashing

    Usage:
        scaler = AgentScaler()

        # Register agent type with factory
        scaler.register_agent_type(
            agent_type="claims_agent",
            factory=create_claims_agent,
            config=ScalingConfig(min_replicas=2, max_replicas=10)
        )

        # Get available replica
        agent = await scaler.get_replica("claims_agent")

        # Manual scaling
        await scaler.scale_to("claims_agent", replicas=5)
    """

    def __init__(self, check_interval_seconds: float = 10.0):
        """Create an idle scaler.

        Args:
            check_interval_seconds: Pause between two autoscaling passes.
        """
        self._agent_types: Dict[str, Dict[str, Any]] = {}
        self._replicas: Dict[str, Dict[str, AgentReplica]] = defaultdict(dict)
        self._last_scale_up: Dict[str, datetime] = {}
        self._last_scale_down: Dict[str, datetime] = {}
        self._load_balancer_index: Dict[str, int] = defaultdict(int)
        self._lock = asyncio.Lock()
        self._running = False
        self._scaling_task: Optional[asyncio.Task] = None
        self._check_interval_seconds = check_interval_seconds

    def register_agent_type(
        self,
        agent_type: str,
        factory: Callable[[], Any],
        config: Optional[ScalingConfig] = None,
        replicas: int = 1
    ) -> None:
        """Register an agent type for scaling.

        Args:
            agent_type: Name under which the type is managed.
            factory: Callable producing a new agent instance (or a coroutine
                resolving to one).
            config: Scaling limits; defaults to ``ScalingConfig()``.
            replicas: Desired initial replica count; raised to
                ``config.min_replicas`` when lower.
        """
        config = config or ScalingConfig()
        self._agent_types[agent_type] = {
            "factory": factory,
            "config": config,
            "target_replicas": max(config.min_replicas, replicas),
        }
        logger.info(f"Registered agent type {agent_type} for scaling")

    async def start(self) -> None:
        """Bring every registered type to its target size and start autoscaling."""
        if self._running:
            return

        self._running = True

        # Snapshot: scale_to awaits, and types may be registered meanwhile.
        for agent_type, info in list(self._agent_types.items()):
            await self.scale_to(agent_type, info["target_replicas"])

        self._scaling_task = asyncio.create_task(self._autoscaling_loop())
        logger.info("Agent scaler started")

    async def stop(self) -> None:
        """Stop autoscaling and shut down every replica.

        The configured ``target_replicas`` of each type is preserved, so a
        later ``start()`` recreates the same number of replicas.
        """
        self._running = False
        if self._scaling_task:
            self._scaling_task.cancel()
            try:
                await self._scaling_task
            except asyncio.CancelledError:
                pass

        # Remove replicas directly instead of scale_to(..., 0), which would
        # overwrite the target and make a restart come up empty.
        async with self._lock:
            for agent_type in list(self._replicas.keys()):
                for replica_id in list(self._replicas[agent_type]):
                    await self._remove_replica(agent_type, replica_id)

        logger.info("Agent scaler stopped")

    async def scale_to(self, agent_type: str, replicas: int) -> None:
        """Scale an agent type to a specific number of replicas.

        The request is clamped to ``[0, config.max_replicas]`` and recorded
        as the type's new target.

        Raises:
            ValueError: If ``agent_type`` was never registered.
        """
        if agent_type not in self._agent_types:
            raise ValueError(f"Unknown agent type: {agent_type}")

        async with self._lock:
            info = self._agent_types[agent_type]
            replicas = max(0, min(replicas, info["config"].max_replicas))

            pool = self._replicas[agent_type]
            current_count = len(pool)

            if replicas > current_count:
                for _ in range(replicas - current_count):
                    await self._create_replica(agent_type)
            elif replicas < current_count:
                # The earliest-created replicas are removed first.
                for replica_id in list(pool)[:current_count - replicas]:
                    await self._remove_replica(agent_type, replica_id)

            info["target_replicas"] = replicas
            logger.info(f"Scaled {agent_type} to {replicas} replicas")

    async def scale_up(self, agent_type: str, count: int = 1) -> None:
        """Scale up an agent type by adding ``count`` replicas."""
        current = len(self._replicas.get(agent_type, {}))
        await self.scale_to(agent_type, current + count)

    async def scale_down(self, agent_type: str, count: int = 1) -> None:
        """Scale down an agent type by removing ``count`` replicas (never below 0)."""
        current = len(self._replicas.get(agent_type, {}))
        await self.scale_to(agent_type, max(0, current - count))

    @staticmethod
    async def _call_lifecycle(instance: Any, method_name: str) -> None:
        """Call ``instance.<method_name>()`` if it exists, awaiting an awaitable result."""
        import inspect  # local import: only needed for lifecycle calls

        method = getattr(instance, method_name, None)
        if method is None:
            return
        outcome = method()
        if inspect.isawaitable(outcome):
            await outcome

    async def _create_replica(self, agent_type: str) -> AgentReplica:
        """Build, start and register a new replica for ``agent_type``."""
        import inspect  # local import: only needed when building replicas

        factory = self._agent_types[agent_type]["factory"]
        replica_id = f"{agent_type}-{uuid.uuid4().hex[:8]}"

        instance = factory()
        if inspect.isawaitable(instance):
            instance = await instance

        # Start the agent if it has a start method
        await self._call_lifecycle(instance, "start")

        replica = AgentReplica(
            replica_id=replica_id,
            agent_id=agent_type,
            instance=instance,
            status=AgentState.RUNNING
        )

        self._replicas[agent_type][replica_id] = replica
        logger.info(f"Created replica {replica_id} for {agent_type}")
        return replica

    async def _remove_replica(self, agent_type: str, replica_id: str) -> None:
        """Unregister a replica and stop its agent if it has a stop method."""
        replica = self._replicas[agent_type].pop(replica_id, None)
        # Compare against None: an agent object may legitimately be falsy.
        if replica is not None and replica.instance is not None:
            await self._call_lifecycle(replica.instance, "stop")
        logger.info(f"Removed replica {replica_id} from {agent_type}")

    async def get_replica(self, agent_type: str) -> Optional[Any]:
        """Get an available replica using round-robin load balancing.

        Returns:
            The agent instance of the next RUNNING replica, or ``None`` when
            the type has no running replica.
        """
        replicas = self._replicas.get(agent_type, {})
        if not replicas:
            return None

        running_replicas = [
            r for r in replicas.values() if r.status == AgentState.RUNNING
        ]
        if not running_replicas:
            return None

        index = self._load_balancer_index[agent_type] % len(running_replicas)
        self._load_balancer_index[agent_type] += 1

        return running_replicas[index].instance

    async def _autoscale_once(self, agent_type: str, config: Any) -> None:
        """Run one autoscaling decision for a single agent type.

        Uses the mean of each replica's ``"cpu"`` metric (missing values
        count as 0). Each step is clamped so it never leaves the
        ``[min_replicas, max_replicas]`` range.
        """
        replicas = self._replicas.get(agent_type, {})
        if not replicas:
            return

        count = len(replicas)
        avg_cpu = sum(r.metrics.get("cpu", 0) for r in replicas.values()) / count
        now = datetime.now()

        if avg_cpu > config.scale_up_threshold:
            last_scale = self._last_scale_up.get(agent_type, datetime.min)
            cooled = (now - last_scale).total_seconds() > config.scale_up_cooldown_seconds
            if cooled and count < config.max_replicas:
                target = min(config.max_replicas, count + config.scale_up_increment)
                await self.scale_to(agent_type, target)
                self._last_scale_up[agent_type] = now
        elif avg_cpu < config.scale_down_threshold:
            last_scale = self._last_scale_down.get(agent_type, datetime.min)
            cooled = (now - last_scale).total_seconds() > config.scale_down_cooldown_seconds
            if cooled and count > config.min_replicas:
                # Clamp: an increment larger than the surplus must not
                # take the pool below min_replicas.
                target = max(config.min_replicas, count - config.scale_down_increment)
                await self.scale_to(agent_type, target)
                self._last_scale_down[agent_type] = now

    async def _autoscaling_loop(self) -> None:
        """Background loop for automatic scaling."""
        while self._running:
            try:
                # Snapshot: the dict may change while a scaling step awaits.
                for agent_type, info in list(self._agent_types.items()):
                    await self._autoscale_once(agent_type, info["config"])
            except Exception as e:
                logger.error(f"Autoscaling loop error: {e}")

            await asyncio.sleep(self._check_interval_seconds)

    def update_replica_metrics(
        self,
        agent_type: str,
        replica_id: str,
        metrics: Dict[str, float]
    ) -> None:
        """Merge ``metrics`` into a replica's metrics; unknown replicas are ignored."""
        if agent_type in self._replicas and replica_id in self._replicas[agent_type]:
            self._replicas[agent_type][replica_id].metrics.update(metrics)

    def get_replica_count(self, agent_type: str) -> int:
        """Get the current replica count for an agent type."""
        return len(self._replicas.get(agent_type, {}))

    def get_all_replicas(self, agent_type: str) -> List[AgentReplica]:
        """Get all replicas for an agent type."""
        return list(self._replicas.get(agent_type, {}).values())
1227
+
1228
+
1229
+ # ============================================================================
1230
+ # ACP-005: Distributed Coordination
1231
+ # ============================================================================
1232
+
1233
@dataclass
class LeaderElectionConfig:
    """Timing parameters for leader election."""
    # Pause between two heartbeat rounds sent by the leader.
    heartbeat_interval_seconds: float = 1.0
    # Bounds of the randomized election timeout.
    election_timeout_min_seconds: float = 3.0
    election_timeout_max_seconds: float = 5.0
    # How far into the future the leader lease is extended on each renewal.
    lease_duration_seconds: float = 15.0
1240
+
1241
+
1242
@dataclass
class LeaderInfo:
    """Snapshot describing the current leader."""
    # Node id of the leader.
    leader_id: str
    # Election timestamp reported for the leader.
    elected_at: datetime
    # Moment at which the leader's lease runs out.
    lease_expires_at: datetime
    # Election term in which the leader holds office.
    term: int
1249
+
1250
+
1251
class DistributedCoordinator:
    """
    Distributed coordination for stateful operations.

    Implements leader election and basic consensus for coordinating
    multiple agent instances. No RPC transport is implemented in this
    class: votes and heartbeats are not sent to peers, so a node with
    peers can only learn about a leader through ``receive_heartbeat``.
    Locks are in-process ``asyncio.Lock`` objects keyed by resource id.

    Features:
    - Leader election using Raft-like protocol
    - Distributed locks
    - Heartbeat-based failure detection
    - Automatic leader failover

    Usage:
        coordinator = DistributedCoordinator(node_id="node-1")

        # Start coordination
        await coordinator.start()

        # Check if leader
        if coordinator.is_leader:
            # Perform leader-only operations
            ...

        # Acquire distributed lock
        async with coordinator.lock("resource-1"):
            # Critical section
            ...
    """

    def __init__(
        self,
        node_id: str,
        config: Optional[LeaderElectionConfig] = None,
        peers: Optional[List[str]] = None
    ):
        """Create a coordinator that starts out as a follower.

        Args:
            node_id: Identifier of this node.
            config: Election timing; defaults to ``LeaderElectionConfig()``.
            peers: Identifiers of the other nodes; empty means single-node.
        """
        self.node_id = node_id
        self.config = config or LeaderElectionConfig()
        self.peers = peers or []

        self._role = CoordinationRole.FOLLOWER
        self._current_term = 0
        self._voted_for: Optional[str] = None
        self._leader_id: Optional[str] = None
        self._leader_lease_expires: Optional[datetime] = None
        # When the current leader was elected (or first observed, for a
        # remote leader announced through a heartbeat).
        self._leader_elected_at: Optional[datetime] = None

        self._last_heartbeat = datetime.now()
        self._election_timeout = self._random_election_timeout()

        self._locks: Dict[str, asyncio.Lock] = {}
        self._lock_holders: Dict[str, str] = {}

        self._running = False
        self._tasks: List[asyncio.Task] = []
        self._callbacks: Dict[str, List[Callable]] = defaultdict(list)
        self._lock = asyncio.Lock()

    def _random_election_timeout(self) -> float:
        """Draw an election timeout uniformly between the configured bounds."""
        import random
        return random.uniform(
            self.config.election_timeout_min_seconds,
            self.config.election_timeout_max_seconds
        )

    @property
    def is_leader(self) -> bool:
        """Check if this node is the leader"""
        return self._role == CoordinationRole.LEADER

    @property
    def role(self) -> CoordinationRole:
        """Get current role"""
        return self._role

    @property
    def leader_id(self) -> Optional[str]:
        """Get the current leader ID"""
        return self._leader_id

    async def start(self) -> None:
        """Start the election loop; a node without peers becomes leader at once."""
        if self._running:
            return

        self._running = True
        self._tasks.append(asyncio.create_task(self._election_loop()))

        # If no peers, become leader immediately
        if not self.peers:
            await self._become_leader()

        logger.info(f"Distributed coordinator started for node {self.node_id}")

    async def stop(self) -> None:
        """Cancel the background tasks and wait for them to finish."""
        self._running = False
        for task in self._tasks:
            task.cancel()
            try:
                await task
            except asyncio.CancelledError:
                pass
        self._tasks.clear()
        logger.info(f"Distributed coordinator stopped for node {self.node_id}")

    async def _election_loop(self) -> None:
        """Main election and heartbeat loop"""
        while self._running:
            try:
                if self._role == CoordinationRole.LEADER:
                    # Send heartbeats as leader
                    await self._send_heartbeats()
                    await asyncio.sleep(self.config.heartbeat_interval_seconds)
                else:
                    # Check for election timeout
                    elapsed = (datetime.now() - self._last_heartbeat).total_seconds()
                    if elapsed > self._election_timeout:
                        await self._start_election()
                    await asyncio.sleep(0.1)

            except Exception as e:
                logger.error(f"Election loop error: {e}")
                await asyncio.sleep(1)

    async def _start_election(self) -> None:
        """Become a candidate for a new term and take office on a majority.

        Only this node's own vote is counted because no RequestVote
        transport exists here; a node with peers therefore stays a candidate
        and retries after the next election timeout.
        """
        async with self._lock:
            self._role = CoordinationRole.CANDIDATE
            self._current_term += 1
            self._voted_for = self.node_id
            self._election_timeout = self._random_election_timeout()
            # Restart the election timer. Without this the loop still sees
            # the old timeout as expired and opens a new term every 0.1 s.
            self._last_heartbeat = datetime.now()

            logger.info(f"Node {self.node_id} starting election for term {self._current_term}")

            votes_received = 1  # Vote for self
            votes_needed = (len(self.peers) + 1) // 2 + 1

            # In real implementation: send RequestVote RPCs to peers.
            # A single-node cluster needs one vote and wins immediately.
            if votes_received >= votes_needed:
                await self._become_leader()

    async def _become_leader(self) -> None:
        """Transition to leader role and fire ``leader_elected`` callbacks."""
        now = datetime.now()
        self._role = CoordinationRole.LEADER
        self._leader_id = self.node_id
        self._leader_elected_at = now
        self._leader_lease_expires = now + timedelta(
            seconds=self.config.lease_duration_seconds
        )

        logger.info(f"Node {self.node_id} became leader for term {self._current_term}")
        await self._trigger_callbacks("leader_elected", self.node_id)

    async def _send_heartbeats(self) -> None:
        """Renew the leader lease (no heartbeat is actually sent to peers)."""
        self._leader_lease_expires = datetime.now() + timedelta(
            seconds=self.config.lease_duration_seconds
        )
        # In real implementation: send AppendEntries RPCs to peers

    def receive_heartbeat(self, leader_id: str, term: int) -> None:
        """Accept a leader heartbeat; heartbeats from an older term are ignored."""
        if term < self._current_term:
            return

        now = datetime.now()
        term_advanced = term > self._current_term
        if term_advanced:
            # A vote is only valid for the term it was cast in.
            self._voted_for = None
        if term_advanced or leader_id != self._leader_id:
            self._leader_elected_at = now

        self._current_term = term
        self._role = CoordinationRole.FOLLOWER
        self._leader_id = leader_id
        self._last_heartbeat = now

    async def acquire_lock(self, resource_id: str, timeout: float = 30.0) -> bool:
        """Acquire the lock for ``resource_id``.

        Returns:
            ``True`` once the lock is held, ``False`` if it could not be
            acquired within ``timeout`` seconds.
        """
        if resource_id not in self._locks:
            self._locks[resource_id] = asyncio.Lock()

        try:
            acquired = await asyncio.wait_for(
                self._locks[resource_id].acquire(),
                timeout=timeout
            )
        except asyncio.TimeoutError:
            return False

        if acquired:
            self._lock_holders[resource_id] = self.node_id
            logger.debug(f"Node {self.node_id} acquired lock on {resource_id}")
        return acquired

    def release_lock(self, resource_id: str) -> None:
        """Release the lock for ``resource_id``; a no-op if it is not held."""
        if resource_id in self._locks and self._locks[resource_id].locked():
            self._locks[resource_id].release()
            self._lock_holders.pop(resource_id, None)
            logger.debug(f"Node {self.node_id} released lock on {resource_id}")

    def lock(self, resource_id: str, timeout: float = 30.0):
        """Context manager for distributed lock"""
        return DistributedLockContext(self, resource_id, timeout)

    def on_event(
        self,
        event: str,
        callback: Callable[[str], Awaitable[None]]
    ) -> None:
        """Register a callback for coordination events"""
        self._callbacks[event].append(callback)

    async def _trigger_callbacks(self, event: str, *args) -> None:
        """Run the callbacks of ``event``; failures are logged, not raised.

        A callback's result is awaited only when it is awaitable, so plain
        functions are accepted as well as coroutine functions.
        """
        import inspect  # local import: only needed when dispatching events

        for callback in self._callbacks.get(event, []):
            try:
                outcome = callback(*args)
                if inspect.isawaitable(outcome):
                    await outcome
            except Exception as e:
                logger.error(f"Coordination callback error: {e}")

    def get_leader_info(self) -> Optional[LeaderInfo]:
        """Get information about the current leader, or ``None`` if unknown.

        ``elected_at`` is the time this node became leader, or the time it
        first observed the remote leader through a heartbeat.
        """
        if self._leader_id:
            return LeaderInfo(
                leader_id=self._leader_id,
                elected_at=self._leader_elected_at or datetime.now(),
                lease_expires_at=self._leader_lease_expires or datetime.now(),
                term=self._current_term
            )
        return None
1481
+
1482
+
1483
class DistributedLockContext:
    """Async context manager that holds a coordinator lock for its body."""

    def __init__(
        self,
        coordinator: DistributedCoordinator,
        resource_id: str,
        timeout: float
    ):
        self._coordinator, self._resource_id, self._timeout = (
            coordinator,
            resource_id,
            timeout,
        )

    async def __aenter__(self):
        """Acquire the lock or raise ``TimeoutError`` if that fails."""
        if await self._coordinator.acquire_lock(self._resource_id, self._timeout):
            return self
        raise TimeoutError(f"Failed to acquire lock on {self._resource_id}")

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Release the lock; exceptions from the body are not suppressed."""
        self._coordinator.release_lock(self._resource_id)
        return False
1505
+
1506
+
1507
+ # ============================================================================
1508
+ # ACP-006: Agent Dependency Graph
1509
+ # ============================================================================
1510
+
1511
@dataclass
class AgentDependency:
    """Represents a dependency between agents"""
    # Agent the record describes.
    agent_id: str
    # Agents that must be started before this one.
    depends_on: List[str]
    # Dependencies that are recorded but do not constrain startup order.
    optional_depends_on: List[str] = field(default_factory=list)
    startup_timeout_seconds: float = 60.0


class DependencyGraph:
    """
    Manages agent startup order based on dependencies.

    Ensures agents start in the correct order, respecting dependencies
    and detecting circular dependencies. Only required dependencies
    (``depends_on``) create ordering edges; optional ones are stored and
    reported by ``get_dependencies`` but never delay an agent. Results are
    deterministic: ties are broken by registration order.

    Features:
    - Topological sorting for startup order
    - Circular dependency detection
    - Optional vs required dependencies
    - Parallel startup where possible

    Usage:
        graph = DependencyGraph()

        graph.add_agent("api-server", depends_on=["database", "cache"])
        graph.add_agent("database", depends_on=[])
        graph.add_agent("cache", depends_on=[])

        # Get startup order
        order = graph.get_startup_order()
        # Returns: ["database", "cache", "api-server"]
    """

    def __init__(self):
        self._agents: Dict[str, AgentDependency] = {}
        self._graph: Dict[str, Set[str]] = defaultdict(set)  # agent -> depends_on
        self._reverse_graph: Dict[str, Set[str]] = defaultdict(set)  # agent -> depended_by

    def _drop_edges(self, agent_id: str) -> None:
        """Remove every outgoing dependency edge of ``agent_id`` from both graphs."""
        for dep in self._graph.pop(agent_id, set()):
            dependents = self._reverse_graph.get(dep)
            if dependents is not None:
                dependents.discard(agent_id)
                if not dependents:
                    del self._reverse_graph[dep]

    def add_agent(
        self,
        agent_id: str,
        depends_on: Optional[List[str]] = None,
        optional_depends_on: Optional[List[str]] = None,
        startup_timeout: float = 60.0
    ) -> None:
        """Add an agent with its dependencies.

        Registering an id that already exists replaces its previous
        definition, including the edges of its old dependencies.
        """
        # Copy so later mutation of the caller's lists cannot desync the graph.
        depends_on = list(depends_on or [])
        optional_depends_on = list(optional_depends_on or [])

        if agent_id in self._agents:
            self._drop_edges(agent_id)

        self._agents[agent_id] = AgentDependency(
            agent_id=agent_id,
            depends_on=depends_on,
            optional_depends_on=optional_depends_on,
            startup_timeout_seconds=startup_timeout
        )

        # Update graphs
        for dep in depends_on:
            self._graph[agent_id].add(dep)
            self._reverse_graph[dep].add(agent_id)

        logger.debug(f"Added agent {agent_id} with dependencies: {depends_on}")

    def remove_agent(self, agent_id: str) -> None:
        """Remove an agent from the dependency graph; unknown ids are ignored.

        Agents that depend on the removed one keep that dependency, which
        ``validate`` then reports as missing.
        """
        if agent_id in self._agents:
            self._drop_edges(agent_id)
            del self._agents[agent_id]

    def get_dependencies(self, agent_id: str) -> List[str]:
        """Get required followed by optional dependencies of an agent."""
        agent = self._agents.get(agent_id)
        if agent:
            return agent.depends_on + agent.optional_depends_on
        return []

    def get_dependents(self, agent_id: str) -> List[str]:
        """Get all agents that depend on this agent"""
        return list(self._reverse_graph.get(agent_id, set()))

    def has_circular_dependency(self) -> bool:
        """Check if there are any circular dependencies.

        Uses an explicit stack instead of recursion so long dependency
        chains cannot exhaust the interpreter's recursion limit.
        """
        finished: Set[str] = set()

        for root in self._agents:
            if root in finished:
                continue

            on_path = {root}
            stack = [(root, iter(self._graph.get(root, ())))]
            while stack:
                node, pending = stack[-1]
                for neighbor in pending:
                    if neighbor in on_path:
                        return True  # back edge: neighbor is an ancestor
                    if neighbor not in finished:
                        on_path.add(neighbor)
                        stack.append((neighbor, iter(self._graph.get(neighbor, ()))))
                        break
                else:
                    # All neighbors explored without finding a cycle.
                    stack.pop()
                    on_path.discard(node)
                    finished.add(node)

        return False

    def get_startup_order(self) -> List[str]:
        """
        Get the startup order using topological sort.

        Returns agents in order such that dependencies are started first.
        Dependencies on agents that are not registered are ignored.
        Raises ValueError if there are circular dependencies.
        """
        if self.has_circular_dependency():
            raise ValueError("Circular dependency detected in agent graph")

        # Kahn's algorithm for topological sort
        position = {agent_id: index for index, agent_id in enumerate(self._agents)}
        in_degree = {
            agent_id: sum(1 for dep in self._graph.get(agent_id, ()) if dep in position)
            for agent_id in self._agents
        }

        # Start with agents that have no dependencies
        queue = deque(a for a in self._agents if in_degree[a] == 0)
        result = []

        while queue:
            agent_id = queue.popleft()
            result.append(agent_id)

            # Visit dependents in registration order for a stable result.
            dependents = sorted(
                (d for d in self._reverse_graph.get(agent_id, ()) if d in position),
                key=position.__getitem__,
            )
            for dependent in dependents:
                in_degree[dependent] -= 1
                if in_degree[dependent] == 0:
                    queue.append(dependent)

        return result

    def get_parallel_startup_groups(self) -> List[List[str]]:
        """
        Get groups of agents that can be started in parallel.

        Returns a list of groups, where agents within a group can start
        simultaneously, but groups must be started in order.
        Raises ValueError if there are circular dependencies.
        """
        if self.has_circular_dependency():
            raise ValueError("Circular dependency detected")

        result = []
        remaining = set(self._agents)
        started: Set[str] = set()

        while remaining:
            # Find agents whose dependencies are all started (or unregistered)
            group = [
                agent_id
                for agent_id in self._agents
                if agent_id in remaining
                and all(
                    dep in started or dep not in self._agents
                    for dep in self._graph.get(agent_id, ())
                )
            ]

            if not group:
                raise ValueError("Unable to resolve dependencies")

            result.append(group)
            remaining.difference_update(group)
            started.update(group)

        return result

    def get_shutdown_order(self) -> List[str]:
        """Get the shutdown order (reverse of startup order)"""
        return list(reversed(self.get_startup_order()))

    def validate(self) -> List[str]:
        """
        Validate the dependency graph.

        Returns a list of validation errors, or empty list if valid.
        """
        errors = []

        # Check for circular dependencies
        if self.has_circular_dependency():
            errors.append("Circular dependency detected")

        # Check for missing dependencies
        for agent_id, agent in self._agents.items():
            for dep in agent.depends_on:
                if dep not in self._agents:
                    errors.append(f"Agent {agent_id} depends on missing agent {dep}")

        return errors
1709
+
1710
+
1711
+ # ============================================================================
1712
+ # ACP-007: Graceful Shutdown
1713
+ # ============================================================================
1714
+
1715
@dataclass
class ShutdownConfig:
    """Settings that control a graceful shutdown."""
    # How long the drain phase waits for in-flight operations to complete.
    drain_timeout_seconds: float = 30.0
    # Timeout applied when running shutdown hooks.
    force_timeout_seconds: float = 60.0
    # Checkpointing switch.
    # NOTE(review): confirm how consumers combine it with save_in_flight.
    checkpoint_enabled: bool = True
    # Whether operations still in flight after the drain are saved.
    save_in_flight: bool = True
1722
+
1723
+
1724
@dataclass
class InFlightOperation:
    """An operation that has been registered and not yet completed."""
    # Unique identifier of the operation.
    operation_id: str
    # Agent that is running the operation.
    agent_id: str
    # Free-form label for the kind of operation.
    operation_type: str
    # Moment the operation was registered.
    started_at: datetime
    # Caller-supplied payload carried along with the operation.
    data: Dict[str, Any] = field(default_factory=dict)
1732
+
1733
+
1734
+ class GracefulShutdownManager:
1735
+ """
1736
+ Manages graceful shutdown to preserve in-flight verifications.
1737
+
1738
+ Features:
1739
+ - Drain period for completing in-flight operations
1740
+ - Operation checkpointing
1741
+ - Configurable force timeout
1742
+ - Shutdown hooks
1743
+
1744
+ Usage:
1745
+ shutdown_manager = GracefulShutdownManager(
1746
+ config=ShutdownConfig(drain_timeout_seconds=30)
1747
+ )
1748
+
1749
+ # Register in-flight operation
1750
+ op_id = shutdown_manager.register_operation(
1751
+ agent_id="claims-agent",
1752
+ operation_type="verification",
1753
+ data={"claim_id": "123"}
1754
+ )
1755
+
1756
+ # Complete operation
1757
+ shutdown_manager.complete_operation(op_id)
1758
+
1759
+ # Initiate graceful shutdown
1760
+ await shutdown_manager.shutdown()
1761
+ """
1762
+
1763
+ def __init__(self, config: Optional[ShutdownConfig] = None):
1764
+ self.config = config or ShutdownConfig()
1765
+ self._phase = ShutdownPhase.RUNNING
1766
+ self._in_flight: Dict[str, InFlightOperation] = {}
1767
+ self._shutdown_hooks: List[Callable[[], Awaitable[None]]] = []
1768
+ self._checkpoint_data: Dict[str, Any] = {}
1769
+ self._lock = asyncio.Lock()
1770
+ self._shutdown_event = asyncio.Event()
1771
+
1772
+ @property
1773
+ def phase(self) -> ShutdownPhase:
1774
+ """Get the current shutdown phase"""
1775
+ return self._phase
1776
+
1777
+ @property
1778
+ def is_shutting_down(self) -> bool:
1779
+ """Check if shutdown is in progress"""
1780
+ return self._phase != ShutdownPhase.RUNNING
1781
+
1782
+ def register_operation(
1783
+ self,
1784
+ agent_id: str,
1785
+ operation_type: str,
1786
+ data: Optional[Dict[str, Any]] = None
1787
+ ) -> str:
1788
+ """Register an in-flight operation"""
1789
+ if self._phase != ShutdownPhase.RUNNING:
1790
+ raise RuntimeError("Cannot register new operations during shutdown")
1791
+
1792
+ operation_id = str(uuid.uuid4())
1793
+ self._in_flight[operation_id] = InFlightOperation(
1794
+ operation_id=operation_id,
1795
+ agent_id=agent_id,
1796
+ operation_type=operation_type,
1797
+ started_at=datetime.now(),
1798
+ data=data or {}
1799
+ )
1800
+ return operation_id
1801
+
1802
+ def complete_operation(self, operation_id: str) -> None:
1803
+ """Mark an operation as complete"""
1804
+ self._in_flight.pop(operation_id, None)
1805
+
1806
+ # Check if all operations complete during draining
1807
+ if self._phase == ShutdownPhase.DRAINING and not self._in_flight:
1808
+ self._shutdown_event.set()
1809
+
1810
+ def get_in_flight_count(self) -> int:
1811
+ """Get the number of in-flight operations"""
1812
+ return len(self._in_flight)
1813
+
1814
+ def get_in_flight_operations(self) -> List[InFlightOperation]:
1815
+ """Get all in-flight operations"""
1816
+ return list(self._in_flight.values())
1817
+
1818
+ def add_shutdown_hook(
1819
+ self,
1820
+ hook: Callable[[], Awaitable[None]]
1821
+ ) -> None:
1822
+ """Add a shutdown hook to be called during shutdown"""
1823
+ self._shutdown_hooks.append(hook)
1824
+
1825
async def shutdown(self) -> Dict[str, Any]:
    """
    Drive the manager through its shutdown phases and report the outcome.

    The whole sequence runs while holding ``self._lock``:

    1. Draining - if operations are in flight, wait on the shutdown event
       for at most ``config.drain_timeout_seconds``. Whatever is still
       tracked afterwards is listed under ``timed_out`` and, when
       ``config.save_in_flight`` is set, copied into the checkpoint store.
    2. Stopping - await every registered hook in order, each limited to
       ``config.force_timeout_seconds``. Hook timeouts and errors are
       logged and do not interrupt the sequence.
    3. Terminated - stamp the completion time and attach the manager's
       checkpoint dict (the same object, not a copy) to the summary.

    Returns:
        The summary dict, or a short ``already_shutting_down`` status dict
        when the manager is not in the RUNNING phase.
    """
    async with self._lock:
        if self._phase != ShutdownPhase.RUNNING:
            return {"status": "already_shutting_down", "phase": self._phase.value}

        logger.info("Initiating graceful shutdown")
        summary = {
            "started_at": datetime.now().isoformat(),
            "in_flight_at_start": len(self._in_flight),
            "checkpointed": [],
            "timed_out": [],
        }

        # Phase 1: Draining
        self._phase = ShutdownPhase.DRAINING
        logger.info(f"Draining {len(self._in_flight)} in-flight operations")

        if self._in_flight:
            self._shutdown_event.clear()
            try:
                drain_wait = self._shutdown_event.wait()
                await asyncio.wait_for(
                    drain_wait,
                    timeout=self.config.drain_timeout_seconds,
                )
            except asyncio.TimeoutError:
                logger.warning("Drain timeout reached, saving remaining operations")

            # Persist whatever did not finish within the drain window.
            if self.config.save_in_flight:
                checkpointed_ids = summary["checkpointed"]
                for pending_id, pending in list(self._in_flight.items()):
                    self._checkpoint_data[pending_id] = {
                        "agent_id": pending.agent_id,
                        "operation_type": pending.operation_type,
                        "data": pending.data,
                        "started_at": pending.started_at.isoformat(),
                    }
                    checkpointed_ids.append(pending_id)

            summary["timed_out"] = list(self._in_flight)

        # Phase 2: Stopping
        self._phase = ShutdownPhase.STOPPING
        logger.info("Running shutdown hooks")

        for hook in self._shutdown_hooks:
            try:
                await asyncio.wait_for(
                    hook(),
                    timeout=self.config.force_timeout_seconds,
                )
            except asyncio.TimeoutError:
                logger.warning("Shutdown hook timed out")
            except Exception as e:
                logger.error(f"Shutdown hook error: {e}")

        # Phase 3: Terminated
        self._phase = ShutdownPhase.TERMINATED
        summary["completed_at"] = datetime.now().isoformat()
        summary["checkpoint_data"] = self._checkpoint_data

        logger.info("Graceful shutdown complete")
        return summary
1892
+
1893
def get_checkpoint_data(self) -> Dict[str, Any]:
    """Return a shallow copy of the checkpoint store."""
    return {**self._checkpoint_data}
1896
+
1897
async def restore_from_checkpoint(
    self,
    checkpoint_data: Dict[str, Any]
) -> List[InFlightOperation]:
    """Rebuild in-flight operations from a checkpoint mapping.

    Each entry maps an operation ID to a dict with ``agent_id``,
    ``operation_type`` and an ISO-format ``started_at``; ``data`` is
    optional and defaults to an empty dict. Every rebuilt operation is
    put back into the in-flight registry under its ID.

    Returns:
        The rebuilt operations, in the iteration order of the mapping.
    """
    revived = []
    for operation_id, record in checkpoint_data.items():
        operation = InFlightOperation(
            operation_id=operation_id,
            agent_id=record["agent_id"],
            operation_type=record["operation_type"],
            started_at=datetime.fromisoformat(record["started_at"]),
            data=record.get("data", {}),
        )
        self._in_flight[operation_id] = operation
        revived.append(operation)

    logger.info(f"Restored {len(revived)} operations from checkpoint")
    return revived
1916
+
1917
+
1918
+ # ============================================================================
1919
+ # ACP-008: Resource Quotas
1920
+ # ============================================================================
1921
+
1922
@dataclass
class AgentResourceQuota:
    """Resource quota limits for an agent"""
    # Upper bound compared against ResourceUsage.memory_mb.
    memory_mb: int = 512
    # Upper bound compared against ResourceUsage.cpu_percent.
    cpu_percent: float = 25.0
    # Maximum number of operations allowed to run at the same time.
    max_concurrent_operations: int = 10
    # Maximum number of operation starts allowed in a 60 second window.
    max_operations_per_minute: int = 100
    # The two fields below are carried on the quota but are not checked
    # by ResourceQuotaManager.
    network_bandwidth_mbps: Optional[float] = None
    storage_mb: Optional[int] = None


@dataclass
class ResourceUsage:
    """Current resource usage for an agent"""
    agent_id: str
    memory_mb: float = 0.0
    cpu_percent: float = 0.0
    concurrent_operations: int = 0
    # Not updated by ResourceQuotaManager; the manager counts recent
    # operations from its own timestamp history instead.
    operations_this_minute: int = 0
    timestamp: datetime = field(default_factory=datetime.now)


class ResourceQuotaManager:
    """
    Manages resource quotas and limits per agent.

    Features:
    - Memory and CPU limits
    - Concurrent operation limits
    - Rate limiting (operations per minute)
    - Usage tracking and reporting

    Usage:
        quota_manager = ResourceQuotaManager()

        quota_manager.set_quota("claims-agent", AgentResourceQuota(
            memory_mb=512,
            cpu_percent=25,
            max_concurrent_operations=10
        ))

        # Check before operation
        if quota_manager.can_execute("claims-agent"):
            quota_manager.record_operation_start("claims-agent")
            try:
                ...  # Execute operation
            finally:
                quota_manager.record_operation_end("claims-agent")
    """

    def __init__(self):
        self._quotas: Dict[str, AgentResourceQuota] = {}
        self._usage: Dict[str, ResourceUsage] = {}
        # Per-agent history of operation start times. The default cap of
        # 1000 is raised by set_quota() when a quota needs a longer history.
        self._operation_counts: Dict[str, deque] = defaultdict(lambda: deque(maxlen=1000))
        self._lock = asyncio.Lock()

    def set_quota(self, agent_id: str, quota: AgentResourceQuota) -> None:
        """Set the resource quota for an agent.

        Also creates an empty usage record for the agent if none exists and
        makes sure the timestamp history is long enough for the quota's
        per-minute limit to be reachable.
        """
        self._quotas[agent_id] = quota
        if agent_id not in self._usage:
            self._usage[agent_id] = ResourceUsage(agent_id=agent_id)
        self._ensure_history_capacity(agent_id, quota.max_operations_per_minute)
        logger.info(f"Set quota for {agent_id}: memory={quota.memory_mb}MB, cpu={quota.cpu_percent}%")

    def _ensure_history_capacity(self, agent_id: str, required: int) -> None:
        """Grow an agent's timestamp history to hold at least ``required`` entries.

        A history shorter than the per-minute limit can never report that
        many recent operations, which would leave the limit unenforced.
        Existing timestamps are kept; the history is never shrunk.
        """
        required = int(required)
        history = self._operation_counts[agent_id]
        if history.maxlen is not None and history.maxlen < required:
            self._operation_counts[agent_id] = deque(history, maxlen=required)

    def get_quota(self, agent_id: str) -> Optional[AgentResourceQuota]:
        """Get the quota for an agent, or None when none is set."""
        return self._quotas.get(agent_id)

    def can_execute(self, agent_id: str) -> bool:
        """Check if an agent can execute a new operation.

        Returns True when the agent has no quota or no usage record.
        Otherwise returns False (after logging a warning) as soon as one of
        these checks fails, in this order: concurrent operations, operations
        in the last 60 seconds, memory, CPU.
        """
        quota = self._quotas.get(agent_id)
        if not quota:
            return True  # No quota means no limits

        usage = self._usage.get(agent_id)
        if not usage:
            return True

        # Check concurrent operations
        if usage.concurrent_operations >= quota.max_concurrent_operations:
            logger.warning(f"Agent {agent_id} at max concurrent operations")
            return False

        # Check rate limit
        ops_this_minute = self._count_recent_operations(agent_id, seconds=60)
        if ops_this_minute >= quota.max_operations_per_minute:
            logger.warning(f"Agent {agent_id} at rate limit")
            return False

        # Check memory
        if usage.memory_mb > quota.memory_mb:
            logger.warning(f"Agent {agent_id} over memory quota")
            return False

        # Check CPU
        if usage.cpu_percent > quota.cpu_percent:
            logger.warning(f"Agent {agent_id} over CPU quota")
            return False

        return True

    def record_operation_start(self, agent_id: str) -> None:
        """Record the start of an operation.

        Increments the agent's concurrent-operation count and appends the
        current time to its timestamp history.
        """
        if agent_id not in self._usage:
            self._usage[agent_id] = ResourceUsage(agent_id=agent_id)

        self._usage[agent_id].concurrent_operations += 1
        self._operation_counts[agent_id].append(datetime.now())

    def record_operation_end(self, agent_id: str) -> None:
        """Record the end of an operation (the count never goes below zero)."""
        if agent_id in self._usage:
            self._usage[agent_id].concurrent_operations = max(
                0, self._usage[agent_id].concurrent_operations - 1
            )

    def update_resource_usage(
        self,
        agent_id: str,
        memory_mb: Optional[float] = None,
        cpu_percent: Optional[float] = None
    ) -> None:
        """Update the resource usage for an agent.

        Only the values that are not None are overwritten; the usage
        timestamp is refreshed in every case.
        """
        if agent_id not in self._usage:
            self._usage[agent_id] = ResourceUsage(agent_id=agent_id)

        usage = self._usage[agent_id]
        if memory_mb is not None:
            usage.memory_mb = memory_mb
        if cpu_percent is not None:
            usage.cpu_percent = cpu_percent
        usage.timestamp = datetime.now()

    def _count_recent_operations(self, agent_id: str, seconds: int) -> int:
        """Count operations in the last N seconds"""
        cutoff = datetime.now() - timedelta(seconds=seconds)
        # .get() avoids creating a history entry for agents never seen.
        history = self._operation_counts.get(agent_id, ())
        return sum(1 for ts in history if ts > cutoff)

    def get_usage(self, agent_id: str) -> Optional[ResourceUsage]:
        """Get current usage for an agent"""
        return self._usage.get(agent_id)

    def get_all_usage(self) -> Dict[str, ResourceUsage]:
        """Get usage for all agents (shallow copy of the internal mapping)."""
        return dict(self._usage)

    def check_quota_violations(self) -> Dict[str, List[str]]:
        """Check for quota violations across all agents.

        Returns:
            A mapping from agent ID to human-readable violation messages.
            Agents without a usage record or without violations are omitted.
        """
        violations = {}

        for agent_id, quota in self._quotas.items():
            usage = self._usage.get(agent_id)
            if not usage:
                continue

            agent_violations = []

            if usage.memory_mb > quota.memory_mb:
                agent_violations.append(
                    f"Memory: {usage.memory_mb:.1f}MB > {quota.memory_mb}MB"
                )

            if usage.cpu_percent > quota.cpu_percent:
                agent_violations.append(
                    f"CPU: {usage.cpu_percent:.1f}% > {quota.cpu_percent}%"
                )

            if usage.concurrent_operations > quota.max_concurrent_operations:
                agent_violations.append(
                    f"Concurrent ops: {usage.concurrent_operations} > {quota.max_concurrent_operations}"
                )

            if agent_violations:
                violations[agent_id] = agent_violations

        return violations
2098
+
2099
+
2100
+ # ============================================================================
2101
+ # ACP-009: Agent Observability
2102
+ # ============================================================================
2103
+
2104
@dataclass
class AgentMetric:
    """A metric measurement for an agent"""
    name: str
    value: float
    labels: Dict[str, str] = field(default_factory=dict)
    timestamp: datetime = field(default_factory=datetime.now)
    metric_type: str = "gauge"  # gauge, counter, histogram


@dataclass
class AgentLogEntry:
    """A log entry from an agent"""
    agent_id: str
    level: str  # debug, info, warning, error, critical
    message: str
    timestamp: datetime = field(default_factory=datetime.now)
    context: Dict[str, Any] = field(default_factory=dict)


class AgentObservabilityProvider:
    """
    Built-in observability for agents (metrics, logging, tracing).

    Features:
    - Structured logging with context
    - Metrics collection (counters, gauges, histograms)
    - Distributed tracing support
    - Prometheus-compatible export

    Usage:
        observability = AgentObservabilityProvider()

        # Record metric
        observability.record_metric(
            agent_id="claims-agent",
            name="verification_latency_ms",
            value=150.5,
            labels={"claim_type": "auto"}
        )

        # Log with context
        observability.log(
            agent_id="claims-agent",
            level="info",
            message="Verification completed",
            context={"claim_id": "123", "result": "approved"}
        )

        # Export metrics
        metrics = observability.export_prometheus()
    """

    # Level names log() forwards to the module logger, mapped to the logger
    # method used. Any other level falls back to ``logger.info`` so that a
    # stray value can never resolve to a non-logging attribute.
    _LOG_METHODS = {
        "debug": "debug",
        "info": "info",
        "warning": "warning",
        "warn": "warning",
        "error": "error",
        "exception": "exception",
        "critical": "critical",
        "fatal": "critical",
    }

    def __init__(self, max_log_entries: int = 10000, max_metrics: int = 10000):
        # Raw measurements per agent, newest last, bounded per agent.
        self._metrics: Dict[str, deque] = defaultdict(lambda: deque(maxlen=max_metrics))
        self._logs: deque = deque(maxlen=max_log_entries)
        # Aggregates keyed by "<name>:<label key>".
        self._counters: Dict[str, float] = defaultdict(float)
        self._gauges: Dict[str, float] = {}
        # NOTE(review): histogram observations are keyed by metric name only
        # and grow without bound.
        self._histograms: Dict[str, List[float]] = defaultdict(list)
        # Name and labels behind each aggregate key, used by the exporter.
        self._metric_metadata: Dict[str, Dict[str, Any]] = {}
        self._lock = asyncio.Lock()

    def record_metric(
        self,
        agent_id: str,
        name: str,
        value: float,
        labels: Optional[Dict[str, str]] = None,
        metric_type: str = "gauge"
    ) -> None:
        """Record a metric for an agent.

        The measurement is stored with an ``agent_id`` label added (which
        overrides any ``agent_id`` key supplied by the caller) and the
        matching aggregate is updated: counters accumulate, gauges keep the
        last value, histograms collect observations. Other metric types are
        stored as raw measurements only.
        """
        # Copy first so the caller's dict is never modified.
        labels = dict(labels) if labels else {}
        labels["agent_id"] = agent_id

        metric = AgentMetric(
            name=name,
            value=value,
            labels=labels,
            metric_type=metric_type
        )

        full_name = f"{name}:{self._make_label_key(labels)}"
        self._metrics[agent_id].append(metric)
        self._metric_metadata[full_name] = {"name": name, "labels": dict(labels)}

        # Update aggregates
        if metric_type == "counter":
            self._counters[full_name] += value
        elif metric_type == "gauge":
            self._gauges[full_name] = value
        elif metric_type == "histogram":
            self._histograms[name].append(value)

    def increment_counter(
        self,
        agent_id: str,
        name: str,
        value: float = 1.0,
        labels: Optional[Dict[str, str]] = None
    ) -> None:
        """Increment a counter metric"""
        self.record_metric(agent_id, name, value, labels, metric_type="counter")

    def set_gauge(
        self,
        agent_id: str,
        name: str,
        value: float,
        labels: Optional[Dict[str, str]] = None
    ) -> None:
        """Set a gauge metric"""
        self.record_metric(agent_id, name, value, labels, metric_type="gauge")

    def observe_histogram(
        self,
        agent_id: str,
        name: str,
        value: float,
        labels: Optional[Dict[str, str]] = None
    ) -> None:
        """Observe a value for a histogram metric"""
        self.record_metric(agent_id, name, value, labels, metric_type="histogram")

    def log(
        self,
        agent_id: str,
        level: str,
        message: str,
        context: Optional[Dict[str, Any]] = None
    ) -> None:
        """Log a message with structured context.

        The entry is kept in the bounded in-memory log and also forwarded
        to the module logger. Unrecognised levels are forwarded at info.
        """
        entry = AgentLogEntry(
            agent_id=agent_id,
            level=level,
            message=message,
            context=context or {}
        )
        self._logs.append(entry)

        # Also log to Python logger
        method_name = self._LOG_METHODS.get(level.lower(), "info")
        log_func = getattr(logger, method_name)
        log_func(f"[{agent_id}] {message}", extra={"context": context})

    def get_metrics(
        self,
        agent_id: Optional[str] = None,
        name: Optional[str] = None
    ) -> List[AgentMetric]:
        """Get recorded metrics, optionally filtered by agent and/or name."""
        if agent_id:
            metrics = list(self._metrics.get(agent_id, []))
        else:
            metrics = []
            for agent_metrics in self._metrics.values():
                metrics.extend(agent_metrics)

        if name:
            metrics = [m for m in metrics if m.name == name]

        return metrics

    def get_logs(
        self,
        agent_id: Optional[str] = None,
        level: Optional[str] = None,
        limit: int = 100
    ) -> List[AgentLogEntry]:
        """Get the most recent log entries, oldest first.

        Args:
            agent_id: Keep only entries from this agent.
            level: Keep only entries whose level equals this string.
            limit: Maximum number of entries; a non-positive limit yields
                an empty list.
        """
        # ``logs[-0:]`` would be the whole list, so handle it up front.
        if limit <= 0:
            return []

        logs = list(self._logs)

        if agent_id:
            logs = [entry for entry in logs if entry.agent_id == agent_id]
        if level:
            logs = [entry for entry in logs if entry.level == level]

        return logs[-limit:]

    def export_prometheus(self) -> str:
        """Export metrics in Prometheus text format.

        Counters and gauges are written as ``name{labels} value`` with one
        ``# TYPE`` line per metric name. Histogram observations are written
        as a summary (``_count``, ``_sum`` and 0.5/0.9/0.99 quantiles),
        because quantile samples belong to the summary type.
        """
        lines: List[str] = []

        self._append_series(lines, self._counters, "counter")
        self._append_series(lines, self._gauges, "gauge")

        # Export histogram summaries
        for name, values in self._histograms.items():
            if not values:
                continue
            lines.append(f"# TYPE {name} summary")
            lines.append(f"{name}_count {len(values)}")
            lines.append(f"{name}_sum {sum(values)}")

            # Calculate percentiles
            sorted_vals = sorted(values)
            last = len(sorted_vals) - 1
            for p in [0.5, 0.9, 0.99]:
                idx = min(int(len(sorted_vals) * p), last)
                lines.append(f'{name}{{quantile="{p}"}} {sorted_vals[idx]}')

        return "\n".join(lines)

    def _append_series(
        self,
        lines: List[str],
        series: Dict[str, float],
        kind: str
    ) -> None:
        """Append one metric family per name from an aggregate mapping."""
        grouped: Dict[str, List[str]] = {}
        for full_name, value in series.items():
            meta = self._metric_metadata.get(full_name)
            if meta is not None:
                name = meta["name"]
                label_text = self._format_labels(meta["labels"])
            else:
                # Key not created through record_metric(): best-effort split.
                name, _, label_text = full_name.partition(":")
            sample = f"{name}{{{label_text}}} {value}" if label_text else f"{name} {value}"
            grouped.setdefault(name, []).append(sample)

        for name, samples in grouped.items():
            lines.append(f"# TYPE {name} {kind}")
            lines.extend(samples)

    @staticmethod
    def _format_labels(labels: Dict[str, str]) -> str:
        """Render labels for export, escaping backslash, quote and newline."""
        parts = []
        for key, raw in sorted(labels.items()):
            escaped = str(raw).replace("\\", "\\\\").replace("\n", "\\n").replace('"', '\\"')
            parts.append(f'{key}="{escaped}"')
        return ",".join(parts)

    def _make_label_key(self, labels: Dict[str, str]) -> str:
        """Create a unique key from labels"""
        return ",".join(f'{k}="{v}"' for k, v in sorted(labels.items()))

    def get_agent_summary(self, agent_id: str) -> Dict[str, Any]:
        """Get an observability summary for an agent.

        Totals and level counts cover every retained log entry for the
        agent, not only the default page of ``get_logs``.
        """
        metrics = self.get_metrics(agent_id)
        logs = self.get_logs(agent_id, limit=len(self._logs))

        return {
            "agent_id": agent_id,
            "total_metrics": len(metrics),
            "total_logs": len(logs),
            "recent_metrics": metrics[-10:] if metrics else [],
            "recent_logs": logs[-10:] if logs else [],
            "log_level_counts": self._count_log_levels(logs)
        }

    def _count_log_levels(self, logs: List[AgentLogEntry]) -> Dict[str, int]:
        """Count logs by level"""
        counts = defaultdict(int)
        for log in logs:
            counts[log.level] += 1
        return dict(counts)
2335
+
2336
+
2337
+ # ============================================================================
2338
+ # ACP-010: Hot Reload
2339
+ # ============================================================================
2340
+
2341
@dataclass
class HotReloadConfig:
    """Configuration for hot reload"""
    enabled: bool = True
    watch_paths: List[str] = field(default_factory=list)
    reload_delay_seconds: float = 1.0
    # When True, reload_agent() carries state from the old instance over
    # to the new one.
    preserve_state: bool = True


@dataclass
class ReloadEvent:
    """Record of a hot reload event"""
    agent_id: str
    old_version: str
    new_version: str
    timestamp: datetime = field(default_factory=datetime.now)
    success: bool = True
    # Failure message, or "No changes detected" for a skipped reload.
    error: Optional[str] = None
    preserved_state: Dict[str, Any] = field(default_factory=dict)


class HotReloadManager:
    """
    Manages hot reload of agent code without full restart.

    Features:
    - Code change detection
    - Graceful reload with state preservation
    - Version tracking
    - Rollback support

    Usage:
        hot_reload = HotReloadManager(
            config=HotReloadConfig(
                watch_paths=["./agents"],
                preserve_state=True
            )
        )

        # Register agent
        hot_reload.register_agent(
            agent_id="claims-agent",
            module_name="agents.claims",
            class_name="ClaimsAgent"
        )

        # Trigger reload
        await hot_reload.reload_agent("claims-agent")
    """

    def __init__(self, config: Optional[HotReloadConfig] = None):
        self.config = config or HotReloadConfig()
        self._agents: Dict[str, Dict[str, Any]] = {}
        self._versions: Dict[str, str] = {}
        # Last replaced instance and version per agent, for rollback.
        self._previous_versions: Dict[str, Any] = {}
        self._reload_history: deque = deque(maxlen=100)
        self._callbacks: Dict[str, List[Callable]] = defaultdict(list)
        self._lock = asyncio.Lock()

    def register_agent(
        self,
        agent_id: str,
        module_name: str,
        class_name: str,
        factory: Optional[Callable[[], Any]] = None,
        instance: Optional[Any] = None,
        state_extractor: Optional[Callable[[Any], Dict[str, Any]]] = None,
        state_injector: Optional[Callable[[Any, Dict[str, Any]], None]] = None
    ) -> None:
        """Register an agent for hot reload.

        Args:
            agent_id: Key under which the agent is tracked.
            module_name: Importable name of the module to reload.
            class_name: Attribute looked up on the reloaded module; called
                to build the new instance when no factory is given.
            factory: Optional callable producing the new instance (may
                return a coroutine, which is awaited).
            instance: The currently running instance, if any.
            state_extractor: Optional callable reading state from the old
                instance; otherwise ``get_state()`` is used when present.
            state_injector: Optional callable writing state into the new
                instance; otherwise ``set_state()`` is used when present.
        """
        self._agents[agent_id] = {
            "module_name": module_name,
            "class_name": class_name,
            "factory": factory,
            "instance": instance,
            "state_extractor": state_extractor,
            "state_injector": state_injector
        }
        self._versions[agent_id] = self._compute_version(module_name)
        logger.info(f"Registered agent {agent_id} for hot reload (version: {self._versions[agent_id][:8]})")

    def _compute_version(self, module_name: str) -> str:
        """Compute a version hash for a module.

        Returns the MD5 hex digest of the module's source file when the
        module is loaded and has a file; otherwise the digest of the module
        name itself. The hash identifies content only.
        """
        try:
            module = sys.modules.get(module_name)
            if module and hasattr(module, '__file__') and module.__file__:
                with open(module.__file__, 'rb') as f:
                    return hashlib.md5(f.read()).hexdigest()
        except Exception as e:
            logger.warning(f"Could not compute version for {module_name}: {e}")

        return hashlib.md5(module_name.encode()).hexdigest()

    @staticmethod
    async def _invoke(method: Callable[[], Any]) -> None:
        """Call a lifecycle method, awaiting it when it is a coroutine function."""
        if asyncio.iscoroutinefunction(method):
            await method()
        else:
            method()

    async def check_for_changes(self, agent_id: str) -> bool:
        """Check if an agent's code has changed since its version was recorded."""
        if agent_id not in self._agents:
            return False

        module_name = self._agents[agent_id]["module_name"]
        new_version = self._compute_version(module_name)
        old_version = self._versions.get(agent_id, "")

        return new_version != old_version

    async def reload_agent(
        self,
        agent_id: str,
        force: bool = False
    ) -> ReloadEvent:
        """Reload an agent with optional state preservation.

        Steps: extract state from the current instance, stop it, reload the
        module, build and start a new instance, then update the registry.
        If any step fails after the old instance was stopped, the old
        instance is started again (best effort) because the registry still
        points at it.

        Args:
            agent_id: A registered agent ID.
            force: Reload even when no change is detected.

        Returns:
            A ReloadEvent. A skipped reload is reported as success with
            ``error="No changes detected"`` and is not added to the history.

        Raises:
            ValueError: If the agent is not registered.
        """
        if agent_id not in self._agents:
            raise ValueError(f"Agent {agent_id} not registered for hot reload")

        async with self._lock:
            agent_info = self._agents[agent_id]
            module_name = agent_info["module_name"]
            class_name = agent_info["class_name"]
            old_version = self._versions.get(agent_id, "unknown")

            # Check if reload needed
            if not force and not await self.check_for_changes(agent_id):
                return ReloadEvent(
                    agent_id=agent_id,
                    old_version=old_version,
                    new_version=old_version,
                    success=True,
                    error="No changes detected"
                )

            old_instance = agent_info.get("instance")
            old_stopped = False

            try:
                # Extract state from current instance
                preserved_state = {}
                if self.config.preserve_state and old_instance:
                    extractor = agent_info.get("state_extractor")
                    if extractor:
                        preserved_state = extractor(old_instance)
                    elif hasattr(old_instance, 'get_state'):
                        preserved_state = old_instance.get_state()

                # Stop old instance
                if old_instance and hasattr(old_instance, 'stop'):
                    await self._invoke(old_instance.stop)
                    old_stopped = True

                # Store for potential rollback
                self._previous_versions[agent_id] = {
                    "instance": old_instance,
                    "version": old_version
                }

                # Reload the module
                if module_name in sys.modules:
                    module = importlib.reload(sys.modules[module_name])
                else:
                    module = importlib.import_module(module_name)

                # Create new instance. The class is looked up even when a
                # factory is used, so a missing class fails the reload.
                agent_class = getattr(module, class_name)

                if agent_info.get("factory"):
                    new_instance = agent_info["factory"]()
                else:
                    new_instance = agent_class()

                if asyncio.iscoroutine(new_instance):
                    new_instance = await new_instance

                # Inject preserved state
                if preserved_state:
                    injector = agent_info.get("state_injector")
                    if injector:
                        injector(new_instance, preserved_state)
                    elif hasattr(new_instance, 'set_state'):
                        new_instance.set_state(preserved_state)

                # Start new instance
                if hasattr(new_instance, 'start'):
                    await self._invoke(new_instance.start)

                # Update registry
                agent_info["instance"] = new_instance
                new_version = self._compute_version(module_name)
                self._versions[agent_id] = new_version

                event = ReloadEvent(
                    agent_id=agent_id,
                    old_version=old_version,
                    new_version=new_version,
                    success=True,
                    preserved_state=preserved_state
                )

                self._reload_history.append(event)
                await self._trigger_callbacks("reload_success", agent_id, event)

                logger.info(f"Hot reloaded agent {agent_id}: {old_version[:8]} -> {new_version[:8]}")
                return event

            except Exception as e:
                # The registry was not updated, so bring the old instance
                # back up instead of leaving the agent stopped.
                if old_stopped and agent_info.get("instance") is old_instance \
                        and hasattr(old_instance, 'start'):
                    try:
                        await self._invoke(old_instance.start)
                    except Exception as restart_error:
                        logger.error(
                            f"Could not restart previous instance of {agent_id}: {restart_error}"
                        )

                event = ReloadEvent(
                    agent_id=agent_id,
                    old_version=old_version,
                    new_version=old_version,
                    success=False,
                    error=str(e)
                )

                self._reload_history.append(event)
                await self._trigger_callbacks("reload_failed", agent_id, event)

                logger.error(f"Hot reload failed for {agent_id}: {e}")
                return event

    async def rollback_agent(self, agent_id: str) -> bool:
        """Rollback an agent to the previous version.

        Stops the current instance, starts the stored previous instance and
        restores its version string. The loaded module itself is not
        reverted.

        Returns:
            True on success; False when no previous version is stored or
            the rollback raised (the error is logged).
        """
        if agent_id not in self._previous_versions:
            logger.warning(f"No previous version available for {agent_id}")
            return False

        async with self._lock:
            try:
                prev = self._previous_versions[agent_id]
                agent_info = self._agents[agent_id]

                # Stop current instance
                current = agent_info.get("instance")
                if current and hasattr(current, 'stop'):
                    await self._invoke(current.stop)

                # Restore previous instance
                prev_instance = prev["instance"]
                if prev_instance and hasattr(prev_instance, 'start'):
                    await self._invoke(prev_instance.start)

                agent_info["instance"] = prev_instance
                self._versions[agent_id] = prev["version"]

                logger.info(f"Rolled back agent {agent_id} to version {prev['version'][:8]}")
                return True

            except Exception as e:
                logger.error(f"Rollback failed for {agent_id}: {e}")
                return False

    def get_agent_version(self, agent_id: str) -> Optional[str]:
        """Get the current version of an agent"""
        return self._versions.get(agent_id)

    def get_agent_instance(self, agent_id: str) -> Optional[Any]:
        """Get the current instance of an agent"""
        if agent_id in self._agents:
            return self._agents[agent_id].get("instance")
        return None

    def get_reload_history(
        self,
        agent_id: Optional[str] = None
    ) -> List[ReloadEvent]:
        """Get reload history (at most the last 100 events), oldest first."""
        history = list(self._reload_history)
        if agent_id:
            history = [e for e in history if e.agent_id == agent_id]
        return history

    def on_event(
        self,
        event: str,
        callback: Callable[[str, ReloadEvent], Awaitable[None]]
    ) -> None:
        """Register a callback for reload events.

        This class fires ``"reload_success"`` and ``"reload_failed"``.
        """
        self._callbacks[event].append(callback)

    async def _trigger_callbacks(
        self,
        event: str,
        agent_id: str,
        reload_event: ReloadEvent
    ) -> None:
        """Trigger callbacks for an event; callback errors are logged, not raised."""
        for callback in self._callbacks.get(event, []):
            try:
                await callback(agent_id, reload_event)
            except Exception as e:
                logger.error(f"Hot reload callback error: {e}")
2637
+
2638
+
2639
+ # ============================================================================
2640
+ # Main Agent Registration
2641
+ # ============================================================================
2642
+
2643
@dataclass
class AgentRegistration:
    """Everything the control plane records about one registered agent."""
    # The agent class itself (not an instance).
    agent_type: Type
    # Requested number of replicas.
    replicas: int = 1
    # IDs of the agents this one depends on.
    dependencies: List[str] = field(default_factory=list)
    # Optional per-agent settings; None means none was supplied.
    resources: Optional[AgentResourceQuota] = None
    health_config: Optional[HealthCheckConfig] = None
    recovery_config: Optional[RecoveryConfig] = None
    circuit_breaker: Optional[CircuitBreaker] = None
    # Free-form extra information attached at registration time.
    metadata: Dict[str, Any] = field(default_factory=dict)
2654
+
2655
+
2656
+ # ============================================================================
2657
+ # Enhanced Agent Control Plane
2658
+ # ============================================================================
2659
+
2660
+ class EnhancedAgentControlPlane:
2661
+ """
2662
+ Enhanced Agent Control Plane with full lifecycle management.
2663
+
2664
+ This is the main interface for managing autonomous AI agents with
2665
+ comprehensive lifecycle features including health monitoring,
2666
+ auto-recovery, circuit breakers, scaling, distributed coordination,
2667
+ dependency management, graceful shutdown, resource quotas,
2668
+ observability, and hot reload.
2669
+
2670
+ Usage:
2671
+ control_plane = EnhancedAgentControlPlane(
2672
+ health_check_interval=30,
2673
+ auto_recovery=True,
2674
+ circuit_breaker=CircuitBreaker(
2675
+ failure_threshold=5,
2676
+ recovery_timeout=60
2677
+ )
2678
+ )
2679
+
2680
+ control_plane.register(
2681
+ ClaimsAgent,
2682
+ replicas=3,
2683
+ dependencies=["message-bus"],
2684
+ resources=AgentResourceQuota(
2685
+ memory_mb=512,
2686
+ cpu_percent=25
2687
+ )
2688
+ )
2689
+
2690
+ await control_plane.start_all()
2691
+ """
2692
+
2693
def __init__(
    self,
    health_check_interval: float = 30.0,
    auto_recovery: bool = True,
    circuit_breaker: Optional[CircuitBreaker] = None,
    node_id: Optional[str] = None,
    health_config: Optional[HealthCheckConfig] = None,
    recovery_config: Optional[RecoveryConfig] = None,
    scaling_config: Optional[ScalingConfig] = None,
    shutdown_config: Optional[ShutdownConfig] = None,
    hot_reload_config: Optional[HotReloadConfig] = None
):
    """
    Build the control plane and all of its component managers.

    Args:
        health_check_interval: Seconds between liveness checks. Written
            onto the health config, including one passed in by the caller.
        auto_recovery: Whether auto-recovery is enabled. Written onto the
            recovery config, including one passed in by the caller.
        circuit_breaker: Default circuit breaker for registered agents.
        node_id: Node ID for distributed coordination; a random
            ``node-<8 hex chars>`` ID is generated when omitted.
        health_config: Health check configuration.
        recovery_config: Auto-recovery configuration.
        scaling_config: Default scaling configuration.
        shutdown_config: Graceful shutdown configuration.
        hot_reload_config: Hot reload configuration.
    """
    if not node_id:
        node_id = f"node-{uuid.uuid4().hex[:8]}"
    self.node_id = node_id

    # Health monitoring: the interval argument always wins over whatever
    # the supplied config carried.
    if not health_config:
        health_config = HealthCheckConfig()
    health_config.liveness_interval_seconds = health_check_interval
    self.health_monitor = HealthMonitor(config=health_config)

    # Auto-recovery: likewise, the flag argument overrides the config.
    if not recovery_config:
        recovery_config = RecoveryConfig()
    recovery_config.enabled = auto_recovery
    self.recovery_manager = AutoRecoveryManager(config=recovery_config)

    # Circuit breakers
    self.default_circuit_breaker = circuit_breaker
    self.circuit_breaker_registry = CircuitBreakerRegistry()

    # Scaling
    self.scaler = AgentScaler()
    self.default_scaling_config = scaling_config if scaling_config else ScalingConfig()

    # Distributed coordination
    self.coordinator = DistributedCoordinator(node_id=self.node_id)

    # Dependency graph
    self.dependency_graph = DependencyGraph()

    # Graceful shutdown
    effective_shutdown = shutdown_config if shutdown_config else ShutdownConfig()
    self.shutdown_manager = GracefulShutdownManager(config=effective_shutdown)

    # Resource quotas
    self.quota_manager = ResourceQuotaManager()

    # Observability
    self.observability = AgentObservabilityProvider()

    # Hot reload
    effective_hot_reload = hot_reload_config if hot_reload_config else HotReloadConfig()
    self.hot_reload = HotReloadManager(config=effective_hot_reload)

    # Agent registrations
    self._registrations: Dict[str, AgentRegistration] = {}
    self._instances: Dict[str, List[Any]] = defaultdict(list)
    self._running = False

    # Wire up callbacks between the components built above
    self._setup_callbacks()
2768
+
2769
def _setup_callbacks(self) -> None:
    """Connect the health monitor and the recovery manager to each other.

    Two handlers are registered:

    * ``liveness_failed`` on the health monitor: log the failure, bump
      the ``health_failures_total`` counter and hand the agent to the
      recovery manager.
    * ``recovery_success`` on the recovery manager: log the recovery,
      bump the ``recoveries_total`` counter and register the agent
      returned by the recovery manager with the health monitor again.
    """
    # Health -> Recovery: trigger recovery on health failure
    async def on_liveness_failed(agent_id: str):
        self.observability.log(agent_id, "error", "Liveness check failed")
        self.observability.increment_counter(agent_id, "health_failures_total")
        await self.recovery_manager.handle_failure(agent_id)

    # Recovery -> Health: register recovered agents
    async def on_recovery_success(agent_id: str, event: RecoveryEvent):
        self.observability.log(agent_id, "info", f"Agent recovered (attempt {event.attempt})")
        self.observability.increment_counter(agent_id, "recoveries_total")
        recovered = self.recovery_manager.get_agent(agent_id)
        if not recovered:
            return
        self.health_monitor.register_agent(agent_id, recovered)

    self.health_monitor.on_event("liveness_failed", on_liveness_failed)
    self.recovery_manager.on_event("recovery_success", on_recovery_success)
2788
+
2789
def register(
    self,
    agent_type: Type,
    agent_id: Optional[str] = None,
    replicas: int = 1,
    dependencies: Optional[List[str]] = None,
    resources: Optional[AgentResourceQuota] = None,
    health_config: Optional[HealthCheckConfig] = None,
    recovery_config: Optional[RecoveryConfig] = None,
    circuit_breaker: Optional[CircuitBreaker] = None,
    **metadata
) -> str:
    """
    Record an agent type and announce it to the control plane components.

    The registration is stored under the agent ID, then the agent is added
    to the dependency graph and the scaler. A resource quota and a circuit
    breaker are installed only when the caller supplies them.

    Args:
        agent_type: The agent class; it is instantiated with no arguments
        agent_id: Agent ID to register under (falls back to the class name)
        replicas: Number of replicas requested
        dependencies: IDs of the agents this agent depends on
        resources: Resource quota for this agent
        health_config: Health check configuration stored on the registration
        recovery_config: Recovery configuration stored on the registration
        circuit_breaker: Circuit breaker for this agent; the registration
            falls back to the control plane default when omitted
        **metadata: Extra key/value pairs stored on the registration

    Returns:
        The agent ID the registration was stored under
    """
    if not agent_id:
        agent_id = agent_type.__name__
    if not dependencies:
        dependencies = []

    self._registrations[agent_id] = AgentRegistration(
        agent_type=agent_type,
        replicas=replicas,
        dependencies=dependencies,
        resources=resources,
        health_config=health_config,
        recovery_config=recovery_config,
        circuit_breaker=circuit_breaker or self.default_circuit_breaker,
        metadata=metadata
    )

    # Dependency graph
    self.dependency_graph.add_agent(agent_id, depends_on=dependencies)

    # Scaler: the class is bound as a default argument so the factory keeps
    # building this agent type.
    def build_instance(at=agent_type):
        return at()

    self.scaler.register_agent_type(
        agent_type=agent_id,
        factory=build_instance,
        config=self.default_scaling_config,
        replicas=replicas
    )

    # Optional per-agent resource quota
    if resources:
        self.quota_manager.set_quota(agent_id, resources)

    # Only an explicitly supplied breaker is placed in the registry; the
    # default breaker is kept on the registration alone.
    if circuit_breaker:
        self.circuit_breaker_registry._breakers[agent_id] = circuit_breaker

    self.observability.log(
        agent_id,
        "info",
        f"Registered agent with {replicas} replicas, dependencies: {dependencies}"
    )
    logger.info(f"Registered agent {agent_id}: replicas={replicas}, dependencies={dependencies}")

    return agent_id
2860
+
2861
async def start_all(self) -> Dict[str, Any]:
    """
    Start all registered agents in dependency order.

    The dependency graph is validated first. If it is valid, the coordinator
    and the health monitor are started, then each startup group is started
    with its agents running concurrently, and finally the scaler is started.

    Returns:
        Summary of startup results. ``"status"`` is one of
        ``"already_running"``, ``"started"`` or ``"failed"``. Except for the
        ``"already_running"`` case the summary also carries ``"started_at"``,
        per-agent results under ``"agents"`` and a list of ``"errors"``.
    """
    if self._running:
        return {"status": "already_running"}

    result: Dict[str, Any] = {
        "started_at": datetime.now().isoformat(),
        "agents": {},
        "errors": []
    }

    try:
        # Validate dependency graph
        errors = self.dependency_graph.validate()
        if errors:
            # Every other exit path reports a status; without this one,
            # callers reading result["status"] would hit a KeyError.
            result["status"] = "failed"
            result["errors"] = errors
            return result

        # Get startup order
        startup_groups = self.dependency_graph.get_parallel_startup_groups()

        # Start coordinator
        await self.coordinator.start()

        # Start health monitor
        await self.health_monitor.start()

        # Start agents in dependency order
        for group in startup_groups:
            # Agents inside one group are started concurrently; exceptions
            # are collected per agent instead of aborting the whole group.
            group_results = await asyncio.gather(
                *(self._start_agent(agent_id) for agent_id in group),
                return_exceptions=True
            )

            for agent_id, res in zip(group, group_results):
                if isinstance(res, Exception):
                    result["agents"][agent_id] = {
                        "status": "failed",
                        "error": str(res)
                    }
                    result["errors"].append(f"{agent_id}: {res}")
                else:
                    result["agents"][agent_id] = res

        # Start scaler
        await self.scaler.start()

        self._running = True
        result["status"] = "started"

    except Exception as e:
        # NOTE(review): components started before the failure are not
        # stopped here, and _running stays False -- confirm whether a
        # rollback is needed.
        result["status"] = "failed"
        result["errors"].append(str(e))
        logger.error(f"Failed to start control plane: {e}")

    return result
2924
+
2925
+ async def _start_agent(self, agent_id: str) -> Dict[str, Any]:
2926
+ """Start a single agent"""
2927
+ registration = self._registrations.get(agent_id)
2928
+ if not registration:
2929
+ raise ValueError(f"Agent {agent_id} not registered")
2930
+
2931
+ result = {
2932
+ "agent_id": agent_id,
2933
+ "status": "starting",
2934
+ "replicas": []
2935
+ }
2936
+
2937
+ # Check resource quota
2938
+ if registration.resources:
2939
+ self.quota_manager.set_quota(agent_id, registration.resources)
2940
+
2941
+ # Create factory for recovery manager
2942
+ def create_agent():
2943
+ return registration.agent_type()
2944
+
2945
+ # Register with recovery manager
2946
+ self.recovery_manager.register_agent(agent_id, create_agent)
2947
+
2948
+ # Create replicas
2949
+ for i in range(registration.replicas):
2950
+ replica_id = f"{agent_id}-{i}"
2951
+ try:
2952
+ instance = create_agent()
2953
+
2954
+ # Start instance if it has start method
2955
+ if hasattr(instance, 'start'):
2956
+ if asyncio.iscoroutinefunction(instance.start):
2957
+ await instance.start()
2958
+ else:
2959
+ instance.start()
2960
+
2961
+ self._instances[agent_id].append(instance)
2962
+
2963
+ # Register with health monitor
2964
+ self.health_monitor.register_agent(replica_id, instance)
2965
+
2966
+ result["replicas"].append({
2967
+ "replica_id": replica_id,
2968
+ "status": "running"
2969
+ })
2970
+
2971
+ self.observability.log(agent_id, "info", f"Started replica {replica_id}")
2972
+
2973
+ except Exception as e:
2974
+ result["replicas"].append({
2975
+ "replica_id": replica_id,
2976
+ "status": "failed",
2977
+ "error": str(e)
2978
+ })
2979
+ self.observability.log(agent_id, "error", f"Failed to start replica {replica_id}: {e}")
2980
+
2981
+ result["status"] = "running"
2982
+ return result
2983
+
2984
async def stop_all(self) -> Dict[str, Any]:
    """
    Shut the control plane down and stop every tracked agent instance.

    The shutdown manager runs first, then the scaler, health monitor and
    coordinator are stopped, and finally the agent instances are stopped in
    the order supplied by the dependency graph. An error raised by a single
    instance is logged and does not interrupt the remaining instances.

    Returns:
        ``{"status": "not_running"}`` when the control plane is not running,
        otherwise ``{"status": "stopped", "shutdown_result": ...}`` carrying
        whatever the shutdown manager returned
    """
    if not self._running:
        return {"status": "not_running"}

    # Graceful shutdown comes before anything is torn down
    shutdown_result = await self.shutdown_manager.shutdown()

    # Control plane components
    await self.scaler.stop()
    await self.health_monitor.stop()
    await self.coordinator.stop()

    # Agent instances, following the dependency graph's shutdown order
    for agent_id in self.dependency_graph.get_shutdown_order():
        for instance in self._instances.get(agent_id, []):
            try:
                if not hasattr(instance, 'stop'):
                    continue
                if asyncio.iscoroutinefunction(instance.stop):
                    await instance.stop()
                else:
                    instance.stop()
            except Exception as e:
                logger.error(f"Error stopping {agent_id}: {e}")

        # Drop the references whether or not every instance stopped cleanly
        self._instances[agent_id].clear()

    self._running = False

    summary = {
        "status": "stopped",
        "shutdown_result": shutdown_result
    }
    return summary
3024
+
3025
def get_agent(self, agent_id: str, replica_index: int = 0) -> Optional[Any]:
    """Return the tracked instance at ``replica_index`` for ``agent_id``.

    ``None`` is returned when the agent has no tracked instances or when the
    index is negative or past the end of the instance list.
    """
    replicas = self._instances.get(agent_id, [])
    if replica_index < 0 or replica_index >= len(replicas):
        return None
    return replicas[replica_index]
3031
+
3032
async def get_available_agent(self, agent_id: str) -> Optional[Any]:
    """Return a replica of ``agent_id`` from the scaler, if one may be used.

    ``None`` is returned without consulting the scaler when the agent's
    circuit breaker is open or when the quota manager refuses execution.
    """
    breaker = self.circuit_breaker_registry.get(agent_id)
    tripped = breaker and breaker.is_open

    # The quota is only consulted when the breaker does not already block.
    if tripped or not self.quota_manager.can_execute(agent_id):
        return None

    replica = await self.scaler.get_replica(agent_id)
    return replica
3045
+
3046
def get_health_status(self, agent_id: str) -> HealthStatus:
    """Return what the health monitor reports for ``agent_id``."""
    monitor = self.health_monitor
    return monitor.get_agent_health(agent_id)
3049
+
3050
def get_all_health_status(self) -> Dict[str, HealthStatus]:
    """Return the health monitor's status mapping for all agents."""
    monitor = self.health_monitor
    return monitor.get_all_health_status()
3053
+
3054
def get_circuit_breaker(self, agent_id: str) -> Optional[CircuitBreaker]:
    """Look up the circuit breaker for ``agent_id`` in the breaker registry."""
    registry = self.circuit_breaker_registry
    return registry.get(agent_id)
3057
+
3058
def get_metrics(self) -> str:
    """Return the observability provider's Prometheus export."""
    exporter = self.observability
    return exporter.export_prometheus()
3061
+
3062
def get_status(self) -> Dict[str, Any]:
    """Assemble a snapshot of the control plane's current state.

    The snapshot holds the running flag, the node ID, the coordinator's
    leader flag, the registered agent IDs, the ``.value`` of each health
    status, the attribute dict of each circuit breaker's metrics, the quota
    violations and the number of in-flight operations.
    """
    status: Dict[str, Any] = {}

    status["running"] = self._running
    status["node_id"] = self.node_id
    status["is_leader"] = self.coordinator.is_leader
    status["registered_agents"] = list(self._registrations.keys())

    health_by_agent = self.health_monitor.get_all_health_status()
    status["health_status"] = {
        agent: state.value for agent, state in health_by_agent.items()
    }

    breakers = self.circuit_breaker_registry._breakers
    status["circuit_breakers"] = {
        name: cb.get_metrics().__dict__ for name, cb in breakers.items()
    }

    status["resource_violations"] = self.quota_manager.check_quota_violations()
    status["in_flight_operations"] = self.shutdown_manager.get_in_flight_count()

    return status
3079
+
3080
+
3081
+ # Convenience factory function
3082
def create_control_plane(
    health_check_interval: float = 30.0,
    auto_recovery: bool = True,
    circuit_breaker: Optional[CircuitBreaker] = None,
    **kwargs
) -> EnhancedAgentControlPlane:
    """
    Build an EnhancedAgentControlPlane from the given settings.

    This is the recommended way to create a control plane instance. All
    arguments are forwarded to the EnhancedAgentControlPlane constructor
    unchanged.

    Args:
        health_check_interval: Interval between health checks
        auto_recovery: Enable automatic recovery
        circuit_breaker: Default circuit breaker
        **kwargs: Additional constructor keyword arguments

    Returns:
        The newly constructed EnhancedAgentControlPlane instance
    """
    control_plane = EnhancedAgentControlPlane(
        health_check_interval=health_check_interval,
        auto_recovery=auto_recovery,
        circuit_breaker=circuit_breaker,
        **kwargs
    )
    return control_plane
3108
+
3109
+
3110
# Backwards compatibility alias: AgentControlPlaneV2 is bound to the same class
# object as EnhancedAgentControlPlane.
AgentControlPlaneV2 = EnhancedAgentControlPlane