agent_os_kernel 3.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (337) hide show
  1. agent_control_plane/__init__.py +662 -0
  2. agent_control_plane/a2a_adapter.py +543 -0
  3. agent_control_plane/adapter.py +417 -0
  4. agent_control_plane/agent_hibernation.py +394 -0
  5. agent_control_plane/agent_kernel.py +470 -0
  6. agent_control_plane/compliance.py +720 -0
  7. agent_control_plane/constraint_graphs.py +478 -0
  8. agent_control_plane/control_plane.py +854 -0
  9. agent_control_plane/example_executors.py +195 -0
  10. agent_control_plane/execution_engine.py +231 -0
  11. agent_control_plane/flight_recorder.py +846 -0
  12. agent_control_plane/governance_layer.py +435 -0
  13. agent_control_plane/hf_utils.py +563 -0
  14. agent_control_plane/interfaces/__init__.py +55 -0
  15. agent_control_plane/interfaces/kernel_interface.py +361 -0
  16. agent_control_plane/interfaces/plugin_interface.py +497 -0
  17. agent_control_plane/interfaces/protocol_interfaces.py +387 -0
  18. agent_control_plane/kernel_space.py +1009 -0
  19. agent_control_plane/langchain_adapter.py +424 -0
  20. agent_control_plane/lifecycle.py +3113 -0
  21. agent_control_plane/mcp_adapter.py +653 -0
  22. agent_control_plane/ml_safety.py +563 -0
  23. agent_control_plane/multimodal.py +727 -0
  24. agent_control_plane/mute_agent.py +422 -0
  25. agent_control_plane/observability.py +787 -0
  26. agent_control_plane/orchestrator.py +482 -0
  27. agent_control_plane/plugin_registry.py +750 -0
  28. agent_control_plane/policy_engine.py +954 -0
  29. agent_control_plane/process_isolation.py +777 -0
  30. agent_control_plane/shadow_mode.py +310 -0
  31. agent_control_plane/signals.py +493 -0
  32. agent_control_plane/supervisor_agents.py +430 -0
  33. agent_control_plane/time_travel_debugger.py +557 -0
  34. agent_control_plane/tool_registry.py +452 -0
  35. agent_control_plane/vfs.py +697 -0
  36. agent_kernel/__init__.py +69 -0
  37. agent_kernel/analyzer.py +435 -0
  38. agent_kernel/auditor.py +36 -0
  39. agent_kernel/completeness_auditor.py +237 -0
  40. agent_kernel/detector.py +203 -0
  41. agent_kernel/kernel.py +744 -0
  42. agent_kernel/memory_manager.py +85 -0
  43. agent_kernel/models.py +374 -0
  44. agent_kernel/nudge_mechanism.py +263 -0
  45. agent_kernel/outcome_analyzer.py +338 -0
  46. agent_kernel/patcher.py +582 -0
  47. agent_kernel/semantic_analyzer.py +316 -0
  48. agent_kernel/semantic_purge.py +349 -0
  49. agent_kernel/simulator.py +449 -0
  50. agent_kernel/teacher.py +85 -0
  51. agent_kernel/triage.py +152 -0
  52. agent_os/__init__.py +409 -0
  53. agent_os/_adversarial_impl.py +200 -0
  54. agent_os/_circuit_breaker_impl.py +232 -0
  55. agent_os/_mcp_metrics.py +193 -0
  56. agent_os/adversarial.py +20 -0
  57. agent_os/agents_compat.py +490 -0
  58. agent_os/audit_logger.py +135 -0
  59. agent_os/base_agent.py +651 -0
  60. agent_os/circuit_breaker.py +34 -0
  61. agent_os/cli/__init__.py +659 -0
  62. agent_os/cli/cmd_audit.py +128 -0
  63. agent_os/cli/cmd_init.py +152 -0
  64. agent_os/cli/cmd_policy.py +41 -0
  65. agent_os/cli/cmd_policy_gen.py +180 -0
  66. agent_os/cli/cmd_validate.py +258 -0
  67. agent_os/cli/mcp_scan.py +265 -0
  68. agent_os/cli/output.py +192 -0
  69. agent_os/cli/policy_checker.py +330 -0
  70. agent_os/compat.py +74 -0
  71. agent_os/constraint_graph.py +234 -0
  72. agent_os/content_governance.py +140 -0
  73. agent_os/context_budget.py +305 -0
  74. agent_os/credential_redactor.py +224 -0
  75. agent_os/diff_policy.py +89 -0
  76. agent_os/egress_policy.py +159 -0
  77. agent_os/escalation.py +276 -0
  78. agent_os/event_bus.py +124 -0
  79. agent_os/exceptions.py +180 -0
  80. agent_os/execution_context_policy.py +141 -0
  81. agent_os/github_enterprise.py +96 -0
  82. agent_os/health.py +20 -0
  83. agent_os/integrations/__init__.py +279 -0
  84. agent_os/integrations/a2a_adapter.py +279 -0
  85. agent_os/integrations/agent_lightning/__init__.py +30 -0
  86. agent_os/integrations/anthropic_adapter.py +420 -0
  87. agent_os/integrations/autogen_adapter.py +620 -0
  88. agent_os/integrations/base.py +1137 -0
  89. agent_os/integrations/compat.py +229 -0
  90. agent_os/integrations/config.py +98 -0
  91. agent_os/integrations/conversation_guardian.py +957 -0
  92. agent_os/integrations/crewai_adapter.py +467 -0
  93. agent_os/integrations/drift_detector.py +425 -0
  94. agent_os/integrations/dry_run.py +124 -0
  95. agent_os/integrations/escalation.py +582 -0
  96. agent_os/integrations/gemini_adapter.py +364 -0
  97. agent_os/integrations/google_adk_adapter.py +633 -0
  98. agent_os/integrations/guardrails_adapter.py +394 -0
  99. agent_os/integrations/health.py +197 -0
  100. agent_os/integrations/langchain_adapter.py +654 -0
  101. agent_os/integrations/llamafirewall.py +343 -0
  102. agent_os/integrations/llamaindex_adapter.py +188 -0
  103. agent_os/integrations/logging.py +191 -0
  104. agent_os/integrations/maf_adapter.py +631 -0
  105. agent_os/integrations/mistral_adapter.py +365 -0
  106. agent_os/integrations/openai_adapter.py +816 -0
  107. agent_os/integrations/openai_agents_sdk.py +406 -0
  108. agent_os/integrations/policy_compose.py +171 -0
  109. agent_os/integrations/profiling.py +144 -0
  110. agent_os/integrations/pydantic_ai_adapter.py +420 -0
  111. agent_os/integrations/rate_limiter.py +130 -0
  112. agent_os/integrations/rbac.py +143 -0
  113. agent_os/integrations/registry.py +113 -0
  114. agent_os/integrations/scope_guard.py +303 -0
  115. agent_os/integrations/semantic_kernel_adapter.py +769 -0
  116. agent_os/integrations/smolagents_adapter.py +629 -0
  117. agent_os/integrations/templates.py +178 -0
  118. agent_os/integrations/token_budget.py +134 -0
  119. agent_os/integrations/tool_aliases.py +190 -0
  120. agent_os/integrations/webhooks.py +177 -0
  121. agent_os/lite.py +208 -0
  122. agent_os/mcp_gateway.py +385 -0
  123. agent_os/mcp_message_signer.py +273 -0
  124. agent_os/mcp_protocols.py +161 -0
  125. agent_os/mcp_response_scanner.py +232 -0
  126. agent_os/mcp_security.py +924 -0
  127. agent_os/mcp_session_auth.py +231 -0
  128. agent_os/mcp_sliding_rate_limiter.py +184 -0
  129. agent_os/memory_guard.py +409 -0
  130. agent_os/metrics.py +134 -0
  131. agent_os/mute.py +428 -0
  132. agent_os/mute_agent.py +209 -0
  133. agent_os/policies/__init__.py +77 -0
  134. agent_os/policies/async_evaluator.py +275 -0
  135. agent_os/policies/backends.py +670 -0
  136. agent_os/policies/bridge.py +169 -0
  137. agent_os/policies/budget.py +85 -0
  138. agent_os/policies/cli.py +294 -0
  139. agent_os/policies/conflict_resolution.py +270 -0
  140. agent_os/policies/data_classification.py +252 -0
  141. agent_os/policies/evaluator.py +239 -0
  142. agent_os/policies/policy_schema.json +228 -0
  143. agent_os/policies/rate_limiting.py +145 -0
  144. agent_os/policies/schema.py +115 -0
  145. agent_os/policies/shared.py +331 -0
  146. agent_os/prompt_injection.py +694 -0
  147. agent_os/providers.py +182 -0
  148. agent_os/py.typed +0 -0
  149. agent_os/retry.py +81 -0
  150. agent_os/reversibility.py +251 -0
  151. agent_os/sandbox.py +432 -0
  152. agent_os/sandbox_provider.py +140 -0
  153. agent_os/secure_codegen.py +525 -0
  154. agent_os/security_skills.py +538 -0
  155. agent_os/semantic_policy.py +422 -0
  156. agent_os/server/__init__.py +15 -0
  157. agent_os/server/__main__.py +25 -0
  158. agent_os/server/app.py +277 -0
  159. agent_os/server/models.py +104 -0
  160. agent_os/shift_left_metrics.py +130 -0
  161. agent_os/stateless.py +742 -0
  162. agent_os/supervisor.py +148 -0
  163. agent_os/task_outcome.py +148 -0
  164. agent_os/transparency.py +181 -0
  165. agent_os/trust_root.py +128 -0
  166. agent_os_kernel-3.1.0.dist-info/METADATA +1269 -0
  167. agent_os_kernel-3.1.0.dist-info/RECORD +337 -0
  168. agent_os_kernel-3.1.0.dist-info/WHEEL +4 -0
  169. agent_os_kernel-3.1.0.dist-info/entry_points.txt +2 -0
  170. agent_os_kernel-3.1.0.dist-info/licenses/LICENSE +21 -0
  171. agent_os_observability/__init__.py +27 -0
  172. agent_os_observability/dashboards.py +898 -0
  173. agent_os_observability/metrics.py +398 -0
  174. agent_os_observability/server.py +223 -0
  175. agent_os_observability/tracer.py +232 -0
  176. agent_primitives/__init__.py +24 -0
  177. agent_primitives/failures.py +84 -0
  178. agent_primitives/py.typed +0 -0
  179. amb_core/__init__.py +177 -0
  180. amb_core/adapters/__init__.py +57 -0
  181. amb_core/adapters/aws_sqs_broker.py +376 -0
  182. amb_core/adapters/azure_servicebus_broker.py +340 -0
  183. amb_core/adapters/kafka_broker.py +260 -0
  184. amb_core/adapters/nats_broker.py +285 -0
  185. amb_core/adapters/rabbitmq_broker.py +235 -0
  186. amb_core/adapters/redis_broker.py +262 -0
  187. amb_core/broker.py +145 -0
  188. amb_core/bus.py +481 -0
  189. amb_core/cloudevents.py +509 -0
  190. amb_core/dlq.py +345 -0
  191. amb_core/hf_utils.py +536 -0
  192. amb_core/memory_broker.py +410 -0
  193. amb_core/models.py +141 -0
  194. amb_core/persistence.py +529 -0
  195. amb_core/schema.py +294 -0
  196. amb_core/tracing.py +358 -0
  197. atr/__init__.py +640 -0
  198. atr/access.py +348 -0
  199. atr/composition.py +645 -0
  200. atr/decorator.py +357 -0
  201. atr/executor.py +384 -0
  202. atr/health.py +557 -0
  203. atr/hf_utils.py +449 -0
  204. atr/injection.py +422 -0
  205. atr/metrics.py +440 -0
  206. atr/policies.py +403 -0
  207. atr/py.typed +2 -0
  208. atr/registry.py +452 -0
  209. atr/schema.py +480 -0
  210. atr/tools/safe/__init__.py +75 -0
  211. atr/tools/safe/calculator.py +467 -0
  212. atr/tools/safe/datetime_tool.py +443 -0
  213. atr/tools/safe/file_reader.py +402 -0
  214. atr/tools/safe/http_client.py +316 -0
  215. atr/tools/safe/json_parser.py +374 -0
  216. atr/tools/safe/text_tool.py +537 -0
  217. atr/tools/safe/toolkit.py +175 -0
  218. caas/__init__.py +162 -0
  219. caas/api/__init__.py +7 -0
  220. caas/api/server.py +1328 -0
  221. caas/caching.py +834 -0
  222. caas/cli.py +210 -0
  223. caas/conversation.py +223 -0
  224. caas/decay.py +72 -0
  225. caas/detection/__init__.py +9 -0
  226. caas/detection/detector.py +238 -0
  227. caas/enrichment.py +130 -0
  228. caas/gateway/__init__.py +27 -0
  229. caas/gateway/trust_gateway.py +474 -0
  230. caas/hf_utils.py +479 -0
  231. caas/ingestion/__init__.py +23 -0
  232. caas/ingestion/processors.py +253 -0
  233. caas/ingestion/structure_parser.py +188 -0
  234. caas/models.py +356 -0
  235. caas/pragmatic_truth.py +444 -0
  236. caas/routing/__init__.py +10 -0
  237. caas/routing/heuristic_router.py +58 -0
  238. caas/storage/__init__.py +9 -0
  239. caas/storage/store.py +389 -0
  240. caas/triad.py +213 -0
  241. caas/tuning/__init__.py +9 -0
  242. caas/tuning/tuner.py +329 -0
  243. caas/vfs/__init__.py +14 -0
  244. caas/vfs/filesystem.py +452 -0
  245. cmvk/__init__.py +218 -0
  246. cmvk/audit.py +402 -0
  247. cmvk/benchmarks.py +478 -0
  248. cmvk/constitutional.py +904 -0
  249. cmvk/hf_utils.py +301 -0
  250. cmvk/metrics.py +473 -0
  251. cmvk/profiles.py +300 -0
  252. cmvk/py.typed +0 -0
  253. cmvk/types.py +12 -0
  254. cmvk/verification.py +956 -0
  255. emk/__init__.py +89 -0
  256. emk/causal.py +352 -0
  257. emk/hf_utils.py +421 -0
  258. emk/indexer.py +83 -0
  259. emk/py.typed +0 -0
  260. emk/schema.py +204 -0
  261. emk/sleep_cycle.py +347 -0
  262. emk/store.py +281 -0
  263. iatp/__init__.py +166 -0
  264. iatp/attestation.py +461 -0
  265. iatp/cli.py +317 -0
  266. iatp/hf_utils.py +472 -0
  267. iatp/ipc_pipes.py +580 -0
  268. iatp/main.py +412 -0
  269. iatp/models/__init__.py +447 -0
  270. iatp/policy_engine.py +337 -0
  271. iatp/py.typed +2 -0
  272. iatp/recovery.py +321 -0
  273. iatp/security/__init__.py +270 -0
  274. iatp/sidecar/__init__.py +519 -0
  275. iatp/telemetry/__init__.py +164 -0
  276. iatp/tests/__init__.py +1 -0
  277. iatp/tests/test_attestation.py +370 -0
  278. iatp/tests/test_cli.py +131 -0
  279. iatp/tests/test_ed25519_attestation.py +211 -0
  280. iatp/tests/test_models.py +130 -0
  281. iatp/tests/test_policy_engine.py +347 -0
  282. iatp/tests/test_recovery.py +281 -0
  283. iatp/tests/test_security.py +222 -0
  284. iatp/tests/test_sidecar.py +167 -0
  285. iatp/tests/test_telemetry.py +175 -0
  286. mcp_kernel_server/__init__.py +28 -0
  287. mcp_kernel_server/cli.py +274 -0
  288. mcp_kernel_server/resources.py +217 -0
  289. mcp_kernel_server/server.py +564 -0
  290. mcp_kernel_server/tools.py +1174 -0
  291. mute_agent/__init__.py +68 -0
  292. mute_agent/core/__init__.py +1 -0
  293. mute_agent/core/execution_agent.py +166 -0
  294. mute_agent/core/handshake_protocol.py +201 -0
  295. mute_agent/core/reasoning_agent.py +238 -0
  296. mute_agent/knowledge_graph/__init__.py +1 -0
  297. mute_agent/knowledge_graph/graph_elements.py +65 -0
  298. mute_agent/knowledge_graph/multidimensional_graph.py +170 -0
  299. mute_agent/knowledge_graph/subgraph.py +224 -0
  300. mute_agent/listener/__init__.py +43 -0
  301. mute_agent/listener/adapters/__init__.py +31 -0
  302. mute_agent/listener/adapters/base_adapter.py +189 -0
  303. mute_agent/listener/adapters/caas_adapter.py +344 -0
  304. mute_agent/listener/adapters/control_plane_adapter.py +436 -0
  305. mute_agent/listener/adapters/iatp_adapter.py +332 -0
  306. mute_agent/listener/adapters/scak_adapter.py +251 -0
  307. mute_agent/listener/listener.py +610 -0
  308. mute_agent/listener/state_observer.py +436 -0
  309. mute_agent/listener/threshold_config.py +313 -0
  310. mute_agent/super_system/__init__.py +1 -0
  311. mute_agent/super_system/router.py +204 -0
  312. mute_agent/visualization/__init__.py +10 -0
  313. mute_agent/visualization/graph_debugger.py +502 -0
  314. nexus/README.md +60 -0
  315. nexus/__init__.py +51 -0
  316. nexus/arbiter.py +359 -0
  317. nexus/client.py +466 -0
  318. nexus/dmz.py +444 -0
  319. nexus/escrow.py +430 -0
  320. nexus/exceptions.py +286 -0
  321. nexus/pyproject.toml +36 -0
  322. nexus/registry.py +393 -0
  323. nexus/reputation.py +425 -0
  324. nexus/schemas/__init__.py +51 -0
  325. nexus/schemas/compliance.py +276 -0
  326. nexus/schemas/escrow.py +251 -0
  327. nexus/schemas/manifest.py +225 -0
  328. nexus/schemas/receipt.py +208 -0
  329. nexus/tests/__init__.py +0 -0
  330. nexus/tests/conftest.py +146 -0
  331. nexus/tests/test_arbiter.py +192 -0
  332. nexus/tests/test_dmz.py +194 -0
  333. nexus/tests/test_escrow.py +276 -0
  334. nexus/tests/test_exceptions.py +225 -0
  335. nexus/tests/test_registry.py +232 -0
  336. nexus/tests/test_reputation.py +328 -0
  337. nexus/tests/test_schemas.py +295 -0
@@ -0,0 +1,3113 @@
1
+ # Copyright (c) Microsoft Corporation.
2
+ # Licensed under the MIT License.
3
+ """
4
+ Agent Lifecycle Management - v0.2.0
5
+
6
+ This module provides comprehensive lifecycle management for autonomous AI agents,
7
+ including health monitoring, auto-recovery, circuit breakers, scaling, distributed
8
+ coordination, dependency management, graceful shutdown, resource quotas, observability,
9
+ and hot reload capabilities.
10
+
11
+ Features:
12
+ - ACP-001: Agent Health Checks (liveness/readiness probes)
13
+ - ACP-002: Agent Auto-Recovery (automatic restart of crashed agents)
14
+ - ACP-003: Circuit Breaker (prevent cascading failures)
15
+ - ACP-004: Agent Scaling (horizontal scaling for high-throughput)
16
+ - ACP-005: Distributed Coordination (leader election, consensus)
17
+ - ACP-006: Agent Dependency Graph (enforced start order)
18
+ - ACP-007: Graceful Shutdown (preserve in-flight verifications)
19
+ - ACP-008: Resource Quotas (memory/CPU limits per agent)
20
+ - ACP-009: Agent Observability (metrics/logging integration)
21
+ - ACP-010: Hot Reload (code changes without full restart)
22
+
23
+ Research Foundations:
24
+ - Circuit Breaker pattern (Michael Nygard, "Release It!")
25
+ - Kubernetes probe patterns (liveness, readiness, startup)
26
+ - Raft consensus algorithm (Ongaro & Ousterhout, 2014)
27
+ - Actor model supervision (Erlang/OTP, Akka)
28
+ """
29
+
30
+ from typing import (
31
+ Dict, List, Optional, Any, Union, Callable, Type, Set, Awaitable,
32
+ TypeVar, Generic, Protocol, runtime_checkable
33
+ )
34
+ from dataclasses import dataclass, field
35
+ from enum import Enum, auto
36
+ from datetime import datetime, timedelta
37
+ from collections import defaultdict, deque
38
+ from abc import ABC, abstractmethod
39
+ import asyncio
40
+ import time
41
+ import uuid
42
+ import logging
43
+ import threading
44
+ import weakref
45
+ import traceback
46
+ import hashlib
47
+ import importlib
48
+ import sys
49
+
50
+
51
+ # Configure module logger
52
+ logger = logging.getLogger(__name__)
53
+
54
+
55
+ # ============================================================================
56
+ # Enums and Constants
57
+ # ============================================================================
58
+
59
class HealthStatus(Enum):
    """Health status of an agent.

    Each member's value is the lowercase string form of its name, so a
    status can be rebuilt from its serialized form with ``HealthStatus(value)``.
    """

    # No verdict has been recorded yet.
    UNKNOWN = "unknown"

    # Probe outcomes.
    HEALTHY = "healthy"
    UNHEALTHY = "unhealthy"
    DEGRADED = "degraded"

    # Lifecycle transitions.
    STARTING = "starting"
    STOPPING = "stopping"
    STOPPED = "stopped"

    # Terminal failure.
    FAILED = "failed"
69
+
70
+
71
class AgentState(Enum):
    """State of an agent in the lifecycle.

    Members are declared in the order below; every value is the lowercase
    string form of the member name.
    """

    # Before the agent is running.
    REGISTERED = "registered"
    PENDING = "pending"
    STARTING = "starting"

    # Active.
    RUNNING = "running"

    # Winding down.
    STOPPING = "stopping"
    STOPPED = "stopped"

    # Failure handling.
    FAILED = "failed"
    RECOVERING = "recovering"
81
+
82
+
83
class CircuitState(Enum):
    """State of a circuit breaker."""

    # Normal operation.
    CLOSED = "closed"

    # Failing; requests are rejected.
    OPEN = "open"

    # Testing whether the protected dependency has recovered.
    HALF_OPEN = "half_open"
88
+
89
+
90
class CoordinationRole(Enum):
    """Role a participant holds in distributed coordination.

    Values are the lowercase string form of each member name.
    """

    LEADER = "leader"
    FOLLOWER = "follower"
    CANDIDATE = "candidate"
95
+
96
+
97
class ShutdownPhase(Enum):
    """Phases of graceful shutdown, declared from first to last."""

    RUNNING = "running"
    DRAINING = "draining"
    STOPPING = "stopping"
    TERMINATED = "terminated"
103
+
104
+
105
+ # ============================================================================
106
+ # ACP-001: Agent Health Checks
107
+ # ============================================================================
108
+
109
@dataclass
class HealthCheckResult:
    """Outcome of a single health check probe.

    ``healthy`` and ``status`` are required; the remaining fields default to
    an empty message, zero latency, the creation time and an empty details
    mapping.
    """

    # Boolean verdict of the probe.
    healthy: bool
    # Status classification that accompanies the verdict.
    status: HealthStatus
    # Optional human-readable explanation (e.g. an error or timeout message).
    message: str = ""
    # How long the probe took, in milliseconds.
    latency_ms: float = 0.0
    # When this result object was created (naive local time).
    timestamp: datetime = field(default_factory=datetime.now)
    # Free-form extra data; each instance gets its own dict.
    details: Dict[str, Any] = field(default_factory=dict)
118
+
119
+
120
@dataclass
class HealthCheckConfig:
    """Tunable settings for liveness, readiness and startup probes.

    All intervals, periods and timeouts are expressed in seconds; thresholds
    are counts of failed probes.
    """

    # --- Liveness probe -----------------------------------------------------
    liveness_interval_seconds: float = 10.0
    liveness_timeout_seconds: float = 5.0
    liveness_failure_threshold: int = 3

    # --- Readiness probe ----------------------------------------------------
    readiness_interval_seconds: float = 5.0
    readiness_timeout_seconds: float = 3.0
    readiness_failure_threshold: int = 1

    # --- Startup probe (for slow-starting agents) ---------------------------
    startup_probe_enabled: bool = True
    startup_timeout_seconds: float = 60.0
    startup_period_seconds: float = 5.0

    # --- Optional user-supplied async health check --------------------------
    custom_health_check: Optional[Callable[[], Awaitable[bool]]] = None
140
+
141
+
142
@runtime_checkable
class HealthCheckable(Protocol):
    """Structural interface for agents that provide their own health probes.

    Because the protocol is runtime-checkable, ``isinstance(obj,
    HealthCheckable)`` succeeds for any object that has both methods below;
    only their presence is checked, not their signatures.
    """

    async def liveness_check(self) -> bool:
        """Return True when the agent is alive (not deadlocked or crashed)."""
        ...

    async def readiness_check(self) -> bool:
        """Return True when the agent is ready to accept requests."""
        ...
153
+
154
+
155
class HealthMonitor:
    """
    Monitors agent health via liveness and readiness probes.

    Implements Kubernetes-style health checking patterns:
    - Liveness: Is the agent alive? If not, restart it.
    - Readiness: Is the agent ready to accept requests?
    - Startup: Has the agent finished starting up?

    Status transitions driven by this class:
    - UNKNOWN -> HEALTHY on the first successful liveness probe.
    - any -> FAILED once consecutive liveness failures reach
      ``config.liveness_failure_threshold`` (fires ``"liveness_failed"``).
    - FAILED -> HEALTHY on the next successful liveness probe
      (fires ``"liveness_restored"``).
    - HEALTHY -> DEGRADED once consecutive readiness failures reach
      ``config.readiness_failure_threshold`` (fires ``"readiness_failed"``).
    - DEGRADED -> HEALTHY on the next successful readiness probe
      (fires ``"readiness_restored"``).

    Usage:
        monitor = HealthMonitor(config=HealthCheckConfig())

        # Register an agent
        monitor.register_agent(agent_id, agent_instance)

        # Start monitoring
        await monitor.start()

        # Check status
        status = monitor.get_agent_health(agent_id)
    """

    def __init__(self, config: Optional[HealthCheckConfig] = None):
        """Create a monitor.

        Args:
            config: Probe intervals, timeouts and thresholds. A default
                ``HealthCheckConfig`` is used when omitted.
        """
        self.config = config or HealthCheckConfig()
        self._agents: Dict[str, Any] = {}
        self._health_status: Dict[str, HealthStatus] = {}
        self._liveness_failures: Dict[str, int] = defaultdict(int)
        self._readiness_failures: Dict[str, int] = defaultdict(int)
        self._last_check: Dict[str, datetime] = {}
        # Bounded per-agent history of liveness results (latest 100).
        self._check_history: Dict[str, deque] = defaultdict(lambda: deque(maxlen=100))
        self._running = False
        self._tasks: List[asyncio.Task] = []
        self._callbacks: Dict[str, List[Callable]] = defaultdict(list)
        self._lock = asyncio.Lock()

    def register_agent(
        self,
        agent_id: str,
        agent: Any,
        custom_liveness: Optional[Callable[[], Awaitable[bool]]] = None,
        custom_readiness: Optional[Callable[[], Awaitable[bool]]] = None
    ) -> None:
        """Register an agent for health monitoring.

        Registering an id that is already known replaces the previous entry
        and resets its status to UNKNOWN and its failure counters to zero, so
        a replacement instance does not inherit the failures of the old one.

        Args:
            agent_id: Identifier used for all later lookups.
            agent: The agent object to probe.
            custom_liveness: Optional probe that takes precedence over the
                agent's own liveness method.
            custom_readiness: Optional probe that takes precedence over the
                agent's own readiness method.
        """
        self._agents[agent_id] = {
            "agent": agent,
            "custom_liveness": custom_liveness,
            "custom_readiness": custom_readiness,
            "registered_at": datetime.now()
        }
        self._health_status[agent_id] = HealthStatus.UNKNOWN
        self._liveness_failures.pop(agent_id, None)
        self._readiness_failures.pop(agent_id, None)
        logger.info(f"Registered agent {agent_id} for health monitoring")

    def unregister_agent(self, agent_id: str) -> None:
        """Unregister an agent and drop every piece of state kept for it.

        Unknown ids are ignored.
        """
        if agent_id not in self._agents:
            return

        del self._agents[agent_id]
        self._health_status.pop(agent_id, None)
        self._liveness_failures.pop(agent_id, None)
        self._readiness_failures.pop(agent_id, None)
        # Also release the timestamp and probe history so they do not
        # accumulate for agents that no longer exist.
        self._last_check.pop(agent_id, None)
        self._check_history.pop(agent_id, None)
        logger.info(f"Unregistered agent {agent_id} from health monitoring")

    async def start(self) -> None:
        """Start the liveness and readiness loops; no-op if already running."""
        if self._running:
            return

        self._running = True
        self._tasks.append(asyncio.create_task(self._liveness_loop()))
        self._tasks.append(asyncio.create_task(self._readiness_loop()))
        logger.info("Health monitor started")

    async def stop(self) -> None:
        """Stop both monitoring loops and wait for their tasks to finish.

        A loop task that already ended with an error is logged instead of
        aborting the shutdown.
        """
        self._running = False
        # Detach the task list first so it is cleared no matter what happens
        # while waiting below.
        tasks, self._tasks = self._tasks, []
        for task in tasks:
            task.cancel()

        if tasks:
            outcomes = await asyncio.gather(*tasks, return_exceptions=True)
            for outcome in outcomes:
                if isinstance(outcome, BaseException) and not isinstance(
                    outcome, asyncio.CancelledError
                ):
                    logger.error(f"Health monitor task ended with error: {outcome}")

        logger.info("Health monitor stopped")

    async def _liveness_loop(self) -> None:
        """Main loop for liveness checks"""
        while self._running:
            for agent_id in list(self._agents.keys()):
                try:
                    await self._process_liveness(agent_id)
                except asyncio.CancelledError:
                    # Never swallow cancellation (it is an Exception on 3.7).
                    raise
                except Exception as e:
                    logger.error(f"Liveness check failed for {agent_id}: {e}")
                    if agent_id in self._agents:
                        self._liveness_failures[agent_id] += 1

            await asyncio.sleep(self.config.liveness_interval_seconds)

    async def _readiness_loop(self) -> None:
        """Main loop for readiness checks"""
        while self._running:
            for agent_id in list(self._agents.keys()):
                try:
                    await self._process_readiness(agent_id)
                except asyncio.CancelledError:
                    # Never swallow cancellation (it is an Exception on 3.7).
                    raise
                except Exception as e:
                    logger.error(f"Readiness check failed for {agent_id}: {e}")
                    if agent_id in self._agents:
                        self._readiness_failures[agent_id] += 1

            await asyncio.sleep(self.config.readiness_interval_seconds)

    async def _process_liveness(self, agent_id: str) -> None:
        """Run one liveness probe for ``agent_id`` and apply its outcome."""
        result = await self._check_liveness(agent_id)

        # The agent may have been unregistered while the probe was awaited;
        # recording anything now would resurrect its bookkeeping entries.
        if agent_id not in self._agents:
            return

        self._check_history[agent_id].append(result)

        if not result.healthy:
            self._liveness_failures[agent_id] += 1
            if self._liveness_failures[agent_id] >= self.config.liveness_failure_threshold:
                self._health_status[agent_id] = HealthStatus.FAILED
                await self._trigger_callbacks("liveness_failed", agent_id)
            return

        self._liveness_failures[agent_id] = 0
        previous = self._health_status.get(agent_id, HealthStatus.UNKNOWN)
        if previous in (HealthStatus.FAILED, HealthStatus.UNKNOWN):
            # A passing probe promotes a never-evaluated agent as well as a
            # recovered one; otherwise an agent would stay UNKNOWN forever
            # and readiness degradation (which requires HEALTHY) could
            # never trigger.
            self._health_status[agent_id] = HealthStatus.HEALTHY
            if previous == HealthStatus.FAILED:
                await self._trigger_callbacks("liveness_restored", agent_id)

    async def _process_readiness(self, agent_id: str) -> None:
        """Run one readiness probe for ``agent_id`` and apply its outcome."""
        result = await self._check_readiness(agent_id)

        # See _process_liveness: skip agents removed during the await.
        if agent_id not in self._agents:
            return

        current = self._health_status.get(agent_id, HealthStatus.UNKNOWN)

        if not result.healthy:
            self._readiness_failures[agent_id] += 1
            threshold_hit = (
                self._readiness_failures[agent_id] >= self.config.readiness_failure_threshold
            )
            if threshold_hit and current == HealthStatus.HEALTHY:
                self._health_status[agent_id] = HealthStatus.DEGRADED
                await self._trigger_callbacks("readiness_failed", agent_id)
            return

        self._readiness_failures[agent_id] = 0
        if current == HealthStatus.DEGRADED:
            self._health_status[agent_id] = HealthStatus.HEALTHY
            await self._trigger_callbacks("readiness_restored", agent_id)

    @staticmethod
    def _resolve_probe(
        agent_info: Dict[str, Any],
        custom_key: str,
        protocol_method: str,
        fallback_attr: str
    ) -> Optional[Callable[[], Any]]:
        """Pick the probe callable for an agent, or None if it has none.

        Precedence: the custom probe stored under ``custom_key``, then the
        ``HealthCheckable`` method ``protocol_method``, then the attribute
        ``fallback_attr`` on the agent.
        """
        custom_probe = agent_info.get(custom_key)
        if custom_probe:
            return custom_probe

        agent = agent_info["agent"]
        if isinstance(agent, HealthCheckable):
            return getattr(agent, protocol_method)

        if hasattr(agent, fallback_attr):
            attr = getattr(agent, fallback_attr)
            # Tolerate a plain (non-callable) flag or property as well.
            return attr if callable(attr) else (lambda: attr)

        return None

    @staticmethod
    async def _run_probe(probe: Callable[[], Any], timeout: float) -> bool:
        """Invoke ``probe`` and return its verdict as a bool.

        An awaitable result is awaited under ``timeout`` seconds
        (``asyncio.TimeoutError`` propagates to the caller); a plain result
        is used directly.
        """
        import inspect

        outcome = probe()
        if inspect.isawaitable(outcome):
            outcome = await asyncio.wait_for(outcome, timeout=timeout)
        return bool(outcome)

    async def _check_liveness(self, agent_id: str) -> HealthCheckResult:
        """Perform liveness check for an agent.

        Returns:
            A ``HealthCheckResult``. It is unhealthy with status UNKNOWN when
            the agent is not registered, UNHEALTHY when the probe reports
            False or times out, and FAILED when the probe raises. An agent
            with no probe at all is assumed healthy.
        """
        agent_info = self._agents.get(agent_id)
        if not agent_info:
            return HealthCheckResult(
                healthy=False,
                status=HealthStatus.UNKNOWN,
                message="Agent not found"
            )

        timeout = self.config.liveness_timeout_seconds
        # perf_counter is monotonic, so the latency cannot go negative or
        # jump when the system clock is adjusted.
        started = time.perf_counter()

        try:
            probe = self._resolve_probe(
                agent_info, "custom_liveness", "liveness_check", "is_alive"
            )
            # Default: assume healthy if the agent exists but has no probe.
            healthy = True if probe is None else await self._run_probe(probe, timeout)
        except asyncio.TimeoutError:
            return HealthCheckResult(
                healthy=False,
                status=HealthStatus.UNHEALTHY,
                message="Liveness check timed out",
                latency_ms=timeout * 1000
            )
        except Exception as e:
            return HealthCheckResult(
                healthy=False,
                status=HealthStatus.FAILED,
                message=str(e),
                latency_ms=(time.perf_counter() - started) * 1000
            )

        latency_ms = (time.perf_counter() - started) * 1000
        if agent_id in self._agents:
            self._last_check[agent_id] = datetime.now()

        return HealthCheckResult(
            healthy=healthy,
            status=HealthStatus.HEALTHY if healthy else HealthStatus.UNHEALTHY,
            latency_ms=latency_ms
        )

    async def _check_readiness(self, agent_id: str) -> HealthCheckResult:
        """Perform readiness check for an agent.

        Returns:
            A ``HealthCheckResult``. It is unhealthy with status UNKNOWN when
            the agent is not registered, and DEGRADED when the probe reports
            False, times out or raises. An agent with no probe at all is
            assumed ready.
        """
        agent_info = self._agents.get(agent_id)
        if not agent_info:
            return HealthCheckResult(
                healthy=False,
                status=HealthStatus.UNKNOWN,
                message="Agent not found"
            )

        timeout = self.config.readiness_timeout_seconds
        started = time.perf_counter()

        try:
            probe = self._resolve_probe(
                agent_info, "custom_readiness", "readiness_check", "is_ready"
            )
            ready = True if probe is None else await self._run_probe(probe, timeout)
        except asyncio.TimeoutError:
            return HealthCheckResult(
                healthy=False,
                status=HealthStatus.DEGRADED,
                message="Readiness check timed out",
                latency_ms=timeout * 1000
            )
        except Exception as e:
            return HealthCheckResult(
                healthy=False,
                status=HealthStatus.DEGRADED,
                message=str(e),
                latency_ms=(time.perf_counter() - started) * 1000
            )

        return HealthCheckResult(
            healthy=ready,
            status=HealthStatus.HEALTHY if ready else HealthStatus.DEGRADED,
            latency_ms=(time.perf_counter() - started) * 1000
        )

    def on_event(self, event: str, callback: Callable[[str], Awaitable[None]]) -> None:
        """Register a callback for health events.

        Events emitted by this class: ``"liveness_failed"``,
        ``"liveness_restored"``, ``"readiness_failed"`` and
        ``"readiness_restored"``. The callback receives the agent id.
        """
        self._callbacks[event].append(callback)

    async def _trigger_callbacks(self, event: str, agent_id: str) -> None:
        """Trigger all callbacks for an event.

        Errors raised by a callback are logged and do not prevent the
        remaining callbacks from running.
        """
        import inspect

        # Iterate over a snapshot so a callback may register further
        # callbacks without affecting this dispatch.
        for callback in tuple(self._callbacks.get(event, ())):
            try:
                outcome = callback(agent_id)
                # Plain (non-async) callbacks are accepted too.
                if inspect.isawaitable(outcome):
                    await outcome
            except Exception as e:
                logger.error(f"Callback error for {event}: {e}")

    def get_agent_health(self, agent_id: str) -> HealthStatus:
        """Get the current health status of an agent (UNKNOWN if not registered)."""
        return self._health_status.get(agent_id, HealthStatus.UNKNOWN)

    def get_all_health_status(self) -> Dict[str, HealthStatus]:
        """Get a copy of the health status for all agents."""
        return dict(self._health_status)

    def get_health_history(self, agent_id: str) -> List[HealthCheckResult]:
        """Get the recorded liveness results for an agent, oldest first."""
        # .get() avoids creating a history entry for an unknown agent.
        return list(self._check_history.get(agent_id, []))
436
+
437
+
438
+ # ============================================================================
439
+ # ACP-002: Agent Auto-Recovery
440
+ # ============================================================================
441
+
442
@dataclass
class RecoveryConfig:
    """Settings that control auto-recovery behavior."""

    # Master switch for auto-recovery.
    enabled: bool = True

    # Upper bound on restart attempts.
    max_restarts: int = 5

    # Restart delay parameters: initial delay, cap, and growth multiplier.
    restart_delay_seconds: float = 1.0
    restart_delay_max_seconds: float = 60.0
    restart_delay_multiplier: float = 2.0

    # Quiet period (seconds) after which the restart count is reset.
    reset_restart_count_after_seconds: float = 300.0

    # Action once max_restarts is reached: "stop", "alert" or "continue".
    on_max_restarts: str = "stop"
452
+
453
+
454
@dataclass
class RecoveryEvent:
    """A single entry in the recovery log."""

    # Agent the event refers to.
    agent_id: str
    # One of "restart", "failure", "recovery_success", "max_restarts".
    event_type: str
    # When the event object was created (naive local time).
    timestamp: datetime = field(default_factory=datetime.now)
    # Restart attempt number associated with the event.
    attempt: int = 0
    # Error text, when there is one.
    error: Optional[str] = None
    # Free-form extra data; each instance gets its own dict.
    details: Dict[str, Any] = field(default_factory=dict)
463
+
464
+
465
class AutoRecoveryManager:
    """
    Manages automatic recovery of failed agents.

    Implements exponential backoff for restart attempts and tracks
    recovery history for analysis (the last 1000 events are kept).

    Features:
    - Automatic restart with exponential backoff
    - Maximum restart limit with configurable behavior
    - Recovery event logging
    - Callbacks for recovery events

    Usage:
        recovery = AutoRecoveryManager(config=RecoveryConfig())
        recovery.register_agent(agent_id, agent_factory)

        # When agent fails
        await recovery.handle_failure(agent_id, error)
    """

    def __init__(self, config: Optional[RecoveryConfig] = None):
        """Create a manager; a default RecoveryConfig is used when *config* is None."""
        self.config = config or RecoveryConfig()
        self._agent_factories: Dict[str, Callable[[], Any]] = {}
        self._restart_counts: Dict[str, int] = defaultdict(int)
        self._last_restart: Dict[str, datetime] = {}
        self._current_delay: Dict[str, float] = {}
        self._recovery_history: deque = deque(maxlen=1000)
        self._callbacks: Dict[str, List[Callable]] = defaultdict(list)
        self._agents: Dict[str, Any] = {}
        # A single lock for the whole manager: recoveries, including their
        # backoff sleep, run one at a time.
        self._lock = asyncio.Lock()

    def register_agent(
        self,
        agent_id: str,
        factory: Callable[[], Any],
        initial_instance: Optional[Any] = None
    ) -> None:
        """
        Register an agent with its factory function for recovery.

        Args:
            agent_id: Identifier used in all later calls.
            factory: Zero-argument callable (sync or async) that builds a
                replacement instance.
            initial_instance: Currently running instance, if any.
        """
        self._agent_factories[agent_id] = factory
        # Compare against None so an instance that happens to be falsy
        # (e.g. defines __len__ or __bool__) is still stored.
        if initial_instance is not None:
            self._agents[agent_id] = initial_instance
        self._restart_counts[agent_id] = 0
        self._current_delay[agent_id] = self.config.restart_delay_seconds
        logger.info(f"Registered agent {agent_id} for auto-recovery")

    def unregister_agent(self, agent_id: str) -> None:
        """Unregister an agent from auto-recovery and drop its counters."""
        self._agent_factories.pop(agent_id, None)
        self._agents.pop(agent_id, None)
        self._restart_counts.pop(agent_id, None)
        self._last_restart.pop(agent_id, None)
        self._current_delay.pop(agent_id, None)

    async def handle_failure(
        self,
        agent_id: str,
        error: Optional[Exception] = None
    ) -> Optional[Any]:
        """
        Handle an agent failure and attempt recovery.

        Each attempt records a "failure" event, sleeps for the current
        backoff delay, then builds and starts a new instance through the
        registered factory. A failed attempt counts as a restart and is
        retried until it succeeds or the ``on_max_restarts`` policy stops it.

        Args:
            agent_id: Agent that failed.
            error: Exception that caused the failure, if known.

        Returns:
            The new agent instance if recovery succeeds, None otherwise
            (recovery disabled, restart limit with policy "stop", or no
            factory registered).
        """
        if not self.config.enabled:
            logger.info(f"Auto-recovery disabled, not recovering {agent_id}")
            return None

        async with self._lock:
            # asyncio.Lock is not re-entrant: a failed restart is retried by
            # looping here. Calling handle_failure() again while holding the
            # lock would wait on itself forever.
            while True:
                self._reset_backoff_if_stable(agent_id)

                if self._restart_counts[agent_id] >= self.config.max_restarts:
                    if not await self._apply_restart_limit_policy(agent_id, error):
                        return None

                # Calculate delay with exponential backoff
                delay = self._current_delay.get(agent_id, self.config.restart_delay_seconds)

                # Log failure event
                failure_event = RecoveryEvent(
                    agent_id=agent_id,
                    event_type="failure",
                    attempt=self._restart_counts[agent_id],
                    error=str(error) if error else None
                )
                self._recovery_history.append(failure_event)
                await self._trigger_callbacks("failure", agent_id, failure_event)

                logger.info(f"Attempting recovery for {agent_id} after {delay:.1f}s delay")
                await asyncio.sleep(delay)

                factory = self._agent_factories.get(agent_id)
                if factory is None:
                    logger.error(f"No factory registered for {agent_id}")
                    return None

                try:
                    new_agent = await self._build_and_start(factory)
                except Exception as e:
                    logger.error(f"Failed to recover agent {agent_id}: {e}")
                    self._restart_counts[agent_id] += 1
                    # Back off after a failed attempt too, otherwise a broken
                    # factory is retried at a constant rate.
                    self._grow_delay(agent_id, delay)
                    error = e
                    continue

                self._agents[agent_id] = new_agent
                self._restart_counts[agent_id] += 1
                self._last_restart[agent_id] = datetime.now()

                # Increase delay for next potential failure (exponential backoff)
                self._grow_delay(agent_id, delay)

                success_event = RecoveryEvent(
                    agent_id=agent_id,
                    event_type="recovery_success",
                    attempt=self._restart_counts[agent_id]
                )
                self._recovery_history.append(success_event)
                await self._trigger_callbacks("recovery_success", agent_id, success_event)

                logger.info(f"Successfully recovered agent {agent_id}")
                return new_agent

    def _reset_backoff_if_stable(self, agent_id: str) -> None:
        """Reset count and delay when the last restart is old enough."""
        last = self._last_restart.get(agent_id)
        if last is None:
            return
        time_since_last = (datetime.now() - last).total_seconds()
        if time_since_last > self.config.reset_restart_count_after_seconds:
            self._restart_counts[agent_id] = 0
            self._current_delay[agent_id] = self.config.restart_delay_seconds

    async def _apply_restart_limit_policy(
        self,
        agent_id: str,
        error: Optional[Exception]
    ) -> bool:
        """Record the "max_restarts" event; return False when recovery must stop."""
        event = RecoveryEvent(
            agent_id=agent_id,
            event_type="max_restarts",
            attempt=self._restart_counts[agent_id],
            error=str(error) if error else None
        )
        self._recovery_history.append(event)
        await self._trigger_callbacks("max_restarts", agent_id, event)

        if self.config.on_max_restarts == "stop":
            logger.error(f"Max restarts reached for {agent_id}, stopping")
            return False
        if self.config.on_max_restarts == "alert":
            logger.warning(f"Max restarts reached for {agent_id}, alerting")
            await self._trigger_callbacks("alert", agent_id, event)
        # "alert" and "continue" both go on to attempt the restart anyway
        return True

    def _grow_delay(self, agent_id: str, delay: float) -> None:
        """Store the next backoff delay, capped at the configured maximum."""
        self._current_delay[agent_id] = min(
            delay * self.config.restart_delay_multiplier,
            self.config.restart_delay_max_seconds
        )

    @staticmethod
    async def _build_and_start(factory: Callable[[], Any]) -> Any:
        """Call the factory (sync or async) and start the result if it has start()."""
        new_agent = factory()
        if asyncio.iscoroutine(new_agent):
            new_agent = await new_agent

        # Start the agent if it has a start method
        if hasattr(new_agent, 'start'):
            if asyncio.iscoroutinefunction(new_agent.start):
                await new_agent.start()
            else:
                new_agent.start()
        return new_agent

    def on_event(
        self,
        event: str,
        callback: Callable[[str, RecoveryEvent], Awaitable[None]]
    ) -> None:
        """
        Register a callback for recovery events.

        Events fired by this class: "failure", "recovery_success",
        "max_restarts" and "alert". The callback receives the agent id and
        the RecoveryEvent; it may be async or a plain function.
        """
        self._callbacks[event].append(callback)

    async def _trigger_callbacks(
        self,
        event: str,
        agent_id: str,
        recovery_event: RecoveryEvent
    ) -> None:
        """Invoke every callback for *event*; errors are logged, never raised."""
        import inspect

        # Iterate over a copy so a callback may register further callbacks
        for callback in list(self._callbacks.get(event, [])):
            try:
                outcome = callback(agent_id, recovery_event)
                # Plain (non-async) callbacks return a non-awaitable value
                if inspect.isawaitable(outcome):
                    await outcome
            except Exception as e:
                logger.error(f"Callback error for {event}: {e}")

    def get_agent(self, agent_id: str) -> Optional[Any]:
        """Get the current agent instance, or None if none is stored."""
        return self._agents.get(agent_id)

    def get_restart_count(self, agent_id: str) -> int:
        """Get the restart count for an agent (0 if unknown)."""
        return self._restart_counts.get(agent_id, 0)

    def get_recovery_history(
        self,
        agent_id: Optional[str] = None
    ) -> List[RecoveryEvent]:
        """Get recovery history, optionally filtered by agent (None = all agents)."""
        if agent_id is not None:
            return [e for e in self._recovery_history if e.agent_id == agent_id]
        return list(self._recovery_history)

    def reset_restart_count(self, agent_id: str) -> None:
        """Manually reset the restart count and backoff delay for an agent."""
        self._restart_counts[agent_id] = 0
        self._current_delay[agent_id] = self.config.restart_delay_seconds
662
+
663
+
664
+ # ============================================================================
665
+ # ACP-003: Circuit Breaker
666
+ # ============================================================================
667
+
668
@dataclass
class CircuitBreakerConfig:
    """
    Settings for a CircuitBreaker.

    Attributes:
        failure_threshold: Failure count at which a CLOSED circuit opens.
        success_threshold: Successes needed in HALF_OPEN to close again.
        recovery_timeout_seconds: Time after the last failure before an
            OPEN circuit lets a call through as HALF_OPEN.
        half_open_max_calls: Limit on calls admitted while HALF_OPEN.
        exclude_exceptions: Exception types (and subclasses) that are
            never counted as failures; checked first.
        include_exceptions: When not None, only these exception types
            (and subclasses) are counted as failures.
    """

    # Thresholds
    failure_threshold: int = 5
    success_threshold: int = 3

    # Recovery probing
    recovery_timeout_seconds: float = 60.0
    half_open_max_calls: int = 3

    # Exception filtering
    exclude_exceptions: List[Type[Exception]] = field(default_factory=list)
    include_exceptions: Optional[List[Type[Exception]]] = None
677
+
678
+
679
@dataclass
class CircuitBreakerMetrics:
    """
    Point-in-time snapshot of a circuit breaker's counters.

    All fields are required; CircuitBreaker.get_metrics() fills them from
    the breaker's internal state.
    """

    # State at the time of the snapshot
    state: CircuitState

    # Counters used for state decisions
    failure_count: int
    success_count: int

    # Lifetime totals
    total_calls: int
    total_failures: int
    total_successes: int

    # Timestamps (None until the first failure / success)
    last_failure_time: Optional[datetime]
    last_success_time: Optional[datetime]
    state_changed_at: datetime
691
+
692
+
693
class CircuitBreaker:
    """
    Circuit breaker for preventing cascading failures.

    Implements the circuit breaker pattern to protect against cascading
    failures when an agent or service becomes unavailable.

    States:
    - CLOSED: Normal operation, requests pass through
    - OPEN: Failing, requests are rejected immediately
    - HALF_OPEN: Testing recovery, limited requests allowed

    Features:
    - Configurable failure/success thresholds
    - Automatic recovery timeout
    - Exception filtering
    - Metrics collection

    Usage:
        breaker = CircuitBreaker(
            config=CircuitBreakerConfig(
                failure_threshold=5,
                recovery_timeout_seconds=60
            )
        )

        # Use as decorator
        @breaker
        async def call_agent():
            ...

        # Or use context manager
        async with breaker:
            await call_agent()
    """

    def __init__(
        self,
        name: str = "default",
        config: Optional[CircuitBreakerConfig] = None,
        failure_threshold: Optional[int] = None,
        recovery_timeout: Optional[float] = None
    ):
        """
        Create a breaker in the CLOSED state.

        Args:
            name: Label used in log and error messages.
            config: Settings; defaults to CircuitBreakerConfig().
            failure_threshold: Optional override of config.failure_threshold.
            recovery_timeout: Optional override of
                config.recovery_timeout_seconds.
        """
        import copy

        self.name = name

        if config is None:
            config = CircuitBreakerConfig()
        elif failure_threshold is not None or recovery_timeout is not None:
            # Work on a copy so the overrides do not leak into a config
            # object the caller may share with other breakers.
            config = copy.copy(config)
        self.config = config

        # Allow direct parameter override for convenience API
        if failure_threshold is not None:
            self.config.failure_threshold = failure_threshold
        if recovery_timeout is not None:
            self.config.recovery_timeout_seconds = recovery_timeout

        self._state = CircuitState.CLOSED
        self._failure_count = 0
        self._success_count = 0
        self._half_open_calls = 0
        self._last_failure_time: Optional[datetime] = None
        self._last_success_time: Optional[datetime] = None
        self._state_changed_at = datetime.now()
        self._total_calls = 0
        self._total_failures = 0
        self._total_successes = 0
        self._lock = asyncio.Lock()
        self._callbacks: Dict[str, List[Callable]] = defaultdict(list)
        # Strong references to in-flight callback tasks (see _transition_to)
        self._background_tasks: set = set()

    @property
    def state(self) -> CircuitState:
        """Get the current circuit state"""
        return self._state

    @property
    def is_closed(self) -> bool:
        """Check if circuit is closed (normal operation)"""
        return self._state == CircuitState.CLOSED

    @property
    def is_open(self) -> bool:
        """Check if circuit is open (rejecting requests)"""
        return self._state == CircuitState.OPEN

    async def __aenter__(self):
        """Async context manager entry; raises CircuitBreakerOpenError if rejected."""
        await self._before_call()
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Record the outcome of the guarded block; never suppresses the exception."""
        if exc_type is None:
            await self._on_success()
        elif issubclass(exc_type, Exception) and self._should_count_exception(exc_type):
            # Only Exception subclasses count, matching the decorator path:
            # cancellation or KeyboardInterrupt is not a failure of the
            # protected service.
            await self._on_failure(exc_val)
        return False

    def __call__(self, func: Callable) -> Callable:
        """
        Decorator for wrapping functions with circuit breaker.

        The returned wrapper is always a coroutine function, even when
        *func* is a plain function.
        """
        import functools

        is_async = asyncio.iscoroutinefunction(func)

        @functools.wraps(func)
        async def wrapper(*args, **kwargs):
            await self._before_call()
            try:
                if is_async:
                    result = await func(*args, **kwargs)
                else:
                    result = func(*args, **kwargs)
                await self._on_success()
                return result
            except Exception as e:
                if self._should_count_exception(type(e)):
                    await self._on_failure(e)
                raise
        return wrapper

    async def _before_call(self) -> None:
        """
        Check circuit state before a call.

        Raises:
            CircuitBreakerOpenError: The circuit is OPEN and the recovery
                timeout has not elapsed, or it is HALF_OPEN and the test
                call budget is used up.
        """
        async with self._lock:
            self._total_calls += 1

            if self._state == CircuitState.OPEN:
                # Check if recovery timeout has elapsed
                if self._last_failure_time:
                    elapsed = (datetime.now() - self._last_failure_time).total_seconds()
                    if elapsed >= self.config.recovery_timeout_seconds:
                        # NOTE(review): the call that triggers this transition
                        # is admitted without being counted against
                        # half_open_max_calls.
                        self._transition_to(CircuitState.HALF_OPEN)
                        self._half_open_calls = 0
                    else:
                        raise CircuitBreakerOpenError(
                            f"Circuit {self.name} is open, retry after "
                            f"{self.config.recovery_timeout_seconds - elapsed:.1f}s"
                        )
                else:
                    raise CircuitBreakerOpenError(f"Circuit {self.name} is open")

            elif self._state == CircuitState.HALF_OPEN:
                if self._half_open_calls >= self.config.half_open_max_calls:
                    raise CircuitBreakerOpenError(
                        f"Circuit {self.name} is half-open, max test calls reached"
                    )
                self._half_open_calls += 1

    async def _on_success(self) -> None:
        """Handle a successful call"""
        async with self._lock:
            self._total_successes += 1
            self._last_success_time = datetime.now()

            if self._state == CircuitState.HALF_OPEN:
                self._success_count += 1
                if self._success_count >= self.config.success_threshold:
                    self._transition_to(CircuitState.CLOSED)
            elif self._state == CircuitState.CLOSED:
                # A success while closed clears the failure count
                self._failure_count = 0

    async def _on_failure(self, error: Exception) -> None:
        """Handle a failed call"""
        async with self._lock:
            self._total_failures += 1
            self._last_failure_time = datetime.now()
            self._failure_count += 1

            if self._state == CircuitState.HALF_OPEN:
                # Any failure in half-open state opens the circuit
                self._transition_to(CircuitState.OPEN)
            elif self._state == CircuitState.CLOSED:
                if self._failure_count >= self.config.failure_threshold:
                    self._transition_to(CircuitState.OPEN)

    def _transition_to(self, new_state: CircuitState) -> None:
        """Transition to a new circuit state and schedule state-change callbacks."""
        old_state = self._state
        self._state = new_state
        self._state_changed_at = datetime.now()

        if new_state == CircuitState.CLOSED:
            self._failure_count = 0
            self._success_count = 0
        elif new_state == CircuitState.HALF_OPEN:
            self._success_count = 0
            self._half_open_calls = 0

        logger.info(f"Circuit {self.name} transitioned from {old_state.value} to {new_state.value}")

        # Trigger callbacks asynchronously. The event loop only keeps weak
        # references to tasks, so hold a strong one until the task is done.
        if self._callbacks.get("state_change"):
            task = asyncio.create_task(self._trigger_state_change(old_state, new_state))
            self._background_tasks.add(task)
            task.add_done_callback(self._background_tasks.discard)

    async def _trigger_state_change(
        self,
        old_state: CircuitState,
        new_state: CircuitState
    ) -> None:
        """Invoke state-change callbacks; errors are logged, never raised."""
        import inspect

        for callback in list(self._callbacks.get("state_change", [])):
            try:
                outcome = callback(self.name, old_state, new_state)
                # Plain (non-async) callbacks return a non-awaitable value
                if inspect.isawaitable(outcome):
                    await outcome
            except Exception as e:
                logger.error(f"Circuit breaker callback error: {e}")

    def _should_count_exception(self, exc_type: Type[Exception]) -> bool:
        """Determine if an exception should be counted as a failure"""
        # The exclude list wins over the include list
        if any(issubclass(exc_type, excluded) for excluded in self.config.exclude_exceptions):
            return False

        # With an include list, only listed types (and subclasses) count
        if self.config.include_exceptions is not None:
            return any(
                issubclass(exc_type, included)
                for included in self.config.include_exceptions
            )

        return True

    def on_state_change(
        self,
        callback: Callable[[str, CircuitState, CircuitState], Awaitable[None]]
    ) -> None:
        """
        Register a callback for state changes.

        The callback receives (breaker name, old state, new state) and may
        be async or a plain function.
        """
        self._callbacks["state_change"].append(callback)

    def get_metrics(self) -> CircuitBreakerMetrics:
        """Get current circuit breaker metrics"""
        return CircuitBreakerMetrics(
            state=self._state,
            failure_count=self._failure_count,
            success_count=self._success_count,
            total_calls=self._total_calls,
            total_failures=self._total_failures,
            total_successes=self._total_successes,
            last_failure_time=self._last_failure_time,
            last_success_time=self._last_success_time,
            state_changed_at=self._state_changed_at
        )

    def reset(self) -> None:
        """
        Manually reset the circuit breaker to closed state.

        Lifetime totals and last failure/success times are kept, and no
        state-change callbacks are fired.
        """
        self._state = CircuitState.CLOSED
        self._failure_count = 0
        self._success_count = 0
        self._half_open_calls = 0
        self._state_changed_at = datetime.now()
        logger.info(f"Circuit {self.name} manually reset to CLOSED")
933
+
934
+
935
class CircuitBreakerOpenError(Exception):
    """
    Raised when a circuit breaker rejects a call.

    CircuitBreaker raises it before the protected call runs, either while
    the circuit is open or when the half-open test call limit is reached.
    """
938
+
939
+
940
class CircuitBreakerRegistry:
    """Name-keyed collection of CircuitBreaker instances"""

    def __init__(self):
        """Start with no breakers registered"""
        self._breakers: Dict[str, CircuitBreaker] = {}

    def get_or_create(
        self,
        name: str,
        config: Optional[CircuitBreakerConfig] = None
    ) -> CircuitBreaker:
        """
        Return the breaker registered under *name*, creating it on first use.

        *config* is only applied when the breaker is created; it is ignored
        for a name that already exists.
        """
        existing = self._breakers.get(name)
        if existing is not None:
            return existing

        created = CircuitBreaker(name=name, config=config)
        self._breakers[name] = created
        return created

    def get(self, name: str) -> Optional[CircuitBreaker]:
        """Return the breaker registered under *name*, or None"""
        return self._breakers.get(name)

    def get_all_metrics(self) -> Dict[str, CircuitBreakerMetrics]:
        """Return a metrics snapshot for every registered breaker, keyed by name"""
        snapshot: Dict[str, CircuitBreakerMetrics] = {}
        for name, breaker in self._breakers.items():
            snapshot[name] = breaker.get_metrics()
        return snapshot
963
+
964
+
965
+ # ============================================================================
966
+ # ACP-004: Agent Scaling
967
+ # ============================================================================
968
+
969
@dataclass
class ScalingConfig:
    """
    Settings for scaling one agent type with AgentScaler.

    Attributes:
        min_replicas: Lower bound used for the initial target and for
            automatic scale-down.
        max_replicas: Upper bound applied to every scaling request.
        target_cpu_utilization: Desired CPU utilization.
        target_memory_utilization: Desired memory utilization.
        scale_up_threshold: Average "cpu" metric above which replicas
            are added.
        scale_down_threshold: Average "cpu" metric below which replicas
            are removed.
        scale_up_cooldown_seconds: Minimum time between scale-ups.
        scale_down_cooldown_seconds: Minimum time between scale-downs.
        scale_up_increment: Replicas added per automatic scale-up.
        scale_down_increment: Replicas removed per automatic scale-down.
    """

    # Replica bounds
    min_replicas: int = 1
    max_replicas: int = 10

    # NOTE(review): the two targets below do not appear to be read by
    # AgentScaler's autoscaling loop, which compares against the thresholds.
    target_cpu_utilization: float = 0.7
    target_memory_utilization: float = 0.8

    # Utilization thresholds
    scale_up_threshold: float = 0.8
    scale_down_threshold: float = 0.3

    # Cooldowns
    scale_up_cooldown_seconds: float = 60.0
    scale_down_cooldown_seconds: float = 300.0

    # Step sizes
    scale_up_increment: int = 1
    scale_down_increment: int = 1
982
+
983
+
984
@dataclass
class AgentReplica:
    """
    One running copy of an agent type managed by AgentScaler.

    Attributes:
        replica_id: Unique id of this replica.
        agent_id: Agent type the replica belongs to.
        instance: The object produced by the agent factory.
        created_at: Creation time, taken from datetime.now().
        status: Lifecycle state; PENDING unless set otherwise.
        metrics: Latest reported metrics by name; each replica gets its
            own dict.
    """

    replica_id: str
    agent_id: str
    instance: Any
    created_at: datetime = field(default_factory=datetime.now)
    status: AgentState = AgentState.PENDING
    metrics: Dict[str, float] = field(default_factory=dict)
993
+
994
+
995
class AgentScaler:
    """
    Horizontal scaling manager for agents.

    Provides automatic scaling based on load metrics, supporting both
    scale-up and scale-down with configurable thresholds and cooldowns.
    The autoscaling decision uses the average of each replica's "cpu"
    metric; replicas that have not reported one count as 0.

    Features:
    - Automatic scale-up/scale-down based on utilization
    - Configurable min/max replicas
    - Load balancing across replicas
    - Cooldown periods to prevent thrashing

    Usage:
        scaler = AgentScaler()

        # Register agent type with factory
        scaler.register_agent_type(
            agent_type="claims_agent",
            factory=create_claims_agent,
            config=ScalingConfig(min_replicas=2, max_replicas=10)
        )

        # Get available replica
        agent = await scaler.get_replica("claims_agent")

        # Manual scaling
        await scaler.scale_to("claims_agent", replicas=5)
    """

    def __init__(self):
        """Create an idle scaler with no agent types registered"""
        self._agent_types: Dict[str, Dict[str, Any]] = {}
        self._replicas: Dict[str, Dict[str, AgentReplica]] = defaultdict(dict)
        self._last_scale_up: Dict[str, datetime] = {}
        self._last_scale_down: Dict[str, datetime] = {}
        self._load_balancer_index: Dict[str, int] = defaultdict(int)
        self._lock = asyncio.Lock()
        self._running = False
        self._scaling_task: Optional[asyncio.Task] = None

    def register_agent_type(
        self,
        agent_type: str,
        factory: Callable[[], Any],
        config: Optional[ScalingConfig] = None,
        replicas: int = 1
    ) -> None:
        """
        Register an agent type for scaling.

        Args:
            agent_type: Name used in all later calls.
            factory: Zero-argument callable (sync or async) that builds
                one agent instance.
            config: Scaling settings; defaults to ScalingConfig().
            replicas: Desired initial count; raised to config.min_replicas
                if lower. Replicas are created by start() or scale_to().
        """
        config = config or ScalingConfig()
        self._agent_types[agent_type] = {
            "factory": factory,
            "config": config,
            "target_replicas": max(config.min_replicas, replicas)
        }
        logger.info(f"Registered agent type {agent_type} for scaling")

    async def start(self) -> None:
        """Start the scaling manager: create target replicas and the autoscaling loop"""
        if self._running:
            return

        self._running = True

        # Initialize replicas for all registered types. Iterate over a
        # snapshot: scale_to() awaits, and a registration made meanwhile
        # must not break the iteration.
        for agent_type, info in list(self._agent_types.items()):
            await self.scale_to(agent_type, info["target_replicas"])

        # Start autoscaling loop
        self._scaling_task = asyncio.create_task(self._autoscaling_loop())
        logger.info("Agent scaler started")

    async def stop(self) -> None:
        """Stop the autoscaling loop and remove all replicas"""
        self._running = False
        if self._scaling_task:
            self._scaling_task.cancel()
            try:
                await self._scaling_task
            except asyncio.CancelledError:
                pass
            self._scaling_task = None

        # Stop all replicas
        for agent_type in list(self._replicas.keys()):
            desired = self._agent_types.get(agent_type, {}).get("target_replicas")
            await self.scale_to(agent_type, 0)
            # scale_to() records 0 as the new target; restore the previous
            # one so a later start() brings the replicas back.
            self._agent_types[agent_type]["target_replicas"] = desired

        logger.info("Agent scaler stopped")

    async def scale_to(self, agent_type: str, replicas: int) -> None:
        """
        Scale an agent type to a specific number of replicas.

        The request is clamped to the range 0..config.max_replicas. When
        scaling down, the oldest replicas are removed first.

        Raises:
            ValueError: *agent_type* was never registered.
        """
        if agent_type not in self._agent_types:
            raise ValueError(f"Unknown agent type: {agent_type}")

        async with self._lock:
            config = self._agent_types[agent_type]["config"]
            replicas = max(0, min(replicas, config.max_replicas))

            current_count = len(self._replicas[agent_type])

            if replicas > current_count:
                # Scale up
                for _ in range(replicas - current_count):
                    await self._create_replica(agent_type)
            elif replicas < current_count:
                # Scale down
                to_remove = current_count - replicas
                replica_ids = list(self._replicas[agent_type].keys())[:to_remove]
                for replica_id in replica_ids:
                    await self._remove_replica(agent_type, replica_id)

            self._agent_types[agent_type]["target_replicas"] = replicas
            logger.info(f"Scaled {agent_type} to {replicas} replicas")

    async def scale_up(self, agent_type: str, count: int = 1) -> None:
        """Scale up an agent type by adding *count* replicas (capped at max_replicas)"""
        current = len(self._replicas.get(agent_type, {}))
        await self.scale_to(agent_type, current + count)

    async def scale_down(self, agent_type: str, count: int = 1) -> None:
        """Scale down an agent type by removing *count* replicas (never below 0)"""
        current = len(self._replicas.get(agent_type, {}))
        await self.scale_to(agent_type, max(0, current - count))

    async def _create_replica(self, agent_type: str) -> AgentReplica:
        """Create, start and register a new replica for an agent type"""
        factory = self._agent_types[agent_type]["factory"]
        replica_id = f"{agent_type}-{uuid.uuid4().hex[:8]}"

        instance = factory()
        if asyncio.iscoroutine(instance):
            instance = await instance

        # Start the agent if it has a start method
        if hasattr(instance, 'start'):
            if asyncio.iscoroutinefunction(instance.start):
                await instance.start()
            else:
                instance.start()

        replica = AgentReplica(
            replica_id=replica_id,
            agent_id=agent_type,
            instance=instance,
            status=AgentState.RUNNING
        )

        self._replicas[agent_type][replica_id] = replica
        logger.info(f"Created replica {replica_id} for {agent_type}")
        return replica

    async def _remove_replica(self, agent_type: str, replica_id: str) -> None:
        """Unregister a replica and stop its instance if it has stop()"""
        replica = self._replicas[agent_type].pop(replica_id, None)
        if replica and replica.instance:
            # Stop the agent if it has a stop method
            if hasattr(replica.instance, 'stop'):
                if asyncio.iscoroutinefunction(replica.instance.stop):
                    await replica.instance.stop()
                else:
                    replica.instance.stop()
        logger.info(f"Removed replica {replica_id} from {agent_type}")

    async def get_replica(self, agent_type: str) -> Optional[Any]:
        """
        Get an available replica using round-robin load balancing.

        Returns:
            The agent instance of the next RUNNING replica, or None when
            the type has no running replica.
        """
        replicas = self._replicas.get(agent_type, {})
        running_replicas = [
            r for r in replicas.values() if r.status == AgentState.RUNNING
        ]
        if not running_replicas:
            return None

        # Round-robin selection
        index = self._load_balancer_index[agent_type] % len(running_replicas)
        self._load_balancer_index[agent_type] += 1

        return running_replicas[index].instance

    async def _autoscaling_loop(self) -> None:
        """Background loop: evaluate every agent type, then sleep 10 seconds"""
        while self._running:
            try:
                # Snapshot: scaling awaits, and a registration made
                # meanwhile must not break the iteration.
                for agent_type, info in list(self._agent_types.items()):
                    await self._autoscale_agent_type(agent_type, info["config"])

            except Exception as e:
                logger.error(f"Autoscaling loop error: {e}")

            await asyncio.sleep(10)  # Check every 10 seconds

    async def _autoscale_agent_type(self, agent_type: str, config: ScalingConfig) -> None:
        """Apply one scale-up or scale-down decision for a single agent type"""
        replicas = self._replicas.get(agent_type, {})
        if not replicas:
            return

        # Calculate average utilization
        current = len(replicas)
        avg_cpu = sum(r.metrics.get("cpu", 0) for r in replicas.values()) / current

        now = datetime.now()

        # Check scale up
        if avg_cpu > config.scale_up_threshold:
            last_scale = self._last_scale_up.get(agent_type, datetime.min)
            cooled_down = (now - last_scale).total_seconds() > config.scale_up_cooldown_seconds
            if cooled_down and current < config.max_replicas:
                await self.scale_up(agent_type, config.scale_up_increment)
                self._last_scale_up[agent_type] = now

        # Check scale down
        elif avg_cpu < config.scale_down_threshold:
            last_scale = self._last_scale_down.get(agent_type, datetime.min)
            cooled_down = (now - last_scale).total_seconds() > config.scale_down_cooldown_seconds
            if cooled_down and current > config.min_replicas:
                # Never go below min_replicas, even with an increment > 1
                target = max(config.min_replicas, current - config.scale_down_increment)
                await self.scale_to(agent_type, target)
                self._last_scale_down[agent_type] = now

    def update_replica_metrics(
        self,
        agent_type: str,
        replica_id: str,
        metrics: Dict[str, float]
    ) -> None:
        """Merge *metrics* into a replica's metrics; unknown replicas are ignored"""
        if agent_type in self._replicas and replica_id in self._replicas[agent_type]:
            self._replicas[agent_type][replica_id].metrics.update(metrics)

    def get_replica_count(self, agent_type: str) -> int:
        """Get the current replica count for an agent type"""
        return len(self._replicas.get(agent_type, {}))

    def get_all_replicas(self, agent_type: str) -> List[AgentReplica]:
        """Get all replicas for an agent type"""
        return list(self._replicas.get(agent_type, {}).values())
1229
+
1230
+
1231
+ # ============================================================================
1232
+ # ACP-005: Distributed Coordination
1233
+ # ============================================================================
1234
+
1235
@dataclass
class LeaderElectionConfig:
    """
    Timing settings for leader election in DistributedCoordinator.

    Attributes:
        heartbeat_interval_seconds: Pause between heartbeats sent by the
            leader.
        election_timeout_min_seconds: Lower bound of the random election
            timeout.
        election_timeout_max_seconds: Upper bound of the random election
            timeout.
        lease_duration_seconds: Length of the leader lease, renewed on
            every heartbeat round.
    """

    # Leader side
    heartbeat_interval_seconds: float = 1.0

    # Follower side: the timeout is drawn uniformly from [min, max]
    election_timeout_min_seconds: float = 3.0
    election_timeout_max_seconds: float = 5.0

    # Lease
    lease_duration_seconds: float = 15.0
1242
+
1243
+
1244
@dataclass
class LeaderInfo:
    """
    Description of the current leader.

    All fields are required.

    Attributes:
        leader_id: Node id of the leader.
        elected_at: When the leader was elected.
        lease_expires_at: When the leader's lease runs out.
        term: Election term in which the leader was elected.
    """

    leader_id: str
    elected_at: datetime
    lease_expires_at: datetime
    term: int
1251
+
1252
+
1253
+ class DistributedCoordinator:
1254
+ """
1255
+ Distributed coordination for stateful operations.
1256
+
1257
+ Implements leader election and basic consensus for coordinating
1258
+ multiple agent instances.
1259
+
1260
+ Features:
1261
+ - Leader election using Raft-like protocol
1262
+ - Distributed locks
1263
+ - Heartbeat-based failure detection
1264
+ - Automatic leader failover
1265
+
1266
+ Usage:
1267
+ coordinator = DistributedCoordinator(node_id="node-1")
1268
+
1269
+ # Start coordination
1270
+ await coordinator.start()
1271
+
1272
+ # Check if leader
1273
+ if coordinator.is_leader:
1274
+ # Perform leader-only operations
1275
+ ...
1276
+
1277
+ # Acquire distributed lock
1278
+ async with coordinator.lock("resource-1"):
1279
+ # Critical section
1280
+ ...
1281
+ """
1282
+
1283
def __init__(
    self,
    node_id: str,
    config: Optional[LeaderElectionConfig] = None,
    peers: Optional[List[str]] = None
):
    """
    Create a coordinator that starts as a follower in term 0.

    Args:
        node_id: Identifier of this node.
        config: Election timing; defaults to LeaderElectionConfig().
        peers: Ids of the other nodes; None or empty means single-node.
    """
    # Identity and settings
    self.node_id = node_id
    self.config = config or LeaderElectionConfig()
    self.peers = peers or []

    # Election state
    self._role = CoordinationRole.FOLLOWER
    self._current_term = 0
    self._voted_for: Optional[str] = None
    self._leader_id: Optional[str] = None
    self._leader_lease_expires: Optional[datetime] = None

    # Election timer; the timeout needs self.config, which is set above
    self._last_heartbeat = datetime.now()
    self._election_timeout = self._random_election_timeout()

    # Per-resource locks and their holders
    self._locks: Dict[str, asyncio.Lock] = {}
    self._lock_holders: Dict[str, str] = {}

    # Runtime bookkeeping
    self._running = False
    self._tasks: List[asyncio.Task] = []
    self._callbacks: Dict[str, List[Callable]] = defaultdict(list)
    self._lock = asyncio.Lock()
1309
+
1310
+ def _random_election_timeout(self) -> float:
1311
+ """Generate a random election timeout"""
1312
+ import random
1313
+ return random.uniform(
1314
+ self.config.election_timeout_min_seconds,
1315
+ self.config.election_timeout_max_seconds
1316
+ )
1317
+
1318
@property
def is_leader(self) -> bool:
    """True while this node's role is LEADER"""
    current_role = self._role
    return current_role == CoordinationRole.LEADER
1322
+
1323
@property
def role(self) -> CoordinationRole:
    """Role this node currently holds"""
    current_role = self._role
    return current_role
1327
+
1328
@property
def leader_id(self) -> Optional[str]:
    """Id of the leader this node knows about, or None if there is none yet"""
    known_leader = self._leader_id
    return known_leader
1332
+
1333
async def start(self) -> None:
    """
    Start the coordinator.

    Launches the election loop as a background task. A node without peers
    becomes leader right away. Calling start() while running does nothing.
    """
    if not self._running:
        self._running = True

        election_task = asyncio.create_task(self._election_loop())
        self._tasks.append(election_task)

        # If no peers, become leader immediately
        single_node = not self.peers
        if single_node:
            await self._become_leader()

        logger.info(f"Distributed coordinator started for node {self.node_id}")
1346
+
1347
async def stop(self) -> None:
    """
    Stop the coordinator.

    Cancels each background task and waits for it to finish, then forgets
    the tasks. The node's role and leader id are left as they are.
    """
    self._running = False

    for background_task in self._tasks:
        background_task.cancel()
        try:
            await background_task
        except asyncio.CancelledError:
            # Expected result of the cancel() above
            pass

    self._tasks.clear()
    logger.info(f"Distributed coordinator stopped for node {self.node_id}")
1358
+
1359
async def _election_loop(self) -> None:
    """
    Main election and heartbeat loop.

    As leader: send heartbeats, then sleep for the heartbeat interval.
    Otherwise: start an election once no heartbeat has arrived for longer
    than the election timeout, polling every 0.1 seconds. Errors are
    logged and followed by a one second pause.
    """
    while self._running:
        try:
            if self._role != CoordinationRole.LEADER:
                # Check for election timeout
                silence = (datetime.now() - self._last_heartbeat).total_seconds()
                if silence > self._election_timeout:
                    await self._start_election()
                await asyncio.sleep(0.1)
                continue

            # Send heartbeats as leader
            await self._send_heartbeats()
            await asyncio.sleep(self.config.heartbeat_interval_seconds)

        except Exception as e:
            logger.error(f"Election loop error: {e}")
            await asyncio.sleep(1)
1377
+
1378
async def _start_election(self) -> None:
    """
    Start a leader election.

    Becomes a candidate for the next term, votes for itself and restarts
    the election timer with a fresh random timeout. Vote requests to peers
    are not implemented: a node without peers wins immediately, while a
    node with peers only has its own vote, cannot reach a majority and
    stays a candidate until the timer expires again or a heartbeat arrives.
    """
    async with self._lock:
        self._role = CoordinationRole.CANDIDATE
        self._current_term += 1
        self._voted_for = self.node_id
        self._election_timeout = self._random_election_timeout()
        # Restart the election timer. Without this the elapsed time stays
        # above the timeout, so the election loop would start a new
        # election (and bump the term) on every 0.1 s tick.
        self._last_heartbeat = datetime.now()

        logger.info(f"Node {self.node_id} starting election for term {self._current_term}")

        # In a real implementation, request votes from peers
        # For single-node or simple cases, just become leader
        if not self.peers:
            await self._become_leader()
            return

        votes_received = 1  # Vote for self
        votes_needed = (len(self.peers) + 1) // 2 + 1  # strict majority of the cluster

        # In real implementation: send RequestVote RPCs to peers and add
        # the granted votes to votes_received before this check
        if votes_received >= votes_needed:
            await self._become_leader()
1401
+
1402
async def _become_leader(self) -> None:
    """
    Transition to leader role.

    Records this node as leader, starts a new lease and fires the
    "leader_elected" callbacks with this node's id.
    """
    lease = timedelta(seconds=self.config.lease_duration_seconds)

    self._role = CoordinationRole.LEADER
    self._leader_id = self.node_id
    self._leader_lease_expires = datetime.now() + lease

    logger.info(f"Node {self.node_id} became leader for term {self._current_term}")
    await self._trigger_callbacks("leader_elected", self.node_id)
1412
+
1413
+ async def _send_heartbeats(self) -> None:
1414
+ """Send heartbeats to followers"""
1415
+ self._leader_lease_expires = datetime.now() + timedelta(
1416
+ seconds=self.config.lease_duration_seconds
1417
+ )
1418
+ # In real implementation: send AppendEntries RPCs to peers
1419
+
1420
def receive_heartbeat(self, leader_id: str, term: int) -> None:
    """Process a heartbeat announcing ``leader_id`` as leader for ``term``.

    A heartbeat from a term older than the node's current term is ignored.
    Otherwise the node adopts the term, becomes a follower of ``leader_id``,
    stamps the heartbeat time and clears its recorded vote.
    """
    if term < self._current_term:
        # Stale term: keep the current state untouched.
        return

    self._current_term = term
    self._role = CoordinationRole.FOLLOWER
    self._leader_id = leader_id
    self._last_heartbeat = datetime.now()
    self._voted_for = None
1428
+
1429
async def acquire_lock(self, resource_id: str, timeout: float = 30.0) -> bool:
    """Try to take the lock guarding ``resource_id``.

    The per-resource ``asyncio.Lock`` is created on first use. Returns the
    result of the acquisition, or ``False`` when it does not complete within
    ``timeout`` seconds. On success this node is recorded as the holder.
    """
    resource_lock = self._locks.get(resource_id)
    if resource_lock is None:
        resource_lock = self._locks[resource_id] = asyncio.Lock()

    try:
        got_it = await asyncio.wait_for(resource_lock.acquire(), timeout=timeout)
    except asyncio.TimeoutError:
        return False

    if got_it:
        self._lock_holders[resource_id] = self.node_id
        logger.debug(f"Node {self.node_id} acquired lock on {resource_id}")
    return got_it
1445
+
1446
def release_lock(self, resource_id: str) -> None:
    """Release the lock for ``resource_id`` if it exists and is held.

    Unknown resources and locks that are already free are ignored. The
    holder entry for the resource is dropped together with the lock.
    """
    held = self._locks.get(resource_id)
    if held is None or not held.locked():
        return

    held.release()
    self._lock_holders.pop(resource_id, None)
    logger.debug(f"Node {self.node_id} released lock on {resource_id}")
1452
+
1453
def lock(self, resource_id: str, timeout: float = 30.0):
    """Return an async context manager that holds the lock on ``resource_id``.

    ``timeout`` is forwarded to the context manager, which uses it when
    acquiring the lock on entry.
    """
    return DistributedLockContext(
        coordinator=self,
        resource_id=resource_id,
        timeout=timeout,
    )
1456
+
1457
def on_event(
    self,
    event: str,
    callback: Callable[[str], Awaitable[None]]
) -> None:
    """Subscribe ``callback`` to the coordination event named ``event``.

    Callbacks are appended, so several may be registered for one event.
    """
    listeners = self._callbacks[event]
    listeners.append(callback)
1464
+
1465
+ async def _trigger_callbacks(self, event: str, *args) -> None:
1466
+ """Trigger callbacks for an event"""
1467
+ for callback in self._callbacks.get(event, []):
1468
+ try:
1469
+ await callback(*args)
1470
+ except Exception as e:
1471
+ logger.error(f"Coordination callback error: {e}")
1472
+
1473
def get_leader_info(self) -> Optional[LeaderInfo]:
    """Describe the currently known leader, or return ``None`` if there is none.

    ``elected_at`` is the time of this call rather than the real election
    time, which is not tracked. ``lease_expires_at`` falls back to the
    current time when no lease expiry has been recorded.
    """
    if not self._leader_id:
        return None

    lease_expiry = self._leader_lease_expires or datetime.now()
    return LeaderInfo(
        leader_id=self._leader_id,
        elected_at=datetime.now(),  # Would be tracked in real implementation
        lease_expires_at=lease_expiry,
        term=self._current_term
    )
1483
+
1484
+
1485
class DistributedLockContext:
    """Async context manager that holds a coordinator lock around a block.

    Entering acquires the lock for ``resource_id`` through the coordinator,
    waiting at most ``timeout`` seconds; leaving releases it again.
    """

    def __init__(
        self,
        coordinator: DistributedCoordinator,
        resource_id: str,
        timeout: float
    ):
        self._timeout = timeout
        self._resource_id = resource_id
        self._coordinator = coordinator

    async def __aenter__(self):
        """Acquire the lock; raise ``TimeoutError`` when it is not granted."""
        if await self._coordinator.acquire_lock(self._resource_id, self._timeout):
            return self
        raise TimeoutError(f"Failed to acquire lock on {self._resource_id}")

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Release the lock; an exception from the block is never suppressed."""
        self._coordinator.release_lock(self._resource_id)
        return False
1507
+
1508
+
1509
+ # ============================================================================
1510
+ # ACP-006: Agent Dependency Graph
1511
+ # ============================================================================
1512
+
1513
@dataclass
class AgentDependency:
    """Dependency declaration for a single agent.

    Attributes:
        agent_id: Identifier of the agent this record describes.
        depends_on: Ids of agents this agent requires.
        optional_depends_on: Ids of agents this agent can use but does not
            require; empty by default.
        startup_timeout_seconds: Startup timeout in seconds (default 60.0).
    """
    agent_id: str
    depends_on: List[str]
    optional_depends_on: List[str] = field(default_factory=list)
    startup_timeout_seconds: float = 60.0
1520
+
1521
+
1522
class DependencyGraph:
    """
    Manages agent startup order based on dependencies.

    Ensures agents start in the correct order, respecting required
    dependencies and detecting circular dependencies. Only required
    dependencies (``depends_on``) create ordering edges; optional ones are
    recorded on the agent but do not influence ordering.

    Every ordered result is deterministic: ties are broken by the order in
    which agents were registered.

    Features:
    - Topological sorting for startup order
    - Circular dependency detection
    - Optional vs required dependencies
    - Parallel startup where possible

    Usage:
        graph = DependencyGraph()

        graph.add_agent("api-server", depends_on=["database", "cache"])
        graph.add_agent("database", depends_on=[])
        graph.add_agent("cache", depends_on=[])

        # Get startup order
        order = graph.get_startup_order()
        # Returns: ["database", "cache", "api-server"]
    """

    def __init__(self):
        self._agents: Dict[str, AgentDependency] = {}
        self._graph: Dict[str, Set[str]] = defaultdict(set)  # agent -> depends_on
        self._reverse_graph: Dict[str, Set[str]] = defaultdict(set)  # agent -> depended_by

    def add_agent(
        self,
        agent_id: str,
        depends_on: Optional[List[str]] = None,
        optional_depends_on: Optional[List[str]] = None,
        startup_timeout: float = 60.0
    ) -> None:
        """Register an agent, replacing any earlier registration of the same id.

        Args:
            agent_id: Identifier of the agent.
            depends_on: Ids of agents that must start before this one.
            optional_depends_on: Ids recorded as optional dependencies.
            startup_timeout: Stored as the agent's ``startup_timeout_seconds``.
        """
        # Copy the inputs so later mutation of the caller's lists cannot
        # drift away from the edge sets built below.
        depends_on = list(depends_on or [])
        optional_depends_on = list(optional_depends_on or [])

        # A re-registration must not inherit edges from the old declaration.
        if agent_id in self._agents:
            self._unlink(agent_id)

        self._agents[agent_id] = AgentDependency(
            agent_id=agent_id,
            depends_on=depends_on,
            optional_depends_on=optional_depends_on,
            startup_timeout_seconds=startup_timeout
        )

        # Update graphs
        for dep in depends_on:
            self._graph[agent_id].add(dep)
            self._reverse_graph[dep].add(agent_id)

        logger.debug(f"Added agent {agent_id} with dependencies: {depends_on}")

    def _unlink(self, agent_id: str) -> None:
        """Drop every required-dependency edge that starts at ``agent_id``."""
        for dep in self._graph.pop(agent_id, set()):
            self._reverse_graph[dep].discard(agent_id)

    def _registration_rank(self) -> Dict[str, int]:
        """Map each registered agent id to its registration position."""
        return {agent_id: position for position, agent_id in enumerate(self._agents)}

    def remove_agent(self, agent_id: str) -> None:
        """Remove an agent and its outgoing edges; unknown ids are ignored.

        Edges from other agents that point at the removed agent are kept, so
        ``validate`` reports them as dependencies on a missing agent.
        """
        if agent_id not in self._agents:
            return
        self._unlink(agent_id)
        del self._agents[agent_id]

    def get_dependencies(self, agent_id: str) -> List[str]:
        """Return required followed by optional dependencies of an agent.

        Returns an empty list for an unknown agent.
        """
        agent = self._agents.get(agent_id)
        if agent:
            return agent.depends_on + agent.optional_depends_on
        return []

    def get_dependents(self, agent_id: str) -> List[str]:
        """Return the agents that require ``agent_id``, in registration order."""
        rank = self._registration_rank()
        return sorted(
            self._reverse_graph.get(agent_id, ()),
            key=lambda dependent: (rank.get(dependent, len(rank)), dependent),
        )

    def has_circular_dependency(self) -> bool:
        """Return ``True`` when the required-dependency edges contain a cycle.

        Uses an explicit stack instead of recursion so that long dependency
        chains cannot exceed the interpreter's recursion limit.
        """
        finished: Set[str] = set()  # fully explored nodes
        on_path: Set[str] = set()   # nodes on the current DFS path

        for root in self._agents:
            if root in finished:
                continue

            on_path.add(root)
            stack = [(root, iter(self._graph.get(root, ())))]
            while stack:
                node, pending = stack[-1]
                for neighbor in pending:
                    if neighbor in on_path:
                        # Back edge to a node still being explored.
                        return True
                    if neighbor not in finished:
                        on_path.add(neighbor)
                        stack.append((neighbor, iter(self._graph.get(neighbor, ()))))
                        break
                else:
                    # All neighbors handled: the node leaves the current path.
                    on_path.discard(node)
                    finished.add(node)
                    stack.pop()

        return False

    def get_startup_order(self) -> List[str]:
        """
        Get the startup order using topological sort.

        Returns agents in order such that required dependencies are started
        first; agents that become ready together keep registration order.
        Dependencies on unregistered agents are ignored for ordering.
        Raises ValueError if there are circular dependencies.
        """
        if self.has_circular_dependency():
            raise ValueError("Circular dependency detected in agent graph")

        # Kahn's algorithm for topological sort
        rank = self._registration_rank()
        in_degree = {
            agent_id: sum(1 for dep in self._graph.get(agent_id, ()) if dep in rank)
            for agent_id in self._agents
        }

        # Start with agents that have no (registered) dependencies
        queue = deque(agent_id for agent_id, degree in in_degree.items() if degree == 0)
        result = []

        while queue:
            agent_id = queue.popleft()
            result.append(agent_id)

            ready_candidates = (
                dependent
                for dependent in self._reverse_graph.get(agent_id, ())
                if dependent in in_degree
            )
            # Sorting by registration rank removes the dependence on set
            # iteration order, which varies between interpreter runs.
            for dependent in sorted(ready_candidates, key=rank.__getitem__):
                in_degree[dependent] -= 1
                if in_degree[dependent] == 0:
                    queue.append(dependent)

        return result

    def get_parallel_startup_groups(self) -> List[List[str]]:
        """
        Get groups of agents that can be started in parallel.

        Returns a list of groups, where agents within a group can start
        simultaneously, but groups must be started in order. Agents inside a
        group are listed in registration order.
        Raises ValueError if there are circular dependencies.
        """
        if self.has_circular_dependency():
            raise ValueError("Circular dependency detected")

        result = []
        remaining = set(self._agents)
        started = set()

        while remaining:
            # Find agents whose registered dependencies are all started
            group = [
                agent_id
                for agent_id in self._agents
                if agent_id in remaining
                and all(
                    dep in started or dep not in self._agents
                    for dep in self._graph.get(agent_id, ())
                )
            ]

            if not group:
                raise ValueError("Unable to resolve dependencies")

            result.append(group)
            remaining.difference_update(group)
            started.update(group)

        return result

    def get_shutdown_order(self) -> List[str]:
        """Get the shutdown order (reverse of startup order)"""
        return self.get_startup_order()[::-1]

    def validate(self) -> List[str]:
        """
        Validate the dependency graph.

        Returns a list of validation errors, or empty list if valid. Checks
        for cycles and for required dependencies on unregistered agents.
        """
        errors = []

        # Check for circular dependencies
        if self.has_circular_dependency():
            errors.append("Circular dependency detected")

        # Check for missing dependencies
        for agent_id, agent in self._agents.items():
            for dep in agent.depends_on:
                if dep not in self._agents:
                    errors.append(f"Agent {agent_id} depends on missing agent {dep}")

        return errors
1711
+
1712
+
1713
+ # ============================================================================
1714
+ # ACP-007: Graceful Shutdown
1715
+ # ============================================================================
1716
+
1717
@dataclass
class ShutdownConfig:
    """Tunable settings for a graceful shutdown.

    Attributes:
        drain_timeout_seconds: Drain timeout in seconds (default 30.0).
        force_timeout_seconds: Force timeout in seconds (default 60.0).
        checkpoint_enabled: Checkpointing switch (default True).
        save_in_flight: Whether in-flight operations are saved (default True).
    """
    drain_timeout_seconds: float = 30.0
    force_timeout_seconds: float = 60.0
    checkpoint_enabled: bool = True
    save_in_flight: bool = True
1724
+
1725
+
1726
@dataclass
class InFlightOperation:
    """Record of an operation that has been started and not yet completed.

    Attributes:
        operation_id: Identifier of the operation.
        agent_id: Identifier of the agent running the operation.
        operation_type: Free-form label for the kind of operation.
        started_at: Time at which the operation was registered.
        data: Arbitrary payload attached to the operation; empty by default.
    """
    operation_id: str
    agent_id: str
    operation_type: str
    started_at: datetime
    data: Dict[str, Any] = field(default_factory=dict)
1734
+
1735
+
1736
class GracefulShutdownManager:
    """
    Coordinates an orderly shutdown that preserves in-flight verifications.

    Shutdown moves through three phases: draining (waiting for registered
    operations to complete), stopping (running shutdown hooks) and
    terminated.

    Features:
    - Drain period for completing in-flight operations
    - Operation checkpointing
    - Configurable force timeout
    - Shutdown hooks

    Usage:
        shutdown_manager = GracefulShutdownManager(
            config=ShutdownConfig(drain_timeout_seconds=30)
        )

        # Register in-flight operation
        op_id = shutdown_manager.register_operation(
            agent_id="claims-agent",
            operation_type="verification",
            data={"claim_id": "123"}
        )

        # Complete operation
        shutdown_manager.complete_operation(op_id)

        # Initiate graceful shutdown
        await shutdown_manager.shutdown()
    """

    def __init__(self, config: Optional[ShutdownConfig] = None):
        self.config = config if config else ShutdownConfig()
        self._phase = ShutdownPhase.RUNNING
        self._lock = asyncio.Lock()
        self._shutdown_event = asyncio.Event()
        self._in_flight: Dict[str, InFlightOperation] = {}
        self._checkpoint_data: Dict[str, Any] = {}
        self._shutdown_hooks: List[Callable[[], Awaitable[None]]] = []

    @property
    def phase(self) -> ShutdownPhase:
        """Current shutdown phase."""
        return self._phase

    @property
    def is_shutting_down(self) -> bool:
        """``True`` once the manager has left the running phase."""
        return self._phase != ShutdownPhase.RUNNING

    def register_operation(
        self,
        agent_id: str,
        operation_type: str,
        data: Optional[Dict[str, Any]] = None
    ) -> str:
        """Track a new in-flight operation and return its generated id.

        Raises:
            RuntimeError: If shutdown has already begun.
        """
        if self._phase != ShutdownPhase.RUNNING:
            raise RuntimeError("Cannot register new operations during shutdown")

        operation_id = str(uuid.uuid4())
        operation = InFlightOperation(
            operation_id=operation_id,
            agent_id=agent_id,
            operation_type=operation_type,
            started_at=datetime.now(),
            data=data or {}
        )
        self._in_flight[operation_id] = operation
        return operation_id

    def complete_operation(self, operation_id: str) -> None:
        """Stop tracking an operation; unknown ids are ignored.

        When the last operation finishes during the draining phase, the
        waiting ``shutdown`` call is woken up.
        """
        self._in_flight.pop(operation_id, None)

        nothing_left = not self._in_flight
        if self._phase == ShutdownPhase.DRAINING and nothing_left:
            self._shutdown_event.set()

    def get_in_flight_count(self) -> int:
        """Number of operations currently tracked."""
        return len(self._in_flight)

    def get_in_flight_operations(self) -> List[InFlightOperation]:
        """Snapshot list of the operations currently tracked."""
        return [*self._in_flight.values()]

    def add_shutdown_hook(
        self,
        hook: Callable[[], Awaitable[None]]
    ) -> None:
        """Append a hook that ``shutdown`` awaits during the stopping phase."""
        self._shutdown_hooks.append(hook)

    async def shutdown(self) -> Dict[str, Any]:
        """
        Run the graceful shutdown sequence.

        Returns a summary dict with start/end timestamps, the in-flight
        count at start, the ids that were checkpointed or timed out, and the
        checkpoint data. When shutdown was already started, a short
        ``already_shutting_down`` status dict is returned instead.
        """
        async with self._lock:
            if self._phase != ShutdownPhase.RUNNING:
                return {"status": "already_shutting_down", "phase": self._phase.value}

            logger.info("Initiating graceful shutdown")
            summary = {
                "started_at": datetime.now().isoformat(),
                "in_flight_at_start": len(self._in_flight),
                "checkpointed": [],
                "timed_out": []
            }

            # Phase 1: Draining
            self._phase = ShutdownPhase.DRAINING
            logger.info(f"Draining {len(self._in_flight)} in-flight operations")
            if self._in_flight:
                await self._drain(summary)

            # Phase 2: Stopping
            self._phase = ShutdownPhase.STOPPING
            logger.info("Running shutdown hooks")
            await self._run_hooks()

            # Phase 3: Terminated
            self._phase = ShutdownPhase.TERMINATED
            summary["completed_at"] = datetime.now().isoformat()
            summary["checkpoint_data"] = self._checkpoint_data

            logger.info("Graceful shutdown complete")
            return summary

    async def _drain(self, summary: Dict[str, Any]) -> None:
        """Wait for in-flight operations; on timeout record what is left."""
        self._shutdown_event.clear()
        try:
            await asyncio.wait_for(
                self._shutdown_event.wait(),
                timeout=self.config.drain_timeout_seconds
            )
        except asyncio.TimeoutError:
            logger.warning("Drain timeout reached, saving remaining operations")

            # Checkpoint remaining operations
            if self.config.save_in_flight:
                for op_id, op in list(self._in_flight.items()):
                    self._checkpoint_data[op_id] = {
                        "agent_id": op.agent_id,
                        "operation_type": op.operation_type,
                        "data": op.data,
                        "started_at": op.started_at.isoformat()
                    }
                    summary["checkpointed"].append(op_id)

            summary["timed_out"] = list(self._in_flight.keys())

    async def _run_hooks(self) -> None:
        """Await each shutdown hook; a slow or failing hook is only logged."""
        for hook in self._shutdown_hooks:
            try:
                await asyncio.wait_for(
                    hook(),
                    timeout=self.config.force_timeout_seconds
                )
            except asyncio.TimeoutError:
                logger.warning("Shutdown hook timed out")
            except Exception as e:
                logger.error(f"Shutdown hook error: {e}")

    def get_checkpoint_data(self) -> Dict[str, Any]:
        """Shallow copy of the data checkpointed during shutdown."""
        return {**self._checkpoint_data}

    async def restore_from_checkpoint(
        self,
        checkpoint_data: Dict[str, Any]
    ) -> List[InFlightOperation]:
        """Recreate in-flight operations from checkpoint data.

        Each entry is keyed by operation id and must provide ``agent_id``,
        ``operation_type`` and an ISO-format ``started_at``; ``data`` is
        optional. The restored operations are tracked again and returned.
        """
        restored = []
        for op_id, entry in checkpoint_data.items():
            operation = InFlightOperation(
                operation_id=op_id,
                agent_id=entry["agent_id"],
                operation_type=entry["operation_type"],
                started_at=datetime.fromisoformat(entry["started_at"]),
                data=entry.get("data", {})
            )
            self._in_flight[op_id] = operation
            restored.append(operation)

        logger.info(f"Restored {len(restored)} operations from checkpoint")
        return restored
1918
+
1919
+
1920
+ # ============================================================================
1921
+ # ACP-008: Resource Quotas
1922
+ # ============================================================================
1923
+
1924
@dataclass
class AgentResourceQuota:
    """Upper bounds on the resources a single agent may use.

    Attributes:
        memory_mb: Memory limit in megabytes (default 512).
        cpu_percent: CPU limit in percent (default 25.0).
        max_concurrent_operations: Limit on simultaneous operations
            (default 10).
        max_operations_per_minute: Rate limit per minute (default 100).
        network_bandwidth_mbps: Optional bandwidth limit; ``None`` by default.
        storage_mb: Optional storage limit; ``None`` by default.
    """
    memory_mb: int = 512
    cpu_percent: float = 25.0
    max_concurrent_operations: int = 10
    max_operations_per_minute: int = 100
    network_bandwidth_mbps: Optional[float] = None
    storage_mb: Optional[int] = None
1933
+
1934
+
1935
@dataclass
class ResourceUsage:
    """Snapshot of the resources an agent is using.

    Attributes:
        agent_id: Identifier of the agent the figures belong to.
        memory_mb: Memory in use, in megabytes (default 0.0).
        cpu_percent: CPU in use, in percent (default 0.0).
        concurrent_operations: Operations currently running (default 0).
        operations_this_minute: Per-minute operation counter (default 0).
        timestamp: Creation time of the record unless set explicitly.
    """
    agent_id: str
    memory_mb: float = 0.0
    cpu_percent: float = 0.0
    concurrent_operations: int = 0
    operations_this_minute: int = 0
    timestamp: datetime = field(default_factory=datetime.now)
1944
+
1945
+
1946
class ResourceQuotaManager:
    """
    Manages resource quotas and limits per agent.

    Features:
    - Memory and CPU limits
    - Concurrent operation limits
    - Rate limiting (operations per minute)
    - Usage tracking and reporting

    Usage:
        quota_manager = ResourceQuotaManager()

        quota_manager.set_quota("claims-agent", AgentResourceQuota(
            memory_mb=512,
            cpu_percent=25,
            max_concurrent_operations=10
        ))

        # Check before operation
        if quota_manager.can_execute("claims-agent"):
            quota_manager.record_operation_start("claims-agent")
            try:
                ...  # Execute operation
            finally:
                quota_manager.record_operation_end("claims-agent")
    """

    # Smallest number of operation timestamps remembered per agent.
    _MIN_HISTORY = 1000

    def __init__(self):
        self._quotas: Dict[str, AgentResourceQuota] = {}
        self._usage: Dict[str, ResourceUsage] = {}
        # Start timestamps of recent operations, per agent, oldest first.
        self._operation_counts: Dict[str, deque] = defaultdict(
            lambda: deque(maxlen=self._MIN_HISTORY)
        )
        self._lock = asyncio.Lock()

    def set_quota(self, agent_id: str, quota: AgentResourceQuota) -> None:
        """Set (or replace) the quota of an agent.

        Creates an empty usage record for agents seen for the first time and
        makes sure enough operation history is kept to enforce the quota's
        per-minute rate limit.
        """
        self._quotas[agent_id] = quota
        if agent_id not in self._usage:
            self._usage[agent_id] = ResourceUsage(agent_id=agent_id)
        self._ensure_history_capacity(agent_id, quota.max_operations_per_minute)
        logger.info(f"Set quota for {agent_id}: memory={quota.memory_mb}MB, cpu={quota.cpu_percent}%")

    def _ensure_history_capacity(self, agent_id: str, capacity: int) -> None:
        """Grow the agent's timestamp history to hold at least ``capacity`` entries.

        The rate check counts timestamps in the history, so a history
        shorter than the limit could never reach it.
        """
        if capacity <= self._MIN_HISTORY:
            return
        history = self._operation_counts[agent_id]
        if history.maxlen is not None and history.maxlen < capacity:
            # Rebuilding keeps the existing timestamps in order.
            self._operation_counts[agent_id] = deque(history, maxlen=capacity)

    def get_quota(self, agent_id: str) -> Optional[AgentResourceQuota]:
        """Return the quota of an agent, or ``None`` when none is set."""
        return self._quotas.get(agent_id)

    def can_execute(self, agent_id: str) -> bool:
        """Check whether an agent may start a new operation.

        Agents without a quota or without a usage record are always allowed.
        Otherwise the call is refused (and a warning logged) when the agent
        is at its concurrency limit, at its per-minute rate limit, or above
        its memory or CPU quota.
        """
        quota = self._quotas.get(agent_id)
        if not quota:
            return True  # No quota means no limits

        usage = self._usage.get(agent_id)
        if not usage:
            return True

        # Check concurrent operations
        if usage.concurrent_operations >= quota.max_concurrent_operations:
            logger.warning(f"Agent {agent_id} at max concurrent operations")
            return False

        # Check rate limit
        ops_this_minute = self._count_recent_operations(agent_id, seconds=60)
        if ops_this_minute >= quota.max_operations_per_minute:
            logger.warning(f"Agent {agent_id} at rate limit")
            return False

        # Check memory
        if usage.memory_mb > quota.memory_mb:
            logger.warning(f"Agent {agent_id} over memory quota")
            return False

        # Check CPU
        if usage.cpu_percent > quota.cpu_percent:
            logger.warning(f"Agent {agent_id} over CPU quota")
            return False

        return True

    def record_operation_start(self, agent_id: str) -> None:
        """Record that an agent started an operation.

        Increments the concurrent-operation count and stores the start time
        for rate limiting.
        """
        if agent_id not in self._usage:
            self._usage[agent_id] = ResourceUsage(agent_id=agent_id)

        self._usage[agent_id].concurrent_operations += 1
        self._operation_counts[agent_id].append(datetime.now())

    def record_operation_end(self, agent_id: str) -> None:
        """Record that an agent finished an operation.

        The concurrent-operation count never drops below zero; unknown
        agents are ignored.
        """
        usage = self._usage.get(agent_id)
        if usage is not None:
            usage.concurrent_operations = max(0, usage.concurrent_operations - 1)

    def update_resource_usage(
        self,
        agent_id: str,
        memory_mb: Optional[float] = None,
        cpu_percent: Optional[float] = None
    ) -> None:
        """Update the measured memory and/or CPU usage of an agent.

        Arguments left as ``None`` keep their previous value. The usage
        timestamp is refreshed on every call.
        """
        if agent_id not in self._usage:
            self._usage[agent_id] = ResourceUsage(agent_id=agent_id)

        usage = self._usage[agent_id]
        if memory_mb is not None:
            usage.memory_mb = memory_mb
        if cpu_percent is not None:
            usage.cpu_percent = cpu_percent
        usage.timestamp = datetime.now()

    def _count_recent_operations(self, agent_id: str, seconds: int) -> int:
        """Count recorded operation starts within the last ``seconds`` seconds."""
        cutoff = datetime.now() - timedelta(seconds=seconds)
        return sum(1 for started in self._operation_counts.get(agent_id, ()) if started > cutoff)

    def get_usage(self, agent_id: str) -> Optional[ResourceUsage]:
        """Return the usage record of an agent, or ``None`` when unknown."""
        return self._usage.get(agent_id)

    def get_all_usage(self) -> Dict[str, ResourceUsage]:
        """Return a shallow copy of the usage records of all agents."""
        return dict(self._usage)

    def check_quota_violations(self) -> Dict[str, List[str]]:
        """Report agents currently above their memory, CPU or concurrency quota.

        Returns a mapping from agent id to human-readable violation
        messages; agents without violations are omitted.
        """
        violations = {}

        for agent_id, quota in self._quotas.items():
            usage = self._usage.get(agent_id)
            if not usage:
                continue

            agent_violations = []

            if usage.memory_mb > quota.memory_mb:
                agent_violations.append(
                    f"Memory: {usage.memory_mb:.1f}MB > {quota.memory_mb}MB"
                )

            if usage.cpu_percent > quota.cpu_percent:
                agent_violations.append(
                    f"CPU: {usage.cpu_percent:.1f}% > {quota.cpu_percent}%"
                )

            if usage.concurrent_operations > quota.max_concurrent_operations:
                agent_violations.append(
                    f"Concurrent ops: {usage.concurrent_operations} > {quota.max_concurrent_operations}"
                )

            if agent_violations:
                violations[agent_id] = agent_violations

        return violations
2100
+
2101
+
2102
+ # ============================================================================
2103
+ # ACP-009: Agent Observability
2104
+ # ============================================================================
2105
+
2106
@dataclass
class AgentMetric:
    """One recorded metric sample.

    Attributes:
        name: Metric name.
        value: Sampled value.
        labels: Label name/value pairs attached to the sample; empty by
            default.
        timestamp: Creation time of the sample unless set explicitly.
        metric_type: One of "gauge", "counter" or "histogram"; defaults to
            "gauge".
    """
    name: str
    value: float
    labels: Dict[str, str] = field(default_factory=dict)
    timestamp: datetime = field(default_factory=datetime.now)
    metric_type: str = "gauge"  # gauge, counter, histogram
2114
+
2115
+
2116
@dataclass
class AgentLogEntry:
    """One structured log record emitted on behalf of an agent.

    Attributes:
        agent_id: Identifier of the agent the entry belongs to.
        level: Severity name: debug, info, warning, error or critical.
        message: Log message text.
        timestamp: Creation time of the entry unless set explicitly.
        context: Structured key/value context; empty by default.
    """
    agent_id: str
    level: str  # debug, info, warning, error, critical
    message: str
    timestamp: datetime = field(default_factory=datetime.now)
    context: Dict[str, Any] = field(default_factory=dict)
2124
+
2125
+
2126
class AgentObservabilityProvider:
    """
    Built-in observability for agents (metrics and structured logs).

    Features:
    - Structured logging with context
    - Metrics collection (counters, gauges, histograms)
    - Prometheus-compatible export

    All stores are bounded: at most ``max_metrics`` samples are kept per
    agent and per histogram name, and at most ``max_log_entries`` log
    entries overall; the oldest items are dropped first.

    Usage:
        observability = AgentObservabilityProvider()

        # Record metric
        observability.record_metric(
            agent_id="claims-agent",
            name="verification_latency_ms",
            value=150.5,
            labels={"claim_type": "auto"}
        )

        # Log with context
        observability.log(
            agent_id="claims-agent",
            level="info",
            message="Verification completed",
            context={"claim_id": "123", "result": "approved"}
        )

        # Export metrics
        metrics = observability.export_prometheus()
    """

    # Accepted level names mapped to the logger method that handles them.
    # Anything else is logged at info level.
    _LEVEL_METHODS = {
        "debug": "debug",
        "info": "info",
        "warning": "warning",
        "warn": "warning",
        "error": "error",
        "exception": "error",
        "critical": "critical",
        "fatal": "critical",
    }

    def __init__(self, max_log_entries: int = 10000, max_metrics: int = 10000):
        self._metrics: Dict[str, deque] = defaultdict(lambda: deque(maxlen=max_metrics))
        self._logs: deque = deque(maxlen=max_log_entries)
        self._counters: Dict[str, float] = defaultdict(float)
        self._gauges: Dict[str, float] = {}
        # Bounded like the raw samples so a long-running process cannot
        # accumulate histogram observations without limit.
        self._histograms: Dict[str, deque] = defaultdict(lambda: deque(maxlen=max_metrics))
        self._metric_metadata: Dict[str, Dict[str, Any]] = {}
        self._lock = asyncio.Lock()

    def record_metric(
        self,
        agent_id: str,
        name: str,
        value: float,
        labels: Optional[Dict[str, str]] = None,
        metric_type: str = "gauge"
    ) -> None:
        """Record one metric sample for an agent.

        The sample is stored with the given labels plus an ``agent_id``
        label. Counters accumulate ``value`` and gauges keep the latest
        value, both per label set; histograms collect values per metric
        name. The caller's ``labels`` dict is left untouched.
        """
        # Build a new dict: adding agent_id in place would leak into the
        # caller's dict and into later calls that reuse it.
        labels = {**(labels or {}), "agent_id": agent_id}

        metric = AgentMetric(
            name=name,
            value=value,
            labels=labels,
            metric_type=metric_type
        )

        full_name = f"{name}:{self._make_label_key(labels)}"
        self._metrics[agent_id].append(metric)

        # Update aggregates
        if metric_type == "counter":
            self._counters[full_name] += value
        elif metric_type == "gauge":
            self._gauges[full_name] = value
        elif metric_type == "histogram":
            self._histograms[name].append(value)

    def increment_counter(
        self,
        agent_id: str,
        name: str,
        value: float = 1.0,
        labels: Optional[Dict[str, str]] = None
    ) -> None:
        """Add ``value`` (default 1.0) to a counter metric."""
        self.record_metric(agent_id, name, value, labels, metric_type="counter")

    def set_gauge(
        self,
        agent_id: str,
        name: str,
        value: float,
        labels: Optional[Dict[str, str]] = None
    ) -> None:
        """Set a gauge metric to ``value``."""
        self.record_metric(agent_id, name, value, labels, metric_type="gauge")

    def observe_histogram(
        self,
        agent_id: str,
        name: str,
        value: float,
        labels: Optional[Dict[str, str]] = None
    ) -> None:
        """Add one observation to a histogram metric."""
        self.record_metric(agent_id, name, value, labels, metric_type="histogram")

    def log(
        self,
        agent_id: str,
        level: str,
        message: str,
        context: Optional[Dict[str, Any]] = None
    ) -> None:
        """Store a structured log entry and forward it to the module logger.

        ``level`` is matched case-insensitively against the known level
        names; unknown names are forwarded at info level.
        """
        entry = AgentLogEntry(
            agent_id=agent_id,
            level=level,
            message=message,
            context=context or {}
        )
        self._logs.append(entry)

        # Also log to Python logger. Resolving through a fixed table keeps
        # arbitrary level strings from selecting unrelated logger attributes.
        method_name = self._LEVEL_METHODS.get(level.lower(), "info")
        log_func = getattr(logger, method_name)
        log_func(f"[{agent_id}] {message}", extra={"context": context})

    def get_metrics(
        self,
        agent_id: Optional[str] = None,
        name: Optional[str] = None
    ) -> List[AgentMetric]:
        """Return recorded samples, optionally filtered by agent and/or name."""
        if agent_id:
            metrics = list(self._metrics.get(agent_id, []))
        else:
            metrics = []
            for agent_metrics in self._metrics.values():
                metrics.extend(agent_metrics)

        if name:
            metrics = [m for m in metrics if m.name == name]

        return metrics

    def get_logs(
        self,
        agent_id: Optional[str] = None,
        level: Optional[str] = None,
        limit: int = 100
    ) -> List[AgentLogEntry]:
        """Return the newest ``limit`` log entries matching the filters.

        Entries are returned oldest first. A ``limit`` of zero or less
        yields an empty list.
        """
        if limit <= 0:
            # logs[-0:] would be the whole list, not an empty one.
            return []

        logs = list(self._logs)

        if agent_id:
            logs = [entry for entry in logs if entry.agent_id == agent_id]
        if level:
            logs = [entry for entry in logs if entry.level == level]

        return logs[-limit:]

    def export_prometheus(self) -> str:
        """Export aggregated metrics in Prometheus text format.

        Counters and gauges are written as ``name{labels} value`` with one
        ``# TYPE`` line per metric name. Histogram observations are written
        as a summary: count, sum and the 0.5 / 0.9 / 0.99 quantiles.
        """
        lines = []

        self._append_series(lines, self._counters, "counter")
        self._append_series(lines, self._gauges, "gauge")

        # Export histogram summaries
        for name, values in self._histograms.items():
            if not values:
                continue

            # Quantile samples belong to the "summary" type; a "histogram"
            # would need cumulative _bucket{le=...} samples instead.
            lines.append(f"# TYPE {name} summary")
            lines.append(f"{name}_count {len(values)}")
            lines.append(f"{name}_sum {sum(values)}")

            # Calculate percentiles
            sorted_vals = sorted(values)
            last = len(sorted_vals) - 1
            for p in (0.5, 0.9, 0.99):
                idx = min(int(len(sorted_vals) * p), last)
                lines.append(f'{name}{{quantile="{p}"}} {sorted_vals[idx]}')

        return "\n".join(lines)

    def _append_series(self, lines: List[str], series: Dict[str, float], metric_type: str) -> None:
        """Append one metric family per name from ``series`` to ``lines``."""
        # Group samples by metric name so each family is written
        # contiguously under a single TYPE line.
        families: Dict[str, list] = {}
        for full_name, value in series.items():
            name, label_key = self._split_series_key(full_name)
            families.setdefault(name, []).append((label_key, value))

        for name, samples in families.items():
            lines.append(f"# TYPE {name} {metric_type}")
            for label_key, value in samples:
                selector = f"{{{label_key}}}" if label_key else ""
                lines.append(f"{name}{selector} {value}")

    @staticmethod
    def _split_series_key(full_name: str):
        """Split a ``"<name>:<label key>"`` aggregate key into its two parts.

        The separator is the last colon before the first label assignment,
        so colons inside the metric name or inside label values survive.
        """
        first_assignment = full_name.find('="')
        if first_assignment == -1:
            cut = full_name.rfind(":")
        else:
            cut = full_name.rfind(":", 0, first_assignment)
        if cut == -1:
            return full_name, ""
        return full_name[:cut], full_name[cut + 1:]

    def _make_label_key(self, labels: Dict[str, str]) -> str:
        """Create a unique key from labels, sorted by label name.

        Values are escaped (backslash, double quote, newline) so the key can
        be emitted directly as a Prometheus label list.
        """
        def escape(value: Any) -> str:
            return str(value).replace("\\", "\\\\").replace('"', '\\"').replace("\n", "\\n")

        return ",".join(f'{k}="{escape(v)}"' for k, v in sorted(labels.items()))

    def get_agent_summary(self, agent_id: str) -> Dict[str, Any]:
        """Summarise the stored metrics and logs of one agent.

        Totals and level counts cover every stored entry of the agent;
        the ``recent_*`` lists hold the newest ten of each.
        """
        metrics = self.get_metrics(agent_id)
        # Filter the store directly: get_logs() would cap the result at its
        # default limit and under-report the totals.
        logs = [entry for entry in self._logs if entry.agent_id == agent_id]

        return {
            "agent_id": agent_id,
            "total_metrics": len(metrics),
            "total_logs": len(logs),
            "recent_metrics": metrics[-10:] if metrics else [],
            "recent_logs": logs[-10:] if logs else [],
            "log_level_counts": self._count_log_levels(logs)
        }

    def _count_log_levels(self, logs: List[AgentLogEntry]) -> Dict[str, int]:
        """Count log entries per level."""
        counts = defaultdict(int)
        for entry in logs:
            counts[entry.level] += 1
        return dict(counts)
2337
+
2338
+
2339
+ # ============================================================================
2340
+ # ACP-010: Hot Reload
2341
+ # ============================================================================
2342
+
2343
@dataclass
class HotReloadConfig:
    """Settings that control hot reload.

    Attributes:
        enabled: Master switch for hot reload (default True).
        watch_paths: Paths to watch; empty by default.
        reload_delay_seconds: Reload delay in seconds (default 1.0).
        preserve_state: Whether state is preserved across a reload
            (default True).
    """
    enabled: bool = True
    watch_paths: List[str] = field(default_factory=list)
    reload_delay_seconds: float = 1.0
    preserve_state: bool = True
2350
+
2351
+
2352
@dataclass
class ReloadEvent:
    """History record describing one hot reload.

    Attributes:
        agent_id: Identifier of the reloaded agent.
        old_version: Version string before the reload.
        new_version: Version string after the reload.
        timestamp: Creation time of the record unless set explicitly.
        success: Whether the reload succeeded (default True).
        error: Error description, or ``None`` by default.
        preserved_state: State carried over the reload; empty by default.
    """
    agent_id: str
    old_version: str
    new_version: str
    timestamp: datetime = field(default_factory=datetime.now)
    success: bool = True
    error: Optional[str] = None
    preserved_state: Dict[str, Any] = field(default_factory=dict)
2362
+
2363
+
2364
class HotReloadManager:
    """
    Manages hot reload of agent code without full restart.

    Features:
    - Code change detection (SHA-256 of the module's source file)
    - Graceful reload with state preservation
    - Version tracking
    - Rollback support

    A reload stops the registered instance, reloads its module, builds a new
    instance and starts it. If any step fails after the old instance was
    stopped, the old instance is restarted on a best-effort basis so that a
    broken code change does not leave the agent down.

    Usage:
        hot_reload = HotReloadManager(
            config=HotReloadConfig(
                watch_paths=["./agents"],
                preserve_state=True
            )
        )

        # Register agent
        hot_reload.register_agent(
            agent_id="claims-agent",
            module_name="agents.claims",
            class_name="ClaimsAgent"
        )

        # Trigger reload
        await hot_reload.reload_agent("claims-agent")
    """

    def __init__(self, config: Optional[HotReloadConfig] = None):
        """
        Args:
            config: Hot reload settings; a default ``HotReloadConfig`` is
                created when omitted.
        """
        self.config = config or HotReloadConfig()
        # agent_id -> registration details (module/class names, factory,
        # current instance, state extractor/injector).
        self._agents: Dict[str, Dict[str, Any]] = {}
        # agent_id -> version hash of the currently loaded code.
        self._versions: Dict[str, str] = {}
        # agent_id -> {"instance", "version"} captured just before the most
        # recent reload attempt; consumed by rollback_agent().
        self._previous_versions: Dict[str, Any] = {}
        self._reload_history: deque = deque(maxlen=100)
        self._callbacks: Dict[str, List[Callable]] = defaultdict(list)
        # Serialises reloads and rollbacks.
        self._lock = asyncio.Lock()

    @staticmethod
    async def _call_lifecycle(instance: Any, method_name: str) -> None:
        """Call ``instance.<method_name>()`` if it exists, awaiting coroutine functions."""
        if not hasattr(instance, method_name):
            return
        method = getattr(instance, method_name)
        if asyncio.iscoroutinefunction(method):
            await method()
        else:
            method()

    def register_agent(
        self,
        agent_id: str,
        module_name: str,
        class_name: str,
        factory: Optional[Callable[[], Any]] = None,
        instance: Optional[Any] = None,
        state_extractor: Optional[Callable[[Any], Dict[str, Any]]] = None,
        state_injector: Optional[Callable[[Any, Dict[str, Any]], None]] = None
    ) -> None:
        """
        Register an agent for hot reload.

        Args:
            agent_id: Key under which the agent is tracked.
            module_name: Importable module name that defines the agent.
            class_name: Attribute of the module that is the agent class.
            factory: Optional zero-argument callable used to build new
                instances instead of calling the class directly. It may
                return a coroutine, which is awaited during reload.
            instance: The currently running instance, if any.
            state_extractor: Optional callable returning the state to carry
                over from the old instance.
            state_injector: Optional callable applying preserved state to
                the new instance.
        """
        self._agents[agent_id] = {
            "module_name": module_name,
            "class_name": class_name,
            "factory": factory,
            "instance": instance,
            "state_extractor": state_extractor,
            "state_injector": state_injector
        }
        self._versions[agent_id] = self._compute_version(module_name)
        logger.info(f"Registered agent {agent_id} for hot reload (version: {self._versions[agent_id][:8]})")

    def _compute_version(self, module_name: str) -> str:
        """
        Compute a version hash for a module.

        Returns the SHA-256 hex digest of the module's source file when the
        module is already imported and has a ``__file__``. Otherwise (module
        not imported, no file, or the file cannot be read) falls back to the
        SHA-256 of the module name itself.
        """
        try:
            module = sys.modules.get(module_name)
            if module and hasattr(module, '__file__') and module.__file__:
                with open(module.__file__, 'rb') as f:
                    return hashlib.sha256(f.read()).hexdigest()
        except Exception as e:
            logger.warning(f"Could not compute version for {module_name}: {e}")

        return hashlib.sha256(module_name.encode()).hexdigest()

    async def check_for_changes(self, agent_id: str) -> bool:
        """
        Check if an agent's code has changed.

        Returns:
            True when the freshly computed version hash differs from the
            recorded one; False otherwise or when the agent is unknown.
        """
        if agent_id not in self._agents:
            return False

        module_name = self._agents[agent_id]["module_name"]
        new_version = self._compute_version(module_name)
        old_version = self._versions.get(agent_id, "")

        return new_version != old_version

    async def reload_agent(
        self,
        agent_id: str,
        force: bool = False
    ) -> ReloadEvent:
        """
        Reload an agent with optional state preservation.

        Args:
            agent_id: Agent to reload; must have been registered.
            force: Reload even when no code change is detected.

        Returns:
            A ``ReloadEvent`` describing the outcome. When nothing changed
            and ``force`` is false, the event has ``success=True`` and
            ``error="No changes detected"`` and is not added to the history.
            Failures are reported through ``success=False`` rather than by
            raising.

        Raises:
            ValueError: If the agent is not registered for hot reload.
        """
        if agent_id not in self._agents:
            raise ValueError(f"Agent {agent_id} not registered for hot reload")

        async with self._lock:
            agent_info = self._agents[agent_id]
            module_name = agent_info["module_name"]
            class_name = agent_info["class_name"]
            old_version = self._versions.get(agent_id, "unknown")

            # Check if reload needed
            if not force and not await self.check_for_changes(agent_id):
                return ReloadEvent(
                    agent_id=agent_id,
                    old_version=old_version,
                    new_version=old_version,
                    success=True,
                    error="No changes detected"
                )

            old_instance = agent_info.get("instance")
            old_stopped = False   # old instance's stop() completed
            swapped = False       # registry already points at the new instance

            try:
                # Extract state from current instance
                preserved_state = {}
                if self.config.preserve_state and old_instance:
                    extractor = agent_info.get("state_extractor")
                    if extractor:
                        preserved_state = extractor(old_instance)
                    elif hasattr(old_instance, 'get_state'):
                        preserved_state = old_instance.get_state()

                # Stop old instance
                if old_instance and hasattr(old_instance, 'stop'):
                    await self._call_lifecycle(old_instance, 'stop')
                    old_stopped = True

                # Store for potential rollback
                self._previous_versions[agent_id] = {
                    "instance": old_instance,
                    "version": old_version
                }

                # Reload the module
                if module_name in sys.modules:
                    module = importlib.reload(sys.modules[module_name])
                else:
                    module = importlib.import_module(module_name)

                # Create new instance. The class lookup happens even when a
                # factory is registered, so a missing class always fails the
                # reload.
                agent_class = getattr(module, class_name)

                if agent_info.get("factory"):
                    new_instance = agent_info["factory"]()
                else:
                    new_instance = agent_class()

                # Factories may be async and hand back a coroutine.
                if asyncio.iscoroutine(new_instance):
                    new_instance = await new_instance

                # Inject preserved state
                if preserved_state:
                    injector = agent_info.get("state_injector")
                    if injector:
                        injector(new_instance, preserved_state)
                    elif hasattr(new_instance, 'set_state'):
                        new_instance.set_state(preserved_state)

                # Start new instance
                await self._call_lifecycle(new_instance, 'start')

                # Update registry
                agent_info["instance"] = new_instance
                swapped = True
                new_version = self._compute_version(module_name)
                self._versions[agent_id] = new_version

                event = ReloadEvent(
                    agent_id=agent_id,
                    old_version=old_version,
                    new_version=new_version,
                    success=True,
                    preserved_state=preserved_state
                )

                self._reload_history.append(event)
                await self._trigger_callbacks("reload_success", agent_id, event)

                logger.info(f"Hot reloaded agent {agent_id}: {old_version[:8]} -> {new_version[:8]}")
                return event

            except Exception as e:
                # The old instance is still the registered one; if we already
                # stopped it, bring it back so a failed reload does not leave
                # the agent down. Best effort: a restart failure is logged
                # and the original reload error is still reported.
                if old_stopped and not swapped:
                    try:
                        await self._call_lifecycle(old_instance, 'start')
                    except Exception as restart_error:
                        logger.error(
                            f"Could not restart previous instance of {agent_id} "
                            f"after failed reload: {restart_error}"
                        )

                event = ReloadEvent(
                    agent_id=agent_id,
                    old_version=old_version,
                    new_version=old_version,
                    success=False,
                    error=str(e)
                )

                self._reload_history.append(event)
                await self._trigger_callbacks("reload_failed", agent_id, event)

                logger.error(f"Hot reload failed for {agent_id}: {e}")
                return event

    async def rollback_agent(self, agent_id: str) -> bool:
        """
        Rollback an agent to the previous version.

        Stops the current instance, restarts the instance captured before
        the last reload attempt, and restores its version hash. The module
        object itself is not reloaded by this method.

        Returns:
            True on success; False when no previous version is recorded or
            any step raises (the error is logged).
        """
        if agent_id not in self._previous_versions:
            logger.warning(f"No previous version available for {agent_id}")
            return False

        async with self._lock:
            try:
                prev = self._previous_versions[agent_id]
                agent_info = self._agents[agent_id]

                # Stop current instance
                current = agent_info.get("instance")
                if current:
                    await self._call_lifecycle(current, 'stop')

                # Restore previous instance
                prev_instance = prev["instance"]
                if prev_instance:
                    await self._call_lifecycle(prev_instance, 'start')

                agent_info["instance"] = prev_instance
                self._versions[agent_id] = prev["version"]

                logger.info(f"Rolled back agent {agent_id} to version {prev['version'][:8]}")
                return True

            except Exception as e:
                logger.error(f"Rollback failed for {agent_id}: {e}")
                return False

    def get_agent_version(self, agent_id: str) -> Optional[str]:
        """Get the current version hash of an agent, or None if unknown."""
        return self._versions.get(agent_id)

    def get_agent_instance(self, agent_id: str) -> Optional[Any]:
        """Get the current instance of an agent, or None if unknown."""
        if agent_id in self._agents:
            return self._agents[agent_id].get("instance")
        return None

    def get_reload_history(
        self,
        agent_id: Optional[str] = None
    ) -> List[ReloadEvent]:
        """
        Get reload history (at most the 100 most recent events).

        Args:
            agent_id: When given, only events for this agent are returned.
        """
        history = list(self._reload_history)
        if agent_id:
            history = [e for e in history if e.agent_id == agent_id]
        return history

    def on_event(
        self,
        event: str,
        callback: Callable[[str, ReloadEvent], Awaitable[None]]
    ) -> None:
        """
        Register a callback for reload events.

        Args:
            event: Event name; this class emits ``"reload_success"`` and
                ``"reload_failed"``.
            callback: Called with ``(agent_id, reload_event)``. Async
                callbacks are awaited; plain synchronous callables are also
                accepted.
        """
        self._callbacks[event].append(callback)

    async def _trigger_callbacks(
        self,
        event: str,
        agent_id: str,
        reload_event: ReloadEvent
    ) -> None:
        """
        Trigger callbacks for an event.

        Each callback is isolated: an exception is logged and the remaining
        callbacks still run.
        """
        for callback in self._callbacks.get(event, []):
            try:
                outcome = callback(agent_id, reload_event)
                # Only await real awaitables; a synchronous callback returns
                # a plain value and awaiting that would raise TypeError.
                if asyncio.iscoroutine(outcome) or hasattr(outcome, "__await__"):
                    await outcome
            except Exception as e:
                logger.error(f"Hot reload callback error: {e}")
2639
+
2640
+
2641
+ # ============================================================================
2642
+ # Main Agent Registration
2643
+ # ============================================================================
2644
+
2645
@dataclass
class AgentRegistration:
    """
    Everything the control plane records about one registered agent type.

    Attributes:
        agent_type: The agent class; instantiated with no arguments.
        replicas: Number of instances to create at startup.
        dependencies: IDs of agents this agent depends on.
        resources: Optional resource quota for the agent.
        health_config: Optional per-agent health check configuration.
        recovery_config: Optional per-agent recovery configuration.
        circuit_breaker: Optional circuit breaker for the agent.
        metadata: Free-form extra data supplied at registration.
    """
    agent_type: Type
    replicas: int = 1
    dependencies: List[str] = field(default_factory=list)
    resources: Optional[AgentResourceQuota] = None
    health_config: Optional[HealthCheckConfig] = None
    recovery_config: Optional[RecoveryConfig] = None
    circuit_breaker: Optional[CircuitBreaker] = None
    metadata: Dict[str, Any] = field(default_factory=dict)
2656
+
2657
+
2658
+ # ============================================================================
2659
+ # Enhanced Agent Control Plane
2660
+ # ============================================================================
2661
+
2662
class EnhancedAgentControlPlane:
    """
    Enhanced Agent Control Plane with full lifecycle management.

    This is the main interface for managing autonomous AI agents with
    comprehensive lifecycle features including health monitoring,
    auto-recovery, circuit breakers, scaling, distributed coordination,
    dependency management, graceful shutdown, resource quotas,
    observability, and hot reload.

    Usage:
        control_plane = EnhancedAgentControlPlane(
            health_check_interval=30,
            auto_recovery=True,
            circuit_breaker=CircuitBreaker(
                failure_threshold=5,
                recovery_timeout=60
            )
        )

        control_plane.register(
            ClaimsAgent,
            replicas=3,
            dependencies=["message-bus"],
            resources=AgentResourceQuota(
                memory_mb=512,
                cpu_percent=25
            )
        )

        await control_plane.start_all()
    """

    def __init__(
        self,
        health_check_interval: float = 30.0,
        auto_recovery: bool = True,
        circuit_breaker: Optional[CircuitBreaker] = None,
        node_id: Optional[str] = None,
        health_config: Optional[HealthCheckConfig] = None,
        recovery_config: Optional[RecoveryConfig] = None,
        scaling_config: Optional[ScalingConfig] = None,
        shutdown_config: Optional[ShutdownConfig] = None,
        hot_reload_config: Optional[HotReloadConfig] = None
    ):
        """
        Initialize the Enhanced Agent Control Plane.

        Args:
            health_check_interval: Interval between health checks (seconds)
            auto_recovery: Enable automatic recovery of failed agents
            circuit_breaker: Default circuit breaker configuration
            node_id: Node ID for distributed coordination
            health_config: Health check configuration
            recovery_config: Auto-recovery configuration
            scaling_config: Agent scaling configuration
            shutdown_config: Graceful shutdown configuration
            hot_reload_config: Hot reload configuration
        """
        self.node_id = node_id or f"node-{uuid.uuid4().hex[:8]}"

        # Configure health monitoring
        # NOTE: a caller-supplied health_config is modified in place, and its
        # liveness interval is always replaced by health_check_interval.
        health_config = health_config or HealthCheckConfig()
        health_config.liveness_interval_seconds = health_check_interval
        self.health_monitor = HealthMonitor(config=health_config)

        # Configure auto-recovery
        # NOTE: likewise, recovery_config.enabled is overwritten in place.
        recovery_config = recovery_config or RecoveryConfig()
        recovery_config.enabled = auto_recovery
        self.recovery_manager = AutoRecoveryManager(config=recovery_config)

        # Configure circuit breakers
        self.default_circuit_breaker = circuit_breaker
        self.circuit_breaker_registry = CircuitBreakerRegistry()

        # Configure scaling
        self.scaler = AgentScaler()
        self.default_scaling_config = scaling_config or ScalingConfig()

        # Configure distributed coordination
        self.coordinator = DistributedCoordinator(node_id=self.node_id)

        # Configure dependency graph
        self.dependency_graph = DependencyGraph()

        # Configure graceful shutdown
        self.shutdown_manager = GracefulShutdownManager(
            config=shutdown_config or ShutdownConfig()
        )

        # Configure resource quotas
        self.quota_manager = ResourceQuotaManager()

        # Configure observability
        self.observability = AgentObservabilityProvider()

        # Configure hot reload
        self.hot_reload = HotReloadManager(
            config=hot_reload_config or HotReloadConfig()
        )

        # Agent registrations
        self._registrations: Dict[str, AgentRegistration] = {}
        # agent_id -> started replica instances, in creation order.
        self._instances: Dict[str, List[Any]] = defaultdict(list)
        self._running = False

        # Wire up callbacks
        self._setup_callbacks()

    @staticmethod
    async def _call_lifecycle(instance: Any, method_name: str) -> None:
        """Call ``instance.<method_name>()`` if it exists, awaiting coroutine functions."""
        if not hasattr(instance, method_name):
            return
        method = getattr(instance, method_name)
        if asyncio.iscoroutinefunction(method):
            await method()
        else:
            method()

    def _setup_callbacks(self) -> None:
        """Set up internal callbacks between components"""
        # Health -> Recovery: trigger recovery on health failure
        # NOTE(review): replicas are registered with the health monitor under
        # "<agent_id>-<index>" while the recovery manager is keyed by the bare
        # agent_id (see _start_agent). Presumably the id delivered here is the
        # replica id; confirm that handle_failure() can resolve it.
        async def on_liveness_failed(agent_id: str):
            self.observability.log(agent_id, "error", "Liveness check failed")
            self.observability.increment_counter(agent_id, "health_failures_total")
            await self.recovery_manager.handle_failure(agent_id)

        self.health_monitor.on_event("liveness_failed", on_liveness_failed)

        # Recovery -> Health: register recovered agents
        async def on_recovery_success(agent_id: str, event: RecoveryEvent):
            self.observability.log(agent_id, "info", f"Agent recovered (attempt {event.attempt})")
            self.observability.increment_counter(agent_id, "recoveries_total")
            agent = self.recovery_manager.get_agent(agent_id)
            if agent:
                self.health_monitor.register_agent(agent_id, agent)

        self.recovery_manager.on_event("recovery_success", on_recovery_success)

    def register(
        self,
        agent_type: Type,
        agent_id: Optional[str] = None,
        replicas: int = 1,
        dependencies: Optional[List[str]] = None,
        resources: Optional[AgentResourceQuota] = None,
        health_config: Optional[HealthCheckConfig] = None,
        recovery_config: Optional[RecoveryConfig] = None,
        circuit_breaker: Optional[CircuitBreaker] = None,
        **metadata
    ) -> str:
        """
        Register an agent type with the control plane.

        Args:
            agent_type: The agent class to register
            agent_id: Optional agent ID (defaults to class name)
            replicas: Number of replicas to create
            dependencies: List of agent IDs this agent depends on
            resources: Resource quota for this agent
            health_config: Health check configuration
            recovery_config: Auto-recovery configuration
            circuit_breaker: Circuit breaker for this agent
            **metadata: Additional metadata

        Returns:
            The agent ID
        """
        agent_id = agent_id or agent_type.__name__
        dependencies = dependencies or []

        registration = AgentRegistration(
            agent_type=agent_type,
            replicas=replicas,
            dependencies=dependencies,
            resources=resources,
            health_config=health_config,
            recovery_config=recovery_config,
            circuit_breaker=circuit_breaker or self.default_circuit_breaker,
            metadata=metadata
        )

        self._registrations[agent_id] = registration

        # Register with dependency graph
        self.dependency_graph.add_agent(agent_id, depends_on=dependencies)

        # Register with scaler. The class is bound as a default argument so
        # the factory keeps referring to this registration's agent_type.
        self.scaler.register_agent_type(
            agent_type=agent_id,
            factory=lambda at=agent_type: at(),
            config=self.default_scaling_config,
            replicas=replicas
        )

        # Set resource quota if provided
        if resources:
            self.quota_manager.set_quota(agent_id, resources)

        # Register circuit breaker. Only an explicitly passed breaker is
        # added to the registry; the control-plane default is recorded on
        # the registration but not registered here.
        if circuit_breaker:
            self.circuit_breaker_registry._breakers[agent_id] = circuit_breaker

        self.observability.log(
            agent_id, "info",
            f"Registered agent with {replicas} replicas, dependencies: {dependencies}"
        )

        logger.info(f"Registered agent {agent_id}: replicas={replicas}, dependencies={dependencies}")
        return agent_id

    async def start_all(self) -> Dict[str, Any]:
        """
        Start all registered agents in dependency order.

        Returns:
            Summary of startup results. ``"status"`` is ``"already_running"``,
            ``"started"`` or ``"failed"``; ``"errors"`` lists dependency
            validation errors, agents whose startup raised, and agents for
            which no replica could be started.
        """
        if self._running:
            return {"status": "already_running"}

        result = {
            "started_at": datetime.now().isoformat(),
            "agents": {},
            "errors": []
        }

        try:
            # Validate dependency graph
            errors = self.dependency_graph.validate()
            if errors:
                result["errors"] = errors
                # Nothing was started; report that explicitly instead of
                # returning a summary without a status.
                result["status"] = "failed"
                return result

            # Get startup order
            startup_groups = self.dependency_graph.get_parallel_startup_groups()

            # Start coordinator
            await self.coordinator.start()

            # Start health monitor
            await self.health_monitor.start()

            # Start agents in dependency order
            for group in startup_groups:
                # Start agents in this group in parallel
                tasks = [self._start_agent(agent_id) for agent_id in group]

                group_results = await asyncio.gather(*tasks, return_exceptions=True)

                for agent_id, res in zip(group, group_results):
                    if isinstance(res, Exception):
                        result["agents"][agent_id] = {
                            "status": "failed",
                            "error": str(res)
                        }
                        result["errors"].append(f"{agent_id}: {res}")
                    else:
                        result["agents"][agent_id] = res
                        if res.get("status") == "failed":
                            result["errors"].append(
                                f"{agent_id}: no replicas could be started"
                            )

            # Start scaler
            await self.scaler.start()

            self._running = True
            result["status"] = "started"

        except Exception as e:
            result["status"] = "failed"
            result["errors"].append(str(e))
            logger.error(f"Failed to start control plane: {e}")

        return result

    async def _start_agent(self, agent_id: str) -> Dict[str, Any]:
        """
        Start a single agent by creating and starting all of its replicas.

        Returns:
            A dict with the agent id, a per-replica status list, and an
            overall ``"status"``: ``"failed"`` when replicas were requested
            but none started, ``"running"`` otherwise.

        Raises:
            ValueError: If the agent is not registered.
        """
        registration = self._registrations.get(agent_id)
        if not registration:
            raise ValueError(f"Agent {agent_id} not registered")

        result = {
            "agent_id": agent_id,
            "status": "starting",
            "replicas": []
        }

        # Check resource quota
        if registration.resources:
            self.quota_manager.set_quota(agent_id, registration.resources)

        # Create factory for recovery manager
        def create_agent():
            return registration.agent_type()

        # Register with recovery manager
        self.recovery_manager.register_agent(agent_id, create_agent)

        # Create replicas
        running_count = 0
        for i in range(registration.replicas):
            replica_id = f"{agent_id}-{i}"
            try:
                instance = create_agent()

                # Start instance if it has start method
                await self._call_lifecycle(instance, 'start')

                self._instances[agent_id].append(instance)

                # Register with health monitor
                self.health_monitor.register_agent(replica_id, instance)

                result["replicas"].append({
                    "replica_id": replica_id,
                    "status": "running"
                })
                running_count += 1

                self.observability.log(agent_id, "info", f"Started replica {replica_id}")

            except Exception as e:
                result["replicas"].append({
                    "replica_id": replica_id,
                    "status": "failed",
                    "error": str(e)
                })
                self.observability.log(agent_id, "error", f"Failed to start replica {replica_id}: {e}")

        # Do not claim "running" when every requested replica failed.
        if registration.replicas > 0 and running_count == 0:
            result["status"] = "failed"
        else:
            result["status"] = "running"
        return result

    async def stop_all(self) -> Dict[str, Any]:
        """
        Stop all agents gracefully.

        Returns:
            Summary of shutdown results
        """
        if not self._running:
            return {"status": "not_running"}

        # Initiate graceful shutdown
        shutdown_result = await self.shutdown_manager.shutdown()

        # Stop components in reverse order
        await self.scaler.stop()
        await self.health_monitor.stop()
        await self.coordinator.stop()

        # Stop agents in reverse dependency order
        shutdown_order = self.dependency_graph.get_shutdown_order()

        for agent_id in shutdown_order:
            for instance in self._instances.get(agent_id, []):
                # One misbehaving instance must not block the others.
                try:
                    await self._call_lifecycle(instance, 'stop')
                except Exception as e:
                    logger.error(f"Error stopping {agent_id}: {e}")

            self._instances[agent_id].clear()

        self._running = False

        return {
            "status": "stopped",
            "shutdown_result": shutdown_result
        }

    def get_agent(self, agent_id: str, replica_index: int = 0) -> Optional[Any]:
        """Get an agent instance by ID, or None if the replica index is out of range."""
        instances = self._instances.get(agent_id, [])
        if 0 <= replica_index < len(instances):
            return instances[replica_index]
        return None

    async def get_available_agent(self, agent_id: str) -> Optional[Any]:
        """
        Get an available agent instance (load balanced).

        Returns None when the agent's circuit breaker is open or its
        resource quota does not allow execution.
        """
        # Check circuit breaker
        breaker = self.circuit_breaker_registry.get(agent_id)
        if breaker and breaker.is_open:
            return None

        # Check resource quota
        if not self.quota_manager.can_execute(agent_id):
            return None

        # Get replica from scaler
        return await self.scaler.get_replica(agent_id)

    def get_health_status(self, agent_id: str) -> HealthStatus:
        """Get the health status of an agent"""
        return self.health_monitor.get_agent_health(agent_id)

    def get_all_health_status(self) -> Dict[str, HealthStatus]:
        """Get health status for all agents"""
        return self.health_monitor.get_all_health_status()

    def get_circuit_breaker(self, agent_id: str) -> Optional[CircuitBreaker]:
        """Get the circuit breaker for an agent"""
        return self.circuit_breaker_registry.get(agent_id)

    def get_metrics(self) -> str:
        """Get Prometheus-formatted metrics"""
        return self.observability.export_prometheus()

    def get_status(self) -> Dict[str, Any]:
        """Get comprehensive status of the control plane"""
        return {
            "running": self._running,
            "node_id": self.node_id,
            "is_leader": self.coordinator.is_leader,
            "registered_agents": list(self._registrations.keys()),
            "health_status": {
                k: v.value for k, v in self.health_monitor.get_all_health_status().items()
            },
            "circuit_breakers": {
                name: cb.get_metrics().__dict__
                for name, cb in self.circuit_breaker_registry._breakers.items()
            },
            "resource_violations": self.quota_manager.check_quota_violations(),
            "in_flight_operations": self.shutdown_manager.get_in_flight_count()
        }
3081
+
3082
+
3083
+ # Convenience factory function
3084
def create_control_plane(
    health_check_interval: float = 30.0,
    auto_recovery: bool = True,
    circuit_breaker: Optional[CircuitBreaker] = None,
    **kwargs
) -> EnhancedAgentControlPlane:
    """
    Build an ``EnhancedAgentControlPlane`` from the given settings.

    This is the recommended way to create a control plane instance.

    Args:
        health_check_interval: Interval between health checks
        auto_recovery: Enable automatic recovery
        circuit_breaker: Default circuit breaker
        **kwargs: Any further keyword arguments, forwarded unchanged to
            the ``EnhancedAgentControlPlane`` constructor

    Returns:
        Configured EnhancedAgentControlPlane instance
    """
    control_plane = EnhancedAgentControlPlane(
        health_check_interval=health_check_interval,
        auto_recovery=auto_recovery,
        circuit_breaker=circuit_breaker,
        **kwargs
    )
    return control_plane
3110
+
3111
+
3112
# Backwards-compatibility alias: AgentControlPlaneV2 is the same class object
# as EnhancedAgentControlPlane, kept so code using the older name still works.
AgentControlPlaneV2 = EnhancedAgentControlPlane