puffinflow 2.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- puffinflow/__init__.py +132 -0
- puffinflow/core/__init__.py +110 -0
- puffinflow/core/agent/__init__.py +320 -0
- puffinflow/core/agent/base.py +1635 -0
- puffinflow/core/agent/checkpoint.py +50 -0
- puffinflow/core/agent/context.py +521 -0
- puffinflow/core/agent/decorators/__init__.py +90 -0
- puffinflow/core/agent/decorators/builder.py +454 -0
- puffinflow/core/agent/decorators/flexible.py +714 -0
- puffinflow/core/agent/decorators/inspection.py +144 -0
- puffinflow/core/agent/dependencies.py +57 -0
- puffinflow/core/agent/scheduling/__init__.py +21 -0
- puffinflow/core/agent/scheduling/builder.py +160 -0
- puffinflow/core/agent/scheduling/exceptions.py +35 -0
- puffinflow/core/agent/scheduling/inputs.py +137 -0
- puffinflow/core/agent/scheduling/parser.py +209 -0
- puffinflow/core/agent/scheduling/scheduler.py +413 -0
- puffinflow/core/agent/state.py +141 -0
- puffinflow/core/config.py +62 -0
- puffinflow/core/coordination/__init__.py +137 -0
- puffinflow/core/coordination/agent_group.py +359 -0
- puffinflow/core/coordination/agent_pool.py +629 -0
- puffinflow/core/coordination/agent_team.py +577 -0
- puffinflow/core/coordination/coordinator.py +720 -0
- puffinflow/core/coordination/deadlock.py +1759 -0
- puffinflow/core/coordination/fluent_api.py +421 -0
- puffinflow/core/coordination/primitives.py +478 -0
- puffinflow/core/coordination/rate_limiter.py +520 -0
- puffinflow/core/observability/__init__.py +47 -0
- puffinflow/core/observability/agent.py +139 -0
- puffinflow/core/observability/alerting.py +73 -0
- puffinflow/core/observability/config.py +127 -0
- puffinflow/core/observability/context.py +88 -0
- puffinflow/core/observability/core.py +147 -0
- puffinflow/core/observability/decorators.py +105 -0
- puffinflow/core/observability/events.py +71 -0
- puffinflow/core/observability/interfaces.py +196 -0
- puffinflow/core/observability/metrics.py +137 -0
- puffinflow/core/observability/tracing.py +209 -0
- puffinflow/core/reliability/__init__.py +27 -0
- puffinflow/core/reliability/bulkhead.py +96 -0
- puffinflow/core/reliability/circuit_breaker.py +149 -0
- puffinflow/core/reliability/leak_detector.py +122 -0
- puffinflow/core/resources/__init__.py +77 -0
- puffinflow/core/resources/allocation.py +790 -0
- puffinflow/core/resources/pool.py +645 -0
- puffinflow/core/resources/quotas.py +567 -0
- puffinflow/core/resources/requirements.py +217 -0
- puffinflow/version.py +21 -0
- puffinflow-2.dev0.dist-info/METADATA +334 -0
- puffinflow-2.dev0.dist-info/RECORD +55 -0
- puffinflow-2.dev0.dist-info/WHEEL +5 -0
- puffinflow-2.dev0.dist-info/entry_points.txt +3 -0
- puffinflow-2.dev0.dist-info/licenses/LICENSE +21 -0
- puffinflow-2.dev0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,720 @@
|
|
|
1
|
+
"""Coordination system with comprehensive monitoring and control."""
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
import contextlib
|
|
5
|
+
import inspect
|
|
6
|
+
import logging
|
|
7
|
+
import time
|
|
8
|
+
import uuid
|
|
9
|
+
import weakref
|
|
10
|
+
from collections.abc import AsyncGenerator, Awaitable
|
|
11
|
+
from dataclasses import asdict, dataclass
|
|
12
|
+
from typing import Any, Optional, Protocol
|
|
13
|
+
|
|
14
|
+
from .deadlock import DeadlockDetector
|
|
15
|
+
from .primitives import (
|
|
16
|
+
CoordinationPrimitive,
|
|
17
|
+
PrimitiveType,
|
|
18
|
+
)
|
|
19
|
+
from .rate_limiter import RateLimiter, RateLimitStrategy
|
|
20
|
+
|
|
21
|
+
logger = logging.getLogger(__name__)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class AgentProtocol(Protocol):
    """Structural interface an agent must expose to be coordinated.

    The coordinator and ``enhance_agent`` read ``name`` for logging, look up
    states in ``state_metadata``, wrap ``run_state`` and call
    ``_add_to_queue`` to requeue a state whose coordination failed.
    """

    # Identifier used in log messages and status reports.
    name: str
    # Per-state metadata, keyed by state name.
    state_metadata: dict[str, Any]

    async def run_state(self, state_name: str) -> None:
        """Execute the state called ``state_name``."""

    def _add_to_queue(
        self, state_name: str, priority_boost: int = 0
    ) -> Awaitable[None]:
        """Return an awaitable that schedules ``state_name`` for execution."""
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
@dataclass
class CoordinationConfig:
    """Tunable settings consumed by ``AgentCoordinator``."""

    # Passed to the deadlock detector as its ``detection_interval``.
    detection_interval: float = 1.0

    # Sleep, in seconds, between two passes of the background cleanup loop.
    cleanup_interval: float = 60.0

    # Default time budget, in seconds, for one coordination request when the
    # caller does not supply its own timeout.
    max_coordination_timeout: float = 30.0

    # Metrics toggle (not read anywhere in this module).
    enable_metrics: bool = True

    # When true, the coordinator creates and manages a deadlock detector.
    enable_deadlock_detection: bool = True

    # Retry settings (not read anywhere in this module).
    max_retry_attempts: int = 3
    backoff_multiplier: float = 1.5
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
class CoordinationError(Exception):
    """Root of the exception hierarchy raised by the coordination system."""
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
class CoordinationTimeout(CoordinationError):
    """Signals that a coordination request ran out of time."""
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
class AgentCoordinator:
    """Coordinate state execution for a single agent.

    The coordinator owns per-state rate limiters, named coordination
    primitives, an optional deadlock detector and a background task that
    periodically purges expired primitive acquisitions. Request counters are
    kept in ``_coordination_stats`` and exposed through ``get_status``.
    """

    def __init__(
        self, agent: AgentProtocol, config: Optional[CoordinationConfig] = None
    ):
        """Initialize the coordination system.

        Args:
            agent: The agent to coordinate. Only a weak proxy is stored on
                the coordinator; the deadlock detector receives the agent
                object itself.
            config: Configuration for the coordination system. A default
                ``CoordinationConfig`` is used when omitted.
        """
        self.agent = weakref.proxy(agent)
        self.config = config or CoordinationConfig()
        self.instance_id = str(uuid.uuid4())

        # Components
        self.rate_limiters: dict[str, RateLimiter] = {}
        self.primitives: dict[str, CoordinationPrimitive] = {}

        # Initialize deadlock detector if enabled
        self.deadlock_detector: Optional[DeadlockDetector] = None
        if self.config.enable_deadlock_detection:
            self.deadlock_detector = DeadlockDetector(
                agent, detection_interval=self.config.detection_interval
            )

        # State management
        self._cleanup_task: Optional[asyncio.Task] = None
        self._shutting_down = False
        self._start_time: Optional[float] = None
        self._coordination_stats = {
            "total_requests": 0,
            "successful_requests": 0,
            "failed_requests": 0,
            "rate_limited_requests": 0,
            "timeout_requests": 0,
        }

        # Serializes start() and stop() against each other.
        self._state_lock = asyncio.Lock()

        logger.info(
            f"coordinator_initialized: instance_id={self.instance_id}, "
            f"agent_name={agent.name}, detection_interval={self.config.detection_interval}, "
            f"cleanup_interval={self.config.cleanup_interval}, "
            f"deadlock_detection={self.config.enable_deadlock_detection}"
        )

    async def start(self) -> None:
        """Start the deadlock detector (if any) and the cleanup loop.

        Calling ``start`` while a cleanup task is already registered only
        logs a warning.

        Raises:
            CoordinationError: If any component fails to start. The deadlock
                detector is stopped again before the error is raised.
        """
        async with self._state_lock:
            if self._cleanup_task is not None:
                logger.warning(
                    f"coordinator_already_started: instance_id={self.instance_id}"
                )
                return

            self._start_time = time.time()
            self._shutting_down = False

            try:
                if self.deadlock_detector:
                    await self.deadlock_detector.start()

                self._cleanup_task = asyncio.create_task(self._cleanup_loop())

                logger.info(f"coordinator_started: instance_id={self.instance_id}")

            except Exception as e:
                logger.error(
                    f"coordinator_start_failed: instance_id={self.instance_id}, error={e!s}"
                )
                await self._emergency_cleanup()
                raise CoordinationError(f"Failed to start coordinator: {e}") from e

    async def stop(self) -> None:
        """Stop the coordination system gracefully.

        Stops the deadlock detector, cancels the cleanup loop, releases every
        primitive acquisition owned by this coordinator and logs the final
        statistics. Errors are logged, never raised. A second call while the
        coordinator is already shut down is a no-op.
        """
        async with self._state_lock:
            if self._shutting_down:
                return

            self._shutting_down = True
            logger.info(f"coordinator_stopping: instance_id={self.instance_id}")

            try:
                if self.deadlock_detector:
                    await self.deadlock_detector.stop()

                cleanup_task = self._cleanup_task
                if cleanup_task and not cleanup_task.done():
                    cleanup_task.cancel()
                    try:
                        await asyncio.wait_for(cleanup_task, timeout=5.0)
                    except (asyncio.CancelledError, asyncio.TimeoutError):
                        logger.warning(
                            f"cleanup_task_forced_termination: instance_id={self.instance_id}"
                        )
                # Forget the finished task; otherwise start() would treat the
                # coordinator as still running and refuse to restart it.
                self._cleanup_task = None

                await self._release_all_resources()

                # Report zero uptime when stop() is called before start(),
                # instead of the seconds elapsed since the epoch.
                uptime = time.time() - self._start_time if self._start_time else 0.0
                logger.info(
                    f"coordinator_stopped: instance_id={self.instance_id}, "
                    f"uptime={uptime:.2f}, total_requests={self._coordination_stats['total_requests']}, "
                    f"successful_requests={self._coordination_stats['successful_requests']}, "
                    f"failed_requests={self._coordination_stats['failed_requests']}"
                )

            except Exception as e:
                logger.error(
                    f"coordinator_stop_error: instance_id={self.instance_id}, error={e!s}"
                )

    async def _emergency_cleanup(self) -> None:
        """Stop the deadlock detector after a failed start; log any error."""
        try:
            if self.deadlock_detector:
                await self.deadlock_detector.stop()
        except Exception as e:
            logger.error(
                f"emergency_cleanup_failed: instance_id={self.instance_id}, error={e!s}"
            )

    async def _release_all_resources(self) -> None:
        """Release every primitive acquisition owned by this coordinator.

        Owners are matched by the ``"<instance_id>:"`` prefix that
        ``coordinate_state_execution`` puts on its caller IDs.
        """
        caller_prefix = f"{self.instance_id}:"
        released_count = 0
        # Snapshot the primitives: release() is awaited inside the loop, so
        # the dict may be modified by another task in the meantime.
        for primitive in list(self.primitives.values()):
            try:
                for owner in list(primitive._owners):
                    if owner.startswith(caller_prefix):
                        await primitive.release(owner)
                        released_count += 1
            except Exception as e:
                logger.error(
                    f"resource_release_error: primitive={primitive.name}, error={e!s}"
                )

        if released_count > 0:
            logger.info(
                f"released_all_resources: instance_id={self.instance_id}, count={released_count}"
            )

    def add_rate_limiter(
        self,
        name: str,
        max_rate: float,
        strategy: RateLimitStrategy = RateLimitStrategy.TOKEN_BUCKET,
        **kwargs: Any,
    ) -> None:
        """Add a rate limiter.

        ``coordinate_state_execution`` looks limiters up by state name, so
        ``name`` should be the state the limit applies to. An existing
        limiter with the same name is kept and a warning is logged.

        Args:
            name: Name of the rate limiter
            max_rate: Maximum rate, forwarded to ``RateLimiter``
            strategy: Rate limiting strategy
            **kwargs: Additional arguments for the rate limiter
        """
        if name in self.rate_limiters:
            logger.warning(f"rate_limiter_already_exists: name={name}")
            return

        self.rate_limiters[name] = RateLimiter(
            max_rate=max_rate, strategy=strategy, **kwargs
        )

        logger.info(
            f"rate_limiter_added: name={name}, max_rate={max_rate}, strategy={strategy.name}"
        )

    def create_primitive(
        self, name: str, primitive_type: PrimitiveType, **kwargs: Any
    ) -> None:
        """Create a coordination primitive.

        An existing primitive with the same name is kept and a warning is
        logged.

        Args:
            name: Name of the primitive
            primitive_type: Type of coordination primitive
            **kwargs: Additional arguments for the primitive
        """
        if name in self.primitives:
            logger.warning(f"primitive_already_exists: name={name}")
            return

        self.primitives[name] = CoordinationPrimitive(
            name=name, type=primitive_type, **kwargs
        )

        logger.info(
            f"primitive_created: name={name}, type={primitive_type.name}, "
            f"config={kwargs}"
        )

    async def _release_acquired(
        self, acquired: list, caller_id: str
    ) -> None:
        """Release ``caller_id`` from each ``(name, primitive)`` pair, logging errors."""
        for primitive_name, primitive in acquired:
            try:
                await primitive.release(caller_id)
            except Exception as release_error:
                logger.error(
                    f"primitive_release_error: primitive={primitive_name}, "
                    f"caller_id={caller_id}, error={release_error!s}"
                )

    async def coordinate_state_execution(
        self,
        state_name: str,
        timeout: Optional[float] = None,
        coordination_id: Optional[str] = None,
    ) -> bool:
        """Coordinate state execution with rate limiting and resource management.

        First waits on the rate limiter registered under ``state_name`` (if
        any), then acquires every registered primitive in turn under the
        caller ID ``"<instance_id>:<state_name>:<coordination_id>"``. If the
        request does not fully succeed, the primitives acquired so far are
        released again before returning.

        Args:
            state_name: Name of the state to coordinate
            timeout: Optional time budget in seconds for the whole request;
                falls back to ``config.max_coordination_timeout`` when
                omitted or falsy
            coordination_id: Optional ID to embed in the caller ID so the
                caller can later pass the same value to
                ``release_coordination``. A fresh UUID is used when omitted.

        Returns:
            True if coordination successful, False otherwise
        """
        coordination_id = coordination_id or str(uuid.uuid4())
        start_time = time.time()
        timeout = timeout or self.config.max_coordination_timeout

        self._coordination_stats["total_requests"] += 1

        logger.debug(
            f"coordination_request: state={state_name}, "
            f"coordination_id={coordination_id}, timeout={timeout}"
        )

        caller_id = f"{self.instance_id}:{state_name}:{coordination_id}"
        acquired_primitives: list = []
        succeeded = False

        try:
            # Check rate limits
            limiter = self.rate_limiters.get(state_name)
            if limiter is not None and not await asyncio.wait_for(
                limiter.acquire(), timeout=timeout
            ):
                self._coordination_stats["rate_limited_requests"] += 1
                await self._log_coordination_failure(
                    state_name, coordination_id, "rate_limit_exceeded"
                )
                return False

            # Acquire coordination primitives. Iterate over a snapshot since
            # the dict may change while an acquisition is being awaited.
            for primitive_name, primitive in list(self.primitives.items()):
                remaining_timeout = timeout - (time.time() - start_time)
                if remaining_timeout <= 0:
                    raise asyncio.TimeoutError("Coordination timeout")

                if not await asyncio.wait_for(
                    primitive.acquire(caller_id, timeout=remaining_timeout),
                    timeout=remaining_timeout,
                ):
                    await self._log_coordination_failure(
                        state_name,
                        coordination_id,
                        f"primitive_blocked:{primitive_name}",
                    )
                    return False

                acquired_primitives.append((primitive_name, primitive))

            # All coordination successful
            self._coordination_stats["successful_requests"] += 1
            duration = time.time() - start_time

            logger.debug(
                f"coordination_successful: state={state_name}, "
                f"coordination_id={coordination_id}, duration={duration:.3f}, "
                f"acquired_primitives={[name for name, _ in acquired_primitives]}"
            )

            succeeded = True
            return True

        except asyncio.TimeoutError:
            # Covers both the rate limiter wait and primitive acquisition.
            self._coordination_stats["timeout_requests"] += 1
            await self._log_coordination_failure(
                state_name, coordination_id, "timeout"
            )
            return False

        except Exception as e:
            self._coordination_stats["failed_requests"] += 1
            await self._log_coordination_failure(
                state_name, coordination_id, f"exception:{e!s}"
            )
            return False

        finally:
            # Never leave a partial acquisition behind: blocked primitive,
            # timeout, unexpected error and cancellation all end up here.
            if not succeeded and acquired_primitives:
                await self._release_acquired(acquired_primitives, caller_id)

    async def _log_coordination_failure(
        self, state_name: str, coordination_id: str, reason: str
    ) -> None:
        """Log a coordination failure here and, if present, on the agent's monitor."""
        message = (
            f"coordination_failed: state={state_name}, "
            f"coordination_id={coordination_id}, reason={reason}"
        )
        logger.warning(message)

        try:
            # self.agent is a weak proxy; touching it after the agent has
            # been collected raises ReferenceError, so the lookup itself
            # must sit inside the try block.
            monitor = getattr(self.agent, "_monitor", None)
            if monitor is not None:
                monitor.logger.warning(message)
        except Exception as e:
            logger.error(f"monitor_logging_error: {e!s}")

    async def release_coordination(
        self, state_name: str, coordination_id: Optional[str] = None
    ) -> None:
        """Release coordination resources for a state.

        Args:
            state_name: Name of the state
            coordination_id: Optional specific coordination ID. When given,
                the caller ID built from it is released on every primitive;
                when omitted, every owner of this coordinator that belongs
                to ``state_name`` is released.
        """
        caller_id = f"{self.instance_id}:{state_name}:{coordination_id}"
        caller_prefix = f"{self.instance_id}:{state_name}:"

        released_count = 0

        # Snapshot: release() is awaited while looping.
        for primitive_name, primitive in list(self.primitives.items()):
            try:
                if coordination_id:
                    await primitive.release(caller_id)
                    released_count += 1
                else:
                    for owner in list(primitive._owners):
                        if owner.startswith(caller_prefix):
                            await primitive.release(owner)
                            released_count += 1
            except Exception as e:
                logger.error(
                    f"coordination_release_error: primitive={primitive_name}, "
                    f"state={state_name}, error={e!s}"
                )

        logger.debug(
            f"coordination_released: state={state_name}, "
            f"coordination_id={coordination_id}, released_count={released_count}"
        )

    def get_status(self) -> dict[str, Any]:
        """Return a snapshot of configuration, counters and component state.

        ``uptime`` is 0 until the coordinator has been started.
        """
        uptime = time.time() - self._start_time if self._start_time else 0

        return {
            "instance_id": self.instance_id,
            "agent_name": self.agent.name,
            "uptime": uptime,
            "shutting_down": self._shutting_down,
            "config": asdict(self.config),
            "stats": self._coordination_stats.copy(),
            "rate_limiters": {
                name: limiter.get_stats()
                for name, limiter in self.rate_limiters.items()
            },
            "primitives": {
                name: primitive.get_state()
                for name, primitive in self.primitives.items()
            },
            "deadlock_detector": (
                self.deadlock_detector.get_status() if self.deadlock_detector else None
            ),
        }

    async def _cleanup_loop(self) -> None:
        """Purge expired primitive acquisitions until shutdown or cancellation.

        Each cycle asks every primitive to drop its expired owners, then
        sleeps for ``config.cleanup_interval`` seconds. Unexpected errors are
        logged and the loop resumes after one second.
        """
        logger.info(f"cleanup_loop_started: instance_id={self.instance_id}")

        while not self._shutting_down:
            try:
                cleanup_start = time.time()

                cleanup_count = 0
                # Snapshot: the lock acquisition below is an await point, so
                # primitives may be added while this cycle is in progress.
                for primitive in list(self.primitives.values()):
                    try:
                        async with primitive._lock:
                            before_count = len(primitive._owners)
                            primitive._cleanup_expired()
                            after_count = len(primitive._owners)
                            cleanup_count += before_count - after_count
                    except Exception as e:
                        logger.error(
                            f"primitive_cleanup_error: primitive={primitive.name}, error={e!s}"
                        )

                cleanup_duration = time.time() - cleanup_start

                if cleanup_count > 0 or cleanup_duration > 1.0:
                    logger.debug(
                        f"cleanup_cycle_completed: instance_id={self.instance_id}, "
                        f"cleaned_acquisitions={cleanup_count}, duration={cleanup_duration:.3f}"
                    )

                await asyncio.sleep(self.config.cleanup_interval)

            except asyncio.CancelledError:
                # stop() cancels this task and waits for it; exit quietly.
                logger.info(f"cleanup_loop_cancelled: instance_id={self.instance_id}")
                break
            except Exception as e:
                logger.error(
                    f"cleanup_loop_error: instance_id={self.instance_id}, error={e!s}"
                )
                # Continue the loop even on errors
                await asyncio.sleep(1.0)

        logger.info(f"cleanup_loop_stopped: instance_id={self.instance_id}")
|
|
481
|
+
|
|
482
|
+
|
|
483
|
+
def enhance_agent(
    agent: AgentProtocol, config: Optional[CoordinationConfig] = None
) -> AgentProtocol:
    """Add production coordination to an agent by patching it in place.

    The following attributes are set on ``agent``: ``_coordinator``,
    ``run_state`` (wrapped), ``_execution_span``, ``_cleanup`` (wrapped),
    ``get_coordination_status``, ``reset_coordination``,
    ``add_state_rate_limit`` and ``add_state_coordination``. The coordinator
    start is appended to ``agent._startup_tasks`` when that attribute exists;
    otherwise it is scheduled on the running event loop, or left to the
    caller when no loop is running.

    Args:
        agent: The agent to enhance
        config: Optional coordination configuration

    Returns:
        The enhanced agent (the same object that was passed in)
    """
    # Add coordinator
    coordinator = AgentCoordinator(agent, config)
    agent._coordinator = coordinator  # type: ignore

    # Store original methods
    original_run_state = agent.run_state
    original_cleanup = getattr(agent, "_cleanup", None)

    # Handle startup coordination
    async def start_coordinator() -> None:
        await coordinator.start()

    if hasattr(agent, "_startup_tasks"):
        agent._startup_tasks.append(start_coordinator())
    else:
        # Only create task if there's a running event loop
        try:
            asyncio.get_running_loop()
            task = asyncio.create_task(start_coordinator())
            # Store task reference to prevent garbage collection
            if not hasattr(agent, "_coordination_tasks"):
                agent._coordination_tasks = set()  # type: ignore
            agent._coordination_tasks.add(task)  # type: ignore
            task.add_done_callback(lambda t: agent._coordination_tasks.discard(t))  # type: ignore
        except RuntimeError:
            # No running event loop, coordinator will be started manually
            logger.info(f"no_event_loop_for_auto_start: agent={agent.name}")

    async def enhanced_run_state(state_name: str) -> None:
        """Run ``state_name`` under coordination, with monitoring if available."""
        attempt_id = str(uuid.uuid4())
        start_time = time.time()
        coordinated = False

        try:
            # Check coordination and rate limits
            coordinated = await agent._coordinator.coordinate_state_execution(state_name)  # type: ignore
            if not coordinated:
                if hasattr(agent, "_monitor"):
                    agent._monitor.logger.warning(
                        f"coordination_failed: state={state_name}, attempt={attempt_id}"
                    )

                # Requeue with backoff if agent supports it
                if (
                    hasattr(agent, "_add_to_queue")
                    and hasattr(agent, "state_metadata")
                    and state_name in agent.state_metadata
                ):
                    await agent._add_to_queue(
                        state_name,
                        priority_boost=-1,
                    )
                return

            # Log execution start
            if hasattr(agent, "_monitor"):
                metadata = {}
                if (
                    hasattr(agent, "state_metadata")
                    and state_name in agent.state_metadata
                ):
                    state_meta = agent.state_metadata[state_name]
                    metadata = {
                        "resources": (
                            asdict(state_meta.resources)
                            if hasattr(state_meta, "resources")
                            else {}
                        ),
                        "dependencies": len(getattr(state_meta, "dependencies", [])),
                        "attempts": getattr(state_meta, "attempts", 0),
                    }

                agent._monitor.logger.info(
                    f"state_execution_started: state={state_name}, "
                    f"attempt={attempt_id}, metadata={metadata}"
                )

            # Execute original state with monitoring span
            async with agent._execution_span(state_name, attempt_id):  # type: ignore
                await original_run_state(state_name)

            # Record success metrics
            if hasattr(agent, "_monitor"):
                duration = time.time() - start_time
                await agent._monitor.record_metric(
                    "state_duration",
                    duration,
                    {"state": state_name, "status": "success"},
                )
                await agent._monitor.record_metric(
                    "state_success", 1, {"state": state_name}
                )

        except Exception as e:
            # Handle failure with monitoring
            if hasattr(agent, "_monitor"):
                duration = time.time() - start_time
                agent._monitor.logger.error(
                    f"state_execution_failed: state={state_name}, "
                    f"attempt={attempt_id}, error={e!s}, duration={duration:.3f}"
                )
                await agent._monitor.record_metric(
                    "state_duration", duration, {"state": state_name, "status": "error"}
                )
                await agent._monitor.record_metric(
                    "state_error",
                    1,
                    {"state": state_name, "error_type": type(e).__name__},
                )
            raise

        finally:
            # Release only what a successful coordination acquired. The
            # coordinator registers owners under its own coordination ID,
            # which is unrelated to attempt_id, so releasing by attempt_id
            # can never match; release by state instead.
            # NOTE(review): this also releases holds of any other execution
            # of the same state that is in flight on this agent.
            if coordinated:
                await agent._coordinator.release_coordination(state_name)  # type: ignore

    # Bind the enhanced method to the agent
    agent.run_state = enhanced_run_state  # type: ignore

    @contextlib.asynccontextmanager
    async def _execution_span(
        state_name: str, attempt_id: str
    ) -> AsyncGenerator[None, None]:
        """Yield a monitoring span for one state execution, or ``None``.

        ``None`` is yielded when the agent has no monitor or when the span
        cannot be opened. Exceptions raised by the wrapped body propagate
        unchanged.
        """
        if not hasattr(agent, "_monitor"):
            yield None
            return

        span_entered = False
        try:
            async with agent._monitor.monitor_operation(
                "state_execution",
                {"state": state_name, "attempt": attempt_id, "agent": agent.name},
            ) as span:
                span_entered = True
                yield span
            return
        except Exception as e:
            # Once the span has been yielded, the error belongs to the body
            # (or to closing the span). A generator-based context manager may
            # yield only once, so it must be re-raised rather than answered
            # with a second yield.
            if span_entered:
                raise
            logger.error(f"monitor_span_error: {e!s}")

        # Opening the span failed: run the body without monitoring.
        yield None

    agent._execution_span = _execution_span  # type: ignore

    async def enhanced_cleanup() -> None:
        """Stop the coordinator, then run the agent's original cleanup."""
        try:
            # Stop coordinator first
            await agent._coordinator.stop()  # type: ignore

            # Run original cleanup if it exists
            if original_cleanup:
                if inspect.iscoroutinefunction(original_cleanup):
                    await original_cleanup()
                else:
                    original_cleanup()

        except Exception as e:
            if hasattr(agent, "_monitor"):
                agent._monitor.logger.error(f"cleanup_error: error={e!s}")
            logger.error(f"enhanced_cleanup_error: agent={agent.name}, error={e!s}")
            raise

    agent._cleanup = enhanced_cleanup  # type: ignore

    # Utility methods. They look up agent._coordinator on every call so they
    # keep working after reset_coordination() swaps the coordinator.
    async def get_coordination_status() -> dict[str, Any]:
        """Get coordination system status."""
        return agent._coordinator.get_status()  # type: ignore

    async def reset_coordination() -> None:
        """Replace the coordinator with a fresh one using the same config.

        Rate limiters and primitives registered on the old coordinator are
        not carried over.
        """
        old_config = agent._coordinator.config  # type: ignore
        await agent._coordinator.stop()  # type: ignore
        agent._coordinator = AgentCoordinator(agent, old_config)  # type: ignore
        await agent._coordinator.start()  # type: ignore

    def add_state_rate_limit(
        state_name: str,
        max_rate: float,
        strategy: RateLimitStrategy = RateLimitStrategy.TOKEN_BUCKET,
        **kwargs: Any,
    ) -> None:
        """Add rate limit for specific state."""
        agent._coordinator.add_rate_limiter(  # type: ignore
            state_name, max_rate, strategy, **kwargs
        )

    def add_state_coordination(
        state_name: str, primitive_type: PrimitiveType, **kwargs: Any
    ) -> None:
        """Add coordination primitive named ``state_<state_name>``."""
        agent._coordinator.create_primitive(  # type: ignore
            f"state_{state_name}", primitive_type, **kwargs
        )

    # Bind methods to agent
    agent.get_coordination_status = get_coordination_status  # type: ignore
    agent.reset_coordination = reset_coordination  # type: ignore
    agent.add_state_rate_limit = add_state_rate_limit  # type: ignore
    agent.add_state_coordination = add_state_coordination  # type: ignore

    logger.info(
        f"agent_enhanced: agent_name={agent.name}, "
        f"coordinator_id={coordinator.instance_id}"
    )

    return agent
|
|
702
|
+
|
|
703
|
+
|
|
704
|
+
def create_coordinated_agent(
    name: str, config: Optional[CoordinationConfig] = None, **agent_kwargs: Any
) -> Any:
    """Build an ``Agent`` and wrap it with coordination in one step.

    Args:
        name: Name given to the new agent
        config: Optional coordination configuration handed to
            ``enhance_agent``
        **agent_kwargs: Extra keyword arguments forwarded to the ``Agent``
            constructor

    Returns:
        The newly created agent after ``enhance_agent`` has been applied
    """
    # Imported inside the function, as in the original, rather than at
    # module import time.
    from puffinflow.core.agent.base import Agent

    return enhance_agent(Agent(name, **agent_kwargs), config)
|