puffinflow 2.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. puffinflow/__init__.py +132 -0
  2. puffinflow/core/__init__.py +110 -0
  3. puffinflow/core/agent/__init__.py +320 -0
  4. puffinflow/core/agent/base.py +1635 -0
  5. puffinflow/core/agent/checkpoint.py +50 -0
  6. puffinflow/core/agent/context.py +521 -0
  7. puffinflow/core/agent/decorators/__init__.py +90 -0
  8. puffinflow/core/agent/decorators/builder.py +454 -0
  9. puffinflow/core/agent/decorators/flexible.py +714 -0
  10. puffinflow/core/agent/decorators/inspection.py +144 -0
  11. puffinflow/core/agent/dependencies.py +57 -0
  12. puffinflow/core/agent/scheduling/__init__.py +21 -0
  13. puffinflow/core/agent/scheduling/builder.py +160 -0
  14. puffinflow/core/agent/scheduling/exceptions.py +35 -0
  15. puffinflow/core/agent/scheduling/inputs.py +137 -0
  16. puffinflow/core/agent/scheduling/parser.py +209 -0
  17. puffinflow/core/agent/scheduling/scheduler.py +413 -0
  18. puffinflow/core/agent/state.py +141 -0
  19. puffinflow/core/config.py +62 -0
  20. puffinflow/core/coordination/__init__.py +137 -0
  21. puffinflow/core/coordination/agent_group.py +359 -0
  22. puffinflow/core/coordination/agent_pool.py +629 -0
  23. puffinflow/core/coordination/agent_team.py +577 -0
  24. puffinflow/core/coordination/coordinator.py +720 -0
  25. puffinflow/core/coordination/deadlock.py +1759 -0
  26. puffinflow/core/coordination/fluent_api.py +421 -0
  27. puffinflow/core/coordination/primitives.py +478 -0
  28. puffinflow/core/coordination/rate_limiter.py +520 -0
  29. puffinflow/core/observability/__init__.py +47 -0
  30. puffinflow/core/observability/agent.py +139 -0
  31. puffinflow/core/observability/alerting.py +73 -0
  32. puffinflow/core/observability/config.py +127 -0
  33. puffinflow/core/observability/context.py +88 -0
  34. puffinflow/core/observability/core.py +147 -0
  35. puffinflow/core/observability/decorators.py +105 -0
  36. puffinflow/core/observability/events.py +71 -0
  37. puffinflow/core/observability/interfaces.py +196 -0
  38. puffinflow/core/observability/metrics.py +137 -0
  39. puffinflow/core/observability/tracing.py +209 -0
  40. puffinflow/core/reliability/__init__.py +27 -0
  41. puffinflow/core/reliability/bulkhead.py +96 -0
  42. puffinflow/core/reliability/circuit_breaker.py +149 -0
  43. puffinflow/core/reliability/leak_detector.py +122 -0
  44. puffinflow/core/resources/__init__.py +77 -0
  45. puffinflow/core/resources/allocation.py +790 -0
  46. puffinflow/core/resources/pool.py +645 -0
  47. puffinflow/core/resources/quotas.py +567 -0
  48. puffinflow/core/resources/requirements.py +217 -0
  49. puffinflow/version.py +21 -0
  50. puffinflow-2.dev0.dist-info/METADATA +334 -0
  51. puffinflow-2.dev0.dist-info/RECORD +55 -0
  52. puffinflow-2.dev0.dist-info/WHEEL +5 -0
  53. puffinflow-2.dev0.dist-info/entry_points.txt +3 -0
  54. puffinflow-2.dev0.dist-info/licenses/LICENSE +21 -0
  55. puffinflow-2.dev0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,720 @@
1
+ """Coordination system with comprehensive monitoring and control."""
2
+
3
+ import asyncio
4
+ import contextlib
5
+ import inspect
6
+ import logging
7
+ import time
8
+ import uuid
9
+ import weakref
10
+ from collections.abc import AsyncGenerator, Awaitable
11
+ from dataclasses import asdict, dataclass
12
+ from typing import Any, Optional, Protocol
13
+
14
+ from .deadlock import DeadlockDetector
15
+ from .primitives import (
16
+ CoordinationPrimitive,
17
+ PrimitiveType,
18
+ )
19
+ from .rate_limiter import RateLimiter, RateLimitStrategy
20
+
21
+ logger = logging.getLogger(__name__)
22
+
23
+
24
class AgentProtocol(Protocol):
    """Structural type for agents that the coordinator can manage.

    Any object exposing these attributes and methods satisfies the protocol;
    no inheritance is required.
    """

    # Agent name; used in coordinator log messages and status reports.
    name: str
    # Per-state metadata keyed by state name.
    state_metadata: dict[str, Any]

    def _add_to_queue(
        self, state_name: str, priority_boost: int = 0
    ) -> Awaitable[None]:
        """Queue ``state_name`` on the agent and return an awaitable.

        Args:
            state_name: Name of the state to queue.
            priority_boost: Priority adjustment applied to the queued state.
        """

    async def run_state(self, state_name: str) -> None:
        """Run the state called ``state_name``."""
37
+
38
+
39
@dataclass
class CoordinationConfig:
    """Tunable settings for the coordination system."""

    # Passed to the deadlock detector as its ``detection_interval``.
    detection_interval: float = 1.0
    # Seconds the background cleanup loop sleeps between passes.
    cleanup_interval: float = 60.0
    # Default timeout for a coordination request when the caller gives none.
    max_coordination_timeout: float = 30.0
    # NOTE(review): not read anywhere in this module - confirm where it is used.
    enable_metrics: bool = True
    # When True the coordinator creates a deadlock detector for the agent.
    enable_deadlock_detection: bool = True
    # NOTE(review): not read anywhere in this module - confirm where it is used.
    max_retry_attempts: int = 3
    # NOTE(review): not read anywhere in this module - confirm where it is used.
    backoff_multiplier: float = 1.5
50
+
51
+
52
class CoordinationError(Exception):
    """Root of the exception hierarchy raised by the coordination system."""
56
+
57
+
58
class CoordinationTimeout(CoordinationError):
    """Signals that a coordination request ran out of time."""
62
+
63
+
64
class AgentCoordinator:
    """Coordinate an agent's state executions via rate limiters and primitives.

    The coordinator owns named ``RateLimiter`` and ``CoordinationPrimitive``
    objects, an optional ``DeadlockDetector`` and a background cleanup task.
    Primitives are acquired under a caller id of the form
    ``"<instance_id>:<state_name>:<coordination_id>"`` so that holdings can be
    released per coordination, per state, or for the whole coordinator.
    """

    def __init__(
        self, agent: AgentProtocol, config: Optional[CoordinationConfig] = None
    ):
        """Initialize the coordination system.

        Args:
            agent: The agent to coordinate. The coordinator keeps only a weak
                proxy to it; the deadlock detector receives the agent itself.
            config: Configuration for the coordination system. Defaults to
                ``CoordinationConfig()``.
        """
        self.agent = weakref.proxy(agent)
        self.config = config or CoordinationConfig()
        self.instance_id = str(uuid.uuid4())

        # Components, keyed by name.
        self.rate_limiters: dict[str, RateLimiter] = {}
        self.primitives: dict[str, CoordinationPrimitive] = {}

        # Initialize deadlock detector if enabled
        self.deadlock_detector: Optional[DeadlockDetector] = None
        if self.config.enable_deadlock_detection:
            self.deadlock_detector = DeadlockDetector(
                agent, detection_interval=self.config.detection_interval
            )

        # State management
        self._cleanup_task: Optional[asyncio.Task] = None
        self._shutting_down = False
        self._start_time: Optional[float] = None
        self._coordination_stats = {
            "total_requests": 0,
            "successful_requests": 0,
            "failed_requests": 0,
            "rate_limited_requests": 0,
            "timeout_requests": 0,
        }

        # Serializes start()/stop() between coroutines (an asyncio lock, so it
        # gives no protection across OS threads).
        self._state_lock = asyncio.Lock()

        logger.info(
            f"coordinator_initialized: instance_id={self.instance_id}, "
            f"agent_name={agent.name}, detection_interval={self.config.detection_interval}, "
            f"cleanup_interval={self.config.cleanup_interval}, "
            f"deadlock_detection={self.config.enable_deadlock_detection}"
        )

    def _uptime(self) -> float:
        """Return seconds since the last successful ``start()``, or 0.0 if none."""
        if self._start_time is None:
            return 0.0
        return time.time() - self._start_time

    async def start(self) -> None:
        """Start the deadlock detector (if any) and the background cleanup task.

        Calling this while a cleanup task is registered only logs a warning.

        Raises:
            CoordinationError: If any component fails to start.
        """
        async with self._state_lock:
            if self._cleanup_task is not None:
                logger.warning(
                    f"coordinator_already_started: instance_id={self.instance_id}"
                )
                return

            self._start_time = time.time()
            self._shutting_down = False

            try:
                # Start deadlock detector
                if self.deadlock_detector:
                    await self.deadlock_detector.start()

                # Start cleanup task
                self._cleanup_task = asyncio.create_task(self._cleanup_loop())

                logger.info(f"coordinator_started: instance_id={self.instance_id}")

            except Exception as e:
                logger.error(
                    f"coordinator_start_failed: instance_id={self.instance_id}, error={e!s}"
                )
                await self._emergency_cleanup()
                # A failed start must not leave the uptime clock running.
                self._start_time = None
                raise CoordinationError(f"Failed to start coordinator: {e}") from e

    async def stop(self) -> None:
        """Stop the coordination system gracefully.

        Stops the deadlock detector, cancels the cleanup task, releases every
        primitive acquisition made by this coordinator instance and logs final
        statistics. Errors are logged, not raised. A second call while already
        shutting down is a no-op.
        """
        async with self._state_lock:
            if self._shutting_down:
                return

            self._shutting_down = True
            logger.info(f"coordinator_stopping: instance_id={self.instance_id}")

            try:
                # Stop deadlock detector
                if self.deadlock_detector:
                    await self.deadlock_detector.stop()

                # Cancel and wait for cleanup task
                await self._cancel_cleanup_task()

                # Release all coordination resources
                await self._release_all_resources()

                # Log final statistics
                uptime = self._uptime()
                logger.info(
                    f"coordinator_stopped: instance_id={self.instance_id}, "
                    f"uptime={uptime:.2f}, total_requests={self._coordination_stats['total_requests']}, "
                    f"successful_requests={self._coordination_stats['successful_requests']}, "
                    f"failed_requests={self._coordination_stats['failed_requests']}"
                )

            except Exception as e:
                logger.error(
                    f"coordinator_stop_error: instance_id={self.instance_id}, error={e!s}"
                )

    async def _cancel_cleanup_task(self) -> None:
        """Cancel the cleanup task, wait up to 5 seconds for it, and forget it."""
        task = self._cleanup_task
        # Clear the handle so a later start() creates a fresh cleanup task
        # instead of reporting "already started".
        self._cleanup_task = None
        if task is None or task.done():
            return

        task.cancel()
        try:
            await asyncio.wait_for(task, timeout=5.0)
        except (asyncio.CancelledError, asyncio.TimeoutError):
            logger.warning(
                f"cleanup_task_forced_termination: instance_id={self.instance_id}"
            )

    async def _emergency_cleanup(self) -> None:
        """Stop the deadlock detector after a failed startup; errors are logged."""
        try:
            if self.deadlock_detector:
                await self.deadlock_detector.stop()
        except Exception as e:
            logger.error(
                f"emergency_cleanup_failed: instance_id={self.instance_id}, error={e!s}"
            )

    async def _release_all_resources(self) -> None:
        """Release every primitive acquisition owned by this coordinator instance."""
        caller_prefix = f"{self.instance_id}:"
        released_count = 0
        # Snapshot: release() awaits, and the dict may change in the meantime.
        for primitive in list(self.primitives.values()):
            try:
                for owner in list(primitive._owners):
                    if owner.startswith(caller_prefix):
                        await primitive.release(owner)
                        released_count += 1
            except Exception as e:
                logger.error(
                    f"resource_release_error: primitive={primitive.name}, error={e!s}"
                )

        if released_count > 0:
            logger.info(
                f"released_all_resources: instance_id={self.instance_id}, count={released_count}"
            )

    def add_rate_limiter(
        self,
        name: str,
        max_rate: float,
        strategy: RateLimitStrategy = RateLimitStrategy.TOKEN_BUCKET,
        **kwargs: Any,
    ) -> None:
        """Add a rate limiter.

        ``coordinate_state_execution`` looks limiters up by state name, so
        ``name`` must equal the state it should throttle. An existing limiter
        with the same name is kept and a warning is logged.

        Args:
            name: Name of the rate limiter
            max_rate: Maximum rate (requests per second)
            strategy: Rate limiting strategy
            **kwargs: Additional arguments for the rate limiter
        """
        if name in self.rate_limiters:
            logger.warning(f"rate_limiter_already_exists: name={name}")
            return

        self.rate_limiters[name] = RateLimiter(
            max_rate=max_rate, strategy=strategy, **kwargs
        )

        logger.info(
            f"rate_limiter_added: name={name}, max_rate={max_rate}, strategy={strategy.name}"
        )

    def create_primitive(
        self, name: str, primitive_type: PrimitiveType, **kwargs: Any
    ) -> None:
        """Create a coordination primitive.

        Every registered primitive is acquired on each coordination request,
        whatever the state. An existing primitive with the same name is kept
        and a warning is logged.

        Args:
            name: Name of the primitive
            primitive_type: Type of coordination primitive
            **kwargs: Additional arguments for the primitive
        """
        if name in self.primitives:
            logger.warning(f"primitive_already_exists: name={name}")
            return

        self.primitives[name] = CoordinationPrimitive(
            name=name, type=primitive_type, **kwargs
        )

        logger.info(
            f"primitive_created: name={name}, type={primitive_type.name}, "
            f"config={kwargs}"
        )

    async def coordinate_state_execution(
        self, state_name: str, timeout: Optional[float] = None
    ) -> bool:
        """Coordinate state execution with rate limiting and resource management.

        Acquires the rate limiter registered under ``state_name`` (if any) and
        then every registered primitive, all within one shared time budget.
        On any failure the primitives already acquired for this request are
        released again, so a False result leaves nothing held.

        Every request lands in exactly one stats bucket: successful,
        rate-limited, timeout, or failed (blocked primitive or exception).

        Args:
            state_name: Name of the state to coordinate
            timeout: Optional timeout for coordination; ``None`` or 0 falls
                back to ``config.max_coordination_timeout``

        Returns:
            True if coordination successful, False otherwise
        """
        coordination_id = str(uuid.uuid4())
        start_time = time.time()
        timeout = timeout or self.config.max_coordination_timeout

        self._coordination_stats["total_requests"] += 1

        logger.debug(
            f"coordination_request: state={state_name}, "
            f"coordination_id={coordination_id}, timeout={timeout}"
        )

        caller_id = f"{self.instance_id}:{state_name}:{coordination_id}"
        acquired_primitives: list[tuple[str, CoordinationPrimitive]] = []

        try:
            # Check rate limits
            limiter = self.rate_limiters.get(state_name)
            if limiter is not None and not await asyncio.wait_for(
                limiter.acquire(), timeout=timeout
            ):
                self._coordination_stats["rate_limited_requests"] += 1
                await self._log_coordination_failure(
                    state_name, coordination_id, "rate_limit_exceeded"
                )
                return False

            # Check coordination primitives. Iterate a snapshot because the
            # loop awaits and primitives may be registered concurrently.
            for primitive_name, primitive in list(self.primitives.items()):
                remaining_timeout = timeout - (time.time() - start_time)
                if remaining_timeout <= 0:
                    raise asyncio.TimeoutError("Coordination timeout")

                if not await asyncio.wait_for(
                    primitive.acquire(caller_id, timeout=remaining_timeout),
                    timeout=remaining_timeout,
                ):
                    self._coordination_stats["failed_requests"] += 1
                    await self._release_acquired(acquired_primitives, caller_id)
                    await self._log_coordination_failure(
                        state_name,
                        coordination_id,
                        f"primitive_blocked:{primitive_name}",
                    )
                    return False

                acquired_primitives.append((primitive_name, primitive))

        except asyncio.TimeoutError:
            # Covers both the rate limiter wait and the primitive waits.
            self._coordination_stats["timeout_requests"] += 1
            await self._release_acquired(acquired_primitives, caller_id)
            await self._log_coordination_failure(
                state_name, coordination_id, "timeout"
            )
            return False

        except Exception as e:
            self._coordination_stats["failed_requests"] += 1
            await self._release_acquired(acquired_primitives, caller_id)
            await self._log_coordination_failure(
                state_name, coordination_id, f"exception:{e!s}"
            )
            return False

        # All coordination successful
        self._coordination_stats["successful_requests"] += 1
        duration = time.time() - start_time

        logger.debug(
            f"coordination_successful: state={state_name}, "
            f"coordination_id={coordination_id}, duration={duration:.3f}, "
            f"acquired_primitives={[name for name, _ in acquired_primitives]}"
        )

        return True

    async def _release_acquired(
        self,
        acquired_primitives: list[tuple[str, CoordinationPrimitive]],
        caller_id: str,
    ) -> None:
        """Release the primitives a failed request had already acquired."""
        for primitive_name, primitive in acquired_primitives:
            try:
                await primitive.release(caller_id)
            except Exception as release_error:
                logger.error(
                    f"primitive_release_error: primitive={primitive_name}, "
                    f"caller_id={caller_id}, error={release_error!s}"
                )

    async def _log_coordination_failure(
        self, state_name: str, coordination_id: str, reason: str
    ) -> None:
        """Log a coordination failure here and on the agent's monitor, if any."""
        logger.warning(
            f"coordination_failed: state={state_name}, "
            f"coordination_id={coordination_id}, reason={reason}"
        )

        try:
            # Inside the try: touching a weak proxy whose agent is gone raises
            # ReferenceError, which hasattr() does not swallow.
            if hasattr(self.agent, "_monitor"):
                self.agent._monitor.logger.warning(
                    f"coordination_failed: state={state_name}, "
                    f"coordination_id={coordination_id}, reason={reason}"
                )
        except Exception as e:
            logger.error(f"monitor_logging_error: {e!s}")

    async def release_coordination(
        self, state_name: str, coordination_id: Optional[str] = None
    ) -> None:
        """Release coordination resources for a state.

        With ``coordination_id`` the exact caller id is released on every
        primitive. Without it, every owner whose id starts with
        ``"<instance_id>:<state_name>:"`` is released. Errors are logged per
        primitive and do not stop the remaining releases.

        Args:
            state_name: Name of the state
            coordination_id: Optional specific coordination ID
        """
        caller_prefix = f"{self.instance_id}:{state_name}:"
        caller_id = f"{caller_prefix}{coordination_id}" if coordination_id else None

        # Counts release calls issued, not confirmed releases.
        released_count = 0

        for primitive_name, primitive in list(self.primitives.items()):
            try:
                if caller_id is not None:
                    await primitive.release(caller_id)
                    released_count += 1
                else:
                    # Release all matching coordination IDs
                    for owner in list(primitive._owners):
                        if owner.startswith(caller_prefix):
                            await primitive.release(owner)
                            released_count += 1
            except Exception as e:
                logger.error(
                    f"coordination_release_error: primitive={primitive_name}, "
                    f"state={state_name}, error={e!s}"
                )

        logger.debug(
            f"coordination_released: state={state_name}, "
            f"coordination_id={coordination_id}, released_count={released_count}"
        )

    def get_status(self) -> dict[str, Any]:
        """Return a snapshot of ids, uptime, config, stats and component state."""
        return {
            "instance_id": self.instance_id,
            "agent_name": self.agent.name,
            "uptime": self._uptime(),
            "shutting_down": self._shutting_down,
            "config": asdict(self.config),
            "stats": self._coordination_stats.copy(),
            "rate_limiters": {
                name: limiter.get_stats()
                for name, limiter in self.rate_limiters.items()
            },
            "primitives": {
                name: primitive.get_state()
                for name, primitive in self.primitives.items()
            },
            "deadlock_detector": (
                self.deadlock_detector.get_status() if self.deadlock_detector else None
            ),
        }

    async def _cleanup_loop(self) -> None:
        """Background loop that purges expired primitive acquisitions.

        Runs until ``_shutting_down`` is set or the task is cancelled, sleeping
        ``config.cleanup_interval`` seconds between passes and 1 second after
        an unexpected error.
        """
        logger.info(f"cleanup_loop_started: instance_id={self.instance_id}")

        while not self._shutting_down:
            try:
                cleanup_start = time.time()

                # Clean up expired primitive acquisitions
                cleanup_count = 0
                # Snapshot: taking each primitive's lock awaits.
                for primitive in list(self.primitives.values()):
                    try:
                        async with primitive._lock:
                            before_count = len(primitive._owners)
                            primitive._cleanup_expired()
                            after_count = len(primitive._owners)
                            cleanup_count += before_count - after_count
                    except Exception as e:
                        logger.error(
                            f"primitive_cleanup_error: primitive={primitive.name}, error={e!s}"
                        )

                cleanup_duration = time.time() - cleanup_start

                if cleanup_count > 0 or cleanup_duration > 1.0:
                    logger.debug(
                        f"cleanup_cycle_completed: instance_id={self.instance_id}, "
                        f"cleaned_acquisitions={cleanup_count}, duration={cleanup_duration:.3f}"
                    )

                await asyncio.sleep(self.config.cleanup_interval)

            except asyncio.CancelledError:
                logger.info(f"cleanup_loop_cancelled: instance_id={self.instance_id}")
                break
            except Exception as e:
                logger.error(
                    f"cleanup_loop_error: instance_id={self.instance_id}, error={e!s}"
                )
                # Continue the loop even on errors
                await asyncio.sleep(1.0)

        logger.info(f"cleanup_loop_stopped: instance_id={self.instance_id}")
481
+
482
+
483
def enhance_agent(
    agent: AgentProtocol, config: Optional[CoordinationConfig] = None
) -> AgentProtocol:
    """Add production coordination to an agent with proper method binding.

    Attaches an ``AgentCoordinator`` as ``agent._coordinator``, replaces
    ``agent.run_state`` and ``agent._cleanup`` with coordinating wrappers, and
    adds ``_execution_span``, ``get_coordination_status``,
    ``reset_coordination``, ``add_state_rate_limit`` and
    ``add_state_coordination`` to the agent.

    The coordinator is started through ``agent._startup_tasks`` when the agent
    has that attribute, otherwise as a task on the running event loop. With no
    running loop it must be started manually.

    Args:
        agent: The agent to enhance
        config: Optional coordination configuration

    Returns:
        The enhanced agent (the same object, mutated in place)
    """
    # Add coordinator
    coordinator = AgentCoordinator(agent, config)
    agent._coordinator = coordinator  # type: ignore

    # Store original methods before they are replaced below
    original_run_state = agent.run_state
    original_cleanup = getattr(agent, "_cleanup", None)

    # Handle startup coordination
    async def start_coordinator() -> None:
        await coordinator.start()

    if hasattr(agent, "_startup_tasks"):
        agent._startup_tasks.append(start_coordinator())  # type: ignore
    else:
        # Only create task if there's a running event loop
        try:
            asyncio.get_running_loop()
        except RuntimeError:
            # No running event loop, coordinator will be started manually
            logger.info(f"no_event_loop_for_auto_start: agent={agent.name}")
        else:
            task = asyncio.create_task(start_coordinator())
            # Store task reference to prevent garbage collection
            if not hasattr(agent, "_coordination_tasks"):
                agent._coordination_tasks = set()  # type: ignore
            agent._coordination_tasks.add(task)  # type: ignore
            task.add_done_callback(lambda t: agent._coordination_tasks.discard(t))  # type: ignore

    # Enhanced run_state with proper binding
    async def enhanced_run_state(state_name: str) -> None:
        """Enhanced state execution with coordination and monitoring."""
        attempt_id = str(uuid.uuid4())
        start_time = time.time()
        # Pin the coordinator for this run: reset_coordination() may swap
        # agent._coordinator while the state is executing.
        active_coordinator = agent._coordinator  # type: ignore
        coordinated = False

        try:
            # Check coordination and rate limits
            coordinated = await active_coordinator.coordinate_state_execution(
                state_name
            )
            if not coordinated:
                if hasattr(agent, "_monitor"):
                    agent._monitor.logger.warning(  # type: ignore
                        f"coordination_failed: state={state_name}, attempt={attempt_id}"
                    )

                # Requeue with backoff if agent supports it
                if (
                    hasattr(agent, "_add_to_queue")
                    and hasattr(agent, "state_metadata")
                    and state_name in agent.state_metadata
                ):
                    await agent._add_to_queue(
                        state_name,
                        priority_boost=-1,
                    )
                return

            # Log execution start
            if hasattr(agent, "_monitor"):
                metadata = {}
                if (
                    hasattr(agent, "state_metadata")
                    and state_name in agent.state_metadata
                ):
                    state_meta = agent.state_metadata[state_name]
                    metadata = {
                        "resources": (
                            asdict(state_meta.resources)
                            if hasattr(state_meta, "resources")
                            else {}
                        ),
                        "dependencies": len(getattr(state_meta, "dependencies", [])),
                        "attempts": getattr(state_meta, "attempts", 0),
                    }

                agent._monitor.logger.info(  # type: ignore
                    f"state_execution_started: state={state_name}, "
                    f"attempt={attempt_id}, metadata={metadata}"
                )

            # Execute original state with monitoring span
            async with agent._execution_span(state_name, attempt_id):  # type: ignore
                await original_run_state(state_name)

            # Record success metrics
            if hasattr(agent, "_monitor"):
                duration = time.time() - start_time
                await agent._monitor.record_metric(  # type: ignore
                    "state_duration",
                    duration,
                    {"state": state_name, "status": "success"},
                )
                await agent._monitor.record_metric(  # type: ignore
                    "state_success", 1, {"state": state_name}
                )

        except Exception as e:
            # Handle failure with monitoring
            if hasattr(agent, "_monitor"):
                duration = time.time() - start_time
                agent._monitor.logger.error(  # type: ignore
                    f"state_execution_failed: state={state_name}, "
                    f"attempt={attempt_id}, error={e!s}, duration={duration:.3f}"
                )
                await agent._monitor.record_metric(  # type: ignore
                    "state_duration", duration, {"state": state_name, "status": "error"}
                )
                await agent._monitor.record_metric(  # type: ignore
                    "state_error",
                    1,
                    {"state": state_name, "error_type": type(e).__name__},
                )
            raise

        finally:
            # Release only what this run acquired. The coordinator generates
            # its own coordination id and returns just a bool, so attempt_id
            # never matches an owner; release by state prefix instead.
            # NOTE(review): this also frees holdings of any concurrent run of
            # the same state on this agent - confirm states are not run
            # concurrently with themselves.
            if coordinated:
                await active_coordinator.release_coordination(state_name)

    # Bind the enhanced method to the agent
    agent.run_state = enhanced_run_state  # type: ignore

    # Add execution span context manager
    @contextlib.asynccontextmanager
    async def _execution_span(
        state_name: str, attempt_id: str
    ) -> AsyncGenerator[None, None]:
        """Create execution span for monitoring.

        Yields the monitor's span, or ``None`` when the agent has no monitor
        or the span could not be opened. Exceptions raised by the wrapped
        body pass through the span and propagate unchanged.
        """
        async with contextlib.AsyncExitStack() as stack:
            span = None
            if hasattr(agent, "_monitor"):
                # Guard only the opening of the span. The yield stays outside
                # the try so a failure in the body is never mistaken for a
                # monitor error (a second yield after athrow() would make
                # contextlib raise RuntimeError).
                try:
                    span = await stack.enter_async_context(
                        agent._monitor.monitor_operation(  # type: ignore
                            "state_execution",
                            {
                                "state": state_name,
                                "attempt": attempt_id,
                                "agent": agent.name,
                            },
                        )
                    )
                except Exception as e:
                    logger.error(f"monitor_span_error: {e!s}")
                    span = None
            yield span

    agent._execution_span = _execution_span  # type: ignore

    # Enhanced cleanup
    async def enhanced_cleanup() -> None:
        """Enhanced cleanup with coordination system shutdown."""
        try:
            # Stop coordinator first
            await agent._coordinator.stop()  # type: ignore

            # Run original cleanup if it exists
            if original_cleanup:
                if inspect.iscoroutinefunction(original_cleanup):
                    await original_cleanup()
                else:
                    original_cleanup()

        except Exception as e:
            if hasattr(agent, "_monitor"):
                agent._monitor.logger.error(f"cleanup_error: error={e!s}")  # type: ignore
            logger.error(f"enhanced_cleanup_error: agent={agent.name}, error={e!s}")
            raise

    agent._cleanup = enhanced_cleanup  # type: ignore

    # Utility methods; they look up agent._coordinator at call time so they
    # keep working after reset_coordination() installs a new coordinator.
    async def get_coordination_status() -> dict[str, Any]:
        """Get coordination system status."""
        return agent._coordinator.get_status()  # type: ignore

    async def reset_coordination() -> None:
        """Replace the coordinator with a fresh, started one using the same config.

        The replacement starts with no rate limiters or primitives.
        """
        old_config = agent._coordinator.config  # type: ignore
        await agent._coordinator.stop()  # type: ignore
        agent._coordinator = AgentCoordinator(agent, old_config)  # type: ignore
        await agent._coordinator.start()  # type: ignore

    def add_state_rate_limit(
        state_name: str,
        max_rate: float,
        strategy: RateLimitStrategy = RateLimitStrategy.TOKEN_BUCKET,
        **kwargs: Any,
    ) -> None:
        """Add rate limit for specific state."""
        agent._coordinator.add_rate_limiter(  # type: ignore
            state_name, max_rate, strategy, **kwargs
        )

    def add_state_coordination(
        state_name: str, primitive_type: PrimitiveType, **kwargs: Any
    ) -> None:
        """Add coordination primitive named ``state_<state_name>``."""
        agent._coordinator.create_primitive(  # type: ignore
            f"state_{state_name}", primitive_type, **kwargs
        )

    # Bind methods to agent
    agent.get_coordination_status = get_coordination_status  # type: ignore
    agent.reset_coordination = reset_coordination  # type: ignore
    agent.add_state_rate_limit = add_state_rate_limit  # type: ignore
    agent.add_state_coordination = add_state_coordination  # type: ignore

    logger.info(
        f"agent_enhanced: agent_name={agent.name}, "
        f"coordinator_id={coordinator.instance_id}"
    )

    return agent
702
+
703
+
704
def create_coordinated_agent(
    name: str, config: Optional[CoordinationConfig] = None, **agent_kwargs: Any
) -> Any:
    """Build an ``Agent`` and wrap it with coordination in one step.

    Args:
        name: Name given to the new agent
        config: Coordination settings passed to ``enhance_agent``; ``None``
            lets it use its defaults
        **agent_kwargs: Extra keyword arguments forwarded to ``Agent``

    Returns:
        The agent returned by ``enhance_agent``
    """
    # Imported at call time rather than at module import.
    # NOTE(review): presumably to avoid a circular import - confirm.
    from puffinflow.core.agent.base import Agent

    return enhance_agent(Agent(name, **agent_kwargs), config)