agentfield-0.1.22rc2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. agentfield/__init__.py +66 -0
  2. agentfield/agent.py +3569 -0
  3. agentfield/agent_ai.py +1125 -0
  4. agentfield/agent_cli.py +386 -0
  5. agentfield/agent_field_handler.py +494 -0
  6. agentfield/agent_mcp.py +534 -0
  7. agentfield/agent_registry.py +29 -0
  8. agentfield/agent_server.py +1185 -0
  9. agentfield/agent_utils.py +269 -0
  10. agentfield/agent_workflow.py +323 -0
  11. agentfield/async_config.py +278 -0
  12. agentfield/async_execution_manager.py +1227 -0
  13. agentfield/client.py +1447 -0
  14. agentfield/connection_manager.py +280 -0
  15. agentfield/decorators.py +527 -0
  16. agentfield/did_manager.py +337 -0
  17. agentfield/dynamic_skills.py +304 -0
  18. agentfield/execution_context.py +255 -0
  19. agentfield/execution_state.py +453 -0
  20. agentfield/http_connection_manager.py +429 -0
  21. agentfield/litellm_adapters.py +140 -0
  22. agentfield/logger.py +249 -0
  23. agentfield/mcp_client.py +204 -0
  24. agentfield/mcp_manager.py +340 -0
  25. agentfield/mcp_stdio_bridge.py +550 -0
  26. agentfield/memory.py +723 -0
  27. agentfield/memory_events.py +489 -0
  28. agentfield/multimodal.py +173 -0
  29. agentfield/multimodal_response.py +403 -0
  30. agentfield/pydantic_utils.py +227 -0
  31. agentfield/rate_limiter.py +280 -0
  32. agentfield/result_cache.py +441 -0
  33. agentfield/router.py +190 -0
  34. agentfield/status.py +70 -0
  35. agentfield/types.py +710 -0
  36. agentfield/utils.py +26 -0
  37. agentfield/vc_generator.py +464 -0
  38. agentfield/vision.py +198 -0
  39. agentfield-0.1.22rc2.dist-info/METADATA +102 -0
  40. agentfield-0.1.22rc2.dist-info/RECORD +42 -0
  41. agentfield-0.1.22rc2.dist-info/WHEEL +5 -0
  42. agentfield-0.1.22rc2.dist-info/top_level.txt +1 -0
agentfield/async_execution_manager.py
@@ -0,0 +1,1227 @@
+"""
+Async Execution Manager for the AgentField SDK.
+
+This module provides the central orchestrator for managing hundreds of concurrent
+async executions with intelligent polling, resource management, and comprehensive
+monitoring capabilities.
+"""
+
+import asyncio
+import json
+import time
+from dataclasses import dataclass, field
+from datetime import datetime, timezone
+from typing import Any, Dict, List, Optional, Union
+from urllib.parse import urljoin
+
+import aiohttp
+
+from .async_config import AsyncConfig
+from .execution_state import ExecutionPriority, ExecutionState, ExecutionStatus
+from .http_connection_manager import ConnectionManager
+from .logger import get_logger
+from .result_cache import ResultCache
+from .status import normalize_status
+from .types import WebhookConfig
+
+logger = get_logger(__name__)
+
+
+class LazyAsyncLock:
+    """Deferred asyncio.Lock that instantiates once the event loop is running."""
+
+    def __init__(self):
+        self._lock: Optional[asyncio.Lock] = None
+
+    def _lock_obj(self) -> asyncio.Lock:
+        if self._lock is None:
+            self._lock = asyncio.Lock()
+        return self._lock
+
+    async def __aenter__(self):
+        return await self._lock_obj().__aenter__()
+
+    async def __aexit__(self, exc_type, exc, tb):
+        return await self._lock_obj().__aexit__(exc_type, exc, tb)
+
+
+class LazySemaphore:
+    """Deferred asyncio.Semaphore that instantiates within the active loop."""
+
+    def __init__(self, size_factory):
+        self._size_factory = size_factory
+        self._sem: Optional[asyncio.Semaphore] = None
+
+    def _sem_obj(self) -> asyncio.Semaphore:
+        if self._sem is None:
+            self._sem = asyncio.Semaphore(max(1, int(self._size_factory())))
+        return self._sem
+
+    async def acquire(self):
+        return await self._sem_obj().acquire()
+
+    def release(self):
+        self._sem_obj().release()
+
+    async def __aenter__(self):
+        await self.acquire()
+        return self
+
+    async def __aexit__(self, exc_type, exc, tb):
+        self.release()
+
+
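Both wrappers address the same pitfall: on Python versions before 3.10, asyncio primitives created while no event loop was running could bind to the wrong loop and later fail with "attached to a different loop" errors. Deferring construction until first acquisition sidesteps that. A minimal sketch of the pattern, assuming `LazyAsyncLock` is imported from this module:

```python
import asyncio

from agentfield.async_execution_manager import LazyAsyncLock

class Tracker:
    def __init__(self):
        # Safe to construct with no event loop running: the underlying
        # asyncio.Lock is only created on first __aenter__.
        self._lock = LazyAsyncLock()
        self._count = 0

    async def bump(self):
        async with self._lock:  # asyncio.Lock materializes here, inside the loop
            self._count += 1

tracker = Tracker()  # created at import time, before any loop exists
asyncio.run(tracker.bump())
```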
+@dataclass
+class PollingMetrics:
+    """Metrics for polling performance monitoring."""
+
+    total_polls: int = 0
+    successful_polls: int = 0
+    failed_polls: int = 0
+    timeout_polls: int = 0
+    batch_polls: int = 0
+    average_poll_duration: float = 0.0
+    last_poll_time: float = field(default_factory=time.time)
+
+    @property
+    def success_rate(self) -> float:
+        """Calculate polling success rate as a percentage."""
+        if self.total_polls == 0:
+            return 0.0
+        return (self.successful_polls / self.total_polls) * 100
+
+    def record_poll(
+        self, success: bool, duration: float, timeout: bool = False
+    ) -> None:
+        """Record a polling operation."""
+        self.total_polls += 1
+        self.last_poll_time = time.time()
+
+        if success:
+            self.successful_polls += 1
+        else:
+            self.failed_polls += 1
+            if timeout:
+                self.timeout_polls += 1
+
+        # Update average duration using exponential moving average
+        alpha = 0.1  # Smoothing factor
+        self.average_poll_duration = (
+            alpha * duration + (1 - alpha) * self.average_poll_duration
+        )
+
+
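`record_poll` smooths `average_poll_duration` with an exponential moving average rather than a plain mean, so recent polls dominate older ones. With `alpha = 0.1`, each new sample contributes 10% and the running average keeps 90%; three consecutive 1.0 s polls starting from 0.0 therefore yield 0.1, 0.19, and 0.271. A quick check of that arithmetic:

```python
alpha = 0.1  # smoothing factor used by record_poll
avg = 0.0
for duration in (1.0, 1.0, 1.0):
    avg = alpha * duration + (1 - alpha) * avg
    print(round(avg, 3))  # 0.1, 0.19, 0.271 — converges toward 1.0 only gradually
```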
+@dataclass
+class ExecutionManagerMetrics:
+    """Comprehensive metrics for the execution manager."""
+
+    # Execution counts
+    total_executions: int = 0
+    active_executions: int = 0
+    completed_executions: int = 0
+    failed_executions: int = 0
+    cancelled_executions: int = 0
+    timeout_executions: int = 0
+
+    # Performance metrics
+    average_execution_time: float = 0.0
+    average_queue_time: float = 0.0
+    peak_concurrent_executions: int = 0
+
+    # Resource metrics
+    memory_usage_mb: float = 0.0
+    cleanup_operations: int = 0
+
+    # Polling metrics
+    polling_metrics: PollingMetrics = field(default_factory=PollingMetrics)
+
+    # Timestamps
+    created_at: float = field(default_factory=time.time)
+    last_cleanup: float = field(default_factory=time.time)
+
+    @property
+    def uptime(self) -> float:
+        """Get manager uptime in seconds."""
+        return time.time() - self.created_at
+
+    @property
+    def success_rate(self) -> float:
+        """Calculate execution success rate as a percentage."""
+        total_completed = (
+            self.completed_executions
+            + self.failed_executions
+            + self.cancelled_executions
+            + self.timeout_executions
+        )
+        if total_completed == 0:
+            return 0.0
+        return (self.completed_executions / total_completed) * 100
+
+
+class AsyncExecutionManager:
+    """
+    Central orchestrator for managing hundreds of concurrent async executions.
+
+    This class provides:
+    - Concurrent execution tracking with ExecutionState objects
+    - Intelligent polling with adaptive intervals based on execution age
+    - Resource management with cleanup of completed executions
+    - Background polling task coordination using asyncio
+    - Thread-safe operations for concurrent access
+    - Comprehensive metrics and monitoring
+    - Integration with ConnectionManager and ResultCache
+    """
+
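The method reference below is easier to follow with the intended call pattern in mind. This is a minimal usage sketch based on the methods defined in this file; the base URL, target name, and payload are placeholders:

```python
import asyncio

from agentfield.async_execution_manager import AsyncExecutionManager

async def main():
    # __aenter__/__aexit__ delegate to start()/stop(), which spin up and
    # tear down the polling, cleanup, metrics, and event-stream tasks.
    async with AsyncExecutionManager("http://localhost:8080") as manager:
        execution_id = await manager.submit_execution(
            target="my-agent",            # placeholder target endpoint
            input_data={"prompt": "hi"},  # placeholder payload
        )
        result = await manager.wait_for_result(execution_id, timeout=120.0)
        print(result)

asyncio.run(main())
```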
+    def __init__(
+        self,
+        base_url: str,
+        config: Optional[AsyncConfig] = None,
+        connection_manager: Optional[ConnectionManager] = None,
+        result_cache: Optional[ResultCache] = None,
+    ):
+        """
+        Initialize the async execution manager.
+
+        Args:
+            base_url: Base URL for the af server
+            config: AsyncConfig instance for configuration parameters
+            connection_manager: Optional ConnectionManager instance
+            result_cache: Optional ResultCache instance
+        """
+        self.base_url = base_url.rstrip("/")
+        self.config = config or AsyncConfig()
+
+        # Validate configuration
+        self.config.validate()
+
+        # Initialize components
+        self.connection_manager = connection_manager or ConnectionManager(self.config)
+        self.result_cache = result_cache or ResultCache(self.config)
+
+        # Execution tracking
+        self._executions: Dict[str, ExecutionState] = {}
+        self._execution_lock = LazyAsyncLock()
+        self._capacity_semaphore = LazySemaphore(
+            lambda: self.config.max_concurrent_executions
+        )
+
+        # Event stream configuration
+        self._event_stream_headers: Dict[str, str] = {}
+
+        # Polling coordination
+        self._polling_task: Optional[asyncio.Task] = None
+        self._polling_semaphore = LazySemaphore(
+            lambda: self.config.max_active_polls
+        )
+        self._shutdown_event: Optional[asyncio.Event] = None
+
+        # Metrics and monitoring
+        self.metrics = ExecutionManagerMetrics()
+
+        # Background tasks
+        self._cleanup_task: Optional[asyncio.Task] = None
+        self._metrics_task: Optional[asyncio.Task] = None
+        self._event_stream_task: Optional[asyncio.Task] = None
+
+        # Circuit breaker state
+        self._circuit_breaker_failures = 0
+        self._circuit_breaker_last_failure = 0.0
+        self._circuit_breaker_open = False
+
+        logger.debug(f"AsyncExecutionManager initialized with base_url={base_url}")
+
+    def set_event_stream_headers(self, headers: Optional[Dict[str, str]]) -> None:
+        """Configure headers forwarded to the SSE event stream."""
+
+        if headers is None:
+            self._event_stream_headers = {}
+            return
+
+        self._event_stream_headers = {
+            key: value for key, value in headers.items() if value is not None
+        }
+
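Note the `None` filter in the comprehension: passing a header with a `None` value is effectively a way to drop it. A small sketch, assuming the `manager` instance from the earlier usage example; the header names are placeholders:

```python
manager.set_event_stream_headers({
    "Authorization": "Bearer <token>",  # forwarded on the SSE request
    "X-Debug-Trace": None,              # dropped by the None filter
})
```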
+    async def __aenter__(self):
+        """Async context manager entry."""
+        await self.start()
+        return self
+
+    async def __aexit__(self, exc_type, exc_val, exc_tb):
+        """Async context manager exit."""
+        await self.stop()
+
+    async def start(self) -> None:
+        """
+        Start the execution manager and all background tasks.
+
+        Raises:
+            RuntimeError: If manager is already started
+        """
+        if self._polling_task is not None:
+            raise RuntimeError("AsyncExecutionManager is already started")
+
+        # Start components
+        await self.connection_manager.start()
+        await self.result_cache.start()
+
+        if self._shutdown_event is None:
+            self._shutdown_event = asyncio.Event()
+        self._shutdown_event.clear()
+
+        # Start background tasks
+        self._polling_task = asyncio.create_task(self._polling_loop())
+        self._cleanup_task = asyncio.create_task(self._cleanup_loop())
+
+        if self.config.enable_performance_logging:
+            self._metrics_task = asyncio.create_task(self._metrics_loop())
+
+        if self.config.enable_event_stream:
+            self._event_stream_task = asyncio.create_task(self._event_stream_loop())
+
+        logger.info(
+            f"AsyncExecutionManager started with max_concurrent={self.config.max_concurrent_executions}"
+        )
+
+    async def stop(self) -> None:
+        """
+        Stop the execution manager and clean up all resources.
+        """
+        logger.info("Stopping AsyncExecutionManager...")
+
+        # Signal shutdown
+        if self._shutdown_event is None:
+            self._shutdown_event = asyncio.Event()
+        self._shutdown_event.set()
+
+        # Cancel background tasks
+        tasks_to_cancel = [
+            self._polling_task,
+            self._cleanup_task,
+            self._metrics_task,
+            self._event_stream_task,
+        ]
+
+        for task in tasks_to_cancel:
+            if task:
+                task.cancel()
+                try:
+                    await task
+                except asyncio.CancelledError:
+                    pass
+
+        self._polling_task = None
+        self._cleanup_task = None
+        self._metrics_task = None
+        self._event_stream_task = None
+
+        # Cancel all active executions
+        async with self._execution_lock:
+            for execution in self._executions.values():
+                if execution.is_active:
+                    execution.cancel("Manager shutdown")
+                self._release_capacity_for_execution(execution)
+
+        # Stop components
+        await self.connection_manager.close()
+        await self.result_cache.stop()
+
+        logger.info("AsyncExecutionManager stopped")
+
+    async def submit_execution(
+        self,
+        target: str,
+        input_data: Dict[str, Any],
+        headers: Optional[Dict[str, str]] = None,
+        timeout: Optional[float] = None,
+        priority: ExecutionPriority = ExecutionPriority.NORMAL,
+        webhook: Optional[Union[WebhookConfig, Dict[str, Any]]] = None,
+    ) -> str:
+        """
+        Submit an async execution and return execution_id.
+
+        Args:
+            target: Target endpoint for execution
+            input_data: Input data for the execution
+            headers: Optional HTTP headers
+            timeout: Optional execution timeout (uses config default if None)
+            priority: Execution priority for queue management
+            webhook: Optional webhook, either a WebhookConfig or a plain dict
+
+        Returns:
+            str: Execution ID for tracking the execution
+
+        Raises:
+            RuntimeError: If manager is not started or at capacity
+            aiohttp.ClientError: For HTTP-related errors
+        """
+        if self._polling_task is None:
+            raise RuntimeError("AsyncExecutionManager is not started")
+
+        # Check circuit breaker
+        if self._is_circuit_breaker_open():
+            raise RuntimeError("Circuit breaker is open - too many recent failures")
+
+        # Reserve capacity slot; released once terminal
+        await self._capacity_semaphore.acquire()
+
+        # Prepare request
+        url = urljoin(self.base_url, f"/api/v1/execute/async/{target}")
+        request_headers = {"Content-Type": "application/json", **(headers or {})}
+        payload: Dict[str, Any] = {
+            "input": input_data,
+        }
+
+        if webhook:
+            if isinstance(webhook, WebhookConfig):
+                payload["webhook"] = webhook.to_payload()
+            elif isinstance(webhook, dict):
+                payload["webhook"] = webhook
+            else:
+                raise TypeError("webhook must be a WebhookConfig or dict")
+
+        # Set timeout
+        execution_timeout = timeout or self.config.default_execution_timeout
+
+        try:
+            # Submit execution
+            start_time = time.time()
+            async with self.connection_manager.get_session() as session:
+                response = await session.post(
+                    url,
+                    json=payload,
+                    headers=request_headers,
+                    timeout=self.config.polling_timeout,
+                )
+                response.raise_for_status()
+                result = await response.json()
+
+            execution_id = result.get("execution_id")
+            if not execution_id:
+                raise ValueError("Server did not return execution_id")
+
+            workflow_id = result.get("workflow_id") or result.get("run_id")
+            status = self._map_execution_status(result.get("status"))
+            created_at = self._parse_timestamp(result.get("created_at"))
+            webhook_registered = bool(result.get("webhook_registered"))
+            webhook_error = result.get("webhook_error")
+
+            if webhook and not webhook_registered and webhook_error:
+                logger.warning(
+                    "Webhook registration rejected for %s: %s",
+                    target,
+                    webhook_error,
+                )
+
+            # Create execution state
+            execution_state = ExecutionState(
+                execution_id=execution_id,
+                target=target,
+                input_data=input_data,
+                status=status,
+                priority=priority,
+                timeout=execution_timeout,
+                workflow_id=workflow_id,
+                created_at=created_at or datetime.now(timezone.utc),
+                updated_at=created_at or datetime.now(timezone.utc),
+                webhook_registered=webhook_registered,
+                webhook_error=webhook_error,
+            )
+
+            # Store execution
+            async with self._execution_lock:
+                self._executions[execution_id] = execution_state
+                self.metrics.total_executions += 1
+                self.metrics.active_executions += 1
+
+                # Update peak concurrent executions
+                if (
+                    self.metrics.active_executions
+                    > self.metrics.peak_concurrent_executions
+                ):
+                    self.metrics.peak_concurrent_executions = (
+                        self.metrics.active_executions
+                    )
+
+            # Reset circuit breaker on success
+            self._circuit_breaker_failures = 0
+
+            duration = time.time() - start_time
+            logger.debug(
+                f"Submitted execution {execution_id[:8]}... for target {target} in {duration:.3f}s"
+            )
+
+            return execution_id
+
+        except Exception as e:
+            self._capacity_semaphore.release()
+            self._record_circuit_breaker_failure()
+            logger.error(f"Failed to submit execution for target {target}: {e}")
+            raise
+
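Since `webhook` accepts either a `WebhookConfig` or a plain dict, callers can skip the typed object and pass the dict form, which is forwarded to the server as-is. A sketch; the dict keys are assumptions, since the actual schema is defined by `WebhookConfig.to_payload()` in `agentfield/types.py`:

```python
async def submit_with_webhook(manager):
    # Dict-form webhook; the keys shown are illustrative only.
    return await manager.submit_execution(
        target="my-agent",                            # placeholder target
        input_data={"doc_id": 42},                    # placeholder payload
        webhook={"url": "https://example.com/hook"},  # assumed schema
        timeout=300.0,  # overrides config.default_execution_timeout
    )
```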
+    def _map_execution_status(self, status: Optional[str]) -> ExecutionStatus:
+        if not status:
+            return ExecutionStatus.QUEUED
+        try:
+            return ExecutionStatus(status.lower())
+        except ValueError:
+            return ExecutionStatus.QUEUED
+
+    @staticmethod
+    def _parse_timestamp(value: Optional[str]) -> Optional[datetime]:
+        if not value:
+            return None
+        try:
+            return datetime.fromisoformat(value.replace("Z", "+00:00"))
+        except ValueError:
+            return None
+
+    async def wait_for_result(
+        self, execution_id: str, timeout: Optional[float] = None
+    ) -> Any:
+        """
+        Wait for execution result with intelligent polling.
+
+        Args:
+            execution_id: Execution ID to wait for
+            timeout: Optional timeout override
+
+        Returns:
+            Any: Execution result
+
+        Raises:
+            KeyError: If execution_id is not found
+            TimeoutError: If execution times out
+            RuntimeError: If execution fails or is cancelled
+        """
+        # Check cache first
+        cached_result = self.result_cache.get_execution_result(execution_id)
+        if cached_result is not None:
+            logger.debug(f"Retrieved cached result for execution {execution_id[:8]}...")
+            return cached_result
+
+        # Get execution state
+        async with self._execution_lock:
+            execution = self._executions.get(execution_id)
+            if execution is None:
+                raise KeyError(f"Execution {execution_id} not found")
+
+        # Set timeout
+        wait_timeout = (
+            timeout or execution.timeout or self.config.default_execution_timeout
+        )
+        start_time = time.time()
+
+        # Wait for completion
+        while time.time() - start_time < wait_timeout:
+            async with self._execution_lock:
+                execution = self._executions.get(execution_id)
+                if execution is None:
+                    raise KeyError(f"Execution {execution_id} was removed")
+
+                if execution.is_terminal:
+                    if execution.is_successful:
+                        # Cache successful result
+                        if execution.result is not None:
+                            self.result_cache.set_execution_result(
+                                execution_id, execution.result
+                            )
+                        return execution.result
+                    elif execution.status == ExecutionStatus.FAILED:
+                        raise RuntimeError(
+                            f"Execution failed: {execution.error_message}"
+                        )
+                    elif execution.status == ExecutionStatus.CANCELLED:
+                        raise RuntimeError(
+                            f"Execution was cancelled: {execution._cancellation_reason}"
+                        )
+                    elif execution.status == ExecutionStatus.TIMEOUT:
+                        raise TimeoutError(
+                            f"Execution timed out after {execution.timeout} seconds"
+                        )
+
+            # Wait before next check
+            await asyncio.sleep(0.1)
+
+        # Timeout reached
+        async with self._execution_lock:
+            execution = self._executions.get(execution_id)
+            if execution and execution.is_active:
+                execution.timeout_execution()
+                self.metrics.timeout_executions += 1
+
+        raise TimeoutError(f"Wait timeout reached after {wait_timeout} seconds")
+
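Each terminal state maps to a distinct exception, so callers can branch on exception type rather than inspecting status strings. A caller-side sketch, assuming the `manager` from the earlier example:

```python
async def fetch_result(manager, execution_id):
    try:
        return await manager.wait_for_result(execution_id, timeout=60.0)
    except KeyError:
        return None  # unknown execution_id, or already cleaned up
    except TimeoutError:
        return None  # the execution timed out, or the wait window elapsed
    except RuntimeError:
        return None  # failed or cancelled; the exception message carries the reason
```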
+    async def cancel_execution(
+        self, execution_id: str, reason: Optional[str] = None
+    ) -> bool:
+        """
+        Cancel an active execution.
+
+        Args:
+            execution_id: Execution ID to cancel
+            reason: Optional cancellation reason
+
+        Returns:
+            bool: True if execution was cancelled, False if not found or already terminal
+        """
+        async with self._execution_lock:
+            execution = self._executions.get(execution_id)
+            if execution is None or execution.is_terminal:
+                return False
+
+            execution.cancel(reason)
+            self.metrics.cancelled_executions += 1
+            self.metrics.active_executions -= 1
+
+            logger.debug(
+                f"Cancelled execution {execution_id[:8]}... - {reason or 'No reason provided'}"
+            )
+            return True
+
+    async def get_execution_status(self, execution_id: str) -> Optional[Dict[str, Any]]:
+        """
+        Get current status of an execution.
+
+        Args:
+            execution_id: Execution ID to check
+
+        Returns:
+            Optional[Dict]: Execution status dictionary or None if not found
+        """
+        async with self._execution_lock:
+            execution = self._executions.get(execution_id)
+            if execution is None:
+                return None
+
+            return execution.to_dict()
+
+    async def list_executions(
+        self,
+        status_filter: Optional[ExecutionStatus] = None,
+        limit: Optional[int] = None,
+    ) -> List[Dict[str, Any]]:
+        """
+        List executions with optional filtering.
+
+        Args:
+            status_filter: Optional status to filter by
+            limit: Optional limit on number of results
+
+        Returns:
+            List[Dict]: List of execution status dictionaries
+        """
+        async with self._execution_lock:
+            executions = list(self._executions.values())
+
+            # Apply status filter
+            if status_filter:
+                executions = [e for e in executions if e.status == status_filter]
+
+            # Sort by creation time (newest first)
+            executions.sort(key=lambda e: e.created_at, reverse=True)
+
+            # Apply limit
+            if limit:
+                executions = executions[:limit]
+
+            return [execution.to_dict() for execution in executions]
+
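Filtering, sorting, and truncation compose in that order, so `limit` always returns the newest matches. For example, to fetch the ten most recent failures:

```python
from agentfield.execution_state import ExecutionStatus

async def recent_failures(manager):
    # Ten most recent failed executions (filter, then newest-first, then cap).
    return await manager.list_executions(
        status_filter=ExecutionStatus.FAILED,
        limit=10,
    )
```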
+    async def cleanup_completed_executions(self) -> int:
+        """
+        Clean up completed executions to manage memory.
+
+        Returns:
+            int: Number of executions cleaned up
+        """
+        cleanup_count = 0
+        current_time = time.time()
+
+        async with self._execution_lock:
+            # Collect terminal executions for retention analysis
+            completed_executions = {
+                exec_id: execution
+                for exec_id, execution in self._executions.items()
+                if execution.is_terminal
+            }
+
+            if not completed_executions:
+                return 0
+
+            removal_candidates = set()
+
+            # Time-based pruning to keep memory bounded during long-running sessions
+            retention_seconds = self.config.completed_execution_retention_seconds
+            if retention_seconds > 0:
+                for exec_id, execution in completed_executions.items():
+                    end_time = (
+                        execution.metrics.end_time or execution.metrics.submit_time
+                    )
+                    if end_time and (current_time - end_time) > retention_seconds:
+                        removal_candidates.add(exec_id)
+
+            # Enforce cap on stored completions after time-based pruning
+            remaining = [
+                (exec_id, execution)
+                for exec_id, execution in completed_executions.items()
+                if exec_id not in removal_candidates
+            ]
+
+            if len(remaining) > self.config.max_completed_executions:
+                # Remove the oldest executions first
+                remaining.sort(key=lambda item: item[1].metrics.end_time or 0)
+                overflow = len(remaining) - self.config.max_completed_executions
+                for i in range(overflow):
+                    removal_candidates.add(remaining[i][0])
+
+            # Apply removals and cache results where applicable
+            for exec_id in removal_candidates:
+                execution = completed_executions.get(exec_id)
+                if execution is None:
+                    continue
+
+                if execution.is_successful and execution.result is not None:
+                    self.result_cache.set_execution_result(exec_id, execution.result)
+
+                self._release_capacity_for_execution(execution)
+                self._executions.pop(exec_id, None)
+                cleanup_count += 1
+
+        if cleanup_count > 0:
+            self.metrics.cleanup_operations += 1
+            self.metrics.last_cleanup = current_time
+            logger.debug(f"Cleaned up {cleanup_count} completed executions")
+
+        return cleanup_count
+
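The two pruning stages are governed by two config fields referenced above. A configuration sketch; the field names come from this method, but the values and the assumption that `AsyncConfig` exposes them as plain attributes are illustrative:

```python
from agentfield.async_config import AsyncConfig

config = AsyncConfig()
# Stage 1: drop terminal executions whose end time is older than this
# many seconds (a value of 0 disables time-based pruning).
config.completed_execution_retention_seconds = 3600
# Stage 2: of whatever survives stage 1, keep at most this many terminal
# executions, evicting the oldest first.
config.max_completed_executions = 500
```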
+    async def _event_stream_loop(self) -> None:
+        """Listen for execution events over SSE and nudge polling."""
+        logger.debug("Starting event stream loop")
+
+        url = urljoin(self.base_url, self.config.event_stream_path)
+        backoff = max(self.config.event_stream_retry_backoff, 0.5)
+
+        while not self._shutdown_event.is_set():
+            try:
+                request_headers = {"Accept": "text/event-stream"}
+                if self._event_stream_headers:
+                    request_headers.update(self._event_stream_headers)
+
+                async with self.connection_manager.get_session() as session:
+                    timeout = aiohttp.ClientTimeout(total=None, sock_read=None)
+                    async with session.get(
+                        url, headers=request_headers, timeout=timeout
+                    ) as response:
+                        if response.status != 200:
+                            body = await response.text()
+                            logger.warning(
+                                f"Event stream returned {response.status} for {url}: {body[:256]}"
+                            )
+                            await asyncio.sleep(backoff)
+                            continue
+
+                        buffer = ""
+                        async for chunk in response.content.iter_chunked(1024):
+                            if self._shutdown_event.is_set():
+                                break
+                            if not chunk:
+                                continue
+                            try:
+                                decoded = chunk.decode("utf-8", errors="ignore")
+                            except Exception:
+                                continue
+
+                            buffer += decoded
+
+                            while "\n\n" in buffer:
+                                raw_event, buffer = buffer.split("\n\n", 1)
+                                data_lines = []
+                                for line in raw_event.splitlines():
+                                    if line.startswith(":"):
+                                        continue
+                                    if line.startswith("data:"):
+                                        data_lines.append(line[5:].lstrip())
+
+                                if not data_lines:
+                                    continue
+
+                                payload_str = "\n".join(data_lines).strip()
+                                if not payload_str:
+                                    continue
+
+                                try:
+                                    payload = json.loads(payload_str)
+                                except json.JSONDecodeError:
+                                    logger.debug(
+                                        f"Failed to decode SSE payload: {payload_str[:120]}"
+                                    )
+                                    continue
+
+                                await self._handle_event_stream_payload(payload)
+
+            except asyncio.CancelledError:
+                break
+            except Exception as e:
+                if self._shutdown_event.is_set():
+                    break
+                logger.warning(f"Event stream error: {e}")
+                await asyncio.sleep(backoff)
+
+        logger.debug("Event stream loop stopped")
+
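The loop implements a small subset of the SSE wire format: events are separated by a blank line, lines starting with `:` are comments, and multi-line payloads arrive as repeated `data:` fields joined with newlines. A standalone sketch of that framing logic:

```python
import json

raw_event = (
    ": keep-alive comment\n"
    'data: {"execution_id": "abc123",\n'
    'data:  "status": "running"}'
)

data_lines = []
for line in raw_event.splitlines():
    if line.startswith(":"):      # SSE comment, ignored
        continue
    if line.startswith("data:"):  # payload field; may repeat within one event
        data_lines.append(line[5:].lstrip())

payload = json.loads("\n".join(data_lines))
print(payload["status"])  # running
```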
+    async def _handle_event_stream_payload(self, payload: Dict[str, Any]) -> None:
+        """Process a single SSE payload."""
+        execution_id = payload.get("execution_id") or payload.get("executionId")
+        if not execution_id:
+            return
+
+        schedule_poll = False
+        status_hint = normalize_status(payload.get("status"))
+        event_type = str(payload.get("type", "")).lower()
+
+        async with self._execution_lock:
+            execution = self._executions.get(execution_id)
+            if execution is None:
+                return
+
+            if event_type == "execution_started" or status_hint == "running":
+                execution.update_status(ExecutionStatus.RUNNING)
+            elif status_hint == "queued":
+                execution.update_status(ExecutionStatus.QUEUED)
+            elif status_hint == "pending":
+                execution.update_status(ExecutionStatus.PENDING)
+            elif status_hint in {
+                "succeeded",
+                "failed",
+                "cancelled",
+                "timeout",
+            } or event_type in {"execution_completed", "execution_failed"}:
+                if status_hint == "failed":
+                    execution.update_status(ExecutionStatus.FAILED)
+                elif status_hint == "cancelled":
+                    execution.update_status(ExecutionStatus.CANCELLED)
+                elif status_hint == "timeout":
+                    execution.update_status(ExecutionStatus.TIMEOUT)
+                else:
+                    execution.update_status(ExecutionStatus.SUCCEEDED)
+                schedule_poll = True
+
+        if schedule_poll:
+            asyncio.create_task(self._poll_execution_immediate(execution_id))
+
+    async def _poll_execution_immediate(self, execution_id: str) -> None:
+        """Trigger an immediate poll for the provided execution."""
+        async with self._execution_lock:
+            execution = self._executions.get(execution_id)
+
+        if execution is None:
+            return
+
+        if execution.is_terminal and execution.result is not None:
+            return
+
+        try:
+            await self._poll_single_execution(execution)
+        except Exception as exc:
+            logger.debug(f"Immediate poll for {execution_id[:8]}... failed: {exc}")
+
+    async def start_polling_task(self) -> None:
+        """
+        Start the background polling task.
+
+        Note: This is automatically called by start() and should not be called manually.
+        """
+        if self._polling_task is None or self._polling_task.done():
+            self._polling_task = asyncio.create_task(self._polling_loop())
+            logger.debug("Background polling task started")
+
+    async def stop_polling_task(self) -> None:
+        """
+        Stop the background polling task.
+
+        Note: This is automatically called by stop() and should not be called manually.
+        """
+        if self._polling_task:
+            self._polling_task.cancel()
+            try:
+                await self._polling_task
+            except asyncio.CancelledError:
+                pass
+            self._polling_task = None
+            logger.debug("Background polling task stopped")
+
+    def get_metrics(self) -> Dict[str, Any]:
+        """
+        Get comprehensive execution manager metrics.
+
+        Returns:
+            Dict[str, Any]: Metrics dictionary
+        """
+
+        # Update current metrics
+        async def _update_metrics():
+            async with self._execution_lock:
+                active_count = sum(1 for e in self._executions.values() if e.is_active)
+                self.metrics.active_executions = active_count
+
+        # Run the update if we're in an async context
+        try:
+            loop = asyncio.get_running_loop()
+            loop.create_task(_update_metrics())
+        except RuntimeError:
+            pass  # Not in async context
+
+        return {
+            "total_executions": self.metrics.total_executions,
+            "active_executions": self.metrics.active_executions,
+            "completed_executions": self.metrics.completed_executions,
+            "failed_executions": self.metrics.failed_executions,
+            "cancelled_executions": self.metrics.cancelled_executions,
+            "timeout_executions": self.metrics.timeout_executions,
+            "success_rate": self.metrics.success_rate,
+            "average_execution_time": self.metrics.average_execution_time,
+            "average_queue_time": self.metrics.average_queue_time,
+            "peak_concurrent_executions": self.metrics.peak_concurrent_executions,
+            "memory_usage_mb": self.metrics.memory_usage_mb,
+            "cleanup_operations": self.metrics.cleanup_operations,
+            "uptime": self.metrics.uptime,
+            "polling_metrics": {
+                "total_polls": self.metrics.polling_metrics.total_polls,
+                "successful_polls": self.metrics.polling_metrics.successful_polls,
+                "failed_polls": self.metrics.polling_metrics.failed_polls,
+                "success_rate": self.metrics.polling_metrics.success_rate,
+                "average_poll_duration": self.metrics.polling_metrics.average_poll_duration,
+                "batch_polls": self.metrics.polling_metrics.batch_polls,
+            },
+            "circuit_breaker": {
+                "failures": self._circuit_breaker_failures,
+                "is_open": self._circuit_breaker_open,
+                "last_failure": self._circuit_breaker_last_failure,
+            },
+            "connection_manager": self.connection_manager.get_metrics().__dict__,
+            "result_cache": self.result_cache.get_stats(),
+        }
+
+    async def _polling_loop(self) -> None:
+        """Background task for intelligent polling of active executions."""
+        logger.debug("Starting polling loop")
+
+        while not self._shutdown_event.is_set():
+            try:
+                await self._poll_active_executions()
+                await asyncio.sleep(self.config.batch_poll_interval)
+
+            except asyncio.CancelledError:
+                break
+            except Exception as e:
+                logger.error(f"Polling loop error: {e}")
+                await asyncio.sleep(1.0)  # Brief pause on error
+
+        logger.debug("Polling loop stopped")
+
+    async def _poll_active_executions(self) -> None:
+        """Poll all active executions that are ready for polling."""
+        # Get executions ready for polling
+        executions_to_poll = []
+
+        async with self._execution_lock:
+            for execution in self._executions.values():
+                if execution.should_poll:
+                    # Check for timeout
+                    if execution.is_overdue:
+                        execution.timeout_execution()
+                        self.metrics.timeout_executions += 1
+                        self.metrics.active_executions -= 1
+                        continue
+
+                    executions_to_poll.append(execution)
+
+        if not executions_to_poll:
+            return
+
+        # Use batch polling if enabled and beneficial
+        if (
+            self.config.enable_batch_polling and len(executions_to_poll) >= 3
+        ):  # Batch threshold
+            await self._batch_poll_executions(executions_to_poll)
+        else:
+            await self._individual_poll_executions(executions_to_poll)
+
+    async def _batch_poll_executions(self, executions: List[ExecutionState]) -> None:
+        """Poll multiple executions in batches for efficiency."""
+        # Split into batches
+        batch_size = min(self.config.batch_size, len(executions))
+
+        for i in range(0, len(executions), batch_size):
+            batch = executions[i : i + batch_size]
+
+            # Create batch requests
+            requests = []
+            for execution in batch:
+                requests.append(
+                    {
+                        "method": "GET",
+                        "url": self._execution_status_url(execution.execution_id),
+                        "timeout": self.config.polling_timeout,
+                    }
+                )
+
+            # Execute batch
+            start_time = time.time()
+            try:
+                responses = await self.connection_manager.batch_request(requests)
+                duration = time.time() - start_time
+
+                # Process responses
+                for execution, response in zip(batch, responses):
+                    await self._process_poll_response(
+                        execution, response, duration / len(batch)
+                    )
+
+                self.metrics.polling_metrics.batch_polls += 1
+
+            except Exception as e:
+                logger.error(f"Batch polling failed: {e}")
+                # Fall back to individual polling
+                await self._individual_poll_executions(batch)
+
+    async def _individual_poll_executions(
+        self, executions: List[ExecutionState]
+    ) -> None:
+        """Poll executions individually with concurrency control."""
+
+        # Use semaphore to limit concurrent polls
+        async def poll_single(execution: ExecutionState):
+            async with self._polling_semaphore:
+                await self._poll_single_execution(execution)
+
+        # Create tasks for concurrent polling
+        tasks = [poll_single(execution) for execution in executions]
+        await asyncio.gather(*tasks, return_exceptions=True)
+
+    async def _poll_single_execution(self, execution: ExecutionState) -> None:
+        """Poll a single execution for status updates."""
+        url = self._execution_status_url(execution.execution_id)
+
+        start_time = time.time()
+        try:
+            response = await self.connection_manager.request(
+                "GET", url, timeout=self.config.polling_timeout
+            )
+            duration = time.time() - start_time
+
+            await self._process_poll_response(execution, response, duration)
+
+        except Exception as e:
+            duration = time.time() - start_time
+            await self._process_poll_response(execution, e, duration)
+
+    async def _process_poll_response(
+        self, execution: ExecutionState, response: Any, duration: float
+    ) -> None:
+        """Process the response from a polling operation."""
+        success = False
+        timeout_occurred = False
+
+        try:
+            if isinstance(response, Exception):
+                # Handle error response
+                if isinstance(response, asyncio.TimeoutError):
+                    timeout_occurred = True
+
+                execution.record_poll_attempt(False, duration)
+
+                # Update poll interval based on failure
+                new_interval = min(
+                    execution.current_poll_interval * 1.5, self.config.max_poll_interval
+                )
+                execution.update_poll_interval(new_interval)
+
+                logger.debug(
+                    f"Poll failed for execution {execution.execution_id[:8]}...: {response}"
+                )
+
+            else:
+                # Handle successful response
+                response.raise_for_status()
+                status_data = await response.json()
+
+                # Update execution state
+                await self._update_execution_from_status(execution, status_data)
+
+                execution.record_poll_attempt(True, duration)
+                success = True
+
+                # Update poll interval based on execution age
+                new_interval = self.config.get_poll_interval_for_age(execution.age)
+                execution.update_poll_interval(new_interval)
+
+        except Exception as e:
+            execution.record_poll_attempt(False, duration)
+            logger.error(
+                f"Error processing poll response for {execution.execution_id[:8]}...: {e}"
+            )
+
+        finally:
+            # Record metrics
+            self.metrics.polling_metrics.record_poll(
+                success, duration, timeout_occurred
+            )
+
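The failure path above grows the per-execution interval geometrically (×1.5) up to `config.max_poll_interval`, while the success path resets it from the execution's age via `config.get_poll_interval_for_age`. With a 1-second starting interval and a 30-second cap (placeholder values), consecutive failures back off as follows:

```python
interval, cap = 1.0, 30.0  # placeholder starting interval and cap
for _ in range(9):
    interval = min(interval * 1.5, cap)
    print(round(interval, 2))
# 1.5, 2.25, 3.38, 5.06, 7.59, 11.39, 17.09, 25.63, 30.0 (capped)
```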
+    def _execution_status_url(self, execution_id: str) -> str:
+        """Return the canonical status endpoint for an execution."""
+        return urljoin(self.base_url, f"/api/v1/executions/{execution_id}")
+
+    async def _update_execution_from_status(
+        self, execution: ExecutionState, status_data: Dict[str, Any]
+    ) -> None:
+        """Update execution state from status response."""
+        raw_status = status_data.get("status")
+        normalized = normalize_status(raw_status)
+
+        try:
+            new_status = ExecutionStatus(normalized)
+        except ValueError:
+            logger.warning(
+                "Unknown status '%s' for execution %s",
+                normalized,
+                execution.execution_id[:8],
+            )
+            return
+
+        old_status = execution.status
+
+        # Update status
+        if new_status != old_status:
+            if new_status == ExecutionStatus.SUCCEEDED:
+                result = status_data.get("result")
+                execution.set_result(result)
+
+                async with self._execution_lock:
+                    self.metrics.completed_executions += 1
+                    self.metrics.active_executions -= 1
+                    self._release_capacity_for_execution(execution)
+
+            elif new_status == ExecutionStatus.FAILED:
+                error_msg = status_data.get("error", "Execution failed")
+                error_details = status_data.get("error_details")
+                execution.set_error(error_msg, error_details)
+
+                async with self._execution_lock:
+                    self.metrics.failed_executions += 1
+                    self.metrics.active_executions -= 1
+                    self._release_capacity_for_execution(execution)
+
+            elif new_status == ExecutionStatus.CANCELLED:
+                execution.update_status(new_status)
+
+                async with self._execution_lock:
+                    self.metrics.cancelled_executions += 1
+                    self.metrics.active_executions -= 1
+                    self._release_capacity_for_execution(execution)
+
+            elif new_status == ExecutionStatus.TIMEOUT:
+                execution.update_status(new_status)
+
+                async with self._execution_lock:
+                    self.metrics.timeout_executions += 1
+                    self.metrics.active_executions -= 1
+                    self._release_capacity_for_execution(execution)
+
+            else:
+                execution.update_status(new_status)
+
+            old_repr = getattr(old_status, "value", old_status)
+            new_repr = getattr(new_status, "value", new_status)
+            logger.debug(
+                f"Execution {execution.execution_id[:8]}... status: {old_repr} -> {new_repr}"
+            )
+
+    def _release_capacity_for_execution(self, execution: ExecutionState) -> None:
+        if getattr(execution, "_capacity_released", False):
+            return
+        execution._capacity_released = True
+        try:
+            self._capacity_semaphore.release()
+        except ValueError:
+            # Semaphore already fully released (can occur during shutdown cleanup)
+            pass
+
+    async def _cleanup_loop(self) -> None:
+        """Background task for periodic cleanup of completed executions."""
+        logger.debug("Starting cleanup loop")
+
+        while not self._shutdown_event.is_set():
+            try:
+                await asyncio.sleep(self.config.cleanup_interval)
+                await self.cleanup_completed_executions()
+
+            except asyncio.CancelledError:
+                break
+            except Exception as e:
+                logger.error(f"Cleanup loop error: {e}")
+
+        logger.debug("Cleanup loop stopped")
+
+    async def _metrics_loop(self) -> None:
+        """Background task for periodic metrics logging."""
+        logger.debug("Starting metrics loop")
+
+        while not self._shutdown_event.is_set():
+            try:
+                await asyncio.sleep(60.0)  # Log metrics every minute
+
+                metrics = self.get_metrics()
+                logger.debug(
+                    f"Execution metrics: "
+                    f"active={metrics['active_executions']}, "
+                    f"total={metrics['total_executions']}, "
+                    f"success_rate={metrics['success_rate']:.1f}%, "
+                    f"poll_success_rate={metrics['polling_metrics']['success_rate']:.1f}%"
+                )
+
+            except asyncio.CancelledError:
+                break
+            except Exception as e:
+                logger.error(f"Metrics loop error: {e}")
+
+        logger.debug("Metrics loop stopped")
+
+    def _is_circuit_breaker_open(self) -> bool:
+        """Check if circuit breaker is open."""
+        if not self._circuit_breaker_open:
+            return False
+
+        # Check if recovery timeout has passed
+        if (
+            time.time() - self._circuit_breaker_last_failure
+            > self.config.circuit_breaker_recovery_timeout
+        ):
+            self._circuit_breaker_open = False
+            self._circuit_breaker_failures = 0
+            logger.info("Circuit breaker closed - attempting recovery")
+            return False
+
+        return True
+
+    def _record_circuit_breaker_failure(self) -> None:
+        """Record a failure for circuit breaker logic."""
+        self._circuit_breaker_failures += 1
+        self._circuit_breaker_last_failure = time.time()
+
+        if (
+            self._circuit_breaker_failures
+            >= self.config.circuit_breaker_failure_threshold
+        ):
+            self._circuit_breaker_open = True
+            logger.warning(
+                f"Circuit breaker opened after {self._circuit_breaker_failures} failures"
+            )
+
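The breaker is a simple count-and-cooldown: once `circuit_breaker_failure_threshold` consecutive submission failures accumulate, `submit_execution` fails fast, and the first check after `circuit_breaker_recovery_timeout` seconds closes the breaker again. A caller-side sketch of backing off against it:

```python
import asyncio

async def submit_with_breaker_backoff(manager, target, payload):
    try:
        return await manager.submit_execution(target, payload)
    except RuntimeError as exc:
        if "Circuit breaker is open" not in str(exc):
            raise
        # Wait out the cooldown; the next submission attempt runs after
        # config.circuit_breaker_recovery_timeout has passed, which lets
        # _is_circuit_breaker_open() close the breaker.
        await asyncio.sleep(manager.config.circuit_breaker_recovery_timeout)
        return await manager.submit_execution(target, payload)
```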
+    def __repr__(self) -> str:
+        """String representation of the execution manager."""
+        return (
+            f"AsyncExecutionManager("
+            f"base_url='{self.base_url}', "
+            f"active_executions={self.metrics.active_executions}, "
+            f"total_executions={self.metrics.total_executions}, "
+            f"success_rate={self.metrics.success_rate:.1f}%"
+            f")"
+        )