agentfield 0.1.22rc2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agentfield/__init__.py +66 -0
- agentfield/agent.py +3569 -0
- agentfield/agent_ai.py +1125 -0
- agentfield/agent_cli.py +386 -0
- agentfield/agent_field_handler.py +494 -0
- agentfield/agent_mcp.py +534 -0
- agentfield/agent_registry.py +29 -0
- agentfield/agent_server.py +1185 -0
- agentfield/agent_utils.py +269 -0
- agentfield/agent_workflow.py +323 -0
- agentfield/async_config.py +278 -0
- agentfield/async_execution_manager.py +1227 -0
- agentfield/client.py +1447 -0
- agentfield/connection_manager.py +280 -0
- agentfield/decorators.py +527 -0
- agentfield/did_manager.py +337 -0
- agentfield/dynamic_skills.py +304 -0
- agentfield/execution_context.py +255 -0
- agentfield/execution_state.py +453 -0
- agentfield/http_connection_manager.py +429 -0
- agentfield/litellm_adapters.py +140 -0
- agentfield/logger.py +249 -0
- agentfield/mcp_client.py +204 -0
- agentfield/mcp_manager.py +340 -0
- agentfield/mcp_stdio_bridge.py +550 -0
- agentfield/memory.py +723 -0
- agentfield/memory_events.py +489 -0
- agentfield/multimodal.py +173 -0
- agentfield/multimodal_response.py +403 -0
- agentfield/pydantic_utils.py +227 -0
- agentfield/rate_limiter.py +280 -0
- agentfield/result_cache.py +441 -0
- agentfield/router.py +190 -0
- agentfield/status.py +70 -0
- agentfield/types.py +710 -0
- agentfield/utils.py +26 -0
- agentfield/vc_generator.py +464 -0
- agentfield/vision.py +198 -0
- agentfield-0.1.22rc2.dist-info/METADATA +102 -0
- agentfield-0.1.22rc2.dist-info/RECORD +42 -0
- agentfield-0.1.22rc2.dist-info/WHEEL +5 -0
- agentfield-0.1.22rc2.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,1227 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Async Execution Manager for the AgentField SDK.
|
|
3
|
+
|
|
4
|
+
This module provides the central orchestrator for managing hundreds of concurrent
|
|
5
|
+
async executions with intelligent polling, resource management, and comprehensive
|
|
6
|
+
monitoring capabilities.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import asyncio
|
|
10
|
+
import json
|
|
11
|
+
import time
|
|
12
|
+
from dataclasses import dataclass, field
|
|
13
|
+
from datetime import datetime, timezone
|
|
14
|
+
from typing import Any, Dict, List, Optional, Union
|
|
15
|
+
from urllib.parse import urljoin
|
|
16
|
+
|
|
17
|
+
import aiohttp
|
|
18
|
+
|
|
19
|
+
from .async_config import AsyncConfig
|
|
20
|
+
from .execution_state import ExecutionPriority, ExecutionState, ExecutionStatus
|
|
21
|
+
from .http_connection_manager import ConnectionManager
|
|
22
|
+
from .logger import get_logger
|
|
23
|
+
from .result_cache import ResultCache
|
|
24
|
+
from .status import normalize_status
|
|
25
|
+
from .types import WebhookConfig
|
|
26
|
+
|
|
27
|
+
logger = get_logger(__name__)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class LazyAsyncLock:
    """Async context manager around an ``asyncio.Lock`` that is built on first use.

    The underlying lock is not created in ``__init__``; it is created the first
    time the context manager is entered or exited, so instances can be
    constructed before an event loop is running.
    """

    def __init__(self):
        # Populated by _lock_obj() on first access.
        self._lock: Optional[asyncio.Lock] = None

    def _lock_obj(self) -> asyncio.Lock:
        """Return the wrapped lock, creating it on the first call."""
        lock = self._lock
        if lock is None:
            lock = asyncio.Lock()
            self._lock = lock
        return lock

    async def __aenter__(self):
        underlying = self._lock_obj()
        return await underlying.__aenter__()

    async def __aexit__(self, exc_type, exc, tb):
        underlying = self._lock_obj()
        return await underlying.__aexit__(exc_type, exc, tb)
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
class LazySemaphore:
    """``asyncio.Semaphore`` wrapper whose semaphore is created on first use.

    ``size_factory`` is a zero-argument callable; it is invoked once, when the
    semaphore is first needed, and its result (coerced with ``int`` and clamped
    to at least 1) becomes the number of permits.
    """

    def __init__(self, size_factory):
        self._size_factory = size_factory
        # Populated by _sem_obj() on first access.
        self._sem: Optional[asyncio.Semaphore] = None

    def _sem_obj(self) -> asyncio.Semaphore:
        """Return the wrapped semaphore, creating it on the first call."""
        if self._sem is not None:
            return self._sem
        permits = max(1, int(self._size_factory()))
        self._sem = asyncio.Semaphore(permits)
        return self._sem

    async def acquire(self):
        """Wait for a permit; returns whatever ``Semaphore.acquire`` returns."""
        semaphore = self._sem_obj()
        return await semaphore.acquire()

    def release(self):
        """Hand one permit back to the semaphore."""
        semaphore = self._sem_obj()
        semaphore.release()

    async def __aenter__(self):
        await self.acquire()
        return self

    async def __aexit__(self, exc_type, exc, tb):
        self.release()
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
@dataclass
class PollingMetrics:
    """Counters and timing statistics describing polling activity."""

    total_polls: int = 0
    successful_polls: int = 0
    failed_polls: int = 0
    timeout_polls: int = 0
    batch_polls: int = 0
    average_poll_duration: float = 0.0
    last_poll_time: float = field(default_factory=time.time)

    @property
    def success_rate(self) -> float:
        """Percentage of recorded polls that succeeded (0.0 when none recorded)."""
        if not self.total_polls:
            return 0.0
        return (self.successful_polls / self.total_polls) * 100

    def record_poll(
        self, success: bool, duration: float, timeout: bool = False
    ) -> None:
        """Account for one poll.

        Args:
            success: Whether the poll succeeded.
            duration: How long the poll took.
            timeout: Whether a failed poll was a timeout; only counted when
                ``success`` is false.
        """
        self.total_polls += 1
        self.last_poll_time = time.time()

        if success:
            self.successful_polls += 1
        else:
            self.failed_polls += 1
            self.timeout_polls += 1 if timeout else 0

        # Exponential moving average; the newest sample gets a weight of 0.1.
        alpha = 0.1
        previous_average = self.average_poll_duration
        self.average_poll_duration = alpha * duration + (1 - alpha) * previous_average
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
@dataclass
class ExecutionManagerMetrics:
    """Aggregate counters, timings and timestamps for the execution manager."""

    # --- execution counts ---
    total_executions: int = 0
    active_executions: int = 0
    completed_executions: int = 0
    failed_executions: int = 0
    cancelled_executions: int = 0
    timeout_executions: int = 0

    # --- performance ---
    average_execution_time: float = 0.0
    average_queue_time: float = 0.0
    peak_concurrent_executions: int = 0

    # --- resources ---
    memory_usage_mb: float = 0.0
    cleanup_operations: int = 0

    # --- polling ---
    polling_metrics: PollingMetrics = field(default_factory=PollingMetrics)

    # --- timestamps (epoch seconds from time.time) ---
    created_at: float = field(default_factory=time.time)
    last_cleanup: float = field(default_factory=time.time)

    @property
    def uptime(self) -> float:
        """Seconds elapsed since this metrics object was created."""
        now = time.time()
        return now - self.created_at

    @property
    def success_rate(self) -> float:
        """Completed executions as a percentage of all finished executions.

        Finished means completed, failed, cancelled or timed out. Returns 0.0
        when nothing has finished yet.
        """
        finished = sum(
            (
                self.completed_executions,
                self.failed_executions,
                self.cancelled_executions,
                self.timeout_executions,
            )
        )
        if not finished:
            return 0.0
        return (self.completed_executions / finished) * 100
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
class AsyncExecutionManager:
|
|
162
|
+
"""
|
|
163
|
+
Central orchestrator for managing hundreds of concurrent async executions.
|
|
164
|
+
|
|
165
|
+
This class provides:
|
|
166
|
+
- Concurrent execution tracking with ExecutionState objects
|
|
167
|
+
- Intelligent polling with adaptive intervals based on execution age
|
|
168
|
+
- Resource management with cleanup of completed executions
|
|
169
|
+
- Background polling task coordination using asyncio
|
|
170
|
+
- Thread-safe operations for concurrent access
|
|
171
|
+
- Comprehensive metrics and monitoring
|
|
172
|
+
- Integration with ConnectionManager and ResultCache
|
|
173
|
+
"""
|
|
174
|
+
|
|
175
|
+
def __init__(
    self,
    base_url: str,
    config: Optional[AsyncConfig] = None,
    connection_manager: Optional[ConnectionManager] = None,
    result_cache: Optional[ResultCache] = None,
):
    """
    Set up the manager's state; no background work starts until ``start()``.

    Args:
        base_url: Base URL for the af server (trailing slashes are removed)
        config: AsyncConfig to use; a default AsyncConfig is created if omitted
        connection_manager: ConnectionManager to use; one is built from the
            config if omitted
        result_cache: ResultCache to use; one is built from the config if
            omitted
    """
    self.base_url = base_url.rstrip("/")

    # Resolve and validate the configuration before anything depends on it.
    if not config:
        config = AsyncConfig()
    self.config = config
    self.config.validate()

    # Collaborators: use the injected instances or build them from the config.
    if not connection_manager:
        connection_manager = ConnectionManager(self.config)
    self.connection_manager = connection_manager
    if not result_cache:
        result_cache = ResultCache(self.config)
    self.result_cache = result_cache

    # Tracked executions, keyed by execution id, plus the lock guarding them.
    self._executions: Dict[str, ExecutionState] = {}
    self._execution_lock = LazyAsyncLock()

    # Semaphores read their size from the config when first used.
    self._capacity_semaphore = LazySemaphore(
        lambda: self.config.max_concurrent_executions
    )
    self._polling_semaphore = LazySemaphore(
        lambda: self.config.max_active_polls
    )

    # Headers forwarded on the SSE event stream request.
    self._event_stream_headers: Dict[str, str] = {}

    # Background tasks and the shutdown signal; all created in start().
    self._polling_task: Optional[asyncio.Task] = None
    self._cleanup_task: Optional[asyncio.Task] = None
    self._metrics_task: Optional[asyncio.Task] = None
    self._event_stream_task: Optional[asyncio.Task] = None
    self._shutdown_event: Optional[asyncio.Event] = None

    # Metrics and monitoring
    self.metrics = ExecutionManagerMetrics()

    # Circuit breaker state
    self._circuit_breaker_failures = 0
    self._circuit_breaker_last_failure = 0.0
    self._circuit_breaker_open = False

    logger.debug(f"AsyncExecutionManager initialized with base_url={base_url}")
|
|
232
|
+
|
|
233
|
+
def set_event_stream_headers(self, headers: Optional[Dict[str, str]]) -> None:
|
|
234
|
+
"""Configure headers forwarded to the SSE event stream."""
|
|
235
|
+
|
|
236
|
+
if headers is None:
|
|
237
|
+
self._event_stream_headers = {}
|
|
238
|
+
return
|
|
239
|
+
|
|
240
|
+
self._event_stream_headers = {
|
|
241
|
+
key: value for key, value in headers.items() if value is not None
|
|
242
|
+
}
|
|
243
|
+
|
|
244
|
+
async def __aenter__(self):
|
|
245
|
+
"""Async context manager entry."""
|
|
246
|
+
await self.start()
|
|
247
|
+
return self
|
|
248
|
+
|
|
249
|
+
async def __aexit__(self, exc_type, exc_val, exc_tb):
|
|
250
|
+
"""Async context manager exit."""
|
|
251
|
+
await self.stop()
|
|
252
|
+
|
|
253
|
+
async def start(self) -> None:
    """
    Bring up the collaborators and launch the background tasks.

    The polling and cleanup loops always run; the metrics loop and the event
    stream loop are only started when enabled in the config.

    Raises:
        RuntimeError: If manager is already started
    """
    already_running = self._polling_task is not None
    if already_running:
        raise RuntimeError("AsyncExecutionManager is already started")

    # Collaborators first: the background loops depend on them.
    await self.connection_manager.start()
    await self.result_cache.start()

    # Reuse the shutdown event across restarts, but make sure it is cleared.
    if self._shutdown_event is None:
        self._shutdown_event = asyncio.Event()
    self._shutdown_event.clear()

    spawn = asyncio.create_task
    self._polling_task = spawn(self._polling_loop())
    self._cleanup_task = spawn(self._cleanup_loop())

    if self.config.enable_performance_logging:
        self._metrics_task = spawn(self._metrics_loop())

    if self.config.enable_event_stream:
        self._event_stream_task = spawn(self._event_stream_loop())

    logger.info(
        f"AsyncExecutionManager started with max_concurrent={self.config.max_concurrent_executions}"
    )
|
|
284
|
+
|
|
285
|
+
async def stop(self) -> None:
    """
    Stop the execution manager and cleanup all resources.

    Signals shutdown, cancels the background tasks, cancels every execution
    that is still active (releasing its capacity slot), and finally closes the
    connection manager and stops the result cache.

    A background task that already terminated with an error is logged rather
    than re-raised, so shutdown always reaches the point where the connection
    manager and result cache are closed.
    """
    logger.info("Stopping AsyncExecutionManager...")

    # Signal shutdown (the event may not exist if start() was never called).
    if self._shutdown_event is None:
        self._shutdown_event = asyncio.Event()
    self._shutdown_event.set()

    background_tasks = [
        task
        for task in (
            self._polling_task,
            self._cleanup_task,
            self._metrics_task,
            self._event_stream_task,
        )
        if task is not None
    ]
    for task in background_tasks:
        task.cancel()

    try:
        if background_tasks:
            # return_exceptions=True: a task that crashed earlier must not
            # abort shutdown; its error is reported below instead.
            outcomes = await asyncio.gather(
                *background_tasks, return_exceptions=True
            )
            for outcome in outcomes:
                if isinstance(outcome, Exception) and not isinstance(
                    outcome, asyncio.CancelledError
                ):
                    logger.error(
                        f"Background task failed before shutdown: {outcome}"
                    )

        # Cancel all active executions
        async with self._execution_lock:
            for execution in self._executions.values():
                if execution.is_active:
                    execution.cancel("Manager shutdown")
                    self._release_capacity_for_execution(execution)
    finally:
        self._polling_task = None
        self._cleanup_task = None
        self._metrics_task = None
        self._event_stream_task = None

        # Stop components even if the steps above failed or were cancelled.
        try:
            await self.connection_manager.close()
        finally:
            await self.result_cache.stop()

    logger.info("AsyncExecutionManager stopped")
|
|
329
|
+
|
|
330
|
+
async def submit_execution(
    self,
    target: str,
    input_data: Dict[str, Any],
    headers: Optional[Dict[str, str]] = None,
    timeout: Optional[float] = None,
    priority: ExecutionPriority = ExecutionPriority.NORMAL,
    webhook: Optional[Union[WebhookConfig, Dict[str, Any]]] = None,
) -> str:
    """
    Submit an async execution and return execution_id.

    The call waits for a free capacity slot before contacting the server. The
    slot stays reserved after a successful submission and is handed back
    immediately if the submission fails or is cancelled.

    Args:
        target: Target endpoint for execution
        input_data: Input data for the execution
        headers: Optional HTTP headers (merged over the JSON content type)
        timeout: Optional execution timeout (uses config default if None)
        priority: Execution priority for queue management
        webhook: Optional webhook registration, as a WebhookConfig or a dict
            that is sent to the server unchanged

    Returns:
        str: Execution ID for tracking the execution

    Raises:
        RuntimeError: If manager is not started or the circuit breaker is open
        TypeError: If webhook is neither a WebhookConfig nor a dict
        ValueError: If the server response carries no execution_id
        aiohttp.ClientError: For HTTP-related errors
    """
    if self._polling_task is None:
        raise RuntimeError("AsyncExecutionManager is not started")

    # Check circuit breaker
    if self._is_circuit_breaker_open():
        raise RuntimeError("Circuit breaker is open - too many recent failures")

    # Build and validate the request BEFORE reserving capacity, so a caller
    # error (bad webhook type) cannot leak a capacity slot.
    payload: Dict[str, Any] = {
        "input": input_data,
    }

    if webhook:
        if isinstance(webhook, WebhookConfig):
            payload["webhook"] = webhook.to_payload()
        elif isinstance(webhook, dict):
            payload["webhook"] = webhook
        else:
            raise TypeError("webhook must be a WebhookConfig or dict")

    url = urljoin(self.base_url, f"/api/v1/execute/async/{target}")
    request_headers = {"Content-Type": "application/json", **(headers or {})}

    # Set timeout
    execution_timeout = timeout or self.config.default_execution_timeout

    # Reserve capacity slot; released once terminal
    await self._capacity_semaphore.acquire()

    try:
        # Submit execution
        start_time = time.time()
        async with self.connection_manager.get_session() as session:
            response = await session.post(
                url,
                json=payload,
                headers=request_headers,
                timeout=self.config.polling_timeout,
            )
            response.raise_for_status()
            result = await response.json()

            execution_id = result.get("execution_id")
            if not execution_id:
                raise ValueError("Server did not return execution_id")

            workflow_id = result.get("workflow_id") or result.get("run_id")
            status = self._map_execution_status(result.get("status"))
            created_at = self._parse_timestamp(result.get("created_at"))
            webhook_registered = bool(result.get("webhook_registered"))
            webhook_error = result.get("webhook_error")

            if webhook and not webhook_registered and webhook_error:
                logger.warning(
                    "Webhook registration rejected for %s: %s",
                    target,
                    webhook_error,
                )

            # Create execution state
            execution_state = ExecutionState(
                execution_id=execution_id,
                target=target,
                input_data=input_data,
                status=status,
                priority=priority,
                timeout=execution_timeout,
                workflow_id=workflow_id,
                created_at=created_at or datetime.now(timezone.utc),
                updated_at=created_at or datetime.now(timezone.utc),
                webhook_registered=webhook_registered,
                webhook_error=webhook_error,
            )

            # Store execution
            async with self._execution_lock:
                self._executions[execution_id] = execution_state
                self.metrics.total_executions += 1
                self.metrics.active_executions += 1

                # Update peak concurrent executions
                if (
                    self.metrics.active_executions
                    > self.metrics.peak_concurrent_executions
                ):
                    self.metrics.peak_concurrent_executions = (
                        self.metrics.active_executions
                    )

            # Reset circuit breaker on success
            self._circuit_breaker_failures = 0

            duration = time.time() - start_time
            logger.debug(
                f"Submitted execution {execution_id[:8]}... for target {target} in {duration:.3f}s"
            )

            return execution_id

    except asyncio.CancelledError:
        # Cancellation is not a server failure: give the slot back without
        # counting it against the circuit breaker.
        self._capacity_semaphore.release()
        raise
    except Exception as e:
        self._capacity_semaphore.release()
        self._record_circuit_breaker_failure()
        logger.error(f"Failed to submit execution for target {target}: {e}")
        raise
|
|
459
|
+
|
|
460
|
+
def _map_execution_status(self, status: Optional[str]) -> ExecutionStatus:
    """Translate a status string into an ExecutionStatus member.

    The lookup is done on the lower-cased string against the enum's values;
    a missing, empty or unrecognised status maps to ``ExecutionStatus.QUEUED``.
    """
    fallback = ExecutionStatus.QUEUED
    if not status:
        return fallback
    return ExecutionStatus._value2member_map_.get(status.lower(), fallback)
|
|
467
|
+
|
|
468
|
+
@staticmethod
|
|
469
|
+
def _parse_timestamp(value: Optional[str]) -> Optional[datetime]:
|
|
470
|
+
if not value:
|
|
471
|
+
return None
|
|
472
|
+
try:
|
|
473
|
+
return datetime.fromisoformat(value.replace("Z", "+00:00"))
|
|
474
|
+
except ValueError:
|
|
475
|
+
return None
|
|
476
|
+
|
|
477
|
+
async def wait_for_result(
    self, execution_id: str, timeout: Optional[float] = None
) -> Any:
    """
    Wait until a tracked execution reaches a terminal state and return its result.

    The result cache is consulted first. Otherwise the tracked execution state
    is re-checked roughly every 0.1 seconds until it becomes terminal or the
    wait budget is used up. The budget is measured with a monotonic clock so
    system clock adjustments cannot shorten or extend the wait.

    If the budget runs out while the execution is still active, the execution
    is marked as timed out and the timeout counter in the metrics is
    incremented before ``TimeoutError`` is raised.

    Args:
        execution_id: Execution ID to wait for
        timeout: Optional timeout override; falls back to the execution's own
            timeout, then to the config default

    Returns:
        Any: Execution result

    Raises:
        KeyError: If execution_id is not found (or is removed while waiting)
        TimeoutError: If execution times out
        RuntimeError: If execution fails or is cancelled
    """
    # Check cache first
    cached_result = self.result_cache.get_execution_result(execution_id)
    if cached_result is not None:
        logger.debug(f"Retrieved cached result for execution {execution_id[:8]}...")
        return cached_result

    # Get execution state
    async with self._execution_lock:
        execution = self._executions.get(execution_id)
        if execution is None:
            raise KeyError(f"Execution {execution_id} not found")

    # Set timeout
    wait_timeout = (
        timeout or execution.timeout or self.config.default_execution_timeout
    )
    deadline = time.monotonic() + wait_timeout

    # Wait for completion
    while True:
        remaining = deadline - time.monotonic()
        if remaining <= 0:
            break

        async with self._execution_lock:
            execution = self._executions.get(execution_id)
            if execution is None:
                raise KeyError(f"Execution {execution_id} was removed")

            if execution.is_terminal:
                if execution.is_successful:
                    # Cache successful result
                    if execution.result is not None:
                        self.result_cache.set_execution_result(
                            execution_id, execution.result
                        )
                    return execution.result
                elif execution.status == ExecutionStatus.FAILED:
                    raise RuntimeError(
                        f"Execution failed: {execution.error_message}"
                    )
                elif execution.status == ExecutionStatus.CANCELLED:
                    raise RuntimeError(
                        f"Execution was cancelled: {execution._cancellation_reason}"
                    )
                elif execution.status == ExecutionStatus.TIMEOUT:
                    raise TimeoutError(
                        f"Execution timed out after {execution.timeout} seconds"
                    )

        # Wait before next check, but never sleep past the deadline.
        await asyncio.sleep(min(0.1, remaining))

    # Timeout reached
    async with self._execution_lock:
        execution = self._executions.get(execution_id)
        if execution and execution.is_active:
            execution.timeout_execution()
            self.metrics.timeout_executions += 1

    raise TimeoutError(f"Wait timeout reached after {wait_timeout} seconds")
|
|
552
|
+
|
|
553
|
+
async def cancel_execution(
    self, execution_id: str, reason: Optional[str] = None
) -> bool:
    """
    Mark a tracked, still-running execution as cancelled.

    On success the cancelled counter is incremented and the active counter is
    decremented.

    Args:
        execution_id: Execution ID to cancel
        reason: Optional cancellation reason

    Returns:
        bool: True if execution was cancelled, False if not found or already terminal
    """
    async with self._execution_lock:
        tracked = self._executions.get(execution_id)

        # Nothing to do for unknown or already finished executions.
        if tracked is None:
            return False
        if tracked.is_terminal:
            return False

        tracked.cancel(reason)

        metrics = self.metrics
        metrics.cancelled_executions += 1
        metrics.active_executions -= 1

        logger.debug(
            f"Cancelled execution {execution_id[:8]}... - {reason or 'No reason provided'}"
        )
        return True
|
|
579
|
+
|
|
580
|
+
async def get_execution_status(self, execution_id: str) -> Optional[Dict[str, Any]]:
|
|
581
|
+
"""
|
|
582
|
+
Get current status of an execution.
|
|
583
|
+
|
|
584
|
+
Args:
|
|
585
|
+
execution_id: Execution ID to check
|
|
586
|
+
|
|
587
|
+
Returns:
|
|
588
|
+
Optional[Dict]: Execution status dictionary or None if not found
|
|
589
|
+
"""
|
|
590
|
+
async with self._execution_lock:
|
|
591
|
+
execution = self._executions.get(execution_id)
|
|
592
|
+
if execution is None:
|
|
593
|
+
return None
|
|
594
|
+
|
|
595
|
+
return execution.to_dict()
|
|
596
|
+
|
|
597
|
+
async def list_executions(
    self,
    status_filter: Optional[ExecutionStatus] = None,
    limit: Optional[int] = None,
) -> List[Dict[str, Any]]:
    """
    Return dictionary snapshots of tracked executions, newest first.

    Args:
        status_filter: When given, only executions with this status are returned
        limit: When given (and non-zero), at most this many entries are returned

    Returns:
        List[Dict]: ``to_dict()`` output of each selected execution, ordered by
        ``created_at`` descending
    """
    # Copy under the lock; filtering and sorting work on the copy.
    async with self._execution_lock:
        snapshot = list(self._executions.values())

    if status_filter:
        snapshot = [item for item in snapshot if item.status == status_filter]

    newest_first = sorted(snapshot, key=lambda item: item.created_at, reverse=True)

    if limit:
        newest_first = newest_first[:limit]

    return [item.to_dict() for item in newest_first]
|
|
627
|
+
|
|
628
|
+
async def cleanup_completed_executions(self) -> int:
    """
    Drop terminal executions that are too old or exceed the retention cap.

    Two rules select executions for removal:

    1. Age: if ``completed_execution_retention_seconds`` is positive, terminal
       executions whose end time (or submit time when no end time is recorded)
       is older than that many seconds.
    2. Count: if more than ``max_completed_executions`` terminal executions
       remain after rule 1, the ones with the earliest end time are removed
       until the cap is met.

    Before removal, a successful execution's non-None result is stored in the
    result cache, and the execution's capacity slot is released.

    Returns:
        int: Number of executions cleaned up
    """
    removed = 0
    now = time.time()

    async with self._execution_lock:
        terminal = {
            exec_id: state
            for exec_id, state in self._executions.items()
            if state.is_terminal
        }
        if not terminal:
            return 0

        doomed = set()

        # Rule 1: age-based pruning.
        retention_seconds = self.config.completed_execution_retention_seconds
        if retention_seconds > 0:
            for exec_id, state in terminal.items():
                finished_at = state.metrics.end_time or state.metrics.submit_time
                if finished_at and (now - finished_at) > retention_seconds:
                    doomed.add(exec_id)

        # Rule 2: cap on how many terminal executions may remain.
        survivors = [
            (exec_id, state)
            for exec_id, state in terminal.items()
            if exec_id not in doomed
        ]
        cap = self.config.max_completed_executions
        if len(survivors) > cap:
            # Earliest end time first; a missing end time sorts as 0.
            survivors.sort(key=lambda pair: pair[1].metrics.end_time or 0)
            excess = len(survivors) - cap
            for exec_id, _state in survivors[:excess]:
                doomed.add(exec_id)

        # Apply the removals.
        for exec_id in doomed:
            state = terminal.get(exec_id)
            if state is None:
                continue

            if state.is_successful and state.result is not None:
                self.result_cache.set_execution_result(exec_id, state.result)

            self._release_capacity_for_execution(state)
            self._executions.pop(exec_id, None)
            removed += 1

        if removed > 0:
            self.metrics.cleanup_operations += 1
            self.metrics.last_cleanup = now
            logger.debug(f"Cleaned up {removed} completed executions")

    return removed
|
|
694
|
+
|
|
695
|
+
async def _event_stream_loop(self) -> None:
    """Listen for execution events over SSE and nudge polling.

    Connects to the configured event stream path, splits the response into
    SSE events (blank-line separated), decodes the JSON carried by their
    ``data:`` lines and hands each JSON object to
    ``_handle_event_stream_payload``. On a non-200 response, an error, or a
    stream that ends, it waits for the retry backoff (at least 0.5 seconds)
    and reconnects, until shutdown is signalled or the task is cancelled.
    """
    import codecs  # function-scope import: only this method needs it

    def parse_event(raw_event: str) -> Optional[Dict[str, Any]]:
        """Return the JSON object carried by one raw SSE event, or None."""
        # Comment lines start with ":" and therefore never match "data:".
        data_lines = [
            line[5:].lstrip()
            for line in raw_event.splitlines()
            if line.startswith("data:")
        ]
        payload_str = "\n".join(data_lines).strip()
        if not payload_str:
            return None
        try:
            payload = json.loads(payload_str)
        except json.JSONDecodeError:
            logger.debug(
                f"Failed to decode SSE payload: {payload_str[:120]}"
            )
            return None
        # Only JSON objects are meaningful events; anything else (list,
        # string, number) is ignored instead of breaking the connection.
        return payload if isinstance(payload, dict) else None

    logger.debug("Starting event stream loop")

    url = urljoin(self.base_url, self.config.event_stream_path)
    backoff = max(self.config.event_stream_retry_backoff, 0.5)

    while not self._shutdown_event.is_set():
        try:
            request_headers = {"Accept": "text/event-stream"}
            if self._event_stream_headers:
                request_headers.update(self._event_stream_headers)

            async with self.connection_manager.get_session() as session:
                # The stream is long-lived: no total or read timeout.
                timeout = aiohttp.ClientTimeout(total=None, sock_read=None)
                async with session.get(
                    url, headers=request_headers, timeout=timeout
                ) as response:
                    if response.status != 200:
                        body = await response.text()
                        logger.warning(
                            f"Event stream returned {response.status} for {url}: {body[:256]}"
                        )
                        await asyncio.sleep(backoff)
                        continue

                    # Incremental decoding keeps multi-byte UTF-8 characters
                    # intact when they are split across two chunks.
                    decoder = codecs.getincrementaldecoder("utf-8")(
                        errors="ignore"
                    )
                    buffer = ""
                    async for chunk in response.content.iter_chunked(1024):
                        if self._shutdown_event.is_set():
                            break
                        if not chunk:
                            continue

                        buffer += decoder.decode(chunk)
                        # Accept CRLF line endings as well as bare LF. A lone
                        # trailing "\r" is left alone until its "\n" arrives.
                        if "\r" in buffer:
                            buffer = buffer.replace("\r\n", "\n")

                        while "\n\n" in buffer:
                            raw_event, buffer = buffer.split("\n\n", 1)
                            payload = parse_event(raw_event)
                            if payload is not None:
                                await self._handle_event_stream_payload(payload)

            # The server closed the stream: pause before reconnecting so a
            # server that closes immediately cannot cause a busy loop.
            if not self._shutdown_event.is_set():
                await asyncio.sleep(backoff)

        except asyncio.CancelledError:
            break
        except Exception as e:
            if self._shutdown_event.is_set():
                break
            logger.warning(f"Event stream error: {e}")
            await asyncio.sleep(backoff)

    logger.debug("Event stream loop stopped")
|
|
769
|
+
|
|
770
|
+
async def _handle_event_stream_payload(self, payload: Dict[str, Any]) -> None:
    """Process a single SSE payload.

    The payload is matched to a tracked execution through ``execution_id``
    (or ``executionId``); payloads that are not dicts, carry no id, or refer
    to an untracked execution are ignored. The execution's status is updated
    from the payload's ``status`` and ``type`` fields, and for terminal events
    an immediate poll of that execution is scheduled.

    Terminal status resolution: an explicit terminal ``status`` wins;
    otherwise an ``execution_failed`` event means FAILED and an
    ``execution_completed`` event means SUCCEEDED.
    """
    if not isinstance(payload, dict):
        return

    execution_id = payload.get("execution_id") or payload.get("executionId")
    if not execution_id:
        return

    schedule_poll = False
    status_hint = normalize_status(payload.get("status"))
    event_type = str(payload.get("type", "")).lower()

    terminal_statuses = {
        "succeeded": ExecutionStatus.SUCCEEDED,
        "failed": ExecutionStatus.FAILED,
        "cancelled": ExecutionStatus.CANCELLED,
        "timeout": ExecutionStatus.TIMEOUT,
    }

    async with self._execution_lock:
        execution = self._executions.get(execution_id)
        if execution is None:
            return

        if event_type == "execution_started" or status_hint == "running":
            execution.update_status(ExecutionStatus.RUNNING)
        elif status_hint == "queued":
            execution.update_status(ExecutionStatus.QUEUED)
        elif status_hint == "pending":
            execution.update_status(ExecutionStatus.PENDING)
        elif status_hint in terminal_statuses or event_type in {
            "execution_completed",
            "execution_failed",
        }:
            if status_hint in terminal_statuses:
                new_status = terminal_statuses[status_hint]
            elif event_type == "execution_failed":
                # A failure event without a usable status must not be
                # recorded as a success.
                new_status = ExecutionStatus.FAILED
            else:
                new_status = ExecutionStatus.SUCCEEDED
            execution.update_status(new_status)
            schedule_poll = True

    if schedule_poll:
        poll_task = asyncio.create_task(
            self._poll_execution_immediate(execution_id)
        )
        # The event loop only keeps weak references to tasks; hold a strong
        # one until the poll finishes so it cannot be garbage-collected
        # mid-flight.
        in_flight = getattr(self, "_immediate_poll_tasks", None)
        if in_flight is None:
            in_flight = set()
            self._immediate_poll_tasks = in_flight
        in_flight.add(poll_task)
        poll_task.add_done_callback(in_flight.discard)
|
|
809
|
+
|
|
810
|
+
async def _poll_execution_immediate(self, execution_id: str) -> None:
    """Poll one tracked execution right away, outside the regular poll cycle."""
    # Hold the lock only for the lookup; the poll itself runs unlocked.
    async with self._execution_lock:
        tracked = self._executions.get(execution_id)

    # Unknown executions and finished ones that already hold a result need no poll.
    if tracked is None or (tracked.is_terminal and tracked.result is not None):
        return

    try:
        await self._poll_single_execution(tracked)
    except Exception as exc:
        logger.debug(f"Immediate poll for {execution_id[:8]}... failed: {exc}")
|
|
825
|
+
|
|
826
|
+
async def start_polling_task(self) -> None:
    """
    Launch the background polling loop unless one is already running.

    Note: start() calls this automatically; it should not be called manually.
    """
    current = self._polling_task
    if current is not None and not current.done():
        # A live polling task already exists; nothing to start.
        return

    self._polling_task = asyncio.create_task(self._polling_loop())
    logger.debug("Background polling task started")
|
|
835
|
+
|
|
836
|
+
async def stop_polling_task(self) -> None:
    """
    Cancel the background polling loop and wait for it to finish.

    Note: stop() calls this automatically; it should not be called manually.
    """
    task = self._polling_task
    if not task:
        return

    task.cancel()
    try:
        await task
    except asyncio.CancelledError:
        # Expected outcome of the cancel() issued above.
        pass

    self._polling_task = None
    logger.debug("Background polling task stopped")
|
|
850
|
+
|
|
851
|
+
def get_metrics(self) -> Dict[str, Any]:
    """
    Get comprehensive execution manager metrics.

    When called while an event loop is running, a task is scheduled that
    recounts the active executions under the execution lock. The snapshot is
    built immediately, so "active_executions" reflects the counter as of this
    call and the recount becomes visible on a later call.

    Returns:
        Dict[str, Any]: Metrics dictionary
    """

    async def _update_metrics():
        async with self._execution_lock:
            self.metrics.active_executions = sum(
                1 for e in self._executions.values() if e.is_active
            )

    try:
        loop = asyncio.get_running_loop()
    except RuntimeError:
        loop = None  # Not in async context

    if loop is not None:
        refresh_task = loop.create_task(_update_metrics())
        # The loop keeps only weak references to tasks; hold a strong one
        # until the recount has finished.
        refresh_tasks = getattr(self, "_metrics_refresh_tasks", None)
        if refresh_tasks is None:
            refresh_tasks = set()
            self._metrics_refresh_tasks = refresh_tasks
        refresh_tasks.add(refresh_task)
        refresh_task.add_done_callback(refresh_tasks.discard)

    metrics = self.metrics
    polling = metrics.polling_metrics

    return {
        "total_executions": metrics.total_executions,
        "active_executions": metrics.active_executions,
        "completed_executions": metrics.completed_executions,
        "failed_executions": metrics.failed_executions,
        "cancelled_executions": metrics.cancelled_executions,
        "timeout_executions": metrics.timeout_executions,
        "success_rate": metrics.success_rate,
        "average_execution_time": metrics.average_execution_time,
        "average_queue_time": metrics.average_queue_time,
        "peak_concurrent_executions": metrics.peak_concurrent_executions,
        "memory_usage_mb": metrics.memory_usage_mb,
        "cleanup_operations": metrics.cleanup_operations,
        "uptime": metrics.uptime,
        "polling_metrics": {
            "total_polls": polling.total_polls,
            "successful_polls": polling.successful_polls,
            "failed_polls": polling.failed_polls,
            "success_rate": polling.success_rate,
            "average_poll_duration": polling.average_poll_duration,
            "batch_polls": polling.batch_polls,
        },
        "circuit_breaker": {
            "failures": self._circuit_breaker_failures,
            "is_open": self._circuit_breaker_open,
            "last_failure": self._circuit_breaker_last_failure,
        },
        # Shallow copy so callers cannot mutate the connection manager's
        # metrics object through the returned snapshot.
        "connection_manager": dict(self.connection_manager.get_metrics().__dict__),
        "result_cache": self.result_cache.get_stats(),
    }
|
|
902
|
+
|
|
903
|
+
async def _polling_loop(self) -> None:
    """Run poll cycles at the configured batch interval until shutdown or cancellation."""
    logger.debug("Starting polling loop")

    while True:
        if self._shutdown_event.is_set():
            break

        try:
            await self._poll_active_executions()
            await asyncio.sleep(self.config.batch_poll_interval)
        except asyncio.CancelledError:
            # Cancellation ends the loop quietly.
            break
        except Exception as e:
            logger.error(f"Polling loop error: {e}")
            # Short pause so a persistent error does not spin the loop.
            await asyncio.sleep(1.0)

    logger.debug("Polling loop stopped")
|
|
919
|
+
|
|
920
|
+
async def _poll_active_executions(self) -> None:
    """
    Poll every tracked execution that is currently due for a poll.

    Executions flagged as overdue are timed out locally instead of being
    polled: the timeout counter is incremented, the active counter is
    decremented and the execution's capacity slot is released. The remaining
    due executions are polled in batch mode when batch polling is enabled and
    at least three are due, otherwise individually.
    """
    due = []

    async with self._execution_lock:
        for execution in self._executions.values():
            if not execution.should_poll:
                continue

            if execution.is_overdue:
                execution.timeout_execution()
                self.metrics.timeout_executions += 1
                self.metrics.active_executions -= 1
                # Release the capacity slot like the other terminal
                # transitions do; the helper is a no-op if it was already
                # released for this execution.
                self._release_capacity_for_execution(execution)
                continue

            due.append(execution)

    if not due:
        return

    # Batching only pays off once a few executions are due at the same time.
    if self.config.enable_batch_polling and len(due) >= 3:
        await self._batch_poll_executions(due)
    else:
        await self._individual_poll_executions(due)
|
|
947
|
+
|
|
948
|
+
async def _batch_poll_executions(self, executions: List[ExecutionState]) -> None:
    """
    Poll multiple executions in batches for efficiency.

    The executions are split into chunks of at most ``config.batch_size``.
    Each chunk is sent as one batch request and every response is handed to
    the poll-response processor with an equal share of the batch duration.
    If a chunk fails as a whole, that chunk is polled individually instead.

    Args:
        executions: Executions to poll; an empty list is a no-op.
    """
    if not executions:
        return

    # Clamp to at least 1: a zero step would make range() raise ValueError
    # when the configured batch size is zero or negative.
    batch_size = max(1, min(self.config.batch_size, len(executions)))

    for offset in range(0, len(executions), batch_size):
        batch = executions[offset : offset + batch_size]

        requests = [
            {
                "method": "GET",
                "url": self._execution_status_url(execution.execution_id),
                "timeout": self.config.polling_timeout,
            }
            for execution in batch
        ]

        start_time = time.time()
        try:
            responses = await self.connection_manager.batch_request(requests)
            duration = time.time() - start_time

            # Attribute an equal share of the batch round-trip to each poll.
            for execution, response in zip(batch, responses):
                await self._process_poll_response(
                    execution, response, duration / len(batch)
                )

            self.metrics.polling_metrics.batch_polls += 1

        except Exception as e:
            logger.error(f"Batch polling failed: {e}")
            # Fall back to individual polling for this chunk only.
            await self._individual_poll_executions(batch)
|
|
985
|
+
|
|
986
|
+
async def _individual_poll_executions(
    self, executions: List[ExecutionState]
) -> None:
    """Poll each execution concurrently, bounded by the polling semaphore."""

    async def _guarded_poll(target: ExecutionState):
        # The semaphore caps how many polls are in flight at once.
        async with self._polling_semaphore:
            await self._poll_single_execution(target)

    # return_exceptions=True keeps one failing poll from aborting the others.
    await asyncio.gather(
        *[_guarded_poll(item) for item in executions], return_exceptions=True
    )
|
|
999
|
+
|
|
1000
|
+
async def _poll_single_execution(self, execution: ExecutionState) -> None:
    """
    Fetch one execution's status and pass the outcome to the response processor.

    On success the response object is forwarded; if anything in the attempt
    raises, the exception itself is forwarded instead. The elapsed time is
    forwarded in both cases.
    """
    status_url = self._execution_status_url(execution.execution_id)
    started_at = time.time()

    try:
        outcome = await self.connection_manager.request(
            "GET", status_url, timeout=self.config.polling_timeout
        )
        elapsed = time.time() - started_at
        await self._process_poll_response(execution, outcome, elapsed)
    except Exception as error:
        elapsed = time.time() - started_at
        await self._process_poll_response(execution, error, elapsed)
|
|
1016
|
+
|
|
1017
|
+
async def _process_poll_response(
    self, execution: ExecutionState, response: Any, duration: float
) -> None:
    """
    Apply the outcome of one poll to the execution and to the polling metrics.

    Args:
        execution: Execution that was polled.
        response: Either the response object of a completed request or the
            exception raised while polling.
        duration: Seconds attributed to this poll.
    """
    poll_succeeded = False
    timed_out = False

    try:
        if not isinstance(response, Exception):
            # A response arrived: validate it and apply the reported status.
            response.raise_for_status()
            status_data = await response.json()

            await self._update_execution_from_status(execution, status_data)

            execution.record_poll_attempt(True, duration)
            poll_succeeded = True

            # The next poll interval is chosen from the execution's age.
            age_based_interval = self.config.get_poll_interval_for_age(execution.age)
            execution.update_poll_interval(age_based_interval)
        else:
            # The poll itself failed before a response was available.
            timed_out = isinstance(response, asyncio.TimeoutError)

            execution.record_poll_attempt(False, duration)

            # Back off by 1.5x, capped at the configured maximum interval.
            backed_off_interval = min(
                execution.current_poll_interval * 1.5, self.config.max_poll_interval
            )
            execution.update_poll_interval(backed_off_interval)

            logger.debug(
                f"Poll failed for execution {execution.execution_id[:8]}...: {response}"
            )

    except Exception as e:
        execution.record_poll_attempt(False, duration)
        logger.error(
            f"Error processing poll response for {execution.execution_id[:8]}...: {e}"
        )

    finally:
        # Metrics are recorded no matter how the processing above ended.
        self.metrics.polling_metrics.record_poll(
            poll_succeeded, duration, timed_out
        )
|
|
1068
|
+
|
|
1069
|
+
def _execution_status_url(self, execution_id: str) -> str:
    """Build the status endpoint URL for *execution_id* from the base URL."""
    status_path = f"/api/v1/executions/{execution_id}"
    return urljoin(self.base_url, status_path)
|
|
1072
|
+
|
|
1073
|
+
async def _update_execution_from_status(
    self, execution: ExecutionState, status_data: Dict[str, Any]
) -> None:
    """
    Update execution state from a status response.

    A terminal status (succeeded / failed / cancelled / timeout) stores the
    result or error on the execution, increments the matching metrics
    counter, decrements the active counter and releases the execution's
    capacity slot. Any other status change only updates the local status.

    Finalization also runs when the local status already equals the reported
    terminal status but the capacity slot has not been released yet. The SSE
    payload handler sets the terminal status before its follow-up poll runs;
    comparing statuses alone would make that poll a no-op and the result,
    metrics and capacity release would be skipped.

    Args:
        execution: Tracked execution to update.
        status_data: Decoded status payload; reads "status", "result",
            "error" and "error_details".
    """
    normalized = normalize_status(status_data.get("status"))

    try:
        new_status = ExecutionStatus(normalized)
    except ValueError:
        logger.warning(
            "Unknown status '%s' for execution %s",
            normalized,
            execution.execution_id[:8],
        )
        return

    old_status = execution.status

    # Metrics counter to bump for each terminal status.
    terminal_counters = {
        ExecutionStatus.SUCCEEDED: "completed_executions",
        ExecutionStatus.FAILED: "failed_executions",
        ExecutionStatus.CANCELLED: "cancelled_executions",
        ExecutionStatus.TIMEOUT: "timeout_executions",
    }
    counter_name = terminal_counters.get(new_status)

    # "_capacity_released" is set by _release_capacity_for_execution(), so it
    # doubles as the marker that finalization has already happened once.
    awaiting_finalization = counter_name is not None and not getattr(
        execution, "_capacity_released", False
    )

    if new_status == old_status and not awaiting_finalization:
        return

    if new_status == ExecutionStatus.SUCCEEDED:
        execution.set_result(status_data.get("result"))
    elif new_status == ExecutionStatus.FAILED:
        error_msg = status_data.get("error", "Execution failed")
        error_details = status_data.get("error_details")
        execution.set_error(error_msg, error_details)
    else:
        execution.update_status(new_status)

    if counter_name is not None:
        async with self._execution_lock:
            setattr(
                self.metrics, counter_name, getattr(self.metrics, counter_name) + 1
            )
            self.metrics.active_executions -= 1
            self._release_capacity_for_execution(execution)

    old_repr = getattr(old_status, "value", old_status)
    new_repr = getattr(new_status, "value", new_status)
    logger.debug(
        f"Execution {execution.execution_id[:8]}... status: {old_repr} -> {new_repr}"
    )
|
|
1136
|
+
|
|
1137
|
+
def _release_capacity_for_execution(self, execution: ExecutionState) -> None:
    """Release the capacity semaphore for *execution* at most once."""
    already_released = getattr(execution, "_capacity_released", False)
    if not already_released:
        # Mark first so a repeated call for the same execution is a no-op.
        execution._capacity_released = True
        try:
            self._capacity_semaphore.release()
        except ValueError:
            # Semaphore already fully released (can occur during shutdown cleanup)
            pass
|
|
1146
|
+
|
|
1147
|
+
async def _cleanup_loop(self) -> None:
    """Run cleanup of completed executions once per cleanup interval until shutdown."""
    logger.debug("Starting cleanup loop")

    while True:
        if self._shutdown_event.is_set():
            break

        try:
            # Sleep first, then clean up.
            await asyncio.sleep(self.config.cleanup_interval)
            await self.cleanup_completed_executions()
        except asyncio.CancelledError:
            break
        except Exception as e:
            logger.error(f"Cleanup loop error: {e}")

    logger.debug("Cleanup loop stopped")
|
|
1162
|
+
|
|
1163
|
+
async def _metrics_loop(self) -> None:
    """Log a one-line metrics summary every 60 seconds until shutdown or cancellation."""
    logger.debug("Starting metrics loop")

    while True:
        if self._shutdown_event.is_set():
            break

        try:
            await asyncio.sleep(60.0)  # Log metrics every minute

            snapshot = self.get_metrics()
            summary = (
                f"Execution metrics: "
                f"active={snapshot['active_executions']}, "
                f"total={snapshot['total_executions']}, "
                f"success_rate={snapshot['success_rate']:.1f}%, "
                f"poll_success_rate={snapshot['polling_metrics']['success_rate']:.1f}%"
            )
            logger.debug(summary)
        except asyncio.CancelledError:
            break
        except Exception as e:
            logger.error(f"Metrics loop error: {e}")

    logger.debug("Metrics loop stopped")
|
|
1186
|
+
|
|
1187
|
+
def _is_circuit_breaker_open(self) -> bool:
    """
    Report whether the circuit breaker is currently open.

    An open breaker closes itself (and its failure count is reset) once more
    than the configured recovery timeout has passed since the last failure.
    """
    is_open = False

    if self._circuit_breaker_open:
        since_last_failure = time.time() - self._circuit_breaker_last_failure
        if since_last_failure > self.config.circuit_breaker_recovery_timeout:
            # Recovery window elapsed: close the breaker and start counting afresh.
            self._circuit_breaker_open = False
            self._circuit_breaker_failures = 0
            logger.info("Circuit breaker closed - attempting recovery")
        else:
            is_open = True

    return is_open
|
|
1203
|
+
|
|
1204
|
+
def _record_circuit_breaker_failure(self) -> None:
    """Count one failure and open the circuit breaker once the threshold is reached."""
    self._circuit_breaker_failures += 1
    self._circuit_breaker_last_failure = time.time()

    threshold_reached = (
        self._circuit_breaker_failures
        >= self.config.circuit_breaker_failure_threshold
    )
    if threshold_reached:
        self._circuit_breaker_open = True
        logger.warn(
            f"Circuit breaker opened after {self._circuit_breaker_failures} failures"
        )
|
|
1217
|
+
|
|
1218
|
+
def __repr__(self) -> str:
    """Return a short summary with the base URL and headline execution metrics."""
    pieces = [
        f"AsyncExecutionManager(",
        f"base_url='{self.base_url}', ",
        f"active_executions={self.metrics.active_executions}, ",
        f"total_executions={self.metrics.total_executions}, ",
        f"success_rate={self.metrics.success_rate:.1f}%",
        f")",
    ]
    return "".join(pieces)
|