puffinflow 2.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- puffinflow/__init__.py +132 -0
- puffinflow/core/__init__.py +110 -0
- puffinflow/core/agent/__init__.py +320 -0
- puffinflow/core/agent/base.py +1635 -0
- puffinflow/core/agent/checkpoint.py +50 -0
- puffinflow/core/agent/context.py +521 -0
- puffinflow/core/agent/decorators/__init__.py +90 -0
- puffinflow/core/agent/decorators/builder.py +454 -0
- puffinflow/core/agent/decorators/flexible.py +714 -0
- puffinflow/core/agent/decorators/inspection.py +144 -0
- puffinflow/core/agent/dependencies.py +57 -0
- puffinflow/core/agent/scheduling/__init__.py +21 -0
- puffinflow/core/agent/scheduling/builder.py +160 -0
- puffinflow/core/agent/scheduling/exceptions.py +35 -0
- puffinflow/core/agent/scheduling/inputs.py +137 -0
- puffinflow/core/agent/scheduling/parser.py +209 -0
- puffinflow/core/agent/scheduling/scheduler.py +413 -0
- puffinflow/core/agent/state.py +141 -0
- puffinflow/core/config.py +62 -0
- puffinflow/core/coordination/__init__.py +137 -0
- puffinflow/core/coordination/agent_group.py +359 -0
- puffinflow/core/coordination/agent_pool.py +629 -0
- puffinflow/core/coordination/agent_team.py +577 -0
- puffinflow/core/coordination/coordinator.py +720 -0
- puffinflow/core/coordination/deadlock.py +1759 -0
- puffinflow/core/coordination/fluent_api.py +421 -0
- puffinflow/core/coordination/primitives.py +478 -0
- puffinflow/core/coordination/rate_limiter.py +520 -0
- puffinflow/core/observability/__init__.py +47 -0
- puffinflow/core/observability/agent.py +139 -0
- puffinflow/core/observability/alerting.py +73 -0
- puffinflow/core/observability/config.py +127 -0
- puffinflow/core/observability/context.py +88 -0
- puffinflow/core/observability/core.py +147 -0
- puffinflow/core/observability/decorators.py +105 -0
- puffinflow/core/observability/events.py +71 -0
- puffinflow/core/observability/interfaces.py +196 -0
- puffinflow/core/observability/metrics.py +137 -0
- puffinflow/core/observability/tracing.py +209 -0
- puffinflow/core/reliability/__init__.py +27 -0
- puffinflow/core/reliability/bulkhead.py +96 -0
- puffinflow/core/reliability/circuit_breaker.py +149 -0
- puffinflow/core/reliability/leak_detector.py +122 -0
- puffinflow/core/resources/__init__.py +77 -0
- puffinflow/core/resources/allocation.py +790 -0
- puffinflow/core/resources/pool.py +645 -0
- puffinflow/core/resources/quotas.py +567 -0
- puffinflow/core/resources/requirements.py +217 -0
- puffinflow/version.py +21 -0
- puffinflow-2.dev0.dist-info/METADATA +334 -0
- puffinflow-2.dev0.dist-info/RECORD +55 -0
- puffinflow-2.dev0.dist-info/WHEEL +5 -0
- puffinflow-2.dev0.dist-info/entry_points.txt +3 -0
- puffinflow-2.dev0.dist-info/licenses/LICENSE +21 -0
- puffinflow-2.dev0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,629 @@
|
|
|
1
|
+
"""Agent pool with dynamic scaling capabilities."""
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
import contextlib
|
|
5
|
+
import logging
|
|
6
|
+
import time
|
|
7
|
+
from collections import deque
|
|
8
|
+
from dataclasses import dataclass, field
|
|
9
|
+
from enum import Enum
|
|
10
|
+
from typing import TYPE_CHECKING, Any, Callable, Optional
|
|
11
|
+
|
|
12
|
+
if TYPE_CHECKING:
|
|
13
|
+
from collections.abc import AsyncGenerator
|
|
14
|
+
|
|
15
|
+
from ..agent.base import Agent, AgentResult
|
|
16
|
+
|
|
17
|
+
logger = logging.getLogger(__name__)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class ScalingPolicy(Enum):
    """Selects how an ``AgentPool`` decides when to change its size.

    Only ``AUTO_QUEUE`` and ``AUTO_CPU`` have a dedicated branch in the pool's
    auto-scaling loop in this module; ``MANUAL`` prevents the loop from being
    started at all.
    """

    # No background scaling loop; size changes only via scale_up()/scale_down().
    MANUAL = "manual"

    # Decisions driven by CPU utilisation (needs psutil at runtime).
    AUTO_CPU = "auto_cpu"

    # Decisions driven by queue backlog relative to the number of busy agents.
    AUTO_QUEUE = "auto_queue"

    # NOTE(review): no branch for this policy is visible in this module.
    AUTO_LATENCY = "auto_latency"

    # NOTE(review): no branch for this policy is visible in this module.
    CUSTOM = "custom"
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
@dataclass
class WorkItem:
    """A unit of work for an agent, plus the timestamps used for metrics.

    ``created_at`` is stamped when the item is constructed. ``assigned_at``
    and ``completed_at`` start as ``None`` and are filled in by whoever hands
    the item out and finishes it.
    """

    # Caller-chosen identifier for the item.
    id: str
    # Opaque payload passed to the agent.
    data: Any
    # Values > 0 route the item through the queue's priority lane.
    priority: int = 0
    # Wall-clock time (time.time()) at construction.
    created_at: float = field(default_factory=time.time)
    # Wall-clock time the item was handed to a worker, if it has been.
    assigned_at: Optional[float] = None
    # Wall-clock time processing finished, if it has.
    completed_at: Optional[float] = None
    # Name of the agent that picked the item up, if any.
    agent_name: Optional[str] = None
    # Number of failed attempts so far.
    retries: int = 0
    # Attempt budget before the item is reported as failed.
    max_retries: int = 3

    @property
    def processing_time(self) -> Optional[float]:
        """Return seconds between assignment and completion.

        Returns:
            ``completed_at - assigned_at``, or ``None`` while either timestamp
            is still unset.
        """
        # Compare against None explicitly: 0.0 is a valid timestamp and must
        # not be mistaken for "not set".
        if self.assigned_at is None or self.completed_at is None:
            return None
        return self.completed_at - self.assigned_at

    @property
    def wait_time(self) -> Optional[float]:
        """Return seconds the item has spent (or spent) waiting for a worker.

        Returns:
            ``assigned_at - created_at`` once assigned; otherwise the time
            elapsed since creation, measured against the current clock.
        """
        if self.assigned_at is not None:
            return self.assigned_at - self.created_at
        return time.time() - self.created_at
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
@dataclass
class CompletedWork:
    """Record tying a finished work item to the agent run that handled it."""

    # The item that was processed.
    work_item: WorkItem
    # The agent instance that ran the item.
    agent: Agent
    # The result object produced for this run.
    result: AgentResult
    # Whether the run is considered successful.
    success: bool
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
class WorkQueue:
    """In-memory work queue with a priority lane and a FIFO lane.

    Items whose ``priority`` is greater than zero go into a heap and are
    served highest priority first; items with equal priority are served in
    insertion order. All other items go into a plain FIFO queue, which is
    only consulted when the priority lane is empty.

    Args:
        max_size: Maximum number of queued items across both lanes. ``None``
            (or ``0``) means unbounded.
    """

    def __init__(self, max_size: Optional[int] = None):
        self._queue: deque["WorkItem"] = deque()
        self._max_size = max_size
        # Heap entries are (-priority, sequence, item). The sequence number
        # breaks ties so the heap never has to compare two work items, which
        # define no ordering and would raise TypeError.
        self._priority_queue: list[tuple[int, int, "WorkItem"]] = []
        self._use_priority = False
        self._sequence = 0

    def add_work(self, work_item: "WorkItem") -> bool:
        """Queue ``work_item``.

        Returns:
            ``True`` if the item was queued, ``False`` if the queue is full.
        """
        # Count both lanes: the limit applies to the queue as a whole.
        if self._max_size and self.size() >= self._max_size:
            return False

        if work_item.priority > 0:
            import heapq

            self._use_priority = True
            heapq.heappush(
                self._priority_queue,
                (-work_item.priority, self._sequence, work_item),
            )
            self._sequence += 1
        else:
            self._queue.append(work_item)

        return True

    def get_work(self) -> Optional["WorkItem"]:
        """Pop the next item and stamp its ``assigned_at`` time.

        Returns:
            The next item (priority lane first), or ``None`` if both lanes
            are empty.
        """
        if self._priority_queue:
            import heapq

            # The work item is always the last element of the heap entry.
            work_item = heapq.heappop(self._priority_queue)[-1]
        elif self._queue:
            work_item = self._queue.popleft()
        else:
            return None

        work_item.assigned_at = time.time()
        return work_item

    def size(self) -> int:
        """Return the number of queued items across both lanes."""
        return len(self._queue) + len(self._priority_queue)

    def is_empty(self) -> bool:
        """Return ``True`` when neither lane holds an item."""
        return not self._queue and not self._priority_queue

    def clear(self) -> None:
        """Drop every queued item from both lanes."""
        self._queue.clear()
        self._priority_queue.clear()
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
class AgentPool:
    """Pool of agents that can grow and shrink between ``min_size`` and ``max_size``.

    The pool tracks its agents and which of them are busy or idle. It does
    not pull work itself: ``process_queue`` returns a ``WorkProcessor`` that
    drives the agents, and that processor starts ``_auto_scaling_loop`` when
    the scaling policy is not ``MANUAL``.

    Args:
        agent_factory: Called with an integer index to build each agent. The
            pool always passes the lowest index not used by a current agent.
        min_size: Number of agents created up front; ``scale_down`` never
            goes below it.
        max_size: Upper bound enforced by ``scale_up``.
        scaling_policy: Which rule the auto-scaling loop applies.
        scale_up_threshold: ``AUTO_QUEUE`` adds an agent when the queue size
            exceeds ``busy agents * scale_up_threshold``.
        scale_down_threshold: ``AUTO_QUEUE`` removes an idle agent when the
            queue size is below ``busy agents * scale_down_threshold``.
        scale_check_interval: Seconds the scaling loop sleeps between checks.
    """

    def __init__(
        self,
        agent_factory: Callable[[int], Agent],
        min_size: int = 1,
        max_size: int = 10,
        scaling_policy: ScalingPolicy = ScalingPolicy.AUTO_QUEUE,
        scale_up_threshold: float = 2.0,
        scale_down_threshold: float = 0.5,
        scale_check_interval: float = 10.0,
    ):
        self.agent_factory = agent_factory
        self.min_size = min_size
        self.max_size = max_size
        self.scaling_policy = scaling_policy
        self.scale_up_threshold = scale_up_threshold
        self.scale_down_threshold = scale_down_threshold
        self.scale_check_interval = scale_check_interval

        self._agents: list[Agent] = []
        self._active_agents: set[str] = set()
        self._idle_agents: set[str] = set()
        self._agent_tasks: dict[str, asyncio.Task] = {}
        self._scaling_task: Optional[asyncio.Task] = None
        self._metrics: dict[str, Any] = {
            "total_processed": 0,
            "total_errors": 0,
            "avg_processing_time": 0.0,
            "current_queue_size": 0,
            "scale_events": [],
        }

        # Factory index of every agent currently in the pool, keyed by
        # id(agent) so that duplicate agent names cannot confuse the mapping.
        self._agent_indices: dict[int, int] = {}

        # Hysteresis state for the AUTO_CPU policy.
        self._last_cpu_scale_time = 0.0
        self._cpu_scale_cooldown = 30.0  # seconds between CPU-driven changes

        # Create the minimum number of agents (indices 0 .. min_size - 1).
        for _ in range(min_size):
            self._add_agent()

    def _next_free_index(self) -> int:
        """Return the lowest factory index not used by a current agent."""
        in_use = set(self._agent_indices.values())
        index = 0
        while index in in_use:
            index += 1
        return index

    def _add_agent(self) -> Agent:
        """Build one agent via the factory and register it as idle."""
        index = self._next_free_index()
        agent = self.agent_factory(index)
        self._agents.append(agent)
        self._idle_agents.add(agent.name)
        self._agent_indices[id(agent)] = index
        return agent

    def _record_scale_event(self, event_type: str, count: int) -> None:
        """Append one scaling event to the metrics history."""
        self._metrics["scale_events"].append(
            {
                "type": event_type,
                "count": count,
                "timestamp": time.time(),
                "total_agents": len(self._agents),
            }
        )

    def _prune_idle_names(self) -> None:
        """Drop idle-set entries that no longer match an agent in the pool."""
        self._idle_agents.intersection_update(
            {agent.name for agent in self._agents}
        )

    def auto_scale(self) -> "PoolContext":
        """Return an async context manager that cleans up pool tasks on exit."""
        return PoolContext(self)

    async def scale_up(self, count: int = 1) -> int:
        """Add up to ``count`` agents without exceeding ``max_size``.

        This only creates and registers the agents; it does not start a
        worker task for them.

        Returns:
            The number of agents actually added.
        """
        added = 0

        for _ in range(count):
            if len(self._agents) >= self.max_size:
                break

            # Using the lowest free index (rather than the current pool size)
            # keeps indices unique after an earlier scale-down while still
            # staying below max_size.
            agent = self._add_agent()
            added += 1

            logger.info(f"Scaled up: added agent {agent.name}")

        if added > 0:
            self._record_scale_event("scale_up", added)

        return added

    async def scale_down(self, count: int = 1) -> int:
        """Remove up to ``count`` idle agents without going below ``min_size``.

        Busy agents are never removed. A worker task registered for a removed
        agent is cancelled so it stops pulling work.

        Returns:
            The number of agents actually removed.
        """
        self._prune_idle_names()

        # Don't go below minimum
        removable = max(0, len(self._agents) - self.min_size)
        quota = min(count, removable)
        if quota <= 0:
            return 0

        # Pick victims from the live agent list (newest first) so a stale
        # name in the idle set can never use up the removal quota.
        victims = [
            agent for agent in reversed(self._agents) if agent.name in self._idle_agents
        ][:quota]

        removed = 0
        for agent in victims:
            self._agents.remove(agent)
            self._idle_agents.discard(agent.name)
            self._agent_indices.pop(id(agent), None)

            task = self._agent_tasks.pop(agent.name, None)
            if task is not None and not task.done():
                task.cancel()

            removed += 1
            logger.info(f"Scaled down: removed agent {agent.name}")

        if removed > 0:
            self._record_scale_event("scale_down", removed)

        return removed

    async def _auto_scaling_loop(self, work_queue: WorkQueue) -> None:
        """Periodically apply the configured scaling policy until cancelled.

        Errors raised during a check are logged and the loop keeps running.
        """
        while True:
            try:
                await asyncio.sleep(self.scale_check_interval)

                self._prune_idle_names()
                queue_size = work_queue.size()
                active_count = len(self._active_agents)
                idle_count = len(self._idle_agents)

                self._metrics["current_queue_size"] = queue_size

                if self.scaling_policy == ScalingPolicy.AUTO_QUEUE:
                    await self._scale_by_queue(queue_size, active_count, idle_count)
                elif self.scaling_policy == ScalingPolicy.AUTO_CPU:
                    await self._scale_by_cpu(queue_size, active_count, idle_count)

            except asyncio.CancelledError:
                break
            except Exception as e:
                logger.error(f"Error in auto-scaling loop: {e}")

    async def _scale_by_queue(
        self, queue_size: int, active_count: int, idle_count: int
    ) -> None:
        """Apply the AUTO_QUEUE rule: compare backlog against busy agents."""
        # NOTE(review): with no busy agents and an empty queue neither branch
        # fires, so a fully idle pool is never shrunk - confirm this is intended.
        if queue_size > active_count * self.scale_up_threshold:
            await self.scale_up()
        elif queue_size < active_count * self.scale_down_threshold and idle_count > 0:
            await self.scale_down()

    async def _scale_by_cpu(
        self, queue_size: int, active_count: int, idle_count: int
    ) -> None:
        """Apply the AUTO_CPU rule: CPU usage with adaptive thresholds and a cooldown."""
        try:
            import psutil
        except ImportError:
            logger.warning("psutil not available for CPU-based scaling")
            return

        # cpu_percent(interval=1.0) sleeps for the whole interval; run it in a
        # worker thread so the event loop keeps serving the agents meanwhile.
        cpu_percent = await asyncio.to_thread(psutil.cpu_percent, interval=1.0)

        # Load average per CPU, falling back to the CPU percentage where the
        # platform does not provide a load average.
        try:
            load_avg = psutil.getloadavg()[0]  # 1-minute load average
            cpu_count = psutil.cpu_count()
            load_per_cpu = (
                load_avg / cpu_count
                if cpu_count is not None and cpu_count > 0
                else load_avg
            )
        except (AttributeError, OSError):
            load_per_cpu = cpu_percent / 100.0

        base_scale_up_cpu = 75.0
        base_scale_down_cpu = 25.0

        # Backlog per busy agent shifts the CPU thresholds: a long backlog
        # makes scale-up easier, a short one makes scale-down easier.
        queue_pressure = queue_size / max(active_count, 1)
        scale_up_cpu = (
            max(base_scale_up_cpu - 15, 60.0)
            if queue_pressure > 2.0
            else base_scale_up_cpu
        )
        scale_down_cpu = (
            min(base_scale_down_cpu + 15, 40.0)
            if queue_pressure < 0.5
            else base_scale_down_cpu
        )

        # Hysteresis: at most one CPU-driven change per cooldown window.
        current_time = time.time()
        time_since_last_scale = current_time - self._last_cpu_scale_time
        cooled_down = time_since_last_scale > self._cpu_scale_cooldown

        # NOTE(review): both size checks below use the number of *busy*
        # agents, not the pool size - confirm that is the intended bound.
        should_scale_up = (
            cpu_percent > scale_up_cpu
            and (queue_size > 0 or load_per_cpu > 0.8)
            and active_count < self.max_size
            and cooled_down
        )
        should_scale_down = (
            cpu_percent < scale_down_cpu
            and load_per_cpu < 0.3
            and idle_count > 0
            and active_count > self.min_size
            and cooled_down
        )

        if should_scale_up:
            logger.info(
                f"CPU-based scale up: CPU={cpu_percent:.1f}%, "
                f"load_per_cpu={load_per_cpu:.2f}, queue_size={queue_size}"
            )
            await self.scale_up()
            self._last_cpu_scale_time = current_time
        elif should_scale_down:
            logger.info(
                f"CPU-based scale down: CPU={cpu_percent:.1f}%, "
                f"load_per_cpu={load_per_cpu:.2f}, idle_count={idle_count}"
            )
            await self.scale_down()
            self._last_cpu_scale_time = current_time

        # Expose the latest measurements for monitoring.
        self._metrics.update(
            {
                "cpu_percent": cpu_percent,
                "load_per_cpu": load_per_cpu,
                "scale_up_cpu_threshold": scale_up_cpu,
                "scale_down_cpu_threshold": scale_down_cpu,
                "queue_pressure": queue_pressure,
            }
        )

    def process_queue(self, work_queue: WorkQueue) -> "WorkProcessor":
        """Return a ``WorkProcessor`` that drains ``work_queue`` with this pool."""
        return WorkProcessor(self, work_queue)

    def get_metrics(self) -> dict[str, Any]:
        """Return the collected metrics plus a snapshot of the pool's size and policy."""
        return {
            **self._metrics,
            "total_agents": len(self._agents),
            "active_agents": len(self._active_agents),
            "idle_agents": len(self._idle_agents),
            "min_size": self.min_size,
            "max_size": self.max_size,
            "scaling_policy": self.scaling_policy.value,
        }

    @property
    def active_agents(self) -> int:
        """Number of agents currently marked as busy."""
        return len(self._active_agents)
|
|
380
|
+
|
|
381
|
+
|
|
382
|
+
class PoolContext:
    """Async context manager that tears down a pool's background tasks on exit.

    Entering yields the pool unchanged; nothing is started here. The scaling
    task and worker tasks are created by the component that processes the
    queue and are only cleaned up by this context.
    """

    def __init__(self, pool: "AgentPool"):
        self.pool = pool

    async def __aenter__(self) -> "AgentPool":
        """Return the wrapped pool."""
        return self.pool

    async def __aexit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
        """Cancel the scaling task and all worker tasks, and wait for them to finish.

        Exceptions raised inside the ``async with`` body are not suppressed.
        """
        scaling_task = self.pool._scaling_task
        if scaling_task:
            scaling_task.cancel()
            with contextlib.suppress(asyncio.CancelledError):
                await scaling_task
            self.pool._scaling_task = None

        # Cancel every worker that is still running, then wait for the
        # cancellations to be processed so no task is left pending once the
        # context has exited.
        pending = [task for task in self.pool._agent_tasks.values() if not task.done()]
        for task in pending:
            task.cancel()
        if pending:
            await asyncio.gather(*pending, return_exceptions=True)
        self.pool._agent_tasks.clear()
|
|
406
|
+
|
|
407
|
+
|
|
408
|
+
class WorkProcessor:
    """Drains a ``WorkQueue`` using the agents of an ``AgentPool``.

    Iterating the processor with ``async for`` starts one worker task per
    agent (and the pool's auto-scaling loop unless the policy is ``MANUAL``)
    and yields a ``CompletedWork`` for every item that finishes or runs out
    of retries. Iteration ends once the queue is empty, no agent is busy and
    no result arrived for one second.
    """

    def __init__(self, pool: AgentPool, work_queue: WorkQueue):
        self.pool = pool
        self.work_queue = work_queue
        self._completed_queue: asyncio.Queue = asyncio.Queue()
        self._running = False
        self._worker_tasks: list[asyncio.Task] = []
        # Worker task started by this processor for each agent name.
        self._tasks_by_agent: dict[str, asyncio.Task] = {}
        # Scaling task started by this processor, if any.
        self._scaling_task: Optional[asyncio.Task] = None

    async def __aiter__(self) -> "AsyncGenerator[CompletedWork, None]":
        """Yield completed work items, starting processing on first use."""
        if not self._running:
            await self._start_processing()

        while self._running or not self._completed_queue.empty():
            # Agents added by a scale-up since the last pass need a worker,
            # otherwise growing the pool has no effect on throughput.
            self._ensure_workers()
            try:
                completed_work = await asyncio.wait_for(
                    self._completed_queue.get(), timeout=1.0
                )
            except asyncio.TimeoutError:
                if self.work_queue.is_empty() and len(self.pool._active_agents) == 0:
                    break
                continue
            yield completed_work

    async def _start_processing(self) -> None:
        """Mark the processor as running and start scaling and worker tasks."""
        self._running = True

        # Start auto-scaling if enabled
        if self.pool.scaling_policy != ScalingPolicy.MANUAL:
            self._scaling_task = asyncio.create_task(
                self.pool._auto_scaling_loop(self.work_queue)
            )
            self.pool._scaling_task = self._scaling_task

        self._ensure_workers()

    def _ensure_workers(self) -> None:
        """Start a worker for every pool agent that has no live worker here."""
        if not self._running:
            return
        for agent in list(self.pool._agents):
            task = self._tasks_by_agent.get(agent.name)
            if task is not None and not task.done():
                continue
            task = asyncio.create_task(self._worker_loop(agent))
            self._worker_tasks.append(task)
            self._tasks_by_agent[agent.name] = task
            self.pool._agent_tasks[agent.name] = task

    def _agent_in_pool(self, agent: Agent) -> bool:
        """Return whether this exact agent object is still a pool member."""
        return any(member is agent for member in self.pool._agents)

    async def _worker_loop(self, agent: Agent) -> None:
        """Pull and process items with ``agent`` until stopped or removed."""
        # Leaving the loop when the agent has been removed from the pool
        # keeps a scaled-down agent from pulling further work.
        while self._running and self._agent_in_pool(agent):
            try:
                work_item = self.work_queue.get_work()
                if not work_item:
                    # No work available, mark as idle
                    self.pool._active_agents.discard(agent.name)
                    self.pool._idle_agents.add(agent.name)
                    await asyncio.sleep(0.1)
                    continue

                # Mark as active
                self.pool._active_agents.add(agent.name)
                self.pool._idle_agents.discard(agent.name)
                work_item.agent_name = agent.name

                await self._process_item(agent, work_item)

            except asyncio.CancelledError:
                break
            except Exception as e:
                logger.error(f"Error in worker loop for {agent.name}: {e}")
                await asyncio.sleep(1)

        # On shutdown the agent goes back to idle - unless it has left the
        # pool, in which case its name must not reappear in the idle set.
        self.pool._active_agents.discard(agent.name)
        if self._agent_in_pool(agent):
            self.pool._idle_agents.add(agent.name)

    async def _process_item(self, agent: Agent, work_item: WorkItem) -> None:
        """Run ``agent`` on one item and publish the outcome or schedule a retry."""
        try:
            # Hand the payload to the agent through its variables.
            agent.set_variable("work_item", work_item.data)
            agent.set_variable("work_id", work_item.id)

            result = await agent.run()
            work_item.completed_at = time.time()

            completed_work = CompletedWork(
                work_item=work_item,
                agent=agent,
                result=result,
                success=result.is_success,
            )

            self._record_processed(work_item)
            await self._completed_queue.put(completed_work)

        except Exception as e:
            logger.error(f"Error processing work item {work_item.id}: {e}")

            work_item.retries += 1
            if work_item.retries < work_item.max_retries:
                if self.work_queue.add_work(work_item):
                    return
                # The queue refused the retry (it is full). Report the item
                # as failed instead of dropping it without a trace.
                logger.error(
                    f"Could not re-queue work item {work_item.id}; reporting failure"
                )

            # Retries exhausted, or the item could not be re-queued.
            self.pool._metrics["total_errors"] += 1

            from ..agent import AgentStatus

            error_result = AgentResult(
                agent_name=agent.name, status=AgentStatus.FAILED, error=e
            )
            await self._completed_queue.put(
                CompletedWork(
                    work_item=work_item,
                    agent=agent,
                    result=error_result,
                    success=False,
                )
            )

    def _record_processed(self, work_item: WorkItem) -> None:
        """Update the pool's processed counter and running average duration."""
        metrics = self.pool._metrics
        metrics["total_processed"] += 1
        if work_item.processing_time:
            total_processed = metrics["total_processed"]
            previous_avg = metrics["avg_processing_time"]
            metrics["avg_processing_time"] = (
                (previous_avg * (total_processed - 1)) + work_item.processing_time
            ) / total_processed

    async def stop(self) -> None:
        """Stop processing: cancel this processor's tasks and wait for them."""
        self._running = False

        tasks = list(self._worker_tasks)
        scaling_task = self._scaling_task
        if scaling_task is not None:
            tasks.append(scaling_task)
            # Only clear the pool's reference if it still points at our task.
            if self.pool._scaling_task is scaling_task:
                self.pool._scaling_task = None
            self._scaling_task = None

        for task in tasks:
            task.cancel()

        # Wait for tasks to complete
        await asyncio.gather(*tasks, return_exceptions=True)

        self._worker_tasks.clear()
        self._tasks_by_agent.clear()
|
|
547
|
+
|
|
548
|
+
|
|
549
|
+
class DynamicProcessingPool:
    """Convenience wrapper bundling an ``AgentPool`` with its own ``WorkQueue``.

    Results of every ``process_workload`` call accumulate in ``self.results``
    and feed ``get_statistics``.
    """

    def __init__(
        self,
        agent_factory: Callable[[int], Agent],
        min_agents: int = 2,
        max_agents: int = 10,
    ):
        self.work_queue = WorkQueue()
        self.results: list[CompletedWork] = []
        self.pool = AgentPool(
            agent_factory=agent_factory,
            min_size=min_agents,
            max_size=max_agents,
        )

    async def process_workload(
        self, work_items: list[WorkItem]
    ) -> list[dict[str, Any]]:
        """Queue ``work_items``, run them through the pool and summarise each outcome.

        Returns:
            One dict per finished item with the keys ``work_item_id``,
            ``agent_name``, ``success``, ``processing_time``, ``wait_time``,
            ``result`` and ``error``.
        """
        for pending in work_items:
            self.work_queue.add_work(pending)

        summaries = []

        # The context tears down the pool's background tasks on the way out.
        async with self.pool.auto_scale() as pool:
            async for done in pool.process_queue(self.work_queue):
                item = done.work_item
                outcome = done.result

                summary = {
                    "work_item_id": item.id,
                    "agent_name": done.agent.name,
                    "success": done.success,
                    "processing_time": item.processing_time,
                    "wait_time": item.wait_time,
                    "result": None,
                    "error": None,
                }
                # Outputs are only reported for successful runs.
                if done.success:
                    summary["result"] = outcome.outputs
                if outcome.error:
                    summary["error"] = str(outcome.error)

                summaries.append(summary)
                self.results.append(done)

        return summaries

    def get_statistics(self) -> dict[str, Any]:
        """Aggregate everything processed so far.

        Returns:
            An empty dict when nothing has been processed yet; otherwise
            counts, success rate (percent), average processing and wait
            times, and the pool's own metrics.
        """
        total = len(self.results)
        if total == 0:
            return {}

        successful = [entry for entry in self.results if entry.success]

        # Falsy durations (None or 0) are left out of the averages.
        processing_times = []
        for entry in successful:
            elapsed = entry.work_item.processing_time
            if elapsed:
                processing_times.append(elapsed)

        wait_times = []
        for entry in self.results:
            waited = entry.work_item.wait_time
            if waited:
                wait_times.append(waited)

        avg_processing = 0
        if processing_times:
            avg_processing = sum(processing_times) / len(processing_times)

        avg_wait = 0
        if wait_times:
            avg_wait = sum(wait_times) / len(wait_times)

        return {
            "total_processed": total,
            "successful": len(successful),
            "failed": total - len(successful),
            "success_rate": len(successful) / total * 100,
            "avg_processing_time": avg_processing,
            "avg_wait_time": avg_wait,
            "pool_metrics": self.pool.get_metrics(),
        }
|