puffinflow 2.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- puffinflow/__init__.py +132 -0
- puffinflow/core/__init__.py +110 -0
- puffinflow/core/agent/__init__.py +320 -0
- puffinflow/core/agent/base.py +1635 -0
- puffinflow/core/agent/checkpoint.py +50 -0
- puffinflow/core/agent/context.py +521 -0
- puffinflow/core/agent/decorators/__init__.py +90 -0
- puffinflow/core/agent/decorators/builder.py +454 -0
- puffinflow/core/agent/decorators/flexible.py +714 -0
- puffinflow/core/agent/decorators/inspection.py +144 -0
- puffinflow/core/agent/dependencies.py +57 -0
- puffinflow/core/agent/scheduling/__init__.py +21 -0
- puffinflow/core/agent/scheduling/builder.py +160 -0
- puffinflow/core/agent/scheduling/exceptions.py +35 -0
- puffinflow/core/agent/scheduling/inputs.py +137 -0
- puffinflow/core/agent/scheduling/parser.py +209 -0
- puffinflow/core/agent/scheduling/scheduler.py +413 -0
- puffinflow/core/agent/state.py +141 -0
- puffinflow/core/config.py +62 -0
- puffinflow/core/coordination/__init__.py +137 -0
- puffinflow/core/coordination/agent_group.py +359 -0
- puffinflow/core/coordination/agent_pool.py +629 -0
- puffinflow/core/coordination/agent_team.py +577 -0
- puffinflow/core/coordination/coordinator.py +720 -0
- puffinflow/core/coordination/deadlock.py +1759 -0
- puffinflow/core/coordination/fluent_api.py +421 -0
- puffinflow/core/coordination/primitives.py +478 -0
- puffinflow/core/coordination/rate_limiter.py +520 -0
- puffinflow/core/observability/__init__.py +47 -0
- puffinflow/core/observability/agent.py +139 -0
- puffinflow/core/observability/alerting.py +73 -0
- puffinflow/core/observability/config.py +127 -0
- puffinflow/core/observability/context.py +88 -0
- puffinflow/core/observability/core.py +147 -0
- puffinflow/core/observability/decorators.py +105 -0
- puffinflow/core/observability/events.py +71 -0
- puffinflow/core/observability/interfaces.py +196 -0
- puffinflow/core/observability/metrics.py +137 -0
- puffinflow/core/observability/tracing.py +209 -0
- puffinflow/core/reliability/__init__.py +27 -0
- puffinflow/core/reliability/bulkhead.py +96 -0
- puffinflow/core/reliability/circuit_breaker.py +149 -0
- puffinflow/core/reliability/leak_detector.py +122 -0
- puffinflow/core/resources/__init__.py +77 -0
- puffinflow/core/resources/allocation.py +790 -0
- puffinflow/core/resources/pool.py +645 -0
- puffinflow/core/resources/quotas.py +567 -0
- puffinflow/core/resources/requirements.py +217 -0
- puffinflow/version.py +21 -0
- puffinflow-2.dev0.dist-info/METADATA +334 -0
- puffinflow-2.dev0.dist-info/RECORD +55 -0
- puffinflow-2.dev0.dist-info/WHEEL +5 -0
- puffinflow-2.dev0.dist-info/entry_points.txt +3 -0
- puffinflow-2.dev0.dist-info/licenses/LICENSE +21 -0
- puffinflow-2.dev0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,645 @@
|
|
|
1
|
+
"""Resource pool implementation with advanced features.
|
|
2
|
+
|
|
3
|
+
Provides a comprehensive resource management system with leak detection,
|
|
4
|
+
quota enforcement, preemption capabilities, and detailed usage tracking.
|
|
5
|
+
Supports CPU, memory, I/O, network, and GPU resources.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import asyncio
|
|
9
|
+
import logging
|
|
10
|
+
import time
|
|
11
|
+
from collections import defaultdict
|
|
12
|
+
from dataclasses import dataclass
|
|
13
|
+
from typing import Any, Optional
|
|
14
|
+
|
|
15
|
+
# Import from the canonical source to ensure consistent enum instances
|
|
16
|
+
from .requirements import (
|
|
17
|
+
ResourceRequirements,
|
|
18
|
+
ResourceType,
|
|
19
|
+
get_resource_amount,
|
|
20
|
+
safe_check_resource_type,
|
|
21
|
+
)
|
|
22
|
+
|
|
23
|
+
# Import leak detector with fallback
|
|
24
|
+
try:
    from ..reliability.leak_detector import leak_detector
except ImportError:

    class MockLeakDetector:
        """No-op stand-in used when the real leak detector cannot be imported.

        It exposes the four methods this module calls on ``leak_detector`` and
        records nothing.
        """

        def track_allocation(
            self, state_name: Any, agent_name: Any, resources: Any
        ) -> None:
            """Accept an allocation record and discard it."""
            return None

        def track_release(self, state_name: Any, agent_name: Any) -> None:
            """Accept a release record and discard it."""
            return None

        def detect_leaks(self) -> list[Any]:
            """Return an empty list; the mock never tracks anything."""
            return list()

        def get_metrics(self) -> dict[str, Any]:
            """Return a marker dict showing that the mock is in use."""
            return dict(leak_detection="mock")

    # Module-level singleton with the same name as the real import.
    leak_detector = MockLeakDetector()  # type: ignore
|
|
44
|
+
|
|
45
|
+
logger = logging.getLogger(__name__)
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
class ResourceAllocationError(Exception):
    """Root of the exception hierarchy for resource allocation failures.

    Catch this type to handle every allocation error defined in this module.
    """
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
class ResourceOverflowError(ResourceAllocationError):
    """Signals a request that asks for more than the system limits allow."""
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
class ResourceQuotaExceededError(ResourceAllocationError):
    """Signals that a state/agent would go over its assigned resource quota."""
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
@dataclass
class ResourceUsageStats:
    """Mutable counters describing how a single resource type has been used.

    Every field starts at an "unused" value, so a fresh instance represents a
    resource type that has never been allocated.
    """

    # Largest value ever stored in ``current_usage``.
    peak_usage: float = 0.0
    # Amount in use as last recorded by the pool.
    current_usage: float = 0.0
    # Number of successful allocations that included this resource type.
    total_allocations: int = 0
    # Number of failed allocation attempts that requested this resource type.
    failed_allocations: int = 0
    # ``time.time()`` stamp of the latest successful allocation, if any.
    last_allocation_time: Optional[float] = None
    # Seconds spent waiting, summed over all successful allocations.
    total_wait_time: float = 0.0
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
class ResourcePool:
|
|
79
|
+
"""Advanced resource management system with comprehensive features."""
|
|
80
|
+
|
|
81
|
+
def __init__(
    self,
    total_cpu: float = 4.0,
    total_memory: float = 1024.0,
    total_io: float = 100.0,
    total_network: float = 100.0,
    total_gpu: float = 0.0,
    enable_quotas: bool = False,
    enable_preemption: bool = False,
    enable_leak_detection: bool = True,
):
    """Set up capacities, bookkeeping containers and feature switches.

    Args:
        total_cpu: Capacity stored under ``ResourceType.CPU``.
        total_memory: Capacity stored under ``ResourceType.MEMORY``.
        total_io: Capacity stored under ``ResourceType.IO``.
        total_network: Capacity stored under ``ResourceType.NETWORK``.
        total_gpu: Capacity stored under ``ResourceType.GPU``.
        enable_quotas: Stored as ``self.enable_quotas``.
        enable_preemption: Stored as ``self.enable_preemption``.
        enable_leak_detection: Stored as ``self.enable_leak_detection``.
    """
    # Feature switches.
    self.enable_quotas = enable_quotas
    self.enable_preemption = enable_preemption
    self.enable_leak_detection = enable_leak_detection

    # Total capacity per resource type; ``available`` starts as a copy and
    # is the dict that allocation and release adjust.
    capacities = (
        (ResourceType.CPU, total_cpu),
        (ResourceType.MEMORY, total_memory),
        (ResourceType.IO, total_io),
        (ResourceType.NETWORK, total_network),
        (ResourceType.GPU, total_gpu),
    )
    self.resources = dict(capacities)
    self.available = dict(self.resources)

    # The condition wraps the same lock, so holding either one guards the
    # pool's state.
    self._lock = asyncio.Lock()
    self._condition = asyncio.Condition(self._lock)

    # Per-state bookkeeping: amounts held, when they were granted, quota
    # limits, and the agent name reported to the leak detector.
    self._allocations: dict[str, dict[ResourceType, float]] = {}
    self._allocation_times: dict[str, float] = {}
    self._quotas: dict[str, dict[ResourceType, float]] = {}
    self._agent_names: dict[str, str] = {}

    # States currently blocked in acquire(), and states that were preempted.
    self._waiting_states: set[str] = set()
    self._preempted_states: set[str] = set()

    # One statistics record per concrete resource type (the NONE and ALL
    # members are skipped).
    self._usage_stats = {
        rt: ResourceUsageStats()
        for rt in ResourceType
        if rt not in (ResourceType.NONE, ResourceType.ALL)
    }

    # Time-stamped history; entries older than ``_history_retention``
    # seconds are pruned when statistics are updated.
    self._history_retention = 3600
    self._usage_history: list[tuple] = []
    self._allocation_history: dict[ResourceType, list[tuple]] = defaultdict(list)
|
|
138
|
+
|
|
139
|
+
async def set_quota(
    self, state_name: str, resource_type: ResourceType, limit: float
) -> None:
    """Record ``limit`` as the quota of ``resource_type`` for ``state_name``.

    An existing limit for the same state and resource type is replaced.

    Raises:
        RuntimeError: If the pool was created with quotas disabled.
    """
    if not self.enable_quotas:
        raise RuntimeError("Quotas are not enabled for this resource pool")

    async with self._lock:
        # setdefault creates the per-state mapping on first use.
        self._quotas.setdefault(state_name, {})[resource_type] = limit
|
|
150
|
+
|
|
151
|
+
def _check_quota(self, state_name: str, requirements: ResourceRequirements) -> bool:
    """Tell whether granting ``requirements`` keeps ``state_name`` in quota.

    Returns:
        ``True`` when quotas are disabled, or when no requested resource
        type with a configured quota would exceed it once the requested
        amount is added to what the state already holds; ``False`` (after
        logging a warning) for the first resource type that would exceed it.
    """
    if not self.enable_quotas:
        return True

    held_by_state = self._allocations.get(state_name, {})
    limits = self._quotas.get(state_name, {})
    quota_checked_types = (
        ResourceType.CPU,
        ResourceType.MEMORY,
        ResourceType.IO,
        ResourceType.NETWORK,
        ResourceType.GPU,
    )

    for resource_type in quota_checked_types:
        # Resource types the request does not ask for are never checked.
        if not safe_check_resource_type(requirements, resource_type):
            continue

        quota = limits.get(resource_type)
        if quota is None:
            # No limit configured for this type.
            continue

        required = get_resource_amount(requirements, resource_type)
        current = held_by_state.get(resource_type, 0.0)

        if current + required > quota:
            logger.warning(
                f"Quota exceeded for {state_name}: {resource_type.name} "
                f"(current: {current}, required: {required}, quota: {quota})"
            )
            return False

    return True
|
|
182
|
+
|
|
183
|
+
async def acquire(
    self,
    state_name: str,
    requirements: ResourceRequirements,
    timeout: Optional[float] = None,
    allow_preemption: bool = False,
    agent_name: Optional[str] = None,
) -> bool:
    """Acquire resources for a state, waiting until they are available.

    Args:
        state_name: Key under which the allocation is recorded.
        requirements: Resources being requested.
        timeout: Maximum number of seconds to wait. ``None`` waits
            indefinitely; ``0`` makes a single non-blocking attempt.
        allow_preemption: When true, and the pool has preemption enabled,
            other states may be preempted to make room.
        agent_name: Name reported to the leak detector for this state; only
            stored when leak detection is enabled.

    Returns:
        ``True`` once the resources are allocated, ``False`` if the timeout
        elapsed first.

    Raises:
        ResourceOverflowError: If the request exceeds the pool's total
            capacity.
        ResourceQuotaExceededError: If the request would exceed the state's
            quota.
        ValueError: If a requested amount is negative.
    """
    start_time = time.time()

    # Remember the agent so later leak-detector calls can name it.
    if agent_name and self.enable_leak_detection:
        self._agent_names[state_name] = agent_name

    try:
        requirements = self._validate_and_fix_requirements(requirements)

        async with self._condition:
            # Fail fast on requests that can never be satisfied.
            self._validate_requirements_against_total(requirements)

            if not self._check_quota(state_name, requirements):
                raise ResourceQuotaExceededError(f"Quota exceeded for {state_name}")

            try:
                while not self._can_allocate(requirements):
                    self._waiting_states.add(state_name)

                    # Preemption frees the resources synchronously, so the
                    # allocation below can proceed straight away.
                    if (
                        allow_preemption
                        and self.enable_preemption
                        and self._try_preemption(state_name, requirements)
                    ):
                        break

                    if timeout is None:
                        await self._condition.wait()
                        continue

                    # Compare against None rather than truthiness so that
                    # timeout=0 means "do not wait", not "wait forever".
                    remaining_time = timeout - (time.time() - start_time)
                    if remaining_time <= 0:
                        self._update_stats_failure(requirements)
                        return False

                    try:
                        await asyncio.wait_for(
                            self._condition.wait(), timeout=remaining_time
                        )
                    except asyncio.TimeoutError:
                        self._update_stats_failure(requirements)
                        return False
            finally:
                # Runs on success, timeout, error and cancellation alike, so
                # a cancelled waiter does not stay listed as waiting.
                self._waiting_states.discard(state_name)

            self._allocate(state_name, requirements)

            if self.enable_leak_detection:
                agent = self._agent_names.get(state_name, "unknown")
                resource_dict = self._build_resource_dict(requirements)
                leak_detector.track_allocation(state_name, agent, resource_dict)

            self._update_stats(state_name, requirements, start_time)

            return True

    except Exception as e:
        self._update_stats_failure(requirements)
        logger.error(f"Error acquiring resources for {state_name}: {e}")
        raise
|
|
262
|
+
|
|
263
|
+
def _validate_and_fix_requirements(
    self, requirements: ResourceRequirements
) -> ResourceRequirements:
    """Check ``requirements`` and substitute a fallback if they are unusable.

    Returns:
        ``requirements`` itself when validation succeeds; otherwise a new
        ``ResourceRequirements`` built from whatever attributes could be
        read, with ``resource_types`` set to ``ResourceType.ALL``.

    Raises:
        ValueError: If any amount is negative (or any other ``ValueError``
            surfaces while validating).
    """
    amount_attrs = (
        "cpu_units",
        "memory_mb",
        "io_weight",
        "network_weight",
        "gpu_units",
    )
    try:
        # Missing attributes count as 0.0 for the sign check.
        resource_values = {
            attr_name: getattr(requirements, attr_name, 0.0)
            for attr_name in amount_attrs
        }
        for attr_name, value in resource_values.items():
            if value < 0:
                raise ValueError(f"Negative resource requirement: {attr_name}={value}")

        # Evaluated only to confirm resource_types supports bitwise tests;
        # the result is discarded.
        requirements.resource_types & ResourceType.CPU
        logger.debug(f"Requirements validation passed: {requirements}")
    except Exception as e:
        if isinstance(e, ValueError):
            # ValueErrors are passed through to the caller unchanged.
            raise

        logger.error(f"Error validating requirements: {e}")
        # Any other failure: rebuild a request from the readable attributes.
        fallback = ResourceRequirements(
            cpu_units=getattr(requirements, "cpu_units", 1.0),
            memory_mb=getattr(requirements, "memory_mb", 100.0),
            io_weight=getattr(requirements, "io_weight", 1.0),
            network_weight=getattr(requirements, "network_weight", 1.0),
            gpu_units=getattr(requirements, "gpu_units", 0.0),
            resource_types=ResourceType.ALL,
        )
        logger.info(f"Using fallback requirements: {fallback}")
        return fallback

    return requirements
|
|
305
|
+
|
|
306
|
+
def _validate_requirements_against_total(
    self, requirements: ResourceRequirements
) -> None:
    """Reject a request that is larger than the pool's total capacity.

    Raises:
        ResourceOverflowError: For the first requested resource type whose
            amount is greater than the pool's total for that type.
    """
    capacity_checked_types = (
        ResourceType.CPU,
        ResourceType.MEMORY,
        ResourceType.IO,
        ResourceType.NETWORK,
        ResourceType.GPU,
    )
    for resource_type in capacity_checked_types:
        if not safe_check_resource_type(requirements, resource_type):
            continue

        required = get_resource_amount(requirements, resource_type)
        # Compared against total capacity, not what is currently free.
        total_available = self.resources.get(resource_type, 0.0)

        if required > total_available:
            raise ResourceOverflowError(
                f"Required {resource_type.name} ({required}) exceeds total available ({total_available})"
            )
|
|
325
|
+
|
|
326
|
+
def _build_resource_dict(
    self, requirements: ResourceRequirements
) -> dict[str, float]:
    """Summarise a request as ``{lower-case resource name: amount}``.

    Only requested resource types with a positive amount and a non-empty
    ``name`` are included. The result is what gets handed to the leak
    detector.
    """
    tracked: dict[str, float] = {}
    reported_types = (
        ResourceType.CPU,
        ResourceType.MEMORY,
        ResourceType.IO,
        ResourceType.NETWORK,
        ResourceType.GPU,
    )

    for resource_type in reported_types:
        if not safe_check_resource_type(requirements, resource_type):
            continue

        amount = get_resource_amount(requirements, resource_type)
        label = resource_type.name
        if amount > 0 and label:
            tracked[label.lower()] = amount

    return tracked
|
|
345
|
+
|
|
346
|
+
def _can_allocate(self, requirements: ResourceRequirements) -> bool:
    """Report whether ``requirements`` fit into what is free right now.

    Returns:
        ``True`` if every requested resource type has at least the requested
        amount available; ``False`` otherwise, including when the check
        itself raises.
    """
    try:
        logger.debug(f"Checking allocation for: {requirements}")

        candidate_types = (
            ResourceType.CPU,
            ResourceType.MEMORY,
            ResourceType.IO,
            ResourceType.NETWORK,
            ResourceType.GPU,
        )
        for resource_type in candidate_types:
            # Types the request does not mention place no constraint.
            if not safe_check_resource_type(requirements, resource_type):
                continue

            required = get_resource_amount(requirements, resource_type)
            available = self.available.get(resource_type, 0.0)

            logger.debug(
                f"Resource {resource_type.name}: required={required}, available={available}"
            )

            if required > available:
                logger.debug(f"Cannot allocate - insufficient {resource_type.name}")
                return False

    except Exception as e:
        logger.error(f"Error in _can_allocate: {e}")
        logger.error(f"Requirements: {requirements}")
        logger.error(f"Requirements type: {type(requirements)}")
        # A failed check is treated as "does not fit".
        return False

    return True
|
|
381
|
+
|
|
382
|
+
def _allocate(self, state_name: str, requirements: ResourceRequirements) -> None:
    """Deduct ``requirements`` from the pool and record them for a state.

    Amounts are added to anything the state already holds, so a state that
    acquires more than once gets all of it back on release. Every amount is
    resolved before any state is changed, so a failure while reading the
    request cannot leave a partial allocation behind.

    Args:
        state_name: State that receives the resources.
        requirements: Resources to allocate; only requested resource types
            with a positive amount are recorded.

    Raises:
        Exception: Whatever the underlying operations raise, after logging.
    """
    try:
        # Phase 1: work out what to grant without touching pool state.
        grants: dict[ResourceType, float] = {}
        for resource_type in (
            ResourceType.CPU,
            ResourceType.MEMORY,
            ResourceType.IO,
            ResourceType.NETWORK,
            ResourceType.GPU,
        ):
            if safe_check_resource_type(requirements, resource_type):
                amount = get_resource_amount(requirements, resource_type)
                if amount > 0:
                    grants[resource_type] = amount

        # Phase 2: apply the grants.
        held = self._allocations.setdefault(state_name, {})
        self._allocation_times[state_name] = time.time()

        for resource_type, amount in grants.items():
            # Accumulate rather than overwrite: overwriting would lose track
            # of an earlier grant that was already deducted from the pool.
            held[resource_type] = held.get(resource_type, 0.0) + amount
            self.available[resource_type] -= amount

            logger.debug(f"Allocated {amount} {resource_type.name} to {state_name}")

    except Exception as e:
        logger.error(f"Error in _allocate: {e}")
        raise
|
|
412
|
+
|
|
413
|
+
def _try_preemption(
    self, state_name: str, requirements: ResourceRequirements
) -> bool:
    """Try to free enough resources for ``requirements`` by preempting.

    Other states are considered largest-first, ranked by the plain sum of
    the amounts they hold (no notion of priority is involved). Only states
    that hold a resource type which is still short are preempted, and
    selection stops as soon as the request would fit. Nothing is preempted
    unless the request can be fully satisfied.

    Args:
        state_name: State asking for resources; it is never preempted.
        requirements: Resources that must fit after preemption.

    Returns:
        ``True`` if the request fits after the preemptions carried out here,
        ``False`` if preemption is disabled, there is nobody to preempt,
        preemption cannot free enough, or an error occurred.
    """
    if not self.enable_preemption:
        return False

    try:
        candidates = [
            (allocated_state, sum(resources.values()))
            for allocated_state, resources in self._allocations.items()
            if allocated_state != state_name
        ]
        if not candidates:
            return False

        # Largest holders first.
        candidates.sort(key=lambda x: x[1], reverse=True)

        # Amount needed per requested resource type.
        needed: dict[ResourceType, float] = {}
        for resource_type in (
            ResourceType.CPU,
            ResourceType.MEMORY,
            ResourceType.IO,
            ResourceType.NETWORK,
            ResourceType.GPU,
        ):
            if safe_check_resource_type(requirements, resource_type):
                needed[resource_type] = get_resource_amount(
                    requirements, resource_type
                )

        would_free = dict.fromkeys(needed, 0.0)

        def still_short() -> list:
            """List requested types that would not fit yet."""
            return [
                rt
                for rt, required in needed.items()
                if required > self.available[rt] + would_free[rt]
            ]

        # Simulate: pick the fewest (largest-first) states that close the gap.
        preempt_list = []
        short = still_short()
        for candidate_state, _ in candidates:
            if not short:
                break
            held = self._allocations[candidate_state]
            if not any(held.get(rt, 0.0) > 0 for rt in short):
                # Preempting this state would not help with what is missing.
                continue
            for rt in needed:
                would_free[rt] += held.get(rt, 0.0)
            preempt_list.append(candidate_state)
            short = still_short()

        if short:
            # Even the selected preemptions cannot free enough.
            return False

        for preempt_state in preempt_list:
            self._preempt_state(preempt_state)
        return True

    except Exception as e:
        logger.error(f"Error in preemption: {e}")
        return False
|
|
477
|
+
|
|
478
|
+
def _preempt_state(self, state_name: str) -> None:
    """Forcibly take back everything ``state_name`` currently holds.

    The state's resources are returned to the pool, the state is added to
    the preempted set, and its allocation record, allocation timestamp and
    agent-name entry are dropped. Usage statistics for the freed resource
    types are brought up to date. States without an allocation are ignored.
    Errors are logged and not propagated.
    """
    try:
        if state_name not in self._allocations:
            return

        freed = self._allocations[state_name]
        for resource_type, amount in freed.items():
            self.available[resource_type] += amount

        self._preempted_states.add(state_name)
        del self._allocations[state_name]
        # Same clean-up release() performs; otherwise the timestamp of a
        # preempted state would linger indefinitely.
        self._allocation_times.pop(state_name, None)

        # Keep current_usage in step with what is actually allocated now.
        for resource_type in freed:
            stats = self._usage_stats.get(resource_type)
            if stats is not None:
                stats.current_usage = sum(
                    alloc.get(resource_type, 0.0)
                    for alloc in self._allocations.values()
                )

        if self.enable_leak_detection:
            agent = self._agent_names.get(state_name, "unknown")
            leak_detector.track_release(state_name, agent)
        # The state holds nothing any more, so its agent name is obsolete.
        self._agent_names.pop(state_name, None)

        logger.warning(f"Preempted state {state_name}")

    except Exception as e:
        logger.error(f"Error preempting state {state_name}: {e}")
|
|
499
|
+
|
|
500
|
+
async def release(self, state_name: str) -> None:
    """Release all resources held by a state and wake up waiters.

    Returns the state's amounts to the pool, removes its allocation record
    and timestamp, refreshes ``current_usage`` for the freed resource types,
    informs the leak detector (when enabled) and notifies every coroutine
    waiting on the pool's condition. A state without an allocation is a
    no-op. Errors are logged and not propagated.

    Args:
        state_name: State whose resources are released.
    """
    try:
        async with self._condition:
            if state_name in self._allocations:
                freed = self._allocations[state_name]
                for resource_type, amount in freed.items():
                    self.available[resource_type] += amount
                    logger.debug(
                        f"Released {amount} {resource_type.name} from {state_name}"
                    )

                del self._allocations[state_name]
                self._allocation_times.pop(state_name, None)

                # Without this, current_usage keeps the value from the last
                # allocation until the next successful acquire.
                for resource_type in freed:
                    stats = self._usage_stats.get(resource_type)
                    if stats is not None:
                        stats.current_usage = sum(
                            alloc.get(resource_type, 0.0)
                            for alloc in self._allocations.values()
                        )

                if self.enable_leak_detection:
                    agent = self._agent_names.get(state_name, "unknown")
                    leak_detector.track_release(state_name, agent)
                    self._agent_names.pop(state_name, None)

                # Capacity changed, so let every waiter re-check.
                self._condition.notify_all()

    except Exception as e:
        logger.error(f"Error releasing resources for {state_name}: {e}")
|
|
529
|
+
|
|
530
|
+
def _update_stats(
    self, state_name: str, requirements: ResourceRequirements, start_time: float
) -> None:
    """Record a successful allocation in the statistics and history.

    Args:
        state_name: Not used by this method.
        requirements: The request that was just allocated.
        start_time: ``time.time()`` value taken when the acquire began; the
            difference to now is booked as wait time.

    Errors are logged and not propagated.
    """
    try:
        wait_time = time.time() - start_time
        current_time = time.time()

        # Snapshot of what is free after this allocation.
        self._usage_history.append((current_time, self.available.copy()))

        tracked_types = (
            ResourceType.CPU,
            ResourceType.MEMORY,
            ResourceType.IO,
            ResourceType.NETWORK,
            ResourceType.GPU,
        )
        for resource_type in tracked_types:
            if not safe_check_resource_type(requirements, resource_type):
                continue

            amount = get_resource_amount(requirements, resource_type)
            if amount <= 0:
                continue

            stats = self._usage_stats[resource_type]  # type: ignore
            stats.total_allocations += 1
            stats.total_wait_time += wait_time
            stats.last_allocation_time = current_time

            # Usage is the sum held across all states for this type.
            in_use = sum(
                alloc.get(resource_type, 0.0) for alloc in self._allocations.values()
            )
            stats.current_usage = in_use
            stats.peak_usage = max(stats.peak_usage, in_use)

            self._allocation_history[resource_type].append((current_time, in_use))

        # Drop history entries older than the retention window.
        cutoff = current_time - self._history_retention
        self._usage_history = [
            (stamp, snapshot)
            for stamp, snapshot in self._usage_history
            if stamp >= cutoff
        ]
        for resource_type in list(self._allocation_history):
            recent = [
                (stamp, usage)
                for stamp, usage in self._allocation_history[resource_type]
                if stamp >= cutoff
            ]
            self._allocation_history[resource_type] = recent

    except Exception as e:
        logger.error(f"Error updating stats: {e}")
|
|
587
|
+
|
|
588
|
+
def _update_stats_failure(self, requirements: ResourceRequirements) -> None:
    """Count a failed allocation for every resource type that was requested.

    Only requested resource types with a positive amount are counted.
    Errors are logged and not propagated.
    """
    try:
        counted_types = (
            ResourceType.CPU,
            ResourceType.MEMORY,
            ResourceType.IO,
            ResourceType.NETWORK,
            ResourceType.GPU,
        )
        for resource_type in counted_types:
            if not safe_check_resource_type(requirements, resource_type):
                continue
            if get_resource_amount(requirements, resource_type) > 0:
                self._usage_stats[resource_type].failed_allocations += 1  # type: ignore
    except Exception as e:
        logger.error(f"Error updating failure stats: {e}")
|
|
604
|
+
|
|
605
|
+
# Information methods
|
|
606
|
+
def get_usage_stats(self) -> dict[ResourceType, ResourceUsageStats]:
    """Return a new dict mapping each resource type to its statistics.

    The dict is a shallow copy: the statistics objects in it are the ones
    the pool keeps updating.
    """
    return dict(self._usage_stats)  # type: ignore
|
|
609
|
+
|
|
610
|
+
def get_state_allocations(self) -> dict[str, dict[ResourceType, float]]:
    """Return a snapshot of the resources currently held by each state.

    Both the outer mapping and each per-state mapping are copies, so callers
    can modify the result freely without altering the pool's bookkeeping.

    Returns:
        ``{state_name: {resource_type: amount}}`` for every state that holds
        an allocation.
    """
    return {
        state_name: dict(held) for state_name, held in self._allocations.items()
    }
|
|
613
|
+
|
|
614
|
+
def get_waiting_states(self) -> set[str]:
    """Return a copy of the set of states currently waiting in acquire()."""
    return set(self._waiting_states)
|
|
617
|
+
|
|
618
|
+
def get_preempted_states(self) -> set[str]:
    """Return a copy of the set of states that have been preempted."""
    return set(self._preempted_states)
|
|
621
|
+
|
|
622
|
+
def check_leaks(self) -> list[Any]:
    """Ask the leak detector for leaks.

    Returns:
        Whatever ``leak_detector.detect_leaks()`` returns, or an empty list
        when leak detection is disabled or the detector raises.
    """
    if self.enable_leak_detection:
        try:
            return leak_detector.detect_leaks()
        except Exception as e:
            logger.error(f"Error checking leaks: {e}")
    return []
|
|
631
|
+
|
|
632
|
+
def get_leak_metrics(self) -> dict[str, Any]:
    """Fetch metrics from the leak detector.

    Returns:
        ``{"leak_detection": "disabled"}`` when leak detection is off, the
        detector's own metrics when it answers, and a dict with
        ``"leak_detection": "error"`` plus the error text when it raises.
    """
    if not self.enable_leak_detection:
        return {"leak_detection": "disabled"}

    try:
        metrics = leak_detector.get_metrics()
    except Exception as e:
        logger.error(f"Error getting leak metrics: {e}")
        return {"leak_detection": "error", "error": str(e)}
    return metrics
|
|
641
|
+
|
|
642
|
+
async def force_release(self, state_name: str) -> None:
    """Log a warning, then release ``state_name`` through ``release()``."""
    notice = f"Force releasing resources for state {state_name}"
    logger.warning(notice)
    await self.release(state_name)
|