alloc 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- alloc/__init__.py +11 -0
- alloc/artifact_writer.py +67 -0
- alloc/callbacks.py +342 -0
- alloc/catalog/__init__.py +138 -0
- alloc/catalog/default_rate_card.json +18 -0
- alloc/catalog/gpus.v1.json +174 -0
- alloc/cli.py +1341 -0
- alloc/config.py +124 -0
- alloc/context.py +191 -0
- alloc/display.py +580 -0
- alloc/extractor_runner.py +141 -0
- alloc/ghost.py +167 -0
- alloc/model_extractor.py +170 -0
- alloc/model_registry.py +138 -0
- alloc/probe.py +461 -0
- alloc/stability.py +144 -0
- alloc/upload.py +138 -0
- alloc/yaml_config.py +287 -0
- alloc-0.0.1.dist-info/METADATA +256 -0
- alloc-0.0.1.dist-info/RECORD +23 -0
- alloc-0.0.1.dist-info/WHEEL +5 -0
- alloc-0.0.1.dist-info/entry_points.txt +2 -0
- alloc-0.0.1.dist-info/top_level.txt +1 -0
alloc/probe.py
ADDED
@@ -0,0 +1,461 @@
"""Alloc Probe — external GPU monitor via pynvml.

Monitors a training process from outside: polls GPU memory, utilization,
and power draw in a background thread. No modifications to user code.

Graceful no-op if pynvml is not installed or no GPU is available.
"""

from __future__ import annotations

import signal
import subprocess
import sys
import threading
import time
from dataclasses import dataclass, field
from enum import Enum
from typing import List, Optional


class StopReason(str, Enum):
    STABLE = "stable"
    TIMEOUT = "timeout"
    PROCESS_EXIT = "process_exit"
    ERROR = "error"


@dataclass
class ProbeSample:
    """Single GPU metrics sample."""

    timestamp: float
    memory_used_mb: float
    memory_total_mb: float
    gpu_util_pct: float
    power_watts: float


@dataclass
class ProbeResult:
    """Result of a Probe monitoring session."""

    peak_vram_mb: float = 0.0
    avg_gpu_util: float = 0.0
    avg_power_watts: float = 0.0
    duration_seconds: float = 0.0
    samples: list = field(default_factory=list)
    exit_code: Optional[int] = None
    error: Optional[str] = None
    probe_mode: Optional[str] = None
    steps_profiled: Optional[int] = None
    stop_reason: Optional[str] = None
    gpu_name: Optional[str] = None
    gpu_total_vram_mb: Optional[float] = None
    calibration_duration_s: Optional[float] = None
    driver_version: Optional[str] = None
    cuda_version: Optional[str] = None
    sm_version: Optional[str] = None
    num_gpus_detected: int = 1
    process_map: Optional[list] = None
    per_gpu_peak_vram_mb: Optional[list] = None
    detected_interconnect: Optional[str] = None  # "nvlink", "pcie", or None

    @property
    def peak_vram_gb(self) -> float:
        return round(self.peak_vram_mb / 1024, 2)

    @property
    def vram_utilization_pct(self) -> Optional[float]:
        if self.gpu_total_vram_mb and self.gpu_total_vram_mb > 0:
            return round(self.peak_vram_mb / self.gpu_total_vram_mb * 100, 1)
        return None


def _try_import_pynvml():
    """Try to import pynvml. Returns module or None."""
    try:
        import pynvml
        return pynvml
    except ImportError:
        return None


def _check_stable(samples, window=20, variance_threshold=5.0):
    """Check if GPU metrics have stabilized over the last `window` samples.

    Stability = std dev of GPU util over last `window` samples < threshold.
    """
    if len(samples) < window:
        return False

    recent = samples[-window:]
    utils = [s.gpu_util_pct for s in recent]
    mean = sum(utils) / len(utils)
    variance = sum((u - mean) ** 2 for u in utils) / len(utils)
    return variance ** 0.5 < variance_threshold


def _get_child_pids(pid):
    # type: (int) -> List[int]
    """Get child PIDs of a process. Returns empty list on failure."""
    try:
        result = subprocess.run(
            ["pgrep", "-P", str(pid)],
            capture_output=True,
            text=True,
            timeout=5,
        )
        out = result.stdout.strip()
        if out:
            return [int(p) for p in out.split("\n") if p.strip()]
    except Exception:
        pass
    return []


def _discover_gpu_indices(proc_pid, pynvml, fallback_index=0):
    # type: (int, ..., int) -> List[int]
    """Discover which GPUs a process (and its children) are using.

    Iterates all GPU devices and checks running compute processes.
    Falls back to [fallback_index] if discovery fails or finds nothing.
    """
    try:
        device_count = pynvml.nvmlDeviceGetCount()
    except Exception:
        return [fallback_index]

    # Collect target PIDs: the main process + its children
    target_pids = {proc_pid}
    for child in _get_child_pids(proc_pid):
        target_pids.add(child)
        # Also check grandchildren (common with torchrun/accelerate)
        for grandchild in _get_child_pids(child):
            target_pids.add(grandchild)

    found_indices = []
    for idx in range(device_count):
        try:
            handle = pynvml.nvmlDeviceGetHandleByIndex(idx)
            procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
            for p in procs:
                if p.pid in target_pids:
                    found_indices.append(idx)
                    break
        except Exception:
            continue

    return found_indices if found_indices else [fallback_index]


def _detect_interconnect(handles, pynvml):
    # type: (list, ...) -> Optional[str]
    """Detect GPU interconnect type using NVML topology API.

    Checks topology between GPU pairs. Returns "nvlink" if any pair
    is connected via NVLink, "pcie" if all pairs use PCIe, or None
    if detection fails or only one GPU.
    """
    if len(handles) < 2:
        return None
    try:
        # Check topology between first two GPU handles
        # NVML topology levels: SINGLE(10)=NVLink, MULTIPLE(20)=NVLink multi-hop,
        # HOSTBRIDGE(30)=PCIe bridge, NODE(40)=same NUMA, SYSTEM(50)=cross-socket
        level = pynvml.nvmlDeviceGetTopologyCommonAncestor(handles[0], handles[1])
        # pynvml may return an int or an enum; normalize to int
        level_val = int(level) if not isinstance(level, int) else level
        if level_val <= 20:
            return "nvlink"
        return "pcie"
    except Exception:
        return None


def probe_command(
    command,  # type: list
    *,
    poll_interval_ms=500,  # type: int
    timeout_seconds=120,  # type: int
    gpu_index=0,  # type: int
    calibrate=True,  # type: bool
):
    # type: (...) -> ProbeResult
    """Run a command and monitor GPU usage externally.

    Args:
        command: Command to run as subprocess (e.g. ["python", "train.py"])
        poll_interval_ms: How often to poll GPU metrics (default 500ms)
        timeout_seconds: Max time to monitor (default 120s)
        gpu_index: Which GPU to monitor (default 0)
        calibrate: If True (default), auto-stop when metrics stabilize.

    Returns:
        ProbeResult with peak VRAM, avg utilization, samples timeseries.
        On failure, returns a result with error set — never raises.
    """
    pynvml = _try_import_pynvml()

    # Launch the subprocess
    try:
        proc = subprocess.Popen(
            command,
            stdout=sys.stdout,
            stderr=sys.stderr,
        )
    except Exception as e:
        return ProbeResult(
            error=f"Failed to start process: {e}",
            stop_reason=StopReason.ERROR.value,
        )

    if pynvml is None:
        # No GPU monitoring — just wait for process
        try:
            if timeout_seconds > 0:
                proc.wait(timeout=timeout_seconds)
            else:
                proc.wait()
        except subprocess.TimeoutExpired:
            proc.terminate()
            proc.wait(timeout=10)
        return ProbeResult(
            exit_code=proc.returncode,
            error="pynvml not installed — install with: pip install alloc[gpu]",
            stop_reason=StopReason.ERROR.value,
        )

    # Monitor with pynvml
    samples = []  # type: list[ProbeSample]
    stop_event = threading.Event()
    ramp_up_samples = 20  # Skip first 20 samples for stability check
    stop_reason_ref = [None]  # type: list[Optional[str]]
    gpu_info_ref = [None, None]  # type: list  # [gpu_name, gpu_total_vram_mb]
    hw_info_ref = [None, None, None]  # type: list  # [driver_version, cuda_version, sm_version]
    calibration_time_ref = [None]  # type: list[Optional[float]]
    num_gpus_ref = [1]  # type: list[int]
    process_map_ref = [None]  # type: list
    per_gpu_peaks_ref = [{}]  # type: list[dict]  # {handle_idx: peak_vram_mb}
    detected_ic_ref = [None]  # type: list[Optional[str]]

    def _monitor():
        try:
            pynvml.nvmlInit()
            handle = pynvml.nvmlDeviceGetHandleByIndex(gpu_index)

            # Capture GPU name and total VRAM
            try:
                name = pynvml.nvmlDeviceGetName(handle)
                if isinstance(name, bytes):
                    name = name.decode("utf-8")
                gpu_info_ref[0] = name
                mem_info_init = pynvml.nvmlDeviceGetMemoryInfo(handle)
                gpu_info_ref[1] = mem_info_init.total / (1024 * 1024)
            except Exception:
                pass

            # Capture hardware context (driver, CUDA, SM version)
            try:
                drv = pynvml.nvmlSystemGetDriverVersion()
                if isinstance(drv, bytes):
                    drv = drv.decode("utf-8")
                hw_info_ref[0] = drv
            except Exception:
                pass
            try:
                cuda_ver_int = pynvml.nvmlSystemGetCudaDriverVersion()
                major = cuda_ver_int // 1000
                minor = (cuda_ver_int % 1000) // 10
                hw_info_ref[1] = f"{major}.{minor}"
            except Exception:
                pass
            try:
                sm_major, sm_minor = pynvml.nvmlDeviceGetCudaComputeCapability(handle)
                hw_info_ref[2] = f"{sm_major}.{sm_minor}"
            except Exception:
                pass

            handles = [handle]
            discovery_done = False

            while not stop_event.is_set():
                # After 5 samples, try to discover all GPUs used by the process
                if not discovery_done and len(samples) >= 5 and proc.pid:
                    try:
                        discovered = _discover_gpu_indices(proc.pid, pynvml, fallback_index=gpu_index)
                        if len(discovered) > 1:
                            handles = []
                            pmap = []
                            for idx in discovered:
                                h = pynvml.nvmlDeviceGetHandleByIndex(idx)
                                handles.append(h)
                                pmap.append({"gpu_index": idx})
                            num_gpus_ref[0] = len(handles)
                            process_map_ref[0] = pmap
                    except Exception:
                        pass
                    # Detect interconnect type between discovered GPUs
                    detected_ic_ref[0] = _detect_interconnect(handles, pynvml)
                    discovery_done = True

                # Sample from all monitored GPUs — aggregate: peak vram = max, util/power = mean
                try:
                    vram_vals = []
                    util_vals = []
                    power_vals = []
                    total_mb = 0.0
                    for h in handles:
                        mi = pynvml.nvmlDeviceGetMemoryInfo(h)
                        ut = pynvml.nvmlDeviceGetUtilizationRates(h)
                        pw = pynvml.nvmlDeviceGetPowerUsage(h) / 1000.0
                        vram_vals.append(mi.used / (1024 * 1024))
                        util_vals.append(ut.gpu)
                        power_vals.append(pw)
                        total_mb = mi.total / (1024 * 1024)

                    # Track per-GPU peak VRAM for multi-GPU runs
                    if len(handles) > 1:
                        pgp = per_gpu_peaks_ref[0]
                        for gi, vm in enumerate(vram_vals):
                            pgp[gi] = max(pgp.get(gi, 0.0), vm)

                    samples.append(ProbeSample(
                        timestamp=time.time(),
                        memory_used_mb=max(vram_vals),
                        memory_total_mb=total_mb,
                        gpu_util_pct=sum(util_vals) / len(util_vals),
                        power_watts=sum(power_vals) / len(power_vals),
                    ))
                except Exception:
                    pass

                # Calibrate mode: auto-stop when stable
                if calibrate and len(samples) > ramp_up_samples:
                    from alloc.stability import check_stability, RAMP_UP_SAMPLES
                    sr = check_stability(samples, poll_interval_ms=poll_interval_ms)
                    if sr.is_stable:
                        stop_reason_ref[0] = StopReason.STABLE.value
                        calibration_time_ref[0] = time.time()
                        stop_event.set()
                        break

                stop_event.wait(poll_interval_ms / 1000.0)
        except Exception:
            pass
        finally:
            try:
                pynvml.nvmlShutdown()
            except Exception:
                pass

    monitor_thread = threading.Thread(target=_monitor, daemon=True)
    monitor_thread.start()

    start_time = time.time()

    if calibrate:
        # Calibrate mode (new default): wait for stability, process exit, or timeout
        while not stop_event.is_set() and proc.poll() is None:
            elapsed = time.time() - start_time
            if timeout_seconds > 0 and elapsed >= timeout_seconds:
                stop_reason_ref[0] = StopReason.TIMEOUT.value
                break
            stop_event.wait(0.5)

        if proc.poll() is not None and stop_reason_ref[0] is None:
            stop_reason_ref[0] = StopReason.PROCESS_EXIT.value

        # Gracefully terminate the process if still running
        if proc.poll() is None:
            proc.send_signal(signal.SIGTERM)
            try:
                proc.wait(timeout=15)
            except subprocess.TimeoutExpired:
                proc.kill()
                proc.wait()

    else:
        # Full mode: wait for process to complete
        try:
            if timeout_seconds > 0:
                proc.wait(timeout=timeout_seconds)
            else:
                proc.wait()
            stop_reason_ref[0] = StopReason.PROCESS_EXIT.value
        except subprocess.TimeoutExpired:
            stop_reason_ref[0] = StopReason.TIMEOUT.value
            proc.terminate()
            try:
                proc.wait(timeout=10)
            except subprocess.TimeoutExpired:
                proc.kill()
                proc.wait()

    stop_event.set()
    monitor_thread.join(timeout=5)
    duration = time.time() - start_time

    # Determine probe_mode
    if calibrate:
        mode = "calibrate"
    else:
        mode = "full"

    # Compute calibration duration if we stopped due to stability
    cal_duration = None
    if calibration_time_ref[0] is not None:
        cal_duration = round(calibration_time_ref[0] - start_time, 2)

    if not samples:
        return ProbeResult(
            duration_seconds=round(duration, 2),
            exit_code=proc.returncode,
            probe_mode=mode,
            stop_reason=stop_reason_ref[0],
            gpu_name=gpu_info_ref[0],
            gpu_total_vram_mb=gpu_info_ref[1],
            driver_version=hw_info_ref[0],
            cuda_version=hw_info_ref[1],
            sm_version=hw_info_ref[2],
            num_gpus_detected=num_gpus_ref[0],
            process_map=process_map_ref[0],
            detected_interconnect=detected_ic_ref[0],
        )

    peak_vram = max(s.memory_used_mb for s in samples)
    avg_util = sum(s.gpu_util_pct for s in samples) / len(samples)
    avg_power = sum(s.power_watts for s in samples) / len(samples)

    return ProbeResult(
        peak_vram_mb=round(peak_vram, 1),
        avg_gpu_util=round(avg_util, 1),
        avg_power_watts=round(avg_power, 1),
        duration_seconds=round(duration, 2),
        samples=[
            {
                "t": round(s.timestamp - samples[0].timestamp, 2),
                "vram_mb": round(s.memory_used_mb, 1),
                "gpu_util_pct": round(s.gpu_util_pct, 1),
                "power_w": round(s.power_watts, 1),
            }
            for s in samples
        ],
        exit_code=proc.returncode,
        probe_mode=mode,
        steps_profiled=None,
        stop_reason=stop_reason_ref[0],
        gpu_name=gpu_info_ref[0],
        gpu_total_vram_mb=gpu_info_ref[1],
        calibration_duration_s=cal_duration,
        driver_version=hw_info_ref[0],
        cuda_version=hw_info_ref[1],
        sm_version=hw_info_ref[2],
        num_gpus_detected=num_gpus_ref[0],
        process_map=process_map_ref[0],
        per_gpu_peak_vram_mb=(
            [round(per_gpu_peaks_ref[0].get(i, 0), 1) for i in range(num_gpus_ref[0])]
            if len(per_gpu_peaks_ref[0]) > 1 else None
        ),
        detected_interconnect=detected_ic_ref[0],
    )
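Usage note (not part of the published wheel): a minimal sketch of how probe_command would be invoked, based on its docstring and the ProbeResult fields above. The "train.py" script name is only an illustration; it assumes alloc is installed with its GPU extra (pip install alloc[gpu], as referenced in the error message above).

# Sketch: run a training script under the external probe and read the result.
from alloc.probe import probe_command

result = probe_command(
    ["python", "train.py"],   # command launched as a subprocess (illustrative)
    poll_interval_ms=500,     # GPU metrics sampled every 0.5 s
    timeout_seconds=120,      # hard cap on monitoring time
    calibrate=True,           # stop early once metrics stabilize
)

if result.error:
    print("probe failed:", result.error)
else:
    print("peak VRAM (GB):", result.peak_vram_gb)
    print("avg GPU util %:", result.avg_gpu_util)
    print("stopped because:", result.stop_reason)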
alloc/stability.py
ADDED
@@ -0,0 +1,144 @@
"""Multi-signal stability detection for calibrate-and-exit mode.

Pure functions — no side effects, no threading, no I/O.
Determines when GPU metrics have stabilized enough to stop monitoring.
"""

from __future__ import annotations

from dataclasses import dataclass
from typing import List

from alloc.probe import ProbeSample

# --- Constants ---

RAMP_UP_SAMPLES = 20  # Skip first 20 samples (~10s warmup at 500ms poll)
MIN_SAMPLES_FOR_STABILITY = 30  # Earliest stability possible = sample 30 (~15s)
MIN_PEAK_VRAM_MB = 100.0  # Require >100MB to confirm GPU is actually being used
VRAM_PLATEAU_WINDOW_S = 15.0  # No new VRAM peak in last 15 seconds
UTIL_STABILITY_WINDOW = 20  # Rolling window for util/power std dev checks
GPU_UTIL_STD_THRESHOLD = 5.0  # GPU util std dev < 5% = stable
POWER_STD_THRESHOLD = 10.0  # Power draw std dev < 10W = stable


@dataclass
class StabilityResult:
    """Result of a stability check."""

    is_stable: bool
    reason: str
    vram_plateau: bool
    util_stable: bool
    power_stable: bool
    current_peak_vram_mb: float
    sample_count: int


def check_stability(samples, poll_interval_ms=500):
    # type: (List[ProbeSample], int) -> StabilityResult
    """Check if GPU metrics have stabilized across multiple signals.

    ALL must hold for is_stable=True:
    1. len(samples) >= MIN_SAMPLES_FOR_STABILITY
    2. peak_vram > MIN_PEAK_VRAM_MB (GPU actually in use)
    3. VRAM plateau: no new peak in last VRAM_PLATEAU_WINDOW_S seconds
    4. GPU util std dev < GPU_UTIL_STD_THRESHOLD over last UTIL_STABILITY_WINDOW post-ramp-up samples
    5. Power std dev < POWER_STD_THRESHOLD over last UTIL_STABILITY_WINDOW post-ramp-up samples
    """
    count = len(samples)

    if count < MIN_SAMPLES_FOR_STABILITY:
        return StabilityResult(
            is_stable=False,
            reason="insufficient_samples",
            vram_plateau=False,
            util_stable=False,
            power_stable=False,
            current_peak_vram_mb=0.0,
            sample_count=count,
        )

    peak_vram = max(s.memory_used_mb for s in samples)

    if peak_vram < MIN_PEAK_VRAM_MB:
        return StabilityResult(
            is_stable=False,
            reason="below_min_vram",
            vram_plateau=False,
            util_stable=False,
            power_stable=False,
            current_peak_vram_mb=peak_vram,
            sample_count=count,
        )

    vram_plateau = _check_vram_plateau(samples, poll_interval_ms)

    # Use post-ramp-up samples for util/power checks
    post_ramp = samples[RAMP_UP_SAMPLES:]
    recent_util = [s.gpu_util_pct for s in post_ramp[-UTIL_STABILITY_WINDOW:]]
    recent_power = [s.power_watts for s in post_ramp[-UTIL_STABILITY_WINDOW:]]

    util_stable = _check_metric_stable(recent_util, GPU_UTIL_STD_THRESHOLD)
    power_stable = _check_metric_stable(recent_power, POWER_STD_THRESHOLD)

    is_stable = vram_plateau and util_stable and power_stable

    reasons = []
    if vram_plateau:
        reasons.append("vram_plateau")
    if util_stable:
        reasons.append("util_stable")
    if power_stable:
        reasons.append("power_stable")

    if not is_stable:
        failing = []
        if not vram_plateau:
            failing.append("vram_rising")
        if not util_stable:
            failing.append("util_unstable")
        if not power_stable:
            failing.append("power_unstable")
        reason = "+".join(failing)
    else:
        reason = "+".join(reasons)

    return StabilityResult(
        is_stable=is_stable,
        reason=reason,
        vram_plateau=vram_plateau,
        util_stable=util_stable,
        power_stable=power_stable,
        current_peak_vram_mb=peak_vram,
        sample_count=count,
    )


def _check_vram_plateau(samples, poll_interval_ms):
    # type: (List[ProbeSample], int) -> bool
    """Check if VRAM has plateaued — no new peak in the last window."""
    if not samples:
        return False

    peak_vram = -1.0
    peak_index = 0
    for i, s in enumerate(samples):
        if s.memory_used_mb > peak_vram:
            peak_vram = s.memory_used_mb
            peak_index = i

    window_samples = int(VRAM_PLATEAU_WINDOW_S / (poll_interval_ms / 1000.0))
    return (len(samples) - 1 - peak_index) >= window_samples


def _check_metric_stable(values, threshold):
    # type: (List[float], float) -> bool
    """Check if population std dev of values is below threshold."""
    if not values:
        return False

    n = len(values)
    mean = sum(values) / n
    variance = sum((v - mean) ** 2 for v in values) / n
    return variance ** 0.5 < threshold