alloc 0.0.14__tar.gz → 0.0.16__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {alloc-0.0.14 → alloc-0.0.16}/PKG-INFO +1 -1
- {alloc-0.0.14 → alloc-0.0.16}/pyproject.toml +1 -1
- {alloc-0.0.14 → alloc-0.0.16}/src/alloc/__init__.py +1 -1
- {alloc-0.0.14 → alloc-0.0.16}/src/alloc/cli.py +7 -5
- {alloc-0.0.14 → alloc-0.0.16}/src/alloc/probe.py +59 -16
- {alloc-0.0.14 → alloc-0.0.16}/src/alloc.egg-info/PKG-INFO +1 -1
- {alloc-0.0.14 → alloc-0.0.16}/tests/test_probe_multi.py +167 -0
- {alloc-0.0.14 → alloc-0.0.16}/README.md +0 -0
- {alloc-0.0.14 → alloc-0.0.16}/setup.cfg +0 -0
- {alloc-0.0.14 → alloc-0.0.16}/src/alloc/artifact_loader.py +0 -0
- {alloc-0.0.14 → alloc-0.0.16}/src/alloc/artifact_writer.py +0 -0
- {alloc-0.0.14 → alloc-0.0.16}/src/alloc/browser_auth.py +0 -0
- {alloc-0.0.14 → alloc-0.0.16}/src/alloc/callbacks.py +0 -0
- {alloc-0.0.14 → alloc-0.0.16}/src/alloc/catalog/__init__.py +0 -0
- {alloc-0.0.14 → alloc-0.0.16}/src/alloc/catalog/default_rate_card.json +0 -0
- {alloc-0.0.14 → alloc-0.0.16}/src/alloc/catalog/gpus.v1.json +0 -0
- {alloc-0.0.14 → alloc-0.0.16}/src/alloc/code_analyzer.py +0 -0
- {alloc-0.0.14 → alloc-0.0.16}/src/alloc/config.py +0 -0
- {alloc-0.0.14 → alloc-0.0.16}/src/alloc/context.py +0 -0
- {alloc-0.0.14 → alloc-0.0.16}/src/alloc/diagnosis_display.py +0 -0
- {alloc-0.0.14 → alloc-0.0.16}/src/alloc/diagnosis_engine.py +0 -0
- {alloc-0.0.14 → alloc-0.0.16}/src/alloc/diagnosis_rules.py +0 -0
- {alloc-0.0.14 → alloc-0.0.16}/src/alloc/display.py +0 -0
- {alloc-0.0.14 → alloc-0.0.16}/src/alloc/extractor_runner.py +0 -0
- {alloc-0.0.14 → alloc-0.0.16}/src/alloc/ghost.py +0 -0
- {alloc-0.0.14 → alloc-0.0.16}/src/alloc/model_extractor.py +0 -0
- {alloc-0.0.14 → alloc-0.0.16}/src/alloc/model_registry.py +0 -0
- {alloc-0.0.14 → alloc-0.0.16}/src/alloc/stability.py +0 -0
- {alloc-0.0.14 → alloc-0.0.16}/src/alloc/upload.py +0 -0
- {alloc-0.0.14 → alloc-0.0.16}/src/alloc/yaml_config.py +0 -0
- {alloc-0.0.14 → alloc-0.0.16}/src/alloc.egg-info/SOURCES.txt +0 -0
- {alloc-0.0.14 → alloc-0.0.16}/src/alloc.egg-info/dependency_links.txt +0 -0
- {alloc-0.0.14 → alloc-0.0.16}/src/alloc.egg-info/entry_points.txt +0 -0
- {alloc-0.0.14 → alloc-0.0.16}/src/alloc.egg-info/requires.txt +0 -0
- {alloc-0.0.14 → alloc-0.0.16}/src/alloc.egg-info/top_level.txt +0 -0
- {alloc-0.0.14 → alloc-0.0.16}/tests/test_artifact.py +0 -0
- {alloc-0.0.14 → alloc-0.0.16}/tests/test_artifact_loader.py +0 -0
- {alloc-0.0.14 → alloc-0.0.16}/tests/test_auth.py +0 -0
- {alloc-0.0.14 → alloc-0.0.16}/tests/test_callbacks.py +0 -0
- {alloc-0.0.14 → alloc-0.0.16}/tests/test_catalog.py +0 -0
- {alloc-0.0.14 → alloc-0.0.16}/tests/test_cli.py +0 -0
- {alloc-0.0.14 → alloc-0.0.16}/tests/test_code_analyzer.py +0 -0
- {alloc-0.0.14 → alloc-0.0.16}/tests/test_context.py +0 -0
- {alloc-0.0.14 → alloc-0.0.16}/tests/test_diagnose_cli.py +0 -0
- {alloc-0.0.14 → alloc-0.0.16}/tests/test_diagnosis_engine.py +0 -0
- {alloc-0.0.14 → alloc-0.0.16}/tests/test_diagnosis_rules.py +0 -0
- {alloc-0.0.14 → alloc-0.0.16}/tests/test_extractor_activation.py +0 -0
- {alloc-0.0.14 → alloc-0.0.16}/tests/test_ghost.py +0 -0
- {alloc-0.0.14 → alloc-0.0.16}/tests/test_ghost_degradation.py +0 -0
- {alloc-0.0.14 → alloc-0.0.16}/tests/test_init_from_org.py +0 -0
- {alloc-0.0.14 → alloc-0.0.16}/tests/test_interconnect.py +0 -0
- {alloc-0.0.14 → alloc-0.0.16}/tests/test_model_extractor.py +0 -0
- {alloc-0.0.14 → alloc-0.0.16}/tests/test_probe_hw.py +0 -0
- {alloc-0.0.14 → alloc-0.0.16}/tests/test_scan_auth.py +0 -0
- {alloc-0.0.14 → alloc-0.0.16}/tests/test_stability.py +0 -0
- {alloc-0.0.14 → alloc-0.0.16}/tests/test_topology_strategy.py +0 -0
- {alloc-0.0.14 → alloc-0.0.16}/tests/test_upload.py +0 -0
- {alloc-0.0.14 → alloc-0.0.16}/tests/test_verdict.py +0 -0
- {alloc-0.0.14 → alloc-0.0.16}/tests/test_yaml_config.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: alloc
|
|
3
|
-
Version: 0.0.14
|
|
3
|
+
Version: 0.0.16
|
|
4
4
|
Summary: Engineer-first training calibration: estimate VRAM fit, profile short runs, and pick GPU configs under real budget constraints.
|
|
5
5
|
Author-email: Alloc Labs <hello@alloclabs.com>
|
|
6
6
|
License-Expression: Apache-2.0
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "alloc"
|
|
7
|
-
version = "0.0.14"
|
|
7
|
+
version = "0.0.16"
|
|
8
8
|
description = "Engineer-first training calibration: estimate VRAM fit, profile short runs, and pick GPU configs under real budget constraints."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
license = "Apache-2.0"
|
|
@@ -9,7 +9,7 @@ _warnings.filterwarnings("ignore", category=FutureWarning, module=r"torch\.cuda"
|
|
|
9
9
|
_warnings.filterwarnings("ignore", category=DeprecationWarning, module=r"torch\.cuda")
|
|
10
10
|
del _warnings
|
|
11
11
|
|
|
12
|
-
__version__ = "0.0.14"
|
|
12
|
+
__version__ = "0.0.16"
|
|
13
13
|
|
|
14
14
|
from alloc.ghost import ghost, GhostReport
|
|
15
15
|
from alloc.callbacks import AllocCallback as HuggingFaceCallback
|
|
@@ -2184,11 +2184,13 @@ def scan(
|
|
|
2184
2184
|
resp = client.post(f"{api_url}/scans", json=payload, headers=headers)
|
|
2185
2185
|
else:
|
|
2186
2186
|
# Token refresh failed — fall back to unauthenticated scan
|
|
2187
|
-
|
|
2188
|
-
|
|
2189
|
-
|
|
2190
|
-
|
|
2191
|
-
)
|
|
2187
|
+
# Always print to stderr (not stdout) so JSON output is clean
|
|
2188
|
+
import sys as _sys
|
|
2189
|
+
print(
|
|
2190
|
+
"Session expired — falling back to public scan "
|
|
2191
|
+
"(org fleet context unavailable). Run `alloc login` to restore.",
|
|
2192
|
+
file=_sys.stderr,
|
|
2193
|
+
)
|
|
2192
2194
|
del headers["Authorization"]
|
|
2193
2195
|
resp = client.post(f"{api_url}/scans/cli", json=payload, headers=headers)
|
|
2194
2196
|
|
|
@@ -469,6 +469,8 @@ def probe_command(
|
|
|
469
469
|
pass
|
|
470
470
|
|
|
471
471
|
handles = [handle]
|
|
472
|
+
# Map from handle index → physical GPU index (for per_gpu_peaks keying)
|
|
473
|
+
handle_gpu_indices = [gpu_index]
|
|
472
474
|
discovery_done = False
|
|
473
475
|
discovery_attempts = 0
|
|
474
476
|
max_discovery_attempts = 3 # Retry at samples 5, 15, 30
|
|
@@ -487,6 +489,35 @@ def probe_command(
|
|
|
487
489
|
except ValueError:
|
|
488
490
|
pass
|
|
489
491
|
|
|
492
|
+
# Early-initialize handles for all expected GPUs so per_gpu_peaks
|
|
493
|
+
# is populated from sample 0 — don't depend on process-tree
|
|
494
|
+
# discovery timing. Discovery still runs for process_map and to
|
|
495
|
+
# confirm which specific GPUs are in use via PID matching.
|
|
496
|
+
# NOTE: Do NOT set num_gpus_ref here — that would satisfy
|
|
497
|
+
# discovery_done prematurely and prevent retries at samples 15/30.
|
|
498
|
+
early_init_indices = None # type: Optional[list]
|
|
499
|
+
if expected_gpus > 1:
|
|
500
|
+
try:
|
|
501
|
+
device_count = pynvml.nvmlDeviceGetCount()
|
|
502
|
+
if device_count >= expected_gpus:
|
|
503
|
+
early_handles = []
|
|
504
|
+
early_indices = []
|
|
505
|
+
for idx in range(device_count):
|
|
506
|
+
if len(early_handles) >= expected_gpus:
|
|
507
|
+
break
|
|
508
|
+
try:
|
|
509
|
+
h = pynvml.nvmlDeviceGetHandleByIndex(idx)
|
|
510
|
+
early_handles.append(h)
|
|
511
|
+
early_indices.append(idx)
|
|
512
|
+
except Exception:
|
|
513
|
+
pass
|
|
514
|
+
if len(early_handles) >= expected_gpus:
|
|
515
|
+
handles = early_handles
|
|
516
|
+
handle_gpu_indices = early_indices
|
|
517
|
+
early_init_indices = early_indices
|
|
518
|
+
except Exception:
|
|
519
|
+
pass
|
|
520
|
+
|
|
490
521
|
while not stop_event.is_set():
|
|
491
522
|
# Retry GPU discovery: at samples 5, 15, 30
|
|
492
523
|
# Keep retrying if we haven't found all expected GPUs yet
|
|
@@ -503,10 +534,12 @@ def probe_command(
|
|
|
503
534
|
)
|
|
504
535
|
if len(discovered) > 1:
|
|
505
536
|
handles = []
|
|
537
|
+
handle_gpu_indices = []
|
|
506
538
|
pmap = []
|
|
507
539
|
for idx in discovered:
|
|
508
540
|
h = pynvml.nvmlDeviceGetHandleByIndex(idx)
|
|
509
541
|
handles.append(h)
|
|
542
|
+
handle_gpu_indices.append(idx)
|
|
510
543
|
pmap.append({"gpu_index": idx})
|
|
511
544
|
num_gpus_ref[0] = len(handles)
|
|
512
545
|
process_map_ref[0] = pmap
|
|
@@ -527,14 +560,21 @@ def probe_command(
|
|
|
527
560
|
# Stop retrying if we found expected count or exhausted attempts
|
|
528
561
|
if num_gpus_ref[0] >= expected_gpus or discovery_attempts >= max_discovery_attempts:
|
|
529
562
|
discovery_done = True
|
|
563
|
+
# If discovery never confirmed multi-GPU via PID matching
|
|
564
|
+
# but early-init opened handles for expected GPUs, generate
|
|
565
|
+
# a fallback process_map from the early-init indices.
|
|
566
|
+
if process_map_ref[0] is None and early_init_indices is not None:
|
|
567
|
+
process_map_ref[0] = [{"gpu_index": idx} for idx in early_init_indices]
|
|
568
|
+
num_gpus_ref[0] = len(early_init_indices)
|
|
530
569
|
|
|
531
570
|
# Sample from all monitored GPUs — aggregate: peak vram = max, util/power = mean
|
|
532
|
-
try:
|
|
533
|
-
|
|
534
|
-
|
|
535
|
-
|
|
536
|
-
|
|
537
|
-
|
|
571
|
+
# Per-GPU try/except: one bad handle must not prevent tracking others
|
|
572
|
+
vram_vals = []
|
|
573
|
+
util_vals = []
|
|
574
|
+
power_vals = []
|
|
575
|
+
total_mb = 0.0
|
|
576
|
+
for h in handles:
|
|
577
|
+
try:
|
|
538
578
|
mi = pynvml.nvmlDeviceGetMemoryInfo(h)
|
|
539
579
|
ut = pynvml.nvmlDeviceGetUtilizationRates(h)
|
|
540
580
|
pw = pynvml.nvmlDeviceGetPowerUsage(h) / 1000.0
|
|
@@ -542,25 +582,28 @@ def probe_command(
|
|
|
542
582
|
util_vals.append(ut.gpu)
|
|
543
583
|
power_vals.append(pw)
|
|
544
584
|
total_mb = mi.total / (1024 * 1024)
|
|
585
|
+
except Exception:
|
|
586
|
+
pass
|
|
545
587
|
|
|
546
|
-
|
|
547
|
-
|
|
548
|
-
|
|
549
|
-
|
|
550
|
-
|
|
588
|
+
# Track per-GPU peak VRAM (always, even single GPU —
|
|
589
|
+
# discovery may expand handles later, and we need history from sample 0)
|
|
590
|
+
pgp = per_gpu_peaks_ref[0]
|
|
591
|
+
for gi, vm in enumerate(vram_vals):
|
|
592
|
+
pgp[gi] = max(pgp.get(gi, 0.0), vm)
|
|
551
593
|
|
|
594
|
+
if vram_vals:
|
|
552
595
|
samples.append(ProbeSample(
|
|
553
596
|
timestamp=time.time(),
|
|
554
597
|
memory_used_mb=max(vram_vals),
|
|
555
598
|
memory_total_mb=total_mb,
|
|
556
|
-
gpu_util_pct=sum(util_vals) / len(util_vals),
|
|
557
|
-
power_watts=sum(power_vals) / len(power_vals),
|
|
599
|
+
gpu_util_pct=sum(util_vals) / len(util_vals) if util_vals else 0.0,
|
|
600
|
+
power_watts=sum(power_vals) / len(power_vals) if power_vals else 0.0,
|
|
558
601
|
))
|
|
559
|
-
except Exception:
|
|
560
|
-
pass
|
|
561
602
|
|
|
562
603
|
# Calibrate mode: auto-stop when stable
|
|
563
|
-
|
|
604
|
+
# Delay stability check until GPU discovery is complete —
|
|
605
|
+
# prevents calibrate-and-exit before finding all expected GPUs.
|
|
606
|
+
if calibrate and discovery_done and len(samples) > ramp_up_samples:
|
|
564
607
|
from alloc.stability import check_stability, RAMP_UP_SAMPLES
|
|
565
608
|
sr = check_stability(samples, poll_interval_ms=poll_interval_ms)
|
|
566
609
|
if sr.is_stable:
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: alloc
|
|
3
|
-
Version: 0.0.14
|
|
3
|
+
Version: 0.0.16
|
|
4
4
|
Summary: Engineer-first training calibration: estimate VRAM fit, profile short runs, and pick GPU configs under real budget constraints.
|
|
5
5
|
Author-email: Alloc Labs <hello@alloclabs.com>
|
|
6
6
|
License-Expression: Apache-2.0
|
|
@@ -245,3 +245,170 @@ def test_active_gpu_fallback_not_used_without_expected():
|
|
|
245
245
|
with patch("alloc.probe._read_child_env", return_value=None):
|
|
246
246
|
result = _discover_gpu_indices(1000, mock, fallback_index=0)
|
|
247
247
|
assert result == [0] # Falls back to default
|
|
248
|
+
|
|
249
|
+
|
|
250
|
+
# ── Early handle initialization for expected GPUs ──
|
|
251
|
+
|
|
252
|
+
|
|
253
|
+
def test_early_init_opens_handles_for_expected_gpus():
|
|
254
|
+
"""When expected_gpus > 1 and device_count >= expected, early-init should
|
|
255
|
+
open handles for all expected GPUs."""
|
|
256
|
+
mock_pynvml = MagicMock()
|
|
257
|
+
mock_pynvml.nvmlDeviceGetCount = MagicMock(return_value=2)
|
|
258
|
+
|
|
259
|
+
handles_map = {0: MagicMock(name="gpu0"), 1: MagicMock(name="gpu1")}
|
|
260
|
+
mock_pynvml.nvmlDeviceGetHandleByIndex = MagicMock(side_effect=lambda i: handles_map[i])
|
|
261
|
+
|
|
262
|
+
# Simulate early-init logic from probe_command._monitor()
|
|
263
|
+
expected_gpus = 2
|
|
264
|
+
handles = [handles_map[0]]
|
|
265
|
+
|
|
266
|
+
if expected_gpus > 1:
|
|
267
|
+
device_count = mock_pynvml.nvmlDeviceGetCount()
|
|
268
|
+
if device_count >= expected_gpus:
|
|
269
|
+
early_handles = []
|
|
270
|
+
early_indices = []
|
|
271
|
+
for idx in range(device_count):
|
|
272
|
+
if len(early_handles) >= expected_gpus:
|
|
273
|
+
break
|
|
274
|
+
h = mock_pynvml.nvmlDeviceGetHandleByIndex(idx)
|
|
275
|
+
early_handles.append(h)
|
|
276
|
+
early_indices.append(idx)
|
|
277
|
+
if len(early_handles) >= expected_gpus:
|
|
278
|
+
handles = early_handles
|
|
279
|
+
|
|
280
|
+
assert len(handles) == 2
|
|
281
|
+
|
|
282
|
+
|
|
283
|
+
def test_early_init_skipped_when_fewer_devices():
|
|
284
|
+
"""When device_count < expected_gpus, early-init should not change handles."""
|
|
285
|
+
mock_pynvml = MagicMock()
|
|
286
|
+
mock_pynvml.nvmlDeviceGetCount = MagicMock(return_value=1)
|
|
287
|
+
|
|
288
|
+
expected_gpus = 2
|
|
289
|
+
handles = [MagicMock(name="gpu0")]
|
|
290
|
+
original_handles = list(handles)
|
|
291
|
+
|
|
292
|
+
if expected_gpus > 1:
|
|
293
|
+
device_count = mock_pynvml.nvmlDeviceGetCount()
|
|
294
|
+
if device_count >= expected_gpus:
|
|
295
|
+
assert False, "Should not reach here"
|
|
296
|
+
|
|
297
|
+
assert len(handles) == 1
|
|
298
|
+
|
|
299
|
+
|
|
300
|
+
def test_per_gpu_sampling_resilient_to_partial_failure():
|
|
301
|
+
"""Per-GPU try/except: one GPU failure should not prevent others from
|
|
302
|
+
being sampled into per_gpu_peaks."""
|
|
303
|
+
mock_pynvml = MagicMock()
|
|
304
|
+
handles_map = {0: MagicMock(name="gpu0"), 1: MagicMock(name="gpu1")}
|
|
305
|
+
|
|
306
|
+
mem_ok = MagicMock()
|
|
307
|
+
mem_ok.total = 24 * 1024 * 1024 * 1024
|
|
308
|
+
mem_ok.used = 8000 * 1024 * 1024
|
|
309
|
+
|
|
310
|
+
def mem_info_side_effect(h):
|
|
311
|
+
if h == handles_map[1]:
|
|
312
|
+
raise RuntimeError("GPU 1 memory read failed")
|
|
313
|
+
return mem_ok
|
|
314
|
+
|
|
315
|
+
mock_pynvml.nvmlDeviceGetMemoryInfo = MagicMock(side_effect=mem_info_side_effect)
|
|
316
|
+
util = MagicMock()
|
|
317
|
+
util.gpu = 80
|
|
318
|
+
mock_pynvml.nvmlDeviceGetUtilizationRates = MagicMock(return_value=util)
|
|
319
|
+
mock_pynvml.nvmlDeviceGetPowerUsage = MagicMock(return_value=100_000)
|
|
320
|
+
|
|
321
|
+
# Simulate the per-GPU sampling loop
|
|
322
|
+
handles = [handles_map[0], handles_map[1]]
|
|
323
|
+
per_gpu_peaks = {}
|
|
324
|
+
vram_vals = []
|
|
325
|
+
|
|
326
|
+
for h in handles:
|
|
327
|
+
try:
|
|
328
|
+
mi = mock_pynvml.nvmlDeviceGetMemoryInfo(h)
|
|
329
|
+
vram_vals.append(mi.used / (1024 * 1024))
|
|
330
|
+
except Exception:
|
|
331
|
+
pass
|
|
332
|
+
|
|
333
|
+
for gi, vm in enumerate(vram_vals):
|
|
334
|
+
per_gpu_peaks[gi] = max(per_gpu_peaks.get(gi, 0.0), vm)
|
|
335
|
+
|
|
336
|
+
# GPU 0 tracked, GPU 1 skipped
|
|
337
|
+
assert 0 in per_gpu_peaks
|
|
338
|
+
assert per_gpu_peaks[0] > 0
|
|
339
|
+
assert len(vram_vals) == 1
|
|
340
|
+
|
|
341
|
+
|
|
342
|
+
def test_stability_delayed_until_discovery_done():
|
|
343
|
+
"""Stability check requires discovery_done=True."""
|
|
344
|
+
# Single GPU: expected=1, num_gpus=1 → done immediately
|
|
345
|
+
assert 1 >= 1 # num_gpus >= expected
|
|
346
|
+
|
|
347
|
+
# Multi GPU with early-init: expected=2, num_gpus=2 → done at sample 5
|
|
348
|
+
assert 2 >= 2
|
|
349
|
+
|
|
350
|
+
# Multi GPU, discovery incomplete: expected=4, found=2 → NOT done
|
|
351
|
+
assert not (2 >= 4)
|
|
352
|
+
|
|
353
|
+
|
|
354
|
+
def test_per_gpu_peaks_to_result_list():
|
|
355
|
+
"""per_gpu_peaks dict should convert to sorted list for ProbeResult."""
|
|
356
|
+
peaks = {0: 8000.5, 1: 12000.3}
|
|
357
|
+
result = [round(peaks[i], 1) for i in sorted(peaks)] if peaks else None
|
|
358
|
+
assert result == [8000.5, 12000.3]
|
|
359
|
+
|
|
360
|
+
empty = {}
|
|
361
|
+
result_empty = [round(empty[i], 1) for i in sorted(empty)] if empty else None
|
|
362
|
+
assert result_empty is None
|
|
363
|
+
|
|
364
|
+
|
|
365
|
+
def test_fallback_process_map_from_early_init():
|
|
366
|
+
"""When discovery exhausts all attempts without confirming GPUs,
|
|
367
|
+
process_map should be generated from early-init indices."""
|
|
368
|
+
early_init_indices = [0, 1]
|
|
369
|
+
process_map_ref = [None]
|
|
370
|
+
num_gpus_ref = [1]
|
|
371
|
+
discovery_attempts = 3
|
|
372
|
+
max_discovery_attempts = 3
|
|
373
|
+
expected_gpus = 2
|
|
374
|
+
|
|
375
|
+
# Simulate: discovery exhausted, never found >1 via PID matching
|
|
376
|
+
if num_gpus_ref[0] >= expected_gpus or discovery_attempts >= max_discovery_attempts:
|
|
377
|
+
discovery_done = True
|
|
378
|
+
if process_map_ref[0] is None and early_init_indices is not None:
|
|
379
|
+
process_map_ref[0] = [{"gpu_index": idx} for idx in early_init_indices]
|
|
380
|
+
num_gpus_ref[0] = len(early_init_indices)
|
|
381
|
+
|
|
382
|
+
assert process_map_ref[0] == [{"gpu_index": 0}, {"gpu_index": 1}]
|
|
383
|
+
assert num_gpus_ref[0] == 2
|
|
384
|
+
|
|
385
|
+
|
|
386
|
+
def test_no_fallback_process_map_when_discovery_succeeded():
|
|
387
|
+
"""When discovery already set process_map, fallback should not overwrite."""
|
|
388
|
+
early_init_indices = [0, 1]
|
|
389
|
+
process_map_ref = [[{"gpu_index": 2}, {"gpu_index": 3}]] # discovery found GPUs 2,3
|
|
390
|
+
num_gpus_ref = [2]
|
|
391
|
+
|
|
392
|
+
# Simulate fallback condition
|
|
393
|
+
if process_map_ref[0] is None and early_init_indices is not None:
|
|
394
|
+
process_map_ref[0] = [{"gpu_index": idx} for idx in early_init_indices]
|
|
395
|
+
|
|
396
|
+
# Should keep discovery's result, not overwrite
|
|
397
|
+
assert process_map_ref[0] == [{"gpu_index": 2}, {"gpu_index": 3}]
|
|
398
|
+
|
|
399
|
+
|
|
400
|
+
def test_early_init_does_not_set_num_gpus_ref():
|
|
401
|
+
"""Early-init must NOT set num_gpus_ref — that would satisfy discovery_done
|
|
402
|
+
prematurely and prevent retries at samples 15/30."""
|
|
403
|
+
# Simulate the early-init code path
|
|
404
|
+
num_gpus_ref = [1]
|
|
405
|
+
expected_gpus = 2
|
|
406
|
+
device_count = 2 # enough devices
|
|
407
|
+
|
|
408
|
+
# Early-init opens handles but does NOT touch num_gpus_ref
|
|
409
|
+
early_init_indices = list(range(expected_gpus))
|
|
410
|
+
|
|
411
|
+
# num_gpus_ref should still be 1
|
|
412
|
+
assert num_gpus_ref[0] == 1
|
|
413
|
+
# So discovery_done check fails: 1 < 2
|
|
414
|
+
assert not (num_gpus_ref[0] >= expected_gpus)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|