alloc 0.0.14__tar.gz → 0.0.16__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59)
  1. {alloc-0.0.14 → alloc-0.0.16}/PKG-INFO +1 -1
  2. {alloc-0.0.14 → alloc-0.0.16}/pyproject.toml +1 -1
  3. {alloc-0.0.14 → alloc-0.0.16}/src/alloc/__init__.py +1 -1
  4. {alloc-0.0.14 → alloc-0.0.16}/src/alloc/cli.py +7 -5
  5. {alloc-0.0.14 → alloc-0.0.16}/src/alloc/probe.py +59 -16
  6. {alloc-0.0.14 → alloc-0.0.16}/src/alloc.egg-info/PKG-INFO +1 -1
  7. {alloc-0.0.14 → alloc-0.0.16}/tests/test_probe_multi.py +167 -0
  8. {alloc-0.0.14 → alloc-0.0.16}/README.md +0 -0
  9. {alloc-0.0.14 → alloc-0.0.16}/setup.cfg +0 -0
  10. {alloc-0.0.14 → alloc-0.0.16}/src/alloc/artifact_loader.py +0 -0
  11. {alloc-0.0.14 → alloc-0.0.16}/src/alloc/artifact_writer.py +0 -0
  12. {alloc-0.0.14 → alloc-0.0.16}/src/alloc/browser_auth.py +0 -0
  13. {alloc-0.0.14 → alloc-0.0.16}/src/alloc/callbacks.py +0 -0
  14. {alloc-0.0.14 → alloc-0.0.16}/src/alloc/catalog/__init__.py +0 -0
  15. {alloc-0.0.14 → alloc-0.0.16}/src/alloc/catalog/default_rate_card.json +0 -0
  16. {alloc-0.0.14 → alloc-0.0.16}/src/alloc/catalog/gpus.v1.json +0 -0
  17. {alloc-0.0.14 → alloc-0.0.16}/src/alloc/code_analyzer.py +0 -0
  18. {alloc-0.0.14 → alloc-0.0.16}/src/alloc/config.py +0 -0
  19. {alloc-0.0.14 → alloc-0.0.16}/src/alloc/context.py +0 -0
  20. {alloc-0.0.14 → alloc-0.0.16}/src/alloc/diagnosis_display.py +0 -0
  21. {alloc-0.0.14 → alloc-0.0.16}/src/alloc/diagnosis_engine.py +0 -0
  22. {alloc-0.0.14 → alloc-0.0.16}/src/alloc/diagnosis_rules.py +0 -0
  23. {alloc-0.0.14 → alloc-0.0.16}/src/alloc/display.py +0 -0
  24. {alloc-0.0.14 → alloc-0.0.16}/src/alloc/extractor_runner.py +0 -0
  25. {alloc-0.0.14 → alloc-0.0.16}/src/alloc/ghost.py +0 -0
  26. {alloc-0.0.14 → alloc-0.0.16}/src/alloc/model_extractor.py +0 -0
  27. {alloc-0.0.14 → alloc-0.0.16}/src/alloc/model_registry.py +0 -0
  28. {alloc-0.0.14 → alloc-0.0.16}/src/alloc/stability.py +0 -0
  29. {alloc-0.0.14 → alloc-0.0.16}/src/alloc/upload.py +0 -0
  30. {alloc-0.0.14 → alloc-0.0.16}/src/alloc/yaml_config.py +0 -0
  31. {alloc-0.0.14 → alloc-0.0.16}/src/alloc.egg-info/SOURCES.txt +0 -0
  32. {alloc-0.0.14 → alloc-0.0.16}/src/alloc.egg-info/dependency_links.txt +0 -0
  33. {alloc-0.0.14 → alloc-0.0.16}/src/alloc.egg-info/entry_points.txt +0 -0
  34. {alloc-0.0.14 → alloc-0.0.16}/src/alloc.egg-info/requires.txt +0 -0
  35. {alloc-0.0.14 → alloc-0.0.16}/src/alloc.egg-info/top_level.txt +0 -0
  36. {alloc-0.0.14 → alloc-0.0.16}/tests/test_artifact.py +0 -0
  37. {alloc-0.0.14 → alloc-0.0.16}/tests/test_artifact_loader.py +0 -0
  38. {alloc-0.0.14 → alloc-0.0.16}/tests/test_auth.py +0 -0
  39. {alloc-0.0.14 → alloc-0.0.16}/tests/test_callbacks.py +0 -0
  40. {alloc-0.0.14 → alloc-0.0.16}/tests/test_catalog.py +0 -0
  41. {alloc-0.0.14 → alloc-0.0.16}/tests/test_cli.py +0 -0
  42. {alloc-0.0.14 → alloc-0.0.16}/tests/test_code_analyzer.py +0 -0
  43. {alloc-0.0.14 → alloc-0.0.16}/tests/test_context.py +0 -0
  44. {alloc-0.0.14 → alloc-0.0.16}/tests/test_diagnose_cli.py +0 -0
  45. {alloc-0.0.14 → alloc-0.0.16}/tests/test_diagnosis_engine.py +0 -0
  46. {alloc-0.0.14 → alloc-0.0.16}/tests/test_diagnosis_rules.py +0 -0
  47. {alloc-0.0.14 → alloc-0.0.16}/tests/test_extractor_activation.py +0 -0
  48. {alloc-0.0.14 → alloc-0.0.16}/tests/test_ghost.py +0 -0
  49. {alloc-0.0.14 → alloc-0.0.16}/tests/test_ghost_degradation.py +0 -0
  50. {alloc-0.0.14 → alloc-0.0.16}/tests/test_init_from_org.py +0 -0
  51. {alloc-0.0.14 → alloc-0.0.16}/tests/test_interconnect.py +0 -0
  52. {alloc-0.0.14 → alloc-0.0.16}/tests/test_model_extractor.py +0 -0
  53. {alloc-0.0.14 → alloc-0.0.16}/tests/test_probe_hw.py +0 -0
  54. {alloc-0.0.14 → alloc-0.0.16}/tests/test_scan_auth.py +0 -0
  55. {alloc-0.0.14 → alloc-0.0.16}/tests/test_stability.py +0 -0
  56. {alloc-0.0.14 → alloc-0.0.16}/tests/test_topology_strategy.py +0 -0
  57. {alloc-0.0.14 → alloc-0.0.16}/tests/test_upload.py +0 -0
  58. {alloc-0.0.14 → alloc-0.0.16}/tests/test_verdict.py +0 -0
  59. {alloc-0.0.14 → alloc-0.0.16}/tests/test_yaml_config.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: alloc
3
- Version: 0.0.14
3
+ Version: 0.0.16
4
4
  Summary: Engineer-first training calibration: estimate VRAM fit, profile short runs, and pick GPU configs under real budget constraints.
5
5
  Author-email: Alloc Labs <hello@alloclabs.com>
6
6
  License-Expression: Apache-2.0
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "alloc"
7
- version = "0.0.14"
7
+ version = "0.0.16"
8
8
  description = "Engineer-first training calibration: estimate VRAM fit, profile short runs, and pick GPU configs under real budget constraints."
9
9
  readme = "README.md"
10
10
  license = "Apache-2.0"
@@ -9,7 +9,7 @@ _warnings.filterwarnings("ignore", category=FutureWarning, module=r"torch\.cuda"
9
9
  _warnings.filterwarnings("ignore", category=DeprecationWarning, module=r"torch\.cuda")
10
10
  del _warnings
11
11
 
12
- __version__ = "0.0.14"
12
+ __version__ = "0.0.16"
13
13
 
14
14
  from alloc.ghost import ghost, GhostReport
15
15
  from alloc.callbacks import AllocCallback as HuggingFaceCallback
@@ -2184,11 +2184,13 @@ def scan(
2184
2184
  resp = client.post(f"{api_url}/scans", json=payload, headers=headers)
2185
2185
  else:
2186
2186
  # Token refresh failed — fall back to unauthenticated scan
2187
- if not json_output:
2188
- console.print(
2189
- "[yellow]Session expired — falling back to public scan "
2190
- "(org fleet context unavailable). Run `alloc login` to restore.[/yellow]",
2191
- )
2187
+ # Always print to stderr (not stdout) so JSON output is clean
2188
+ import sys as _sys
2189
+ print(
2190
+ "Session expired falling back to public scan "
2191
+ "(org fleet context unavailable). Run `alloc login` to restore.",
2192
+ file=_sys.stderr,
2193
+ )
2192
2194
  del headers["Authorization"]
2193
2195
  resp = client.post(f"{api_url}/scans/cli", json=payload, headers=headers)
2194
2196
 
@@ -469,6 +469,8 @@ def probe_command(
469
469
  pass
470
470
 
471
471
  handles = [handle]
472
+ # Map from handle index → physical GPU index (for per_gpu_peaks keying)
473
+ handle_gpu_indices = [gpu_index]
472
474
  discovery_done = False
473
475
  discovery_attempts = 0
474
476
  max_discovery_attempts = 3 # Retry at samples 5, 15, 30
@@ -487,6 +489,35 @@ def probe_command(
487
489
  except ValueError:
488
490
  pass
489
491
 
492
+ # Early-initialize handles for all expected GPUs so per_gpu_peaks
493
+ # is populated from sample 0 — don't depend on process-tree
494
+ # discovery timing. Discovery still runs for process_map and to
495
+ # confirm which specific GPUs are in use via PID matching.
496
+ # NOTE: Do NOT set num_gpus_ref here — that would satisfy
497
+ # discovery_done prematurely and prevent retries at samples 15/30.
498
+ early_init_indices = None # type: Optional[list]
499
+ if expected_gpus > 1:
500
+ try:
501
+ device_count = pynvml.nvmlDeviceGetCount()
502
+ if device_count >= expected_gpus:
503
+ early_handles = []
504
+ early_indices = []
505
+ for idx in range(device_count):
506
+ if len(early_handles) >= expected_gpus:
507
+ break
508
+ try:
509
+ h = pynvml.nvmlDeviceGetHandleByIndex(idx)
510
+ early_handles.append(h)
511
+ early_indices.append(idx)
512
+ except Exception:
513
+ pass
514
+ if len(early_handles) >= expected_gpus:
515
+ handles = early_handles
516
+ handle_gpu_indices = early_indices
517
+ early_init_indices = early_indices
518
+ except Exception:
519
+ pass
520
+
490
521
  while not stop_event.is_set():
491
522
  # Retry GPU discovery: at samples 5, 15, 30
492
523
  # Keep retrying if we haven't found all expected GPUs yet
@@ -503,10 +534,12 @@ def probe_command(
503
534
  )
504
535
  if len(discovered) > 1:
505
536
  handles = []
537
+ handle_gpu_indices = []
506
538
  pmap = []
507
539
  for idx in discovered:
508
540
  h = pynvml.nvmlDeviceGetHandleByIndex(idx)
509
541
  handles.append(h)
542
+ handle_gpu_indices.append(idx)
510
543
  pmap.append({"gpu_index": idx})
511
544
  num_gpus_ref[0] = len(handles)
512
545
  process_map_ref[0] = pmap
@@ -527,14 +560,21 @@ def probe_command(
527
560
  # Stop retrying if we found expected count or exhausted attempts
528
561
  if num_gpus_ref[0] >= expected_gpus or discovery_attempts >= max_discovery_attempts:
529
562
  discovery_done = True
563
+ # If discovery never confirmed multi-GPU via PID matching
564
+ # but early-init opened handles for expected GPUs, generate
565
+ # a fallback process_map from the early-init indices.
566
+ if process_map_ref[0] is None and early_init_indices is not None:
567
+ process_map_ref[0] = [{"gpu_index": idx} for idx in early_init_indices]
568
+ num_gpus_ref[0] = len(early_init_indices)
530
569
 
531
570
  # Sample from all monitored GPUs — aggregate: peak vram = max, util/power = mean
532
- try:
533
- vram_vals = []
534
- util_vals = []
535
- power_vals = []
536
- total_mb = 0.0
537
- for h in handles:
571
+ # Per-GPU try/except: one bad handle must not prevent tracking others
572
+ vram_vals = []
573
+ util_vals = []
574
+ power_vals = []
575
+ total_mb = 0.0
576
+ for h in handles:
577
+ try:
538
578
  mi = pynvml.nvmlDeviceGetMemoryInfo(h)
539
579
  ut = pynvml.nvmlDeviceGetUtilizationRates(h)
540
580
  pw = pynvml.nvmlDeviceGetPowerUsage(h) / 1000.0
@@ -542,25 +582,28 @@ def probe_command(
542
582
  util_vals.append(ut.gpu)
543
583
  power_vals.append(pw)
544
584
  total_mb = mi.total / (1024 * 1024)
585
+ except Exception:
586
+ pass
545
587
 
546
- # Track per-GPU peak VRAM (always, even single GPU —
547
- # discovery may expand handles later, and we need history from sample 0)
548
- pgp = per_gpu_peaks_ref[0]
549
- for gi, vm in enumerate(vram_vals):
550
- pgp[gi] = max(pgp.get(gi, 0.0), vm)
588
+ # Track per-GPU peak VRAM (always, even single GPU —
589
+ # discovery may expand handles later, and we need history from sample 0)
590
+ pgp = per_gpu_peaks_ref[0]
591
+ for gi, vm in enumerate(vram_vals):
592
+ pgp[gi] = max(pgp.get(gi, 0.0), vm)
551
593
 
594
+ if vram_vals:
552
595
  samples.append(ProbeSample(
553
596
  timestamp=time.time(),
554
597
  memory_used_mb=max(vram_vals),
555
598
  memory_total_mb=total_mb,
556
- gpu_util_pct=sum(util_vals) / len(util_vals),
557
- power_watts=sum(power_vals) / len(power_vals),
599
+ gpu_util_pct=sum(util_vals) / len(util_vals) if util_vals else 0.0,
600
+ power_watts=sum(power_vals) / len(power_vals) if power_vals else 0.0,
558
601
  ))
559
- except Exception:
560
- pass
561
602
 
562
603
  # Calibrate mode: auto-stop when stable
563
- if calibrate and len(samples) > ramp_up_samples:
604
+ # Delay stability check until GPU discovery is complete —
605
+ # prevents calibrate-and-exit before finding all expected GPUs.
606
+ if calibrate and discovery_done and len(samples) > ramp_up_samples:
564
607
  from alloc.stability import check_stability, RAMP_UP_SAMPLES
565
608
  sr = check_stability(samples, poll_interval_ms=poll_interval_ms)
566
609
  if sr.is_stable:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: alloc
3
- Version: 0.0.14
3
+ Version: 0.0.16
4
4
  Summary: Engineer-first training calibration: estimate VRAM fit, profile short runs, and pick GPU configs under real budget constraints.
5
5
  Author-email: Alloc Labs <hello@alloclabs.com>
6
6
  License-Expression: Apache-2.0
@@ -245,3 +245,170 @@ def test_active_gpu_fallback_not_used_without_expected():
245
245
  with patch("alloc.probe._read_child_env", return_value=None):
246
246
  result = _discover_gpu_indices(1000, mock, fallback_index=0)
247
247
  assert result == [0] # Falls back to default
248
+
249
+
250
+ # ── Early handle initialization for expected GPUs ──
251
+
252
+
253
+ def test_early_init_opens_handles_for_expected_gpus():
254
+ """When expected_gpus > 1 and device_count >= expected, early-init should
255
+ open handles for all expected GPUs."""
256
+ mock_pynvml = MagicMock()
257
+ mock_pynvml.nvmlDeviceGetCount = MagicMock(return_value=2)
258
+
259
+ handles_map = {0: MagicMock(name="gpu0"), 1: MagicMock(name="gpu1")}
260
+ mock_pynvml.nvmlDeviceGetHandleByIndex = MagicMock(side_effect=lambda i: handles_map[i])
261
+
262
+ # Simulate early-init logic from probe_command._monitor()
263
+ expected_gpus = 2
264
+ handles = [handles_map[0]]
265
+
266
+ if expected_gpus > 1:
267
+ device_count = mock_pynvml.nvmlDeviceGetCount()
268
+ if device_count >= expected_gpus:
269
+ early_handles = []
270
+ early_indices = []
271
+ for idx in range(device_count):
272
+ if len(early_handles) >= expected_gpus:
273
+ break
274
+ h = mock_pynvml.nvmlDeviceGetHandleByIndex(idx)
275
+ early_handles.append(h)
276
+ early_indices.append(idx)
277
+ if len(early_handles) >= expected_gpus:
278
+ handles = early_handles
279
+
280
+ assert len(handles) == 2
281
+
282
+
283
+ def test_early_init_skipped_when_fewer_devices():
284
+ """When device_count < expected_gpus, early-init should not change handles."""
285
+ mock_pynvml = MagicMock()
286
+ mock_pynvml.nvmlDeviceGetCount = MagicMock(return_value=1)
287
+
288
+ expected_gpus = 2
289
+ handles = [MagicMock(name="gpu0")]
290
+ original_handles = list(handles)
291
+
292
+ if expected_gpus > 1:
293
+ device_count = mock_pynvml.nvmlDeviceGetCount()
294
+ if device_count >= expected_gpus:
295
+ assert False, "Should not reach here"
296
+
297
+ assert len(handles) == 1
298
+
299
+
300
+ def test_per_gpu_sampling_resilient_to_partial_failure():
301
+ """Per-GPU try/except: one GPU failure should not prevent others from
302
+ being sampled into per_gpu_peaks."""
303
+ mock_pynvml = MagicMock()
304
+ handles_map = {0: MagicMock(name="gpu0"), 1: MagicMock(name="gpu1")}
305
+
306
+ mem_ok = MagicMock()
307
+ mem_ok.total = 24 * 1024 * 1024 * 1024
308
+ mem_ok.used = 8000 * 1024 * 1024
309
+
310
+ def mem_info_side_effect(h):
311
+ if h == handles_map[1]:
312
+ raise RuntimeError("GPU 1 memory read failed")
313
+ return mem_ok
314
+
315
+ mock_pynvml.nvmlDeviceGetMemoryInfo = MagicMock(side_effect=mem_info_side_effect)
316
+ util = MagicMock()
317
+ util.gpu = 80
318
+ mock_pynvml.nvmlDeviceGetUtilizationRates = MagicMock(return_value=util)
319
+ mock_pynvml.nvmlDeviceGetPowerUsage = MagicMock(return_value=100_000)
320
+
321
+ # Simulate the per-GPU sampling loop
322
+ handles = [handles_map[0], handles_map[1]]
323
+ per_gpu_peaks = {}
324
+ vram_vals = []
325
+
326
+ for h in handles:
327
+ try:
328
+ mi = mock_pynvml.nvmlDeviceGetMemoryInfo(h)
329
+ vram_vals.append(mi.used / (1024 * 1024))
330
+ except Exception:
331
+ pass
332
+
333
+ for gi, vm in enumerate(vram_vals):
334
+ per_gpu_peaks[gi] = max(per_gpu_peaks.get(gi, 0.0), vm)
335
+
336
+ # GPU 0 tracked, GPU 1 skipped
337
+ assert 0 in per_gpu_peaks
338
+ assert per_gpu_peaks[0] > 0
339
+ assert len(vram_vals) == 1
340
+
341
+
342
+ def test_stability_delayed_until_discovery_done():
343
+ """Stability check requires discovery_done=True."""
344
+ # Single GPU: expected=1, num_gpus=1 → done immediately
345
+ assert 1 >= 1 # num_gpus >= expected
346
+
347
+ # Multi GPU with early-init: expected=2, num_gpus=2 → done at sample 5
348
+ assert 2 >= 2
349
+
350
+ # Multi GPU, discovery incomplete: expected=4, found=2 → NOT done
351
+ assert not (2 >= 4)
352
+
353
+
354
+ def test_per_gpu_peaks_to_result_list():
355
+ """per_gpu_peaks dict should convert to sorted list for ProbeResult."""
356
+ peaks = {0: 8000.5, 1: 12000.3}
357
+ result = [round(peaks[i], 1) for i in sorted(peaks)] if peaks else None
358
+ assert result == [8000.5, 12000.3]
359
+
360
+ empty = {}
361
+ result_empty = [round(empty[i], 1) for i in sorted(empty)] if empty else None
362
+ assert result_empty is None
363
+
364
+
365
+ def test_fallback_process_map_from_early_init():
366
+ """When discovery exhausts all attempts without confirming GPUs,
367
+ process_map should be generated from early-init indices."""
368
+ early_init_indices = [0, 1]
369
+ process_map_ref = [None]
370
+ num_gpus_ref = [1]
371
+ discovery_attempts = 3
372
+ max_discovery_attempts = 3
373
+ expected_gpus = 2
374
+
375
+ # Simulate: discovery exhausted, never found >1 via PID matching
376
+ if num_gpus_ref[0] >= expected_gpus or discovery_attempts >= max_discovery_attempts:
377
+ discovery_done = True
378
+ if process_map_ref[0] is None and early_init_indices is not None:
379
+ process_map_ref[0] = [{"gpu_index": idx} for idx in early_init_indices]
380
+ num_gpus_ref[0] = len(early_init_indices)
381
+
382
+ assert process_map_ref[0] == [{"gpu_index": 0}, {"gpu_index": 1}]
383
+ assert num_gpus_ref[0] == 2
384
+
385
+
386
+ def test_no_fallback_process_map_when_discovery_succeeded():
387
+ """When discovery already set process_map, fallback should not overwrite."""
388
+ early_init_indices = [0, 1]
389
+ process_map_ref = [[{"gpu_index": 2}, {"gpu_index": 3}]] # discovery found GPUs 2,3
390
+ num_gpus_ref = [2]
391
+
392
+ # Simulate fallback condition
393
+ if process_map_ref[0] is None and early_init_indices is not None:
394
+ process_map_ref[0] = [{"gpu_index": idx} for idx in early_init_indices]
395
+
396
+ # Should keep discovery's result, not overwrite
397
+ assert process_map_ref[0] == [{"gpu_index": 2}, {"gpu_index": 3}]
398
+
399
+
400
+ def test_early_init_does_not_set_num_gpus_ref():
401
+ """Early-init must NOT set num_gpus_ref — that would satisfy discovery_done
402
+ prematurely and prevent retries at samples 15/30."""
403
+ # Simulate the early-init code path
404
+ num_gpus_ref = [1]
405
+ expected_gpus = 2
406
+ device_count = 2 # enough devices
407
+
408
+ # Early-init opens handles but does NOT touch num_gpus_ref
409
+ early_init_indices = list(range(expected_gpus))
410
+
411
+ # num_gpus_ref should still be 1
412
+ assert num_gpus_ref[0] == 1
413
+ # So discovery_done check fails: 1 < 2
414
+ assert not (num_gpus_ref[0] >= expected_gpus)
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes