alloc 0.0.13__tar.gz → 0.0.15__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59)
  1. {alloc-0.0.13 → alloc-0.0.15}/PKG-INFO +1 -1
  2. {alloc-0.0.13 → alloc-0.0.15}/pyproject.toml +1 -1
  3. {alloc-0.0.13 → alloc-0.0.15}/src/alloc/__init__.py +1 -1
  4. {alloc-0.0.13 → alloc-0.0.15}/src/alloc/cli.py +18 -6
  5. {alloc-0.0.13 → alloc-0.0.15}/src/alloc/probe.py +52 -18
  6. {alloc-0.0.13 → alloc-0.0.15}/src/alloc.egg-info/PKG-INFO +1 -1
  7. {alloc-0.0.13 → alloc-0.0.15}/tests/test_probe_multi.py +115 -0
  8. {alloc-0.0.13 → alloc-0.0.15}/README.md +0 -0
  9. {alloc-0.0.13 → alloc-0.0.15}/setup.cfg +0 -0
  10. {alloc-0.0.13 → alloc-0.0.15}/src/alloc/artifact_loader.py +0 -0
  11. {alloc-0.0.13 → alloc-0.0.15}/src/alloc/artifact_writer.py +0 -0
  12. {alloc-0.0.13 → alloc-0.0.15}/src/alloc/browser_auth.py +0 -0
  13. {alloc-0.0.13 → alloc-0.0.15}/src/alloc/callbacks.py +0 -0
  14. {alloc-0.0.13 → alloc-0.0.15}/src/alloc/catalog/__init__.py +0 -0
  15. {alloc-0.0.13 → alloc-0.0.15}/src/alloc/catalog/default_rate_card.json +0 -0
  16. {alloc-0.0.13 → alloc-0.0.15}/src/alloc/catalog/gpus.v1.json +0 -0
  17. {alloc-0.0.13 → alloc-0.0.15}/src/alloc/code_analyzer.py +0 -0
  18. {alloc-0.0.13 → alloc-0.0.15}/src/alloc/config.py +0 -0
  19. {alloc-0.0.13 → alloc-0.0.15}/src/alloc/context.py +0 -0
  20. {alloc-0.0.13 → alloc-0.0.15}/src/alloc/diagnosis_display.py +0 -0
  21. {alloc-0.0.13 → alloc-0.0.15}/src/alloc/diagnosis_engine.py +0 -0
  22. {alloc-0.0.13 → alloc-0.0.15}/src/alloc/diagnosis_rules.py +0 -0
  23. {alloc-0.0.13 → alloc-0.0.15}/src/alloc/display.py +0 -0
  24. {alloc-0.0.13 → alloc-0.0.15}/src/alloc/extractor_runner.py +0 -0
  25. {alloc-0.0.13 → alloc-0.0.15}/src/alloc/ghost.py +0 -0
  26. {alloc-0.0.13 → alloc-0.0.15}/src/alloc/model_extractor.py +0 -0
  27. {alloc-0.0.13 → alloc-0.0.15}/src/alloc/model_registry.py +0 -0
  28. {alloc-0.0.13 → alloc-0.0.15}/src/alloc/stability.py +0 -0
  29. {alloc-0.0.13 → alloc-0.0.15}/src/alloc/upload.py +0 -0
  30. {alloc-0.0.13 → alloc-0.0.15}/src/alloc/yaml_config.py +0 -0
  31. {alloc-0.0.13 → alloc-0.0.15}/src/alloc.egg-info/SOURCES.txt +0 -0
  32. {alloc-0.0.13 → alloc-0.0.15}/src/alloc.egg-info/dependency_links.txt +0 -0
  33. {alloc-0.0.13 → alloc-0.0.15}/src/alloc.egg-info/entry_points.txt +0 -0
  34. {alloc-0.0.13 → alloc-0.0.15}/src/alloc.egg-info/requires.txt +0 -0
  35. {alloc-0.0.13 → alloc-0.0.15}/src/alloc.egg-info/top_level.txt +0 -0
  36. {alloc-0.0.13 → alloc-0.0.15}/tests/test_artifact.py +0 -0
  37. {alloc-0.0.13 → alloc-0.0.15}/tests/test_artifact_loader.py +0 -0
  38. {alloc-0.0.13 → alloc-0.0.15}/tests/test_auth.py +0 -0
  39. {alloc-0.0.13 → alloc-0.0.15}/tests/test_callbacks.py +0 -0
  40. {alloc-0.0.13 → alloc-0.0.15}/tests/test_catalog.py +0 -0
  41. {alloc-0.0.13 → alloc-0.0.15}/tests/test_cli.py +0 -0
  42. {alloc-0.0.13 → alloc-0.0.15}/tests/test_code_analyzer.py +0 -0
  43. {alloc-0.0.13 → alloc-0.0.15}/tests/test_context.py +0 -0
  44. {alloc-0.0.13 → alloc-0.0.15}/tests/test_diagnose_cli.py +0 -0
  45. {alloc-0.0.13 → alloc-0.0.15}/tests/test_diagnosis_engine.py +0 -0
  46. {alloc-0.0.13 → alloc-0.0.15}/tests/test_diagnosis_rules.py +0 -0
  47. {alloc-0.0.13 → alloc-0.0.15}/tests/test_extractor_activation.py +0 -0
  48. {alloc-0.0.13 → alloc-0.0.15}/tests/test_ghost.py +0 -0
  49. {alloc-0.0.13 → alloc-0.0.15}/tests/test_ghost_degradation.py +0 -0
  50. {alloc-0.0.13 → alloc-0.0.15}/tests/test_init_from_org.py +0 -0
  51. {alloc-0.0.13 → alloc-0.0.15}/tests/test_interconnect.py +0 -0
  52. {alloc-0.0.13 → alloc-0.0.15}/tests/test_model_extractor.py +0 -0
  53. {alloc-0.0.13 → alloc-0.0.15}/tests/test_probe_hw.py +0 -0
  54. {alloc-0.0.13 → alloc-0.0.15}/tests/test_scan_auth.py +0 -0
  55. {alloc-0.0.13 → alloc-0.0.15}/tests/test_stability.py +0 -0
  56. {alloc-0.0.13 → alloc-0.0.15}/tests/test_topology_strategy.py +0 -0
  57. {alloc-0.0.13 → alloc-0.0.15}/tests/test_upload.py +0 -0
  58. {alloc-0.0.13 → alloc-0.0.15}/tests/test_verdict.py +0 -0
  59. {alloc-0.0.13 → alloc-0.0.15}/tests/test_yaml_config.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: alloc
3
- Version: 0.0.13
3
+ Version: 0.0.15
4
4
  Summary: Engineer-first training calibration: estimate VRAM fit, profile short runs, and pick GPU configs under real budget constraints.
5
5
  Author-email: Alloc Labs <hello@alloclabs.com>
6
6
  License-Expression: Apache-2.0
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "alloc"
7
- version = "0.0.13"
7
+ version = "0.0.15"
8
8
  description = "Engineer-first training calibration: estimate VRAM fit, profile short runs, and pick GPU configs under real budget constraints."
9
9
  readme = "README.md"
10
10
  license = "Apache-2.0"
@@ -9,7 +9,7 @@ _warnings.filterwarnings("ignore", category=FutureWarning, module=r"torch\.cuda"
9
9
  _warnings.filterwarnings("ignore", category=DeprecationWarning, module=r"torch\.cuda")
10
10
  del _warnings
11
11
 
12
- __version__ = "0.0.13"
12
+ __version__ = "0.0.15"
13
13
 
14
14
  from alloc.ghost import ghost, GhostReport
15
15
  from alloc.callbacks import AllocCallback as HuggingFaceCallback
@@ -564,8 +564,11 @@ def run(
564
564
  else:
565
565
  console.print("[dim]Tip: alloc login --browser to connect your dashboard[/dim]")
566
566
 
567
+ # Propagate non-zero exit code — but NOT when calibrate mode
568
+ # intentionally killed the process (torchrun exits non-zero on SIGTERM)
567
569
  if result.exit_code and result.exit_code != 0:
568
- raise typer.Exit(result.exit_code)
570
+ if result.stop_reason not in ("stable", "timeout"):
571
+ raise typer.Exit(result.exit_code)
569
572
 
570
573
 
571
574
  @app.command()
@@ -2115,6 +2118,13 @@ def scan(
2115
2118
  """Remote ghost scan via Alloc API — no GPU needed."""
2116
2119
  import httpx
2117
2120
 
2121
+ # When --json, redirect console to stderr so nothing contaminates stdout.
2122
+ from rich.console import Console as _Console
2123
+ if json_output:
2124
+ console = _Console(stderr=True)
2125
+ else:
2126
+ console = _Console()
2127
+
2118
2128
  # Resolve param count from model name or explicit flag
2119
2129
  resolved_param_count = param_count_b or _model_to_params(model)
2120
2130
  if resolved_param_count is None:
@@ -2174,11 +2184,13 @@ def scan(
2174
2184
  resp = client.post(f"{api_url}/scans", json=payload, headers=headers)
2175
2185
  else:
2176
2186
  # Token refresh failed — fall back to unauthenticated scan
2177
- if not json_output:
2178
- console.print(
2179
- "[yellow]Session expired — falling back to public scan "
2180
- "(org fleet context unavailable). Run `alloc login` to restore.[/yellow]",
2181
- )
2187
+ # Always print to stderr (not stdout) so JSON output is clean
2188
+ import sys as _sys
2189
+ print(
2190
+ "Session expired falling back to public scan "
2191
+ "(org fleet context unavailable). Run `alloc login` to restore.",
2192
+ file=_sys.stderr,
2193
+ )
2182
2194
  del headers["Authorization"]
2183
2195
  resp = client.post(f"{api_url}/scans/cli", json=payload, headers=headers)
2184
2196
 
@@ -469,6 +469,8 @@ def probe_command(
469
469
  pass
470
470
 
471
471
  handles = [handle]
472
+ # Map from handle index → physical GPU index (for per_gpu_peaks keying)
473
+ handle_gpu_indices = [gpu_index]
472
474
  discovery_done = False
473
475
  discovery_attempts = 0
474
476
  max_discovery_attempts = 3 # Retry at samples 5, 15, 30
@@ -487,6 +489,32 @@ def probe_command(
487
489
  except ValueError:
488
490
  pass
489
491
 
492
+ # Early-initialize handles for all expected GPUs so per_gpu_peaks
493
+ # is populated from sample 0 — don't depend on process-tree
494
+ # discovery timing. Discovery still runs for process_map and to
495
+ # confirm which specific GPUs are in use.
496
+ if expected_gpus > 1:
497
+ try:
498
+ device_count = pynvml.nvmlDeviceGetCount()
499
+ if device_count >= expected_gpus:
500
+ early_handles = []
501
+ early_indices = []
502
+ for idx in range(device_count):
503
+ if len(early_handles) >= expected_gpus:
504
+ break
505
+ try:
506
+ h = pynvml.nvmlDeviceGetHandleByIndex(idx)
507
+ early_handles.append(h)
508
+ early_indices.append(idx)
509
+ except Exception:
510
+ pass
511
+ if len(early_handles) >= expected_gpus:
512
+ handles = early_handles
513
+ handle_gpu_indices = early_indices
514
+ num_gpus_ref[0] = len(handles)
515
+ except Exception:
516
+ pass
517
+
490
518
  while not stop_event.is_set():
491
519
  # Retry GPU discovery: at samples 5, 15, 30
492
520
  # Keep retrying if we haven't found all expected GPUs yet
@@ -503,10 +531,12 @@ def probe_command(
503
531
  )
504
532
  if len(discovered) > 1:
505
533
  handles = []
534
+ handle_gpu_indices = []
506
535
  pmap = []
507
536
  for idx in discovered:
508
537
  h = pynvml.nvmlDeviceGetHandleByIndex(idx)
509
538
  handles.append(h)
539
+ handle_gpu_indices.append(idx)
510
540
  pmap.append({"gpu_index": idx})
511
541
  num_gpus_ref[0] = len(handles)
512
542
  process_map_ref[0] = pmap
@@ -529,12 +559,13 @@ def probe_command(
529
559
  discovery_done = True
530
560
 
531
561
  # Sample from all monitored GPUs — aggregate: peak vram = max, util/power = mean
532
- try:
533
- vram_vals = []
534
- util_vals = []
535
- power_vals = []
536
- total_mb = 0.0
537
- for h in handles:
562
+ # Per-GPU try/except: one bad handle must not prevent tracking others
563
+ vram_vals = []
564
+ util_vals = []
565
+ power_vals = []
566
+ total_mb = 0.0
567
+ for h in handles:
568
+ try:
538
569
  mi = pynvml.nvmlDeviceGetMemoryInfo(h)
539
570
  ut = pynvml.nvmlDeviceGetUtilizationRates(h)
540
571
  pw = pynvml.nvmlDeviceGetPowerUsage(h) / 1000.0
@@ -542,25 +573,28 @@ def probe_command(
542
573
  util_vals.append(ut.gpu)
543
574
  power_vals.append(pw)
544
575
  total_mb = mi.total / (1024 * 1024)
576
+ except Exception:
577
+ pass
545
578
 
546
- # Track per-GPU peak VRAM (always, even single GPU —
547
- # discovery may expand handles later, and we need history from sample 0)
548
- pgp = per_gpu_peaks_ref[0]
549
- for gi, vm in enumerate(vram_vals):
550
- pgp[gi] = max(pgp.get(gi, 0.0), vm)
579
+ # Track per-GPU peak VRAM (always, even single GPU —
580
+ # discovery may expand handles later, and we need history from sample 0)
581
+ pgp = per_gpu_peaks_ref[0]
582
+ for gi, vm in enumerate(vram_vals):
583
+ pgp[gi] = max(pgp.get(gi, 0.0), vm)
551
584
 
585
+ if vram_vals:
552
586
  samples.append(ProbeSample(
553
587
  timestamp=time.time(),
554
588
  memory_used_mb=max(vram_vals),
555
589
  memory_total_mb=total_mb,
556
- gpu_util_pct=sum(util_vals) / len(util_vals),
557
- power_watts=sum(power_vals) / len(power_vals),
590
+ gpu_util_pct=sum(util_vals) / len(util_vals) if util_vals else 0.0,
591
+ power_watts=sum(power_vals) / len(power_vals) if power_vals else 0.0,
558
592
  ))
559
- except Exception:
560
- pass
561
593
 
562
594
  # Calibrate mode: auto-stop when stable
563
- if calibrate and len(samples) > ramp_up_samples:
595
+ # Delay stability check until GPU discovery is complete —
596
+ # prevents calibrate-and-exit before finding all expected GPUs.
597
+ if calibrate and discovery_done and len(samples) > ramp_up_samples:
564
598
  from alloc.stability import check_stability, RAMP_UP_SAMPLES
565
599
  sr = check_stability(samples, poll_interval_ms=poll_interval_ms)
566
600
  if sr.is_stable:
@@ -699,8 +733,8 @@ def probe_command(
699
733
  num_gpus_detected=num_gpus_ref[0],
700
734
  process_map=process_map_ref[0],
701
735
  per_gpu_peak_vram_mb=(
702
- [round(per_gpu_peaks_ref[0].get(i, 0), 1) for i in range(num_gpus_ref[0])]
703
- if num_gpus_ref[0] >= 1 and per_gpu_peaks_ref[0] else None
736
+ [round(per_gpu_peaks_ref[0][i], 1) for i in sorted(per_gpu_peaks_ref[0])]
737
+ if per_gpu_peaks_ref[0] else None
704
738
  ),
705
739
  detected_interconnect=detected_ic_ref[0],
706
740
  )
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: alloc
3
- Version: 0.0.13
3
+ Version: 0.0.15
4
4
  Summary: Engineer-first training calibration: estimate VRAM fit, profile short runs, and pick GPU configs under real budget constraints.
5
5
  Author-email: Alloc Labs <hello@alloclabs.com>
6
6
  License-Expression: Apache-2.0
@@ -245,3 +245,118 @@ def test_active_gpu_fallback_not_used_without_expected():
245
245
  with patch("alloc.probe._read_child_env", return_value=None):
246
246
  result = _discover_gpu_indices(1000, mock, fallback_index=0)
247
247
  assert result == [0] # Falls back to default
248
+
249
+
250
+ # ── Early handle initialization for expected GPUs ──
251
+
252
+
253
+ def test_early_init_opens_handles_for_expected_gpus():
254
+ """When expected_gpus > 1 and device_count >= expected, early-init should
255
+ open handles for all expected GPUs."""
256
+ mock_pynvml = MagicMock()
257
+ mock_pynvml.nvmlDeviceGetCount = MagicMock(return_value=2)
258
+
259
+ handles_map = {0: MagicMock(name="gpu0"), 1: MagicMock(name="gpu1")}
260
+ mock_pynvml.nvmlDeviceGetHandleByIndex = MagicMock(side_effect=lambda i: handles_map[i])
261
+
262
+ # Simulate early-init logic from probe_command._monitor()
263
+ expected_gpus = 2
264
+ handles = [handles_map[0]]
265
+
266
+ if expected_gpus > 1:
267
+ device_count = mock_pynvml.nvmlDeviceGetCount()
268
+ if device_count >= expected_gpus:
269
+ early_handles = []
270
+ early_indices = []
271
+ for idx in range(device_count):
272
+ if len(early_handles) >= expected_gpus:
273
+ break
274
+ h = mock_pynvml.nvmlDeviceGetHandleByIndex(idx)
275
+ early_handles.append(h)
276
+ early_indices.append(idx)
277
+ if len(early_handles) >= expected_gpus:
278
+ handles = early_handles
279
+
280
+ assert len(handles) == 2
281
+
282
+
283
+ def test_early_init_skipped_when_fewer_devices():
284
+ """When device_count < expected_gpus, early-init should not change handles."""
285
+ mock_pynvml = MagicMock()
286
+ mock_pynvml.nvmlDeviceGetCount = MagicMock(return_value=1)
287
+
288
+ expected_gpus = 2
289
+ handles = [MagicMock(name="gpu0")]
290
+ original_handles = list(handles)
291
+
292
+ if expected_gpus > 1:
293
+ device_count = mock_pynvml.nvmlDeviceGetCount()
294
+ if device_count >= expected_gpus:
295
+ assert False, "Should not reach here"
296
+
297
+ assert len(handles) == 1
298
+
299
+
300
+ def test_per_gpu_sampling_resilient_to_partial_failure():
301
+ """Per-GPU try/except: one GPU failure should not prevent others from
302
+ being sampled into per_gpu_peaks."""
303
+ mock_pynvml = MagicMock()
304
+ handles_map = {0: MagicMock(name="gpu0"), 1: MagicMock(name="gpu1")}
305
+
306
+ mem_ok = MagicMock()
307
+ mem_ok.total = 24 * 1024 * 1024 * 1024
308
+ mem_ok.used = 8000 * 1024 * 1024
309
+
310
+ def mem_info_side_effect(h):
311
+ if h == handles_map[1]:
312
+ raise RuntimeError("GPU 1 memory read failed")
313
+ return mem_ok
314
+
315
+ mock_pynvml.nvmlDeviceGetMemoryInfo = MagicMock(side_effect=mem_info_side_effect)
316
+ util = MagicMock()
317
+ util.gpu = 80
318
+ mock_pynvml.nvmlDeviceGetUtilizationRates = MagicMock(return_value=util)
319
+ mock_pynvml.nvmlDeviceGetPowerUsage = MagicMock(return_value=100_000)
320
+
321
+ # Simulate the per-GPU sampling loop
322
+ handles = [handles_map[0], handles_map[1]]
323
+ per_gpu_peaks = {}
324
+ vram_vals = []
325
+
326
+ for h in handles:
327
+ try:
328
+ mi = mock_pynvml.nvmlDeviceGetMemoryInfo(h)
329
+ vram_vals.append(mi.used / (1024 * 1024))
330
+ except Exception:
331
+ pass
332
+
333
+ for gi, vm in enumerate(vram_vals):
334
+ per_gpu_peaks[gi] = max(per_gpu_peaks.get(gi, 0.0), vm)
335
+
336
+ # GPU 0 tracked, GPU 1 skipped
337
+ assert 0 in per_gpu_peaks
338
+ assert per_gpu_peaks[0] > 0
339
+ assert len(vram_vals) == 1
340
+
341
+
342
+ def test_stability_delayed_until_discovery_done():
343
+ """Stability check requires discovery_done=True."""
344
+ # Single GPU: expected=1, num_gpus=1 → done immediately
345
+ assert 1 >= 1 # num_gpus >= expected
346
+
347
+ # Multi GPU with early-init: expected=2, num_gpus=2 → done at sample 5
348
+ assert 2 >= 2
349
+
350
+ # Multi GPU, discovery incomplete: expected=4, found=2 → NOT done
351
+ assert not (2 >= 4)
352
+
353
+
354
+ def test_per_gpu_peaks_to_result_list():
355
+ """per_gpu_peaks dict should convert to sorted list for ProbeResult."""
356
+ peaks = {0: 8000.5, 1: 12000.3}
357
+ result = [round(peaks[i], 1) for i in sorted(peaks)] if peaks else None
358
+ assert result == [8000.5, 12000.3]
359
+
360
+ empty = {}
361
+ result_empty = [round(empty[i], 1) for i in sorted(empty)] if empty else None
362
+ assert result_empty is None
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes