alloc 0.0.9__tar.gz → 0.0.11__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60) hide show
  1. {alloc-0.0.9 → alloc-0.0.11}/PKG-INFO +2 -2
  2. {alloc-0.0.9 → alloc-0.0.11}/README.md +1 -1
  3. {alloc-0.0.9 → alloc-0.0.11}/pyproject.toml +1 -1
  4. {alloc-0.0.9 → alloc-0.0.11}/src/alloc/__init__.py +1 -1
  5. {alloc-0.0.9 → alloc-0.0.11}/src/alloc/browser_auth.py +3 -2
  6. {alloc-0.0.9 → alloc-0.0.11}/src/alloc/callbacks.py +4 -1
  7. {alloc-0.0.9 → alloc-0.0.11}/src/alloc/cli.py +79 -16
  8. {alloc-0.0.9 → alloc-0.0.11}/src/alloc/code_analyzer.py +28 -0
  9. {alloc-0.0.9 → alloc-0.0.11}/src/alloc/diagnosis_engine.py +1 -1
  10. {alloc-0.0.9 → alloc-0.0.11}/src/alloc/probe.py +35 -9
  11. {alloc-0.0.9 → alloc-0.0.11}/src/alloc.egg-info/PKG-INFO +2 -2
  12. {alloc-0.0.9 → alloc-0.0.11}/tests/test_callbacks.py +68 -0
  13. {alloc-0.0.9 → alloc-0.0.11}/tests/test_diagnosis_engine.py +12 -0
  14. {alloc-0.0.9 → alloc-0.0.11}/tests/test_probe_multi.py +55 -0
  15. alloc-0.0.11/tests/test_topology_strategy.py +215 -0
  16. alloc-0.0.9/tests/test_topology_strategy.py +0 -93
  17. {alloc-0.0.9 → alloc-0.0.11}/setup.cfg +0 -0
  18. {alloc-0.0.9 → alloc-0.0.11}/src/alloc/artifact_loader.py +0 -0
  19. {alloc-0.0.9 → alloc-0.0.11}/src/alloc/artifact_writer.py +0 -0
  20. {alloc-0.0.9 → alloc-0.0.11}/src/alloc/catalog/__init__.py +0 -0
  21. {alloc-0.0.9 → alloc-0.0.11}/src/alloc/catalog/default_rate_card.json +0 -0
  22. {alloc-0.0.9 → alloc-0.0.11}/src/alloc/catalog/gpus.v1.json +0 -0
  23. {alloc-0.0.9 → alloc-0.0.11}/src/alloc/config.py +0 -0
  24. {alloc-0.0.9 → alloc-0.0.11}/src/alloc/context.py +0 -0
  25. {alloc-0.0.9 → alloc-0.0.11}/src/alloc/diagnosis_display.py +0 -0
  26. {alloc-0.0.9 → alloc-0.0.11}/src/alloc/diagnosis_rules.py +0 -0
  27. {alloc-0.0.9 → alloc-0.0.11}/src/alloc/display.py +0 -0
  28. {alloc-0.0.9 → alloc-0.0.11}/src/alloc/extractor_runner.py +0 -0
  29. {alloc-0.0.9 → alloc-0.0.11}/src/alloc/ghost.py +0 -0
  30. {alloc-0.0.9 → alloc-0.0.11}/src/alloc/model_extractor.py +0 -0
  31. {alloc-0.0.9 → alloc-0.0.11}/src/alloc/model_registry.py +0 -0
  32. {alloc-0.0.9 → alloc-0.0.11}/src/alloc/stability.py +0 -0
  33. {alloc-0.0.9 → alloc-0.0.11}/src/alloc/upload.py +0 -0
  34. {alloc-0.0.9 → alloc-0.0.11}/src/alloc/yaml_config.py +0 -0
  35. {alloc-0.0.9 → alloc-0.0.11}/src/alloc.egg-info/SOURCES.txt +0 -0
  36. {alloc-0.0.9 → alloc-0.0.11}/src/alloc.egg-info/dependency_links.txt +0 -0
  37. {alloc-0.0.9 → alloc-0.0.11}/src/alloc.egg-info/entry_points.txt +0 -0
  38. {alloc-0.0.9 → alloc-0.0.11}/src/alloc.egg-info/requires.txt +0 -0
  39. {alloc-0.0.9 → alloc-0.0.11}/src/alloc.egg-info/top_level.txt +0 -0
  40. {alloc-0.0.9 → alloc-0.0.11}/tests/test_artifact.py +0 -0
  41. {alloc-0.0.9 → alloc-0.0.11}/tests/test_artifact_loader.py +0 -0
  42. {alloc-0.0.9 → alloc-0.0.11}/tests/test_auth.py +0 -0
  43. {alloc-0.0.9 → alloc-0.0.11}/tests/test_catalog.py +0 -0
  44. {alloc-0.0.9 → alloc-0.0.11}/tests/test_cli.py +0 -0
  45. {alloc-0.0.9 → alloc-0.0.11}/tests/test_code_analyzer.py +0 -0
  46. {alloc-0.0.9 → alloc-0.0.11}/tests/test_context.py +0 -0
  47. {alloc-0.0.9 → alloc-0.0.11}/tests/test_diagnose_cli.py +0 -0
  48. {alloc-0.0.9 → alloc-0.0.11}/tests/test_diagnosis_rules.py +0 -0
  49. {alloc-0.0.9 → alloc-0.0.11}/tests/test_extractor_activation.py +0 -0
  50. {alloc-0.0.9 → alloc-0.0.11}/tests/test_ghost.py +0 -0
  51. {alloc-0.0.9 → alloc-0.0.11}/tests/test_ghost_degradation.py +0 -0
  52. {alloc-0.0.9 → alloc-0.0.11}/tests/test_init_from_org.py +0 -0
  53. {alloc-0.0.9 → alloc-0.0.11}/tests/test_interconnect.py +0 -0
  54. {alloc-0.0.9 → alloc-0.0.11}/tests/test_model_extractor.py +0 -0
  55. {alloc-0.0.9 → alloc-0.0.11}/tests/test_probe_hw.py +0 -0
  56. {alloc-0.0.9 → alloc-0.0.11}/tests/test_scan_auth.py +0 -0
  57. {alloc-0.0.9 → alloc-0.0.11}/tests/test_stability.py +0 -0
  58. {alloc-0.0.9 → alloc-0.0.11}/tests/test_upload.py +0 -0
  59. {alloc-0.0.9 → alloc-0.0.11}/tests/test_verdict.py +0 -0
  60. {alloc-0.0.9 → alloc-0.0.11}/tests/test_yaml_config.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: alloc
3
- Version: 0.0.9
3
+ Version: 0.0.11
4
4
  Summary: Engineer-first training calibration: estimate VRAM fit, profile short runs, and pick GPU configs under real budget constraints.
5
5
  Author-email: Alloc Labs <hello@alloclabs.com>
6
6
  License-Expression: Apache-2.0
@@ -40,7 +40,7 @@ alloc run python train.py
40
40
  ```
41
41
 
42
42
  ```
43
- alloc v0.0.8 — Calibrate
43
+ alloc v0.0.9 — Calibrate
44
44
 
45
45
  Run Summary
46
46
  Peak VRAM 31.2 GB / 40.0 GB (A100)
@@ -12,7 +12,7 @@ alloc run python train.py
12
12
  ```
13
13
 
14
14
  ```
15
- alloc v0.0.8 — Calibrate
15
+ alloc v0.0.9 — Calibrate
16
16
 
17
17
  Run Summary
18
18
  Peak VRAM 31.2 GB / 40.0 GB (A100)
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "alloc"
7
- version = "0.0.9"
7
+ version = "0.0.11"
8
8
  description = "Engineer-first training calibration: estimate VRAM fit, profile short runs, and pick GPU configs under real budget constraints."
9
9
  readme = "README.md"
10
10
  license = "Apache-2.0"
@@ -9,7 +9,7 @@ _warnings.filterwarnings("ignore", category=FutureWarning, module=r"torch\.cuda"
9
9
  _warnings.filterwarnings("ignore", category=DeprecationWarning, module=r"torch\.cuda")
10
10
  del _warnings
11
11
 
12
- __version__ = "0.0.9"
12
+ __version__ = "0.0.11"
13
13
 
14
14
  from alloc.ghost import ghost, GhostReport
15
15
  from alloc.callbacks import AllocCallback as HuggingFaceCallback
@@ -121,8 +121,9 @@ def browser_login(
121
121
  })
122
122
  authorize_url = f"{supabase_url}/auth/v1/authorize?{authorize_params}"
123
123
 
124
- # Bind to 0.0.0.0 so both localhost and 127.0.0.1 reach the server.
125
- server = HTTPServer(("0.0.0.0", port), _CallbackHandler)
124
+ # Bind to 127.0.0.1 only — the auth callback server should never be
125
+ # reachable from the network.
126
+ server = HTTPServer(("127.0.0.1", port), _CallbackHandler)
126
127
  server.auth_code = None # type: ignore[attr-defined]
127
128
  server.auth_error = None # type: ignore[attr-defined]
128
129
  server.timeout = 1 # poll interval for handle_request()
@@ -501,7 +501,10 @@ class _NvmlMonitor:
501
501
 
502
502
  self._hw_context["nvlink_active_links"] = active_links
503
503
  except Exception:
504
- pass
504
+ # NVLink detection code failed after entering the try block.
505
+ # We know NVML is functional (handles exist), so fall back to
506
+ # generic "nvlink" rather than leaving interconnect_type unset.
507
+ self._hw_context["interconnect_type"] = "nvlink"
505
508
 
506
509
  self._thread = threading.Thread(target=self._sample_loop, daemon=True)
507
510
  self._thread.start()
@@ -328,6 +328,7 @@ def run(
328
328
  no_config: bool = typer.Option(False, "--no-config", help="Skip .alloc.yaml (use catalog defaults)"),
329
329
  after: Optional[str] = typer.Option(None, "--after", help="Previous run ID to compare against (outcome tracking)"),
330
330
  experiment: Optional[str] = typer.Option(None, "--experiment", "-e", help="Experiment group name"),
331
+ strategy: Optional[str] = typer.Option(None, "--strategy", help="Override detected strategy (ddp, fsdp, deepspeed, tp, pp, etc.)"),
331
332
  ):
332
333
  """Run a training command with GPU monitoring."""
333
334
  from alloc.probe import probe_command
@@ -342,6 +343,18 @@ def run(
342
343
  console.print("Usage: alloc run python train.py")
343
344
  raise typer.Exit(1)
344
345
 
346
+ # Validate --strategy against API-accepted values
347
+ _VALID_STRATEGIES = {
348
+ "ddp", "fsdp", "deepspeed", "tp", "pp",
349
+ "tp+dp", "pp+dp", "tp+pp+dp", "tp+pp+fsdp",
350
+ }
351
+ if strategy and strategy.lower() not in _VALID_STRATEGIES:
352
+ console.print(
353
+ f"[red]Invalid --strategy '{strategy}'. "
354
+ f"Valid values: {', '.join(sorted(_VALID_STRATEGIES))}[/red]"
355
+ )
356
+ raise typer.Exit(1)
357
+
345
358
  # ALLOC_POLICY: "warn" or "enforce" forces full monitoring
346
359
  alloc_policy = os.environ.get("ALLOC_POLICY", "").lower().strip()
347
360
  if alloc_policy and alloc_policy not in ("warn", "enforce"):
@@ -425,10 +438,25 @@ def run(
425
438
  # Discover environment context (git, container, Ray)
426
439
  from alloc.context import discover_context
427
440
  env_context = discover_context()
441
+
442
+ # AST strategy hint: detect FSDP/DDP/DeepSpeed from script source
443
+ ast_hint = None # type: Optional[str]
444
+ try:
445
+ from alloc.code_analyzer import detect_strategy_hint
446
+ # Find the .py script in the command (e.g. "python train.py" or "torchrun ... train.py")
447
+ for arg in command:
448
+ if arg.endswith(".py") and os.path.isfile(arg):
449
+ ast_hint = detect_strategy_hint(arg)
450
+ break
451
+ except Exception:
452
+ pass # Never crash on AST analysis failure
453
+
428
454
  topology = _infer_parallel_topology_from_env(
429
455
  num_gpus_detected=result.num_gpus_detected,
430
456
  config_interconnect=gpu_context.get("interconnect") if gpu_context else None,
431
457
  detected_interconnect=result.detected_interconnect,
458
+ strategy_override=strategy,
459
+ ast_strategy_hint=ast_hint,
432
460
  )
433
461
  objective = os.environ.get("ALLOC_OBJECTIVE", "").strip().lower() or _objective_from_context(gpu_context)
434
462
  max_budget_hourly = _max_budget_hourly_from_context(gpu_context)
@@ -456,6 +484,7 @@ def run(
456
484
  "pp_degree": topology.get("pp_degree"),
457
485
  "dp_degree": topology.get("dp_degree"),
458
486
  "strategy": topology.get("strategy"),
487
+ "strategy_detection_method": topology.get("strategy_detection_method"),
459
488
  "interconnect_type": topology.get("interconnect_type"),
460
489
  "process_map": result.process_map,
461
490
  "objective": objective,
@@ -3522,8 +3551,22 @@ def _print_gpu_context_detail(ctx: dict) -> None:
3522
3551
  console.print(Panel("\n".join(lines), title="GPU Context (.alloc.yaml)", border_style="cyan", padding=(1, 0)))
3523
3552
 
3524
3553
 
3525
- def _infer_parallel_topology_from_env(*, num_gpus_detected: int, config_interconnect: Optional[str] = None, detected_interconnect: Optional[str] = None) -> dict:
3526
- """Infer distributed topology hints from common launcher env vars."""
3554
+ def _infer_parallel_topology_from_env(
3555
+ *,
3556
+ num_gpus_detected: int,
3557
+ config_interconnect: Optional[str] = None,
3558
+ detected_interconnect: Optional[str] = None,
3559
+ strategy_override: Optional[str] = None,
3560
+ ast_strategy_hint: Optional[str] = None,
3561
+ ) -> dict:
3562
+ """Infer distributed topology hints from common launcher env vars.
3563
+
3564
+ Strategy precedence:
3565
+ 1. --strategy override (user explicit)
3566
+ 2. AST hint (code_analyzer detected FSDP/DDP/DeepSpeed)
3567
+ 3. Env var inference (TP/PP/DP degrees)
3568
+ 4. None (unknown — never silently default to ddp)
3569
+ """
3527
3570
 
3528
3571
  def _get_int(name: str) -> Optional[int]:
3529
3572
  val = os.environ.get(name)
@@ -3545,8 +3588,12 @@ def _infer_parallel_topology_from_env(*, num_gpus_detected: int, config_intercon
3545
3588
 
3546
3589
  tp = _get_int("TP_SIZE") or _get_int("TENSOR_PARALLEL_SIZE")
3547
3590
  pp = _get_int("PP_SIZE") or _get_int("PIPELINE_PARALLEL_SIZE")
3548
- dp = _get_int("DP_SIZE") or _get_int("DATA_PARALLEL_SIZE")
3591
+ dp_explicit = _get_int("DP_SIZE") or _get_int("DATA_PARALLEL_SIZE")
3592
+ dp = dp_explicit
3549
3593
 
3594
+ # Derive dp from WORLD_SIZE when not explicitly set.
3595
+ # This gives us the degree but does NOT imply strategy=ddp
3596
+ # (WORLD_SIZE is set for both DDP and FSDP).
3550
3597
  if dp is None and world_size is not None:
3551
3598
  denom = (tp or 1) * (pp or 1)
3552
3599
  if denom > 0 and world_size % denom == 0:
@@ -3563,26 +3610,41 @@ def _infer_parallel_topology_from_env(*, num_gpus_detected: int, config_intercon
3563
3610
  if interconnect not in ("pcie", "nvlink", "nvlink_switch", "nvlink_p2p", "infiniband", "unknown"):
3564
3611
  interconnect = "unknown"
3565
3612
 
3566
- # Infer strategy from degrees only when evidence exists
3613
+ # Strategy detection with strict precedence and provenance tracking
3567
3614
  strategy = None
3568
- has_tp = tp is not None and tp > 1
3569
- has_pp = pp is not None and pp > 1
3570
- if has_tp and has_pp:
3615
+ strategy_detection_method = None # type: Optional[str]
3616
+
3617
+ # 1. Explicit --strategy override
3618
+ if strategy_override:
3619
+ strategy = strategy_override.lower()
3620
+ strategy_detection_method = "user_override"
3621
+ # 2. Env var inference from explicit TP/PP/DP degree env vars
3622
+ # TP_SIZE/PP_SIZE unambiguously identify the strategy.
3623
+ # DP_SIZE (explicit) implies DDP. But WORLD_SIZE-derived dp does NOT
3624
+ # imply DDP — FSDP uses the same WORLD_SIZE.
3625
+ elif tp is not None and tp > 1 and pp is not None and pp > 1:
3571
3626
  strategy = "tp+pp+dp"
3572
- elif has_tp:
3627
+ strategy_detection_method = "env_degrees"
3628
+ elif tp is not None and tp > 1:
3573
3629
  strategy = "tp+dp" if (dp is not None and dp > 1) else "tp"
3574
- elif has_pp:
3630
+ strategy_detection_method = "env_degrees"
3631
+ elif pp is not None and pp > 1:
3575
3632
  strategy = "pp+dp" if (dp is not None and dp > 1) else "pp"
3576
- elif dp is not None and dp > 1:
3577
- strategy = "ddp"
3578
- elif strategy is None and num_gpus_detected > 1 and not has_tp and not has_pp:
3579
- # Multiple GPUs detected via NVML with no TP/PP env vars →
3580
- # DDP is PyTorch's default and the only realistic inference.
3581
- # This is NOT the old `or "ddp"` — it only fires when probe
3582
- # actually observed multiple GPU processes.
3633
+ strategy_detection_method = "env_degrees"
3634
+ elif dp_explicit is not None and dp_explicit > 1:
3583
3635
  strategy = "ddp"
3636
+ strategy_detection_method = "env_degrees"
3637
+ # 3. AST hint (code_analyzer detected FSDP/DDP/DeepSpeed in script)
3638
+ elif ast_strategy_hint and num_gpus_detected > 1:
3639
+ strategy = ast_strategy_hint
3640
+ strategy_detection_method = "ast_analysis"
3584
3641
  if dp is None:
3585
3642
  dp = num_gpus_detected
3643
+ # 4. No trustworthy signal — leave strategy=None
3644
+ # (Never silently collapse unknown distributed runs to ddp)
3645
+
3646
+ if strategy and dp is None and num_gpus_detected > 1:
3647
+ dp = num_gpus_detected
3586
3648
 
3587
3649
  return {
3588
3650
  "num_nodes": nnodes or 1,
@@ -3592,6 +3654,7 @@ def _infer_parallel_topology_from_env(*, num_gpus_detected: int, config_intercon
3592
3654
  "dp_degree": dp,
3593
3655
  "interconnect_type": interconnect,
3594
3656
  "strategy": strategy,
3657
+ "strategy_detection_method": strategy_detection_method,
3595
3658
  }
3596
3659
 
3597
3660
 
@@ -132,6 +132,34 @@ def analyze_script(script_path: str) -> CodeFindings:
132
132
  return findings
133
133
 
134
134
 
135
+ def detect_strategy_hint(script_path: str) -> Optional[str]:
136
+ """Lightweight AST check: return strategy kind if detectable, else None.
137
+
138
+ Returns one of: 'fsdp', 'deepspeed', 'ddp', or None.
139
+ Never crashes — returns None on any error.
140
+ """
141
+ try:
142
+ if not os.path.isfile(script_path):
143
+ return None
144
+ with open(script_path, "r") as f:
145
+ source = f.read()
146
+ tree = ast.parse(source, filename=script_path)
147
+ imports = _walk_imports(tree)
148
+ distributed = _find_distributed(tree, imports, source.splitlines(), script_path)
149
+ # Priority: fsdp > deepspeed > ddp
150
+ # data_parallel is single-process (not a distributed strategy) — ignored.
151
+ kinds = {d.kind for d in distributed}
152
+ if "fsdp" in kinds:
153
+ return "fsdp"
154
+ if "deepspeed" in kinds:
155
+ return "deepspeed"
156
+ if "ddp" in kinds:
157
+ return "ddp"
158
+ return None
159
+ except Exception:
160
+ return None
161
+
162
+
135
163
  # ---------------------------------------------------------------------------
136
164
  # Import resolution
137
165
  # ---------------------------------------------------------------------------
@@ -403,7 +403,7 @@ def _estimate_model_params(model_name: str) -> Optional[float]:
403
403
  "whisper-large": 1.55,
404
404
  }
405
405
 
406
- for key, params in estimates.items():
406
+ for key, params in sorted(estimates.items(), key=lambda x: len(x[0]), reverse=True):
407
407
  if key in name:
408
408
  return params
409
409
 
@@ -215,8 +215,19 @@ def _discover_gpu_indices(proc_pid, pynvml, fallback_index=0, expected_gpus=None
215
215
  if 0 <= idx < device_count:
216
216
  visible_physical.append(idx)
217
217
  except ValueError:
218
- visible_physical = list(range(device_count))
219
- break
218
+ # UUID-style device identifiers — try NVML UUID matching
219
+ try:
220
+ for phys_idx in range(device_count):
221
+ handle = pynvml.nvmlDeviceGetHandleByIndex(phys_idx)
222
+ uuid = pynvml.nvmlDeviceGetUUID(handle)
223
+ if isinstance(uuid, bytes):
224
+ uuid = uuid.decode("utf-8", errors="replace")
225
+ if d in uuid:
226
+ visible_physical.append(phys_idx)
227
+ break
228
+ except Exception:
229
+ visible_physical = list(range(device_count))
230
+ break
220
231
  search_indices = visible_physical if visible_physical else list(range(device_count))
221
232
  else:
222
233
  search_indices = list(range(device_count))
@@ -363,12 +374,27 @@ def probe_command(
363
374
  """
364
375
  pynvml = _try_import_pynvml()
365
376
 
366
- # Launch the user's training subprocess — do NOT modify env (their warnings matter)
377
+ # Launch the user's training subprocess.
378
+ # Suppress only pynvml/torch.cuda FutureWarning noise — these come from
379
+ # Alloc's own callbacks or from torch internals, not from user code.
380
+ # Propagates to torchrun children and most Ray workers via env inheritance.
381
+ child_env = os.environ.copy()
382
+ existing_pw = child_env.get("PYTHONWARNINGS", "")
383
+ alloc_filters = (
384
+ "ignore::FutureWarning:pynvml,"
385
+ "ignore::DeprecationWarning:pynvml,"
386
+ "ignore::FutureWarning:torch.cuda,"
387
+ "ignore::DeprecationWarning:torch.cuda"
388
+ )
389
+ child_env["PYTHONWARNINGS"] = (
390
+ f"{existing_pw},{alloc_filters}" if existing_pw else alloc_filters
391
+ )
367
392
  try:
368
393
  proc = subprocess.Popen(
369
394
  command,
370
395
  stdout=sys.stdout,
371
396
  stderr=sys.stderr,
397
+ env=child_env,
372
398
  )
373
399
  except Exception as e:
374
400
  return ProbeResult(
@@ -507,11 +533,11 @@ def probe_command(
507
533
  power_vals.append(pw)
508
534
  total_mb = mi.total / (1024 * 1024)
509
535
 
510
- # Track per-GPU peak VRAM for multi-GPU runs
511
- if len(handles) > 1:
512
- pgp = per_gpu_peaks_ref[0]
513
- for gi, vm in enumerate(vram_vals):
514
- pgp[gi] = max(pgp.get(gi, 0.0), vm)
536
+ # Track per-GPU peak VRAM (always, even single GPU —
537
+ # discovery may expand handles later, and we need history from sample 0)
538
+ pgp = per_gpu_peaks_ref[0]
539
+ for gi, vm in enumerate(vram_vals):
540
+ pgp[gi] = max(pgp.get(gi, 0.0), vm)
515
541
 
516
542
  samples.append(ProbeSample(
517
543
  timestamp=time.time(),
@@ -664,7 +690,7 @@ def probe_command(
664
690
  process_map=process_map_ref[0],
665
691
  per_gpu_peak_vram_mb=(
666
692
  [round(per_gpu_peaks_ref[0].get(i, 0), 1) for i in range(num_gpus_ref[0])]
667
- if len(per_gpu_peaks_ref[0]) > 1 else None
693
+ if num_gpus_ref[0] > 1 and per_gpu_peaks_ref[0] else None
668
694
  ),
669
695
  detected_interconnect=detected_ic_ref[0],
670
696
  )
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: alloc
3
- Version: 0.0.9
3
+ Version: 0.0.11
4
4
  Summary: Engineer-first training calibration: estimate VRAM fit, profile short runs, and pick GPU configs under real budget constraints.
5
5
  Author-email: Alloc Labs <hello@alloclabs.com>
6
6
  License-Expression: Apache-2.0
@@ -40,7 +40,7 @@ alloc run python train.py
40
40
  ```
41
41
 
42
42
  ```
43
- alloc v0.0.8 — Calibrate
43
+ alloc v0.0.9 — Calibrate
44
44
 
45
45
  Run Summary
46
46
  Peak VRAM 31.2 GB / 40.0 GB (A100)
@@ -1189,3 +1189,71 @@ class TestNvmlMonitorThreadSafety:
1189
1189
  assert len(probe["per_rank_peak_vram_mb"]) == 2
1190
1190
  for peak in probe["per_rank_peak_vram_mb"]:
1191
1191
  assert peak > 0
1192
+
1193
+
1194
+ class TestNvmlMonitorNvlinkFallback:
1195
+ def test_nvlink_detection_failure_sets_nvlink_fallback(self):
1196
+ """When the outer NVLink detection block raises, fall back to 'nvlink'.
1197
+
1198
+ We trigger this by making nvmlDeviceGetNvLinkState raise on the first
1199
+ call (inner except breaks the loop → active_links=0 → 'pcie'), and
1200
+ then making the active_links comparison itself blow up. The simplest
1201
+ trigger is having _gpu_handles[0] raise IndexError (empty list after
1202
+ the early-return guard).
1203
+ """
1204
+ mock_pynvml = MagicMock()
1205
+ mock_pynvml.nvmlInit.return_value = None
1206
+ mock_pynvml.nvmlShutdown.return_value = None
1207
+ mock_pynvml.nvmlDeviceGetCount.return_value = 2
1208
+ mock_pynvml.nvmlDeviceGetName.return_value = "NVIDIA A100-SXM4-80GB"
1209
+ mem = SimpleNamespace(total=80 * 1024**3, used=1 * 1024**3)
1210
+ mock_pynvml.nvmlDeviceGetMemoryInfo.return_value = mem
1211
+ mock_pynvml.nvmlSystemGetDriverVersion.return_value = "535"
1212
+ mock_pynvml.nvmlSystemGetCudaDriverVersion.return_value = 12000
1213
+ mock_pynvml.nvmlDeviceGetCudaComputeCapability.return_value = (8, 0)
1214
+ util = SimpleNamespace(gpu=75, memory=60)
1215
+ mock_pynvml.nvmlDeviceGetUtilizationRates.return_value = util
1216
+ mock_pynvml.nvmlDeviceGetPowerUsage.return_value = 300000
1217
+
1218
+ # Use a handle list that passes the `if not self._gpu_handles` guard
1219
+ # (it's truthy) but raises IndexError on `self._gpu_handles[0]`.
1220
+ class BadHandleList:
1221
+ """Truthy but raises on index access."""
1222
+ def __bool__(self):
1223
+ return True
1224
+ def __len__(self):
1225
+ return 2
1226
+ def __iter__(self):
1227
+ return iter([])
1228
+ def __getitem__(self, idx):
1229
+ raise IndexError("corrupted handle list")
1230
+
1231
+ with patch("alloc.callbacks._try_import_pynvml", return_value=mock_pynvml):
1232
+ monitor = _NvmlMonitor()
1233
+
1234
+ # Replace handles after __init__ but before start().
1235
+ # start() will re-populate from nvmlDeviceGetCount, so we also need to
1236
+ # make the handle-building loop produce our bad list. We do this by
1237
+ # patching nvmlDeviceGetHandleByIndex to raise, so _gpu_handles stays
1238
+ # empty after the try/except in handle building. But that triggers
1239
+ # the early return. Instead, we patch _gpu_handles AFTER start()
1240
+ # builds them but BEFORE NVLink detection runs. We achieve this by
1241
+ # having nvmlDeviceGetCudaComputeCapability (the last hw-context call
1242
+ # before NVLink detection) swap in the bad handles as a side effect.
1243
+ original_sm = mock_pynvml.nvmlDeviceGetCudaComputeCapability
1244
+
1245
+ def swap_handles_then_return_sm(handle):
1246
+ monitor._gpu_handles = BadHandleList()
1247
+ return (8, 0)
1248
+
1249
+ mock_pynvml.nvmlDeviceGetCudaComputeCapability = MagicMock(
1250
+ side_effect=swap_handles_then_return_sm
1251
+ )
1252
+
1253
+ monitor.start()
1254
+ import time
1255
+ time.sleep(0.02)
1256
+ monitor.stop()
1257
+
1258
+ hw, _ = monitor.get_results()
1259
+ assert hw.get("interconnect_type") == "nvlink"
@@ -359,3 +359,15 @@ def test_estimate_model_params_known_vision_model():
359
359
 
360
360
  result = _estimate_model_params("stable-diffusion")
361
361
  assert result == 0.865
362
+
363
+
364
+ def test_estimate_model_params_gpt2_medium_prefix_match():
365
+ """gpt2-medium-finetuned should match gpt2-medium (0.355), not gpt2 (0.124)."""
366
+ result = _estimate_model_params("gpt2-medium-finetuned")
367
+ assert result == 0.355
368
+
369
+
370
+ def test_estimate_model_params_gpt2_alone():
371
+ """Plain gpt2 should still match 0.124."""
372
+ result = _estimate_model_params("gpt2")
373
+ assert result == 0.124
@@ -158,6 +158,61 @@ def test_parse_plain_python():
158
158
  assert _parse_launcher_gpu_count(["python", "train.py"]) is None
159
159
 
160
160
 
161
+ # ── CVD UUID resolution ──
162
+
163
+
164
+ def test_cvd_uuid_resolves_to_correct_index():
165
+ """UUID-style CUDA_VISIBLE_DEVICES should resolve to the matching physical GPU index."""
166
+ mock = _mock_pynvml_multi_gpu(
167
+ proc_pid=1000,
168
+ gpu_process_map={0: [1000], 1: [], 2: []},
169
+ )
170
+ mock.nvmlDeviceGetCount.return_value = 3
171
+
172
+ # Set up UUID resolution: GPU 0 → UUID-A, GPU 1 → UUID-B, GPU 2 → UUID-C
173
+ uuid_map = {0: "GPU-aaaa-1111", 1: "GPU-bbbb-2222", 2: "GPU-cccc-3333"}
174
+ handles = {}
175
+ for idx in range(3):
176
+ handles[idx] = MagicMock(name=f"handle_{idx}")
177
+
178
+ def get_handle(idx):
179
+ return handles[idx]
180
+
181
+ def get_uuid(handle):
182
+ for idx, h in handles.items():
183
+ if handle == h:
184
+ return uuid_map[idx]
185
+ return "GPU-unknown"
186
+
187
+ mock.nvmlDeviceGetHandleByIndex = MagicMock(side_effect=get_handle)
188
+ mock.nvmlDeviceGetUUID = MagicMock(side_effect=get_uuid)
189
+
190
+ # CVD set to GPU 2's UUID
191
+ with patch("alloc.probe._get_child_pids", return_value=[]):
192
+ with patch.dict("os.environ", {"CUDA_VISIBLE_DEVICES": "GPU-cccc-3333"}):
193
+ result = _discover_gpu_indices(1000, mock, fallback_index=0)
194
+ # Should only search GPU index 2
195
+ assert 2 in result or result == [0] # either found on idx 2, or fallback if no PID match
196
+
197
+
198
+ def test_cvd_invalid_uuid_falls_back_to_all_gpus():
199
+ """Invalid UUID that doesn't match any device should fall back to all GPUs."""
200
+ mock = _mock_pynvml_multi_gpu(
201
+ proc_pid=1000,
202
+ gpu_process_map={0: [1000], 1: []},
203
+ )
204
+ mock.nvmlDeviceGetCount.return_value = 2
205
+
206
+ # UUID lookup raises for all devices
207
+ mock.nvmlDeviceGetUUID = MagicMock(side_effect=RuntimeError("no UUID support"))
208
+
209
+ with patch("alloc.probe._get_child_pids", return_value=[]):
210
+ with patch.dict("os.environ", {"CUDA_VISIBLE_DEVICES": "GPU-nonexistent"}):
211
+ result = _discover_gpu_indices(1000, mock, fallback_index=0)
212
+ # Should fall back to searching all GPUs and find PID 1000 on GPU 0
213
+ assert 0 in result
214
+
215
+
161
216
  def test_parse_torch_distributed_launch():
162
217
  assert _parse_launcher_gpu_count([
163
218
  "python", "-m", "torch.distributed.launch", "--nproc_per_node=2", "train.py"
@@ -0,0 +1,215 @@
1
+ """Tests for strategy inference from topology degrees (P0-B)."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import os
6
+ from unittest.mock import patch
7
+
8
+ from alloc.cli import _infer_parallel_topology_from_env
9
+
10
+
11
+ class TestStrategyInference:
12
+ """Strategy should be inferred from TP/PP/DP degrees when present."""
13
+
14
+ def _topo(self, env=None, num_gpus=4, **kwargs):
15
+ env = env or {}
16
+ with patch.dict(os.environ, env, clear=False):
17
+ return _infer_parallel_topology_from_env(
18
+ num_gpus_detected=num_gpus,
19
+ **kwargs,
20
+ )
21
+
22
+ def test_no_degrees_multi_gpu_strategy_none(self):
23
+ """When no degree env vars and no AST hint, strategy stays None."""
24
+ result = self._topo({}, num_gpus=4)
25
+ assert result["strategy"] is None
26
+ assert result["strategy_detection_method"] is None
27
+
28
+ def test_single_gpu_no_degrees_strategy_none(self):
29
+ """Single GPU with no degrees → strategy stays None."""
30
+ result = self._topo({}, num_gpus=1)
31
+ assert result["strategy"] is None
32
+
33
+ def test_world_size_only_strategy_none(self):
34
+ """WORLD_SIZE=4 with no explicit DP_SIZE → strategy=None (ambiguous)."""
35
+ result = self._topo({"WORLD_SIZE": "4"})
36
+ assert result["strategy"] is None
37
+ assert result["dp_degree"] == 4 # degree still derived for topology
38
+
39
+ def test_explicit_dp_size_is_ddp(self):
40
+ """Explicit DP_SIZE=4 → strategy=ddp."""
41
+ result = self._topo({"DP_SIZE": "4"})
42
+ assert result["strategy"] == "ddp"
43
+ assert result["dp_degree"] == 4
44
+
45
+ def test_tp_only(self):
46
+ """TP_SIZE=4 alone → strategy=tp."""
47
+ result = self._topo({"TP_SIZE": "4"})
48
+ assert result["strategy"] == "tp"
49
+
50
+ def test_pp_only(self):
51
+ """PP_SIZE=4 alone → strategy=pp."""
52
+ result = self._topo({"PP_SIZE": "4"})
53
+ assert result["strategy"] == "pp"
54
+
55
+ def test_tp_dp(self):
56
+ """TP_SIZE=2 with DP_SIZE=2 → strategy=tp+dp."""
57
+ result = self._topo({"TP_SIZE": "2", "DP_SIZE": "2"})
58
+ assert result["strategy"] == "tp+dp"
59
+
60
+ def test_pp_dp(self):
61
+ """PP_SIZE=2 with DP_SIZE=2 → strategy=pp+dp."""
62
+ result = self._topo({"PP_SIZE": "2", "DP_SIZE": "2"})
63
+ assert result["strategy"] == "pp+dp"
64
+
65
+ def test_tp_pp_dp(self):
66
+ """All three degrees → strategy=tp+pp+dp."""
67
+ result = self._topo({"TP_SIZE": "2", "PP_SIZE": "2", "DP_SIZE": "2"})
68
+ assert result["strategy"] == "tp+pp+dp"
69
+
70
+ def test_tp_pp_no_dp(self):
71
+ """TP+PP without explicit DP → strategy=tp+pp+dp."""
72
+ result = self._topo({"TP_SIZE": "2", "PP_SIZE": "2"})
73
+ assert result["strategy"] == "tp+pp+dp"
74
+
75
+ def test_tp_size_1_not_counted(self):
76
+ """TP_SIZE=1 should not count as tensor parallelism."""
77
+ result = self._topo({"TP_SIZE": "1", "DP_SIZE": "4"})
78
+ assert result["strategy"] == "ddp"
79
+
80
+ def test_pp_size_1_not_counted(self):
81
+ """PP_SIZE=1 should not count as pipeline parallelism."""
82
+ result = self._topo({"PP_SIZE": "1", "DP_SIZE": "4"})
83
+ assert result["strategy"] == "ddp"
84
+
85
+ def test_dp_inferred_from_world_size(self):
86
+ """DP inferred from WORLD_SIZE / (TP * PP) → strategy includes dp."""
87
+ result = self._topo({"WORLD_SIZE": "8", "TP_SIZE": "2"})
88
+ assert result["dp_degree"] == 4
89
+ assert result["strategy"] == "tp+dp"
90
+
91
+
92
+ class TestStrategyOverride:
93
+ """--strategy override takes highest precedence."""
94
+
95
+ def _topo(self, env=None, num_gpus=4, **kwargs):
96
+ env = env or {}
97
+ with patch.dict(os.environ, env, clear=False):
98
+ return _infer_parallel_topology_from_env(
99
+ num_gpus_detected=num_gpus,
100
+ **kwargs,
101
+ )
102
+
103
+ def test_override_beats_env_degrees(self):
104
+ """Explicit --strategy overrides env var inference."""
105
+ result = self._topo({"WORLD_SIZE": "4"}, strategy_override="fsdp")
106
+ assert result["strategy"] == "fsdp"
107
+ assert result["strategy_detection_method"] == "user_override"
108
+
109
+ def test_override_beats_ast_hint(self):
110
+ """Explicit --strategy overrides AST hint."""
111
+ result = self._topo({}, num_gpus=2, strategy_override="fsdp", ast_strategy_hint="ddp")
112
+ assert result["strategy"] == "fsdp"
113
+ assert result["strategy_detection_method"] == "user_override"
114
+
115
+ def test_ast_hint_used_when_multi_gpu(self):
116
+ """AST hint used when no env degrees and multi-GPU."""
117
+ result = self._topo({}, num_gpus=2, ast_strategy_hint="fsdp")
118
+ assert result["strategy"] == "fsdp"
119
+ assert result["strategy_detection_method"] == "ast_analysis"
120
+ assert result["dp_degree"] == 2
121
+
122
+ def test_ast_hint_ignored_when_single_gpu(self):
123
+ """AST hint ignored for single GPU — no distributed strategy applies."""
124
+ result = self._topo({}, num_gpus=1, ast_strategy_hint="fsdp")
125
+ assert result["strategy"] is None
126
+
127
+ def test_env_degrees_beat_ast_hint(self):
128
+ """Env var TP/PP degrees take precedence over AST hint."""
129
+ result = self._topo({"TP_SIZE": "2", "DP_SIZE": "2"}, ast_strategy_hint="fsdp")
130
+ assert result["strategy"] == "tp+dp"
131
+ assert result["strategy_detection_method"] == "env_degrees"
132
+
133
+ def test_world_size_plus_ast_fsdp_returns_fsdp(self):
134
+ """WORLD_SIZE=2 + ast_hint='fsdp' → strategy='fsdp' (real torchrun FSDP case)."""
135
+ result = self._topo({"WORLD_SIZE": "2"}, num_gpus=2, ast_strategy_hint="fsdp")
136
+ assert result["strategy"] == "fsdp"
137
+ assert result["strategy_detection_method"] == "ast_analysis"
138
+ assert result["dp_degree"] == 2
139
+
140
+ def test_world_size_plus_ast_deepspeed_returns_deepspeed(self):
141
+ """WORLD_SIZE=4 + ast_hint='deepspeed' → strategy='deepspeed'."""
142
+ result = self._topo({"WORLD_SIZE": "4"}, num_gpus=4, ast_strategy_hint="deepspeed")
143
+ assert result["strategy"] == "deepspeed"
144
+ assert result["strategy_detection_method"] == "ast_analysis"
145
+
146
+ def test_world_size_only_no_hint_stays_none(self):
147
+ """WORLD_SIZE=2 with no AST hint → strategy=None (ambiguous)."""
148
+ result = self._topo({"WORLD_SIZE": "2"}, num_gpus=2)
149
+ assert result["strategy"] is None
150
+ assert result["dp_degree"] == 2
151
+
152
+ def test_unknown_multi_gpu_stays_none(self):
153
+ """Multi-GPU with no hint and no env vars → strategy=None, not ddp."""
154
+ result = self._topo({}, num_gpus=4)
155
+ assert result["strategy"] is None
156
+ assert result["strategy_detection_method"] is None
157
+
158
+ def test_strategy_detection_method_in_result(self):
159
+ """strategy_detection_method is always present in result."""
160
+ result = self._topo({"DP_SIZE": "4"})
161
+ assert "strategy_detection_method" in result
162
+ assert result["strategy_detection_method"] == "env_degrees"
163
+
164
+
165
+ class TestDetectStrategyHint:
166
+ """code_analyzer.detect_strategy_hint returns correct strategy from AST."""
167
+
168
+ def test_fsdp_script(self, tmp_path):
169
+ script = tmp_path / "train_fsdp.py"
170
+ script.write_text(
171
+ "from torch.distributed.fsdp import FullyShardedDataParallel as FSDP\n"
172
+ "model = FSDP(model)\n"
173
+ )
174
+ from alloc.code_analyzer import detect_strategy_hint
175
+ assert detect_strategy_hint(str(script)) == "fsdp"
176
+
177
+ def test_ddp_script(self, tmp_path):
178
+ script = tmp_path / "train_ddp.py"
179
+ script.write_text(
180
+ "from torch.nn.parallel import DistributedDataParallel as DDP\n"
181
+ "model = DDP(model)\n"
182
+ )
183
+ from alloc.code_analyzer import detect_strategy_hint
184
+ assert detect_strategy_hint(str(script)) == "ddp"
185
+
186
+ def test_fsdp_beats_ddp_when_both_present(self, tmp_path):
187
+ script = tmp_path / "train_both.py"
188
+ script.write_text(
189
+ "from torch.nn.parallel import DistributedDataParallel as DDP\n"
190
+ "from torch.distributed.fsdp import FullyShardedDataParallel as FSDP\n"
191
+ "model = FSDP(model)\n"
192
+ )
193
+ from alloc.code_analyzer import detect_strategy_hint
194
+ assert detect_strategy_hint(str(script)) == "fsdp"
195
+
196
+ def test_no_distributed_returns_none(self, tmp_path):
197
+ script = tmp_path / "train_simple.py"
198
+ script.write_text("import torch\nmodel = torch.nn.Linear(10, 10)\n")
199
+ from alloc.code_analyzer import detect_strategy_hint
200
+ assert detect_strategy_hint(str(script)) is None
201
+
202
+ def test_nonexistent_file_returns_none(self):
203
+ from alloc.code_analyzer import detect_strategy_hint
204
+ assert detect_strategy_hint("/nonexistent/train.py") is None
205
+
206
+
207
+ class TestProcessMapInProbeDictAssembly:
208
+ """process_map should reach probe_dict from ProbeResult."""
209
+
210
+ def test_process_map_present_in_topology_return(self):
211
+ """Topology dict now includes strategy field."""
212
+ with patch.dict(os.environ, {"DP_SIZE": "4"}, clear=False):
213
+ topo = _infer_parallel_topology_from_env(num_gpus_detected=4)
214
+ assert "strategy" in topo
215
+ assert topo["strategy"] == "ddp"
@@ -1,93 +0,0 @@
1
- """Tests for strategy inference from topology degrees (P0-B)."""
2
-
3
- from __future__ import annotations
4
-
5
- import os
6
- from unittest.mock import patch
7
-
8
- from alloc.cli import _infer_parallel_topology_from_env
9
-
10
-
11
- class TestStrategyInference:
12
- """Strategy should be inferred from TP/PP/DP degrees when present."""
13
-
14
- def _topo(self, env=None, num_gpus=4):
15
- env = env or {}
16
- with patch.dict(os.environ, env, clear=False):
17
- return _infer_parallel_topology_from_env(
18
- num_gpus_detected=num_gpus,
19
- )
20
-
21
- def test_no_degrees_multi_gpu_infers_ddp(self):
22
- """When no degree env vars but multiple GPUs detected, infer DDP."""
23
- result = self._topo({}, num_gpus=4)
24
- assert result["strategy"] == "ddp"
25
- assert result["dp_degree"] == 4
26
-
27
- def test_single_gpu_no_degrees_strategy_none(self):
28
- """Single GPU with no degrees → strategy stays None."""
29
- result = self._topo({}, num_gpus=1)
30
- assert result["strategy"] is None
31
-
32
- def test_dp_only_is_ddp(self):
33
- """WORLD_SIZE=4 with no TP/PP → dp inferred → strategy=ddp."""
34
- result = self._topo({"WORLD_SIZE": "4"})
35
- assert result["strategy"] == "ddp"
36
- assert result["dp_degree"] == 4
37
-
38
- def test_tp_only(self):
39
- """TP_SIZE=4 alone → strategy=tp."""
40
- result = self._topo({"TP_SIZE": "4"})
41
- assert result["strategy"] == "tp"
42
-
43
- def test_pp_only(self):
44
- """PP_SIZE=4 alone → strategy=pp."""
45
- result = self._topo({"PP_SIZE": "4"})
46
- assert result["strategy"] == "pp"
47
-
48
- def test_tp_dp(self):
49
- """TP_SIZE=2 with DP_SIZE=2 → strategy=tp+dp."""
50
- result = self._topo({"TP_SIZE": "2", "DP_SIZE": "2"})
51
- assert result["strategy"] == "tp+dp"
52
-
53
- def test_pp_dp(self):
54
- """PP_SIZE=2 with DP_SIZE=2 → strategy=pp+dp."""
55
- result = self._topo({"PP_SIZE": "2", "DP_SIZE": "2"})
56
- assert result["strategy"] == "pp+dp"
57
-
58
- def test_tp_pp_dp(self):
59
- """All three degrees → strategy=tp+pp+dp."""
60
- result = self._topo({"TP_SIZE": "2", "PP_SIZE": "2", "DP_SIZE": "2"})
61
- assert result["strategy"] == "tp+pp+dp"
62
-
63
- def test_tp_pp_no_dp(self):
64
- """TP+PP without explicit DP → strategy=tp+pp+dp."""
65
- result = self._topo({"TP_SIZE": "2", "PP_SIZE": "2"})
66
- assert result["strategy"] == "tp+pp+dp"
67
-
68
- def test_tp_size_1_not_counted(self):
69
- """TP_SIZE=1 should not count as tensor parallelism."""
70
- result = self._topo({"TP_SIZE": "1", "DP_SIZE": "4"})
71
- assert result["strategy"] == "ddp"
72
-
73
- def test_pp_size_1_not_counted(self):
74
- """PP_SIZE=1 should not count as pipeline parallelism."""
75
- result = self._topo({"PP_SIZE": "1", "DP_SIZE": "4"})
76
- assert result["strategy"] == "ddp"
77
-
78
- def test_dp_inferred_from_world_size(self):
79
- """DP inferred from WORLD_SIZE / (TP * PP) → strategy includes dp."""
80
- result = self._topo({"WORLD_SIZE": "8", "TP_SIZE": "2"})
81
- assert result["dp_degree"] == 4
82
- assert result["strategy"] == "tp+dp"
83
-
84
-
85
- class TestProcessMapInProbeDictAssembly:
86
- """process_map should reach probe_dict from ProbeResult."""
87
-
88
- def test_process_map_present_in_topology_return(self):
89
- """Topology dict now includes strategy field."""
90
- with patch.dict(os.environ, {"WORLD_SIZE": "4"}, clear=False):
91
- topo = _infer_parallel_topology_from_env(num_gpus_detected=4)
92
- assert "strategy" in topo
93
- assert topo["strategy"] == "ddp"
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes