alloc 0.0.10.tar.gz → 0.0.12.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60)
  1. {alloc-0.0.10 → alloc-0.0.12}/PKG-INFO +1 -1
  2. {alloc-0.0.10 → alloc-0.0.12}/pyproject.toml +1 -1
  3. {alloc-0.0.10 → alloc-0.0.12}/src/alloc/__init__.py +1 -1
  4. {alloc-0.0.10 → alloc-0.0.12}/src/alloc/cli.py +79 -16
  5. {alloc-0.0.10 → alloc-0.0.12}/src/alloc/code_analyzer.py +28 -0
  6. {alloc-0.0.10 → alloc-0.0.12}/src/alloc/probe.py +32 -7
  7. {alloc-0.0.10 → alloc-0.0.12}/src/alloc.egg-info/PKG-INFO +1 -1
  8. alloc-0.0.12/tests/test_topology_strategy.py +215 -0
  9. alloc-0.0.10/tests/test_topology_strategy.py +0 -93
  10. {alloc-0.0.10 → alloc-0.0.12}/README.md +0 -0
  11. {alloc-0.0.10 → alloc-0.0.12}/setup.cfg +0 -0
  12. {alloc-0.0.10 → alloc-0.0.12}/src/alloc/artifact_loader.py +0 -0
  13. {alloc-0.0.10 → alloc-0.0.12}/src/alloc/artifact_writer.py +0 -0
  14. {alloc-0.0.10 → alloc-0.0.12}/src/alloc/browser_auth.py +0 -0
  15. {alloc-0.0.10 → alloc-0.0.12}/src/alloc/callbacks.py +0 -0
  16. {alloc-0.0.10 → alloc-0.0.12}/src/alloc/catalog/__init__.py +0 -0
  17. {alloc-0.0.10 → alloc-0.0.12}/src/alloc/catalog/default_rate_card.json +0 -0
  18. {alloc-0.0.10 → alloc-0.0.12}/src/alloc/catalog/gpus.v1.json +0 -0
  19. {alloc-0.0.10 → alloc-0.0.12}/src/alloc/config.py +0 -0
  20. {alloc-0.0.10 → alloc-0.0.12}/src/alloc/context.py +0 -0
  21. {alloc-0.0.10 → alloc-0.0.12}/src/alloc/diagnosis_display.py +0 -0
  22. {alloc-0.0.10 → alloc-0.0.12}/src/alloc/diagnosis_engine.py +0 -0
  23. {alloc-0.0.10 → alloc-0.0.12}/src/alloc/diagnosis_rules.py +0 -0
  24. {alloc-0.0.10 → alloc-0.0.12}/src/alloc/display.py +0 -0
  25. {alloc-0.0.10 → alloc-0.0.12}/src/alloc/extractor_runner.py +0 -0
  26. {alloc-0.0.10 → alloc-0.0.12}/src/alloc/ghost.py +0 -0
  27. {alloc-0.0.10 → alloc-0.0.12}/src/alloc/model_extractor.py +0 -0
  28. {alloc-0.0.10 → alloc-0.0.12}/src/alloc/model_registry.py +0 -0
  29. {alloc-0.0.10 → alloc-0.0.12}/src/alloc/stability.py +0 -0
  30. {alloc-0.0.10 → alloc-0.0.12}/src/alloc/upload.py +0 -0
  31. {alloc-0.0.10 → alloc-0.0.12}/src/alloc/yaml_config.py +0 -0
  32. {alloc-0.0.10 → alloc-0.0.12}/src/alloc.egg-info/SOURCES.txt +0 -0
  33. {alloc-0.0.10 → alloc-0.0.12}/src/alloc.egg-info/dependency_links.txt +0 -0
  34. {alloc-0.0.10 → alloc-0.0.12}/src/alloc.egg-info/entry_points.txt +0 -0
  35. {alloc-0.0.10 → alloc-0.0.12}/src/alloc.egg-info/requires.txt +0 -0
  36. {alloc-0.0.10 → alloc-0.0.12}/src/alloc.egg-info/top_level.txt +0 -0
  37. {alloc-0.0.10 → alloc-0.0.12}/tests/test_artifact.py +0 -0
  38. {alloc-0.0.10 → alloc-0.0.12}/tests/test_artifact_loader.py +0 -0
  39. {alloc-0.0.10 → alloc-0.0.12}/tests/test_auth.py +0 -0
  40. {alloc-0.0.10 → alloc-0.0.12}/tests/test_callbacks.py +0 -0
  41. {alloc-0.0.10 → alloc-0.0.12}/tests/test_catalog.py +0 -0
  42. {alloc-0.0.10 → alloc-0.0.12}/tests/test_cli.py +0 -0
  43. {alloc-0.0.10 → alloc-0.0.12}/tests/test_code_analyzer.py +0 -0
  44. {alloc-0.0.10 → alloc-0.0.12}/tests/test_context.py +0 -0
  45. {alloc-0.0.10 → alloc-0.0.12}/tests/test_diagnose_cli.py +0 -0
  46. {alloc-0.0.10 → alloc-0.0.12}/tests/test_diagnosis_engine.py +0 -0
  47. {alloc-0.0.10 → alloc-0.0.12}/tests/test_diagnosis_rules.py +0 -0
  48. {alloc-0.0.10 → alloc-0.0.12}/tests/test_extractor_activation.py +0 -0
  49. {alloc-0.0.10 → alloc-0.0.12}/tests/test_ghost.py +0 -0
  50. {alloc-0.0.10 → alloc-0.0.12}/tests/test_ghost_degradation.py +0 -0
  51. {alloc-0.0.10 → alloc-0.0.12}/tests/test_init_from_org.py +0 -0
  52. {alloc-0.0.10 → alloc-0.0.12}/tests/test_interconnect.py +0 -0
  53. {alloc-0.0.10 → alloc-0.0.12}/tests/test_model_extractor.py +0 -0
  54. {alloc-0.0.10 → alloc-0.0.12}/tests/test_probe_hw.py +0 -0
  55. {alloc-0.0.10 → alloc-0.0.12}/tests/test_probe_multi.py +0 -0
  56. {alloc-0.0.10 → alloc-0.0.12}/tests/test_scan_auth.py +0 -0
  57. {alloc-0.0.10 → alloc-0.0.12}/tests/test_stability.py +0 -0
  58. {alloc-0.0.10 → alloc-0.0.12}/tests/test_upload.py +0 -0
  59. {alloc-0.0.10 → alloc-0.0.12}/tests/test_verdict.py +0 -0
  60. {alloc-0.0.10 → alloc-0.0.12}/tests/test_yaml_config.py +0 -0
{alloc-0.0.10 → alloc-0.0.12}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: alloc
-Version: 0.0.10
+Version: 0.0.12
 Summary: Engineer-first training calibration: estimate VRAM fit, profile short runs, and pick GPU configs under real budget constraints.
 Author-email: Alloc Labs <hello@alloclabs.com>
 License-Expression: Apache-2.0
{alloc-0.0.10 → alloc-0.0.12}/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "alloc"
-version = "0.0.10"
+version = "0.0.12"
 description = "Engineer-first training calibration: estimate VRAM fit, profile short runs, and pick GPU configs under real budget constraints."
 readme = "README.md"
 license = "Apache-2.0"
{alloc-0.0.10 → alloc-0.0.12}/src/alloc/__init__.py
@@ -9,7 +9,7 @@ _warnings.filterwarnings("ignore", category=FutureWarning, module=r"torch\.cuda"
 _warnings.filterwarnings("ignore", category=DeprecationWarning, module=r"torch\.cuda")
 del _warnings
 
-__version__ = "0.0.10"
+__version__ = "0.0.12"
 
 from alloc.ghost import ghost, GhostReport
 from alloc.callbacks import AllocCallback as HuggingFaceCallback
{alloc-0.0.10 → alloc-0.0.12}/src/alloc/cli.py
@@ -328,6 +328,7 @@ def run(
     no_config: bool = typer.Option(False, "--no-config", help="Skip .alloc.yaml (use catalog defaults)"),
     after: Optional[str] = typer.Option(None, "--after", help="Previous run ID to compare against (outcome tracking)"),
     experiment: Optional[str] = typer.Option(None, "--experiment", "-e", help="Experiment group name"),
+    strategy: Optional[str] = typer.Option(None, "--strategy", help="Override detected strategy (ddp, fsdp, deepspeed, tp, pp, etc.)"),
 ):
     """Run a training command with GPU monitoring."""
     from alloc.probe import probe_command
@@ -342,6 +343,18 @@ def run(
         console.print("Usage: alloc run python train.py")
         raise typer.Exit(1)
 
+    # Validate --strategy against API-accepted values
+    _VALID_STRATEGIES = {
+        "ddp", "fsdp", "deepspeed", "tp", "pp",
+        "tp+dp", "pp+dp", "tp+pp+dp", "tp+pp+fsdp",
+    }
+    if strategy and strategy.lower() not in _VALID_STRATEGIES:
+        console.print(
+            f"[red]Invalid --strategy '{strategy}'. "
+            f"Valid values: {', '.join(sorted(_VALID_STRATEGIES))}[/red]"
+        )
+        raise typer.Exit(1)
+
     # ALLOC_POLICY: "warn" or "enforce" forces full monitoring
     alloc_policy = os.environ.get("ALLOC_POLICY", "").lower().strip()
     if alloc_policy and alloc_policy not in ("warn", "enforce"):
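For illustration, a hypothetical invocation of the new flag (the exact passthrough of the training command depends on the CLI's argument handling; this mirrors the "alloc run python train.py" usage string above):

    alloc run --strategy fsdp torchrun --nproc_per_node=4 train.py

Any value outside _VALID_STRATEGIES exits with code 1 before monitoring starts.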
@@ -425,10 +438,25 @@ def run(
     # Discover environment context (git, container, Ray)
     from alloc.context import discover_context
     env_context = discover_context()
+
+    # AST strategy hint: detect FSDP/DDP/DeepSpeed from script source
+    ast_hint = None  # type: Optional[str]
+    try:
+        from alloc.code_analyzer import detect_strategy_hint
+        # Find the .py script in the command (e.g. "python train.py" or "torchrun ... train.py")
+        for arg in command:
+            if arg.endswith(".py") and os.path.isfile(arg):
+                ast_hint = detect_strategy_hint(arg)
+                break
+    except Exception:
+        pass  # Never crash on AST analysis failure
+
     topology = _infer_parallel_topology_from_env(
         num_gpus_detected=result.num_gpus_detected,
         config_interconnect=gpu_context.get("interconnect") if gpu_context else None,
         detected_interconnect=result.detected_interconnect,
+        strategy_override=strategy,
+        ast_strategy_hint=ast_hint,
     )
     objective = os.environ.get("ALLOC_OBJECTIVE", "").strip().lower() or _objective_from_context(gpu_context)
     max_budget_hourly = _max_budget_hourly_from_context(gpu_context)
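To make the scan concrete (hypothetical command): for command = ["torchrun", "--nproc_per_node=4", "train.py"], the loop skips the launcher and its flag, calls detect_strategy_hint("train.py") for the first argument that both ends in ".py" and exists on disk, and the break ensures only that first match is analyzed.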
@@ -456,6 +484,7 @@ def run(
         "pp_degree": topology.get("pp_degree"),
         "dp_degree": topology.get("dp_degree"),
         "strategy": topology.get("strategy"),
+        "strategy_detection_method": topology.get("strategy_detection_method"),
         "interconnect_type": topology.get("interconnect_type"),
         "process_map": result.process_map,
         "objective": objective,
@@ -3522,8 +3551,22 @@ def _print_gpu_context_detail(ctx: dict) -> None:
     console.print(Panel("\n".join(lines), title="GPU Context (.alloc.yaml)", border_style="cyan", padding=(1, 0)))
 
 
-def _infer_parallel_topology_from_env(*, num_gpus_detected: int, config_interconnect: Optional[str] = None, detected_interconnect: Optional[str] = None) -> dict:
-    """Infer distributed topology hints from common launcher env vars."""
+def _infer_parallel_topology_from_env(
+    *,
+    num_gpus_detected: int,
+    config_interconnect: Optional[str] = None,
+    detected_interconnect: Optional[str] = None,
+    strategy_override: Optional[str] = None,
+    ast_strategy_hint: Optional[str] = None,
+) -> dict:
+    """Infer distributed topology hints from common launcher env vars.
+
+    Strategy precedence:
+      1. --strategy override (user explicit)
+      2. AST hint (code_analyzer detected FSDP/DDP/DeepSpeed)
+      3. Env var inference (TP/PP/DP degrees)
+      4. None (unknown — never silently default to ddp)
+    """
 
     def _get_int(name: str) -> Optional[int]:
         val = os.environ.get(name)
@@ -3545,8 +3588,12 @@ def _infer_parallel_topology_from_env(*, num_gpus_detected: int, config_intercon
 
     tp = _get_int("TP_SIZE") or _get_int("TENSOR_PARALLEL_SIZE")
     pp = _get_int("PP_SIZE") or _get_int("PIPELINE_PARALLEL_SIZE")
-    dp = _get_int("DP_SIZE") or _get_int("DATA_PARALLEL_SIZE")
+    dp_explicit = _get_int("DP_SIZE") or _get_int("DATA_PARALLEL_SIZE")
+    dp = dp_explicit
 
+    # Derive dp from WORLD_SIZE when not explicitly set.
+    # This gives us the degree but does NOT imply strategy=ddp
+    # (WORLD_SIZE is set for both DDP and FSDP).
     if dp is None and world_size is not None:
         denom = (tp or 1) * (pp or 1)
         if denom > 0 and world_size % denom == 0:
@@ -3563,26 +3610,41 @@ def _infer_parallel_topology_from_env(*, num_gpus_detected: int, config_intercon
     if interconnect not in ("pcie", "nvlink", "nvlink_switch", "nvlink_p2p", "infiniband", "unknown"):
         interconnect = "unknown"
 
-    # Infer strategy from degrees only when evidence exists
+    # Strategy detection with strict precedence and provenance tracking
     strategy = None
-    has_tp = tp is not None and tp > 1
-    has_pp = pp is not None and pp > 1
-    if has_tp and has_pp:
+    strategy_detection_method = None  # type: Optional[str]
+
+    # 1. Explicit --strategy override
+    if strategy_override:
+        strategy = strategy_override.lower()
+        strategy_detection_method = "user_override"
+    # 2. Env var inference from explicit TP/PP/DP degree env vars
+    #    TP_SIZE/PP_SIZE unambiguously identify the strategy.
+    #    DP_SIZE (explicit) implies DDP. But WORLD_SIZE-derived dp does NOT
+    #    imply DDP — FSDP uses the same WORLD_SIZE.
+    elif tp is not None and tp > 1 and pp is not None and pp > 1:
         strategy = "tp+pp+dp"
-    elif has_tp:
+        strategy_detection_method = "env_degrees"
+    elif tp is not None and tp > 1:
         strategy = "tp+dp" if (dp is not None and dp > 1) else "tp"
-    elif has_pp:
+        strategy_detection_method = "env_degrees"
+    elif pp is not None and pp > 1:
         strategy = "pp+dp" if (dp is not None and dp > 1) else "pp"
-    elif dp is not None and dp > 1:
-        strategy = "ddp"
-    elif strategy is None and num_gpus_detected > 1 and not has_tp and not has_pp:
-        # Multiple GPUs detected via NVML with no TP/PP env vars →
-        # DDP is PyTorch's default and the only realistic inference.
-        # This is NOT the old `or "ddp"` — it only fires when probe
-        # actually observed multiple GPU processes.
+        strategy_detection_method = "env_degrees"
+    elif dp_explicit is not None and dp_explicit > 1:
         strategy = "ddp"
+        strategy_detection_method = "env_degrees"
+    # 3. AST hint (code_analyzer detected FSDP/DDP/DeepSpeed in script)
+    elif ast_strategy_hint and num_gpus_detected > 1:
+        strategy = ast_strategy_hint
+        strategy_detection_method = "ast_analysis"
         if dp is None:
             dp = num_gpus_detected
+    # 4. No trustworthy signal — leave strategy=None
+    #    (Never silently collapse unknown distributed runs to ddp)
+
+    if strategy and dp is None and num_gpus_detected > 1:
+        dp = num_gpus_detected
 
     return {
         "num_nodes": nnodes or 1,
@@ -3592,6 +3654,7 @@ def _infer_parallel_topology_from_env(*, num_gpus_detected: int, config_intercon
         "dp_degree": dp,
         "interconnect_type": interconnect,
         "strategy": strategy,
+        "strategy_detection_method": strategy_detection_method,
     }
 
 
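Taken together, the precedence can be sketched as follows (a minimal illustration mirroring the new tests further down; the env values are hypothetical):

    import os
    from unittest.mock import patch
    from alloc.cli import _infer_parallel_topology_from_env

    with patch.dict(os.environ, {"WORLD_SIZE": "4"}, clear=False):
        topo = _infer_parallel_topology_from_env(num_gpus_detected=4)
        assert topo["strategy"] is None         # WORLD_SIZE alone is ambiguous
        assert topo["dp_degree"] == 4           # the degree is still derived

        topo = _infer_parallel_topology_from_env(
            num_gpus_detected=4, ast_strategy_hint="fsdp",
        )
        assert topo["strategy"] == "fsdp"       # AST hint resolves the ambiguity

        topo = _infer_parallel_topology_from_env(
            num_gpus_detected=4, strategy_override="deepspeed", ast_strategy_hint="fsdp",
        )
        assert topo["strategy"] == "deepspeed"  # explicit override wins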
{alloc-0.0.10 → alloc-0.0.12}/src/alloc/code_analyzer.py
@@ -132,6 +132,34 @@ def analyze_script(script_path: str) -> CodeFindings:
     return findings
 
 
+def detect_strategy_hint(script_path: str) -> Optional[str]:
+    """Lightweight AST check: return strategy kind if detectable, else None.
+
+    Returns one of: 'fsdp', 'ddp', 'deepspeed', 'data_parallel', or None.
+    Never crashes — returns None on any error.
+    """
+    try:
+        if not os.path.isfile(script_path):
+            return None
+        with open(script_path, "r") as f:
+            source = f.read()
+        tree = ast.parse(source, filename=script_path)
+        imports = _walk_imports(tree)
+        distributed = _find_distributed(tree, imports, source.splitlines(), script_path)
+        # Priority: fsdp > deepspeed > ddp
+        # data_parallel is single-process (not a distributed strategy) — ignored.
+        kinds = {d.kind for d in distributed}
+        if "fsdp" in kinds:
+            return "fsdp"
+        if "deepspeed" in kinds:
+            return "deepspeed"
+        if "ddp" in kinds:
+            return "ddp"
+        return None
+    except Exception:
+        return None
+
+
 # ---------------------------------------------------------------------------
 # Import resolution
 # ---------------------------------------------------------------------------
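A short usage sketch (it mirrors the new tests; the temp path and script body are illustrative):

    from alloc.code_analyzer import detect_strategy_hint

    with open("/tmp/train_fsdp.py", "w") as f:
        f.write(
            "from torch.distributed.fsdp import FullyShardedDataParallel as FSDP\n"
            "model = FSDP(model)\n"
        )
    assert detect_strategy_hint("/tmp/train_fsdp.py") == "fsdp"
    assert detect_strategy_hint("/nonexistent/train.py") is None  # never raises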
{alloc-0.0.10 → alloc-0.0.12}/src/alloc/probe.py
@@ -374,12 +374,27 @@ def probe_command(
     """
     pynvml = _try_import_pynvml()
 
-    # Launch the user's training subprocess — do NOT modify env (their warnings matter)
+    # Launch the user's training subprocess.
+    # Suppress only pynvml/torch.cuda FutureWarning noise — these come from
+    # Alloc's own callbacks or from torch internals, not from user code.
+    # Propagates to torchrun children and most Ray workers via env inheritance.
+    child_env = os.environ.copy()
+    existing_pw = child_env.get("PYTHONWARNINGS", "")
+    alloc_filters = (
+        "ignore::FutureWarning:pynvml,"
+        "ignore::DeprecationWarning:pynvml,"
+        "ignore::FutureWarning:torch.cuda,"
+        "ignore::DeprecationWarning:torch.cuda"
+    )
+    child_env["PYTHONWARNINGS"] = (
+        f"{existing_pw},{alloc_filters}" if existing_pw else alloc_filters
+    )
     try:
         proc = subprocess.Popen(
             command,
             stdout=sys.stdout,
             stderr=sys.stderr,
+            env=child_env,
         )
     except Exception as e:
         return ProbeResult(
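As a concrete example of the merge (assuming the parent shell already exports PYTHONWARNINGS=error::UserWarning), the child process would see:

    PYTHONWARNINGS=error::UserWarning,ignore::FutureWarning:pynvml,ignore::DeprecationWarning:pynvml,ignore::FutureWarning:torch.cuda,ignore::DeprecationWarning:torch.cuda

i.e. Alloc's module-scoped filters are appended after any user-defined ones rather than replacing them.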
@@ -495,6 +510,16 @@ def probe_command(
                 pmap.append({"gpu_index": idx})
             num_gpus_ref[0] = len(handles)
             process_map_ref[0] = pmap
+            # Immediately sample all discovered GPUs so per_gpu_peaks
+            # is populated even if the process exits right after discovery
+            pgp = per_gpu_peaks_ref[0]
+            for gi, h in enumerate(handles):
+                try:
+                    mi = pynvml.nvmlDeviceGetMemoryInfo(h)
+                    vm = mi.used / (1024 * 1024)
+                    pgp[gi] = max(pgp.get(gi, 0.0), vm)
+                except Exception:
+                    pass
         except Exception:
             pass
         # Detect interconnect type between discovered GPUs
@@ -518,11 +543,11 @@ def probe_command(
                     power_vals.append(pw)
                 total_mb = mi.total / (1024 * 1024)
 
-                # Track per-GPU peak VRAM for multi-GPU runs
-                if len(handles) > 1:
-                    pgp = per_gpu_peaks_ref[0]
-                    for gi, vm in enumerate(vram_vals):
-                        pgp[gi] = max(pgp.get(gi, 0.0), vm)
+                # Track per-GPU peak VRAM (always, even single GPU
+                # discovery may expand handles later, and we need history from sample 0)
+                pgp = per_gpu_peaks_ref[0]
+                for gi, vm in enumerate(vram_vals):
+                    pgp[gi] = max(pgp.get(gi, 0.0), vm)
 
                 samples.append(ProbeSample(
                     timestamp=time.time(),
@@ -675,7 +700,7 @@ def probe_command(
         process_map=process_map_ref[0],
         per_gpu_peak_vram_mb=(
             [round(per_gpu_peaks_ref[0].get(i, 0), 1) for i in range(num_gpus_ref[0])]
-            if len(per_gpu_peaks_ref[0]) > 1 else None
+            if num_gpus_ref[0] > 1 and per_gpu_peaks_ref[0] else None
         ),
         detected_interconnect=detected_ic_ref[0],
     )
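A small sketch of why the guard changed (hypothetical values): on a 2-GPU box where only GPU 0 was sampled before the process exited,

    per_gpu_peaks = {0: 1024.0}   # only GPU 0 ever sampled
    num_gpus = 2
    old_gate = len(per_gpu_peaks) > 1                # False: field was silently dropped
    new_gate = num_gpus > 1 and bool(per_gpu_peaks)  # True: per-GPU peaks now reported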
{alloc-0.0.10 → alloc-0.0.12}/src/alloc.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: alloc
-Version: 0.0.10
+Version: 0.0.12
 Summary: Engineer-first training calibration: estimate VRAM fit, profile short runs, and pick GPU configs under real budget constraints.
 Author-email: Alloc Labs <hello@alloclabs.com>
 License-Expression: Apache-2.0
alloc-0.0.12/tests/test_topology_strategy.py
@@ -0,0 +1,215 @@
+"""Tests for strategy inference from topology degrees (P0-B)."""
+
+from __future__ import annotations
+
+import os
+from unittest.mock import patch
+
+from alloc.cli import _infer_parallel_topology_from_env
+
+
+class TestStrategyInference:
+    """Strategy should be inferred from TP/PP/DP degrees when present."""
+
+    def _topo(self, env=None, num_gpus=4, **kwargs):
+        env = env or {}
+        with patch.dict(os.environ, env, clear=False):
+            return _infer_parallel_topology_from_env(
+                num_gpus_detected=num_gpus,
+                **kwargs,
+            )
+
+    def test_no_degrees_multi_gpu_strategy_none(self):
+        """When no degree env vars and no AST hint, strategy stays None."""
+        result = self._topo({}, num_gpus=4)
+        assert result["strategy"] is None
+        assert result["strategy_detection_method"] is None
+
+    def test_single_gpu_no_degrees_strategy_none(self):
+        """Single GPU with no degrees → strategy stays None."""
+        result = self._topo({}, num_gpus=1)
+        assert result["strategy"] is None
+
+    def test_world_size_only_strategy_none(self):
+        """WORLD_SIZE=4 with no explicit DP_SIZE → strategy=None (ambiguous)."""
+        result = self._topo({"WORLD_SIZE": "4"})
+        assert result["strategy"] is None
+        assert result["dp_degree"] == 4  # degree still derived for topology
+
+    def test_explicit_dp_size_is_ddp(self):
+        """Explicit DP_SIZE=4 → strategy=ddp."""
+        result = self._topo({"DP_SIZE": "4"})
+        assert result["strategy"] == "ddp"
+        assert result["dp_degree"] == 4
+
+    def test_tp_only(self):
+        """TP_SIZE=4 alone → strategy=tp."""
+        result = self._topo({"TP_SIZE": "4"})
+        assert result["strategy"] == "tp"
+
+    def test_pp_only(self):
+        """PP_SIZE=4 alone → strategy=pp."""
+        result = self._topo({"PP_SIZE": "4"})
+        assert result["strategy"] == "pp"
+
+    def test_tp_dp(self):
+        """TP_SIZE=2 with DP_SIZE=2 → strategy=tp+dp."""
+        result = self._topo({"TP_SIZE": "2", "DP_SIZE": "2"})
+        assert result["strategy"] == "tp+dp"
+
+    def test_pp_dp(self):
+        """PP_SIZE=2 with DP_SIZE=2 → strategy=pp+dp."""
+        result = self._topo({"PP_SIZE": "2", "DP_SIZE": "2"})
+        assert result["strategy"] == "pp+dp"
+
+    def test_tp_pp_dp(self):
+        """All three degrees → strategy=tp+pp+dp."""
+        result = self._topo({"TP_SIZE": "2", "PP_SIZE": "2", "DP_SIZE": "2"})
+        assert result["strategy"] == "tp+pp+dp"
+
+    def test_tp_pp_no_dp(self):
+        """TP+PP without explicit DP → strategy=tp+pp+dp."""
+        result = self._topo({"TP_SIZE": "2", "PP_SIZE": "2"})
+        assert result["strategy"] == "tp+pp+dp"
+
+    def test_tp_size_1_not_counted(self):
+        """TP_SIZE=1 should not count as tensor parallelism."""
+        result = self._topo({"TP_SIZE": "1", "DP_SIZE": "4"})
+        assert result["strategy"] == "ddp"
+
+    def test_pp_size_1_not_counted(self):
+        """PP_SIZE=1 should not count as pipeline parallelism."""
+        result = self._topo({"PP_SIZE": "1", "DP_SIZE": "4"})
+        assert result["strategy"] == "ddp"
+
+    def test_dp_inferred_from_world_size(self):
+        """DP inferred from WORLD_SIZE / (TP * PP) → strategy includes dp."""
+        result = self._topo({"WORLD_SIZE": "8", "TP_SIZE": "2"})
+        assert result["dp_degree"] == 4
+        assert result["strategy"] == "tp+dp"
+
+
+class TestStrategyOverride:
+    """--strategy override takes highest precedence."""
+
+    def _topo(self, env=None, num_gpus=4, **kwargs):
+        env = env or {}
+        with patch.dict(os.environ, env, clear=False):
+            return _infer_parallel_topology_from_env(
+                num_gpus_detected=num_gpus,
+                **kwargs,
+            )
+
+    def test_override_beats_env_degrees(self):
+        """Explicit --strategy overrides env var inference."""
+        result = self._topo({"WORLD_SIZE": "4"}, strategy_override="fsdp")
+        assert result["strategy"] == "fsdp"
+        assert result["strategy_detection_method"] == "user_override"
+
+    def test_override_beats_ast_hint(self):
+        """Explicit --strategy overrides AST hint."""
+        result = self._topo({}, num_gpus=2, strategy_override="fsdp", ast_strategy_hint="ddp")
+        assert result["strategy"] == "fsdp"
+        assert result["strategy_detection_method"] == "user_override"
+
+    def test_ast_hint_used_when_multi_gpu(self):
+        """AST hint used when no env degrees and multi-GPU."""
+        result = self._topo({}, num_gpus=2, ast_strategy_hint="fsdp")
+        assert result["strategy"] == "fsdp"
+        assert result["strategy_detection_method"] == "ast_analysis"
+        assert result["dp_degree"] == 2
+
+    def test_ast_hint_ignored_when_single_gpu(self):
+        """AST hint ignored for single GPU — no distributed strategy applies."""
+        result = self._topo({}, num_gpus=1, ast_strategy_hint="fsdp")
+        assert result["strategy"] is None
+
+    def test_env_degrees_beat_ast_hint(self):
+        """Env var TP/PP degrees take precedence over AST hint."""
+        result = self._topo({"TP_SIZE": "2", "DP_SIZE": "2"}, ast_strategy_hint="fsdp")
+        assert result["strategy"] == "tp+dp"
+        assert result["strategy_detection_method"] == "env_degrees"
+
+    def test_world_size_plus_ast_fsdp_returns_fsdp(self):
+        """WORLD_SIZE=2 + ast_hint='fsdp' → strategy='fsdp' (real torchrun FSDP case)."""
+        result = self._topo({"WORLD_SIZE": "2"}, num_gpus=2, ast_strategy_hint="fsdp")
+        assert result["strategy"] == "fsdp"
+        assert result["strategy_detection_method"] == "ast_analysis"
+        assert result["dp_degree"] == 2
+
+    def test_world_size_plus_ast_deepspeed_returns_deepspeed(self):
+        """WORLD_SIZE=4 + ast_hint='deepspeed' → strategy='deepspeed'."""
+        result = self._topo({"WORLD_SIZE": "4"}, num_gpus=4, ast_strategy_hint="deepspeed")
+        assert result["strategy"] == "deepspeed"
+        assert result["strategy_detection_method"] == "ast_analysis"
+
+    def test_world_size_only_no_hint_stays_none(self):
+        """WORLD_SIZE=2 with no AST hint → strategy=None (ambiguous)."""
+        result = self._topo({"WORLD_SIZE": "2"}, num_gpus=2)
+        assert result["strategy"] is None
+        assert result["dp_degree"] == 2
+
+    def test_unknown_multi_gpu_stays_none(self):
+        """Multi-GPU with no hint and no env vars → strategy=None, not ddp."""
+        result = self._topo({}, num_gpus=4)
+        assert result["strategy"] is None
+        assert result["strategy_detection_method"] is None
+
+    def test_strategy_detection_method_in_result(self):
+        """strategy_detection_method is always present in result."""
+        result = self._topo({"DP_SIZE": "4"})
+        assert "strategy_detection_method" in result
+        assert result["strategy_detection_method"] == "env_degrees"
+
+
+class TestDetectStrategyHint:
+    """code_analyzer.detect_strategy_hint returns correct strategy from AST."""
+
+    def test_fsdp_script(self, tmp_path):
+        script = tmp_path / "train_fsdp.py"
+        script.write_text(
+            "from torch.distributed.fsdp import FullyShardedDataParallel as FSDP\n"
+            "model = FSDP(model)\n"
+        )
+        from alloc.code_analyzer import detect_strategy_hint
+        assert detect_strategy_hint(str(script)) == "fsdp"
+
+    def test_ddp_script(self, tmp_path):
+        script = tmp_path / "train_ddp.py"
+        script.write_text(
+            "from torch.nn.parallel import DistributedDataParallel as DDP\n"
+            "model = DDP(model)\n"
+        )
+        from alloc.code_analyzer import detect_strategy_hint
+        assert detect_strategy_hint(str(script)) == "ddp"
+
+    def test_fsdp_beats_ddp_when_both_present(self, tmp_path):
+        script = tmp_path / "train_both.py"
+        script.write_text(
+            "from torch.nn.parallel import DistributedDataParallel as DDP\n"
+            "from torch.distributed.fsdp import FullyShardedDataParallel as FSDP\n"
+            "model = FSDP(model)\n"
+        )
+        from alloc.code_analyzer import detect_strategy_hint
+        assert detect_strategy_hint(str(script)) == "fsdp"
+
+    def test_no_distributed_returns_none(self, tmp_path):
+        script = tmp_path / "train_simple.py"
+        script.write_text("import torch\nmodel = torch.nn.Linear(10, 10)\n")
+        from alloc.code_analyzer import detect_strategy_hint
+        assert detect_strategy_hint(str(script)) is None
+
+    def test_nonexistent_file_returns_none(self):
+        from alloc.code_analyzer import detect_strategy_hint
+        assert detect_strategy_hint("/nonexistent/train.py") is None
+
+
+class TestProcessMapInProbeDictAssembly:
+    """process_map should reach probe_dict from ProbeResult."""
+
+    def test_process_map_present_in_topology_return(self):
+        """Topology dict now includes strategy field."""
+        with patch.dict(os.environ, {"DP_SIZE": "4"}, clear=False):
+            topo = _infer_parallel_topology_from_env(num_gpus_detected=4)
+            assert "strategy" in topo
+            assert topo["strategy"] == "ddp"
alloc-0.0.10/tests/test_topology_strategy.py
@@ -1,93 +0,0 @@
-"""Tests for strategy inference from topology degrees (P0-B)."""
-
-from __future__ import annotations
-
-import os
-from unittest.mock import patch
-
-from alloc.cli import _infer_parallel_topology_from_env
-
-
-class TestStrategyInference:
-    """Strategy should be inferred from TP/PP/DP degrees when present."""
-
-    def _topo(self, env=None, num_gpus=4):
-        env = env or {}
-        with patch.dict(os.environ, env, clear=False):
-            return _infer_parallel_topology_from_env(
-                num_gpus_detected=num_gpus,
-            )
-
-    def test_no_degrees_multi_gpu_infers_ddp(self):
-        """When no degree env vars but multiple GPUs detected, infer DDP."""
-        result = self._topo({}, num_gpus=4)
-        assert result["strategy"] == "ddp"
-        assert result["dp_degree"] == 4
-
-    def test_single_gpu_no_degrees_strategy_none(self):
-        """Single GPU with no degrees → strategy stays None."""
-        result = self._topo({}, num_gpus=1)
-        assert result["strategy"] is None
-
-    def test_dp_only_is_ddp(self):
-        """WORLD_SIZE=4 with no TP/PP → dp inferred → strategy=ddp."""
-        result = self._topo({"WORLD_SIZE": "4"})
-        assert result["strategy"] == "ddp"
-        assert result["dp_degree"] == 4
-
-    def test_tp_only(self):
-        """TP_SIZE=4 alone → strategy=tp."""
-        result = self._topo({"TP_SIZE": "4"})
-        assert result["strategy"] == "tp"
-
-    def test_pp_only(self):
-        """PP_SIZE=4 alone → strategy=pp."""
-        result = self._topo({"PP_SIZE": "4"})
-        assert result["strategy"] == "pp"
-
-    def test_tp_dp(self):
-        """TP_SIZE=2 with DP_SIZE=2 → strategy=tp+dp."""
-        result = self._topo({"TP_SIZE": "2", "DP_SIZE": "2"})
-        assert result["strategy"] == "tp+dp"
-
-    def test_pp_dp(self):
-        """PP_SIZE=2 with DP_SIZE=2 → strategy=pp+dp."""
-        result = self._topo({"PP_SIZE": "2", "DP_SIZE": "2"})
-        assert result["strategy"] == "pp+dp"
-
-    def test_tp_pp_dp(self):
-        """All three degrees → strategy=tp+pp+dp."""
-        result = self._topo({"TP_SIZE": "2", "PP_SIZE": "2", "DP_SIZE": "2"})
-        assert result["strategy"] == "tp+pp+dp"
-
-    def test_tp_pp_no_dp(self):
-        """TP+PP without explicit DP → strategy=tp+pp+dp."""
-        result = self._topo({"TP_SIZE": "2", "PP_SIZE": "2"})
-        assert result["strategy"] == "tp+pp+dp"
-
-    def test_tp_size_1_not_counted(self):
-        """TP_SIZE=1 should not count as tensor parallelism."""
-        result = self._topo({"TP_SIZE": "1", "DP_SIZE": "4"})
-        assert result["strategy"] == "ddp"
-
-    def test_pp_size_1_not_counted(self):
-        """PP_SIZE=1 should not count as pipeline parallelism."""
-        result = self._topo({"PP_SIZE": "1", "DP_SIZE": "4"})
-        assert result["strategy"] == "ddp"
-
-    def test_dp_inferred_from_world_size(self):
-        """DP inferred from WORLD_SIZE / (TP * PP) → strategy includes dp."""
-        result = self._topo({"WORLD_SIZE": "8", "TP_SIZE": "2"})
-        assert result["dp_degree"] == 4
-        assert result["strategy"] == "tp+dp"
-
-
-class TestProcessMapInProbeDictAssembly:
-    """process_map should reach probe_dict from ProbeResult."""
-
-    def test_process_map_present_in_topology_return(self):
-        """Topology dict now includes strategy field."""
-        with patch.dict(os.environ, {"WORLD_SIZE": "4"}, clear=False):
-            topo = _infer_parallel_topology_from_env(num_gpus_detected=4)
-            assert "strategy" in topo
-            assert topo["strategy"] == "ddp"