alloc-0.0.10.tar.gz → alloc-0.0.11.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {alloc-0.0.10 → alloc-0.0.11}/PKG-INFO +1 -1
- {alloc-0.0.10 → alloc-0.0.11}/pyproject.toml +1 -1
- {alloc-0.0.10 → alloc-0.0.11}/src/alloc/__init__.py +1 -1
- {alloc-0.0.10 → alloc-0.0.11}/src/alloc/cli.py +79 -16
- {alloc-0.0.10 → alloc-0.0.11}/src/alloc/code_analyzer.py +28 -0
- {alloc-0.0.10 → alloc-0.0.11}/src/alloc/probe.py +22 -7
- {alloc-0.0.10 → alloc-0.0.11}/src/alloc.egg-info/PKG-INFO +1 -1
- alloc-0.0.11/tests/test_topology_strategy.py +215 -0
- alloc-0.0.10/tests/test_topology_strategy.py +0 -93
- {alloc-0.0.10 → alloc-0.0.11}/README.md +0 -0
- {alloc-0.0.10 → alloc-0.0.11}/setup.cfg +0 -0
- {alloc-0.0.10 → alloc-0.0.11}/src/alloc/artifact_loader.py +0 -0
- {alloc-0.0.10 → alloc-0.0.11}/src/alloc/artifact_writer.py +0 -0
- {alloc-0.0.10 → alloc-0.0.11}/src/alloc/browser_auth.py +0 -0
- {alloc-0.0.10 → alloc-0.0.11}/src/alloc/callbacks.py +0 -0
- {alloc-0.0.10 → alloc-0.0.11}/src/alloc/catalog/__init__.py +0 -0
- {alloc-0.0.10 → alloc-0.0.11}/src/alloc/catalog/default_rate_card.json +0 -0
- {alloc-0.0.10 → alloc-0.0.11}/src/alloc/catalog/gpus.v1.json +0 -0
- {alloc-0.0.10 → alloc-0.0.11}/src/alloc/config.py +0 -0
- {alloc-0.0.10 → alloc-0.0.11}/src/alloc/context.py +0 -0
- {alloc-0.0.10 → alloc-0.0.11}/src/alloc/diagnosis_display.py +0 -0
- {alloc-0.0.10 → alloc-0.0.11}/src/alloc/diagnosis_engine.py +0 -0
- {alloc-0.0.10 → alloc-0.0.11}/src/alloc/diagnosis_rules.py +0 -0
- {alloc-0.0.10 → alloc-0.0.11}/src/alloc/display.py +0 -0
- {alloc-0.0.10 → alloc-0.0.11}/src/alloc/extractor_runner.py +0 -0
- {alloc-0.0.10 → alloc-0.0.11}/src/alloc/ghost.py +0 -0
- {alloc-0.0.10 → alloc-0.0.11}/src/alloc/model_extractor.py +0 -0
- {alloc-0.0.10 → alloc-0.0.11}/src/alloc/model_registry.py +0 -0
- {alloc-0.0.10 → alloc-0.0.11}/src/alloc/stability.py +0 -0
- {alloc-0.0.10 → alloc-0.0.11}/src/alloc/upload.py +0 -0
- {alloc-0.0.10 → alloc-0.0.11}/src/alloc/yaml_config.py +0 -0
- {alloc-0.0.10 → alloc-0.0.11}/src/alloc.egg-info/SOURCES.txt +0 -0
- {alloc-0.0.10 → alloc-0.0.11}/src/alloc.egg-info/dependency_links.txt +0 -0
- {alloc-0.0.10 → alloc-0.0.11}/src/alloc.egg-info/entry_points.txt +0 -0
- {alloc-0.0.10 → alloc-0.0.11}/src/alloc.egg-info/requires.txt +0 -0
- {alloc-0.0.10 → alloc-0.0.11}/src/alloc.egg-info/top_level.txt +0 -0
- {alloc-0.0.10 → alloc-0.0.11}/tests/test_artifact.py +0 -0
- {alloc-0.0.10 → alloc-0.0.11}/tests/test_artifact_loader.py +0 -0
- {alloc-0.0.10 → alloc-0.0.11}/tests/test_auth.py +0 -0
- {alloc-0.0.10 → alloc-0.0.11}/tests/test_callbacks.py +0 -0
- {alloc-0.0.10 → alloc-0.0.11}/tests/test_catalog.py +0 -0
- {alloc-0.0.10 → alloc-0.0.11}/tests/test_cli.py +0 -0
- {alloc-0.0.10 → alloc-0.0.11}/tests/test_code_analyzer.py +0 -0
- {alloc-0.0.10 → alloc-0.0.11}/tests/test_context.py +0 -0
- {alloc-0.0.10 → alloc-0.0.11}/tests/test_diagnose_cli.py +0 -0
- {alloc-0.0.10 → alloc-0.0.11}/tests/test_diagnosis_engine.py +0 -0
- {alloc-0.0.10 → alloc-0.0.11}/tests/test_diagnosis_rules.py +0 -0
- {alloc-0.0.10 → alloc-0.0.11}/tests/test_extractor_activation.py +0 -0
- {alloc-0.0.10 → alloc-0.0.11}/tests/test_ghost.py +0 -0
- {alloc-0.0.10 → alloc-0.0.11}/tests/test_ghost_degradation.py +0 -0
- {alloc-0.0.10 → alloc-0.0.11}/tests/test_init_from_org.py +0 -0
- {alloc-0.0.10 → alloc-0.0.11}/tests/test_interconnect.py +0 -0
- {alloc-0.0.10 → alloc-0.0.11}/tests/test_model_extractor.py +0 -0
- {alloc-0.0.10 → alloc-0.0.11}/tests/test_probe_hw.py +0 -0
- {alloc-0.0.10 → alloc-0.0.11}/tests/test_probe_multi.py +0 -0
- {alloc-0.0.10 → alloc-0.0.11}/tests/test_scan_auth.py +0 -0
- {alloc-0.0.10 → alloc-0.0.11}/tests/test_stability.py +0 -0
- {alloc-0.0.10 → alloc-0.0.11}/tests/test_upload.py +0 -0
- {alloc-0.0.10 → alloc-0.0.11}/tests/test_verdict.py +0 -0
- {alloc-0.0.10 → alloc-0.0.11}/tests/test_yaml_config.py +0 -0
--- alloc-0.0.10/PKG-INFO
+++ alloc-0.0.11/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: alloc
-Version: 0.0.10
+Version: 0.0.11
 Summary: Engineer-first training calibration: estimate VRAM fit, profile short runs, and pick GPU configs under real budget constraints.
 Author-email: Alloc Labs <hello@alloclabs.com>
 License-Expression: Apache-2.0
--- alloc-0.0.10/pyproject.toml
+++ alloc-0.0.11/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "alloc"
-version = "0.0.10"
+version = "0.0.11"
 description = "Engineer-first training calibration: estimate VRAM fit, profile short runs, and pick GPU configs under real budget constraints."
 readme = "README.md"
 license = "Apache-2.0"
--- alloc-0.0.10/src/alloc/__init__.py
+++ alloc-0.0.11/src/alloc/__init__.py
@@ -9,7 +9,7 @@ _warnings.filterwarnings("ignore", category=FutureWarning, module=r"torch\.cuda")
 _warnings.filterwarnings("ignore", category=DeprecationWarning, module=r"torch\.cuda")
 del _warnings
 
-__version__ = "0.0.10"
+__version__ = "0.0.11"
 
 from alloc.ghost import ghost, GhostReport
 from alloc.callbacks import AllocCallback as HuggingFaceCallback
--- alloc-0.0.10/src/alloc/cli.py
+++ alloc-0.0.11/src/alloc/cli.py
@@ -328,6 +328,7 @@ def run(
     no_config: bool = typer.Option(False, "--no-config", help="Skip .alloc.yaml (use catalog defaults)"),
     after: Optional[str] = typer.Option(None, "--after", help="Previous run ID to compare against (outcome tracking)"),
     experiment: Optional[str] = typer.Option(None, "--experiment", "-e", help="Experiment group name"),
+    strategy: Optional[str] = typer.Option(None, "--strategy", help="Override detected strategy (ddp, fsdp, deepspeed, tp, pp, etc.)"),
 ):
     """Run a training command with GPU monitoring."""
     from alloc.probe import probe_command
@@ -342,6 +343,18 @@ def run(
         console.print("Usage: alloc run python train.py")
         raise typer.Exit(1)
 
+    # Validate --strategy against API-accepted values
+    _VALID_STRATEGIES = {
+        "ddp", "fsdp", "deepspeed", "tp", "pp",
+        "tp+dp", "pp+dp", "tp+pp+dp", "tp+pp+fsdp",
+    }
+    if strategy and strategy.lower() not in _VALID_STRATEGIES:
+        console.print(
+            f"[red]Invalid --strategy '{strategy}'. "
+            f"Valid values: {', '.join(sorted(_VALID_STRATEGIES))}[/red]"
+        )
+        raise typer.Exit(1)
+
     # ALLOC_POLICY: "warn" or "enforce" forces full monitoring
     alloc_policy = os.environ.get("ALLOC_POLICY", "").lower().strip()
     if alloc_policy and alloc_policy not in ("warn", "enforce"):
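The gate above is case-insensitive and fails fast, before any training subprocess is launched. A minimal sketch of the same check in isolation (set contents copied from the hunk; the loop and prints are illustrative only, not part of the package):

    _VALID_STRATEGIES = {
        "ddp", "fsdp", "deepspeed", "tp", "pp",
        "tp+dp", "pp+dp", "tp+pp+dp", "tp+pp+fsdp",
    }
    for candidate in ("FSDP", "zero3"):
        ok = candidate.lower() in _VALID_STRATEGIES
        print(candidate, "accepted" if ok else "rejected")
    # FSDP accepted, zero3 rejected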
@@ -425,10 +438,25 @@ def run(
     # Discover environment context (git, container, Ray)
     from alloc.context import discover_context
     env_context = discover_context()
+
+    # AST strategy hint: detect FSDP/DDP/DeepSpeed from script source
+    ast_hint = None  # type: Optional[str]
+    try:
+        from alloc.code_analyzer import detect_strategy_hint
+        # Find the .py script in the command (e.g. "python train.py" or "torchrun ... train.py")
+        for arg in command:
+            if arg.endswith(".py") and os.path.isfile(arg):
+                ast_hint = detect_strategy_hint(arg)
+                break
+    except Exception:
+        pass  # Never crash on AST analysis failure
+
     topology = _infer_parallel_topology_from_env(
         num_gpus_detected=result.num_gpus_detected,
         config_interconnect=gpu_context.get("interconnect") if gpu_context else None,
         detected_interconnect=result.detected_interconnect,
+        strategy_override=strategy,
+        ast_strategy_hint=ast_hint,
     )
     objective = os.environ.get("ALLOC_OBJECTIVE", "").strip().lower() or _objective_from_context(gpu_context)
     max_budget_hourly = _max_budget_hourly_from_context(gpu_context)
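For illustration, the script-discovery loop in isolation, with a hypothetical launcher command (the first .py argument that exists on disk wins; detect_strategy_hint is the new helper in code_analyzer.py further down):

    import os

    command = ["torchrun", "--nproc_per_node=2", "train.py", "--epochs", "3"]
    script = next(
        (arg for arg in command if arg.endswith(".py") and os.path.isfile(arg)),
        None,
    )
    # If train.py exists in the working directory, script == "train.py" and
    # ast_hint = detect_strategy_hint(script); otherwise the hint stays None.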
@@ -456,6 +484,7 @@ def run(
         "pp_degree": topology.get("pp_degree"),
         "dp_degree": topology.get("dp_degree"),
         "strategy": topology.get("strategy"),
+        "strategy_detection_method": topology.get("strategy_detection_method"),
         "interconnect_type": topology.get("interconnect_type"),
         "process_map": result.process_map,
         "objective": objective,
@@ -3522,8 +3551,22 @@ def _print_gpu_context_detail(ctx: dict) -> None:
     console.print(Panel("\n".join(lines), title="GPU Context (.alloc.yaml)", border_style="cyan", padding=(1, 0)))
 
 
-def _infer_parallel_topology_from_env(*, num_gpus_detected: int, config_interconnect: Optional[str] = None, detected_interconnect: Optional[str] = None) -> dict:
-
+def _infer_parallel_topology_from_env(
+    *,
+    num_gpus_detected: int,
+    config_interconnect: Optional[str] = None,
+    detected_interconnect: Optional[str] = None,
+    strategy_override: Optional[str] = None,
+    ast_strategy_hint: Optional[str] = None,
+) -> dict:
+    """Infer distributed topology hints from common launcher env vars.
+
+    Strategy precedence:
+    1. --strategy override (user explicit)
+    2. AST hint (code_analyzer detected FSDP/DDP/DeepSpeed)
+    3. Env var inference (TP/PP/DP degrees)
+    4. None (unknown — never silently default to ddp)
+    """
 
     def _get_int(name: str) -> Optional[int]:
         val = os.environ.get(name)
@@ -3545,8 +3588,12 @@ def _infer_parallel_topology_from_env(*, num_gpus_detected: int, config_intercon
 
     tp = _get_int("TP_SIZE") or _get_int("TENSOR_PARALLEL_SIZE")
     pp = _get_int("PP_SIZE") or _get_int("PIPELINE_PARALLEL_SIZE")
-    dp = _get_int("DP_SIZE") or _get_int("DATA_PARALLEL_SIZE")
+    dp_explicit = _get_int("DP_SIZE") or _get_int("DATA_PARALLEL_SIZE")
+    dp = dp_explicit
 
+    # Derive dp from WORLD_SIZE when not explicitly set.
+    # This gives us the degree but does NOT imply strategy=ddp
+    # (WORLD_SIZE is set for both DDP and FSDP).
     if dp is None and world_size is not None:
         denom = (tp or 1) * (pp or 1)
         if denom > 0 and world_size % denom == 0:
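A worked example of the derivation: with WORLD_SIZE=8, TP_SIZE=2, and no PP, dp = 8 / (2 * 1) = 4. The sketch below mirrors only the arithmetic; in the new code the derived degree no longer implies a ddp strategy:

    world_size, tp, pp = 8, 2, None
    denom = (tp or 1) * (pp or 1)  # 2
    dp = world_size // denom if world_size % denom == 0 else None  # 4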
@@ -3563,26 +3610,41 @@ def _infer_parallel_topology_from_env(*, num_gpus_detected: int, config_intercon
     if interconnect not in ("pcie", "nvlink", "nvlink_switch", "nvlink_p2p", "infiniband", "unknown"):
         interconnect = "unknown"
 
-    #
+    # Strategy detection with strict precedence and provenance tracking
     strategy = None
-
-
-
+    strategy_detection_method = None  # type: Optional[str]
+
+    # 1. Explicit --strategy override
+    if strategy_override:
+        strategy = strategy_override.lower()
+        strategy_detection_method = "user_override"
+    # 2. Env var inference from explicit TP/PP/DP degree env vars
+    # TP_SIZE/PP_SIZE unambiguously identify the strategy.
+    # DP_SIZE (explicit) implies DDP. But WORLD_SIZE-derived dp does NOT
+    # imply DDP — FSDP uses the same WORLD_SIZE.
+    elif tp is not None and tp > 1 and pp is not None and pp > 1:
         strategy = "tp+pp+dp"
-
+        strategy_detection_method = "env_degrees"
+    elif tp is not None and tp > 1:
         strategy = "tp+dp" if (dp is not None and dp > 1) else "tp"
-
+        strategy_detection_method = "env_degrees"
+    elif pp is not None and pp > 1:
         strategy = "pp+dp" if (dp is not None and dp > 1) else "pp"
-
-
-    elif strategy is None and num_gpus_detected > 1 and not has_tp and not has_pp:
-        # Multiple GPUs detected via NVML with no TP/PP env vars →
-        # DDP is PyTorch's default and the only realistic inference.
-        # This is NOT the old `or "ddp"` — it only fires when probe
-        # actually observed multiple GPU processes.
+        strategy_detection_method = "env_degrees"
+    elif dp_explicit is not None and dp_explicit > 1:
         strategy = "ddp"
+        strategy_detection_method = "env_degrees"
+    # 3. AST hint (code_analyzer detected FSDP/DDP/DeepSpeed in script)
+    elif ast_strategy_hint and num_gpus_detected > 1:
+        strategy = ast_strategy_hint
+        strategy_detection_method = "ast_analysis"
         if dp is None:
             dp = num_gpus_detected
+    # 4. No trustworthy signal — leave strategy=None
+    # (Never silently collapse unknown distributed runs to ddp)
+
+    if strategy and dp is None and num_gpus_detected > 1:
+        dp = num_gpus_detected
 
     return {
         "num_nodes": nnodes or 1,
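To make the four-tier precedence concrete, a simplified standalone mirror (resolve_strategy is a hypothetical name for this sketch; the real function also handles pp, composite tp/pp strategies, and interconnect):

    from typing import Optional, Tuple

    def resolve_strategy(
        override: Optional[str],
        tp: Optional[int],
        dp_explicit: Optional[int],
        ast_hint: Optional[str],
        num_gpus: int,
    ) -> Tuple[Optional[str], Optional[str]]:
        # Same ordering as _infer_parallel_topology_from_env, reduced to tp/dp.
        if override:
            return override.lower(), "user_override"
        if tp is not None and tp > 1:
            return "tp", "env_degrees"
        if dp_explicit is not None and dp_explicit > 1:
            return "ddp", "env_degrees"
        if ast_hint and num_gpus > 1:
            return ast_hint, "ast_analysis"
        return None, None  # ambiguous: never default to ddp

    assert resolve_strategy("fsdp", None, None, "ddp", 2) == ("fsdp", "user_override")
    assert resolve_strategy(None, None, None, None, 4) == (None, None)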
@@ -3592,6 +3654,7 @@ def _infer_parallel_topology_from_env(*, num_gpus_detected: int, config_intercon
         "dp_degree": dp,
         "interconnect_type": interconnect,
         "strategy": strategy,
+        "strategy_detection_method": strategy_detection_method,
     }
 
 
--- alloc-0.0.10/src/alloc/code_analyzer.py
+++ alloc-0.0.11/src/alloc/code_analyzer.py
@@ -132,6 +132,34 @@ def analyze_script(script_path: str) -> CodeFindings:
     return findings
 
 
+def detect_strategy_hint(script_path: str) -> Optional[str]:
+    """Lightweight AST check: return strategy kind if detectable, else None.
+
+    Returns one of: 'fsdp', 'ddp', 'deepspeed', 'data_parallel', or None.
+    Never crashes — returns None on any error.
+    """
+    try:
+        if not os.path.isfile(script_path):
+            return None
+        with open(script_path, "r") as f:
+            source = f.read()
+        tree = ast.parse(source, filename=script_path)
+        imports = _walk_imports(tree)
+        distributed = _find_distributed(tree, imports, source.splitlines(), script_path)
+        # Priority: fsdp > deepspeed > ddp
+        # data_parallel is single-process (not a distributed strategy) — ignored.
+        kinds = {d.kind for d in distributed}
+        if "fsdp" in kinds:
+            return "fsdp"
+        if "deepspeed" in kinds:
+            return "deepspeed"
+        if "ddp" in kinds:
+            return "ddp"
+        return None
+    except Exception:
+        return None
+
+
 # ---------------------------------------------------------------------------
 # Import resolution
 # ---------------------------------------------------------------------------
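_walk_imports and _find_distributed are pre-existing internal helpers that this diff does not show. A self-contained approximation of the same idea using only stdlib ast and import names (the real analyzer also inspects call sites, so this is a sketch of the technique, not its exact behavior):

    import ast
    from typing import Optional

    def sketch_detect_strategy(source: str) -> Optional[str]:
        tree = ast.parse(source)
        modules = set()
        for node in ast.walk(tree):
            if isinstance(node, ast.Import):
                modules.update(alias.name for alias in node.names)
            elif isinstance(node, ast.ImportFrom) and node.module:
                modules.add(node.module)
        if any(m.startswith("torch.distributed.fsdp") for m in modules):
            return "fsdp"
        if any(m.startswith("deepspeed") for m in modules):
            return "deepspeed"
        if any(m.startswith("torch.nn.parallel") for m in modules):
            return "ddp"
        return None

    print(sketch_detect_strategy(
        "from torch.distributed.fsdp import FullyShardedDataParallel as FSDP\n"
    ))  # -> fsdp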
--- alloc-0.0.10/src/alloc/probe.py
+++ alloc-0.0.11/src/alloc/probe.py
@@ -374,12 +374,27 @@ def probe_command(
     """
     pynvml = _try_import_pynvml()
 
-    # Launch the user's training subprocess
+    # Launch the user's training subprocess.
+    # Suppress only pynvml/torch.cuda FutureWarning noise — these come from
+    # Alloc's own callbacks or from torch internals, not from user code.
+    # Propagates to torchrun children and most Ray workers via env inheritance.
+    child_env = os.environ.copy()
+    existing_pw = child_env.get("PYTHONWARNINGS", "")
+    alloc_filters = (
+        "ignore::FutureWarning:pynvml,"
+        "ignore::DeprecationWarning:pynvml,"
+        "ignore::FutureWarning:torch.cuda,"
+        "ignore::DeprecationWarning:torch.cuda"
+    )
+    child_env["PYTHONWARNINGS"] = (
+        f"{existing_pw},{alloc_filters}" if existing_pw else alloc_filters
+    )
     try:
         proc = subprocess.Popen(
             command,
             stdout=sys.stdout,
             stderr=sys.stderr,
+            env=child_env,
         )
     except Exception as e:
         return ProbeResult(
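Each entry uses the standard warnings-filter syntax (action:message:category:module) that Python reads from PYTHONWARNINGS, and because Popen receives env=child_env, grandchildren spawned by the trainer (for example torchrun workers) inherit the filters too. A minimal demonstration with one simplified filter:

    import os
    import subprocess
    import sys

    env = os.environ.copy()
    env["PYTHONWARNINGS"] = "ignore::FutureWarning:pynvml"
    out = subprocess.run(
        [sys.executable, "-c", "import os; print(os.environ['PYTHONWARNINGS'])"],
        env=env, capture_output=True, text=True,
    )
    print(out.stdout.strip())  # -> ignore::FutureWarning:pynvml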
@@ -518,11 +533,11 @@ def probe_command(
                 power_vals.append(pw)
                 total_mb = mi.total / (1024 * 1024)
 
-            # Track per-GPU peak VRAM
-
-
-
-
+            # Track per-GPU peak VRAM (always, even single GPU —
+            # discovery may expand handles later, and we need history from sample 0)
+            pgp = per_gpu_peaks_ref[0]
+            for gi, vm in enumerate(vram_vals):
+                pgp[gi] = max(pgp.get(gi, 0.0), vm)
 
             samples.append(ProbeSample(
                 timestamp=time.time(),
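The replacement keeps a running per-GPU maximum on every sample instead of only in a multi-GPU branch. The pattern in isolation, with made-up VRAM readings:

    peaks = {}
    for sample in ([1000.0, 2048.0], [4096.0, 512.0]):  # MB per GPU per tick
        for gi, vm in enumerate(sample):
            peaks[gi] = max(peaks.get(gi, 0.0), vm)
    print(peaks)  # -> {0: 4096.0, 1: 2048.0}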
@@ -675,7 +690,7 @@ def probe_command(
         process_map=process_map_ref[0],
         per_gpu_peak_vram_mb=(
             [round(per_gpu_peaks_ref[0].get(i, 0), 1) for i in range(num_gpus_ref[0])]
-            if
+            if num_gpus_ref[0] > 1 and per_gpu_peaks_ref[0] else None
         ),
         detected_interconnect=detected_ic_ref[0],
     )
--- alloc-0.0.10/src/alloc.egg-info/PKG-INFO
+++ alloc-0.0.11/src/alloc.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: alloc
-Version: 0.0.10
+Version: 0.0.11
 Summary: Engineer-first training calibration: estimate VRAM fit, profile short runs, and pick GPU configs under real budget constraints.
 Author-email: Alloc Labs <hello@alloclabs.com>
 License-Expression: Apache-2.0
--- /dev/null
+++ alloc-0.0.11/tests/test_topology_strategy.py
@@ -0,0 +1,215 @@
+"""Tests for strategy inference from topology degrees (P0-B)."""
+
+from __future__ import annotations
+
+import os
+from unittest.mock import patch
+
+from alloc.cli import _infer_parallel_topology_from_env
+
+
+class TestStrategyInference:
+    """Strategy should be inferred from TP/PP/DP degrees when present."""
+
+    def _topo(self, env=None, num_gpus=4, **kwargs):
+        env = env or {}
+        with patch.dict(os.environ, env, clear=False):
+            return _infer_parallel_topology_from_env(
+                num_gpus_detected=num_gpus,
+                **kwargs,
+            )
+
+    def test_no_degrees_multi_gpu_strategy_none(self):
+        """When no degree env vars and no AST hint, strategy stays None."""
+        result = self._topo({}, num_gpus=4)
+        assert result["strategy"] is None
+        assert result["strategy_detection_method"] is None
+
+    def test_single_gpu_no_degrees_strategy_none(self):
+        """Single GPU with no degrees → strategy stays None."""
+        result = self._topo({}, num_gpus=1)
+        assert result["strategy"] is None
+
+    def test_world_size_only_strategy_none(self):
+        """WORLD_SIZE=4 with no explicit DP_SIZE → strategy=None (ambiguous)."""
+        result = self._topo({"WORLD_SIZE": "4"})
+        assert result["strategy"] is None
+        assert result["dp_degree"] == 4  # degree still derived for topology
+
+    def test_explicit_dp_size_is_ddp(self):
+        """Explicit DP_SIZE=4 → strategy=ddp."""
+        result = self._topo({"DP_SIZE": "4"})
+        assert result["strategy"] == "ddp"
+        assert result["dp_degree"] == 4
+
+    def test_tp_only(self):
+        """TP_SIZE=4 alone → strategy=tp."""
+        result = self._topo({"TP_SIZE": "4"})
+        assert result["strategy"] == "tp"
+
+    def test_pp_only(self):
+        """PP_SIZE=4 alone → strategy=pp."""
+        result = self._topo({"PP_SIZE": "4"})
+        assert result["strategy"] == "pp"
+
+    def test_tp_dp(self):
+        """TP_SIZE=2 with DP_SIZE=2 → strategy=tp+dp."""
+        result = self._topo({"TP_SIZE": "2", "DP_SIZE": "2"})
+        assert result["strategy"] == "tp+dp"
+
+    def test_pp_dp(self):
+        """PP_SIZE=2 with DP_SIZE=2 → strategy=pp+dp."""
+        result = self._topo({"PP_SIZE": "2", "DP_SIZE": "2"})
+        assert result["strategy"] == "pp+dp"
+
+    def test_tp_pp_dp(self):
+        """All three degrees → strategy=tp+pp+dp."""
+        result = self._topo({"TP_SIZE": "2", "PP_SIZE": "2", "DP_SIZE": "2"})
+        assert result["strategy"] == "tp+pp+dp"
+
+    def test_tp_pp_no_dp(self):
+        """TP+PP without explicit DP → strategy=tp+pp+dp."""
+        result = self._topo({"TP_SIZE": "2", "PP_SIZE": "2"})
+        assert result["strategy"] == "tp+pp+dp"
+
+    def test_tp_size_1_not_counted(self):
+        """TP_SIZE=1 should not count as tensor parallelism."""
+        result = self._topo({"TP_SIZE": "1", "DP_SIZE": "4"})
+        assert result["strategy"] == "ddp"
+
+    def test_pp_size_1_not_counted(self):
+        """PP_SIZE=1 should not count as pipeline parallelism."""
+        result = self._topo({"PP_SIZE": "1", "DP_SIZE": "4"})
+        assert result["strategy"] == "ddp"
+
+    def test_dp_inferred_from_world_size(self):
+        """DP inferred from WORLD_SIZE / (TP * PP) → strategy includes dp."""
+        result = self._topo({"WORLD_SIZE": "8", "TP_SIZE": "2"})
+        assert result["dp_degree"] == 4
+        assert result["strategy"] == "tp+dp"
+
+
+class TestStrategyOverride:
+    """--strategy override takes highest precedence."""
+
+    def _topo(self, env=None, num_gpus=4, **kwargs):
+        env = env or {}
+        with patch.dict(os.environ, env, clear=False):
+            return _infer_parallel_topology_from_env(
+                num_gpus_detected=num_gpus,
+                **kwargs,
+            )
+
+    def test_override_beats_env_degrees(self):
+        """Explicit --strategy overrides env var inference."""
+        result = self._topo({"WORLD_SIZE": "4"}, strategy_override="fsdp")
+        assert result["strategy"] == "fsdp"
+        assert result["strategy_detection_method"] == "user_override"
+
+    def test_override_beats_ast_hint(self):
+        """Explicit --strategy overrides AST hint."""
+        result = self._topo({}, num_gpus=2, strategy_override="fsdp", ast_strategy_hint="ddp")
+        assert result["strategy"] == "fsdp"
+        assert result["strategy_detection_method"] == "user_override"
+
+    def test_ast_hint_used_when_multi_gpu(self):
+        """AST hint used when no env degrees and multi-GPU."""
+        result = self._topo({}, num_gpus=2, ast_strategy_hint="fsdp")
+        assert result["strategy"] == "fsdp"
+        assert result["strategy_detection_method"] == "ast_analysis"
+        assert result["dp_degree"] == 2
+
+    def test_ast_hint_ignored_when_single_gpu(self):
+        """AST hint ignored for single GPU — no distributed strategy applies."""
+        result = self._topo({}, num_gpus=1, ast_strategy_hint="fsdp")
+        assert result["strategy"] is None
+
+    def test_env_degrees_beat_ast_hint(self):
+        """Env var TP/PP degrees take precedence over AST hint."""
+        result = self._topo({"TP_SIZE": "2", "DP_SIZE": "2"}, ast_strategy_hint="fsdp")
+        assert result["strategy"] == "tp+dp"
+        assert result["strategy_detection_method"] == "env_degrees"
+
+    def test_world_size_plus_ast_fsdp_returns_fsdp(self):
+        """WORLD_SIZE=2 + ast_hint='fsdp' → strategy='fsdp' (real torchrun FSDP case)."""
+        result = self._topo({"WORLD_SIZE": "2"}, num_gpus=2, ast_strategy_hint="fsdp")
+        assert result["strategy"] == "fsdp"
+        assert result["strategy_detection_method"] == "ast_analysis"
+        assert result["dp_degree"] == 2
+
+    def test_world_size_plus_ast_deepspeed_returns_deepspeed(self):
+        """WORLD_SIZE=4 + ast_hint='deepspeed' → strategy='deepspeed'."""
+        result = self._topo({"WORLD_SIZE": "4"}, num_gpus=4, ast_strategy_hint="deepspeed")
+        assert result["strategy"] == "deepspeed"
+        assert result["strategy_detection_method"] == "ast_analysis"
+
+    def test_world_size_only_no_hint_stays_none(self):
+        """WORLD_SIZE=2 with no AST hint → strategy=None (ambiguous)."""
+        result = self._topo({"WORLD_SIZE": "2"}, num_gpus=2)
+        assert result["strategy"] is None
+        assert result["dp_degree"] == 2
+
+    def test_unknown_multi_gpu_stays_none(self):
+        """Multi-GPU with no hint and no env vars → strategy=None, not ddp."""
+        result = self._topo({}, num_gpus=4)
+        assert result["strategy"] is None
+        assert result["strategy_detection_method"] is None
+
+    def test_strategy_detection_method_in_result(self):
+        """strategy_detection_method is always present in result."""
+        result = self._topo({"DP_SIZE": "4"})
+        assert "strategy_detection_method" in result
+        assert result["strategy_detection_method"] == "env_degrees"
+
+
+class TestDetectStrategyHint:
+    """code_analyzer.detect_strategy_hint returns correct strategy from AST."""
+
+    def test_fsdp_script(self, tmp_path):
+        script = tmp_path / "train_fsdp.py"
+        script.write_text(
+            "from torch.distributed.fsdp import FullyShardedDataParallel as FSDP\n"
+            "model = FSDP(model)\n"
+        )
+        from alloc.code_analyzer import detect_strategy_hint
+        assert detect_strategy_hint(str(script)) == "fsdp"
+
+    def test_ddp_script(self, tmp_path):
+        script = tmp_path / "train_ddp.py"
+        script.write_text(
+            "from torch.nn.parallel import DistributedDataParallel as DDP\n"
+            "model = DDP(model)\n"
+        )
+        from alloc.code_analyzer import detect_strategy_hint
+        assert detect_strategy_hint(str(script)) == "ddp"
+
+    def test_fsdp_beats_ddp_when_both_present(self, tmp_path):
+        script = tmp_path / "train_both.py"
+        script.write_text(
+            "from torch.nn.parallel import DistributedDataParallel as DDP\n"
+            "from torch.distributed.fsdp import FullyShardedDataParallel as FSDP\n"
+            "model = FSDP(model)\n"
+        )
+        from alloc.code_analyzer import detect_strategy_hint
+        assert detect_strategy_hint(str(script)) == "fsdp"
+
+    def test_no_distributed_returns_none(self, tmp_path):
+        script = tmp_path / "train_simple.py"
+        script.write_text("import torch\nmodel = torch.nn.Linear(10, 10)\n")
+        from alloc.code_analyzer import detect_strategy_hint
+        assert detect_strategy_hint(str(script)) is None
+
+    def test_nonexistent_file_returns_none(self):
+        from alloc.code_analyzer import detect_strategy_hint
+        assert detect_strategy_hint("/nonexistent/train.py") is None
+
+
+class TestProcessMapInProbeDictAssembly:
+    """process_map should reach probe_dict from ProbeResult."""
+
+    def test_process_map_present_in_topology_return(self):
+        """Topology dict now includes strategy field."""
+        with patch.dict(os.environ, {"DP_SIZE": "4"}, clear=False):
+            topo = _infer_parallel_topology_from_env(num_gpus_detected=4)
+        assert "strategy" in topo
+        assert topo["strategy"] == "ddp"
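The rewritten suite can be exercised on its own with a standard pytest invocation (no custom markers assumed):

    import pytest

    raise SystemExit(pytest.main(["tests/test_topology_strategy.py", "-q"]))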
--- alloc-0.0.10/tests/test_topology_strategy.py
+++ /dev/null
@@ -1,93 +0,0 @@
-"""Tests for strategy inference from topology degrees (P0-B)."""
-
-from __future__ import annotations
-
-import os
-from unittest.mock import patch
-
-from alloc.cli import _infer_parallel_topology_from_env
-
-
-class TestStrategyInference:
-    """Strategy should be inferred from TP/PP/DP degrees when present."""
-
-    def _topo(self, env=None, num_gpus=4):
-        env = env or {}
-        with patch.dict(os.environ, env, clear=False):
-            return _infer_parallel_topology_from_env(
-                num_gpus_detected=num_gpus,
-            )
-
-    def test_no_degrees_multi_gpu_infers_ddp(self):
-        """When no degree env vars but multiple GPUs detected, infer DDP."""
-        result = self._topo({}, num_gpus=4)
-        assert result["strategy"] == "ddp"
-        assert result["dp_degree"] == 4
-
-    def test_single_gpu_no_degrees_strategy_none(self):
-        """Single GPU with no degrees → strategy stays None."""
-        result = self._topo({}, num_gpus=1)
-        assert result["strategy"] is None
-
-    def test_dp_only_is_ddp(self):
-        """WORLD_SIZE=4 with no TP/PP → dp inferred → strategy=ddp."""
-        result = self._topo({"WORLD_SIZE": "4"})
-        assert result["strategy"] == "ddp"
-        assert result["dp_degree"] == 4
-
-    def test_tp_only(self):
-        """TP_SIZE=4 alone → strategy=tp."""
-        result = self._topo({"TP_SIZE": "4"})
-        assert result["strategy"] == "tp"
-
-    def test_pp_only(self):
-        """PP_SIZE=4 alone → strategy=pp."""
-        result = self._topo({"PP_SIZE": "4"})
-        assert result["strategy"] == "pp"
-
-    def test_tp_dp(self):
-        """TP_SIZE=2 with DP_SIZE=2 → strategy=tp+dp."""
-        result = self._topo({"TP_SIZE": "2", "DP_SIZE": "2"})
-        assert result["strategy"] == "tp+dp"
-
-    def test_pp_dp(self):
-        """PP_SIZE=2 with DP_SIZE=2 → strategy=pp+dp."""
-        result = self._topo({"PP_SIZE": "2", "DP_SIZE": "2"})
-        assert result["strategy"] == "pp+dp"
-
-    def test_tp_pp_dp(self):
-        """All three degrees → strategy=tp+pp+dp."""
-        result = self._topo({"TP_SIZE": "2", "PP_SIZE": "2", "DP_SIZE": "2"})
-        assert result["strategy"] == "tp+pp+dp"
-
-    def test_tp_pp_no_dp(self):
-        """TP+PP without explicit DP → strategy=tp+pp+dp."""
-        result = self._topo({"TP_SIZE": "2", "PP_SIZE": "2"})
-        assert result["strategy"] == "tp+pp+dp"
-
-    def test_tp_size_1_not_counted(self):
-        """TP_SIZE=1 should not count as tensor parallelism."""
-        result = self._topo({"TP_SIZE": "1", "DP_SIZE": "4"})
-        assert result["strategy"] == "ddp"
-
-    def test_pp_size_1_not_counted(self):
-        """PP_SIZE=1 should not count as pipeline parallelism."""
-        result = self._topo({"PP_SIZE": "1", "DP_SIZE": "4"})
-        assert result["strategy"] == "ddp"
-
-    def test_dp_inferred_from_world_size(self):
-        """DP inferred from WORLD_SIZE / (TP * PP) → strategy includes dp."""
-        result = self._topo({"WORLD_SIZE": "8", "TP_SIZE": "2"})
-        assert result["dp_degree"] == 4
-        assert result["strategy"] == "tp+dp"
-
-
-class TestProcessMapInProbeDictAssembly:
-    """process_map should reach probe_dict from ProbeResult."""
-
-    def test_process_map_present_in_topology_return(self):
-        """Topology dict now includes strategy field."""
-        with patch.dict(os.environ, {"WORLD_SIZE": "4"}, clear=False):
-            topo = _infer_parallel_topology_from_env(num_gpus_detected=4)
-        assert "strategy" in topo
-        assert topo["strategy"] == "ddp"