alloc 0.0.8.tar.gz → 0.0.9.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {alloc-0.0.8 → alloc-0.0.9}/PKG-INFO +2 -2
- {alloc-0.0.8 → alloc-0.0.9}/README.md +1 -1
- {alloc-0.0.8 → alloc-0.0.9}/pyproject.toml +1 -1
- {alloc-0.0.8 → alloc-0.0.9}/src/alloc/__init__.py +1 -1
- {alloc-0.0.8 → alloc-0.0.9}/src/alloc/cli.py +23 -6
- {alloc-0.0.8 → alloc-0.0.9}/src/alloc/extractor_runner.py +24 -1
- {alloc-0.0.8 → alloc-0.0.9}/src/alloc.egg-info/PKG-INFO +2 -2
- {alloc-0.0.8 → alloc-0.0.9}/tests/test_auth.py +29 -0
- {alloc-0.0.8 → alloc-0.0.9}/tests/test_topology_strategy.py +9 -3
- {alloc-0.0.8 → alloc-0.0.9}/setup.cfg +0 -0
- {alloc-0.0.8 → alloc-0.0.9}/src/alloc/artifact_loader.py +0 -0
- {alloc-0.0.8 → alloc-0.0.9}/src/alloc/artifact_writer.py +0 -0
- {alloc-0.0.8 → alloc-0.0.9}/src/alloc/browser_auth.py +0 -0
- {alloc-0.0.8 → alloc-0.0.9}/src/alloc/callbacks.py +0 -0
- {alloc-0.0.8 → alloc-0.0.9}/src/alloc/catalog/__init__.py +0 -0
- {alloc-0.0.8 → alloc-0.0.9}/src/alloc/catalog/default_rate_card.json +0 -0
- {alloc-0.0.8 → alloc-0.0.9}/src/alloc/catalog/gpus.v1.json +0 -0
- {alloc-0.0.8 → alloc-0.0.9}/src/alloc/code_analyzer.py +0 -0
- {alloc-0.0.8 → alloc-0.0.9}/src/alloc/config.py +0 -0
- {alloc-0.0.8 → alloc-0.0.9}/src/alloc/context.py +0 -0
- {alloc-0.0.8 → alloc-0.0.9}/src/alloc/diagnosis_display.py +0 -0
- {alloc-0.0.8 → alloc-0.0.9}/src/alloc/diagnosis_engine.py +0 -0
- {alloc-0.0.8 → alloc-0.0.9}/src/alloc/diagnosis_rules.py +0 -0
- {alloc-0.0.8 → alloc-0.0.9}/src/alloc/display.py +0 -0
- {alloc-0.0.8 → alloc-0.0.9}/src/alloc/ghost.py +0 -0
- {alloc-0.0.8 → alloc-0.0.9}/src/alloc/model_extractor.py +0 -0
- {alloc-0.0.8 → alloc-0.0.9}/src/alloc/model_registry.py +0 -0
- {alloc-0.0.8 → alloc-0.0.9}/src/alloc/probe.py +0 -0
- {alloc-0.0.8 → alloc-0.0.9}/src/alloc/stability.py +0 -0
- {alloc-0.0.8 → alloc-0.0.9}/src/alloc/upload.py +0 -0
- {alloc-0.0.8 → alloc-0.0.9}/src/alloc/yaml_config.py +0 -0
- {alloc-0.0.8 → alloc-0.0.9}/src/alloc.egg-info/SOURCES.txt +0 -0
- {alloc-0.0.8 → alloc-0.0.9}/src/alloc.egg-info/dependency_links.txt +0 -0
- {alloc-0.0.8 → alloc-0.0.9}/src/alloc.egg-info/entry_points.txt +0 -0
- {alloc-0.0.8 → alloc-0.0.9}/src/alloc.egg-info/requires.txt +0 -0
- {alloc-0.0.8 → alloc-0.0.9}/src/alloc.egg-info/top_level.txt +0 -0
- {alloc-0.0.8 → alloc-0.0.9}/tests/test_artifact.py +0 -0
- {alloc-0.0.8 → alloc-0.0.9}/tests/test_artifact_loader.py +0 -0
- {alloc-0.0.8 → alloc-0.0.9}/tests/test_callbacks.py +0 -0
- {alloc-0.0.8 → alloc-0.0.9}/tests/test_catalog.py +0 -0
- {alloc-0.0.8 → alloc-0.0.9}/tests/test_cli.py +0 -0
- {alloc-0.0.8 → alloc-0.0.9}/tests/test_code_analyzer.py +0 -0
- {alloc-0.0.8 → alloc-0.0.9}/tests/test_context.py +0 -0
- {alloc-0.0.8 → alloc-0.0.9}/tests/test_diagnose_cli.py +0 -0
- {alloc-0.0.8 → alloc-0.0.9}/tests/test_diagnosis_engine.py +0 -0
- {alloc-0.0.8 → alloc-0.0.9}/tests/test_diagnosis_rules.py +0 -0
- {alloc-0.0.8 → alloc-0.0.9}/tests/test_extractor_activation.py +0 -0
- {alloc-0.0.8 → alloc-0.0.9}/tests/test_ghost.py +0 -0
- {alloc-0.0.8 → alloc-0.0.9}/tests/test_ghost_degradation.py +0 -0
- {alloc-0.0.8 → alloc-0.0.9}/tests/test_init_from_org.py +0 -0
- {alloc-0.0.8 → alloc-0.0.9}/tests/test_interconnect.py +0 -0
- {alloc-0.0.8 → alloc-0.0.9}/tests/test_model_extractor.py +0 -0
- {alloc-0.0.8 → alloc-0.0.9}/tests/test_probe_hw.py +0 -0
- {alloc-0.0.8 → alloc-0.0.9}/tests/test_probe_multi.py +0 -0
- {alloc-0.0.8 → alloc-0.0.9}/tests/test_scan_auth.py +0 -0
- {alloc-0.0.8 → alloc-0.0.9}/tests/test_stability.py +0 -0
- {alloc-0.0.8 → alloc-0.0.9}/tests/test_upload.py +0 -0
- {alloc-0.0.8 → alloc-0.0.9}/tests/test_verdict.py +0 -0
- {alloc-0.0.8 → alloc-0.0.9}/tests/test_yaml_config.py +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: alloc
-Version: 0.0.8
+Version: 0.0.9
 Summary: Engineer-first training calibration: estimate VRAM fit, profile short runs, and pick GPU configs under real budget constraints.
 Author-email: Alloc Labs <hello@alloclabs.com>
 License-Expression: Apache-2.0
@@ -40,7 +40,7 @@ alloc run python train.py
 ```

 ```
-alloc v0.0.
+alloc v0.0.8 — Calibrate

 Run Summary
 Peak VRAM 31.2 GB / 40.0 GB (A100)
pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

 [project]
 name = "alloc"
-version = "0.0.8"
+version = "0.0.9"
 description = "Engineer-first training calibration: estimate VRAM fit, profile short runs, and pick GPU configs under real budget constraints."
 readme = "README.md"
 license = "Apache-2.0"
src/alloc/__init__.py
@@ -9,7 +9,7 @@ _warnings.filterwarnings("ignore", category=FutureWarning, module=r"torch\.cuda"
 _warnings.filterwarnings("ignore", category=DeprecationWarning, module=r"torch\.cuda")
 del _warnings

-__version__ = "0.0.8"
+__version__ = "0.0.9"

 from alloc.ghost import ghost, GhostReport
 from alloc.callbacks import AllocCallback as HuggingFaceCallback
src/alloc/cli.py
@@ -2400,23 +2400,33 @@ def whoami(
             profile = _get("/profile")
             fleet = _get("/gpu-fleet")
         else:
-
+            # whoami is a status command — report structured result, exit 0
+            if e.response.status_code == 401:
+                out["token_status"] = "expired"
+            else:
+                out["token_status"] = "error"
             out["error"] = f"API error {e.response.status_code}"
+            if json_output:
                 _print_json(out)
             else:
-
+                if e.response.status_code == 401:
+                    console.print("[yellow]Token expired.[/yellow]")
+                else:
+                    console.print(f"[red]API error {e.response.status_code}[/red]")
                 console.print("[dim]Run: alloc login[/dim]")
-
+            return
     except httpx.ConnectError:
+        out["token_status"] = "unreachable"
+        out["error"] = f"Cannot connect to {api_url}"
         if json_output:
-            out["error"] = f"Cannot connect to {api_url}"
             _print_json(out)
         else:
             console.print(f"[red]Cannot connect to {api_url}[/red]")
-
+        return

     # API validated the token — now we know login is real
     out["logged_in"] = True
+    out["token_status"] = "valid"

     gpus = fleet.get("gpus") or []
     fleet_count = len([g for g in gpus if g.get("fleet_status") == "in_fleet"])
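In this hunk, `whoami` becomes a pure status command: authentication problems are recorded in `token_status` ("expired", "error", "unreachable", or "valid") and the function returns instead of raising, so the exit code stays 0. A minimal standalone sketch of that pattern, assuming a hypothetical `check_token` helper and a placeholder `API_URL` (not the package's actual implementation):

```python
# Illustrative sketch only; mirrors the pattern in the hunk above, not alloc's code.
import json
import httpx

API_URL = "https://api.example.com"  # placeholder endpoint

def check_token(token: str) -> dict:
    out = {"logged_in": False, "token_status": None, "error": None}
    try:
        resp = httpx.get(f"{API_URL}/profile", headers={"Authorization": f"Bearer {token}"})
        resp.raise_for_status()
    except httpx.HTTPStatusError as e:
        # Expired vs. other API failures are reported, not raised; exit code stays 0.
        out["token_status"] = "expired" if e.response.status_code == 401 else "error"
        out["error"] = f"API error {e.response.status_code}"
        return out
    except httpx.ConnectError:
        out["token_status"] = "unreachable"
        out["error"] = f"Cannot connect to {API_URL}"
        return out
    out["logged_in"] = True
    out["token_status"] = "valid"
    return out

if __name__ == "__main__":
    print(json.dumps(check_token("example-token")))
```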
src/alloc/cli.py
@@ -3565,7 +3575,14 @@ def _infer_parallel_topology_from_env(*, num_gpus_detected: int, config_intercon
         strategy = "pp+dp" if (dp is not None and dp > 1) else "pp"
     elif dp is not None and dp > 1:
         strategy = "ddp"
-
+    elif strategy is None and num_gpus_detected > 1 and not has_tp and not has_pp:
+        # Multiple GPUs detected via NVML with no TP/PP env vars →
+        # DDP is PyTorch's default and the only realistic inference.
+        # This is NOT the old `or "ddp"` — it only fires when probe
+        # actually observed multiple GPU processes.
+        strategy = "ddp"
+        if dp is None:
+            dp = num_gpus_detected

     return {
         "num_nodes": nnodes or 1,
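The added branch infers DDP only when the probe observed more than one GPU and no tensor- or pipeline-parallel env vars were set, defaulting the data-parallel degree to the GPU count. A self-contained sketch of that rule; `infer_strategy`, `has_tp`, and `has_pp` are illustrative stand-ins, not the package's real signature:

```python
# Illustrative sketch of the DDP fallback added in 0.0.9, not alloc's actual function.
from typing import Optional, Tuple

def infer_strategy(num_gpus_detected: int,
                   dp: Optional[int] = None,
                   has_tp: bool = False,
                   has_pp: bool = False) -> Tuple[Optional[str], Optional[int]]:
    """Return (strategy, dp_degree) from env-derived degrees and the probed GPU count."""
    strategy = None
    if has_pp:
        strategy = "pp+dp" if (dp is not None and dp > 1) else "pp"
    elif dp is not None and dp > 1:
        strategy = "ddp"
    elif num_gpus_detected > 1 and not has_tp and not has_pp:
        # Fires only when multiple GPU processes were actually observed
        # and no TP/PP env vars were set; DDP is PyTorch's default.
        strategy = "ddp"
        if dp is None:
            dp = num_gpus_detected
    return strategy, dp

assert infer_strategy(4) == ("ddp", 4)    # multi-GPU, no hints: DDP with dp = GPU count
assert infer_strategy(1) == (None, None)  # single GPU stays undetermined, matching the new test
```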
src/alloc/extractor_runner.py
@@ -281,7 +281,30 @@ def main():
             "activation_method": activation_result.get("activation_method"),
         }
     else:
-
+        # No model found — check if this is a distributed training script
+        # that hides the model inside __main__ guard or main()
+        _is_dist = False
+        try:
+            import torch.distributed as _dist_mod
+            if _dist_mod.is_initialized():
+                _is_dist = True
+        except Exception:
+            pass
+        if not _is_dist:
+            # Check if module imported distributed primitives
+            for attr_name in dir(module):
+                try:
+                    obj = getattr(module, attr_name)
+                    mod_name = getattr(obj, "__module__", "") or ""
+                    if "torch.distributed" in mod_name or "torch.nn.parallel" in mod_name:
+                        _is_dist = True
+                        break
+                except Exception:
+                    continue
+        if _is_dist:
+            result = {"status": "error_distributed", "error": "no model found — script uses distributed training"}
+        else:
+            result = {"status": "no_model"}

     with open(sidecar_path, "w") as f:
         json.dump(result, f)
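The runner's fallback now distinguishes a script that hides its model behind distributed training from a genuine `no_model` case, using a two-step check: is the process group initialized, and does the imported module reference `torch.distributed` or `torch.nn.parallel`. A sketch of that heuristic with an illustrative `looks_distributed` helper (not the package's API):

```python
# Illustrative sketch of the two-step distributed-training check, not alloc's code.
# `module` stands in for the user's imported training script.
import types

def looks_distributed(module: types.ModuleType) -> bool:
    """True if the process group is live or the module references torch.distributed / DDP."""
    try:
        import torch.distributed as dist
        if dist.is_initialized():
            return True
    except Exception:
        pass
    for attr_name in dir(module):
        try:
            obj = getattr(module, attr_name)
            mod_name = getattr(obj, "__module__", "") or ""
            if "torch.distributed" in mod_name or "torch.nn.parallel" in mod_name:
                return True
        except Exception:
            continue
    return False
```

In the hunk itself, a positive result is written to the sidecar as `{"status": "error_distributed", ...}` so the caller can report why no model was extracted.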
src/alloc.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: alloc
-Version: 0.0.8
+Version: 0.0.9
 Summary: Engineer-first training calibration: estimate VRAM fit, profile short runs, and pick GPU configs under real budget constraints.
 Author-email: Alloc Labs <hello@alloclabs.com>
 License-Expression: Apache-2.0
@@ -40,7 +40,7 @@ alloc run python train.py
 ```

 ```
-alloc v0.0.
+alloc v0.0.8 — Calibrate

 Run Summary
 Peak VRAM 31.2 GB / 40.0 GB (A100)
tests/test_auth.py
@@ -68,6 +68,34 @@ def test_whoami_not_logged_in_json(tmp_path: Path):
     assert data["api_url"] == "https://api.example.com"


+def test_whoami_stale_token_json(tmp_path: Path):
+    """Stale token should exit 0 with token_status: expired."""
+    mock_resp = MagicMock()
+    mock_resp.status_code = 401
+    mock_resp.raise_for_status.side_effect = httpx.HTTPStatusError(
+        "Unauthorized", request=MagicMock(), response=mock_resp,
+    )
+    mock_client = MagicMock()
+    mock_client.__enter__.return_value = mock_client
+    mock_client.__exit__.return_value = False
+    mock_client.get.return_value = mock_resp
+
+    env = {
+        "HOME": str(tmp_path),
+        "ALLOC_API_URL": "https://api.example.com",
+        "ALLOC_TOKEN": "stale-token",
+    }
+
+    with patch("httpx.Client", return_value=mock_client), \
+         patch("alloc.cli.try_refresh_access_token", return_value=None):
+        result = runner.invoke(app, ["whoami", "--json"], env=env)
+
+    assert result.exit_code == 0
+    data = json.loads(result.output)
+    assert data["logged_in"] is False
+    assert data["token_status"] == "expired"
+
+
 def test_whoami_logged_in_json(tmp_path: Path):
     profile_resp = MagicMock()
     profile_resp.raise_for_status.return_value = None
@@ -110,6 +138,7 @@ def test_whoami_logged_in_json(tmp_path: Path):
     assert result.exit_code == 0
     data = json.loads(result.output)
     assert data["logged_in"] is True
+    assert data["token_status"] == "valid"
     assert data["token_source"] == "env"
     assert data["email"] == "user@example.com"
     assert data["fleet_count"] == 1
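Because `whoami --json` now exits 0 even with a stale token, callers can branch on the JSON payload rather than the exit code. A hypothetical consumer sketch; the key names come from the tests above, while the subprocess invocation is an assumption about the installed `alloc` entry point:

```python
# Hypothetical consumer of `alloc whoami --json`; illustrative, not part of the package.
import json
import subprocess

proc = subprocess.run(["alloc", "whoami", "--json"], capture_output=True, text=True)
status = json.loads(proc.stdout)

if status.get("logged_in"):
    print(f"Logged in as {status.get('email')} ({status.get('fleet_count')} GPUs in fleet)")
elif status.get("token_status") == "expired":
    print("Token expired, run: alloc login")
else:
    print(status.get("error") or "Not logged in")
```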
tests/test_topology_strategy.py
@@ -18,9 +18,15 @@ class TestStrategyInference:
             num_gpus_detected=num_gpus,
         )

-    def
-        """When no degree env vars
-        result = self._topo({})
+    def test_no_degrees_multi_gpu_infers_ddp(self):
+        """When no degree env vars but multiple GPUs detected, infer DDP."""
+        result = self._topo({}, num_gpus=4)
+        assert result["strategy"] == "ddp"
+        assert result["dp_degree"] == 4
+
+    def test_single_gpu_no_degrees_strategy_none(self):
+        """Single GPU with no degrees → strategy stays None."""
+        result = self._topo({}, num_gpus=1)
         assert result["strategy"] is None

     def test_dp_only_is_ddp(self):