alloc-0.0.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- alloc/__init__.py +11 -0
- alloc/artifact_writer.py +67 -0
- alloc/callbacks.py +342 -0
- alloc/catalog/__init__.py +138 -0
- alloc/catalog/default_rate_card.json +18 -0
- alloc/catalog/gpus.v1.json +174 -0
- alloc/cli.py +1341 -0
- alloc/config.py +124 -0
- alloc/context.py +191 -0
- alloc/display.py +580 -0
- alloc/extractor_runner.py +141 -0
- alloc/ghost.py +167 -0
- alloc/model_extractor.py +170 -0
- alloc/model_registry.py +138 -0
- alloc/probe.py +461 -0
- alloc/stability.py +144 -0
- alloc/upload.py +138 -0
- alloc/yaml_config.py +287 -0
- alloc-0.0.1.dist-info/METADATA +256 -0
- alloc-0.0.1.dist-info/RECORD +23 -0
- alloc-0.0.1.dist-info/WHEEL +5 -0
- alloc-0.0.1.dist-info/entry_points.txt +2 -0
- alloc-0.0.1.dist-info/top_level.txt +1 -0
alloc/upload.py
ADDED
@@ -0,0 +1,138 @@
"""Artifact upload — POST extracted fields to /runs/ingest as JSON."""

from __future__ import annotations

import gzip
import json
from typing import Optional


class UploadLimitError(Exception):
    """Raised when the server rejects an upload due to tier limits."""

    def __init__(self, status_code: int, detail: dict):
        self.status_code = status_code
        self.detail = detail
        msg = detail.get("message", f"Upload rejected (HTTP {status_code})")
        super().__init__(msg)


def _normalize_gpu_type(raw_name: object) -> Optional[str]:
    """Best-effort normalization from NVML GPU names to catalog IDs."""
    if raw_name is None:
        return None

    name = str(raw_name).strip()
    if not name:
        return None

    upper = name.upper()
    if "H100" in upper and "NVL" in upper:
        return "H100-NVL"
    if "H200" in upper:
        return "H200"
    if "H100" in upper:
        return "H100-80GB"
    if "A100" in upper and "40" in upper:
        return "A100-40GB"
    if "A100" in upper:
        return "A100-80GB"
    if "A10G" in upper:
        return "A10G"
    if "L40S" in upper:
        return "L40S"
    if "L4" in upper:
        return "L4"
    if "T4" in upper:
        return "T4"
    if "V100" in upper:
        return "V100-32GB"
    if "4090" in upper:
        return "RTX-4090"
    if "3090" in upper:
        return "RTX-3090"
    return None


def _to_positive_int(value: object, default: int = 1) -> int:
    """Parse a positive integer with a safe fallback."""
    try:
        parsed = int(value)  # type: ignore[arg-type]
        return parsed if parsed > 0 else default
    except Exception:
        return default


def upload_artifact(artifact_path: str, api_url: str, token: str) -> dict:
    """Upload a .json.gz artifact to POST /runs/ingest.

    Reads the artifact, extracts summary fields, and sends JSON.
    Returns response dict with run_id and status.
    Raises UploadLimitError on 402/403/429, other errors via raise_for_status.
    """
    import httpx

    with gzip.open(artifact_path, "rt", encoding="utf-8") as f:
        report = json.load(f)

    probe = report.get("probe") or {}
    ghost = report.get("ghost") or {}
    hardware = report.get("hardware") or {}

    gpu_type = (
        probe.get("gpu_type")
        or _normalize_gpu_type(probe.get("gpu_name"))
        or _normalize_gpu_type(hardware.get("gpu_name"))
    )
    num_gpus = _to_positive_int(
        probe.get("num_gpus") or hardware.get("num_gpus_detected"),
        default=1,
    )

    payload = {
        "model_name": probe.get("model_name"),
        "gpu_type": gpu_type,
        "num_gpus": num_gpus,
        "strategy": probe.get("strategy"),
        "tp_degree": probe.get("tp_degree"),
        "pp_degree": probe.get("pp_degree"),
        "dp_degree": probe.get("dp_degree"),
        "num_nodes": probe.get("num_nodes"),
        "gpus_per_node": probe.get("gpus_per_node"),
        "interconnect_type": probe.get("interconnect_type"),
        "objective": probe.get("objective"),
        "max_budget_hourly": probe.get("max_budget_hourly"),
        "peak_vram_mb": probe.get("peak_vram_mb"),
        "avg_gpu_util": probe.get("avg_gpu_util"),
        "avg_power_watts": probe.get("avg_power_watts"),
        "duration_s": probe.get("duration_seconds"),
        "exit_code": probe.get("exit_code"),
        "probe_samples": probe.get("samples"),
        "step_count": probe.get("step_count"),
        "step_time_ms_p50": probe.get("step_time_ms_p50"),
        "step_time_ms_p90": probe.get("step_time_ms_p90"),
        "samples_per_sec": probe.get("samples_per_sec"),
        "dataloader_wait_pct": probe.get("dataloader_wait_pct"),
        "ghost_report": ghost if ghost else None,
        "source": probe.get("source") or "cli",
    }

    with httpx.Client(timeout=30) as client:
        resp = client.post(
            f"{api_url}/runs/ingest",
            json=payload,
            headers={
                "Content-Type": "application/json",
                "Authorization": f"Bearer {token}",
            },
        )

        if resp.status_code in (402, 403, 429):
            try:
                detail = resp.json().get("detail", {})
            except Exception:
                detail = {"message": resp.text[:200]}
            raise UploadLimitError(resp.status_code, detail)

        resp.raise_for_status()
        return resp.json()
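For reference, a minimal usage sketch of this module (the URL and token below are placeholders; in the CLI they come from `config.py` after `alloc login`):

```python
from alloc.upload import upload_artifact, UploadLimitError

try:
    result = upload_artifact(
        "alloc_artifact.json.gz",
        api_url="https://api.example.com",  # placeholder
        token="YOUR_TOKEN",                 # placeholder
    )
    # Per the docstring, the response dict carries run_id and status.
    print(result.get("run_id"), result.get("status"))
except UploadLimitError as e:
    # 402/403/429 tier-limit rejections surface here with server detail.
    print(f"Upload rejected (HTTP {e.status_code}): {e}")
```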
alloc/yaml_config.py
ADDED
@@ -0,0 +1,287 @@
""".alloc.yaml config — GPU fleet, explore, budget, priority.

Searches for .alloc.yaml in cwd, then parents, then ~/.alloc/preferences.yaml.
Never crashes. Returns None or defaults on missing/invalid config.
"""

from __future__ import annotations

import os
from dataclasses import dataclass, field, asdict
from pathlib import Path
from typing import Dict, List, Optional

import yaml


CONFIG_FILENAME = ".alloc.yaml"
GLOBAL_PREFS_PATH = Path.home() / ".alloc" / "preferences.yaml"

_ALLOWED_OBJECTIVES = {
    "cheapest",
    "fastest",
    "fastest_within_budget",
    "best_value",
}

_ALLOWED_INTERCONNECTS = {
    "pcie",
    "nvlink",
    "infiniband",
    "unknown",
}


@dataclass
class FleetEntry:
    """A GPU in the user's fleet or explore list."""

    gpu: str                      # GPU ID or alias (e.g. "H100", "nvidia-h100-sxm-80gb")
    cloud: Optional[str] = None   # "aws", "gcp", "azure", "lambda", etc.
    count: Optional[int] = None   # Max GPUs available
    rate: Optional[float] = None  # Custom $/hr override
    explore: bool = False         # True = "I don't have this but want to evaluate"


@dataclass
class AllocConfig:
    """Parsed .alloc.yaml content."""

    fleet: List[FleetEntry] = field(default_factory=list)
    explore: List[FleetEntry] = field(default_factory=list)
    objective: Optional[str] = None             # cheapest | fastest | fastest_within_budget | best_value
    priority_cost: int = 50                     # 0-100, latency = 100 - cost
    budget_monthly: Optional[float] = None      # Monthly budget in USD
    budget_hourly: Optional[float] = None       # Hourly budget cap
    org_budget_monthly: Optional[float] = None  # Org ceiling (from --from-org sync)
    interconnect: Optional[str] = None          # pcie | nvlink | infiniband | unknown

    @property
    def priority_latency(self) -> int:
        return 100 - self.priority_cost

    @property
    def fleet_gpu_ids(self) -> List[str]:
        """GPU IDs from fleet entries."""
        return [e.gpu for e in self.fleet]

    @property
    def explore_gpu_ids(self) -> List[str]:
        """GPU IDs from explore entries."""
        return [e.gpu for e in self.explore]

    @property
    def all_gpu_ids(self) -> List[str]:
        """All GPU IDs (fleet + explore)."""
        return self.fleet_gpu_ids + self.explore_gpu_ids

    @property
    def rate_overrides(self) -> Dict[str, float]:
        """GPU ID → custom $/hr for entries with rate set."""
        overrides = {}
        for e in self.fleet + self.explore:
            if e.rate is not None:
                overrides[e.gpu] = e.rate
        return overrides

    def to_dict(self) -> dict:
        """Serialize to dict suitable for YAML output."""
        d = {}  # type: dict

        if self.objective is not None:
            d["objective"] = self.objective

        if self.fleet:
            d["fleet"] = [_entry_to_dict(e) for e in self.fleet]

        if self.explore:
            d["explore"] = [_entry_to_dict(e) for e in self.explore]

        d["priority"] = {
            "cost": self.priority_cost,
            "latency": self.priority_latency,
        }

        if self.budget_monthly is not None:
            d.setdefault("budget", {})["monthly_usd"] = self.budget_monthly
        if self.budget_hourly is not None:
            d.setdefault("budget", {})["hourly_usd"] = self.budget_hourly
        if self.org_budget_monthly is not None:
            d.setdefault("budget", {})["org_ceiling_usd"] = self.org_budget_monthly

        if self.interconnect is not None:
            d["interconnect"] = self.interconnect

        return d


def _entry_to_dict(e: FleetEntry) -> dict:
    """Serialize a FleetEntry, omitting None/default fields."""
    d = {"gpu": e.gpu}  # type: dict
    if e.cloud is not None:
        d["cloud"] = e.cloud
    if e.count is not None:
        d["count"] = e.count
    if e.rate is not None:
        d["rate"] = e.rate
    return d


def load_alloc_config(path: Optional[str] = None) -> Optional[AllocConfig]:
    """Load and parse .alloc.yaml.

    Search order:
    1. Explicit path (if provided)
    2. .alloc.yaml in cwd
    3. .alloc.yaml in parent directories (up to filesystem root)
    4. ~/.alloc/preferences.yaml

    Returns None if no config file found or parse fails.
    """
    config_path = _find_config(path)
    if config_path is None:
        return None

    try:
        with open(config_path, "r") as f:
            raw = yaml.safe_load(f)
    except Exception:
        return None

    if not isinstance(raw, dict):
        return None

    return _parse_config(raw)


def validate_config(config: AllocConfig) -> List[str]:
    """Validate an AllocConfig. Returns list of error strings (empty = valid)."""
    errors = []

    if config.objective is not None and config.objective not in _ALLOWED_OBJECTIVES:
        errors.append(
            f"objective must be one of {sorted(_ALLOWED_OBJECTIVES)}, got {config.objective}"
        )

    if config.interconnect is not None and config.interconnect not in _ALLOWED_INTERCONNECTS:
        errors.append(
            f"interconnect must be one of {sorted(_ALLOWED_INTERCONNECTS)}, got {config.interconnect}"
        )

    if config.priority_cost < 0 or config.priority_cost > 100:
        errors.append(f"priority.cost must be 0-100, got {config.priority_cost}")

    if config.budget_monthly is not None and config.budget_monthly < 0:
        errors.append(f"budget.monthly_usd must be >= 0, got {config.budget_monthly}")

    if config.budget_hourly is not None and config.budget_hourly < 0:
        errors.append(f"budget.hourly_usd must be >= 0, got {config.budget_hourly}")

    for entry in config.fleet + config.explore:
        if not entry.gpu:
            errors.append("Fleet/explore entry missing 'gpu' field")
        if entry.rate is not None and entry.rate < 0:
            errors.append(f"Rate for {entry.gpu} must be >= 0, got {entry.rate}")
        if entry.count is not None and entry.count < 1:
            errors.append(f"Count for {entry.gpu} must be >= 1, got {entry.count}")

    return errors


def write_alloc_config(config: AllocConfig, path: Optional[str] = None) -> str:
    """Write config to YAML file. Returns the path written to."""
    out_path = path or os.path.join(os.getcwd(), CONFIG_FILENAME)

    data = config.to_dict()

    with open(out_path, "w") as f:
        f.write("# Alloc GPU configuration\n")
        f.write("# Docs: https://alloclabs.com/docs/right-sizing\n\n")
        yaml.dump(data, f, default_flow_style=False, sort_keys=False)

    return out_path


def _find_config(explicit_path: Optional[str] = None) -> Optional[str]:
    """Find .alloc.yaml by searching cwd → parents → global prefs."""
    if explicit_path:
        if os.path.isfile(explicit_path):
            return explicit_path
        return None

    # Walk from cwd upward
    current = Path.cwd()
    for _ in range(50):  # Safety limit
        candidate = current / CONFIG_FILENAME
        if candidate.is_file():
            return str(candidate)
        parent = current.parent
        if parent == current:
            break
        current = parent

    # Global preferences
    if GLOBAL_PREFS_PATH.is_file():
        return str(GLOBAL_PREFS_PATH)

    return None


def _parse_config(raw: dict) -> AllocConfig:
    """Parse raw YAML dict into AllocConfig."""
    objective = raw.get("objective")
    if not isinstance(objective, str) or not objective.strip():
        objective = None

    fleet = []
    for item in raw.get("fleet", []):
        if isinstance(item, str):
            fleet.append(FleetEntry(gpu=item))
        elif isinstance(item, dict):
            fleet.append(FleetEntry(
                gpu=item.get("gpu", ""),
                cloud=item.get("cloud"),
                count=item.get("count"),
                rate=item.get("rate"),
                explore=False,
            ))

    explore = []
    for item in raw.get("explore", []):
        if isinstance(item, str):
            explore.append(FleetEntry(gpu=item, explore=True))
        elif isinstance(item, dict):
            explore.append(FleetEntry(
                gpu=item.get("gpu", ""),
                cloud=item.get("cloud"),
                count=item.get("count"),
                rate=item.get("rate"),
                explore=True,
            ))

    priority = raw.get("priority", {})
    priority_cost = priority.get("cost", 50) if isinstance(priority, dict) else 50

    budget = raw.get("budget", {})
    budget_monthly = budget.get("monthly_usd") if isinstance(budget, dict) else None
    budget_hourly = budget.get("hourly_usd") if isinstance(budget, dict) else None
    org_budget_monthly = budget.get("org_ceiling_usd") if isinstance(budget, dict) else None

    interconnect = raw.get("interconnect")
    if isinstance(interconnect, str) and interconnect.strip():
        interconnect = interconnect.strip().lower()
        if interconnect not in _ALLOWED_INTERCONNECTS:
            interconnect = None
    else:
        interconnect = None

    return AllocConfig(
        fleet=fleet,
        explore=explore,
        objective=objective,
        priority_cost=priority_cost,
        budget_monthly=budget_monthly,
        budget_hourly=budget_hourly,
        org_budget_monthly=org_budget_monthly,
        interconnect=interconnect,
    )
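A sketch of driving this module programmatically (values are illustrative; `alloc init` is the normal way to produce the file):

```python
from alloc.yaml_config import AllocConfig, FleetEntry, validate_config, write_alloc_config

cfg = AllocConfig(
    fleet=[FleetEntry(gpu="H100", count=8, rate=2.49)],
    explore=[FleetEntry(gpu="L40S", explore=True)],
    objective="fastest_within_budget",
    priority_cost=70,          # latency priority becomes 30
    budget_hourly=25.0,
)
assert validate_config(cfg) == []                    # empty list means valid
print(write_alloc_config(cfg, path=".alloc.yaml"))   # path written
```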
alloc-0.0.1.dist-info/METADATA
ADDED
@@ -0,0 +1,256 @@
Metadata-Version: 2.4
Name: alloc
Version: 0.0.1
Summary: Engineer-first training calibration: estimate VRAM fit, profile short runs, and pick GPU configs under real budget constraints.
Author-email: Alloc Labs <hello@alloclabs.com>
License-Expression: Apache-2.0
Project-URL: Homepage, https://alloclabs.com
Project-URL: Repository, https://github.com/alloc-labs/alloc
Classifier: Development Status :: 3 - Alpha
Classifier: Intended Audience :: Developers
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.8
Classifier: Programming Language :: Python :: 3.9
Classifier: Programming Language :: Python :: 3.10
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Requires-Python: >=3.8
Description-Content-Type: text/markdown
Requires-Dist: typer>=0.9.0
Requires-Dist: rich>=13.0.0
Requires-Dist: httpx>=0.24.0
Requires-Dist: pydantic>=2.0.0
Requires-Dist: pyyaml>=6.0
Provides-Extra: gpu
Requires-Dist: pynvml>=11.5.0; extra == "gpu"
Provides-Extra: dev
Requires-Dist: pytest>=7.0.0; extra == "dev"
Requires-Dist: pytest-cov>=4.0.0; extra == "dev"

# alloc (by [Alloc Labs](https://www.alloclabs.com))

Engineer-first training calibration: estimate VRAM fit, profile short runs, and pick GPU configs under real budget constraints.

[](https://www.alloclabs.com)
[](https://pypi.org/project/alloc/)
[](LICENSE)

> Built by [Alloc Labs](https://www.alloclabs.com): reduce ML training costs with better pre-flight decisions and faster feedback loops.

## What Alloc Does

Most ML teams overspend because resource decisions are guesswork and feedback arrives too late. Alloc gives you a progressive workflow:

- **Pre-flight**: estimate VRAM fit and rank feasible configs by objective (`alloc scan`, `alloc ghost`)
- **Calibration run**: measure peak VRAM + utilization (and optionally step timing) from a short run (`alloc run`)
- **Run history**: upload artifacts for team visibility and budget-aware proposals (`alloc upload`)

Alloc is launcher-first. It works with `python`, `torchrun`, `accelerate`, and cluster entrypoints (Slurm, Ray, Kubernetes) because it does not require framework-specific wrappers for baseline value.

## Who This Is For

- **Solo engineers** who want a fast sanity check before burning GPU time
- **ML teams** who need repeatable right-sizing and bottleneck visibility
- **Platform/infra leads** who want budget-aware controls without rewriting training code

## Why It Is Low Friction

- **No code changes required** for baseline value (`alloc run`)
- **Optional deeper integration** via callbacks when you want richer timing signals
- **Local-first artifacts** so users still get value without cloud connectivity
- **Progressive adoption** from local CLI to team workflows and governance

## Install

```bash
pip install alloc

# With GPU monitoring support (NVML via pynvml)
pip install alloc[gpu]
```

Notes:
- `alloc` does not depend on torch. If you want `alloc ghost train.py` to infer param counts from a script, torch must be installed in that environment; otherwise use `--param-count-b`.
- `alloc run` will still execute your command without `alloc[gpu]`, but it cannot collect GPU metrics.

## Commands

### `alloc scan`: Remote Ghost Scan (no GPU needed)

```bash
alloc scan --model llama-3-70b --gpu A100-80GB
alloc scan --model mistral-7b --gpu A10G --strategy fsdp --num-gpus 4
alloc scan --param-count-b 13.0 --gpu H100-80GB --dtype bf16

# Objective + budget constraints
alloc scan --model llama-3-70b --gpu H100-80GB --objective fastest_within_budget --max-budget-hourly 12

# Topology hints (optional, improves planner quality)
alloc scan --param-count-b 70 --gpu H100-80GB --num-gpus 64 --num-nodes 8 --gpus-per-node 8 --interconnect infiniband
```

### `alloc ghost`: Local VRAM estimation

```bash
alloc ghost train.py --dtype bf16 --batch-size 32
alloc ghost train.py --param-count-b 7.0  # manual override
```

Analyzes your training script to discover model parameters and computes a VRAM breakdown. Uses a three-method fallback: (1) `--param-count-b` manual override, (2) subprocess execution to find `nn.Module` classes and count parameters, (3) AST parsing for `from_pretrained()` calls.

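The AST fallback is the lightest of the three: it never executes the script. A minimal sketch of the idea (alloc's real extractor in `model_extractor.py` handles more cases):

```python
import ast

def find_pretrained_ids(source: str):
    """Collect string-literal model IDs from *.from_pretrained("...") calls."""
    ids = []
    for node in ast.walk(ast.parse(source)):
        if (
            isinstance(node, ast.Call)
            and isinstance(node.func, ast.Attribute)
            and node.func.attr == "from_pretrained"
            and node.args
            and isinstance(node.args[0], ast.Constant)
            and isinstance(node.args[0].value, str)
        ):
            ids.append(node.args[0].value)
    return ids

print(find_pretrained_ids('m = AutoModel.from_pretrained("mistralai/Mistral-7B-v0.1")'))
# ['mistralai/Mistral-7B-v0.1']
```
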
### `alloc run`: Training with GPU monitoring

```bash
alloc run python train.py          # calibrate and exit (default)
alloc run --full python train.py   # monitor full training run
alloc run torchrun --nproc_per_node=4 train.py
alloc run -- python train.py --epochs 10
```

Wraps your command, monitors GPU memory/utilization/power via `pynvml`, and writes an artifact.

**Default: calibrate-and-exit.** Auto-stops when GPU metrics stabilize, prints a verdict with bottleneck classification and a top recommendation, then exits. Use `--timeout N` to adjust max calibration time (default 120s). Use `--full` to monitor the entire run.

**Multi-GPU:** Automatically discovers all GPUs used by the process tree (works with `torchrun`, `accelerate launch`, etc.).

**Hardware context:** Captures driver version, CUDA version, and SM compute capability from NVML.

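Calibrate-and-exit hinges on the multi-signal stability detector in `stability.py` (VRAM plateau plus utilization/power settling). A simplified illustration of that idea; the window size and thresholds below are assumptions, not the shipped values:

```python
import statistics

def looks_stable(vram_mb, util_pct, power_w, window=10,
                 vram_tol_mb=64.0, util_std_max=5.0, power_std_max=15.0):
    """Declare calibration done when recent VRAM has plateaued and
    utilization/power variance has settled (all thresholds illustrative)."""
    if len(vram_mb) < window:
        return False
    recent = vram_mb[-window:]
    plateau = max(recent) - min(recent) <= vram_tol_mb
    util_ok = statistics.pstdev(util_pct[-window:]) <= util_std_max
    power_ok = statistics.pstdev(power_w[-window:]) <= power_std_max
    return plateau and util_ok and power_ok
```
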
### `alloc login`: Authenticate with dashboard

```bash
alloc login
# Prompts for email + password, stores token + refresh_token in ~/.alloc/config.json

alloc login --token <ACCESS_TOKEN>
# Paste an access token from the dashboard (no password prompt)
```

### `alloc whoami`: Show current auth + org context

```bash
alloc whoami
alloc whoami --json
```

Prints the current identity (when logged in), plus objective, effective budget cap, and fleet counts.

### `alloc logout`: Clear local session

```bash
alloc logout
```

Clears saved `token`/`refresh_token` from `~/.alloc/config.json`.

### `alloc upload`: Upload artifact to dashboard

```bash
alloc upload alloc_artifact.json.gz
```

Uploads a previously saved `.json.gz` artifact to the dashboard via `POST /runs/ingest`. Requires authentication (`alloc login` first).

If your session token has expired and a `refresh_token` is available (password login flow), `alloc upload` refreshes once and retries automatically.

### `alloc catalog`: Browse GPU hardware catalog

```bash
alloc catalog list                       # list all 13 GPUs (sorted by VRAM)
alloc catalog list --sort cost           # sort by $/hr
alloc catalog list --sort tflops         # sort by BF16 TFLOPS
alloc catalog show H100                  # detailed specs for H100
alloc catalog show nvidia-a100-sxm-80gb  # lookup by stable ID
```

Offline reference for GPU specs, interconnect details, and cloud pricing. Supports aliases (H100, A100, T4) and stable IDs.

### `alloc init`: Configure GPU fleet and budget

```bash
alloc init                   # interactive wizard
alloc init --yes             # non-interactive defaults (full catalog, 50/50 priority)
alloc init --from-org --yes  # pull fleet/budget/objective from your org (requires alloc login)
```

Creates a `.alloc.yaml` file in the current directory with your GPU fleet, explore list, budget, and priority weights. When present, `ghost`, `run`, and `scan` automatically use fleet context for recommendations. Use `--no-config` on any command to skip it.

### `alloc version`

```bash
alloc version
```

## Python API

```python
import alloc

# Static VRAM analysis (never crashes your training)
report = alloc.ghost(model)
print(report.total_gb)  # e.g., 115.42

# Or from param count (no torch needed)
report = alloc.ghost(param_count_b=7.0, dtype="bf16")
```

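The exact coefficients live in `ghost.py`, but the scale of the numbers is easy to sanity-check by hand. A back-of-envelope version for a 7B model, assuming bf16 weights/gradients and fp32 Adam moments (activations and buffers excluded):

```python
params = 7.0e9
gib = 2**30
weights   = params * 2 / gib   # bf16: 2 bytes/param  ≈ 13.0 GiB
grads     = params * 2 / gib   # bf16 gradients       ≈ 13.0 GiB
optimizer = params * 8 / gib   # Adam m + v in fp32   ≈ 52.2 GiB
print(f"≈ {weights + grads + optimizer:.1f} GiB before activations")  # ≈ 78.2 GiB
```
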
## Framework Callbacks

Optional callbacks for deeper profiling. Captures step-level timing, throughput, and dataloader wait estimates.

```python
# HuggingFace Transformers
from alloc import HuggingFaceCallback
trainer = Trainer(..., callbacks=[HuggingFaceCallback()])

# PyTorch Lightning
from alloc import LightningCallback
trainer = Trainer(..., callbacks=[LightningCallback()])
```

Callbacks write a `.alloc_callback.json` sidecar with step time (p50/p90), samples/sec, and estimated dataloader wait %. This unlocks higher confidence analysis and dataloader bottleneck detection.

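To see what a callback captured on a given run, the sidecar is plain JSON; its exact key set isn't documented here, so the simplest approach is just to inspect it:

```python
import json

with open(".alloc_callback.json") as f:
    sidecar = json.load(f)
print(sorted(sidecar))  # list the keys the callback actually wrote
```
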
## Configuration

Alloc works with zero config. You can optionally configure it with environment variables and/or a `.alloc.yaml` in your repo.

| Variable | Default | Description |
|----------|---------|-------------|
| `ALLOC_API_URL` | `https://alloc-production-ffc2.up.railway.app` | API endpoint for remote scans |
| `ALLOC_TOKEN` | (empty) | Auth token for API calls |
| `ALLOC_UPLOAD` | `false` | Upload results to dashboard (`alloc run --upload` also works) |
| `ALLOC_OUT` | `alloc_artifact.json.gz` | Artifact output path |
| `ALLOC_GPU_COUNT_CANDIDATES` | (empty) | Override GPU-count candidates for ranking (comma-separated ints) |

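A sketch of how these variables compose, in the spirit of `config.py` (the real module may read them differently; names and defaults come from the table above):

```python
import os

api_url = os.environ.get("ALLOC_API_URL", "https://alloc-production-ffc2.up.railway.app")
token = os.environ.get("ALLOC_TOKEN", "")
upload = os.environ.get("ALLOC_UPLOAD", "false").lower() in ("1", "true", "yes")
out_path = os.environ.get("ALLOC_OUT", "alloc_artifact.json.gz")
```
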
## Architecture

| Module | Purpose |
|--------|---------|
| `ghost.py` | VRAM estimation from parameter count. Computes weights + gradients + optimizer + activations + buffer breakdown. |
| `model_extractor.py` | Three-method model discovery: subprocess execution (`nn.Module` finder), AST parsing (`from_pretrained`), manual override. |
| `probe.py` | External GPU monitoring via `pynvml`. Process-tree-aware multi-GPU discovery. Captures hardware context (driver, CUDA, SM version). |
| `stability.py` | Multi-signal stability detection for calibrate-and-exit (VRAM plateau + util std dev + power std dev). |
| `catalog/` | Bundled GPU hardware catalog (13 GPUs) with specs and pricing. Powers `alloc catalog` commands. |
| `context.py` | Context autodiscovery: git (SHA, branch, repo), container (Docker/Podman), Ray (job ID, cluster). |
| `artifact_writer.py` | Writes `alloc_artifact.json.gz` with probe, ghost, hardware, and context sections. |
| `cli.py` | Typer CLI with `ghost`, `run`, `scan`, `login`, `upload`, `init`, `catalog`, `version` commands. |
| `yaml_config.py` | `.alloc.yaml` parser: fleet, explore, priority, budget. Loaded automatically by `ghost`, `run`, `scan`. |
| `callbacks.py` | Framework callbacks: HuggingFace `TrainerCallback` and Lightning `Callback` with step timing (p50/p90), throughput, and dataloader wait estimation. |
| `upload.py` | Artifact uploader: POSTs `.json.gz` summaries to `/runs/ingest`. |
| `display.py` | Rich terminal formatting for reports. |
| `config.py` | Env-var-only configuration (API URL, Supabase URL, token storage). |

## Design Principles

1. **Zero config**: `alloc run python train.py` works out of the box
2. **No monkey-patching**: External monitoring only; deeper signals are opt-in
3. **Never crash the user's training**: All Alloc failures are caught and training continues
4. **Progressive disclosure**: Individual use first, team governance later

## Telemetry Levels

Alloc intentionally starts non-invasive and adds richer signals only when you opt in.

- **NVML (today)**: peak VRAM, GPU utilization, power draw, basic hardware context (driver/CUDA/SM), multi-GPU discovery from the process tree.
- **Framework timing (today, opt-in)**: step time p50/p90, samples/sec, estimated dataloader wait percentage via HF/Lightning callbacks.
- **Distributed timing (planned, opt-in)**: per-rank timing skew, communication overhead, stronger interconnect-aware recommendations.