alloc 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
alloc/upload.py ADDED
@@ -0,0 +1,138 @@
1
+ """Artifact upload — POST extracted fields to /runs/ingest as JSON."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import gzip
6
+ import json
7
+ from typing import Optional
8
+
9
+
10
class UploadLimitError(Exception):
    """Raised when the server rejects an upload due to tier limits.

    Carries the HTTP status code and the server's detail payload so callers
    can render upgrade prompts or limit information.
    """

    def __init__(self, status_code: int, detail: dict):
        self.status_code = status_code
        self.detail = detail
        # Prefer the server-supplied message; fall back to a generic one.
        fallback = f"Upload rejected (HTTP {status_code})"
        super().__init__(detail.get("message", fallback))
18
+
19
+
20
+ def _normalize_gpu_type(raw_name: object) -> Optional[str]:
21
+ """Best-effort normalization from NVML GPU names to catalog IDs."""
22
+ if raw_name is None:
23
+ return None
24
+
25
+ name = str(raw_name).strip()
26
+ if not name:
27
+ return None
28
+
29
+ upper = name.upper()
30
+ if "H100" in upper and "NVL" in upper:
31
+ return "H100-NVL"
32
+ if "H200" in upper:
33
+ return "H200"
34
+ if "H100" in upper:
35
+ return "H100-80GB"
36
+ if "A100" in upper and "40" in upper:
37
+ return "A100-40GB"
38
+ if "A100" in upper:
39
+ return "A100-80GB"
40
+ if "A10G" in upper:
41
+ return "A10G"
42
+ if "L40S" in upper:
43
+ return "L40S"
44
+ if "L4" in upper:
45
+ return "L4"
46
+ if "T4" in upper:
47
+ return "T4"
48
+ if "V100" in upper:
49
+ return "V100-32GB"
50
+ if "4090" in upper:
51
+ return "RTX-4090"
52
+ if "3090" in upper:
53
+ return "RTX-3090"
54
+ return None
55
+
56
+
57
+ def _to_positive_int(value: object, default: int = 1) -> int:
58
+ """Parse a positive integer with a safe fallback."""
59
+ try:
60
+ parsed = int(value) # type: ignore[arg-type]
61
+ return parsed if parsed > 0 else default
62
+ except Exception:
63
+ return default
64
+
65
+
66
def upload_artifact(artifact_path: str, api_url: str, token: str) -> dict:
    """Upload a .json.gz artifact to POST /runs/ingest.

    Reads the artifact, extracts summary fields, and sends JSON.

    Args:
        artifact_path: Path to the gzip-compressed JSON artifact.
        api_url: Base API URL; a trailing slash is tolerated.
        token: Bearer token for the Authorization header.

    Returns:
        Response dict with run_id and status.

    Raises:
        UploadLimitError: On 402/403/429 tier-limit rejections.
        httpx.HTTPStatusError: On other error responses (raise_for_status).
    """
    import httpx

    with gzip.open(artifact_path, "rt", encoding="utf-8") as f:
        report = json.load(f)

    # Sections may be absent or explicitly null in older artifacts.
    probe = report.get("probe") or {}
    ghost = report.get("ghost") or {}
    hardware = report.get("hardware") or {}

    gpu_type = (
        probe.get("gpu_type")
        or _normalize_gpu_type(probe.get("gpu_name"))
        or _normalize_gpu_type(hardware.get("gpu_name"))
    )
    num_gpus = _to_positive_int(
        probe.get("num_gpus") or hardware.get("num_gpus_detected"),
        default=1,
    )

    payload = {
        "model_name": probe.get("model_name"),
        "gpu_type": gpu_type,
        "num_gpus": num_gpus,
        "strategy": probe.get("strategy"),
        "tp_degree": probe.get("tp_degree"),
        "pp_degree": probe.get("pp_degree"),
        "dp_degree": probe.get("dp_degree"),
        "num_nodes": probe.get("num_nodes"),
        "gpus_per_node": probe.get("gpus_per_node"),
        "interconnect_type": probe.get("interconnect_type"),
        "objective": probe.get("objective"),
        "max_budget_hourly": probe.get("max_budget_hourly"),
        "peak_vram_mb": probe.get("peak_vram_mb"),
        "avg_gpu_util": probe.get("avg_gpu_util"),
        "avg_power_watts": probe.get("avg_power_watts"),
        "duration_s": probe.get("duration_seconds"),
        "exit_code": probe.get("exit_code"),
        "probe_samples": probe.get("samples"),
        "step_count": probe.get("step_count"),
        "step_time_ms_p50": probe.get("step_time_ms_p50"),
        "step_time_ms_p90": probe.get("step_time_ms_p90"),
        "samples_per_sec": probe.get("samples_per_sec"),
        "dataloader_wait_pct": probe.get("dataloader_wait_pct"),
        "ghost_report": ghost if ghost else None,
        "source": probe.get("source") or "cli",
    }

    # Tolerate a trailing slash in api_url (avoids "//runs/ingest").
    endpoint = f"{api_url.rstrip('/')}/runs/ingest"

    with httpx.Client(timeout=30) as client:
        resp = client.post(
            endpoint,
            json=payload,
            headers={
                "Content-Type": "application/json",
                "Authorization": f"Bearer {token}",
            },
        )

    if resp.status_code in (402, 403, 429):
        try:
            detail = resp.json().get("detail", {})
        except Exception:
            detail = {"message": resp.text[:200]}
        # FastAPI-style errors often carry a plain string under "detail";
        # UploadLimitError expects a dict it can .get("message") from.
        if not isinstance(detail, dict):
            detail = {"message": str(detail)}
        raise UploadLimitError(resp.status_code, detail)

    resp.raise_for_status()
    return resp.json()
alloc/yaml_config.py ADDED
@@ -0,0 +1,287 @@
1
+ """.alloc.yaml config — GPU fleet, explore, budget, priority.
2
+
3
+ Searches for .alloc.yaml in cwd, then parents, then ~/.alloc/preferences.yaml.
4
+ Never crashes. Returns None or defaults on missing/invalid config.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import os
10
+ from dataclasses import dataclass, field, asdict
11
+ from pathlib import Path
12
+ from typing import Dict, List, Optional
13
+
14
+ import yaml
15
+
16
+
17
+ CONFIG_FILENAME = ".alloc.yaml"
18
+ GLOBAL_PREFS_PATH = Path.home() / ".alloc" / "preferences.yaml"
19
+
20
+ _ALLOWED_OBJECTIVES = {
21
+ "cheapest",
22
+ "fastest",
23
+ "fastest_within_budget",
24
+ "best_value",
25
+ }
26
+
27
+ _ALLOWED_INTERCONNECTS = {
28
+ "pcie",
29
+ "nvlink",
30
+ "infiniband",
31
+ "unknown",
32
+ }
33
+
34
+
35
@dataclass
class FleetEntry:
    """A GPU in the user's fleet or explore list.

    Entries come from `.alloc.yaml` (either a bare GPU-name string or a
    mapping with optional overrides); `explore=True` marks a GPU the user
    does not own but wants evaluated alongside the owned fleet.
    """

    gpu: str  # GPU ID or alias (e.g. "H100", "nvidia-h100-sxm-80gb")
    cloud: Optional[str] = None  # "aws", "gcp", "azure", "lambda", etc.
    count: Optional[int] = None  # Max GPUs available
    rate: Optional[float] = None  # Custom $/hr override (wins over catalog price)
    explore: bool = False  # True = "I don't have this but want to evaluate"
44
+
45
+
46
@dataclass
class AllocConfig:
    """Parsed .alloc.yaml content."""

    fleet: List[FleetEntry] = field(default_factory=list)
    explore: List[FleetEntry] = field(default_factory=list)
    objective: Optional[str] = None  # cheapest | fastest | fastest_within_budget | best_value
    priority_cost: int = 50  # 0-100, latency = 100 - cost
    budget_monthly: Optional[float] = None  # Monthly budget in USD
    budget_hourly: Optional[float] = None  # Hourly budget cap
    org_budget_monthly: Optional[float] = None  # Org ceiling (from --from-org sync)
    interconnect: Optional[str] = None  # pcie | nvlink | infiniband | unknown

    @property
    def priority_latency(self) -> int:
        """Latency weight: the complement of the cost weight."""
        return 100 - self.priority_cost

    @property
    def fleet_gpu_ids(self) -> List[str]:
        """GPU IDs from fleet entries."""
        return [entry.gpu for entry in self.fleet]

    @property
    def explore_gpu_ids(self) -> List[str]:
        """GPU IDs from explore entries."""
        return [entry.gpu for entry in self.explore]

    @property
    def all_gpu_ids(self) -> List[str]:
        """All GPU IDs (fleet + explore)."""
        return self.fleet_gpu_ids + self.explore_gpu_ids

    @property
    def rate_overrides(self) -> Dict[str, float]:
        """GPU ID → custom $/hr for entries with rate set."""
        return {
            entry.gpu: entry.rate
            for entry in self.fleet + self.explore
            if entry.rate is not None
        }

    def to_dict(self) -> dict:
        """Serialize to dict suitable for YAML output.

        Insertion order is deliberate (objective, fleet, explore, priority,
        budget, interconnect) since the YAML writer preserves it.
        """
        d: dict = {}

        if self.objective is not None:
            d["objective"] = self.objective
        if self.fleet:
            d["fleet"] = [_entry_to_dict(e) for e in self.fleet]
        if self.explore:
            d["explore"] = [_entry_to_dict(e) for e in self.explore]

        d["priority"] = {
            "cost": self.priority_cost,
            "latency": self.priority_latency,
        }

        # Assemble the budget section separately; only emit it when any
        # budget field is set.
        budget: dict = {}
        if self.budget_monthly is not None:
            budget["monthly_usd"] = self.budget_monthly
        if self.budget_hourly is not None:
            budget["hourly_usd"] = self.budget_hourly
        if self.org_budget_monthly is not None:
            budget["org_ceiling_usd"] = self.org_budget_monthly
        if budget:
            d["budget"] = budget

        if self.interconnect is not None:
            d["interconnect"] = self.interconnect

        return d
116
+
117
+
118
+ def _entry_to_dict(e: FleetEntry) -> dict:
119
+ """Serialize a FleetEntry, omitting None/default fields."""
120
+ d = {"gpu": e.gpu} # type: dict
121
+ if e.cloud is not None:
122
+ d["cloud"] = e.cloud
123
+ if e.count is not None:
124
+ d["count"] = e.count
125
+ if e.rate is not None:
126
+ d["rate"] = e.rate
127
+ return d
128
+
129
+
130
def load_alloc_config(path: Optional[str] = None) -> Optional[AllocConfig]:
    """Load and parse .alloc.yaml.

    Search order:
      1. Explicit path (if provided)
      2. .alloc.yaml in cwd
      3. .alloc.yaml in parent directories (up to filesystem root)
      4. ~/.alloc/preferences.yaml

    Returns None if no config file found or parse fails.
    """
    config_path = _find_config(path)
    if config_path is None:
        return None

    try:
        with open(config_path, "r") as f:
            raw = yaml.safe_load(f)
    except Exception:
        # Unreadable or malformed YAML is treated as "no config" per the
        # module's never-crash contract.
        return None

    # Empty files and non-mapping YAML (lists, scalars) are also "no config".
    return _parse_config(raw) if isinstance(raw, dict) else None
155
+
156
+
157
def validate_config(config: AllocConfig) -> List[str]:
    """Validate an AllocConfig. Returns list of error strings (empty = valid)."""
    errors: List[str] = []

    objective = config.objective
    if objective is not None and objective not in _ALLOWED_OBJECTIVES:
        errors.append(
            f"objective must be one of {sorted(_ALLOWED_OBJECTIVES)}, got {objective}"
        )

    interconnect = config.interconnect
    if interconnect is not None and interconnect not in _ALLOWED_INTERCONNECTS:
        errors.append(
            f"interconnect must be one of {sorted(_ALLOWED_INTERCONNECTS)}, got {interconnect}"
        )

    if not 0 <= config.priority_cost <= 100:
        errors.append(f"priority.cost must be 0-100, got {config.priority_cost}")

    if config.budget_monthly is not None and config.budget_monthly < 0:
        errors.append(f"budget.monthly_usd must be >= 0, got {config.budget_monthly}")

    if config.budget_hourly is not None and config.budget_hourly < 0:
        errors.append(f"budget.hourly_usd must be >= 0, got {config.budget_hourly}")

    # Per-entry checks apply to both owned and explore GPUs.
    for entry in config.fleet + config.explore:
        if not entry.gpu:
            errors.append("Fleet/explore entry missing 'gpu' field")
        if entry.rate is not None and entry.rate < 0:
            errors.append(f"Rate for {entry.gpu} must be >= 0, got {entry.rate}")
        if entry.count is not None and entry.count < 1:
            errors.append(f"Count for {entry.gpu} must be >= 1, got {entry.count}")

    return errors
189
+
190
+
191
def write_alloc_config(config: AllocConfig, path: Optional[str] = None) -> str:
    """Write config to YAML file. Returns the path written to."""
    out_path = path if path else os.path.join(os.getcwd(), CONFIG_FILENAME)

    with open(out_path, "w") as f:
        # Hand-written header comments, then the YAML payload. sort_keys=False
        # preserves the deliberate key order from AllocConfig.to_dict().
        f.write("# Alloc GPU configuration\n")
        f.write("# Docs: https://alloclabs.com/docs/right-sizing\n\n")
        yaml.dump(config.to_dict(), f, default_flow_style=False, sort_keys=False)

    return out_path
203
+
204
+
205
def _find_config(explicit_path: Optional[str] = None) -> Optional[str]:
    """Find .alloc.yaml by searching cwd → parents → global prefs."""
    if explicit_path:
        # An explicit path is authoritative: no fallback search when missing.
        return explicit_path if os.path.isfile(explicit_path) else None

    # Walk from cwd upward; the bound guards against pathological mounts.
    current = Path.cwd()
    for _ in range(50):
        candidate = current / CONFIG_FILENAME
        if candidate.is_file():
            return str(candidate)
        if current.parent == current:  # reached filesystem root
            break
        current = current.parent

    # Last resort: global preferences file.
    return str(GLOBAL_PREFS_PATH) if GLOBAL_PREFS_PATH.is_file() else None
228
+
229
+
230
def _parse_config(raw: dict) -> AllocConfig:
    """Parse raw YAML dict into AllocConfig.

    Tolerant by design (module contract: never crash on bad config): empty
    sections, wrong-typed values, and missing keys all fall back to defaults.
    """
    objective = raw.get("objective")
    if not isinstance(objective, str) or not objective.strip():
        objective = None

    # NOTE: raw.get("fleet", []) is not enough — a bare `fleet:` key in YAML
    # maps to None, which would crash the iteration. `or []` covers both the
    # missing-key and present-but-null cases.
    fleet = _parse_entries(raw.get("fleet") or [], explore=False)
    explore = _parse_entries(raw.get("explore") or [], explore=True)

    priority = raw.get("priority")
    priority_cost = priority.get("cost", 50) if isinstance(priority, dict) else 50
    if not isinstance(priority_cost, (int, float)):
        # A bare `cost:` key (None) or a string would break the
        # priority_latency arithmetic downstream; fall back to the default.
        priority_cost = 50

    budget = raw.get("budget")
    if not isinstance(budget, dict):
        budget = {}
    budget_monthly = budget.get("monthly_usd")
    budget_hourly = budget.get("hourly_usd")
    org_budget_monthly = budget.get("org_ceiling_usd")

    interconnect = raw.get("interconnect")
    if isinstance(interconnect, str) and interconnect.strip():
        interconnect = interconnect.strip().lower()
        if interconnect not in _ALLOWED_INTERCONNECTS:
            interconnect = None
    else:
        interconnect = None

    return AllocConfig(
        fleet=fleet,
        explore=explore,
        objective=objective,
        priority_cost=priority_cost,
        budget_monthly=budget_monthly,
        budget_hourly=budget_hourly,
        org_budget_monthly=org_budget_monthly,
        interconnect=interconnect,
    )


def _parse_entries(items: list, explore: bool) -> List[FleetEntry]:
    """Parse one fleet/explore list; items of unknown type are skipped.

    Each item is either a bare GPU-name string or a mapping with optional
    cloud/count/rate overrides.
    """
    entries: List[FleetEntry] = []
    for item in items:
        if isinstance(item, str):
            entries.append(FleetEntry(gpu=item, explore=explore))
        elif isinstance(item, dict):
            entries.append(FleetEntry(
                gpu=item.get("gpu", ""),
                cloud=item.get("cloud"),
                count=item.get("count"),
                rate=item.get("rate"),
                explore=explore,
            ))
    return entries
@@ -0,0 +1,256 @@
1
+ Metadata-Version: 2.4
2
+ Name: alloc
3
+ Version: 0.0.1
4
+ Summary: Engineer-first training calibration: estimate VRAM fit, profile short runs, and pick GPU configs under real budget constraints.
5
+ Author-email: Alloc Labs <hello@alloclabs.com>
6
+ License-Expression: Apache-2.0
7
+ Project-URL: Homepage, https://alloclabs.com
8
+ Project-URL: Repository, https://github.com/alloc-labs/alloc
9
+ Classifier: Development Status :: 3 - Alpha
10
+ Classifier: Intended Audience :: Developers
11
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
12
+ Classifier: Programming Language :: Python :: 3
13
+ Classifier: Programming Language :: Python :: 3.8
14
+ Classifier: Programming Language :: Python :: 3.9
15
+ Classifier: Programming Language :: Python :: 3.10
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Requires-Python: >=3.8
19
+ Description-Content-Type: text/markdown
20
+ Requires-Dist: typer>=0.9.0
21
+ Requires-Dist: rich>=13.0.0
22
+ Requires-Dist: httpx>=0.24.0
23
+ Requires-Dist: pydantic>=2.0.0
24
+ Requires-Dist: pyyaml>=6.0
25
+ Provides-Extra: gpu
26
+ Requires-Dist: pynvml>=11.5.0; extra == "gpu"
27
+ Provides-Extra: dev
28
+ Requires-Dist: pytest>=7.0.0; extra == "dev"
29
+ Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
30
+
31
+ # alloc (by [Alloc Labs](https://www.alloclabs.com))
32
+
33
+ Engineer-first training calibration: estimate VRAM fit, profile short runs, and pick GPU configs under real budget constraints.
34
+
35
+ [![Website](https://img.shields.io/badge/alloclabs.com-website-22c55e)](https://www.alloclabs.com)
36
+ [![PyPI](https://img.shields.io/pypi/v/alloc)](https://pypi.org/project/alloc/)
37
+ [![License](https://img.shields.io/badge/license-Apache--2.0-blue)](LICENSE)
38
+
39
+ > Built by [Alloc Labs](https://www.alloclabs.com): reduce ML training costs with better pre-flight decisions and faster feedback loops.
40
+
41
+ ## What Alloc Does
42
+
43
+ Most ML teams waste spend because resource decisions are guesswork and feedback arrives too late. Alloc gives you a progressive workflow:
44
+
45
+ - **Pre-flight**: estimate VRAM fit and rank feasible configs by objective (`alloc scan`, `alloc ghost`)
46
+ - **Calibration run**: measure peak VRAM + utilization (and optionally step timing) from a short run (`alloc run`)
47
+ - **Run history**: upload artifacts for team visibility and budget-aware proposals (`alloc upload`)
48
+
49
+ Alloc is launcher-first. It works with `python`, `torchrun`, `accelerate`, and cluster entrypoints (Slurm, Ray, Kubernetes) because it does not require framework-specific wrappers for baseline value.
50
+
51
+ ## Who This Is For
52
+
53
+ - **Solo engineers** who want a fast sanity check before burning GPU time
54
+ - **ML teams** who need repeatable right-sizing and bottleneck visibility
55
+ - **Platform/infra leads** who want budget-aware controls without rewriting training code
56
+
57
+ ## Why It Is Low Friction
58
+
59
+ - **No code changes required** for baseline value (`alloc run`)
60
+ - **Optional deeper integration** via callbacks when you want richer timing signals
61
+ - **Local-first artifacts** so users still get value without cloud connectivity
62
+ - **Progressive adoption** from local CLI to team workflows and governance
63
+
64
+ ## Install
65
+
66
+ ```bash
67
+ pip install alloc
68
+
69
+ # With GPU monitoring support (NVML via pynvml)
70
+ pip install alloc[gpu]
71
+ ```
72
+
73
+ Notes:
74
+ - `alloc` does not depend on torch. If you want `alloc ghost train.py` to infer param counts from a script, torch must be installed in that environment; otherwise, use `--param-count-b`.
75
+ - `alloc run` will still execute your command without `alloc[gpu]`, but it cannot collect GPU metrics.
76
+
77
+ ## Commands
78
+
79
+ ### `alloc scan`: Remote Ghost Scan (no GPU needed)
80
+
81
+ ```bash
82
+ alloc scan --model llama-3-70b --gpu A100-80GB
83
+ alloc scan --model mistral-7b --gpu A10G --strategy fsdp --num-gpus 4
84
+ alloc scan --param-count-b 13.0 --gpu H100-80GB --dtype bf16
85
+
86
+ # Objective + budget constraints
87
+ alloc scan --model llama-3-70b --gpu H100-80GB --objective fastest_within_budget --max-budget-hourly 12
88
+
89
+ # Topology hints (optional, improves planner quality)
90
+ alloc scan --param-count-b 70 --gpu H100-80GB --num-gpus 64 --num-nodes 8 --gpus-per-node 8 --interconnect infiniband
91
+ ```
92
+
93
+ ### `alloc ghost`: Local VRAM estimation
94
+
95
+ ```bash
96
+ alloc ghost train.py --dtype bf16 --batch-size 32
97
+ alloc ghost train.py --param-count-b 7.0 # manual override
98
+ ```
99
+
100
+ Analyzes your training script to discover model parameters and computes a VRAM breakdown. Uses a three-method fallback: (1) `--param-count-b` manual override, (2) subprocess execution to find `nn.Module` classes and count parameters, (3) AST parsing for `from_pretrained()` calls.
101
+
102
+ ### `alloc run`: Training with GPU monitoring
103
+
104
+ ```bash
105
+ alloc run python train.py # calibrate and exit (default)
106
+ alloc run --full python train.py # monitor full training run
107
+ alloc run torchrun --nproc_per_node=4 train.py
108
+ alloc run -- python train.py --epochs 10
109
+ ```
110
+
111
+ Wraps your command, monitors GPU memory/utilization/power via `pynvml`, and writes an artifact.
112
+
113
+ **Default: calibrate-and-exit.** Auto-stops when GPU metrics stabilize, prints a verdict with bottleneck classification and a top recommendation, then exits. Use `--timeout N` to adjust max calibration time (default 120s). Use `--full` to monitor the entire run.
114
+
115
+ **Multi-GPU:** Automatically discovers all GPUs used by the process tree (works with `torchrun`, `accelerate launch`, etc.).
116
+
117
+ **Hardware context:** Captures driver version, CUDA version, and SM compute capability from NVML.
118
+
119
+ ### `alloc login`: Authenticate with dashboard
120
+
121
+ ```bash
122
+ alloc login
123
+ # Prompts for email + password, stores token + refresh_token in ~/.alloc/config.json
124
+
125
+ alloc login --token <ACCESS_TOKEN>
126
+ # Paste an access token from the dashboard (no password prompt)
127
+ ```
128
+
129
+ ### `alloc whoami`: Show current auth + org context
130
+
131
+ ```bash
132
+ alloc whoami
133
+ alloc whoami --json
134
+ ```
135
+
136
+ Prints the current identity (when logged in), plus objective, effective budget cap, and fleet counts.
137
+
138
+ ### `alloc logout`: Clear local session
139
+
140
+ ```bash
141
+ alloc logout
142
+ ```
143
+
144
+ Clears saved `token`/`refresh_token` from `~/.alloc/config.json`.
145
+
146
+ ### `alloc upload`: Upload artifact to dashboard
147
+
148
+ ```bash
149
+ alloc upload alloc_artifact.json.gz
150
+ ```
151
+
152
+ Uploads a previously saved `.json.gz` artifact to the dashboard via `POST /runs/ingest`. Requires authentication (`alloc login` first).
153
+
154
+ If your session token has expired and a `refresh_token` is available (password login flow), `alloc upload` refreshes once and retries automatically.
155
+
156
+ ### `alloc catalog`: Browse GPU hardware catalog
157
+
158
+ ```bash
159
+ alloc catalog list # list all 13 GPUs (sorted by VRAM)
160
+ alloc catalog list --sort cost # sort by $/hr
161
+ alloc catalog list --sort tflops # sort by BF16 TFLOPS
162
+ alloc catalog show H100 # detailed specs for H100
163
+ alloc catalog show nvidia-a100-sxm-80gb # lookup by stable ID
164
+ ```
165
+
166
+ Offline reference for GPU specs, interconnect details, and cloud pricing. Supports aliases (H100, A100, T4) and stable IDs.
167
+
168
+ ### `alloc init`: Configure GPU fleet and budget
169
+
170
+ ```bash
171
+ alloc init # interactive wizard
172
+ alloc init --yes # non-interactive defaults (full catalog, 50/50 priority)
173
+ alloc init --from-org --yes # pull fleet/budget/objective from your org (requires alloc login)
174
+ ```
175
+
176
+ Creates a `.alloc.yaml` file in the current directory with your GPU fleet, explore list, budget, and priority weights. When present, `ghost`, `run`, and `scan` automatically use fleet context for recommendations. Use `--no-config` on any command to skip it.
177
+
178
+ ### `alloc version`
179
+
180
+ ```bash
181
+ alloc version
182
+ ```
183
+
184
+ ## Python API
185
+
186
+ ```python
187
+ import alloc
188
+
189
+ # Static VRAM analysis (never crashes your training)
190
+ report = alloc.ghost(model)
191
+ print(report.total_gb) # e.g., 115.42
192
+
193
+ # Or from param count (no torch needed)
194
+ report = alloc.ghost(param_count_b=7.0, dtype="bf16")
195
+ ```
196
+
197
+ ## Framework Callbacks
198
+
199
+ Optional callbacks for deeper profiling. Captures step-level timing, throughput, and dataloader wait estimates.
200
+
201
+ ```python
202
+ # HuggingFace Transformers
203
+ from alloc import HuggingFaceCallback
204
+ trainer = Trainer(..., callbacks=[HuggingFaceCallback()])
205
+
206
+ # PyTorch Lightning
207
+ from alloc import LightningCallback
208
+ trainer = Trainer(..., callbacks=[LightningCallback()])
209
+ ```
210
+
211
+ Callbacks write a `.alloc_callback.json` sidecar with step time (p50/p90), samples/sec, and estimated dataloader wait %. This unlocks higher confidence analysis and dataloader bottleneck detection.
212
+
213
+ ## Configuration
214
+
215
+ Alloc works with zero config. You can optionally configure it with environment variables and/or a `.alloc.yaml` in your repo.
216
+
217
+ | Variable | Default | Description |
218
+ |----------|---------|-------------|
219
+ | `ALLOC_API_URL` | `https://alloc-production-ffc2.up.railway.app` | API endpoint for remote scans |
220
+ | `ALLOC_TOKEN` | (empty) | Auth token for API calls |
221
+ | `ALLOC_UPLOAD` | `false` | Upload results to dashboard (`alloc run --upload` also works) |
222
+ | `ALLOC_OUT` | `alloc_artifact.json.gz` | Artifact output path |
223
+ | `ALLOC_GPU_COUNT_CANDIDATES` | (empty) | Override GPU-count candidates for ranking (comma-separated ints) |
224
+
225
+ ## Architecture
226
+
227
+ | Module | Purpose |
228
+ |--------|---------|
229
+ | `ghost.py` | VRAM estimation from parameter count. Computes weights + gradients + optimizer + activations + buffer breakdown. |
230
+ | `model_extractor.py` | Three-method model discovery: subprocess execution (`nn.Module` finder), AST parsing (`from_pretrained`), manual override. |
231
+ | `probe.py` | External GPU monitoring via `pynvml`. Process-tree aware multi-GPU discovery. Captures hardware context (driver, CUDA, SM version). |
232
+ | `stability.py` | Multi-signal stability detection for calibrate-and-exit (VRAM plateau + util std dev + power std dev). |
233
+ | `catalog/` | Bundled GPU hardware catalog (13 GPUs) with specs and pricing. Powers `alloc catalog` commands. |
234
+ | `context.py` | Context autodiscovery: git (SHA, branch, repo), container (Docker/Podman), Ray (job ID, cluster). |
235
+ | `artifact_writer.py` | Artifact Writer: writes `alloc_artifact.json.gz` with probe, ghost, hardware, and context sections. |
236
+ | `cli.py` | Typer CLI with `ghost`, `run`, `scan`, `login`, `upload`, `init`, `catalog`, `version` commands. |
237
+ | `yaml_config.py` | `.alloc.yaml` parser: fleet, explore, priority, budget. Loaded automatically by `ghost`, `run`, `scan`. |
238
+ | `callbacks.py` | Framework callbacks: HuggingFace `TrainerCallback` and Lightning `Callback` with step timing (p50/p90), throughput, and dataloader wait estimation. |
239
+ | `upload.py` | Artifact uploader: POSTs `.json.gz` to `POST /runs/ingest`. |
240
+ | `display.py` | Rich terminal formatting for reports. |
241
+ | `config.py` | Env-var-only configuration (API URL, Supabase URL, token storage). |
242
+
243
+ ## Design Principles
244
+
245
+ 1. **Zero config**: `alloc run python train.py` works out of the box
246
+ 2. **No monkey-patching**: External monitoring only; deeper signals are opt-in
247
+ 3. **Never crash user's training**: All Alloc failures are caught and training continues
248
+ 4. **Progressive disclosure**: Individual use first, team governance later
249
+
250
+ ## Telemetry Levels
251
+
252
+ Alloc intentionally starts non-invasive and adds richer signals only when you opt in.
253
+
254
+ - **NVML (today)**: peak VRAM, GPU utilization, power draw, basic hardware context (driver/CUDA/SM), multi-GPU discovery from the process tree.
255
+ - **Framework timing (today, opt-in)**: step time p50/p90, samples/sec, estimated dataloader wait percentage via HF/Lightning callbacks.
256
+ - **Distributed timing (planned, opt-in)**: per-rank timing skew, communication overhead, stronger interconnect-aware recommendations.