alloc-0.0.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
alloc/__init__.py ADDED
@@ -0,0 +1,11 @@
1
+ """Alloc — GPU intelligence for ML training."""
2
+
3
+ from __future__ import annotations
4
+
5
+ __version__ = "0.0.1"
6
+
7
+ from alloc.ghost import ghost, GhostReport
8
+ from alloc.callbacks import AllocCallback as HuggingFaceCallback
9
+ from alloc.callbacks import AllocLightningCallback as LightningCallback
10
+
11
+ __all__ = ["ghost", "GhostReport", "HuggingFaceCallback", "LightningCallback", "__version__"]
@@ -0,0 +1,67 @@
1
+ """Artifact Writer — write alloc_artifact.json.gz.
2
+
3
+ Optionally uploads to W&B if wandb is active.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ import gzip
9
+ import json
10
+ import os
11
+ from datetime import datetime, timezone
12
+ from typing import Optional
13
+
14
+
15
def write_report(
    ghost_report: Optional[dict] = None,
    probe_result: Optional[dict] = None,
    output_path: Optional[str] = None,
    hardware_context: Optional[dict] = None,
    context: Optional[dict] = None,
) -> str:
    """Write a gzipped JSON artifact to disk.

    Resolution order for the output path:
      1. Explicit ``output_path`` parameter
      2. ``ALLOC_OUT`` environment variable
      3. ``./alloc_artifact.json.gz``

    Returns the path written to, or "" on any failure. Never raises —
    artifact writing must not be able to break a training run.
    """
    try:
        destination = (
            output_path
            or os.environ.get("ALLOC_OUT", "")
            or "alloc_artifact.json.gz"
        )

        # Empty context collapses to None so the artifact stays tidy.
        payload = {
            "version": "0.0.1",
            "timestamp": datetime.now(timezone.utc).isoformat(),
            "ghost": ghost_report,
            "probe": probe_result,
            "hardware": hardware_context,
            "context": context or None,
        }

        with gzip.open(destination, "wt", encoding="utf-8") as handle:
            json.dump(payload, handle, indent=2)

        _try_wandb_upload(destination)
        return destination
    except Exception:
        # Fail-safe contract: swallow everything and signal failure via "".
        return ""
54
+
55
+
56
+ def _try_wandb_upload(path: str) -> None:
57
+ """Upload to W&B if wandb is active. Silent no-op otherwise."""
58
+ if not os.environ.get("WANDB_RUN_ID"):
59
+ return
60
+ try:
61
+ import wandb
62
+ if wandb.run is not None:
63
+ artifact = wandb.Artifact("alloc-profile", type="profile")
64
+ artifact.add_file(path)
65
+ wandb.run.log_artifact(artifact)
66
+ except Exception:
67
+ pass
alloc/callbacks.py ADDED
@@ -0,0 +1,342 @@
1
+ """Alloc Framework Callbacks — capture training step timing for artifact enrichment.
2
+
3
+ Callbacks for popular ML frameworks. Write timing stats to a sidecar file
4
+ (.alloc_callback.json) so the probe can compute throughput, step latency,
5
+ and dataloader wait estimates.
6
+
7
+ Usage (HuggingFace):
8
+ from alloc.callbacks import AllocCallback
9
+ trainer = Trainer(..., callbacks=[AllocCallback()])
10
+
11
+ Usage (PyTorch Lightning):
12
+ from alloc.callbacks import AllocLightningCallback
13
+ trainer = Trainer(..., callbacks=[AllocLightningCallback()])
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
+ import json
19
+ import math
20
+ import os
21
+ import time
22
+ from typing import Any, Dict, List, Optional
23
+
24
+
25
# ── Helpers ──────────────────────────────────────────────────────────────

# Rolling-window size: only the most recent N step durations are kept, so
# reported stats track current behavior and memory use stays bounded.
_ROLLING_WINDOW = 200  # keep last N step times

# Sidecar flush cadence: rewrite .alloc_callback.json every N steps.
_WRITE_EVERY = 50  # flush sidecar every N steps
29
+
30
+
31
+ def _compute_percentile(sorted_values, pct):
32
+ # type: (List[float], float) -> float
33
+ """Compute a percentile from an already-sorted list. No numpy."""
34
+ if not sorted_values:
35
+ return 0.0
36
+ n = len(sorted_values)
37
+ k = (pct / 100.0) * (n - 1)
38
+ f = math.floor(k)
39
+ c = math.ceil(k)
40
+ if f == c:
41
+ return sorted_values[int(k)]
42
+ return sorted_values[f] * (c - k) + sorted_values[c] * (k - f)
43
+
44
+
45
+ def _compute_timing_stats(step_times_ms):
46
+ # type: (List[float]) -> Dict[str, float]
47
+ """Compute p50, p90, mean, std, cv from a list of step times (ms)."""
48
+ if not step_times_ms:
49
+ return {}
50
+ sorted_vals = sorted(step_times_ms)
51
+ n = len(sorted_vals)
52
+ mean = sum(sorted_vals) / n
53
+ variance = sum((x - mean) ** 2 for x in sorted_vals) / n
54
+ std = math.sqrt(variance)
55
+ cv = std / mean if mean > 0 else 0.0
56
+ return {
57
+ "p50": round(_compute_percentile(sorted_vals, 50), 2),
58
+ "p90": round(_compute_percentile(sorted_vals, 90), 2),
59
+ "mean": round(mean, 2),
60
+ "std": round(std, 2),
61
+ "cv": round(cv, 4),
62
+ }
63
+
64
+
65
+ def _estimate_dataloader_wait(cv):
66
+ # type: (float) -> float
67
+ """Estimate dataloader wait % from step-time coefficient of variation.
68
+
69
+ Maps CV 0.1–0.5 linearly to 0–30%, capped at 30%.
70
+ Low CV = consistent steps = no data stalls.
71
+ High CV = irregular steps = likely waiting for data.
72
+
73
+ This is a heuristic estimate — direct measurement would require
74
+ monkey-patching the dataloader, which violates zero-config.
75
+ """
76
+ if cv <= 0.1:
77
+ return 0.0
78
+ if cv >= 0.5:
79
+ return 30.0
80
+ # Linear interpolation: 0.1 → 0%, 0.5 → 30%
81
+ return round((cv - 0.1) / 0.4 * 30.0, 1)
82
+
83
+
84
+ def _detect_distributed():
85
+ # type: () -> tuple
86
+ """Detect if running inside a torch.distributed process group.
87
+
88
+ Returns (is_distributed, rank, world_size). Fail-safe: returns
89
+ (False, 0, 1) if torch.distributed is unavailable or not initialized.
90
+ """
91
+ try:
92
+ import torch.distributed as dist
93
+ if dist.is_initialized():
94
+ return True, dist.get_rank(), dist.get_world_size()
95
+ except Exception:
96
+ pass
97
+ return False, 0, 1
98
+
99
+
100
+ def _estimate_comm_overhead(step_times_ms, dataloader_wait_pct=0.0):
101
+ # type: (List[float], float) -> Optional[float]
102
+ """Estimate communication overhead % for distributed training.
103
+
104
+ Uses the p90/p50 spread as a proxy for sync barrier delays.
105
+ Subtracts estimated dataloader contribution to avoid double-counting.
106
+ Returns None if insufficient data.
107
+ """
108
+ if len(step_times_ms) < 10:
109
+ return None
110
+ sorted_vals = sorted(step_times_ms)
111
+ p50 = _compute_percentile(sorted_vals, 50)
112
+ p90 = _compute_percentile(sorted_vals, 90)
113
+ if p50 <= 0:
114
+ return None
115
+ raw_pct = ((p90 - p50) / p50) * 100
116
+ comm_pct = max(0.0, raw_pct - dataloader_wait_pct)
117
+ return round(min(40.0, comm_pct), 1)
118
+
119
+
120
+ def _write_callback_data(data):
121
+ # type: (Dict[str, Any]) -> None
122
+ """Write callback data to the alloc sidecar file.
123
+
124
+ Creates .alloc_callback.json next to the artifact.
125
+ Fail-safe: if write fails, training continues unaffected.
126
+ """
127
+ path = os.path.join(os.getcwd(), ".alloc_callback.json")
128
+ try:
129
+ with open(path, "w") as f:
130
+ json.dump(data, f)
131
+ except Exception:
132
+ pass
133
+
134
+
135
def _build_sidecar(
    framework,  # type: str
    step_count,  # type: int
    step_times_ms,  # type: List[float]
    batch_size,  # type: Optional[int]
    is_distributed=False,  # type: bool
    rank=0,  # type: int
    world_size=1,  # type: int
):
    # type: (...) -> Dict[str, Any]
    """Assemble the sidecar payload from collected timing data.

    Distributed fields (rank, world_size, comm overhead) are only attached
    when is_distributed is True.
    """
    stats = _compute_timing_stats(step_times_ms)
    wait_pct = _estimate_dataloader_wait(stats.get("cv", 0.0))

    # Throughput = effective batch size / median step time.
    throughput = None  # type: Optional[float]
    p50 = stats.get("p50")
    if p50 and p50 > 0 and batch_size and batch_size > 0:
        throughput = round(batch_size / (p50 / 1000.0), 2)

    sidecar = {
        "framework": framework,
        "step_count": step_count,
        "step_time_ms_p50": stats.get("p50"),
        "step_time_ms_p90": stats.get("p90"),
        "step_time_ms_mean": stats.get("mean"),
        "step_time_ms_std": stats.get("std"),
        "samples_per_sec": throughput,
        "batch_size": batch_size,
        "dataloader_wait_pct": wait_pct,
    }

    if is_distributed:
        sidecar["is_distributed"] = True
        sidecar["rank"] = rank
        sidecar["world_size"] = world_size
        comm = _estimate_comm_overhead(step_times_ms, wait_pct)
        if comm is not None:
            sidecar["comm_overhead_pct"] = comm

    return sidecar
176
+
177
+
178
+ # ── HuggingFace Callback ─────────────────────────────────────────────────
179
+
180
try:
    from transformers import TrainerCallback

    class AllocCallback(TrainerCallback):
        """HuggingFace Trainer callback that captures step timing for Alloc.

        Measures the wall time between on_step_begin and on_step_end with
        time.monotonic(), keeps a rolling window of the last
        _ROLLING_WINDOW durations, and periodically writes summary stats to
        the .alloc_callback.json sidecar via _write_callback_data().
        """

        def __init__(self):
            # type: () -> None
            self.step_count = 0  # type: int
            self._step_times_ms = []  # type: List[float]
            self._step_start = None  # type: Optional[float]  # monotonic start of current step
            self._batch_size = None  # type: Optional[int]  # effective per-device batch (incl. grad accum)
            self._last_write_step = 0  # type: int  # step at which we last flushed the sidecar
            self._dist_checked = False  # type: bool  # have we probed torch.distributed yet?
            self._is_distributed = False  # type: bool
            self._rank = 0  # type: int
            self._world_size = 1  # type: int

        def on_step_begin(self, args, state, control, **kwargs):
            """Record the step start time and lazily probe the process group."""
            self._step_start = time.monotonic()
            # Detect distributed once after process group is initialized
            # (at __init__ time the group may not exist yet).
            if not self._dist_checked:
                self._is_distributed, self._rank, self._world_size = _detect_distributed()
                self._dist_checked = True

        def on_step_end(self, args, state, control, **kwargs):
            """Accumulate the step duration and periodically flush the sidecar."""
            self.step_count = state.global_step

            # Compute step duration
            if self._step_start is not None:
                elapsed_ms = (time.monotonic() - self._step_start) * 1000.0
                self._step_times_ms.append(elapsed_ms)
                # Rolling window: keep only the most recent samples.
                if len(self._step_times_ms) > _ROLLING_WINDOW:
                    self._step_times_ms = self._step_times_ms[-_ROLLING_WINDOW:]
                self._step_start = None

            # Resolve batch size once: per-device batch × gradient accumulation
            # gives samples per optimizer step on this device.
            if self._batch_size is None:
                try:
                    bs = args.per_device_train_batch_size
                    ga = getattr(args, "gradient_accumulation_steps", 1) or 1
                    self._batch_size = bs * ga
                except Exception:
                    self._batch_size = None

            # Periodic write: flush every _WRITE_EVERY steps so the sidecar
            # stays fresh even if training is interrupted.
            if self.step_count - self._last_write_step >= _WRITE_EVERY:
                self._flush()
                self._last_write_step = self.step_count

        def on_train_end(self, args, state, control, **kwargs):
            """Final flush so the sidecar reflects the complete run."""
            self.step_count = state.global_step
            self._flush()

        def _flush(self):
            # type: () -> None
            # Build and persist the sidecar payload; both helpers are fail-safe.
            data = _build_sidecar(
                framework="huggingface",
                step_count=self.step_count,
                step_times_ms=self._step_times_ms,
                batch_size=self._batch_size,
                is_distributed=self._is_distributed,
                rank=self._rank,
                world_size=self._world_size,
            )
            _write_callback_data(data)

except ImportError:
    # transformers not installed — provide a stub that raises a clear error
    # at construction time instead of an opaque NameError at import time.
    class AllocCallback:  # type: ignore[no-redef]
        """Stub — install transformers to use AllocCallback."""

        def __init__(self):
            raise ImportError(
                "AllocCallback requires the `transformers` package. "
                "Install with: pip install transformers"
            )
258
+
259
+
260
+ # ── PyTorch Lightning Callback ───────────────────────────────────────────
261
+
262
try:
    from lightning.pytorch.callbacks import Callback as LightningBaseCallback

    class AllocLightningCallback(LightningBaseCallback):
        """PyTorch Lightning callback that captures step timing for Alloc.

        Mirrors AllocCallback: times each train batch with time.monotonic(),
        keeps a rolling window of durations, and periodically writes summary
        stats to the .alloc_callback.json sidecar.
        """

        def __init__(self):
            # type: () -> None
            super().__init__()
            self.step_count = 0  # type: int
            self._step_times_ms = []  # type: List[float]
            self._step_start = None  # type: Optional[float]  # monotonic start of current batch
            self._batch_size = None  # type: Optional[int]  # resolved once from the first batch
            self._last_write_step = 0  # type: int
            self._dist_checked = False  # type: bool
            self._is_distributed = False  # type: bool
            self._rank = 0  # type: int
            self._world_size = 1  # type: int

        def on_train_batch_start(self, trainer, pl_module, batch, batch_idx):
            """Record the batch start time and lazily probe the process group."""
            self._step_start = time.monotonic()
            # Probe once, after the process group has had a chance to form.
            if not self._dist_checked:
                self._is_distributed, self._rank, self._world_size = _detect_distributed()
                self._dist_checked = True

        def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx):
            """Accumulate the batch duration and periodically flush the sidecar."""
            self.step_count = trainer.global_step

            # Compute step duration
            if self._step_start is not None:
                elapsed_ms = (time.monotonic() - self._step_start) * 1000.0
                self._step_times_ms.append(elapsed_ms)
                if len(self._step_times_ms) > _ROLLING_WINDOW:
                    self._step_times_ms = self._step_times_ms[-_ROLLING_WINDOW:]
                self._step_start = None

            # Resolve batch size once, trying batch length, tensor shape,
            # then the datamodule's declared batch_size.
            # NOTE(review): for a tuple batch like (inputs, targets),
            # len(batch) yields the tuple arity (2), not the sample count —
            # confirm against typical batch structures.
            if self._batch_size is None:
                try:
                    if hasattr(batch, "__len__"):
                        self._batch_size = len(batch)
                    elif hasattr(batch, "shape"):
                        self._batch_size = batch.shape[0]
                    elif hasattr(trainer, "datamodule") and trainer.datamodule is not None:
                        dm = trainer.datamodule
                        if hasattr(dm, "batch_size"):
                            self._batch_size = dm.batch_size
                except Exception:
                    self._batch_size = None

            # Periodic write
            if self.step_count - self._last_write_step >= _WRITE_EVERY:
                self._flush()
                self._last_write_step = self.step_count

        def on_train_end(self, trainer, pl_module):
            """Final flush so the sidecar reflects the complete run."""
            self.step_count = trainer.global_step
            self._flush()

        def _flush(self):
            # type: () -> None
            # Build and persist the sidecar payload; both helpers are fail-safe.
            data = _build_sidecar(
                framework="lightning",
                step_count=self.step_count,
                step_times_ms=self._step_times_ms,
                batch_size=self._batch_size,
                is_distributed=self._is_distributed,
                rank=self._rank,
                world_size=self._world_size,
            )
            _write_callback_data(data)

except ImportError:
    # lightning not installed — stub raises a clear, actionable error.
    class AllocLightningCallback:  # type: ignore[no-redef]
        """Stub — install lightning to use AllocLightningCallback."""

        def __init__(self):
            raise ImportError(
                "AllocLightningCallback requires the `lightning` package. "
                "Install with: pip install lightning"
            )
@@ -0,0 +1,138 @@
1
+ """GPU catalog — offline hardware specs and pricing for CLI.
2
+
3
+ Source of truth: apps/api/src/engine/catalog/gpus.v1.json
4
+ This is a bundled copy for offline CLI use. Update when the API catalog changes.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import json
10
+ from pathlib import Path
11
+ from typing import Dict, List, Optional
12
+
13
# Directory holding the bundled JSON data files (gpus.v1.json, rate card).
_CATALOG_DIR = Path(__file__).parent

# Aliases for common shorthand names.
# Maps user-facing shorthand (e.g. "H100") to the stable catalog ID used as
# the key in gpus.v1.json. Bare family names resolve to a default variant
# (e.g. "A100" → the 80GB SXM part).
_ALIASES = {
    "H100": "nvidia-h100-sxm-80gb",
    "H100-80GB": "nvidia-h100-sxm-80gb",
    "A100": "nvidia-a100-sxm-80gb",
    "A100-80GB": "nvidia-a100-sxm-80gb",
    "A100-40GB": "nvidia-a100-40gb",
    "A10G": "nvidia-a10g-24gb",
    "L40S": "nvidia-l40s-48gb",
    "L4": "nvidia-l4-24gb",
    "T4": "nvidia-t4-16gb",
    "V100": "nvidia-v100-32gb",
    "V100-32GB": "nvidia-v100-32gb",
    "V100-16GB": "nvidia-v100-16gb",
    "RTX-4090": "nvidia-rtx4090-24gb",
    "RTX-3090": "nvidia-rtx3090-24gb",
    "H200": "nvidia-h200-141gb",
    "H100-NVL": "nvidia-h100-nvl-94gb",
}
34
+
35
+
36
def _load_catalog() -> dict:
    """Read and parse the bundled GPU catalog (gpus.v1.json)."""
    catalog_file = _CATALOG_DIR / "gpus.v1.json"
    with open(catalog_file) as handle:
        return json.load(handle)
40
+
41
+
42
def _load_rate_card() -> dict:
    """Read and parse the bundled default rate card JSON."""
    rate_card_file = _CATALOG_DIR / "default_rate_card.json"
    with open(rate_card_file) as handle:
        return json.load(handle)
46
+
47
+
48
def list_gpus() -> List[dict]:
    """Return all GPUs sorted by VRAM descending.

    Each entry has: id, display_name, vendor, vram_gb, architecture,
    bandwidth_gbps, bf16_tflops, tdp_watts, pricing.
    """
    catalog = _load_catalog()
    rates = _load_rate_card().get("rates", {})

    entries = [
        {
            "id": gpu_id,
            "display_name": spec["display_name"],
            "vendor": spec.get("vendor", "nvidia"),
            "vram_gb": spec["vram_gb"],
            "architecture": spec.get("architecture", ""),
            "bandwidth_gbps": spec.get("bandwidth_gbps", 0),
            "bf16_tflops": spec.get("bf16_tflops", 0),
            "fp16_tflops": spec.get("fp16_tflops", 0),
            "fp32_tflops": spec.get("fp32_tflops", 0),
            "tf32_tflops": spec.get("tf32_tflops", 0),
            "tdp_watts": spec.get("tdp_watts", 0),
            "interconnect": spec.get("interconnect"),
            # Pricing keyed by display name; empty dict when no rates known.
            "pricing": rates.get(spec["display_name"], {}),
        }
        for gpu_id, spec in catalog.get("gpus", {}).items()
    ]
    entries.sort(key=lambda entry: entry["vram_gb"], reverse=True)
    return entries
77
+
78
+
79
def get_default_rate(gpu_name: str) -> Optional[float]:
    """Look up the average default $/hr for a GPU by name or alias.

    Matching order, most to least specific:
      1. exact display-name match (case-insensitive)
      2. a display name contained in the probe-reported name — longest wins
      3. the probe name contained in a display name — shortest wins, so a
         query of "L4" resolves to "L4" rather than "L40S"
      4. alias table fallback via the catalog

    Returns the average rate across clouds, or None if not found.
    """
    rate_card = _load_rate_card()
    rates = rate_card.get("rates", {})
    query = gpu_name.lower()

    def _avg(cloud_rates: dict) -> Optional[float]:
        # Average the numeric per-cloud rates; None when no numbers present.
        vals = [v for v in cloud_rates.values() if isinstance(v, (int, float))]
        return sum(vals) / len(vals) if vals else None

    # 1. Exact match beats any substring heuristic.
    for display_name, cloud_rates in rates.items():
        if display_name.lower() == query:
            return _avg(cloud_rates)

    # 2. Display name embedded in a longer probe name
    # (e.g. "NVIDIA H100-80GB SXM5"): prefer the most specific (longest).
    contained = [dn for dn in rates if dn.lower() in query]
    if contained:
        return _avg(rates[max(contained, key=len)])

    # 3. Probe name embedded in a display name: prefer the shortest, which
    # fixes the first-match bug where "L4" matched "L40S".
    containing = [dn for dn in rates if query in dn.lower()]
    if containing:
        return _avg(rates[min(containing, key=len)])

    # 4. Aliases → stable ID → catalog display name → rates.
    for alias, stable_id in _ALIASES.items():
        if alias.lower() in query:
            catalog = _load_catalog()
            spec = catalog.get("gpus", {}).get(stable_id)
            if spec:
                cloud_rates = rates.get(spec.get("display_name", ""), {})
                return _avg(cloud_rates)

    return None
106
+
107
+
108
def get_gpu(gpu_id: str) -> Optional[dict]:
    """Look up a GPU by stable ID or alias.

    Returns the full spec dict (including pricing) or None if not found.
    """
    # Aliases like "H100" map to stable IDs; unknown names pass through.
    stable_id = _ALIASES.get(gpu_id, gpu_id)

    catalog = _load_catalog()
    rate_card = _load_rate_card()

    spec = catalog.get("gpus", {}).get(stable_id)
    if not spec:
        return None

    return {
        "id": stable_id,
        "display_name": spec["display_name"],
        "vendor": spec.get("vendor", "nvidia"),
        "vram_gb": spec["vram_gb"],
        "architecture": spec.get("architecture", ""),
        "bandwidth_gbps": spec.get("bandwidth_gbps", 0),
        "bf16_tflops": spec.get("bf16_tflops", 0),
        "fp16_tflops": spec.get("fp16_tflops", 0),
        "fp32_tflops": spec.get("fp32_tflops", 0),
        "tf32_tflops": spec.get("tf32_tflops", 0),
        "tdp_watts": spec.get("tdp_watts", 0),
        "interconnect": spec.get("interconnect"),
        "pricing": rate_card.get("rates", {}).get(spec["display_name"], {}),
    }
@@ -0,0 +1,18 @@
1
+ {
2
+ "version": "1",
3
+ "rates": {
4
+ "H200": { "aws": 5.50, "gcp": 5.30, "azure": 5.40 },
5
+ "H100-80GB": { "aws": 4.00, "gcp": 3.90, "azure": 3.85 },
6
+ "H100-NVL": { "aws": 4.50, "gcp": 4.40, "azure": 4.30 },
7
+ "A100-80GB": { "aws": 2.50, "gcp": 2.48, "azure": 2.55 },
8
+ "A100-40GB": { "aws": 2.00, "gcp": 1.95, "azure": 2.10 },
9
+ "A10G": { "aws": 0.75, "gcp": 0.70, "azure": 0.80 },
10
+ "L40S": { "aws": 1.50, "gcp": 1.45, "azure": 1.55 },
11
+ "L4": { "aws": 0.50, "gcp": 0.45, "azure": 0.55 },
12
+ "T4": { "aws": 0.35, "gcp": 0.30, "azure": 0.40 },
13
+ "V100-32GB": { "aws": 1.20, "gcp": 1.15, "azure": 1.25 },
14
+ "V100-16GB": { "aws": 0.90, "gcp": 0.85, "azure": 0.95 },
15
+ "RTX-4090": { "lambda": 0.70, "coreweave": 0.74 },
16
+ "RTX-3090": { "lambda": 0.50, "coreweave": 0.54 }
17
+ }
18
+ }