alloc 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- alloc/__init__.py +11 -0
- alloc/artifact_writer.py +67 -0
- alloc/callbacks.py +342 -0
- alloc/catalog/__init__.py +138 -0
- alloc/catalog/default_rate_card.json +18 -0
- alloc/catalog/gpus.v1.json +174 -0
- alloc/cli.py +1341 -0
- alloc/config.py +124 -0
- alloc/context.py +191 -0
- alloc/display.py +580 -0
- alloc/extractor_runner.py +141 -0
- alloc/ghost.py +167 -0
- alloc/model_extractor.py +170 -0
- alloc/model_registry.py +138 -0
- alloc/probe.py +461 -0
- alloc/stability.py +144 -0
- alloc/upload.py +138 -0
- alloc/yaml_config.py +287 -0
- alloc-0.0.1.dist-info/METADATA +256 -0
- alloc-0.0.1.dist-info/RECORD +23 -0
- alloc-0.0.1.dist-info/WHEEL +5 -0
- alloc-0.0.1.dist-info/entry_points.txt +2 -0
- alloc-0.0.1.dist-info/top_level.txt +1 -0
alloc/__init__.py
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
"""Alloc — GPU intelligence for ML training."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
__version__ = "0.0.1"
|
|
6
|
+
|
|
7
|
+
from alloc.ghost import ghost, GhostReport
|
|
8
|
+
from alloc.callbacks import AllocCallback as HuggingFaceCallback
|
|
9
|
+
from alloc.callbacks import AllocLightningCallback as LightningCallback
|
|
10
|
+
|
|
11
|
+
__all__ = ["ghost", "GhostReport", "HuggingFaceCallback", "LightningCallback", "__version__"]
|
alloc/artifact_writer.py
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
"""Artifact Writer — write alloc_artifact.json.gz.
|
|
2
|
+
|
|
3
|
+
Optionally uploads to W&B if wandb is active.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
import gzip
|
|
9
|
+
import json
|
|
10
|
+
import os
|
|
11
|
+
from datetime import datetime, timezone
|
|
12
|
+
from typing import Optional
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def write_report(
    ghost_report: Optional[dict] = None,
    probe_result: Optional[dict] = None,
    output_path: Optional[str] = None,
    hardware_context: Optional[dict] = None,
    context: Optional[dict] = None,
) -> str:
    """Write an artifact to disk.

    Resolution order for output path:
      1. Explicit output_path parameter
      2. ALLOC_OUT env var
      3. ./alloc_artifact.json.gz

    Returns the path written to, or "" on failure. Never raises.
    """
    try:
        # Resolve the destination: explicit arg wins, then env var, then default.
        destination = output_path
        if not destination:
            destination = os.environ.get("ALLOC_OUT", "") or "alloc_artifact.json.gz"

        payload = {
            "version": "0.0.1",
            "timestamp": datetime.now(timezone.utc).isoformat(),
            "ghost": ghost_report,
            "probe": probe_result,
            "hardware": hardware_context,
            # Empty context dicts are normalized to None in the artifact.
            "context": context if context else None,
        }

        with gzip.open(destination, "wt", encoding="utf-8") as sink:
            json.dump(payload, sink, indent=2)

        _try_wandb_upload(destination)
        return destination
    except Exception:
        # Fail-safe contract: artifact writing must never break the caller.
        return ""


def _try_wandb_upload(path: str) -> None:
    """Upload the artifact to W&B if a run is active. Silent no-op otherwise."""
    # Cheap pre-check: skip the wandb import entirely when no run is flagged.
    if not os.environ.get("WANDB_RUN_ID"):
        return
    try:
        import wandb

        active_run = wandb.run
        if active_run is not None:
            profile = wandb.Artifact("alloc-profile", type="profile")
            profile.add_file(path)
            active_run.log_artifact(profile)
    except Exception:
        # Best-effort upload: any wandb failure is swallowed.
        pass
|
alloc/callbacks.py
ADDED
|
@@ -0,0 +1,342 @@
|
|
|
1
|
+
"""Alloc Framework Callbacks — capture training step timing for artifact enrichment.
|
|
2
|
+
|
|
3
|
+
Callbacks for popular ML frameworks. Write timing stats to a sidecar file
|
|
4
|
+
(.alloc_callback.json) so the probe can compute throughput, step latency,
|
|
5
|
+
and dataloader wait estimates.
|
|
6
|
+
|
|
7
|
+
Usage (HuggingFace):
|
|
8
|
+
from alloc.callbacks import AllocCallback
|
|
9
|
+
trainer = Trainer(..., callbacks=[AllocCallback()])
|
|
10
|
+
|
|
11
|
+
Usage (PyTorch Lightning):
|
|
12
|
+
from alloc.callbacks import AllocLightningCallback
|
|
13
|
+
trainer = Trainer(..., callbacks=[AllocLightningCallback()])
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
import json
|
|
19
|
+
import math
|
|
20
|
+
import os
|
|
21
|
+
import time
|
|
22
|
+
from typing import Any, Dict, List, Optional
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
# ── Helpers ──────────────────────────────────────────────────────────────

_ROLLING_WINDOW = 200  # keep only the most recent N step times (bounds memory)
_WRITE_EVERY = 50  # flush the sidecar file every N steps (bounds I/O)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def _compute_percentile(sorted_values, pct):
    # type: (List[float], float) -> float
    """Linearly-interpolated percentile of an already-sorted list (no numpy).

    Returns 0.0 for an empty list.
    """
    if not sorted_values:
        return 0.0
    # Fractional rank into the sorted list, 0-based.
    rank = (pct / 100.0) * (len(sorted_values) - 1)
    lo = math.floor(rank)
    hi = math.ceil(rank)
    if lo == hi:
        # Rank lands exactly on an element — no interpolation needed.
        return sorted_values[int(rank)]
    # Weighted blend of the two neighbouring elements.
    return sorted_values[lo] * (hi - rank) + sorted_values[hi] * (rank - lo)
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def _compute_timing_stats(step_times_ms):
    # type: (List[float]) -> Dict[str, float]
    """Summarize step times (ms): p50, p90, mean, std, cv.

    Uses population variance (divide by n). Returns {} for empty input.
    """
    if not step_times_ms:
        return {}
    ordered = sorted(step_times_ms)
    count = len(ordered)
    avg = sum(ordered) / count
    spread = math.sqrt(sum((v - avg) ** 2 for v in ordered) / count)
    # Coefficient of variation — guarded against a zero mean.
    ratio = spread / avg if avg > 0 else 0.0
    return {
        "p50": round(_compute_percentile(ordered, 50), 2),
        "p90": round(_compute_percentile(ordered, 90), 2),
        "mean": round(avg, 2),
        "std": round(spread, 2),
        "cv": round(ratio, 4),
    }
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def _estimate_dataloader_wait(cv):
    # type: (float) -> float
    """Estimate dataloader wait % from the step-time coefficient of variation.

    Maps CV 0.1–0.5 linearly onto 0–30%, clamped at both ends.
    Low CV = consistent steps = no data stalls; high CV = irregular
    steps = likely waiting on data.

    Heuristic only — direct measurement would require monkey-patching
    the dataloader, which violates zero-config.
    """
    low, high = 0.1, 0.5
    ceiling = 30.0
    if cv <= low:
        return 0.0
    if cv >= high:
        return ceiling
    # Linear interpolation between the two CV thresholds.
    span = 0.4
    fraction = (cv - low) / span
    return round(fraction * ceiling, 1)
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def _detect_distributed():
    # type: () -> tuple
    """Probe torch.distributed for an active process group.

    Returns (is_distributed, rank, world_size). Fail-safe: yields
    (False, 0, 1) when torch is unavailable or no group is initialized.
    """
    fallback = (False, 0, 1)
    try:
        import torch.distributed as dist
    except Exception:
        return fallback
    try:
        if not dist.is_initialized():
            return fallback
        return True, dist.get_rank(), dist.get_world_size()
    except Exception:
        return fallback
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def _estimate_comm_overhead(step_times_ms, dataloader_wait_pct=0.0):
    # type: (List[float], float) -> Optional[float]
    """Estimate communication overhead % for distributed training.

    Uses the p90/p50 spread as a proxy for sync-barrier delays, minus the
    estimated dataloader contribution (avoids double counting). Capped at
    40%. Returns None with fewer than 10 samples or a non-positive median.
    """
    if len(step_times_ms) < 10:
        return None
    ordered = sorted(step_times_ms)
    median = _compute_percentile(ordered, 50)
    if median <= 0:
        return None
    tail = _compute_percentile(ordered, 90)
    # Relative spread of the slow tail over the typical step.
    spread_pct = (tail - median) / median * 100
    adjusted = spread_pct - dataloader_wait_pct
    if adjusted < 0.0:
        adjusted = 0.0
    return round(min(40.0, adjusted), 1)
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def _write_callback_data(data):
    # type: (Dict[str, Any]) -> None
    """Persist callback timing data to .alloc_callback.json in the CWD.

    Fail-safe: any write failure is swallowed so training continues
    unaffected.
    """
    sidecar = os.path.join(os.getcwd(), ".alloc_callback.json")
    try:
        with open(sidecar, "w") as handle:
            json.dump(data, handle)
    except Exception:
        pass
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
def _build_sidecar(
    framework,  # type: str
    step_count,  # type: int
    step_times_ms,  # type: List[float]
    batch_size,  # type: Optional[int]
    is_distributed=False,  # type: bool
    rank=0,  # type: int
    world_size=1,  # type: int
):
    # type: (...) -> Dict[str, Any]
    """Assemble the sidecar payload from collected timing data."""
    stats = _compute_timing_stats(step_times_ms)
    wait_pct = _estimate_dataloader_wait(stats.get("cv", 0.0))
    p50 = stats.get("p50")

    # Throughput only when both a positive median step time and a known
    # positive batch size are available.
    throughput = None  # type: Optional[float]
    if p50 and p50 > 0 and batch_size and batch_size > 0:
        throughput = round(batch_size / (p50 / 1000.0), 2)

    payload = {
        "framework": framework,
        "step_count": step_count,
        "step_time_ms_p50": p50,
        "step_time_ms_p90": stats.get("p90"),
        "step_time_ms_mean": stats.get("mean"),
        "step_time_ms_std": stats.get("std"),
        "samples_per_sec": throughput,
        "batch_size": batch_size,
        "dataloader_wait_pct": wait_pct,
    }

    if not is_distributed:
        return payload

    # Distributed-only enrichment.
    payload["is_distributed"] = True
    payload["rank"] = rank
    payload["world_size"] = world_size
    overhead = _estimate_comm_overhead(step_times_ms, wait_pct)
    if overhead is not None:
        payload["comm_overhead_pct"] = overhead
    return payload
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
# ── HuggingFace Callback ─────────────────────────────────────────────────
|
|
179
|
+
|
|
180
|
+
try:
    from transformers import TrainerCallback

    class AllocCallback(TrainerCallback):
        """HuggingFace Trainer callback that captures step timing for Alloc.

        Records wall-clock duration of each optimizer step (bounded by
        _ROLLING_WINDOW) and periodically flushes summary stats to the
        sidecar file via _write_callback_data.

        NOTE(review): under DDP every rank writes the same sidecar path in
        its CWD — last writer wins; confirm this is intended.
        """

        def __init__(self):
            # type: () -> None
            self.step_count = 0  # type: int
            self._step_times_ms = []  # type: List[float]
            self._step_start = None  # type: Optional[float]
            self._batch_size = None  # type: Optional[int]
            self._last_write_step = 0  # type: int
            self._dist_checked = False  # type: bool
            self._is_distributed = False  # type: bool
            self._rank = 0  # type: int
            self._world_size = 1  # type: int

        def on_step_begin(self, args, state, control, **kwargs):
            """Start the step timer; lazily detect torch.distributed state."""
            self._step_start = time.monotonic()
            # Detect distributed once after process group is initialized
            if not self._dist_checked:
                self._is_distributed, self._rank, self._world_size = _detect_distributed()
                self._dist_checked = True

        def on_step_end(self, args, state, control, **kwargs):
            """Record step duration, resolve batch size once, flush periodically."""
            self.step_count = state.global_step

            # Compute step duration
            if self._step_start is not None:
                elapsed_ms = (time.monotonic() - self._step_start) * 1000.0
                self._step_times_ms.append(elapsed_ms)
                # Rolling window
                if len(self._step_times_ms) > _ROLLING_WINDOW:
                    self._step_times_ms = self._step_times_ms[-_ROLLING_WINDOW:]
                self._step_start = None

            # Resolve batch size once — effective batch is per-device batch
            # times gradient accumulation steps.
            if self._batch_size is None:
                try:
                    bs = args.per_device_train_batch_size
                    ga = getattr(args, "gradient_accumulation_steps", 1) or 1
                    self._batch_size = bs * ga
                except Exception:
                    self._batch_size = None

            # Periodic write
            if self.step_count - self._last_write_step >= _WRITE_EVERY:
                self._flush()
                self._last_write_step = self.step_count

        def on_train_end(self, args, state, control, **kwargs):
            """Final flush with the last observed global step."""
            self.step_count = state.global_step
            self._flush()

        def _flush(self):
            # type: () -> None
            # Build the summary payload and write the sidecar (best-effort).
            data = _build_sidecar(
                framework="huggingface",
                step_count=self.step_count,
                step_times_ms=self._step_times_ms,
                batch_size=self._batch_size,
                is_distributed=self._is_distributed,
                rank=self._rank,
                world_size=self._world_size,
            )
            _write_callback_data(data)

except ImportError:
    # transformers not installed — provide a stub that raises a clear error
    class AllocCallback:  # type: ignore[no-redef]
        """Stub — install transformers to use AllocCallback."""

        def __init__(self):
            raise ImportError(
                "AllocCallback requires the `transformers` package. "
                "Install with: pip install transformers"
            )
|
|
258
|
+
|
|
259
|
+
|
|
260
|
+
# ── PyTorch Lightning Callback ───────────────────────────────────────────
|
|
261
|
+
|
|
262
|
+
try:
    from lightning.pytorch.callbacks import Callback as LightningBaseCallback

    class AllocLightningCallback(LightningBaseCallback):
        """PyTorch Lightning callback that captures step timing for Alloc.

        Mirrors AllocCallback: times each training batch (bounded by
        _ROLLING_WINDOW) and periodically flushes summary stats via
        _write_callback_data.
        """

        def __init__(self):
            # type: () -> None
            super().__init__()
            self.step_count = 0  # type: int
            self._step_times_ms = []  # type: List[float]
            self._step_start = None  # type: Optional[float]
            self._batch_size = None  # type: Optional[int]
            self._last_write_step = 0  # type: int
            self._dist_checked = False  # type: bool
            self._is_distributed = False  # type: bool
            self._rank = 0  # type: int
            self._world_size = 1  # type: int

        def on_train_batch_start(self, trainer, pl_module, batch, batch_idx):
            """Start the step timer; lazily detect torch.distributed state."""
            self._step_start = time.monotonic()
            if not self._dist_checked:
                self._is_distributed, self._rank, self._world_size = _detect_distributed()
                self._dist_checked = True

        def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx):
            """Record batch duration, infer batch size once, flush periodically."""
            self.step_count = trainer.global_step

            # Compute step duration
            if self._step_start is not None:
                elapsed_ms = (time.monotonic() - self._step_start) * 1000.0
                self._step_times_ms.append(elapsed_ms)
                if len(self._step_times_ms) > _ROLLING_WINDOW:
                    self._step_times_ms = self._step_times_ms[-_ROLLING_WINDOW:]
                self._step_start = None

            # Resolve batch size once. NOTE(review): len(batch) counts the
            # top-level fields for tuple/dict batches, not samples — the
            # tensor .shape[0] fallback handles raw tensors; confirm for
            # structured batches.
            if self._batch_size is None:
                try:
                    if hasattr(batch, "__len__"):
                        self._batch_size = len(batch)
                    elif hasattr(batch, "shape"):
                        self._batch_size = batch.shape[0]
                    elif hasattr(trainer, "datamodule") and trainer.datamodule is not None:
                        dm = trainer.datamodule
                        if hasattr(dm, "batch_size"):
                            self._batch_size = dm.batch_size
                except Exception:
                    self._batch_size = None

            # Periodic write
            if self.step_count - self._last_write_step >= _WRITE_EVERY:
                self._flush()
                self._last_write_step = self.step_count

        def on_train_end(self, trainer, pl_module):
            """Final flush with the last observed global step."""
            self.step_count = trainer.global_step
            self._flush()

        def _flush(self):
            # type: () -> None
            # Build the summary payload and write the sidecar (best-effort).
            data = _build_sidecar(
                framework="lightning",
                step_count=self.step_count,
                step_times_ms=self._step_times_ms,
                batch_size=self._batch_size,
                is_distributed=self._is_distributed,
                rank=self._rank,
                world_size=self._world_size,
            )
            _write_callback_data(data)

except ImportError:
    class AllocLightningCallback:  # type: ignore[no-redef]
        """Stub — install lightning to use AllocLightningCallback."""

        def __init__(self):
            raise ImportError(
                "AllocLightningCallback requires the `lightning` package. "
                "Install with: pip install lightning"
            )
|
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
"""GPU catalog — offline hardware specs and pricing for CLI.
|
|
2
|
+
|
|
3
|
+
Source of truth: apps/api/src/engine/catalog/gpus.v1.json
|
|
4
|
+
This is a bundled copy for offline CLI use. Update when the API catalog changes.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import json
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
from typing import Dict, List, Optional
|
|
12
|
+
|
|
13
|
+
_CATALOG_DIR = Path(__file__).parent
|
|
14
|
+
|
|
15
|
+
# Aliases for common shorthand names. Keys are shorthand GPU names;
# values are stable catalog IDs (keys of "gpus" in gpus.v1.json).
# NOTE(review): matching elsewhere in this module is substring-based, so
# generic aliases ("H100", "A100") can shadow more specific ones.
_ALIASES = {
    "H100": "nvidia-h100-sxm-80gb",
    "H100-80GB": "nvidia-h100-sxm-80gb",
    "A100": "nvidia-a100-sxm-80gb",
    "A100-80GB": "nvidia-a100-sxm-80gb",
    "A100-40GB": "nvidia-a100-40gb",
    "A10G": "nvidia-a10g-24gb",
    "L40S": "nvidia-l40s-48gb",
    "L4": "nvidia-l4-24gb",
    "T4": "nvidia-t4-16gb",
    "V100": "nvidia-v100-32gb",
    "V100-32GB": "nvidia-v100-32gb",
    "V100-16GB": "nvidia-v100-16gb",
    "RTX-4090": "nvidia-rtx4090-24gb",
    "RTX-3090": "nvidia-rtx3090-24gb",
    "H200": "nvidia-h200-141gb",
    "H100-NVL": "nvidia-h100-nvl-94gb",
}
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def _load_catalog() -> dict:
    """Parse and return the bundled GPU spec catalog (gpus.v1.json)."""
    raw = (_CATALOG_DIR / "gpus.v1.json").read_text()
    return json.loads(raw)
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def _load_rate_card() -> dict:
    """Parse and return the bundled default rate card JSON."""
    raw = (_CATALOG_DIR / "default_rate_card.json").read_text()
    return json.loads(raw)
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def list_gpus() -> List[dict]:
    """Return all GPUs sorted by VRAM descending.

    Each entry has: id, display_name, vendor, vram_gb, architecture,
    bandwidth_gbps, bf16/fp16/fp32/tf32 tflops, tdp_watts, interconnect,
    and per-cloud pricing from the default rate card.
    """
    specs = _load_catalog().get("gpus", {})
    rates = _load_rate_card().get("rates", {})

    entries = [
        {
            "id": gpu_id,
            "display_name": spec["display_name"],
            "vendor": spec.get("vendor", "nvidia"),
            "vram_gb": spec["vram_gb"],
            "architecture": spec.get("architecture", ""),
            "bandwidth_gbps": spec.get("bandwidth_gbps", 0),
            "bf16_tflops": spec.get("bf16_tflops", 0),
            "fp16_tflops": spec.get("fp16_tflops", 0),
            "fp32_tflops": spec.get("fp32_tflops", 0),
            "tf32_tflops": spec.get("tf32_tflops", 0),
            "tdp_watts": spec.get("tdp_watts", 0),
            "interconnect": spec.get("interconnect"),
            # Pricing keyed by display name; {} when no rate is listed.
            "pricing": rates.get(spec["display_name"], {}),
        }
        for gpu_id, spec in specs.items()
    ]
    entries.sort(key=lambda entry: entry["vram_gb"], reverse=True)
    return entries
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def get_default_rate(gpu_name: str) -> Optional[float]:
    """Look up the average default $/hr for a GPU by name or alias.

    Matches the probe-reported GPU name against rate-card display names
    first, then against catalog aliases. Candidates are tried most-specific
    (longest) first, so "H100-NVL" wins over the bare "H100" alias instead
    of whichever happens to come first in dict order (the previous behavior
    could return the SXM rate for an NVL card). For the alias pass, spaces
    and underscores in the name are normalized to hyphens so "H100 NVL"
    still matches the hyphenated "H100-NVL" alias.

    Returns the average $/hr across clouds, or None if not found.
    """
    rate_card = _load_rate_card()
    rates = rate_card.get("rates", {})
    name_lc = gpu_name.lower()

    def _avg(cloud_rates: dict) -> Optional[float]:
        # Average the numeric per-cloud rates; None when there are none.
        vals = [v for v in cloud_rates.values() if isinstance(v, (int, float))]
        return sum(vals) / len(vals) if vals else None

    # Direct match by display name — longest first so a more specific
    # display name is never shadowed by a shorter one it contains.
    for display_name in sorted(rates, key=len, reverse=True):
        dn_lc = display_name.lower()
        if dn_lc in name_lc or name_lc in dn_lc:
            return _avg(rates[display_name])

    # Aliases → stable ID → display name. Normalize separators so
    # space/underscore variants match hyphenated aliases.
    normalized = name_lc.replace(" ", "-").replace("_", "-")
    for alias in sorted(_ALIASES, key=len, reverse=True):
        if alias.lower() in normalized:
            spec = _load_catalog().get("gpus", {}).get(_ALIASES[alias])
            if spec:
                return _avg(rates.get(spec.get("display_name", ""), {}))

    return None
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def get_gpu(gpu_id: str) -> Optional[dict]:
    """Look up a GPU by stable ID or alias.

    Returns the full spec dict (including pricing) or None if not found.
    """
    # Aliases resolve to stable catalog IDs; unknown names pass through.
    stable_id = _ALIASES.get(gpu_id, gpu_id)

    specs = _load_catalog().get("gpus", {})
    rates = _load_rate_card().get("rates", {})

    spec = specs.get(stable_id)
    if not spec:
        return None

    return {
        "id": stable_id,
        "display_name": spec["display_name"],
        "vendor": spec.get("vendor", "nvidia"),
        "vram_gb": spec["vram_gb"],
        "architecture": spec.get("architecture", ""),
        "bandwidth_gbps": spec.get("bandwidth_gbps", 0),
        "bf16_tflops": spec.get("bf16_tflops", 0),
        "fp16_tflops": spec.get("fp16_tflops", 0),
        "fp32_tflops": spec.get("fp32_tflops", 0),
        "tf32_tflops": spec.get("tf32_tflops", 0),
        "tdp_watts": spec.get("tdp_watts", 0),
        "interconnect": spec.get("interconnect"),
        # Pricing keyed by display name; {} when no rate is listed.
        "pricing": rates.get(spec["display_name"], {}),
    }
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
{
|
|
2
|
+
"version": "1",
|
|
3
|
+
"rates": {
|
|
4
|
+
"H200": { "aws": 5.50, "gcp": 5.30, "azure": 5.40 },
|
|
5
|
+
"H100-80GB": { "aws": 4.00, "gcp": 3.90, "azure": 3.85 },
|
|
6
|
+
"H100-NVL": { "aws": 4.50, "gcp": 4.40, "azure": 4.30 },
|
|
7
|
+
"A100-80GB": { "aws": 2.50, "gcp": 2.48, "azure": 2.55 },
|
|
8
|
+
"A100-40GB": { "aws": 2.00, "gcp": 1.95, "azure": 2.10 },
|
|
9
|
+
"A10G": { "aws": 0.75, "gcp": 0.70, "azure": 0.80 },
|
|
10
|
+
"L40S": { "aws": 1.50, "gcp": 1.45, "azure": 1.55 },
|
|
11
|
+
"L4": { "aws": 0.50, "gcp": 0.45, "azure": 0.55 },
|
|
12
|
+
"T4": { "aws": 0.35, "gcp": 0.30, "azure": 0.40 },
|
|
13
|
+
"V100-32GB": { "aws": 1.20, "gcp": 1.15, "azure": 1.25 },
|
|
14
|
+
"V100-16GB": { "aws": 0.90, "gcp": 0.85, "azure": 0.95 },
|
|
15
|
+
"RTX-4090": { "lambda": 0.70, "coreweave": 0.74 },
|
|
16
|
+
"RTX-3090": { "lambda": 0.50, "coreweave": 0.54 }
|
|
17
|
+
}
|
|
18
|
+
}
|