alloc 0.4.0__tar.gz → 0.5.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {alloc-0.4.0 → alloc-0.5.0}/PKG-INFO +26 -3
- {alloc-0.4.0 → alloc-0.5.0}/README.md +24 -1
- {alloc-0.4.0 → alloc-0.5.0}/pyproject.toml +2 -2
- {alloc-0.4.0 → alloc-0.5.0}/src/alloc/__init__.py +1 -1
- {alloc-0.4.0 → alloc-0.5.0}/src/alloc/callbacks.py +69 -0
- {alloc-0.4.0 → alloc-0.5.0}/src/alloc/catalog/__init__.py +29 -0
- {alloc-0.4.0 → alloc-0.5.0}/src/alloc/cli.py +447 -61
- alloc-0.5.0/src/alloc/config.py +124 -0
- {alloc-0.4.0 → alloc-0.5.0}/src/alloc/display.py +33 -4
- alloc-0.5.0/src/alloc/extractor_runner.py +141 -0
- {alloc-0.4.0 → alloc-0.5.0}/src/alloc/ghost.py +9 -2
- alloc-0.5.0/src/alloc/model_extractor.py +170 -0
- alloc-0.5.0/src/alloc/model_registry.py +138 -0
- {alloc-0.4.0 → alloc-0.5.0}/src/alloc/probe.py +49 -2
- {alloc-0.4.0 → alloc-0.5.0}/src/alloc/yaml_config.py +51 -0
- {alloc-0.4.0 → alloc-0.5.0}/src/alloc.egg-info/PKG-INFO +26 -3
- {alloc-0.4.0 → alloc-0.5.0}/src/alloc.egg-info/SOURCES.txt +4 -0
- alloc-0.5.0/tests/test_auth.py +155 -0
- {alloc-0.4.0 → alloc-0.5.0}/tests/test_callbacks.py +98 -0
- alloc-0.5.0/tests/test_init_from_org.py +98 -0
- {alloc-0.4.0 → alloc-0.5.0}/tests/test_yaml_config.py +2 -0
- alloc-0.4.0/src/alloc/config.py +0 -65
- alloc-0.4.0/src/alloc/model_extractor.py +0 -332
- {alloc-0.4.0 → alloc-0.5.0}/setup.cfg +0 -0
- {alloc-0.4.0 → alloc-0.5.0}/src/alloc/artifact_writer.py +0 -0
- {alloc-0.4.0 → alloc-0.5.0}/src/alloc/catalog/default_rate_card.json +0 -0
- {alloc-0.4.0 → alloc-0.5.0}/src/alloc/catalog/gpus.v1.json +0 -0
- {alloc-0.4.0 → alloc-0.5.0}/src/alloc/context.py +0 -0
- {alloc-0.4.0 → alloc-0.5.0}/src/alloc/stability.py +0 -0
- {alloc-0.4.0 → alloc-0.5.0}/src/alloc/upload.py +0 -0
- {alloc-0.4.0 → alloc-0.5.0}/src/alloc.egg-info/dependency_links.txt +0 -0
- {alloc-0.4.0 → alloc-0.5.0}/src/alloc.egg-info/entry_points.txt +0 -0
- {alloc-0.4.0 → alloc-0.5.0}/src/alloc.egg-info/requires.txt +0 -0
- {alloc-0.4.0 → alloc-0.5.0}/src/alloc.egg-info/top_level.txt +0 -0
- {alloc-0.4.0 → alloc-0.5.0}/tests/test_artifact.py +0 -0
- {alloc-0.4.0 → alloc-0.5.0}/tests/test_catalog.py +0 -0
- {alloc-0.4.0 → alloc-0.5.0}/tests/test_cli.py +0 -0
- {alloc-0.4.0 → alloc-0.5.0}/tests/test_context.py +0 -0
- {alloc-0.4.0 → alloc-0.5.0}/tests/test_ghost.py +0 -0
- {alloc-0.4.0 → alloc-0.5.0}/tests/test_model_extractor.py +0 -0
- {alloc-0.4.0 → alloc-0.5.0}/tests/test_probe_hw.py +0 -0
- {alloc-0.4.0 → alloc-0.5.0}/tests/test_probe_multi.py +0 -0
- {alloc-0.4.0 → alloc-0.5.0}/tests/test_stability.py +0 -0
- {alloc-0.4.0 → alloc-0.5.0}/tests/test_upload.py +0 -0
- {alloc-0.4.0 → alloc-0.5.0}/tests/test_verdict.py +0 -0
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: alloc
|
|
3
|
-
Version: 0.4.0
|
|
3
|
+
Version: 0.5.0
|
|
4
4
|
Summary: Engineer-first training calibration: estimate VRAM fit, profile short runs, and pick GPU configs under real budget constraints.
|
|
5
5
|
Author-email: Alloc Labs <hello@alloclabs.com>
|
|
6
|
-
License: Apache-2.0
|
|
6
|
+
License-Expression: Apache-2.0
|
|
7
7
|
Project-URL: Homepage, https://alloclabs.com
|
|
8
8
|
Project-URL: Repository, https://github.com/alloc-labs/alloc
|
|
9
9
|
Classifier: Development Status :: 3 - Alpha
|
|
@@ -120,9 +120,29 @@ Wraps your command, monitors GPU memory/utilization/power via `pynvml`, and writ
|
|
|
120
120
|
|
|
121
121
|
```bash
|
|
122
122
|
alloc login
|
|
123
|
-
# Prompts for email + password, stores token in ~/.alloc/config.json
|
|
123
|
+
# Prompts for email + password, stores token + refresh_token in ~/.alloc/config.json
|
|
124
|
+
|
|
125
|
+
alloc login --token <ACCESS_TOKEN>
|
|
126
|
+
# Paste an access token from the dashboard (no password prompt)
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
### `alloc whoami`: Show current auth + org context
|
|
130
|
+
|
|
131
|
+
```bash
|
|
132
|
+
alloc whoami
|
|
133
|
+
alloc whoami --json
|
|
134
|
+
```
|
|
135
|
+
|
|
136
|
+
Prints the current identity (when logged in), plus objective, effective budget cap, and fleet counts.
|
|
137
|
+
|
|
138
|
+
### `alloc logout`: Clear local session
|
|
139
|
+
|
|
140
|
+
```bash
|
|
141
|
+
alloc logout
|
|
124
142
|
```
|
|
125
143
|
|
|
144
|
+
Clears saved `token`/`refresh_token` from `~/.alloc/config.json`.
|
|
145
|
+
|
|
126
146
|
### `alloc upload`: Upload artifact to dashboard
|
|
127
147
|
|
|
128
148
|
```bash
|
|
@@ -131,6 +151,8 @@ alloc upload alloc_artifact.json.gz
|
|
|
131
151
|
|
|
132
152
|
Uploads a previously saved `.json.gz` artifact to the dashboard via `POST /runs/ingest`. Requires authentication (`alloc login` first).
|
|
133
153
|
|
|
154
|
+
If your session token has expired and a `refresh_token` is available (password login flow), `alloc upload` refreshes once and retries automatically.
|
|
155
|
+
|
|
134
156
|
### `alloc catalog`: Browse GPU hardware catalog
|
|
135
157
|
|
|
136
158
|
```bash
|
|
@@ -148,6 +170,7 @@ Offline reference for GPU specs, interconnect details, and cloud pricing. Suppor
|
|
|
148
170
|
```bash
|
|
149
171
|
alloc init # interactive wizard
|
|
150
172
|
alloc init --yes # non-interactive defaults (full catalog, 50/50 priority)
|
|
173
|
+
alloc init --from-org --yes # pull fleet/budget/objective from your org (requires alloc login)
|
|
151
174
|
```
|
|
152
175
|
|
|
153
176
|
Creates a `.alloc.yaml` file in the current directory with your GPU fleet, explore list, budget, and priority weights. When present, `ghost`, `run`, and `scan` automatically use fleet context for recommendations. Use `--no-config` on any command to skip it.
|
|
@@ -90,9 +90,29 @@ Wraps your command, monitors GPU memory/utilization/power via `pynvml`, and writ
|
|
|
90
90
|
|
|
91
91
|
```bash
|
|
92
92
|
alloc login
|
|
93
|
-
# Prompts for email + password, stores token in ~/.alloc/config.json
|
|
93
|
+
# Prompts for email + password, stores token + refresh_token in ~/.alloc/config.json
|
|
94
|
+
|
|
95
|
+
alloc login --token <ACCESS_TOKEN>
|
|
96
|
+
# Paste an access token from the dashboard (no password prompt)
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
### `alloc whoami`: Show current auth + org context
|
|
100
|
+
|
|
101
|
+
```bash
|
|
102
|
+
alloc whoami
|
|
103
|
+
alloc whoami --json
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
Prints the current identity (when logged in), plus objective, effective budget cap, and fleet counts.
|
|
107
|
+
|
|
108
|
+
### `alloc logout`: Clear local session
|
|
109
|
+
|
|
110
|
+
```bash
|
|
111
|
+
alloc logout
|
|
94
112
|
```
|
|
95
113
|
|
|
114
|
+
Clears saved `token`/`refresh_token` from `~/.alloc/config.json`.
|
|
115
|
+
|
|
96
116
|
### `alloc upload`: Upload artifact to dashboard
|
|
97
117
|
|
|
98
118
|
```bash
|
|
@@ -101,6 +121,8 @@ alloc upload alloc_artifact.json.gz
|
|
|
101
121
|
|
|
102
122
|
Uploads a previously saved `.json.gz` artifact to the dashboard via `POST /runs/ingest`. Requires authentication (`alloc login` first).
|
|
103
123
|
|
|
124
|
+
If your session token has expired and a `refresh_token` is available (password login flow), `alloc upload` refreshes once and retries automatically.
|
|
125
|
+
|
|
104
126
|
### `alloc catalog`: Browse GPU hardware catalog
|
|
105
127
|
|
|
106
128
|
```bash
|
|
@@ -118,6 +140,7 @@ Offline reference for GPU specs, interconnect details, and cloud pricing. Suppor
|
|
|
118
140
|
```bash
|
|
119
141
|
alloc init # interactive wizard
|
|
120
142
|
alloc init --yes # non-interactive defaults (full catalog, 50/50 priority)
|
|
143
|
+
alloc init --from-org --yes # pull fleet/budget/objective from your org (requires alloc login)
|
|
121
144
|
```
|
|
122
145
|
|
|
123
146
|
Creates a `.alloc.yaml` file in the current directory with your GPU fleet, explore list, budget, and priority weights. When present, `ghost`, `run`, and `scan` automatically use fleet context for recommendations. Use `--no-config` on any command to skip it.
|
|
@@ -4,10 +4,10 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "alloc"
|
|
7
|
-
version = "0.4.0"
|
|
7
|
+
version = "0.5.0"
|
|
8
8
|
description = "Engineer-first training calibration: estimate VRAM fit, profile short runs, and pick GPU configs under real budget constraints."
|
|
9
9
|
readme = "README.md"
|
|
10
|
-
license =
|
|
10
|
+
license = "Apache-2.0"
|
|
11
11
|
requires-python = ">=3.8"
|
|
12
12
|
authors = [{name = "Alloc Labs", email = "hello@alloclabs.com"}]
|
|
13
13
|
classifiers = [
|
|
@@ -81,6 +81,42 @@ def _estimate_dataloader_wait(cv):
|
|
|
81
81
|
return round((cv - 0.1) / 0.4 * 30.0, 1)
|
|
82
82
|
|
|
83
83
|
|
|
84
|
+
def _detect_distributed():
|
|
85
|
+
# type: () -> tuple
|
|
86
|
+
"""Detect if running inside a torch.distributed process group.
|
|
87
|
+
|
|
88
|
+
Returns (is_distributed, rank, world_size). Fail-safe: returns
|
|
89
|
+
(False, 0, 1) if torch.distributed is unavailable or not initialized.
|
|
90
|
+
"""
|
|
91
|
+
try:
|
|
92
|
+
import torch.distributed as dist
|
|
93
|
+
if dist.is_initialized():
|
|
94
|
+
return True, dist.get_rank(), dist.get_world_size()
|
|
95
|
+
except Exception:
|
|
96
|
+
pass
|
|
97
|
+
return False, 0, 1
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def _estimate_comm_overhead(step_times_ms, dataloader_wait_pct=0.0):
|
|
101
|
+
# type: (List[float], float) -> Optional[float]
|
|
102
|
+
"""Estimate communication overhead % for distributed training.
|
|
103
|
+
|
|
104
|
+
Uses the p90/p50 spread as a proxy for sync barrier delays.
|
|
105
|
+
Subtracts estimated dataloader contribution to avoid double-counting.
|
|
106
|
+
Returns None if insufficient data.
|
|
107
|
+
"""
|
|
108
|
+
if len(step_times_ms) < 10:
|
|
109
|
+
return None
|
|
110
|
+
sorted_vals = sorted(step_times_ms)
|
|
111
|
+
p50 = _compute_percentile(sorted_vals, 50)
|
|
112
|
+
p90 = _compute_percentile(sorted_vals, 90)
|
|
113
|
+
if p50 <= 0:
|
|
114
|
+
return None
|
|
115
|
+
raw_pct = ((p90 - p50) / p50) * 100
|
|
116
|
+
comm_pct = max(0.0, raw_pct - dataloader_wait_pct)
|
|
117
|
+
return round(min(40.0, comm_pct), 1)
|
|
118
|
+
|
|
119
|
+
|
|
84
120
|
def _write_callback_data(data):
|
|
85
121
|
# type: (Dict[str, Any]) -> None
|
|
86
122
|
"""Write callback data to the alloc sidecar file.
|
|
@@ -101,6 +137,9 @@ def _build_sidecar(
|
|
|
101
137
|
step_count, # type: int
|
|
102
138
|
step_times_ms, # type: List[float]
|
|
103
139
|
batch_size, # type: Optional[int]
|
|
140
|
+
is_distributed=False, # type: bool
|
|
141
|
+
rank=0, # type: int
|
|
142
|
+
world_size=1, # type: int
|
|
104
143
|
):
|
|
105
144
|
# type: (...) -> Dict[str, Any]
|
|
106
145
|
"""Build the sidecar dict from collected timing data."""
|
|
@@ -124,6 +163,15 @@ def _build_sidecar(
|
|
|
124
163
|
"batch_size": batch_size,
|
|
125
164
|
"dataloader_wait_pct": dataloader_wait_pct,
|
|
126
165
|
}
|
|
166
|
+
|
|
167
|
+
if is_distributed:
|
|
168
|
+
data["is_distributed"] = True
|
|
169
|
+
data["rank"] = rank
|
|
170
|
+
data["world_size"] = world_size
|
|
171
|
+
comm = _estimate_comm_overhead(step_times_ms, dataloader_wait_pct)
|
|
172
|
+
if comm is not None:
|
|
173
|
+
data["comm_overhead_pct"] = comm
|
|
174
|
+
|
|
127
175
|
return data
|
|
128
176
|
|
|
129
177
|
|
|
@@ -142,9 +190,17 @@ try:
|
|
|
142
190
|
self._step_start = None # type: Optional[float]
|
|
143
191
|
self._batch_size = None # type: Optional[int]
|
|
144
192
|
self._last_write_step = 0 # type: int
|
|
193
|
+
self._dist_checked = False # type: bool
|
|
194
|
+
self._is_distributed = False # type: bool
|
|
195
|
+
self._rank = 0 # type: int
|
|
196
|
+
self._world_size = 1 # type: int
|
|
145
197
|
|
|
146
198
|
def on_step_begin(self, args, state, control, **kwargs):
    """Stamp the step start time and, on the first call only, probe for torch.distributed."""
    self._step_start = time.monotonic()
    if self._dist_checked:
        return
    # Probe once, at the first observed step, and cache the result on the instance.
    dist_state = _detect_distributed()
    self._is_distributed, self._rank, self._world_size = dist_state
    self._dist_checked = True
|
|
148
204
|
|
|
149
205
|
def on_step_end(self, args, state, control, **kwargs):
|
|
150
206
|
self.step_count = state.global_step
|
|
@@ -183,6 +239,9 @@ try:
|
|
|
183
239
|
step_count=self.step_count,
|
|
184
240
|
step_times_ms=self._step_times_ms,
|
|
185
241
|
batch_size=self._batch_size,
|
|
242
|
+
is_distributed=self._is_distributed,
|
|
243
|
+
rank=self._rank,
|
|
244
|
+
world_size=self._world_size,
|
|
186
245
|
)
|
|
187
246
|
_write_callback_data(data)
|
|
188
247
|
|
|
@@ -214,9 +273,16 @@ try:
|
|
|
214
273
|
self._step_start = None # type: Optional[float]
|
|
215
274
|
self._batch_size = None # type: Optional[int]
|
|
216
275
|
self._last_write_step = 0 # type: int
|
|
276
|
+
self._dist_checked = False # type: bool
|
|
277
|
+
self._is_distributed = False # type: bool
|
|
278
|
+
self._rank = 0 # type: int
|
|
279
|
+
self._world_size = 1 # type: int
|
|
217
280
|
|
|
218
281
|
def on_train_batch_start(self, trainer, pl_module, batch, batch_idx):
    """Stamp the batch start time and, on the first call only, probe for torch.distributed."""
    self._step_start = time.monotonic()
    if self._dist_checked:
        return
    # Probe once, at the first observed batch, and cache the result on the instance.
    dist_state = _detect_distributed()
    self._is_distributed, self._rank, self._world_size = dist_state
    self._dist_checked = True
|
|
220
286
|
|
|
221
287
|
def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx):
|
|
222
288
|
self.step_count = trainer.global_step
|
|
@@ -259,6 +325,9 @@ try:
|
|
|
259
325
|
step_count=self.step_count,
|
|
260
326
|
step_times_ms=self._step_times_ms,
|
|
261
327
|
batch_size=self._batch_size,
|
|
328
|
+
is_distributed=self._is_distributed,
|
|
329
|
+
rank=self._rank,
|
|
330
|
+
world_size=self._world_size,
|
|
262
331
|
)
|
|
263
332
|
_write_callback_data(data)
|
|
264
333
|
|
|
@@ -76,6 +76,35 @@ def list_gpus() -> List[dict]:
|
|
|
76
76
|
return sorted(result, key=lambda x: x["vram_gb"], reverse=True)
|
|
77
77
|
|
|
78
78
|
|
|
79
|
+
def get_default_rate(gpu_name: str) -> Optional[float]:
|
|
80
|
+
"""Look up the average default $/hr for a GPU by name or alias.
|
|
81
|
+
|
|
82
|
+
Tries to match the probe-reported GPU name against catalog display names.
|
|
83
|
+
Returns the average across clouds, or None if not found.
|
|
84
|
+
"""
|
|
85
|
+
rate_card = _load_rate_card()
|
|
86
|
+
rates = rate_card.get("rates", {})
|
|
87
|
+
|
|
88
|
+
# Direct match by display name
|
|
89
|
+
for display_name, cloud_rates in rates.items():
|
|
90
|
+
if display_name.lower() in gpu_name.lower() or gpu_name.lower() in display_name.lower():
|
|
91
|
+
vals = [v for v in cloud_rates.values() if isinstance(v, (int, float))]
|
|
92
|
+
return sum(vals) / len(vals) if vals else None
|
|
93
|
+
|
|
94
|
+
# Try aliases → display name
|
|
95
|
+
for alias, stable_id in _ALIASES.items():
|
|
96
|
+
if alias.lower() in gpu_name.lower():
|
|
97
|
+
catalog = _load_catalog()
|
|
98
|
+
spec = catalog.get("gpus", {}).get(stable_id)
|
|
99
|
+
if spec:
|
|
100
|
+
dn = spec.get("display_name", "")
|
|
101
|
+
cloud_rates = rates.get(dn, {})
|
|
102
|
+
vals = [v for v in cloud_rates.values() if isinstance(v, (int, float))]
|
|
103
|
+
return sum(vals) / len(vals) if vals else None
|
|
104
|
+
|
|
105
|
+
return None
|
|
106
|
+
|
|
107
|
+
|
|
79
108
|
def get_gpu(gpu_id: str) -> Optional[dict]:
|
|
80
109
|
"""Look up a GPU by stable ID or alias.
|
|
81
110
|
|