alloc 0.4.0__tar.gz → 0.5.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. {alloc-0.4.0 → alloc-0.5.0}/PKG-INFO +26 -3
  2. {alloc-0.4.0 → alloc-0.5.0}/README.md +24 -1
  3. {alloc-0.4.0 → alloc-0.5.0}/pyproject.toml +2 -2
  4. {alloc-0.4.0 → alloc-0.5.0}/src/alloc/__init__.py +1 -1
  5. {alloc-0.4.0 → alloc-0.5.0}/src/alloc/callbacks.py +69 -0
  6. {alloc-0.4.0 → alloc-0.5.0}/src/alloc/catalog/__init__.py +29 -0
  7. {alloc-0.4.0 → alloc-0.5.0}/src/alloc/cli.py +447 -61
  8. alloc-0.5.0/src/alloc/config.py +124 -0
  9. {alloc-0.4.0 → alloc-0.5.0}/src/alloc/display.py +33 -4
  10. alloc-0.5.0/src/alloc/extractor_runner.py +141 -0
  11. {alloc-0.4.0 → alloc-0.5.0}/src/alloc/ghost.py +9 -2
  12. alloc-0.5.0/src/alloc/model_extractor.py +170 -0
  13. alloc-0.5.0/src/alloc/model_registry.py +138 -0
  14. {alloc-0.4.0 → alloc-0.5.0}/src/alloc/probe.py +49 -2
  15. {alloc-0.4.0 → alloc-0.5.0}/src/alloc/yaml_config.py +51 -0
  16. {alloc-0.4.0 → alloc-0.5.0}/src/alloc.egg-info/PKG-INFO +26 -3
  17. {alloc-0.4.0 → alloc-0.5.0}/src/alloc.egg-info/SOURCES.txt +4 -0
  18. alloc-0.5.0/tests/test_auth.py +155 -0
  19. {alloc-0.4.0 → alloc-0.5.0}/tests/test_callbacks.py +98 -0
  20. alloc-0.5.0/tests/test_init_from_org.py +98 -0
  21. {alloc-0.4.0 → alloc-0.5.0}/tests/test_yaml_config.py +2 -0
  22. alloc-0.4.0/src/alloc/config.py +0 -65
  23. alloc-0.4.0/src/alloc/model_extractor.py +0 -332
  24. {alloc-0.4.0 → alloc-0.5.0}/setup.cfg +0 -0
  25. {alloc-0.4.0 → alloc-0.5.0}/src/alloc/artifact_writer.py +0 -0
  26. {alloc-0.4.0 → alloc-0.5.0}/src/alloc/catalog/default_rate_card.json +0 -0
  27. {alloc-0.4.0 → alloc-0.5.0}/src/alloc/catalog/gpus.v1.json +0 -0
  28. {alloc-0.4.0 → alloc-0.5.0}/src/alloc/context.py +0 -0
  29. {alloc-0.4.0 → alloc-0.5.0}/src/alloc/stability.py +0 -0
  30. {alloc-0.4.0 → alloc-0.5.0}/src/alloc/upload.py +0 -0
  31. {alloc-0.4.0 → alloc-0.5.0}/src/alloc.egg-info/dependency_links.txt +0 -0
  32. {alloc-0.4.0 → alloc-0.5.0}/src/alloc.egg-info/entry_points.txt +0 -0
  33. {alloc-0.4.0 → alloc-0.5.0}/src/alloc.egg-info/requires.txt +0 -0
  34. {alloc-0.4.0 → alloc-0.5.0}/src/alloc.egg-info/top_level.txt +0 -0
  35. {alloc-0.4.0 → alloc-0.5.0}/tests/test_artifact.py +0 -0
  36. {alloc-0.4.0 → alloc-0.5.0}/tests/test_catalog.py +0 -0
  37. {alloc-0.4.0 → alloc-0.5.0}/tests/test_cli.py +0 -0
  38. {alloc-0.4.0 → alloc-0.5.0}/tests/test_context.py +0 -0
  39. {alloc-0.4.0 → alloc-0.5.0}/tests/test_ghost.py +0 -0
  40. {alloc-0.4.0 → alloc-0.5.0}/tests/test_model_extractor.py +0 -0
  41. {alloc-0.4.0 → alloc-0.5.0}/tests/test_probe_hw.py +0 -0
  42. {alloc-0.4.0 → alloc-0.5.0}/tests/test_probe_multi.py +0 -0
  43. {alloc-0.4.0 → alloc-0.5.0}/tests/test_stability.py +0 -0
  44. {alloc-0.4.0 → alloc-0.5.0}/tests/test_upload.py +0 -0
  45. {alloc-0.4.0 → alloc-0.5.0}/tests/test_verdict.py +0 -0
@@ -1,9 +1,9 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: alloc
3
- Version: 0.4.0
3
+ Version: 0.5.0
4
4
  Summary: Engineer-first training calibration: estimate VRAM fit, profile short runs, and pick GPU configs under real budget constraints.
5
5
  Author-email: Alloc Labs <hello@alloclabs.com>
6
- License: Apache-2.0
6
+ License-Expression: Apache-2.0
7
7
  Project-URL: Homepage, https://alloclabs.com
8
8
  Project-URL: Repository, https://github.com/alloc-labs/alloc
9
9
  Classifier: Development Status :: 3 - Alpha
@@ -120,9 +120,29 @@ Wraps your command, monitors GPU memory/utilization/power via `pynvml`, and writ
120
120
 
121
121
  ```bash
122
122
  alloc login
123
- # Prompts for email + password, stores token in ~/.alloc/config.json
123
+ # Prompts for email + password, stores token + refresh_token in ~/.alloc/config.json
124
+
125
+ alloc login --token <ACCESS_TOKEN>
126
+ # Paste an access token from the dashboard (no password prompt)
127
+ ```
128
+
129
+ ### `alloc whoami`: Show current auth + org context
130
+
131
+ ```bash
132
+ alloc whoami
133
+ alloc whoami --json
134
+ ```
135
+
136
+ Prints the current identity (when logged in), plus objective, effective budget cap, and fleet counts.
137
+
138
+ ### `alloc logout`: Clear local session
139
+
140
+ ```bash
141
+ alloc logout
124
142
  ```
125
143
 
144
+ Clears saved `token`/`refresh_token` from `~/.alloc/config.json`.
145
+
126
146
  ### `alloc upload`: Upload artifact to dashboard
127
147
 
128
148
  ```bash
@@ -131,6 +151,8 @@ alloc upload alloc_artifact.json.gz
131
151
 
132
152
  Uploads a previously saved `.json.gz` artifact to the dashboard via `POST /runs/ingest`. Requires authentication (`alloc login` first).
133
153
 
154
+ If your session token has expired and a `refresh_token` is available (password login flow), `alloc upload` refreshes once and retries automatically.
155
+
134
156
  ### `alloc catalog`: Browse GPU hardware catalog
135
157
 
136
158
  ```bash
@@ -148,6 +170,7 @@ Offline reference for GPU specs, interconnect details, and cloud pricing. Suppor
148
170
  ```bash
149
171
  alloc init # interactive wizard
150
172
  alloc init --yes # non-interactive defaults (full catalog, 50/50 priority)
173
+ alloc init --from-org --yes # pull fleet/budget/objective from your org (requires alloc login)
151
174
  ```
152
175
 
153
176
  Creates a `.alloc.yaml` file in the current directory with your GPU fleet, explore list, budget, and priority weights. When present, `ghost`, `run`, and `scan` automatically use fleet context for recommendations. Use `--no-config` on any command to skip it.
@@ -90,9 +90,29 @@ Wraps your command, monitors GPU memory/utilization/power via `pynvml`, and writ
90
90
 
91
91
  ```bash
92
92
  alloc login
93
- # Prompts for email + password, stores token in ~/.alloc/config.json
93
+ # Prompts for email + password, stores token + refresh_token in ~/.alloc/config.json
94
+
95
+ alloc login --token <ACCESS_TOKEN>
96
+ # Paste an access token from the dashboard (no password prompt)
97
+ ```
98
+
99
+ ### `alloc whoami`: Show current auth + org context
100
+
101
+ ```bash
102
+ alloc whoami
103
+ alloc whoami --json
104
+ ```
105
+
106
+ Prints the current identity (when logged in), plus objective, effective budget cap, and fleet counts.
107
+
108
+ ### `alloc logout`: Clear local session
109
+
110
+ ```bash
111
+ alloc logout
94
112
  ```
95
113
 
114
+ Clears saved `token`/`refresh_token` from `~/.alloc/config.json`.
115
+
96
116
  ### `alloc upload`: Upload artifact to dashboard
97
117
 
98
118
  ```bash
@@ -101,6 +121,8 @@ alloc upload alloc_artifact.json.gz
101
121
 
102
122
  Uploads a previously saved `.json.gz` artifact to the dashboard via `POST /runs/ingest`. Requires authentication (`alloc login` first).
103
123
 
124
+ If your session token has expired and a `refresh_token` is available (password login flow), `alloc upload` refreshes once and retries automatically.
125
+
104
126
  ### `alloc catalog`: Browse GPU hardware catalog
105
127
 
106
128
  ```bash
@@ -118,6 +140,7 @@ Offline reference for GPU specs, interconnect details, and cloud pricing. Suppor
118
140
  ```bash
119
141
  alloc init # interactive wizard
120
142
  alloc init --yes # non-interactive defaults (full catalog, 50/50 priority)
143
+ alloc init --from-org --yes # pull fleet/budget/objective from your org (requires alloc login)
121
144
  ```
122
145
 
123
146
  Creates a `.alloc.yaml` file in the current directory with your GPU fleet, explore list, budget, and priority weights. When present, `ghost`, `run`, and `scan` automatically use fleet context for recommendations. Use `--no-config` on any command to skip it.
@@ -4,10 +4,10 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "alloc"
7
- version = "0.4.0"
7
+ version = "0.5.0"
8
8
  description = "Engineer-first training calibration: estimate VRAM fit, profile short runs, and pick GPU configs under real budget constraints."
9
9
  readme = "README.md"
10
- license = {text = "Apache-2.0"}
10
+ license = "Apache-2.0"
11
11
  requires-python = ">=3.8"
12
12
  authors = [{name = "Alloc Labs", email = "hello@alloclabs.com"}]
13
13
  classifiers = [
@@ -2,7 +2,7 @@
2
2
 
3
3
  from __future__ import annotations
4
4
 
5
- __version__ = "0.4.0"
5
+ __version__ = "0.5.0"
6
6
 
7
7
  from alloc.ghost import ghost, GhostReport
8
8
  from alloc.callbacks import AllocCallback as HuggingFaceCallback
@@ -81,6 +81,42 @@ def _estimate_dataloader_wait(cv):
81
81
  return round((cv - 0.1) / 0.4 * 30.0, 1)
82
82
 
83
83
 
84
+ def _detect_distributed():
85
+ # type: () -> tuple
86
+ """Detect if running inside a torch.distributed process group.
87
+
88
+ Returns (is_distributed, rank, world_size). Fail-safe: returns
89
+ (False, 0, 1) if torch.distributed is unavailable or not initialized.
90
+ """
91
+ try:
92
+ import torch.distributed as dist
93
+ if dist.is_initialized():
94
+ return True, dist.get_rank(), dist.get_world_size()
95
+ except Exception:
96
+ pass
97
+ return False, 0, 1
98
+
99
+
100
+ def _estimate_comm_overhead(step_times_ms, dataloader_wait_pct=0.0):
101
+ # type: (List[float], float) -> Optional[float]
102
+ """Estimate communication overhead % for distributed training.
103
+
104
+ Uses the p90/p50 spread as a proxy for sync barrier delays.
105
+ Subtracts estimated dataloader contribution to avoid double-counting.
106
+ Returns None if insufficient data.
107
+ """
108
+ if len(step_times_ms) < 10:
109
+ return None
110
+ sorted_vals = sorted(step_times_ms)
111
+ p50 = _compute_percentile(sorted_vals, 50)
112
+ p90 = _compute_percentile(sorted_vals, 90)
113
+ if p50 <= 0:
114
+ return None
115
+ raw_pct = ((p90 - p50) / p50) * 100
116
+ comm_pct = max(0.0, raw_pct - dataloader_wait_pct)
117
+ return round(min(40.0, comm_pct), 1)
118
+
119
+
84
120
  def _write_callback_data(data):
85
121
  # type: (Dict[str, Any]) -> None
86
122
  """Write callback data to the alloc sidecar file.
@@ -101,6 +137,9 @@ def _build_sidecar(
101
137
  step_count, # type: int
102
138
  step_times_ms, # type: List[float]
103
139
  batch_size, # type: Optional[int]
140
+ is_distributed=False, # type: bool
141
+ rank=0, # type: int
142
+ world_size=1, # type: int
104
143
  ):
105
144
  # type: (...) -> Dict[str, Any]
106
145
  """Build the sidecar dict from collected timing data."""
@@ -124,6 +163,15 @@ def _build_sidecar(
124
163
  "batch_size": batch_size,
125
164
  "dataloader_wait_pct": dataloader_wait_pct,
126
165
  }
166
+
167
+ if is_distributed:
168
+ data["is_distributed"] = True
169
+ data["rank"] = rank
170
+ data["world_size"] = world_size
171
+ comm = _estimate_comm_overhead(step_times_ms, dataloader_wait_pct)
172
+ if comm is not None:
173
+ data["comm_overhead_pct"] = comm
174
+
127
175
  return data
128
176
 
129
177
 
@@ -142,9 +190,17 @@ try:
142
190
  self._step_start = None # type: Optional[float]
143
191
  self._batch_size = None # type: Optional[int]
144
192
  self._last_write_step = 0 # type: int
193
+ self._dist_checked = False # type: bool
194
+ self._is_distributed = False # type: bool
195
+ self._rank = 0 # type: int
196
+ self._world_size = 1 # type: int
145
197
 
146
198
  def on_step_begin(self, args, state, control, **kwargs):
147
199
  self._step_start = time.monotonic()
200
+ # Detect distributed once after process group is initialized
201
+ if not self._dist_checked:
202
+ self._is_distributed, self._rank, self._world_size = _detect_distributed()
203
+ self._dist_checked = True
148
204
 
149
205
  def on_step_end(self, args, state, control, **kwargs):
150
206
  self.step_count = state.global_step
@@ -183,6 +239,9 @@ try:
183
239
  step_count=self.step_count,
184
240
  step_times_ms=self._step_times_ms,
185
241
  batch_size=self._batch_size,
242
+ is_distributed=self._is_distributed,
243
+ rank=self._rank,
244
+ world_size=self._world_size,
186
245
  )
187
246
  _write_callback_data(data)
188
247
 
@@ -214,9 +273,16 @@ try:
214
273
  self._step_start = None # type: Optional[float]
215
274
  self._batch_size = None # type: Optional[int]
216
275
  self._last_write_step = 0 # type: int
276
+ self._dist_checked = False # type: bool
277
+ self._is_distributed = False # type: bool
278
+ self._rank = 0 # type: int
279
+ self._world_size = 1 # type: int
217
280
 
218
281
  def on_train_batch_start(self, trainer, pl_module, batch, batch_idx):
219
282
  self._step_start = time.monotonic()
283
+ if not self._dist_checked:
284
+ self._is_distributed, self._rank, self._world_size = _detect_distributed()
285
+ self._dist_checked = True
220
286
 
221
287
  def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx):
222
288
  self.step_count = trainer.global_step
@@ -259,6 +325,9 @@ try:
259
325
  step_count=self.step_count,
260
326
  step_times_ms=self._step_times_ms,
261
327
  batch_size=self._batch_size,
328
+ is_distributed=self._is_distributed,
329
+ rank=self._rank,
330
+ world_size=self._world_size,
262
331
  )
263
332
  _write_callback_data(data)
264
333
 
@@ -76,6 +76,35 @@ def list_gpus() -> List[dict]:
76
76
  return sorted(result, key=lambda x: x["vram_gb"], reverse=True)
77
77
 
78
78
 
79
def get_default_rate(gpu_name: str) -> Optional[float]:
    """Look up the average default $/hr for a GPU by name or alias.

    The probe-reported ``gpu_name`` is compared case-insensitively (after
    trimming surrounding whitespace) against the rate card's display names
    using substring containment in either direction. If no display name
    matches, known aliases contained in ``gpu_name`` are resolved to a
    catalog entry and that entry's display name is looked up instead.
    The first match wins.

    Args:
        gpu_name: GPU name as reported by the probe, or an alias.

    Returns:
        The mean of the numeric per-cloud rates for the matched GPU, or
        ``None`` when ``gpu_name`` is empty/blank, nothing matches, or the
        matched entry has no numeric rates.
    """
    needle = gpu_name.strip().lower() if gpu_name else ""
    if not needle:
        # An empty string is a substring of every display name, so without
        # this guard a blank name would match the first rate-card entry.
        return None

    def _mean_rate(cloud_rates):
        """Average the numeric values of a cloud -> rate mapping."""
        vals = [v for v in cloud_rates.values() if isinstance(v, (int, float))]
        return sum(vals) / len(vals) if vals else None

    rates = _load_rate_card().get("rates", {})

    # Direct match by display name
    for display_name, cloud_rates in rates.items():
        candidate = display_name.lower()
        if candidate in needle or needle in candidate:
            return _mean_rate(cloud_rates)

    # Try aliases -> stable ID -> display name
    catalog = None
    for alias, stable_id in _ALIASES.items():
        if alias.lower() not in needle:
            continue
        if catalog is None:
            # Load the catalog at most once, and only if an alias matched.
            catalog = _load_catalog()
        spec = catalog.get("gpus", {}).get(stable_id)
        if spec:
            return _mean_rate(rates.get(spec.get("display_name", ""), {}))

    return None
106
+
107
+
79
108
  def get_gpu(gpu_id: str) -> Optional[dict]:
80
109
  """Look up a GPU by stable ID or alias.
81
110