PyPI - alloc - Versions diffs - 0.4.0__tar.gz → 0.5.0__tar.gz - Mend

alloc 0.4.0__tar.gz → 0.5.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (45) hide show

{alloc-0.4.0 → alloc-0.5.0}/PKG-INFO RENAMED Viewed

@@ -1,9 +1,9 @@
 Metadata-Version: 2.4
 Name: alloc
-Version: 0.4.0
+Version: 0.5.0
 Summary: Engineer-first training calibration: estimate VRAM fit, profile short runs, and pick GPU configs under real budget constraints.
 Author-email: Alloc Labs <hello@alloclabs.com>
-License: Apache-2.0
+License-Expression: Apache-2.0
 Project-URL: Homepage, https://alloclabs.com
 Project-URL: Repository, https://github.com/alloc-labs/alloc
 Classifier: Development Status :: 3 - Alpha
@@ -120,9 +120,29 @@ Wraps your command, monitors GPU memory/utilization/power via `pynvml`, and writ
 ```bash
 alloc login
-# Prompts for email + password, stores token in ~/.alloc/config.json
+# Prompts for email + password, stores token + refresh_token in ~/.alloc/config.json
+alloc login --token <ACCESS_TOKEN>
+# Paste an access token from the dashboard (no password prompt)
+```
+### `alloc whoami`: Show current auth + org context
+```bash
+alloc whoami
+alloc whoami --json
+```
+Prints the current identity (when logged in), plus objective, effective budget cap, and fleet counts.
+### `alloc logout`: Clear local session
+```bash
+alloc logout
 ```
+Clears saved `token`/`refresh_token` from `~/.alloc/config.json`.
 ### `alloc upload`: Upload artifact to dashboard
 ```bash
@@ -131,6 +151,8 @@ alloc upload alloc_artifact.json.gz
 Uploads a previously saved `.json.gz` artifact to the dashboard via `POST /runs/ingest`. Requires authentication (`alloc login` first).
+If your session token has expired and a `refresh_token` is available (password login flow), `alloc upload` refreshes once and retries automatically.
 ### `alloc catalog`: Browse GPU hardware catalog
 ```bash
@@ -148,6 +170,7 @@ Offline reference for GPU specs, interconnect details, and cloud pricing. Suppor
 ```bash
 alloc init                     # interactive wizard
 alloc init --yes               # non-interactive defaults (full catalog, 50/50 priority)
+alloc init --from-org --yes    # pull fleet/budget/objective from your org (requires alloc login)
 ```
 Creates a `.alloc.yaml` file in the current directory with your GPU fleet, explore list, budget, and priority weights. When present, `ghost`, `run`, and `scan` automatically use fleet context for recommendations. Use `--no-config` on any command to skip it.

{alloc-0.4.0 → alloc-0.5.0}/README.md RENAMED Viewed

@@ -90,9 +90,29 @@ Wraps your command, monitors GPU memory/utilization/power via `pynvml`, and writ
 ```bash
 alloc login
-# Prompts for email + password, stores token in ~/.alloc/config.json
+# Prompts for email + password, stores token + refresh_token in ~/.alloc/config.json
+alloc login --token <ACCESS_TOKEN>
+# Paste an access token from the dashboard (no password prompt)
+```
+### `alloc whoami`: Show current auth + org context
+```bash
+alloc whoami
+alloc whoami --json
+```
+Prints the current identity (when logged in), plus objective, effective budget cap, and fleet counts.
+### `alloc logout`: Clear local session
+```bash
+alloc logout
 ```
+Clears saved `token`/`refresh_token` from `~/.alloc/config.json`.
 ### `alloc upload`: Upload artifact to dashboard
 ```bash
@@ -101,6 +121,8 @@ alloc upload alloc_artifact.json.gz
 Uploads a previously saved `.json.gz` artifact to the dashboard via `POST /runs/ingest`. Requires authentication (`alloc login` first).
+If your session token has expired and a `refresh_token` is available (password login flow), `alloc upload` refreshes once and retries automatically.
 ### `alloc catalog`: Browse GPU hardware catalog
 ```bash
@@ -118,6 +140,7 @@ Offline reference for GPU specs, interconnect details, and cloud pricing. Suppor
 ```bash
 alloc init                     # interactive wizard
 alloc init --yes               # non-interactive defaults (full catalog, 50/50 priority)
+alloc init --from-org --yes    # pull fleet/budget/objective from your org (requires alloc login)
 ```
 Creates a `.alloc.yaml` file in the current directory with your GPU fleet, explore list, budget, and priority weights. When present, `ghost`, `run`, and `scan` automatically use fleet context for recommendations. Use `--no-config` on any command to skip it.

{alloc-0.4.0 → alloc-0.5.0}/pyproject.toml RENAMED Viewed

@@ -4,10 +4,10 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "alloc"
-version = "0.4.0"
+version = "0.5.0"
 description = "Engineer-first training calibration: estimate VRAM fit, profile short runs, and pick GPU configs under real budget constraints."
 readme = "README.md"
-license = {text = "Apache-2.0"}
+license = "Apache-2.0"
 requires-python = ">=3.8"
 authors = [{name = "Alloc Labs", email = "hello@alloclabs.com"}]
 classifiers = [

{alloc-0.4.0 → alloc-0.5.0}/src/alloc/__init__.py RENAMED Viewed

@@ -2,7 +2,7 @@
 from __future__ import annotations
-__version__ = "0.4.0"
+__version__ = "0.5.0"
 from alloc.ghost import ghost, GhostReport
 from alloc.callbacks import AllocCallback as HuggingFaceCallback

{alloc-0.4.0 → alloc-0.5.0}/src/alloc/callbacks.py RENAMED Viewed

@@ -81,6 +81,42 @@ def _estimate_dataloader_wait(cv):
     return round((cv - 0.1) / 0.4 * 30.0, 1)
+def _detect_distributed():
+    # type: () -> tuple
+    """Detect if running inside a torch.distributed process group.
+    Returns (is_distributed, rank, world_size). Fail-safe: returns
+    (False, 0, 1) if torch.distributed is unavailable or not initialized.
+    """
+    try:
+        import torch.distributed as dist
+        if dist.is_initialized():
+            return True, dist.get_rank(), dist.get_world_size()
+    except Exception:
+        pass
+    return False, 0, 1
+def _estimate_comm_overhead(step_times_ms, dataloader_wait_pct=0.0):
+    # type: (List[float], float) -> Optional[float]
+    """Estimate communication overhead % for distributed training.
+    Uses the p90/p50 spread as a proxy for sync barrier delays.
+    Subtracts estimated dataloader contribution to avoid double-counting.
+    Returns None if insufficient data.
+    """
+    if len(step_times_ms) < 10:
+        return None
+    sorted_vals = sorted(step_times_ms)
+    p50 = _compute_percentile(sorted_vals, 50)
+    p90 = _compute_percentile(sorted_vals, 90)
+    if p50 <= 0:
+        return None
+    raw_pct = ((p90 - p50) / p50) * 100
+    comm_pct = max(0.0, raw_pct - dataloader_wait_pct)
+    return round(min(40.0, comm_pct), 1)
 def _write_callback_data(data):
     # type: (Dict[str, Any]) -> None
     """Write callback data to the alloc sidecar file.
@@ -101,6 +137,9 @@ def _build_sidecar(
     step_count,      # type: int
     step_times_ms,   # type: List[float]
     batch_size,      # type: Optional[int]
+    is_distributed=False,  # type: bool
+    rank=0,          # type: int
+    world_size=1,    # type: int
 ):
     # type: (...) -> Dict[str, Any]
     """Build the sidecar dict from collected timing data."""
@@ -124,6 +163,15 @@ def _build_sidecar(
         "batch_size": batch_size,
         "dataloader_wait_pct": dataloader_wait_pct,
     }
+    if is_distributed:
+        data["is_distributed"] = True
+        data["rank"] = rank
+        data["world_size"] = world_size
+        comm = _estimate_comm_overhead(step_times_ms, dataloader_wait_pct)
+        if comm is not None:
+            data["comm_overhead_pct"] = comm
     return data
@@ -142,9 +190,17 @@ try:
             self._step_start = None      # type: Optional[float]
             self._batch_size = None      # type: Optional[int]
             self._last_write_step = 0    # type: int
+            self._dist_checked = False   # type: bool
+            self._is_distributed = False  # type: bool
+            self._rank = 0               # type: int
+            self._world_size = 1         # type: int
         def on_step_begin(self, args, state, control, **kwargs):
             self._step_start = time.monotonic()
+            # Detect distributed once after process group is initialized
+            if not self._dist_checked:
+                self._is_distributed, self._rank, self._world_size = _detect_distributed()
+                self._dist_checked = True
         def on_step_end(self, args, state, control, **kwargs):
             self.step_count = state.global_step
@@ -183,6 +239,9 @@ try:
                 step_count=self.step_count,
                 step_times_ms=self._step_times_ms,
                 batch_size=self._batch_size,
+                is_distributed=self._is_distributed,
+                rank=self._rank,
+                world_size=self._world_size,
             )
             _write_callback_data(data)
@@ -214,9 +273,16 @@ try:
             self._step_start = None      # type: Optional[float]
             self._batch_size = None      # type: Optional[int]
             self._last_write_step = 0    # type: int
+            self._dist_checked = False   # type: bool
+            self._is_distributed = False  # type: bool
+            self._rank = 0               # type: int
+            self._world_size = 1         # type: int
         def on_train_batch_start(self, trainer, pl_module, batch, batch_idx):
             self._step_start = time.monotonic()
+            if not self._dist_checked:
+                self._is_distributed, self._rank, self._world_size = _detect_distributed()
+                self._dist_checked = True
         def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx):
             self.step_count = trainer.global_step
@@ -259,6 +325,9 @@ try:
                 step_count=self.step_count,
                 step_times_ms=self._step_times_ms,
                 batch_size=self._batch_size,
+                is_distributed=self._is_distributed,
+                rank=self._rank,
+                world_size=self._world_size,
             )
             _write_callback_data(data)

{alloc-0.4.0 → alloc-0.5.0}/src/alloc/catalog/__init__.py RENAMED Viewed

@@ -76,6 +76,35 @@ def list_gpus() -> List[dict]:
     return sorted(result, key=lambda x: x["vram_gb"], reverse=True)
+def get_default_rate(gpu_name: str) -> Optional[float]:
+    """Look up the average default $/hr for a GPU by name or alias.
+    Tries to match the probe-reported GPU name against catalog display names.
+    Returns the average across clouds, or None if not found.
+    """
+    rate_card = _load_rate_card()
+    rates = rate_card.get("rates", {})
+    # Direct match by display name
+    for display_name, cloud_rates in rates.items():
+        if display_name.lower() in gpu_name.lower() or gpu_name.lower() in display_name.lower():
+            vals = [v for v in cloud_rates.values() if isinstance(v, (int, float))]
+            return sum(vals) / len(vals) if vals else None
+    # Try aliases → display name
+    for alias, stable_id in _ALIASES.items():
+        if alias.lower() in gpu_name.lower():
+            catalog = _load_catalog()
+            spec = catalog.get("gpus", {}).get(stable_id)
+            if spec:
+                dn = spec.get("display_name", "")
+                cloud_rates = rates.get(dn, {})
+                vals = [v for v in cloud_rates.values() if isinstance(v, (int, float))]
+                return sum(vals) / len(vals) if vals else None
+    return None
 def get_gpu(gpu_id: str) -> Optional[dict]:
     """Look up a GPU by stable ID or alias.

alloc 0.4.0__tar.gz → 0.5.0__tar.gz

alloc 0.4.0__tar.gz → 0.5.0__tar.gz