PyPI - gpu-dev - Versions diffs - 0.6.2__tar.gz → 0.6.4__tar.gz - Mend

gpu-dev 0.6.2__tar.gz → 0.6.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (152) hide show

{gpu_dev-0.6.2 → gpu_dev-0.6.4}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: gpu-dev
-Version: 0.6.2
+Version: 0.6.4
 Summary: CLI tool for PyTorch GPU developer server reservations
 Author: PyTorch Team
 Requires-Python: >=3.10

{gpu_dev-0.6.2 → gpu_dev-0.6.4}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: gpu-dev
-Version: 0.6.2
+Version: 0.6.4
 Summary: CLI tool for PyTorch GPU developer server reservations
 Author: PyTorch Team
 Requires-Python: >=3.10

{gpu_dev-0.6.2 → gpu_dev-0.6.4}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/SOURCES.txt RENAMED Viewed

@@ -32,7 +32,11 @@ docs/docker-mark-blue.svg
 docs/icons8-cursor-ai.svg
 sdk/python/README.md
 sdk/python/pyproject.toml
+sdk/python/examples/batch_multi_gpu.py
+sdk/python/examples/interactive_debug.py
 sdk/python/examples/quickstart.ipynb
+sdk/python/examples/run_tests.py
+sdk/python/examples/submit_job.py
 sdk/python/src/gpu_dev/__init__.py
 sdk/python/src/gpu_dev/py.typed
 sdk/python/src/gpu_dev/_async/__init__.py

{gpu_dev-0.6.2 → gpu_dev-0.6.4}/cli-tools/gpu-dev-cli/gpu_dev_cli/cli.py RENAMED Viewed

@@ -1192,8 +1192,10 @@ def reserve(
                                 # Build choices
                                 choices = []
-                                # Get available disks (exclude in-use and deleted disks)
-                                available_disks = [d for d in existing_disks if not d['in_use'] and not d.get('is_deleted', False)]
+                                # Show all non-deleted disks, marking in-use ones as disabled
+                                all_disks = [d for d in existing_disks if not d.get('is_deleted', False)]
+                                available_disks = [d for d in all_disks if not d['in_use']]
+                                in_use_disks = [d for d in all_disks if d['in_use']]
                                 if available_disks:
                                     choices.append(questionary.Separator("=== Available Disks ==="))
@@ -1204,6 +1206,17 @@ def reserve(
                                             value=("select", d['name'])
                                         ))
+                                if in_use_disks:
+                                    choices.append(questionary.Separator("=== In Use ==="))
+                                    for d in in_use_disks:
+                                        res_id = d.get('reservation_id', '?')[:8]
+                                        display = f"{d['name']} ({d['size_gb']}GB) — in use by {res_id}"
+                                        choices.append(questionary.Choice(
+                                            title=display,
+                                            value=("in_use", d['name']),
+                                            disabled="currently in use",
+                                        ))
                                 choices.append(questionary.Separator("=== Options ==="))
                                 choices.append(questionary.Choice(
                                     title="Create a new disk",
@@ -3307,12 +3320,21 @@ def connect(ctx: click.Context, reservation_id: Optional[str]) -> None:
         # Fast path: if reservation ID given, check local SSH config first (no network)
         if reservation_id:
             ssh_config_dir = Path.home() / ".gpu-dev"
-            matches = list(ssh_config_dir.glob(f"{reservation_id}*-sshconfig")) if ssh_config_dir.exists() else []
-            if matches:
-                pod_name = f"gpu-dev-{reservation_id[:8]}"
-                rprint(f"[cyan]Connecting to {pod_name}...[/cyan]\n")
-                os.execvp("ssh", ["ssh", pod_name])
-                return
+            config_file = ssh_config_dir / f"{reservation_id[:8]}-sshconfig"
+            if config_file.exists():
+                config_text = config_file.read_text()
+                fqdn_line = [l.strip() for l in config_text.splitlines() if l.strip().startswith("HostName")]
+                if fqdn_line:
+                    fqdn = fqdn_line[0].split(None, 1)[1]
+                    pod_name = f"gpu-dev-{reservation_id[:8]}"
+                    rprint(f"[cyan]Connecting to {pod_name}...[/cyan]\n")
+                    import subprocess, sys
+                    sys.exit(subprocess.call([
+                        "ssh", "-o", "StrictHostKeyChecking=no", "-o", "UserKnownHostsFile=/dev/null",
+                        "-o", "ProxyCommand=gpu-dev-ssh-proxy %h %p",
+                        "-o", "ForwardAgent=yes",
+                        f"dev@{fqdn}",
+                    ]))
         with Live(
             Spinner("dots", text="📡 Fetching reservation details..."), console=console
@@ -3543,7 +3565,9 @@ def connect(ctx: click.Context, reservation_id: Optional[str]) -> None:
     except KeyboardInterrupt:
         rprint("\n[yellow]Connection cancelled by user[/yellow]")
     except Exception as e:
+        import traceback
         rprint(f"[red]❌ Error: {str(e)}[/red]")
+        traceback.print_exc()
 @main.command(name="get-ssh-config")

{gpu_dev-0.6.2 → gpu_dev-0.6.4}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "gpu-dev"
-version = "0.6.2"
+version = "0.6.4"
 description = "CLI tool for PyTorch GPU developer server reservations"
 authors = [{name = "PyTorch Team"}]
 readme = "cli-tools/gpu-dev-cli/README.md"

{gpu_dev-0.6.2 → gpu_dev-0.6.4}/sdk/python/README.md RENAMED Viewed

@@ -44,6 +44,23 @@ with client.reserve(gpu_type="t4") as sb:
 # reservation cancelled automatically
 ```
+## Progress Tracking
+```python
+# Built-in progress logging
+sandbox = client.reserve(gpu_type="h100", on_progress=True)
+# [  1.5s] pending
+# [  3.2s] preparing
+# [  8.1s] 🚀 Container running
+# [ 22.4s] Ready
+# Custom callback
+sandbox = client.reserve(
+    gpu_type="h100",
+    on_progress=lambda msg, t: print(f"⏳ [{t:.0f}s] {msg}")
+)
+```
 ## Available GPU Types
 | Type | GPUs/node | Architecture |
@@ -75,6 +92,7 @@ client = GpuDev(GpuDevConfig(github_user="octocat"))  # Explicit config
 | `list(status=[...])` | List reservations as `Sandbox` objects |
 | `availability()` | GPU availability by type |
 | `disks()` | List persistent disks |
+| `search_logs(reservation_id)` | Get processing logs for any reservation |
 ### `Sandbox` — Reserved Environment
@@ -82,6 +100,8 @@ client = GpuDev(GpuDevConfig(github_user="octocat"))  # Explicit config
 sandbox = client.reserve(gpu_type="h100")
 ```
+**Methods:**
 | Method | Description |
 |--------|-------------|
 | `exec(command, timeout=None)` | Run shell command, returns `ExecResult` |
@@ -91,7 +111,11 @@ sandbox = client.reserve(gpu_type="h100")
 | `extend(hours)` | Extend duration |
 | `refresh()` | Refresh status from server |
 | `add_user(github_username)` | Grant SSH access to another user |
-| `wait_until_ready(timeout_minutes)` | Block until active |
+| `wait_until_ready(timeout, on_progress)` | Block until active |
+| `logs()` | Get reservation processing log |
+| `pod_logs(lines=50)` | Get container stdout via SSH |
+**Properties:**
 | Property | Description |
 |----------|-------------|
@@ -101,8 +125,15 @@ sandbox = client.reserve(gpu_type="h100")
 | `gpu_count` | Number of GPUs |
 | `ssh_command` | SSH command string |
 | `pod_name` | SSH hostname |
+| `fqdn` | Fully-qualified domain name |
 | `is_active` | Whether ready for commands |
 | `expires_at` | Expiration time |
+| `disk_name` | Attached persistent disk |
+| `instance_type` | EC2 instance type |
+| `created_at` | Creation timestamp |
+| `node_ip` | Node public IP |
+| `detailed_status` | Detailed status message |
+| `user_id` | Owner's user ID |
 ### `ExecResult`
@@ -113,6 +144,21 @@ result.stdout     # "hello\n"
 result.stderr     # ""
 ```
+## Logs & Debugging
+```python
+# Reservation processing log (what happened during setup)
+for entry in sandbox.logs():
+    print(f"[{entry['timestamp'][11:23]}] {entry['message']}")
+# Look up logs for any reservation by ID prefix
+for entry in client.search_logs("abc12345"):
+    print(f"[{entry['timestamp'][11:23]}] {entry['message']}")
+# Container stdout (via SSH)
+print(sandbox.pod_logs(lines=20))
+```
 ## Spot Instances
 Use spot instances for lower cost (may be preempted):
@@ -129,6 +175,7 @@ Data persists across reservations when using named disks:
 # First session
 sb = client.reserve(gpu_type="h100", disk_name="my-project")
 sb.exec("pip install torch && echo done")
+sb.cancel()
 # Later session — packages still installed
 sb = client.reserve(gpu_type="h100", disk_name="my-project")
@@ -185,3 +232,9 @@ except GpuDevValidationError as e:
 except GpuDevTimeoutError:
     print("Reservation timed out — GPUs may be busy")
 ```
+Credentials are cached to disk (45-min TTL) and auto-refreshed on expiry — no manual re-auth needed in long-running notebooks.
+## Interactive Notebook
+See [examples/quickstart.ipynb](examples/quickstart.ipynb) for a hands-on walkthrough.

gpu_dev-0.6.4/sdk/python/examples/batch_multi_gpu.py ADDED Viewed

@@ -0,0 +1,66 @@
+"""Run the same job across multiple GPU types and compare results.
+Useful for benchmarking or testing compatibility across hardware.
+Usage:
+    python batch_multi_gpu.py
+"""
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from gpu_dev import GpuDev, GpuDevError
+client = GpuDev()
+BENCHMARK_CMD = """
+python3 -c '
+import torch, time
+gpu = torch.cuda.get_device_name(0)
+x = torch.randn(4096, 4096, device="cuda")
+torch.cuda.synchronize()
+t0 = time.time()
+for _ in range(100):
+    y = x @ x
+torch.cuda.synchronize()
+ms = (time.time() - t0) * 1000
+print(f"{gpu}|{ms:.0f}")
+'
+"""
+GPU_TYPES = ["t4", "l4", "rtxpro6000"]
+def run_benchmark(gpu_type: str) -> dict:
+    try:
+        sb = client.reserve(
+            gpu_type=gpu_type,
+            gpu_count=1,
+            hours=0.25,
+            name=f"bench-{gpu_type}",
+        )
+        result = sb.exec(BENCHMARK_CMD.strip(), timeout=30)
+        sb.cancel()
+        if result.exit_code == 0 and "|" in result.stdout:
+            gpu_name, ms = result.stdout.strip().split("|")
+            return {"gpu_type": gpu_type, "gpu_name": gpu_name, "ms": float(ms), "ok": True}
+        return {"gpu_type": gpu_type, "error": result.stderr or result.stdout, "ok": False}
+    except GpuDevError as e:
+        return {"gpu_type": gpu_type, "error": str(e), "ok": False}
+print(f"Benchmarking matmul 4096x4096 x100 across {len(GPU_TYPES)} GPU types...\n")
+# Run in parallel
+with ThreadPoolExecutor(max_workers=len(GPU_TYPES)) as ex:
+    futures = {ex.submit(run_benchmark, gt): gt for gt in GPU_TYPES}
+    print(f"{'GPU Type':15s} {'GPU Name':30s} {'Time':>8s}")
+    print("-" * 55)
+    for future in as_completed(futures):
+        r = future.result()
+        if r["ok"]:
+            print(f"{r['gpu_type']:15s} {r['gpu_name']:30s} {r['ms']:>7.0f}ms")
+        else:
+            print(f"{r['gpu_type']:15s} FAILED: {r['error'][:40]}")
+print("\nDone")

gpu_dev-0.6.4/sdk/python/examples/interactive_debug.py ADDED Viewed

@@ -0,0 +1,54 @@
+"""Interactive debugging: reserve a GPU, poke around, inspect logs.
+Use this in a Python REPL or Jupyter notebook for ad-hoc debugging.
+    from gpu_dev import GpuDev
+    client = GpuDev()
+    exec(open("examples/interactive_debug.py").read())
+"""
+from gpu_dev import GpuDev
+client = GpuDev()
+# Show what's available
+print("GPU Availability:")
+for gpu, info in sorted(client.availability().items()):
+    if info.total > 0:
+        print(f"  {gpu:15s} {info.available:>3d}/{info.total} free")
+# Show active reservations
+print("\nActive reservations:")
+for sb in client.list():
+    print(f"  {sb.id[:8]}  {sb.gpu_count}x {sb.gpu_type:10s}  {sb.status.value:10s}  disk={sb.disk_name or '-'}")
+# Show disks
+print("\nDisks:")
+for d in client.disks():
+    status = "IN USE" if d.in_use else "free"
+    print(f"  {d.name:20s}  {d.snapshot_count:>3d} snapshots  {status}")
+# Reconnect to most recent active reservation
+active = client.list(status=["active"])
+if active:
+    sb = active[0]
+    print(f"\nReconnected to {sb.id[:8]} ({sb.gpu_count}x {sb.gpu_type})")
+    print(f"  SSH: ssh {sb.pod_name}")
+    print(f"  Disk: {sb.disk_name}")
+    print(f"  Expires: {sb.expires_at}")
+    # Quick health check
+    result = sb.exec("nvidia-smi -L 2>&1 | head -4", timeout=5)
+    if result.exit_code == 0:
+        print(f"  GPU: {result.stdout.strip()}")
+    else:
+        print(f"  GPU check failed (exit {result.exit_code})")
+    # Show setup logs
+    print(f"\n  Setup log:")
+    for entry in sb.logs():
+        print(f"    [{entry['timestamp'][11:19]}] {entry['message'][:70]}")
+else:
+    print("\nNo active reservations")
+# Look up a past reservation's logs
+# client.search_logs("abc12345")

gpu_dev-0.6.4/sdk/python/examples/run_tests.py ADDED Viewed

@@ -0,0 +1,64 @@
+"""Run tests on a GPU server with a persistent disk snapshot.
+Loads a pre-configured environment from a named disk and runs
+a test suite — useful for CI or interactive debugging.
+Usage:
+    python run_tests.py
+    python run_tests.py --branch feature/my-fix
+"""
+import sys
+from gpu_dev import GpuDev, GpuDevTimeoutError
+branch = sys.argv[1] if len(sys.argv) > 1 else "main"
+client = GpuDev()
+print(f"Reserving H100 with 'pytorch-dev' disk (branch: {branch})...")
+try:
+    sb = client.reserve(
+        gpu_type="h100",
+        gpu_count=1,
+        hours=2,
+        disk_name="pytorch-dev",       # pre-compiled PyTorch environment
+        name=f"test-{branch[:20]}",
+        on_progress=True,
+    )
+except GpuDevTimeoutError:
+    print("No GPU capacity available — try again later or use spot")
+    sys.exit(1)
+print(f"\nRunning on {sb.pod_name} ({sb.instance_type})")
+# Pull latest code
+result = sb.exec(f"""
+    cd /home/dev/pytorch && \
+    git fetch origin && \
+    git checkout {branch} && \
+    git pull origin {branch}
+""", timeout=120)
+print(result.stdout[-200:] if result.stdout else "(no output)")
+if result.exit_code != 0:
+    print(f"Git checkout failed: {result.stderr}")
+    sb.cancel()
+    sys.exit(1)
+# Run tests
+print(f"\nRunning tests on {branch}...")
+result = sb.exec(
+    "cd /home/dev/pytorch && python test/run_test.py test_torch 2>&1 | tail -30",
+    timeout=1800,
+)
+print(result.stdout)
+# Show timing from reservation logs
+print("\nReservation timeline:")
+for entry in sb.logs():
+    print(f"  [{entry['timestamp'][11:23]}] {entry['message'][:80]}")
+exit_code = result.exit_code
+sb.cancel()
+print(f"\nTests {'PASSED' if exit_code == 0 else 'FAILED'} (exit {exit_code})")
+sys.exit(exit_code)

gpu_dev-0.6.4/sdk/python/examples/submit_job.py ADDED Viewed

@@ -0,0 +1,38 @@
+"""Submit a training job to a GPU server and wait for results.
+Usage:
+    python submit_job.py
+"""
+from gpu_dev import GpuDev
+client = GpuDev()
+# Reserve a T4 GPU, auto-cancel when done
+with client.reserve(gpu_type="t4", hours=1, name="training-job", on_progress=True) as sb:
+    print(f"\nReserved: {sb.id[:8]} on {sb.instance_type}")
+    print(f"SSH: {sb.ssh_command}\n")
+    # Upload training script
+    sb.upload("./train.py", "/home/dev/train.py")
+    # Run training
+    print("Starting training...")
+    result = sb.exec("cd /home/dev && python train.py 2>&1", timeout=600)
+    print(result.stdout)
+    if result.exit_code != 0:
+        print(f"Training failed (exit {result.exit_code})")
+        print(result.stderr)
+    else:
+        # Download results
+        sb.download("/home/dev/output/", "./results/")
+        print("Results downloaded to ./results/")
+    # Check logs if something went wrong
+    if result.exit_code != 0:
+        print("\nReservation logs:")
+        for entry in sb.logs("error"):
+            print(f"  [{entry['timestamp'][11:23]}] {entry['message']}")
+# Reservation auto-cancelled
+print("Done — reservation cleaned up")

{gpu_dev-0.6.2 → gpu_dev-0.6.4}/sdk/python/src/gpu_dev/_backend/aws.py RENAMED Viewed

@@ -30,16 +30,19 @@ _PREFIX = "pytorch-gpu-dev"
 _CRED_CACHE_PATH = Path.home() / ".config" / "gpu-dev" / "aws-cred-cache.json"
 _CRED_CACHE_TTL = 2700  # 45 min (SSO session tokens typically last 1h)
-# Module-level session cache — reused across AwsBackend instances in the same process
+# Module-level session cache with expiry tracking
 _cached_session: boto3.Session | None = None
+_cached_session_expires: float = 0
 def _get_session() -> boto3.Session:
     """Get a boto3 session with disk-cached credentials (saves ~900ms SSO resolution)."""
-    global _cached_session
-    if _cached_session is not None:
+    global _cached_session, _cached_session_expires
+    if _cached_session is not None and time.time() < _cached_session_expires:
         return _cached_session
+    _cached_session = None
     # Try disk-cached credentials
     try:
         if _CRED_CACHE_PATH.exists():
@@ -50,6 +53,7 @@ def _get_session() -> boto3.Session:
                     aws_secret_access_key=cached["secret_key"],
                     aws_session_token=cached["token"],
                 )
+                _cached_session_expires = cached["expires"]
                 return _cached_session
     except Exception:
         pass
@@ -80,6 +84,7 @@ def _get_session() -> boto3.Session:
         pass
     _cached_session = session
+    _cached_session_expires = time.time() + _CRED_CACHE_TTL
     return session
@@ -100,11 +105,13 @@ class AwsBackend:
         self._reservations = self._ddb.Table(f"{_PREFIX}-reservations")
         self._availability = self._ddb.Table(f"{_PREFIX}-gpu-availability")
         self._disks = self._ddb.Table(f"{_PREFIX}-disks")
+        self._queue_url: str | None = None
     def _refresh_on_expired(self) -> None:
         """Clear cached session and reinitialize clients."""
-        global _cached_session
+        global _cached_session, _cached_session_expires
         _cached_session = None
+        _cached_session_expires = 0
         try:
             _CRED_CACHE_PATH.unlink(missing_ok=True)
         except Exception:
@@ -182,13 +189,17 @@ class AwsBackend:
                 return self._item_to_info(item)
             return None
-        resp = self._reservations.query(
-            IndexName="UserIndex",
-            KeyConditionExpression="user_id = :uid",
-            FilterExpression="begins_with(reservation_id, :rid)",
-            ExpressionAttributeValues={":uid": user_id, ":rid": reservation_id},
-        )
+        query_kwargs = {
+            "IndexName": "UserIndex",
+            "KeyConditionExpression": "user_id = :uid",
+            "FilterExpression": "begins_with(reservation_id, :rid)",
+            "ExpressionAttributeValues": {":uid": user_id, ":rid": reservation_id},
+        }
+        resp = self._reservations.query(**query_kwargs)
         items = resp.get("Items", [])
+        while not items and "LastEvaluatedKey" in resp:
+            resp = self._reservations.query(**query_kwargs, ExclusiveStartKey=resp["LastEvaluatedKey"])
+            items = resp.get("Items", [])
         if len(items) == 1:
             return self._item_to_info(items[0])
         return None

{gpu_dev-0.6.2 → gpu_dev-0.6.4}/sdk/python/src/gpu_dev/_sync/client.py RENAMED Viewed

@@ -243,3 +243,63 @@ class GpuDev:
         """
         user_info = self._auth()
         return self._backend.list_disks(user_info["user_id"])
+    def search_logs(
+        self,
+        reservation_id: str,
+    ) -> list[dict[str, str]]:
+        """Get status history for any reservation by ID.
+        Args:
+            reservation_id: Full or prefix (8+ chars) reservation ID.
+        Returns:
+            List of ``{"timestamp": "...", "message": "..."}`` dicts.
+        Example::
+            for entry in client.search_logs("abc12345"):
+                print(f"[{entry['timestamp']}] {entry['message']}")
+        """
+        from .._backend.aws import _get_session, _PREFIX
+        session = _get_session()
+        region = getattr(self._backend, "_region", "us-east-2")
+        ddb = session.resource("dynamodb", region_name=region)
+        table = ddb.Table(f"{_PREFIX}-reservations")
+        # Try direct lookup first, then query UserIndex by prefix
+        try:
+            user_info = self._auth()
+            if len(reservation_id) >= 32:
+                resp = table.get_item(Key={"reservation_id": reservation_id})
+                item = resp.get("Item")
+            else:
+                query_kwargs = {
+                    "IndexName": "UserIndex",
+                    "KeyConditionExpression": "user_id = :uid",
+                    "FilterExpression": "begins_with(reservation_id, :rid)",
+                    "ExpressionAttributeValues": {
+                        ":uid": user_info["user_id"],
+                        ":rid": reservation_id,
+                    },
+                }
+                item = None
+                resp = table.query(**query_kwargs)
+                if resp.get("Items"):
+                    item = resp["Items"][0]
+                else:
+                    while "LastEvaluatedKey" in resp and not item:
+                        resp = table.query(**query_kwargs, ExclusiveStartKey=resp["LastEvaluatedKey"])
+                        if resp.get("Items"):
+                            item = resp["Items"][0]
+            if not item:
+                return []
+            history = item.get("status_history", [])
+            return [
+                {"timestamp": str(e.get("timestamp", "")), "message": str(e.get("message", ""))}
+                for e in history
+            ]
+        except Exception:
+            return []

gpu-dev 0.6.2__tar.gz → 0.6.4__tar.gz

gpu-dev 0.6.2__tar.gz → 0.6.4__tar.gz