gpu-dev 0.6.0__tar.gz → 0.6.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/PKG-INFO +23 -3
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/cli-tools/gpu-dev-cli/README.md +22 -2
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/PKG-INFO +23 -3
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/SOURCES.txt +21 -0
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/cli-tools/gpu-dev-cli/gpu_dev_cli/auth.py +1 -1
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/cli-tools/gpu-dev-cli/gpu_dev_cli/cli.py +93 -54
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/cli-tools/gpu-dev-cli/gpu_dev_cli/config.py +57 -10
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/cli-tools/gpu-dev-cli/gpu_dev_cli/reservations.py +40 -11
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/pyproject.toml +1 -1
- gpu_dev-0.6.2/sdk/python/README.md +187 -0
- gpu_dev-0.6.2/sdk/python/examples/quickstart.ipynb +365 -0
- gpu_dev-0.6.2/sdk/python/pyproject.toml +27 -0
- gpu_dev-0.6.2/sdk/python/src/gpu_dev/__init__.py +60 -0
- gpu_dev-0.6.2/sdk/python/src/gpu_dev/_async/__init__.py +2 -0
- gpu_dev-0.6.2/sdk/python/src/gpu_dev/_backend/aws.py +315 -0
- gpu_dev-0.6.2/sdk/python/src/gpu_dev/_backend/protocol.py +53 -0
- gpu_dev-0.6.2/sdk/python/src/gpu_dev/_sync/__init__.py +0 -0
- gpu_dev-0.6.2/sdk/python/src/gpu_dev/_sync/client.py +245 -0
- gpu_dev-0.6.2/sdk/python/src/gpu_dev/_sync/sandbox.py +243 -0
- gpu_dev-0.6.2/sdk/python/src/gpu_dev/_transport/__init__.py +0 -0
- gpu_dev-0.6.2/sdk/python/src/gpu_dev/_transport/ssh.py +121 -0
- gpu_dev-0.6.2/sdk/python/src/gpu_dev/common/__init__.py +0 -0
- gpu_dev-0.6.2/sdk/python/src/gpu_dev/common/config.py +45 -0
- gpu_dev-0.6.2/sdk/python/src/gpu_dev/common/enums.py +44 -0
- gpu_dev-0.6.2/sdk/python/src/gpu_dev/common/errors.py +33 -0
- gpu_dev-0.6.2/sdk/python/src/gpu_dev/common/models.py +73 -0
- gpu_dev-0.6.2/sdk/python/src/gpu_dev/py.typed +0 -0
- gpu_dev-0.6.2/sdk/python/tests/__init__.py +0 -0
- gpu_dev-0.6.2/sdk/python/tests/test_models.py +69 -0
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/terraform-gpu-devservers/kubernetes.tf +66 -70
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/terraform-gpu-devservers/lambda/reservation_processor/index.py +135 -27
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/terraform-gpu-devservers/lambda.tf +1 -1
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/terraform-gpu-devservers/mig-config.tf +2 -1
- gpu_dev-0.6.2/terraform-gpu-devservers/subnet-0fe3a2c45570091ad +0 -0
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/terraform-gpu-devservers/templates/al2023-user-data.sh +5 -1
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/.github/workflows/no-gitlinks.yml +0 -0
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/.github/workflows/publish.yml +0 -0
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/.gitignore +0 -0
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/CLAUDE.md +0 -0
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/README.md +0 -0
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/admin/README.md +0 -0
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/admin/generate_stats.py +0 -0
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/admin/requirements.txt +0 -0
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/cli-tools/gpu-dev-cli/ZERO_CONFIG_SETUP.md +0 -0
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/dependency_links.txt +0 -0
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/entry_points.txt +0 -0
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/requires.txt +0 -0
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/top_level.txt +0 -0
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/cli-tools/gpu-dev-cli/gpu_dev_cli/__init__.py +0 -0
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/cli-tools/gpu-dev-cli/gpu_dev_cli/disks.py +0 -0
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/cli-tools/gpu-dev-cli/gpu_dev_cli/interactive.py +0 -0
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/cli-tools/gpu-dev-cli/gpu_dev_cli/name_generator.py +0 -0
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/cli-tools/gpu-dev-cli/gpu_dev_cli/ssh_proxy.py +0 -0
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/cli-tools/gpu-dev-cli/minimal-iam-policy.json +0 -0
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/cli-tools/scripts/clear_stale_disk_locks.py +0 -0
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/docs/USER_GUIDE.md +0 -0
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/docs/devgpu-features.html +0 -0
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/docs/docker-mark-blue.svg +0 -0
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/docs/icons8-cursor-ai.svg +0 -0
- /gpu_dev-0.6.0/terraform-gpu-devservers/subnet-0fe3a2c45570091ad → /gpu_dev-0.6.2/sdk/python/src/gpu_dev/_backend/__init__.py +0 -0
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/setup.cfg +0 -0
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/terraform-gpu-devservers/.claude/skills/deploy.md +0 -0
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/terraform-gpu-devservers/.terraform.lock.hcl +0 -0
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/terraform-gpu-devservers/README.md +0 -0
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/terraform-gpu-devservers/alb.tf +0 -0
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/terraform-gpu-devservers/ami-baker.tf +0 -0
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/terraform-gpu-devservers/availability.tf +0 -0
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/terraform-gpu-devservers/backend.tf +0 -0
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/terraform-gpu-devservers/check_b200.py +0 -0
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/terraform-gpu-devservers/cluster-autoscaler.tf +0 -0
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/terraform-gpu-devservers/cmd_proxy.py +0 -0
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/terraform-gpu-devservers/docker/.dockerignore +0 -0
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/terraform-gpu-devservers/docker/Dockerfile +0 -0
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/terraform-gpu-devservers/docker/backup-dotfiles +0 -0
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/terraform-gpu-devservers/docker/bash_profile +0 -0
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/terraform-gpu-devservers/docker/bashrc +0 -0
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/terraform-gpu-devservers/docker/bashrc_ext +0 -0
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/terraform-gpu-devservers/docker/build-with-efa.sh +0 -0
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/terraform-gpu-devservers/docker/dotfiles-shutdown-handler +0 -0
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/terraform-gpu-devservers/docker/list-dotfile-versions +0 -0
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/terraform-gpu-devservers/docker/motd_script +0 -0
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/terraform-gpu-devservers/docker/nproc_wrapper +0 -0
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/terraform-gpu-devservers/docker/profile +0 -0
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/terraform-gpu-devservers/docker/restore-dotfiles +0 -0
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/terraform-gpu-devservers/docker/restore-dotfiles-version +0 -0
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/terraform-gpu-devservers/docker/setup-dotfiles-persistence +0 -0
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/terraform-gpu-devservers/docker/shell_env +0 -0
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/terraform-gpu-devservers/docker/ssh_config +0 -0
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/terraform-gpu-devservers/docker/zprofile +0 -0
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/terraform-gpu-devservers/docker/zshrc +0 -0
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/terraform-gpu-devservers/docker/zshrc_ext +0 -0
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/terraform-gpu-devservers/docker-build.tf +0 -0
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/terraform-gpu-devservers/docker-example/Dockerfile +0 -0
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/terraform-gpu-devservers/docker-example/hello.txt +0 -0
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/terraform-gpu-devservers/ecr.tf +0 -0
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/terraform-gpu-devservers/efs.tf +0 -0
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/terraform-gpu-devservers/eks.tf +0 -0
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/terraform-gpu-devservers/expiry.tf +0 -0
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/terraform-gpu-devservers/git-cache.tf +0 -0
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/terraform-gpu-devservers/gpu-dev-pod-irsa.tf +0 -0
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/terraform-gpu-devservers/lambda/availability_updater/index.py +0 -0
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/terraform-gpu-devservers/lambda/availability_updater/requirements.txt +0 -0
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/terraform-gpu-devservers/lambda/migration/tag_largest_snapshots.py +0 -0
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/terraform-gpu-devservers/lambda/reservation_expiry/index.py +0 -0
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/terraform-gpu-devservers/lambda/reservation_expiry/requirements.txt +0 -0
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/terraform-gpu-devservers/lambda/reservation_processor/buildkit_job.py +0 -0
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/terraform-gpu-devservers/lambda/reservation_processor/requirements.txt +0 -0
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/terraform-gpu-devservers/lambda/shared/__init__.py +0 -0
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/terraform-gpu-devservers/lambda/shared/alb_utils.py +0 -0
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/terraform-gpu-devservers/lambda/shared/dns_utils.py +0 -0
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/terraform-gpu-devservers/lambda/shared/k8s_client.py +0 -0
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/terraform-gpu-devservers/lambda/shared/k8s_resource_tracker.py +0 -0
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/terraform-gpu-devservers/lambda/shared/requirements.txt +0 -0
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/terraform-gpu-devservers/lambda/shared/snapshot_utils.py +0 -0
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/terraform-gpu-devservers/list_b200.py +0 -0
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/terraform-gpu-devservers/main.tf +0 -0
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/terraform-gpu-devservers/mig-parted-config.yaml +0 -0
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py +0 -0
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py.bak +0 -0
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/terraform-gpu-devservers/migrations/check_snapshots.py +0 -0
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/terraform-gpu-devservers/migrations/migrate_disks_to_named.py +0 -0
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/terraform-gpu-devservers/migrations/run_backfill.sh +0 -0
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/terraform-gpu-devservers/monitoring.tf +0 -0
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/terraform-gpu-devservers/node-termination-handler.tf +0 -0
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/terraform-gpu-devservers/outputs.tf +0 -0
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/terraform-gpu-devservers/pyproject.toml +0 -0
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/terraform-gpu-devservers/queue.tf +0 -0
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/terraform-gpu-devservers/route53.tf +0 -0
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/terraform-gpu-devservers/s3-disk-contents.tf +0 -0
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/terraform-gpu-devservers/scripts/CLEANUP_GUIDE.md +0 -0
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/terraform-gpu-devservers/scripts/detect_empty_volumes.sh +0 -0
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/terraform-gpu-devservers/scripts/ec2_avail_probe.sh +0 -0
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/terraform-gpu-devservers/scripts/inspect_user_data.sh +0 -0
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/terraform-gpu-devservers/ssh-proxy/Dockerfile +0 -0
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/terraform-gpu-devservers/ssh-proxy/proxy.py +0 -0
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/terraform-gpu-devservers/ssh-proxy/requirements.txt +0 -0
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/terraform-gpu-devservers/ssh-proxy-service.tf +0 -0
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/terraform-gpu-devservers/ssh-proxy.tf +0 -0
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/terraform-gpu-devservers/switch-to.sh +0 -0
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/terraform-gpu-devservers/templates/al2023-cpu-user-data.sh +0 -0
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/terraform-gpu-devservers/templates/ami-baker-user-data.sh +0 -0
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/terraform-gpu-devservers/templates/user-data-self-managed.sh +0 -0
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/terraform-gpu-devservers/templates/user-data.sh +0 -0
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/terraform-gpu-devservers/variables.tf +0 -0
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/tests/submit/README.md +0 -0
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/tests/submit/fail/run.sh +0 -0
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/tests/submit/multinode/run.sh +0 -0
- {gpu_dev-0.6.0 → gpu_dev-0.6.2}/tests/submit/success/run.sh +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: gpu-dev
|
|
3
|
-
Version: 0.6.0
|
|
3
|
+
Version: 0.6.2
|
|
4
4
|
Summary: CLI tool for PyTorch GPU developer server reservations
|
|
5
5
|
Author: PyTorch Team
|
|
6
6
|
Requires-Python: >=3.10
|
|
@@ -16,9 +16,29 @@ Requires-Dist: websockets>=12.0
|
|
|
16
16
|
Requires-Dist: certifi>=2023.7.22
|
|
17
17
|
Requires-Dist: mcp>=1.0.0
|
|
18
18
|
|
|
19
|
-
# GPU Developer CLI
|
|
19
|
+
# GPU Developer CLI & SDK
|
|
20
20
|
|
|
21
|
-
A command-line tool for reserving and managing GPU development servers
|
|
21
|
+
A command-line tool and Python SDK for reserving and managing GPU development servers.
|
|
22
|
+
|
|
23
|
+
## Python SDK
|
|
24
|
+
|
|
25
|
+
For programmatic access, use the [Python SDK](../../sdk/python/README.md):
|
|
26
|
+
|
|
27
|
+
```python
|
|
28
|
+
from gpu_dev import GpuDev
|
|
29
|
+
|
|
30
|
+
client = GpuDev()
|
|
31
|
+
sandbox = client.reserve(gpu_type="h100", gpu_count=2, hours=4)
|
|
32
|
+
result = sandbox.exec("nvidia-smi")
|
|
33
|
+
print(result.stdout)
|
|
34
|
+
sandbox.cancel()
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
Install: `pip install -e sdk/python/` — see [SDK docs](../../sdk/python/README.md) and [quickstart notebook](../../sdk/python/examples/quickstart.ipynb).
|
|
38
|
+
|
|
39
|
+
---
|
|
40
|
+
|
|
41
|
+
## CLI
|
|
22
42
|
|
|
23
43
|
## Table of Contents
|
|
24
44
|
|
|
@@ -1,6 +1,26 @@
|
|
|
1
|
-
# GPU Developer CLI
|
|
1
|
+
# GPU Developer CLI & SDK
|
|
2
2
|
|
|
3
|
-
A command-line tool for reserving and managing GPU development servers
|
|
3
|
+
A command-line tool and Python SDK for reserving and managing GPU development servers.
|
|
4
|
+
|
|
5
|
+
## Python SDK
|
|
6
|
+
|
|
7
|
+
For programmatic access, use the [Python SDK](../../sdk/python/README.md):
|
|
8
|
+
|
|
9
|
+
```python
|
|
10
|
+
from gpu_dev import GpuDev
|
|
11
|
+
|
|
12
|
+
client = GpuDev()
|
|
13
|
+
sandbox = client.reserve(gpu_type="h100", gpu_count=2, hours=4)
|
|
14
|
+
result = sandbox.exec("nvidia-smi")
|
|
15
|
+
print(result.stdout)
|
|
16
|
+
sandbox.cancel()
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
Install: `pip install -e sdk/python/` — see [SDK docs](../../sdk/python/README.md) and [quickstart notebook](../../sdk/python/examples/quickstart.ipynb).
|
|
20
|
+
|
|
21
|
+
---
|
|
22
|
+
|
|
23
|
+
## CLI
|
|
4
24
|
|
|
5
25
|
## Table of Contents
|
|
6
26
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: gpu-dev
|
|
3
|
-
Version: 0.6.0
|
|
3
|
+
Version: 0.6.2
|
|
4
4
|
Summary: CLI tool for PyTorch GPU developer server reservations
|
|
5
5
|
Author: PyTorch Team
|
|
6
6
|
Requires-Python: >=3.10
|
|
@@ -16,9 +16,29 @@ Requires-Dist: websockets>=12.0
|
|
|
16
16
|
Requires-Dist: certifi>=2023.7.22
|
|
17
17
|
Requires-Dist: mcp>=1.0.0
|
|
18
18
|
|
|
19
|
-
# GPU Developer CLI
|
|
19
|
+
# GPU Developer CLI & SDK
|
|
20
20
|
|
|
21
|
-
A command-line tool for reserving and managing GPU development servers
|
|
21
|
+
A command-line tool and Python SDK for reserving and managing GPU development servers.
|
|
22
|
+
|
|
23
|
+
## Python SDK
|
|
24
|
+
|
|
25
|
+
For programmatic access, use the [Python SDK](../../sdk/python/README.md):
|
|
26
|
+
|
|
27
|
+
```python
|
|
28
|
+
from gpu_dev import GpuDev
|
|
29
|
+
|
|
30
|
+
client = GpuDev()
|
|
31
|
+
sandbox = client.reserve(gpu_type="h100", gpu_count=2, hours=4)
|
|
32
|
+
result = sandbox.exec("nvidia-smi")
|
|
33
|
+
print(result.stdout)
|
|
34
|
+
sandbox.cancel()
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
Install: `pip install -e sdk/python/` — see [SDK docs](../../sdk/python/README.md) and [quickstart notebook](../../sdk/python/examples/quickstart.ipynb).
|
|
38
|
+
|
|
39
|
+
---
|
|
40
|
+
|
|
41
|
+
## CLI
|
|
22
42
|
|
|
23
43
|
## Table of Contents
|
|
24
44
|
|
|
@@ -30,6 +30,27 @@ docs/USER_GUIDE.md
|
|
|
30
30
|
docs/devgpu-features.html
|
|
31
31
|
docs/docker-mark-blue.svg
|
|
32
32
|
docs/icons8-cursor-ai.svg
|
|
33
|
+
sdk/python/README.md
|
|
34
|
+
sdk/python/pyproject.toml
|
|
35
|
+
sdk/python/examples/quickstart.ipynb
|
|
36
|
+
sdk/python/src/gpu_dev/__init__.py
|
|
37
|
+
sdk/python/src/gpu_dev/py.typed
|
|
38
|
+
sdk/python/src/gpu_dev/_async/__init__.py
|
|
39
|
+
sdk/python/src/gpu_dev/_backend/__init__.py
|
|
40
|
+
sdk/python/src/gpu_dev/_backend/aws.py
|
|
41
|
+
sdk/python/src/gpu_dev/_backend/protocol.py
|
|
42
|
+
sdk/python/src/gpu_dev/_sync/__init__.py
|
|
43
|
+
sdk/python/src/gpu_dev/_sync/client.py
|
|
44
|
+
sdk/python/src/gpu_dev/_sync/sandbox.py
|
|
45
|
+
sdk/python/src/gpu_dev/_transport/__init__.py
|
|
46
|
+
sdk/python/src/gpu_dev/_transport/ssh.py
|
|
47
|
+
sdk/python/src/gpu_dev/common/__init__.py
|
|
48
|
+
sdk/python/src/gpu_dev/common/config.py
|
|
49
|
+
sdk/python/src/gpu_dev/common/enums.py
|
|
50
|
+
sdk/python/src/gpu_dev/common/errors.py
|
|
51
|
+
sdk/python/src/gpu_dev/common/models.py
|
|
52
|
+
sdk/python/tests/__init__.py
|
|
53
|
+
sdk/python/tests/test_models.py
|
|
33
54
|
terraform-gpu-devservers/.terraform.lock.hcl
|
|
34
55
|
terraform-gpu-devservers/README.md
|
|
35
56
|
terraform-gpu-devservers/alb.tf
|
|
@@ -13,7 +13,7 @@ from rich.spinner import Spinner
|
|
|
13
13
|
# SSH validation result is cached locally for 24h. New keys pushed to GitHub still take effect
|
|
14
14
|
# at reservation time (pods fetch live keys via init container) — caching only skips the
|
|
15
15
|
# pre-flight "are you who you say you are" check.
|
|
16
|
-
_SSH_CACHE_TTL_SECONDS = 24 * 60 * 60
|
|
16
|
+
_SSH_CACHE_TTL_SECONDS = 14 * 24 * 60 * 60
|
|
17
17
|
_SSH_CACHE_PATH = Path(os.path.expanduser("~/.config/gpu-dev/ssh-validation-cache.json"))
|
|
18
18
|
|
|
19
19
|
# Cache for authenticate_user. STS GetCallerIdentity is stable per AWS profile and slow under SSO
|
|
@@ -41,33 +41,51 @@ from .interactive import (
|
|
|
41
41
|
console = Console()
|
|
42
42
|
|
|
43
43
|
|
|
44
|
+
_east1_table = None
|
|
45
|
+
|
|
44
46
|
def _fetch_reservations_cross_region(reservation_mgr, user_filter, statuses, config=None):
|
|
45
47
|
"""Fetch reservations from current region + prod-east1 if on prod."""
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
48
|
+
global _east1_table
|
|
49
|
+
|
|
50
|
+
from concurrent.futures import ThreadPoolExecutor
|
|
51
|
+
|
|
52
|
+
def _fetch_primary():
|
|
53
|
+
return reservation_mgr.list_reservations(
|
|
54
|
+
user_filter=user_filter, statuses_to_include=statuses)
|
|
55
|
+
|
|
56
|
+
def _fetch_east1():
|
|
57
|
+
global _east1_table
|
|
50
58
|
cfg = config or load_config()
|
|
51
|
-
if cfg.user_config.get("environment")
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
59
|
+
if cfg.user_config.get("environment") != "prod":
|
|
60
|
+
return []
|
|
61
|
+
east1_env = Config.ENVIRONMENTS.get("prod-east1", {})
|
|
62
|
+
if not east1_env or not user_filter:
|
|
63
|
+
return []
|
|
64
|
+
if _east1_table is None:
|
|
65
|
+
_east1_table = cfg.session.resource(
|
|
66
|
+
"dynamodb", region_name=east1_env["region"]
|
|
67
|
+
).Table("pytorch-gpu-dev-reservations")
|
|
68
|
+
results = []
|
|
69
|
+
for st in (statuses or ["active"]):
|
|
70
|
+
resp = _east1_table.query(
|
|
71
|
+
IndexName="UserStatusIndex",
|
|
72
|
+
KeyConditionExpression="user_id = :uid AND #s = :status",
|
|
73
|
+
ExpressionAttributeNames={"#s": "status"},
|
|
74
|
+
ExpressionAttributeValues={":uid": user_filter, ":status": st},
|
|
75
|
+
)
|
|
76
|
+
for item in resp.get("Items", []):
|
|
77
|
+
item["_region"] = "us-east-1"
|
|
78
|
+
results.append(item)
|
|
79
|
+
return results
|
|
80
|
+
|
|
81
|
+
try:
|
|
82
|
+
with ThreadPoolExecutor(max_workers=2) as ex:
|
|
83
|
+
f1 = ex.submit(_fetch_primary)
|
|
84
|
+
f2 = ex.submit(_fetch_east1)
|
|
85
|
+
reservations = f1.result()
|
|
86
|
+
reservations.extend(f2.result())
|
|
69
87
|
except Exception:
|
|
70
|
-
|
|
88
|
+
reservations = _fetch_primary()
|
|
71
89
|
return reservations
|
|
72
90
|
|
|
73
91
|
|
|
@@ -608,6 +626,8 @@ def main(ctx: click.Context) -> None:
|
|
|
608
626
|
)
|
|
609
627
|
@click.option("--spot", is_flag=True, default=False,
|
|
610
628
|
help="Acknowledge spot instance (~1/3 cost, may be preempted with 2-min notice). Required for spot-only types.")
|
|
629
|
+
@click.option("--fast-cache", is_flag=True, default=False, hidden=True,
|
|
630
|
+
help="Use NVMe local cache for faster session restore (experimental).")
|
|
611
631
|
@click.pass_context
|
|
612
632
|
def reserve(
|
|
613
633
|
ctx: click.Context,
|
|
@@ -629,6 +649,7 @@ def reserve(
|
|
|
629
649
|
disk: Optional[str],
|
|
630
650
|
node_label: tuple,
|
|
631
651
|
spot: bool = False,
|
|
652
|
+
fast_cache: bool = False,
|
|
632
653
|
) -> None:
|
|
633
654
|
"""Reserve GPU development server(s)
|
|
634
655
|
|
|
@@ -746,7 +767,10 @@ def reserve(
|
|
|
746
767
|
else:
|
|
747
768
|
f_ssh = ex.submit(validate_ssh_key_matches_github_user, config, None)
|
|
748
769
|
ssh_result = None
|
|
749
|
-
|
|
770
|
+
# Only fetch availability if we need the interactive picker
|
|
771
|
+
need_interactive = gpu_type is None
|
|
772
|
+
if need_interactive:
|
|
773
|
+
f_avail = ex.submit(reservation_mgr.get_gpu_availability_by_type)
|
|
750
774
|
|
|
751
775
|
# Surface auth failure first (most actionable).
|
|
752
776
|
try:
|
|
@@ -758,7 +782,7 @@ def reserve(
|
|
|
758
782
|
|
|
759
783
|
if ssh_result is None:
|
|
760
784
|
ssh_result = f_ssh.result()
|
|
761
|
-
availability_info = f_avail.result()
|
|
785
|
+
availability_info = f_avail.result() if need_interactive else None
|
|
762
786
|
|
|
763
787
|
# Surface SSH validation failure with the same UX as before.
|
|
764
788
|
if not ssh_result.get("valid"):
|
|
@@ -1108,11 +1132,13 @@ def reserve(
|
|
|
1108
1132
|
rprint(f"[red]❌ {str(e)}[/red]")
|
|
1109
1133
|
return
|
|
1110
1134
|
|
|
1111
|
-
# Validate SSH key matches configured GitHub username
|
|
1112
|
-
live.update(Spinner("dots", text="🔐 Validating SSH key..."))
|
|
1135
|
+
# Validate SSH key matches configured GitHub username (cached, ~0ms)
|
|
1113
1136
|
if not _validate_ssh_key_or_exit(config, live):
|
|
1114
1137
|
return
|
|
1115
1138
|
|
|
1139
|
+
live.update(Spinner("dots", text="📡 Preparing reservation..."))
|
|
1140
|
+
reservation_mgr = ReservationManager(config)
|
|
1141
|
+
|
|
1116
1142
|
# Track if user explicitly requests no persistent disk
|
|
1117
1143
|
explicit_no_disk = explicit_no_disk_from_param
|
|
1118
1144
|
|
|
@@ -1224,11 +1250,6 @@ def reserve(
|
|
|
1224
1250
|
rprint(f"[yellow]Use a different disk or wait for the reservation to end[/yellow]")
|
|
1225
1251
|
return
|
|
1226
1252
|
|
|
1227
|
-
live.update(
|
|
1228
|
-
Spinner("dots", text="📡 Setting up reservation manager...")
|
|
1229
|
-
)
|
|
1230
|
-
reservation_mgr = ReservationManager(config)
|
|
1231
|
-
|
|
1232
1253
|
# Submit reservation request
|
|
1233
1254
|
live.update(
|
|
1234
1255
|
Spinner("dots", text="📡 Submitting reservation request...")
|
|
@@ -1364,6 +1385,7 @@ def reserve(
|
|
|
1364
1385
|
spot=spot,
|
|
1365
1386
|
node_labels=node_labels if node_labels else None,
|
|
1366
1387
|
trace=trace,
|
|
1388
|
+
fast_cache=fast_cache,
|
|
1367
1389
|
)
|
|
1368
1390
|
reservation_ids = [reservation_id] if reservation_id else None
|
|
1369
1391
|
|
|
@@ -2887,36 +2909,42 @@ def _show_availability() -> None:
|
|
|
2887
2909
|
) as live:
|
|
2888
2910
|
config = load_config()
|
|
2889
2911
|
|
|
2890
|
-
# Authenticate
|
|
2912
|
+
# Authenticate and fetch availability (both regions in parallel)
|
|
2891
2913
|
try:
|
|
2892
2914
|
user_info = authenticate_user(config)
|
|
2893
2915
|
reservation_mgr = ReservationManager(config)
|
|
2894
|
-
|
|
2916
|
+
|
|
2917
|
+
from concurrent.futures import ThreadPoolExecutor
|
|
2918
|
+
_env_name = config.user_config.get("environment", "prod")
|
|
2919
|
+
_east1_spot_types = frozenset(Config.ENVIRONMENTS.get("prod-east1", {}).get("spot_types", []))
|
|
2920
|
+
|
|
2921
|
+
def _fetch_east1_spot():
|
|
2922
|
+
if _env_name != "prod" or not _east1_spot_types:
|
|
2923
|
+
return {}
|
|
2924
|
+
east1_r = Config.ENVIRONMENTS["prod-east1"]["region"]
|
|
2925
|
+
east1_table = config.session.resource("dynamodb", region_name=east1_r).Table("pytorch-gpu-dev-gpu-availability")
|
|
2926
|
+
result = {}
|
|
2927
|
+
for item in east1_table.scan().get("Items", []):
|
|
2928
|
+
gt = item.get("gpu_type", "")
|
|
2929
|
+
if gt in _east1_spot_types:
|
|
2930
|
+
result[gt] = {
|
|
2931
|
+
"available": int(item.get("available_gpus", 0)),
|
|
2932
|
+
"total": int(item.get("total_gpus", 0)),
|
|
2933
|
+
"max_reservable": int(item.get("max_reservable", 0)),
|
|
2934
|
+
"spot_info": item.get("spot_info", {}),
|
|
2935
|
+
}
|
|
2936
|
+
return result
|
|
2937
|
+
|
|
2938
|
+
with ThreadPoolExecutor(max_workers=2) as ex:
|
|
2939
|
+
f_avail = ex.submit(reservation_mgr.get_gpu_availability_by_type)
|
|
2940
|
+
f_spot = ex.submit(_fetch_east1_spot)
|
|
2941
|
+
availability_info = f_avail.result()
|
|
2942
|
+
spot_region_info = f_spot.result()
|
|
2895
2943
|
except RuntimeError as e:
|
|
2896
2944
|
live.stop()
|
|
2897
2945
|
rprint(f"[red]❌ {str(e)}[/red]")
|
|
2898
2946
|
return
|
|
2899
2947
|
|
|
2900
|
-
# Cross-region: fetch spot availability from prod-east1
|
|
2901
|
-
spot_region_info = {}
|
|
2902
|
-
_env_name = config.user_config.get("environment", "prod")
|
|
2903
|
-
_east1_spot_types = frozenset(Config.ENVIRONMENTS.get("prod-east1", {}).get("spot_types", []))
|
|
2904
|
-
if _env_name == "prod" and _east1_spot_types:
|
|
2905
|
-
try:
|
|
2906
|
-
import boto3 as _b3
|
|
2907
|
-
east1_r = Config.ENVIRONMENTS["prod-east1"]["region"]
|
|
2908
|
-
for item in _b3.resource("dynamodb", region_name=east1_r).Table("pytorch-gpu-dev-gpu-availability").scan().get("Items", []):
|
|
2909
|
-
gt = item.get("gpu_type", "")
|
|
2910
|
-
if gt in _east1_spot_types:
|
|
2911
|
-
spot_region_info[gt] = {
|
|
2912
|
-
"available": int(item.get("available_gpus", 0)),
|
|
2913
|
-
"total": int(item.get("total_gpus", 0)),
|
|
2914
|
-
"max_reservable": int(item.get("max_reservable", 0)),
|
|
2915
|
-
"spot_info": item.get("spot_info", {}),
|
|
2916
|
-
}
|
|
2917
|
-
except Exception:
|
|
2918
|
-
pass
|
|
2919
|
-
|
|
2920
2948
|
if availability_info:
|
|
2921
2949
|
# GPU architecture mapping (for display)
|
|
2922
2950
|
gpu_architectures = {
|
|
@@ -3273,8 +3301,19 @@ def connect(ctx: click.Context, reservation_id: Optional[str]) -> None:
|
|
|
3273
3301
|
For VS Code Remote or manual SSH, use 'gpu-dev show' to see full SSH command.
|
|
3274
3302
|
"""
|
|
3275
3303
|
import subprocess
|
|
3304
|
+
from pathlib import Path
|
|
3276
3305
|
|
|
3277
3306
|
try:
|
|
3307
|
+
# Fast path: if reservation ID given, check local SSH config first (no network)
|
|
3308
|
+
if reservation_id:
|
|
3309
|
+
ssh_config_dir = Path.home() / ".gpu-dev"
|
|
3310
|
+
matches = list(ssh_config_dir.glob(f"{reservation_id}*-sshconfig")) if ssh_config_dir.exists() else []
|
|
3311
|
+
if matches:
|
|
3312
|
+
pod_name = f"gpu-dev-{reservation_id[:8]}"
|
|
3313
|
+
rprint(f"[cyan]Connecting to {pod_name}...[/cyan]\n")
|
|
3314
|
+
os.execvp("ssh", ["ssh", pod_name])
|
|
3315
|
+
return
|
|
3316
|
+
|
|
3278
3317
|
with Live(
|
|
3279
3318
|
Spinner("dots", text="📡 Fetching reservation details..."), console=console
|
|
3280
3319
|
) as live:
|
|
@@ -3,6 +3,7 @@
|
|
|
3
3
|
import os
|
|
4
4
|
import json
|
|
5
5
|
import boto3
|
|
6
|
+
import botocore.exceptions
|
|
6
7
|
from pathlib import Path
|
|
7
8
|
from typing import Dict, Any, Optional
|
|
8
9
|
|
|
@@ -72,17 +73,63 @@ class Config:
|
|
|
72
73
|
self._sqs_client = None
|
|
73
74
|
self._dynamodb = None
|
|
74
75
|
|
|
76
|
+
_CRED_CACHE = Path.home() / ".config" / "gpu-dev" / "aws-cred-cache.json"
|
|
77
|
+
|
|
75
78
|
def _create_aws_session(self):
|
|
76
|
-
"""Create AWS session
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
79
|
+
"""Create AWS session, caching resolved credentials to skip SSO resolution (~900ms)."""
|
|
80
|
+
import time as _time
|
|
81
|
+
|
|
82
|
+
# Try cached credentials first (avoids 900ms SSO resolution)
|
|
83
|
+
try:
|
|
84
|
+
if self._CRED_CACHE.exists():
|
|
85
|
+
cached = json.loads(self._CRED_CACHE.read_text())
|
|
86
|
+
if _time.time() < cached.get("expires", 0):
|
|
87
|
+
return boto3.Session(
|
|
88
|
+
aws_access_key_id=cached["access_key"],
|
|
89
|
+
aws_secret_access_key=cached["secret_key"],
|
|
90
|
+
aws_session_token=cached["token"],
|
|
91
|
+
region_name=self.aws_region,
|
|
92
|
+
)
|
|
93
|
+
except Exception:
|
|
94
|
+
pass
|
|
95
|
+
|
|
96
|
+
# Resolve credentials from SSO/profile (slow path, ~900ms)
|
|
97
|
+
try:
|
|
98
|
+
session = boto3.Session(profile_name="gpu-dev")
|
|
99
|
+
creds = session.get_credentials()
|
|
100
|
+
if not creds:
|
|
101
|
+
raise Exception("no credentials")
|
|
102
|
+
except Exception:
|
|
103
|
+
session = boto3.Session()
|
|
104
|
+
creds = session.get_credentials()
|
|
105
|
+
|
|
106
|
+
# Cache resolved credentials (safe — they're short-lived STS tokens)
|
|
107
|
+
try:
|
|
108
|
+
frozen = creds.get_frozen_credentials()
|
|
109
|
+
if frozen.token:
|
|
110
|
+
self._CRED_CACHE.parent.mkdir(parents=True, exist_ok=True)
|
|
111
|
+
self._CRED_CACHE.write_text(json.dumps({
|
|
112
|
+
"access_key": frozen.access_key,
|
|
113
|
+
"secret_key": frozen.secret_key,
|
|
114
|
+
"token": frozen.token,
|
|
115
|
+
"expires": _time.time() + 2700, # cache 45min (SSO tokens last ~1h)
|
|
116
|
+
}))
|
|
117
|
+
self._CRED_CACHE.chmod(0o600)
|
|
118
|
+
except Exception:
|
|
119
|
+
pass
|
|
120
|
+
|
|
121
|
+
return session
|
|
122
|
+
|
|
123
|
+
def refresh_session(self):
|
|
124
|
+
"""Clear cached credentials and re-resolve. Called on ExpiredTokenException."""
|
|
125
|
+
try:
|
|
126
|
+
self._CRED_CACHE.unlink(missing_ok=True)
|
|
127
|
+
except Exception:
|
|
128
|
+
pass
|
|
129
|
+
self.session = self._create_aws_session()
|
|
130
|
+
self._sts_client = None
|
|
131
|
+
self._sqs_client = None
|
|
132
|
+
self._dynamodb = None
|
|
86
133
|
|
|
87
134
|
@property
|
|
88
135
|
def sts_client(self):
|
|
@@ -23,6 +23,8 @@ from .name_generator import sanitize_name
|
|
|
23
23
|
def _spot_stage_number(status: str) -> tuple:
|
|
24
24
|
"""Map a spot provisioning status message to a numbered step (N, total)."""
|
|
25
25
|
s = status.lower()
|
|
26
|
+
if "no spot capacity" in s or "no capacity" in s:
|
|
27
|
+
return 1, 7 # stuck at step 1, but message itself says why
|
|
26
28
|
if "requested" in s or "waiting for aws" in s or "allocate capacity" in s:
|
|
27
29
|
return 1, 7
|
|
28
30
|
if "allocated" in s or "launching" in s or "booting" in s:
|
|
@@ -424,6 +426,18 @@ class ReservationManager:
|
|
|
424
426
|
self.reservations_table = config.dynamodb.Table(
|
|
425
427
|
config.reservations_table)
|
|
426
428
|
|
|
429
|
+
def _retry_on_expired(self, fn):
|
|
430
|
+
"""Call fn, auto-refresh credentials on ExpiredTokenException."""
|
|
431
|
+
try:
|
|
432
|
+
return fn()
|
|
433
|
+
except Exception as e:
|
|
434
|
+
if "ExpiredToken" in str(type(e).__name__) or "expired" in str(e).lower():
|
|
435
|
+
self.config.refresh_session()
|
|
436
|
+
self.reservations_table = self.config.dynamodb.Table(
|
|
437
|
+
self.config.reservations_table)
|
|
438
|
+
return fn()
|
|
439
|
+
raise
|
|
440
|
+
|
|
427
441
|
def create_reservation(
|
|
428
442
|
self,
|
|
429
443
|
user_id: str,
|
|
@@ -442,6 +456,7 @@ class ReservationManager:
|
|
|
442
456
|
node_labels: Optional[Dict[str, str]] = None,
|
|
443
457
|
trace: bool = False,
|
|
444
458
|
spot: bool = False,
|
|
459
|
+
fast_cache: bool = False,
|
|
445
460
|
) -> Optional[str]:
|
|
446
461
|
"""Create a new GPU reservation"""
|
|
447
462
|
try:
|
|
@@ -524,6 +539,9 @@ class ReservationManager:
|
|
|
524
539
|
if spot:
|
|
525
540
|
message["spot"] = True
|
|
526
541
|
|
|
542
|
+
if fast_cache:
|
|
543
|
+
message["fast_cache"] = True
|
|
544
|
+
|
|
527
545
|
# Add trace flag and CLI start timestamp
|
|
528
546
|
if trace:
|
|
529
547
|
message["trace"] = True
|
|
@@ -801,20 +819,21 @@ class ReservationManager:
|
|
|
801
819
|
For multi-node reservations, returns info for all nodes in the group.
|
|
802
820
|
"""
|
|
803
821
|
try:
|
|
804
|
-
#
|
|
822
|
+
# Short ID prefix — query UserIndex with server-side filter
|
|
805
823
|
response = self.reservations_table.query(
|
|
806
824
|
IndexName="UserIndex",
|
|
807
825
|
KeyConditionExpression="user_id = :user_id",
|
|
808
|
-
|
|
826
|
+
FilterExpression="begins_with(reservation_id, :rid)",
|
|
827
|
+
ExpressionAttributeValues={":user_id": user_id, ":rid": reservation_id},
|
|
809
828
|
)
|
|
810
829
|
all_reservations = response.get("Items", [])
|
|
811
830
|
|
|
812
|
-
# Handle pagination for UserIndex query
|
|
813
831
|
while "LastEvaluatedKey" in response:
|
|
814
832
|
response = self.reservations_table.query(
|
|
815
833
|
IndexName="UserIndex",
|
|
816
834
|
KeyConditionExpression="user_id = :user_id",
|
|
817
|
-
|
|
835
|
+
FilterExpression="begins_with(reservation_id, :rid)",
|
|
836
|
+
ExpressionAttributeValues={":user_id": user_id, ":rid": reservation_id},
|
|
818
837
|
ExclusiveStartKey=response["LastEvaluatedKey"]
|
|
819
838
|
)
|
|
820
839
|
all_reservations.extend(response.get("Items", []))
|
|
@@ -1078,9 +1097,16 @@ class ReservationManager:
|
|
|
1078
1097
|
)
|
|
1079
1098
|
all_items.extend(response.get("Items", []))
|
|
1080
1099
|
|
|
1100
|
+
# Fetch queue lengths for all GPU types in parallel
|
|
1101
|
+
from concurrent.futures import ThreadPoolExecutor
|
|
1102
|
+
gpu_types_list = [item["gpu_type"] for item in all_items]
|
|
1103
|
+
with ThreadPoolExecutor(max_workers=10) as ex:
|
|
1104
|
+
queue_futures = {gt: ex.submit(self._get_queue_length_for_gpu_type, gt) for gt in gpu_types_list}
|
|
1105
|
+
queue_lengths = {gt: f.result() for gt, f in queue_futures.items()}
|
|
1106
|
+
|
|
1081
1107
|
for item in all_items:
|
|
1082
1108
|
gpu_type = item["gpu_type"]
|
|
1083
|
-
queue_length = self._get_queue_length_for_gpu_type(gpu_type)
|
|
1109
|
+
queue_length = queue_lengths.get(gpu_type, 0)
|
|
1084
1110
|
estimated_wait = queue_length * 15 if queue_length > 0 else 0
|
|
1085
1111
|
|
|
1086
1112
|
# size_etas is a DDB Map of {size_str: epoch_seconds (Decimal)} — pass through
|
|
@@ -1210,7 +1236,6 @@ class ReservationManager:
|
|
|
1210
1236
|
try:
|
|
1211
1237
|
total_count = 0
|
|
1212
1238
|
|
|
1213
|
-
# Count queued reservations for this GPU type
|
|
1214
1239
|
for status in ["queued", "pending"]:
|
|
1215
1240
|
try:
|
|
1216
1241
|
response = self.reservations_table.query(
|
|
@@ -1221,10 +1246,10 @@ class ReservationManager:
|
|
|
1221
1246
|
":status": status,
|
|
1222
1247
|
":gpu_type": gpu_type,
|
|
1223
1248
|
},
|
|
1249
|
+
Select="COUNT",
|
|
1224
1250
|
)
|
|
1225
|
-
total_count +=
|
|
1251
|
+
total_count += response.get("Count", 0)
|
|
1226
1252
|
|
|
1227
|
-
# Handle pagination for StatusGpuTypeIndex query
|
|
1228
1253
|
while "LastEvaluatedKey" in response:
|
|
1229
1254
|
response = self.reservations_table.query(
|
|
1230
1255
|
IndexName="StatusGpuTypeIndex",
|
|
@@ -1234,9 +1259,10 @@ class ReservationManager:
|
|
|
1234
1259
|
":status": status,
|
|
1235
1260
|
":gpu_type": gpu_type,
|
|
1236
1261
|
},
|
|
1262
|
+
Select="COUNT",
|
|
1237
1263
|
ExclusiveStartKey=response["LastEvaluatedKey"]
|
|
1238
1264
|
)
|
|
1239
|
-
total_count +=
|
|
1265
|
+
total_count += response.get("Count", 0)
|
|
1240
1266
|
except Exception as query_error:
|
|
1241
1267
|
# Fallback to scanning if the composite index doesn't exist yet
|
|
1242
1268
|
console.print(
|
|
@@ -1904,9 +1930,12 @@ class ReservationManager:
|
|
|
1904
1930
|
detailed = first_queued.get("current_detailed_status", "")
|
|
1905
1931
|
# Spot stages come through current_detailed_status — show as
|
|
1906
1932
|
# numbered steps so users see progress and don't give up.
|
|
1907
|
-
if detailed and ("spot" in detailed.lower() or "node" in detailed.lower() or "instance" in detailed.lower()):
|
|
1933
|
+
if detailed and ("spot" in detailed.lower() or "node" in detailed.lower() or "instance" in detailed.lower() or "capacity" in detailed.lower()):
|
|
1908
1934
|
step, total = _spot_stage_number(detailed)
|
|
1909
|
-
|
|
1935
|
+
if "no spot capacity" in detailed.lower() or "no capacity" in detailed.lower():
|
|
1936
|
+
message = f"⚠️ {detailed}"
|
|
1937
|
+
else:
|
|
1938
|
+
message = f"⏳ Step {step}/{total}: {detailed}"
|
|
1910
1939
|
elif is_multinode:
|
|
1911
1940
|
total_gpus = sum(
|
|
1912
1941
|
node["gpu_count"] for node in node_details if node["reservation"])
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "gpu-dev"
|
|
7
|
-
version = "0.6.0"
|
|
7
|
+
version = "0.6.2"
|
|
8
8
|
description = "CLI tool for PyTorch GPU developer server reservations"
|
|
9
9
|
authors = [{name = "PyTorch Team"}]
|
|
10
10
|
readme = "cli-tools/gpu-dev-cli/README.md"
|