gpu-dev 0.5.32__tar.gz → 0.6.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/PKG-INFO +23 -3
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/cli-tools/gpu-dev-cli/README.md +22 -2
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/PKG-INFO +23 -3
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/SOURCES.txt +21 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/cli-tools/gpu-dev-cli/gpu_dev_cli/auth.py +1 -1
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/cli-tools/gpu-dev-cli/gpu_dev_cli/cli.py +134 -68
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/cli-tools/gpu-dev-cli/gpu_dev_cli/config.py +63 -15
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/cli-tools/gpu-dev-cli/gpu_dev_cli/disks.py +15 -2
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/cli-tools/gpu-dev-cli/gpu_dev_cli/reservations.py +45 -14
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/pyproject.toml +1 -1
- gpu_dev-0.6.2/sdk/python/README.md +187 -0
- gpu_dev-0.6.2/sdk/python/examples/quickstart.ipynb +365 -0
- gpu_dev-0.6.2/sdk/python/pyproject.toml +27 -0
- gpu_dev-0.6.2/sdk/python/src/gpu_dev/__init__.py +60 -0
- gpu_dev-0.6.2/sdk/python/src/gpu_dev/_async/__init__.py +2 -0
- gpu_dev-0.6.2/sdk/python/src/gpu_dev/_backend/aws.py +315 -0
- gpu_dev-0.6.2/sdk/python/src/gpu_dev/_backend/protocol.py +53 -0
- gpu_dev-0.6.2/sdk/python/src/gpu_dev/_sync/__init__.py +0 -0
- gpu_dev-0.6.2/sdk/python/src/gpu_dev/_sync/client.py +245 -0
- gpu_dev-0.6.2/sdk/python/src/gpu_dev/_sync/sandbox.py +243 -0
- gpu_dev-0.6.2/sdk/python/src/gpu_dev/_transport/__init__.py +0 -0
- gpu_dev-0.6.2/sdk/python/src/gpu_dev/_transport/ssh.py +121 -0
- gpu_dev-0.6.2/sdk/python/src/gpu_dev/common/__init__.py +0 -0
- gpu_dev-0.6.2/sdk/python/src/gpu_dev/common/config.py +45 -0
- gpu_dev-0.6.2/sdk/python/src/gpu_dev/common/enums.py +44 -0
- gpu_dev-0.6.2/sdk/python/src/gpu_dev/common/errors.py +33 -0
- gpu_dev-0.6.2/sdk/python/src/gpu_dev/common/models.py +73 -0
- gpu_dev-0.6.2/sdk/python/src/gpu_dev/py.typed +0 -0
- gpu_dev-0.6.2/sdk/python/tests/__init__.py +0 -0
- gpu_dev-0.6.2/sdk/python/tests/test_models.py +69 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/kubernetes.tf +66 -70
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/lambda/reservation_processor/index.py +308 -171
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/lambda.tf +29 -5
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/main.tf +6 -4
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/mig-config.tf +2 -1
- gpu_dev-0.6.2/terraform-gpu-devservers/subnet-0fe3a2c45570091ad +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/templates/al2023-user-data.sh +91 -4
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/templates/ami-baker-user-data.sh +3 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/.github/workflows/no-gitlinks.yml +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/.github/workflows/publish.yml +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/.gitignore +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/CLAUDE.md +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/README.md +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/admin/README.md +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/admin/generate_stats.py +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/admin/requirements.txt +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/cli-tools/gpu-dev-cli/ZERO_CONFIG_SETUP.md +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/dependency_links.txt +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/entry_points.txt +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/requires.txt +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/top_level.txt +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/cli-tools/gpu-dev-cli/gpu_dev_cli/__init__.py +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/cli-tools/gpu-dev-cli/gpu_dev_cli/interactive.py +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/cli-tools/gpu-dev-cli/gpu_dev_cli/name_generator.py +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/cli-tools/gpu-dev-cli/gpu_dev_cli/ssh_proxy.py +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/cli-tools/gpu-dev-cli/minimal-iam-policy.json +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/cli-tools/scripts/clear_stale_disk_locks.py +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/docs/USER_GUIDE.md +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/docs/devgpu-features.html +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/docs/docker-mark-blue.svg +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/docs/icons8-cursor-ai.svg +0 -0
- /gpu_dev-0.5.32/terraform-gpu-devservers/subnet-0fe3a2c45570091ad → /gpu_dev-0.6.2/sdk/python/src/gpu_dev/_backend/__init__.py +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/setup.cfg +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/.claude/skills/deploy.md +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/.terraform.lock.hcl +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/README.md +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/alb.tf +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/ami-baker.tf +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/availability.tf +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/backend.tf +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/check_b200.py +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/cluster-autoscaler.tf +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/cmd_proxy.py +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/docker/.dockerignore +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/docker/Dockerfile +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/docker/backup-dotfiles +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/docker/bash_profile +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/docker/bashrc +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/docker/bashrc_ext +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/docker/build-with-efa.sh +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/docker/dotfiles-shutdown-handler +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/docker/list-dotfile-versions +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/docker/motd_script +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/docker/nproc_wrapper +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/docker/profile +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/docker/restore-dotfiles +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/docker/restore-dotfiles-version +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/docker/setup-dotfiles-persistence +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/docker/shell_env +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/docker/ssh_config +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/docker/zprofile +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/docker/zshrc +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/docker/zshrc_ext +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/docker-build.tf +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/docker-example/Dockerfile +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/docker-example/hello.txt +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/ecr.tf +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/efs.tf +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/eks.tf +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/expiry.tf +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/git-cache.tf +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/gpu-dev-pod-irsa.tf +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/lambda/availability_updater/index.py +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/lambda/availability_updater/requirements.txt +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/lambda/migration/tag_largest_snapshots.py +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/lambda/reservation_expiry/index.py +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/lambda/reservation_expiry/requirements.txt +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/lambda/reservation_processor/buildkit_job.py +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/lambda/reservation_processor/requirements.txt +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/lambda/shared/__init__.py +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/lambda/shared/alb_utils.py +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/lambda/shared/dns_utils.py +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/lambda/shared/k8s_client.py +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/lambda/shared/k8s_resource_tracker.py +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/lambda/shared/requirements.txt +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/lambda/shared/snapshot_utils.py +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/list_b200.py +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/mig-parted-config.yaml +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py.bak +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/migrations/check_snapshots.py +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/migrations/migrate_disks_to_named.py +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/migrations/run_backfill.sh +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/monitoring.tf +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/node-termination-handler.tf +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/outputs.tf +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/pyproject.toml +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/queue.tf +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/route53.tf +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/s3-disk-contents.tf +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/scripts/CLEANUP_GUIDE.md +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/scripts/detect_empty_volumes.sh +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/scripts/ec2_avail_probe.sh +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/scripts/inspect_user_data.sh +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/ssh-proxy/Dockerfile +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/ssh-proxy/proxy.py +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/ssh-proxy/requirements.txt +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/ssh-proxy-service.tf +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/ssh-proxy.tf +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/switch-to.sh +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/templates/al2023-cpu-user-data.sh +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/templates/user-data-self-managed.sh +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/templates/user-data.sh +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/variables.tf +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/tests/submit/README.md +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/tests/submit/fail/run.sh +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/tests/submit/multinode/run.sh +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.2}/tests/submit/success/run.sh +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: gpu-dev
|
|
3
|
-
Version: 0.5.32
|
|
3
|
+
Version: 0.6.2
|
|
4
4
|
Summary: CLI tool for PyTorch GPU developer server reservations
|
|
5
5
|
Author: PyTorch Team
|
|
6
6
|
Requires-Python: >=3.10
|
|
@@ -16,9 +16,29 @@ Requires-Dist: websockets>=12.0
|
|
|
16
16
|
Requires-Dist: certifi>=2023.7.22
|
|
17
17
|
Requires-Dist: mcp>=1.0.0
|
|
18
18
|
|
|
19
|
-
# GPU Developer CLI
|
|
19
|
+
# GPU Developer CLI & SDK
|
|
20
20
|
|
|
21
|
-
A command-line tool for reserving and managing GPU development servers
|
|
21
|
+
A command-line tool and Python SDK for reserving and managing GPU development servers.
|
|
22
|
+
|
|
23
|
+
## Python SDK
|
|
24
|
+
|
|
25
|
+
For programmatic access, use the [Python SDK](../../sdk/python/README.md):
|
|
26
|
+
|
|
27
|
+
```python
|
|
28
|
+
from gpu_dev import GpuDev
|
|
29
|
+
|
|
30
|
+
client = GpuDev()
|
|
31
|
+
sandbox = client.reserve(gpu_type="h100", gpu_count=2, hours=4)
|
|
32
|
+
result = sandbox.exec("nvidia-smi")
|
|
33
|
+
print(result.stdout)
|
|
34
|
+
sandbox.cancel()
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
Install: `pip install -e sdk/python/` — see [SDK docs](../../sdk/python/README.md) and [quickstart notebook](../../sdk/python/examples/quickstart.ipynb).
|
|
38
|
+
|
|
39
|
+
---
|
|
40
|
+
|
|
41
|
+
## CLI
|
|
22
42
|
|
|
23
43
|
## Table of Contents
|
|
24
44
|
|
|
@@ -1,6 +1,26 @@
|
|
|
1
|
-
# GPU Developer CLI
|
|
1
|
+
# GPU Developer CLI & SDK
|
|
2
2
|
|
|
3
|
-
A command-line tool for reserving and managing GPU development servers
|
|
3
|
+
A command-line tool and Python SDK for reserving and managing GPU development servers.
|
|
4
|
+
|
|
5
|
+
## Python SDK
|
|
6
|
+
|
|
7
|
+
For programmatic access, use the [Python SDK](../../sdk/python/README.md):
|
|
8
|
+
|
|
9
|
+
```python
|
|
10
|
+
from gpu_dev import GpuDev
|
|
11
|
+
|
|
12
|
+
client = GpuDev()
|
|
13
|
+
sandbox = client.reserve(gpu_type="h100", gpu_count=2, hours=4)
|
|
14
|
+
result = sandbox.exec("nvidia-smi")
|
|
15
|
+
print(result.stdout)
|
|
16
|
+
sandbox.cancel()
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
Install: `pip install -e sdk/python/` — see [SDK docs](../../sdk/python/README.md) and [quickstart notebook](../../sdk/python/examples/quickstart.ipynb).
|
|
20
|
+
|
|
21
|
+
---
|
|
22
|
+
|
|
23
|
+
## CLI
|
|
4
24
|
|
|
5
25
|
## Table of Contents
|
|
6
26
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: gpu-dev
|
|
3
|
-
Version: 0.5.32
|
|
3
|
+
Version: 0.6.2
|
|
4
4
|
Summary: CLI tool for PyTorch GPU developer server reservations
|
|
5
5
|
Author: PyTorch Team
|
|
6
6
|
Requires-Python: >=3.10
|
|
@@ -16,9 +16,29 @@ Requires-Dist: websockets>=12.0
|
|
|
16
16
|
Requires-Dist: certifi>=2023.7.22
|
|
17
17
|
Requires-Dist: mcp>=1.0.0
|
|
18
18
|
|
|
19
|
-
# GPU Developer CLI
|
|
19
|
+
# GPU Developer CLI & SDK
|
|
20
20
|
|
|
21
|
-
A command-line tool for reserving and managing GPU development servers
|
|
21
|
+
A command-line tool and Python SDK for reserving and managing GPU development servers.
|
|
22
|
+
|
|
23
|
+
## Python SDK
|
|
24
|
+
|
|
25
|
+
For programmatic access, use the [Python SDK](../../sdk/python/README.md):
|
|
26
|
+
|
|
27
|
+
```python
|
|
28
|
+
from gpu_dev import GpuDev
|
|
29
|
+
|
|
30
|
+
client = GpuDev()
|
|
31
|
+
sandbox = client.reserve(gpu_type="h100", gpu_count=2, hours=4)
|
|
32
|
+
result = sandbox.exec("nvidia-smi")
|
|
33
|
+
print(result.stdout)
|
|
34
|
+
sandbox.cancel()
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
Install: `pip install -e sdk/python/` — see [SDK docs](../../sdk/python/README.md) and [quickstart notebook](../../sdk/python/examples/quickstart.ipynb).
|
|
38
|
+
|
|
39
|
+
---
|
|
40
|
+
|
|
41
|
+
## CLI
|
|
22
42
|
|
|
23
43
|
## Table of Contents
|
|
24
44
|
|
|
@@ -30,6 +30,27 @@ docs/USER_GUIDE.md
|
|
|
30
30
|
docs/devgpu-features.html
|
|
31
31
|
docs/docker-mark-blue.svg
|
|
32
32
|
docs/icons8-cursor-ai.svg
|
|
33
|
+
sdk/python/README.md
|
|
34
|
+
sdk/python/pyproject.toml
|
|
35
|
+
sdk/python/examples/quickstart.ipynb
|
|
36
|
+
sdk/python/src/gpu_dev/__init__.py
|
|
37
|
+
sdk/python/src/gpu_dev/py.typed
|
|
38
|
+
sdk/python/src/gpu_dev/_async/__init__.py
|
|
39
|
+
sdk/python/src/gpu_dev/_backend/__init__.py
|
|
40
|
+
sdk/python/src/gpu_dev/_backend/aws.py
|
|
41
|
+
sdk/python/src/gpu_dev/_backend/protocol.py
|
|
42
|
+
sdk/python/src/gpu_dev/_sync/__init__.py
|
|
43
|
+
sdk/python/src/gpu_dev/_sync/client.py
|
|
44
|
+
sdk/python/src/gpu_dev/_sync/sandbox.py
|
|
45
|
+
sdk/python/src/gpu_dev/_transport/__init__.py
|
|
46
|
+
sdk/python/src/gpu_dev/_transport/ssh.py
|
|
47
|
+
sdk/python/src/gpu_dev/common/__init__.py
|
|
48
|
+
sdk/python/src/gpu_dev/common/config.py
|
|
49
|
+
sdk/python/src/gpu_dev/common/enums.py
|
|
50
|
+
sdk/python/src/gpu_dev/common/errors.py
|
|
51
|
+
sdk/python/src/gpu_dev/common/models.py
|
|
52
|
+
sdk/python/tests/__init__.py
|
|
53
|
+
sdk/python/tests/test_models.py
|
|
33
54
|
terraform-gpu-devservers/.terraform.lock.hcl
|
|
34
55
|
terraform-gpu-devservers/README.md
|
|
35
56
|
terraform-gpu-devservers/alb.tf
|
|
@@ -13,7 +13,7 @@ from rich.spinner import Spinner
|
|
|
13
13
|
# SSH validation result is cached locally for 24h. New keys pushed to GitHub still take effect
|
|
14
14
|
# at reservation time (pods fetch live keys via init container) — caching only skips the
|
|
15
15
|
# pre-flight "are you who you say you are" check.
|
|
16
|
-
_SSH_CACHE_TTL_SECONDS = 24 * 60 * 60
|
|
16
|
+
_SSH_CACHE_TTL_SECONDS = 14 * 24 * 60 * 60
|
|
17
17
|
_SSH_CACHE_PATH = Path(os.path.expanduser("~/.config/gpu-dev/ssh-validation-cache.json"))
|
|
18
18
|
|
|
19
19
|
# Cache for authenticate_user. STS GetCallerIdentity is stable per AWS profile and slow under SSO
|
|
@@ -41,33 +41,51 @@ from .interactive import (
|
|
|
41
41
|
console = Console()
|
|
42
42
|
|
|
43
43
|
|
|
44
|
+
_east1_table = None
|
|
45
|
+
|
|
44
46
|
def _fetch_reservations_cross_region(reservation_mgr, user_filter, statuses, config=None):
|
|
45
47
|
"""Fetch reservations from current region + prod-east1 if on prod."""
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
48
|
+
global _east1_table
|
|
49
|
+
|
|
50
|
+
from concurrent.futures import ThreadPoolExecutor
|
|
51
|
+
|
|
52
|
+
def _fetch_primary():
|
|
53
|
+
return reservation_mgr.list_reservations(
|
|
54
|
+
user_filter=user_filter, statuses_to_include=statuses)
|
|
55
|
+
|
|
56
|
+
def _fetch_east1():
|
|
57
|
+
global _east1_table
|
|
50
58
|
cfg = config or load_config()
|
|
51
|
-
if cfg.user_config.get("environment")
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
59
|
+
if cfg.user_config.get("environment") != "prod":
|
|
60
|
+
return []
|
|
61
|
+
east1_env = Config.ENVIRONMENTS.get("prod-east1", {})
|
|
62
|
+
if not east1_env or not user_filter:
|
|
63
|
+
return []
|
|
64
|
+
if _east1_table is None:
|
|
65
|
+
_east1_table = cfg.session.resource(
|
|
66
|
+
"dynamodb", region_name=east1_env["region"]
|
|
67
|
+
).Table("pytorch-gpu-dev-reservations")
|
|
68
|
+
results = []
|
|
69
|
+
for st in (statuses or ["active"]):
|
|
70
|
+
resp = _east1_table.query(
|
|
71
|
+
IndexName="UserStatusIndex",
|
|
72
|
+
KeyConditionExpression="user_id = :uid AND #s = :status",
|
|
73
|
+
ExpressionAttributeNames={"#s": "status"},
|
|
74
|
+
ExpressionAttributeValues={":uid": user_filter, ":status": st},
|
|
75
|
+
)
|
|
76
|
+
for item in resp.get("Items", []):
|
|
77
|
+
item["_region"] = "us-east-1"
|
|
78
|
+
results.append(item)
|
|
79
|
+
return results
|
|
80
|
+
|
|
81
|
+
try:
|
|
82
|
+
with ThreadPoolExecutor(max_workers=2) as ex:
|
|
83
|
+
f1 = ex.submit(_fetch_primary)
|
|
84
|
+
f2 = ex.submit(_fetch_east1)
|
|
85
|
+
reservations = f1.result()
|
|
86
|
+
reservations.extend(f2.result())
|
|
69
87
|
except Exception:
|
|
70
|
-
|
|
88
|
+
reservations = _fetch_primary()
|
|
71
89
|
return reservations
|
|
72
90
|
|
|
73
91
|
|
|
@@ -608,6 +626,8 @@ def main(ctx: click.Context) -> None:
|
|
|
608
626
|
)
|
|
609
627
|
@click.option("--spot", is_flag=True, default=False,
|
|
610
628
|
help="Acknowledge spot instance (~1/3 cost, may be preempted with 2-min notice). Required for spot-only types.")
|
|
629
|
+
@click.option("--fast-cache", is_flag=True, default=False, hidden=True,
|
|
630
|
+
help="Use NVMe local cache for faster session restore (experimental).")
|
|
611
631
|
@click.pass_context
|
|
612
632
|
def reserve(
|
|
613
633
|
ctx: click.Context,
|
|
@@ -629,6 +649,7 @@ def reserve(
|
|
|
629
649
|
disk: Optional[str],
|
|
630
650
|
node_label: tuple,
|
|
631
651
|
spot: bool = False,
|
|
652
|
+
fast_cache: bool = False,
|
|
632
653
|
) -> None:
|
|
633
654
|
"""Reserve GPU development server(s)
|
|
634
655
|
|
|
@@ -746,7 +767,10 @@ def reserve(
|
|
|
746
767
|
else:
|
|
747
768
|
f_ssh = ex.submit(validate_ssh_key_matches_github_user, config, None)
|
|
748
769
|
ssh_result = None
|
|
749
|
-
|
|
770
|
+
# Only fetch availability if we need the interactive picker
|
|
771
|
+
need_interactive = gpu_type is None
|
|
772
|
+
if need_interactive:
|
|
773
|
+
f_avail = ex.submit(reservation_mgr.get_gpu_availability_by_type)
|
|
750
774
|
|
|
751
775
|
# Surface auth failure first (most actionable).
|
|
752
776
|
try:
|
|
@@ -758,7 +782,7 @@ def reserve(
|
|
|
758
782
|
|
|
759
783
|
if ssh_result is None:
|
|
760
784
|
ssh_result = f_ssh.result()
|
|
761
|
-
availability_info = f_avail.result()
|
|
785
|
+
availability_info = f_avail.result() if need_interactive else None
|
|
762
786
|
|
|
763
787
|
# Surface SSH validation failure with the same UX as before.
|
|
764
788
|
if not ssh_result.get("valid"):
|
|
@@ -897,6 +921,13 @@ def reserve(
|
|
|
897
921
|
|
|
898
922
|
else:
|
|
899
923
|
# Non-interactive mode - use defaults and validate
|
|
924
|
+
# Route --spot to east1 when on prod (env vars override config region)
|
|
925
|
+
if spot and load_config().user_config.get("environment") == "prod":
|
|
926
|
+
east1_cfg = Config.ENVIRONMENTS.get("prod-east1", {})
|
|
927
|
+
if east1_cfg:
|
|
928
|
+
import os as _os
|
|
929
|
+
_os.environ["AWS_REGION"] = east1_cfg["region"]
|
|
930
|
+
|
|
900
931
|
if gpu_type is None:
|
|
901
932
|
gpu_type = "a100"
|
|
902
933
|
if hours is None:
|
|
@@ -1101,11 +1132,13 @@ def reserve(
|
|
|
1101
1132
|
rprint(f"[red]❌ {str(e)}[/red]")
|
|
1102
1133
|
return
|
|
1103
1134
|
|
|
1104
|
-
# Validate SSH key matches configured GitHub username
|
|
1105
|
-
live.update(Spinner("dots", text="🔐 Validating SSH key..."))
|
|
1135
|
+
# Validate SSH key matches configured GitHub username (cached, ~0ms)
|
|
1106
1136
|
if not _validate_ssh_key_or_exit(config, live):
|
|
1107
1137
|
return
|
|
1108
1138
|
|
|
1139
|
+
live.update(Spinner("dots", text="📡 Preparing reservation..."))
|
|
1140
|
+
reservation_mgr = ReservationManager(config)
|
|
1141
|
+
|
|
1109
1142
|
# Track if user explicitly requests no persistent disk
|
|
1110
1143
|
explicit_no_disk = explicit_no_disk_from_param
|
|
1111
1144
|
|
|
@@ -1217,11 +1250,6 @@ def reserve(
|
|
|
1217
1250
|
rprint(f"[yellow]Use a different disk or wait for the reservation to end[/yellow]")
|
|
1218
1251
|
return
|
|
1219
1252
|
|
|
1220
|
-
live.update(
|
|
1221
|
-
Spinner("dots", text="📡 Setting up reservation manager...")
|
|
1222
|
-
)
|
|
1223
|
-
reservation_mgr = ReservationManager(config)
|
|
1224
|
-
|
|
1225
1253
|
# Submit reservation request
|
|
1226
1254
|
live.update(
|
|
1227
1255
|
Spinner("dots", text="📡 Submitting reservation request...")
|
|
@@ -1357,6 +1385,7 @@ def reserve(
|
|
|
1357
1385
|
spot=spot,
|
|
1358
1386
|
node_labels=node_labels if node_labels else None,
|
|
1359
1387
|
trace=trace,
|
|
1388
|
+
fast_cache=fast_cache,
|
|
1360
1389
|
)
|
|
1361
1390
|
reservation_ids = [reservation_id] if reservation_id else None
|
|
1362
1391
|
|
|
@@ -2568,10 +2597,21 @@ def cancel(
|
|
|
2568
2597
|
with Live(
|
|
2569
2598
|
Spinner("dots", text="📡 Cancelling reservations..."), console=console
|
|
2570
2599
|
) as live:
|
|
2600
|
+
# Build east1 reservation manager for cross-region cancellations
|
|
2601
|
+
east1_mgr = None
|
|
2602
|
+
east1_env = Config.ENVIRONMENTS.get("prod-east1", {})
|
|
2603
|
+
if east1_env:
|
|
2604
|
+
import os as _os
|
|
2605
|
+
_east1_config = Config()
|
|
2606
|
+
_east1_config.aws_region = east1_env["region"]
|
|
2607
|
+
east1_mgr = ReservationManager(_east1_config)
|
|
2608
|
+
|
|
2571
2609
|
for reservation in reservations:
|
|
2572
2610
|
res_id = reservation.get("reservation_id", "")
|
|
2573
2611
|
if res_id:
|
|
2574
|
-
|
|
2612
|
+
# Use east1 manager for east1 reservations
|
|
2613
|
+
mgr = east1_mgr if reservation.get("_region") in ("east1", "us-east-1") and east1_mgr else reservation_mgr
|
|
2614
|
+
success = mgr.cancel_reservation(
|
|
2575
2615
|
res_id, user_info["user_id"]
|
|
2576
2616
|
)
|
|
2577
2617
|
if success:
|
|
@@ -2869,36 +2909,42 @@ def _show_availability() -> None:
|
|
|
2869
2909
|
) as live:
|
|
2870
2910
|
config = load_config()
|
|
2871
2911
|
|
|
2872
|
-
# Authenticate
|
|
2912
|
+
# Authenticate and fetch availability (both regions in parallel)
|
|
2873
2913
|
try:
|
|
2874
2914
|
user_info = authenticate_user(config)
|
|
2875
2915
|
reservation_mgr = ReservationManager(config)
|
|
2876
|
-
|
|
2916
|
+
|
|
2917
|
+
from concurrent.futures import ThreadPoolExecutor
|
|
2918
|
+
_env_name = config.user_config.get("environment", "prod")
|
|
2919
|
+
_east1_spot_types = frozenset(Config.ENVIRONMENTS.get("prod-east1", {}).get("spot_types", []))
|
|
2920
|
+
|
|
2921
|
+
def _fetch_east1_spot():
|
|
2922
|
+
if _env_name != "prod" or not _east1_spot_types:
|
|
2923
|
+
return {}
|
|
2924
|
+
east1_r = Config.ENVIRONMENTS["prod-east1"]["region"]
|
|
2925
|
+
east1_table = config.session.resource("dynamodb", region_name=east1_r).Table("pytorch-gpu-dev-gpu-availability")
|
|
2926
|
+
result = {}
|
|
2927
|
+
for item in east1_table.scan().get("Items", []):
|
|
2928
|
+
gt = item.get("gpu_type", "")
|
|
2929
|
+
if gt in _east1_spot_types:
|
|
2930
|
+
result[gt] = {
|
|
2931
|
+
"available": int(item.get("available_gpus", 0)),
|
|
2932
|
+
"total": int(item.get("total_gpus", 0)),
|
|
2933
|
+
"max_reservable": int(item.get("max_reservable", 0)),
|
|
2934
|
+
"spot_info": item.get("spot_info", {}),
|
|
2935
|
+
}
|
|
2936
|
+
return result
|
|
2937
|
+
|
|
2938
|
+
with ThreadPoolExecutor(max_workers=2) as ex:
|
|
2939
|
+
f_avail = ex.submit(reservation_mgr.get_gpu_availability_by_type)
|
|
2940
|
+
f_spot = ex.submit(_fetch_east1_spot)
|
|
2941
|
+
availability_info = f_avail.result()
|
|
2942
|
+
spot_region_info = f_spot.result()
|
|
2877
2943
|
except RuntimeError as e:
|
|
2878
2944
|
live.stop()
|
|
2879
2945
|
rprint(f"[red]❌ {str(e)}[/red]")
|
|
2880
2946
|
return
|
|
2881
2947
|
|
|
2882
|
-
# Cross-region: fetch spot availability from prod-east1
|
|
2883
|
-
spot_region_info = {}
|
|
2884
|
-
_env_name = config.user_config.get("environment", "prod")
|
|
2885
|
-
_east1_spot_types = frozenset(Config.ENVIRONMENTS.get("prod-east1", {}).get("spot_types", []))
|
|
2886
|
-
if _env_name == "prod" and _east1_spot_types:
|
|
2887
|
-
try:
|
|
2888
|
-
import boto3 as _b3
|
|
2889
|
-
east1_r = Config.ENVIRONMENTS["prod-east1"]["region"]
|
|
2890
|
-
for item in _b3.resource("dynamodb", region_name=east1_r).Table("pytorch-gpu-dev-gpu-availability").scan().get("Items", []):
|
|
2891
|
-
gt = item.get("gpu_type", "")
|
|
2892
|
-
if gt in _east1_spot_types:
|
|
2893
|
-
spot_region_info[gt] = {
|
|
2894
|
-
"available": int(item.get("available_gpus", 0)),
|
|
2895
|
-
"total": int(item.get("total_gpus", 0)),
|
|
2896
|
-
"max_reservable": int(item.get("max_reservable", 0)),
|
|
2897
|
-
"spot_info": item.get("spot_info", {}),
|
|
2898
|
-
}
|
|
2899
|
-
except Exception:
|
|
2900
|
-
pass
|
|
2901
|
-
|
|
2902
2948
|
if availability_info:
|
|
2903
2949
|
# GPU architecture mapping (for display)
|
|
2904
2950
|
gpu_architectures = {
|
|
@@ -3255,8 +3301,19 @@ def connect(ctx: click.Context, reservation_id: Optional[str]) -> None:
|
|
|
3255
3301
|
For VS Code Remote or manual SSH, use 'gpu-dev show' to see full SSH command.
|
|
3256
3302
|
"""
|
|
3257
3303
|
import subprocess
|
|
3304
|
+
from pathlib import Path
|
|
3258
3305
|
|
|
3259
3306
|
try:
|
|
3307
|
+
# Fast path: if reservation ID given, check local SSH config first (no network)
|
|
3308
|
+
if reservation_id:
|
|
3309
|
+
ssh_config_dir = Path.home() / ".gpu-dev"
|
|
3310
|
+
matches = list(ssh_config_dir.glob(f"{reservation_id}*-sshconfig")) if ssh_config_dir.exists() else []
|
|
3311
|
+
if matches:
|
|
3312
|
+
pod_name = f"gpu-dev-{reservation_id[:8]}"
|
|
3313
|
+
rprint(f"[cyan]Connecting to {pod_name}...[/cyan]\n")
|
|
3314
|
+
os.execvp("ssh", ["ssh", pod_name])
|
|
3315
|
+
return
|
|
3316
|
+
|
|
3260
3317
|
with Live(
|
|
3261
3318
|
Spinner("dots", text="📡 Fetching reservation details..."), console=console
|
|
3262
3319
|
) as live:
|
|
@@ -3301,21 +3358,30 @@ def connect(ctx: click.Context, reservation_id: Optional[str]) -> None:
|
|
|
3301
3358
|
|
|
3302
3359
|
live.start()
|
|
3303
3360
|
|
|
3304
|
-
#
|
|
3305
|
-
_sel = next((r for r in (locals().get("reservations") or []) if r.get("reservation_id", "").startswith(reservation_id)), None)
|
|
3306
|
-
if _sel and _sel.get("_region") == "us-east-1":
|
|
3307
|
-
import os as _os
|
|
3308
|
-
east1_cfg = Config.ENVIRONMENTS.get("prod-east1", {})
|
|
3309
|
-
_os.environ["AWS_DEFAULT_REGION"] = east1_cfg["region"]
|
|
3310
|
-
_east1_config = Config()
|
|
3311
|
-
_east1_config.aws_region = east1_cfg["region"]
|
|
3312
|
-
reservation_mgr = ReservationManager(_east1_config)
|
|
3313
|
-
|
|
3314
|
-
# Get connection info
|
|
3361
|
+
# Try current region first, then cross-region if not found
|
|
3315
3362
|
connection_info = reservation_mgr.get_connection_info(
|
|
3316
3363
|
reservation_id, user_info["user_id"]
|
|
3317
3364
|
)
|
|
3318
3365
|
|
|
3366
|
+
# If not found, try the other region
|
|
3367
|
+
if not connection_info:
|
|
3368
|
+
import os as _os
|
|
3369
|
+
current_env = config.user_config.get("environment", "prod")
|
|
3370
|
+
other_envs = {"prod": "prod-east1", "prod-east1": "prod"}
|
|
3371
|
+
other_env_name = other_envs.get(current_env)
|
|
3372
|
+
if other_env_name:
|
|
3373
|
+
other_env = Config.ENVIRONMENTS.get(other_env_name, {})
|
|
3374
|
+
if other_env:
|
|
3375
|
+
_os.environ["AWS_DEFAULT_REGION"] = other_env["region"]
|
|
3376
|
+
_other_config = Config()
|
|
3377
|
+
_other_config.aws_region = other_env["region"]
|
|
3378
|
+
other_mgr = ReservationManager(_other_config)
|
|
3379
|
+
connection_info = other_mgr.get_connection_info(
|
|
3380
|
+
reservation_id, user_info["user_id"]
|
|
3381
|
+
)
|
|
3382
|
+
if connection_info:
|
|
3383
|
+
reservation_mgr = other_mgr
|
|
3384
|
+
|
|
3319
3385
|
live.stop()
|
|
3320
3386
|
|
|
3321
3387
|
if not connection_info:
|
|
@@ -3864,7 +3930,7 @@ def set(key: str, value: str) -> None:
|
|
|
3864
3930
|
|
|
3865
3931
|
|
|
3866
3932
|
@config.command()
|
|
3867
|
-
@click.argument("env_name", type=click.Choice(["test", "prod"
|
|
3933
|
+
@click.argument("env_name", type=click.Choice(["test", "prod"]))
|
|
3868
3934
|
def environment(env_name: str) -> None:
|
|
3869
3935
|
"""Set the environment
|
|
3870
3936
|
|
|
@@ -3876,7 +3942,7 @@ def environment(env_name: str) -> None:
|
|
|
3876
3942
|
\b
|
|
3877
3943
|
Examples:
|
|
3878
3944
|
gpu-dev config environment prod # Production (us-east-2)
|
|
3879
|
-
gpu-dev config environment prod
|
|
3945
|
+
gpu-dev config environment prod # Production (spot accessible via interactive picker)
|
|
3880
3946
|
gpu-dev config environment test # Test (us-west-1)
|
|
3881
3947
|
|
|
3882
3948
|
Environment configurations:
|
|
@@ -3,6 +3,7 @@
|
|
|
3
3
|
import os
|
|
4
4
|
import json
|
|
5
5
|
import boto3
|
|
6
|
+
import botocore.exceptions
|
|
6
7
|
from pathlib import Path
|
|
7
8
|
from typing import Dict, Any, Optional
|
|
8
9
|
|
|
@@ -42,13 +43,14 @@ class Config:
|
|
|
42
43
|
# Load unified config (handles migration from legacy files)
|
|
43
44
|
self.user_config = self._load_config()
|
|
44
45
|
|
|
45
|
-
# Get region
|
|
46
|
-
|
|
46
|
+
# Get region: env vars take priority (for spot routing), then config, then default
|
|
47
|
+
env_region = os.getenv("AWS_REGION") or os.getenv("AWS_DEFAULT_REGION")
|
|
48
|
+
if env_region and env_region != self.user_config.get("region"):
|
|
49
|
+
self.aws_region = env_region
|
|
50
|
+
elif self.user_config.get("region"):
|
|
47
51
|
self.aws_region = self.user_config["region"]
|
|
48
52
|
else:
|
|
49
|
-
self.aws_region = os.getenv(
|
|
50
|
-
"AWS_REGION", os.getenv("AWS_DEFAULT_REGION", "us-east-2")
|
|
51
|
-
)
|
|
53
|
+
self.aws_region = "us-east-2"
|
|
52
54
|
|
|
53
55
|
os.environ["AWS_DEFAULT_REGION"] = self.aws_region
|
|
54
56
|
|
|
@@ -71,17 +73,63 @@ class Config:
|
|
|
71
73
|
self._sqs_client = None
|
|
72
74
|
self._dynamodb = None
|
|
73
75
|
|
|
76
|
+
_CRED_CACHE = Path.home() / ".config" / "gpu-dev" / "aws-cred-cache.json"
|
|
77
|
+
|
|
74
78
|
def _create_aws_session(self):
    """Create a boto3 session, reusing cached credentials when possible.

    Resolving credentials through the SSO/profile chain is slow (~900ms),
    so resolved temporary credentials are stored in ``self._CRED_CACHE``
    and reused until the cache entry's ``expires`` timestamp passes.

    Returns:
        A ``boto3.Session`` bound to ``self.aws_region``.

    Raises:
        Whatever ``boto3.Session()`` / ``get_credentials()`` raise when the
        default credential chain cannot be set up. Failures of the
        ``gpu-dev`` profile and all cache read/write problems are handled
        internally; the cache is strictly best-effort.
    """
    import tempfile as _tempfile
    import time as _time

    # Fast path: reuse cached credentials. A missing, unreadable, malformed
    # or expired cache simply falls through to the slow path.
    try:
        cached = json.loads(self._CRED_CACHE.read_text())
        fields = [cached[key] for key in ("access_key", "secret_key", "token")]
        if (
            all(isinstance(value, str) and value for value in fields)
            and _time.time() < cached.get("expires", 0)
        ):
            return boto3.Session(
                aws_access_key_id=fields[0],
                aws_secret_access_key=fields[1],
                aws_session_token=fields[2],
                region_name=self.aws_region,
            )
    except (OSError, ValueError, KeyError, TypeError, AttributeError):
        pass

    # Slow path (~900ms): prefer the dedicated "gpu-dev" profile, then fall
    # back to the default credential chain. The region is passed explicitly
    # so this path yields the same region as the cached path above.
    session = None
    creds = None
    try:
        session = boto3.Session(
            profile_name="gpu-dev", region_name=self.aws_region
        )
        creds = session.get_credentials()
    except Exception:
        # Profile missing or not resolvable: use the default chain instead.
        creds = None
    if creds is None:
        session = boto3.Session(region_name=self.aws_region)
        creds = session.get_credentials()

    # Cache only temporary credentials (those carrying a session token).
    frozen = None
    if creds is not None:
        try:
            frozen = creds.get_frozen_credentials()
        except Exception:
            # Best-effort: a resolution failure here will surface on the
            # first real API call made with the returned session.
            frozen = None

    if frozen is not None and frozen.token:
        tmp_name = None
        try:
            payload = json.dumps({
                "access_key": frozen.access_key,
                "secret_key": frozen.secret_key,
                "token": frozen.token,
                # cache 45min (SSO tokens last ~1h)
                "expires": _time.time() + 2700,
            })
            cache_dir = self._CRED_CACHE.parent
            cache_dir.mkdir(parents=True, exist_ok=True)
            # mkstemp creates the file readable/writable by the owner only,
            # so the secrets are never exposed under a permissive umask.
            # os.replace then publishes the file atomically, so a concurrent
            # CLI invocation never reads a half-written cache.
            fd, tmp_name = _tempfile.mkstemp(
                dir=str(cache_dir), prefix=".aws-cred-cache."
            )
            with os.fdopen(fd, "w") as handle:
                handle.write(payload)
            os.replace(tmp_name, str(self._CRED_CACHE))
            tmp_name = None
        except (OSError, TypeError, ValueError):
            pass
        finally:
            if tmp_name is not None:
                try:
                    os.unlink(tmp_name)
                except OSError:
                    pass

    return session
|
|
122
|
+
|
|
123
|
+
def refresh_session(self):
    """Drop the on-disk credential cache and build a fresh AWS session.

    Intended to be called after an ExpiredTokenException. The lazily
    created service clients are reset so they are not reused from the
    previous session.
    """
    # Removing the cache is best-effort; a failure must not block the refresh.
    try:
        self._CRED_CACHE.unlink(missing_ok=True)
    except Exception:
        pass

    self.session = self._create_aws_session()

    # Clear the memoized clients only after the new session exists.
    for client_attr in ("_sts_client", "_sqs_client", "_dynamodb"):
        setattr(self, client_attr, None)
|
|
85
133
|
|
|
86
134
|
@property
|
|
87
135
|
def sts_client(self):
|
|
@@ -355,8 +355,21 @@ def unlock_disk(disk_name: str, user_id: str, config: Config) -> bool:
|
|
|
355
355
|
return False
|
|
356
356
|
|
|
357
357
|
if not disk['in_use']:
|
|
358
|
-
|
|
359
|
-
|
|
358
|
+
# DDB says not locked — but check if EBS volume is still physically attached
|
|
359
|
+
try:
|
|
360
|
+
ec2 = config.session.client('ec2', region_name=config.aws_region)
|
|
361
|
+
vols = ec2.describe_volumes(Filters=[
|
|
362
|
+
{"Name": "tag:gpu-dev-user", "Values": [user_id]},
|
|
363
|
+
{"Name": "tag:disk_name", "Values": [disk_name]},
|
|
364
|
+
{"Name": "status", "Values": ["in-use"]},
|
|
365
|
+
]).get("Volumes", [])
|
|
366
|
+
if not vols:
|
|
367
|
+
print(f"Disk '{disk_name}' is not locked")
|
|
368
|
+
return False
|
|
369
|
+
print(f"Disk '{disk_name}' DDB lock is clear but EBS volume is still attached — sending force-detach request")
|
|
370
|
+
except Exception:
|
|
371
|
+
print(f"Disk '{disk_name}' is not locked")
|
|
372
|
+
return False
|
|
360
373
|
|
|
361
374
|
operation_id = str(uuid.uuid4())
|
|
362
375
|
|