gpu-dev 0.6.5__tar.gz → 0.6.6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/PKG-INFO +1 -1
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/PKG-INFO +1 -1
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/SOURCES.txt +1 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/cli-tools/gpu-dev-cli/gpu_dev_cli/disks.py +7 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/pyproject.toml +1 -1
- gpu_dev-0.6.6/sdk/python/examples/parallel_experiments.ipynb +360 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/sdk/python/src/gpu_dev/_backend/aws.py +33 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/sdk/python/src/gpu_dev/_backend/protocol.py +8 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/sdk/python/src/gpu_dev/_sync/client.py +36 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/terraform-gpu-devservers/availability.tf +1 -1
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/terraform-gpu-devservers/expiry.tf +1 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/terraform-gpu-devservers/lambda/availability_updater/index.py +8 -7
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/terraform-gpu-devservers/lambda/reservation_expiry/index.py +1 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/terraform-gpu-devservers/lambda/reservation_processor/index.py +27 -13
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/terraform-gpu-devservers/lambda/shared/dns_utils.py +1 -2
- gpu_dev-0.6.6/terraform-gpu-devservers/lambda/shared/requirements.txt +3 -0
- gpu_dev-0.6.5/terraform-gpu-devservers/lambda/availability_updater/requirements.txt +0 -2
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/.github/workflows/no-gitlinks.yml +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/.github/workflows/publish.yml +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/.gitignore +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/CLAUDE.md +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/README.md +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/admin/README.md +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/admin/generate_stats.py +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/admin/requirements.txt +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/cli-tools/gpu-dev-cli/README.md +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/cli-tools/gpu-dev-cli/ZERO_CONFIG_SETUP.md +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/dependency_links.txt +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/entry_points.txt +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/requires.txt +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/top_level.txt +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/cli-tools/gpu-dev-cli/gpu_dev_cli/__init__.py +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/cli-tools/gpu-dev-cli/gpu_dev_cli/auth.py +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/cli-tools/gpu-dev-cli/gpu_dev_cli/cli.py +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/cli-tools/gpu-dev-cli/gpu_dev_cli/config.py +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/cli-tools/gpu-dev-cli/gpu_dev_cli/interactive.py +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/cli-tools/gpu-dev-cli/gpu_dev_cli/name_generator.py +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/cli-tools/gpu-dev-cli/gpu_dev_cli/reservations.py +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/cli-tools/gpu-dev-cli/gpu_dev_cli/ssh_proxy.py +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/cli-tools/gpu-dev-cli/minimal-iam-policy.json +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/cli-tools/scripts/clear_stale_disk_locks.py +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/docs/USER_GUIDE.md +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/docs/devgpu-features.html +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/docs/docker-mark-blue.svg +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/docs/icons8-cursor-ai.svg +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/sdk/python/README.md +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/sdk/python/examples/batch_multi_gpu.py +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/sdk/python/examples/interactive_debug.py +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/sdk/python/examples/quickstart.ipynb +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/sdk/python/examples/run_tests.py +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/sdk/python/examples/submit_job.py +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/sdk/python/pyproject.toml +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/sdk/python/src/gpu_dev/__init__.py +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/sdk/python/src/gpu_dev/_async/__init__.py +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/sdk/python/src/gpu_dev/_backend/__init__.py +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/sdk/python/src/gpu_dev/_sync/__init__.py +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/sdk/python/src/gpu_dev/_sync/sandbox.py +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/sdk/python/src/gpu_dev/_transport/__init__.py +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/sdk/python/src/gpu_dev/_transport/ssh.py +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/sdk/python/src/gpu_dev/common/__init__.py +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/sdk/python/src/gpu_dev/common/config.py +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/sdk/python/src/gpu_dev/common/enums.py +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/sdk/python/src/gpu_dev/common/errors.py +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/sdk/python/src/gpu_dev/common/models.py +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/sdk/python/src/gpu_dev/py.typed +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/sdk/python/tests/__init__.py +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/sdk/python/tests/test_models.py +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/setup.cfg +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/terraform-gpu-devservers/.claude/skills/deploy.md +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/terraform-gpu-devservers/.terraform.lock.hcl +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/terraform-gpu-devservers/README.md +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/terraform-gpu-devservers/alb.tf +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/terraform-gpu-devservers/ami-baker.tf +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/terraform-gpu-devservers/backend.tf +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/terraform-gpu-devservers/check_b200.py +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/terraform-gpu-devservers/cluster-autoscaler.tf +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/terraform-gpu-devservers/cmd_proxy.py +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/terraform-gpu-devservers/docker/.dockerignore +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/terraform-gpu-devservers/docker/Dockerfile +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/terraform-gpu-devservers/docker/backup-dotfiles +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/terraform-gpu-devservers/docker/bash_profile +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/terraform-gpu-devservers/docker/bashrc +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/terraform-gpu-devservers/docker/bashrc_ext +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/terraform-gpu-devservers/docker/build-with-efa.sh +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/terraform-gpu-devservers/docker/dotfiles-shutdown-handler +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/terraform-gpu-devservers/docker/list-dotfile-versions +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/terraform-gpu-devservers/docker/motd_script +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/terraform-gpu-devservers/docker/nproc_wrapper +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/terraform-gpu-devservers/docker/profile +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/terraform-gpu-devservers/docker/restore-dotfiles +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/terraform-gpu-devservers/docker/restore-dotfiles-version +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/terraform-gpu-devservers/docker/setup-dotfiles-persistence +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/terraform-gpu-devservers/docker/shell_env +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/terraform-gpu-devservers/docker/ssh_config +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/terraform-gpu-devservers/docker/zprofile +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/terraform-gpu-devservers/docker/zshrc +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/terraform-gpu-devservers/docker/zshrc_ext +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/terraform-gpu-devservers/docker-build.tf +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/terraform-gpu-devservers/docker-example/Dockerfile +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/terraform-gpu-devservers/docker-example/hello.txt +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/terraform-gpu-devservers/ecr.tf +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/terraform-gpu-devservers/efs.tf +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/terraform-gpu-devservers/eks.tf +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/terraform-gpu-devservers/git-cache.tf +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/terraform-gpu-devservers/gpu-dev-pod-irsa.tf +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/terraform-gpu-devservers/kubernetes.tf +0 -0
- {gpu_dev-0.6.5/terraform-gpu-devservers/lambda/reservation_expiry → gpu_dev-0.6.6/terraform-gpu-devservers/lambda/availability_updater}/requirements.txt +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/terraform-gpu-devservers/lambda/migration/tag_largest_snapshots.py +0 -0
- {gpu_dev-0.6.5/terraform-gpu-devservers/lambda/reservation_processor → gpu_dev-0.6.6/terraform-gpu-devservers/lambda/reservation_expiry}/requirements.txt +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/terraform-gpu-devservers/lambda/reservation_processor/buildkit_job.py +0 -0
- {gpu_dev-0.6.5/terraform-gpu-devservers/lambda/shared → gpu_dev-0.6.6/terraform-gpu-devservers/lambda/reservation_processor}/requirements.txt +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/terraform-gpu-devservers/lambda/shared/__init__.py +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/terraform-gpu-devservers/lambda/shared/alb_utils.py +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/terraform-gpu-devservers/lambda/shared/k8s_client.py +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/terraform-gpu-devservers/lambda/shared/k8s_resource_tracker.py +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/terraform-gpu-devservers/lambda/shared/snapshot_utils.py +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/terraform-gpu-devservers/lambda.tf +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/terraform-gpu-devservers/list_b200.py +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/terraform-gpu-devservers/main.tf +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/terraform-gpu-devservers/mig-config.tf +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/terraform-gpu-devservers/mig-parted-config.yaml +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py.bak +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/terraform-gpu-devservers/migrations/check_snapshots.py +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/terraform-gpu-devservers/migrations/migrate_disks_to_named.py +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/terraform-gpu-devservers/migrations/run_backfill.sh +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/terraform-gpu-devservers/monitoring.tf +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/terraform-gpu-devservers/node-termination-handler.tf +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/terraform-gpu-devservers/outputs.tf +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/terraform-gpu-devservers/pyproject.toml +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/terraform-gpu-devservers/queue.tf +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/terraform-gpu-devservers/route53.tf +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/terraform-gpu-devservers/s3-disk-contents.tf +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/terraform-gpu-devservers/scripts/CLEANUP_GUIDE.md +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/terraform-gpu-devservers/scripts/detect_empty_volumes.sh +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/terraform-gpu-devservers/scripts/ec2_avail_probe.sh +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/terraform-gpu-devservers/scripts/inspect_user_data.sh +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/terraform-gpu-devservers/ssh-proxy/Dockerfile +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/terraform-gpu-devservers/ssh-proxy/proxy.py +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/terraform-gpu-devservers/ssh-proxy/requirements.txt +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/terraform-gpu-devservers/ssh-proxy-service.tf +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/terraform-gpu-devservers/ssh-proxy.tf +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/terraform-gpu-devservers/subnet-0fe3a2c45570091ad +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/terraform-gpu-devservers/switch-to.sh +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/terraform-gpu-devservers/templates/al2023-cpu-user-data.sh +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/terraform-gpu-devservers/templates/al2023-user-data.sh +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/terraform-gpu-devservers/templates/ami-baker-user-data.sh +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/terraform-gpu-devservers/templates/user-data-self-managed.sh +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/terraform-gpu-devservers/templates/user-data.sh +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/terraform-gpu-devservers/variables.tf +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/tests/submit/README.md +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/tests/submit/fail/run.sh +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/tests/submit/multinode/run.sh +0 -0
- {gpu_dev-0.6.5 → gpu_dev-0.6.6}/tests/submit/success/run.sh +0 -0
|
@@ -34,6 +34,7 @@ sdk/python/README.md
|
|
|
34
34
|
sdk/python/pyproject.toml
|
|
35
35
|
sdk/python/examples/batch_multi_gpu.py
|
|
36
36
|
sdk/python/examples/interactive_debug.py
|
|
37
|
+
sdk/python/examples/parallel_experiments.ipynb
|
|
37
38
|
sdk/python/examples/quickstart.ipynb
|
|
38
39
|
sdk/python/examples/run_tests.py
|
|
39
40
|
sdk/python/examples/submit_job.py
|
|
@@ -243,6 +243,13 @@ def list_disks(user_id: str, config: Config) -> List[Dict]:
|
|
|
243
243
|
except Exception:
|
|
244
244
|
pass
|
|
245
245
|
|
|
246
|
+
# Filter out expired deleted disks (delete_date has passed)
|
|
247
|
+
today = datetime.now(timezone.utc).strftime('%Y-%m-%d')
|
|
248
|
+
disks = [
|
|
249
|
+
d for d in disks
|
|
250
|
+
if not (d.get('is_deleted') and d.get('delete_date') and str(d['delete_date']) <= today)
|
|
251
|
+
]
|
|
252
|
+
|
|
246
253
|
# Sort by last_used (most recent first)
|
|
247
254
|
disks.sort(key=lambda d: d['last_used'] or datetime.min.replace(tzinfo=timezone.utc), reverse=True)
|
|
248
255
|
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "gpu-dev"
|
|
7
|
-
version = "0.6.5"
|
|
7
|
+
version = "0.6.6"
|
|
8
8
|
description = "CLI tool for PyTorch GPU developer server reservations"
|
|
9
9
|
authors = [{name = "PyTorch Team"}]
|
|
10
10
|
readme = "cli-tools/gpu-dev-cli/README.md"
|
|
@@ -0,0 +1,360 @@
|
|
|
1
|
+
{
|
|
2
|
+
"cells": [
|
|
3
|
+
{
|
|
4
|
+
"cell_type": "markdown",
|
|
5
|
+
"metadata": {},
|
|
6
|
+
"source": [
|
|
7
|
+
"# Parallel Experiments with Persistent Disks\n",
|
|
8
|
+
"\n",
|
|
9
|
+
"This notebook demonstrates:\n",
|
|
10
|
+
"1. Creating a base environment on a persistent disk\n",
|
|
11
|
+
"2. Making changes (installing packages, modifying code)\n",
|
|
12
|
+
"3. Cloning the disk for parallel experiments\n",
|
|
13
|
+
"4. Running two experiments simultaneously on different GPUs\n",
|
|
14
|
+
"5. Comparing results and measuring timings"
|
|
15
|
+
]
|
|
16
|
+
},
|
|
17
|
+
{
|
|
18
|
+
"cell_type": "code",
|
|
19
|
+
"execution_count": null,
|
|
20
|
+
"metadata": {},
|
|
21
|
+
"outputs": [],
|
|
22
|
+
"source": [
|
|
23
|
+
"%pip install -e .. -q"
|
|
24
|
+
]
|
|
25
|
+
},
|
|
26
|
+
{
|
|
27
|
+
"cell_type": "code",
|
|
28
|
+
"execution_count": null,
|
|
29
|
+
"metadata": {},
|
|
30
|
+
"outputs": [],
|
|
31
|
+
"source": [
|
|
32
|
+
"import time\n",
|
|
33
|
+
"from concurrent.futures import ThreadPoolExecutor, as_completed\n",
|
|
34
|
+
"from gpu_dev import GpuDev\n",
|
|
35
|
+
"\n",
|
|
36
|
+
"client = GpuDev()\n",
|
|
37
|
+
"print(f\"SDK v{__import__('gpu_dev').__version__}\")"
|
|
38
|
+
]
|
|
39
|
+
},
|
|
40
|
+
{
|
|
41
|
+
"cell_type": "markdown",
|
|
42
|
+
"metadata": {},
|
|
43
|
+
"source": [
|
|
44
|
+
"## Step 1: Create Base Environment on Persistent Disk\n",
|
|
45
|
+
"\n",
|
|
46
|
+
"Reserve a GPU with a persistent disk and set up the base experiment."
|
|
47
|
+
]
|
|
48
|
+
},
|
|
49
|
+
{
|
|
50
|
+
"cell_type": "code",
|
|
51
|
+
"execution_count": null,
|
|
52
|
+
"metadata": {},
|
|
53
|
+
"outputs": [],
|
|
54
|
+
"source": [
|
|
55
|
+
"t0 = time.time()\n",
|
|
56
|
+
"\n",
|
|
57
|
+
"base = client.reserve(\n",
|
|
58
|
+
" gpu_type=\"t4\",\n",
|
|
59
|
+
" gpu_count=1,\n",
|
|
60
|
+
" hours=1,\n",
|
|
61
|
+
" disk_name=\"experiment-base\",\n",
|
|
62
|
+
" name=\"base-setup\",\n",
|
|
63
|
+
")\n",
|
|
64
|
+
"\n",
|
|
65
|
+
"reserve_time = time.time() - t0\n",
|
|
66
|
+
"print(f\"Reserved in {reserve_time:.1f}s\")\n",
|
|
67
|
+
"print(f\"Disk: {base.disk_name}\")\n",
|
|
68
|
+
"print(f\"GPU: {base.gpu_type} x{base.gpu_count}\")"
|
|
69
|
+
]
|
|
70
|
+
},
|
|
71
|
+
{
|
|
72
|
+
"cell_type": "code",
|
|
73
|
+
"execution_count": null,
|
|
74
|
+
"metadata": {},
|
|
75
|
+
"outputs": [],
|
|
76
|
+
"source": [
|
|
77
|
+
"# Set up the base experiment: install packages + write training script\n",
|
|
78
|
+
"base.exec(\"pip install -q wandb timm\")\n",
|
|
79
|
+
"\n",
|
|
80
|
+
"# Write a parameterized training script\n",
|
|
81
|
+
"base.exec(r\"\"\"\n",
|
|
82
|
+
"cat > /home/dev/train.py << 'SCRIPT'\n",
|
|
83
|
+
"import torch\n",
|
|
84
|
+
"import torch.nn as nn\n",
|
|
85
|
+
"import time\n",
|
|
86
|
+
"import json\n",
|
|
87
|
+
"import os\n",
|
|
88
|
+
"import sys\n",
|
|
89
|
+
"\n",
|
|
90
|
+
"# Read experiment config from env\n",
|
|
91
|
+
"LR = float(os.environ.get('LR', '0.001'))\n",
|
|
92
|
+
"BATCH_SIZE = int(os.environ.get('BATCH_SIZE', '64'))\n",
|
|
93
|
+
"EPOCHS = int(os.environ.get('EPOCHS', '5'))\n",
|
|
94
|
+
"EXP_NAME = os.environ.get('EXP_NAME', 'default')\n",
|
|
95
|
+
"\n",
|
|
96
|
+
"print(f\"Experiment: {EXP_NAME}\")\n",
|
|
97
|
+
"print(f\"Config: lr={LR}, batch_size={BATCH_SIZE}, epochs={EPOCHS}\")\n",
|
|
98
|
+
"print(f\"GPU: {torch.cuda.get_device_name(0)}\")\n",
|
|
99
|
+
"print(f\"PyTorch: {torch.__version__}\")\n",
|
|
100
|
+
"\n",
|
|
101
|
+
"# Simple CNN on synthetic data\n",
|
|
102
|
+
"model = nn.Sequential(\n",
|
|
103
|
+
" nn.Conv2d(3, 32, 3, padding=1), nn.ReLU(), nn.MaxPool2d(2),\n",
|
|
104
|
+
" nn.Conv2d(32, 64, 3, padding=1), nn.ReLU(), nn.MaxPool2d(2),\n",
|
|
105
|
+
" nn.Flatten(), nn.Linear(64 * 8 * 8, 10)\n",
|
|
106
|
+
").cuda()\n",
|
|
107
|
+
"\n",
|
|
108
|
+
"optimizer = torch.optim.Adam(model.parameters(), lr=LR)\n",
|
|
109
|
+
"criterion = nn.CrossEntropyLoss()\n",
|
|
110
|
+
"\n",
|
|
111
|
+
"results = {'experiment': EXP_NAME, 'lr': LR, 'batch_size': BATCH_SIZE, 'losses': [], 'epoch_times': []}\n",
|
|
112
|
+
"\n",
|
|
113
|
+
"for epoch in range(EPOCHS):\n",
|
|
114
|
+
" t_start = time.time()\n",
|
|
115
|
+
" epoch_loss = 0\n",
|
|
116
|
+
" for step in range(50):\n",
|
|
117
|
+
" x = torch.randn(BATCH_SIZE, 3, 32, 32, device='cuda')\n",
|
|
118
|
+
" y = torch.randint(0, 10, (BATCH_SIZE,), device='cuda')\n",
|
|
119
|
+
" loss = criterion(model(x), y)\n",
|
|
120
|
+
" optimizer.zero_grad()\n",
|
|
121
|
+
" loss.backward()\n",
|
|
122
|
+
" optimizer.step()\n",
|
|
123
|
+
" epoch_loss += loss.item()\n",
|
|
124
|
+
" avg_loss = epoch_loss / 50\n",
|
|
125
|
+
" epoch_time = time.time() - t_start\n",
|
|
126
|
+
" results['losses'].append(avg_loss)\n",
|
|
127
|
+
" results['epoch_times'].append(epoch_time)\n",
|
|
128
|
+
" print(f\" Epoch {epoch+1}/{EPOCHS}: loss={avg_loss:.4f} ({epoch_time:.2f}s)\")\n",
|
|
129
|
+
"\n",
|
|
130
|
+
"results['final_loss'] = results['losses'][-1]\n",
|
|
131
|
+
"results['avg_epoch_time'] = sum(results['epoch_times']) / len(results['epoch_times'])\n",
|
|
132
|
+
"\n",
|
|
133
|
+
"with open(f'/home/dev/results_{EXP_NAME}.json', 'w') as f:\n",
|
|
134
|
+
" json.dump(results, f)\n",
|
|
135
|
+
"print(f\"Results saved to /home/dev/results_{EXP_NAME}.json\")\n",
|
|
136
|
+
"SCRIPT\n",
|
|
137
|
+
"\"\"\")\n",
|
|
138
|
+
"\n",
|
|
139
|
+
"# Verify\n",
|
|
140
|
+
"result = base.exec(\"ls -la /home/dev/train.py && python3 -c 'import wandb, timm; print(\\\"packages OK\\\")'\")\n",
|
|
141
|
+
"print(result.stdout.strip())"
|
|
142
|
+
]
|
|
143
|
+
},
|
|
144
|
+
{
|
|
145
|
+
"cell_type": "markdown",
|
|
146
|
+
"metadata": {},
|
|
147
|
+
"source": [
|
|
148
|
+
"## Step 2: Shut Down and Clone the Disk\n",
|
|
149
|
+
"\n",
|
|
150
|
+
"Cancel the base reservation (disk is snapshotted automatically),\n",
|
|
151
|
+
"then clone it for a parallel experiment."
|
|
152
|
+
]
|
|
153
|
+
},
|
|
154
|
+
{
|
|
155
|
+
"cell_type": "code",
|
|
156
|
+
"execution_count": null,
|
|
157
|
+
"metadata": {},
|
|
158
|
+
"outputs": [],
|
|
159
|
+
"source": [
|
|
160
|
+
"t0 = time.time()\n",
|
|
161
|
+
"base.cancel()\n",
|
|
162
|
+
"cancel_time = time.time() - t0\n",
|
|
163
|
+
"print(f\"Base cancelled in {cancel_time:.1f}s (disk snapshotted)\")"
|
|
164
|
+
]
|
|
165
|
+
},
|
|
166
|
+
{
|
|
167
|
+
"cell_type": "code",
|
|
168
|
+
"execution_count": null,
|
|
169
|
+
"metadata": {},
|
|
170
|
+
"outputs": [],
|
|
171
|
+
"source": [
|
|
172
|
+
"# Clone the disk for the second experiment\n",
|
|
173
|
+
"t0 = time.time()\n",
|
|
174
|
+
"client.clone_disk(\"experiment-base\", \"experiment-variant\")\n",
|
|
175
|
+
"clone_time = time.time() - t0\n",
|
|
176
|
+
"print(f\"Disk cloned in {clone_time:.1f}s\")\n",
|
|
177
|
+
"\n",
|
|
178
|
+
"# Show both disks\n",
|
|
179
|
+
"for disk in client.disks():\n",
|
|
180
|
+
" if 'experiment' in disk.name:\n",
|
|
181
|
+
" print(f\" {disk.name:25s} {disk.size_gb}GB {disk.snapshot_count} snapshots\")"
|
|
182
|
+
]
|
|
183
|
+
},
|
|
184
|
+
{
|
|
185
|
+
"cell_type": "markdown",
|
|
186
|
+
"metadata": {},
|
|
187
|
+
"source": [
|
|
188
|
+
"## Step 3: Run Parallel Experiments\n",
|
|
189
|
+
"\n",
|
|
190
|
+
"Launch two reservations simultaneously \u2014 one on the original disk (high LR),\n",
|
|
191
|
+
"one on the cloned disk (low LR). Both have the same training script pre-installed."
|
|
192
|
+
]
|
|
193
|
+
},
|
|
194
|
+
{
|
|
195
|
+
"cell_type": "code",
|
|
196
|
+
"execution_count": null,
|
|
197
|
+
"metadata": {},
|
|
198
|
+
"outputs": [],
|
|
199
|
+
"source": [
|
|
200
|
+
"experiments = [\n",
|
|
201
|
+
" {\"name\": \"high-lr\", \"disk\": \"experiment-base\", \"env\": \"LR=0.01 BATCH_SIZE=128 EPOCHS=5 EXP_NAME=high_lr\"},\n",
|
|
202
|
+
" {\"name\": \"low-lr\", \"disk\": \"experiment-variant\", \"env\": \"LR=0.0001 BATCH_SIZE=32 EPOCHS=5 EXP_NAME=low_lr\"},\n",
|
|
203
|
+
"]\n",
|
|
204
|
+
"\n",
|
|
205
|
+
"def run_experiment(exp):\n",
|
|
206
|
+
" \"\"\"Reserve GPU, run training, collect results, cancel.\"\"\"\n",
|
|
207
|
+
" timings = {}\n",
|
|
208
|
+
" \n",
|
|
209
|
+
" # Reserve\n",
|
|
210
|
+
" t0 = time.time()\n",
|
|
211
|
+
" sb = client.reserve(\n",
|
|
212
|
+
" gpu_type=\"t4\",\n",
|
|
213
|
+
" gpu_count=1,\n",
|
|
214
|
+
" hours=0.5,\n",
|
|
215
|
+
" disk_name=exp[\"disk\"],\n",
|
|
216
|
+
" name=exp[\"name\"],\n",
|
|
217
|
+
" )\n",
|
|
218
|
+
" timings['reserve'] = time.time() - t0\n",
|
|
219
|
+
" \n",
|
|
220
|
+
" # Run training\n",
|
|
221
|
+
" t0 = time.time()\n",
|
|
222
|
+
" result = sb.exec(f\"{exp['env']} python3 /home/dev/train.py\", timeout=120)\n",
|
|
223
|
+
" timings['train'] = time.time() - t0\n",
|
|
224
|
+
" train_output = result.stdout.strip()\n",
|
|
225
|
+
" \n",
|
|
226
|
+
" # Collect results\n",
|
|
227
|
+
" exp_name = exp['env'].split('EXP_NAME=')[1].split()[0]\n",
|
|
228
|
+
" result = sb.exec(f\"cat /home/dev/results_{exp_name}.json\")\n",
|
|
229
|
+
" import json\n",
|
|
230
|
+
" results = json.loads(result.stdout.strip())\n",
|
|
231
|
+
" \n",
|
|
232
|
+
" # Cancel\n",
|
|
233
|
+
" t0 = time.time()\n",
|
|
234
|
+
" sb.cancel()\n",
|
|
235
|
+
" timings['cancel'] = time.time() - t0\n",
|
|
236
|
+
" \n",
|
|
237
|
+
" return {\n",
|
|
238
|
+
" 'experiment': exp['name'],\n",
|
|
239
|
+
" 'timings': timings,\n",
|
|
240
|
+
" 'results': results,\n",
|
|
241
|
+
" 'train_output': train_output,\n",
|
|
242
|
+
" }\n",
|
|
243
|
+
"\n",
|
|
244
|
+
"# Run both experiments in parallel\n",
|
|
245
|
+
"t_total = time.time()\n",
|
|
246
|
+
"with ThreadPoolExecutor(max_workers=2) as pool:\n",
|
|
247
|
+
" futures = {pool.submit(run_experiment, exp): exp['name'] for exp in experiments}\n",
|
|
248
|
+
" outputs = {}\n",
|
|
249
|
+
" for future in as_completed(futures):\n",
|
|
250
|
+
" name = futures[future]\n",
|
|
251
|
+
" outputs[name] = future.result()\n",
|
|
252
|
+
" print(f\"\u2705 {name} completed\")\n",
|
|
253
|
+
"\n",
|
|
254
|
+
"total_time = time.time() - t_total\n",
|
|
255
|
+
"print(f\"\\nBoth experiments completed in {total_time:.1f}s (parallel)\")"
|
|
256
|
+
]
|
|
257
|
+
},
|
|
258
|
+
{
|
|
259
|
+
"cell_type": "markdown",
|
|
260
|
+
"metadata": {},
|
|
261
|
+
"source": [
|
|
262
|
+
"## Step 4: Compare Results"
|
|
263
|
+
]
|
|
264
|
+
},
|
|
265
|
+
{
|
|
266
|
+
"cell_type": "code",
|
|
267
|
+
"execution_count": null,
|
|
268
|
+
"metadata": {},
|
|
269
|
+
"outputs": [],
|
|
270
|
+
"source": [
|
|
271
|
+
"print(\"=\" * 60)\n",
|
|
272
|
+
"print(f\"{'Metric':<25s} {'High LR':>15s} {'Low LR':>15s}\")\n",
|
|
273
|
+
"print(\"=\" * 60)\n",
|
|
274
|
+
"\n",
|
|
275
|
+
"high = outputs['high-lr']['results']\n",
|
|
276
|
+
"low = outputs['low-lr']['results']\n",
|
|
277
|
+
"\n",
|
|
278
|
+
"print(f\"{'Learning Rate':<25s} {high['lr']:>15.4f} {low['lr']:>15.4f}\")\n",
|
|
279
|
+
"print(f\"{'Batch Size':<25s} {high['batch_size']:>15d} {low['batch_size']:>15d}\")\n",
|
|
280
|
+
"print(f\"{'Final Loss':<25s} {high['final_loss']:>15.4f} {low['final_loss']:>15.4f}\")\n",
|
|
281
|
+
"print(f\"{'Avg Epoch Time (s)':<25s} {high['avg_epoch_time']:>15.2f} {low['avg_epoch_time']:>15.2f}\")\n",
|
|
282
|
+
"print()\n",
|
|
283
|
+
"\n",
|
|
284
|
+
"# Loss progression\n",
|
|
285
|
+
"print(\"Loss progression:\")\n",
|
|
286
|
+
"for i in range(len(high['losses'])):\n",
|
|
287
|
+
" print(f\" Epoch {i+1}: high_lr={high['losses'][i]:.4f} low_lr={low['losses'][i]:.4f}\")"
|
|
288
|
+
]
|
|
289
|
+
},
|
|
290
|
+
{
|
|
291
|
+
"cell_type": "markdown",
|
|
292
|
+
"metadata": {},
|
|
293
|
+
"source": [
|
|
294
|
+
"## Step 5: Timing Breakdown"
|
|
295
|
+
]
|
|
296
|
+
},
|
|
297
|
+
{
|
|
298
|
+
"cell_type": "code",
|
|
299
|
+
"execution_count": null,
|
|
300
|
+
"metadata": {},
|
|
301
|
+
"outputs": [],
|
|
302
|
+
"source": [
|
|
303
|
+
"print(\"\\n\u23f1\ufe0f Timing Breakdown\")\n",
|
|
304
|
+
"print(\"=\" * 60)\n",
|
|
305
|
+
"print(f\"{'Phase':<25s} {'High LR':>15s} {'Low LR':>15s}\")\n",
|
|
306
|
+
"print(\"-\" * 60)\n",
|
|
307
|
+
"\n",
|
|
308
|
+
"for phase in ['reserve', 'train', 'cancel']:\n",
|
|
309
|
+
" h = outputs['high-lr']['timings'][phase]\n",
|
|
310
|
+
" l = outputs['low-lr']['timings'][phase]\n",
|
|
311
|
+
" print(f\"{phase.capitalize():<25s} {h:>14.1f}s {l:>14.1f}s\")\n",
|
|
312
|
+
"\n",
|
|
313
|
+
"print(\"-\" * 60)\n",
|
|
314
|
+
"h_total = sum(outputs['high-lr']['timings'].values())\n",
|
|
315
|
+
"l_total = sum(outputs['low-lr']['timings'].values())\n",
|
|
316
|
+
"print(f\"{'Total (sequential)':<25s} {h_total:>14.1f}s {l_total:>14.1f}s\")\n",
|
|
317
|
+
"print(f\"{'Total (parallel)':<25s} {total_time:>14.1f}s {'\u2014':>15s}\")\n",
|
|
318
|
+
"print(f\"{'Speedup':<25s} {(h_total + l_total) / total_time:>14.1f}x {'':>15s}\")\n",
|
|
319
|
+
"print()\n",
|
|
320
|
+
"print(f\"Disk clone time: {clone_time:.1f}s\")\n",
|
|
321
|
+
"print(f\"Base setup + cancel: {reserve_time + cancel_time:.1f}s\")"
|
|
322
|
+
]
|
|
323
|
+
},
|
|
324
|
+
{
|
|
325
|
+
"cell_type": "markdown",
|
|
326
|
+
"metadata": {},
|
|
327
|
+
"source": [
|
|
328
|
+
"## Cleanup\n",
|
|
329
|
+
"\n",
|
|
330
|
+
"Remove the experiment disks if you don't need them."
|
|
331
|
+
]
|
|
332
|
+
},
|
|
333
|
+
{
|
|
334
|
+
"cell_type": "code",
|
|
335
|
+
"execution_count": null,
|
|
336
|
+
"metadata": {},
|
|
337
|
+
"outputs": [],
|
|
338
|
+
"source": [
|
|
339
|
+
"# Uncomment to delete experiment disks:\n",
|
|
340
|
+
"# client.delete_disk(\"experiment-base\")\n",
|
|
341
|
+
"# client.delete_disk(\"experiment-variant\")\n",
|
|
342
|
+
"print(\"Done! Disks preserved for inspection.\")\n",
|
|
343
|
+
"print(\"Delete with: client.delete_disk('experiment-base')\")"
|
|
344
|
+
]
|
|
345
|
+
}
|
|
346
|
+
],
|
|
347
|
+
"metadata": {
|
|
348
|
+
"kernelspec": {
|
|
349
|
+
"display_name": "Python 3",
|
|
350
|
+
"language": "python",
|
|
351
|
+
"name": "python3"
|
|
352
|
+
},
|
|
353
|
+
"language_info": {
|
|
354
|
+
"name": "python",
|
|
355
|
+
"version": "3.12.0"
|
|
356
|
+
}
|
|
357
|
+
},
|
|
358
|
+
"nbformat": 4,
|
|
359
|
+
"nbformat_minor": 4
|
|
360
|
+
}
|
|
@@ -282,6 +282,39 @@ class AwsBackend:
|
|
|
282
282
|
for item in resp.get("Items", [])
|
|
283
283
|
]
|
|
284
284
|
|
|
285
|
+
def clone_disk(self, user_id: str, source_disk: str, target_disk: str) -> str:
|
|
286
|
+
import uuid
|
|
287
|
+
from datetime import datetime, timezone
|
|
288
|
+
operation_id = str(uuid.uuid4())
|
|
289
|
+
self._sqs.send_message(
|
|
290
|
+
QueueUrl=self._get_queue_url(),
|
|
291
|
+
MessageBody=json.dumps({
|
|
292
|
+
"action": "clone_disk",
|
|
293
|
+
"operation_id": operation_id,
|
|
294
|
+
"user_id": user_id,
|
|
295
|
+
"source_disk": source_disk,
|
|
296
|
+
"target_disk": target_disk,
|
|
297
|
+
"requested_at": datetime.now(timezone.utc).isoformat(),
|
|
298
|
+
}),
|
|
299
|
+
)
|
|
300
|
+
return operation_id
|
|
301
|
+
|
|
302
|
+
def delete_disk(self, user_id: str, disk_name: str) -> str:
|
|
303
|
+
import uuid
|
|
304
|
+
from datetime import datetime, timezone
|
|
305
|
+
operation_id = str(uuid.uuid4())
|
|
306
|
+
self._sqs.send_message(
|
|
307
|
+
QueueUrl=self._get_queue_url(),
|
|
308
|
+
MessageBody=json.dumps({
|
|
309
|
+
"action": "delete_disk",
|
|
310
|
+
"operation_id": operation_id,
|
|
311
|
+
"user_id": user_id,
|
|
312
|
+
"disk_name": disk_name,
|
|
313
|
+
"requested_at": datetime.now(timezone.utc).isoformat(),
|
|
314
|
+
}),
|
|
315
|
+
)
|
|
316
|
+
return operation_id
|
|
317
|
+
|
|
285
318
|
def add_user(self, reservation_id: str, user_id: str, github_username: str) -> bool:
|
|
286
319
|
message = {
|
|
287
320
|
"type": "add_user",
|
|
@@ -48,6 +48,14 @@ class Backend(Protocol):
|
|
|
48
48
|
"""List persistent disks for a user."""
|
|
49
49
|
...
|
|
50
50
|
|
|
51
|
+
def clone_disk(self, user_id: str, source_disk: str, target_disk: str) -> str:
|
|
52
|
+
"""Clone a disk. Returns operation_id."""
|
|
53
|
+
...
|
|
54
|
+
|
|
55
|
+
def delete_disk(self, user_id: str, disk_name: str) -> str:
|
|
56
|
+
"""Delete a disk. Returns operation_id."""
|
|
57
|
+
...
|
|
58
|
+
|
|
51
59
|
def add_user(self, reservation_id: str, user_id: str, github_username: str) -> bool:
|
|
52
60
|
"""Grant SSH access to another user."""
|
|
53
61
|
...
|
|
@@ -244,6 +244,42 @@ class GpuDev:
|
|
|
244
244
|
user_info = self._auth()
|
|
245
245
|
return self._backend.list_disks(user_info["user_id"])
|
|
246
246
|
|
|
247
|
+
def clone_disk(self, source: str, target: str, *, poll: bool = True, timeout: int = 120) -> str:
|
|
248
|
+
"""Clone a persistent disk.
|
|
249
|
+
|
|
250
|
+
Args:
|
|
251
|
+
source: Name of the source disk.
|
|
252
|
+
target: Name for the new cloned disk.
|
|
253
|
+
poll: Wait for the clone to complete (default True).
|
|
254
|
+
timeout: Max seconds to wait when polling.
|
|
255
|
+
|
|
256
|
+
Returns:
|
|
257
|
+
Operation ID.
|
|
258
|
+
"""
|
|
259
|
+
user_info = self._auth()
|
|
260
|
+
op_id = self._backend.clone_disk(user_info["user_id"], source, target)
|
|
261
|
+
if poll:
|
|
262
|
+
import time
|
|
263
|
+
deadline = time.time() + timeout
|
|
264
|
+
while time.time() < deadline:
|
|
265
|
+
disks = self._backend.list_disks(user_info["user_id"])
|
|
266
|
+
if any(d.name == target for d in disks):
|
|
267
|
+
return op_id
|
|
268
|
+
time.sleep(2)
|
|
269
|
+
return op_id
|
|
270
|
+
|
|
271
|
+
def delete_disk(self, name: str) -> str:
|
|
272
|
+
"""Delete a persistent disk.
|
|
273
|
+
|
|
274
|
+
Args:
|
|
275
|
+
name: Disk name to delete.
|
|
276
|
+
|
|
277
|
+
Returns:
|
|
278
|
+
Operation ID.
|
|
279
|
+
"""
|
|
280
|
+
user_info = self._auth()
|
|
281
|
+
return self._backend.delete_disk(user_info["user_id"], name)
|
|
282
|
+
|
|
247
283
|
def search_logs(
|
|
248
284
|
self,
|
|
249
285
|
reservation_id: str,
|
|
@@ -25,7 +25,7 @@ resource "aws_lambda_function" "availability_updater" {
|
|
|
25
25
|
function_name = "${var.prefix}-availability-updater"
|
|
26
26
|
role = aws_iam_role.availability_updater_role.arn
|
|
27
27
|
handler = "index.handler"
|
|
28
|
-
runtime = "python3.
|
|
28
|
+
runtime = "python3.13"
|
|
29
29
|
timeout = 300
|
|
30
30
|
# 1769 MB is the sweet spot — Lambda allocates one full vCPU at this threshold.
|
|
31
31
|
# Beyond 1769 MB you get fractional second vCPUs (less linear gain), and our work is single-threaded.
|
|
@@ -15,6 +15,7 @@ resource "aws_lambda_function" "reservation_expiry" {
|
|
|
15
15
|
environment {
|
|
16
16
|
variables = {
|
|
17
17
|
RESERVATIONS_TABLE = aws_dynamodb_table.gpu_reservations.name
|
|
18
|
+
DISKS_TABLE_NAME = aws_dynamodb_table.disks.name
|
|
18
19
|
EKS_CLUSTER_NAME = aws_eks_cluster.gpu_dev_cluster.name
|
|
19
20
|
REGION = local.current_config.aws_region
|
|
20
21
|
WARNING_MINUTES = "30" # Warn 30 minutes before expiry
|
{gpu_dev-0.6.5 → gpu_dev-0.6.6}/terraform-gpu-devservers/lambda/availability_updater/index.py
RENAMED
|
@@ -330,6 +330,7 @@ def update_gpu_availability(gpu_type: str, k8s_client=None, active_reservations=
|
|
|
330
330
|
|
|
331
331
|
single_node_max = 0 # Max available on any single node
|
|
332
332
|
schedulable_total_gpus = 0 # Total GPUs on schedulable (non-cordoned) nodes
|
|
333
|
+
full_node_gpu_counts = [] # Track actual GPU count per full node (accounts for MIG)
|
|
333
334
|
for node in nodes.items:
|
|
334
335
|
if is_node_ready_and_schedulable(node):
|
|
335
336
|
available_on_node = get_available_gpus_on_node(v1, node, gpu_type)
|
|
@@ -349,24 +350,24 @@ def update_gpu_availability(gpu_type: str, k8s_client=None, active_reservations=
|
|
|
349
350
|
# Count as full node if all GPUs are available
|
|
350
351
|
if total_on_node > 0 and available_on_node == total_on_node:
|
|
351
352
|
full_nodes_available += 1
|
|
353
|
+
full_node_gpu_counts.append(total_on_node)
|
|
352
354
|
|
|
353
355
|
total_gpus = schedulable_total_gpus
|
|
354
356
|
# For MIG SKUs override running_instances to the number of MIG-partitioned nodes
|
|
355
357
|
if is_mig_sku:
|
|
356
358
|
running_instances = sum(1 for n in nodes.items if is_node_ready_and_schedulable(n) and int((n.status.allocatable or {}).get(resource_name, "0")) > 0)
|
|
357
359
|
|
|
358
|
-
# Calculate max reservable
|
|
359
|
-
#
|
|
360
|
+
# Calculate max reservable using actual per-node GPU counts (not ASG gpus_per_instance)
|
|
361
|
+
# This correctly accounts for MIG-configured nodes that have fewer full GPUs
|
|
360
362
|
multinode_gpu_types = ['h100', 'h200', 'b200', 'a100']
|
|
361
|
-
if gpu_type in multinode_gpu_types and
|
|
362
|
-
|
|
363
|
-
|
|
363
|
+
if gpu_type in multinode_gpu_types and full_node_gpu_counts:
|
|
364
|
+
# Sum the top N full nodes (up to 4 for multinode)
|
|
365
|
+
sorted_counts = sorted(full_node_gpu_counts, reverse=True)
|
|
366
|
+
max_reservable = sum(sorted_counts[:4])
|
|
364
367
|
|
|
365
|
-
# If no full nodes available, fall back to single node max
|
|
366
368
|
if max_reservable == 0:
|
|
367
369
|
max_reservable = single_node_max
|
|
368
370
|
else:
|
|
369
|
-
# For all other GPU types (T4, L4, T4-small, etc.), only single node
|
|
370
371
|
max_reservable = single_node_max
|
|
371
372
|
|
|
372
373
|
logger.info(f"Found {full_nodes_available} full nodes available for {gpu_type}, max reservable: {max_reservable} (single node max: {single_node_max})")
|
{gpu_dev-0.6.5 → gpu_dev-0.6.6}/terraform-gpu-devservers/lambda/reservation_processor/index.py
RENAMED
|
@@ -2895,28 +2895,42 @@ def allocate_gpu_resources(reservation_id: str, request: dict[str, Any], trace_d
|
|
|
2895
2895
|
return keys
|
|
2896
2896
|
|
|
2897
2897
|
def _check_nvme_cache_on_node(_target_node, _user_id):
|
|
2898
|
-
"""Check if target node
|
|
2899
|
-
if not _target_node:
|
|
2900
|
-
return False
|
|
2898
|
+
"""Check if target node (or any node of this GPU type) has NVMe cache."""
|
|
2901
2899
|
try:
|
|
2902
2900
|
v1 = client.CoreV1Api(k8s_client)
|
|
2903
2901
|
cache_dir = f"/mnt/nvme/user-cache/{_nvme_cache_user_dir(_user_id)}"
|
|
2902
|
+
|
|
2903
|
+
if _target_node:
|
|
2904
|
+
field_sel = f"spec.nodeName={_target_node},status.phase=Running"
|
|
2905
|
+
else:
|
|
2906
|
+
field_sel = "status.phase=Running"
|
|
2907
|
+
|
|
2904
2908
|
pods = v1.list_namespaced_pod(
|
|
2905
2909
|
"kube-system",
|
|
2906
|
-
field_selector=
|
|
2910
|
+
field_selector=field_sel,
|
|
2907
2911
|
label_selector="app=image-prepuller",
|
|
2908
2912
|
).items
|
|
2909
2913
|
if not pods:
|
|
2914
|
+
logger.info(f"NVMe cache check: no prepuller pods found")
|
|
2910
2915
|
return False
|
|
2911
|
-
|
|
2912
|
-
|
|
2913
|
-
|
|
2914
|
-
|
|
2915
|
-
|
|
2916
|
-
|
|
2917
|
-
|
|
2918
|
-
|
|
2919
|
-
|
|
2916
|
+
|
|
2917
|
+
for pod in pods[:3]:
|
|
2918
|
+
try:
|
|
2919
|
+
stream.stream(
|
|
2920
|
+
v1.connect_get_namespaced_pod_exec,
|
|
2921
|
+
pod.metadata.name, "kube-system",
|
|
2922
|
+
container="pause",
|
|
2923
|
+
command=["test", "-d", cache_dir],
|
|
2924
|
+
stderr=True, stdout=True, stdin=False, tty=False,
|
|
2925
|
+
)
|
|
2926
|
+
logger.info(f"NVMe cache HIT on {pod.spec.node_name}")
|
|
2927
|
+
return True
|
|
2928
|
+
except Exception:
|
|
2929
|
+
continue
|
|
2930
|
+
logger.info(f"NVMe cache MISS for {_user_id}")
|
|
2931
|
+
return False
|
|
2932
|
+
except Exception as e:
|
|
2933
|
+
logger.warning(f"NVMe cache check error: {e}")
|
|
2920
2934
|
return False
|
|
2921
2935
|
|
|
2922
2936
|
def _setup_disk():
|
|
@@ -144,8 +144,7 @@ def get_existing_dns_names() -> List[str]:
|
|
|
144
144
|
return []
|
|
145
145
|
|
|
146
146
|
try:
|
|
147
|
-
|
|
148
|
-
table = dynamodb.Table(table_name)
|
|
147
|
+
table = _get_domain_mappings_table(table_name)
|
|
149
148
|
|
|
150
149
|
# Scan for all domain mappings
|
|
151
150
|
response = table.scan()
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.6.5 → gpu_dev-0.6.6}/terraform-gpu-devservers/lambda/migration/tag_largest_snapshots.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.6.5 → gpu_dev-0.6.6}/terraform-gpu-devservers/lambda/shared/k8s_resource_tracker.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.6.5 → gpu_dev-0.6.6}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.6.5 → gpu_dev-0.6.6}/terraform-gpu-devservers/migrations/migrate_disks_to_named.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.6.5 → gpu_dev-0.6.6}/terraform-gpu-devservers/templates/user-data-self-managed.sh
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|