gpu-dev 0.5.32__tar.gz → 0.6.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/PKG-INFO +1 -1
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/PKG-INFO +1 -1
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/cli-tools/gpu-dev-cli/gpu_dev_cli/cli.py +41 -14
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/cli-tools/gpu-dev-cli/gpu_dev_cli/config.py +6 -5
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/cli-tools/gpu-dev-cli/gpu_dev_cli/disks.py +15 -2
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/cli-tools/gpu-dev-cli/gpu_dev_cli/reservations.py +5 -3
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/pyproject.toml +1 -1
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/lambda/reservation_processor/index.py +180 -151
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/lambda.tf +28 -4
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/main.tf +6 -4
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/templates/al2023-user-data.sh +87 -4
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/templates/ami-baker-user-data.sh +3 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/.github/workflows/no-gitlinks.yml +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/.github/workflows/publish.yml +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/.gitignore +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/CLAUDE.md +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/README.md +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/admin/README.md +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/admin/generate_stats.py +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/admin/requirements.txt +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/cli-tools/gpu-dev-cli/README.md +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/cli-tools/gpu-dev-cli/ZERO_CONFIG_SETUP.md +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/SOURCES.txt +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/dependency_links.txt +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/entry_points.txt +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/requires.txt +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/top_level.txt +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/cli-tools/gpu-dev-cli/gpu_dev_cli/__init__.py +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/cli-tools/gpu-dev-cli/gpu_dev_cli/auth.py +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/cli-tools/gpu-dev-cli/gpu_dev_cli/interactive.py +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/cli-tools/gpu-dev-cli/gpu_dev_cli/name_generator.py +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/cli-tools/gpu-dev-cli/gpu_dev_cli/ssh_proxy.py +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/cli-tools/gpu-dev-cli/minimal-iam-policy.json +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/cli-tools/scripts/clear_stale_disk_locks.py +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/docs/USER_GUIDE.md +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/docs/devgpu-features.html +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/docs/docker-mark-blue.svg +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/docs/icons8-cursor-ai.svg +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/setup.cfg +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/.claude/skills/deploy.md +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/.terraform.lock.hcl +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/README.md +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/alb.tf +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/ami-baker.tf +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/availability.tf +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/backend.tf +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/check_b200.py +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/cluster-autoscaler.tf +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/cmd_proxy.py +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/docker/.dockerignore +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/docker/Dockerfile +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/docker/backup-dotfiles +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/docker/bash_profile +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/docker/bashrc +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/docker/bashrc_ext +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/docker/build-with-efa.sh +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/docker/dotfiles-shutdown-handler +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/docker/list-dotfile-versions +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/docker/motd_script +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/docker/nproc_wrapper +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/docker/profile +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/docker/restore-dotfiles +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/docker/restore-dotfiles-version +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/docker/setup-dotfiles-persistence +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/docker/shell_env +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/docker/ssh_config +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/docker/zprofile +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/docker/zshrc +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/docker/zshrc_ext +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/docker-build.tf +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/docker-example/Dockerfile +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/docker-example/hello.txt +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/ecr.tf +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/efs.tf +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/eks.tf +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/expiry.tf +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/git-cache.tf +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/gpu-dev-pod-irsa.tf +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/kubernetes.tf +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/lambda/availability_updater/index.py +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/lambda/availability_updater/requirements.txt +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/lambda/migration/tag_largest_snapshots.py +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/lambda/reservation_expiry/index.py +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/lambda/reservation_expiry/requirements.txt +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/lambda/reservation_processor/buildkit_job.py +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/lambda/reservation_processor/requirements.txt +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/lambda/shared/__init__.py +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/lambda/shared/alb_utils.py +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/lambda/shared/dns_utils.py +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/lambda/shared/k8s_client.py +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/lambda/shared/k8s_resource_tracker.py +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/lambda/shared/requirements.txt +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/lambda/shared/snapshot_utils.py +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/list_b200.py +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/mig-config.tf +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/mig-parted-config.yaml +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py.bak +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/migrations/check_snapshots.py +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/migrations/migrate_disks_to_named.py +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/migrations/run_backfill.sh +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/monitoring.tf +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/node-termination-handler.tf +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/outputs.tf +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/pyproject.toml +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/queue.tf +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/route53.tf +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/s3-disk-contents.tf +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/scripts/CLEANUP_GUIDE.md +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/scripts/detect_empty_volumes.sh +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/scripts/ec2_avail_probe.sh +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/scripts/inspect_user_data.sh +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/ssh-proxy/Dockerfile +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/ssh-proxy/proxy.py +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/ssh-proxy/requirements.txt +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/ssh-proxy-service.tf +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/ssh-proxy.tf +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/subnet-0fe3a2c45570091ad +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/switch-to.sh +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/templates/al2023-cpu-user-data.sh +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/templates/user-data-self-managed.sh +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/templates/user-data.sh +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/variables.tf +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/tests/submit/README.md +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/tests/submit/fail/run.sh +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/tests/submit/multinode/run.sh +0 -0
- {gpu_dev-0.5.32 → gpu_dev-0.6.0}/tests/submit/success/run.sh +0 -0
|
@@ -897,6 +897,13 @@ def reserve(
|
|
|
897
897
|
|
|
898
898
|
else:
|
|
899
899
|
# Non-interactive mode - use defaults and validate
|
|
900
|
+
# Route --spot to east1 when on prod (env vars override config region)
|
|
901
|
+
if spot and load_config().user_config.get("environment") == "prod":
|
|
902
|
+
east1_cfg = Config.ENVIRONMENTS.get("prod-east1", {})
|
|
903
|
+
if east1_cfg:
|
|
904
|
+
import os as _os
|
|
905
|
+
_os.environ["AWS_REGION"] = east1_cfg["region"]
|
|
906
|
+
|
|
900
907
|
if gpu_type is None:
|
|
901
908
|
gpu_type = "a100"
|
|
902
909
|
if hours is None:
|
|
@@ -2568,10 +2575,21 @@ def cancel(
|
|
|
2568
2575
|
with Live(
|
|
2569
2576
|
Spinner("dots", text="📡 Cancelling reservations..."), console=console
|
|
2570
2577
|
) as live:
|
|
2578
|
+
# Build east1 reservation manager for cross-region cancellations
|
|
2579
|
+
east1_mgr = None
|
|
2580
|
+
east1_env = Config.ENVIRONMENTS.get("prod-east1", {})
|
|
2581
|
+
if east1_env:
|
|
2582
|
+
import os as _os
|
|
2583
|
+
_east1_config = Config()
|
|
2584
|
+
_east1_config.aws_region = east1_env["region"]
|
|
2585
|
+
east1_mgr = ReservationManager(_east1_config)
|
|
2586
|
+
|
|
2571
2587
|
for reservation in reservations:
|
|
2572
2588
|
res_id = reservation.get("reservation_id", "")
|
|
2573
2589
|
if res_id:
|
|
2574
|
-
|
|
2590
|
+
# Use east1 manager for east1 reservations
|
|
2591
|
+
mgr = east1_mgr if reservation.get("_region") in ("east1", "us-east-1") and east1_mgr else reservation_mgr
|
|
2592
|
+
success = mgr.cancel_reservation(
|
|
2575
2593
|
res_id, user_info["user_id"]
|
|
2576
2594
|
)
|
|
2577
2595
|
if success:
|
|
@@ -3301,21 +3319,30 @@ def connect(ctx: click.Context, reservation_id: Optional[str]) -> None:
|
|
|
3301
3319
|
|
|
3302
3320
|
live.start()
|
|
3303
3321
|
|
|
3304
|
-
#
|
|
3305
|
-
_sel = next((r for r in (locals().get("reservations") or []) if r.get("reservation_id", "").startswith(reservation_id)), None)
|
|
3306
|
-
if _sel and _sel.get("_region") == "us-east-1":
|
|
3307
|
-
import os as _os
|
|
3308
|
-
east1_cfg = Config.ENVIRONMENTS.get("prod-east1", {})
|
|
3309
|
-
_os.environ["AWS_DEFAULT_REGION"] = east1_cfg["region"]
|
|
3310
|
-
_east1_config = Config()
|
|
3311
|
-
_east1_config.aws_region = east1_cfg["region"]
|
|
3312
|
-
reservation_mgr = ReservationManager(_east1_config)
|
|
3313
|
-
|
|
3314
|
-
# Get connection info
|
|
3322
|
+
# Try current region first, then cross-region if not found
|
|
3315
3323
|
connection_info = reservation_mgr.get_connection_info(
|
|
3316
3324
|
reservation_id, user_info["user_id"]
|
|
3317
3325
|
)
|
|
3318
3326
|
|
|
3327
|
+
# If not found, try the other region
|
|
3328
|
+
if not connection_info:
|
|
3329
|
+
import os as _os
|
|
3330
|
+
current_env = config.user_config.get("environment", "prod")
|
|
3331
|
+
other_envs = {"prod": "prod-east1", "prod-east1": "prod"}
|
|
3332
|
+
other_env_name = other_envs.get(current_env)
|
|
3333
|
+
if other_env_name:
|
|
3334
|
+
other_env = Config.ENVIRONMENTS.get(other_env_name, {})
|
|
3335
|
+
if other_env:
|
|
3336
|
+
_os.environ["AWS_DEFAULT_REGION"] = other_env["region"]
|
|
3337
|
+
_other_config = Config()
|
|
3338
|
+
_other_config.aws_region = other_env["region"]
|
|
3339
|
+
other_mgr = ReservationManager(_other_config)
|
|
3340
|
+
connection_info = other_mgr.get_connection_info(
|
|
3341
|
+
reservation_id, user_info["user_id"]
|
|
3342
|
+
)
|
|
3343
|
+
if connection_info:
|
|
3344
|
+
reservation_mgr = other_mgr
|
|
3345
|
+
|
|
3319
3346
|
live.stop()
|
|
3320
3347
|
|
|
3321
3348
|
if not connection_info:
|
|
@@ -3864,7 +3891,7 @@ def set(key: str, value: str) -> None:
|
|
|
3864
3891
|
|
|
3865
3892
|
|
|
3866
3893
|
@config.command()
|
|
3867
|
-
@click.argument("env_name", type=click.Choice(["test", "prod"
|
|
3894
|
+
@click.argument("env_name", type=click.Choice(["test", "prod"]))
|
|
3868
3895
|
def environment(env_name: str) -> None:
|
|
3869
3896
|
"""Set the environment
|
|
3870
3897
|
|
|
@@ -3876,7 +3903,7 @@ def environment(env_name: str) -> None:
|
|
|
3876
3903
|
\b
|
|
3877
3904
|
Examples:
|
|
3878
3905
|
gpu-dev config environment prod # Production (us-east-2)
|
|
3879
|
-
gpu-dev config environment prod
|
|
3906
|
+
gpu-dev config environment prod # Production (spot accessible via interactive picker)
|
|
3880
3907
|
gpu-dev config environment test # Test (us-west-1)
|
|
3881
3908
|
|
|
3882
3909
|
Environment configurations:
|
|
@@ -42,13 +42,14 @@ class Config:
|
|
|
42
42
|
# Load unified config (handles migration from legacy files)
|
|
43
43
|
self.user_config = self._load_config()
|
|
44
44
|
|
|
45
|
-
# Get region
|
|
46
|
-
|
|
45
|
+
# Get region: env vars take priority (for spot routing), then config, then default
|
|
46
|
+
env_region = os.getenv("AWS_REGION") or os.getenv("AWS_DEFAULT_REGION")
|
|
47
|
+
if env_region and env_region != self.user_config.get("region"):
|
|
48
|
+
self.aws_region = env_region
|
|
49
|
+
elif self.user_config.get("region"):
|
|
47
50
|
self.aws_region = self.user_config["region"]
|
|
48
51
|
else:
|
|
49
|
-
self.aws_region = os.getenv(
|
|
50
|
-
"AWS_REGION", os.getenv("AWS_DEFAULT_REGION", "us-east-2")
|
|
51
|
-
)
|
|
52
|
+
self.aws_region = "us-east-2"
|
|
52
53
|
|
|
53
54
|
os.environ["AWS_DEFAULT_REGION"] = self.aws_region
|
|
54
55
|
|
|
@@ -355,8 +355,21 @@ def unlock_disk(disk_name: str, user_id: str, config: Config) -> bool:
|
|
|
355
355
|
return False
|
|
356
356
|
|
|
357
357
|
if not disk['in_use']:
|
|
358
|
-
|
|
359
|
-
|
|
358
|
+
# DDB says not locked — but check if EBS volume is still physically attached
|
|
359
|
+
try:
|
|
360
|
+
ec2 = config.session.client('ec2', region_name=config.aws_region)
|
|
361
|
+
vols = ec2.describe_volumes(Filters=[
|
|
362
|
+
{"Name": "tag:gpu-dev-user", "Values": [user_id]},
|
|
363
|
+
{"Name": "tag:disk_name", "Values": [disk_name]},
|
|
364
|
+
{"Name": "status", "Values": ["in-use"]},
|
|
365
|
+
]).get("Volumes", [])
|
|
366
|
+
if not vols:
|
|
367
|
+
print(f"Disk '{disk_name}' is not locked")
|
|
368
|
+
return False
|
|
369
|
+
print(f"Disk '{disk_name}' DDB lock is clear but EBS volume is still attached — sending force-detach request")
|
|
370
|
+
except Exception:
|
|
371
|
+
print(f"Disk '{disk_name}' is not locked")
|
|
372
|
+
return False
|
|
360
373
|
|
|
361
374
|
operation_id = str(uuid.uuid4())
|
|
362
375
|
|
|
@@ -1701,6 +1701,7 @@ class ReservationManager:
|
|
|
1701
1701
|
initial_text = f"📡 Starting multinode reservation..." if is_multinode else "🔄 Sending reservation request..."
|
|
1702
1702
|
spinner = Spinner("dots", text=initial_text)
|
|
1703
1703
|
live.update(spinner)
|
|
1704
|
+
poll_delay = 0.5 # start fast, back off over time
|
|
1704
1705
|
|
|
1705
1706
|
while (
|
|
1706
1707
|
(timeout_seconds is None or time.time() -
|
|
@@ -1761,7 +1762,7 @@ class ReservationManager:
|
|
|
1761
1762
|
if not is_multinode:
|
|
1762
1763
|
spinner.text = "📡 Waiting for reservation status update..."
|
|
1763
1764
|
live.update(spinner)
|
|
1764
|
-
time.sleep(
|
|
1765
|
+
time.sleep(0.5)
|
|
1765
1766
|
continue
|
|
1766
1767
|
else:
|
|
1767
1768
|
node_details.append({
|
|
@@ -2293,8 +2294,9 @@ class ReservationManager:
|
|
|
2293
2294
|
|
|
2294
2295
|
return None
|
|
2295
2296
|
|
|
2296
|
-
#
|
|
2297
|
-
time.sleep(
|
|
2297
|
+
# Poll with backoff: 0.5s → 1s → 1.5s → 2s → 3s (cap)
|
|
2298
|
+
time.sleep(poll_delay)
|
|
2299
|
+
poll_delay = min(poll_delay + 0.5, 3.0)
|
|
2298
2300
|
|
|
2299
2301
|
except Exception as e:
|
|
2300
2302
|
console.print(
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "gpu-dev"
|
|
7
|
-
version = "0.5.32"
|
|
7
|
+
version = "0.6.0"
|
|
8
8
|
description = "CLI tool for PyTorch GPU developer server reservations"
|
|
9
9
|
authors = [{name = "PyTorch Team"}]
|
|
10
10
|
readme = "cli-tools/gpu-dev-cli/README.md"
|
{gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/lambda/reservation_processor/index.py
RENAMED
|
@@ -195,7 +195,7 @@ GPU_CONFIG = {
|
|
|
195
195
|
"b300": {"instance_type": "p6-b300.48xlarge", "max_gpus": 8, "cpus": 192, "memory_gb": 2048, "efa_count": 8},
|
|
196
196
|
"cpu-arm": {"instance_type": "c7g.8xlarge", "max_gpus": 0, "cpus": 32, "memory_gb": 64, "efa_count": 0},
|
|
197
197
|
"cpu-x86": {"instance_type": "c7i.8xlarge", "max_gpus": 0, "cpus": 32, "memory_gb": 64, "efa_count": 0},
|
|
198
|
-
"cpu-spot": {"instance_type": "
|
|
198
|
+
"cpu-spot": {"instance_type": "c6id.2xlarge", "max_gpus": 0, "cpus": 8, "memory_gb": 16, "efa_count": 0},
|
|
199
199
|
}
|
|
200
200
|
GPU_CONFIG_DEFAULT = {"instance_type": "g4dn.12xlarge", "max_gpus": 4, "cpus": 48, "memory_gb": 192, "efa_count": 0}
|
|
201
201
|
|
|
@@ -2843,16 +2843,7 @@ def allocate_gpu_resources(reservation_id: str, request: dict[str, Any], trace_d
|
|
|
2843
2843
|
elif dockerimage:
|
|
2844
2844
|
logger.info(f"Custom Docker image specified: {dockerimage}")
|
|
2845
2845
|
|
|
2846
|
-
|
|
2847
|
-
github_public_key = get_github_public_key(github_user, validate=True)
|
|
2848
|
-
record_trace_event(trace_data, "github_keys_fetch_end")
|
|
2849
|
-
if not github_public_key:
|
|
2850
|
-
raise ValueError(
|
|
2851
|
-
f"Could not fetch GitHub public key for GitHub user '{github_user}'"
|
|
2852
|
-
)
|
|
2853
|
-
|
|
2854
|
-
# Check if user should get persistent disk
|
|
2855
|
-
# Check if user explicitly requested no persistent disk (e.g., confirmed continuing without disk when another reservation has it)
|
|
2846
|
+
# ── Determine disk eligibility (quick, no I/O) ──
|
|
2856
2847
|
no_persistent_disk_requested = request.get("no_persistent_disk", False)
|
|
2857
2848
|
|
|
2858
2849
|
if no_persistent_disk_requested:
|
|
@@ -2895,119 +2886,93 @@ def allocate_gpu_resources(reservation_id: str, request: dict[str, Any], trace_d
|
|
|
2895
2886
|
logger.error(f"Failed to reserve persistent disk slot: {e}")
|
|
2896
2887
|
use_persistent_disk = False
|
|
2897
2888
|
|
|
2898
|
-
|
|
2889
|
+
# ── Run SSH key fetch, disk setup, and EFS setup in parallel ──
|
|
2890
|
+
# These are independent I/O operations that together take ~8s sequentially
|
|
2891
|
+
def _fetch_ssh_keys():
|
|
2892
|
+
record_trace_event(trace_data, "github_keys_fetch_start")
|
|
2893
|
+
keys = get_github_public_key(github_user, validate=True)
|
|
2894
|
+
record_trace_event(trace_data, "github_keys_fetch_end")
|
|
2895
|
+
return keys
|
|
2896
|
+
|
|
2897
|
+
def _setup_disk():
|
|
2898
|
+
if not use_persistent_disk:
|
|
2899
|
+
return None, True, None, None, None
|
|
2900
|
+
update_reservation_status(
|
|
2901
|
+
reservation_id, "preparing",
|
|
2902
|
+
detailed_status="Setting up persistent disk" + (f" '{disk_name}'" if disk_name else ""))
|
|
2903
|
+
_target_az, _target_node = get_target_az_for_reservation(gpu_type, gpu_count)
|
|
2904
|
+
if not _target_az:
|
|
2905
|
+
raise ValueError(f"No {gpu_type} nodes found in cluster")
|
|
2906
|
+
logger.info(f"Target AZ: {_target_az}, disk_name={disk_name or 'default'}")
|
|
2907
|
+
record_trace_event(trace_data, "disk_create_start")
|
|
2908
|
+
vol_id, new_disk, warning = create_disk_from_snapshot_or_empty(
|
|
2909
|
+
user_id=user_id, availability_zone=_target_az,
|
|
2910
|
+
disk_name=disk_name, reservation_id=reservation_id)
|
|
2911
|
+
record_trace_event(trace_data, "disk_create_end")
|
|
2912
|
+
return vol_id, new_disk, warning, _target_az, _target_node
|
|
2913
|
+
|
|
2914
|
+
def _setup_efs():
|
|
2915
|
+
if not (EFS_SECURITY_GROUP_ID and EFS_SUBNET_IDS):
|
|
2916
|
+
return None
|
|
2917
|
+
update_reservation_status(
|
|
2918
|
+
reservation_id, "preparing",
|
|
2919
|
+
"Setting up shared storage (/shared) for user collaboration")
|
|
2920
|
+
record_trace_event(trace_data, "efs_setup_start")
|
|
2921
|
+
efs_id = create_or_find_user_efs(user_id)
|
|
2922
|
+
record_trace_event(trace_data, "efs_setup_end")
|
|
2923
|
+
return efs_id
|
|
2924
|
+
|
|
2925
|
+
with ThreadPoolExecutor(max_workers=3) as executor:
|
|
2926
|
+
ssh_future = executor.submit(_fetch_ssh_keys)
|
|
2927
|
+
disk_future = executor.submit(_setup_disk)
|
|
2928
|
+
efs_future = executor.submit(_setup_efs)
|
|
2929
|
+
|
|
2930
|
+
github_public_key = ssh_future.result()
|
|
2899
2931
|
try:
|
|
2900
|
-
|
|
2901
|
-
# Always recreate volume from latest snapshot or create empty
|
|
2902
|
-
update_reservation_status(
|
|
2903
|
-
reservation_id,
|
|
2904
|
-
"preparing",
|
|
2905
|
-
detailed_status="Setting up persistent disk" + (f" '{disk_name}'" if disk_name else "")
|
|
2906
|
-
)
|
|
2907
|
-
|
|
2908
|
-
# Determine target AZ + node for this reservation (binpacking)
|
|
2909
|
-
target_az, target_node = get_target_az_for_reservation(gpu_type, gpu_count)
|
|
2910
|
-
if not target_az:
|
|
2911
|
-
raise ValueError(f"No {gpu_type} nodes found in cluster")
|
|
2912
|
-
|
|
2913
|
-
logger.info(f"Target AZ for reservation: {target_az}")
|
|
2914
|
-
logger.info(f"Creating persistent disk for user {user_id}, disk_name={disk_name or 'default'}")
|
|
2915
|
-
|
|
2916
|
-
# Use new snapshot-first function
|
|
2917
|
-
record_trace_event(trace_data, "disk_create_start")
|
|
2918
|
-
persistent_volume_id, is_new_disk, disk_warning = create_disk_from_snapshot_or_empty(
|
|
2919
|
-
user_id=user_id,
|
|
2920
|
-
availability_zone=target_az,
|
|
2921
|
-
disk_name=disk_name,
|
|
2922
|
-
reservation_id=reservation_id
|
|
2923
|
-
)
|
|
2924
|
-
record_trace_event(trace_data, "disk_create_end")
|
|
2925
|
-
|
|
2926
|
-
logger.info(f"Persistent disk ready: {persistent_volume_id} (is_new={is_new_disk})")
|
|
2927
|
-
|
|
2928
|
-
# Mark disk as in_use in disks table (prevents CLI from showing as available)
|
|
2929
|
-
# Use "default" as fallback when no explicit disk_name provided
|
|
2930
|
-
effective_disk_name = disk_name or "default"
|
|
2931
|
-
try:
|
|
2932
|
-
mark_disk_in_use(user_id, effective_disk_name, True, reservation_id)
|
|
2933
|
-
logger.info(f"Marked disk '{effective_disk_name}' as in_use for reservation {reservation_id[:8]}")
|
|
2934
|
-
except Exception as mark_error:
|
|
2935
|
-
logger.warning(f"Failed to mark disk as in_use: {mark_error}")
|
|
2936
|
-
|
|
2937
|
-
# Store disk_name in DynamoDB for tracking (ALWAYS store, using "default" as fallback)
|
|
2938
|
-
# This is required for expiry cleanup to know which disk to mark as not in use
|
|
2939
|
-
update_reservation_fields(reservation_id, disk_name=effective_disk_name)
|
|
2940
|
-
|
|
2941
|
-
# Store warning if any
|
|
2942
|
-
if disk_warning:
|
|
2943
|
-
update_reservation_fields(reservation_id, warning=disk_warning)
|
|
2944
|
-
logger.warning(f"Stored warning for reservation {reservation_id}: {disk_warning}")
|
|
2932
|
+
disk_result = disk_future.result()
|
|
2945
2933
|
except Exception as disk_error:
|
|
2946
2934
|
logger.error(f"Failed to set up persistent disk: {disk_error}")
|
|
2947
|
-
|
|
2948
2935
|
error_msg = str(disk_error)
|
|
2949
|
-
|
|
2950
|
-
# If user explicitly requested a named disk, NEVER silently fall back to temporary.
|
|
2951
|
-
# Any disk error (in use, timeout, creation failure) should fail the reservation
|
|
2952
|
-
# so the user knows what happened instead of getting surprise temporary storage.
|
|
2953
2936
|
if disk_name:
|
|
2954
|
-
logger.error(f"Named disk '{disk_name}'
|
|
2955
|
-
update_reservation_status(
|
|
2956
|
-
|
|
2957
|
-
"failed",
|
|
2958
|
-
failure_reason=f"Persistent disk '{disk_name}' setup failed: {error_msg}"
|
|
2959
|
-
)
|
|
2937
|
+
logger.error(f"Named disk '{disk_name}' setup failed - failing reservation")
|
|
2938
|
+
update_reservation_status(reservation_id, "failed",
|
|
2939
|
+
failure_reason=f"Persistent disk '{disk_name}' setup failed: {error_msg}")
|
|
2960
2940
|
raise RuntimeError(f"Cannot create reservation: disk '{disk_name}' setup failed: {error_msg}")
|
|
2961
|
-
|
|
2962
|
-
# Check if this is a "disk in use" error - these should fail the reservation
|
|
2963
2941
|
if "in use" in error_msg.lower():
|
|
2964
|
-
|
|
2965
|
-
update_reservation_status(
|
|
2966
|
-
reservation_id,
|
|
2967
|
-
"failed",
|
|
2968
|
-
failure_reason=error_msg
|
|
2969
|
-
)
|
|
2942
|
+
update_reservation_status(reservation_id, "failed", failure_reason=error_msg)
|
|
2970
2943
|
raise RuntimeError(f"Cannot create reservation: {error_msg}")
|
|
2971
|
-
|
|
2972
|
-
# For other errors without explicit disk_name, continue without persistent disk (backwards compatibility)
|
|
2973
|
-
logger.warning(f"Falling back to non-persistent storage due to disk error: {disk_error}")
|
|
2944
|
+
logger.warning(f"Falling back to non-persistent storage: {disk_error}")
|
|
2974
2945
|
use_persistent_disk = False
|
|
2975
|
-
persistent_volume_id = None
|
|
2976
|
-
is_new_disk = True
|
|
2977
|
-
|
|
2978
|
-
|
|
2979
|
-
"
|
|
2980
|
-
|
|
2981
|
-
)
|
|
2982
|
-
|
|
2983
|
-
|
|
2984
|
-
|
|
2985
|
-
|
|
2946
|
+
persistent_volume_id = None
|
|
2947
|
+
is_new_disk = True
|
|
2948
|
+
disk_result = None
|
|
2949
|
+
update_reservation_status(reservation_id, "preparing",
|
|
2950
|
+
"Persistent disk setup failed - continuing without persistent storage")
|
|
2951
|
+
try:
|
|
2952
|
+
efs_filesystem_id = efs_future.result()
|
|
2953
|
+
except Exception as efs_error:
|
|
2954
|
+
logger.error(f"Failed to set up EFS: {efs_error}")
|
|
2955
|
+
efs_filesystem_id = None
|
|
2956
|
+
|
|
2957
|
+
if not github_public_key:
|
|
2958
|
+
raise ValueError(f"Could not fetch GitHub public key for GitHub user '{github_user}'")
|
|
2959
|
+
|
|
2960
|
+
if use_persistent_disk and disk_result:
|
|
2961
|
+
persistent_volume_id, is_new_disk, disk_warning, target_az, target_node = disk_result
|
|
2962
|
+
logger.info(f"Persistent disk ready: {persistent_volume_id} (is_new={is_new_disk})")
|
|
2963
|
+
effective_disk_name = disk_name or "default"
|
|
2964
|
+
try:
|
|
2965
|
+
mark_disk_in_use(user_id, effective_disk_name, True, reservation_id)
|
|
2966
|
+
except Exception as mark_error:
|
|
2967
|
+
logger.warning(f"Failed to mark disk as in_use: {mark_error}")
|
|
2968
|
+
update_reservation_fields(reservation_id, disk_name=effective_disk_name)
|
|
2969
|
+
if disk_warning:
|
|
2970
|
+
update_reservation_fields(reservation_id, warning=disk_warning)
|
|
2971
|
+
elif not use_persistent_disk:
|
|
2986
2972
|
is_new_disk = True
|
|
2987
|
-
logger.info(
|
|
2988
|
-
"Non-persistent reservation - will always set up shell environment (CREATE_SH_ENV=true)")
|
|
2989
2973
|
|
|
2990
|
-
|
|
2991
|
-
|
|
2992
|
-
try:
|
|
2993
|
-
if EFS_SECURITY_GROUP_ID and EFS_SUBNET_IDS:
|
|
2994
|
-
update_reservation_status(
|
|
2995
|
-
reservation_id,
|
|
2996
|
-
"preparing",
|
|
2997
|
-
"Setting up shared storage (/shared) for user collaboration",
|
|
2998
|
-
)
|
|
2999
|
-
record_trace_event(trace_data, "efs_setup_start")
|
|
3000
|
-
efs_filesystem_id = create_or_find_user_efs(user_id)
|
|
3001
|
-
record_trace_event(trace_data, "efs_setup_end")
|
|
3002
|
-
logger.info(
|
|
3003
|
-
f"EFS filesystem {efs_filesystem_id} ready for user {user_id}")
|
|
3004
|
-
else:
|
|
3005
|
-
logger.warning(
|
|
3006
|
-
"EFS configuration missing - skipping shared storage setup")
|
|
3007
|
-
except Exception as efs_error:
|
|
3008
|
-
logger.error(f"Failed to set up EFS: {efs_error}")
|
|
3009
|
-
# Continue without EFS rather than failing
|
|
3010
|
-
efs_filesystem_id = None
|
|
2974
|
+
if efs_filesystem_id:
|
|
2975
|
+
logger.info(f"EFS filesystem {efs_filesystem_id} ready for user {user_id}")
|
|
3011
2976
|
|
|
3012
2977
|
# Update status: Creating Kubernetes resources
|
|
3013
2978
|
disk_status = "with persistent disk" if use_persistent_disk else "without persistent disk"
|
|
@@ -3149,30 +3114,29 @@ def allocate_gpu_resources(reservation_id: str, request: dict[str, Any], trace_d
|
|
|
3149
3114
|
try:
|
|
3150
3115
|
v1 = client.CoreV1Api(k8s_client)
|
|
3151
3116
|
|
|
3152
|
-
#
|
|
3153
|
-
# Default image
|
|
3154
|
-
# Custom
|
|
3155
|
-
|
|
3156
|
-
|
|
3157
|
-
retry_delay = 3 # seconds between retries
|
|
3117
|
+
# Poll for SSH daemon: 100ms for first 8s, then backoff to 5s
|
|
3118
|
+
# Default image starts SSH in ~2-5s, so rapid polling catches it instantly
|
|
3119
|
+
# Custom images may take longer, backoff keeps API load reasonable
|
|
3120
|
+
max_attempts = 60
|
|
3121
|
+
elapsed = 0.0
|
|
3158
3122
|
|
|
3159
|
-
for attempt in range(
|
|
3123
|
+
for attempt in range(max_attempts):
|
|
3160
3124
|
logs = v1.read_namespaced_pod_log(
|
|
3161
3125
|
name=pod_name, namespace="gpu-dev", container="gpu-dev", tail_lines=100
|
|
3162
3126
|
)
|
|
3163
3127
|
if "SSH daemon starting on port 22" in logs or "Server listening on" in logs:
|
|
3164
3128
|
logger.info(
|
|
3165
|
-
f"SSH daemon confirmed running in pod logs for {pod_name} (attempt {attempt + 1})")
|
|
3129
|
+
f"SSH daemon confirmed running in pod logs for {pod_name} (attempt {attempt + 1}, {elapsed:.1f}s elapsed)")
|
|
3166
3130
|
ssh_ready = True
|
|
3167
3131
|
break
|
|
3168
3132
|
else:
|
|
3169
|
-
if attempt <
|
|
3170
|
-
|
|
3171
|
-
|
|
3172
|
-
|
|
3133
|
+
if attempt < max_attempts - 1:
|
|
3134
|
+
delay = 0.1 if elapsed < 8.0 else min(1.0 + (elapsed - 8.0) * 0.3, 5.0)
|
|
3135
|
+
time.sleep(delay)
|
|
3136
|
+
elapsed += delay
|
|
3173
3137
|
else:
|
|
3174
3138
|
logger.warning(
|
|
3175
|
-
f"SSH daemon not detected after {
|
|
3139
|
+
f"SSH daemon not detected after {max_attempts} attempts, logs preview: {logs[-200:]}")
|
|
3176
3140
|
except Exception as e:
|
|
3177
3141
|
logger.warning(f"Could not check SSH daemon logs: {e}")
|
|
3178
3142
|
# Assume ready if pod is running (NLB will handle routing)
|
|
@@ -3514,32 +3478,52 @@ def update_reservation_fields(reservation_id: str, **fields) -> None:
|
|
|
3514
3478
|
logger.error(f"Error updating reservation fields: {str(e)}")
|
|
3515
3479
|
|
|
3516
3480
|
|
|
3481
|
+
_ssh_key_cache = {}
|
|
3482
|
+
_SSH_KEY_CACHE_TTL = 7 * 24 * 3600 # 7 days — keys rarely change, pods fetch live keys anyway
|
|
3483
|
+
|
|
3484
|
+
|
|
3517
3485
|
def get_github_public_key(github_username: str, validate: bool = True) -> str:
|
|
3518
|
-
"""Fetch GitHub public keys for user
|
|
3486
|
+
"""Fetch GitHub public keys for user, cached in-memory and DynamoDB."""
|
|
3487
|
+
import urllib.request
|
|
3519
3488
|
|
|
3520
|
-
|
|
3521
|
-
github_username: GitHub username to fetch keys for
|
|
3522
|
-
validate: If True, validate and filter keys to only include valid SSH key formats
|
|
3489
|
+
username_lower = github_username.lower()
|
|
3523
3490
|
|
|
3524
|
-
|
|
3525
|
-
|
|
3526
|
-
""
|
|
3491
|
+
# In-memory cache (survives across warm Lambda invocations)
|
|
3492
|
+
cached = _ssh_key_cache.get(username_lower)
|
|
3493
|
+
if cached and time.time() - cached["ts"] < _SSH_KEY_CACHE_TTL:
|
|
3494
|
+
logger.info(f"SSH keys for {github_username} from memory cache")
|
|
3495
|
+
return cached["keys"]
|
|
3496
|
+
|
|
3497
|
+
# DynamoDB cache (survives cold starts)
|
|
3527
3498
|
try:
|
|
3528
|
-
|
|
3499
|
+
resp = reservations_table.get_item(
|
|
3500
|
+
Key={"reservation_id": f"ssh-key-cache-{username_lower}"},
|
|
3501
|
+
ProjectionExpression="ssh_keys, cached_at",
|
|
3502
|
+
)
|
|
3503
|
+
if "Item" in resp:
|
|
3504
|
+
item = resp["Item"]
|
|
3505
|
+
cached_at = float(item.get("cached_at", 0))
|
|
3506
|
+
if time.time() - cached_at < _SSH_KEY_CACHE_TTL:
|
|
3507
|
+
keys = item["ssh_keys"]
|
|
3508
|
+
_ssh_key_cache[username_lower] = {"keys": keys, "ts": cached_at}
|
|
3509
|
+
logger.info(f"SSH keys for {github_username} from DynamoDB cache")
|
|
3510
|
+
return keys
|
|
3511
|
+
except Exception as e:
|
|
3512
|
+
logger.warning(f"DynamoDB SSH key cache read failed: {e}")
|
|
3529
3513
|
|
|
3514
|
+
# Cache miss — fetch from GitHub
|
|
3515
|
+
try:
|
|
3530
3516
|
url = f"https://github.com/{github_username}.keys"
|
|
3531
3517
|
logger.info(f"Fetching SSH keys for {github_username} from {url}")
|
|
3532
3518
|
|
|
3533
|
-
with urllib.request.urlopen(url) as response:
|
|
3519
|
+
with urllib.request.urlopen(url, timeout=10) as response:
|
|
3534
3520
|
keys_data = response.read().decode("utf-8").strip()
|
|
3535
3521
|
|
|
3536
3522
|
if not keys_data:
|
|
3537
|
-
logger.error(
|
|
3538
|
-
f"No public SSH keys found for GitHub user {github_username}")
|
|
3523
|
+
logger.error(f"No public SSH keys found for GitHub user {github_username}")
|
|
3539
3524
|
return None
|
|
3540
3525
|
|
|
3541
3526
|
if validate:
|
|
3542
|
-
# Validate keys format (basic check for ssh-rsa/ssh-ed25519/ssh-ecdsa)
|
|
3543
3527
|
valid_keys = []
|
|
3544
3528
|
for line in keys_data.split("\n"):
|
|
3545
3529
|
line = line.strip()
|
|
@@ -3549,22 +3533,31 @@ def get_github_public_key(github_username: str, validate: bool = True) -> str:
|
|
|
3549
3533
|
or line.startswith("ssh-ecdsa")
|
|
3550
3534
|
):
|
|
3551
3535
|
valid_keys.append(line)
|
|
3552
|
-
|
|
3553
3536
|
if not valid_keys:
|
|
3554
|
-
logger.error(
|
|
3555
|
-
f"No valid SSH keys found for GitHub user {github_username}"
|
|
3556
|
-
)
|
|
3537
|
+
logger.error(f"No valid SSH keys found for GitHub user {github_username}")
|
|
3557
3538
|
return None
|
|
3539
|
+
keys_data = "\n".join(valid_keys)
|
|
3558
3540
|
|
|
3559
|
-
|
|
3560
|
-
|
|
3561
|
-
|
|
3562
|
-
|
|
3563
|
-
|
|
3541
|
+
logger.info(f"Found {len(keys_data.splitlines())} valid SSH keys for {github_username}")
|
|
3542
|
+
|
|
3543
|
+
# Store in both caches
|
|
3544
|
+
now = time.time()
|
|
3545
|
+
_ssh_key_cache[username_lower] = {"keys": keys_data, "ts": now}
|
|
3546
|
+
try:
|
|
3547
|
+
reservations_table.put_item(Item={
|
|
3548
|
+
"reservation_id": f"ssh-key-cache-{username_lower}",
|
|
3549
|
+
"ssh_keys": keys_data,
|
|
3550
|
+
"cached_at": str(now),
|
|
3551
|
+
"github_user": github_username,
|
|
3552
|
+
"status": "cache",
|
|
3553
|
+
})
|
|
3554
|
+
except Exception as e:
|
|
3555
|
+
logger.warning(f"DynamoDB SSH key cache write failed: {e}")
|
|
3556
|
+
|
|
3557
|
+
return keys_data
|
|
3564
3558
|
|
|
3565
3559
|
except Exception as e:
|
|
3566
|
-
logger.error(
|
|
3567
|
-
f"Error fetching GitHub key for {github_username}: {str(e)}")
|
|
3560
|
+
logger.error(f"Error fetching GitHub key for {github_username}: {str(e)}")
|
|
3568
3561
|
return None
|
|
3569
3562
|
|
|
3570
3563
|
|
|
@@ -4145,7 +4138,6 @@ def create_pod(
|
|
|
4145
4138
|
|
|
4146
4139
|
# Determine container image to use based on architecture
|
|
4147
4140
|
if gpu_type.startswith("cpu-arm"):
|
|
4148
|
-
# Use Python base image for ARM64 CPU instances with PyTorch installed via pip
|
|
4149
4141
|
container_image = "python:3.11-slim" # Multi-arch image with ARM64 support
|
|
4150
4142
|
else:
|
|
4151
4143
|
container_image = GPU_DEV_CONTAINER_IMAGE # Default x86_64 PyTorch image
|
|
@@ -6678,7 +6670,7 @@ def get_instance_type_and_gpu_info(k8s_client, pod_name: str) -> tuple[str, str]
|
|
|
6678
6670
|
"p5en.48xlarge": "H200",
|
|
6679
6671
|
"p6-b200.48xlarge": "B200",
|
|
6680
6672
|
"p6-b300.48xlarge": "B300",
|
|
6681
|
-
"
|
|
6673
|
+
"c6id.2xlarge": "cpu-spot",
|
|
6682
6674
|
}
|
|
6683
6675
|
|
|
6684
6676
|
gpu_type = gpu_type_mapping.get(instance_type, "Unknown")
|
|
@@ -7809,6 +7801,14 @@ def process_scheduled_queue_management():
|
|
|
7809
7801
|
except Exception:
|
|
7810
7802
|
cpu_spot_ready = False
|
|
7811
7803
|
if type_available_gpus >= requested_gpus and max_single_node >= requested_gpus and cpu_spot_ready:
|
|
7804
|
+
# Re-check DDB status before allocating — another invocation may have already started
|
|
7805
|
+
_current = reservations_table.get_item(Key={"reservation_id": reservation_id}).get("Item", {})
|
|
7806
|
+
_cur_status = _current.get("status", "queued")
|
|
7807
|
+
if _cur_status != "queued":
|
|
7808
|
+
logger.info(f"Reservation {reservation_id} already {_cur_status} (race avoided), skipping allocation")
|
|
7809
|
+
processed_count += 1
|
|
7810
|
+
continue
|
|
7811
|
+
|
|
7812
7812
|
logger.info(
|
|
7813
7813
|
f"Allocating {requested_gpus} {gpu_type.upper()} GPUs for reservation {reservation_id} - {type_available_gpus} available"
|
|
7814
7814
|
)
|
|
@@ -8092,6 +8092,17 @@ def process_cancellation_request(record: dict[str, Any]) -> bool:
|
|
|
8092
8092
|
logger.info(
|
|
8093
8093
|
f"Cleaned up pod resources for cancelled reservation {full_reservation_id}")
|
|
8094
8094
|
|
|
8095
|
+
# Force-detach EBS volume (CSI driver sometimes leaves it attached)
|
|
8096
|
+
ebs_vol = reservation.get("ebs_volume_id")
|
|
8097
|
+
if ebs_vol:
|
|
8098
|
+
try:
|
|
8099
|
+
vol_state = ec2_client.describe_volumes(VolumeIds=[ebs_vol])["Volumes"][0]["State"]
|
|
8100
|
+
if vol_state == "in-use":
|
|
8101
|
+
logger.info(f"Force-detaching orphaned volume {ebs_vol}")
|
|
8102
|
+
ec2_client.detach_volume(VolumeId=ebs_vol, Force=True)
|
|
8103
|
+
except Exception as detach_err:
|
|
8104
|
+
logger.warning(f"Volume detach failed for {ebs_vol}: {detach_err}")
|
|
8105
|
+
|
|
8095
8106
|
except Exception as cleanup_error:
|
|
8096
8107
|
logger.error(
|
|
8097
8108
|
f"Error cleaning up pod {pod_name}: {cleanup_error}")
|
|
@@ -8736,6 +8747,24 @@ def process_clear_disk_lock_action(record: dict[str, Any]) -> bool:
|
|
|
8736
8747
|
|
|
8737
8748
|
mark_disk_in_use(user_id, disk_name, False)
|
|
8738
8749
|
logger.info(f"Cleared stale lock on disk '{disk_name}' for user '{user_id}'")
|
|
8750
|
+
|
|
8751
|
+
# Also force-detach any orphaned EBS volumes for this disk
|
|
8752
|
+
try:
|
|
8753
|
+
volumes = ec2_client.describe_volumes(
|
|
8754
|
+
Filters=[
|
|
8755
|
+
{"Name": "tag:gpu-dev-user", "Values": [user_id]},
|
|
8756
|
+
{"Name": "tag:disk_name", "Values": [disk_name]},
|
|
8757
|
+
{"Name": "status", "Values": ["in-use"]},
|
|
8758
|
+
]
|
|
8759
|
+
).get("Volumes", [])
|
|
8760
|
+
for vol in volumes:
|
|
8761
|
+
vol_id = vol["VolumeId"]
|
|
8762
|
+
logger.info(f"Force-detaching orphaned volume {vol_id} for disk '{disk_name}'")
|
|
8763
|
+
ec2_client.detach_volume(VolumeId=vol_id, Force=True)
|
|
8764
|
+
logger.info(f"Detached volume {vol_id}")
|
|
8765
|
+
except Exception as detach_err:
|
|
8766
|
+
logger.warning(f"Failed to detach orphaned volumes for disk '{disk_name}': {detach_err}")
|
|
8767
|
+
|
|
8739
8768
|
return True
|
|
8740
8769
|
|
|
8741
8770
|
|