gpu-dev 0.5.32__tar.gz → 0.6.0__tar.gz

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
Files changed (127)
  1. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/PKG-INFO +1 -1
  2. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/PKG-INFO +1 -1
  3. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/cli-tools/gpu-dev-cli/gpu_dev_cli/cli.py +41 -14
  4. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/cli-tools/gpu-dev-cli/gpu_dev_cli/config.py +6 -5
  5. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/cli-tools/gpu-dev-cli/gpu_dev_cli/disks.py +15 -2
  6. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/cli-tools/gpu-dev-cli/gpu_dev_cli/reservations.py +5 -3
  7. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/pyproject.toml +1 -1
  8. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/lambda/reservation_processor/index.py +180 -151
  9. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/lambda.tf +28 -4
  10. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/main.tf +6 -4
  11. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/templates/al2023-user-data.sh +87 -4
  12. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/templates/ami-baker-user-data.sh +3 -0
  13. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/.github/workflows/no-gitlinks.yml +0 -0
  14. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/.github/workflows/publish.yml +0 -0
  15. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/.gitignore +0 -0
  16. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/CLAUDE.md +0 -0
  17. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/README.md +0 -0
  18. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/admin/README.md +0 -0
  19. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/admin/generate_stats.py +0 -0
  20. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/admin/requirements.txt +0 -0
  21. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/cli-tools/gpu-dev-cli/README.md +0 -0
  22. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/cli-tools/gpu-dev-cli/ZERO_CONFIG_SETUP.md +0 -0
  23. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/SOURCES.txt +0 -0
  24. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/dependency_links.txt +0 -0
  25. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/entry_points.txt +0 -0
  26. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/requires.txt +0 -0
  27. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/top_level.txt +0 -0
  28. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/cli-tools/gpu-dev-cli/gpu_dev_cli/__init__.py +0 -0
  29. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/cli-tools/gpu-dev-cli/gpu_dev_cli/auth.py +0 -0
  30. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/cli-tools/gpu-dev-cli/gpu_dev_cli/interactive.py +0 -0
  31. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/cli-tools/gpu-dev-cli/gpu_dev_cli/name_generator.py +0 -0
  32. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/cli-tools/gpu-dev-cli/gpu_dev_cli/ssh_proxy.py +0 -0
  33. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/cli-tools/gpu-dev-cli/minimal-iam-policy.json +0 -0
  34. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/cli-tools/scripts/clear_stale_disk_locks.py +0 -0
  35. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/docs/USER_GUIDE.md +0 -0
  36. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/docs/devgpu-features.html +0 -0
  37. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/docs/docker-mark-blue.svg +0 -0
  38. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/docs/icons8-cursor-ai.svg +0 -0
  39. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/setup.cfg +0 -0
  40. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/.claude/skills/deploy.md +0 -0
  41. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/.terraform.lock.hcl +0 -0
  42. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/README.md +0 -0
  43. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/alb.tf +0 -0
  44. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/ami-baker.tf +0 -0
  45. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/availability.tf +0 -0
  46. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/backend.tf +0 -0
  47. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/check_b200.py +0 -0
  48. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/cluster-autoscaler.tf +0 -0
  49. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/cmd_proxy.py +0 -0
  50. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/docker/.dockerignore +0 -0
  51. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/docker/Dockerfile +0 -0
  52. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/docker/backup-dotfiles +0 -0
  53. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/docker/bash_profile +0 -0
  54. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/docker/bashrc +0 -0
  55. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/docker/bashrc_ext +0 -0
  56. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/docker/build-with-efa.sh +0 -0
  57. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/docker/dotfiles-shutdown-handler +0 -0
  58. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/docker/list-dotfile-versions +0 -0
  59. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/docker/motd_script +0 -0
  60. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/docker/nproc_wrapper +0 -0
  61. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/docker/profile +0 -0
  62. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/docker/restore-dotfiles +0 -0
  63. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/docker/restore-dotfiles-version +0 -0
  64. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/docker/setup-dotfiles-persistence +0 -0
  65. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/docker/shell_env +0 -0
  66. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/docker/ssh_config +0 -0
  67. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/docker/zprofile +0 -0
  68. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/docker/zshrc +0 -0
  69. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/docker/zshrc_ext +0 -0
  70. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/docker-build.tf +0 -0
  71. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/docker-example/Dockerfile +0 -0
  72. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/docker-example/hello.txt +0 -0
  73. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/ecr.tf +0 -0
  74. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/efs.tf +0 -0
  75. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/eks.tf +0 -0
  76. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/expiry.tf +0 -0
  77. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/git-cache.tf +0 -0
  78. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/gpu-dev-pod-irsa.tf +0 -0
  79. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/kubernetes.tf +0 -0
  80. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/lambda/availability_updater/index.py +0 -0
  81. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/lambda/availability_updater/requirements.txt +0 -0
  82. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/lambda/migration/tag_largest_snapshots.py +0 -0
  83. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/lambda/reservation_expiry/index.py +0 -0
  84. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/lambda/reservation_expiry/requirements.txt +0 -0
  85. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/lambda/reservation_processor/buildkit_job.py +0 -0
  86. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/lambda/reservation_processor/requirements.txt +0 -0
  87. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/lambda/shared/__init__.py +0 -0
  88. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/lambda/shared/alb_utils.py +0 -0
  89. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/lambda/shared/dns_utils.py +0 -0
  90. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/lambda/shared/k8s_client.py +0 -0
  91. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/lambda/shared/k8s_resource_tracker.py +0 -0
  92. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/lambda/shared/requirements.txt +0 -0
  93. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/lambda/shared/snapshot_utils.py +0 -0
  94. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/list_b200.py +0 -0
  95. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/mig-config.tf +0 -0
  96. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/mig-parted-config.yaml +0 -0
  97. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py +0 -0
  98. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py.bak +0 -0
  99. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/migrations/check_snapshots.py +0 -0
  100. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/migrations/migrate_disks_to_named.py +0 -0
  101. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/migrations/run_backfill.sh +0 -0
  102. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/monitoring.tf +0 -0
  103. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/node-termination-handler.tf +0 -0
  104. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/outputs.tf +0 -0
  105. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/pyproject.toml +0 -0
  106. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/queue.tf +0 -0
  107. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/route53.tf +0 -0
  108. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/s3-disk-contents.tf +0 -0
  109. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/scripts/CLEANUP_GUIDE.md +0 -0
  110. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/scripts/detect_empty_volumes.sh +0 -0
  111. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/scripts/ec2_avail_probe.sh +0 -0
  112. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/scripts/inspect_user_data.sh +0 -0
  113. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/ssh-proxy/Dockerfile +0 -0
  114. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/ssh-proxy/proxy.py +0 -0
  115. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/ssh-proxy/requirements.txt +0 -0
  116. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/ssh-proxy-service.tf +0 -0
  117. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/ssh-proxy.tf +0 -0
  118. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/subnet-0fe3a2c45570091ad +0 -0
  119. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/switch-to.sh +0 -0
  120. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/templates/al2023-cpu-user-data.sh +0 -0
  121. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/templates/user-data-self-managed.sh +0 -0
  122. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/templates/user-data.sh +0 -0
  123. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/variables.tf +0 -0
  124. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/tests/submit/README.md +0 -0
  125. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/tests/submit/fail/run.sh +0 -0
  126. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/tests/submit/multinode/run.sh +0 -0
  127. {gpu_dev-0.5.32 → gpu_dev-0.6.0}/tests/submit/success/run.sh +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: gpu-dev
3
- Version: 0.5.32
3
+ Version: 0.6.0
4
4
  Summary: CLI tool for PyTorch GPU developer server reservations
5
5
  Author: PyTorch Team
6
6
  Requires-Python: >=3.10
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: gpu-dev
3
- Version: 0.5.32
3
+ Version: 0.6.0
4
4
  Summary: CLI tool for PyTorch GPU developer server reservations
5
5
  Author: PyTorch Team
6
6
  Requires-Python: >=3.10
@@ -897,6 +897,13 @@ def reserve(
897
897
 
898
898
  else:
899
899
  # Non-interactive mode - use defaults and validate
900
+ # Route --spot to east1 when on prod (env vars override config region)
901
+ if spot and load_config().user_config.get("environment") == "prod":
902
+ east1_cfg = Config.ENVIRONMENTS.get("prod-east1", {})
903
+ if east1_cfg:
904
+ import os as _os
905
+ _os.environ["AWS_REGION"] = east1_cfg["region"]
906
+
900
907
  if gpu_type is None:
901
908
  gpu_type = "a100"
902
909
  if hours is None:
@@ -2568,10 +2575,21 @@ def cancel(
2568
2575
  with Live(
2569
2576
  Spinner("dots", text="📡 Cancelling reservations..."), console=console
2570
2577
  ) as live:
2578
+ # Build east1 reservation manager for cross-region cancellations
2579
+ east1_mgr = None
2580
+ east1_env = Config.ENVIRONMENTS.get("prod-east1", {})
2581
+ if east1_env:
2582
+ import os as _os
2583
+ _east1_config = Config()
2584
+ _east1_config.aws_region = east1_env["region"]
2585
+ east1_mgr = ReservationManager(_east1_config)
2586
+
2571
2587
  for reservation in reservations:
2572
2588
  res_id = reservation.get("reservation_id", "")
2573
2589
  if res_id:
2574
- success = reservation_mgr.cancel_reservation(
2590
+ # Use east1 manager for east1 reservations
2591
+ mgr = east1_mgr if reservation.get("_region") in ("east1", "us-east-1") and east1_mgr else reservation_mgr
2592
+ success = mgr.cancel_reservation(
2575
2593
  res_id, user_info["user_id"]
2576
2594
  )
2577
2595
  if success:
@@ -3301,21 +3319,30 @@ def connect(ctx: click.Context, reservation_id: Optional[str]) -> None:
3301
3319
 
3302
3320
  live.start()
3303
3321
 
3304
- # If the selected reservation is from east1, switch to east1 reservation_mgr
3305
- _sel = next((r for r in (locals().get("reservations") or []) if r.get("reservation_id", "").startswith(reservation_id)), None)
3306
- if _sel and _sel.get("_region") == "us-east-1":
3307
- import os as _os
3308
- east1_cfg = Config.ENVIRONMENTS.get("prod-east1", {})
3309
- _os.environ["AWS_DEFAULT_REGION"] = east1_cfg["region"]
3310
- _east1_config = Config()
3311
- _east1_config.aws_region = east1_cfg["region"]
3312
- reservation_mgr = ReservationManager(_east1_config)
3313
-
3314
- # Get connection info
3322
+ # Try current region first, then cross-region if not found
3315
3323
  connection_info = reservation_mgr.get_connection_info(
3316
3324
  reservation_id, user_info["user_id"]
3317
3325
  )
3318
3326
 
3327
+ # If not found, try the other region
3328
+ if not connection_info:
3329
+ import os as _os
3330
+ current_env = config.user_config.get("environment", "prod")
3331
+ other_envs = {"prod": "prod-east1", "prod-east1": "prod"}
3332
+ other_env_name = other_envs.get(current_env)
3333
+ if other_env_name:
3334
+ other_env = Config.ENVIRONMENTS.get(other_env_name, {})
3335
+ if other_env:
3336
+ _os.environ["AWS_DEFAULT_REGION"] = other_env["region"]
3337
+ _other_config = Config()
3338
+ _other_config.aws_region = other_env["region"]
3339
+ other_mgr = ReservationManager(_other_config)
3340
+ connection_info = other_mgr.get_connection_info(
3341
+ reservation_id, user_info["user_id"]
3342
+ )
3343
+ if connection_info:
3344
+ reservation_mgr = other_mgr
3345
+
3319
3346
  live.stop()
3320
3347
 
3321
3348
  if not connection_info:
@@ -3864,7 +3891,7 @@ def set(key: str, value: str) -> None:
3864
3891
 
3865
3892
 
3866
3893
  @config.command()
3867
- @click.argument("env_name", type=click.Choice(["test", "prod", "prod-east1"]))
3894
+ @click.argument("env_name", type=click.Choice(["test", "prod"]))
3868
3895
  def environment(env_name: str) -> None:
3869
3896
  """Set the environment
3870
3897
 
@@ -3876,7 +3903,7 @@ def environment(env_name: str) -> None:
3876
3903
  \b
3877
3904
  Examples:
3878
3905
  gpu-dev config environment prod # Production (us-east-2)
3879
- gpu-dev config environment prod-east1 # Spot-only us-east-1
3906
+ gpu-dev config environment prod # Production (spot accessible via interactive picker)
3880
3907
  gpu-dev config environment test # Test (us-west-1)
3881
3908
 
3882
3909
  Environment configurations:
@@ -42,13 +42,14 @@ class Config:
42
42
  # Load unified config (handles migration from legacy files)
43
43
  self.user_config = self._load_config()
44
44
 
45
- # Get region from config, then AWS env vars, or default
46
- if self.user_config.get("region"):
45
+ # Get region: env vars take priority (for spot routing), then config, then default
46
+ env_region = os.getenv("AWS_REGION") or os.getenv("AWS_DEFAULT_REGION")
47
+ if env_region and env_region != self.user_config.get("region"):
48
+ self.aws_region = env_region
49
+ elif self.user_config.get("region"):
47
50
  self.aws_region = self.user_config["region"]
48
51
  else:
49
- self.aws_region = os.getenv(
50
- "AWS_REGION", os.getenv("AWS_DEFAULT_REGION", "us-east-2")
51
- )
52
+ self.aws_region = "us-east-2"
52
53
 
53
54
  os.environ["AWS_DEFAULT_REGION"] = self.aws_region
54
55
 
@@ -355,8 +355,21 @@ def unlock_disk(disk_name: str, user_id: str, config: Config) -> bool:
355
355
  return False
356
356
 
357
357
  if not disk['in_use']:
358
- print(f"Disk '{disk_name}' is not locked")
359
- return False
358
+ # DDB says not locked — but check if EBS volume is still physically attached
359
+ try:
360
+ ec2 = config.session.client('ec2', region_name=config.aws_region)
361
+ vols = ec2.describe_volumes(Filters=[
362
+ {"Name": "tag:gpu-dev-user", "Values": [user_id]},
363
+ {"Name": "tag:disk_name", "Values": [disk_name]},
364
+ {"Name": "status", "Values": ["in-use"]},
365
+ ]).get("Volumes", [])
366
+ if not vols:
367
+ print(f"Disk '{disk_name}' is not locked")
368
+ return False
369
+ print(f"Disk '{disk_name}' DDB lock is clear but EBS volume is still attached — sending force-detach request")
370
+ except Exception:
371
+ print(f"Disk '{disk_name}' is not locked")
372
+ return False
360
373
 
361
374
  operation_id = str(uuid.uuid4())
362
375
 
@@ -1701,6 +1701,7 @@ class ReservationManager:
1701
1701
  initial_text = f"📡 Starting multinode reservation..." if is_multinode else "🔄 Sending reservation request..."
1702
1702
  spinner = Spinner("dots", text=initial_text)
1703
1703
  live.update(spinner)
1704
+ poll_delay = 0.5 # start fast, back off over time
1704
1705
 
1705
1706
  while (
1706
1707
  (timeout_seconds is None or time.time() -
@@ -1761,7 +1762,7 @@ class ReservationManager:
1761
1762
  if not is_multinode:
1762
1763
  spinner.text = "📡 Waiting for reservation status update..."
1763
1764
  live.update(spinner)
1764
- time.sleep(2)
1765
+ time.sleep(0.5)
1765
1766
  continue
1766
1767
  else:
1767
1768
  node_details.append({
@@ -2293,8 +2294,9 @@ class ReservationManager:
2293
2294
 
2294
2295
  return None
2295
2296
 
2296
- # Continue polling
2297
- time.sleep(3)
2297
+ # Poll with backoff: 0.5s → 1s → 1.5s → 2s → 3s (cap)
2298
+ time.sleep(poll_delay)
2299
+ poll_delay = min(poll_delay + 0.5, 3.0)
2298
2300
 
2299
2301
  except Exception as e:
2300
2302
  console.print(
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "gpu-dev"
7
- version = "0.5.32"
7
+ version = "0.6.0"
8
8
  description = "CLI tool for PyTorch GPU developer server reservations"
9
9
  authors = [{name = "PyTorch Team"}]
10
10
  readme = "cli-tools/gpu-dev-cli/README.md"
@@ -195,7 +195,7 @@ GPU_CONFIG = {
195
195
  "b300": {"instance_type": "p6-b300.48xlarge", "max_gpus": 8, "cpus": 192, "memory_gb": 2048, "efa_count": 8},
196
196
  "cpu-arm": {"instance_type": "c7g.8xlarge", "max_gpus": 0, "cpus": 32, "memory_gb": 64, "efa_count": 0},
197
197
  "cpu-x86": {"instance_type": "c7i.8xlarge", "max_gpus": 0, "cpus": 32, "memory_gb": 64, "efa_count": 0},
198
- "cpu-spot": {"instance_type": "c7i.2xlarge", "max_gpus": 0, "cpus": 8, "memory_gb": 16, "efa_count": 0},
198
+ "cpu-spot": {"instance_type": "c6id.2xlarge", "max_gpus": 0, "cpus": 8, "memory_gb": 16, "efa_count": 0},
199
199
  }
200
200
  GPU_CONFIG_DEFAULT = {"instance_type": "g4dn.12xlarge", "max_gpus": 4, "cpus": 48, "memory_gb": 192, "efa_count": 0}
201
201
 
@@ -2843,16 +2843,7 @@ def allocate_gpu_resources(reservation_id: str, request: dict[str, Any], trace_d
2843
2843
  elif dockerimage:
2844
2844
  logger.info(f"Custom Docker image specified: {dockerimage}")
2845
2845
 
2846
- record_trace_event(trace_data, "github_keys_fetch_start")
2847
- github_public_key = get_github_public_key(github_user, validate=True)
2848
- record_trace_event(trace_data, "github_keys_fetch_end")
2849
- if not github_public_key:
2850
- raise ValueError(
2851
- f"Could not fetch GitHub public key for GitHub user '{github_user}'"
2852
- )
2853
-
2854
- # Check if user should get persistent disk
2855
- # Check if user explicitly requested no persistent disk (e.g., confirmed continuing without disk when another reservation has it)
2846
+ # ── Determine disk eligibility (quick, no I/O) ──
2856
2847
  no_persistent_disk_requested = request.get("no_persistent_disk", False)
2857
2848
 
2858
2849
  if no_persistent_disk_requested:
@@ -2895,119 +2886,93 @@ def allocate_gpu_resources(reservation_id: str, request: dict[str, Any], trace_d
2895
2886
  logger.error(f"Failed to reserve persistent disk slot: {e}")
2896
2887
  use_persistent_disk = False
2897
2888
 
2898
- if use_persistent_disk:
2889
+ # ── Run SSH key fetch, disk setup, and EFS setup in parallel ──
2890
+ # These are independent I/O operations that together take ~8s sequentially
2891
+ def _fetch_ssh_keys():
2892
+ record_trace_event(trace_data, "github_keys_fetch_start")
2893
+ keys = get_github_public_key(github_user, validate=True)
2894
+ record_trace_event(trace_data, "github_keys_fetch_end")
2895
+ return keys
2896
+
2897
+ def _setup_disk():
2898
+ if not use_persistent_disk:
2899
+ return None, True, None, None, None
2900
+ update_reservation_status(
2901
+ reservation_id, "preparing",
2902
+ detailed_status="Setting up persistent disk" + (f" '{disk_name}'" if disk_name else ""))
2903
+ _target_az, _target_node = get_target_az_for_reservation(gpu_type, gpu_count)
2904
+ if not _target_az:
2905
+ raise ValueError(f"No {gpu_type} nodes found in cluster")
2906
+ logger.info(f"Target AZ: {_target_az}, disk_name={disk_name or 'default'}")
2907
+ record_trace_event(trace_data, "disk_create_start")
2908
+ vol_id, new_disk, warning = create_disk_from_snapshot_or_empty(
2909
+ user_id=user_id, availability_zone=_target_az,
2910
+ disk_name=disk_name, reservation_id=reservation_id)
2911
+ record_trace_event(trace_data, "disk_create_end")
2912
+ return vol_id, new_disk, warning, _target_az, _target_node
2913
+
2914
+ def _setup_efs():
2915
+ if not (EFS_SECURITY_GROUP_ID and EFS_SUBNET_IDS):
2916
+ return None
2917
+ update_reservation_status(
2918
+ reservation_id, "preparing",
2919
+ "Setting up shared storage (/shared) for user collaboration")
2920
+ record_trace_event(trace_data, "efs_setup_start")
2921
+ efs_id = create_or_find_user_efs(user_id)
2922
+ record_trace_event(trace_data, "efs_setup_end")
2923
+ return efs_id
2924
+
2925
+ with ThreadPoolExecutor(max_workers=3) as executor:
2926
+ ssh_future = executor.submit(_fetch_ssh_keys)
2927
+ disk_future = executor.submit(_setup_disk)
2928
+ efs_future = executor.submit(_setup_efs)
2929
+
2930
+ github_public_key = ssh_future.result()
2899
2931
  try:
2900
- # NEW snapshot-first workflow (replaces old migration logic below)
2901
- # Always recreate volume from latest snapshot or create empty
2902
- update_reservation_status(
2903
- reservation_id,
2904
- "preparing",
2905
- detailed_status="Setting up persistent disk" + (f" '{disk_name}'" if disk_name else "")
2906
- )
2907
-
2908
- # Determine target AZ + node for this reservation (binpacking)
2909
- target_az, target_node = get_target_az_for_reservation(gpu_type, gpu_count)
2910
- if not target_az:
2911
- raise ValueError(f"No {gpu_type} nodes found in cluster")
2912
-
2913
- logger.info(f"Target AZ for reservation: {target_az}")
2914
- logger.info(f"Creating persistent disk for user {user_id}, disk_name={disk_name or 'default'}")
2915
-
2916
- # Use new snapshot-first function
2917
- record_trace_event(trace_data, "disk_create_start")
2918
- persistent_volume_id, is_new_disk, disk_warning = create_disk_from_snapshot_or_empty(
2919
- user_id=user_id,
2920
- availability_zone=target_az,
2921
- disk_name=disk_name,
2922
- reservation_id=reservation_id
2923
- )
2924
- record_trace_event(trace_data, "disk_create_end")
2925
-
2926
- logger.info(f"Persistent disk ready: {persistent_volume_id} (is_new={is_new_disk})")
2927
-
2928
- # Mark disk as in_use in disks table (prevents CLI from showing as available)
2929
- # Use "default" as fallback when no explicit disk_name provided
2930
- effective_disk_name = disk_name or "default"
2931
- try:
2932
- mark_disk_in_use(user_id, effective_disk_name, True, reservation_id)
2933
- logger.info(f"Marked disk '{effective_disk_name}' as in_use for reservation {reservation_id[:8]}")
2934
- except Exception as mark_error:
2935
- logger.warning(f"Failed to mark disk as in_use: {mark_error}")
2936
-
2937
- # Store disk_name in DynamoDB for tracking (ALWAYS store, using "default" as fallback)
2938
- # This is required for expiry cleanup to know which disk to mark as not in use
2939
- update_reservation_fields(reservation_id, disk_name=effective_disk_name)
2940
-
2941
- # Store warning if any
2942
- if disk_warning:
2943
- update_reservation_fields(reservation_id, warning=disk_warning)
2944
- logger.warning(f"Stored warning for reservation {reservation_id}: {disk_warning}")
2932
+ disk_result = disk_future.result()
2945
2933
  except Exception as disk_error:
2946
2934
  logger.error(f"Failed to set up persistent disk: {disk_error}")
2947
-
2948
2935
  error_msg = str(disk_error)
2949
-
2950
- # If user explicitly requested a named disk, NEVER silently fall back to temporary.
2951
- # Any disk error (in use, timeout, creation failure) should fail the reservation
2952
- # so the user knows what happened instead of getting surprise temporary storage.
2953
2936
  if disk_name:
2954
- logger.error(f"Named disk '{disk_name}' was explicitly requested but setup failed - failing reservation")
2955
- update_reservation_status(
2956
- reservation_id,
2957
- "failed",
2958
- failure_reason=f"Persistent disk '{disk_name}' setup failed: {error_msg}"
2959
- )
2937
+ logger.error(f"Named disk '{disk_name}' setup failed - failing reservation")
2938
+ update_reservation_status(reservation_id, "failed",
2939
+ failure_reason=f"Persistent disk '{disk_name}' setup failed: {error_msg}")
2960
2940
  raise RuntimeError(f"Cannot create reservation: disk '{disk_name}' setup failed: {error_msg}")
2961
-
2962
- # Check if this is a "disk in use" error - these should fail the reservation
2963
2941
  if "in use" in error_msg.lower():
2964
- # Don't fall back - fail the reservation with clear error
2965
- update_reservation_status(
2966
- reservation_id,
2967
- "failed",
2968
- failure_reason=error_msg
2969
- )
2942
+ update_reservation_status(reservation_id, "failed", failure_reason=error_msg)
2970
2943
  raise RuntimeError(f"Cannot create reservation: {error_msg}")
2971
-
2972
- # For other errors without explicit disk_name, continue without persistent disk (backwards compatibility)
2973
- logger.warning(f"Falling back to non-persistent storage due to disk error: {disk_error}")
2944
+ logger.warning(f"Falling back to non-persistent storage: {disk_error}")
2974
2945
  use_persistent_disk = False
2975
- persistent_volume_id = None # Clear any volume that was set before the error
2976
- is_new_disk = True # EmptyDir volume will need shell environment setup
2977
- update_reservation_status(
2978
- reservation_id,
2979
- "preparing",
2980
- "Persistent disk setup failed - continuing without persistent storage",
2981
- )
2982
- else:
2983
- logger.info(
2984
- f"User {user_id} has existing reservations - no persistent disk")
2985
- # Non-persistent reservations always need shell environment setup
2946
+ persistent_volume_id = None
2947
+ is_new_disk = True
2948
+ disk_result = None
2949
+ update_reservation_status(reservation_id, "preparing",
2950
+ "Persistent disk setup failed - continuing without persistent storage")
2951
+ try:
2952
+ efs_filesystem_id = efs_future.result()
2953
+ except Exception as efs_error:
2954
+ logger.error(f"Failed to set up EFS: {efs_error}")
2955
+ efs_filesystem_id = None
2956
+
2957
+ if not github_public_key:
2958
+ raise ValueError(f"Could not fetch GitHub public key for GitHub user '{github_user}'")
2959
+
2960
+ if use_persistent_disk and disk_result:
2961
+ persistent_volume_id, is_new_disk, disk_warning, target_az, target_node = disk_result
2962
+ logger.info(f"Persistent disk ready: {persistent_volume_id} (is_new={is_new_disk})")
2963
+ effective_disk_name = disk_name or "default"
2964
+ try:
2965
+ mark_disk_in_use(user_id, effective_disk_name, True, reservation_id)
2966
+ except Exception as mark_error:
2967
+ logger.warning(f"Failed to mark disk as in_use: {mark_error}")
2968
+ update_reservation_fields(reservation_id, disk_name=effective_disk_name)
2969
+ if disk_warning:
2970
+ update_reservation_fields(reservation_id, warning=disk_warning)
2971
+ elif not use_persistent_disk:
2986
2972
  is_new_disk = True
2987
- logger.info(
2988
- "Non-persistent reservation - will always set up shell environment (CREATE_SH_ENV=true)")
2989
2973
 
2990
- # Set up shared EFS storage for user
2991
- efs_filesystem_id = None
2992
- try:
2993
- if EFS_SECURITY_GROUP_ID and EFS_SUBNET_IDS:
2994
- update_reservation_status(
2995
- reservation_id,
2996
- "preparing",
2997
- "Setting up shared storage (/shared) for user collaboration",
2998
- )
2999
- record_trace_event(trace_data, "efs_setup_start")
3000
- efs_filesystem_id = create_or_find_user_efs(user_id)
3001
- record_trace_event(trace_data, "efs_setup_end")
3002
- logger.info(
3003
- f"EFS filesystem {efs_filesystem_id} ready for user {user_id}")
3004
- else:
3005
- logger.warning(
3006
- "EFS configuration missing - skipping shared storage setup")
3007
- except Exception as efs_error:
3008
- logger.error(f"Failed to set up EFS: {efs_error}")
3009
- # Continue without EFS rather than failing
3010
- efs_filesystem_id = None
2974
+ if efs_filesystem_id:
2975
+ logger.info(f"EFS filesystem {efs_filesystem_id} ready for user {user_id}")
3011
2976
 
3012
2977
  # Update status: Creating Kubernetes resources
3013
2978
  disk_status = "with persistent disk" if use_persistent_disk else "without persistent disk"
@@ -3149,30 +3114,29 @@ def allocate_gpu_resources(reservation_id: str, request: dict[str, Any], trace_d
3149
3114
  try:
3150
3115
  v1 = client.CoreV1Api(k8s_client)
3151
3116
 
3152
- # Try multiple times to find SSH daemon in logs (custom images may take longer)
3153
- # Default image has openssh-server pre-installed so SSH starts in ~2-5s
3154
- # Custom/minimal images may need apt-get install which takes longer
3155
- # 60 retries * 3s = 180 seconds total (3 minutes) - same max but much faster detection
3156
- max_retries = 60
3157
- retry_delay = 3 # seconds between retries
3117
+ # Poll for SSH daemon: 100ms for first 8s, then backoff to 5s
3118
+ # Default image starts SSH in ~2-5s, so rapid polling catches it instantly
3119
+ # Custom images may take longer, backoff keeps API load reasonable
3120
+ max_attempts = 60
3121
+ elapsed = 0.0
3158
3122
 
3159
- for attempt in range(max_retries):
3123
+ for attempt in range(max_attempts):
3160
3124
  logs = v1.read_namespaced_pod_log(
3161
3125
  name=pod_name, namespace="gpu-dev", container="gpu-dev", tail_lines=100
3162
3126
  )
3163
3127
  if "SSH daemon starting on port 22" in logs or "Server listening on" in logs:
3164
3128
  logger.info(
3165
- f"SSH daemon confirmed running in pod logs for {pod_name} (attempt {attempt + 1})")
3129
+ f"SSH daemon confirmed running in pod logs for {pod_name} (attempt {attempt + 1}, {elapsed:.1f}s elapsed)")
3166
3130
  ssh_ready = True
3167
3131
  break
3168
3132
  else:
3169
- if attempt < max_retries - 1:
3170
- logger.info(
3171
- f"SSH daemon not yet started, waiting {retry_delay}s (attempt {attempt + 1}/{max_retries})")
3172
- time.sleep(retry_delay)
3133
+ if attempt < max_attempts - 1:
3134
+ delay = 0.1 if elapsed < 8.0 else min(1.0 + (elapsed - 8.0) * 0.3, 5.0)
3135
+ time.sleep(delay)
3136
+ elapsed += delay
3173
3137
  else:
3174
3138
  logger.warning(
3175
- f"SSH daemon not detected after {max_retries} attempts, logs preview: {logs[-200:]}")
3139
+ f"SSH daemon not detected after {max_attempts} attempts, logs preview: {logs[-200:]}")
3176
3140
  except Exception as e:
3177
3141
  logger.warning(f"Could not check SSH daemon logs: {e}")
3178
3142
  # Assume ready if pod is running (NLB will handle routing)
@@ -3514,32 +3478,52 @@ def update_reservation_fields(reservation_id: str, **fields) -> None:
3514
3478
  logger.error(f"Error updating reservation fields: {str(e)}")
3515
3479
 
3516
3480
 
3481
+ _ssh_key_cache = {}
3482
+ _SSH_KEY_CACHE_TTL = 7 * 24 * 3600 # 7 days — keys rarely change, pods fetch live keys anyway
3483
+
3484
+
3517
3485
  def get_github_public_key(github_username: str, validate: bool = True) -> str:
3518
- """Fetch GitHub public keys for user (all keys)
3486
+ """Fetch GitHub public keys for user, cached in-memory and DynamoDB."""
3487
+ import urllib.request
3519
3488
 
3520
- Args:
3521
- github_username: GitHub username to fetch keys for
3522
- validate: If True, validate and filter keys to only include valid SSH key formats
3489
+ username_lower = github_username.lower()
3523
3490
 
3524
- Returns:
3525
- String containing SSH keys (one per line) or None if no keys found
3526
- """
3491
+ # In-memory cache (survives across warm Lambda invocations)
3492
+ cached = _ssh_key_cache.get(username_lower)
3493
+ if cached and time.time() - cached["ts"] < _SSH_KEY_CACHE_TTL:
3494
+ logger.info(f"SSH keys for {github_username} from memory cache")
3495
+ return cached["keys"]
3496
+
3497
+ # DynamoDB cache (survives cold starts)
3527
3498
  try:
3528
- import urllib.request
3499
+ resp = reservations_table.get_item(
3500
+ Key={"reservation_id": f"ssh-key-cache-{username_lower}"},
3501
+ ProjectionExpression="ssh_keys, cached_at",
3502
+ )
3503
+ if "Item" in resp:
3504
+ item = resp["Item"]
3505
+ cached_at = float(item.get("cached_at", 0))
3506
+ if time.time() - cached_at < _SSH_KEY_CACHE_TTL:
3507
+ keys = item["ssh_keys"]
3508
+ _ssh_key_cache[username_lower] = {"keys": keys, "ts": cached_at}
3509
+ logger.info(f"SSH keys for {github_username} from DynamoDB cache")
3510
+ return keys
3511
+ except Exception as e:
3512
+ logger.warning(f"DynamoDB SSH key cache read failed: {e}")
3529
3513
 
3514
+ # Cache miss — fetch from GitHub
3515
+ try:
3530
3516
  url = f"https://github.com/{github_username}.keys"
3531
3517
  logger.info(f"Fetching SSH keys for {github_username} from {url}")
3532
3518
 
3533
- with urllib.request.urlopen(url) as response:
3519
+ with urllib.request.urlopen(url, timeout=10) as response:
3534
3520
  keys_data = response.read().decode("utf-8").strip()
3535
3521
 
3536
3522
  if not keys_data:
3537
- logger.error(
3538
- f"No public SSH keys found for GitHub user {github_username}")
3523
+ logger.error(f"No public SSH keys found for GitHub user {github_username}")
3539
3524
  return None
3540
3525
 
3541
3526
  if validate:
3542
- # Validate keys format (basic check for ssh-rsa/ssh-ed25519/ssh-ecdsa)
3543
3527
  valid_keys = []
3544
3528
  for line in keys_data.split("\n"):
3545
3529
  line = line.strip()
@@ -3549,22 +3533,31 @@ def get_github_public_key(github_username: str, validate: bool = True) -> str:
3549
3533
  or line.startswith("ssh-ecdsa")
3550
3534
  ):
3551
3535
  valid_keys.append(line)
3552
-
3553
3536
  if not valid_keys:
3554
- logger.error(
3555
- f"No valid SSH keys found for GitHub user {github_username}"
3556
- )
3537
+ logger.error(f"No valid SSH keys found for GitHub user {github_username}")
3557
3538
  return None
3539
+ keys_data = "\n".join(valid_keys)
3558
3540
 
3559
- logger.info(
3560
- f"Found {len(valid_keys)} valid SSH keys for {github_username}")
3561
- return "\n".join(valid_keys)
3562
- else:
3563
- return keys_data
3541
+ logger.info(f"Found {len(keys_data.splitlines())} valid SSH keys for {github_username}")
3542
+
3543
+ # Store in both caches
3544
+ now = time.time()
3545
+ _ssh_key_cache[username_lower] = {"keys": keys_data, "ts": now}
3546
+ try:
3547
+ reservations_table.put_item(Item={
3548
+ "reservation_id": f"ssh-key-cache-{username_lower}",
3549
+ "ssh_keys": keys_data,
3550
+ "cached_at": str(now),
3551
+ "github_user": github_username,
3552
+ "status": "cache",
3553
+ })
3554
+ except Exception as e:
3555
+ logger.warning(f"DynamoDB SSH key cache write failed: {e}")
3556
+
3557
+ return keys_data
3564
3558
 
3565
3559
  except Exception as e:
3566
- logger.error(
3567
- f"Error fetching GitHub key for {github_username}: {str(e)}")
3560
+ logger.error(f"Error fetching GitHub key for {github_username}: {str(e)}")
3568
3561
  return None
3569
3562
 
3570
3563
 
@@ -4145,7 +4138,6 @@ def create_pod(
4145
4138
 
4146
4139
  # Determine container image to use based on architecture
4147
4140
  if gpu_type.startswith("cpu-arm"):
4148
- # Use Python base image for ARM64 CPU instances with PyTorch installed via pip
4149
4141
  container_image = "python:3.11-slim" # Multi-arch image with ARM64 support
4150
4142
  else:
4151
4143
  container_image = GPU_DEV_CONTAINER_IMAGE # Default x86_64 PyTorch image
@@ -6678,7 +6670,7 @@ def get_instance_type_and_gpu_info(k8s_client, pod_name: str) -> tuple[str, str]
6678
6670
  "p5en.48xlarge": "H200",
6679
6671
  "p6-b200.48xlarge": "B200",
6680
6672
  "p6-b300.48xlarge": "B300",
6681
- "c7i.2xlarge": "cpu-spot",
6673
+ "c6id.2xlarge": "cpu-spot",
6682
6674
  }
6683
6675
 
6684
6676
  gpu_type = gpu_type_mapping.get(instance_type, "Unknown")
@@ -7809,6 +7801,14 @@ def process_scheduled_queue_management():
7809
7801
  except Exception:
7810
7802
  cpu_spot_ready = False
7811
7803
  if type_available_gpus >= requested_gpus and max_single_node >= requested_gpus and cpu_spot_ready:
7804
+ # Re-check DDB status before allocating — another invocation may have already started
7805
+ _current = reservations_table.get_item(Key={"reservation_id": reservation_id}).get("Item", {})
7806
+ _cur_status = _current.get("status", "queued")
7807
+ if _cur_status != "queued":
7808
+ logger.info(f"Reservation {reservation_id} already {_cur_status} (race avoided), skipping allocation")
7809
+ processed_count += 1
7810
+ continue
7811
+
7812
7812
  logger.info(
7813
7813
  f"Allocating {requested_gpus} {gpu_type.upper()} GPUs for reservation {reservation_id} - {type_available_gpus} available"
7814
7814
  )
@@ -8092,6 +8092,17 @@ def process_cancellation_request(record: dict[str, Any]) -> bool:
8092
8092
  logger.info(
8093
8093
  f"Cleaned up pod resources for cancelled reservation {full_reservation_id}")
8094
8094
 
8095
+ # Force-detach EBS volume (CSI driver sometimes leaves it attached)
8096
+ ebs_vol = reservation.get("ebs_volume_id")
8097
+ if ebs_vol:
8098
+ try:
8099
+ vol_state = ec2_client.describe_volumes(VolumeIds=[ebs_vol])["Volumes"][0]["State"]
8100
+ if vol_state == "in-use":
8101
+ logger.info(f"Force-detaching orphaned volume {ebs_vol}")
8102
+ ec2_client.detach_volume(VolumeId=ebs_vol, Force=True)
8103
+ except Exception as detach_err:
8104
+ logger.warning(f"Volume detach failed for {ebs_vol}: {detach_err}")
8105
+
8095
8106
  except Exception as cleanup_error:
8096
8107
  logger.error(
8097
8108
  f"Error cleaning up pod {pod_name}: {cleanup_error}")
@@ -8736,6 +8747,24 @@ def process_clear_disk_lock_action(record: dict[str, Any]) -> bool:
8736
8747
 
8737
8748
  mark_disk_in_use(user_id, disk_name, False)
8738
8749
  logger.info(f"Cleared stale lock on disk '{disk_name}' for user '{user_id}'")
8750
+
8751
+ # Also force-detach any orphaned EBS volumes for this disk
8752
+ try:
8753
+ volumes = ec2_client.describe_volumes(
8754
+ Filters=[
8755
+ {"Name": "tag:gpu-dev-user", "Values": [user_id]},
8756
+ {"Name": "tag:disk_name", "Values": [disk_name]},
8757
+ {"Name": "status", "Values": ["in-use"]},
8758
+ ]
8759
+ ).get("Volumes", [])
8760
+ for vol in volumes:
8761
+ vol_id = vol["VolumeId"]
8762
+ logger.info(f"Force-detaching orphaned volume {vol_id} for disk '{disk_name}'")
8763
+ ec2_client.detach_volume(VolumeId=vol_id, Force=True)
8764
+ logger.info(f"Detached volume {vol_id}")
8765
+ except Exception as detach_err:
8766
+ logger.warning(f"Failed to detach orphaned volumes for disk '{disk_name}': {detach_err}")
8767
+
8739
8768
  return True
8740
8769
 
8741
8770