gpu-dev 0.5.30__tar.gz → 0.5.31__tar.gz

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
Files changed (131)
  1. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/PKG-INFO +1 -1
  2. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/PKG-INFO +1 -1
  3. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/SOURCES.txt +7 -0
  4. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/cli-tools/gpu-dev-cli/gpu_dev_cli/cli.py +263 -101
  5. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/cli-tools/gpu-dev-cli/gpu_dev_cli/config.py +1 -0
  6. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/cli-tools/gpu-dev-cli/gpu_dev_cli/interactive.py +192 -70
  7. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/cli-tools/gpu-dev-cli/gpu_dev_cli/reservations.py +29 -2
  8. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/cli-tools/gpu-dev-cli/gpu_dev_cli/ssh_proxy.py +4 -2
  9. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/pyproject.toml +1 -1
  10. gpu_dev-0.5.31/terraform-gpu-devservers/ami-baker.tf +125 -0
  11. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/availability.tf +11 -1
  12. gpu_dev-0.5.31/terraform-gpu-devservers/check_b200.py +21 -0
  13. gpu_dev-0.5.31/terraform-gpu-devservers/cluster-autoscaler.tf +47 -0
  14. gpu_dev-0.5.31/terraform-gpu-devservers/cmd_proxy.py +49 -0
  15. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/eks.tf +50 -11
  16. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/expiry.tf +3 -0
  17. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/gpu-dev-pod-irsa.tf +31 -0
  18. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/lambda/availability_updater/index.py +106 -18
  19. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/lambda/reservation_expiry/index.py +33 -3
  20. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/lambda/reservation_processor/index.py +135 -11
  21. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/lambda.tf +9 -3
  22. gpu_dev-0.5.31/terraform-gpu-devservers/list_b200.py +68 -0
  23. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/main.tf +6 -9
  24. gpu_dev-0.5.31/terraform-gpu-devservers/subnet-0fe3a2c45570091ad +0 -0
  25. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/templates/al2023-user-data.sh +1 -1
  26. gpu_dev-0.5.31/terraform-gpu-devservers/templates/ami-baker-user-data.sh +44 -0
  27. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/.github/workflows/no-gitlinks.yml +0 -0
  28. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/.github/workflows/publish.yml +0 -0
  29. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/.gitignore +0 -0
  30. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/CLAUDE.md +0 -0
  31. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/PROGRESS.md +0 -0
  32. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/PR_DESCRIPTION.md +0 -0
  33. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/README.md +0 -0
  34. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/TODO.md +0 -0
  35. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/admin/README.md +0 -0
  36. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/admin/generate_stats.py +0 -0
  37. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/admin/requirements.txt +0 -0
  38. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/cli-tools/gpu-dev-cli/README.md +0 -0
  39. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/cli-tools/gpu-dev-cli/ZERO_CONFIG_SETUP.md +0 -0
  40. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/dependency_links.txt +0 -0
  41. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/entry_points.txt +0 -0
  42. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/requires.txt +0 -0
  43. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/top_level.txt +0 -0
  44. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/cli-tools/gpu-dev-cli/gpu_dev_cli/__init__.py +0 -0
  45. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/cli-tools/gpu-dev-cli/gpu_dev_cli/auth.py +0 -0
  46. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/cli-tools/gpu-dev-cli/gpu_dev_cli/disks.py +0 -0
  47. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/cli-tools/gpu-dev-cli/gpu_dev_cli/name_generator.py +0 -0
  48. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/cli-tools/gpu-dev-cli/minimal-iam-policy.json +0 -0
  49. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/cli-tools/scripts/clear_stale_disk_locks.py +0 -0
  50. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/docs/USER_GUIDE.md +0 -0
  51. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/docs/devgpu-features.html +0 -0
  52. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/docs/docker-mark-blue.svg +0 -0
  53. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/docs/icons8-cursor-ai.svg +0 -0
  54. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/post.md +0 -0
  55. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/setup.cfg +0 -0
  56. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/.claude/skills/deploy.md +0 -0
  57. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/.terraform.lock.hcl +0 -0
  58. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/README.md +0 -0
  59. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/alb.tf +0 -0
  60. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/backend.tf +0 -0
  61. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/docker/.dockerignore +0 -0
  62. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/docker/Dockerfile +0 -0
  63. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/docker/backup-dotfiles +0 -0
  64. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/docker/bash_profile +0 -0
  65. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/docker/bashrc +0 -0
  66. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/docker/bashrc_ext +0 -0
  67. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/docker/build-with-efa.sh +0 -0
  68. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/docker/dotfiles-shutdown-handler +0 -0
  69. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/docker/list-dotfile-versions +0 -0
  70. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/docker/motd_script +0 -0
  71. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/docker/nproc_wrapper +0 -0
  72. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/docker/profile +0 -0
  73. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/docker/restore-dotfiles +0 -0
  74. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/docker/restore-dotfiles-version +0 -0
  75. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/docker/setup-dotfiles-persistence +0 -0
  76. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/docker/shell_env +0 -0
  77. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/docker/ssh_config +0 -0
  78. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/docker/zprofile +0 -0
  79. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/docker/zshrc +0 -0
  80. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/docker/zshrc_ext +0 -0
  81. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/docker-build.tf +0 -0
  82. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/docker-example/Dockerfile +0 -0
  83. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/docker-example/hello.txt +0 -0
  84. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/ecr.tf +0 -0
  85. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/efs.tf +0 -0
  86. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/git-cache.tf +0 -0
  87. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/kubernetes.tf +0 -0
  88. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/lambda/availability_updater/requirements.txt +0 -0
  89. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/lambda/migration/tag_largest_snapshots.py +0 -0
  90. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/lambda/reservation_expiry/requirements.txt +0 -0
  91. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/lambda/reservation_processor/buildkit_job.py +0 -0
  92. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/lambda/reservation_processor/requirements.txt +0 -0
  93. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/lambda/shared/__init__.py +0 -0
  94. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/lambda/shared/alb_utils.py +0 -0
  95. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/lambda/shared/dns_utils.py +0 -0
  96. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/lambda/shared/k8s_client.py +0 -0
  97. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/lambda/shared/k8s_resource_tracker.py +0 -0
  98. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/lambda/shared/requirements.txt +0 -0
  99. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/lambda/shared/snapshot_utils.py +0 -0
  100. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/mig-config.tf +0 -0
  101. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/mig-parted-config.yaml +0 -0
  102. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py +0 -0
  103. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py.bak +0 -0
  104. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/migrations/check_snapshots.py +0 -0
  105. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/migrations/migrate_disks_to_named.py +0 -0
  106. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/migrations/run_backfill.sh +0 -0
  107. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/monitoring.tf +0 -0
  108. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/node-termination-handler.tf +0 -0
  109. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/outputs.tf +0 -0
  110. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/pyproject.toml +0 -0
  111. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/queue.tf +0 -0
  112. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/route53.tf +0 -0
  113. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/s3-disk-contents.tf +0 -0
  114. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/scripts/CLEANUP_GUIDE.md +0 -0
  115. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/scripts/detect_empty_volumes.sh +0 -0
  116. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/scripts/ec2_avail_probe.sh +0 -0
  117. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/scripts/inspect_user_data.sh +0 -0
  118. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/ssh-proxy/Dockerfile +0 -0
  119. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/ssh-proxy/proxy.py +0 -0
  120. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/ssh-proxy/requirements.txt +0 -0
  121. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/ssh-proxy-service.tf +0 -0
  122. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/ssh-proxy.tf +0 -0
  123. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/switch-to.sh +0 -0
  124. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/templates/al2023-cpu-user-data.sh +0 -0
  125. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/templates/user-data-self-managed.sh +0 -0
  126. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/templates/user-data.sh +0 -0
  127. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/variables.tf +0 -0
  128. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/tests/submit/README.md +0 -0
  129. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/tests/submit/fail/run.sh +0 -0
  130. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/tests/submit/multinode/run.sh +0 -0
  131. {gpu_dev-0.5.30 → gpu_dev-0.5.31}/tests/submit/success/run.sh +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: gpu-dev
3
- Version: 0.5.30
3
+ Version: 0.5.31
4
4
  Summary: CLI tool for PyTorch GPU developer server reservations
5
5
  Author: PyTorch Team
6
6
  Requires-Python: >=3.10
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: gpu-dev
3
- Version: 0.5.30
3
+ Version: 0.5.31
4
4
  Summary: CLI tool for PyTorch GPU developer server reservations
5
5
  Author: PyTorch Team
6
6
  Requires-Python: >=3.10
@@ -37,8 +37,12 @@ docs/icons8-cursor-ai.svg
37
37
  terraform-gpu-devservers/.terraform.lock.hcl
38
38
  terraform-gpu-devservers/README.md
39
39
  terraform-gpu-devservers/alb.tf
40
+ terraform-gpu-devservers/ami-baker.tf
40
41
  terraform-gpu-devservers/availability.tf
41
42
  terraform-gpu-devservers/backend.tf
43
+ terraform-gpu-devservers/check_b200.py
44
+ terraform-gpu-devservers/cluster-autoscaler.tf
45
+ terraform-gpu-devservers/cmd_proxy.py
42
46
  terraform-gpu-devservers/docker-build.tf
43
47
  terraform-gpu-devservers/ecr.tf
44
48
  terraform-gpu-devservers/efs.tf
@@ -48,6 +52,7 @@ terraform-gpu-devservers/git-cache.tf
48
52
  terraform-gpu-devservers/gpu-dev-pod-irsa.tf
49
53
  terraform-gpu-devservers/kubernetes.tf
50
54
  terraform-gpu-devservers/lambda.tf
55
+ terraform-gpu-devservers/list_b200.py
51
56
  terraform-gpu-devservers/main.tf
52
57
  terraform-gpu-devservers/mig-config.tf
53
58
  terraform-gpu-devservers/mig-parted-config.yaml
@@ -60,6 +65,7 @@ terraform-gpu-devservers/route53.tf
60
65
  terraform-gpu-devservers/s3-disk-contents.tf
61
66
  terraform-gpu-devservers/ssh-proxy-service.tf
62
67
  terraform-gpu-devservers/ssh-proxy.tf
68
+ terraform-gpu-devservers/subnet-0fe3a2c45570091ad
63
69
  terraform-gpu-devservers/switch-to.sh
64
70
  terraform-gpu-devservers/variables.tf
65
71
  terraform-gpu-devservers/.claude/skills/deploy.md
@@ -114,6 +120,7 @@ terraform-gpu-devservers/ssh-proxy/proxy.py
114
120
  terraform-gpu-devservers/ssh-proxy/requirements.txt
115
121
  terraform-gpu-devservers/templates/al2023-cpu-user-data.sh
116
122
  terraform-gpu-devservers/templates/al2023-user-data.sh
123
+ terraform-gpu-devservers/templates/ami-baker-user-data.sh
117
124
  terraform-gpu-devservers/templates/user-data-self-managed.sh
118
125
  terraform-gpu-devservers/templates/user-data.sh
119
126
  tests/submit/README.md
@@ -41,6 +41,36 @@ from .interactive import (
41
41
  console = Console()
42
42
 
43
43
 
44
+ def _fetch_reservations_cross_region(reservation_mgr, user_filter, statuses, config=None):
45
+ """Fetch reservations from current region + prod-east1 if on prod."""
46
+ reservations = reservation_mgr.list_reservations(
47
+ user_filter=user_filter, statuses_to_include=statuses)
48
+ # Cross-region fetch
49
+ try:
50
+ cfg = config or load_config()
51
+ if cfg.user_config.get("environment") == "prod":
52
+ east1_env = Config.ENVIRONMENTS.get("prod-east1", {})
53
+ if east1_env:
54
+ import boto3 as _b3
55
+ east1_ddb = _b3.resource("dynamodb", region_name=east1_env["region"])
56
+ east1_table = east1_ddb.Table("pytorch-gpu-dev-reservations")
57
+ for st in (statuses or ["active"]):
58
+ resp = east1_table.query(
59
+ IndexName="StatusIndex",
60
+ KeyConditionExpression="#s = :status",
61
+ ExpressionAttributeNames={"#s": "status"},
62
+ ExpressionAttributeValues={":status": st},
63
+ )
64
+ for item in resp.get("Items", []):
65
+ if user_filter and item.get("user_id") != user_filter:
66
+ continue
67
+ item["_region"] = "us-east-1"
68
+ reservations.append(item)
69
+ except Exception:
70
+ pass
71
+ return reservations
72
+
73
+
44
74
  def _format_relative_time(timestamp_str: str, relative_to: str = "now") -> str:
45
75
  """Format timestamp as relative time if within 24h, otherwise absolute"""
46
76
  if not timestamp_str or timestamp_str == "N/A":
@@ -598,6 +628,7 @@ def reserve(
598
628
  preserve_entrypoint: bool,
599
629
  disk: Optional[str],
600
630
  node_label: tuple,
631
+ spot: bool = False,
601
632
  ) -> None:
602
633
  """Reserve GPU development server(s)
603
634
 
@@ -688,6 +719,11 @@ def reserve(
688
719
  rprint(
689
720
  "[dim]Use --no-interactive flag to disable interactive mode[/dim]\n")
690
721
 
722
+ # Auto-acknowledge spot in spot-only environments so users don't need --spot
723
+ from .config import Config as _Cfg
724
+ _env_name = load_config().user_config.get("environment", "prod")
725
+ _spot_types_env = _Cfg.ENVIRONMENTS.get(_env_name, {}).get("spot_types", [])
726
+
691
727
  # Run auth + SSH validation + availability fetch in parallel — they're independent
692
728
  # and total wall-clock time drops from sum to max(each).
693
729
  from concurrent.futures import ThreadPoolExecutor
@@ -748,6 +784,31 @@ def reserve(
748
784
  rprint("[yellow]Reservation cancelled.[/yellow]")
749
785
  return
750
786
 
787
+ # Handle spot: prefix from cross-region selection — use a TEMPORARY config
788
+ # for prod-east1 without persisting the environment change to disk.
789
+ if isinstance(gpu_type, str) and gpu_type.startswith("spot:"):
790
+ gpu_type = gpu_type[5:] # strip prefix
791
+ spot = True
792
+ rprint(f"\n[cyan]⚡ Switching to spot cluster (us-east-1) for {gpu_type.upper()}[/cyan]")
793
+ rprint("[dim]Spot instance: ~70% cheaper, may be preempted, separate disks.[/dim]\n")
794
+ # Build a temporary Config pointing at prod-east1 WITHOUT touching disk
795
+ import os as _os
796
+ east1_cfg = Config.ENVIRONMENTS.get("prod-east1", {})
797
+ _os.environ["AWS_DEFAULT_REGION"] = east1_cfg["region"]
798
+ config = Config()
799
+ config.aws_region = east1_cfg["region"]
800
+ reservation_mgr = ReservationManager(config)
801
+ try:
802
+ user_info = authenticate_user(config)
803
+ except RuntimeError as e:
804
+ rprint(f"[red]❌ {str(e)}[/red]")
805
+ return
806
+
807
+ # Auto-acknowledge spot for spot types in this environment
808
+ if _spot_types_env and gpu_type and gpu_type.lower() in _spot_types_env and not spot:
809
+ spot = True
810
+ rprint(f"[dim]{gpu_type.upper()} is a spot instance in this environment — --spot auto-acknowledged. May be preempted by AWS.[/dim]")
811
+
751
812
  # Interactive GPU count selection
752
813
  if gpus is None:
753
814
  gpu_type_lower = gpu_type.lower()
@@ -1746,13 +1807,47 @@ def list(ctx: click.Context, user: Optional[str], status: Optional[str], details
1746
1807
  def fetch_recent_failures():
1747
1808
  return reservation_mgr.list_reservations(
1748
1809
  user_filter=user_filter,
1749
- statuses_to_include=["failed", "cancelled"],
1810
+ statuses_to_include=["failed", "cancelled", "expired"],
1750
1811
  created_after=one_hour_ago)
1751
1812
 
1752
- with ThreadPoolExecutor(max_workers=2) as executor:
1813
+ # Also fetch from prod-east1 (cross-region) if we're on prod
1814
+ def fetch_east1():
1815
+ try:
1816
+ east1_env = Config.ENVIRONMENTS.get("prod-east1", {})
1817
+ if not east1_env or config.user_config.get("environment") != "prod":
1818
+ return []
1819
+ import boto3 as _b3
1820
+ east1_ddb = _b3.resource("dynamodb", region_name=east1_env["region"])
1821
+ east1_table = east1_ddb.Table("pytorch-gpu-dev-reservations")
1822
+ results = []
1823
+ # Fetch active + recent failures/expired (last 24h) from east1
1824
+ all_statuses = (statuses_to_include or ["active", "preparing", "queued", "pending"]) + ["failed", "expired", "cancelled"]
1825
+ for s in all_statuses:
1826
+ resp = east1_table.query(
1827
+ IndexName="StatusIndex",
1828
+ KeyConditionExpression="#s = :status",
1829
+ ExpressionAttributeNames={"#s": "status"},
1830
+ ExpressionAttributeValues={":status": s},
1831
+ )
1832
+ for item in resp.get("Items", []):
1833
+ if user_filter and item.get("user_id") != user_filter:
1834
+ continue
1835
+ # For failed/expired/cancelled, only show if ended recently
1836
+ if s in ("failed", "expired", "cancelled"):
1837
+ ended = item.get("reservation_ended") or item.get("expired_at") or item.get("created_at", "")
1838
+ if ended and ended < one_hour_ago:
1839
+ continue
1840
+ item["_region"] = "us-east-1"
1841
+ results.append(item)
1842
+ return results
1843
+ except Exception:
1844
+ return []
1845
+
1846
+ with ThreadPoolExecutor(max_workers=3) as executor:
1753
1847
  active_future = executor.submit(fetch_active)
1754
1848
  failures_future = executor.submit(fetch_recent_failures)
1755
- reservations = active_future.result() + failures_future.result()
1849
+ east1_future = executor.submit(fetch_east1)
1850
+ reservations = active_future.result() + failures_future.result() + east1_future.result()
1756
1851
  else:
1757
1852
  reservations = reservation_mgr.list_reservations(
1758
1853
  user_filter=user_filter, statuses_to_include=statuses_to_include
@@ -1787,6 +1882,9 @@ def list(ctx: click.Context, user: Optional[str], status: Optional[str], details
1787
1882
  reservations = sorted(reservations, key=sort_key)
1788
1883
 
1789
1884
  # Create table with enhanced columns for queue info
1885
+ # Check if we have cross-region reservations
1886
+ _has_east1 = any(r.get("_region") == "us-east-1" for r in reservations)
1887
+
1790
1888
  table = Table(title="GPU Reservations")
1791
1889
  table.add_column("ID", style="cyan", no_wrap=True)
1792
1890
  table.add_column("User", style="green")
@@ -1796,6 +1894,8 @@ def list(ctx: click.Context, user: Optional[str], status: Optional[str], details
1796
1894
  table.add_column("Queue Info", style="cyan")
1797
1895
  table.add_column("Created", style="blue")
1798
1896
  table.add_column("Expires/ETA", style="red")
1897
+ if _has_east1:
1898
+ table.add_column("Region", style="dim")
1799
1899
  if details:
1800
1900
  table.add_column("CLI Ver", style="dim", no_wrap=True)
1801
1901
  table.add_column("Lambda Ver", style="dim", no_wrap=True)
@@ -1842,6 +1942,26 @@ def list(ctx: click.Context, user: Optional[str], status: Optional[str], details
1842
1942
  expires_formatted = f"~{estimated_wait}min"
1843
1943
  else:
1844
1944
  expires_formatted = "Calculating..."
1945
+ elif res_status in ("expired", "failed", "cancelled"):
1946
+ reason = reservation.get("failure_reason", "")
1947
+ ended = reservation.get("reservation_ended") or reservation.get("expired_at", "")
1948
+ ended_str = ""
1949
+ if ended:
1950
+ try:
1951
+ from datetime import datetime, timezone
1952
+ ended_dt = datetime.fromisoformat(ended.replace("Z", "+00:00"))
1953
+ ended_str = ended_dt.astimezone().strftime("%H:%M")
1954
+ except Exception:
1955
+ pass
1956
+ if "preempted" in reason.lower():
1957
+ expires_formatted = f"Preempted{' @' + ended_str if ended_str else ''}"
1958
+ elif res_status == "cancelled":
1959
+ expires_formatted = f"Cancelled{' @' + ended_str if ended_str else ''}"
1960
+ elif reason:
1961
+ short = reason.split("\n")[0][:20]
1962
+ expires_formatted = short
1963
+ else:
1964
+ expires_formatted = res_status.capitalize()
1845
1965
  else:
1846
1966
  expires_formatted = "N/A"
1847
1967
 
@@ -1979,6 +2099,10 @@ def list(ctx: click.Context, user: Optional[str], status: Optional[str], details
1979
2099
  row_data.append(
1980
2100
  f"[dim]{lambda_version_display}[/dim]" if dim_row else lambda_version_display)
1981
2101
 
2102
+ if _has_east1:
2103
+ region = reservation.get("_region", "us-east-2")
2104
+ row_data.append("[yellow]east1[/yellow]" if region == "us-east-1" else "prod")
2105
+
1982
2106
  table.add_row(*row_data)
1983
2107
 
1984
2108
  except Exception as row_error:
@@ -2309,12 +2433,10 @@ def cancel(
2309
2433
 
2310
2434
  reservation_mgr = ReservationManager(config)
2311
2435
 
2312
- # Get cancellable reservations
2313
- reservations = reservation_mgr.list_reservations(
2314
- user_filter=user_info["user_id"],
2315
- statuses_to_include=[
2316
- "active", "queued", "pending", "preparing"],
2317
- )
2436
+ # Get cancellable reservations (cross-region)
2437
+ reservations = _fetch_reservations_cross_region(
2438
+ reservation_mgr, user_info["user_id"],
2439
+ ["active", "queued", "pending", "preparing"], config)
2318
2440
 
2319
2441
  live.stop()
2320
2442
 
@@ -2720,7 +2842,25 @@ def _show_availability() -> None:
2720
2842
  rprint(f"[red]❌ {str(e)}[/red]")
2721
2843
  return
2722
2844
 
2723
- # Stop spinner after getting results
2845
+ # Cross-region: fetch spot availability from prod-east1
2846
+ spot_region_info = {}
2847
+ _env_name = config.user_config.get("environment", "prod")
2848
+ _east1_spot_types = frozenset(Config.ENVIRONMENTS.get("prod-east1", {}).get("spot_types", []))
2849
+ if _env_name == "prod" and _east1_spot_types:
2850
+ try:
2851
+ import boto3 as _b3
2852
+ east1_r = Config.ENVIRONMENTS["prod-east1"]["region"]
2853
+ for item in _b3.resource("dynamodb", region_name=east1_r).Table("pytorch-gpu-dev-gpu-availability").scan().get("Items", []):
2854
+ gt = item.get("gpu_type", "")
2855
+ if gt in _east1_spot_types:
2856
+ spot_region_info[gt] = {
2857
+ "available": int(item.get("available_gpus", 0)),
2858
+ "total": int(item.get("total_gpus", 0)),
2859
+ "max_reservable": int(item.get("max_reservable", 0)),
2860
+ "spot_info": item.get("spot_info", {}),
2861
+ }
2862
+ except Exception:
2863
+ pass
2724
2864
 
2725
2865
  if availability_info:
2726
2866
  # GPU architecture mapping (for display)
@@ -2762,84 +2902,99 @@ def _show_availability() -> None:
2762
2902
  "CPU (arm64)": 6,
2763
2903
  }
2764
2904
 
2765
- # Sort GPU types by architecture priority, then by name
2766
- sorted_gpu_types = sorted(
2767
- availability_info.items(),
2768
- key=lambda x: (
2769
- arch_priority.get(
2770
- gpu_architectures.get(x[0], "Unknown"), 99),
2771
- x[0]
2772
- )
2773
- )
2774
-
2775
- table = Table(
2776
- title="GPU Availability by Type (numbers are GPUs, not nodes)")
2777
- table.add_column("GPU Type", style="cyan")
2778
- table.add_column("Avail", style="green")
2779
- table.add_column("Max\nReservable", style="bright_green")
2780
- table.add_column("Total", style="blue")
2781
- table.add_column("Queue\nLength", style="yellow")
2782
- table.add_column("Architecture", style="dim")
2783
- table.add_column("Est. Wait Time", style="magenta")
2784
-
2785
- last_arch = None
2786
- for gpu_type, info in sorted_gpu_types:
2787
- arch = gpu_architectures.get(gpu_type, "Unknown")
2788
-
2789
- # Add separator before CPU section
2790
- if last_arch and not last_arch.startswith("CPU") and arch.startswith("CPU"):
2791
- table.add_row("---", "---", "---",
2792
- "---", "---", "---", "---")
2793
-
2794
- last_arch = arch
2795
- available = info.get("available", 0)
2796
- max_reservable = info.get("max_reservable", 0)
2797
- total = info.get("total", 0)
2798
- full_nodes_available = info.get("full_nodes_available", 0)
2799
- gpus_per_instance = info.get("gpus_per_instance", 0)
2800
- queue_length = info.get("queue_length", 0)
2801
- est_wait = info.get("estimated_wait_minutes", 0)
2802
-
2803
- # Format wait time
2804
- if available > 0:
2805
- wait_display = "Available now"
2806
- elif est_wait == 0:
2807
- wait_display = "Unknown"
2808
- elif est_wait < 60:
2809
- wait_display = f"{int(est_wait)}min"
2810
- else:
2811
- hours = int(est_wait // 60)
2812
- minutes = int(est_wait % 60)
2813
- if minutes == 0:
2814
- wait_display = f"{hours}h"
2905
+ # Split into categories
2906
+ full_types = {k: v for k, v in availability_info.items() if "mig" not in k}
2907
+ mig_types = {k: v for k, v in availability_info.items() if "mig" in k}
2908
+
2909
+ def _sort_by_arch(items):
2910
+ return sorted(items.items(), key=lambda x: (
2911
+ arch_priority.get(gpu_architectures.get(x[0], "Unknown"), 99), x[0]))
2912
+
2913
+ def _fmt_wait(available, est_wait):
2914
+ if available > 0: return "Available now"
2915
+ if not est_wait: return "Unknown"
2916
+ if est_wait < 60: return f"{int(est_wait)}min"
2917
+ h, m = int(est_wait // 60), int(est_wait % 60)
2918
+ return f"{h}h{f' {m}min' if m else ''}"
2919
+
2920
+ def _build_avail_table(title, items):
2921
+ t = Table(title=title)
2922
+ t.add_column("GPU Type", style="cyan")
2923
+ t.add_column("Avail", style="green")
2924
+ t.add_column("Max\nReservable", style="bright_green")
2925
+ t.add_column("Total", style="blue")
2926
+ t.add_column("Queue\nLength", style="yellow")
2927
+ t.add_column("Architecture", style="dim")
2928
+ t.add_column("Est. Wait Time", style="magenta")
2929
+ for gpu_type, info in _sort_by_arch(items):
2930
+ avail = info.get("available", 0)
2931
+ maint = info.get("maintenance", False)
2932
+ maint_reason = info.get("maintenance_reason", "")
2933
+ fn = info.get("full_nodes_available", 0)
2934
+ if maint:
2935
+ ad = "[red]MAINTENANCE[/red]"
2936
+ wd = maint_reason or "Under maintenance"
2937
+ elif avail == 0:
2938
+ ad = f"[red]{avail}[/red]"
2939
+ wd = _fmt_wait(avail, info.get("estimated_wait_minutes", 0))
2940
+ elif fn > 0:
2941
+ ad = f"[green]{avail}[/green]"
2942
+ wd = _fmt_wait(avail, info.get("estimated_wait_minutes", 0))
2815
2943
  else:
2816
- wait_display = f"{hours}h {minutes}min"
2817
-
2818
- # Check maintenance mode
2819
- is_maintenance = info.get("maintenance", False)
2820
- maintenance_reason = info.get("maintenance_reason", "")
2821
-
2822
- if is_maintenance:
2823
- available_display = f"[red]MAINTENANCE[/red]"
2824
- wait_display = maintenance_reason or "Under maintenance"
2825
- elif available == 0:
2826
- available_display = f"[red]{available}[/red]"
2827
- elif full_nodes_available > 0:
2828
- available_display = f"[green]{available}[/green]"
2829
- else:
2830
- available_display = f"[yellow]{available}[/yellow]"
2831
-
2832
- table.add_row(
2833
- gpu_type.upper(),
2834
- available_display,
2835
- str(max_reservable) if not is_maintenance else "-",
2836
- str(total),
2837
- str(queue_length) if not is_maintenance else "-",
2838
- arch,
2839
- wait_display,
2840
- )
2841
-
2842
- console.print(table)
2944
+ ad = f"[yellow]{avail}[/yellow]"
2945
+ wd = _fmt_wait(avail, info.get("estimated_wait_minutes", 0))
2946
+ t.add_row(
2947
+ gpu_type.upper(), ad,
2948
+ "-" if maint else str(info.get("max_reservable", 0)),
2949
+ str(info.get("total", 0)),
2950
+ "-" if maint else str(info.get("queue_length", 0)),
2951
+ gpu_architectures.get(gpu_type, "Unknown"), wd)
2952
+ console.print(t)
2953
+
2954
+ # Section 1: Full GPUs & CPUs
2955
+ _build_avail_table("━━━ Full GPUs & CPUs ━━━", full_types)
2956
+
2957
+ # Section 2: MIG Slices
2958
+ if mig_types:
2959
+ rprint("[dim] Sliced GPUs — isolated fractions of a physical GPU, perfect for smaller jobs.[/dim]")
2960
+ _build_avail_table("━━━ 🔬 MIG Slices ━━━", mig_types)
2961
+
2962
+ # Spot section from prod-east1
2963
+ if spot_region_info:
2964
+ # Spot GPU configs for max reservable (what you CAN get per node)
2965
+ spot_gpus_per_node = {
2966
+ "b300": 8, "b200": 8, "h200": 8, "h100": 8, "a100": 8,
2967
+ "t4": 4, "l4": 4,
2968
+ }
2969
+ spot_table = Table(title="⚡ Spot Instances (us-east-1, ~70% cheaper)")
2970
+ spot_table.add_column("GPU Type", style="cyan")
2971
+ spot_table.add_column("Avail\nNow", style="green")
2972
+ spot_table.add_column("Per\nNode", style="bright_green")
2973
+ spot_table.add_column("Status", style="magenta")
2974
+ spot_table.add_column("Availability", style="dim")
2975
+ _on_demand = {"b300": 95, "b200": 95, "h200": 55, "h100": 98, "a100": 32, "t4": 4.5, "l4": 7}
2976
+ for gt, info in sorted(spot_region_info.items()):
2977
+ avail = info.get("available", 0)
2978
+ per_node = spot_gpus_per_node.get(gt, 8)
2979
+ avail_display = f"[green]{avail}[/green]" if avail > 0 else f"[dim]0[/dim]"
2980
+ status = "[green]Node up[/green]" if avail > 0 else "Spins up on reserve (~10 min)"
2981
+ si = info.get("spot_info", {}) or {}
2982
+ sp = si.get("spot_price", "") if isinstance(si, dict) else ""
2983
+ if not sp or (isinstance(si, dict) and "No spot data" in str(si.get("spot_signal", ""))):
2984
+ avail_signal = "[red]Not offered[/red]"
2985
+ else:
2986
+ try:
2987
+ ratio = float(sp) / _on_demand.get(gt, 50)
2988
+ pct = int((1 - ratio) * 100)
2989
+ if ratio < 0.4: avail_signal = f"[green]High ({pct}% off)[/green]"
2990
+ elif ratio < 0.7: avail_signal = f"[yellow]Medium ({pct}% off)[/yellow]"
2991
+ else: avail_signal = f"[red]Low ({pct}% off)[/red]"
2992
+ except (ValueError, TypeError):
2993
+ avail_signal = "[yellow]Unknown[/yellow]"
2994
+ spot_table.add_row(f"{gt.upper()} *", avail_display, str(per_node), status, avail_signal)
2995
+ console.print(spot_table)
2996
+ rprint("[dim]* = spot: ~70% cheaper, AWS can reclaim with 2-min notice, fulfillment not guaranteed.[/dim]")
2997
+ rprint("[dim] Separate cluster (us-east-1) with separate disks. Select via gpu-dev reserve (interactive).[/dim]")
2843
2998
 
2844
2999
  # Show color legend
2845
3000
  rprint("\n[bold]Availability legend:[/bold]")
@@ -2847,7 +3002,7 @@ def _show_availability() -> None:
2847
3002
 
2848
3003
  # Show usage tip
2849
3004
  rprint(
2850
- "\n[dim]💡 Use 'gpu-dev reserve --gpu-type <type>' to reserve GPUs of a specific type[/dim]"
3005
+ "\n[dim]💡 Use 'gpu-dev reserve' (interactive) to see all options including MIG slices and spot instances[/dim]"
2851
3006
  )
2852
3007
 
2853
3008
  else:
@@ -2858,6 +3013,9 @@ def _show_availability() -> None:
2858
3013
 
2859
3014
 
2860
3015
  def _show_availability_watch(interval: int) -> None:
3016
+ _env_name = load_config().user_config.get("environment", "prod")
3017
+ _spot_types = frozenset(Config.ENVIRONMENTS.get(_env_name, {}).get("spot_types", []))
3018
+
2861
3019
  """Watch mode for GPU availability with auto-refresh"""
2862
3020
  import time
2863
3021
  from datetime import datetime
@@ -2990,8 +3148,9 @@ def _show_availability_watch(interval: int) -> None:
2990
3148
  else:
2991
3149
  available_display = f"[yellow]{available}[/yellow]"
2992
3150
 
3151
+ type_label = f"{gpu_type.upper()} *" if gpu_type in _spot_types else gpu_type.upper()
2993
3152
  table.add_row(
2994
- gpu_type.upper(),
3153
+ type_label,
2995
3154
  available_display,
2996
3155
  str(max_reservable) if not is_maintenance else "-",
2997
3156
  str(total),
@@ -3079,10 +3238,8 @@ def connect(ctx: click.Context, reservation_id: Optional[str]) -> None:
3079
3238
 
3080
3239
  # If no reservation ID provided, show interactive selection
3081
3240
  if reservation_id is None:
3082
- reservations = reservation_mgr.list_reservations(
3083
- user_filter=user_info["user_id"],
3084
- statuses_to_include=["active"]
3085
- )
3241
+ reservations = _fetch_reservations_cross_region(
3242
+ reservation_mgr, user_info["user_id"], ["active"], config)
3086
3243
 
3087
3244
  live.stop()
3088
3245
 
@@ -3109,6 +3266,16 @@ def connect(ctx: click.Context, reservation_id: Optional[str]) -> None:
3109
3266
 
3110
3267
  live.start()
3111
3268
 
3269
+ # If the selected reservation is from east1, switch to east1 reservation_mgr
3270
+ _sel = next((r for r in (locals().get("reservations") or []) if r.get("reservation_id", "").startswith(reservation_id)), None)
3271
+ if _sel and _sel.get("_region") == "us-east-1":
3272
+ import os as _os
3273
+ east1_cfg = Config.ENVIRONMENTS.get("prod-east1", {})
3274
+ _os.environ["AWS_DEFAULT_REGION"] = east1_cfg["region"]
3275
+ _east1_config = Config()
3276
+ _east1_config.aws_region = east1_cfg["region"]
3277
+ reservation_mgr = ReservationManager(_east1_config)
3278
+
3112
3279
  # Get connection info
3113
3280
  connection_info = reservation_mgr.get_connection_info(
3114
3281
  reservation_id, user_info["user_id"]
@@ -3320,10 +3487,8 @@ def get_ssh_config_cmd(ctx: click.Context, reservation_id: Optional[str]) -> Non
3320
3487
 
3321
3488
  # If no reservation ID provided, show interactive selection
3322
3489
  if reservation_id is None:
3323
- reservations = reservation_mgr.list_reservations(
3324
- user_filter=user_info["user_id"],
3325
- statuses_to_include=["active"]
3326
- )
3490
+ reservations = _fetch_reservations_cross_region(
3491
+ reservation_mgr, user_info["user_id"], ["active"], config)
3327
3492
 
3328
3493
  live.stop()
3329
3494
 
@@ -3698,10 +3863,7 @@ def environment(env_name: str) -> None:
3698
3863
  rprint(f"[dim]Configuration saved to {cfg.CONFIG_FILE}[/dim]")
3699
3864
 
3700
3865
  # Instructions for shell export
3701
- rprint(f"\n[yellow]💡 To apply in your current shell:[/yellow]")
3702
- rprint(f" export AWS_DEFAULT_REGION={env_config['region']}")
3703
- rprint(f"\n[yellow]💡 Or use the switch-to.sh script:[/yellow]")
3704
- rprint(f" ./switch-to.sh {env_name}")
3866
+ rprint(f"\n[dim]Region saved. All gpu-dev commands now target {env_config['region']}.[/dim]")
3705
3867
 
3706
3868
  except Exception as e:
3707
3869
  rprint(f"[red]❌ Error setting environment: {str(e)}[/red]")
@@ -26,6 +26,7 @@ class Config:
26
26
  "region": "us-east-1",
27
27
  "workspace": "prod-east1",
28
28
  "description": "Spot-only us-east-1 environment (T4/L4/CPU)",
29
+ "spot_types": ["b300", "b200", "h200", "h100", "a100"],
29
30
  },
30
31
  }
31
32
  DEFAULT_ENVIRONMENT = "prod"