gpu-dev 0.3.8__tar.gz → 0.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (116) hide show
  1. gpu_dev-0.4.0/.github/workflows/no-gitlinks.yml +22 -0
  2. {gpu_dev-0.3.8 → gpu_dev-0.4.0}/.gitignore +2 -0
  3. {gpu_dev-0.3.8 → gpu_dev-0.4.0}/CLAUDE.md +27 -0
  4. {gpu_dev-0.3.8/cli-tools/gpu-dev-cli/gpu_dev.egg-info → gpu_dev-0.4.0}/PKG-INFO +2 -2
  5. {gpu_dev-0.3.8 → gpu_dev-0.4.0/cli-tools/gpu-dev-cli/gpu_dev.egg-info}/PKG-INFO +2 -2
  6. {gpu_dev-0.3.8 → gpu_dev-0.4.0}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/SOURCES.txt +1 -0
  7. {gpu_dev-0.3.8 → gpu_dev-0.4.0}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/requires.txt +1 -1
  8. {gpu_dev-0.3.8 → gpu_dev-0.4.0}/cli-tools/gpu-dev-cli/gpu_dev_cli/auth.py +0 -1
  9. {gpu_dev-0.3.8 → gpu_dev-0.4.0}/cli-tools/gpu-dev-cli/gpu_dev_cli/cli.py +85 -72
  10. {gpu_dev-0.3.8 → gpu_dev-0.4.0}/cli-tools/gpu-dev-cli/gpu_dev_cli/config.py +9 -9
  11. {gpu_dev-0.3.8 → gpu_dev-0.4.0}/cli-tools/gpu-dev-cli/gpu_dev_cli/interactive.py +23 -10
  12. {gpu_dev-0.3.8 → gpu_dev-0.4.0}/cli-tools/gpu-dev-cli/gpu_dev_cli/reservations.py +103 -159
  13. gpu_dev-0.4.0/cli-tools/gpu-dev-cli/gpu_dev_cli/ssh_proxy.py +142 -0
  14. {gpu_dev-0.3.8 → gpu_dev-0.4.0}/pyproject.toml +2 -2
  15. {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/alb.tf +1 -0
  16. {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/docker/Dockerfile +28 -64
  17. {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/docker/build-with-efa.sh +16 -10
  18. {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/docker/shell_env +7 -3
  19. {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/eks.tf +103 -61
  20. {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/lambda/availability_updater/index.py +27 -21
  21. {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/lambda/reservation_processor/index.py +539 -184
  22. {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/lambda/shared/dns_utils.py +53 -19
  23. {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/lambda.tf +2 -2
  24. {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/main.tf +166 -31
  25. {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/queue.tf +7 -0
  26. {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/ssh-proxy/proxy.py +4 -1
  27. {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/ssh-proxy-service.tf +1 -1
  28. {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/templates/al2023-user-data.sh +53 -3
  29. {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/variables.tf +12 -11
  30. gpu_dev-0.3.8/cli-tools/gpu-dev-cli/gpu_dev_cli/ssh_proxy.py +0 -106
  31. {gpu_dev-0.3.8 → gpu_dev-0.4.0}/.github/workflows/publish.yml +0 -0
  32. {gpu_dev-0.3.8 → gpu_dev-0.4.0}/PROGRESS.md +0 -0
  33. {gpu_dev-0.3.8 → gpu_dev-0.4.0}/PR_DESCRIPTION.md +0 -0
  34. {gpu_dev-0.3.8 → gpu_dev-0.4.0}/TODO.md +0 -0
  35. {gpu_dev-0.3.8 → gpu_dev-0.4.0}/admin/README.md +0 -0
  36. {gpu_dev-0.3.8 → gpu_dev-0.4.0}/admin/generate_stats.py +0 -0
  37. {gpu_dev-0.3.8 → gpu_dev-0.4.0}/admin/requirements.txt +0 -0
  38. {gpu_dev-0.3.8 → gpu_dev-0.4.0}/cli-tools/gpu-dev-cli/README.md +0 -0
  39. {gpu_dev-0.3.8 → gpu_dev-0.4.0}/cli-tools/gpu-dev-cli/ZERO_CONFIG_SETUP.md +0 -0
  40. {gpu_dev-0.3.8 → gpu_dev-0.4.0}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/dependency_links.txt +0 -0
  41. {gpu_dev-0.3.8 → gpu_dev-0.4.0}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/entry_points.txt +0 -0
  42. {gpu_dev-0.3.8 → gpu_dev-0.4.0}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/top_level.txt +0 -0
  43. {gpu_dev-0.3.8 → gpu_dev-0.4.0}/cli-tools/gpu-dev-cli/gpu_dev_cli/__init__.py +0 -0
  44. {gpu_dev-0.3.8 → gpu_dev-0.4.0}/cli-tools/gpu-dev-cli/gpu_dev_cli/disks.py +0 -0
  45. {gpu_dev-0.3.8 → gpu_dev-0.4.0}/cli-tools/gpu-dev-cli/gpu_dev_cli/name_generator.py +0 -0
  46. {gpu_dev-0.3.8 → gpu_dev-0.4.0}/cli-tools/gpu-dev-cli/minimal-iam-policy.json +0 -0
  47. {gpu_dev-0.3.8 → gpu_dev-0.4.0}/cli-tools/scripts/clear_stale_disk_locks.py +0 -0
  48. {gpu_dev-0.3.8 → gpu_dev-0.4.0}/docs/USER_GUIDE.md +0 -0
  49. {gpu_dev-0.3.8 → gpu_dev-0.4.0}/docs/devgpu-features.html +0 -0
  50. {gpu_dev-0.3.8 → gpu_dev-0.4.0}/docs/docker-mark-blue.svg +0 -0
  51. {gpu_dev-0.3.8 → gpu_dev-0.4.0}/docs/icons8-cursor-ai.svg +0 -0
  52. {gpu_dev-0.3.8 → gpu_dev-0.4.0}/post.md +0 -0
  53. {gpu_dev-0.3.8 → gpu_dev-0.4.0}/setup.cfg +0 -0
  54. {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/.claude/skills/deploy.md +0 -0
  55. {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/.terraform.lock.hcl +0 -0
  56. {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/README.md +0 -0
  57. {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/availability.tf +0 -0
  58. {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/backend.tf +0 -0
  59. {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/docker/.dockerignore +0 -0
  60. {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/docker/backup-dotfiles +0 -0
  61. {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/docker/bash_profile +0 -0
  62. {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/docker/bashrc +0 -0
  63. {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/docker/bashrc_ext +0 -0
  64. {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/docker/dotfiles-shutdown-handler +0 -0
  65. {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/docker/list-dotfile-versions +0 -0
  66. {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/docker/motd_script +0 -0
  67. {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/docker/nproc_wrapper +0 -0
  68. {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/docker/profile +0 -0
  69. {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/docker/restore-dotfiles +0 -0
  70. {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/docker/restore-dotfiles-version +0 -0
  71. {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/docker/setup-dotfiles-persistence +0 -0
  72. {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/docker/ssh_config +0 -0
  73. {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/docker/zprofile +0 -0
  74. {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/docker/zshrc +0 -0
  75. {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/docker/zshrc_ext +0 -0
  76. {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/docker-build.tf +0 -0
  77. {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/docker-example/Dockerfile +0 -0
  78. {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/docker-example/hello.txt +0 -0
  79. {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/ecr.tf +0 -0
  80. {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/efs.tf +0 -0
  81. {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/expiry.tf +0 -0
  82. {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/git-cache.tf +0 -0
  83. {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/kubernetes.tf +0 -0
  84. {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/lambda/availability_updater/requirements.txt +0 -0
  85. {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/lambda/migration/tag_largest_snapshots.py +0 -0
  86. {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/lambda/reservation_expiry/index.py +0 -0
  87. {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/lambda/reservation_expiry/requirements.txt +0 -0
  88. {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/lambda/reservation_processor/buildkit_job.py +0 -0
  89. {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/lambda/reservation_processor/requirements.txt +0 -0
  90. {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/lambda/shared/__init__.py +0 -0
  91. {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/lambda/shared/alb_utils.py +0 -0
  92. {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/lambda/shared/k8s_client.py +0 -0
  93. {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/lambda/shared/k8s_resource_tracker.py +0 -0
  94. {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/lambda/shared/requirements.txt +0 -0
  95. {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/lambda/shared/snapshot_utils.py +0 -0
  96. {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py +0 -0
  97. {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py.bak +0 -0
  98. {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/migrations/check_snapshots.py +0 -0
  99. {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/migrations/migrate_disks_to_named.py +0 -0
  100. {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/migrations/run_backfill.sh +0 -0
  101. {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/monitoring.tf +0 -0
  102. {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/outputs.tf +0 -0
  103. {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/pyproject.toml +0 -0
  104. {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/route53.tf +0 -0
  105. {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/s3-disk-contents.tf +0 -0
  106. {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/scripts/CLEANUP_GUIDE.md +0 -0
  107. {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/scripts/detect_empty_volumes.sh +0 -0
  108. {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/scripts/ec2_avail_probe.sh +0 -0
  109. {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/scripts/inspect_user_data.sh +0 -0
  110. {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/ssh-proxy/Dockerfile +0 -0
  111. {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/ssh-proxy/requirements.txt +0 -0
  112. {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/ssh-proxy.tf +0 -0
  113. {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/switch-to.sh +0 -0
  114. {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/templates/al2023-cpu-user-data.sh +0 -0
  115. {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/templates/user-data-self-managed.sh +0 -0
  116. {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/templates/user-data.sh +0 -0
@@ -0,0 +1,22 @@
1
+ name: Validate repository structure
2
+
3
+ on:
4
+ pull_request:
5
+ push:
6
+ branches:
7
+ - main
8
+
9
+ jobs:
10
+ no-gitlinks:
11
+ runs-on: ubuntu-latest
12
+ steps:
13
+ - name: Checkout
14
+ uses: actions/checkout@v4
15
+ - name: Ensure no gitlinks are tracked
16
+ run: |
17
+ gitlinks=$(git ls-files -s | awk '$1 == 160000 {print}')  # single quotes: $1 must reach awk unexpanded (mode 160000 = gitlink)
18
+ if [ -n "$gitlinks" ]; then
19
+ echo "Unexpected gitlinks found:"
20
+ echo "$gitlinks"
21
+ exit 1
22
+ fi
@@ -71,3 +71,5 @@ lambda/*/package/
71
71
 
72
72
  # Admin output files
73
73
  admin/output/
74
+
75
+ .claude/worktrees/
@@ -8,6 +8,8 @@ This will help both you, the agent, but also other agents down the road that sha
8
8
  - NEVER run `terraform apply` or any destructive terraform commands
9
9
  - You can run read-only terraform commands like `terraform plan`, `terraform state show`, etc.
10
10
  - You can run AWS CLI commands for read-only resource fetching and analysis
11
+ - NEVER run destructive AWS CLI commands: `aws ec2 terminate-instances`, `aws ec2 stop-instances`, `aws autoscaling set-desired-capacity` (to 0), `aws ec2 delete-*`, `aws dynamodb delete-table`, etc. On 2026-03-09 an agent accidentally terminated 10 EC2 instances including 6 pet H100 instances from another team's capacity reservations. This must never happen again.
12
+ - NEVER run `kubectl delete node`, `kubectl drain`, `kubectl cordon`, or any command that removes/disrupts running workloads
11
13
  - User will handle all infrastructure deployments themselves
12
14
  - Note: We use OpenTofu, so user runs `opentofu apply` or `tf apply` locally (tf is aliased to opentofu)
13
15
  - We use `k` as an alias for `kubectl`, and `kubens` is configured to the `gpu-dev` namespace
@@ -73,6 +75,31 @@ Currently we're working on a developer servers with GPUs in AWS. This means we'l
73
75
 
74
76
  **K8s Decision:** EKS with GPU-optimized EC2 node groups (Fargate has no GPU support)
75
77
 
78
+ ## Multi-Node NCCL Communication (Mar 2026)
79
+
80
+ **Working Configuration (SENDRECV protocol):**
81
+ - Protocol: `OFI_NCCL_PROTOCOL=SENDRECV` (host-staged EFA, avoids RDMA mr_regattr failures)
82
+ - GDR disabled: `FI_EFA_USE_DEVICE_RDMA=0`, `NCCL_NET_GDR_LEVEL=0`
83
+ - Socket interface: `NCCL_SOCKET_IFNAME=^lo,docker` (H100 nodes use enp71s0/enp72s0, NOT eth0)
84
+ - Algorithm: `NCCL_ALGO=ring,tree` (NCCL auto-selects tree for large messages, ~2x faster)
85
+ - Exclude Mellanox: `NCCL_IB_HCA=^mlx`
86
+ - OpenMPI lib path: `/opt/amazon/openmpi/lib` (NOT lib64 — EFA installer puts it in lib)
87
+
88
+ **Benchmark Results (2x p5.48xlarge, 16 GPUs):**
89
+ - Ring algorithm: ~9.5 GB/s avg bus bandwidth, ~13.4 GB/s peak
90
+ - Tree algorithm: ~21.4 GB/s avg bus bandwidth, ~33.6 GB/s peak
91
+ - Ring+tree combined: ~21.0 GB/s avg (NCCL auto-selects tree for large msgs)
92
+ - Single-node NVLink: ~34 GB/s (for reference)
93
+
94
+ **GDR Status (NOT working — future optimization):**
95
+ - EFA RDMA protocol fails: `fi_mr_regattr` returns EFAULT for flush buffer (even host memory)
96
+ - EFA device version: 6 (above aws-ofi-nccl blocklist threshold of 1-3)
97
+ - EFA kernel driver: 2.17.2a (need 2.17.3+ which has "Support P2P with NVIDIA 580 drivers")
98
+ - nvidia-peermem: NOT available (module not found for kernel 6.12.68)
99
+ - efa-nv-peermem: NOT installed (available in amzn-drivers repo, works with open NVIDIA drivers)
100
+ - To enable GDR in future: install efa-nv-peermem module on host nodes, or update EFA kernel driver
101
+ - Expected GDR improvement: ~300-370 GB/s bus bandwidth (vs ~33 GB/s current)
102
+
76
103
  ## Implementation Status (Jan 11, 2025)
77
104
 
78
105
  ### ✅ Completed and Working
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: gpu-dev
3
- Version: 0.3.8
3
+ Version: 0.4.0
4
4
  Summary: CLI tool for PyTorch GPU developer server reservations
5
5
  Author: PyTorch Team
6
6
  Requires-Python: >=3.10
@@ -12,7 +12,7 @@ Requires-Dist: pydantic>=2.5.0
12
12
  Requires-Dist: rich>=13.7.0
13
13
  Requires-Dist: pyyaml>=6.0.1
14
14
  Requires-Dist: questionary>=2.1.1
15
- Requires-Dist: websockets<13.0,>=12.0
15
+ Requires-Dist: websockets>=12.0
16
16
  Requires-Dist: certifi>=2023.7.22
17
17
  Requires-Dist: mcp>=1.0.0
18
18
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: gpu-dev
3
- Version: 0.3.8
3
+ Version: 0.4.0
4
4
  Summary: CLI tool for PyTorch GPU developer server reservations
5
5
  Author: PyTorch Team
6
6
  Requires-Python: >=3.10
@@ -12,7 +12,7 @@ Requires-Dist: pydantic>=2.5.0
12
12
  Requires-Dist: rich>=13.7.0
13
13
  Requires-Dist: pyyaml>=6.0.1
14
14
  Requires-Dist: questionary>=2.1.1
15
- Requires-Dist: websockets<13.0,>=12.0
15
+ Requires-Dist: websockets>=12.0
16
16
  Requires-Dist: certifi>=2023.7.22
17
17
  Requires-Dist: mcp>=1.0.0
18
18
 
@@ -5,6 +5,7 @@ PR_DESCRIPTION.md
5
5
  TODO.md
6
6
  post.md
7
7
  pyproject.toml
8
+ .github/workflows/no-gitlinks.yml
8
9
  .github/workflows/publish.yml
9
10
  admin/README.md
10
11
  admin/generate_stats.py
@@ -5,6 +5,6 @@ pydantic>=2.5.0
5
5
  rich>=13.7.0
6
6
  pyyaml>=6.0.1
7
7
  questionary>=2.1.1
8
- websockets<13.0,>=12.0
8
+ websockets>=12.0
9
9
  certifi>=2023.7.22
10
10
  mcp>=1.0.0
@@ -95,7 +95,6 @@ def validate_ssh_key_matches_github_user(config: Config, live=None) -> Dict[str,
95
95
  # Restart the spinner
96
96
  if live:
97
97
  live.start()
98
- live.update(Spinner("dots", text="🔐 Validating SSH key..."))
99
98
 
100
99
  # Check if we got the expected GitHub response
101
100
  if "Hi " in ssh_output and "You've successfully authenticated" in ssh_output:
@@ -310,6 +310,12 @@ def _show_single_reservation(connection_info: dict) -> None:
310
310
  oom_time_display = format_timestamp(last_oom_at) if last_oom_at else "Unknown"
311
311
  oom_section = f"\n[red]⚠️ OOM Events:[/red] [red]{oom_count} OOM(s) detected (last: {oom_time_display})[/red]"
312
312
 
313
+ # Show pod internal IP for multinode reservations
314
+ pod_ip_info = ""
315
+ pod_ip = connection_info.get("pod_ip")
316
+ if pod_ip and connection_info.get("is_multinode"):
317
+ pod_ip_info = f"[blue]Internal IP:[/blue] {pod_ip}\n"
318
+
313
319
  panel_content = (
314
320
  f"[green]Reservation Details[/green]\n\n"
315
321
  f"[blue]Quick Connect:[/blue] {connect_command}\n"
@@ -317,7 +323,8 @@ def _show_single_reservation(connection_info: dict) -> None:
317
323
  + vscode_info
318
324
  + jupyter_info
319
325
  + f"[blue]Pod Name:[/blue] {connection_info['pod_name']}\n"
320
- f"[blue]GPUs:[/blue] {gpu_info}\n"
326
+ + pod_ip_info
327
+ + f"[blue]GPUs:[/blue] {gpu_info}\n"
321
328
  f"[blue]Instance Type:[/blue] {instance_type}\n"
322
329
  + secondary_users_info
323
330
  + f"[blue]Storage:[/blue] {disk_status}\n"
@@ -1408,59 +1415,42 @@ def list(ctx: click.Context, user: Optional[str], status: Optional[str], details
1408
1415
 
1409
1416
  statuses_to_include = requested_statuses
1410
1417
  else:
1411
- # Default: in-progress + recent failures (last hour)
1418
+ # Default: active statuses only (fast path)
1419
+ # failed/cancelled are fetched separately and filtered to last hour
1412
1420
  statuses_to_include = [
1413
- "active", "preparing", "queued", "pending", "failed", "cancelled"]
1414
-
1415
- reservations = reservation_mgr.list_reservations(
1416
- user_filter=user_filter, statuses_to_include=statuses_to_include
1417
- )
1421
+ "active", "preparing", "queued", "pending"]
1422
+
1423
+ # For default view, fetch active statuses + recent failures in parallel
1424
+ if not status:
1425
+ from datetime import datetime, timezone, timedelta
1426
+ from concurrent.futures import ThreadPoolExecutor
1427
+ one_hour_ago = (datetime.now(timezone.utc) - timedelta(hours=1)).isoformat()
1428
+
1429
+ def fetch_active():
1430
+ return reservation_mgr.list_reservations(
1431
+ user_filter=user_filter, statuses_to_include=statuses_to_include)
1432
+
1433
+ def fetch_recent_failures():
1434
+ return reservation_mgr.list_reservations(
1435
+ user_filter=user_filter,
1436
+ statuses_to_include=["failed", "cancelled"],
1437
+ created_after=one_hour_ago)
1438
+
1439
+ with ThreadPoolExecutor(max_workers=2) as executor:
1440
+ active_future = executor.submit(fetch_active)
1441
+ failures_future = executor.submit(fetch_recent_failures)
1442
+ reservations = active_future.result() + failures_future.result()
1443
+ else:
1444
+ reservations = reservation_mgr.list_reservations(
1445
+ user_filter=user_filter, statuses_to_include=statuses_to_include
1446
+ )
1418
1447
  except RuntimeError as e:
1419
1448
  rprint(f"[red]❌ {str(e)}[/red]")
1420
1449
  return False
1421
1450
 
1422
1451
  # Filter failed/cancelled reservations to only show recent ones (last hour)
1423
1452
  if not status or "all" not in (status.split(",") if status else []):
1424
- # Only apply time filtering when using default filters (not when user specifies --status)
1425
- from datetime import datetime, timezone, timedelta
1426
- now = datetime.now(timezone.utc)
1427
- one_hour_ago = now - timedelta(hours=1)
1428
-
1429
- filtered_reservations = []
1430
- for reservation in reservations:
1431
- reservation_status = reservation.get("status", "unknown")
1432
- if reservation_status in ["active", "preparing", "queued", "pending"]:
1433
- # Always show active/pending reservations
1434
- filtered_reservations.append(reservation)
1435
- elif reservation_status in ["failed", "cancelled"]:
1436
- # Only show failed/cancelled from last hour
1437
- created_at = reservation.get("created_at")
1438
- if created_at:
1439
- try:
1440
- if isinstance(created_at, str):
1441
- if created_at.endswith("Z"):
1442
- created_dt = datetime.fromisoformat(
1443
- created_at.replace("Z", "+00:00"))
1444
- elif "+" in created_at or created_at.endswith("00:00"):
1445
- created_dt = datetime.fromisoformat(
1446
- created_at)
1447
- else:
1448
- naive_dt = datetime.fromisoformat(
1449
- created_at)
1450
- created_dt = naive_dt.replace(
1451
- tzinfo=timezone.utc)
1452
- else:
1453
- created_dt = datetime.fromtimestamp(
1454
- created_at, tz=timezone.utc)
1455
-
1456
- if created_dt >= one_hour_ago:
1457
- filtered_reservations.append(reservation)
1458
- except (ValueError, TypeError):
1459
- # If timestamp parsing fails, include it to be safe
1460
- filtered_reservations.append(reservation)
1461
- else:
1462
- # Include other statuses as-is
1463
- filtered_reservations.append(reservation)
1453
+ filtered_reservations = reservations
1464
1454
 
1465
1455
  reservations = filtered_reservations
1466
1456
 
@@ -1556,20 +1546,25 @@ def list(ctx: click.Context, user: Optional[str], status: Optional[str], details
1556
1546
  else:
1557
1547
  queue_info = "Calculating..."
1558
1548
  elif res_status == "active":
1559
- # Show SSH connection hint for active reservations
1560
- ssh_command = reservation.get("ssh_command", "")
1561
- if ssh_command and "dev@" in ssh_command:
1562
- try:
1563
- node_info = (
1564
- ssh_command.split("dev@")[1].split()[0]
1565
- if "dev@" in ssh_command
1566
- else "Ready"
1567
- )
1568
- queue_info = f"Ready: {node_info}"
1569
- except (IndexError, AttributeError):
1570
- queue_info = "Ready"
1549
+ # Show pod IP for multinode, SSH hint for single-node
1550
+ pod_ip = reservation.get("pod_ip", "")
1551
+ is_multinode = reservation.get("is_multinode", False)
1552
+ if is_multinode and pod_ip:
1553
+ queue_info = f"IP: {pod_ip}"
1571
1554
  else:
1572
- queue_info = "Ready"
1555
+ ssh_command = reservation.get("ssh_command", "")
1556
+ if ssh_command and "dev@" in ssh_command:
1557
+ try:
1558
+ node_info = (
1559
+ ssh_command.split("dev@")[1].split()[0]
1560
+ if "dev@" in ssh_command
1561
+ else "Ready"
1562
+ )
1563
+ queue_info = f"Ready: {node_info}"
1564
+ except (IndexError, AttributeError):
1565
+ queue_info = "Ready"
1566
+ else:
1567
+ queue_info = "Ready"
1573
1568
 
1574
1569
  # Format storage indicator - show disk name if available
1575
1570
  disk_name = reservation.get("disk_name")
@@ -2471,11 +2466,14 @@ def _show_availability() -> None:
2471
2466
  else:
2472
2467
  wait_display = f"{hours}h {minutes}min"
2473
2468
 
2474
- # Color code availability based on full nodes available
2475
- # Red: 0 GPUs available
2476
- # Yellow: Some GPUs available but no full node
2477
- # Green: At least one full node available
2478
- if available == 0:
2469
+ # Check maintenance mode
2470
+ is_maintenance = info.get("maintenance", False)
2471
+ maintenance_reason = info.get("maintenance_reason", "")
2472
+
2473
+ if is_maintenance:
2474
+ available_display = f"[red]MAINTENANCE[/red]"
2475
+ wait_display = maintenance_reason or "Under maintenance"
2476
+ elif available == 0:
2479
2477
  available_display = f"[red]{available}[/red]"
2480
2478
  elif full_nodes_available > 0:
2481
2479
  available_display = f"[green]{available}[/green]"
@@ -2485,9 +2483,9 @@ def _show_availability() -> None:
2485
2483
  table.add_row(
2486
2484
  gpu_type.upper(),
2487
2485
  available_display,
2488
- str(max_reservable),
2486
+ str(max_reservable) if not is_maintenance else "-",
2489
2487
  str(total),
2490
- str(queue_length),
2488
+ str(queue_length) if not is_maintenance else "-",
2491
2489
  arch,
2492
2490
  wait_display,
2493
2491
  )
@@ -2576,6 +2574,7 @@ def _show_availability_watch(interval: int) -> None:
2576
2574
  title="GPU Availability by Type (numbers are GPUs, not nodes)")
2577
2575
  table.add_column("GPU Type", style="cyan")
2578
2576
  table.add_column("Available", style="green")
2577
+ table.add_column("Max Reservable", style="blue")
2579
2578
  table.add_column("Total", style="blue")
2580
2579
  table.add_column("Queue Length", style="yellow")
2581
2580
  table.add_column("Architecture", style="dim")
@@ -2588,11 +2587,13 @@ def _show_availability_watch(interval: int) -> None:
2588
2587
  # Add separator before CPU section
2589
2588
  if last_arch and not last_arch.startswith("CPU") and arch.startswith("CPU"):
2590
2589
  table.add_row("---", "---", "---",
2591
- "---", "---", "---")
2590
+ "---", "---", "---", "---")
2592
2591
 
2593
2592
  last_arch = arch
2594
2593
  available = info.get("available", 0)
2594
+ max_reservable = info.get("max_reservable", 0)
2595
2595
  total = info.get("total", 0)
2596
+ full_nodes_available = info.get("full_nodes_available", 0)
2596
2597
  queue_length = info.get("queue_length", 0)
2597
2598
  est_wait = info.get("estimated_wait_minutes", 0)
2598
2599
 
@@ -2611,17 +2612,26 @@ def _show_availability_watch(interval: int) -> None:
2611
2612
  else:
2612
2613
  wait_display = f"{hours}h {minutes}min"
2613
2614
 
2614
- # Color code availability
2615
- if available > 0:
2615
+ # Check maintenance mode
2616
+ is_maintenance = info.get("maintenance", False)
2617
+ maintenance_reason = info.get("maintenance_reason", "")
2618
+
2619
+ if is_maintenance:
2620
+ available_display = f"[red]MAINTENANCE[/red]"
2621
+ wait_display = maintenance_reason or "Under maintenance"
2622
+ elif available == 0:
2623
+ available_display = f"[red]{available}[/red]"
2624
+ elif full_nodes_available > 0:
2616
2625
  available_display = f"[green]{available}[/green]"
2617
2626
  else:
2618
- available_display = f"[red]{available}[/red]"
2627
+ available_display = f"[yellow]{available}[/yellow]"
2619
2628
 
2620
2629
  table.add_row(
2621
2630
  gpu_type.upper(),
2622
2631
  available_display,
2632
+ str(max_reservable) if not is_maintenance else "-",
2623
2633
  str(total),
2624
- str(queue_length),
2634
+ str(queue_length) if not is_maintenance else "-",
2625
2635
  arch,
2626
2636
  wait_display,
2627
2637
  )
@@ -3505,6 +3515,9 @@ def edit(
3505
3515
  # Stop spinner before validation and operations
3506
3516
  live.stop()
3507
3517
 
3518
+ # Use the full reservation_id from connection_info (not the user-provided prefix)
3519
+ reservation_id = connection_info["reservation_id"]
3520
+
3508
3521
  if connection_info["status"] != "active":
3509
3522
  rprint(
3510
3523
  f"[red]❌ Can only edit active reservations (current status: {connection_info['status']})[/red]"
@@ -67,15 +67,15 @@ class Config:
67
67
 
68
68
  def _create_aws_session(self):
69
69
  """Create AWS session with profile support"""
70
- try:
71
- # Try to use 'gpu-dev' profile if it exists
72
- session = boto3.Session(profile_name="gpu-dev")
73
- # Test if profile works by checking credentials
74
- session.get_credentials()
75
- return session
76
- except Exception:
77
- # Fall back to default credentials (environment, default profile, IAM role, etc.)
78
- return boto3.Session()
70
+ available_profiles = boto3.Session().available_profiles
71
+ if "gpu-dev" in available_profiles:
72
+ try:
73
+ session = boto3.Session(profile_name="gpu-dev")
74
+ session.get_credentials()
75
+ return session
76
+ except Exception:
77
+ pass
78
+ return boto3.Session()
79
79
 
80
80
  @property
81
81
  def sts_client(self):
@@ -92,8 +92,14 @@ def select_gpu_type_interactive(
92
92
  wait_display = f"{hours}h {minutes}min"
93
93
  status_indicator = "⏳"
94
94
 
95
- # Color code availability
96
- if available > 0:
95
+ # Check maintenance mode
96
+ is_maintenance = info.get("maintenance", False)
97
+ maintenance_reason = info.get("maintenance_reason", "")
98
+
99
+ if is_maintenance:
100
+ available_display = f"[red]MAINTENANCE[/red]"
101
+ wait_display = maintenance_reason or "Under maintenance"
102
+ elif available > 0:
97
103
  available_display = f"[green]{available}[/green]"
98
104
  else:
99
105
  available_display = f"[red]{available}[/red]"
@@ -102,18 +108,25 @@ def select_gpu_type_interactive(
102
108
  gpu_type.upper(),
103
109
  available_display,
104
110
  str(total),
105
- str(queue_length),
111
+ str(queue_length) if not is_maintenance else "-",
106
112
  wait_display,
107
113
  )
108
114
 
109
- # Create choice label with status
110
- choice_label = (
111
- f"{status_indicator} {gpu_type.upper()} ({available}/{total} available)"
112
- )
113
- if queue_length > 0:
114
- choice_label += f" - {queue_length} in queue"
115
+ if is_maintenance:
116
+ choices.append(questionary.Choice(
117
+ title=f"🔧 {gpu_type.upper()} - MAINTENANCE: {maintenance_reason}",
118
+ value=gpu_type,
119
+ disabled="Under maintenance",
120
+ ))
121
+ else:
122
+ # Create choice label with status
123
+ choice_label = (
124
+ f"{status_indicator} {gpu_type.upper()} ({available}/{total} available)"
125
+ )
126
+ if queue_length > 0:
127
+ choice_label += f" - {queue_length} in queue"
115
128
 
116
- choices.append(questionary.Choice(title=choice_label, value=gpu_type))
129
+ choices.append(questionary.Choice(title=choice_label, value=gpu_type))
117
130
 
118
131
  console.print(table)
119
132
  console.print()