gpu-dev 0.5.22__tar.gz → 0.5.24__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (123)
  1. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/PKG-INFO +1 -1
  2. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/PKG-INFO +1 -1
  3. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/cli-tools/gpu-dev-cli/gpu_dev_cli/auth.py +13 -0
  4. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/cli-tools/gpu-dev-cli/gpu_dev_cli/config.py +11 -2
  5. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/pyproject.toml +1 -1
  6. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/lambda/reservation_processor/index.py +34 -1
  7. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/lambda.tf +1 -1
  8. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/.github/workflows/no-gitlinks.yml +0 -0
  9. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/.github/workflows/publish.yml +0 -0
  10. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/.gitignore +0 -0
  11. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/CLAUDE.md +0 -0
  12. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/PROGRESS.md +0 -0
  13. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/PR_DESCRIPTION.md +0 -0
  14. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/README.md +0 -0
  15. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/TODO.md +0 -0
  16. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/admin/README.md +0 -0
  17. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/admin/generate_stats.py +0 -0
  18. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/admin/requirements.txt +0 -0
  19. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/cli-tools/gpu-dev-cli/README.md +0 -0
  20. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/cli-tools/gpu-dev-cli/ZERO_CONFIG_SETUP.md +0 -0
  21. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/SOURCES.txt +0 -0
  22. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/dependency_links.txt +0 -0
  23. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/entry_points.txt +0 -0
  24. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/requires.txt +0 -0
  25. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/top_level.txt +0 -0
  26. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/cli-tools/gpu-dev-cli/gpu_dev_cli/__init__.py +0 -0
  27. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/cli-tools/gpu-dev-cli/gpu_dev_cli/cli.py +0 -0
  28. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/cli-tools/gpu-dev-cli/gpu_dev_cli/disks.py +0 -0
  29. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/cli-tools/gpu-dev-cli/gpu_dev_cli/interactive.py +0 -0
  30. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/cli-tools/gpu-dev-cli/gpu_dev_cli/name_generator.py +0 -0
  31. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/cli-tools/gpu-dev-cli/gpu_dev_cli/reservations.py +0 -0
  32. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/cli-tools/gpu-dev-cli/gpu_dev_cli/ssh_proxy.py +0 -0
  33. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/cli-tools/gpu-dev-cli/minimal-iam-policy.json +0 -0
  34. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/cli-tools/scripts/clear_stale_disk_locks.py +0 -0
  35. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/docs/USER_GUIDE.md +0 -0
  36. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/docs/devgpu-features.html +0 -0
  37. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/docs/docker-mark-blue.svg +0 -0
  38. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/docs/icons8-cursor-ai.svg +0 -0
  39. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/post.md +0 -0
  40. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/setup.cfg +0 -0
  41. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/.claude/skills/deploy.md +0 -0
  42. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/.terraform.lock.hcl +0 -0
  43. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/README.md +0 -0
  44. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/alb.tf +0 -0
  45. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/availability.tf +0 -0
  46. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/backend.tf +0 -0
  47. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/docker/.dockerignore +0 -0
  48. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/docker/Dockerfile +0 -0
  49. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/docker/backup-dotfiles +0 -0
  50. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/docker/bash_profile +0 -0
  51. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/docker/bashrc +0 -0
  52. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/docker/bashrc_ext +0 -0
  53. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/docker/build-with-efa.sh +0 -0
  54. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/docker/dotfiles-shutdown-handler +0 -0
  55. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/docker/list-dotfile-versions +0 -0
  56. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/docker/motd_script +0 -0
  57. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/docker/nproc_wrapper +0 -0
  58. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/docker/profile +0 -0
  59. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/docker/restore-dotfiles +0 -0
  60. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/docker/restore-dotfiles-version +0 -0
  61. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/docker/setup-dotfiles-persistence +0 -0
  62. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/docker/shell_env +0 -0
  63. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/docker/ssh_config +0 -0
  64. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/docker/zprofile +0 -0
  65. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/docker/zshrc +0 -0
  66. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/docker/zshrc_ext +0 -0
  67. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/docker-build.tf +0 -0
  68. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/docker-example/Dockerfile +0 -0
  69. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/docker-example/hello.txt +0 -0
  70. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/ecr.tf +0 -0
  71. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/efs.tf +0 -0
  72. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/eks.tf +0 -0
  73. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/expiry.tf +0 -0
  74. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/git-cache.tf +0 -0
  75. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/gpu-dev-pod-irsa.tf +0 -0
  76. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/kubernetes.tf +0 -0
  77. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/lambda/availability_updater/index.py +0 -0
  78. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/lambda/availability_updater/requirements.txt +0 -0
  79. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/lambda/migration/tag_largest_snapshots.py +0 -0
  80. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/lambda/reservation_expiry/index.py +0 -0
  81. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/lambda/reservation_expiry/requirements.txt +0 -0
  82. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/lambda/reservation_processor/buildkit_job.py +0 -0
  83. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/lambda/reservation_processor/requirements.txt +0 -0
  84. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/lambda/shared/__init__.py +0 -0
  85. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/lambda/shared/alb_utils.py +0 -0
  86. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/lambda/shared/dns_utils.py +0 -0
  87. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/lambda/shared/k8s_client.py +0 -0
  88. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/lambda/shared/k8s_resource_tracker.py +0 -0
  89. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/lambda/shared/requirements.txt +0 -0
  90. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/lambda/shared/snapshot_utils.py +0 -0
  91. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/main.tf +0 -0
  92. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/mig-config.tf +0 -0
  93. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/mig-parted-config.yaml +0 -0
  94. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py +0 -0
  95. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py.bak +0 -0
  96. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/migrations/check_snapshots.py +0 -0
  97. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/migrations/migrate_disks_to_named.py +0 -0
  98. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/migrations/run_backfill.sh +0 -0
  99. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/monitoring.tf +0 -0
  100. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/outputs.tf +0 -0
  101. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/pyproject.toml +0 -0
  102. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/queue.tf +0 -0
  103. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/route53.tf +0 -0
  104. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/s3-disk-contents.tf +0 -0
  105. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/scripts/CLEANUP_GUIDE.md +0 -0
  106. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/scripts/detect_empty_volumes.sh +0 -0
  107. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/scripts/ec2_avail_probe.sh +0 -0
  108. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/scripts/inspect_user_data.sh +0 -0
  109. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/ssh-proxy/Dockerfile +0 -0
  110. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/ssh-proxy/proxy.py +0 -0
  111. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/ssh-proxy/requirements.txt +0 -0
  112. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/ssh-proxy-service.tf +0 -0
  113. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/ssh-proxy.tf +0 -0
  114. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/switch-to.sh +0 -0
  115. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/templates/al2023-cpu-user-data.sh +0 -0
  116. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/templates/al2023-user-data.sh +0 -0
  117. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/templates/user-data-self-managed.sh +0 -0
  118. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/templates/user-data.sh +0 -0
  119. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/variables.tf +0 -0
  120. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/tests/submit/README.md +0 -0
  121. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/tests/submit/fail/run.sh +0 -0
  122. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/tests/submit/multinode/run.sh +0 -0
  123. {gpu_dev-0.5.22 → gpu_dev-0.5.24}/tests/submit/success/run.sh +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: gpu-dev
3
- Version: 0.5.22
3
+ Version: 0.5.24
4
4
  Summary: CLI tool for PyTorch GPU developer server reservations
5
5
  Author: PyTorch Team
6
6
  Requires-Python: >=3.10
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: gpu-dev
3
- Version: 0.5.22
3
+ Version: 0.5.24
4
4
  Summary: CLI tool for PyTorch GPU developer server reservations
5
5
  Author: PyTorch Team
6
6
  Requires-Python: >=3.10
@@ -38,6 +38,19 @@ def _load_auth_cache(github_user: str) -> Optional[Dict[str, Any]]:
38
38
  return None
39
39
  if time.time() - float(entry.get("ts", 0)) > _AUTH_CACHE_TTL_SECONDS:
40
40
  return None
41
+ # Defense against stale cache on a persistent disk that pre-dates the IRSA fix:
42
+ # if AWS_ROLE_ARN points at a role the cached ARN doesn't reference, the cache
43
+ # is from a different identity (e.g. IMDS-fallback before fs_group=1081 landed)
44
+ # and should be ignored.
45
+ expected_role_arn = os.environ.get("AWS_ROLE_ARN", "")
46
+ cached_arn = (entry.get("result") or {}).get("arn", "")
47
+ if expected_role_arn:
48
+ try:
49
+ role_name = expected_role_arn.rsplit("/", 1)[-1]
50
+ if role_name and role_name not in cached_arn:
51
+ return None
52
+ except Exception:
53
+ pass
41
54
  return entry.get("result")
42
55
  except Exception:
43
56
  return None
@@ -240,8 +240,17 @@ class Config:
240
240
  return self.user_config.get(key)
241
241
 
242
242
  def get_github_username(self) -> Optional[str]:
243
- """Get GitHub username from config."""
244
- return self.user_config.get("github_user")
243
+ """Get GitHub username, falling back to GPU_DEV_GITHUB_USER env var.
244
+
245
+ Lambda sets GPU_DEV_GITHUB_USER on every pod from the reservation's
246
+ github_user field, so a user running gpu-dev from inside their dev pod
247
+ doesn't have to `gpu-dev config set github_user <name>` first.
248
+ """
249
+ v = self.user_config.get("github_user")
250
+ if v:
251
+ return v
252
+ v = os.environ.get("GPU_DEV_GITHUB_USER")
253
+ return v or None
245
254
 
246
255
 
247
256
  def load_config() -> Config:
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "gpu-dev"
7
- version = "0.5.22"
7
+ version = "0.5.24"
8
8
  description = "CLI tool for PyTorch GPU developer server reservations"
9
9
  authors = [{name = "PyTorch Team"}]
10
10
  readme = "cli-tools/gpu-dev-cli/README.md"
@@ -2888,6 +2888,7 @@ def allocate_gpu_resources(reservation_id: str, request: dict[str, Any], trace_d
2888
2888
  gpu_count=gpu_count,
2889
2889
  gpu_type=gpu_type,
2890
2890
  github_public_key=github_public_key,
2891
+ github_user=github_user,
2891
2892
  reservation_id=reservation_id,
2892
2893
  jupyter_enabled=jupyter_enabled,
2893
2894
  persistent_volume_id=persistent_volume_id,
@@ -3434,7 +3435,8 @@ def create_kubernetes_resources(
3434
3435
  gpu_count: int,
3435
3436
  gpu_type: str,
3436
3437
  github_public_key: str,
3437
- reservation_id: str,
3438
+ github_user: str = "",
3439
+ reservation_id: str = None,
3438
3440
  jupyter_enabled: bool = False,
3439
3441
  persistent_volume_id: str = None,
3440
3442
  user_id: str = None,
@@ -3538,6 +3540,7 @@ def create_kubernetes_resources(
3538
3540
  gpu_count,
3539
3541
  gpu_type,
3540
3542
  github_public_key,
3543
+ github_user=github_user,
3541
3544
  jupyter_enabled=True,
3542
3545
  persistent_volume_id=persistent_volume_id,
3543
3546
  user_id=user_id,
@@ -3627,6 +3630,7 @@ def create_kubernetes_resources(
3627
3630
  gpu_count,
3628
3631
  gpu_type,
3629
3632
  github_public_key,
3633
+ github_user=github_user,
3630
3634
  jupyter_enabled=False,
3631
3635
  persistent_volume_id=persistent_volume_id,
3632
3636
  user_id=user_id,
@@ -3979,6 +3983,7 @@ def create_pod(
3979
3983
  gpu_count: int,
3980
3984
  gpu_type: str,
3981
3985
  github_public_key: str,
3986
+ github_user: str = "",
3982
3987
  jupyter_enabled: bool = False,
3983
3988
  persistent_volume_id: str = None,
3984
3989
  user_id: str = None,
@@ -4486,6 +4491,18 @@ export MULTINODE_SIZE="$MULTINODE_SIZE"
4486
4491
  export MASTER_ADDR="$MASTER_ADDR"
4487
4492
  export MASTER_PORT="$MASTER_PORT"
4488
4493
 
4494
+ # IRSA + region — same reason as MULTINODE: sshd strips these from login shells, so
4495
+ # we bake the current container values into the rc file. Lets gpu-dev / aws / boto3
4496
+ # inside an SSH session pick up the gpu-dev-pod-sa IAM role automatically.
4497
+ export AWS_ROLE_ARN="$AWS_ROLE_ARN"
4498
+ export AWS_WEB_IDENTITY_TOKEN_FILE="$AWS_WEB_IDENTITY_TOKEN_FILE"
4499
+ export AWS_ROLE_SESSION_NAME="$AWS_ROLE_SESSION_NAME"
4500
+ export AWS_REGION="$AWS_REGION"
4501
+ export AWS_DEFAULT_REGION="$AWS_DEFAULT_REGION"
4502
+ export AWS_STS_REGIONAL_ENDPOINTS="$AWS_STS_REGIONAL_ENDPOINTS"
4503
+ # CLI falls back to this when ~/.config/gpu-dev/config.json has no github_user
4504
+ export GPU_DEV_GITHUB_USER="$GPU_DEV_GITHUB_USER"
4505
+
4489
4506
  # Function to check for GPU reservation expiry warnings and startup script status
4490
4507
  check_warnings() {{
4491
4508
  # Check for startup script still running
@@ -4539,6 +4556,15 @@ export MULTINODE_SIZE="$MULTINODE_SIZE"
4539
4556
  export MASTER_ADDR="$MASTER_ADDR"
4540
4557
  export MASTER_PORT="$MASTER_PORT"
4541
4558
 
4559
+ # IRSA + region (see .bashrc_ext for rationale)
4560
+ export AWS_ROLE_ARN="$AWS_ROLE_ARN"
4561
+ export AWS_WEB_IDENTITY_TOKEN_FILE="$AWS_WEB_IDENTITY_TOKEN_FILE"
4562
+ export AWS_ROLE_SESSION_NAME="$AWS_ROLE_SESSION_NAME"
4563
+ export AWS_REGION="$AWS_REGION"
4564
+ export AWS_DEFAULT_REGION="$AWS_DEFAULT_REGION"
4565
+ export AWS_STS_REGIONAL_ENDPOINTS="$AWS_STS_REGIONAL_ENDPOINTS"
4566
+ export GPU_DEV_GITHUB_USER="$GPU_DEV_GITHUB_USER"
4567
+
4542
4568
  # Function to check for GPU reservation expiry warnings and startup script status
4543
4569
  check_warnings() {{
4544
4570
  # Check for startup script still running
@@ -5314,6 +5340,9 @@ EOF
5314
5340
  ),
5315
5341
  client.V1EnvVar(
5316
5342
  name="AWS_ROLE_SESSION_NAME", value=(user_id or "gpu-dev-pod")[:64]
5343
+ ),
5344
+ client.V1EnvVar(
5345
+ name="GPU_DEV_GITHUB_USER", value=github_user or ""
5317
5346
  )
5318
5347
  ] + get_nccl_env_vars(gpu_type) + get_cpu_thread_env_vars(gpu_count, gpu_type) + _get_multinode_env_vars(multinode_peer_pods, multinode_rank),
5319
5348
  resources=client.V1ResourceRequirements(
@@ -5501,6 +5530,10 @@ EOF
5501
5530
  # with the AWS_ROLE_SESSION_NAME env var below this lets users run
5502
5531
  # `gpu-dev submit` from inside their dev pod with no manual aws sso login.
5503
5532
  service_account_name="gpu-dev-pod-sa",
5533
+ # fs_group=1081 makes the IRSA-projected token (default 0600 root:root)
5534
+ # readable by the dev user. Without it boto3-as-dev falls through to IMDS
5535
+ # and gets the node's IAM role, which doesn't have DDB/SQS permissions.
5536
+ security_context=client.V1PodSecurityContext(fs_group=1081),
5504
5537
  # EFA requires host network namespace for RDMA access to efa0 interface
5505
5538
  **({
5506
5539
  "host_network": True,
@@ -180,7 +180,7 @@ resource "aws_lambda_function" "reservation_processor" {
180
180
  HOSTED_ZONE_ID = local.effective_domain_name != "" ? local.hosted_zone_id : ""
181
181
  SSH_DOMAIN_MAPPINGS_TABLE = local.effective_domain_name != "" ? aws_dynamodb_table.ssh_domain_mappings.name : ""
182
182
  SSL_CERTIFICATE_ARN = local.effective_domain_name != "" ? aws_acm_certificate.wildcard[0].arn : ""
183
- LAMBDA_VERSION = "0.5.23"
183
+ LAMBDA_VERSION = "0.5.25"
184
184
  MIN_CLI_VERSION = "0.5.16"
185
185
  DISK_CONTENTS_BUCKET = aws_s3_bucket.disk_contents.bucket
186
186
  OPERATIONS_TABLE = aws_dynamodb_table.operations.name
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes