gpu-dev 0.5.24__tar.gz → 0.5.26__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (124) hide show
  1. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/PKG-INFO +1 -1
  2. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/PKG-INFO +1 -1
  3. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/SOURCES.txt +1 -0
  4. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/cli-tools/gpu-dev-cli/gpu_dev_cli/cli.py +29 -0
  5. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/cli-tools/gpu-dev-cli/gpu_dev_cli/config.py +5 -0
  6. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/cli-tools/gpu-dev-cli/gpu_dev_cli/reservations.py +10 -1
  7. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/pyproject.toml +1 -1
  8. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/eks.tf +6 -3
  9. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/main.tf +64 -0
  10. gpu_dev-0.5.26/terraform-gpu-devservers/node-termination-handler.tf +36 -0
  11. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/.github/workflows/no-gitlinks.yml +0 -0
  12. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/.github/workflows/publish.yml +0 -0
  13. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/.gitignore +0 -0
  14. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/CLAUDE.md +0 -0
  15. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/PROGRESS.md +0 -0
  16. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/PR_DESCRIPTION.md +0 -0
  17. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/README.md +0 -0
  18. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/TODO.md +0 -0
  19. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/admin/README.md +0 -0
  20. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/admin/generate_stats.py +0 -0
  21. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/admin/requirements.txt +0 -0
  22. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/cli-tools/gpu-dev-cli/README.md +0 -0
  23. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/cli-tools/gpu-dev-cli/ZERO_CONFIG_SETUP.md +0 -0
  24. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/dependency_links.txt +0 -0
  25. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/entry_points.txt +0 -0
  26. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/requires.txt +0 -0
  27. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/top_level.txt +0 -0
  28. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/cli-tools/gpu-dev-cli/gpu_dev_cli/__init__.py +0 -0
  29. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/cli-tools/gpu-dev-cli/gpu_dev_cli/auth.py +0 -0
  30. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/cli-tools/gpu-dev-cli/gpu_dev_cli/disks.py +0 -0
  31. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/cli-tools/gpu-dev-cli/gpu_dev_cli/interactive.py +0 -0
  32. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/cli-tools/gpu-dev-cli/gpu_dev_cli/name_generator.py +0 -0
  33. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/cli-tools/gpu-dev-cli/gpu_dev_cli/ssh_proxy.py +0 -0
  34. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/cli-tools/gpu-dev-cli/minimal-iam-policy.json +0 -0
  35. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/cli-tools/scripts/clear_stale_disk_locks.py +0 -0
  36. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/docs/USER_GUIDE.md +0 -0
  37. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/docs/devgpu-features.html +0 -0
  38. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/docs/docker-mark-blue.svg +0 -0
  39. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/docs/icons8-cursor-ai.svg +0 -0
  40. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/post.md +0 -0
  41. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/setup.cfg +0 -0
  42. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/.claude/skills/deploy.md +0 -0
  43. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/.terraform.lock.hcl +0 -0
  44. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/README.md +0 -0
  45. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/alb.tf +0 -0
  46. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/availability.tf +0 -0
  47. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/backend.tf +0 -0
  48. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/docker/.dockerignore +0 -0
  49. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/docker/Dockerfile +0 -0
  50. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/docker/backup-dotfiles +0 -0
  51. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/docker/bash_profile +0 -0
  52. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/docker/bashrc +0 -0
  53. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/docker/bashrc_ext +0 -0
  54. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/docker/build-with-efa.sh +0 -0
  55. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/docker/dotfiles-shutdown-handler +0 -0
  56. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/docker/list-dotfile-versions +0 -0
  57. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/docker/motd_script +0 -0
  58. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/docker/nproc_wrapper +0 -0
  59. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/docker/profile +0 -0
  60. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/docker/restore-dotfiles +0 -0
  61. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/docker/restore-dotfiles-version +0 -0
  62. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/docker/setup-dotfiles-persistence +0 -0
  63. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/docker/shell_env +0 -0
  64. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/docker/ssh_config +0 -0
  65. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/docker/zprofile +0 -0
  66. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/docker/zshrc +0 -0
  67. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/docker/zshrc_ext +0 -0
  68. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/docker-build.tf +0 -0
  69. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/docker-example/Dockerfile +0 -0
  70. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/docker-example/hello.txt +0 -0
  71. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/ecr.tf +0 -0
  72. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/efs.tf +0 -0
  73. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/expiry.tf +0 -0
  74. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/git-cache.tf +0 -0
  75. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/gpu-dev-pod-irsa.tf +0 -0
  76. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/kubernetes.tf +0 -0
  77. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/lambda/availability_updater/index.py +0 -0
  78. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/lambda/availability_updater/requirements.txt +0 -0
  79. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/lambda/migration/tag_largest_snapshots.py +0 -0
  80. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/lambda/reservation_expiry/index.py +0 -0
  81. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/lambda/reservation_expiry/requirements.txt +0 -0
  82. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/lambda/reservation_processor/buildkit_job.py +0 -0
  83. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/lambda/reservation_processor/index.py +0 -0
  84. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/lambda/reservation_processor/requirements.txt +0 -0
  85. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/lambda/shared/__init__.py +0 -0
  86. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/lambda/shared/alb_utils.py +0 -0
  87. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/lambda/shared/dns_utils.py +0 -0
  88. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/lambda/shared/k8s_client.py +0 -0
  89. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/lambda/shared/k8s_resource_tracker.py +0 -0
  90. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/lambda/shared/requirements.txt +0 -0
  91. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/lambda/shared/snapshot_utils.py +0 -0
  92. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/lambda.tf +0 -0
  93. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/mig-config.tf +0 -0
  94. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/mig-parted-config.yaml +0 -0
  95. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py +0 -0
  96. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py.bak +0 -0
  97. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/migrations/check_snapshots.py +0 -0
  98. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/migrations/migrate_disks_to_named.py +0 -0
  99. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/migrations/run_backfill.sh +0 -0
  100. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/monitoring.tf +0 -0
  101. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/outputs.tf +0 -0
  102. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/pyproject.toml +0 -0
  103. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/queue.tf +0 -0
  104. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/route53.tf +0 -0
  105. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/s3-disk-contents.tf +0 -0
  106. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/scripts/CLEANUP_GUIDE.md +0 -0
  107. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/scripts/detect_empty_volumes.sh +0 -0
  108. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/scripts/ec2_avail_probe.sh +0 -0
  109. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/scripts/inspect_user_data.sh +0 -0
  110. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/ssh-proxy/Dockerfile +0 -0
  111. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/ssh-proxy/proxy.py +0 -0
  112. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/ssh-proxy/requirements.txt +0 -0
  113. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/ssh-proxy-service.tf +0 -0
  114. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/ssh-proxy.tf +0 -0
  115. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/switch-to.sh +0 -0
  116. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/templates/al2023-cpu-user-data.sh +0 -0
  117. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/templates/al2023-user-data.sh +0 -0
  118. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/templates/user-data-self-managed.sh +0 -0
  119. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/templates/user-data.sh +0 -0
  120. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/variables.tf +0 -0
  121. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/tests/submit/README.md +0 -0
  122. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/tests/submit/fail/run.sh +0 -0
  123. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/tests/submit/multinode/run.sh +0 -0
  124. {gpu_dev-0.5.24 → gpu_dev-0.5.26}/tests/submit/success/run.sh +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: gpu-dev
3
- Version: 0.5.24
3
+ Version: 0.5.26
4
4
  Summary: CLI tool for PyTorch GPU developer server reservations
5
5
  Author: PyTorch Team
6
6
  Requires-Python: >=3.10
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: gpu-dev
3
- Version: 0.5.24
3
+ Version: 0.5.26
4
4
  Summary: CLI tool for PyTorch GPU developer server reservations
5
5
  Author: PyTorch Team
6
6
  Requires-Python: >=3.10
@@ -52,6 +52,7 @@ terraform-gpu-devservers/main.tf
52
52
  terraform-gpu-devservers/mig-config.tf
53
53
  terraform-gpu-devservers/mig-parted-config.yaml
54
54
  terraform-gpu-devservers/monitoring.tf
55
+ terraform-gpu-devservers/node-termination-handler.tf
55
56
  terraform-gpu-devservers/outputs.tf
56
57
  terraform-gpu-devservers/pyproject.toml
57
58
  terraform-gpu-devservers/queue.tf
@@ -4,6 +4,7 @@ Reserve and manage GPU development servers
4
4
  """
5
5
 
6
6
  import click
7
+ import os
7
8
  from typing import Optional
8
9
  from rich.console import Console
9
10
  from rich.table import Table
@@ -3218,6 +3219,34 @@ def connect(ctx: click.Context, reservation_id: Optional[str]) -> None:
3218
3219
  if "-A" not in ssh_command and "-o ForwardAgent=yes" not in ssh_command:
3219
3220
  ssh_command = ssh_command.replace("ssh ", "ssh -A ", 1)
3220
3221
 
3222
+ # Inject AddKeysToAgent so the first connect from this laptop loads the user's
3223
+ # IdentityFile into ssh-agent — without this the forwarded agent is empty on
3224
+ # subsequent pod→pod hops. UseKeychain persists the passphrase across reboots on
3225
+ # macOS; IgnoreUnknown lets Linux SSH ignore the macOS-only option cleanly.
3226
+ # The same options live in ~/.gpu-dev/<id>-sshconfig but ssh only honours them
3227
+ # when the command-line target matches a Host block, which this connect command
3228
+ # bypasses by passing the FQDN directly.
3229
+ if "AddKeysToAgent" not in ssh_command:
3230
+ ssh_command = ssh_command.replace(
3231
+ "ssh ",
3232
+ "ssh -o AddKeysToAgent=yes -o IgnoreUnknown=UseKeychain -o UseKeychain=yes ",
3233
+ 1,
3234
+ )
3235
+
3236
+ # When running from inside a gpu-dev pod (i.e. the GPU_DEV_USER_ID env var is set) and the
3237
+ # forwarded SSH agent is reachable but empty, the next hop is going to fail with
3238
+ # 'Permission denied (publickey)'. Warn upfront so the user knows to ssh-add on
3239
+ # their laptop instead of debugging an opaque auth failure on the remote side.
3240
+ if os.environ.get("GPU_DEV_USER_ID") and os.environ.get("SSH_AUTH_SOCK"):
3241
+ try:
3242
+ import subprocess as _sp
3243
+ r = _sp.run(["ssh-add", "-L"], capture_output=True, text=True, timeout=3)
3244
+ if r.returncode != 0 or not r.stdout.strip() or "no identities" in r.stdout.lower():
3245
+ rprint("[yellow]⚠️ Forwarded SSH agent is empty — second-hop SSH from a pod will fail auth.[/yellow]")
3246
+ rprint("[yellow] On your laptop: `ssh-add ~/.ssh/id_ed25519` (or your GitHub key), then reconnect to this pod with `gpu-dev connect`.[/yellow]\n")
3247
+ except Exception:
3248
+ pass
3249
+
3221
3250
  # Parse and execute the command, capturing exit code for auth failures
3222
3251
  rprint(f"[dim]Executing: {ssh_command}[/dim]\n")
3223
3252
  result = subprocess.run(ssh_command, shell=True)
@@ -22,6 +22,11 @@ class Config:
22
22
  "workspace": "prod",
23
23
  "description": "Production environment",
24
24
  },
25
+ "prod-east1": {
26
+ "region": "us-east-1",
27
+ "workspace": "prod-east1",
28
+ "description": "Spot-only us-east-1 environment (T4/L4/CPU)",
29
+ },
25
30
  }
26
31
  DEFAULT_ENVIRONMENT = "prod"
27
32
 
@@ -162,11 +162,20 @@ def _generate_ssh_config(hostname: str, pod_name: str) -> str:
162
162
  Returns:
163
163
  SSH config content as string
164
164
  """
165
+ import sys
166
+ # AddKeysToAgent makes SSH stash the IdentityFile into ssh-agent the first time it
167
+ # uses it for this Host, so the next agent-forwarding hop (pod → pod) actually has
168
+ # something to forward. On macOS, UseKeychain persists the passphrase via Keychain
169
+ # so users aren't prompted on every shell restart. Linux ssh errors on UseKeychain,
170
+ # so guard it with IgnoreUnknown.
171
+ extra = " AddKeysToAgent yes\n"
172
+ if sys.platform == "darwin":
173
+ extra += " IgnoreUnknown UseKeychain\n UseKeychain yes\n"
165
174
  config_content = f"""Host {pod_name}
166
175
  HostName {hostname}
167
176
  User dev
168
177
  ForwardAgent yes
169
- ProxyCommand gpu-dev-ssh-proxy %h %p
178
+ {extra} ProxyCommand gpu-dev-ssh-proxy %h %p
170
179
  StrictHostKeyChecking no
171
180
  UserKnownHostsFile /dev/null
172
181
  """
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "gpu-dev"
7
- version = "0.5.24"
7
+ version = "0.5.26"
8
8
  description = "CLI tool for PyTorch GPU developer server reservations"
9
9
  authors = [{name = "PyTorch Team"}]
10
10
  readme = "cli-tools/gpu-dev-cli/README.md"
@@ -401,11 +401,14 @@ resource "aws_launch_template" "gpu_dev_launch_template" {
401
401
  }
402
402
  }
403
403
 
404
- # Conditionally add instance_market_options for capacity block instances (only when capacity reservation exists)
404
+ # instance_market_options: capacity-block when bound to a reservation, spot when
405
+ # the workspace's gpu_config has use_spot=true, otherwise on-demand (no block).
406
+ # Spot is mutually exclusive with capacity reservations — AWS rejects launch templates
407
+ # carrying both, so the precedence here is CR > spot > on-demand.
405
408
  dynamic "instance_market_options" {
406
- for_each = each.value.capacity_reservation_id != null ? [1] : []
409
+ for_each = (each.value.capacity_reservation_id != null || try(each.value.gpu_config.use_spot, false)) ? [1] : []
407
410
  content {
408
- market_type = "capacity-block"
411
+ market_type = each.value.capacity_reservation_id != null ? "capacity-block" : "spot"
409
412
  }
410
413
  }
411
414
 
@@ -58,6 +58,13 @@ provider "helm" {
58
58
  # Data sources
59
59
  data "aws_availability_zones" "available" {
60
60
  state = "available"
61
+ # Exclude Local Zones (e.g. us-east-1-dfw-2a) and Wavelength Zones — EKS control
62
+ # plane only supports standard AZs. us-east-2 doesn't have Local Zones so the
63
+ # existing prod workspace was unaffected; us-east-1 has several (dfw, bos, …).
64
+ filter {
65
+ name = "opt-in-status"
66
+ values = ["opt-in-not-required"]
67
+ }
61
68
  }
62
69
 
63
70
  data "aws_caller_identity" "current" {}
@@ -315,6 +322,50 @@ locals {
315
322
  }
316
323
  }
317
324
  }
325
+ # us-east-1 spot-only experimental cluster.
326
+ # Same provisioning shape as prod (managed via the terraform.workspace switch) but
327
+ # backed entirely by EC2 Spot — first cheap-and-cheerful environment we can deploy
328
+ # new instance types into (B300s will land here once on-demand quota arrives).
329
+ "prod-east1" = {
330
+ aws_region = "us-east-1"
331
+ environment = "prod-east1"
332
+ domain_name = "east1.devservers.io"
333
+ gpu_instance_count = 1
334
+ use_self_managed_nodes = true
335
+ instance_type = "g4dn.12xlarge"
336
+ supported_gpu_types = {
337
+ "t4" = {
338
+ instance_type = "g4dn.12xlarge"
339
+ instance_types = null
340
+ instance_count = 1
341
+ gpus_per_instance = 4
342
+ use_placement_group = false
343
+ architecture = "x86_64"
344
+ efa_network_cards = 0
345
+ use_spot = true
346
+ }
347
+ "l4" = {
348
+ instance_type = "g6.12xlarge"
349
+ instance_types = null
350
+ instance_count = 1
351
+ gpus_per_instance = 4
352
+ use_placement_group = false
353
+ architecture = "x86_64"
354
+ efa_network_cards = 1
355
+ use_spot = true
356
+ }
357
+ "cpu-x86" = {
358
+ instance_type = "c7i.8xlarge"
359
+ instance_types = null
360
+ instance_count = 5
361
+ gpus_per_instance = 0
362
+ use_placement_group = false
363
+ architecture = "x86_64"
364
+ efa_network_cards = 0
365
+ use_spot = true
366
+ }
367
+ }
368
+ }
318
369
  }
319
370
 
320
371
  # Current workspace configuration
@@ -322,6 +373,9 @@ locals {
322
373
 
323
374
  # Workspace-specific capacity reservations (with manual instance counts)
324
375
  capacity_reservations = {
376
+ "prod-east1" = {
377
+ # No capacity reservations — this workspace is spot-only.
378
+ }
325
379
  default = {
326
380
  # Test environment capacity reservations
327
381
  # h100 = [
@@ -366,6 +420,13 @@ locals {
366
420
 
367
421
  # Workspace-specific GPU type to subnet mappings
368
422
  gpu_subnet_assignments = {
423
+ "prod-east1" = {
424
+ # All node types land in the primary subnet (us-east-1a). Spot availability is
425
+ # better than placement-group-strictness on these small ASGs.
426
+ t4 = "primary"
427
+ l4 = "primary"
428
+ "cpu-x86" = "primary"
429
+ }
369
430
  default = {
370
431
  # Test environment - T4 nodes in multiple AZs for testing
371
432
  t4 = "primary" # T4 in us-west-1a (primary AZ)
@@ -392,6 +453,9 @@ locals {
392
453
 
393
454
  # Per-capacity-reservation AZ mappings (overrides gpu_subnet_assignments when CR is used)
394
455
  capacity_reservation_azs = {
456
+ "prod-east1" = {
457
+ # Empty — no CRs in this workspace.
458
+ }
395
459
  default = {
396
460
  "cr-04d3d1d84e127a562" = "secondary" # us-west-1c
397
461
  }
@@ -0,0 +1,36 @@
1
+ # AWS Node Termination Handler — graceful drain on spot-interrupt + ASG lifecycle events.
2
+ #
3
+ # IMDS mode (one DaemonSet per node, no SQS / no IAM role) is plenty for our use case:
4
+ # we don't care about queue-processor-only features (AZ rebalance, instance state-change
5
+ # events). We just want pods to get a clean SIGTERM when AWS sends the 2-minute spot
6
+ # notice via instance metadata, instead of being killed cold.
7
+ #
8
+ # Tolerates everything so it runs on the GPU nodes that have nvidia.com/gpu:NoSchedule.
9
+
10
+ resource "helm_release" "aws_node_termination_handler" {
11
+ name = "aws-node-termination-handler"
12
+ repository = "https://aws.github.io/eks-charts"
13
+ chart = "aws-node-termination-handler"
14
+ namespace = "kube-system"
15
+ version = "0.27.1"
16
+ cleanup_on_fail = true
17
+
18
+ values = [yamlencode({
19
+ enableSpotInterruptionDraining = true
20
+ enableScheduledEventDraining = true
21
+ enableRebalanceMonitoring = true
22
+ enableRebalanceDraining = false # warning only; rebalance recommendations are too noisy
23
+ nodeSelector = {
24
+ "kubernetes.io/os" = "linux"
25
+ }
26
+ tolerations = [
27
+ { operator = "Exists" }, # tolerate every taint; we want NTH on every node, including GPU nodes
28
+ ]
29
+ resources = {
30
+ requests = { cpu = "50m", memory = "64Mi" }
31
+ limits = { cpu = "100m", memory = "128Mi" }
32
+ }
33
+ })]
34
+
35
+ depends_on = [aws_eks_cluster.gpu_dev_cluster]
36
+ }
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes