gpu-dev 0.5.13__tar.gz → 0.5.14__tar.gz

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (117)
  1. {gpu_dev-0.5.13 → gpu_dev-0.5.14}/PKG-INFO +1 -1
  2. {gpu_dev-0.5.13 → gpu_dev-0.5.14}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/PKG-INFO +1 -1
  3. {gpu_dev-0.5.13 → gpu_dev-0.5.14}/cli-tools/gpu-dev-cli/gpu_dev_cli/cli.py +8 -9
  4. {gpu_dev-0.5.13 → gpu_dev-0.5.14}/cli-tools/gpu-dev-cli/gpu_dev_cli/interactive.py +2 -2
  5. {gpu_dev-0.5.13 → gpu_dev-0.5.14}/pyproject.toml +1 -1
  6. {gpu_dev-0.5.13 → gpu_dev-0.5.14}/terraform-gpu-devservers/mig-config.tf +23 -6
  7. {gpu_dev-0.5.13 → gpu_dev-0.5.14}/.github/workflows/no-gitlinks.yml +0 -0
  8. {gpu_dev-0.5.13 → gpu_dev-0.5.14}/.github/workflows/publish.yml +0 -0
  9. {gpu_dev-0.5.13 → gpu_dev-0.5.14}/.gitignore +0 -0
  10. {gpu_dev-0.5.13 → gpu_dev-0.5.14}/CLAUDE.md +0 -0
  11. {gpu_dev-0.5.13 → gpu_dev-0.5.14}/PROGRESS.md +0 -0
  12. {gpu_dev-0.5.13 → gpu_dev-0.5.14}/PR_DESCRIPTION.md +0 -0
  13. {gpu_dev-0.5.13 → gpu_dev-0.5.14}/TODO.md +0 -0
  14. {gpu_dev-0.5.13 → gpu_dev-0.5.14}/admin/README.md +0 -0
  15. {gpu_dev-0.5.13 → gpu_dev-0.5.14}/admin/generate_stats.py +0 -0
  16. {gpu_dev-0.5.13 → gpu_dev-0.5.14}/admin/requirements.txt +0 -0
  17. {gpu_dev-0.5.13 → gpu_dev-0.5.14}/cli-tools/gpu-dev-cli/README.md +0 -0
  18. {gpu_dev-0.5.13 → gpu_dev-0.5.14}/cli-tools/gpu-dev-cli/ZERO_CONFIG_SETUP.md +0 -0
  19. {gpu_dev-0.5.13 → gpu_dev-0.5.14}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/SOURCES.txt +0 -0
  20. {gpu_dev-0.5.13 → gpu_dev-0.5.14}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/dependency_links.txt +0 -0
  21. {gpu_dev-0.5.13 → gpu_dev-0.5.14}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/entry_points.txt +0 -0
  22. {gpu_dev-0.5.13 → gpu_dev-0.5.14}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/requires.txt +0 -0
  23. {gpu_dev-0.5.13 → gpu_dev-0.5.14}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/top_level.txt +0 -0
  24. {gpu_dev-0.5.13 → gpu_dev-0.5.14}/cli-tools/gpu-dev-cli/gpu_dev_cli/__init__.py +0 -0
  25. {gpu_dev-0.5.13 → gpu_dev-0.5.14}/cli-tools/gpu-dev-cli/gpu_dev_cli/auth.py +0 -0
  26. {gpu_dev-0.5.13 → gpu_dev-0.5.14}/cli-tools/gpu-dev-cli/gpu_dev_cli/config.py +0 -0
  27. {gpu_dev-0.5.13 → gpu_dev-0.5.14}/cli-tools/gpu-dev-cli/gpu_dev_cli/disks.py +0 -0
  28. {gpu_dev-0.5.13 → gpu_dev-0.5.14}/cli-tools/gpu-dev-cli/gpu_dev_cli/name_generator.py +0 -0
  29. {gpu_dev-0.5.13 → gpu_dev-0.5.14}/cli-tools/gpu-dev-cli/gpu_dev_cli/reservations.py +0 -0
  30. {gpu_dev-0.5.13 → gpu_dev-0.5.14}/cli-tools/gpu-dev-cli/gpu_dev_cli/ssh_proxy.py +0 -0
  31. {gpu_dev-0.5.13 → gpu_dev-0.5.14}/cli-tools/gpu-dev-cli/minimal-iam-policy.json +0 -0
  32. {gpu_dev-0.5.13 → gpu_dev-0.5.14}/cli-tools/scripts/clear_stale_disk_locks.py +0 -0
  33. {gpu_dev-0.5.13 → gpu_dev-0.5.14}/docs/USER_GUIDE.md +0 -0
  34. {gpu_dev-0.5.13 → gpu_dev-0.5.14}/docs/devgpu-features.html +0 -0
  35. {gpu_dev-0.5.13 → gpu_dev-0.5.14}/docs/docker-mark-blue.svg +0 -0
  36. {gpu_dev-0.5.13 → gpu_dev-0.5.14}/docs/icons8-cursor-ai.svg +0 -0
  37. {gpu_dev-0.5.13 → gpu_dev-0.5.14}/post.md +0 -0
  38. {gpu_dev-0.5.13 → gpu_dev-0.5.14}/setup.cfg +0 -0
  39. {gpu_dev-0.5.13 → gpu_dev-0.5.14}/terraform-gpu-devservers/.claude/skills/deploy.md +0 -0
  40. {gpu_dev-0.5.13 → gpu_dev-0.5.14}/terraform-gpu-devservers/.terraform.lock.hcl +0 -0
  41. {gpu_dev-0.5.13 → gpu_dev-0.5.14}/terraform-gpu-devservers/README.md +0 -0
  42. {gpu_dev-0.5.13 → gpu_dev-0.5.14}/terraform-gpu-devservers/alb.tf +0 -0
  43. {gpu_dev-0.5.13 → gpu_dev-0.5.14}/terraform-gpu-devservers/availability.tf +0 -0
  44. {gpu_dev-0.5.13 → gpu_dev-0.5.14}/terraform-gpu-devservers/backend.tf +0 -0
  45. {gpu_dev-0.5.13 → gpu_dev-0.5.14}/terraform-gpu-devservers/docker/.dockerignore +0 -0
  46. {gpu_dev-0.5.13 → gpu_dev-0.5.14}/terraform-gpu-devservers/docker/Dockerfile +0 -0
  47. {gpu_dev-0.5.13 → gpu_dev-0.5.14}/terraform-gpu-devservers/docker/backup-dotfiles +0 -0
  48. {gpu_dev-0.5.13 → gpu_dev-0.5.14}/terraform-gpu-devservers/docker/bash_profile +0 -0
  49. {gpu_dev-0.5.13 → gpu_dev-0.5.14}/terraform-gpu-devservers/docker/bashrc +0 -0
  50. {gpu_dev-0.5.13 → gpu_dev-0.5.14}/terraform-gpu-devservers/docker/bashrc_ext +0 -0
  51. {gpu_dev-0.5.13 → gpu_dev-0.5.14}/terraform-gpu-devservers/docker/build-with-efa.sh +0 -0
  52. {gpu_dev-0.5.13 → gpu_dev-0.5.14}/terraform-gpu-devservers/docker/dotfiles-shutdown-handler +0 -0
  53. {gpu_dev-0.5.13 → gpu_dev-0.5.14}/terraform-gpu-devservers/docker/list-dotfile-versions +0 -0
  54. {gpu_dev-0.5.13 → gpu_dev-0.5.14}/terraform-gpu-devservers/docker/motd_script +0 -0
  55. {gpu_dev-0.5.13 → gpu_dev-0.5.14}/terraform-gpu-devservers/docker/nproc_wrapper +0 -0
  56. {gpu_dev-0.5.13 → gpu_dev-0.5.14}/terraform-gpu-devservers/docker/profile +0 -0
  57. {gpu_dev-0.5.13 → gpu_dev-0.5.14}/terraform-gpu-devservers/docker/restore-dotfiles +0 -0
  58. {gpu_dev-0.5.13 → gpu_dev-0.5.14}/terraform-gpu-devservers/docker/restore-dotfiles-version +0 -0
  59. {gpu_dev-0.5.13 → gpu_dev-0.5.14}/terraform-gpu-devservers/docker/setup-dotfiles-persistence +0 -0
  60. {gpu_dev-0.5.13 → gpu_dev-0.5.14}/terraform-gpu-devservers/docker/shell_env +0 -0
  61. {gpu_dev-0.5.13 → gpu_dev-0.5.14}/terraform-gpu-devservers/docker/ssh_config +0 -0
  62. {gpu_dev-0.5.13 → gpu_dev-0.5.14}/terraform-gpu-devservers/docker/zprofile +0 -0
  63. {gpu_dev-0.5.13 → gpu_dev-0.5.14}/terraform-gpu-devservers/docker/zshrc +0 -0
  64. {gpu_dev-0.5.13 → gpu_dev-0.5.14}/terraform-gpu-devservers/docker/zshrc_ext +0 -0
  65. {gpu_dev-0.5.13 → gpu_dev-0.5.14}/terraform-gpu-devservers/docker-build.tf +0 -0
  66. {gpu_dev-0.5.13 → gpu_dev-0.5.14}/terraform-gpu-devservers/docker-example/Dockerfile +0 -0
  67. {gpu_dev-0.5.13 → gpu_dev-0.5.14}/terraform-gpu-devservers/docker-example/hello.txt +0 -0
  68. {gpu_dev-0.5.13 → gpu_dev-0.5.14}/terraform-gpu-devservers/ecr.tf +0 -0
  69. {gpu_dev-0.5.13 → gpu_dev-0.5.14}/terraform-gpu-devservers/efs.tf +0 -0
  70. {gpu_dev-0.5.13 → gpu_dev-0.5.14}/terraform-gpu-devservers/eks.tf +0 -0
  71. {gpu_dev-0.5.13 → gpu_dev-0.5.14}/terraform-gpu-devservers/expiry.tf +0 -0
  72. {gpu_dev-0.5.13 → gpu_dev-0.5.14}/terraform-gpu-devservers/git-cache.tf +0 -0
  73. {gpu_dev-0.5.13 → gpu_dev-0.5.14}/terraform-gpu-devservers/kubernetes.tf +0 -0
  74. {gpu_dev-0.5.13 → gpu_dev-0.5.14}/terraform-gpu-devservers/lambda/availability_updater/index.py +0 -0
  75. {gpu_dev-0.5.13 → gpu_dev-0.5.14}/terraform-gpu-devservers/lambda/availability_updater/requirements.txt +0 -0
  76. {gpu_dev-0.5.13 → gpu_dev-0.5.14}/terraform-gpu-devservers/lambda/migration/tag_largest_snapshots.py +0 -0
  77. {gpu_dev-0.5.13 → gpu_dev-0.5.14}/terraform-gpu-devservers/lambda/reservation_expiry/index.py +0 -0
  78. {gpu_dev-0.5.13 → gpu_dev-0.5.14}/terraform-gpu-devservers/lambda/reservation_expiry/requirements.txt +0 -0
  79. {gpu_dev-0.5.13 → gpu_dev-0.5.14}/terraform-gpu-devservers/lambda/reservation_processor/buildkit_job.py +0 -0
  80. {gpu_dev-0.5.13 → gpu_dev-0.5.14}/terraform-gpu-devservers/lambda/reservation_processor/index.py +0 -0
  81. {gpu_dev-0.5.13 → gpu_dev-0.5.14}/terraform-gpu-devservers/lambda/reservation_processor/requirements.txt +0 -0
  82. {gpu_dev-0.5.13 → gpu_dev-0.5.14}/terraform-gpu-devservers/lambda/shared/__init__.py +0 -0
  83. {gpu_dev-0.5.13 → gpu_dev-0.5.14}/terraform-gpu-devservers/lambda/shared/alb_utils.py +0 -0
  84. {gpu_dev-0.5.13 → gpu_dev-0.5.14}/terraform-gpu-devservers/lambda/shared/dns_utils.py +0 -0
  85. {gpu_dev-0.5.13 → gpu_dev-0.5.14}/terraform-gpu-devservers/lambda/shared/k8s_client.py +0 -0
  86. {gpu_dev-0.5.13 → gpu_dev-0.5.14}/terraform-gpu-devservers/lambda/shared/k8s_resource_tracker.py +0 -0
  87. {gpu_dev-0.5.13 → gpu_dev-0.5.14}/terraform-gpu-devservers/lambda/shared/requirements.txt +0 -0
  88. {gpu_dev-0.5.13 → gpu_dev-0.5.14}/terraform-gpu-devservers/lambda/shared/snapshot_utils.py +0 -0
  89. {gpu_dev-0.5.13 → gpu_dev-0.5.14}/terraform-gpu-devservers/lambda.tf +0 -0
  90. {gpu_dev-0.5.13 → gpu_dev-0.5.14}/terraform-gpu-devservers/main.tf +0 -0
  91. {gpu_dev-0.5.13 → gpu_dev-0.5.14}/terraform-gpu-devservers/mig-parted-config.yaml +0 -0
  92. {gpu_dev-0.5.13 → gpu_dev-0.5.14}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py +0 -0
  93. {gpu_dev-0.5.13 → gpu_dev-0.5.14}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py.bak +0 -0
  94. {gpu_dev-0.5.13 → gpu_dev-0.5.14}/terraform-gpu-devservers/migrations/check_snapshots.py +0 -0
  95. {gpu_dev-0.5.13 → gpu_dev-0.5.14}/terraform-gpu-devservers/migrations/migrate_disks_to_named.py +0 -0
  96. {gpu_dev-0.5.13 → gpu_dev-0.5.14}/terraform-gpu-devservers/migrations/run_backfill.sh +0 -0
  97. {gpu_dev-0.5.13 → gpu_dev-0.5.14}/terraform-gpu-devservers/monitoring.tf +0 -0
  98. {gpu_dev-0.5.13 → gpu_dev-0.5.14}/terraform-gpu-devservers/outputs.tf +0 -0
  99. {gpu_dev-0.5.13 → gpu_dev-0.5.14}/terraform-gpu-devservers/pyproject.toml +0 -0
  100. {gpu_dev-0.5.13 → gpu_dev-0.5.14}/terraform-gpu-devservers/queue.tf +0 -0
  101. {gpu_dev-0.5.13 → gpu_dev-0.5.14}/terraform-gpu-devservers/route53.tf +0 -0
  102. {gpu_dev-0.5.13 → gpu_dev-0.5.14}/terraform-gpu-devservers/s3-disk-contents.tf +0 -0
  103. {gpu_dev-0.5.13 → gpu_dev-0.5.14}/terraform-gpu-devservers/scripts/CLEANUP_GUIDE.md +0 -0
  104. {gpu_dev-0.5.13 → gpu_dev-0.5.14}/terraform-gpu-devservers/scripts/detect_empty_volumes.sh +0 -0
  105. {gpu_dev-0.5.13 → gpu_dev-0.5.14}/terraform-gpu-devservers/scripts/ec2_avail_probe.sh +0 -0
  106. {gpu_dev-0.5.13 → gpu_dev-0.5.14}/terraform-gpu-devservers/scripts/inspect_user_data.sh +0 -0
  107. {gpu_dev-0.5.13 → gpu_dev-0.5.14}/terraform-gpu-devservers/ssh-proxy/Dockerfile +0 -0
  108. {gpu_dev-0.5.13 → gpu_dev-0.5.14}/terraform-gpu-devservers/ssh-proxy/proxy.py +0 -0
  109. {gpu_dev-0.5.13 → gpu_dev-0.5.14}/terraform-gpu-devservers/ssh-proxy/requirements.txt +0 -0
  110. {gpu_dev-0.5.13 → gpu_dev-0.5.14}/terraform-gpu-devservers/ssh-proxy-service.tf +0 -0
  111. {gpu_dev-0.5.13 → gpu_dev-0.5.14}/terraform-gpu-devservers/ssh-proxy.tf +0 -0
  112. {gpu_dev-0.5.13 → gpu_dev-0.5.14}/terraform-gpu-devservers/switch-to.sh +0 -0
  113. {gpu_dev-0.5.13 → gpu_dev-0.5.14}/terraform-gpu-devservers/templates/al2023-cpu-user-data.sh +0 -0
  114. {gpu_dev-0.5.13 → gpu_dev-0.5.14}/terraform-gpu-devservers/templates/al2023-user-data.sh +0 -0
  115. {gpu_dev-0.5.13 → gpu_dev-0.5.14}/terraform-gpu-devservers/templates/user-data-self-managed.sh +0 -0
  116. {gpu_dev-0.5.13 → gpu_dev-0.5.14}/terraform-gpu-devservers/templates/user-data.sh +0 -0
  117. {gpu_dev-0.5.13 → gpu_dev-0.5.14}/terraform-gpu-devservers/variables.tf +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: gpu-dev
3
- Version: 0.5.13
3
+ Version: 0.5.14
4
4
  Summary: CLI tool for PyTorch GPU developer server reservations
5
5
  Author: PyTorch Team
6
6
  Requires-Python: >=3.10
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: gpu-dev
3
- Version: 0.5.13
3
+ Version: 0.5.14
4
4
  Summary: CLI tool for PyTorch GPU developer server reservations
5
5
  Author: PyTorch Team
6
6
  Requires-Python: >=3.10
@@ -688,6 +688,7 @@ def reserve(
688
688
  # and total wall-clock time drops from sum to max(each).
689
689
  from concurrent.futures import ThreadPoolExecutor
690
690
  config = load_config()
691
+ reservation_mgr = ReservationManager(config)
691
692
 
692
693
  with Live(
693
694
  Spinner("dots", text="🚀 Loading…"), console=console
@@ -704,9 +705,7 @@ def reserve(
704
705
  else:
705
706
  f_ssh = ex.submit(validate_ssh_key_matches_github_user, config, None)
706
707
  ssh_result = None
707
- f_avail = ex.submit(
708
- lambda: ReservationManager(config).get_gpu_availability_by_type()
709
- )
708
+ f_avail = ex.submit(reservation_mgr.get_gpu_availability_by_type)
710
709
 
711
710
  # Surface auth failure first (most actionable).
712
711
  try:
@@ -2496,10 +2495,10 @@ def _show_availability() -> None:
2496
2495
  table = Table(
2497
2496
  title="GPU Availability by Type (numbers are GPUs, not nodes)")
2498
2497
  table.add_column("GPU Type", style="cyan")
2499
- table.add_column("Available", style="green")
2500
- table.add_column("Max Reservable", style="bright_green")
2498
+ table.add_column("Avail", style="green")
2499
+ table.add_column("Max\nReservable", style="bright_green")
2501
2500
  table.add_column("Total", style="blue")
2502
- table.add_column("Queue Length", style="yellow")
2501
+ table.add_column("Queue\nLength", style="yellow")
2503
2502
  table.add_column("Architecture", style="dim")
2504
2503
  table.add_column("Est. Wait Time", style="magenta")
2505
2504
 
@@ -2657,10 +2656,10 @@ def _show_availability_watch(interval: int) -> None:
2657
2656
  table = Table(
2658
2657
  title="GPU Availability by Type (numbers are GPUs, not nodes)")
2659
2658
  table.add_column("GPU Type", style="cyan")
2660
- table.add_column("Available", style="green")
2661
- table.add_column("Max Reservable", style="blue")
2659
+ table.add_column("Avail", style="green")
2660
+ table.add_column("Max\nReservable", style="blue")
2662
2661
  table.add_column("Total", style="blue")
2663
- table.add_column("Queue Length", style="yellow")
2662
+ table.add_column("Queue\nLength", style="yellow")
2664
2663
  table.add_column("Architecture", style="dim")
2665
2664
  table.add_column("Est. Wait Time", style="magenta")
2666
2665
 
@@ -88,9 +88,9 @@ def select_gpu_type_interactive(
88
88
  console.print("\n[cyan]🖥️ GPU Availability:[/cyan]")
89
89
  table = Table()
90
90
  table.add_column("GPU Type", style="cyan")
91
- table.add_column("Available", style="green")
91
+ table.add_column("Avail", style="green")
92
92
  table.add_column("Total", style="blue")
93
- table.add_column("Queue Length", style="yellow")
93
+ table.add_column("Queue\nLength", style="yellow")
94
94
  table.add_column("Est. Wait Time", style="magenta")
95
95
 
96
96
  choices = []
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "gpu-dev"
7
- version = "0.5.13"
7
+ version = "0.5.14"
8
8
  description = "CLI tool for PyTorch GPU developer server reservations"
9
9
  authors = [{name = "PyTorch Team"}]
10
10
  readme = "cli-tools/gpu-dev-cli/README.md"
@@ -25,23 +25,40 @@ resource "kubernetes_config_map" "gpu_dev_mig_parted_config" {
25
25
  depends_on = [helm_release.nvidia_gpu_operator]
26
26
  }
27
27
 
28
- # Optional declarative B200 MIG node label. Set b200_mig_node_name in tfvars (or override the
29
- # variable's default below) to dedicate a specific B200 node to the mixed profile. Empty string
30
- # means "no node currently labelled" — the existing all-disabled stays in effect.
28
+ # Declarative B200 MIG node label. Set b200_mig_node_name (per workspace via the locals lookup
29
+ # below, or override via tfvars / -var) to dedicate a specific B200 node to the mixed profile.
30
+ # Empty string means "no node labelled" — every B200 stays full.
31
+ #
32
+ # Future cleanup: when we split a B200 CR into two ASGs (one with mig_profile, one without),
33
+ # the user_data path will set this label at boot for any instance in the MIG-dedicated ASG —
34
+ # matching the H100 cr3 pattern. Until then, this declarative label pins the role to a hostname.
35
+ locals {
36
+ # Workspace-scoped defaults so the resource is a no-op in non-prod and no apply ever tries to
37
+ # label a node that doesn't exist.
38
+ default_b200_mig_node_by_workspace = {
39
+ prod = "ip-10-0-67-125.us-east-2.compute.internal"
40
+ }
41
+ b200_mig_node_effective = (
42
+ var.b200_mig_node_name != ""
43
+ ? var.b200_mig_node_name
44
+ : lookup(local.default_b200_mig_node_by_workspace, terraform.workspace, "")
45
+ )
46
+ }
47
+
31
48
  variable "b200_mig_node_name" {
32
- description = "Hostname of the B200 node to label with nvidia.com/mig.config=b200-6full-2mig-balanced. Leave empty to skip."
49
+ description = "Hostname of the B200 node to label with nvidia.com/mig.config=b200-6full-2mig-balanced. Leave empty to use the per-workspace default in mig-config.tf."
33
50
  type = string
34
51
  default = ""
35
52
  }
36
53
 
37
54
  resource "kubernetes_labels" "b200_mig_node" {
38
- count = var.b200_mig_node_name == "" ? 0 : 1
55
+ count = local.b200_mig_node_effective == "" ? 0 : 1
39
56
 
40
57
  api_version = "v1"
41
58
  kind = "Node"
42
59
 
43
60
  metadata {
44
- name = var.b200_mig_node_name
61
+ name = local.b200_mig_node_effective
45
62
  }
46
63
 
47
64
  labels = {
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes