gpu-dev 0.5.30__tar.gz → 0.5.31__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/PKG-INFO +1 -1
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/PKG-INFO +1 -1
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/SOURCES.txt +7 -0
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/cli-tools/gpu-dev-cli/gpu_dev_cli/cli.py +263 -101
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/cli-tools/gpu-dev-cli/gpu_dev_cli/config.py +1 -0
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/cli-tools/gpu-dev-cli/gpu_dev_cli/interactive.py +192 -70
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/cli-tools/gpu-dev-cli/gpu_dev_cli/reservations.py +29 -2
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/cli-tools/gpu-dev-cli/gpu_dev_cli/ssh_proxy.py +4 -2
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/pyproject.toml +1 -1
- gpu_dev-0.5.31/terraform-gpu-devservers/ami-baker.tf +125 -0
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/availability.tf +11 -1
- gpu_dev-0.5.31/terraform-gpu-devservers/check_b200.py +21 -0
- gpu_dev-0.5.31/terraform-gpu-devservers/cluster-autoscaler.tf +47 -0
- gpu_dev-0.5.31/terraform-gpu-devservers/cmd_proxy.py +49 -0
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/eks.tf +50 -11
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/expiry.tf +3 -0
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/gpu-dev-pod-irsa.tf +31 -0
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/lambda/availability_updater/index.py +106 -18
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/lambda/reservation_expiry/index.py +33 -3
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/lambda/reservation_processor/index.py +135 -11
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/lambda.tf +9 -3
- gpu_dev-0.5.31/terraform-gpu-devservers/list_b200.py +68 -0
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/main.tf +6 -9
- gpu_dev-0.5.31/terraform-gpu-devservers/subnet-0fe3a2c45570091ad +0 -0
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/templates/al2023-user-data.sh +1 -1
- gpu_dev-0.5.31/terraform-gpu-devservers/templates/ami-baker-user-data.sh +44 -0
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/.github/workflows/no-gitlinks.yml +0 -0
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/.github/workflows/publish.yml +0 -0
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/.gitignore +0 -0
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/CLAUDE.md +0 -0
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/PROGRESS.md +0 -0
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/PR_DESCRIPTION.md +0 -0
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/README.md +0 -0
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/TODO.md +0 -0
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/admin/README.md +0 -0
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/admin/generate_stats.py +0 -0
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/admin/requirements.txt +0 -0
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/cli-tools/gpu-dev-cli/README.md +0 -0
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/cli-tools/gpu-dev-cli/ZERO_CONFIG_SETUP.md +0 -0
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/dependency_links.txt +0 -0
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/entry_points.txt +0 -0
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/requires.txt +0 -0
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/top_level.txt +0 -0
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/cli-tools/gpu-dev-cli/gpu_dev_cli/__init__.py +0 -0
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/cli-tools/gpu-dev-cli/gpu_dev_cli/auth.py +0 -0
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/cli-tools/gpu-dev-cli/gpu_dev_cli/disks.py +0 -0
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/cli-tools/gpu-dev-cli/gpu_dev_cli/name_generator.py +0 -0
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/cli-tools/gpu-dev-cli/minimal-iam-policy.json +0 -0
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/cli-tools/scripts/clear_stale_disk_locks.py +0 -0
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/docs/USER_GUIDE.md +0 -0
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/docs/devgpu-features.html +0 -0
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/docs/docker-mark-blue.svg +0 -0
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/docs/icons8-cursor-ai.svg +0 -0
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/post.md +0 -0
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/setup.cfg +0 -0
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/.claude/skills/deploy.md +0 -0
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/.terraform.lock.hcl +0 -0
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/README.md +0 -0
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/alb.tf +0 -0
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/backend.tf +0 -0
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/docker/.dockerignore +0 -0
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/docker/Dockerfile +0 -0
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/docker/backup-dotfiles +0 -0
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/docker/bash_profile +0 -0
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/docker/bashrc +0 -0
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/docker/bashrc_ext +0 -0
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/docker/build-with-efa.sh +0 -0
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/docker/dotfiles-shutdown-handler +0 -0
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/docker/list-dotfile-versions +0 -0
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/docker/motd_script +0 -0
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/docker/nproc_wrapper +0 -0
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/docker/profile +0 -0
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/docker/restore-dotfiles +0 -0
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/docker/restore-dotfiles-version +0 -0
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/docker/setup-dotfiles-persistence +0 -0
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/docker/shell_env +0 -0
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/docker/ssh_config +0 -0
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/docker/zprofile +0 -0
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/docker/zshrc +0 -0
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/docker/zshrc_ext +0 -0
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/docker-build.tf +0 -0
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/docker-example/Dockerfile +0 -0
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/docker-example/hello.txt +0 -0
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/ecr.tf +0 -0
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/efs.tf +0 -0
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/git-cache.tf +0 -0
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/kubernetes.tf +0 -0
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/lambda/availability_updater/requirements.txt +0 -0
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/lambda/migration/tag_largest_snapshots.py +0 -0
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/lambda/reservation_expiry/requirements.txt +0 -0
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/lambda/reservation_processor/buildkit_job.py +0 -0
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/lambda/reservation_processor/requirements.txt +0 -0
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/lambda/shared/__init__.py +0 -0
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/lambda/shared/alb_utils.py +0 -0
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/lambda/shared/dns_utils.py +0 -0
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/lambda/shared/k8s_client.py +0 -0
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/lambda/shared/k8s_resource_tracker.py +0 -0
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/lambda/shared/requirements.txt +0 -0
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/lambda/shared/snapshot_utils.py +0 -0
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/mig-config.tf +0 -0
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/mig-parted-config.yaml +0 -0
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py +0 -0
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py.bak +0 -0
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/migrations/check_snapshots.py +0 -0
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/migrations/migrate_disks_to_named.py +0 -0
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/migrations/run_backfill.sh +0 -0
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/monitoring.tf +0 -0
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/node-termination-handler.tf +0 -0
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/outputs.tf +0 -0
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/pyproject.toml +0 -0
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/queue.tf +0 -0
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/route53.tf +0 -0
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/s3-disk-contents.tf +0 -0
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/scripts/CLEANUP_GUIDE.md +0 -0
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/scripts/detect_empty_volumes.sh +0 -0
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/scripts/ec2_avail_probe.sh +0 -0
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/scripts/inspect_user_data.sh +0 -0
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/ssh-proxy/Dockerfile +0 -0
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/ssh-proxy/proxy.py +0 -0
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/ssh-proxy/requirements.txt +0 -0
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/ssh-proxy-service.tf +0 -0
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/ssh-proxy.tf +0 -0
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/switch-to.sh +0 -0
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/templates/al2023-cpu-user-data.sh +0 -0
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/templates/user-data-self-managed.sh +0 -0
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/templates/user-data.sh +0 -0
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/terraform-gpu-devservers/variables.tf +0 -0
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/tests/submit/README.md +0 -0
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/tests/submit/fail/run.sh +0 -0
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/tests/submit/multinode/run.sh +0 -0
- {gpu_dev-0.5.30 → gpu_dev-0.5.31}/tests/submit/success/run.sh +0 -0
|
@@ -37,8 +37,12 @@ docs/icons8-cursor-ai.svg
|
|
|
37
37
|
terraform-gpu-devservers/.terraform.lock.hcl
|
|
38
38
|
terraform-gpu-devservers/README.md
|
|
39
39
|
terraform-gpu-devservers/alb.tf
|
|
40
|
+
terraform-gpu-devservers/ami-baker.tf
|
|
40
41
|
terraform-gpu-devservers/availability.tf
|
|
41
42
|
terraform-gpu-devservers/backend.tf
|
|
43
|
+
terraform-gpu-devservers/check_b200.py
|
|
44
|
+
terraform-gpu-devservers/cluster-autoscaler.tf
|
|
45
|
+
terraform-gpu-devservers/cmd_proxy.py
|
|
42
46
|
terraform-gpu-devservers/docker-build.tf
|
|
43
47
|
terraform-gpu-devservers/ecr.tf
|
|
44
48
|
terraform-gpu-devservers/efs.tf
|
|
@@ -48,6 +52,7 @@ terraform-gpu-devservers/git-cache.tf
|
|
|
48
52
|
terraform-gpu-devservers/gpu-dev-pod-irsa.tf
|
|
49
53
|
terraform-gpu-devservers/kubernetes.tf
|
|
50
54
|
terraform-gpu-devservers/lambda.tf
|
|
55
|
+
terraform-gpu-devservers/list_b200.py
|
|
51
56
|
terraform-gpu-devservers/main.tf
|
|
52
57
|
terraform-gpu-devservers/mig-config.tf
|
|
53
58
|
terraform-gpu-devservers/mig-parted-config.yaml
|
|
@@ -60,6 +65,7 @@ terraform-gpu-devservers/route53.tf
|
|
|
60
65
|
terraform-gpu-devservers/s3-disk-contents.tf
|
|
61
66
|
terraform-gpu-devservers/ssh-proxy-service.tf
|
|
62
67
|
terraform-gpu-devservers/ssh-proxy.tf
|
|
68
|
+
terraform-gpu-devservers/subnet-0fe3a2c45570091ad
|
|
63
69
|
terraform-gpu-devservers/switch-to.sh
|
|
64
70
|
terraform-gpu-devservers/variables.tf
|
|
65
71
|
terraform-gpu-devservers/.claude/skills/deploy.md
|
|
@@ -114,6 +120,7 @@ terraform-gpu-devservers/ssh-proxy/proxy.py
|
|
|
114
120
|
terraform-gpu-devservers/ssh-proxy/requirements.txt
|
|
115
121
|
terraform-gpu-devservers/templates/al2023-cpu-user-data.sh
|
|
116
122
|
terraform-gpu-devservers/templates/al2023-user-data.sh
|
|
123
|
+
terraform-gpu-devservers/templates/ami-baker-user-data.sh
|
|
117
124
|
terraform-gpu-devservers/templates/user-data-self-managed.sh
|
|
118
125
|
terraform-gpu-devservers/templates/user-data.sh
|
|
119
126
|
tests/submit/README.md
|
|
@@ -41,6 +41,36 @@ from .interactive import (
|
|
|
41
41
|
console = Console()
|
|
42
42
|
|
|
43
43
|
|
|
44
|
+
def _fetch_reservations_cross_region(reservation_mgr, user_filter, statuses, config=None):
|
|
45
|
+
"""Fetch reservations from current region + prod-east1 if on prod."""
|
|
46
|
+
reservations = reservation_mgr.list_reservations(
|
|
47
|
+
user_filter=user_filter, statuses_to_include=statuses)
|
|
48
|
+
# Cross-region fetch
|
|
49
|
+
try:
|
|
50
|
+
cfg = config or load_config()
|
|
51
|
+
if cfg.user_config.get("environment") == "prod":
|
|
52
|
+
east1_env = Config.ENVIRONMENTS.get("prod-east1", {})
|
|
53
|
+
if east1_env:
|
|
54
|
+
import boto3 as _b3
|
|
55
|
+
east1_ddb = _b3.resource("dynamodb", region_name=east1_env["region"])
|
|
56
|
+
east1_table = east1_ddb.Table("pytorch-gpu-dev-reservations")
|
|
57
|
+
for st in (statuses or ["active"]):
|
|
58
|
+
resp = east1_table.query(
|
|
59
|
+
IndexName="StatusIndex",
|
|
60
|
+
KeyConditionExpression="#s = :status",
|
|
61
|
+
ExpressionAttributeNames={"#s": "status"},
|
|
62
|
+
ExpressionAttributeValues={":status": st},
|
|
63
|
+
)
|
|
64
|
+
for item in resp.get("Items", []):
|
|
65
|
+
if user_filter and item.get("user_id") != user_filter:
|
|
66
|
+
continue
|
|
67
|
+
item["_region"] = "us-east-1"
|
|
68
|
+
reservations.append(item)
|
|
69
|
+
except Exception:
|
|
70
|
+
pass
|
|
71
|
+
return reservations
|
|
72
|
+
|
|
73
|
+
|
|
44
74
|
def _format_relative_time(timestamp_str: str, relative_to: str = "now") -> str:
|
|
45
75
|
"""Format timestamp as relative time if within 24h, otherwise absolute"""
|
|
46
76
|
if not timestamp_str or timestamp_str == "N/A":
|
|
@@ -598,6 +628,7 @@ def reserve(
|
|
|
598
628
|
preserve_entrypoint: bool,
|
|
599
629
|
disk: Optional[str],
|
|
600
630
|
node_label: tuple,
|
|
631
|
+
spot: bool = False,
|
|
601
632
|
) -> None:
|
|
602
633
|
"""Reserve GPU development server(s)
|
|
603
634
|
|
|
@@ -688,6 +719,11 @@ def reserve(
|
|
|
688
719
|
rprint(
|
|
689
720
|
"[dim]Use --no-interactive flag to disable interactive mode[/dim]\n")
|
|
690
721
|
|
|
722
|
+
# Auto-acknowledge spot in spot-only environments so users don't need --spot
|
|
723
|
+
from .config import Config as _Cfg
|
|
724
|
+
_env_name = load_config().user_config.get("environment", "prod")
|
|
725
|
+
_spot_types_env = _Cfg.ENVIRONMENTS.get(_env_name, {}).get("spot_types", [])
|
|
726
|
+
|
|
691
727
|
# Run auth + SSH validation + availability fetch in parallel — they're independent
|
|
692
728
|
# and total wall-clock time drops from sum to max(each).
|
|
693
729
|
from concurrent.futures import ThreadPoolExecutor
|
|
@@ -748,6 +784,31 @@ def reserve(
|
|
|
748
784
|
rprint("[yellow]Reservation cancelled.[/yellow]")
|
|
749
785
|
return
|
|
750
786
|
|
|
787
|
+
# Handle spot: prefix from cross-region selection — use a TEMPORARY config
|
|
788
|
+
# for prod-east1 without persisting the environment change to disk.
|
|
789
|
+
if isinstance(gpu_type, str) and gpu_type.startswith("spot:"):
|
|
790
|
+
gpu_type = gpu_type[5:] # strip prefix
|
|
791
|
+
spot = True
|
|
792
|
+
rprint(f"\n[cyan]⚡ Switching to spot cluster (us-east-1) for {gpu_type.upper()}[/cyan]")
|
|
793
|
+
rprint("[dim]Spot instance: ~70% cheaper, may be preempted, separate disks.[/dim]\n")
|
|
794
|
+
# Build a temporary Config pointing at prod-east1 WITHOUT touching disk
|
|
795
|
+
import os as _os
|
|
796
|
+
east1_cfg = Config.ENVIRONMENTS.get("prod-east1", {})
|
|
797
|
+
_os.environ["AWS_DEFAULT_REGION"] = east1_cfg["region"]
|
|
798
|
+
config = Config()
|
|
799
|
+
config.aws_region = east1_cfg["region"]
|
|
800
|
+
reservation_mgr = ReservationManager(config)
|
|
801
|
+
try:
|
|
802
|
+
user_info = authenticate_user(config)
|
|
803
|
+
except RuntimeError as e:
|
|
804
|
+
rprint(f"[red]❌ {str(e)}[/red]")
|
|
805
|
+
return
|
|
806
|
+
|
|
807
|
+
# Auto-acknowledge spot for spot types in this environment
|
|
808
|
+
if _spot_types_env and gpu_type and gpu_type.lower() in _spot_types_env and not spot:
|
|
809
|
+
spot = True
|
|
810
|
+
rprint(f"[dim]{gpu_type.upper()} is a spot instance in this environment — --spot auto-acknowledged. May be preempted by AWS.[/dim]")
|
|
811
|
+
|
|
751
812
|
# Interactive GPU count selection
|
|
752
813
|
if gpus is None:
|
|
753
814
|
gpu_type_lower = gpu_type.lower()
|
|
@@ -1746,13 +1807,47 @@ def list(ctx: click.Context, user: Optional[str], status: Optional[str], details
|
|
|
1746
1807
|
def fetch_recent_failures():
|
|
1747
1808
|
return reservation_mgr.list_reservations(
|
|
1748
1809
|
user_filter=user_filter,
|
|
1749
|
-
statuses_to_include=["failed", "cancelled"],
|
|
1810
|
+
statuses_to_include=["failed", "cancelled", "expired"],
|
|
1750
1811
|
created_after=one_hour_ago)
|
|
1751
1812
|
|
|
1752
|
-
|
|
1813
|
+
# Also fetch from prod-east1 (cross-region) if we're on prod
|
|
1814
|
+
def fetch_east1():
|
|
1815
|
+
try:
|
|
1816
|
+
east1_env = Config.ENVIRONMENTS.get("prod-east1", {})
|
|
1817
|
+
if not east1_env or config.user_config.get("environment") != "prod":
|
|
1818
|
+
return []
|
|
1819
|
+
import boto3 as _b3
|
|
1820
|
+
east1_ddb = _b3.resource("dynamodb", region_name=east1_env["region"])
|
|
1821
|
+
east1_table = east1_ddb.Table("pytorch-gpu-dev-reservations")
|
|
1822
|
+
results = []
|
|
1823
|
+
# Fetch active + recent failures/expired (last 24h) from east1
|
|
1824
|
+
all_statuses = (statuses_to_include or ["active", "preparing", "queued", "pending"]) + ["failed", "expired", "cancelled"]
|
|
1825
|
+
for s in all_statuses:
|
|
1826
|
+
resp = east1_table.query(
|
|
1827
|
+
IndexName="StatusIndex",
|
|
1828
|
+
KeyConditionExpression="#s = :status",
|
|
1829
|
+
ExpressionAttributeNames={"#s": "status"},
|
|
1830
|
+
ExpressionAttributeValues={":status": s},
|
|
1831
|
+
)
|
|
1832
|
+
for item in resp.get("Items", []):
|
|
1833
|
+
if user_filter and item.get("user_id") != user_filter:
|
|
1834
|
+
continue
|
|
1835
|
+
# For failed/expired/cancelled, only show if ended recently
|
|
1836
|
+
if s in ("failed", "expired", "cancelled"):
|
|
1837
|
+
ended = item.get("reservation_ended") or item.get("expired_at") or item.get("created_at", "")
|
|
1838
|
+
if ended and ended < one_hour_ago:
|
|
1839
|
+
continue
|
|
1840
|
+
item["_region"] = "us-east-1"
|
|
1841
|
+
results.append(item)
|
|
1842
|
+
return results
|
|
1843
|
+
except Exception:
|
|
1844
|
+
return []
|
|
1845
|
+
|
|
1846
|
+
with ThreadPoolExecutor(max_workers=3) as executor:
|
|
1753
1847
|
active_future = executor.submit(fetch_active)
|
|
1754
1848
|
failures_future = executor.submit(fetch_recent_failures)
|
|
1755
|
-
|
|
1849
|
+
east1_future = executor.submit(fetch_east1)
|
|
1850
|
+
reservations = active_future.result() + failures_future.result() + east1_future.result()
|
|
1756
1851
|
else:
|
|
1757
1852
|
reservations = reservation_mgr.list_reservations(
|
|
1758
1853
|
user_filter=user_filter, statuses_to_include=statuses_to_include
|
|
@@ -1787,6 +1882,9 @@ def list(ctx: click.Context, user: Optional[str], status: Optional[str], details
|
|
|
1787
1882
|
reservations = sorted(reservations, key=sort_key)
|
|
1788
1883
|
|
|
1789
1884
|
# Create table with enhanced columns for queue info
|
|
1885
|
+
# Check if we have cross-region reservations
|
|
1886
|
+
_has_east1 = any(r.get("_region") == "us-east-1" for r in reservations)
|
|
1887
|
+
|
|
1790
1888
|
table = Table(title="GPU Reservations")
|
|
1791
1889
|
table.add_column("ID", style="cyan", no_wrap=True)
|
|
1792
1890
|
table.add_column("User", style="green")
|
|
@@ -1796,6 +1894,8 @@ def list(ctx: click.Context, user: Optional[str], status: Optional[str], details
|
|
|
1796
1894
|
table.add_column("Queue Info", style="cyan")
|
|
1797
1895
|
table.add_column("Created", style="blue")
|
|
1798
1896
|
table.add_column("Expires/ETA", style="red")
|
|
1897
|
+
if _has_east1:
|
|
1898
|
+
table.add_column("Region", style="dim")
|
|
1799
1899
|
if details:
|
|
1800
1900
|
table.add_column("CLI Ver", style="dim", no_wrap=True)
|
|
1801
1901
|
table.add_column("Lambda Ver", style="dim", no_wrap=True)
|
|
@@ -1842,6 +1942,26 @@ def list(ctx: click.Context, user: Optional[str], status: Optional[str], details
|
|
|
1842
1942
|
expires_formatted = f"~{estimated_wait}min"
|
|
1843
1943
|
else:
|
|
1844
1944
|
expires_formatted = "Calculating..."
|
|
1945
|
+
elif res_status in ("expired", "failed", "cancelled"):
|
|
1946
|
+
reason = reservation.get("failure_reason", "")
|
|
1947
|
+
ended = reservation.get("reservation_ended") or reservation.get("expired_at", "")
|
|
1948
|
+
ended_str = ""
|
|
1949
|
+
if ended:
|
|
1950
|
+
try:
|
|
1951
|
+
from datetime import datetime, timezone
|
|
1952
|
+
ended_dt = datetime.fromisoformat(ended.replace("Z", "+00:00"))
|
|
1953
|
+
ended_str = ended_dt.astimezone().strftime("%H:%M")
|
|
1954
|
+
except Exception:
|
|
1955
|
+
pass
|
|
1956
|
+
if "preempted" in reason.lower():
|
|
1957
|
+
expires_formatted = f"Preempted{' @' + ended_str if ended_str else ''}"
|
|
1958
|
+
elif res_status == "cancelled":
|
|
1959
|
+
expires_formatted = f"Cancelled{' @' + ended_str if ended_str else ''}"
|
|
1960
|
+
elif reason:
|
|
1961
|
+
short = reason.split("\n")[0][:20]
|
|
1962
|
+
expires_formatted = short
|
|
1963
|
+
else:
|
|
1964
|
+
expires_formatted = res_status.capitalize()
|
|
1845
1965
|
else:
|
|
1846
1966
|
expires_formatted = "N/A"
|
|
1847
1967
|
|
|
@@ -1979,6 +2099,10 @@ def list(ctx: click.Context, user: Optional[str], status: Optional[str], details
|
|
|
1979
2099
|
row_data.append(
|
|
1980
2100
|
f"[dim]{lambda_version_display}[/dim]" if dim_row else lambda_version_display)
|
|
1981
2101
|
|
|
2102
|
+
if _has_east1:
|
|
2103
|
+
region = reservation.get("_region", "us-east-2")
|
|
2104
|
+
row_data.append("[yellow]east1[/yellow]" if region == "us-east-1" else "prod")
|
|
2105
|
+
|
|
1982
2106
|
table.add_row(*row_data)
|
|
1983
2107
|
|
|
1984
2108
|
except Exception as row_error:
|
|
@@ -2309,12 +2433,10 @@ def cancel(
|
|
|
2309
2433
|
|
|
2310
2434
|
reservation_mgr = ReservationManager(config)
|
|
2311
2435
|
|
|
2312
|
-
# Get cancellable reservations
|
|
2313
|
-
reservations =
|
|
2314
|
-
|
|
2315
|
-
|
|
2316
|
-
"active", "queued", "pending", "preparing"],
|
|
2317
|
-
)
|
|
2436
|
+
# Get cancellable reservations (cross-region)
|
|
2437
|
+
reservations = _fetch_reservations_cross_region(
|
|
2438
|
+
reservation_mgr, user_info["user_id"],
|
|
2439
|
+
["active", "queued", "pending", "preparing"], config)
|
|
2318
2440
|
|
|
2319
2441
|
live.stop()
|
|
2320
2442
|
|
|
@@ -2720,7 +2842,25 @@ def _show_availability() -> None:
|
|
|
2720
2842
|
rprint(f"[red]❌ {str(e)}[/red]")
|
|
2721
2843
|
return
|
|
2722
2844
|
|
|
2723
|
-
#
|
|
2845
|
+
# Cross-region: fetch spot availability from prod-east1
|
|
2846
|
+
spot_region_info = {}
|
|
2847
|
+
_env_name = config.user_config.get("environment", "prod")
|
|
2848
|
+
_east1_spot_types = frozenset(Config.ENVIRONMENTS.get("prod-east1", {}).get("spot_types", []))
|
|
2849
|
+
if _env_name == "prod" and _east1_spot_types:
|
|
2850
|
+
try:
|
|
2851
|
+
import boto3 as _b3
|
|
2852
|
+
east1_r = Config.ENVIRONMENTS["prod-east1"]["region"]
|
|
2853
|
+
for item in _b3.resource("dynamodb", region_name=east1_r).Table("pytorch-gpu-dev-gpu-availability").scan().get("Items", []):
|
|
2854
|
+
gt = item.get("gpu_type", "")
|
|
2855
|
+
if gt in _east1_spot_types:
|
|
2856
|
+
spot_region_info[gt] = {
|
|
2857
|
+
"available": int(item.get("available_gpus", 0)),
|
|
2858
|
+
"total": int(item.get("total_gpus", 0)),
|
|
2859
|
+
"max_reservable": int(item.get("max_reservable", 0)),
|
|
2860
|
+
"spot_info": item.get("spot_info", {}),
|
|
2861
|
+
}
|
|
2862
|
+
except Exception:
|
|
2863
|
+
pass
|
|
2724
2864
|
|
|
2725
2865
|
if availability_info:
|
|
2726
2866
|
# GPU architecture mapping (for display)
|
|
@@ -2762,84 +2902,99 @@ def _show_availability() -> None:
|
|
|
2762
2902
|
"CPU (arm64)": 6,
|
|
2763
2903
|
}
|
|
2764
2904
|
|
|
2765
|
-
#
|
|
2766
|
-
|
|
2767
|
-
|
|
2768
|
-
|
|
2769
|
-
|
|
2770
|
-
|
|
2771
|
-
x[0]
|
|
2772
|
-
|
|
2773
|
-
)
|
|
2774
|
-
|
|
2775
|
-
|
|
2776
|
-
|
|
2777
|
-
|
|
2778
|
-
|
|
2779
|
-
|
|
2780
|
-
|
|
2781
|
-
|
|
2782
|
-
|
|
2783
|
-
|
|
2784
|
-
|
|
2785
|
-
|
|
2786
|
-
|
|
2787
|
-
|
|
2788
|
-
|
|
2789
|
-
|
|
2790
|
-
|
|
2791
|
-
|
|
2792
|
-
|
|
2793
|
-
|
|
2794
|
-
|
|
2795
|
-
|
|
2796
|
-
|
|
2797
|
-
|
|
2798
|
-
|
|
2799
|
-
|
|
2800
|
-
|
|
2801
|
-
|
|
2802
|
-
|
|
2803
|
-
# Format wait time
|
|
2804
|
-
if available > 0:
|
|
2805
|
-
wait_display = "Available now"
|
|
2806
|
-
elif est_wait == 0:
|
|
2807
|
-
wait_display = "Unknown"
|
|
2808
|
-
elif est_wait < 60:
|
|
2809
|
-
wait_display = f"{int(est_wait)}min"
|
|
2810
|
-
else:
|
|
2811
|
-
hours = int(est_wait // 60)
|
|
2812
|
-
minutes = int(est_wait % 60)
|
|
2813
|
-
if minutes == 0:
|
|
2814
|
-
wait_display = f"{hours}h"
|
|
2905
|
+
# Split into categories
|
|
2906
|
+
full_types = {k: v for k, v in availability_info.items() if "mig" not in k}
|
|
2907
|
+
mig_types = {k: v for k, v in availability_info.items() if "mig" in k}
|
|
2908
|
+
|
|
2909
|
+
def _sort_by_arch(items):
|
|
2910
|
+
return sorted(items.items(), key=lambda x: (
|
|
2911
|
+
arch_priority.get(gpu_architectures.get(x[0], "Unknown"), 99), x[0]))
|
|
2912
|
+
|
|
2913
|
+
def _fmt_wait(available, est_wait):
|
|
2914
|
+
if available > 0: return "Available now"
|
|
2915
|
+
if not est_wait: return "Unknown"
|
|
2916
|
+
if est_wait < 60: return f"{int(est_wait)}min"
|
|
2917
|
+
h, m = int(est_wait // 60), int(est_wait % 60)
|
|
2918
|
+
return f"{h}h{f' {m}min' if m else ''}"
|
|
2919
|
+
|
|
2920
|
+
def _build_avail_table(title, items):
|
|
2921
|
+
t = Table(title=title)
|
|
2922
|
+
t.add_column("GPU Type", style="cyan")
|
|
2923
|
+
t.add_column("Avail", style="green")
|
|
2924
|
+
t.add_column("Max\nReservable", style="bright_green")
|
|
2925
|
+
t.add_column("Total", style="blue")
|
|
2926
|
+
t.add_column("Queue\nLength", style="yellow")
|
|
2927
|
+
t.add_column("Architecture", style="dim")
|
|
2928
|
+
t.add_column("Est. Wait Time", style="magenta")
|
|
2929
|
+
for gpu_type, info in _sort_by_arch(items):
|
|
2930
|
+
avail = info.get("available", 0)
|
|
2931
|
+
maint = info.get("maintenance", False)
|
|
2932
|
+
maint_reason = info.get("maintenance_reason", "")
|
|
2933
|
+
fn = info.get("full_nodes_available", 0)
|
|
2934
|
+
if maint:
|
|
2935
|
+
ad = "[red]MAINTENANCE[/red]"
|
|
2936
|
+
wd = maint_reason or "Under maintenance"
|
|
2937
|
+
elif avail == 0:
|
|
2938
|
+
ad = f"[red]{avail}[/red]"
|
|
2939
|
+
wd = _fmt_wait(avail, info.get("estimated_wait_minutes", 0))
|
|
2940
|
+
elif fn > 0:
|
|
2941
|
+
ad = f"[green]{avail}[/green]"
|
|
2942
|
+
wd = _fmt_wait(avail, info.get("estimated_wait_minutes", 0))
|
|
2815
2943
|
else:
|
|
2816
|
-
|
|
2817
|
-
|
|
2818
|
-
|
|
2819
|
-
|
|
2820
|
-
|
|
2821
|
-
|
|
2822
|
-
|
|
2823
|
-
|
|
2824
|
-
|
|
2825
|
-
|
|
2826
|
-
|
|
2827
|
-
|
|
2828
|
-
|
|
2829
|
-
|
|
2830
|
-
|
|
2831
|
-
|
|
2832
|
-
|
|
2833
|
-
|
|
2834
|
-
|
|
2835
|
-
|
|
2836
|
-
|
|
2837
|
-
|
|
2838
|
-
|
|
2839
|
-
|
|
2840
|
-
|
|
2841
|
-
|
|
2842
|
-
|
|
2944
|
+
ad = f"[yellow]{avail}[/yellow]"
|
|
2945
|
+
wd = _fmt_wait(avail, info.get("estimated_wait_minutes", 0))
|
|
2946
|
+
t.add_row(
|
|
2947
|
+
gpu_type.upper(), ad,
|
|
2948
|
+
"-" if maint else str(info.get("max_reservable", 0)),
|
|
2949
|
+
str(info.get("total", 0)),
|
|
2950
|
+
"-" if maint else str(info.get("queue_length", 0)),
|
|
2951
|
+
gpu_architectures.get(gpu_type, "Unknown"), wd)
|
|
2952
|
+
console.print(t)
|
|
2953
|
+
|
|
2954
|
+
# Section 1: Full GPUs & CPUs
|
|
2955
|
+
_build_avail_table("━━━ Full GPUs & CPUs ━━━", full_types)
|
|
2956
|
+
|
|
2957
|
+
# Section 2: MIG Slices
|
|
2958
|
+
if mig_types:
|
|
2959
|
+
rprint("[dim] Sliced GPUs — isolated fractions of a physical GPU, perfect for smaller jobs.[/dim]")
|
|
2960
|
+
_build_avail_table("━━━ 🔬 MIG Slices ━━━", mig_types)
|
|
2961
|
+
|
|
2962
|
+
# Spot section from prod-east1
|
|
2963
|
+
if spot_region_info:
|
|
2964
|
+
# Spot GPU configs for max reservable (what you CAN get per node)
|
|
2965
|
+
spot_gpus_per_node = {
|
|
2966
|
+
"b300": 8, "b200": 8, "h200": 8, "h100": 8, "a100": 8,
|
|
2967
|
+
"t4": 4, "l4": 4,
|
|
2968
|
+
}
|
|
2969
|
+
spot_table = Table(title="⚡ Spot Instances (us-east-1, ~70% cheaper)")
|
|
2970
|
+
spot_table.add_column("GPU Type", style="cyan")
|
|
2971
|
+
spot_table.add_column("Avail\nNow", style="green")
|
|
2972
|
+
spot_table.add_column("Per\nNode", style="bright_green")
|
|
2973
|
+
spot_table.add_column("Status", style="magenta")
|
|
2974
|
+
spot_table.add_column("Availability", style="dim")
|
|
2975
|
+
_on_demand = {"b300": 95, "b200": 95, "h200": 55, "h100": 98, "a100": 32, "t4": 4.5, "l4": 7}
|
|
2976
|
+
for gt, info in sorted(spot_region_info.items()):
|
|
2977
|
+
avail = info.get("available", 0)
|
|
2978
|
+
per_node = spot_gpus_per_node.get(gt, 8)
|
|
2979
|
+
avail_display = f"[green]{avail}[/green]" if avail > 0 else f"[dim]0[/dim]"
|
|
2980
|
+
status = "[green]Node up[/green]" if avail > 0 else "Spins up on reserve (~10 min)"
|
|
2981
|
+
si = info.get("spot_info", {}) or {}
|
|
2982
|
+
sp = si.get("spot_price", "") if isinstance(si, dict) else ""
|
|
2983
|
+
if not sp or (isinstance(si, dict) and "No spot data" in str(si.get("spot_signal", ""))):
|
|
2984
|
+
avail_signal = "[red]Not offered[/red]"
|
|
2985
|
+
else:
|
|
2986
|
+
try:
|
|
2987
|
+
ratio = float(sp) / _on_demand.get(gt, 50)
|
|
2988
|
+
pct = int((1 - ratio) * 100)
|
|
2989
|
+
if ratio < 0.4: avail_signal = f"[green]High ({pct}% off)[/green]"
|
|
2990
|
+
elif ratio < 0.7: avail_signal = f"[yellow]Medium ({pct}% off)[/yellow]"
|
|
2991
|
+
else: avail_signal = f"[red]Low ({pct}% off)[/red]"
|
|
2992
|
+
except (ValueError, TypeError):
|
|
2993
|
+
avail_signal = "[yellow]Unknown[/yellow]"
|
|
2994
|
+
spot_table.add_row(f"{gt.upper()} *", avail_display, str(per_node), status, avail_signal)
|
|
2995
|
+
console.print(spot_table)
|
|
2996
|
+
rprint("[dim]* = spot: ~70% cheaper, AWS can reclaim with 2-min notice, fulfillment not guaranteed.[/dim]")
|
|
2997
|
+
rprint("[dim] Separate cluster (us-east-1) with separate disks. Select via gpu-dev reserve (interactive).[/dim]")
|
|
2843
2998
|
|
|
2844
2999
|
# Show color legend
|
|
2845
3000
|
rprint("\n[bold]Availability legend:[/bold]")
|
|
@@ -2847,7 +3002,7 @@ def _show_availability() -> None:
|
|
|
2847
3002
|
|
|
2848
3003
|
# Show usage tip
|
|
2849
3004
|
rprint(
|
|
2850
|
-
"\n[dim]💡 Use 'gpu-dev reserve
|
|
3005
|
+
"\n[dim]💡 Use 'gpu-dev reserve' (interactive) to see all options including MIG slices and spot instances[/dim]"
|
|
2851
3006
|
)
|
|
2852
3007
|
|
|
2853
3008
|
else:
|
|
@@ -2858,6 +3013,9 @@ def _show_availability() -> None:
|
|
|
2858
3013
|
|
|
2859
3014
|
|
|
2860
3015
|
def _show_availability_watch(interval: int) -> None:
|
|
3016
|
+
_env_name = load_config().user_config.get("environment", "prod")
|
|
3017
|
+
_spot_types = frozenset(Config.ENVIRONMENTS.get(_env_name, {}).get("spot_types", []))
|
|
3018
|
+
|
|
2861
3019
|
"""Watch mode for GPU availability with auto-refresh"""
|
|
2862
3020
|
import time
|
|
2863
3021
|
from datetime import datetime
|
|
@@ -2990,8 +3148,9 @@ def _show_availability_watch(interval: int) -> None:
|
|
|
2990
3148
|
else:
|
|
2991
3149
|
available_display = f"[yellow]{available}[/yellow]"
|
|
2992
3150
|
|
|
3151
|
+
type_label = f"{gpu_type.upper()} *" if gpu_type in _spot_types else gpu_type.upper()
|
|
2993
3152
|
table.add_row(
|
|
2994
|
-
|
|
3153
|
+
type_label,
|
|
2995
3154
|
available_display,
|
|
2996
3155
|
str(max_reservable) if not is_maintenance else "-",
|
|
2997
3156
|
str(total),
|
|
@@ -3079,10 +3238,8 @@ def connect(ctx: click.Context, reservation_id: Optional[str]) -> None:
|
|
|
3079
3238
|
|
|
3080
3239
|
# If no reservation ID provided, show interactive selection
|
|
3081
3240
|
if reservation_id is None:
|
|
3082
|
-
reservations =
|
|
3083
|
-
|
|
3084
|
-
statuses_to_include=["active"]
|
|
3085
|
-
)
|
|
3241
|
+
reservations = _fetch_reservations_cross_region(
|
|
3242
|
+
reservation_mgr, user_info["user_id"], ["active"], config)
|
|
3086
3243
|
|
|
3087
3244
|
live.stop()
|
|
3088
3245
|
|
|
@@ -3109,6 +3266,16 @@ def connect(ctx: click.Context, reservation_id: Optional[str]) -> None:
|
|
|
3109
3266
|
|
|
3110
3267
|
live.start()
|
|
3111
3268
|
|
|
3269
|
+
# If the selected reservation is from east1, switch to east1 reservation_mgr
|
|
3270
|
+
_sel = next((r for r in (locals().get("reservations") or []) if r.get("reservation_id", "").startswith(reservation_id)), None)
|
|
3271
|
+
if _sel and _sel.get("_region") == "us-east-1":
|
|
3272
|
+
import os as _os
|
|
3273
|
+
east1_cfg = Config.ENVIRONMENTS.get("prod-east1", {})
|
|
3274
|
+
_os.environ["AWS_DEFAULT_REGION"] = east1_cfg["region"]
|
|
3275
|
+
_east1_config = Config()
|
|
3276
|
+
_east1_config.aws_region = east1_cfg["region"]
|
|
3277
|
+
reservation_mgr = ReservationManager(_east1_config)
|
|
3278
|
+
|
|
3112
3279
|
# Get connection info
|
|
3113
3280
|
connection_info = reservation_mgr.get_connection_info(
|
|
3114
3281
|
reservation_id, user_info["user_id"]
|
|
@@ -3320,10 +3487,8 @@ def get_ssh_config_cmd(ctx: click.Context, reservation_id: Optional[str]) -> Non
|
|
|
3320
3487
|
|
|
3321
3488
|
# If no reservation ID provided, show interactive selection
|
|
3322
3489
|
if reservation_id is None:
|
|
3323
|
-
reservations =
|
|
3324
|
-
|
|
3325
|
-
statuses_to_include=["active"]
|
|
3326
|
-
)
|
|
3490
|
+
reservations = _fetch_reservations_cross_region(
|
|
3491
|
+
reservation_mgr, user_info["user_id"], ["active"], config)
|
|
3327
3492
|
|
|
3328
3493
|
live.stop()
|
|
3329
3494
|
|
|
@@ -3698,10 +3863,7 @@ def environment(env_name: str) -> None:
|
|
|
3698
3863
|
rprint(f"[dim]Configuration saved to {cfg.CONFIG_FILE}[/dim]")
|
|
3699
3864
|
|
|
3700
3865
|
# Instructions for shell export
|
|
3701
|
-
rprint(f"\n[
|
|
3702
|
-
rprint(f" export AWS_DEFAULT_REGION={env_config['region']}")
|
|
3703
|
-
rprint(f"\n[yellow]💡 Or use the switch-to.sh script:[/yellow]")
|
|
3704
|
-
rprint(f" ./switch-to.sh {env_name}")
|
|
3866
|
+
rprint(f"\n[dim]Region saved. All gpu-dev commands now target {env_config['region']}.[/dim]")
|
|
3705
3867
|
|
|
3706
3868
|
except Exception as e:
|
|
3707
3869
|
rprint(f"[red]❌ Error setting environment: {str(e)}[/red]")
|