PyPI - gpu-dev - Versions diffs - 0.5.32__tar.gz → 0.6.0__tar.gz - Mend

gpu-dev 0.5.32.tar.gz → 0.6.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (127) hide show

{gpu_dev-0.5.32 → gpu_dev-0.6.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: gpu-dev
-Version: 0.5.32
+Version: 0.6.0
 Summary: CLI tool for PyTorch GPU developer server reservations
 Author: PyTorch Team
 Requires-Python: >=3.10

{gpu_dev-0.5.32 → gpu_dev-0.6.0}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: gpu-dev
-Version: 0.5.32
+Version: 0.6.0
 Summary: CLI tool for PyTorch GPU developer server reservations
 Author: PyTorch Team
 Requires-Python: >=3.10

{gpu_dev-0.5.32 → gpu_dev-0.6.0}/cli-tools/gpu-dev-cli/gpu_dev_cli/cli.py RENAMED Viewed

@@ -897,6 +897,13 @@ def reserve(
         else:
             # Non-interactive mode - use defaults and validate
+            # Route --spot to east1 when on prod (env vars override config region)
+            if spot and load_config().user_config.get("environment") == "prod":
+                east1_cfg = Config.ENVIRONMENTS.get("prod-east1", {})
+                if east1_cfg:
+                    import os as _os
+                    _os.environ["AWS_REGION"] = east1_cfg["region"]
             if gpu_type is None:
                 gpu_type = "a100"
             if hours is None:
@@ -2568,10 +2575,21 @@ def cancel(
             with Live(
                 Spinner("dots", text="📡 Cancelling reservations..."), console=console
             ) as live:
+                # Build east1 reservation manager for cross-region cancellations
+                east1_mgr = None
+                east1_env = Config.ENVIRONMENTS.get("prod-east1", {})
+                if east1_env:
+                    import os as _os
+                    _east1_config = Config()
+                    _east1_config.aws_region = east1_env["region"]
+                    east1_mgr = ReservationManager(_east1_config)
                 for reservation in reservations:
                     res_id = reservation.get("reservation_id", "")
                     if res_id:
-                        success = reservation_mgr.cancel_reservation(
+                        # Use east1 manager for east1 reservations
+                        mgr = east1_mgr if reservation.get("_region") in ("east1", "us-east-1") and east1_mgr else reservation_mgr
+                        success = mgr.cancel_reservation(
                             res_id, user_info["user_id"]
                         )
                         if success:
@@ -3301,21 +3319,30 @@ def connect(ctx: click.Context, reservation_id: Optional[str]) -> None:
                 live.start()
-            # If the selected reservation is from east1, switch to east1 reservation_mgr
-            _sel = next((r for r in (locals().get("reservations") or []) if r.get("reservation_id", "").startswith(reservation_id)), None)
-            if _sel and _sel.get("_region") == "us-east-1":
-                import os as _os
-                east1_cfg = Config.ENVIRONMENTS.get("prod-east1", {})
-                _os.environ["AWS_DEFAULT_REGION"] = east1_cfg["region"]
-                _east1_config = Config()
-                _east1_config.aws_region = east1_cfg["region"]
-                reservation_mgr = ReservationManager(_east1_config)
-            # Get connection info
+            # Try current region first, then cross-region if not found
             connection_info = reservation_mgr.get_connection_info(
                 reservation_id, user_info["user_id"]
             )
+            # If not found, try the other region
+            if not connection_info:
+                import os as _os
+                current_env = config.user_config.get("environment", "prod")
+                other_envs = {"prod": "prod-east1", "prod-east1": "prod"}
+                other_env_name = other_envs.get(current_env)
+                if other_env_name:
+                    other_env = Config.ENVIRONMENTS.get(other_env_name, {})
+                    if other_env:
+                        _os.environ["AWS_DEFAULT_REGION"] = other_env["region"]
+                        _other_config = Config()
+                        _other_config.aws_region = other_env["region"]
+                        other_mgr = ReservationManager(_other_config)
+                        connection_info = other_mgr.get_connection_info(
+                            reservation_id, user_info["user_id"]
+                        )
+                        if connection_info:
+                            reservation_mgr = other_mgr
         live.stop()
         if not connection_info:
@@ -3864,7 +3891,7 @@ def set(key: str, value: str) -> None:
 @config.command()
-@click.argument("env_name", type=click.Choice(["test", "prod", "prod-east1"]))
+@click.argument("env_name", type=click.Choice(["test", "prod"]))
 def environment(env_name: str) -> None:
     """Set the environment
@@ -3876,7 +3903,7 @@ def environment(env_name: str) -> None:
     \b
     Examples:
         gpu-dev config environment prod        # Production (us-east-2)
-        gpu-dev config environment prod-east1  # Spot-only us-east-1
+        gpu-dev config environment prod         # Production (spot accessible via interactive picker)
         gpu-dev config environment test        # Test (us-west-1)
     Environment configurations:

{gpu_dev-0.5.32 → gpu_dev-0.6.0}/cli-tools/gpu-dev-cli/gpu_dev_cli/config.py RENAMED Viewed

@@ -42,13 +42,14 @@ class Config:
         # Load unified config (handles migration from legacy files)
         self.user_config = self._load_config()
-        # Get region from config, then AWS env vars, or default
-        if self.user_config.get("region"):
+        # Get region: env vars take priority (for spot routing), then config, then default
+        env_region = os.getenv("AWS_REGION") or os.getenv("AWS_DEFAULT_REGION")
+        if env_region and env_region != self.user_config.get("region"):
+            self.aws_region = env_region
+        elif self.user_config.get("region"):
             self.aws_region = self.user_config["region"]
         else:
-            self.aws_region = os.getenv(
-                "AWS_REGION", os.getenv("AWS_DEFAULT_REGION", "us-east-2")
-            )
+            self.aws_region = "us-east-2"
         os.environ["AWS_DEFAULT_REGION"] = self.aws_region

{gpu_dev-0.5.32 → gpu_dev-0.6.0}/cli-tools/gpu-dev-cli/gpu_dev_cli/disks.py RENAMED Viewed

@@ -355,8 +355,21 @@ def unlock_disk(disk_name: str, user_id: str, config: Config) -> bool:
         return False
     if not disk['in_use']:
-        print(f"Disk '{disk_name}' is not locked")
-        return False
+        # DDB says not locked — but check if EBS volume is still physically attached
+        try:
+            ec2 = config.session.client('ec2', region_name=config.aws_region)
+            vols = ec2.describe_volumes(Filters=[
+                {"Name": "tag:gpu-dev-user", "Values": [user_id]},
+                {"Name": "tag:disk_name", "Values": [disk_name]},
+                {"Name": "status", "Values": ["in-use"]},
+            ]).get("Volumes", [])
+            if not vols:
+                print(f"Disk '{disk_name}' is not locked")
+                return False
+            print(f"Disk '{disk_name}' DDB lock is clear but EBS volume is still attached — sending force-detach request")
+        except Exception:
+            print(f"Disk '{disk_name}' is not locked")
+            return False
     operation_id = str(uuid.uuid4())

{gpu_dev-0.5.32 → gpu_dev-0.6.0}/cli-tools/gpu-dev-cli/gpu_dev_cli/reservations.py RENAMED Viewed

@@ -1701,6 +1701,7 @@ class ReservationManager:
                 initial_text = f"📡 Starting multinode reservation..." if is_multinode else "🔄 Sending reservation request..."
                 spinner = Spinner("dots", text=initial_text)
                 live.update(spinner)
+                poll_delay = 0.5  # start fast, back off over time
                 while (
                     (timeout_seconds is None or time.time() -
@@ -1761,7 +1762,7 @@ class ReservationManager:
                                     if not is_multinode:
                                         spinner.text = "📡 Waiting for reservation status update..."
                                         live.update(spinner)
-                                        time.sleep(2)
+                                        time.sleep(0.5)
                                         continue
                                     else:
                                         node_details.append({
@@ -2293,8 +2294,9 @@ class ReservationManager:
                             return None
-                        # Continue polling
-                        time.sleep(3)
+                        # Poll with backoff: 0.5s → 1s → 1.5s → 2s → 3s (cap)
+                        time.sleep(poll_delay)
+                        poll_delay = min(poll_delay + 0.5, 3.0)
                     except Exception as e:
                         console.print(

{gpu_dev-0.5.32 → gpu_dev-0.6.0}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "gpu-dev"
-version = "0.5.32"
+version = "0.6.0"
 description = "CLI tool for PyTorch GPU developer server reservations"
 authors = [{name = "PyTorch Team"}]
 readme = "cli-tools/gpu-dev-cli/README.md"

{gpu_dev-0.5.32 → gpu_dev-0.6.0}/terraform-gpu-devservers/lambda/reservation_processor/index.py RENAMED Viewed

@@ -195,7 +195,7 @@ GPU_CONFIG = {
     "b300": {"instance_type": "p6-b300.48xlarge", "max_gpus": 8, "cpus": 192, "memory_gb": 2048, "efa_count": 8},
     "cpu-arm": {"instance_type": "c7g.8xlarge", "max_gpus": 0, "cpus": 32, "memory_gb": 64, "efa_count": 0},
     "cpu-x86": {"instance_type": "c7i.8xlarge", "max_gpus": 0, "cpus": 32, "memory_gb": 64, "efa_count": 0},
-    "cpu-spot": {"instance_type": "c7i.2xlarge", "max_gpus": 0, "cpus": 8, "memory_gb": 16, "efa_count": 0},
+    "cpu-spot": {"instance_type": "c6id.2xlarge", "max_gpus": 0, "cpus": 8, "memory_gb": 16, "efa_count": 0},
 }
 GPU_CONFIG_DEFAULT = {"instance_type": "g4dn.12xlarge", "max_gpus": 4, "cpus": 48, "memory_gb": 192, "efa_count": 0}
@@ -2843,16 +2843,7 @@ def allocate_gpu_resources(reservation_id: str, request: dict[str, Any], trace_d
         elif dockerimage:
             logger.info(f"Custom Docker image specified: {dockerimage}")
-        record_trace_event(trace_data, "github_keys_fetch_start")
-        github_public_key = get_github_public_key(github_user, validate=True)
-        record_trace_event(trace_data, "github_keys_fetch_end")
-        if not github_public_key:
-            raise ValueError(
-                f"Could not fetch GitHub public key for GitHub user '{github_user}'"
-            )
-        # Check if user should get persistent disk
-        # Check if user explicitly requested no persistent disk (e.g., confirmed continuing without disk when another reservation has it)
+        # ── Determine disk eligibility (quick, no I/O) ──
         no_persistent_disk_requested = request.get("no_persistent_disk", False)
         if no_persistent_disk_requested:
@@ -2895,119 +2886,93 @@ def allocate_gpu_resources(reservation_id: str, request: dict[str, Any], trace_d
                 logger.error(f"Failed to reserve persistent disk slot: {e}")
                 use_persistent_disk = False
-        if use_persistent_disk:
+        # ── Run SSH key fetch, disk setup, and EFS setup in parallel ──
+        # These are independent I/O operations that together take ~8s sequentially
+        def _fetch_ssh_keys():
+            record_trace_event(trace_data, "github_keys_fetch_start")
+            keys = get_github_public_key(github_user, validate=True)
+            record_trace_event(trace_data, "github_keys_fetch_end")
+            return keys
+        def _setup_disk():
+            if not use_persistent_disk:
+                return None, True, None, None, None
+            update_reservation_status(
+                reservation_id, "preparing",
+                detailed_status="Setting up persistent disk" + (f" '{disk_name}'" if disk_name else ""))
+            _target_az, _target_node = get_target_az_for_reservation(gpu_type, gpu_count)
+            if not _target_az:
+                raise ValueError(f"No {gpu_type} nodes found in cluster")
+            logger.info(f"Target AZ: {_target_az}, disk_name={disk_name or 'default'}")
+            record_trace_event(trace_data, "disk_create_start")
+            vol_id, new_disk, warning = create_disk_from_snapshot_or_empty(
+                user_id=user_id, availability_zone=_target_az,
+                disk_name=disk_name, reservation_id=reservation_id)
+            record_trace_event(trace_data, "disk_create_end")
+            return vol_id, new_disk, warning, _target_az, _target_node
+        def _setup_efs():
+            if not (EFS_SECURITY_GROUP_ID and EFS_SUBNET_IDS):
+                return None
+            update_reservation_status(
+                reservation_id, "preparing",
+                "Setting up shared storage (/shared) for user collaboration")
+            record_trace_event(trace_data, "efs_setup_start")
+            efs_id = create_or_find_user_efs(user_id)
+            record_trace_event(trace_data, "efs_setup_end")
+            return efs_id
+        with ThreadPoolExecutor(max_workers=3) as executor:
+            ssh_future = executor.submit(_fetch_ssh_keys)
+            disk_future = executor.submit(_setup_disk)
+            efs_future = executor.submit(_setup_efs)
+            github_public_key = ssh_future.result()
             try:
-                # NEW snapshot-first workflow (replaces old migration logic below)
-                # Always recreate volume from latest snapshot or create empty
-                update_reservation_status(
-                    reservation_id,
-                    "preparing",
-                    detailed_status="Setting up persistent disk" + (f" '{disk_name}'" if disk_name else "")
-                )
-                # Determine target AZ + node for this reservation (binpacking)
-                target_az, target_node = get_target_az_for_reservation(gpu_type, gpu_count)
-                if not target_az:
-                    raise ValueError(f"No {gpu_type} nodes found in cluster")
-                logger.info(f"Target AZ for reservation: {target_az}")
-                logger.info(f"Creating persistent disk for user {user_id}, disk_name={disk_name or 'default'}")
-                # Use new snapshot-first function
-                record_trace_event(trace_data, "disk_create_start")
-                persistent_volume_id, is_new_disk, disk_warning = create_disk_from_snapshot_or_empty(
-                    user_id=user_id,
-                    availability_zone=target_az,
-                    disk_name=disk_name,
-                    reservation_id=reservation_id
-                )
-                record_trace_event(trace_data, "disk_create_end")
-                logger.info(f"Persistent disk ready: {persistent_volume_id} (is_new={is_new_disk})")
-                # Mark disk as in_use in disks table (prevents CLI from showing as available)
-                # Use "default" as fallback when no explicit disk_name provided
-                effective_disk_name = disk_name or "default"
-                try:
-                    mark_disk_in_use(user_id, effective_disk_name, True, reservation_id)
-                    logger.info(f"Marked disk '{effective_disk_name}' as in_use for reservation {reservation_id[:8]}")
-                except Exception as mark_error:
-                    logger.warning(f"Failed to mark disk as in_use: {mark_error}")
-                # Store disk_name in DynamoDB for tracking (ALWAYS store, using "default" as fallback)
-                # This is required for expiry cleanup to know which disk to mark as not in use
-                update_reservation_fields(reservation_id, disk_name=effective_disk_name)
-                # Store warning if any
-                if disk_warning:
-                    update_reservation_fields(reservation_id, warning=disk_warning)
-                    logger.warning(f"Stored warning for reservation {reservation_id}: {disk_warning}")
+                disk_result = disk_future.result()
             except Exception as disk_error:
                 logger.error(f"Failed to set up persistent disk: {disk_error}")
                 error_msg = str(disk_error)
-                # If user explicitly requested a named disk, NEVER silently fall back to temporary.
-                # Any disk error (in use, timeout, creation failure) should fail the reservation
-                # so the user knows what happened instead of getting surprise temporary storage.
                 if disk_name:
-                    logger.error(f"Named disk '{disk_name}' was explicitly requested but setup failed - failing reservation")
-                    update_reservation_status(
-                        reservation_id,
-                        "failed",
-                        failure_reason=f"Persistent disk '{disk_name}' setup failed: {error_msg}"
-                    )
+                    logger.error(f"Named disk '{disk_name}' setup failed - failing reservation")
+                    update_reservation_status(reservation_id, "failed",
+                        failure_reason=f"Persistent disk '{disk_name}' setup failed: {error_msg}")
                     raise RuntimeError(f"Cannot create reservation: disk '{disk_name}' setup failed: {error_msg}")
-                # Check if this is a "disk in use" error - these should fail the reservation
                 if "in use" in error_msg.lower():
-                    # Don't fall back - fail the reservation with clear error
-                    update_reservation_status(
-                        reservation_id,
-                        "failed",
-                        failure_reason=error_msg
-                    )
+                    update_reservation_status(reservation_id, "failed", failure_reason=error_msg)
                     raise RuntimeError(f"Cannot create reservation: {error_msg}")
-                # For other errors without explicit disk_name, continue without persistent disk (backwards compatibility)
-                logger.warning(f"Falling back to non-persistent storage due to disk error: {disk_error}")
+                logger.warning(f"Falling back to non-persistent storage: {disk_error}")
                 use_persistent_disk = False
-                persistent_volume_id = None  # Clear any volume that was set before the error
-                is_new_disk = True  # EmptyDir volume will need shell environment setup
-                update_reservation_status(
-                    reservation_id,
-                    "preparing",
-                    "Persistent disk setup failed - continuing without persistent storage",
-                )
-        else:
-            logger.info(
-                f"User {user_id} has existing reservations - no persistent disk")
-            # Non-persistent reservations always need shell environment setup
+                persistent_volume_id = None
+                is_new_disk = True
+                disk_result = None
+                update_reservation_status(reservation_id, "preparing",
+                    "Persistent disk setup failed - continuing without persistent storage")
+            try:
+                efs_filesystem_id = efs_future.result()
+            except Exception as efs_error:
+                logger.error(f"Failed to set up EFS: {efs_error}")
+                efs_filesystem_id = None
+        if not github_public_key:
+            raise ValueError(f"Could not fetch GitHub public key for GitHub user '{github_user}'")
+        if use_persistent_disk and disk_result:
+            persistent_volume_id, is_new_disk, disk_warning, target_az, target_node = disk_result
+            logger.info(f"Persistent disk ready: {persistent_volume_id} (is_new={is_new_disk})")
+            effective_disk_name = disk_name or "default"
+            try:
+                mark_disk_in_use(user_id, effective_disk_name, True, reservation_id)
+            except Exception as mark_error:
+                logger.warning(f"Failed to mark disk as in_use: {mark_error}")
+            update_reservation_fields(reservation_id, disk_name=effective_disk_name)
+            if disk_warning:
+                update_reservation_fields(reservation_id, warning=disk_warning)
+        elif not use_persistent_disk:
             is_new_disk = True
-            logger.info(
-                "Non-persistent reservation - will always set up shell environment (CREATE_SH_ENV=true)")
-        # Set up shared EFS storage for user
-        efs_filesystem_id = None
-        try:
-            if EFS_SECURITY_GROUP_ID and EFS_SUBNET_IDS:
-                update_reservation_status(
-                    reservation_id,
-                    "preparing",
-                    "Setting up shared storage (/shared) for user collaboration",
-                )
-                record_trace_event(trace_data, "efs_setup_start")
-                efs_filesystem_id = create_or_find_user_efs(user_id)
-                record_trace_event(trace_data, "efs_setup_end")
-                logger.info(
-                    f"EFS filesystem {efs_filesystem_id} ready for user {user_id}")
-            else:
-                logger.warning(
-                    "EFS configuration missing - skipping shared storage setup")
-        except Exception as efs_error:
-            logger.error(f"Failed to set up EFS: {efs_error}")
-            # Continue without EFS rather than failing
-            efs_filesystem_id = None
+        if efs_filesystem_id:
+            logger.info(f"EFS filesystem {efs_filesystem_id} ready for user {user_id}")
         # Update status: Creating Kubernetes resources
         disk_status = "with persistent disk" if use_persistent_disk else "without persistent disk"
@@ -3149,30 +3114,29 @@ def allocate_gpu_resources(reservation_id: str, request: dict[str, Any], trace_d
         try:
             v1 = client.CoreV1Api(k8s_client)
-            # Try multiple times to find SSH daemon in logs (custom images may take longer)
-            # Default image has openssh-server pre-installed so SSH starts in ~2-5s
-            # Custom/minimal images may need apt-get install which takes longer
-            # 60 retries * 3s = 180 seconds total (3 minutes) - same max but much faster detection
-            max_retries = 60
-            retry_delay = 3  # seconds between retries
+            # Poll for SSH daemon: 100ms for first 8s, then backoff to 5s
+            # Default image starts SSH in ~2-5s, so rapid polling catches it instantly
+            # Custom images may take longer, backoff keeps API load reasonable
+            max_attempts = 60
+            elapsed = 0.0
-            for attempt in range(max_retries):
+            for attempt in range(max_attempts):
                 logs = v1.read_namespaced_pod_log(
                     name=pod_name, namespace="gpu-dev", container="gpu-dev", tail_lines=100
                 )
                 if "SSH daemon starting on port 22" in logs or "Server listening on" in logs:
                     logger.info(
-                        f"SSH daemon confirmed running in pod logs for {pod_name} (attempt {attempt + 1})")
+                        f"SSH daemon confirmed running in pod logs for {pod_name} (attempt {attempt + 1}, {elapsed:.1f}s elapsed)")
                     ssh_ready = True
                     break
                 else:
-                    if attempt < max_retries - 1:
-                        logger.info(
-                            f"SSH daemon not yet started, waiting {retry_delay}s (attempt {attempt + 1}/{max_retries})")
-                        time.sleep(retry_delay)
+                    if attempt < max_attempts - 1:
+                        delay = 0.1 if elapsed < 8.0 else min(1.0 + (elapsed - 8.0) * 0.3, 5.0)
+                        time.sleep(delay)
+                        elapsed += delay
                     else:
                         logger.warning(
-                            f"SSH daemon not detected after {max_retries} attempts, logs preview: {logs[-200:]}")
+                            f"SSH daemon not detected after {max_attempts} attempts, logs preview: {logs[-200:]}")
         except Exception as e:
             logger.warning(f"Could not check SSH daemon logs: {e}")
             # Assume ready if pod is running (NLB will handle routing)
@@ -3514,32 +3478,52 @@ def update_reservation_fields(reservation_id: str, **fields) -> None:
         logger.error(f"Error updating reservation fields: {str(e)}")
+_ssh_key_cache = {}
+_SSH_KEY_CACHE_TTL = 7 * 24 * 3600  # 7 days — keys rarely change, pods fetch live keys anyway
 def get_github_public_key(github_username: str, validate: bool = True) -> str:
-    """Fetch GitHub public keys for user (all keys)
+    """Fetch GitHub public keys for user, cached in-memory and DynamoDB."""
+    import urllib.request
-    Args:
-        github_username: GitHub username to fetch keys for
-        validate: If True, validate and filter keys to only include valid SSH key formats
+    username_lower = github_username.lower()
-    Returns:
-        String containing SSH keys (one per line) or None if no keys found
-    """
+    # In-memory cache (survives across warm Lambda invocations)
+    cached = _ssh_key_cache.get(username_lower)
+    if cached and time.time() - cached["ts"] < _SSH_KEY_CACHE_TTL:
+        logger.info(f"SSH keys for {github_username} from memory cache")
+        return cached["keys"]
+    # DynamoDB cache (survives cold starts)
     try:
-        import urllib.request
+        resp = reservations_table.get_item(
+            Key={"reservation_id": f"ssh-key-cache-{username_lower}"},
+            ProjectionExpression="ssh_keys, cached_at",
+        )
+        if "Item" in resp:
+            item = resp["Item"]
+            cached_at = float(item.get("cached_at", 0))
+            if time.time() - cached_at < _SSH_KEY_CACHE_TTL:
+                keys = item["ssh_keys"]
+                _ssh_key_cache[username_lower] = {"keys": keys, "ts": cached_at}
+                logger.info(f"SSH keys for {github_username} from DynamoDB cache")
+                return keys
+    except Exception as e:
+        logger.warning(f"DynamoDB SSH key cache read failed: {e}")
+    # Cache miss — fetch from GitHub
+    try:
         url = f"https://github.com/{github_username}.keys"
         logger.info(f"Fetching SSH keys for {github_username} from {url}")
-        with urllib.request.urlopen(url) as response:
+        with urllib.request.urlopen(url, timeout=10) as response:
             keys_data = response.read().decode("utf-8").strip()
         if not keys_data:
-            logger.error(
-                f"No public SSH keys found for GitHub user {github_username}")
+            logger.error(f"No public SSH keys found for GitHub user {github_username}")
             return None
         if validate:
-            # Validate keys format (basic check for ssh-rsa/ssh-ed25519/ssh-ecdsa)
             valid_keys = []
             for line in keys_data.split("\n"):
                 line = line.strip()
@@ -3549,22 +3533,31 @@ def get_github_public_key(github_username: str, validate: bool = True) -> str:
                     or line.startswith("ssh-ecdsa")
                 ):
                     valid_keys.append(line)
             if not valid_keys:
-                logger.error(
-                    f"No valid SSH keys found for GitHub user {github_username}"
-                )
+                logger.error(f"No valid SSH keys found for GitHub user {github_username}")
                 return None
+            keys_data = "\n".join(valid_keys)
-            logger.info(
-                f"Found {len(valid_keys)} valid SSH keys for {github_username}")
-            return "\n".join(valid_keys)
-        else:
-            return keys_data
+        logger.info(f"Found {len(keys_data.splitlines())} valid SSH keys for {github_username}")
+        # Store in both caches
+        now = time.time()
+        _ssh_key_cache[username_lower] = {"keys": keys_data, "ts": now}
+        try:
+            reservations_table.put_item(Item={
+                "reservation_id": f"ssh-key-cache-{username_lower}",
+                "ssh_keys": keys_data,
+                "cached_at": str(now),
+                "github_user": github_username,
+                "status": "cache",
+            })
+        except Exception as e:
+            logger.warning(f"DynamoDB SSH key cache write failed: {e}")
+        return keys_data
     except Exception as e:
-        logger.error(
-            f"Error fetching GitHub key for {github_username}: {str(e)}")
+        logger.error(f"Error fetching GitHub key for {github_username}: {str(e)}")
         return None
@@ -4145,7 +4138,6 @@ def create_pod(
         # Determine container image to use based on architecture
         if gpu_type.startswith("cpu-arm"):
-            # Use Python base image for ARM64 CPU instances with PyTorch installed via pip
             container_image = "python:3.11-slim"  # Multi-arch image with ARM64 support
         else:
             container_image = GPU_DEV_CONTAINER_IMAGE  # Default x86_64 PyTorch image
@@ -6678,7 +6670,7 @@ def get_instance_type_and_gpu_info(k8s_client, pod_name: str) -> tuple[str, str]
             "p5en.48xlarge": "H200",
             "p6-b200.48xlarge": "B200",
             "p6-b300.48xlarge": "B300",
-            "c7i.2xlarge": "cpu-spot",
+            "c6id.2xlarge": "cpu-spot",
         }
         gpu_type = gpu_type_mapping.get(instance_type, "Unknown")
@@ -7809,6 +7801,14 @@ def process_scheduled_queue_management():
                     except Exception:
                         cpu_spot_ready = False
                 if type_available_gpus >= requested_gpus and max_single_node >= requested_gpus and cpu_spot_ready:
+                    # Re-check DDB status before allocating — another invocation may have already started
+                    _current = reservations_table.get_item(Key={"reservation_id": reservation_id}).get("Item", {})
+                    _cur_status = _current.get("status", "queued")
+                    if _cur_status != "queued":
+                        logger.info(f"Reservation {reservation_id} already {_cur_status} (race avoided), skipping allocation")
+                        processed_count += 1
+                        continue
                     logger.info(
                         f"Allocating {requested_gpus} {gpu_type.upper()} GPUs for reservation {reservation_id} - {type_available_gpus} available"
                     )
@@ -8092,6 +8092,17 @@ def process_cancellation_request(record: dict[str, Any]) -> bool:
                         logger.info(
                             f"Cleaned up pod resources for cancelled reservation {full_reservation_id}")
+                        # Force-detach EBS volume (CSI driver sometimes leaves it attached)
+                        ebs_vol = reservation.get("ebs_volume_id")
+                        if ebs_vol:
+                            try:
+                                vol_state = ec2_client.describe_volumes(VolumeIds=[ebs_vol])["Volumes"][0]["State"]
+                                if vol_state == "in-use":
+                                    logger.info(f"Force-detaching orphaned volume {ebs_vol}")
+                                    ec2_client.detach_volume(VolumeId=ebs_vol, Force=True)
+                            except Exception as detach_err:
+                                logger.warning(f"Volume detach failed for {ebs_vol}: {detach_err}")
                     except Exception as cleanup_error:
                         logger.error(
                             f"Error cleaning up pod {pod_name}: {cleanup_error}")
@@ -8736,6 +8747,24 @@ def process_clear_disk_lock_action(record: dict[str, Any]) -> bool:
     mark_disk_in_use(user_id, disk_name, False)
     logger.info(f"Cleared stale lock on disk '{disk_name}' for user '{user_id}'")
+    # Also force-detach any orphaned EBS volumes for this disk
+    try:
+        volumes = ec2_client.describe_volumes(
+            Filters=[
+                {"Name": "tag:gpu-dev-user", "Values": [user_id]},
+                {"Name": "tag:disk_name", "Values": [disk_name]},
+                {"Name": "status", "Values": ["in-use"]},
+            ]
+        ).get("Volumes", [])
+        for vol in volumes:
+            vol_id = vol["VolumeId"]
+            logger.info(f"Force-detaching orphaned volume {vol_id} for disk '{disk_name}'")
+            ec2_client.detach_volume(VolumeId=vol_id, Force=True)
+            logger.info(f"Detached volume {vol_id}")
+    except Exception as detach_err:
+        logger.warning(f"Failed to detach orphaned volumes for disk '{disk_name}': {detach_err}")
     return True

gpu-dev 0.5.32__tar.gz → 0.6.0__tar.gz

gpu-dev 0.5.32.tar.gz → 0.6.0.tar.gz