PyPI - matrice-compute - Versions diffs - 0.1.20__tar.gz → 0.1.21__tar.gz - Mend

matrice-compute 0.1.20.tar.gz → 0.1.21.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (24) hide show

{matrice_compute-0.1.20 → matrice_compute-0.1.21}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: matrice_compute
-Version: 0.1.20
+Version: 0.1.21
 Summary: Common server utilities for Matrice.ai services
 Author-email: "Matrice.ai" <dipendra@matrice.ai>
 License-Expression: MIT

{matrice_compute-0.1.20 → matrice_compute-0.1.21}/matrice_compute.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: matrice_compute
-Version: 0.1.20
+Version: 0.1.21
 Summary: Common server utilities for Matrice.ai services
 Author-email: "Matrice.ai" <dipendra@matrice.ai>
 License-Expression: MIT

{matrice_compute-0.1.20 → matrice_compute-0.1.21}/src/matrice_compute/action_instance.py RENAMED Viewed

@@ -268,17 +268,68 @@ class ActionInstance:
         Returns:
             str: GPU configuration string
         """
-        if not action_details["actionDetails"].get("gpuRequired", False):
+        action_id = action_details.get("_id", "unknown")
+        # Check if GPU is required
+        gpu_required = action_details["actionDetails"].get("gpuRequired", False)
+        if not gpu_required:
+            logging.info(
+                "Action %s does not require GPU - will run on CPU",
+                action_id
+            )
             return ""
-        gpu_indices = get_gpu_with_sufficient_memory_for_action(
-            action_details=action_details
+        # Get required GPU memory for logging
+        required_memory = action_details.get("actionDetails", {}).get(
+            "expectedResources", {}
+        ).get("gpuMemory", 0)
+        logging.info(
+            "Action %s requires GPU with %d MB memory - selecting best-fit GPU(s)",
+            action_id,
+            required_memory
         )
-        if gpu_indices:
-            gpu_str = ",".join(map(str, gpu_indices))
-            logging.info("Using GPUs: %s", gpu_str)
-            return f'--gpus "device={gpu_str}"'
-        logging.info("No GPUs with sufficient memory found.")
-        return ""
+        try:
+            # Get the best-fit GPU(s) with sufficient memory
+            gpu_indices = get_gpu_with_sufficient_memory_for_action(
+                action_details=action_details
+            )
+            if gpu_indices:
+                gpu_str = ",".join(map(str, gpu_indices))
+                logging.info(
+                    "Action %s: Selected GPU device(s): %s (required memory: %d MB)",
+                    action_id,
+                    gpu_str,
+                    required_memory
+                )
+                # Return Docker GPU configuration
+                # Format: --gpus "device=0" or --gpus "device=0,1,2"
+                return f'--gpus "device={gpu_str}"'
+            else:
+                logging.warning(
+                    "Action %s: No GPUs with sufficient memory found (required: %d MB)",
+                    action_id,
+                    required_memory
+                )
+                return ""
+        except ValueError as e:
+            logging.error(
+                "Action %s: Error selecting GPU - %s",
+                action_id,
+                str(e)
+            )
+            return ""
+        except Exception as e:
+            logging.error(
+                "Action %s: Unexpected error in GPU selection - %s",
+                action_id,
+                str(e)
+            )
+            return ""
     @log_errors(default_return="", raise_exception=False)
     def get_base_docker_cmd(
@@ -1410,13 +1461,17 @@ def model_deploy_execute(self: ActionInstance):
         model_family=model_family,
         action_id=action_id,
     )
+    # Get GPU configuration based on requirements and availability
+    # This uses the best-fit algorithm to select the most appropriate GPU(s)
     use_gpu = self.get_gpu_config(action_details)
-    gpuRequired = action_details["actionDetails"]["gpuRequired"]
-    if gpuRequired==False:
-        use_gpu = ""
-    else:
-        use_gpu = "--runtime=nvidia"
+    logging.info(
+        "Action %s: Model deployment GPU config: %s",
+        action_id,
+        use_gpu if use_gpu else "CPU-only"
+    )
     extra_env_vars = {"INTERNAL_PORT": internal_port}
     cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, mount_docker_sock=True, action_id=action_id, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_inference", "matrice_analytics"])} python3 deploy.py {self.action_record_id} {external_port}"'
     logging.info("cmd is: %s", cmd)

{matrice_compute-0.1.20 → matrice_compute-0.1.21}/src/matrice_compute/instance_utils.py RENAMED Viewed

@@ -589,19 +589,42 @@ def get_required_gpu_memory(action_details: dict) -> int:
 @log_errors(default_return=True, raise_exception=False)
 def is_allowed_gpu_device(gpu_index: int) -> bool:
-    """Check if GPU device is allowed.
+    """Check if GPU device is allowed based on GPUS environment variable.
+    The GPUS environment variable can be used to restrict which GPU devices
+    are available for allocation (e.g., GPUS="0,2" allows only GPU 0 and 2).
     Args:
         gpu_index (int): GPU device index
     Returns:
-        bool: True if GPU is allowed
+        bool: True if GPU is allowed (or no filter is set), False otherwise
     """
     gpus = os.environ.get("GPUS")
     if not gpus:
+        # No filter set - all GPUs are allowed
+        return True
+    try:
+        allowed_gpus = [int(x) for x in gpus.split(",") if x.strip()]
+        is_allowed = int(gpu_index) in allowed_gpus
+        if not is_allowed:
+            logging.debug(
+                "GPU %d is not in allowed GPU list: %s",
+                gpu_index,
+                allowed_gpus
+            )
+        return is_allowed
+    except ValueError as e:
+        logging.warning(
+            "Invalid GPUS environment variable format '%s': %s. Allowing all GPUs.",
+            gpus,
+            e
+        )
         return True
-    allowed_gpus = [int(x) for x in gpus.split(",") if x.strip()]
-    return int(gpu_index) in allowed_gpus
 @log_errors(raise_exception=True, log_error=False)
@@ -620,7 +643,15 @@ def get_gpu_with_sufficient_memory_for_action(
     Raises:
         ValueError: If insufficient GPU memory
     """
+    action_id = action_details.get("_id", "unknown")
     required_gpu_memory = get_required_gpu_memory(action_details)
+    logging.info(
+        "Action %s: Searching for GPU(s) with %d MB available memory",
+        action_id,
+        required_gpu_memory
+    )
     command = ["nvidia-smi", "--query-gpu=memory.free", "--format=csv"]
     try:
         result = subprocess.run(
@@ -631,47 +662,137 @@ def get_gpu_with_sufficient_memory_for_action(
             check=False,
         )
         if result.returncode != 0:
+            error_msg = f"nvidia-smi command failed with return code {result.returncode}"
+            logging.error("Action %s: %s", action_id, error_msg)
             raise ValueError("Failed to get GPU information - nvidia-smi command failed")
         memory_free_info = result.stdout.decode("ascii").strip().split("\n")
     except subprocess.TimeoutExpired:
-        logging.warning("nvidia-smi command timed out after 5 seconds in get_gpu_with_sufficient_memory_for_action")
+        logging.error(
+            "Action %s: nvidia-smi command timed out after 5 seconds",
+            action_id
+        )
         raise ValueError("Failed to get GPU information - nvidia-smi timed out")
     except FileNotFoundError:
+        logging.error(
+            "Action %s: nvidia-smi not found on this system",
+            action_id
+        )
         raise ValueError("nvidia-smi not found - no GPU support available")
     except Exception as e:
-        logging.warning("Error running nvidia-smi: %s", e)
+        logging.error(
+            "Action %s: Error running nvidia-smi: %s",
+            action_id,
+            e
+        )
         raise ValueError(f"Failed to get GPU information: {e}")
     if len(memory_free_info) < 2:
+        logging.error(
+            "Action %s: No GPU information available from nvidia-smi output",
+            action_id
+        )
         raise ValueError("No GPU information available from nvidia-smi")
     try:
         memory_free_values = [int(x.split()[0]) for x in memory_free_info[1:] if x.strip()]
     except (ValueError, IndexError) as e:
+        logging.error(
+            "Action %s: Error parsing GPU memory information: %s",
+            action_id,
+            e
+        )
         raise ValueError(f"Error parsing GPU memory information: {e}")
     if not memory_free_values:
+        logging.error("Action %s: No GPU devices found", action_id)
         raise ValueError("No GPU devices found")
+    # Log all available GPUs and their free memory
+    logging.info(
+        "Action %s: Found %d GPU(s) - Free memory: %s",
+        action_id,
+        len(memory_free_values),
+        ", ".join([f"GPU{i}: {mem}MB" for i, mem in enumerate(memory_free_values)])
+    )
+    # Check GPUS environment variable for allowed devices
+    allowed_gpus = os.environ.get("GPUS", "")
+    if allowed_gpus:
+        logging.info(
+            "Action %s: GPU device filter active - allowed devices: %s",
+            action_id,
+            allowed_gpus
+        )
+    # For smaller memory requirements, try to fit on a single GPU first
     if required_gpu_memory < 80000:
+        logging.debug(
+            "Action %s: Required memory %d MB < 80000 MB - attempting single GPU allocation",
+            action_id,
+            required_gpu_memory
+        )
         try:
-            return get_single_gpu_with_sufficient_memory_for_action(action_details)
-        except ValueError:
-            pass
+            single_gpu = get_single_gpu_with_sufficient_memory_for_action(action_details)
+            logging.info(
+                "Action %s: Successfully allocated single GPU: %s",
+                action_id,
+                single_gpu
+            )
+            return single_gpu
+        except ValueError as e:
+            logging.debug(
+                "Action %s: Single GPU allocation failed (%s) - will try multiple GPUs",
+                action_id,
+                str(e)
+            )
+    # Multi-GPU allocation: accumulate GPUs until we have enough memory
+    logging.info(
+        "Action %s: Attempting multi-GPU allocation for %d MB",
+        action_id,
+        required_gpu_memory
+    )
     selected_gpus = []
     total_memory = 0
     for i, mem in enumerate(memory_free_values):
         if not is_allowed_gpu_device(i):
+            logging.debug(
+                "Action %s: Skipping GPU %d - not in allowed device list",
+                action_id,
+                i
+            )
             continue
         if total_memory >= required_gpu_memory:
             break
         selected_gpus.append(i)
         total_memory += mem
+        logging.debug(
+            "Action %s: Added GPU %d (%d MB free) - Total: %d MB",
+            action_id,
+            i,
+            mem,
+            total_memory
+        )
     if total_memory >= required_gpu_memory:
+        logging.info(
+            "Action %s: Successfully allocated %d GPU(s): %s (Total memory: %d MB >= Required: %d MB)",
+            action_id,
+            len(selected_gpus),
+            selected_gpus,
+            total_memory,
+            required_gpu_memory
+        )
         return selected_gpus
-    raise ValueError(
-        f"Insufficient GPU memory available. Required: {required_gpu_memory}MB, Available: {total_memory}MB"
+    error_msg = (
+        f"Insufficient GPU memory available. "
+        f"Required: {required_gpu_memory}MB, "
+        f"Available: {total_memory}MB across {len(selected_gpus)} GPU(s)"
     )
+    logging.error("Action %s: %s", action_id, error_msg)
+    raise ValueError(error_msg)
 @log_errors(raise_exception=True, log_error=False)
@@ -679,7 +800,10 @@ def get_single_gpu_with_sufficient_memory_for_action(
     action_details: dict,
 ) -> list:
     """
-    Get single GPU with sufficient memory.
+    Get single GPU with sufficient memory using best-fit algorithm.
+    Best-fit selects the GPU with the smallest amount of free memory
+    that still meets the requirements, minimizing fragmentation.
     Args:
         action_details (dict): Action details
@@ -690,7 +814,15 @@ def get_single_gpu_with_sufficient_memory_for_action(
     Raises:
         ValueError: If no GPU has sufficient memory
     """
+    action_id = action_details.get("_id", "unknown")
     required_gpu_memory = get_required_gpu_memory(action_details)
+    logging.debug(
+        "Action %s: Finding best-fit single GPU for %d MB",
+        action_id,
+        required_gpu_memory
+    )
     command = ["nvidia-smi", "--query-gpu=memory.free", "--format=csv"]
     try:
         result = subprocess.run(
@@ -704,38 +836,104 @@ def get_single_gpu_with_sufficient_memory_for_action(
             raise ValueError("Failed to get GPU information - nvidia-smi command failed")
         memory_free_info = result.stdout.decode("ascii").strip().split("\n")
     except subprocess.TimeoutExpired:
-        logging.warning("nvidia-smi command timed out after 5 seconds in get_single_gpu_with_sufficient_memory_for_action")
+        logging.error(
+            "Action %s: nvidia-smi timed out in single GPU selection",
+            action_id
+        )
         raise ValueError("Failed to get GPU information - nvidia-smi timed out")
     except FileNotFoundError:
         raise ValueError("nvidia-smi not found - no GPU support available")
     except Exception as e:
-        logging.warning("Error running nvidia-smi: %s", e)
+        logging.error(
+            "Action %s: Error running nvidia-smi: %s",
+            action_id,
+            e
+        )
         raise ValueError(f"Failed to get GPU information: {e}")
     if len(memory_free_info) < 2:
         raise ValueError("No GPU information available from nvidia-smi")
     try:
         memory_free_values = [int(x.split()[0]) for x in memory_free_info[1:] if x.strip()]
     except (ValueError, IndexError) as e:
         raise ValueError(f"Error parsing GPU memory information: {e}")
     if not memory_free_values:
         raise ValueError("No GPU devices found")
+    # Best-fit algorithm: find GPU with minimum free memory that meets requirement
     best_fit_gpu = None
     best_fit_memory = float("inf")
     for i, mem in enumerate(memory_free_values):
+        # Check if GPU is in allowed list
         if not is_allowed_gpu_device(i):
+            logging.debug(
+                "Action %s: Skipping GPU %d (not in allowed list) - %d MB free",
+                action_id,
+                i,
+                mem
+            )
             continue
-        if mem >= required_gpu_memory and mem < best_fit_memory:
-            best_fit_gpu = i
-            best_fit_memory = mem
+        # Check if GPU has sufficient memory
+        if mem >= required_gpu_memory:
+            logging.debug(
+                "Action %s: GPU %d is candidate - %d MB free (required: %d MB)",
+                action_id,
+                i,
+                mem,
+                required_gpu_memory
+            )
+            # Best-fit: choose GPU with smallest sufficient memory
+            if mem < best_fit_memory:
+                best_fit_gpu = i
+                best_fit_memory = mem
+                logging.debug(
+                    "Action %s: GPU %d is new best-fit candidate",
+                    action_id,
+                    i
+                )
+        else:
+            logging.debug(
+                "Action %s: GPU %d insufficient - %d MB free < %d MB required",
+                action_id,
+                i,
+                mem,
+                required_gpu_memory
+            )
     if best_fit_gpu is not None:
+        logging.info(
+            "Action %s: Selected best-fit GPU %d with %d MB free (required: %d MB, waste: %d MB)",
+            action_id,
+            best_fit_gpu,
+            best_fit_memory,
+            required_gpu_memory,
+            best_fit_memory - required_gpu_memory
+        )
         return [best_fit_gpu]
-    raise ValueError(
-        f"No single GPU with sufficient memory ({required_gpu_memory}MB) available"
-    )
+    # No suitable GPU found - provide detailed error
+    suitable_gpus = [
+        f"GPU{i}: {mem}MB (need {required_gpu_memory}MB)"
+        for i, mem in enumerate(memory_free_values)
+        if is_allowed_gpu_device(i)
+    ]
+    if not suitable_gpus:
+        error_msg = f"No allowed GPUs available (GPUS env filter active)"
+    else:
+        error_msg = (
+            f"No single GPU with sufficient memory. "
+            f"Required: {required_gpu_memory}MB. "
+            f"Available GPUs: {', '.join(suitable_gpus)}"
+        )
+    logging.warning("Action %s: %s", action_id, error_msg)
+    raise ValueError(error_msg)
 @log_errors(default_return=(None, None), raise_exception=False)

matrice-compute 0.1.20__tar.gz → 0.1.21__tar.gz

matrice-compute 0.1.20.tar.gz → 0.1.21.tar.gz