PyPI - matrice-compute - Versions diffs - 0.1.19__py3-none-any.whl → 0.1.21__py3-none-any.whl - Mend

matrice-compute 0.1.19__py3-none-any.whl → 0.1.21__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9) hide show

matrice_compute/action_instance.py CHANGED Viewed

@@ -12,6 +12,7 @@ from matrice_compute.instance_utils import (
     get_gpu_with_sufficient_memory_for_action,
     get_decrypted_access_key_pair,
     get_max_file_system,
+    get_best_service_ip_and_network,
 )
 from matrice_compute.task_utils import (
     setup_workspace_and_run_task,
@@ -267,17 +268,68 @@ class ActionInstance:
         Returns:
             str: GPU configuration string
         """
-        if not action_details["actionDetails"].get("gpuRequired", False):
+        action_id = action_details.get("_id", "unknown")
+        # Check if GPU is required
+        gpu_required = action_details["actionDetails"].get("gpuRequired", False)
+        if not gpu_required:
+            logging.info(
+                "Action %s does not require GPU - will run on CPU",
+                action_id
+            )
             return ""
-        gpu_indices = get_gpu_with_sufficient_memory_for_action(
-            action_details=action_details
+        # Get required GPU memory for logging
+        required_memory = action_details.get("actionDetails", {}).get(
+            "expectedResources", {}
+        ).get("gpuMemory", 0)
+        logging.info(
+            "Action %s requires GPU with %d MB memory - selecting best-fit GPU(s)",
+            action_id,
+            required_memory
         )
-        if gpu_indices:
-            gpu_str = ",".join(map(str, gpu_indices))
-            logging.info("Using GPUs: %s", gpu_str)
-            return f'--gpus "device={gpu_str}"'
-        logging.info("No GPUs with sufficient memory found.")
-        return ""
+        try:
+            # Get the best-fit GPU(s) with sufficient memory
+            gpu_indices = get_gpu_with_sufficient_memory_for_action(
+                action_details=action_details
+            )
+            if gpu_indices:
+                gpu_str = ",".join(map(str, gpu_indices))
+                logging.info(
+                    "Action %s: Selected GPU device(s): %s (required memory: %d MB)",
+                    action_id,
+                    gpu_str,
+                    required_memory
+                )
+                # Return Docker GPU configuration
+                # Format: --gpus "device=0" or --gpus "device=0,1,2"
+                return f'--gpus "device={gpu_str}"'
+            else:
+                logging.warning(
+                    "Action %s: No GPUs with sufficient memory found (required: %d MB)",
+                    action_id,
+                    required_memory
+                )
+                return ""
+        except ValueError as e:
+            logging.error(
+                "Action %s: Error selecting GPU - %s",
+                action_id,
+                str(e)
+            )
+            return ""
+        except Exception as e:
+            logging.error(
+                "Action %s: Unexpected error in GPU selection - %s",
+                action_id,
+                str(e)
+            )
+            return ""
     @log_errors(default_return="", raise_exception=False)
     def get_base_docker_cmd(
@@ -526,13 +578,18 @@ class ActionInstance:
             if username and password:
                 login_cmd = f"docker login -u {shlex.quote(username)} -p {shlex.quote(password)}"
-                subprocess.run(login_cmd, shell=True, check=True)
+                result = subprocess.run(login_cmd, shell=True, check=False, capture_output=True, text=True, timeout=30)
+                if result.returncode != 0:
+                    raise Exception(f"Docker login failed with exit code {result.returncode}: {result.stderr}")
                 logging.info("Docker login successful")
             else:
                 logging.warning(
                     "Docker credentials not available, skipping Docker login"
                 )
+        except subprocess.TimeoutExpired:
+            logging.error("Docker login timed out after 30 seconds")
+            raise Exception("Docker login timed out")
         except Exception as err:
             logging.error(
                 "Docker login failed: %s",
@@ -1151,9 +1208,17 @@ def inference_ws_server_execute(self: ActionInstance):
         return
     image = action_details["actionDetails"].get("docker")
     self.setup_action_requirements(action_details)
+    # Get the best IP and network configuration for port 8102
+    ws_host, use_host_network = get_best_service_ip_and_network(8102)
+    # Store ws_host in environment variable for use by other actions (e.g., fe_fs_streaming)
+    if not os.environ.get("INFERENCE_WS_HOST"):
+        os.environ["INFERENCE_WS_HOST"] = ws_host
+    logging.info(f"Inference WebSocket server will use IP: {ws_host} on port 8102 (use_host_network={use_host_network})")
     # Inference WebSocket server with --net=host (Port: 8102)
     worker_cmd = (
         f"docker run -d --pull=always --net=host "
@@ -1164,7 +1229,6 @@ def inference_ws_server_execute(self: ActionInstance):
         f"{image} "
         f"./app "
         f"{self.action_record_id} "
     )
     logging.info("Starting inference WebSocket server (Port: 8102): %s", worker_cmd)
@@ -1185,7 +1249,13 @@ def fe_fs_streaming_execute(self: ActionInstance):
     image = action_details["actionDetails"].get("docker")
     self.setup_action_requirements(action_details)
+    # Get the ws_host from environment variable set by inference_ws_server_execute
+    ws_host = os.environ.get("INFERENCE_WS_HOST", "localhost")
+    ws_url = f"{ws_host}:8102"
+    logging.info(f"Frontend streaming will connect to WebSocket at: {ws_url}")
     # Frontend streaming with --net=host (Port: 3000)
     worker_cmd = (
         f"docker run -d --pull=always --net=host "
@@ -1195,9 +1265,10 @@ def fe_fs_streaming_execute(self: ActionInstance):
         f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
         f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
         f"-e PORT=3000 "
+        f'-e WS_HOST="{ws_url}" '
         f"{image}"
     )
-    logging.info("Starting frontend streaming (Port: 3000): %s", worker_cmd)
+    logging.info("Starting frontend streaming (Port: 3000) with WS_HOST=%s: %s", ws_url, worker_cmd)
     # Docker Command run
     self.start(worker_cmd, "fe_fs_streaming")
@@ -1304,6 +1375,11 @@ def redis_setup_execute(self: ActionInstance):
         action_id=action_id,
     )
+    # Get the best IP for Redis (port 6379)
+    redis_host, _ = get_best_service_ip_and_network(6379)
+    logging.info(f"Redis will use IP: {redis_host} on port 6379")
     redis_image = action_details["actionDetails"].get("redis_image", "redis:latest")
     # Redis container with --net=host (Port: 6379)
@@ -1315,7 +1391,7 @@ def redis_setup_execute(self: ActionInstance):
         f"redis-server --bind 0.0.0.0 --appendonly yes --requirepass {redis_password}"
     )
-    logging.info("Starting Redis container (Port: 6379): %s", redis_cmd)
+    logging.info("Starting Redis container on %s:6379: %s", redis_host, redis_cmd)
     # Start Redis container first
     redis_process = subprocess.Popen(
@@ -1324,13 +1400,13 @@ def redis_setup_execute(self: ActionInstance):
         stdout=subprocess.PIPE,
         stderr=subprocess.PIPE,
     )
-    logging.info("Redis container started successfully on localhost:6379")
+    logging.info("Redis container started successfully on %s:6379", redis_host)
     # Wait for Redis to be ready
     time.sleep(5)
     env_vars = {
-        "REDIS_URL": f"localhost:6379",
+        "REDIS_URL": f"{redis_host}:6379",
         "REDIS_PASSWORD": redis_password,
     }
@@ -1348,7 +1424,7 @@ def redis_setup_execute(self: ActionInstance):
         f"{self.action_record_id} "
     )
-    logging.info("Starting bg-redis management (Port: 8082): %s", cmd)
+    logging.info("Starting bg-redis management (Port: 8082) with REDIS_URL=%s: %s", env_vars['REDIS_URL'], cmd)
     self.start(cmd, "redis_setup")
@@ -1385,8 +1461,17 @@ def model_deploy_execute(self: ActionInstance):
         model_family=model_family,
         action_id=action_id,
     )
+    # Get GPU configuration based on requirements and availability
+    # This uses the best-fit algorithm to select the most appropriate GPU(s)
     use_gpu = self.get_gpu_config(action_details)
-    use_gpu = "--runtime=nvidia "
+    logging.info(
+        "Action %s: Model deployment GPU config: %s",
+        action_id,
+        use_gpu if use_gpu else "CPU-only"
+    )
     extra_env_vars = {"INTERNAL_PORT": internal_port}
     cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, mount_docker_sock=True, action_id=action_id, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_inference", "matrice_analytics"])} python3 deploy.py {self.action_record_id} {external_port}"'
     logging.info("cmd is: %s", cmd)

matrice-compute 0.1.19__py3-none-any.whl → 0.1.21__py3-none-any.whl

matrice-compute 0.1.19__py3-none-any.whl → 0.1.21__py3-none-any.whl