PyPI - matrice-compute - Versions diffs - 0.1.37__tar.gz → 0.1.38__tar.gz - Mend

matrice-compute 0.1.37.tar.gz → 0.1.38.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (24)

{matrice_compute-0.1.37 → matrice_compute-0.1.38}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: matrice_compute
-Version: 0.1.37
+Version: 0.1.38
 Summary: Common server utilities for Matrice.ai services
 Author-email: "Matrice.ai" <dipendra@matrice.ai>
 License-Expression: MIT

{matrice_compute-0.1.37 → matrice_compute-0.1.38}/matrice_compute.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: matrice_compute
-Version: 0.1.37
+Version: 0.1.38
 Summary: Common server utilities for Matrice.ai services
 Author-email: "Matrice.ai" <dipendra@matrice.ai>
 License-Expression: MIT

{matrice_compute-0.1.37 → matrice_compute-0.1.38}/src/matrice_compute/action_instance.py RENAMED Viewed

@@ -26,6 +26,10 @@ from matrice_common.utils import log_errors
 class ActionInstance:
     """Base class for tasks that run in Action containers."""
+    # Class-level dictionary to track deployed services and their ports
+    # Key: _idService, Value: {"triton_ports": "port1,port2,port3"}
+    _deployed_services = {}
     def __init__(self, scaling: Scaling, action_info: dict):
         """Initialize an action instance.
@@ -85,6 +89,52 @@ class ActionInstance:
             raise ValueError(f"Unknown action type: {self.action_type}")
         self.task = self.actions_map[self.action_type]
+    @classmethod
+    def get_or_create_triton_ports(cls, service_id, scaling_instance):
+        """Get existing TRITON_PORTS for a service or create new ones.
+        Args:
+            service_id (str): Service ID (_idService)
+            scaling_instance: Scaling instance to get open ports
+        Returns:
+            str: Comma-separated string of 3 port numbers (e.g., "8001,8002,8003")
+        """
+        if not service_id:
+            # No service_id, generate new ports
+            port1 = scaling_instance.get_open_port()
+            port2 = scaling_instance.get_open_port()
+            port3 = scaling_instance.get_open_port()
+            return f"{port1},{port2},{port3}"
+        # Check if ports already exist for this service
+        if service_id in cls._deployed_services:
+            triton_ports = cls._deployed_services[service_id]["triton_ports"]
+            logging.info(
+                "Reusing TRITON_PORTS for service %s: %s",
+                service_id,
+                triton_ports
+            )
+            return triton_ports
+        # First deployment: generate new ports and store them
+        port1 = scaling_instance.get_open_port()
+        port2 = scaling_instance.get_open_port()
+        port3 = scaling_instance.get_open_port()
+        triton_ports = f"{port1},{port2},{port3}"
+        # Store for future use
+        cls._deployed_services[service_id] = {
+            "triton_ports": triton_ports,
+        }
+        logging.info(
+            "First deployment for service %s - generated TRITON_PORTS: %s",
+            service_id,
+            triton_ports
+        )
+        return triton_ports
     @log_errors(default_return={}, raise_exception=True, log_error=False)
     def _init_credentials(self):
         """Initialize Matrice credentials.
@@ -346,6 +396,7 @@ class ActionInstance:
         destination_workspace_path: str = "/usr/src/workspace",
         docker_workdir: str = "",
         extra_pkgs: list = [],
+        container_name: str = "",
     ):
         """Build base Docker command with common options.
@@ -360,6 +411,7 @@ class ActionInstance:
             destination_workspace_path (str): Container workspace path
             docker_workdir (str): Docker working directory
             extra_pkgs (list): List of extra packages to install
+            container_name (str): Docker container name (format: {action_type}_{action_id})
         Returns:
             str: Base Docker command
         """
@@ -426,17 +478,20 @@ class ActionInstance:
         # if the service provider is local, then put --restart unless-stopped
         if os.environ.get("SERVICE_PROVIDER") in ("local", "LOCAL"):
+            env_exports += " && export DOCKER_RESTART_POLICY='--restart unless-stopped' "
             use_restart_policy = "--restart unless-stopped"
         else:
             use_restart_policy = ""
+        # Build container name option if provided
+        name_option = f"--name {container_name}" if container_name else ""
         cmd_parts = [
-            f"docker run {use_gpu} {use_restart_policy} ",
+            f"docker run -d {use_gpu} {use_restart_policy} ",
             network_config,
             *[f"-e {key}={shlex.quote(str(value))}" for key, value in env_vars.items()],
             *volumes,
             # Container configuration and startup commands
-            f"--cidfile ./{self.action_record_id}.cid ",
             f"--shm-size=30G --pull=always {shlex.quote(self.docker_container)}",
             f'/bin/bash -c "cd {docker_workdir} && '
             f"{env_exports} && "
@@ -838,55 +893,50 @@ class ActionInstance:
         self.cmd = cmd
         self.log_path = f"{self.get_log_path()}/{log_name}_{self.action_record_id}.txt"
-        with open(self.log_path, "wb") as out:
-            self.process = subprocess.Popen(
-                shlex.split(self.cmd),
-                stdout=out,
-                stderr=out,
-                env={**os.environ},
-                start_new_session=True,
-            )
+        # Run docker with -d flag to get container ID from stdout
+        process = subprocess.Popen(
+            shlex.split(self.cmd),
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            text=True,
+            env={**os.environ},
+        )
-        self.container_id = None
-        cid_file_path = f"./{self.action_record_id}.cid"
-        max_retries = 5
-        retry_delay = 1  # seconds
-        for attempt in range(max_retries):
-            try:
-                with open(cid_file_path, "r") as cid_file:
-                    container_id = cid_file.read().strip()
-                    self.container_id = container_id
-                    logging.info(
-                        "Started process for action %s with container ID: %s",
-                        self.action_record_id,
-                        self.container_id,
-                    )
-                    break
-            except FileNotFoundError:
-                logging.warning(
-                    "CID file not found for action %s, attempt %d/%d",
-                    self.action_record_id,
-                    attempt + 1,
-                    max_retries,
-                )
-                time.sleep(retry_delay)
-            except Exception as e:
-                logging.error(
-                    "Error reading CID file for action %s: %s",
-                    self.action_record_id,
-                    str(e),
-                )
-                time.sleep(retry_delay)
-        else:
+        # Use a longer timeout for docker run since --pull=always may need to
+        # download large images on first run. Default: 30 minutes (1800 seconds)
+        # Can be configured via DOCKER_START_TIMEOUT_SECONDS environment variable
+        docker_start_timeout = int(os.environ.get("DOCKER_START_TIMEOUT_SECONDS", 1800))
+        logging.info(
+            "Waiting for docker container to start for action %s (timeout: %d seconds)",
+            self.action_record_id,
+            docker_start_timeout,
+        )
+        stdout, stderr = process.communicate(timeout=docker_start_timeout)
+        if process.returncode != 0:
             logging.error(
-                "Failed to read CID file for action %s after %d attempts",
+                "Docker run failed for action %s: %s",
                 self.action_record_id,
-                max_retries,
+                stderr,
             )
-            raise Exception("Failed to start process: CID file not found")
+            raise RuntimeError(f"Docker run failed: {stderr}")
-        # report container id to scaling service
+        self.container_id = stdout.strip()
+        logging.info(
+            "Started container for action %s with ID: %s",
+            self.action_record_id,
+            self.container_id,
+        )
+        # Start following container logs in background
+        self.process = subprocess.Popen(
+            ["docker", "logs", "-f", self.container_id],
+            stdout=open(self.log_path, "wb"),
+            stderr=subprocess.STDOUT,
+            start_new_session=True,
+        )
+        # Report container id to scaling service
         self.scaling.update_action_container_id(
             action_record_id=self.action_record_id,
             container_id=self.container_id,
@@ -1052,7 +1102,8 @@ def data_preparation_execute(
             "Started pulling Docker image with PID: %s",
             process.pid,
         )
-    cmd = f'{self.get_base_docker_cmd(work_fs, destination_workspace_path="/usr/src/app/workspace", docker_workdir="/usr/src/app/workspace", extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/data_preparation.py {self.action_record_id} "'
+    container_name = f"data_prep_{self.action_record_id}"
+    cmd = f'{self.get_base_docker_cmd(work_fs, destination_workspace_path="/usr/src/app/workspace", docker_workdir="/usr/src/app/workspace", extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/data_preparation.py {self.action_record_id} "'
     logging.info("cmd is: %s", cmd)
     self.start(cmd, "data_preparation_log")
@@ -1081,7 +1132,8 @@ def data_processing_execute(self: ActionInstance):
         service="bg-job-scheduler",
         job_params=action["jobParams"],
     )
-    cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/main.py {self.action_record_id} "'
+    container_name = f"data_processing_{self.action_record_id}"
+    cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/main.py {self.action_record_id} "'
     logging.info("cmd: %s", cmd)
     self.start(cmd, "data_processing_log")
@@ -1094,7 +1146,8 @@ def data_split_execute(self: ActionInstance):
     if not action_details:
         return
     self.setup_action_requirements(action_details, work_fs, model_family="")
-    cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/data_split.py {self.action_record_id} "'
+    container_name = f"data_split_{self.action_record_id}"
+    cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/data_split.py {self.action_record_id} "'
     logging.info("cmd: %s", cmd)
     self.start(cmd, "data_split")
@@ -1109,7 +1162,8 @@ def dataset_annotation_execute(
     if not action_details:
         return
     self.setup_action_requirements(action_details, work_fs)
-    cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/dataset_annotation.py {self.action_record_id} "'
+    container_name = f"dataset_annotation_{self.action_record_id}"
+    cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/dataset_annotation.py {self.action_record_id} "'
     logging.info("cmd: %s", cmd)
     self.start(cmd, "dataset_annotation")
@@ -1124,7 +1178,8 @@ def dataset_augmentation_execute(
     if not action_details:
         return
     self.setup_action_requirements(action_details, work_fs)
-    cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/data_augmentation.py {self.action_record_id} "'
+    container_name = f"dataset_augmentation_{self.action_record_id}"
+    cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/data_augmentation.py {self.action_record_id} "'
     logging.info("cmd: %s", cmd)
     self.start(cmd, "dataset_augmentation")
@@ -1140,7 +1195,8 @@ def augmentation_server_creation_execute(
     if not action_details:
         return
     self.setup_action_requirements(action_details, work_fs)
-    cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/aug_server.py {self.action_record_id} {external_port} "'
+    container_name = f"augmentation_setup_{self.action_record_id}"
+    cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/aug_server.py {self.action_record_id} {external_port} "'
     logging.info("cmd: %s", cmd)
     self.start(cmd, "augmentation_setup")
@@ -1161,32 +1217,34 @@ def database_setup_execute(self: ActionInstance):
     project_id = action_details["_idProject"]
+    # Define container names with action_record_id for uniqueness
+    mongodb_container_name = f"database_setup_{self.action_record_id}"
+    qdrant_container_name = f"qdrant_{self.action_record_id}"
     if action_details["actionDetails"].get("containerId"):
         logging.info(
-            "Using existing container ID for inference tracker: %s",
+            "Using existing container ID for database setup: %s",
             action_details["actionDetails"]["containerId"],
         )
         self.docker_container = action_details["actionDetails"]["containerId"]
         cmd = "docker restart " + self.docker_container
-        self.start(cmd, "qdrant_setup")
+        self.start(cmd, "database_setup")
-        #qdrant restart
-        qdrant_cmd = "docker restart qdrant"
-        self.start(qdrant_cmd, 'qdrant_setup')
+        # qdrant restart
+        qdrant_cmd = f"docker restart {qdrant_container_name}"
+        self.start(qdrant_cmd, "qdrant_setup")
         return
-    dbPath =action_details["jobParams"].get("dbPath","/host/data/path/mongodb_data")
+    dbPath = action_details["jobParams"].get("dbPath", "/host/data/path/mongodb_data")
     # MongoDB container with --net=host (Port: 27020:27017)
     cmd = (
-        f"docker run --pull=always --net=host "
+        f"docker run -d --pull=always --net=host "
+        f"--name {mongodb_container_name} "
+        f"-v matrice_myvol:/matrice_data "
         f"-v {dbPath}:{dbPath} "
-        f"--name database_setup_{self.action_record_id} "
         f"-v /var/run/docker.sock:/var/run/docker.sock "
-        f"--cidfile ./{self.action_record_id}.cid "
         f"-e ACTION_RECORD_ID={self.action_record_id} "
         f"-e MATRICE_ACCESS_KEY_ID={self.matrice_access_key_id} "
         f"-e MATRICE_SECRET_ACCESS_KEY={self.matrice_secret_access_key} "
@@ -1196,6 +1254,23 @@ def database_setup_execute(self: ActionInstance):
     )
     logging.info("Starting DB container (Port: 27020:27017): %s", cmd)
+    # Qdrant container with --net=host (Port: 6334)
+    qdrant_cmd = (
+        f"docker run -d --pull=always --net=host "
+        f"--name {qdrant_container_name} "
+        f"-v matrice_myvol:/matrice_data "
+        f"qdrant/qdrant:latest "
+    )
+    logging.info("Starting Qdrant container (Port: 6334): %s", qdrant_cmd)
+    # Start Qdrant container
+    qdrant_process = subprocess.Popen(
+        qdrant_cmd,
+        shell=True,
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE,
+    )
+    logging.info("Qdrant container started successfully")
     # Docker Command run
     self.start(cmd, "database_setup")
@@ -1215,6 +1290,8 @@ def facial_recognition_setup_execute(self: ActionInstance):
     self.setup_action_requirements(action_details)
+    container_name = f"facial_recognition_{self.action_record_id}"
     if action_details["actionDetails"].get("containerId"):
         logging.info(
             "Using existing container ID for facial recognition worker: %s",
@@ -1228,15 +1305,13 @@ def facial_recognition_setup_execute(self: ActionInstance):
     # Facial recognition worker container with --net=host (Port: 8081)
     worker_cmd = (
         f"docker run -d --pull=always --net=host "
-        f"--name worker "
-         f"--cidfile ./{self.action_record_id}.cid "
+        f"--name {container_name} "
         f"-v matrice_myvol:/matrice_data "
-        f"--cidfile ./{self.action_record_id}.cid "
         f'-e ENV="{os.environ.get("ENV", "prod")}" '
         f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
         f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
         f'-e ACTION_ID="{self.action_record_id}" '
-        f' --restart=unless-stopped '
+        f'--restart=unless-stopped '
         f"{image}"
     )
     logging.info("Starting facial recognition worker (Port: 8081): %s", worker_cmd)
@@ -1258,6 +1333,8 @@ def lpr_setup_execute(self: ActionInstance):
     self.setup_action_requirements(action_details)
+    container_name = f"lpr_{self.action_record_id}"
     if action_details["actionDetails"].get("containerId"):
         logging.info(
             "Using existing container ID for LPR worker: %s",
@@ -1271,15 +1348,14 @@ def lpr_setup_execute(self: ActionInstance):
     # LPR worker container with --net=host (Port: 8082)
     worker_cmd = (
         f"docker run -d --net=host --pull=always "
-        f"--name lpr-worker "
-        f"--cidfile ./{self.action_record_id}.cid "
+        f"--name {container_name} "
         f"-v matrice_myvol:/matrice_data "
         f'-e ENV="{os.environ.get("ENV", "prod")}" '
         f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
         f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
         f'-e ACTION_ID="{self.action_record_id}" '
         f'-e PORT=8082 '
-        f' --restart=unless-stopped '
+        f'--restart=unless-stopped '
         f"{image}"
     )
     logging.info("Starting LPR worker (Port: 8082): %s", worker_cmd)
@@ -1310,6 +1386,8 @@ def inference_ws_server_execute(self: ActionInstance):
     logging.info(f"Inference WebSocket server will use IP: {ws_host} on port 8102 (use_host_network={use_host_network})")
+    container_name = f"inference_ws_{self.action_record_id}"
     if action_details["actionDetails"].get("containerId"):
         logging.info(
             "Using existing container ID for inference WebSocket server: %s",
@@ -1323,12 +1401,11 @@ def inference_ws_server_execute(self: ActionInstance):
     # Inference WebSocket server with --net=host (Port: 8102)
     worker_cmd = (
         f"docker run -d --pull=always --net=host "
-        f"--name inference "
-        f"--cidfile ./{self.action_record_id}.cid "
+        f"--name {container_name} "
         f'-e ENV="{os.environ.get("ENV", "prod")}" '
         f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
         f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
-        f' --restart=unless-stopped '
+        f'--restart=unless-stopped '
         f"{image} "
         f"./app "
         f"{self.action_record_id} "
@@ -1359,6 +1436,8 @@ def fe_fs_streaming_execute(self: ActionInstance):
     logging.info(f"Frontend streaming will connect to WebSocket at: {ws_url}")
+    container_name = f"fe_streaming_{self.action_record_id}"
     if action_details["actionDetails"].get("containerId"):
         logging.info(
             "Using existing container ID for frontend streaming: %s",
@@ -1372,15 +1451,14 @@ def fe_fs_streaming_execute(self: ActionInstance):
     # Frontend streaming with --net=host (Port: 3000)
     worker_cmd = (
         f"docker run -d --pull=always --net=host "
-        f"--name fe_streaming "
-        f"--cidfile ./{self.action_record_id}.cid "
+        f"--name {container_name} "
         f"-v matrice_myvol:/matrice_data "
         f'-e ENV="{os.environ.get("ENV", "prod")}" '
         f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
         f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
         f"-e PORT=3000 "
         f'-e WS_HOST="{ws_url}" '
-        f' --restart=unless-stopped '
+        f'--restart=unless-stopped '
         f"{image}"
     )
     logging.info("Starting frontend streaming (Port: 3000) with WS_HOST=%s: %s", ws_url, worker_cmd)
@@ -1405,6 +1483,8 @@ def fe_analytics_service_execute(self: ActionInstance):
     project_id = action_details["_idProject"]
+    container_name = f"fe_analytics_{self.action_record_id}"
     if action_details["actionDetails"].get("containerId"):
         logging.info(
             "Using existing container ID for frontend analytics service: %s",
@@ -1418,15 +1498,14 @@ def fe_analytics_service_execute(self: ActionInstance):
     # Frontend analytics service with --net=host (Port: 3001)
     worker_cmd = (
         f"docker run -d --pull=always --net=host "
-        f"--name fe-analytics "
-        f"--cidfile ./{self.action_record_id}.cid "
+        f"--name {container_name} "
         f'-e NEXT_PUBLIC_DEPLOYMENT_ENV="{os.environ.get("ENV", "prod")}" '
         f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
         f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
         f'-e ACTION_ID="{self.action_record_id}" '
         f"-e PORT=3001 "
         f'-e PROJECT_ID="{project_id}" '
-        f' --restart=unless-stopped '
+        f'--restart=unless-stopped '
         f"{image}"
     )
     logging.info("Starting frontend analytics service (Port: 3001): %s", worker_cmd)
@@ -1451,7 +1530,8 @@ def synthetic_dataset_generation_execute(self: ActionInstance):
     else:
         return
     use_gpu = self.get_gpu_config(action_details)
-    cmd = f'{self.get_base_docker_cmd(work_fs=work_fs, use_gpu=use_gpu, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/synthetic_dataset_generation.py {self.action_record_id} "'
+    container_name = f"dataset_generation_{self.action_record_id}"
+    cmd = f'{self.get_base_docker_cmd(work_fs=work_fs, use_gpu=use_gpu, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/synthetic_dataset_generation.py {self.action_record_id} "'
     logging.info("cmd is: %s", cmd)
     self.start(cmd, "dataset_generation")
@@ -1472,7 +1552,8 @@ def synthetic_data_setup_execute(self: ActionInstance):
     else:
         return
     use_gpu = self.get_gpu_config(action_details)
-    cmd = f'{self.get_base_docker_cmd(work_fs=work_fs, use_gpu=use_gpu, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/data_generation.py {self.action_record_id} {external_port} "'
+    container_name = f"synthetic_data_setup_{self.action_record_id}"
+    cmd = f'{self.get_base_docker_cmd(work_fs=work_fs, use_gpu=use_gpu, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/data_generation.py {self.action_record_id} {external_port} "'
     logging.info("cmd is: %s", cmd)
     self.start(cmd, "synthetic_data_setup")
@@ -1509,6 +1590,8 @@ def redis_setup_execute(self: ActionInstance):
     redis_image = action_details["actionDetails"].get("redis_image", "redis:latest")
+    # Define container names with action_record_id for uniqueness
+    redis_container_name = f"redis_{self.action_record_id}"
     if action_details["actionDetails"].get("containerId"):
         logging.info(
@@ -1520,18 +1603,34 @@ def redis_setup_execute(self: ActionInstance):
         self.start(cmd, "redis_setup")
         # Redis container restart
-        redis_restart_cmd = "docker restart redis_container"
+        redis_restart_cmd = f"docker restart {redis_container_name}"
         self.start(redis_restart_cmd, "redis")
         return
-    # Redis container with --net=host (Port: 6379)
+    # Redis container with --net=host (Port: 6379) with optimized configuration
     redis_cmd = (
         f"docker run -d --net=host "
-        f"--name redis_container "
+        f"--name {redis_container_name} "
         f"--restart unless-stopped "
         f"{redis_image} "
-        f"redis-server --bind 0.0.0.0 --appendonly yes --requirepass {redis_password}"
+        f"redis-server --bind 0.0.0.0 "
+        f"--appendonly no "
+        f'--save "" '
+        f"--maxmemory 30gb "
+        f"--maxmemory-policy allkeys-lru "
+        f"--io-threads 4 "
+        f"--io-threads-do-reads yes "
+        f"--stream-node-max-bytes 8192 "
+        f"--stream-node-max-entries 1000 "
+        f"--hz 100 "
+        f"--tcp-backlog 2048 "
+        f"--timeout 0 "
+        f"--lazyfree-lazy-eviction yes "
+        f"--lazyfree-lazy-expire yes "
+        f"--lazyfree-lazy-server-del yes "
+        f"--activedefrag yes "
+        f"--requirepass {redis_password}"
     )
     logging.info("Starting Redis container on %s:6379: %s", redis_host, redis_cmd)
@@ -1555,8 +1654,9 @@ def redis_setup_execute(self: ActionInstance):
     # bg-redis management container with --net=host (Port: 8082)
     cmd = (
-        f"docker run --net=host "
-         f"--cidfile ./{self.action_record_id}.cid "
+        f"docker run -d --net=host "
+        f"--restart unless-stopped "
+        f"--name bg-redis_{self.action_record_id} "
         f"-e REDIS_URL={shlex.quote(env_vars['REDIS_URL'])} "
         f"-e REDIS_PASSWORD={shlex.quote(env_vars['REDIS_PASSWORD'])} "
         f"-e MATRICE_ACCESS_KEY_ID={shlex.quote(self.matrice_access_key_id)} "
@@ -1583,7 +1683,8 @@ def deploy_aggregator_execute(
     if not action_details:
         return
     self.setup_action_requirements(action_details, work_fs)
-    cmd = f'{self.get_base_docker_cmd(work_fs)} python3 /usr/src/app/deploy_aggregator.py {self.action_record_id} "'
+    container_name = f"deploy_aggregator_{self.action_record_id}"
+    cmd = f'{self.get_base_docker_cmd(work_fs, container_name=container_name)} python3 /usr/src/app/deploy_aggregator.py {self.action_record_id} "'
     logging.info("cmd: %s", cmd)
     self.start(cmd, "deploy_aggregator")
@@ -1599,6 +1700,10 @@ def model_deploy_execute(self: ActionInstance):
         return
     action_id = action_details["_id"]
     model_family = action_details["actionDetails"]["modelFamily"]
+    # Get the service ID to track deployments
+    service_id = action_details.get("_idService")
     self.setup_action_requirements(
         action_details,
         work_fs,
@@ -1606,17 +1711,29 @@ def model_deploy_execute(self: ActionInstance):
         action_id=action_id,
     )
-    # Get GPU configuration based on requirements and availability
-    # This uses the best-fit algorithm to select the most appropriate GPU(s)
-    use_gpu = self.get_gpu_config(action_details)
-    # Override: If GPU is required, use all available GPUs
+    # Use all GPUs if GPU is required
     gpuRequired = action_details["actionDetails"].get("gpuRequired", False)
     if gpuRequired:
         use_gpu = "--runtime=nvidia --gpus all"
+    else:
+        use_gpu = ""
+    logging.info(
+        "Action %s: Model deployment GPU config: %s",
+        action_id,
+        use_gpu if use_gpu else "CPU-only"
+    )
+    # Get or create TRITON_PORTS (uses utility method)
+    triton_ports = ActionInstance.get_or_create_triton_ports(service_id, self.scaling)
-    extra_env_vars = {"INTERNAL_PORT": internal_port}
-    cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, mount_docker_sock=True, action_id=action_id, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_inference", "matrice_analytics"])} python3 deploy.py {self.action_record_id} {external_port}"'
+    extra_env_vars = {
+        "INTERNAL_PORT": internal_port,
+        "TRITON_PORTS": triton_ports
+    }
+    container_name = f"model_deploy_{self.action_record_id}"
+    cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, mount_docker_sock=True, action_id=action_id, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_inference", "matrice_analytics"], container_name=container_name)} python3 deploy.py {self.action_record_id} {external_port}"'
     logging.info("cmd is: %s", cmd)
     self.start(cmd, "deploy_log")
@@ -1649,7 +1766,8 @@ def model_train_execute(self: ActionInstance):
         self.start(cmd, "train_log")
         return
-    cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id, model_key=model_key)} python3 train.py {self.action_record_id} "'
+    container_name = f"model_train_{self.action_record_id}"
+    cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id, model_key=model_key, container_name=container_name)} python3 train.py {self.action_record_id} "'
     logging.info("cmd is: %s", cmd)
     self.start(cmd, "train_log")
@@ -1672,7 +1790,7 @@ def model_eval_execute(self: ActionInstance):
     )
     if action_details["actionDetails"].get("containerId"):
         logging.info(
-            "Using existing container ID for training: %s",
+            "Using existing container ID for evaluation: %s",
             action_details["actionDetails"]["containerId"],
         )
         self.docker_container = action_details["actionDetails"]["containerId"]
@@ -1680,7 +1798,8 @@ def model_eval_execute(self: ActionInstance):
         self.start(cmd, "eval_log")
         return
-    cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id)} python3 eval.py {self.action_record_id} "'
+    container_name = f"model_eval_{self.action_record_id}"
+    cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id, container_name=container_name)} python3 eval.py {self.action_record_id} "'
     logging.info("cmd is: %s", cmd)
     self.start(cmd, "eval_log")
@@ -1706,7 +1825,7 @@ def model_export_execute(self: ActionInstance):
     )
     if action_details["actionDetails"].get("containerId"):
         logging.info(
-            "Using existing container ID for training: %s",
+            "Using existing container ID for export: %s",
             action_details["actionDetails"]["containerId"],
         )
         self.docker_container = action_details["actionDetails"]["containerId"]
@@ -1714,7 +1833,8 @@ def model_export_execute(self: ActionInstance):
         self.start(cmd, "export_log")
         return
-    cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id)} python3 export.py {self.action_record_id} "'
+    container_name = f"model_export_{self.action_record_id}"
+    cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id, container_name=container_name)} python3 export.py {self.action_record_id} "'
     logging.info("cmd is: %s", cmd)
     self.start(cmd, "export_log")
@@ -1730,7 +1850,8 @@ def image_build_execute(self: ActionInstance):
     action_id = action_details["_id"]
     internal_api_key = self.get_internal_api_key(action_id)
     extra_env_vars = {"MATRICE_INTERNAL_API_KEY": internal_api_key}
-    cmd = f'{self.get_base_docker_cmd(mount_docker_sock=True, extra_env_vars=extra_env_vars)} python3 main.py {model_family_id} {action_id}"'
+    container_name = f"image_build_{self.action_record_id}"
+    cmd = f'{self.get_base_docker_cmd(mount_docker_sock=True, extra_env_vars=extra_env_vars, container_name=container_name)} python3 main.py {model_family_id} {action_id}"'
     logging.info("cmd is: %s", cmd)
     self.start(cmd, "image_build_log")
@@ -1742,7 +1863,8 @@ def resource_clone_execute(self: ActionInstance):
     if not action_details:
         return
     self.setup_action_requirements(action_details)
-    cmd = f'{self.get_base_docker_cmd()} python3 main.py {self.action_record_id} "'
+    container_name = f"resource_clone_{self.action_record_id}"
+    cmd = f'{self.get_base_docker_cmd(container_name=container_name)} python3 main.py {self.action_record_id} "'
     logging.info("cmd is: %s", cmd)
     self.start(cmd, "resource_clone")
@@ -1760,7 +1882,7 @@ def streaming_gateway_execute(self: ActionInstance):
         )
     if action_details["actionDetails"].get("containerId"):
         logging.info(
-            "Using existing container ID for training: %s",
+            "Using existing container ID for streaming gateway: %s",
             action_details["actionDetails"]["containerId"],
         )
         self.docker_container = action_details["actionDetails"]["containerId"]
@@ -1768,7 +1890,8 @@ def streaming_gateway_execute(self: ActionInstance):
         self.start(cmd, "streaming_gateway")
         return
-    cmd = f'{self.get_base_docker_cmd(extra_pkgs=["matrice_streaming"])} python3 /usr/src/app/streaming_gateway.py {self.action_record_id} "'
+    container_name = f"streaming_gateway_{self.action_record_id}"
+    cmd = f'{self.get_base_docker_cmd(extra_pkgs=["matrice_streaming"], container_name=container_name)} python3 /usr/src/app/streaming_gateway.py {self.action_record_id} "'
     logging.info("cmd is: %s", cmd)
     self.start(cmd, "streaming_gateway")
@@ -1864,7 +1987,7 @@ def kafka_setup_execute(self: ActionInstance):
     if action_details["actionDetails"].get("containerId"):
         logging.info(
-            "Using existing container ID for training: %s",
+            "Using existing container ID for kafka: %s",
             action_details["actionDetails"]["containerId"],
         )
         self.docker_container = action_details["actionDetails"]["containerId"]
@@ -1872,10 +1995,12 @@ def kafka_setup_execute(self: ActionInstance):
         self.start(cmd, "kafka_setup")
         return
+    container_name = f"kafka_{self.action_record_id}"
     # Kafka container with --net=host (Ports: 9092, 9093)
     cmd = (
-        f"docker run --net=host "
+        f"docker run -d --net=host "
+        f"--name {container_name} "
         f"{env_args} "
         f"--shm-size=30G --pull=always "
         f'aiforeveryone/matrice-kafka:latest /bin/bash -c "'
@@ -1908,6 +2033,8 @@ def inference_tracker_setup_execute(self: ActionInstance):
     self.setup_action_requirements(action_details)
+    container_name = f"inference_tracker_{self.action_record_id}"
     if action_details["actionDetails"].get("containerId"):
         logging.info(
             "Using existing container ID for inference tracker: %s",
@@ -1921,14 +2048,13 @@ def inference_tracker_setup_execute(self: ActionInstance):
     # This is the existing Docker run command
     worker_cmd = (
         f"docker run -d --pull=always --net=host "
-         f"--cidfile ./{self.action_record_id}.cid "
-        f"--name inference-tracker-worker "
+        f"--name {container_name} "
         f"-v matrice_myvol:/matrice_data "
         f'-e ENV="{os.environ.get("ENV", "prod")}" '
         f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
         f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
         f'-e ACTION_ID="{self.action_record_id}" '
-        f' --restart=unless-stopped '
+        f'--restart=unless-stopped '
         f"{image}"
     )
@@ -1950,9 +2076,11 @@ def video_storage_setup_execute(self: ActionInstance):
     self.setup_action_requirements(action_details)
+    container_name = f"video_storage_{self.action_record_id}"
     if action_details["actionDetails"].get("containerId"):
         logging.info(
-            "Using existing container ID for inference tracker: %s",
+            "Using existing container ID for video storage: %s",
             action_details["actionDetails"]["containerId"],
         )
         self.docker_container = action_details["actionDetails"]["containerId"]
@@ -1963,14 +2091,13 @@ def video_storage_setup_execute(self: ActionInstance):
     # This is the existing Docker run command
     worker_cmd = (
         f"docker run -d --pull=always --net=host "
-         f"--cidfile ./{self.action_record_id}.cid "
-        f"--name media_server "
+        f"--name {container_name} "
         f"-v matrice_myvol:/matrice_data "
         f'-e ENV="{os.environ.get("ENV", "prod")}" '
         f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
         f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
         f'-e ACTION_ID="{self.action_record_id}" '
-        f' --restart=unless-stopped '
+        f'--restart=unless-stopped '
         f"{image}"
     )

{matrice_compute-0.1.37 → matrice_compute-0.1.38}/src/matrice_compute/resources_tracker.py RENAMED Viewed

@@ -916,14 +916,27 @@ class ResourcesTracker:
         gpu_count = 0
         for gpu in gpu_data['gpus']:
-            gpu_memory_free += gpu['memory_total'] - gpu['memory_used']
+            # Be defensive: nvidia-smi can occasionally report N/A/0 for total while used is numeric,
+            # which would otherwise produce negative "free" memory.
+            total_mb = gpu.get('memory_total', 0) or 0
+            used_mb = gpu.get('memory_used', 0) or 0
+            free_mb = total_mb - used_mb
+            if free_mb < 0:
+                logging.debug(
+                    "Negative GPU free memory computed (gpu_idx=%s total_mb=%s used_mb=%s); clamping to 0",
+                    gpu.get('idx'),
+                    total_mb,
+                    used_mb,
+                )
+                free_mb = 0
+            gpu_memory_free += free_mb
             gpu_utilization += gpu['utilization']
             gpu_count += 1
         if gpu_count > 0:
             gpu_utilization /= gpu_count
-        return gpu_memory_free, gpu_utilization
+        return max(0, gpu_memory_free), gpu_utilization
     @log_errors(default_return=(0, 0.0), raise_exception=False, log_error=False)
     def _get_gpu_resources_direct(self) -> Tuple[int, float]:
@@ -1218,7 +1231,7 @@ class MachineResourcesTracker:
             availableCPU=available_cpu,
             availableMemory=available_memory,
             availableGPU=100 - gpu_utilization,
-            availableGPUMemory=gpu_memory_free,
+            availableGPUMemory=max(0, gpu_memory_free),
         )
         if err is not None:
             logging.error(