matrice-compute 0.1.34__tar.gz → 0.1.36__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {matrice_compute-0.1.34 → matrice_compute-0.1.36}/PKG-INFO +1 -1
- {matrice_compute-0.1.34 → matrice_compute-0.1.36}/matrice_compute.egg-info/PKG-INFO +1 -1
- {matrice_compute-0.1.34 → matrice_compute-0.1.36}/src/matrice_compute/action_instance.py +235 -501
- {matrice_compute-0.1.34 → matrice_compute-0.1.36}/LICENSE.txt +0 -0
- {matrice_compute-0.1.34 → matrice_compute-0.1.36}/README.md +0 -0
- {matrice_compute-0.1.34 → matrice_compute-0.1.36}/matrice_compute.egg-info/SOURCES.txt +0 -0
- {matrice_compute-0.1.34 → matrice_compute-0.1.36}/matrice_compute.egg-info/dependency_links.txt +0 -0
- {matrice_compute-0.1.34 → matrice_compute-0.1.36}/matrice_compute.egg-info/not-zip-safe +0 -0
- {matrice_compute-0.1.34 → matrice_compute-0.1.36}/matrice_compute.egg-info/top_level.txt +0 -0
- {matrice_compute-0.1.34 → matrice_compute-0.1.36}/pyproject.toml +0 -0
- {matrice_compute-0.1.34 → matrice_compute-0.1.36}/setup.cfg +0 -0
- {matrice_compute-0.1.34 → matrice_compute-0.1.36}/setup.py +0 -0
- {matrice_compute-0.1.34 → matrice_compute-0.1.36}/src/matrice_compute/__init__.py +0 -0
- {matrice_compute-0.1.34 → matrice_compute-0.1.36}/src/matrice_compute/actions_manager.py +0 -0
- {matrice_compute-0.1.34 → matrice_compute-0.1.36}/src/matrice_compute/actions_scaledown_manager.py +0 -0
- {matrice_compute-0.1.34 → matrice_compute-0.1.36}/src/matrice_compute/compute_operations_handler.py +0 -0
- {matrice_compute-0.1.34 → matrice_compute-0.1.36}/src/matrice_compute/instance_manager.py +0 -0
- {matrice_compute-0.1.34 → matrice_compute-0.1.36}/src/matrice_compute/instance_utils.py +0 -0
- {matrice_compute-0.1.34 → matrice_compute-0.1.36}/src/matrice_compute/prechecks.py +0 -0
- {matrice_compute-0.1.34 → matrice_compute-0.1.36}/src/matrice_compute/py.typed +0 -0
- {matrice_compute-0.1.34 → matrice_compute-0.1.36}/src/matrice_compute/resources_tracker.py +0 -0
- {matrice_compute-0.1.34 → matrice_compute-0.1.36}/src/matrice_compute/scaling.py +0 -0
- {matrice_compute-0.1.34 → matrice_compute-0.1.36}/src/matrice_compute/shutdown_manager.py +0 -0
- {matrice_compute-0.1.34 → matrice_compute-0.1.36}/src/matrice_compute/task_utils.py +0 -0
src/matrice_compute/action_instance.py (+235 −501):

```diff
@@ -10,7 +10,6 @@ import signal
 import urllib.request
 from matrice_compute.instance_utils import (
     get_gpu_with_sufficient_memory_for_action,
-    get_gpu_config_for_deployment,
     get_decrypted_access_key_pair,
     get_max_file_system,
     get_best_service_ip_and_network,
```
```diff
@@ -27,10 +26,6 @@ from matrice_common.utils import log_errors
 class ActionInstance:
     """Base class for tasks that run in Action containers."""

-    # Class-level dictionary to track deployed services and their ports
-    # Key: _idService, Value: {"triton_ports": "port1,port2,port3", "is_first": False}
-    _deployed_services = {}
-
     def __init__(self, scaling: Scaling, action_info: dict):
         """Initialize an action instance.

```
```diff
@@ -90,67 +85,6 @@ class ActionInstance:
             raise ValueError(f"Unknown action type: {self.action_type}")
         self.task = self.actions_map[self.action_type]

-    @classmethod
-    def is_first_deployment_for_service(cls, service_id):
-        """Check if this is the first deployment for a given service.
-
-        Args:
-            service_id (str): Service ID (_idService)
-
-        Returns:
-            bool: True if this is the first deployment, False otherwise
-        """
-        if not service_id:
-            return False
-        return service_id not in cls._deployed_services
-
-    @classmethod
-    def get_or_create_triton_ports(cls, service_id, scaling_instance):
-        """Get existing TRITON_PORTS for a service or create new ones.
-
-        Args:
-            service_id (str): Service ID (_idService)
-            scaling_instance: Scaling instance to get open ports
-
-        Returns:
-            str: Comma-separated string of 3 port numbers (e.g., "8001,8002,8003")
-        """
-        if not service_id:
-            # No service_id, generate new ports
-            port1 = scaling_instance.get_open_port()
-            port2 = scaling_instance.get_open_port()
-            port3 = scaling_instance.get_open_port()
-            return f"{port1},{port2},{port3}"
-
-        # Check if ports already exist for this service
-        if service_id in cls._deployed_services:
-            triton_ports = cls._deployed_services[service_id]["triton_ports"]
-            logging.info(
-                "Reusing TRITON_PORTS for service %s: %s",
-                service_id,
-                triton_ports
-            )
-            return triton_ports
-
-        # First deployment: generate new ports and store them
-        port1 = scaling_instance.get_open_port()
-        port2 = scaling_instance.get_open_port()
-        port3 = scaling_instance.get_open_port()
-        triton_ports = f"{port1},{port2},{port3}"
-
-        # Store for future use
-        cls._deployed_services[service_id] = {
-            "triton_ports": triton_ports,
-            "is_first": False
-        }
-
-        logging.info(
-            "First deployment for service %s - generated TRITON_PORTS: %s",
-            service_id,
-            triton_ports
-        )
-        return triton_ports
-
     @log_errors(default_return={}, raise_exception=True, log_error=False)
     def _init_credentials(self):
         """Initialize Matrice credentials.
```
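Note: 0.1.36 deletes the class-level `_deployed_services` registry together with both helpers, so Triton ports are no longer cached and reused per `_idService`. For orientation only, here is a minimal, hypothetical sketch of picking three free ports with the standard library, standing in for the removed `scaling_instance.get_open_port()` calls; nothing below exists in the package:

```python
# Hypothetical sketch, not package code: let the OS pick free TCP ports,
# roughly what the removed get_or_create_triton_ports() obtained from
# scaling_instance.get_open_port(). Sequential picks can collide under load.
import socket

def pick_open_port() -> int:
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        s.bind(("", 0))  # port 0 means "any free port"
        return s.getsockname()[1]

triton_ports = ",".join(str(pick_open_port()) for _ in range(3))
print(triton_ports)  # e.g. "42815,42816,42817"
```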
```diff
@@ -297,7 +231,7 @@ class ActionInstance:
                 getattr(self, "action_record_id", "unknown"),
             )
         else:
-            logging.
+            logging.debug(
                 "No additional logs to send for action %s",
                 getattr(self, "action_record_id", "unknown"),
             )
```
```diff
@@ -352,13 +286,13 @@ class ActionInstance:
         ).get("gpuMemory", 0)

         logging.info(
-            "Action %s requires GPU with %d MB memory - selecting GPU(s)
+            "Action %s requires GPU with %d MB memory - selecting best-fit GPU(s)",
             action_id,
             required_memory
         )

         try:
-            # Get the GPU(s) with
+            # Get the best-fit GPU(s) with sufficient memory
             gpu_indices = get_gpu_with_sufficient_memory_for_action(
                 action_details=action_details
             )
```
```diff
@@ -412,7 +346,6 @@ class ActionInstance:
         destination_workspace_path: str = "/usr/src/workspace",
         docker_workdir: str = "",
         extra_pkgs: list = [],
-        container_name: str = "",
     ):
         """Build base Docker command with common options.

```
```diff
@@ -427,7 +360,6 @@ class ActionInstance:
             destination_workspace_path (str): Container workspace path
             docker_workdir (str): Docker working directory
             extra_pkgs (list): List of extra packages to install
-            container_name (str): Docker container name (format: {action_type}_{action_id})
         Returns:
             str: Base Docker command
         """
```
```diff
@@ -492,16 +424,17 @@ class ActionInstance:
             ]
         )

-        #
-
+        # if the service provider is local, then put --restart unless-stopped
+        if os.environ.get("SERVICE_PROVIDER") in ("local", "LOCAL"):
+            env_exports += " && export DOCKER_RESTART_POLICY='--restart unless-stopped' "

         cmd_parts = [
-            f"docker run
-            name_option,
+            f"docker run {use_gpu} ",
             network_config,
             *[f"-e {key}={shlex.quote(str(value))}" for key, value in env_vars.items()],
             *volumes,
             # Container configuration and startup commands
+            f"--cidfile ./{self.action_record_id}.cid ",
             f"--shm-size=30G --pull=always {shlex.quote(self.docker_container)}",
             f'/bin/bash -c "cd {docker_workdir} && '
             f"{env_exports} && "
```
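The rewritten base command now exports a `DOCKER_RESTART_POLICY` variable when `SERVICE_PROVIDER` is `local`/`LOCAL`, and always writes a `--cidfile` so the container ID can be read back (see the `start_process` hunk below). A sketch of the flag assembly, assuming the exported variable is expanded by whichever inner `docker run` consumes it; the image and paths are placeholders:

```python
# Sketch only: env-driven restart policy plus --cidfile, mirroring the
# intent of the new env_exports/cmd_parts code. Values are illustrative.
import os
import shlex

restart = (
    "--restart unless-stopped"
    if os.environ.get("SERVICE_PROVIDER") in ("local", "LOCAL")
    else ""
)
cmd = f"docker run -d {restart} --cidfile ./example.cid alpine:latest sleep 60"
print(shlex.split(cmd))  # tokenized argv, as subprocess.Popen would receive it
```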
```diff
@@ -889,34 +822,6 @@ class ActionInstance:
             job_params=action_details["jobParams"],
         )

-    @staticmethod
-    def container_exists(container_id: str) -> bool:
-        """Check if a Docker container exists.
-
-        Args:
-            container_id (str): Container ID or name to check
-
-        Returns:
-            bool: True if container exists, False otherwise
-        """
-        if not container_id:
-            return False
-        try:
-            result = subprocess.run(
-                ["docker", "inspect", container_id],
-                capture_output=True,
-                text=True,
-                timeout=10
-            )
-            return result.returncode == 0
-        except Exception as e:
-            logging.warning(
-                "Error checking if container %s exists: %s",
-                container_id,
-                str(e)
-            )
-            return False
-
     @log_errors(raise_exception=True)
     def start_process(self, cmd, log_name):
         """Start the process and initialize logging.
```
```diff
@@ -931,54 +836,60 @@ class ActionInstance:
         self.cmd = cmd
         self.log_path = f"{self.get_log_path()}/{log_name}_{self.action_record_id}.txt"

-
-
-
-
-
-
-
-
-
-        # Use a longer timeout for docker run since --pull=always may need to
-        # download large images on first run. Default: 30 minutes (1800 seconds)
-        # Can be configured via DOCKER_START_TIMEOUT_SECONDS environment variable
-        docker_start_timeout = int(os.environ.get("DOCKER_START_TIMEOUT_SECONDS", 1800))
-        logging.info(
-            "Waiting for docker container to start for action %s (timeout: %d seconds)",
-            self.action_record_id,
-            docker_start_timeout,
-        )
-        stdout, stderr = process.communicate(timeout=docker_start_timeout)
+        with open(self.log_path, "wb") as out:
+            self.process = subprocess.Popen(
+                shlex.split(self.cmd),
+                stdout=out,
+                stderr=out,
+                env={**os.environ},
+                start_new_session=True,
+            )

-
+        self.container_id = None
+
+        cid_file_path = f"./{self.action_record_id}.cid"
+        max_retries = 5
+        retry_delay = 1  # seconds
+        for attempt in range(max_retries):
+            try:
+                with open(cid_file_path, "r") as cid_file:
+                    container_id = cid_file.read().strip()
+                self.container_id = container_id
+                logging.info(
+                    "Started process for action %s with container ID: %s",
+                    self.action_record_id,
+                    self.container_id,
+                )
+                break
+            except FileNotFoundError:
+                logging.warning(
+                    "CID file not found for action %s, attempt %d/%d",
+                    self.action_record_id,
+                    attempt + 1,
+                    max_retries,
+                )
+                time.sleep(retry_delay)
+            except Exception as e:
+                logging.error(
+                    "Error reading CID file for action %s: %s",
+                    self.action_record_id,
+                    str(e),
+                )
+                time.sleep(retry_delay)
+        else:
             logging.error(
-                "
+                "Failed to read CID file for action %s after %d attempts",
                 self.action_record_id,
-
+                max_retries,
             )
-            raise
-
-        self.container_id = stdout.strip()
-        logging.info(
-            "Started container for action %s with ID: %s",
-            self.action_record_id,
-            self.container_id,
-        )
-
-        # Start following container logs in background
-        self.process = subprocess.Popen(
-            ["docker", "logs", "-f", self.container_id],
-            stdout=open(self.log_path, "wb"),
-            stderr=subprocess.STDOUT,
-            start_new_session=True,
-        )
+            raise Exception("Failed to start process: CID file not found")

-        #
+        # report container id to scaling service
         self.scaling.update_action_container_id(
             action_record_id=self.action_record_id,
             container_id=self.container_id,
         )
+

     @log_errors(raise_exception=False)
     def start_logger(self):
```
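`start_process` no longer blocks on `process.communicate()` and then tails `docker logs -f`; it launches the command with its output redirected to the log file and recovers the container ID from the `--cidfile` the Docker CLI writes, retrying up to five times. A self-contained sketch of that handshake, with illustrative paths and image (needs a local Docker daemon to actually run):

```python
# Sketch of the new launch-then-read-CID flow; example.cid/example.log and
# the alpine image are placeholders, not the package's values.
import shlex
import subprocess
import time

cid_path = "./example.cid"
cmd = f"docker run -d --cidfile {cid_path} alpine:latest sleep 30"

with open("./example.log", "wb") as out:
    proc = subprocess.Popen(shlex.split(cmd), stdout=out, stderr=out,
                            start_new_session=True)

container_id = None
for attempt in range(5):
    try:
        with open(cid_path) as f:
            container_id = f.read().strip()
        break
    except FileNotFoundError:
        time.sleep(1)  # the CLI may not have written the file yet

print(proc.pid, container_id)
```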
```diff
@@ -1139,8 +1050,7 @@ def data_preparation_execute(
         "Started pulling Docker image with PID: %s",
         process.pid,
     )
-
-    cmd = f'{self.get_base_docker_cmd(work_fs, destination_workspace_path="/usr/src/app/workspace", docker_workdir="/usr/src/app/workspace", extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/data_preparation.py {self.action_record_id} "'
+    cmd = f'{self.get_base_docker_cmd(work_fs, destination_workspace_path="/usr/src/app/workspace", docker_workdir="/usr/src/app/workspace", extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/data_preparation.py {self.action_record_id} "'
     logging.info("cmd is: %s", cmd)
     self.start(cmd, "data_preparation_log")
```
```diff
@@ -1169,8 +1079,7 @@ def data_processing_execute(self: ActionInstance):
         service="bg-job-scheduler",
         job_params=action["jobParams"],
     )
-
-    cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/main.py {self.action_record_id} "'
+    cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/main.py {self.action_record_id} "'
     logging.info("cmd: %s", cmd)
     self.start(cmd, "data_processing_log")
```
```diff
@@ -1183,8 +1092,7 @@ def data_split_execute(self: ActionInstance):
     if not action_details:
         return
     self.setup_action_requirements(action_details, work_fs, model_family="")
-
-    cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/data_split.py {self.action_record_id} "'
+    cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/data_split.py {self.action_record_id} "'
     logging.info("cmd: %s", cmd)
     self.start(cmd, "data_split")
```
```diff
@@ -1199,8 +1107,7 @@ def dataset_annotation_execute(
     if not action_details:
         return
     self.setup_action_requirements(action_details, work_fs)
-
-    cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/dataset_annotation.py {self.action_record_id} "'
+    cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/dataset_annotation.py {self.action_record_id} "'
     logging.info("cmd: %s", cmd)
     self.start(cmd, "dataset_annotation")
```
```diff
@@ -1215,8 +1122,7 @@ def dataset_augmentation_execute(
     if not action_details:
         return
     self.setup_action_requirements(action_details, work_fs)
-
-    cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/data_augmentation.py {self.action_record_id} "'
+    cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/data_augmentation.py {self.action_record_id} "'
     logging.info("cmd: %s", cmd)
     self.start(cmd, "dataset_augmentation")
```
```diff
@@ -1232,8 +1138,7 @@ def augmentation_server_creation_execute(
     if not action_details:
         return
     self.setup_action_requirements(action_details, work_fs)
-
-    cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/aug_server.py {self.action_record_id} {external_port} "'
+    cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/aug_server.py {self.action_record_id} {external_port} "'
     logging.info("cmd: %s", cmd)
     self.start(cmd, "augmentation_setup")
```
```diff
@@ -1254,42 +1159,31 @@ def database_setup_execute(self: ActionInstance):

     project_id = action_details["_idProject"]

-
-
-
+    if action_details["actionDetails"].get("containerId"):
+        logging.info(
+            "Using existing container ID for inference tracker: %s",
+            action_details["actionDetails"]["containerId"],
+        )
+        self.docker_container = action_details["actionDetails"]["containerId"]
+        cmd = "docker restart " + self.docker_container
+        self.start(cmd, "qdrant_setup")

-
-
-
-        mongodb_container_exists = ActionInstance.container_exists(existing_container_id)
-        qdrant_container_exists = ActionInstance.container_exists(qdrant_container_name)
+        #qdrant restart
+        qdrant_cmd = "docker restart qdrant"
+        self.start(qdrant_cmd, 'qdrant_setup')

-
-
-
-
-        )
-        self.docker_container = existing_container_id
-        cmd = "docker restart " + self.docker_container
-        self.start(cmd, "qdrant_setup")
+        return
+
+
+    dbPath =action_details["jobParams"].get("dbPath","/host/data/path/mongodb_data")

-        # qdrant restart
-        qdrant_cmd = f"docker restart {qdrant_container_name}"
-        self.start(qdrant_cmd, "qdrant_setup")
-        return
-    else:
-        logging.warning(
-            "Container(s) not found (mongodb=%s, qdrant=%s). Creating new containers.",
-            mongodb_container_exists,
-            qdrant_container_exists
-        )
-        # Fall through to create new containers

     # MongoDB container with --net=host (Port: 27020:27017)
     cmd = (
         f"docker run --pull=always --net=host "
-        f"
-        f"
+        f"-v {dbPath}:{dbPath} "
+        f"--name database_setup_{self.action_record_id} "
+        f"-v /var/run/docker.sock:/var/run/docker.sock "
         f"--cidfile ./{self.action_record_id}.cid "
         f"-e ACTION_RECORD_ID={self.action_record_id} "
         f"-e MATRICE_ACCESS_KEY_ID={self.matrice_access_key_id} "
```
```diff
@@ -1298,22 +1192,12 @@ def database_setup_execute(self: ActionInstance):
         f'-e ENV="{os.environ.get("ENV", "prod")}" '
         f"{image} "
     )
-    logging.info("Starting
+    logging.info("Starting DB container (Port: 27020:27017): %s", cmd)

-    # Qdrant container with --net=host (Port: 6334)
-    qdrant_cmd = (
-        f"docker run --pull=always --net=host "
-        f"--name {qdrant_container_name} "
-        f"-v matrice_myvol:/matrice_data "
-        f"{'qdrant/qdrant:latest'} "
-    )
-    logging.info("Starting Qdrant container (Port: 6334): %s", qdrant_cmd)

     # Docker Command run
     self.start(cmd, "database_setup")

-    # Docker for qdrant
-    self.start(qdrant_cmd, 'qdrant_setup')

 @log_errors(raise_exception=False)
 def facial_recognition_setup_execute(self: ActionInstance):
```
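All the `*_setup_execute` functions now share this branch: if the action record carries a `containerId`, issue `docker restart` and return; otherwise fall through and create containers. Unlike 0.1.34, there is no `container_exists` pre-check anymore, so a stale ID now makes the `docker restart` itself fail instead of falling through to container creation. A condensed, hypothetical restatement of the branch (`build_run_cmd` stands in for the real command construction; not package code):

```python
# Hypothetical condensation of the restart-or-create pattern used above.
def restart_or_create(action_details: dict, build_run_cmd) -> str:
    existing = action_details.get("actionDetails", {}).get("containerId")
    if existing:
        # Reuse the previously recorded container rather than creating one.
        return f"docker restart {existing}"
    return build_run_cmd()

print(restart_or_create({"actionDetails": {"containerId": "abc123"}},
                        lambda: "docker run -d redis:latest"))
# -> docker restart abc123
```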
```diff
@@ -1329,36 +1213,28 @@ def facial_recognition_setup_execute(self: ActionInstance):

     self.setup_action_requirements(action_details)

-
-
-
-
-
-
-
-
-
-        cmd = "docker restart " + self.docker_container
-        self.start(cmd, "facial_recognition_setup")
-        return
-    else:
-        logging.warning(
-            "Container %s not found. Creating new container.",
-            existing_container_id
-        )
-        # Fall through to create new container
+    if action_details["actionDetails"].get("containerId"):
+        logging.info(
+            "Using existing container ID for facial recognition worker: %s",
+            action_details["actionDetails"]["containerId"],
+        )
+        self.docker_container = action_details["actionDetails"]["containerId"]
+        cmd = "docker restart " + self.docker_container
+        self.start(cmd, "facial_recognition_setup")
+        return

     # Facial recognition worker container with --net=host (Port: 8081)
-    container_name = f"facial_recognition_{self.action_record_id}"
     worker_cmd = (
         f"docker run -d --pull=always --net=host "
-        f"--name
-
+        f"--name worker "
+        f"--cidfile ./{self.action_record_id}.cid "
         f"-v matrice_myvol:/matrice_data "
+        f"--cidfile ./{self.action_record_id}.cid "
         f'-e ENV="{os.environ.get("ENV", "prod")}" '
         f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
         f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
         f'-e ACTION_ID="{self.action_record_id}" '
+        f' --restart=unless-stopped '
         f"{image}"
     )
     logging.info("Starting facial recognition worker (Port: 8081): %s", worker_cmd)
```
```diff
@@ -1380,30 +1256,20 @@ def lpr_setup_execute(self: ActionInstance):

     self.setup_action_requirements(action_details)

-
-
-
-
-
-
-
-
-
-        cmd = "docker restart " + self.docker_container
-        self.start(cmd, "lpr_setup")
-        return
-    else:
-        logging.warning(
-            "Container %s not found. Creating new container.",
-            existing_container_id
-        )
-        # Fall through to create new container
+    if action_details["actionDetails"].get("containerId"):
+        logging.info(
+            "Using existing container ID for LPR worker: %s",
+            action_details["actionDetails"]["containerId"],
+        )
+        self.docker_container = action_details["actionDetails"]["containerId"]
+        cmd = "docker restart " + self.docker_container
+        self.start(cmd, "lpr_setup")
+        return

     # LPR worker container with --net=host (Port: 8082)
-    container_name = f"lpr_{self.action_record_id}"
     worker_cmd = (
         f"docker run -d --net=host --pull=always "
-        f"--name
+        f"--name lpr-worker "
         f"--cidfile ./{self.action_record_id}.cid "
         f"-v matrice_myvol:/matrice_data "
         f'-e ENV="{os.environ.get("ENV", "prod")}" '
```
```diff
@@ -1411,6 +1277,7 @@ def lpr_setup_execute(self: ActionInstance):
         f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
         f'-e ACTION_ID="{self.action_record_id}" '
         f'-e PORT=8082 '
+        f' --restart=unless-stopped '
         f"{image}"
     )
     logging.info("Starting LPR worker (Port: 8082): %s", worker_cmd)
```
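`--restart=unless-stopped` tells the Docker daemon to bring the container back after crashes and daemon restarts unless it was explicitly stopped; 0.1.36 adds it to every long-lived worker in this file. One way to confirm the policy took effect, using the standard `docker inspect` CLI (the container name is illustrative, matching the hunk above):

```python
# Sketch: read back a container's restart policy via docker inspect.
import json
import subprocess

out = subprocess.run(
    ["docker", "inspect", "--format", "{{json .HostConfig.RestartPolicy}}",
     "lpr-worker"],
    capture_output=True, text=True, check=False,
)
if out.returncode == 0:
    # e.g. {"Name": "unless-stopped", "MaximumRetryCount": 0}
    print(json.loads(out.stdout))
```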
```diff
@@ -1441,34 +1308,25 @@ def inference_ws_server_execute(self: ActionInstance):

     logging.info(f"Inference WebSocket server will use IP: {ws_host} on port 8102 (use_host_network={use_host_network})")

-
-
-
-
-
-
-
-
-
-        cmd = "docker restart " + self.docker_container
-        self.start(cmd, "inference_ws_server")
-        return
-    else:
-        logging.warning(
-            "Container %s not found. Creating new container.",
-            existing_container_id
-        )
-        # Fall through to create new container
+    if action_details["actionDetails"].get("containerId"):
+        logging.info(
+            "Using existing container ID for inference WebSocket server: %s",
+            action_details["actionDetails"]["containerId"],
+        )
+        self.docker_container = action_details["actionDetails"]["containerId"]
+        cmd = "docker restart " + self.docker_container
+        self.start(cmd, "inference_ws_server")
+        return

     # Inference WebSocket server with --net=host (Port: 8102)
-    container_name = f"inference_ws_{self.action_record_id}"
     worker_cmd = (
         f"docker run -d --pull=always --net=host "
-        f"--name
+        f"--name inference "
         f"--cidfile ./{self.action_record_id}.cid "
         f'-e ENV="{os.environ.get("ENV", "prod")}" '
         f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
         f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
+        f' --restart=unless-stopped '
         f"{image} "
         f"./app "
         f"{self.action_record_id} "
```
```diff
@@ -1499,30 +1357,20 @@ def fe_fs_streaming_execute(self: ActionInstance):

     logging.info(f"Frontend streaming will connect to WebSocket at: {ws_url}")

-
-
-
-
-
-
-
-
-
-
-        self.start(cmd, "fe_fs_streaming")
-        return
-    else:
-        logging.warning(
-            "Container %s not found. Creating new container.",
-            existing_container_id
-        )
-        # Fall through to create new container
-
+    if action_details["actionDetails"].get("containerId"):
+        logging.info(
+            "Using existing container ID for frontend streaming: %s",
+            action_details["actionDetails"]["containerId"],
+        )
+        self.docker_container = action_details["actionDetails"]["containerId"]
+        cmd = "docker restart " + self.docker_container
+        self.start(cmd, "fe_fs_streaming")
+        return
+
     # Frontend streaming with --net=host (Port: 3000)
-    container_name = f"fe_streaming_{self.action_record_id}"
     worker_cmd = (
         f"docker run -d --pull=always --net=host "
-        f"--name
+        f"--name fe_streaming "
         f"--cidfile ./{self.action_record_id}.cid "
         f"-v matrice_myvol:/matrice_data "
         f'-e ENV="{os.environ.get("ENV", "prod")}" '
```
```diff
@@ -1530,6 +1378,7 @@ def fe_fs_streaming_execute(self: ActionInstance):
         f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
         f"-e PORT=3000 "
         f'-e WS_HOST="{ws_url}" '
+        f' --restart=unless-stopped '
         f"{image}"
     )
     logging.info("Starting frontend streaming (Port: 3000) with WS_HOST=%s: %s", ws_url, worker_cmd)
```
```diff
@@ -1554,30 +1403,20 @@ def fe_analytics_service_execute(self: ActionInstance):

     project_id = action_details["_idProject"]

-
-
-
-
-
-
-
-
-
-
-        self.start(cmd, "fe_analytics_service")
-        return
-    else:
-        logging.warning(
-            "Container %s not found. Creating new container.",
-            existing_container_id
-        )
-        # Fall through to create new container
-
+    if action_details["actionDetails"].get("containerId"):
+        logging.info(
+            "Using existing container ID for frontend analytics service: %s",
+            action_details["actionDetails"]["containerId"],
+        )
+        self.docker_container = action_details["actionDetails"]["containerId"]
+        cmd = "docker restart " + self.docker_container
+        self.start(cmd, "fe_analytics_service")
+        return
+
     # Frontend analytics service with --net=host (Port: 3001)
-    container_name = f"fe_analytics_{self.action_record_id}"
     worker_cmd = (
         f"docker run -d --pull=always --net=host "
-        f"--name
+        f"--name fe-analytics "
         f"--cidfile ./{self.action_record_id}.cid "
         f'-e NEXT_PUBLIC_DEPLOYMENT_ENV="{os.environ.get("ENV", "prod")}" '
         f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
```
```diff
@@ -1585,6 +1424,7 @@ def fe_analytics_service_execute(self: ActionInstance):
         f'-e ACTION_ID="{self.action_record_id}" '
         f"-e PORT=3001 "
         f'-e PROJECT_ID="{project_id}" '
+        f' --restart=unless-stopped '
         f"{image}"
     )
     logging.info("Starting frontend analytics service (Port: 3001): %s", worker_cmd)
```
```diff
@@ -1609,8 +1449,7 @@ def synthetic_dataset_generation_execute(self: ActionInstance):
     else:
         return
     use_gpu = self.get_gpu_config(action_details)
-
-    cmd = f'{self.get_base_docker_cmd(work_fs=work_fs, use_gpu=use_gpu, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/synthetic_dataset_generation.py {self.action_record_id} "'
+    cmd = f'{self.get_base_docker_cmd(work_fs=work_fs, use_gpu=use_gpu, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/synthetic_dataset_generation.py {self.action_record_id} "'
     logging.info("cmd is: %s", cmd)
     self.start(cmd, "dataset_generation")
```
```diff
@@ -1631,8 +1470,7 @@ def synthetic_data_setup_execute(self: ActionInstance):
     else:
         return
     use_gpu = self.get_gpu_config(action_details)
-
-    cmd = f'{self.get_base_docker_cmd(work_fs=work_fs, use_gpu=use_gpu, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/data_generation.py {self.action_record_id} {external_port} "'
+    cmd = f'{self.get_base_docker_cmd(work_fs=work_fs, use_gpu=use_gpu, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/data_generation.py {self.action_record_id} {external_port} "'
     logging.info("cmd is: %s", cmd)
     self.start(cmd, "synthetic_data_setup")
```
```diff
@@ -1669,60 +1507,31 @@ def redis_setup_execute(self: ActionInstance):

     redis_image = action_details["actionDetails"].get("redis_image", "redis:latest")

-    # Define container names with action_record_id for uniqueness
-    redis_container_name = f"redis_{self.action_record_id}"
-
-    existing_container_id = action_details["actionDetails"].get("containerId")
-    if existing_container_id:
-        # Check if both containers actually exist before trying to restart
-        management_container_exists = ActionInstance.container_exists(existing_container_id)
-        redis_container_exists = ActionInstance.container_exists(redis_container_name)

-
-
-
-
-
-
-
-
+    if action_details["actionDetails"].get("containerId"):
+        logging.info(
+            "Using existing container ID for redis management: %s",
+            action_details["actionDetails"]["containerId"],
+        )
+        self.docker_container = action_details["actionDetails"]["containerId"]
+        cmd = "docker restart " + self.docker_container
+        self.start(cmd, "redis_setup")

-
-
-
-        return
-    else:
-        logging.warning(
-            "Container(s) not found (management=%s, redis=%s). Creating new containers.",
-            management_container_exists,
-            redis_container_exists
-        )
-        # Fall through to create new containers
+        # Redis container restart
+        redis_restart_cmd = "docker restart redis_container"
+        self.start(redis_restart_cmd, "redis")

+        return
+
     # Redis container with --net=host (Port: 6379)
     redis_cmd = (
         f"docker run -d --net=host "
-        f"--name
+        f"--name redis_container "
         f"--restart unless-stopped "
         f"{redis_image} "
-        f"redis-server --bind 0.0.0.0 "
-        f"--appendonly no "
-        f'--save "" '
-        f"--maxmemory 30gb "
-        f"--maxmemory-policy allkeys-lru "
-        f"--io-threads 4 "
-        f"--io-threads-do-reads yes "
-        f"--stream-node-max-bytes 8192 "
-        f"--stream-node-max-entries 1000 "
-        f"--hz 100 "
-        f"--tcp-backlog 2048 "
-        f"--timeout 0 "
-        f"--lazyfree-lazy-eviction yes "
-        f"--lazyfree-lazy-expire yes "
-        f"--lazyfree-lazy-server-del yes "
-        f"--activedefrag yes "
-        f"--requirepass {redis_password}"
+        f"redis-server --bind 0.0.0.0 --appendonly yes --requirepass {redis_password}"
     )
+
     logging.info("Starting Redis container on %s:6379: %s", redis_host, redis_cmd)

     # Start Redis container first
```
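Behavioral note: the 0.1.34 command ran Redis with persistence disabled (`--appendonly no --save ""`) plus extensive memory and IO tuning (30 GB cap, LRU eviction, lazy freeing, active defrag); 0.1.36 keeps only the bind address, AOF persistence (`--appendonly yes`), and the password, so Redis falls back to its defaults for everything else. A quick check of the effective setting on the running container (the password is a placeholder):

```python
# Sketch: query the live config; redis_container matches the new --name,
# "example-password" stands in for the real requirepass value.
import subprocess

subprocess.run(
    ["docker", "exec", "redis_container",
     "redis-cli", "-a", "example-password", "config", "get", "appendonly"],
    check=False,
)  # prints: appendonly / yes
```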
```diff
@@ -1772,8 +1581,7 @@ def deploy_aggregator_execute(
     if not action_details:
         return
     self.setup_action_requirements(action_details, work_fs)
-
-    cmd = f'{self.get_base_docker_cmd(work_fs, container_name=container_name)} python3 /usr/src/app/deploy_aggregator.py {self.action_record_id} "'
+    cmd = f'{self.get_base_docker_cmd(work_fs)} python3 /usr/src/app/deploy_aggregator.py {self.action_record_id} "'
     logging.info("cmd: %s", cmd)
     self.start(cmd, "deploy_aggregator")
```
```diff
@@ -1789,10 +1597,6 @@ def model_deploy_execute(self: ActionInstance):
         return
     action_id = action_details["_id"]
     model_family = action_details["actionDetails"]["modelFamily"]
-
-    # Get the service ID to track deployments
-    service_id = action_details.get("_idService")
-
     self.setup_action_requirements(
         action_details,
         work_fs,
```
```diff
@@ -1800,29 +1604,17 @@ def model_deploy_execute(self: ActionInstance):
         action_id=action_id,
     )

-    #
-
-
-    # Get GPU configuration (uses utility function with fail-safe fallback)
-    use_gpu = get_gpu_config_for_deployment(action_details, is_first_deployment)
-
-    logging.info(
-        "Action %s: Model deployment GPU config: %s (first_deployment=%s)",
-        action_id,
-        use_gpu if use_gpu else "CPU-only",
-        is_first_deployment
-    )
-
-    # Get or create TRITON_PORTS (uses utility method)
-    triton_ports = ActionInstance.get_or_create_triton_ports(service_id, self.scaling)
+    # Get GPU configuration based on requirements and availability
+    # This uses the best-fit algorithm to select the most appropriate GPU(s)
+    use_gpu = self.get_gpu_config(action_details)

-
-
-
-
+    # Override: If GPU is required, use all available GPUs
+    gpuRequired = action_details["actionDetails"].get("gpuRequired", False)
+    if gpuRequired:
+        use_gpu = "--runtime=nvidia --gpus all"

-
-    cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, mount_docker_sock=True, action_id=action_id, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_inference", "matrice_analytics"]
+    extra_env_vars = {"INTERNAL_PORT": internal_port}
+    cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, mount_docker_sock=True, action_id=action_id, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_inference", "matrice_analytics"])} python3 deploy.py {self.action_record_id} {external_port}"'
     logging.info("cmd is: %s", cmd)
     self.start(cmd, "deploy_log")

```
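Deployment GPU selection is simplified: the per-service first-deployment bookkeeping is gone, `get_gpu_config` picks a best-fit device, and a `gpuRequired` flag overrides that with every GPU on the host. A hypothetical, self-contained restatement (`get_gpu_config` is passed in here because the real one lives on `ActionInstance`; not package code):

```python
# Hypothetical condensation of the new GPU flag resolution.
def resolve_gpu_flag(action_details: dict, get_gpu_config) -> str:
    use_gpu = get_gpu_config(action_details)  # best-fit selection
    if action_details["actionDetails"].get("gpuRequired", False):
        use_gpu = "--runtime=nvidia --gpus all"  # claim all host GPUs
    return use_gpu

print(resolve_gpu_flag({"actionDetails": {"gpuRequired": True}},
                       lambda details: ""))  # --runtime=nvidia --gpus all
```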
```diff
@@ -1845,27 +1637,17 @@ def model_train_execute(self: ActionInstance):
         action_id=action_id,
     )

-
-
-
-
-
-
-
-
-
-
-
-        return
-    else:
-        logging.warning(
-            "Container %s not found. Creating new container.",
-            existing_container_id
-        )
-        # Fall through to create new container
-
-    container_name = f"model_train_{self.action_record_id}"
-    cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id, model_key=model_key, container_name=container_name)} python3 train.py {self.action_record_id} "'
+    if action_details["actionDetails"].get("containerId"):
+        logging.info(
+            "Using existing container ID for training: %s",
+            action_details["actionDetails"]["containerId"],
+        )
+        self.docker_container = action_details["actionDetails"]["containerId"]
+        cmd = "docker restart " + self.docker_container
+        self.start(cmd, "train_log")
+        return
+
+    cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id, model_key=model_key)} python3 train.py {self.action_record_id} "'
     logging.info("cmd is: %s", cmd)
     self.start(cmd, "train_log")

```
```diff
@@ -1886,27 +1668,17 @@ def model_eval_execute(self: ActionInstance):
         model_family=model_family,
         action_id=action_id,
     )
-
-
-
-
-
-
-
-
-
-
-
-        return
-    else:
-        logging.warning(
-            "Container %s not found. Creating new container.",
-            existing_container_id
-        )
-        # Fall through to create new container
-
-    container_name = f"model_eval_{self.action_record_id}"
-    cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id, container_name=container_name)} python3 eval.py {self.action_record_id} "'
+    if action_details["actionDetails"].get("containerId"):
+        logging.info(
+            "Using existing container ID for training: %s",
+            action_details["actionDetails"]["containerId"],
+        )
+        self.docker_container = action_details["actionDetails"]["containerId"]
+        cmd = "docker restart " + self.docker_container
+        self.start(cmd, "eval_log")
+        return
+
+    cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id)} python3 eval.py {self.action_record_id} "'
     logging.info("cmd is: %s", cmd)
     self.start(cmd, "eval_log")

```
```diff
@@ -1930,27 +1702,17 @@ def model_export_execute(self: ActionInstance):
         model_family=model_family,
         action_id=action_id,
     )
-
-
-
-
-
-
-
-
-
-
-
-        return
-    else:
-        logging.warning(
-            "Container %s not found. Creating new container.",
-            existing_container_id
-        )
-        # Fall through to create new container
-
-    container_name = f"model_export_{self.action_record_id}"
-    cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id, container_name=container_name)} python3 export.py {self.action_record_id} "'
+    if action_details["actionDetails"].get("containerId"):
+        logging.info(
+            "Using existing container ID for training: %s",
+            action_details["actionDetails"]["containerId"],
+        )
+        self.docker_container = action_details["actionDetails"]["containerId"]
+        cmd = "docker restart " + self.docker_container
+        self.start(cmd, "export_log")
+        return
+
+    cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id)} python3 export.py {self.action_record_id} "'
     logging.info("cmd is: %s", cmd)
     self.start(cmd, "export_log")

```
```diff
@@ -1966,8 +1728,7 @@ def image_build_execute(self: ActionInstance):
     action_id = action_details["_id"]
     internal_api_key = self.get_internal_api_key(action_id)
     extra_env_vars = {"MATRICE_INTERNAL_API_KEY": internal_api_key}
-
-    cmd = f'{self.get_base_docker_cmd(mount_docker_sock=True, extra_env_vars=extra_env_vars, container_name=container_name)} python3 main.py {model_family_id} {action_id}"'
+    cmd = f'{self.get_base_docker_cmd(mount_docker_sock=True, extra_env_vars=extra_env_vars)} python3 main.py {model_family_id} {action_id}"'
     logging.info("cmd is: %s", cmd)
     self.start(cmd, "image_build_log")
```
```diff
@@ -1979,8 +1740,7 @@ def resource_clone_execute(self: ActionInstance):
     if not action_details:
         return
     self.setup_action_requirements(action_details)
-
-    cmd = f'{self.get_base_docker_cmd(container_name=container_name)} python3 main.py {self.action_record_id} "'
+    cmd = f'{self.get_base_docker_cmd()} python3 main.py {self.action_record_id} "'
     logging.info("cmd is: %s", cmd)
     self.start(cmd, "resource_clone")
```
```diff
@@ -1996,27 +1756,17 @@ def streaming_gateway_execute(self: ActionInstance):
     self.docker_container = (
         f"aiforeveryone/streaming-gateway:{os.environ.get('ENV', 'prod')}"
     )
-
-
-
-
-
-
-
-
-
-
-
-        return
-    else:
-        logging.warning(
-            "Container %s not found. Creating new container.",
-            existing_container_id
-        )
-        # Fall through to create new container
-
-    container_name = f"streaming_gateway_{self.action_record_id}"
-    cmd = f'{self.get_base_docker_cmd(extra_pkgs=["matrice_streaming"], container_name=container_name)} python3 /usr/src/app/streaming_gateway.py {self.action_record_id} "'
+    if action_details["actionDetails"].get("containerId"):
+        logging.info(
+            "Using existing container ID for training: %s",
+            action_details["actionDetails"]["containerId"],
+        )
+        self.docker_container = action_details["actionDetails"]["containerId"]
+        cmd = "docker restart " + self.docker_container
+        self.start(cmd, "streaming_gateway")
+        return
+
+    cmd = f'{self.get_base_docker_cmd(extra_pkgs=["matrice_streaming"])} python3 /usr/src/app/streaming_gateway.py {self.action_record_id} "'
     logging.info("cmd is: %s", cmd)
     self.start(cmd, "streaming_gateway")

```
```diff
@@ -2110,24 +1860,16 @@ def kafka_setup_execute(self: ActionInstance):
     else:
         pkgs = f"matrice_common matrice"

-
-
-
-
-
-
-
-
-
-
-        self.start(cmd, "kafka_setup")
-        return
-    else:
-        logging.warning(
-            "Container %s not found. Creating new container.",
-            existing_container_id
-        )
-        # Fall through to create new container
+    if action_details["actionDetails"].get("containerId"):
+        logging.info(
+            "Using existing container ID for training: %s",
+            action_details["actionDetails"]["containerId"],
+        )
+        self.docker_container = action_details["actionDetails"]["containerId"]
+        cmd = "docker restart " + self.docker_container
+        self.start(cmd, "kafka_setup")
+        return
+

     # Kafka container with --net=host (Ports: 9092, 9093)
     cmd = (
```
```diff
@@ -2164,36 +1906,27 @@ def inference_tracker_setup_execute(self: ActionInstance):

     self.setup_action_requirements(action_details)

-
-
-
-
-
-
-
-
-
-
-        self.start(cmd, "inference_tracker_setup")
-        return
-    else:
-        logging.warning(
-            "Container %s not found. Creating new container.",
-            existing_container_id
-        )
-        # Fall through to create new container
-
+    if action_details["actionDetails"].get("containerId"):
+        logging.info(
+            "Using existing container ID for inference tracker: %s",
+            action_details["actionDetails"]["containerId"],
+        )
+        self.docker_container = action_details["actionDetails"]["containerId"]
+        cmd = "docker restart " + self.docker_container
+        self.start(cmd, "inference_tracker_setup")
+        return
+
     # This is the existing Docker run command
-    container_name = f"inference_tracker_{self.action_record_id}"
     worker_cmd = (
         f"docker run -d --pull=always --net=host "
-
-        f"--name
+        f"--cidfile ./{self.action_record_id}.cid "
+        f"--name inference-tracker-worker "
         f"-v matrice_myvol:/matrice_data "
         f'-e ENV="{os.environ.get("ENV", "prod")}" '
         f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
         f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
         f'-e ACTION_ID="{self.action_record_id}" '
+        f' --restart=unless-stopped '
         f"{image}"
     )

```
```diff
@@ -2235,6 +1968,7 @@ def video_storage_setup_execute(self: ActionInstance):
         f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
         f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
         f'-e ACTION_ID="{self.action_record_id}" '
+        f' --restart=unless-stopped '
         f"{image}"
     )
```