PyPI - matrice-compute - Versions diffs - 0.1.26__tar.gz → 0.1.28__tar.gz - Mend

matrice-compute 0.1.26__tar.gz → 0.1.28__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (25) hide show

{matrice_compute-0.1.26 → matrice_compute-0.1.28}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: matrice_compute
-Version: 0.1.26
+Version: 0.1.28
 Summary: Common server utilities for Matrice.ai services
 Author-email: "Matrice.ai" <dipendra@matrice.ai>
 License-Expression: MIT

{matrice_compute-0.1.26 → matrice_compute-0.1.28}/matrice_compute.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: matrice_compute
-Version: 0.1.26
+Version: 0.1.28
 Summary: Common server utilities for Matrice.ai services
 Author-email: "Matrice.ai" <dipendra@matrice.ai>
 License-Expression: MIT

{matrice_compute-0.1.26 → matrice_compute-0.1.28}/src/matrice_compute/action_instance.py RENAMED Viewed

@@ -495,6 +495,7 @@ class ActionInstance:
             *[f"-e {key}={shlex.quote(str(value))}" for key, value in env_vars.items()],
             *volumes,
             # Container configuration and startup commands
+            f"--cidfile ./{self.action_record_id}.cid ",
             f"--shm-size=30G --pull=always {shlex.quote(self.docker_container)}",
             f'/bin/bash -c "cd {docker_workdir} && '
             f"{env_exports} && "
@@ -895,6 +896,7 @@ class ActionInstance:
         """
         self.cmd = cmd
         self.log_path = f"{self.get_log_path()}/{log_name}_{self.action_record_id}.txt"
         with open(self.log_path, "wb") as out:
             self.process = subprocess.Popen(
                 shlex.split(self.cmd),
@@ -903,6 +905,52 @@ class ActionInstance:
                 env={**os.environ},
                 start_new_session=True,
             )
+        self.container_id = None
+        cid_file_path = f"./{self.action_record_id}.cid"
+        max_retries = 5
+        retry_delay = 1  # seconds
+        for attempt in range(max_retries):
+            try:
+                with open(cid_file_path, "r") as cid_file:
+                    container_id = cid_file.read().strip()
+                    self.container_id = container_id
+                    logging.info(
+                        "Started process for action %s with container ID: %s",
+                        self.action_record_id,
+                        self.container_id,
+                    )
+                    break
+            except FileNotFoundError:
+                logging.warning(
+                    "CID file not found for action %s, attempt %d/%d",
+                    self.action_record_id,
+                    attempt + 1,
+                    max_retries,
+                )
+                time.sleep(retry_delay)
+            except Exception as e:
+                logging.error(
+                    "Error reading CID file for action %s: %s",
+                    self.action_record_id,
+                    str(e),
+                )
+                time.sleep(retry_delay)
+        else:
+            logging.error(
+                "Failed to read CID file for action %s after %d attempts",
+                self.action_record_id,
+                max_retries,
+            )
+            raise Exception("Failed to start process: CID file not found")
+        # report container id to scaling service
+        self.scaling.update_action_container_id(
+            action_record_id=self.action_record_id,
+            container_id=self.container_id,
+        )
     @log_errors(raise_exception=False)
     def start_logger(self):
@@ -1172,11 +1220,27 @@ def database_setup_execute(self: ActionInstance):
     project_id = action_details["_idProject"]
+    if action_details["actionDetails"].get("containerId"):
+        logging.info(
+            "Using existing container ID for inference tracker: %s",
+            action_details["actionDetails"]["containerId"],
+        )
+        self.docker_container = action_details["actionDetails"]["containerId"]
+        cmd = "docker restart " + self.docker_container
+        self.start(cmd, "qdrant_setup")
+        #qdrant restart
+        qdrant_cmd = "docker restart qdrant"
+        self.start(qdrant_cmd, 'qdrant_setup')
+        return
     # MongoDB container with --net=host (Port: 27020:27017)
     cmd = (
         f"docker run --pull=always --net=host "
         f"--name mongodbdatabase "
         f"-v matrice_myvol:/matrice_data "
+        f"--cidfile ./{self.action_record_id}.cid "
         f"-e ACTION_RECORD_ID={self.action_record_id} "
         f"-e MATRICE_ACCESS_KEY_ID={self.matrice_access_key_id} "
         f"-e MATRICE_SECRET_ACCESS_KEY={self.matrice_secret_access_key} "
@@ -1215,11 +1279,23 @@ def facial_recognition_setup_execute(self: ActionInstance):
     self.setup_action_requirements(action_details)
+    if action_details["actionDetails"].get("containerId"):
+        logging.info(
+            "Using existing container ID for facial recognition worker: %s",
+            action_details["actionDetails"]["containerId"],
+        )
+        self.docker_container = action_details["actionDetails"]["containerId"]
+        cmd = "docker restart " + self.docker_container
+        self.start(cmd, "facial_recognition_setup")
+        return
     # Facial recognition worker container with --net=host (Port: 8081)
     worker_cmd = (
         f"docker run -d --pull=always --net=host "
         f"--name worker "
+         f"--cidfile ./{self.action_record_id}.cid "
         f"-v matrice_myvol:/matrice_data "
+        f"--cidfile ./{self.action_record_id}.cid "
         f'-e ENV="{os.environ.get("ENV", "prod")}" '
         f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
         f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
@@ -1245,10 +1321,21 @@ def lpr_setup_execute(self: ActionInstance):
     self.setup_action_requirements(action_details)
+    if action_details["actionDetails"].get("containerId"):
+        logging.info(
+            "Using existing container ID for LPR worker: %s",
+            action_details["actionDetails"]["containerId"],
+        )
+        self.docker_container = action_details["actionDetails"]["containerId"]
+        cmd = "docker restart " + self.docker_container
+        self.start(cmd, "lpr_setup")
+        return
     # LPR worker container with --net=host (Port: 8082)
     worker_cmd = (
         f"docker run -d --net=host --pull=always "
         f"--name lpr-worker "
+        f"--cidfile ./{self.action_record_id}.cid "
         f"-v matrice_myvol:/matrice_data "
         f'-e ENV="{os.environ.get("ENV", "prod")}" '
         f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
@@ -1285,10 +1372,21 @@ def inference_ws_server_execute(self: ActionInstance):
     logging.info(f"Inference WebSocket server will use IP: {ws_host} on port 8102 (use_host_network={use_host_network})")
+    if action_details["actionDetails"].get("containerId"):
+        logging.info(
+            "Using existing container ID for inference WebSocket server: %s",
+            action_details["actionDetails"]["containerId"],
+        )
+        self.docker_container = action_details["actionDetails"]["containerId"]
+        cmd = "docker restart " + self.docker_container
+        self.start(cmd, "inference_ws_server")
+        return
     # Inference WebSocket server with --net=host (Port: 8102)
     worker_cmd = (
         f"docker run -d --pull=always --net=host "
         f"--name inference "
+        f"--cidfile ./{self.action_record_id}.cid "
         f'-e ENV="{os.environ.get("ENV", "prod")}" '
         f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
         f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
@@ -1321,11 +1419,22 @@ def fe_fs_streaming_execute(self: ActionInstance):
     ws_url = f"{ws_host}:8102"
     logging.info(f"Frontend streaming will connect to WebSocket at: {ws_url}")
+    if action_details["actionDetails"].get("containerId"):
+        logging.info(
+            "Using existing container ID for frontend streaming: %s",
+            action_details["actionDetails"]["containerId"],
+        )
+        self.docker_container = action_details["actionDetails"]["containerId"]
+        cmd = "docker restart " + self.docker_container
+        self.start(cmd, "fe_fs_streaming")
+        return
     # Frontend streaming with --net=host (Port: 3000)
     worker_cmd = (
         f"docker run -d --pull=always --net=host "
         f"--name fe_streaming "
+        f"--cidfile ./{self.action_record_id}.cid "
         f"-v matrice_myvol:/matrice_data "
         f'-e ENV="{os.environ.get("ENV", "prod")}" '
         f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
@@ -1355,11 +1464,22 @@ def fe_analytics_service_execute(self: ActionInstance):
     self.setup_action_requirements(action_details)
     project_id = action_details["_idProject"]
+    if action_details["actionDetails"].get("containerId"):
+        logging.info(
+            "Using existing container ID for frontend analytics service: %s",
+            action_details["actionDetails"]["containerId"],
+        )
+        self.docker_container = action_details["actionDetails"]["containerId"]
+        cmd = "docker restart " + self.docker_container
+        self.start(cmd, "fe_analytics_service")
+        return
     # Frontend analytics service with --net=host (Port: 3001)
     worker_cmd = (
         f"docker run -d --pull=always --net=host "
         f"--name fe-analytics "
+        f"--cidfile ./{self.action_record_id}.cid "
         f'-e NEXT_PUBLIC_DEPLOYMENT_ENV="{os.environ.get("ENV", "prod")}" '
         f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
         f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
@@ -1447,11 +1567,27 @@ def redis_setup_execute(self: ActionInstance):
     logging.info(f"Redis will use IP: {redis_host} on port 6379")
     redis_image = action_details["actionDetails"].get("redis_image", "redis:latest")
+    if action_details["actionDetails"].get("containerId"):
+        logging.info(
+            "Using existing container ID for redis management: %s",
+            action_details["actionDetails"]["containerId"],
+        )
+        self.docker_container = action_details["actionDetails"]["containerId"]
+        cmd = "docker restart " + self.docker_container
+        self.start(cmd, "redis_setup")
+        # Redis container restart
+        redis_restart_cmd = "docker restart redis_container"
+        self.start(redis_restart_cmd, "redis")
+        return
     # Redis container with --net=host (Port: 6379)
     redis_cmd = (
         f"docker run -d --net=host "
-        f"--name redis_container_{int(time.time())} "
+        f"--name redis_container"
         f"--restart unless-stopped "
         f"--memory=32g "
         f"--cpus=8 "
@@ -1496,6 +1632,7 @@ def redis_setup_execute(self: ActionInstance):
     # bg-redis management container with --net=host (Port: 8082)
     cmd = (
         f"docker run --net=host "
+         f"--cidfile ./{self.action_record_id}.cid "
         f"-e REDIS_URL={shlex.quote(env_vars['REDIS_URL'])} "
         f"-e REDIS_PASSWORD={shlex.quote(env_vars['REDIS_PASSWORD'])} "
         f"-e MATRICE_ACCESS_KEY_ID={shlex.quote(self.matrice_access_key_id)} "
@@ -1592,6 +1729,17 @@ def model_train_execute(self: ActionInstance):
         model_family=model_family,
         action_id=action_id,
     )
+    if action_details["actionDetails"].get("containerId"):
+        logging.info(
+            "Using existing container ID for training: %s",
+            action_details["actionDetails"]["containerId"],
+        )
+        self.docker_container = action_details["actionDetails"]["containerId"]
+        cmd = "docker restart " + self.docker_container
+        self.start(cmd, "train_log")
+        return
     cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id, model_key=model_key)} python3 train.py {self.action_record_id} "'
     logging.info("cmd is: %s", cmd)
     self.start(cmd, "train_log")
@@ -1613,6 +1761,16 @@ def model_eval_execute(self: ActionInstance):
         model_family=model_family,
         action_id=action_id,
     )
+    if action_details["actionDetails"].get("containerId"):
+        logging.info(
+            "Using existing container ID for training: %s",
+            action_details["actionDetails"]["containerId"],
+        )
+        self.docker_container = action_details["actionDetails"]["containerId"]
+        cmd = "docker restart " + self.docker_container
+        self.start(cmd, "eval_log")
+        return
     cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id)} python3 eval.py {self.action_record_id} "'
     logging.info("cmd is: %s", cmd)
     self.start(cmd, "eval_log")
@@ -1637,6 +1795,16 @@ def model_export_execute(self: ActionInstance):
         model_family=model_family,
         action_id=action_id,
     )
+    if action_details["actionDetails"].get("containerId"):
+        logging.info(
+            "Using existing container ID for training: %s",
+            action_details["actionDetails"]["containerId"],
+        )
+        self.docker_container = action_details["actionDetails"]["containerId"]
+        cmd = "docker restart " + self.docker_container
+        self.start(cmd, "export_log")
+        return
     cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id)} python3 export.py {self.action_record_id} "'
     logging.info("cmd is: %s", cmd)
     self.start(cmd, "export_log")
@@ -1681,6 +1849,16 @@ def streaming_gateway_execute(self: ActionInstance):
         self.docker_container = (
             f"aiforeveryone/streaming-gateway:{os.environ.get('ENV', 'prod')}"
         )
+    if action_details["actionDetails"].get("containerId"):
+        logging.info(
+            "Using existing container ID for training: %s",
+            action_details["actionDetails"]["containerId"],
+        )
+        self.docker_container = action_details["actionDetails"]["containerId"]
+        cmd = "docker restart " + self.docker_container
+        self.start(cmd, "streaming_gateway")
+        return
     cmd = f'{self.get_base_docker_cmd(extra_pkgs=["matrice_streaming"])} python3 /usr/src/app/streaming_gateway.py {self.action_record_id} "'
     logging.info("cmd is: %s", cmd)
     self.start(cmd, "streaming_gateway")
@@ -1775,6 +1953,17 @@ def kafka_setup_execute(self: ActionInstance):
     else:
         pkgs = f"matrice_common matrice"
+    if action_details["actionDetails"].get("containerId"):
+        logging.info(
+            "Using existing container ID for training: %s",
+            action_details["actionDetails"]["containerId"],
+        )
+        self.docker_container = action_details["actionDetails"]["containerId"]
+        cmd = "docker restart " + self.docker_container
+        self.start(cmd, "kafka_setup")
+        return
     # Kafka container with --net=host (Ports: 9092, 9093)
     cmd = (
         f"docker run --net=host "
@@ -1809,10 +1998,21 @@ def inference_tracker_setup_execute(self: ActionInstance):
     image = self.docker_container
     self.setup_action_requirements(action_details)
+    if action_details["actionDetails"].get("containerId"):
+        logging.info(
+            "Using existing container ID for inference tracker: %s",
+            action_details["actionDetails"]["containerId"],
+        )
+        self.docker_container = action_details["actionDetails"]["containerId"]
+        cmd = "docker restart " + self.docker_container
+        self.start(cmd, "inference_tracker_setup")
+        return
     # This is the existing Docker run command
     worker_cmd = (
         f"docker run -d --pull=always --net=host "
+         f"--cidfile ./{self.action_record_id}.cid "
         f"--name inference-tracker-worker "
         f"-v matrice_myvol:/matrice_data "
         f'-e ENV="{os.environ.get("ENV", "prod")}" '

{matrice_compute-0.1.26 → matrice_compute-0.1.28}/src/matrice_compute/instance_manager.py RENAMED Viewed

@@ -3,8 +3,10 @@
 import json
 import logging
 import os
+import subprocess
 import threading
 import time
+from kafka import KafkaProducer
 from matrice_compute.actions_manager import ActionsManager
 from matrice_compute.actions_scaledown_manager import ActionsScaleDownManager
 from matrice_compute.compute_operations_handler import ComputeOperationsHandler
@@ -15,6 +17,7 @@ from matrice_compute.instance_utils import (
 from matrice_compute.resources_tracker import (
     MachineResourcesTracker,
     ActionsResourcesTracker,
+    KafkaResourceMonitor,
 )
 from matrice_compute.scaling import Scaling
 from matrice_compute.shutdown_manager import ShutdownManager
@@ -38,6 +41,7 @@ class InstanceManager:
         env: str = "",
         gpus: str = "",
         workspace_dir: str = "matrice_workspace",
+        enable_kafka: bool = False,
     ):
         """Initialize an instance manager.
@@ -58,6 +62,7 @@ class InstanceManager:
                 Defaults to empty string.
             workspace_dir (str): Directory for workspace files.
                 Defaults to "matrice_workspace".
+            enable_kafka (bool): Enable Kafka communication (default False).
         """
         self.session = self._setup_env_credentials(
             env,
@@ -72,6 +77,7 @@ class InstanceManager:
         self.scaling = Scaling(
             self.session,
             os.environ.get("INSTANCE_ID"),
+            enable_kafka,
         )
         logging.info("InstanceManager initialized with scaling")
         jupyter_token = os.environ.get("JUPYTER_TOKEN")
@@ -92,6 +98,19 @@ class InstanceManager:
         self.actions_resources_tracker = ActionsResourcesTracker(self.scaling)
         logging.info("InstanceManager initialized with actions resources tracker")
+        # Initialize Kafka resource monitor using the same internal Kafka as scaling
+        try:
+            kafka_bootstrap = self.scaling.get_kafka_bootstrap_servers()
+            self.kafka_resource_monitor = KafkaResourceMonitor(
+                instance_id=os.environ.get("INSTANCE_ID"),
+                kafka_bootstrap=kafka_bootstrap,
+                interval_seconds=60
+            )
+            logging.info("InstanceManager initialized with Kafka resource monitor using internal Kafka: %s", kafka_bootstrap)
+        except (ValueError, Exception) as e:
+            logging.warning("Failed to initialize Kafka resource monitor: %s", e)
+            self.kafka_resource_monitor = None
         # Initialize Compute Operations Handler for event-driven operations
         # Uses EventListener from matrice_common for simplified Kafka consumption
         try:
@@ -103,14 +122,30 @@ class InstanceManager:
                 instance_id=instance_id
             )
             logging.info("InstanceManager initialized with Compute Operations Handler for instance ID: %s", instance_id)
-        except Exception as e:
+        except (ValueError, Exception) as e:
             logging.warning("Failed to initialize Compute Operations Handler: %s", e)
             self.compute_operations_handler = None
         self.poll_interval = 10
         # Note: encryption_key is set in _setup_env_credentials
+        # Initialize container monitoring
+        self.container_monitor_thread = None
+        self.container_monitor_running = False
+        self.container_kafka_producer = None
         logging.info("InstanceManager initialized.")
+        # report the resources at startup
+        try:
+            self.scaling.report_architecture_info()
+            logging.info("InstanceManager reported initial resources.")
+        except Exception as exc:
+            logging.error(
+                "Error reporting initial resources: %s",
+                str(exc),
+            )
     @log_errors(default_return=None, raise_exception=True, log_error=True)
     def _setup_env_credentials(
         self,
@@ -245,13 +280,13 @@ class InstanceManager:
             #         "Error in scale_down_manager auto_scaledown_actions: %s",
             #         str(exc),
             #     )
-            try:
-                self.machine_resources_tracker.update_available_resources()
-            except Exception as exc:
-                logging.error(
-                    "Error in machine_resources_tracker update_available_resources: %s",
-                    str(exc),
-                )
+            # try:
+            #     self.machine_resources_tracker.update_available_resources()
+            # except Exception as exc:
+            #     logging.error(
+            #         "Error in machine_resources_tracker update_available_resources: %s",
+            #         str(exc),
+            #     )
             try:
                 self.actions_resources_tracker.update_actions_resources()
             except Exception as exc:
@@ -262,6 +297,130 @@ class InstanceManager:
             time.sleep(self.poll_interval)
+    @log_errors(raise_exception=False, log_error=True)
+    def start_container_status_monitor(self):
+        """Start the background container status monitoring."""
+        if self.container_monitor_running:
+            logging.info("Container status monitor is already running")
+            return
+        self.container_monitor_running = True
+        self.container_monitor_thread = threading.Thread(
+            target=self._container_status_monitor_worker,
+            daemon=True,
+            name="ContainerStatusMonitor"
+        )
+        self.container_monitor_thread.start()
+        logging.info("Started container status monitoring thread")
+    @log_errors(raise_exception=False, log_error=True)
+    def stop_container_status_monitor(self):
+        """Stop the background container status monitoring."""
+        if not self.container_monitor_running:
+            return
+        logging.info("Stopping container status monitor...")
+        self.container_monitor_running = False
+        if self.container_monitor_thread:
+            self.container_monitor_thread.join(timeout=10)
+        if self.container_kafka_producer:
+            self.container_kafka_producer.close()
+            self.container_kafka_producer = None
+        logging.info("Container status monitor stopped")
+    def _container_status_monitor_worker(self):
+        """Background worker function that monitors container status."""
+        # Initialize Kafka producer
+        try:
+            if self.scaling.enable_kafka:
+                bootstrap_servers = self.scaling.get_kafka_bootstrap_servers()
+                self.container_kafka_producer = KafkaProducer(
+                    bootstrap_servers=bootstrap_servers,
+                    value_serializer=lambda v: json.dumps(v).encode("utf-8"),
+                    max_block_ms=5000  # Timeout if Kafka is down
+                )
+                logging.info("Container status monitor: Kafka producer initialized")
+            else:
+                logging.warning("Container status monitor: Kafka is disabled, no monitoring will be performed")
+                return
+        except Exception as e:
+            logging.error("Container status monitor: Failed to initialize Kafka producer: %s", str(e))
+            return
+        instance_id = os.environ.get("INSTANCE_ID")
+        topic_name = "compute_container_status"
+        logging.info("Container status monitor started for instance: %s", instance_id)
+        while self.container_monitor_running:
+            try:
+                # Get container status using docker ps -a
+                result = subprocess.run(
+                    ["docker", "ps", "-a", "--format", "json"],
+                    capture_output=True,
+                    text=True,
+                    timeout=30
+                )
+                if result.returncode != 0:
+                    logging.error("Container status monitor: docker ps command failed: %s", result.stderr)
+                    time.sleep(30)  # Wait before retrying
+                    continue
+                # Parse container information
+                containers = []
+                if result.stdout.strip():
+                    for line in result.stdout.strip().split('\n'):
+                        try:
+                            container_info = json.loads(line)
+                            containers.append({
+                                "container_id": container_info.get("ID", ""),
+                                "image": container_info.get("Image", ""),
+                                "command": container_info.get("Command", ""),
+                                "created": container_info.get("CreatedAt", ""),
+                                "status": container_info.get("Status", ""),
+                                "ports": container_info.get("Ports", ""),
+                                "names": container_info.get("Names", ""),
+                                "size": container_info.get("Size", ""),
+                                "state": container_info.get("State", ""),
+                                "labels": container_info.get("Labels", "")
+                            })
+                        except json.JSONDecodeError as e:
+                            logging.warning("Container status monitor: Failed to parse container info: %s", str(e))
+                            continue
+                # Prepare message for Kafka
+                status_message = {
+                    "timestamp": time.time(),
+                    "instance_id": instance_id,
+                    "container_count": len(containers),
+                    "containers": containers
+                }
+                # Send to Kafka
+                if self.container_kafka_producer:
+                    try:
+                        self.container_kafka_producer.send(topic_name, status_message)
+                        logging.debug("Container status monitor: Sent status for %d containers", len(containers))
+                    except Exception as e:
+                        logging.error("Container status monitor: Failed to send to Kafka: %s", str(e))
+            except subprocess.TimeoutExpired:
+                logging.error("Container status monitor: docker ps command timed out")
+            except Exception as e:
+                logging.error("Container status monitor: Unexpected error: %s", str(e))
+            # Wait 30 seconds before next check
+            for _ in range(30):
+                if not self.container_monitor_running:
+                    break
+                time.sleep(1)
+        logging.info("Container status monitor worker stopped")
     @log_errors(default_return=(None, None), raise_exception=True)
     def start(self) -> tuple:
         """Start the instance manager threads.
@@ -269,6 +428,14 @@ class InstanceManager:
         Returns:
             tuple: (instance_manager_thread, actions_manager_thread)
         """
+        # Start Kafka resource monitor in background thread
+        if self.kafka_resource_monitor:
+            try:
+                self.kafka_resource_monitor.start()
+                logging.info("Started Kafka resource monitor")
+            except Exception as exc:
+                logging.error("Failed to start Kafka resource monitor: %s", str(exc))
         # Start Compute Operations Handler in background thread
         if self.compute_operations_handler:
             try:
@@ -277,6 +444,13 @@ class InstanceManager:
             except Exception as exc:
                 logging.error("Failed to start Compute Operations Handler: %s", str(exc))
+        # Start Container Status Monitor in background thread
+        try:
+            self.start_container_status_monitor()
+            logging.info("Started Container Status Monitor")
+        except Exception as exc:
+            logging.error("Failed to start Container Status Monitor: %s", str(exc))
         # Create and start threads
         instance_manager_thread = threading.Thread(
             target=self.start_instance_manager,

matrice-compute 0.1.26__tar.gz → 0.1.28__tar.gz

matrice-compute 0.1.26__tar.gz → 0.1.28__tar.gz