matrice-compute 0.1.24__tar.gz → 0.1.26__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (25)
  1. {matrice_compute-0.1.24 → matrice_compute-0.1.26}/PKG-INFO +1 -1
  2. {matrice_compute-0.1.24 → matrice_compute-0.1.26}/matrice_compute.egg-info/PKG-INFO +1 -1
  3. {matrice_compute-0.1.24 → matrice_compute-0.1.26}/matrice_compute.egg-info/SOURCES.txt +1 -0
  4. {matrice_compute-0.1.24 → matrice_compute-0.1.26}/src/matrice_compute/action_instance.py +105 -8
  5. matrice_compute-0.1.26/src/matrice_compute/actions_manager.py +467 -0
  6. matrice_compute-0.1.26/src/matrice_compute/compute_operations_handler.py +490 -0
  7. {matrice_compute-0.1.24 → matrice_compute-0.1.26}/src/matrice_compute/instance_manager.py +25 -0
  8. {matrice_compute-0.1.24 → matrice_compute-0.1.26}/src/matrice_compute/instance_utils.py +114 -0
  9. {matrice_compute-0.1.24 → matrice_compute-0.1.26}/src/matrice_compute/resources_tracker.py +7 -2
  10. {matrice_compute-0.1.24 → matrice_compute-0.1.26}/src/matrice_compute/scaling.py +23 -0
  11. matrice_compute-0.1.24/src/matrice_compute/actions_manager.py +0 -227
  12. {matrice_compute-0.1.24 → matrice_compute-0.1.26}/LICENSE.txt +0 -0
  13. {matrice_compute-0.1.24 → matrice_compute-0.1.26}/README.md +0 -0
  14. {matrice_compute-0.1.24 → matrice_compute-0.1.26}/matrice_compute.egg-info/dependency_links.txt +0 -0
  15. {matrice_compute-0.1.24 → matrice_compute-0.1.26}/matrice_compute.egg-info/not-zip-safe +0 -0
  16. {matrice_compute-0.1.24 → matrice_compute-0.1.26}/matrice_compute.egg-info/top_level.txt +0 -0
  17. {matrice_compute-0.1.24 → matrice_compute-0.1.26}/pyproject.toml +0 -0
  18. {matrice_compute-0.1.24 → matrice_compute-0.1.26}/setup.cfg +0 -0
  19. {matrice_compute-0.1.24 → matrice_compute-0.1.26}/setup.py +0 -0
  20. {matrice_compute-0.1.24 → matrice_compute-0.1.26}/src/matrice_compute/__init__.py +0 -0
  21. {matrice_compute-0.1.24 → matrice_compute-0.1.26}/src/matrice_compute/actions_scaledown_manager.py +0 -0
  22. {matrice_compute-0.1.24 → matrice_compute-0.1.26}/src/matrice_compute/prechecks.py +0 -0
  23. {matrice_compute-0.1.24 → matrice_compute-0.1.26}/src/matrice_compute/py.typed +0 -0
  24. {matrice_compute-0.1.24 → matrice_compute-0.1.26}/src/matrice_compute/shutdown_manager.py +0 -0
  25. {matrice_compute-0.1.24 → matrice_compute-0.1.26}/src/matrice_compute/task_utils.py +0 -0
{matrice_compute-0.1.24 → matrice_compute-0.1.26}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: matrice_compute
- Version: 0.1.24
+ Version: 0.1.26
  Summary: Common server utilities for Matrice.ai services
  Author-email: "Matrice.ai" <dipendra@matrice.ai>
  License-Expression: MIT
{matrice_compute-0.1.24 → matrice_compute-0.1.26}/matrice_compute.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: matrice_compute
- Version: 0.1.24
+ Version: 0.1.26
  Summary: Common server utilities for Matrice.ai services
  Author-email: "Matrice.ai" <dipendra@matrice.ai>
  License-Expression: MIT
{matrice_compute-0.1.24 → matrice_compute-0.1.26}/matrice_compute.egg-info/SOURCES.txt
@@ -11,6 +11,7 @@ src/matrice_compute/__init__.py
  src/matrice_compute/action_instance.py
  src/matrice_compute/actions_manager.py
  src/matrice_compute/actions_scaledown_manager.py
+ src/matrice_compute/compute_operations_handler.py
  src/matrice_compute/instance_manager.py
  src/matrice_compute/instance_utils.py
  src/matrice_compute/prechecks.py
{matrice_compute-0.1.24 → matrice_compute-0.1.26}/src/matrice_compute/action_instance.py
@@ -10,6 +10,7 @@ import signal
  import urllib.request
  from matrice_compute.instance_utils import (
      get_gpu_with_sufficient_memory_for_action,
+     get_gpu_config_for_deployment,
      get_decrypted_access_key_pair,
      get_max_file_system,
      get_best_service_ip_and_network,
@@ -26,6 +27,10 @@ from matrice_common.utils import log_errors
  class ActionInstance:
      """Base class for tasks that run in Action containers."""

+     # Class-level dictionary to track deployed services and their ports
+     # Key: _idService, Value: {"triton_ports": "port1,port2,port3", "is_first": False}
+     _deployed_services = {}
+
      def __init__(self, scaling: Scaling, action_info: dict):
          """Initialize an action instance.

@@ -84,6 +89,67 @@ class ActionInstance:
              raise ValueError(f"Unknown action type: {self.action_type}")
          self.task = self.actions_map[self.action_type]

+     @classmethod
+     def is_first_deployment_for_service(cls, service_id):
+         """Check if this is the first deployment for a given service.
+
+         Args:
+             service_id (str): Service ID (_idService)
+
+         Returns:
+             bool: True if this is the first deployment, False otherwise
+         """
+         if not service_id:
+             return False
+         return service_id not in cls._deployed_services
+
+     @classmethod
+     def get_or_create_triton_ports(cls, service_id, scaling_instance):
+         """Get existing TRITON_PORTS for a service or create new ones.
+
+         Args:
+             service_id (str): Service ID (_idService)
+             scaling_instance: Scaling instance to get open ports
+
+         Returns:
+             str: Comma-separated string of 3 port numbers (e.g., "8001,8002,8003")
+         """
+         if not service_id:
+             # No service_id, generate new ports
+             port1 = scaling_instance.get_open_port()
+             port2 = scaling_instance.get_open_port()
+             port3 = scaling_instance.get_open_port()
+             return f"{port1},{port2},{port3}"
+
+         # Check if ports already exist for this service
+         if service_id in cls._deployed_services:
+             triton_ports = cls._deployed_services[service_id]["triton_ports"]
+             logging.info(
+                 "Reusing TRITON_PORTS for service %s: %s",
+                 service_id,
+                 triton_ports
+             )
+             return triton_ports
+
+         # First deployment: generate new ports and store them
+         port1 = scaling_instance.get_open_port()
+         port2 = scaling_instance.get_open_port()
+         port3 = scaling_instance.get_open_port()
+         triton_ports = f"{port1},{port2},{port3}"
+
+         # Store for future use
+         cls._deployed_services[service_id] = {
+             "triton_ports": triton_ports,
+             "is_first": False
+         }
+
+         logging.info(
+             "First deployment for service %s - generated TRITON_PORTS: %s",
+             service_id,
+             triton_ports
+         )
+         return triton_ports
+
      @log_errors(default_return={}, raise_exception=True, log_error=False)
      def _init_credentials(self):
          """Initialize Matrice credentials.
@@ -1387,10 +1453,27 @@ def redis_setup_execute(self: ActionInstance):
          f"docker run -d --net=host "
          f"--name redis_container_{int(time.time())} "
          f"--restart unless-stopped "
+         f"--memory=32g "
+         f"--cpus=8 "
          f"{redis_image} "
-         f"redis-server --bind 0.0.0.0 --appendonly yes --requirepass {redis_password}"
+         f"redis-server --bind 0.0.0.0 "
+         f"--appendonly no "
+         f'--save "" '
+         f"--maxmemory 30gb "
+         f"--maxmemory-policy allkeys-lru "
+         f"--io-threads 4 "
+         f"--io-threads-do-reads yes "
+         f"--stream-node-max-bytes 8192 "
+         f"--stream-node-max-entries 1000 "
+         f"--hz 100 "
+         f"--tcp-backlog 2048 "
+         f"--timeout 0 "
+         f"--lazyfree-lazy-eviction yes "
+         f"--lazyfree-lazy-expire yes "
+         f"--lazyfree-lazy-server-del yes "
+         f"--activedefrag yes "
+         f"--requirepass {redis_password}"
      )
-
      logging.info("Starting Redis container on %s:6379: %s", redis_host, redis_cmd)

      # Start Redis container first
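The rewritten command disables persistence (--appendonly no, --save ""), caps Redis memory at 30 GB with an allkeys-lru eviction policy, enables I/O threads, lazy freeing and active defragmentation, and limits the container itself to 32 GB of RAM and 8 CPUs. A quick way to confirm the settings took effect, sketched with the redis-py client (redis-py is not a dependency of this package; host and password are placeholders):

    import redis  # illustrative only; assumes `pip install redis`

    r = redis.Redis(host="<redis_host>", port=6379, password="<redis_password>", decode_responses=True)
    print(r.config_get("maxmemory-policy"))         # expect {'maxmemory-policy': 'allkeys-lru'}
    print(r.config_get("appendonly"))               # expect {'appendonly': 'no'}
    print(r.info("memory").get("maxmemory_human"))  # expect roughly "30.00G"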
@@ -1455,6 +1538,10 @@ def model_deploy_execute(self: ActionInstance):
          return
      action_id = action_details["_id"]
      model_family = action_details["actionDetails"]["modelFamily"]
+
+     # Get the service ID to track deployments
+     service_id = action_details.get("_idService")
+
      self.setup_action_requirements(
          action_details,
          work_fs,
@@ -1462,17 +1549,27 @@
          action_id=action_id,
      )

-     # Get GPU configuration based on requirements and availability
-     # This selects the GPU(s) with the most free memory to balance load
-     use_gpu = self.get_gpu_config(action_details)
+     # Check if this is the first deployment for this service
+     is_first_deployment = ActionInstance.is_first_deployment_for_service(service_id)
+
+     # Get GPU configuration (uses utility function with fail-safe fallback)
+     use_gpu = get_gpu_config_for_deployment(action_details, is_first_deployment)

      logging.info(
-         "Action %s: Model deployment GPU config: %s",
+         "Action %s: Model deployment GPU config: %s (first_deployment=%s)",
          action_id,
-         use_gpu if use_gpu else "CPU-only"
+         use_gpu if use_gpu else "CPU-only",
+         is_first_deployment
      )

-     extra_env_vars = {"INTERNAL_PORT": internal_port}
+     # Get or create TRITON_PORTS (uses utility method)
+     triton_ports = ActionInstance.get_or_create_triton_ports(service_id, self.scaling)
+
+     extra_env_vars = {
+         "INTERNAL_PORT": internal_port,
+         "TRITON_PORTS": triton_ports
+     }
+
      cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, mount_docker_sock=True, action_id=action_id, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_inference", "matrice_analytics"])} python3 deploy.py {self.action_record_id} {external_port}"'
      logging.info("cmd is: %s", cmd)
      self.start(cmd, "deploy_log")
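Stripped of diff markers, the new wiring in model_deploy_execute reads roughly as follows (a condensed sketch, not a verbatim excerpt; action_details, internal_port and self come from the surrounding function):

    service_id = action_details.get("_idService")
    is_first_deployment = ActionInstance.is_first_deployment_for_service(service_id)
    use_gpu = get_gpu_config_for_deployment(action_details, is_first_deployment)
    extra_env_vars = {
        "INTERNAL_PORT": internal_port,
        # Every replica of the same service receives the same three Triton ports.
        "TRITON_PORTS": ActionInstance.get_or_create_triton_ports(service_id, self.scaling),
    }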
matrice_compute-0.1.26/src/matrice_compute/actions_manager.py (new file)
@@ -0,0 +1,467 @@
+ """Module providing actions_manager functionality."""
+
+ import logging
+ import os
+ import time
+ from matrice_compute.action_instance import (
+     ActionInstance,
+ )
+ from matrice_compute.instance_utils import (
+     has_gpu,
+     get_mem_usage,
+     cleanup_docker_storage,
+ )
+ from matrice_compute.scaling import (
+     Scaling,
+ )
+ from matrice_common.utils import log_errors
+
+
+ class ActionsManager:
+     """Class for managing actions."""
+
+     def __init__(self, scaling: Scaling):
+         """Initialize an action manager.
+
+         Args:
+             scaling (Scaling): Scaling service instance
+         """
+         self.current_actions: dict[str, ActionInstance] = {}
+         self.stopped_actions: dict[str, ActionInstance] = {}  # Track stopped actions separately
+         self.scaling = scaling
+         self.memory_threshold = 0.9
+         self.poll_interval = 10
+         self.last_actions_check = 0
+         logging.info("ActionsManager initialized")
+
+     @log_errors(default_return=[], raise_exception=False)
+     def fetch_actions(self) -> list:
+         """Poll for actions and process them if memory threshold is not exceeded.
+
+         Returns:
+             list: List of fetched actions
+         """
+         actions = []
+         logging.info("Polling backend for new jobs")
+         fetched_actions, error, _ = self.scaling.assign_jobs(has_gpu())
+         if error:
+             logging.error("Error assigning jobs: %s", error)
+             return actions
+         if not isinstance(fetched_actions, list):
+             fetched_actions = [fetched_actions]
+         for action in fetched_actions:
+             if not action:
+                 continue
+             if action["_id"] != "000000000000000000000000":
+                 actions.append(action)
+         logging.info(
+             "Fetched action details: %s",
+             actions,
+         )
+         return actions
+
+     @log_errors(default_return=None, raise_exception=False)
+     def process_action(self, action: dict) -> ActionInstance:
+         """Process the given action.
+
+         Args:
+             action (dict): Action details to process
+
+         Returns:
+             ActionInstance: Processed action instance or None if failed
+         """
+         logging.info(
+             "Processing action: %s",
+             action["_id"],
+         )
+         action_instance = ActionInstance(self.scaling, action)
+         self.scaling.update_action_status(
+             service_provider=os.environ["SERVICE_PROVIDER"],
+             action_record_id=action["_id"],
+             status="starting",
+             action_duration=0,
+         )
+         logging.info("locking action")
+         self.scaling.update_action_status(
+             service_provider=os.environ["SERVICE_PROVIDER"],
+             status="started",
+             action_record_id=action["_id"],
+             isRunning=True,
+             action_duration=0,
+             cpuUtilisation=0.0,
+             gpuUtilisation=0.0,
+             memoryUtilisation=0.0,
+             gpuMemoryUsed=0,
+         )
+         self.scaling.update_status(
+             action["_id"],
+             action["action"],
+             "bg-job-scheduler",
+             "JBSS_LCK",
+             "OK",
+             "Job is locked for processing",
+         )
+         action_instance.execute()
+         logging.info(
+             "action %s started.",
+             action_instance.action_record_id,
+         )
+         return action_instance
+
+     @log_errors(raise_exception=False)
+     def process_actions(self) -> None:
+         """Process fetched actions."""
+         for action in self.fetch_actions():
+             action_id = action["_id"]
+
+             # Skip if action is already running in current_actions
+             if action_id in self.current_actions:
+                 logging.info("Action %s already in current_actions, skipping", action_id)
+                 continue
+
+             # If action exists in stopped_actions, remove it before starting fresh
+             if action_id in self.stopped_actions:
+                 logging.info("Action %s found in stopped_actions, removing before restart", action_id)
+                 del self.stopped_actions[action_id]
+
+             # Process and add to current_actions
+             action_instance = self.process_action(action)
+             if action_instance:
+                 # Ensure action is not in stopped_actions (defensive check)
+                 if action_id in self.stopped_actions:
+                     del self.stopped_actions[action_id]
+                 self.current_actions[action_id] = action_instance
+
+     @log_errors(raise_exception=False)
+     def update_actions_status(self) -> None:
+         """Update tracking of running vs stopped actions.
+
+         This method checks all actions and moves stopped ones to stopped_actions dict
+         without deleting them. This prevents interference with compute operations
+         handler while maintaining accurate status reporting.
+         """
+         moved_to_stopped = 0
+
+         # Check each action and update its status
+         for action_id, instance in list(self.current_actions.items()):
+             is_running = False
+             status_reason = ""
+
+             # Check if process is running
+             if hasattr(instance, 'is_running'):
+                 try:
+                     is_running = instance.is_running()
+                 except Exception as e:
+                     logging.error("Error checking is_running for action %s: %s", action_id, str(e))
+                     is_running = False
+                     status_reason = f"error checking status: {str(e)}"
+
+             # Check for process object validity
+             if not is_running and not status_reason:
+                 if not hasattr(instance, 'process') or instance.process is None:
+                     status_reason = "no process object"
+                 else:
+                     status_reason = "process not running"
+
+             # Move to stopped_actions if not running (but don't delete)
+             if not is_running:
+                 logging.info(
+                     "Action %s moved to stopped_actions: %s",
+                     action_id,
+                     status_reason
+                 )
+                 # Ensure action is removed from current_actions before adding to stopped_actions
+                 if action_id in self.current_actions:
+                     del self.current_actions[action_id]
+                 # Ensure action is not duplicated in stopped_actions
+                 if action_id not in self.stopped_actions:
+                     self.stopped_actions[action_id] = instance
+                 moved_to_stopped += 1
+
+         # Log current state
+         running_ids = list(self.current_actions.keys())
+         stopped_ids = list(self.stopped_actions.keys())
+
+         if self.current_actions or self.stopped_actions:
+             logging.info(
+                 "Actions status: %d running %s, %d stopped %s",
+                 len(self.current_actions),
+                 running_ids if running_ids else "[]",
+                 len(self.stopped_actions),
+                 stopped_ids if stopped_ids else "[]"
+             )
+
+     @log_errors(raise_exception=False)
+     def purge_unwanted(self) -> None:
+         """Purge completed or failed actions.
+
+         NOTE: This now calls update_actions_status() which moves stopped actions
+         to a separate dict instead of deleting them. This prevents interference
+         with compute operations handler while maintaining accurate status.
+         """
+         self.update_actions_status()
+
+     @log_errors(default_return={}, raise_exception=False)
+     def get_current_actions(self) -> dict:
+         """Get the current running actions.
+
+         This method:
+         1. Updates action status tracking via update_actions_status()
+         2. Returns only the running actions (current_actions dict)
+         3. Provides detailed logging about current actions state
+
+         Returns:
+             dict: Current running actions only
+         """
+         # Update status tracking (moves stopped to stopped_actions)
+         self.update_actions_status()
+
+         if self.current_actions:
+             action_ids = list(self.current_actions.keys())
+             logging.info(
+                 "Currently running %d actions: %s",
+                 len(self.current_actions),
+                 action_ids
+             )
+         else:
+             logging.debug("No actions currently running")
+
+         return self.current_actions
+
+     @log_errors(default_return={}, raise_exception=False)
+     def get_all_actions(self) -> dict:
+         """Get all tracked actions (both running and stopped).
+
+         Returns:
+             dict: All tracked actions with their status
+         """
+         all_actions = {}
+         for action_id, instance in self.current_actions.items():
+             all_actions[action_id] = {"instance": instance, "status": "running"}
+         for action_id, instance in self.stopped_actions.items():
+             all_actions[action_id] = {"instance": instance, "status": "stopped"}
+         return all_actions
+
+     @log_errors(default_return={}, raise_exception=False)
+     def get_stopped_actions(self) -> dict:
+         """Get stopped actions.
+
+         Returns:
+             dict: Stopped actions
+         """
+         return self.stopped_actions
+
+     @log_errors(default_return={}, raise_exception=False)
+     def stop_action(self, action_record_id: str) -> dict:
+         """Stop a specific action by its record ID.
+
+         Args:
+             action_record_id (str): The action record ID to stop
+
+         Returns:
+             dict: Result dictionary with status information
+         """
+         logging.info("Attempting to stop action: %s", action_record_id)
+
+         # Check if action exists in current (running) actions
+         action_instance = None
+         action_source = None
+
+         if action_record_id in self.current_actions:
+             action_instance = self.current_actions[action_record_id]
+             action_source = "current_actions"
+         elif action_record_id in self.stopped_actions:
+             # Action already in stopped_actions
+             logging.info("Action %s already in stopped_actions", action_record_id)
+             return {
+                 "success": True,
+                 "reason": "already_stopped",
+                 "action_id": action_record_id
+             }
+         else:
+             logging.warning("Action %s not found in current or stopped actions", action_record_id)
+             return {
+                 "success": False,
+                 "reason": "action_not_found",
+                 "action_id": action_record_id
+             }
+
+         # Check if action is actually running
+         if not action_instance.is_running():
+             logging.info("Action %s is not running, moving to stopped_actions", action_record_id)
+             # Move to stopped_actions instead of deleting
+             # Ensure action is removed from current_actions first
+             if action_record_id in self.current_actions:
+                 del self.current_actions[action_record_id]
+             # Ensure action is not duplicated in stopped_actions
+             if action_record_id not in self.stopped_actions:
+                 self.stopped_actions[action_record_id] = action_instance
+             return {
+                 "success": True,
+                 "reason": "already_stopped",
+                 "action_id": action_record_id
+             }
+
+         # Stop the action
+         try:
+             logging.info("Stopping action %s", action_record_id)
+             action_instance.stop()
+
+             # Update action status to stopped
+             self.scaling.update_action_status(
+                 service_provider=os.environ["SERVICE_PROVIDER"],
+                 action_record_id=action_record_id,
+                 status="stopped",
+                 isRunning=False,
+                 action_duration=0,
+             )
+
+             # Move to stopped_actions instead of deleting
+             # Ensure action is removed from current_actions first
+             if action_record_id in self.current_actions:
+                 del self.current_actions[action_record_id]
+             # Ensure action is not duplicated in stopped_actions
+             if action_record_id not in self.stopped_actions:
+                 self.stopped_actions[action_record_id] = action_instance
+
+             logging.info("Successfully stopped action: %s", action_record_id)
+             return {
+                 "success": True,
+                 "action_id": action_record_id,
+                 "stopped_at": time.time()
+             }
+
+         except Exception as e:
+             logging.error("Error stopping action %s: %s", action_record_id, str(e))
+             return {
+                 "success": False,
+                 "reason": "stop_failed",
+                 "error": str(e),
+                 "action_id": action_record_id
+             }
+
+     @log_errors(default_return={}, raise_exception=False)
+     def restart_action(self, action_record_id: str) -> dict:
+         """Restart a specific action by its record ID.
+
+         This method stops the action if it's running, then fetches fresh action
+         details from the backend and starts it again.
+
+         Args:
+             action_record_id (str): The action record ID to restart
+
+         Returns:
+             dict: Result dictionary with status information
+         """
+         logging.info("Attempting to restart action: %s", action_record_id)
+
+         # Step 1: Stop the action if it exists in current_actions or stopped_actions
+         stop_result = {"success": True, "reason": "not_running"}
+         if action_record_id in self.current_actions:
+             logging.info("Stopping existing action %s before restart", action_record_id)
+             stop_result = self.stop_action(action_record_id)
+
+             if not stop_result.get("success"):
+                 logging.error("Failed to stop action %s for restart", action_record_id)
+                 return {
+                     "success": False,
+                     "reason": "stop_failed_before_restart",
+                     "stop_result": stop_result,
+                     "action_id": action_record_id
+                 }
+
+             # Wait a moment for cleanup
+             time.sleep(2)
+         elif action_record_id in self.stopped_actions:
+             logging.info("Action %s found in stopped_actions, will restart", action_record_id)
+             stop_result = {"success": True, "reason": "was_stopped"}
+
+         # Step 2: Fetch fresh action details from backend
+         try:
+             logging.info("Fetching action details for restart: %s", action_record_id)
+
+             # Get action details via API
+             action_details, error, _ = self.scaling.get_action_details(action_record_id)
+
+             if error or not action_details:
+                 logging.error("Failed to fetch action details for %s: %s",
+                               action_record_id, error)
+                 return {
+                     "success": False,
+                     "reason": "fetch_failed",
+                     "error": error,
+                     "action_id": action_record_id
+                 }
+
+             # Step 3: Process (start) the action
+             logging.info("Starting action %s after restart", action_record_id)
+             action_instance = self.process_action(action_details)
+
+             if action_instance:
+                 # Ensure action is removed from stopped_actions if present
+                 if action_record_id in self.stopped_actions:
+                     del self.stopped_actions[action_record_id]
+                 # Ensure action is removed from current_actions if present (defensive check)
+                 if action_record_id in self.current_actions:
+                     logging.warning("Action %s already in current_actions during restart, replacing", action_record_id)
+                     del self.current_actions[action_record_id]
+                 # Add to current_actions
+                 self.current_actions[action_record_id] = action_instance
+
+                 logging.info("Successfully restarted action: %s", action_record_id)
+                 return {
+                     "success": True,
+                     "action_id": action_record_id,
+                     "restarted_at": time.time(),
+                     "stop_result": stop_result
+                 }
+             else:
+                 logging.error("Failed to start action %s after restart", action_record_id)
+                 return {
+                     "success": False,
+                     "reason": "start_failed_after_restart",
+                     "action_id": action_record_id
+                 }
+
+         except Exception as e:
+             logging.error("Error restarting action %s: %s", action_record_id, str(e))
+             return {
+                 "success": False,
+                 "reason": "restart_failed",
+                 "error": str(e),
+                 "action_id": action_record_id
+             }
+
+     @log_errors(raise_exception=True)
+     def start_actions_manager(self) -> None:
+         """Start the actions manager main loop."""
+         while True:
+             waiting_time = self.poll_interval  # Default wait time
+             try:
+                 mem_usage = get_mem_usage()
+                 logging.info("Memory usage: %d", mem_usage)
+                 waiting_time = int(
+                     min(
+                         self.poll_interval
+                         / max(
+                             0.001,
+                             self.memory_threshold - mem_usage,
+                         ),
+                         120,
+                     )
+                 )
+                 if mem_usage < self.memory_threshold:
+                     self.process_actions()
+                     logging.info(
+                         "Waiting for %d seconds before next poll",
+                         waiting_time,
+                     )
+                 else:
+                     logging.info(
+                         "Memory threshold exceeded, waiting for %d seconds",
+                         waiting_time,
+                     )
+                     cleanup_docker_storage()
+             except Exception as e:
+                 logging.error("Error in actions manager: %s", e)
+             time.sleep(waiting_time)
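For orientation, a minimal sketch of how the rewritten manager is driven (the Scaling constructor arguments and the SERVICE_PROVIDER value are assumptions, not taken from this diff):

    import os
    from matrice_compute.actions_manager import ActionsManager
    from matrice_compute.scaling import Scaling

    os.environ.setdefault("SERVICE_PROVIDER", "<provider>")  # read by process_action/stop_action

    manager = ActionsManager(Scaling())  # real Scaling() arguments may differ
    # Blocking loop: poll for jobs, start them, and back off as memory pressure rises.
    manager.start_actions_manager()

    # Elsewhere (e.g. the new compute_operations_handler) the manager can be queried or controlled:
    #   manager.get_all_actions()              # {"<id>": {"instance": ..., "status": "running" | "stopped"}}
    #   manager.stop_action("<action_id>")     # returns {"success": ..., ...}
    #   manager.restart_action("<action_id>")  # stop (if needed), refetch details, start again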