matrice-compute 0.1.24__py3-none-any.whl → 0.1.26__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -10,6 +10,7 @@ import signal
10
10
  import urllib.request
11
11
  from matrice_compute.instance_utils import (
12
12
  get_gpu_with_sufficient_memory_for_action,
13
+ get_gpu_config_for_deployment,
13
14
  get_decrypted_access_key_pair,
14
15
  get_max_file_system,
15
16
  get_best_service_ip_and_network,
@@ -26,6 +27,10 @@ from matrice_common.utils import log_errors
26
27
  class ActionInstance:
27
28
  """Base class for tasks that run in Action containers."""
28
29
 
30
+ # Class-level dictionary to track deployed services and their ports
31
+ # Key: _idService, Value: {"triton_ports": "port1,port2,port3", "is_first": False}
32
+ _deployed_services = {}
33
+
29
34
  def __init__(self, scaling: Scaling, action_info: dict):
30
35
  """Initialize an action instance.
31
36
 
@@ -84,6 +89,67 @@ class ActionInstance:
84
89
  raise ValueError(f"Unknown action type: {self.action_type}")
85
90
  self.task = self.actions_map[self.action_type]
86
91
 
92
@classmethod
def is_first_deployment_for_service(cls, service_id):
    """Report whether no deployment has yet been recorded for a service.

    Args:
        service_id (str): Service ID (_idService)

    Returns:
        bool: True if this is the first deployment, False otherwise
        (a falsy/missing service_id is never treated as "first")
    """
    # Membership in the class-level registry marks an earlier deployment.
    return bool(service_id) and service_id not in cls._deployed_services
105
+
106
@classmethod
def get_or_create_triton_ports(cls, service_id, scaling_instance):
    """Get existing TRITON_PORTS for a service or create new ones.

    Args:
        service_id (str): Service ID (_idService)
        scaling_instance: Scaling instance to get open ports

    Returns:
        str: Comma-separated string of 3 port numbers (e.g., "8001,8002,8003")
    """
    def _fresh_ports():
        # Three open ports, formatted the way the deploy container expects.
        return ",".join(str(scaling_instance.get_open_port()) for _ in range(3))

    if not service_id:
        # Nothing to cache against — hand out fresh ports every time.
        return _fresh_ports()

    cached = cls._deployed_services.get(service_id)
    if cached is not None:
        ports = cached["triton_ports"]
        logging.info(
            "Reusing TRITON_PORTS for service %s: %s",
            service_id,
            ports
        )
        return ports

    # First deployment: allocate and remember the ports for later replicas.
    ports = _fresh_ports()
    cls._deployed_services[service_id] = {
        "triton_ports": ports,
        "is_first": False
    }
    logging.info(
        "First deployment for service %s - generated TRITON_PORTS: %s",
        service_id,
        ports
    )
    return ports
152
+
87
153
  @log_errors(default_return={}, raise_exception=True, log_error=False)
88
154
  def _init_credentials(self):
89
155
  """Initialize Matrice credentials.
@@ -1387,10 +1453,27 @@ def redis_setup_execute(self: ActionInstance):
1387
1453
  f"docker run -d --net=host "
1388
1454
  f"--name redis_container_{int(time.time())} "
1389
1455
  f"--restart unless-stopped "
1456
+ f"--memory=32g "
1457
+ f"--cpus=8 "
1390
1458
  f"{redis_image} "
1391
- f"redis-server --bind 0.0.0.0 --appendonly yes --requirepass {redis_password}"
1459
+ f"redis-server --bind 0.0.0.0 "
1460
+ f"--appendonly no "
1461
+ f'--save "" '
1462
+ f"--maxmemory 30gb "
1463
+ f"--maxmemory-policy allkeys-lru "
1464
+ f"--io-threads 4 "
1465
+ f"--io-threads-do-reads yes "
1466
+ f"--stream-node-max-bytes 8192 "
1467
+ f"--stream-node-max-entries 1000 "
1468
+ f"--hz 100 "
1469
+ f"--tcp-backlog 2048 "
1470
+ f"--timeout 0 "
1471
+ f"--lazyfree-lazy-eviction yes "
1472
+ f"--lazyfree-lazy-expire yes "
1473
+ f"--lazyfree-lazy-server-del yes "
1474
+ f"--activedefrag yes "
1475
+ f"--requirepass {redis_password}"
1392
1476
  )
1393
-
1394
1477
  logging.info("Starting Redis container on %s:6379: %s", redis_host, redis_cmd)
1395
1478
 
1396
1479
  # Start Redis container first
@@ -1455,6 +1538,10 @@ def model_deploy_execute(self: ActionInstance):
1455
1538
  return
1456
1539
  action_id = action_details["_id"]
1457
1540
  model_family = action_details["actionDetails"]["modelFamily"]
1541
+
1542
+ # Get the service ID to track deployments
1543
+ service_id = action_details.get("_idService")
1544
+
1458
1545
  self.setup_action_requirements(
1459
1546
  action_details,
1460
1547
  work_fs,
@@ -1462,17 +1549,27 @@ def model_deploy_execute(self: ActionInstance):
1462
1549
  action_id=action_id,
1463
1550
  )
1464
1551
 
1465
- # Get GPU configuration based on requirements and availability
1466
- # This selects the GPU(s) with the most free memory to balance load
1467
- use_gpu = self.get_gpu_config(action_details)
1552
+ # Check if this is the first deployment for this service
1553
+ is_first_deployment = ActionInstance.is_first_deployment_for_service(service_id)
1554
+
1555
+ # Get GPU configuration (uses utility function with fail-safe fallback)
1556
+ use_gpu = get_gpu_config_for_deployment(action_details, is_first_deployment)
1468
1557
 
1469
1558
  logging.info(
1470
- "Action %s: Model deployment GPU config: %s",
1559
+ "Action %s: Model deployment GPU config: %s (first_deployment=%s)",
1471
1560
  action_id,
1472
- use_gpu if use_gpu else "CPU-only"
1561
+ use_gpu if use_gpu else "CPU-only",
1562
+ is_first_deployment
1473
1563
  )
1474
1564
 
1475
- extra_env_vars = {"INTERNAL_PORT": internal_port}
1565
+ # Get or create TRITON_PORTS (uses utility method)
1566
+ triton_ports = ActionInstance.get_or_create_triton_ports(service_id, self.scaling)
1567
+
1568
+ extra_env_vars = {
1569
+ "INTERNAL_PORT": internal_port,
1570
+ "TRITON_PORTS": triton_ports
1571
+ }
1572
+
1476
1573
  cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, mount_docker_sock=True, action_id=action_id, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_inference", "matrice_analytics"])} python3 deploy.py {self.action_record_id} {external_port}"'
1477
1574
  logging.info("cmd is: %s", cmd)
1478
1575
  self.start(cmd, "deploy_log")
@@ -27,6 +27,7 @@ class ActionsManager:
27
27
  scaling (Scaling): Scaling service instance
28
28
  """
29
29
  self.current_actions: dict[str, ActionInstance] = {}
30
+ self.stopped_actions: dict[str, ActionInstance] = {} # Track stopped actions separately
30
31
  self.scaling = scaling
31
32
  self.memory_threshold = 0.9
32
33
  self.poll_interval = 10
@@ -111,75 +112,110 @@ class ActionsManager:
111
112
def process_actions(self) -> None:
    """Process fetched actions."""
    for action in self.fetch_actions():
        action_id = action["_id"]

        # Already running — do not start a duplicate.
        if action_id in self.current_actions:
            logging.info("Action %s already in current_actions, skipping", action_id)
            continue

        # A stale stopped entry would shadow the restart; clear it first.
        if action_id in self.stopped_actions:
            logging.info("Action %s found in stopped_actions, removing before restart", action_id)
            del self.stopped_actions[action_id]

        action_instance = self.process_action(action)
        if not action_instance:
            continue

        # Defensive: make sure the id is not left in stopped_actions before
        # registering it as running.
        self.stopped_actions.pop(action_id, None)
        self.current_actions[action_id] = action_instance
117
134
 
118
135
@log_errors(raise_exception=False)
def update_actions_status(self) -> None:
    """Update tracking of running vs stopped actions.

    Checks every tracked action and moves stopped ones into the
    stopped_actions dict without deleting them, so the compute operations
    handler keeps visibility while status reporting stays accurate.
    """
    moved_to_stopped = 0

    # Snapshot the items — the dict is mutated while we walk it.
    for action_id, instance in list(self.current_actions.items()):
        is_running = False
        status_reason = ""

        if hasattr(instance, 'is_running'):
            try:
                is_running = instance.is_running()
            except Exception as e:
                logging.error("Error checking is_running for action %s: %s", action_id, str(e))
                is_running = False
                status_reason = f"error checking status: {str(e)}"

        # Derive a reason only when the probe itself did not already set one.
        if not is_running and not status_reason:
            if not hasattr(instance, 'process') or instance.process is None:
                status_reason = "no process object"
            else:
                status_reason = "process not running"

        if not is_running:
            logging.info(
                "Action %s moved to stopped_actions: %s",
                action_id,
                status_reason
            )
            # Relocate instead of deleting; guard against duplicates.
            self.current_actions.pop(action_id, None)
            if action_id not in self.stopped_actions:
                self.stopped_actions[action_id] = instance
                moved_to_stopped += 1

    running_ids = list(self.current_actions.keys())
    stopped_ids = list(self.stopped_actions.keys())

    if self.current_actions or self.stopped_actions:
        logging.info(
            "Actions status: %d running %s, %d stopped %s",
            len(self.current_actions),
            running_ids if running_ids else "[]",
            len(self.stopped_actions),
            stopped_ids if stopped_ids else "[]"
        )
168
193
 
194
@log_errors(raise_exception=False)
def purge_unwanted(self) -> None:
    """Purge completed or failed actions.

    NOTE: This now delegates to update_actions_status(), which moves stopped
    actions into a separate dict instead of deleting them. That prevents
    interference with the compute operations handler while keeping the
    reported status accurate.
    """
    self.update_actions_status()
203
+
169
204
  @log_errors(default_return={}, raise_exception=False)
170
205
  def get_current_actions(self) -> dict:
171
- """Get the current actions.
206
+ """Get the current running actions.
172
207
 
173
208
  This method:
174
- 1. Purges any completed actions using purge_unwanted()
175
- 2. Double-checks remaining actions to ensure they are truly running
209
+ 1. Updates action status tracking via update_actions_status()
210
+ 2. Returns only the running actions (current_actions dict)
176
211
  3. Provides detailed logging about current actions state
177
212
 
178
213
  Returns:
179
- dict: Current active actions
214
+ dict: Current running actions only
180
215
  """
181
- # Always purge unwanted actions first
182
- self.purge_unwanted()
216
+ # Update status tracking (moves stopped to stopped_actions)
217
+ self.update_actions_status()
218
+
183
219
  if self.current_actions:
184
220
  action_ids = list(self.current_actions.keys())
185
221
  logging.info(
@@ -189,9 +225,213 @@ class ActionsManager:
189
225
  )
190
226
  else:
191
227
  logging.debug("No actions currently running")
192
- return {}
228
+
193
229
  return self.current_actions
194
230
 
231
@log_errors(default_return={}, raise_exception=False)
def get_all_actions(self) -> dict:
    """Get all tracked actions (both running and stopped).

    Returns:
        dict: All tracked actions with their status
    """
    # Running entries first; stopped entries for the same id (not expected
    # in practice) would overwrite, matching the original insertion order.
    combined = {
        action_id: {"instance": instance, "status": "running"}
        for action_id, instance in self.current_actions.items()
    }
    combined.update(
        (action_id, {"instance": instance, "status": "stopped"})
        for action_id, instance in self.stopped_actions.items()
    )
    return combined
245
@log_errors(default_return={}, raise_exception=False)
def get_stopped_actions(self) -> dict:
    """Return the dict of actions that have been observed as stopped.

    Returns:
        dict: Stopped actions keyed by action record id
    """
    return self.stopped_actions
254
@log_errors(default_return={}, raise_exception=False)
def stop_action(self, action_record_id: str) -> dict:
    """Stop a specific action by its record ID.

    Args:
        action_record_id (str): The action record ID to stop

    Returns:
        dict: Result dictionary with status information
    """
    logging.info("Attempting to stop action: %s", action_record_id)

    # Locate the action; running takes precedence over stopped.
    if action_record_id in self.current_actions:
        action_instance = self.current_actions[action_record_id]
    elif action_record_id in self.stopped_actions:
        # Already tracked as stopped — nothing to do.
        logging.info("Action %s already in stopped_actions", action_record_id)
        return {
            "success": True,
            "reason": "already_stopped",
            "action_id": action_record_id
        }
    else:
        logging.warning("Action %s not found in current or stopped actions", action_record_id)
        return {
            "success": False,
            "reason": "action_not_found",
            "action_id": action_record_id
        }

    # Process already dead: just move the bookkeeping entry over.
    if not action_instance.is_running():
        logging.info("Action %s is not running, moving to stopped_actions", action_record_id)
        self._move_to_stopped(action_record_id, action_instance)
        return {
            "success": True,
            "reason": "already_stopped",
            "action_id": action_record_id
        }

    try:
        logging.info("Stopping action %s", action_record_id)
        action_instance.stop()

        # Report the stop to the backend before local bookkeeping.
        self.scaling.update_action_status(
            service_provider=os.environ["SERVICE_PROVIDER"],
            action_record_id=action_record_id,
            status="stopped",
            isRunning=False,
            action_duration=0,
        )

        self._move_to_stopped(action_record_id, action_instance)

        logging.info("Successfully stopped action: %s", action_record_id)
        return {
            "success": True,
            "action_id": action_record_id,
            "stopped_at": time.time()
        }

    except Exception as e:
        logging.error("Error stopping action %s: %s", action_record_id, str(e))
        return {
            "success": False,
            "reason": "stop_failed",
            "error": str(e),
            "action_id": action_record_id
        }

def _move_to_stopped(self, action_record_id: str, action_instance) -> None:
    """Move an action from current_actions to stopped_actions without duplicates."""
    self.current_actions.pop(action_record_id, None)
    self.stopped_actions.setdefault(action_record_id, action_instance)
342
+
343
@log_errors(default_return={}, raise_exception=False)
def restart_action(self, action_record_id: str) -> dict:
    """Restart a specific action by its record ID.

    Stops the action if it is running, then fetches fresh action details
    from the backend and starts it again.

    Args:
        action_record_id (str): The action record ID to restart

    Returns:
        dict: Result dictionary with status information
    """
    logging.info("Attempting to restart action: %s", action_record_id)

    # Step 1: stop a running instance (or note a previously stopped one).
    stop_result = {"success": True, "reason": "not_running"}
    if action_record_id in self.current_actions:
        logging.info("Stopping existing action %s before restart", action_record_id)
        stop_result = self.stop_action(action_record_id)
        if not stop_result.get("success"):
            logging.error("Failed to stop action %s for restart", action_record_id)
            return {
                "success": False,
                "reason": "stop_failed_before_restart",
                "stop_result": stop_result,
                "action_id": action_record_id
            }
        # Wait a moment for cleanup
        time.sleep(2)
    elif action_record_id in self.stopped_actions:
        logging.info("Action %s found in stopped_actions, will restart", action_record_id)
        stop_result = {"success": True, "reason": "was_stopped"}

    try:
        # Step 2: fetch fresh action details from the backend.
        logging.info("Fetching action details for restart: %s", action_record_id)
        action_details, error, _ = self.scaling.get_action_details(action_record_id)
        if error or not action_details:
            logging.error("Failed to fetch action details for %s: %s",
                          action_record_id, error)
            return {
                "success": False,
                "reason": "fetch_failed",
                "error": error,
                "action_id": action_record_id
            }

        # Step 3: process (start) the action.
        logging.info("Starting action %s after restart", action_record_id)
        action_instance = self.process_action(action_details)
        if not action_instance:
            logging.error("Failed to start action %s after restart", action_record_id)
            return {
                "success": False,
                "reason": "start_failed_after_restart",
                "action_id": action_record_id
            }

        # Re-register under current_actions, clearing stale bookkeeping.
        self.stopped_actions.pop(action_record_id, None)
        if action_record_id in self.current_actions:
            logging.warning("Action %s already in current_actions during restart, replacing", action_record_id)
            del self.current_actions[action_record_id]
        self.current_actions[action_record_id] = action_instance

        logging.info("Successfully restarted action: %s", action_record_id)
        return {
            "success": True,
            "action_id": action_record_id,
            "restarted_at": time.time(),
            "stop_result": stop_result
        }

    except Exception as e:
        logging.error("Error restarting action %s: %s", action_record_id, str(e))
        return {
            "success": False,
            "reason": "restart_failed",
            "error": str(e),
            "action_id": action_record_id
        }
434
+
195
435
  @log_errors(raise_exception=True)
196
436
  def start_actions_manager(self) -> None:
197
437
  """Start the actions manager main loop."""
@@ -0,0 +1,490 @@
1
+ """
2
+ Compute Operations Handler - Kafka Event-Driven Operations Manager
3
+
4
+ This module handles compute instance operations (start/stop/restart) triggered from
5
+ the frontend dashboard via Kafka events. It consumes events from the 'compute_operations'
6
+ topic and performs the actual operations on compute instances and their actions.
7
+
8
+ Uses EventListener from matrice_common for simplified Kafka consumption.
9
+
10
+ Event Structure:
11
+ {
12
+ "instance_id": "string",
13
+ "action_record_id": "string", # Can be ObjectID("000000000000000000000000") or all zeros for instance-level operations
14
+ "operation": "start|stop|restart",
15
+ "account_number": 12345,
16
+ "requested_by": "user@example.com",
17
+ "request_id": "uuid-string",
18
+ "timestamp": "2025-11-21T10:30:00.123Z"
19
+ }
20
+ """
21
+
22
+ import logging
23
+ import re
24
+ import time
25
+ from typing import Dict, Any, Optional
26
+ import sys
27
+ import traceback
28
+ import os
29
+ import subprocess
30
+
31
+ from matrice_common.stream.event_listener import EventListener
32
+
33
+ # Configure logging
34
+ logger = logging.getLogger(__name__)
35
+
36
+
37
+ class ComputeOperationsHandler:
38
+ """
39
+ Handles Kafka-based compute operations for instance and action management.
40
+
41
+ This class uses EventListener from matrice_common to listen for operation
42
+ events from the 'compute_operations' Kafka topic. It delegates operations
43
+ to the ActionsManager for execution and updates status via API calls.
44
+ """
45
+
46
+ KAFKA_TOPIC = "compute_operations"
47
+
48
def __init__(self, actions_manager, session, scaling, instance_id: str):
    """
    Initialize the Compute Operations Handler.

    Args:
        actions_manager: Reference to the ActionsManager instance
        session: Session object for authentication and Kafka configuration
        scaling: Scaling service instance for API status updates
        instance_id: This compute instance's ID for filtering events
    """
    self.instance_id = instance_id
    self.actions_manager = actions_manager
    self.scaling = scaling
    self.session = session
    # The listener is created lazily in start(); None until then.
    self.event_listener: Optional[EventListener] = None
    self.running = False

    logger.info(f"Initializing ComputeOperationsHandler for instance ID: {instance_id}")
66
+
67
def start(self) -> bool:
    """
    Start the operations handler using EventListener.

    Returns:
        bool: True if started successfully, False otherwise
    """
    # Idempotence guard: a second start() is a no-op that reports failure.
    if self.running:
        logger.warning("ComputeOperationsHandler is already running")
        return False

    try:
        listener = EventListener(
            session=self.session,
            topics=[self.KAFKA_TOPIC],
            event_handler=self._handle_operation_event,
            filter_field='instance_id',
            filter_value=self.instance_id,
            consumer_group_id=f"compute_ops_{self.instance_id}"
        )
        self.event_listener = listener
        self.running = listener.start()

        if self.running:
            logger.info("ComputeOperationsHandler started successfully")
        else:
            logger.error("ComputeOperationsHandler failed to start")
        return self.running

    except Exception as e:
        logger.error(f"Failed to start ComputeOperationsHandler: {e}")
        logger.error(traceback.format_exc())
        return False
100
+
101
def stop(self):
    """
    Stop the operations handler gracefully.
    """
    logger.info("Stopping ComputeOperationsHandler...")
    # Flip the flag first so the listener callback loop winds down.
    self.running = False

    listener = self.event_listener
    if listener:
        listener.stop()

    logger.info("ComputeOperationsHandler stopped")
112
+
113
def _handle_operation_event(self, event: Dict[str, Any]):
    """
    Handle incoming operation event from Kafka.

    This is the callback function passed to EventListener.

    Args:
        event: The operation event dictionary
    """
    logger.info(f"Received operation event: {event}")

    # Malformed events are logged and dropped; only valid ones proceed.
    if self._validate_event(event):
        self._process_operation(event)
    else:
        logger.error(f"Invalid event structure: {event}")
131
+
132
def _is_instance_level_operation(self, action_record_id: str) -> bool:
    """
    Check if action_record_id represents an instance-level operation.

    Instance-level operations are identified by an action_record_id made up
    of only zeros, which arrives in various formats:
    - "000000000000000000000000"
    - "ObjectID(\"000000000000000000000000\")"
    - "ObjectID('000000000000000000000000')"

    Args:
        action_record_id: The action record ID to check

    Returns:
        True if this is an instance-level operation, False otherwise
    """
    if not action_record_id:
        return False

    # Reuse the shared unwrapping helper rather than duplicating the
    # ObjectID(...) regex — keeps the two parsers consistent.
    clean_id = self._extract_action_record_id(action_record_id)

    # All-zeros (any length) marks an instance-level operation.
    return clean_id.replace('0', '') == ''
159
+
160
+ def _extract_action_record_id(self, action_record_id: str) -> str:
161
+ """
162
+ Extract the actual action record ID from various formats.
163
+
164
+ Args:
165
+ action_record_id: The raw action record ID (may be wrapped in ObjectID)
166
+
167
+ Returns:
168
+ The extracted action record ID string
169
+ """
170
+ if not action_record_id:
171
+ return action_record_id
172
+
173
+ # Handle ObjectID("...") or ObjectID('...') format
174
+ if 'ObjectID' in action_record_id:
175
+ match = re.search(r'ObjectID\(["\']([^"\']+)["\']\)', action_record_id)
176
+ if match:
177
+ return match.group(1)
178
+
179
+ return action_record_id
180
+
181
+ def _validate_event(self, event: Dict[str, Any]) -> bool:
182
+ """
183
+ Validate that the event has all required fields.
184
+
185
+ Args:
186
+ event: The event dictionary to validate
187
+
188
+ Returns:
189
+ True if event is valid, False otherwise
190
+ """
191
+ required_fields = [
192
+ "instance_id",
193
+ "action_record_id",
194
+ "operation",
195
+ "account_number",
196
+ "requested_by",
197
+ "request_id",
198
+ "timestamp"
199
+ ]
200
+
201
+ for field in required_fields:
202
+ if field not in event:
203
+ logger.error(f"Missing required field: {field}")
204
+ return False
205
+
206
+ # Validate operation type
207
+ valid_operations = ["start", "stop", "restart"]
208
+ if event["operation"] not in valid_operations:
209
+ logger.error(f"Invalid operation: {event['operation']}. Must be one of {valid_operations}")
210
+ return False
211
+
212
+ return True
213
+
214
def _process_operation(self, event: Dict[str, Any]):
    """
    Process a compute operation event.

    Args:
        event: The operation event dictionary
    """
    operation = event["operation"]
    raw_id = event["action_record_id"]
    action_record_id = self._extract_action_record_id(raw_id)

    logger.info(f"Processing {operation} operation for action {action_record_id} "
                f"(request: {event['request_id']}, user: {event['requested_by']})")

    try:
        # All-zero ids address the whole instance rather than one action.
        if self._is_instance_level_operation(raw_id):
            outcome = self._handle_instance_operation(operation, event)
        else:
            outcome = self._handle_action_operation(operation, action_record_id, event)

        # Report success via API and logging.
        self._update_operation_status(event, action_record_id, "completed", outcome)

    except Exception as e:
        error_msg = f"Operation failed: {str(e)}"
        logger.error(error_msg)
        logger.error(traceback.format_exc())

        # Report the failure with the captured error message.
        self._update_operation_status(event, action_record_id, "failed", {"error": error_msg})
249
+
250
+ def _handle_action_operation(self, operation: str, action_record_id: str,
251
+ event: Dict[str, Any]) -> Dict[str, Any]:
252
+ """
253
+ Handle operations on a specific action.
254
+
255
+ Args:
256
+ operation: The operation type (start/stop/restart)
257
+ action_record_id: The action record ID to operate on
258
+ event: The full event dictionary
259
+
260
+ Returns:
261
+ Result dictionary with operation details
262
+ """
263
+ if operation == "start":
264
+ return self._start_action(action_record_id, event)
265
+ elif operation == "stop":
266
+ return self._stop_action(action_record_id, event)
267
+ elif operation == "restart":
268
+ return self._restart_action(action_record_id, event)
269
+ else:
270
+ raise ValueError(f"Unknown operation: {operation}")
271
+
272
def _handle_instance_operation(self, operation: str, event: Dict[str, Any]) -> Dict[str, Any]:
    """
    Handle operations on the entire instance (the Python application itself).

    Args:
        operation: The operation type (start/stop/restart)
        event: The full event dictionary

    Returns:
        Result dictionary with operation details (may not return if app is killed/restarted)
    """
    logger.info(f"Executing instance-level {operation} operation on Python application")

    if operation == "start":
        # "start" has no meaning for an already-running application.
        logger.warning("Start operation not supported at instance level")
        return {
            "operation": operation,
            "instance_level": True,
            "status": "not_supported",
            "message": "Start operation is not supported at instance level"
        }

    if operation == "stop":
        logger.critical("Instance-level STOP: Killing Python application process")
        try:
            # Record the outcome before the process disappears.
            logger.warning(
                f"Operation {operation} on instance {self.instance_id}: "
                f"completed - killing_application (PID: {os.getpid()})"
            )
            time.sleep(0.5)  # give log handlers a moment to flush
        except Exception as e:
            logger.error(f"Failed to log status before kill: {e}")

        logger.critical(f"Terminating Python application (PID: {os.getpid()})")
        os._exit(0)  # hard exit: deliberately skips cleanup handlers

    elif operation == "restart":
        logger.critical("Instance-level RESTART: Restarting Python application process")
        try:
            # Record the outcome before the process is replaced.
            logger.warning(
                f"Operation {operation} on instance {self.instance_id}: "
                f"completed - restarting_application (PID: {os.getpid()})"
            )
            time.sleep(0.5)  # give log handlers a moment to flush
        except Exception as e:
            logger.error(f"Failed to log status before restart: {e}")

        logger.critical(f"Restarting Python application (PID: {os.getpid()})")
        self._restart_application()

    # Unreachable after a successful stop; restart can fall through only if
    # the restart helper itself returned instead of replacing the process.
    return {
        "operation": operation,
        "instance_level": True,
        "status": "completed"
    }
337
+
338
+ def _restart_application(self):
339
+ """
340
+ Restart the Python application by replacing the current process.
341
+ This uses os.execv() to replace the current process with a new one.
342
+ """
343
+ try:
344
+ python_executable = sys.executable
345
+ script_args = sys.argv
346
+
347
+ logger.info(f"Restarting with: {python_executable} {' '.join(script_args)}")
348
+
349
+ # Use os.execv() to replace the current process
350
+ # This will restart the application with the same arguments
351
+ os.execv(python_executable, [python_executable] + script_args)
352
+
353
+ except Exception as e:
354
+ logger.error(f"Failed to restart application: {e}")
355
+ logger.error(traceback.format_exc())
356
+ # Fallback: try using subprocess to start a new process and exit
357
+ try:
358
+ logger.info("Attempting fallback restart method")
359
+ python_executable = sys.executable
360
+ script_args = sys.argv
361
+
362
+ # Start new process
363
+ subprocess.Popen([python_executable] + script_args)
364
+ # Exit current process
365
+ logger.critical("New process started, exiting current process")
366
+ os._exit(0)
367
+ except Exception as fallback_error:
368
+ logger.error(f"Fallback restart also failed: {fallback_error}")
369
+ logger.error(traceback.format_exc())
370
+ # Last resort: just exit
371
+ os._exit(1)
372
+
373
+ def _start_action(self, action_record_id: str, event: Dict[str, Any]) -> Dict[str, Any]:
374
+ """
375
+ Start a specific action.
376
+
377
+ Args:
378
+ action_record_id: The action record ID to start
379
+ event: The full event dictionary
380
+
381
+ Returns:
382
+ Result dictionary
383
+ """
384
+ logger.info(f"Starting action: {action_record_id}")
385
+
386
+ # Check if action is already running
387
+ current_actions = self.actions_manager.get_current_actions()
388
+ if action_record_id in current_actions:
389
+ action_instance = current_actions[action_record_id]
390
+ if action_instance.is_running():
391
+ logger.warning(f"Action {action_record_id} is already running")
392
+ return {
393
+ "status": "already_running",
394
+ "action_id": action_record_id
395
+ }
396
+
397
+ # Fetch action details from backend and start it
398
+ # This will be handled by the ActionsManager's normal flow
399
+ # Force a fetch to pick up this specific action
400
+ self.actions_manager.fetch_actions()
401
+
402
+ return {
403
+ "status": "started",
404
+ "action_id": action_record_id
405
+ }
406
+
407
+ def _stop_action(self, action_record_id: str, event: Dict[str, Any]) -> Dict[str, Any]:
408
+ """
409
+ Stop a specific action.
410
+
411
+ Args:
412
+ action_record_id: The action record ID to stop
413
+ event: The full event dictionary
414
+
415
+ Returns:
416
+ Result dictionary
417
+ """
418
+ logger.info(f"Stopping action: {action_record_id}")
419
+
420
+ result = self.actions_manager.stop_action(action_record_id)
421
+
422
+ return {
423
+ "status": "stopped",
424
+ "action_id": action_record_id,
425
+ "details": result
426
+ }
427
+
428
+ def _restart_action(self, action_record_id: str, event: Dict[str, Any]) -> Dict[str, Any]:
429
+ """
430
+ Restart a specific action.
431
+
432
+ Args:
433
+ action_record_id: The action record ID to restart
434
+ event: The full event dictionary
435
+
436
+ Returns:
437
+ Result dictionary
438
+ """
439
+ logger.info(f"Restarting action: {action_record_id}")
440
+
441
+ result = self.actions_manager.restart_action(action_record_id)
442
+
443
+ return {
444
+ "status": "restarted",
445
+ "action_id": action_record_id,
446
+ "details": result
447
+ }
448
+
449
+ def _update_operation_status(self, event: Dict[str, Any], action_record_id: str,
450
+ status: str, result: Dict[str, Any]):
451
+ """
452
+ Update operation status via API and logging.
453
+
454
+ Args:
455
+ event: The original event
456
+ action_record_id: The extracted action record ID
457
+ status: Operation status (completed/failed)
458
+ result: Result details
459
+ """
460
+ operation = event["operation"]
461
+ request_id = event["request_id"]
462
+
463
+ # Log status as warning for visibility
464
+ logger.warning(
465
+ f"Operation {operation} on {action_record_id}: {status} - "
466
+ f"request_id={request_id}, result={result}"
467
+ )
468
+
469
+ # Update via API (for action-level operations only)
470
+ if not self._is_instance_level_operation(event["action_record_id"]):
471
+ try:
472
+ # Determine isRunning based on operation and status
473
+ is_running = False
474
+ if status == "completed":
475
+ if operation == "start":
476
+ is_running = True
477
+ elif operation == "restart":
478
+ is_running = True
479
+ elif operation == "stop":
480
+ is_running = False
481
+
482
+ self.scaling.update_action_status(
483
+ service_provider=os.environ.get("SERVICE_PROVIDER", ""),
484
+ action_record_id=action_record_id,
485
+ status=status,
486
+ isRunning=is_running,
487
+ )
488
+ logger.info(f"API status updated for action {action_record_id}: {status}")
489
+ except Exception as e:
490
+ logger.error(f"Failed to update API status for action {action_record_id}: {e}")
@@ -7,6 +7,7 @@ import threading
7
7
  import time
8
8
  from matrice_compute.actions_manager import ActionsManager
9
9
  from matrice_compute.actions_scaledown_manager import ActionsScaleDownManager
10
+ from matrice_compute.compute_operations_handler import ComputeOperationsHandler
10
11
  from matrice_compute.instance_utils import (
11
12
  get_instance_info,
12
13
  get_decrypted_access_key_pair,
@@ -90,6 +91,22 @@ class InstanceManager:
90
91
  logging.info("InstanceManager initialized with machine resources tracker")
91
92
  self.actions_resources_tracker = ActionsResourcesTracker(self.scaling)
92
93
  logging.info("InstanceManager initialized with actions resources tracker")
94
+
95
+ # Initialize Compute Operations Handler for event-driven operations
96
+ # Uses EventListener from matrice_common for simplified Kafka consumption
97
+ try:
98
+ instance_id = os.environ.get("INSTANCE_ID")
99
+ self.compute_operations_handler = ComputeOperationsHandler(
100
+ actions_manager=self.actions_manager,
101
+ session=self.session,
102
+ scaling=self.scaling,
103
+ instance_id=instance_id
104
+ )
105
+ logging.info("InstanceManager initialized with Compute Operations Handler for instance ID: %s", instance_id)
106
+ except Exception as e:
107
+ logging.warning("Failed to initialize Compute Operations Handler: %s", e)
108
+ self.compute_operations_handler = None
109
+
93
110
  self.poll_interval = 10
94
111
  # Note: encryption_key is set in _setup_env_credentials
95
112
  logging.info("InstanceManager initialized.")
@@ -252,6 +269,14 @@ class InstanceManager:
252
269
  Returns:
253
270
  tuple: (instance_manager_thread, actions_manager_thread)
254
271
  """
272
+ # Start Compute Operations Handler in background thread
273
+ if self.compute_operations_handler:
274
+ try:
275
+ self.compute_operations_handler.start()
276
+ logging.info("Started Compute Operations Handler")
277
+ except Exception as exc:
278
+ logging.error("Failed to start Compute Operations Handler: %s", str(exc))
279
+
255
280
  # Create and start threads
256
281
  instance_manager_thread = threading.Thread(
257
282
  target=self.start_instance_manager,
@@ -941,6 +941,120 @@ def get_single_gpu_with_sufficient_memory_for_action(
941
941
  raise ValueError(error_msg)
942
942
 
943
943
 
944
@log_errors(default_return="", raise_exception=False)
def get_gpu_config_for_deployment(action_details, is_first_deployment=False):
    """Get GPU configuration for deployment actions.

    The first deployment of a service tries to claim every GPU
    ('--gpus all'); subsequent deployments — and any fallback when no GPU is
    detected — use standard selection via
    get_gpu_with_sufficient_memory_for_action (most free memory).

    Args:
        action_details (dict): Action details containing GPU requirements
        is_first_deployment (bool): Whether this is the first deployment for this service

    Returns:
        str: GPU configuration string ('--gpus all' or '--gpus "device=X"' or '')
    """
    action_id = action_details.get("_id", "unknown")
    details = action_details.get("actionDetails", {})

    # CPU-only actions need no GPU flags at all.
    if not details.get("gpuRequired", False):
        logging.info(
            "Action %s does not require GPU - will run on CPU",
            action_id
        )
        return ""

    if is_first_deployment:
        logging.info(
            "Action %s: First deployment - attempting to use all GPUs",
            action_id
        )
        try:
            # Probe for GPUs; a zero exit code with non-empty output means
            # nvidia-smi found at least one device.
            probe = subprocess.run(
                ["nvidia-smi", "--query-gpu=count", "--format=csv,noheader"],
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                timeout=5,
                check=False,
            )
            if probe.returncode == 0 and probe.stdout.strip():
                logging.info(
                    "Action %s: Using all GPUs for first deployment",
                    action_id
                )
                return '--gpus all'
            logging.warning(
                "Action %s: No GPUs detected via nvidia-smi for first deployment, falling back to standard GPU selection",
                action_id
            )
        except Exception as e:
            logging.warning(
                "Action %s: Error checking GPU availability (%s), falling back to standard GPU selection",
                action_id,
                str(e)
            )

    # Standard path (also the first-deployment fallback): pick the GPU(s)
    # with the most free memory that satisfy the action's requirement.
    logging.info(
        "Action %s: Using standard GPU allocation (most free memory)",
        action_id
    )

    required_memory = details.get("expectedResources", {}).get("gpuMemory", 0)

    try:
        gpu_indices = get_gpu_with_sufficient_memory_for_action(
            action_details=action_details
        )

        if not gpu_indices:
            logging.warning(
                "Action %s: No GPUs with sufficient memory found (required: %d MB)",
                action_id,
                required_memory
            )
            return ""

        gpu_str = ",".join(map(str, gpu_indices))
        logging.info(
            "Action %s: Selected GPU device(s): %s (required memory: %d MB)",
            action_id,
            gpu_str,
            required_memory
        )
        # Docker GPU configuration pinned to the selected device(s).
        return f'--gpus "device={gpu_str}"'

    except ValueError as e:
        logging.error(
            "Action %s: Error selecting GPU - %s",
            action_id,
            str(e)
        )
        return ""
    except Exception as e:
        logging.error(
            "Action %s: Unexpected error in GPU selection - %s",
            action_id,
            str(e)
        )
        return ""
1056
+
1057
+
944
1058
  @log_errors(default_return=(None, None), raise_exception=False)
945
1059
  def get_decrypted_access_key_pair(
946
1060
  enc_access_key: str,
@@ -402,8 +402,13 @@ class ActionsResourcesTracker:
402
402
  new_args.extend(x.replace('"', "").replace("'", "") for x in arg.split(" "))
403
403
  return new_args
404
404
 
405
- args_24 = [arg for arg in remove_quotation_marks(inspect_data["Args"]) if len(arg) == 24 and "pypi" not in arg]
406
- action_record_id = args_24[-1] if args_24 else None
405
def is_valid_objectid(s: str) -> bool:
    """Check if string is a valid MongoDB ObjectId (24 hex characters)"""
    candidate = s.strip()
    if len(candidate) != 24:
        return False
    # Every character must be a hex digit (either case).
    return not (set(candidate) - set("0123456789abcdefABCDEF"))
409
+
410
+ valid_objectids = [arg for arg in remove_quotation_marks(inspect_data["Args"]) if is_valid_objectid(arg)]
411
+ action_record_id = valid_objectids[-1] if valid_objectids else None
407
412
  if not action_record_id:
408
413
  logging.debug("No valid action_id found for the container. Container ID: %s, Args: %s", container.id, inspect_data["Args"])
409
414
  duration = calculate_time_difference(start_time, finish_time)
@@ -2,6 +2,7 @@
2
2
 
3
3
  import os
4
4
  import logging
5
+ import base64
5
6
  from matrice_common.utils import log_errors
6
7
 
7
8
  class Scaling:
@@ -33,6 +34,28 @@ class Scaling:
33
34
  "Initialized Scaling with instance_id: %s (REST API only)",
34
35
  instance_id
35
36
  )
37
+
38
+ @log_errors(default_return=None, log_error=True)
39
+ def get_kafka_bootstrap_servers(self):
40
+ """Get Kafka bootstrap servers from API and decode base64 fields.
41
+
42
+ Returns:
43
+ str: Kafka bootstrap servers in format "ip:port"
44
+
45
+ Raises:
46
+ ValueError: If unable to fetch Kafka configuration
47
+ """
48
+ path = "/v1/actions/get_kafka_info"
49
+ response = self.rpc.get(path=path)
50
+ if not response or not response.get("success"):
51
+ raise ValueError(f"Failed to fetch Kafka config: {response.get('message', 'No response')}")
52
+ encoded_ip = response["data"]["ip"]
53
+ encoded_port = response["data"]["port"]
54
+ ip = base64.b64decode(encoded_ip).decode("utf-8")
55
+ port = base64.b64decode(encoded_port).decode("utf-8")
56
+ bootstrap_servers = f"{ip}:{port}"
57
+ # logging.info(f"Retrieved Kafka bootstrap servers: {bootstrap_servers}")
58
+ return bootstrap_servers
36
59
 
37
60
  @log_errors(default_return=(None, "Error processing response", "Response processing failed"), log_error=True)
38
61
  def handle_response(self, resp, success_message, error_message):
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: matrice_compute
3
- Version: 0.1.24
3
+ Version: 0.1.26
4
4
  Summary: Common server utilities for Matrice.ai services
5
5
  Author-email: "Matrice.ai" <dipendra@matrice.ai>
6
6
  License-Expression: MIT
@@ -0,0 +1,18 @@
1
+ matrice_compute/__init__.py,sha256=ZzQcFsT005VCgq9VZUh565f4upOooEb_FwZ6RgweNZs,597
2
+ matrice_compute/action_instance.py,sha256=SYUZrfj6dtcgEjeEgCyKlrc2p2o08jlW84Y__V4Aqew,69552
3
+ matrice_compute/actions_manager.py,sha256=Iex5uw0PLRR4pvIAZDxc2CypucbanKDbJ3SK8mMGXK8,18148
4
+ matrice_compute/actions_scaledown_manager.py,sha256=pJ0nduNwHWZ10GnqJNx0Ok7cVWabQ_M8E2Vb9pH3A_k,2002
5
+ matrice_compute/compute_operations_handler.py,sha256=amcMhmXtv2irE6qK8Vbgec_8uFqjWmVVp0VWq-73_MU,17781
6
+ matrice_compute/instance_manager.py,sha256=sUkDsy_XrPp7CKQxlujQRz3E_8rVbVZOy7byJOgMlEs,11376
7
+ matrice_compute/instance_utils.py,sha256=N4yPDvNukFEEBngR0lEt4x_XT5hur1q0P-spM2xQIlU,42025
8
+ matrice_compute/prechecks.py,sha256=W9YmNF3RcLhOf4U8WBlExvFqDw1aGWSNTlJtA73lbDQ,17196
9
+ matrice_compute/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
10
+ matrice_compute/resources_tracker.py,sha256=wy1huqB3Tw_kYC2wfnLa9iSyhDmgI7WQ5I9Kyr-1RSs,22829
11
+ matrice_compute/scaling.py,sha256=JNOgSpAPqbTlZ4qJokkdS9PehqyFwfPh4q98qrfNVCQ,24708
12
+ matrice_compute/shutdown_manager.py,sha256=0MYV_AqygqR9NEntYf7atUC-PbWXyNkm1f-8c2aizgA,13234
13
+ matrice_compute/task_utils.py,sha256=3qIutiQdYPyGRxH9ZwLbqdg8sZcnp6jp08pszWCRFl0,2820
14
+ matrice_compute-0.1.26.dist-info/licenses/LICENSE.txt,sha256=_uQUZpgO0mRYL5-fPoEvLSbNnLPv6OmbeEDCHXhK6Qc,1066
15
+ matrice_compute-0.1.26.dist-info/METADATA,sha256=t7TsI5DcNElRmlKsa8CArXCcA4iBO-9QwZ6j9UQOdg0,1038
16
+ matrice_compute-0.1.26.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
17
+ matrice_compute-0.1.26.dist-info/top_level.txt,sha256=63Plr3L1GzBUWZO5JZaFkiv8IcB10xUPU-9w3i6ptvE,16
18
+ matrice_compute-0.1.26.dist-info/RECORD,,
@@ -1,17 +0,0 @@
1
- matrice_compute/__init__.py,sha256=ZzQcFsT005VCgq9VZUh565f4upOooEb_FwZ6RgweNZs,597
2
- matrice_compute/action_instance.py,sha256=NK_ZWvNDrLUeOzWwXjxrX7XP-lDHbx5-A0K8ByFpnUg,66241
3
- matrice_compute/actions_manager.py,sha256=5U-xM6tl_Z6x96bi-c7AJM9ru80LqTN8f5Oce8dAu_A,7780
4
- matrice_compute/actions_scaledown_manager.py,sha256=pJ0nduNwHWZ10GnqJNx0Ok7cVWabQ_M8E2Vb9pH3A_k,2002
5
- matrice_compute/instance_manager.py,sha256=8USyX09ZxLvnVNIrjRogbyUeMCfgWnasuRqYkkVF4tQ,10146
6
- matrice_compute/instance_utils.py,sha256=xDOLo21G7unvlGTpnYQkEWSkyuAsVAcs4scOHy5Oxi4,38204
7
- matrice_compute/prechecks.py,sha256=W9YmNF3RcLhOf4U8WBlExvFqDw1aGWSNTlJtA73lbDQ,17196
8
- matrice_compute/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
9
- matrice_compute/resources_tracker.py,sha256=pkdt0aVKx_TpY_Sq---73w9INkDffZZe3mZGlp1EftE,22573
10
- matrice_compute/scaling.py,sha256=CeT_lxJNkjJamRETG1lWaOtdSr5ySmcaMcqt7-lFRbo,23731
11
- matrice_compute/shutdown_manager.py,sha256=0MYV_AqygqR9NEntYf7atUC-PbWXyNkm1f-8c2aizgA,13234
12
- matrice_compute/task_utils.py,sha256=3qIutiQdYPyGRxH9ZwLbqdg8sZcnp6jp08pszWCRFl0,2820
13
- matrice_compute-0.1.24.dist-info/licenses/LICENSE.txt,sha256=_uQUZpgO0mRYL5-fPoEvLSbNnLPv6OmbeEDCHXhK6Qc,1066
14
- matrice_compute-0.1.24.dist-info/METADATA,sha256=5fsmPC37r0KPPd6h0qQXnvm0dFqLqboVInQdv7KCr5Y,1038
15
- matrice_compute-0.1.24.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
16
- matrice_compute-0.1.24.dist-info/top_level.txt,sha256=63Plr3L1GzBUWZO5JZaFkiv8IcB10xUPU-9w3i6ptvE,16
17
- matrice_compute-0.1.24.dist-info/RECORD,,