PyPI - matrice-compute - Versions diffs - 0.1.1__py3-none-any.whl - Mend

matrice-compute 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (17) hide show

matrice_compute/__init__.py +9 -0
matrice_compute/action_instance.py +1508 -0
matrice_compute/actions_manager.py +226 -0
matrice_compute/actions_scaledown_manager.py +57 -0
matrice_compute/instance_manager.py +270 -0
matrice_compute/instance_utils.py +707 -0
matrice_compute/prechecks.py +538 -0
matrice_compute/py.typed +0 -0
matrice_compute/resources_tracker.py +478 -0
matrice_compute/scaling.py +880 -0
matrice_compute/shutdown_manager.py +314 -0
matrice_compute/task_utils.py +77 -0
matrice_compute-0.1.1.dist-info/METADATA +28 -0
matrice_compute-0.1.1.dist-info/RECORD +17 -0
matrice_compute-0.1.1.dist-info/WHEEL +5 -0
matrice_compute-0.1.1.dist-info/licenses/LICENSE.txt +21 -0
matrice_compute-0.1.1.dist-info/top_level.txt +1 -0

matrice_compute/actions_manager.py ADDED Viewed

@@ -0,0 +1,226 @@
+"""Module providing actions_manager functionality."""
+import logging
+import os
+import time
+from matrice_compute.action_instance import (
+    ActionInstance,
+)
+from matrice_compute.instance_utils import (
+    has_gpu,
+    get_mem_usage,
+    cleanup_docker_storage,
+)
+from matrice_compute.scaling import (
+    Scaling,
+)
+from matrice_common.utils import log_errors
+class ActionsManager:
+    """Fetch jobs assigned to this instance, start them, and track the started ones.
+
+    Attributes:
+        current_actions: Started actions keyed by their action record id.
+        scaling: Scaling service client used for every backend call.
+        memory_threshold: Memory usage value below which new actions are polled.
+        poll_interval: Base number of seconds between polls.
+        last_actions_check: Initialised to 0; not read anywhere in this class.
+    """
+
+    def __init__(self, scaling: Scaling):
+        """Initialize an action manager.
+
+        Args:
+            scaling (Scaling): Scaling service instance
+        """
+        self.current_actions: dict[str, ActionInstance] = {}
+        self.scaling = scaling
+        self.memory_threshold = 0.9
+        self.poll_interval = 10
+        self.last_actions_check = 0
+        logging.info("ActionsManager initialized")
+
+    @log_errors(default_return=[], raise_exception=False)
+    def fetch_actions(self) -> list:
+        """Ask the backend which jobs are assigned to this instance.
+
+        This method only fetches; starting the actions is done by
+        process_actions().
+
+        Returns:
+            list: Fetched action dicts, without empty entries and without
+                entries whose ``_id`` is the all-zero id. Empty when the
+                backend reports an error.
+        """
+        actions = []
+        logging.info("Polling backend for new jobs")
+        fetched_actions, error, _ = self.scaling.assign_jobs(has_gpu())
+        if error:
+            logging.error("Error assigning jobs: %s", error)
+            return actions
+        # Normalise a non-list response into a one-element list.
+        if not isinstance(fetched_actions, list):
+            fetched_actions = [fetched_actions]
+        for action in fetched_actions:
+            if not action:
+                continue
+            # NOTE(review): the all-zero id looks like a "no job" placeholder
+            # returned by the backend - confirm.
+            if action["_id"] == "000000000000000000000000":
+                continue
+            actions.append(action)
+            # Log the action that was just accepted, not the accumulated list.
+            logging.info("Fetched action details: %s", action)
+        return actions
+
+    @log_errors(default_return=None, raise_exception=False)
+    def process_action(self, action: dict) -> ActionInstance:
+        """Report the action as starting/started, lock it, and execute it.
+
+        Args:
+            action (dict): Action details; ``_id`` and ``action`` keys are read.
+
+        Returns:
+            ActionInstance: The started action instance. Presumably None when
+                log_errors swallows a failure (per its default_return).
+        """
+        logging.info(
+            "Processing action: %s",
+            action["_id"],
+        )
+        action_instance = ActionInstance(self.scaling, action)
+        self.scaling.update_action_status(
+            service_provider=os.environ["SERVICE_PROVIDER"],
+            action_record_id=action["_id"],
+            status="starting",
+            action_duration=0,
+        )
+        logging.info("locking action")
+        self.scaling.update_action_status(
+            service_provider=os.environ["SERVICE_PROVIDER"],
+            status="started",
+            action_record_id=action["_id"],
+            isRunning=True,
+            action_duration=0,
+            cpuUtilisation=0.0,
+            gpuUtilisation=0.0,
+            memoryUtilisation=0.0,
+            gpuMemoryUsed=0,
+        )
+        self.scaling.update_status(
+            action["_id"],
+            action["action"],
+            "bg-job-scheduler",
+            "JBSS_LCK",
+            "OK",
+            "Job is locked for processing",
+        )
+        action_instance.execute()
+        logging.info(
+            "action %s started.",
+            action_instance.action_record_id,
+        )
+        return action_instance
+
+    @log_errors(raise_exception=False)
+    def process_actions(self) -> None:
+        """Fetch new actions and start each one, tracking those that started."""
+        for action in self.fetch_actions():
+            action_instance = self.process_action(action)
+            # A falsy result means the action did not start; do not track it.
+            if action_instance:
+                self.current_actions[action["_id"]] = action_instance
+
+    @log_errors(raise_exception=False)
+    def purge_unwanted(self) -> None:
+        """Drop tracked actions that are no longer running.
+
+        An action is removed from current_actions when:
+        1. Its is_running() method returns a falsy value, or
+        2. It has no ``process`` attribute or that attribute is None.
+        After removal, the instance's stop() is called if it has one.
+        """
+        purged_count = 0
+        # Iterate over a snapshot because entries are deleted inside the loop.
+        for action_id, instance in list(self.current_actions.items()):
+            if not instance.is_running():
+                purge_reason = "process reported as not running"
+            elif not hasattr(instance, 'process') or instance.process is None:
+                purge_reason = "invalid process object"
+            else:
+                continue
+            logging.info(
+                "Action %s is being purged: %s",
+                action_id,
+                purge_reason
+            )
+            del self.current_actions[action_id]
+            purged_count += 1
+            # Best effort: a failing stop() must not abort the purge.
+            try:
+                if hasattr(instance, 'stop'):
+                    instance.stop()
+            except Exception as e:
+                logging.error("Error stopping action %s: %s", action_id, e)
+        if purged_count > 0:
+            logging.info(
+                "Purged %d completed actions, %d actions remain in queue",
+                purged_count,
+                len(self.current_actions)
+            )
+
+    @log_errors(default_return={}, raise_exception=False)
+    def get_current_actions(self) -> dict:
+        """Return the actions still tracked after purging finished ones.
+
+        Returns:
+            dict: current_actions itself (not a copy) when it is non-empty,
+                otherwise a new empty dict.
+        """
+        # Always purge unwanted actions first
+        self.purge_unwanted()
+        if not self.current_actions:
+            logging.debug("No actions currently running")
+            return {}
+        logging.info(
+            "Currently running %d actions: %s",
+            len(self.current_actions),
+            list(self.current_actions.keys())
+        )
+        return self.current_actions
+
+    @log_errors(raise_exception=True)
+    def start_actions_manager(self) -> None:
+        """Run the polling loop forever.
+
+        Each iteration reads memory usage, starts new actions when usage is
+        below memory_threshold, cleans up Docker storage, and then sleeps.
+        The sleep grows as usage approaches the threshold and is capped at
+        120 seconds.
+        """
+        while True:
+            # Fallback delay so the sleep below always has a value, even when
+            # an error occurs before the real delay has been computed.
+            waiting_time = self.poll_interval
+            try:
+                mem_usage = get_mem_usage()
+                # %.2f rather than %d: the value is compared against a 0.9
+                # threshold, so integer formatting would truncate it.
+                logging.info("Memory usage: %.2f", mem_usage)
+                # Clamp the headroom so the division never hits zero or flips
+                # sign once usage reaches the threshold.
+                headroom = max(0.001, self.memory_threshold - mem_usage)
+                waiting_time = int(min(self.poll_interval / headroom, 120))
+                if mem_usage < self.memory_threshold:
+                    self.process_actions()
+                    logging.info(
+                        "Waiting for %d seconds before next poll",
+                        waiting_time,
+                    )
+                else:
+                    logging.info(
+                        "Memory threshold exceeded, waiting for %d seconds",
+                        waiting_time,
+                    )
+                cleanup_docker_storage()
+            except Exception as e:
+                logging.error("Error in actions manager: %s", e)
+            time.sleep(waiting_time)

matrice_compute/actions_scaledown_manager.py ADDED Viewed

@@ -0,0 +1,57 @@
+"""Module providing actions_scaledown_manager functionality."""
+import logging
+import docker
+from matrice_compute.scaling import (
+    Scaling,
+)
+from matrice_common.utils import log_errors
+class ActionsScaleDownManager:
+    """Stop running containers whose action id the backend lists as scaled down."""
+
+    def __init__(self, scaling: Scaling):
+        """Initialize the scale down manager.
+
+        Args:
+            scaling (Scaling): Scaling service instance
+        """
+        self.docker_client = docker.from_env()
+        self.scaling = scaling
+
+    @log_errors(raise_exception=False, log_error=True)
+    def auto_scaledown_actions(self) -> None:
+        """Stop every running container that belongs to a scaled-down action.
+
+        Performs a single pass; the caller is responsible for invoking it
+        repeatedly. A container is matched through the first 24-character
+        entry of its ``Args`` as reported by Docker inspect.
+        """
+        down_scaled_jobs, error, _ = self.scaling.get_downscaled_ids()
+        if error is not None:
+            logging.error(
+                "Error getting downscaled ids: %s",
+                error,
+            )
+            return
+        if not down_scaled_jobs:
+            # Nothing to stop, so skip the Docker queries entirely.
+            return
+        containers = self.docker_client.containers.list(
+            filters={"status": "running"},
+            all=True,
+        )
+        for container in containers:
+            container_id = container.id
+            # A container can disappear between list and inspect; skip it
+            # instead of aborting the sweep for the remaining containers.
+            try:
+                inspect_data = self.docker_client.api.inspect_container(container_id)
+            except docker.errors.APIError as err:
+                logging.error(
+                    "Failed to inspect container %s: %s",
+                    container_id,
+                    str(err),
+                )
+                continue
+            # NOTE(review): presumably the 24-character argument is the action
+            # record id passed on the container command line - confirm.
+            action_record_id = next(
+                (arg for arg in (inspect_data.get("Args") or []) if len(arg) == 24),
+                None,
+            )
+            if action_record_id is None or action_record_id not in down_scaled_jobs:
+                continue
+            try:
+                container.stop()
+                logging.info(
+                    "Container %s stopped.",
+                    container_id,
+                )
+            except docker.errors.APIError as err:
+                logging.error(
+                    "Failed to stop container %s: %s",
+                    container_id,
+                    str(err),
+                )

matrice_compute/instance_manager.py ADDED Viewed

@@ -0,0 +1,270 @@
+"""Module providing instance_manager functionality."""
+import json
+import logging
+import os
+import threading
+import time
+from matrice_compute.actions_manager import ActionsManager
+from matrice_compute.actions_scaledown_manager import ActionsScaleDownManager
+from matrice_compute.instance_utils import (
+    get_instance_info,
+    get_decrypted_access_key_pair,
+)
+from matrice_compute.resources_tracker import (
+    MachineResourcesTracker,
+    ActionsResourcesTracker,
+)
+from matrice_compute.scaling import Scaling
+from matrice_compute.shutdown_manager import ShutdownManager
+from matrice_common.session import Session
+from matrice_common.utils import log_errors
+class InstanceManager:
+    """Set up credentials and run the per-instance management loops.
+
+    Construction resolves the instance identity and Matrice credentials,
+    exports them as environment variables, and creates the scaling client,
+    the actions manager, the scale-down manager, the shutdown manager and
+    the two resource trackers. start() then runs the instance loop and the
+    actions loop on separate threads.
+    """
+
+    def __init__(
+        self,
+        matrice_access_key_id: str = "",
+        matrice_secret_access_key: str = "",
+        encryption_key: str = "",
+        instance_id: str = "",
+        service_provider: str = "",
+        env: str = "",
+        gpus: str = "",
+        workspace_dir: str = "matrice_workspace",
+    ):
+        """Initialize an instance manager.
+
+        Args:
+            matrice_access_key_id (str): Access key ID for Matrice authentication.
+                Defaults to empty string.
+            matrice_secret_access_key (str): Secret access key for Matrice
+                authentication. Defaults to empty string.
+            encryption_key (str): Key passed to the access key pair decryption.
+                Defaults to empty string.
+            instance_id (str): Unique identifier for this compute instance.
+                Defaults to empty string.
+            service_provider (str): Cloud service provider being used.
+                Defaults to empty string.
+            env (str): Environment name (e.g. dev, prod).
+                Defaults to empty string.
+            gpus (str): GPU configuration string (e.g. "0,1").
+                Defaults to empty string.
+            workspace_dir (str): Directory for workspace files.
+                Defaults to "matrice_workspace".
+        """
+        self.session = self._setup_env_credentials(
+            env,
+            service_provider,
+            instance_id,
+            encryption_key,
+            matrice_access_key_id,
+            matrice_secret_access_key,
+        )
+        os.environ["WORKSPACE_DIR"] = str(workspace_dir)
+        # Stored JSON-encoded, so a plain string is exported with quotes.
+        os.environ["GPUS"] = json.dumps(gpus)
+        self.scaling = Scaling(
+            self.session,
+            os.environ.get("INSTANCE_ID"),
+        )
+        logging.info("InstanceManager initialized with scaling")
+        jupyter_token = os.environ.get("JUPYTER_TOKEN")
+        if jupyter_token:
+            self.scaling.update_jupyter_token(jupyter_token)
+            logging.info("InstanceManager updated Jupyter token")
+        else:
+            logging.warning("No Jupyter token found in environment variables")
+        self.current_actions = {}
+        self.actions_manager = ActionsManager(self.scaling)
+        logging.info("InstanceManager initialized with actions manager")
+        self.scale_down_manager = ActionsScaleDownManager(self.scaling)
+        logging.info("InstanceManager initialized with scale down manager")
+        self.shutdown_manager = ShutdownManager(self.scaling)
+        logging.info("InstanceManager initialized with shutdown manager")
+        self.machine_resources_tracker = MachineResourcesTracker(self.scaling)
+        logging.info("InstanceManager initialized with machine resources tracker")
+        self.actions_resources_tracker = ActionsResourcesTracker(self.scaling)
+        logging.info("InstanceManager initialized with actions resources tracker")
+        self.poll_interval = 10
+        # NOTE(review): this discards the value _setup_env_credentials stored
+        # on the instance; the key stays available in os.environ. Confirm that
+        # clearing it here is intentional.
+        self.encryption_key = None
+        logging.info("InstanceManager initialized.")
+
+    @log_errors(default_return=None, raise_exception=True, log_error=True)
+    def _setup_env_credentials(
+        self,
+        env: str,
+        service_provider: str,
+        instance_id: str,
+        encryption_key: str,
+        matrice_access_key_id: str,
+        matrice_secret_access_key: str,
+    ) -> Session:
+        """Resolve identity and credentials, export them, and open a session.
+
+        Each value is taken from the argument first, then from the matching
+        environment variable; SERVICE_PROVIDER and INSTANCE_ID additionally
+        fall back to what get_instance_info() reports.
+
+        Args:
+            env (str): Environment name
+            service_provider (str): Cloud service provider
+            instance_id (str): Instance identifier
+            encryption_key (str): Encryption key
+            matrice_access_key_id (str): Matrice access key ID
+            matrice_secret_access_key (str): Matrice secret access key
+
+        Returns:
+            Session: Initialized session object
+
+        Raises:
+            Exception: If SERVICE_PROVIDER or INSTANCE_ID cannot be resolved,
+                or if the access key pair is missing or could not be decrypted.
+        """
+        try:
+            (
+                auto_service_provider,
+                auto_instance_id,
+            ) = get_instance_info(service_provider, instance_id)
+        except Exception as exc:
+            logging.error(
+                "Error getting instance info: %s",
+                str(exc),
+            )
+            auto_service_provider = ""
+            auto_instance_id = ""
+        manual_instance_info = {
+            "ENV": env or os.environ.get("ENV"),
+            "SERVICE_PROVIDER": service_provider
+            or os.environ.get("SERVICE_PROVIDER")
+            or auto_service_provider,
+            "INSTANCE_ID": instance_id
+            or os.environ.get("INSTANCE_ID")
+            or auto_instance_id,
+            "MATRICE_ENCRYPTION_KEY": encryption_key
+            or os.environ.get("MATRICE_ENCRYPTION_KEY"),
+            "MATRICE_ACCESS_KEY_ID": matrice_access_key_id
+            or os.environ.get("MATRICE_ACCESS_KEY_ID"),
+            "MATRICE_SECRET_ACCESS_KEY": matrice_secret_access_key
+            or os.environ.get("MATRICE_SECRET_ACCESS_KEY"),
+        }
+        for key, value in manual_instance_info.items():
+            # os.environ rejects None with a TypeError; leave unresolved
+            # values unset so the descriptive checks below can report them.
+            if value is not None:
+                os.environ[key] = value
+        if not (os.environ.get("SERVICE_PROVIDER") and os.environ.get("INSTANCE_ID")):
+            raise Exception(
+                "SERVICE_PROVIDER and INSTANCE_ID must be set as environment variables or passed as arguments"
+            )
+        self.encryption_key = manual_instance_info["MATRICE_ENCRYPTION_KEY"]
+        access_key = manual_instance_info["MATRICE_ACCESS_KEY_ID"]
+        secret_key = manual_instance_info["MATRICE_SECRET_ACCESS_KEY"]
+        # Decrypt only when an encryption key is available and neither key is
+        # 21 characters long.
+        # NOTE(review): presumably 21 is the length of a plain-text key, so any
+        # other length is treated as encrypted - confirm.
+        if (
+            self.encryption_key
+            and access_key
+            and secret_key
+            and len(access_key) != 21
+            and len(secret_key) != 21
+        ):
+            access_key, secret_key = self._decrypt_access_key_pair(
+                access_key,
+                secret_key,
+                self.encryption_key,
+            )
+        # None here means the pair was never provided or decryption returned
+        # its (None, None) default; fail with a clear message instead of the
+        # TypeError os.environ would raise.
+        if access_key is None or secret_key is None:
+            raise Exception(
+                "MATRICE_ACCESS_KEY_ID and MATRICE_SECRET_ACCESS_KEY must be set and decryptable"
+            )
+        os.environ["MATRICE_SECRET_ACCESS_KEY"] = secret_key
+        os.environ["MATRICE_ACCESS_KEY_ID"] = access_key
+        if self.encryption_key is not None:
+            os.environ["MATRICE_ENCRYPTION_KEY"] = self.encryption_key
+        return Session(
+            account_number="",
+            secret_key=secret_key,
+            access_key=access_key,
+        )
+
+    @log_errors(default_return=(None, None), raise_exception=False)
+    def _decrypt_access_key_pair(
+        self,
+        enc_access_key: str,
+        enc_secret_key: str,
+        encryption_key: str = "",
+    ) -> tuple:
+        """Decrypt the access key pair via get_decrypted_access_key_pair().
+
+        Args:
+            enc_access_key (str): Encrypted access key
+            enc_secret_key (str): Encrypted secret key
+            encryption_key (str): Key for decryption. Defaults to empty string.
+
+        Returns:
+            tuple: Decrypted (access_key, secret_key) pair. Presumably
+                (None, None) when log_errors swallows a failure (per its
+                default_return).
+        """
+        return get_decrypted_access_key_pair(
+            enc_access_key,
+            enc_secret_key,
+            encryption_key,
+        )
+
+    @log_errors(raise_exception=True, log_error=True)
+    def start_instance_manager(self) -> None:
+        """Run the instance manager loop forever.
+
+        Every poll_interval seconds this runs, in order: shutdown handling,
+        action scale-down, machine resource reporting and action resource
+        reporting. A failure in one step is logged and does not prevent the
+        following steps from running.
+        """
+        # Label/callable pairs; the label completes the "Error in ..." log line.
+        # Lambdas keep attribute lookup and argument evaluation inside the
+        # per-step try block below.
+        periodic_tasks = (
+            (
+                "shutdown_manager handle_shutdown",
+                lambda: self.shutdown_manager.handle_shutdown(
+                    bool(self.actions_manager.get_current_actions())
+                ),
+            ),
+            (
+                "scale_down_manager auto_scaledown_actions",
+                lambda: self.scale_down_manager.auto_scaledown_actions(),
+            ),
+            (
+                "machine_resources_tracker update_available_resources",
+                lambda: self.machine_resources_tracker.update_available_resources(),
+            ),
+            (
+                "actions_resources_tracker update_actions_resources",
+                lambda: self.actions_resources_tracker.update_actions_resources(),
+            ),
+        )
+        while True:
+            for label, task in periodic_tasks:
+                try:
+                    task()
+                except Exception as exc:
+                    logging.error(
+                        "Error in %s: %s",
+                        label,
+                        str(exc),
+                    )
+            time.sleep(self.poll_interval)
+
+    @log_errors(default_return=(None, None), raise_exception=True)
+    def start(self) -> tuple:
+        """Start the instance manager and actions manager threads.
+
+        Returns:
+            tuple: (instance_manager_thread, actions_manager_thread), both
+                already started.
+        """
+        instance_manager_thread = threading.Thread(
+            target=self.start_instance_manager,
+            name="InstanceManager",
+        )
+        instance_manager_thread.start()
+        actions_manager_thread = threading.Thread(
+            target=self.actions_manager.start_actions_manager,
+            name="ActionsManager",
+        )
+        actions_manager_thread.start()
+        return (
+            instance_manager_thread,
+            actions_manager_thread,
+        )