matrice-compute 0.1.33__py3-none-any.whl → 0.1.34__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- matrice_compute/action_instance.py +45 -3
- matrice_compute/resources_tracker.py +730 -87
- {matrice_compute-0.1.33.dist-info → matrice_compute-0.1.34.dist-info}/METADATA +1 -1
- {matrice_compute-0.1.33.dist-info → matrice_compute-0.1.34.dist-info}/RECORD +7 -7
- {matrice_compute-0.1.33.dist-info → matrice_compute-0.1.34.dist-info}/WHEEL +0 -0
- {matrice_compute-0.1.33.dist-info → matrice_compute-0.1.34.dist-info}/licenses/LICENSE.txt +0 -0
- {matrice_compute-0.1.33.dist-info → matrice_compute-0.1.34.dist-info}/top_level.txt +0 -0
matrice_compute/action_instance.py
@@ -83,7 +83,8 @@ class ActionInstance:
             "inference_ws_server": inference_ws_server_execute,
             "fe_analytics_service": fe_analytics_service_execute,
             "lpr_setup": lpr_setup_execute,
-            "inference_tracker_server": inference_tracker_setup_execute
+            "inference_tracker_server": inference_tracker_setup_execute,
+            "video_storage_setup" : video_storage_setup_execute
         }
         if self.action_type not in self.actions_map:
             raise ValueError(f"Unknown action type: {self.action_type}")
@@ -309,7 +310,7 @@ class ActionInstance:
         )

     @log_errors(default_return=None, raise_exception=False, log_error=False)
-    def
+    def get_action_details(self):
         """Get action details from scaling service.

         Returns:
@@ -2196,4 +2197,45 @@ def inference_tracker_setup_execute(self: ActionInstance):
         f"{image}"
     )

-    self.start(worker_cmd, "inference_tracker_setup")
+    self.start(worker_cmd, "inference_tracker_setup")
+
+@log_errors(raise_exception=False)
+def video_storage_setup_execute(self: ActionInstance):
+
+    """
+    Creates and start Video Storage
+    Video Stroage runs on port 8106 (localhost only with --net=host).
+    """
+
+    action_details = self.get_action_details()
+    if not action_details:
+        return
+
+    image = self.docker_container
+
+    self.setup_action_requirements(action_details)
+
+    if action_details["actionDetails"].get("containerId"):
+        logging.info(
+            "Using existing container ID for inference tracker: %s",
+            action_details["actionDetails"]["containerId"],
+        )
+        self.docker_container = action_details["actionDetails"]["containerId"]
+        cmd = "docker restart " + self.docker_container
+        self.start(cmd, "video_storage_setup_execute")
+        return
+
+    # This is the existing Docker run command
+    worker_cmd = (
+        f"docker run -d --pull=always --net=host "
+        f"--cidfile ./{self.action_record_id}.cid "
+        f"--name media_server "
+        f"-v matrice_myvol:/matrice_data "
+        f'-e ENV="{os.environ.get("ENV", "prod")}" '
+        f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
+        f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
+        f'-e ACTION_ID="{self.action_record_id}" '
+        f"{image}"
+    )
+
+    self.start(worker_cmd, "video_storage_setup_execute")
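For orientation, the new worker exposes its media server on localhost port 8106 (per the docstring above) because the container shares the host network. A minimal, hypothetical readiness probe against that port could look like the sketch below; the helper name, host, and timeout are illustrative and not part of the package.

```python
# Hypothetical readiness check for the video-storage container started above.
# Assumes only what the docstring states: the service listens on localhost:8106.
import socket
import time


def wait_for_video_storage(host: str = "127.0.0.1", port: int = 8106, timeout_s: float = 60.0) -> bool:
    """Poll until the TCP port accepts connections or the timeout expires."""
    deadline = time.monotonic() + timeout_s
    while time.monotonic() < deadline:
        try:
            with socket.create_connection((host, port), timeout=2):
                return True  # port is open, the media server is reachable
        except OSError:
            time.sleep(1)  # not up yet, retry until the deadline
    return False
```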
matrice_compute/resources_tracker.py
@@ -21,7 +21,19 @@ from matrice_common.utils import log_errors


 class ResourcesTracker:
-    """Tracks machine and container resources.
+    """Tracks machine and container resources.
+
+    GPU Utilization Note:
+    GPU utilization is tracked at the DEVICE level, not per-container.
+    NVIDIA does not expose reliable per-process GPU utilization.
+    Per-container GPU MEMORY is accurate; per-container GPU UTILIZATION is best-effort.
+    """
+
+    # Cache for nvidia-smi output to reduce subprocess overhead
+    _gpu_cache: Dict = {}
+    _gpu_cache_timestamp: float = 0
+    _gpu_cache_ttl: float = 1.0  # Cache TTL in seconds
+    _gpu_cache_lock = threading.Lock()

     def __init__(self) -> None:
         """
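The new class attributes set up a shared, lock-protected TTL cache for nvidia-smi output. A stripped-down sketch of the same pattern, with illustrative names rather than the package's, assuming the one-second TTL shown above:

```python
import threading
import time
from typing import Callable, Dict


class SnapshotCache:
    """Share one expensive snapshot across callers for `_ttl` seconds."""
    _data: Dict = {}
    _timestamp: float = 0.0
    _ttl: float = 1.0  # seconds, mirrors _gpu_cache_ttl above
    _lock = threading.Lock()

    @classmethod
    def get(cls, refresh: Callable[[], Dict]) -> Dict:
        now = time.time()
        with cls._lock:
            if cls._data and now - cls._timestamp < cls._ttl:
                return cls._data   # fresh enough: reuse the cached snapshot
            cls._data = refresh()  # stale or empty: rebuild once, under the lock
            cls._timestamp = now
            return cls._data
```

`_get_cached_gpu_data()`, added further down in this diff, follows the same shape with the refresh step replaced by two nvidia-smi invocations.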
@@ -38,11 +50,11 @@ class ResourcesTracker:
             container (docker.models.containers.Container): Docker container instance.

         Returns:
-            Tuple[float, float]: CPU utilization percentage and memory
+            Tuple[float, float]: CPU utilization percentage (0-100 per core used) and memory usage in MB.
         """
         stats = container.stats(stream=False)
         if stats:
-            cpu_utilization = 0
+            cpu_utilization = 0.0
             cpu_delta = (
                 stats["cpu_stats"]["cpu_usage"]["total_usage"]
                 - stats["precpu_stats"]["cpu_usage"]["total_usage"]
@@ -50,14 +62,92 @@ class ResourcesTracker:
             system_delta = stats["cpu_stats"].get("system_cpu_usage", 0) - stats[
                 "precpu_stats"
             ].get("system_cpu_usage", 0)
+
             if system_delta > 0:
-
-
-
-
-
+                # FIX: Multiply by online_cpus to get correct percentage
+                # Docker formula: (cpu_delta / system_delta) * online_cpus * 100
+                online_cpus = stats["cpu_stats"].get("online_cpus")
+                if not online_cpus:
+                    # Fallback: count from percpu_usage or use system CPU count
+                    percpu = stats["cpu_stats"]["cpu_usage"].get("percpu_usage", [])
+                    online_cpus = len(percpu) if percpu else psutil.cpu_count()
+                cpu_utilization = (cpu_delta / system_delta) * online_cpus * 100.0
+
+            # Return memory in MB (consistent units) instead of percentage
+            memory_usage_bytes = stats["memory_stats"].get("usage", 0)
+            # Subtract cache if available for more accurate "real" memory
+            cache_bytes = stats["memory_stats"].get("stats", {}).get("cache", 0)
+            memory_usage_mb = (memory_usage_bytes - cache_bytes) / (1024 * 1024)
+
+            return cpu_utilization, max(0, memory_usage_mb)
         return 0, 0

+    @staticmethod
+    def _parse_memory_string(memory_str: str) -> float:
+        """
+        Parse Docker memory string to MB.
+
+        Handles: "1.5GiB", "512MiB", "1024KiB", "1.5GB", "512MB", "1024KB", "1024B"
+
+        Args:
+            memory_str: Memory string from docker stats
+
+        Returns:
+            float: Memory in MB
+        """
+        import re
+        memory_str = memory_str.strip()
+
+        # Match number (with optional decimal) and unit
+        match = re.match(r'^([\d.]+)\s*([A-Za-z]+)$', memory_str)
+        if not match:
+            # Try splitting by space
+            parts = memory_str.split()
+            if len(parts) >= 2:
+                value_str, unit = parts[0], parts[1]
+            else:
+                # Last resort: assume it's bytes
+                try:
+                    return float(memory_str) / (1024 * 1024)
+                except ValueError:
+                    return 0.0
+        else:
+            value_str, unit = match.groups()
+
+        try:
+            value = float(value_str)
+        except ValueError:
+            return 0.0
+
+        # Normalize unit to lowercase for comparison
+        unit = unit.lower()
+
+        # Binary units (IEC)
+        if unit in ('kib', 'ki'):
+            return value / 1024
+        elif unit in ('mib', 'mi'):
+            return value
+        elif unit in ('gib', 'gi'):
+            return value * 1024
+        elif unit in ('tib', 'ti'):
+            return value * 1024 * 1024
+        # Decimal units (SI)
+        elif unit in ('kb', 'k'):
+            return value / 1000
+        elif unit in ('mb', 'm'):
+            return value
+        elif unit in ('gb', 'g'):
+            return value * 1000
+        elif unit in ('tb', 't'):
+            return value * 1000 * 1000
+        # Bytes
+        elif unit in ('b', 'bytes'):
+            return value / (1024 * 1024)
+        else:
+            # Unknown unit, assume MB
+            logging.debug("Unknown memory unit '%s', assuming MB", unit)
+            return value
+
     @log_errors(default_return=(0, 0), raise_exception=False, log_error=False)
     def get_container_cpu_and_memory_with_container_id(self, container_id: str) -> Tuple[float, float]:
         """
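As a quick sanity check on the two changes above, the sketch below works through the corrected Docker CPU formula and the conversions `_parse_memory_string` is written to handle; all numbers are invented for illustration.

```python
# Docker CPU formula: (cpu_delta / system_delta) * online_cpus * 100
cpu_delta = 2_000_000_000      # ns of container CPU time since the previous sample
system_delta = 8_000_000_000   # ns of total system CPU time over the same window
online_cpus = 4
cpu_utilization = (cpu_delta / system_delta) * online_cpus * 100.0
print(cpu_utilization)  # 100.0 -> the container kept roughly one of four cores busy

# Expected conversions from the memory-string parser (binary vs decimal units):
#   "1.5GiB" -> 1.5 * 1024 = 1536.0   (IEC branch, returned as MB)
#   "512MiB" -> 512.0
#   "1.5GB"  -> 1.5 * 1000 = 1500.0   (SI branch)
#   "1024B"  -> 1024 / (1024 * 1024) ≈ 0.00098
```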
@@ -70,13 +160,14 @@ class ResourcesTracker:
             Tuple[float, float]: CPU utilization percentage and memory usage in MB.
         """
         try:
+            # Use JSON format for more reliable parsing
             stats_result = subprocess.run(
                 [
                     "docker",
                     "stats",
                     "--no-stream",
                     "--format",
-                    "
+                    '{"cpu":"{{.CPUPerc}}","mem":"{{.MemUsage}}"}',
                     container_id,
                 ],
                 capture_output=True,
@@ -87,19 +178,24 @@ class ResourcesTracker:
             if stats_result.returncode != 0:
                 logging.debug("docker stats command failed for container %s", container_id)
                 return 0, 0
-
-
-
-
-
-
-
-
-
-
-
-
+
+            # Parse JSON output
+            stats_json = json.loads(stats_result.stdout.strip())
+
+            # Parse CPU (remove % sign)
+            cpu_str = stats_json.get("cpu", "0%").replace("%", "").strip()
+            cpu_usage = float(cpu_str) if cpu_str else 0.0
+
+            # Parse memory (format: "used / limit")
+            mem_str = stats_json.get("mem", "0B / 0B")
+            mem_used = mem_str.split("/")[0].strip()
+            memory_usage_mb = self._parse_memory_string(mem_used)
+
             return cpu_usage, memory_usage_mb
+
+        except json.JSONDecodeError as e:
+            logging.debug("JSON parse error for container %s: %s", container_id, e)
+            return 0, 0
         except subprocess.TimeoutExpired:
             logging.debug("docker stats command timed out for container %s", container_id)
             return 0, 0
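To make the new parsing path concrete: with the Go-template format string above, `docker stats --no-stream` prints one JSON object per container line. A small sketch of what the code expects to see and how it is taken apart (the sample values are invented):

```python
import json

# One output line produced by:
#   docker stats --no-stream --format '{"cpu":"{{.CPUPerc}}","mem":"{{.MemUsage}}"}' <container>
sample = '{"cpu":"12.34%","mem":"512MiB / 7.5GiB"}'

stats = json.loads(sample)
cpu = float(stats["cpu"].replace("%", "").strip())  # 12.34
mem_used = stats["mem"].split("/")[0].strip()        # "512MiB"
# _parse_memory_string(mem_used) from the previous hunk would return 512.0 (MB)
print(cpu, mem_used)
```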
@@ -110,20 +206,395 @@ class ResourcesTracker:
             logging.debug("Unexpected error getting container stats for %s: %s", container_id, e)
             return 0, 0

+    def _get_cached_gpu_data(self) -> Dict:
+        """
+        Get cached GPU data from nvidia-smi to reduce subprocess overhead.
+
+        Returns:
+            Dict: Cached GPU data with keys:
+                - 'processes': List of {pid, gpu_idx, memory_mb}
+                - 'gpus': List of {idx, utilization, memory_used, memory_total}
+                - 'timestamp': When cache was populated
+        """
+        import time as time_module
+        current_time = time_module.time()
+
+        with ResourcesTracker._gpu_cache_lock:
+            # Return cache if still valid
+            if (ResourcesTracker._gpu_cache and
+                    current_time - ResourcesTracker._gpu_cache_timestamp < ResourcesTracker._gpu_cache_ttl):
+                return ResourcesTracker._gpu_cache
+
+            # Refresh cache
+            cache = {
+                'processes': [],
+                'gpus': [],
+                'timestamp': current_time,
+            }
+
+            if not has_gpu():
+                ResourcesTracker._gpu_cache = cache
+                ResourcesTracker._gpu_cache_timestamp = current_time
+                return cache
+
+            try:
+                # Single nvidia-smi call for all GPU info
+                result = subprocess.run(
+                    [
+                        "nvidia-smi",
+                        "--query-gpu=index,utilization.gpu,memory.used,memory.total",
+                        "--format=csv,noheader,nounits"
+                    ],
+                    capture_output=True,
+                    text=True,
+                    timeout=10,
+                )
+                if result.returncode == 0:
+                    for line in result.stdout.strip().split("\n"):
+                        if not line.strip():
+                            continue
+                        parts = [p.strip() for p in line.split(",")]
+                        if len(parts) >= 4:
+                            cache['gpus'].append({
+                                'idx': int(parts[0]) if parts[0].isdigit() else 0,
+                                'utilization': float(parts[1]) if parts[1].replace('.', '').isdigit() else 0,
+                                'memory_used': int(parts[2]) if parts[2].isdigit() else 0,
+                                'memory_total': int(parts[3]) if parts[3].isdigit() else 0,
+                            })
+
+                # Single nvidia-smi call for all processes
+                result = subprocess.run(
+                    [
+                        "nvidia-smi",
+                        "--query-compute-apps=pid,gpu_uuid,used_memory",
+                        "--format=csv,noheader,nounits"
+                    ],
+                    capture_output=True,
+                    text=True,
+                    timeout=10,
+                )
+                if result.returncode == 0:
+                    for line in result.stdout.strip().split("\n"):
+                        if not line.strip():
+                            continue
+                        parts = [p.strip() for p in line.split(",")]
+                        if len(parts) >= 3:
+                            cache['processes'].append({
+                                'pid': parts[0],
+                                'gpu_uuid': parts[1],
+                                'memory_mb': int(parts[2]) if parts[2].isdigit() else 0,
+                            })
+
+            except subprocess.TimeoutExpired:
+                logging.debug("nvidia-smi cache refresh timed out")
+            except Exception as e:
+                logging.debug("Error refreshing GPU cache: %s", e)
+
+            ResourcesTracker._gpu_cache = cache
+            ResourcesTracker._gpu_cache_timestamp = current_time
+            return cache
+
     @log_errors(default_return=(0, 0), raise_exception=False, log_error=False)
     def get_container_gpu_info(self, container_id: str) -> Tuple[float, int]:
         """
         Get GPU usage for a specific container.

+        IMPORTANT: GPU utilization tracking limitations:
+        - GPU MEMORY per container is ACCURATE (from nvidia-smi per-process data)
+        - GPU UTILIZATION per container is BEST-EFFORT (NVIDIA doesn't expose per-process SM usage)
+
+        For GPU utilization, we report the utilization of GPUs that have container processes.
+        If multiple containers share a GPU, they will all report similar utilization.
+
+        Args:
+            container_id (str): ID of the Docker container.
+
+        Returns:
+            Tuple[float, int]:
+                - GPU utilization percentage (device-level, for GPUs used by container)
+                - GPU memory usage in MB (accurate per-container)
+        """
+        # Get ALL PIDs belonging to this container (not just main PID)
+        container_pids = self.get_all_container_pids(container_id)
+        if not container_pids:
+            # Fallback to main PID only
+            main_pid = self.get_pid_id_by_container_id(container_id)
+            if main_pid:
+                container_pids = {main_pid}
+            else:
+                return 0, 0
+
+        # Check if this is a Jetson device
+        if self._is_jetson_device():
+            return self._get_jetson_gpu_usage(container_pids)
+
+        # Use cached GPU data for efficiency
+        gpu_data = self._get_cached_gpu_data()
+
+        # Find GPU memory used by container (ACCURATE)
+        gpu_mem_used = 0
+        container_gpu_uuids = set()
+
+        for proc in gpu_data.get('processes', []):
+            if proc['pid'] in container_pids:
+                gpu_mem_used += proc['memory_mb']
+                container_gpu_uuids.add(proc['gpu_uuid'])
+
+        # Get utilization of GPUs used by container (DEVICE-LEVEL approximation)
+        # NOTE: This is NOT per-container utilization - it's the utilization of shared GPUs
+        gpu_util = 0.0
+        if container_gpu_uuids:
+            # If we have GPU UUIDs, get their utilization
+            # For now, just use overall utilization as approximation
+            total_util = sum(g['utilization'] for g in gpu_data.get('gpus', []))
+            gpu_count = len(gpu_data.get('gpus', [])) or 1
+            gpu_util = total_util / gpu_count
+
+        return gpu_util, gpu_mem_used
+
+    @log_errors(default_return=False, raise_exception=False, log_error=False)
+    def _is_jetson_device(self) -> bool:
+        """
+        Check if the current device is an NVIDIA Jetson.
+
+        Returns:
+            bool: True if Jetson device, False otherwise.
+        """
+        # Check for Jetson-specific indicators
+        try:
+            # Method 1: Check /etc/nv_tegra_release (Jetson specific)
+            if os.path.exists("/etc/nv_tegra_release"):
+                return True
+
+            # Method 2: Check for tegra in /proc/device-tree/compatible
+            if os.path.exists("/proc/device-tree/compatible"):
+                with open("/proc/device-tree/compatible", "r") as f:
+                    content = f.read().lower()
+                    if "tegra" in content or "jetson" in content:
+                        return True
+
+            # Method 3: Check if tegrastats exists
+            result = subprocess.run(
+                ["which", "tegrastats"],
+                capture_output=True,
+                text=True,
+                timeout=5,
+            )
+            if result.returncode == 0:
+                return True
+
+        except Exception as e:
+            logging.debug("Error checking for Jetson device: %s", e)
+
+        return False
+
+    @log_errors(default_return=set(), raise_exception=False, log_error=False)
+    def get_all_container_pids(self, container_id: str) -> set:
+        """
+        Get ALL PIDs belonging to a container (including child processes).
+
+        Uses multiple methods for robustness:
+        1. docker top (most reliable for standard Docker)
+        2. Docker API inspect + process tree enumeration
+        3. cgroup procs files (v1 and v2)
+
+        Known limitations:
+        - May miss processes in rootless Docker
+        - CRI-O/containerd may have different layouts
+
         Args:
             container_id (str): ID of the Docker container.

+        Returns:
+            set: Set of all PIDs (as strings) belonging to the container.
+        """
+        pids = set()
+
+        # Method 1: Use docker top (most reliable)
+        try:
+            result = subprocess.run(
+                ["docker", "top", container_id, "-o", "pid"],
+                capture_output=True,
+                text=True,
+                timeout=10,
+            )
+            if result.returncode == 0:
+                lines = result.stdout.strip().split("\n")
+                for line in lines[1:]:  # Skip header
+                    pid = line.strip()
+                    if pid.isdigit():
+                        pids.add(pid)
+        except subprocess.TimeoutExpired:
+            logging.debug("docker top command timed out for container %s", container_id)
+        except Exception as e:
+            logging.debug("docker top failed for %s: %s", container_id, e)
+
+        # Method 2: Get init PID from docker inspect and enumerate children
+        if not pids:
+            try:
+                result = subprocess.run(
+                    ["docker", "inspect", "--format", "{{.State.Pid}}", container_id],
+                    capture_output=True,
+                    text=True,
+                    timeout=10,
+                )
+                if result.returncode == 0:
+                    init_pid = result.stdout.strip()
+                    if init_pid and init_pid.isdigit() and init_pid != "0":
+                        pids.add(init_pid)
+                        # Enumerate all child processes recursively
+                        pids.update(self._get_child_pids(init_pid))
+            except Exception as e:
+                logging.debug("docker inspect failed for %s: %s", container_id, e)
+
+        # Method 3: Check cgroup procs files (fallback)
+        cgroup_paths = [
+            # cgroup v2 paths
+            f"/sys/fs/cgroup/system.slice/docker-{container_id}.scope/cgroup.procs",
+            f"/sys/fs/cgroup/docker/{container_id}/cgroup.procs",
+            # cgroup v1 paths
+            f"/sys/fs/cgroup/pids/docker/{container_id}/cgroup.procs",
+            f"/sys/fs/cgroup/cpu/docker/{container_id}/cgroup.procs",
+            f"/sys/fs/cgroup/memory/docker/{container_id}/cgroup.procs",
+        ]
+
+        for cgroup_path in cgroup_paths:
+            try:
+                if os.path.exists(cgroup_path):
+                    with open(cgroup_path, "r") as f:
+                        for line in f:
+                            pid = line.strip()
+                            if pid.isdigit():
+                                pids.add(pid)
+                    break
+            except Exception as e:
+                logging.debug("Error reading cgroup %s: %s", cgroup_path, e)
+
+        return pids
+
+    @log_errors(default_return=set(), raise_exception=False, log_error=False)
+    def _get_child_pids(self, parent_pid: str, visited: set = None) -> set:
+        """
+        Recursively get all child PIDs of a process.
+
+        Args:
+            parent_pid (str): Parent PID to get children for.
+            visited (set): Set of already visited PIDs to prevent cycles.
+
+        Returns:
+            set: Set of all child PIDs (as strings).
+        """
+        if visited is None:
+            visited = set()
+
+        if parent_pid in visited:
+            return set()
+        visited.add(parent_pid)
+
+        children = set()
+        children_path = f"/proc/{parent_pid}/task/{parent_pid}/children"
+
+        try:
+            if os.path.exists(children_path):
+                with open(children_path, "r") as f:
+                    child_pids = f.read().strip().split()
+                    for child_pid in child_pids:
+                        if child_pid.isdigit():
+                            children.add(child_pid)
+                            # Recursively get grandchildren
+                            children.update(self._get_child_pids(child_pid, visited))
+        except Exception as e:
+            logging.debug("Error getting children of PID %s: %s", parent_pid, e)
+
+        return children
+
+    @log_errors(default_return=(0, 0), raise_exception=False, log_error=False)
+    def _get_jetson_gpu_usage(self, container_pids: set) -> Tuple[float, int]:
+        """
+        Get GPU usage for Jetson devices.
+
+        Args:
+            container_pids (set): Set of container PIDs.
+
         Returns:
             Tuple[float, int]: GPU utilization percentage and GPU memory usage in MB.
         """
-
-
-
+        gpu_util = 0.0
+        gpu_mem_used = 0
+
+        try:
+            # Method 1: Try using tegrastats (one-shot)
+            result = subprocess.run(
+                ["tegrastats", "--interval", "100", "--stop", "1"],
+                capture_output=True,
+                text=True,
+                timeout=5,
+            )
+
+            if result.returncode == 0 and result.stdout:
+                output = result.stdout.strip()
+                # Parse tegrastats output - format varies by Jetson model
+                # Example: "RAM 2457/7773MB (lfb 1x512kB) CPU [...] GR3D_FREQ 0% ..."
+
+                # Extract GR3D (GPU) utilization
+                import re
+                gr3d_match = re.search(r'GR3D_FREQ\s+(\d+)%', output)
+                if gr3d_match:
+                    gpu_util = float(gr3d_match.group(1))
+
+                # For Jetson, GPU memory is shared with system RAM
+                # We can estimate based on total GPU memory allocation
+                # Try to get from /sys/kernel/debug/nvmap or similar
+
+        except subprocess.TimeoutExpired:
+            logging.debug("tegrastats timed out")
+        except FileNotFoundError:
+            logging.debug("tegrastats not found, trying alternative methods")
+        except Exception as e:
+            logging.debug("Error running tegrastats: %s", e)
+
+        # Method 2: Try jtop Python library info from /sys
+        if gpu_util == 0:
+            try:
+                # Read GPU frequency/utilization from sysfs
+                gpu_load_paths = [
+                    "/sys/devices/gpu.0/load",
+                    "/sys/devices/platform/host1x/gpu.0/load",
+                    "/sys/devices/57000000.gpu/load",
+                    "/sys/devices/17000000.ga10b/load",  # Orin
+                ]
+
+                for path in gpu_load_paths:
+                    if os.path.exists(path):
+                        with open(path, "r") as f:
+                            # Load is reported as 0-1000, convert to percentage
+                            load_val = int(f.read().strip())
+                            gpu_util = load_val / 10.0
+                        break
+
+            except Exception as e:
+                logging.debug("Error reading Jetson GPU load from sysfs: %s", e)
+
+        # Method 3: Get GPU memory from /proc for container processes
+        if container_pids:
+            try:
+                # On Jetson, GPU memory is unified with system RAM
+                # Check /proc/[pid]/smaps for GPU-related mappings
+                for pid in container_pids:
+                    smaps_path = f"/proc/{pid}/smaps"
+                    if os.path.exists(smaps_path):
+                        with open(smaps_path, "r") as f:
+                            content = f.read()
+                            # Look for nvmap or GPU memory regions
+                            for line in content.split("\n"):
+                                if "nvmap" in line.lower() or "gpu" in line.lower():
+                                    # Extract size if present
+                                    if "Size:" in line:
+                                        size_kb = int(line.split()[1])
+                                        gpu_mem_used += size_kb // 1024  # Convert to MB
+            except Exception as e:
+                logging.debug("Error getting Jetson GPU memory: %s", e)
+
         return gpu_util, gpu_mem_used

     @log_errors(default_return="", raise_exception=False, log_error=False)
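The PID-collection fallback above walks the kernel's `children` files under /proc. The same walk as a standalone sketch (function name and error handling are illustrative, not the package's):

```python
from typing import Optional, Set


def descendant_pids(parent: str, visited: Optional[Set[str]] = None) -> Set[str]:
    """Recursively collect descendants of `parent` via /proc/<pid>/task/<pid>/children."""
    visited = visited if visited is not None else set()
    if parent in visited:
        return set()
    visited.add(parent)
    found: Set[str] = set()
    path = f"/proc/{parent}/task/{parent}/children"
    try:
        with open(path) as f:
            for pid in f.read().split():
                if pid.isdigit():
                    found.add(pid)
                    found |= descendant_pids(pid, visited)  # grandchildren and deeper
    except OSError:
        pass  # the process exited, or /proc is not available on this platform
    return found

# e.g. descendant_pids("1234") -> {"1240", "1241", ...} on a Linux host
```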
@@ -174,30 +645,97 @@ class ResourcesTracker:
         Returns:
             float: GPU utilization percentage.
         """
+        return self.get_container_gpu_usage_multi_pid({str(container_pid)})
+
+    @log_errors(default_return=0, raise_exception=False, log_error=False)
+    def get_container_gpu_usage_multi_pid(self, container_pids: set) -> float:
+        """
+        Get GPU usage for multiple container PIDs.
+
+        Args:
+            container_pids (set): Set of container PIDs (as strings).
+
+        Returns:
+            float: Total GPU utilization percentage across all matching processes.
+        """
         if not has_gpu():
             return 0
-
+        if not container_pids:
+            return 0
+
+        gpu_util = 0.0
+
         try:
+            # Method 1: nvidia-smi pmon (process monitoring)
             result = subprocess.run(
-                ["nvidia-smi", "pmon", "-c", "1"],
+                ["nvidia-smi", "pmon", "-c", "1", "-s", "u"],
                 capture_output=True,
                 text=True,
                 check=False,
-                timeout=
+                timeout=10,
             )
-            if result.returncode
-
-
-
-
-
-
-
-
-
-
+            if result.returncode == 0:
+                pmon_output = result.stdout.strip().split("\n")
+                for line in pmon_output:
+                    # Skip header lines (start with # or contain column names)
+                    if line.startswith("#") or "gpu" in line.lower() and "pid" in line.lower():
+                        continue
+                    parts = line.split()
+                    if len(parts) >= 4:
+                        pid = parts[1]
+                        sm_usage = parts[3] if len(parts) > 3 else "0"
+                        if pid in container_pids:
+                            if sm_usage != "-" and sm_usage.replace(".", "").isdigit():
+                                gpu_util += float(sm_usage)
+
+            if gpu_util > 0:
+                return gpu_util
+
+            # Method 2: Query per-process GPU utilization
+            result = subprocess.run(
+                ["nvidia-smi", "--query-compute-apps=pid,gpu_uuid", "--format=csv,noheader,nounits"],
+                capture_output=True,
+                text=True,
+                check=False,
+                timeout=10,
+            )
+            if result.returncode == 0:
+                # Get overall GPU utilization per GPU
+                gpu_utils = {}
+                util_result = subprocess.run(
+                    ["nvidia-smi", "--query-gpu=uuid,utilization.gpu", "--format=csv,noheader,nounits"],
+                    capture_output=True,
+                    text=True,
+                    check=False,
+                    timeout=10,
+                )
+                if util_result.returncode == 0:
+                    for line in util_result.stdout.strip().split("\n"):
+                        parts = line.split(",")
+                        if len(parts) >= 2:
+                            gpu_uuid = parts[0].strip()
+                            util = float(parts[1].strip()) if parts[1].strip().replace(".", "").isdigit() else 0
+                            gpu_utils[gpu_uuid] = util
+
+                # Check which GPUs have our container processes
+                matched_gpus = set()
+                for line in result.stdout.strip().split("\n"):
+                    if not line.strip():
+                        continue
+                    parts = line.split(",")
+                    if len(parts) >= 2:
+                        pid = parts[0].strip()
+                        gpu_uuid = parts[1].strip()
+                        if pid in container_pids:
+                            matched_gpus.add(gpu_uuid)
+
+                # Sum utilization for matched GPUs
+                for gpu_uuid in matched_gpus:
+                    if gpu_uuid in gpu_utils:
+                        gpu_util += gpu_utils[gpu_uuid]
+
         except subprocess.TimeoutExpired:
-            logging.debug("nvidia-smi
+            logging.debug("nvidia-smi command timed out in get_container_gpu_usage_multi_pid")
             return 0
         except (ValueError, IndexError) as e:
             logging.debug("Error parsing GPU usage info: %s", e)
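For reference, `nvidia-smi pmon -c 1 -s u` prints a short header plus one row per GPU process; the loop above keys on the second column (pid) and fourth column (sm%). A hedged parsing sketch against an invented sample (exact columns can vary by driver version, which is why the code treats `-` as missing data):

```python
sample = """\
# gpu        pid  type    sm   mem   enc   dec   command
# Idx          #   C/G     %     %     %     %   name
    0      23131     C    37    12     -     -   python
    0      24022     C     -     -     -     -   python"""

container_pids = {"23131"}
gpu_util = 0.0
for line in sample.splitlines():
    if line.lstrip().startswith("#"):
        continue  # skip the two header lines
    parts = line.split()
    if len(parts) >= 4 and parts[1] in container_pids:
        sm = parts[3]
        if sm != "-" and sm.replace(".", "").isdigit():
            gpu_util += float(sm)
print(gpu_util)  # 37.0
```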
@@ -206,8 +744,9 @@ class ResourcesTracker:
             logging.debug("nvidia-smi not found on this system")
             return 0
         except Exception as e:
-            logging.debug("Unexpected error in
+            logging.debug("Unexpected error in get_container_gpu_usage_multi_pid: %s", e)
             return 0
+
         return gpu_util

     @log_errors(default_return=0, raise_exception=False, log_error=False)
@@ -221,34 +760,85 @@ class ResourcesTracker:
         Returns:
             int: GPU memory usage in MB.
         """
+        return self.get_container_gpu_memory_usage_multi_pid({str(container_pid)})
+
+    @log_errors(default_return=0, raise_exception=False, log_error=False)
+    def get_container_gpu_memory_usage_multi_pid(self, container_pids: set) -> int:
+        """
+        Get GPU memory usage for multiple container PIDs.
+
+        Args:
+            container_pids (set): Set of container PIDs (as strings).
+
+        Returns:
+            int: Total GPU memory usage in MB across all matching processes.
+        """
         if not has_gpu():
             return 0
-
-
-
-                "--format=csv,noheader,nounits",
-            ]
+        if not container_pids:
+            return 0
+
         total_memory = 0
+
         try:
+            # Method 1: Query compute apps for memory usage
+            cmd = [
+                "nvidia-smi",
+                "--query-compute-apps=pid,used_memory",
+                "--format=csv,noheader,nounits",
+            ]
             result = subprocess.run(
                 cmd,
                 stdout=subprocess.PIPE,
                 stderr=subprocess.PIPE,
                 text=True,
                 check=False,
-                timeout=
+                timeout=10,
             )
-            if result.returncode
-
-
-
-
-
-
-
-
+            if result.returncode == 0:
+                for line in result.stdout.splitlines():
+                    line = line.strip()
+                    if not line:
+                        continue
+                    # Handle both ", " and "," separators
+                    if ", " in line:
+                        parts = line.split(", ")
+                    else:
+                        parts = line.split(",")
+                    if len(parts) >= 2:
+                        process_pid = parts[0].strip()
+                        used_memory = parts[1].strip()
+                        if process_pid in container_pids:
+                            if used_memory.isdigit():
+                                total_memory += int(used_memory)
+
+            if total_memory > 0:
+                return total_memory
+
+            # Method 2: Use pmon for memory info
+            result = subprocess.run(
+                ["nvidia-smi", "pmon", "-c", "1", "-s", "m"],
+                capture_output=True,
+                text=True,
+                check=False,
+                timeout=10,
+            )
+            if result.returncode == 0:
+                pmon_output = result.stdout.strip().split("\n")
+                for line in pmon_output:
+                    if line.startswith("#") or "gpu" in line.lower() and "pid" in line.lower():
+                        continue
+                    parts = line.split()
+                    # Format: gpu pid type fb_mem (MB)
+                    if len(parts) >= 4:
+                        pid = parts[1]
+                        fb_mem = parts[3] if len(parts) > 3 else "0"
+                        if pid in container_pids:
+                            if fb_mem != "-" and fb_mem.isdigit():
+                                total_memory += int(fb_mem)
+
         except subprocess.TimeoutExpired:
-            logging.debug("nvidia-smi command timed out
+            logging.debug("nvidia-smi command timed out in get_container_gpu_memory_usage_multi_pid")
             return 0
         except (ValueError, IndexError) as e:
             logging.debug("Error parsing GPU memory usage info: %s", e)
@@ -257,72 +847,125 @@ class ResourcesTracker:
             logging.debug("nvidia-smi not found on this system")
             return 0
         except Exception as e:
-            logging.debug("Unexpected error in
+            logging.debug("Unexpected error in get_container_gpu_memory_usage_multi_pid: %s", e)
             return 0
+
         return total_memory

     @log_errors(default_return=(0, 0, 0, 0), raise_exception=False, log_error=True)
     def get_available_resources(self) -> Tuple[float, float, int, float]:
         """
         Get available machine resources.
+
+        Note: CPU measurement is non-blocking (uses interval=0).
+        For more accurate CPU usage, call this method periodically and track trends.

         Returns:
-            Tuple[float, float, int, float]:
-
+            Tuple[float, float, int, float]:
+                - Available memory in GB
+                - Available CPU percentage (100 - current_usage)
+                - Free GPU memory in MB
+                - GPU utilization percentage (0-100)
         """
-
-
+        # Memory: straightforward
+        available_memory = psutil.virtual_memory().available / (1024 ** 3)
+
+        # CPU: NON-BLOCKING - interval=0 returns instant snapshot
+        # For better accuracy, consider using load average or tracking over time
+        # Note: Inside containers, this may not reflect cgroup limits
+        try:
+            # Use interval=0 for non-blocking (returns cached value or 0.0 on first call)
+            cpu_percent = psutil.cpu_percent(1)
+            # # If first call (returns 0.0), try load average as fallback
+            # if cpu_percent == 0.0:
+            #     try:
+            #         # Use 1-minute load average as percentage of CPU count
+            #         load_avg = os.getloadavg()[0]
+            #         cpu_count = psutil.cpu_count() or 1
+            #         cpu_percent = min(100.0, (load_avg / cpu_count) * 100.0)
+            #     except (OSError, AttributeError):
+            #         # os.getloadavg() not available on Windows
+            #         pass
+            available_cpu = max(0.0, 100.0 - cpu_percent)
+        except Exception:
+            available_cpu = 100.0
+
         gpu_memory_free, gpu_utilization = self._get_gpu_resources()
         return available_memory, available_cpu, gpu_memory_free, gpu_utilization

     @log_errors(default_return=(0, 0.0), raise_exception=False, log_error=False)
     def _get_gpu_resources(self) -> Tuple[int, float]:
         """
-        Get available GPU resources.
+        Get available GPU resources using cached data.
+
+        Returns:
+            Tuple[int, float]: Free GPU memory in MB and GPU utilization percentage.
+        """
+        if not has_gpu():
+            return 0, 0.0
+
+        # Use cached GPU data for efficiency
+        gpu_data = self._get_cached_gpu_data()
+
+        if not gpu_data.get('gpus'):
+            # Cache miss or no GPUs, fall back to direct query
+            return self._get_gpu_resources_direct()
+
+        gpu_memory_free = 0
+        gpu_utilization = 0.0
+        gpu_count = 0
+
+        for gpu in gpu_data['gpus']:
+            gpu_memory_free += gpu['memory_total'] - gpu['memory_used']
+            gpu_utilization += gpu['utilization']
+            gpu_count += 1
+
+        if gpu_count > 0:
+            gpu_utilization /= gpu_count
+
+        return gpu_memory_free, gpu_utilization
+
+    @log_errors(default_return=(0, 0.0), raise_exception=False, log_error=False)
+    def _get_gpu_resources_direct(self) -> Tuple[int, float]:
+        """
+        Get GPU resources directly (fallback when cache is empty).

         Returns:
             Tuple[int, float]: Free GPU memory in MB and GPU utilization percentage.
         """
         gpu_memory_free = 0
         gpu_utilization = 0.0
-        if not has_gpu():
-            return gpu_memory_free, gpu_utilization

         try:
             result = subprocess.run(
-                ["nvidia-smi"],
+                ["nvidia-smi", "--query-gpu=memory.free,utilization.gpu", "--format=csv,noheader,nounits"],
                 stdout=subprocess.PIPE,
                 stderr=subprocess.PIPE,
+                text=True,
                 timeout=5,
-                check=False,
             )
             if result.returncode != 0:
-                logging.debug("nvidia-smi command failed in _get_gpu_resources")
                 return 0, 0.0
+
+            gpu_count = 0
+            for line in result.stdout.strip().split("\n"):
+                if not line.strip():
+                    continue
+                parts = [p.strip() for p in line.split(",")]
+                if len(parts) >= 2:
+                    gpu_memory_free += int(parts[0]) if parts[0].isdigit() else 0
+                    gpu_utilization += float(parts[1]) if parts[1].replace('.', '').isdigit() else 0
+                    gpu_count += 1
+
+            if gpu_count > 0:
+                gpu_utilization /= gpu_count
+
         except subprocess.TimeoutExpired:
-            logging.debug("nvidia-smi command timed out
-            return 0, 0.0
+            logging.debug("nvidia-smi command timed out in _get_gpu_resources_direct")
         except FileNotFoundError:
             logging.debug("nvidia-smi not found on this system")
-            return 0, 0.0
         except Exception as e:
-            logging.debug("Error
-            return 0, 0.0
-
-            info_list = get_gpu_info()
-            if not info_list:
-                return 0, 0.0
-
-            try:
-                for info in info_list:
-                    info_split = info.split(", ")
-                    if len(info_split) >= 6:
-                        gpu_memory_free += int(info_split[5])
-                        gpu_utilization += float(info_split[2])
-                gpu_utilization /= len(info_list) if info_list else 1
-            except (ValueError, IndexError) as e:
-                logging.debug("Error parsing GPU resources: %s", e)
-                return 0, 0.0
+            logging.debug("Error in _get_gpu_resources_direct: %s", e)

         return gpu_memory_free, gpu_utilization

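As a small standalone illustration of the memory/CPU half of this snapshot (GPU handling omitted), assuming nothing beyond psutil; note that `psutil.cpu_percent(interval=1)` blocks for about a second per call, while `interval=0`/`None` returns immediately using the time since the previous call:

```python
import psutil
from typing import Tuple


def available_cpu_and_memory() -> Tuple[float, float]:
    """Return (available memory in GB, available CPU percent)."""
    available_memory_gb = psutil.virtual_memory().available / (1024 ** 3)
    cpu_busy = psutil.cpu_percent(interval=1)  # measured over a one-second window
    return available_memory_gb, max(0.0, 100.0 - cpu_busy)

# e.g. (12.4, 63.0) on a lightly loaded 16 GB machine (illustrative numbers)
```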
{matrice_compute-0.1.33.dist-info → matrice_compute-0.1.34.dist-info}/RECORD
@@ -1,5 +1,5 @@
 matrice_compute/__init__.py,sha256=YZhx7rQlD1TAlhBMbsU3_Xp-tpLyTAxWZDcQvqmwR2g,723
-matrice_compute/action_instance.py,sha256=
+matrice_compute/action_instance.py,sha256=GF49-yYJp_5EHZ6ZT5kY4U-y1zyPkFjjDt1xMb2BaIg,87439
 matrice_compute/actions_manager.py,sha256=a_TulMnu462xc0t_A-Mpug5zhQTmtpjiv7mhiC_IAVw,18280
 matrice_compute/actions_scaledown_manager.py,sha256=pJ0nduNwHWZ10GnqJNx0Ok7cVWabQ_M8E2Vb9pH3A_k,2002
 matrice_compute/compute_operations_handler.py,sha256=amcMhmXtv2irE6qK8Vbgec_8uFqjWmVVp0VWq-73_MU,17781
@@ -7,12 +7,12 @@ matrice_compute/instance_manager.py,sha256=9u3QRTP-MkAWmrSQMMbCKc0TfK584teAg1wWI
 matrice_compute/instance_utils.py,sha256=N4yPDvNukFEEBngR0lEt4x_XT5hur1q0P-spM2xQIlU,42025
 matrice_compute/prechecks.py,sha256=W9YmNF3RcLhOf4U8WBlExvFqDw1aGWSNTlJtA73lbDQ,17196
 matrice_compute/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-matrice_compute/resources_tracker.py,sha256=
+matrice_compute/resources_tracker.py,sha256=DffKitGU1gran0OAuKIsfH0XeOe03xU7NGl-_uMsad4,58674
 matrice_compute/scaling.py,sha256=UQDI8wN9JEKafvUVPF0Pk9XmhKlbMkeu16AZyyOuSE8,55147
 matrice_compute/shutdown_manager.py,sha256=rnP9Qes6JJKDnebmBC9rqkH__X9a8TMjhWQPWoOQKFs,13232
 matrice_compute/task_utils.py,sha256=3qIutiQdYPyGRxH9ZwLbqdg8sZcnp6jp08pszWCRFl0,2820
-matrice_compute-0.1.
-matrice_compute-0.1.
-matrice_compute-0.1.
-matrice_compute-0.1.
-matrice_compute-0.1.
+matrice_compute-0.1.34.dist-info/licenses/LICENSE.txt,sha256=_uQUZpgO0mRYL5-fPoEvLSbNnLPv6OmbeEDCHXhK6Qc,1066
+matrice_compute-0.1.34.dist-info/METADATA,sha256=K4c_uaSlUeEbbC7yWB9RzW_qvLoxfgwGOk94BbbtaQs,1038
+matrice_compute-0.1.34.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+matrice_compute-0.1.34.dist-info/top_level.txt,sha256=63Plr3L1GzBUWZO5JZaFkiv8IcB10xUPU-9w3i6ptvE,16
+matrice_compute-0.1.34.dist-info/RECORD,,
The remaining files (WHEEL, licenses/LICENSE.txt, top_level.txt) are unchanged between 0.1.33 and 0.1.34.