matrice-compute 0.1.18__tar.gz → 0.1.20__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (23)
  1. {matrice_compute-0.1.18 → matrice_compute-0.1.20}/PKG-INFO +1 -1
  2. {matrice_compute-0.1.18 → matrice_compute-0.1.20}/matrice_compute.egg-info/PKG-INFO +1 -1
  3. {matrice_compute-0.1.18 → matrice_compute-0.1.20}/src/matrice_compute/action_instance.py +40 -9
  4. {matrice_compute-0.1.18 → matrice_compute-0.1.20}/src/matrice_compute/instance_utils.py +305 -94
  5. {matrice_compute-0.1.18 → matrice_compute-0.1.20}/src/matrice_compute/resources_tracker.py +125 -53
  6. {matrice_compute-0.1.18 → matrice_compute-0.1.20}/LICENSE.txt +0 -0
  7. {matrice_compute-0.1.18 → matrice_compute-0.1.20}/README.md +0 -0
  8. {matrice_compute-0.1.18 → matrice_compute-0.1.20}/matrice_compute.egg-info/SOURCES.txt +0 -0
  9. {matrice_compute-0.1.18 → matrice_compute-0.1.20}/matrice_compute.egg-info/dependency_links.txt +0 -0
  10. {matrice_compute-0.1.18 → matrice_compute-0.1.20}/matrice_compute.egg-info/not-zip-safe +0 -0
  11. {matrice_compute-0.1.18 → matrice_compute-0.1.20}/matrice_compute.egg-info/top_level.txt +0 -0
  12. {matrice_compute-0.1.18 → matrice_compute-0.1.20}/pyproject.toml +0 -0
  13. {matrice_compute-0.1.18 → matrice_compute-0.1.20}/setup.cfg +0 -0
  14. {matrice_compute-0.1.18 → matrice_compute-0.1.20}/setup.py +0 -0
  15. {matrice_compute-0.1.18 → matrice_compute-0.1.20}/src/matrice_compute/__init__.py +0 -0
  16. {matrice_compute-0.1.18 → matrice_compute-0.1.20}/src/matrice_compute/actions_manager.py +0 -0
  17. {matrice_compute-0.1.18 → matrice_compute-0.1.20}/src/matrice_compute/actions_scaledown_manager.py +0 -0
  18. {matrice_compute-0.1.18 → matrice_compute-0.1.20}/src/matrice_compute/instance_manager.py +0 -0
  19. {matrice_compute-0.1.18 → matrice_compute-0.1.20}/src/matrice_compute/prechecks.py +0 -0
  20. {matrice_compute-0.1.18 → matrice_compute-0.1.20}/src/matrice_compute/py.typed +0 -0
  21. {matrice_compute-0.1.18 → matrice_compute-0.1.20}/src/matrice_compute/scaling.py +0 -0
  22. {matrice_compute-0.1.18 → matrice_compute-0.1.20}/src/matrice_compute/shutdown_manager.py +0 -0
  23. {matrice_compute-0.1.18 → matrice_compute-0.1.20}/src/matrice_compute/task_utils.py +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: matrice_compute
- Version: 0.1.18
+ Version: 0.1.20
  Summary: Common server utilities for Matrice.ai services
  Author-email: "Matrice.ai" <dipendra@matrice.ai>
  License-Expression: MIT
matrice_compute.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: matrice_compute
- Version: 0.1.18
+ Version: 0.1.20
  Summary: Common server utilities for Matrice.ai services
  Author-email: "Matrice.ai" <dipendra@matrice.ai>
  License-Expression: MIT
src/matrice_compute/action_instance.py
@@ -12,6 +12,7 @@ from matrice_compute.instance_utils import (
  get_gpu_with_sufficient_memory_for_action,
  get_decrypted_access_key_pair,
  get_max_file_system,
+ get_best_service_ip_and_network,
  )
  from matrice_compute.task_utils import (
  setup_workspace_and_run_task,
@@ -526,13 +527,18 @@ class ActionInstance:

  if username and password:
  login_cmd = f"docker login -u {shlex.quote(username)} -p {shlex.quote(password)}"
- subprocess.run(login_cmd, shell=True, check=True)
+ result = subprocess.run(login_cmd, shell=True, check=False, capture_output=True, text=True, timeout=30)
+ if result.returncode != 0:
+ raise Exception(f"Docker login failed with exit code {result.returncode}: {result.stderr}")
  logging.info("Docker login successful")
  else:
  logging.warning(
  "Docker credentials not available, skipping Docker login"
  )

+ except subprocess.TimeoutExpired:
+ logging.error("Docker login timed out after 30 seconds")
+ raise Exception("Docker login timed out")
  except Exception as err:
  logging.error(
  "Docker login failed: %s",
@@ -1151,9 +1157,17 @@ def inference_ws_server_execute(self: ActionInstance):
  return
  image = action_details["actionDetails"].get("docker")

-
  self.setup_action_requirements(action_details)

+ # Get the best IP and network configuration for port 8102
+ ws_host, use_host_network = get_best_service_ip_and_network(8102)
+
+ # Store ws_host in environment variable for use by other actions (e.g., fe_fs_streaming)
+ if not os.environ.get("INFERENCE_WS_HOST"):
+ os.environ["INFERENCE_WS_HOST"] = ws_host
+
+ logging.info(f"Inference WebSocket server will use IP: {ws_host} on port 8102 (use_host_network={use_host_network})")
+
  # Inference WebSocket server with --net=host (Port: 8102)
  worker_cmd = (
  f"docker run -d --pull=always --net=host "
@@ -1164,7 +1178,6 @@ def inference_ws_server_execute(self: ActionInstance):
  f"{image} "
  f"./app "
  f"{self.action_record_id} "
-
  )
  logging.info("Starting inference WebSocket server (Port: 8102): %s", worker_cmd)

@@ -1185,7 +1198,13 @@ def fe_fs_streaming_execute(self: ActionInstance):
  image = action_details["actionDetails"].get("docker")

  self.setup_action_requirements(action_details)
-
+
+ # Get the ws_host from environment variable set by inference_ws_server_execute
+ ws_host = os.environ.get("INFERENCE_WS_HOST", "localhost")
+ ws_url = f"{ws_host}:8102"
+
+ logging.info(f"Frontend streaming will connect to WebSocket at: {ws_url}")
+
  # Frontend streaming with --net=host (Port: 3000)
  worker_cmd = (
  f"docker run -d --pull=always --net=host "
@@ -1195,9 +1214,10 @@ def fe_fs_streaming_execute(self: ActionInstance):
  f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
  f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
  f"-e PORT=3000 "
+ f'-e WS_HOST="{ws_url}" '
  f"{image}"
  )
- logging.info("Starting frontend streaming (Port: 3000): %s", worker_cmd)
+ logging.info("Starting frontend streaming (Port: 3000) with WS_HOST=%s: %s", ws_url, worker_cmd)

  # Docker Command run
  self.start(worker_cmd, "fe_fs_streaming")
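The WS_HOST hand-off between these two actions is just a process-local environment variable: inference_ws_server_execute publishes the host it picked, and fe_fs_streaming_execute reads it back with a localhost fallback. A minimal sketch of that pattern (the 10.0.0.5 address is illustrative only):

    import os

    # Producer side (inference_ws_server_execute): publish the chosen host once.
    os.environ.setdefault("INFERENCE_WS_HOST", "10.0.0.5")

    # Consumer side (fe_fs_streaming_execute): read it back, defaulting to localhost.
    ws_url = f"{os.environ.get('INFERENCE_WS_HOST', 'localhost')}:8102"
    print(ws_url)  # 10.0.0.5:8102

Note that this hand-off only works when both actions run in the same agent process, since os.environ is not shared across processes.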
@@ -1304,6 +1324,11 @@ def redis_setup_execute(self: ActionInstance):
  action_id=action_id,
  )

+ # Get the best IP for Redis (port 6379)
+ redis_host, _ = get_best_service_ip_and_network(6379)
+
+ logging.info(f"Redis will use IP: {redis_host} on port 6379")
+
  redis_image = action_details["actionDetails"].get("redis_image", "redis:latest")

  # Redis container with --net=host (Port: 6379)
@@ -1315,7 +1340,7 @@ def redis_setup_execute(self: ActionInstance):
  f"redis-server --bind 0.0.0.0 --appendonly yes --requirepass {redis_password}"
  )

- logging.info("Starting Redis container (Port: 6379): %s", redis_cmd)
+ logging.info("Starting Redis container on %s:6379: %s", redis_host, redis_cmd)

  # Start Redis container first
  redis_process = subprocess.Popen(
@@ -1324,13 +1349,13 @@ def redis_setup_execute(self: ActionInstance):
  stdout=subprocess.PIPE,
  stderr=subprocess.PIPE,
  )
- logging.info("Redis container started successfully on localhost:6379")
+ logging.info("Redis container started successfully on %s:6379", redis_host)

  # Wait for Redis to be ready
  time.sleep(5)

  env_vars = {
- "REDIS_URL": f"localhost:6379",
+ "REDIS_URL": f"{redis_host}:6379",
  "REDIS_PASSWORD": redis_password,
  }

@@ -1348,7 +1373,7 @@ def redis_setup_execute(self: ActionInstance):
  f"{self.action_record_id} "
  )

- logging.info("Starting bg-redis management (Port: 8082): %s", cmd)
+ logging.info("Starting bg-redis management (Port: 8082) with REDIS_URL=%s: %s", env_vars['REDIS_URL'], cmd)

  self.start(cmd, "redis_setup")

@@ -1386,6 +1411,12 @@ def model_deploy_execute(self: ActionInstance):
  action_id=action_id,
  )
  use_gpu = self.get_gpu_config(action_details)
+
+ gpuRequired = action_details["actionDetails"]["gpuRequired"]
+ if gpuRequired==False:
+ use_gpu = ""
+ else:
+ use_gpu = "--runtime=nvidia"
  extra_env_vars = {"INTERNAL_PORT": internal_port}
  cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, mount_docker_sock=True, action_id=action_id, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_inference", "matrice_analytics"])} python3 deploy.py {self.action_record_id} {external_port}"'
  logging.info("cmd is: %s", cmd)
src/matrice_compute/instance_utils.py
@@ -95,28 +95,72 @@ def get_instance_info(service_provider: str = None, instance_id: str = None) ->
  return str(auto_service_provider), str(auto_instance_id)


+ def _normalize_timestamp(timestamp_str: str) -> str:
+ """
+ Normalize timestamp string to handle different precision levels.
+
+ Handles nanoseconds (9 digits), microseconds (6 digits), milliseconds (3 digits),
+ and various timezone formats across different cloud providers.
+
+ Args:
+ timestamp_str (str): Timestamp string in various formats
+
+ Returns:
+ str: Normalized timestamp string compatible with fromisoformat()
+ """
+ # Replace 'Z' with '+00:00' for UTC timestamps
+ timestamp_str = timestamp_str.replace("Z", "+00:00")
+
+ # Handle fractional seconds - Python's datetime only supports up to 6 digits (microseconds)
+ # Some providers (like OCI, GCP) may return nanoseconds (9 digits)
+ if "." in timestamp_str:
+ # Split into main part and fractional part
+ if "+" in timestamp_str:
+ main_part, tz_part = timestamp_str.rsplit("+", 1)
+ tz_suffix = "+" + tz_part
+ elif timestamp_str.count("-") > 2: # Has negative timezone offset
+ main_part, tz_part = timestamp_str.rsplit("-", 1)
+ tz_suffix = "-" + tz_part
+ else:
+ main_part = timestamp_str
+ tz_suffix = ""
+
+ # Split main part into date/time and fractional seconds
+ datetime_part, fractional = main_part.rsplit(".", 1)
+
+ # Truncate fractional seconds to 6 digits (microseconds)
+ if len(fractional) > 6:
+ fractional = fractional[:6]
+
+ # Reconstruct timestamp
+ timestamp_str = f"{datetime_part}.{fractional}{tz_suffix}"
+
+ return timestamp_str
+
+
  @log_errors(default_return=0, raise_exception=False, log_error=False)
  def calculate_time_difference(start_time_str: str, finish_time_str: str) -> int:
  """
  Calculate time difference between start and finish times.
+
+ Robust handling of timestamps from different cloud providers (AWS, GCP, Azure, OCI)
+ and different precision levels (nanoseconds, microseconds, milliseconds).

  Args:
- start_time_str (str): Start time string
- finish_time_str (str): Finish time string
+ start_time_str (str): Start time string in ISO format
+ finish_time_str (str): Finish time string in ISO format

  Returns:
  int: Time difference in seconds
  """
- if os.environ["SERVICE_PROVIDER"] in [
- "AWS",
- "OCI",
- "LAMBDA",
- ]:
- start_time = datetime.fromisoformat(start_time_str.split(".")[0] + "+00:00")
- finish_time = datetime.fromisoformat(finish_time_str.split(".")[0] + "+00:00")
- else:
- start_time = datetime.fromisoformat(start_time_str.replace("Z", "+00:00"))
- finish_time = datetime.fromisoformat(finish_time_str.replace("Z", "+00:00"))
+ # Normalize both timestamps to handle different formats
+ normalized_start = _normalize_timestamp(start_time_str)
+ normalized_finish = _normalize_timestamp(finish_time_str)
+
+ # Parse the normalized timestamps
+ start_time = datetime.fromisoformat(normalized_start)
+ finish_time = datetime.fromisoformat(normalized_finish)
+
  return int((finish_time - start_time).total_seconds())


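For context, datetime.fromisoformat() on Python versions before 3.11 rejects both a trailing 'Z' and fractional seconds longer than six digits, which appears to be what _normalize_timestamp works around. A small sketch of the same normalization on a made-up OCI-style timestamp:

    from datetime import datetime

    # Made-up timestamp with 9 fractional digits (nanoseconds) and a 'Z' suffix.
    raw = "2024-05-01T12:00:00.123456789Z"

    # Same steps as _normalize_timestamp: swap 'Z' for '+00:00', then truncate
    # the fractional part to 6 digits (microseconds).
    ts = raw.replace("Z", "+00:00")
    main, tz = ts.rsplit("+", 1)
    head, frac = main.rsplit(".", 1)
    ts = f"{head}.{frac[:6]}+{tz}"

    print(ts)                          # 2024-05-01T12:00:00.123456+00:00
    print(datetime.fromisoformat(ts))  # parses cleanly on Python 3.7+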
@@ -129,14 +173,25 @@ def has_gpu() -> bool:
  bool: True if GPU is present, False otherwise
  """
  try:
- subprocess.run("nvidia-smi", timeout=5)
- return True
+ result = subprocess.run(
+ ["nvidia-smi"],
+ stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE,
+ timeout=5,
+ check=False,
+ )
+ return result.returncode == 0
  except subprocess.TimeoutExpired:
- logging.warning("nvidia-smi command timed out after 5 seconds")
+ logging.debug("nvidia-smi command timed out after 5 seconds")
+ return False
+ except FileNotFoundError:
+ logging.debug("nvidia-smi not found on this system")
+ return False
+ except Exception:
  return False


- @log_errors(default_return=0, raise_exception=False)
+ @log_errors(default_return=0, raise_exception=False, log_error=False)
  def get_gpu_memory_usage() -> float:
  """
  Get GPU memory usage percentage.
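The rewritten has_gpu() is one instance of the subprocess pattern this release applies throughout: run with check=False plus a timeout, and treat a non-zero exit, a hung command, or a missing binary as a soft failure rather than an exception. A minimal standalone sketch of that pattern (command_succeeds is an illustrative helper name, not part of the package):

    import subprocess

    def command_succeeds(cmd: list, timeout: int = 5) -> bool:
        """Return True only if cmd exits 0; never raise to the caller."""
        try:
            result = subprocess.run(
                cmd,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                timeout=timeout,
                check=False,
            )
            return result.returncode == 0
        except subprocess.TimeoutExpired:
            return False  # hung command counts as failure
        except FileNotFoundError:
            return False  # binary not installed counts as failure
        except Exception:
            return False

    # has_gpu() reduces to roughly command_succeeds(["nvidia-smi"]).
    print(command_succeeds(["nvidia-smi"]))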
@@ -144,17 +199,35 @@ def get_gpu_memory_usage() -> float:
  Returns:
  float: Memory usage between 0 and 1
  """
- command = "nvidia-smi --query-gpu=memory.used,memory.total --format=csv,nounits,noheader"
+ command = ["nvidia-smi", "--query-gpu=memory.used,memory.total", "--format=csv,nounits,noheader"]
  try:
- output = subprocess.check_output(command.split(), timeout=5).decode("ascii").strip().split("\n")
+ result = subprocess.run(
+ command,
+ stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE,
+ timeout=5,
+ check=False,
+ )
+ if result.returncode != 0:
+ logging.debug("nvidia-smi command failed in get_gpu_memory_usage")
+ return 0
+ output = result.stdout.decode("ascii").strip().split("\n")
  memory_percentages = []
  for line in output:
- used, total = map(int, line.split(","))
- usage_percentage = used / total
- memory_percentages.append(usage_percentage)
- return min(memory_percentages)
+ if line.strip():
+ used, total = map(int, line.split(","))
+ if total > 0:
+ usage_percentage = used / total
+ memory_percentages.append(usage_percentage)
+ return min(memory_percentages) if memory_percentages else 0
  except subprocess.TimeoutExpired:
- logging.warning("nvidia-smi command timed out after 5 seconds in get_gpu_memory_usage")
+ logging.debug("nvidia-smi command timed out after 5 seconds in get_gpu_memory_usage")
+ return 0
+ except (ValueError, IndexError) as e:
+ logging.debug("Error parsing GPU memory info: %s", e)
+ return 0
+ except Exception as e:
+ logging.debug("Unexpected error in get_gpu_memory_usage: %s", e)
  return 0

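The new parsing loop in get_gpu_memory_usage() skips blank lines and zero totals before taking the minimum, i.e. the usage of the least-loaded GPU. A small sketch with fabricated nvidia-smi CSV output (values in MiB):

    # Fabricated output of:
    #   nvidia-smi --query-gpu=memory.used,memory.total --format=csv,nounits,noheader
    sample = "2048, 16384\n512, 16384\n"

    memory_percentages = []
    for line in sample.strip().split("\n"):
        if line.strip():
            used, total = map(int, line.split(","))
            if total > 0:
                memory_percentages.append(used / total)

    print(min(memory_percentages) if memory_percentages else 0)  # 0.03125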
@@ -194,7 +267,7 @@ def get_mem_usage() -> float:
  return mem_usage


- @log_errors(default_return=[], raise_exception=False)
+ @log_errors(default_return=[], raise_exception=False, log_error=False)
  def get_gpu_info() -> list:
  """
  Get GPU information.
@@ -202,23 +275,34 @@ def get_gpu_info() -> list:
  Returns:
  list: GPU information strings
  """
- proc = subprocess.Popen(
- [
- "nvidia-smi",
- "--query-gpu=index,uuid,utilization.gpu,memory.total,memory.used,memory.free,driver_version,name,gpu_serial,display_active,display_mode,temperature.gpu",
- "--format=csv,noheader,nounits",
- ],
- stdout=subprocess.PIPE,
- stderr=subprocess.PIPE,
- )
  try:
- stdout, stderr = proc.communicate(timeout=5)
- output = stdout.decode("UTF-8")
- return output.split("\n")[:-1]
- except subprocess.TimeoutExpired:
- logging.warning("nvidia-smi command timed out after 5 seconds in get_gpu_info")
- proc.kill()
- proc.communicate() # flush output after kill
+ proc = subprocess.Popen(
+ [
+ "nvidia-smi",
+ "--query-gpu=index,uuid,utilization.gpu,memory.total,memory.used,memory.free,driver_version,name,gpu_serial,display_active,display_mode,temperature.gpu",
+ "--format=csv,noheader,nounits",
+ ],
+ stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE,
+ )
+ try:
+ stdout, stderr = proc.communicate(timeout=5)
+ if proc.returncode != 0:
+ logging.debug("nvidia-smi command failed in get_gpu_info")
+ return []
+ output = stdout.decode("UTF-8")
+ result = [line for line in output.split("\n") if line.strip()]
+ return result
+ except subprocess.TimeoutExpired:
+ logging.debug("nvidia-smi command timed out after 5 seconds in get_gpu_info")
+ proc.kill()
+ proc.communicate() # flush output after kill
+ return []
+ except FileNotFoundError:
+ logging.debug("nvidia-smi not found on this system")
+ return []
+ except Exception as e:
+ logging.debug("Error getting GPU info: %s", e)
  return []

@@ -241,11 +325,29 @@ def is_docker_running() -> bool:
  Returns:
  bool: True if Docker containers are running
  """
- command = "docker ps"
- docker_images = (
- subprocess.check_output(command.split()).decode("ascii").split("\n")[:-1][1:]
- )
- return bool(docker_images)
+ command = ["docker", "ps"]
+ try:
+ result = subprocess.run(
+ command,
+ stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE,
+ check=False,
+ timeout=10,
+ )
+ if result.returncode != 0:
+ logging.warning("docker ps command failed")
+ return False
+ docker_images = result.stdout.decode("ascii").split("\n")[:-1][1:]
+ return bool(docker_images)
+ except subprocess.TimeoutExpired:
+ logging.warning("docker ps command timed out")
+ return False
+ except FileNotFoundError:
+ logging.warning("docker command not found")
+ return False
+ except Exception as e:
+ logging.warning("Error checking if docker is running: %s", e)
+ return False


  @log_errors(default_return=None, raise_exception=False)
@@ -502,7 +604,7 @@ def is_allowed_gpu_device(gpu_index: int) -> bool:
  return int(gpu_index) in allowed_gpus


- @log_errors(raise_exception=True)
+ @log_errors(raise_exception=True, log_error=False)
  def get_gpu_with_sufficient_memory_for_action(
  action_details: dict,
  ) -> list:
@@ -519,16 +621,38 @@ def get_gpu_with_sufficient_memory_for_action(
  ValueError: If insufficient GPU memory
  """
  required_gpu_memory = get_required_gpu_memory(action_details)
- command = "nvidia-smi --query-gpu=memory.free --format=csv"
+ command = ["nvidia-smi", "--query-gpu=memory.free", "--format=csv"]
  try:
- memory_free_info = subprocess.check_output(command.split(), timeout=5).decode("ascii").split("\n")
+ result = subprocess.run(
+ command,
+ stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE,
+ timeout=5,
+ check=False,
+ )
+ if result.returncode != 0:
+ raise ValueError("Failed to get GPU information - nvidia-smi command failed")
+ memory_free_info = result.stdout.decode("ascii").strip().split("\n")
  except subprocess.TimeoutExpired:
- logging.error("nvidia-smi command timed out after 5 seconds in get_gpu_with_sufficient_memory_for_action")
+ logging.warning("nvidia-smi command timed out after 5 seconds in get_gpu_with_sufficient_memory_for_action")
  raise ValueError("Failed to get GPU information - nvidia-smi timed out")
+ except FileNotFoundError:
+ raise ValueError("nvidia-smi not found - no GPU support available")
+ except Exception as e:
+ logging.warning("Error running nvidia-smi: %s", e)
+ raise ValueError(f"Failed to get GPU information: {e}")

  if len(memory_free_info) < 2:
  raise ValueError("No GPU information available from nvidia-smi")
- memory_free_values = [int(x.split()[0]) for x in memory_free_info[1:-1]]
+
+ try:
+ memory_free_values = [int(x.split()[0]) for x in memory_free_info[1:] if x.strip()]
+ except (ValueError, IndexError) as e:
+ raise ValueError(f"Error parsing GPU memory information: {e}")
+
+ if not memory_free_values:
+ raise ValueError("No GPU devices found")
+
  if required_gpu_memory < 80000:
  try:
  return get_single_gpu_with_sufficient_memory_for_action(action_details)
@@ -546,11 +670,11 @@ def get_gpu_with_sufficient_memory_for_action(
  if total_memory >= required_gpu_memory:
  return selected_gpus
  raise ValueError(
- f"Insufficient GPU memory available. Required: {required_gpu_memory}, Available: {total_memory}"
+ f"Insufficient GPU memory available. Required: {required_gpu_memory}MB, Available: {total_memory}MB"
  )


- @log_errors(raise_exception=True)
+ @log_errors(raise_exception=True, log_error=False)
  def get_single_gpu_with_sufficient_memory_for_action(
  action_details: dict,
  ) -> list:
@@ -567,16 +691,38 @@ def get_single_gpu_with_sufficient_memory_for_action(
  ValueError: If no GPU has sufficient memory
  """
  required_gpu_memory = get_required_gpu_memory(action_details)
- command = "nvidia-smi --query-gpu=memory.free --format=csv"
+ command = ["nvidia-smi", "--query-gpu=memory.free", "--format=csv"]
  try:
- memory_free_info = subprocess.check_output(command.split(), timeout=5).decode("ascii").split("\n")
+ result = subprocess.run(
+ command,
+ stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE,
+ timeout=5,
+ check=False,
+ )
+ if result.returncode != 0:
+ raise ValueError("Failed to get GPU information - nvidia-smi command failed")
+ memory_free_info = result.stdout.decode("ascii").strip().split("\n")
  except subprocess.TimeoutExpired:
- logging.error("nvidia-smi command timed out after 5 seconds in get_single_gpu_with_sufficient_memory_for_action")
+ logging.warning("nvidia-smi command timed out after 5 seconds in get_single_gpu_with_sufficient_memory_for_action")
  raise ValueError("Failed to get GPU information - nvidia-smi timed out")
+ except FileNotFoundError:
+ raise ValueError("nvidia-smi not found - no GPU support available")
+ except Exception as e:
+ logging.warning("Error running nvidia-smi: %s", e)
+ raise ValueError(f"Failed to get GPU information: {e}")

  if len(memory_free_info) < 2:
  raise ValueError("No GPU information available from nvidia-smi")
- memory_free_values = [int(x.split()[0]) for x in memory_free_info[1:-1]]
+
+ try:
+ memory_free_values = [int(x.split()[0]) for x in memory_free_info[1:] if x.strip()]
+ except (ValueError, IndexError) as e:
+ raise ValueError(f"Error parsing GPU memory information: {e}")
+
+ if not memory_free_values:
+ raise ValueError("No GPU devices found")
+
  best_fit_gpu = None
  best_fit_memory = float("inf")
  for i, mem in enumerate(memory_free_values):
@@ -692,47 +838,112 @@ def get_encrypted_access_key_pair(

  return encoded_access_key, encoded_secret_key

- @log_errors(default_return=False, raise_exception=False)
- def check_public_port_exposure(port: int) -> bool:
+ def _get_private_ip() -> str:
  """
- Check if port is publicly accessible.
+ Get the actual private/LAN IP address using UDP socket trick.
+ This works reliably even in Docker, NAT, VPN, etc.
+
+ Returns:
+ str: Private IP address or None if not available
+ """
+ try:
+ # Use UDP socket to determine which interface would be used for external connection
+ # No actual packets are sent
+ with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as s:
+ s.connect(("8.8.8.8", 80))
+ private_ip = s.getsockname()[0]
+ return private_ip
+ except Exception:
+ return None

- Args:
- port (int): Port number to check

- Returns:
- bool: True if port is publicly accessible
- """
- is_public_exposed = False
- is_locally_available = False
- # Check if port is publicly accessible
- public_ip = urllib.request.urlopen("https://ident.me", timeout=10).read().decode("utf8")
- with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as conn_sock:
- conn_sock.settimeout(3)
- result = conn_sock.connect_ex((public_ip, port))
- is_public_exposed = result == 0
+ def _public_ip_is_local(public_ip: str) -> bool:
+ """
+ Check if a public IP address is actually assigned to a local network interface.
+ This is true on cloud servers with real public IPs, false behind NAT.

- # Check if port is locally available
- with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as bind_sock:
- bind_sock.setsockopt(
- socket.SOL_SOCKET,
- socket.SO_REUSEADDR,
- 1,
- )
- bind_sock.bind(("", port))
- bind_sock.listen(1)
- is_locally_available = True
-
- if not is_public_exposed:
- logging.debug(
- "Port %d is not publicly exposed",
- port,
- )
+ Args:
+ public_ip (str): The public IP to check
+
+ Returns:
+ bool: True if the public IP is on a local interface
+ """
+ try:
+ for iface, addrs in psutil.net_if_addrs().items():
+ for addr in addrs:
+ if addr.family == socket.AF_INET:
+ if addr.address == public_ip:
+ return True
  return False
- if not is_locally_available:
- logging.debug(
- "Port %d is not locally available",
- port,
- )
+ except Exception:
  return False
- return True
+
+
+ @log_errors(default_return=("localhost", True), raise_exception=False)
+ def get_best_service_ip_and_network(port: int) -> tuple:
+ """
+ Determine the best IP address and network configuration for a service.
+
+ This function intelligently selects the best IP to bind a service to:
+
+ Priority:
+ 1. Public IP if it's actually on a local interface (cloud servers)
+ 2. Private/LAN IP (NAT, local network, Docker)
+ 3. localhost with --net=host (fallback)
+
+ Args:
+ port (int): Port number for the service
+
+ Returns:
+ tuple: (ip_address, use_host_network) where:
+ - ip_address: The IP address to use (public, private, or localhost)
+ - use_host_network: True if should use --net=host, False if should use port mapping
+ """
+ try:
+ # Check if port is available (not already in use)
+ try:
+ with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as test_sock:
+ test_sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
+ test_sock.bind(("0.0.0.0", port))
+ test_sock.listen(1)
+ # Port is available - socket closes automatically
+ except OSError as e:
+ logging.warning(f"Port {port} is already in use or cannot be bound: {e}, will use --net=host")
+ return "localhost", True
+
+ # Get the actual private/LAN IP
+ private_ip = _get_private_ip()
+ if private_ip:
+ logging.info(f"Determined private/LAN IP: {private_ip}")
+ else:
+ logging.debug("Could not determine private IP")
+
+ # Try to get public IP from external service
+ public_ip = None
+ try:
+ public_ip = urllib.request.urlopen("https://ident.me", timeout=10).read().decode("utf8").strip()
+ # Validate it's a proper IP address
+ socket.inet_aton(public_ip)
+ logging.info(f"Determined external/public IP: {public_ip}")
+ except Exception as e:
+ logging.debug(f"Could not determine public IP: {e}")
+
+ # Decision logic: Choose the best IP
+
+ # 1. If public IP is on a local interface, use it (cloud server with real public IP)
+ if public_ip and _public_ip_is_local(public_ip):
+ logging.info(f"Public IP {public_ip} is on local interface, using it for port {port}")
+ return public_ip, False
+
+ # 2. If we have a valid private IP, use it (most common case: NAT, LAN, Docker)
+ if private_ip and not private_ip.startswith("127."):
+ logging.info(f"Using private/LAN IP {private_ip} for port {port}")
+ return private_ip, False
+
+ # 3. Fall back to localhost with --net=host
+ logging.info(f"No suitable IP found, using localhost with --net=host for port {port}")
+ return "localhost", True
+
+ except Exception as e:
+ logging.warning(f"Error determining best IP for port {port}: {e}, falling back to localhost")
+ return "localhost", True
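The private-IP detection that get_best_service_ip_and_network() relies on is the standard connected-UDP-socket trick; a minimal standalone sketch (lan_ip and probe_addr are illustrative names, not part of the package):

    import socket

    def lan_ip(probe_addr: str = "8.8.8.8") -> str:
        # Connecting a UDP socket sends no packets; it only selects the outbound
        # interface, whose address getsockname() then reports.
        with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as s:
            s.connect((probe_addr, 80))
            return s.getsockname()[0]

    print(lan_ip())  # e.g. 192.168.1.23 rather than 127.0.0.1

On a host with no route to probe_addr the connect() call raises, which is why the real helper wraps this in try/except and the caller falls back to localhost with --net=host.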
src/matrice_compute/resources_tracker.py
@@ -56,7 +56,7 @@ class ResourcesTracker:
  return cpu_utilization, memory_utilization
  return 0, 0

- @log_errors(default_return=(0, 0), raise_exception=False)
+ @log_errors(default_return=(0, 0), raise_exception=False, log_error=False)
  def get_container_cpu_and_memory_with_container_id(self, container_id: str) -> Tuple[float, float]:
  """
  Get CPU and memory usage for a specific container by its ID.
@@ -67,32 +67,46 @@ class ResourcesTracker:
  Returns:
  Tuple[float, float]: CPU utilization percentage and memory usage in MB.
  """
- stats_result = subprocess.run(
- [
- "docker",
- "stats",
- "--no-stream",
- "--format",
- "{{.ID}}: {{.CPUPerc}} CPU, {{.MemUsage}} RAM",
- container_id,
- ],
- capture_output=True,
- text=True,
- check=True,
- )
- stats = stats_result.stdout.strip().split(": ")[1].split(", ")
- cpu_usage = float(stats[0].replace("% CPU", "").strip())
- memory_usage = stats[1].split(" / ")[0]
- mem_value, mem_unit = memory_usage[:-3], memory_usage[-3:]
- if mem_unit == "KiB":
- memory_usage_mb = float(mem_value) / 1024
- elif mem_unit == "MiB":
- memory_usage_mb = float(mem_value)
- elif mem_unit == "GiB":
- memory_usage_mb = float(mem_value) * 1024
- else:
- memory_usage_mb = float(mem_value)
- return cpu_usage, memory_usage_mb
+ try:
+ stats_result = subprocess.run(
+ [
+ "docker",
+ "stats",
+ "--no-stream",
+ "--format",
+ "{{.ID}}: {{.CPUPerc}} CPU, {{.MemUsage}} RAM",
+ container_id,
+ ],
+ capture_output=True,
+ text=True,
+ check=False,
+ timeout=10,
+ )
+ if stats_result.returncode != 0:
+ logging.debug("docker stats command failed for container %s", container_id)
+ return 0, 0
+ stats = stats_result.stdout.strip().split(": ")[1].split(", ")
+ cpu_usage = float(stats[0].replace("% CPU", "").strip())
+ memory_usage = stats[1].split(" / ")[0]
+ mem_value, mem_unit = memory_usage[:-3], memory_usage[-3:]
+ if mem_unit == "KiB":
+ memory_usage_mb = float(mem_value) / 1024
+ elif mem_unit == "MiB":
+ memory_usage_mb = float(mem_value)
+ elif mem_unit == "GiB":
+ memory_usage_mb = float(mem_value) * 1024
+ else:
+ memory_usage_mb = float(mem_value)
+ return cpu_usage, memory_usage_mb
+ except subprocess.TimeoutExpired:
+ logging.debug("docker stats command timed out for container %s", container_id)
+ return 0, 0
+ except (ValueError, IndexError) as e:
+ logging.debug("Error parsing docker stats for container %s: %s", container_id, e)
+ return 0, 0
+ except Exception as e:
+ logging.debug("Unexpected error getting container stats for %s: %s", container_id, e)
+ return 0, 0

  @log_errors(default_return=(0, 0), raise_exception=False, log_error=False)
  def get_container_gpu_info(self, container_id: str) -> Tuple[float, int]:
@@ -110,7 +124,7 @@ class ResourcesTracker:
  gpu_mem_used = self.get_container_gpu_memory_usage(container_pid)
  return gpu_util, gpu_mem_used

- @log_errors(default_return="", raise_exception=False)
+ @log_errors(default_return="", raise_exception=False, log_error=False)
  def get_pid_id_by_container_id(self, container_id: str) -> str:
  """
  Get PID for a container ID.
@@ -121,20 +135,31 @@ class ResourcesTracker:
  Returns:
  str: PID of the container.
  """
- pid_result = subprocess.run(
- [
- "docker",
- "inspect",
- "--format",
- "{{.State.Pid}}",
- container_id,
- ],
- capture_output=True,
- text=True,
- check=True,
- )
- container_pid = pid_result.stdout.strip()
- return container_pid
+ try:
+ pid_result = subprocess.run(
+ [
+ "docker",
+ "inspect",
+ "--format",
+ "{{.State.Pid}}",
+ container_id,
+ ],
+ capture_output=True,
+ text=True,
+ check=False,
+ timeout=10,
+ )
+ if pid_result.returncode != 0:
+ logging.debug("docker inspect command failed for container %s", container_id)
+ return ""
+ container_pid = pid_result.stdout.strip()
+ return container_pid
+ except subprocess.TimeoutExpired:
+ logging.debug("docker inspect command timed out for container %s", container_id)
+ return ""
+ except Exception as e:
+ logging.debug("Error getting PID for container %s: %s", container_id, e)
+ return ""

  @log_errors(default_return=0, raise_exception=False, log_error=False)
  def get_container_gpu_usage(self, container_pid: str) -> float:
@@ -155,9 +180,12 @@ class ResourcesTracker:
  ["nvidia-smi", "pmon", "-c", "1"],
  capture_output=True,
  text=True,
- check=True,
+ check=False,
  timeout=5,
  )
+ if result.returncode != 0:
+ logging.debug("nvidia-smi pmon command failed in get_container_gpu_usage")
+ return 0
  pmon_output = result.stdout.strip().split("\n")
  for line in pmon_output[2:]:
  parts = line.split()
@@ -167,7 +195,16 @@ class ResourcesTracker:
  if pid == str(container_pid):
  gpu_util += float(gpu_usage) if gpu_usage != "-" else 0
  except subprocess.TimeoutExpired:
- logging.warning("nvidia-smi pmon command timed out after 5 seconds in get_container_gpu_usage")
+ logging.debug("nvidia-smi pmon command timed out after 5 seconds in get_container_gpu_usage")
+ return 0
+ except (ValueError, IndexError) as e:
+ logging.debug("Error parsing GPU usage info: %s", e)
+ return 0
+ except FileNotFoundError:
+ logging.debug("nvidia-smi not found on this system")
+ return 0
+ except Exception as e:
+ logging.debug("Unexpected error in get_container_gpu_usage: %s", e)
  return 0
  return gpu_util

@@ -196,9 +233,12 @@ class ResourcesTracker:
  stdout=subprocess.PIPE,
  stderr=subprocess.PIPE,
  text=True,
- check=True,
+ check=False,
  timeout=5,
  )
+ if result.returncode != 0:
+ logging.debug("nvidia-smi command failed in get_container_gpu_memory_usage")
+ return 0
  for line in result.stdout.splitlines():
  parts = line.strip().split(", ")
  if len(parts) == 2:
@@ -206,7 +246,16 @@ class ResourcesTracker:
  if process_pid == str(container_pid):
  total_memory += int(used_memory)
  except subprocess.TimeoutExpired:
- logging.warning("nvidia-smi command timed out after 5 seconds in get_container_gpu_memory_usage")
+ logging.debug("nvidia-smi command timed out after 5 seconds in get_container_gpu_memory_usage")
+ return 0
+ except (ValueError, IndexError) as e:
+ logging.debug("Error parsing GPU memory usage info: %s", e)
+ return 0
+ except FileNotFoundError:
+ logging.debug("nvidia-smi not found on this system")
+ return 0
+ except Exception as e:
+ logging.debug("Unexpected error in get_container_gpu_memory_usage: %s", e)
  return 0
  return total_memory

@@ -238,17 +287,40 @@ class ResourcesTracker:
  return gpu_memory_free, gpu_utilization

  try:
- subprocess.check_output("nvidia-smi", timeout=5)
+ result = subprocess.run(
+ ["nvidia-smi"],
+ stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE,
+ timeout=5,
+ check=False,
+ )
+ if result.returncode != 0:
+ logging.debug("nvidia-smi command failed in _get_gpu_resources")
+ return 0, 0.0
  except subprocess.TimeoutExpired:
- logging.warning("nvidia-smi command timed out after 5 seconds in _get_gpu_resources")
+ logging.debug("nvidia-smi command timed out after 5 seconds in _get_gpu_resources")
+ return 0, 0.0
+ except FileNotFoundError:
+ logging.debug("nvidia-smi not found on this system")
+ return 0, 0.0
+ except Exception as e:
+ logging.debug("Error running nvidia-smi in _get_gpu_resources: %s", e)
  return 0, 0.0

  info_list = get_gpu_info()
- for info in info_list:
- info_split = info.split(", ")
- gpu_memory_free += int(info_split[5])
- gpu_utilization += float(info_split[2])
- gpu_utilization /= len(info_list) if info_list else 1
+ if not info_list:
+ return 0, 0.0
+
+ try:
+ for info in info_list:
+ info_split = info.split(", ")
+ if len(info_split) >= 6:
+ gpu_memory_free += int(info_split[5])
+ gpu_utilization += float(info_split[2])
+ gpu_utilization /= len(info_list) if info_list else 1
+ except (ValueError, IndexError) as e:
+ logging.debug("Error parsing GPU resources: %s", e)
+ return 0, 0.0

  return gpu_memory_free, gpu_utilization