snowflake-ml-python 1.8.0__py3-none-any.whl → 1.8.2__py3-none-any.whl
This diff shows the changes between publicly released versions of the package as they appear in their respective public registries; it is provided for informational purposes only.
- snowflake/cortex/_complete.py +44 -10
- snowflake/ml/_internal/platform_capabilities.py +39 -3
- snowflake/ml/data/data_connector.py +25 -0
- snowflake/ml/dataset/dataset_reader.py +5 -1
- snowflake/ml/jobs/_utils/constants.py +3 -5
- snowflake/ml/jobs/_utils/interop_utils.py +442 -0
- snowflake/ml/jobs/_utils/payload_utils.py +81 -47
- snowflake/ml/jobs/_utils/scripts/constants.py +4 -0
- snowflake/ml/jobs/_utils/scripts/get_instance_ip.py +136 -0
- snowflake/ml/jobs/_utils/scripts/mljob_launcher.py +178 -0
- snowflake/ml/jobs/_utils/scripts/signal_workers.py +203 -0
- snowflake/ml/jobs/_utils/scripts/worker_shutdown_listener.py +242 -0
- snowflake/ml/jobs/_utils/spec_utils.py +27 -8
- snowflake/ml/jobs/_utils/types.py +6 -0
- snowflake/ml/jobs/decorators.py +10 -6
- snowflake/ml/jobs/job.py +145 -23
- snowflake/ml/jobs/manager.py +79 -12
- snowflake/ml/model/_client/ops/model_ops.py +6 -3
- snowflake/ml/model/_client/ops/service_ops.py +57 -39
- snowflake/ml/model/_client/service/model_deployment_spec.py +7 -4
- snowflake/ml/model/_client/sql/service.py +11 -5
- snowflake/ml/model/_model_composer/model_composer.py +29 -11
- snowflake/ml/model/_model_composer/model_manifest/model_manifest.py +1 -2
- snowflake/ml/model/_packager/model_env/model_env.py +8 -2
- snowflake/ml/model/_packager/model_handlers/sklearn.py +1 -4
- snowflake/ml/model/_packager/model_meta/_packaging_requirements.py +1 -1
- snowflake/ml/model/_packager/model_meta/model_meta.py +6 -1
- snowflake/ml/model/_packager/model_meta/model_meta_schema.py +1 -0
- snowflake/ml/model/_packager/model_packager.py +2 -0
- snowflake/ml/model/_packager/model_runtime/_snowml_inference_alternative_requirements.py +1 -1
- snowflake/ml/model/type_hints.py +2 -0
- snowflake/ml/modeling/_internal/estimator_utils.py +5 -1
- snowflake/ml/registry/_manager/model_manager.py +20 -1
- snowflake/ml/registry/registry.py +46 -2
- snowflake/ml/version.py +1 -1
- {snowflake_ml_python-1.8.0.dist-info → snowflake_ml_python-1.8.2.dist-info}/METADATA +55 -4
- {snowflake_ml_python-1.8.0.dist-info → snowflake_ml_python-1.8.2.dist-info}/RECORD +40 -34
- {snowflake_ml_python-1.8.0.dist-info → snowflake_ml_python-1.8.2.dist-info}/WHEEL +1 -1
- {snowflake_ml_python-1.8.0.dist-info → snowflake_ml_python-1.8.2.dist-info}/licenses/LICENSE.txt +0 -0
- {snowflake_ml_python-1.8.0.dist-info → snowflake_ml_python-1.8.2.dist-info}/top_level.txt +0 -0
snowflake/ml/jobs/_utils/scripts/worker_shutdown_listener.py
ADDED
@@ -0,0 +1,242 @@
+#!/usr/bin/env python3
+# This file is part of the Ray-based distributed job system for Snowflake ML.
+# Architecture overview:
+# - Head node creates a ShutdownSignal actor and signals workers when job completes
+# - Worker nodes listen for this signal via this script and gracefully shut down
+# - This ensures clean termination of distributed Ray jobs
+import logging
+import signal
+import sys
+import time
+from typing import Optional
+
+import get_instance_ip
+import ray
+from constants import (
+    SHUTDOWN_ACTOR_NAME,
+    SHUTDOWN_ACTOR_NAMESPACE,
+    SHUTDOWN_RPC_TIMEOUT_SECONDS,
+)
+from ray.actor import ActorHandle
+
+logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
+
+
+def get_shutdown_actor() -> Optional[ActorHandle]:
+    """
+    Retrieve the shutdown signal actor from Ray.
+
+    Returns:
+        The shutdown signal actor or None if not found
+    """
+    try:
+        shutdown_signal = ray.get_actor(SHUTDOWN_ACTOR_NAME, namespace=SHUTDOWN_ACTOR_NAMESPACE)
+        return shutdown_signal
+    except Exception:
+        return None
+
+
+def ping_shutdown_actor(shutdown_signal: ActorHandle) -> bool:
+    """
+    Ping the shutdown actor to ensure connectivity.
+
+    Args:
+        shutdown_signal: The Ray actor handle for the shutdown signal
+
+    Returns:
+        True if ping succeeds, False otherwise
+    """
+    try:
+        ping_result = ray.get(shutdown_signal.ping.remote(), timeout=SHUTDOWN_RPC_TIMEOUT_SECONDS)
+        logging.debug(f"Actor ping result: {ping_result}")
+        return True
+    except (ray.exceptions.GetTimeoutError, Exception) as e:
+        logging.debug(f"Actor ping failed: {e}")
+        return False
+
+
+def check_shutdown_status(shutdown_signal: ActorHandle, worker_id: str) -> bool:
+    """
+    Check if worker should shutdown and acknowledge if needed.
+
+    Args:
+        shutdown_signal: The Ray actor handle for the shutdown signal
+        worker_id: Worker identifier (IP address)
+
+    Returns:
+        True if should shutdown, False otherwise
+    """
+    try:
+        status = ray.get(shutdown_signal.should_shutdown.remote(), timeout=SHUTDOWN_RPC_TIMEOUT_SECONDS)
+        logging.debug(f"Shutdown status: {status}")
+
+        if status.get("shutdown", False):
+            logging.info(
+                f"Received shutdown signal from head node at {status.get('timestamp')}. " f"Exiting worker process."
+            )
+
+            # Acknowledge shutdown before exiting
+            try:
+                ack_result = ray.get(
+                    shutdown_signal.acknowledge_shutdown.remote(worker_id), timeout=SHUTDOWN_RPC_TIMEOUT_SECONDS
+                )
+                logging.info(f"Acknowledged shutdown: {ack_result}")
+            except Exception as e:
+                logging.warning(f"Failed to acknowledge shutdown: {e}. Continue to exit worker.")
+
+            return True
+        return False
+
+    except Exception as e:
+        logging.debug(f"Error checking shutdown status: {e}")
+        return False
+
+
+def check_ray_connectivity() -> bool:
+    """
+    Check if the Ray cluster is accessible.
+
+    Returns:
+        True if Ray is connected, False otherwise
+    """
+    try:
+        # A simple check to verify Ray is working
+        nodes = ray.nodes()
+        if nodes:
+            return True
+        return False
+    except Exception as e:
+        logging.debug(f"Ray connectivity check failed: {e}")
+        return False
+
+
+def initialize_ray_connection(max_retries: int, initial_retry_delay: int, max_retry_delay: int) -> bool:
+    """
+    Initialize connection to Ray with retries.
+
+    Args:
+        max_retries: Maximum number of connection attempts
+        initial_retry_delay: Initial delay between retries in seconds
+        max_retry_delay: Maximum delay between retries in seconds
+
+    Returns:
+        bool: True if connection successful, False otherwise
+    """
+    retry_count = 0
+    retry_delay = initial_retry_delay
+
+    while retry_count < max_retries:
+        try:
+            ray.init(address="auto", ignore_reinit_error=True)
+            return True
+        except (ConnectionError, TimeoutError, RuntimeError) as e:
+            retry_count += 1
+            if retry_count >= max_retries:
+                logging.error(f"Failed to connect to Ray head after {max_retries} attempts: {e}")
+                return False
+
+            logging.debug(
+                f"Attempt {retry_count}/{max_retries} to connect to Ray failed: {e}. "
+                f"Retrying in {retry_delay} seconds..."
+            )
+            time.sleep(retry_delay)
+            # Exponential backoff with cap
+            retry_delay = min(retry_delay * 1.5, max_retry_delay)
+
+    return False  # Should not reach here, but added for completeness
+
+
+def monitor_shutdown_signal(check_interval: int, max_consecutive_failures: int) -> int:
+    """
+    Main loop to monitor for shutdown signals.
+
+    Args:
+        check_interval: Time in seconds between checks
+        max_consecutive_failures: Maximum allowed consecutive connection failures
+
+    Returns:
+        int: Exit code (0 for success, non-zero for failure)
+
+    Raises:
+        ConnectionError: If Ray connection failures exceed threshold
+    """
+    worker_id = get_instance_ip.get_self_ip()
+    actor_check_count = 0
+    consecutive_connection_failures = 0
+
+    logging.debug(
+        f"Starting to monitor for shutdown signal using actor {SHUTDOWN_ACTOR_NAME}"
+        f" in namespace {SHUTDOWN_ACTOR_NAMESPACE}."
+    )
+
+    while True:
+        actor_check_count += 1
+
+        # Check Ray connectivity before proceeding
+        if not check_ray_connectivity():
+            consecutive_connection_failures += 1
+            logging.debug(
+                f"Ray connectivity check failed (attempt {consecutive_connection_failures}/{max_consecutive_failures})"
+            )
+            if consecutive_connection_failures >= max_consecutive_failures:
+                raise ConnectionError("Exceeded max consecutive Ray connection failures")
+            time.sleep(check_interval)
+            continue
+
+        # Reset counter on successful connection
+        consecutive_connection_failures = 0
+
+        # Get shutdown actor
+        shutdown_signal = get_shutdown_actor()
+        if not shutdown_signal:
+            logging.debug(f"Shutdown signal actor not found at check #{actor_check_count}, continuing to wait...")
+            time.sleep(check_interval)
+            continue
+
+        # Ping the actor to ensure connectivity
+        if not ping_shutdown_actor(shutdown_signal):
+            time.sleep(check_interval)
+            continue
+
+        # Check shutdown status
+        if check_shutdown_status(shutdown_signal, worker_id):
+            return 0
+
+        # Wait before checking again
+        time.sleep(check_interval)
+
+
+def run_listener() -> int:
+    """Listen for shutdown signals from the head node"""
+    # Configuration
+    max_retries = 15
+    initial_retry_delay = 2
+    max_retry_delay = 30
+    check_interval = 5  # How often to check for ray connection or shutdown signal
+    max_consecutive_failures = 12  # Exit after about 1 minute of connection failures
+
+    # Initialize Ray connection
+    if not initialize_ray_connection(max_retries, initial_retry_delay, max_retry_delay):
+        raise ConnectionError("Failed to connect to Ray cluster. Aborting worker.")
+
+    # Monitor for shutdown signals
+    return monitor_shutdown_signal(check_interval, max_consecutive_failures)
+
+
+def main():
+    """Main entry point with signal handling"""

+    def signal_handler(signum, frame):
+        logging.info(f"Received signal {signum}, exiting worker process.")
+        sys.exit(0)
+
+    signal.signal(signal.SIGTERM, signal_handler)
+    signal.signal(signal.SIGINT, signal_handler)
+
+    # Run the listener - this will block until a shutdown signal is received
+    result = run_listener()
+    sys.exit(result)
+
+
+if __name__ == "__main__":
+    main()
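The listener above only consumes the shutdown protocol; the head-node side that creates the ShutdownSignal actor is not part of this file. A minimal sketch of an actor satisfying the interface the listener expects — ping(), should_shutdown() returning a dict with "shutdown" and "timestamp" keys, and acknowledge_shutdown(worker_id) — might look like the following; this is an illustrative assumption, and the real actor used by the job system (not shown in this excerpt) may differ.

# Hypothetical head-node counterpart to the worker listener above; names mirror the
# constants the listener imports, but this is a sketch, not the shipped implementation.
import time

import ray


@ray.remote
class ShutdownSignal:
    """Named actor the head node creates so workers can poll for shutdown."""

    def __init__(self) -> None:
        self._shutdown_requested = False
        self._timestamp = None
        self._acknowledged = set()

    def ping(self) -> str:
        # Lets workers verify the actor is reachable.
        return "pong"

    def request_shutdown(self) -> None:
        # Called on the head node once the job's entrypoint finishes.
        self._shutdown_requested = True
        self._timestamp = time.time()

    def should_shutdown(self) -> dict:
        # Shape matches what check_shutdown_status() reads above.
        return {"shutdown": self._shutdown_requested, "timestamp": self._timestamp}

    def acknowledge_shutdown(self, worker_id: str) -> str:
        # Workers report back before exiting so the head node can track them.
        self._acknowledged.add(worker_id)
        return f"acknowledged {worker_id}"


# The head node would create it as a named, detached actor in the shared namespace, e.g.:
# ShutdownSignal.options(name=SHUTDOWN_ACTOR_NAME, namespace=SHUTDOWN_ACTOR_NAMESPACE,
#                        lifetime="detached").remote()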
snowflake/ml/jobs/_utils/spec_utils.py
CHANGED
@@ -97,6 +97,7 @@ def generate_service_spec(
     payload: types.UploadedPayload,
     args: Optional[List[str]] = None,
     num_instances: Optional[int] = None,
+    enable_metrics: bool = False,
 ) -> Dict[str, Any]:
     """
     Generate a service specification for a job.
@@ -107,20 +108,15 @@ def generate_service_spec(
         payload: Uploaded job payload
         args: Arguments to pass to entrypoint script
         num_instances: Number of instances for multi-node job
+        enable_metrics: Enable platform metrics for the job
 
     Returns:
         Job service specification
     """
     is_multi_node = num_instances is not None and num_instances > 1
+    image_spec = _get_image_spec(session, compute_pool)
 
     # Set resource requests/limits, including nvidia.com/gpu quantity if applicable
-    if is_multi_node:
-        # If the job is of multi-node, we will need a different image which contains
-        # module snowflake.runtime.utils.get_instance_ip
-        # TODO(SNOW-1961849): Remove the hard-coded image name
-        image_spec = _get_image_spec(session, compute_pool, constants.MULTINODE_HEADLESS_IMAGE_TAG)
-    else:
-        image_spec = _get_image_spec(session, compute_pool)
     resource_requests: Dict[str, Union[str, int]] = {
         "cpu": f"{int(image_spec.resource_requests.cpu * 1000)}m",
         "memory": f"{image_spec.resource_limits.memory}Gi",
@@ -189,7 +185,10 @@ def generate_service_spec(
 
     # TODO: Add hooks for endpoints for integration with TensorBoard etc
 
-    env_vars = {
+    env_vars = {
+        constants.PAYLOAD_DIR_ENV_VAR: stage_mount.as_posix(),
+        constants.RESULT_PATH_ENV_VAR: constants.RESULT_PATH_DEFAULT_VALUE,
+    }
     endpoints = []
 
     if is_multi_node:
@@ -211,6 +210,16 @@ def generate_service_spec(
         ]
         endpoints.extend(ray_endpoints)
 
+    metrics = []
+    if enable_metrics:
+        # https://docs.snowflake.com/en/developer-guide/snowpark-container-services/monitoring-services#label-spcs-available-platform-metrics
+        metrics = [
+            "system",
+            "status",
+            "network",
+            "storage",
+        ]
+
     spec_dict = {
         "containers": [
             {
@@ -233,6 +242,16 @@ def generate_service_spec(
     }
     if endpoints:
         spec_dict["endpoints"] = endpoints
+    if metrics:
+        spec_dict.update(
+            {
+                "platformMonitor": {
+                    "metricConfig": {
+                        "groups": metrics,
+                    },
+                },
+            }
+        )
 
     # Assemble into service specification dict
     spec = {"spec": spec_dict}
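With enable_metrics=True, the hunks above extend the generated service spec with a platformMonitor section alongside the existing containers/endpoints entries. Roughly, the resulting fragment of spec_dict would look like the following sketch (surrounding container and volume entries elided):

# Illustrative fragment of the spec dict produced when enable_metrics=True.
spec_dict = {
    "containers": [...],  # populated earlier in generate_service_spec
    "platformMonitor": {
        "metricConfig": {
            # Platform metric groups published for the job's service
            "groups": ["system", "status", "network", "storage"],
        },
    },
}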
snowflake/ml/jobs/_utils/types.py
CHANGED
@@ -11,6 +11,12 @@ JOB_STATUS = Literal[
 ]
 
 
+@dataclass(frozen=True)
+class PayloadEntrypoint:
+    file_path: PurePath
+    main_func: Optional[str]
+
+
 @dataclass(frozen=True)
 class UploadedPayload:
     # TODO: Include manifest of payload files for validation
snowflake/ml/jobs/decorators.py
CHANGED
@@ -19,14 +19,16 @@ _ReturnValue = TypeVar("_ReturnValue")
 @telemetry.send_api_usage_telemetry(project=_PROJECT)
 def remote(
     compute_pool: str,
+    *,
     stage_name: str,
     pip_requirements: Optional[List[str]] = None,
     external_access_integrations: Optional[List[str]] = None,
     query_warehouse: Optional[str] = None,
     env_vars: Optional[Dict[str, str]] = None,
-    session: Optional[snowpark.Session] = None,
     num_instances: Optional[int] = None,
-
+    enable_metrics: bool = False,
+    session: Optional[snowpark.Session] = None,
+) -> Callable[[Callable[_Args, _ReturnValue]], Callable[_Args, jb.MLJob[_ReturnValue]]]:
     """
     Submit a job to the compute pool.
 
@@ -37,14 +39,15 @@ def remote(
         external_access_integrations: A list of external access integrations.
         query_warehouse: The query warehouse to use. Defaults to session warehouse.
         env_vars: Environment variables to set in container
-        session: The Snowpark session to use. If none specified, uses active session.
         num_instances: The number of nodes in the job. If none specified, create a single node job.
+        enable_metrics: Whether to enable metrics publishing for the job.
+        session: The Snowpark session to use. If none specified, uses active session.
 
     Returns:
         Decorator that dispatches invocations of the decorated function as remote jobs.
     """
 
-    def decorator(func: Callable[_Args, _ReturnValue]) -> Callable[_Args, jb.MLJob]:
+    def decorator(func: Callable[_Args, _ReturnValue]) -> Callable[_Args, jb.MLJob[_ReturnValue]]:
        # Copy the function to avoid modifying the original
        # We need to modify the line number of the function to exclude the
        # decorator from the copied source code
@@ -52,7 +55,7 @@ def remote(
         wrapped_func.__code__ = wrapped_func.__code__.replace(co_firstlineno=func.__code__.co_firstlineno + 1)
 
         @functools.wraps(func)
-        def wrapper(*args: _Args.args, **kwargs: _Args.kwargs) -> jb.MLJob:
+        def wrapper(*args: _Args.args, **kwargs: _Args.kwargs) -> jb.MLJob[_ReturnValue]:
             payload = functools.partial(func, *args, **kwargs)
             setattr(payload, constants.IS_MLJOB_REMOTE_ATTR, True)
             job = jm._submit_job(
@@ -63,8 +66,9 @@ def remote(
                 external_access_integrations=external_access_integrations,
                 query_warehouse=query_warehouse,
                 env_vars=env_vars,
-                session=session,
                 num_instances=num_instances,
+                enable_metrics=enable_metrics,
+                session=session,
             )
             assert isinstance(job, jb.MLJob), f"Unexpected job type: {type(job)}"
             return job
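As restructured above, everything after compute_pool is now keyword-only, enable_metrics slots in before session, and the decorator's return type is parameterized so the returned MLJob carries the wrapped function's return type. A hedged usage sketch, assuming remote is exported from snowflake.ml.jobs and using placeholder pool and stage names:

# Sketch only: "MY_POOL" and "@MY_STAGE" are placeholders for real account objects.
from snowflake.ml.jobs import remote


@remote(
    "MY_POOL",                # compute_pool is the only positional argument
    stage_name="@MY_STAGE",   # remaining arguments must now be passed by keyword
    num_instances=2,
    enable_metrics=True,      # new in 1.8.2: publish platform metrics for the job
)
def train(n_rounds: int) -> float:
    return 0.95


job = train(10)  # returns an MLJob[float] handle instead of running locally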
snowflake/ml/jobs/job.py
CHANGED
@@ -1,20 +1,32 @@
 import time
-from typing import Any, List, Optional, cast
+from typing import Any, Dict, Generic, List, Optional, TypeVar, cast
+
+import yaml
 
 from snowflake import snowpark
 from snowflake.ml._internal import telemetry
-from snowflake.ml.jobs._utils import constants, types
+from snowflake.ml.jobs._utils import constants, interop_utils, types
 from snowflake.snowpark import context as sp_context
 
 _PROJECT = "MLJob"
 TERMINAL_JOB_STATUSES = {"FAILED", "DONE", "INTERNAL_ERROR"}
 
+T = TypeVar("T")
+
 
-class MLJob:
-    def __init__(
+class MLJob(Generic[T]):
+    def __init__(
+        self,
+        id: str,
+        service_spec: Optional[Dict[str, Any]] = None,
+        session: Optional[snowpark.Session] = None,
+    ) -> None:
         self._id = id
+        self._service_spec_cached: Optional[Dict[str, Any]] = service_spec
         self._session = session or sp_context.get_active_session()
+
         self._status: types.JOB_STATUS = "PENDING"
+        self._result: Optional[interop_utils.ExecutionResult] = None
 
     @property
     def id(self) -> str:
@@ -29,33 +41,66 @@ class MLJob:
         self._status = _get_status(self._session, self.id)
         return self._status
 
+    @property
+    def _service_spec(self) -> Dict[str, Any]:
+        """Get the job's service spec."""
+        if not self._service_spec_cached:
+            self._service_spec_cached = _get_service_spec(self._session, self.id)
+        return self._service_spec_cached
+
+    @property
+    def _container_spec(self) -> Dict[str, Any]:
+        """Get the job's main container spec."""
+        containers = self._service_spec["spec"]["containers"]
+        container_spec = next(c for c in containers if c["name"] == constants.DEFAULT_CONTAINER_NAME)
+        return cast(Dict[str, Any], container_spec)
+
+    @property
+    def _stage_path(self) -> str:
+        """Get the job's artifact storage stage location."""
+        volumes = self._service_spec["spec"]["volumes"]
+        stage_path = next(v for v in volumes if v["name"] == constants.STAGE_VOLUME_NAME)["source"]
+        return cast(str, stage_path)
+
+    @property
+    def _result_path(self) -> str:
+        """Get the job's result file location."""
+        result_path = self._container_spec["env"].get(constants.RESULT_PATH_ENV_VAR)
+        if result_path is None:
+            raise RuntimeError(f"Job {self.id} doesn't have a result path configured")
+        return f"{self._stage_path}/{result_path}"
+
     @snowpark._internal.utils.private_preview(version="1.7.4")
-    def get_logs(self, limit: int = -1) -> str:
+    def get_logs(self, limit: int = -1, instance_id: Optional[int] = None) -> str:
         """
         Return the job's execution logs.
 
         Args:
             limit: The maximum number of lines to return. Negative values are treated as no limit.
+            instance_id: Optional instance ID to get logs from a specific instance.
+                If not provided, returns logs from the head node.
 
         Returns:
             The job's execution logs.
         """
-        logs = _get_logs(self._session, self.id, limit)
+        logs = _get_logs(self._session, self.id, limit, instance_id)
         assert isinstance(logs, str)  # mypy
         return logs
 
     @snowpark._internal.utils.private_preview(version="1.7.4")
-    def show_logs(self, limit: int = -1) -> None:
+    def show_logs(self, limit: int = -1, instance_id: Optional[int] = None) -> None:
         """
         Display the job's execution logs.
 
         Args:
             limit: The maximum number of lines to display. Negative values are treated as no limit.
+            instance_id: Optional instance ID to get logs from a specific instance.
+                If not provided, displays logs from the head node.
         """
-        print(self.get_logs(limit))  # noqa: T201: we need to print here.
+        print(self.get_logs(limit, instance_id))  # noqa: T201: we need to print here.
 
     @snowpark._internal.utils.private_preview(version="1.7.4")
-    @telemetry.send_api_usage_telemetry(project=_PROJECT)
+    @telemetry.send_api_usage_telemetry(project=_PROJECT, func_params_to_log=["timeout"])
     def wait(self, timeout: float = -1) -> types.JOB_STATUS:
         """
         Block until completion. Returns completion status.
@@ -78,20 +123,58 @@ class MLJob:
             delay = min(delay * 2, constants.JOB_POLL_MAX_DELAY_SECONDS)  # Exponential backoff
         return self.status
 
+    @snowpark._internal.utils.private_preview(version="1.8.2")
+    @telemetry.send_api_usage_telemetry(project=_PROJECT, func_params_to_log=["timeout"])
+    def result(self, timeout: float = -1) -> T:
+        """
+        Block until completion. Returns job execution result.
+
+        Args:
+            timeout: The maximum time to wait in seconds. Negative values are treated as no timeout.
+
+        Returns:
+            T: The deserialized job result. # noqa: DAR401
+
+        Raises:
+            RuntimeError: If the job failed or if the job doesn't have a result to retrieve.
+            TimeoutError: If the job does not complete within the specified timeout. # noqa: DAR402
+        """
+        if self._result is None:
+            self.wait(timeout)
+            try:
+                self._result = interop_utils.fetch_result(self._session, self._result_path)
+            except Exception as e:
+                raise RuntimeError(f"Failed to retrieve result for job (id={self.id})") from e
+
+        if self._result.success:
+            return cast(T, self._result.result)
+        raise RuntimeError(f"Job execution failed (id={self.id})") from self._result.exception
+
+
+@telemetry.send_api_usage_telemetry(project=_PROJECT, func_params_to_log=["job_id", "instance_id"])
+def _get_status(session: snowpark.Session, job_id: str, instance_id: Optional[int] = None) -> types.JOB_STATUS:
+    """Retrieve job or job instance execution status."""
+    if instance_id is not None:
+        # Get specific instance status
+        rows = session.sql("SHOW SERVICE INSTANCES IN SERVICE IDENTIFIER(?)", params=(job_id,)).collect()
+        for row in rows:
+            if row["instance_id"] == str(instance_id):
+                return cast(types.JOB_STATUS, row["status"])
+        raise ValueError(f"Instance {instance_id} not found in job {job_id}")
+    else:
+        (row,) = session.sql("DESCRIBE SERVICE IDENTIFIER(?)", params=(job_id,)).collect()
+        return cast(types.JOB_STATUS, row["status"])
+
 
 @telemetry.send_api_usage_telemetry(project=_PROJECT, func_params_to_log=["job_id"])
-def 
-    """Retrieve job execution
-
-
-
-
-
-
-
-
-@telemetry.send_api_usage_telemetry(project=_PROJECT, func_params_to_log=["job_id", "limit"])
-def _get_logs(session: snowpark.Session, job_id: str, limit: int = -1) -> str:
+def _get_service_spec(session: snowpark.Session, job_id: str) -> Dict[str, Any]:
+    """Retrieve job execution service spec."""
+    (row,) = session.sql("DESCRIBE SERVICE IDENTIFIER(?)", params=[job_id]).collect()
+    return cast(Dict[str, Any], yaml.safe_load(row["spec"]))
+
+
+@telemetry.send_api_usage_telemetry(project=_PROJECT, func_params_to_log=["job_id", "limit", "instance_id"])
+def _get_logs(session: snowpark.Session, job_id: str, limit: int = -1, instance_id: Optional[int] = None) -> str:
     """
     Retrieve the job's execution logs.
 
@@ -99,15 +182,54 @@ def _get_logs(session: snowpark.Session, job_id: str, limit: int = -1) -> str:
         job_id: The job ID.
         limit: The maximum number of lines to return. Negative values are treated as no limit.
         session: The Snowpark session to use. If none specified, uses active session.
+        instance_id: Optional instance ID to get logs from a specific instance.
 
     Returns:
         The job's execution logs.
     """
-
+    # If instance_id is not specified, try to get the head instance ID
+    if instance_id is None:
+        instance_id = _get_head_instance_id(session, job_id)
+
+    # Assemble params: [job_id, instance_id, container_name, (optional) limit]
+    params: List[Any] = [
+        job_id,
+        0 if instance_id is None else instance_id,
+        constants.DEFAULT_CONTAINER_NAME,
+    ]
     if limit > 0:
         params.append(limit)
+
     (row,) = session.sql(
-        f"SELECT SYSTEM$GET_SERVICE_LOGS(?, 
+        f"SELECT SYSTEM$GET_SERVICE_LOGS(?, ?, ?{f', ?' if limit > 0 else ''})",
         params=params,
     ).collect()
     return str(row[0])
+
+
+@telemetry.send_api_usage_telemetry(project=_PROJECT, func_params_to_log=["job_id"])
+def _get_head_instance_id(session: snowpark.Session, job_id: str) -> Optional[int]:
+    """
+    Retrieve the head instance ID of a job.
+
+    Args:
+        session: The Snowpark session to use.
+        job_id: The job ID.
+
+    Returns:
+        The head instance ID of the job. Returns None if the head instance has not started yet.
+    """
+    rows = session.sql("SHOW SERVICE INSTANCES IN SERVICE IDENTIFIER(?)", params=(job_id,)).collect()
+    if not rows:
+        return None
+
+    # Sort by start_time first, then by instance_id
+    sorted_instances = sorted(rows, key=lambda x: (x["start_time"], int(x["instance_id"])))
+    head_instance = sorted_instances[0]
+    if not head_instance["start_time"]:
+        # If head instance hasn't started yet, return None
+        return None
+    try:
+        return int(head_instance["instance_id"])
+    except (ValueError, TypeError):
+        return 0