snowflake-ml-python 1.8.4__py3-none-any.whl → 1.8.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (26)
  1. snowflake/ml/_internal/telemetry.py +42 -13
  2. snowflake/ml/data/data_connector.py +1 -1
  3. snowflake/ml/jobs/_utils/constants.py +9 -0
  4. snowflake/ml/jobs/_utils/interop_utils.py +1 -1
  5. snowflake/ml/jobs/_utils/payload_utils.py +12 -4
  6. snowflake/ml/jobs/_utils/scripts/constants.py +6 -0
  7. snowflake/ml/jobs/_utils/scripts/mljob_launcher.py +85 -2
  8. snowflake/ml/jobs/_utils/spec_utils.py +7 -5
  9. snowflake/ml/jobs/decorators.py +7 -3
  10. snowflake/ml/jobs/job.py +158 -25
  11. snowflake/ml/jobs/manager.py +29 -19
  12. snowflake/ml/model/_client/ops/service_ops.py +5 -3
  13. snowflake/ml/model/_client/service/model_deployment_spec.py +11 -0
  14. snowflake/ml/model/_client/sql/model_version.py +1 -1
  15. snowflake/ml/model/_client/sql/service.py +16 -19
  16. snowflake/ml/model/_model_composer/model_composer.py +3 -1
  17. snowflake/ml/model/_packager/model_handlers/sklearn.py +1 -1
  18. snowflake/ml/model/_packager/model_runtime/_snowml_inference_alternative_requirements.py +3 -2
  19. snowflake/ml/monitoring/explain_visualize.py +160 -22
  20. snowflake/ml/utils/connection_params.py +8 -2
  21. snowflake/ml/version.py +1 -1
  22. {snowflake_ml_python-1.8.4.dist-info → snowflake_ml_python-1.8.5.dist-info}/METADATA +27 -9
  23. {snowflake_ml_python-1.8.4.dist-info → snowflake_ml_python-1.8.5.dist-info}/RECORD +26 -26
  24. {snowflake_ml_python-1.8.4.dist-info → snowflake_ml_python-1.8.5.dist-info}/WHEEL +1 -1
  25. {snowflake_ml_python-1.8.4.dist-info → snowflake_ml_python-1.8.5.dist-info}/licenses/LICENSE.txt +0 -0
  26. {snowflake_ml_python-1.8.4.dist-info → snowflake_ml_python-1.8.5.dist-info}/top_level.txt +0 -0

snowflake/ml/_internal/telemetry.py
@@ -4,6 +4,7 @@ import enum
 import functools
 import inspect
 import operator
+import os
 import sys
 import time
 import traceback
@@ -13,7 +14,7 @@ from typing import Any, Callable, Iterable, Mapping, Optional, TypeVar, Union, c
 from typing_extensions import ParamSpec
 
 from snowflake import connector
-from snowflake.connector import telemetry as connector_telemetry, time_util
+from snowflake.connector import connect, telemetry as connector_telemetry, time_util
 from snowflake.ml import version as snowml_version
 from snowflake.ml._internal import env
 from snowflake.ml._internal.exceptions import (
@@ -37,6 +38,37 @@ _Args = ParamSpec("_Args")
 _ReturnValue = TypeVar("_ReturnValue")
 
 
+def _get_login_token() -> Union[str, bytes]:
+    with open("/snowflake/session/token") as f:
+        return f.read()
+
+
+def _get_snowflake_connection() -> Optional[connector.SnowflakeConnection]:
+    conn = None
+    if os.getenv("SNOWFLAKE_HOST") is not None and os.getenv("SNOWFLAKE_ACCOUNT") is not None:
+        try:
+            conn = connect(
+                host=os.getenv("SNOWFLAKE_HOST"),
+                account=os.getenv("SNOWFLAKE_ACCOUNT"),
+                token=_get_login_token(),
+                authenticator="oauth",
+            )
+        except Exception:
+            # Failed to get a new SnowflakeConnection in SPCS. Fall back to using the active session.
+            # This will work in some cases once SPCS enables multiple authentication modes, and users select any auth.
+            pass
+
+    if conn is None:
+        try:
+            active_session = next(iter(session._get_active_sessions()))
+            conn = active_session._conn._conn if active_session.telemetry_enabled else None
+        except snowpark_exceptions.SnowparkSessionException:
+            # Failed to get an active session. No connection available.
+            pass
+
+    return conn
+
+
 @enum.unique
 class TelemetryProject(enum.Enum):
     MLOPS = "MLOps"
@@ -378,10 +410,14 @@ def send_custom_usage(
     data: Optional[dict[str, Any]] = None,
     **kwargs: Any,
 ) -> None:
-    active_session = next(iter(session._get_active_sessions()))
-    assert active_session, "Missing active session object"
+    conn = _get_snowflake_connection()
+    if conn is None:
+        raise ValueError(
+            """Snowflake connection is required to send custom telemetry. This means there
+            must be at least one active session, or that telemetry is being sent from within an SPCS service."""
+        )
 
-    client = _SourceTelemetryClient(conn=active_session._conn._conn, project=project, subproject=subproject)
+    client = _SourceTelemetryClient(conn=conn, project=project, subproject=subproject)
     common_metrics = client._create_basic_telemetry_data(telemetry_type=telemetry_type)
     data = {**common_metrics, TelemetryField.KEY_DATA.value: data, **kwargs}
     client._send(msg=data)
@@ -501,7 +537,6 @@ def send_api_usage_telemetry(
                 return update_stmt_params_if_snowpark_df(result, statement_params)
 
             # prioritize `conn_attr_name` over the active session
-            telemetry_enabled = True
            if conn_attr_name:
                 # raise AttributeError if conn attribute does not exist in `self`
                 conn = operator.attrgetter(conn_attr_name)(args[0])
@@ -509,16 +544,10 @@ def send_api_usage_telemetry(
                     raise TypeError(
                         f"Expected a conn object of type {' or '.join(_CONNECTION_TYPES.keys())} but got {type(conn)}"
                     )
-            # get an active session
            else:
-                try:
-                    active_session = next(iter(session._get_active_sessions()))
-                    conn = active_session._conn._conn
-                    telemetry_enabled = active_session.telemetry_enabled
-                except snowpark_exceptions.SnowparkSessionException:
-                    conn = None
+                conn = _get_snowflake_connection()
 
-            if conn is None or not telemetry_enabled:
+            if conn is None:
                 # Telemetry not enabled, just execute without our additional telemetry logic
                 try:
                     return ctx.run(execute_func_with_statement_params)
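
For reference, a minimal standalone sketch of the token-based SPCS connection that the new `_get_snowflake_connection` helper tries first. The environment variable names and token path come from the diff above; the wrapper function itself is illustrative, not part of the package.

```python
import os

from snowflake.connector import connect


def spcs_connection():
    # Inside a Snowpark Container Services (SPCS) container, the platform injects
    # SNOWFLAKE_HOST / SNOWFLAKE_ACCOUNT and mounts a login token at this path.
    with open("/snowflake/session/token") as f:
        token = f.read()
    return connect(
        host=os.environ["SNOWFLAKE_HOST"],
        account=os.environ["SNOWFLAKE_ACCOUNT"],
        token=token,
        authenticator="oauth",
    )
```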

snowflake/ml/data/data_connector.py
@@ -249,7 +249,7 @@ class DataConnector:
 
         # Switch to use Runtime's Data Ingester if running in ML runtime
         # Fail silently if the data ingester is not found
-        if env.IN_ML_RUNTIME and os.getenv(env.USE_OPTIMIZED_DATA_INGESTOR):
+        if env.IN_ML_RUNTIME and os.getenv(env.USE_OPTIMIZED_DATA_INGESTOR, "").lower() in ("true", "1"):
            try:
                from runtime_external_entities import get_ingester_class
 
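
The old check treated any non-empty value of the optimized-ingestor variable as enabled, including the string "false"; the new comparison accepts only explicit true values. A quick illustration (the literal name below stands in for the value of `env.USE_OPTIMIZED_DATA_INGESTOR`):

```python
import os

os.environ["USE_OPTIMIZED_DATA_INGESTOR"] = "false"

# Old behavior: any non-empty string is truthy, so "false" still enabled the ingestor.
print(bool(os.getenv("USE_OPTIMIZED_DATA_INGESTOR")))  # True

# New behavior: only "true" or "1" (case-insensitive) enable it.
print(os.getenv("USE_OPTIMIZED_DATA_INGESTOR", "").lower() in ("true", "1"))  # False
```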

snowflake/ml/jobs/_utils/constants.py
@@ -5,6 +5,7 @@ from snowflake.ml.jobs._utils.types import ComputeResources
 DEFAULT_CONTAINER_NAME = "main"
 PAYLOAD_DIR_ENV_VAR = "MLRS_PAYLOAD_DIR"
 RESULT_PATH_ENV_VAR = "MLRS_RESULT_PATH"
+MIN_INSTANCES_ENV_VAR = "MLRS_MIN_INSTANCES"
 MEMORY_VOLUME_NAME = "dshm"
 STAGE_VOLUME_NAME = "stage-volume"
 STAGE_VOLUME_MOUNT_PATH = "/mnt/app"
@@ -37,6 +38,7 @@ RAY_PORTS = {
 # Node health check configuration
 # TODO(SNOW-1937020): Revisit the health check configuration
 ML_RUNTIME_HEALTH_CHECK_PORT = "5001"
+ENABLE_HEALTH_CHECKS_ENV_VAR = "ENABLE_HEALTH_CHECKS"
 ENABLE_HEALTH_CHECKS = "false"
 
 # Job status polling constants
@@ -47,6 +49,13 @@ JOB_POLL_MAX_DELAY_SECONDS = 1
 IS_MLJOB_REMOTE_ATTR = "_is_mljob_remote_callable"
 RESULT_PATH_DEFAULT_VALUE = "mljob_result.pkl"
 
+# Log start and end messages
+LOG_START_MSG = "--------------------------------\nML job started\n--------------------------------"
+LOG_END_MSG = "--------------------------------\nML job finished\n--------------------------------"
+
+# Default setting for verbose logging in get_log function
+DEFAULT_VERBOSE_LOG = False
+
 # Compute pool resource information
 # TODO: Query Snowflake for resource information instead of relying on this hardcoded
 # table from https://docs.snowflake.com/en/sql-reference/sql/create-compute-pool
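
The new `LOG_START_MSG`/`LOG_END_MSG` markers bracket the user script's output in the job log (see the launcher and startup-script changes below). A hedged sketch of how a raw log could be trimmed to the user portion using these markers; the helper is illustrative and not the library's log-retrieval code:

```python
LOG_START_MSG = "--------------------------------\nML job started\n--------------------------------"
LOG_END_MSG = "--------------------------------\nML job finished\n--------------------------------"


def user_portion(full_log: str) -> str:
    """Return only the text between the start and end markers, if both are present."""
    start = full_log.find(LOG_START_MSG)
    end = full_log.find(LOG_END_MSG)
    if start == -1 or end == -1:
        return full_log  # markers missing: fall back to the full log
    return full_log[start + len(LOG_START_MSG):end].strip()
```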

snowflake/ml/jobs/_utils/interop_utils.py
@@ -80,7 +80,7 @@ def fetch_result(session: snowpark.Session, result_path: str) -> ExecutionResult
         # TODO: Check if file exists
         with session.file.get_stream(result_path) as result_stream:
             return ExecutionResult.from_dict(pickle.load(result_stream))
-    except (sp_exceptions.SnowparkSQLException, TypeError, pickle.UnpicklingError):
+    except (sp_exceptions.SnowparkSQLException, pickle.UnpicklingError, TypeError, ImportError):
         # Fall back to JSON result if loading pickled result fails for any reason
         result_json_path = os.path.splitext(result_path)[0] + ".json"
         with session.file.get_stream(result_json_path) as result_stream:
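
`ImportError` joins the fallback tuple because unpickling re-imports the defining module of every pickled object; if that module is missing in the reading environment (for example after a SnowML version mismatch), `pickle.load` raises `ImportError` rather than `UnpicklingError`. A self-contained illustration with a made-up module name:

```python
import pickle
import sys
import types

# Build a throwaway module, pickle an instance of one of its classes, then remove
# the module so that unpickling has to import it again and fails.
mod = types.ModuleType("some_missing_module")


class Foo:
    pass


Foo.__module__ = "some_missing_module"
mod.Foo = Foo
sys.modules["some_missing_module"] = mod

payload = pickle.dumps(Foo())
del sys.modules["some_missing_module"]

try:
    pickle.loads(payload)
except ImportError as exc:  # ModuleNotFoundError is a subclass of ImportError
    print(f"pickle fallback triggered: {exc}")
```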

snowflake/ml/jobs/_utils/payload_utils.py
@@ -100,6 +100,11 @@ _STARTUP_SCRIPT_CODE = textwrap.dedent(
     # Parse the output using read
     read head_index head_ip head_status<<< "$head_info"
 
+    if [ "$SNOWFLAKE_JOB_INDEX" -ne "$head_index" ]; then
+        NODE_TYPE="worker"
+        echo "{constants.LOG_START_MSG}"
+    fi
+
     # Use the parsed variables
     echo "Head Instance Index: $head_index"
     echo "Head Instance IP: $head_ip"
@@ -117,9 +122,7 @@ _STARTUP_SCRIPT_CODE = textwrap.dedent(
         exit 1
     fi
 
-    if [ "$SNOWFLAKE_JOB_INDEX" -ne "$head_index" ]; then
-        NODE_TYPE="worker"
-    fi
+
 fi
 
 # Common parameters for both head and worker nodes
@@ -168,6 +171,10 @@ _STARTUP_SCRIPT_CODE = textwrap.dedent(
     # Start Ray on a worker node - run in background
     ray start "${{common_params[@]}}" "${{worker_params[@]}}" -v --block &
 
+    echo "Worker node started on address $eth0Ip. See more logs in the head node."
+
+    echo "{constants.LOG_END_MSG}"
+
     # Start the worker shutdown listener in the background
     echo "Starting worker shutdown listener..."
     python worker_shutdown_listener.py
@@ -189,15 +196,16 @@ _STARTUP_SCRIPT_CODE = textwrap.dedent(
 
     # Start Ray on the head node
     ray start "${{common_params[@]}}" "${{head_params[@]}}" -v
+
     ##### End Ray configuration #####
 
     # TODO: Monitor MLRS and handle process crashes
     python -m web.ml_runtime_grpc_server &
 
     # TODO: Launch worker service(s) using SQL if Ray and MLRS successfully started
+    echo Running command: python "$@"
 
     # Run user's Python entrypoint
-    echo Running command: python "$@"
     python "$@"
 
     # After the user's job completes, signal workers to shut down

snowflake/ml/jobs/_utils/scripts/constants.py
@@ -2,3 +2,9 @@
 SHUTDOWN_ACTOR_NAME = "ShutdownSignal"
 SHUTDOWN_ACTOR_NAMESPACE = "default"
 SHUTDOWN_RPC_TIMEOUT_SECONDS = 5.0
+
+
+# Log start and end messages
+# Inherited from snowflake.ml.jobs._utils.constants
+LOG_START_MSG = "--------------------------------\nML job started\n--------------------------------"
+LOG_END_MSG = "--------------------------------\nML job finished\n--------------------------------"

snowflake/ml/jobs/_utils/scripts/mljob_launcher.py
@@ -2,25 +2,35 @@ import argparse
 import copy
 import importlib.util
 import json
+import logging
 import os
 import runpy
 import sys
+import time
 import traceback
 import warnings
 from pathlib import Path
 from typing import Any, Optional
 
 import cloudpickle
+from constants import LOG_END_MSG, LOG_START_MSG
 
 from snowflake.ml.jobs._utils import constants
 from snowflake.ml.utils.connection_params import SnowflakeLoginOptions
 from snowflake.snowpark import Session
 
+# Configure logging
+logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
+logger = logging.getLogger(__name__)
+
 # Fallbacks in case of SnowML version mismatch
 RESULT_PATH_ENV_VAR = getattr(constants, "RESULT_PATH_ENV_VAR", "MLRS_RESULT_PATH")
-
 JOB_RESULT_PATH = os.environ.get(RESULT_PATH_ENV_VAR, "mljob_result.pkl")
 
+# Constants for the wait_for_min_instances function
+CHECK_INTERVAL = 10  # seconds
+TIMEOUT = 720  # seconds
+
 
 try:
     from snowflake.ml.jobs._utils.interop_utils import ExecutionResult
@@ -62,6 +72,66 @@ class SimpleJSONEncoder(json.JSONEncoder):
         return f"Unserializable object: {repr(obj)}"
 
 
+def get_active_node_count() -> int:
+    """
+    Count the number of active nodes in the Ray cluster.
+
+    Returns:
+        int: Total count of active nodes
+    """
+    import ray
+
+    if not ray.is_initialized():
+        ray.init(address="auto", ignore_reinit_error=True, log_to_driver=False)
+    try:
+        nodes = [node for node in ray.nodes() if node.get("Alive")]
+        total_active = len(nodes)
+
+        logger.info(f"Active nodes: {total_active}")
+        return total_active
+    except Exception as e:
+        logger.warning(f"Error getting active node count: {e}")
+        return 0
+
+
+def wait_for_min_instances(min_instances: int) -> None:
+    """
+    Wait until the specified minimum number of instances are available in the Ray cluster.
+
+    Args:
+        min_instances: Minimum number of instances required
+
+    Raises:
+        TimeoutError: If failed to connect to Ray or if minimum instances are not available within timeout
+    """
+    if min_instances <= 1:
+        logger.debug("Minimum instances is 1 or less, no need to wait for additional instances")
+        return
+
+    start_time = time.time()
+    timeout = os.getenv("JOB_MIN_INSTANCES_TIMEOUT", TIMEOUT)
+    check_interval = os.getenv("JOB_MIN_INSTANCES_CHECK_INTERVAL", CHECK_INTERVAL)
+    logger.debug(f"Waiting for at least {min_instances} instances to be ready (timeout: {timeout}s)")
+
+    while time.time() - start_time < timeout:
+        total_nodes = get_active_node_count()
+
+        if total_nodes >= min_instances:
+            elapsed = time.time() - start_time
+            logger.info(f"Minimum instance requirement met: {total_nodes} instances available after {elapsed:.1f}s")
+            return
+
+        logger.debug(
+            f"Waiting for instances: {total_nodes}/{min_instances} available "
+            f"(elapsed: {time.time() - start_time:.1f}s)"
+        )
+        time.sleep(check_interval)
+
+    raise TimeoutError(
+        f"Timed out after {timeout}s waiting for {min_instances} instances, only {get_active_node_count()} available"
    )
+
+
 def run_script(script_path: str, *script_args: Any, main_func: Optional[str] = None) -> Any:
     """
     Execute a Python script and return its result.
@@ -86,6 +156,7 @@ def run_script(script_path: str, *script_args: Any, main_func: Optional[str] = N
     session = Session.builder.configs(SnowflakeLoginOptions()).create()  # noqa: F841
 
     try:
+
         if main_func:
             # Use importlib for scripts with a main function defined
             module_name = Path(script_path).stem
@@ -126,9 +197,21 @@ def main(script_path: str, *script_args: Any, script_main_func: Optional[str] =
     Raises:
         Exception: Re-raises any exception caught during script execution.
     """
-    # Run the script with the specified arguments
     try:
+        # Wait for minimum required instances if specified
+        min_instances_str = os.environ.get("JOB_MIN_INSTANCES", 1)
+        if min_instances_str and int(min_instances_str) > 1:
+            wait_for_min_instances(int(min_instances_str))
+
+        # Log start marker for user script execution
+        print(LOG_START_MSG)  # noqa: T201
+
+        # Run the script with the specified arguments
         result = run_script(script_path, *script_args, main_func=script_main_func)
+
+        # Log end marker for user script execution
+        print(LOG_END_MSG)  # noqa: T201
+
         result_obj = ExecutionResult(result=result)
         return result_obj
     except Exception as e:
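
The gate in `main` only waits when more than one instance is requested; a small illustration of the values involved (the `JOB_MIN_INSTANCES` variable name comes from the launcher above, the surrounding prints do not):

```python
import os

# With JOB_MIN_INSTANCES unset, the default of 1 never satisfies "> 1", so no wait happens.
min_instances_str = os.environ.get("JOB_MIN_INSTANCES", 1)
print(bool(min_instances_str and int(min_instances_str) > 1))  # False

# With JOB_MIN_INSTANCES set to "3", the launcher calls wait_for_min_instances(3)
# before running the user script.
os.environ["JOB_MIN_INSTANCES"] = "3"
min_instances_str = os.environ.get("JOB_MIN_INSTANCES", 1)
print(bool(min_instances_str and int(min_instances_str) > 1))  # True
```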

snowflake/ml/jobs/_utils/spec_utils.py
@@ -85,7 +85,8 @@ def generate_service_spec(
     compute_pool: str,
     payload: types.UploadedPayload,
     args: Optional[list[str]] = None,
-    num_instances: Optional[int] = None,
+    target_instances: int = 1,
+    min_instances: int = 1,
     enable_metrics: bool = False,
 ) -> dict[str, Any]:
     """
@@ -96,13 +97,13 @@ def generate_service_spec(
         compute_pool: Compute pool for job execution
         payload: Uploaded job payload
         args: Arguments to pass to entrypoint script
-        num_instances: Number of instances for multi-node job
+        target_instances: Number of instances for multi-node job
         enable_metrics: Enable platform metrics for the job
+        min_instances: Minimum number of instances required to start the job
 
     Returns:
         Job service specification
     """
-    is_multi_node = num_instances is not None and num_instances > 1
     image_spec = _get_image_spec(session, compute_pool)
 
     # Set resource requests/limits, including nvidia.com/gpu quantity if applicable
@@ -180,10 +181,11 @@ def generate_service_spec(
     }
     endpoints = []
 
-    if is_multi_node:
+    if target_instances > 1:
         # Update environment variables for multi-node job
         env_vars.update(constants.RAY_PORTS)
-        env_vars["ENABLE_HEALTH_CHECKS"] = constants.ENABLE_HEALTH_CHECKS
+        env_vars[constants.ENABLE_HEALTH_CHECKS_ENV_VAR] = constants.ENABLE_HEALTH_CHECKS
+        env_vars[constants.MIN_INSTANCES_ENV_VAR] = str(min_instances)
 
         # Define Ray endpoints for intra-service instance communication
         ray_endpoints = [
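
For a multi-node job the container environment now carries the minimum-instance requirement alongside the health-check flag; a hedged sketch of the resulting variables for `target_instances=3, min_instances=2`, with the Ray port entries elided (values follow the constants shown earlier, the dict itself is illustrative):

```python
env_vars = {
    # ... constants.RAY_PORTS entries elided ...
    "ENABLE_HEALTH_CHECKS": "false",  # constants.ENABLE_HEALTH_CHECKS_ENV_VAR -> constants.ENABLE_HEALTH_CHECKS
    "MLRS_MIN_INSTANCES": "2",        # constants.MIN_INSTANCES_ENV_VAR -> str(min_instances)
}
```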

snowflake/ml/jobs/decorators.py
@@ -24,7 +24,8 @@ def remote(
     external_access_integrations: Optional[list[str]] = None,
     query_warehouse: Optional[str] = None,
     env_vars: Optional[dict[str, str]] = None,
-    num_instances: Optional[int] = None,
+    target_instances: int = 1,
+    min_instances: int = 1,
     enable_metrics: bool = False,
     database: Optional[str] = None,
     schema: Optional[str] = None,
@@ -40,7 +41,9 @@ def remote(
         external_access_integrations: A list of external access integrations.
         query_warehouse: The query warehouse to use. Defaults to session warehouse.
         env_vars: Environment variables to set in container
-        num_instances: The number of nodes in the job. If none specified, create a single node job.
+        target_instances: The number of nodes in the job. If none specified, create a single node job.
+        min_instances: The minimum number of nodes required to start the job. If none specified, defaults to 1.
+            If set, the job will not start until the minimum number of nodes is available.
         enable_metrics: Whether to enable metrics publishing for the job.
         database: The database to use for the job.
         schema: The schema to use for the job.
@@ -69,7 +72,8 @@ def remote(
                external_access_integrations=external_access_integrations,
                query_warehouse=query_warehouse,
                env_vars=env_vars,
-                num_instances=num_instances,
+                target_instances=target_instances,
+                min_instances=min_instances,
                enable_metrics=enable_metrics,
                database=database,
                schema=schema,
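
A hedged usage sketch of the renamed parameters. The compute pool, stage name, and table are placeholders, and the decorator's leading arguments are assumed to match earlier releases (a positional `compute_pool` plus a `stage_name` keyword); only `target_instances` and `min_instances` are taken from the diff above.

```python
from snowflake.ml.jobs import remote


@remote("MY_COMPUTE_POOL", stage_name="payload_stage", target_instances=3, min_instances=2)
def train(table_name: str) -> float:
    # Runs on up to 3 nodes; the job waits until at least 2 nodes are available.
    return 0.0


job = train("MY_DB.MY_SCHEMA.MY_TABLE")  # submits the job and returns a job handle, as in prior releases
```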