snowflake-ml-python 1.8.3__py3-none-any.whl → 1.8.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- snowflake/cortex/__init__.py +7 -1
- snowflake/ml/_internal/platform_capabilities.py +13 -11
- snowflake/ml/_internal/telemetry.py +42 -13
- snowflake/ml/_internal/utils/identifier.py +2 -2
- snowflake/ml/data/data_connector.py +1 -1
- snowflake/ml/jobs/_utils/constants.py +10 -1
- snowflake/ml/jobs/_utils/interop_utils.py +1 -1
- snowflake/ml/jobs/_utils/payload_utils.py +51 -34
- snowflake/ml/jobs/_utils/scripts/constants.py +6 -0
- snowflake/ml/jobs/_utils/scripts/get_instance_ip.py +4 -4
- snowflake/ml/jobs/_utils/scripts/mljob_launcher.py +86 -3
- snowflake/ml/jobs/_utils/spec_utils.py +8 -6
- snowflake/ml/jobs/decorators.py +13 -3
- snowflake/ml/jobs/job.py +206 -26
- snowflake/ml/jobs/manager.py +78 -34
- snowflake/ml/model/_client/model/model_version_impl.py +1 -1
- snowflake/ml/model/_client/ops/service_ops.py +31 -17
- snowflake/ml/model/_client/service/model_deployment_spec.py +351 -170
- snowflake/ml/model/_client/service/model_deployment_spec_schema.py +25 -0
- snowflake/ml/model/_client/sql/model_version.py +1 -1
- snowflake/ml/model/_client/sql/service.py +20 -32
- snowflake/ml/model/_model_composer/model_composer.py +44 -19
- snowflake/ml/model/_packager/model_handlers/_utils.py +32 -2
- snowflake/ml/model/_packager/model_handlers/custom.py +1 -1
- snowflake/ml/model/_packager/model_handlers/pytorch.py +1 -2
- snowflake/ml/model/_packager/model_handlers/sklearn.py +100 -41
- snowflake/ml/model/_packager/model_handlers/tensorflow.py +7 -4
- snowflake/ml/model/_packager/model_handlers/torchscript.py +2 -2
- snowflake/ml/model/_packager/model_handlers/xgboost.py +16 -7
- snowflake/ml/model/_packager/model_meta/model_meta.py +2 -1
- snowflake/ml/model/_packager/model_meta/model_meta_schema.py +1 -0
- snowflake/ml/model/_packager/model_runtime/_snowml_inference_alternative_requirements.py +5 -4
- snowflake/ml/model/_signatures/dmatrix_handler.py +15 -2
- snowflake/ml/model/custom_model.py +17 -4
- snowflake/ml/model/model_signature.py +3 -3
- snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +9 -1
- snowflake/ml/modeling/cluster/affinity_propagation.py +9 -1
- snowflake/ml/modeling/cluster/agglomerative_clustering.py +9 -1
- snowflake/ml/modeling/cluster/birch.py +9 -1
- snowflake/ml/modeling/cluster/bisecting_k_means.py +9 -1
- snowflake/ml/modeling/cluster/dbscan.py +9 -1
- snowflake/ml/modeling/cluster/feature_agglomeration.py +9 -1
- snowflake/ml/modeling/cluster/k_means.py +9 -1
- snowflake/ml/modeling/cluster/mean_shift.py +9 -1
- snowflake/ml/modeling/cluster/mini_batch_k_means.py +9 -1
- snowflake/ml/modeling/cluster/optics.py +9 -1
- snowflake/ml/modeling/cluster/spectral_biclustering.py +9 -1
- snowflake/ml/modeling/cluster/spectral_clustering.py +9 -1
- snowflake/ml/modeling/cluster/spectral_coclustering.py +9 -1
- snowflake/ml/modeling/compose/column_transformer.py +9 -1
- snowflake/ml/modeling/compose/transformed_target_regressor.py +9 -1
- snowflake/ml/modeling/covariance/elliptic_envelope.py +9 -1
- snowflake/ml/modeling/covariance/empirical_covariance.py +9 -1
- snowflake/ml/modeling/covariance/graphical_lasso.py +9 -1
- snowflake/ml/modeling/covariance/graphical_lasso_cv.py +9 -1
- snowflake/ml/modeling/covariance/ledoit_wolf.py +9 -1
- snowflake/ml/modeling/covariance/min_cov_det.py +9 -1
- snowflake/ml/modeling/covariance/oas.py +9 -1
- snowflake/ml/modeling/covariance/shrunk_covariance.py +9 -1
- snowflake/ml/modeling/decomposition/dictionary_learning.py +9 -1
- snowflake/ml/modeling/decomposition/factor_analysis.py +9 -1
- snowflake/ml/modeling/decomposition/fast_ica.py +9 -1
- snowflake/ml/modeling/decomposition/incremental_pca.py +9 -1
- snowflake/ml/modeling/decomposition/kernel_pca.py +9 -1
- snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +9 -1
- snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +9 -1
- snowflake/ml/modeling/decomposition/pca.py +9 -1
- snowflake/ml/modeling/decomposition/sparse_pca.py +9 -1
- snowflake/ml/modeling/decomposition/truncated_svd.py +9 -1
- snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +9 -1
- snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +9 -1
- snowflake/ml/modeling/ensemble/ada_boost_classifier.py +9 -1
- snowflake/ml/modeling/ensemble/ada_boost_regressor.py +9 -1
- snowflake/ml/modeling/ensemble/bagging_classifier.py +9 -1
- snowflake/ml/modeling/ensemble/bagging_regressor.py +9 -1
- snowflake/ml/modeling/ensemble/extra_trees_classifier.py +9 -1
- snowflake/ml/modeling/ensemble/extra_trees_regressor.py +9 -1
- snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +9 -1
- snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +9 -1
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +9 -1
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +9 -1
- snowflake/ml/modeling/ensemble/isolation_forest.py +9 -1
- snowflake/ml/modeling/ensemble/random_forest_classifier.py +9 -1
- snowflake/ml/modeling/ensemble/random_forest_regressor.py +9 -1
- snowflake/ml/modeling/ensemble/stacking_regressor.py +9 -1
- snowflake/ml/modeling/ensemble/voting_classifier.py +9 -1
- snowflake/ml/modeling/ensemble/voting_regressor.py +9 -1
- snowflake/ml/modeling/feature_selection/generic_univariate_select.py +9 -1
- snowflake/ml/modeling/feature_selection/select_fdr.py +9 -1
- snowflake/ml/modeling/feature_selection/select_fpr.py +9 -1
- snowflake/ml/modeling/feature_selection/select_fwe.py +9 -1
- snowflake/ml/modeling/feature_selection/select_k_best.py +9 -1
- snowflake/ml/modeling/feature_selection/select_percentile.py +9 -1
- snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +9 -1
- snowflake/ml/modeling/feature_selection/variance_threshold.py +9 -1
- snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +9 -1
- snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +9 -1
- snowflake/ml/modeling/impute/iterative_imputer.py +9 -1
- snowflake/ml/modeling/impute/knn_imputer.py +9 -1
- snowflake/ml/modeling/impute/missing_indicator.py +9 -1
- snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +9 -1
- snowflake/ml/modeling/kernel_approximation/nystroem.py +9 -1
- snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +9 -1
- snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +9 -1
- snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +9 -1
- snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +9 -1
- snowflake/ml/modeling/lightgbm/lgbm_classifier.py +9 -1
- snowflake/ml/modeling/lightgbm/lgbm_regressor.py +9 -1
- snowflake/ml/modeling/linear_model/ard_regression.py +9 -1
- snowflake/ml/modeling/linear_model/bayesian_ridge.py +9 -1
- snowflake/ml/modeling/linear_model/elastic_net.py +9 -1
- snowflake/ml/modeling/linear_model/elastic_net_cv.py +9 -1
- snowflake/ml/modeling/linear_model/gamma_regressor.py +9 -1
- snowflake/ml/modeling/linear_model/huber_regressor.py +9 -1
- snowflake/ml/modeling/linear_model/lars.py +9 -1
- snowflake/ml/modeling/linear_model/lars_cv.py +9 -1
- snowflake/ml/modeling/linear_model/lasso.py +9 -1
- snowflake/ml/modeling/linear_model/lasso_cv.py +9 -1
- snowflake/ml/modeling/linear_model/lasso_lars.py +9 -1
- snowflake/ml/modeling/linear_model/lasso_lars_cv.py +9 -1
- snowflake/ml/modeling/linear_model/lasso_lars_ic.py +9 -1
- snowflake/ml/modeling/linear_model/linear_regression.py +9 -1
- snowflake/ml/modeling/linear_model/logistic_regression.py +9 -1
- snowflake/ml/modeling/linear_model/logistic_regression_cv.py +9 -1
- snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +9 -1
- snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +9 -1
- snowflake/ml/modeling/linear_model/multi_task_lasso.py +9 -1
- snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +9 -1
- snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +9 -1
- snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +9 -1
- snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +9 -1
- snowflake/ml/modeling/linear_model/perceptron.py +9 -1
- snowflake/ml/modeling/linear_model/poisson_regressor.py +9 -1
- snowflake/ml/modeling/linear_model/ransac_regressor.py +9 -1
- snowflake/ml/modeling/linear_model/ridge.py +9 -1
- snowflake/ml/modeling/linear_model/ridge_classifier.py +9 -1
- snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +9 -1
- snowflake/ml/modeling/linear_model/ridge_cv.py +9 -1
- snowflake/ml/modeling/linear_model/sgd_classifier.py +9 -1
- snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +9 -1
- snowflake/ml/modeling/linear_model/sgd_regressor.py +9 -1
- snowflake/ml/modeling/linear_model/theil_sen_regressor.py +9 -1
- snowflake/ml/modeling/linear_model/tweedie_regressor.py +9 -1
- snowflake/ml/modeling/manifold/isomap.py +9 -1
- snowflake/ml/modeling/manifold/mds.py +9 -1
- snowflake/ml/modeling/manifold/spectral_embedding.py +9 -1
- snowflake/ml/modeling/manifold/tsne.py +9 -1
- snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +9 -1
- snowflake/ml/modeling/mixture/gaussian_mixture.py +9 -1
- snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +9 -1
- snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +9 -1
- snowflake/ml/modeling/multiclass/output_code_classifier.py +9 -1
- snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +9 -1
- snowflake/ml/modeling/naive_bayes/categorical_nb.py +9 -1
- snowflake/ml/modeling/naive_bayes/complement_nb.py +9 -1
- snowflake/ml/modeling/naive_bayes/gaussian_nb.py +9 -1
- snowflake/ml/modeling/naive_bayes/multinomial_nb.py +9 -1
- snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +9 -1
- snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +9 -1
- snowflake/ml/modeling/neighbors/kernel_density.py +9 -1
- snowflake/ml/modeling/neighbors/local_outlier_factor.py +9 -1
- snowflake/ml/modeling/neighbors/nearest_centroid.py +9 -1
- snowflake/ml/modeling/neighbors/nearest_neighbors.py +9 -1
- snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +9 -1
- snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +9 -1
- snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +9 -1
- snowflake/ml/modeling/neural_network/bernoulli_rbm.py +9 -1
- snowflake/ml/modeling/neural_network/mlp_classifier.py +9 -1
- snowflake/ml/modeling/neural_network/mlp_regressor.py +9 -1
- snowflake/ml/modeling/preprocessing/polynomial_features.py +9 -1
- snowflake/ml/modeling/semi_supervised/label_propagation.py +9 -1
- snowflake/ml/modeling/semi_supervised/label_spreading.py +9 -1
- snowflake/ml/modeling/svm/linear_svc.py +9 -1
- snowflake/ml/modeling/svm/linear_svr.py +9 -1
- snowflake/ml/modeling/svm/nu_svc.py +9 -1
- snowflake/ml/modeling/svm/nu_svr.py +9 -1
- snowflake/ml/modeling/svm/svc.py +9 -1
- snowflake/ml/modeling/svm/svr.py +9 -1
- snowflake/ml/modeling/tree/decision_tree_classifier.py +9 -1
- snowflake/ml/modeling/tree/decision_tree_regressor.py +9 -1
- snowflake/ml/modeling/tree/extra_tree_classifier.py +9 -1
- snowflake/ml/modeling/tree/extra_tree_regressor.py +9 -1
- snowflake/ml/modeling/xgboost/xgb_classifier.py +9 -1
- snowflake/ml/modeling/xgboost/xgb_regressor.py +9 -1
- snowflake/ml/modeling/xgboost/xgbrf_classifier.py +9 -1
- snowflake/ml/modeling/xgboost/xgbrf_regressor.py +9 -1
- snowflake/ml/monitoring/explain_visualize.py +424 -0
- snowflake/ml/registry/_manager/model_manager.py +23 -2
- snowflake/ml/registry/registry.py +10 -9
- snowflake/ml/utils/connection_params.py +8 -2
- snowflake/ml/version.py +1 -1
- {snowflake_ml_python-1.8.3.dist-info → snowflake_ml_python-1.8.5.dist-info}/METADATA +58 -8
- {snowflake_ml_python-1.8.3.dist-info → snowflake_ml_python-1.8.5.dist-info}/RECORD +196 -195
- {snowflake_ml_python-1.8.3.dist-info → snowflake_ml_python-1.8.5.dist-info}/WHEEL +1 -1
- {snowflake_ml_python-1.8.3.dist-info → snowflake_ml_python-1.8.5.dist-info}/licenses/LICENSE.txt +0 -0
- {snowflake_ml_python-1.8.3.dist-info → snowflake_ml_python-1.8.5.dist-info}/top_level.txt +0 -0
snowflake/cortex/__init__.py
CHANGED
@@ -1,5 +1,10 @@
 from snowflake.cortex._classify_text import ClassifyText, classify_text
-from snowflake.cortex._complete import Complete, CompleteOptions, complete
+from snowflake.cortex._complete import (
+    Complete,
+    CompleteOptions,
+    ConversationMessage,
+    complete,
+)
 from snowflake.cortex._embed_text_768 import EmbedText768, embed_text_768
 from snowflake.cortex._embed_text_1024 import EmbedText1024, embed_text_1024
 from snowflake.cortex._extract_answer import ExtractAnswer, extract_answer
@@ -14,6 +19,7 @@ __all__ = [
     "Complete",
     "complete",
     "CompleteOptions",
+    "ConversationMessage",
     "EmbedText768",
     "embed_text_768",
     "EmbedText1024",
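The newly exported ConversationMessage rounds out the multi-turn prompt API. A minimal sketch of how the export might be used, assuming an active Snowpark session and a model available in the account (the model name here is illustrative):

```python
from snowflake.cortex import ConversationMessage, complete

conversation: list[ConversationMessage] = [
    {"role": "system", "content": "Answer in one short sentence."},
    {"role": "user", "content": "What is a Snowflake stage?"},
]
# complete() accepts a plain string prompt or a conversation-style list.
print(complete("mistral-large2", conversation))
```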
snowflake/ml/_internal/platform_capabilities.py
CHANGED
@@ -11,6 +11,9 @@ from snowflake.snowpark import (
     session as snowpark_session,
 )
 
+LIVE_COMMIT_PARAMETER = "ENABLE_LIVE_VERSION_IN_SDK"
+INLINE_DEPLOYMENT_SPEC_PARAMETER = "ENABLE_INLINE_DEPLOYMENT_SPEC"
+
 
 class PlatformCapabilities:
     """Class that retrieves platform feature values for the currently running server.
@@ -18,12 +21,12 @@ class PlatformCapabilities:
     Example usage:
     ```
     pc = PlatformCapabilities.get_instance(session)
-    if pc.
-        #
-        print("
+    if pc.is_inlined_deployment_spec_enabled():
+        # Inline deployment spec is enabled.
+        print("Inline deployment spec is enabled.")
     else:
-        #
-        print("
+        # Inline deployment spec is disabled.
+        print("Inline deployment spec is disabled or not supported.")
     ```
     """
 
@@ -50,9 +53,11 @@ class PlatformCapabilities:
 
     # For contextmanager, we need to have return type Iterator[Never]. However, Never type is introduced only in
    # Python 3.11. So, we are ignoring the type for this method.
+    _dummy_features: dict[str, Any] = {"dummy": "dummy"}
+
     @classmethod  # type: ignore[arg-type]
     @contextmanager
-    def mock_features(cls, features: dict[str, Any]) -> None:  # type: ignore[misc]
+    def mock_features(cls, features: dict[str, Any] = _dummy_features) -> None:  # type: ignore[misc]
         logging.debug(f"Setting mock features: {features}")
         cls.set_mock_features(features)
         try:
@@ -61,14 +66,11 @@ class PlatformCapabilities:
         logging.debug(f"Clearing mock features: {features}")
         cls.clear_mock_features()
 
-    def is_nested_function_enabled(self) -> bool:
-        return self._get_bool_feature("SPCS_MODEL_ENABLE_EMBEDDED_SERVICE_FUNCTIONS", False)
-
     def is_inlined_deployment_spec_enabled(self) -> bool:
-        return self._get_bool_feature(
+        return self._get_bool_feature(INLINE_DEPLOYMENT_SPEC_PARAMETER, False)
 
     def is_live_commit_enabled(self) -> bool:
-        return self._get_bool_feature(
+        return self._get_bool_feature(LIVE_COMMIT_PARAMETER, False)
 
     @staticmethod
     def _get_features(session: snowpark_session.Session) -> dict[str, Any]:
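With the `_dummy_features` default, test code can enter the mock context without fabricating a feature dict. A sketch under the assumption that feature values mock cleanly as booleans:

```python
from typing import Any

from snowflake.ml._internal.platform_capabilities import PlatformCapabilities

# No-argument form: code under test sees the placeholder {"dummy": "dummy"}.
with PlatformCapabilities.mock_features():
    pass

# Explicit form: force the inline deployment spec feature on for one test.
features: dict[str, Any] = {"ENABLE_INLINE_DEPLOYMENT_SPEC": True}
with PlatformCapabilities.mock_features(features):
    pass  # is_inlined_deployment_spec_enabled() should now report True
```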
snowflake/ml/_internal/telemetry.py
CHANGED
@@ -4,6 +4,7 @@ import enum
 import functools
 import inspect
 import operator
+import os
 import sys
 import time
 import traceback
@@ -13,7 +14,7 @@ from typing import Any, Callable, Iterable, Mapping, Optional, TypeVar, Union, c
 from typing_extensions import ParamSpec
 
 from snowflake import connector
-from snowflake.connector import telemetry as connector_telemetry, time_util
+from snowflake.connector import connect, telemetry as connector_telemetry, time_util
 from snowflake.ml import version as snowml_version
 from snowflake.ml._internal import env
 from snowflake.ml._internal.exceptions import (
@@ -37,6 +38,37 @@ _Args = ParamSpec("_Args")
 _ReturnValue = TypeVar("_ReturnValue")
 
 
+def _get_login_token() -> Union[str, bytes]:
+    with open("/snowflake/session/token") as f:
+        return f.read()
+
+
+def _get_snowflake_connection() -> Optional[connector.SnowflakeConnection]:
+    conn = None
+    if os.getenv("SNOWFLAKE_HOST") is not None and os.getenv("SNOWFLAKE_ACCOUNT") is not None:
+        try:
+            conn = connect(
+                host=os.getenv("SNOWFLAKE_HOST"),
+                account=os.getenv("SNOWFLAKE_ACCOUNT"),
+                token=_get_login_token(),
+                authenticator="oauth",
+            )
+        except Exception:
+            # Failed to get a new SnowflakeConnection in SPCS. Fall back to using the active session.
+            # This will work in some cases once SPCS enables multiple authentication modes, and users select any auth.
+            pass
+
+    if conn is None:
+        try:
+            active_session = next(iter(session._get_active_sessions()))
+            conn = active_session._conn._conn if active_session.telemetry_enabled else None
+        except snowpark_exceptions.SnowparkSessionException:
+            # Failed to get an active session. No connection available.
+            pass
+
+    return conn
+
+
 @enum.unique
 class TelemetryProject(enum.Enum):
     MLOPS = "MLOps"
@@ -378,10 +410,14 @@ def send_custom_usage(
     data: Optional[dict[str, Any]] = None,
     **kwargs: Any,
 ) -> None:
-
-
+    conn = _get_snowflake_connection()
+    if conn is None:
+        raise ValueError(
+            """Snowflake connection is required to send custom telemetry. This means there
+            must be at least one active session, or that telemetry is being sent from within an SPCS service."""
+        )
 
-    client = _SourceTelemetryClient(conn=
+    client = _SourceTelemetryClient(conn=conn, project=project, subproject=subproject)
     common_metrics = client._create_basic_telemetry_data(telemetry_type=telemetry_type)
     data = {**common_metrics, TelemetryField.KEY_DATA.value: data, **kwargs}
     client._send(msg=data)
@@ -501,7 +537,6 @@ def send_api_usage_telemetry(
             return update_stmt_params_if_snowpark_df(result, statement_params)
 
         # prioritize `conn_attr_name` over the active session
-        telemetry_enabled = True
         if conn_attr_name:
             # raise AttributeError if conn attribute does not exist in `self`
             conn = operator.attrgetter(conn_attr_name)(args[0])
@@ -509,16 +544,10 @@ def send_api_usage_telemetry(
                 raise TypeError(
                     f"Expected a conn object of type {' or '.join(_CONNECTION_TYPES.keys())} but got {type(conn)}"
                 )
-        # get an active session
         else:
-            try:
-                active_session = next(iter(session._get_active_sessions()))
-                conn = active_session._conn._conn
-                telemetry_enabled = active_session.telemetry_enabled
-            except snowpark_exceptions.SnowparkSessionException:
-                conn = None
+            conn = _get_snowflake_connection()
 
-        if conn is None
+        if conn is None:
             # Telemetry not enabled, just execute without our additional telemetry logic
             try:
                 return ctx.run(execute_func_with_statement_params)
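The shared `_get_snowflake_connection` helper changes the failure mode of `send_custom_usage`: with neither an SPCS login token nor an active session, it now raises a `ValueError` up front. A hedged sketch (argument names beyond those visible in the hunk are assumptions):

```python
from snowflake.ml._internal import telemetry

try:
    telemetry.send_custom_usage(
        "MLOps",  # project
        telemetry_type="custom",
        data={"event": "demo"},
    )
except ValueError as err:
    # Raised when no SPCS token file and no active Snowpark session exist.
    print(err)
```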
snowflake/ml/_internal/utils/identifier.py
CHANGED
@@ -12,7 +12,7 @@ SF_IDENTIFIER_RE = re.compile(_SF_IDENTIFIER)
 _SF_SCHEMA_LEVEL_OBJECT = (
     rf"(?:(?:(?P<db>{_SF_IDENTIFIER})\.)?(?P<schema>{_SF_IDENTIFIER})\.)?(?P<object>{_SF_IDENTIFIER})"
 )
-_SF_STAGE_PATH = rf"{_SF_SCHEMA_LEVEL_OBJECT}(?P<path
+_SF_STAGE_PATH = rf"@?{_SF_SCHEMA_LEVEL_OBJECT}(?P<path>/.*)?"
 _SF_SCHEMA_LEVEL_OBJECT_RE = re.compile(_SF_SCHEMA_LEVEL_OBJECT)
 _SF_STAGE_PATH_RE = re.compile(_SF_STAGE_PATH)
 
@@ -197,7 +197,7 @@ def parse_snowflake_stage_path(
         res.group("db"),
         res.group("schema"),
         res.group("object"),
-        res.group("path"),
+        res.group("path") or "",
     )
 
 
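The regex change is easiest to see at the call site: a leading `@` is now tolerated and a missing trailing path comes back as an empty string rather than `None`. A small sketch, assuming this internal helper behaves as the pattern above suggests:

```python
from snowflake.ml._internal.utils import identifier

db, schema, obj, path = identifier.parse_snowflake_stage_path("@MY_DB.MY_SCHEMA.MY_STAGE/models/v1")
print(db, schema, obj, path)  # MY_DB MY_SCHEMA MY_STAGE /models/v1

# Without a path component, group("path") is None, so "" is returned instead.
*_, path = identifier.parse_snowflake_stage_path("MY_DB.MY_SCHEMA.MY_STAGE")
assert path == ""
```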
snowflake/ml/data/data_connector.py
CHANGED
@@ -249,7 +249,7 @@ class DataConnector:
 
         # Switch to use Runtime's Data Ingester if running in ML runtime
         # Fail silently if the data ingester is not found
-        if env.IN_ML_RUNTIME and os.getenv(env.USE_OPTIMIZED_DATA_INGESTOR):
+        if env.IN_ML_RUNTIME and os.getenv(env.USE_OPTIMIZED_DATA_INGESTOR, "").lower() in ("true", "1"):
             try:
                 from runtime_external_entities import get_ingester_class
 
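The stricter check matters because `os.getenv` returns a raw string, and any non-empty string, including `"false"`, is truthy. A standalone illustration:

```python
import os

os.environ["USE_OPTIMIZED_DATA_INGESTOR"] = "false"

# Old-style check: enabled by any non-empty value, even "false".
assert bool(os.getenv("USE_OPTIMIZED_DATA_INGESTOR"))

# New-style check: only an explicit "true" or "1" enables the ingester.
assert os.getenv("USE_OPTIMIZED_DATA_INGESTOR", "").lower() not in ("true", "1")
```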
snowflake/ml/jobs/_utils/constants.py
CHANGED
@@ -5,6 +5,7 @@ from snowflake.ml.jobs._utils.types import ComputeResources
 DEFAULT_CONTAINER_NAME = "main"
 PAYLOAD_DIR_ENV_VAR = "MLRS_PAYLOAD_DIR"
 RESULT_PATH_ENV_VAR = "MLRS_RESULT_PATH"
+MIN_INSTANCES_ENV_VAR = "MLRS_MIN_INSTANCES"
 MEMORY_VOLUME_NAME = "dshm"
 STAGE_VOLUME_NAME = "stage-volume"
 STAGE_VOLUME_MOUNT_PATH = "/mnt/app"
@@ -13,7 +14,7 @@ STAGE_VOLUME_MOUNT_PATH = "/mnt/app"
 DEFAULT_IMAGE_REPO = "/snowflake/images/snowflake_images"
 DEFAULT_IMAGE_CPU = "st_plat/runtime/x86/runtime_image/snowbooks"
 DEFAULT_IMAGE_GPU = "st_plat/runtime/x86/generic_gpu/runtime_image/snowbooks"
-DEFAULT_IMAGE_TAG = "1.
+DEFAULT_IMAGE_TAG = "1.2.3"
 DEFAULT_ENTRYPOINT_PATH = "func.py"
 
 # Percent of container memory to allocate for /dev/shm volume
@@ -37,6 +38,7 @@ RAY_PORTS = {
 # Node health check configuration
 # TODO(SNOW-1937020): Revisit the health check configuration
 ML_RUNTIME_HEALTH_CHECK_PORT = "5001"
+ENABLE_HEALTH_CHECKS_ENV_VAR = "ENABLE_HEALTH_CHECKS"
 ENABLE_HEALTH_CHECKS = "false"
 
 # Job status polling constants
@@ -47,6 +49,13 @@ JOB_POLL_MAX_DELAY_SECONDS = 1
 IS_MLJOB_REMOTE_ATTR = "_is_mljob_remote_callable"
 RESULT_PATH_DEFAULT_VALUE = "mljob_result.pkl"
 
+# Log start and end messages
+LOG_START_MSG = "--------------------------------\nML job started\n--------------------------------"
+LOG_END_MSG = "--------------------------------\nML job finished\n--------------------------------"
+
+# Default setting for verbose logging in get_log function
+DEFAULT_VERBOSE_LOG = False
+
 # Compute pool resource information
 # TODO: Query Snowflake for resource information instead of relying on this hardcoded
 # table from https://docs.snowflake.com/en/sql-reference/sql/create-compute-pool
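The start/end markers give client code a stable way to separate user-script output from platform logs. A hypothetical trimming helper (the actual consumer is the verbose-log handling in snowflake/ml/jobs/job.py, which this excerpt does not show):

```python
from snowflake.ml.jobs._utils.constants import LOG_END_MSG, LOG_START_MSG

def trim_job_logs(raw: str) -> str:
    """Keep only the text between the job start/end markers, if present."""
    start = raw.find(LOG_START_MSG)
    end = raw.find(LOG_END_MSG)
    if start == -1 or end == -1:
        return raw  # Markers missing (e.g. older runtime image): keep everything.
    return raw[start + len(LOG_START_MSG) : end].strip()
```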
snowflake/ml/jobs/_utils/interop_utils.py
CHANGED
@@ -80,7 +80,7 @@ def fetch_result(session: snowpark.Session, result_path: str) -> ExecutionResult
         # TODO: Check if file exists
         with session.file.get_stream(result_path) as result_stream:
             return ExecutionResult.from_dict(pickle.load(result_stream))
-    except (sp_exceptions.SnowparkSQLException, TypeError,
+    except (sp_exceptions.SnowparkSQLException, pickle.UnpicklingError, TypeError, ImportError):
         # Fall back to JSON result if loading pickled result fails for any reason
         result_json_path = os.path.splitext(result_path)[0] + ".json"
         with session.file.get_stream(result_json_path) as result_stream:
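Why the broader except clause helps: a pickled result produced in a different environment can fail to load with `ImportError` (a module referenced by the payload is missing) or `pickle.UnpicklingError` (incompatible stream), not only `TypeError`. A self-contained sketch that forces the `ImportError` case:

```python
import pickle

payload = pickle.dumps(object())
# A same-length module rename keeps the stream well-formed but makes the
# referenced module unimportable, much like a stale result from another env.
broken = payload.replace(b"builtins", b"builtinz")
try:
    pickle.loads(broken)
except (pickle.UnpicklingError, TypeError, ImportError) as err:
    print(f"Falling back to JSON result: {err!r}")
```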
snowflake/ml/jobs/_utils/payload_utils.py
CHANGED
@@ -9,6 +9,7 @@ from pathlib import Path, PurePath
 from typing import Any, Callable, Optional, Union, cast, get_args, get_origin
 
 import cloudpickle as cp
+from packaging import version
 
 from snowflake import snowpark
 from snowflake.ml.jobs._utils import constants, types
@@ -97,11 +98,23 @@ _STARTUP_SCRIPT_CODE = textwrap.dedent(
         head_info=$(python3 get_instance_ip.py "$SNOWFLAKE_SERVICE_NAME" --head)
         if [ $? -eq 0 ]; then
             # Parse the output using read
-            read head_index head_ip <<< "$head_info"
+            read head_index head_ip head_status<<< "$head_info"
+
+            if [ "$SNOWFLAKE_JOB_INDEX" -ne "$head_index" ]; then
+                NODE_TYPE="worker"
+                echo "{constants.LOG_START_MSG}"
+            fi
 
             # Use the parsed variables
             echo "Head Instance Index: $head_index"
             echo "Head Instance IP: $head_ip"
+            echo "Head Instance Status: $head_status"
+
+            # If the head status is not "READY" or "PENDING", exit early
+            if [ "$head_status" != "READY" ] && [ "$head_status" != "PENDING" ]; then
+                echo "Head instance status is not READY or PENDING. Exiting."
+                exit 0
+            fi
 
         else
             echo "Error: Failed to get head instance information."
@@ -109,9 +122,7 @@ _STARTUP_SCRIPT_CODE = textwrap.dedent(
             exit 1
         fi
 
-
-        NODE_TYPE="worker"
-    fi
+
     fi
 
     # Common parameters for both head and worker nodes
@@ -160,6 +171,10 @@ _STARTUP_SCRIPT_CODE = textwrap.dedent(
         # Start Ray on a worker node - run in background
         ray start "${{common_params[@]}}" "${{worker_params[@]}}" -v --block &
 
+        echo "Worker node started on address $eth0Ip. See more logs in the head node."
+
+        echo "{constants.LOG_END_MSG}"
+
         # Start the worker shutdown listener in the background
         echo "Starting worker shutdown listener..."
         python worker_shutdown_listener.py
@@ -181,15 +196,16 @@ _STARTUP_SCRIPT_CODE = textwrap.dedent(
 
         # Start Ray on the head node
         ray start "${{common_params[@]}}" "${{head_params[@]}}" -v
+
         ##### End Ray configuration #####
 
         # TODO: Monitor MLRS and handle process crashes
         python -m web.ml_runtime_grpc_server &
 
         # TODO: Launch worker service(s) using SQL if Ray and MLRS successfully started
+        echo Running command: python "$@"
 
         # Run user's Python entrypoint
-        echo Running command: python "$@"
         python "$@"
 
         # After the user's job completes, signal workers to shut down
@@ -278,17 +294,19 @@ class JobPayload:
         stage_path = PurePath(stage_path) if isinstance(stage_path, str) else stage_path
         source = resolve_source(self.source)
         entrypoint = resolve_entrypoint(source, self.entrypoint)
+        pip_requirements = self.pip_requirements or []
 
         # Create stage if necessary
         stage_name = stage_path.parts[0].lstrip("@")
         # Explicitly check if stage exists first since we may not have CREATE STAGE privilege
         try:
-            session.sql(
+            session.sql("describe stage identifier(?)", params=[stage_name]).collect()
         except sp_exceptions.SnowparkSQLException:
             session.sql(
-
+                "create stage if not exists identifier(?)"
                 " encryption = ( type = 'SNOWFLAKE_SSE' )"
-                " comment = 'Created by snowflake.ml.jobs Python API'"
+                " comment = 'Created by snowflake.ml.jobs Python API'",
+                params=[stage_name],
             ).collect()
 
         # Upload payload to stage
@@ -301,6 +319,8 @@ class JobPayload:
                 overwrite=True,
             )
             source = Path(entrypoint.file_path.parent)
+            if not any(r.startswith("cloudpickle") for r in pip_requirements):
+                pip_requirements.append(f"cloudpickle~={version.parse(cp.__version__).major}.0")
         elif source.is_dir():
             # Manually traverse the directory and upload each file, since Snowflake PUT
             # can't handle directories. Reduce the number of PUT operations by using
@@ -325,10 +345,10 @@ class JobPayload:
 
         # Upload requirements
         # TODO: Check if payload includes both a requirements.txt file and pip_requirements
-        if self.pip_requirements:
+        if pip_requirements:
             # Upload requirements.txt to stage
             session.file.put_stream(
-                io.BytesIO("\n".join(self.pip_requirements).encode()),
+                io.BytesIO("\n".join(pip_requirements).encode()),
                 stage_location=stage_path.joinpath("requirements.txt").as_posix(),
                 auto_compress=False,
                 overwrite=True,
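The requirement appended for function payloads pins cloudpickle to the serializer's major version with a compatible-release specifier. A sketch of what gets generated, assuming cloudpickle 3.x is installed locally:

```python
import cloudpickle as cp
from packaging import version

requirement = f"cloudpickle~={version.parse(cp.__version__).major}.0"
print(requirement)  # e.g. "cloudpickle~=3.0", meaning >=3.0 and <4.0
```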
@@ -495,13 +515,6 @@ def generate_python_code(func: Callable[..., Any], source_code_display: bool = F
     # https://github.com/snowflakedb/snowpark-python/blob/main/src/snowflake/snowpark/_internal/udf_utils.py
     source_code_comment = _generate_source_code_comment(func) if source_code_display else ""
 
-    func_code = f"""
-{source_code_comment}
-
-import pickle
-{_ENTRYPOINT_FUNC_NAME} = pickle.loads(bytes.fromhex('{_serialize_callable(func).hex()}'))
-"""
-
     arg_dict_name = "kwargs"
     if getattr(func, constants.IS_MLJOB_REMOTE_ATTR, None):
         param_code = f"{arg_dict_name} = {{}}"
@@ -509,25 +522,29 @@ import pickle
         param_code = _generate_param_handler_code(signature, arg_dict_name)
 
     return f"""
-### Version guard to check compatibility across Python versions ###
-import os
 import sys
-import warnings
-
-if sys.version_info.major != {sys.version_info.major} or sys.version_info.minor != {sys.version_info.minor}:
-    warnings.warn(
-        "Python version mismatch: job was created using"
-        " python{sys.version_info.major}.{sys.version_info.minor}"
-        f" but runtime environment uses python{{sys.version_info.major}}.{{sys.version_info.minor}}."
-        " Compatibility across Python versions is not guaranteed and may result in unexpected behavior."
-        " This will be fixed in a future release; for now, please use Python version"
-        f" {{sys.version_info.major}}.{{sys.version_info.minor}}.",
-        RuntimeWarning,
-        stacklevel=0,
-    )
-### End version guard ###
+import pickle
 
-
+try:
+{textwrap.indent(source_code_comment, '    ')}
+    {_ENTRYPOINT_FUNC_NAME} = pickle.loads(bytes.fromhex('{_serialize_callable(func).hex()}'))
+except (TypeError, pickle.PickleError):
+    if sys.version_info.major != {sys.version_info.major} or sys.version_info.minor != {sys.version_info.minor}:
+        raise RuntimeError(
+            "Failed to deserialize function due to Python version mismatch."
+            f" Runtime environment is Python {{sys.version_info.major}}.{{sys.version_info.minor}}"
+            " but function was serialized using Python {sys.version_info.major}.{sys.version_info.minor}."
+        ) from None
+    raise
+except AttributeError as e:
+    if 'cloudpickle' in str(e):
+        import cloudpickle as cp
+        raise RuntimeError(
+            "Failed to deserialize function due to cloudpickle version mismatch."
+            f" Runtime environment uses cloudpickle=={{cp.__version__}}"
+            " but job was serialized using cloudpickle=={cp.__version__}."
+        ) from e
+    raise
 
 if __name__ == '__main__':
 {textwrap.indent(param_code, '    ')}
snowflake/ml/jobs/_utils/scripts/constants.py
CHANGED
@@ -2,3 +2,9 @@
 SHUTDOWN_ACTOR_NAME = "ShutdownSignal"
 SHUTDOWN_ACTOR_NAMESPACE = "default"
 SHUTDOWN_RPC_TIMEOUT_SECONDS = 5.0
+
+
+# Log start and end messages
+# Inherited from snowflake.ml.jobs._utils.constants
+LOG_START_MSG = "--------------------------------\nML job started\n--------------------------------"
+LOG_END_MSG = "--------------------------------\nML job finished\n--------------------------------"
snowflake/ml/jobs/_utils/scripts/get_instance_ip.py
CHANGED
@@ -29,7 +29,7 @@ def get_self_ip() -> Optional[str]:
         return None
 
 
-def get_first_instance(service_name: str) -> Optional[tuple[str, str]]:
+def get_first_instance(service_name: str) -> Optional[tuple[str, str, str]]:
     """Get the first instance of a batch job based on start time and instance ID.
 
     Args:
@@ -42,7 +42,7 @@ def get_first_instance(service_name: str) -> Optional[tuple[str, str]]:
 
     session = session_utils.get_session()
     df = session.sql(f"show service instances in service {service_name}")
-    result = df.select('"instance_id"', '"ip_address"', '"start_time"').collect()
+    result = df.select('"instance_id"', '"ip_address"', '"start_time"', '"status"').collect()
 
     if not result:
         return None
@@ -57,7 +57,7 @@ def get_first_instance(service_name: str) -> Optional[tuple[str, str]]:
     ip_address = head_instance["ip_address"]
     try:
         socket.inet_aton(ip_address)  # Validate IPv4 address
-        return (head_instance["instance_id"], ip_address)
+        return (head_instance["instance_id"], ip_address, head_instance["status"])
     except OSError:
         logger.error(f"Error: Invalid IP address format: {ip_address}")
         return None
@@ -110,7 +110,7 @@ def main():
         head_info = get_first_instance(args.service_name)
         if head_info:
             # Print to stdout to allow capture but don't use logger
-            sys.stdout.write(
+            sys.stdout.write(" ".join(head_info) + "\n")
             sys.exit(0)
         time.sleep(args.retry_interval)
     # If we get here, we've timed out
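`get_first_instance` now returns a three-tuple, which `main()` writes to stdout as one space-separated line and the startup script consumes with `read head_index head_ip head_status`. A round-trip sketch of that contract:

```python
head_info = ("0", "10.0.0.11", "READY")  # (instance_id, ip_address, status)
line = " ".join(head_info) + "\n"        # what main() writes to stdout

head_index, head_ip, head_status = line.split()
assert head_status in ("READY", "PENDING")  # otherwise the startup script exits early
```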
snowflake/ml/jobs/_utils/scripts/mljob_launcher.py
CHANGED
@@ -2,25 +2,35 @@ import argparse
 import copy
 import importlib.util
 import json
+import logging
 import os
 import runpy
 import sys
+import time
 import traceback
 import warnings
 from pathlib import Path
 from typing import Any, Optional
 
 import cloudpickle
+from constants import LOG_END_MSG, LOG_START_MSG
 
 from snowflake.ml.jobs._utils import constants
 from snowflake.ml.utils.connection_params import SnowflakeLoginOptions
 from snowflake.snowpark import Session
 
+# Configure logging
+logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
+logger = logging.getLogger(__name__)
+
 # Fallbacks in case of SnowML version mismatch
 RESULT_PATH_ENV_VAR = getattr(constants, "RESULT_PATH_ENV_VAR", "MLRS_RESULT_PATH")
-
 JOB_RESULT_PATH = os.environ.get(RESULT_PATH_ENV_VAR, "mljob_result.pkl")
 
+# Constants for the wait_for_min_instances function
+CHECK_INTERVAL = 10  # seconds
+TIMEOUT = 720  # seconds
+
 
 try:
     from snowflake.ml.jobs._utils.interop_utils import ExecutionResult
@@ -59,7 +69,67 @@ class SimpleJSONEncoder(json.JSONEncoder):
         try:
             return super().default(obj)
         except TypeError:
-            return
+            return f"Unserializable object: {repr(obj)}"
+
+
+def get_active_node_count() -> int:
+    """
+    Count the number of active nodes in the Ray cluster.
+
+    Returns:
+        int: Total count of active nodes
+    """
+    import ray
+
+    if not ray.is_initialized():
+        ray.init(address="auto", ignore_reinit_error=True, log_to_driver=False)
+    try:
+        nodes = [node for node in ray.nodes() if node.get("Alive")]
+        total_active = len(nodes)
+
+        logger.info(f"Active nodes: {total_active}")
+        return total_active
+    except Exception as e:
+        logger.warning(f"Error getting active node count: {e}")
+        return 0
+
+
+def wait_for_min_instances(min_instances: int) -> None:
+    """
+    Wait until the specified minimum number of instances are available in the Ray cluster.
+
+    Args:
+        min_instances: Minimum number of instances required
+
+    Raises:
+        TimeoutError: If failed to connect to Ray or if minimum instances are not available within timeout
+    """
+    if min_instances <= 1:
+        logger.debug("Minimum instances is 1 or less, no need to wait for additional instances")
+        return
+
+    start_time = time.time()
+    timeout = os.getenv("JOB_MIN_INSTANCES_TIMEOUT", TIMEOUT)
+    check_interval = os.getenv("JOB_MIN_INSTANCES_CHECK_INTERVAL", CHECK_INTERVAL)
+    logger.debug(f"Waiting for at least {min_instances} instances to be ready (timeout: {timeout}s)")
+
+    while time.time() - start_time < timeout:
+        total_nodes = get_active_node_count()
+
+        if total_nodes >= min_instances:
+            elapsed = time.time() - start_time
+            logger.info(f"Minimum instance requirement met: {total_nodes} instances available after {elapsed:.1f}s")
+            return
+
+        logger.debug(
+            f"Waiting for instances: {total_nodes}/{min_instances} available "
+            f"(elapsed: {time.time() - start_time:.1f}s)"
+        )
+        time.sleep(check_interval)
+
+    raise TimeoutError(
+        f"Timed out after {timeout}s waiting for {min_instances} instances, only {get_active_node_count()} available"
+    )
 
 
 def run_script(script_path: str, *script_args: Any, main_func: Optional[str] = None) -> Any:
@@ -86,6 +156,7 @@ def run_script(script_path: str, *script_args: Any, main_func: Optional[str] = N
     session = Session.builder.configs(SnowflakeLoginOptions()).create()  # noqa: F841
 
     try:
+
         if main_func:
             # Use importlib for scripts with a main function defined
             module_name = Path(script_path).stem
@@ -126,9 +197,21 @@ def main(script_path: str, *script_args: Any, script_main_func: Optional[str] =
     Raises:
         Exception: Re-raises any exception caught during script execution.
     """
-    # Run the script with the specified arguments
     try:
+        # Wait for minimum required instances if specified
+        min_instances_str = os.environ.get("JOB_MIN_INSTANCES", 1)
+        if min_instances_str and int(min_instances_str) > 1:
+            wait_for_min_instances(int(min_instances_str))
+
+        # Log start marker for user script execution
+        print(LOG_START_MSG)  # noqa: T201
+
+        # Run the script with the specified arguments
         result = run_script(script_path, *script_args, main_func=script_main_func)
+
+        # Log end marker for user script execution
+        print(LOG_END_MSG)  # noqa: T201
+
         result_obj = ExecutionResult(result=result)
         return result_obj
     except Exception as e:
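One detail of the gating logic above: the default passed to `os.environ.get` is the int 1 rather than a string, and `int()` accepts both, so the guard behaves the same whether or not `JOB_MIN_INSTANCES` is set. A sketch:

```python
import os

min_instances = int(os.environ.get("JOB_MIN_INSTANCES", 1))
if min_instances > 1:
    print(f"Would block until {min_instances} Ray nodes are alive before running the script.")
```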
snowflake/ml/jobs/_utils/spec_utils.py
CHANGED
@@ -11,7 +11,7 @@ from snowflake.ml.jobs._utils import constants, types
 def _get_node_resources(session: snowpark.Session, compute_pool: str) -> types.ComputeResources:
     """Extract resource information for the specified compute pool"""
     # Get the instance family
-    rows = session.sql(
+    rows = session.sql("show compute pools like ?", params=[compute_pool]).collect()
     if not rows:
         raise ValueError(f"Compute pool '{compute_pool}' not found")
     instance_family: str = rows[0]["instance_family"]
@@ -85,7 +85,8 @@ def generate_service_spec(
     compute_pool: str,
     payload: types.UploadedPayload,
     args: Optional[list[str]] = None,
-
+    target_instances: int = 1,
+    min_instances: int = 1,
     enable_metrics: bool = False,
 ) -> dict[str, Any]:
     """
@@ -96,13 +97,13 @@ def generate_service_spec(
         compute_pool: Compute pool for job execution
         payload: Uploaded job payload
         args: Arguments to pass to entrypoint script
-
+        target_instances: Number of instances for multi-node job
         enable_metrics: Enable platform metrics for the job
+        min_instances: Minimum number of instances required to start the job
 
     Returns:
         Job service specification
     """
-    is_multi_node = num_instances is not None and num_instances > 1
     image_spec = _get_image_spec(session, compute_pool)
 
     # Set resource requests/limits, including nvidia.com/gpu quantity if applicable
@@ -180,10 +181,11 @@ def generate_service_spec(
     }
     endpoints = []
 
-    if is_multi_node:
+    if target_instances > 1:
         # Update environment variables for multi-node job
         env_vars.update(constants.RAY_PORTS)
-        env_vars[
+        env_vars[constants.ENABLE_HEALTH_CHECKS_ENV_VAR] = constants.ENABLE_HEALTH_CHECKS
+        env_vars[constants.MIN_INSTANCES_ENV_VAR] = str(min_instances)
 
         # Define Ray endpoints for intra-service instance communication
         ray_endpoints = [