snowflake-ml-python 1.9.1__py3-none-any.whl → 1.9.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30)
  1. snowflake/ml/_internal/utils/mixins.py +6 -4
  2. snowflake/ml/_internal/utils/service_logger.py +101 -1
  3. snowflake/ml/data/_internal/arrow_ingestor.py +4 -1
  4. snowflake/ml/data/data_connector.py +4 -34
  5. snowflake/ml/dataset/dataset.py +1 -1
  6. snowflake/ml/dataset/dataset_reader.py +2 -8
  7. snowflake/ml/experiment/__init__.py +3 -0
  8. snowflake/ml/experiment/callback.py +121 -0
  9. snowflake/ml/jobs/_utils/constants.py +15 -4
  10. snowflake/ml/jobs/_utils/payload_utils.py +150 -49
  11. snowflake/ml/jobs/_utils/scripts/constants.py +0 -22
  12. snowflake/ml/jobs/_utils/scripts/mljob_launcher.py +125 -22
  13. snowflake/ml/jobs/_utils/spec_utils.py +1 -1
  14. snowflake/ml/jobs/_utils/stage_utils.py +30 -14
  15. snowflake/ml/jobs/_utils/types.py +64 -4
  16. snowflake/ml/jobs/job.py +22 -6
  17. snowflake/ml/jobs/manager.py +5 -3
  18. snowflake/ml/model/_client/ops/service_ops.py +17 -2
  19. snowflake/ml/model/_client/sql/service.py +1 -38
  20. snowflake/ml/model/_packager/model_handlers/sklearn.py +9 -5
  21. snowflake/ml/model/_packager/model_runtime/_snowml_inference_alternative_requirements.py +1 -0
  22. snowflake/ml/model/_signatures/pandas_handler.py +3 -0
  23. snowflake/ml/model/_signatures/utils.py +4 -0
  24. snowflake/ml/model/model_signature.py +2 -0
  25. snowflake/ml/version.py +1 -1
  26. {snowflake_ml_python-1.9.1.dist-info → snowflake_ml_python-1.9.2.dist-info}/METADATA +42 -4
  27. {snowflake_ml_python-1.9.1.dist-info → snowflake_ml_python-1.9.2.dist-info}/RECORD +30 -28
  28. {snowflake_ml_python-1.9.1.dist-info → snowflake_ml_python-1.9.2.dist-info}/WHEEL +0 -0
  29. {snowflake_ml_python-1.9.1.dist-info → snowflake_ml_python-1.9.2.dist-info}/licenses/LICENSE.txt +0 -0
  30. {snowflake_ml_python-1.9.1.dist-info → snowflake_ml_python-1.9.2.dist-info}/top_level.txt +0 -0
snowflake/ml/jobs/_utils/payload_utils.py
@@ -2,6 +2,8 @@ import functools
  import inspect
  import io
  import itertools
+ import keyword
+ import logging
  import pickle
  import sys
  import textwrap
@@ -22,8 +24,11 @@ from snowflake.ml.jobs._utils import (
  from snowflake.snowpark import exceptions as sp_exceptions
  from snowflake.snowpark._internal import code_generation

+ logger = logging.getLogger(__name__)
+
  cp.register_pickle_by_value(function_payload_utils)

+
  _SUPPORTED_ARG_TYPES = {str, int, float}
  _SUPPORTED_ENTRYPOINT_EXTENSIONS = {".py"}
  _ENTRYPOINT_FUNC_NAME = "func"
@@ -32,6 +37,9 @@ _STARTUP_SCRIPT_CODE = textwrap.dedent(
  f"""
  #!/bin/bash

+ ##### Get system scripts directory #####
+ SYSTEM_DIR=$(cd "$(dirname "$0")" && pwd)
+
  ##### Perform common set up steps #####
  set -e # exit if a command fails

@@ -75,12 +83,14 @@ _STARTUP_SCRIPT_CODE = textwrap.dedent(

  # Check if the local get_instance_ip.py script exists
  HELPER_EXISTS=$(
- [ -f "get_instance_ip.py" ] && echo "true" || echo "false"
+ [ -f "${{SYSTEM_DIR}}/get_instance_ip.py" ] && echo "true" || echo "false"
  )

+
  # Configure IP address and logging directory
  if [ "$HELPER_EXISTS" = "true" ]; then
- eth0Ip=$(python3 get_instance_ip.py "$SNOWFLAKE_SERVICE_NAME" --instance-index=-1)
+ eth0Ip=$(python3 "${{SYSTEM_DIR}}/get_instance_ip.py" \
+ "$SNOWFLAKE_SERVICE_NAME" --instance-index=-1)
  else
  eth0Ip=$(ifconfig eth0 2>/dev/null | sed -En -e 's/.*inet ([0-9.]+).*/\1/p')
  fi
@@ -103,7 +113,7 @@ _STARTUP_SCRIPT_CODE = textwrap.dedent(

  # Determine if it should be a worker or a head node for batch jobs
  if [[ "$SNOWFLAKE_JOBS_COUNT" -gt 1 && "$HELPER_EXISTS" = "true" ]]; then
- head_info=$(python3 get_instance_ip.py "$SNOWFLAKE_SERVICE_NAME" --head)
+ head_info=$(python3 "${{SYSTEM_DIR}}/get_instance_ip.py" "$SNOWFLAKE_SERVICE_NAME" --head)
  if [ $? -eq 0 ]; then
  # Parse the output using read
  read head_index head_ip head_status<<< "$head_info"
@@ -185,7 +195,7 @@ _STARTUP_SCRIPT_CODE = textwrap.dedent(

  # Start the worker shutdown listener in the background
  echo "Starting worker shutdown listener..."
- python worker_shutdown_listener.py
+ python "${{SYSTEM_DIR}}/worker_shutdown_listener.py"
  WORKER_EXIT_CODE=$?

  echo "Worker shutdown listener exited with code $WORKER_EXIT_CODE"
@@ -218,19 +228,59 @@ _STARTUP_SCRIPT_CODE = textwrap.dedent(

  # After the user's job completes, signal workers to shut down
  echo "User job completed. Signaling workers to shut down..."
- python signal_workers.py --wait-time 15
+ python "${{SYSTEM_DIR}}/signal_workers.py" --wait-time 15
  echo "Head node job completed. Exiting."
  fi
  """
  ).strip()


+ def resolve_path(path: str) -> types.PayloadPath:
+     try:
+         stage_path = stage_utils.StagePath(path)
+     except ValueError:
+         return Path(path)
+     return stage_path
+
+
+ def upload_payloads(session: snowpark.Session, stage_path: PurePath, *payload_specs: types.PayloadSpec) -> None:
+     for source_path, remote_relative_path in payload_specs:
+         payload_stage_path = stage_path.joinpath(remote_relative_path) if remote_relative_path else stage_path
+         if isinstance(source_path, stage_utils.StagePath):
+             # only copy files into one stage directory from another stage directory, not from stage file
+             # due to incomplete of StagePath functionality
+             session.sql(f"copy files into {payload_stage_path.as_posix()}/ from {source_path.as_posix()}/").collect()
+         elif isinstance(source_path, Path):
+             if source_path.is_dir():
+                 # Manually traverse the directory and upload each file, since Snowflake PUT
+                 # can't handle directories. Reduce the number of PUT operations by using
+                 # wildcard patterns to batch upload files with the same extension.
+                 for path in {
+                     p.parent.joinpath(f"*{p.suffix}") if p.suffix else p
+                     for p in source_path.resolve().rglob("*")
+                     if p.is_file()
+                 }:
+                     session.file.put(
+                         str(path),
+                         payload_stage_path.joinpath(path.parent.relative_to(source_path)).as_posix(),
+                         overwrite=True,
+                         auto_compress=False,
+                     )
+             else:
+                 session.file.put(
+                     str(source_path.resolve()),
+                     payload_stage_path.as_posix(),
+                     overwrite=True,
+                     auto_compress=False,
+                 )
+
+
  def resolve_source(
-     source: Union[Path, stage_utils.StagePath, Callable[..., Any]]
- ) -> Union[Path, stage_utils.StagePath, Callable[..., Any]]:
+     source: Union[types.PayloadPath, Callable[..., Any]]
+ ) -> Union[types.PayloadPath, Callable[..., Any]]:
      if callable(source):
          return source
-     elif isinstance(source, (Path, stage_utils.StagePath)):
+     elif isinstance(source, types.PayloadPath):
          if not source.exists():
              raise FileNotFoundError(f"{source} does not exist")
          return source.absolute()
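For a sense of how the wildcard batching in upload_payloads above reduces PUT calls, here is a standalone sketch of the same set comprehension (standard library only; the file names are made up):

    from pathlib import PurePosixPath

    files = [
        PurePosixPath("proj/src/main.py"),
        PurePosixPath("proj/src/util.py"),
        PurePosixPath("proj/src/data.csv"),
        PurePosixPath("proj/Makefile"),  # no suffix, so it is uploaded individually
    ]

    # Same expression as in upload_payloads: one "<dir>/*<ext>" pattern per
    # (directory, extension) pair instead of one PUT per file.
    put_targets = {
        p.parent.joinpath(f"*{p.suffix}") if p.suffix else p
        for p in files
    }
    print(sorted(str(t) for t in put_targets))
    # ['proj/Makefile', 'proj/src/*.csv', 'proj/src/*.py']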
@@ -239,8 +289,8 @@ def resolve_source(


  def resolve_entrypoint(
-     source: Union[Path, stage_utils.StagePath, Callable[..., Any]],
-     entrypoint: Optional[Union[stage_utils.StagePath, Path]],
+     source: Union[types.PayloadPath, Callable[..., Any]],
+     entrypoint: Optional[types.PayloadPath],
  ) -> types.PayloadEntrypoint:
      if callable(source):
          # Entrypoint is generated for callable payloads
@@ -289,6 +339,73 @@ def resolve_entrypoint(
      )


+ def resolve_additional_payloads(
+     additional_payloads: Optional[list[Union[str, tuple[str, str]]]]
+ ) -> list[types.PayloadSpec]:
+     """
+     Determine how to stage local packages so that imports continue to work.
+
+     Args:
+         additional_payloads: A list of directory paths, each optionally paired with a dot-separated
+             import path
+             e.g. [("proj/src/utils", "src.utils"), "proj/src/helper"]
+             if there is no import path, the last part of path will be considered as import path
+             e.g. the import path of "proj/src/helper" is "helper"
+
+     Returns:
+         A list of payloadSpec for additional payloads.
+
+     Raises:
+         FileNotFoundError: If any specified package path does not exist.
+         ValueError: If the format of local_packages is invalid.
+
+     """
+     if not additional_payloads:
+         return []
+
+     logger.warning(
+         "When providing a stage path as an additional payload, "
+         "please ensure it points to a directory. "
+         "Files are not currently supported."
+     )
+
+     additional_payloads_paths = []
+     for pkg in additional_payloads:
+         if isinstance(pkg, str):
+             source_path = resolve_path(pkg).absolute()
+             module_path = source_path.name
+         elif isinstance(pkg, tuple):
+             try:
+                 source_path_str, module_path = pkg
+             except ValueError:
+                 raise ValueError(
+                     f"Invalid format in `additional_payloads`. "
+                     f"Expected a tuple of (source_path, module_path). Got {pkg}"
+                 )
+             source_path = resolve_path(source_path_str).absolute()
+         else:
+             raise ValueError("the format of additional payload is not correct")
+
+         if not source_path.exists():
+             raise FileNotFoundError(f"{source_path} does not exist")
+
+         if isinstance(source_path, Path):
+             if source_path.is_file():
+                 raise ValueError(f"file is not supported for additional payloads: {source_path}")
+
+         module_parts = module_path.split(".")
+         for part in module_parts:
+             if not part.isidentifier() or keyword.iskeyword(part):
+                 raise ValueError(
+                     f"Invalid module import path '{module_path}'. "
+                     f"'{part}' is not a valid Python identifier or is a keyword."
+                 )
+
+         dest_path = PurePath(*module_parts)
+         additional_payloads_paths.append(types.PayloadSpec(source_path, dest_path))
+     return additional_payloads_paths
+
+
  class JobPayload:
      def __init__(
          self,
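The mapping from a dotted import path to a staging directory in resolve_additional_payloads can be reproduced in isolation; the inputs below are illustrative only (the real code uses PurePath, the sketch uses PurePosixPath so the printed output is platform-independent):

    import keyword
    from pathlib import PurePosixPath

    def dest_for(module_path: str) -> PurePosixPath:
        # Mirrors the validation and PurePath(*parts) construction above.
        parts = module_path.split(".")
        for part in parts:
            if not part.isidentifier() or keyword.iskeyword(part):
                raise ValueError(f"invalid module import path: {module_path!r}")
        return PurePosixPath(*parts)

    print(dest_for("src.utils"))  # src/utils -> `import src.utils` keeps working on the job
    print(dest_for("helper"))     # helper    -> default when only a directory path is given
    # dest_for("src.class") raises ValueError because "class" is a Python keyword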
@@ -296,11 +413,13 @@ class JobPayload:
          entrypoint: Optional[Union[str, Path]] = None,
          *,
          pip_requirements: Optional[list[str]] = None,
+         additional_payloads: Optional[list[Union[str, tuple[str, str]]]] = None,
      ) -> None:
          # for stage path like snow://domain....., Path(path) will remove duplicate /, it will become snow:/ domain...
-         self.source = stage_utils.identify_stage_path(source) if isinstance(source, str) else source
-         self.entrypoint = stage_utils.identify_stage_path(entrypoint) if isinstance(entrypoint, str) else entrypoint
+         self.source = resolve_path(source) if isinstance(source, str) else source
+         self.entrypoint = resolve_path(entrypoint) if isinstance(entrypoint, str) else entrypoint
          self.pip_requirements = pip_requirements
+         self.additional_payloads = additional_payloads

      def upload(self, session: snowpark.Session, stage_path: Union[str, PurePath]) -> types.UploadedPayload:
          # Prepare local variables
@@ -308,6 +427,7 @@ class JobPayload:
          source = resolve_source(self.source)
          entrypoint = resolve_entrypoint(source, self.entrypoint)
          pip_requirements = self.pip_requirements or []
+         additional_payload_specs = resolve_additional_payloads(self.additional_payloads)

          # Create stage if necessary
          stage_name = stage_path.parts[0].lstrip("@")
@@ -323,12 +443,13 @@ class JobPayload:
              params=[stage_name],
          )

-         # Upload payload to stage
-         if not isinstance(source, (Path, stage_utils.StagePath)):
+         # Upload payload to stage - organize into app/ subdirectory
+         app_stage_path = stage_path.joinpath(constants.APP_STAGE_SUBPATH)
+         if not isinstance(source, types.PayloadPath):
              source_code = generate_python_code(source, source_code_display=True)
              _ = session.file.put_stream(
                  io.BytesIO(source_code.encode()),
-                 stage_location=stage_path.joinpath(entrypoint.file_path).as_posix(),
+                 stage_location=app_stage_path.joinpath(entrypoint.file_path).as_posix(),
                  auto_compress=False,
                  overwrite=True,
              )
@@ -340,68 +461,48 @@ class JobPayload:
              # copy payload to stage
              if source == entrypoint.file_path:
                  source = source.parent
-             source_path = source.as_posix() + "/"
-             session.sql(f"copy files into {stage_path}/ from {source_path}").collect()
+             upload_payloads(session, app_stage_path, types.PayloadSpec(source, None))

          elif isinstance(source, Path):
-             if source.is_dir():
-                 # Manually traverse the directory and upload each file, since Snowflake PUT
-                 # can't handle directories. Reduce the number of PUT operations by using
-                 # wildcard patterns to batch upload files with the same extension.
-                 for path in {
-                     p.parent.joinpath(f"*{p.suffix}") if p.suffix else p
-                     for p in source.resolve().rglob("*")
-                     if p.is_file()
-                 }:
-                     session.file.put(
-                         str(path),
-                         stage_path.joinpath(path.parent.relative_to(source)).as_posix(),
-                         overwrite=True,
-                         auto_compress=False,
-                     )
-             else:
-                 session.file.put(
-                     str(source.resolve()),
-                     stage_path.as_posix(),
-                     overwrite=True,
-                     auto_compress=False,
-                 )
+             upload_payloads(session, app_stage_path, types.PayloadSpec(source, None))
+             if source.is_file():
                  source = source.parent

-         # Upload requirements
+         upload_payloads(session, app_stage_path, *additional_payload_specs)
+
+         # Upload requirements to app/ directory
          # TODO: Check if payload includes both a requirements.txt file and pip_requirements
          if pip_requirements:
              # Upload requirements.txt to stage
              session.file.put_stream(
                  io.BytesIO("\n".join(pip_requirements).encode()),
-                 stage_location=stage_path.joinpath("requirements.txt").as_posix(),
+                 stage_location=app_stage_path.joinpath("requirements.txt").as_posix(),
                  auto_compress=False,
                  overwrite=True,
              )

-         # Upload startup script
+         # Upload startup script to system/ directory within payload
+         system_stage_path = stage_path.joinpath(constants.SYSTEM_STAGE_SUBPATH)
          # TODO: Make sure payload does not include file with same name
          session.file.put_stream(
              io.BytesIO(_STARTUP_SCRIPT_CODE.encode()),
-             stage_location=stage_path.joinpath(_STARTUP_SCRIPT_PATH).as_posix(),
+             stage_location=system_stage_path.joinpath(_STARTUP_SCRIPT_PATH).as_posix(),
              auto_compress=False,
              overwrite=False, # FIXME
          )

-         # Upload system scripts
          scripts_dir = Path(__file__).parent.joinpath("scripts")
          for script_file in scripts_dir.glob("*"):
              if script_file.is_file():
                  session.file.put(
                      script_file.as_posix(),
-                     stage_path.as_posix(),
+                     system_stage_path.as_posix(),
                      overwrite=True,
                      auto_compress=False,
                  )
-
          python_entrypoint: list[Union[str, PurePath]] = [
-             PurePath("mljob_launcher.py"),
-             entrypoint.file_path.relative_to(source),
+             PurePath(f"{constants.SYSTEM_MOUNT_PATH}/mljob_launcher.py"),
+             PurePath(f"{constants.APP_MOUNT_PATH}/{entrypoint.file_path.relative_to(source).as_posix()}"),
          ]
          if entrypoint.main_func:
              python_entrypoint += ["--script_main_func", entrypoint.main_func]
@@ -410,7 +511,7 @@ class JobPayload:
              stage_path=stage_path,
              entrypoint=[
                  "bash",
-                 _STARTUP_SCRIPT_PATH,
+                 f"{constants.SYSTEM_MOUNT_PATH}/{_STARTUP_SCRIPT_PATH}",
                  *python_entrypoint,
              ],
          )
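The constants referenced here (APP_STAGE_SUBPATH, SYSTEM_STAGE_SUBPATH, APP_MOUNT_PATH, SYSTEM_MOUNT_PATH) live in snowflake/ml/jobs/_utils/constants.py, which also changed in this release. The sketch below shows how the launch command is assembled; the concrete path values are assumptions for illustration (the /mnt/job_stage/... layout is suggested by the mljob_launcher.py defaults further down), not values read from the package:

    from pathlib import PurePosixPath

    SYSTEM_MOUNT_PATH = "/mnt/job_stage/system"  # assumed value of constants.SYSTEM_MOUNT_PATH
    APP_MOUNT_PATH = "/mnt/job_stage/app"        # assumed value of constants.APP_MOUNT_PATH
    STARTUP_SCRIPT = "startup.sh"                # assumed value of _STARTUP_SCRIPT_PATH
    entrypoint_rel = PurePosixPath("train.py")   # entrypoint relative to the payload root (illustrative)

    python_entrypoint = [
        PurePosixPath(f"{SYSTEM_MOUNT_PATH}/mljob_launcher.py"),
        PurePosixPath(f"{APP_MOUNT_PATH}/{entrypoint_rel.as_posix()}"),
    ]
    command = ["bash", f"{SYSTEM_MOUNT_PATH}/{STARTUP_SCRIPT}", *map(str, python_entrypoint)]
    print(" ".join(command))
    # bash /mnt/job_stage/system/startup.sh /mnt/job_stage/system/mljob_launcher.py /mnt/job_stage/app/train.py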
snowflake/ml/jobs/_utils/scripts/constants.py
@@ -1,26 +1,4 @@
- from snowflake.ml.jobs._utils import constants as mljob_constants
-
  # Constants defining the shutdown signal actor configuration.
  SHUTDOWN_ACTOR_NAME = "ShutdownSignal"
  SHUTDOWN_ACTOR_NAMESPACE = "default"
  SHUTDOWN_RPC_TIMEOUT_SECONDS = 5.0
-
-
- # The followings are Inherited from snowflake.ml.jobs._utils.constants
- # We need to copy them here since snowml package on the server side does
- # not have the latest version of the code
-
- # Log start and end messages
- LOG_START_MSG = getattr(
-     mljob_constants,
-     "LOG_START_MSG",
-     "--------------------------------\nML job started\n--------------------------------",
- )
- LOG_END_MSG = getattr(
-     mljob_constants,
-     "LOG_END_MSG",
-     "--------------------------------\nML job finished\n--------------------------------",
- )
-
- # min_instances environment variable name
- MIN_INSTANCES_ENV_VAR = getattr(mljob_constants, "MIN_INSTANCES_ENV_VAR", "MLRS_MIN_INSTANCES")
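The block removed here uses the getattr-with-default pattern to tolerate version skew between the client package and the snowml package installed on the server; it now lives in mljob_launcher.py below. A standalone illustration of the pattern:

    import types

    # Stand-in for an older constants module that predates some symbols.
    constants = types.SimpleNamespace(MIN_INSTANCES_ENV_VAR="MLRS_MIN_INSTANCES")

    print(getattr(constants, "MIN_INSTANCES_ENV_VAR", "MLRS_MIN_INSTANCES"))       # present: MLRS_MIN_INSTANCES
    print(getattr(constants, "TARGET_INSTANCES_ENV_VAR", "SNOWFLAKE_JOBS_COUNT"))  # missing: falls back to default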
snowflake/ml/jobs/_utils/scripts/mljob_launcher.py
@@ -3,6 +3,7 @@ import copy
  import importlib.util
  import json
  import logging
+ import math
  import os
  import runpy
  import sys
@@ -13,7 +14,6 @@ from pathlib import Path
  from typing import Any, Optional

  import cloudpickle
- from constants import LOG_END_MSG, LOG_START_MSG, MIN_INSTANCES_ENV_VAR

  from snowflake.ml.jobs._utils import constants
  from snowflake.snowpark import Session
@@ -27,13 +27,35 @@ except ImportError:
  logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
  logger = logging.getLogger(__name__)

+
+ # The followings are Inherited from snowflake.ml.jobs._utils.constants
+ # We need to copy them here since snowml package on the server side does
+ # not have the latest version of the code
+ # Log start and end messages
+ LOG_START_MSG = getattr(
+     constants,
+     "LOG_START_MSG",
+     "--------------------------------\nML job started\n--------------------------------",
+ )
+ LOG_END_MSG = getattr(
+     constants,
+     "LOG_END_MSG",
+     "--------------------------------\nML job finished\n--------------------------------",
+ )
+
+ # min_instances environment variable name
+ MIN_INSTANCES_ENV_VAR = getattr(constants, "MIN_INSTANCES_ENV_VAR", "MLRS_MIN_INSTANCES")
+ TARGET_INSTANCES_ENV_VAR = getattr(constants, "TARGET_INSTANCES_ENV_VAR", "SNOWFLAKE_JOBS_COUNT")
+
  # Fallbacks in case of SnowML version mismatch
  RESULT_PATH_ENV_VAR = getattr(constants, "RESULT_PATH_ENV_VAR", "MLRS_RESULT_PATH")
- JOB_RESULT_PATH = os.environ.get(RESULT_PATH_ENV_VAR, "mljob_result.pkl")
+ JOB_RESULT_PATH = os.environ.get(RESULT_PATH_ENV_VAR, "/mnt/job_stage/output/mljob_result.pkl")
+ PAYLOAD_DIR_ENV_VAR = getattr(constants, "PAYLOAD_DIR_ENV_VAR", "MLRS_PAYLOAD_DIR")

- # Constants for the wait_for_min_instances function
- CHECK_INTERVAL = 10 # seconds
- TIMEOUT = 720 # seconds
+ # Constants for the wait_for_instances function
+ MIN_WAIT_TIME = float(os.getenv("MLRS_INSTANCES_MIN_WAIT") or -1) # seconds
+ TIMEOUT = float(os.getenv("MLRS_INSTANCES_TIMEOUT") or 720) # seconds
+ CHECK_INTERVAL = float(os.getenv("MLRS_INSTANCES_CHECK_INTERVAL") or 10) # seconds


  try:
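The `float(os.getenv(...) or default)` form above differs from `os.getenv(name, default)` in that the fallback also applies when the variable is set but empty:

    import os

    os.environ.pop("MLRS_INSTANCES_TIMEOUT", None)
    print(float(os.getenv("MLRS_INSTANCES_TIMEOUT") or 720))  # 720.0 (unset)

    os.environ["MLRS_INSTANCES_TIMEOUT"] = ""
    print(float(os.getenv("MLRS_INSTANCES_TIMEOUT") or 720))  # 720.0 (set but empty)

    os.environ["MLRS_INSTANCES_TIMEOUT"] = "90"
    print(float(os.getenv("MLRS_INSTANCES_TIMEOUT") or 720))  # 90.0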
@@ -76,45 +98,108 @@ class SimpleJSONEncoder(json.JSONEncoder):
          return f"Unserializable object: {repr(obj)}"


- def wait_for_min_instances(min_instances: int) -> None:
+ def wait_for_instances(
+     min_instances: int,
+     target_instances: int,
+     *,
+     min_wait_time: float = -1, # seconds
+     timeout: float = 720, # seconds
+     check_interval: float = 10, # seconds
+ ) -> None:
      """
      Wait until the specified minimum number of instances are available in the Ray cluster.

      Args:
          min_instances: Minimum number of instances required
+         target_instances: Target number of instances to wait for
+         min_wait_time: Minimum time to wait for target_instances to be available.
+             If less than 0, automatically set based on target_instances.
+         timeout: Maximum time to wait for min_instances to be available before raising a TimeoutError.
+         check_interval: Maximum time to wait between checks (uses exponential backoff).
+
+     Examples:
+         Scenario 1 - Ideal case (target met quickly):
+             wait_for_instances(min_instances=2, target_instances=4, min_wait_time=5, timeout=60)
+             If 4 instances are available after 1 second, the function returns without further waiting (target met).
+
+         Scenario 2 - Min instances met, target not reached:
+             wait_for_instances(min_instances=2, target_instances=4, min_wait_time=10, timeout=60)
+             If only 3 instances are available after 10 seconds, the function returns (min requirement satisfied).
+
+         Scenario 3 - Min instances met early, but min_wait_time not elapsed:
+             wait_for_instances(min_instances=2, target_instances=4, min_wait_time=30, timeout=60)
+             If 2 instances are available after 5 seconds, function continues waiting for target_instances
+             until either 4 instances are found or 30 seconds have elapsed.
+
+         Scenario 4 - Timeout scenario:
+             wait_for_instances(min_instances=3, target_instances=5, min_wait_time=10, timeout=30)
+             If only 2 instances are available after 30 seconds, TimeoutError is raised.
+
+         Scenario 5 - Single instance job (early return):
+             wait_for_instances(min_instances=1, target_instances=1, min_wait_time=5, timeout=60)
+             The function returns without waiting because target_instances <= 1.

      Raises:
+         ValueError: If arguments are invalid
          TimeoutError: If failed to connect to Ray or if minimum instances are not available within timeout
      """
-     if min_instances <= 1:
-         logger.debug("Minimum instances is 1 or less, no need to wait for additional instances")
+     if min_instances > target_instances:
+         raise ValueError(
+             f"Minimum instances ({min_instances}) cannot be greater than target instances ({target_instances})"
+         )
+     if timeout < 0:
+         raise ValueError("Timeout must be greater than 0")
+     if check_interval < 0:
+         raise ValueError("Check interval must be greater than 0")
+
+     if target_instances <= 1:
+         logger.debug("Target instances is 1 or less, no need to wait for additional instances")
          return

+     if min_wait_time < 0:
+         # Automatically set min_wait_time based on the number of target instances
+         # Using min_wait_time = 3 * log2(target_instances) as a starting point:
+         # target_instances = 1 => min_wait_time = 0
+         # target_instances = 2 => min_wait_time = 3
+         # target_instances = 4 => min_wait_time = 6
+         # target_instances = 8 => min_wait_time = 9
+         # target_instances = 32 => min_wait_time = 15
+         # target_instances = 50 => min_wait_time = 16.9
+         # target_instances = 100 => min_wait_time = 19.9
+         min_wait_time = min(3 * math.log2(target_instances), timeout / 10) # Clamp to timeout / 10
+
      # mljob_launcher runs inside the CR where mlruntime libraries are available, so we can import common_util directly
      from common_utils import common_util as mlrs_util

      start_time = time.time()
-     timeout = os.getenv("JOB_MIN_INSTANCES_TIMEOUT", TIMEOUT)
-     check_interval = os.getenv("JOB_MIN_INSTANCES_CHECK_INTERVAL", CHECK_INTERVAL)
-     logger.debug(f"Waiting for at least {min_instances} instances to be ready (timeout: {timeout}s)")
+     current_interval = max(min(1, check_interval), 0.1) # Default 1s, minimum 0.1s
+     logger.debug(
+         "Waiting for instances to be ready "
+         "(min_instances={}, target_instances={}, timeout={}s, max_check_interval={}s)".format(
+             min_instances, target_instances, timeout, check_interval
+         )
+     )

-     while time.time() - start_time < timeout:
+     while (elapsed := time.time() - start_time) < timeout:
          total_nodes = mlrs_util.get_num_ray_nodes()
-
-         if total_nodes >= min_instances:
-             elapsed = time.time() - start_time
+         if total_nodes >= target_instances:
+             # Best case scenario: target_instances are already available
+             logger.info(f"Target instance requirement met: {total_nodes} instances available after {elapsed:.1f}s")
+             return
+         elif total_nodes >= min_instances and elapsed >= min_wait_time:
+             # Second best case scenario: target_instances not met within min_wait_time, but min_instances met
              logger.info(f"Minimum instance requirement met: {total_nodes} instances available after {elapsed:.1f}s")
              return

          logger.debug(
-             f"Waiting for instances: {total_nodes}/{min_instances} available "
-             f"(elapsed: {time.time() - start_time:.1f}s)"
+             f"Waiting for instances: current_instances={total_nodes}, min_instances={min_instances}, "
+             f"target_instances={target_instances}, elapsed={elapsed:.1f}s, next check in {current_interval:.1f}s"
          )
-         time.sleep(check_interval)
+         time.sleep(current_interval)
+         current_interval = min(current_interval * 2, check_interval) # Exponential backoff

      raise TimeoutError(
-         f"Timed out after {timeout}s waiting for {min_instances} instances, only "
-         f"{mlrs_util.get_num_ray_nodes()} available"
+         f"Timed out after {timeout}s waiting for {min_instances} instances, only " f"{total_nodes} available"
      )


@@ -137,6 +222,13 @@ def run_script(script_path: str, *script_args: Any, main_func: Optional[str] = N
      original_argv = sys.argv
      sys.argv = [script_path, *script_args]

+     # Ensure payload directory is in sys.path for module imports
+     # This is needed because mljob_launcher.py is now in /mnt/job_stage/system
+     # but user scripts are in the payload directory and may import from each other
+     payload_dir = os.environ.get(PAYLOAD_DIR_ENV_VAR)
+     if payload_dir and payload_dir not in sys.path:
+         sys.path.insert(0, payload_dir)
+
      # Create a Snowpark session before running the script
      # Session can be retrieved from using snowflake.snowpark.context.get_active_session()
      session = Session.builder.configs(SnowflakeLoginOptions()).create() # noqa: F841
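The sys.path insertion above is what lets user modules in the payload directory import each other even though the launcher now runs from the system directory. A self-contained demonstration of the mechanism (temporary paths, illustrative only):

    import importlib
    import sys
    import tempfile
    from pathlib import Path

    payload_dir = Path(tempfile.mkdtemp())  # stand-in for the mounted payload directory
    (payload_dir / "user_helper.py").write_text("GREETING = 'hello from the payload dir'\n")

    if str(payload_dir) not in sys.path:
        sys.path.insert(0, str(payload_dir))

    user_helper = importlib.import_module("user_helper")
    print(user_helper.GREETING)  # hello from the payload dir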
@@ -183,11 +275,22 @@ def main(script_path: str, *script_args: Any, script_main_func: Optional[str] =
      Raises:
          Exception: Re-raises any exception caught during script execution.
      """
+     # Ensure the output directory exists before trying to write result files.
+     output_dir = os.path.dirname(JOB_RESULT_PATH)
+     os.makedirs(output_dir, exist_ok=True)
+
      try:
          # Wait for minimum required instances if specified
          min_instances_str = os.environ.get(MIN_INSTANCES_ENV_VAR) or "1"
-         if min_instances_str and int(min_instances_str) > 1:
-             wait_for_min_instances(int(min_instances_str))
+         target_instances_str = os.environ.get(TARGET_INSTANCES_ENV_VAR) or min_instances_str
+         if target_instances_str and int(target_instances_str) > 1:
+             wait_for_instances(
+                 int(min_instances_str),
+                 int(target_instances_str),
+                 min_wait_time=MIN_WAIT_TIME,
+                 timeout=TIMEOUT,
+                 check_interval=CHECK_INTERVAL,
+             )

          # Log start marker for user script execution
          print(LOG_START_MSG) # noqa: T201
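To make the defaults wired into wait_for_instances above concrete, the auto-derived min_wait_time and the backoff schedule can be computed with the same formulas (standalone sketch, standard library only):

    import math

    timeout = 720.0
    check_interval = 10.0

    # min_wait_time = min(3 * log2(target_instances), timeout / 10)
    for target in (2, 4, 8, 32, 100):
        print(target, round(min(3 * math.log2(target), timeout / 10), 1))
    # prints 3.0, 6.0, 9.0, 15.0 and 19.9 seconds respectively

    # Poll interval: starts at 1s and doubles until capped at check_interval.
    interval = max(min(1, check_interval), 0.1)
    schedule = []
    for _ in range(6):
        schedule.append(interval)
        interval = min(interval * 2, check_interval)
    print(schedule)  # [1, 2, 4, 8, 10.0, 10.0]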
snowflake/ml/jobs/_utils/spec_utils.py
@@ -181,7 +181,7 @@ def generate_service_spec(
      # TODO: Add hooks for endpoints for integration with TensorBoard etc

      env_vars = {
-         constants.PAYLOAD_DIR_ENV_VAR: stage_mount.as_posix(),
+         constants.PAYLOAD_DIR_ENV_VAR: constants.APP_MOUNT_PATH,
          constants.RESULT_PATH_ENV_VAR: constants.RESULT_PATH_DEFAULT_VALUE,
      }
      endpoints: list[dict[str, Any]] = []