snowflake-ml-python 1.7.4__py3-none-any.whl → 1.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73)
  1. snowflake/cortex/_complete.py +58 -3
  2. snowflake/ml/_internal/env_utils.py +64 -21
  3. snowflake/ml/_internal/file_utils.py +18 -4
  4. snowflake/ml/_internal/platform_capabilities.py +3 -0
  5. snowflake/ml/_internal/relax_version_strategy.py +16 -0
  6. snowflake/ml/_internal/telemetry.py +25 -0
  7. snowflake/ml/data/_internal/arrow_ingestor.py +1 -1
  8. snowflake/ml/feature_store/feature_store.py +18 -0
  9. snowflake/ml/feature_store/feature_view.py +46 -1
  10. snowflake/ml/fileset/fileset.py +0 -1
  11. snowflake/ml/jobs/_utils/constants.py +31 -1
  12. snowflake/ml/jobs/_utils/payload_utils.py +232 -72
  13. snowflake/ml/jobs/_utils/spec_utils.py +78 -38
  14. snowflake/ml/jobs/decorators.py +8 -25
  15. snowflake/ml/jobs/job.py +4 -4
  16. snowflake/ml/jobs/manager.py +5 -0
  17. snowflake/ml/model/_client/model/model_version_impl.py +1 -1
  18. snowflake/ml/model/_client/ops/model_ops.py +107 -14
  19. snowflake/ml/model/_client/ops/service_ops.py +1 -1
  20. snowflake/ml/model/_client/service/model_deployment_spec.py +7 -3
  21. snowflake/ml/model/_client/sql/model_version.py +58 -0
  22. snowflake/ml/model/_client/sql/service.py +8 -2
  23. snowflake/ml/model/_model_composer/model_composer.py +50 -3
  24. snowflake/ml/model/_model_composer/model_manifest/model_manifest.py +4 -0
  25. snowflake/ml/model/_model_composer/model_manifest/model_manifest_schema.py +2 -1
  26. snowflake/ml/model/_model_composer/model_method/model_method.py +0 -1
  27. snowflake/ml/model/_packager/model_env/model_env.py +49 -29
  28. snowflake/ml/model/_packager/model_handlers/_utils.py +8 -4
  29. snowflake/ml/model/_packager/model_handlers/huggingface_pipeline.py +44 -24
  30. snowflake/ml/model/_packager/model_handlers/keras.py +226 -0
  31. snowflake/ml/model/_packager/model_handlers/pytorch.py +51 -20
  32. snowflake/ml/model/_packager/model_handlers/sklearn.py +25 -3
  33. snowflake/ml/model/_packager/model_handlers/snowmlmodel.py +73 -21
  34. snowflake/ml/model/_packager/model_handlers/tensorflow.py +70 -72
  35. snowflake/ml/model/_packager/model_handlers/torchscript.py +49 -20
  36. snowflake/ml/model/_packager/model_handlers/xgboost.py +2 -2
  37. snowflake/ml/model/_packager/model_handlers_migrator/pytorch_migrator_2023_12_01.py +20 -0
  38. snowflake/ml/model/_packager/model_handlers_migrator/tensorflow_migrator_2023_12_01.py +48 -0
  39. snowflake/ml/model/_packager/model_handlers_migrator/tensorflow_migrator_2025_01_01.py +19 -0
  40. snowflake/ml/model/_packager/model_handlers_migrator/torchscript_migrator_2023_12_01.py +20 -0
  41. snowflake/ml/model/_packager/model_meta/_packaging_requirements.py +0 -1
  42. snowflake/ml/model/_packager/model_meta/model_meta.py +6 -2
  43. snowflake/ml/model/_packager/model_meta/model_meta_schema.py +16 -0
  44. snowflake/ml/model/_packager/model_packager.py +3 -5
  45. snowflake/ml/model/_packager/model_runtime/_snowml_inference_alternative_requirements.py +1 -2
  46. snowflake/ml/model/_packager/model_runtime/model_runtime.py +8 -1
  47. snowflake/ml/model/_packager/model_task/model_task_utils.py +5 -1
  48. snowflake/ml/model/_signatures/builtins_handler.py +20 -9
  49. snowflake/ml/model/_signatures/core.py +54 -33
  50. snowflake/ml/model/_signatures/dmatrix_handler.py +98 -0
  51. snowflake/ml/model/_signatures/numpy_handler.py +12 -20
  52. snowflake/ml/model/_signatures/pandas_handler.py +28 -37
  53. snowflake/ml/model/_signatures/pytorch_handler.py +57 -41
  54. snowflake/ml/model/_signatures/snowpark_handler.py +0 -12
  55. snowflake/ml/model/_signatures/tensorflow_handler.py +61 -67
  56. snowflake/ml/model/_signatures/utils.py +120 -8
  57. snowflake/ml/model/custom_model.py +13 -4
  58. snowflake/ml/model/model_signature.py +39 -13
  59. snowflake/ml/model/type_hints.py +28 -2
  60. snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_handlers.py +14 -1
  61. snowflake/ml/modeling/metrics/ranking.py +3 -0
  62. snowflake/ml/modeling/metrics/regression.py +3 -0
  63. snowflake/ml/modeling/pipeline/pipeline.py +18 -1
  64. snowflake/ml/modeling/preprocessing/k_bins_discretizer.py +1 -1
  65. snowflake/ml/modeling/preprocessing/polynomial_features.py +2 -2
  66. snowflake/ml/registry/_manager/model_manager.py +55 -7
  67. snowflake/ml/registry/registry.py +52 -4
  68. snowflake/ml/version.py +1 -1
  69. {snowflake_ml_python-1.7.4.dist-info → snowflake_ml_python-1.8.0.dist-info}/METADATA +336 -27
  70. {snowflake_ml_python-1.7.4.dist-info → snowflake_ml_python-1.8.0.dist-info}/RECORD +73 -66
  71. {snowflake_ml_python-1.7.4.dist-info → snowflake_ml_python-1.8.0.dist-info}/WHEEL +1 -1
  72. {snowflake_ml_python-1.7.4.dist-info → snowflake_ml_python-1.8.0.dist-info/licenses}/LICENSE.txt +0 -0
  73. {snowflake_ml_python-1.7.4.dist-info → snowflake_ml_python-1.8.0.dist-info}/top_level.txt +0 -0
snowflake/ml/jobs/_utils/constants.py
@@ -4,21 +4,51 @@ from snowflake.ml.jobs._utils.types import ComputeResources
  # SPCS specification constants
  DEFAULT_CONTAINER_NAME = "main"
  PAYLOAD_DIR_ENV_VAR = "MLRS_PAYLOAD_DIR"
+ MEMORY_VOLUME_NAME = "dshm"
+ STAGE_VOLUME_NAME = "stage-volume"
+ STAGE_VOLUME_MOUNT_PATH = "/mnt/app"

  # Default container image information
  DEFAULT_IMAGE_REPO = "/snowflake/images/snowflake_images"
  DEFAULT_IMAGE_CPU = "st_plat/runtime/x86/runtime_image/snowbooks"
  DEFAULT_IMAGE_GPU = "st_plat/runtime/x86/generic_gpu/runtime_image/snowbooks"
- DEFAULT_IMAGE_TAG = "0.8.0"
+ DEFAULT_IMAGE_TAG = "0.9.2"
  DEFAULT_ENTRYPOINT_PATH = "func.py"

  # Percent of container memory to allocate for /dev/shm volume
  MEMORY_VOLUME_SIZE = 0.3

+ # Multi Node Headless prototype constants
+ # TODO: Replace this placeholder with the actual container runtime image tag.
+ MULTINODE_HEADLESS_IMAGE_TAG = "latest"
+
+ # Ray port configuration
+ RAY_PORTS = {
+     "HEAD_CLIENT_SERVER_PORT": "10001",
+     "HEAD_GCS_PORT": "12001",
+     "HEAD_DASHBOARD_GRPC_PORT": "12002",
+     "HEAD_DASHBOARD_PORT": "12003",
+     "OBJECT_MANAGER_PORT": "12011",
+     "NODE_MANAGER_PORT": "12012",
+     "RUNTIME_ENV_AGENT_PORT": "12013",
+     "DASHBOARD_AGENT_GRPC_PORT": "12014",
+     "DASHBOARD_AGENT_LISTEN_PORT": "12015",
+     "MIN_WORKER_PORT": "12031",
+     "MAX_WORKER_PORT": "13000",
+ }
+
+ # Node health check configuration
+ # TODO(SNOW-1937020): Revisit the health check configuration
+ ML_RUNTIME_HEALTH_CHECK_PORT = "5001"
+ ENABLE_HEALTH_CHECKS = "false"
+
  # Job status polling constants
  JOB_POLL_INITIAL_DELAY_SECONDS = 0.1
  JOB_POLL_MAX_DELAY_SECONDS = 1

+ # Magic attributes
+ IS_MLJOB_REMOTE_ATTR = "_is_mljob_remote_callable"
+
  # Compute pool resource information
  # TODO: Query Snowflake for resource information instead of relying on this hardcoded
  # table from https://docs.snowflake.com/en/sql-reference/sql/create-compute-pool
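For orientation, the new multi-node constants are consumed when the job's service spec is assembled (see the spec_utils.py hunks below). A minimal sketch of that wiring, assuming a dict of container environment variables; `build_env` is an illustrative helper, not part of the package:

```python
from snowflake.ml.jobs._utils import constants

def build_env(payload_dir: str, is_multi_node: bool) -> dict:
    # Hypothetical helper mirroring spec_utils.generate_service_spec below:
    # every Ray port plus the health-check flag becomes a container env var.
    env_vars = {constants.PAYLOAD_DIR_ENV_VAR: payload_dir}
    if is_multi_node:
        env_vars.update(constants.RAY_PORTS)  # e.g. "HEAD_GCS_PORT" -> "12001"
        env_vars["ENABLE_HEALTH_CHECKS"] = constants.ENABLE_HEALTH_CHECKS
    return env_vars
```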
snowflake/ml/jobs/_utils/payload_utils.py
@@ -1,5 +1,8 @@
+ import functools
  import inspect
  import io
+ import itertools
+ import pickle
  import sys
  import textwrap
  from pathlib import Path, PurePath
@@ -19,9 +22,11 @@ import cloudpickle as cp

  from snowflake import snowpark
  from snowflake.ml.jobs._utils import constants, types
+ from snowflake.snowpark import exceptions as sp_exceptions
  from snowflake.snowpark._internal import code_generation

  _SUPPORTED_ARG_TYPES = {str, int, float}
+ _SUPPORTED_ENTRYPOINT_EXTENSIONS = {".py"}
  _STARTUP_SCRIPT_PATH = PurePath("startup.sh")
  _STARTUP_SCRIPT_CODE = textwrap.dedent(
      f"""
@@ -68,16 +73,56 @@ _STARTUP_SCRIPT_CODE = textwrap.dedent(
  ##### Ray configuration #####
  shm_size=$(df --output=size --block-size=1 /dev/shm | tail -n 1)

+ # Check if the instance ip retrieval module exists, which is a prerequisite for multi node jobs
+ HELPER_EXISTS=$(
+     python3 -c "import snowflake.runtime.utils.get_instance_ip" 2>/dev/null && echo "true" || echo "false"
+ )
+
  # Configure IP address and logging directory
- eth0Ip=$(ifconfig eth0 | sed -En -e 's/.*inet ([0-9.]+).*/\1/p')
+ if [ "$HELPER_EXISTS" = "true" ]; then
+     eth0Ip=$(python3 -m snowflake.runtime.utils.get_instance_ip "$SNOWFLAKE_SERVICE_NAME" --instance-index=-1)
+ else
+     eth0Ip=$(ifconfig eth0 2>/dev/null | sed -En -e 's/.*inet ([0-9.]+).*/\1/p')
+ fi
  log_dir="/tmp/ray"

- # Check if eth0Ip is empty and set default if necessary
- if [ -z "$eth0Ip" ]; then
-     # This should never happen, but just in case ethOIp is not set, we should default to localhost
+ # Check if eth0Ip is a valid IP address and fall back to default if necessary
+ if [[ ! $eth0Ip =~ ^[0-9]+\\.[0-9]+\\.[0-9]+\\.[0-9]+$ ]]; then
      eth0Ip="127.0.0.1"
  fi

+ # Get the environment values of SNOWFLAKE_JOBS_COUNT and SNOWFLAKE_JOB_INDEX for batch jobs
+ # These variables don't exist for non-batch jobs, so set defaults
+ if [ -z "$SNOWFLAKE_JOBS_COUNT" ]; then
+     SNOWFLAKE_JOBS_COUNT=1
+ fi
+
+ if [ -z "$SNOWFLAKE_JOB_INDEX" ]; then
+     SNOWFLAKE_JOB_INDEX=0
+ fi
+
+ # Determine if it should be a worker or a head node for batch jobs
+ if [[ "$SNOWFLAKE_JOBS_COUNT" -gt 1 && "$HELPER_EXISTS" = "true" ]]; then
+     head_info=$(python3 -m snowflake.runtime.utils.get_instance_ip "$SNOWFLAKE_SERVICE_NAME" --head)
+     if [ $? -eq 0 ]; then
+         # Parse the output using read
+         read head_index head_ip <<< "$head_info"
+
+         # Use the parsed variables
+         echo "Head Instance Index: $head_index"
+         echo "Head Instance IP: $head_ip"
+
+     else
+         echo "Error: Failed to get head instance information."
+         echo "$head_info"  # Print the error message
+         exit 1
+     fi
+
+     if [ "$SNOWFLAKE_JOB_INDEX" -ne "$head_index" ]; then
+         NODE_TYPE="worker"
+     fi
+ fi
+
  # Common parameters for both head and worker nodes
  common_params=(
      "--node-ip-address=$eth0Ip"
@@ -93,33 +138,94 @@ _STARTUP_SCRIPT_CODE = textwrap.dedent(
      "--disable-usage-stats"
  )

- # Additional head-specific parameters
- head_params=(
-     "--head"
-     "--port=${{RAY_HEAD_GCS_PORT:-12001}}"  # Port of Ray (GCS server)
-     "--ray-client-server-port=${{RAY_HEAD_CLIENT_SERVER_PORT:-10001}}"  # Listening port for Ray Client Server
-     "--dashboard-host=${{NODE_IP_ADDRESS}}"  # Host to bind the dashboard server
-     "--dashboard-grpc-port=${{RAY_HEAD_DASHBOARD_GRPC_PORT:-12002}}"  # Dashboard head to listen for grpc on
-     "--dashboard-port=${{DASHBOARD_PORT}}"  # Port to bind the dashboard server for local debugging
-     "--resources={{\\"node_tag:head\\":1}}"  # Resource tag for selecting head as coordinator
- )
+ if [ "$NODE_TYPE" = "worker" ]; then
+     # Use head_ip as head address if it exists
+     if [ ! -z "$head_ip" ]; then
+         RAY_HEAD_ADDRESS="$head_ip"
+     fi
+
+     # If RAY_HEAD_ADDRESS is still empty, exit with an error
+     if [ -z "$RAY_HEAD_ADDRESS" ]; then
+         echo "Error: Failed to determine head node address using default instance-index=0"
+         exit 1
+     fi
+
+     if [ -z "$SERVICE_NAME" ]; then
+         SERVICE_NAME="$SNOWFLAKE_SERVICE_NAME"
+     fi
+
+     if [ -z "$RAY_HEAD_ADDRESS" ] || [ -z "$SERVICE_NAME" ]; then
+         echo "Error: RAY_HEAD_ADDRESS and SERVICE_NAME must be set."
+         exit 1
+     fi
+
+     # Additional worker-specific parameters
+     worker_params=(
+         "--address=${{RAY_HEAD_ADDRESS}}:12001"  # Connect to head node
+         "--resources={{\\"${{SERVICE_NAME}}\\":1, \\"node_tag:worker\\":1}}"  # Tag for node identification
+         "--object-store-memory=${{shm_size}}"
+     )

- # Start Ray on the head node
- ray start "${{common_params[@]}}" "${{head_params[@]}}" &
- ##### End Ray configuration #####
+     # Start Ray on a worker node
+     ray start "${{common_params[@]}}" "${{worker_params[@]}}" -v --block
+ else
+
+     # Additional head-specific parameters
+     head_params=(
+         "--head"
+         "--port=${{RAY_HEAD_GCS_PORT:-12001}}"  # Port of Ray (GCS server)
+         "--ray-client-server-port=${{RAY_HEAD_CLIENT_SERVER_PORT:-10001}}"  # Port for Ray Client Server
+         "--dashboard-host=${{NODE_IP_ADDRESS}}"  # Host to bind the dashboard server
+         "--dashboard-grpc-port=${{RAY_HEAD_DASHBOARD_GRPC_PORT:-12002}}"  # Dashboard head to listen for grpc
+         "--dashboard-port=${{DASHBOARD_PORT}}"  # Port to bind the dashboard server for debugging
+         "--resources={{\\"node_tag:head\\":1}}"  # Resource tag for selecting head as coordinator
+     )
+
+     # Start Ray on the head node
+     ray start "${{common_params[@]}}" "${{head_params[@]}}" -v
+     ##### End Ray configuration #####

- # TODO: Monitor MLRS and handle process crashes
- python -m web.ml_runtime_grpc_server &
+     # TODO: Monitor MLRS and handle process crashes
+     python -m web.ml_runtime_grpc_server &

- # TODO: Launch worker service(s) using SQL if Ray and MLRS successfully started
+     # TODO: Launch worker service(s) using SQL if Ray and MLRS successfully started

- # Run user's Python entrypoint
- echo Running command: python "$@"
- python "$@"
+     # Run user's Python entrypoint
+     echo Running command: python "$@"
+     python "$@"
+ fi
  """
  ).strip()


+ def _resolve_entrypoint(parent: Path, entrypoint: Optional[Path]) -> Path:
+     parent = parent.absolute()
+     if entrypoint is None:
+         if parent.is_file():
+             # Infer entrypoint from source
+             entrypoint = parent
+         else:
+             raise ValueError("entrypoint must be provided when source is a directory")
+     elif entrypoint.is_absolute():
+         # Absolute path - validate it's a subpath of source dir
+         if not entrypoint.is_relative_to(parent):
+             raise ValueError(f"Entrypoint must be a subpath of {parent}, got: {entrypoint})")
+     else:
+         # Relative path
+         if (abs_entrypoint := entrypoint.absolute()).is_relative_to(parent) and abs_entrypoint.is_file():
+             # Relative to working dir iff path is relative to source dir and exists
+             entrypoint = abs_entrypoint
+         else:
+             # Relative to source dir
+             entrypoint = parent.joinpath(entrypoint)
+     if not entrypoint.is_file():
+         raise FileNotFoundError(
+             "Entrypoint not found. Ensure the entrypoint is a valid file and is under"
+             f" the source directory (source={parent}, entrypoint={entrypoint})"
+         )
+     return entrypoint
+
+
  class JobPayload:
      def __init__(
          self,
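The new `_resolve_entrypoint` helper centralizes the path resolution that `JobPayload.validate()` previously inlined (see the next hunk). A minimal sketch of the three resolution cases, assuming the private helper is importable and using a throwaway directory:

```python
import tempfile
from pathlib import Path

# Assumption: the private helper is importable; illustrative use only.
from snowflake.ml.jobs._utils.payload_utils import _resolve_entrypoint

with tempfile.TemporaryDirectory() as d:
    src = Path(d) / "src"
    src.mkdir()
    (src / "train.py").write_text("print('hello')\n")

    # 1. Source is a file: the entrypoint is inferred from it.
    assert _resolve_entrypoint(src / "train.py", None).name == "train.py"
    # 2. Absolute entrypoint: accepted only as a subpath of the source dir.
    assert _resolve_entrypoint(src, (src / "train.py").absolute()).name == "train.py"
    # 3. Relative entrypoint: resolved against the source dir when it is not
    #    a valid path relative to the working directory.
    assert _resolve_entrypoint(src, Path("train.py")).name == "train.py"
```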
@@ -138,23 +244,23 @@ class JobPayload:
              # since we will generate the file from the serialized callable
              pass
          elif isinstance(self.source, Path):
-             # Validate self.source and self.entrypoint for files
-             if not self.source.exists():
-                 raise FileNotFoundError(f"{self.source} does not exist")
-             if self.entrypoint is None:
-                 if self.source.is_file():
-                     self.entrypoint = self.source
-                 else:
-                     raise ValueError("entrypoint must be provided when source is a directory")
-             if not self.entrypoint.is_file():
-                 # Check if self.entrypoint is a valid relative path
-                 self.entrypoint = self.source.joinpath(self.entrypoint)
-                 if not self.entrypoint.is_file():
-                     raise FileNotFoundError(f"File {self.entrypoint} does not exist")
-             if not self.entrypoint.is_relative_to(self.source):
-                 raise ValueError(f"{self.entrypoint} must be a subpath of {self.source}")
-             if self.entrypoint.suffix != ".py":
-                 raise NotImplementedError("Only Python entrypoints are supported currently")
+             # Validate source
+             source = self.source
+             if not source.exists():
+                 raise FileNotFoundError(f"{source} does not exist")
+             source = source.absolute()
+
+             # Validate entrypoint
+             entrypoint = _resolve_entrypoint(source, self.entrypoint)
+             if entrypoint.suffix not in _SUPPORTED_ENTRYPOINT_EXTENSIONS:
+                 raise ValueError(
+                     "Unsupported entrypoint type:"
+                     f" supported={','.join(_SUPPORTED_ENTRYPOINT_EXTENSIONS)} got={entrypoint.suffix}"
+                 )
+
+             # Update fields with normalized values
+             self.source = source
+             self.entrypoint = entrypoint
          else:
              raise ValueError("Unsupported source type. Source must be a file, directory, or callable.")

@@ -168,12 +274,16 @@ class JobPayload:
          entrypoint = self.entrypoint or Path(constants.DEFAULT_ENTRYPOINT_PATH)

          # Create stage if necessary
-         stage_name = stage_path.parts[0]
-         session.sql(
-             f"create stage if not exists {stage_name.lstrip('@')}"
-             " encryption = ( type = 'SNOWFLAKE_SSE' )"
-             " comment = 'Created by snowflake.ml.jobs Python API'"
-         ).collect()
+         stage_name = stage_path.parts[0].lstrip("@")
+         # Explicitly check if stage exists first since we may not have CREATE STAGE privilege
+         try:
+             session.sql(f"describe stage {stage_name}").collect()
+         except sp_exceptions.SnowparkSQLException:
+             session.sql(
+                 f"create stage if not exists {stage_name}"
+                 " encryption = ( type = 'SNOWFLAKE_SSE' )"
+                 " comment = 'Created by snowflake.ml.jobs Python API'"
+             ).collect()

          # Upload payload to stage
          if not isinstance(source, Path):
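The describe-then-create pattern above matters for least-privilege setups: a caller without CREATE STAGE can still run jobs against a pre-created stage. The same probe in isolation, as a hedged sketch (`ensure_stage` is illustrative, not a package API):

```python
from snowflake.snowpark import Session, exceptions as sp_exceptions

def ensure_stage(session: Session, stage_name: str) -> None:
    # DESCRIBE succeeds iff the stage exists; only fall through to CREATE
    # (which needs the CREATE STAGE privilege) when it does not.
    try:
        session.sql(f"describe stage {stage_name}").collect()
    except sp_exceptions.SnowparkSQLException:
        session.sql(
            f"create stage if not exists {stage_name}"
            " encryption = ( type = 'SNOWFLAKE_SSE' )"
        ).collect()
```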
@@ -237,7 +347,7 @@ class JobPayload:
          )


- def get_parameter_type(param: inspect.Parameter) -> Optional[Type[object]]:
+ def _get_parameter_type(param: inspect.Parameter) -> Optional[Type[object]]:
      # Unwrap Optional type annotations
      param_type = param.annotation
      if get_origin(param_type) is Union and len(get_args(param_type)) == 2 and type(None) in get_args(param_type):
@@ -249,7 +359,7 @@ def get_parameter_type(param: inspect.Parameter) -> Optional[Type[object]]:
      return cast(Type[object], param_type)


- def validate_parameter_type(param_type: Type[object], param_name: str) -> None:
+ def _validate_parameter_type(param_type: Type[object], param_name: str) -> None:
      # Validate param_type is a supported type
      if param_type not in _SUPPORTED_ARG_TYPES:
          raise ValueError(
@@ -258,41 +368,60 @@ def validate_parameter_type(param_type: Type[object], param_name: str) -> None:
      )


- def generate_python_code(func: Callable[..., Any], source_code_display: bool = False) -> str:
-     signature = inspect.signature(func)
-     if any(
-         p.kind in {inspect.Parameter.VAR_POSITIONAL, inspect.Parameter.VAR_KEYWORD}
-         for p in signature.parameters.values()
-     ):
-         raise NotImplementedError("Function must not have unpacking arguments (* or **)")
-
-     # Mirrored from Snowpark generate_python_code() function
-     # https://github.com/snowflakedb/snowpark-python/blob/main/src/snowflake/snowpark/_internal/udf_utils.py
+ def _generate_source_code_comment(func: Callable[..., Any]) -> str:
+     """Generate a comment string containing the source code of a function for readability."""
      try:
-         source_code_comment = (
-             code_generation.generate_source_code(func) if source_code_display else ""  # type: ignore[arg-type]
-         )
+         if isinstance(func, functools.partial):
+             # Unwrap functools.partial and generate source code comment from the original function
+             comment = code_generation.generate_source_code(func.func)  # type: ignore[arg-type]
+             args = itertools.chain((repr(a) for a in func.args), (f"{k}={v!r}" for k, v in func.keywords.items()))
+
+             # Update invocation comment to show arguments passed via functools.partial
+             comment = comment.replace(
+                 f"= {func.func.__name__}",
+                 "= functools.partial({}({}))".format(
+                     func.func.__name__,
+                     ", ".join(args),
+                 ),
+             )
+             return comment
+         else:
+             return code_generation.generate_source_code(func)  # type: ignore[arg-type]
      except Exception as exc:
          error_msg = f"Source code comment could not be generated for {func} due to error {exc}."
-         source_code_comment = code_generation.comment_source_code(error_msg)
-
-     func_name = "func"
-     func_code = f"""
- {source_code_comment}
+         return code_generation.comment_source_code(error_msg)

- import pickle
- {func_name} = pickle.loads(bytes.fromhex('{cp.dumps(func).hex()}'))
- """

+ def _serialize_callable(func: Callable[..., Any]) -> bytes:
+     try:
+         func_bytes: bytes = cp.dumps(func)
+         return func_bytes
+     except pickle.PicklingError as e:
+         if isinstance(func, functools.partial):
+             # Try to find which part of the partial isn't serializable for better debuggability
+             objects = [
+                 ("function", func.func),
+                 *((f"positional arg {i}", a) for i, a in enumerate(func.args)),
+                 *((f"keyword arg '{k}'", v) for k, v in func.keywords.items()),
+             ]
+             for name, obj in objects:
+                 try:
+                     cp.dumps(obj)
+                 except pickle.PicklingError:
+                     raise ValueError(f"Unable to serialize {name}: {obj}") from e
+         raise ValueError(f"Unable to serialize function: {func}") from e
+
+
+ def _generate_param_handler_code(signature: inspect.Signature, output_name: str = "kwargs") -> str:
      # Generate argparse logic for argument handling (type coercion, default values, etc)
      argparse_code = ["import argparse", "", "parser = argparse.ArgumentParser()"]
      argparse_postproc = []
      for name, param in signature.parameters.items():
          opts = {}

-         param_type = get_parameter_type(param)
+         param_type = _get_parameter_type(param)
          if param_type is not None:
-             validate_parameter_type(param_type, name)
+             _validate_parameter_type(param_type, name)
              opts["type"] = param_type.__name__

          if param.default != inspect.Parameter.empty:
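`_serialize_callable` exists to turn an opaque cloudpickle failure into a message naming the offending piece of a `functools.partial`. A minimal sketch of that failure mode; `Unpicklable` is contrived purely to force a `PicklingError`:

```python
import functools
import pickle

import cloudpickle as cp  # the same serializer the module aliases as `cp`

class Unpicklable:
    def __reduce__(self):
        # Contrived: guarantee a PicklingError for the demo.
        raise pickle.PicklingError("intentionally unpicklable")

def train(data_path: str, client=None) -> None:
    ...

bad = functools.partial(train, "@my_stage/data", client=Unpicklable())
# cp.dumps(bad) raises PicklingError; _serialize_callable probes each piece
# of the partial and re-raises as:
#   ValueError: Unable to serialize keyword arg 'client': <...Unpicklable...>
```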
@@ -324,6 +453,37 @@ import pickle
      )
      argparse_code.append("args = parser.parse_args()")
      param_code = "\n".join(argparse_code + argparse_postproc)
+     param_code += f"\n{output_name} = vars(args)"
+
+     return param_code
+
+
+ def generate_python_code(func: Callable[..., Any], source_code_display: bool = False) -> str:
+     """Generate an entrypoint script from a Python function."""
+     signature = inspect.signature(func)
+     if any(
+         p.kind in {inspect.Parameter.VAR_POSITIONAL, inspect.Parameter.VAR_KEYWORD}
+         for p in signature.parameters.values()
+     ):
+         raise NotImplementedError("Function must not have unpacking arguments (* or **)")
+
+     # Mirrored from Snowpark generate_python_code() function
+     # https://github.com/snowflakedb/snowpark-python/blob/main/src/snowflake/snowpark/_internal/udf_utils.py
+     source_code_comment = _generate_source_code_comment(func) if source_code_display else ""
+
+     func_name = "func"
+     func_code = f"""
+ {source_code_comment}
+
+ import pickle
+ {func_name} = pickle.loads(bytes.fromhex('{_serialize_callable(func).hex()}'))
+ """
+
+     arg_dict_name = "kwargs"
+     if getattr(func, constants.IS_MLJOB_REMOTE_ATTR, None):
+         param_code = f"{arg_dict_name} = {{}}"
+     else:
+         param_code = _generate_param_handler_code(signature, arg_dict_name)

      return f"""
  ### Version guard to check compatibility across Python versions ###
@@ -348,5 +508,5 @@ if sys.version_info.major != {sys.version_info.major} or sys.version_info.minor
  if __name__ == '__main__':
  {textwrap.indent(param_code, '    ')}

-     {func_name}(**vars(args))
+     {func_name}(**{arg_dict_name})
  """
snowflake/ml/jobs/_utils/spec_utils.py
@@ -26,19 +26,22 @@ def _get_node_resources(session: snowpark.Session, compute_pool: str) -> types.C
      )


- def _get_image_spec(session: snowpark.Session, compute_pool: str) -> types.ImageSpec:
+ def _get_image_spec(session: snowpark.Session, compute_pool: str, image_tag: Optional[str] = None) -> types.ImageSpec:
      # Retrieve compute pool node resources
      resources = _get_node_resources(session, compute_pool=compute_pool)

      # Use MLRuntime image
      image_repo = constants.DEFAULT_IMAGE_REPO
      image_name = constants.DEFAULT_IMAGE_GPU if resources.gpu > 0 else constants.DEFAULT_IMAGE_CPU
-     image_tag = constants.DEFAULT_IMAGE_TAG

      # Try to pull latest image tag from server side if possible
-     query_result = session.sql("SHOW PARAMETERS LIKE 'constants.RUNTIME_BASE_IMAGE_TAG' IN ACCOUNT").collect()
-     if query_result:
-         image_tag = query_result[0]["value"]
+     if not image_tag:
+         query_result = session.sql("SHOW PARAMETERS LIKE 'constants.RUNTIME_BASE_IMAGE_TAG' IN ACCOUNT").collect()
+         if query_result:
+             image_tag = query_result[0]["value"]
+
+     if image_tag is None:
+         image_tag = constants.DEFAULT_IMAGE_TAG

      # TODO: Should each instance consume the entire pod?
      return types.ImageSpec(
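`_get_image_spec` now resolves the image tag with an explicit precedence: caller-supplied `image_tag`, then the account-level `RUNTIME_BASE_IMAGE_TAG` parameter, then `DEFAULT_IMAGE_TAG`. The same order in isolation, as a sketch (`resolve_image_tag` is illustrative; `account_tag` stands in for the SHOW PARAMETERS lookup):

```python
from typing import Optional

DEFAULT_IMAGE_TAG = "0.9.2"

def resolve_image_tag(explicit: Optional[str], account_tag: Optional[str]) -> str:
    if explicit:              # caller override wins
        return explicit
    if account_tag:           # then the account-level parameter
        return account_tag
    return DEFAULT_IMAGE_TAG  # finally the library default

assert resolve_image_tag(None, None) == "0.9.2"
assert resolve_image_tag(None, "0.9.5") == "0.9.5"
assert resolve_image_tag("latest", "0.9.5") == "latest"
```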
@@ -93,6 +96,7 @@ def generate_service_spec(
      compute_pool: str,
      payload: types.UploadedPayload,
      args: Optional[List[str]] = None,
+     num_instances: Optional[int] = None,
  ) -> Dict[str, Any]:
      """
      Generate a service specification for a job.
@@ -102,12 +106,21 @@
          compute_pool: Compute pool for job execution
          payload: Uploaded job payload
          args: Arguments to pass to entrypoint script
+         num_instances: Number of instances for multi-node job

      Returns:
          Job service specification
      """
+     is_multi_node = num_instances is not None and num_instances > 1
+
      # Set resource requests/limits, including nvidia.com/gpu quantity if applicable
-     image_spec = _get_image_spec(session, compute_pool)
+     if is_multi_node:
+         # If the job is of multi-node, we will need a different image which contains
+         # module snowflake.runtime.utils.get_instance_ip
+         # TODO(SNOW-1961849): Remove the hard-coded image name
+         image_spec = _get_image_spec(session, compute_pool, constants.MULTINODE_HEADLESS_IMAGE_TAG)
+     else:
+         image_spec = _get_image_spec(session, compute_pool)
      resource_requests: Dict[str, Union[str, int]] = {
          "cpu": f"{int(image_spec.resource_requests.cpu * 1000)}m",
          "memory": f"{image_spec.resource_limits.memory}Gi",
@@ -141,68 +154,88 @@ def generate_service_spec(
      )

      # Mount 30% of memory limit as a memory-backed volume
-     memory_volume_name = "dshm"
      memory_volume_size = min(
          ceil(image_spec.resource_limits.memory * constants.MEMORY_VOLUME_SIZE),
          image_spec.resource_requests.memory,
      )
      volume_mounts.append(
          {
-             "name": memory_volume_name,
+             "name": constants.MEMORY_VOLUME_NAME,
              "mountPath": "/dev/shm",
          }
      )
      volumes.append(
          {
-             "name": memory_volume_name,
+             "name": constants.MEMORY_VOLUME_NAME,
              "source": "memory",
              "size": f"{memory_volume_size}Gi",
          }
      )

      # Mount payload as volume
-     stage_mount = PurePath("/opt/app")
-     stage_volume_name = "stage-volume"
+     stage_mount = PurePath(constants.STAGE_VOLUME_MOUNT_PATH)
      volume_mounts.append(
          {
-             "name": stage_volume_name,
+             "name": constants.STAGE_VOLUME_NAME,
              "mountPath": stage_mount.as_posix(),
          }
      )
      volumes.append(
          {
-             "name": stage_volume_name,
+             "name": constants.STAGE_VOLUME_NAME,
              "source": payload.stage_path.as_posix(),
          }
      )

      # TODO: Add hooks for endpoints for integration with TensorBoard etc

-     # Assemble into service specification dict
-     spec = {
-         "spec": {
-             "containers": [
-                 {
-                     "name": constants.DEFAULT_CONTAINER_NAME,
-                     "image": image_spec.full_name,
-                     "command": ["/usr/local/bin/_entrypoint.sh"],
-                     "args": [
-                         stage_mount.joinpath(v).as_posix() if isinstance(v, PurePath) else v for v in payload.entrypoint
-                     ]
-                     + (args or []),
-                     "env": {
-                         constants.PAYLOAD_DIR_ENV_VAR: stage_mount.as_posix(),
-                     },
-                     "volumeMounts": volume_mounts,
-                     "resources": {
-                         "requests": resource_requests,
-                         "limits": resource_limits,
-                     },
+     env_vars = {constants.PAYLOAD_DIR_ENV_VAR: stage_mount.as_posix()}
+     endpoints = []
+
+     if is_multi_node:
+         # Update environment variables for multi-node job
+         env_vars.update(constants.RAY_PORTS)
+         env_vars["ENABLE_HEALTH_CHECKS"] = constants.ENABLE_HEALTH_CHECKS
+
+         # Define Ray endpoints for intra-service instance communication
+         ray_endpoints = [
+             {"name": "ray-client-server-endpoint", "port": 10001, "protocol": "TCP"},
+             {"name": "ray-gcs-endpoint", "port": 12001, "protocol": "TCP"},
+             {"name": "ray-dashboard-grpc-endpoint", "port": 12002, "protocol": "TCP"},
+             {"name": "ray-object-manager-endpoint", "port": 12011, "protocol": "TCP"},
+             {"name": "ray-node-manager-endpoint", "port": 12012, "protocol": "TCP"},
+             {"name": "ray-runtime-agent-endpoint", "port": 12013, "protocol": "TCP"},
+             {"name": "ray-dashboard-agent-grpc-endpoint", "port": 12014, "protocol": "TCP"},
+             {"name": "ephemeral-port-range", "portRange": "32768-60999", "protocol": "TCP"},
+             {"name": "ray-worker-port-range", "portRange": "12031-13000", "protocol": "TCP"},
+         ]
+         endpoints.extend(ray_endpoints)
+
+     spec_dict = {
+         "containers": [
+             {
+                 "name": constants.DEFAULT_CONTAINER_NAME,
+                 "image": image_spec.full_name,
+                 "command": ["/usr/local/bin/_entrypoint.sh"],
+                 "args": [
+                     (stage_mount.joinpath(v).as_posix() if isinstance(v, PurePath) else v) for v in payload.entrypoint
+                 ]
+                 + (args or []),
+                 "env": env_vars,
+                 "volumeMounts": volume_mounts,
+                 "resources": {
+                     "requests": resource_requests,
+                     "limits": resource_limits,
                  },
-             ],
-             "volumes": volumes,
-         }
+             },
+         ],
+         "volumes": volumes,
      }
+     if endpoints:
+         spec_dict["endpoints"] = endpoints
+
+     # Assemble into service specification dict
+     spec = {"spec": spec_dict}

      return spec

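For a multi-node job the assembled spec therefore grows two things: the Ray port map in the container's `env`, and an `endpoints` list for instance-to-instance traffic. An abbreviated sketch of the resulting structure (values illustrative):

```python
# Abbreviated shape of generate_service_spec(...) output for num_instances > 1:
spec = {
    "spec": {
        "containers": [
            {
                "name": "main",
                "image": "<repo>/<image>:latest",  # MULTINODE_HEADLESS_IMAGE_TAG
                "env": {
                    "MLRS_PAYLOAD_DIR": "/mnt/app",
                    "HEAD_GCS_PORT": "12001",        # from constants.RAY_PORTS
                    "ENABLE_HEALTH_CHECKS": "false",
                    # ...
                },
                # command, args, volumeMounts, resources ...
            },
        ],
        "volumes": [],  # dshm memory volume + stage-volume (elided)
        "endpoints": [
            {"name": "ray-gcs-endpoint", "port": 12001, "protocol": "TCP"},
            # ... remaining Ray endpoints and the two port ranges
        ],
    }
}
```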
@@ -250,7 +283,10 @@ def merge_patch(base: Any, patch: Any, display_name: str = "") -> Any:


  def _merge_lists_of_dicts(
-     base: List[Dict[str, Any]], patch: List[Dict[str, Any]], merge_key: str = "name", display_name: str = ""
+     base: List[Dict[str, Any]],
+     patch: List[Dict[str, Any]],
+     merge_key: str = "name",
+     display_name: str = "",
  ) -> List[Dict[str, Any]]:
      """
      Attempts to merge lists of dicts by matching on a merge key (default "name").
@@ -290,7 +326,11 @@ def _merge_lists_of_dicts(

      # Apply patch
      if key in result:
-         d = merge_patch(result[key], d, display_name=f"{display_name}[{merge_key}={d[merge_key]}]")
+         d = merge_patch(
+             result[key],
+             d,
+             display_name=f"{display_name}[{merge_key}={d[merge_key]}]",
+         )
      # TODO: Should we drop the item if the patch result is empty save for the merge key?
      # Can check `d.keys() <= {merge_key}`
      result[key] = d
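`_merge_lists_of_dicts` is what lets a user-supplied spec patch target, say, the `main` container by name rather than by list position. A simplified stand-in for the semantics (shallow merge instead of the recursive `merge_patch`), purely illustrative:

```python
from typing import Any, Dict, List

def merge_lists_by_name(
    base: List[Dict[str, Any]], patch: List[Dict[str, Any]]
) -> List[Dict[str, Any]]:
    # Index base entries by "name", then lay each patch entry over its match.
    result = {d["name"]: d for d in base}
    for d in patch:
        result[d["name"]] = {**result.get(d["name"], {}), **d}
    return list(result.values())

base = [{"name": "main", "image": "runtime:0.9.2", "env": {"A": "1"}}]
patch = [{"name": "main", "image": "runtime:custom"}]
assert merge_lists_by_name(base, patch) == [
    {"name": "main", "image": "runtime:custom", "env": {"A": "1"}}
]
```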