snowflake-ml-python 1.7.3__py3-none-any.whl → 1.7.4__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to a supported public registry. It is provided for informational purposes only.
- snowflake/cortex/_complete.py +19 -0
- snowflake/ml/_internal/platform_capabilities.py +87 -0
- snowflake/ml/dataset/dataset.py +0 -1
- snowflake/ml/fileset/fileset.py +6 -0
- snowflake/ml/jobs/__init__.py +21 -0
- snowflake/ml/jobs/_utils/constants.py +51 -0
- snowflake/ml/jobs/_utils/payload_utils.py +352 -0
- snowflake/ml/jobs/_utils/spec_utils.py +298 -0
- snowflake/ml/jobs/_utils/types.py +39 -0
- snowflake/ml/jobs/decorators.py +91 -0
- snowflake/ml/jobs/job.py +113 -0
- snowflake/ml/jobs/manager.py +298 -0
- snowflake/ml/model/_client/ops/model_ops.py +11 -2
- snowflake/ml/model/_client/ops/service_ops.py +1 -11
- snowflake/ml/model/_client/sql/service.py +13 -6
- snowflake/ml/model/_packager/model_handlers/_utils.py +12 -3
- snowflake/ml/model/_packager/model_handlers/custom.py +1 -2
- snowflake/ml/model/_packager/model_handlers/huggingface_pipeline.py +1 -0
- snowflake/ml/model/_packager/model_runtime/_snowml_inference_alternative_requirements.py +2 -2
- snowflake/ml/model/_signatures/base_handler.py +1 -2
- snowflake/ml/model/_signatures/builtins_handler.py +2 -2
- snowflake/ml/model/_signatures/numpy_handler.py +6 -7
- snowflake/ml/model/_signatures/pandas_handler.py +2 -2
- snowflake/ml/model/_signatures/pytorch_handler.py +2 -5
- snowflake/ml/model/_signatures/snowpark_handler.py +3 -3
- snowflake/ml/model/_signatures/tensorflow_handler.py +2 -7
- snowflake/ml/model/model_signature.py +17 -4
- snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +6 -3
- snowflake/ml/modeling/cluster/affinity_propagation.py +6 -3
- snowflake/ml/modeling/cluster/agglomerative_clustering.py +6 -3
- snowflake/ml/modeling/cluster/birch.py +6 -3
- snowflake/ml/modeling/cluster/bisecting_k_means.py +6 -3
- snowflake/ml/modeling/cluster/dbscan.py +6 -3
- snowflake/ml/modeling/cluster/feature_agglomeration.py +6 -3
- snowflake/ml/modeling/cluster/k_means.py +6 -3
- snowflake/ml/modeling/cluster/mean_shift.py +6 -3
- snowflake/ml/modeling/cluster/mini_batch_k_means.py +6 -3
- snowflake/ml/modeling/cluster/optics.py +6 -3
- snowflake/ml/modeling/cluster/spectral_biclustering.py +6 -3
- snowflake/ml/modeling/cluster/spectral_clustering.py +6 -3
- snowflake/ml/modeling/cluster/spectral_coclustering.py +6 -3
- snowflake/ml/modeling/compose/column_transformer.py +6 -3
- snowflake/ml/modeling/compose/transformed_target_regressor.py +6 -3
- snowflake/ml/modeling/covariance/elliptic_envelope.py +6 -3
- snowflake/ml/modeling/covariance/empirical_covariance.py +6 -3
- snowflake/ml/modeling/covariance/graphical_lasso.py +6 -3
- snowflake/ml/modeling/covariance/graphical_lasso_cv.py +6 -3
- snowflake/ml/modeling/covariance/ledoit_wolf.py +6 -3
- snowflake/ml/modeling/covariance/min_cov_det.py +6 -3
- snowflake/ml/modeling/covariance/oas.py +6 -3
- snowflake/ml/modeling/covariance/shrunk_covariance.py +6 -3
- snowflake/ml/modeling/decomposition/dictionary_learning.py +6 -3
- snowflake/ml/modeling/decomposition/factor_analysis.py +6 -3
- snowflake/ml/modeling/decomposition/fast_ica.py +6 -3
- snowflake/ml/modeling/decomposition/incremental_pca.py +6 -3
- snowflake/ml/modeling/decomposition/kernel_pca.py +6 -3
- snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +6 -3
- snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +6 -3
- snowflake/ml/modeling/decomposition/pca.py +6 -3
- snowflake/ml/modeling/decomposition/sparse_pca.py +6 -3
- snowflake/ml/modeling/decomposition/truncated_svd.py +6 -3
- snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +6 -3
- snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +6 -3
- snowflake/ml/modeling/ensemble/ada_boost_classifier.py +6 -3
- snowflake/ml/modeling/ensemble/ada_boost_regressor.py +6 -3
- snowflake/ml/modeling/ensemble/bagging_classifier.py +6 -3
- snowflake/ml/modeling/ensemble/bagging_regressor.py +6 -3
- snowflake/ml/modeling/ensemble/extra_trees_classifier.py +6 -3
- snowflake/ml/modeling/ensemble/extra_trees_regressor.py +6 -3
- snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +6 -3
- snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +6 -3
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +6 -3
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +6 -3
- snowflake/ml/modeling/ensemble/isolation_forest.py +6 -3
- snowflake/ml/modeling/ensemble/random_forest_classifier.py +6 -3
- snowflake/ml/modeling/ensemble/random_forest_regressor.py +6 -3
- snowflake/ml/modeling/ensemble/stacking_regressor.py +6 -3
- snowflake/ml/modeling/ensemble/voting_classifier.py +6 -3
- snowflake/ml/modeling/ensemble/voting_regressor.py +6 -3
- snowflake/ml/modeling/feature_selection/generic_univariate_select.py +6 -3
- snowflake/ml/modeling/feature_selection/select_fdr.py +6 -3
- snowflake/ml/modeling/feature_selection/select_fpr.py +6 -3
- snowflake/ml/modeling/feature_selection/select_fwe.py +6 -3
- snowflake/ml/modeling/feature_selection/select_k_best.py +6 -3
- snowflake/ml/modeling/feature_selection/select_percentile.py +6 -3
- snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +6 -3
- snowflake/ml/modeling/feature_selection/variance_threshold.py +6 -3
- snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +6 -3
- snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +6 -3
- snowflake/ml/modeling/impute/iterative_imputer.py +6 -3
- snowflake/ml/modeling/impute/knn_imputer.py +6 -3
- snowflake/ml/modeling/impute/missing_indicator.py +6 -3
- snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +6 -3
- snowflake/ml/modeling/kernel_approximation/nystroem.py +6 -3
- snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +6 -3
- snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +6 -3
- snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +6 -3
- snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +6 -3
- snowflake/ml/modeling/lightgbm/lgbm_classifier.py +6 -3
- snowflake/ml/modeling/lightgbm/lgbm_regressor.py +6 -3
- snowflake/ml/modeling/linear_model/ard_regression.py +6 -3
- snowflake/ml/modeling/linear_model/bayesian_ridge.py +6 -3
- snowflake/ml/modeling/linear_model/elastic_net.py +6 -3
- snowflake/ml/modeling/linear_model/elastic_net_cv.py +6 -3
- snowflake/ml/modeling/linear_model/gamma_regressor.py +6 -3
- snowflake/ml/modeling/linear_model/huber_regressor.py +6 -3
- snowflake/ml/modeling/linear_model/lars.py +6 -3
- snowflake/ml/modeling/linear_model/lars_cv.py +6 -3
- snowflake/ml/modeling/linear_model/lasso.py +6 -3
- snowflake/ml/modeling/linear_model/lasso_cv.py +6 -3
- snowflake/ml/modeling/linear_model/lasso_lars.py +6 -3
- snowflake/ml/modeling/linear_model/lasso_lars_cv.py +6 -3
- snowflake/ml/modeling/linear_model/lasso_lars_ic.py +6 -3
- snowflake/ml/modeling/linear_model/linear_regression.py +6 -3
- snowflake/ml/modeling/linear_model/logistic_regression.py +6 -3
- snowflake/ml/modeling/linear_model/logistic_regression_cv.py +6 -3
- snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +6 -3
- snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +6 -3
- snowflake/ml/modeling/linear_model/multi_task_lasso.py +6 -3
- snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +6 -3
- snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +6 -3
- snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +6 -3
- snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +6 -3
- snowflake/ml/modeling/linear_model/perceptron.py +6 -3
- snowflake/ml/modeling/linear_model/poisson_regressor.py +6 -3
- snowflake/ml/modeling/linear_model/ransac_regressor.py +6 -3
- snowflake/ml/modeling/linear_model/ridge.py +6 -3
- snowflake/ml/modeling/linear_model/ridge_classifier.py +6 -3
- snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +6 -3
- snowflake/ml/modeling/linear_model/ridge_cv.py +6 -3
- snowflake/ml/modeling/linear_model/sgd_classifier.py +6 -3
- snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +6 -3
- snowflake/ml/modeling/linear_model/sgd_regressor.py +6 -3
- snowflake/ml/modeling/linear_model/theil_sen_regressor.py +6 -3
- snowflake/ml/modeling/linear_model/tweedie_regressor.py +6 -3
- snowflake/ml/modeling/manifold/isomap.py +6 -3
- snowflake/ml/modeling/manifold/mds.py +6 -3
- snowflake/ml/modeling/manifold/spectral_embedding.py +6 -3
- snowflake/ml/modeling/manifold/tsne.py +6 -3
- snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +6 -3
- snowflake/ml/modeling/mixture/gaussian_mixture.py +6 -3
- snowflake/ml/modeling/model_selection/grid_search_cv.py +17 -2
- snowflake/ml/modeling/model_selection/randomized_search_cv.py +17 -2
- snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +6 -3
- snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +6 -3
- snowflake/ml/modeling/multiclass/output_code_classifier.py +6 -3
- snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +6 -3
- snowflake/ml/modeling/naive_bayes/categorical_nb.py +6 -3
- snowflake/ml/modeling/naive_bayes/complement_nb.py +6 -3
- snowflake/ml/modeling/naive_bayes/gaussian_nb.py +6 -3
- snowflake/ml/modeling/naive_bayes/multinomial_nb.py +6 -3
- snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +6 -3
- snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +6 -3
- snowflake/ml/modeling/neighbors/kernel_density.py +6 -3
- snowflake/ml/modeling/neighbors/local_outlier_factor.py +6 -3
- snowflake/ml/modeling/neighbors/nearest_centroid.py +6 -3
- snowflake/ml/modeling/neighbors/nearest_neighbors.py +6 -3
- snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +6 -3
- snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +6 -3
- snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +6 -3
- snowflake/ml/modeling/neural_network/bernoulli_rbm.py +6 -3
- snowflake/ml/modeling/neural_network/mlp_classifier.py +6 -3
- snowflake/ml/modeling/neural_network/mlp_regressor.py +6 -3
- snowflake/ml/modeling/pipeline/pipeline.py +10 -2
- snowflake/ml/modeling/preprocessing/polynomial_features.py +6 -3
- snowflake/ml/modeling/semi_supervised/label_propagation.py +6 -3
- snowflake/ml/modeling/semi_supervised/label_spreading.py +6 -3
- snowflake/ml/modeling/svm/linear_svc.py +6 -3
- snowflake/ml/modeling/svm/linear_svr.py +6 -3
- snowflake/ml/modeling/svm/nu_svc.py +6 -3
- snowflake/ml/modeling/svm/nu_svr.py +6 -3
- snowflake/ml/modeling/svm/svc.py +6 -3
- snowflake/ml/modeling/svm/svr.py +6 -3
- snowflake/ml/modeling/tree/decision_tree_classifier.py +6 -3
- snowflake/ml/modeling/tree/decision_tree_regressor.py +6 -3
- snowflake/ml/modeling/tree/extra_tree_classifier.py +6 -3
- snowflake/ml/modeling/tree/extra_tree_regressor.py +6 -3
- snowflake/ml/modeling/xgboost/xgb_classifier.py +6 -3
- snowflake/ml/modeling/xgboost/xgb_regressor.py +6 -3
- snowflake/ml/modeling/xgboost/xgbrf_classifier.py +6 -3
- snowflake/ml/modeling/xgboost/xgbrf_regressor.py +6 -3
- snowflake/ml/version.py +1 -1
- {snowflake_ml_python-1.7.3.dist-info → snowflake_ml_python-1.7.4.dist-info}/METADATA +29 -14
- {snowflake_ml_python-1.7.3.dist-info → snowflake_ml_python-1.7.4.dist-info}/RECORD +187 -178
- {snowflake_ml_python-1.7.3.dist-info → snowflake_ml_python-1.7.4.dist-info}/LICENSE.txt +0 -0
- {snowflake_ml_python-1.7.3.dist-info → snowflake_ml_python-1.7.4.dist-info}/WHEEL +0 -0
- {snowflake_ml_python-1.7.3.dist-info → snowflake_ml_python-1.7.4.dist-info}/top_level.txt +0 -0
snowflake/ml/jobs/_utils/spec_utils.py ADDED
@@ -0,0 +1,298 @@
+import logging
+from math import ceil
+from pathlib import PurePath
+from typing import Any, Dict, List, Optional, Union
+
+from snowflake import snowpark
+from snowflake.ml._internal.utils import snowflake_env
+from snowflake.ml.jobs._utils import constants, types
+
+
+def _get_node_resources(session: snowpark.Session, compute_pool: str) -> types.ComputeResources:
+    """Extract resource information for the specified compute pool"""
+    # Get the instance family
+    rows = session.sql(f"show compute pools like '{compute_pool}'").collect()
+    if not rows:
+        raise ValueError(f"Compute pool '{compute_pool}' not found")
+    instance_family: str = rows[0]["instance_family"]
+
+    # Get the cloud we're using (AWS, Azure, etc)
+    region = snowflake_env.get_regions(session)[snowflake_env.get_current_region_id(session)]
+    cloud = region["cloud"]
+
+    return (
+        constants.COMMON_INSTANCE_FAMILIES.get(instance_family)
+        or constants.CLOUD_INSTANCE_FAMILIES[cloud][instance_family]
+    )
+
+
+def _get_image_spec(session: snowpark.Session, compute_pool: str) -> types.ImageSpec:
+    # Retrieve compute pool node resources
+    resources = _get_node_resources(session, compute_pool=compute_pool)
+
+    # Use MLRuntime image
+    image_repo = constants.DEFAULT_IMAGE_REPO
+    image_name = constants.DEFAULT_IMAGE_GPU if resources.gpu > 0 else constants.DEFAULT_IMAGE_CPU
+    image_tag = constants.DEFAULT_IMAGE_TAG
+
+    # Try to pull latest image tag from server side if possible
+    query_result = session.sql(f"SHOW PARAMETERS LIKE '{constants.RUNTIME_BASE_IMAGE_TAG}' IN ACCOUNT").collect()
+    if query_result:
+        image_tag = query_result[0]["value"]
+
+    # TODO: Should each instance consume the entire pod?
+    return types.ImageSpec(
+        repo=image_repo,
+        image_name=image_name,
+        image_tag=image_tag,
+        resource_requests=resources,
+        resource_limits=resources,
+    )
+
+
+def generate_spec_overrides(
+    environment_vars: Optional[Dict[str, str]] = None,
+    custom_overrides: Optional[Dict[str, Any]] = None,
+) -> Dict[str, Any]:
+    """
+    Generate a dictionary of service specification overrides.
+
+    Args:
+        environment_vars: Environment variables to set in primary container
+        custom_overrides: Custom service specification overrides
+
+    Returns:
+        Resulting service specification patch dict. Empty if no overrides were supplied.
+    """
+    # Generate container level overrides
+    container_spec: Dict[str, Any] = {
+        "name": constants.DEFAULT_CONTAINER_NAME,
+    }
+    if environment_vars:
+        # TODO: Validate environment variables
+        container_spec["env"] = environment_vars
+
+    # Build container override spec only if any overrides were supplied
+    spec = {}
+    if len(container_spec) > 1:
+        spec = {
+            "spec": {
+                "containers": [container_spec],
+            }
+        }
+
+    # Apply custom overrides
+    if custom_overrides:
+        spec = merge_patch(spec, custom_overrides, display_name="custom_overrides")
+
+    return spec
+
+
+def generate_service_spec(
+    session: snowpark.Session,
+    compute_pool: str,
+    payload: types.UploadedPayload,
+    args: Optional[List[str]] = None,
+) -> Dict[str, Any]:
+    """
+    Generate a service specification for a job.
+
+    Args:
+        session: Snowflake session
+        compute_pool: Compute pool for job execution
+        payload: Uploaded job payload
+        args: Arguments to pass to entrypoint script
+
+    Returns:
+        Job service specification
+    """
+    # Set resource requests/limits, including nvidia.com/gpu quantity if applicable
+    image_spec = _get_image_spec(session, compute_pool)
+    resource_requests: Dict[str, Union[str, int]] = {
+        "cpu": f"{int(image_spec.resource_requests.cpu * 1000)}m",
+        "memory": f"{image_spec.resource_limits.memory}Gi",
+    }
+    resource_limits: Dict[str, Union[str, int]] = {
+        "cpu": f"{int(image_spec.resource_requests.cpu * 1000)}m",
+        "memory": f"{image_spec.resource_limits.memory}Gi",
+    }
+    if image_spec.resource_limits.gpu > 0:
+        resource_requests["nvidia.com/gpu"] = image_spec.resource_requests.gpu
+        resource_limits["nvidia.com/gpu"] = image_spec.resource_limits.gpu
+
+    # Add local volumes for ephemeral logs and artifacts
+    volumes: List[Dict[str, str]] = []
+    volume_mounts: List[Dict[str, str]] = []
+    for volume_name, mount_path in [
+        ("system-logs", "/var/log/managedservices/system/mlrs"),
+        ("user-logs", "/var/log/managedservices/user/mlrs"),
+    ]:
+        volume_mounts.append(
+            {
+                "name": volume_name,
+                "mountPath": mount_path,
+            }
+        )
+        volumes.append(
+            {
+                "name": volume_name,
+                "source": "local",
+            }
+        )
+
+    # Mount 30% of memory limit as a memory-backed volume
+    memory_volume_name = "dshm"
+    memory_volume_size = min(
+        ceil(image_spec.resource_limits.memory * constants.MEMORY_VOLUME_SIZE),
+        image_spec.resource_requests.memory,
+    )
+    volume_mounts.append(
+        {
+            "name": memory_volume_name,
+            "mountPath": "/dev/shm",
+        }
+    )
+    volumes.append(
+        {
+            "name": memory_volume_name,
+            "source": "memory",
+            "size": f"{memory_volume_size}Gi",
+        }
+    )
+
+    # Mount payload as volume
+    stage_mount = PurePath("/opt/app")
+    stage_volume_name = "stage-volume"
+    volume_mounts.append(
+        {
+            "name": stage_volume_name,
+            "mountPath": stage_mount.as_posix(),
+        }
+    )
+    volumes.append(
+        {
+            "name": stage_volume_name,
+            "source": payload.stage_path.as_posix(),
+        }
+    )
+
+    # TODO: Add hooks for endpoints for integration with TensorBoard etc
+
+    # Assemble into service specification dict
+    spec = {
+        "spec": {
+            "containers": [
+                {
+                    "name": constants.DEFAULT_CONTAINER_NAME,
+                    "image": image_spec.full_name,
+                    "command": ["/usr/local/bin/_entrypoint.sh"],
+                    "args": [
+                        stage_mount.joinpath(v).as_posix() if isinstance(v, PurePath) else v for v in payload.entrypoint
+                    ]
+                    + (args or []),
+                    "env": {
+                        constants.PAYLOAD_DIR_ENV_VAR: stage_mount.as_posix(),
+                    },
+                    "volumeMounts": volume_mounts,
+                    "resources": {
+                        "requests": resource_requests,
+                        "limits": resource_limits,
+                    },
+                },
+            ],
+            "volumes": volumes,
+        }
+    }
+
+    return spec
+
+
+def merge_patch(base: Any, patch: Any, display_name: str = "") -> Any:
+    """
+    Implements a modified RFC7386 JSON Merge Patch
+    https://datatracker.ietf.org/doc/html/rfc7386
+
+    Behavior differs from the RFC in the following ways:
+    1. Empty nested dictionaries resulting from the patch are treated as None and are pruned
+    2. Attempts to merge lists of dicts using a merge key (default "name").
+       See _merge_lists_of_dicts for details on list merge behavior.
+
+    Args:
+        base: The base object to patch.
+        patch: The patch object.
+        display_name: The name of the patch object for logging purposes.
+
+    Returns:
+        The patched object.
+    """
+    if type(base) is not type(patch):
+        if base is not None:
+            logging.warning(f"Type mismatch while merging {display_name} (base={type(base)}, patch={type(patch)})")
+        return patch
+    elif isinstance(patch, list) and all(isinstance(v, dict) for v in base + patch):
+        # TODO: Should we prune empty lists?
+        return _merge_lists_of_dicts(base, patch, display_name=display_name)
+    elif not isinstance(patch, dict) or len(patch) == 0:
+        return patch
+
+    result = dict(base)  # Shallow copy
+    for key, value in patch.items():
+        if value is None:
+            result.pop(key, None)
+        else:
+            merge_result = merge_patch(result.get(key, None), value, display_name=f"{display_name}.{key}")
+            if isinstance(merge_result, dict) and len(merge_result) == 0:
+                result.pop(key, None)
+            else:
+                result[key] = merge_result
+
+    return result
+
+
+def _merge_lists_of_dicts(
+    base: List[Dict[str, Any]], patch: List[Dict[str, Any]], merge_key: str = "name", display_name: str = ""
+) -> List[Dict[str, Any]]:
+    """
+    Attempts to merge lists of dicts by matching on a merge key (default "name").
+    - If the merge key is missing, the behavior falls back to overwriting the list.
+    - If the merge key is present, the behavior is to match the list elements based on the
+      merge key, preserving any unmatched elements from the base list.
+    - Matched entries may be dropped in the following way(s):
+      1. The matching patch entry has a None key entry, e.g. { "name": "foo", None: None }.
+
+    Args:
+        base: The base list of dicts.
+        patch: The patch list of dicts.
+        merge_key: The key to use for merging.
+        display_name: The name of the patch object for logging purposes.
+
+    Returns:
+        The merged list of dicts if merging successful, else returns the patch list.
+    """
+    if any(merge_key not in d for d in base + patch):
+        logging.warning(f"Missing merge key {merge_key} in {display_name}. Falling back to overwrite behavior.")
+        return patch
+
+    # Build mapping of merge key values to list elements for the base list
+    result = {d[merge_key]: d for d in base}
+    if len(result) != len(base):
+        logging.warning(f"Duplicate merge key {merge_key} in {display_name}. Falling back to overwrite behavior.")
+        return patch
+
+    # Apply patches
+    for d in patch:
+        key = d[merge_key]
+
+        # Removal case 1: `None` key in patch entry
+        if None in d:
+            result.pop(key, None)
+            continue
+
+        # Apply patch
+        if key in result:
+            d = merge_patch(result[key], d, display_name=f"{display_name}[{merge_key}={d[merge_key]}]")
+            # TODO: Should we drop the item if the patch result is empty save for the merge key?
+            # Can check `d.keys() <= {merge_key}`
+        result[key] = d

+    return list(result.values())
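To make the override mechanics concrete, here is a minimal sketch of how merge_patch behaves on a spec-shaped dict. The dict values are illustrative only, not taken from the package, and spec_utils is a private module imported here purely for demonstration:

from snowflake.ml.jobs._utils import spec_utils

# Base spec, heavily abbreviated from what generate_service_spec produces
base = {
    "spec": {
        "containers": [{"name": "main", "env": {"A": "1", "B": "2"}}],
        "volumes": [{"name": "dshm", "source": "memory"}],
    }
}
# List entries are matched to base entries by their "name" key;
# a None value removes the corresponding key from the base.
patch = {"spec": {"containers": [{"name": "main", "env": {"B": None, "C": "3"}}]}}

merged = spec_utils.merge_patch(base, patch, display_name="example")
# env "B" is removed, "C" is added, "A" survives:
# merged["spec"]["containers"] == [{"name": "main", "env": {"A": "1", "C": "3"}}]
# "volumes" is untouched because the patch does not mention it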
snowflake/ml/jobs/_utils/types.py ADDED
@@ -0,0 +1,39 @@
+from dataclasses import dataclass
+from pathlib import PurePath
+from typing import List, Literal, Optional, Union
+
+JOB_STATUS = Literal[
+    "PENDING",
+    "RUNNING",
+    "FAILED",
+    "DONE",
+    "INTERNAL_ERROR",
+]
+
+
+@dataclass(frozen=True)
+class UploadedPayload:
+    # TODO: Include manifest of payload files for validation
+    stage_path: PurePath
+    entrypoint: List[Union[str, PurePath]]
+
+
+@dataclass(frozen=True)
+class ComputeResources:
+    cpu: float  # Number of vCPU cores
+    memory: float  # Memory in GiB
+    gpu: int = 0  # Number of GPUs
+    gpu_type: Optional[str] = None
+
+
+@dataclass(frozen=True)
+class ImageSpec:
+    repo: str
+    image_name: str
+    image_tag: str
+    resource_requests: ComputeResources
+    resource_limits: ComputeResources
+
+    @property
+    def full_name(self) -> str:
+        return f"{self.repo}/{self.image_name}:{self.image_tag}"
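These dataclasses are thin containers; full_name simply assembles the image reference. A quick sketch, where the repo path, image name, and resource sizing are placeholders rather than real package defaults:

from snowflake.ml.jobs._utils import types

resources = types.ComputeResources(cpu=4.0, memory=16.0, gpu=1, gpu_type="A10G")  # hypothetical pool sizing
image = types.ImageSpec(
    repo="/my_db/my_schema/my_repo",  # placeholder image repository
    image_name="mlruntime-gpu",       # placeholder image name
    image_tag="1.2.3",
    resource_requests=resources,
    resource_limits=resources,
)
assert image.full_name == "/my_db/my_schema/my_repo/mlruntime-gpu:1.2.3"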
snowflake/ml/jobs/decorators.py ADDED
@@ -0,0 +1,91 @@
+import copy
+import functools
+import inspect
+from typing import Callable, Dict, List, Optional, TypeVar
+
+from typing_extensions import ParamSpec
+
+from snowflake import snowpark
+from snowflake.ml._internal import telemetry
+from snowflake.ml.jobs import job as jb, manager as jm
+from snowflake.ml.jobs._utils import payload_utils
+
+_PROJECT = "MLJob"
+
+_Args = ParamSpec("_Args")
+_ReturnValue = TypeVar("_ReturnValue")
+
+
+@snowpark._internal.utils.private_preview(version="1.7.4")
+@telemetry.send_api_usage_telemetry(project=_PROJECT)
+def remote(
+    compute_pool: str,
+    stage_name: str,
+    pip_requirements: Optional[List[str]] = None,
+    external_access_integrations: Optional[List[str]] = None,
+    query_warehouse: Optional[str] = None,
+    env_vars: Optional[Dict[str, str]] = None,
+    session: Optional[snowpark.Session] = None,
+) -> Callable[[Callable[_Args, _ReturnValue]], Callable[_Args, jb.MLJob]]:
+    """
+    Submit a job to the compute pool.
+
+    Args:
+        compute_pool: The compute pool to use for the job.
+        stage_name: The name of the stage where the job payload will be uploaded.
+        pip_requirements: A list of pip requirements for the job.
+        external_access_integrations: A list of external access integrations.
+        query_warehouse: The query warehouse to use. Defaults to session warehouse.
+        env_vars: Environment variables to set in container
+        session: The Snowpark session to use. If none specified, uses active session.
+
+    Returns:
+        Decorator that dispatches invocations of the decorated function as remote jobs.
+    """
+
+    def decorator(func: Callable[_Args, _ReturnValue]) -> Callable[_Args, jb.MLJob]:
+        # Copy the function to avoid modifying the original
+        # We need to modify the line number of the function to exclude the
+        # decorator from the copied source code
+        wrapped_func = copy.copy(func)
+        wrapped_func.__code__ = wrapped_func.__code__.replace(co_firstlineno=func.__code__.co_firstlineno + 1)
+
+        # Validate function arguments based on signature
+        signature = inspect.signature(func)
+        pos_arg_names = []
+        for name, param in signature.parameters.items():
+            param_type = payload_utils.get_parameter_type(param)
+            if param_type is not None:
+                payload_utils.validate_parameter_type(param_type, name)
+            if param.kind in (param.POSITIONAL_ONLY, param.POSITIONAL_OR_KEYWORD):
+                pos_arg_names.append(name)
+
+        @functools.wraps(func)
+        def wrapper(*args: _Args.args, **kwargs: _Args.kwargs) -> jb.MLJob:
+            # Validate positional args
+            for i, arg in enumerate(args):
+                arg_name = pos_arg_names[i] if i < len(pos_arg_names) else f"args[{i}]"
+                payload_utils.validate_parameter_type(type(arg), arg_name)
+
+            # Validate keyword args
+            for k, v in kwargs.items():
+                payload_utils.validate_parameter_type(type(v), k)
+
+            arg_list = [str(v) for v in args] + [x for k, v in kwargs.items() for x in (f"--{k}", str(v))]
+            job = jm._submit_job(
+                source=wrapped_func,
+                args=arg_list,
+                stage_name=stage_name,
+                compute_pool=compute_pool,
+                pip_requirements=pip_requirements,
+                external_access_integrations=external_access_integrations,
+                query_warehouse=query_warehouse,
+                env_vars=env_vars,
+                session=session,
+            )
+            assert isinstance(job, jb.MLJob)
+            return job
+
+        return wrapper
+
+    return decorator
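Based on the signature above, usage would look roughly like the sketch below. The pool, stage, and function body are hypothetical, and the import assumes remote is re-exported from snowflake.ml.jobs (the new __init__.py in this diff suggests so):

from snowflake.ml.jobs import remote

@remote("MY_COMPUTE_POOL", stage_name="payload_stage")  # hypothetical pool and stage names
def train(data_table: str, epochs: int = 10) -> None:
    print(f"Training on {data_table} for {epochs} epochs")

# Calling the decorated function submits a job instead of running locally;
# positional args are forwarded as strings, kwargs become "--epochs 5" style CLI args.
job = train("MY_DB.PUBLIC.TRAINING_DATA", epochs=5)  # returns an MLJob handle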
snowflake/ml/jobs/job.py ADDED
@@ -0,0 +1,113 @@
+import time
+from typing import Any, List, Optional, cast
+
+from snowflake import snowpark
+from snowflake.ml._internal import telemetry
+from snowflake.ml.jobs._utils import constants, types
+from snowflake.snowpark.context import get_active_session
+
+_PROJECT = "MLJob"
+TERMINAL_JOB_STATUSES = {"FAILED", "DONE", "INTERNAL_ERROR"}
+
+
+class MLJob:
+    def __init__(self, id: str, session: Optional[snowpark.Session] = None) -> None:
+        self._id = id
+        self._session = session or get_active_session()
+        self._status: types.JOB_STATUS = "PENDING"
+
+    @property
+    def id(self) -> str:
+        """Get the unique job ID"""
+        return self._id
+
+    @property
+    def status(self) -> types.JOB_STATUS:
+        """Get the job's execution status."""
+        if self._status not in TERMINAL_JOB_STATUSES:
+            # Query backend for job status if not in terminal state
+            self._status = _get_status(self._session, self.id)
+        return self._status
+
+    @snowpark._internal.utils.private_preview(version="1.7.4")
+    def get_logs(self, limit: int = -1) -> str:
+        """
+        Return the job's execution logs.
+
+        Args:
+            limit: The maximum number of lines to return. Negative values are treated as no limit.
+
+        Returns:
+            The job's execution logs.
+        """
+        logs = _get_logs(self._session, self.id, limit)
+        assert isinstance(logs, str)  # mypy
+        return logs
+
+    @snowpark._internal.utils.private_preview(version="1.7.4")
+    def show_logs(self, limit: int = -1) -> None:
+        """
+        Display the job's execution logs.
+
+        Args:
+            limit: The maximum number of lines to display. Negative values are treated as no limit.
+        """
+        print(self.get_logs(limit))  # noqa: T201: we need to print here.
+
+    @snowpark._internal.utils.private_preview(version="1.7.4")
+    @telemetry.send_api_usage_telemetry(project=_PROJECT)
+    def wait(self, timeout: float = -1) -> types.JOB_STATUS:
+        """
+        Block until completion. Returns completion status.
+
+        Args:
+            timeout: The maximum time to wait in seconds. Negative values are treated as no timeout.
+
+        Returns:
+            The job's completion status.
+
+        Raises:
+            TimeoutError: If the job does not complete within the specified timeout.
+        """
+        delay = constants.JOB_POLL_INITIAL_DELAY_SECONDS  # Start with 100ms delay
+        start_time = time.monotonic()
+        while self.status not in TERMINAL_JOB_STATUSES:
+            if timeout >= 0 and (elapsed := time.monotonic() - start_time) >= timeout:
+                raise TimeoutError(f"Job {self.id} did not complete within {elapsed} seconds")
+            time.sleep(delay)
+            delay = min(delay * 2, constants.JOB_POLL_MAX_DELAY_SECONDS)  # Exponential backoff
+        return self.status
+
+
+@telemetry.send_api_usage_telemetry(project=_PROJECT)
+def _get_status(session: snowpark.Session, job_id: str) -> types.JOB_STATUS:
+    """Retrieve job execution status."""
+    # TODO: snowflake-snowpark-python<1.24.0 shows spurious error messages on
+    # `DESCRIBE` queries with bind variables
+    # Switch to use bind variables instead of client side formatting after
+    # updating to snowflake-snowpark-python>=1.24.0
+    (row,) = session.sql(f"DESCRIBE SERVICE {job_id}").collect()
+    return cast(types.JOB_STATUS, row["status"])
+
+
+@telemetry.send_api_usage_telemetry(project=_PROJECT)
+def _get_logs(session: snowpark.Session, job_id: str, limit: int = -1) -> str:
+    """
+    Retrieve the job's execution logs.
+
+    Args:
+        session: The Snowpark session to use. If none specified, uses active session.
+        job_id: The job ID.
+        limit: The maximum number of lines to return. Negative values are treated as no limit.
+
+    Returns:
+        The job's execution logs.
+    """
+    params: List[Any] = [job_id]
+    if limit > 0:
+        params.append(limit)
+    (row,) = session.sql(
+        f"SELECT SYSTEM$GET_SERVICE_LOGS(?, 0, '{constants.DEFAULT_CONTAINER_NAME}'{', ?' if limit > 0 else ''})",
+        params=params,
+    ).collect()
+    return str(row[0])
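Putting the pieces together, a minimal monitoring loop might look like this. The job ID is a placeholder; MLJob simply wraps the service of that name in the active session:

from snowflake.ml.jobs.job import MLJob

job = MLJob("MLJOB_ABC123")  # placeholder ID; resolves the active Snowpark session
try:
    status = job.wait(timeout=600)  # poll with exponential backoff for up to 10 minutes
except TimeoutError:
    status = job.status
print(f"Job {job.id} finished with status {status}")
job.show_logs(limit=100)  # print up to the last 100 log lines from the container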