snowflake-ml-python 1.9.1__py3-none-any.whl → 1.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. snowflake/ml/_internal/utils/mixins.py +6 -4
  2. snowflake/ml/_internal/utils/service_logger.py +118 -4
  3. snowflake/ml/data/_internal/arrow_ingestor.py +4 -1
  4. snowflake/ml/data/data_connector.py +4 -34
  5. snowflake/ml/dataset/dataset.py +1 -1
  6. snowflake/ml/dataset/dataset_reader.py +2 -8
  7. snowflake/ml/experiment/__init__.py +3 -0
  8. snowflake/ml/experiment/callback/lightgbm.py +55 -0
  9. snowflake/ml/experiment/callback/xgboost.py +63 -0
  10. snowflake/ml/experiment/utils.py +14 -0
  11. snowflake/ml/jobs/_utils/constants.py +15 -4
  12. snowflake/ml/jobs/_utils/payload_utils.py +159 -52
  13. snowflake/ml/jobs/_utils/scripts/constants.py +0 -22
  14. snowflake/ml/jobs/_utils/scripts/mljob_launcher.py +126 -23
  15. snowflake/ml/jobs/_utils/spec_utils.py +1 -1
  16. snowflake/ml/jobs/_utils/stage_utils.py +30 -14
  17. snowflake/ml/jobs/_utils/types.py +64 -4
  18. snowflake/ml/jobs/job.py +22 -6
  19. snowflake/ml/jobs/manager.py +5 -3
  20. snowflake/ml/model/_client/model/model_version_impl.py +56 -48
  21. snowflake/ml/model/_client/ops/service_ops.py +194 -14
  22. snowflake/ml/model/_client/sql/service.py +1 -38
  23. snowflake/ml/model/_packager/model_handlers/sklearn.py +9 -5
  24. snowflake/ml/model/_packager/model_runtime/_snowml_inference_alternative_requirements.py +1 -0
  25. snowflake/ml/model/_signatures/pandas_handler.py +3 -0
  26. snowflake/ml/model/_signatures/utils.py +4 -0
  27. snowflake/ml/model/event_handler.py +87 -18
  28. snowflake/ml/model/model_signature.py +2 -0
  29. snowflake/ml/model/models/huggingface_pipeline.py +71 -49
  30. snowflake/ml/model/type_hints.py +26 -1
  31. snowflake/ml/registry/_manager/model_manager.py +30 -35
  32. snowflake/ml/registry/_manager/model_parameter_reconciler.py +105 -0
  33. snowflake/ml/registry/registry.py +0 -19
  34. snowflake/ml/version.py +1 -1
  35. {snowflake_ml_python-1.9.1.dist-info → snowflake_ml_python-1.10.0.dist-info}/METADATA +542 -491
  36. {snowflake_ml_python-1.9.1.dist-info → snowflake_ml_python-1.10.0.dist-info}/RECORD +39 -34
  37. {snowflake_ml_python-1.9.1.dist-info → snowflake_ml_python-1.10.0.dist-info}/WHEEL +0 -0
  38. {snowflake_ml_python-1.9.1.dist-info → snowflake_ml_python-1.10.0.dist-info}/licenses/LICENSE.txt +0 -0
  39. {snowflake_ml_python-1.9.1.dist-info → snowflake_ml_python-1.10.0.dist-info}/top_level.txt +0 -0

snowflake/ml/_internal/utils/mixins.py
@@ -21,10 +21,12 @@ class SerializableSessionMixin:
 
     def __getstate__(self) -> dict[str, Any]:
         """Customize pickling to exclude non-serializable session and related components."""
-        if hasattr(super(), "__getstate__"):
-            state: dict[str, Any] = super().__getstate__()  # type: ignore[misc]
-        else:
-            state = self.__dict__.copy()
+        parent_state = (
+            super().__getstate__()  # type: ignore[misc] # object.__getstate__ appears in 3.11
+            if hasattr(super(), "__getstate__")
+            else self.__dict__
+        )
+        state = dict(parent_state)  # Create a copy so we can safely modify the state
 
         # Save session metadata for validation during unpickling
         session = state.pop(_SESSION_KEY, None)
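
On Python 3.11+, `object.__getstate__` exists, so the old `hasattr` branch started returning the parent's state directly; copying it with `dict(...)` before popping keys keeps the live `__dict__` unmodified. A minimal standalone sketch of the pattern (simplified; `_SESSION_KEY` and the real mixin's metadata handling are reduced to a single pop):

    import pickle
    from typing import Any

    _SESSION_KEY = "_session"  # same key name the mixin uses

    class PickleSafe:
        def __getstate__(self) -> dict[str, Any]:
            # object.__getstate__ appears in Python 3.11; fall back to __dict__ before that
            parent_state = (
                super().__getstate__() if hasattr(super(), "__getstate__") else self.__dict__
            )
            state = dict(parent_state)  # copy so the pop below cannot mutate self.__dict__
            state.pop(_SESSION_KEY, None)  # drop the session before pickling
            return state

    obj = PickleSafe()
    obj._session = object()  # stand-in for a live session
    obj.name = "example"
    restored = pickle.loads(pickle.dumps(obj))
    assert restored.name == "example" and not hasattr(restored, "_session")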

snowflake/ml/_internal/utils/service_logger.py
@@ -1,6 +1,22 @@
 import enum
 import logging
+import os
 import sys
+import tempfile
+import time
+import uuid
+from typing import Optional
+
+import platformdirs
+
+# Module-level logger for operational messages that should appear on console
+stdout_handler = logging.StreamHandler(sys.stdout)
+stdout_handler.setFormatter(logging.Formatter("%(message)s"))
+
+console_logger = logging.getLogger(__name__)
+console_logger.addHandler(stdout_handler)
+console_logger.setLevel(logging.INFO)
+console_logger.propagate = False
 
 
 class LogColor(enum.Enum):
@@ -57,9 +73,107 @@ class CustomFormatter(logging.Formatter):
         return "\n".join(formatted_lines)
 
 
-def get_logger(logger_name: str, info_color: LogColor) -> logging.Logger:
+def _test_writability(directory: str) -> bool:
+    """Test if a directory is writable by creating and removing a test file."""
+    try:
+        os.makedirs(directory, exist_ok=True)
+        test_file = os.path.join(directory, f".write_test_{uuid.uuid4().hex[:8]}")
+        with open(test_file, "w") as f:
+            f.write("test")
+        os.remove(test_file)
+        return True
+    except OSError:
+        return False
+
+
+def _try_log_location(log_dir: str, operation_id: str) -> Optional[str]:
+    """Try to create a log file in the given directory if it's writable."""
+    if _test_writability(log_dir):
+        return os.path.join(log_dir, f"{operation_id}.log")
+    return None
+
+
+def _get_log_file_path(operation_id: str) -> Optional[str]:
+    """Get platform-independent log file path. Returns None if no writable location found."""
+    # Try locations in order of preference
+    locations = [
+        # Primary: User log directory
+        platformdirs.user_log_dir("snowflake-ml", "Snowflake"),
+        # Fallback 1: System temp directory
+        os.path.join(tempfile.gettempdir(), "snowflake-ml-logs"),
+        # Fallback 2: Current working directory
+        ".",
+    ]
+
+    for location in locations:
+        log_file_path = _try_log_location(location, operation_id)
+        if log_file_path:
+            return log_file_path
+
+    # No writable location found
+    return None
+
+
+def _get_or_create_parent_logger(operation_id: str) -> logging.Logger:
+    """Get or create a parent logger with FileHandler for the operation."""
+    parent_logger_name = f"snowflake_ml_operation_{operation_id}"
+    parent_logger = logging.getLogger(parent_logger_name)
+    parent_logger.setLevel(logging.DEBUG)
+    parent_logger.propagate = False
+
+    if not parent_logger.handlers:
+        log_file_path = _get_log_file_path(operation_id)
+
+        if log_file_path:
+            try:
+                file_handler = logging.FileHandler(log_file_path)
+                file_handler.setFormatter(logging.Formatter("%(name)s [%(asctime)s] [%(levelname)s] %(message)s"))
+                parent_logger.addHandler(file_handler)
+
+                console_logger.info(f"create_service logs saved to: {log_file_path}")
+            except OSError as e:
+                console_logger.warning(f"Could not create log file at {log_file_path}: {e}.")
+        else:
+            # No writable location found, use console-only logging
+            console_logger.warning("No writable location found for create_service log file.")
+
+        if logging.getLogger().level > logging.INFO:
+            console_logger.info(
+                "To see logs in console, set log level to INFO: logging.getLogger().setLevel(logging.INFO)"
+            )
+
+    return parent_logger
+
+
+def get_logger(logger_name: str, info_color: LogColor, operation_id: Optional[str] = None) -> logging.Logger:
     logger = logging.getLogger(logger_name)
-    handler = logging.StreamHandler(sys.stdout)
-    handler.setFormatter(CustomFormatter(info_color))
-    logger.addHandler(handler)
+    root_logger = logging.getLogger()
+
+    # If operation_id provided, set up parent logger with file handler
+    if operation_id:
+        parent_logger = _get_or_create_parent_logger(operation_id)
+        logger.parent = parent_logger
+        logger.propagate = True
+
+        if root_logger.level <= logging.INFO:
+            handler = logging.StreamHandler(sys.stdout)
+            handler.setFormatter(CustomFormatter(info_color))
+            logger.addHandler(handler)
+    else:
+        # No operation_id - add console handler only if user wants verbose logging
+        if root_logger.level <= logging.INFO and not logger.handlers:
+            handler = logging.StreamHandler(sys.stdout)
+            handler.setFormatter(CustomFormatter(info_color))
+            logger.addHandler(handler)
+
     return logger
+
+
+def get_operation_id() -> str:
+    """Generate a unique operation ID."""
+    return f"model_deploy_{uuid.uuid4().hex[:8]}_{int(time.time())}"
+
+
+def get_log_file_location(operation_id: str) -> Optional[str]:
+    """Get the log file path for an operation ID. Returns None if no writable location available."""
+    return _get_log_file_path(operation_id)
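
End to end, these helpers give each long-running operation its own log file plus minimal console output. A hedged usage sketch (the `LogColor` member name is an assumption, since the enum's values are outside this hunk, and the file location is platform-dependent):

    from snowflake.ml._internal.utils import service_logger

    op_id = service_logger.get_operation_id()  # e.g. "model_deploy_1a2b3c4d_1758000000"
    logger = service_logger.get_logger(
        "snowflake.ml.service_example", service_logger.LogColor.GREEN, operation_id=op_id
    )
    logger.info("deployment started")  # colored on console (at INFO), plain in the file
    print(service_logger.get_log_file_location(op_id))  # same path reported by console_logger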

snowflake/ml/data/_internal/arrow_ingestor.py
@@ -14,6 +14,7 @@ if TYPE_CHECKING:
     import ray
 
 from snowflake import snowpark
+from snowflake.ml._internal.utils import mixins
 from snowflake.ml.data import data_ingestor, data_source, ingestor_utils
 
 _EMPTY_RECORD_BATCH = pa.RecordBatch.from_arrays([], [])
@@ -44,7 +45,7 @@ class _RecordBatchesBuffer:
         return popped
 
 
-class ArrowIngestor(data_ingestor.DataIngestor):
+class ArrowIngestor(data_ingestor.DataIngestor, mixins.SerializableSessionMixin):
     """Read and parse the data sources into an Arrow Dataset and yield batched numpy array in dict."""
 
     def __init__(
@@ -71,6 +72,8 @@ class ArrowIngestor(data_ingestor.DataIngestor):
 
     @classmethod
    def from_sources(cls, session: snowpark.Session, sources: Sequence[data_source.DataSource]) -> "ArrowIngestor":
+        if session is None:
+            raise ValueError("Session is required")
         return cls(session, sources)
 
     @classmethod
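
The new guard makes a missing session fail at construction time instead of surfacing later during ingestion:

    from snowflake.ml.data._internal.arrow_ingestor import ArrowIngestor

    try:
        ArrowIngestor.from_sources(None, [])  # type: ignore[arg-type]
    except ValueError as exc:
        print(exc)  # Session is required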

snowflake/ml/data/data_connector.py
@@ -6,10 +6,9 @@ from typing_extensions import deprecated
 
 from snowflake import snowpark
 from snowflake.ml._internal import env, telemetry
-from snowflake.ml._internal.utils import mixins
 from snowflake.ml.data import data_ingestor, data_source
 from snowflake.ml.data._internal.arrow_ingestor import ArrowIngestor
-from snowflake.snowpark import context as sf_context
+from snowflake.snowpark import context as sp_context
 
 if TYPE_CHECKING:
     import pandas as pd
@@ -22,13 +21,11 @@ if TYPE_CHECKING:
     from snowflake.ml import dataset
 
 _PROJECT = "DataConnector"
-_INGESTOR_KEY = "_ingestor"
-_INGESTOR_SOURCES_KEY = "ingestor$sources"
 
 DataConnectorType = TypeVar("DataConnectorType", bound="DataConnector")
 
 
-class DataConnector(mixins.SerializableSessionMixin):
+class DataConnector:
     """Snowflake data reader which provides application integration connectors"""
 
     DEFAULT_INGESTOR_CLASS: type[data_ingestor.DataIngestor] = ArrowIngestor
@@ -36,11 +33,8 @@ class DataConnector(mixins.SerializableSessionMixin):
     def __init__(
         self,
         ingestor: data_ingestor.DataIngestor,
-        *,
-        session: Optional[snowpark.Session] = None,
         **kwargs: Any,
     ) -> None:
-        self._session = session
         self._ingestor = ingestor
         self._kwargs = kwargs
 
@@ -63,7 +57,7 @@ class DataConnector(mixins.SerializableSessionMixin):
         ingestor_class: Optional[type[data_ingestor.DataIngestor]] = None,
         **kwargs: Any,
     ) -> DataConnectorType:
-        session = session or sf_context.get_active_session()
+        session = session or sp_context.get_active_session()
         source = data_source.DataFrameInfo(query)
         return cls.from_sources(session, [source], ingestor_class=ingestor_class, **kwargs)
 
@@ -107,31 +101,7 @@ class DataConnector(mixins.SerializableSessionMixin):
     ) -> DataConnectorType:
         ingestor_class = ingestor_class or cls.DEFAULT_INGESTOR_CLASS
         ingestor = ingestor_class.from_sources(session, sources)
-        return cls(ingestor, **kwargs, session=session)
-
-    def __getstate__(self) -> dict[str, Any]:
-        """Customize pickling to exclude non-serializable session and related components."""
-        if hasattr(super(), "__getstate__"):
-            state = super().__getstate__()
-        else:
-            state = self.__dict__.copy()
-
-        ingestor = state.pop(_INGESTOR_KEY)
-        state[_INGESTOR_SOURCES_KEY] = ingestor.data_sources
-
-        return state
-
-    def __setstate__(self, state: dict[str, Any]) -> None:
-        """Restore session from context during unpickling."""
-        data_sources = state.pop(_INGESTOR_SOURCES_KEY)
-
-        if hasattr(super(), "__setstate__"):
-            super().__setstate__(state)
-        else:
-            self.__dict__.update(state)
-
-        assert self._session is not None
-        self._ingestor = self.DEFAULT_INGESTOR_CLASS.from_sources(self._session, data_sources)
+        return cls(ingestor, **kwargs)
 
     @property
     def data_sources(self) -> list[data_source.DataSource]:
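
Net effect of the arrow_ingestor and data_connector changes: DataConnector no longer carries a session or custom pickling; both now come from the ingestor, which inherits SerializableSessionMixin. A hedged round-trip sketch (`session` is an assumed existing Snowpark session, and the classmethod name `from_sql` is inferred, since the hunk above shows only its body):

    import pickle

    from snowflake.ml.data.data_connector import DataConnector

    dc = DataConnector.from_sql("SELECT * FROM MY_TABLE", session=session)
    payload = pickle.dumps(dc)        # the mixin excludes the session from the ingestor's state
    restored = pickle.loads(payload)  # the mixin restores the session from the active context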

snowflake/ml/dataset/dataset.py
@@ -177,7 +177,7 @@ class Dataset(lineage_node.LineageNode):
                 original_exception=RuntimeError("No Dataset version selected."),
             )
         if self._reader is None:
-            self._reader = dataset_reader.DatasetReader.from_dataset(self)
+            self._reader = dataset_reader.DatasetReader.from_dataset(self, snowpark_session=self._session)
         return self._reader
 
     @staticmethod

snowflake/ml/dataset/dataset_reader.py
@@ -1,5 +1,4 @@
 from typing import Any, Optional
-from warnings import warn
 
 from snowflake import snowpark
 from snowflake.ml._internal import telemetry
@@ -21,16 +20,11 @@ class DatasetReader(data_connector.DataConnector, mixins.SerializableSessionMixi
         self,
         ingestor: data_ingestor.DataIngestor,
         *,
-        session: snowpark.Session,
         snowpark_session: Optional[snowpark.Session] = None,
     ) -> None:
-        if snowpark_session is not None:
-            warn(
-                "Argument snowpark_session is deprecated and will be removed in a future release. Use session instead."
-            )
-            session = snowpark_session
-        super().__init__(ingestor, session=session)
+        super().__init__(ingestor)
 
+        self._session = snowpark_session
         self._fs_cached: Optional[snowfs.SnowFileSystem] = None
         self._files: Optional[list[str]] = None
 

snowflake/ml/experiment/__init__.py
@@ -0,0 +1,3 @@
+from snowflake.ml.experiment.experiment_tracking import ExperimentTracking
+
+__all__ = ["ExperimentTracking"]
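
With the new initializer, the tracker is importable from the package root rather than only from its defining module:

    from snowflake.ml.experiment import ExperimentTracking  # same class as the deep import above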

snowflake/ml/experiment/callback/lightgbm.py
@@ -0,0 +1,55 @@
+from typing import TYPE_CHECKING, Optional
+from warnings import warn
+
+import lightgbm as lgb
+
+if TYPE_CHECKING:
+    from snowflake.ml.experiment.experiment_tracking import ExperimentTracking
+    from snowflake.ml.model.model_signature import ModelSignature
+
+
+class SnowflakeLightgbmCallback(lgb.callback._RecordEvaluationCallback):
+    def __init__(
+        self,
+        experiment_tracking: "ExperimentTracking",
+        log_model: bool = True,
+        log_metrics: bool = True,
+        log_params: bool = True,
+        model_name: Optional[str] = None,
+        model_signature: Optional["ModelSignature"] = None,
+    ) -> None:
+        self._experiment_tracking = experiment_tracking
+        self.log_model = log_model
+        self.log_metrics = log_metrics
+        self.log_params = log_params
+        self.model_name = model_name
+        self.model_signature = model_signature
+
+        super().__init__(eval_result={})
+
+    def __call__(self, env: lgb.callback.CallbackEnv) -> None:
+        if self.log_params:
+            if env.iteration == env.begin_iteration:  # Log params only at the first iteration
+                self._experiment_tracking.log_params(env.params)
+
+        if self.log_metrics:
+            super().__call__(env)
+            for dataset_name, metrics in self.eval_result.items():
+                for metric_name, log in metrics.items():
+                    metric_key = dataset_name + ":" + metric_name
+                    self._experiment_tracking.log_metric(key=metric_key, value=log[-1], step=env.iteration)
+
+        if self.log_model:
+            if env.iteration == env.end_iteration - 1:  # Log model only at the last iteration
+                if self.model_signature:
+                    model_name = self.model_name or self._experiment_tracking._get_or_set_experiment().name + "_model"
+                    self._experiment_tracking.log_model(  # type: ignore[call-arg]
+                        model=env.model,
+                        model_name=model_name,
+                        signatures={"predict": self.model_signature},
+                    )
+                else:
+                    warn(
+                        "Model will not be logged because model signature is missing. To autolog the model, "
+                        "please specify `model_signature` when constructing SnowflakeLightgbmCallback."
+                    )
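
For orientation, a hedged sketch of wiring the callback into lgb.train; the ExperimentTracking constructor arguments, the training DataFrames, and the infer_signature call are assumptions, not part of this diff:

    import lightgbm as lgb

    from snowflake.ml.experiment import ExperimentTracking
    from snowflake.ml.experiment.callback.lightgbm import SnowflakeLightgbmCallback
    from snowflake.ml.model.model_signature import infer_signature

    exp = ExperimentTracking(session)  # assumed: an existing Snowpark session
    exp.set_experiment("lgbm_demo")

    callback = SnowflakeLightgbmCallback(
        experiment_tracking=exp,
        model_signature=infer_signature(X_train, y_train),  # required for model autologging
    )
    booster = lgb.train(
        {"objective": "regression", "metric": "l2"},
        lgb.Dataset(X_train, label=y_train),
        valid_sets=[lgb.Dataset(X_valid, label=y_valid)],
        valid_names=["valid"],  # metrics are logged under keys like "valid:l2"
        callbacks=[callback],
    )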

snowflake/ml/experiment/callback/xgboost.py
@@ -0,0 +1,63 @@
+import json
+from typing import TYPE_CHECKING, Any, Optional
+from warnings import warn
+
+import xgboost as xgb
+
+from snowflake.ml.experiment import utils
+
+if TYPE_CHECKING:
+    from snowflake.ml.experiment.experiment_tracking import ExperimentTracking
+    from snowflake.ml.model.model_signature import ModelSignature
+
+
+class SnowflakeXgboostCallback(xgb.callback.TrainingCallback):
+    def __init__(
+        self,
+        experiment_tracking: "ExperimentTracking",
+        log_model: bool = True,
+        log_metrics: bool = True,
+        log_params: bool = True,
+        model_name: Optional[str] = None,
+        model_signature: Optional["ModelSignature"] = None,
+    ) -> None:
+        self._experiment_tracking = experiment_tracking
+        self.log_model = log_model
+        self.log_metrics = log_metrics
+        self.log_params = log_params
+        self.model_name = model_name
+        self.model_signature = model_signature
+
+    def before_training(self, model: xgb.Booster) -> xgb.Booster:
+        if self.log_params:
+            params = json.loads(model.save_config())
+            self._experiment_tracking.log_params(utils.flatten_nested_params(params))
+
+        return model
+
+    def after_iteration(self, model: Any, epoch: int, evals_log: dict[str, dict[str, Any]]) -> bool:
+        if self.log_metrics:
+            for dataset_name, metrics in evals_log.items():
+                for metric_name, log in metrics.items():
+                    metric_key = dataset_name + ":" + metric_name
+                    self._experiment_tracking.log_metric(key=metric_key, value=log[-1], step=epoch)
+
+        return False
+
+    def after_training(self, model: xgb.Booster) -> xgb.Booster:
+        if self.log_model:
+            if not self.model_signature:
+                warn(
+                    "Model will not be logged because model signature is missing. "
+                    "To autolog the model, please specify `model_signature` when constructing SnowflakeXgboostCallback."
+                )
+                return model
+
+            model_name = self.model_name or self._experiment_tracking._get_or_set_experiment().name + "_model"
+            self._experiment_tracking.log_model(  # type: ignore[call-arg]
+                model=model,
+                model_name=model_name,
+                signatures={"predict": self.model_signature},
+            )
+
+        return model
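
A matching sketch for XGBoost, under the same assumptions as the LightGBM example:

    import xgboost as xgb

    from snowflake.ml.experiment import ExperimentTracking
    from snowflake.ml.experiment.callback.xgboost import SnowflakeXgboostCallback
    from snowflake.ml.model.model_signature import infer_signature

    exp = ExperimentTracking(session)
    exp.set_experiment("xgb_demo")

    callback = SnowflakeXgboostCallback(
        experiment_tracking=exp,
        model_signature=infer_signature(X_train, y_train),
    )
    booster = xgb.train(
        {"objective": "reg:squarederror"},
        xgb.DMatrix(X_train, label=y_train),
        evals=[(xgb.DMatrix(X_valid, label=y_valid), "valid")],  # evals_log keys become "valid:<metric>"
        callbacks=[callback],
    )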

snowflake/ml/experiment/utils.py
@@ -0,0 +1,14 @@
+from typing import Any, Union
+
+
+def flatten_nested_params(params: Union[list[Any], dict[str, Any]], prefix: str = "") -> dict[str, Any]:
+    flat_params = {}
+    items = params.items() if isinstance(params, dict) else enumerate(params)
+    for key, value in items:
+        key = str(key).replace(".", "_")  # Replace dots in keys to avoid collisions involving nested keys
+        new_prefix = f"{prefix}.{key}" if prefix else key
+        if isinstance(value, (dict, list)):
+            flat_params.update(flatten_nested_params(value, new_prefix))
+        else:
+            flat_params[new_prefix] = value
+    return flat_params
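
Behavior sketch: nested dicts and lists flatten into dot-joined keys (list indices become key segments), and dots inside original keys are first rewritten to underscores:

    params = {"learner": {"eta": [0.3, 0.1], "tree.method": "hist"}}
    flatten_nested_params(params)
    # {'learner.eta.0': 0.3, 'learner.eta.1': 0.1, 'learner.tree_method': 'hist'}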

snowflake/ml/jobs/_utils/constants.py
@@ -6,10 +6,23 @@ DEFAULT_CONTAINER_NAME = "main"
 PAYLOAD_DIR_ENV_VAR = "MLRS_PAYLOAD_DIR"
 RESULT_PATH_ENV_VAR = "MLRS_RESULT_PATH"
 MIN_INSTANCES_ENV_VAR = "MLRS_MIN_INSTANCES"
+TARGET_INSTANCES_ENV_VAR = "SNOWFLAKE_JOBS_COUNT"
 RUNTIME_IMAGE_TAG_ENV_VAR = "MLRS_CONTAINER_IMAGE_TAG"
 MEMORY_VOLUME_NAME = "dshm"
 STAGE_VOLUME_NAME = "stage-volume"
-STAGE_VOLUME_MOUNT_PATH = "/mnt/app"
+# Base mount path
+STAGE_VOLUME_MOUNT_PATH = "/mnt/job_stage"
+
+# Stage subdirectory paths
+APP_STAGE_SUBPATH = "app"
+SYSTEM_STAGE_SUBPATH = "system"
+OUTPUT_STAGE_SUBPATH = "output"
+
+# Complete mount paths (automatically generated from base + subpath)
+APP_MOUNT_PATH = f"{STAGE_VOLUME_MOUNT_PATH}/{APP_STAGE_SUBPATH}"
+SYSTEM_MOUNT_PATH = f"{STAGE_VOLUME_MOUNT_PATH}/{SYSTEM_STAGE_SUBPATH}"
+OUTPUT_MOUNT_PATH = f"{STAGE_VOLUME_MOUNT_PATH}/{OUTPUT_STAGE_SUBPATH}"
+
 
 # Default container image information
 DEFAULT_IMAGE_REPO = "/snowflake/images/snowflake_images"
@@ -46,9 +59,7 @@ ENABLE_HEALTH_CHECKS = "false"
 JOB_POLL_INITIAL_DELAY_SECONDS = 0.1
 JOB_POLL_MAX_DELAY_SECONDS = 30
 
-# Magic attributes
-IS_MLJOB_REMOTE_ATTR = "_is_mljob_remote_callable"
-RESULT_PATH_DEFAULT_VALUE = "mljob_result.pkl"
+RESULT_PATH_DEFAULT_VALUE = f"{OUTPUT_MOUNT_PATH}/mljob_result.pkl"
 
 # Log start and end messages
 LOG_START_MSG = "--------------------------------\nML job started\n--------------------------------"
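
Evaluated, the new constants resolve to:

    APP_MOUNT_PATH             == "/mnt/job_stage/app"
    SYSTEM_MOUNT_PATH          == "/mnt/job_stage/system"
    OUTPUT_MOUNT_PATH          == "/mnt/job_stage/output"
    RESULT_PATH_DEFAULT_VALUE  == "/mnt/job_stage/output/mljob_result.pkl"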