snowflake-ml-python 1.5.4__py3-none-any.whl → 1.6.0__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between the versions exactly as they appear in the public registry.
- snowflake/cortex/__init__.py +2 -0
- snowflake/cortex/_classify_text.py +36 -0
- snowflake/cortex/_complete.py +67 -10
- snowflake/cortex/_util.py +4 -4
- snowflake/ml/_internal/lineage/lineage_utils.py +4 -4
- snowflake/ml/_internal/telemetry.py +12 -2
- snowflake/ml/data/_internal/arrow_ingestor.py +228 -0
- snowflake/ml/data/_internal/ingestor_utils.py +58 -0
- snowflake/ml/data/data_connector.py +133 -0
- snowflake/ml/data/data_ingestor.py +28 -0
- snowflake/ml/data/data_source.py +23 -0
- snowflake/ml/dataset/dataset.py +1 -13
- snowflake/ml/dataset/dataset_reader.py +18 -118
- snowflake/ml/feature_store/access_manager.py +7 -1
- snowflake/ml/feature_store/entity.py +19 -2
- snowflake/ml/feature_store/examples/citibike_trip_features/entities.py +20 -0
- snowflake/ml/feature_store/examples/citibike_trip_features/features/station_feature.py +31 -0
- snowflake/ml/feature_store/examples/citibike_trip_features/features/trip_feature.py +24 -0
- snowflake/ml/feature_store/examples/citibike_trip_features/source.yaml +4 -0
- snowflake/ml/feature_store/examples/example_helper.py +240 -0
- snowflake/ml/feature_store/examples/new_york_taxi_features/entities.py +12 -0
- snowflake/ml/feature_store/examples/new_york_taxi_features/features/dropoff_features.py +39 -0
- snowflake/ml/feature_store/examples/new_york_taxi_features/features/pickup_features.py +58 -0
- snowflake/ml/feature_store/examples/new_york_taxi_features/source.yaml +5 -0
- snowflake/ml/feature_store/examples/source_data/citibike_trips.yaml +36 -0
- snowflake/ml/feature_store/examples/source_data/fraud_transactions.yaml +29 -0
- snowflake/ml/feature_store/examples/source_data/nyc_yellow_trips.yaml +4 -0
- snowflake/ml/feature_store/examples/source_data/winequality_red.yaml +32 -0
- snowflake/ml/feature_store/examples/wine_quality_features/entities.py +14 -0
- snowflake/ml/feature_store/examples/wine_quality_features/features/managed_wine_features.py +29 -0
- snowflake/ml/feature_store/examples/wine_quality_features/features/static_wine_features.py +21 -0
- snowflake/ml/feature_store/examples/wine_quality_features/source.yaml +5 -0
- snowflake/ml/feature_store/feature_store.py +579 -53
- snowflake/ml/feature_store/feature_view.py +168 -5
- snowflake/ml/fileset/stage_fs.py +18 -10
- snowflake/ml/lineage/lineage_node.py +1 -1
- snowflake/ml/model/_deploy_client/image_builds/inference_server/main.py +2 -3
- snowflake/ml/model/_model_composer/model_composer.py +11 -14
- snowflake/ml/model/_model_composer/model_manifest/model_manifest.py +24 -16
- snowflake/ml/model/_model_composer/model_manifest/model_manifest_schema.py +2 -1
- snowflake/ml/model/_model_composer/model_method/function_generator.py +3 -3
- snowflake/ml/model/_model_composer/model_method/infer_function.py_template +3 -32
- snowflake/ml/model/_model_composer/model_method/infer_partitioned.py_template +3 -27
- snowflake/ml/model/_model_composer/model_method/infer_table_function.py_template +3 -32
- snowflake/ml/model/_model_composer/model_method/model_method.py +5 -2
- snowflake/ml/model/_packager/model_handlers/_base.py +11 -1
- snowflake/ml/model/_packager/model_handlers/_utils.py +58 -1
- snowflake/ml/model/_packager/model_handlers/catboost.py +42 -0
- snowflake/ml/model/_packager/model_handlers/lightgbm.py +68 -0
- snowflake/ml/model/_packager/model_handlers/xgboost.py +59 -0
- snowflake/ml/model/_packager/model_runtime/model_runtime.py +3 -5
- snowflake/ml/model/model_signature.py +4 -4
- snowflake/ml/model/type_hints.py +4 -0
- snowflake/ml/modeling/_internal/snowpark_implementations/distributed_hpo_trainer.py +1 -1
- snowflake/ml/modeling/_internal/snowpark_implementations/distributed_search_udf_file.py +13 -1
- snowflake/ml/modeling/impute/simple_imputer.py +26 -0
- snowflake/ml/modeling/pipeline/pipeline.py +4 -4
- snowflake/ml/registry/registry.py +100 -13
- snowflake/ml/version.py +1 -1
- {snowflake_ml_python-1.5.4.dist-info → snowflake_ml_python-1.6.0.dist-info}/METADATA +48 -2
- {snowflake_ml_python-1.5.4.dist-info → snowflake_ml_python-1.6.0.dist-info}/RECORD +64 -42
- {snowflake_ml_python-1.5.4.dist-info → snowflake_ml_python-1.6.0.dist-info}/WHEEL +1 -1
- snowflake/ml/_internal/lineage/data_source.py +0 -10
- {snowflake_ml_python-1.5.4.dist-info → snowflake_ml_python-1.6.0.dist-info}/LICENSE.txt +0 -0
- {snowflake_ml_python-1.5.4.dist-info → snowflake_ml_python-1.6.0.dist-info}/top_level.txt +0 -0
snowflake/ml/model/_packager/model_handlers/_base.py
CHANGED
@@ -1,4 +1,5 @@
 from abc import abstractmethod
+from enum import Enum
 from typing import Dict, Generic, Optional, Protocol, Type, final
 
 from typing_extensions import TypeGuard, Unpack
@@ -8,6 +9,15 @@ from snowflake.ml.model._packager.model_handlers_migrator import base_migrator
 from snowflake.ml.model._packager.model_meta import model_meta
 
 
+class ModelObjective(Enum):
+    # This is not getting stored anywhere as metadata yet so it should be fine to slowly extend it for better coverage
+    UNKNOWN = "unknown"
+    BINARY_CLASSIFICATION = "binary_classification"
+    MULTI_CLASSIFICATION = "multi_classification"
+    REGRESSION = "regression"
+    RANKING = "ranking"
+
+
 class _BaseModelHandlerProtocol(Protocol[model_types._ModelType]):
     HANDLER_TYPE: model_types.SupportedModelHandlerType
     HANDLER_VERSION: str
@@ -16,7 +26,7 @@ class _BaseModelHandlerProtocol(Protocol[model_types._ModelType]):
 
     @classmethod
     @abstractmethod
-    def can_handle(cls, model: model_types.
+    def can_handle(cls, model: model_types.SupportedModelType) -> TypeGuard[model_types._ModelType]:
         """Whether this handler could support the type of the `model`.
 
         Args:
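Read alongside the handler hunks below, the enum's practical effect is choosing the column type of the new `explain` output. A minimal standalone sketch of that decision (illustrative only, not the packaged code; note the LightGBM hunk also uses STRING for binary classification):

```python
from enum import Enum


class ModelObjective(Enum):
    UNKNOWN = "unknown"
    BINARY_CLASSIFICATION = "binary_classification"
    MULTI_CLASSIFICATION = "multi_classification"
    REGRESSION = "regression"
    RANKING = "ranking"


def explain_output_dtype(objective: ModelObjective) -> str:
    # CatBoost and XGBoost emit per-class JSON strings only for multi-class
    # models; everything else gets one DOUBLE per input feature.
    return "STRING" if objective is ModelObjective.MULTI_CLASSIFICATION else "DOUBLE"


assert explain_output_dtype(ModelObjective.REGRESSION) == "DOUBLE"
assert explain_output_dtype(ModelObjective.MULTI_CLASSIFICATION) == "STRING"
```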
snowflake/ml/model/_packager/model_handlers/_utils.py
CHANGED
@@ -1,4 +1,9 @@
-
+import json
+from typing import Any, Callable, Iterable, Optional, Sequence, cast
+
+import numpy as np
+import numpy.typing as npt
+import pandas as pd
 
 from snowflake.ml.model import model_signature, type_hints as model_types
 from snowflake.ml.model._packager.model_meta import model_meta
@@ -40,6 +45,24 @@ def validate_signature(
     return model_meta
 
 
+def add_explain_method_signature(
+    model_meta: model_meta.ModelMetadata,
+    explain_method: str,
+    target_method: str,
+    output_return_type: model_signature.DataType = model_signature.DataType.DOUBLE,
+) -> model_meta.ModelMetadata:
+    if target_method not in model_meta.signatures:
+        raise ValueError(f"Signature for target method {target_method} is missing")
+    inputs = model_meta.signatures[target_method].inputs
+    model_meta.signatures[explain_method] = model_signature.ModelSignature(
+        inputs=inputs,
+        outputs=[
+            model_signature.FeatureSpec(dtype=output_return_type, name=f"{spec.name}_explanation") for spec in inputs
+        ],
+    )
+    return model_meta
+
+
 def get_target_methods(
     model: model_types.SupportedModelType,
     target_methods: Optional[Sequence[str]],
@@ -56,3 +79,37 @@ def validate_target_methods(model: model_types.SupportedModelType, target_method
     for method_name in target_methods:
         if not _is_callable(model, method_name):
             raise ValueError(f"Target method {method_name} is not callable or does not exist in the model.")
+
+
+def get_num_classes_if_exists(model: model_types.SupportedModelType) -> int:
+    num_classes = getattr(model, "classes_", [])
+    return len(num_classes)
+
+
+def convert_explanations_to_2D_df(
+    model: model_types.SupportedModelType, explanations: npt.NDArray[Any]
+) -> pd.DataFrame:
+    if explanations.ndim != 3:
+        return pd.DataFrame(explanations)
+
+    if hasattr(model, "classes_"):
+        classes_list = [cl for cl in model.classes_]  # type:ignore[union-attr]
+        len_classes = len(classes_list)
+        if explanations.shape[2] != len_classes:
+            raise ValueError(f"Model has {len_classes} classes but explanations have {explanations.shape[2]}")
+    else:
+        classes_list = [i for i in range(explanations.shape[2])]
+    exp_2d = []
+    # TODO (SNOW-1549044): Optimize this
+    for row in explanations:
+        col_list = []
+        for column in row:
+            class_explanations = {}
+            for cl, cl_exp in zip(classes_list, column):
+                if isinstance(cl, (int, np.integer)):
+                    cl = int(cl)
+                class_explanations[cl] = cl_exp
+            col_list.append(json.dumps(class_explanations))
+        exp_2d.append(col_list)
+
+    return pd.DataFrame(exp_2d)
snowflake/ml/model/_packager/model_handlers/catboost.py
CHANGED
@@ -33,6 +33,22 @@ class CatBoostModelHandler(_base.BaseModelHandler["catboost.CatBoost"]):
     MODELE_BLOB_FILE_OR_DIR = "model.bin"
     DEFAULT_TARGET_METHODS = ["predict", "predict_proba"]
 
+    @classmethod
+    def get_model_objective(cls, model: "catboost.CatBoost") -> _base.ModelObjective:
+        import catboost
+
+        if isinstance(model, catboost.CatBoostClassifier):
+            num_classes = handlers_utils.get_num_classes_if_exists(model)
+            if num_classes == 2:
+                return _base.ModelObjective.BINARY_CLASSIFICATION
+            return _base.ModelObjective.MULTI_CLASSIFICATION
+        if isinstance(model, catboost.CatBoostRanker):
+            return _base.ModelObjective.RANKING
+        if isinstance(model, catboost.CatBoostRegressor):
+            return _base.ModelObjective.REGRESSION
+        # TODO: Find out model type from the generic Catboost Model
+        return _base.ModelObjective.UNKNOWN
+
     @classmethod
     def can_handle(cls, model: model_types.SupportedModelType) -> TypeGuard["catboost.CatBoost"]:
         return (type_utils.LazyType("catboost.CatBoost").isinstance(model)) and any(
@@ -89,6 +105,16 @@ class CatBoostModelHandler(_base.BaseModelHandler["catboost.CatBoost"]):
             sample_input_data=sample_input_data,
             get_prediction_fn=get_prediction,
         )
+        if kwargs.get("enable_explainability", False):
+            output_type = model_signature.DataType.DOUBLE
+            if cls.get_model_objective(model) == _base.ModelObjective.MULTI_CLASSIFICATION:
+                output_type = model_signature.DataType.STRING
+            model_meta = handlers_utils.add_explain_method_signature(
+                model_meta=model_meta,
+                explain_method="explain",
+                target_method="predict",
+                output_return_type=output_type,
+            )
 
         model_blob_path = os.path.join(model_blobs_dir_path, name)
         os.makedirs(model_blob_path, exist_ok=True)
@@ -112,6 +138,11 @@ class CatBoostModelHandler(_base.BaseModelHandler["catboost.CatBoost"]):
             ],
             check_local_version=True,
         )
+        if kwargs.get("enable_explainability", False):
+            model_meta.env.include_if_absent(
+                [model_env.ModelDependency(requirement="shap", pip_name="shap")],
+                check_local_version=True,
+            )
         model_meta.env.cuda_version = kwargs.get("cuda_version", model_env.DEFAULT_CUDA_VERSION)
 
         return None
@@ -186,6 +217,17 @@ class CatBoostModelHandler(_base.BaseModelHandler["catboost.CatBoost"]):
 
             return model_signature_utils.rename_pandas_df(df, signature.outputs)
 
+        @custom_model.inference_api
+        def explain_fn(self: custom_model.CustomModel, X: pd.DataFrame) -> pd.DataFrame:
+            import shap
+
+            explainer = shap.TreeExplainer(raw_model)
+            df = handlers_utils.convert_explanations_to_2D_df(raw_model, explainer(X).values)
+            return model_signature_utils.rename_pandas_df(df, signature.outputs)
+
+        if target_method == "explain":
+            return explain_fn
+
         return fn
 
     type_method_dict: Dict[str, Any] = {"_raw_model": raw_model}
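How the new objective detection behaves in practice for CatBoost, as a sketch; the binary/multi split hinges on the fitted `classes_` attribute read by `get_num_classes_if_exists`:

```python
import catboost
import numpy as np

X = np.random.rand(40, 3)

clf = catboost.CatBoostClassifier(iterations=5, verbose=False)
clf.fit(X, np.random.randint(0, 2, size=40))
print(len(clf.classes_))  # 2 -> BINARY_CLASSIFICATION; 3+ -> MULTI_CLASSIFICATION

reg = catboost.CatBoostRegressor(iterations=5, verbose=False)
reg.fit(X, np.random.rand(40))
# isinstance checks map CatBoostRegressor -> REGRESSION and CatBoostRanker -> RANKING;
# a generic catboost.CatBoost falls through to UNKNOWN (see the TODO above).
```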
snowflake/ml/model/_packager/model_handlers/lightgbm.py
CHANGED
@@ -43,6 +43,45 @@ class LGBMModelHandler(_base.BaseModelHandler[Union["lightgbm.Booster", "lightgb
 
     MODELE_BLOB_FILE_OR_DIR = "model.pkl"
     DEFAULT_TARGET_METHODS = ["predict", "predict_proba"]
+    _BINARY_CLASSIFICATION_OBJECTIVES = ["binary"]
+    _MULTI_CLASSIFICATION_OBJECTIVES = ["multiclass", "multiclassova"]
+    _RANKING_OBJECTIVES = ["lambdarank", "rank_xendcg"]
+    _REGRESSION_OBJECTIVES = [
+        "regression",
+        "regression_l1",
+        "huber",
+        "fair",
+        "poisson",
+        "quantile",
+        "tweedie",
+        "mape",
+        "gamma",
+    ]
+
+    @classmethod
+    def get_model_objective(cls, model: Union["lightgbm.Booster", "lightgbm.LGBMModel"]) -> _base.ModelObjective:
+        import lightgbm
+
+        # does not account for cross-entropy and custom
+        if isinstance(model, lightgbm.LGBMClassifier):
+            num_classes = handlers_utils.get_num_classes_if_exists(model)
+            if num_classes == 2:
+                return _base.ModelObjective.BINARY_CLASSIFICATION
+            return _base.ModelObjective.MULTI_CLASSIFICATION
+        if isinstance(model, lightgbm.LGBMRanker):
+            return _base.ModelObjective.RANKING
+        if isinstance(model, lightgbm.LGBMRegressor):
+            return _base.ModelObjective.REGRESSION
+        model_objective = model.params["objective"]
+        if model_objective in cls._BINARY_CLASSIFICATION_OBJECTIVES:
+            return _base.ModelObjective.BINARY_CLASSIFICATION
+        if model_objective in cls._MULTI_CLASSIFICATION_OBJECTIVES:
+            return _base.ModelObjective.MULTI_CLASSIFICATION
+        if model_objective in cls._RANKING_OBJECTIVES:
+            return _base.ModelObjective.RANKING
+        if model_objective in cls._REGRESSION_OBJECTIVES:
+            return _base.ModelObjective.REGRESSION
+        return _base.ModelObjective.UNKNOWN
 
     @classmethod
     def can_handle(
@@ -105,6 +144,19 @@ class LGBMModelHandler(_base.BaseModelHandler[Union["lightgbm.Booster", "lightgb
             sample_input_data=sample_input_data,
             get_prediction_fn=get_prediction,
         )
+        if kwargs.get("enable_explainability", False):
+            output_type = model_signature.DataType.DOUBLE
+            if cls.get_model_objective(model) in [
+                _base.ModelObjective.BINARY_CLASSIFICATION,
+                _base.ModelObjective.MULTI_CLASSIFICATION,
+            ]:
+                output_type = model_signature.DataType.STRING
+            model_meta = handlers_utils.add_explain_method_signature(
+                model_meta=model_meta,
+                explain_method="explain",
+                target_method="predict",
+                output_return_type=output_type,
+            )
 
         model_blob_path = os.path.join(model_blobs_dir_path, name)
         os.makedirs(model_blob_path, exist_ok=True)
@@ -130,6 +182,11 @@ class LGBMModelHandler(_base.BaseModelHandler[Union["lightgbm.Booster", "lightgb
             ],
             check_local_version=True,
        )
+        if kwargs.get("enable_explainability", False):
+            model_meta.env.include_if_absent(
+                [model_env.ModelDependency(requirement="shap", pip_name="shap")],
+                check_local_version=True,
+            )
 
         return None
 
@@ -198,6 +255,17 @@ class LGBMModelHandler(_base.BaseModelHandler[Union["lightgbm.Booster", "lightgb
 
             return model_signature_utils.rename_pandas_df(df, signature.outputs)
 
+        @custom_model.inference_api
+        def explain_fn(self: custom_model.CustomModel, X: pd.DataFrame) -> pd.DataFrame:
+            import shap
+
+            explainer = shap.TreeExplainer(raw_model)
+            df = handlers_utils.convert_explanations_to_2D_df(raw_model, explainer(X).values)
+            return model_signature_utils.rename_pandas_df(df, signature.outputs)
+
+        if target_method == "explain":
+            return explain_fn
+
         return fn
 
     type_method_dict: Dict[str, Any] = {"_raw_model": raw_model}
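For a raw `lightgbm.Booster` there is no sklearn wrapper class to isinstance-check, so the handler falls back to the `objective` entry in the training params. A sketch of that fallback path:

```python
import lightgbm as lgb
import numpy as np

X, y = np.random.rand(100, 4), np.random.rand(100)
booster = lgb.train({"objective": "huber", "verbosity": -1}, lgb.Dataset(X, label=y))

print(booster.params["objective"])  # "huber"
# "huber" is listed in _REGRESSION_OBJECTIVES, so get_model_objective returns
# REGRESSION; "binary" -> BINARY_CLASSIFICATION, "lambdarank" -> RANKING, and
# unlisted objectives (e.g. cross-entropy or custom) fall through to UNKNOWN.
```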
snowflake/ml/model/_packager/model_handlers/xgboost.py
CHANGED
@@ -1,4 +1,5 @@
 # mypy: disable-error-code="import"
+import json
 import os
 from typing import (
     TYPE_CHECKING,
@@ -46,6 +47,39 @@ class XGBModelHandler(_base.BaseModelHandler[Union["xgboost.Booster", "xgboost.X
 
     MODELE_BLOB_FILE_OR_DIR = "model.ubj"
     DEFAULT_TARGET_METHODS = ["predict", "predict_proba"]
+    _BINARY_CLASSIFICATION_OBJECTIVE_PREFIX = ["binary:"]
+    _MULTI_CLASSIFICATION_OBJECTIVE_PREFIX = ["multi:"]
+    _RANKING_OBJECTIVE_PREFIX = ["rank:"]
+    _REGRESSION_OBJECTIVE_PREFIX = ["reg:"]
+
+    @classmethod
+    def get_model_objective(cls, model: Union["xgboost.Booster", "xgboost.XGBModel"]) -> _base.ModelObjective:
+        import xgboost
+
+        if isinstance(model, xgboost.XGBClassifier) or isinstance(model, xgboost.XGBRFClassifier):
+            num_classes = handlers_utils.get_num_classes_if_exists(model)
+            if num_classes == 2:
+                return _base.ModelObjective.BINARY_CLASSIFICATION
+            return _base.ModelObjective.MULTI_CLASSIFICATION
+        if isinstance(model, xgboost.XGBRegressor) or isinstance(model, xgboost.XGBRFRegressor):
+            return _base.ModelObjective.REGRESSION
+        if isinstance(model, xgboost.XGBRanker):
+            return _base.ModelObjective.RANKING
+        model_params = json.loads(model.save_config())
+        model_objective = model_params["learner"]["objective"]
+        for classification_objective in cls._BINARY_CLASSIFICATION_OBJECTIVE_PREFIX:
+            if classification_objective in model_objective:
+                return _base.ModelObjective.BINARY_CLASSIFICATION
+        for classification_objective in cls._MULTI_CLASSIFICATION_OBJECTIVE_PREFIX:
+            if classification_objective in model_objective:
+                return _base.ModelObjective.MULTI_CLASSIFICATION
+        for ranking_objective in cls._RANKING_OBJECTIVE_PREFIX:
+            if ranking_objective in model_objective:
+                return _base.ModelObjective.RANKING
+        for regression_objective in cls._REGRESSION_OBJECTIVE_PREFIX:
+            if regression_objective in model_objective:
+                return _base.ModelObjective.REGRESSION
+        return _base.ModelObjective.UNKNOWN
 
     @classmethod
     def can_handle(
@@ -112,6 +146,16 @@ class XGBModelHandler(_base.BaseModelHandler[Union["xgboost.Booster", "xgboost.X
             sample_input_data=sample_input_data,
             get_prediction_fn=get_prediction,
         )
+        if kwargs.get("enable_explainability", False):
+            output_type = model_signature.DataType.DOUBLE
+            if cls.get_model_objective(model) == _base.ModelObjective.MULTI_CLASSIFICATION:
+                output_type = model_signature.DataType.STRING
+            model_meta = handlers_utils.add_explain_method_signature(
+                model_meta=model_meta,
+                explain_method="explain",
+                target_method="predict",
+                output_return_type=output_type,
+            )
 
         model_blob_path = os.path.join(model_blobs_dir_path, name)
         os.makedirs(model_blob_path, exist_ok=True)
@@ -133,6 +177,11 @@ class XGBModelHandler(_base.BaseModelHandler[Union["xgboost.Booster", "xgboost.X
             ],
             check_local_version=True,
         )
+        if kwargs.get("enable_explainability", False):
+            model_meta.env.include_if_absent(
+                [model_env.ModelDependency(requirement="shap", pip_name="shap")],
+                check_local_version=True,
+            )
         model_meta.env.cuda_version = kwargs.get("cuda_version", model_env.DEFAULT_CUDA_VERSION)
 
     @classmethod
@@ -206,6 +255,16 @@ class XGBModelHandler(_base.BaseModelHandler[Union["xgboost.Booster", "xgboost.X
 
             return model_signature_utils.rename_pandas_df(df, signature.outputs)
 
+        @custom_model.inference_api
+        def explain_fn(self: custom_model.CustomModel, X: pd.DataFrame) -> pd.DataFrame:
+            import shap
+
+            explainer = shap.TreeExplainer(raw_model)
+            df = pd.DataFrame(explainer(X).values)
+            return model_signature_utils.rename_pandas_df(df, signature.outputs)
+
+        if target_method == "explain":
+            return explain_fn
         return fn
 
     type_method_dict: Dict[str, Any] = {"_raw_model": raw_model}
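For a raw `xgboost.Booster` the handler parses `save_config()`, which serializes the full training configuration to JSON, and prefix-matches the objective. A sketch; `str()` is used here so the substring check works whether the `["learner"]["objective"]` entry is a plain string or a nested config object:

```python
import json

import numpy as np
import xgboost as xgb

X, y = np.random.rand(100, 4), np.random.rand(100)
booster = xgb.train({"objective": "reg:squarederror"}, xgb.DMatrix(X, label=y), num_boost_round=5)

config = json.loads(booster.save_config())
objective = str(config["learner"]["objective"])
print("reg:" in objective)  # True -> REGRESSION
# Other prefixes: "binary:" -> BINARY_CLASSIFICATION, "multi:" -> MULTI_CLASSIFICATION,
# "rank:" -> RANKING; anything else -> UNKNOWN.
```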
snowflake/ml/model/_packager/model_runtime/model_runtime.py
CHANGED
@@ -35,7 +35,7 @@ class ModelRuntime:
         self,
         name: str,
         env: model_env.ModelEnv,
-        imports: Optional[List[
+        imports: Optional[List[str]] = None,
         is_gpu: bool = False,
         loading_from_file: bool = False,
     ) -> None:
@@ -75,7 +75,7 @@ class ModelRuntime:
             snowpark_ml_lib_path = runtime_base_path / "snowflake-ml-python.zip"
             file_utils.zip_python_package(str(snowpark_ml_lib_path), "snowflake.ml")
             snowpark_ml_lib_rel_path = pathlib.PurePosixPath(snowpark_ml_lib_path.relative_to(packager_path).as_posix())
-            self.imports.append(snowpark_ml_lib_rel_path)
+            self.imports.append(str(snowpark_ml_lib_rel_path))
 
         self.runtime_env.conda_env_rel_path = self.runtime_rel_path / self.runtime_env.conda_env_rel_path
         self.runtime_env.pip_requirements_rel_path = self.runtime_rel_path / self.runtime_env.pip_requirements_rel_path
@@ -108,6 +108,4 @@ class ModelRuntime:
             warnings.simplefilter("ignore")
             env.load_from_conda_file(packager_path / conda_env_rel_path)
             env.load_from_pip_file(packager_path / pip_requirements_rel_path)
-        return ModelRuntime(
-            name=name, env=env, imports=list(map(pathlib.PurePosixPath, loaded_dict["imports"])), loading_from_file=True
-        )
+        return ModelRuntime(name=name, env=env, imports=loaded_dict["imports"], loading_from_file=True)
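The `imports` change normalizes paths to plain strings. A plausible motivation (my inference, not stated in the diff) is that `pathlib.PurePosixPath` values do not survive serialization of the runtime manifest, while the strings already in `loaded_dict["imports"]` round-trip as-is; the zip path below is hypothetical:

```python
import json
import pathlib

rel_path = pathlib.PurePosixPath("runtimes/cpu/snowflake-ml-python.zip")  # hypothetical path

try:
    json.dumps({"imports": [rel_path]})
except TypeError as e:
    print(e)  # PurePosixPath is not JSON serializable

print(json.dumps({"imports": [str(rel_path)]}))  # strings round-trip cleanly
```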
snowflake/ml/model/model_signature.py
CHANGED
@@ -232,7 +232,7 @@ def _validate_pandas_df(data: pd.DataFrame, features: Sequence[core.BaseFeatureS
                 ),
             )
         else:
-            if isinstance(data_col[0], list):
+            if isinstance(data_col.iloc[0], list):
                 if not ft_shape:
                     raise snowml_exceptions.SnowflakeMLException(
                         error_code=error_codes.INVALID_DATA,
@@ -266,7 +266,7 @@ def _validate_pandas_df(data: pd.DataFrame, features: Sequence[core.BaseFeatureS
                 ),
             )
 
-            elif isinstance(data_col[0], np.ndarray):
+            elif isinstance(data_col.iloc[0], np.ndarray):
                 if not ft_shape:
                     raise snowml_exceptions.SnowflakeMLException(
                         error_code=error_codes.INVALID_DATA,
@@ -297,7 +297,7 @@ def _validate_pandas_df(data: pd.DataFrame, features: Sequence[core.BaseFeatureS
                 ),
             )
 
-            elif isinstance(data_col[0], str):
+            elif isinstance(data_col.iloc[0], str):
                 if ft_shape is not None:
                     raise snowml_exceptions.SnowflakeMLException(
                         error_code=error_codes.INVALID_DATA,
@@ -316,7 +316,7 @@ def _validate_pandas_df(data: pd.DataFrame, features: Sequence[core.BaseFeatureS
                 ),
             )
 
-            elif isinstance(data_col[0], bytes):
+            elif isinstance(data_col.iloc[0], bytes):
                 if ft_shape is not None:
                     raise snowml_exceptions.SnowflakeMLException(
                         error_code=error_codes.INVALID_DATA,
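The `data_col[0]` → `data_col.iloc[0]` fixes matter because bracket indexing on a Series is label-based: if the validated DataFrame is a slice whose index does not start at 0, `data_col[0]` raises `KeyError` even though the column is non-empty. A minimal reproduction:

```python
import pandas as pd

df = pd.DataFrame({"a": [[1, 2], [3, 4], [5, 6]]})
data_col = df["a"].iloc[1:]  # index is now [1, 2]

try:
    data_col[0]  # label lookup: there is no label 0
except KeyError:
    print("KeyError with [0]")

print(data_col.iloc[0])  # positional lookup: [3, 4]
```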
snowflake/ml/model/type_hints.py
CHANGED
@@ -232,11 +232,13 @@ class BaseModelSaveOption(TypedDict):
     _legacy_save: NotRequired[bool]
     function_type: NotRequired[Literal["FUNCTION", "TABLE_FUNCTION"]]
     method_options: NotRequired[Dict[str, ModelMethodSaveOptions]]
+    include_pip_dependencies: NotRequired[bool]
 
 
 class CatBoostModelSaveOptions(BaseModelSaveOption):
     target_methods: NotRequired[Sequence[str]]
     cuda_version: NotRequired[str]
+    enable_explainability: NotRequired[bool]
 
 
 class CustomModelSaveOption(BaseModelSaveOption):
@@ -250,10 +252,12 @@ class SKLModelSaveOptions(BaseModelSaveOption):
 class XGBModelSaveOptions(BaseModelSaveOption):
     target_methods: NotRequired[Sequence[str]]
     cuda_version: NotRequired[str]
+    enable_explainability: NotRequired[bool]
 
 
 class LGBMModelSaveOptions(BaseModelSaveOption):
     target_methods: NotRequired[Sequence[str]]
+    enable_explainability: NotRequired[bool]
 
 
 class SNOWModelSaveOptions(BaseModelSaveOption):
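With these save options, explainability is opt-in at logging time. A hedged sketch of how a caller might request it through the registry (`reg`, `model`, and `X_sample` are assumed to exist; `options` is forwarded to the handler, where `enable_explainability` adds the `explain` signature and the `shap` dependency, as in the handler hunks above):

```python
# Assumes: reg is a snowflake.ml.registry.Registry instance, model is a fitted
# xgboost.XGBClassifier, X_sample is a small pandas DataFrame of sample inputs.
mv = reg.log_model(
    model,
    model_name="my_model",
    version_name="v1",
    sample_input_data=X_sample,
    options={"enable_explainability": True},  # XGBModelSaveOptions
)
# The logged version should then expose an `explain` method alongside `predict`,
# returning one `<feature>_explanation` column per input feature.
```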
snowflake/ml/modeling/_internal/snowpark_implementations/distributed_hpo_trainer.py
CHANGED
@@ -41,7 +41,7 @@ cp.register_pickle_by_value(inspect.getmodule(snowpark_dataframe_utils.cast_snow
 
 _PROJECT = "ModelDevelopment"
 DEFAULT_UDTF_NJOBS = 3
-ENABLE_EFFICIENT_MEMORY_USAGE =
+ENABLE_EFFICIENT_MEMORY_USAGE = True
 _UDTF_STAGE_NAME = f"MEMORY_EFFICIENT_UDTF_{str(uuid.uuid4()).replace('-', '_')}"
 
 
snowflake/ml/modeling/_internal/snowpark_implementations/distributed_search_udf_file.py
CHANGED
@@ -83,7 +83,19 @@ def _load_data_into_udf() -> Tuple[
     with open(local_fit_and_score_kwargs_file_path, mode="rb") as local_fit_and_score_kwargs_file_obj:
         fit_and_score_kwargs = cp.load(local_fit_and_score_kwargs_file_obj)
 
-    #
+    # Convert dataframe to numpy would save memory consumption
+    # Except for Pipeline, we need to keep the dataframe for the column names
+    from sklearn.pipeline import Pipeline
+    if isinstance(base_estimator, Pipeline):
+        return (
+            df[CONSTANTS['input_cols']],
+            df[CONSTANTS['label_cols']].squeeze(),
+            indices,
+            params_to_evaluate,
+            base_estimator,
+            fit_and_score_kwargs,
+            CONSTANTS
+        )
     return (
         df[CONSTANTS['input_cols']].to_numpy(),
         df[CONSTANTS['label_cols']].squeeze().to_numpy(),
snowflake/ml/modeling/impute/simple_imputer.py
CHANGED
@@ -1,5 +1,6 @@
 #!/usr/bin/env python3
 import copy
+import warnings
 from typing import Any, Dict, Iterable, Optional, Type, Union
 
 import numpy as np
@@ -10,6 +11,7 @@ from sklearn import impute
 from snowflake import snowpark
 from snowflake.ml._internal import telemetry
 from snowflake.ml._internal.exceptions import error_codes, exceptions
+from snowflake.ml._internal.utils import formatting
 from snowflake.ml.modeling.framework import _utils, base
 from snowflake.snowpark import functions as F, types as T
 from snowflake.snowpark._internal import utils as snowpark_utils
@@ -171,6 +173,14 @@ class SimpleImputer(base.BaseTransformer):
         self.set_output_cols(output_cols)
         self.set_passthrough_cols(passthrough_cols)
 
+    def _is_integer_type(self, column_type: T.DataType) -> bool:
+        return (
+            isinstance(column_type, T.ByteType)
+            or isinstance(column_type, T.ShortType)
+            or isinstance(column_type, T.IntegerType)
+            or isinstance(column_type, T.LongType)
+        )
+
     def _reset(self) -> None:
         """
         Reset internal data-dependent state of the imputer, if necessary.
@@ -389,6 +399,22 @@ class SimpleImputer(base.BaseTransformer):
                 # Use `fillna` for replacing nans. Check if the column has a string data type, or coerce a float.
                 if not isinstance(input_col_datatypes[input_col], T.StringType):
                     statistic = float(statistic)
+
+                    if self._is_integer_type(input_col_datatypes[input_col]):
+                        if statistic.is_integer():
+                            statistic = int(statistic)
+                        else:
+                            warnings.warn(
+                                formatting.unwrap(
+                                    f"""
+                                    Integer column may not be imputed with a non-integer value {statistic}.
+                                    In order to impute a non-integer value, convert the column to FloatType before imputing.
+                                    """
+                                ),
+                                category=UserWarning,
+                                stacklevel=1,
+                            )
+
                 transformed_dataset = transformed_dataset.na.fill({output_col: statistic})
             else:
                 transformed_dataset = transformed_dataset.na.replace(
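The net effect: an integer column whose imputation statistic is whole (say a median of 3.0) is filled with the int `3`, while a fractional statistic (say a mean of 2.5) is left as-is and warns, since Snowpark will not fill an integer column with a float. The core decision, restated in isolation:

```python
import warnings


def coerce_statistic(statistic: float, is_integer_col: bool):
    # Standalone rendition of the diff's logic, for illustration only.
    if is_integer_col:
        if statistic.is_integer():
            return int(statistic)
        warnings.warn(
            f"Integer column may not be imputed with a non-integer value {statistic}. "
            "In order to impute a non-integer value, convert the column to FloatType before imputing.",
            category=UserWarning,
        )
    return statistic


print(coerce_statistic(3.0, True))  # 3
print(coerce_statistic(2.5, True))  # 2.5, after emitting a UserWarning
```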
snowflake/ml/modeling/pipeline/pipeline.py
CHANGED
@@ -99,10 +99,6 @@ class Pipeline(base.BaseTransformer):
             must implement `fit` and `transform` methods.
             The final step can be a transform or estimator, that is, it must implement
             `fit` and `transform`/`predict` methods.
-            TODO: SKLearn pipeline expects last step(and only the last step) to be an estimator obj or a dummy
-            estimator(like None or passthrough). Currently this Pipeline class works with a list of all
-            transforms or a list of transforms ending with an estimator. Should we change this implementation
-            to only work with list of steps ending with an estimator or a dummy estimator like SKLearn?
 
         Args:
             steps: List of (name, transform) tuples (implementing `fit`/`transform`) that
@@ -111,6 +107,10 @@ class Pipeline(base.BaseTransformer):
         """
         super().__init__()
         self.steps = steps
+        # TODO(snandamuri): SKLearn pipeline expects last step(and only the last step) to be an estimator obj or a dummy
+        # estimator(like None or passthrough). Currently this Pipeline class works with a list of all
+        # transforms or a list of transforms ending with an estimator. Should we change this implementation
+        # to only work with list of steps ending with an estimator or a dummy estimator like SKLearn?
         self._is_final_step_estimator = Pipeline._is_estimator(steps[-1][1])
         self._is_fitted = False
         self._feature_names_in: List[np.ndarray[Any, np.dtype[Any]]] = []