snowflake-ml-python 1.7.3__py3-none-any.whl → 1.7.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (208)
  1. snowflake/cortex/_complete.py +19 -0
  2. snowflake/ml/_internal/env_utils.py +64 -21
  3. snowflake/ml/_internal/platform_capabilities.py +87 -0
  4. snowflake/ml/_internal/relax_version_strategy.py +16 -0
  5. snowflake/ml/_internal/telemetry.py +21 -0
  6. snowflake/ml/data/_internal/arrow_ingestor.py +1 -1
  7. snowflake/ml/dataset/dataset.py +0 -1
  8. snowflake/ml/feature_store/feature_store.py +18 -0
  9. snowflake/ml/feature_store/feature_view.py +46 -1
  10. snowflake/ml/fileset/fileset.py +6 -0
  11. snowflake/ml/jobs/__init__.py +21 -0
  12. snowflake/ml/jobs/_utils/constants.py +57 -0
  13. snowflake/ml/jobs/_utils/payload_utils.py +438 -0
  14. snowflake/ml/jobs/_utils/spec_utils.py +296 -0
  15. snowflake/ml/jobs/_utils/types.py +39 -0
  16. snowflake/ml/jobs/decorators.py +71 -0
  17. snowflake/ml/jobs/job.py +113 -0
  18. snowflake/ml/jobs/manager.py +298 -0
  19. snowflake/ml/model/_client/ops/model_ops.py +11 -2
  20. snowflake/ml/model/_client/ops/service_ops.py +1 -11
  21. snowflake/ml/model/_client/sql/service.py +13 -6
  22. snowflake/ml/model/_packager/model_env/model_env.py +45 -28
  23. snowflake/ml/model/_packager/model_handlers/_utils.py +19 -6
  24. snowflake/ml/model/_packager/model_handlers/custom.py +1 -2
  25. snowflake/ml/model/_packager/model_handlers/huggingface_pipeline.py +17 -0
  26. snowflake/ml/model/_packager/model_handlers/keras.py +230 -0
  27. snowflake/ml/model/_packager/model_handlers/pytorch.py +1 -0
  28. snowflake/ml/model/_packager/model_handlers/sklearn.py +28 -3
  29. snowflake/ml/model/_packager/model_handlers/snowmlmodel.py +74 -21
  30. snowflake/ml/model/_packager/model_handlers/tensorflow.py +27 -49
  31. snowflake/ml/model/_packager/model_handlers_migrator/tensorflow_migrator_2023_12_01.py +48 -0
  32. snowflake/ml/model/_packager/model_meta/model_meta.py +1 -1
  33. snowflake/ml/model/_packager/model_meta/model_meta_schema.py +3 -0
  34. snowflake/ml/model/_packager/model_runtime/_snowml_inference_alternative_requirements.py +2 -2
  35. snowflake/ml/model/_packager/model_runtime/model_runtime.py +4 -1
  36. snowflake/ml/model/_packager/model_task/model_task_utils.py +5 -1
  37. snowflake/ml/model/_signatures/base_handler.py +1 -2
  38. snowflake/ml/model/_signatures/builtins_handler.py +2 -2
  39. snowflake/ml/model/_signatures/core.py +2 -2
  40. snowflake/ml/model/_signatures/numpy_handler.py +11 -12
  41. snowflake/ml/model/_signatures/pandas_handler.py +11 -9
  42. snowflake/ml/model/_signatures/pytorch_handler.py +3 -6
  43. snowflake/ml/model/_signatures/snowpark_handler.py +3 -3
  44. snowflake/ml/model/_signatures/tensorflow_handler.py +2 -7
  45. snowflake/ml/model/model_signature.py +25 -4
  46. snowflake/ml/model/type_hints.py +15 -0
  47. snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_handlers.py +14 -1
  48. snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +6 -3
  49. snowflake/ml/modeling/cluster/affinity_propagation.py +6 -3
  50. snowflake/ml/modeling/cluster/agglomerative_clustering.py +6 -3
  51. snowflake/ml/modeling/cluster/birch.py +6 -3
  52. snowflake/ml/modeling/cluster/bisecting_k_means.py +6 -3
  53. snowflake/ml/modeling/cluster/dbscan.py +6 -3
  54. snowflake/ml/modeling/cluster/feature_agglomeration.py +6 -3
  55. snowflake/ml/modeling/cluster/k_means.py +6 -3
  56. snowflake/ml/modeling/cluster/mean_shift.py +6 -3
  57. snowflake/ml/modeling/cluster/mini_batch_k_means.py +6 -3
  58. snowflake/ml/modeling/cluster/optics.py +6 -3
  59. snowflake/ml/modeling/cluster/spectral_biclustering.py +6 -3
  60. snowflake/ml/modeling/cluster/spectral_clustering.py +6 -3
  61. snowflake/ml/modeling/cluster/spectral_coclustering.py +6 -3
  62. snowflake/ml/modeling/compose/column_transformer.py +6 -3
  63. snowflake/ml/modeling/compose/transformed_target_regressor.py +6 -3
  64. snowflake/ml/modeling/covariance/elliptic_envelope.py +6 -3
  65. snowflake/ml/modeling/covariance/empirical_covariance.py +6 -3
  66. snowflake/ml/modeling/covariance/graphical_lasso.py +6 -3
  67. snowflake/ml/modeling/covariance/graphical_lasso_cv.py +6 -3
  68. snowflake/ml/modeling/covariance/ledoit_wolf.py +6 -3
  69. snowflake/ml/modeling/covariance/min_cov_det.py +6 -3
  70. snowflake/ml/modeling/covariance/oas.py +6 -3
  71. snowflake/ml/modeling/covariance/shrunk_covariance.py +6 -3
  72. snowflake/ml/modeling/decomposition/dictionary_learning.py +6 -3
  73. snowflake/ml/modeling/decomposition/factor_analysis.py +6 -3
  74. snowflake/ml/modeling/decomposition/fast_ica.py +6 -3
  75. snowflake/ml/modeling/decomposition/incremental_pca.py +6 -3
  76. snowflake/ml/modeling/decomposition/kernel_pca.py +6 -3
  77. snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +6 -3
  78. snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +6 -3
  79. snowflake/ml/modeling/decomposition/pca.py +6 -3
  80. snowflake/ml/modeling/decomposition/sparse_pca.py +6 -3
  81. snowflake/ml/modeling/decomposition/truncated_svd.py +6 -3
  82. snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +6 -3
  83. snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +6 -3
  84. snowflake/ml/modeling/ensemble/ada_boost_classifier.py +6 -3
  85. snowflake/ml/modeling/ensemble/ada_boost_regressor.py +6 -3
  86. snowflake/ml/modeling/ensemble/bagging_classifier.py +6 -3
  87. snowflake/ml/modeling/ensemble/bagging_regressor.py +6 -3
  88. snowflake/ml/modeling/ensemble/extra_trees_classifier.py +6 -3
  89. snowflake/ml/modeling/ensemble/extra_trees_regressor.py +6 -3
  90. snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +6 -3
  91. snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +6 -3
  92. snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +6 -3
  93. snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +6 -3
  94. snowflake/ml/modeling/ensemble/isolation_forest.py +6 -3
  95. snowflake/ml/modeling/ensemble/random_forest_classifier.py +6 -3
  96. snowflake/ml/modeling/ensemble/random_forest_regressor.py +6 -3
  97. snowflake/ml/modeling/ensemble/stacking_regressor.py +6 -3
  98. snowflake/ml/modeling/ensemble/voting_classifier.py +6 -3
  99. snowflake/ml/modeling/ensemble/voting_regressor.py +6 -3
  100. snowflake/ml/modeling/feature_selection/generic_univariate_select.py +6 -3
  101. snowflake/ml/modeling/feature_selection/select_fdr.py +6 -3
  102. snowflake/ml/modeling/feature_selection/select_fpr.py +6 -3
  103. snowflake/ml/modeling/feature_selection/select_fwe.py +6 -3
  104. snowflake/ml/modeling/feature_selection/select_k_best.py +6 -3
  105. snowflake/ml/modeling/feature_selection/select_percentile.py +6 -3
  106. snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +6 -3
  107. snowflake/ml/modeling/feature_selection/variance_threshold.py +6 -3
  108. snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +6 -3
  109. snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +6 -3
  110. snowflake/ml/modeling/impute/iterative_imputer.py +6 -3
  111. snowflake/ml/modeling/impute/knn_imputer.py +6 -3
  112. snowflake/ml/modeling/impute/missing_indicator.py +6 -3
  113. snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +6 -3
  114. snowflake/ml/modeling/kernel_approximation/nystroem.py +6 -3
  115. snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +6 -3
  116. snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +6 -3
  117. snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +6 -3
  118. snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +6 -3
  119. snowflake/ml/modeling/lightgbm/lgbm_classifier.py +6 -3
  120. snowflake/ml/modeling/lightgbm/lgbm_regressor.py +6 -3
  121. snowflake/ml/modeling/linear_model/ard_regression.py +6 -3
  122. snowflake/ml/modeling/linear_model/bayesian_ridge.py +6 -3
  123. snowflake/ml/modeling/linear_model/elastic_net.py +6 -3
  124. snowflake/ml/modeling/linear_model/elastic_net_cv.py +6 -3
  125. snowflake/ml/modeling/linear_model/gamma_regressor.py +6 -3
  126. snowflake/ml/modeling/linear_model/huber_regressor.py +6 -3
  127. snowflake/ml/modeling/linear_model/lars.py +6 -3
  128. snowflake/ml/modeling/linear_model/lars_cv.py +6 -3
  129. snowflake/ml/modeling/linear_model/lasso.py +6 -3
  130. snowflake/ml/modeling/linear_model/lasso_cv.py +6 -3
  131. snowflake/ml/modeling/linear_model/lasso_lars.py +6 -3
  132. snowflake/ml/modeling/linear_model/lasso_lars_cv.py +6 -3
  133. snowflake/ml/modeling/linear_model/lasso_lars_ic.py +6 -3
  134. snowflake/ml/modeling/linear_model/linear_regression.py +6 -3
  135. snowflake/ml/modeling/linear_model/logistic_regression.py +6 -3
  136. snowflake/ml/modeling/linear_model/logistic_regression_cv.py +6 -3
  137. snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +6 -3
  138. snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +6 -3
  139. snowflake/ml/modeling/linear_model/multi_task_lasso.py +6 -3
  140. snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +6 -3
  141. snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +6 -3
  142. snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +6 -3
  143. snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +6 -3
  144. snowflake/ml/modeling/linear_model/perceptron.py +6 -3
  145. snowflake/ml/modeling/linear_model/poisson_regressor.py +6 -3
  146. snowflake/ml/modeling/linear_model/ransac_regressor.py +6 -3
  147. snowflake/ml/modeling/linear_model/ridge.py +6 -3
  148. snowflake/ml/modeling/linear_model/ridge_classifier.py +6 -3
  149. snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +6 -3
  150. snowflake/ml/modeling/linear_model/ridge_cv.py +6 -3
  151. snowflake/ml/modeling/linear_model/sgd_classifier.py +6 -3
  152. snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +6 -3
  153. snowflake/ml/modeling/linear_model/sgd_regressor.py +6 -3
  154. snowflake/ml/modeling/linear_model/theil_sen_regressor.py +6 -3
  155. snowflake/ml/modeling/linear_model/tweedie_regressor.py +6 -3
  156. snowflake/ml/modeling/manifold/isomap.py +6 -3
  157. snowflake/ml/modeling/manifold/mds.py +6 -3
  158. snowflake/ml/modeling/manifold/spectral_embedding.py +6 -3
  159. snowflake/ml/modeling/manifold/tsne.py +6 -3
  160. snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +6 -3
  161. snowflake/ml/modeling/mixture/gaussian_mixture.py +6 -3
  162. snowflake/ml/modeling/model_selection/grid_search_cv.py +17 -2
  163. snowflake/ml/modeling/model_selection/randomized_search_cv.py +17 -2
  164. snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +6 -3
  165. snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +6 -3
  166. snowflake/ml/modeling/multiclass/output_code_classifier.py +6 -3
  167. snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +6 -3
  168. snowflake/ml/modeling/naive_bayes/categorical_nb.py +6 -3
  169. snowflake/ml/modeling/naive_bayes/complement_nb.py +6 -3
  170. snowflake/ml/modeling/naive_bayes/gaussian_nb.py +6 -3
  171. snowflake/ml/modeling/naive_bayes/multinomial_nb.py +6 -3
  172. snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +6 -3
  173. snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +6 -3
  174. snowflake/ml/modeling/neighbors/kernel_density.py +6 -3
  175. snowflake/ml/modeling/neighbors/local_outlier_factor.py +6 -3
  176. snowflake/ml/modeling/neighbors/nearest_centroid.py +6 -3
  177. snowflake/ml/modeling/neighbors/nearest_neighbors.py +6 -3
  178. snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +6 -3
  179. snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +6 -3
  180. snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +6 -3
  181. snowflake/ml/modeling/neural_network/bernoulli_rbm.py +6 -3
  182. snowflake/ml/modeling/neural_network/mlp_classifier.py +6 -3
  183. snowflake/ml/modeling/neural_network/mlp_regressor.py +6 -3
  184. snowflake/ml/modeling/pipeline/pipeline.py +28 -3
  185. snowflake/ml/modeling/preprocessing/polynomial_features.py +8 -5
  186. snowflake/ml/modeling/semi_supervised/label_propagation.py +6 -3
  187. snowflake/ml/modeling/semi_supervised/label_spreading.py +6 -3
  188. snowflake/ml/modeling/svm/linear_svc.py +6 -3
  189. snowflake/ml/modeling/svm/linear_svr.py +6 -3
  190. snowflake/ml/modeling/svm/nu_svc.py +6 -3
  191. snowflake/ml/modeling/svm/nu_svr.py +6 -3
  192. snowflake/ml/modeling/svm/svc.py +6 -3
  193. snowflake/ml/modeling/svm/svr.py +6 -3
  194. snowflake/ml/modeling/tree/decision_tree_classifier.py +6 -3
  195. snowflake/ml/modeling/tree/decision_tree_regressor.py +6 -3
  196. snowflake/ml/modeling/tree/extra_tree_classifier.py +6 -3
  197. snowflake/ml/modeling/tree/extra_tree_regressor.py +6 -3
  198. snowflake/ml/modeling/xgboost/xgb_classifier.py +6 -3
  199. snowflake/ml/modeling/xgboost/xgb_regressor.py +6 -3
  200. snowflake/ml/modeling/xgboost/xgbrf_classifier.py +6 -3
  201. snowflake/ml/modeling/xgboost/xgbrf_regressor.py +6 -3
  202. snowflake/ml/registry/registry.py +34 -4
  203. snowflake/ml/version.py +1 -1
  204. {snowflake_ml_python-1.7.3.dist-info → snowflake_ml_python-1.7.5.dist-info}/METADATA +81 -33
  205. {snowflake_ml_python-1.7.3.dist-info → snowflake_ml_python-1.7.5.dist-info}/RECORD +208 -196
  206. {snowflake_ml_python-1.7.3.dist-info → snowflake_ml_python-1.7.5.dist-info}/WHEEL +1 -1
  207. {snowflake_ml_python-1.7.3.dist-info → snowflake_ml_python-1.7.5.dist-info}/LICENSE.txt +0 -0
  208. {snowflake_ml_python-1.7.3.dist-info → snowflake_ml_python-1.7.5.dist-info}/top_level.txt +0 -0
snowflake/ml/model/_packager/model_task/model_task_utils.py

```diff
@@ -24,7 +24,11 @@ def get_task_skl(model: Union["sklearn.base.BaseEstimator", "sklearn.pipeline.Pi
     from sklearn.base import is_classifier, is_regressor
 
     if type_utils.LazyType("sklearn.pipeline.Pipeline").isinstance(model):
-        return type_hints.Task.UNKNOWN
+        if hasattr(model, "predict_proba") or hasattr(model, "predict"):
+            model = model.steps[-1][1]  # type: ignore[attr-defined]
+            return _get_model_task(model)
+        else:
+            return type_hints.Task.UNKNOWN
     if is_regressor(model):
         return type_hints.Task.TABULAR_REGRESSION
     if is_classifier(model):
```
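
In plain terms, task inference no longer gives up on scikit-learn pipelines: it now recurses into the pipeline's final estimator. A minimal illustrative sketch (the pipeline below is an example; the exact Task value is inferred from the hunk above, not verified against 1.7.5):

```python
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# A pipeline whose final step is a classifier.
pipe = Pipeline([("scale", StandardScaler()), ("clf", LogisticRegression())])

# 1.7.3: any Pipeline mapped to Task.UNKNOWN.
# 1.7.5: inference recurses into pipe.steps[-1][1] (the LogisticRegression),
# so the task should resolve to a classification task instead.
```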
snowflake/ml/model/_signatures/base_handler.py

```diff
@@ -12,7 +12,6 @@ class BaseDataHandler(ABC, Generic[model_types._DataType]):
     FEATURE_PREFIX: Final[str] = "feature"
     INPUT_PREFIX: Final[str] = "input"
     OUTPUT_PREFIX: Final[str] = "output"
-    SIG_INFER_ROWS_COUNT_LIMIT: Final[int] = 10
 
     @staticmethod
     @abstractmethod
@@ -26,7 +25,7 @@ class BaseDataHandler(ABC, Generic[model_types._DataType]):
 
     @staticmethod
     @abstractmethod
-    def truncate(data: model_types._DataType) -> model_types._DataType:
+    def truncate(data: model_types._DataType, length: int) -> model_types._DataType:
         ...
 
     @staticmethod
```
snowflake/ml/model/_signatures/builtins_handler.py

```diff
@@ -35,8 +35,8 @@ class ListOfBuiltinHandler(base_handler.BaseDataHandler[model_types._SupportedBu
         return len(data)
 
     @staticmethod
-    def truncate(data: model_types._SupportedBuiltinsList) -> model_types._SupportedBuiltinsList:
-        return data[: min(ListOfBuiltinHandler.count(data), ListOfBuiltinHandler.SIG_INFER_ROWS_COUNT_LIMIT)]
+    def truncate(data: model_types._SupportedBuiltinsList, length: int) -> model_types._SupportedBuiltinsList:
+        return data[: min(ListOfBuiltinHandler.count(data), length)]
 
     @staticmethod
     def validate(data: model_types._SupportedBuiltinsList) -> None:
```
snowflake/ml/model/_signatures/core.py

```diff
@@ -282,7 +282,7 @@ class FeatureSpec(BaseFeatureSpec):
             result_type = spt.ArrayType(result_type)
         return result_type
 
-    def as_dtype(self) -> Union[npt.DTypeLike, str, PandasExtensionTypes]:
+    def as_dtype(self, force_numpy_dtype: bool = False) -> Union[npt.DTypeLike, str, PandasExtensionTypes]:
        """Convert to corresponding local Type."""
 
         if not self._shape:
@@ -291,7 +291,7 @@ class FeatureSpec(BaseFeatureSpec):
                 return self._dtype._value
 
             np_type = self._dtype._numpy_type
-            if self._nullable:
+            if self._nullable and not force_numpy_dtype:
                 np_to_pd_dtype_mapping = {
                     np.int8: pd.Int8Dtype(),
                     np.int16: pd.Int16Dtype(),
```
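
The new force_numpy_dtype flag lets callers bypass the pandas nullable extension dtypes for nullable features. A hedged sketch (FeatureSpec and DataType are exported from snowflake.ml.model.model_signature; the printed dtypes are expectations based on the mapping above):

```python
from snowflake.ml.model.model_signature import DataType, FeatureSpec

spec = FeatureSpec(name="age", dtype=DataType.INT64, nullable=True)

print(spec.as_dtype())                        # expected: pandas Int64Dtype() (nullable)
print(spec.as_dtype(force_numpy_dtype=True))  # expected: plain numpy int64
```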
snowflake/ml/model/_signatures/numpy_handler.py

```diff
@@ -23,8 +23,8 @@ class NumpyArrayHandler(base_handler.BaseDataHandler[model_types._SupportedNumpy
         return data.shape[0]
 
     @staticmethod
-    def truncate(data: model_types._SupportedNumpyArray) -> model_types._SupportedNumpyArray:
-        return data[: min(NumpyArrayHandler.count(data), NumpyArrayHandler.SIG_INFER_ROWS_COUNT_LIMIT)]
+    def truncate(data: model_types._SupportedNumpyArray, length: int) -> model_types._SupportedNumpyArray:
+        return data[: min(NumpyArrayHandler.count(data), length)]
 
     @staticmethod
     def validate(data: model_types._SupportedNumpyArray) -> None:
@@ -50,7 +50,7 @@ class NumpyArrayHandler(base_handler.BaseDataHandler[model_types._SupportedNumpy
         dtype = core.DataType.from_numpy_type(data.dtype)
         role_prefix = (NumpyArrayHandler.INPUT_PREFIX if role == "input" else NumpyArrayHandler.OUTPUT_PREFIX) + "_"
         if len(data.shape) == 1:
-            return [core.FeatureSpec(dtype=dtype, name=f"{role_prefix}{feature_prefix}0")]
+            return [core.FeatureSpec(dtype=dtype, name=f"{role_prefix}{feature_prefix}0", nullable=False)]
         else:
             # For high-dimension array, 0-axis is for batch, 1-axis is for column, further more is details of columns.
             features = []
@@ -59,9 +59,9 @@ class NumpyArrayHandler(base_handler.BaseDataHandler[model_types._SupportedNumpy
             for col_data, ft_name in zip(data[0], ft_names):
                 if isinstance(col_data, np.ndarray):
                     ft_shape = np.shape(col_data)
-                    features.append(core.FeatureSpec(dtype=dtype, name=ft_name, shape=ft_shape))
+                    features.append(core.FeatureSpec(dtype=dtype, name=ft_name, shape=ft_shape, nullable=False))
                 else:
-                    features.append(core.FeatureSpec(dtype=dtype, name=ft_name))
+                    features.append(core.FeatureSpec(dtype=dtype, name=ft_name, nullable=False))
             return features
 
     @staticmethod
@@ -94,11 +94,10 @@ class SeqOfNumpyArrayHandler(base_handler.BaseDataHandler[Sequence[model_types._
         return min(NumpyArrayHandler.count(data_col) for data_col in data)
 
     @staticmethod
-    def truncate(data: Sequence[model_types._SupportedNumpyArray]) -> Sequence[model_types._SupportedNumpyArray]:
-        return [
-            data_col[: min(SeqOfNumpyArrayHandler.count(data), SeqOfNumpyArrayHandler.SIG_INFER_ROWS_COUNT_LIMIT)]
-            for data_col in data
-        ]
+    def truncate(
+        data: Sequence[model_types._SupportedNumpyArray], length: int
+    ) -> Sequence[model_types._SupportedNumpyArray]:
+        return [data_col[: min(SeqOfNumpyArrayHandler.count(data), length)] for data_col in data]
 
     @staticmethod
     def validate(data: Sequence[model_types._SupportedNumpyArray]) -> None:
@@ -119,10 +118,10 @@ class SeqOfNumpyArrayHandler(base_handler.BaseDataHandler[Sequence[model_types._
             dtype = core.DataType.from_numpy_type(data_col.dtype)
             ft_name = f"{role_prefix}{feature_prefix}{i}"
             if len(data_col.shape) == 1:
-                features.append(core.FeatureSpec(dtype=dtype, name=ft_name))
+                features.append(core.FeatureSpec(dtype=dtype, name=ft_name, nullable=False))
             else:
                 ft_shape = tuple(data_col.shape[1:])
-                features.append(core.FeatureSpec(dtype=dtype, name=ft_name, shape=ft_shape))
+                features.append(core.FeatureSpec(dtype=dtype, name=ft_name, shape=ft_shape, nullable=False))
         return features
 
     @staticmethod
```
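
Taken together, these hunks mean every feature inferred from a NumPy array is now explicitly marked non-nullable, which is consistent with dense arrays having no representation for missing values. A hedged sketch:

```python
import numpy as np

from snowflake.ml.model import model_signature

sig = model_signature.infer_signature(
    input_data=np.array([[1, 2], [3, 4]]),
    output_data=np.array([0, 1]),
)
# Each FeatureSpec in sig.inputs / sig.outputs should now carry nullable=False.
print(sig)
```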
snowflake/ml/model/_signatures/pandas_handler.py

```diff
@@ -23,8 +23,8 @@ class PandasDataFrameHandler(base_handler.BaseDataHandler[pd.DataFrame]):
         return len(data.index)
 
     @staticmethod
-    def truncate(data: pd.DataFrame) -> pd.DataFrame:
-        return data.head(min(PandasDataFrameHandler.count(data), PandasDataFrameHandler.SIG_INFER_ROWS_COUNT_LIMIT))
+    def truncate(data: pd.DataFrame, length: int) -> pd.DataFrame:
+        return data.head(min(PandasDataFrameHandler.count(data), length))
 
     @staticmethod
     def validate(data: Union[pd.DataFrame, pd.Series]) -> None:
@@ -72,13 +72,6 @@ class PandasDataFrameHandler(base_handler.BaseDataHandler[pd.DataFrame]):
         df_col_dtypes = [data[col].dtype for col in data.columns]
         for df_col, df_col_dtype in zip(df_cols, df_col_dtypes):
             df_col_data = data[df_col]
-            if df_col_data.isnull().all():
-                raise snowml_exceptions.SnowflakeMLException(
-                    error_code=error_codes.INVALID_DATA,
-                    original_exception=ValueError(
-                        f"Data Validation Error: There is no non-null data in column {df_col}."
-                    ),
-                )
             if df_col_data.isnull().any():
                 warnings.warn(
                     (
@@ -163,6 +156,15 @@ class PandasDataFrameHandler(base_handler.BaseDataHandler[pd.DataFrame]):
         specs = []
         for df_col, df_col_dtype, ft_name in zip(df_cols, df_col_dtypes, ft_names):
             df_col_data = data[df_col]
+
+            if df_col_data.isnull().all():
+                raise snowml_exceptions.SnowflakeMLException(
+                    error_code=error_codes.INVALID_DATA,
+                    original_exception=ValueError(
+                        "Data Validation Error: "
+                        f"There is no non-null data in column {df_col} so the signature cannot be inferred."
+                    ),
+                )
             if df_col_data.isnull().any():
                 df_col_data = utils.series_dropna(df_col_data)
                 df_col_dtype = df_col_data.dtype
```
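
Net effect: an all-null column no longer fails generic validation, but it still cannot contribute to signature inference, and the error message now says so. A hedged sketch of the error path (exception type and message per the hunk above):

```python
import pandas as pd

from snowflake.ml.model import model_signature

df = pd.DataFrame({"a": [1.0, 2.0, 3.0], "b": [None, None, None]})

try:
    model_signature.infer_signature(input_data=df, output_data=df[["a"]])
except Exception as exc:
    # Expected: SnowflakeMLException wrapping ValueError("Data Validation Error:
    # There is no non-null data in column b so the signature cannot be inferred.")
    print(type(exc).__name__, exc)
```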
snowflake/ml/model/_signatures/pytorch_handler.py

```diff
@@ -30,14 +30,11 @@ class SeqOfPyTorchTensorHandler(base_handler.BaseDataHandler[Sequence["torch.Ten
 
     @staticmethod
     def count(data: Sequence["torch.Tensor"]) -> int:
-        return min(data_col.shape[0] for data_col in data)  # type: ignore[no-any-return]
+        return min(data_col.shape[0] for data_col in data)
 
     @staticmethod
-    def truncate(data: Sequence["torch.Tensor"]) -> Sequence["torch.Tensor"]:
-        return [
-            data_col[: min(SeqOfPyTorchTensorHandler.count(data), SeqOfPyTorchTensorHandler.SIG_INFER_ROWS_COUNT_LIMIT)]
-            for data_col in data
-        ]
+    def truncate(data: Sequence["torch.Tensor"], length: int) -> Sequence["torch.Tensor"]:
+        return [data_col[: min(SeqOfPyTorchTensorHandler.count(data), length)] for data_col in data]
 
     @staticmethod
     def validate(data: Sequence["torch.Tensor"]) -> None:
```
snowflake/ml/model/_signatures/snowpark_handler.py

```diff
@@ -29,8 +29,8 @@ class SnowparkDataFrameHandler(base_handler.BaseDataHandler[snowflake.snowpark.D
         return data.count()
 
     @staticmethod
-    def truncate(data: snowflake.snowpark.DataFrame) -> snowflake.snowpark.DataFrame:
-        return cast(snowflake.snowpark.DataFrame, data.limit(SnowparkDataFrameHandler.SIG_INFER_ROWS_COUNT_LIMIT))
+    def truncate(data: snowflake.snowpark.DataFrame, length: int) -> snowflake.snowpark.DataFrame:
+        return cast(snowflake.snowpark.DataFrame, data.limit(length))
 
     @staticmethod
     def validate(data: snowflake.snowpark.DataFrame) -> None:
@@ -52,7 +52,7 @@ class SnowparkDataFrameHandler(base_handler.BaseDataHandler[snowflake.snowpark.D
         data: snowflake.snowpark.DataFrame, role: Literal["input", "output"]
     ) -> Sequence[core.BaseFeatureSpec]:
         return pandas_handler.PandasDataFrameHandler.infer_signature(
-            SnowparkDataFrameHandler.convert_to_df(data.limit(n=1)), role=role
+            SnowparkDataFrameHandler.convert_to_df(data), role=role
         )
 
     @staticmethod
```
snowflake/ml/model/_signatures/tensorflow_handler.py

```diff
@@ -60,14 +60,9 @@ class SeqOfTensorflowTensorHandler(
 
     @staticmethod
     def truncate(
-        data: Sequence[Union["tensorflow.Tensor", "tensorflow.Variable"]]
+        data: Sequence[Union["tensorflow.Tensor", "tensorflow.Variable"]], length: int
     ) -> Sequence[Union["tensorflow.Tensor", "tensorflow.Variable"]]:
-        return [
-            data_col[
-                : min(SeqOfTensorflowTensorHandler.count(data), SeqOfTensorflowTensorHandler.SIG_INFER_ROWS_COUNT_LIMIT)
-            ]
-            for data_col in data
-        ]
+        return [data_col[: min(SeqOfTensorflowTensorHandler.count(data), length)] for data_col in data]
 
     @staticmethod
     def validate(data: Sequence[Union["tensorflow.Tensor", "tensorflow.Variable"]]) -> None:
```
snowflake/ml/model/model_signature.py

```diff
@@ -21,6 +21,7 @@ from typing_extensions import Never
 import snowflake.snowpark
 import snowflake.snowpark.functions as F
 import snowflake.snowpark.types as spt
+from snowflake.ml._internal import telemetry
 from snowflake.ml._internal.exceptions import (
     error_codes,
     exceptions as snowml_exceptions,
@@ -56,14 +57,22 @@ _LOCAL_DATA_HANDLERS: List[Type[base_handler.BaseDataHandler[Any]]] = [
 ]
 _ALL_DATA_HANDLERS = _LOCAL_DATA_HANDLERS + [snowpark_handler.SnowparkDataFrameHandler]
 
+_TELEMETRY_PROJECT = "MLOps"
+_MODEL_TELEMETRY_SUBPROJECT = "ModelSignature"
+
 
 def _truncate_data(
     data: model_types.SupportedDataType,
+    length: Optional[int] = 100,
 ) -> model_types.SupportedDataType:
     for handler in _ALL_DATA_HANDLERS:
         if handler.can_handle(data):
+            # If length is None, return the original data
+            if length is None:
+                return data
+
             row_count = handler.count(data)
-            if row_count <= handler.SIG_INFER_ROWS_COUNT_LIMIT:
+            if row_count <= length:
                 return data
 
             warnings.warn(
@@ -77,7 +86,7 @@ def _truncate_data(
                 category=UserWarning,
                 stacklevel=1,
             )
-            return handler.truncate(data)
+            return handler.truncate(data, length)
     raise snowml_exceptions.SnowflakeMLException(
         error_code=error_codes.NOT_IMPLEMENTED,
         original_exception=NotImplementedError(
@@ -682,11 +691,17 @@ def _convert_and_validate_local_data(
     return df
 
 
+@telemetry.send_api_usage_telemetry(
+    project=_TELEMETRY_PROJECT,
+    subproject=_MODEL_TELEMETRY_SUBPROJECT,
+)
 def infer_signature(
     input_data: model_types.SupportedLocalDataType,
     output_data: model_types.SupportedLocalDataType,
     input_feature_names: Optional[List[str]] = None,
     output_feature_names: Optional[List[str]] = None,
+    input_data_limit: Optional[int] = 100,
+    output_data_limit: Optional[int] = 100,
 ) -> core.ModelSignature:
     """
     Infer model signature from given input and output sample data.
@@ -710,12 +725,18 @@ def infer_signature(
         output_data: Sample output data for the model.
         input_feature_names: Names for input features. Defaults to None.
         output_feature_names: Names for output features. Defaults to None.
+        input_data_limit: Limit the number of rows to be used in signature inference in the input data. Defaults to 100.
+            If None, all rows are used. If the number of rows in the input data is less than the limit, all rows are
+            used.
+        output_data_limit: Limit the number of rows to be used in signature inference in the output data. Defaults to
+            100. If None, all rows are used. If the number of rows in the output data is less than the limit, all rows
+            are used.
 
     Returns:
         A model signature inferred from the given input and output sample data.
     """
-    inputs = _infer_signature(input_data, role="input")
+    inputs = _infer_signature(_truncate_data(input_data, input_data_limit), role="input")
     inputs = utils.rename_features(inputs, input_feature_names)
-    outputs = _infer_signature(output_data, role="output")
+    outputs = _infer_signature(_truncate_data(output_data, output_data_limit), role="output")
     outputs = utils.rename_features(outputs, output_feature_names)
     return core.ModelSignature(inputs, outputs)
```
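
A usage sketch of the new row limits (parameter names and None semantics per the docstring above):

```python
import pandas as pd

from snowflake.ml.model import model_signature

input_df = pd.DataFrame({"x1": range(10_000), "x2": range(10_000)})
output_df = pd.DataFrame({"y": range(10_000)})

sig = model_signature.infer_signature(
    input_data=input_df,
    output_data=output_df,
    input_data_limit=100,    # only the first 100 rows drive input inference
    output_data_limit=None,  # None disables truncation on the output side
)
print(sig)
```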
snowflake/ml/model/type_hints.py

```diff
@@ -7,6 +7,7 @@ from typing_extensions import NotRequired
 
 if TYPE_CHECKING:
     import catboost
+    import keras
     import lightgbm
     import mlflow
     import numpy as np
@@ -68,6 +69,7 @@ SupportedRequireSignatureModelType = Union[
     "torch.nn.Module",
     "torch.jit.ScriptModule",
     "tensorflow.Module",
+    "keras.Model",
 ]
 
 SupportedNoSignatureRequirementsModelType = Union[
@@ -103,6 +105,7 @@ Here is all acceptable types of Snowflake native model packaging and its handler
 | transformers.Pipeline | huggingface_pipeline.py | _HuggingFacePipelineHandler |
 | huggingface_pipeline.HuggingFacePipelineModel | huggingface_pipeline.py | _HuggingFacePipelineHandler |
 | sentence_transformers.SentenceTransformer | sentence_transformers.py | _SentenceTransformerHandler |
+| keras.Model | keras.py | _KerasHandler |
 """
 
 SupportedModelHandlerType = Literal[
@@ -118,6 +121,7 @@ SupportedModelHandlerType = Literal[
     "tensorflow",
     "torchscript",
     "xgboost",
+    "keras",
 ]
 
 _ModelType = TypeVar("_ModelType", bound=SupportedModelType)
@@ -202,6 +206,11 @@ class SentenceTransformersSaveOptions(BaseModelSaveOption):
     batch_size: NotRequired[int]
 
 
+class KerasSaveOptions(BaseModelSaveOption):
+    target_methods: NotRequired[Sequence[str]]
+    cuda_version: NotRequired[str]
+
+
 ModelSaveOption = Union[
     BaseModelSaveOption,
     CatBoostModelSaveOptions,
@@ -216,6 +225,7 @@ ModelSaveOption = Union[
     MLFlowSaveOptions,
     HuggingFaceSaveOptions,
     SentenceTransformersSaveOptions,
+    KerasSaveOptions,
 ]
 
 
@@ -276,6 +286,10 @@ class SentenceTransformersLoadOptions(BaseModelLoadOption):
     device: NotRequired[str]
 
 
+class KerasLoadOptions(BaseModelLoadOption):
+    use_gpu: NotRequired[bool]
+
+
 ModelLoadOption = Union[
     BaseModelLoadOption,
     CatBoostModelLoadOptions,
@@ -290,6 +304,7 @@ ModelLoadOption = Union[
     MLFlowLoadOptions,
     HuggingFaceLoadOptions,
     SentenceTransformersLoadOptions,
+    KerasLoadOptions,
 ]
 
 
```
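
These type-hint additions back the new Keras handler (keras.py, file 30 in the list above). A hedged sketch of logging a Keras model through the registry; the Registry call shape is the documented snowflake-ml-python interface, but the exact plumbing of the KerasSaveOptions keys in 1.7.5 is an assumption:

```python
import keras

from snowflake.ml.registry import Registry

model = keras.Sequential([keras.Input(shape=(4,)), keras.layers.Dense(1)])
model.compile(optimizer="adam", loss="mse")

reg = Registry(session=session)  # assumes an existing Snowpark `session`
mv = reg.log_model(
    model,
    model_name="my_keras_model",              # hypothetical name
    sample_input_data=sample_df,              # hypothetical sample frame
    options={"target_methods": ["predict"]},  # keys from KerasSaveOptions above
)
```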
snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_handlers.py

```diff
@@ -199,8 +199,21 @@ class SnowparkTransformHandlers:
         if expected_output_cols_type == "":
             expected_output_cols_type = "string"
         assert expected_output_cols_type is not None
+
+        # If there is only one output column, the UDF might have generate complex objects (lists, dicts).
+        # In such cases, we attempt to not do explicit cast. (Example: PolynomialFeatures.transform)
+        try_parse_object = len(expected_output_cols) == 1 and expected_output_cols_type != "string"
         for output_feature in expected_output_cols:
-            output_cols.append(F.col(INTERMEDIATE_OBJ_NAME)[output_feature].astype(expected_output_cols_type))
+            column_expr = F.col(INTERMEDIATE_OBJ_NAME)[output_feature]
+
+            if try_parse_object and df_res.count() > 0:
+                # Only do type casting if it's not an array
+                if not df_res.select(F.is_array(column_expr)).first()[0]:
+                    column_expr = column_expr.astype(expected_output_cols_type)
+            else:
+                column_expr = column_expr.astype(expected_output_cols_type)
+
+            output_cols.append(column_expr)
             output_col_names.append(identifier.get_inferred_name(output_feature))
 
         # Extract output from INTERMEDIATE_OBJ_NAME and drop that column
```
snowflake/ml/modeling/calibration/calibrated_classifier_cv.py

```diff
@@ -37,6 +37,7 @@ from snowflake.ml.model.model_signature import (
     FeatureSpec,
     ModelSignature,
     _infer_signature,
+    _truncate_data,
     _rename_signature_with_snowflake_identifiers,
 )
 
@@ -57,6 +58,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.calibration".replace("sk
 
 DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]
 
+INFER_SIGNATURE_MAX_ROWS = 100
+
 class CalibratedClassifierCV(BaseTransformer):
     r"""Probability calibration with isotonic regression or logistic regression
     For more details on this class, see [sklearn.calibration.CalibratedClassifierCV]
@@ -465,7 +468,7 @@ class CalibratedClassifierCV(BaseTransformer):
         elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
             expected_dtype = "array"
         else:
-            output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+            output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
             # We can only infer the output types from the input types if the following two statemetns are true:
             # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
             # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
@@ -1126,7 +1129,7 @@ class CalibratedClassifierCV(BaseTransformer):
 
         PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
 
-        inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
+        inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
         outputs: List[BaseFeatureSpec] = []
         if hasattr(self, "predict"):
             # keep mypy happy
@@ -1134,7 +1137,7 @@ class CalibratedClassifierCV(BaseTransformer):
             # For classifier, the type of predict is the same as the type of label
             if self._sklearn_object._estimator_type == "classifier":
                 # label columns is the desired type for output
-                outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
+                outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
                 # rename the output columns
                 outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
                 self._model_signature_dict["predict"] = ModelSignature(
```
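
The same three-part pattern (import _truncate_data, define INFER_SIGNATURE_MAX_ROWS = 100, wrap each _infer_signature call) is stamped across the roughly 150 modeling wrappers changed in this release; the cluster estimators below carry identical hunks. A hedged sketch of the user-visible effect, using one of the patched estimators (column names are placeholders):

```python
import pandas as pd

from snowflake.ml.modeling.cluster import KMeans

df = pd.DataFrame({"X1": range(1_000_000), "X2": range(1_000_000)})

km = KMeans(input_cols=["X1", "X2"], output_cols=["CLUSTER"], n_clusters=3)
# Fitting still uses all rows; only model signature inference is now capped
# at INFER_SIGNATURE_MAX_ROWS (100) rows instead of scanning the whole dataset.
km.fit(df)
```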
snowflake/ml/modeling/cluster/affinity_propagation.py

```diff
@@ -37,6 +37,7 @@ from snowflake.ml.model.model_signature import (
     FeatureSpec,
     ModelSignature,
     _infer_signature,
+    _truncate_data,
     _rename_signature_with_snowflake_identifiers,
 )
 
@@ -57,6 +58,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.cluster".replace("sklear
 
 DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]
 
+INFER_SIGNATURE_MAX_ROWS = 100
+
 class AffinityPropagation(BaseTransformer):
     r"""Perform Affinity Propagation Clustering of data
     For more details on this class, see [sklearn.cluster.AffinityPropagation]
@@ -449,7 +452,7 @@ class AffinityPropagation(BaseTransformer):
         elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
             expected_dtype = "array"
         else:
-            output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+            output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
             # We can only infer the output types from the input types if the following two statemetns are true:
             # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
             # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
@@ -1106,7 +1109,7 @@ class AffinityPropagation(BaseTransformer):
 
         PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
 
-        inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
+        inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
         outputs: List[BaseFeatureSpec] = []
         if hasattr(self, "predict"):
             # keep mypy happy
@@ -1114,7 +1117,7 @@ class AffinityPropagation(BaseTransformer):
             # For classifier, the type of predict is the same as the type of label
             if self._sklearn_object._estimator_type == "classifier":
                 # label columns is the desired type for output
-                outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
+                outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
                 # rename the output columns
                 outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
                 self._model_signature_dict["predict"] = ModelSignature(
```
snowflake/ml/modeling/cluster/agglomerative_clustering.py

```diff
@@ -37,6 +37,7 @@ from snowflake.ml.model.model_signature import (
     FeatureSpec,
     ModelSignature,
     _infer_signature,
+    _truncate_data,
     _rename_signature_with_snowflake_identifiers,
 )
 
@@ -57,6 +58,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.cluster".replace("sklear
 
 DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]
 
+INFER_SIGNATURE_MAX_ROWS = 100
+
 class AgglomerativeClustering(BaseTransformer):
     r"""Agglomerative Clustering
     For more details on this class, see [sklearn.cluster.AgglomerativeClustering]
@@ -478,7 +481,7 @@ class AgglomerativeClustering(BaseTransformer):
         elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
             expected_dtype = "array"
         else:
-            output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+            output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
             # We can only infer the output types from the input types if the following two statemetns are true:
             # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
             # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
@@ -1135,7 +1138,7 @@ class AgglomerativeClustering(BaseTransformer):
 
         PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
 
-        inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
+        inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
         outputs: List[BaseFeatureSpec] = []
         if hasattr(self, "predict"):
             # keep mypy happy
@@ -1143,7 +1146,7 @@ class AgglomerativeClustering(BaseTransformer):
             # For classifier, the type of predict is the same as the type of label
             if self._sklearn_object._estimator_type == "classifier":
                 # label columns is the desired type for output
-                outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
+                outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
                 # rename the output columns
                 outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
                 self._model_signature_dict["predict"] = ModelSignature(
```
snowflake/ml/modeling/cluster/birch.py

```diff
@@ -37,6 +37,7 @@ from snowflake.ml.model.model_signature import (
     FeatureSpec,
     ModelSignature,
     _infer_signature,
+    _truncate_data,
     _rename_signature_with_snowflake_identifiers,
 )
 
@@ -57,6 +58,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.cluster".replace("sklear
 
 DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]
 
+INFER_SIGNATURE_MAX_ROWS = 100
+
 class Birch(BaseTransformer):
     r"""Implements the BIRCH clustering algorithm
     For more details on this class, see [sklearn.cluster.Birch]
@@ -442,7 +445,7 @@ class Birch(BaseTransformer):
         elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
             expected_dtype = "array"
         else:
-            output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+            output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
             # We can only infer the output types from the input types if the following two statemetns are true:
             # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
             # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
@@ -1101,7 +1104,7 @@ class Birch(BaseTransformer):
 
         PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
 
-        inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
+        inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
         outputs: List[BaseFeatureSpec] = []
         if hasattr(self, "predict"):
             # keep mypy happy
@@ -1109,7 +1112,7 @@ class Birch(BaseTransformer):
             # For classifier, the type of predict is the same as the type of label
             if self._sklearn_object._estimator_type == "classifier":
                 # label columns is the desired type for output
-                outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
+                outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
                 # rename the output columns
                 outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
                 self._model_signature_dict["predict"] = ModelSignature(
```
snowflake/ml/modeling/cluster/bisecting_k_means.py

```diff
@@ -37,6 +37,7 @@ from snowflake.ml.model.model_signature import (
     FeatureSpec,
     ModelSignature,
     _infer_signature,
+    _truncate_data,
     _rename_signature_with_snowflake_identifiers,
 )
 
@@ -57,6 +58,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.cluster".replace("sklear
 
 DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]
 
+INFER_SIGNATURE_MAX_ROWS = 100
+
 class BisectingKMeans(BaseTransformer):
     r"""Bisecting K-Means clustering
     For more details on this class, see [sklearn.cluster.BisectingKMeans]
@@ -491,7 +494,7 @@ class BisectingKMeans(BaseTransformer):
         elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
             expected_dtype = "array"
         else:
-            output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+            output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
             # We can only infer the output types from the input types if the following two statemetns are true:
             # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
             # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
@@ -1152,7 +1155,7 @@ class BisectingKMeans(BaseTransformer):
 
         PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
 
-        inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
+        inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
         outputs: List[BaseFeatureSpec] = []
         if hasattr(self, "predict"):
             # keep mypy happy
@@ -1160,7 +1163,7 @@ class BisectingKMeans(BaseTransformer):
             # For classifier, the type of predict is the same as the type of label
             if self._sklearn_object._estimator_type == "classifier":
                 # label columns is the desired type for output
-                outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
+                outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
                 # rename the output columns
                 outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
                 self._model_signature_dict["predict"] = ModelSignature(
```