snowflake-ml-python 1.7.2__py3-none-any.whl → 1.7.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (237)
  1. snowflake/cortex/__init__.py +16 -8
  2. snowflake/cortex/_classify_text.py +12 -1
  3. snowflake/cortex/_complete.py +101 -13
  4. snowflake/cortex/_embed_text_1024.py +9 -2
  5. snowflake/cortex/_embed_text_768.py +9 -2
  6. snowflake/cortex/_extract_answer.py +9 -2
  7. snowflake/cortex/_sentiment.py +9 -2
  8. snowflake/cortex/_summarize.py +9 -2
  9. snowflake/cortex/_translate.py +9 -2
  10. snowflake/ml/_internal/env_utils.py +7 -52
  11. snowflake/ml/_internal/platform_capabilities.py +87 -0
  12. snowflake/ml/_internal/utils/identifier.py +4 -2
  13. snowflake/ml/data/__init__.py +3 -0
  14. snowflake/ml/data/_internal/arrow_ingestor.py +4 -4
  15. snowflake/ml/data/data_connector.py +53 -11
  16. snowflake/ml/data/data_ingestor.py +2 -1
  17. snowflake/ml/data/torch_utils.py +18 -5
  18. snowflake/ml/dataset/dataset.py +0 -1
  19. snowflake/ml/feature_store/examples/example_helper.py +2 -1
  20. snowflake/ml/fileset/fileset.py +24 -18
  21. snowflake/ml/jobs/__init__.py +21 -0
  22. snowflake/ml/jobs/_utils/constants.py +51 -0
  23. snowflake/ml/jobs/_utils/payload_utils.py +352 -0
  24. snowflake/ml/jobs/_utils/spec_utils.py +298 -0
  25. snowflake/ml/jobs/_utils/types.py +39 -0
  26. snowflake/ml/jobs/decorators.py +91 -0
  27. snowflake/ml/jobs/job.py +113 -0
  28. snowflake/ml/jobs/manager.py +298 -0
  29. snowflake/ml/model/_client/model/model_version_impl.py +5 -3
  30. snowflake/ml/model/_client/ops/model_ops.py +13 -8
  31. snowflake/ml/model/_client/ops/service_ops.py +1 -11
  32. snowflake/ml/model/_client/sql/model_version.py +11 -0
  33. snowflake/ml/model/_client/sql/service.py +13 -6
  34. snowflake/ml/model/_model_composer/model_composer.py +8 -3
  35. snowflake/ml/model/_model_composer/model_manifest/model_manifest.py +20 -1
  36. snowflake/ml/model/_model_composer/model_manifest/model_manifest_schema.py +1 -0
  37. snowflake/ml/model/_model_composer/model_method/constants.py +1 -0
  38. snowflake/ml/model/_model_composer/model_method/function_generator.py +2 -0
  39. snowflake/ml/model/_model_composer/model_method/infer_function.py_template +1 -1
  40. snowflake/ml/model/_model_composer/model_method/infer_partitioned.py_template +1 -1
  41. snowflake/ml/model/_model_composer/model_method/infer_table_function.py_template +1 -1
  42. snowflake/ml/model/_model_composer/model_method/model_method.py +9 -1
  43. snowflake/ml/model/_model_composer/model_user_file/model_user_file.py +27 -0
  44. snowflake/ml/model/_packager/model_handlers/_utils.py +39 -5
  45. snowflake/ml/model/_packager/model_handlers/catboost.py +3 -3
  46. snowflake/ml/model/_packager/model_handlers/custom.py +1 -2
  47. snowflake/ml/model/_packager/model_handlers/huggingface_pipeline.py +6 -1
  48. snowflake/ml/model/_packager/model_handlers/lightgbm.py +5 -3
  49. snowflake/ml/model/_packager/model_handlers/sentence_transformers.py +55 -20
  50. snowflake/ml/model/_packager/model_handlers/sklearn.py +9 -10
  51. snowflake/ml/model/_packager/model_handlers/snowmlmodel.py +66 -28
  52. snowflake/ml/model/_packager/model_handlers/tensorflow.py +70 -17
  53. snowflake/ml/model/_packager/model_handlers/xgboost.py +3 -3
  54. snowflake/ml/model/_packager/model_meta/model_meta.py +3 -0
  55. snowflake/ml/model/_packager/model_meta/model_meta_schema.py +6 -1
  56. snowflake/ml/model/_packager/model_runtime/_snowml_inference_alternative_requirements.py +2 -2
  57. snowflake/ml/model/_packager/model_task/model_task_utils.py +3 -2
  58. snowflake/ml/model/_signatures/base_handler.py +1 -2
  59. snowflake/ml/model/_signatures/builtins_handler.py +2 -2
  60. snowflake/ml/model/_signatures/numpy_handler.py +6 -7
  61. snowflake/ml/model/_signatures/pandas_handler.py +3 -3
  62. snowflake/ml/model/_signatures/pytorch_handler.py +2 -5
  63. snowflake/ml/model/_signatures/snowpark_handler.py +11 -5
  64. snowflake/ml/model/_signatures/tensorflow_handler.py +2 -7
  65. snowflake/ml/model/model_signature.py +17 -4
  66. snowflake/ml/model/type_hints.py +1 -0
  67. snowflake/ml/modeling/_internal/model_trainer_builder.py +0 -8
  68. snowflake/ml/modeling/_internal/model_transformer_builder.py +0 -13
  69. snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +6 -3
  70. snowflake/ml/modeling/cluster/affinity_propagation.py +6 -3
  71. snowflake/ml/modeling/cluster/agglomerative_clustering.py +6 -3
  72. snowflake/ml/modeling/cluster/birch.py +6 -3
  73. snowflake/ml/modeling/cluster/bisecting_k_means.py +6 -3
  74. snowflake/ml/modeling/cluster/dbscan.py +6 -3
  75. snowflake/ml/modeling/cluster/feature_agglomeration.py +6 -3
  76. snowflake/ml/modeling/cluster/k_means.py +6 -3
  77. snowflake/ml/modeling/cluster/mean_shift.py +6 -3
  78. snowflake/ml/modeling/cluster/mini_batch_k_means.py +6 -3
  79. snowflake/ml/modeling/cluster/optics.py +6 -3
  80. snowflake/ml/modeling/cluster/spectral_biclustering.py +6 -3
  81. snowflake/ml/modeling/cluster/spectral_clustering.py +6 -3
  82. snowflake/ml/modeling/cluster/spectral_coclustering.py +6 -3
  83. snowflake/ml/modeling/compose/column_transformer.py +6 -3
  84. snowflake/ml/modeling/compose/transformed_target_regressor.py +6 -3
  85. snowflake/ml/modeling/covariance/elliptic_envelope.py +6 -3
  86. snowflake/ml/modeling/covariance/empirical_covariance.py +6 -3
  87. snowflake/ml/modeling/covariance/graphical_lasso.py +6 -3
  88. snowflake/ml/modeling/covariance/graphical_lasso_cv.py +6 -3
  89. snowflake/ml/modeling/covariance/ledoit_wolf.py +6 -3
  90. snowflake/ml/modeling/covariance/min_cov_det.py +6 -3
  91. snowflake/ml/modeling/covariance/oas.py +6 -3
  92. snowflake/ml/modeling/covariance/shrunk_covariance.py +6 -3
  93. snowflake/ml/modeling/decomposition/dictionary_learning.py +6 -3
  94. snowflake/ml/modeling/decomposition/factor_analysis.py +6 -3
  95. snowflake/ml/modeling/decomposition/fast_ica.py +6 -3
  96. snowflake/ml/modeling/decomposition/incremental_pca.py +6 -3
  97. snowflake/ml/modeling/decomposition/kernel_pca.py +6 -3
  98. snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +6 -3
  99. snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +6 -3
  100. snowflake/ml/modeling/decomposition/pca.py +6 -3
  101. snowflake/ml/modeling/decomposition/sparse_pca.py +6 -3
  102. snowflake/ml/modeling/decomposition/truncated_svd.py +6 -3
  103. snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +6 -3
  104. snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +6 -3
  105. snowflake/ml/modeling/ensemble/ada_boost_classifier.py +6 -3
  106. snowflake/ml/modeling/ensemble/ada_boost_regressor.py +6 -3
  107. snowflake/ml/modeling/ensemble/bagging_classifier.py +6 -3
  108. snowflake/ml/modeling/ensemble/bagging_regressor.py +6 -3
  109. snowflake/ml/modeling/ensemble/extra_trees_classifier.py +6 -3
  110. snowflake/ml/modeling/ensemble/extra_trees_regressor.py +6 -3
  111. snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +6 -3
  112. snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +6 -3
  113. snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +6 -3
  114. snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +6 -3
  115. snowflake/ml/modeling/ensemble/isolation_forest.py +6 -3
  116. snowflake/ml/modeling/ensemble/random_forest_classifier.py +6 -3
  117. snowflake/ml/modeling/ensemble/random_forest_regressor.py +6 -3
  118. snowflake/ml/modeling/ensemble/stacking_regressor.py +6 -3
  119. snowflake/ml/modeling/ensemble/voting_classifier.py +6 -3
  120. snowflake/ml/modeling/ensemble/voting_regressor.py +6 -3
  121. snowflake/ml/modeling/feature_selection/generic_univariate_select.py +6 -3
  122. snowflake/ml/modeling/feature_selection/select_fdr.py +6 -3
  123. snowflake/ml/modeling/feature_selection/select_fpr.py +6 -3
  124. snowflake/ml/modeling/feature_selection/select_fwe.py +6 -3
  125. snowflake/ml/modeling/feature_selection/select_k_best.py +6 -3
  126. snowflake/ml/modeling/feature_selection/select_percentile.py +6 -3
  127. snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +6 -3
  128. snowflake/ml/modeling/feature_selection/variance_threshold.py +6 -3
  129. snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +6 -3
  130. snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +6 -3
  131. snowflake/ml/modeling/impute/iterative_imputer.py +6 -3
  132. snowflake/ml/modeling/impute/knn_imputer.py +6 -3
  133. snowflake/ml/modeling/impute/missing_indicator.py +6 -3
  134. snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +6 -3
  135. snowflake/ml/modeling/kernel_approximation/nystroem.py +6 -3
  136. snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +6 -3
  137. snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +6 -3
  138. snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +6 -3
  139. snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +6 -3
  140. snowflake/ml/modeling/lightgbm/lgbm_classifier.py +6 -3
  141. snowflake/ml/modeling/lightgbm/lgbm_regressor.py +6 -3
  142. snowflake/ml/modeling/linear_model/ard_regression.py +6 -3
  143. snowflake/ml/modeling/linear_model/bayesian_ridge.py +6 -3
  144. snowflake/ml/modeling/linear_model/elastic_net.py +6 -3
  145. snowflake/ml/modeling/linear_model/elastic_net_cv.py +6 -3
  146. snowflake/ml/modeling/linear_model/gamma_regressor.py +6 -3
  147. snowflake/ml/modeling/linear_model/huber_regressor.py +6 -3
  148. snowflake/ml/modeling/linear_model/lars.py +6 -3
  149. snowflake/ml/modeling/linear_model/lars_cv.py +6 -3
  150. snowflake/ml/modeling/linear_model/lasso.py +6 -3
  151. snowflake/ml/modeling/linear_model/lasso_cv.py +6 -3
  152. snowflake/ml/modeling/linear_model/lasso_lars.py +6 -3
  153. snowflake/ml/modeling/linear_model/lasso_lars_cv.py +6 -3
  154. snowflake/ml/modeling/linear_model/lasso_lars_ic.py +6 -3
  155. snowflake/ml/modeling/linear_model/linear_regression.py +6 -3
  156. snowflake/ml/modeling/linear_model/logistic_regression.py +6 -3
  157. snowflake/ml/modeling/linear_model/logistic_regression_cv.py +6 -3
  158. snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +6 -3
  159. snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +6 -3
  160. snowflake/ml/modeling/linear_model/multi_task_lasso.py +6 -3
  161. snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +6 -3
  162. snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +6 -3
  163. snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +6 -3
  164. snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +6 -3
  165. snowflake/ml/modeling/linear_model/perceptron.py +6 -3
  166. snowflake/ml/modeling/linear_model/poisson_regressor.py +6 -3
  167. snowflake/ml/modeling/linear_model/ransac_regressor.py +6 -3
  168. snowflake/ml/modeling/linear_model/ridge.py +6 -3
  169. snowflake/ml/modeling/linear_model/ridge_classifier.py +6 -3
  170. snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +6 -3
  171. snowflake/ml/modeling/linear_model/ridge_cv.py +6 -3
  172. snowflake/ml/modeling/linear_model/sgd_classifier.py +6 -3
  173. snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +6 -3
  174. snowflake/ml/modeling/linear_model/sgd_regressor.py +6 -3
  175. snowflake/ml/modeling/linear_model/theil_sen_regressor.py +6 -3
  176. snowflake/ml/modeling/linear_model/tweedie_regressor.py +6 -3
  177. snowflake/ml/modeling/manifold/isomap.py +6 -3
  178. snowflake/ml/modeling/manifold/mds.py +6 -3
  179. snowflake/ml/modeling/manifold/spectral_embedding.py +6 -3
  180. snowflake/ml/modeling/manifold/tsne.py +6 -3
  181. snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +6 -3
  182. snowflake/ml/modeling/mixture/gaussian_mixture.py +6 -3
  183. snowflake/ml/modeling/model_selection/grid_search_cv.py +17 -2
  184. snowflake/ml/modeling/model_selection/randomized_search_cv.py +17 -2
  185. snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +6 -3
  186. snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +6 -3
  187. snowflake/ml/modeling/multiclass/output_code_classifier.py +6 -3
  188. snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +6 -3
  189. snowflake/ml/modeling/naive_bayes/categorical_nb.py +6 -3
  190. snowflake/ml/modeling/naive_bayes/complement_nb.py +6 -3
  191. snowflake/ml/modeling/naive_bayes/gaussian_nb.py +6 -3
  192. snowflake/ml/modeling/naive_bayes/multinomial_nb.py +6 -3
  193. snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +6 -3
  194. snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +6 -3
  195. snowflake/ml/modeling/neighbors/kernel_density.py +6 -3
  196. snowflake/ml/modeling/neighbors/local_outlier_factor.py +6 -3
  197. snowflake/ml/modeling/neighbors/nearest_centroid.py +6 -3
  198. snowflake/ml/modeling/neighbors/nearest_neighbors.py +6 -3
  199. snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +6 -3
  200. snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +6 -3
  201. snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +6 -3
  202. snowflake/ml/modeling/neural_network/bernoulli_rbm.py +6 -3
  203. snowflake/ml/modeling/neural_network/mlp_classifier.py +6 -3
  204. snowflake/ml/modeling/neural_network/mlp_regressor.py +6 -3
  205. snowflake/ml/modeling/pipeline/pipeline.py +16 -178
  206. snowflake/ml/modeling/preprocessing/polynomial_features.py +6 -3
  207. snowflake/ml/modeling/semi_supervised/label_propagation.py +6 -3
  208. snowflake/ml/modeling/semi_supervised/label_spreading.py +6 -3
  209. snowflake/ml/modeling/svm/linear_svc.py +6 -3
  210. snowflake/ml/modeling/svm/linear_svr.py +6 -3
  211. snowflake/ml/modeling/svm/nu_svc.py +6 -3
  212. snowflake/ml/modeling/svm/nu_svr.py +6 -3
  213. snowflake/ml/modeling/svm/svc.py +6 -3
  214. snowflake/ml/modeling/svm/svr.py +6 -3
  215. snowflake/ml/modeling/tree/decision_tree_classifier.py +6 -3
  216. snowflake/ml/modeling/tree/decision_tree_regressor.py +6 -3
  217. snowflake/ml/modeling/tree/extra_tree_classifier.py +6 -3
  218. snowflake/ml/modeling/tree/extra_tree_regressor.py +6 -3
  219. snowflake/ml/modeling/xgboost/xgb_classifier.py +167 -91
  220. snowflake/ml/modeling/xgboost/xgb_regressor.py +166 -88
  221. snowflake/ml/modeling/xgboost/xgbrf_classifier.py +166 -88
  222. snowflake/ml/modeling/xgboost/xgbrf_regressor.py +166 -88
  223. snowflake/ml/monitoring/_client/model_monitor_sql_client.py +4 -4
  224. snowflake/ml/registry/_manager/model_manager.py +70 -33
  225. snowflake/ml/registry/registry.py +41 -22
  226. snowflake/ml/version.py +1 -1
  227. {snowflake_ml_python-1.7.2.dist-info → snowflake_ml_python-1.7.4.dist-info}/METADATA +63 -19
  228. {snowflake_ml_python-1.7.2.dist-info → snowflake_ml_python-1.7.4.dist-info}/RECORD +231 -226
  229. {snowflake_ml_python-1.7.2.dist-info → snowflake_ml_python-1.7.4.dist-info}/WHEEL +1 -1
  230. snowflake/ml/_internal/utils/retryable_http.py +0 -39
  231. snowflake/ml/fileset/parquet_parser.py +0 -170
  232. snowflake/ml/fileset/tf_dataset.py +0 -88
  233. snowflake/ml/fileset/torch_datapipe.py +0 -57
  234. snowflake/ml/modeling/_internal/ml_runtime_implementations/ml_runtime_handlers.py +0 -151
  235. snowflake/ml/modeling/_internal/ml_runtime_implementations/ml_runtime_trainer.py +0 -66
  236. {snowflake_ml_python-1.7.2.dist-info → snowflake_ml_python-1.7.4.dist-info}/LICENSE.txt +0 -0
  237. {snowflake_ml_python-1.7.2.dist-info → snowflake_ml_python-1.7.4.dist-info}/top_level.txt +0 -0
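
The headline addition in this release is the new snowflake.ml.jobs module (items 21-28 above). The sketch below is a hedged illustration of the decorator-based entry point implied by decorators.py and job.py; the compute pool name, stage name, and parameter names are assumptions, not confirmed API.

  # Hedged sketch of the snowflake.ml.jobs preview API introduced here.
  # "MY_COMPUTE_POOL" and "payload_stage" are placeholder names; the exact
  # decorator parameters are assumed from the new file names above.
  from snowflake.ml.jobs import remote

  @remote("MY_COMPUTE_POOL", stage_name="payload_stage")
  def train_model(table_name: str) -> None:
      # Body executes headless on a Snowpark Container Services compute pool.
      print(f"training on {table_name}")

  job = train_model("MY_DB.MY_SCHEMA.TRAIN_DATA")  # submits the job, returns a handle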
snowflake/ml/modeling/pipeline/pipeline.py

@@ -20,7 +20,11 @@ from snowflake.ml._internal.exceptions import error_codes, exceptions
  from snowflake.ml._internal.lineage import lineage_utils
  from snowflake.ml._internal.utils import snowpark_dataframe_utils, temp_file_utils
  from snowflake.ml.data import data_source
- from snowflake.ml.model.model_signature import ModelSignature, _infer_signature
+ from snowflake.ml.model.model_signature import (
+     ModelSignature,
+     _infer_signature,
+     _truncate_data,
+ )
  from snowflake.ml.modeling._internal.model_transformer_builder import (
      ModelTransformerBuilder,
  )
@@ -30,7 +34,8 @@ from snowflake.snowpark._internal import utils as snowpark_utils

  _PROJECT = "ModelDevelopment"
  _SUBPROJECT = "Framework"
- IN_ML_RUNTIME_ENV_VAR = "IN_SPCS_ML_RUNTIME"
+
+ INFER_SIGNATURE_MAX_ROWS = 100


  def _final_step_has(attr: str) -> Callable[..., bool]:
@@ -432,10 +437,7 @@ class Pipeline(base.BaseTransformer):
          data_sources = [data_source.DataFrameInfo(dataset.queries["queries"][-1])]
          lineage_utils.set_data_sources(self, data_sources)

-         if self._can_be_trained_in_ml_runtime(dataset):
-             self._fit_ml_runtime(dataset)
-
-         elif squash and isinstance(dataset, snowpark.DataFrame):
+         if squash and isinstance(dataset, snowpark.DataFrame):
              session = dataset._session
              assert session is not None
              self._fit_snowpark_dataframe_within_one_sproc(session=session, dataset=dataset)
@@ -606,25 +608,7 @@ class Pipeline(base.BaseTransformer):
          Returns:
              Output dataset.
          """
-         if os.environ.get(IN_ML_RUNTIME_ENV_VAR) and self._sklearn_object is not None:
-             expected_output_cols = self._infer_output_cols()
-             handler = ModelTransformerBuilder.build(
-                 dataset=dataset,
-                 estimator=self._sklearn_object,
-                 class_name="Pipeline",
-                 subproject="",
-                 autogenerated=False,
-             )
-             return handler.batch_inference(
-                 inference_method="predict",
-                 input_cols=self.input_cols if self.input_cols else self._infer_input_cols(dataset),
-                 expected_output_cols=expected_output_cols,
-                 session=dataset._session,
-                 dependencies=self._deps,
-             )
-
-         else:
-             return self._invoke_estimator_func("predict", dataset)
+         return self._invoke_estimator_func("predict", dataset)

      @metaestimators.available_if(_final_step_has("score_samples")) # type: ignore[misc]
      @telemetry.send_api_usage_telemetry(
@@ -642,32 +626,8 @@ class Pipeline(base.BaseTransformer):

          Returns:
              Output dataset.
-
-         Raises:
-             ValueError: An sklearn object has not been fit before calling this function
          """
-
-         if os.environ.get(IN_ML_RUNTIME_ENV_VAR):
-             if self._sklearn_object is None:
-                 raise ValueError("Model must be fit before inference.")
-
-             expected_output_cols = self._get_output_column_names("score_samples")
-             handler = ModelTransformerBuilder.build(
-                 dataset=dataset,
-                 estimator=self._sklearn_object,
-                 class_name="Pipeline",
-                 subproject="",
-                 autogenerated=False,
-             )
-             return handler.batch_inference(
-                 inference_method="score_samples",
-                 input_cols=self.input_cols if self.input_cols else self._infer_input_cols(dataset),
-                 expected_output_cols=expected_output_cols,
-                 session=dataset._session,
-                 dependencies=self._deps,
-             )
-         else:
-             return self._invoke_estimator_func("score_samples", dataset)
+         return self._invoke_estimator_func("score_samples", dataset)

      @metaestimators.available_if(_final_step_has("predict_proba")) # type: ignore[misc]
      @telemetry.send_api_usage_telemetry(
@@ -685,32 +645,8 @@ class Pipeline(base.BaseTransformer):

          Returns:
              Output dataset.
-
-         Raises:
-             ValueError: An sklearn object has not been fit before calling this function
          """
-
-         if os.environ.get(IN_ML_RUNTIME_ENV_VAR):
-             if self._sklearn_object is None:
-                 raise ValueError("Model must be fit before inference.")
-             expected_output_cols = self._get_output_column_names("predict_proba")
-
-             handler = ModelTransformerBuilder.build(
-                 dataset=dataset,
-                 estimator=self._sklearn_object,
-                 class_name="Pipeline",
-                 subproject="",
-                 autogenerated=False,
-             )
-             return handler.batch_inference(
-                 inference_method="predict_proba",
-                 input_cols=self.input_cols if self.input_cols else self._infer_input_cols(dataset),
-                 expected_output_cols=expected_output_cols,
-                 session=dataset._session,
-                 dependencies=self._deps,
-             )
-         else:
-             return self._invoke_estimator_func("predict_proba", dataset)
+         return self._invoke_estimator_func("predict_proba", dataset)

      @metaestimators.available_if(_final_step_has("predict_log_proba")) # type: ignore[misc]
      @telemetry.send_api_usage_telemetry(
@@ -729,31 +665,8 @@ class Pipeline(base.BaseTransformer):

          Returns:
              Output dataset.
-
-         Raises:
-             ValueError: An sklearn object has not been fit before calling this function
          """
-         if os.environ.get(IN_ML_RUNTIME_ENV_VAR):
-             if self._sklearn_object is None:
-                 raise ValueError("Model must be fit before inference.")
-
-             expected_output_cols = self._get_output_column_names("predict_log_proba")
-             handler = ModelTransformerBuilder.build(
-                 dataset=dataset,
-                 estimator=self._sklearn_object,
-                 class_name="Pipeline",
-                 subproject="",
-                 autogenerated=False,
-             )
-             return handler.batch_inference(
-                 inference_method="predict_log_proba",
-                 input_cols=self.input_cols if self.input_cols else self._infer_input_cols(dataset),
-                 expected_output_cols=expected_output_cols,
-                 session=dataset._session,
-                 dependencies=self._deps,
-             )
-         else:
-             return self._invoke_estimator_func("predict_log_proba", dataset)
+         return self._invoke_estimator_func("predict_log_proba", dataset)

      @metaestimators.available_if(_final_step_has("score")) # type: ignore[misc]
      @telemetry.send_api_usage_telemetry(
@@ -769,30 +682,9 @@ class Pipeline(base.BaseTransformer):

          Returns:
              Output dataset.
-
-         Raises:
-             ValueError: An sklearn object has not been fit before calling this function
          """

-         if os.environ.get(IN_ML_RUNTIME_ENV_VAR):
-             if self._sklearn_object is None:
-                 raise ValueError("Model must be fit before scoreing.")
-             handler = ModelTransformerBuilder.build(
-                 dataset=dataset,
-                 estimator=self._sklearn_object,
-                 class_name="Pipeline",
-                 subproject="",
-                 autogenerated=False,
-             )
-             return handler.score(
-                 input_cols=self._infer_input_cols(),
-                 label_cols=self._get_label_cols(),
-                 session=dataset._session,
-                 dependencies=self._deps,
-                 score_sproc_imports=[],
-             )
-         else:
-             return self._invoke_estimator_func("score", dataset)
+         return self._invoke_estimator_func("score", dataset)

      def _invoke_estimator_func(
          self, func_name: str, dataset: Union[snowpark.DataFrame, pd.DataFrame]
@@ -882,39 +774,6 @@ class Pipeline(base.BaseTransformer):

          return ct

-     def _fit_ml_runtime(self, dataset: snowpark.DataFrame) -> None:
-         """Train the pipeline in the ML Runtime.
-
-         Args:
-             dataset: The training Snowpark dataframe
-
-         Raises:
-             ModuleNotFoundError: The ML Runtime Client is not installed.
-         """
-         try:
-             from snowflake.ml.runtime import MLRuntimeClient
-         except ModuleNotFoundError as e:
-             # The snowflake.ml.runtime module should always be present when
-             # the env var IN_SPCS_ML_RUNTIME is present.
-             raise ModuleNotFoundError("ML Runtime Python Client is not installed.") from e
-
-         client = MLRuntimeClient()
-         ml_runtime_compatible_pipeline = self._create_unfitted_sklearn_object()
-
-         label_cols = self._get_label_cols()
-         all_df_cols = dataset.columns
-         input_cols = [col for col in all_df_cols if col not in label_cols]
-
-         trained_pipeline = client.train(
-             estimator=ml_runtime_compatible_pipeline,
-             dataset=dataset,
-             input_cols=input_cols,
-             label_cols=label_cols,
-             sample_weight_col=self.sample_weight_col,
-         )
-
-         self._sklearn_object = trained_pipeline
-
      def _get_label_cols(self) -> List[str]:
          """Util function to get the label columns from the pipeline.
          The label column is only present in the estimator
@@ -929,28 +788,6 @@ class Pipeline(base.BaseTransformer):

          return label_cols

-     def _can_be_trained_in_ml_runtime(self, dataset: Union[snowpark.DataFrame, pd.DataFrame]) -> bool:
-         """A utility function to determine if the pipeline cam be pushed down to the ML Runtime for training.
-         Currently, this is true if:
-         - The training dataset is a snowpark dataframe,
-         - The IN_SPCS_ML_RUNTIME environment is present and
-         - The pipeline can be converted to an sklearn pipeline.
-
-         Args:
-             dataset: The training dataset
-
-         Returns:
-             True if the dataset can be fit in the ml runtime, else false.
-
-         """
-         if not isinstance(dataset, snowpark.DataFrame):
-             return False
-
-         if not os.environ.get(IN_ML_RUNTIME_ENV_VAR):
-             return False
-
-         return self._is_convertible_to_sklearn
-
      @staticmethod
      def _wrap_transformer_in_column_transformer(
          transformer_name: str, transformer: base.BaseTransformer
@@ -1054,7 +891,9 @@ class Pipeline(base.BaseTransformer):
          self._model_signature_dict = dict()

          input_columns = self._get_sanitized_list_of_columns(dataset.columns)
-         inputs_signature = _infer_signature(dataset[input_columns], "input", use_snowflake_identifiers=True)
+         inputs_signature = _infer_signature(
+             _truncate_data(dataset[input_columns], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True
+         )

          estimator_step = self._get_estimator()
          if estimator_step:
@@ -1124,7 +963,6 @@ class Pipeline(base.BaseTransformer):

          telemetry_data = {
              "pipeline_is_convertible_to_sklearn": self._is_convertible_to_sklearn,
-             "in_spcs_ml_runtime": bool(os.environ.get(IN_ML_RUNTIME_ENV_VAR)),
          }
          telemetry.send_custom_usage(
              project=_PROJECT,
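
The recurring change in pipeline.py above, and in each estimator file below, is that signature inference now samples at most INFER_SIGNATURE_MAX_ROWS = 100 rows instead of scanning the full dataset. A minimal sketch of what _truncate_data presumably does (the real helper lives in snowflake.ml.model.model_signature; this standalone reimplementation is an assumption for illustration only):

  import pandas as pd

  INFER_SIGNATURE_MAX_ROWS = 100  # constant introduced in this release

  def _truncate_data_sketch(data, max_rows: int = INFER_SIGNATURE_MAX_ROWS):
      # Cap the sample used for signature inference (assumed behavior).
      if isinstance(data, pd.DataFrame):
          return data.head(max_rows)
      return data.limit(max_rows)  # Snowpark DataFrame path; limit() is real API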
snowflake/ml/modeling/preprocessing/polynomial_features.py

@@ -37,6 +37,7 @@ from snowflake.ml.model.model_signature import (
      FeatureSpec,
      ModelSignature,
      _infer_signature,
+     _truncate_data,
      _rename_signature_with_snowflake_identifiers,
  )

@@ -57,6 +58,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.preprocessing".replace("

  DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]

+ INFER_SIGNATURE_MAX_ROWS = 100
+
  class PolynomialFeatures(BaseTransformer):
      r"""Generate polynomial and interaction features
      For more details on this class, see [sklearn.preprocessing.PolynomialFeatures]
@@ -429,7 +432,7 @@ class PolynomialFeatures(BaseTransformer):
          elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
              expected_dtype = "array"
          else:
-             output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+             output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
              # We can only infer the output types from the input types if the following two statemetns are true:
              # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
              # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
@@ -1086,7 +1089,7 @@ class PolynomialFeatures(BaseTransformer):

          PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]

-         inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
+         inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
          outputs: List[BaseFeatureSpec] = []
          if hasattr(self, "predict"):
              # keep mypy happy
@@ -1094,7 +1097,7 @@ class PolynomialFeatures(BaseTransformer):
              # For classifier, the type of predict is the same as the type of label
              if self._sklearn_object._estimator_type == "classifier":
                  # label columns is the desired type for output
-                 outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
+                 outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
                  # rename the output columns
                  outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
              self._model_signature_dict["predict"] = ModelSignature(
snowflake/ml/modeling/semi_supervised/label_propagation.py

@@ -37,6 +37,7 @@ from snowflake.ml.model.model_signature import (
      FeatureSpec,
      ModelSignature,
      _infer_signature,
+     _truncate_data,
      _rename_signature_with_snowflake_identifiers,
  )

@@ -57,6 +58,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.semi_supervised".replace

  DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]

+ INFER_SIGNATURE_MAX_ROWS = 100
+
  class LabelPropagation(BaseTransformer):
      r"""Label Propagation classifier
      For more details on this class, see [sklearn.semi_supervised.LabelPropagation]
@@ -433,7 +436,7 @@ class LabelPropagation(BaseTransformer):
          elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
              expected_dtype = "array"
          else:
-             output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+             output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
              # We can only infer the output types from the input types if the following two statemetns are true:
              # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
              # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
@@ -1094,7 +1097,7 @@ class LabelPropagation(BaseTransformer):

          PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]

-         inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
+         inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
          outputs: List[BaseFeatureSpec] = []
          if hasattr(self, "predict"):
              # keep mypy happy
@@ -1102,7 +1105,7 @@ class LabelPropagation(BaseTransformer):
              # For classifier, the type of predict is the same as the type of label
              if self._sklearn_object._estimator_type == "classifier":
                  # label columns is the desired type for output
-                 outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
+                 outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
                  # rename the output columns
                  outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
              self._model_signature_dict["predict"] = ModelSignature(
snowflake/ml/modeling/semi_supervised/label_spreading.py

@@ -37,6 +37,7 @@ from snowflake.ml.model.model_signature import (
      FeatureSpec,
      ModelSignature,
      _infer_signature,
+     _truncate_data,
      _rename_signature_with_snowflake_identifiers,
  )

@@ -57,6 +58,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.semi_supervised".replace

  DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]

+ INFER_SIGNATURE_MAX_ROWS = 100
+
  class LabelSpreading(BaseTransformer):
      r"""LabelSpreading model for semi-supervised learning
      For more details on this class, see [sklearn.semi_supervised.LabelSpreading]
@@ -442,7 +445,7 @@ class LabelSpreading(BaseTransformer):
          elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
              expected_dtype = "array"
          else:
-             output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+             output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
              # We can only infer the output types from the input types if the following two statemetns are true:
              # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
              # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
@@ -1103,7 +1106,7 @@ class LabelSpreading(BaseTransformer):

          PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]

-         inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
+         inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
          outputs: List[BaseFeatureSpec] = []
          if hasattr(self, "predict"):
              # keep mypy happy
@@ -1111,7 +1114,7 @@ class LabelSpreading(BaseTransformer):
              # For classifier, the type of predict is the same as the type of label
              if self._sklearn_object._estimator_type == "classifier":
                  # label columns is the desired type for output
-                 outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
+                 outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
                  # rename the output columns
                  outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
              self._model_signature_dict["predict"] = ModelSignature(
snowflake/ml/modeling/svm/linear_svc.py

@@ -37,6 +37,7 @@ from snowflake.ml.model.model_signature import (
      FeatureSpec,
      ModelSignature,
      _infer_signature,
+     _truncate_data,
      _rename_signature_with_snowflake_identifiers,
  )

@@ -57,6 +58,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.svm".replace("sklearn.",

  DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]

+ INFER_SIGNATURE_MAX_ROWS = 100
+
  class LinearSVC(BaseTransformer):
      r"""Linear Support Vector Classification
      For more details on this class, see [sklearn.svm.LinearSVC]
@@ -507,7 +510,7 @@ class LinearSVC(BaseTransformer):
          elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
              expected_dtype = "array"
          else:
-             output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+             output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
              # We can only infer the output types from the input types if the following two statemetns are true:
              # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
              # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
@@ -1166,7 +1169,7 @@ class LinearSVC(BaseTransformer):

          PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]

-         inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
+         inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
          outputs: List[BaseFeatureSpec] = []
          if hasattr(self, "predict"):
              # keep mypy happy
@@ -1174,7 +1177,7 @@ class LinearSVC(BaseTransformer):
              # For classifier, the type of predict is the same as the type of label
              if self._sklearn_object._estimator_type == "classifier":
                  # label columns is the desired type for output
-                 outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
+                 outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
                  # rename the output columns
                  outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
              self._model_signature_dict["predict"] = ModelSignature(
snowflake/ml/modeling/svm/linear_svr.py

@@ -37,6 +37,7 @@ from snowflake.ml.model.model_signature import (
      FeatureSpec,
      ModelSignature,
      _infer_signature,
+     _truncate_data,
      _rename_signature_with_snowflake_identifiers,
  )

@@ -57,6 +58,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.svm".replace("sklearn.",

  DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]

+ INFER_SIGNATURE_MAX_ROWS = 100
+
  class LinearSVR(BaseTransformer):
      r"""Linear Support Vector Regression
      For more details on this class, see [sklearn.svm.LinearSVR]
@@ -476,7 +479,7 @@ class LinearSVR(BaseTransformer):
          elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
              expected_dtype = "array"
          else:
-             output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+             output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
              # We can only infer the output types from the input types if the following two statemetns are true:
              # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
              # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
@@ -1133,7 +1136,7 @@ class LinearSVR(BaseTransformer):

          PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]

-         inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
+         inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
          outputs: List[BaseFeatureSpec] = []
          if hasattr(self, "predict"):
              # keep mypy happy
@@ -1141,7 +1144,7 @@ class LinearSVR(BaseTransformer):
              # For classifier, the type of predict is the same as the type of label
              if self._sklearn_object._estimator_type == "classifier":
                  # label columns is the desired type for output
-                 outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
+                 outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
                  # rename the output columns
                  outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
              self._model_signature_dict["predict"] = ModelSignature(
snowflake/ml/modeling/svm/nu_svc.py

@@ -37,6 +37,7 @@ from snowflake.ml.model.model_signature import (
      FeatureSpec,
      ModelSignature,
      _infer_signature,
+     _truncate_data,
      _rename_signature_with_snowflake_identifiers,
  )

@@ -57,6 +58,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.svm".replace("sklearn.",

  DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]

+ INFER_SIGNATURE_MAX_ROWS = 100
+
  class NuSVC(BaseTransformer):
      r"""Nu-Support Vector Classification
      For more details on this class, see [sklearn.svm.NuSVC]
@@ -506,7 +509,7 @@ class NuSVC(BaseTransformer):
          elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
              expected_dtype = "array"
          else:
-             output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+             output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
              # We can only infer the output types from the input types if the following two statemetns are true:
              # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
              # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
@@ -1169,7 +1172,7 @@ class NuSVC(BaseTransformer):

          PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]

-         inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
+         inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
          outputs: List[BaseFeatureSpec] = []
          if hasattr(self, "predict"):
              # keep mypy happy
@@ -1177,7 +1180,7 @@ class NuSVC(BaseTransformer):
              # For classifier, the type of predict is the same as the type of label
              if self._sklearn_object._estimator_type == "classifier":
                  # label columns is the desired type for output
-                 outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
+                 outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
                  # rename the output columns
                  outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
              self._model_signature_dict["predict"] = ModelSignature(
snowflake/ml/modeling/svm/nu_svr.py

@@ -37,6 +37,7 @@ from snowflake.ml.model.model_signature import (
      FeatureSpec,
      ModelSignature,
      _infer_signature,
+     _truncate_data,
      _rename_signature_with_snowflake_identifiers,
  )

@@ -57,6 +58,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.svm".replace("sklearn.",

  DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]

+ INFER_SIGNATURE_MAX_ROWS = 100
+
  class NuSVR(BaseTransformer):
      r"""Nu Support Vector Regression
      For more details on this class, see [sklearn.svm.NuSVR]
@@ -467,7 +470,7 @@ class NuSVR(BaseTransformer):
          elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
              expected_dtype = "array"
          else:
-             output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+             output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
              # We can only infer the output types from the input types if the following two statemetns are true:
              # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
              # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
@@ -1124,7 +1127,7 @@ class NuSVR(BaseTransformer):

          PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]

-         inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
+         inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
          outputs: List[BaseFeatureSpec] = []
          if hasattr(self, "predict"):
              # keep mypy happy
@@ -1132,7 +1135,7 @@ class NuSVR(BaseTransformer):
              # For classifier, the type of predict is the same as the type of label
              if self._sklearn_object._estimator_type == "classifier":
                  # label columns is the desired type for output
-                 outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
+                 outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
                  # rename the output columns
                  outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
              self._model_signature_dict["predict"] = ModelSignature(
snowflake/ml/modeling/svm/svc.py

@@ -37,6 +37,7 @@ from snowflake.ml.model.model_signature import (
      FeatureSpec,
      ModelSignature,
      _infer_signature,
+     _truncate_data,
      _rename_signature_with_snowflake_identifiers,
  )

@@ -57,6 +58,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.svm".replace("sklearn.",

  DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]

+ INFER_SIGNATURE_MAX_ROWS = 100
+
  class SVC(BaseTransformer):
      r"""C-Support Vector Classification
      For more details on this class, see [sklearn.svm.SVC]
@@ -511,7 +514,7 @@ class SVC(BaseTransformer):
          elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
              expected_dtype = "array"
          else:
-             output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+             output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
              # We can only infer the output types from the input types if the following two statemetns are true:
              # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
              # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
@@ -1174,7 +1177,7 @@ class SVC(BaseTransformer):

          PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]

-         inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
+         inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
          outputs: List[BaseFeatureSpec] = []
          if hasattr(self, "predict"):
              # keep mypy happy
@@ -1182,7 +1185,7 @@ class SVC(BaseTransformer):
              # For classifier, the type of predict is the same as the type of label
              if self._sklearn_object._estimator_type == "classifier":
                  # label columns is the desired type for output
-                 outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
+                 outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
                  # rename the output columns
                  outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
              self._model_signature_dict["predict"] = ModelSignature(