snowflake-ml-python 1.2.0__py3-none-any.whl → 1.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (194)
  1. snowflake/ml/_internal/env_utils.py +16 -13
  2. snowflake/ml/_internal/exceptions/modeling_error_messages.py +5 -1
  3. snowflake/ml/_internal/telemetry.py +19 -0
  4. snowflake/ml/feature_store/__init__.py +9 -0
  5. snowflake/ml/feature_store/entity.py +73 -0
  6. snowflake/ml/feature_store/feature_store.py +1657 -0
  7. snowflake/ml/feature_store/feature_view.py +459 -0
  8. snowflake/ml/model/_client/ops/model_ops.py +16 -38
  9. snowflake/ml/model/_client/sql/model.py +1 -7
  10. snowflake/ml/model/_client/sql/model_version.py +20 -15
  11. snowflake/ml/model/_deploy_client/image_builds/server_image_builder.py +9 -1
  12. snowflake/ml/model/_deploy_client/snowservice/deploy.py +2 -0
  13. snowflake/ml/model/_deploy_client/snowservice/deploy_options.py +12 -2
  14. snowflake/ml/model/_deploy_client/utils/snowservice_client.py +7 -3
  15. snowflake/ml/model/_model_composer/model_manifest/model_manifest.py +1 -6
  16. snowflake/ml/model/_model_composer/model_manifest/model_manifest_schema.py +0 -2
  17. snowflake/ml/model/_model_composer/model_runtime/_runtime_requirements.py +10 -1
  18. snowflake/ml/model/_model_composer/model_runtime/model_runtime.py +0 -2
  19. snowflake/ml/model/_packager/model_meta/_core_requirements.py +11 -1
  20. snowflake/ml/model/_packager/model_meta/_packaging_requirements.py +3 -0
  21. snowflake/ml/model/_packager/model_meta/model_meta.py +17 -3
  22. snowflake/ml/model/model_signature.py +72 -16
  23. snowflake/ml/model/type_hints.py +12 -0
  24. snowflake/ml/modeling/_internal/estimator_protocols.py +1 -41
  25. snowflake/ml/modeling/_internal/model_trainer_builder.py +13 -9
  26. snowflake/ml/modeling/_internal/{distributed_hpo_trainer.py → snowpark_implementations/distributed_hpo_trainer.py} +66 -96
  27. snowflake/ml/modeling/_internal/{snowpark_handlers.py → snowpark_implementations/snowpark_handlers.py} +9 -6
  28. snowflake/ml/modeling/_internal/{xgboost_external_memory_trainer.py → snowpark_implementations/xgboost_external_memory_trainer.py} +3 -1
  29. snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +19 -3
  30. snowflake/ml/modeling/cluster/affinity_propagation.py +19 -3
  31. snowflake/ml/modeling/cluster/agglomerative_clustering.py +19 -3
  32. snowflake/ml/modeling/cluster/birch.py +19 -3
  33. snowflake/ml/modeling/cluster/bisecting_k_means.py +19 -3
  34. snowflake/ml/modeling/cluster/dbscan.py +19 -3
  35. snowflake/ml/modeling/cluster/feature_agglomeration.py +19 -3
  36. snowflake/ml/modeling/cluster/k_means.py +19 -3
  37. snowflake/ml/modeling/cluster/mean_shift.py +19 -3
  38. snowflake/ml/modeling/cluster/mini_batch_k_means.py +19 -3
  39. snowflake/ml/modeling/cluster/optics.py +19 -3
  40. snowflake/ml/modeling/cluster/spectral_biclustering.py +19 -3
  41. snowflake/ml/modeling/cluster/spectral_clustering.py +19 -3
  42. snowflake/ml/modeling/cluster/spectral_coclustering.py +19 -3
  43. snowflake/ml/modeling/compose/column_transformer.py +19 -3
  44. snowflake/ml/modeling/compose/transformed_target_regressor.py +19 -3
  45. snowflake/ml/modeling/covariance/elliptic_envelope.py +19 -3
  46. snowflake/ml/modeling/covariance/empirical_covariance.py +19 -3
  47. snowflake/ml/modeling/covariance/graphical_lasso.py +19 -3
  48. snowflake/ml/modeling/covariance/graphical_lasso_cv.py +19 -3
  49. snowflake/ml/modeling/covariance/ledoit_wolf.py +19 -3
  50. snowflake/ml/modeling/covariance/min_cov_det.py +19 -3
  51. snowflake/ml/modeling/covariance/oas.py +19 -3
  52. snowflake/ml/modeling/covariance/shrunk_covariance.py +19 -3
  53. snowflake/ml/modeling/decomposition/dictionary_learning.py +19 -3
  54. snowflake/ml/modeling/decomposition/factor_analysis.py +19 -3
  55. snowflake/ml/modeling/decomposition/fast_ica.py +19 -3
  56. snowflake/ml/modeling/decomposition/incremental_pca.py +19 -3
  57. snowflake/ml/modeling/decomposition/kernel_pca.py +19 -3
  58. snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +19 -3
  59. snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +19 -3
  60. snowflake/ml/modeling/decomposition/pca.py +19 -3
  61. snowflake/ml/modeling/decomposition/sparse_pca.py +19 -3
  62. snowflake/ml/modeling/decomposition/truncated_svd.py +19 -3
  63. snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +19 -3
  64. snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +19 -3
  65. snowflake/ml/modeling/ensemble/ada_boost_classifier.py +19 -3
  66. snowflake/ml/modeling/ensemble/ada_boost_regressor.py +19 -3
  67. snowflake/ml/modeling/ensemble/bagging_classifier.py +19 -3
  68. snowflake/ml/modeling/ensemble/bagging_regressor.py +19 -3
  69. snowflake/ml/modeling/ensemble/extra_trees_classifier.py +19 -3
  70. snowflake/ml/modeling/ensemble/extra_trees_regressor.py +19 -3
  71. snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +19 -3
  72. snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +19 -3
  73. snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +19 -3
  74. snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +19 -3
  75. snowflake/ml/modeling/ensemble/isolation_forest.py +19 -3
  76. snowflake/ml/modeling/ensemble/random_forest_classifier.py +19 -3
  77. snowflake/ml/modeling/ensemble/random_forest_regressor.py +19 -3
  78. snowflake/ml/modeling/ensemble/stacking_regressor.py +19 -3
  79. snowflake/ml/modeling/ensemble/voting_classifier.py +19 -3
  80. snowflake/ml/modeling/ensemble/voting_regressor.py +19 -3
  81. snowflake/ml/modeling/feature_selection/generic_univariate_select.py +19 -3
  82. snowflake/ml/modeling/feature_selection/select_fdr.py +19 -3
  83. snowflake/ml/modeling/feature_selection/select_fpr.py +19 -3
  84. snowflake/ml/modeling/feature_selection/select_fwe.py +19 -3
  85. snowflake/ml/modeling/feature_selection/select_k_best.py +19 -3
  86. snowflake/ml/modeling/feature_selection/select_percentile.py +19 -3
  87. snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +19 -3
  88. snowflake/ml/modeling/feature_selection/variance_threshold.py +19 -3
  89. snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +19 -3
  90. snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +19 -3
  91. snowflake/ml/modeling/impute/iterative_imputer.py +19 -3
  92. snowflake/ml/modeling/impute/knn_imputer.py +19 -3
  93. snowflake/ml/modeling/impute/missing_indicator.py +19 -3
  94. snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +19 -3
  95. snowflake/ml/modeling/kernel_approximation/nystroem.py +19 -3
  96. snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +19 -3
  97. snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +19 -3
  98. snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +19 -3
  99. snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +19 -3
  100. snowflake/ml/modeling/lightgbm/lgbm_classifier.py +19 -3
  101. snowflake/ml/modeling/lightgbm/lgbm_regressor.py +19 -3
  102. snowflake/ml/modeling/linear_model/ard_regression.py +19 -3
  103. snowflake/ml/modeling/linear_model/bayesian_ridge.py +19 -3
  104. snowflake/ml/modeling/linear_model/elastic_net.py +19 -3
  105. snowflake/ml/modeling/linear_model/elastic_net_cv.py +19 -3
  106. snowflake/ml/modeling/linear_model/gamma_regressor.py +19 -3
  107. snowflake/ml/modeling/linear_model/huber_regressor.py +19 -3
  108. snowflake/ml/modeling/linear_model/lars.py +19 -3
  109. snowflake/ml/modeling/linear_model/lars_cv.py +19 -3
  110. snowflake/ml/modeling/linear_model/lasso.py +19 -3
  111. snowflake/ml/modeling/linear_model/lasso_cv.py +19 -3
  112. snowflake/ml/modeling/linear_model/lasso_lars.py +19 -3
  113. snowflake/ml/modeling/linear_model/lasso_lars_cv.py +19 -3
  114. snowflake/ml/modeling/linear_model/lasso_lars_ic.py +19 -3
  115. snowflake/ml/modeling/linear_model/linear_regression.py +19 -3
  116. snowflake/ml/modeling/linear_model/logistic_regression.py +19 -3
  117. snowflake/ml/modeling/linear_model/logistic_regression_cv.py +19 -3
  118. snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +19 -3
  119. snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +19 -3
  120. snowflake/ml/modeling/linear_model/multi_task_lasso.py +19 -3
  121. snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +19 -3
  122. snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +19 -3
  123. snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +19 -3
  124. snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +19 -3
  125. snowflake/ml/modeling/linear_model/perceptron.py +19 -3
  126. snowflake/ml/modeling/linear_model/poisson_regressor.py +19 -3
  127. snowflake/ml/modeling/linear_model/ransac_regressor.py +19 -3
  128. snowflake/ml/modeling/linear_model/ridge.py +19 -3
  129. snowflake/ml/modeling/linear_model/ridge_classifier.py +19 -3
  130. snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +19 -3
  131. snowflake/ml/modeling/linear_model/ridge_cv.py +19 -3
  132. snowflake/ml/modeling/linear_model/sgd_classifier.py +19 -3
  133. snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +19 -3
  134. snowflake/ml/modeling/linear_model/sgd_regressor.py +19 -3
  135. snowflake/ml/modeling/linear_model/theil_sen_regressor.py +19 -3
  136. snowflake/ml/modeling/linear_model/tweedie_regressor.py +19 -3
  137. snowflake/ml/modeling/manifold/isomap.py +19 -3
  138. snowflake/ml/modeling/manifold/mds.py +19 -3
  139. snowflake/ml/modeling/manifold/spectral_embedding.py +19 -3
  140. snowflake/ml/modeling/manifold/tsne.py +19 -3
  141. snowflake/ml/modeling/metrics/classification.py +5 -6
  142. snowflake/ml/modeling/metrics/metrics_utils.py +5 -3
  143. snowflake/ml/modeling/metrics/ranking.py +7 -3
  144. snowflake/ml/modeling/metrics/regression.py +6 -3
  145. snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +19 -3
  146. snowflake/ml/modeling/mixture/gaussian_mixture.py +19 -3
  147. snowflake/ml/modeling/model_selection/grid_search_cv.py +3 -13
  148. snowflake/ml/modeling/model_selection/randomized_search_cv.py +3 -13
  149. snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +19 -3
  150. snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +19 -3
  151. snowflake/ml/modeling/multiclass/output_code_classifier.py +19 -3
  152. snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +19 -3
  153. snowflake/ml/modeling/naive_bayes/categorical_nb.py +19 -3
  154. snowflake/ml/modeling/naive_bayes/complement_nb.py +19 -3
  155. snowflake/ml/modeling/naive_bayes/gaussian_nb.py +19 -3
  156. snowflake/ml/modeling/naive_bayes/multinomial_nb.py +19 -3
  157. snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +19 -3
  158. snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +19 -3
  159. snowflake/ml/modeling/neighbors/kernel_density.py +19 -3
  160. snowflake/ml/modeling/neighbors/local_outlier_factor.py +19 -3
  161. snowflake/ml/modeling/neighbors/nearest_centroid.py +19 -3
  162. snowflake/ml/modeling/neighbors/nearest_neighbors.py +19 -3
  163. snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +19 -3
  164. snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +19 -3
  165. snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +19 -3
  166. snowflake/ml/modeling/neural_network/bernoulli_rbm.py +19 -3
  167. snowflake/ml/modeling/neural_network/mlp_classifier.py +19 -3
  168. snowflake/ml/modeling/neural_network/mlp_regressor.py +19 -3
  169. snowflake/ml/modeling/preprocessing/polynomial_features.py +19 -3
  170. snowflake/ml/modeling/semi_supervised/label_propagation.py +19 -3
  171. snowflake/ml/modeling/semi_supervised/label_spreading.py +19 -3
  172. snowflake/ml/modeling/svm/linear_svc.py +19 -3
  173. snowflake/ml/modeling/svm/linear_svr.py +19 -3
  174. snowflake/ml/modeling/svm/nu_svc.py +19 -3
  175. snowflake/ml/modeling/svm/nu_svr.py +19 -3
  176. snowflake/ml/modeling/svm/svc.py +19 -3
  177. snowflake/ml/modeling/svm/svr.py +19 -3
  178. snowflake/ml/modeling/tree/decision_tree_classifier.py +19 -3
  179. snowflake/ml/modeling/tree/decision_tree_regressor.py +19 -3
  180. snowflake/ml/modeling/tree/extra_tree_classifier.py +19 -3
  181. snowflake/ml/modeling/tree/extra_tree_regressor.py +19 -3
  182. snowflake/ml/modeling/xgboost/xgb_classifier.py +19 -3
  183. snowflake/ml/modeling/xgboost/xgb_regressor.py +19 -3
  184. snowflake/ml/modeling/xgboost/xgbrf_classifier.py +19 -3
  185. snowflake/ml/modeling/xgboost/xgbrf_regressor.py +19 -3
  186. snowflake/ml/registry/registry.py +2 -0
  187. snowflake/ml/version.py +1 -1
  188. snowflake_ml_python-1.2.2.dist-info/LICENSE.txt +202 -0
  189. {snowflake_ml_python-1.2.0.dist-info → snowflake_ml_python-1.2.2.dist-info}/METADATA +276 -50
  190. {snowflake_ml_python-1.2.0.dist-info → snowflake_ml_python-1.2.2.dist-info}/RECORD +204 -197
  191. {snowflake_ml_python-1.2.0.dist-info → snowflake_ml_python-1.2.2.dist-info}/WHEEL +2 -1
  192. snowflake_ml_python-1.2.2.dist-info/top_level.txt +1 -0
  193. /snowflake/ml/modeling/_internal/{pandas_trainer.py → local_implementations/pandas_trainer.py} +0 -0
  194. /snowflake/ml/modeling/_internal/{snowpark_trainer.py → snowpark_implementations/snowpark_trainer.py} +0 -0
snowflake/ml/modeling/linear_model/ridge_cv.py
@@ -26,7 +26,7 @@ from snowflake.ml._internal.env_utils import SNOWML_SPROC_ENV
 from snowflake.ml._internal.utils import pkg_version_utils, identifier
 from snowflake.snowpark import DataFrame, Session
 from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
-from snowflake.ml.modeling._internal.snowpark_handlers import SnowparkHandlers as HandlersImpl
+from snowflake.ml.modeling._internal.snowpark_implementations.snowpark_handlers import SnowparkHandlers as HandlersImpl
 from snowflake.ml.modeling._internal.model_trainer_builder import ModelTrainerBuilder
 from snowflake.ml.modeling._internal.model_trainer import ModelTrainer
 from snowflake.ml.modeling._internal.estimator_utils import (
@@ -35,7 +35,7 @@ from snowflake.ml.modeling._internal.estimator_utils import (
     transform_snowml_obj_to_sklearn_obj,
     validate_sklearn_args,
 )
-from snowflake.ml.modeling._internal.estimator_protocols import FitPredictHandlers
+from snowflake.ml.modeling._internal.estimator_protocols import TransformerHandlers
 
 from snowflake.ml.model.model_signature import (
     DataType,
@@ -231,7 +231,7 @@ class RidgeCV(BaseTransformer):
         self._model_signature_dict: Optional[Dict[str, ModelSignature]] = None
         # If user used snowpark dataframe during fit, here it stores the snowpark input_cols, otherwise the processed input_cols
         self._snowpark_cols: Optional[List[str]] = self.input_cols
-        self._handlers: FitPredictHandlers = HandlersImpl(class_name=RidgeCV.__class__.__name__, subproject=_SUBPROJECT, autogenerated=True)
+        self._handlers: TransformerHandlers = HandlersImpl(class_name=RidgeCV.__class__.__name__, subproject=_SUBPROJECT, autogenerated=True)
         self._autogenerated = True
 
     def _get_rand_id(self) -> str:
@@ -591,6 +591,22 @@ class RidgeCV(BaseTransformer):
             # each row containing a list of values.
             expected_dtype = "ARRAY"
 
+        # If we were unable to assign a type to this transform in the factory, infer the type here.
+        if expected_dtype == "":
+            # If this is a clustering transformer, if the number of output columns does not equal the number of clusters the response will be an "ARRAY"
+            if hasattr(self._sklearn_object, "n_clusters") and getattr(self._sklearn_object, "n_clusters") != len(self.output_cols):
+                expected_dtype = "ARRAY"
+            # If this is a decomposition transformer, if the number of output columns does not equal the number of components the response will be an "ARRAY"
+            elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
+                expected_dtype = "ARRAY"
+            else:
+                output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+                # We can only infer the output types from the input types if the following two statemetns are true:
+                # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
+                # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
+                if all(x == output_types[0] for x in output_types) and len(output_types) == len(self.output_cols):
+                    expected_dtype = convert_sp_to_sf_type(output_types[0])
+
         output_df = self._batch_inference(
             dataset=dataset,
             inference_method="transform",
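Note: the expected_dtype fallback added in the last hunk above is repeated verbatim in every autogenerated modeling class in this release (the remaining modeling files below carry the identical change). As a rough, self-contained sketch of that logic under simplified assumptions: infer_expected_dtype and the plain string type names are illustrative stand-ins, not the library's API, and the real code goes through _infer_signature and convert_sp_to_sf_type.

# Hypothetical, simplified sketch of the fallback dtype inference shown above.
def infer_expected_dtype(estimator, input_types, output_cols):
    # Clustering transformers: when the number of output columns differs from
    # n_clusters, each output row holds a list of values, so expect ARRAY.
    if hasattr(estimator, "n_clusters") and estimator.n_clusters != len(output_cols):
        return "ARRAY"
    # Decomposition transformers: same reasoning with n_components.
    if hasattr(estimator, "n_components") and estimator.n_components != len(output_cols):
        return "ARRAY"
    # Otherwise, reuse the single input type only if all inputs share one type
    # and the input/output column counts match; else leave the type undecided.
    if len(set(input_types)) == 1 and len(input_types) == len(output_cols):
        return input_types[0]
    return ""  # caller keeps falling back to a variant-style column

class FakeKMeans:
    n_clusters = 8

print(infer_expected_dtype(FakeKMeans(), ["FloatType"], ["OUTPUT_0"]))            # ARRAY
print(infer_expected_dtype(object(), ["FloatType", "FloatType"], ["O1", "O2"]))   # FloatType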
snowflake/ml/modeling/linear_model/sgd_classifier.py
@@ -26,7 +26,7 @@ from snowflake.ml._internal.env_utils import SNOWML_SPROC_ENV
 from snowflake.ml._internal.utils import pkg_version_utils, identifier
 from snowflake.snowpark import DataFrame, Session
 from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
-from snowflake.ml.modeling._internal.snowpark_handlers import SnowparkHandlers as HandlersImpl
+from snowflake.ml.modeling._internal.snowpark_implementations.snowpark_handlers import SnowparkHandlers as HandlersImpl
 from snowflake.ml.modeling._internal.model_trainer_builder import ModelTrainerBuilder
 from snowflake.ml.modeling._internal.model_trainer import ModelTrainer
 from snowflake.ml.modeling._internal.estimator_utils import (
@@ -35,7 +35,7 @@ from snowflake.ml.modeling._internal.estimator_utils import (
     transform_snowml_obj_to_sklearn_obj,
     validate_sklearn_args,
 )
-from snowflake.ml.modeling._internal.estimator_protocols import FitPredictHandlers
+from snowflake.ml.modeling._internal.estimator_protocols import TransformerHandlers
 
 from snowflake.ml.model.model_signature import (
     DataType,
@@ -350,7 +350,7 @@ class SGDClassifier(BaseTransformer):
         self._model_signature_dict: Optional[Dict[str, ModelSignature]] = None
         # If user used snowpark dataframe during fit, here it stores the snowpark input_cols, otherwise the processed input_cols
         self._snowpark_cols: Optional[List[str]] = self.input_cols
-        self._handlers: FitPredictHandlers = HandlersImpl(class_name=SGDClassifier.__class__.__name__, subproject=_SUBPROJECT, autogenerated=True)
+        self._handlers: TransformerHandlers = HandlersImpl(class_name=SGDClassifier.__class__.__name__, subproject=_SUBPROJECT, autogenerated=True)
         self._autogenerated = True
 
     def _get_rand_id(self) -> str:
@@ -710,6 +710,22 @@ class SGDClassifier(BaseTransformer):
             # each row containing a list of values.
             expected_dtype = "ARRAY"
 
+        # If we were unable to assign a type to this transform in the factory, infer the type here.
+        if expected_dtype == "":
+            # If this is a clustering transformer, if the number of output columns does not equal the number of clusters the response will be an "ARRAY"
+            if hasattr(self._sklearn_object, "n_clusters") and getattr(self._sklearn_object, "n_clusters") != len(self.output_cols):
+                expected_dtype = "ARRAY"
+            # If this is a decomposition transformer, if the number of output columns does not equal the number of components the response will be an "ARRAY"
+            elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
+                expected_dtype = "ARRAY"
+            else:
+                output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+                # We can only infer the output types from the input types if the following two statemetns are true:
+                # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
+                # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
+                if all(x == output_types[0] for x in output_types) and len(output_types) == len(self.output_cols):
+                    expected_dtype = convert_sp_to_sf_type(output_types[0])
+
         output_df = self._batch_inference(
             dataset=dataset,
             inference_method="transform",
snowflake/ml/modeling/linear_model/sgd_one_class_svm.py
@@ -26,7 +26,7 @@ from snowflake.ml._internal.env_utils import SNOWML_SPROC_ENV
 from snowflake.ml._internal.utils import pkg_version_utils, identifier
 from snowflake.snowpark import DataFrame, Session
 from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
-from snowflake.ml.modeling._internal.snowpark_handlers import SnowparkHandlers as HandlersImpl
+from snowflake.ml.modeling._internal.snowpark_implementations.snowpark_handlers import SnowparkHandlers as HandlersImpl
 from snowflake.ml.modeling._internal.model_trainer_builder import ModelTrainerBuilder
 from snowflake.ml.modeling._internal.model_trainer import ModelTrainer
 from snowflake.ml.modeling._internal.estimator_utils import (
@@ -35,7 +35,7 @@ from snowflake.ml.modeling._internal.estimator_utils import (
     transform_snowml_obj_to_sklearn_obj,
     validate_sklearn_args,
 )
-from snowflake.ml.modeling._internal.estimator_protocols import FitPredictHandlers
+from snowflake.ml.modeling._internal.estimator_protocols import TransformerHandlers
 
 from snowflake.ml.model.model_signature import (
     DataType,
@@ -248,7 +248,7 @@ class SGDOneClassSVM(BaseTransformer):
         self._model_signature_dict: Optional[Dict[str, ModelSignature]] = None
         # If user used snowpark dataframe during fit, here it stores the snowpark input_cols, otherwise the processed input_cols
         self._snowpark_cols: Optional[List[str]] = self.input_cols
-        self._handlers: FitPredictHandlers = HandlersImpl(class_name=SGDOneClassSVM.__class__.__name__, subproject=_SUBPROJECT, autogenerated=True)
+        self._handlers: TransformerHandlers = HandlersImpl(class_name=SGDOneClassSVM.__class__.__name__, subproject=_SUBPROJECT, autogenerated=True)
         self._autogenerated = True
 
     def _get_rand_id(self) -> str:
@@ -608,6 +608,22 @@ class SGDOneClassSVM(BaseTransformer):
             # each row containing a list of values.
             expected_dtype = "ARRAY"
 
+        # If we were unable to assign a type to this transform in the factory, infer the type here.
+        if expected_dtype == "":
+            # If this is a clustering transformer, if the number of output columns does not equal the number of clusters the response will be an "ARRAY"
+            if hasattr(self._sklearn_object, "n_clusters") and getattr(self._sklearn_object, "n_clusters") != len(self.output_cols):
+                expected_dtype = "ARRAY"
+            # If this is a decomposition transformer, if the number of output columns does not equal the number of components the response will be an "ARRAY"
+            elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
+                expected_dtype = "ARRAY"
+            else:
+                output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+                # We can only infer the output types from the input types if the following two statemetns are true:
+                # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
+                # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
+                if all(x == output_types[0] for x in output_types) and len(output_types) == len(self.output_cols):
+                    expected_dtype = convert_sp_to_sf_type(output_types[0])
+
         output_df = self._batch_inference(
             dataset=dataset,
             inference_method="transform",
snowflake/ml/modeling/linear_model/sgd_regressor.py
@@ -26,7 +26,7 @@ from snowflake.ml._internal.env_utils import SNOWML_SPROC_ENV
 from snowflake.ml._internal.utils import pkg_version_utils, identifier
 from snowflake.snowpark import DataFrame, Session
 from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
-from snowflake.ml.modeling._internal.snowpark_handlers import SnowparkHandlers as HandlersImpl
+from snowflake.ml.modeling._internal.snowpark_implementations.snowpark_handlers import SnowparkHandlers as HandlersImpl
 from snowflake.ml.modeling._internal.model_trainer_builder import ModelTrainerBuilder
 from snowflake.ml.modeling._internal.model_trainer import ModelTrainer
 from snowflake.ml.modeling._internal.estimator_utils import (
@@ -35,7 +35,7 @@ from snowflake.ml.modeling._internal.estimator_utils import (
     transform_snowml_obj_to_sklearn_obj,
     validate_sklearn_args,
 )
-from snowflake.ml.modeling._internal.estimator_protocols import FitPredictHandlers
+from snowflake.ml.modeling._internal.estimator_protocols import TransformerHandlers
 
 from snowflake.ml.model.model_signature import (
     DataType,
@@ -316,7 +316,7 @@ class SGDRegressor(BaseTransformer):
         self._model_signature_dict: Optional[Dict[str, ModelSignature]] = None
         # If user used snowpark dataframe during fit, here it stores the snowpark input_cols, otherwise the processed input_cols
         self._snowpark_cols: Optional[List[str]] = self.input_cols
-        self._handlers: FitPredictHandlers = HandlersImpl(class_name=SGDRegressor.__class__.__name__, subproject=_SUBPROJECT, autogenerated=True)
+        self._handlers: TransformerHandlers = HandlersImpl(class_name=SGDRegressor.__class__.__name__, subproject=_SUBPROJECT, autogenerated=True)
         self._autogenerated = True
 
     def _get_rand_id(self) -> str:
@@ -676,6 +676,22 @@ class SGDRegressor(BaseTransformer):
             # each row containing a list of values.
             expected_dtype = "ARRAY"
 
+        # If we were unable to assign a type to this transform in the factory, infer the type here.
+        if expected_dtype == "":
+            # If this is a clustering transformer, if the number of output columns does not equal the number of clusters the response will be an "ARRAY"
+            if hasattr(self._sklearn_object, "n_clusters") and getattr(self._sklearn_object, "n_clusters") != len(self.output_cols):
+                expected_dtype = "ARRAY"
+            # If this is a decomposition transformer, if the number of output columns does not equal the number of components the response will be an "ARRAY"
+            elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
+                expected_dtype = "ARRAY"
+            else:
+                output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+                # We can only infer the output types from the input types if the following two statemetns are true:
+                # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
+                # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
+                if all(x == output_types[0] for x in output_types) and len(output_types) == len(self.output_cols):
+                    expected_dtype = convert_sp_to_sf_type(output_types[0])
+
         output_df = self._batch_inference(
             dataset=dataset,
             inference_method="transform",
snowflake/ml/modeling/linear_model/theil_sen_regressor.py
@@ -26,7 +26,7 @@ from snowflake.ml._internal.env_utils import SNOWML_SPROC_ENV
 from snowflake.ml._internal.utils import pkg_version_utils, identifier
 from snowflake.snowpark import DataFrame, Session
 from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
-from snowflake.ml.modeling._internal.snowpark_handlers import SnowparkHandlers as HandlersImpl
+from snowflake.ml.modeling._internal.snowpark_implementations.snowpark_handlers import SnowparkHandlers as HandlersImpl
 from snowflake.ml.modeling._internal.model_trainer_builder import ModelTrainerBuilder
 from snowflake.ml.modeling._internal.model_trainer import ModelTrainer
 from snowflake.ml.modeling._internal.estimator_utils import (
@@ -35,7 +35,7 @@ from snowflake.ml.modeling._internal.estimator_utils import (
     transform_snowml_obj_to_sklearn_obj,
     validate_sklearn_args,
 )
-from snowflake.ml.modeling._internal.estimator_protocols import FitPredictHandlers
+from snowflake.ml.modeling._internal.estimator_protocols import TransformerHandlers
 
 from snowflake.ml.model.model_signature import (
     DataType,
@@ -218,7 +218,7 @@ class TheilSenRegressor(BaseTransformer):
         self._model_signature_dict: Optional[Dict[str, ModelSignature]] = None
         # If user used snowpark dataframe during fit, here it stores the snowpark input_cols, otherwise the processed input_cols
         self._snowpark_cols: Optional[List[str]] = self.input_cols
-        self._handlers: FitPredictHandlers = HandlersImpl(class_name=TheilSenRegressor.__class__.__name__, subproject=_SUBPROJECT, autogenerated=True)
+        self._handlers: TransformerHandlers = HandlersImpl(class_name=TheilSenRegressor.__class__.__name__, subproject=_SUBPROJECT, autogenerated=True)
         self._autogenerated = True
 
     def _get_rand_id(self) -> str:
@@ -578,6 +578,22 @@ class TheilSenRegressor(BaseTransformer):
             # each row containing a list of values.
             expected_dtype = "ARRAY"
 
+        # If we were unable to assign a type to this transform in the factory, infer the type here.
+        if expected_dtype == "":
+            # If this is a clustering transformer, if the number of output columns does not equal the number of clusters the response will be an "ARRAY"
+            if hasattr(self._sklearn_object, "n_clusters") and getattr(self._sklearn_object, "n_clusters") != len(self.output_cols):
+                expected_dtype = "ARRAY"
+            # If this is a decomposition transformer, if the number of output columns does not equal the number of components the response will be an "ARRAY"
+            elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
+                expected_dtype = "ARRAY"
+            else:
+                output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+                # We can only infer the output types from the input types if the following two statemetns are true:
+                # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
+                # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
+                if all(x == output_types[0] for x in output_types) and len(output_types) == len(self.output_cols):
+                    expected_dtype = convert_sp_to_sf_type(output_types[0])
+
         output_df = self._batch_inference(
             dataset=dataset,
             inference_method="transform",
snowflake/ml/modeling/linear_model/tweedie_regressor.py
@@ -26,7 +26,7 @@ from snowflake.ml._internal.env_utils import SNOWML_SPROC_ENV
 from snowflake.ml._internal.utils import pkg_version_utils, identifier
 from snowflake.snowpark import DataFrame, Session
 from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
-from snowflake.ml.modeling._internal.snowpark_handlers import SnowparkHandlers as HandlersImpl
+from snowflake.ml.modeling._internal.snowpark_implementations.snowpark_handlers import SnowparkHandlers as HandlersImpl
 from snowflake.ml.modeling._internal.model_trainer_builder import ModelTrainerBuilder
 from snowflake.ml.modeling._internal.model_trainer import ModelTrainer
 from snowflake.ml.modeling._internal.estimator_utils import (
@@ -35,7 +35,7 @@ from snowflake.ml.modeling._internal.estimator_utils import (
     transform_snowml_obj_to_sklearn_obj,
     validate_sklearn_args,
 )
-from snowflake.ml.modeling._internal.estimator_protocols import FitPredictHandlers
+from snowflake.ml.modeling._internal.estimator_protocols import TransformerHandlers
 
 from snowflake.ml.model.model_signature import (
     DataType,
@@ -244,7 +244,7 @@ class TweedieRegressor(BaseTransformer):
         self._model_signature_dict: Optional[Dict[str, ModelSignature]] = None
         # If user used snowpark dataframe during fit, here it stores the snowpark input_cols, otherwise the processed input_cols
         self._snowpark_cols: Optional[List[str]] = self.input_cols
-        self._handlers: FitPredictHandlers = HandlersImpl(class_name=TweedieRegressor.__class__.__name__, subproject=_SUBPROJECT, autogenerated=True)
+        self._handlers: TransformerHandlers = HandlersImpl(class_name=TweedieRegressor.__class__.__name__, subproject=_SUBPROJECT, autogenerated=True)
         self._autogenerated = True
 
     def _get_rand_id(self) -> str:
@@ -604,6 +604,22 @@ class TweedieRegressor(BaseTransformer):
             # each row containing a list of values.
             expected_dtype = "ARRAY"
 
+        # If we were unable to assign a type to this transform in the factory, infer the type here.
+        if expected_dtype == "":
+            # If this is a clustering transformer, if the number of output columns does not equal the number of clusters the response will be an "ARRAY"
+            if hasattr(self._sklearn_object, "n_clusters") and getattr(self._sklearn_object, "n_clusters") != len(self.output_cols):
+                expected_dtype = "ARRAY"
+            # If this is a decomposition transformer, if the number of output columns does not equal the number of components the response will be an "ARRAY"
+            elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
+                expected_dtype = "ARRAY"
+            else:
+                output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+                # We can only infer the output types from the input types if the following two statemetns are true:
+                # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
+                # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
+                if all(x == output_types[0] for x in output_types) and len(output_types) == len(self.output_cols):
+                    expected_dtype = convert_sp_to_sf_type(output_types[0])
+
         output_df = self._batch_inference(
             dataset=dataset,
             inference_method="transform",
snowflake/ml/modeling/manifold/isomap.py
@@ -26,7 +26,7 @@ from snowflake.ml._internal.env_utils import SNOWML_SPROC_ENV
 from snowflake.ml._internal.utils import pkg_version_utils, identifier
 from snowflake.snowpark import DataFrame, Session
 from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
-from snowflake.ml.modeling._internal.snowpark_handlers import SnowparkHandlers as HandlersImpl
+from snowflake.ml.modeling._internal.snowpark_implementations.snowpark_handlers import SnowparkHandlers as HandlersImpl
 from snowflake.ml.modeling._internal.model_trainer_builder import ModelTrainerBuilder
 from snowflake.ml.modeling._internal.model_trainer import ModelTrainer
 from snowflake.ml.modeling._internal.estimator_utils import (
@@ -35,7 +35,7 @@ from snowflake.ml.modeling._internal.estimator_utils import (
     transform_snowml_obj_to_sklearn_obj,
     validate_sklearn_args,
 )
-from snowflake.ml.modeling._internal.estimator_protocols import FitPredictHandlers
+from snowflake.ml.modeling._internal.estimator_protocols import TransformerHandlers
 
 from snowflake.ml.model.model_signature import (
     DataType,
@@ -240,7 +240,7 @@ class Isomap(BaseTransformer):
         self._model_signature_dict: Optional[Dict[str, ModelSignature]] = None
         # If user used snowpark dataframe during fit, here it stores the snowpark input_cols, otherwise the processed input_cols
         self._snowpark_cols: Optional[List[str]] = self.input_cols
-        self._handlers: FitPredictHandlers = HandlersImpl(class_name=Isomap.__class__.__name__, subproject=_SUBPROJECT, autogenerated=True)
+        self._handlers: TransformerHandlers = HandlersImpl(class_name=Isomap.__class__.__name__, subproject=_SUBPROJECT, autogenerated=True)
         self._autogenerated = True
 
     def _get_rand_id(self) -> str:
@@ -600,6 +600,22 @@ class Isomap(BaseTransformer):
             # each row containing a list of values.
             expected_dtype = "ARRAY"
 
+        # If we were unable to assign a type to this transform in the factory, infer the type here.
+        if expected_dtype == "":
+            # If this is a clustering transformer, if the number of output columns does not equal the number of clusters the response will be an "ARRAY"
+            if hasattr(self._sklearn_object, "n_clusters") and getattr(self._sklearn_object, "n_clusters") != len(self.output_cols):
+                expected_dtype = "ARRAY"
+            # If this is a decomposition transformer, if the number of output columns does not equal the number of components the response will be an "ARRAY"
+            elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
+                expected_dtype = "ARRAY"
+            else:
+                output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+                # We can only infer the output types from the input types if the following two statemetns are true:
+                # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
+                # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
+                if all(x == output_types[0] for x in output_types) and len(output_types) == len(self.output_cols):
+                    expected_dtype = convert_sp_to_sf_type(output_types[0])
+
         output_df = self._batch_inference(
             dataset=dataset,
             inference_method="transform",
snowflake/ml/modeling/manifold/mds.py
@@ -26,7 +26,7 @@ from snowflake.ml._internal.env_utils import SNOWML_SPROC_ENV
 from snowflake.ml._internal.utils import pkg_version_utils, identifier
 from snowflake.snowpark import DataFrame, Session
 from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
-from snowflake.ml.modeling._internal.snowpark_handlers import SnowparkHandlers as HandlersImpl
+from snowflake.ml.modeling._internal.snowpark_implementations.snowpark_handlers import SnowparkHandlers as HandlersImpl
 from snowflake.ml.modeling._internal.model_trainer_builder import ModelTrainerBuilder
 from snowflake.ml.modeling._internal.model_trainer import ModelTrainer
 from snowflake.ml.modeling._internal.estimator_utils import (
@@ -35,7 +35,7 @@ from snowflake.ml.modeling._internal.estimator_utils import (
     transform_snowml_obj_to_sklearn_obj,
     validate_sklearn_args,
 )
-from snowflake.ml.modeling._internal.estimator_protocols import FitPredictHandlers
+from snowflake.ml.modeling._internal.estimator_protocols import TransformerHandlers
 
 from snowflake.ml.model.model_signature import (
     DataType,
@@ -223,7 +223,7 @@ class MDS(BaseTransformer):
         self._model_signature_dict: Optional[Dict[str, ModelSignature]] = None
         # If user used snowpark dataframe during fit, here it stores the snowpark input_cols, otherwise the processed input_cols
         self._snowpark_cols: Optional[List[str]] = self.input_cols
-        self._handlers: FitPredictHandlers = HandlersImpl(class_name=MDS.__class__.__name__, subproject=_SUBPROJECT, autogenerated=True)
+        self._handlers: TransformerHandlers = HandlersImpl(class_name=MDS.__class__.__name__, subproject=_SUBPROJECT, autogenerated=True)
         self._autogenerated = True
 
     def _get_rand_id(self) -> str:
@@ -581,6 +581,22 @@ class MDS(BaseTransformer):
             # each row containing a list of values.
             expected_dtype = "ARRAY"
 
+        # If we were unable to assign a type to this transform in the factory, infer the type here.
+        if expected_dtype == "":
+            # If this is a clustering transformer, if the number of output columns does not equal the number of clusters the response will be an "ARRAY"
+            if hasattr(self._sklearn_object, "n_clusters") and getattr(self._sklearn_object, "n_clusters") != len(self.output_cols):
+                expected_dtype = "ARRAY"
+            # If this is a decomposition transformer, if the number of output columns does not equal the number of components the response will be an "ARRAY"
+            elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
+                expected_dtype = "ARRAY"
+            else:
+                output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+                # We can only infer the output types from the input types if the following two statemetns are true:
+                # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
+                # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
+                if all(x == output_types[0] for x in output_types) and len(output_types) == len(self.output_cols):
+                    expected_dtype = convert_sp_to_sf_type(output_types[0])
+
         output_df = self._batch_inference(
             dataset=dataset,
             inference_method="transform",
snowflake/ml/modeling/manifold/spectral_embedding.py
@@ -26,7 +26,7 @@ from snowflake.ml._internal.env_utils import SNOWML_SPROC_ENV
 from snowflake.ml._internal.utils import pkg_version_utils, identifier
 from snowflake.snowpark import DataFrame, Session
 from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
-from snowflake.ml.modeling._internal.snowpark_handlers import SnowparkHandlers as HandlersImpl
+from snowflake.ml.modeling._internal.snowpark_implementations.snowpark_handlers import SnowparkHandlers as HandlersImpl
 from snowflake.ml.modeling._internal.model_trainer_builder import ModelTrainerBuilder
 from snowflake.ml.modeling._internal.model_trainer import ModelTrainer
 from snowflake.ml.modeling._internal.estimator_utils import (
@@ -35,7 +35,7 @@ from snowflake.ml.modeling._internal.estimator_utils import (
     transform_snowml_obj_to_sklearn_obj,
     validate_sklearn_args,
 )
-from snowflake.ml.modeling._internal.estimator_protocols import FitPredictHandlers
+from snowflake.ml.modeling._internal.estimator_protocols import TransformerHandlers
 
 from snowflake.ml.model.model_signature import (
     DataType,
@@ -225,7 +225,7 @@ class SpectralEmbedding(BaseTransformer):
         self._model_signature_dict: Optional[Dict[str, ModelSignature]] = None
         # If user used snowpark dataframe during fit, here it stores the snowpark input_cols, otherwise the processed input_cols
         self._snowpark_cols: Optional[List[str]] = self.input_cols
-        self._handlers: FitPredictHandlers = HandlersImpl(class_name=SpectralEmbedding.__class__.__name__, subproject=_SUBPROJECT, autogenerated=True)
+        self._handlers: TransformerHandlers = HandlersImpl(class_name=SpectralEmbedding.__class__.__name__, subproject=_SUBPROJECT, autogenerated=True)
         self._autogenerated = True
 
     def _get_rand_id(self) -> str:
@@ -583,6 +583,22 @@ class SpectralEmbedding(BaseTransformer):
             # each row containing a list of values.
             expected_dtype = "ARRAY"
 
+        # If we were unable to assign a type to this transform in the factory, infer the type here.
+        if expected_dtype == "":
+            # If this is a clustering transformer, if the number of output columns does not equal the number of clusters the response will be an "ARRAY"
+            if hasattr(self._sklearn_object, "n_clusters") and getattr(self._sklearn_object, "n_clusters") != len(self.output_cols):
+                expected_dtype = "ARRAY"
+            # If this is a decomposition transformer, if the number of output columns does not equal the number of components the response will be an "ARRAY"
+            elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
+                expected_dtype = "ARRAY"
+            else:
+                output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+                # We can only infer the output types from the input types if the following two statemetns are true:
+                # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
+                # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
+                if all(x == output_types[0] for x in output_types) and len(output_types) == len(self.output_cols):
+                    expected_dtype = convert_sp_to_sf_type(output_types[0])
+
         output_df = self._batch_inference(
             dataset=dataset,
             inference_method="transform",
snowflake/ml/modeling/manifold/tsne.py
@@ -26,7 +26,7 @@ from snowflake.ml._internal.env_utils import SNOWML_SPROC_ENV
 from snowflake.ml._internal.utils import pkg_version_utils, identifier
 from snowflake.snowpark import DataFrame, Session
 from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
-from snowflake.ml.modeling._internal.snowpark_handlers import SnowparkHandlers as HandlersImpl
+from snowflake.ml.modeling._internal.snowpark_implementations.snowpark_handlers import SnowparkHandlers as HandlersImpl
 from snowflake.ml.modeling._internal.model_trainer_builder import ModelTrainerBuilder
 from snowflake.ml.modeling._internal.model_trainer import ModelTrainer
 from snowflake.ml.modeling._internal.estimator_utils import (
@@ -35,7 +35,7 @@ from snowflake.ml.modeling._internal.estimator_utils import (
     transform_snowml_obj_to_sklearn_obj,
     validate_sklearn_args,
 )
-from snowflake.ml.modeling._internal.estimator_protocols import FitPredictHandlers
+from snowflake.ml.modeling._internal.estimator_protocols import TransformerHandlers
 
 from snowflake.ml.model.model_signature import (
     DataType,
@@ -284,7 +284,7 @@ class TSNE(BaseTransformer):
         self._model_signature_dict: Optional[Dict[str, ModelSignature]] = None
         # If user used snowpark dataframe during fit, here it stores the snowpark input_cols, otherwise the processed input_cols
         self._snowpark_cols: Optional[List[str]] = self.input_cols
-        self._handlers: FitPredictHandlers = HandlersImpl(class_name=TSNE.__class__.__name__, subproject=_SUBPROJECT, autogenerated=True)
+        self._handlers: TransformerHandlers = HandlersImpl(class_name=TSNE.__class__.__name__, subproject=_SUBPROJECT, autogenerated=True)
         self._autogenerated = True
 
     def _get_rand_id(self) -> str:
@@ -642,6 +642,22 @@ class TSNE(BaseTransformer):
             # each row containing a list of values.
             expected_dtype = "ARRAY"
 
+        # If we were unable to assign a type to this transform in the factory, infer the type here.
+        if expected_dtype == "":
+            # If this is a clustering transformer, if the number of output columns does not equal the number of clusters the response will be an "ARRAY"
+            if hasattr(self._sklearn_object, "n_clusters") and getattr(self._sklearn_object, "n_clusters") != len(self.output_cols):
+                expected_dtype = "ARRAY"
+            # If this is a decomposition transformer, if the number of output columns does not equal the number of components the response will be an "ARRAY"
+            elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
+                expected_dtype = "ARRAY"
+            else:
+                output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+                # We can only infer the output types from the input types if the following two statemetns are true:
+                # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
+                # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
+                if all(x == output_types[0] for x in output_types) and len(output_types) == len(self.output_cols):
+                    expected_dtype = convert_sp_to_sf_type(output_types[0])
+
         output_df = self._batch_inference(
             dataset=dataset,
             inference_method="transform",
snowflake/ml/modeling/metrics/classification.py
@@ -228,16 +228,15 @@ def _register_confusion_matrix_computer(*, session: snowpark.Session, statement_
     Returns:
         Name of the UDTF.
     """
+    batch_size = metrics_utils.BATCH_SIZE
 
     class ConfusionMatrixComputer:
-        BATCH_SIZE = 1000
-
         def __init__(self) -> None:
             self._initialized = False
             self._confusion_matrix = np.zeros((1, 1))
-            # 2d array containing a batch of input rows. A batch contains self.BATCH_SIZE rows.
+            # 2d array containing a batch of input rows. A batch contains metrics_utils.BATCH_SIZE rows.
             # [sample_weight, y_true, y_pred]
-            self._batched_rows = np.zeros((self.BATCH_SIZE, 1))
+            self._batched_rows = np.zeros((batch_size, 1))
             # Number of columns in the dataset.
             self._n_cols = -1
             # Running count of number of rows added to self._batched_rows.
@@ -255,7 +254,7 @@ def _register_confusion_matrix_computer(*, session: snowpark.Session, statement_
             # 1. Initialize variables.
             if not self._initialized:
                 self._n_cols = len(input_row)
-                self._batched_rows = np.zeros((self.BATCH_SIZE, self._n_cols))
+                self._batched_rows = np.zeros((batch_size, self._n_cols))
                 self._n_label = n_label
                 self._confusion_matrix = np.zeros((self._n_label, self._n_label))
                 self._initialized = True
@@ -264,7 +263,7 @@ def _register_confusion_matrix_computer(*, session: snowpark.Session, statement_
             self._cur_count += 1
 
             # 2. Compute incremental confusion matrix for the batch.
-            if self._cur_count >= self.BATCH_SIZE:
+            if self._cur_count >= batch_size:
                 self.update_confusion_matrix()
                 self._cur_count = 0
 
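The classification.py change above swaps the class-level BATCH_SIZE for the shared metrics_utils.BATCH_SIZE constant but keeps the same accumulation pattern: buffer (sample_weight, y_true, y_pred) rows and fold them into a running confusion matrix whenever the buffer fills. A minimal, self-contained sketch of that pattern follows; the class and method names are illustrative, not the actual UDTF handler.

import numpy as np

BATCH_SIZE = 1000  # mirrors metrics_utils.BATCH_SIZE

class BatchedConfusionMatrix:
    """Illustrative sketch of the batched accumulation used by the UDTF above."""

    def __init__(self, n_labels, batch_size=BATCH_SIZE):
        self._cm = np.zeros((n_labels, n_labels))
        self._rows = np.zeros((batch_size, 3))  # [sample_weight, y_true, y_pred]
        self._count = 0
        self._batch_size = batch_size

    def add(self, weight, y_true, y_pred):
        self._rows[self._count] = (weight, y_true, y_pred)
        self._count += 1
        if self._count >= self._batch_size:  # flush once the buffer is full
            self._flush()

    def _flush(self):
        batch = self._rows[: self._count]
        # Accumulate each row's weight into the (y_true, y_pred) cell in one pass.
        np.add.at(self._cm, (batch[:, 1].astype(int), batch[:, 2].astype(int)), batch[:, 0])
        self._count = 0

    def result(self):
        self._flush()  # fold in any partially filled batch
        return self._cm

cm = BatchedConfusionMatrix(n_labels=2, batch_size=4)
for w, t, p in [(1, 0, 0), (1, 0, 1), (2, 1, 1), (1, 1, 1), (1, 1, 0)]:
    cm.add(w, t, p)
print(cm.result())  # [[1. 1.], [1. 3.]]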
snowflake/ml/modeling/metrics/metrics_utils.py
@@ -15,6 +15,7 @@ from snowflake.snowpark import Session, functions as F, types as T
 
 LABEL = "LABEL"
 INDEX = "INDEX"
+BATCH_SIZE = 1000
 
 
 def register_accumulator_udtf(*, session: Session, statement_params: Dict[str, Any]) -> str:
@@ -82,7 +83,7 @@ def register_sharded_dot_sum_computer(*, session: Session, statement_params: Dic
         """This class is registered as a UDTF and computes the sum and dot product
         of columns for each partition of rows. The computations across all the partitions happens
         in parallel using the nodes in the warehouse. In order to avoid keeping the entire partition
-        in memory, we batch the rows (size is 1000) and maintain a running sum and dot prod in self._sum_by_count,
+        in memory, we batch the rows and maintain a running sum and dot prod in self._sum_by_count,
         self._sum_by_countd and self._dot_prod respectively. We return these at the end of the partition.
         """
 
@@ -95,7 +96,7 @@ def register_sharded_dot_sum_computer(*, session: Session, statement_params: Dic
            # delta degree of freedom
            self._ddof = 0
            # Setting the batch size to 1000 based on experimentation. Can be fine tuned later.
-           self._batch_size = 1000
+           self._batch_size = BATCH_SIZE
            # 2d array containing a batch of input rows. A batch contains self._batch_size rows.
            self._batched_rows = np.zeros((self._batch_size, 1))
            # 1d array of length = # of cols. Contains sum(col/count) for each column.
@@ -224,7 +225,7 @@ def check_label_columns(
        TypeError: `y_true_col_names` and `y_pred_col_names` are of different types.
        ValueError: Multilabel `y_true_col_names` and `y_pred_col_names` are of different lengths.
    """
-    if type(y_true_col_names) != type(y_pred_col_names):
+    if type(y_true_col_names) is not type(y_pred_col_names):
        raise TypeError(
            "Label columns should be of the same type."
            f"Got y_true_col_names={type(y_true_col_names)} vs y_pred_col_names={type(y_pred_col_names)}."
@@ -300,6 +301,7 @@ def validate_average_pos_label(average: Optional[str] = None, pos_label: Union[s
            "average != 'binary' (got %r). You may use "
            "labels=[pos_label] to specify a single positive class." % (pos_label, average),
            UserWarning,
+           stacklevel=2,
        )
 
 
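The final metrics_utils.py hunk adds stacklevel=2 so that the warning is attributed to the code that called the validation helper rather than to the helper itself. A quick standalone illustration, using a simplified stand-in function rather than the library's actual signature:

import warnings

def validate_pos_label(pos_label):  # simplified stand-in for the real helper
    # stacklevel=2 makes the warning point at the line that called this
    # function instead of at the warnings.warn() call below.
    warnings.warn(
        f"Note that pos_label (set to {pos_label!r}) is ignored when average != 'binary'.",
        UserWarning,
        stacklevel=2,
    )

validate_pos_label(0)  # the reported file/line is this call site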
snowflake/ml/modeling/metrics/ranking.py
@@ -122,7 +122,8 @@ def precision_recall_curve(
         result_module = cloudpickle.loads(pickled_result_module)
         return result_module.serialize(session, (precision, recall, thresholds)) # type: ignore[no-any-return]
 
-    result_object = result.deserialize(session, precision_recall_curve_anon_sproc(session))
+    kwargs = telemetry.get_sproc_statement_params_kwargs(precision_recall_curve_anon_sproc, statement_params)
+    result_object = result.deserialize(session, precision_recall_curve_anon_sproc(session, **kwargs))
     res: Tuple[npt.NDArray[np.float_], npt.NDArray[np.float_], npt.NDArray[np.float_]] = result_object
     return res
 
@@ -271,7 +272,8 @@ def roc_auc_score(
         result_module = cloudpickle.loads(pickled_result_module)
         return result_module.serialize(session, auc) # type: ignore[no-any-return]
 
-    result_object = result.deserialize(session, roc_auc_score_anon_sproc(session))
+    kwargs = telemetry.get_sproc_statement_params_kwargs(roc_auc_score_anon_sproc, statement_params)
+    result_object = result.deserialize(session, roc_auc_score_anon_sproc(session, **kwargs))
     auc: Union[float, npt.NDArray[np.float_]] = result_object
     return auc
 
@@ -372,7 +374,9 @@ def roc_curve(
         result_module = cloudpickle.loads(pickled_result_module)
         return result_module.serialize(session, (fpr, tpr, thresholds)) # type: ignore[no-any-return]
 
-    result_object = result.deserialize(session, roc_curve_anon_sproc(session))
+    kwargs = telemetry.get_sproc_statement_params_kwargs(roc_curve_anon_sproc, statement_params)
+    result_object = result.deserialize(session, roc_curve_anon_sproc(session, **kwargs))
+
     res: Tuple[npt.NDArray[np.float_], npt.NDArray[np.float_], npt.NDArray[np.float_]] = result_object
 
     return res
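All three ranking.py hunks apply the same fix: the statement parameters used for telemetry attribution are now forwarded to the anonymous stored procedure call instead of being dropped. The sketch below only guesses at what telemetry.get_sproc_statement_params_kwargs does, based purely on these call sites; the helper shown is hypothetical and the real implementation may differ.

import inspect
from typing import Any, Callable, Dict

def sproc_statement_params_kwargs(sproc: Callable[..., Any], statement_params: Dict[str, Any]) -> Dict[str, Any]:
    # Forward statement_params only if the generated sproc wrapper accepts
    # that keyword, so wrappers without it keep working unchanged.
    if "statement_params" in inspect.signature(sproc).parameters:
        return {"statement_params": statement_params}
    return {}

# Call-site pattern from the hunks above (names as in ranking.py):
#   kwargs = telemetry.get_sproc_statement_params_kwargs(roc_curve_anon_sproc, statement_params)
#   result_object = result.deserialize(session, roc_curve_anon_sproc(session, **kwargs))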