PyPI - snowflake-ml-python - Versions diffs - 1.4.1__py3-none-any.whl → 1.5.0__py3-none-any.whl - Mend

snowflake-ml-python 1.4.1__py3-none-any.whl → 1.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (206)

snowflake/ml/_internal/env_utils.py +66 -31
snowflake/ml/_internal/exceptions/dataset_error_messages.py +5 -0
snowflake/ml/_internal/exceptions/dataset_errors.py +24 -0
snowflake/ml/_internal/exceptions/error_codes.py +3 -0
snowflake/ml/_internal/lineage/data_source.py +10 -0
snowflake/ml/_internal/lineage/dataset_dataframe.py +44 -0
snowflake/ml/dataset/__init__.py +10 -0
snowflake/ml/dataset/dataset.py +454 -129
snowflake/ml/dataset/dataset_factory.py +53 -0
snowflake/ml/dataset/dataset_metadata.py +103 -0
snowflake/ml/dataset/dataset_reader.py +202 -0
snowflake/ml/feature_store/feature_store.py +408 -282
snowflake/ml/feature_store/feature_view.py +37 -8
snowflake/ml/fileset/embedded_stage_fs.py +146 -0
snowflake/ml/fileset/sfcfs.py +0 -4
snowflake/ml/fileset/snowfs.py +159 -0
snowflake/ml/fileset/stage_fs.py +1 -4
snowflake/ml/model/__init__.py +2 -2
snowflake/ml/model/_api.py +16 -1
snowflake/ml/model/_client/model/model_impl.py +27 -0
snowflake/ml/model/_client/model/model_version_impl.py +135 -0
snowflake/ml/model/_client/ops/model_ops.py +137 -67
snowflake/ml/model/_client/sql/model.py +16 -14
snowflake/ml/model/_client/sql/model_version.py +109 -1
snowflake/ml/model/_deploy_client/image_builds/server_image_builder.py +5 -1
snowflake/ml/model/_deploy_client/image_builds/templates/dockerfile_template +1 -0
snowflake/ml/model/_deploy_client/snowservice/deploy.py +2 -0
snowflake/ml/model/_deploy_client/utils/constants.py +0 -5
snowflake/ml/model/_deploy_client/utils/snowservice_client.py +21 -50
snowflake/ml/model/_model_composer/model_composer.py +22 -1
snowflake/ml/model/_model_composer/model_manifest/model_manifest.py +22 -0
snowflake/ml/model/_model_composer/model_manifest/model_manifest_schema.py +11 -0
snowflake/ml/model/_packager/model_env/model_env.py +41 -0
snowflake/ml/model/_packager/model_meta/model_meta.py +1 -5
snowflake/ml/model/_packager/model_packager.py +0 -3
snowflake/ml/modeling/_internal/local_implementations/pandas_trainer.py +55 -3
snowflake/ml/modeling/_internal/ml_runtime_implementations/ml_runtime_handlers.py +34 -18
snowflake/ml/modeling/_internal/model_trainer.py +7 -0
snowflake/ml/modeling/_internal/model_trainer_builder.py +42 -9
snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_handlers.py +24 -2
snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_trainer.py +261 -16
snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +51 -52
snowflake/ml/modeling/cluster/affinity_propagation.py +51 -52
snowflake/ml/modeling/cluster/agglomerative_clustering.py +51 -52
snowflake/ml/modeling/cluster/birch.py +53 -52
snowflake/ml/modeling/cluster/bisecting_k_means.py +53 -52
snowflake/ml/modeling/cluster/dbscan.py +51 -52
snowflake/ml/modeling/cluster/feature_agglomeration.py +53 -52
snowflake/ml/modeling/cluster/k_means.py +53 -52
snowflake/ml/modeling/cluster/mean_shift.py +51 -52
snowflake/ml/modeling/cluster/mini_batch_k_means.py +53 -52
snowflake/ml/modeling/cluster/optics.py +51 -52
snowflake/ml/modeling/cluster/spectral_biclustering.py +51 -52
snowflake/ml/modeling/cluster/spectral_clustering.py +51 -52
snowflake/ml/modeling/cluster/spectral_coclustering.py +51 -52
snowflake/ml/modeling/compose/column_transformer.py +53 -52
snowflake/ml/modeling/compose/transformed_target_regressor.py +51 -52
snowflake/ml/modeling/covariance/elliptic_envelope.py +51 -52
snowflake/ml/modeling/covariance/empirical_covariance.py +51 -52
snowflake/ml/modeling/covariance/graphical_lasso.py +51 -52
snowflake/ml/modeling/covariance/graphical_lasso_cv.py +51 -52
snowflake/ml/modeling/covariance/ledoit_wolf.py +51 -52
snowflake/ml/modeling/covariance/min_cov_det.py +51 -52
snowflake/ml/modeling/covariance/oas.py +51 -52
snowflake/ml/modeling/covariance/shrunk_covariance.py +51 -52
snowflake/ml/modeling/decomposition/dictionary_learning.py +53 -52
snowflake/ml/modeling/decomposition/factor_analysis.py +53 -52
snowflake/ml/modeling/decomposition/fast_ica.py +53 -52
snowflake/ml/modeling/decomposition/incremental_pca.py +53 -52
snowflake/ml/modeling/decomposition/kernel_pca.py +53 -52
snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +53 -52
snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +53 -52
snowflake/ml/modeling/decomposition/pca.py +53 -52
snowflake/ml/modeling/decomposition/sparse_pca.py +53 -52
snowflake/ml/modeling/decomposition/truncated_svd.py +53 -52
snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +53 -52
snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +51 -52
snowflake/ml/modeling/ensemble/ada_boost_classifier.py +51 -52
snowflake/ml/modeling/ensemble/ada_boost_regressor.py +51 -52
snowflake/ml/modeling/ensemble/bagging_classifier.py +51 -52
snowflake/ml/modeling/ensemble/bagging_regressor.py +51 -52
snowflake/ml/modeling/ensemble/extra_trees_classifier.py +51 -52
snowflake/ml/modeling/ensemble/extra_trees_regressor.py +51 -52
snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +51 -52
snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +51 -52
snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +51 -52
snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +51 -52
snowflake/ml/modeling/ensemble/isolation_forest.py +51 -52
snowflake/ml/modeling/ensemble/random_forest_classifier.py +51 -52
snowflake/ml/modeling/ensemble/random_forest_regressor.py +51 -52
snowflake/ml/modeling/ensemble/stacking_regressor.py +53 -52
snowflake/ml/modeling/ensemble/voting_classifier.py +53 -52
snowflake/ml/modeling/ensemble/voting_regressor.py +53 -52
snowflake/ml/modeling/feature_selection/generic_univariate_select.py +53 -52
snowflake/ml/modeling/feature_selection/select_fdr.py +53 -52
snowflake/ml/modeling/feature_selection/select_fpr.py +53 -52
snowflake/ml/modeling/feature_selection/select_fwe.py +53 -52
snowflake/ml/modeling/feature_selection/select_k_best.py +53 -52
snowflake/ml/modeling/feature_selection/select_percentile.py +53 -52
snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +53 -52
snowflake/ml/modeling/feature_selection/variance_threshold.py +53 -52
snowflake/ml/modeling/framework/base.py +63 -36
snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +51 -52
snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +51 -52
snowflake/ml/modeling/impute/iterative_imputer.py +53 -52
snowflake/ml/modeling/impute/knn_imputer.py +53 -52
snowflake/ml/modeling/impute/missing_indicator.py +53 -52
snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +53 -52
snowflake/ml/modeling/kernel_approximation/nystroem.py +53 -52
snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +53 -52
snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +53 -52
snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +53 -52
snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +51 -52
snowflake/ml/modeling/lightgbm/lgbm_classifier.py +51 -52
snowflake/ml/modeling/lightgbm/lgbm_regressor.py +51 -52
snowflake/ml/modeling/linear_model/ard_regression.py +51 -52
snowflake/ml/modeling/linear_model/bayesian_ridge.py +51 -52
snowflake/ml/modeling/linear_model/elastic_net.py +51 -52
snowflake/ml/modeling/linear_model/elastic_net_cv.py +51 -52
snowflake/ml/modeling/linear_model/gamma_regressor.py +51 -52
snowflake/ml/modeling/linear_model/huber_regressor.py +51 -52
snowflake/ml/modeling/linear_model/lars.py +51 -52
snowflake/ml/modeling/linear_model/lars_cv.py +51 -52
snowflake/ml/modeling/linear_model/lasso.py +51 -52
snowflake/ml/modeling/linear_model/lasso_cv.py +51 -52
snowflake/ml/modeling/linear_model/lasso_lars.py +51 -52
snowflake/ml/modeling/linear_model/lasso_lars_cv.py +51 -52
snowflake/ml/modeling/linear_model/lasso_lars_ic.py +51 -52
snowflake/ml/modeling/linear_model/linear_regression.py +51 -52
snowflake/ml/modeling/linear_model/logistic_regression.py +51 -52
snowflake/ml/modeling/linear_model/logistic_regression_cv.py +51 -52
snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +51 -52
snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +51 -52
snowflake/ml/modeling/linear_model/multi_task_lasso.py +51 -52
snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +51 -52
snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +51 -52
snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +51 -52
snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +51 -52
snowflake/ml/modeling/linear_model/perceptron.py +51 -52
snowflake/ml/modeling/linear_model/poisson_regressor.py +51 -52
snowflake/ml/modeling/linear_model/ransac_regressor.py +51 -52
snowflake/ml/modeling/linear_model/ridge.py +51 -52
snowflake/ml/modeling/linear_model/ridge_classifier.py +51 -52
snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +51 -52
snowflake/ml/modeling/linear_model/ridge_cv.py +51 -52
snowflake/ml/modeling/linear_model/sgd_classifier.py +51 -52
snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +51 -52
snowflake/ml/modeling/linear_model/sgd_regressor.py +51 -52
snowflake/ml/modeling/linear_model/theil_sen_regressor.py +51 -52
snowflake/ml/modeling/linear_model/tweedie_regressor.py +51 -52
snowflake/ml/modeling/manifold/isomap.py +53 -52
snowflake/ml/modeling/manifold/mds.py +53 -52
snowflake/ml/modeling/manifold/spectral_embedding.py +53 -52
snowflake/ml/modeling/manifold/tsne.py +53 -52
snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +51 -52
snowflake/ml/modeling/mixture/gaussian_mixture.py +51 -52
snowflake/ml/modeling/model_selection/grid_search_cv.py +21 -23
snowflake/ml/modeling/model_selection/randomized_search_cv.py +38 -20
snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +51 -52
snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +51 -52
snowflake/ml/modeling/multiclass/output_code_classifier.py +51 -52
snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +51 -52
snowflake/ml/modeling/naive_bayes/categorical_nb.py +51 -52
snowflake/ml/modeling/naive_bayes/complement_nb.py +51 -52
snowflake/ml/modeling/naive_bayes/gaussian_nb.py +51 -52
snowflake/ml/modeling/naive_bayes/multinomial_nb.py +51 -52
snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +51 -52
snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +51 -52
snowflake/ml/modeling/neighbors/kernel_density.py +51 -52
snowflake/ml/modeling/neighbors/local_outlier_factor.py +51 -52
snowflake/ml/modeling/neighbors/nearest_centroid.py +51 -52
snowflake/ml/modeling/neighbors/nearest_neighbors.py +51 -52
snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +53 -52
snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +51 -52
snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +51 -52
snowflake/ml/modeling/neural_network/bernoulli_rbm.py +53 -52
snowflake/ml/modeling/neural_network/mlp_classifier.py +51 -52
snowflake/ml/modeling/neural_network/mlp_regressor.py +51 -52
snowflake/ml/modeling/pipeline/pipeline.py +514 -32
snowflake/ml/modeling/preprocessing/one_hot_encoder.py +12 -0
snowflake/ml/modeling/preprocessing/polynomial_features.py +53 -52
snowflake/ml/modeling/semi_supervised/label_propagation.py +51 -52
snowflake/ml/modeling/semi_supervised/label_spreading.py +51 -52
snowflake/ml/modeling/svm/linear_svc.py +51 -52
snowflake/ml/modeling/svm/linear_svr.py +51 -52
snowflake/ml/modeling/svm/nu_svc.py +51 -52
snowflake/ml/modeling/svm/nu_svr.py +51 -52
snowflake/ml/modeling/svm/svc.py +51 -52
snowflake/ml/modeling/svm/svr.py +51 -52
snowflake/ml/modeling/tree/decision_tree_classifier.py +51 -52
snowflake/ml/modeling/tree/decision_tree_regressor.py +51 -52
snowflake/ml/modeling/tree/extra_tree_classifier.py +51 -52
snowflake/ml/modeling/tree/extra_tree_regressor.py +51 -52
snowflake/ml/modeling/xgboost/xgb_classifier.py +51 -52
snowflake/ml/modeling/xgboost/xgb_regressor.py +51 -52
snowflake/ml/modeling/xgboost/xgbrf_classifier.py +51 -52
snowflake/ml/modeling/xgboost/xgbrf_regressor.py +51 -52
snowflake/ml/registry/model_registry.py +3 -149
snowflake/ml/version.py +1 -1
{snowflake_ml_python-1.4.1.dist-info → snowflake_ml_python-1.5.0.dist-info}/METADATA +63 -2
{snowflake_ml_python-1.4.1.dist-info → snowflake_ml_python-1.5.0.dist-info}/RECORD +204 -196
snowflake/ml/registry/_artifact_manager.py +0 -156
snowflake/ml/registry/artifact.py +0 -46
{snowflake_ml_python-1.4.1.dist-info → snowflake_ml_python-1.5.0.dist-info}/LICENSE.txt +0 -0
{snowflake_ml_python-1.4.1.dist-info → snowflake_ml_python-1.5.0.dist-info}/WHEEL +0 -0
{snowflake_ml_python-1.4.1.dist-info → snowflake_ml_python-1.5.0.dist-info}/top_level.txt +0 -0

snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_trainer.py CHANGED Viewed

@@ -23,20 +23,26 @@ from snowflake.ml._internal.utils.temp_file_utils import (
     cleanup_temp_files,
     get_temp_file_path,
 )
+from snowflake.ml.modeling._internal.estimator_utils import handle_inference_result
 from snowflake.ml.modeling._internal.model_specifications import (
     ModelSpecifications,
     ModelSpecificationsBuilder,
 )
-from snowflake.snowpark import DataFrame, Session, exceptions as snowpark_exceptions
+from snowflake.snowpark import (
+    DataFrame,
+    Session,
+    exceptions as snowpark_exceptions,
+    functions as F,
+)
 from snowflake.snowpark._internal.utils import (
     TempObjectType,
     random_name_for_temp_object,
 )
-from snowflake.snowpark.functions import sproc
 from snowflake.snowpark.stored_procedure import StoredProcedure
 cp.register_pickle_by_value(inspect.getmodule(get_temp_file_path))
 cp.register_pickle_by_value(inspect.getmodule(identifier.get_inferred_name))
+cp.register_pickle_by_value(inspect.getmodule(handle_inference_result))
 _PROJECT = "ModelDevelopment"
@@ -122,7 +128,7 @@ class SnowparkModelTrainer:
             project=_PROJECT,
             subproject=self._subproject,
             function_name=telemetry.get_statement_params_full_func_name(inspect.currentframe(), self._class_name),
-            api_calls=[sproc],
+            api_calls=[F.sproc],
             custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
         )
         # Put locally serialized transform on stage.
@@ -292,7 +298,7 @@ class SnowparkModelTrainer:
         """
         imports = model_spec.imports  # In order for the sproc to not resolve this reference in snowflake.ml
-        def fit_wrapper_function(
+        def fit_predict_wrapper_function(
             session: Session,
             sql_queries: List[str],
             stage_transform_file_name: str,
@@ -329,7 +335,7 @@ class SnowparkModelTrainer:
             with open(local_transform_file_path, mode="r+b") as local_transform_file_obj:
                 estimator = cp.load(local_transform_file_obj)
-            fit_predict_result = estimator.fit_predict(df[input_cols])
+            fit_predict_result = estimator.fit_predict(X=df[input_cols])
             local_result_file_name = get_temp_file_path()
@@ -349,8 +355,16 @@ class SnowparkModelTrainer:
                 fit_predict_result_pd = pd.DataFrame(data=fit_predict_result, columns=expected_output_cols_list)
             else:
                 df = df.copy()
-                fit_predict_result_pd = pd.DataFrame(data=fit_predict_result, columns=expected_output_cols_list)
-                fit_predict_result_pd = pd.concat([df, fit_predict_result_pd], axis=1)
+                # in case the output column name overlap with the input column names,
+                # remove the ones in input column names
+                remove_dataset_col_name_exist_in_output_col = list(set(df.columns) - set(expected_output_cols_list))
+                fit_predict_result_pd = pd.concat(
+                    [
+                        df[remove_dataset_col_name_exist_in_output_col],
+                        pd.DataFrame(data=fit_predict_result, columns=expected_output_cols_list),
+                    ],
+                    axis=1,
+                )
             # write into a temp table in sproc and load the table from outside
             session.write_pandas(
@@ -361,17 +375,150 @@ class SnowparkModelTrainer:
             # to pass debug information to the caller.
             return str(os.path.basename(local_result_file_name))
-        return fit_wrapper_function
+        return fit_predict_wrapper_function
+    def _build_fit_transform_wrapper_sproc(
+        self,
+        model_spec: ModelSpecifications,
+    ) -> Callable[
+        [
+            Session,
+            List[str],
+            str,
+            str,
+            List[str],
+            Optional[List[str]],
+            Optional[str],
+            Dict[str, str],
+            bool,
+            List[str],
+            str,
+        ],
+        str,
+    ]:
+        """
+        Constructs and returns a python stored procedure function to be used for training model.
+        Args:
+            model_spec: ModelSpecifications object that contains model specific information
+                like required imports, package dependencies, etc.
+        Returns:
+            A callable that can be registered as a stored procedure.
+        """
+        imports = model_spec.imports  # In order for the sproc to not resolve this reference in snowflake.ml
+        def fit_transform_wrapper_function(
+            session: Session,
+            sql_queries: List[str],
+            stage_transform_file_name: str,
+            stage_result_file_name: str,
+            input_cols: List[str],
+            label_cols: Optional[List[str]],
+            sample_weight_col: Optional[str],
+            statement_params: Dict[str, str],
+            drop_input_cols: bool,
+            expected_output_cols_list: List[str],
+            fit_transform_result_name: str,
+        ) -> str:
+            import os
+            import cloudpickle as cp
+            import pandas as pd
+            for import_name in imports:
+                importlib.import_module(import_name)
+            # Execute snowpark queries and obtain the results as pandas dataframe
+            # NB: this implies that the result data must fit into memory.
+            for query in sql_queries[:-1]:
+                _ = session.sql(query).collect(statement_params=statement_params)
+            sp_df = session.sql(sql_queries[-1])
+            df: pd.DataFrame = sp_df.to_pandas(statement_params=statement_params)
+            df.columns = sp_df.columns
+            local_transform_file_name = get_temp_file_path()
+            session.file.get(stage_transform_file_name, local_transform_file_name, statement_params=statement_params)
+            local_transform_file_path = os.path.join(
+                local_transform_file_name, os.listdir(local_transform_file_name)[0]
+            )
+            with open(local_transform_file_path, mode="r+b") as local_transform_file_obj:
+                estimator = cp.load(local_transform_file_obj)
+            argspec = inspect.getfullargspec(estimator.fit)
+            args = {"X": df[input_cols]}
+            if label_cols:
+                label_arg_name = "Y" if "Y" in argspec.args else "y"
+                args[label_arg_name] = df[label_cols].squeeze()
+            if sample_weight_col is not None and "sample_weight" in argspec.args:
+                args["sample_weight"] = df[sample_weight_col].squeeze()
+            fit_transform_result = estimator.fit_transform(**args)
+            local_result_file_name = get_temp_file_path()
+            with open(local_result_file_name, mode="w+b") as local_result_file_obj:
+                cp.dump(estimator, local_result_file_obj)
+            session.file.put(
+                local_result_file_name,
+                stage_result_file_name,
+                auto_compress=False,
+                overwrite=True,
+                statement_params=statement_params,
+            )
+            transformed_numpy_array, output_cols = handle_inference_result(
+                inference_res=fit_transform_result,
+                output_cols=expected_output_cols_list,
+                inference_method="fit_transform",
+                within_udf=True,
+            )
+            if len(transformed_numpy_array.shape) > 1:
+                if transformed_numpy_array.shape[1] != len(output_cols):
+                    series = pd.Series(transformed_numpy_array.tolist())
+                    transformed_pandas_df = pd.DataFrame(series, columns=output_cols)
+                else:
+                    transformed_pandas_df = pd.DataFrame(transformed_numpy_array.tolist(), columns=output_cols)
+            else:
+                transformed_pandas_df = pd.DataFrame(transformed_numpy_array, columns=output_cols)
+            # store the transform output
+            if not drop_input_cols:
+                df = df.copy()
+                # in case the output column name overlap with the input column names,
+                # remove the ones in input column names
+                remove_dataset_col_name_exist_in_output_col = list(set(df.columns) - set(output_cols))
+                transformed_pandas_df = pd.concat(
+                    [df[remove_dataset_col_name_exist_in_output_col], transformed_pandas_df], axis=1
+                )
+            # write into a temp table in sproc and load the table from outside
+            session.write_pandas(
+                transformed_pandas_df,
+                fit_transform_result_name,
+                auto_create_table=True,
+                table_type="temp",
+                quote_identifiers=False,
+            )
+            return str(os.path.basename(local_result_file_name))
+        return fit_transform_wrapper_function
     def _get_fit_predict_wrapper_sproc(self, statement_params: Dict[str, str]) -> StoredProcedure:
         # If the sproc already exists, don't register.
-        if not hasattr(self.session, "_FIT_PRE_WRAPPER_SPROCS"):
-            self.session._FIT_PRE_WRAPPER_SPROCS: Dict[str, StoredProcedure] = {}  # type: ignore[attr-defined, misc]
+        if not hasattr(self.session, "_FIT_WRAPPER_SPROCS"):
+            self.session._FIT_WRAPPER_SPROCS: Dict[str, StoredProcedure] = {}  # type: ignore[attr-defined, misc]
         model_spec = ModelSpecificationsBuilder.build(model=self.estimator)
-        fit_predict_sproc_key = model_spec.__class__.__name__
-        if fit_predict_sproc_key in self.session._FIT_PRE_WRAPPER_SPROCS:  # type: ignore[attr-defined]
-            fit_sproc: StoredProcedure = self.session._FIT_PRE_WRAPPER_SPROCS[  # type: ignore[attr-defined]
+        fit_predict_sproc_key = model_spec.__class__.__name__ + "_fit_predict"
+        if fit_predict_sproc_key in self.session._FIT_WRAPPER_SPROCS:  # type: ignore[attr-defined]
+            fit_sproc: StoredProcedure = self.session._FIT_WRAPPER_SPROCS[  # type: ignore[attr-defined]
                 fit_predict_sproc_key
             ]
             return fit_sproc
@@ -392,12 +539,47 @@ class SnowparkModelTrainer:
             statement_params=statement_params,
         )
-        self.session._FIT_PRE_WRAPPER_SPROCS[  # type: ignore[attr-defined]
+        self.session._FIT_WRAPPER_SPROCS[  # type: ignore[attr-defined]
             fit_predict_sproc_key
         ] = fit_predict_wrapper_sproc
         return fit_predict_wrapper_sproc
+    def _get_fit_transform_wrapper_sproc(self, statement_params: Dict[str, str]) -> StoredProcedure:
+        # If the sproc already exists, don't register.
+        if not hasattr(self.session, "_FIT_WRAPPER_SPROCS"):
+            self.session._FIT_WRAPPER_SPROCS: Dict[str, StoredProcedure] = {}  # type: ignore[attr-defined, misc]
+        model_spec = ModelSpecificationsBuilder.build(model=self.estimator)
+        fit_transform_sproc_key = model_spec.__class__.__name__ + "_fit_transform"
+        if fit_transform_sproc_key in self.session._FIT_WRAPPER_SPROCS:  # type: ignore[attr-defined]
+            fit_sproc: StoredProcedure = self.session._FIT_WRAPPER_SPROCS[  # type: ignore[attr-defined]
+                fit_transform_sproc_key
+            ]
+            return fit_sproc
+        fit_transform_sproc_name = random_name_for_temp_object(TempObjectType.PROCEDURE)
+        relaxed_dependencies = pkg_version_utils.get_valid_pkg_versions_supported_in_snowflake_conda_channel(
+            pkg_versions=model_spec.pkgDependencies, session=self.session
+        )
+        fit_transform_wrapper_sproc = self.session.sproc.register(
+            func=self._build_fit_transform_wrapper_sproc(model_spec=model_spec),
+            is_permanent=False,
+            name=fit_transform_sproc_name,
+            packages=["snowflake-snowpark-python"] + relaxed_dependencies,  # type: ignore[arg-type]
+            replace=True,
+            session=self.session,
+            statement_params=statement_params,
+        )
+        self.session._FIT_WRAPPER_SPROCS[  # type: ignore[attr-defined]
+            fit_transform_sproc_key
+        ] = fit_transform_wrapper_sproc
+        return fit_transform_wrapper_sproc
     def train(self) -> object:
         """
         Trains the model by pushing down the compute into Snowflake using stored procedures.
@@ -498,10 +680,10 @@ class SnowparkModelTrainer:
             custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
         )
-        fit_wrapper_sproc = self._get_fit_predict_wrapper_sproc(statement_params=statement_params)
+        fit_predict_wrapper_sproc = self._get_fit_predict_wrapper_sproc(statement_params=statement_params)
         fit_predict_result_name = random_name_for_temp_object(TempObjectType.TABLE)
-        sproc_export_file_name: str = fit_wrapper_sproc(
+        sproc_export_file_name: str = fit_predict_wrapper_sproc(
             self.session,
             queries,
             stage_transform_file_name,
@@ -521,3 +703,66 @@ class SnowparkModelTrainer:
         )
         return output_result_sp, fitted_estimator
+    def train_fit_transform(
+        self,
+        expected_output_cols_list: List[str],
+        drop_input_cols: Optional[bool] = False,
+    ) -> Tuple[Union[DataFrame, pd.DataFrame], object]:
+        """Trains the model by pushing down the compute into Snowflake using stored procedures.
+        This API is different from fit itself because it would also provide the transform
+        output.
+        Args:
+            expected_output_cols_list (List[str]): The output columns
+                name as a list. Defaults to None.
+            drop_input_cols (Optional[bool]): Boolean to determine whether to
+                drop the input columns from the output dataset.
+        Returns:
+            Tuple[Union[DataFrame, pd.DataFrame], object]: [transformed dataset, estimator]
+        """
+        dataset = snowpark_dataframe_utils.cast_snowpark_dataframe_column_types(self.dataset)
+        # Extract query that generated the dataframe. We will need to pass it to the fit procedure.
+        queries = dataset.queries["queries"]
+        transform_stage_name = self._create_temp_stage()
+        (stage_transform_file_name, stage_result_file_name) = self._upload_model_to_stage(
+            stage_name=transform_stage_name
+        )
+        # Call fit sproc
+        statement_params = telemetry.get_function_usage_statement_params(
+            project=_PROJECT,
+            subproject=self._subproject,
+            function_name=telemetry.get_statement_params_full_func_name(inspect.currentframe(), self._class_name),
+            api_calls=[Session.call],
+            custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
+        )
+        fit_transform_wrapper_sproc = self._get_fit_transform_wrapper_sproc(statement_params=statement_params)
+        fit_transform_result_name = random_name_for_temp_object(TempObjectType.TABLE)
+        sproc_export_file_name: str = fit_transform_wrapper_sproc(
+            self.session,
+            queries,
+            stage_transform_file_name,
+            stage_result_file_name,
+            self.input_cols,
+            self.label_cols,
+            self.sample_weight_col,
+            statement_params,
+            drop_input_cols,
+            expected_output_cols_list,
+            fit_transform_result_name,
+        )
+        output_result_sp = self.session.table(fit_transform_result_name)
+        fitted_estimator = self._fetch_model_from_stage(
+            dir_path=stage_result_file_name,
+            file_name=sproc_export_file_name,
+            statement_params=statement_params,
+        )
+        return output_result_sp, fitted_estimator

snowflake/ml/modeling/calibration/calibrated_classifier_cv.py CHANGED Viewed

@@ -60,12 +60,6 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.calibration".replace("sk
 DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]
-def _is_fit_transform_method_enabled() -> Callable[[Any], bool]:
-    def check(self: BaseTransformer) -> TypeGuard[Callable[..., object]]:
-        return False and callable(getattr(self._sklearn_object, "fit_transform", None))
-    return check
 class CalibratedClassifierCV(BaseTransformer):
     r"""Probability calibration with isotonic regression or logistic regression
     For more details on this class, see [sklearn.calibration.CalibratedClassifierCV]
@@ -328,20 +322,17 @@ class CalibratedClassifierCV(BaseTransformer):
         self,
         dataset: DataFrame,
         inference_method: str,
-    ) -> List[str]:
-        """Util method to run validate that batch inference can be run on a snowpark dataframe and
-        return the available package that exists in the snowflake anaconda channel
+    ) -> None:
+        """Util method to run validate that batch inference can be run on a snowpark dataframe.
         Args:
             dataset: snowpark dataframe
             inference_method: the inference method such as predict, score...
         Raises:
             SnowflakeMLException: If the estimator is not fitted, raise error
             SnowflakeMLException: If the session is None, raise error
-        Returns:
-            A list of available package that exists in the snowflake anaconda channel
         """
         if not self._is_fitted:
             raise exceptions.SnowflakeMLException(
@@ -359,9 +350,7 @@ class CalibratedClassifierCV(BaseTransformer):
                     "Session must not specified for snowpark dataset."
                 ),
             )
-        # Validate that key package version in user workspace are supported in snowflake conda channel
-        return pkg_version_utils.get_valid_pkg_versions_supported_in_snowflake_conda_channel(
-            pkg_versions=self._get_dependencies(), session=session, subproject=_SUBPROJECT)
     @available_if(original_estimator_has_callable("predict"))  # type: ignore[misc]
     @telemetry.send_api_usage_telemetry(
@@ -409,7 +398,8 @@ class CalibratedClassifierCV(BaseTransformer):
                 expected_type_inferred = convert_sp_to_sf_type(label_cols_signatures[0].as_snowpark_type())
-            self._deps = self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
+            self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
+            self._deps = self._get_dependencies()
             assert isinstance(
                 dataset._session, Session
             )  # mypy does not recognize the check in _batch_inference_validate_snowpark()
@@ -492,10 +482,8 @@ class CalibratedClassifierCV(BaseTransformer):
                     if all(x == output_types[0] for x in output_types) and len(output_types) == len(self.output_cols):
                         expected_dtype = convert_sp_to_sf_type(output_types[0])
-            self._deps = self._batch_inference_validate_snowpark(
-                dataset=dataset,
-                inference_method=inference_method,
-            )
+            self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
+            self._deps = self._get_dependencies()
             assert isinstance(dataset._session, Session) # mypy does not recognize the check in _batch_inference_validate_snowpark()
             transform_kwargs = dict(
@@ -562,16 +550,40 @@ class CalibratedClassifierCV(BaseTransformer):
         self._is_fitted = True
         return output_result
+    @available_if(original_estimator_has_callable("fit_transform"))  # type: ignore[misc]
+    def fit_transform(self, dataset: Union[DataFrame, pd.DataFrame], output_cols_prefix: str = "fit_transform_",) -> Union[DataFrame, pd.DataFrame]:
+        """ Method not supported for this class.
-    @available_if(_is_fit_transform_method_enabled())  # type: ignore[misc]
-    def fit_transform(self, dataset: Union[DataFrame, pd.DataFrame]) -> Union[Any, npt.NDArray[Any]]:
-        """
+        Raises:
+            TypeError: Supported dataset types: snowpark.DataFrame, pandas.DataFrame.
+        Args:
+            dataset: Union[snowflake.snowpark.DataFrame, pandas.DataFrame]
+                Snowpark or Pandas DataFrame.
+        output_cols_prefix: Prefix for the response columns
         Returns:
             Transformed dataset.
         """
-        self.fit(dataset)
-        assert self._sklearn_object is not None
-        return self._sklearn_object.embedding_
+        self._infer_input_output_cols(dataset)
+        super()._check_dataset_type(dataset)
+        model_trainer = ModelTrainerBuilder.build_fit_transform(
+            estimator=self._sklearn_object,
+            dataset=dataset,
+            input_cols=self.input_cols,
+            label_cols=self.label_cols,
+            sample_weight_col=self.sample_weight_col,
+            autogenerated=self._autogenerated,
+            subproject=_SUBPROJECT,
+        )
+        output_result, fitted_estimator = model_trainer.train_fit_transform(
+            drop_input_cols=self._drop_input_cols,
+            expected_output_cols_list=self.output_cols,
+        )
+        self._sklearn_object = fitted_estimator
+        self._is_fitted = True
+        return output_result
     def _get_output_column_names(self, output_cols_prefix: str, output_cols: Optional[List[str]] = None) -> List[str]:
@@ -664,10 +676,8 @@ class CalibratedClassifierCV(BaseTransformer):
         expected_output_cols = self._get_output_column_names(output_cols_prefix)
         if isinstance(dataset, DataFrame):
-            self._deps = self._batch_inference_validate_snowpark(
-                dataset=dataset,
-                inference_method=inference_method,
-            )
+            self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
+            self._deps = self._get_dependencies()
             assert isinstance(
                 dataset._session, Session
             )  # mypy does not recognize the check in _batch_inference_validate_snowpark()
@@ -734,10 +744,8 @@ class CalibratedClassifierCV(BaseTransformer):
         transform_kwargs: BatchInferenceKwargsTypedDict = dict()
         if isinstance(dataset, DataFrame):
-            self._deps = self._batch_inference_validate_snowpark(
-                dataset=dataset,
-                inference_method=inference_method,
-            )
+            self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
+            self._deps = self._get_dependencies()
             assert isinstance(
                 dataset._session, Session
             )  # mypy does not recognize the check in _batch_inference_validate_snowpark()
@@ -799,10 +807,8 @@ class CalibratedClassifierCV(BaseTransformer):
         expected_output_cols = self._get_output_column_names(output_cols_prefix)
         if isinstance(dataset, DataFrame):
-            self._deps = self._batch_inference_validate_snowpark(
-                dataset=dataset,
-                inference_method=inference_method,
-            )
+            self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
+            self._deps = self._get_dependencies()
             assert isinstance(
                 dataset._session, Session
             )  # mypy does not recognize the check in _batch_inference_validate_snowpark()
@@ -868,10 +874,8 @@ class CalibratedClassifierCV(BaseTransformer):
         expected_output_cols = self._get_output_column_names(output_cols_prefix)
         if isinstance(dataset, DataFrame):
-            self._deps = self._batch_inference_validate_snowpark(
-                dataset=dataset,
-                inference_method=inference_method,
-            )
+            self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
+            self._deps = self._get_dependencies()
             assert isinstance(dataset._session, Session) # mypy does not recognize the check in _batch_inference_validate_snowpark()
             transform_kwargs = dict(
                 session=dataset._session,
@@ -935,17 +939,15 @@ class CalibratedClassifierCV(BaseTransformer):
         transform_kwargs: ScoreKwargsTypedDict = dict()
         if isinstance(dataset, DataFrame):
-            self._deps = self._batch_inference_validate_snowpark(
-                dataset=dataset,
-                inference_method="score",
-            )
+            self._batch_inference_validate_snowpark(dataset=dataset, inference_method="score")
+            self._deps = self._get_dependencies()
             selected_cols = self._get_active_columns()
             if len(selected_cols) > 0:
                 dataset = dataset.select(selected_cols)
             assert isinstance(dataset._session, Session) # keep mypy happy
             transform_kwargs = dict(
                 session=dataset._session,
-                dependencies=["snowflake-snowpark-python"] + self._deps,
+                dependencies=self._deps,
                 score_sproc_imports=['sklearn'],
             )
         elif isinstance(dataset, pd.DataFrame):
@@ -1010,11 +1012,8 @@ class CalibratedClassifierCV(BaseTransformer):
         if isinstance(dataset, DataFrame):
-            self._deps = self._batch_inference_validate_snowpark(
-                dataset=dataset,
-                inference_method=inference_method,
-            )
+            self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
+            self._deps = self._get_dependencies()
             assert isinstance(dataset._session, Session) # mypy does not recognize the check in _batch_inference_validate_snowpark()
             transform_kwargs = dict(
                 session = dataset._session,

snowflake-ml-python 1.4.1__py3-none-any.whl → 1.5.0__py3-none-any.whl

snowflake-ml-python 1.4.1__py3-none-any.whl → 1.5.0__py3-none-any.whl