snowflake-ml-python 1.4.1__py3-none-any.whl → 1.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- snowflake/ml/_internal/env_utils.py +72 -31
- snowflake/ml/_internal/exceptions/dataset_error_messages.py +5 -0
- snowflake/ml/_internal/exceptions/dataset_errors.py +24 -0
- snowflake/ml/_internal/exceptions/error_codes.py +3 -0
- snowflake/ml/_internal/lineage/data_source.py +10 -0
- snowflake/ml/_internal/lineage/lineage_utils.py +95 -0
- snowflake/ml/_internal/telemetry.py +1 -0
- snowflake/ml/_internal/utils/identifier.py +1 -1
- snowflake/ml/_internal/utils/sql_identifier.py +14 -1
- snowflake/ml/dataset/__init__.py +11 -0
- snowflake/ml/dataset/dataset.py +455 -129
- snowflake/ml/dataset/dataset_factory.py +53 -0
- snowflake/ml/dataset/dataset_metadata.py +103 -0
- snowflake/ml/dataset/dataset_reader.py +199 -0
- snowflake/ml/feature_store/__init__.py +6 -0
- snowflake/ml/feature_store/access_manager.py +279 -0
- snowflake/ml/feature_store/feature_store.py +544 -358
- snowflake/ml/feature_store/feature_view.py +55 -16
- snowflake/ml/fileset/embedded_stage_fs.py +149 -0
- snowflake/ml/fileset/sfcfs.py +0 -4
- snowflake/ml/fileset/snowfs.py +160 -0
- snowflake/ml/fileset/stage_fs.py +25 -10
- snowflake/ml/model/__init__.py +2 -2
- snowflake/ml/model/_api.py +16 -1
- snowflake/ml/model/_client/model/model_impl.py +65 -31
- snowflake/ml/model/_client/model/model_version_impl.py +159 -2
- snowflake/ml/model/_client/ops/metadata_ops.py +27 -4
- snowflake/ml/model/_client/ops/model_ops.py +268 -83
- snowflake/ml/model/_client/sql/_base.py +34 -0
- snowflake/ml/model/_client/sql/model.py +42 -47
- snowflake/ml/model/_client/sql/model_version.py +164 -39
- snowflake/ml/model/_client/sql/stage.py +6 -32
- snowflake/ml/model/_client/sql/tag.py +32 -56
- snowflake/ml/model/_deploy_client/image_builds/server_image_builder.py +5 -1
- snowflake/ml/model/_deploy_client/image_builds/templates/dockerfile_template +1 -0
- snowflake/ml/model/_deploy_client/snowservice/deploy.py +2 -0
- snowflake/ml/model/_deploy_client/utils/constants.py +0 -5
- snowflake/ml/model/_deploy_client/utils/snowservice_client.py +21 -50
- snowflake/ml/model/_model_composer/model_composer.py +22 -1
- snowflake/ml/model/_model_composer/model_manifest/model_manifest.py +22 -0
- snowflake/ml/model/_model_composer/model_manifest/model_manifest_schema.py +11 -0
- snowflake/ml/model/_packager/model_env/model_env.py +41 -0
- snowflake/ml/model/_packager/model_handlers/mlflow.py +2 -1
- snowflake/ml/model/_packager/model_meta/model_meta.py +1 -5
- snowflake/ml/model/_packager/model_packager.py +0 -3
- snowflake/ml/modeling/_internal/local_implementations/pandas_trainer.py +55 -3
- snowflake/ml/modeling/_internal/ml_runtime_implementations/ml_runtime_handlers.py +34 -18
- snowflake/ml/modeling/_internal/model_trainer.py +7 -0
- snowflake/ml/modeling/_internal/model_trainer_builder.py +42 -9
- snowflake/ml/modeling/_internal/snowpark_implementations/distributed_hpo_trainer.py +50 -21
- snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_handlers.py +24 -2
- snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_trainer.py +340 -17
- snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +51 -52
- snowflake/ml/modeling/cluster/affinity_propagation.py +51 -52
- snowflake/ml/modeling/cluster/agglomerative_clustering.py +51 -52
- snowflake/ml/modeling/cluster/birch.py +53 -52
- snowflake/ml/modeling/cluster/bisecting_k_means.py +53 -52
- snowflake/ml/modeling/cluster/dbscan.py +51 -52
- snowflake/ml/modeling/cluster/feature_agglomeration.py +53 -52
- snowflake/ml/modeling/cluster/k_means.py +53 -52
- snowflake/ml/modeling/cluster/mean_shift.py +51 -52
- snowflake/ml/modeling/cluster/mini_batch_k_means.py +53 -52
- snowflake/ml/modeling/cluster/optics.py +51 -52
- snowflake/ml/modeling/cluster/spectral_biclustering.py +51 -52
- snowflake/ml/modeling/cluster/spectral_clustering.py +51 -52
- snowflake/ml/modeling/cluster/spectral_coclustering.py +51 -52
- snowflake/ml/modeling/compose/column_transformer.py +53 -52
- snowflake/ml/modeling/compose/transformed_target_regressor.py +51 -52
- snowflake/ml/modeling/covariance/elliptic_envelope.py +51 -52
- snowflake/ml/modeling/covariance/empirical_covariance.py +51 -52
- snowflake/ml/modeling/covariance/graphical_lasso.py +51 -52
- snowflake/ml/modeling/covariance/graphical_lasso_cv.py +51 -52
- snowflake/ml/modeling/covariance/ledoit_wolf.py +51 -52
- snowflake/ml/modeling/covariance/min_cov_det.py +51 -52
- snowflake/ml/modeling/covariance/oas.py +51 -52
- snowflake/ml/modeling/covariance/shrunk_covariance.py +51 -52
- snowflake/ml/modeling/decomposition/dictionary_learning.py +53 -52
- snowflake/ml/modeling/decomposition/factor_analysis.py +53 -52
- snowflake/ml/modeling/decomposition/fast_ica.py +53 -52
- snowflake/ml/modeling/decomposition/incremental_pca.py +53 -52
- snowflake/ml/modeling/decomposition/kernel_pca.py +53 -52
- snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +53 -52
- snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +53 -52
- snowflake/ml/modeling/decomposition/pca.py +53 -52
- snowflake/ml/modeling/decomposition/sparse_pca.py +53 -52
- snowflake/ml/modeling/decomposition/truncated_svd.py +53 -52
- snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +53 -52
- snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +51 -52
- snowflake/ml/modeling/ensemble/ada_boost_classifier.py +51 -52
- snowflake/ml/modeling/ensemble/ada_boost_regressor.py +51 -52
- snowflake/ml/modeling/ensemble/bagging_classifier.py +51 -52
- snowflake/ml/modeling/ensemble/bagging_regressor.py +51 -52
- snowflake/ml/modeling/ensemble/extra_trees_classifier.py +51 -52
- snowflake/ml/modeling/ensemble/extra_trees_regressor.py +51 -52
- snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +51 -52
- snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +51 -52
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +51 -52
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +51 -52
- snowflake/ml/modeling/ensemble/isolation_forest.py +51 -52
- snowflake/ml/modeling/ensemble/random_forest_classifier.py +51 -52
- snowflake/ml/modeling/ensemble/random_forest_regressor.py +51 -52
- snowflake/ml/modeling/ensemble/stacking_regressor.py +53 -52
- snowflake/ml/modeling/ensemble/voting_classifier.py +53 -52
- snowflake/ml/modeling/ensemble/voting_regressor.py +53 -52
- snowflake/ml/modeling/feature_selection/generic_univariate_select.py +53 -52
- snowflake/ml/modeling/feature_selection/select_fdr.py +53 -52
- snowflake/ml/modeling/feature_selection/select_fpr.py +53 -52
- snowflake/ml/modeling/feature_selection/select_fwe.py +53 -52
- snowflake/ml/modeling/feature_selection/select_k_best.py +53 -52
- snowflake/ml/modeling/feature_selection/select_percentile.py +53 -52
- snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +53 -52
- snowflake/ml/modeling/feature_selection/variance_threshold.py +53 -52
- snowflake/ml/modeling/framework/base.py +64 -36
- snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +51 -52
- snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +51 -52
- snowflake/ml/modeling/impute/iterative_imputer.py +53 -52
- snowflake/ml/modeling/impute/knn_imputer.py +53 -52
- snowflake/ml/modeling/impute/missing_indicator.py +53 -52
- snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +53 -52
- snowflake/ml/modeling/kernel_approximation/nystroem.py +53 -52
- snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +53 -52
- snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +53 -52
- snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +53 -52
- snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +51 -52
- snowflake/ml/modeling/lightgbm/lgbm_classifier.py +51 -52
- snowflake/ml/modeling/lightgbm/lgbm_regressor.py +51 -52
- snowflake/ml/modeling/linear_model/ard_regression.py +51 -52
- snowflake/ml/modeling/linear_model/bayesian_ridge.py +51 -52
- snowflake/ml/modeling/linear_model/elastic_net.py +51 -52
- snowflake/ml/modeling/linear_model/elastic_net_cv.py +51 -52
- snowflake/ml/modeling/linear_model/gamma_regressor.py +51 -52
- snowflake/ml/modeling/linear_model/huber_regressor.py +51 -52
- snowflake/ml/modeling/linear_model/lars.py +51 -52
- snowflake/ml/modeling/linear_model/lars_cv.py +51 -52
- snowflake/ml/modeling/linear_model/lasso.py +51 -52
- snowflake/ml/modeling/linear_model/lasso_cv.py +51 -52
- snowflake/ml/modeling/linear_model/lasso_lars.py +51 -52
- snowflake/ml/modeling/linear_model/lasso_lars_cv.py +51 -52
- snowflake/ml/modeling/linear_model/lasso_lars_ic.py +51 -52
- snowflake/ml/modeling/linear_model/linear_regression.py +51 -52
- snowflake/ml/modeling/linear_model/logistic_regression.py +51 -52
- snowflake/ml/modeling/linear_model/logistic_regression_cv.py +51 -52
- snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +51 -52
- snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +51 -52
- snowflake/ml/modeling/linear_model/multi_task_lasso.py +51 -52
- snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +51 -52
- snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +51 -52
- snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +51 -52
- snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +51 -52
- snowflake/ml/modeling/linear_model/perceptron.py +51 -52
- snowflake/ml/modeling/linear_model/poisson_regressor.py +51 -52
- snowflake/ml/modeling/linear_model/ransac_regressor.py +51 -52
- snowflake/ml/modeling/linear_model/ridge.py +51 -52
- snowflake/ml/modeling/linear_model/ridge_classifier.py +51 -52
- snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +51 -52
- snowflake/ml/modeling/linear_model/ridge_cv.py +51 -52
- snowflake/ml/modeling/linear_model/sgd_classifier.py +51 -52
- snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +51 -52
- snowflake/ml/modeling/linear_model/sgd_regressor.py +51 -52
- snowflake/ml/modeling/linear_model/theil_sen_regressor.py +51 -52
- snowflake/ml/modeling/linear_model/tweedie_regressor.py +51 -52
- snowflake/ml/modeling/manifold/isomap.py +53 -52
- snowflake/ml/modeling/manifold/mds.py +53 -52
- snowflake/ml/modeling/manifold/spectral_embedding.py +53 -52
- snowflake/ml/modeling/manifold/tsne.py +53 -52
- snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +51 -52
- snowflake/ml/modeling/mixture/gaussian_mixture.py +51 -52
- snowflake/ml/modeling/model_selection/grid_search_cv.py +21 -23
- snowflake/ml/modeling/model_selection/randomized_search_cv.py +38 -20
- snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +51 -52
- snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +51 -52
- snowflake/ml/modeling/multiclass/output_code_classifier.py +51 -52
- snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +51 -52
- snowflake/ml/modeling/naive_bayes/categorical_nb.py +51 -52
- snowflake/ml/modeling/naive_bayes/complement_nb.py +51 -52
- snowflake/ml/modeling/naive_bayes/gaussian_nb.py +51 -52
- snowflake/ml/modeling/naive_bayes/multinomial_nb.py +51 -52
- snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +51 -52
- snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +51 -52
- snowflake/ml/modeling/neighbors/kernel_density.py +51 -52
- snowflake/ml/modeling/neighbors/local_outlier_factor.py +51 -52
- snowflake/ml/modeling/neighbors/nearest_centroid.py +51 -52
- snowflake/ml/modeling/neighbors/nearest_neighbors.py +51 -52
- snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +53 -52
- snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +51 -52
- snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +51 -52
- snowflake/ml/modeling/neural_network/bernoulli_rbm.py +53 -52
- snowflake/ml/modeling/neural_network/mlp_classifier.py +51 -52
- snowflake/ml/modeling/neural_network/mlp_regressor.py +51 -52
- snowflake/ml/modeling/pipeline/pipeline.py +538 -36
- snowflake/ml/modeling/preprocessing/one_hot_encoder.py +12 -0
- snowflake/ml/modeling/preprocessing/polynomial_features.py +53 -52
- snowflake/ml/modeling/semi_supervised/label_propagation.py +51 -52
- snowflake/ml/modeling/semi_supervised/label_spreading.py +51 -52
- snowflake/ml/modeling/svm/linear_svc.py +51 -52
- snowflake/ml/modeling/svm/linear_svr.py +51 -52
- snowflake/ml/modeling/svm/nu_svc.py +51 -52
- snowflake/ml/modeling/svm/nu_svr.py +51 -52
- snowflake/ml/modeling/svm/svc.py +51 -52
- snowflake/ml/modeling/svm/svr.py +51 -52
- snowflake/ml/modeling/tree/decision_tree_classifier.py +51 -52
- snowflake/ml/modeling/tree/decision_tree_regressor.py +51 -52
- snowflake/ml/modeling/tree/extra_tree_classifier.py +51 -52
- snowflake/ml/modeling/tree/extra_tree_regressor.py +51 -52
- snowflake/ml/modeling/xgboost/xgb_classifier.py +51 -52
- snowflake/ml/modeling/xgboost/xgb_regressor.py +51 -52
- snowflake/ml/modeling/xgboost/xgbrf_classifier.py +51 -52
- snowflake/ml/modeling/xgboost/xgbrf_regressor.py +51 -52
- snowflake/ml/registry/_manager/model_manager.py +36 -7
- snowflake/ml/registry/model_registry.py +3 -149
- snowflake/ml/version.py +1 -1
- {snowflake_ml_python-1.4.1.dist-info → snowflake_ml_python-1.5.1.dist-info}/METADATA +112 -7
- {snowflake_ml_python-1.4.1.dist-info → snowflake_ml_python-1.5.1.dist-info}/RECORD +216 -206
- snowflake/ml/registry/_artifact_manager.py +0 -156
- snowflake/ml/registry/artifact.py +0 -46
- {snowflake_ml_python-1.4.1.dist-info → snowflake_ml_python-1.5.1.dist-info}/LICENSE.txt +0 -0
- {snowflake_ml_python-1.4.1.dist-info → snowflake_ml_python-1.5.1.dist-info}/WHEEL +0 -0
- {snowflake_ml_python-1.4.1.dist-info → snowflake_ml_python-1.5.1.dist-info}/top_level.txt +0 -0
--- snowflake/ml/modeling/pipeline/pipeline.py (1.4.1)
+++ snowflake/ml/modeling/pipeline/pipeline.py (1.5.1)
@@ -1,7 +1,12 @@
 #!/usr/bin/env python3
+import inspect
+import os
+import posixpath
+import tempfile
 from itertools import chain
 from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union
 
+import cloudpickle as cp
 import numpy as np
 import pandas as pd
 from sklearn import __version__ as skversion, pipeline
@@ -10,14 +15,20 @@ from sklearn.preprocessing import FunctionTransformer
 from sklearn.utils import metaestimators
 
 from snowflake import snowpark
-from snowflake.ml._internal import telemetry
+from snowflake.ml._internal import file_utils, telemetry
 from snowflake.ml._internal.exceptions import error_codes, exceptions
-from snowflake.ml._internal.utils import snowpark_dataframe_utils
+from snowflake.ml._internal.utils import snowpark_dataframe_utils, temp_file_utils
 from snowflake.ml.model.model_signature import ModelSignature, _infer_signature
+from snowflake.ml.modeling._internal.model_transformer_builder import (
+    ModelTransformerBuilder,
+)
 from snowflake.ml.modeling.framework import _utils, base
+from snowflake.snowpark import Session, functions as F
+from snowflake.snowpark._internal import utils as snowpark_utils
 
 _PROJECT = "ModelDevelopment"
 _SUBPROJECT = "Framework"
+IN_ML_RUNTIME_ENV_VAR = "IN_SPCS_ML_RUNTIME"
 
 
 def _final_step_has(attr: str) -> Callable[..., bool]:
@@ -104,7 +115,7 @@ class Pipeline(base.BaseTransformer):
         self._feature_names_in: List[np.ndarray[Any, np.dtype[Any]]] = []
         self._n_features_in: List[int] = []
         self._transformers_to_input_indices: Dict[str, List[int]] = {}
-        self.
+        self._modifies_label_or_sample_weight = True
 
         self._model_signature_dict: Optional[Dict[str, ModelSignature]] = None
 
@@ -113,6 +124,11 @@ class Pipeline(base.BaseTransformer):
             if isinstance(obj, base.BaseTransformer):
                 deps = deps | set(obj._get_dependencies())
         self._deps = list(deps)
+        self._sklearn_object = None
+        self.label_cols = self._get_label_cols()
+        self._is_convertible_to_sklearn = self._is_convertible_to_sklearn_object()
+
+        self._send_pipeline_configuration_telemetry()
 
     @staticmethod
     def _is_estimator(obj: object) -> bool:
@@ -147,6 +163,33 @@ class Pipeline(base.BaseTransformer):
         self._n_features_in = []
         self._transformers_to_input_indices = {}
 
+    def _is_convertible_to_sklearn_object(self) -> bool:
+        """Checks if the pipeline can be converted to a native sklearn pipeline.
+        - We can not create an sklearn pipeline if its label or sample weight column are
+          modified in the pipeline.
+        - We can not create an sklearn pipeline if any of its steps cannot be converted to an sklearn pipeline
+        - We can not create an sklearn pipeline if input columns are specified in any step other than
+          the first step
+
+        Returns:
+            True if the pipeline can be converted to a native sklearn pipeline, else false.
+        """
+        if self._is_pipeline_modifying_label_or_sample_weight():
+            return False
+
+        # check that nested pipelines can be converted to sklearn
+        for _, base_estimator in self.steps:
+            if hasattr(base_estimator, "_is_convertible_to_sklearn_object"):
+                if not base_estimator._is_convertible_to_sklearn_object():
+                    return False
+
+        # check that no column after the first column has 'input columns' set.
+        for _, base_estimator in self.steps[1:]:
+            if base_estimator.get_input_cols():
+                # We only want Falsy values - None and []
+                return False
+        return True
+
     def _is_pipeline_modifying_label_or_sample_weight(self) -> bool:
         """
         Checks if pipeline is modifying label or sample_weight columns.
@@ -188,7 +231,7 @@ class Pipeline(base.BaseTransformer):
         return [c for c in columns if c not in target_cols]
 
     def _append_step_feature_consumption_info(self, step_name: str, all_cols: List[str], input_cols: List[str]) -> None:
-        if self.
+        if self._modifies_label_or_sample_weight:
             all_cols = self._get_sanitized_list_of_columns(all_cols)
             self._feature_names_in.append(np.asarray(all_cols, dtype=object))
             self._n_features_in.append(len(all_cols))
@@ -208,33 +251,173 @@ class Pipeline(base.BaseTransformer):
         self, dataset: Union[snowpark.DataFrame, pd.DataFrame]
     ) -> Union[snowpark.DataFrame, pd.DataFrame]:
         self._reset()
-        self.
+        self._modifies_label_or_sample_weight = not self._is_pipeline_modifying_label_or_sample_weight()
         transformed_dataset = dataset
         for name, trans in self._get_transformers():
             self._append_step_feature_consumption_info(
                 step_name=name, all_cols=transformed_dataset.columns[:], input_cols=trans.get_input_cols()
             )
-
-
-            else:
-                trans.fit(transformed_dataset)
-                transformed_dataset = trans.transform(transformed_dataset)
+            trans.fit(transformed_dataset)
+            transformed_dataset = trans.transform(transformed_dataset)
 
         return transformed_dataset
 
+    def _upload_model_to_stage(self, stage_name: str, estimator: object, session: Session) -> Tuple[str, str]:
+        """
+        Util method to pickle and upload the model to a temp Snowflake stage.
+
+        Args:
+            stage_name: Stage name to save model.
+            estimator: the pipeline estimator itself
+            session: Session object
+
+        Returns:
+            a tuple containing stage file paths for pickled input model for training and location to store trained
+            models(response from training sproc).
+        """
+        # Create a temp file and dump the transform to that file.
+        local_transform_file_name = temp_file_utils.get_temp_file_path()
+        with open(local_transform_file_name, mode="w+b") as local_transform_file:
+            cp.dump(estimator, local_transform_file)
+
+        # Use posixpath to construct stage paths
+        stage_transform_file_name = posixpath.join(stage_name, os.path.basename(local_transform_file_name))
+        stage_result_file_name = posixpath.join(stage_name, os.path.basename(local_transform_file_name))
+
+        # Put locally serialized transform on stage.
+        session.file.put(
+            local_transform_file_name,
+            stage_transform_file_name,
+            auto_compress=False,
+            overwrite=True,
+        )
+
+        temp_file_utils.cleanup_temp_files([local_transform_file_name])
+        return (stage_transform_file_name, stage_result_file_name)
+
+    def _fit_snowpark_dataframe_within_one_sproc(self, session: Session, dataset: snowpark.DataFrame) -> None:
+        # Extract queries that generated the dataframe. We will need to pass it to score procedure.
+        sql_queries = dataset.queries["queries"]
+
+        # Zip the current snowml package
+        with tempfile.TemporaryDirectory() as tmpdir:
+            snowml_zip_module_filename = os.path.join(tmpdir, "snowflake-ml-python.zip")
+            file_utils.zip_python_package(snowml_zip_module_filename, "snowflake.ml")
+            imports = [snowml_zip_module_filename]
+
+            sproc_name = snowpark_utils.random_name_for_temp_object(snowpark_utils.TempObjectType.PROCEDURE)
+            required_deps = self._deps
+            sproc_statement_params = telemetry.get_function_usage_statement_params(
+                project=_PROJECT,
+                subproject="PIPELINE",
+                function_name=telemetry.get_statement_params_full_func_name(
+                    inspect.currentframe(), self.__class__.__name__
+                ),
+                api_calls=[F.sproc],
+            )
+            transform_stage_name = snowpark_utils.random_name_for_temp_object(snowpark_utils.TempObjectType.STAGE)
+            stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {transform_stage_name};"
+            session.sql(stage_creation_query).collect()
+            (stage_estimator_file_name, stage_result_file_name) = self._upload_model_to_stage(
+                transform_stage_name, self, session
+            )
+
+            def pipeline_within_one_sproc(
+                session: Session,
+                sql_queries: List[str],
+                stage_estimator_file_name: str,
+                stage_result_file_name: str,
+                sproc_statement_params: Dict[str, str],
+            ) -> str:
+                import os
+
+                import cloudpickle as cp
+                import pandas as pd
+
+                for query in sql_queries[:-1]:
+                    _ = session.sql(query).collect(statement_params=sproc_statement_params)
+                sp_df = session.sql(sql_queries[-1])
+                df: pd.DataFrame = sp_df.to_pandas(statement_params=sproc_statement_params)
+                df.columns = sp_df.columns
+
+                local_estimator_file_name = temp_file_utils.get_temp_file_path()
+
+                session.file.get(stage_estimator_file_name, local_estimator_file_name)
+
+                local_estimator_file_path = os.path.join(
+                    local_estimator_file_name, os.listdir(local_estimator_file_name)[0]
+                )
+                with open(local_estimator_file_path, mode="r+b") as local_estimator_file_obj:
+                    estimator = cp.load(local_estimator_file_obj)
+
+                estimator.fit(df)
+
+                local_result_file_name = temp_file_utils.get_temp_file_path()
+
+                with open(local_result_file_name, mode="w+b") as local_result_file_obj:
+                    cp.dump(estimator, local_result_file_obj)
+
+                session.file.put(
+                    local_result_file_name,
+                    stage_result_file_name,
+                    auto_compress=False,
+                    overwrite=True,
+                    statement_params=sproc_statement_params,
+                )
+
+                return str(os.path.basename(local_result_file_name))
+
+            session.sproc.register(
+                func=pipeline_within_one_sproc,
+                is_permanent=False,
+                name=sproc_name,
+                packages=required_deps,  # type: ignore[arg-type]
+                replace=True,
+                session=session,
+                anonymous=True,
+                imports=imports,  # type: ignore[arg-type]
+                statement_params=sproc_statement_params,
+            )
+
+            sproc_export_file_name: str = pipeline_within_one_sproc(
+                session,
+                sql_queries,
+                stage_estimator_file_name,
+                stage_result_file_name,
+                sproc_statement_params,
+            )
+
+            local_result_file_name = temp_file_utils.get_temp_file_path()
+            session.file.get(
+                posixpath.join(stage_estimator_file_name, sproc_export_file_name),
+                local_result_file_name,
+                statement_params=sproc_statement_params,
+            )
+
+            with open(os.path.join(local_result_file_name, sproc_export_file_name), mode="r+b") as result_file_obj:
+                fit_estimator = cp.load(result_file_obj)
+
+            temp_file_utils.cleanup_temp_files([local_result_file_name])
+            for key, val in vars(fit_estimator).items():
+                setattr(self, key, val)
+
     @telemetry.send_api_usage_telemetry(
         project=_PROJECT,
         subproject=_SUBPROJECT,
     )
-    def fit(self, dataset: Union[snowpark.DataFrame, pd.DataFrame]) -> "Pipeline":
+    def fit(self, dataset: Union[snowpark.DataFrame, pd.DataFrame], squash: Optional[bool] = False) -> "Pipeline":
         """
         Fit the entire pipeline using the dataset.
 
         Args:
             dataset: Input dataset.
+            squash: Run the whole pipeline within a stored procedure
 
         Returns:
             Fitted pipeline.
+
+        Raises:
+            ValueError: A pipeline incompatible with sklearn is used on MLRS
         """
 
         self._validate_steps()
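The two helpers added above implement the new fit-in-one-sproc path: the unfitted pipeline is cloudpickled to a temporary stage, an anonymous stored procedure materializes the dataframe's queries, fits the estimator, and puts the fitted pickle back, and the caller copies the fitted attributes onto `self`. A minimal sketch of the upload half, assuming an open Snowpark `session` and a pipeline object `pipe` (both placeholders, as is the stage name):

```python
# Hedged sketch of the pickle-to-stage round trip; `pipe`, `session`,
# and MY_TEMP_STAGE are assumptions, not names from the diff.
import os
import posixpath
import tempfile

import cloudpickle as cp

local_path = os.path.join(tempfile.mkdtemp(), "pipeline.pkl")
with open(local_path, "wb") as f:
    cp.dump(pipe, f)  # serialize the unfitted pipeline

# Stage paths always use forward slashes, hence posixpath.
stage_path = posixpath.join("@MY_TEMP_STAGE", os.path.basename(local_path))
session.file.put(local_path, stage_path, auto_compress=False, overwrite=True)
# The sproc later session.file.get()s this file, fits the estimator on the
# materialized query result, and puts the fitted pickle back on the stage.
```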
@@ -243,19 +426,33 @@ class Pipeline(base.BaseTransformer):
             if isinstance(dataset, snowpark.DataFrame)
             else dataset
         )
-        transformed_dataset = self._fit_transform_dataset(dataset)
 
-        estimator = self._get_estimator()
-        if estimator:
-            all_cols = transformed_dataset.columns[:]
-            estimator[1].fit(transformed_dataset)
+        if self._can_be_trained_in_ml_runtime(dataset):
+            if not self._is_convertible_to_sklearn:
+                raise ValueError("This pipeline cannot be converted to an sklearn pipeline.")
+            self._fit_ml_runtime(dataset)
 
-            self._append_step_feature_consumption_info(
-                step_name=estimator[0], all_cols=all_cols, input_cols=estimator[1].get_input_cols()
-            )
+        elif squash and isinstance(dataset, snowpark.DataFrame):
+            session = dataset._session
+            assert session is not None
+            self._fit_snowpark_dataframe_within_one_sproc(session=session, dataset=dataset)
+
+        else:
+            transformed_dataset = self._fit_transform_dataset(dataset)
+
+            estimator = self._get_estimator()
+            if estimator:
+                all_cols = transformed_dataset.columns[:]
+                estimator[1].fit(transformed_dataset)
+
+                self._append_step_feature_consumption_info(
+                    step_name=estimator[0], all_cols=all_cols, input_cols=estimator[1].get_input_cols()
+                )
+
+            self._generate_model_signatures(dataset=dataset)
 
-        self._generate_model_signatures(dataset=dataset)
         self._is_fitted = True
+
         return self
 
     @metaestimators.available_if(_final_step_has("transform"))  # type: ignore[misc]
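With the rewritten `fit` above, training dispatches three ways: the ML Runtime path when running inside SPCS, a single stored procedure when `squash=True` is passed with a Snowpark DataFrame, and the original step-by-step fit otherwise. A usage sketch, assuming an open Snowpark `session` and illustrative table and column names:

```python
# Usage sketch of the new fit() dispatch; TRAIN_DATA and the column
# names are illustrative, and `session` is an assumed Snowpark session.
from snowflake.ml.modeling.pipeline import Pipeline
from snowflake.ml.modeling.preprocessing import StandardScaler
from snowflake.ml.modeling.xgboost import XGBClassifier

pipe = Pipeline(steps=[
    ("scale", StandardScaler(input_cols=["F1", "F2"], output_cols=["F1", "F2"])),
    ("clf", XGBClassifier(input_cols=["F1", "F2"], label_cols=["LABEL"])),
])
train_df = session.table("TRAIN_DATA")

pipe.fit(train_df)               # default: fit/transform each step in turn
pipe.fit(train_df, squash=True)  # whole pipeline fit inside one stored procedure
```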
@@ -280,6 +477,22 @@ class Pipeline(base.BaseTransformer):
             else dataset
         )
 
+        if self._sklearn_object is not None:
+            handler = ModelTransformerBuilder.build(
+                dataset=dataset,
+                estimator=self._sklearn_object,
+                class_name="Pipeline",
+                subproject="",
+                autogenerated=False,
+            )
+            return handler.batch_inference(
+                inference_method="transform",
+                input_cols=self.input_cols if self.input_cols else self._infer_input_cols(dataset),
+                expected_output_cols=self._infer_output_cols(),
+                session=dataset._session,
+                dependencies=self._deps,
+            )
+
         transformed_dataset = self._transform_dataset(dataset=dataset)
         estimator = self._get_estimator()
         if estimator:
@@ -389,8 +602,32 @@ class Pipeline(base.BaseTransformer):
 
         Returns:
             Output dataset.
+
+        Raises:
+            ValueError: An sklearn object has not been fit and stored before calling this function.
         """
-        return self._invoke_estimator_func("predict", dataset)
+        if os.environ.get(IN_ML_RUNTIME_ENV_VAR):
+            if self._sklearn_object is None:
+                raise ValueError("Model must be fit before inference.")
+
+            expected_output_cols = self._infer_output_cols()
+            handler = ModelTransformerBuilder.build(
+                dataset=dataset,
+                estimator=self._sklearn_object,
+                class_name="Pipeline",
+                subproject="",
+                autogenerated=False,
+            )
+            return handler.batch_inference(
+                inference_method="predict",
+                input_cols=self.input_cols if self.input_cols else self._infer_input_cols(dataset),
+                expected_output_cols=expected_output_cols,
+                session=dataset._session,
+                dependencies=self._deps,
+            )
+
+        else:
+            return self._invoke_estimator_func("predict", dataset)
 
     @metaestimators.available_if(_final_step_has("score_samples"))  # type: ignore[misc]
     @telemetry.send_api_usage_telemetry(
@@ -408,8 +645,32 @@ class Pipeline(base.BaseTransformer):
 
         Returns:
             Output dataset.
+
+        Raises:
+            ValueError: An sklearn object has not been fit before calling this function
         """
-        return self._invoke_estimator_func("score_samples", dataset)
+
+        if os.environ.get(IN_ML_RUNTIME_ENV_VAR):
+            if self._sklearn_object is None:
+                raise ValueError("Model must be fit before inference.")
+
+            expected_output_cols = self._get_output_column_names("score_samples")
+            handler = ModelTransformerBuilder.build(
+                dataset=dataset,
+                estimator=self._sklearn_object,
+                class_name="Pipeline",
+                subproject="",
+                autogenerated=False,
+            )
+            return handler.batch_inference(
+                inference_method="score_samples",
+                input_cols=self.input_cols if self.input_cols else self._infer_input_cols(dataset),
+                expected_output_cols=expected_output_cols,
+                session=dataset._session,
+                dependencies=self._deps,
+            )
+        else:
+            return self._invoke_estimator_func("score_samples", dataset)
 
     @metaestimators.available_if(_final_step_has("predict_proba"))  # type: ignore[misc]
     @telemetry.send_api_usage_telemetry(
@@ -427,8 +688,32 @@ class Pipeline(base.BaseTransformer):
 
         Returns:
             Output dataset.
+
+        Raises:
+            ValueError: An sklearn object has not been fit before calling this function
         """
-        return self._invoke_estimator_func("predict_proba", dataset)
+
+        if os.environ.get(IN_ML_RUNTIME_ENV_VAR):
+            if self._sklearn_object is None:
+                raise ValueError("Model must be fit before inference.")
+            expected_output_cols = self._get_output_column_names("predict_proba")
+
+            handler = ModelTransformerBuilder.build(
+                dataset=dataset,
+                estimator=self._sklearn_object,
+                class_name="Pipeline",
+                subproject="",
+                autogenerated=False,
+            )
+            return handler.batch_inference(
+                inference_method="predict_proba",
+                input_cols=self.input_cols if self.input_cols else self._infer_input_cols(dataset),
+                expected_output_cols=expected_output_cols,
+                session=dataset._session,
+                dependencies=self._deps,
+            )
+        else:
+            return self._invoke_estimator_func("predict_proba", dataset)
 
     @metaestimators.available_if(_final_step_has("predict_log_proba"))  # type: ignore[misc]
     @telemetry.send_api_usage_telemetry(
@@ -447,8 +732,31 @@ class Pipeline(base.BaseTransformer):
 
         Returns:
             Output dataset.
+
+        Raises:
+            ValueError: An sklearn object has not been fit before calling this function
         """
-        return self._invoke_estimator_func("predict_log_proba", dataset)
+        if os.environ.get(IN_ML_RUNTIME_ENV_VAR):
+            if self._sklearn_object is None:
+                raise ValueError("Model must be fit before inference.")
+
+            expected_output_cols = self._get_output_column_names("predict_log_proba")
+            handler = ModelTransformerBuilder.build(
+                dataset=dataset,
+                estimator=self._sklearn_object,
+                class_name="Pipeline",
+                subproject="",
+                autogenerated=False,
+            )
+            return handler.batch_inference(
+                inference_method="predict_log_proba",
+                input_cols=self.input_cols if self.input_cols else self._infer_input_cols(dataset),
+                expected_output_cols=expected_output_cols,
+                session=dataset._session,
+                dependencies=self._deps,
+            )
+        else:
+            return self._invoke_estimator_func("predict_log_proba", dataset)
 
     @metaestimators.available_if(_final_step_has("score"))  # type: ignore[misc]
     @telemetry.send_api_usage_telemetry(
@@ -464,8 +772,30 @@ class Pipeline(base.BaseTransformer):
 
         Returns:
             Output dataset.
+
+        Raises:
+            ValueError: An sklearn object has not been fit before calling this function
         """
-        return self._invoke_estimator_func("score", dataset)
+
+        if os.environ.get(IN_ML_RUNTIME_ENV_VAR):
+            if self._sklearn_object is None:
+                raise ValueError("Model must be fit before scoreing.")
+            handler = ModelTransformerBuilder.build(
+                dataset=dataset,
+                estimator=self._sklearn_object,
+                class_name="Pipeline",
+                subproject="",
+                autogenerated=False,
+            )
+            return handler.score(
+                input_cols=self._infer_input_cols(),
+                label_cols=self._get_label_cols(),
+                session=dataset._session,
+                dependencies=self._deps,
+                score_sproc_imports=[],
+            )
+        else:
+            return self._invoke_estimator_func("score", dataset)
 
     def _invoke_estimator_func(
         self, func_name: str, dataset: Union[snowpark.DataFrame, pd.DataFrame]
@@ -495,15 +825,6 @@ class Pipeline(base.BaseTransformer):
         res: snowpark.DataFrame = getattr(estimator[1], func_name)(transformed_dataset)
         return res
 
-    def _create_unfitted_sklearn_object(self) -> pipeline.Pipeline:
-        sksteps = []
-        for step in self.steps:
-            if isinstance(step[1], base.BaseTransformer):
-                sksteps.append(tuple([step[0], _utils.to_native_format(step[1])]))
-            else:
-                sksteps.append(tuple([step[0], step[1]]))
-        return pipeline.Pipeline(steps=sksteps)
-
     def _construct_fitted_column_transformer_object(
         self,
         step_name_in_pipeline: str,
@@ -562,15 +883,134 @@ class Pipeline(base.BaseTransformer):
         ct._name_to_fitted_passthrough = {step_name_in_ct: ft}
         return ct
 
+    def _fit_ml_runtime(self, dataset: snowpark.DataFrame) -> None:
+        """Train the pipeline in the ML Runtime.
+
+        Args:
+            dataset: The training Snowpark dataframe
+
+        Raises:
+            ModuleNotFoundError: The ML Runtime Client is not installed.
+        """
+        try:
+            from snowflake.ml.runtime import MLRuntimeClient
+        except ModuleNotFoundError as e:
+            # The snowflake.ml.runtime module should always be present when
+            # the env var IN_SPCS_ML_RUNTIME is present.
+            raise ModuleNotFoundError("ML Runtime Python Client is not installed.") from e
+
+        client = MLRuntimeClient()
+        ml_runtime_compatible_pipeline = self._create_unfitted_sklearn_object()
+
+        label_cols = self._get_label_cols()
+        all_df_cols = dataset.columns
+        input_cols = [col for col in all_df_cols if col not in label_cols]
+
+        trained_pipeline = client.train(
+            estimator=ml_runtime_compatible_pipeline,
+            dataset=dataset,
+            input_cols=input_cols,
+            label_cols=label_cols,
+            sample_weight_col=self.sample_weight_col,
+        )
+
+        self._sklearn_object = trained_pipeline
+
+    def _get_label_cols(self) -> List[str]:
+        """Util function to get the label columns from the pipeline.
+        The label column is only present in the estimator
+
+        Returns:
+            List of label columns, or empty list if no label cols.
+        """
+        label_cols = []
+        estimator = self._get_estimator()
+        if estimator is not None:
+            label_cols = estimator[1].get_label_cols()
+
+        return label_cols
+
+    def _can_be_trained_in_ml_runtime(self, dataset: Union[snowpark.DataFrame, pd.DataFrame]) -> bool:
+        """A utility function to determine if the pipeline cam be pushed down to the ML Runtime for training.
+        Currently, this is true if:
+        - The training dataset is a snowpark dataframe,
+        - The IN_SPCS_ML_RUNTIME environment is present and
+        - The pipeline can be converted to an sklearn pipeline.
+
+        Args:
+            dataset: The training dataset
+
+        Returns:
+            True if the dataset can be fit in the ml runtime, else false.
+
+        """
+        if not isinstance(dataset, snowpark.DataFrame):
+            return False
+
+        if not os.environ.get(IN_ML_RUNTIME_ENV_VAR):
+            return False
+
+        return self._is_convertible_to_sklearn
+
+    @staticmethod
+    def _wrap_transformer_in_column_transformer(
+        transformer_name: str, transformer: base.BaseTransformer
+    ) -> ColumnTransformer:
+        """A helper function to convert a transformer object to an sklearn object and wrap in an sklearn
+        ColumnTransformer.
+
+        Args:
+            transformer_name: Name of the transformer to be wrapped.
+            transformer: The transformer object to be wrapped.
+
+        Returns:
+            A column transformer sklearn object that uses the input columns from the initial snowpark ml transformer.
+        """
+        column_transformer = ColumnTransformer(
+            transformers=[(transformer_name, Pipeline._get_native_object(transformer), transformer.get_input_cols())],
+            remainder="passthrough",
+        )
+        return column_transformer
+
+    def _create_unfitted_sklearn_object(self) -> pipeline.Pipeline:
+        """Create a sklearn pipeline from the current snowml pipeline.
+        ColumnTransformers are used to wrap transformers as their input columns can be specified
+        as a subset of the pipeline's input columns.
+
+        Returns:
+            An unfit pipeline that can be fit using the ML runtime client.
+        """
+
+        sklearn_pipeline_steps = []
+
+        first_step_name, first_step_object = self.steps[0]
+
+        # Only the first step can have the input_cols field not None/empty.
+        if first_step_object.get_input_cols():
+            first_step_column_transformer = Pipeline._wrap_transformer_in_column_transformer(
+                first_step_name, first_step_object
+            )
+            first_step_skl = (first_step_name, first_step_column_transformer)
+        else:
+            first_step_skl = (first_step_name, Pipeline._get_native_object(first_step_object))
+
+        sklearn_pipeline_steps.append(first_step_skl)
+
+        for step_name, step_object in self.steps[1:]:
+            skl_step = (step_name, Pipeline._get_native_object(step_object))
+            sklearn_pipeline_steps.append(skl_step)
+
+        return pipeline.Pipeline(sklearn_pipeline_steps)
+
     def _create_sklearn_object(self) -> pipeline.Pipeline:
         if not self._is_fitted:
             return self._create_unfitted_sklearn_object()
 
-        if not self.
+        if not self._modifies_label_or_sample_weight:
             raise exceptions.SnowflakeMLException(
                 error_code=error_codes.METHOD_NOT_ALLOWED,
                 original_exception=ValueError(
-                    "The pipeline can't be converted to SKLearn equivalent because it processing label or "
+                    "The pipeline can't be converted to SKLearn equivalent because it modifies processing label or "
                     "sample_weight columns as part of pipeline preprocessing steps which is not allowed in SKLearn."
                 ),
             )
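The rebuilt `_create_unfitted_sklearn_object` above no longer converts steps one-for-one; a first step that declares `input_cols` is wrapped in a `ColumnTransformer` with `remainder="passthrough"` so that a step consuming only a subset of columns still maps onto sklearn. Roughly what the converter emits, sketched directly with native objects (step and column names are illustrative):

```python
# Hedged sketch of the converter's output for a pipeline whose first
# step had input_cols=["F1", "F2"]; names are made up for illustration.
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier

skl_pipe = Pipeline(steps=[
    # remainder="passthrough" forwards the untouched columns (e.g. the label).
    ("scale", ColumnTransformer(
        transformers=[("scale", StandardScaler(), ["F1", "F2"])],
        remainder="passthrough",
    )),
    # Later steps may not declare input_cols, so they convert directly.
    ("clf", XGBClassifier()),
])
```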
@@ -631,3 +1071,65 @@ class Pipeline(base.BaseTransformer):
             original_exception=RuntimeError("Estimator not fitted before accessing property model_signatures!"),
         )
         return self._model_signature_dict
+
+    @staticmethod
+    def _get_native_object(estimator: base.BaseEstimator) -> object:
+        """A helper function to get the native(sklearn, xgboost, or lightgbm)
+        object from a snowpark ml estimator.
+        TODO - better type hinting - is there a common base class for all xgb/lgbm estimators?
+
+        Args:
+            estimator: the estimator from which to derive the native object.
+
+        Returns:
+            a native estimator object
+
+        Raises:
+            ValueError: The estimator is not an sklearn, xgboost, or lightgbm estimator.
+        """
+        methods = ["to_sklearn", "to_xgboost", "to_lightgbm"]
+        for method_name in methods:
+            if hasattr(estimator, method_name):
+                try:
+                    result = getattr(estimator, method_name)()
+                    return result
+                except exceptions.SnowflakeMLException:
+                    pass  # Do nothing and continue to the next method
+        raise ValueError("The estimator must be an sklearn, xgboost, or lightgbm estimator.")
+
+    def to_sklearn(self) -> pipeline.Pipeline:
+        """Returns an sklearn Pipeline representing the object, if possible.
+
+        Returns:
+            previously fit sklearn Pipeline if present, else an unfit pipeline
+
+        Raises:
+            ValueError: The pipeline cannot be represented as an sklearn pipeline.
+        """
+        if self._is_fitted:
+            if self._sklearn_object is not None:
+                return self._sklearn_object
+            else:
+                return self._create_sklearn_object()
+        else:
+            if self._is_convertible_to_sklearn:
+                return self._create_unfitted_sklearn_object()
+            else:
+                raise ValueError("This pipeline can not be converted to an sklearn pipeline.")
+
+    def _send_pipeline_configuration_telemetry(self) -> None:
+        """Track information about the pipeline setup. Currently, we want to track:
+        - Whether the pipeline is converible to an sklearn pipeline
+        - Whether the pipeline is being used in the SPCS ml runtime.
+        """
+
+        telemetry_data = {
+            "pipeline_is_convertible_to_sklearn": self._is_convertible_to_sklearn,
+            "in_spcs_ml_runtime": bool(os.environ.get(IN_ML_RUNTIME_ENV_VAR)),
+        }
+        telemetry.send_custom_usage(
+            project=_PROJECT,
+            subproject=_SUBPROJECT,
+            telemetry_type=telemetry.TelemetryField.TYPE_SNOWML_PIPELINE_USAGE.value,
+            data=telemetry_data,
+        )