snowflake-ml-python 1.4.1__py3-none-any.whl → 1.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- snowflake/ml/_internal/env_utils.py +72 -31
- snowflake/ml/_internal/exceptions/dataset_error_messages.py +5 -0
- snowflake/ml/_internal/exceptions/dataset_errors.py +24 -0
- snowflake/ml/_internal/exceptions/error_codes.py +3 -0
- snowflake/ml/_internal/lineage/data_source.py +10 -0
- snowflake/ml/_internal/lineage/lineage_utils.py +95 -0
- snowflake/ml/_internal/telemetry.py +1 -0
- snowflake/ml/_internal/utils/identifier.py +1 -1
- snowflake/ml/_internal/utils/sql_identifier.py +14 -1
- snowflake/ml/dataset/__init__.py +11 -0
- snowflake/ml/dataset/dataset.py +455 -129
- snowflake/ml/dataset/dataset_factory.py +53 -0
- snowflake/ml/dataset/dataset_metadata.py +103 -0
- snowflake/ml/dataset/dataset_reader.py +199 -0
- snowflake/ml/feature_store/__init__.py +6 -0
- snowflake/ml/feature_store/access_manager.py +279 -0
- snowflake/ml/feature_store/feature_store.py +544 -358
- snowflake/ml/feature_store/feature_view.py +55 -16
- snowflake/ml/fileset/embedded_stage_fs.py +149 -0
- snowflake/ml/fileset/sfcfs.py +0 -4
- snowflake/ml/fileset/snowfs.py +160 -0
- snowflake/ml/fileset/stage_fs.py +25 -10
- snowflake/ml/model/__init__.py +2 -2
- snowflake/ml/model/_api.py +16 -1
- snowflake/ml/model/_client/model/model_impl.py +65 -31
- snowflake/ml/model/_client/model/model_version_impl.py +159 -2
- snowflake/ml/model/_client/ops/metadata_ops.py +27 -4
- snowflake/ml/model/_client/ops/model_ops.py +268 -83
- snowflake/ml/model/_client/sql/_base.py +34 -0
- snowflake/ml/model/_client/sql/model.py +42 -47
- snowflake/ml/model/_client/sql/model_version.py +164 -39
- snowflake/ml/model/_client/sql/stage.py +6 -32
- snowflake/ml/model/_client/sql/tag.py +32 -56
- snowflake/ml/model/_deploy_client/image_builds/server_image_builder.py +5 -1
- snowflake/ml/model/_deploy_client/image_builds/templates/dockerfile_template +1 -0
- snowflake/ml/model/_deploy_client/snowservice/deploy.py +2 -0
- snowflake/ml/model/_deploy_client/utils/constants.py +0 -5
- snowflake/ml/model/_deploy_client/utils/snowservice_client.py +21 -50
- snowflake/ml/model/_model_composer/model_composer.py +22 -1
- snowflake/ml/model/_model_composer/model_manifest/model_manifest.py +22 -0
- snowflake/ml/model/_model_composer/model_manifest/model_manifest_schema.py +11 -0
- snowflake/ml/model/_packager/model_env/model_env.py +41 -0
- snowflake/ml/model/_packager/model_handlers/mlflow.py +2 -1
- snowflake/ml/model/_packager/model_meta/model_meta.py +1 -5
- snowflake/ml/model/_packager/model_packager.py +0 -3
- snowflake/ml/modeling/_internal/local_implementations/pandas_trainer.py +55 -3
- snowflake/ml/modeling/_internal/ml_runtime_implementations/ml_runtime_handlers.py +34 -18
- snowflake/ml/modeling/_internal/model_trainer.py +7 -0
- snowflake/ml/modeling/_internal/model_trainer_builder.py +42 -9
- snowflake/ml/modeling/_internal/snowpark_implementations/distributed_hpo_trainer.py +50 -21
- snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_handlers.py +24 -2
- snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_trainer.py +340 -17
- snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +51 -52
- snowflake/ml/modeling/cluster/affinity_propagation.py +51 -52
- snowflake/ml/modeling/cluster/agglomerative_clustering.py +51 -52
- snowflake/ml/modeling/cluster/birch.py +53 -52
- snowflake/ml/modeling/cluster/bisecting_k_means.py +53 -52
- snowflake/ml/modeling/cluster/dbscan.py +51 -52
- snowflake/ml/modeling/cluster/feature_agglomeration.py +53 -52
- snowflake/ml/modeling/cluster/k_means.py +53 -52
- snowflake/ml/modeling/cluster/mean_shift.py +51 -52
- snowflake/ml/modeling/cluster/mini_batch_k_means.py +53 -52
- snowflake/ml/modeling/cluster/optics.py +51 -52
- snowflake/ml/modeling/cluster/spectral_biclustering.py +51 -52
- snowflake/ml/modeling/cluster/spectral_clustering.py +51 -52
- snowflake/ml/modeling/cluster/spectral_coclustering.py +51 -52
- snowflake/ml/modeling/compose/column_transformer.py +53 -52
- snowflake/ml/modeling/compose/transformed_target_regressor.py +51 -52
- snowflake/ml/modeling/covariance/elliptic_envelope.py +51 -52
- snowflake/ml/modeling/covariance/empirical_covariance.py +51 -52
- snowflake/ml/modeling/covariance/graphical_lasso.py +51 -52
- snowflake/ml/modeling/covariance/graphical_lasso_cv.py +51 -52
- snowflake/ml/modeling/covariance/ledoit_wolf.py +51 -52
- snowflake/ml/modeling/covariance/min_cov_det.py +51 -52
- snowflake/ml/modeling/covariance/oas.py +51 -52
- snowflake/ml/modeling/covariance/shrunk_covariance.py +51 -52
- snowflake/ml/modeling/decomposition/dictionary_learning.py +53 -52
- snowflake/ml/modeling/decomposition/factor_analysis.py +53 -52
- snowflake/ml/modeling/decomposition/fast_ica.py +53 -52
- snowflake/ml/modeling/decomposition/incremental_pca.py +53 -52
- snowflake/ml/modeling/decomposition/kernel_pca.py +53 -52
- snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +53 -52
- snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +53 -52
- snowflake/ml/modeling/decomposition/pca.py +53 -52
- snowflake/ml/modeling/decomposition/sparse_pca.py +53 -52
- snowflake/ml/modeling/decomposition/truncated_svd.py +53 -52
- snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +53 -52
- snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +51 -52
- snowflake/ml/modeling/ensemble/ada_boost_classifier.py +51 -52
- snowflake/ml/modeling/ensemble/ada_boost_regressor.py +51 -52
- snowflake/ml/modeling/ensemble/bagging_classifier.py +51 -52
- snowflake/ml/modeling/ensemble/bagging_regressor.py +51 -52
- snowflake/ml/modeling/ensemble/extra_trees_classifier.py +51 -52
- snowflake/ml/modeling/ensemble/extra_trees_regressor.py +51 -52
- snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +51 -52
- snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +51 -52
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +51 -52
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +51 -52
- snowflake/ml/modeling/ensemble/isolation_forest.py +51 -52
- snowflake/ml/modeling/ensemble/random_forest_classifier.py +51 -52
- snowflake/ml/modeling/ensemble/random_forest_regressor.py +51 -52
- snowflake/ml/modeling/ensemble/stacking_regressor.py +53 -52
- snowflake/ml/modeling/ensemble/voting_classifier.py +53 -52
- snowflake/ml/modeling/ensemble/voting_regressor.py +53 -52
- snowflake/ml/modeling/feature_selection/generic_univariate_select.py +53 -52
- snowflake/ml/modeling/feature_selection/select_fdr.py +53 -52
- snowflake/ml/modeling/feature_selection/select_fpr.py +53 -52
- snowflake/ml/modeling/feature_selection/select_fwe.py +53 -52
- snowflake/ml/modeling/feature_selection/select_k_best.py +53 -52
- snowflake/ml/modeling/feature_selection/select_percentile.py +53 -52
- snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +53 -52
- snowflake/ml/modeling/feature_selection/variance_threshold.py +53 -52
- snowflake/ml/modeling/framework/base.py +64 -36
- snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +51 -52
- snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +51 -52
- snowflake/ml/modeling/impute/iterative_imputer.py +53 -52
- snowflake/ml/modeling/impute/knn_imputer.py +53 -52
- snowflake/ml/modeling/impute/missing_indicator.py +53 -52
- snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +53 -52
- snowflake/ml/modeling/kernel_approximation/nystroem.py +53 -52
- snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +53 -52
- snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +53 -52
- snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +53 -52
- snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +51 -52
- snowflake/ml/modeling/lightgbm/lgbm_classifier.py +51 -52
- snowflake/ml/modeling/lightgbm/lgbm_regressor.py +51 -52
- snowflake/ml/modeling/linear_model/ard_regression.py +51 -52
- snowflake/ml/modeling/linear_model/bayesian_ridge.py +51 -52
- snowflake/ml/modeling/linear_model/elastic_net.py +51 -52
- snowflake/ml/modeling/linear_model/elastic_net_cv.py +51 -52
- snowflake/ml/modeling/linear_model/gamma_regressor.py +51 -52
- snowflake/ml/modeling/linear_model/huber_regressor.py +51 -52
- snowflake/ml/modeling/linear_model/lars.py +51 -52
- snowflake/ml/modeling/linear_model/lars_cv.py +51 -52
- snowflake/ml/modeling/linear_model/lasso.py +51 -52
- snowflake/ml/modeling/linear_model/lasso_cv.py +51 -52
- snowflake/ml/modeling/linear_model/lasso_lars.py +51 -52
- snowflake/ml/modeling/linear_model/lasso_lars_cv.py +51 -52
- snowflake/ml/modeling/linear_model/lasso_lars_ic.py +51 -52
- snowflake/ml/modeling/linear_model/linear_regression.py +51 -52
- snowflake/ml/modeling/linear_model/logistic_regression.py +51 -52
- snowflake/ml/modeling/linear_model/logistic_regression_cv.py +51 -52
- snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +51 -52
- snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +51 -52
- snowflake/ml/modeling/linear_model/multi_task_lasso.py +51 -52
- snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +51 -52
- snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +51 -52
- snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +51 -52
- snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +51 -52
- snowflake/ml/modeling/linear_model/perceptron.py +51 -52
- snowflake/ml/modeling/linear_model/poisson_regressor.py +51 -52
- snowflake/ml/modeling/linear_model/ransac_regressor.py +51 -52
- snowflake/ml/modeling/linear_model/ridge.py +51 -52
- snowflake/ml/modeling/linear_model/ridge_classifier.py +51 -52
- snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +51 -52
- snowflake/ml/modeling/linear_model/ridge_cv.py +51 -52
- snowflake/ml/modeling/linear_model/sgd_classifier.py +51 -52
- snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +51 -52
- snowflake/ml/modeling/linear_model/sgd_regressor.py +51 -52
- snowflake/ml/modeling/linear_model/theil_sen_regressor.py +51 -52
- snowflake/ml/modeling/linear_model/tweedie_regressor.py +51 -52
- snowflake/ml/modeling/manifold/isomap.py +53 -52
- snowflake/ml/modeling/manifold/mds.py +53 -52
- snowflake/ml/modeling/manifold/spectral_embedding.py +53 -52
- snowflake/ml/modeling/manifold/tsne.py +53 -52
- snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +51 -52
- snowflake/ml/modeling/mixture/gaussian_mixture.py +51 -52
- snowflake/ml/modeling/model_selection/grid_search_cv.py +21 -23
- snowflake/ml/modeling/model_selection/randomized_search_cv.py +38 -20
- snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +51 -52
- snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +51 -52
- snowflake/ml/modeling/multiclass/output_code_classifier.py +51 -52
- snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +51 -52
- snowflake/ml/modeling/naive_bayes/categorical_nb.py +51 -52
- snowflake/ml/modeling/naive_bayes/complement_nb.py +51 -52
- snowflake/ml/modeling/naive_bayes/gaussian_nb.py +51 -52
- snowflake/ml/modeling/naive_bayes/multinomial_nb.py +51 -52
- snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +51 -52
- snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +51 -52
- snowflake/ml/modeling/neighbors/kernel_density.py +51 -52
- snowflake/ml/modeling/neighbors/local_outlier_factor.py +51 -52
- snowflake/ml/modeling/neighbors/nearest_centroid.py +51 -52
- snowflake/ml/modeling/neighbors/nearest_neighbors.py +51 -52
- snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +53 -52
- snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +51 -52
- snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +51 -52
- snowflake/ml/modeling/neural_network/bernoulli_rbm.py +53 -52
- snowflake/ml/modeling/neural_network/mlp_classifier.py +51 -52
- snowflake/ml/modeling/neural_network/mlp_regressor.py +51 -52
- snowflake/ml/modeling/pipeline/pipeline.py +538 -36
- snowflake/ml/modeling/preprocessing/one_hot_encoder.py +12 -0
- snowflake/ml/modeling/preprocessing/polynomial_features.py +53 -52
- snowflake/ml/modeling/semi_supervised/label_propagation.py +51 -52
- snowflake/ml/modeling/semi_supervised/label_spreading.py +51 -52
- snowflake/ml/modeling/svm/linear_svc.py +51 -52
- snowflake/ml/modeling/svm/linear_svr.py +51 -52
- snowflake/ml/modeling/svm/nu_svc.py +51 -52
- snowflake/ml/modeling/svm/nu_svr.py +51 -52
- snowflake/ml/modeling/svm/svc.py +51 -52
- snowflake/ml/modeling/svm/svr.py +51 -52
- snowflake/ml/modeling/tree/decision_tree_classifier.py +51 -52
- snowflake/ml/modeling/tree/decision_tree_regressor.py +51 -52
- snowflake/ml/modeling/tree/extra_tree_classifier.py +51 -52
- snowflake/ml/modeling/tree/extra_tree_regressor.py +51 -52
- snowflake/ml/modeling/xgboost/xgb_classifier.py +51 -52
- snowflake/ml/modeling/xgboost/xgb_regressor.py +51 -52
- snowflake/ml/modeling/xgboost/xgbrf_classifier.py +51 -52
- snowflake/ml/modeling/xgboost/xgbrf_regressor.py +51 -52
- snowflake/ml/registry/_manager/model_manager.py +36 -7
- snowflake/ml/registry/model_registry.py +3 -149
- snowflake/ml/version.py +1 -1
- {snowflake_ml_python-1.4.1.dist-info → snowflake_ml_python-1.5.1.dist-info}/METADATA +112 -7
- {snowflake_ml_python-1.4.1.dist-info → snowflake_ml_python-1.5.1.dist-info}/RECORD +216 -206
- snowflake/ml/registry/_artifact_manager.py +0 -156
- snowflake/ml/registry/artifact.py +0 -46
- {snowflake_ml_python-1.4.1.dist-info → snowflake_ml_python-1.5.1.dist-info}/LICENSE.txt +0 -0
- {snowflake_ml_python-1.4.1.dist-info → snowflake_ml_python-1.5.1.dist-info}/WHEEL +0 -0
- {snowflake_ml_python-1.4.1.dist-info → snowflake_ml_python-1.5.1.dist-info}/top_level.txt +0 -0
--- snowflake/ml/modeling/pipeline/pipeline.py (1.4.1)
+++ snowflake/ml/modeling/pipeline/pipeline.py (1.5.1)
@@ -1,7 +1,12 @@
 #!/usr/bin/env python3
+import inspect
+import os
+import posixpath
+import tempfile
 from itertools import chain
 from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union
 
+import cloudpickle as cp
 import numpy as np
 import pandas as pd
 from sklearn import __version__ as skversion, pipeline
@@ -10,14 +15,20 @@ from sklearn.preprocessing import FunctionTransformer
 from sklearn.utils import metaestimators
 
 from snowflake import snowpark
-from snowflake.ml._internal import telemetry
+from snowflake.ml._internal import file_utils, telemetry
 from snowflake.ml._internal.exceptions import error_codes, exceptions
-from snowflake.ml._internal.utils import snowpark_dataframe_utils
+from snowflake.ml._internal.utils import snowpark_dataframe_utils, temp_file_utils
 from snowflake.ml.model.model_signature import ModelSignature, _infer_signature
+from snowflake.ml.modeling._internal.model_transformer_builder import (
+    ModelTransformerBuilder,
+)
 from snowflake.ml.modeling.framework import _utils, base
+from snowflake.snowpark import Session, functions as F
+from snowflake.snowpark._internal import utils as snowpark_utils
 
 _PROJECT = "ModelDevelopment"
 _SUBPROJECT = "Framework"
+IN_ML_RUNTIME_ENV_VAR = "IN_SPCS_ML_RUNTIME"
 
 
 def _final_step_has(attr: str) -> Callable[..., bool]:
@@ -104,7 +115,7 @@ class Pipeline(base.BaseTransformer):
         self._feature_names_in: List[np.ndarray[Any, np.dtype[Any]]] = []
         self._n_features_in: List[int] = []
         self._transformers_to_input_indices: Dict[str, List[int]] = {}
-        self.
+        self._modifies_label_or_sample_weight = True
 
         self._model_signature_dict: Optional[Dict[str, ModelSignature]] = None
 
@@ -113,6 +124,11 @@ class Pipeline(base.BaseTransformer):
             if isinstance(obj, base.BaseTransformer):
                 deps = deps | set(obj._get_dependencies())
         self._deps = list(deps)
+        self._sklearn_object = None
+        self.label_cols = self._get_label_cols()
+        self._is_convertible_to_sklearn = self._is_convertible_to_sklearn_object()
+
+        self._send_pipeline_configuration_telemetry()
 
     @staticmethod
     def _is_estimator(obj: object) -> bool:
@@ -147,6 +163,33 @@ class Pipeline(base.BaseTransformer):
         self._n_features_in = []
         self._transformers_to_input_indices = {}
 
+    def _is_convertible_to_sklearn_object(self) -> bool:
+        """Checks if the pipeline can be converted to a native sklearn pipeline.
+        - We can not create an sklearn pipeline if its label or sample weight column are
+          modified in the pipeline.
+        - We can not create an sklearn pipeline if any of its steps cannot be converted to an sklearn pipeline
+        - We can not create an sklearn pipeline if input columns are specified in any step other than
+          the first step
+
+        Returns:
+            True if the pipeline can be converted to a native sklearn pipeline, else false.
+        """
+        if self._is_pipeline_modifying_label_or_sample_weight():
+            return False
+
+        # check that nested pipelines can be converted to sklearn
+        for _, base_estimator in self.steps:
+            if hasattr(base_estimator, "_is_convertible_to_sklearn_object"):
+                if not base_estimator._is_convertible_to_sklearn_object():
+                    return False
+
+        # check that no column after the first column has 'input columns' set.
+        for _, base_estimator in self.steps[1:]:
+            if base_estimator.get_input_cols():
+                # We only want Falsy values - None and []
+                return False
+        return True
+
     def _is_pipeline_modifying_label_or_sample_weight(self) -> bool:
         """
         Checks if pipeline is modifying label or sample_weight columns.
@@ -188,7 +231,7 @@ class Pipeline(base.BaseTransformer):
         return [c for c in columns if c not in target_cols]
 
     def _append_step_feature_consumption_info(self, step_name: str, all_cols: List[str], input_cols: List[str]) -> None:
-        if self.
+        if self._modifies_label_or_sample_weight:
             all_cols = self._get_sanitized_list_of_columns(all_cols)
             self._feature_names_in.append(np.asarray(all_cols, dtype=object))
             self._n_features_in.append(len(all_cols))
@@ -208,33 +251,173 @@ class Pipeline(base.BaseTransformer):
         self, dataset: Union[snowpark.DataFrame, pd.DataFrame]
     ) -> Union[snowpark.DataFrame, pd.DataFrame]:
         self._reset()
-        self.
+        self._modifies_label_or_sample_weight = not self._is_pipeline_modifying_label_or_sample_weight()
         transformed_dataset = dataset
         for name, trans in self._get_transformers():
             self._append_step_feature_consumption_info(
                 step_name=name, all_cols=transformed_dataset.columns[:], input_cols=trans.get_input_cols()
             )
-
-
-            else:
-                trans.fit(transformed_dataset)
-                transformed_dataset = trans.transform(transformed_dataset)
+            trans.fit(transformed_dataset)
+            transformed_dataset = trans.transform(transformed_dataset)
 
         return transformed_dataset
 
+    def _upload_model_to_stage(self, stage_name: str, estimator: object, session: Session) -> Tuple[str, str]:
+        """
+        Util method to pickle and upload the model to a temp Snowflake stage.
+
+        Args:
+            stage_name: Stage name to save model.
+            estimator: the pipeline estimator itself
+            session: Session object
+
+        Returns:
+            a tuple containing stage file paths for pickled input model for training and location to store trained
+            models(response from training sproc).
+        """
+        # Create a temp file and dump the transform to that file.
+        local_transform_file_name = temp_file_utils.get_temp_file_path()
+        with open(local_transform_file_name, mode="w+b") as local_transform_file:
+            cp.dump(estimator, local_transform_file)
+
+        # Use posixpath to construct stage paths
+        stage_transform_file_name = posixpath.join(stage_name, os.path.basename(local_transform_file_name))
+        stage_result_file_name = posixpath.join(stage_name, os.path.basename(local_transform_file_name))
+
+        # Put locally serialized transform on stage.
+        session.file.put(
+            local_transform_file_name,
+            stage_transform_file_name,
+            auto_compress=False,
+            overwrite=True,
+        )
+
+        temp_file_utils.cleanup_temp_files([local_transform_file_name])
+        return (stage_transform_file_name, stage_result_file_name)
+
+    def _fit_snowpark_dataframe_within_one_sproc(self, session: Session, dataset: snowpark.DataFrame) -> None:
+        # Extract queries that generated the dataframe. We will need to pass it to score procedure.
+        sql_queries = dataset.queries["queries"]
+
+        # Zip the current snowml package
+        with tempfile.TemporaryDirectory() as tmpdir:
+            snowml_zip_module_filename = os.path.join(tmpdir, "snowflake-ml-python.zip")
+            file_utils.zip_python_package(snowml_zip_module_filename, "snowflake.ml")
+            imports = [snowml_zip_module_filename]
+
+            sproc_name = snowpark_utils.random_name_for_temp_object(snowpark_utils.TempObjectType.PROCEDURE)
+            required_deps = self._deps
+            sproc_statement_params = telemetry.get_function_usage_statement_params(
+                project=_PROJECT,
+                subproject="PIPELINE",
+                function_name=telemetry.get_statement_params_full_func_name(
+                    inspect.currentframe(), self.__class__.__name__
+                ),
+                api_calls=[F.sproc],
+            )
+            transform_stage_name = snowpark_utils.random_name_for_temp_object(snowpark_utils.TempObjectType.STAGE)
+            stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {transform_stage_name};"
+            session.sql(stage_creation_query).collect()
+            (stage_estimator_file_name, stage_result_file_name) = self._upload_model_to_stage(
+                transform_stage_name, self, session
+            )
+
+            def pipeline_within_one_sproc(
+                session: Session,
+                sql_queries: List[str],
+                stage_estimator_file_name: str,
+                stage_result_file_name: str,
+                sproc_statement_params: Dict[str, str],
+            ) -> str:
+                import os
+
+                import cloudpickle as cp
+                import pandas as pd
+
+                for query in sql_queries[:-1]:
+                    _ = session.sql(query).collect(statement_params=sproc_statement_params)
+                sp_df = session.sql(sql_queries[-1])
+                df: pd.DataFrame = sp_df.to_pandas(statement_params=sproc_statement_params)
+                df.columns = sp_df.columns
+
+                local_estimator_file_name = temp_file_utils.get_temp_file_path()
+
+                session.file.get(stage_estimator_file_name, local_estimator_file_name)
+
+                local_estimator_file_path = os.path.join(
+                    local_estimator_file_name, os.listdir(local_estimator_file_name)[0]
+                )
+                with open(local_estimator_file_path, mode="r+b") as local_estimator_file_obj:
+                    estimator = cp.load(local_estimator_file_obj)
+
+                estimator.fit(df)
+
+                local_result_file_name = temp_file_utils.get_temp_file_path()
+
+                with open(local_result_file_name, mode="w+b") as local_result_file_obj:
+                    cp.dump(estimator, local_result_file_obj)
+
+                session.file.put(
+                    local_result_file_name,
+                    stage_result_file_name,
+                    auto_compress=False,
+                    overwrite=True,
+                    statement_params=sproc_statement_params,
+                )
+
+                return str(os.path.basename(local_result_file_name))
+
+            session.sproc.register(
+                func=pipeline_within_one_sproc,
+                is_permanent=False,
+                name=sproc_name,
+                packages=required_deps,  # type: ignore[arg-type]
+                replace=True,
+                session=session,
+                anonymous=True,
+                imports=imports,  # type: ignore[arg-type]
+                statement_params=sproc_statement_params,
+            )
+
+            sproc_export_file_name: str = pipeline_within_one_sproc(
+                session,
+                sql_queries,
+                stage_estimator_file_name,
+                stage_result_file_name,
+                sproc_statement_params,
+            )
+
+            local_result_file_name = temp_file_utils.get_temp_file_path()
+            session.file.get(
+                posixpath.join(stage_estimator_file_name, sproc_export_file_name),
+                local_result_file_name,
+                statement_params=sproc_statement_params,
+            )
+
+            with open(os.path.join(local_result_file_name, sproc_export_file_name), mode="r+b") as result_file_obj:
+                fit_estimator = cp.load(result_file_obj)
+
+            temp_file_utils.cleanup_temp_files([local_result_file_name])
+            for key, val in vars(fit_estimator).items():
+                setattr(self, key, val)
+
     @telemetry.send_api_usage_telemetry(
         project=_PROJECT,
         subproject=_SUBPROJECT,
     )
-    def fit(self, dataset: Union[snowpark.DataFrame, pd.DataFrame]) -> "Pipeline":
+    def fit(self, dataset: Union[snowpark.DataFrame, pd.DataFrame], squash: Optional[bool] = False) -> "Pipeline":
         """
         Fit the entire pipeline using the dataset.
 
         Args:
             dataset: Input dataset.
+            squash: Run the whole pipeline within a stored procedure
 
         Returns:
             Fitted pipeline.
+
+        Raises:
+            ValueError: A pipeline incompatible with sklearn is used on MLRS
         """
 
         self._validate_steps()
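The two helpers added above implement the new fit-in-one-sproc path: the unfitted pipeline is cloudpickled to a temporary stage, an anonymous stored procedure materializes the dataframe's queries, fits the estimator, and puts the fitted pickle back, and the caller copies the fitted attributes onto `self`. A minimal sketch of the upload half, assuming an open Snowpark `session` and a pipeline object `pipe` (both placeholders, as is the stage name):

```python
# Hedged sketch of the pickle-to-stage round trip; `pipe`, `session`,
# and MY_TEMP_STAGE are assumptions, not names from the diff.
import os
import posixpath
import tempfile

import cloudpickle as cp

local_path = os.path.join(tempfile.mkdtemp(), "pipeline.pkl")
with open(local_path, "wb") as f:
    cp.dump(pipe, f)  # serialize the unfitted pipeline

# Stage paths always use forward slashes, hence posixpath.
stage_path = posixpath.join("@MY_TEMP_STAGE", os.path.basename(local_path))
session.file.put(local_path, stage_path, auto_compress=False, overwrite=True)
# The sproc later session.file.get()s this file, fits the estimator on the
# materialized query result, and puts the fitted pickle back on the stage.
```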
@@ -243,19 +426,33 @@ class Pipeline(base.BaseTransformer):
             if isinstance(dataset, snowpark.DataFrame)
             else dataset
         )
-        transformed_dataset = self._fit_transform_dataset(dataset)
 
-        estimator = self._get_estimator()
-        if estimator:
-            all_cols = transformed_dataset.columns[:]
-            estimator[1].fit(transformed_dataset)
+        if self._can_be_trained_in_ml_runtime(dataset):
+            if not self._is_convertible_to_sklearn:
+                raise ValueError("This pipeline cannot be converted to an sklearn pipeline.")
+            self._fit_ml_runtime(dataset)
 
-            self._append_step_feature_consumption_info(
-                step_name=estimator[0], all_cols=all_cols, input_cols=estimator[1].get_input_cols()
-            )
+        elif squash and isinstance(dataset, snowpark.DataFrame):
+            session = dataset._session
+            assert session is not None
+            self._fit_snowpark_dataframe_within_one_sproc(session=session, dataset=dataset)
+
+        else:
+            transformed_dataset = self._fit_transform_dataset(dataset)
+
+            estimator = self._get_estimator()
+            if estimator:
+                all_cols = transformed_dataset.columns[:]
+                estimator[1].fit(transformed_dataset)
+
+                self._append_step_feature_consumption_info(
+                    step_name=estimator[0], all_cols=all_cols, input_cols=estimator[1].get_input_cols()
+                )
+
+            self._generate_model_signatures(dataset=dataset)
 
-        self._generate_model_signatures(dataset=dataset)
         self._is_fitted = True
+
         return self
 
     @metaestimators.available_if(_final_step_has("transform"))  # type: ignore[misc]
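With the rewritten `fit` above, training dispatches three ways: the ML Runtime path when running inside SPCS, a single stored procedure when `squash=True` is passed with a Snowpark DataFrame, and the original step-by-step fit otherwise. A usage sketch, assuming an open Snowpark `session` and illustrative table and column names:

```python
# Usage sketch of the new fit() dispatch; TRAIN_DATA and the column
# names are illustrative, and `session` is an assumed Snowpark session.
from snowflake.ml.modeling.pipeline import Pipeline
from snowflake.ml.modeling.preprocessing import StandardScaler
from snowflake.ml.modeling.xgboost import XGBClassifier

pipe = Pipeline(steps=[
    ("scale", StandardScaler(input_cols=["F1", "F2"], output_cols=["F1", "F2"])),
    ("clf", XGBClassifier(input_cols=["F1", "F2"], label_cols=["LABEL"])),
])
train_df = session.table("TRAIN_DATA")

pipe.fit(train_df)               # default: fit/transform each step in turn
pipe.fit(train_df, squash=True)  # whole pipeline fit inside one stored procedure
```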
@@ -280,6 +477,22 @@ class Pipeline(base.BaseTransformer):
             else dataset
         )
 
+        if self._sklearn_object is not None:
+            handler = ModelTransformerBuilder.build(
+                dataset=dataset,
+                estimator=self._sklearn_object,
+                class_name="Pipeline",
+                subproject="",
+                autogenerated=False,
+            )
+            return handler.batch_inference(
+                inference_method="transform",
+                input_cols=self.input_cols if self.input_cols else self._infer_input_cols(dataset),
+                expected_output_cols=self._infer_output_cols(),
+                session=dataset._session,
+                dependencies=self._deps,
+            )
+
         transformed_dataset = self._transform_dataset(dataset=dataset)
         estimator = self._get_estimator()
         if estimator:
@@ -389,8 +602,32 @@ class Pipeline(base.BaseTransformer):
 
         Returns:
             Output dataset.
+
+        Raises:
+            ValueError: An sklearn object has not been fit and stored before calling this function.
         """
-        return self._invoke_estimator_func("predict", dataset)
+        if os.environ.get(IN_ML_RUNTIME_ENV_VAR):
+            if self._sklearn_object is None:
+                raise ValueError("Model must be fit before inference.")
+
+            expected_output_cols = self._infer_output_cols()
+            handler = ModelTransformerBuilder.build(
+                dataset=dataset,
+                estimator=self._sklearn_object,
+                class_name="Pipeline",
+                subproject="",
+                autogenerated=False,
+            )
+            return handler.batch_inference(
+                inference_method="predict",
+                input_cols=self.input_cols if self.input_cols else self._infer_input_cols(dataset),
+                expected_output_cols=expected_output_cols,
+                session=dataset._session,
+                dependencies=self._deps,
+            )
+
+        else:
+            return self._invoke_estimator_func("predict", dataset)
 
     @metaestimators.available_if(_final_step_has("score_samples"))  # type: ignore[misc]
     @telemetry.send_api_usage_telemetry(
@@ -408,8 +645,32 @@ class Pipeline(base.BaseTransformer):
 
         Returns:
             Output dataset.
+
+        Raises:
+            ValueError: An sklearn object has not been fit before calling this function
         """
-        return self._invoke_estimator_func("score_samples", dataset)
+
+        if os.environ.get(IN_ML_RUNTIME_ENV_VAR):
+            if self._sklearn_object is None:
+                raise ValueError("Model must be fit before inference.")
+
+            expected_output_cols = self._get_output_column_names("score_samples")
+            handler = ModelTransformerBuilder.build(
+                dataset=dataset,
+                estimator=self._sklearn_object,
+                class_name="Pipeline",
+                subproject="",
+                autogenerated=False,
+            )
+            return handler.batch_inference(
+                inference_method="score_samples",
+                input_cols=self.input_cols if self.input_cols else self._infer_input_cols(dataset),
+                expected_output_cols=expected_output_cols,
+                session=dataset._session,
+                dependencies=self._deps,
+            )
+        else:
+            return self._invoke_estimator_func("score_samples", dataset)
 
     @metaestimators.available_if(_final_step_has("predict_proba"))  # type: ignore[misc]
     @telemetry.send_api_usage_telemetry(
@@ -427,8 +688,32 @@ class Pipeline(base.BaseTransformer):
 
         Returns:
             Output dataset.
+
+        Raises:
+            ValueError: An sklearn object has not been fit before calling this function
         """
-        return self._invoke_estimator_func("predict_proba", dataset)
+
+        if os.environ.get(IN_ML_RUNTIME_ENV_VAR):
+            if self._sklearn_object is None:
+                raise ValueError("Model must be fit before inference.")
+            expected_output_cols = self._get_output_column_names("predict_proba")
+
+            handler = ModelTransformerBuilder.build(
+                dataset=dataset,
+                estimator=self._sklearn_object,
+                class_name="Pipeline",
+                subproject="",
+                autogenerated=False,
+            )
+            return handler.batch_inference(
+                inference_method="predict_proba",
+                input_cols=self.input_cols if self.input_cols else self._infer_input_cols(dataset),
+                expected_output_cols=expected_output_cols,
+                session=dataset._session,
+                dependencies=self._deps,
+            )
+        else:
+            return self._invoke_estimator_func("predict_proba", dataset)
 
     @metaestimators.available_if(_final_step_has("predict_log_proba"))  # type: ignore[misc]
     @telemetry.send_api_usage_telemetry(
@@ -447,8 +732,31 @@ class Pipeline(base.BaseTransformer):
 
         Returns:
             Output dataset.
+
+        Raises:
+            ValueError: An sklearn object has not been fit before calling this function
         """
-        return self._invoke_estimator_func("predict_log_proba", dataset)
+        if os.environ.get(IN_ML_RUNTIME_ENV_VAR):
+            if self._sklearn_object is None:
+                raise ValueError("Model must be fit before inference.")
+
+            expected_output_cols = self._get_output_column_names("predict_log_proba")
+            handler = ModelTransformerBuilder.build(
+                dataset=dataset,
+                estimator=self._sklearn_object,
+                class_name="Pipeline",
+                subproject="",
+                autogenerated=False,
+            )
+            return handler.batch_inference(
+                inference_method="predict_log_proba",
+                input_cols=self.input_cols if self.input_cols else self._infer_input_cols(dataset),
+                expected_output_cols=expected_output_cols,
+                session=dataset._session,
+                dependencies=self._deps,
+            )
+        else:
+            return self._invoke_estimator_func("predict_log_proba", dataset)
 
     @metaestimators.available_if(_final_step_has("score"))  # type: ignore[misc]
     @telemetry.send_api_usage_telemetry(
@@ -464,8 +772,30 @@ class Pipeline(base.BaseTransformer):
 
         Returns:
             Output dataset.
+
+        Raises:
+            ValueError: An sklearn object has not been fit before calling this function
         """
-        return self._invoke_estimator_func("score", dataset)
+
+        if os.environ.get(IN_ML_RUNTIME_ENV_VAR):
+            if self._sklearn_object is None:
+                raise ValueError("Model must be fit before scoreing.")
+            handler = ModelTransformerBuilder.build(
+                dataset=dataset,
+                estimator=self._sklearn_object,
+                class_name="Pipeline",
+                subproject="",
+                autogenerated=False,
+            )
+            return handler.score(
+                input_cols=self._infer_input_cols(),
+                label_cols=self._get_label_cols(),
+                session=dataset._session,
+                dependencies=self._deps,
+                score_sproc_imports=[],
+            )
+        else:
+            return self._invoke_estimator_func("score", dataset)
 
     def _invoke_estimator_func(
         self, func_name: str, dataset: Union[snowpark.DataFrame, pd.DataFrame]
@@ -495,15 +825,6 @@ class Pipeline(base.BaseTransformer):
         res: snowpark.DataFrame = getattr(estimator[1], func_name)(transformed_dataset)
         return res
 
-    def _create_unfitted_sklearn_object(self) -> pipeline.Pipeline:
-        sksteps = []
-        for step in self.steps:
-            if isinstance(step[1], base.BaseTransformer):
-                sksteps.append(tuple([step[0], _utils.to_native_format(step[1])]))
-            else:
-                sksteps.append(tuple([step[0], step[1]]))
-        return pipeline.Pipeline(steps=sksteps)
-
     def _construct_fitted_column_transformer_object(
         self,
         step_name_in_pipeline: str,
@@ -562,15 +883,134 @@ class Pipeline(base.BaseTransformer):
         ct._name_to_fitted_passthrough = {step_name_in_ct: ft}
         return ct
 
+    def _fit_ml_runtime(self, dataset: snowpark.DataFrame) -> None:
+        """Train the pipeline in the ML Runtime.
+
+        Args:
+            dataset: The training Snowpark dataframe
+
+        Raises:
+            ModuleNotFoundError: The ML Runtime Client is not installed.
+        """
+        try:
+            from snowflake.ml.runtime import MLRuntimeClient
+        except ModuleNotFoundError as e:
+            # The snowflake.ml.runtime module should always be present when
+            # the env var IN_SPCS_ML_RUNTIME is present.
+            raise ModuleNotFoundError("ML Runtime Python Client is not installed.") from e
+
+        client = MLRuntimeClient()
+        ml_runtime_compatible_pipeline = self._create_unfitted_sklearn_object()
+
+        label_cols = self._get_label_cols()
+        all_df_cols = dataset.columns
+        input_cols = [col for col in all_df_cols if col not in label_cols]
+
+        trained_pipeline = client.train(
+            estimator=ml_runtime_compatible_pipeline,
+            dataset=dataset,
+            input_cols=input_cols,
+            label_cols=label_cols,
+            sample_weight_col=self.sample_weight_col,
+        )
+
+        self._sklearn_object = trained_pipeline
+
+    def _get_label_cols(self) -> List[str]:
+        """Util function to get the label columns from the pipeline.
+        The label column is only present in the estimator
+
+        Returns:
+            List of label columns, or empty list if no label cols.
+        """
+        label_cols = []
+        estimator = self._get_estimator()
+        if estimator is not None:
+            label_cols = estimator[1].get_label_cols()
+
+        return label_cols
+
+    def _can_be_trained_in_ml_runtime(self, dataset: Union[snowpark.DataFrame, pd.DataFrame]) -> bool:
+        """A utility function to determine if the pipeline cam be pushed down to the ML Runtime for training.
+        Currently, this is true if:
+        - The training dataset is a snowpark dataframe,
+        - The IN_SPCS_ML_RUNTIME environment is present and
+        - The pipeline can be converted to an sklearn pipeline.
+
+        Args:
+            dataset: The training dataset
+
+        Returns:
+            True if the dataset can be fit in the ml runtime, else false.
+
+        """
+        if not isinstance(dataset, snowpark.DataFrame):
+            return False
+
+        if not os.environ.get(IN_ML_RUNTIME_ENV_VAR):
+            return False
+
+        return self._is_convertible_to_sklearn
+
+    @staticmethod
+    def _wrap_transformer_in_column_transformer(
+        transformer_name: str, transformer: base.BaseTransformer
+    ) -> ColumnTransformer:
+        """A helper function to convert a transformer object to an sklearn object and wrap in an sklearn
+        ColumnTransformer.
+
+        Args:
+            transformer_name: Name of the transformer to be wrapped.
+            transformer: The transformer object to be wrapped.
+
+        Returns:
+            A column transformer sklearn object that uses the input columns from the initial snowpark ml transformer.
+        """
+        column_transformer = ColumnTransformer(
+            transformers=[(transformer_name, Pipeline._get_native_object(transformer), transformer.get_input_cols())],
+            remainder="passthrough",
+        )
+        return column_transformer
+
+    def _create_unfitted_sklearn_object(self) -> pipeline.Pipeline:
+        """Create a sklearn pipeline from the current snowml pipeline.
+        ColumnTransformers are used to wrap transformers as their input columns can be specified
+        as a subset of the pipeline's input columns.
+
+        Returns:
+            An unfit pipeline that can be fit using the ML runtime client.
+        """
+
+        sklearn_pipeline_steps = []
+
+        first_step_name, first_step_object = self.steps[0]
+
+        # Only the first step can have the input_cols field not None/empty.
+        if first_step_object.get_input_cols():
+            first_step_column_transformer = Pipeline._wrap_transformer_in_column_transformer(
+                first_step_name, first_step_object
+            )
+            first_step_skl = (first_step_name, first_step_column_transformer)
+        else:
+            first_step_skl = (first_step_name, Pipeline._get_native_object(first_step_object))
+
+        sklearn_pipeline_steps.append(first_step_skl)
+
+        for step_name, step_object in self.steps[1:]:
+            skl_step = (step_name, Pipeline._get_native_object(step_object))
+            sklearn_pipeline_steps.append(skl_step)
+
+        return pipeline.Pipeline(sklearn_pipeline_steps)
+
     def _create_sklearn_object(self) -> pipeline.Pipeline:
         if not self._is_fitted:
             return self._create_unfitted_sklearn_object()
 
-        if not self.
+        if not self._modifies_label_or_sample_weight:
             raise exceptions.SnowflakeMLException(
                 error_code=error_codes.METHOD_NOT_ALLOWED,
                 original_exception=ValueError(
-                    "The pipeline can't be converted to SKLearn equivalent because it processing label or "
+                    "The pipeline can't be converted to SKLearn equivalent because it modifies processing label or "
                     "sample_weight columns as part of pipeline preprocessing steps which is not allowed in SKLearn."
                 ),
             )
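The rebuilt `_create_unfitted_sklearn_object` above no longer converts steps one-for-one; a first step that declares `input_cols` is wrapped in a `ColumnTransformer` with `remainder="passthrough"` so that a step consuming only a subset of columns still maps onto sklearn. Roughly what the converter emits, sketched directly with native objects (step and column names are illustrative):

```python
# Hedged sketch of the converter's output for a pipeline whose first
# step had input_cols=["F1", "F2"]; names are made up for illustration.
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier

skl_pipe = Pipeline(steps=[
    # remainder="passthrough" forwards the untouched columns (e.g. the label).
    ("scale", ColumnTransformer(
        transformers=[("scale", StandardScaler(), ["F1", "F2"])],
        remainder="passthrough",
    )),
    # Later steps may not declare input_cols, so they convert directly.
    ("clf", XGBClassifier()),
])
```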
@@ -631,3 +1071,65 @@ class Pipeline(base.BaseTransformer):
             original_exception=RuntimeError("Estimator not fitted before accessing property model_signatures!"),
         )
         return self._model_signature_dict
+
+    @staticmethod
+    def _get_native_object(estimator: base.BaseEstimator) -> object:
+        """A helper function to get the native(sklearn, xgboost, or lightgbm)
+        object from a snowpark ml estimator.
+        TODO - better type hinting - is there a common base class for all xgb/lgbm estimators?
+
+        Args:
+            estimator: the estimator from which to derive the native object.
+
+        Returns:
+            a native estimator object
+
+        Raises:
+            ValueError: The estimator is not an sklearn, xgboost, or lightgbm estimator.
+        """
+        methods = ["to_sklearn", "to_xgboost", "to_lightgbm"]
+        for method_name in methods:
+            if hasattr(estimator, method_name):
+                try:
+                    result = getattr(estimator, method_name)()
+                    return result
+                except exceptions.SnowflakeMLException:
+                    pass  # Do nothing and continue to the next method
+        raise ValueError("The estimator must be an sklearn, xgboost, or lightgbm estimator.")
+
+    def to_sklearn(self) -> pipeline.Pipeline:
+        """Returns an sklearn Pipeline representing the object, if possible.
+
+        Returns:
+            previously fit sklearn Pipeline if present, else an unfit pipeline
+
+        Raises:
+            ValueError: The pipeline cannot be represented as an sklearn pipeline.
+        """
+        if self._is_fitted:
+            if self._sklearn_object is not None:
+                return self._sklearn_object
+            else:
+                return self._create_sklearn_object()
+        else:
+            if self._is_convertible_to_sklearn:
+                return self._create_unfitted_sklearn_object()
+            else:
+                raise ValueError("This pipeline can not be converted to an sklearn pipeline.")
+
+    def _send_pipeline_configuration_telemetry(self) -> None:
+        """Track information about the pipeline setup. Currently, we want to track:
+        - Whether the pipeline is converible to an sklearn pipeline
+        - Whether the pipeline is being used in the SPCS ml runtime.
+        """
+
+        telemetry_data = {
+            "pipeline_is_convertible_to_sklearn": self._is_convertible_to_sklearn,
+            "in_spcs_ml_runtime": bool(os.environ.get(IN_ML_RUNTIME_ENV_VAR)),
+        }
+        telemetry.send_custom_usage(
+            project=_PROJECT,
+            subproject=_SUBPROJECT,
+            telemetry_type=telemetry.TelemetryField.TYPE_SNOWML_PIPELINE_USAGE.value,
+            data=telemetry_data,
+        )