snowflake-ml-python 1.0.2__py3-none-any.whl → 1.0.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- snowflake/ml/_internal/env_utils.py +2 -1
- snowflake/ml/_internal/file_utils.py +29 -7
- snowflake/ml/_internal/telemetry.py +5 -8
- snowflake/ml/_internal/utils/uri.py +7 -2
- snowflake/ml/model/_deploy_client/image_builds/base_image_builder.py +15 -0
- snowflake/ml/model/_deploy_client/image_builds/client_image_builder.py +259 -0
- snowflake/ml/model/_deploy_client/image_builds/docker_context.py +89 -0
- snowflake/ml/model/_deploy_client/image_builds/gunicorn_run.sh +24 -0
- snowflake/ml/model/_deploy_client/image_builds/inference_server/main.py +118 -0
- snowflake/ml/model/_deploy_client/image_builds/templates/dockerfile_template +40 -0
- snowflake/ml/model/_deploy_client/snowservice/deploy.py +199 -0
- snowflake/ml/model/_deploy_client/snowservice/deploy_options.py +88 -0
- snowflake/ml/model/_deploy_client/snowservice/templates/service_spec_template +24 -0
- snowflake/ml/model/_deploy_client/utils/constants.py +47 -0
- snowflake/ml/model/_deploy_client/utils/snowservice_client.py +178 -0
- snowflake/ml/model/_deploy_client/warehouse/deploy.py +24 -6
- snowflake/ml/model/_deploy_client/warehouse/infer_template.py +5 -2
- snowflake/ml/model/_deployer.py +14 -27
- snowflake/ml/model/_env.py +4 -4
- snowflake/ml/model/_handlers/custom.py +14 -2
- snowflake/ml/model/_handlers/pytorch.py +186 -0
- snowflake/ml/model/_handlers/sklearn.py +14 -9
- snowflake/ml/model/_handlers/snowmlmodel.py +14 -9
- snowflake/ml/model/_handlers/torchscript.py +180 -0
- snowflake/ml/model/_handlers/xgboost.py +19 -9
- snowflake/ml/model/_model.py +3 -2
- snowflake/ml/model/_model_meta.py +12 -7
- snowflake/ml/model/model_signature.py +446 -66
- snowflake/ml/model/type_hints.py +23 -4
- snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +51 -26
- snowflake/ml/modeling/cluster/affinity_propagation.py +51 -26
- snowflake/ml/modeling/cluster/agglomerative_clustering.py +51 -26
- snowflake/ml/modeling/cluster/birch.py +51 -26
- snowflake/ml/modeling/cluster/bisecting_k_means.py +51 -26
- snowflake/ml/modeling/cluster/dbscan.py +51 -26
- snowflake/ml/modeling/cluster/feature_agglomeration.py +51 -26
- snowflake/ml/modeling/cluster/k_means.py +51 -26
- snowflake/ml/modeling/cluster/mean_shift.py +51 -26
- snowflake/ml/modeling/cluster/mini_batch_k_means.py +51 -26
- snowflake/ml/modeling/cluster/optics.py +51 -26
- snowflake/ml/modeling/cluster/spectral_biclustering.py +51 -26
- snowflake/ml/modeling/cluster/spectral_clustering.py +51 -26
- snowflake/ml/modeling/cluster/spectral_coclustering.py +51 -26
- snowflake/ml/modeling/compose/column_transformer.py +51 -26
- snowflake/ml/modeling/compose/transformed_target_regressor.py +51 -26
- snowflake/ml/modeling/covariance/elliptic_envelope.py +51 -26
- snowflake/ml/modeling/covariance/empirical_covariance.py +51 -26
- snowflake/ml/modeling/covariance/graphical_lasso.py +51 -26
- snowflake/ml/modeling/covariance/graphical_lasso_cv.py +51 -26
- snowflake/ml/modeling/covariance/ledoit_wolf.py +51 -26
- snowflake/ml/modeling/covariance/min_cov_det.py +51 -26
- snowflake/ml/modeling/covariance/oas.py +51 -26
- snowflake/ml/modeling/covariance/shrunk_covariance.py +51 -26
- snowflake/ml/modeling/decomposition/dictionary_learning.py +51 -26
- snowflake/ml/modeling/decomposition/factor_analysis.py +51 -26
- snowflake/ml/modeling/decomposition/fast_ica.py +51 -26
- snowflake/ml/modeling/decomposition/incremental_pca.py +51 -26
- snowflake/ml/modeling/decomposition/kernel_pca.py +51 -26
- snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +51 -26
- snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +51 -26
- snowflake/ml/modeling/decomposition/pca.py +51 -26
- snowflake/ml/modeling/decomposition/sparse_pca.py +51 -26
- snowflake/ml/modeling/decomposition/truncated_svd.py +51 -26
- snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +51 -26
- snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +51 -26
- snowflake/ml/modeling/ensemble/ada_boost_classifier.py +51 -26
- snowflake/ml/modeling/ensemble/ada_boost_regressor.py +51 -26
- snowflake/ml/modeling/ensemble/bagging_classifier.py +51 -26
- snowflake/ml/modeling/ensemble/bagging_regressor.py +51 -26
- snowflake/ml/modeling/ensemble/extra_trees_classifier.py +51 -26
- snowflake/ml/modeling/ensemble/extra_trees_regressor.py +51 -26
- snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +51 -26
- snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +51 -26
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +51 -26
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +51 -26
- snowflake/ml/modeling/ensemble/isolation_forest.py +51 -26
- snowflake/ml/modeling/ensemble/random_forest_classifier.py +51 -26
- snowflake/ml/modeling/ensemble/random_forest_regressor.py +51 -26
- snowflake/ml/modeling/ensemble/stacking_regressor.py +51 -26
- snowflake/ml/modeling/ensemble/voting_classifier.py +51 -26
- snowflake/ml/modeling/ensemble/voting_regressor.py +51 -26
- snowflake/ml/modeling/feature_selection/generic_univariate_select.py +51 -26
- snowflake/ml/modeling/feature_selection/select_fdr.py +51 -26
- snowflake/ml/modeling/feature_selection/select_fpr.py +51 -26
- snowflake/ml/modeling/feature_selection/select_fwe.py +51 -26
- snowflake/ml/modeling/feature_selection/select_k_best.py +51 -26
- snowflake/ml/modeling/feature_selection/select_percentile.py +51 -26
- snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +51 -26
- snowflake/ml/modeling/feature_selection/variance_threshold.py +51 -26
- snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +51 -26
- snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +51 -26
- snowflake/ml/modeling/impute/iterative_imputer.py +51 -26
- snowflake/ml/modeling/impute/knn_imputer.py +51 -26
- snowflake/ml/modeling/impute/missing_indicator.py +51 -26
- snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +51 -26
- snowflake/ml/modeling/kernel_approximation/nystroem.py +51 -26
- snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +51 -26
- snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +51 -26
- snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +51 -26
- snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +51 -26
- snowflake/ml/modeling/lightgbm/lgbm_classifier.py +51 -26
- snowflake/ml/modeling/lightgbm/lgbm_regressor.py +51 -26
- snowflake/ml/modeling/linear_model/ard_regression.py +51 -26
- snowflake/ml/modeling/linear_model/bayesian_ridge.py +51 -26
- snowflake/ml/modeling/linear_model/elastic_net.py +51 -26
- snowflake/ml/modeling/linear_model/elastic_net_cv.py +51 -26
- snowflake/ml/modeling/linear_model/gamma_regressor.py +51 -26
- snowflake/ml/modeling/linear_model/huber_regressor.py +51 -26
- snowflake/ml/modeling/linear_model/lars.py +51 -26
- snowflake/ml/modeling/linear_model/lars_cv.py +51 -26
- snowflake/ml/modeling/linear_model/lasso.py +51 -26
- snowflake/ml/modeling/linear_model/lasso_cv.py +51 -26
- snowflake/ml/modeling/linear_model/lasso_lars.py +51 -26
- snowflake/ml/modeling/linear_model/lasso_lars_cv.py +51 -26
- snowflake/ml/modeling/linear_model/lasso_lars_ic.py +51 -26
- snowflake/ml/modeling/linear_model/linear_regression.py +51 -26
- snowflake/ml/modeling/linear_model/logistic_regression.py +51 -26
- snowflake/ml/modeling/linear_model/logistic_regression_cv.py +51 -26
- snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +51 -26
- snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +51 -26
- snowflake/ml/modeling/linear_model/multi_task_lasso.py +51 -26
- snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +51 -26
- snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +51 -26
- snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +51 -26
- snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +51 -26
- snowflake/ml/modeling/linear_model/perceptron.py +51 -26
- snowflake/ml/modeling/linear_model/poisson_regressor.py +51 -26
- snowflake/ml/modeling/linear_model/ransac_regressor.py +51 -26
- snowflake/ml/modeling/linear_model/ridge.py +51 -26
- snowflake/ml/modeling/linear_model/ridge_classifier.py +51 -26
- snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +51 -26
- snowflake/ml/modeling/linear_model/ridge_cv.py +51 -26
- snowflake/ml/modeling/linear_model/sgd_classifier.py +51 -26
- snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +51 -26
- snowflake/ml/modeling/linear_model/sgd_regressor.py +51 -26
- snowflake/ml/modeling/linear_model/theil_sen_regressor.py +51 -26
- snowflake/ml/modeling/linear_model/tweedie_regressor.py +51 -26
- snowflake/ml/modeling/manifold/isomap.py +51 -26
- snowflake/ml/modeling/manifold/mds.py +51 -26
- snowflake/ml/modeling/manifold/spectral_embedding.py +51 -26
- snowflake/ml/modeling/manifold/tsne.py +51 -26
- snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +51 -26
- snowflake/ml/modeling/mixture/gaussian_mixture.py +51 -26
- snowflake/ml/modeling/model_selection/grid_search_cv.py +51 -26
- snowflake/ml/modeling/model_selection/randomized_search_cv.py +51 -26
- snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +51 -26
- snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +51 -26
- snowflake/ml/modeling/multiclass/output_code_classifier.py +51 -26
- snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +51 -26
- snowflake/ml/modeling/naive_bayes/categorical_nb.py +51 -26
- snowflake/ml/modeling/naive_bayes/complement_nb.py +51 -26
- snowflake/ml/modeling/naive_bayes/gaussian_nb.py +51 -26
- snowflake/ml/modeling/naive_bayes/multinomial_nb.py +51 -26
- snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +51 -26
- snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +51 -26
- snowflake/ml/modeling/neighbors/kernel_density.py +51 -26
- snowflake/ml/modeling/neighbors/local_outlier_factor.py +51 -26
- snowflake/ml/modeling/neighbors/nearest_centroid.py +51 -26
- snowflake/ml/modeling/neighbors/nearest_neighbors.py +51 -26
- snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +51 -26
- snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +51 -26
- snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +51 -26
- snowflake/ml/modeling/neural_network/bernoulli_rbm.py +51 -26
- snowflake/ml/modeling/neural_network/mlp_classifier.py +51 -26
- snowflake/ml/modeling/neural_network/mlp_regressor.py +51 -26
- snowflake/ml/modeling/preprocessing/ordinal_encoder.py +2 -0
- snowflake/ml/modeling/preprocessing/polynomial_features.py +51 -26
- snowflake/ml/modeling/semi_supervised/label_propagation.py +51 -26
- snowflake/ml/modeling/semi_supervised/label_spreading.py +51 -26
- snowflake/ml/modeling/svm/linear_svc.py +51 -26
- snowflake/ml/modeling/svm/linear_svr.py +51 -26
- snowflake/ml/modeling/svm/nu_svc.py +51 -26
- snowflake/ml/modeling/svm/nu_svr.py +51 -26
- snowflake/ml/modeling/svm/svc.py +51 -26
- snowflake/ml/modeling/svm/svr.py +51 -26
- snowflake/ml/modeling/tree/decision_tree_classifier.py +51 -26
- snowflake/ml/modeling/tree/decision_tree_regressor.py +51 -26
- snowflake/ml/modeling/tree/extra_tree_classifier.py +51 -26
- snowflake/ml/modeling/tree/extra_tree_regressor.py +51 -26
- snowflake/ml/modeling/xgboost/xgb_classifier.py +51 -26
- snowflake/ml/modeling/xgboost/xgb_regressor.py +51 -26
- snowflake/ml/modeling/xgboost/xgbrf_classifier.py +51 -26
- snowflake/ml/modeling/xgboost/xgbrf_regressor.py +51 -26
- snowflake/ml/registry/model_registry.py +74 -56
- snowflake/ml/version.py +1 -1
- {snowflake_ml_python-1.0.2.dist-info → snowflake_ml_python-1.0.3.dist-info}/METADATA +27 -8
- snowflake_ml_python-1.0.3.dist-info/RECORD +259 -0
- snowflake_ml_python-1.0.2.dist-info/RECORD +0 -246
- {snowflake_ml_python-1.0.2.dist-info → snowflake_ml_python-1.0.3.dist-info}/WHEEL +0 -0
@@ -7,6 +7,7 @@
|
|
7
7
|
#
|
8
8
|
import inspect
|
9
9
|
import os
|
10
|
+
import posixpath
|
10
11
|
from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
|
11
12
|
from uuid import uuid4
|
12
13
|
|
@@ -28,6 +29,7 @@ from snowflake.ml._internal.utils.temp_file_utils import cleanup_temp_files, get
|
|
28
29
|
from snowflake.snowpark import DataFrame, Session
|
29
30
|
from snowflake.snowpark.functions import pandas_udf, sproc
|
30
31
|
from snowflake.snowpark.types import PandasSeries
|
32
|
+
from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
|
31
33
|
|
32
34
|
from snowflake.ml.model.model_signature import (
|
33
35
|
DataType,
|
@@ -194,7 +196,6 @@ class GenericUnivariateSelect(BaseTransformer):
|
|
194
196
|
sample_weight_col: Optional[str] = None,
|
195
197
|
) -> None:
|
196
198
|
super().__init__()
|
197
|
-
self.id = str(uuid4()).replace("-", "_").upper()
|
198
199
|
deps: Set[str] = set([f'numpy=={np.__version__}', f'scikit-learn=={sklearn.__version__}', f'cloudpickle=={cp.__version__}'])
|
199
200
|
|
200
201
|
self._deps = list(deps)
|
@@ -216,6 +217,15 @@ class GenericUnivariateSelect(BaseTransformer):
|
|
216
217
|
self.set_drop_input_cols(drop_input_cols)
|
217
218
|
self.set_sample_weight_col(sample_weight_col)
|
218
219
|
|
220
|
+
def _get_rand_id(self) -> str:
|
221
|
+
"""
|
222
|
+
Generate random id to be used in sproc and stage names.
|
223
|
+
|
224
|
+
Returns:
|
225
|
+
Random id string usable in sproc, table, and stage names.
|
226
|
+
"""
|
227
|
+
return str(uuid4()).replace("-", "_").upper()
|
228
|
+
|
219
229
|
def _infer_input_output_cols(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
|
220
230
|
"""
|
221
231
|
Infer `self.input_cols` and `self.output_cols` if they are not explicitly set.
|
@@ -294,7 +304,7 @@ class GenericUnivariateSelect(BaseTransformer):
|
|
294
304
|
cp.dump(self._sklearn_object, local_transform_file)
|
295
305
|
|
296
306
|
# Create temp stage to run fit.
|
297
|
-
transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self.
|
307
|
+
transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self._get_rand_id())
|
298
308
|
stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {transform_stage_name};"
|
299
309
|
SqlResultValidator(
|
300
310
|
session=session,
|
@@ -307,11 +317,12 @@ class GenericUnivariateSelect(BaseTransformer):
|
|
307
317
|
expected_value=f"Stage area {transform_stage_name} successfully created."
|
308
318
|
).validate()
|
309
319
|
|
310
|
-
|
320
|
+
# Use posixpath to construct stage paths
|
321
|
+
stage_transform_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
|
322
|
+
stage_result_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
|
311
323
|
local_result_file_name = get_temp_file_path()
|
312
|
-
stage_result_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
|
313
324
|
|
314
|
-
fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self.
|
325
|
+
fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self._get_rand_id())
|
315
326
|
statement_params = telemetry.get_function_usage_statement_params(
|
316
327
|
project=_PROJECT,
|
317
328
|
subproject=_SUBPROJECT,
|
@@ -337,6 +348,7 @@ class GenericUnivariateSelect(BaseTransformer):
|
|
337
348
|
replace=True,
|
338
349
|
session=session,
|
339
350
|
statement_params=statement_params,
|
351
|
+
anonymous=True
|
340
352
|
)
|
341
353
|
def fit_wrapper_sproc(
|
342
354
|
session: Session,
|
@@ -345,7 +357,8 @@ class GenericUnivariateSelect(BaseTransformer):
|
|
345
357
|
stage_result_file_name: str,
|
346
358
|
input_cols: List[str],
|
347
359
|
label_cols: List[str],
|
348
|
-
sample_weight_col: Optional[str]
|
360
|
+
sample_weight_col: Optional[str],
|
361
|
+
statement_params: Dict[str, str]
|
349
362
|
) -> str:
|
350
363
|
import cloudpickle as cp
|
351
364
|
import numpy as np
|
@@ -412,15 +425,15 @@ class GenericUnivariateSelect(BaseTransformer):
|
|
412
425
|
api_calls=[Session.call],
|
413
426
|
custom_tags=dict([("autogen", True)]),
|
414
427
|
)
|
415
|
-
sproc_export_file_name =
|
416
|
-
|
428
|
+
sproc_export_file_name = fit_wrapper_sproc(
|
429
|
+
session,
|
417
430
|
query,
|
418
431
|
stage_transform_file_name,
|
419
432
|
stage_result_file_name,
|
420
433
|
identifier.get_unescaped_names(self.input_cols),
|
421
434
|
identifier.get_unescaped_names(self.label_cols),
|
422
435
|
identifier.get_unescaped_names(self.sample_weight_col),
|
423
|
-
statement_params
|
436
|
+
statement_params,
|
424
437
|
)
|
425
438
|
|
426
439
|
if "|" in sproc_export_file_name:
|
@@ -430,7 +443,7 @@ class GenericUnivariateSelect(BaseTransformer):
|
|
430
443
|
print("\n".join(fields[1:]))
|
431
444
|
|
432
445
|
session.file.get(
|
433
|
-
|
446
|
+
posixpath.join(stage_result_file_name, sproc_export_file_name),
|
434
447
|
local_result_file_name,
|
435
448
|
statement_params=statement_params
|
436
449
|
)
|
@@ -476,7 +489,7 @@ class GenericUnivariateSelect(BaseTransformer):
|
|
476
489
|
|
477
490
|
# Register vectorized UDF for batch inference
|
478
491
|
batch_inference_udf_name = "SNOWML_BATCH_INFERENCE_{safe_id}_{method}".format(
|
479
|
-
safe_id=self.
|
492
|
+
safe_id=self._get_rand_id(), method=inference_method)
|
480
493
|
|
481
494
|
# Need to do this since if we use self._sklearn_object directly in the UDF, Snowpark
|
482
495
|
# will try to pickle all of self which fails.
|
@@ -568,7 +581,7 @@ class GenericUnivariateSelect(BaseTransformer):
|
|
568
581
|
return transformed_pandas_df.to_dict("records")
|
569
582
|
|
570
583
|
batch_inference_table_name = "SNOWML_BATCH_INFERENCE_INPUT_TABLE_{safe_id}".format(
|
571
|
-
safe_id=self.
|
584
|
+
safe_id=self._get_rand_id()
|
572
585
|
)
|
573
586
|
|
574
587
|
pass_through_columns = self._get_pass_through_columns(dataset)
|
@@ -733,11 +746,18 @@ class GenericUnivariateSelect(BaseTransformer):
|
|
733
746
|
Transformed dataset.
|
734
747
|
"""
|
735
748
|
if isinstance(dataset, DataFrame):
|
749
|
+
expected_type_inferred = ""
|
750
|
+
# when it is classifier, infer the datatype from label columns
|
751
|
+
if expected_type_inferred == "" and 'predict' in self.model_signatures:
|
752
|
+
expected_type_inferred = convert_sp_to_sf_type(
|
753
|
+
self.model_signatures['predict'].outputs[0].as_snowpark_type()
|
754
|
+
)
|
755
|
+
|
736
756
|
output_df = self._batch_inference(
|
737
757
|
dataset=dataset,
|
738
758
|
inference_method="predict",
|
739
759
|
expected_output_cols_list=self.output_cols,
|
740
|
-
expected_output_cols_type=
|
760
|
+
expected_output_cols_type=expected_type_inferred,
|
741
761
|
)
|
742
762
|
elif isinstance(dataset, pd.DataFrame):
|
743
763
|
output_df = self._sklearn_inference(
|
@@ -810,10 +830,10 @@ class GenericUnivariateSelect(BaseTransformer):
|
|
810
830
|
|
811
831
|
def _get_output_column_names(self, output_cols_prefix: str) -> List[str]:
|
812
832
|
""" Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
|
813
|
-
Returns
|
833
|
+
Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
|
814
834
|
"""
|
815
835
|
if getattr(self._sklearn_object, "classes_", None) is None:
|
816
|
-
return []
|
836
|
+
return [output_cols_prefix]
|
817
837
|
|
818
838
|
classes = self._sklearn_object.classes_
|
819
839
|
if isinstance(classes, numpy.ndarray):
|
@@ -1038,7 +1058,7 @@ class GenericUnivariateSelect(BaseTransformer):
|
|
1038
1058
|
cp.dump(self._sklearn_object, local_score_file)
|
1039
1059
|
|
1040
1060
|
# Create temp stage to run score.
|
1041
|
-
score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.
|
1061
|
+
score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
|
1042
1062
|
session = dataset._session
|
1043
1063
|
stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {score_stage_name};"
|
1044
1064
|
SqlResultValidator(
|
@@ -1052,8 +1072,9 @@ class GenericUnivariateSelect(BaseTransformer):
|
|
1052
1072
|
expected_value=f"Stage area {score_stage_name} successfully created."
|
1053
1073
|
).validate()
|
1054
1074
|
|
1055
|
-
|
1056
|
-
|
1075
|
+
# Use posixpath to construct stage paths
|
1076
|
+
stage_score_file_name = posixpath.join(score_stage_name, os.path.basename(local_score_file_name))
|
1077
|
+
score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
|
1057
1078
|
statement_params = telemetry.get_function_usage_statement_params(
|
1058
1079
|
project=_PROJECT,
|
1059
1080
|
subproject=_SUBPROJECT,
|
@@ -1079,6 +1100,7 @@ class GenericUnivariateSelect(BaseTransformer):
|
|
1079
1100
|
replace=True,
|
1080
1101
|
session=session,
|
1081
1102
|
statement_params=statement_params,
|
1103
|
+
anonymous=True
|
1082
1104
|
)
|
1083
1105
|
def score_wrapper_sproc(
|
1084
1106
|
session: Session,
|
@@ -1086,7 +1108,8 @@ class GenericUnivariateSelect(BaseTransformer):
|
|
1086
1108
|
stage_score_file_name: str,
|
1087
1109
|
input_cols: List[str],
|
1088
1110
|
label_cols: List[str],
|
1089
|
-
sample_weight_col: Optional[str]
|
1111
|
+
sample_weight_col: Optional[str],
|
1112
|
+
statement_params: Dict[str, str]
|
1090
1113
|
) -> float:
|
1091
1114
|
import cloudpickle as cp
|
1092
1115
|
import numpy as np
|
@@ -1136,14 +1159,14 @@ class GenericUnivariateSelect(BaseTransformer):
|
|
1136
1159
|
api_calls=[Session.call],
|
1137
1160
|
custom_tags=dict([("autogen", True)]),
|
1138
1161
|
)
|
1139
|
-
score =
|
1140
|
-
|
1162
|
+
score = score_wrapper_sproc(
|
1163
|
+
session,
|
1141
1164
|
query,
|
1142
1165
|
stage_score_file_name,
|
1143
1166
|
identifier.get_unescaped_names(self.input_cols),
|
1144
1167
|
identifier.get_unescaped_names(self.label_cols),
|
1145
1168
|
identifier.get_unescaped_names(self.sample_weight_col),
|
1146
|
-
statement_params
|
1169
|
+
statement_params,
|
1147
1170
|
)
|
1148
1171
|
|
1149
1172
|
cleanup_temp_files([local_score_file_name])
|
@@ -1161,18 +1184,20 @@ class GenericUnivariateSelect(BaseTransformer):
|
|
1161
1184
|
if self._sklearn_object._estimator_type == 'classifier':
|
1162
1185
|
outputs = _infer_signature(dataset[self.label_cols], "output") # label columns is the desired type for output
|
1163
1186
|
outputs = _rename_features(outputs, self.output_cols) # rename the output columns
|
1164
|
-
self._model_signature_dict["predict"] = ModelSignature(inputs,
|
1187
|
+
self._model_signature_dict["predict"] = ModelSignature(inputs,
|
1188
|
+
([] if self._drop_input_cols else inputs) + outputs)
|
1165
1189
|
# For regressor, the type of predict is float64
|
1166
1190
|
elif self._sklearn_object._estimator_type == 'regressor':
|
1167
1191
|
outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
|
1168
|
-
self._model_signature_dict["predict"] = ModelSignature(inputs,
|
1169
|
-
|
1192
|
+
self._model_signature_dict["predict"] = ModelSignature(inputs,
|
1193
|
+
([] if self._drop_input_cols else inputs) + outputs)
|
1170
1194
|
for prob_func in PROB_FUNCTIONS:
|
1171
1195
|
if hasattr(self, prob_func):
|
1172
1196
|
output_cols_prefix: str = f"{prob_func}_"
|
1173
1197
|
output_column_names = self._get_output_column_names(output_cols_prefix)
|
1174
1198
|
outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
|
1175
|
-
self._model_signature_dict[prob_func] = ModelSignature(inputs,
|
1199
|
+
self._model_signature_dict[prob_func] = ModelSignature(inputs,
|
1200
|
+
([] if self._drop_input_cols else inputs) + outputs)
|
1176
1201
|
|
1177
1202
|
@property
|
1178
1203
|
def model_signatures(self) -> Dict[str, ModelSignature]:
|
@@ -7,6 +7,7 @@
|
|
7
7
|
#
|
8
8
|
import inspect
|
9
9
|
import os
|
10
|
+
import posixpath
|
10
11
|
from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
|
11
12
|
from uuid import uuid4
|
12
13
|
|
@@ -28,6 +29,7 @@ from snowflake.ml._internal.utils.temp_file_utils import cleanup_temp_files, get
|
|
28
29
|
from snowflake.snowpark import DataFrame, Session
|
29
30
|
from snowflake.snowpark.functions import pandas_udf, sproc
|
30
31
|
from snowflake.snowpark.types import PandasSeries
|
32
|
+
from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
|
31
33
|
|
32
34
|
from snowflake.ml.model.model_signature import (
|
33
35
|
DataType,
|
@@ -191,7 +193,6 @@ class SelectFdr(BaseTransformer):
|
|
191
193
|
sample_weight_col: Optional[str] = None,
|
192
194
|
) -> None:
|
193
195
|
super().__init__()
|
194
|
-
self.id = str(uuid4()).replace("-", "_").upper()
|
195
196
|
deps: Set[str] = set([f'numpy=={np.__version__}', f'scikit-learn=={sklearn.__version__}', f'cloudpickle=={cp.__version__}'])
|
196
197
|
|
197
198
|
self._deps = list(deps)
|
@@ -212,6 +213,15 @@ class SelectFdr(BaseTransformer):
|
|
212
213
|
self.set_drop_input_cols(drop_input_cols)
|
213
214
|
self.set_sample_weight_col(sample_weight_col)
|
214
215
|
|
216
|
+
def _get_rand_id(self) -> str:
|
217
|
+
"""
|
218
|
+
Generate random id to be used in sproc and stage names.
|
219
|
+
|
220
|
+
Returns:
|
221
|
+
Random id string usable in sproc, table, and stage names.
|
222
|
+
"""
|
223
|
+
return str(uuid4()).replace("-", "_").upper()
|
224
|
+
|
215
225
|
def _infer_input_output_cols(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
|
216
226
|
"""
|
217
227
|
Infer `self.input_cols` and `self.output_cols` if they are not explicitly set.
|
@@ -290,7 +300,7 @@ class SelectFdr(BaseTransformer):
|
|
290
300
|
cp.dump(self._sklearn_object, local_transform_file)
|
291
301
|
|
292
302
|
# Create temp stage to run fit.
|
293
|
-
transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self.
|
303
|
+
transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self._get_rand_id())
|
294
304
|
stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {transform_stage_name};"
|
295
305
|
SqlResultValidator(
|
296
306
|
session=session,
|
@@ -303,11 +313,12 @@ class SelectFdr(BaseTransformer):
|
|
303
313
|
expected_value=f"Stage area {transform_stage_name} successfully created."
|
304
314
|
).validate()
|
305
315
|
|
306
|
-
|
316
|
+
# Use posixpath to construct stage paths
|
317
|
+
stage_transform_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
|
318
|
+
stage_result_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
|
307
319
|
local_result_file_name = get_temp_file_path()
|
308
|
-
stage_result_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
|
309
320
|
|
310
|
-
fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self.
|
321
|
+
fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self._get_rand_id())
|
311
322
|
statement_params = telemetry.get_function_usage_statement_params(
|
312
323
|
project=_PROJECT,
|
313
324
|
subproject=_SUBPROJECT,
|
@@ -333,6 +344,7 @@ class SelectFdr(BaseTransformer):
|
|
333
344
|
replace=True,
|
334
345
|
session=session,
|
335
346
|
statement_params=statement_params,
|
347
|
+
anonymous=True
|
336
348
|
)
|
337
349
|
def fit_wrapper_sproc(
|
338
350
|
session: Session,
|
@@ -341,7 +353,8 @@ class SelectFdr(BaseTransformer):
|
|
341
353
|
stage_result_file_name: str,
|
342
354
|
input_cols: List[str],
|
343
355
|
label_cols: List[str],
|
344
|
-
sample_weight_col: Optional[str]
|
356
|
+
sample_weight_col: Optional[str],
|
357
|
+
statement_params: Dict[str, str]
|
345
358
|
) -> str:
|
346
359
|
import cloudpickle as cp
|
347
360
|
import numpy as np
|
@@ -408,15 +421,15 @@ class SelectFdr(BaseTransformer):
|
|
408
421
|
api_calls=[Session.call],
|
409
422
|
custom_tags=dict([("autogen", True)]),
|
410
423
|
)
|
411
|
-
sproc_export_file_name =
|
412
|
-
|
424
|
+
sproc_export_file_name = fit_wrapper_sproc(
|
425
|
+
session,
|
413
426
|
query,
|
414
427
|
stage_transform_file_name,
|
415
428
|
stage_result_file_name,
|
416
429
|
identifier.get_unescaped_names(self.input_cols),
|
417
430
|
identifier.get_unescaped_names(self.label_cols),
|
418
431
|
identifier.get_unescaped_names(self.sample_weight_col),
|
419
|
-
statement_params
|
432
|
+
statement_params,
|
420
433
|
)
|
421
434
|
|
422
435
|
if "|" in sproc_export_file_name:
|
@@ -426,7 +439,7 @@ class SelectFdr(BaseTransformer):
|
|
426
439
|
print("\n".join(fields[1:]))
|
427
440
|
|
428
441
|
session.file.get(
|
429
|
-
|
442
|
+
posixpath.join(stage_result_file_name, sproc_export_file_name),
|
430
443
|
local_result_file_name,
|
431
444
|
statement_params=statement_params
|
432
445
|
)
|
@@ -472,7 +485,7 @@ class SelectFdr(BaseTransformer):
|
|
472
485
|
|
473
486
|
# Register vectorized UDF for batch inference
|
474
487
|
batch_inference_udf_name = "SNOWML_BATCH_INFERENCE_{safe_id}_{method}".format(
|
475
|
-
safe_id=self.
|
488
|
+
safe_id=self._get_rand_id(), method=inference_method)
|
476
489
|
|
477
490
|
# Need to do this since if we use self._sklearn_object directly in the UDF, Snowpark
|
478
491
|
# will try to pickle all of self which fails.
|
@@ -564,7 +577,7 @@ class SelectFdr(BaseTransformer):
|
|
564
577
|
return transformed_pandas_df.to_dict("records")
|
565
578
|
|
566
579
|
batch_inference_table_name = "SNOWML_BATCH_INFERENCE_INPUT_TABLE_{safe_id}".format(
|
567
|
-
safe_id=self.
|
580
|
+
safe_id=self._get_rand_id()
|
568
581
|
)
|
569
582
|
|
570
583
|
pass_through_columns = self._get_pass_through_columns(dataset)
|
@@ -729,11 +742,18 @@ class SelectFdr(BaseTransformer):
|
|
729
742
|
Transformed dataset.
|
730
743
|
"""
|
731
744
|
if isinstance(dataset, DataFrame):
|
745
|
+
expected_type_inferred = ""
|
746
|
+
# when it is classifier, infer the datatype from label columns
|
747
|
+
if expected_type_inferred == "" and 'predict' in self.model_signatures:
|
748
|
+
expected_type_inferred = convert_sp_to_sf_type(
|
749
|
+
self.model_signatures['predict'].outputs[0].as_snowpark_type()
|
750
|
+
)
|
751
|
+
|
732
752
|
output_df = self._batch_inference(
|
733
753
|
dataset=dataset,
|
734
754
|
inference_method="predict",
|
735
755
|
expected_output_cols_list=self.output_cols,
|
736
|
-
expected_output_cols_type=
|
756
|
+
expected_output_cols_type=expected_type_inferred,
|
737
757
|
)
|
738
758
|
elif isinstance(dataset, pd.DataFrame):
|
739
759
|
output_df = self._sklearn_inference(
|
@@ -806,10 +826,10 @@ class SelectFdr(BaseTransformer):
|
|
806
826
|
|
807
827
|
def _get_output_column_names(self, output_cols_prefix: str) -> List[str]:
|
808
828
|
""" Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
|
809
|
-
Returns
|
829
|
+
Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
|
810
830
|
"""
|
811
831
|
if getattr(self._sklearn_object, "classes_", None) is None:
|
812
|
-
return []
|
832
|
+
return [output_cols_prefix]
|
813
833
|
|
814
834
|
classes = self._sklearn_object.classes_
|
815
835
|
if isinstance(classes, numpy.ndarray):
|
@@ -1034,7 +1054,7 @@ class SelectFdr(BaseTransformer):
|
|
1034
1054
|
cp.dump(self._sklearn_object, local_score_file)
|
1035
1055
|
|
1036
1056
|
# Create temp stage to run score.
|
1037
|
-
score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.
|
1057
|
+
score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
|
1038
1058
|
session = dataset._session
|
1039
1059
|
stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {score_stage_name};"
|
1040
1060
|
SqlResultValidator(
|
@@ -1048,8 +1068,9 @@ class SelectFdr(BaseTransformer):
|
|
1048
1068
|
expected_value=f"Stage area {score_stage_name} successfully created."
|
1049
1069
|
).validate()
|
1050
1070
|
|
1051
|
-
|
1052
|
-
|
1071
|
+
# Use posixpath to construct stage paths
|
1072
|
+
stage_score_file_name = posixpath.join(score_stage_name, os.path.basename(local_score_file_name))
|
1073
|
+
score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
|
1053
1074
|
statement_params = telemetry.get_function_usage_statement_params(
|
1054
1075
|
project=_PROJECT,
|
1055
1076
|
subproject=_SUBPROJECT,
|
@@ -1075,6 +1096,7 @@ class SelectFdr(BaseTransformer):
|
|
1075
1096
|
replace=True,
|
1076
1097
|
session=session,
|
1077
1098
|
statement_params=statement_params,
|
1099
|
+
anonymous=True
|
1078
1100
|
)
|
1079
1101
|
def score_wrapper_sproc(
|
1080
1102
|
session: Session,
|
@@ -1082,7 +1104,8 @@ class SelectFdr(BaseTransformer):
|
|
1082
1104
|
stage_score_file_name: str,
|
1083
1105
|
input_cols: List[str],
|
1084
1106
|
label_cols: List[str],
|
1085
|
-
sample_weight_col: Optional[str]
|
1107
|
+
sample_weight_col: Optional[str],
|
1108
|
+
statement_params: Dict[str, str]
|
1086
1109
|
) -> float:
|
1087
1110
|
import cloudpickle as cp
|
1088
1111
|
import numpy as np
|
@@ -1132,14 +1155,14 @@ class SelectFdr(BaseTransformer):
|
|
1132
1155
|
api_calls=[Session.call],
|
1133
1156
|
custom_tags=dict([("autogen", True)]),
|
1134
1157
|
)
|
1135
|
-
score =
|
1136
|
-
|
1158
|
+
score = score_wrapper_sproc(
|
1159
|
+
session,
|
1137
1160
|
query,
|
1138
1161
|
stage_score_file_name,
|
1139
1162
|
identifier.get_unescaped_names(self.input_cols),
|
1140
1163
|
identifier.get_unescaped_names(self.label_cols),
|
1141
1164
|
identifier.get_unescaped_names(self.sample_weight_col),
|
1142
|
-
statement_params
|
1165
|
+
statement_params,
|
1143
1166
|
)
|
1144
1167
|
|
1145
1168
|
cleanup_temp_files([local_score_file_name])
|
@@ -1157,18 +1180,20 @@ class SelectFdr(BaseTransformer):
|
|
1157
1180
|
if self._sklearn_object._estimator_type == 'classifier':
|
1158
1181
|
outputs = _infer_signature(dataset[self.label_cols], "output") # label columns is the desired type for output
|
1159
1182
|
outputs = _rename_features(outputs, self.output_cols) # rename the output columns
|
1160
|
-
self._model_signature_dict["predict"] = ModelSignature(inputs,
|
1183
|
+
self._model_signature_dict["predict"] = ModelSignature(inputs,
|
1184
|
+
([] if self._drop_input_cols else inputs) + outputs)
|
1161
1185
|
# For regressor, the type of predict is float64
|
1162
1186
|
elif self._sklearn_object._estimator_type == 'regressor':
|
1163
1187
|
outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
|
1164
|
-
self._model_signature_dict["predict"] = ModelSignature(inputs,
|
1165
|
-
|
1188
|
+
self._model_signature_dict["predict"] = ModelSignature(inputs,
|
1189
|
+
([] if self._drop_input_cols else inputs) + outputs)
|
1166
1190
|
for prob_func in PROB_FUNCTIONS:
|
1167
1191
|
if hasattr(self, prob_func):
|
1168
1192
|
output_cols_prefix: str = f"{prob_func}_"
|
1169
1193
|
output_column_names = self._get_output_column_names(output_cols_prefix)
|
1170
1194
|
outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
|
1171
|
-
self._model_signature_dict[prob_func] = ModelSignature(inputs,
|
1195
|
+
self._model_signature_dict[prob_func] = ModelSignature(inputs,
|
1196
|
+
([] if self._drop_input_cols else inputs) + outputs)
|
1172
1197
|
|
1173
1198
|
@property
|
1174
1199
|
def model_signatures(self) -> Dict[str, ModelSignature]:
|