snowflake-ml-python 1.0.2__py3-none-any.whl → 1.0.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- snowflake/ml/_internal/env_utils.py +2 -1
- snowflake/ml/_internal/file_utils.py +29 -7
- snowflake/ml/_internal/telemetry.py +5 -8
- snowflake/ml/_internal/utils/uri.py +7 -2
- snowflake/ml/model/_deploy_client/image_builds/base_image_builder.py +15 -0
- snowflake/ml/model/_deploy_client/image_builds/client_image_builder.py +259 -0
- snowflake/ml/model/_deploy_client/image_builds/docker_context.py +89 -0
- snowflake/ml/model/_deploy_client/image_builds/gunicorn_run.sh +24 -0
- snowflake/ml/model/_deploy_client/image_builds/inference_server/main.py +118 -0
- snowflake/ml/model/_deploy_client/image_builds/templates/dockerfile_template +40 -0
- snowflake/ml/model/_deploy_client/snowservice/deploy.py +199 -0
- snowflake/ml/model/_deploy_client/snowservice/deploy_options.py +88 -0
- snowflake/ml/model/_deploy_client/snowservice/templates/service_spec_template +24 -0
- snowflake/ml/model/_deploy_client/utils/constants.py +47 -0
- snowflake/ml/model/_deploy_client/utils/snowservice_client.py +178 -0
- snowflake/ml/model/_deploy_client/warehouse/deploy.py +24 -6
- snowflake/ml/model/_deploy_client/warehouse/infer_template.py +5 -2
- snowflake/ml/model/_deployer.py +14 -27
- snowflake/ml/model/_env.py +4 -4
- snowflake/ml/model/_handlers/custom.py +14 -2
- snowflake/ml/model/_handlers/pytorch.py +186 -0
- snowflake/ml/model/_handlers/sklearn.py +14 -9
- snowflake/ml/model/_handlers/snowmlmodel.py +14 -9
- snowflake/ml/model/_handlers/torchscript.py +180 -0
- snowflake/ml/model/_handlers/xgboost.py +19 -9
- snowflake/ml/model/_model.py +3 -2
- snowflake/ml/model/_model_meta.py +12 -7
- snowflake/ml/model/model_signature.py +446 -66
- snowflake/ml/model/type_hints.py +23 -4
- snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +51 -26
- snowflake/ml/modeling/cluster/affinity_propagation.py +51 -26
- snowflake/ml/modeling/cluster/agglomerative_clustering.py +51 -26
- snowflake/ml/modeling/cluster/birch.py +51 -26
- snowflake/ml/modeling/cluster/bisecting_k_means.py +51 -26
- snowflake/ml/modeling/cluster/dbscan.py +51 -26
- snowflake/ml/modeling/cluster/feature_agglomeration.py +51 -26
- snowflake/ml/modeling/cluster/k_means.py +51 -26
- snowflake/ml/modeling/cluster/mean_shift.py +51 -26
- snowflake/ml/modeling/cluster/mini_batch_k_means.py +51 -26
- snowflake/ml/modeling/cluster/optics.py +51 -26
- snowflake/ml/modeling/cluster/spectral_biclustering.py +51 -26
- snowflake/ml/modeling/cluster/spectral_clustering.py +51 -26
- snowflake/ml/modeling/cluster/spectral_coclustering.py +51 -26
- snowflake/ml/modeling/compose/column_transformer.py +51 -26
- snowflake/ml/modeling/compose/transformed_target_regressor.py +51 -26
- snowflake/ml/modeling/covariance/elliptic_envelope.py +51 -26
- snowflake/ml/modeling/covariance/empirical_covariance.py +51 -26
- snowflake/ml/modeling/covariance/graphical_lasso.py +51 -26
- snowflake/ml/modeling/covariance/graphical_lasso_cv.py +51 -26
- snowflake/ml/modeling/covariance/ledoit_wolf.py +51 -26
- snowflake/ml/modeling/covariance/min_cov_det.py +51 -26
- snowflake/ml/modeling/covariance/oas.py +51 -26
- snowflake/ml/modeling/covariance/shrunk_covariance.py +51 -26
- snowflake/ml/modeling/decomposition/dictionary_learning.py +51 -26
- snowflake/ml/modeling/decomposition/factor_analysis.py +51 -26
- snowflake/ml/modeling/decomposition/fast_ica.py +51 -26
- snowflake/ml/modeling/decomposition/incremental_pca.py +51 -26
- snowflake/ml/modeling/decomposition/kernel_pca.py +51 -26
- snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +51 -26
- snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +51 -26
- snowflake/ml/modeling/decomposition/pca.py +51 -26
- snowflake/ml/modeling/decomposition/sparse_pca.py +51 -26
- snowflake/ml/modeling/decomposition/truncated_svd.py +51 -26
- snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +51 -26
- snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +51 -26
- snowflake/ml/modeling/ensemble/ada_boost_classifier.py +51 -26
- snowflake/ml/modeling/ensemble/ada_boost_regressor.py +51 -26
- snowflake/ml/modeling/ensemble/bagging_classifier.py +51 -26
- snowflake/ml/modeling/ensemble/bagging_regressor.py +51 -26
- snowflake/ml/modeling/ensemble/extra_trees_classifier.py +51 -26
- snowflake/ml/modeling/ensemble/extra_trees_regressor.py +51 -26
- snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +51 -26
- snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +51 -26
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +51 -26
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +51 -26
- snowflake/ml/modeling/ensemble/isolation_forest.py +51 -26
- snowflake/ml/modeling/ensemble/random_forest_classifier.py +51 -26
- snowflake/ml/modeling/ensemble/random_forest_regressor.py +51 -26
- snowflake/ml/modeling/ensemble/stacking_regressor.py +51 -26
- snowflake/ml/modeling/ensemble/voting_classifier.py +51 -26
- snowflake/ml/modeling/ensemble/voting_regressor.py +51 -26
- snowflake/ml/modeling/feature_selection/generic_univariate_select.py +51 -26
- snowflake/ml/modeling/feature_selection/select_fdr.py +51 -26
- snowflake/ml/modeling/feature_selection/select_fpr.py +51 -26
- snowflake/ml/modeling/feature_selection/select_fwe.py +51 -26
- snowflake/ml/modeling/feature_selection/select_k_best.py +51 -26
- snowflake/ml/modeling/feature_selection/select_percentile.py +51 -26
- snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +51 -26
- snowflake/ml/modeling/feature_selection/variance_threshold.py +51 -26
- snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +51 -26
- snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +51 -26
- snowflake/ml/modeling/impute/iterative_imputer.py +51 -26
- snowflake/ml/modeling/impute/knn_imputer.py +51 -26
- snowflake/ml/modeling/impute/missing_indicator.py +51 -26
- snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +51 -26
- snowflake/ml/modeling/kernel_approximation/nystroem.py +51 -26
- snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +51 -26
- snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +51 -26
- snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +51 -26
- snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +51 -26
- snowflake/ml/modeling/lightgbm/lgbm_classifier.py +51 -26
- snowflake/ml/modeling/lightgbm/lgbm_regressor.py +51 -26
- snowflake/ml/modeling/linear_model/ard_regression.py +51 -26
- snowflake/ml/modeling/linear_model/bayesian_ridge.py +51 -26
- snowflake/ml/modeling/linear_model/elastic_net.py +51 -26
- snowflake/ml/modeling/linear_model/elastic_net_cv.py +51 -26
- snowflake/ml/modeling/linear_model/gamma_regressor.py +51 -26
- snowflake/ml/modeling/linear_model/huber_regressor.py +51 -26
- snowflake/ml/modeling/linear_model/lars.py +51 -26
- snowflake/ml/modeling/linear_model/lars_cv.py +51 -26
- snowflake/ml/modeling/linear_model/lasso.py +51 -26
- snowflake/ml/modeling/linear_model/lasso_cv.py +51 -26
- snowflake/ml/modeling/linear_model/lasso_lars.py +51 -26
- snowflake/ml/modeling/linear_model/lasso_lars_cv.py +51 -26
- snowflake/ml/modeling/linear_model/lasso_lars_ic.py +51 -26
- snowflake/ml/modeling/linear_model/linear_regression.py +51 -26
- snowflake/ml/modeling/linear_model/logistic_regression.py +51 -26
- snowflake/ml/modeling/linear_model/logistic_regression_cv.py +51 -26
- snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +51 -26
- snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +51 -26
- snowflake/ml/modeling/linear_model/multi_task_lasso.py +51 -26
- snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +51 -26
- snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +51 -26
- snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +51 -26
- snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +51 -26
- snowflake/ml/modeling/linear_model/perceptron.py +51 -26
- snowflake/ml/modeling/linear_model/poisson_regressor.py +51 -26
- snowflake/ml/modeling/linear_model/ransac_regressor.py +51 -26
- snowflake/ml/modeling/linear_model/ridge.py +51 -26
- snowflake/ml/modeling/linear_model/ridge_classifier.py +51 -26
- snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +51 -26
- snowflake/ml/modeling/linear_model/ridge_cv.py +51 -26
- snowflake/ml/modeling/linear_model/sgd_classifier.py +51 -26
- snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +51 -26
- snowflake/ml/modeling/linear_model/sgd_regressor.py +51 -26
- snowflake/ml/modeling/linear_model/theil_sen_regressor.py +51 -26
- snowflake/ml/modeling/linear_model/tweedie_regressor.py +51 -26
- snowflake/ml/modeling/manifold/isomap.py +51 -26
- snowflake/ml/modeling/manifold/mds.py +51 -26
- snowflake/ml/modeling/manifold/spectral_embedding.py +51 -26
- snowflake/ml/modeling/manifold/tsne.py +51 -26
- snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +51 -26
- snowflake/ml/modeling/mixture/gaussian_mixture.py +51 -26
- snowflake/ml/modeling/model_selection/grid_search_cv.py +51 -26
- snowflake/ml/modeling/model_selection/randomized_search_cv.py +51 -26
- snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +51 -26
- snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +51 -26
- snowflake/ml/modeling/multiclass/output_code_classifier.py +51 -26
- snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +51 -26
- snowflake/ml/modeling/naive_bayes/categorical_nb.py +51 -26
- snowflake/ml/modeling/naive_bayes/complement_nb.py +51 -26
- snowflake/ml/modeling/naive_bayes/gaussian_nb.py +51 -26
- snowflake/ml/modeling/naive_bayes/multinomial_nb.py +51 -26
- snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +51 -26
- snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +51 -26
- snowflake/ml/modeling/neighbors/kernel_density.py +51 -26
- snowflake/ml/modeling/neighbors/local_outlier_factor.py +51 -26
- snowflake/ml/modeling/neighbors/nearest_centroid.py +51 -26
- snowflake/ml/modeling/neighbors/nearest_neighbors.py +51 -26
- snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +51 -26
- snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +51 -26
- snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +51 -26
- snowflake/ml/modeling/neural_network/bernoulli_rbm.py +51 -26
- snowflake/ml/modeling/neural_network/mlp_classifier.py +51 -26
- snowflake/ml/modeling/neural_network/mlp_regressor.py +51 -26
- snowflake/ml/modeling/preprocessing/ordinal_encoder.py +2 -0
- snowflake/ml/modeling/preprocessing/polynomial_features.py +51 -26
- snowflake/ml/modeling/semi_supervised/label_propagation.py +51 -26
- snowflake/ml/modeling/semi_supervised/label_spreading.py +51 -26
- snowflake/ml/modeling/svm/linear_svc.py +51 -26
- snowflake/ml/modeling/svm/linear_svr.py +51 -26
- snowflake/ml/modeling/svm/nu_svc.py +51 -26
- snowflake/ml/modeling/svm/nu_svr.py +51 -26
- snowflake/ml/modeling/svm/svc.py +51 -26
- snowflake/ml/modeling/svm/svr.py +51 -26
- snowflake/ml/modeling/tree/decision_tree_classifier.py +51 -26
- snowflake/ml/modeling/tree/decision_tree_regressor.py +51 -26
- snowflake/ml/modeling/tree/extra_tree_classifier.py +51 -26
- snowflake/ml/modeling/tree/extra_tree_regressor.py +51 -26
- snowflake/ml/modeling/xgboost/xgb_classifier.py +51 -26
- snowflake/ml/modeling/xgboost/xgb_regressor.py +51 -26
- snowflake/ml/modeling/xgboost/xgbrf_classifier.py +51 -26
- snowflake/ml/modeling/xgboost/xgbrf_regressor.py +51 -26
- snowflake/ml/registry/model_registry.py +74 -56
- snowflake/ml/version.py +1 -1
- {snowflake_ml_python-1.0.2.dist-info → snowflake_ml_python-1.0.3.dist-info}/METADATA +27 -8
- snowflake_ml_python-1.0.3.dist-info/RECORD +259 -0
- snowflake_ml_python-1.0.2.dist-info/RECORD +0 -246
- {snowflake_ml_python-1.0.2.dist-info → snowflake_ml_python-1.0.3.dist-info}/WHEEL +0 -0
@@ -7,6 +7,7 @@
|
|
7
7
|
#
|
8
8
|
import inspect
|
9
9
|
import os
|
10
|
+
import posixpath
|
10
11
|
from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
|
11
12
|
from uuid import uuid4
|
12
13
|
|
@@ -27,6 +28,7 @@ from snowflake.ml._internal.utils.temp_file_utils import cleanup_temp_files, get
|
|
27
28
|
from snowflake.snowpark import DataFrame, Session
|
28
29
|
from snowflake.snowpark.functions import pandas_udf, sproc
|
29
30
|
from snowflake.snowpark.types import PandasSeries
|
31
|
+
from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
|
30
32
|
|
31
33
|
from snowflake.ml.model.model_signature import (
|
32
34
|
DataType,
|
@@ -246,7 +248,6 @@ class SequentialFeatureSelector(BaseTransformer):
|
|
246
248
|
sample_weight_col: Optional[str] = None,
|
247
249
|
) -> None:
|
248
250
|
super().__init__()
|
249
|
-
self.id = str(uuid4()).replace("-", "_").upper()
|
250
251
|
deps: Set[str] = set([f'numpy=={np.__version__}', f'scikit-learn=={sklearn.__version__}', f'cloudpickle=={cp.__version__}'])
|
251
252
|
deps = deps | _gather_dependencies(estimator)
|
252
253
|
self._deps = list(deps)
|
@@ -272,6 +273,15 @@ class SequentialFeatureSelector(BaseTransformer):
|
|
272
273
|
self.set_drop_input_cols(drop_input_cols)
|
273
274
|
self.set_sample_weight_col(sample_weight_col)
|
274
275
|
|
276
|
+
def _get_rand_id(self) -> str:
|
277
|
+
"""
|
278
|
+
Generate random id to be used in sproc and stage names.
|
279
|
+
|
280
|
+
Returns:
|
281
|
+
Random id string usable in sproc, table, and stage names.
|
282
|
+
"""
|
283
|
+
return str(uuid4()).replace("-", "_").upper()
|
284
|
+
|
275
285
|
def _infer_input_output_cols(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
|
276
286
|
"""
|
277
287
|
Infer `self.input_cols` and `self.output_cols` if they are not explicitly set.
|
@@ -350,7 +360,7 @@ class SequentialFeatureSelector(BaseTransformer):
|
|
350
360
|
cp.dump(self._sklearn_object, local_transform_file)
|
351
361
|
|
352
362
|
# Create temp stage to run fit.
|
353
|
-
transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self.
|
363
|
+
transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self._get_rand_id())
|
354
364
|
stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {transform_stage_name};"
|
355
365
|
SqlResultValidator(
|
356
366
|
session=session,
|
@@ -363,11 +373,12 @@ class SequentialFeatureSelector(BaseTransformer):
|
|
363
373
|
expected_value=f"Stage area {transform_stage_name} successfully created."
|
364
374
|
).validate()
|
365
375
|
|
366
|
-
|
376
|
+
# Use posixpath to construct stage paths
|
377
|
+
stage_transform_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
|
378
|
+
stage_result_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
|
367
379
|
local_result_file_name = get_temp_file_path()
|
368
|
-
stage_result_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
|
369
380
|
|
370
|
-
fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self.
|
381
|
+
fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self._get_rand_id())
|
371
382
|
statement_params = telemetry.get_function_usage_statement_params(
|
372
383
|
project=_PROJECT,
|
373
384
|
subproject=_SUBPROJECT,
|
@@ -393,6 +404,7 @@ class SequentialFeatureSelector(BaseTransformer):
|
|
393
404
|
replace=True,
|
394
405
|
session=session,
|
395
406
|
statement_params=statement_params,
|
407
|
+
anonymous=True
|
396
408
|
)
|
397
409
|
def fit_wrapper_sproc(
|
398
410
|
session: Session,
|
@@ -401,7 +413,8 @@ class SequentialFeatureSelector(BaseTransformer):
|
|
401
413
|
stage_result_file_name: str,
|
402
414
|
input_cols: List[str],
|
403
415
|
label_cols: List[str],
|
404
|
-
sample_weight_col: Optional[str]
|
416
|
+
sample_weight_col: Optional[str],
|
417
|
+
statement_params: Dict[str, str]
|
405
418
|
) -> str:
|
406
419
|
import cloudpickle as cp
|
407
420
|
import numpy as np
|
@@ -468,15 +481,15 @@ class SequentialFeatureSelector(BaseTransformer):
|
|
468
481
|
api_calls=[Session.call],
|
469
482
|
custom_tags=dict([("autogen", True)]),
|
470
483
|
)
|
471
|
-
sproc_export_file_name =
|
472
|
-
|
484
|
+
sproc_export_file_name = fit_wrapper_sproc(
|
485
|
+
session,
|
473
486
|
query,
|
474
487
|
stage_transform_file_name,
|
475
488
|
stage_result_file_name,
|
476
489
|
identifier.get_unescaped_names(self.input_cols),
|
477
490
|
identifier.get_unescaped_names(self.label_cols),
|
478
491
|
identifier.get_unescaped_names(self.sample_weight_col),
|
479
|
-
statement_params
|
492
|
+
statement_params,
|
480
493
|
)
|
481
494
|
|
482
495
|
if "|" in sproc_export_file_name:
|
@@ -486,7 +499,7 @@ class SequentialFeatureSelector(BaseTransformer):
|
|
486
499
|
print("\n".join(fields[1:]))
|
487
500
|
|
488
501
|
session.file.get(
|
489
|
-
|
502
|
+
posixpath.join(stage_result_file_name, sproc_export_file_name),
|
490
503
|
local_result_file_name,
|
491
504
|
statement_params=statement_params
|
492
505
|
)
|
@@ -532,7 +545,7 @@ class SequentialFeatureSelector(BaseTransformer):
|
|
532
545
|
|
533
546
|
# Register vectorized UDF for batch inference
|
534
547
|
batch_inference_udf_name = "SNOWML_BATCH_INFERENCE_{safe_id}_{method}".format(
|
535
|
-
safe_id=self.
|
548
|
+
safe_id=self._get_rand_id(), method=inference_method)
|
536
549
|
|
537
550
|
# Need to do this since if we use self._sklearn_object directly in the UDF, Snowpark
|
538
551
|
# will try to pickle all of self which fails.
|
@@ -624,7 +637,7 @@ class SequentialFeatureSelector(BaseTransformer):
|
|
624
637
|
return transformed_pandas_df.to_dict("records")
|
625
638
|
|
626
639
|
batch_inference_table_name = "SNOWML_BATCH_INFERENCE_INPUT_TABLE_{safe_id}".format(
|
627
|
-
safe_id=self.
|
640
|
+
safe_id=self._get_rand_id()
|
628
641
|
)
|
629
642
|
|
630
643
|
pass_through_columns = self._get_pass_through_columns(dataset)
|
@@ -789,11 +802,18 @@ class SequentialFeatureSelector(BaseTransformer):
|
|
789
802
|
Transformed dataset.
|
790
803
|
"""
|
791
804
|
if isinstance(dataset, DataFrame):
|
805
|
+
expected_type_inferred = ""
|
806
|
+
# when it is classifier, infer the datatype from label columns
|
807
|
+
if expected_type_inferred == "" and 'predict' in self.model_signatures:
|
808
|
+
expected_type_inferred = convert_sp_to_sf_type(
|
809
|
+
self.model_signatures['predict'].outputs[0].as_snowpark_type()
|
810
|
+
)
|
811
|
+
|
792
812
|
output_df = self._batch_inference(
|
793
813
|
dataset=dataset,
|
794
814
|
inference_method="predict",
|
795
815
|
expected_output_cols_list=self.output_cols,
|
796
|
-
expected_output_cols_type=
|
816
|
+
expected_output_cols_type=expected_type_inferred,
|
797
817
|
)
|
798
818
|
elif isinstance(dataset, pd.DataFrame):
|
799
819
|
output_df = self._sklearn_inference(
|
@@ -866,10 +886,10 @@ class SequentialFeatureSelector(BaseTransformer):
|
|
866
886
|
|
867
887
|
def _get_output_column_names(self, output_cols_prefix: str) -> List[str]:
|
868
888
|
""" Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
|
869
|
-
Returns
|
889
|
+
Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
|
870
890
|
"""
|
871
891
|
if getattr(self._sklearn_object, "classes_", None) is None:
|
872
|
-
return []
|
892
|
+
return [output_cols_prefix]
|
873
893
|
|
874
894
|
classes = self._sklearn_object.classes_
|
875
895
|
if isinstance(classes, numpy.ndarray):
|
@@ -1094,7 +1114,7 @@ class SequentialFeatureSelector(BaseTransformer):
|
|
1094
1114
|
cp.dump(self._sklearn_object, local_score_file)
|
1095
1115
|
|
1096
1116
|
# Create temp stage to run score.
|
1097
|
-
score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.
|
1117
|
+
score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
|
1098
1118
|
session = dataset._session
|
1099
1119
|
stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {score_stage_name};"
|
1100
1120
|
SqlResultValidator(
|
@@ -1108,8 +1128,9 @@ class SequentialFeatureSelector(BaseTransformer):
|
|
1108
1128
|
expected_value=f"Stage area {score_stage_name} successfully created."
|
1109
1129
|
).validate()
|
1110
1130
|
|
1111
|
-
|
1112
|
-
|
1131
|
+
# Use posixpath to construct stage paths
|
1132
|
+
stage_score_file_name = posixpath.join(score_stage_name, os.path.basename(local_score_file_name))
|
1133
|
+
score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
|
1113
1134
|
statement_params = telemetry.get_function_usage_statement_params(
|
1114
1135
|
project=_PROJECT,
|
1115
1136
|
subproject=_SUBPROJECT,
|
@@ -1135,6 +1156,7 @@ class SequentialFeatureSelector(BaseTransformer):
|
|
1135
1156
|
replace=True,
|
1136
1157
|
session=session,
|
1137
1158
|
statement_params=statement_params,
|
1159
|
+
anonymous=True
|
1138
1160
|
)
|
1139
1161
|
def score_wrapper_sproc(
|
1140
1162
|
session: Session,
|
@@ -1142,7 +1164,8 @@ class SequentialFeatureSelector(BaseTransformer):
|
|
1142
1164
|
stage_score_file_name: str,
|
1143
1165
|
input_cols: List[str],
|
1144
1166
|
label_cols: List[str],
|
1145
|
-
sample_weight_col: Optional[str]
|
1167
|
+
sample_weight_col: Optional[str],
|
1168
|
+
statement_params: Dict[str, str]
|
1146
1169
|
) -> float:
|
1147
1170
|
import cloudpickle as cp
|
1148
1171
|
import numpy as np
|
@@ -1192,14 +1215,14 @@ class SequentialFeatureSelector(BaseTransformer):
|
|
1192
1215
|
api_calls=[Session.call],
|
1193
1216
|
custom_tags=dict([("autogen", True)]),
|
1194
1217
|
)
|
1195
|
-
score =
|
1196
|
-
|
1218
|
+
score = score_wrapper_sproc(
|
1219
|
+
session,
|
1197
1220
|
query,
|
1198
1221
|
stage_score_file_name,
|
1199
1222
|
identifier.get_unescaped_names(self.input_cols),
|
1200
1223
|
identifier.get_unescaped_names(self.label_cols),
|
1201
1224
|
identifier.get_unescaped_names(self.sample_weight_col),
|
1202
|
-
statement_params
|
1225
|
+
statement_params,
|
1203
1226
|
)
|
1204
1227
|
|
1205
1228
|
cleanup_temp_files([local_score_file_name])
|
@@ -1217,18 +1240,20 @@ class SequentialFeatureSelector(BaseTransformer):
|
|
1217
1240
|
if self._sklearn_object._estimator_type == 'classifier':
|
1218
1241
|
outputs = _infer_signature(dataset[self.label_cols], "output") # label columns is the desired type for output
|
1219
1242
|
outputs = _rename_features(outputs, self.output_cols) # rename the output columns
|
1220
|
-
self._model_signature_dict["predict"] = ModelSignature(inputs,
|
1243
|
+
self._model_signature_dict["predict"] = ModelSignature(inputs,
|
1244
|
+
([] if self._drop_input_cols else inputs) + outputs)
|
1221
1245
|
# For regressor, the type of predict is float64
|
1222
1246
|
elif self._sklearn_object._estimator_type == 'regressor':
|
1223
1247
|
outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
|
1224
|
-
self._model_signature_dict["predict"] = ModelSignature(inputs,
|
1225
|
-
|
1248
|
+
self._model_signature_dict["predict"] = ModelSignature(inputs,
|
1249
|
+
([] if self._drop_input_cols else inputs) + outputs)
|
1226
1250
|
for prob_func in PROB_FUNCTIONS:
|
1227
1251
|
if hasattr(self, prob_func):
|
1228
1252
|
output_cols_prefix: str = f"{prob_func}_"
|
1229
1253
|
output_column_names = self._get_output_column_names(output_cols_prefix)
|
1230
1254
|
outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
|
1231
|
-
self._model_signature_dict[prob_func] = ModelSignature(inputs,
|
1255
|
+
self._model_signature_dict[prob_func] = ModelSignature(inputs,
|
1256
|
+
([] if self._drop_input_cols else inputs) + outputs)
|
1232
1257
|
|
1233
1258
|
@property
|
1234
1259
|
def model_signatures(self) -> Dict[str, ModelSignature]:
|
@@ -7,6 +7,7 @@
|
|
7
7
|
#
|
8
8
|
import inspect
|
9
9
|
import os
|
10
|
+
import posixpath
|
10
11
|
from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
|
11
12
|
from uuid import uuid4
|
12
13
|
|
@@ -27,6 +28,7 @@ from snowflake.ml._internal.utils.temp_file_utils import cleanup_temp_files, get
|
|
27
28
|
from snowflake.snowpark import DataFrame, Session
|
28
29
|
from snowflake.snowpark.functions import pandas_udf, sproc
|
29
30
|
from snowflake.snowpark.types import PandasSeries
|
31
|
+
from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
|
30
32
|
|
31
33
|
from snowflake.ml.model.model_signature import (
|
32
34
|
DataType,
|
@@ -185,7 +187,6 @@ class VarianceThreshold(BaseTransformer):
|
|
185
187
|
sample_weight_col: Optional[str] = None,
|
186
188
|
) -> None:
|
187
189
|
super().__init__()
|
188
|
-
self.id = str(uuid4()).replace("-", "_").upper()
|
189
190
|
deps: Set[str] = set([f'numpy=={np.__version__}', f'scikit-learn=={sklearn.__version__}', f'cloudpickle=={cp.__version__}'])
|
190
191
|
|
191
192
|
self._deps = list(deps)
|
@@ -205,6 +206,15 @@ class VarianceThreshold(BaseTransformer):
|
|
205
206
|
self.set_drop_input_cols(drop_input_cols)
|
206
207
|
self.set_sample_weight_col(sample_weight_col)
|
207
208
|
|
209
|
+
def _get_rand_id(self) -> str:
|
210
|
+
"""
|
211
|
+
Generate random id to be used in sproc and stage names.
|
212
|
+
|
213
|
+
Returns:
|
214
|
+
Random id string usable in sproc, table, and stage names.
|
215
|
+
"""
|
216
|
+
return str(uuid4()).replace("-", "_").upper()
|
217
|
+
|
208
218
|
def _infer_input_output_cols(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
|
209
219
|
"""
|
210
220
|
Infer `self.input_cols` and `self.output_cols` if they are not explicitly set.
|
@@ -283,7 +293,7 @@ class VarianceThreshold(BaseTransformer):
|
|
283
293
|
cp.dump(self._sklearn_object, local_transform_file)
|
284
294
|
|
285
295
|
# Create temp stage to run fit.
|
286
|
-
transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self.
|
296
|
+
transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self._get_rand_id())
|
287
297
|
stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {transform_stage_name};"
|
288
298
|
SqlResultValidator(
|
289
299
|
session=session,
|
@@ -296,11 +306,12 @@ class VarianceThreshold(BaseTransformer):
|
|
296
306
|
expected_value=f"Stage area {transform_stage_name} successfully created."
|
297
307
|
).validate()
|
298
308
|
|
299
|
-
|
309
|
+
# Use posixpath to construct stage paths
|
310
|
+
stage_transform_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
|
311
|
+
stage_result_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
|
300
312
|
local_result_file_name = get_temp_file_path()
|
301
|
-
stage_result_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
|
302
313
|
|
303
|
-
fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self.
|
314
|
+
fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self._get_rand_id())
|
304
315
|
statement_params = telemetry.get_function_usage_statement_params(
|
305
316
|
project=_PROJECT,
|
306
317
|
subproject=_SUBPROJECT,
|
@@ -326,6 +337,7 @@ class VarianceThreshold(BaseTransformer):
|
|
326
337
|
replace=True,
|
327
338
|
session=session,
|
328
339
|
statement_params=statement_params,
|
340
|
+
anonymous=True
|
329
341
|
)
|
330
342
|
def fit_wrapper_sproc(
|
331
343
|
session: Session,
|
@@ -334,7 +346,8 @@ class VarianceThreshold(BaseTransformer):
|
|
334
346
|
stage_result_file_name: str,
|
335
347
|
input_cols: List[str],
|
336
348
|
label_cols: List[str],
|
337
|
-
sample_weight_col: Optional[str]
|
349
|
+
sample_weight_col: Optional[str],
|
350
|
+
statement_params: Dict[str, str]
|
338
351
|
) -> str:
|
339
352
|
import cloudpickle as cp
|
340
353
|
import numpy as np
|
@@ -401,15 +414,15 @@ class VarianceThreshold(BaseTransformer):
|
|
401
414
|
api_calls=[Session.call],
|
402
415
|
custom_tags=dict([("autogen", True)]),
|
403
416
|
)
|
404
|
-
sproc_export_file_name =
|
405
|
-
|
417
|
+
sproc_export_file_name = fit_wrapper_sproc(
|
418
|
+
session,
|
406
419
|
query,
|
407
420
|
stage_transform_file_name,
|
408
421
|
stage_result_file_name,
|
409
422
|
identifier.get_unescaped_names(self.input_cols),
|
410
423
|
identifier.get_unescaped_names(self.label_cols),
|
411
424
|
identifier.get_unescaped_names(self.sample_weight_col),
|
412
|
-
statement_params
|
425
|
+
statement_params,
|
413
426
|
)
|
414
427
|
|
415
428
|
if "|" in sproc_export_file_name:
|
@@ -419,7 +432,7 @@ class VarianceThreshold(BaseTransformer):
|
|
419
432
|
print("\n".join(fields[1:]))
|
420
433
|
|
421
434
|
session.file.get(
|
422
|
-
|
435
|
+
posixpath.join(stage_result_file_name, sproc_export_file_name),
|
423
436
|
local_result_file_name,
|
424
437
|
statement_params=statement_params
|
425
438
|
)
|
@@ -465,7 +478,7 @@ class VarianceThreshold(BaseTransformer):
|
|
465
478
|
|
466
479
|
# Register vectorized UDF for batch inference
|
467
480
|
batch_inference_udf_name = "SNOWML_BATCH_INFERENCE_{safe_id}_{method}".format(
|
468
|
-
safe_id=self.
|
481
|
+
safe_id=self._get_rand_id(), method=inference_method)
|
469
482
|
|
470
483
|
# Need to do this since if we use self._sklearn_object directly in the UDF, Snowpark
|
471
484
|
# will try to pickle all of self which fails.
|
@@ -557,7 +570,7 @@ class VarianceThreshold(BaseTransformer):
|
|
557
570
|
return transformed_pandas_df.to_dict("records")
|
558
571
|
|
559
572
|
batch_inference_table_name = "SNOWML_BATCH_INFERENCE_INPUT_TABLE_{safe_id}".format(
|
560
|
-
safe_id=self.
|
573
|
+
safe_id=self._get_rand_id()
|
561
574
|
)
|
562
575
|
|
563
576
|
pass_through_columns = self._get_pass_through_columns(dataset)
|
@@ -722,11 +735,18 @@ class VarianceThreshold(BaseTransformer):
|
|
722
735
|
Transformed dataset.
|
723
736
|
"""
|
724
737
|
if isinstance(dataset, DataFrame):
|
738
|
+
expected_type_inferred = ""
|
739
|
+
# when it is classifier, infer the datatype from label columns
|
740
|
+
if expected_type_inferred == "" and 'predict' in self.model_signatures:
|
741
|
+
expected_type_inferred = convert_sp_to_sf_type(
|
742
|
+
self.model_signatures['predict'].outputs[0].as_snowpark_type()
|
743
|
+
)
|
744
|
+
|
725
745
|
output_df = self._batch_inference(
|
726
746
|
dataset=dataset,
|
727
747
|
inference_method="predict",
|
728
748
|
expected_output_cols_list=self.output_cols,
|
729
|
-
expected_output_cols_type=
|
749
|
+
expected_output_cols_type=expected_type_inferred,
|
730
750
|
)
|
731
751
|
elif isinstance(dataset, pd.DataFrame):
|
732
752
|
output_df = self._sklearn_inference(
|
@@ -799,10 +819,10 @@ class VarianceThreshold(BaseTransformer):
|
|
799
819
|
|
800
820
|
def _get_output_column_names(self, output_cols_prefix: str) -> List[str]:
|
801
821
|
""" Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
|
802
|
-
Returns
|
822
|
+
Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
|
803
823
|
"""
|
804
824
|
if getattr(self._sklearn_object, "classes_", None) is None:
|
805
|
-
return []
|
825
|
+
return [output_cols_prefix]
|
806
826
|
|
807
827
|
classes = self._sklearn_object.classes_
|
808
828
|
if isinstance(classes, numpy.ndarray):
|
@@ -1027,7 +1047,7 @@ class VarianceThreshold(BaseTransformer):
|
|
1027
1047
|
cp.dump(self._sklearn_object, local_score_file)
|
1028
1048
|
|
1029
1049
|
# Create temp stage to run score.
|
1030
|
-
score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.
|
1050
|
+
score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
|
1031
1051
|
session = dataset._session
|
1032
1052
|
stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {score_stage_name};"
|
1033
1053
|
SqlResultValidator(
|
@@ -1041,8 +1061,9 @@ class VarianceThreshold(BaseTransformer):
|
|
1041
1061
|
expected_value=f"Stage area {score_stage_name} successfully created."
|
1042
1062
|
).validate()
|
1043
1063
|
|
1044
|
-
|
1045
|
-
|
1064
|
+
# Use posixpath to construct stage paths
|
1065
|
+
stage_score_file_name = posixpath.join(score_stage_name, os.path.basename(local_score_file_name))
|
1066
|
+
score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
|
1046
1067
|
statement_params = telemetry.get_function_usage_statement_params(
|
1047
1068
|
project=_PROJECT,
|
1048
1069
|
subproject=_SUBPROJECT,
|
@@ -1068,6 +1089,7 @@ class VarianceThreshold(BaseTransformer):
|
|
1068
1089
|
replace=True,
|
1069
1090
|
session=session,
|
1070
1091
|
statement_params=statement_params,
|
1092
|
+
anonymous=True
|
1071
1093
|
)
|
1072
1094
|
def score_wrapper_sproc(
|
1073
1095
|
session: Session,
|
@@ -1075,7 +1097,8 @@ class VarianceThreshold(BaseTransformer):
|
|
1075
1097
|
stage_score_file_name: str,
|
1076
1098
|
input_cols: List[str],
|
1077
1099
|
label_cols: List[str],
|
1078
|
-
sample_weight_col: Optional[str]
|
1100
|
+
sample_weight_col: Optional[str],
|
1101
|
+
statement_params: Dict[str, str]
|
1079
1102
|
) -> float:
|
1080
1103
|
import cloudpickle as cp
|
1081
1104
|
import numpy as np
|
@@ -1125,14 +1148,14 @@ class VarianceThreshold(BaseTransformer):
|
|
1125
1148
|
api_calls=[Session.call],
|
1126
1149
|
custom_tags=dict([("autogen", True)]),
|
1127
1150
|
)
|
1128
|
-
score =
|
1129
|
-
|
1151
|
+
score = score_wrapper_sproc(
|
1152
|
+
session,
|
1130
1153
|
query,
|
1131
1154
|
stage_score_file_name,
|
1132
1155
|
identifier.get_unescaped_names(self.input_cols),
|
1133
1156
|
identifier.get_unescaped_names(self.label_cols),
|
1134
1157
|
identifier.get_unescaped_names(self.sample_weight_col),
|
1135
|
-
statement_params
|
1158
|
+
statement_params,
|
1136
1159
|
)
|
1137
1160
|
|
1138
1161
|
cleanup_temp_files([local_score_file_name])
|
@@ -1150,18 +1173,20 @@ class VarianceThreshold(BaseTransformer):
|
|
1150
1173
|
if self._sklearn_object._estimator_type == 'classifier':
|
1151
1174
|
outputs = _infer_signature(dataset[self.label_cols], "output") # label columns is the desired type for output
|
1152
1175
|
outputs = _rename_features(outputs, self.output_cols) # rename the output columns
|
1153
|
-
self._model_signature_dict["predict"] = ModelSignature(inputs,
|
1176
|
+
self._model_signature_dict["predict"] = ModelSignature(inputs,
|
1177
|
+
([] if self._drop_input_cols else inputs) + outputs)
|
1154
1178
|
# For regressor, the type of predict is float64
|
1155
1179
|
elif self._sklearn_object._estimator_type == 'regressor':
|
1156
1180
|
outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
|
1157
|
-
self._model_signature_dict["predict"] = ModelSignature(inputs,
|
1158
|
-
|
1181
|
+
self._model_signature_dict["predict"] = ModelSignature(inputs,
|
1182
|
+
([] if self._drop_input_cols else inputs) + outputs)
|
1159
1183
|
for prob_func in PROB_FUNCTIONS:
|
1160
1184
|
if hasattr(self, prob_func):
|
1161
1185
|
output_cols_prefix: str = f"{prob_func}_"
|
1162
1186
|
output_column_names = self._get_output_column_names(output_cols_prefix)
|
1163
1187
|
outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
|
1164
|
-
self._model_signature_dict[prob_func] = ModelSignature(inputs,
|
1188
|
+
self._model_signature_dict[prob_func] = ModelSignature(inputs,
|
1189
|
+
([] if self._drop_input_cols else inputs) + outputs)
|
1165
1190
|
|
1166
1191
|
@property
|
1167
1192
|
def model_signatures(self) -> Dict[str, ModelSignature]:
|