snowflake-ml-python 1.0.2__py3-none-any.whl → 1.0.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- snowflake/ml/_internal/env_utils.py +2 -1
- snowflake/ml/_internal/file_utils.py +29 -7
- snowflake/ml/_internal/telemetry.py +5 -8
- snowflake/ml/_internal/utils/uri.py +7 -2
- snowflake/ml/model/_deploy_client/image_builds/base_image_builder.py +15 -0
- snowflake/ml/model/_deploy_client/image_builds/client_image_builder.py +259 -0
- snowflake/ml/model/_deploy_client/image_builds/docker_context.py +89 -0
- snowflake/ml/model/_deploy_client/image_builds/gunicorn_run.sh +24 -0
- snowflake/ml/model/_deploy_client/image_builds/inference_server/main.py +118 -0
- snowflake/ml/model/_deploy_client/image_builds/templates/dockerfile_template +40 -0
- snowflake/ml/model/_deploy_client/snowservice/deploy.py +199 -0
- snowflake/ml/model/_deploy_client/snowservice/deploy_options.py +88 -0
- snowflake/ml/model/_deploy_client/snowservice/templates/service_spec_template +24 -0
- snowflake/ml/model/_deploy_client/utils/constants.py +47 -0
- snowflake/ml/model/_deploy_client/utils/snowservice_client.py +178 -0
- snowflake/ml/model/_deploy_client/warehouse/deploy.py +24 -6
- snowflake/ml/model/_deploy_client/warehouse/infer_template.py +5 -2
- snowflake/ml/model/_deployer.py +14 -27
- snowflake/ml/model/_env.py +4 -4
- snowflake/ml/model/_handlers/custom.py +14 -2
- snowflake/ml/model/_handlers/pytorch.py +186 -0
- snowflake/ml/model/_handlers/sklearn.py +14 -9
- snowflake/ml/model/_handlers/snowmlmodel.py +14 -9
- snowflake/ml/model/_handlers/torchscript.py +180 -0
- snowflake/ml/model/_handlers/xgboost.py +19 -9
- snowflake/ml/model/_model.py +3 -2
- snowflake/ml/model/_model_meta.py +12 -7
- snowflake/ml/model/model_signature.py +446 -66
- snowflake/ml/model/type_hints.py +23 -4
- snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +51 -26
- snowflake/ml/modeling/cluster/affinity_propagation.py +51 -26
- snowflake/ml/modeling/cluster/agglomerative_clustering.py +51 -26
- snowflake/ml/modeling/cluster/birch.py +51 -26
- snowflake/ml/modeling/cluster/bisecting_k_means.py +51 -26
- snowflake/ml/modeling/cluster/dbscan.py +51 -26
- snowflake/ml/modeling/cluster/feature_agglomeration.py +51 -26
- snowflake/ml/modeling/cluster/k_means.py +51 -26
- snowflake/ml/modeling/cluster/mean_shift.py +51 -26
- snowflake/ml/modeling/cluster/mini_batch_k_means.py +51 -26
- snowflake/ml/modeling/cluster/optics.py +51 -26
- snowflake/ml/modeling/cluster/spectral_biclustering.py +51 -26
- snowflake/ml/modeling/cluster/spectral_clustering.py +51 -26
- snowflake/ml/modeling/cluster/spectral_coclustering.py +51 -26
- snowflake/ml/modeling/compose/column_transformer.py +51 -26
- snowflake/ml/modeling/compose/transformed_target_regressor.py +51 -26
- snowflake/ml/modeling/covariance/elliptic_envelope.py +51 -26
- snowflake/ml/modeling/covariance/empirical_covariance.py +51 -26
- snowflake/ml/modeling/covariance/graphical_lasso.py +51 -26
- snowflake/ml/modeling/covariance/graphical_lasso_cv.py +51 -26
- snowflake/ml/modeling/covariance/ledoit_wolf.py +51 -26
- snowflake/ml/modeling/covariance/min_cov_det.py +51 -26
- snowflake/ml/modeling/covariance/oas.py +51 -26
- snowflake/ml/modeling/covariance/shrunk_covariance.py +51 -26
- snowflake/ml/modeling/decomposition/dictionary_learning.py +51 -26
- snowflake/ml/modeling/decomposition/factor_analysis.py +51 -26
- snowflake/ml/modeling/decomposition/fast_ica.py +51 -26
- snowflake/ml/modeling/decomposition/incremental_pca.py +51 -26
- snowflake/ml/modeling/decomposition/kernel_pca.py +51 -26
- snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +51 -26
- snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +51 -26
- snowflake/ml/modeling/decomposition/pca.py +51 -26
- snowflake/ml/modeling/decomposition/sparse_pca.py +51 -26
- snowflake/ml/modeling/decomposition/truncated_svd.py +51 -26
- snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +51 -26
- snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +51 -26
- snowflake/ml/modeling/ensemble/ada_boost_classifier.py +51 -26
- snowflake/ml/modeling/ensemble/ada_boost_regressor.py +51 -26
- snowflake/ml/modeling/ensemble/bagging_classifier.py +51 -26
- snowflake/ml/modeling/ensemble/bagging_regressor.py +51 -26
- snowflake/ml/modeling/ensemble/extra_trees_classifier.py +51 -26
- snowflake/ml/modeling/ensemble/extra_trees_regressor.py +51 -26
- snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +51 -26
- snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +51 -26
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +51 -26
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +51 -26
- snowflake/ml/modeling/ensemble/isolation_forest.py +51 -26
- snowflake/ml/modeling/ensemble/random_forest_classifier.py +51 -26
- snowflake/ml/modeling/ensemble/random_forest_regressor.py +51 -26
- snowflake/ml/modeling/ensemble/stacking_regressor.py +51 -26
- snowflake/ml/modeling/ensemble/voting_classifier.py +51 -26
- snowflake/ml/modeling/ensemble/voting_regressor.py +51 -26
- snowflake/ml/modeling/feature_selection/generic_univariate_select.py +51 -26
- snowflake/ml/modeling/feature_selection/select_fdr.py +51 -26
- snowflake/ml/modeling/feature_selection/select_fpr.py +51 -26
- snowflake/ml/modeling/feature_selection/select_fwe.py +51 -26
- snowflake/ml/modeling/feature_selection/select_k_best.py +51 -26
- snowflake/ml/modeling/feature_selection/select_percentile.py +51 -26
- snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +51 -26
- snowflake/ml/modeling/feature_selection/variance_threshold.py +51 -26
- snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +51 -26
- snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +51 -26
- snowflake/ml/modeling/impute/iterative_imputer.py +51 -26
- snowflake/ml/modeling/impute/knn_imputer.py +51 -26
- snowflake/ml/modeling/impute/missing_indicator.py +51 -26
- snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +51 -26
- snowflake/ml/modeling/kernel_approximation/nystroem.py +51 -26
- snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +51 -26
- snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +51 -26
- snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +51 -26
- snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +51 -26
- snowflake/ml/modeling/lightgbm/lgbm_classifier.py +51 -26
- snowflake/ml/modeling/lightgbm/lgbm_regressor.py +51 -26
- snowflake/ml/modeling/linear_model/ard_regression.py +51 -26
- snowflake/ml/modeling/linear_model/bayesian_ridge.py +51 -26
- snowflake/ml/modeling/linear_model/elastic_net.py +51 -26
- snowflake/ml/modeling/linear_model/elastic_net_cv.py +51 -26
- snowflake/ml/modeling/linear_model/gamma_regressor.py +51 -26
- snowflake/ml/modeling/linear_model/huber_regressor.py +51 -26
- snowflake/ml/modeling/linear_model/lars.py +51 -26
- snowflake/ml/modeling/linear_model/lars_cv.py +51 -26
- snowflake/ml/modeling/linear_model/lasso.py +51 -26
- snowflake/ml/modeling/linear_model/lasso_cv.py +51 -26
- snowflake/ml/modeling/linear_model/lasso_lars.py +51 -26
- snowflake/ml/modeling/linear_model/lasso_lars_cv.py +51 -26
- snowflake/ml/modeling/linear_model/lasso_lars_ic.py +51 -26
- snowflake/ml/modeling/linear_model/linear_regression.py +51 -26
- snowflake/ml/modeling/linear_model/logistic_regression.py +51 -26
- snowflake/ml/modeling/linear_model/logistic_regression_cv.py +51 -26
- snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +51 -26
- snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +51 -26
- snowflake/ml/modeling/linear_model/multi_task_lasso.py +51 -26
- snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +51 -26
- snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +51 -26
- snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +51 -26
- snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +51 -26
- snowflake/ml/modeling/linear_model/perceptron.py +51 -26
- snowflake/ml/modeling/linear_model/poisson_regressor.py +51 -26
- snowflake/ml/modeling/linear_model/ransac_regressor.py +51 -26
- snowflake/ml/modeling/linear_model/ridge.py +51 -26
- snowflake/ml/modeling/linear_model/ridge_classifier.py +51 -26
- snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +51 -26
- snowflake/ml/modeling/linear_model/ridge_cv.py +51 -26
- snowflake/ml/modeling/linear_model/sgd_classifier.py +51 -26
- snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +51 -26
- snowflake/ml/modeling/linear_model/sgd_regressor.py +51 -26
- snowflake/ml/modeling/linear_model/theil_sen_regressor.py +51 -26
- snowflake/ml/modeling/linear_model/tweedie_regressor.py +51 -26
- snowflake/ml/modeling/manifold/isomap.py +51 -26
- snowflake/ml/modeling/manifold/mds.py +51 -26
- snowflake/ml/modeling/manifold/spectral_embedding.py +51 -26
- snowflake/ml/modeling/manifold/tsne.py +51 -26
- snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +51 -26
- snowflake/ml/modeling/mixture/gaussian_mixture.py +51 -26
- snowflake/ml/modeling/model_selection/grid_search_cv.py +51 -26
- snowflake/ml/modeling/model_selection/randomized_search_cv.py +51 -26
- snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +51 -26
- snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +51 -26
- snowflake/ml/modeling/multiclass/output_code_classifier.py +51 -26
- snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +51 -26
- snowflake/ml/modeling/naive_bayes/categorical_nb.py +51 -26
- snowflake/ml/modeling/naive_bayes/complement_nb.py +51 -26
- snowflake/ml/modeling/naive_bayes/gaussian_nb.py +51 -26
- snowflake/ml/modeling/naive_bayes/multinomial_nb.py +51 -26
- snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +51 -26
- snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +51 -26
- snowflake/ml/modeling/neighbors/kernel_density.py +51 -26
- snowflake/ml/modeling/neighbors/local_outlier_factor.py +51 -26
- snowflake/ml/modeling/neighbors/nearest_centroid.py +51 -26
- snowflake/ml/modeling/neighbors/nearest_neighbors.py +51 -26
- snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +51 -26
- snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +51 -26
- snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +51 -26
- snowflake/ml/modeling/neural_network/bernoulli_rbm.py +51 -26
- snowflake/ml/modeling/neural_network/mlp_classifier.py +51 -26
- snowflake/ml/modeling/neural_network/mlp_regressor.py +51 -26
- snowflake/ml/modeling/preprocessing/ordinal_encoder.py +2 -0
- snowflake/ml/modeling/preprocessing/polynomial_features.py +51 -26
- snowflake/ml/modeling/semi_supervised/label_propagation.py +51 -26
- snowflake/ml/modeling/semi_supervised/label_spreading.py +51 -26
- snowflake/ml/modeling/svm/linear_svc.py +51 -26
- snowflake/ml/modeling/svm/linear_svr.py +51 -26
- snowflake/ml/modeling/svm/nu_svc.py +51 -26
- snowflake/ml/modeling/svm/nu_svr.py +51 -26
- snowflake/ml/modeling/svm/svc.py +51 -26
- snowflake/ml/modeling/svm/svr.py +51 -26
- snowflake/ml/modeling/tree/decision_tree_classifier.py +51 -26
- snowflake/ml/modeling/tree/decision_tree_regressor.py +51 -26
- snowflake/ml/modeling/tree/extra_tree_classifier.py +51 -26
- snowflake/ml/modeling/tree/extra_tree_regressor.py +51 -26
- snowflake/ml/modeling/xgboost/xgb_classifier.py +51 -26
- snowflake/ml/modeling/xgboost/xgb_regressor.py +51 -26
- snowflake/ml/modeling/xgboost/xgbrf_classifier.py +51 -26
- snowflake/ml/modeling/xgboost/xgbrf_regressor.py +51 -26
- snowflake/ml/registry/model_registry.py +74 -56
- snowflake/ml/version.py +1 -1
- {snowflake_ml_python-1.0.2.dist-info → snowflake_ml_python-1.0.3.dist-info}/METADATA +27 -8
- snowflake_ml_python-1.0.3.dist-info/RECORD +259 -0
- snowflake_ml_python-1.0.2.dist-info/RECORD +0 -246
- {snowflake_ml_python-1.0.2.dist-info → snowflake_ml_python-1.0.3.dist-info}/WHEEL +0 -0
@@ -7,6 +7,7 @@
|
|
7
7
|
#
|
8
8
|
import inspect
|
9
9
|
import os
|
10
|
+
import posixpath
|
10
11
|
from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
|
11
12
|
from uuid import uuid4
|
12
13
|
|
@@ -27,6 +28,7 @@ from snowflake.ml._internal.utils.temp_file_utils import cleanup_temp_files, get
|
|
27
28
|
from snowflake.snowpark import DataFrame, Session
|
28
29
|
from snowflake.snowpark.functions import pandas_udf, sproc
|
29
30
|
from snowflake.snowpark.types import PandasSeries
|
31
|
+
from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
|
30
32
|
|
31
33
|
from snowflake.ml.model.model_signature import (
|
32
34
|
DataType,
|
@@ -314,7 +316,6 @@ class GridSearchCV(BaseTransformer):
|
|
314
316
|
sample_weight_col: Optional[str] = None,
|
315
317
|
) -> None:
|
316
318
|
super().__init__()
|
317
|
-
self.id = str(uuid4()).replace("-", "_").upper()
|
318
319
|
deps: Set[str] = set([f'numpy=={np.__version__}', f'scikit-learn=={sklearn.__version__}', f'cloudpickle=={cp.__version__}'])
|
319
320
|
deps = deps | _gather_dependencies(estimator)
|
320
321
|
self._deps = list(deps)
|
@@ -343,6 +344,15 @@ class GridSearchCV(BaseTransformer):
|
|
343
344
|
self.set_drop_input_cols(drop_input_cols)
|
344
345
|
self.set_sample_weight_col(sample_weight_col)
|
345
346
|
|
347
|
+
def _get_rand_id(self) -> str:
|
348
|
+
"""
|
349
|
+
Generate random id to be used in sproc and stage names.
|
350
|
+
|
351
|
+
Returns:
|
352
|
+
Random id string usable in sproc, table, and stage names.
|
353
|
+
"""
|
354
|
+
return str(uuid4()).replace("-", "_").upper()
|
355
|
+
|
346
356
|
def _infer_input_output_cols(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
|
347
357
|
"""
|
348
358
|
Infer `self.input_cols` and `self.output_cols` if they are not explicitly set.
|
@@ -421,7 +431,7 @@ class GridSearchCV(BaseTransformer):
|
|
421
431
|
cp.dump(self._sklearn_object, local_transform_file)
|
422
432
|
|
423
433
|
# Create temp stage to run fit.
|
424
|
-
transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self.
|
434
|
+
transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self._get_rand_id())
|
425
435
|
stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {transform_stage_name};"
|
426
436
|
SqlResultValidator(
|
427
437
|
session=session,
|
@@ -434,11 +444,12 @@ class GridSearchCV(BaseTransformer):
|
|
434
444
|
expected_value=f"Stage area {transform_stage_name} successfully created."
|
435
445
|
).validate()
|
436
446
|
|
437
|
-
|
447
|
+
# Use posixpath to construct stage paths
|
448
|
+
stage_transform_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
|
449
|
+
stage_result_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
|
438
450
|
local_result_file_name = get_temp_file_path()
|
439
|
-
stage_result_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
|
440
451
|
|
441
|
-
fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self.
|
452
|
+
fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self._get_rand_id())
|
442
453
|
statement_params = telemetry.get_function_usage_statement_params(
|
443
454
|
project=_PROJECT,
|
444
455
|
subproject=_SUBPROJECT,
|
@@ -464,6 +475,7 @@ class GridSearchCV(BaseTransformer):
|
|
464
475
|
replace=True,
|
465
476
|
session=session,
|
466
477
|
statement_params=statement_params,
|
478
|
+
anonymous=True
|
467
479
|
)
|
468
480
|
def fit_wrapper_sproc(
|
469
481
|
session: Session,
|
@@ -472,7 +484,8 @@ class GridSearchCV(BaseTransformer):
|
|
472
484
|
stage_result_file_name: str,
|
473
485
|
input_cols: List[str],
|
474
486
|
label_cols: List[str],
|
475
|
-
sample_weight_col: Optional[str]
|
487
|
+
sample_weight_col: Optional[str],
|
488
|
+
statement_params: Dict[str, str]
|
476
489
|
) -> str:
|
477
490
|
import cloudpickle as cp
|
478
491
|
import numpy as np
|
@@ -539,15 +552,15 @@ class GridSearchCV(BaseTransformer):
|
|
539
552
|
api_calls=[Session.call],
|
540
553
|
custom_tags=dict([("autogen", True)]),
|
541
554
|
)
|
542
|
-
sproc_export_file_name =
|
543
|
-
|
555
|
+
sproc_export_file_name = fit_wrapper_sproc(
|
556
|
+
session,
|
544
557
|
query,
|
545
558
|
stage_transform_file_name,
|
546
559
|
stage_result_file_name,
|
547
560
|
identifier.get_unescaped_names(self.input_cols),
|
548
561
|
identifier.get_unescaped_names(self.label_cols),
|
549
562
|
identifier.get_unescaped_names(self.sample_weight_col),
|
550
|
-
statement_params
|
563
|
+
statement_params,
|
551
564
|
)
|
552
565
|
|
553
566
|
if "|" in sproc_export_file_name:
|
@@ -557,7 +570,7 @@ class GridSearchCV(BaseTransformer):
|
|
557
570
|
print("\n".join(fields[1:]))
|
558
571
|
|
559
572
|
session.file.get(
|
560
|
-
|
573
|
+
posixpath.join(stage_result_file_name, sproc_export_file_name),
|
561
574
|
local_result_file_name,
|
562
575
|
statement_params=statement_params
|
563
576
|
)
|
@@ -603,7 +616,7 @@ class GridSearchCV(BaseTransformer):
|
|
603
616
|
|
604
617
|
# Register vectorized UDF for batch inference
|
605
618
|
batch_inference_udf_name = "SNOWML_BATCH_INFERENCE_{safe_id}_{method}".format(
|
606
|
-
safe_id=self.
|
619
|
+
safe_id=self._get_rand_id(), method=inference_method)
|
607
620
|
|
608
621
|
# Need to do this since if we use self._sklearn_object directly in the UDF, Snowpark
|
609
622
|
# will try to pickle all of self which fails.
|
@@ -695,7 +708,7 @@ class GridSearchCV(BaseTransformer):
|
|
695
708
|
return transformed_pandas_df.to_dict("records")
|
696
709
|
|
697
710
|
batch_inference_table_name = "SNOWML_BATCH_INFERENCE_INPUT_TABLE_{safe_id}".format(
|
698
|
-
safe_id=self.
|
711
|
+
safe_id=self._get_rand_id()
|
699
712
|
)
|
700
713
|
|
701
714
|
pass_through_columns = self._get_pass_through_columns(dataset)
|
@@ -862,11 +875,18 @@ class GridSearchCV(BaseTransformer):
|
|
862
875
|
Transformed dataset.
|
863
876
|
"""
|
864
877
|
if isinstance(dataset, DataFrame):
|
878
|
+
expected_type_inferred = ""
|
879
|
+
# when it is classifier, infer the datatype from label columns
|
880
|
+
if expected_type_inferred == "" and 'predict' in self.model_signatures:
|
881
|
+
expected_type_inferred = convert_sp_to_sf_type(
|
882
|
+
self.model_signatures['predict'].outputs[0].as_snowpark_type()
|
883
|
+
)
|
884
|
+
|
865
885
|
output_df = self._batch_inference(
|
866
886
|
dataset=dataset,
|
867
887
|
inference_method="predict",
|
868
888
|
expected_output_cols_list=self.output_cols,
|
869
|
-
expected_output_cols_type=
|
889
|
+
expected_output_cols_type=expected_type_inferred,
|
870
890
|
)
|
871
891
|
elif isinstance(dataset, pd.DataFrame):
|
872
892
|
output_df = self._sklearn_inference(
|
@@ -939,10 +959,10 @@ class GridSearchCV(BaseTransformer):
|
|
939
959
|
|
940
960
|
def _get_output_column_names(self, output_cols_prefix: str) -> List[str]:
|
941
961
|
""" Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
|
942
|
-
Returns
|
962
|
+
Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
|
943
963
|
"""
|
944
964
|
if getattr(self._sklearn_object, "classes_", None) is None:
|
945
|
-
return []
|
965
|
+
return [output_cols_prefix]
|
946
966
|
|
947
967
|
classes = self._sklearn_object.classes_
|
948
968
|
if isinstance(classes, numpy.ndarray):
|
@@ -1173,7 +1193,7 @@ class GridSearchCV(BaseTransformer):
|
|
1173
1193
|
cp.dump(self._sklearn_object, local_score_file)
|
1174
1194
|
|
1175
1195
|
# Create temp stage to run score.
|
1176
|
-
score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.
|
1196
|
+
score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
|
1177
1197
|
session = dataset._session
|
1178
1198
|
stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {score_stage_name};"
|
1179
1199
|
SqlResultValidator(
|
@@ -1187,8 +1207,9 @@ class GridSearchCV(BaseTransformer):
|
|
1187
1207
|
expected_value=f"Stage area {score_stage_name} successfully created."
|
1188
1208
|
).validate()
|
1189
1209
|
|
1190
|
-
|
1191
|
-
|
1210
|
+
# Use posixpath to construct stage paths
|
1211
|
+
stage_score_file_name = posixpath.join(score_stage_name, os.path.basename(local_score_file_name))
|
1212
|
+
score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
|
1192
1213
|
statement_params = telemetry.get_function_usage_statement_params(
|
1193
1214
|
project=_PROJECT,
|
1194
1215
|
subproject=_SUBPROJECT,
|
@@ -1214,6 +1235,7 @@ class GridSearchCV(BaseTransformer):
|
|
1214
1235
|
replace=True,
|
1215
1236
|
session=session,
|
1216
1237
|
statement_params=statement_params,
|
1238
|
+
anonymous=True
|
1217
1239
|
)
|
1218
1240
|
def score_wrapper_sproc(
|
1219
1241
|
session: Session,
|
@@ -1221,7 +1243,8 @@ class GridSearchCV(BaseTransformer):
|
|
1221
1243
|
stage_score_file_name: str,
|
1222
1244
|
input_cols: List[str],
|
1223
1245
|
label_cols: List[str],
|
1224
|
-
sample_weight_col: Optional[str]
|
1246
|
+
sample_weight_col: Optional[str],
|
1247
|
+
statement_params: Dict[str, str]
|
1225
1248
|
) -> float:
|
1226
1249
|
import cloudpickle as cp
|
1227
1250
|
import numpy as np
|
@@ -1271,14 +1294,14 @@ class GridSearchCV(BaseTransformer):
|
|
1271
1294
|
api_calls=[Session.call],
|
1272
1295
|
custom_tags=dict([("autogen", True)]),
|
1273
1296
|
)
|
1274
|
-
score =
|
1275
|
-
|
1297
|
+
score = score_wrapper_sproc(
|
1298
|
+
session,
|
1276
1299
|
query,
|
1277
1300
|
stage_score_file_name,
|
1278
1301
|
identifier.get_unescaped_names(self.input_cols),
|
1279
1302
|
identifier.get_unescaped_names(self.label_cols),
|
1280
1303
|
identifier.get_unescaped_names(self.sample_weight_col),
|
1281
|
-
statement_params
|
1304
|
+
statement_params,
|
1282
1305
|
)
|
1283
1306
|
|
1284
1307
|
cleanup_temp_files([local_score_file_name])
|
@@ -1296,18 +1319,20 @@ class GridSearchCV(BaseTransformer):
|
|
1296
1319
|
if self._sklearn_object._estimator_type == 'classifier':
|
1297
1320
|
outputs = _infer_signature(dataset[self.label_cols], "output") # label columns is the desired type for output
|
1298
1321
|
outputs = _rename_features(outputs, self.output_cols) # rename the output columns
|
1299
|
-
self._model_signature_dict["predict"] = ModelSignature(inputs,
|
1322
|
+
self._model_signature_dict["predict"] = ModelSignature(inputs,
|
1323
|
+
([] if self._drop_input_cols else inputs) + outputs)
|
1300
1324
|
# For regressor, the type of predict is float64
|
1301
1325
|
elif self._sklearn_object._estimator_type == 'regressor':
|
1302
1326
|
outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
|
1303
|
-
self._model_signature_dict["predict"] = ModelSignature(inputs,
|
1304
|
-
|
1327
|
+
self._model_signature_dict["predict"] = ModelSignature(inputs,
|
1328
|
+
([] if self._drop_input_cols else inputs) + outputs)
|
1305
1329
|
for prob_func in PROB_FUNCTIONS:
|
1306
1330
|
if hasattr(self, prob_func):
|
1307
1331
|
output_cols_prefix: str = f"{prob_func}_"
|
1308
1332
|
output_column_names = self._get_output_column_names(output_cols_prefix)
|
1309
1333
|
outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
|
1310
|
-
self._model_signature_dict[prob_func] = ModelSignature(inputs,
|
1334
|
+
self._model_signature_dict[prob_func] = ModelSignature(inputs,
|
1335
|
+
([] if self._drop_input_cols else inputs) + outputs)
|
1311
1336
|
|
1312
1337
|
@property
|
1313
1338
|
def model_signatures(self) -> Dict[str, ModelSignature]:
|
@@ -7,6 +7,7 @@
|
|
7
7
|
#
|
8
8
|
import inspect
|
9
9
|
import os
|
10
|
+
import posixpath
|
10
11
|
from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
|
11
12
|
from uuid import uuid4
|
12
13
|
|
@@ -27,6 +28,7 @@ from snowflake.ml._internal.utils.temp_file_utils import cleanup_temp_files, get
|
|
27
28
|
from snowflake.snowpark import DataFrame, Session
|
28
29
|
from snowflake.snowpark.functions import pandas_udf, sproc
|
29
30
|
from snowflake.snowpark.types import PandasSeries
|
31
|
+
from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
|
30
32
|
|
31
33
|
from snowflake.ml.model.model_signature import (
|
32
34
|
DataType,
|
@@ -327,7 +329,6 @@ class RandomizedSearchCV(BaseTransformer):
|
|
327
329
|
sample_weight_col: Optional[str] = None,
|
328
330
|
) -> None:
|
329
331
|
super().__init__()
|
330
|
-
self.id = str(uuid4()).replace("-", "_").upper()
|
331
332
|
deps: Set[str] = set([f'numpy=={np.__version__}', f'scikit-learn=={sklearn.__version__}', f'cloudpickle=={cp.__version__}'])
|
332
333
|
deps = deps | _gather_dependencies(estimator)
|
333
334
|
self._deps = list(deps)
|
@@ -358,6 +359,15 @@ class RandomizedSearchCV(BaseTransformer):
|
|
358
359
|
self.set_drop_input_cols(drop_input_cols)
|
359
360
|
self.set_sample_weight_col(sample_weight_col)
|
360
361
|
|
362
|
+
def _get_rand_id(self) -> str:
|
363
|
+
"""
|
364
|
+
Generate random id to be used in sproc and stage names.
|
365
|
+
|
366
|
+
Returns:
|
367
|
+
Random id string usable in sproc, table, and stage names.
|
368
|
+
"""
|
369
|
+
return str(uuid4()).replace("-", "_").upper()
|
370
|
+
|
361
371
|
def _infer_input_output_cols(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
|
362
372
|
"""
|
363
373
|
Infer `self.input_cols` and `self.output_cols` if they are not explicitly set.
|
@@ -436,7 +446,7 @@ class RandomizedSearchCV(BaseTransformer):
|
|
436
446
|
cp.dump(self._sklearn_object, local_transform_file)
|
437
447
|
|
438
448
|
# Create temp stage to run fit.
|
439
|
-
transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self.
|
449
|
+
transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self._get_rand_id())
|
440
450
|
stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {transform_stage_name};"
|
441
451
|
SqlResultValidator(
|
442
452
|
session=session,
|
@@ -449,11 +459,12 @@ class RandomizedSearchCV(BaseTransformer):
|
|
449
459
|
expected_value=f"Stage area {transform_stage_name} successfully created."
|
450
460
|
).validate()
|
451
461
|
|
452
|
-
|
462
|
+
# Use posixpath to construct stage paths
|
463
|
+
stage_transform_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
|
464
|
+
stage_result_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
|
453
465
|
local_result_file_name = get_temp_file_path()
|
454
|
-
stage_result_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
|
455
466
|
|
456
|
-
fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self.
|
467
|
+
fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self._get_rand_id())
|
457
468
|
statement_params = telemetry.get_function_usage_statement_params(
|
458
469
|
project=_PROJECT,
|
459
470
|
subproject=_SUBPROJECT,
|
@@ -479,6 +490,7 @@ class RandomizedSearchCV(BaseTransformer):
|
|
479
490
|
replace=True,
|
480
491
|
session=session,
|
481
492
|
statement_params=statement_params,
|
493
|
+
anonymous=True
|
482
494
|
)
|
483
495
|
def fit_wrapper_sproc(
|
484
496
|
session: Session,
|
@@ -487,7 +499,8 @@ class RandomizedSearchCV(BaseTransformer):
|
|
487
499
|
stage_result_file_name: str,
|
488
500
|
input_cols: List[str],
|
489
501
|
label_cols: List[str],
|
490
|
-
sample_weight_col: Optional[str]
|
502
|
+
sample_weight_col: Optional[str],
|
503
|
+
statement_params: Dict[str, str]
|
491
504
|
) -> str:
|
492
505
|
import cloudpickle as cp
|
493
506
|
import numpy as np
|
@@ -554,15 +567,15 @@ class RandomizedSearchCV(BaseTransformer):
|
|
554
567
|
api_calls=[Session.call],
|
555
568
|
custom_tags=dict([("autogen", True)]),
|
556
569
|
)
|
557
|
-
sproc_export_file_name =
|
558
|
-
|
570
|
+
sproc_export_file_name = fit_wrapper_sproc(
|
571
|
+
session,
|
559
572
|
query,
|
560
573
|
stage_transform_file_name,
|
561
574
|
stage_result_file_name,
|
562
575
|
identifier.get_unescaped_names(self.input_cols),
|
563
576
|
identifier.get_unescaped_names(self.label_cols),
|
564
577
|
identifier.get_unescaped_names(self.sample_weight_col),
|
565
|
-
statement_params
|
578
|
+
statement_params,
|
566
579
|
)
|
567
580
|
|
568
581
|
if "|" in sproc_export_file_name:
|
@@ -572,7 +585,7 @@ class RandomizedSearchCV(BaseTransformer):
|
|
572
585
|
print("\n".join(fields[1:]))
|
573
586
|
|
574
587
|
session.file.get(
|
575
|
-
|
588
|
+
posixpath.join(stage_result_file_name, sproc_export_file_name),
|
576
589
|
local_result_file_name,
|
577
590
|
statement_params=statement_params
|
578
591
|
)
|
@@ -618,7 +631,7 @@ class RandomizedSearchCV(BaseTransformer):
|
|
618
631
|
|
619
632
|
# Register vectorized UDF for batch inference
|
620
633
|
batch_inference_udf_name = "SNOWML_BATCH_INFERENCE_{safe_id}_{method}".format(
|
621
|
-
safe_id=self.
|
634
|
+
safe_id=self._get_rand_id(), method=inference_method)
|
622
635
|
|
623
636
|
# Need to do this since if we use self._sklearn_object directly in the UDF, Snowpark
|
624
637
|
# will try to pickle all of self which fails.
|
@@ -710,7 +723,7 @@ class RandomizedSearchCV(BaseTransformer):
|
|
710
723
|
return transformed_pandas_df.to_dict("records")
|
711
724
|
|
712
725
|
batch_inference_table_name = "SNOWML_BATCH_INFERENCE_INPUT_TABLE_{safe_id}".format(
|
713
|
-
safe_id=self.
|
726
|
+
safe_id=self._get_rand_id()
|
714
727
|
)
|
715
728
|
|
716
729
|
pass_through_columns = self._get_pass_through_columns(dataset)
|
@@ -877,11 +890,18 @@ class RandomizedSearchCV(BaseTransformer):
|
|
877
890
|
Transformed dataset.
|
878
891
|
"""
|
879
892
|
if isinstance(dataset, DataFrame):
|
893
|
+
expected_type_inferred = ""
|
894
|
+
# when it is classifier, infer the datatype from label columns
|
895
|
+
if expected_type_inferred == "" and 'predict' in self.model_signatures:
|
896
|
+
expected_type_inferred = convert_sp_to_sf_type(
|
897
|
+
self.model_signatures['predict'].outputs[0].as_snowpark_type()
|
898
|
+
)
|
899
|
+
|
880
900
|
output_df = self._batch_inference(
|
881
901
|
dataset=dataset,
|
882
902
|
inference_method="predict",
|
883
903
|
expected_output_cols_list=self.output_cols,
|
884
|
-
expected_output_cols_type=
|
904
|
+
expected_output_cols_type=expected_type_inferred,
|
885
905
|
)
|
886
906
|
elif isinstance(dataset, pd.DataFrame):
|
887
907
|
output_df = self._sklearn_inference(
|
@@ -954,10 +974,10 @@ class RandomizedSearchCV(BaseTransformer):
|
|
954
974
|
|
955
975
|
def _get_output_column_names(self, output_cols_prefix: str) -> List[str]:
|
956
976
|
""" Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
|
957
|
-
Returns
|
977
|
+
Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
|
958
978
|
"""
|
959
979
|
if getattr(self._sklearn_object, "classes_", None) is None:
|
960
|
-
return []
|
980
|
+
return [output_cols_prefix]
|
961
981
|
|
962
982
|
classes = self._sklearn_object.classes_
|
963
983
|
if isinstance(classes, numpy.ndarray):
|
@@ -1188,7 +1208,7 @@ class RandomizedSearchCV(BaseTransformer):
|
|
1188
1208
|
cp.dump(self._sklearn_object, local_score_file)
|
1189
1209
|
|
1190
1210
|
# Create temp stage to run score.
|
1191
|
-
score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.
|
1211
|
+
score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
|
1192
1212
|
session = dataset._session
|
1193
1213
|
stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {score_stage_name};"
|
1194
1214
|
SqlResultValidator(
|
@@ -1202,8 +1222,9 @@ class RandomizedSearchCV(BaseTransformer):
|
|
1202
1222
|
expected_value=f"Stage area {score_stage_name} successfully created."
|
1203
1223
|
).validate()
|
1204
1224
|
|
1205
|
-
|
1206
|
-
|
1225
|
+
# Use posixpath to construct stage paths
|
1226
|
+
stage_score_file_name = posixpath.join(score_stage_name, os.path.basename(local_score_file_name))
|
1227
|
+
score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
|
1207
1228
|
statement_params = telemetry.get_function_usage_statement_params(
|
1208
1229
|
project=_PROJECT,
|
1209
1230
|
subproject=_SUBPROJECT,
|
@@ -1229,6 +1250,7 @@ class RandomizedSearchCV(BaseTransformer):
|
|
1229
1250
|
replace=True,
|
1230
1251
|
session=session,
|
1231
1252
|
statement_params=statement_params,
|
1253
|
+
anonymous=True
|
1232
1254
|
)
|
1233
1255
|
def score_wrapper_sproc(
|
1234
1256
|
session: Session,
|
@@ -1236,7 +1258,8 @@ class RandomizedSearchCV(BaseTransformer):
|
|
1236
1258
|
stage_score_file_name: str,
|
1237
1259
|
input_cols: List[str],
|
1238
1260
|
label_cols: List[str],
|
1239
|
-
sample_weight_col: Optional[str]
|
1261
|
+
sample_weight_col: Optional[str],
|
1262
|
+
statement_params: Dict[str, str]
|
1240
1263
|
) -> float:
|
1241
1264
|
import cloudpickle as cp
|
1242
1265
|
import numpy as np
|
@@ -1286,14 +1309,14 @@ class RandomizedSearchCV(BaseTransformer):
|
|
1286
1309
|
api_calls=[Session.call],
|
1287
1310
|
custom_tags=dict([("autogen", True)]),
|
1288
1311
|
)
|
1289
|
-
score =
|
1290
|
-
|
1312
|
+
score = score_wrapper_sproc(
|
1313
|
+
session,
|
1291
1314
|
query,
|
1292
1315
|
stage_score_file_name,
|
1293
1316
|
identifier.get_unescaped_names(self.input_cols),
|
1294
1317
|
identifier.get_unescaped_names(self.label_cols),
|
1295
1318
|
identifier.get_unescaped_names(self.sample_weight_col),
|
1296
|
-
statement_params
|
1319
|
+
statement_params,
|
1297
1320
|
)
|
1298
1321
|
|
1299
1322
|
cleanup_temp_files([local_score_file_name])
|
@@ -1311,18 +1334,20 @@ class RandomizedSearchCV(BaseTransformer):
|
|
1311
1334
|
if self._sklearn_object._estimator_type == 'classifier':
|
1312
1335
|
outputs = _infer_signature(dataset[self.label_cols], "output") # label columns is the desired type for output
|
1313
1336
|
outputs = _rename_features(outputs, self.output_cols) # rename the output columns
|
1314
|
-
self._model_signature_dict["predict"] = ModelSignature(inputs,
|
1337
|
+
self._model_signature_dict["predict"] = ModelSignature(inputs,
|
1338
|
+
([] if self._drop_input_cols else inputs) + outputs)
|
1315
1339
|
# For regressor, the type of predict is float64
|
1316
1340
|
elif self._sklearn_object._estimator_type == 'regressor':
|
1317
1341
|
outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
|
1318
|
-
self._model_signature_dict["predict"] = ModelSignature(inputs,
|
1319
|
-
|
1342
|
+
self._model_signature_dict["predict"] = ModelSignature(inputs,
|
1343
|
+
([] if self._drop_input_cols else inputs) + outputs)
|
1320
1344
|
for prob_func in PROB_FUNCTIONS:
|
1321
1345
|
if hasattr(self, prob_func):
|
1322
1346
|
output_cols_prefix: str = f"{prob_func}_"
|
1323
1347
|
output_column_names = self._get_output_column_names(output_cols_prefix)
|
1324
1348
|
outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
|
1325
|
-
self._model_signature_dict[prob_func] = ModelSignature(inputs,
|
1349
|
+
self._model_signature_dict[prob_func] = ModelSignature(inputs,
|
1350
|
+
([] if self._drop_input_cols else inputs) + outputs)
|
1326
1351
|
|
1327
1352
|
@property
|
1328
1353
|
def model_signatures(self) -> Dict[str, ModelSignature]:
|