snowflake-ml-python 1.0.2__py3-none-any.whl → 1.0.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- snowflake/ml/_internal/env_utils.py +2 -1
- snowflake/ml/_internal/file_utils.py +29 -7
- snowflake/ml/_internal/telemetry.py +5 -8
- snowflake/ml/_internal/utils/uri.py +7 -2
- snowflake/ml/model/_deploy_client/image_builds/base_image_builder.py +15 -0
- snowflake/ml/model/_deploy_client/image_builds/client_image_builder.py +259 -0
- snowflake/ml/model/_deploy_client/image_builds/docker_context.py +89 -0
- snowflake/ml/model/_deploy_client/image_builds/gunicorn_run.sh +24 -0
- snowflake/ml/model/_deploy_client/image_builds/inference_server/main.py +118 -0
- snowflake/ml/model/_deploy_client/image_builds/templates/dockerfile_template +40 -0
- snowflake/ml/model/_deploy_client/snowservice/deploy.py +199 -0
- snowflake/ml/model/_deploy_client/snowservice/deploy_options.py +88 -0
- snowflake/ml/model/_deploy_client/snowservice/templates/service_spec_template +24 -0
- snowflake/ml/model/_deploy_client/utils/constants.py +47 -0
- snowflake/ml/model/_deploy_client/utils/snowservice_client.py +178 -0
- snowflake/ml/model/_deploy_client/warehouse/deploy.py +24 -6
- snowflake/ml/model/_deploy_client/warehouse/infer_template.py +5 -2
- snowflake/ml/model/_deployer.py +14 -27
- snowflake/ml/model/_env.py +4 -4
- snowflake/ml/model/_handlers/custom.py +14 -2
- snowflake/ml/model/_handlers/pytorch.py +186 -0
- snowflake/ml/model/_handlers/sklearn.py +14 -9
- snowflake/ml/model/_handlers/snowmlmodel.py +14 -9
- snowflake/ml/model/_handlers/torchscript.py +180 -0
- snowflake/ml/model/_handlers/xgboost.py +19 -9
- snowflake/ml/model/_model.py +3 -2
- snowflake/ml/model/_model_meta.py +12 -7
- snowflake/ml/model/model_signature.py +446 -66
- snowflake/ml/model/type_hints.py +23 -4
- snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +51 -26
- snowflake/ml/modeling/cluster/affinity_propagation.py +51 -26
- snowflake/ml/modeling/cluster/agglomerative_clustering.py +51 -26
- snowflake/ml/modeling/cluster/birch.py +51 -26
- snowflake/ml/modeling/cluster/bisecting_k_means.py +51 -26
- snowflake/ml/modeling/cluster/dbscan.py +51 -26
- snowflake/ml/modeling/cluster/feature_agglomeration.py +51 -26
- snowflake/ml/modeling/cluster/k_means.py +51 -26
- snowflake/ml/modeling/cluster/mean_shift.py +51 -26
- snowflake/ml/modeling/cluster/mini_batch_k_means.py +51 -26
- snowflake/ml/modeling/cluster/optics.py +51 -26
- snowflake/ml/modeling/cluster/spectral_biclustering.py +51 -26
- snowflake/ml/modeling/cluster/spectral_clustering.py +51 -26
- snowflake/ml/modeling/cluster/spectral_coclustering.py +51 -26
- snowflake/ml/modeling/compose/column_transformer.py +51 -26
- snowflake/ml/modeling/compose/transformed_target_regressor.py +51 -26
- snowflake/ml/modeling/covariance/elliptic_envelope.py +51 -26
- snowflake/ml/modeling/covariance/empirical_covariance.py +51 -26
- snowflake/ml/modeling/covariance/graphical_lasso.py +51 -26
- snowflake/ml/modeling/covariance/graphical_lasso_cv.py +51 -26
- snowflake/ml/modeling/covariance/ledoit_wolf.py +51 -26
- snowflake/ml/modeling/covariance/min_cov_det.py +51 -26
- snowflake/ml/modeling/covariance/oas.py +51 -26
- snowflake/ml/modeling/covariance/shrunk_covariance.py +51 -26
- snowflake/ml/modeling/decomposition/dictionary_learning.py +51 -26
- snowflake/ml/modeling/decomposition/factor_analysis.py +51 -26
- snowflake/ml/modeling/decomposition/fast_ica.py +51 -26
- snowflake/ml/modeling/decomposition/incremental_pca.py +51 -26
- snowflake/ml/modeling/decomposition/kernel_pca.py +51 -26
- snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +51 -26
- snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +51 -26
- snowflake/ml/modeling/decomposition/pca.py +51 -26
- snowflake/ml/modeling/decomposition/sparse_pca.py +51 -26
- snowflake/ml/modeling/decomposition/truncated_svd.py +51 -26
- snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +51 -26
- snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +51 -26
- snowflake/ml/modeling/ensemble/ada_boost_classifier.py +51 -26
- snowflake/ml/modeling/ensemble/ada_boost_regressor.py +51 -26
- snowflake/ml/modeling/ensemble/bagging_classifier.py +51 -26
- snowflake/ml/modeling/ensemble/bagging_regressor.py +51 -26
- snowflake/ml/modeling/ensemble/extra_trees_classifier.py +51 -26
- snowflake/ml/modeling/ensemble/extra_trees_regressor.py +51 -26
- snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +51 -26
- snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +51 -26
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +51 -26
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +51 -26
- snowflake/ml/modeling/ensemble/isolation_forest.py +51 -26
- snowflake/ml/modeling/ensemble/random_forest_classifier.py +51 -26
- snowflake/ml/modeling/ensemble/random_forest_regressor.py +51 -26
- snowflake/ml/modeling/ensemble/stacking_regressor.py +51 -26
- snowflake/ml/modeling/ensemble/voting_classifier.py +51 -26
- snowflake/ml/modeling/ensemble/voting_regressor.py +51 -26
- snowflake/ml/modeling/feature_selection/generic_univariate_select.py +51 -26
- snowflake/ml/modeling/feature_selection/select_fdr.py +51 -26
- snowflake/ml/modeling/feature_selection/select_fpr.py +51 -26
- snowflake/ml/modeling/feature_selection/select_fwe.py +51 -26
- snowflake/ml/modeling/feature_selection/select_k_best.py +51 -26
- snowflake/ml/modeling/feature_selection/select_percentile.py +51 -26
- snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +51 -26
- snowflake/ml/modeling/feature_selection/variance_threshold.py +51 -26
- snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +51 -26
- snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +51 -26
- snowflake/ml/modeling/impute/iterative_imputer.py +51 -26
- snowflake/ml/modeling/impute/knn_imputer.py +51 -26
- snowflake/ml/modeling/impute/missing_indicator.py +51 -26
- snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +51 -26
- snowflake/ml/modeling/kernel_approximation/nystroem.py +51 -26
- snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +51 -26
- snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +51 -26
- snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +51 -26
- snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +51 -26
- snowflake/ml/modeling/lightgbm/lgbm_classifier.py +51 -26
- snowflake/ml/modeling/lightgbm/lgbm_regressor.py +51 -26
- snowflake/ml/modeling/linear_model/ard_regression.py +51 -26
- snowflake/ml/modeling/linear_model/bayesian_ridge.py +51 -26
- snowflake/ml/modeling/linear_model/elastic_net.py +51 -26
- snowflake/ml/modeling/linear_model/elastic_net_cv.py +51 -26
- snowflake/ml/modeling/linear_model/gamma_regressor.py +51 -26
- snowflake/ml/modeling/linear_model/huber_regressor.py +51 -26
- snowflake/ml/modeling/linear_model/lars.py +51 -26
- snowflake/ml/modeling/linear_model/lars_cv.py +51 -26
- snowflake/ml/modeling/linear_model/lasso.py +51 -26
- snowflake/ml/modeling/linear_model/lasso_cv.py +51 -26
- snowflake/ml/modeling/linear_model/lasso_lars.py +51 -26
- snowflake/ml/modeling/linear_model/lasso_lars_cv.py +51 -26
- snowflake/ml/modeling/linear_model/lasso_lars_ic.py +51 -26
- snowflake/ml/modeling/linear_model/linear_regression.py +51 -26
- snowflake/ml/modeling/linear_model/logistic_regression.py +51 -26
- snowflake/ml/modeling/linear_model/logistic_regression_cv.py +51 -26
- snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +51 -26
- snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +51 -26
- snowflake/ml/modeling/linear_model/multi_task_lasso.py +51 -26
- snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +51 -26
- snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +51 -26
- snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +51 -26
- snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +51 -26
- snowflake/ml/modeling/linear_model/perceptron.py +51 -26
- snowflake/ml/modeling/linear_model/poisson_regressor.py +51 -26
- snowflake/ml/modeling/linear_model/ransac_regressor.py +51 -26
- snowflake/ml/modeling/linear_model/ridge.py +51 -26
- snowflake/ml/modeling/linear_model/ridge_classifier.py +51 -26
- snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +51 -26
- snowflake/ml/modeling/linear_model/ridge_cv.py +51 -26
- snowflake/ml/modeling/linear_model/sgd_classifier.py +51 -26
- snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +51 -26
- snowflake/ml/modeling/linear_model/sgd_regressor.py +51 -26
- snowflake/ml/modeling/linear_model/theil_sen_regressor.py +51 -26
- snowflake/ml/modeling/linear_model/tweedie_regressor.py +51 -26
- snowflake/ml/modeling/manifold/isomap.py +51 -26
- snowflake/ml/modeling/manifold/mds.py +51 -26
- snowflake/ml/modeling/manifold/spectral_embedding.py +51 -26
- snowflake/ml/modeling/manifold/tsne.py +51 -26
- snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +51 -26
- snowflake/ml/modeling/mixture/gaussian_mixture.py +51 -26
- snowflake/ml/modeling/model_selection/grid_search_cv.py +51 -26
- snowflake/ml/modeling/model_selection/randomized_search_cv.py +51 -26
- snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +51 -26
- snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +51 -26
- snowflake/ml/modeling/multiclass/output_code_classifier.py +51 -26
- snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +51 -26
- snowflake/ml/modeling/naive_bayes/categorical_nb.py +51 -26
- snowflake/ml/modeling/naive_bayes/complement_nb.py +51 -26
- snowflake/ml/modeling/naive_bayes/gaussian_nb.py +51 -26
- snowflake/ml/modeling/naive_bayes/multinomial_nb.py +51 -26
- snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +51 -26
- snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +51 -26
- snowflake/ml/modeling/neighbors/kernel_density.py +51 -26
- snowflake/ml/modeling/neighbors/local_outlier_factor.py +51 -26
- snowflake/ml/modeling/neighbors/nearest_centroid.py +51 -26
- snowflake/ml/modeling/neighbors/nearest_neighbors.py +51 -26
- snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +51 -26
- snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +51 -26
- snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +51 -26
- snowflake/ml/modeling/neural_network/bernoulli_rbm.py +51 -26
- snowflake/ml/modeling/neural_network/mlp_classifier.py +51 -26
- snowflake/ml/modeling/neural_network/mlp_regressor.py +51 -26
- snowflake/ml/modeling/preprocessing/ordinal_encoder.py +2 -0
- snowflake/ml/modeling/preprocessing/polynomial_features.py +51 -26
- snowflake/ml/modeling/semi_supervised/label_propagation.py +51 -26
- snowflake/ml/modeling/semi_supervised/label_spreading.py +51 -26
- snowflake/ml/modeling/svm/linear_svc.py +51 -26
- snowflake/ml/modeling/svm/linear_svr.py +51 -26
- snowflake/ml/modeling/svm/nu_svc.py +51 -26
- snowflake/ml/modeling/svm/nu_svr.py +51 -26
- snowflake/ml/modeling/svm/svc.py +51 -26
- snowflake/ml/modeling/svm/svr.py +51 -26
- snowflake/ml/modeling/tree/decision_tree_classifier.py +51 -26
- snowflake/ml/modeling/tree/decision_tree_regressor.py +51 -26
- snowflake/ml/modeling/tree/extra_tree_classifier.py +51 -26
- snowflake/ml/modeling/tree/extra_tree_regressor.py +51 -26
- snowflake/ml/modeling/xgboost/xgb_classifier.py +51 -26
- snowflake/ml/modeling/xgboost/xgb_regressor.py +51 -26
- snowflake/ml/modeling/xgboost/xgbrf_classifier.py +51 -26
- snowflake/ml/modeling/xgboost/xgbrf_regressor.py +51 -26
- snowflake/ml/registry/model_registry.py +74 -56
- snowflake/ml/version.py +1 -1
- {snowflake_ml_python-1.0.2.dist-info → snowflake_ml_python-1.0.3.dist-info}/METADATA +27 -8
- snowflake_ml_python-1.0.3.dist-info/RECORD +259 -0
- snowflake_ml_python-1.0.2.dist-info/RECORD +0 -246
- {snowflake_ml_python-1.0.2.dist-info → snowflake_ml_python-1.0.3.dist-info}/WHEEL +0 -0
@@ -7,6 +7,7 @@
|
|
7
7
|
#
|
8
8
|
import inspect
|
9
9
|
import os
|
10
|
+
import posixpath
|
10
11
|
from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
|
11
12
|
from uuid import uuid4
|
12
13
|
|
@@ -27,6 +28,7 @@ from snowflake.ml._internal.utils.temp_file_utils import cleanup_temp_files, get
|
|
27
28
|
from snowflake.snowpark import DataFrame, Session
|
28
29
|
from snowflake.snowpark.functions import pandas_udf, sproc
|
29
30
|
from snowflake.snowpark.types import PandasSeries
|
31
|
+
from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
|
30
32
|
|
31
33
|
from snowflake.ml.model.model_signature import (
|
32
34
|
DataType,
|
@@ -262,7 +264,6 @@ class BisectingKMeans(BaseTransformer):
|
|
262
264
|
sample_weight_col: Optional[str] = None,
|
263
265
|
) -> None:
|
264
266
|
super().__init__()
|
265
|
-
self.id = str(uuid4()).replace("-", "_").upper()
|
266
267
|
deps: Set[str] = set([f'numpy=={np.__version__}', f'scikit-learn=={sklearn.__version__}', f'cloudpickle=={cp.__version__}'])
|
267
268
|
|
268
269
|
self._deps = list(deps)
|
@@ -291,6 +292,15 @@ class BisectingKMeans(BaseTransformer):
|
|
291
292
|
self.set_drop_input_cols(drop_input_cols)
|
292
293
|
self.set_sample_weight_col(sample_weight_col)
|
293
294
|
|
295
|
+
def _get_rand_id(self) -> str:
|
296
|
+
"""
|
297
|
+
Generate random id to be used in sproc and stage names.
|
298
|
+
|
299
|
+
Returns:
|
300
|
+
Random id string usable in sproc, table, and stage names.
|
301
|
+
"""
|
302
|
+
return str(uuid4()).replace("-", "_").upper()
|
303
|
+
|
294
304
|
def _infer_input_output_cols(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
|
295
305
|
"""
|
296
306
|
Infer `self.input_cols` and `self.output_cols` if they are not explicitly set.
|
@@ -369,7 +379,7 @@ class BisectingKMeans(BaseTransformer):
|
|
369
379
|
cp.dump(self._sklearn_object, local_transform_file)
|
370
380
|
|
371
381
|
# Create temp stage to run fit.
|
372
|
-
transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self.
|
382
|
+
transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self._get_rand_id())
|
373
383
|
stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {transform_stage_name};"
|
374
384
|
SqlResultValidator(
|
375
385
|
session=session,
|
@@ -382,11 +392,12 @@ class BisectingKMeans(BaseTransformer):
|
|
382
392
|
expected_value=f"Stage area {transform_stage_name} successfully created."
|
383
393
|
).validate()
|
384
394
|
|
385
|
-
|
395
|
+
# Use posixpath to construct stage paths
|
396
|
+
stage_transform_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
|
397
|
+
stage_result_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
|
386
398
|
local_result_file_name = get_temp_file_path()
|
387
|
-
stage_result_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
|
388
399
|
|
389
|
-
fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self.
|
400
|
+
fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self._get_rand_id())
|
390
401
|
statement_params = telemetry.get_function_usage_statement_params(
|
391
402
|
project=_PROJECT,
|
392
403
|
subproject=_SUBPROJECT,
|
@@ -412,6 +423,7 @@ class BisectingKMeans(BaseTransformer):
|
|
412
423
|
replace=True,
|
413
424
|
session=session,
|
414
425
|
statement_params=statement_params,
|
426
|
+
anonymous=True
|
415
427
|
)
|
416
428
|
def fit_wrapper_sproc(
|
417
429
|
session: Session,
|
@@ -420,7 +432,8 @@ class BisectingKMeans(BaseTransformer):
|
|
420
432
|
stage_result_file_name: str,
|
421
433
|
input_cols: List[str],
|
422
434
|
label_cols: List[str],
|
423
|
-
sample_weight_col: Optional[str]
|
435
|
+
sample_weight_col: Optional[str],
|
436
|
+
statement_params: Dict[str, str]
|
424
437
|
) -> str:
|
425
438
|
import cloudpickle as cp
|
426
439
|
import numpy as np
|
@@ -487,15 +500,15 @@ class BisectingKMeans(BaseTransformer):
|
|
487
500
|
api_calls=[Session.call],
|
488
501
|
custom_tags=dict([("autogen", True)]),
|
489
502
|
)
|
490
|
-
sproc_export_file_name =
|
491
|
-
|
503
|
+
sproc_export_file_name = fit_wrapper_sproc(
|
504
|
+
session,
|
492
505
|
query,
|
493
506
|
stage_transform_file_name,
|
494
507
|
stage_result_file_name,
|
495
508
|
identifier.get_unescaped_names(self.input_cols),
|
496
509
|
identifier.get_unescaped_names(self.label_cols),
|
497
510
|
identifier.get_unescaped_names(self.sample_weight_col),
|
498
|
-
statement_params
|
511
|
+
statement_params,
|
499
512
|
)
|
500
513
|
|
501
514
|
if "|" in sproc_export_file_name:
|
@@ -505,7 +518,7 @@ class BisectingKMeans(BaseTransformer):
|
|
505
518
|
print("\n".join(fields[1:]))
|
506
519
|
|
507
520
|
session.file.get(
|
508
|
-
|
521
|
+
posixpath.join(stage_result_file_name, sproc_export_file_name),
|
509
522
|
local_result_file_name,
|
510
523
|
statement_params=statement_params
|
511
524
|
)
|
@@ -551,7 +564,7 @@ class BisectingKMeans(BaseTransformer):
|
|
551
564
|
|
552
565
|
# Register vectorized UDF for batch inference
|
553
566
|
batch_inference_udf_name = "SNOWML_BATCH_INFERENCE_{safe_id}_{method}".format(
|
554
|
-
safe_id=self.
|
567
|
+
safe_id=self._get_rand_id(), method=inference_method)
|
555
568
|
|
556
569
|
# Need to do this since if we use self._sklearn_object directly in the UDF, Snowpark
|
557
570
|
# will try to pickle all of self which fails.
|
@@ -643,7 +656,7 @@ class BisectingKMeans(BaseTransformer):
|
|
643
656
|
return transformed_pandas_df.to_dict("records")
|
644
657
|
|
645
658
|
batch_inference_table_name = "SNOWML_BATCH_INFERENCE_INPUT_TABLE_{safe_id}".format(
|
646
|
-
safe_id=self.
|
659
|
+
safe_id=self._get_rand_id()
|
647
660
|
)
|
648
661
|
|
649
662
|
pass_through_columns = self._get_pass_through_columns(dataset)
|
@@ -810,11 +823,18 @@ class BisectingKMeans(BaseTransformer):
|
|
810
823
|
Transformed dataset.
|
811
824
|
"""
|
812
825
|
if isinstance(dataset, DataFrame):
|
826
|
+
expected_type_inferred = ""
|
827
|
+
# when it is classifier, infer the datatype from label columns
|
828
|
+
if expected_type_inferred == "" and 'predict' in self.model_signatures:
|
829
|
+
expected_type_inferred = convert_sp_to_sf_type(
|
830
|
+
self.model_signatures['predict'].outputs[0].as_snowpark_type()
|
831
|
+
)
|
832
|
+
|
813
833
|
output_df = self._batch_inference(
|
814
834
|
dataset=dataset,
|
815
835
|
inference_method="predict",
|
816
836
|
expected_output_cols_list=self.output_cols,
|
817
|
-
expected_output_cols_type=
|
837
|
+
expected_output_cols_type=expected_type_inferred,
|
818
838
|
)
|
819
839
|
elif isinstance(dataset, pd.DataFrame):
|
820
840
|
output_df = self._sklearn_inference(
|
@@ -887,10 +907,10 @@ class BisectingKMeans(BaseTransformer):
|
|
887
907
|
|
888
908
|
def _get_output_column_names(self, output_cols_prefix: str) -> List[str]:
|
889
909
|
""" Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
|
890
|
-
Returns
|
910
|
+
Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
|
891
911
|
"""
|
892
912
|
if getattr(self._sklearn_object, "classes_", None) is None:
|
893
|
-
return []
|
913
|
+
return [output_cols_prefix]
|
894
914
|
|
895
915
|
classes = self._sklearn_object.classes_
|
896
916
|
if isinstance(classes, numpy.ndarray):
|
@@ -1115,7 +1135,7 @@ class BisectingKMeans(BaseTransformer):
|
|
1115
1135
|
cp.dump(self._sklearn_object, local_score_file)
|
1116
1136
|
|
1117
1137
|
# Create temp stage to run score.
|
1118
|
-
score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.
|
1138
|
+
score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
|
1119
1139
|
session = dataset._session
|
1120
1140
|
stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {score_stage_name};"
|
1121
1141
|
SqlResultValidator(
|
@@ -1129,8 +1149,9 @@ class BisectingKMeans(BaseTransformer):
|
|
1129
1149
|
expected_value=f"Stage area {score_stage_name} successfully created."
|
1130
1150
|
).validate()
|
1131
1151
|
|
1132
|
-
|
1133
|
-
|
1152
|
+
# Use posixpath to construct stage paths
|
1153
|
+
stage_score_file_name = posixpath.join(score_stage_name, os.path.basename(local_score_file_name))
|
1154
|
+
score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
|
1134
1155
|
statement_params = telemetry.get_function_usage_statement_params(
|
1135
1156
|
project=_PROJECT,
|
1136
1157
|
subproject=_SUBPROJECT,
|
@@ -1156,6 +1177,7 @@ class BisectingKMeans(BaseTransformer):
|
|
1156
1177
|
replace=True,
|
1157
1178
|
session=session,
|
1158
1179
|
statement_params=statement_params,
|
1180
|
+
anonymous=True
|
1159
1181
|
)
|
1160
1182
|
def score_wrapper_sproc(
|
1161
1183
|
session: Session,
|
@@ -1163,7 +1185,8 @@ class BisectingKMeans(BaseTransformer):
|
|
1163
1185
|
stage_score_file_name: str,
|
1164
1186
|
input_cols: List[str],
|
1165
1187
|
label_cols: List[str],
|
1166
|
-
sample_weight_col: Optional[str]
|
1188
|
+
sample_weight_col: Optional[str],
|
1189
|
+
statement_params: Dict[str, str]
|
1167
1190
|
) -> float:
|
1168
1191
|
import cloudpickle as cp
|
1169
1192
|
import numpy as np
|
@@ -1213,14 +1236,14 @@ class BisectingKMeans(BaseTransformer):
|
|
1213
1236
|
api_calls=[Session.call],
|
1214
1237
|
custom_tags=dict([("autogen", True)]),
|
1215
1238
|
)
|
1216
|
-
score =
|
1217
|
-
|
1239
|
+
score = score_wrapper_sproc(
|
1240
|
+
session,
|
1218
1241
|
query,
|
1219
1242
|
stage_score_file_name,
|
1220
1243
|
identifier.get_unescaped_names(self.input_cols),
|
1221
1244
|
identifier.get_unescaped_names(self.label_cols),
|
1222
1245
|
identifier.get_unescaped_names(self.sample_weight_col),
|
1223
|
-
statement_params
|
1246
|
+
statement_params,
|
1224
1247
|
)
|
1225
1248
|
|
1226
1249
|
cleanup_temp_files([local_score_file_name])
|
@@ -1238,18 +1261,20 @@ class BisectingKMeans(BaseTransformer):
|
|
1238
1261
|
if self._sklearn_object._estimator_type == 'classifier':
|
1239
1262
|
outputs = _infer_signature(dataset[self.label_cols], "output") # label columns is the desired type for output
|
1240
1263
|
outputs = _rename_features(outputs, self.output_cols) # rename the output columns
|
1241
|
-
self._model_signature_dict["predict"] = ModelSignature(inputs,
|
1264
|
+
self._model_signature_dict["predict"] = ModelSignature(inputs,
|
1265
|
+
([] if self._drop_input_cols else inputs) + outputs)
|
1242
1266
|
# For regressor, the type of predict is float64
|
1243
1267
|
elif self._sklearn_object._estimator_type == 'regressor':
|
1244
1268
|
outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
|
1245
|
-
self._model_signature_dict["predict"] = ModelSignature(inputs,
|
1246
|
-
|
1269
|
+
self._model_signature_dict["predict"] = ModelSignature(inputs,
|
1270
|
+
([] if self._drop_input_cols else inputs) + outputs)
|
1247
1271
|
for prob_func in PROB_FUNCTIONS:
|
1248
1272
|
if hasattr(self, prob_func):
|
1249
1273
|
output_cols_prefix: str = f"{prob_func}_"
|
1250
1274
|
output_column_names = self._get_output_column_names(output_cols_prefix)
|
1251
1275
|
outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
|
1252
|
-
self._model_signature_dict[prob_func] = ModelSignature(inputs,
|
1276
|
+
self._model_signature_dict[prob_func] = ModelSignature(inputs,
|
1277
|
+
([] if self._drop_input_cols else inputs) + outputs)
|
1253
1278
|
|
1254
1279
|
@property
|
1255
1280
|
def model_signatures(self) -> Dict[str, ModelSignature]:
|
@@ -7,6 +7,7 @@
|
|
7
7
|
#
|
8
8
|
import inspect
|
9
9
|
import os
|
10
|
+
import posixpath
|
10
11
|
from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
|
11
12
|
from uuid import uuid4
|
12
13
|
|
@@ -27,6 +28,7 @@ from snowflake.ml._internal.utils.temp_file_utils import cleanup_temp_files, get
|
|
27
28
|
from snowflake.snowpark import DataFrame, Session
|
28
29
|
from snowflake.snowpark.functions import pandas_udf, sproc
|
29
30
|
from snowflake.snowpark.types import PandasSeries
|
31
|
+
from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
|
30
32
|
|
31
33
|
from snowflake.ml.model.model_signature import (
|
32
34
|
DataType,
|
@@ -232,7 +234,6 @@ class DBSCAN(BaseTransformer):
|
|
232
234
|
sample_weight_col: Optional[str] = None,
|
233
235
|
) -> None:
|
234
236
|
super().__init__()
|
235
|
-
self.id = str(uuid4()).replace("-", "_").upper()
|
236
237
|
deps: Set[str] = set([f'numpy=={np.__version__}', f'scikit-learn=={sklearn.__version__}', f'cloudpickle=={cp.__version__}'])
|
237
238
|
|
238
239
|
self._deps = list(deps)
|
@@ -259,6 +260,15 @@ class DBSCAN(BaseTransformer):
|
|
259
260
|
self.set_drop_input_cols(drop_input_cols)
|
260
261
|
self.set_sample_weight_col(sample_weight_col)
|
261
262
|
|
263
|
+
def _get_rand_id(self) -> str:
|
264
|
+
"""
|
265
|
+
Generate random id to be used in sproc and stage names.
|
266
|
+
|
267
|
+
Returns:
|
268
|
+
Random id string usable in sproc, table, and stage names.
|
269
|
+
"""
|
270
|
+
return str(uuid4()).replace("-", "_").upper()
|
271
|
+
|
262
272
|
def _infer_input_output_cols(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
|
263
273
|
"""
|
264
274
|
Infer `self.input_cols` and `self.output_cols` if they are not explicitly set.
|
@@ -337,7 +347,7 @@ class DBSCAN(BaseTransformer):
|
|
337
347
|
cp.dump(self._sklearn_object, local_transform_file)
|
338
348
|
|
339
349
|
# Create temp stage to run fit.
|
340
|
-
transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self.
|
350
|
+
transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self._get_rand_id())
|
341
351
|
stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {transform_stage_name};"
|
342
352
|
SqlResultValidator(
|
343
353
|
session=session,
|
@@ -350,11 +360,12 @@ class DBSCAN(BaseTransformer):
|
|
350
360
|
expected_value=f"Stage area {transform_stage_name} successfully created."
|
351
361
|
).validate()
|
352
362
|
|
353
|
-
|
363
|
+
# Use posixpath to construct stage paths
|
364
|
+
stage_transform_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
|
365
|
+
stage_result_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
|
354
366
|
local_result_file_name = get_temp_file_path()
|
355
|
-
stage_result_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
|
356
367
|
|
357
|
-
fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self.
|
368
|
+
fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self._get_rand_id())
|
358
369
|
statement_params = telemetry.get_function_usage_statement_params(
|
359
370
|
project=_PROJECT,
|
360
371
|
subproject=_SUBPROJECT,
|
@@ -380,6 +391,7 @@ class DBSCAN(BaseTransformer):
|
|
380
391
|
replace=True,
|
381
392
|
session=session,
|
382
393
|
statement_params=statement_params,
|
394
|
+
anonymous=True
|
383
395
|
)
|
384
396
|
def fit_wrapper_sproc(
|
385
397
|
session: Session,
|
@@ -388,7 +400,8 @@ class DBSCAN(BaseTransformer):
|
|
388
400
|
stage_result_file_name: str,
|
389
401
|
input_cols: List[str],
|
390
402
|
label_cols: List[str],
|
391
|
-
sample_weight_col: Optional[str]
|
403
|
+
sample_weight_col: Optional[str],
|
404
|
+
statement_params: Dict[str, str]
|
392
405
|
) -> str:
|
393
406
|
import cloudpickle as cp
|
394
407
|
import numpy as np
|
@@ -455,15 +468,15 @@ class DBSCAN(BaseTransformer):
|
|
455
468
|
api_calls=[Session.call],
|
456
469
|
custom_tags=dict([("autogen", True)]),
|
457
470
|
)
|
458
|
-
sproc_export_file_name =
|
459
|
-
|
471
|
+
sproc_export_file_name = fit_wrapper_sproc(
|
472
|
+
session,
|
460
473
|
query,
|
461
474
|
stage_transform_file_name,
|
462
475
|
stage_result_file_name,
|
463
476
|
identifier.get_unescaped_names(self.input_cols),
|
464
477
|
identifier.get_unescaped_names(self.label_cols),
|
465
478
|
identifier.get_unescaped_names(self.sample_weight_col),
|
466
|
-
statement_params
|
479
|
+
statement_params,
|
467
480
|
)
|
468
481
|
|
469
482
|
if "|" in sproc_export_file_name:
|
@@ -473,7 +486,7 @@ class DBSCAN(BaseTransformer):
|
|
473
486
|
print("\n".join(fields[1:]))
|
474
487
|
|
475
488
|
session.file.get(
|
476
|
-
|
489
|
+
posixpath.join(stage_result_file_name, sproc_export_file_name),
|
477
490
|
local_result_file_name,
|
478
491
|
statement_params=statement_params
|
479
492
|
)
|
@@ -519,7 +532,7 @@ class DBSCAN(BaseTransformer):
|
|
519
532
|
|
520
533
|
# Register vectorized UDF for batch inference
|
521
534
|
batch_inference_udf_name = "SNOWML_BATCH_INFERENCE_{safe_id}_{method}".format(
|
522
|
-
safe_id=self.
|
535
|
+
safe_id=self._get_rand_id(), method=inference_method)
|
523
536
|
|
524
537
|
# Need to do this since if we use self._sklearn_object directly in the UDF, Snowpark
|
525
538
|
# will try to pickle all of self which fails.
|
@@ -611,7 +624,7 @@ class DBSCAN(BaseTransformer):
|
|
611
624
|
return transformed_pandas_df.to_dict("records")
|
612
625
|
|
613
626
|
batch_inference_table_name = "SNOWML_BATCH_INFERENCE_INPUT_TABLE_{safe_id}".format(
|
614
|
-
safe_id=self.
|
627
|
+
safe_id=self._get_rand_id()
|
615
628
|
)
|
616
629
|
|
617
630
|
pass_through_columns = self._get_pass_through_columns(dataset)
|
@@ -776,11 +789,18 @@ class DBSCAN(BaseTransformer):
|
|
776
789
|
Transformed dataset.
|
777
790
|
"""
|
778
791
|
if isinstance(dataset, DataFrame):
|
792
|
+
expected_type_inferred = ""
|
793
|
+
# when it is classifier, infer the datatype from label columns
|
794
|
+
if expected_type_inferred == "" and 'predict' in self.model_signatures:
|
795
|
+
expected_type_inferred = convert_sp_to_sf_type(
|
796
|
+
self.model_signatures['predict'].outputs[0].as_snowpark_type()
|
797
|
+
)
|
798
|
+
|
779
799
|
output_df = self._batch_inference(
|
780
800
|
dataset=dataset,
|
781
801
|
inference_method="predict",
|
782
802
|
expected_output_cols_list=self.output_cols,
|
783
|
-
expected_output_cols_type=
|
803
|
+
expected_output_cols_type=expected_type_inferred,
|
784
804
|
)
|
785
805
|
elif isinstance(dataset, pd.DataFrame):
|
786
806
|
output_df = self._sklearn_inference(
|
@@ -851,10 +871,10 @@ class DBSCAN(BaseTransformer):
|
|
851
871
|
|
852
872
|
def _get_output_column_names(self, output_cols_prefix: str) -> List[str]:
|
853
873
|
""" Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
|
854
|
-
Returns
|
874
|
+
Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
|
855
875
|
"""
|
856
876
|
if getattr(self._sklearn_object, "classes_", None) is None:
|
857
|
-
return []
|
877
|
+
return [output_cols_prefix]
|
858
878
|
|
859
879
|
classes = self._sklearn_object.classes_
|
860
880
|
if isinstance(classes, numpy.ndarray):
|
@@ -1079,7 +1099,7 @@ class DBSCAN(BaseTransformer):
|
|
1079
1099
|
cp.dump(self._sklearn_object, local_score_file)
|
1080
1100
|
|
1081
1101
|
# Create temp stage to run score.
|
1082
|
-
score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.
|
1102
|
+
score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
|
1083
1103
|
session = dataset._session
|
1084
1104
|
stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {score_stage_name};"
|
1085
1105
|
SqlResultValidator(
|
@@ -1093,8 +1113,9 @@ class DBSCAN(BaseTransformer):
|
|
1093
1113
|
expected_value=f"Stage area {score_stage_name} successfully created."
|
1094
1114
|
).validate()
|
1095
1115
|
|
1096
|
-
|
1097
|
-
|
1116
|
+
# Use posixpath to construct stage paths
|
1117
|
+
stage_score_file_name = posixpath.join(score_stage_name, os.path.basename(local_score_file_name))
|
1118
|
+
score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
|
1098
1119
|
statement_params = telemetry.get_function_usage_statement_params(
|
1099
1120
|
project=_PROJECT,
|
1100
1121
|
subproject=_SUBPROJECT,
|
@@ -1120,6 +1141,7 @@ class DBSCAN(BaseTransformer):
|
|
1120
1141
|
replace=True,
|
1121
1142
|
session=session,
|
1122
1143
|
statement_params=statement_params,
|
1144
|
+
anonymous=True
|
1123
1145
|
)
|
1124
1146
|
def score_wrapper_sproc(
|
1125
1147
|
session: Session,
|
@@ -1127,7 +1149,8 @@ class DBSCAN(BaseTransformer):
|
|
1127
1149
|
stage_score_file_name: str,
|
1128
1150
|
input_cols: List[str],
|
1129
1151
|
label_cols: List[str],
|
1130
|
-
sample_weight_col: Optional[str]
|
1152
|
+
sample_weight_col: Optional[str],
|
1153
|
+
statement_params: Dict[str, str]
|
1131
1154
|
) -> float:
|
1132
1155
|
import cloudpickle as cp
|
1133
1156
|
import numpy as np
|
@@ -1177,14 +1200,14 @@ class DBSCAN(BaseTransformer):
|
|
1177
1200
|
api_calls=[Session.call],
|
1178
1201
|
custom_tags=dict([("autogen", True)]),
|
1179
1202
|
)
|
1180
|
-
score =
|
1181
|
-
|
1203
|
+
score = score_wrapper_sproc(
|
1204
|
+
session,
|
1182
1205
|
query,
|
1183
1206
|
stage_score_file_name,
|
1184
1207
|
identifier.get_unescaped_names(self.input_cols),
|
1185
1208
|
identifier.get_unescaped_names(self.label_cols),
|
1186
1209
|
identifier.get_unescaped_names(self.sample_weight_col),
|
1187
|
-
statement_params
|
1210
|
+
statement_params,
|
1188
1211
|
)
|
1189
1212
|
|
1190
1213
|
cleanup_temp_files([local_score_file_name])
|
@@ -1202,18 +1225,20 @@ class DBSCAN(BaseTransformer):
|
|
1202
1225
|
if self._sklearn_object._estimator_type == 'classifier':
|
1203
1226
|
outputs = _infer_signature(dataset[self.label_cols], "output") # label columns is the desired type for output
|
1204
1227
|
outputs = _rename_features(outputs, self.output_cols) # rename the output columns
|
1205
|
-
self._model_signature_dict["predict"] = ModelSignature(inputs,
|
1228
|
+
self._model_signature_dict["predict"] = ModelSignature(inputs,
|
1229
|
+
([] if self._drop_input_cols else inputs) + outputs)
|
1206
1230
|
# For regressor, the type of predict is float64
|
1207
1231
|
elif self._sklearn_object._estimator_type == 'regressor':
|
1208
1232
|
outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
|
1209
|
-
self._model_signature_dict["predict"] = ModelSignature(inputs,
|
1210
|
-
|
1233
|
+
self._model_signature_dict["predict"] = ModelSignature(inputs,
|
1234
|
+
([] if self._drop_input_cols else inputs) + outputs)
|
1211
1235
|
for prob_func in PROB_FUNCTIONS:
|
1212
1236
|
if hasattr(self, prob_func):
|
1213
1237
|
output_cols_prefix: str = f"{prob_func}_"
|
1214
1238
|
output_column_names = self._get_output_column_names(output_cols_prefix)
|
1215
1239
|
outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
|
1216
|
-
self._model_signature_dict[prob_func] = ModelSignature(inputs,
|
1240
|
+
self._model_signature_dict[prob_func] = ModelSignature(inputs,
|
1241
|
+
([] if self._drop_input_cols else inputs) + outputs)
|
1217
1242
|
|
1218
1243
|
@property
|
1219
1244
|
def model_signatures(self) -> Dict[str, ModelSignature]:
|