snowflake-ml-python 1.0.2__py3-none-any.whl → 1.0.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- snowflake/ml/_internal/env_utils.py +2 -1
- snowflake/ml/_internal/file_utils.py +29 -7
- snowflake/ml/_internal/telemetry.py +5 -8
- snowflake/ml/_internal/utils/uri.py +7 -2
- snowflake/ml/model/_deploy_client/image_builds/base_image_builder.py +15 -0
- snowflake/ml/model/_deploy_client/image_builds/client_image_builder.py +259 -0
- snowflake/ml/model/_deploy_client/image_builds/docker_context.py +89 -0
- snowflake/ml/model/_deploy_client/image_builds/gunicorn_run.sh +24 -0
- snowflake/ml/model/_deploy_client/image_builds/inference_server/main.py +118 -0
- snowflake/ml/model/_deploy_client/image_builds/templates/dockerfile_template +40 -0
- snowflake/ml/model/_deploy_client/snowservice/deploy.py +199 -0
- snowflake/ml/model/_deploy_client/snowservice/deploy_options.py +88 -0
- snowflake/ml/model/_deploy_client/snowservice/templates/service_spec_template +24 -0
- snowflake/ml/model/_deploy_client/utils/constants.py +47 -0
- snowflake/ml/model/_deploy_client/utils/snowservice_client.py +178 -0
- snowflake/ml/model/_deploy_client/warehouse/deploy.py +24 -6
- snowflake/ml/model/_deploy_client/warehouse/infer_template.py +5 -2
- snowflake/ml/model/_deployer.py +14 -27
- snowflake/ml/model/_env.py +4 -4
- snowflake/ml/model/_handlers/custom.py +14 -2
- snowflake/ml/model/_handlers/pytorch.py +186 -0
- snowflake/ml/model/_handlers/sklearn.py +14 -9
- snowflake/ml/model/_handlers/snowmlmodel.py +14 -9
- snowflake/ml/model/_handlers/torchscript.py +180 -0
- snowflake/ml/model/_handlers/xgboost.py +19 -9
- snowflake/ml/model/_model.py +3 -2
- snowflake/ml/model/_model_meta.py +12 -7
- snowflake/ml/model/model_signature.py +446 -66
- snowflake/ml/model/type_hints.py +23 -4
- snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +51 -26
- snowflake/ml/modeling/cluster/affinity_propagation.py +51 -26
- snowflake/ml/modeling/cluster/agglomerative_clustering.py +51 -26
- snowflake/ml/modeling/cluster/birch.py +51 -26
- snowflake/ml/modeling/cluster/bisecting_k_means.py +51 -26
- snowflake/ml/modeling/cluster/dbscan.py +51 -26
- snowflake/ml/modeling/cluster/feature_agglomeration.py +51 -26
- snowflake/ml/modeling/cluster/k_means.py +51 -26
- snowflake/ml/modeling/cluster/mean_shift.py +51 -26
- snowflake/ml/modeling/cluster/mini_batch_k_means.py +51 -26
- snowflake/ml/modeling/cluster/optics.py +51 -26
- snowflake/ml/modeling/cluster/spectral_biclustering.py +51 -26
- snowflake/ml/modeling/cluster/spectral_clustering.py +51 -26
- snowflake/ml/modeling/cluster/spectral_coclustering.py +51 -26
- snowflake/ml/modeling/compose/column_transformer.py +51 -26
- snowflake/ml/modeling/compose/transformed_target_regressor.py +51 -26
- snowflake/ml/modeling/covariance/elliptic_envelope.py +51 -26
- snowflake/ml/modeling/covariance/empirical_covariance.py +51 -26
- snowflake/ml/modeling/covariance/graphical_lasso.py +51 -26
- snowflake/ml/modeling/covariance/graphical_lasso_cv.py +51 -26
- snowflake/ml/modeling/covariance/ledoit_wolf.py +51 -26
- snowflake/ml/modeling/covariance/min_cov_det.py +51 -26
- snowflake/ml/modeling/covariance/oas.py +51 -26
- snowflake/ml/modeling/covariance/shrunk_covariance.py +51 -26
- snowflake/ml/modeling/decomposition/dictionary_learning.py +51 -26
- snowflake/ml/modeling/decomposition/factor_analysis.py +51 -26
- snowflake/ml/modeling/decomposition/fast_ica.py +51 -26
- snowflake/ml/modeling/decomposition/incremental_pca.py +51 -26
- snowflake/ml/modeling/decomposition/kernel_pca.py +51 -26
- snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +51 -26
- snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +51 -26
- snowflake/ml/modeling/decomposition/pca.py +51 -26
- snowflake/ml/modeling/decomposition/sparse_pca.py +51 -26
- snowflake/ml/modeling/decomposition/truncated_svd.py +51 -26
- snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +51 -26
- snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +51 -26
- snowflake/ml/modeling/ensemble/ada_boost_classifier.py +51 -26
- snowflake/ml/modeling/ensemble/ada_boost_regressor.py +51 -26
- snowflake/ml/modeling/ensemble/bagging_classifier.py +51 -26
- snowflake/ml/modeling/ensemble/bagging_regressor.py +51 -26
- snowflake/ml/modeling/ensemble/extra_trees_classifier.py +51 -26
- snowflake/ml/modeling/ensemble/extra_trees_regressor.py +51 -26
- snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +51 -26
- snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +51 -26
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +51 -26
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +51 -26
- snowflake/ml/modeling/ensemble/isolation_forest.py +51 -26
- snowflake/ml/modeling/ensemble/random_forest_classifier.py +51 -26
- snowflake/ml/modeling/ensemble/random_forest_regressor.py +51 -26
- snowflake/ml/modeling/ensemble/stacking_regressor.py +51 -26
- snowflake/ml/modeling/ensemble/voting_classifier.py +51 -26
- snowflake/ml/modeling/ensemble/voting_regressor.py +51 -26
- snowflake/ml/modeling/feature_selection/generic_univariate_select.py +51 -26
- snowflake/ml/modeling/feature_selection/select_fdr.py +51 -26
- snowflake/ml/modeling/feature_selection/select_fpr.py +51 -26
- snowflake/ml/modeling/feature_selection/select_fwe.py +51 -26
- snowflake/ml/modeling/feature_selection/select_k_best.py +51 -26
- snowflake/ml/modeling/feature_selection/select_percentile.py +51 -26
- snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +51 -26
- snowflake/ml/modeling/feature_selection/variance_threshold.py +51 -26
- snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +51 -26
- snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +51 -26
- snowflake/ml/modeling/impute/iterative_imputer.py +51 -26
- snowflake/ml/modeling/impute/knn_imputer.py +51 -26
- snowflake/ml/modeling/impute/missing_indicator.py +51 -26
- snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +51 -26
- snowflake/ml/modeling/kernel_approximation/nystroem.py +51 -26
- snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +51 -26
- snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +51 -26
- snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +51 -26
- snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +51 -26
- snowflake/ml/modeling/lightgbm/lgbm_classifier.py +51 -26
- snowflake/ml/modeling/lightgbm/lgbm_regressor.py +51 -26
- snowflake/ml/modeling/linear_model/ard_regression.py +51 -26
- snowflake/ml/modeling/linear_model/bayesian_ridge.py +51 -26
- snowflake/ml/modeling/linear_model/elastic_net.py +51 -26
- snowflake/ml/modeling/linear_model/elastic_net_cv.py +51 -26
- snowflake/ml/modeling/linear_model/gamma_regressor.py +51 -26
- snowflake/ml/modeling/linear_model/huber_regressor.py +51 -26
- snowflake/ml/modeling/linear_model/lars.py +51 -26
- snowflake/ml/modeling/linear_model/lars_cv.py +51 -26
- snowflake/ml/modeling/linear_model/lasso.py +51 -26
- snowflake/ml/modeling/linear_model/lasso_cv.py +51 -26
- snowflake/ml/modeling/linear_model/lasso_lars.py +51 -26
- snowflake/ml/modeling/linear_model/lasso_lars_cv.py +51 -26
- snowflake/ml/modeling/linear_model/lasso_lars_ic.py +51 -26
- snowflake/ml/modeling/linear_model/linear_regression.py +51 -26
- snowflake/ml/modeling/linear_model/logistic_regression.py +51 -26
- snowflake/ml/modeling/linear_model/logistic_regression_cv.py +51 -26
- snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +51 -26
- snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +51 -26
- snowflake/ml/modeling/linear_model/multi_task_lasso.py +51 -26
- snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +51 -26
- snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +51 -26
- snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +51 -26
- snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +51 -26
- snowflake/ml/modeling/linear_model/perceptron.py +51 -26
- snowflake/ml/modeling/linear_model/poisson_regressor.py +51 -26
- snowflake/ml/modeling/linear_model/ransac_regressor.py +51 -26
- snowflake/ml/modeling/linear_model/ridge.py +51 -26
- snowflake/ml/modeling/linear_model/ridge_classifier.py +51 -26
- snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +51 -26
- snowflake/ml/modeling/linear_model/ridge_cv.py +51 -26
- snowflake/ml/modeling/linear_model/sgd_classifier.py +51 -26
- snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +51 -26
- snowflake/ml/modeling/linear_model/sgd_regressor.py +51 -26
- snowflake/ml/modeling/linear_model/theil_sen_regressor.py +51 -26
- snowflake/ml/modeling/linear_model/tweedie_regressor.py +51 -26
- snowflake/ml/modeling/manifold/isomap.py +51 -26
- snowflake/ml/modeling/manifold/mds.py +51 -26
- snowflake/ml/modeling/manifold/spectral_embedding.py +51 -26
- snowflake/ml/modeling/manifold/tsne.py +51 -26
- snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +51 -26
- snowflake/ml/modeling/mixture/gaussian_mixture.py +51 -26
- snowflake/ml/modeling/model_selection/grid_search_cv.py +51 -26
- snowflake/ml/modeling/model_selection/randomized_search_cv.py +51 -26
- snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +51 -26
- snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +51 -26
- snowflake/ml/modeling/multiclass/output_code_classifier.py +51 -26
- snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +51 -26
- snowflake/ml/modeling/naive_bayes/categorical_nb.py +51 -26
- snowflake/ml/modeling/naive_bayes/complement_nb.py +51 -26
- snowflake/ml/modeling/naive_bayes/gaussian_nb.py +51 -26
- snowflake/ml/modeling/naive_bayes/multinomial_nb.py +51 -26
- snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +51 -26
- snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +51 -26
- snowflake/ml/modeling/neighbors/kernel_density.py +51 -26
- snowflake/ml/modeling/neighbors/local_outlier_factor.py +51 -26
- snowflake/ml/modeling/neighbors/nearest_centroid.py +51 -26
- snowflake/ml/modeling/neighbors/nearest_neighbors.py +51 -26
- snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +51 -26
- snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +51 -26
- snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +51 -26
- snowflake/ml/modeling/neural_network/bernoulli_rbm.py +51 -26
- snowflake/ml/modeling/neural_network/mlp_classifier.py +51 -26
- snowflake/ml/modeling/neural_network/mlp_regressor.py +51 -26
- snowflake/ml/modeling/preprocessing/ordinal_encoder.py +2 -0
- snowflake/ml/modeling/preprocessing/polynomial_features.py +51 -26
- snowflake/ml/modeling/semi_supervised/label_propagation.py +51 -26
- snowflake/ml/modeling/semi_supervised/label_spreading.py +51 -26
- snowflake/ml/modeling/svm/linear_svc.py +51 -26
- snowflake/ml/modeling/svm/linear_svr.py +51 -26
- snowflake/ml/modeling/svm/nu_svc.py +51 -26
- snowflake/ml/modeling/svm/nu_svr.py +51 -26
- snowflake/ml/modeling/svm/svc.py +51 -26
- snowflake/ml/modeling/svm/svr.py +51 -26
- snowflake/ml/modeling/tree/decision_tree_classifier.py +51 -26
- snowflake/ml/modeling/tree/decision_tree_regressor.py +51 -26
- snowflake/ml/modeling/tree/extra_tree_classifier.py +51 -26
- snowflake/ml/modeling/tree/extra_tree_regressor.py +51 -26
- snowflake/ml/modeling/xgboost/xgb_classifier.py +51 -26
- snowflake/ml/modeling/xgboost/xgb_regressor.py +51 -26
- snowflake/ml/modeling/xgboost/xgbrf_classifier.py +51 -26
- snowflake/ml/modeling/xgboost/xgbrf_regressor.py +51 -26
- snowflake/ml/registry/model_registry.py +74 -56
- snowflake/ml/version.py +1 -1
- {snowflake_ml_python-1.0.2.dist-info → snowflake_ml_python-1.0.3.dist-info}/METADATA +27 -8
- snowflake_ml_python-1.0.3.dist-info/RECORD +259 -0
- snowflake_ml_python-1.0.2.dist-info/RECORD +0 -246
- {snowflake_ml_python-1.0.2.dist-info → snowflake_ml_python-1.0.3.dist-info}/WHEEL +0 -0
@@ -7,6 +7,7 @@
|
|
7
7
|
#
|
8
8
|
import inspect
|
9
9
|
import os
|
10
|
+
import posixpath
|
10
11
|
from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
|
11
12
|
from uuid import uuid4
|
12
13
|
|
@@ -27,6 +28,7 @@ from snowflake.ml._internal.utils.temp_file_utils import cleanup_temp_files, get
|
|
27
28
|
from snowflake.snowpark import DataFrame, Session
|
28
29
|
from snowflake.snowpark.functions import pandas_udf, sproc
|
29
30
|
from snowflake.snowpark.types import PandasSeries
|
31
|
+
from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
|
30
32
|
|
31
33
|
from snowflake.ml.model.model_signature import (
|
32
34
|
DataType,
|
@@ -197,7 +199,6 @@ class RBFSampler(BaseTransformer):
|
|
197
199
|
sample_weight_col: Optional[str] = None,
|
198
200
|
) -> None:
|
199
201
|
super().__init__()
|
200
|
-
self.id = str(uuid4()).replace("-", "_").upper()
|
201
202
|
deps: Set[str] = set([f'numpy=={np.__version__}', f'scikit-learn=={sklearn.__version__}', f'cloudpickle=={cp.__version__}'])
|
202
203
|
|
203
204
|
self._deps = list(deps)
|
@@ -219,6 +220,15 @@ class RBFSampler(BaseTransformer):
|
|
219
220
|
self.set_drop_input_cols(drop_input_cols)
|
220
221
|
self.set_sample_weight_col(sample_weight_col)
|
221
222
|
|
223
|
+
def _get_rand_id(self) -> str:
|
224
|
+
"""
|
225
|
+
Generate random id to be used in sproc and stage names.
|
226
|
+
|
227
|
+
Returns:
|
228
|
+
Random id string usable in sproc, table, and stage names.
|
229
|
+
"""
|
230
|
+
return str(uuid4()).replace("-", "_").upper()
|
231
|
+
|
222
232
|
def _infer_input_output_cols(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
|
223
233
|
"""
|
224
234
|
Infer `self.input_cols` and `self.output_cols` if they are not explicitly set.
|
@@ -297,7 +307,7 @@ class RBFSampler(BaseTransformer):
|
|
297
307
|
cp.dump(self._sklearn_object, local_transform_file)
|
298
308
|
|
299
309
|
# Create temp stage to run fit.
|
300
|
-
transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self.
|
310
|
+
transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self._get_rand_id())
|
301
311
|
stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {transform_stage_name};"
|
302
312
|
SqlResultValidator(
|
303
313
|
session=session,
|
@@ -310,11 +320,12 @@ class RBFSampler(BaseTransformer):
|
|
310
320
|
expected_value=f"Stage area {transform_stage_name} successfully created."
|
311
321
|
).validate()
|
312
322
|
|
313
|
-
|
323
|
+
# Use posixpath to construct stage paths
|
324
|
+
stage_transform_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
|
325
|
+
stage_result_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
|
314
326
|
local_result_file_name = get_temp_file_path()
|
315
|
-
stage_result_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
|
316
327
|
|
317
|
-
fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self.
|
328
|
+
fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self._get_rand_id())
|
318
329
|
statement_params = telemetry.get_function_usage_statement_params(
|
319
330
|
project=_PROJECT,
|
320
331
|
subproject=_SUBPROJECT,
|
@@ -340,6 +351,7 @@ class RBFSampler(BaseTransformer):
|
|
340
351
|
replace=True,
|
341
352
|
session=session,
|
342
353
|
statement_params=statement_params,
|
354
|
+
anonymous=True
|
343
355
|
)
|
344
356
|
def fit_wrapper_sproc(
|
345
357
|
session: Session,
|
@@ -348,7 +360,8 @@ class RBFSampler(BaseTransformer):
|
|
348
360
|
stage_result_file_name: str,
|
349
361
|
input_cols: List[str],
|
350
362
|
label_cols: List[str],
|
351
|
-
sample_weight_col: Optional[str]
|
363
|
+
sample_weight_col: Optional[str],
|
364
|
+
statement_params: Dict[str, str]
|
352
365
|
) -> str:
|
353
366
|
import cloudpickle as cp
|
354
367
|
import numpy as np
|
@@ -415,15 +428,15 @@ class RBFSampler(BaseTransformer):
|
|
415
428
|
api_calls=[Session.call],
|
416
429
|
custom_tags=dict([("autogen", True)]),
|
417
430
|
)
|
418
|
-
sproc_export_file_name =
|
419
|
-
|
431
|
+
sproc_export_file_name = fit_wrapper_sproc(
|
432
|
+
session,
|
420
433
|
query,
|
421
434
|
stage_transform_file_name,
|
422
435
|
stage_result_file_name,
|
423
436
|
identifier.get_unescaped_names(self.input_cols),
|
424
437
|
identifier.get_unescaped_names(self.label_cols),
|
425
438
|
identifier.get_unescaped_names(self.sample_weight_col),
|
426
|
-
statement_params
|
439
|
+
statement_params,
|
427
440
|
)
|
428
441
|
|
429
442
|
if "|" in sproc_export_file_name:
|
@@ -433,7 +446,7 @@ class RBFSampler(BaseTransformer):
|
|
433
446
|
print("\n".join(fields[1:]))
|
434
447
|
|
435
448
|
session.file.get(
|
436
|
-
|
449
|
+
posixpath.join(stage_result_file_name, sproc_export_file_name),
|
437
450
|
local_result_file_name,
|
438
451
|
statement_params=statement_params
|
439
452
|
)
|
@@ -479,7 +492,7 @@ class RBFSampler(BaseTransformer):
|
|
479
492
|
|
480
493
|
# Register vectorized UDF for batch inference
|
481
494
|
batch_inference_udf_name = "SNOWML_BATCH_INFERENCE_{safe_id}_{method}".format(
|
482
|
-
safe_id=self.
|
495
|
+
safe_id=self._get_rand_id(), method=inference_method)
|
483
496
|
|
484
497
|
# Need to do this since if we use self._sklearn_object directly in the UDF, Snowpark
|
485
498
|
# will try to pickle all of self which fails.
|
@@ -571,7 +584,7 @@ class RBFSampler(BaseTransformer):
|
|
571
584
|
return transformed_pandas_df.to_dict("records")
|
572
585
|
|
573
586
|
batch_inference_table_name = "SNOWML_BATCH_INFERENCE_INPUT_TABLE_{safe_id}".format(
|
574
|
-
safe_id=self.
|
587
|
+
safe_id=self._get_rand_id()
|
575
588
|
)
|
576
589
|
|
577
590
|
pass_through_columns = self._get_pass_through_columns(dataset)
|
@@ -736,11 +749,18 @@ class RBFSampler(BaseTransformer):
|
|
736
749
|
Transformed dataset.
|
737
750
|
"""
|
738
751
|
if isinstance(dataset, DataFrame):
|
752
|
+
expected_type_inferred = ""
|
753
|
+
# when it is classifier, infer the datatype from label columns
|
754
|
+
if expected_type_inferred == "" and 'predict' in self.model_signatures:
|
755
|
+
expected_type_inferred = convert_sp_to_sf_type(
|
756
|
+
self.model_signatures['predict'].outputs[0].as_snowpark_type()
|
757
|
+
)
|
758
|
+
|
739
759
|
output_df = self._batch_inference(
|
740
760
|
dataset=dataset,
|
741
761
|
inference_method="predict",
|
742
762
|
expected_output_cols_list=self.output_cols,
|
743
|
-
expected_output_cols_type=
|
763
|
+
expected_output_cols_type=expected_type_inferred,
|
744
764
|
)
|
745
765
|
elif isinstance(dataset, pd.DataFrame):
|
746
766
|
output_df = self._sklearn_inference(
|
@@ -813,10 +833,10 @@ class RBFSampler(BaseTransformer):
|
|
813
833
|
|
814
834
|
def _get_output_column_names(self, output_cols_prefix: str) -> List[str]:
|
815
835
|
""" Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
|
816
|
-
Returns
|
836
|
+
Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
|
817
837
|
"""
|
818
838
|
if getattr(self._sklearn_object, "classes_", None) is None:
|
819
|
-
return []
|
839
|
+
return [output_cols_prefix]
|
820
840
|
|
821
841
|
classes = self._sklearn_object.classes_
|
822
842
|
if isinstance(classes, numpy.ndarray):
|
@@ -1041,7 +1061,7 @@ class RBFSampler(BaseTransformer):
|
|
1041
1061
|
cp.dump(self._sklearn_object, local_score_file)
|
1042
1062
|
|
1043
1063
|
# Create temp stage to run score.
|
1044
|
-
score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.
|
1064
|
+
score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
|
1045
1065
|
session = dataset._session
|
1046
1066
|
stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {score_stage_name};"
|
1047
1067
|
SqlResultValidator(
|
@@ -1055,8 +1075,9 @@ class RBFSampler(BaseTransformer):
|
|
1055
1075
|
expected_value=f"Stage area {score_stage_name} successfully created."
|
1056
1076
|
).validate()
|
1057
1077
|
|
1058
|
-
|
1059
|
-
|
1078
|
+
# Use posixpath to construct stage paths
|
1079
|
+
stage_score_file_name = posixpath.join(score_stage_name, os.path.basename(local_score_file_name))
|
1080
|
+
score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
|
1060
1081
|
statement_params = telemetry.get_function_usage_statement_params(
|
1061
1082
|
project=_PROJECT,
|
1062
1083
|
subproject=_SUBPROJECT,
|
@@ -1082,6 +1103,7 @@ class RBFSampler(BaseTransformer):
|
|
1082
1103
|
replace=True,
|
1083
1104
|
session=session,
|
1084
1105
|
statement_params=statement_params,
|
1106
|
+
anonymous=True
|
1085
1107
|
)
|
1086
1108
|
def score_wrapper_sproc(
|
1087
1109
|
session: Session,
|
@@ -1089,7 +1111,8 @@ class RBFSampler(BaseTransformer):
|
|
1089
1111
|
stage_score_file_name: str,
|
1090
1112
|
input_cols: List[str],
|
1091
1113
|
label_cols: List[str],
|
1092
|
-
sample_weight_col: Optional[str]
|
1114
|
+
sample_weight_col: Optional[str],
|
1115
|
+
statement_params: Dict[str, str]
|
1093
1116
|
) -> float:
|
1094
1117
|
import cloudpickle as cp
|
1095
1118
|
import numpy as np
|
@@ -1139,14 +1162,14 @@ class RBFSampler(BaseTransformer):
|
|
1139
1162
|
api_calls=[Session.call],
|
1140
1163
|
custom_tags=dict([("autogen", True)]),
|
1141
1164
|
)
|
1142
|
-
score =
|
1143
|
-
|
1165
|
+
score = score_wrapper_sproc(
|
1166
|
+
session,
|
1144
1167
|
query,
|
1145
1168
|
stage_score_file_name,
|
1146
1169
|
identifier.get_unescaped_names(self.input_cols),
|
1147
1170
|
identifier.get_unescaped_names(self.label_cols),
|
1148
1171
|
identifier.get_unescaped_names(self.sample_weight_col),
|
1149
|
-
statement_params
|
1172
|
+
statement_params,
|
1150
1173
|
)
|
1151
1174
|
|
1152
1175
|
cleanup_temp_files([local_score_file_name])
|
@@ -1164,18 +1187,20 @@ class RBFSampler(BaseTransformer):
|
|
1164
1187
|
if self._sklearn_object._estimator_type == 'classifier':
|
1165
1188
|
outputs = _infer_signature(dataset[self.label_cols], "output") # label columns is the desired type for output
|
1166
1189
|
outputs = _rename_features(outputs, self.output_cols) # rename the output columns
|
1167
|
-
self._model_signature_dict["predict"] = ModelSignature(inputs,
|
1190
|
+
self._model_signature_dict["predict"] = ModelSignature(inputs,
|
1191
|
+
([] if self._drop_input_cols else inputs) + outputs)
|
1168
1192
|
# For regressor, the type of predict is float64
|
1169
1193
|
elif self._sklearn_object._estimator_type == 'regressor':
|
1170
1194
|
outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
|
1171
|
-
self._model_signature_dict["predict"] = ModelSignature(inputs,
|
1172
|
-
|
1195
|
+
self._model_signature_dict["predict"] = ModelSignature(inputs,
|
1196
|
+
([] if self._drop_input_cols else inputs) + outputs)
|
1173
1197
|
for prob_func in PROB_FUNCTIONS:
|
1174
1198
|
if hasattr(self, prob_func):
|
1175
1199
|
output_cols_prefix: str = f"{prob_func}_"
|
1176
1200
|
output_column_names = self._get_output_column_names(output_cols_prefix)
|
1177
1201
|
outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
|
1178
|
-
self._model_signature_dict[prob_func] = ModelSignature(inputs,
|
1202
|
+
self._model_signature_dict[prob_func] = ModelSignature(inputs,
|
1203
|
+
([] if self._drop_input_cols else inputs) + outputs)
|
1179
1204
|
|
1180
1205
|
@property
|
1181
1206
|
def model_signatures(self) -> Dict[str, ModelSignature]:
|
@@ -7,6 +7,7 @@
|
|
7
7
|
#
|
8
8
|
import inspect
|
9
9
|
import os
|
10
|
+
import posixpath
|
10
11
|
from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
|
11
12
|
from uuid import uuid4
|
12
13
|
|
@@ -27,6 +28,7 @@ from snowflake.ml._internal.utils.temp_file_utils import cleanup_temp_files, get
|
|
27
28
|
from snowflake.snowpark import DataFrame, Session
|
28
29
|
from snowflake.snowpark.functions import pandas_udf, sproc
|
29
30
|
from snowflake.snowpark.types import PandasSeries
|
31
|
+
from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
|
30
32
|
|
31
33
|
from snowflake.ml.model.model_signature import (
|
32
34
|
DataType,
|
@@ -195,7 +197,6 @@ class SkewedChi2Sampler(BaseTransformer):
|
|
195
197
|
sample_weight_col: Optional[str] = None,
|
196
198
|
) -> None:
|
197
199
|
super().__init__()
|
198
|
-
self.id = str(uuid4()).replace("-", "_").upper()
|
199
200
|
deps: Set[str] = set([f'numpy=={np.__version__}', f'scikit-learn=={sklearn.__version__}', f'cloudpickle=={cp.__version__}'])
|
200
201
|
|
201
202
|
self._deps = list(deps)
|
@@ -217,6 +218,15 @@ class SkewedChi2Sampler(BaseTransformer):
|
|
217
218
|
self.set_drop_input_cols(drop_input_cols)
|
218
219
|
self.set_sample_weight_col(sample_weight_col)
|
219
220
|
|
221
|
+
def _get_rand_id(self) -> str:
|
222
|
+
"""
|
223
|
+
Generate random id to be used in sproc and stage names.
|
224
|
+
|
225
|
+
Returns:
|
226
|
+
Random id string usable in sproc, table, and stage names.
|
227
|
+
"""
|
228
|
+
return str(uuid4()).replace("-", "_").upper()
|
229
|
+
|
220
230
|
def _infer_input_output_cols(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
|
221
231
|
"""
|
222
232
|
Infer `self.input_cols` and `self.output_cols` if they are not explicitly set.
|
@@ -295,7 +305,7 @@ class SkewedChi2Sampler(BaseTransformer):
|
|
295
305
|
cp.dump(self._sklearn_object, local_transform_file)
|
296
306
|
|
297
307
|
# Create temp stage to run fit.
|
298
|
-
transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self.
|
308
|
+
transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self._get_rand_id())
|
299
309
|
stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {transform_stage_name};"
|
300
310
|
SqlResultValidator(
|
301
311
|
session=session,
|
@@ -308,11 +318,12 @@ class SkewedChi2Sampler(BaseTransformer):
|
|
308
318
|
expected_value=f"Stage area {transform_stage_name} successfully created."
|
309
319
|
).validate()
|
310
320
|
|
311
|
-
|
321
|
+
# Use posixpath to construct stage paths
|
322
|
+
stage_transform_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
|
323
|
+
stage_result_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
|
312
324
|
local_result_file_name = get_temp_file_path()
|
313
|
-
stage_result_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
|
314
325
|
|
315
|
-
fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self.
|
326
|
+
fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self._get_rand_id())
|
316
327
|
statement_params = telemetry.get_function_usage_statement_params(
|
317
328
|
project=_PROJECT,
|
318
329
|
subproject=_SUBPROJECT,
|
@@ -338,6 +349,7 @@ class SkewedChi2Sampler(BaseTransformer):
|
|
338
349
|
replace=True,
|
339
350
|
session=session,
|
340
351
|
statement_params=statement_params,
|
352
|
+
anonymous=True
|
341
353
|
)
|
342
354
|
def fit_wrapper_sproc(
|
343
355
|
session: Session,
|
@@ -346,7 +358,8 @@ class SkewedChi2Sampler(BaseTransformer):
|
|
346
358
|
stage_result_file_name: str,
|
347
359
|
input_cols: List[str],
|
348
360
|
label_cols: List[str],
|
349
|
-
sample_weight_col: Optional[str]
|
361
|
+
sample_weight_col: Optional[str],
|
362
|
+
statement_params: Dict[str, str]
|
350
363
|
) -> str:
|
351
364
|
import cloudpickle as cp
|
352
365
|
import numpy as np
|
@@ -413,15 +426,15 @@ class SkewedChi2Sampler(BaseTransformer):
|
|
413
426
|
api_calls=[Session.call],
|
414
427
|
custom_tags=dict([("autogen", True)]),
|
415
428
|
)
|
416
|
-
sproc_export_file_name =
|
417
|
-
|
429
|
+
sproc_export_file_name = fit_wrapper_sproc(
|
430
|
+
session,
|
418
431
|
query,
|
419
432
|
stage_transform_file_name,
|
420
433
|
stage_result_file_name,
|
421
434
|
identifier.get_unescaped_names(self.input_cols),
|
422
435
|
identifier.get_unescaped_names(self.label_cols),
|
423
436
|
identifier.get_unescaped_names(self.sample_weight_col),
|
424
|
-
statement_params
|
437
|
+
statement_params,
|
425
438
|
)
|
426
439
|
|
427
440
|
if "|" in sproc_export_file_name:
|
@@ -431,7 +444,7 @@ class SkewedChi2Sampler(BaseTransformer):
|
|
431
444
|
print("\n".join(fields[1:]))
|
432
445
|
|
433
446
|
session.file.get(
|
434
|
-
|
447
|
+
posixpath.join(stage_result_file_name, sproc_export_file_name),
|
435
448
|
local_result_file_name,
|
436
449
|
statement_params=statement_params
|
437
450
|
)
|
@@ -477,7 +490,7 @@ class SkewedChi2Sampler(BaseTransformer):
|
|
477
490
|
|
478
491
|
# Register vectorized UDF for batch inference
|
479
492
|
batch_inference_udf_name = "SNOWML_BATCH_INFERENCE_{safe_id}_{method}".format(
|
480
|
-
safe_id=self.
|
493
|
+
safe_id=self._get_rand_id(), method=inference_method)
|
481
494
|
|
482
495
|
# Need to do this since if we use self._sklearn_object directly in the UDF, Snowpark
|
483
496
|
# will try to pickle all of self which fails.
|
@@ -569,7 +582,7 @@ class SkewedChi2Sampler(BaseTransformer):
|
|
569
582
|
return transformed_pandas_df.to_dict("records")
|
570
583
|
|
571
584
|
batch_inference_table_name = "SNOWML_BATCH_INFERENCE_INPUT_TABLE_{safe_id}".format(
|
572
|
-
safe_id=self.
|
585
|
+
safe_id=self._get_rand_id()
|
573
586
|
)
|
574
587
|
|
575
588
|
pass_through_columns = self._get_pass_through_columns(dataset)
|
@@ -734,11 +747,18 @@ class SkewedChi2Sampler(BaseTransformer):
|
|
734
747
|
Transformed dataset.
|
735
748
|
"""
|
736
749
|
if isinstance(dataset, DataFrame):
|
750
|
+
expected_type_inferred = ""
|
751
|
+
# when it is classifier, infer the datatype from label columns
|
752
|
+
if expected_type_inferred == "" and 'predict' in self.model_signatures:
|
753
|
+
expected_type_inferred = convert_sp_to_sf_type(
|
754
|
+
self.model_signatures['predict'].outputs[0].as_snowpark_type()
|
755
|
+
)
|
756
|
+
|
737
757
|
output_df = self._batch_inference(
|
738
758
|
dataset=dataset,
|
739
759
|
inference_method="predict",
|
740
760
|
expected_output_cols_list=self.output_cols,
|
741
|
-
expected_output_cols_type=
|
761
|
+
expected_output_cols_type=expected_type_inferred,
|
742
762
|
)
|
743
763
|
elif isinstance(dataset, pd.DataFrame):
|
744
764
|
output_df = self._sklearn_inference(
|
@@ -811,10 +831,10 @@ class SkewedChi2Sampler(BaseTransformer):
|
|
811
831
|
|
812
832
|
def _get_output_column_names(self, output_cols_prefix: str) -> List[str]:
|
813
833
|
""" Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
|
814
|
-
Returns
|
834
|
+
Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
|
815
835
|
"""
|
816
836
|
if getattr(self._sklearn_object, "classes_", None) is None:
|
817
|
-
return []
|
837
|
+
return [output_cols_prefix]
|
818
838
|
|
819
839
|
classes = self._sklearn_object.classes_
|
820
840
|
if isinstance(classes, numpy.ndarray):
|
@@ -1039,7 +1059,7 @@ class SkewedChi2Sampler(BaseTransformer):
|
|
1039
1059
|
cp.dump(self._sklearn_object, local_score_file)
|
1040
1060
|
|
1041
1061
|
# Create temp stage to run score.
|
1042
|
-
score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.
|
1062
|
+
score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
|
1043
1063
|
session = dataset._session
|
1044
1064
|
stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {score_stage_name};"
|
1045
1065
|
SqlResultValidator(
|
@@ -1053,8 +1073,9 @@ class SkewedChi2Sampler(BaseTransformer):
|
|
1053
1073
|
expected_value=f"Stage area {score_stage_name} successfully created."
|
1054
1074
|
).validate()
|
1055
1075
|
|
1056
|
-
|
1057
|
-
|
1076
|
+
# Use posixpath to construct stage paths
|
1077
|
+
stage_score_file_name = posixpath.join(score_stage_name, os.path.basename(local_score_file_name))
|
1078
|
+
score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
|
1058
1079
|
statement_params = telemetry.get_function_usage_statement_params(
|
1059
1080
|
project=_PROJECT,
|
1060
1081
|
subproject=_SUBPROJECT,
|
@@ -1080,6 +1101,7 @@ class SkewedChi2Sampler(BaseTransformer):
|
|
1080
1101
|
replace=True,
|
1081
1102
|
session=session,
|
1082
1103
|
statement_params=statement_params,
|
1104
|
+
anonymous=True
|
1083
1105
|
)
|
1084
1106
|
def score_wrapper_sproc(
|
1085
1107
|
session: Session,
|
@@ -1087,7 +1109,8 @@ class SkewedChi2Sampler(BaseTransformer):
|
|
1087
1109
|
stage_score_file_name: str,
|
1088
1110
|
input_cols: List[str],
|
1089
1111
|
label_cols: List[str],
|
1090
|
-
sample_weight_col: Optional[str]
|
1112
|
+
sample_weight_col: Optional[str],
|
1113
|
+
statement_params: Dict[str, str]
|
1091
1114
|
) -> float:
|
1092
1115
|
import cloudpickle as cp
|
1093
1116
|
import numpy as np
|
@@ -1137,14 +1160,14 @@ class SkewedChi2Sampler(BaseTransformer):
|
|
1137
1160
|
api_calls=[Session.call],
|
1138
1161
|
custom_tags=dict([("autogen", True)]),
|
1139
1162
|
)
|
1140
|
-
score =
|
1141
|
-
|
1163
|
+
score = score_wrapper_sproc(
|
1164
|
+
session,
|
1142
1165
|
query,
|
1143
1166
|
stage_score_file_name,
|
1144
1167
|
identifier.get_unescaped_names(self.input_cols),
|
1145
1168
|
identifier.get_unescaped_names(self.label_cols),
|
1146
1169
|
identifier.get_unescaped_names(self.sample_weight_col),
|
1147
|
-
statement_params
|
1170
|
+
statement_params,
|
1148
1171
|
)
|
1149
1172
|
|
1150
1173
|
cleanup_temp_files([local_score_file_name])
|
@@ -1162,18 +1185,20 @@ class SkewedChi2Sampler(BaseTransformer):
|
|
1162
1185
|
if self._sklearn_object._estimator_type == 'classifier':
|
1163
1186
|
outputs = _infer_signature(dataset[self.label_cols], "output") # label columns is the desired type for output
|
1164
1187
|
outputs = _rename_features(outputs, self.output_cols) # rename the output columns
|
1165
|
-
self._model_signature_dict["predict"] = ModelSignature(inputs,
|
1188
|
+
self._model_signature_dict["predict"] = ModelSignature(inputs,
|
1189
|
+
([] if self._drop_input_cols else inputs) + outputs)
|
1166
1190
|
# For regressor, the type of predict is float64
|
1167
1191
|
elif self._sklearn_object._estimator_type == 'regressor':
|
1168
1192
|
outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
|
1169
|
-
self._model_signature_dict["predict"] = ModelSignature(inputs,
|
1170
|
-
|
1193
|
+
self._model_signature_dict["predict"] = ModelSignature(inputs,
|
1194
|
+
([] if self._drop_input_cols else inputs) + outputs)
|
1171
1195
|
for prob_func in PROB_FUNCTIONS:
|
1172
1196
|
if hasattr(self, prob_func):
|
1173
1197
|
output_cols_prefix: str = f"{prob_func}_"
|
1174
1198
|
output_column_names = self._get_output_column_names(output_cols_prefix)
|
1175
1199
|
outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
|
1176
|
-
self._model_signature_dict[prob_func] = ModelSignature(inputs,
|
1200
|
+
self._model_signature_dict[prob_func] = ModelSignature(inputs,
|
1201
|
+
([] if self._drop_input_cols else inputs) + outputs)
|
1177
1202
|
|
1178
1203
|
@property
|
1179
1204
|
def model_signatures(self) -> Dict[str, ModelSignature]:
|