snowflake-ml-python 1.0.2__py3-none-any.whl → 1.0.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- snowflake/ml/_internal/env_utils.py +2 -1
- snowflake/ml/_internal/file_utils.py +29 -7
- snowflake/ml/_internal/telemetry.py +5 -8
- snowflake/ml/_internal/utils/uri.py +7 -2
- snowflake/ml/model/_deploy_client/image_builds/base_image_builder.py +15 -0
- snowflake/ml/model/_deploy_client/image_builds/client_image_builder.py +259 -0
- snowflake/ml/model/_deploy_client/image_builds/docker_context.py +89 -0
- snowflake/ml/model/_deploy_client/image_builds/gunicorn_run.sh +24 -0
- snowflake/ml/model/_deploy_client/image_builds/inference_server/main.py +118 -0
- snowflake/ml/model/_deploy_client/image_builds/templates/dockerfile_template +40 -0
- snowflake/ml/model/_deploy_client/snowservice/deploy.py +199 -0
- snowflake/ml/model/_deploy_client/snowservice/deploy_options.py +88 -0
- snowflake/ml/model/_deploy_client/snowservice/templates/service_spec_template +24 -0
- snowflake/ml/model/_deploy_client/utils/constants.py +47 -0
- snowflake/ml/model/_deploy_client/utils/snowservice_client.py +178 -0
- snowflake/ml/model/_deploy_client/warehouse/deploy.py +24 -6
- snowflake/ml/model/_deploy_client/warehouse/infer_template.py +5 -2
- snowflake/ml/model/_deployer.py +14 -27
- snowflake/ml/model/_env.py +4 -4
- snowflake/ml/model/_handlers/custom.py +14 -2
- snowflake/ml/model/_handlers/pytorch.py +186 -0
- snowflake/ml/model/_handlers/sklearn.py +14 -9
- snowflake/ml/model/_handlers/snowmlmodel.py +14 -9
- snowflake/ml/model/_handlers/torchscript.py +180 -0
- snowflake/ml/model/_handlers/xgboost.py +19 -9
- snowflake/ml/model/_model.py +3 -2
- snowflake/ml/model/_model_meta.py +12 -7
- snowflake/ml/model/model_signature.py +446 -66
- snowflake/ml/model/type_hints.py +23 -4
- snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +51 -26
- snowflake/ml/modeling/cluster/affinity_propagation.py +51 -26
- snowflake/ml/modeling/cluster/agglomerative_clustering.py +51 -26
- snowflake/ml/modeling/cluster/birch.py +51 -26
- snowflake/ml/modeling/cluster/bisecting_k_means.py +51 -26
- snowflake/ml/modeling/cluster/dbscan.py +51 -26
- snowflake/ml/modeling/cluster/feature_agglomeration.py +51 -26
- snowflake/ml/modeling/cluster/k_means.py +51 -26
- snowflake/ml/modeling/cluster/mean_shift.py +51 -26
- snowflake/ml/modeling/cluster/mini_batch_k_means.py +51 -26
- snowflake/ml/modeling/cluster/optics.py +51 -26
- snowflake/ml/modeling/cluster/spectral_biclustering.py +51 -26
- snowflake/ml/modeling/cluster/spectral_clustering.py +51 -26
- snowflake/ml/modeling/cluster/spectral_coclustering.py +51 -26
- snowflake/ml/modeling/compose/column_transformer.py +51 -26
- snowflake/ml/modeling/compose/transformed_target_regressor.py +51 -26
- snowflake/ml/modeling/covariance/elliptic_envelope.py +51 -26
- snowflake/ml/modeling/covariance/empirical_covariance.py +51 -26
- snowflake/ml/modeling/covariance/graphical_lasso.py +51 -26
- snowflake/ml/modeling/covariance/graphical_lasso_cv.py +51 -26
- snowflake/ml/modeling/covariance/ledoit_wolf.py +51 -26
- snowflake/ml/modeling/covariance/min_cov_det.py +51 -26
- snowflake/ml/modeling/covariance/oas.py +51 -26
- snowflake/ml/modeling/covariance/shrunk_covariance.py +51 -26
- snowflake/ml/modeling/decomposition/dictionary_learning.py +51 -26
- snowflake/ml/modeling/decomposition/factor_analysis.py +51 -26
- snowflake/ml/modeling/decomposition/fast_ica.py +51 -26
- snowflake/ml/modeling/decomposition/incremental_pca.py +51 -26
- snowflake/ml/modeling/decomposition/kernel_pca.py +51 -26
- snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +51 -26
- snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +51 -26
- snowflake/ml/modeling/decomposition/pca.py +51 -26
- snowflake/ml/modeling/decomposition/sparse_pca.py +51 -26
- snowflake/ml/modeling/decomposition/truncated_svd.py +51 -26
- snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +51 -26
- snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +51 -26
- snowflake/ml/modeling/ensemble/ada_boost_classifier.py +51 -26
- snowflake/ml/modeling/ensemble/ada_boost_regressor.py +51 -26
- snowflake/ml/modeling/ensemble/bagging_classifier.py +51 -26
- snowflake/ml/modeling/ensemble/bagging_regressor.py +51 -26
- snowflake/ml/modeling/ensemble/extra_trees_classifier.py +51 -26
- snowflake/ml/modeling/ensemble/extra_trees_regressor.py +51 -26
- snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +51 -26
- snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +51 -26
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +51 -26
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +51 -26
- snowflake/ml/modeling/ensemble/isolation_forest.py +51 -26
- snowflake/ml/modeling/ensemble/random_forest_classifier.py +51 -26
- snowflake/ml/modeling/ensemble/random_forest_regressor.py +51 -26
- snowflake/ml/modeling/ensemble/stacking_regressor.py +51 -26
- snowflake/ml/modeling/ensemble/voting_classifier.py +51 -26
- snowflake/ml/modeling/ensemble/voting_regressor.py +51 -26
- snowflake/ml/modeling/feature_selection/generic_univariate_select.py +51 -26
- snowflake/ml/modeling/feature_selection/select_fdr.py +51 -26
- snowflake/ml/modeling/feature_selection/select_fpr.py +51 -26
- snowflake/ml/modeling/feature_selection/select_fwe.py +51 -26
- snowflake/ml/modeling/feature_selection/select_k_best.py +51 -26
- snowflake/ml/modeling/feature_selection/select_percentile.py +51 -26
- snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +51 -26
- snowflake/ml/modeling/feature_selection/variance_threshold.py +51 -26
- snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +51 -26
- snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +51 -26
- snowflake/ml/modeling/impute/iterative_imputer.py +51 -26
- snowflake/ml/modeling/impute/knn_imputer.py +51 -26
- snowflake/ml/modeling/impute/missing_indicator.py +51 -26
- snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +51 -26
- snowflake/ml/modeling/kernel_approximation/nystroem.py +51 -26
- snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +51 -26
- snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +51 -26
- snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +51 -26
- snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +51 -26
- snowflake/ml/modeling/lightgbm/lgbm_classifier.py +51 -26
- snowflake/ml/modeling/lightgbm/lgbm_regressor.py +51 -26
- snowflake/ml/modeling/linear_model/ard_regression.py +51 -26
- snowflake/ml/modeling/linear_model/bayesian_ridge.py +51 -26
- snowflake/ml/modeling/linear_model/elastic_net.py +51 -26
- snowflake/ml/modeling/linear_model/elastic_net_cv.py +51 -26
- snowflake/ml/modeling/linear_model/gamma_regressor.py +51 -26
- snowflake/ml/modeling/linear_model/huber_regressor.py +51 -26
- snowflake/ml/modeling/linear_model/lars.py +51 -26
- snowflake/ml/modeling/linear_model/lars_cv.py +51 -26
- snowflake/ml/modeling/linear_model/lasso.py +51 -26
- snowflake/ml/modeling/linear_model/lasso_cv.py +51 -26
- snowflake/ml/modeling/linear_model/lasso_lars.py +51 -26
- snowflake/ml/modeling/linear_model/lasso_lars_cv.py +51 -26
- snowflake/ml/modeling/linear_model/lasso_lars_ic.py +51 -26
- snowflake/ml/modeling/linear_model/linear_regression.py +51 -26
- snowflake/ml/modeling/linear_model/logistic_regression.py +51 -26
- snowflake/ml/modeling/linear_model/logistic_regression_cv.py +51 -26
- snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +51 -26
- snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +51 -26
- snowflake/ml/modeling/linear_model/multi_task_lasso.py +51 -26
- snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +51 -26
- snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +51 -26
- snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +51 -26
- snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +51 -26
- snowflake/ml/modeling/linear_model/perceptron.py +51 -26
- snowflake/ml/modeling/linear_model/poisson_regressor.py +51 -26
- snowflake/ml/modeling/linear_model/ransac_regressor.py +51 -26
- snowflake/ml/modeling/linear_model/ridge.py +51 -26
- snowflake/ml/modeling/linear_model/ridge_classifier.py +51 -26
- snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +51 -26
- snowflake/ml/modeling/linear_model/ridge_cv.py +51 -26
- snowflake/ml/modeling/linear_model/sgd_classifier.py +51 -26
- snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +51 -26
- snowflake/ml/modeling/linear_model/sgd_regressor.py +51 -26
- snowflake/ml/modeling/linear_model/theil_sen_regressor.py +51 -26
- snowflake/ml/modeling/linear_model/tweedie_regressor.py +51 -26
- snowflake/ml/modeling/manifold/isomap.py +51 -26
- snowflake/ml/modeling/manifold/mds.py +51 -26
- snowflake/ml/modeling/manifold/spectral_embedding.py +51 -26
- snowflake/ml/modeling/manifold/tsne.py +51 -26
- snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +51 -26
- snowflake/ml/modeling/mixture/gaussian_mixture.py +51 -26
- snowflake/ml/modeling/model_selection/grid_search_cv.py +51 -26
- snowflake/ml/modeling/model_selection/randomized_search_cv.py +51 -26
- snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +51 -26
- snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +51 -26
- snowflake/ml/modeling/multiclass/output_code_classifier.py +51 -26
- snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +51 -26
- snowflake/ml/modeling/naive_bayes/categorical_nb.py +51 -26
- snowflake/ml/modeling/naive_bayes/complement_nb.py +51 -26
- snowflake/ml/modeling/naive_bayes/gaussian_nb.py +51 -26
- snowflake/ml/modeling/naive_bayes/multinomial_nb.py +51 -26
- snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +51 -26
- snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +51 -26
- snowflake/ml/modeling/neighbors/kernel_density.py +51 -26
- snowflake/ml/modeling/neighbors/local_outlier_factor.py +51 -26
- snowflake/ml/modeling/neighbors/nearest_centroid.py +51 -26
- snowflake/ml/modeling/neighbors/nearest_neighbors.py +51 -26
- snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +51 -26
- snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +51 -26
- snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +51 -26
- snowflake/ml/modeling/neural_network/bernoulli_rbm.py +51 -26
- snowflake/ml/modeling/neural_network/mlp_classifier.py +51 -26
- snowflake/ml/modeling/neural_network/mlp_regressor.py +51 -26
- snowflake/ml/modeling/preprocessing/ordinal_encoder.py +2 -0
- snowflake/ml/modeling/preprocessing/polynomial_features.py +51 -26
- snowflake/ml/modeling/semi_supervised/label_propagation.py +51 -26
- snowflake/ml/modeling/semi_supervised/label_spreading.py +51 -26
- snowflake/ml/modeling/svm/linear_svc.py +51 -26
- snowflake/ml/modeling/svm/linear_svr.py +51 -26
- snowflake/ml/modeling/svm/nu_svc.py +51 -26
- snowflake/ml/modeling/svm/nu_svr.py +51 -26
- snowflake/ml/modeling/svm/svc.py +51 -26
- snowflake/ml/modeling/svm/svr.py +51 -26
- snowflake/ml/modeling/tree/decision_tree_classifier.py +51 -26
- snowflake/ml/modeling/tree/decision_tree_regressor.py +51 -26
- snowflake/ml/modeling/tree/extra_tree_classifier.py +51 -26
- snowflake/ml/modeling/tree/extra_tree_regressor.py +51 -26
- snowflake/ml/modeling/xgboost/xgb_classifier.py +51 -26
- snowflake/ml/modeling/xgboost/xgb_regressor.py +51 -26
- snowflake/ml/modeling/xgboost/xgbrf_classifier.py +51 -26
- snowflake/ml/modeling/xgboost/xgbrf_regressor.py +51 -26
- snowflake/ml/registry/model_registry.py +74 -56
- snowflake/ml/version.py +1 -1
- {snowflake_ml_python-1.0.2.dist-info → snowflake_ml_python-1.0.3.dist-info}/METADATA +27 -8
- snowflake_ml_python-1.0.3.dist-info/RECORD +259 -0
- snowflake_ml_python-1.0.2.dist-info/RECORD +0 -246
- {snowflake_ml_python-1.0.2.dist-info → snowflake_ml_python-1.0.3.dist-info}/WHEEL +0 -0
@@ -7,6 +7,7 @@
|
|
7
7
|
#
|
8
8
|
import inspect
|
9
9
|
import os
|
10
|
+
import posixpath
|
10
11
|
from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
|
11
12
|
from uuid import uuid4
|
12
13
|
|
@@ -27,6 +28,7 @@ from snowflake.ml._internal.utils.temp_file_utils import cleanup_temp_files, get
|
|
27
28
|
from snowflake.snowpark import DataFrame, Session
|
28
29
|
from snowflake.snowpark.functions import pandas_udf, sproc
|
29
30
|
from snowflake.snowpark.types import PandasSeries
|
31
|
+
from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
|
30
32
|
|
31
33
|
from snowflake.ml.model.model_signature import (
|
32
34
|
DataType,
|
@@ -260,7 +262,6 @@ class MiniBatchSparsePCA(BaseTransformer):
|
|
260
262
|
sample_weight_col: Optional[str] = None,
|
261
263
|
) -> None:
|
262
264
|
super().__init__()
|
263
|
-
self.id = str(uuid4()).replace("-", "_").upper()
|
264
265
|
deps: Set[str] = set([f'numpy=={np.__version__}', f'scikit-learn=={sklearn.__version__}', f'cloudpickle=={cp.__version__}'])
|
265
266
|
|
266
267
|
self._deps = list(deps)
|
@@ -293,6 +294,15 @@ class MiniBatchSparsePCA(BaseTransformer):
|
|
293
294
|
self.set_drop_input_cols(drop_input_cols)
|
294
295
|
self.set_sample_weight_col(sample_weight_col)
|
295
296
|
|
297
|
+
def _get_rand_id(self) -> str:
|
298
|
+
"""
|
299
|
+
Generate random id to be used in sproc and stage names.
|
300
|
+
|
301
|
+
Returns:
|
302
|
+
Random id string usable in sproc, table, and stage names.
|
303
|
+
"""
|
304
|
+
return str(uuid4()).replace("-", "_").upper()
|
305
|
+
|
296
306
|
def _infer_input_output_cols(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
|
297
307
|
"""
|
298
308
|
Infer `self.input_cols` and `self.output_cols` if they are not explicitly set.
|
@@ -371,7 +381,7 @@ class MiniBatchSparsePCA(BaseTransformer):
|
|
371
381
|
cp.dump(self._sklearn_object, local_transform_file)
|
372
382
|
|
373
383
|
# Create temp stage to run fit.
|
374
|
-
transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self.
|
384
|
+
transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self._get_rand_id())
|
375
385
|
stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {transform_stage_name};"
|
376
386
|
SqlResultValidator(
|
377
387
|
session=session,
|
@@ -384,11 +394,12 @@ class MiniBatchSparsePCA(BaseTransformer):
|
|
384
394
|
expected_value=f"Stage area {transform_stage_name} successfully created."
|
385
395
|
).validate()
|
386
396
|
|
387
|
-
|
397
|
+
# Use posixpath to construct stage paths
|
398
|
+
stage_transform_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
|
399
|
+
stage_result_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
|
388
400
|
local_result_file_name = get_temp_file_path()
|
389
|
-
stage_result_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
|
390
401
|
|
391
|
-
fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self.
|
402
|
+
fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self._get_rand_id())
|
392
403
|
statement_params = telemetry.get_function_usage_statement_params(
|
393
404
|
project=_PROJECT,
|
394
405
|
subproject=_SUBPROJECT,
|
@@ -414,6 +425,7 @@ class MiniBatchSparsePCA(BaseTransformer):
|
|
414
425
|
replace=True,
|
415
426
|
session=session,
|
416
427
|
statement_params=statement_params,
|
428
|
+
anonymous=True
|
417
429
|
)
|
418
430
|
def fit_wrapper_sproc(
|
419
431
|
session: Session,
|
@@ -422,7 +434,8 @@ class MiniBatchSparsePCA(BaseTransformer):
|
|
422
434
|
stage_result_file_name: str,
|
423
435
|
input_cols: List[str],
|
424
436
|
label_cols: List[str],
|
425
|
-
sample_weight_col: Optional[str]
|
437
|
+
sample_weight_col: Optional[str],
|
438
|
+
statement_params: Dict[str, str]
|
426
439
|
) -> str:
|
427
440
|
import cloudpickle as cp
|
428
441
|
import numpy as np
|
@@ -489,15 +502,15 @@ class MiniBatchSparsePCA(BaseTransformer):
|
|
489
502
|
api_calls=[Session.call],
|
490
503
|
custom_tags=dict([("autogen", True)]),
|
491
504
|
)
|
492
|
-
sproc_export_file_name =
|
493
|
-
|
505
|
+
sproc_export_file_name = fit_wrapper_sproc(
|
506
|
+
session,
|
494
507
|
query,
|
495
508
|
stage_transform_file_name,
|
496
509
|
stage_result_file_name,
|
497
510
|
identifier.get_unescaped_names(self.input_cols),
|
498
511
|
identifier.get_unescaped_names(self.label_cols),
|
499
512
|
identifier.get_unescaped_names(self.sample_weight_col),
|
500
|
-
statement_params
|
513
|
+
statement_params,
|
501
514
|
)
|
502
515
|
|
503
516
|
if "|" in sproc_export_file_name:
|
@@ -507,7 +520,7 @@ class MiniBatchSparsePCA(BaseTransformer):
|
|
507
520
|
print("\n".join(fields[1:]))
|
508
521
|
|
509
522
|
session.file.get(
|
510
|
-
|
523
|
+
posixpath.join(stage_result_file_name, sproc_export_file_name),
|
511
524
|
local_result_file_name,
|
512
525
|
statement_params=statement_params
|
513
526
|
)
|
@@ -553,7 +566,7 @@ class MiniBatchSparsePCA(BaseTransformer):
|
|
553
566
|
|
554
567
|
# Register vectorized UDF for batch inference
|
555
568
|
batch_inference_udf_name = "SNOWML_BATCH_INFERENCE_{safe_id}_{method}".format(
|
556
|
-
safe_id=self.
|
569
|
+
safe_id=self._get_rand_id(), method=inference_method)
|
557
570
|
|
558
571
|
# Need to do this since if we use self._sklearn_object directly in the UDF, Snowpark
|
559
572
|
# will try to pickle all of self which fails.
|
@@ -645,7 +658,7 @@ class MiniBatchSparsePCA(BaseTransformer):
|
|
645
658
|
return transformed_pandas_df.to_dict("records")
|
646
659
|
|
647
660
|
batch_inference_table_name = "SNOWML_BATCH_INFERENCE_INPUT_TABLE_{safe_id}".format(
|
648
|
-
safe_id=self.
|
661
|
+
safe_id=self._get_rand_id()
|
649
662
|
)
|
650
663
|
|
651
664
|
pass_through_columns = self._get_pass_through_columns(dataset)
|
@@ -810,11 +823,18 @@ class MiniBatchSparsePCA(BaseTransformer):
|
|
810
823
|
Transformed dataset.
|
811
824
|
"""
|
812
825
|
if isinstance(dataset, DataFrame):
|
826
|
+
expected_type_inferred = ""
|
827
|
+
# when it is classifier, infer the datatype from label columns
|
828
|
+
if expected_type_inferred == "" and 'predict' in self.model_signatures:
|
829
|
+
expected_type_inferred = convert_sp_to_sf_type(
|
830
|
+
self.model_signatures['predict'].outputs[0].as_snowpark_type()
|
831
|
+
)
|
832
|
+
|
813
833
|
output_df = self._batch_inference(
|
814
834
|
dataset=dataset,
|
815
835
|
inference_method="predict",
|
816
836
|
expected_output_cols_list=self.output_cols,
|
817
|
-
expected_output_cols_type=
|
837
|
+
expected_output_cols_type=expected_type_inferred,
|
818
838
|
)
|
819
839
|
elif isinstance(dataset, pd.DataFrame):
|
820
840
|
output_df = self._sklearn_inference(
|
@@ -887,10 +907,10 @@ class MiniBatchSparsePCA(BaseTransformer):
|
|
887
907
|
|
888
908
|
def _get_output_column_names(self, output_cols_prefix: str) -> List[str]:
|
889
909
|
""" Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
|
890
|
-
Returns
|
910
|
+
Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
|
891
911
|
"""
|
892
912
|
if getattr(self._sklearn_object, "classes_", None) is None:
|
893
|
-
return []
|
913
|
+
return [output_cols_prefix]
|
894
914
|
|
895
915
|
classes = self._sklearn_object.classes_
|
896
916
|
if isinstance(classes, numpy.ndarray):
|
@@ -1115,7 +1135,7 @@ class MiniBatchSparsePCA(BaseTransformer):
|
|
1115
1135
|
cp.dump(self._sklearn_object, local_score_file)
|
1116
1136
|
|
1117
1137
|
# Create temp stage to run score.
|
1118
|
-
score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.
|
1138
|
+
score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
|
1119
1139
|
session = dataset._session
|
1120
1140
|
stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {score_stage_name};"
|
1121
1141
|
SqlResultValidator(
|
@@ -1129,8 +1149,9 @@ class MiniBatchSparsePCA(BaseTransformer):
|
|
1129
1149
|
expected_value=f"Stage area {score_stage_name} successfully created."
|
1130
1150
|
).validate()
|
1131
1151
|
|
1132
|
-
|
1133
|
-
|
1152
|
+
# Use posixpath to construct stage paths
|
1153
|
+
stage_score_file_name = posixpath.join(score_stage_name, os.path.basename(local_score_file_name))
|
1154
|
+
score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
|
1134
1155
|
statement_params = telemetry.get_function_usage_statement_params(
|
1135
1156
|
project=_PROJECT,
|
1136
1157
|
subproject=_SUBPROJECT,
|
@@ -1156,6 +1177,7 @@ class MiniBatchSparsePCA(BaseTransformer):
|
|
1156
1177
|
replace=True,
|
1157
1178
|
session=session,
|
1158
1179
|
statement_params=statement_params,
|
1180
|
+
anonymous=True
|
1159
1181
|
)
|
1160
1182
|
def score_wrapper_sproc(
|
1161
1183
|
session: Session,
|
@@ -1163,7 +1185,8 @@ class MiniBatchSparsePCA(BaseTransformer):
|
|
1163
1185
|
stage_score_file_name: str,
|
1164
1186
|
input_cols: List[str],
|
1165
1187
|
label_cols: List[str],
|
1166
|
-
sample_weight_col: Optional[str]
|
1188
|
+
sample_weight_col: Optional[str],
|
1189
|
+
statement_params: Dict[str, str]
|
1167
1190
|
) -> float:
|
1168
1191
|
import cloudpickle as cp
|
1169
1192
|
import numpy as np
|
@@ -1213,14 +1236,14 @@ class MiniBatchSparsePCA(BaseTransformer):
|
|
1213
1236
|
api_calls=[Session.call],
|
1214
1237
|
custom_tags=dict([("autogen", True)]),
|
1215
1238
|
)
|
1216
|
-
score =
|
1217
|
-
|
1239
|
+
score = score_wrapper_sproc(
|
1240
|
+
session,
|
1218
1241
|
query,
|
1219
1242
|
stage_score_file_name,
|
1220
1243
|
identifier.get_unescaped_names(self.input_cols),
|
1221
1244
|
identifier.get_unescaped_names(self.label_cols),
|
1222
1245
|
identifier.get_unescaped_names(self.sample_weight_col),
|
1223
|
-
statement_params
|
1246
|
+
statement_params,
|
1224
1247
|
)
|
1225
1248
|
|
1226
1249
|
cleanup_temp_files([local_score_file_name])
|
@@ -1238,18 +1261,20 @@ class MiniBatchSparsePCA(BaseTransformer):
|
|
1238
1261
|
if self._sklearn_object._estimator_type == 'classifier':
|
1239
1262
|
outputs = _infer_signature(dataset[self.label_cols], "output") # label columns is the desired type for output
|
1240
1263
|
outputs = _rename_features(outputs, self.output_cols) # rename the output columns
|
1241
|
-
self._model_signature_dict["predict"] = ModelSignature(inputs,
|
1264
|
+
self._model_signature_dict["predict"] = ModelSignature(inputs,
|
1265
|
+
([] if self._drop_input_cols else inputs) + outputs)
|
1242
1266
|
# For regressor, the type of predict is float64
|
1243
1267
|
elif self._sklearn_object._estimator_type == 'regressor':
|
1244
1268
|
outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
|
1245
|
-
self._model_signature_dict["predict"] = ModelSignature(inputs,
|
1246
|
-
|
1269
|
+
self._model_signature_dict["predict"] = ModelSignature(inputs,
|
1270
|
+
([] if self._drop_input_cols else inputs) + outputs)
|
1247
1271
|
for prob_func in PROB_FUNCTIONS:
|
1248
1272
|
if hasattr(self, prob_func):
|
1249
1273
|
output_cols_prefix: str = f"{prob_func}_"
|
1250
1274
|
output_column_names = self._get_output_column_names(output_cols_prefix)
|
1251
1275
|
outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
|
1252
|
-
self._model_signature_dict[prob_func] = ModelSignature(inputs,
|
1276
|
+
self._model_signature_dict[prob_func] = ModelSignature(inputs,
|
1277
|
+
([] if self._drop_input_cols else inputs) + outputs)
|
1253
1278
|
|
1254
1279
|
@property
|
1255
1280
|
def model_signatures(self) -> Dict[str, ModelSignature]:
|
@@ -7,6 +7,7 @@
|
|
7
7
|
#
|
8
8
|
import inspect
|
9
9
|
import os
|
10
|
+
import posixpath
|
10
11
|
from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
|
11
12
|
from uuid import uuid4
|
12
13
|
|
@@ -27,6 +28,7 @@ from snowflake.ml._internal.utils.temp_file_utils import cleanup_temp_files, get
|
|
27
28
|
from snowflake.snowpark import DataFrame, Session
|
28
29
|
from snowflake.snowpark.functions import pandas_udf, sproc
|
29
30
|
from snowflake.snowpark.types import PandasSeries
|
31
|
+
from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
|
30
32
|
|
31
33
|
from snowflake.ml.model.model_signature import (
|
32
34
|
DataType,
|
@@ -267,7 +269,6 @@ class PCA(BaseTransformer):
|
|
267
269
|
sample_weight_col: Optional[str] = None,
|
268
270
|
) -> None:
|
269
271
|
super().__init__()
|
270
|
-
self.id = str(uuid4()).replace("-", "_").upper()
|
271
272
|
deps: Set[str] = set([f'numpy=={np.__version__}', f'scikit-learn=={sklearn.__version__}', f'cloudpickle=={cp.__version__}'])
|
272
273
|
|
273
274
|
self._deps = list(deps)
|
@@ -295,6 +296,15 @@ class PCA(BaseTransformer):
|
|
295
296
|
self.set_drop_input_cols(drop_input_cols)
|
296
297
|
self.set_sample_weight_col(sample_weight_col)
|
297
298
|
|
299
|
+
def _get_rand_id(self) -> str:
|
300
|
+
"""
|
301
|
+
Generate random id to be used in sproc and stage names.
|
302
|
+
|
303
|
+
Returns:
|
304
|
+
Random id string usable in sproc, table, and stage names.
|
305
|
+
"""
|
306
|
+
return str(uuid4()).replace("-", "_").upper()
|
307
|
+
|
298
308
|
def _infer_input_output_cols(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
|
299
309
|
"""
|
300
310
|
Infer `self.input_cols` and `self.output_cols` if they are not explicitly set.
|
@@ -373,7 +383,7 @@ class PCA(BaseTransformer):
|
|
373
383
|
cp.dump(self._sklearn_object, local_transform_file)
|
374
384
|
|
375
385
|
# Create temp stage to run fit.
|
376
|
-
transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self.
|
386
|
+
transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self._get_rand_id())
|
377
387
|
stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {transform_stage_name};"
|
378
388
|
SqlResultValidator(
|
379
389
|
session=session,
|
@@ -386,11 +396,12 @@ class PCA(BaseTransformer):
|
|
386
396
|
expected_value=f"Stage area {transform_stage_name} successfully created."
|
387
397
|
).validate()
|
388
398
|
|
389
|
-
|
399
|
+
# Use posixpath to construct stage paths
|
400
|
+
stage_transform_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
|
401
|
+
stage_result_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
|
390
402
|
local_result_file_name = get_temp_file_path()
|
391
|
-
stage_result_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
|
392
403
|
|
393
|
-
fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self.
|
404
|
+
fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self._get_rand_id())
|
394
405
|
statement_params = telemetry.get_function_usage_statement_params(
|
395
406
|
project=_PROJECT,
|
396
407
|
subproject=_SUBPROJECT,
|
@@ -416,6 +427,7 @@ class PCA(BaseTransformer):
|
|
416
427
|
replace=True,
|
417
428
|
session=session,
|
418
429
|
statement_params=statement_params,
|
430
|
+
anonymous=True
|
419
431
|
)
|
420
432
|
def fit_wrapper_sproc(
|
421
433
|
session: Session,
|
@@ -424,7 +436,8 @@ class PCA(BaseTransformer):
|
|
424
436
|
stage_result_file_name: str,
|
425
437
|
input_cols: List[str],
|
426
438
|
label_cols: List[str],
|
427
|
-
sample_weight_col: Optional[str]
|
439
|
+
sample_weight_col: Optional[str],
|
440
|
+
statement_params: Dict[str, str]
|
428
441
|
) -> str:
|
429
442
|
import cloudpickle as cp
|
430
443
|
import numpy as np
|
@@ -491,15 +504,15 @@ class PCA(BaseTransformer):
|
|
491
504
|
api_calls=[Session.call],
|
492
505
|
custom_tags=dict([("autogen", True)]),
|
493
506
|
)
|
494
|
-
sproc_export_file_name =
|
495
|
-
|
507
|
+
sproc_export_file_name = fit_wrapper_sproc(
|
508
|
+
session,
|
496
509
|
query,
|
497
510
|
stage_transform_file_name,
|
498
511
|
stage_result_file_name,
|
499
512
|
identifier.get_unescaped_names(self.input_cols),
|
500
513
|
identifier.get_unescaped_names(self.label_cols),
|
501
514
|
identifier.get_unescaped_names(self.sample_weight_col),
|
502
|
-
statement_params
|
515
|
+
statement_params,
|
503
516
|
)
|
504
517
|
|
505
518
|
if "|" in sproc_export_file_name:
|
@@ -509,7 +522,7 @@ class PCA(BaseTransformer):
|
|
509
522
|
print("\n".join(fields[1:]))
|
510
523
|
|
511
524
|
session.file.get(
|
512
|
-
|
525
|
+
posixpath.join(stage_result_file_name, sproc_export_file_name),
|
513
526
|
local_result_file_name,
|
514
527
|
statement_params=statement_params
|
515
528
|
)
|
@@ -555,7 +568,7 @@ class PCA(BaseTransformer):
|
|
555
568
|
|
556
569
|
# Register vectorized UDF for batch inference
|
557
570
|
batch_inference_udf_name = "SNOWML_BATCH_INFERENCE_{safe_id}_{method}".format(
|
558
|
-
safe_id=self.
|
571
|
+
safe_id=self._get_rand_id(), method=inference_method)
|
559
572
|
|
560
573
|
# Need to do this since if we use self._sklearn_object directly in the UDF, Snowpark
|
561
574
|
# will try to pickle all of self which fails.
|
@@ -647,7 +660,7 @@ class PCA(BaseTransformer):
|
|
647
660
|
return transformed_pandas_df.to_dict("records")
|
648
661
|
|
649
662
|
batch_inference_table_name = "SNOWML_BATCH_INFERENCE_INPUT_TABLE_{safe_id}".format(
|
650
|
-
safe_id=self.
|
663
|
+
safe_id=self._get_rand_id()
|
651
664
|
)
|
652
665
|
|
653
666
|
pass_through_columns = self._get_pass_through_columns(dataset)
|
@@ -812,11 +825,18 @@ class PCA(BaseTransformer):
|
|
812
825
|
Transformed dataset.
|
813
826
|
"""
|
814
827
|
if isinstance(dataset, DataFrame):
|
828
|
+
expected_type_inferred = ""
|
829
|
+
# when it is classifier, infer the datatype from label columns
|
830
|
+
if expected_type_inferred == "" and 'predict' in self.model_signatures:
|
831
|
+
expected_type_inferred = convert_sp_to_sf_type(
|
832
|
+
self.model_signatures['predict'].outputs[0].as_snowpark_type()
|
833
|
+
)
|
834
|
+
|
815
835
|
output_df = self._batch_inference(
|
816
836
|
dataset=dataset,
|
817
837
|
inference_method="predict",
|
818
838
|
expected_output_cols_list=self.output_cols,
|
819
|
-
expected_output_cols_type=
|
839
|
+
expected_output_cols_type=expected_type_inferred,
|
820
840
|
)
|
821
841
|
elif isinstance(dataset, pd.DataFrame):
|
822
842
|
output_df = self._sklearn_inference(
|
@@ -889,10 +909,10 @@ class PCA(BaseTransformer):
|
|
889
909
|
|
890
910
|
def _get_output_column_names(self, output_cols_prefix: str) -> List[str]:
|
891
911
|
""" Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
|
892
|
-
Returns
|
912
|
+
Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
|
893
913
|
"""
|
894
914
|
if getattr(self._sklearn_object, "classes_", None) is None:
|
895
|
-
return []
|
915
|
+
return [output_cols_prefix]
|
896
916
|
|
897
917
|
classes = self._sklearn_object.classes_
|
898
918
|
if isinstance(classes, numpy.ndarray):
|
@@ -1117,7 +1137,7 @@ class PCA(BaseTransformer):
|
|
1117
1137
|
cp.dump(self._sklearn_object, local_score_file)
|
1118
1138
|
|
1119
1139
|
# Create temp stage to run score.
|
1120
|
-
score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.
|
1140
|
+
score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
|
1121
1141
|
session = dataset._session
|
1122
1142
|
stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {score_stage_name};"
|
1123
1143
|
SqlResultValidator(
|
@@ -1131,8 +1151,9 @@ class PCA(BaseTransformer):
|
|
1131
1151
|
expected_value=f"Stage area {score_stage_name} successfully created."
|
1132
1152
|
).validate()
|
1133
1153
|
|
1134
|
-
|
1135
|
-
|
1154
|
+
# Use posixpath to construct stage paths
|
1155
|
+
stage_score_file_name = posixpath.join(score_stage_name, os.path.basename(local_score_file_name))
|
1156
|
+
score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
|
1136
1157
|
statement_params = telemetry.get_function_usage_statement_params(
|
1137
1158
|
project=_PROJECT,
|
1138
1159
|
subproject=_SUBPROJECT,
|
@@ -1158,6 +1179,7 @@ class PCA(BaseTransformer):
|
|
1158
1179
|
replace=True,
|
1159
1180
|
session=session,
|
1160
1181
|
statement_params=statement_params,
|
1182
|
+
anonymous=True
|
1161
1183
|
)
|
1162
1184
|
def score_wrapper_sproc(
|
1163
1185
|
session: Session,
|
@@ -1165,7 +1187,8 @@ class PCA(BaseTransformer):
|
|
1165
1187
|
stage_score_file_name: str,
|
1166
1188
|
input_cols: List[str],
|
1167
1189
|
label_cols: List[str],
|
1168
|
-
sample_weight_col: Optional[str]
|
1190
|
+
sample_weight_col: Optional[str],
|
1191
|
+
statement_params: Dict[str, str]
|
1169
1192
|
) -> float:
|
1170
1193
|
import cloudpickle as cp
|
1171
1194
|
import numpy as np
|
@@ -1215,14 +1238,14 @@ class PCA(BaseTransformer):
|
|
1215
1238
|
api_calls=[Session.call],
|
1216
1239
|
custom_tags=dict([("autogen", True)]),
|
1217
1240
|
)
|
1218
|
-
score =
|
1219
|
-
|
1241
|
+
score = score_wrapper_sproc(
|
1242
|
+
session,
|
1220
1243
|
query,
|
1221
1244
|
stage_score_file_name,
|
1222
1245
|
identifier.get_unescaped_names(self.input_cols),
|
1223
1246
|
identifier.get_unescaped_names(self.label_cols),
|
1224
1247
|
identifier.get_unescaped_names(self.sample_weight_col),
|
1225
|
-
statement_params
|
1248
|
+
statement_params,
|
1226
1249
|
)
|
1227
1250
|
|
1228
1251
|
cleanup_temp_files([local_score_file_name])
|
@@ -1240,18 +1263,20 @@ class PCA(BaseTransformer):
|
|
1240
1263
|
if self._sklearn_object._estimator_type == 'classifier':
|
1241
1264
|
outputs = _infer_signature(dataset[self.label_cols], "output") # label columns is the desired type for output
|
1242
1265
|
outputs = _rename_features(outputs, self.output_cols) # rename the output columns
|
1243
|
-
self._model_signature_dict["predict"] = ModelSignature(inputs,
|
1266
|
+
self._model_signature_dict["predict"] = ModelSignature(inputs,
|
1267
|
+
([] if self._drop_input_cols else inputs) + outputs)
|
1244
1268
|
# For regressor, the type of predict is float64
|
1245
1269
|
elif self._sklearn_object._estimator_type == 'regressor':
|
1246
1270
|
outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
|
1247
|
-
self._model_signature_dict["predict"] = ModelSignature(inputs,
|
1248
|
-
|
1271
|
+
self._model_signature_dict["predict"] = ModelSignature(inputs,
|
1272
|
+
([] if self._drop_input_cols else inputs) + outputs)
|
1249
1273
|
for prob_func in PROB_FUNCTIONS:
|
1250
1274
|
if hasattr(self, prob_func):
|
1251
1275
|
output_cols_prefix: str = f"{prob_func}_"
|
1252
1276
|
output_column_names = self._get_output_column_names(output_cols_prefix)
|
1253
1277
|
outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
|
1254
|
-
self._model_signature_dict[prob_func] = ModelSignature(inputs,
|
1278
|
+
self._model_signature_dict[prob_func] = ModelSignature(inputs,
|
1279
|
+
([] if self._drop_input_cols else inputs) + outputs)
|
1255
1280
|
|
1256
1281
|
@property
|
1257
1282
|
def model_signatures(self) -> Dict[str, ModelSignature]:
|