snowflake-ml-python 1.0.2__py3-none-any.whl → 1.0.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- snowflake/ml/_internal/env_utils.py +2 -1
- snowflake/ml/_internal/file_utils.py +29 -7
- snowflake/ml/_internal/telemetry.py +5 -8
- snowflake/ml/_internal/utils/uri.py +7 -2
- snowflake/ml/model/_deploy_client/image_builds/base_image_builder.py +15 -0
- snowflake/ml/model/_deploy_client/image_builds/client_image_builder.py +259 -0
- snowflake/ml/model/_deploy_client/image_builds/docker_context.py +89 -0
- snowflake/ml/model/_deploy_client/image_builds/gunicorn_run.sh +24 -0
- snowflake/ml/model/_deploy_client/image_builds/inference_server/main.py +118 -0
- snowflake/ml/model/_deploy_client/image_builds/templates/dockerfile_template +40 -0
- snowflake/ml/model/_deploy_client/snowservice/deploy.py +199 -0
- snowflake/ml/model/_deploy_client/snowservice/deploy_options.py +88 -0
- snowflake/ml/model/_deploy_client/snowservice/templates/service_spec_template +24 -0
- snowflake/ml/model/_deploy_client/utils/constants.py +47 -0
- snowflake/ml/model/_deploy_client/utils/snowservice_client.py +178 -0
- snowflake/ml/model/_deploy_client/warehouse/deploy.py +24 -6
- snowflake/ml/model/_deploy_client/warehouse/infer_template.py +5 -2
- snowflake/ml/model/_deployer.py +14 -27
- snowflake/ml/model/_env.py +4 -4
- snowflake/ml/model/_handlers/custom.py +14 -2
- snowflake/ml/model/_handlers/pytorch.py +186 -0
- snowflake/ml/model/_handlers/sklearn.py +14 -9
- snowflake/ml/model/_handlers/snowmlmodel.py +14 -9
- snowflake/ml/model/_handlers/torchscript.py +180 -0
- snowflake/ml/model/_handlers/xgboost.py +19 -9
- snowflake/ml/model/_model.py +3 -2
- snowflake/ml/model/_model_meta.py +12 -7
- snowflake/ml/model/model_signature.py +446 -66
- snowflake/ml/model/type_hints.py +23 -4
- snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +51 -26
- snowflake/ml/modeling/cluster/affinity_propagation.py +51 -26
- snowflake/ml/modeling/cluster/agglomerative_clustering.py +51 -26
- snowflake/ml/modeling/cluster/birch.py +51 -26
- snowflake/ml/modeling/cluster/bisecting_k_means.py +51 -26
- snowflake/ml/modeling/cluster/dbscan.py +51 -26
- snowflake/ml/modeling/cluster/feature_agglomeration.py +51 -26
- snowflake/ml/modeling/cluster/k_means.py +51 -26
- snowflake/ml/modeling/cluster/mean_shift.py +51 -26
- snowflake/ml/modeling/cluster/mini_batch_k_means.py +51 -26
- snowflake/ml/modeling/cluster/optics.py +51 -26
- snowflake/ml/modeling/cluster/spectral_biclustering.py +51 -26
- snowflake/ml/modeling/cluster/spectral_clustering.py +51 -26
- snowflake/ml/modeling/cluster/spectral_coclustering.py +51 -26
- snowflake/ml/modeling/compose/column_transformer.py +51 -26
- snowflake/ml/modeling/compose/transformed_target_regressor.py +51 -26
- snowflake/ml/modeling/covariance/elliptic_envelope.py +51 -26
- snowflake/ml/modeling/covariance/empirical_covariance.py +51 -26
- snowflake/ml/modeling/covariance/graphical_lasso.py +51 -26
- snowflake/ml/modeling/covariance/graphical_lasso_cv.py +51 -26
- snowflake/ml/modeling/covariance/ledoit_wolf.py +51 -26
- snowflake/ml/modeling/covariance/min_cov_det.py +51 -26
- snowflake/ml/modeling/covariance/oas.py +51 -26
- snowflake/ml/modeling/covariance/shrunk_covariance.py +51 -26
- snowflake/ml/modeling/decomposition/dictionary_learning.py +51 -26
- snowflake/ml/modeling/decomposition/factor_analysis.py +51 -26
- snowflake/ml/modeling/decomposition/fast_ica.py +51 -26
- snowflake/ml/modeling/decomposition/incremental_pca.py +51 -26
- snowflake/ml/modeling/decomposition/kernel_pca.py +51 -26
- snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +51 -26
- snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +51 -26
- snowflake/ml/modeling/decomposition/pca.py +51 -26
- snowflake/ml/modeling/decomposition/sparse_pca.py +51 -26
- snowflake/ml/modeling/decomposition/truncated_svd.py +51 -26
- snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +51 -26
- snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +51 -26
- snowflake/ml/modeling/ensemble/ada_boost_classifier.py +51 -26
- snowflake/ml/modeling/ensemble/ada_boost_regressor.py +51 -26
- snowflake/ml/modeling/ensemble/bagging_classifier.py +51 -26
- snowflake/ml/modeling/ensemble/bagging_regressor.py +51 -26
- snowflake/ml/modeling/ensemble/extra_trees_classifier.py +51 -26
- snowflake/ml/modeling/ensemble/extra_trees_regressor.py +51 -26
- snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +51 -26
- snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +51 -26
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +51 -26
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +51 -26
- snowflake/ml/modeling/ensemble/isolation_forest.py +51 -26
- snowflake/ml/modeling/ensemble/random_forest_classifier.py +51 -26
- snowflake/ml/modeling/ensemble/random_forest_regressor.py +51 -26
- snowflake/ml/modeling/ensemble/stacking_regressor.py +51 -26
- snowflake/ml/modeling/ensemble/voting_classifier.py +51 -26
- snowflake/ml/modeling/ensemble/voting_regressor.py +51 -26
- snowflake/ml/modeling/feature_selection/generic_univariate_select.py +51 -26
- snowflake/ml/modeling/feature_selection/select_fdr.py +51 -26
- snowflake/ml/modeling/feature_selection/select_fpr.py +51 -26
- snowflake/ml/modeling/feature_selection/select_fwe.py +51 -26
- snowflake/ml/modeling/feature_selection/select_k_best.py +51 -26
- snowflake/ml/modeling/feature_selection/select_percentile.py +51 -26
- snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +51 -26
- snowflake/ml/modeling/feature_selection/variance_threshold.py +51 -26
- snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +51 -26
- snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +51 -26
- snowflake/ml/modeling/impute/iterative_imputer.py +51 -26
- snowflake/ml/modeling/impute/knn_imputer.py +51 -26
- snowflake/ml/modeling/impute/missing_indicator.py +51 -26
- snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +51 -26
- snowflake/ml/modeling/kernel_approximation/nystroem.py +51 -26
- snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +51 -26
- snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +51 -26
- snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +51 -26
- snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +51 -26
- snowflake/ml/modeling/lightgbm/lgbm_classifier.py +51 -26
- snowflake/ml/modeling/lightgbm/lgbm_regressor.py +51 -26
- snowflake/ml/modeling/linear_model/ard_regression.py +51 -26
- snowflake/ml/modeling/linear_model/bayesian_ridge.py +51 -26
- snowflake/ml/modeling/linear_model/elastic_net.py +51 -26
- snowflake/ml/modeling/linear_model/elastic_net_cv.py +51 -26
- snowflake/ml/modeling/linear_model/gamma_regressor.py +51 -26
- snowflake/ml/modeling/linear_model/huber_regressor.py +51 -26
- snowflake/ml/modeling/linear_model/lars.py +51 -26
- snowflake/ml/modeling/linear_model/lars_cv.py +51 -26
- snowflake/ml/modeling/linear_model/lasso.py +51 -26
- snowflake/ml/modeling/linear_model/lasso_cv.py +51 -26
- snowflake/ml/modeling/linear_model/lasso_lars.py +51 -26
- snowflake/ml/modeling/linear_model/lasso_lars_cv.py +51 -26
- snowflake/ml/modeling/linear_model/lasso_lars_ic.py +51 -26
- snowflake/ml/modeling/linear_model/linear_regression.py +51 -26
- snowflake/ml/modeling/linear_model/logistic_regression.py +51 -26
- snowflake/ml/modeling/linear_model/logistic_regression_cv.py +51 -26
- snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +51 -26
- snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +51 -26
- snowflake/ml/modeling/linear_model/multi_task_lasso.py +51 -26
- snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +51 -26
- snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +51 -26
- snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +51 -26
- snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +51 -26
- snowflake/ml/modeling/linear_model/perceptron.py +51 -26
- snowflake/ml/modeling/linear_model/poisson_regressor.py +51 -26
- snowflake/ml/modeling/linear_model/ransac_regressor.py +51 -26
- snowflake/ml/modeling/linear_model/ridge.py +51 -26
- snowflake/ml/modeling/linear_model/ridge_classifier.py +51 -26
- snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +51 -26
- snowflake/ml/modeling/linear_model/ridge_cv.py +51 -26
- snowflake/ml/modeling/linear_model/sgd_classifier.py +51 -26
- snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +51 -26
- snowflake/ml/modeling/linear_model/sgd_regressor.py +51 -26
- snowflake/ml/modeling/linear_model/theil_sen_regressor.py +51 -26
- snowflake/ml/modeling/linear_model/tweedie_regressor.py +51 -26
- snowflake/ml/modeling/manifold/isomap.py +51 -26
- snowflake/ml/modeling/manifold/mds.py +51 -26
- snowflake/ml/modeling/manifold/spectral_embedding.py +51 -26
- snowflake/ml/modeling/manifold/tsne.py +51 -26
- snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +51 -26
- snowflake/ml/modeling/mixture/gaussian_mixture.py +51 -26
- snowflake/ml/modeling/model_selection/grid_search_cv.py +51 -26
- snowflake/ml/modeling/model_selection/randomized_search_cv.py +51 -26
- snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +51 -26
- snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +51 -26
- snowflake/ml/modeling/multiclass/output_code_classifier.py +51 -26
- snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +51 -26
- snowflake/ml/modeling/naive_bayes/categorical_nb.py +51 -26
- snowflake/ml/modeling/naive_bayes/complement_nb.py +51 -26
- snowflake/ml/modeling/naive_bayes/gaussian_nb.py +51 -26
- snowflake/ml/modeling/naive_bayes/multinomial_nb.py +51 -26
- snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +51 -26
- snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +51 -26
- snowflake/ml/modeling/neighbors/kernel_density.py +51 -26
- snowflake/ml/modeling/neighbors/local_outlier_factor.py +51 -26
- snowflake/ml/modeling/neighbors/nearest_centroid.py +51 -26
- snowflake/ml/modeling/neighbors/nearest_neighbors.py +51 -26
- snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +51 -26
- snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +51 -26
- snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +51 -26
- snowflake/ml/modeling/neural_network/bernoulli_rbm.py +51 -26
- snowflake/ml/modeling/neural_network/mlp_classifier.py +51 -26
- snowflake/ml/modeling/neural_network/mlp_regressor.py +51 -26
- snowflake/ml/modeling/preprocessing/ordinal_encoder.py +2 -0
- snowflake/ml/modeling/preprocessing/polynomial_features.py +51 -26
- snowflake/ml/modeling/semi_supervised/label_propagation.py +51 -26
- snowflake/ml/modeling/semi_supervised/label_spreading.py +51 -26
- snowflake/ml/modeling/svm/linear_svc.py +51 -26
- snowflake/ml/modeling/svm/linear_svr.py +51 -26
- snowflake/ml/modeling/svm/nu_svc.py +51 -26
- snowflake/ml/modeling/svm/nu_svr.py +51 -26
- snowflake/ml/modeling/svm/svc.py +51 -26
- snowflake/ml/modeling/svm/svr.py +51 -26
- snowflake/ml/modeling/tree/decision_tree_classifier.py +51 -26
- snowflake/ml/modeling/tree/decision_tree_regressor.py +51 -26
- snowflake/ml/modeling/tree/extra_tree_classifier.py +51 -26
- snowflake/ml/modeling/tree/extra_tree_regressor.py +51 -26
- snowflake/ml/modeling/xgboost/xgb_classifier.py +51 -26
- snowflake/ml/modeling/xgboost/xgb_regressor.py +51 -26
- snowflake/ml/modeling/xgboost/xgbrf_classifier.py +51 -26
- snowflake/ml/modeling/xgboost/xgbrf_regressor.py +51 -26
- snowflake/ml/registry/model_registry.py +74 -56
- snowflake/ml/version.py +1 -1
- {snowflake_ml_python-1.0.2.dist-info → snowflake_ml_python-1.0.3.dist-info}/METADATA +27 -8
- snowflake_ml_python-1.0.3.dist-info/RECORD +259 -0
- snowflake_ml_python-1.0.2.dist-info/RECORD +0 -246
- {snowflake_ml_python-1.0.2.dist-info → snowflake_ml_python-1.0.3.dist-info}/WHEEL +0 -0
@@ -7,6 +7,7 @@
|
|
7
7
|
#
|
8
8
|
import inspect
|
9
9
|
import os
|
10
|
+
import posixpath
|
10
11
|
from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
|
11
12
|
from uuid import uuid4
|
12
13
|
|
@@ -27,6 +28,7 @@ from snowflake.ml._internal.utils.temp_file_utils import cleanup_temp_files, get
|
|
27
28
|
from snowflake.snowpark import DataFrame, Session
|
28
29
|
from snowflake.snowpark.functions import pandas_udf, sproc
|
29
30
|
from snowflake.snowpark.types import PandasSeries
|
31
|
+
from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
|
30
32
|
|
31
33
|
from snowflake.ml.model.model_signature import (
|
32
34
|
DataType,
|
@@ -210,7 +212,6 @@ class MissingIndicator(BaseTransformer):
|
|
210
212
|
sample_weight_col: Optional[str] = None,
|
211
213
|
) -> None:
|
212
214
|
super().__init__()
|
213
|
-
self.id = str(uuid4()).replace("-", "_").upper()
|
214
215
|
deps: Set[str] = set([f'numpy=={np.__version__}', f'scikit-learn=={sklearn.__version__}', f'cloudpickle=={cp.__version__}'])
|
215
216
|
|
216
217
|
self._deps = list(deps)
|
@@ -233,6 +234,15 @@ class MissingIndicator(BaseTransformer):
|
|
233
234
|
self.set_drop_input_cols(drop_input_cols)
|
234
235
|
self.set_sample_weight_col(sample_weight_col)
|
235
236
|
|
237
|
+
def _get_rand_id(self) -> str:
|
238
|
+
"""
|
239
|
+
Generate random id to be used in sproc and stage names.
|
240
|
+
|
241
|
+
Returns:
|
242
|
+
Random id string usable in sproc, table, and stage names.
|
243
|
+
"""
|
244
|
+
return str(uuid4()).replace("-", "_").upper()
|
245
|
+
|
236
246
|
def _infer_input_output_cols(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
|
237
247
|
"""
|
238
248
|
Infer `self.input_cols` and `self.output_cols` if they are not explicitly set.
|
@@ -311,7 +321,7 @@ class MissingIndicator(BaseTransformer):
|
|
311
321
|
cp.dump(self._sklearn_object, local_transform_file)
|
312
322
|
|
313
323
|
# Create temp stage to run fit.
|
314
|
-
transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self.id)
|
324
|
+
transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self._get_rand_id())
|
315
325
|
stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {transform_stage_name};"
|
316
326
|
SqlResultValidator(
|
317
327
|
session=session,
|
@@ -324,11 +334,12 @@ class MissingIndicator(BaseTransformer):
|
|
324
334
|
expected_value=f"Stage area {transform_stage_name} successfully created."
|
325
335
|
).validate()
|
326
336
|
|
327
|
-
|
337
|
+
# Use posixpath to construct stage paths
|
338
|
+
stage_transform_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
|
339
|
+
stage_result_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
|
328
340
|
local_result_file_name = get_temp_file_path()
|
329
|
-
stage_result_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
|
330
341
|
|
331
|
-
fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self.id)
|
342
|
+
fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self._get_rand_id())
|
332
343
|
statement_params = telemetry.get_function_usage_statement_params(
|
333
344
|
project=_PROJECT,
|
334
345
|
subproject=_SUBPROJECT,
|
@@ -354,6 +365,7 @@ class MissingIndicator(BaseTransformer):
|
|
354
365
|
replace=True,
|
355
366
|
session=session,
|
356
367
|
statement_params=statement_params,
|
368
|
+
anonymous=True
|
357
369
|
)
|
358
370
|
def fit_wrapper_sproc(
|
359
371
|
session: Session,
|
@@ -362,7 +374,8 @@ class MissingIndicator(BaseTransformer):
|
|
362
374
|
stage_result_file_name: str,
|
363
375
|
input_cols: List[str],
|
364
376
|
label_cols: List[str],
|
365
|
-
sample_weight_col: Optional[str]
|
377
|
+
sample_weight_col: Optional[str],
|
378
|
+
statement_params: Dict[str, str]
|
366
379
|
) -> str:
|
367
380
|
import cloudpickle as cp
|
368
381
|
import numpy as np
|
@@ -429,15 +442,15 @@ class MissingIndicator(BaseTransformer):
|
|
429
442
|
api_calls=[Session.call],
|
430
443
|
custom_tags=dict([("autogen", True)]),
|
431
444
|
)
|
432
|
-
sproc_export_file_name =
|
433
|
-
|
445
|
+
sproc_export_file_name = fit_wrapper_sproc(
|
446
|
+
session,
|
434
447
|
query,
|
435
448
|
stage_transform_file_name,
|
436
449
|
stage_result_file_name,
|
437
450
|
identifier.get_unescaped_names(self.input_cols),
|
438
451
|
identifier.get_unescaped_names(self.label_cols),
|
439
452
|
identifier.get_unescaped_names(self.sample_weight_col),
|
440
|
-
statement_params
|
453
|
+
statement_params,
|
441
454
|
)
|
442
455
|
|
443
456
|
if "|" in sproc_export_file_name:
|
@@ -447,7 +460,7 @@ class MissingIndicator(BaseTransformer):
|
|
447
460
|
print("\n".join(fields[1:]))
|
448
461
|
|
449
462
|
session.file.get(
|
450
|
-
|
463
|
+
posixpath.join(stage_result_file_name, sproc_export_file_name),
|
451
464
|
local_result_file_name,
|
452
465
|
statement_params=statement_params
|
453
466
|
)
|
@@ -493,7 +506,7 @@ class MissingIndicator(BaseTransformer):
|
|
493
506
|
|
494
507
|
# Register vectorized UDF for batch inference
|
495
508
|
batch_inference_udf_name = "SNOWML_BATCH_INFERENCE_{safe_id}_{method}".format(
|
496
|
-
safe_id=self.id, method=inference_method)
|
509
|
+
safe_id=self._get_rand_id(), method=inference_method)
|
497
510
|
|
498
511
|
# Need to do this since if we use self._sklearn_object directly in the UDF, Snowpark
|
499
512
|
# will try to pickle all of self which fails.
|
@@ -585,7 +598,7 @@ class MissingIndicator(BaseTransformer):
|
|
585
598
|
return transformed_pandas_df.to_dict("records")
|
586
599
|
|
587
600
|
batch_inference_table_name = "SNOWML_BATCH_INFERENCE_INPUT_TABLE_{safe_id}".format(
|
588
|
-
safe_id=self.id
|
601
|
+
safe_id=self._get_rand_id()
|
589
602
|
)
|
590
603
|
|
591
604
|
pass_through_columns = self._get_pass_through_columns(dataset)
|
@@ -750,11 +763,18 @@ class MissingIndicator(BaseTransformer):
|
|
750
763
|
Transformed dataset.
|
751
764
|
"""
|
752
765
|
if isinstance(dataset, DataFrame):
|
766
|
+
expected_type_inferred = ""
|
767
|
+
# when it is classifier, infer the datatype from label columns
|
768
|
+
if expected_type_inferred == "" and 'predict' in self.model_signatures:
|
769
|
+
expected_type_inferred = convert_sp_to_sf_type(
|
770
|
+
self.model_signatures['predict'].outputs[0].as_snowpark_type()
|
771
|
+
)
|
772
|
+
|
753
773
|
output_df = self._batch_inference(
|
754
774
|
dataset=dataset,
|
755
775
|
inference_method="predict",
|
756
776
|
expected_output_cols_list=self.output_cols,
|
757
|
-
expected_output_cols_type=
|
777
|
+
expected_output_cols_type=expected_type_inferred,
|
758
778
|
)
|
759
779
|
elif isinstance(dataset, pd.DataFrame):
|
760
780
|
output_df = self._sklearn_inference(
|
@@ -827,10 +847,10 @@ class MissingIndicator(BaseTransformer):
|
|
827
847
|
|
828
848
|
def _get_output_column_names(self, output_cols_prefix: str) -> List[str]:
|
829
849
|
""" Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
|
830
|
-
Returns
|
850
|
+
Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
|
831
851
|
"""
|
832
852
|
if getattr(self._sklearn_object, "classes_", None) is None:
|
833
|
-
return []
|
853
|
+
return [output_cols_prefix]
|
834
854
|
|
835
855
|
classes = self._sklearn_object.classes_
|
836
856
|
if isinstance(classes, numpy.ndarray):
|
@@ -1055,7 +1075,7 @@ class MissingIndicator(BaseTransformer):
|
|
1055
1075
|
cp.dump(self._sklearn_object, local_score_file)
|
1056
1076
|
|
1057
1077
|
# Create temp stage to run score.
|
1058
|
-
score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
|
1078
|
+
score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
|
1059
1079
|
session = dataset._session
|
1060
1080
|
stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {score_stage_name};"
|
1061
1081
|
SqlResultValidator(
|
@@ -1069,8 +1089,9 @@ class MissingIndicator(BaseTransformer):
|
|
1069
1089
|
expected_value=f"Stage area {score_stage_name} successfully created."
|
1070
1090
|
).validate()
|
1071
1091
|
|
1072
|
-
|
1073
|
-
|
1092
|
+
# Use posixpath to construct stage paths
|
1093
|
+
stage_score_file_name = posixpath.join(score_stage_name, os.path.basename(local_score_file_name))
|
1094
|
+
score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
|
1074
1095
|
statement_params = telemetry.get_function_usage_statement_params(
|
1075
1096
|
project=_PROJECT,
|
1076
1097
|
subproject=_SUBPROJECT,
|
@@ -1096,6 +1117,7 @@ class MissingIndicator(BaseTransformer):
|
|
1096
1117
|
replace=True,
|
1097
1118
|
session=session,
|
1098
1119
|
statement_params=statement_params,
|
1120
|
+
anonymous=True
|
1099
1121
|
)
|
1100
1122
|
def score_wrapper_sproc(
|
1101
1123
|
session: Session,
|
@@ -1103,7 +1125,8 @@ class MissingIndicator(BaseTransformer):
|
|
1103
1125
|
stage_score_file_name: str,
|
1104
1126
|
input_cols: List[str],
|
1105
1127
|
label_cols: List[str],
|
1106
|
-
sample_weight_col: Optional[str]
|
1128
|
+
sample_weight_col: Optional[str],
|
1129
|
+
statement_params: Dict[str, str]
|
1107
1130
|
) -> float:
|
1108
1131
|
import cloudpickle as cp
|
1109
1132
|
import numpy as np
|
@@ -1153,14 +1176,14 @@ class MissingIndicator(BaseTransformer):
|
|
1153
1176
|
api_calls=[Session.call],
|
1154
1177
|
custom_tags=dict([("autogen", True)]),
|
1155
1178
|
)
|
1156
|
-
score =
|
1157
|
-
|
1179
|
+
score = score_wrapper_sproc(
|
1180
|
+
session,
|
1158
1181
|
query,
|
1159
1182
|
stage_score_file_name,
|
1160
1183
|
identifier.get_unescaped_names(self.input_cols),
|
1161
1184
|
identifier.get_unescaped_names(self.label_cols),
|
1162
1185
|
identifier.get_unescaped_names(self.sample_weight_col),
|
1163
|
-
statement_params
|
1186
|
+
statement_params,
|
1164
1187
|
)
|
1165
1188
|
|
1166
1189
|
cleanup_temp_files([local_score_file_name])
|
@@ -1178,18 +1201,20 @@ class MissingIndicator(BaseTransformer):
|
|
1178
1201
|
if self._sklearn_object._estimator_type == 'classifier':
|
1179
1202
|
outputs = _infer_signature(dataset[self.label_cols], "output") # label columns is the desired type for output
|
1180
1203
|
outputs = _rename_features(outputs, self.output_cols) # rename the output columns
|
1181
|
-
self._model_signature_dict["predict"] = ModelSignature(inputs,
|
1204
|
+
self._model_signature_dict["predict"] = ModelSignature(inputs,
|
1205
|
+
([] if self._drop_input_cols else inputs) + outputs)
|
1182
1206
|
# For regressor, the type of predict is float64
|
1183
1207
|
elif self._sklearn_object._estimator_type == 'regressor':
|
1184
1208
|
outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
|
1185
|
-
self._model_signature_dict["predict"] = ModelSignature(inputs,
|
1186
|
-
|
1209
|
+
self._model_signature_dict["predict"] = ModelSignature(inputs,
|
1210
|
+
([] if self._drop_input_cols else inputs) + outputs)
|
1187
1211
|
for prob_func in PROB_FUNCTIONS:
|
1188
1212
|
if hasattr(self, prob_func):
|
1189
1213
|
output_cols_prefix: str = f"{prob_func}_"
|
1190
1214
|
output_column_names = self._get_output_column_names(output_cols_prefix)
|
1191
1215
|
outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
|
1192
|
-
self._model_signature_dict[prob_func] = ModelSignature(inputs,
|
1216
|
+
self._model_signature_dict[prob_func] = ModelSignature(inputs,
|
1217
|
+
([] if self._drop_input_cols else inputs) + outputs)
|
1193
1218
|
|
1194
1219
|
@property
|
1195
1220
|
def model_signatures(self) -> Dict[str, ModelSignature]:
|
@@ -7,6 +7,7 @@
|
|
7
7
|
#
|
8
8
|
import inspect
|
9
9
|
import os
|
10
|
+
import posixpath
|
10
11
|
from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
|
11
12
|
from uuid import uuid4
|
12
13
|
|
@@ -27,6 +28,7 @@ from snowflake.ml._internal.utils.temp_file_utils import cleanup_temp_files, get
|
|
27
28
|
from snowflake.snowpark import DataFrame, Session
|
28
29
|
from snowflake.snowpark.functions import pandas_udf, sproc
|
29
30
|
from snowflake.snowpark.types import PandasSeries
|
31
|
+
from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
|
30
32
|
|
31
33
|
from snowflake.ml.model.model_signature import (
|
32
34
|
DataType,
|
@@ -187,7 +189,6 @@ class AdditiveChi2Sampler(BaseTransformer):
|
|
187
189
|
sample_weight_col: Optional[str] = None,
|
188
190
|
) -> None:
|
189
191
|
super().__init__()
|
190
|
-
self.id = str(uuid4()).replace("-", "_").upper()
|
191
192
|
deps: Set[str] = set([f'numpy=={np.__version__}', f'scikit-learn=={sklearn.__version__}', f'cloudpickle=={cp.__version__}'])
|
192
193
|
|
193
194
|
self._deps = list(deps)
|
@@ -208,6 +209,15 @@ class AdditiveChi2Sampler(BaseTransformer):
|
|
208
209
|
self.set_drop_input_cols(drop_input_cols)
|
209
210
|
self.set_sample_weight_col(sample_weight_col)
|
210
211
|
|
212
|
+
def _get_rand_id(self) -> str:
|
213
|
+
"""
|
214
|
+
Generate random id to be used in sproc and stage names.
|
215
|
+
|
216
|
+
Returns:
|
217
|
+
Random id string usable in sproc, table, and stage names.
|
218
|
+
"""
|
219
|
+
return str(uuid4()).replace("-", "_").upper()
|
220
|
+
|
211
221
|
def _infer_input_output_cols(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
|
212
222
|
"""
|
213
223
|
Infer `self.input_cols` and `self.output_cols` if they are not explicitly set.
|
@@ -286,7 +296,7 @@ class AdditiveChi2Sampler(BaseTransformer):
|
|
286
296
|
cp.dump(self._sklearn_object, local_transform_file)
|
287
297
|
|
288
298
|
# Create temp stage to run fit.
|
289
|
-
transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self.id)
|
299
|
+
transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self._get_rand_id())
|
290
300
|
stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {transform_stage_name};"
|
291
301
|
SqlResultValidator(
|
292
302
|
session=session,
|
@@ -299,11 +309,12 @@ class AdditiveChi2Sampler(BaseTransformer):
|
|
299
309
|
expected_value=f"Stage area {transform_stage_name} successfully created."
|
300
310
|
).validate()
|
301
311
|
|
302
|
-
|
312
|
+
# Use posixpath to construct stage paths
|
313
|
+
stage_transform_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
|
314
|
+
stage_result_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
|
303
315
|
local_result_file_name = get_temp_file_path()
|
304
|
-
stage_result_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
|
305
316
|
|
306
|
-
fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self.id)
|
317
|
+
fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self._get_rand_id())
|
307
318
|
statement_params = telemetry.get_function_usage_statement_params(
|
308
319
|
project=_PROJECT,
|
309
320
|
subproject=_SUBPROJECT,
|
@@ -329,6 +340,7 @@ class AdditiveChi2Sampler(BaseTransformer):
|
|
329
340
|
replace=True,
|
330
341
|
session=session,
|
331
342
|
statement_params=statement_params,
|
343
|
+
anonymous=True
|
332
344
|
)
|
333
345
|
def fit_wrapper_sproc(
|
334
346
|
session: Session,
|
@@ -337,7 +349,8 @@ class AdditiveChi2Sampler(BaseTransformer):
|
|
337
349
|
stage_result_file_name: str,
|
338
350
|
input_cols: List[str],
|
339
351
|
label_cols: List[str],
|
340
|
-
sample_weight_col: Optional[str]
|
352
|
+
sample_weight_col: Optional[str],
|
353
|
+
statement_params: Dict[str, str]
|
341
354
|
) -> str:
|
342
355
|
import cloudpickle as cp
|
343
356
|
import numpy as np
|
@@ -404,15 +417,15 @@ class AdditiveChi2Sampler(BaseTransformer):
|
|
404
417
|
api_calls=[Session.call],
|
405
418
|
custom_tags=dict([("autogen", True)]),
|
406
419
|
)
|
407
|
-
sproc_export_file_name =
|
408
|
-
|
420
|
+
sproc_export_file_name = fit_wrapper_sproc(
|
421
|
+
session,
|
409
422
|
query,
|
410
423
|
stage_transform_file_name,
|
411
424
|
stage_result_file_name,
|
412
425
|
identifier.get_unescaped_names(self.input_cols),
|
413
426
|
identifier.get_unescaped_names(self.label_cols),
|
414
427
|
identifier.get_unescaped_names(self.sample_weight_col),
|
415
|
-
statement_params
|
428
|
+
statement_params,
|
416
429
|
)
|
417
430
|
|
418
431
|
if "|" in sproc_export_file_name:
|
@@ -422,7 +435,7 @@ class AdditiveChi2Sampler(BaseTransformer):
|
|
422
435
|
print("\n".join(fields[1:]))
|
423
436
|
|
424
437
|
session.file.get(
|
425
|
-
|
438
|
+
posixpath.join(stage_result_file_name, sproc_export_file_name),
|
426
439
|
local_result_file_name,
|
427
440
|
statement_params=statement_params
|
428
441
|
)
|
@@ -468,7 +481,7 @@ class AdditiveChi2Sampler(BaseTransformer):
|
|
468
481
|
|
469
482
|
# Register vectorized UDF for batch inference
|
470
483
|
batch_inference_udf_name = "SNOWML_BATCH_INFERENCE_{safe_id}_{method}".format(
|
471
|
-
safe_id=self.id, method=inference_method)
|
484
|
+
safe_id=self._get_rand_id(), method=inference_method)
|
472
485
|
|
473
486
|
# Need to do this since if we use self._sklearn_object directly in the UDF, Snowpark
|
474
487
|
# will try to pickle all of self which fails.
|
@@ -560,7 +573,7 @@ class AdditiveChi2Sampler(BaseTransformer):
|
|
560
573
|
return transformed_pandas_df.to_dict("records")
|
561
574
|
|
562
575
|
batch_inference_table_name = "SNOWML_BATCH_INFERENCE_INPUT_TABLE_{safe_id}".format(
|
563
|
-
safe_id=self.id
|
576
|
+
safe_id=self._get_rand_id()
|
564
577
|
)
|
565
578
|
|
566
579
|
pass_through_columns = self._get_pass_through_columns(dataset)
|
@@ -725,11 +738,18 @@ class AdditiveChi2Sampler(BaseTransformer):
|
|
725
738
|
Transformed dataset.
|
726
739
|
"""
|
727
740
|
if isinstance(dataset, DataFrame):
|
741
|
+
expected_type_inferred = ""
|
742
|
+
# when it is classifier, infer the datatype from label columns
|
743
|
+
if expected_type_inferred == "" and 'predict' in self.model_signatures:
|
744
|
+
expected_type_inferred = convert_sp_to_sf_type(
|
745
|
+
self.model_signatures['predict'].outputs[0].as_snowpark_type()
|
746
|
+
)
|
747
|
+
|
728
748
|
output_df = self._batch_inference(
|
729
749
|
dataset=dataset,
|
730
750
|
inference_method="predict",
|
731
751
|
expected_output_cols_list=self.output_cols,
|
732
|
-
expected_output_cols_type=
|
752
|
+
expected_output_cols_type=expected_type_inferred,
|
733
753
|
)
|
734
754
|
elif isinstance(dataset, pd.DataFrame):
|
735
755
|
output_df = self._sklearn_inference(
|
@@ -802,10 +822,10 @@ class AdditiveChi2Sampler(BaseTransformer):
|
|
802
822
|
|
803
823
|
def _get_output_column_names(self, output_cols_prefix: str) -> List[str]:
|
804
824
|
""" Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
|
805
|
-
Returns
|
825
|
+
Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
|
806
826
|
"""
|
807
827
|
if getattr(self._sklearn_object, "classes_", None) is None:
|
808
|
-
return []
|
828
|
+
return [output_cols_prefix]
|
809
829
|
|
810
830
|
classes = self._sklearn_object.classes_
|
811
831
|
if isinstance(classes, numpy.ndarray):
|
@@ -1030,7 +1050,7 @@ class AdditiveChi2Sampler(BaseTransformer):
|
|
1030
1050
|
cp.dump(self._sklearn_object, local_score_file)
|
1031
1051
|
|
1032
1052
|
# Create temp stage to run score.
|
1033
|
-
score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
|
1053
|
+
score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
|
1034
1054
|
session = dataset._session
|
1035
1055
|
stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {score_stage_name};"
|
1036
1056
|
SqlResultValidator(
|
@@ -1044,8 +1064,9 @@ class AdditiveChi2Sampler(BaseTransformer):
|
|
1044
1064
|
expected_value=f"Stage area {score_stage_name} successfully created."
|
1045
1065
|
).validate()
|
1046
1066
|
|
1047
|
-
|
1048
|
-
|
1067
|
+
# Use posixpath to construct stage paths
|
1068
|
+
stage_score_file_name = posixpath.join(score_stage_name, os.path.basename(local_score_file_name))
|
1069
|
+
score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
|
1049
1070
|
statement_params = telemetry.get_function_usage_statement_params(
|
1050
1071
|
project=_PROJECT,
|
1051
1072
|
subproject=_SUBPROJECT,
|
@@ -1071,6 +1092,7 @@ class AdditiveChi2Sampler(BaseTransformer):
|
|
1071
1092
|
replace=True,
|
1072
1093
|
session=session,
|
1073
1094
|
statement_params=statement_params,
|
1095
|
+
anonymous=True
|
1074
1096
|
)
|
1075
1097
|
def score_wrapper_sproc(
|
1076
1098
|
session: Session,
|
@@ -1078,7 +1100,8 @@ class AdditiveChi2Sampler(BaseTransformer):
|
|
1078
1100
|
stage_score_file_name: str,
|
1079
1101
|
input_cols: List[str],
|
1080
1102
|
label_cols: List[str],
|
1081
|
-
sample_weight_col: Optional[str]
|
1103
|
+
sample_weight_col: Optional[str],
|
1104
|
+
statement_params: Dict[str, str]
|
1082
1105
|
) -> float:
|
1083
1106
|
import cloudpickle as cp
|
1084
1107
|
import numpy as np
|
@@ -1128,14 +1151,14 @@ class AdditiveChi2Sampler(BaseTransformer):
|
|
1128
1151
|
api_calls=[Session.call],
|
1129
1152
|
custom_tags=dict([("autogen", True)]),
|
1130
1153
|
)
|
1131
|
-
score =
|
1132
|
-
|
1154
|
+
score = score_wrapper_sproc(
|
1155
|
+
session,
|
1133
1156
|
query,
|
1134
1157
|
stage_score_file_name,
|
1135
1158
|
identifier.get_unescaped_names(self.input_cols),
|
1136
1159
|
identifier.get_unescaped_names(self.label_cols),
|
1137
1160
|
identifier.get_unescaped_names(self.sample_weight_col),
|
1138
|
-
statement_params
|
1161
|
+
statement_params,
|
1139
1162
|
)
|
1140
1163
|
|
1141
1164
|
cleanup_temp_files([local_score_file_name])
|
@@ -1153,18 +1176,20 @@ class AdditiveChi2Sampler(BaseTransformer):
|
|
1153
1176
|
if self._sklearn_object._estimator_type == 'classifier':
|
1154
1177
|
outputs = _infer_signature(dataset[self.label_cols], "output") # label columns is the desired type for output
|
1155
1178
|
outputs = _rename_features(outputs, self.output_cols) # rename the output columns
|
1156
|
-
self._model_signature_dict["predict"] = ModelSignature(inputs,
|
1179
|
+
self._model_signature_dict["predict"] = ModelSignature(inputs,
|
1180
|
+
([] if self._drop_input_cols else inputs) + outputs)
|
1157
1181
|
# For regressor, the type of predict is float64
|
1158
1182
|
elif self._sklearn_object._estimator_type == 'regressor':
|
1159
1183
|
outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
|
1160
|
-
self._model_signature_dict["predict"] = ModelSignature(inputs,
|
1161
|
-
|
1184
|
+
self._model_signature_dict["predict"] = ModelSignature(inputs,
|
1185
|
+
([] if self._drop_input_cols else inputs) + outputs)
|
1162
1186
|
for prob_func in PROB_FUNCTIONS:
|
1163
1187
|
if hasattr(self, prob_func):
|
1164
1188
|
output_cols_prefix: str = f"{prob_func}_"
|
1165
1189
|
output_column_names = self._get_output_column_names(output_cols_prefix)
|
1166
1190
|
outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
|
1167
|
-
self._model_signature_dict[prob_func] = ModelSignature(inputs,
|
1191
|
+
self._model_signature_dict[prob_func] = ModelSignature(inputs,
|
1192
|
+
([] if self._drop_input_cols else inputs) + outputs)
|
1168
1193
|
|
1169
1194
|
@property
|
1170
1195
|
def model_signatures(self) -> Dict[str, ModelSignature]:
|