snowflake-ml-python 1.0.1__py3-none-any.whl → 1.0.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- snowflake/ml/_internal/env_utils.py +2 -1
- snowflake/ml/_internal/file_utils.py +35 -40
- snowflake/ml/_internal/telemetry.py +5 -8
- snowflake/ml/_internal/utils/identifier.py +74 -7
- snowflake/ml/_internal/utils/uri.py +7 -2
- snowflake/ml/model/_core_requirements.py +1 -1
- snowflake/ml/model/_deploy_client/image_builds/base_image_builder.py +15 -0
- snowflake/ml/model/_deploy_client/image_builds/client_image_builder.py +259 -0
- snowflake/ml/model/_deploy_client/image_builds/docker_context.py +89 -0
- snowflake/ml/model/_deploy_client/image_builds/gunicorn_run.sh +24 -0
- snowflake/ml/model/_deploy_client/image_builds/inference_server/main.py +118 -0
- snowflake/ml/model/_deploy_client/image_builds/templates/dockerfile_template +40 -0
- snowflake/ml/model/_deploy_client/snowservice/deploy.py +199 -0
- snowflake/ml/model/_deploy_client/snowservice/deploy_options.py +88 -0
- snowflake/ml/model/_deploy_client/snowservice/templates/service_spec_template +24 -0
- snowflake/ml/model/_deploy_client/utils/constants.py +47 -0
- snowflake/ml/model/_deploy_client/utils/snowservice_client.py +178 -0
- snowflake/ml/model/_deploy_client/warehouse/deploy.py +25 -28
- snowflake/ml/model/_deploy_client/warehouse/infer_template.py +7 -4
- snowflake/ml/model/_deployer.py +14 -27
- snowflake/ml/model/_env.py +4 -4
- snowflake/ml/model/_handlers/_base.py +3 -1
- snowflake/ml/model/_handlers/custom.py +14 -2
- snowflake/ml/model/_handlers/pytorch.py +186 -0
- snowflake/ml/model/_handlers/sklearn.py +14 -8
- snowflake/ml/model/_handlers/snowmlmodel.py +14 -9
- snowflake/ml/model/_handlers/torchscript.py +180 -0
- snowflake/ml/model/_handlers/xgboost.py +19 -9
- snowflake/ml/model/_model.py +27 -21
- snowflake/ml/model/_model_meta.py +33 -19
- snowflake/ml/model/model_signature.py +446 -66
- snowflake/ml/model/type_hints.py +28 -15
- snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +79 -43
- snowflake/ml/modeling/cluster/affinity_propagation.py +79 -43
- snowflake/ml/modeling/cluster/agglomerative_clustering.py +79 -43
- snowflake/ml/modeling/cluster/birch.py +79 -43
- snowflake/ml/modeling/cluster/bisecting_k_means.py +79 -43
- snowflake/ml/modeling/cluster/dbscan.py +79 -43
- snowflake/ml/modeling/cluster/feature_agglomeration.py +79 -43
- snowflake/ml/modeling/cluster/k_means.py +79 -43
- snowflake/ml/modeling/cluster/mean_shift.py +79 -43
- snowflake/ml/modeling/cluster/mini_batch_k_means.py +79 -43
- snowflake/ml/modeling/cluster/optics.py +79 -43
- snowflake/ml/modeling/cluster/spectral_biclustering.py +79 -43
- snowflake/ml/modeling/cluster/spectral_clustering.py +79 -43
- snowflake/ml/modeling/cluster/spectral_coclustering.py +79 -43
- snowflake/ml/modeling/compose/column_transformer.py +79 -43
- snowflake/ml/modeling/compose/transformed_target_regressor.py +79 -43
- snowflake/ml/modeling/covariance/elliptic_envelope.py +79 -43
- snowflake/ml/modeling/covariance/empirical_covariance.py +79 -43
- snowflake/ml/modeling/covariance/graphical_lasso.py +79 -43
- snowflake/ml/modeling/covariance/graphical_lasso_cv.py +79 -43
- snowflake/ml/modeling/covariance/ledoit_wolf.py +79 -43
- snowflake/ml/modeling/covariance/min_cov_det.py +79 -43
- snowflake/ml/modeling/covariance/oas.py +79 -43
- snowflake/ml/modeling/covariance/shrunk_covariance.py +79 -43
- snowflake/ml/modeling/decomposition/dictionary_learning.py +79 -43
- snowflake/ml/modeling/decomposition/factor_analysis.py +79 -43
- snowflake/ml/modeling/decomposition/fast_ica.py +79 -43
- snowflake/ml/modeling/decomposition/incremental_pca.py +79 -43
- snowflake/ml/modeling/decomposition/kernel_pca.py +79 -43
- snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +79 -43
- snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +79 -43
- snowflake/ml/modeling/decomposition/pca.py +79 -43
- snowflake/ml/modeling/decomposition/sparse_pca.py +79 -43
- snowflake/ml/modeling/decomposition/truncated_svd.py +79 -43
- snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +79 -43
- snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +79 -43
- snowflake/ml/modeling/ensemble/ada_boost_classifier.py +79 -43
- snowflake/ml/modeling/ensemble/ada_boost_regressor.py +79 -43
- snowflake/ml/modeling/ensemble/bagging_classifier.py +79 -43
- snowflake/ml/modeling/ensemble/bagging_regressor.py +79 -43
- snowflake/ml/modeling/ensemble/extra_trees_classifier.py +79 -43
- snowflake/ml/modeling/ensemble/extra_trees_regressor.py +79 -43
- snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +79 -43
- snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +79 -43
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +79 -43
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +79 -43
- snowflake/ml/modeling/ensemble/isolation_forest.py +79 -43
- snowflake/ml/modeling/ensemble/random_forest_classifier.py +79 -43
- snowflake/ml/modeling/ensemble/random_forest_regressor.py +79 -43
- snowflake/ml/modeling/ensemble/stacking_regressor.py +79 -43
- snowflake/ml/modeling/ensemble/voting_classifier.py +79 -43
- snowflake/ml/modeling/ensemble/voting_regressor.py +79 -43
- snowflake/ml/modeling/feature_selection/generic_univariate_select.py +79 -43
- snowflake/ml/modeling/feature_selection/select_fdr.py +79 -43
- snowflake/ml/modeling/feature_selection/select_fpr.py +79 -43
- snowflake/ml/modeling/feature_selection/select_fwe.py +79 -43
- snowflake/ml/modeling/feature_selection/select_k_best.py +79 -43
- snowflake/ml/modeling/feature_selection/select_percentile.py +79 -43
- snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +79 -43
- snowflake/ml/modeling/feature_selection/variance_threshold.py +79 -43
- snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +79 -43
- snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +79 -43
- snowflake/ml/modeling/impute/iterative_imputer.py +79 -43
- snowflake/ml/modeling/impute/knn_imputer.py +79 -43
- snowflake/ml/modeling/impute/missing_indicator.py +79 -43
- snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +79 -43
- snowflake/ml/modeling/kernel_approximation/nystroem.py +79 -43
- snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +79 -43
- snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +79 -43
- snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +79 -43
- snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +79 -43
- snowflake/ml/modeling/lightgbm/lgbm_classifier.py +79 -43
- snowflake/ml/modeling/lightgbm/lgbm_regressor.py +79 -43
- snowflake/ml/modeling/linear_model/ard_regression.py +79 -43
- snowflake/ml/modeling/linear_model/bayesian_ridge.py +79 -43
- snowflake/ml/modeling/linear_model/elastic_net.py +79 -43
- snowflake/ml/modeling/linear_model/elastic_net_cv.py +79 -43
- snowflake/ml/modeling/linear_model/gamma_regressor.py +79 -43
- snowflake/ml/modeling/linear_model/huber_regressor.py +79 -43
- snowflake/ml/modeling/linear_model/lars.py +79 -43
- snowflake/ml/modeling/linear_model/lars_cv.py +79 -43
- snowflake/ml/modeling/linear_model/lasso.py +79 -43
- snowflake/ml/modeling/linear_model/lasso_cv.py +79 -43
- snowflake/ml/modeling/linear_model/lasso_lars.py +79 -43
- snowflake/ml/modeling/linear_model/lasso_lars_cv.py +79 -43
- snowflake/ml/modeling/linear_model/lasso_lars_ic.py +79 -43
- snowflake/ml/modeling/linear_model/linear_regression.py +79 -43
- snowflake/ml/modeling/linear_model/logistic_regression.py +79 -43
- snowflake/ml/modeling/linear_model/logistic_regression_cv.py +79 -43
- snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +79 -43
- snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +79 -43
- snowflake/ml/modeling/linear_model/multi_task_lasso.py +79 -43
- snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +79 -43
- snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +79 -43
- snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +79 -43
- snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +79 -43
- snowflake/ml/modeling/linear_model/perceptron.py +79 -43
- snowflake/ml/modeling/linear_model/poisson_regressor.py +79 -43
- snowflake/ml/modeling/linear_model/ransac_regressor.py +79 -43
- snowflake/ml/modeling/linear_model/ridge.py +79 -43
- snowflake/ml/modeling/linear_model/ridge_classifier.py +79 -43
- snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +79 -43
- snowflake/ml/modeling/linear_model/ridge_cv.py +79 -43
- snowflake/ml/modeling/linear_model/sgd_classifier.py +79 -43
- snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +79 -43
- snowflake/ml/modeling/linear_model/sgd_regressor.py +79 -43
- snowflake/ml/modeling/linear_model/theil_sen_regressor.py +79 -43
- snowflake/ml/modeling/linear_model/tweedie_regressor.py +79 -43
- snowflake/ml/modeling/manifold/isomap.py +79 -43
- snowflake/ml/modeling/manifold/mds.py +79 -43
- snowflake/ml/modeling/manifold/spectral_embedding.py +79 -43
- snowflake/ml/modeling/manifold/tsne.py +79 -43
- snowflake/ml/modeling/metrics/classification.py +6 -1
- snowflake/ml/modeling/metrics/regression.py +517 -9
- snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +79 -43
- snowflake/ml/modeling/mixture/gaussian_mixture.py +79 -43
- snowflake/ml/modeling/model_selection/grid_search_cv.py +79 -43
- snowflake/ml/modeling/model_selection/randomized_search_cv.py +79 -43
- snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +79 -43
- snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +79 -43
- snowflake/ml/modeling/multiclass/output_code_classifier.py +79 -43
- snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +79 -43
- snowflake/ml/modeling/naive_bayes/categorical_nb.py +79 -43
- snowflake/ml/modeling/naive_bayes/complement_nb.py +79 -43
- snowflake/ml/modeling/naive_bayes/gaussian_nb.py +79 -43
- snowflake/ml/modeling/naive_bayes/multinomial_nb.py +79 -43
- snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +79 -43
- snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +79 -43
- snowflake/ml/modeling/neighbors/kernel_density.py +79 -43
- snowflake/ml/modeling/neighbors/local_outlier_factor.py +79 -43
- snowflake/ml/modeling/neighbors/nearest_centroid.py +79 -43
- snowflake/ml/modeling/neighbors/nearest_neighbors.py +79 -43
- snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +79 -43
- snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +79 -43
- snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +79 -43
- snowflake/ml/modeling/neural_network/bernoulli_rbm.py +79 -43
- snowflake/ml/modeling/neural_network/mlp_classifier.py +79 -43
- snowflake/ml/modeling/neural_network/mlp_regressor.py +79 -43
- snowflake/ml/modeling/pipeline/pipeline.py +24 -0
- snowflake/ml/modeling/preprocessing/one_hot_encoder.py +18 -19
- snowflake/ml/modeling/preprocessing/ordinal_encoder.py +2 -0
- snowflake/ml/modeling/preprocessing/polynomial_features.py +79 -43
- snowflake/ml/modeling/semi_supervised/label_propagation.py +79 -43
- snowflake/ml/modeling/semi_supervised/label_spreading.py +79 -43
- snowflake/ml/modeling/svm/linear_svc.py +79 -43
- snowflake/ml/modeling/svm/linear_svr.py +79 -43
- snowflake/ml/modeling/svm/nu_svc.py +79 -43
- snowflake/ml/modeling/svm/nu_svr.py +79 -43
- snowflake/ml/modeling/svm/svc.py +79 -43
- snowflake/ml/modeling/svm/svr.py +79 -43
- snowflake/ml/modeling/tree/decision_tree_classifier.py +79 -43
- snowflake/ml/modeling/tree/decision_tree_regressor.py +79 -43
- snowflake/ml/modeling/tree/extra_tree_classifier.py +79 -43
- snowflake/ml/modeling/tree/extra_tree_regressor.py +79 -43
- snowflake/ml/modeling/xgboost/xgb_classifier.py +79 -43
- snowflake/ml/modeling/xgboost/xgb_regressor.py +79 -43
- snowflake/ml/modeling/xgboost/xgbrf_classifier.py +79 -43
- snowflake/ml/modeling/xgboost/xgbrf_regressor.py +79 -43
- snowflake/ml/registry/model_registry.py +123 -121
- snowflake/ml/version.py +1 -1
- {snowflake_ml_python-1.0.1.dist-info → snowflake_ml_python-1.0.3.dist-info}/METADATA +50 -8
- snowflake_ml_python-1.0.3.dist-info/RECORD +259 -0
- snowflake_ml_python-1.0.1.dist-info/RECORD +0 -246
- {snowflake_ml_python-1.0.1.dist-info → snowflake_ml_python-1.0.3.dist-info}/WHEEL +0 -0
@@ -800,7 +800,7 @@ class OneHotEncoder(base.BaseTransformer):
|
|
800
800
|
state_df = dataset._session.create_dataframe(state_pandas)
|
801
801
|
|
802
802
|
transformed_dataset = dataset
|
803
|
-
|
803
|
+
original_dataset_columns = transformed_dataset.columns[:]
|
804
804
|
all_output_cols = []
|
805
805
|
for input_col in self.input_cols:
|
806
806
|
output_cols = [
|
@@ -818,7 +818,7 @@ class OneHotEncoder(base.BaseTransformer):
|
|
818
818
|
|
819
819
|
transformed_dataset = self._handle_unknown_in_transform(transformed_dataset)
|
820
820
|
# Reorder columns. Passthrough columns are added at the right to the output of the transformers.
|
821
|
-
transformed_dataset = transformed_dataset[all_output_cols +
|
821
|
+
transformed_dataset = transformed_dataset[all_output_cols + original_dataset_columns]
|
822
822
|
return transformed_dataset
|
823
823
|
|
824
824
|
def _transform_snowpark_sparse_udf(self, dataset: snowpark.DataFrame) -> snowpark.DataFrame:
|
@@ -895,15 +895,14 @@ class OneHotEncoder(base.BaseTransformer):
|
|
895
895
|
Output dataset.
|
896
896
|
"""
|
897
897
|
encoder_sklearn = self.to_sklearn()
|
898
|
-
|
899
898
|
transformed_dataset = encoder_sklearn.transform(dataset[self.input_cols])
|
900
899
|
|
901
|
-
if
|
902
|
-
|
903
|
-
dataset[self.get_output_cols()] = transformed_dataset
|
904
|
-
return dataset
|
900
|
+
if self.sparse:
|
901
|
+
return transformed_dataset
|
905
902
|
|
906
|
-
|
903
|
+
dataset = dataset.copy()
|
904
|
+
dataset[self.get_output_cols()] = transformed_dataset
|
905
|
+
return dataset
|
907
906
|
|
908
907
|
def _create_unfitted_sklearn_object(self) -> preprocessing.OneHotEncoder:
|
909
908
|
sklearn_args = self.get_sklearn_args(
|
@@ -1331,17 +1330,17 @@ class OneHotEncoder(base.BaseTransformer):
|
|
1331
1330
|
Output columns.
|
1332
1331
|
"""
|
1333
1332
|
if self.sparse:
|
1334
|
-
|
1335
|
-
|
1336
|
-
|
1337
|
-
|
1338
|
-
|
1339
|
-
|
1340
|
-
|
1341
|
-
|
1342
|
-
|
1343
|
-
|
1344
|
-
|
1333
|
+
return self.output_cols
|
1334
|
+
|
1335
|
+
output_cols = (
|
1336
|
+
[
|
1337
|
+
identifier.get_inferred_name(col)
|
1338
|
+
for input_col in self.input_cols
|
1339
|
+
for col in self._dense_output_cols_mappings[input_col]
|
1340
|
+
]
|
1341
|
+
if self._dense_output_cols_mappings
|
1342
|
+
else []
|
1343
|
+
)
|
1345
1344
|
return output_cols
|
1346
1345
|
|
1347
1346
|
def _get_dense_output_cols_mappings(self) -> None:
|
@@ -121,6 +121,7 @@ class OrdinalEncoder(base.BaseTransformer):
|
|
121
121
|
self.categories_: Dict[str, type_utils.LiteralNDArrayType] = {}
|
122
122
|
self._categories_list: List[type_utils.LiteralNDArrayType] = []
|
123
123
|
self._missing_indices: Dict[int, int] = {}
|
124
|
+
self._infrequent_enabled = False
|
124
125
|
self._vocab_table_name = "snowml_preprocessing_ordinal_encoder_temp_table_" + uuid.uuid4().hex
|
125
126
|
|
126
127
|
self.set_input_cols(input_cols)
|
@@ -547,6 +548,7 @@ class OrdinalEncoder(base.BaseTransformer):
|
|
547
548
|
if self._is_fitted:
|
548
549
|
encoder.categories_ = self._categories_list
|
549
550
|
encoder._missing_indices = self._missing_indices
|
551
|
+
encoder._infrequent_enabled = self._infrequent_enabled
|
550
552
|
return encoder
|
551
553
|
|
552
554
|
def _validate_keywords(self) -> None:
|
@@ -7,6 +7,7 @@
|
|
7
7
|
#
|
8
8
|
import inspect
|
9
9
|
import os
|
10
|
+
import posixpath
|
10
11
|
from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
|
11
12
|
from uuid import uuid4
|
12
13
|
|
@@ -27,6 +28,7 @@ from snowflake.ml._internal.utils.temp_file_utils import cleanup_temp_files, get
|
|
27
28
|
from snowflake.snowpark import DataFrame, Session
|
28
29
|
from snowflake.snowpark.functions import pandas_udf, sproc
|
29
30
|
from snowflake.snowpark.types import PandasSeries
|
31
|
+
from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
|
30
32
|
|
31
33
|
from snowflake.ml.model.model_signature import (
|
32
34
|
DataType,
|
@@ -208,7 +210,6 @@ class PolynomialFeatures(BaseTransformer):
|
|
208
210
|
sample_weight_col: Optional[str] = None,
|
209
211
|
) -> None:
|
210
212
|
super().__init__()
|
211
|
-
self.id = str(uuid4()).replace("-", "_").upper()
|
212
213
|
deps: Set[str] = set([f'numpy=={np.__version__}', f'scikit-learn=={sklearn.__version__}', f'cloudpickle=={cp.__version__}'])
|
213
214
|
|
214
215
|
self._deps = list(deps)
|
@@ -231,6 +232,15 @@ class PolynomialFeatures(BaseTransformer):
|
|
231
232
|
self.set_drop_input_cols(drop_input_cols)
|
232
233
|
self.set_sample_weight_col(sample_weight_col)
|
233
234
|
|
235
|
+
def _get_rand_id(self) -> str:
|
236
|
+
"""
|
237
|
+
Generate random id to be used in sproc and stage names.
|
238
|
+
|
239
|
+
Returns:
|
240
|
+
Random id string usable in sproc, table, and stage names.
|
241
|
+
"""
|
242
|
+
return str(uuid4()).replace("-", "_").upper()
|
243
|
+
|
234
244
|
def _infer_input_output_cols(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
|
235
245
|
"""
|
236
246
|
Infer `self.input_cols` and `self.output_cols` if they are not explicitly set.
|
@@ -309,7 +319,7 @@ class PolynomialFeatures(BaseTransformer):
|
|
309
319
|
cp.dump(self._sklearn_object, local_transform_file)
|
310
320
|
|
311
321
|
# Create temp stage to run fit.
|
312
|
-
transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self.
|
322
|
+
transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self._get_rand_id())
|
313
323
|
stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {transform_stage_name};"
|
314
324
|
SqlResultValidator(
|
315
325
|
session=session,
|
@@ -322,11 +332,12 @@ class PolynomialFeatures(BaseTransformer):
|
|
322
332
|
expected_value=f"Stage area {transform_stage_name} successfully created."
|
323
333
|
).validate()
|
324
334
|
|
325
|
-
|
335
|
+
# Use posixpath to construct stage paths
|
336
|
+
stage_transform_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
|
337
|
+
stage_result_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
|
326
338
|
local_result_file_name = get_temp_file_path()
|
327
|
-
stage_result_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
|
328
339
|
|
329
|
-
fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self.
|
340
|
+
fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self._get_rand_id())
|
330
341
|
statement_params = telemetry.get_function_usage_statement_params(
|
331
342
|
project=_PROJECT,
|
332
343
|
subproject=_SUBPROJECT,
|
@@ -352,6 +363,7 @@ class PolynomialFeatures(BaseTransformer):
|
|
352
363
|
replace=True,
|
353
364
|
session=session,
|
354
365
|
statement_params=statement_params,
|
366
|
+
anonymous=True
|
355
367
|
)
|
356
368
|
def fit_wrapper_sproc(
|
357
369
|
session: Session,
|
@@ -360,7 +372,8 @@ class PolynomialFeatures(BaseTransformer):
|
|
360
372
|
stage_result_file_name: str,
|
361
373
|
input_cols: List[str],
|
362
374
|
label_cols: List[str],
|
363
|
-
sample_weight_col: Optional[str]
|
375
|
+
sample_weight_col: Optional[str],
|
376
|
+
statement_params: Dict[str, str]
|
364
377
|
) -> str:
|
365
378
|
import cloudpickle as cp
|
366
379
|
import numpy as np
|
@@ -427,15 +440,15 @@ class PolynomialFeatures(BaseTransformer):
|
|
427
440
|
api_calls=[Session.call],
|
428
441
|
custom_tags=dict([("autogen", True)]),
|
429
442
|
)
|
430
|
-
sproc_export_file_name =
|
431
|
-
|
443
|
+
sproc_export_file_name = fit_wrapper_sproc(
|
444
|
+
session,
|
432
445
|
query,
|
433
446
|
stage_transform_file_name,
|
434
447
|
stage_result_file_name,
|
435
448
|
identifier.get_unescaped_names(self.input_cols),
|
436
449
|
identifier.get_unescaped_names(self.label_cols),
|
437
450
|
identifier.get_unescaped_names(self.sample_weight_col),
|
438
|
-
statement_params
|
451
|
+
statement_params,
|
439
452
|
)
|
440
453
|
|
441
454
|
if "|" in sproc_export_file_name:
|
@@ -445,7 +458,7 @@ class PolynomialFeatures(BaseTransformer):
|
|
445
458
|
print("\n".join(fields[1:]))
|
446
459
|
|
447
460
|
session.file.get(
|
448
|
-
|
461
|
+
posixpath.join(stage_result_file_name, sproc_export_file_name),
|
449
462
|
local_result_file_name,
|
450
463
|
statement_params=statement_params
|
451
464
|
)
|
@@ -491,7 +504,7 @@ class PolynomialFeatures(BaseTransformer):
|
|
491
504
|
|
492
505
|
# Register vectorized UDF for batch inference
|
493
506
|
batch_inference_udf_name = "SNOWML_BATCH_INFERENCE_{safe_id}_{method}".format(
|
494
|
-
safe_id=self.
|
507
|
+
safe_id=self._get_rand_id(), method=inference_method)
|
495
508
|
|
496
509
|
# Need to do this since if we use self._sklearn_object directly in the UDF, Snowpark
|
497
510
|
# will try to pickle all of self which fails.
|
@@ -583,7 +596,7 @@ class PolynomialFeatures(BaseTransformer):
|
|
583
596
|
return transformed_pandas_df.to_dict("records")
|
584
597
|
|
585
598
|
batch_inference_table_name = "SNOWML_BATCH_INFERENCE_INPUT_TABLE_{safe_id}".format(
|
586
|
-
safe_id=self.
|
599
|
+
safe_id=self._get_rand_id()
|
587
600
|
)
|
588
601
|
|
589
602
|
pass_through_columns = self._get_pass_through_columns(dataset)
|
@@ -639,26 +652,37 @@ class PolynomialFeatures(BaseTransformer):
|
|
639
652
|
# input cols need to match unquoted / quoted
|
640
653
|
input_cols = self.input_cols
|
641
654
|
unquoted_input_cols = identifier.get_unescaped_names(self.input_cols)
|
655
|
+
quoted_input_cols = identifier.get_escaped_names(unquoted_input_cols)
|
642
656
|
|
643
657
|
estimator = self._sklearn_object
|
644
658
|
|
645
|
-
|
646
|
-
|
647
|
-
|
648
|
-
|
649
|
-
|
650
|
-
|
651
|
-
|
652
|
-
|
653
|
-
|
654
|
-
|
655
|
-
|
656
|
-
|
657
|
-
|
658
|
-
|
659
|
-
|
660
|
-
|
661
|
-
|
659
|
+
features_required_by_estimator = getattr(estimator, "feature_names_in_") if hasattr(estimator, "feature_names_in_") else unquoted_input_cols
|
660
|
+
missing_features = []
|
661
|
+
features_in_dataset = set(dataset.columns)
|
662
|
+
columns_to_select = []
|
663
|
+
for i, f in enumerate(features_required_by_estimator):
|
664
|
+
if (
|
665
|
+
i >= len(input_cols)
|
666
|
+
or (input_cols[i] != f and unquoted_input_cols[i] != f and quoted_input_cols[i] != f)
|
667
|
+
or (input_cols[i] not in features_in_dataset and unquoted_input_cols[i] not in features_in_dataset
|
668
|
+
and quoted_input_cols[i] not in features_in_dataset)
|
669
|
+
):
|
670
|
+
missing_features.append(f)
|
671
|
+
elif input_cols[i] in features_in_dataset:
|
672
|
+
columns_to_select.append(input_cols[i])
|
673
|
+
elif unquoted_input_cols[i] in features_in_dataset:
|
674
|
+
columns_to_select.append(unquoted_input_cols[i])
|
675
|
+
else:
|
676
|
+
columns_to_select.append(quoted_input_cols[i])
|
677
|
+
|
678
|
+
if len(missing_features) > 0:
|
679
|
+
raise ValueError(
|
680
|
+
"The feature names should match with those that were passed during fit.\n"
|
681
|
+
f"Features seen during fit call but not present in the input: {missing_features}\n"
|
682
|
+
f"Features in the input dataframe : {input_cols}\n"
|
683
|
+
)
|
684
|
+
input_df = dataset[columns_to_select]
|
685
|
+
input_df.columns = features_required_by_estimator
|
662
686
|
|
663
687
|
transformed_numpy_array = getattr(estimator, inference_method)(
|
664
688
|
input_df
|
@@ -737,11 +761,18 @@ class PolynomialFeatures(BaseTransformer):
|
|
737
761
|
Transformed dataset.
|
738
762
|
"""
|
739
763
|
if isinstance(dataset, DataFrame):
|
764
|
+
expected_type_inferred = ""
|
765
|
+
# when it is classifier, infer the datatype from label columns
|
766
|
+
if expected_type_inferred == "" and 'predict' in self.model_signatures:
|
767
|
+
expected_type_inferred = convert_sp_to_sf_type(
|
768
|
+
self.model_signatures['predict'].outputs[0].as_snowpark_type()
|
769
|
+
)
|
770
|
+
|
740
771
|
output_df = self._batch_inference(
|
741
772
|
dataset=dataset,
|
742
773
|
inference_method="predict",
|
743
774
|
expected_output_cols_list=self.output_cols,
|
744
|
-
expected_output_cols_type=
|
775
|
+
expected_output_cols_type=expected_type_inferred,
|
745
776
|
)
|
746
777
|
elif isinstance(dataset, pd.DataFrame):
|
747
778
|
output_df = self._sklearn_inference(
|
@@ -814,10 +845,10 @@ class PolynomialFeatures(BaseTransformer):
|
|
814
845
|
|
815
846
|
def _get_output_column_names(self, output_cols_prefix: str) -> List[str]:
|
816
847
|
""" Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
|
817
|
-
Returns
|
848
|
+
Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
|
818
849
|
"""
|
819
850
|
if getattr(self._sklearn_object, "classes_", None) is None:
|
820
|
-
return []
|
851
|
+
return [output_cols_prefix]
|
821
852
|
|
822
853
|
classes = self._sklearn_object.classes_
|
823
854
|
if isinstance(classes, numpy.ndarray):
|
@@ -1042,7 +1073,7 @@ class PolynomialFeatures(BaseTransformer):
|
|
1042
1073
|
cp.dump(self._sklearn_object, local_score_file)
|
1043
1074
|
|
1044
1075
|
# Create temp stage to run score.
|
1045
|
-
score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.
|
1076
|
+
score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
|
1046
1077
|
session = dataset._session
|
1047
1078
|
stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {score_stage_name};"
|
1048
1079
|
SqlResultValidator(
|
@@ -1056,8 +1087,9 @@ class PolynomialFeatures(BaseTransformer):
|
|
1056
1087
|
expected_value=f"Stage area {score_stage_name} successfully created."
|
1057
1088
|
).validate()
|
1058
1089
|
|
1059
|
-
|
1060
|
-
|
1090
|
+
# Use posixpath to construct stage paths
|
1091
|
+
stage_score_file_name = posixpath.join(score_stage_name, os.path.basename(local_score_file_name))
|
1092
|
+
score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
|
1061
1093
|
statement_params = telemetry.get_function_usage_statement_params(
|
1062
1094
|
project=_PROJECT,
|
1063
1095
|
subproject=_SUBPROJECT,
|
@@ -1083,6 +1115,7 @@ class PolynomialFeatures(BaseTransformer):
|
|
1083
1115
|
replace=True,
|
1084
1116
|
session=session,
|
1085
1117
|
statement_params=statement_params,
|
1118
|
+
anonymous=True
|
1086
1119
|
)
|
1087
1120
|
def score_wrapper_sproc(
|
1088
1121
|
session: Session,
|
@@ -1090,7 +1123,8 @@ class PolynomialFeatures(BaseTransformer):
|
|
1090
1123
|
stage_score_file_name: str,
|
1091
1124
|
input_cols: List[str],
|
1092
1125
|
label_cols: List[str],
|
1093
|
-
sample_weight_col: Optional[str]
|
1126
|
+
sample_weight_col: Optional[str],
|
1127
|
+
statement_params: Dict[str, str]
|
1094
1128
|
) -> float:
|
1095
1129
|
import cloudpickle as cp
|
1096
1130
|
import numpy as np
|
@@ -1140,14 +1174,14 @@ class PolynomialFeatures(BaseTransformer):
|
|
1140
1174
|
api_calls=[Session.call],
|
1141
1175
|
custom_tags=dict([("autogen", True)]),
|
1142
1176
|
)
|
1143
|
-
score =
|
1144
|
-
|
1177
|
+
score = score_wrapper_sproc(
|
1178
|
+
session,
|
1145
1179
|
query,
|
1146
1180
|
stage_score_file_name,
|
1147
1181
|
identifier.get_unescaped_names(self.input_cols),
|
1148
1182
|
identifier.get_unescaped_names(self.label_cols),
|
1149
1183
|
identifier.get_unescaped_names(self.sample_weight_col),
|
1150
|
-
statement_params
|
1184
|
+
statement_params,
|
1151
1185
|
)
|
1152
1186
|
|
1153
1187
|
cleanup_temp_files([local_score_file_name])
|
@@ -1165,18 +1199,20 @@ class PolynomialFeatures(BaseTransformer):
|
|
1165
1199
|
if self._sklearn_object._estimator_type == 'classifier':
|
1166
1200
|
outputs = _infer_signature(dataset[self.label_cols], "output") # label columns is the desired type for output
|
1167
1201
|
outputs = _rename_features(outputs, self.output_cols) # rename the output columns
|
1168
|
-
self._model_signature_dict["predict"] = ModelSignature(inputs,
|
1202
|
+
self._model_signature_dict["predict"] = ModelSignature(inputs,
|
1203
|
+
([] if self._drop_input_cols else inputs) + outputs)
|
1169
1204
|
# For regressor, the type of predict is float64
|
1170
1205
|
elif self._sklearn_object._estimator_type == 'regressor':
|
1171
1206
|
outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
|
1172
|
-
self._model_signature_dict["predict"] = ModelSignature(inputs,
|
1173
|
-
|
1207
|
+
self._model_signature_dict["predict"] = ModelSignature(inputs,
|
1208
|
+
([] if self._drop_input_cols else inputs) + outputs)
|
1174
1209
|
for prob_func in PROB_FUNCTIONS:
|
1175
1210
|
if hasattr(self, prob_func):
|
1176
1211
|
output_cols_prefix: str = f"{prob_func}_"
|
1177
1212
|
output_column_names = self._get_output_column_names(output_cols_prefix)
|
1178
1213
|
outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
|
1179
|
-
self._model_signature_dict[prob_func] = ModelSignature(inputs,
|
1214
|
+
self._model_signature_dict[prob_func] = ModelSignature(inputs,
|
1215
|
+
([] if self._drop_input_cols else inputs) + outputs)
|
1180
1216
|
|
1181
1217
|
@property
|
1182
1218
|
def model_signatures(self) -> Dict[str, ModelSignature]:
|