snowflake-ml-python 1.4.0__py3-none-any.whl → 1.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- snowflake/ml/_internal/env_utils.py +11 -1
- snowflake/ml/_internal/utils/identifier.py +3 -1
- snowflake/ml/_internal/utils/sql_identifier.py +2 -6
- snowflake/ml/feature_store/feature_store.py +151 -78
- snowflake/ml/feature_store/feature_view.py +12 -24
- snowflake/ml/fileset/sfcfs.py +56 -50
- snowflake/ml/fileset/stage_fs.py +48 -13
- snowflake/ml/model/_client/model/model_version_impl.py +2 -50
- snowflake/ml/model/_client/ops/model_ops.py +78 -29
- snowflake/ml/model/_client/sql/model.py +23 -2
- snowflake/ml/model/_client/sql/model_version.py +22 -1
- snowflake/ml/model/_model_composer/model_manifest/model_manifest.py +19 -54
- snowflake/ml/model/_model_composer/model_manifest/model_manifest_schema.py +8 -1
- snowflake/ml/model/_model_composer/model_method/model_method.py +6 -10
- snowflake/ml/model/_packager/model_handlers/catboost.py +206 -0
- snowflake/ml/model/_packager/model_handlers/lightgbm.py +218 -0
- snowflake/ml/model/_packager/model_handlers/sklearn.py +3 -0
- snowflake/ml/model/_packager/model_meta/_core_requirements.py +1 -1
- snowflake/ml/model/_packager/model_meta/model_meta.py +36 -6
- snowflake/ml/model/_packager/model_meta/model_meta_schema.py +20 -1
- snowflake/ml/model/_packager/model_meta_migrator/migrator_plans.py +3 -1
- snowflake/ml/model/_packager/model_packager.py +2 -2
- snowflake/ml/model/{_model_composer/model_runtime/_runtime_requirements.py → _packager/model_runtime/_snowml_inference_alternative_requirements.py} +1 -1
- snowflake/ml/model/_packager/model_runtime/model_runtime.py +137 -0
- snowflake/ml/model/type_hints.py +21 -2
- snowflake/ml/modeling/_internal/estimator_utils.py +16 -11
- snowflake/ml/modeling/_internal/local_implementations/pandas_handlers.py +4 -1
- snowflake/ml/modeling/_internal/snowpark_implementations/distributed_hpo_trainer.py +13 -14
- snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_handlers.py +5 -5
- snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +195 -123
- snowflake/ml/modeling/cluster/affinity_propagation.py +195 -123
- snowflake/ml/modeling/cluster/agglomerative_clustering.py +195 -123
- snowflake/ml/modeling/cluster/birch.py +195 -123
- snowflake/ml/modeling/cluster/bisecting_k_means.py +195 -123
- snowflake/ml/modeling/cluster/dbscan.py +195 -123
- snowflake/ml/modeling/cluster/feature_agglomeration.py +195 -123
- snowflake/ml/modeling/cluster/k_means.py +195 -123
- snowflake/ml/modeling/cluster/mean_shift.py +195 -123
- snowflake/ml/modeling/cluster/mini_batch_k_means.py +195 -123
- snowflake/ml/modeling/cluster/optics.py +195 -123
- snowflake/ml/modeling/cluster/spectral_biclustering.py +195 -123
- snowflake/ml/modeling/cluster/spectral_clustering.py +195 -123
- snowflake/ml/modeling/cluster/spectral_coclustering.py +195 -123
- snowflake/ml/modeling/compose/column_transformer.py +195 -123
- snowflake/ml/modeling/compose/transformed_target_regressor.py +195 -123
- snowflake/ml/modeling/covariance/elliptic_envelope.py +195 -123
- snowflake/ml/modeling/covariance/empirical_covariance.py +195 -123
- snowflake/ml/modeling/covariance/graphical_lasso.py +195 -123
- snowflake/ml/modeling/covariance/graphical_lasso_cv.py +195 -123
- snowflake/ml/modeling/covariance/ledoit_wolf.py +195 -123
- snowflake/ml/modeling/covariance/min_cov_det.py +195 -123
- snowflake/ml/modeling/covariance/oas.py +195 -123
- snowflake/ml/modeling/covariance/shrunk_covariance.py +195 -123
- snowflake/ml/modeling/decomposition/dictionary_learning.py +195 -123
- snowflake/ml/modeling/decomposition/factor_analysis.py +195 -123
- snowflake/ml/modeling/decomposition/fast_ica.py +195 -123
- snowflake/ml/modeling/decomposition/incremental_pca.py +195 -123
- snowflake/ml/modeling/decomposition/kernel_pca.py +195 -123
- snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +195 -123
- snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +195 -123
- snowflake/ml/modeling/decomposition/pca.py +195 -123
- snowflake/ml/modeling/decomposition/sparse_pca.py +195 -123
- snowflake/ml/modeling/decomposition/truncated_svd.py +195 -123
- snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +195 -123
- snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +195 -123
- snowflake/ml/modeling/ensemble/ada_boost_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/ada_boost_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/bagging_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/bagging_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/extra_trees_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/extra_trees_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/isolation_forest.py +195 -123
- snowflake/ml/modeling/ensemble/random_forest_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/random_forest_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/stacking_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/voting_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/voting_regressor.py +195 -123
- snowflake/ml/modeling/feature_selection/generic_univariate_select.py +195 -123
- snowflake/ml/modeling/feature_selection/select_fdr.py +195 -123
- snowflake/ml/modeling/feature_selection/select_fpr.py +195 -123
- snowflake/ml/modeling/feature_selection/select_fwe.py +195 -123
- snowflake/ml/modeling/feature_selection/select_k_best.py +195 -123
- snowflake/ml/modeling/feature_selection/select_percentile.py +195 -123
- snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +195 -123
- snowflake/ml/modeling/feature_selection/variance_threshold.py +195 -123
- snowflake/ml/modeling/framework/_utils.py +8 -1
- snowflake/ml/modeling/framework/base.py +9 -1
- snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +195 -123
- snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +195 -123
- snowflake/ml/modeling/impute/iterative_imputer.py +195 -123
- snowflake/ml/modeling/impute/knn_imputer.py +195 -123
- snowflake/ml/modeling/impute/missing_indicator.py +195 -123
- snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +195 -123
- snowflake/ml/modeling/kernel_approximation/nystroem.py +195 -123
- snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +195 -123
- snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +195 -123
- snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +195 -123
- snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +195 -123
- snowflake/ml/modeling/lightgbm/lgbm_classifier.py +195 -123
- snowflake/ml/modeling/lightgbm/lgbm_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/ard_regression.py +195 -123
- snowflake/ml/modeling/linear_model/bayesian_ridge.py +195 -123
- snowflake/ml/modeling/linear_model/elastic_net.py +195 -123
- snowflake/ml/modeling/linear_model/elastic_net_cv.py +195 -123
- snowflake/ml/modeling/linear_model/gamma_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/huber_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/lars.py +195 -123
- snowflake/ml/modeling/linear_model/lars_cv.py +195 -123
- snowflake/ml/modeling/linear_model/lasso.py +195 -123
- snowflake/ml/modeling/linear_model/lasso_cv.py +195 -123
- snowflake/ml/modeling/linear_model/lasso_lars.py +195 -123
- snowflake/ml/modeling/linear_model/lasso_lars_cv.py +195 -123
- snowflake/ml/modeling/linear_model/lasso_lars_ic.py +195 -123
- snowflake/ml/modeling/linear_model/linear_regression.py +195 -123
- snowflake/ml/modeling/linear_model/logistic_regression.py +195 -123
- snowflake/ml/modeling/linear_model/logistic_regression_cv.py +195 -123
- snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +195 -123
- snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +195 -123
- snowflake/ml/modeling/linear_model/multi_task_lasso.py +195 -123
- snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +195 -123
- snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +195 -123
- snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +195 -123
- snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/perceptron.py +195 -123
- snowflake/ml/modeling/linear_model/poisson_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/ransac_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/ridge.py +195 -123
- snowflake/ml/modeling/linear_model/ridge_classifier.py +195 -123
- snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +195 -123
- snowflake/ml/modeling/linear_model/ridge_cv.py +195 -123
- snowflake/ml/modeling/linear_model/sgd_classifier.py +195 -123
- snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +195 -123
- snowflake/ml/modeling/linear_model/sgd_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/theil_sen_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/tweedie_regressor.py +195 -123
- snowflake/ml/modeling/manifold/isomap.py +195 -123
- snowflake/ml/modeling/manifold/mds.py +195 -123
- snowflake/ml/modeling/manifold/spectral_embedding.py +195 -123
- snowflake/ml/modeling/manifold/tsne.py +195 -123
- snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +195 -123
- snowflake/ml/modeling/mixture/gaussian_mixture.py +195 -123
- snowflake/ml/modeling/model_selection/grid_search_cv.py +42 -18
- snowflake/ml/modeling/model_selection/randomized_search_cv.py +42 -18
- snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +195 -123
- snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +195 -123
- snowflake/ml/modeling/multiclass/output_code_classifier.py +195 -123
- snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +195 -123
- snowflake/ml/modeling/naive_bayes/categorical_nb.py +195 -123
- snowflake/ml/modeling/naive_bayes/complement_nb.py +195 -123
- snowflake/ml/modeling/naive_bayes/gaussian_nb.py +195 -123
- snowflake/ml/modeling/naive_bayes/multinomial_nb.py +195 -123
- snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +195 -123
- snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +195 -123
- snowflake/ml/modeling/neighbors/kernel_density.py +195 -123
- snowflake/ml/modeling/neighbors/local_outlier_factor.py +195 -123
- snowflake/ml/modeling/neighbors/nearest_centroid.py +195 -123
- snowflake/ml/modeling/neighbors/nearest_neighbors.py +195 -123
- snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +195 -123
- snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +195 -123
- snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +195 -123
- snowflake/ml/modeling/neural_network/bernoulli_rbm.py +195 -123
- snowflake/ml/modeling/neural_network/mlp_classifier.py +195 -123
- snowflake/ml/modeling/neural_network/mlp_regressor.py +195 -123
- snowflake/ml/modeling/pipeline/pipeline.py +4 -4
- snowflake/ml/modeling/preprocessing/binarizer.py +1 -5
- snowflake/ml/modeling/preprocessing/k_bins_discretizer.py +1 -5
- snowflake/ml/modeling/preprocessing/label_encoder.py +1 -5
- snowflake/ml/modeling/preprocessing/max_abs_scaler.py +1 -5
- snowflake/ml/modeling/preprocessing/min_max_scaler.py +10 -12
- snowflake/ml/modeling/preprocessing/normalizer.py +1 -5
- snowflake/ml/modeling/preprocessing/one_hot_encoder.py +1 -5
- snowflake/ml/modeling/preprocessing/ordinal_encoder.py +1 -5
- snowflake/ml/modeling/preprocessing/polynomial_features.py +195 -123
- snowflake/ml/modeling/preprocessing/robust_scaler.py +1 -5
- snowflake/ml/modeling/preprocessing/standard_scaler.py +11 -11
- snowflake/ml/modeling/semi_supervised/label_propagation.py +195 -123
- snowflake/ml/modeling/semi_supervised/label_spreading.py +195 -123
- snowflake/ml/modeling/svm/linear_svc.py +195 -123
- snowflake/ml/modeling/svm/linear_svr.py +195 -123
- snowflake/ml/modeling/svm/nu_svc.py +195 -123
- snowflake/ml/modeling/svm/nu_svr.py +195 -123
- snowflake/ml/modeling/svm/svc.py +195 -123
- snowflake/ml/modeling/svm/svr.py +195 -123
- snowflake/ml/modeling/tree/decision_tree_classifier.py +195 -123
- snowflake/ml/modeling/tree/decision_tree_regressor.py +195 -123
- snowflake/ml/modeling/tree/extra_tree_classifier.py +195 -123
- snowflake/ml/modeling/tree/extra_tree_regressor.py +195 -123
- snowflake/ml/modeling/xgboost/xgb_classifier.py +195 -123
- snowflake/ml/modeling/xgboost/xgb_regressor.py +195 -123
- snowflake/ml/modeling/xgboost/xgbrf_classifier.py +195 -123
- snowflake/ml/modeling/xgboost/xgbrf_regressor.py +195 -123
- snowflake/ml/registry/registry.py +1 -1
- snowflake/ml/version.py +1 -1
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/METADATA +68 -57
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/RECORD +202 -200
- snowflake/ml/model/_model_composer/model_runtime/model_runtime.py +0 -97
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/LICENSE.txt +0 -0
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/WHEEL +0 -0
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/top_level.txt +0 -0
@@ -33,6 +33,15 @@ from snowflake.ml.modeling._internal.transformer_protocols import (
|
|
33
33
|
BatchInferenceKwargsTypedDict,
|
34
34
|
ScoreKwargsTypedDict
|
35
35
|
)
|
36
|
+
from snowflake.ml.model._signatures import utils as model_signature_utils
|
37
|
+
from snowflake.ml.model.model_signature import (
|
38
|
+
BaseFeatureSpec,
|
39
|
+
DataType,
|
40
|
+
FeatureSpec,
|
41
|
+
ModelSignature,
|
42
|
+
_infer_signature,
|
43
|
+
_rename_signature_with_snowflake_identifiers,
|
44
|
+
)
|
36
45
|
|
37
46
|
from snowflake.ml.modeling._internal.model_transformer_builder import ModelTransformerBuilder
|
38
47
|
|
@@ -43,16 +52,6 @@ from snowflake.ml.modeling._internal.estimator_utils import (
|
|
43
52
|
validate_sklearn_args,
|
44
53
|
)
|
45
54
|
|
46
|
-
from snowflake.ml.model.model_signature import (
|
47
|
-
DataType,
|
48
|
-
FeatureSpec,
|
49
|
-
ModelSignature,
|
50
|
-
_infer_signature,
|
51
|
-
_rename_signature_with_snowflake_identifiers,
|
52
|
-
BaseFeatureSpec,
|
53
|
-
)
|
54
|
-
from snowflake.ml.model._signatures import utils as model_signature_utils
|
55
|
-
|
56
55
|
_PROJECT = "ModelDevelopment"
|
57
56
|
# Derive subproject from module name by removing "sklearn"
|
58
57
|
# and converting module name from underscore to CamelCase
|
@@ -223,12 +222,7 @@ class PolynomialCountSketch(BaseTransformer):
|
|
223
222
|
)
|
224
223
|
return selected_cols
|
225
224
|
|
226
|
-
|
227
|
-
project=_PROJECT,
|
228
|
-
subproject=_SUBPROJECT,
|
229
|
-
custom_tags=dict([("autogen", True)]),
|
230
|
-
)
|
231
|
-
def fit(self, dataset: Union[DataFrame, pd.DataFrame]) -> "PolynomialCountSketch":
|
225
|
+
def _fit(self, dataset: Union[DataFrame, pd.DataFrame]) -> "PolynomialCountSketch":
|
232
226
|
"""Fit the model with X
|
233
227
|
For more details on this function, see [sklearn.kernel_approximation.PolynomialCountSketch.fit]
|
234
228
|
(https://scikit-learn.org/stable/modules/generated/sklearn.kernel_approximation.PolynomialCountSketch.html#sklearn.kernel_approximation.PolynomialCountSketch.fit)
|
@@ -255,12 +249,14 @@ class PolynomialCountSketch(BaseTransformer):
|
|
255
249
|
|
256
250
|
self._snowpark_cols = dataset.select(self.input_cols).columns
|
257
251
|
|
258
|
-
|
252
|
+
# If we are already in a stored procedure, no need to kick off another one.
|
259
253
|
if SNOWML_SPROC_ENV in os.environ:
|
260
254
|
statement_params = telemetry.get_function_usage_statement_params(
|
261
255
|
project=_PROJECT,
|
262
256
|
subproject=_SUBPROJECT,
|
263
|
-
function_name=telemetry.get_statement_params_full_func_name(
|
257
|
+
function_name=telemetry.get_statement_params_full_func_name(
|
258
|
+
inspect.currentframe(), PolynomialCountSketch.__class__.__name__
|
259
|
+
),
|
264
260
|
api_calls=[Session.call],
|
265
261
|
custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
|
266
262
|
)
|
@@ -281,7 +277,7 @@ class PolynomialCountSketch(BaseTransformer):
|
|
281
277
|
)
|
282
278
|
self._sklearn_object = model_trainer.train()
|
283
279
|
self._is_fitted = True
|
284
|
-
self.
|
280
|
+
self._generate_model_signatures(dataset)
|
285
281
|
return self
|
286
282
|
|
287
283
|
def _batch_inference_validate_snowpark(
|
@@ -355,7 +351,9 @@ class PolynomialCountSketch(BaseTransformer):
|
|
355
351
|
# when it is classifier, infer the datatype from label columns
|
356
352
|
if expected_type_inferred == "" and 'predict' in self.model_signatures:
|
357
353
|
# Batch inference takes a single expected output column type. Use the first columns type for now.
|
358
|
-
label_cols_signatures = [
|
354
|
+
label_cols_signatures = [
|
355
|
+
row for row in self.model_signatures['predict'].outputs if row.name in self.output_cols
|
356
|
+
]
|
359
357
|
if len(label_cols_signatures) == 0:
|
360
358
|
error_str = f"Output columns {self.output_cols} do not match model signatures {self.model_signatures['predict'].outputs}."
|
361
359
|
raise exceptions.SnowflakeMLException(
|
@@ -363,25 +361,22 @@ class PolynomialCountSketch(BaseTransformer):
|
|
363
361
|
original_exception=ValueError(error_str),
|
364
362
|
)
|
365
363
|
|
366
|
-
expected_type_inferred = convert_sp_to_sf_type(
|
367
|
-
label_cols_signatures[0].as_snowpark_type()
|
368
|
-
)
|
364
|
+
expected_type_inferred = convert_sp_to_sf_type(label_cols_signatures[0].as_snowpark_type())
|
369
365
|
|
370
366
|
self._deps = self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
|
371
|
-
assert isinstance(
|
367
|
+
assert isinstance(
|
368
|
+
dataset._session, Session
|
369
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
372
370
|
|
373
371
|
transform_kwargs = dict(
|
374
|
-
session
|
375
|
-
dependencies
|
376
|
-
drop_input_cols
|
377
|
-
expected_output_cols_type
|
372
|
+
session=dataset._session,
|
373
|
+
dependencies=self._deps,
|
374
|
+
drop_input_cols=self._drop_input_cols,
|
375
|
+
expected_output_cols_type=expected_type_inferred,
|
378
376
|
)
|
379
377
|
|
380
378
|
elif isinstance(dataset, pd.DataFrame):
|
381
|
-
transform_kwargs = dict(
|
382
|
-
snowpark_input_cols = self._snowpark_cols,
|
383
|
-
drop_input_cols = self._drop_input_cols
|
384
|
-
)
|
379
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
385
380
|
|
386
381
|
transform_handlers = ModelTransformerBuilder.build(
|
387
382
|
dataset=dataset,
|
@@ -423,7 +418,7 @@ class PolynomialCountSketch(BaseTransformer):
|
|
423
418
|
Transformed dataset.
|
424
419
|
"""
|
425
420
|
super()._check_dataset_type(dataset)
|
426
|
-
inference_method="transform"
|
421
|
+
inference_method = "transform"
|
427
422
|
|
428
423
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
429
424
|
# are specific to the type of dataset used.
|
@@ -460,17 +455,14 @@ class PolynomialCountSketch(BaseTransformer):
|
|
460
455
|
assert isinstance(dataset._session, Session) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
461
456
|
|
462
457
|
transform_kwargs = dict(
|
463
|
-
session
|
464
|
-
dependencies
|
465
|
-
drop_input_cols
|
466
|
-
expected_output_cols_type
|
458
|
+
session=dataset._session,
|
459
|
+
dependencies=self._deps,
|
460
|
+
drop_input_cols=self._drop_input_cols,
|
461
|
+
expected_output_cols_type=expected_dtype,
|
467
462
|
)
|
468
463
|
|
469
464
|
elif isinstance(dataset, pd.DataFrame):
|
470
|
-
transform_kwargs = dict(
|
471
|
-
snowpark_input_cols = self._snowpark_cols,
|
472
|
-
drop_input_cols = self._drop_input_cols
|
473
|
-
)
|
465
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
474
466
|
|
475
467
|
transform_handlers = ModelTransformerBuilder.build(
|
476
468
|
dataset=dataset,
|
@@ -489,7 +481,11 @@ class PolynomialCountSketch(BaseTransformer):
|
|
489
481
|
return output_df
|
490
482
|
|
491
483
|
@available_if(original_estimator_has_callable("fit_predict")) # type: ignore[misc]
|
492
|
-
def fit_predict(
|
484
|
+
def fit_predict(
|
485
|
+
self,
|
486
|
+
dataset: Union[DataFrame, pd.DataFrame],
|
487
|
+
output_cols_prefix: str = "fit_predict_",
|
488
|
+
) -> Union[DataFrame, pd.DataFrame]:
|
493
489
|
""" Method not supported for this class.
|
494
490
|
|
495
491
|
|
@@ -514,7 +510,9 @@ class PolynomialCountSketch(BaseTransformer):
|
|
514
510
|
)
|
515
511
|
output_result, fitted_estimator = model_trainer.train_fit_predict(
|
516
512
|
drop_input_cols=self._drop_input_cols,
|
517
|
-
expected_output_cols_list=
|
513
|
+
expected_output_cols_list=(
|
514
|
+
self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
|
515
|
+
),
|
518
516
|
)
|
519
517
|
self._sklearn_object = fitted_estimator
|
520
518
|
self._is_fitted = True
|
@@ -531,6 +529,62 @@ class PolynomialCountSketch(BaseTransformer):
|
|
531
529
|
assert self._sklearn_object is not None
|
532
530
|
return self._sklearn_object.embedding_
|
533
531
|
|
532
|
+
|
533
|
+
def _get_output_column_names(self, output_cols_prefix: str, output_cols: Optional[List[str]] = None) -> List[str]:
|
534
|
+
""" Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
|
535
|
+
Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
|
536
|
+
"""
|
537
|
+
output_cols_prefix = identifier.resolve_identifier(output_cols_prefix)
|
538
|
+
# The following condition is introduced for kneighbors methods, and not used in other methods
|
539
|
+
if output_cols:
|
540
|
+
output_cols = [
|
541
|
+
identifier.concat_names([output_cols_prefix, identifier.resolve_identifier(c)])
|
542
|
+
for c in output_cols
|
543
|
+
]
|
544
|
+
elif getattr(self._sklearn_object, "classes_", None) is None:
|
545
|
+
output_cols = [output_cols_prefix]
|
546
|
+
elif self._sklearn_object is not None:
|
547
|
+
classes = self._sklearn_object.classes_
|
548
|
+
if isinstance(classes, numpy.ndarray):
|
549
|
+
output_cols = [f'{output_cols_prefix}{str(c)}' for c in classes.tolist()]
|
550
|
+
elif isinstance(classes, list) and len(classes) > 0 and isinstance(classes[0], numpy.ndarray):
|
551
|
+
# If the estimator is a multioutput estimator, classes_ will be a list of ndarrays.
|
552
|
+
output_cols = []
|
553
|
+
for i, cl in enumerate(classes):
|
554
|
+
# For binary classification, there is only one output column for each class
|
555
|
+
# ndarray as the two classes are complementary.
|
556
|
+
if len(cl) == 2:
|
557
|
+
output_cols.append(f'{output_cols_prefix}{i}_{cl[0]}')
|
558
|
+
else:
|
559
|
+
output_cols.extend([
|
560
|
+
f'{output_cols_prefix}{i}_{c}' for c in cl.tolist()
|
561
|
+
])
|
562
|
+
else:
|
563
|
+
output_cols = []
|
564
|
+
|
565
|
+
# Make sure column names are valid snowflake identifiers.
|
566
|
+
assert output_cols is not None # Make MyPy happy
|
567
|
+
rv = [identifier.rename_to_valid_snowflake_identifier(c) for c in output_cols]
|
568
|
+
|
569
|
+
return rv
|
570
|
+
|
571
|
+
def _align_expected_output_names(
|
572
|
+
self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str
|
573
|
+
) -> List[str]:
|
574
|
+
# in case the inferred output column names dimension is different
|
575
|
+
# we use one line of snowpark dataframe and put it into sklearn estimator using pandas
|
576
|
+
output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
|
577
|
+
output_df_columns = list(output_df_pd.columns)
|
578
|
+
output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
|
579
|
+
if self.sample_weight_col:
|
580
|
+
output_df_columns_set -= set(self.sample_weight_col)
|
581
|
+
# if the dimension of inferred output column names is correct; use it
|
582
|
+
if len(expected_output_cols_list) == len(output_df_columns_set):
|
583
|
+
return expected_output_cols_list
|
584
|
+
# otherwise, use the sklearn estimator's output
|
585
|
+
else:
|
586
|
+
return sorted(list(output_df_columns_set), key=lambda x: output_df_columns.index(x))
|
587
|
+
|
534
588
|
@available_if(original_estimator_has_callable("predict_proba")) # type: ignore[misc]
|
535
589
|
@telemetry.send_api_usage_telemetry(
|
536
590
|
project=_PROJECT,
|
@@ -561,24 +615,28 @@ class PolynomialCountSketch(BaseTransformer):
|
|
561
615
|
# are specific to the type of dataset used.
|
562
616
|
transform_kwargs: BatchInferenceKwargsTypedDict = dict()
|
563
617
|
|
618
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
619
|
+
|
564
620
|
if isinstance(dataset, DataFrame):
|
565
621
|
self._deps = self._batch_inference_validate_snowpark(
|
566
622
|
dataset=dataset,
|
567
623
|
inference_method=inference_method,
|
568
624
|
)
|
569
|
-
assert isinstance(
|
625
|
+
assert isinstance(
|
626
|
+
dataset._session, Session
|
627
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
570
628
|
transform_kwargs = dict(
|
571
629
|
session=dataset._session,
|
572
630
|
dependencies=self._deps,
|
573
|
-
drop_input_cols
|
631
|
+
drop_input_cols=self._drop_input_cols,
|
574
632
|
expected_output_cols_type="float",
|
575
633
|
)
|
634
|
+
expected_output_cols = self._align_expected_output_names(
|
635
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
636
|
+
)
|
576
637
|
|
577
638
|
elif isinstance(dataset, pd.DataFrame):
|
578
|
-
transform_kwargs = dict(
|
579
|
-
snowpark_input_cols = self._snowpark_cols,
|
580
|
-
drop_input_cols = self._drop_input_cols
|
581
|
-
)
|
639
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
582
640
|
|
583
641
|
transform_handlers = ModelTransformerBuilder.build(
|
584
642
|
dataset=dataset,
|
@@ -590,7 +648,7 @@ class PolynomialCountSketch(BaseTransformer):
|
|
590
648
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
591
649
|
inference_method=inference_method,
|
592
650
|
input_cols=self.input_cols,
|
593
|
-
expected_output_cols=
|
651
|
+
expected_output_cols=expected_output_cols,
|
594
652
|
**transform_kwargs
|
595
653
|
)
|
596
654
|
return output_df
|
@@ -620,7 +678,8 @@ class PolynomialCountSketch(BaseTransformer):
|
|
620
678
|
Output dataset with log probability of the sample for each class in the model.
|
621
679
|
"""
|
622
680
|
super()._check_dataset_type(dataset)
|
623
|
-
inference_method="predict_log_proba"
|
681
|
+
inference_method = "predict_log_proba"
|
682
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
624
683
|
|
625
684
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
626
685
|
# are specific to the type of dataset used.
|
@@ -631,18 +690,20 @@ class PolynomialCountSketch(BaseTransformer):
|
|
631
690
|
dataset=dataset,
|
632
691
|
inference_method=inference_method,
|
633
692
|
)
|
634
|
-
assert isinstance(
|
693
|
+
assert isinstance(
|
694
|
+
dataset._session, Session
|
695
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
635
696
|
transform_kwargs = dict(
|
636
697
|
session=dataset._session,
|
637
698
|
dependencies=self._deps,
|
638
|
-
drop_input_cols
|
699
|
+
drop_input_cols=self._drop_input_cols,
|
639
700
|
expected_output_cols_type="float",
|
640
701
|
)
|
702
|
+
expected_output_cols = self._align_expected_output_names(
|
703
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
704
|
+
)
|
641
705
|
elif isinstance(dataset, pd.DataFrame):
|
642
|
-
transform_kwargs = dict(
|
643
|
-
snowpark_input_cols = self._snowpark_cols,
|
644
|
-
drop_input_cols = self._drop_input_cols
|
645
|
-
)
|
706
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
646
707
|
|
647
708
|
transform_handlers = ModelTransformerBuilder.build(
|
648
709
|
dataset=dataset,
|
@@ -655,7 +716,7 @@ class PolynomialCountSketch(BaseTransformer):
|
|
655
716
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
656
717
|
inference_method=inference_method,
|
657
718
|
input_cols=self.input_cols,
|
658
|
-
expected_output_cols=
|
719
|
+
expected_output_cols=expected_output_cols,
|
659
720
|
**transform_kwargs
|
660
721
|
)
|
661
722
|
return output_df
|
@@ -681,30 +742,34 @@ class PolynomialCountSketch(BaseTransformer):
|
|
681
742
|
Output dataset with results of the decision function for the samples in input dataset.
|
682
743
|
"""
|
683
744
|
super()._check_dataset_type(dataset)
|
684
|
-
inference_method="decision_function"
|
745
|
+
inference_method = "decision_function"
|
685
746
|
|
686
747
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
687
748
|
# are specific to the type of dataset used.
|
688
749
|
transform_kwargs: BatchInferenceKwargsTypedDict = dict()
|
689
750
|
|
751
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
752
|
+
|
690
753
|
if isinstance(dataset, DataFrame):
|
691
754
|
self._deps = self._batch_inference_validate_snowpark(
|
692
755
|
dataset=dataset,
|
693
756
|
inference_method=inference_method,
|
694
757
|
)
|
695
|
-
assert isinstance(
|
758
|
+
assert isinstance(
|
759
|
+
dataset._session, Session
|
760
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
696
761
|
transform_kwargs = dict(
|
697
762
|
session=dataset._session,
|
698
763
|
dependencies=self._deps,
|
699
|
-
drop_input_cols
|
764
|
+
drop_input_cols=self._drop_input_cols,
|
700
765
|
expected_output_cols_type="float",
|
701
766
|
)
|
767
|
+
expected_output_cols = self._align_expected_output_names(
|
768
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
769
|
+
)
|
702
770
|
|
703
771
|
elif isinstance(dataset, pd.DataFrame):
|
704
|
-
transform_kwargs = dict(
|
705
|
-
snowpark_input_cols = self._snowpark_cols,
|
706
|
-
drop_input_cols = self._drop_input_cols
|
707
|
-
)
|
772
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
708
773
|
|
709
774
|
transform_handlers = ModelTransformerBuilder.build(
|
710
775
|
dataset=dataset,
|
@@ -717,7 +782,7 @@ class PolynomialCountSketch(BaseTransformer):
|
|
717
782
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
718
783
|
inference_method=inference_method,
|
719
784
|
input_cols=self.input_cols,
|
720
|
-
expected_output_cols=
|
785
|
+
expected_output_cols=expected_output_cols,
|
721
786
|
**transform_kwargs
|
722
787
|
)
|
723
788
|
return output_df
|
@@ -746,12 +811,14 @@ class PolynomialCountSketch(BaseTransformer):
|
|
746
811
|
Output dataset with probability of the sample for each class in the model.
|
747
812
|
"""
|
748
813
|
super()._check_dataset_type(dataset)
|
749
|
-
inference_method="score_samples"
|
814
|
+
inference_method = "score_samples"
|
750
815
|
|
751
816
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
752
817
|
# are specific to the type of dataset used.
|
753
818
|
transform_kwargs: BatchInferenceKwargsTypedDict = dict()
|
754
819
|
|
820
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
821
|
+
|
755
822
|
if isinstance(dataset, DataFrame):
|
756
823
|
self._deps = self._batch_inference_validate_snowpark(
|
757
824
|
dataset=dataset,
|
@@ -764,6 +831,9 @@ class PolynomialCountSketch(BaseTransformer):
|
|
764
831
|
drop_input_cols = self._drop_input_cols,
|
765
832
|
expected_output_cols_type="float",
|
766
833
|
)
|
834
|
+
expected_output_cols = self._align_expected_output_names(
|
835
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
836
|
+
)
|
767
837
|
|
768
838
|
elif isinstance(dataset, pd.DataFrame):
|
769
839
|
transform_kwargs = dict(
|
@@ -782,7 +852,7 @@ class PolynomialCountSketch(BaseTransformer):
|
|
782
852
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
783
853
|
inference_method=inference_method,
|
784
854
|
input_cols=self.input_cols,
|
785
|
-
expected_output_cols=
|
855
|
+
expected_output_cols=expected_output_cols,
|
786
856
|
**transform_kwargs
|
787
857
|
)
|
788
858
|
return output_df
|
@@ -927,50 +997,84 @@ class PolynomialCountSketch(BaseTransformer):
|
|
927
997
|
)
|
928
998
|
return output_df
|
929
999
|
|
1000
|
+
|
1001
|
+
|
1002
|
+
def to_sklearn(self) -> Any:
|
1003
|
+
"""Get sklearn.kernel_approximation.PolynomialCountSketch object.
|
1004
|
+
"""
|
1005
|
+
if self._sklearn_object is None:
|
1006
|
+
self._sklearn_object = self._create_sklearn_object()
|
1007
|
+
return self._sklearn_object
|
1008
|
+
|
1009
|
+
def to_xgboost(self) -> Any:
|
1010
|
+
raise exceptions.SnowflakeMLException(
|
1011
|
+
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1012
|
+
original_exception=AttributeError(
|
1013
|
+
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1014
|
+
"to_xgboost()",
|
1015
|
+
"to_sklearn()"
|
1016
|
+
)
|
1017
|
+
),
|
1018
|
+
)
|
1019
|
+
|
1020
|
+
def to_lightgbm(self) -> Any:
|
1021
|
+
raise exceptions.SnowflakeMLException(
|
1022
|
+
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1023
|
+
original_exception=AttributeError(
|
1024
|
+
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1025
|
+
"to_lightgbm()",
|
1026
|
+
"to_sklearn()"
|
1027
|
+
)
|
1028
|
+
),
|
1029
|
+
)
|
930
1030
|
|
931
|
-
def
|
1031
|
+
def _get_dependencies(self) -> List[str]:
|
1032
|
+
return self._deps
|
1033
|
+
|
1034
|
+
|
1035
|
+
def _generate_model_signatures(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
|
932
1036
|
self._model_signature_dict = dict()
|
933
1037
|
|
934
1038
|
PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
|
935
1039
|
|
936
|
-
inputs = list(_infer_signature(dataset[self.input_cols], "input"))
|
1040
|
+
inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
|
937
1041
|
outputs: List[BaseFeatureSpec] = []
|
938
1042
|
if hasattr(self, "predict"):
|
939
1043
|
# keep mypy happy
|
940
|
-
assert self._sklearn_object is not None and hasattr(self._sklearn_object, "_estimator_type")
|
1044
|
+
assert self._sklearn_object is not None and hasattr(self._sklearn_object, "_estimator_type")
|
941
1045
|
# For classifier, the type of predict is the same as the type of label
|
942
|
-
if self._sklearn_object._estimator_type ==
|
943
|
-
|
1046
|
+
if self._sklearn_object._estimator_type == "classifier":
|
1047
|
+
# label columns is the desired type for output
|
944
1048
|
outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
|
945
1049
|
# rename the output columns
|
946
1050
|
outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
|
947
|
-
self._model_signature_dict["predict"] = ModelSignature(
|
948
|
-
|
949
|
-
|
1051
|
+
self._model_signature_dict["predict"] = ModelSignature(
|
1052
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1053
|
+
)
|
950
1054
|
# For mixture models that use the density mixin, `predict` returns the argmax of the log prob.
|
951
1055
|
# For outlier models, returns -1 for outliers and 1 for inliers.
|
952
|
-
# Clusterer returns int64 cluster labels.
|
1056
|
+
# Clusterer returns int64 cluster labels.
|
953
1057
|
elif self._sklearn_object._estimator_type in ["DensityEstimator", "clusterer", "outlier_detector"]:
|
954
1058
|
outputs = [FeatureSpec(dtype=DataType.INT64, name=c) for c in self.output_cols]
|
955
|
-
self._model_signature_dict["predict"] = ModelSignature(
|
956
|
-
|
957
|
-
|
958
|
-
|
1059
|
+
self._model_signature_dict["predict"] = ModelSignature(
|
1060
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1061
|
+
)
|
1062
|
+
|
959
1063
|
# For regressor, the type of predict is float64
|
960
|
-
elif self._sklearn_object._estimator_type ==
|
1064
|
+
elif self._sklearn_object._estimator_type == "regressor":
|
961
1065
|
outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
|
962
|
-
self._model_signature_dict["predict"] = ModelSignature(
|
963
|
-
|
964
|
-
|
965
|
-
|
1066
|
+
self._model_signature_dict["predict"] = ModelSignature(
|
1067
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1068
|
+
)
|
1069
|
+
|
966
1070
|
for prob_func in PROB_FUNCTIONS:
|
967
1071
|
if hasattr(self, prob_func):
|
968
1072
|
output_cols_prefix: str = f"{prob_func}_"
|
969
1073
|
output_column_names = self._get_output_column_names(output_cols_prefix)
|
970
1074
|
outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
|
971
|
-
self._model_signature_dict[prob_func] = ModelSignature(
|
972
|
-
|
973
|
-
|
1075
|
+
self._model_signature_dict[prob_func] = ModelSignature(
|
1076
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1077
|
+
)
|
974
1078
|
|
975
1079
|
# Output signature names may still need to be renamed, since they were not created with `_infer_signature`.
|
976
1080
|
items = list(self._model_signature_dict.items())
|
@@ -983,10 +1087,10 @@ class PolynomialCountSketch(BaseTransformer):
|
|
983
1087
|
"""Returns model signature of current class.
|
984
1088
|
|
985
1089
|
Raises:
|
986
|
-
|
1090
|
+
SnowflakeMLException: If estimator is not fitted, then model signature cannot be inferred
|
987
1091
|
|
988
1092
|
Returns:
|
989
|
-
Dict
|
1093
|
+
Dict with each method and its input output signature
|
990
1094
|
"""
|
991
1095
|
if self._model_signature_dict is None:
|
992
1096
|
raise exceptions.SnowflakeMLException(
|
@@ -994,35 +1098,3 @@ class PolynomialCountSketch(BaseTransformer):
|
|
994
1098
|
original_exception=RuntimeError("Estimator not fitted before accessing property model_signatures!"),
|
995
1099
|
)
|
996
1100
|
return self._model_signature_dict
|
997
|
-
|
998
|
-
def to_sklearn(self) -> Any:
|
999
|
-
"""Get sklearn.kernel_approximation.PolynomialCountSketch object.
|
1000
|
-
"""
|
1001
|
-
if self._sklearn_object is None:
|
1002
|
-
self._sklearn_object = self._create_sklearn_object()
|
1003
|
-
return self._sklearn_object
|
1004
|
-
|
1005
|
-
def to_xgboost(self) -> Any:
|
1006
|
-
raise exceptions.SnowflakeMLException(
|
1007
|
-
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1008
|
-
original_exception=AttributeError(
|
1009
|
-
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1010
|
-
"to_xgboost()",
|
1011
|
-
"to_sklearn()"
|
1012
|
-
)
|
1013
|
-
),
|
1014
|
-
)
|
1015
|
-
|
1016
|
-
def to_lightgbm(self) -> Any:
|
1017
|
-
raise exceptions.SnowflakeMLException(
|
1018
|
-
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1019
|
-
original_exception=AttributeError(
|
1020
|
-
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1021
|
-
"to_lightgbm()",
|
1022
|
-
"to_sklearn()"
|
1023
|
-
)
|
1024
|
-
),
|
1025
|
-
)
|
1026
|
-
|
1027
|
-
def _get_dependencies(self) -> List[str]:
|
1028
|
-
return self._deps
|