snowflake-ml-python 1.4.0__py3-none-any.whl → 1.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- snowflake/ml/_internal/env_utils.py +11 -1
- snowflake/ml/_internal/utils/identifier.py +3 -1
- snowflake/ml/_internal/utils/sql_identifier.py +2 -6
- snowflake/ml/feature_store/feature_store.py +151 -78
- snowflake/ml/feature_store/feature_view.py +12 -24
- snowflake/ml/fileset/sfcfs.py +56 -50
- snowflake/ml/fileset/stage_fs.py +48 -13
- snowflake/ml/model/_client/model/model_version_impl.py +2 -50
- snowflake/ml/model/_client/ops/model_ops.py +78 -29
- snowflake/ml/model/_client/sql/model.py +23 -2
- snowflake/ml/model/_client/sql/model_version.py +22 -1
- snowflake/ml/model/_model_composer/model_manifest/model_manifest.py +19 -54
- snowflake/ml/model/_model_composer/model_manifest/model_manifest_schema.py +8 -1
- snowflake/ml/model/_model_composer/model_method/model_method.py +6 -10
- snowflake/ml/model/_packager/model_handlers/catboost.py +206 -0
- snowflake/ml/model/_packager/model_handlers/lightgbm.py +218 -0
- snowflake/ml/model/_packager/model_handlers/sklearn.py +3 -0
- snowflake/ml/model/_packager/model_meta/_core_requirements.py +1 -1
- snowflake/ml/model/_packager/model_meta/model_meta.py +36 -6
- snowflake/ml/model/_packager/model_meta/model_meta_schema.py +20 -1
- snowflake/ml/model/_packager/model_meta_migrator/migrator_plans.py +3 -1
- snowflake/ml/model/_packager/model_packager.py +2 -2
- snowflake/ml/model/{_model_composer/model_runtime/_runtime_requirements.py → _packager/model_runtime/_snowml_inference_alternative_requirements.py} +1 -1
- snowflake/ml/model/_packager/model_runtime/model_runtime.py +137 -0
- snowflake/ml/model/type_hints.py +21 -2
- snowflake/ml/modeling/_internal/estimator_utils.py +16 -11
- snowflake/ml/modeling/_internal/local_implementations/pandas_handlers.py +4 -1
- snowflake/ml/modeling/_internal/snowpark_implementations/distributed_hpo_trainer.py +13 -14
- snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_handlers.py +5 -5
- snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +195 -123
- snowflake/ml/modeling/cluster/affinity_propagation.py +195 -123
- snowflake/ml/modeling/cluster/agglomerative_clustering.py +195 -123
- snowflake/ml/modeling/cluster/birch.py +195 -123
- snowflake/ml/modeling/cluster/bisecting_k_means.py +195 -123
- snowflake/ml/modeling/cluster/dbscan.py +195 -123
- snowflake/ml/modeling/cluster/feature_agglomeration.py +195 -123
- snowflake/ml/modeling/cluster/k_means.py +195 -123
- snowflake/ml/modeling/cluster/mean_shift.py +195 -123
- snowflake/ml/modeling/cluster/mini_batch_k_means.py +195 -123
- snowflake/ml/modeling/cluster/optics.py +195 -123
- snowflake/ml/modeling/cluster/spectral_biclustering.py +195 -123
- snowflake/ml/modeling/cluster/spectral_clustering.py +195 -123
- snowflake/ml/modeling/cluster/spectral_coclustering.py +195 -123
- snowflake/ml/modeling/compose/column_transformer.py +195 -123
- snowflake/ml/modeling/compose/transformed_target_regressor.py +195 -123
- snowflake/ml/modeling/covariance/elliptic_envelope.py +195 -123
- snowflake/ml/modeling/covariance/empirical_covariance.py +195 -123
- snowflake/ml/modeling/covariance/graphical_lasso.py +195 -123
- snowflake/ml/modeling/covariance/graphical_lasso_cv.py +195 -123
- snowflake/ml/modeling/covariance/ledoit_wolf.py +195 -123
- snowflake/ml/modeling/covariance/min_cov_det.py +195 -123
- snowflake/ml/modeling/covariance/oas.py +195 -123
- snowflake/ml/modeling/covariance/shrunk_covariance.py +195 -123
- snowflake/ml/modeling/decomposition/dictionary_learning.py +195 -123
- snowflake/ml/modeling/decomposition/factor_analysis.py +195 -123
- snowflake/ml/modeling/decomposition/fast_ica.py +195 -123
- snowflake/ml/modeling/decomposition/incremental_pca.py +195 -123
- snowflake/ml/modeling/decomposition/kernel_pca.py +195 -123
- snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +195 -123
- snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +195 -123
- snowflake/ml/modeling/decomposition/pca.py +195 -123
- snowflake/ml/modeling/decomposition/sparse_pca.py +195 -123
- snowflake/ml/modeling/decomposition/truncated_svd.py +195 -123
- snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +195 -123
- snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +195 -123
- snowflake/ml/modeling/ensemble/ada_boost_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/ada_boost_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/bagging_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/bagging_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/extra_trees_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/extra_trees_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/isolation_forest.py +195 -123
- snowflake/ml/modeling/ensemble/random_forest_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/random_forest_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/stacking_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/voting_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/voting_regressor.py +195 -123
- snowflake/ml/modeling/feature_selection/generic_univariate_select.py +195 -123
- snowflake/ml/modeling/feature_selection/select_fdr.py +195 -123
- snowflake/ml/modeling/feature_selection/select_fpr.py +195 -123
- snowflake/ml/modeling/feature_selection/select_fwe.py +195 -123
- snowflake/ml/modeling/feature_selection/select_k_best.py +195 -123
- snowflake/ml/modeling/feature_selection/select_percentile.py +195 -123
- snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +195 -123
- snowflake/ml/modeling/feature_selection/variance_threshold.py +195 -123
- snowflake/ml/modeling/framework/_utils.py +8 -1
- snowflake/ml/modeling/framework/base.py +9 -1
- snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +195 -123
- snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +195 -123
- snowflake/ml/modeling/impute/iterative_imputer.py +195 -123
- snowflake/ml/modeling/impute/knn_imputer.py +195 -123
- snowflake/ml/modeling/impute/missing_indicator.py +195 -123
- snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +195 -123
- snowflake/ml/modeling/kernel_approximation/nystroem.py +195 -123
- snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +195 -123
- snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +195 -123
- snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +195 -123
- snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +195 -123
- snowflake/ml/modeling/lightgbm/lgbm_classifier.py +195 -123
- snowflake/ml/modeling/lightgbm/lgbm_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/ard_regression.py +195 -123
- snowflake/ml/modeling/linear_model/bayesian_ridge.py +195 -123
- snowflake/ml/modeling/linear_model/elastic_net.py +195 -123
- snowflake/ml/modeling/linear_model/elastic_net_cv.py +195 -123
- snowflake/ml/modeling/linear_model/gamma_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/huber_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/lars.py +195 -123
- snowflake/ml/modeling/linear_model/lars_cv.py +195 -123
- snowflake/ml/modeling/linear_model/lasso.py +195 -123
- snowflake/ml/modeling/linear_model/lasso_cv.py +195 -123
- snowflake/ml/modeling/linear_model/lasso_lars.py +195 -123
- snowflake/ml/modeling/linear_model/lasso_lars_cv.py +195 -123
- snowflake/ml/modeling/linear_model/lasso_lars_ic.py +195 -123
- snowflake/ml/modeling/linear_model/linear_regression.py +195 -123
- snowflake/ml/modeling/linear_model/logistic_regression.py +195 -123
- snowflake/ml/modeling/linear_model/logistic_regression_cv.py +195 -123
- snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +195 -123
- snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +195 -123
- snowflake/ml/modeling/linear_model/multi_task_lasso.py +195 -123
- snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +195 -123
- snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +195 -123
- snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +195 -123
- snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/perceptron.py +195 -123
- snowflake/ml/modeling/linear_model/poisson_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/ransac_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/ridge.py +195 -123
- snowflake/ml/modeling/linear_model/ridge_classifier.py +195 -123
- snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +195 -123
- snowflake/ml/modeling/linear_model/ridge_cv.py +195 -123
- snowflake/ml/modeling/linear_model/sgd_classifier.py +195 -123
- snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +195 -123
- snowflake/ml/modeling/linear_model/sgd_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/theil_sen_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/tweedie_regressor.py +195 -123
- snowflake/ml/modeling/manifold/isomap.py +195 -123
- snowflake/ml/modeling/manifold/mds.py +195 -123
- snowflake/ml/modeling/manifold/spectral_embedding.py +195 -123
- snowflake/ml/modeling/manifold/tsne.py +195 -123
- snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +195 -123
- snowflake/ml/modeling/mixture/gaussian_mixture.py +195 -123
- snowflake/ml/modeling/model_selection/grid_search_cv.py +42 -18
- snowflake/ml/modeling/model_selection/randomized_search_cv.py +42 -18
- snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +195 -123
- snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +195 -123
- snowflake/ml/modeling/multiclass/output_code_classifier.py +195 -123
- snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +195 -123
- snowflake/ml/modeling/naive_bayes/categorical_nb.py +195 -123
- snowflake/ml/modeling/naive_bayes/complement_nb.py +195 -123
- snowflake/ml/modeling/naive_bayes/gaussian_nb.py +195 -123
- snowflake/ml/modeling/naive_bayes/multinomial_nb.py +195 -123
- snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +195 -123
- snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +195 -123
- snowflake/ml/modeling/neighbors/kernel_density.py +195 -123
- snowflake/ml/modeling/neighbors/local_outlier_factor.py +195 -123
- snowflake/ml/modeling/neighbors/nearest_centroid.py +195 -123
- snowflake/ml/modeling/neighbors/nearest_neighbors.py +195 -123
- snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +195 -123
- snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +195 -123
- snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +195 -123
- snowflake/ml/modeling/neural_network/bernoulli_rbm.py +195 -123
- snowflake/ml/modeling/neural_network/mlp_classifier.py +195 -123
- snowflake/ml/modeling/neural_network/mlp_regressor.py +195 -123
- snowflake/ml/modeling/pipeline/pipeline.py +4 -4
- snowflake/ml/modeling/preprocessing/binarizer.py +1 -5
- snowflake/ml/modeling/preprocessing/k_bins_discretizer.py +1 -5
- snowflake/ml/modeling/preprocessing/label_encoder.py +1 -5
- snowflake/ml/modeling/preprocessing/max_abs_scaler.py +1 -5
- snowflake/ml/modeling/preprocessing/min_max_scaler.py +10 -12
- snowflake/ml/modeling/preprocessing/normalizer.py +1 -5
- snowflake/ml/modeling/preprocessing/one_hot_encoder.py +1 -5
- snowflake/ml/modeling/preprocessing/ordinal_encoder.py +1 -5
- snowflake/ml/modeling/preprocessing/polynomial_features.py +195 -123
- snowflake/ml/modeling/preprocessing/robust_scaler.py +1 -5
- snowflake/ml/modeling/preprocessing/standard_scaler.py +11 -11
- snowflake/ml/modeling/semi_supervised/label_propagation.py +195 -123
- snowflake/ml/modeling/semi_supervised/label_spreading.py +195 -123
- snowflake/ml/modeling/svm/linear_svc.py +195 -123
- snowflake/ml/modeling/svm/linear_svr.py +195 -123
- snowflake/ml/modeling/svm/nu_svc.py +195 -123
- snowflake/ml/modeling/svm/nu_svr.py +195 -123
- snowflake/ml/modeling/svm/svc.py +195 -123
- snowflake/ml/modeling/svm/svr.py +195 -123
- snowflake/ml/modeling/tree/decision_tree_classifier.py +195 -123
- snowflake/ml/modeling/tree/decision_tree_regressor.py +195 -123
- snowflake/ml/modeling/tree/extra_tree_classifier.py +195 -123
- snowflake/ml/modeling/tree/extra_tree_regressor.py +195 -123
- snowflake/ml/modeling/xgboost/xgb_classifier.py +195 -123
- snowflake/ml/modeling/xgboost/xgb_regressor.py +195 -123
- snowflake/ml/modeling/xgboost/xgbrf_classifier.py +195 -123
- snowflake/ml/modeling/xgboost/xgbrf_regressor.py +195 -123
- snowflake/ml/registry/registry.py +1 -1
- snowflake/ml/version.py +1 -1
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/METADATA +68 -57
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/RECORD +202 -200
- snowflake/ml/model/_model_composer/model_runtime/model_runtime.py +0 -97
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/LICENSE.txt +0 -0
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/WHEEL +0 -0
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/top_level.txt +0 -0
@@ -34,6 +34,15 @@ from snowflake.ml.modeling._internal.transformer_protocols import (
|
|
34
34
|
BatchInferenceKwargsTypedDict,
|
35
35
|
ScoreKwargsTypedDict
|
36
36
|
)
|
37
|
+
from snowflake.ml.model._signatures import utils as model_signature_utils
|
38
|
+
from snowflake.ml.model.model_signature import (
|
39
|
+
BaseFeatureSpec,
|
40
|
+
DataType,
|
41
|
+
FeatureSpec,
|
42
|
+
ModelSignature,
|
43
|
+
_infer_signature,
|
44
|
+
_rename_signature_with_snowflake_identifiers,
|
45
|
+
)
|
37
46
|
|
38
47
|
from snowflake.ml.modeling._internal.model_transformer_builder import ModelTransformerBuilder
|
39
48
|
|
@@ -44,16 +53,6 @@ from snowflake.ml.modeling._internal.estimator_utils import (
|
|
44
53
|
validate_sklearn_args,
|
45
54
|
)
|
46
55
|
|
47
|
-
from snowflake.ml.model.model_signature import (
|
48
|
-
DataType,
|
49
|
-
FeatureSpec,
|
50
|
-
ModelSignature,
|
51
|
-
_infer_signature,
|
52
|
-
_rename_signature_with_snowflake_identifiers,
|
53
|
-
BaseFeatureSpec,
|
54
|
-
)
|
55
|
-
from snowflake.ml.model._signatures import utils as model_signature_utils
|
56
|
-
|
57
56
|
_PROJECT = "ModelDevelopment"
|
58
57
|
# Derive subproject from module name by removing "sklearn"
|
59
58
|
# and converting module name from underscore to CamelCase
|
@@ -205,12 +204,7 @@ class SelectPercentile(BaseTransformer):
|
|
205
204
|
)
|
206
205
|
return selected_cols
|
207
206
|
|
208
|
-
|
209
|
-
project=_PROJECT,
|
210
|
-
subproject=_SUBPROJECT,
|
211
|
-
custom_tags=dict([("autogen", True)]),
|
212
|
-
)
|
213
|
-
def fit(self, dataset: Union[DataFrame, pd.DataFrame]) -> "SelectPercentile":
|
207
|
+
def _fit(self, dataset: Union[DataFrame, pd.DataFrame]) -> "SelectPercentile":
|
214
208
|
"""Run score function on (X, y) and get the appropriate features
|
215
209
|
For more details on this function, see [sklearn.feature_selection.SelectPercentile.fit]
|
216
210
|
(https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectPercentile.html#sklearn.feature_selection.SelectPercentile.fit)
|
@@ -237,12 +231,14 @@ class SelectPercentile(BaseTransformer):
|
|
237
231
|
|
238
232
|
self._snowpark_cols = dataset.select(self.input_cols).columns
|
239
233
|
|
240
|
-
|
234
|
+
# If we are already in a stored procedure, no need to kick off another one.
|
241
235
|
if SNOWML_SPROC_ENV in os.environ:
|
242
236
|
statement_params = telemetry.get_function_usage_statement_params(
|
243
237
|
project=_PROJECT,
|
244
238
|
subproject=_SUBPROJECT,
|
245
|
-
function_name=telemetry.get_statement_params_full_func_name(
|
239
|
+
function_name=telemetry.get_statement_params_full_func_name(
|
240
|
+
inspect.currentframe(), SelectPercentile.__class__.__name__
|
241
|
+
),
|
246
242
|
api_calls=[Session.call],
|
247
243
|
custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
|
248
244
|
)
|
@@ -263,7 +259,7 @@ class SelectPercentile(BaseTransformer):
|
|
263
259
|
)
|
264
260
|
self._sklearn_object = model_trainer.train()
|
265
261
|
self._is_fitted = True
|
266
|
-
self.
|
262
|
+
self._generate_model_signatures(dataset)
|
267
263
|
return self
|
268
264
|
|
269
265
|
def _batch_inference_validate_snowpark(
|
@@ -337,7 +333,9 @@ class SelectPercentile(BaseTransformer):
|
|
337
333
|
# when it is classifier, infer the datatype from label columns
|
338
334
|
if expected_type_inferred == "" and 'predict' in self.model_signatures:
|
339
335
|
# Batch inference takes a single expected output column type. Use the first columns type for now.
|
340
|
-
label_cols_signatures = [
|
336
|
+
label_cols_signatures = [
|
337
|
+
row for row in self.model_signatures['predict'].outputs if row.name in self.output_cols
|
338
|
+
]
|
341
339
|
if len(label_cols_signatures) == 0:
|
342
340
|
error_str = f"Output columns {self.output_cols} do not match model signatures {self.model_signatures['predict'].outputs}."
|
343
341
|
raise exceptions.SnowflakeMLException(
|
@@ -345,25 +343,22 @@ class SelectPercentile(BaseTransformer):
|
|
345
343
|
original_exception=ValueError(error_str),
|
346
344
|
)
|
347
345
|
|
348
|
-
expected_type_inferred = convert_sp_to_sf_type(
|
349
|
-
label_cols_signatures[0].as_snowpark_type()
|
350
|
-
)
|
346
|
+
expected_type_inferred = convert_sp_to_sf_type(label_cols_signatures[0].as_snowpark_type())
|
351
347
|
|
352
348
|
self._deps = self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
|
353
|
-
assert isinstance(
|
349
|
+
assert isinstance(
|
350
|
+
dataset._session, Session
|
351
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
354
352
|
|
355
353
|
transform_kwargs = dict(
|
356
|
-
session
|
357
|
-
dependencies
|
358
|
-
drop_input_cols
|
359
|
-
expected_output_cols_type
|
354
|
+
session=dataset._session,
|
355
|
+
dependencies=self._deps,
|
356
|
+
drop_input_cols=self._drop_input_cols,
|
357
|
+
expected_output_cols_type=expected_type_inferred,
|
360
358
|
)
|
361
359
|
|
362
360
|
elif isinstance(dataset, pd.DataFrame):
|
363
|
-
transform_kwargs = dict(
|
364
|
-
snowpark_input_cols = self._snowpark_cols,
|
365
|
-
drop_input_cols = self._drop_input_cols
|
366
|
-
)
|
361
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
367
362
|
|
368
363
|
transform_handlers = ModelTransformerBuilder.build(
|
369
364
|
dataset=dataset,
|
@@ -405,7 +400,7 @@ class SelectPercentile(BaseTransformer):
|
|
405
400
|
Transformed dataset.
|
406
401
|
"""
|
407
402
|
super()._check_dataset_type(dataset)
|
408
|
-
inference_method="transform"
|
403
|
+
inference_method = "transform"
|
409
404
|
|
410
405
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
411
406
|
# are specific to the type of dataset used.
|
@@ -442,17 +437,14 @@ class SelectPercentile(BaseTransformer):
|
|
442
437
|
assert isinstance(dataset._session, Session) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
443
438
|
|
444
439
|
transform_kwargs = dict(
|
445
|
-
session
|
446
|
-
dependencies
|
447
|
-
drop_input_cols
|
448
|
-
expected_output_cols_type
|
440
|
+
session=dataset._session,
|
441
|
+
dependencies=self._deps,
|
442
|
+
drop_input_cols=self._drop_input_cols,
|
443
|
+
expected_output_cols_type=expected_dtype,
|
449
444
|
)
|
450
445
|
|
451
446
|
elif isinstance(dataset, pd.DataFrame):
|
452
|
-
transform_kwargs = dict(
|
453
|
-
snowpark_input_cols = self._snowpark_cols,
|
454
|
-
drop_input_cols = self._drop_input_cols
|
455
|
-
)
|
447
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
456
448
|
|
457
449
|
transform_handlers = ModelTransformerBuilder.build(
|
458
450
|
dataset=dataset,
|
@@ -471,7 +463,11 @@ class SelectPercentile(BaseTransformer):
|
|
471
463
|
return output_df
|
472
464
|
|
473
465
|
@available_if(original_estimator_has_callable("fit_predict")) # type: ignore[misc]
|
474
|
-
def fit_predict(
|
466
|
+
def fit_predict(
|
467
|
+
self,
|
468
|
+
dataset: Union[DataFrame, pd.DataFrame],
|
469
|
+
output_cols_prefix: str = "fit_predict_",
|
470
|
+
) -> Union[DataFrame, pd.DataFrame]:
|
475
471
|
""" Method not supported for this class.
|
476
472
|
|
477
473
|
|
@@ -496,7 +492,9 @@ class SelectPercentile(BaseTransformer):
|
|
496
492
|
)
|
497
493
|
output_result, fitted_estimator = model_trainer.train_fit_predict(
|
498
494
|
drop_input_cols=self._drop_input_cols,
|
499
|
-
expected_output_cols_list=
|
495
|
+
expected_output_cols_list=(
|
496
|
+
self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
|
497
|
+
),
|
500
498
|
)
|
501
499
|
self._sklearn_object = fitted_estimator
|
502
500
|
self._is_fitted = True
|
@@ -513,6 +511,62 @@ class SelectPercentile(BaseTransformer):
|
|
513
511
|
assert self._sklearn_object is not None
|
514
512
|
return self._sklearn_object.embedding_
|
515
513
|
|
514
|
+
|
515
|
+
def _get_output_column_names(self, output_cols_prefix: str, output_cols: Optional[List[str]] = None) -> List[str]:
|
516
|
+
""" Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
|
517
|
+
Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
|
518
|
+
"""
|
519
|
+
output_cols_prefix = identifier.resolve_identifier(output_cols_prefix)
|
520
|
+
# The following condition is introduced for kneighbors methods, and not used in other methods
|
521
|
+
if output_cols:
|
522
|
+
output_cols = [
|
523
|
+
identifier.concat_names([output_cols_prefix, identifier.resolve_identifier(c)])
|
524
|
+
for c in output_cols
|
525
|
+
]
|
526
|
+
elif getattr(self._sklearn_object, "classes_", None) is None:
|
527
|
+
output_cols = [output_cols_prefix]
|
528
|
+
elif self._sklearn_object is not None:
|
529
|
+
classes = self._sklearn_object.classes_
|
530
|
+
if isinstance(classes, numpy.ndarray):
|
531
|
+
output_cols = [f'{output_cols_prefix}{str(c)}' for c in classes.tolist()]
|
532
|
+
elif isinstance(classes, list) and len(classes) > 0 and isinstance(classes[0], numpy.ndarray):
|
533
|
+
# If the estimator is a multioutput estimator, classes_ will be a list of ndarrays.
|
534
|
+
output_cols = []
|
535
|
+
for i, cl in enumerate(classes):
|
536
|
+
# For binary classification, there is only one output column for each class
|
537
|
+
# ndarray as the two classes are complementary.
|
538
|
+
if len(cl) == 2:
|
539
|
+
output_cols.append(f'{output_cols_prefix}{i}_{cl[0]}')
|
540
|
+
else:
|
541
|
+
output_cols.extend([
|
542
|
+
f'{output_cols_prefix}{i}_{c}' for c in cl.tolist()
|
543
|
+
])
|
544
|
+
else:
|
545
|
+
output_cols = []
|
546
|
+
|
547
|
+
# Make sure column names are valid snowflake identifiers.
|
548
|
+
assert output_cols is not None # Make MyPy happy
|
549
|
+
rv = [identifier.rename_to_valid_snowflake_identifier(c) for c in output_cols]
|
550
|
+
|
551
|
+
return rv
|
552
|
+
|
553
|
+
def _align_expected_output_names(
|
554
|
+
self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str
|
555
|
+
) -> List[str]:
|
556
|
+
# in case the inferred output column names dimension is different
|
557
|
+
# we use one line of snowpark dataframe and put it into sklearn estimator using pandas
|
558
|
+
output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
|
559
|
+
output_df_columns = list(output_df_pd.columns)
|
560
|
+
output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
|
561
|
+
if self.sample_weight_col:
|
562
|
+
output_df_columns_set -= set(self.sample_weight_col)
|
563
|
+
# if the dimension of inferred output column names is correct; use it
|
564
|
+
if len(expected_output_cols_list) == len(output_df_columns_set):
|
565
|
+
return expected_output_cols_list
|
566
|
+
# otherwise, use the sklearn estimator's output
|
567
|
+
else:
|
568
|
+
return sorted(list(output_df_columns_set), key=lambda x: output_df_columns.index(x))
|
569
|
+
|
516
570
|
@available_if(original_estimator_has_callable("predict_proba")) # type: ignore[misc]
|
517
571
|
@telemetry.send_api_usage_telemetry(
|
518
572
|
project=_PROJECT,
|
@@ -543,24 +597,28 @@ class SelectPercentile(BaseTransformer):
|
|
543
597
|
# are specific to the type of dataset used.
|
544
598
|
transform_kwargs: BatchInferenceKwargsTypedDict = dict()
|
545
599
|
|
600
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
601
|
+
|
546
602
|
if isinstance(dataset, DataFrame):
|
547
603
|
self._deps = self._batch_inference_validate_snowpark(
|
548
604
|
dataset=dataset,
|
549
605
|
inference_method=inference_method,
|
550
606
|
)
|
551
|
-
assert isinstance(
|
607
|
+
assert isinstance(
|
608
|
+
dataset._session, Session
|
609
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
552
610
|
transform_kwargs = dict(
|
553
611
|
session=dataset._session,
|
554
612
|
dependencies=self._deps,
|
555
|
-
drop_input_cols
|
613
|
+
drop_input_cols=self._drop_input_cols,
|
556
614
|
expected_output_cols_type="float",
|
557
615
|
)
|
616
|
+
expected_output_cols = self._align_expected_output_names(
|
617
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
618
|
+
)
|
558
619
|
|
559
620
|
elif isinstance(dataset, pd.DataFrame):
|
560
|
-
transform_kwargs = dict(
|
561
|
-
snowpark_input_cols = self._snowpark_cols,
|
562
|
-
drop_input_cols = self._drop_input_cols
|
563
|
-
)
|
621
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
564
622
|
|
565
623
|
transform_handlers = ModelTransformerBuilder.build(
|
566
624
|
dataset=dataset,
|
@@ -572,7 +630,7 @@ class SelectPercentile(BaseTransformer):
|
|
572
630
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
573
631
|
inference_method=inference_method,
|
574
632
|
input_cols=self.input_cols,
|
575
|
-
expected_output_cols=
|
633
|
+
expected_output_cols=expected_output_cols,
|
576
634
|
**transform_kwargs
|
577
635
|
)
|
578
636
|
return output_df
|
@@ -602,7 +660,8 @@ class SelectPercentile(BaseTransformer):
|
|
602
660
|
Output dataset with log probability of the sample for each class in the model.
|
603
661
|
"""
|
604
662
|
super()._check_dataset_type(dataset)
|
605
|
-
inference_method="predict_log_proba"
|
663
|
+
inference_method = "predict_log_proba"
|
664
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
606
665
|
|
607
666
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
608
667
|
# are specific to the type of dataset used.
|
@@ -613,18 +672,20 @@ class SelectPercentile(BaseTransformer):
|
|
613
672
|
dataset=dataset,
|
614
673
|
inference_method=inference_method,
|
615
674
|
)
|
616
|
-
assert isinstance(
|
675
|
+
assert isinstance(
|
676
|
+
dataset._session, Session
|
677
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
617
678
|
transform_kwargs = dict(
|
618
679
|
session=dataset._session,
|
619
680
|
dependencies=self._deps,
|
620
|
-
drop_input_cols
|
681
|
+
drop_input_cols=self._drop_input_cols,
|
621
682
|
expected_output_cols_type="float",
|
622
683
|
)
|
684
|
+
expected_output_cols = self._align_expected_output_names(
|
685
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
686
|
+
)
|
623
687
|
elif isinstance(dataset, pd.DataFrame):
|
624
|
-
transform_kwargs = dict(
|
625
|
-
snowpark_input_cols = self._snowpark_cols,
|
626
|
-
drop_input_cols = self._drop_input_cols
|
627
|
-
)
|
688
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
628
689
|
|
629
690
|
transform_handlers = ModelTransformerBuilder.build(
|
630
691
|
dataset=dataset,
|
@@ -637,7 +698,7 @@ class SelectPercentile(BaseTransformer):
|
|
637
698
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
638
699
|
inference_method=inference_method,
|
639
700
|
input_cols=self.input_cols,
|
640
|
-
expected_output_cols=
|
701
|
+
expected_output_cols=expected_output_cols,
|
641
702
|
**transform_kwargs
|
642
703
|
)
|
643
704
|
return output_df
|
@@ -663,30 +724,34 @@ class SelectPercentile(BaseTransformer):
|
|
663
724
|
Output dataset with results of the decision function for the samples in input dataset.
|
664
725
|
"""
|
665
726
|
super()._check_dataset_type(dataset)
|
666
|
-
inference_method="decision_function"
|
727
|
+
inference_method = "decision_function"
|
667
728
|
|
668
729
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
669
730
|
# are specific to the type of dataset used.
|
670
731
|
transform_kwargs: BatchInferenceKwargsTypedDict = dict()
|
671
732
|
|
733
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
734
|
+
|
672
735
|
if isinstance(dataset, DataFrame):
|
673
736
|
self._deps = self._batch_inference_validate_snowpark(
|
674
737
|
dataset=dataset,
|
675
738
|
inference_method=inference_method,
|
676
739
|
)
|
677
|
-
assert isinstance(
|
740
|
+
assert isinstance(
|
741
|
+
dataset._session, Session
|
742
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
678
743
|
transform_kwargs = dict(
|
679
744
|
session=dataset._session,
|
680
745
|
dependencies=self._deps,
|
681
|
-
drop_input_cols
|
746
|
+
drop_input_cols=self._drop_input_cols,
|
682
747
|
expected_output_cols_type="float",
|
683
748
|
)
|
749
|
+
expected_output_cols = self._align_expected_output_names(
|
750
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
751
|
+
)
|
684
752
|
|
685
753
|
elif isinstance(dataset, pd.DataFrame):
|
686
|
-
transform_kwargs = dict(
|
687
|
-
snowpark_input_cols = self._snowpark_cols,
|
688
|
-
drop_input_cols = self._drop_input_cols
|
689
|
-
)
|
754
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
690
755
|
|
691
756
|
transform_handlers = ModelTransformerBuilder.build(
|
692
757
|
dataset=dataset,
|
@@ -699,7 +764,7 @@ class SelectPercentile(BaseTransformer):
|
|
699
764
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
700
765
|
inference_method=inference_method,
|
701
766
|
input_cols=self.input_cols,
|
702
|
-
expected_output_cols=
|
767
|
+
expected_output_cols=expected_output_cols,
|
703
768
|
**transform_kwargs
|
704
769
|
)
|
705
770
|
return output_df
|
@@ -728,12 +793,14 @@ class SelectPercentile(BaseTransformer):
|
|
728
793
|
Output dataset with probability of the sample for each class in the model.
|
729
794
|
"""
|
730
795
|
super()._check_dataset_type(dataset)
|
731
|
-
inference_method="score_samples"
|
796
|
+
inference_method = "score_samples"
|
732
797
|
|
733
798
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
734
799
|
# are specific to the type of dataset used.
|
735
800
|
transform_kwargs: BatchInferenceKwargsTypedDict = dict()
|
736
801
|
|
802
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
803
|
+
|
737
804
|
if isinstance(dataset, DataFrame):
|
738
805
|
self._deps = self._batch_inference_validate_snowpark(
|
739
806
|
dataset=dataset,
|
@@ -746,6 +813,9 @@ class SelectPercentile(BaseTransformer):
|
|
746
813
|
drop_input_cols = self._drop_input_cols,
|
747
814
|
expected_output_cols_type="float",
|
748
815
|
)
|
816
|
+
expected_output_cols = self._align_expected_output_names(
|
817
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
818
|
+
)
|
749
819
|
|
750
820
|
elif isinstance(dataset, pd.DataFrame):
|
751
821
|
transform_kwargs = dict(
|
@@ -764,7 +834,7 @@ class SelectPercentile(BaseTransformer):
|
|
764
834
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
765
835
|
inference_method=inference_method,
|
766
836
|
input_cols=self.input_cols,
|
767
|
-
expected_output_cols=
|
837
|
+
expected_output_cols=expected_output_cols,
|
768
838
|
**transform_kwargs
|
769
839
|
)
|
770
840
|
return output_df
|
@@ -909,50 +979,84 @@ class SelectPercentile(BaseTransformer):
|
|
909
979
|
)
|
910
980
|
return output_df
|
911
981
|
|
982
|
+
|
983
|
+
|
984
|
+
def to_sklearn(self) -> Any:
|
985
|
+
"""Get sklearn.feature_selection.SelectPercentile object.
|
986
|
+
"""
|
987
|
+
if self._sklearn_object is None:
|
988
|
+
self._sklearn_object = self._create_sklearn_object()
|
989
|
+
return self._sklearn_object
|
990
|
+
|
991
|
+
def to_xgboost(self) -> Any:
|
992
|
+
raise exceptions.SnowflakeMLException(
|
993
|
+
error_code=error_codes.METHOD_NOT_ALLOWED,
|
994
|
+
original_exception=AttributeError(
|
995
|
+
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
996
|
+
"to_xgboost()",
|
997
|
+
"to_sklearn()"
|
998
|
+
)
|
999
|
+
),
|
1000
|
+
)
|
1001
|
+
|
1002
|
+
def to_lightgbm(self) -> Any:
|
1003
|
+
raise exceptions.SnowflakeMLException(
|
1004
|
+
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1005
|
+
original_exception=AttributeError(
|
1006
|
+
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1007
|
+
"to_lightgbm()",
|
1008
|
+
"to_sklearn()"
|
1009
|
+
)
|
1010
|
+
),
|
1011
|
+
)
|
912
1012
|
|
913
|
-
def
|
1013
|
+
def _get_dependencies(self) -> List[str]:
|
1014
|
+
return self._deps
|
1015
|
+
|
1016
|
+
|
1017
|
+
def _generate_model_signatures(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
|
914
1018
|
self._model_signature_dict = dict()
|
915
1019
|
|
916
1020
|
PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
|
917
1021
|
|
918
|
-
inputs = list(_infer_signature(dataset[self.input_cols], "input"))
|
1022
|
+
inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
|
919
1023
|
outputs: List[BaseFeatureSpec] = []
|
920
1024
|
if hasattr(self, "predict"):
|
921
1025
|
# keep mypy happy
|
922
|
-
assert self._sklearn_object is not None and hasattr(self._sklearn_object, "_estimator_type")
|
1026
|
+
assert self._sklearn_object is not None and hasattr(self._sklearn_object, "_estimator_type")
|
923
1027
|
# For classifier, the type of predict is the same as the type of label
|
924
|
-
if self._sklearn_object._estimator_type ==
|
925
|
-
|
1028
|
+
if self._sklearn_object._estimator_type == "classifier":
|
1029
|
+
# label columns is the desired type for output
|
926
1030
|
outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
|
927
1031
|
# rename the output columns
|
928
1032
|
outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
|
929
|
-
self._model_signature_dict["predict"] = ModelSignature(
|
930
|
-
|
931
|
-
|
1033
|
+
self._model_signature_dict["predict"] = ModelSignature(
|
1034
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1035
|
+
)
|
932
1036
|
# For mixture models that use the density mixin, `predict` returns the argmax of the log prob.
|
933
1037
|
# For outlier models, returns -1 for outliers and 1 for inliers.
|
934
|
-
# Clusterer returns int64 cluster labels.
|
1038
|
+
# Clusterer returns int64 cluster labels.
|
935
1039
|
elif self._sklearn_object._estimator_type in ["DensityEstimator", "clusterer", "outlier_detector"]:
|
936
1040
|
outputs = [FeatureSpec(dtype=DataType.INT64, name=c) for c in self.output_cols]
|
937
|
-
self._model_signature_dict["predict"] = ModelSignature(
|
938
|
-
|
939
|
-
|
940
|
-
|
1041
|
+
self._model_signature_dict["predict"] = ModelSignature(
|
1042
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1043
|
+
)
|
1044
|
+
|
941
1045
|
# For regressor, the type of predict is float64
|
942
|
-
elif self._sklearn_object._estimator_type ==
|
1046
|
+
elif self._sklearn_object._estimator_type == "regressor":
|
943
1047
|
outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
|
944
|
-
self._model_signature_dict["predict"] = ModelSignature(
|
945
|
-
|
946
|
-
|
947
|
-
|
1048
|
+
self._model_signature_dict["predict"] = ModelSignature(
|
1049
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1050
|
+
)
|
1051
|
+
|
948
1052
|
for prob_func in PROB_FUNCTIONS:
|
949
1053
|
if hasattr(self, prob_func):
|
950
1054
|
output_cols_prefix: str = f"{prob_func}_"
|
951
1055
|
output_column_names = self._get_output_column_names(output_cols_prefix)
|
952
1056
|
outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
|
953
|
-
self._model_signature_dict[prob_func] = ModelSignature(
|
954
|
-
|
955
|
-
|
1057
|
+
self._model_signature_dict[prob_func] = ModelSignature(
|
1058
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1059
|
+
)
|
956
1060
|
|
957
1061
|
# Output signature names may still need to be renamed, since they were not created with `_infer_signature`.
|
958
1062
|
items = list(self._model_signature_dict.items())
|
@@ -965,10 +1069,10 @@ class SelectPercentile(BaseTransformer):
|
|
965
1069
|
"""Returns model signature of current class.
|
966
1070
|
|
967
1071
|
Raises:
|
968
|
-
|
1072
|
+
SnowflakeMLException: If estimator is not fitted, then model signature cannot be inferred
|
969
1073
|
|
970
1074
|
Returns:
|
971
|
-
Dict
|
1075
|
+
Dict with each method and its input output signature
|
972
1076
|
"""
|
973
1077
|
if self._model_signature_dict is None:
|
974
1078
|
raise exceptions.SnowflakeMLException(
|
@@ -976,35 +1080,3 @@ class SelectPercentile(BaseTransformer):
|
|
976
1080
|
original_exception=RuntimeError("Estimator not fitted before accessing property model_signatures!"),
|
977
1081
|
)
|
978
1082
|
return self._model_signature_dict
|
979
|
-
|
980
|
-
def to_sklearn(self) -> Any:
|
981
|
-
"""Get sklearn.feature_selection.SelectPercentile object.
|
982
|
-
"""
|
983
|
-
if self._sklearn_object is None:
|
984
|
-
self._sklearn_object = self._create_sklearn_object()
|
985
|
-
return self._sklearn_object
|
986
|
-
|
987
|
-
def to_xgboost(self) -> Any:
|
988
|
-
raise exceptions.SnowflakeMLException(
|
989
|
-
error_code=error_codes.METHOD_NOT_ALLOWED,
|
990
|
-
original_exception=AttributeError(
|
991
|
-
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
992
|
-
"to_xgboost()",
|
993
|
-
"to_sklearn()"
|
994
|
-
)
|
995
|
-
),
|
996
|
-
)
|
997
|
-
|
998
|
-
def to_lightgbm(self) -> Any:
|
999
|
-
raise exceptions.SnowflakeMLException(
|
1000
|
-
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1001
|
-
original_exception=AttributeError(
|
1002
|
-
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1003
|
-
"to_lightgbm()",
|
1004
|
-
"to_sklearn()"
|
1005
|
-
)
|
1006
|
-
),
|
1007
|
-
)
|
1008
|
-
|
1009
|
-
def _get_dependencies(self) -> List[str]:
|
1010
|
-
return self._deps
|