snowflake-ml-python 1.4.0__py3-none-any.whl → 1.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- snowflake/ml/_internal/env_utils.py +77 -32
- snowflake/ml/_internal/exceptions/dataset_error_messages.py +5 -0
- snowflake/ml/_internal/exceptions/dataset_errors.py +24 -0
- snowflake/ml/_internal/exceptions/error_codes.py +3 -0
- snowflake/ml/_internal/lineage/data_source.py +10 -0
- snowflake/ml/_internal/lineage/dataset_dataframe.py +44 -0
- snowflake/ml/_internal/utils/identifier.py +3 -1
- snowflake/ml/_internal/utils/sql_identifier.py +2 -6
- snowflake/ml/dataset/__init__.py +10 -0
- snowflake/ml/dataset/dataset.py +454 -129
- snowflake/ml/dataset/dataset_factory.py +53 -0
- snowflake/ml/dataset/dataset_metadata.py +103 -0
- snowflake/ml/dataset/dataset_reader.py +202 -0
- snowflake/ml/feature_store/feature_store.py +531 -332
- snowflake/ml/feature_store/feature_view.py +40 -23
- snowflake/ml/fileset/embedded_stage_fs.py +146 -0
- snowflake/ml/fileset/sfcfs.py +56 -54
- snowflake/ml/fileset/snowfs.py +159 -0
- snowflake/ml/fileset/stage_fs.py +49 -17
- snowflake/ml/model/__init__.py +2 -2
- snowflake/ml/model/_api.py +16 -1
- snowflake/ml/model/_client/model/model_impl.py +27 -0
- snowflake/ml/model/_client/model/model_version_impl.py +137 -50
- snowflake/ml/model/_client/ops/model_ops.py +159 -40
- snowflake/ml/model/_client/sql/model.py +25 -2
- snowflake/ml/model/_client/sql/model_version.py +131 -2
- snowflake/ml/model/_deploy_client/image_builds/server_image_builder.py +5 -1
- snowflake/ml/model/_deploy_client/image_builds/templates/dockerfile_template +1 -0
- snowflake/ml/model/_deploy_client/snowservice/deploy.py +2 -0
- snowflake/ml/model/_deploy_client/utils/constants.py +0 -5
- snowflake/ml/model/_deploy_client/utils/snowservice_client.py +21 -50
- snowflake/ml/model/_model_composer/model_composer.py +22 -1
- snowflake/ml/model/_model_composer/model_manifest/model_manifest.py +38 -51
- snowflake/ml/model/_model_composer/model_manifest/model_manifest_schema.py +19 -1
- snowflake/ml/model/_model_composer/model_method/model_method.py +6 -10
- snowflake/ml/model/_packager/model_env/model_env.py +41 -0
- snowflake/ml/model/_packager/model_handlers/catboost.py +206 -0
- snowflake/ml/model/_packager/model_handlers/lightgbm.py +218 -0
- snowflake/ml/model/_packager/model_handlers/sklearn.py +3 -0
- snowflake/ml/model/_packager/model_meta/_core_requirements.py +1 -1
- snowflake/ml/model/_packager/model_meta/model_meta.py +37 -11
- snowflake/ml/model/_packager/model_meta/model_meta_schema.py +20 -1
- snowflake/ml/model/_packager/model_meta_migrator/migrator_plans.py +3 -1
- snowflake/ml/model/_packager/model_packager.py +2 -5
- snowflake/ml/model/{_model_composer/model_runtime/_runtime_requirements.py → _packager/model_runtime/_snowml_inference_alternative_requirements.py} +1 -1
- snowflake/ml/model/_packager/model_runtime/model_runtime.py +137 -0
- snowflake/ml/model/type_hints.py +21 -2
- snowflake/ml/modeling/_internal/estimator_utils.py +16 -11
- snowflake/ml/modeling/_internal/local_implementations/pandas_handlers.py +4 -1
- snowflake/ml/modeling/_internal/local_implementations/pandas_trainer.py +55 -3
- snowflake/ml/modeling/_internal/ml_runtime_implementations/ml_runtime_handlers.py +34 -18
- snowflake/ml/modeling/_internal/model_trainer.py +7 -0
- snowflake/ml/modeling/_internal/model_trainer_builder.py +42 -9
- snowflake/ml/modeling/_internal/snowpark_implementations/distributed_hpo_trainer.py +13 -14
- snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_handlers.py +29 -7
- snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_trainer.py +261 -16
- snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +246 -175
- snowflake/ml/modeling/cluster/affinity_propagation.py +246 -175
- snowflake/ml/modeling/cluster/agglomerative_clustering.py +246 -175
- snowflake/ml/modeling/cluster/birch.py +248 -175
- snowflake/ml/modeling/cluster/bisecting_k_means.py +248 -175
- snowflake/ml/modeling/cluster/dbscan.py +246 -175
- snowflake/ml/modeling/cluster/feature_agglomeration.py +248 -175
- snowflake/ml/modeling/cluster/k_means.py +248 -175
- snowflake/ml/modeling/cluster/mean_shift.py +246 -175
- snowflake/ml/modeling/cluster/mini_batch_k_means.py +248 -175
- snowflake/ml/modeling/cluster/optics.py +246 -175
- snowflake/ml/modeling/cluster/spectral_biclustering.py +246 -175
- snowflake/ml/modeling/cluster/spectral_clustering.py +246 -175
- snowflake/ml/modeling/cluster/spectral_coclustering.py +246 -175
- snowflake/ml/modeling/compose/column_transformer.py +248 -175
- snowflake/ml/modeling/compose/transformed_target_regressor.py +246 -175
- snowflake/ml/modeling/covariance/elliptic_envelope.py +246 -175
- snowflake/ml/modeling/covariance/empirical_covariance.py +246 -175
- snowflake/ml/modeling/covariance/graphical_lasso.py +246 -175
- snowflake/ml/modeling/covariance/graphical_lasso_cv.py +246 -175
- snowflake/ml/modeling/covariance/ledoit_wolf.py +246 -175
- snowflake/ml/modeling/covariance/min_cov_det.py +246 -175
- snowflake/ml/modeling/covariance/oas.py +246 -175
- snowflake/ml/modeling/covariance/shrunk_covariance.py +246 -175
- snowflake/ml/modeling/decomposition/dictionary_learning.py +248 -175
- snowflake/ml/modeling/decomposition/factor_analysis.py +248 -175
- snowflake/ml/modeling/decomposition/fast_ica.py +248 -175
- snowflake/ml/modeling/decomposition/incremental_pca.py +248 -175
- snowflake/ml/modeling/decomposition/kernel_pca.py +248 -175
- snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +248 -175
- snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +248 -175
- snowflake/ml/modeling/decomposition/pca.py +248 -175
- snowflake/ml/modeling/decomposition/sparse_pca.py +248 -175
- snowflake/ml/modeling/decomposition/truncated_svd.py +248 -175
- snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +248 -175
- snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +246 -175
- snowflake/ml/modeling/ensemble/ada_boost_classifier.py +246 -175
- snowflake/ml/modeling/ensemble/ada_boost_regressor.py +246 -175
- snowflake/ml/modeling/ensemble/bagging_classifier.py +246 -175
- snowflake/ml/modeling/ensemble/bagging_regressor.py +246 -175
- snowflake/ml/modeling/ensemble/extra_trees_classifier.py +246 -175
- snowflake/ml/modeling/ensemble/extra_trees_regressor.py +246 -175
- snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +246 -175
- snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +246 -175
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +246 -175
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +246 -175
- snowflake/ml/modeling/ensemble/isolation_forest.py +246 -175
- snowflake/ml/modeling/ensemble/random_forest_classifier.py +246 -175
- snowflake/ml/modeling/ensemble/random_forest_regressor.py +246 -175
- snowflake/ml/modeling/ensemble/stacking_regressor.py +248 -175
- snowflake/ml/modeling/ensemble/voting_classifier.py +248 -175
- snowflake/ml/modeling/ensemble/voting_regressor.py +248 -175
- snowflake/ml/modeling/feature_selection/generic_univariate_select.py +248 -175
- snowflake/ml/modeling/feature_selection/select_fdr.py +248 -175
- snowflake/ml/modeling/feature_selection/select_fpr.py +248 -175
- snowflake/ml/modeling/feature_selection/select_fwe.py +248 -175
- snowflake/ml/modeling/feature_selection/select_k_best.py +248 -175
- snowflake/ml/modeling/feature_selection/select_percentile.py +248 -175
- snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +248 -175
- snowflake/ml/modeling/feature_selection/variance_threshold.py +248 -175
- snowflake/ml/modeling/framework/_utils.py +8 -1
- snowflake/ml/modeling/framework/base.py +72 -37
- snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +246 -175
- snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +246 -175
- snowflake/ml/modeling/impute/iterative_imputer.py +248 -175
- snowflake/ml/modeling/impute/knn_imputer.py +248 -175
- snowflake/ml/modeling/impute/missing_indicator.py +248 -175
- snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +248 -175
- snowflake/ml/modeling/kernel_approximation/nystroem.py +248 -175
- snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +248 -175
- snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +248 -175
- snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +248 -175
- snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +246 -175
- snowflake/ml/modeling/lightgbm/lgbm_classifier.py +246 -175
- snowflake/ml/modeling/lightgbm/lgbm_regressor.py +246 -175
- snowflake/ml/modeling/linear_model/ard_regression.py +246 -175
- snowflake/ml/modeling/linear_model/bayesian_ridge.py +246 -175
- snowflake/ml/modeling/linear_model/elastic_net.py +246 -175
- snowflake/ml/modeling/linear_model/elastic_net_cv.py +246 -175
- snowflake/ml/modeling/linear_model/gamma_regressor.py +246 -175
- snowflake/ml/modeling/linear_model/huber_regressor.py +246 -175
- snowflake/ml/modeling/linear_model/lars.py +246 -175
- snowflake/ml/modeling/linear_model/lars_cv.py +246 -175
- snowflake/ml/modeling/linear_model/lasso.py +246 -175
- snowflake/ml/modeling/linear_model/lasso_cv.py +246 -175
- snowflake/ml/modeling/linear_model/lasso_lars.py +246 -175
- snowflake/ml/modeling/linear_model/lasso_lars_cv.py +246 -175
- snowflake/ml/modeling/linear_model/lasso_lars_ic.py +246 -175
- snowflake/ml/modeling/linear_model/linear_regression.py +246 -175
- snowflake/ml/modeling/linear_model/logistic_regression.py +246 -175
- snowflake/ml/modeling/linear_model/logistic_regression_cv.py +246 -175
- snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +246 -175
- snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +246 -175
- snowflake/ml/modeling/linear_model/multi_task_lasso.py +246 -175
- snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +246 -175
- snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +246 -175
- snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +246 -175
- snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +246 -175
- snowflake/ml/modeling/linear_model/perceptron.py +246 -175
- snowflake/ml/modeling/linear_model/poisson_regressor.py +246 -175
- snowflake/ml/modeling/linear_model/ransac_regressor.py +246 -175
- snowflake/ml/modeling/linear_model/ridge.py +246 -175
- snowflake/ml/modeling/linear_model/ridge_classifier.py +246 -175
- snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +246 -175
- snowflake/ml/modeling/linear_model/ridge_cv.py +246 -175
- snowflake/ml/modeling/linear_model/sgd_classifier.py +246 -175
- snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +246 -175
- snowflake/ml/modeling/linear_model/sgd_regressor.py +246 -175
- snowflake/ml/modeling/linear_model/theil_sen_regressor.py +246 -175
- snowflake/ml/modeling/linear_model/tweedie_regressor.py +246 -175
- snowflake/ml/modeling/manifold/isomap.py +248 -175
- snowflake/ml/modeling/manifold/mds.py +248 -175
- snowflake/ml/modeling/manifold/spectral_embedding.py +248 -175
- snowflake/ml/modeling/manifold/tsne.py +248 -175
- snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +246 -175
- snowflake/ml/modeling/mixture/gaussian_mixture.py +246 -175
- snowflake/ml/modeling/model_selection/grid_search_cv.py +63 -41
- snowflake/ml/modeling/model_selection/randomized_search_cv.py +80 -38
- snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +246 -175
- snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +246 -175
- snowflake/ml/modeling/multiclass/output_code_classifier.py +246 -175
- snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +246 -175
- snowflake/ml/modeling/naive_bayes/categorical_nb.py +246 -175
- snowflake/ml/modeling/naive_bayes/complement_nb.py +246 -175
- snowflake/ml/modeling/naive_bayes/gaussian_nb.py +246 -175
- snowflake/ml/modeling/naive_bayes/multinomial_nb.py +246 -175
- snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +246 -175
- snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +246 -175
- snowflake/ml/modeling/neighbors/kernel_density.py +246 -175
- snowflake/ml/modeling/neighbors/local_outlier_factor.py +246 -175
- snowflake/ml/modeling/neighbors/nearest_centroid.py +246 -175
- snowflake/ml/modeling/neighbors/nearest_neighbors.py +246 -175
- snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +248 -175
- snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +246 -175
- snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +246 -175
- snowflake/ml/modeling/neural_network/bernoulli_rbm.py +248 -175
- snowflake/ml/modeling/neural_network/mlp_classifier.py +246 -175
- snowflake/ml/modeling/neural_network/mlp_regressor.py +246 -175
- snowflake/ml/modeling/pipeline/pipeline.py +517 -35
- snowflake/ml/modeling/preprocessing/binarizer.py +1 -5
- snowflake/ml/modeling/preprocessing/k_bins_discretizer.py +1 -5
- snowflake/ml/modeling/preprocessing/label_encoder.py +1 -5
- snowflake/ml/modeling/preprocessing/max_abs_scaler.py +1 -5
- snowflake/ml/modeling/preprocessing/min_max_scaler.py +10 -12
- snowflake/ml/modeling/preprocessing/normalizer.py +1 -5
- snowflake/ml/modeling/preprocessing/one_hot_encoder.py +13 -5
- snowflake/ml/modeling/preprocessing/ordinal_encoder.py +1 -5
- snowflake/ml/modeling/preprocessing/polynomial_features.py +248 -175
- snowflake/ml/modeling/preprocessing/robust_scaler.py +1 -5
- snowflake/ml/modeling/preprocessing/standard_scaler.py +11 -11
- snowflake/ml/modeling/semi_supervised/label_propagation.py +246 -175
- snowflake/ml/modeling/semi_supervised/label_spreading.py +246 -175
- snowflake/ml/modeling/svm/linear_svc.py +246 -175
- snowflake/ml/modeling/svm/linear_svr.py +246 -175
- snowflake/ml/modeling/svm/nu_svc.py +246 -175
- snowflake/ml/modeling/svm/nu_svr.py +246 -175
- snowflake/ml/modeling/svm/svc.py +246 -175
- snowflake/ml/modeling/svm/svr.py +246 -175
- snowflake/ml/modeling/tree/decision_tree_classifier.py +246 -175
- snowflake/ml/modeling/tree/decision_tree_regressor.py +246 -175
- snowflake/ml/modeling/tree/extra_tree_classifier.py +246 -175
- snowflake/ml/modeling/tree/extra_tree_regressor.py +246 -175
- snowflake/ml/modeling/xgboost/xgb_classifier.py +246 -175
- snowflake/ml/modeling/xgboost/xgb_regressor.py +246 -175
- snowflake/ml/modeling/xgboost/xgbrf_classifier.py +246 -175
- snowflake/ml/modeling/xgboost/xgbrf_regressor.py +246 -175
- snowflake/ml/registry/model_registry.py +3 -149
- snowflake/ml/registry/registry.py +1 -1
- snowflake/ml/version.py +1 -1
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.5.0.dist-info}/METADATA +129 -57
- snowflake_ml_python-1.5.0.dist-info/RECORD +380 -0
- snowflake/ml/model/_model_composer/model_runtime/model_runtime.py +0 -97
- snowflake/ml/registry/_artifact_manager.py +0 -156
- snowflake/ml/registry/artifact.py +0 -46
- snowflake_ml_python-1.4.0.dist-info/RECORD +0 -370
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.5.0.dist-info}/LICENSE.txt +0 -0
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.5.0.dist-info}/WHEEL +0 -0
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.5.0.dist-info}/top_level.txt +0 -0
@@ -34,6 +34,15 @@ from snowflake.ml.modeling._internal.transformer_protocols import (
|
|
34
34
|
BatchInferenceKwargsTypedDict,
|
35
35
|
ScoreKwargsTypedDict
|
36
36
|
)
|
37
|
+
from snowflake.ml.model._signatures import utils as model_signature_utils
|
38
|
+
from snowflake.ml.model.model_signature import (
|
39
|
+
BaseFeatureSpec,
|
40
|
+
DataType,
|
41
|
+
FeatureSpec,
|
42
|
+
ModelSignature,
|
43
|
+
_infer_signature,
|
44
|
+
_rename_signature_with_snowflake_identifiers,
|
45
|
+
)
|
37
46
|
|
38
47
|
from snowflake.ml.modeling._internal.model_transformer_builder import ModelTransformerBuilder
|
39
48
|
|
@@ -44,16 +53,6 @@ from snowflake.ml.modeling._internal.estimator_utils import (
|
|
44
53
|
validate_sklearn_args,
|
45
54
|
)
|
46
55
|
|
47
|
-
from snowflake.ml.model.model_signature import (
|
48
|
-
DataType,
|
49
|
-
FeatureSpec,
|
50
|
-
ModelSignature,
|
51
|
-
_infer_signature,
|
52
|
-
_rename_signature_with_snowflake_identifiers,
|
53
|
-
BaseFeatureSpec,
|
54
|
-
)
|
55
|
-
from snowflake.ml.model._signatures import utils as model_signature_utils
|
56
|
-
|
57
56
|
_PROJECT = "ModelDevelopment"
|
58
57
|
# Derive subproject from module name by removing "sklearn"
|
59
58
|
# and converting module name from underscore to CamelCase
|
@@ -62,12 +61,6 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.feature_selection".repla
|
|
62
61
|
|
63
62
|
DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]
|
64
63
|
|
65
|
-
def _is_fit_transform_method_enabled() -> Callable[[Any], bool]:
|
66
|
-
def check(self: BaseTransformer) -> TypeGuard[Callable[..., object]]:
|
67
|
-
return False and callable(getattr(self._sklearn_object, "fit_transform", None))
|
68
|
-
return check
|
69
|
-
|
70
|
-
|
71
64
|
class SelectFpr(BaseTransformer):
|
72
65
|
r"""Filter: Select the pvalues below alpha based on a FPR test
|
73
66
|
For more details on this class, see [sklearn.feature_selection.SelectFpr]
|
@@ -205,12 +198,7 @@ class SelectFpr(BaseTransformer):
|
|
205
198
|
)
|
206
199
|
return selected_cols
|
207
200
|
|
208
|
-
|
209
|
-
project=_PROJECT,
|
210
|
-
subproject=_SUBPROJECT,
|
211
|
-
custom_tags=dict([("autogen", True)]),
|
212
|
-
)
|
213
|
-
def fit(self, dataset: Union[DataFrame, pd.DataFrame]) -> "SelectFpr":
|
201
|
+
def _fit(self, dataset: Union[DataFrame, pd.DataFrame]) -> "SelectFpr":
|
214
202
|
"""Run score function on (X, y) and get the appropriate features
|
215
203
|
For more details on this function, see [sklearn.feature_selection.SelectFpr.fit]
|
216
204
|
(https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectFpr.html#sklearn.feature_selection.SelectFpr.fit)
|
@@ -237,12 +225,14 @@ class SelectFpr(BaseTransformer):
|
|
237
225
|
|
238
226
|
self._snowpark_cols = dataset.select(self.input_cols).columns
|
239
227
|
|
240
|
-
|
228
|
+
# If we are already in a stored procedure, no need to kick off another one.
|
241
229
|
if SNOWML_SPROC_ENV in os.environ:
|
242
230
|
statement_params = telemetry.get_function_usage_statement_params(
|
243
231
|
project=_PROJECT,
|
244
232
|
subproject=_SUBPROJECT,
|
245
|
-
function_name=telemetry.get_statement_params_full_func_name(
|
233
|
+
function_name=telemetry.get_statement_params_full_func_name(
|
234
|
+
inspect.currentframe(), SelectFpr.__class__.__name__
|
235
|
+
),
|
246
236
|
api_calls=[Session.call],
|
247
237
|
custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
|
248
238
|
)
|
@@ -263,27 +253,24 @@ class SelectFpr(BaseTransformer):
|
|
263
253
|
)
|
264
254
|
self._sklearn_object = model_trainer.train()
|
265
255
|
self._is_fitted = True
|
266
|
-
self.
|
256
|
+
self._generate_model_signatures(dataset)
|
267
257
|
return self
|
268
258
|
|
269
259
|
def _batch_inference_validate_snowpark(
|
270
260
|
self,
|
271
261
|
dataset: DataFrame,
|
272
262
|
inference_method: str,
|
273
|
-
) ->
|
274
|
-
"""Util method to run validate that batch inference can be run on a snowpark dataframe
|
275
|
-
return the available package that exists in the snowflake anaconda channel
|
263
|
+
) -> None:
|
264
|
+
"""Util method to run validate that batch inference can be run on a snowpark dataframe.
|
276
265
|
|
277
266
|
Args:
|
278
267
|
dataset: snowpark dataframe
|
279
268
|
inference_method: the inference method such as predict, score...
|
280
|
-
|
269
|
+
|
281
270
|
Raises:
|
282
271
|
SnowflakeMLException: If the estimator is not fitted, raise error
|
283
272
|
SnowflakeMLException: If the session is None, raise error
|
284
273
|
|
285
|
-
Returns:
|
286
|
-
A list of available package that exists in the snowflake anaconda channel
|
287
274
|
"""
|
288
275
|
if not self._is_fitted:
|
289
276
|
raise exceptions.SnowflakeMLException(
|
@@ -301,9 +288,7 @@ class SelectFpr(BaseTransformer):
|
|
301
288
|
"Session must not specified for snowpark dataset."
|
302
289
|
),
|
303
290
|
)
|
304
|
-
|
305
|
-
return pkg_version_utils.get_valid_pkg_versions_supported_in_snowflake_conda_channel(
|
306
|
-
pkg_versions=self._get_dependencies(), session=session, subproject=_SUBPROJECT)
|
291
|
+
|
307
292
|
|
308
293
|
@available_if(original_estimator_has_callable("predict")) # type: ignore[misc]
|
309
294
|
@telemetry.send_api_usage_telemetry(
|
@@ -337,7 +322,9 @@ class SelectFpr(BaseTransformer):
|
|
337
322
|
# when it is classifier, infer the datatype from label columns
|
338
323
|
if expected_type_inferred == "" and 'predict' in self.model_signatures:
|
339
324
|
# Batch inference takes a single expected output column type. Use the first columns type for now.
|
340
|
-
label_cols_signatures = [
|
325
|
+
label_cols_signatures = [
|
326
|
+
row for row in self.model_signatures['predict'].outputs if row.name in self.output_cols
|
327
|
+
]
|
341
328
|
if len(label_cols_signatures) == 0:
|
342
329
|
error_str = f"Output columns {self.output_cols} do not match model signatures {self.model_signatures['predict'].outputs}."
|
343
330
|
raise exceptions.SnowflakeMLException(
|
@@ -345,25 +332,23 @@ class SelectFpr(BaseTransformer):
|
|
345
332
|
original_exception=ValueError(error_str),
|
346
333
|
)
|
347
334
|
|
348
|
-
expected_type_inferred = convert_sp_to_sf_type(
|
349
|
-
label_cols_signatures[0].as_snowpark_type()
|
350
|
-
)
|
335
|
+
expected_type_inferred = convert_sp_to_sf_type(label_cols_signatures[0].as_snowpark_type())
|
351
336
|
|
352
|
-
self.
|
353
|
-
|
337
|
+
self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
|
338
|
+
self._deps = self._get_dependencies()
|
339
|
+
assert isinstance(
|
340
|
+
dataset._session, Session
|
341
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
354
342
|
|
355
343
|
transform_kwargs = dict(
|
356
|
-
session
|
357
|
-
dependencies
|
358
|
-
drop_input_cols
|
359
|
-
expected_output_cols_type
|
344
|
+
session=dataset._session,
|
345
|
+
dependencies=self._deps,
|
346
|
+
drop_input_cols=self._drop_input_cols,
|
347
|
+
expected_output_cols_type=expected_type_inferred,
|
360
348
|
)
|
361
349
|
|
362
350
|
elif isinstance(dataset, pd.DataFrame):
|
363
|
-
transform_kwargs = dict(
|
364
|
-
snowpark_input_cols = self._snowpark_cols,
|
365
|
-
drop_input_cols = self._drop_input_cols
|
366
|
-
)
|
351
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
367
352
|
|
368
353
|
transform_handlers = ModelTransformerBuilder.build(
|
369
354
|
dataset=dataset,
|
@@ -405,7 +390,7 @@ class SelectFpr(BaseTransformer):
|
|
405
390
|
Transformed dataset.
|
406
391
|
"""
|
407
392
|
super()._check_dataset_type(dataset)
|
408
|
-
inference_method="transform"
|
393
|
+
inference_method = "transform"
|
409
394
|
|
410
395
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
411
396
|
# are specific to the type of dataset used.
|
@@ -435,24 +420,19 @@ class SelectFpr(BaseTransformer):
|
|
435
420
|
if all(x == output_types[0] for x in output_types) and len(output_types) == len(self.output_cols):
|
436
421
|
expected_dtype = convert_sp_to_sf_type(output_types[0])
|
437
422
|
|
438
|
-
self.
|
439
|
-
|
440
|
-
inference_method=inference_method,
|
441
|
-
)
|
423
|
+
self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
|
424
|
+
self._deps = self._get_dependencies()
|
442
425
|
assert isinstance(dataset._session, Session) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
443
426
|
|
444
427
|
transform_kwargs = dict(
|
445
|
-
session
|
446
|
-
dependencies
|
447
|
-
drop_input_cols
|
448
|
-
expected_output_cols_type
|
428
|
+
session=dataset._session,
|
429
|
+
dependencies=self._deps,
|
430
|
+
drop_input_cols=self._drop_input_cols,
|
431
|
+
expected_output_cols_type=expected_dtype,
|
449
432
|
)
|
450
433
|
|
451
434
|
elif isinstance(dataset, pd.DataFrame):
|
452
|
-
transform_kwargs = dict(
|
453
|
-
snowpark_input_cols = self._snowpark_cols,
|
454
|
-
drop_input_cols = self._drop_input_cols
|
455
|
-
)
|
435
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
456
436
|
|
457
437
|
transform_handlers = ModelTransformerBuilder.build(
|
458
438
|
dataset=dataset,
|
@@ -471,7 +451,11 @@ class SelectFpr(BaseTransformer):
|
|
471
451
|
return output_df
|
472
452
|
|
473
453
|
@available_if(original_estimator_has_callable("fit_predict")) # type: ignore[misc]
|
474
|
-
def fit_predict(
|
454
|
+
def fit_predict(
|
455
|
+
self,
|
456
|
+
dataset: Union[DataFrame, pd.DataFrame],
|
457
|
+
output_cols_prefix: str = "fit_predict_",
|
458
|
+
) -> Union[DataFrame, pd.DataFrame]:
|
475
459
|
""" Method not supported for this class.
|
476
460
|
|
477
461
|
|
@@ -496,22 +480,106 @@ class SelectFpr(BaseTransformer):
|
|
496
480
|
)
|
497
481
|
output_result, fitted_estimator = model_trainer.train_fit_predict(
|
498
482
|
drop_input_cols=self._drop_input_cols,
|
499
|
-
expected_output_cols_list=
|
483
|
+
expected_output_cols_list=(
|
484
|
+
self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
|
485
|
+
),
|
500
486
|
)
|
501
487
|
self._sklearn_object = fitted_estimator
|
502
488
|
self._is_fitted = True
|
503
489
|
return output_result
|
504
490
|
|
491
|
+
|
492
|
+
@available_if(original_estimator_has_callable("fit_transform")) # type: ignore[misc]
|
493
|
+
def fit_transform(self, dataset: Union[DataFrame, pd.DataFrame], output_cols_prefix: str = "fit_transform_",) -> Union[DataFrame, pd.DataFrame]:
|
494
|
+
""" Fit to data, then transform it
|
495
|
+
For more details on this function, see [sklearn.feature_selection.SelectFpr.fit_transform]
|
496
|
+
(https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectFpr.html#sklearn.feature_selection.SelectFpr.fit_transform)
|
497
|
+
|
498
|
+
|
499
|
+
Raises:
|
500
|
+
TypeError: Supported dataset types: snowpark.DataFrame, pandas.DataFrame.
|
505
501
|
|
506
|
-
|
507
|
-
|
508
|
-
|
502
|
+
Args:
|
503
|
+
dataset: Union[snowflake.snowpark.DataFrame, pandas.DataFrame]
|
504
|
+
Snowpark or Pandas DataFrame.
|
505
|
+
output_cols_prefix: Prefix for the response columns
|
509
506
|
Returns:
|
510
507
|
Transformed dataset.
|
511
508
|
"""
|
512
|
-
self.
|
513
|
-
|
514
|
-
|
509
|
+
self._infer_input_output_cols(dataset)
|
510
|
+
super()._check_dataset_type(dataset)
|
511
|
+
model_trainer = ModelTrainerBuilder.build_fit_transform(
|
512
|
+
estimator=self._sklearn_object,
|
513
|
+
dataset=dataset,
|
514
|
+
input_cols=self.input_cols,
|
515
|
+
label_cols=self.label_cols,
|
516
|
+
sample_weight_col=self.sample_weight_col,
|
517
|
+
autogenerated=self._autogenerated,
|
518
|
+
subproject=_SUBPROJECT,
|
519
|
+
)
|
520
|
+
output_result, fitted_estimator = model_trainer.train_fit_transform(
|
521
|
+
drop_input_cols=self._drop_input_cols,
|
522
|
+
expected_output_cols_list=self.output_cols,
|
523
|
+
)
|
524
|
+
self._sklearn_object = fitted_estimator
|
525
|
+
self._is_fitted = True
|
526
|
+
return output_result
|
527
|
+
|
528
|
+
|
529
|
+
def _get_output_column_names(self, output_cols_prefix: str, output_cols: Optional[List[str]] = None) -> List[str]:
|
530
|
+
""" Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
|
531
|
+
Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
|
532
|
+
"""
|
533
|
+
output_cols_prefix = identifier.resolve_identifier(output_cols_prefix)
|
534
|
+
# The following condition is introduced for kneighbors methods, and not used in other methods
|
535
|
+
if output_cols:
|
536
|
+
output_cols = [
|
537
|
+
identifier.concat_names([output_cols_prefix, identifier.resolve_identifier(c)])
|
538
|
+
for c in output_cols
|
539
|
+
]
|
540
|
+
elif getattr(self._sklearn_object, "classes_", None) is None:
|
541
|
+
output_cols = [output_cols_prefix]
|
542
|
+
elif self._sklearn_object is not None:
|
543
|
+
classes = self._sklearn_object.classes_
|
544
|
+
if isinstance(classes, numpy.ndarray):
|
545
|
+
output_cols = [f'{output_cols_prefix}{str(c)}' for c in classes.tolist()]
|
546
|
+
elif isinstance(classes, list) and len(classes) > 0 and isinstance(classes[0], numpy.ndarray):
|
547
|
+
# If the estimator is a multioutput estimator, classes_ will be a list of ndarrays.
|
548
|
+
output_cols = []
|
549
|
+
for i, cl in enumerate(classes):
|
550
|
+
# For binary classification, there is only one output column for each class
|
551
|
+
# ndarray as the two classes are complementary.
|
552
|
+
if len(cl) == 2:
|
553
|
+
output_cols.append(f'{output_cols_prefix}{i}_{cl[0]}')
|
554
|
+
else:
|
555
|
+
output_cols.extend([
|
556
|
+
f'{output_cols_prefix}{i}_{c}' for c in cl.tolist()
|
557
|
+
])
|
558
|
+
else:
|
559
|
+
output_cols = []
|
560
|
+
|
561
|
+
# Make sure column names are valid snowflake identifiers.
|
562
|
+
assert output_cols is not None # Make MyPy happy
|
563
|
+
rv = [identifier.rename_to_valid_snowflake_identifier(c) for c in output_cols]
|
564
|
+
|
565
|
+
return rv
|
566
|
+
|
567
|
+
def _align_expected_output_names(
|
568
|
+
self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str
|
569
|
+
) -> List[str]:
|
570
|
+
# in case the inferred output column names dimension is different
|
571
|
+
# we use one line of snowpark dataframe and put it into sklearn estimator using pandas
|
572
|
+
output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
|
573
|
+
output_df_columns = list(output_df_pd.columns)
|
574
|
+
output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
|
575
|
+
if self.sample_weight_col:
|
576
|
+
output_df_columns_set -= set(self.sample_weight_col)
|
577
|
+
# if the dimension of inferred output column names is correct; use it
|
578
|
+
if len(expected_output_cols_list) == len(output_df_columns_set):
|
579
|
+
return expected_output_cols_list
|
580
|
+
# otherwise, use the sklearn estimator's output
|
581
|
+
else:
|
582
|
+
return sorted(list(output_df_columns_set), key=lambda x: output_df_columns.index(x))
|
515
583
|
|
516
584
|
@available_if(original_estimator_has_callable("predict_proba")) # type: ignore[misc]
|
517
585
|
@telemetry.send_api_usage_telemetry(
|
@@ -543,24 +611,26 @@ class SelectFpr(BaseTransformer):
|
|
543
611
|
# are specific to the type of dataset used.
|
544
612
|
transform_kwargs: BatchInferenceKwargsTypedDict = dict()
|
545
613
|
|
614
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
615
|
+
|
546
616
|
if isinstance(dataset, DataFrame):
|
547
|
-
self.
|
548
|
-
|
549
|
-
|
550
|
-
|
551
|
-
|
617
|
+
self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
|
618
|
+
self._deps = self._get_dependencies()
|
619
|
+
assert isinstance(
|
620
|
+
dataset._session, Session
|
621
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
552
622
|
transform_kwargs = dict(
|
553
623
|
session=dataset._session,
|
554
624
|
dependencies=self._deps,
|
555
|
-
drop_input_cols
|
625
|
+
drop_input_cols=self._drop_input_cols,
|
556
626
|
expected_output_cols_type="float",
|
557
627
|
)
|
628
|
+
expected_output_cols = self._align_expected_output_names(
|
629
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
630
|
+
)
|
558
631
|
|
559
632
|
elif isinstance(dataset, pd.DataFrame):
|
560
|
-
transform_kwargs = dict(
|
561
|
-
snowpark_input_cols = self._snowpark_cols,
|
562
|
-
drop_input_cols = self._drop_input_cols
|
563
|
-
)
|
633
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
564
634
|
|
565
635
|
transform_handlers = ModelTransformerBuilder.build(
|
566
636
|
dataset=dataset,
|
@@ -572,7 +642,7 @@ class SelectFpr(BaseTransformer):
|
|
572
642
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
573
643
|
inference_method=inference_method,
|
574
644
|
input_cols=self.input_cols,
|
575
|
-
expected_output_cols=
|
645
|
+
expected_output_cols=expected_output_cols,
|
576
646
|
**transform_kwargs
|
577
647
|
)
|
578
648
|
return output_df
|
@@ -602,29 +672,30 @@ class SelectFpr(BaseTransformer):
|
|
602
672
|
Output dataset with log probability of the sample for each class in the model.
|
603
673
|
"""
|
604
674
|
super()._check_dataset_type(dataset)
|
605
|
-
inference_method="predict_log_proba"
|
675
|
+
inference_method = "predict_log_proba"
|
676
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
606
677
|
|
607
678
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
608
679
|
# are specific to the type of dataset used.
|
609
680
|
transform_kwargs: BatchInferenceKwargsTypedDict = dict()
|
610
681
|
|
611
682
|
if isinstance(dataset, DataFrame):
|
612
|
-
self.
|
613
|
-
|
614
|
-
|
615
|
-
|
616
|
-
|
683
|
+
self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
|
684
|
+
self._deps = self._get_dependencies()
|
685
|
+
assert isinstance(
|
686
|
+
dataset._session, Session
|
687
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
617
688
|
transform_kwargs = dict(
|
618
689
|
session=dataset._session,
|
619
690
|
dependencies=self._deps,
|
620
|
-
drop_input_cols
|
691
|
+
drop_input_cols=self._drop_input_cols,
|
621
692
|
expected_output_cols_type="float",
|
622
693
|
)
|
694
|
+
expected_output_cols = self._align_expected_output_names(
|
695
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
696
|
+
)
|
623
697
|
elif isinstance(dataset, pd.DataFrame):
|
624
|
-
transform_kwargs = dict(
|
625
|
-
snowpark_input_cols = self._snowpark_cols,
|
626
|
-
drop_input_cols = self._drop_input_cols
|
627
|
-
)
|
698
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
628
699
|
|
629
700
|
transform_handlers = ModelTransformerBuilder.build(
|
630
701
|
dataset=dataset,
|
@@ -637,7 +708,7 @@ class SelectFpr(BaseTransformer):
|
|
637
708
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
638
709
|
inference_method=inference_method,
|
639
710
|
input_cols=self.input_cols,
|
640
|
-
expected_output_cols=
|
711
|
+
expected_output_cols=expected_output_cols,
|
641
712
|
**transform_kwargs
|
642
713
|
)
|
643
714
|
return output_df
|
@@ -663,30 +734,32 @@ class SelectFpr(BaseTransformer):
|
|
663
734
|
Output dataset with results of the decision function for the samples in input dataset.
|
664
735
|
"""
|
665
736
|
super()._check_dataset_type(dataset)
|
666
|
-
inference_method="decision_function"
|
737
|
+
inference_method = "decision_function"
|
667
738
|
|
668
739
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
669
740
|
# are specific to the type of dataset used.
|
670
741
|
transform_kwargs: BatchInferenceKwargsTypedDict = dict()
|
671
742
|
|
743
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
744
|
+
|
672
745
|
if isinstance(dataset, DataFrame):
|
673
|
-
self.
|
674
|
-
|
675
|
-
|
676
|
-
|
677
|
-
|
746
|
+
self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
|
747
|
+
self._deps = self._get_dependencies()
|
748
|
+
assert isinstance(
|
749
|
+
dataset._session, Session
|
750
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
678
751
|
transform_kwargs = dict(
|
679
752
|
session=dataset._session,
|
680
753
|
dependencies=self._deps,
|
681
|
-
drop_input_cols
|
754
|
+
drop_input_cols=self._drop_input_cols,
|
682
755
|
expected_output_cols_type="float",
|
683
756
|
)
|
757
|
+
expected_output_cols = self._align_expected_output_names(
|
758
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
759
|
+
)
|
684
760
|
|
685
761
|
elif isinstance(dataset, pd.DataFrame):
|
686
|
-
transform_kwargs = dict(
|
687
|
-
snowpark_input_cols = self._snowpark_cols,
|
688
|
-
drop_input_cols = self._drop_input_cols
|
689
|
-
)
|
762
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
690
763
|
|
691
764
|
transform_handlers = ModelTransformerBuilder.build(
|
692
765
|
dataset=dataset,
|
@@ -699,7 +772,7 @@ class SelectFpr(BaseTransformer):
|
|
699
772
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
700
773
|
inference_method=inference_method,
|
701
774
|
input_cols=self.input_cols,
|
702
|
-
expected_output_cols=
|
775
|
+
expected_output_cols=expected_output_cols,
|
703
776
|
**transform_kwargs
|
704
777
|
)
|
705
778
|
return output_df
|
@@ -728,17 +801,17 @@ class SelectFpr(BaseTransformer):
|
|
728
801
|
Output dataset with probability of the sample for each class in the model.
|
729
802
|
"""
|
730
803
|
super()._check_dataset_type(dataset)
|
731
|
-
inference_method="score_samples"
|
804
|
+
inference_method = "score_samples"
|
732
805
|
|
733
806
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
734
807
|
# are specific to the type of dataset used.
|
735
808
|
transform_kwargs: BatchInferenceKwargsTypedDict = dict()
|
736
809
|
|
810
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
811
|
+
|
737
812
|
if isinstance(dataset, DataFrame):
|
738
|
-
self.
|
739
|
-
|
740
|
-
inference_method=inference_method,
|
741
|
-
)
|
813
|
+
self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
|
814
|
+
self._deps = self._get_dependencies()
|
742
815
|
assert isinstance(dataset._session, Session) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
743
816
|
transform_kwargs = dict(
|
744
817
|
session=dataset._session,
|
@@ -746,6 +819,9 @@ class SelectFpr(BaseTransformer):
|
|
746
819
|
drop_input_cols = self._drop_input_cols,
|
747
820
|
expected_output_cols_type="float",
|
748
821
|
)
|
822
|
+
expected_output_cols = self._align_expected_output_names(
|
823
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
824
|
+
)
|
749
825
|
|
750
826
|
elif isinstance(dataset, pd.DataFrame):
|
751
827
|
transform_kwargs = dict(
|
@@ -764,7 +840,7 @@ class SelectFpr(BaseTransformer):
|
|
764
840
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
765
841
|
inference_method=inference_method,
|
766
842
|
input_cols=self.input_cols,
|
767
|
-
expected_output_cols=
|
843
|
+
expected_output_cols=expected_output_cols,
|
768
844
|
**transform_kwargs
|
769
845
|
)
|
770
846
|
return output_df
|
@@ -797,17 +873,15 @@ class SelectFpr(BaseTransformer):
|
|
797
873
|
transform_kwargs: ScoreKwargsTypedDict = dict()
|
798
874
|
|
799
875
|
if isinstance(dataset, DataFrame):
|
800
|
-
self.
|
801
|
-
|
802
|
-
inference_method="score",
|
803
|
-
)
|
876
|
+
self._batch_inference_validate_snowpark(dataset=dataset, inference_method="score")
|
877
|
+
self._deps = self._get_dependencies()
|
804
878
|
selected_cols = self._get_active_columns()
|
805
879
|
if len(selected_cols) > 0:
|
806
880
|
dataset = dataset.select(selected_cols)
|
807
881
|
assert isinstance(dataset._session, Session) # keep mypy happy
|
808
882
|
transform_kwargs = dict(
|
809
883
|
session=dataset._session,
|
810
|
-
dependencies=
|
884
|
+
dependencies=self._deps,
|
811
885
|
score_sproc_imports=['sklearn'],
|
812
886
|
)
|
813
887
|
elif isinstance(dataset, pd.DataFrame):
|
@@ -872,11 +946,8 @@ class SelectFpr(BaseTransformer):
|
|
872
946
|
|
873
947
|
if isinstance(dataset, DataFrame):
|
874
948
|
|
875
|
-
self.
|
876
|
-
|
877
|
-
inference_method=inference_method,
|
878
|
-
|
879
|
-
)
|
949
|
+
self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
|
950
|
+
self._deps = self._get_dependencies()
|
880
951
|
assert isinstance(dataset._session, Session) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
881
952
|
transform_kwargs = dict(
|
882
953
|
session = dataset._session,
|
@@ -909,50 +980,84 @@ class SelectFpr(BaseTransformer):
|
|
909
980
|
)
|
910
981
|
return output_df
|
911
982
|
|
983
|
+
|
984
|
+
|
985
|
+
def to_sklearn(self) -> Any:
|
986
|
+
"""Get sklearn.feature_selection.SelectFpr object.
|
987
|
+
"""
|
988
|
+
if self._sklearn_object is None:
|
989
|
+
self._sklearn_object = self._create_sklearn_object()
|
990
|
+
return self._sklearn_object
|
991
|
+
|
992
|
+
def to_xgboost(self) -> Any:
|
993
|
+
raise exceptions.SnowflakeMLException(
|
994
|
+
error_code=error_codes.METHOD_NOT_ALLOWED,
|
995
|
+
original_exception=AttributeError(
|
996
|
+
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
997
|
+
"to_xgboost()",
|
998
|
+
"to_sklearn()"
|
999
|
+
)
|
1000
|
+
),
|
1001
|
+
)
|
912
1002
|
|
913
|
-
def
|
1003
|
+
def to_lightgbm(self) -> Any:
|
1004
|
+
raise exceptions.SnowflakeMLException(
|
1005
|
+
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1006
|
+
original_exception=AttributeError(
|
1007
|
+
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1008
|
+
"to_lightgbm()",
|
1009
|
+
"to_sklearn()"
|
1010
|
+
)
|
1011
|
+
),
|
1012
|
+
)
|
1013
|
+
|
1014
|
+
def _get_dependencies(self) -> List[str]:
|
1015
|
+
return self._deps
|
1016
|
+
|
1017
|
+
|
1018
|
+
def _generate_model_signatures(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
|
914
1019
|
self._model_signature_dict = dict()
|
915
1020
|
|
916
1021
|
PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
|
917
1022
|
|
918
|
-
inputs = list(_infer_signature(dataset[self.input_cols], "input"))
|
1023
|
+
inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
|
919
1024
|
outputs: List[BaseFeatureSpec] = []
|
920
1025
|
if hasattr(self, "predict"):
|
921
1026
|
# keep mypy happy
|
922
|
-
assert self._sklearn_object is not None and hasattr(self._sklearn_object, "_estimator_type")
|
1027
|
+
assert self._sklearn_object is not None and hasattr(self._sklearn_object, "_estimator_type")
|
923
1028
|
# For classifier, the type of predict is the same as the type of label
|
924
|
-
if self._sklearn_object._estimator_type ==
|
925
|
-
|
1029
|
+
if self._sklearn_object._estimator_type == "classifier":
|
1030
|
+
# label columns is the desired type for output
|
926
1031
|
outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
|
927
1032
|
# rename the output columns
|
928
1033
|
outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
|
929
|
-
self._model_signature_dict["predict"] = ModelSignature(
|
930
|
-
|
931
|
-
|
1034
|
+
self._model_signature_dict["predict"] = ModelSignature(
|
1035
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1036
|
+
)
|
932
1037
|
# For mixture models that use the density mixin, `predict` returns the argmax of the log prob.
|
933
1038
|
# For outlier models, returns -1 for outliers and 1 for inliers.
|
934
|
-
# Clusterer returns int64 cluster labels.
|
1039
|
+
# Clusterer returns int64 cluster labels.
|
935
1040
|
elif self._sklearn_object._estimator_type in ["DensityEstimator", "clusterer", "outlier_detector"]:
|
936
1041
|
outputs = [FeatureSpec(dtype=DataType.INT64, name=c) for c in self.output_cols]
|
937
|
-
self._model_signature_dict["predict"] = ModelSignature(
|
938
|
-
|
939
|
-
|
940
|
-
|
1042
|
+
self._model_signature_dict["predict"] = ModelSignature(
|
1043
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1044
|
+
)
|
1045
|
+
|
941
1046
|
# For regressor, the type of predict is float64
|
942
|
-
elif self._sklearn_object._estimator_type ==
|
1047
|
+
elif self._sklearn_object._estimator_type == "regressor":
|
943
1048
|
outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
|
944
|
-
self._model_signature_dict["predict"] = ModelSignature(
|
945
|
-
|
946
|
-
|
947
|
-
|
1049
|
+
self._model_signature_dict["predict"] = ModelSignature(
|
1050
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1051
|
+
)
|
1052
|
+
|
948
1053
|
for prob_func in PROB_FUNCTIONS:
|
949
1054
|
if hasattr(self, prob_func):
|
950
1055
|
output_cols_prefix: str = f"{prob_func}_"
|
951
1056
|
output_column_names = self._get_output_column_names(output_cols_prefix)
|
952
1057
|
outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
|
953
|
-
self._model_signature_dict[prob_func] = ModelSignature(
|
954
|
-
|
955
|
-
|
1058
|
+
self._model_signature_dict[prob_func] = ModelSignature(
|
1059
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1060
|
+
)
|
956
1061
|
|
957
1062
|
# Output signature names may still need to be renamed, since they were not created with `_infer_signature`.
|
958
1063
|
items = list(self._model_signature_dict.items())
|
@@ -965,10 +1070,10 @@ class SelectFpr(BaseTransformer):
|
|
965
1070
|
"""Returns model signature of current class.
|
966
1071
|
|
967
1072
|
Raises:
|
968
|
-
|
1073
|
+
SnowflakeMLException: If estimator is not fitted, then model signature cannot be inferred
|
969
1074
|
|
970
1075
|
Returns:
|
971
|
-
Dict
|
1076
|
+
Dict with each method and its input output signature
|
972
1077
|
"""
|
973
1078
|
if self._model_signature_dict is None:
|
974
1079
|
raise exceptions.SnowflakeMLException(
|
@@ -976,35 +1081,3 @@ class SelectFpr(BaseTransformer):
|
|
976
1081
|
original_exception=RuntimeError("Estimator not fitted before accessing property model_signatures!"),
|
977
1082
|
)
|
978
1083
|
return self._model_signature_dict
|
979
|
-
|
980
|
-
def to_sklearn(self) -> Any:
|
981
|
-
"""Get sklearn.feature_selection.SelectFpr object.
|
982
|
-
"""
|
983
|
-
if self._sklearn_object is None:
|
984
|
-
self._sklearn_object = self._create_sklearn_object()
|
985
|
-
return self._sklearn_object
|
986
|
-
|
987
|
-
def to_xgboost(self) -> Any:
|
988
|
-
raise exceptions.SnowflakeMLException(
|
989
|
-
error_code=error_codes.METHOD_NOT_ALLOWED,
|
990
|
-
original_exception=AttributeError(
|
991
|
-
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
992
|
-
"to_xgboost()",
|
993
|
-
"to_sklearn()"
|
994
|
-
)
|
995
|
-
),
|
996
|
-
)
|
997
|
-
|
998
|
-
def to_lightgbm(self) -> Any:
|
999
|
-
raise exceptions.SnowflakeMLException(
|
1000
|
-
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1001
|
-
original_exception=AttributeError(
|
1002
|
-
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1003
|
-
"to_lightgbm()",
|
1004
|
-
"to_sklearn()"
|
1005
|
-
)
|
1006
|
-
),
|
1007
|
-
)
|
1008
|
-
|
1009
|
-
def _get_dependencies(self) -> List[str]:
|
1010
|
-
return self._deps
|