snowflake-ml-python 1.4.0__py3-none-any.whl → 1.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- snowflake/ml/_internal/env_utils.py +11 -1
- snowflake/ml/_internal/utils/identifier.py +3 -1
- snowflake/ml/_internal/utils/sql_identifier.py +2 -6
- snowflake/ml/feature_store/feature_store.py +151 -78
- snowflake/ml/feature_store/feature_view.py +12 -24
- snowflake/ml/fileset/sfcfs.py +56 -50
- snowflake/ml/fileset/stage_fs.py +48 -13
- snowflake/ml/model/_client/model/model_version_impl.py +2 -50
- snowflake/ml/model/_client/ops/model_ops.py +78 -29
- snowflake/ml/model/_client/sql/model.py +23 -2
- snowflake/ml/model/_client/sql/model_version.py +22 -1
- snowflake/ml/model/_model_composer/model_manifest/model_manifest.py +19 -54
- snowflake/ml/model/_model_composer/model_manifest/model_manifest_schema.py +8 -1
- snowflake/ml/model/_model_composer/model_method/model_method.py +6 -10
- snowflake/ml/model/_packager/model_handlers/catboost.py +206 -0
- snowflake/ml/model/_packager/model_handlers/lightgbm.py +218 -0
- snowflake/ml/model/_packager/model_handlers/sklearn.py +3 -0
- snowflake/ml/model/_packager/model_meta/_core_requirements.py +1 -1
- snowflake/ml/model/_packager/model_meta/model_meta.py +36 -6
- snowflake/ml/model/_packager/model_meta/model_meta_schema.py +20 -1
- snowflake/ml/model/_packager/model_meta_migrator/migrator_plans.py +3 -1
- snowflake/ml/model/_packager/model_packager.py +2 -2
- snowflake/ml/model/{_model_composer/model_runtime/_runtime_requirements.py → _packager/model_runtime/_snowml_inference_alternative_requirements.py} +1 -1
- snowflake/ml/model/_packager/model_runtime/model_runtime.py +137 -0
- snowflake/ml/model/type_hints.py +21 -2
- snowflake/ml/modeling/_internal/estimator_utils.py +16 -11
- snowflake/ml/modeling/_internal/local_implementations/pandas_handlers.py +4 -1
- snowflake/ml/modeling/_internal/snowpark_implementations/distributed_hpo_trainer.py +13 -14
- snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_handlers.py +5 -5
- snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +195 -123
- snowflake/ml/modeling/cluster/affinity_propagation.py +195 -123
- snowflake/ml/modeling/cluster/agglomerative_clustering.py +195 -123
- snowflake/ml/modeling/cluster/birch.py +195 -123
- snowflake/ml/modeling/cluster/bisecting_k_means.py +195 -123
- snowflake/ml/modeling/cluster/dbscan.py +195 -123
- snowflake/ml/modeling/cluster/feature_agglomeration.py +195 -123
- snowflake/ml/modeling/cluster/k_means.py +195 -123
- snowflake/ml/modeling/cluster/mean_shift.py +195 -123
- snowflake/ml/modeling/cluster/mini_batch_k_means.py +195 -123
- snowflake/ml/modeling/cluster/optics.py +195 -123
- snowflake/ml/modeling/cluster/spectral_biclustering.py +195 -123
- snowflake/ml/modeling/cluster/spectral_clustering.py +195 -123
- snowflake/ml/modeling/cluster/spectral_coclustering.py +195 -123
- snowflake/ml/modeling/compose/column_transformer.py +195 -123
- snowflake/ml/modeling/compose/transformed_target_regressor.py +195 -123
- snowflake/ml/modeling/covariance/elliptic_envelope.py +195 -123
- snowflake/ml/modeling/covariance/empirical_covariance.py +195 -123
- snowflake/ml/modeling/covariance/graphical_lasso.py +195 -123
- snowflake/ml/modeling/covariance/graphical_lasso_cv.py +195 -123
- snowflake/ml/modeling/covariance/ledoit_wolf.py +195 -123
- snowflake/ml/modeling/covariance/min_cov_det.py +195 -123
- snowflake/ml/modeling/covariance/oas.py +195 -123
- snowflake/ml/modeling/covariance/shrunk_covariance.py +195 -123
- snowflake/ml/modeling/decomposition/dictionary_learning.py +195 -123
- snowflake/ml/modeling/decomposition/factor_analysis.py +195 -123
- snowflake/ml/modeling/decomposition/fast_ica.py +195 -123
- snowflake/ml/modeling/decomposition/incremental_pca.py +195 -123
- snowflake/ml/modeling/decomposition/kernel_pca.py +195 -123
- snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +195 -123
- snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +195 -123
- snowflake/ml/modeling/decomposition/pca.py +195 -123
- snowflake/ml/modeling/decomposition/sparse_pca.py +195 -123
- snowflake/ml/modeling/decomposition/truncated_svd.py +195 -123
- snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +195 -123
- snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +195 -123
- snowflake/ml/modeling/ensemble/ada_boost_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/ada_boost_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/bagging_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/bagging_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/extra_trees_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/extra_trees_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/isolation_forest.py +195 -123
- snowflake/ml/modeling/ensemble/random_forest_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/random_forest_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/stacking_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/voting_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/voting_regressor.py +195 -123
- snowflake/ml/modeling/feature_selection/generic_univariate_select.py +195 -123
- snowflake/ml/modeling/feature_selection/select_fdr.py +195 -123
- snowflake/ml/modeling/feature_selection/select_fpr.py +195 -123
- snowflake/ml/modeling/feature_selection/select_fwe.py +195 -123
- snowflake/ml/modeling/feature_selection/select_k_best.py +195 -123
- snowflake/ml/modeling/feature_selection/select_percentile.py +195 -123
- snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +195 -123
- snowflake/ml/modeling/feature_selection/variance_threshold.py +195 -123
- snowflake/ml/modeling/framework/_utils.py +8 -1
- snowflake/ml/modeling/framework/base.py +9 -1
- snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +195 -123
- snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +195 -123
- snowflake/ml/modeling/impute/iterative_imputer.py +195 -123
- snowflake/ml/modeling/impute/knn_imputer.py +195 -123
- snowflake/ml/modeling/impute/missing_indicator.py +195 -123
- snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +195 -123
- snowflake/ml/modeling/kernel_approximation/nystroem.py +195 -123
- snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +195 -123
- snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +195 -123
- snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +195 -123
- snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +195 -123
- snowflake/ml/modeling/lightgbm/lgbm_classifier.py +195 -123
- snowflake/ml/modeling/lightgbm/lgbm_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/ard_regression.py +195 -123
- snowflake/ml/modeling/linear_model/bayesian_ridge.py +195 -123
- snowflake/ml/modeling/linear_model/elastic_net.py +195 -123
- snowflake/ml/modeling/linear_model/elastic_net_cv.py +195 -123
- snowflake/ml/modeling/linear_model/gamma_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/huber_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/lars.py +195 -123
- snowflake/ml/modeling/linear_model/lars_cv.py +195 -123
- snowflake/ml/modeling/linear_model/lasso.py +195 -123
- snowflake/ml/modeling/linear_model/lasso_cv.py +195 -123
- snowflake/ml/modeling/linear_model/lasso_lars.py +195 -123
- snowflake/ml/modeling/linear_model/lasso_lars_cv.py +195 -123
- snowflake/ml/modeling/linear_model/lasso_lars_ic.py +195 -123
- snowflake/ml/modeling/linear_model/linear_regression.py +195 -123
- snowflake/ml/modeling/linear_model/logistic_regression.py +195 -123
- snowflake/ml/modeling/linear_model/logistic_regression_cv.py +195 -123
- snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +195 -123
- snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +195 -123
- snowflake/ml/modeling/linear_model/multi_task_lasso.py +195 -123
- snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +195 -123
- snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +195 -123
- snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +195 -123
- snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/perceptron.py +195 -123
- snowflake/ml/modeling/linear_model/poisson_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/ransac_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/ridge.py +195 -123
- snowflake/ml/modeling/linear_model/ridge_classifier.py +195 -123
- snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +195 -123
- snowflake/ml/modeling/linear_model/ridge_cv.py +195 -123
- snowflake/ml/modeling/linear_model/sgd_classifier.py +195 -123
- snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +195 -123
- snowflake/ml/modeling/linear_model/sgd_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/theil_sen_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/tweedie_regressor.py +195 -123
- snowflake/ml/modeling/manifold/isomap.py +195 -123
- snowflake/ml/modeling/manifold/mds.py +195 -123
- snowflake/ml/modeling/manifold/spectral_embedding.py +195 -123
- snowflake/ml/modeling/manifold/tsne.py +195 -123
- snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +195 -123
- snowflake/ml/modeling/mixture/gaussian_mixture.py +195 -123
- snowflake/ml/modeling/model_selection/grid_search_cv.py +42 -18
- snowflake/ml/modeling/model_selection/randomized_search_cv.py +42 -18
- snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +195 -123
- snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +195 -123
- snowflake/ml/modeling/multiclass/output_code_classifier.py +195 -123
- snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +195 -123
- snowflake/ml/modeling/naive_bayes/categorical_nb.py +195 -123
- snowflake/ml/modeling/naive_bayes/complement_nb.py +195 -123
- snowflake/ml/modeling/naive_bayes/gaussian_nb.py +195 -123
- snowflake/ml/modeling/naive_bayes/multinomial_nb.py +195 -123
- snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +195 -123
- snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +195 -123
- snowflake/ml/modeling/neighbors/kernel_density.py +195 -123
- snowflake/ml/modeling/neighbors/local_outlier_factor.py +195 -123
- snowflake/ml/modeling/neighbors/nearest_centroid.py +195 -123
- snowflake/ml/modeling/neighbors/nearest_neighbors.py +195 -123
- snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +195 -123
- snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +195 -123
- snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +195 -123
- snowflake/ml/modeling/neural_network/bernoulli_rbm.py +195 -123
- snowflake/ml/modeling/neural_network/mlp_classifier.py +195 -123
- snowflake/ml/modeling/neural_network/mlp_regressor.py +195 -123
- snowflake/ml/modeling/pipeline/pipeline.py +4 -4
- snowflake/ml/modeling/preprocessing/binarizer.py +1 -5
- snowflake/ml/modeling/preprocessing/k_bins_discretizer.py +1 -5
- snowflake/ml/modeling/preprocessing/label_encoder.py +1 -5
- snowflake/ml/modeling/preprocessing/max_abs_scaler.py +1 -5
- snowflake/ml/modeling/preprocessing/min_max_scaler.py +10 -12
- snowflake/ml/modeling/preprocessing/normalizer.py +1 -5
- snowflake/ml/modeling/preprocessing/one_hot_encoder.py +1 -5
- snowflake/ml/modeling/preprocessing/ordinal_encoder.py +1 -5
- snowflake/ml/modeling/preprocessing/polynomial_features.py +195 -123
- snowflake/ml/modeling/preprocessing/robust_scaler.py +1 -5
- snowflake/ml/modeling/preprocessing/standard_scaler.py +11 -11
- snowflake/ml/modeling/semi_supervised/label_propagation.py +195 -123
- snowflake/ml/modeling/semi_supervised/label_spreading.py +195 -123
- snowflake/ml/modeling/svm/linear_svc.py +195 -123
- snowflake/ml/modeling/svm/linear_svr.py +195 -123
- snowflake/ml/modeling/svm/nu_svc.py +195 -123
- snowflake/ml/modeling/svm/nu_svr.py +195 -123
- snowflake/ml/modeling/svm/svc.py +195 -123
- snowflake/ml/modeling/svm/svr.py +195 -123
- snowflake/ml/modeling/tree/decision_tree_classifier.py +195 -123
- snowflake/ml/modeling/tree/decision_tree_regressor.py +195 -123
- snowflake/ml/modeling/tree/extra_tree_classifier.py +195 -123
- snowflake/ml/modeling/tree/extra_tree_regressor.py +195 -123
- snowflake/ml/modeling/xgboost/xgb_classifier.py +195 -123
- snowflake/ml/modeling/xgboost/xgb_regressor.py +195 -123
- snowflake/ml/modeling/xgboost/xgbrf_classifier.py +195 -123
- snowflake/ml/modeling/xgboost/xgbrf_regressor.py +195 -123
- snowflake/ml/registry/registry.py +1 -1
- snowflake/ml/version.py +1 -1
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/METADATA +68 -57
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/RECORD +202 -200
- snowflake/ml/model/_model_composer/model_runtime/model_runtime.py +0 -97
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/LICENSE.txt +0 -0
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/WHEEL +0 -0
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/top_level.txt +0 -0
@@ -33,6 +33,15 @@ from snowflake.ml.modeling._internal.transformer_protocols import (
|
|
33
33
|
BatchInferenceKwargsTypedDict,
|
34
34
|
ScoreKwargsTypedDict
|
35
35
|
)
|
36
|
+
from snowflake.ml.model._signatures import utils as model_signature_utils
|
37
|
+
from snowflake.ml.model.model_signature import (
|
38
|
+
BaseFeatureSpec,
|
39
|
+
DataType,
|
40
|
+
FeatureSpec,
|
41
|
+
ModelSignature,
|
42
|
+
_infer_signature,
|
43
|
+
_rename_signature_with_snowflake_identifiers,
|
44
|
+
)
|
36
45
|
|
37
46
|
from snowflake.ml.modeling._internal.model_transformer_builder import ModelTransformerBuilder
|
38
47
|
|
@@ -43,16 +52,6 @@ from snowflake.ml.modeling._internal.estimator_utils import (
|
|
43
52
|
validate_sklearn_args,
|
44
53
|
)
|
45
54
|
|
46
|
-
from snowflake.ml.model.model_signature import (
|
47
|
-
DataType,
|
48
|
-
FeatureSpec,
|
49
|
-
ModelSignature,
|
50
|
-
_infer_signature,
|
51
|
-
_rename_signature_with_snowflake_identifiers,
|
52
|
-
BaseFeatureSpec,
|
53
|
-
)
|
54
|
-
from snowflake.ml.model._signatures import utils as model_signature_utils
|
55
|
-
|
56
55
|
_PROJECT = "ModelDevelopment"
|
57
56
|
# Derive subproject from module name by removing "sklearn"
|
58
57
|
# and converting module name from underscore to CamelCase
|
@@ -276,12 +275,7 @@ class BaggingClassifier(BaseTransformer):
|
|
276
275
|
)
|
277
276
|
return selected_cols
|
278
277
|
|
279
|
-
|
280
|
-
project=_PROJECT,
|
281
|
-
subproject=_SUBPROJECT,
|
282
|
-
custom_tags=dict([("autogen", True)]),
|
283
|
-
)
|
284
|
-
def fit(self, dataset: Union[DataFrame, pd.DataFrame]) -> "BaggingClassifier":
|
278
|
+
def _fit(self, dataset: Union[DataFrame, pd.DataFrame]) -> "BaggingClassifier":
|
285
279
|
"""Build a Bagging ensemble of estimators from the training set (X, y)
|
286
280
|
For more details on this function, see [sklearn.ensemble.BaggingClassifier.fit]
|
287
281
|
(https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingClassifier.html#sklearn.ensemble.BaggingClassifier.fit)
|
@@ -308,12 +302,14 @@ class BaggingClassifier(BaseTransformer):
|
|
308
302
|
|
309
303
|
self._snowpark_cols = dataset.select(self.input_cols).columns
|
310
304
|
|
311
|
-
|
305
|
+
# If we are already in a stored procedure, no need to kick off another one.
|
312
306
|
if SNOWML_SPROC_ENV in os.environ:
|
313
307
|
statement_params = telemetry.get_function_usage_statement_params(
|
314
308
|
project=_PROJECT,
|
315
309
|
subproject=_SUBPROJECT,
|
316
|
-
function_name=telemetry.get_statement_params_full_func_name(
|
310
|
+
function_name=telemetry.get_statement_params_full_func_name(
|
311
|
+
inspect.currentframe(), BaggingClassifier.__class__.__name__
|
312
|
+
),
|
317
313
|
api_calls=[Session.call],
|
318
314
|
custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
|
319
315
|
)
|
@@ -334,7 +330,7 @@ class BaggingClassifier(BaseTransformer):
|
|
334
330
|
)
|
335
331
|
self._sklearn_object = model_trainer.train()
|
336
332
|
self._is_fitted = True
|
337
|
-
self.
|
333
|
+
self._generate_model_signatures(dataset)
|
338
334
|
return self
|
339
335
|
|
340
336
|
def _batch_inference_validate_snowpark(
|
@@ -410,7 +406,9 @@ class BaggingClassifier(BaseTransformer):
|
|
410
406
|
# when it is classifier, infer the datatype from label columns
|
411
407
|
if expected_type_inferred == "" and 'predict' in self.model_signatures:
|
412
408
|
# Batch inference takes a single expected output column type. Use the first columns type for now.
|
413
|
-
label_cols_signatures = [
|
409
|
+
label_cols_signatures = [
|
410
|
+
row for row in self.model_signatures['predict'].outputs if row.name in self.output_cols
|
411
|
+
]
|
414
412
|
if len(label_cols_signatures) == 0:
|
415
413
|
error_str = f"Output columns {self.output_cols} do not match model signatures {self.model_signatures['predict'].outputs}."
|
416
414
|
raise exceptions.SnowflakeMLException(
|
@@ -418,25 +416,22 @@ class BaggingClassifier(BaseTransformer):
|
|
418
416
|
original_exception=ValueError(error_str),
|
419
417
|
)
|
420
418
|
|
421
|
-
expected_type_inferred = convert_sp_to_sf_type(
|
422
|
-
label_cols_signatures[0].as_snowpark_type()
|
423
|
-
)
|
419
|
+
expected_type_inferred = convert_sp_to_sf_type(label_cols_signatures[0].as_snowpark_type())
|
424
420
|
|
425
421
|
self._deps = self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
|
426
|
-
assert isinstance(
|
422
|
+
assert isinstance(
|
423
|
+
dataset._session, Session
|
424
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
427
425
|
|
428
426
|
transform_kwargs = dict(
|
429
|
-
session
|
430
|
-
dependencies
|
431
|
-
drop_input_cols
|
432
|
-
expected_output_cols_type
|
427
|
+
session=dataset._session,
|
428
|
+
dependencies=self._deps,
|
429
|
+
drop_input_cols=self._drop_input_cols,
|
430
|
+
expected_output_cols_type=expected_type_inferred,
|
433
431
|
)
|
434
432
|
|
435
433
|
elif isinstance(dataset, pd.DataFrame):
|
436
|
-
transform_kwargs = dict(
|
437
|
-
snowpark_input_cols = self._snowpark_cols,
|
438
|
-
drop_input_cols = self._drop_input_cols
|
439
|
-
)
|
434
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
440
435
|
|
441
436
|
transform_handlers = ModelTransformerBuilder.build(
|
442
437
|
dataset=dataset,
|
@@ -476,7 +471,7 @@ class BaggingClassifier(BaseTransformer):
|
|
476
471
|
Transformed dataset.
|
477
472
|
"""
|
478
473
|
super()._check_dataset_type(dataset)
|
479
|
-
inference_method="transform"
|
474
|
+
inference_method = "transform"
|
480
475
|
|
481
476
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
482
477
|
# are specific to the type of dataset used.
|
@@ -513,17 +508,14 @@ class BaggingClassifier(BaseTransformer):
|
|
513
508
|
assert isinstance(dataset._session, Session) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
514
509
|
|
515
510
|
transform_kwargs = dict(
|
516
|
-
session
|
517
|
-
dependencies
|
518
|
-
drop_input_cols
|
519
|
-
expected_output_cols_type
|
511
|
+
session=dataset._session,
|
512
|
+
dependencies=self._deps,
|
513
|
+
drop_input_cols=self._drop_input_cols,
|
514
|
+
expected_output_cols_type=expected_dtype,
|
520
515
|
)
|
521
516
|
|
522
517
|
elif isinstance(dataset, pd.DataFrame):
|
523
|
-
transform_kwargs = dict(
|
524
|
-
snowpark_input_cols = self._snowpark_cols,
|
525
|
-
drop_input_cols = self._drop_input_cols
|
526
|
-
)
|
518
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
527
519
|
|
528
520
|
transform_handlers = ModelTransformerBuilder.build(
|
529
521
|
dataset=dataset,
|
@@ -542,7 +534,11 @@ class BaggingClassifier(BaseTransformer):
|
|
542
534
|
return output_df
|
543
535
|
|
544
536
|
@available_if(original_estimator_has_callable("fit_predict")) # type: ignore[misc]
|
545
|
-
def fit_predict(
|
537
|
+
def fit_predict(
|
538
|
+
self,
|
539
|
+
dataset: Union[DataFrame, pd.DataFrame],
|
540
|
+
output_cols_prefix: str = "fit_predict_",
|
541
|
+
) -> Union[DataFrame, pd.DataFrame]:
|
546
542
|
""" Method not supported for this class.
|
547
543
|
|
548
544
|
|
@@ -567,7 +563,9 @@ class BaggingClassifier(BaseTransformer):
|
|
567
563
|
)
|
568
564
|
output_result, fitted_estimator = model_trainer.train_fit_predict(
|
569
565
|
drop_input_cols=self._drop_input_cols,
|
570
|
-
expected_output_cols_list=
|
566
|
+
expected_output_cols_list=(
|
567
|
+
self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
|
568
|
+
),
|
571
569
|
)
|
572
570
|
self._sklearn_object = fitted_estimator
|
573
571
|
self._is_fitted = True
|
@@ -584,6 +582,62 @@ class BaggingClassifier(BaseTransformer):
|
|
584
582
|
assert self._sklearn_object is not None
|
585
583
|
return self._sklearn_object.embedding_
|
586
584
|
|
585
|
+
|
586
|
+
def _get_output_column_names(self, output_cols_prefix: str, output_cols: Optional[List[str]] = None) -> List[str]:
|
587
|
+
""" Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
|
588
|
+
Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
|
589
|
+
"""
|
590
|
+
output_cols_prefix = identifier.resolve_identifier(output_cols_prefix)
|
591
|
+
# The following condition is introduced for kneighbors methods, and not used in other methods
|
592
|
+
if output_cols:
|
593
|
+
output_cols = [
|
594
|
+
identifier.concat_names([output_cols_prefix, identifier.resolve_identifier(c)])
|
595
|
+
for c in output_cols
|
596
|
+
]
|
597
|
+
elif getattr(self._sklearn_object, "classes_", None) is None:
|
598
|
+
output_cols = [output_cols_prefix]
|
599
|
+
elif self._sklearn_object is not None:
|
600
|
+
classes = self._sklearn_object.classes_
|
601
|
+
if isinstance(classes, numpy.ndarray):
|
602
|
+
output_cols = [f'{output_cols_prefix}{str(c)}' for c in classes.tolist()]
|
603
|
+
elif isinstance(classes, list) and len(classes) > 0 and isinstance(classes[0], numpy.ndarray):
|
604
|
+
# If the estimator is a multioutput estimator, classes_ will be a list of ndarrays.
|
605
|
+
output_cols = []
|
606
|
+
for i, cl in enumerate(classes):
|
607
|
+
# For binary classification, there is only one output column for each class
|
608
|
+
# ndarray as the two classes are complementary.
|
609
|
+
if len(cl) == 2:
|
610
|
+
output_cols.append(f'{output_cols_prefix}{i}_{cl[0]}')
|
611
|
+
else:
|
612
|
+
output_cols.extend([
|
613
|
+
f'{output_cols_prefix}{i}_{c}' for c in cl.tolist()
|
614
|
+
])
|
615
|
+
else:
|
616
|
+
output_cols = []
|
617
|
+
|
618
|
+
# Make sure column names are valid snowflake identifiers.
|
619
|
+
assert output_cols is not None # Make MyPy happy
|
620
|
+
rv = [identifier.rename_to_valid_snowflake_identifier(c) for c in output_cols]
|
621
|
+
|
622
|
+
return rv
|
623
|
+
|
624
|
+
def _align_expected_output_names(
|
625
|
+
self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str
|
626
|
+
) -> List[str]:
|
627
|
+
# in case the inferred output column names dimension is different
|
628
|
+
# we use one line of snowpark dataframe and put it into sklearn estimator using pandas
|
629
|
+
output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
|
630
|
+
output_df_columns = list(output_df_pd.columns)
|
631
|
+
output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
|
632
|
+
if self.sample_weight_col:
|
633
|
+
output_df_columns_set -= set(self.sample_weight_col)
|
634
|
+
# if the dimension of inferred output column names is correct; use it
|
635
|
+
if len(expected_output_cols_list) == len(output_df_columns_set):
|
636
|
+
return expected_output_cols_list
|
637
|
+
# otherwise, use the sklearn estimator's output
|
638
|
+
else:
|
639
|
+
return sorted(list(output_df_columns_set), key=lambda x: output_df_columns.index(x))
|
640
|
+
|
587
641
|
@available_if(original_estimator_has_callable("predict_proba")) # type: ignore[misc]
|
588
642
|
@telemetry.send_api_usage_telemetry(
|
589
643
|
project=_PROJECT,
|
@@ -616,24 +670,28 @@ class BaggingClassifier(BaseTransformer):
|
|
616
670
|
# are specific to the type of dataset used.
|
617
671
|
transform_kwargs: BatchInferenceKwargsTypedDict = dict()
|
618
672
|
|
673
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
674
|
+
|
619
675
|
if isinstance(dataset, DataFrame):
|
620
676
|
self._deps = self._batch_inference_validate_snowpark(
|
621
677
|
dataset=dataset,
|
622
678
|
inference_method=inference_method,
|
623
679
|
)
|
624
|
-
assert isinstance(
|
680
|
+
assert isinstance(
|
681
|
+
dataset._session, Session
|
682
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
625
683
|
transform_kwargs = dict(
|
626
684
|
session=dataset._session,
|
627
685
|
dependencies=self._deps,
|
628
|
-
drop_input_cols
|
686
|
+
drop_input_cols=self._drop_input_cols,
|
629
687
|
expected_output_cols_type="float",
|
630
688
|
)
|
689
|
+
expected_output_cols = self._align_expected_output_names(
|
690
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
691
|
+
)
|
631
692
|
|
632
693
|
elif isinstance(dataset, pd.DataFrame):
|
633
|
-
transform_kwargs = dict(
|
634
|
-
snowpark_input_cols = self._snowpark_cols,
|
635
|
-
drop_input_cols = self._drop_input_cols
|
636
|
-
)
|
694
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
637
695
|
|
638
696
|
transform_handlers = ModelTransformerBuilder.build(
|
639
697
|
dataset=dataset,
|
@@ -645,7 +703,7 @@ class BaggingClassifier(BaseTransformer):
|
|
645
703
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
646
704
|
inference_method=inference_method,
|
647
705
|
input_cols=self.input_cols,
|
648
|
-
expected_output_cols=
|
706
|
+
expected_output_cols=expected_output_cols,
|
649
707
|
**transform_kwargs
|
650
708
|
)
|
651
709
|
return output_df
|
@@ -677,7 +735,8 @@ class BaggingClassifier(BaseTransformer):
|
|
677
735
|
Output dataset with log probability of the sample for each class in the model.
|
678
736
|
"""
|
679
737
|
super()._check_dataset_type(dataset)
|
680
|
-
inference_method="predict_log_proba"
|
738
|
+
inference_method = "predict_log_proba"
|
739
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
681
740
|
|
682
741
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
683
742
|
# are specific to the type of dataset used.
|
@@ -688,18 +747,20 @@ class BaggingClassifier(BaseTransformer):
|
|
688
747
|
dataset=dataset,
|
689
748
|
inference_method=inference_method,
|
690
749
|
)
|
691
|
-
assert isinstance(
|
750
|
+
assert isinstance(
|
751
|
+
dataset._session, Session
|
752
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
692
753
|
transform_kwargs = dict(
|
693
754
|
session=dataset._session,
|
694
755
|
dependencies=self._deps,
|
695
|
-
drop_input_cols
|
756
|
+
drop_input_cols=self._drop_input_cols,
|
696
757
|
expected_output_cols_type="float",
|
697
758
|
)
|
759
|
+
expected_output_cols = self._align_expected_output_names(
|
760
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
761
|
+
)
|
698
762
|
elif isinstance(dataset, pd.DataFrame):
|
699
|
-
transform_kwargs = dict(
|
700
|
-
snowpark_input_cols = self._snowpark_cols,
|
701
|
-
drop_input_cols = self._drop_input_cols
|
702
|
-
)
|
763
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
703
764
|
|
704
765
|
transform_handlers = ModelTransformerBuilder.build(
|
705
766
|
dataset=dataset,
|
@@ -712,7 +773,7 @@ class BaggingClassifier(BaseTransformer):
|
|
712
773
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
713
774
|
inference_method=inference_method,
|
714
775
|
input_cols=self.input_cols,
|
715
|
-
expected_output_cols=
|
776
|
+
expected_output_cols=expected_output_cols,
|
716
777
|
**transform_kwargs
|
717
778
|
)
|
718
779
|
return output_df
|
@@ -740,30 +801,34 @@ class BaggingClassifier(BaseTransformer):
|
|
740
801
|
Output dataset with results of the decision function for the samples in input dataset.
|
741
802
|
"""
|
742
803
|
super()._check_dataset_type(dataset)
|
743
|
-
inference_method="decision_function"
|
804
|
+
inference_method = "decision_function"
|
744
805
|
|
745
806
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
746
807
|
# are specific to the type of dataset used.
|
747
808
|
transform_kwargs: BatchInferenceKwargsTypedDict = dict()
|
748
809
|
|
810
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
811
|
+
|
749
812
|
if isinstance(dataset, DataFrame):
|
750
813
|
self._deps = self._batch_inference_validate_snowpark(
|
751
814
|
dataset=dataset,
|
752
815
|
inference_method=inference_method,
|
753
816
|
)
|
754
|
-
assert isinstance(
|
817
|
+
assert isinstance(
|
818
|
+
dataset._session, Session
|
819
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
755
820
|
transform_kwargs = dict(
|
756
821
|
session=dataset._session,
|
757
822
|
dependencies=self._deps,
|
758
|
-
drop_input_cols
|
823
|
+
drop_input_cols=self._drop_input_cols,
|
759
824
|
expected_output_cols_type="float",
|
760
825
|
)
|
826
|
+
expected_output_cols = self._align_expected_output_names(
|
827
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
828
|
+
)
|
761
829
|
|
762
830
|
elif isinstance(dataset, pd.DataFrame):
|
763
|
-
transform_kwargs = dict(
|
764
|
-
snowpark_input_cols = self._snowpark_cols,
|
765
|
-
drop_input_cols = self._drop_input_cols
|
766
|
-
)
|
831
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
767
832
|
|
768
833
|
transform_handlers = ModelTransformerBuilder.build(
|
769
834
|
dataset=dataset,
|
@@ -776,7 +841,7 @@ class BaggingClassifier(BaseTransformer):
|
|
776
841
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
777
842
|
inference_method=inference_method,
|
778
843
|
input_cols=self.input_cols,
|
779
|
-
expected_output_cols=
|
844
|
+
expected_output_cols=expected_output_cols,
|
780
845
|
**transform_kwargs
|
781
846
|
)
|
782
847
|
return output_df
|
@@ -805,12 +870,14 @@ class BaggingClassifier(BaseTransformer):
|
|
805
870
|
Output dataset with probability of the sample for each class in the model.
|
806
871
|
"""
|
807
872
|
super()._check_dataset_type(dataset)
|
808
|
-
inference_method="score_samples"
|
873
|
+
inference_method = "score_samples"
|
809
874
|
|
810
875
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
811
876
|
# are specific to the type of dataset used.
|
812
877
|
transform_kwargs: BatchInferenceKwargsTypedDict = dict()
|
813
878
|
|
879
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
880
|
+
|
814
881
|
if isinstance(dataset, DataFrame):
|
815
882
|
self._deps = self._batch_inference_validate_snowpark(
|
816
883
|
dataset=dataset,
|
@@ -823,6 +890,9 @@ class BaggingClassifier(BaseTransformer):
|
|
823
890
|
drop_input_cols = self._drop_input_cols,
|
824
891
|
expected_output_cols_type="float",
|
825
892
|
)
|
893
|
+
expected_output_cols = self._align_expected_output_names(
|
894
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
895
|
+
)
|
826
896
|
|
827
897
|
elif isinstance(dataset, pd.DataFrame):
|
828
898
|
transform_kwargs = dict(
|
@@ -841,7 +911,7 @@ class BaggingClassifier(BaseTransformer):
|
|
841
911
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
842
912
|
inference_method=inference_method,
|
843
913
|
input_cols=self.input_cols,
|
844
|
-
expected_output_cols=
|
914
|
+
expected_output_cols=expected_output_cols,
|
845
915
|
**transform_kwargs
|
846
916
|
)
|
847
917
|
return output_df
|
@@ -988,50 +1058,84 @@ class BaggingClassifier(BaseTransformer):
|
|
988
1058
|
)
|
989
1059
|
return output_df
|
990
1060
|
|
1061
|
+
|
1062
|
+
|
1063
|
+
def to_sklearn(self) -> Any:
|
1064
|
+
"""Get sklearn.ensemble.BaggingClassifier object.
|
1065
|
+
"""
|
1066
|
+
if self._sklearn_object is None:
|
1067
|
+
self._sklearn_object = self._create_sklearn_object()
|
1068
|
+
return self._sklearn_object
|
1069
|
+
|
1070
|
+
def to_xgboost(self) -> Any:
|
1071
|
+
raise exceptions.SnowflakeMLException(
|
1072
|
+
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1073
|
+
original_exception=AttributeError(
|
1074
|
+
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1075
|
+
"to_xgboost()",
|
1076
|
+
"to_sklearn()"
|
1077
|
+
)
|
1078
|
+
),
|
1079
|
+
)
|
1080
|
+
|
1081
|
+
def to_lightgbm(self) -> Any:
|
1082
|
+
raise exceptions.SnowflakeMLException(
|
1083
|
+
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1084
|
+
original_exception=AttributeError(
|
1085
|
+
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1086
|
+
"to_lightgbm()",
|
1087
|
+
"to_sklearn()"
|
1088
|
+
)
|
1089
|
+
),
|
1090
|
+
)
|
991
1091
|
|
992
|
-
def
|
1092
|
+
def _get_dependencies(self) -> List[str]:
|
1093
|
+
return self._deps
|
1094
|
+
|
1095
|
+
|
1096
|
+
def _generate_model_signatures(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
|
993
1097
|
self._model_signature_dict = dict()
|
994
1098
|
|
995
1099
|
PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
|
996
1100
|
|
997
|
-
inputs = list(_infer_signature(dataset[self.input_cols], "input"))
|
1101
|
+
inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
|
998
1102
|
outputs: List[BaseFeatureSpec] = []
|
999
1103
|
if hasattr(self, "predict"):
|
1000
1104
|
# keep mypy happy
|
1001
|
-
assert self._sklearn_object is not None and hasattr(self._sklearn_object, "_estimator_type")
|
1105
|
+
assert self._sklearn_object is not None and hasattr(self._sklearn_object, "_estimator_type")
|
1002
1106
|
# For classifier, the type of predict is the same as the type of label
|
1003
|
-
if self._sklearn_object._estimator_type ==
|
1004
|
-
|
1107
|
+
if self._sklearn_object._estimator_type == "classifier":
|
1108
|
+
# label columns is the desired type for output
|
1005
1109
|
outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
|
1006
1110
|
# rename the output columns
|
1007
1111
|
outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
|
1008
|
-
self._model_signature_dict["predict"] = ModelSignature(
|
1009
|
-
|
1010
|
-
|
1112
|
+
self._model_signature_dict["predict"] = ModelSignature(
|
1113
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1114
|
+
)
|
1011
1115
|
# For mixture models that use the density mixin, `predict` returns the argmax of the log prob.
|
1012
1116
|
# For outlier models, returns -1 for outliers and 1 for inliers.
|
1013
|
-
# Clusterer returns int64 cluster labels.
|
1117
|
+
# Clusterer returns int64 cluster labels.
|
1014
1118
|
elif self._sklearn_object._estimator_type in ["DensityEstimator", "clusterer", "outlier_detector"]:
|
1015
1119
|
outputs = [FeatureSpec(dtype=DataType.INT64, name=c) for c in self.output_cols]
|
1016
|
-
self._model_signature_dict["predict"] = ModelSignature(
|
1017
|
-
|
1018
|
-
|
1019
|
-
|
1120
|
+
self._model_signature_dict["predict"] = ModelSignature(
|
1121
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1122
|
+
)
|
1123
|
+
|
1020
1124
|
# For regressor, the type of predict is float64
|
1021
|
-
elif self._sklearn_object._estimator_type ==
|
1125
|
+
elif self._sklearn_object._estimator_type == "regressor":
|
1022
1126
|
outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
|
1023
|
-
self._model_signature_dict["predict"] = ModelSignature(
|
1024
|
-
|
1025
|
-
|
1026
|
-
|
1127
|
+
self._model_signature_dict["predict"] = ModelSignature(
|
1128
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1129
|
+
)
|
1130
|
+
|
1027
1131
|
for prob_func in PROB_FUNCTIONS:
|
1028
1132
|
if hasattr(self, prob_func):
|
1029
1133
|
output_cols_prefix: str = f"{prob_func}_"
|
1030
1134
|
output_column_names = self._get_output_column_names(output_cols_prefix)
|
1031
1135
|
outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
|
1032
|
-
self._model_signature_dict[prob_func] = ModelSignature(
|
1033
|
-
|
1034
|
-
|
1136
|
+
self._model_signature_dict[prob_func] = ModelSignature(
|
1137
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1138
|
+
)
|
1035
1139
|
|
1036
1140
|
# Output signature names may still need to be renamed, since they were not created with `_infer_signature`.
|
1037
1141
|
items = list(self._model_signature_dict.items())
|
@@ -1044,10 +1148,10 @@ class BaggingClassifier(BaseTransformer):
|
|
1044
1148
|
"""Returns model signature of current class.
|
1045
1149
|
|
1046
1150
|
Raises:
|
1047
|
-
|
1151
|
+
SnowflakeMLException: If estimator is not fitted, then model signature cannot be inferred
|
1048
1152
|
|
1049
1153
|
Returns:
|
1050
|
-
Dict
|
1154
|
+
Dict with each method and its input output signature
|
1051
1155
|
"""
|
1052
1156
|
if self._model_signature_dict is None:
|
1053
1157
|
raise exceptions.SnowflakeMLException(
|
@@ -1055,35 +1159,3 @@ class BaggingClassifier(BaseTransformer):
|
|
1055
1159
|
original_exception=RuntimeError("Estimator not fitted before accessing property model_signatures!"),
|
1056
1160
|
)
|
1057
1161
|
return self._model_signature_dict
|
1058
|
-
|
1059
|
-
def to_sklearn(self) -> Any:
|
1060
|
-
"""Get sklearn.ensemble.BaggingClassifier object.
|
1061
|
-
"""
|
1062
|
-
if self._sklearn_object is None:
|
1063
|
-
self._sklearn_object = self._create_sklearn_object()
|
1064
|
-
return self._sklearn_object
|
1065
|
-
|
1066
|
-
def to_xgboost(self) -> Any:
|
1067
|
-
raise exceptions.SnowflakeMLException(
|
1068
|
-
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1069
|
-
original_exception=AttributeError(
|
1070
|
-
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1071
|
-
"to_xgboost()",
|
1072
|
-
"to_sklearn()"
|
1073
|
-
)
|
1074
|
-
),
|
1075
|
-
)
|
1076
|
-
|
1077
|
-
def to_lightgbm(self) -> Any:
|
1078
|
-
raise exceptions.SnowflakeMLException(
|
1079
|
-
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1080
|
-
original_exception=AttributeError(
|
1081
|
-
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1082
|
-
"to_lightgbm()",
|
1083
|
-
"to_sklearn()"
|
1084
|
-
)
|
1085
|
-
),
|
1086
|
-
)
|
1087
|
-
|
1088
|
-
def _get_dependencies(self) -> List[str]:
|
1089
|
-
return self._deps
|