snowflake-ml-python 1.6.1__py3-none-any.whl → 1.6.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- snowflake/cortex/__init__.py +4 -0
- snowflake/cortex/_classify_text.py +2 -2
- snowflake/cortex/_embed_text_1024.py +37 -0
- snowflake/cortex/_embed_text_768.py +37 -0
- snowflake/cortex/_extract_answer.py +2 -2
- snowflake/cortex/_sentiment.py +2 -2
- snowflake/cortex/_summarize.py +2 -2
- snowflake/cortex/_translate.py +2 -2
- snowflake/cortex/_util.py +4 -4
- snowflake/ml/_internal/env_utils.py +5 -5
- snowflake/ml/_internal/exceptions/error_codes.py +2 -0
- snowflake/ml/_internal/telemetry.py +142 -20
- snowflake/ml/_internal/utils/db_utils.py +50 -0
- snowflake/ml/_internal/utils/identifier.py +48 -11
- snowflake/ml/_internal/utils/service_logger.py +63 -0
- snowflake/ml/_internal/utils/snowflake_env.py +23 -13
- snowflake/ml/_internal/utils/sql_identifier.py +26 -2
- snowflake/ml/_internal/utils/table_manager.py +19 -1
- snowflake/ml/data/_internal/arrow_ingestor.py +1 -11
- snowflake/ml/data/data_connector.py +33 -7
- snowflake/ml/data/ingestor_utils.py +20 -10
- snowflake/ml/data/torch_utils.py +68 -0
- snowflake/ml/dataset/dataset.py +1 -3
- snowflake/ml/feature_store/access_manager.py +3 -3
- snowflake/ml/feature_store/feature_store.py +60 -19
- snowflake/ml/feature_store/feature_view.py +84 -30
- snowflake/ml/fileset/embedded_stage_fs.py +1 -1
- snowflake/ml/fileset/fileset.py +1 -1
- snowflake/ml/fileset/sfcfs.py +9 -3
- snowflake/ml/fileset/stage_fs.py +2 -1
- snowflake/ml/lineage/lineage_node.py +7 -2
- snowflake/ml/model/__init__.py +1 -2
- snowflake/ml/model/_client/model/model_version_impl.py +96 -12
- snowflake/ml/model/_client/ops/model_ops.py +124 -6
- snowflake/ml/model/_client/ops/service_ops.py +309 -9
- snowflake/ml/model/_client/service/model_deployment_spec.py +8 -5
- snowflake/ml/model/_client/service/model_deployment_spec_schema.py +2 -2
- snowflake/ml/model/_client/sql/_base.py +5 -0
- snowflake/ml/model/_client/sql/model.py +1 -0
- snowflake/ml/model/_client/sql/model_version.py +9 -5
- snowflake/ml/model/_client/sql/service.py +121 -20
- snowflake/ml/model/_model_composer/model_composer.py +11 -39
- snowflake/ml/model/_model_composer/model_manifest/model_manifest.py +31 -11
- snowflake/ml/model/_packager/model_env/model_env.py +4 -38
- snowflake/ml/model/_packager/model_handlers/_utils.py +134 -28
- snowflake/ml/model/_packager/model_handlers/catboost.py +31 -30
- snowflake/ml/model/_packager/model_handlers/huggingface_pipeline.py +26 -18
- snowflake/ml/model/_packager/model_handlers/lightgbm.py +31 -58
- snowflake/ml/model/_packager/model_handlers/mlflow.py +3 -5
- snowflake/ml/model/_packager/model_handlers/model_objective_utils.py +169 -0
- snowflake/ml/model/_packager/model_handlers/sentence_transformers.py +15 -8
- snowflake/ml/model/_packager/model_handlers/sklearn.py +56 -60
- snowflake/ml/model/_packager/model_handlers/snowmlmodel.py +141 -9
- snowflake/ml/model/_packager/model_handlers/torchscript.py +2 -2
- snowflake/ml/model/_packager/model_handlers/xgboost.py +63 -48
- snowflake/ml/model/_packager/model_meta/model_meta.py +16 -42
- snowflake/ml/model/_packager/model_meta/model_meta_schema.py +1 -14
- snowflake/ml/model/_packager/model_packager.py +14 -8
- snowflake/ml/model/_packager/model_runtime/model_runtime.py +11 -0
- snowflake/ml/model/_signatures/pytorch_handler.py +1 -1
- snowflake/ml/model/_signatures/snowpark_handler.py +3 -2
- snowflake/ml/model/_signatures/utils.py +9 -0
- snowflake/ml/model/type_hints.py +12 -145
- snowflake/ml/modeling/_internal/constants.py +1 -0
- snowflake/ml/modeling/_internal/local_implementations/pandas_handlers.py +5 -5
- snowflake/ml/modeling/_internal/local_implementations/pandas_trainer.py +9 -6
- snowflake/ml/modeling/_internal/model_specifications.py +2 -0
- snowflake/ml/modeling/_internal/model_trainer.py +1 -0
- snowflake/ml/modeling/_internal/snowpark_implementations/distributed_hpo_trainer.py +2 -4
- snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_handlers.py +5 -5
- snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_trainer.py +130 -166
- snowflake/ml/modeling/_internal/snowpark_implementations/xgboost_external_memory_trainer.py +0 -1
- snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +61 -21
- snowflake/ml/modeling/cluster/affinity_propagation.py +61 -21
- snowflake/ml/modeling/cluster/agglomerative_clustering.py +61 -21
- snowflake/ml/modeling/cluster/birch.py +61 -21
- snowflake/ml/modeling/cluster/bisecting_k_means.py +61 -21
- snowflake/ml/modeling/cluster/dbscan.py +61 -21
- snowflake/ml/modeling/cluster/feature_agglomeration.py +61 -21
- snowflake/ml/modeling/cluster/k_means.py +61 -21
- snowflake/ml/modeling/cluster/mean_shift.py +61 -21
- snowflake/ml/modeling/cluster/mini_batch_k_means.py +61 -21
- snowflake/ml/modeling/cluster/optics.py +61 -21
- snowflake/ml/modeling/cluster/spectral_biclustering.py +61 -21
- snowflake/ml/modeling/cluster/spectral_clustering.py +61 -21
- snowflake/ml/modeling/cluster/spectral_coclustering.py +61 -21
- snowflake/ml/modeling/compose/column_transformer.py +61 -21
- snowflake/ml/modeling/compose/transformed_target_regressor.py +61 -21
- snowflake/ml/modeling/covariance/elliptic_envelope.py +61 -21
- snowflake/ml/modeling/covariance/empirical_covariance.py +61 -21
- snowflake/ml/modeling/covariance/graphical_lasso.py +61 -21
- snowflake/ml/modeling/covariance/graphical_lasso_cv.py +61 -21
- snowflake/ml/modeling/covariance/ledoit_wolf.py +61 -21
- snowflake/ml/modeling/covariance/min_cov_det.py +61 -21
- snowflake/ml/modeling/covariance/oas.py +61 -21
- snowflake/ml/modeling/covariance/shrunk_covariance.py +61 -21
- snowflake/ml/modeling/decomposition/dictionary_learning.py +61 -21
- snowflake/ml/modeling/decomposition/factor_analysis.py +61 -21
- snowflake/ml/modeling/decomposition/fast_ica.py +61 -21
- snowflake/ml/modeling/decomposition/incremental_pca.py +61 -21
- snowflake/ml/modeling/decomposition/kernel_pca.py +61 -21
- snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +61 -21
- snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +61 -21
- snowflake/ml/modeling/decomposition/pca.py +61 -21
- snowflake/ml/modeling/decomposition/sparse_pca.py +61 -21
- snowflake/ml/modeling/decomposition/truncated_svd.py +61 -21
- snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +61 -21
- snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +61 -21
- snowflake/ml/modeling/ensemble/ada_boost_classifier.py +61 -21
- snowflake/ml/modeling/ensemble/ada_boost_regressor.py +61 -21
- snowflake/ml/modeling/ensemble/bagging_classifier.py +61 -21
- snowflake/ml/modeling/ensemble/bagging_regressor.py +61 -21
- snowflake/ml/modeling/ensemble/extra_trees_classifier.py +61 -21
- snowflake/ml/modeling/ensemble/extra_trees_regressor.py +61 -21
- snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +61 -21
- snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +61 -21
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +61 -21
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +61 -21
- snowflake/ml/modeling/ensemble/isolation_forest.py +61 -21
- snowflake/ml/modeling/ensemble/random_forest_classifier.py +61 -21
- snowflake/ml/modeling/ensemble/random_forest_regressor.py +61 -21
- snowflake/ml/modeling/ensemble/stacking_regressor.py +61 -21
- snowflake/ml/modeling/ensemble/voting_classifier.py +61 -21
- snowflake/ml/modeling/ensemble/voting_regressor.py +61 -21
- snowflake/ml/modeling/feature_selection/generic_univariate_select.py +61 -21
- snowflake/ml/modeling/feature_selection/select_fdr.py +61 -21
- snowflake/ml/modeling/feature_selection/select_fpr.py +61 -21
- snowflake/ml/modeling/feature_selection/select_fwe.py +61 -21
- snowflake/ml/modeling/feature_selection/select_k_best.py +61 -21
- snowflake/ml/modeling/feature_selection/select_percentile.py +61 -21
- snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +61 -21
- snowflake/ml/modeling/feature_selection/variance_threshold.py +61 -21
- snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +61 -21
- snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +61 -21
- snowflake/ml/modeling/impute/iterative_imputer.py +61 -21
- snowflake/ml/modeling/impute/knn_imputer.py +61 -21
- snowflake/ml/modeling/impute/missing_indicator.py +61 -21
- snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +61 -21
- snowflake/ml/modeling/kernel_approximation/nystroem.py +61 -21
- snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +61 -21
- snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +61 -21
- snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +61 -21
- snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +61 -21
- snowflake/ml/modeling/lightgbm/lgbm_classifier.py +61 -21
- snowflake/ml/modeling/lightgbm/lgbm_regressor.py +61 -21
- snowflake/ml/modeling/linear_model/ard_regression.py +61 -21
- snowflake/ml/modeling/linear_model/bayesian_ridge.py +61 -21
- snowflake/ml/modeling/linear_model/elastic_net.py +61 -21
- snowflake/ml/modeling/linear_model/elastic_net_cv.py +61 -21
- snowflake/ml/modeling/linear_model/gamma_regressor.py +61 -21
- snowflake/ml/modeling/linear_model/huber_regressor.py +61 -21
- snowflake/ml/modeling/linear_model/lars.py +61 -21
- snowflake/ml/modeling/linear_model/lars_cv.py +61 -21
- snowflake/ml/modeling/linear_model/lasso.py +61 -21
- snowflake/ml/modeling/linear_model/lasso_cv.py +61 -21
- snowflake/ml/modeling/linear_model/lasso_lars.py +61 -21
- snowflake/ml/modeling/linear_model/lasso_lars_cv.py +61 -21
- snowflake/ml/modeling/linear_model/lasso_lars_ic.py +61 -21
- snowflake/ml/modeling/linear_model/linear_regression.py +61 -21
- snowflake/ml/modeling/linear_model/logistic_regression.py +61 -21
- snowflake/ml/modeling/linear_model/logistic_regression_cv.py +61 -21
- snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +61 -21
- snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +61 -21
- snowflake/ml/modeling/linear_model/multi_task_lasso.py +61 -21
- snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +61 -21
- snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +61 -21
- snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +61 -21
- snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +61 -21
- snowflake/ml/modeling/linear_model/perceptron.py +61 -21
- snowflake/ml/modeling/linear_model/poisson_regressor.py +61 -21
- snowflake/ml/modeling/linear_model/ransac_regressor.py +61 -21
- snowflake/ml/modeling/linear_model/ridge.py +61 -21
- snowflake/ml/modeling/linear_model/ridge_classifier.py +61 -21
- snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +61 -21
- snowflake/ml/modeling/linear_model/ridge_cv.py +61 -21
- snowflake/ml/modeling/linear_model/sgd_classifier.py +61 -21
- snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +61 -21
- snowflake/ml/modeling/linear_model/sgd_regressor.py +61 -21
- snowflake/ml/modeling/linear_model/theil_sen_regressor.py +61 -21
- snowflake/ml/modeling/linear_model/tweedie_regressor.py +61 -21
- snowflake/ml/modeling/manifold/isomap.py +61 -21
- snowflake/ml/modeling/manifold/mds.py +61 -21
- snowflake/ml/modeling/manifold/spectral_embedding.py +61 -21
- snowflake/ml/modeling/manifold/tsne.py +61 -21
- snowflake/ml/modeling/metrics/metrics_utils.py +2 -2
- snowflake/ml/modeling/metrics/ranking.py +0 -3
- snowflake/ml/modeling/metrics/regression.py +0 -3
- snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +61 -21
- snowflake/ml/modeling/mixture/gaussian_mixture.py +61 -21
- snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +61 -21
- snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +61 -21
- snowflake/ml/modeling/multiclass/output_code_classifier.py +61 -21
- snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +61 -21
- snowflake/ml/modeling/naive_bayes/categorical_nb.py +61 -21
- snowflake/ml/modeling/naive_bayes/complement_nb.py +61 -21
- snowflake/ml/modeling/naive_bayes/gaussian_nb.py +61 -21
- snowflake/ml/modeling/naive_bayes/multinomial_nb.py +61 -21
- snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +61 -21
- snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +61 -21
- snowflake/ml/modeling/neighbors/kernel_density.py +61 -21
- snowflake/ml/modeling/neighbors/local_outlier_factor.py +61 -21
- snowflake/ml/modeling/neighbors/nearest_centroid.py +61 -21
- snowflake/ml/modeling/neighbors/nearest_neighbors.py +61 -21
- snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +61 -21
- snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +61 -21
- snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +61 -21
- snowflake/ml/modeling/neural_network/bernoulli_rbm.py +61 -21
- snowflake/ml/modeling/neural_network/mlp_classifier.py +61 -21
- snowflake/ml/modeling/neural_network/mlp_regressor.py +61 -21
- snowflake/ml/modeling/parameters/disable_model_tracer.py +5 -0
- snowflake/ml/modeling/pipeline/pipeline.py +1 -13
- snowflake/ml/modeling/preprocessing/polynomial_features.py +61 -21
- snowflake/ml/modeling/semi_supervised/label_propagation.py +61 -21
- snowflake/ml/modeling/semi_supervised/label_spreading.py +61 -21
- snowflake/ml/modeling/svm/linear_svc.py +61 -21
- snowflake/ml/modeling/svm/linear_svr.py +61 -21
- snowflake/ml/modeling/svm/nu_svc.py +61 -21
- snowflake/ml/modeling/svm/nu_svr.py +61 -21
- snowflake/ml/modeling/svm/svc.py +61 -21
- snowflake/ml/modeling/svm/svr.py +61 -21
- snowflake/ml/modeling/tree/decision_tree_classifier.py +61 -21
- snowflake/ml/modeling/tree/decision_tree_regressor.py +61 -21
- snowflake/ml/modeling/tree/extra_tree_classifier.py +61 -21
- snowflake/ml/modeling/tree/extra_tree_regressor.py +61 -21
- snowflake/ml/modeling/xgboost/xgb_classifier.py +64 -23
- snowflake/ml/modeling/xgboost/xgb_regressor.py +64 -23
- snowflake/ml/modeling/xgboost/xgbrf_classifier.py +64 -23
- snowflake/ml/modeling/xgboost/xgbrf_regressor.py +64 -23
- snowflake/ml/monitoring/_client/model_monitor.py +126 -0
- snowflake/ml/monitoring/_client/model_monitor_manager.py +361 -0
- snowflake/ml/monitoring/_client/model_monitor_version.py +1 -0
- snowflake/ml/monitoring/_client/monitor_sql_client.py +1335 -0
- snowflake/ml/monitoring/_client/queries/record_count.ssql +14 -0
- snowflake/ml/monitoring/_client/queries/rmse.ssql +28 -0
- snowflake/ml/monitoring/entities/model_monitor_config.py +28 -0
- snowflake/ml/monitoring/entities/model_monitor_interval.py +46 -0
- snowflake/ml/monitoring/entities/output_score_type.py +90 -0
- snowflake/ml/registry/_manager/model_manager.py +4 -0
- snowflake/ml/registry/registry.py +166 -8
- snowflake/ml/version.py +1 -1
- {snowflake_ml_python-1.6.1.dist-info → snowflake_ml_python-1.6.3.dist-info}/METADATA +43 -9
- snowflake_ml_python-1.6.3.dist-info/RECORD +400 -0
- {snowflake_ml_python-1.6.1.dist-info → snowflake_ml_python-1.6.3.dist-info}/WHEEL +1 -1
- snowflake/ml/_internal/container_services/image_registry/credential.py +0 -84
- snowflake/ml/_internal/container_services/image_registry/http_client.py +0 -127
- snowflake/ml/_internal/container_services/image_registry/imagelib.py +0 -400
- snowflake/ml/_internal/container_services/image_registry/registry_client.py +0 -212
- snowflake/ml/_internal/utils/log_stream_processor.py +0 -30
- snowflake/ml/_internal/utils/session_token_manager.py +0 -46
- snowflake/ml/_internal/utils/spcs_attribution_utils.py +0 -122
- snowflake/ml/_internal/utils/uri.py +0 -77
- snowflake/ml/data/torch_dataset.py +0 -33
- snowflake/ml/model/_api.py +0 -568
- snowflake/ml/model/_deploy_client/image_builds/base_image_builder.py +0 -12
- snowflake/ml/model/_deploy_client/image_builds/client_image_builder.py +0 -249
- snowflake/ml/model/_deploy_client/image_builds/docker_context.py +0 -130
- snowflake/ml/model/_deploy_client/image_builds/gunicorn_run.sh +0 -36
- snowflake/ml/model/_deploy_client/image_builds/inference_server/main.py +0 -268
- snowflake/ml/model/_deploy_client/image_builds/server_image_builder.py +0 -215
- snowflake/ml/model/_deploy_client/image_builds/templates/dockerfile_template +0 -53
- snowflake/ml/model/_deploy_client/image_builds/templates/image_build_job_spec_template +0 -38
- snowflake/ml/model/_deploy_client/image_builds/templates/kaniko_shell_script_template +0 -105
- snowflake/ml/model/_deploy_client/snowservice/deploy.py +0 -611
- snowflake/ml/model/_deploy_client/snowservice/deploy_options.py +0 -116
- snowflake/ml/model/_deploy_client/snowservice/instance_types.py +0 -10
- snowflake/ml/model/_deploy_client/snowservice/templates/service_spec_template +0 -28
- snowflake/ml/model/_deploy_client/snowservice/templates/service_spec_template_with_model +0 -21
- snowflake/ml/model/_deploy_client/utils/constants.py +0 -48
- snowflake/ml/model/_deploy_client/utils/snowservice_client.py +0 -280
- snowflake/ml/model/_deploy_client/warehouse/deploy.py +0 -202
- snowflake/ml/model/_deploy_client/warehouse/infer_template.py +0 -99
- snowflake/ml/model/_packager/model_handlers/llm.py +0 -267
- snowflake/ml/model/_packager/model_meta/_core_requirements.py +0 -11
- snowflake/ml/model/deploy_platforms.py +0 -6
- snowflake/ml/model/models/llm.py +0 -104
- snowflake/ml/monitoring/monitor.py +0 -203
- snowflake/ml/registry/_initial_schema.py +0 -142
- snowflake/ml/registry/_schema.py +0 -82
- snowflake/ml/registry/_schema_upgrade_plans.py +0 -116
- snowflake/ml/registry/_schema_version_manager.py +0 -163
- snowflake/ml/registry/model_registry.py +0 -2048
- snowflake_ml_python-1.6.1.dist-info/RECORD +0 -422
- {snowflake_ml_python-1.6.1.dist-info → snowflake_ml_python-1.6.3.dist-info}/LICENSE.txt +0 -0
- {snowflake_ml_python-1.6.1.dist-info → snowflake_ml_python-1.6.3.dist-info}/top_level.txt +0 -0
@@ -4,14 +4,12 @@
|
|
4
4
|
#
|
5
5
|
import inspect
|
6
6
|
import os
|
7
|
-
import
|
8
|
-
from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
|
9
|
-
from typing_extensions import TypeGuard
|
7
|
+
from typing import Iterable, Optional, Union, List, Any, Dict, Set, Tuple
|
10
8
|
from uuid import uuid4
|
11
9
|
|
12
10
|
import cloudpickle as cp
|
13
|
-
import pandas as pd
|
14
11
|
import numpy as np
|
12
|
+
import pandas as pd
|
15
13
|
from numpy import typing as npt
|
16
14
|
|
17
15
|
|
@@ -24,12 +22,11 @@ from snowflake.ml.modeling.framework.base import BaseTransformer, _process_cols
|
|
24
22
|
from snowflake.ml._internal import telemetry
|
25
23
|
from snowflake.ml._internal.exceptions import error_codes, exceptions, modeling_error_messages
|
26
24
|
from snowflake.ml._internal.env_utils import SNOWML_SPROC_ENV
|
27
|
-
from snowflake.ml._internal.utils import
|
25
|
+
from snowflake.ml._internal.utils import identifier
|
28
26
|
from snowflake.snowpark import DataFrame, Session
|
29
27
|
from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
|
30
28
|
from snowflake.ml.modeling._internal.model_trainer_builder import ModelTrainerBuilder
|
31
29
|
from snowflake.ml.modeling._internal.transformer_protocols import (
|
32
|
-
ModelTransformHandlers,
|
33
30
|
BatchInferenceKwargsTypedDict,
|
34
31
|
ScoreKwargsTypedDict
|
35
32
|
)
|
@@ -517,12 +514,23 @@ class AdaBoostClassifier(BaseTransformer):
|
|
517
514
|
autogenerated=self._autogenerated,
|
518
515
|
subproject=_SUBPROJECT,
|
519
516
|
)
|
520
|
-
|
521
|
-
|
522
|
-
expected_output_cols_list=(
|
523
|
-
self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
|
524
|
-
),
|
517
|
+
expected_output_cols = (
|
518
|
+
self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
|
525
519
|
)
|
520
|
+
if isinstance(dataset, DataFrame):
|
521
|
+
expected_output_cols, example_output_pd_df = self._align_expected_output(
|
522
|
+
"fit_predict", dataset, expected_output_cols, output_cols_prefix
|
523
|
+
)
|
524
|
+
output_result, fitted_estimator = model_trainer.train_fit_predict(
|
525
|
+
drop_input_cols=self._drop_input_cols,
|
526
|
+
expected_output_cols_list=expected_output_cols,
|
527
|
+
example_output_pd_df=example_output_pd_df,
|
528
|
+
)
|
529
|
+
else:
|
530
|
+
output_result, fitted_estimator = model_trainer.train_fit_predict(
|
531
|
+
drop_input_cols=self._drop_input_cols,
|
532
|
+
expected_output_cols_list=expected_output_cols,
|
533
|
+
)
|
526
534
|
self._sklearn_object = fitted_estimator
|
527
535
|
self._is_fitted = True
|
528
536
|
return output_result
|
@@ -545,6 +553,7 @@ class AdaBoostClassifier(BaseTransformer):
|
|
545
553
|
"""
|
546
554
|
self._infer_input_output_cols(dataset)
|
547
555
|
super()._check_dataset_type(dataset)
|
556
|
+
|
548
557
|
model_trainer = ModelTrainerBuilder.build_fit_transform(
|
549
558
|
estimator=self._sklearn_object,
|
550
559
|
dataset=dataset,
|
@@ -601,12 +610,41 @@ class AdaBoostClassifier(BaseTransformer):
|
|
601
610
|
|
602
611
|
return rv
|
603
612
|
|
604
|
-
def
|
605
|
-
self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str
|
606
|
-
) -> List[str]:
|
613
|
+
def _align_expected_output(
|
614
|
+
self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str,
|
615
|
+
) -> Tuple[List[str], pd.DataFrame]:
|
616
|
+
""" Run 1 line of data with the desired method, and return one tuple that consists of the output column names
|
617
|
+
and output dataframe with 1 line.
|
618
|
+
If the method is fit_predict, run 2 lines of data.
|
619
|
+
"""
|
607
620
|
# in case the inferred output column names dimension is different
|
608
621
|
# we use one line of snowpark dataframe and put it into sklearn estimator using pandas
|
609
|
-
|
622
|
+
|
623
|
+
# For fit_predict method, a minimum of 2 is required by MinCovDet, BayesianGaussianMixture
|
624
|
+
# so change the minimum of number of rows to 2
|
625
|
+
num_examples = 2
|
626
|
+
statement_params = telemetry.get_function_usage_statement_params(
|
627
|
+
project=_PROJECT,
|
628
|
+
subproject=_SUBPROJECT,
|
629
|
+
function_name=telemetry.get_statement_params_full_func_name(
|
630
|
+
inspect.currentframe(), AdaBoostClassifier.__class__.__name__
|
631
|
+
),
|
632
|
+
api_calls=[Session.call],
|
633
|
+
custom_tags={"autogen": True} if self._autogenerated else None,
|
634
|
+
)
|
635
|
+
if output_cols_prefix == "fit_predict_":
|
636
|
+
if hasattr(self._sklearn_object, "n_clusters"):
|
637
|
+
# cluster classes such as BisectingKMeansTest requires # of examples >= n_clusters
|
638
|
+
num_examples = self._sklearn_object.n_clusters
|
639
|
+
elif hasattr(self._sklearn_object, "min_samples"):
|
640
|
+
# OPTICS default min_samples 5, which requires at least 5 lines of data
|
641
|
+
num_examples = self._sklearn_object.min_samples
|
642
|
+
elif hasattr(self._sklearn_object, "n_neighbors") and hasattr(self._sklearn_object, "n_samples"):
|
643
|
+
# LocalOutlierFactor expects n_neighbors <= n_samples
|
644
|
+
num_examples = self._sklearn_object.n_neighbors
|
645
|
+
sample_pd_df = dataset.select(self.input_cols).limit(num_examples).to_pandas(statement_params=statement_params)
|
646
|
+
else:
|
647
|
+
sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas(statement_params=statement_params)
|
610
648
|
|
611
649
|
# Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
|
612
650
|
# seen during the fit.
|
@@ -618,12 +656,14 @@ class AdaBoostClassifier(BaseTransformer):
|
|
618
656
|
output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
|
619
657
|
if self.sample_weight_col:
|
620
658
|
output_df_columns_set -= set(self.sample_weight_col)
|
659
|
+
|
621
660
|
# if the dimension of inferred output column names is correct; use it
|
622
661
|
if len(expected_output_cols_list) == len(output_df_columns_set):
|
623
|
-
return expected_output_cols_list
|
662
|
+
return expected_output_cols_list, output_df_pd
|
624
663
|
# otherwise, use the sklearn estimator's output
|
625
664
|
else:
|
626
|
-
|
665
|
+
expected_output_cols_list = sorted(list(output_df_columns_set), key=lambda x: output_df_columns.index(x))
|
666
|
+
return expected_output_cols_list, output_df_pd[expected_output_cols_list]
|
627
667
|
|
628
668
|
@available_if(original_estimator_has_callable("predict_proba")) # type: ignore[misc]
|
629
669
|
@telemetry.send_api_usage_telemetry(
|
@@ -671,7 +711,7 @@ class AdaBoostClassifier(BaseTransformer):
|
|
671
711
|
drop_input_cols=self._drop_input_cols,
|
672
712
|
expected_output_cols_type="float",
|
673
713
|
)
|
674
|
-
expected_output_cols = self.
|
714
|
+
expected_output_cols, _ = self._align_expected_output(
|
675
715
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
676
716
|
)
|
677
717
|
|
@@ -739,7 +779,7 @@ class AdaBoostClassifier(BaseTransformer):
|
|
739
779
|
drop_input_cols=self._drop_input_cols,
|
740
780
|
expected_output_cols_type="float",
|
741
781
|
)
|
742
|
-
expected_output_cols = self.
|
782
|
+
expected_output_cols, _ = self._align_expected_output(
|
743
783
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
744
784
|
)
|
745
785
|
elif isinstance(dataset, pd.DataFrame):
|
@@ -804,7 +844,7 @@ class AdaBoostClassifier(BaseTransformer):
|
|
804
844
|
drop_input_cols=self._drop_input_cols,
|
805
845
|
expected_output_cols_type="float",
|
806
846
|
)
|
807
|
-
expected_output_cols = self.
|
847
|
+
expected_output_cols, _ = self._align_expected_output(
|
808
848
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
809
849
|
)
|
810
850
|
|
@@ -869,7 +909,7 @@ class AdaBoostClassifier(BaseTransformer):
|
|
869
909
|
drop_input_cols = self._drop_input_cols,
|
870
910
|
expected_output_cols_type="float",
|
871
911
|
)
|
872
|
-
expected_output_cols = self.
|
912
|
+
expected_output_cols, _ = self._align_expected_output(
|
873
913
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
874
914
|
)
|
875
915
|
|
@@ -4,14 +4,12 @@
|
|
4
4
|
#
|
5
5
|
import inspect
|
6
6
|
import os
|
7
|
-
import
|
8
|
-
from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
|
9
|
-
from typing_extensions import TypeGuard
|
7
|
+
from typing import Iterable, Optional, Union, List, Any, Dict, Set, Tuple
|
10
8
|
from uuid import uuid4
|
11
9
|
|
12
10
|
import cloudpickle as cp
|
13
|
-
import pandas as pd
|
14
11
|
import numpy as np
|
12
|
+
import pandas as pd
|
15
13
|
from numpy import typing as npt
|
16
14
|
|
17
15
|
|
@@ -24,12 +22,11 @@ from snowflake.ml.modeling.framework.base import BaseTransformer, _process_cols
|
|
24
22
|
from snowflake.ml._internal import telemetry
|
25
23
|
from snowflake.ml._internal.exceptions import error_codes, exceptions, modeling_error_messages
|
26
24
|
from snowflake.ml._internal.env_utils import SNOWML_SPROC_ENV
|
27
|
-
from snowflake.ml._internal.utils import
|
25
|
+
from snowflake.ml._internal.utils import identifier
|
28
26
|
from snowflake.snowpark import DataFrame, Session
|
29
27
|
from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
|
30
28
|
from snowflake.ml.modeling._internal.model_trainer_builder import ModelTrainerBuilder
|
31
29
|
from snowflake.ml.modeling._internal.transformer_protocols import (
|
32
|
-
ModelTransformHandlers,
|
33
30
|
BatchInferenceKwargsTypedDict,
|
34
31
|
ScoreKwargsTypedDict
|
35
32
|
)
|
@@ -514,12 +511,23 @@ class AdaBoostRegressor(BaseTransformer):
|
|
514
511
|
autogenerated=self._autogenerated,
|
515
512
|
subproject=_SUBPROJECT,
|
516
513
|
)
|
517
|
-
|
518
|
-
|
519
|
-
expected_output_cols_list=(
|
520
|
-
self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
|
521
|
-
),
|
514
|
+
expected_output_cols = (
|
515
|
+
self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
|
522
516
|
)
|
517
|
+
if isinstance(dataset, DataFrame):
|
518
|
+
expected_output_cols, example_output_pd_df = self._align_expected_output(
|
519
|
+
"fit_predict", dataset, expected_output_cols, output_cols_prefix
|
520
|
+
)
|
521
|
+
output_result, fitted_estimator = model_trainer.train_fit_predict(
|
522
|
+
drop_input_cols=self._drop_input_cols,
|
523
|
+
expected_output_cols_list=expected_output_cols,
|
524
|
+
example_output_pd_df=example_output_pd_df,
|
525
|
+
)
|
526
|
+
else:
|
527
|
+
output_result, fitted_estimator = model_trainer.train_fit_predict(
|
528
|
+
drop_input_cols=self._drop_input_cols,
|
529
|
+
expected_output_cols_list=expected_output_cols,
|
530
|
+
)
|
523
531
|
self._sklearn_object = fitted_estimator
|
524
532
|
self._is_fitted = True
|
525
533
|
return output_result
|
@@ -542,6 +550,7 @@ class AdaBoostRegressor(BaseTransformer):
|
|
542
550
|
"""
|
543
551
|
self._infer_input_output_cols(dataset)
|
544
552
|
super()._check_dataset_type(dataset)
|
553
|
+
|
545
554
|
model_trainer = ModelTrainerBuilder.build_fit_transform(
|
546
555
|
estimator=self._sklearn_object,
|
547
556
|
dataset=dataset,
|
@@ -598,12 +607,41 @@ class AdaBoostRegressor(BaseTransformer):
|
|
598
607
|
|
599
608
|
return rv
|
600
609
|
|
601
|
-
def
|
602
|
-
self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str
|
603
|
-
) -> List[str]:
|
610
|
+
def _align_expected_output(
|
611
|
+
self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str,
|
612
|
+
) -> Tuple[List[str], pd.DataFrame]:
|
613
|
+
""" Run 1 line of data with the desired method, and return one tuple that consists of the output column names
|
614
|
+
and output dataframe with 1 line.
|
615
|
+
If the method is fit_predict, run 2 lines of data.
|
616
|
+
"""
|
604
617
|
# in case the inferred output column names dimension is different
|
605
618
|
# we use one line of snowpark dataframe and put it into sklearn estimator using pandas
|
606
|
-
|
619
|
+
|
620
|
+
# For fit_predict method, a minimum of 2 is required by MinCovDet, BayesianGaussianMixture
|
621
|
+
# so change the minimum of number of rows to 2
|
622
|
+
num_examples = 2
|
623
|
+
statement_params = telemetry.get_function_usage_statement_params(
|
624
|
+
project=_PROJECT,
|
625
|
+
subproject=_SUBPROJECT,
|
626
|
+
function_name=telemetry.get_statement_params_full_func_name(
|
627
|
+
inspect.currentframe(), AdaBoostRegressor.__class__.__name__
|
628
|
+
),
|
629
|
+
api_calls=[Session.call],
|
630
|
+
custom_tags={"autogen": True} if self._autogenerated else None,
|
631
|
+
)
|
632
|
+
if output_cols_prefix == "fit_predict_":
|
633
|
+
if hasattr(self._sklearn_object, "n_clusters"):
|
634
|
+
# cluster classes such as BisectingKMeansTest requires # of examples >= n_clusters
|
635
|
+
num_examples = self._sklearn_object.n_clusters
|
636
|
+
elif hasattr(self._sklearn_object, "min_samples"):
|
637
|
+
# OPTICS default min_samples 5, which requires at least 5 lines of data
|
638
|
+
num_examples = self._sklearn_object.min_samples
|
639
|
+
elif hasattr(self._sklearn_object, "n_neighbors") and hasattr(self._sklearn_object, "n_samples"):
|
640
|
+
# LocalOutlierFactor expects n_neighbors <= n_samples
|
641
|
+
num_examples = self._sklearn_object.n_neighbors
|
642
|
+
sample_pd_df = dataset.select(self.input_cols).limit(num_examples).to_pandas(statement_params=statement_params)
|
643
|
+
else:
|
644
|
+
sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas(statement_params=statement_params)
|
607
645
|
|
608
646
|
# Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
|
609
647
|
# seen during the fit.
|
@@ -615,12 +653,14 @@ class AdaBoostRegressor(BaseTransformer):
|
|
615
653
|
output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
|
616
654
|
if self.sample_weight_col:
|
617
655
|
output_df_columns_set -= set(self.sample_weight_col)
|
656
|
+
|
618
657
|
# if the dimension of inferred output column names is correct; use it
|
619
658
|
if len(expected_output_cols_list) == len(output_df_columns_set):
|
620
|
-
return expected_output_cols_list
|
659
|
+
return expected_output_cols_list, output_df_pd
|
621
660
|
# otherwise, use the sklearn estimator's output
|
622
661
|
else:
|
623
|
-
|
662
|
+
expected_output_cols_list = sorted(list(output_df_columns_set), key=lambda x: output_df_columns.index(x))
|
663
|
+
return expected_output_cols_list, output_df_pd[expected_output_cols_list]
|
624
664
|
|
625
665
|
@available_if(original_estimator_has_callable("predict_proba")) # type: ignore[misc]
|
626
666
|
@telemetry.send_api_usage_telemetry(
|
@@ -666,7 +706,7 @@ class AdaBoostRegressor(BaseTransformer):
|
|
666
706
|
drop_input_cols=self._drop_input_cols,
|
667
707
|
expected_output_cols_type="float",
|
668
708
|
)
|
669
|
-
expected_output_cols = self.
|
709
|
+
expected_output_cols, _ = self._align_expected_output(
|
670
710
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
671
711
|
)
|
672
712
|
|
@@ -732,7 +772,7 @@ class AdaBoostRegressor(BaseTransformer):
|
|
732
772
|
drop_input_cols=self._drop_input_cols,
|
733
773
|
expected_output_cols_type="float",
|
734
774
|
)
|
735
|
-
expected_output_cols = self.
|
775
|
+
expected_output_cols, _ = self._align_expected_output(
|
736
776
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
737
777
|
)
|
738
778
|
elif isinstance(dataset, pd.DataFrame):
|
@@ -795,7 +835,7 @@ class AdaBoostRegressor(BaseTransformer):
|
|
795
835
|
drop_input_cols=self._drop_input_cols,
|
796
836
|
expected_output_cols_type="float",
|
797
837
|
)
|
798
|
-
expected_output_cols = self.
|
838
|
+
expected_output_cols, _ = self._align_expected_output(
|
799
839
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
800
840
|
)
|
801
841
|
|
@@ -860,7 +900,7 @@ class AdaBoostRegressor(BaseTransformer):
|
|
860
900
|
drop_input_cols = self._drop_input_cols,
|
861
901
|
expected_output_cols_type="float",
|
862
902
|
)
|
863
|
-
expected_output_cols = self.
|
903
|
+
expected_output_cols, _ = self._align_expected_output(
|
864
904
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
865
905
|
)
|
866
906
|
|
@@ -4,14 +4,12 @@
|
|
4
4
|
#
|
5
5
|
import inspect
|
6
6
|
import os
|
7
|
-
import
|
8
|
-
from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
|
9
|
-
from typing_extensions import TypeGuard
|
7
|
+
from typing import Iterable, Optional, Union, List, Any, Dict, Set, Tuple
|
10
8
|
from uuid import uuid4
|
11
9
|
|
12
10
|
import cloudpickle as cp
|
13
|
-
import pandas as pd
|
14
11
|
import numpy as np
|
12
|
+
import pandas as pd
|
15
13
|
from numpy import typing as npt
|
16
14
|
|
17
15
|
|
@@ -24,12 +22,11 @@ from snowflake.ml.modeling.framework.base import BaseTransformer, _process_cols
|
|
24
22
|
from snowflake.ml._internal import telemetry
|
25
23
|
from snowflake.ml._internal.exceptions import error_codes, exceptions, modeling_error_messages
|
26
24
|
from snowflake.ml._internal.env_utils import SNOWML_SPROC_ENV
|
27
|
-
from snowflake.ml._internal.utils import
|
25
|
+
from snowflake.ml._internal.utils import identifier
|
28
26
|
from snowflake.snowpark import DataFrame, Session
|
29
27
|
from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
|
30
28
|
from snowflake.ml.modeling._internal.model_trainer_builder import ModelTrainerBuilder
|
31
29
|
from snowflake.ml.modeling._internal.transformer_protocols import (
|
32
|
-
ModelTransformHandlers,
|
33
30
|
BatchInferenceKwargsTypedDict,
|
34
31
|
ScoreKwargsTypedDict
|
35
32
|
)
|
@@ -549,12 +546,23 @@ class BaggingClassifier(BaseTransformer):
|
|
549
546
|
autogenerated=self._autogenerated,
|
550
547
|
subproject=_SUBPROJECT,
|
551
548
|
)
|
552
|
-
|
553
|
-
|
554
|
-
expected_output_cols_list=(
|
555
|
-
self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
|
556
|
-
),
|
549
|
+
expected_output_cols = (
|
550
|
+
self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
|
557
551
|
)
|
552
|
+
if isinstance(dataset, DataFrame):
|
553
|
+
expected_output_cols, example_output_pd_df = self._align_expected_output(
|
554
|
+
"fit_predict", dataset, expected_output_cols, output_cols_prefix
|
555
|
+
)
|
556
|
+
output_result, fitted_estimator = model_trainer.train_fit_predict(
|
557
|
+
drop_input_cols=self._drop_input_cols,
|
558
|
+
expected_output_cols_list=expected_output_cols,
|
559
|
+
example_output_pd_df=example_output_pd_df,
|
560
|
+
)
|
561
|
+
else:
|
562
|
+
output_result, fitted_estimator = model_trainer.train_fit_predict(
|
563
|
+
drop_input_cols=self._drop_input_cols,
|
564
|
+
expected_output_cols_list=expected_output_cols,
|
565
|
+
)
|
558
566
|
self._sklearn_object = fitted_estimator
|
559
567
|
self._is_fitted = True
|
560
568
|
return output_result
|
@@ -577,6 +585,7 @@ class BaggingClassifier(BaseTransformer):
|
|
577
585
|
"""
|
578
586
|
self._infer_input_output_cols(dataset)
|
579
587
|
super()._check_dataset_type(dataset)
|
588
|
+
|
580
589
|
model_trainer = ModelTrainerBuilder.build_fit_transform(
|
581
590
|
estimator=self._sklearn_object,
|
582
591
|
dataset=dataset,
|
@@ -633,12 +642,41 @@ class BaggingClassifier(BaseTransformer):
|
|
633
642
|
|
634
643
|
return rv
|
635
644
|
|
636
|
-
def
|
637
|
-
self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str
|
638
|
-
) -> List[str]:
|
645
|
+
def _align_expected_output(
|
646
|
+
self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str,
|
647
|
+
) -> Tuple[List[str], pd.DataFrame]:
|
648
|
+
""" Run 1 line of data with the desired method, and return one tuple that consists of the output column names
|
649
|
+
and output dataframe with 1 line.
|
650
|
+
If the method is fit_predict, run 2 lines of data.
|
651
|
+
"""
|
639
652
|
# in case the inferred output column names dimension is different
|
640
653
|
# we use one line of snowpark dataframe and put it into sklearn estimator using pandas
|
641
|
-
|
654
|
+
|
655
|
+
# For fit_predict method, a minimum of 2 is required by MinCovDet, BayesianGaussianMixture
|
656
|
+
# so change the minimum of number of rows to 2
|
657
|
+
num_examples = 2
|
658
|
+
statement_params = telemetry.get_function_usage_statement_params(
|
659
|
+
project=_PROJECT,
|
660
|
+
subproject=_SUBPROJECT,
|
661
|
+
function_name=telemetry.get_statement_params_full_func_name(
|
662
|
+
inspect.currentframe(), BaggingClassifier.__class__.__name__
|
663
|
+
),
|
664
|
+
api_calls=[Session.call],
|
665
|
+
custom_tags={"autogen": True} if self._autogenerated else None,
|
666
|
+
)
|
667
|
+
if output_cols_prefix == "fit_predict_":
|
668
|
+
if hasattr(self._sklearn_object, "n_clusters"):
|
669
|
+
# cluster classes such as BisectingKMeansTest requires # of examples >= n_clusters
|
670
|
+
num_examples = self._sklearn_object.n_clusters
|
671
|
+
elif hasattr(self._sklearn_object, "min_samples"):
|
672
|
+
# OPTICS default min_samples 5, which requires at least 5 lines of data
|
673
|
+
num_examples = self._sklearn_object.min_samples
|
674
|
+
elif hasattr(self._sklearn_object, "n_neighbors") and hasattr(self._sklearn_object, "n_samples"):
|
675
|
+
# LocalOutlierFactor expects n_neighbors <= n_samples
|
676
|
+
num_examples = self._sklearn_object.n_neighbors
|
677
|
+
sample_pd_df = dataset.select(self.input_cols).limit(num_examples).to_pandas(statement_params=statement_params)
|
678
|
+
else:
|
679
|
+
sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas(statement_params=statement_params)
|
642
680
|
|
643
681
|
# Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
|
644
682
|
# seen during the fit.
|
@@ -650,12 +688,14 @@ class BaggingClassifier(BaseTransformer):
|
|
650
688
|
output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
|
651
689
|
if self.sample_weight_col:
|
652
690
|
output_df_columns_set -= set(self.sample_weight_col)
|
691
|
+
|
653
692
|
# if the dimension of inferred output column names is correct; use it
|
654
693
|
if len(expected_output_cols_list) == len(output_df_columns_set):
|
655
|
-
return expected_output_cols_list
|
694
|
+
return expected_output_cols_list, output_df_pd
|
656
695
|
# otherwise, use the sklearn estimator's output
|
657
696
|
else:
|
658
|
-
|
697
|
+
expected_output_cols_list = sorted(list(output_df_columns_set), key=lambda x: output_df_columns.index(x))
|
698
|
+
return expected_output_cols_list, output_df_pd[expected_output_cols_list]
|
659
699
|
|
660
700
|
@available_if(original_estimator_has_callable("predict_proba")) # type: ignore[misc]
|
661
701
|
@telemetry.send_api_usage_telemetry(
|
@@ -703,7 +743,7 @@ class BaggingClassifier(BaseTransformer):
|
|
703
743
|
drop_input_cols=self._drop_input_cols,
|
704
744
|
expected_output_cols_type="float",
|
705
745
|
)
|
706
|
-
expected_output_cols = self.
|
746
|
+
expected_output_cols, _ = self._align_expected_output(
|
707
747
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
708
748
|
)
|
709
749
|
|
@@ -771,7 +811,7 @@ class BaggingClassifier(BaseTransformer):
|
|
771
811
|
drop_input_cols=self._drop_input_cols,
|
772
812
|
expected_output_cols_type="float",
|
773
813
|
)
|
774
|
-
expected_output_cols = self.
|
814
|
+
expected_output_cols, _ = self._align_expected_output(
|
775
815
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
776
816
|
)
|
777
817
|
elif isinstance(dataset, pd.DataFrame):
|
@@ -836,7 +876,7 @@ class BaggingClassifier(BaseTransformer):
|
|
836
876
|
drop_input_cols=self._drop_input_cols,
|
837
877
|
expected_output_cols_type="float",
|
838
878
|
)
|
839
|
-
expected_output_cols = self.
|
879
|
+
expected_output_cols, _ = self._align_expected_output(
|
840
880
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
841
881
|
)
|
842
882
|
|
@@ -901,7 +941,7 @@ class BaggingClassifier(BaseTransformer):
|
|
901
941
|
drop_input_cols = self._drop_input_cols,
|
902
942
|
expected_output_cols_type="float",
|
903
943
|
)
|
904
|
-
expected_output_cols = self.
|
944
|
+
expected_output_cols, _ = self._align_expected_output(
|
905
945
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
906
946
|
)
|
907
947
|
|
@@ -4,14 +4,12 @@
|
|
4
4
|
#
|
5
5
|
import inspect
|
6
6
|
import os
|
7
|
-
import
|
8
|
-
from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
|
9
|
-
from typing_extensions import TypeGuard
|
7
|
+
from typing import Iterable, Optional, Union, List, Any, Dict, Set, Tuple
|
10
8
|
from uuid import uuid4
|
11
9
|
|
12
10
|
import cloudpickle as cp
|
13
|
-
import pandas as pd
|
14
11
|
import numpy as np
|
12
|
+
import pandas as pd
|
15
13
|
from numpy import typing as npt
|
16
14
|
|
17
15
|
|
@@ -24,12 +22,11 @@ from snowflake.ml.modeling.framework.base import BaseTransformer, _process_cols
|
|
24
22
|
from snowflake.ml._internal import telemetry
|
25
23
|
from snowflake.ml._internal.exceptions import error_codes, exceptions, modeling_error_messages
|
26
24
|
from snowflake.ml._internal.env_utils import SNOWML_SPROC_ENV
|
27
|
-
from snowflake.ml._internal.utils import
|
25
|
+
from snowflake.ml._internal.utils import identifier
|
28
26
|
from snowflake.snowpark import DataFrame, Session
|
29
27
|
from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
|
30
28
|
from snowflake.ml.modeling._internal.model_trainer_builder import ModelTrainerBuilder
|
31
29
|
from snowflake.ml.modeling._internal.transformer_protocols import (
|
32
|
-
ModelTransformHandlers,
|
33
30
|
BatchInferenceKwargsTypedDict,
|
34
31
|
ScoreKwargsTypedDict
|
35
32
|
)
|
@@ -549,12 +546,23 @@ class BaggingRegressor(BaseTransformer):
|
|
549
546
|
autogenerated=self._autogenerated,
|
550
547
|
subproject=_SUBPROJECT,
|
551
548
|
)
|
552
|
-
|
553
|
-
|
554
|
-
expected_output_cols_list=(
|
555
|
-
self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
|
556
|
-
),
|
549
|
+
expected_output_cols = (
|
550
|
+
self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
|
557
551
|
)
|
552
|
+
if isinstance(dataset, DataFrame):
|
553
|
+
expected_output_cols, example_output_pd_df = self._align_expected_output(
|
554
|
+
"fit_predict", dataset, expected_output_cols, output_cols_prefix
|
555
|
+
)
|
556
|
+
output_result, fitted_estimator = model_trainer.train_fit_predict(
|
557
|
+
drop_input_cols=self._drop_input_cols,
|
558
|
+
expected_output_cols_list=expected_output_cols,
|
559
|
+
example_output_pd_df=example_output_pd_df,
|
560
|
+
)
|
561
|
+
else:
|
562
|
+
output_result, fitted_estimator = model_trainer.train_fit_predict(
|
563
|
+
drop_input_cols=self._drop_input_cols,
|
564
|
+
expected_output_cols_list=expected_output_cols,
|
565
|
+
)
|
558
566
|
self._sklearn_object = fitted_estimator
|
559
567
|
self._is_fitted = True
|
560
568
|
return output_result
|
@@ -577,6 +585,7 @@ class BaggingRegressor(BaseTransformer):
|
|
577
585
|
"""
|
578
586
|
self._infer_input_output_cols(dataset)
|
579
587
|
super()._check_dataset_type(dataset)
|
588
|
+
|
580
589
|
model_trainer = ModelTrainerBuilder.build_fit_transform(
|
581
590
|
estimator=self._sklearn_object,
|
582
591
|
dataset=dataset,
|
@@ -633,12 +642,41 @@ class BaggingRegressor(BaseTransformer):
|
|
633
642
|
|
634
643
|
return rv
|
635
644
|
|
636
|
-
def
|
637
|
-
self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str
|
638
|
-
) -> List[str]:
|
645
|
+
def _align_expected_output(
|
646
|
+
self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str,
|
647
|
+
) -> Tuple[List[str], pd.DataFrame]:
|
648
|
+
""" Run 1 line of data with the desired method, and return one tuple that consists of the output column names
|
649
|
+
and output dataframe with 1 line.
|
650
|
+
If the method is fit_predict, run 2 lines of data.
|
651
|
+
"""
|
639
652
|
# in case the inferred output column names dimension is different
|
640
653
|
# we use one line of snowpark dataframe and put it into sklearn estimator using pandas
|
641
|
-
|
654
|
+
|
655
|
+
# For fit_predict method, a minimum of 2 is required by MinCovDet, BayesianGaussianMixture
|
656
|
+
# so change the minimum of number of rows to 2
|
657
|
+
num_examples = 2
|
658
|
+
statement_params = telemetry.get_function_usage_statement_params(
|
659
|
+
project=_PROJECT,
|
660
|
+
subproject=_SUBPROJECT,
|
661
|
+
function_name=telemetry.get_statement_params_full_func_name(
|
662
|
+
inspect.currentframe(), BaggingRegressor.__class__.__name__
|
663
|
+
),
|
664
|
+
api_calls=[Session.call],
|
665
|
+
custom_tags={"autogen": True} if self._autogenerated else None,
|
666
|
+
)
|
667
|
+
if output_cols_prefix == "fit_predict_":
|
668
|
+
if hasattr(self._sklearn_object, "n_clusters"):
|
669
|
+
# cluster classes such as BisectingKMeansTest requires # of examples >= n_clusters
|
670
|
+
num_examples = self._sklearn_object.n_clusters
|
671
|
+
elif hasattr(self._sklearn_object, "min_samples"):
|
672
|
+
# OPTICS default min_samples 5, which requires at least 5 lines of data
|
673
|
+
num_examples = self._sklearn_object.min_samples
|
674
|
+
elif hasattr(self._sklearn_object, "n_neighbors") and hasattr(self._sklearn_object, "n_samples"):
|
675
|
+
# LocalOutlierFactor expects n_neighbors <= n_samples
|
676
|
+
num_examples = self._sklearn_object.n_neighbors
|
677
|
+
sample_pd_df = dataset.select(self.input_cols).limit(num_examples).to_pandas(statement_params=statement_params)
|
678
|
+
else:
|
679
|
+
sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas(statement_params=statement_params)
|
642
680
|
|
643
681
|
# Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
|
644
682
|
# seen during the fit.
|
@@ -650,12 +688,14 @@ class BaggingRegressor(BaseTransformer):
|
|
650
688
|
output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
|
651
689
|
if self.sample_weight_col:
|
652
690
|
output_df_columns_set -= set(self.sample_weight_col)
|
691
|
+
|
653
692
|
# if the dimension of inferred output column names is correct; use it
|
654
693
|
if len(expected_output_cols_list) == len(output_df_columns_set):
|
655
|
-
return expected_output_cols_list
|
694
|
+
return expected_output_cols_list, output_df_pd
|
656
695
|
# otherwise, use the sklearn estimator's output
|
657
696
|
else:
|
658
|
-
|
697
|
+
expected_output_cols_list = sorted(list(output_df_columns_set), key=lambda x: output_df_columns.index(x))
|
698
|
+
return expected_output_cols_list, output_df_pd[expected_output_cols_list]
|
659
699
|
|
660
700
|
@available_if(original_estimator_has_callable("predict_proba")) # type: ignore[misc]
|
661
701
|
@telemetry.send_api_usage_telemetry(
|
@@ -701,7 +741,7 @@ class BaggingRegressor(BaseTransformer):
|
|
701
741
|
drop_input_cols=self._drop_input_cols,
|
702
742
|
expected_output_cols_type="float",
|
703
743
|
)
|
704
|
-
expected_output_cols = self.
|
744
|
+
expected_output_cols, _ = self._align_expected_output(
|
705
745
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
706
746
|
)
|
707
747
|
|
@@ -767,7 +807,7 @@ class BaggingRegressor(BaseTransformer):
|
|
767
807
|
drop_input_cols=self._drop_input_cols,
|
768
808
|
expected_output_cols_type="float",
|
769
809
|
)
|
770
|
-
expected_output_cols = self.
|
810
|
+
expected_output_cols, _ = self._align_expected_output(
|
771
811
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
772
812
|
)
|
773
813
|
elif isinstance(dataset, pd.DataFrame):
|
@@ -830,7 +870,7 @@ class BaggingRegressor(BaseTransformer):
|
|
830
870
|
drop_input_cols=self._drop_input_cols,
|
831
871
|
expected_output_cols_type="float",
|
832
872
|
)
|
833
|
-
expected_output_cols = self.
|
873
|
+
expected_output_cols, _ = self._align_expected_output(
|
834
874
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
835
875
|
)
|
836
876
|
|
@@ -895,7 +935,7 @@ class BaggingRegressor(BaseTransformer):
|
|
895
935
|
drop_input_cols = self._drop_input_cols,
|
896
936
|
expected_output_cols_type="float",
|
897
937
|
)
|
898
|
-
expected_output_cols = self.
|
938
|
+
expected_output_cols, _ = self._align_expected_output(
|
899
939
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
900
940
|
)
|
901
941
|
|