snowflake-ml-python 1.6.1__py3-none-any.whl → 1.6.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- snowflake/cortex/__init__.py +4 -0
- snowflake/cortex/_classify_text.py +2 -2
- snowflake/cortex/_embed_text_1024.py +37 -0
- snowflake/cortex/_embed_text_768.py +37 -0
- snowflake/cortex/_extract_answer.py +2 -2
- snowflake/cortex/_sentiment.py +2 -2
- snowflake/cortex/_summarize.py +2 -2
- snowflake/cortex/_translate.py +2 -2
- snowflake/cortex/_util.py +4 -4
- snowflake/ml/_internal/env_utils.py +5 -5
- snowflake/ml/_internal/exceptions/error_codes.py +2 -0
- snowflake/ml/_internal/telemetry.py +142 -20
- snowflake/ml/_internal/utils/db_utils.py +50 -0
- snowflake/ml/_internal/utils/identifier.py +48 -11
- snowflake/ml/_internal/utils/service_logger.py +63 -0
- snowflake/ml/_internal/utils/snowflake_env.py +23 -13
- snowflake/ml/_internal/utils/sql_identifier.py +26 -2
- snowflake/ml/_internal/utils/table_manager.py +19 -1
- snowflake/ml/data/_internal/arrow_ingestor.py +1 -11
- snowflake/ml/data/data_connector.py +33 -7
- snowflake/ml/data/ingestor_utils.py +20 -10
- snowflake/ml/data/torch_utils.py +68 -0
- snowflake/ml/dataset/dataset.py +1 -3
- snowflake/ml/feature_store/access_manager.py +3 -3
- snowflake/ml/feature_store/feature_store.py +60 -19
- snowflake/ml/feature_store/feature_view.py +84 -30
- snowflake/ml/fileset/embedded_stage_fs.py +1 -1
- snowflake/ml/fileset/fileset.py +1 -1
- snowflake/ml/fileset/sfcfs.py +9 -3
- snowflake/ml/fileset/stage_fs.py +2 -1
- snowflake/ml/lineage/lineage_node.py +7 -2
- snowflake/ml/model/__init__.py +1 -2
- snowflake/ml/model/_client/model/model_version_impl.py +96 -12
- snowflake/ml/model/_client/ops/model_ops.py +124 -6
- snowflake/ml/model/_client/ops/service_ops.py +309 -9
- snowflake/ml/model/_client/service/model_deployment_spec.py +8 -5
- snowflake/ml/model/_client/service/model_deployment_spec_schema.py +2 -2
- snowflake/ml/model/_client/sql/_base.py +5 -0
- snowflake/ml/model/_client/sql/model.py +1 -0
- snowflake/ml/model/_client/sql/model_version.py +9 -5
- snowflake/ml/model/_client/sql/service.py +121 -20
- snowflake/ml/model/_model_composer/model_composer.py +11 -39
- snowflake/ml/model/_model_composer/model_manifest/model_manifest.py +31 -11
- snowflake/ml/model/_packager/model_env/model_env.py +4 -38
- snowflake/ml/model/_packager/model_handlers/_utils.py +134 -28
- snowflake/ml/model/_packager/model_handlers/catboost.py +31 -30
- snowflake/ml/model/_packager/model_handlers/huggingface_pipeline.py +26 -18
- snowflake/ml/model/_packager/model_handlers/lightgbm.py +31 -58
- snowflake/ml/model/_packager/model_handlers/mlflow.py +3 -5
- snowflake/ml/model/_packager/model_handlers/model_objective_utils.py +169 -0
- snowflake/ml/model/_packager/model_handlers/sentence_transformers.py +15 -8
- snowflake/ml/model/_packager/model_handlers/sklearn.py +56 -60
- snowflake/ml/model/_packager/model_handlers/snowmlmodel.py +141 -9
- snowflake/ml/model/_packager/model_handlers/torchscript.py +2 -2
- snowflake/ml/model/_packager/model_handlers/xgboost.py +63 -48
- snowflake/ml/model/_packager/model_meta/model_meta.py +16 -42
- snowflake/ml/model/_packager/model_meta/model_meta_schema.py +1 -14
- snowflake/ml/model/_packager/model_packager.py +14 -8
- snowflake/ml/model/_packager/model_runtime/model_runtime.py +11 -0
- snowflake/ml/model/_signatures/pytorch_handler.py +1 -1
- snowflake/ml/model/_signatures/snowpark_handler.py +3 -2
- snowflake/ml/model/_signatures/utils.py +9 -0
- snowflake/ml/model/type_hints.py +12 -145
- snowflake/ml/modeling/_internal/constants.py +1 -0
- snowflake/ml/modeling/_internal/local_implementations/pandas_handlers.py +5 -5
- snowflake/ml/modeling/_internal/local_implementations/pandas_trainer.py +9 -6
- snowflake/ml/modeling/_internal/model_specifications.py +2 -0
- snowflake/ml/modeling/_internal/model_trainer.py +1 -0
- snowflake/ml/modeling/_internal/snowpark_implementations/distributed_hpo_trainer.py +2 -4
- snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_handlers.py +5 -5
- snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_trainer.py +130 -166
- snowflake/ml/modeling/_internal/snowpark_implementations/xgboost_external_memory_trainer.py +0 -1
- snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +61 -21
- snowflake/ml/modeling/cluster/affinity_propagation.py +61 -21
- snowflake/ml/modeling/cluster/agglomerative_clustering.py +61 -21
- snowflake/ml/modeling/cluster/birch.py +61 -21
- snowflake/ml/modeling/cluster/bisecting_k_means.py +61 -21
- snowflake/ml/modeling/cluster/dbscan.py +61 -21
- snowflake/ml/modeling/cluster/feature_agglomeration.py +61 -21
- snowflake/ml/modeling/cluster/k_means.py +61 -21
- snowflake/ml/modeling/cluster/mean_shift.py +61 -21
- snowflake/ml/modeling/cluster/mini_batch_k_means.py +61 -21
- snowflake/ml/modeling/cluster/optics.py +61 -21
- snowflake/ml/modeling/cluster/spectral_biclustering.py +61 -21
- snowflake/ml/modeling/cluster/spectral_clustering.py +61 -21
- snowflake/ml/modeling/cluster/spectral_coclustering.py +61 -21
- snowflake/ml/modeling/compose/column_transformer.py +61 -21
- snowflake/ml/modeling/compose/transformed_target_regressor.py +61 -21
- snowflake/ml/modeling/covariance/elliptic_envelope.py +61 -21
- snowflake/ml/modeling/covariance/empirical_covariance.py +61 -21
- snowflake/ml/modeling/covariance/graphical_lasso.py +61 -21
- snowflake/ml/modeling/covariance/graphical_lasso_cv.py +61 -21
- snowflake/ml/modeling/covariance/ledoit_wolf.py +61 -21
- snowflake/ml/modeling/covariance/min_cov_det.py +61 -21
- snowflake/ml/modeling/covariance/oas.py +61 -21
- snowflake/ml/modeling/covariance/shrunk_covariance.py +61 -21
- snowflake/ml/modeling/decomposition/dictionary_learning.py +61 -21
- snowflake/ml/modeling/decomposition/factor_analysis.py +61 -21
- snowflake/ml/modeling/decomposition/fast_ica.py +61 -21
- snowflake/ml/modeling/decomposition/incremental_pca.py +61 -21
- snowflake/ml/modeling/decomposition/kernel_pca.py +61 -21
- snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +61 -21
- snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +61 -21
- snowflake/ml/modeling/decomposition/pca.py +61 -21
- snowflake/ml/modeling/decomposition/sparse_pca.py +61 -21
- snowflake/ml/modeling/decomposition/truncated_svd.py +61 -21
- snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +61 -21
- snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +61 -21
- snowflake/ml/modeling/ensemble/ada_boost_classifier.py +61 -21
- snowflake/ml/modeling/ensemble/ada_boost_regressor.py +61 -21
- snowflake/ml/modeling/ensemble/bagging_classifier.py +61 -21
- snowflake/ml/modeling/ensemble/bagging_regressor.py +61 -21
- snowflake/ml/modeling/ensemble/extra_trees_classifier.py +61 -21
- snowflake/ml/modeling/ensemble/extra_trees_regressor.py +61 -21
- snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +61 -21
- snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +61 -21
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +61 -21
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +61 -21
- snowflake/ml/modeling/ensemble/isolation_forest.py +61 -21
- snowflake/ml/modeling/ensemble/random_forest_classifier.py +61 -21
- snowflake/ml/modeling/ensemble/random_forest_regressor.py +61 -21
- snowflake/ml/modeling/ensemble/stacking_regressor.py +61 -21
- snowflake/ml/modeling/ensemble/voting_classifier.py +61 -21
- snowflake/ml/modeling/ensemble/voting_regressor.py +61 -21
- snowflake/ml/modeling/feature_selection/generic_univariate_select.py +61 -21
- snowflake/ml/modeling/feature_selection/select_fdr.py +61 -21
- snowflake/ml/modeling/feature_selection/select_fpr.py +61 -21
- snowflake/ml/modeling/feature_selection/select_fwe.py +61 -21
- snowflake/ml/modeling/feature_selection/select_k_best.py +61 -21
- snowflake/ml/modeling/feature_selection/select_percentile.py +61 -21
- snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +61 -21
- snowflake/ml/modeling/feature_selection/variance_threshold.py +61 -21
- snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +61 -21
- snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +61 -21
- snowflake/ml/modeling/impute/iterative_imputer.py +61 -21
- snowflake/ml/modeling/impute/knn_imputer.py +61 -21
- snowflake/ml/modeling/impute/missing_indicator.py +61 -21
- snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +61 -21
- snowflake/ml/modeling/kernel_approximation/nystroem.py +61 -21
- snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +61 -21
- snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +61 -21
- snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +61 -21
- snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +61 -21
- snowflake/ml/modeling/lightgbm/lgbm_classifier.py +61 -21
- snowflake/ml/modeling/lightgbm/lgbm_regressor.py +61 -21
- snowflake/ml/modeling/linear_model/ard_regression.py +61 -21
- snowflake/ml/modeling/linear_model/bayesian_ridge.py +61 -21
- snowflake/ml/modeling/linear_model/elastic_net.py +61 -21
- snowflake/ml/modeling/linear_model/elastic_net_cv.py +61 -21
- snowflake/ml/modeling/linear_model/gamma_regressor.py +61 -21
- snowflake/ml/modeling/linear_model/huber_regressor.py +61 -21
- snowflake/ml/modeling/linear_model/lars.py +61 -21
- snowflake/ml/modeling/linear_model/lars_cv.py +61 -21
- snowflake/ml/modeling/linear_model/lasso.py +61 -21
- snowflake/ml/modeling/linear_model/lasso_cv.py +61 -21
- snowflake/ml/modeling/linear_model/lasso_lars.py +61 -21
- snowflake/ml/modeling/linear_model/lasso_lars_cv.py +61 -21
- snowflake/ml/modeling/linear_model/lasso_lars_ic.py +61 -21
- snowflake/ml/modeling/linear_model/linear_regression.py +61 -21
- snowflake/ml/modeling/linear_model/logistic_regression.py +61 -21
- snowflake/ml/modeling/linear_model/logistic_regression_cv.py +61 -21
- snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +61 -21
- snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +61 -21
- snowflake/ml/modeling/linear_model/multi_task_lasso.py +61 -21
- snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +61 -21
- snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +61 -21
- snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +61 -21
- snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +61 -21
- snowflake/ml/modeling/linear_model/perceptron.py +61 -21
- snowflake/ml/modeling/linear_model/poisson_regressor.py +61 -21
- snowflake/ml/modeling/linear_model/ransac_regressor.py +61 -21
- snowflake/ml/modeling/linear_model/ridge.py +61 -21
- snowflake/ml/modeling/linear_model/ridge_classifier.py +61 -21
- snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +61 -21
- snowflake/ml/modeling/linear_model/ridge_cv.py +61 -21
- snowflake/ml/modeling/linear_model/sgd_classifier.py +61 -21
- snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +61 -21
- snowflake/ml/modeling/linear_model/sgd_regressor.py +61 -21
- snowflake/ml/modeling/linear_model/theil_sen_regressor.py +61 -21
- snowflake/ml/modeling/linear_model/tweedie_regressor.py +61 -21
- snowflake/ml/modeling/manifold/isomap.py +61 -21
- snowflake/ml/modeling/manifold/mds.py +61 -21
- snowflake/ml/modeling/manifold/spectral_embedding.py +61 -21
- snowflake/ml/modeling/manifold/tsne.py +61 -21
- snowflake/ml/modeling/metrics/metrics_utils.py +2 -2
- snowflake/ml/modeling/metrics/ranking.py +0 -3
- snowflake/ml/modeling/metrics/regression.py +0 -3
- snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +61 -21
- snowflake/ml/modeling/mixture/gaussian_mixture.py +61 -21
- snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +61 -21
- snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +61 -21
- snowflake/ml/modeling/multiclass/output_code_classifier.py +61 -21
- snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +61 -21
- snowflake/ml/modeling/naive_bayes/categorical_nb.py +61 -21
- snowflake/ml/modeling/naive_bayes/complement_nb.py +61 -21
- snowflake/ml/modeling/naive_bayes/gaussian_nb.py +61 -21
- snowflake/ml/modeling/naive_bayes/multinomial_nb.py +61 -21
- snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +61 -21
- snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +61 -21
- snowflake/ml/modeling/neighbors/kernel_density.py +61 -21
- snowflake/ml/modeling/neighbors/local_outlier_factor.py +61 -21
- snowflake/ml/modeling/neighbors/nearest_centroid.py +61 -21
- snowflake/ml/modeling/neighbors/nearest_neighbors.py +61 -21
- snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +61 -21
- snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +61 -21
- snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +61 -21
- snowflake/ml/modeling/neural_network/bernoulli_rbm.py +61 -21
- snowflake/ml/modeling/neural_network/mlp_classifier.py +61 -21
- snowflake/ml/modeling/neural_network/mlp_regressor.py +61 -21
- snowflake/ml/modeling/parameters/disable_model_tracer.py +5 -0
- snowflake/ml/modeling/pipeline/pipeline.py +1 -13
- snowflake/ml/modeling/preprocessing/polynomial_features.py +61 -21
- snowflake/ml/modeling/semi_supervised/label_propagation.py +61 -21
- snowflake/ml/modeling/semi_supervised/label_spreading.py +61 -21
- snowflake/ml/modeling/svm/linear_svc.py +61 -21
- snowflake/ml/modeling/svm/linear_svr.py +61 -21
- snowflake/ml/modeling/svm/nu_svc.py +61 -21
- snowflake/ml/modeling/svm/nu_svr.py +61 -21
- snowflake/ml/modeling/svm/svc.py +61 -21
- snowflake/ml/modeling/svm/svr.py +61 -21
- snowflake/ml/modeling/tree/decision_tree_classifier.py +61 -21
- snowflake/ml/modeling/tree/decision_tree_regressor.py +61 -21
- snowflake/ml/modeling/tree/extra_tree_classifier.py +61 -21
- snowflake/ml/modeling/tree/extra_tree_regressor.py +61 -21
- snowflake/ml/modeling/xgboost/xgb_classifier.py +64 -23
- snowflake/ml/modeling/xgboost/xgb_regressor.py +64 -23
- snowflake/ml/modeling/xgboost/xgbrf_classifier.py +64 -23
- snowflake/ml/modeling/xgboost/xgbrf_regressor.py +64 -23
- snowflake/ml/monitoring/_client/model_monitor.py +126 -0
- snowflake/ml/monitoring/_client/model_monitor_manager.py +361 -0
- snowflake/ml/monitoring/_client/model_monitor_version.py +1 -0
- snowflake/ml/monitoring/_client/monitor_sql_client.py +1335 -0
- snowflake/ml/monitoring/_client/queries/record_count.ssql +14 -0
- snowflake/ml/monitoring/_client/queries/rmse.ssql +28 -0
- snowflake/ml/monitoring/entities/model_monitor_config.py +28 -0
- snowflake/ml/monitoring/entities/model_monitor_interval.py +46 -0
- snowflake/ml/monitoring/entities/output_score_type.py +90 -0
- snowflake/ml/registry/_manager/model_manager.py +4 -0
- snowflake/ml/registry/registry.py +166 -8
- snowflake/ml/version.py +1 -1
- {snowflake_ml_python-1.6.1.dist-info → snowflake_ml_python-1.6.3.dist-info}/METADATA +43 -9
- snowflake_ml_python-1.6.3.dist-info/RECORD +400 -0
- {snowflake_ml_python-1.6.1.dist-info → snowflake_ml_python-1.6.3.dist-info}/WHEEL +1 -1
- snowflake/ml/_internal/container_services/image_registry/credential.py +0 -84
- snowflake/ml/_internal/container_services/image_registry/http_client.py +0 -127
- snowflake/ml/_internal/container_services/image_registry/imagelib.py +0 -400
- snowflake/ml/_internal/container_services/image_registry/registry_client.py +0 -212
- snowflake/ml/_internal/utils/log_stream_processor.py +0 -30
- snowflake/ml/_internal/utils/session_token_manager.py +0 -46
- snowflake/ml/_internal/utils/spcs_attribution_utils.py +0 -122
- snowflake/ml/_internal/utils/uri.py +0 -77
- snowflake/ml/data/torch_dataset.py +0 -33
- snowflake/ml/model/_api.py +0 -568
- snowflake/ml/model/_deploy_client/image_builds/base_image_builder.py +0 -12
- snowflake/ml/model/_deploy_client/image_builds/client_image_builder.py +0 -249
- snowflake/ml/model/_deploy_client/image_builds/docker_context.py +0 -130
- snowflake/ml/model/_deploy_client/image_builds/gunicorn_run.sh +0 -36
- snowflake/ml/model/_deploy_client/image_builds/inference_server/main.py +0 -268
- snowflake/ml/model/_deploy_client/image_builds/server_image_builder.py +0 -215
- snowflake/ml/model/_deploy_client/image_builds/templates/dockerfile_template +0 -53
- snowflake/ml/model/_deploy_client/image_builds/templates/image_build_job_spec_template +0 -38
- snowflake/ml/model/_deploy_client/image_builds/templates/kaniko_shell_script_template +0 -105
- snowflake/ml/model/_deploy_client/snowservice/deploy.py +0 -611
- snowflake/ml/model/_deploy_client/snowservice/deploy_options.py +0 -116
- snowflake/ml/model/_deploy_client/snowservice/instance_types.py +0 -10
- snowflake/ml/model/_deploy_client/snowservice/templates/service_spec_template +0 -28
- snowflake/ml/model/_deploy_client/snowservice/templates/service_spec_template_with_model +0 -21
- snowflake/ml/model/_deploy_client/utils/constants.py +0 -48
- snowflake/ml/model/_deploy_client/utils/snowservice_client.py +0 -280
- snowflake/ml/model/_deploy_client/warehouse/deploy.py +0 -202
- snowflake/ml/model/_deploy_client/warehouse/infer_template.py +0 -99
- snowflake/ml/model/_packager/model_handlers/llm.py +0 -267
- snowflake/ml/model/_packager/model_meta/_core_requirements.py +0 -11
- snowflake/ml/model/deploy_platforms.py +0 -6
- snowflake/ml/model/models/llm.py +0 -104
- snowflake/ml/monitoring/monitor.py +0 -203
- snowflake/ml/registry/_initial_schema.py +0 -142
- snowflake/ml/registry/_schema.py +0 -82
- snowflake/ml/registry/_schema_upgrade_plans.py +0 -116
- snowflake/ml/registry/_schema_version_manager.py +0 -163
- snowflake/ml/registry/model_registry.py +0 -2048
- snowflake_ml_python-1.6.1.dist-info/RECORD +0 -422
- {snowflake_ml_python-1.6.1.dist-info → snowflake_ml_python-1.6.3.dist-info}/LICENSE.txt +0 -0
- {snowflake_ml_python-1.6.1.dist-info → snowflake_ml_python-1.6.3.dist-info}/top_level.txt +0 -0
@@ -4,14 +4,12 @@
|
|
4
4
|
#
|
5
5
|
import inspect
|
6
6
|
import os
|
7
|
-
import
|
8
|
-
from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
|
9
|
-
from typing_extensions import TypeGuard
|
7
|
+
from typing import Iterable, Optional, Union, List, Any, Dict, Set, Tuple
|
10
8
|
from uuid import uuid4
|
11
9
|
|
12
10
|
import cloudpickle as cp
|
13
|
-
import pandas as pd
|
14
11
|
import numpy as np
|
12
|
+
import pandas as pd
|
15
13
|
from numpy import typing as npt
|
16
14
|
|
17
15
|
|
@@ -24,12 +22,11 @@ from snowflake.ml.modeling.framework.base import BaseTransformer, _process_cols
|
|
24
22
|
from snowflake.ml._internal import telemetry
|
25
23
|
from snowflake.ml._internal.exceptions import error_codes, exceptions, modeling_error_messages
|
26
24
|
from snowflake.ml._internal.env_utils import SNOWML_SPROC_ENV
|
27
|
-
from snowflake.ml._internal.utils import
|
25
|
+
from snowflake.ml._internal.utils import identifier
|
28
26
|
from snowflake.snowpark import DataFrame, Session
|
29
27
|
from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
|
30
28
|
from snowflake.ml.modeling._internal.model_trainer_builder import ModelTrainerBuilder
|
31
29
|
from snowflake.ml.modeling._internal.transformer_protocols import (
|
32
|
-
ModelTransformHandlers,
|
33
30
|
BatchInferenceKwargsTypedDict,
|
34
31
|
ScoreKwargsTypedDict
|
35
32
|
)
|
@@ -627,12 +624,23 @@ class RandomForestRegressor(BaseTransformer):
|
|
627
624
|
autogenerated=self._autogenerated,
|
628
625
|
subproject=_SUBPROJECT,
|
629
626
|
)
|
630
|
-
|
631
|
-
|
632
|
-
expected_output_cols_list=(
|
633
|
-
self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
|
634
|
-
),
|
627
|
+
expected_output_cols = (
|
628
|
+
self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
|
635
629
|
)
|
630
|
+
if isinstance(dataset, DataFrame):
|
631
|
+
expected_output_cols, example_output_pd_df = self._align_expected_output(
|
632
|
+
"fit_predict", dataset, expected_output_cols, output_cols_prefix
|
633
|
+
)
|
634
|
+
output_result, fitted_estimator = model_trainer.train_fit_predict(
|
635
|
+
drop_input_cols=self._drop_input_cols,
|
636
|
+
expected_output_cols_list=expected_output_cols,
|
637
|
+
example_output_pd_df=example_output_pd_df,
|
638
|
+
)
|
639
|
+
else:
|
640
|
+
output_result, fitted_estimator = model_trainer.train_fit_predict(
|
641
|
+
drop_input_cols=self._drop_input_cols,
|
642
|
+
expected_output_cols_list=expected_output_cols,
|
643
|
+
)
|
636
644
|
self._sklearn_object = fitted_estimator
|
637
645
|
self._is_fitted = True
|
638
646
|
return output_result
|
@@ -655,6 +663,7 @@ class RandomForestRegressor(BaseTransformer):
|
|
655
663
|
"""
|
656
664
|
self._infer_input_output_cols(dataset)
|
657
665
|
super()._check_dataset_type(dataset)
|
666
|
+
|
658
667
|
model_trainer = ModelTrainerBuilder.build_fit_transform(
|
659
668
|
estimator=self._sklearn_object,
|
660
669
|
dataset=dataset,
|
@@ -711,12 +720,41 @@ class RandomForestRegressor(BaseTransformer):
|
|
711
720
|
|
712
721
|
return rv
|
713
722
|
|
714
|
-
def
|
715
|
-
self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str
|
716
|
-
) -> List[str]:
|
723
|
+
def _align_expected_output(
|
724
|
+
self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str,
|
725
|
+
) -> Tuple[List[str], pd.DataFrame]:
|
726
|
+
""" Run 1 line of data with the desired method, and return one tuple that consists of the output column names
|
727
|
+
and output dataframe with 1 line.
|
728
|
+
If the method is fit_predict, run 2 lines of data.
|
729
|
+
"""
|
717
730
|
# in case the inferred output column names dimension is different
|
718
731
|
# we use one line of snowpark dataframe and put it into sklearn estimator using pandas
|
719
|
-
|
732
|
+
|
733
|
+
# For fit_predict method, a minimum of 2 is required by MinCovDet, BayesianGaussianMixture
|
734
|
+
# so change the minimum of number of rows to 2
|
735
|
+
num_examples = 2
|
736
|
+
statement_params = telemetry.get_function_usage_statement_params(
|
737
|
+
project=_PROJECT,
|
738
|
+
subproject=_SUBPROJECT,
|
739
|
+
function_name=telemetry.get_statement_params_full_func_name(
|
740
|
+
inspect.currentframe(), RandomForestRegressor.__class__.__name__
|
741
|
+
),
|
742
|
+
api_calls=[Session.call],
|
743
|
+
custom_tags={"autogen": True} if self._autogenerated else None,
|
744
|
+
)
|
745
|
+
if output_cols_prefix == "fit_predict_":
|
746
|
+
if hasattr(self._sklearn_object, "n_clusters"):
|
747
|
+
# cluster classes such as BisectingKMeansTest requires # of examples >= n_clusters
|
748
|
+
num_examples = self._sklearn_object.n_clusters
|
749
|
+
elif hasattr(self._sklearn_object, "min_samples"):
|
750
|
+
# OPTICS default min_samples 5, which requires at least 5 lines of data
|
751
|
+
num_examples = self._sklearn_object.min_samples
|
752
|
+
elif hasattr(self._sklearn_object, "n_neighbors") and hasattr(self._sklearn_object, "n_samples"):
|
753
|
+
# LocalOutlierFactor expects n_neighbors <= n_samples
|
754
|
+
num_examples = self._sklearn_object.n_neighbors
|
755
|
+
sample_pd_df = dataset.select(self.input_cols).limit(num_examples).to_pandas(statement_params=statement_params)
|
756
|
+
else:
|
757
|
+
sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas(statement_params=statement_params)
|
720
758
|
|
721
759
|
# Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
|
722
760
|
# seen during the fit.
|
@@ -728,12 +766,14 @@ class RandomForestRegressor(BaseTransformer):
|
|
728
766
|
output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
|
729
767
|
if self.sample_weight_col:
|
730
768
|
output_df_columns_set -= set(self.sample_weight_col)
|
769
|
+
|
731
770
|
# if the dimension of inferred output column names is correct; use it
|
732
771
|
if len(expected_output_cols_list) == len(output_df_columns_set):
|
733
|
-
return expected_output_cols_list
|
772
|
+
return expected_output_cols_list, output_df_pd
|
734
773
|
# otherwise, use the sklearn estimator's output
|
735
774
|
else:
|
736
|
-
|
775
|
+
expected_output_cols_list = sorted(list(output_df_columns_set), key=lambda x: output_df_columns.index(x))
|
776
|
+
return expected_output_cols_list, output_df_pd[expected_output_cols_list]
|
737
777
|
|
738
778
|
@available_if(original_estimator_has_callable("predict_proba")) # type: ignore[misc]
|
739
779
|
@telemetry.send_api_usage_telemetry(
|
@@ -779,7 +819,7 @@ class RandomForestRegressor(BaseTransformer):
|
|
779
819
|
drop_input_cols=self._drop_input_cols,
|
780
820
|
expected_output_cols_type="float",
|
781
821
|
)
|
782
|
-
expected_output_cols = self.
|
822
|
+
expected_output_cols, _ = self._align_expected_output(
|
783
823
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
784
824
|
)
|
785
825
|
|
@@ -845,7 +885,7 @@ class RandomForestRegressor(BaseTransformer):
|
|
845
885
|
drop_input_cols=self._drop_input_cols,
|
846
886
|
expected_output_cols_type="float",
|
847
887
|
)
|
848
|
-
expected_output_cols = self.
|
888
|
+
expected_output_cols, _ = self._align_expected_output(
|
849
889
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
850
890
|
)
|
851
891
|
elif isinstance(dataset, pd.DataFrame):
|
@@ -908,7 +948,7 @@ class RandomForestRegressor(BaseTransformer):
|
|
908
948
|
drop_input_cols=self._drop_input_cols,
|
909
949
|
expected_output_cols_type="float",
|
910
950
|
)
|
911
|
-
expected_output_cols = self.
|
951
|
+
expected_output_cols, _ = self._align_expected_output(
|
912
952
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
913
953
|
)
|
914
954
|
|
@@ -973,7 +1013,7 @@ class RandomForestRegressor(BaseTransformer):
|
|
973
1013
|
drop_input_cols = self._drop_input_cols,
|
974
1014
|
expected_output_cols_type="float",
|
975
1015
|
)
|
976
|
-
expected_output_cols = self.
|
1016
|
+
expected_output_cols, _ = self._align_expected_output(
|
977
1017
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
978
1018
|
)
|
979
1019
|
|
@@ -4,14 +4,12 @@
|
|
4
4
|
#
|
5
5
|
import inspect
|
6
6
|
import os
|
7
|
-
import
|
8
|
-
from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
|
9
|
-
from typing_extensions import TypeGuard
|
7
|
+
from typing import Iterable, Optional, Union, List, Any, Dict, Set, Tuple
|
10
8
|
from uuid import uuid4
|
11
9
|
|
12
10
|
import cloudpickle as cp
|
13
|
-
import pandas as pd
|
14
11
|
import numpy as np
|
12
|
+
import pandas as pd
|
15
13
|
from numpy import typing as npt
|
16
14
|
|
17
15
|
|
@@ -24,12 +22,11 @@ from snowflake.ml.modeling.framework.base import BaseTransformer, _process_cols
|
|
24
22
|
from snowflake.ml._internal import telemetry
|
25
23
|
from snowflake.ml._internal.exceptions import error_codes, exceptions, modeling_error_messages
|
26
24
|
from snowflake.ml._internal.env_utils import SNOWML_SPROC_ENV
|
27
|
-
from snowflake.ml._internal.utils import
|
25
|
+
from snowflake.ml._internal.utils import identifier
|
28
26
|
from snowflake.snowpark import DataFrame, Session
|
29
27
|
from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
|
30
28
|
from snowflake.ml.modeling._internal.model_trainer_builder import ModelTrainerBuilder
|
31
29
|
from snowflake.ml.modeling._internal.transformer_protocols import (
|
32
|
-
ModelTransformHandlers,
|
33
30
|
BatchInferenceKwargsTypedDict,
|
34
31
|
ScoreKwargsTypedDict
|
35
32
|
)
|
@@ -530,12 +527,23 @@ class StackingRegressor(BaseTransformer):
|
|
530
527
|
autogenerated=self._autogenerated,
|
531
528
|
subproject=_SUBPROJECT,
|
532
529
|
)
|
533
|
-
|
534
|
-
|
535
|
-
expected_output_cols_list=(
|
536
|
-
self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
|
537
|
-
),
|
530
|
+
expected_output_cols = (
|
531
|
+
self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
|
538
532
|
)
|
533
|
+
if isinstance(dataset, DataFrame):
|
534
|
+
expected_output_cols, example_output_pd_df = self._align_expected_output(
|
535
|
+
"fit_predict", dataset, expected_output_cols, output_cols_prefix
|
536
|
+
)
|
537
|
+
output_result, fitted_estimator = model_trainer.train_fit_predict(
|
538
|
+
drop_input_cols=self._drop_input_cols,
|
539
|
+
expected_output_cols_list=expected_output_cols,
|
540
|
+
example_output_pd_df=example_output_pd_df,
|
541
|
+
)
|
542
|
+
else:
|
543
|
+
output_result, fitted_estimator = model_trainer.train_fit_predict(
|
544
|
+
drop_input_cols=self._drop_input_cols,
|
545
|
+
expected_output_cols_list=expected_output_cols,
|
546
|
+
)
|
539
547
|
self._sklearn_object = fitted_estimator
|
540
548
|
self._is_fitted = True
|
541
549
|
return output_result
|
@@ -560,6 +568,7 @@ class StackingRegressor(BaseTransformer):
|
|
560
568
|
"""
|
561
569
|
self._infer_input_output_cols(dataset)
|
562
570
|
super()._check_dataset_type(dataset)
|
571
|
+
|
563
572
|
model_trainer = ModelTrainerBuilder.build_fit_transform(
|
564
573
|
estimator=self._sklearn_object,
|
565
574
|
dataset=dataset,
|
@@ -616,12 +625,41 @@ class StackingRegressor(BaseTransformer):
|
|
616
625
|
|
617
626
|
return rv
|
618
627
|
|
619
|
-
def
|
620
|
-
self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str
|
621
|
-
) -> List[str]:
|
628
|
+
def _align_expected_output(
|
629
|
+
self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str,
|
630
|
+
) -> Tuple[List[str], pd.DataFrame]:
|
631
|
+
""" Run 1 line of data with the desired method, and return one tuple that consists of the output column names
|
632
|
+
and output dataframe with 1 line.
|
633
|
+
If the method is fit_predict, run 2 lines of data.
|
634
|
+
"""
|
622
635
|
# in case the inferred output column names dimension is different
|
623
636
|
# we use one line of snowpark dataframe and put it into sklearn estimator using pandas
|
624
|
-
|
637
|
+
|
638
|
+
# For fit_predict method, a minimum of 2 is required by MinCovDet, BayesianGaussianMixture
|
639
|
+
# so change the minimum of number of rows to 2
|
640
|
+
num_examples = 2
|
641
|
+
statement_params = telemetry.get_function_usage_statement_params(
|
642
|
+
project=_PROJECT,
|
643
|
+
subproject=_SUBPROJECT,
|
644
|
+
function_name=telemetry.get_statement_params_full_func_name(
|
645
|
+
inspect.currentframe(), StackingRegressor.__class__.__name__
|
646
|
+
),
|
647
|
+
api_calls=[Session.call],
|
648
|
+
custom_tags={"autogen": True} if self._autogenerated else None,
|
649
|
+
)
|
650
|
+
if output_cols_prefix == "fit_predict_":
|
651
|
+
if hasattr(self._sklearn_object, "n_clusters"):
|
652
|
+
# cluster classes such as BisectingKMeansTest requires # of examples >= n_clusters
|
653
|
+
num_examples = self._sklearn_object.n_clusters
|
654
|
+
elif hasattr(self._sklearn_object, "min_samples"):
|
655
|
+
# OPTICS default min_samples 5, which requires at least 5 lines of data
|
656
|
+
num_examples = self._sklearn_object.min_samples
|
657
|
+
elif hasattr(self._sklearn_object, "n_neighbors") and hasattr(self._sklearn_object, "n_samples"):
|
658
|
+
# LocalOutlierFactor expects n_neighbors <= n_samples
|
659
|
+
num_examples = self._sklearn_object.n_neighbors
|
660
|
+
sample_pd_df = dataset.select(self.input_cols).limit(num_examples).to_pandas(statement_params=statement_params)
|
661
|
+
else:
|
662
|
+
sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas(statement_params=statement_params)
|
625
663
|
|
626
664
|
# Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
|
627
665
|
# seen during the fit.
|
@@ -633,12 +671,14 @@ class StackingRegressor(BaseTransformer):
|
|
633
671
|
output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
|
634
672
|
if self.sample_weight_col:
|
635
673
|
output_df_columns_set -= set(self.sample_weight_col)
|
674
|
+
|
636
675
|
# if the dimension of inferred output column names is correct; use it
|
637
676
|
if len(expected_output_cols_list) == len(output_df_columns_set):
|
638
|
-
return expected_output_cols_list
|
677
|
+
return expected_output_cols_list, output_df_pd
|
639
678
|
# otherwise, use the sklearn estimator's output
|
640
679
|
else:
|
641
|
-
|
680
|
+
expected_output_cols_list = sorted(list(output_df_columns_set), key=lambda x: output_df_columns.index(x))
|
681
|
+
return expected_output_cols_list, output_df_pd[expected_output_cols_list]
|
642
682
|
|
643
683
|
@available_if(original_estimator_has_callable("predict_proba")) # type: ignore[misc]
|
644
684
|
@telemetry.send_api_usage_telemetry(
|
@@ -684,7 +724,7 @@ class StackingRegressor(BaseTransformer):
|
|
684
724
|
drop_input_cols=self._drop_input_cols,
|
685
725
|
expected_output_cols_type="float",
|
686
726
|
)
|
687
|
-
expected_output_cols = self.
|
727
|
+
expected_output_cols, _ = self._align_expected_output(
|
688
728
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
689
729
|
)
|
690
730
|
|
@@ -750,7 +790,7 @@ class StackingRegressor(BaseTransformer):
|
|
750
790
|
drop_input_cols=self._drop_input_cols,
|
751
791
|
expected_output_cols_type="float",
|
752
792
|
)
|
753
|
-
expected_output_cols = self.
|
793
|
+
expected_output_cols, _ = self._align_expected_output(
|
754
794
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
755
795
|
)
|
756
796
|
elif isinstance(dataset, pd.DataFrame):
|
@@ -813,7 +853,7 @@ class StackingRegressor(BaseTransformer):
|
|
813
853
|
drop_input_cols=self._drop_input_cols,
|
814
854
|
expected_output_cols_type="float",
|
815
855
|
)
|
816
|
-
expected_output_cols = self.
|
856
|
+
expected_output_cols, _ = self._align_expected_output(
|
817
857
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
818
858
|
)
|
819
859
|
|
@@ -878,7 +918,7 @@ class StackingRegressor(BaseTransformer):
|
|
878
918
|
drop_input_cols = self._drop_input_cols,
|
879
919
|
expected_output_cols_type="float",
|
880
920
|
)
|
881
|
-
expected_output_cols = self.
|
921
|
+
expected_output_cols, _ = self._align_expected_output(
|
882
922
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
883
923
|
)
|
884
924
|
|
@@ -4,14 +4,12 @@
|
|
4
4
|
#
|
5
5
|
import inspect
|
6
6
|
import os
|
7
|
-
import
|
8
|
-
from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
|
9
|
-
from typing_extensions import TypeGuard
|
7
|
+
from typing import Iterable, Optional, Union, List, Any, Dict, Set, Tuple
|
10
8
|
from uuid import uuid4
|
11
9
|
|
12
10
|
import cloudpickle as cp
|
13
|
-
import pandas as pd
|
14
11
|
import numpy as np
|
12
|
+
import pandas as pd
|
15
13
|
from numpy import typing as npt
|
16
14
|
|
17
15
|
|
@@ -24,12 +22,11 @@ from snowflake.ml.modeling.framework.base import BaseTransformer, _process_cols
|
|
24
22
|
from snowflake.ml._internal import telemetry
|
25
23
|
from snowflake.ml._internal.exceptions import error_codes, exceptions, modeling_error_messages
|
26
24
|
from snowflake.ml._internal.env_utils import SNOWML_SPROC_ENV
|
27
|
-
from snowflake.ml._internal.utils import
|
25
|
+
from snowflake.ml._internal.utils import identifier
|
28
26
|
from snowflake.snowpark import DataFrame, Session
|
29
27
|
from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
|
30
28
|
from snowflake.ml.modeling._internal.model_trainer_builder import ModelTrainerBuilder
|
31
29
|
from snowflake.ml.modeling._internal.transformer_protocols import (
|
32
|
-
ModelTransformHandlers,
|
33
30
|
BatchInferenceKwargsTypedDict,
|
34
31
|
ScoreKwargsTypedDict
|
35
32
|
)
|
@@ -512,12 +509,23 @@ class VotingClassifier(BaseTransformer):
|
|
512
509
|
autogenerated=self._autogenerated,
|
513
510
|
subproject=_SUBPROJECT,
|
514
511
|
)
|
515
|
-
|
516
|
-
|
517
|
-
expected_output_cols_list=(
|
518
|
-
self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
|
519
|
-
),
|
512
|
+
expected_output_cols = (
|
513
|
+
self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
|
520
514
|
)
|
515
|
+
if isinstance(dataset, DataFrame):
|
516
|
+
expected_output_cols, example_output_pd_df = self._align_expected_output(
|
517
|
+
"fit_predict", dataset, expected_output_cols, output_cols_prefix
|
518
|
+
)
|
519
|
+
output_result, fitted_estimator = model_trainer.train_fit_predict(
|
520
|
+
drop_input_cols=self._drop_input_cols,
|
521
|
+
expected_output_cols_list=expected_output_cols,
|
522
|
+
example_output_pd_df=example_output_pd_df,
|
523
|
+
)
|
524
|
+
else:
|
525
|
+
output_result, fitted_estimator = model_trainer.train_fit_predict(
|
526
|
+
drop_input_cols=self._drop_input_cols,
|
527
|
+
expected_output_cols_list=expected_output_cols,
|
528
|
+
)
|
521
529
|
self._sklearn_object = fitted_estimator
|
522
530
|
self._is_fitted = True
|
523
531
|
return output_result
|
@@ -542,6 +550,7 @@ class VotingClassifier(BaseTransformer):
|
|
542
550
|
"""
|
543
551
|
self._infer_input_output_cols(dataset)
|
544
552
|
super()._check_dataset_type(dataset)
|
553
|
+
|
545
554
|
model_trainer = ModelTrainerBuilder.build_fit_transform(
|
546
555
|
estimator=self._sklearn_object,
|
547
556
|
dataset=dataset,
|
@@ -598,12 +607,41 @@ class VotingClassifier(BaseTransformer):
|
|
598
607
|
|
599
608
|
return rv
|
600
609
|
|
601
|
-
def
|
602
|
-
self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str
|
603
|
-
) -> List[str]:
|
610
|
+
def _align_expected_output(
|
611
|
+
self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str,
|
612
|
+
) -> Tuple[List[str], pd.DataFrame]:
|
613
|
+
""" Run 1 line of data with the desired method, and return one tuple that consists of the output column names
|
614
|
+
and output dataframe with 1 line.
|
615
|
+
If the method is fit_predict, run 2 lines of data.
|
616
|
+
"""
|
604
617
|
# in case the inferred output column names dimension is different
|
605
618
|
# we use one line of snowpark dataframe and put it into sklearn estimator using pandas
|
606
|
-
|
619
|
+
|
620
|
+
# For fit_predict method, a minimum of 2 is required by MinCovDet, BayesianGaussianMixture
|
621
|
+
# so change the minimum of number of rows to 2
|
622
|
+
num_examples = 2
|
623
|
+
statement_params = telemetry.get_function_usage_statement_params(
|
624
|
+
project=_PROJECT,
|
625
|
+
subproject=_SUBPROJECT,
|
626
|
+
function_name=telemetry.get_statement_params_full_func_name(
|
627
|
+
inspect.currentframe(), VotingClassifier.__class__.__name__
|
628
|
+
),
|
629
|
+
api_calls=[Session.call],
|
630
|
+
custom_tags={"autogen": True} if self._autogenerated else None,
|
631
|
+
)
|
632
|
+
if output_cols_prefix == "fit_predict_":
|
633
|
+
if hasattr(self._sklearn_object, "n_clusters"):
|
634
|
+
# cluster classes such as BisectingKMeansTest requires # of examples >= n_clusters
|
635
|
+
num_examples = self._sklearn_object.n_clusters
|
636
|
+
elif hasattr(self._sklearn_object, "min_samples"):
|
637
|
+
# OPTICS default min_samples 5, which requires at least 5 lines of data
|
638
|
+
num_examples = self._sklearn_object.min_samples
|
639
|
+
elif hasattr(self._sklearn_object, "n_neighbors") and hasattr(self._sklearn_object, "n_samples"):
|
640
|
+
# LocalOutlierFactor expects n_neighbors <= n_samples
|
641
|
+
num_examples = self._sklearn_object.n_neighbors
|
642
|
+
sample_pd_df = dataset.select(self.input_cols).limit(num_examples).to_pandas(statement_params=statement_params)
|
643
|
+
else:
|
644
|
+
sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas(statement_params=statement_params)
|
607
645
|
|
608
646
|
# Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
|
609
647
|
# seen during the fit.
|
@@ -615,12 +653,14 @@ class VotingClassifier(BaseTransformer):
|
|
615
653
|
output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
|
616
654
|
if self.sample_weight_col:
|
617
655
|
output_df_columns_set -= set(self.sample_weight_col)
|
656
|
+
|
618
657
|
# if the dimension of inferred output column names is correct; use it
|
619
658
|
if len(expected_output_cols_list) == len(output_df_columns_set):
|
620
|
-
return expected_output_cols_list
|
659
|
+
return expected_output_cols_list, output_df_pd
|
621
660
|
# otherwise, use the sklearn estimator's output
|
622
661
|
else:
|
623
|
-
|
662
|
+
expected_output_cols_list = sorted(list(output_df_columns_set), key=lambda x: output_df_columns.index(x))
|
663
|
+
return expected_output_cols_list, output_df_pd[expected_output_cols_list]
|
624
664
|
|
625
665
|
@available_if(original_estimator_has_callable("predict_proba")) # type: ignore[misc]
|
626
666
|
@telemetry.send_api_usage_telemetry(
|
@@ -668,7 +708,7 @@ class VotingClassifier(BaseTransformer):
|
|
668
708
|
drop_input_cols=self._drop_input_cols,
|
669
709
|
expected_output_cols_type="float",
|
670
710
|
)
|
671
|
-
expected_output_cols = self.
|
711
|
+
expected_output_cols, _ = self._align_expected_output(
|
672
712
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
673
713
|
)
|
674
714
|
|
@@ -736,7 +776,7 @@ class VotingClassifier(BaseTransformer):
|
|
736
776
|
drop_input_cols=self._drop_input_cols,
|
737
777
|
expected_output_cols_type="float",
|
738
778
|
)
|
739
|
-
expected_output_cols = self.
|
779
|
+
expected_output_cols, _ = self._align_expected_output(
|
740
780
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
741
781
|
)
|
742
782
|
elif isinstance(dataset, pd.DataFrame):
|
@@ -799,7 +839,7 @@ class VotingClassifier(BaseTransformer):
|
|
799
839
|
drop_input_cols=self._drop_input_cols,
|
800
840
|
expected_output_cols_type="float",
|
801
841
|
)
|
802
|
-
expected_output_cols = self.
|
842
|
+
expected_output_cols, _ = self._align_expected_output(
|
803
843
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
804
844
|
)
|
805
845
|
|
@@ -864,7 +904,7 @@ class VotingClassifier(BaseTransformer):
|
|
864
904
|
drop_input_cols = self._drop_input_cols,
|
865
905
|
expected_output_cols_type="float",
|
866
906
|
)
|
867
|
-
expected_output_cols = self.
|
907
|
+
expected_output_cols, _ = self._align_expected_output(
|
868
908
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
869
909
|
)
|
870
910
|
|
@@ -4,14 +4,12 @@
|
|
4
4
|
#
|
5
5
|
import inspect
|
6
6
|
import os
|
7
|
-
import
|
8
|
-
from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
|
9
|
-
from typing_extensions import TypeGuard
|
7
|
+
from typing import Iterable, Optional, Union, List, Any, Dict, Set, Tuple
|
10
8
|
from uuid import uuid4
|
11
9
|
|
12
10
|
import cloudpickle as cp
|
13
|
-
import pandas as pd
|
14
11
|
import numpy as np
|
12
|
+
import pandas as pd
|
15
13
|
from numpy import typing as npt
|
16
14
|
|
17
15
|
|
@@ -24,12 +22,11 @@ from snowflake.ml.modeling.framework.base import BaseTransformer, _process_cols
|
|
24
22
|
from snowflake.ml._internal import telemetry
|
25
23
|
from snowflake.ml._internal.exceptions import error_codes, exceptions, modeling_error_messages
|
26
24
|
from snowflake.ml._internal.env_utils import SNOWML_SPROC_ENV
|
27
|
-
from snowflake.ml._internal.utils import
|
25
|
+
from snowflake.ml._internal.utils import identifier
|
28
26
|
from snowflake.snowpark import DataFrame, Session
|
29
27
|
from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
|
30
28
|
from snowflake.ml.modeling._internal.model_trainer_builder import ModelTrainerBuilder
|
31
29
|
from snowflake.ml.modeling._internal.transformer_protocols import (
|
32
|
-
ModelTransformHandlers,
|
33
30
|
BatchInferenceKwargsTypedDict,
|
34
31
|
ScoreKwargsTypedDict
|
35
32
|
)
|
@@ -494,12 +491,23 @@ class VotingRegressor(BaseTransformer):
|
|
494
491
|
autogenerated=self._autogenerated,
|
495
492
|
subproject=_SUBPROJECT,
|
496
493
|
)
|
497
|
-
|
498
|
-
|
499
|
-
expected_output_cols_list=(
|
500
|
-
self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
|
501
|
-
),
|
494
|
+
expected_output_cols = (
|
495
|
+
self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
|
502
496
|
)
|
497
|
+
if isinstance(dataset, DataFrame):
|
498
|
+
expected_output_cols, example_output_pd_df = self._align_expected_output(
|
499
|
+
"fit_predict", dataset, expected_output_cols, output_cols_prefix
|
500
|
+
)
|
501
|
+
output_result, fitted_estimator = model_trainer.train_fit_predict(
|
502
|
+
drop_input_cols=self._drop_input_cols,
|
503
|
+
expected_output_cols_list=expected_output_cols,
|
504
|
+
example_output_pd_df=example_output_pd_df,
|
505
|
+
)
|
506
|
+
else:
|
507
|
+
output_result, fitted_estimator = model_trainer.train_fit_predict(
|
508
|
+
drop_input_cols=self._drop_input_cols,
|
509
|
+
expected_output_cols_list=expected_output_cols,
|
510
|
+
)
|
503
511
|
self._sklearn_object = fitted_estimator
|
504
512
|
self._is_fitted = True
|
505
513
|
return output_result
|
@@ -524,6 +532,7 @@ class VotingRegressor(BaseTransformer):
|
|
524
532
|
"""
|
525
533
|
self._infer_input_output_cols(dataset)
|
526
534
|
super()._check_dataset_type(dataset)
|
535
|
+
|
527
536
|
model_trainer = ModelTrainerBuilder.build_fit_transform(
|
528
537
|
estimator=self._sklearn_object,
|
529
538
|
dataset=dataset,
|
@@ -580,12 +589,41 @@ class VotingRegressor(BaseTransformer):
|
|
580
589
|
|
581
590
|
return rv
|
582
591
|
|
583
|
-
def
|
584
|
-
self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str
|
585
|
-
) -> List[str]:
|
592
|
+
def _align_expected_output(
|
593
|
+
self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str,
|
594
|
+
) -> Tuple[List[str], pd.DataFrame]:
|
595
|
+
""" Run 1 line of data with the desired method, and return one tuple that consists of the output column names
|
596
|
+
and output dataframe with 1 line.
|
597
|
+
If the method is fit_predict, run 2 lines of data.
|
598
|
+
"""
|
586
599
|
# in case the inferred output column names dimension is different
|
587
600
|
# we use one line of snowpark dataframe and put it into sklearn estimator using pandas
|
588
|
-
|
601
|
+
|
602
|
+
# For fit_predict method, a minimum of 2 is required by MinCovDet, BayesianGaussianMixture
|
603
|
+
# so change the minimum of number of rows to 2
|
604
|
+
num_examples = 2
|
605
|
+
statement_params = telemetry.get_function_usage_statement_params(
|
606
|
+
project=_PROJECT,
|
607
|
+
subproject=_SUBPROJECT,
|
608
|
+
function_name=telemetry.get_statement_params_full_func_name(
|
609
|
+
inspect.currentframe(), VotingRegressor.__class__.__name__
|
610
|
+
),
|
611
|
+
api_calls=[Session.call],
|
612
|
+
custom_tags={"autogen": True} if self._autogenerated else None,
|
613
|
+
)
|
614
|
+
if output_cols_prefix == "fit_predict_":
|
615
|
+
if hasattr(self._sklearn_object, "n_clusters"):
|
616
|
+
# cluster classes such as BisectingKMeansTest requires # of examples >= n_clusters
|
617
|
+
num_examples = self._sklearn_object.n_clusters
|
618
|
+
elif hasattr(self._sklearn_object, "min_samples"):
|
619
|
+
# OPTICS default min_samples 5, which requires at least 5 lines of data
|
620
|
+
num_examples = self._sklearn_object.min_samples
|
621
|
+
elif hasattr(self._sklearn_object, "n_neighbors") and hasattr(self._sklearn_object, "n_samples"):
|
622
|
+
# LocalOutlierFactor expects n_neighbors <= n_samples
|
623
|
+
num_examples = self._sklearn_object.n_neighbors
|
624
|
+
sample_pd_df = dataset.select(self.input_cols).limit(num_examples).to_pandas(statement_params=statement_params)
|
625
|
+
else:
|
626
|
+
sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas(statement_params=statement_params)
|
589
627
|
|
590
628
|
# Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
|
591
629
|
# seen during the fit.
|
@@ -597,12 +635,14 @@ class VotingRegressor(BaseTransformer):
|
|
597
635
|
output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
|
598
636
|
if self.sample_weight_col:
|
599
637
|
output_df_columns_set -= set(self.sample_weight_col)
|
638
|
+
|
600
639
|
# if the dimension of inferred output column names is correct; use it
|
601
640
|
if len(expected_output_cols_list) == len(output_df_columns_set):
|
602
|
-
return expected_output_cols_list
|
641
|
+
return expected_output_cols_list, output_df_pd
|
603
642
|
# otherwise, use the sklearn estimator's output
|
604
643
|
else:
|
605
|
-
|
644
|
+
expected_output_cols_list = sorted(list(output_df_columns_set), key=lambda x: output_df_columns.index(x))
|
645
|
+
return expected_output_cols_list, output_df_pd[expected_output_cols_list]
|
606
646
|
|
607
647
|
@available_if(original_estimator_has_callable("predict_proba")) # type: ignore[misc]
|
608
648
|
@telemetry.send_api_usage_telemetry(
|
@@ -648,7 +688,7 @@ class VotingRegressor(BaseTransformer):
|
|
648
688
|
drop_input_cols=self._drop_input_cols,
|
649
689
|
expected_output_cols_type="float",
|
650
690
|
)
|
651
|
-
expected_output_cols = self.
|
691
|
+
expected_output_cols, _ = self._align_expected_output(
|
652
692
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
653
693
|
)
|
654
694
|
|
@@ -714,7 +754,7 @@ class VotingRegressor(BaseTransformer):
|
|
714
754
|
drop_input_cols=self._drop_input_cols,
|
715
755
|
expected_output_cols_type="float",
|
716
756
|
)
|
717
|
-
expected_output_cols = self.
|
757
|
+
expected_output_cols, _ = self._align_expected_output(
|
718
758
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
719
759
|
)
|
720
760
|
elif isinstance(dataset, pd.DataFrame):
|
@@ -777,7 +817,7 @@ class VotingRegressor(BaseTransformer):
|
|
777
817
|
drop_input_cols=self._drop_input_cols,
|
778
818
|
expected_output_cols_type="float",
|
779
819
|
)
|
780
|
-
expected_output_cols = self.
|
820
|
+
expected_output_cols, _ = self._align_expected_output(
|
781
821
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
782
822
|
)
|
783
823
|
|
@@ -842,7 +882,7 @@ class VotingRegressor(BaseTransformer):
|
|
842
882
|
drop_input_cols = self._drop_input_cols,
|
843
883
|
expected_output_cols_type="float",
|
844
884
|
)
|
845
|
-
expected_output_cols = self.
|
885
|
+
expected_output_cols, _ = self._align_expected_output(
|
846
886
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
847
887
|
)
|
848
888
|
|