snowflake-ml-python 1.6.1__py3-none-any.whl → 1.6.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- snowflake/cortex/__init__.py +4 -0
- snowflake/cortex/_classify_text.py +2 -2
- snowflake/cortex/_embed_text_1024.py +37 -0
- snowflake/cortex/_embed_text_768.py +37 -0
- snowflake/cortex/_extract_answer.py +2 -2
- snowflake/cortex/_sentiment.py +2 -2
- snowflake/cortex/_summarize.py +2 -2
- snowflake/cortex/_translate.py +2 -2
- snowflake/cortex/_util.py +4 -4
- snowflake/ml/_internal/env_utils.py +5 -5
- snowflake/ml/_internal/exceptions/error_codes.py +2 -0
- snowflake/ml/_internal/telemetry.py +142 -20
- snowflake/ml/_internal/utils/db_utils.py +50 -0
- snowflake/ml/_internal/utils/identifier.py +48 -11
- snowflake/ml/_internal/utils/service_logger.py +63 -0
- snowflake/ml/_internal/utils/snowflake_env.py +23 -13
- snowflake/ml/_internal/utils/sql_identifier.py +26 -2
- snowflake/ml/_internal/utils/table_manager.py +19 -1
- snowflake/ml/data/_internal/arrow_ingestor.py +1 -11
- snowflake/ml/data/data_connector.py +33 -7
- snowflake/ml/data/ingestor_utils.py +20 -10
- snowflake/ml/data/torch_utils.py +68 -0
- snowflake/ml/dataset/dataset.py +1 -3
- snowflake/ml/feature_store/access_manager.py +3 -3
- snowflake/ml/feature_store/feature_store.py +60 -19
- snowflake/ml/feature_store/feature_view.py +84 -30
- snowflake/ml/fileset/embedded_stage_fs.py +1 -1
- snowflake/ml/fileset/fileset.py +1 -1
- snowflake/ml/fileset/sfcfs.py +9 -3
- snowflake/ml/fileset/stage_fs.py +2 -1
- snowflake/ml/lineage/lineage_node.py +7 -2
- snowflake/ml/model/__init__.py +1 -2
- snowflake/ml/model/_client/model/model_version_impl.py +96 -12
- snowflake/ml/model/_client/ops/model_ops.py +124 -6
- snowflake/ml/model/_client/ops/service_ops.py +309 -9
- snowflake/ml/model/_client/service/model_deployment_spec.py +8 -5
- snowflake/ml/model/_client/service/model_deployment_spec_schema.py +2 -2
- snowflake/ml/model/_client/sql/_base.py +5 -0
- snowflake/ml/model/_client/sql/model.py +1 -0
- snowflake/ml/model/_client/sql/model_version.py +9 -5
- snowflake/ml/model/_client/sql/service.py +121 -20
- snowflake/ml/model/_model_composer/model_composer.py +11 -39
- snowflake/ml/model/_model_composer/model_manifest/model_manifest.py +31 -11
- snowflake/ml/model/_packager/model_env/model_env.py +4 -38
- snowflake/ml/model/_packager/model_handlers/_utils.py +134 -28
- snowflake/ml/model/_packager/model_handlers/catboost.py +31 -30
- snowflake/ml/model/_packager/model_handlers/huggingface_pipeline.py +26 -18
- snowflake/ml/model/_packager/model_handlers/lightgbm.py +31 -58
- snowflake/ml/model/_packager/model_handlers/mlflow.py +3 -5
- snowflake/ml/model/_packager/model_handlers/model_objective_utils.py +169 -0
- snowflake/ml/model/_packager/model_handlers/sentence_transformers.py +15 -8
- snowflake/ml/model/_packager/model_handlers/sklearn.py +56 -60
- snowflake/ml/model/_packager/model_handlers/snowmlmodel.py +141 -9
- snowflake/ml/model/_packager/model_handlers/torchscript.py +2 -2
- snowflake/ml/model/_packager/model_handlers/xgboost.py +63 -48
- snowflake/ml/model/_packager/model_meta/model_meta.py +16 -42
- snowflake/ml/model/_packager/model_meta/model_meta_schema.py +1 -14
- snowflake/ml/model/_packager/model_packager.py +14 -8
- snowflake/ml/model/_packager/model_runtime/model_runtime.py +11 -0
- snowflake/ml/model/_signatures/pytorch_handler.py +1 -1
- snowflake/ml/model/_signatures/snowpark_handler.py +3 -2
- snowflake/ml/model/_signatures/utils.py +9 -0
- snowflake/ml/model/type_hints.py +12 -145
- snowflake/ml/modeling/_internal/constants.py +1 -0
- snowflake/ml/modeling/_internal/local_implementations/pandas_handlers.py +5 -5
- snowflake/ml/modeling/_internal/local_implementations/pandas_trainer.py +9 -6
- snowflake/ml/modeling/_internal/model_specifications.py +2 -0
- snowflake/ml/modeling/_internal/model_trainer.py +1 -0
- snowflake/ml/modeling/_internal/snowpark_implementations/distributed_hpo_trainer.py +2 -4
- snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_handlers.py +5 -5
- snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_trainer.py +130 -166
- snowflake/ml/modeling/_internal/snowpark_implementations/xgboost_external_memory_trainer.py +0 -1
- snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +61 -21
- snowflake/ml/modeling/cluster/affinity_propagation.py +61 -21
- snowflake/ml/modeling/cluster/agglomerative_clustering.py +61 -21
- snowflake/ml/modeling/cluster/birch.py +61 -21
- snowflake/ml/modeling/cluster/bisecting_k_means.py +61 -21
- snowflake/ml/modeling/cluster/dbscan.py +61 -21
- snowflake/ml/modeling/cluster/feature_agglomeration.py +61 -21
- snowflake/ml/modeling/cluster/k_means.py +61 -21
- snowflake/ml/modeling/cluster/mean_shift.py +61 -21
- snowflake/ml/modeling/cluster/mini_batch_k_means.py +61 -21
- snowflake/ml/modeling/cluster/optics.py +61 -21
- snowflake/ml/modeling/cluster/spectral_biclustering.py +61 -21
- snowflake/ml/modeling/cluster/spectral_clustering.py +61 -21
- snowflake/ml/modeling/cluster/spectral_coclustering.py +61 -21
- snowflake/ml/modeling/compose/column_transformer.py +61 -21
- snowflake/ml/modeling/compose/transformed_target_regressor.py +61 -21
- snowflake/ml/modeling/covariance/elliptic_envelope.py +61 -21
- snowflake/ml/modeling/covariance/empirical_covariance.py +61 -21
- snowflake/ml/modeling/covariance/graphical_lasso.py +61 -21
- snowflake/ml/modeling/covariance/graphical_lasso_cv.py +61 -21
- snowflake/ml/modeling/covariance/ledoit_wolf.py +61 -21
- snowflake/ml/modeling/covariance/min_cov_det.py +61 -21
- snowflake/ml/modeling/covariance/oas.py +61 -21
- snowflake/ml/modeling/covariance/shrunk_covariance.py +61 -21
- snowflake/ml/modeling/decomposition/dictionary_learning.py +61 -21
- snowflake/ml/modeling/decomposition/factor_analysis.py +61 -21
- snowflake/ml/modeling/decomposition/fast_ica.py +61 -21
- snowflake/ml/modeling/decomposition/incremental_pca.py +61 -21
- snowflake/ml/modeling/decomposition/kernel_pca.py +61 -21
- snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +61 -21
- snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +61 -21
- snowflake/ml/modeling/decomposition/pca.py +61 -21
- snowflake/ml/modeling/decomposition/sparse_pca.py +61 -21
- snowflake/ml/modeling/decomposition/truncated_svd.py +61 -21
- snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +61 -21
- snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +61 -21
- snowflake/ml/modeling/ensemble/ada_boost_classifier.py +61 -21
- snowflake/ml/modeling/ensemble/ada_boost_regressor.py +61 -21
- snowflake/ml/modeling/ensemble/bagging_classifier.py +61 -21
- snowflake/ml/modeling/ensemble/bagging_regressor.py +61 -21
- snowflake/ml/modeling/ensemble/extra_trees_classifier.py +61 -21
- snowflake/ml/modeling/ensemble/extra_trees_regressor.py +61 -21
- snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +61 -21
- snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +61 -21
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +61 -21
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +61 -21
- snowflake/ml/modeling/ensemble/isolation_forest.py +61 -21
- snowflake/ml/modeling/ensemble/random_forest_classifier.py +61 -21
- snowflake/ml/modeling/ensemble/random_forest_regressor.py +61 -21
- snowflake/ml/modeling/ensemble/stacking_regressor.py +61 -21
- snowflake/ml/modeling/ensemble/voting_classifier.py +61 -21
- snowflake/ml/modeling/ensemble/voting_regressor.py +61 -21
- snowflake/ml/modeling/feature_selection/generic_univariate_select.py +61 -21
- snowflake/ml/modeling/feature_selection/select_fdr.py +61 -21
- snowflake/ml/modeling/feature_selection/select_fpr.py +61 -21
- snowflake/ml/modeling/feature_selection/select_fwe.py +61 -21
- snowflake/ml/modeling/feature_selection/select_k_best.py +61 -21
- snowflake/ml/modeling/feature_selection/select_percentile.py +61 -21
- snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +61 -21
- snowflake/ml/modeling/feature_selection/variance_threshold.py +61 -21
- snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +61 -21
- snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +61 -21
- snowflake/ml/modeling/impute/iterative_imputer.py +61 -21
- snowflake/ml/modeling/impute/knn_imputer.py +61 -21
- snowflake/ml/modeling/impute/missing_indicator.py +61 -21
- snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +61 -21
- snowflake/ml/modeling/kernel_approximation/nystroem.py +61 -21
- snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +61 -21
- snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +61 -21
- snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +61 -21
- snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +61 -21
- snowflake/ml/modeling/lightgbm/lgbm_classifier.py +61 -21
- snowflake/ml/modeling/lightgbm/lgbm_regressor.py +61 -21
- snowflake/ml/modeling/linear_model/ard_regression.py +61 -21
- snowflake/ml/modeling/linear_model/bayesian_ridge.py +61 -21
- snowflake/ml/modeling/linear_model/elastic_net.py +61 -21
- snowflake/ml/modeling/linear_model/elastic_net_cv.py +61 -21
- snowflake/ml/modeling/linear_model/gamma_regressor.py +61 -21
- snowflake/ml/modeling/linear_model/huber_regressor.py +61 -21
- snowflake/ml/modeling/linear_model/lars.py +61 -21
- snowflake/ml/modeling/linear_model/lars_cv.py +61 -21
- snowflake/ml/modeling/linear_model/lasso.py +61 -21
- snowflake/ml/modeling/linear_model/lasso_cv.py +61 -21
- snowflake/ml/modeling/linear_model/lasso_lars.py +61 -21
- snowflake/ml/modeling/linear_model/lasso_lars_cv.py +61 -21
- snowflake/ml/modeling/linear_model/lasso_lars_ic.py +61 -21
- snowflake/ml/modeling/linear_model/linear_regression.py +61 -21
- snowflake/ml/modeling/linear_model/logistic_regression.py +61 -21
- snowflake/ml/modeling/linear_model/logistic_regression_cv.py +61 -21
- snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +61 -21
- snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +61 -21
- snowflake/ml/modeling/linear_model/multi_task_lasso.py +61 -21
- snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +61 -21
- snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +61 -21
- snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +61 -21
- snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +61 -21
- snowflake/ml/modeling/linear_model/perceptron.py +61 -21
- snowflake/ml/modeling/linear_model/poisson_regressor.py +61 -21
- snowflake/ml/modeling/linear_model/ransac_regressor.py +61 -21
- snowflake/ml/modeling/linear_model/ridge.py +61 -21
- snowflake/ml/modeling/linear_model/ridge_classifier.py +61 -21
- snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +61 -21
- snowflake/ml/modeling/linear_model/ridge_cv.py +61 -21
- snowflake/ml/modeling/linear_model/sgd_classifier.py +61 -21
- snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +61 -21
- snowflake/ml/modeling/linear_model/sgd_regressor.py +61 -21
- snowflake/ml/modeling/linear_model/theil_sen_regressor.py +61 -21
- snowflake/ml/modeling/linear_model/tweedie_regressor.py +61 -21
- snowflake/ml/modeling/manifold/isomap.py +61 -21
- snowflake/ml/modeling/manifold/mds.py +61 -21
- snowflake/ml/modeling/manifold/spectral_embedding.py +61 -21
- snowflake/ml/modeling/manifold/tsne.py +61 -21
- snowflake/ml/modeling/metrics/metrics_utils.py +2 -2
- snowflake/ml/modeling/metrics/ranking.py +0 -3
- snowflake/ml/modeling/metrics/regression.py +0 -3
- snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +61 -21
- snowflake/ml/modeling/mixture/gaussian_mixture.py +61 -21
- snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +61 -21
- snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +61 -21
- snowflake/ml/modeling/multiclass/output_code_classifier.py +61 -21
- snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +61 -21
- snowflake/ml/modeling/naive_bayes/categorical_nb.py +61 -21
- snowflake/ml/modeling/naive_bayes/complement_nb.py +61 -21
- snowflake/ml/modeling/naive_bayes/gaussian_nb.py +61 -21
- snowflake/ml/modeling/naive_bayes/multinomial_nb.py +61 -21
- snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +61 -21
- snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +61 -21
- snowflake/ml/modeling/neighbors/kernel_density.py +61 -21
- snowflake/ml/modeling/neighbors/local_outlier_factor.py +61 -21
- snowflake/ml/modeling/neighbors/nearest_centroid.py +61 -21
- snowflake/ml/modeling/neighbors/nearest_neighbors.py +61 -21
- snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +61 -21
- snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +61 -21
- snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +61 -21
- snowflake/ml/modeling/neural_network/bernoulli_rbm.py +61 -21
- snowflake/ml/modeling/neural_network/mlp_classifier.py +61 -21
- snowflake/ml/modeling/neural_network/mlp_regressor.py +61 -21
- snowflake/ml/modeling/parameters/disable_model_tracer.py +5 -0
- snowflake/ml/modeling/pipeline/pipeline.py +1 -13
- snowflake/ml/modeling/preprocessing/polynomial_features.py +61 -21
- snowflake/ml/modeling/semi_supervised/label_propagation.py +61 -21
- snowflake/ml/modeling/semi_supervised/label_spreading.py +61 -21
- snowflake/ml/modeling/svm/linear_svc.py +61 -21
- snowflake/ml/modeling/svm/linear_svr.py +61 -21
- snowflake/ml/modeling/svm/nu_svc.py +61 -21
- snowflake/ml/modeling/svm/nu_svr.py +61 -21
- snowflake/ml/modeling/svm/svc.py +61 -21
- snowflake/ml/modeling/svm/svr.py +61 -21
- snowflake/ml/modeling/tree/decision_tree_classifier.py +61 -21
- snowflake/ml/modeling/tree/decision_tree_regressor.py +61 -21
- snowflake/ml/modeling/tree/extra_tree_classifier.py +61 -21
- snowflake/ml/modeling/tree/extra_tree_regressor.py +61 -21
- snowflake/ml/modeling/xgboost/xgb_classifier.py +64 -23
- snowflake/ml/modeling/xgboost/xgb_regressor.py +64 -23
- snowflake/ml/modeling/xgboost/xgbrf_classifier.py +64 -23
- snowflake/ml/modeling/xgboost/xgbrf_regressor.py +64 -23
- snowflake/ml/monitoring/_client/model_monitor.py +126 -0
- snowflake/ml/monitoring/_client/model_monitor_manager.py +361 -0
- snowflake/ml/monitoring/_client/model_monitor_version.py +1 -0
- snowflake/ml/monitoring/_client/monitor_sql_client.py +1335 -0
- snowflake/ml/monitoring/_client/queries/record_count.ssql +14 -0
- snowflake/ml/monitoring/_client/queries/rmse.ssql +28 -0
- snowflake/ml/monitoring/entities/model_monitor_config.py +28 -0
- snowflake/ml/monitoring/entities/model_monitor_interval.py +46 -0
- snowflake/ml/monitoring/entities/output_score_type.py +90 -0
- snowflake/ml/registry/_manager/model_manager.py +4 -0
- snowflake/ml/registry/registry.py +166 -8
- snowflake/ml/version.py +1 -1
- {snowflake_ml_python-1.6.1.dist-info → snowflake_ml_python-1.6.3.dist-info}/METADATA +43 -9
- snowflake_ml_python-1.6.3.dist-info/RECORD +400 -0
- {snowflake_ml_python-1.6.1.dist-info → snowflake_ml_python-1.6.3.dist-info}/WHEEL +1 -1
- snowflake/ml/_internal/container_services/image_registry/credential.py +0 -84
- snowflake/ml/_internal/container_services/image_registry/http_client.py +0 -127
- snowflake/ml/_internal/container_services/image_registry/imagelib.py +0 -400
- snowflake/ml/_internal/container_services/image_registry/registry_client.py +0 -212
- snowflake/ml/_internal/utils/log_stream_processor.py +0 -30
- snowflake/ml/_internal/utils/session_token_manager.py +0 -46
- snowflake/ml/_internal/utils/spcs_attribution_utils.py +0 -122
- snowflake/ml/_internal/utils/uri.py +0 -77
- snowflake/ml/data/torch_dataset.py +0 -33
- snowflake/ml/model/_api.py +0 -568
- snowflake/ml/model/_deploy_client/image_builds/base_image_builder.py +0 -12
- snowflake/ml/model/_deploy_client/image_builds/client_image_builder.py +0 -249
- snowflake/ml/model/_deploy_client/image_builds/docker_context.py +0 -130
- snowflake/ml/model/_deploy_client/image_builds/gunicorn_run.sh +0 -36
- snowflake/ml/model/_deploy_client/image_builds/inference_server/main.py +0 -268
- snowflake/ml/model/_deploy_client/image_builds/server_image_builder.py +0 -215
- snowflake/ml/model/_deploy_client/image_builds/templates/dockerfile_template +0 -53
- snowflake/ml/model/_deploy_client/image_builds/templates/image_build_job_spec_template +0 -38
- snowflake/ml/model/_deploy_client/image_builds/templates/kaniko_shell_script_template +0 -105
- snowflake/ml/model/_deploy_client/snowservice/deploy.py +0 -611
- snowflake/ml/model/_deploy_client/snowservice/deploy_options.py +0 -116
- snowflake/ml/model/_deploy_client/snowservice/instance_types.py +0 -10
- snowflake/ml/model/_deploy_client/snowservice/templates/service_spec_template +0 -28
- snowflake/ml/model/_deploy_client/snowservice/templates/service_spec_template_with_model +0 -21
- snowflake/ml/model/_deploy_client/utils/constants.py +0 -48
- snowflake/ml/model/_deploy_client/utils/snowservice_client.py +0 -280
- snowflake/ml/model/_deploy_client/warehouse/deploy.py +0 -202
- snowflake/ml/model/_deploy_client/warehouse/infer_template.py +0 -99
- snowflake/ml/model/_packager/model_handlers/llm.py +0 -267
- snowflake/ml/model/_packager/model_meta/_core_requirements.py +0 -11
- snowflake/ml/model/deploy_platforms.py +0 -6
- snowflake/ml/model/models/llm.py +0 -104
- snowflake/ml/monitoring/monitor.py +0 -203
- snowflake/ml/registry/_initial_schema.py +0 -142
- snowflake/ml/registry/_schema.py +0 -82
- snowflake/ml/registry/_schema_upgrade_plans.py +0 -116
- snowflake/ml/registry/_schema_version_manager.py +0 -163
- snowflake/ml/registry/model_registry.py +0 -2048
- snowflake_ml_python-1.6.1.dist-info/RECORD +0 -422
- {snowflake_ml_python-1.6.1.dist-info → snowflake_ml_python-1.6.3.dist-info}/LICENSE.txt +0 -0
- {snowflake_ml_python-1.6.1.dist-info → snowflake_ml_python-1.6.3.dist-info}/top_level.txt +0 -0
@@ -4,14 +4,12 @@
|
|
4
4
|
#
|
5
5
|
import inspect
|
6
6
|
import os
|
7
|
-
import
|
8
|
-
from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
|
9
|
-
from typing_extensions import TypeGuard
|
7
|
+
from typing import Iterable, Optional, Union, List, Any, Dict, Set, Tuple
|
10
8
|
from uuid import uuid4
|
11
9
|
|
12
10
|
import cloudpickle as cp
|
13
|
-
import pandas as pd
|
14
11
|
import numpy as np
|
12
|
+
import pandas as pd
|
15
13
|
from numpy import typing as npt
|
16
14
|
|
17
15
|
|
@@ -25,12 +23,11 @@ from snowflake.ml.modeling.framework.base import BaseTransformer, _process_cols
|
|
25
23
|
from snowflake.ml._internal import telemetry
|
26
24
|
from snowflake.ml._internal.exceptions import error_codes, exceptions, modeling_error_messages
|
27
25
|
from snowflake.ml._internal.env_utils import SNOWML_SPROC_ENV
|
28
|
-
from snowflake.ml._internal.utils import
|
26
|
+
from snowflake.ml._internal.utils import identifier
|
29
27
|
from snowflake.snowpark import DataFrame, Session
|
30
28
|
from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
|
31
29
|
from snowflake.ml.modeling._internal.model_trainer_builder import ModelTrainerBuilder
|
32
30
|
from snowflake.ml.modeling._internal.transformer_protocols import (
|
33
|
-
ModelTransformHandlers,
|
34
31
|
BatchInferenceKwargsTypedDict,
|
35
32
|
ScoreKwargsTypedDict
|
36
33
|
)
|
@@ -482,12 +479,23 @@ class GenericUnivariateSelect(BaseTransformer):
|
|
482
479
|
autogenerated=self._autogenerated,
|
483
480
|
subproject=_SUBPROJECT,
|
484
481
|
)
|
485
|
-
|
486
|
-
|
487
|
-
expected_output_cols_list=(
|
488
|
-
self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
|
489
|
-
),
|
482
|
+
expected_output_cols = (
|
483
|
+
self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
|
490
484
|
)
|
485
|
+
if isinstance(dataset, DataFrame):
|
486
|
+
expected_output_cols, example_output_pd_df = self._align_expected_output(
|
487
|
+
"fit_predict", dataset, expected_output_cols, output_cols_prefix
|
488
|
+
)
|
489
|
+
output_result, fitted_estimator = model_trainer.train_fit_predict(
|
490
|
+
drop_input_cols=self._drop_input_cols,
|
491
|
+
expected_output_cols_list=expected_output_cols,
|
492
|
+
example_output_pd_df=example_output_pd_df,
|
493
|
+
)
|
494
|
+
else:
|
495
|
+
output_result, fitted_estimator = model_trainer.train_fit_predict(
|
496
|
+
drop_input_cols=self._drop_input_cols,
|
497
|
+
expected_output_cols_list=expected_output_cols,
|
498
|
+
)
|
491
499
|
self._sklearn_object = fitted_estimator
|
492
500
|
self._is_fitted = True
|
493
501
|
return output_result
|
@@ -512,6 +520,7 @@ class GenericUnivariateSelect(BaseTransformer):
|
|
512
520
|
"""
|
513
521
|
self._infer_input_output_cols(dataset)
|
514
522
|
super()._check_dataset_type(dataset)
|
523
|
+
|
515
524
|
model_trainer = ModelTrainerBuilder.build_fit_transform(
|
516
525
|
estimator=self._sklearn_object,
|
517
526
|
dataset=dataset,
|
@@ -568,12 +577,41 @@ class GenericUnivariateSelect(BaseTransformer):
|
|
568
577
|
|
569
578
|
return rv
|
570
579
|
|
571
|
-
def
|
572
|
-
self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str
|
573
|
-
) -> List[str]:
|
580
|
+
def _align_expected_output(
|
581
|
+
self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str,
|
582
|
+
) -> Tuple[List[str], pd.DataFrame]:
|
583
|
+
""" Run 1 line of data with the desired method, and return one tuple that consists of the output column names
|
584
|
+
and output dataframe with 1 line.
|
585
|
+
If the method is fit_predict, run 2 lines of data.
|
586
|
+
"""
|
574
587
|
# in case the inferred output column names dimension is different
|
575
588
|
# we use one line of snowpark dataframe and put it into sklearn estimator using pandas
|
576
|
-
|
589
|
+
|
590
|
+
# For fit_predict method, a minimum of 2 is required by MinCovDet, BayesianGaussianMixture
|
591
|
+
# so change the minimum of number of rows to 2
|
592
|
+
num_examples = 2
|
593
|
+
statement_params = telemetry.get_function_usage_statement_params(
|
594
|
+
project=_PROJECT,
|
595
|
+
subproject=_SUBPROJECT,
|
596
|
+
function_name=telemetry.get_statement_params_full_func_name(
|
597
|
+
inspect.currentframe(), GenericUnivariateSelect.__class__.__name__
|
598
|
+
),
|
599
|
+
api_calls=[Session.call],
|
600
|
+
custom_tags={"autogen": True} if self._autogenerated else None,
|
601
|
+
)
|
602
|
+
if output_cols_prefix == "fit_predict_":
|
603
|
+
if hasattr(self._sklearn_object, "n_clusters"):
|
604
|
+
# cluster classes such as BisectingKMeansTest requires # of examples >= n_clusters
|
605
|
+
num_examples = self._sklearn_object.n_clusters
|
606
|
+
elif hasattr(self._sklearn_object, "min_samples"):
|
607
|
+
# OPTICS default min_samples 5, which requires at least 5 lines of data
|
608
|
+
num_examples = self._sklearn_object.min_samples
|
609
|
+
elif hasattr(self._sklearn_object, "n_neighbors") and hasattr(self._sklearn_object, "n_samples"):
|
610
|
+
# LocalOutlierFactor expects n_neighbors <= n_samples
|
611
|
+
num_examples = self._sklearn_object.n_neighbors
|
612
|
+
sample_pd_df = dataset.select(self.input_cols).limit(num_examples).to_pandas(statement_params=statement_params)
|
613
|
+
else:
|
614
|
+
sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas(statement_params=statement_params)
|
577
615
|
|
578
616
|
# Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
|
579
617
|
# seen during the fit.
|
@@ -585,12 +623,14 @@ class GenericUnivariateSelect(BaseTransformer):
|
|
585
623
|
output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
|
586
624
|
if self.sample_weight_col:
|
587
625
|
output_df_columns_set -= set(self.sample_weight_col)
|
626
|
+
|
588
627
|
# if the dimension of inferred output column names is correct; use it
|
589
628
|
if len(expected_output_cols_list) == len(output_df_columns_set):
|
590
|
-
return expected_output_cols_list
|
629
|
+
return expected_output_cols_list, output_df_pd
|
591
630
|
# otherwise, use the sklearn estimator's output
|
592
631
|
else:
|
593
|
-
|
632
|
+
expected_output_cols_list = sorted(list(output_df_columns_set), key=lambda x: output_df_columns.index(x))
|
633
|
+
return expected_output_cols_list, output_df_pd[expected_output_cols_list]
|
594
634
|
|
595
635
|
@available_if(original_estimator_has_callable("predict_proba")) # type: ignore[misc]
|
596
636
|
@telemetry.send_api_usage_telemetry(
|
@@ -636,7 +676,7 @@ class GenericUnivariateSelect(BaseTransformer):
|
|
636
676
|
drop_input_cols=self._drop_input_cols,
|
637
677
|
expected_output_cols_type="float",
|
638
678
|
)
|
639
|
-
expected_output_cols = self.
|
679
|
+
expected_output_cols, _ = self._align_expected_output(
|
640
680
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
641
681
|
)
|
642
682
|
|
@@ -702,7 +742,7 @@ class GenericUnivariateSelect(BaseTransformer):
|
|
702
742
|
drop_input_cols=self._drop_input_cols,
|
703
743
|
expected_output_cols_type="float",
|
704
744
|
)
|
705
|
-
expected_output_cols = self.
|
745
|
+
expected_output_cols, _ = self._align_expected_output(
|
706
746
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
707
747
|
)
|
708
748
|
elif isinstance(dataset, pd.DataFrame):
|
@@ -765,7 +805,7 @@ class GenericUnivariateSelect(BaseTransformer):
|
|
765
805
|
drop_input_cols=self._drop_input_cols,
|
766
806
|
expected_output_cols_type="float",
|
767
807
|
)
|
768
|
-
expected_output_cols = self.
|
808
|
+
expected_output_cols, _ = self._align_expected_output(
|
769
809
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
770
810
|
)
|
771
811
|
|
@@ -830,7 +870,7 @@ class GenericUnivariateSelect(BaseTransformer):
|
|
830
870
|
drop_input_cols = self._drop_input_cols,
|
831
871
|
expected_output_cols_type="float",
|
832
872
|
)
|
833
|
-
expected_output_cols = self.
|
873
|
+
expected_output_cols, _ = self._align_expected_output(
|
834
874
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
835
875
|
)
|
836
876
|
|
@@ -4,14 +4,12 @@
|
|
4
4
|
#
|
5
5
|
import inspect
|
6
6
|
import os
|
7
|
-
import
|
8
|
-
from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
|
9
|
-
from typing_extensions import TypeGuard
|
7
|
+
from typing import Iterable, Optional, Union, List, Any, Dict, Set, Tuple
|
10
8
|
from uuid import uuid4
|
11
9
|
|
12
10
|
import cloudpickle as cp
|
13
|
-
import pandas as pd
|
14
11
|
import numpy as np
|
12
|
+
import pandas as pd
|
15
13
|
from numpy import typing as npt
|
16
14
|
|
17
15
|
|
@@ -25,12 +23,11 @@ from snowflake.ml.modeling.framework.base import BaseTransformer, _process_cols
|
|
25
23
|
from snowflake.ml._internal import telemetry
|
26
24
|
from snowflake.ml._internal.exceptions import error_codes, exceptions, modeling_error_messages
|
27
25
|
from snowflake.ml._internal.env_utils import SNOWML_SPROC_ENV
|
28
|
-
from snowflake.ml._internal.utils import
|
26
|
+
from snowflake.ml._internal.utils import identifier
|
29
27
|
from snowflake.snowpark import DataFrame, Session
|
30
28
|
from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
|
31
29
|
from snowflake.ml.modeling._internal.model_trainer_builder import ModelTrainerBuilder
|
32
30
|
from snowflake.ml.modeling._internal.transformer_protocols import (
|
33
|
-
ModelTransformHandlers,
|
34
31
|
BatchInferenceKwargsTypedDict,
|
35
32
|
ScoreKwargsTypedDict
|
36
33
|
)
|
@@ -478,12 +475,23 @@ class SelectFdr(BaseTransformer):
|
|
478
475
|
autogenerated=self._autogenerated,
|
479
476
|
subproject=_SUBPROJECT,
|
480
477
|
)
|
481
|
-
|
482
|
-
|
483
|
-
expected_output_cols_list=(
|
484
|
-
self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
|
485
|
-
),
|
478
|
+
expected_output_cols = (
|
479
|
+
self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
|
486
480
|
)
|
481
|
+
if isinstance(dataset, DataFrame):
|
482
|
+
expected_output_cols, example_output_pd_df = self._align_expected_output(
|
483
|
+
"fit_predict", dataset, expected_output_cols, output_cols_prefix
|
484
|
+
)
|
485
|
+
output_result, fitted_estimator = model_trainer.train_fit_predict(
|
486
|
+
drop_input_cols=self._drop_input_cols,
|
487
|
+
expected_output_cols_list=expected_output_cols,
|
488
|
+
example_output_pd_df=example_output_pd_df,
|
489
|
+
)
|
490
|
+
else:
|
491
|
+
output_result, fitted_estimator = model_trainer.train_fit_predict(
|
492
|
+
drop_input_cols=self._drop_input_cols,
|
493
|
+
expected_output_cols_list=expected_output_cols,
|
494
|
+
)
|
487
495
|
self._sklearn_object = fitted_estimator
|
488
496
|
self._is_fitted = True
|
489
497
|
return output_result
|
@@ -508,6 +516,7 @@ class SelectFdr(BaseTransformer):
|
|
508
516
|
"""
|
509
517
|
self._infer_input_output_cols(dataset)
|
510
518
|
super()._check_dataset_type(dataset)
|
519
|
+
|
511
520
|
model_trainer = ModelTrainerBuilder.build_fit_transform(
|
512
521
|
estimator=self._sklearn_object,
|
513
522
|
dataset=dataset,
|
@@ -564,12 +573,41 @@ class SelectFdr(BaseTransformer):
|
|
564
573
|
|
565
574
|
return rv
|
566
575
|
|
567
|
-
def
|
568
|
-
self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str
|
569
|
-
) -> List[str]:
|
576
|
+
def _align_expected_output(
|
577
|
+
self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str,
|
578
|
+
) -> Tuple[List[str], pd.DataFrame]:
|
579
|
+
""" Run 1 line of data with the desired method, and return one tuple that consists of the output column names
|
580
|
+
and output dataframe with 1 line.
|
581
|
+
If the method is fit_predict, run 2 lines of data.
|
582
|
+
"""
|
570
583
|
# in case the inferred output column names dimension is different
|
571
584
|
# we use one line of snowpark dataframe and put it into sklearn estimator using pandas
|
572
|
-
|
585
|
+
|
586
|
+
# For fit_predict method, a minimum of 2 is required by MinCovDet, BayesianGaussianMixture
|
587
|
+
# so change the minimum of number of rows to 2
|
588
|
+
num_examples = 2
|
589
|
+
statement_params = telemetry.get_function_usage_statement_params(
|
590
|
+
project=_PROJECT,
|
591
|
+
subproject=_SUBPROJECT,
|
592
|
+
function_name=telemetry.get_statement_params_full_func_name(
|
593
|
+
inspect.currentframe(), SelectFdr.__class__.__name__
|
594
|
+
),
|
595
|
+
api_calls=[Session.call],
|
596
|
+
custom_tags={"autogen": True} if self._autogenerated else None,
|
597
|
+
)
|
598
|
+
if output_cols_prefix == "fit_predict_":
|
599
|
+
if hasattr(self._sklearn_object, "n_clusters"):
|
600
|
+
# cluster classes such as BisectingKMeansTest requires # of examples >= n_clusters
|
601
|
+
num_examples = self._sklearn_object.n_clusters
|
602
|
+
elif hasattr(self._sklearn_object, "min_samples"):
|
603
|
+
# OPTICS default min_samples 5, which requires at least 5 lines of data
|
604
|
+
num_examples = self._sklearn_object.min_samples
|
605
|
+
elif hasattr(self._sklearn_object, "n_neighbors") and hasattr(self._sklearn_object, "n_samples"):
|
606
|
+
# LocalOutlierFactor expects n_neighbors <= n_samples
|
607
|
+
num_examples = self._sklearn_object.n_neighbors
|
608
|
+
sample_pd_df = dataset.select(self.input_cols).limit(num_examples).to_pandas(statement_params=statement_params)
|
609
|
+
else:
|
610
|
+
sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas(statement_params=statement_params)
|
573
611
|
|
574
612
|
# Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
|
575
613
|
# seen during the fit.
|
@@ -581,12 +619,14 @@ class SelectFdr(BaseTransformer):
|
|
581
619
|
output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
|
582
620
|
if self.sample_weight_col:
|
583
621
|
output_df_columns_set -= set(self.sample_weight_col)
|
622
|
+
|
584
623
|
# if the dimension of inferred output column names is correct; use it
|
585
624
|
if len(expected_output_cols_list) == len(output_df_columns_set):
|
586
|
-
return expected_output_cols_list
|
625
|
+
return expected_output_cols_list, output_df_pd
|
587
626
|
# otherwise, use the sklearn estimator's output
|
588
627
|
else:
|
589
|
-
|
628
|
+
expected_output_cols_list = sorted(list(output_df_columns_set), key=lambda x: output_df_columns.index(x))
|
629
|
+
return expected_output_cols_list, output_df_pd[expected_output_cols_list]
|
590
630
|
|
591
631
|
@available_if(original_estimator_has_callable("predict_proba")) # type: ignore[misc]
|
592
632
|
@telemetry.send_api_usage_telemetry(
|
@@ -632,7 +672,7 @@ class SelectFdr(BaseTransformer):
|
|
632
672
|
drop_input_cols=self._drop_input_cols,
|
633
673
|
expected_output_cols_type="float",
|
634
674
|
)
|
635
|
-
expected_output_cols = self.
|
675
|
+
expected_output_cols, _ = self._align_expected_output(
|
636
676
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
637
677
|
)
|
638
678
|
|
@@ -698,7 +738,7 @@ class SelectFdr(BaseTransformer):
|
|
698
738
|
drop_input_cols=self._drop_input_cols,
|
699
739
|
expected_output_cols_type="float",
|
700
740
|
)
|
701
|
-
expected_output_cols = self.
|
741
|
+
expected_output_cols, _ = self._align_expected_output(
|
702
742
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
703
743
|
)
|
704
744
|
elif isinstance(dataset, pd.DataFrame):
|
@@ -761,7 +801,7 @@ class SelectFdr(BaseTransformer):
|
|
761
801
|
drop_input_cols=self._drop_input_cols,
|
762
802
|
expected_output_cols_type="float",
|
763
803
|
)
|
764
|
-
expected_output_cols = self.
|
804
|
+
expected_output_cols, _ = self._align_expected_output(
|
765
805
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
766
806
|
)
|
767
807
|
|
@@ -826,7 +866,7 @@ class SelectFdr(BaseTransformer):
|
|
826
866
|
drop_input_cols = self._drop_input_cols,
|
827
867
|
expected_output_cols_type="float",
|
828
868
|
)
|
829
|
-
expected_output_cols = self.
|
869
|
+
expected_output_cols, _ = self._align_expected_output(
|
830
870
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
831
871
|
)
|
832
872
|
|
@@ -4,14 +4,12 @@
|
|
4
4
|
#
|
5
5
|
import inspect
|
6
6
|
import os
|
7
|
-
import
|
8
|
-
from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
|
9
|
-
from typing_extensions import TypeGuard
|
7
|
+
from typing import Iterable, Optional, Union, List, Any, Dict, Set, Tuple
|
10
8
|
from uuid import uuid4
|
11
9
|
|
12
10
|
import cloudpickle as cp
|
13
|
-
import pandas as pd
|
14
11
|
import numpy as np
|
12
|
+
import pandas as pd
|
15
13
|
from numpy import typing as npt
|
16
14
|
|
17
15
|
|
@@ -25,12 +23,11 @@ from snowflake.ml.modeling.framework.base import BaseTransformer, _process_cols
|
|
25
23
|
from snowflake.ml._internal import telemetry
|
26
24
|
from snowflake.ml._internal.exceptions import error_codes, exceptions, modeling_error_messages
|
27
25
|
from snowflake.ml._internal.env_utils import SNOWML_SPROC_ENV
|
28
|
-
from snowflake.ml._internal.utils import
|
26
|
+
from snowflake.ml._internal.utils import identifier
|
29
27
|
from snowflake.snowpark import DataFrame, Session
|
30
28
|
from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
|
31
29
|
from snowflake.ml.modeling._internal.model_trainer_builder import ModelTrainerBuilder
|
32
30
|
from snowflake.ml.modeling._internal.transformer_protocols import (
|
33
|
-
ModelTransformHandlers,
|
34
31
|
BatchInferenceKwargsTypedDict,
|
35
32
|
ScoreKwargsTypedDict
|
36
33
|
)
|
@@ -478,12 +475,23 @@ class SelectFpr(BaseTransformer):
|
|
478
475
|
autogenerated=self._autogenerated,
|
479
476
|
subproject=_SUBPROJECT,
|
480
477
|
)
|
481
|
-
|
482
|
-
|
483
|
-
expected_output_cols_list=(
|
484
|
-
self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
|
485
|
-
),
|
478
|
+
expected_output_cols = (
|
479
|
+
self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
|
486
480
|
)
|
481
|
+
if isinstance(dataset, DataFrame):
|
482
|
+
expected_output_cols, example_output_pd_df = self._align_expected_output(
|
483
|
+
"fit_predict", dataset, expected_output_cols, output_cols_prefix
|
484
|
+
)
|
485
|
+
output_result, fitted_estimator = model_trainer.train_fit_predict(
|
486
|
+
drop_input_cols=self._drop_input_cols,
|
487
|
+
expected_output_cols_list=expected_output_cols,
|
488
|
+
example_output_pd_df=example_output_pd_df,
|
489
|
+
)
|
490
|
+
else:
|
491
|
+
output_result, fitted_estimator = model_trainer.train_fit_predict(
|
492
|
+
drop_input_cols=self._drop_input_cols,
|
493
|
+
expected_output_cols_list=expected_output_cols,
|
494
|
+
)
|
487
495
|
self._sklearn_object = fitted_estimator
|
488
496
|
self._is_fitted = True
|
489
497
|
return output_result
|
@@ -508,6 +516,7 @@ class SelectFpr(BaseTransformer):
|
|
508
516
|
"""
|
509
517
|
self._infer_input_output_cols(dataset)
|
510
518
|
super()._check_dataset_type(dataset)
|
519
|
+
|
511
520
|
model_trainer = ModelTrainerBuilder.build_fit_transform(
|
512
521
|
estimator=self._sklearn_object,
|
513
522
|
dataset=dataset,
|
@@ -564,12 +573,41 @@ class SelectFpr(BaseTransformer):
|
|
564
573
|
|
565
574
|
return rv
|
566
575
|
|
567
|
-
def
|
568
|
-
self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str
|
569
|
-
) -> List[str]:
|
576
|
+
def _align_expected_output(
|
577
|
+
self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str,
|
578
|
+
) -> Tuple[List[str], pd.DataFrame]:
|
579
|
+
""" Run 1 line of data with the desired method, and return one tuple that consists of the output column names
|
580
|
+
and output dataframe with 1 line.
|
581
|
+
If the method is fit_predict, run 2 lines of data.
|
582
|
+
"""
|
570
583
|
# in case the inferred output column names dimension is different
|
571
584
|
# we use one line of snowpark dataframe and put it into sklearn estimator using pandas
|
572
|
-
|
585
|
+
|
586
|
+
# For fit_predict method, a minimum of 2 is required by MinCovDet, BayesianGaussianMixture
|
587
|
+
# so change the minimum of number of rows to 2
|
588
|
+
num_examples = 2
|
589
|
+
statement_params = telemetry.get_function_usage_statement_params(
|
590
|
+
project=_PROJECT,
|
591
|
+
subproject=_SUBPROJECT,
|
592
|
+
function_name=telemetry.get_statement_params_full_func_name(
|
593
|
+
inspect.currentframe(), SelectFpr.__class__.__name__
|
594
|
+
),
|
595
|
+
api_calls=[Session.call],
|
596
|
+
custom_tags={"autogen": True} if self._autogenerated else None,
|
597
|
+
)
|
598
|
+
if output_cols_prefix == "fit_predict_":
|
599
|
+
if hasattr(self._sklearn_object, "n_clusters"):
|
600
|
+
# cluster classes such as BisectingKMeansTest requires # of examples >= n_clusters
|
601
|
+
num_examples = self._sklearn_object.n_clusters
|
602
|
+
elif hasattr(self._sklearn_object, "min_samples"):
|
603
|
+
# OPTICS default min_samples 5, which requires at least 5 lines of data
|
604
|
+
num_examples = self._sklearn_object.min_samples
|
605
|
+
elif hasattr(self._sklearn_object, "n_neighbors") and hasattr(self._sklearn_object, "n_samples"):
|
606
|
+
# LocalOutlierFactor expects n_neighbors <= n_samples
|
607
|
+
num_examples = self._sklearn_object.n_neighbors
|
608
|
+
sample_pd_df = dataset.select(self.input_cols).limit(num_examples).to_pandas(statement_params=statement_params)
|
609
|
+
else:
|
610
|
+
sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas(statement_params=statement_params)
|
573
611
|
|
574
612
|
# Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
|
575
613
|
# seen during the fit.
|
@@ -581,12 +619,14 @@ class SelectFpr(BaseTransformer):
|
|
581
619
|
output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
|
582
620
|
if self.sample_weight_col:
|
583
621
|
output_df_columns_set -= set(self.sample_weight_col)
|
622
|
+
|
584
623
|
# if the dimension of inferred output column names is correct; use it
|
585
624
|
if len(expected_output_cols_list) == len(output_df_columns_set):
|
586
|
-
return expected_output_cols_list
|
625
|
+
return expected_output_cols_list, output_df_pd
|
587
626
|
# otherwise, use the sklearn estimator's output
|
588
627
|
else:
|
589
|
-
|
628
|
+
expected_output_cols_list = sorted(list(output_df_columns_set), key=lambda x: output_df_columns.index(x))
|
629
|
+
return expected_output_cols_list, output_df_pd[expected_output_cols_list]
|
590
630
|
|
591
631
|
@available_if(original_estimator_has_callable("predict_proba")) # type: ignore[misc]
|
592
632
|
@telemetry.send_api_usage_telemetry(
|
@@ -632,7 +672,7 @@ class SelectFpr(BaseTransformer):
|
|
632
672
|
drop_input_cols=self._drop_input_cols,
|
633
673
|
expected_output_cols_type="float",
|
634
674
|
)
|
635
|
-
expected_output_cols = self.
|
675
|
+
expected_output_cols, _ = self._align_expected_output(
|
636
676
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
637
677
|
)
|
638
678
|
|
@@ -698,7 +738,7 @@ class SelectFpr(BaseTransformer):
|
|
698
738
|
drop_input_cols=self._drop_input_cols,
|
699
739
|
expected_output_cols_type="float",
|
700
740
|
)
|
701
|
-
expected_output_cols = self.
|
741
|
+
expected_output_cols, _ = self._align_expected_output(
|
702
742
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
703
743
|
)
|
704
744
|
elif isinstance(dataset, pd.DataFrame):
|
@@ -761,7 +801,7 @@ class SelectFpr(BaseTransformer):
|
|
761
801
|
drop_input_cols=self._drop_input_cols,
|
762
802
|
expected_output_cols_type="float",
|
763
803
|
)
|
764
|
-
expected_output_cols = self.
|
804
|
+
expected_output_cols, _ = self._align_expected_output(
|
765
805
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
766
806
|
)
|
767
807
|
|
@@ -826,7 +866,7 @@ class SelectFpr(BaseTransformer):
|
|
826
866
|
drop_input_cols = self._drop_input_cols,
|
827
867
|
expected_output_cols_type="float",
|
828
868
|
)
|
829
|
-
expected_output_cols = self.
|
869
|
+
expected_output_cols, _ = self._align_expected_output(
|
830
870
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
831
871
|
)
|
832
872
|
|
@@ -4,14 +4,12 @@
|
|
4
4
|
#
|
5
5
|
import inspect
|
6
6
|
import os
|
7
|
-
import
|
8
|
-
from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
|
9
|
-
from typing_extensions import TypeGuard
|
7
|
+
from typing import Iterable, Optional, Union, List, Any, Dict, Set, Tuple
|
10
8
|
from uuid import uuid4
|
11
9
|
|
12
10
|
import cloudpickle as cp
|
13
|
-
import pandas as pd
|
14
11
|
import numpy as np
|
12
|
+
import pandas as pd
|
15
13
|
from numpy import typing as npt
|
16
14
|
|
17
15
|
|
@@ -25,12 +23,11 @@ from snowflake.ml.modeling.framework.base import BaseTransformer, _process_cols
|
|
25
23
|
from snowflake.ml._internal import telemetry
|
26
24
|
from snowflake.ml._internal.exceptions import error_codes, exceptions, modeling_error_messages
|
27
25
|
from snowflake.ml._internal.env_utils import SNOWML_SPROC_ENV
|
28
|
-
from snowflake.ml._internal.utils import
|
26
|
+
from snowflake.ml._internal.utils import identifier
|
29
27
|
from snowflake.snowpark import DataFrame, Session
|
30
28
|
from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
|
31
29
|
from snowflake.ml.modeling._internal.model_trainer_builder import ModelTrainerBuilder
|
32
30
|
from snowflake.ml.modeling._internal.transformer_protocols import (
|
33
|
-
ModelTransformHandlers,
|
34
31
|
BatchInferenceKwargsTypedDict,
|
35
32
|
ScoreKwargsTypedDict
|
36
33
|
)
|
@@ -478,12 +475,23 @@ class SelectFwe(BaseTransformer):
|
|
478
475
|
autogenerated=self._autogenerated,
|
479
476
|
subproject=_SUBPROJECT,
|
480
477
|
)
|
481
|
-
|
482
|
-
|
483
|
-
expected_output_cols_list=(
|
484
|
-
self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
|
485
|
-
),
|
478
|
+
expected_output_cols = (
|
479
|
+
self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
|
486
480
|
)
|
481
|
+
if isinstance(dataset, DataFrame):
|
482
|
+
expected_output_cols, example_output_pd_df = self._align_expected_output(
|
483
|
+
"fit_predict", dataset, expected_output_cols, output_cols_prefix
|
484
|
+
)
|
485
|
+
output_result, fitted_estimator = model_trainer.train_fit_predict(
|
486
|
+
drop_input_cols=self._drop_input_cols,
|
487
|
+
expected_output_cols_list=expected_output_cols,
|
488
|
+
example_output_pd_df=example_output_pd_df,
|
489
|
+
)
|
490
|
+
else:
|
491
|
+
output_result, fitted_estimator = model_trainer.train_fit_predict(
|
492
|
+
drop_input_cols=self._drop_input_cols,
|
493
|
+
expected_output_cols_list=expected_output_cols,
|
494
|
+
)
|
487
495
|
self._sklearn_object = fitted_estimator
|
488
496
|
self._is_fitted = True
|
489
497
|
return output_result
|
@@ -508,6 +516,7 @@ class SelectFwe(BaseTransformer):
|
|
508
516
|
"""
|
509
517
|
self._infer_input_output_cols(dataset)
|
510
518
|
super()._check_dataset_type(dataset)
|
519
|
+
|
511
520
|
model_trainer = ModelTrainerBuilder.build_fit_transform(
|
512
521
|
estimator=self._sklearn_object,
|
513
522
|
dataset=dataset,
|
@@ -564,12 +573,41 @@ class SelectFwe(BaseTransformer):
|
|
564
573
|
|
565
574
|
return rv
|
566
575
|
|
567
|
-
def
|
568
|
-
self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str
|
569
|
-
) -> List[str]:
|
576
|
+
def _align_expected_output(
|
577
|
+
self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str,
|
578
|
+
) -> Tuple[List[str], pd.DataFrame]:
|
579
|
+
""" Run 1 line of data with the desired method, and return one tuple that consists of the output column names
|
580
|
+
and output dataframe with 1 line.
|
581
|
+
If the method is fit_predict, run 2 lines of data.
|
582
|
+
"""
|
570
583
|
# in case the inferred output column names dimension is different
|
571
584
|
# we use one line of snowpark dataframe and put it into sklearn estimator using pandas
|
572
|
-
|
585
|
+
|
586
|
+
# For fit_predict method, a minimum of 2 is required by MinCovDet, BayesianGaussianMixture
|
587
|
+
# so change the minimum of number of rows to 2
|
588
|
+
num_examples = 2
|
589
|
+
statement_params = telemetry.get_function_usage_statement_params(
|
590
|
+
project=_PROJECT,
|
591
|
+
subproject=_SUBPROJECT,
|
592
|
+
function_name=telemetry.get_statement_params_full_func_name(
|
593
|
+
inspect.currentframe(), SelectFwe.__class__.__name__
|
594
|
+
),
|
595
|
+
api_calls=[Session.call],
|
596
|
+
custom_tags={"autogen": True} if self._autogenerated else None,
|
597
|
+
)
|
598
|
+
if output_cols_prefix == "fit_predict_":
|
599
|
+
if hasattr(self._sklearn_object, "n_clusters"):
|
600
|
+
# cluster classes such as BisectingKMeansTest requires # of examples >= n_clusters
|
601
|
+
num_examples = self._sklearn_object.n_clusters
|
602
|
+
elif hasattr(self._sklearn_object, "min_samples"):
|
603
|
+
# OPTICS default min_samples 5, which requires at least 5 lines of data
|
604
|
+
num_examples = self._sklearn_object.min_samples
|
605
|
+
elif hasattr(self._sklearn_object, "n_neighbors") and hasattr(self._sklearn_object, "n_samples"):
|
606
|
+
# LocalOutlierFactor expects n_neighbors <= n_samples
|
607
|
+
num_examples = self._sklearn_object.n_neighbors
|
608
|
+
sample_pd_df = dataset.select(self.input_cols).limit(num_examples).to_pandas(statement_params=statement_params)
|
609
|
+
else:
|
610
|
+
sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas(statement_params=statement_params)
|
573
611
|
|
574
612
|
# Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
|
575
613
|
# seen during the fit.
|
@@ -581,12 +619,14 @@ class SelectFwe(BaseTransformer):
|
|
581
619
|
output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
|
582
620
|
if self.sample_weight_col:
|
583
621
|
output_df_columns_set -= set(self.sample_weight_col)
|
622
|
+
|
584
623
|
# if the dimension of inferred output column names is correct; use it
|
585
624
|
if len(expected_output_cols_list) == len(output_df_columns_set):
|
586
|
-
return expected_output_cols_list
|
625
|
+
return expected_output_cols_list, output_df_pd
|
587
626
|
# otherwise, use the sklearn estimator's output
|
588
627
|
else:
|
589
|
-
|
628
|
+
expected_output_cols_list = sorted(list(output_df_columns_set), key=lambda x: output_df_columns.index(x))
|
629
|
+
return expected_output_cols_list, output_df_pd[expected_output_cols_list]
|
590
630
|
|
591
631
|
@available_if(original_estimator_has_callable("predict_proba")) # type: ignore[misc]
|
592
632
|
@telemetry.send_api_usage_telemetry(
|
@@ -632,7 +672,7 @@ class SelectFwe(BaseTransformer):
|
|
632
672
|
drop_input_cols=self._drop_input_cols,
|
633
673
|
expected_output_cols_type="float",
|
634
674
|
)
|
635
|
-
expected_output_cols = self.
|
675
|
+
expected_output_cols, _ = self._align_expected_output(
|
636
676
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
637
677
|
)
|
638
678
|
|
@@ -698,7 +738,7 @@ class SelectFwe(BaseTransformer):
|
|
698
738
|
drop_input_cols=self._drop_input_cols,
|
699
739
|
expected_output_cols_type="float",
|
700
740
|
)
|
701
|
-
expected_output_cols = self.
|
741
|
+
expected_output_cols, _ = self._align_expected_output(
|
702
742
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
703
743
|
)
|
704
744
|
elif isinstance(dataset, pd.DataFrame):
|
@@ -761,7 +801,7 @@ class SelectFwe(BaseTransformer):
|
|
761
801
|
drop_input_cols=self._drop_input_cols,
|
762
802
|
expected_output_cols_type="float",
|
763
803
|
)
|
764
|
-
expected_output_cols = self.
|
804
|
+
expected_output_cols, _ = self._align_expected_output(
|
765
805
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
766
806
|
)
|
767
807
|
|
@@ -826,7 +866,7 @@ class SelectFwe(BaseTransformer):
|
|
826
866
|
drop_input_cols = self._drop_input_cols,
|
827
867
|
expected_output_cols_type="float",
|
828
868
|
)
|
829
|
-
expected_output_cols = self.
|
869
|
+
expected_output_cols, _ = self._align_expected_output(
|
830
870
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
831
871
|
)
|
832
872
|
|