snowflake-ml-python 1.6.1__py3-none-any.whl → 1.6.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- snowflake/cortex/__init__.py +4 -0
- snowflake/cortex/_classify_text.py +2 -2
- snowflake/cortex/_embed_text_1024.py +37 -0
- snowflake/cortex/_embed_text_768.py +37 -0
- snowflake/cortex/_extract_answer.py +2 -2
- snowflake/cortex/_sentiment.py +2 -2
- snowflake/cortex/_summarize.py +2 -2
- snowflake/cortex/_translate.py +2 -2
- snowflake/cortex/_util.py +4 -4
- snowflake/ml/_internal/env_utils.py +5 -5
- snowflake/ml/_internal/exceptions/error_codes.py +2 -0
- snowflake/ml/_internal/telemetry.py +142 -20
- snowflake/ml/_internal/utils/db_utils.py +50 -0
- snowflake/ml/_internal/utils/identifier.py +48 -11
- snowflake/ml/_internal/utils/service_logger.py +63 -0
- snowflake/ml/_internal/utils/snowflake_env.py +23 -13
- snowflake/ml/_internal/utils/sql_identifier.py +26 -2
- snowflake/ml/_internal/utils/table_manager.py +19 -1
- snowflake/ml/data/_internal/arrow_ingestor.py +1 -11
- snowflake/ml/data/data_connector.py +33 -7
- snowflake/ml/data/ingestor_utils.py +20 -10
- snowflake/ml/data/torch_utils.py +68 -0
- snowflake/ml/dataset/dataset.py +1 -3
- snowflake/ml/feature_store/access_manager.py +3 -3
- snowflake/ml/feature_store/feature_store.py +60 -19
- snowflake/ml/feature_store/feature_view.py +84 -30
- snowflake/ml/fileset/embedded_stage_fs.py +1 -1
- snowflake/ml/fileset/fileset.py +1 -1
- snowflake/ml/fileset/sfcfs.py +9 -3
- snowflake/ml/fileset/stage_fs.py +2 -1
- snowflake/ml/lineage/lineage_node.py +7 -2
- snowflake/ml/model/__init__.py +1 -2
- snowflake/ml/model/_client/model/model_version_impl.py +96 -12
- snowflake/ml/model/_client/ops/model_ops.py +124 -6
- snowflake/ml/model/_client/ops/service_ops.py +309 -9
- snowflake/ml/model/_client/service/model_deployment_spec.py +8 -5
- snowflake/ml/model/_client/service/model_deployment_spec_schema.py +2 -2
- snowflake/ml/model/_client/sql/_base.py +5 -0
- snowflake/ml/model/_client/sql/model.py +1 -0
- snowflake/ml/model/_client/sql/model_version.py +9 -5
- snowflake/ml/model/_client/sql/service.py +121 -20
- snowflake/ml/model/_model_composer/model_composer.py +11 -39
- snowflake/ml/model/_model_composer/model_manifest/model_manifest.py +31 -11
- snowflake/ml/model/_packager/model_env/model_env.py +4 -38
- snowflake/ml/model/_packager/model_handlers/_utils.py +134 -28
- snowflake/ml/model/_packager/model_handlers/catboost.py +31 -30
- snowflake/ml/model/_packager/model_handlers/huggingface_pipeline.py +26 -18
- snowflake/ml/model/_packager/model_handlers/lightgbm.py +31 -58
- snowflake/ml/model/_packager/model_handlers/mlflow.py +3 -5
- snowflake/ml/model/_packager/model_handlers/model_objective_utils.py +169 -0
- snowflake/ml/model/_packager/model_handlers/sentence_transformers.py +15 -8
- snowflake/ml/model/_packager/model_handlers/sklearn.py +56 -60
- snowflake/ml/model/_packager/model_handlers/snowmlmodel.py +141 -9
- snowflake/ml/model/_packager/model_handlers/torchscript.py +2 -2
- snowflake/ml/model/_packager/model_handlers/xgboost.py +63 -48
- snowflake/ml/model/_packager/model_meta/model_meta.py +16 -42
- snowflake/ml/model/_packager/model_meta/model_meta_schema.py +1 -14
- snowflake/ml/model/_packager/model_packager.py +14 -8
- snowflake/ml/model/_packager/model_runtime/model_runtime.py +11 -0
- snowflake/ml/model/_signatures/pytorch_handler.py +1 -1
- snowflake/ml/model/_signatures/snowpark_handler.py +3 -2
- snowflake/ml/model/_signatures/utils.py +9 -0
- snowflake/ml/model/type_hints.py +12 -145
- snowflake/ml/modeling/_internal/constants.py +1 -0
- snowflake/ml/modeling/_internal/local_implementations/pandas_handlers.py +5 -5
- snowflake/ml/modeling/_internal/local_implementations/pandas_trainer.py +9 -6
- snowflake/ml/modeling/_internal/model_specifications.py +2 -0
- snowflake/ml/modeling/_internal/model_trainer.py +1 -0
- snowflake/ml/modeling/_internal/snowpark_implementations/distributed_hpo_trainer.py +2 -4
- snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_handlers.py +5 -5
- snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_trainer.py +130 -166
- snowflake/ml/modeling/_internal/snowpark_implementations/xgboost_external_memory_trainer.py +0 -1
- snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +61 -21
- snowflake/ml/modeling/cluster/affinity_propagation.py +61 -21
- snowflake/ml/modeling/cluster/agglomerative_clustering.py +61 -21
- snowflake/ml/modeling/cluster/birch.py +61 -21
- snowflake/ml/modeling/cluster/bisecting_k_means.py +61 -21
- snowflake/ml/modeling/cluster/dbscan.py +61 -21
- snowflake/ml/modeling/cluster/feature_agglomeration.py +61 -21
- snowflake/ml/modeling/cluster/k_means.py +61 -21
- snowflake/ml/modeling/cluster/mean_shift.py +61 -21
- snowflake/ml/modeling/cluster/mini_batch_k_means.py +61 -21
- snowflake/ml/modeling/cluster/optics.py +61 -21
- snowflake/ml/modeling/cluster/spectral_biclustering.py +61 -21
- snowflake/ml/modeling/cluster/spectral_clustering.py +61 -21
- snowflake/ml/modeling/cluster/spectral_coclustering.py +61 -21
- snowflake/ml/modeling/compose/column_transformer.py +61 -21
- snowflake/ml/modeling/compose/transformed_target_regressor.py +61 -21
- snowflake/ml/modeling/covariance/elliptic_envelope.py +61 -21
- snowflake/ml/modeling/covariance/empirical_covariance.py +61 -21
- snowflake/ml/modeling/covariance/graphical_lasso.py +61 -21
- snowflake/ml/modeling/covariance/graphical_lasso_cv.py +61 -21
- snowflake/ml/modeling/covariance/ledoit_wolf.py +61 -21
- snowflake/ml/modeling/covariance/min_cov_det.py +61 -21
- snowflake/ml/modeling/covariance/oas.py +61 -21
- snowflake/ml/modeling/covariance/shrunk_covariance.py +61 -21
- snowflake/ml/modeling/decomposition/dictionary_learning.py +61 -21
- snowflake/ml/modeling/decomposition/factor_analysis.py +61 -21
- snowflake/ml/modeling/decomposition/fast_ica.py +61 -21
- snowflake/ml/modeling/decomposition/incremental_pca.py +61 -21
- snowflake/ml/modeling/decomposition/kernel_pca.py +61 -21
- snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +61 -21
- snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +61 -21
- snowflake/ml/modeling/decomposition/pca.py +61 -21
- snowflake/ml/modeling/decomposition/sparse_pca.py +61 -21
- snowflake/ml/modeling/decomposition/truncated_svd.py +61 -21
- snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +61 -21
- snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +61 -21
- snowflake/ml/modeling/ensemble/ada_boost_classifier.py +61 -21
- snowflake/ml/modeling/ensemble/ada_boost_regressor.py +61 -21
- snowflake/ml/modeling/ensemble/bagging_classifier.py +61 -21
- snowflake/ml/modeling/ensemble/bagging_regressor.py +61 -21
- snowflake/ml/modeling/ensemble/extra_trees_classifier.py +61 -21
- snowflake/ml/modeling/ensemble/extra_trees_regressor.py +61 -21
- snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +61 -21
- snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +61 -21
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +61 -21
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +61 -21
- snowflake/ml/modeling/ensemble/isolation_forest.py +61 -21
- snowflake/ml/modeling/ensemble/random_forest_classifier.py +61 -21
- snowflake/ml/modeling/ensemble/random_forest_regressor.py +61 -21
- snowflake/ml/modeling/ensemble/stacking_regressor.py +61 -21
- snowflake/ml/modeling/ensemble/voting_classifier.py +61 -21
- snowflake/ml/modeling/ensemble/voting_regressor.py +61 -21
- snowflake/ml/modeling/feature_selection/generic_univariate_select.py +61 -21
- snowflake/ml/modeling/feature_selection/select_fdr.py +61 -21
- snowflake/ml/modeling/feature_selection/select_fpr.py +61 -21
- snowflake/ml/modeling/feature_selection/select_fwe.py +61 -21
- snowflake/ml/modeling/feature_selection/select_k_best.py +61 -21
- snowflake/ml/modeling/feature_selection/select_percentile.py +61 -21
- snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +61 -21
- snowflake/ml/modeling/feature_selection/variance_threshold.py +61 -21
- snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +61 -21
- snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +61 -21
- snowflake/ml/modeling/impute/iterative_imputer.py +61 -21
- snowflake/ml/modeling/impute/knn_imputer.py +61 -21
- snowflake/ml/modeling/impute/missing_indicator.py +61 -21
- snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +61 -21
- snowflake/ml/modeling/kernel_approximation/nystroem.py +61 -21
- snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +61 -21
- snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +61 -21
- snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +61 -21
- snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +61 -21
- snowflake/ml/modeling/lightgbm/lgbm_classifier.py +61 -21
- snowflake/ml/modeling/lightgbm/lgbm_regressor.py +61 -21
- snowflake/ml/modeling/linear_model/ard_regression.py +61 -21
- snowflake/ml/modeling/linear_model/bayesian_ridge.py +61 -21
- snowflake/ml/modeling/linear_model/elastic_net.py +61 -21
- snowflake/ml/modeling/linear_model/elastic_net_cv.py +61 -21
- snowflake/ml/modeling/linear_model/gamma_regressor.py +61 -21
- snowflake/ml/modeling/linear_model/huber_regressor.py +61 -21
- snowflake/ml/modeling/linear_model/lars.py +61 -21
- snowflake/ml/modeling/linear_model/lars_cv.py +61 -21
- snowflake/ml/modeling/linear_model/lasso.py +61 -21
- snowflake/ml/modeling/linear_model/lasso_cv.py +61 -21
- snowflake/ml/modeling/linear_model/lasso_lars.py +61 -21
- snowflake/ml/modeling/linear_model/lasso_lars_cv.py +61 -21
- snowflake/ml/modeling/linear_model/lasso_lars_ic.py +61 -21
- snowflake/ml/modeling/linear_model/linear_regression.py +61 -21
- snowflake/ml/modeling/linear_model/logistic_regression.py +61 -21
- snowflake/ml/modeling/linear_model/logistic_regression_cv.py +61 -21
- snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +61 -21
- snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +61 -21
- snowflake/ml/modeling/linear_model/multi_task_lasso.py +61 -21
- snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +61 -21
- snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +61 -21
- snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +61 -21
- snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +61 -21
- snowflake/ml/modeling/linear_model/perceptron.py +61 -21
- snowflake/ml/modeling/linear_model/poisson_regressor.py +61 -21
- snowflake/ml/modeling/linear_model/ransac_regressor.py +61 -21
- snowflake/ml/modeling/linear_model/ridge.py +61 -21
- snowflake/ml/modeling/linear_model/ridge_classifier.py +61 -21
- snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +61 -21
- snowflake/ml/modeling/linear_model/ridge_cv.py +61 -21
- snowflake/ml/modeling/linear_model/sgd_classifier.py +61 -21
- snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +61 -21
- snowflake/ml/modeling/linear_model/sgd_regressor.py +61 -21
- snowflake/ml/modeling/linear_model/theil_sen_regressor.py +61 -21
- snowflake/ml/modeling/linear_model/tweedie_regressor.py +61 -21
- snowflake/ml/modeling/manifold/isomap.py +61 -21
- snowflake/ml/modeling/manifold/mds.py +61 -21
- snowflake/ml/modeling/manifold/spectral_embedding.py +61 -21
- snowflake/ml/modeling/manifold/tsne.py +61 -21
- snowflake/ml/modeling/metrics/metrics_utils.py +2 -2
- snowflake/ml/modeling/metrics/ranking.py +0 -3
- snowflake/ml/modeling/metrics/regression.py +0 -3
- snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +61 -21
- snowflake/ml/modeling/mixture/gaussian_mixture.py +61 -21
- snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +61 -21
- snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +61 -21
- snowflake/ml/modeling/multiclass/output_code_classifier.py +61 -21
- snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +61 -21
- snowflake/ml/modeling/naive_bayes/categorical_nb.py +61 -21
- snowflake/ml/modeling/naive_bayes/complement_nb.py +61 -21
- snowflake/ml/modeling/naive_bayes/gaussian_nb.py +61 -21
- snowflake/ml/modeling/naive_bayes/multinomial_nb.py +61 -21
- snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +61 -21
- snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +61 -21
- snowflake/ml/modeling/neighbors/kernel_density.py +61 -21
- snowflake/ml/modeling/neighbors/local_outlier_factor.py +61 -21
- snowflake/ml/modeling/neighbors/nearest_centroid.py +61 -21
- snowflake/ml/modeling/neighbors/nearest_neighbors.py +61 -21
- snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +61 -21
- snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +61 -21
- snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +61 -21
- snowflake/ml/modeling/neural_network/bernoulli_rbm.py +61 -21
- snowflake/ml/modeling/neural_network/mlp_classifier.py +61 -21
- snowflake/ml/modeling/neural_network/mlp_regressor.py +61 -21
- snowflake/ml/modeling/parameters/disable_model_tracer.py +5 -0
- snowflake/ml/modeling/pipeline/pipeline.py +1 -13
- snowflake/ml/modeling/preprocessing/polynomial_features.py +61 -21
- snowflake/ml/modeling/semi_supervised/label_propagation.py +61 -21
- snowflake/ml/modeling/semi_supervised/label_spreading.py +61 -21
- snowflake/ml/modeling/svm/linear_svc.py +61 -21
- snowflake/ml/modeling/svm/linear_svr.py +61 -21
- snowflake/ml/modeling/svm/nu_svc.py +61 -21
- snowflake/ml/modeling/svm/nu_svr.py +61 -21
- snowflake/ml/modeling/svm/svc.py +61 -21
- snowflake/ml/modeling/svm/svr.py +61 -21
- snowflake/ml/modeling/tree/decision_tree_classifier.py +61 -21
- snowflake/ml/modeling/tree/decision_tree_regressor.py +61 -21
- snowflake/ml/modeling/tree/extra_tree_classifier.py +61 -21
- snowflake/ml/modeling/tree/extra_tree_regressor.py +61 -21
- snowflake/ml/modeling/xgboost/xgb_classifier.py +64 -23
- snowflake/ml/modeling/xgboost/xgb_regressor.py +64 -23
- snowflake/ml/modeling/xgboost/xgbrf_classifier.py +64 -23
- snowflake/ml/modeling/xgboost/xgbrf_regressor.py +64 -23
- snowflake/ml/monitoring/_client/model_monitor.py +126 -0
- snowflake/ml/monitoring/_client/model_monitor_manager.py +361 -0
- snowflake/ml/monitoring/_client/model_monitor_version.py +1 -0
- snowflake/ml/monitoring/_client/monitor_sql_client.py +1335 -0
- snowflake/ml/monitoring/_client/queries/record_count.ssql +14 -0
- snowflake/ml/monitoring/_client/queries/rmse.ssql +28 -0
- snowflake/ml/monitoring/entities/model_monitor_config.py +28 -0
- snowflake/ml/monitoring/entities/model_monitor_interval.py +46 -0
- snowflake/ml/monitoring/entities/output_score_type.py +90 -0
- snowflake/ml/registry/_manager/model_manager.py +4 -0
- snowflake/ml/registry/registry.py +166 -8
- snowflake/ml/version.py +1 -1
- {snowflake_ml_python-1.6.1.dist-info → snowflake_ml_python-1.6.3.dist-info}/METADATA +43 -9
- snowflake_ml_python-1.6.3.dist-info/RECORD +400 -0
- {snowflake_ml_python-1.6.1.dist-info → snowflake_ml_python-1.6.3.dist-info}/WHEEL +1 -1
- snowflake/ml/_internal/container_services/image_registry/credential.py +0 -84
- snowflake/ml/_internal/container_services/image_registry/http_client.py +0 -127
- snowflake/ml/_internal/container_services/image_registry/imagelib.py +0 -400
- snowflake/ml/_internal/container_services/image_registry/registry_client.py +0 -212
- snowflake/ml/_internal/utils/log_stream_processor.py +0 -30
- snowflake/ml/_internal/utils/session_token_manager.py +0 -46
- snowflake/ml/_internal/utils/spcs_attribution_utils.py +0 -122
- snowflake/ml/_internal/utils/uri.py +0 -77
- snowflake/ml/data/torch_dataset.py +0 -33
- snowflake/ml/model/_api.py +0 -568
- snowflake/ml/model/_deploy_client/image_builds/base_image_builder.py +0 -12
- snowflake/ml/model/_deploy_client/image_builds/client_image_builder.py +0 -249
- snowflake/ml/model/_deploy_client/image_builds/docker_context.py +0 -130
- snowflake/ml/model/_deploy_client/image_builds/gunicorn_run.sh +0 -36
- snowflake/ml/model/_deploy_client/image_builds/inference_server/main.py +0 -268
- snowflake/ml/model/_deploy_client/image_builds/server_image_builder.py +0 -215
- snowflake/ml/model/_deploy_client/image_builds/templates/dockerfile_template +0 -53
- snowflake/ml/model/_deploy_client/image_builds/templates/image_build_job_spec_template +0 -38
- snowflake/ml/model/_deploy_client/image_builds/templates/kaniko_shell_script_template +0 -105
- snowflake/ml/model/_deploy_client/snowservice/deploy.py +0 -611
- snowflake/ml/model/_deploy_client/snowservice/deploy_options.py +0 -116
- snowflake/ml/model/_deploy_client/snowservice/instance_types.py +0 -10
- snowflake/ml/model/_deploy_client/snowservice/templates/service_spec_template +0 -28
- snowflake/ml/model/_deploy_client/snowservice/templates/service_spec_template_with_model +0 -21
- snowflake/ml/model/_deploy_client/utils/constants.py +0 -48
- snowflake/ml/model/_deploy_client/utils/snowservice_client.py +0 -280
- snowflake/ml/model/_deploy_client/warehouse/deploy.py +0 -202
- snowflake/ml/model/_deploy_client/warehouse/infer_template.py +0 -99
- snowflake/ml/model/_packager/model_handlers/llm.py +0 -267
- snowflake/ml/model/_packager/model_meta/_core_requirements.py +0 -11
- snowflake/ml/model/deploy_platforms.py +0 -6
- snowflake/ml/model/models/llm.py +0 -104
- snowflake/ml/monitoring/monitor.py +0 -203
- snowflake/ml/registry/_initial_schema.py +0 -142
- snowflake/ml/registry/_schema.py +0 -82
- snowflake/ml/registry/_schema_upgrade_plans.py +0 -116
- snowflake/ml/registry/_schema_version_manager.py +0 -163
- snowflake/ml/registry/model_registry.py +0 -2048
- snowflake_ml_python-1.6.1.dist-info/RECORD +0 -422
- {snowflake_ml_python-1.6.1.dist-info → snowflake_ml_python-1.6.3.dist-info}/LICENSE.txt +0 -0
- {snowflake_ml_python-1.6.1.dist-info → snowflake_ml_python-1.6.3.dist-info}/top_level.txt +0 -0
@@ -4,14 +4,12 @@
|
|
4
4
|
#
|
5
5
|
import inspect
|
6
6
|
import os
|
7
|
-
import
|
8
|
-
from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
|
9
|
-
from typing_extensions import TypeGuard
|
7
|
+
from typing import Iterable, Optional, Union, List, Any, Dict, Set, Tuple
|
10
8
|
from uuid import uuid4
|
11
9
|
|
12
10
|
import cloudpickle as cp
|
13
|
-
import pandas as pd
|
14
11
|
import numpy as np
|
12
|
+
import pandas as pd
|
15
13
|
from numpy import typing as npt
|
16
14
|
|
17
15
|
|
@@ -24,12 +22,11 @@ from snowflake.ml.modeling.framework.base import BaseTransformer, _process_cols
|
|
24
22
|
from snowflake.ml._internal import telemetry
|
25
23
|
from snowflake.ml._internal.exceptions import error_codes, exceptions, modeling_error_messages
|
26
24
|
from snowflake.ml._internal.env_utils import SNOWML_SPROC_ENV
|
27
|
-
from snowflake.ml._internal.utils import
|
25
|
+
from snowflake.ml._internal.utils import identifier
|
28
26
|
from snowflake.snowpark import DataFrame, Session
|
29
27
|
from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
|
30
28
|
from snowflake.ml.modeling._internal.model_trainer_builder import ModelTrainerBuilder
|
31
29
|
from snowflake.ml.modeling._internal.transformer_protocols import (
|
32
|
-
ModelTransformHandlers,
|
33
30
|
BatchInferenceKwargsTypedDict,
|
34
31
|
ScoreKwargsTypedDict
|
35
32
|
)
|
@@ -536,12 +533,23 @@ class NearestNeighbors(BaseTransformer):
|
|
536
533
|
autogenerated=self._autogenerated,
|
537
534
|
subproject=_SUBPROJECT,
|
538
535
|
)
|
539
|
-
|
540
|
-
|
541
|
-
expected_output_cols_list=(
|
542
|
-
self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
|
543
|
-
),
|
536
|
+
expected_output_cols = (
|
537
|
+
self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
|
544
538
|
)
|
539
|
+
if isinstance(dataset, DataFrame):
|
540
|
+
expected_output_cols, example_output_pd_df = self._align_expected_output(
|
541
|
+
"fit_predict", dataset, expected_output_cols, output_cols_prefix
|
542
|
+
)
|
543
|
+
output_result, fitted_estimator = model_trainer.train_fit_predict(
|
544
|
+
drop_input_cols=self._drop_input_cols,
|
545
|
+
expected_output_cols_list=expected_output_cols,
|
546
|
+
example_output_pd_df=example_output_pd_df,
|
547
|
+
)
|
548
|
+
else:
|
549
|
+
output_result, fitted_estimator = model_trainer.train_fit_predict(
|
550
|
+
drop_input_cols=self._drop_input_cols,
|
551
|
+
expected_output_cols_list=expected_output_cols,
|
552
|
+
)
|
545
553
|
self._sklearn_object = fitted_estimator
|
546
554
|
self._is_fitted = True
|
547
555
|
return output_result
|
@@ -564,6 +572,7 @@ class NearestNeighbors(BaseTransformer):
|
|
564
572
|
"""
|
565
573
|
self._infer_input_output_cols(dataset)
|
566
574
|
super()._check_dataset_type(dataset)
|
575
|
+
|
567
576
|
model_trainer = ModelTrainerBuilder.build_fit_transform(
|
568
577
|
estimator=self._sklearn_object,
|
569
578
|
dataset=dataset,
|
@@ -620,12 +629,41 @@ class NearestNeighbors(BaseTransformer):
|
|
620
629
|
|
621
630
|
return rv
|
622
631
|
|
623
|
-
def
|
624
|
-
self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str
|
625
|
-
) -> List[str]:
|
632
|
+
def _align_expected_output(
|
633
|
+
self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str,
|
634
|
+
) -> Tuple[List[str], pd.DataFrame]:
|
635
|
+
""" Run 1 line of data with the desired method, and return one tuple that consists of the output column names
|
636
|
+
and output dataframe with 1 line.
|
637
|
+
If the method is fit_predict, run 2 lines of data.
|
638
|
+
"""
|
626
639
|
# in case the inferred output column names dimension is different
|
627
640
|
# we use one line of snowpark dataframe and put it into sklearn estimator using pandas
|
628
|
-
|
641
|
+
|
642
|
+
# For fit_predict method, a minimum of 2 is required by MinCovDet, BayesianGaussianMixture
|
643
|
+
# so change the minimum of number of rows to 2
|
644
|
+
num_examples = 2
|
645
|
+
statement_params = telemetry.get_function_usage_statement_params(
|
646
|
+
project=_PROJECT,
|
647
|
+
subproject=_SUBPROJECT,
|
648
|
+
function_name=telemetry.get_statement_params_full_func_name(
|
649
|
+
inspect.currentframe(), NearestNeighbors.__class__.__name__
|
650
|
+
),
|
651
|
+
api_calls=[Session.call],
|
652
|
+
custom_tags={"autogen": True} if self._autogenerated else None,
|
653
|
+
)
|
654
|
+
if output_cols_prefix == "fit_predict_":
|
655
|
+
if hasattr(self._sklearn_object, "n_clusters"):
|
656
|
+
# cluster classes such as BisectingKMeansTest requires # of examples >= n_clusters
|
657
|
+
num_examples = self._sklearn_object.n_clusters
|
658
|
+
elif hasattr(self._sklearn_object, "min_samples"):
|
659
|
+
# OPTICS default min_samples 5, which requires at least 5 lines of data
|
660
|
+
num_examples = self._sklearn_object.min_samples
|
661
|
+
elif hasattr(self._sklearn_object, "n_neighbors") and hasattr(self._sklearn_object, "n_samples"):
|
662
|
+
# LocalOutlierFactor expects n_neighbors <= n_samples
|
663
|
+
num_examples = self._sklearn_object.n_neighbors
|
664
|
+
sample_pd_df = dataset.select(self.input_cols).limit(num_examples).to_pandas(statement_params=statement_params)
|
665
|
+
else:
|
666
|
+
sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas(statement_params=statement_params)
|
629
667
|
|
630
668
|
# Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
|
631
669
|
# seen during the fit.
|
@@ -637,12 +675,14 @@ class NearestNeighbors(BaseTransformer):
|
|
637
675
|
output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
|
638
676
|
if self.sample_weight_col:
|
639
677
|
output_df_columns_set -= set(self.sample_weight_col)
|
678
|
+
|
640
679
|
# if the dimension of inferred output column names is correct; use it
|
641
680
|
if len(expected_output_cols_list) == len(output_df_columns_set):
|
642
|
-
return expected_output_cols_list
|
681
|
+
return expected_output_cols_list, output_df_pd
|
643
682
|
# otherwise, use the sklearn estimator's output
|
644
683
|
else:
|
645
|
-
|
684
|
+
expected_output_cols_list = sorted(list(output_df_columns_set), key=lambda x: output_df_columns.index(x))
|
685
|
+
return expected_output_cols_list, output_df_pd[expected_output_cols_list]
|
646
686
|
|
647
687
|
@available_if(original_estimator_has_callable("predict_proba")) # type: ignore[misc]
|
648
688
|
@telemetry.send_api_usage_telemetry(
|
@@ -688,7 +728,7 @@ class NearestNeighbors(BaseTransformer):
|
|
688
728
|
drop_input_cols=self._drop_input_cols,
|
689
729
|
expected_output_cols_type="float",
|
690
730
|
)
|
691
|
-
expected_output_cols = self.
|
731
|
+
expected_output_cols, _ = self._align_expected_output(
|
692
732
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
693
733
|
)
|
694
734
|
|
@@ -754,7 +794,7 @@ class NearestNeighbors(BaseTransformer):
|
|
754
794
|
drop_input_cols=self._drop_input_cols,
|
755
795
|
expected_output_cols_type="float",
|
756
796
|
)
|
757
|
-
expected_output_cols = self.
|
797
|
+
expected_output_cols, _ = self._align_expected_output(
|
758
798
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
759
799
|
)
|
760
800
|
elif isinstance(dataset, pd.DataFrame):
|
@@ -817,7 +857,7 @@ class NearestNeighbors(BaseTransformer):
|
|
817
857
|
drop_input_cols=self._drop_input_cols,
|
818
858
|
expected_output_cols_type="float",
|
819
859
|
)
|
820
|
-
expected_output_cols = self.
|
860
|
+
expected_output_cols, _ = self._align_expected_output(
|
821
861
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
822
862
|
)
|
823
863
|
|
@@ -882,7 +922,7 @@ class NearestNeighbors(BaseTransformer):
|
|
882
922
|
drop_input_cols = self._drop_input_cols,
|
883
923
|
expected_output_cols_type="float",
|
884
924
|
)
|
885
|
-
expected_output_cols = self.
|
925
|
+
expected_output_cols, _ = self._align_expected_output(
|
886
926
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
887
927
|
)
|
888
928
|
|
@@ -4,14 +4,12 @@
|
|
4
4
|
#
|
5
5
|
import inspect
|
6
6
|
import os
|
7
|
-
import
|
8
|
-
from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
|
9
|
-
from typing_extensions import TypeGuard
|
7
|
+
from typing import Iterable, Optional, Union, List, Any, Dict, Set, Tuple
|
10
8
|
from uuid import uuid4
|
11
9
|
|
12
10
|
import cloudpickle as cp
|
13
|
-
import pandas as pd
|
14
11
|
import numpy as np
|
12
|
+
import pandas as pd
|
15
13
|
from numpy import typing as npt
|
16
14
|
|
17
15
|
|
@@ -24,12 +22,11 @@ from snowflake.ml.modeling.framework.base import BaseTransformer, _process_cols
|
|
24
22
|
from snowflake.ml._internal import telemetry
|
25
23
|
from snowflake.ml._internal.exceptions import error_codes, exceptions, modeling_error_messages
|
26
24
|
from snowflake.ml._internal.env_utils import SNOWML_SPROC_ENV
|
27
|
-
from snowflake.ml._internal.utils import
|
25
|
+
from snowflake.ml._internal.utils import identifier
|
28
26
|
from snowflake.snowpark import DataFrame, Session
|
29
27
|
from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
|
30
28
|
from snowflake.ml.modeling._internal.model_trainer_builder import ModelTrainerBuilder
|
31
29
|
from snowflake.ml.modeling._internal.transformer_protocols import (
|
32
|
-
ModelTransformHandlers,
|
33
30
|
BatchInferenceKwargsTypedDict,
|
34
31
|
ScoreKwargsTypedDict
|
35
32
|
)
|
@@ -557,12 +554,23 @@ class NeighborhoodComponentsAnalysis(BaseTransformer):
|
|
557
554
|
autogenerated=self._autogenerated,
|
558
555
|
subproject=_SUBPROJECT,
|
559
556
|
)
|
560
|
-
|
561
|
-
|
562
|
-
expected_output_cols_list=(
|
563
|
-
self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
|
564
|
-
),
|
557
|
+
expected_output_cols = (
|
558
|
+
self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
|
565
559
|
)
|
560
|
+
if isinstance(dataset, DataFrame):
|
561
|
+
expected_output_cols, example_output_pd_df = self._align_expected_output(
|
562
|
+
"fit_predict", dataset, expected_output_cols, output_cols_prefix
|
563
|
+
)
|
564
|
+
output_result, fitted_estimator = model_trainer.train_fit_predict(
|
565
|
+
drop_input_cols=self._drop_input_cols,
|
566
|
+
expected_output_cols_list=expected_output_cols,
|
567
|
+
example_output_pd_df=example_output_pd_df,
|
568
|
+
)
|
569
|
+
else:
|
570
|
+
output_result, fitted_estimator = model_trainer.train_fit_predict(
|
571
|
+
drop_input_cols=self._drop_input_cols,
|
572
|
+
expected_output_cols_list=expected_output_cols,
|
573
|
+
)
|
566
574
|
self._sklearn_object = fitted_estimator
|
567
575
|
self._is_fitted = True
|
568
576
|
return output_result
|
@@ -587,6 +595,7 @@ class NeighborhoodComponentsAnalysis(BaseTransformer):
|
|
587
595
|
"""
|
588
596
|
self._infer_input_output_cols(dataset)
|
589
597
|
super()._check_dataset_type(dataset)
|
598
|
+
|
590
599
|
model_trainer = ModelTrainerBuilder.build_fit_transform(
|
591
600
|
estimator=self._sklearn_object,
|
592
601
|
dataset=dataset,
|
@@ -643,12 +652,41 @@ class NeighborhoodComponentsAnalysis(BaseTransformer):
|
|
643
652
|
|
644
653
|
return rv
|
645
654
|
|
646
|
-
def
|
647
|
-
self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str
|
648
|
-
) -> List[str]:
|
655
|
+
def _align_expected_output(
|
656
|
+
self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str,
|
657
|
+
) -> Tuple[List[str], pd.DataFrame]:
|
658
|
+
""" Run 1 line of data with the desired method, and return one tuple that consists of the output column names
|
659
|
+
and output dataframe with 1 line.
|
660
|
+
If the method is fit_predict, run 2 lines of data.
|
661
|
+
"""
|
649
662
|
# in case the inferred output column names dimension is different
|
650
663
|
# we use one line of snowpark dataframe and put it into sklearn estimator using pandas
|
651
|
-
|
664
|
+
|
665
|
+
# For fit_predict method, a minimum of 2 is required by MinCovDet, BayesianGaussianMixture
|
666
|
+
# so change the minimum of number of rows to 2
|
667
|
+
num_examples = 2
|
668
|
+
statement_params = telemetry.get_function_usage_statement_params(
|
669
|
+
project=_PROJECT,
|
670
|
+
subproject=_SUBPROJECT,
|
671
|
+
function_name=telemetry.get_statement_params_full_func_name(
|
672
|
+
inspect.currentframe(), NeighborhoodComponentsAnalysis.__class__.__name__
|
673
|
+
),
|
674
|
+
api_calls=[Session.call],
|
675
|
+
custom_tags={"autogen": True} if self._autogenerated else None,
|
676
|
+
)
|
677
|
+
if output_cols_prefix == "fit_predict_":
|
678
|
+
if hasattr(self._sklearn_object, "n_clusters"):
|
679
|
+
# cluster classes such as BisectingKMeansTest requires # of examples >= n_clusters
|
680
|
+
num_examples = self._sklearn_object.n_clusters
|
681
|
+
elif hasattr(self._sklearn_object, "min_samples"):
|
682
|
+
# OPTICS default min_samples 5, which requires at least 5 lines of data
|
683
|
+
num_examples = self._sklearn_object.min_samples
|
684
|
+
elif hasattr(self._sklearn_object, "n_neighbors") and hasattr(self._sklearn_object, "n_samples"):
|
685
|
+
# LocalOutlierFactor expects n_neighbors <= n_samples
|
686
|
+
num_examples = self._sklearn_object.n_neighbors
|
687
|
+
sample_pd_df = dataset.select(self.input_cols).limit(num_examples).to_pandas(statement_params=statement_params)
|
688
|
+
else:
|
689
|
+
sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas(statement_params=statement_params)
|
652
690
|
|
653
691
|
# Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
|
654
692
|
# seen during the fit.
|
@@ -660,12 +698,14 @@ class NeighborhoodComponentsAnalysis(BaseTransformer):
|
|
660
698
|
output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
|
661
699
|
if self.sample_weight_col:
|
662
700
|
output_df_columns_set -= set(self.sample_weight_col)
|
701
|
+
|
663
702
|
# if the dimension of inferred output column names is correct; use it
|
664
703
|
if len(expected_output_cols_list) == len(output_df_columns_set):
|
665
|
-
return expected_output_cols_list
|
704
|
+
return expected_output_cols_list, output_df_pd
|
666
705
|
# otherwise, use the sklearn estimator's output
|
667
706
|
else:
|
668
|
-
|
707
|
+
expected_output_cols_list = sorted(list(output_df_columns_set), key=lambda x: output_df_columns.index(x))
|
708
|
+
return expected_output_cols_list, output_df_pd[expected_output_cols_list]
|
669
709
|
|
670
710
|
@available_if(original_estimator_has_callable("predict_proba")) # type: ignore[misc]
|
671
711
|
@telemetry.send_api_usage_telemetry(
|
@@ -711,7 +751,7 @@ class NeighborhoodComponentsAnalysis(BaseTransformer):
|
|
711
751
|
drop_input_cols=self._drop_input_cols,
|
712
752
|
expected_output_cols_type="float",
|
713
753
|
)
|
714
|
-
expected_output_cols = self.
|
754
|
+
expected_output_cols, _ = self._align_expected_output(
|
715
755
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
716
756
|
)
|
717
757
|
|
@@ -777,7 +817,7 @@ class NeighborhoodComponentsAnalysis(BaseTransformer):
|
|
777
817
|
drop_input_cols=self._drop_input_cols,
|
778
818
|
expected_output_cols_type="float",
|
779
819
|
)
|
780
|
-
expected_output_cols = self.
|
820
|
+
expected_output_cols, _ = self._align_expected_output(
|
781
821
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
782
822
|
)
|
783
823
|
elif isinstance(dataset, pd.DataFrame):
|
@@ -840,7 +880,7 @@ class NeighborhoodComponentsAnalysis(BaseTransformer):
|
|
840
880
|
drop_input_cols=self._drop_input_cols,
|
841
881
|
expected_output_cols_type="float",
|
842
882
|
)
|
843
|
-
expected_output_cols = self.
|
883
|
+
expected_output_cols, _ = self._align_expected_output(
|
844
884
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
845
885
|
)
|
846
886
|
|
@@ -905,7 +945,7 @@ class NeighborhoodComponentsAnalysis(BaseTransformer):
|
|
905
945
|
drop_input_cols = self._drop_input_cols,
|
906
946
|
expected_output_cols_type="float",
|
907
947
|
)
|
908
|
-
expected_output_cols = self.
|
948
|
+
expected_output_cols, _ = self._align_expected_output(
|
909
949
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
910
950
|
)
|
911
951
|
|
@@ -4,14 +4,12 @@
|
|
4
4
|
#
|
5
5
|
import inspect
|
6
6
|
import os
|
7
|
-
import
|
8
|
-
from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
|
9
|
-
from typing_extensions import TypeGuard
|
7
|
+
from typing import Iterable, Optional, Union, List, Any, Dict, Set, Tuple
|
10
8
|
from uuid import uuid4
|
11
9
|
|
12
10
|
import cloudpickle as cp
|
13
|
-
import pandas as pd
|
14
11
|
import numpy as np
|
12
|
+
import pandas as pd
|
15
13
|
from numpy import typing as npt
|
16
14
|
|
17
15
|
|
@@ -24,12 +22,11 @@ from snowflake.ml.modeling.framework.base import BaseTransformer, _process_cols
|
|
24
22
|
from snowflake.ml._internal import telemetry
|
25
23
|
from snowflake.ml._internal.exceptions import error_codes, exceptions, modeling_error_messages
|
26
24
|
from snowflake.ml._internal.env_utils import SNOWML_SPROC_ENV
|
27
|
-
from snowflake.ml._internal.utils import
|
25
|
+
from snowflake.ml._internal.utils import identifier
|
28
26
|
from snowflake.snowpark import DataFrame, Session
|
29
27
|
from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
|
30
28
|
from snowflake.ml.modeling._internal.model_trainer_builder import ModelTrainerBuilder
|
31
29
|
from snowflake.ml.modeling._internal.transformer_protocols import (
|
32
|
-
ModelTransformHandlers,
|
33
30
|
BatchInferenceKwargsTypedDict,
|
34
31
|
ScoreKwargsTypedDict
|
35
32
|
)
|
@@ -558,12 +555,23 @@ class RadiusNeighborsClassifier(BaseTransformer):
|
|
558
555
|
autogenerated=self._autogenerated,
|
559
556
|
subproject=_SUBPROJECT,
|
560
557
|
)
|
561
|
-
|
562
|
-
|
563
|
-
expected_output_cols_list=(
|
564
|
-
self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
|
565
|
-
),
|
558
|
+
expected_output_cols = (
|
559
|
+
self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
|
566
560
|
)
|
561
|
+
if isinstance(dataset, DataFrame):
|
562
|
+
expected_output_cols, example_output_pd_df = self._align_expected_output(
|
563
|
+
"fit_predict", dataset, expected_output_cols, output_cols_prefix
|
564
|
+
)
|
565
|
+
output_result, fitted_estimator = model_trainer.train_fit_predict(
|
566
|
+
drop_input_cols=self._drop_input_cols,
|
567
|
+
expected_output_cols_list=expected_output_cols,
|
568
|
+
example_output_pd_df=example_output_pd_df,
|
569
|
+
)
|
570
|
+
else:
|
571
|
+
output_result, fitted_estimator = model_trainer.train_fit_predict(
|
572
|
+
drop_input_cols=self._drop_input_cols,
|
573
|
+
expected_output_cols_list=expected_output_cols,
|
574
|
+
)
|
567
575
|
self._sklearn_object = fitted_estimator
|
568
576
|
self._is_fitted = True
|
569
577
|
return output_result
|
@@ -586,6 +594,7 @@ class RadiusNeighborsClassifier(BaseTransformer):
|
|
586
594
|
"""
|
587
595
|
self._infer_input_output_cols(dataset)
|
588
596
|
super()._check_dataset_type(dataset)
|
597
|
+
|
589
598
|
model_trainer = ModelTrainerBuilder.build_fit_transform(
|
590
599
|
estimator=self._sklearn_object,
|
591
600
|
dataset=dataset,
|
@@ -642,12 +651,41 @@ class RadiusNeighborsClassifier(BaseTransformer):
|
|
642
651
|
|
643
652
|
return rv
|
644
653
|
|
645
|
-
def
|
646
|
-
self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str
|
647
|
-
) -> List[str]:
|
654
|
+
def _align_expected_output(
|
655
|
+
self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str,
|
656
|
+
) -> Tuple[List[str], pd.DataFrame]:
|
657
|
+
""" Run 1 line of data with the desired method, and return one tuple that consists of the output column names
|
658
|
+
and output dataframe with 1 line.
|
659
|
+
If the method is fit_predict, run 2 lines of data.
|
660
|
+
"""
|
648
661
|
# in case the inferred output column names dimension is different
|
649
662
|
# we use one line of snowpark dataframe and put it into sklearn estimator using pandas
|
650
|
-
|
663
|
+
|
664
|
+
# For fit_predict method, a minimum of 2 is required by MinCovDet, BayesianGaussianMixture
|
665
|
+
# so change the minimum of number of rows to 2
|
666
|
+
num_examples = 2
|
667
|
+
statement_params = telemetry.get_function_usage_statement_params(
|
668
|
+
project=_PROJECT,
|
669
|
+
subproject=_SUBPROJECT,
|
670
|
+
function_name=telemetry.get_statement_params_full_func_name(
|
671
|
+
inspect.currentframe(), RadiusNeighborsClassifier.__class__.__name__
|
672
|
+
),
|
673
|
+
api_calls=[Session.call],
|
674
|
+
custom_tags={"autogen": True} if self._autogenerated else None,
|
675
|
+
)
|
676
|
+
if output_cols_prefix == "fit_predict_":
|
677
|
+
if hasattr(self._sklearn_object, "n_clusters"):
|
678
|
+
# cluster classes such as BisectingKMeansTest requires # of examples >= n_clusters
|
679
|
+
num_examples = self._sklearn_object.n_clusters
|
680
|
+
elif hasattr(self._sklearn_object, "min_samples"):
|
681
|
+
# OPTICS default min_samples 5, which requires at least 5 lines of data
|
682
|
+
num_examples = self._sklearn_object.min_samples
|
683
|
+
elif hasattr(self._sklearn_object, "n_neighbors") and hasattr(self._sklearn_object, "n_samples"):
|
684
|
+
# LocalOutlierFactor expects n_neighbors <= n_samples
|
685
|
+
num_examples = self._sklearn_object.n_neighbors
|
686
|
+
sample_pd_df = dataset.select(self.input_cols).limit(num_examples).to_pandas(statement_params=statement_params)
|
687
|
+
else:
|
688
|
+
sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas(statement_params=statement_params)
|
651
689
|
|
652
690
|
# Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
|
653
691
|
# seen during the fit.
|
@@ -659,12 +697,14 @@ class RadiusNeighborsClassifier(BaseTransformer):
|
|
659
697
|
output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
|
660
698
|
if self.sample_weight_col:
|
661
699
|
output_df_columns_set -= set(self.sample_weight_col)
|
700
|
+
|
662
701
|
# if the dimension of inferred output column names is correct; use it
|
663
702
|
if len(expected_output_cols_list) == len(output_df_columns_set):
|
664
|
-
return expected_output_cols_list
|
703
|
+
return expected_output_cols_list, output_df_pd
|
665
704
|
# otherwise, use the sklearn estimator's output
|
666
705
|
else:
|
667
|
-
|
706
|
+
expected_output_cols_list = sorted(list(output_df_columns_set), key=lambda x: output_df_columns.index(x))
|
707
|
+
return expected_output_cols_list, output_df_pd[expected_output_cols_list]
|
668
708
|
|
669
709
|
@available_if(original_estimator_has_callable("predict_proba")) # type: ignore[misc]
|
670
710
|
@telemetry.send_api_usage_telemetry(
|
@@ -712,7 +752,7 @@ class RadiusNeighborsClassifier(BaseTransformer):
|
|
712
752
|
drop_input_cols=self._drop_input_cols,
|
713
753
|
expected_output_cols_type="float",
|
714
754
|
)
|
715
|
-
expected_output_cols = self.
|
755
|
+
expected_output_cols, _ = self._align_expected_output(
|
716
756
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
717
757
|
)
|
718
758
|
|
@@ -780,7 +820,7 @@ class RadiusNeighborsClassifier(BaseTransformer):
|
|
780
820
|
drop_input_cols=self._drop_input_cols,
|
781
821
|
expected_output_cols_type="float",
|
782
822
|
)
|
783
|
-
expected_output_cols = self.
|
823
|
+
expected_output_cols, _ = self._align_expected_output(
|
784
824
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
785
825
|
)
|
786
826
|
elif isinstance(dataset, pd.DataFrame):
|
@@ -843,7 +883,7 @@ class RadiusNeighborsClassifier(BaseTransformer):
|
|
843
883
|
drop_input_cols=self._drop_input_cols,
|
844
884
|
expected_output_cols_type="float",
|
845
885
|
)
|
846
|
-
expected_output_cols = self.
|
886
|
+
expected_output_cols, _ = self._align_expected_output(
|
847
887
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
848
888
|
)
|
849
889
|
|
@@ -908,7 +948,7 @@ class RadiusNeighborsClassifier(BaseTransformer):
|
|
908
948
|
drop_input_cols = self._drop_input_cols,
|
909
949
|
expected_output_cols_type="float",
|
910
950
|
)
|
911
|
-
expected_output_cols = self.
|
951
|
+
expected_output_cols, _ = self._align_expected_output(
|
912
952
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
913
953
|
)
|
914
954
|
|
@@ -4,14 +4,12 @@
|
|
4
4
|
#
|
5
5
|
import inspect
|
6
6
|
import os
|
7
|
-
import
|
8
|
-
from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
|
9
|
-
from typing_extensions import TypeGuard
|
7
|
+
from typing import Iterable, Optional, Union, List, Any, Dict, Set, Tuple
|
10
8
|
from uuid import uuid4
|
11
9
|
|
12
10
|
import cloudpickle as cp
|
13
|
-
import pandas as pd
|
14
11
|
import numpy as np
|
12
|
+
import pandas as pd
|
15
13
|
from numpy import typing as npt
|
16
14
|
|
17
15
|
|
@@ -24,12 +22,11 @@ from snowflake.ml.modeling.framework.base import BaseTransformer, _process_cols
|
|
24
22
|
from snowflake.ml._internal import telemetry
|
25
23
|
from snowflake.ml._internal.exceptions import error_codes, exceptions, modeling_error_messages
|
26
24
|
from snowflake.ml._internal.env_utils import SNOWML_SPROC_ENV
|
27
|
-
from snowflake.ml._internal.utils import
|
25
|
+
from snowflake.ml._internal.utils import identifier
|
28
26
|
from snowflake.snowpark import DataFrame, Session
|
29
27
|
from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
|
30
28
|
from snowflake.ml.modeling._internal.model_trainer_builder import ModelTrainerBuilder
|
31
29
|
from snowflake.ml.modeling._internal.transformer_protocols import (
|
32
|
-
ModelTransformHandlers,
|
33
30
|
BatchInferenceKwargsTypedDict,
|
34
31
|
ScoreKwargsTypedDict
|
35
32
|
)
|
@@ -548,12 +545,23 @@ class RadiusNeighborsRegressor(BaseTransformer):
|
|
548
545
|
autogenerated=self._autogenerated,
|
549
546
|
subproject=_SUBPROJECT,
|
550
547
|
)
|
551
|
-
|
552
|
-
|
553
|
-
expected_output_cols_list=(
|
554
|
-
self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
|
555
|
-
),
|
548
|
+
expected_output_cols = (
|
549
|
+
self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
|
556
550
|
)
|
551
|
+
if isinstance(dataset, DataFrame):
|
552
|
+
expected_output_cols, example_output_pd_df = self._align_expected_output(
|
553
|
+
"fit_predict", dataset, expected_output_cols, output_cols_prefix
|
554
|
+
)
|
555
|
+
output_result, fitted_estimator = model_trainer.train_fit_predict(
|
556
|
+
drop_input_cols=self._drop_input_cols,
|
557
|
+
expected_output_cols_list=expected_output_cols,
|
558
|
+
example_output_pd_df=example_output_pd_df,
|
559
|
+
)
|
560
|
+
else:
|
561
|
+
output_result, fitted_estimator = model_trainer.train_fit_predict(
|
562
|
+
drop_input_cols=self._drop_input_cols,
|
563
|
+
expected_output_cols_list=expected_output_cols,
|
564
|
+
)
|
557
565
|
self._sklearn_object = fitted_estimator
|
558
566
|
self._is_fitted = True
|
559
567
|
return output_result
|
@@ -576,6 +584,7 @@ class RadiusNeighborsRegressor(BaseTransformer):
|
|
576
584
|
"""
|
577
585
|
self._infer_input_output_cols(dataset)
|
578
586
|
super()._check_dataset_type(dataset)
|
587
|
+
|
579
588
|
model_trainer = ModelTrainerBuilder.build_fit_transform(
|
580
589
|
estimator=self._sklearn_object,
|
581
590
|
dataset=dataset,
|
@@ -632,12 +641,41 @@ class RadiusNeighborsRegressor(BaseTransformer):
|
|
632
641
|
|
633
642
|
return rv
|
634
643
|
|
635
|
-
def
|
636
|
-
self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str
|
637
|
-
) -> List[str]:
|
644
|
+
def _align_expected_output(
|
645
|
+
self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str,
|
646
|
+
) -> Tuple[List[str], pd.DataFrame]:
|
647
|
+
""" Run 1 line of data with the desired method, and return one tuple that consists of the output column names
|
648
|
+
and output dataframe with 1 line.
|
649
|
+
If the method is fit_predict, run 2 lines of data.
|
650
|
+
"""
|
638
651
|
# in case the inferred output column names dimension is different
|
639
652
|
# we use one line of snowpark dataframe and put it into sklearn estimator using pandas
|
640
|
-
|
653
|
+
|
654
|
+
# For fit_predict method, a minimum of 2 is required by MinCovDet, BayesianGaussianMixture
|
655
|
+
# so change the minimum of number of rows to 2
|
656
|
+
num_examples = 2
|
657
|
+
statement_params = telemetry.get_function_usage_statement_params(
|
658
|
+
project=_PROJECT,
|
659
|
+
subproject=_SUBPROJECT,
|
660
|
+
function_name=telemetry.get_statement_params_full_func_name(
|
661
|
+
inspect.currentframe(), RadiusNeighborsRegressor.__class__.__name__
|
662
|
+
),
|
663
|
+
api_calls=[Session.call],
|
664
|
+
custom_tags={"autogen": True} if self._autogenerated else None,
|
665
|
+
)
|
666
|
+
if output_cols_prefix == "fit_predict_":
|
667
|
+
if hasattr(self._sklearn_object, "n_clusters"):
|
668
|
+
# cluster classes such as BisectingKMeansTest requires # of examples >= n_clusters
|
669
|
+
num_examples = self._sklearn_object.n_clusters
|
670
|
+
elif hasattr(self._sklearn_object, "min_samples"):
|
671
|
+
# OPTICS default min_samples 5, which requires at least 5 lines of data
|
672
|
+
num_examples = self._sklearn_object.min_samples
|
673
|
+
elif hasattr(self._sklearn_object, "n_neighbors") and hasattr(self._sklearn_object, "n_samples"):
|
674
|
+
# LocalOutlierFactor expects n_neighbors <= n_samples
|
675
|
+
num_examples = self._sklearn_object.n_neighbors
|
676
|
+
sample_pd_df = dataset.select(self.input_cols).limit(num_examples).to_pandas(statement_params=statement_params)
|
677
|
+
else:
|
678
|
+
sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas(statement_params=statement_params)
|
641
679
|
|
642
680
|
# Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
|
643
681
|
# seen during the fit.
|
@@ -649,12 +687,14 @@ class RadiusNeighborsRegressor(BaseTransformer):
|
|
649
687
|
output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
|
650
688
|
if self.sample_weight_col:
|
651
689
|
output_df_columns_set -= set(self.sample_weight_col)
|
690
|
+
|
652
691
|
# if the dimension of inferred output column names is correct; use it
|
653
692
|
if len(expected_output_cols_list) == len(output_df_columns_set):
|
654
|
-
return expected_output_cols_list
|
693
|
+
return expected_output_cols_list, output_df_pd
|
655
694
|
# otherwise, use the sklearn estimator's output
|
656
695
|
else:
|
657
|
-
|
696
|
+
expected_output_cols_list = sorted(list(output_df_columns_set), key=lambda x: output_df_columns.index(x))
|
697
|
+
return expected_output_cols_list, output_df_pd[expected_output_cols_list]
|
658
698
|
|
659
699
|
@available_if(original_estimator_has_callable("predict_proba")) # type: ignore[misc]
|
660
700
|
@telemetry.send_api_usage_telemetry(
|
@@ -700,7 +740,7 @@ class RadiusNeighborsRegressor(BaseTransformer):
|
|
700
740
|
drop_input_cols=self._drop_input_cols,
|
701
741
|
expected_output_cols_type="float",
|
702
742
|
)
|
703
|
-
expected_output_cols = self.
|
743
|
+
expected_output_cols, _ = self._align_expected_output(
|
704
744
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
705
745
|
)
|
706
746
|
|
@@ -766,7 +806,7 @@ class RadiusNeighborsRegressor(BaseTransformer):
|
|
766
806
|
drop_input_cols=self._drop_input_cols,
|
767
807
|
expected_output_cols_type="float",
|
768
808
|
)
|
769
|
-
expected_output_cols = self.
|
809
|
+
expected_output_cols, _ = self._align_expected_output(
|
770
810
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
771
811
|
)
|
772
812
|
elif isinstance(dataset, pd.DataFrame):
|
@@ -829,7 +869,7 @@ class RadiusNeighborsRegressor(BaseTransformer):
|
|
829
869
|
drop_input_cols=self._drop_input_cols,
|
830
870
|
expected_output_cols_type="float",
|
831
871
|
)
|
832
|
-
expected_output_cols = self.
|
872
|
+
expected_output_cols, _ = self._align_expected_output(
|
833
873
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
834
874
|
)
|
835
875
|
|
@@ -894,7 +934,7 @@ class RadiusNeighborsRegressor(BaseTransformer):
|
|
894
934
|
drop_input_cols = self._drop_input_cols,
|
895
935
|
expected_output_cols_type="float",
|
896
936
|
)
|
897
|
-
expected_output_cols = self.
|
937
|
+
expected_output_cols, _ = self._align_expected_output(
|
898
938
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
899
939
|
)
|
900
940
|
|