snowflake-ml-python 1.6.1__py3-none-any.whl → 1.6.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- snowflake/ml/_internal/telemetry.py +142 -20
- snowflake/ml/_internal/utils/identifier.py +48 -11
- snowflake/ml/_internal/utils/snowflake_env.py +23 -13
- snowflake/ml/_internal/utils/sql_identifier.py +1 -1
- snowflake/ml/_internal/utils/table_manager.py +19 -1
- snowflake/ml/_internal/utils/uri.py +2 -2
- snowflake/ml/data/data_connector.py +33 -7
- snowflake/ml/data/torch_utils.py +68 -0
- snowflake/ml/dataset/dataset.py +1 -3
- snowflake/ml/feature_store/feature_store.py +41 -17
- snowflake/ml/feature_store/feature_view.py +2 -2
- snowflake/ml/fileset/embedded_stage_fs.py +1 -1
- snowflake/ml/fileset/fileset.py +1 -1
- snowflake/ml/fileset/sfcfs.py +9 -3
- snowflake/ml/model/_client/model/model_version_impl.py +22 -7
- snowflake/ml/model/_client/ops/model_ops.py +39 -3
- snowflake/ml/model/_client/ops/service_ops.py +198 -7
- snowflake/ml/model/_client/service/model_deployment_spec.py +4 -5
- snowflake/ml/model/_client/service/model_deployment_spec_schema.py +1 -2
- snowflake/ml/model/_client/sql/service.py +85 -18
- snowflake/ml/model/_deploy_client/image_builds/server_image_builder.py +1 -1
- snowflake/ml/model/_deploy_client/snowservice/deploy.py +3 -3
- snowflake/ml/model/_model_composer/model_composer.py +2 -0
- snowflake/ml/model/_model_composer/model_manifest/model_manifest.py +3 -8
- snowflake/ml/model/_packager/model_handlers/_utils.py +46 -14
- snowflake/ml/model/_packager/model_handlers/catboost.py +17 -15
- snowflake/ml/model/_packager/model_handlers/huggingface_pipeline.py +23 -15
- snowflake/ml/model/_packager/model_handlers/lightgbm.py +15 -57
- snowflake/ml/model/_packager/model_handlers/llm.py +4 -2
- snowflake/ml/model/_packager/model_handlers/model_objective_utils.py +116 -0
- snowflake/ml/model/_packager/model_handlers/sklearn.py +36 -24
- snowflake/ml/model/_packager/model_handlers/snowmlmodel.py +119 -6
- snowflake/ml/model/_packager/model_handlers/torchscript.py +2 -2
- snowflake/ml/model/_packager/model_handlers/xgboost.py +48 -48
- snowflake/ml/model/_packager/model_meta/model_meta.py +10 -7
- snowflake/ml/model/_packager/model_meta/model_meta_schema.py +0 -8
- snowflake/ml/model/_packager/model_packager.py +2 -0
- snowflake/ml/model/_signatures/pytorch_handler.py +1 -1
- snowflake/ml/model/_signatures/utils.py +9 -0
- snowflake/ml/model/models/llm.py +3 -1
- snowflake/ml/model/type_hints.py +9 -1
- snowflake/ml/modeling/_internal/constants.py +1 -0
- snowflake/ml/modeling/_internal/local_implementations/pandas_handlers.py +5 -5
- snowflake/ml/modeling/_internal/local_implementations/pandas_trainer.py +9 -6
- snowflake/ml/modeling/_internal/model_specifications.py +2 -0
- snowflake/ml/modeling/_internal/model_trainer.py +1 -0
- snowflake/ml/modeling/_internal/snowpark_implementations/distributed_hpo_trainer.py +2 -2
- snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_handlers.py +5 -5
- snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_trainer.py +113 -160
- snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +60 -21
- snowflake/ml/modeling/cluster/affinity_propagation.py +60 -21
- snowflake/ml/modeling/cluster/agglomerative_clustering.py +60 -21
- snowflake/ml/modeling/cluster/birch.py +60 -21
- snowflake/ml/modeling/cluster/bisecting_k_means.py +60 -21
- snowflake/ml/modeling/cluster/dbscan.py +60 -21
- snowflake/ml/modeling/cluster/feature_agglomeration.py +60 -21
- snowflake/ml/modeling/cluster/k_means.py +60 -21
- snowflake/ml/modeling/cluster/mean_shift.py +60 -21
- snowflake/ml/modeling/cluster/mini_batch_k_means.py +60 -21
- snowflake/ml/modeling/cluster/optics.py +60 -21
- snowflake/ml/modeling/cluster/spectral_biclustering.py +60 -21
- snowflake/ml/modeling/cluster/spectral_clustering.py +60 -21
- snowflake/ml/modeling/cluster/spectral_coclustering.py +60 -21
- snowflake/ml/modeling/compose/column_transformer.py +60 -21
- snowflake/ml/modeling/compose/transformed_target_regressor.py +60 -21
- snowflake/ml/modeling/covariance/elliptic_envelope.py +60 -21
- snowflake/ml/modeling/covariance/empirical_covariance.py +60 -21
- snowflake/ml/modeling/covariance/graphical_lasso.py +60 -21
- snowflake/ml/modeling/covariance/graphical_lasso_cv.py +60 -21
- snowflake/ml/modeling/covariance/ledoit_wolf.py +60 -21
- snowflake/ml/modeling/covariance/min_cov_det.py +60 -21
- snowflake/ml/modeling/covariance/oas.py +60 -21
- snowflake/ml/modeling/covariance/shrunk_covariance.py +60 -21
- snowflake/ml/modeling/decomposition/dictionary_learning.py +60 -21
- snowflake/ml/modeling/decomposition/factor_analysis.py +60 -21
- snowflake/ml/modeling/decomposition/fast_ica.py +60 -21
- snowflake/ml/modeling/decomposition/incremental_pca.py +60 -21
- snowflake/ml/modeling/decomposition/kernel_pca.py +60 -21
- snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +60 -21
- snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +60 -21
- snowflake/ml/modeling/decomposition/pca.py +60 -21
- snowflake/ml/modeling/decomposition/sparse_pca.py +60 -21
- snowflake/ml/modeling/decomposition/truncated_svd.py +60 -21
- snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +60 -21
- snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +60 -21
- snowflake/ml/modeling/ensemble/ada_boost_classifier.py +60 -21
- snowflake/ml/modeling/ensemble/ada_boost_regressor.py +60 -21
- snowflake/ml/modeling/ensemble/bagging_classifier.py +60 -21
- snowflake/ml/modeling/ensemble/bagging_regressor.py +60 -21
- snowflake/ml/modeling/ensemble/extra_trees_classifier.py +60 -21
- snowflake/ml/modeling/ensemble/extra_trees_regressor.py +60 -21
- snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +60 -21
- snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +60 -21
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +60 -21
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +60 -21
- snowflake/ml/modeling/ensemble/isolation_forest.py +60 -21
- snowflake/ml/modeling/ensemble/random_forest_classifier.py +60 -21
- snowflake/ml/modeling/ensemble/random_forest_regressor.py +60 -21
- snowflake/ml/modeling/ensemble/stacking_regressor.py +60 -21
- snowflake/ml/modeling/ensemble/voting_classifier.py +60 -21
- snowflake/ml/modeling/ensemble/voting_regressor.py +60 -21
- snowflake/ml/modeling/feature_selection/generic_univariate_select.py +60 -21
- snowflake/ml/modeling/feature_selection/select_fdr.py +60 -21
- snowflake/ml/modeling/feature_selection/select_fpr.py +60 -21
- snowflake/ml/modeling/feature_selection/select_fwe.py +60 -21
- snowflake/ml/modeling/feature_selection/select_k_best.py +60 -21
- snowflake/ml/modeling/feature_selection/select_percentile.py +60 -21
- snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +60 -21
- snowflake/ml/modeling/feature_selection/variance_threshold.py +60 -21
- snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +60 -21
- snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +60 -21
- snowflake/ml/modeling/impute/iterative_imputer.py +60 -21
- snowflake/ml/modeling/impute/knn_imputer.py +60 -21
- snowflake/ml/modeling/impute/missing_indicator.py +60 -21
- snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +60 -21
- snowflake/ml/modeling/kernel_approximation/nystroem.py +60 -21
- snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +60 -21
- snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +60 -21
- snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +60 -21
- snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +60 -21
- snowflake/ml/modeling/lightgbm/lgbm_classifier.py +60 -21
- snowflake/ml/modeling/lightgbm/lgbm_regressor.py +60 -21
- snowflake/ml/modeling/linear_model/ard_regression.py +60 -21
- snowflake/ml/modeling/linear_model/bayesian_ridge.py +60 -21
- snowflake/ml/modeling/linear_model/elastic_net.py +60 -21
- snowflake/ml/modeling/linear_model/elastic_net_cv.py +60 -21
- snowflake/ml/modeling/linear_model/gamma_regressor.py +60 -21
- snowflake/ml/modeling/linear_model/huber_regressor.py +60 -21
- snowflake/ml/modeling/linear_model/lars.py +60 -21
- snowflake/ml/modeling/linear_model/lars_cv.py +60 -21
- snowflake/ml/modeling/linear_model/lasso.py +60 -21
- snowflake/ml/modeling/linear_model/lasso_cv.py +60 -21
- snowflake/ml/modeling/linear_model/lasso_lars.py +60 -21
- snowflake/ml/modeling/linear_model/lasso_lars_cv.py +60 -21
- snowflake/ml/modeling/linear_model/lasso_lars_ic.py +60 -21
- snowflake/ml/modeling/linear_model/linear_regression.py +60 -21
- snowflake/ml/modeling/linear_model/logistic_regression.py +60 -21
- snowflake/ml/modeling/linear_model/logistic_regression_cv.py +60 -21
- snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +60 -21
- snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +60 -21
- snowflake/ml/modeling/linear_model/multi_task_lasso.py +60 -21
- snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +60 -21
- snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +60 -21
- snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +60 -21
- snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +60 -21
- snowflake/ml/modeling/linear_model/perceptron.py +60 -21
- snowflake/ml/modeling/linear_model/poisson_regressor.py +60 -21
- snowflake/ml/modeling/linear_model/ransac_regressor.py +60 -21
- snowflake/ml/modeling/linear_model/ridge.py +60 -21
- snowflake/ml/modeling/linear_model/ridge_classifier.py +60 -21
- snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +60 -21
- snowflake/ml/modeling/linear_model/ridge_cv.py +60 -21
- snowflake/ml/modeling/linear_model/sgd_classifier.py +60 -21
- snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +60 -21
- snowflake/ml/modeling/linear_model/sgd_regressor.py +60 -21
- snowflake/ml/modeling/linear_model/theil_sen_regressor.py +60 -21
- snowflake/ml/modeling/linear_model/tweedie_regressor.py +60 -21
- snowflake/ml/modeling/manifold/isomap.py +60 -21
- snowflake/ml/modeling/manifold/mds.py +60 -21
- snowflake/ml/modeling/manifold/spectral_embedding.py +60 -21
- snowflake/ml/modeling/manifold/tsne.py +60 -21
- snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +60 -21
- snowflake/ml/modeling/mixture/gaussian_mixture.py +60 -21
- snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +60 -21
- snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +60 -21
- snowflake/ml/modeling/multiclass/output_code_classifier.py +60 -21
- snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +60 -21
- snowflake/ml/modeling/naive_bayes/categorical_nb.py +60 -21
- snowflake/ml/modeling/naive_bayes/complement_nb.py +60 -21
- snowflake/ml/modeling/naive_bayes/gaussian_nb.py +60 -21
- snowflake/ml/modeling/naive_bayes/multinomial_nb.py +60 -21
- snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +60 -21
- snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +60 -21
- snowflake/ml/modeling/neighbors/kernel_density.py +60 -21
- snowflake/ml/modeling/neighbors/local_outlier_factor.py +60 -21
- snowflake/ml/modeling/neighbors/nearest_centroid.py +60 -21
- snowflake/ml/modeling/neighbors/nearest_neighbors.py +60 -21
- snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +60 -21
- snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +60 -21
- snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +60 -21
- snowflake/ml/modeling/neural_network/bernoulli_rbm.py +60 -21
- snowflake/ml/modeling/neural_network/mlp_classifier.py +60 -21
- snowflake/ml/modeling/neural_network/mlp_regressor.py +60 -21
- snowflake/ml/modeling/parameters/disable_model_tracer.py +5 -0
- snowflake/ml/modeling/pipeline/pipeline.py +1 -12
- snowflake/ml/modeling/preprocessing/polynomial_features.py +60 -21
- snowflake/ml/modeling/semi_supervised/label_propagation.py +60 -21
- snowflake/ml/modeling/semi_supervised/label_spreading.py +60 -21
- snowflake/ml/modeling/svm/linear_svc.py +60 -21
- snowflake/ml/modeling/svm/linear_svr.py +60 -21
- snowflake/ml/modeling/svm/nu_svc.py +60 -21
- snowflake/ml/modeling/svm/nu_svr.py +60 -21
- snowflake/ml/modeling/svm/svc.py +60 -21
- snowflake/ml/modeling/svm/svr.py +60 -21
- snowflake/ml/modeling/tree/decision_tree_classifier.py +60 -21
- snowflake/ml/modeling/tree/decision_tree_regressor.py +60 -21
- snowflake/ml/modeling/tree/extra_tree_classifier.py +60 -21
- snowflake/ml/modeling/tree/extra_tree_regressor.py +60 -21
- snowflake/ml/modeling/xgboost/xgb_classifier.py +63 -23
- snowflake/ml/modeling/xgboost/xgb_regressor.py +63 -23
- snowflake/ml/modeling/xgboost/xgbrf_classifier.py +63 -23
- snowflake/ml/modeling/xgboost/xgbrf_regressor.py +63 -23
- snowflake/ml/registry/_manager/model_manager.py +4 -0
- snowflake/ml/registry/model_registry.py +1 -1
- snowflake/ml/registry/registry.py +1 -2
- snowflake/ml/version.py +1 -1
- {snowflake_ml_python-1.6.1.dist-info → snowflake_ml_python-1.6.2.dist-info}/METADATA +23 -4
- {snowflake_ml_python-1.6.1.dist-info → snowflake_ml_python-1.6.2.dist-info}/RECORD +211 -209
- {snowflake_ml_python-1.6.1.dist-info → snowflake_ml_python-1.6.2.dist-info}/WHEEL +1 -1
- snowflake/ml/data/torch_dataset.py +0 -33
- {snowflake_ml_python-1.6.1.dist-info → snowflake_ml_python-1.6.2.dist-info}/LICENSE.txt +0 -0
- {snowflake_ml_python-1.6.1.dist-info → snowflake_ml_python-1.6.2.dist-info}/top_level.txt +0 -0
@@ -4,14 +4,12 @@
|
|
4
4
|
#
|
5
5
|
import inspect
|
6
6
|
import os
|
7
|
-
import
|
8
|
-
from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
|
9
|
-
from typing_extensions import TypeGuard
|
7
|
+
from typing import Iterable, Optional, Union, List, Any, Dict, Set, Tuple
|
10
8
|
from uuid import uuid4
|
11
9
|
|
12
10
|
import cloudpickle as cp
|
13
|
-
import pandas as pd
|
14
11
|
import numpy as np
|
12
|
+
import pandas as pd
|
15
13
|
from numpy import typing as npt
|
16
14
|
|
17
15
|
|
@@ -24,12 +22,11 @@ from snowflake.ml.modeling.framework.base import BaseTransformer, _process_cols
|
|
24
22
|
from snowflake.ml._internal import telemetry
|
25
23
|
from snowflake.ml._internal.exceptions import error_codes, exceptions, modeling_error_messages
|
26
24
|
from snowflake.ml._internal.env_utils import SNOWML_SPROC_ENV
|
27
|
-
from snowflake.ml._internal.utils import
|
25
|
+
from snowflake.ml._internal.utils import identifier
|
28
26
|
from snowflake.snowpark import DataFrame, Session
|
29
27
|
from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
|
30
28
|
from snowflake.ml.modeling._internal.model_trainer_builder import ModelTrainerBuilder
|
31
29
|
from snowflake.ml.modeling._internal.transformer_protocols import (
|
32
|
-
ModelTransformHandlers,
|
33
30
|
BatchInferenceKwargsTypedDict,
|
34
31
|
ScoreKwargsTypedDict
|
35
32
|
)
|
@@ -577,12 +574,23 @@ class ElasticNetCV(BaseTransformer):
|
|
577
574
|
autogenerated=self._autogenerated,
|
578
575
|
subproject=_SUBPROJECT,
|
579
576
|
)
|
580
|
-
|
581
|
-
|
582
|
-
expected_output_cols_list=(
|
583
|
-
self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
|
584
|
-
),
|
577
|
+
expected_output_cols = (
|
578
|
+
self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
|
585
579
|
)
|
580
|
+
if isinstance(dataset, DataFrame):
|
581
|
+
expected_output_cols, example_output_pd_df = self._align_expected_output(
|
582
|
+
"fit_predict", dataset, expected_output_cols, output_cols_prefix
|
583
|
+
)
|
584
|
+
output_result, fitted_estimator = model_trainer.train_fit_predict(
|
585
|
+
drop_input_cols=self._drop_input_cols,
|
586
|
+
expected_output_cols_list=expected_output_cols,
|
587
|
+
example_output_pd_df=example_output_pd_df,
|
588
|
+
)
|
589
|
+
else:
|
590
|
+
output_result, fitted_estimator = model_trainer.train_fit_predict(
|
591
|
+
drop_input_cols=self._drop_input_cols,
|
592
|
+
expected_output_cols_list=expected_output_cols,
|
593
|
+
)
|
586
594
|
self._sklearn_object = fitted_estimator
|
587
595
|
self._is_fitted = True
|
588
596
|
return output_result
|
@@ -661,12 +669,41 @@ class ElasticNetCV(BaseTransformer):
|
|
661
669
|
|
662
670
|
return rv
|
663
671
|
|
664
|
-
def
|
665
|
-
self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str
|
666
|
-
) -> List[str]:
|
672
|
+
def _align_expected_output(
|
673
|
+
self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str,
|
674
|
+
) -> Tuple[List[str], pd.DataFrame]:
|
675
|
+
""" Run 1 line of data with the desired method, and return one tuple that consists of the output column names
|
676
|
+
and output dataframe with 1 line.
|
677
|
+
If the method is fit_predict, run 2 lines of data.
|
678
|
+
"""
|
667
679
|
# in case the inferred output column names dimension is different
|
668
680
|
# we use one line of snowpark dataframe and put it into sklearn estimator using pandas
|
669
|
-
|
681
|
+
|
682
|
+
# For fit_predict method, a minimum of 2 is required by MinCovDet, BayesianGaussianMixture
|
683
|
+
# so change the minimum of number of rows to 2
|
684
|
+
num_examples = 2
|
685
|
+
statement_params = telemetry.get_function_usage_statement_params(
|
686
|
+
project=_PROJECT,
|
687
|
+
subproject=_SUBPROJECT,
|
688
|
+
function_name=telemetry.get_statement_params_full_func_name(
|
689
|
+
inspect.currentframe(), ElasticNetCV.__class__.__name__
|
690
|
+
),
|
691
|
+
api_calls=[Session.call],
|
692
|
+
custom_tags={"autogen": True} if self._autogenerated else None,
|
693
|
+
)
|
694
|
+
if output_cols_prefix == "fit_predict_":
|
695
|
+
if hasattr(self._sklearn_object, "n_clusters"):
|
696
|
+
# cluster classes such as BisectingKMeansTest requires # of examples >= n_clusters
|
697
|
+
num_examples = self._sklearn_object.n_clusters
|
698
|
+
elif hasattr(self._sklearn_object, "min_samples"):
|
699
|
+
# OPTICS default min_samples 5, which requires at least 5 lines of data
|
700
|
+
num_examples = self._sklearn_object.min_samples
|
701
|
+
elif hasattr(self._sklearn_object, "n_neighbors") and hasattr(self._sklearn_object, "n_samples"):
|
702
|
+
# LocalOutlierFactor expects n_neighbors <= n_samples
|
703
|
+
num_examples = self._sklearn_object.n_neighbors
|
704
|
+
sample_pd_df = dataset.select(self.input_cols).limit(num_examples).to_pandas(statement_params=statement_params)
|
705
|
+
else:
|
706
|
+
sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas(statement_params=statement_params)
|
670
707
|
|
671
708
|
# Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
|
672
709
|
# seen during the fit.
|
@@ -678,12 +715,14 @@ class ElasticNetCV(BaseTransformer):
|
|
678
715
|
output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
|
679
716
|
if self.sample_weight_col:
|
680
717
|
output_df_columns_set -= set(self.sample_weight_col)
|
718
|
+
|
681
719
|
# if the dimension of inferred output column names is correct; use it
|
682
720
|
if len(expected_output_cols_list) == len(output_df_columns_set):
|
683
|
-
return expected_output_cols_list
|
721
|
+
return expected_output_cols_list, output_df_pd
|
684
722
|
# otherwise, use the sklearn estimator's output
|
685
723
|
else:
|
686
|
-
|
724
|
+
expected_output_cols_list = sorted(list(output_df_columns_set), key=lambda x: output_df_columns.index(x))
|
725
|
+
return expected_output_cols_list, output_df_pd[expected_output_cols_list]
|
687
726
|
|
688
727
|
@available_if(original_estimator_has_callable("predict_proba")) # type: ignore[misc]
|
689
728
|
@telemetry.send_api_usage_telemetry(
|
@@ -729,7 +768,7 @@ class ElasticNetCV(BaseTransformer):
|
|
729
768
|
drop_input_cols=self._drop_input_cols,
|
730
769
|
expected_output_cols_type="float",
|
731
770
|
)
|
732
|
-
expected_output_cols = self.
|
771
|
+
expected_output_cols, _ = self._align_expected_output(
|
733
772
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
734
773
|
)
|
735
774
|
|
@@ -795,7 +834,7 @@ class ElasticNetCV(BaseTransformer):
|
|
795
834
|
drop_input_cols=self._drop_input_cols,
|
796
835
|
expected_output_cols_type="float",
|
797
836
|
)
|
798
|
-
expected_output_cols = self.
|
837
|
+
expected_output_cols, _ = self._align_expected_output(
|
799
838
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
800
839
|
)
|
801
840
|
elif isinstance(dataset, pd.DataFrame):
|
@@ -858,7 +897,7 @@ class ElasticNetCV(BaseTransformer):
|
|
858
897
|
drop_input_cols=self._drop_input_cols,
|
859
898
|
expected_output_cols_type="float",
|
860
899
|
)
|
861
|
-
expected_output_cols = self.
|
900
|
+
expected_output_cols, _ = self._align_expected_output(
|
862
901
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
863
902
|
)
|
864
903
|
|
@@ -923,7 +962,7 @@ class ElasticNetCV(BaseTransformer):
|
|
923
962
|
drop_input_cols = self._drop_input_cols,
|
924
963
|
expected_output_cols_type="float",
|
925
964
|
)
|
926
|
-
expected_output_cols = self.
|
965
|
+
expected_output_cols, _ = self._align_expected_output(
|
927
966
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
928
967
|
)
|
929
968
|
|
@@ -4,14 +4,12 @@
|
|
4
4
|
#
|
5
5
|
import inspect
|
6
6
|
import os
|
7
|
-
import
|
8
|
-
from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
|
9
|
-
from typing_extensions import TypeGuard
|
7
|
+
from typing import Iterable, Optional, Union, List, Any, Dict, Set, Tuple
|
10
8
|
from uuid import uuid4
|
11
9
|
|
12
10
|
import cloudpickle as cp
|
13
|
-
import pandas as pd
|
14
11
|
import numpy as np
|
12
|
+
import pandas as pd
|
15
13
|
from numpy import typing as npt
|
16
14
|
|
17
15
|
|
@@ -24,12 +22,11 @@ from snowflake.ml.modeling.framework.base import BaseTransformer, _process_cols
|
|
24
22
|
from snowflake.ml._internal import telemetry
|
25
23
|
from snowflake.ml._internal.exceptions import error_codes, exceptions, modeling_error_messages
|
26
24
|
from snowflake.ml._internal.env_utils import SNOWML_SPROC_ENV
|
27
|
-
from snowflake.ml._internal.utils import
|
25
|
+
from snowflake.ml._internal.utils import identifier
|
28
26
|
from snowflake.snowpark import DataFrame, Session
|
29
27
|
from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
|
30
28
|
from snowflake.ml.modeling._internal.model_trainer_builder import ModelTrainerBuilder
|
31
29
|
from snowflake.ml.modeling._internal.transformer_protocols import (
|
32
|
-
ModelTransformHandlers,
|
33
30
|
BatchInferenceKwargsTypedDict,
|
34
31
|
ScoreKwargsTypedDict
|
35
32
|
)
|
@@ -522,12 +519,23 @@ class GammaRegressor(BaseTransformer):
|
|
522
519
|
autogenerated=self._autogenerated,
|
523
520
|
subproject=_SUBPROJECT,
|
524
521
|
)
|
525
|
-
|
526
|
-
|
527
|
-
expected_output_cols_list=(
|
528
|
-
self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
|
529
|
-
),
|
522
|
+
expected_output_cols = (
|
523
|
+
self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
|
530
524
|
)
|
525
|
+
if isinstance(dataset, DataFrame):
|
526
|
+
expected_output_cols, example_output_pd_df = self._align_expected_output(
|
527
|
+
"fit_predict", dataset, expected_output_cols, output_cols_prefix
|
528
|
+
)
|
529
|
+
output_result, fitted_estimator = model_trainer.train_fit_predict(
|
530
|
+
drop_input_cols=self._drop_input_cols,
|
531
|
+
expected_output_cols_list=expected_output_cols,
|
532
|
+
example_output_pd_df=example_output_pd_df,
|
533
|
+
)
|
534
|
+
else:
|
535
|
+
output_result, fitted_estimator = model_trainer.train_fit_predict(
|
536
|
+
drop_input_cols=self._drop_input_cols,
|
537
|
+
expected_output_cols_list=expected_output_cols,
|
538
|
+
)
|
531
539
|
self._sklearn_object = fitted_estimator
|
532
540
|
self._is_fitted = True
|
533
541
|
return output_result
|
@@ -606,12 +614,41 @@ class GammaRegressor(BaseTransformer):
|
|
606
614
|
|
607
615
|
return rv
|
608
616
|
|
609
|
-
def
|
610
|
-
self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str
|
611
|
-
) -> List[str]:
|
617
|
+
def _align_expected_output(
|
618
|
+
self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str,
|
619
|
+
) -> Tuple[List[str], pd.DataFrame]:
|
620
|
+
""" Run 1 line of data with the desired method, and return one tuple that consists of the output column names
|
621
|
+
and output dataframe with 1 line.
|
622
|
+
If the method is fit_predict, run 2 lines of data.
|
623
|
+
"""
|
612
624
|
# in case the inferred output column names dimension is different
|
613
625
|
# we use one line of snowpark dataframe and put it into sklearn estimator using pandas
|
614
|
-
|
626
|
+
|
627
|
+
# For fit_predict method, a minimum of 2 is required by MinCovDet, BayesianGaussianMixture
|
628
|
+
# so change the minimum of number of rows to 2
|
629
|
+
num_examples = 2
|
630
|
+
statement_params = telemetry.get_function_usage_statement_params(
|
631
|
+
project=_PROJECT,
|
632
|
+
subproject=_SUBPROJECT,
|
633
|
+
function_name=telemetry.get_statement_params_full_func_name(
|
634
|
+
inspect.currentframe(), GammaRegressor.__class__.__name__
|
635
|
+
),
|
636
|
+
api_calls=[Session.call],
|
637
|
+
custom_tags={"autogen": True} if self._autogenerated else None,
|
638
|
+
)
|
639
|
+
if output_cols_prefix == "fit_predict_":
|
640
|
+
if hasattr(self._sklearn_object, "n_clusters"):
|
641
|
+
# cluster classes such as BisectingKMeansTest requires # of examples >= n_clusters
|
642
|
+
num_examples = self._sklearn_object.n_clusters
|
643
|
+
elif hasattr(self._sklearn_object, "min_samples"):
|
644
|
+
# OPTICS default min_samples 5, which requires at least 5 lines of data
|
645
|
+
num_examples = self._sklearn_object.min_samples
|
646
|
+
elif hasattr(self._sklearn_object, "n_neighbors") and hasattr(self._sklearn_object, "n_samples"):
|
647
|
+
# LocalOutlierFactor expects n_neighbors <= n_samples
|
648
|
+
num_examples = self._sklearn_object.n_neighbors
|
649
|
+
sample_pd_df = dataset.select(self.input_cols).limit(num_examples).to_pandas(statement_params=statement_params)
|
650
|
+
else:
|
651
|
+
sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas(statement_params=statement_params)
|
615
652
|
|
616
653
|
# Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
|
617
654
|
# seen during the fit.
|
@@ -623,12 +660,14 @@ class GammaRegressor(BaseTransformer):
|
|
623
660
|
output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
|
624
661
|
if self.sample_weight_col:
|
625
662
|
output_df_columns_set -= set(self.sample_weight_col)
|
663
|
+
|
626
664
|
# if the dimension of inferred output column names is correct; use it
|
627
665
|
if len(expected_output_cols_list) == len(output_df_columns_set):
|
628
|
-
return expected_output_cols_list
|
666
|
+
return expected_output_cols_list, output_df_pd
|
629
667
|
# otherwise, use the sklearn estimator's output
|
630
668
|
else:
|
631
|
-
|
669
|
+
expected_output_cols_list = sorted(list(output_df_columns_set), key=lambda x: output_df_columns.index(x))
|
670
|
+
return expected_output_cols_list, output_df_pd[expected_output_cols_list]
|
632
671
|
|
633
672
|
@available_if(original_estimator_has_callable("predict_proba")) # type: ignore[misc]
|
634
673
|
@telemetry.send_api_usage_telemetry(
|
@@ -674,7 +713,7 @@ class GammaRegressor(BaseTransformer):
|
|
674
713
|
drop_input_cols=self._drop_input_cols,
|
675
714
|
expected_output_cols_type="float",
|
676
715
|
)
|
677
|
-
expected_output_cols = self.
|
716
|
+
expected_output_cols, _ = self._align_expected_output(
|
678
717
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
679
718
|
)
|
680
719
|
|
@@ -740,7 +779,7 @@ class GammaRegressor(BaseTransformer):
|
|
740
779
|
drop_input_cols=self._drop_input_cols,
|
741
780
|
expected_output_cols_type="float",
|
742
781
|
)
|
743
|
-
expected_output_cols = self.
|
782
|
+
expected_output_cols, _ = self._align_expected_output(
|
744
783
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
745
784
|
)
|
746
785
|
elif isinstance(dataset, pd.DataFrame):
|
@@ -803,7 +842,7 @@ class GammaRegressor(BaseTransformer):
|
|
803
842
|
drop_input_cols=self._drop_input_cols,
|
804
843
|
expected_output_cols_type="float",
|
805
844
|
)
|
806
|
-
expected_output_cols = self.
|
845
|
+
expected_output_cols, _ = self._align_expected_output(
|
807
846
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
808
847
|
)
|
809
848
|
|
@@ -868,7 +907,7 @@ class GammaRegressor(BaseTransformer):
|
|
868
907
|
drop_input_cols = self._drop_input_cols,
|
869
908
|
expected_output_cols_type="float",
|
870
909
|
)
|
871
|
-
expected_output_cols = self.
|
910
|
+
expected_output_cols, _ = self._align_expected_output(
|
872
911
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
873
912
|
)
|
874
913
|
|
@@ -4,14 +4,12 @@
|
|
4
4
|
#
|
5
5
|
import inspect
|
6
6
|
import os
|
7
|
-
import
|
8
|
-
from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
|
9
|
-
from typing_extensions import TypeGuard
|
7
|
+
from typing import Iterable, Optional, Union, List, Any, Dict, Set, Tuple
|
10
8
|
from uuid import uuid4
|
11
9
|
|
12
10
|
import cloudpickle as cp
|
13
|
-
import pandas as pd
|
14
11
|
import numpy as np
|
12
|
+
import pandas as pd
|
15
13
|
from numpy import typing as npt
|
16
14
|
|
17
15
|
|
@@ -24,12 +22,11 @@ from snowflake.ml.modeling.framework.base import BaseTransformer, _process_cols
|
|
24
22
|
from snowflake.ml._internal import telemetry
|
25
23
|
from snowflake.ml._internal.exceptions import error_codes, exceptions, modeling_error_messages
|
26
24
|
from snowflake.ml._internal.env_utils import SNOWML_SPROC_ENV
|
27
|
-
from snowflake.ml._internal.utils import
|
25
|
+
from snowflake.ml._internal.utils import identifier
|
28
26
|
from snowflake.snowpark import DataFrame, Session
|
29
27
|
from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
|
30
28
|
from snowflake.ml.modeling._internal.model_trainer_builder import ModelTrainerBuilder
|
31
29
|
from snowflake.ml.modeling._internal.transformer_protocols import (
|
32
|
-
ModelTransformHandlers,
|
33
30
|
BatchInferenceKwargsTypedDict,
|
34
31
|
ScoreKwargsTypedDict
|
35
32
|
)
|
@@ -505,12 +502,23 @@ class HuberRegressor(BaseTransformer):
|
|
505
502
|
autogenerated=self._autogenerated,
|
506
503
|
subproject=_SUBPROJECT,
|
507
504
|
)
|
508
|
-
|
509
|
-
|
510
|
-
expected_output_cols_list=(
|
511
|
-
self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
|
512
|
-
),
|
505
|
+
expected_output_cols = (
|
506
|
+
self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
|
513
507
|
)
|
508
|
+
if isinstance(dataset, DataFrame):
|
509
|
+
expected_output_cols, example_output_pd_df = self._align_expected_output(
|
510
|
+
"fit_predict", dataset, expected_output_cols, output_cols_prefix
|
511
|
+
)
|
512
|
+
output_result, fitted_estimator = model_trainer.train_fit_predict(
|
513
|
+
drop_input_cols=self._drop_input_cols,
|
514
|
+
expected_output_cols_list=expected_output_cols,
|
515
|
+
example_output_pd_df=example_output_pd_df,
|
516
|
+
)
|
517
|
+
else:
|
518
|
+
output_result, fitted_estimator = model_trainer.train_fit_predict(
|
519
|
+
drop_input_cols=self._drop_input_cols,
|
520
|
+
expected_output_cols_list=expected_output_cols,
|
521
|
+
)
|
514
522
|
self._sklearn_object = fitted_estimator
|
515
523
|
self._is_fitted = True
|
516
524
|
return output_result
|
@@ -589,12 +597,41 @@ class HuberRegressor(BaseTransformer):
|
|
589
597
|
|
590
598
|
return rv
|
591
599
|
|
592
|
-
def
|
593
|
-
self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str
|
594
|
-
) -> List[str]:
|
600
|
+
def _align_expected_output(
|
601
|
+
self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str,
|
602
|
+
) -> Tuple[List[str], pd.DataFrame]:
|
603
|
+
""" Run 1 line of data with the desired method, and return one tuple that consists of the output column names
|
604
|
+
and output dataframe with 1 line.
|
605
|
+
If the method is fit_predict, run 2 lines of data.
|
606
|
+
"""
|
595
607
|
# in case the inferred output column names dimension is different
|
596
608
|
# we use one line of snowpark dataframe and put it into sklearn estimator using pandas
|
597
|
-
|
609
|
+
|
610
|
+
# For fit_predict method, a minimum of 2 is required by MinCovDet, BayesianGaussianMixture
|
611
|
+
# so change the minimum of number of rows to 2
|
612
|
+
num_examples = 2
|
613
|
+
statement_params = telemetry.get_function_usage_statement_params(
|
614
|
+
project=_PROJECT,
|
615
|
+
subproject=_SUBPROJECT,
|
616
|
+
function_name=telemetry.get_statement_params_full_func_name(
|
617
|
+
inspect.currentframe(), HuberRegressor.__class__.__name__
|
618
|
+
),
|
619
|
+
api_calls=[Session.call],
|
620
|
+
custom_tags={"autogen": True} if self._autogenerated else None,
|
621
|
+
)
|
622
|
+
if output_cols_prefix == "fit_predict_":
|
623
|
+
if hasattr(self._sklearn_object, "n_clusters"):
|
624
|
+
# cluster classes such as BisectingKMeansTest requires # of examples >= n_clusters
|
625
|
+
num_examples = self._sklearn_object.n_clusters
|
626
|
+
elif hasattr(self._sklearn_object, "min_samples"):
|
627
|
+
# OPTICS default min_samples 5, which requires at least 5 lines of data
|
628
|
+
num_examples = self._sklearn_object.min_samples
|
629
|
+
elif hasattr(self._sklearn_object, "n_neighbors") and hasattr(self._sklearn_object, "n_samples"):
|
630
|
+
# LocalOutlierFactor expects n_neighbors <= n_samples
|
631
|
+
num_examples = self._sklearn_object.n_neighbors
|
632
|
+
sample_pd_df = dataset.select(self.input_cols).limit(num_examples).to_pandas(statement_params=statement_params)
|
633
|
+
else:
|
634
|
+
sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas(statement_params=statement_params)
|
598
635
|
|
599
636
|
# Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
|
600
637
|
# seen during the fit.
|
@@ -606,12 +643,14 @@ class HuberRegressor(BaseTransformer):
|
|
606
643
|
output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
|
607
644
|
if self.sample_weight_col:
|
608
645
|
output_df_columns_set -= set(self.sample_weight_col)
|
646
|
+
|
609
647
|
# if the dimension of inferred output column names is correct; use it
|
610
648
|
if len(expected_output_cols_list) == len(output_df_columns_set):
|
611
|
-
return expected_output_cols_list
|
649
|
+
return expected_output_cols_list, output_df_pd
|
612
650
|
# otherwise, use the sklearn estimator's output
|
613
651
|
else:
|
614
|
-
|
652
|
+
expected_output_cols_list = sorted(list(output_df_columns_set), key=lambda x: output_df_columns.index(x))
|
653
|
+
return expected_output_cols_list, output_df_pd[expected_output_cols_list]
|
615
654
|
|
616
655
|
@available_if(original_estimator_has_callable("predict_proba")) # type: ignore[misc]
|
617
656
|
@telemetry.send_api_usage_telemetry(
|
@@ -657,7 +696,7 @@ class HuberRegressor(BaseTransformer):
|
|
657
696
|
drop_input_cols=self._drop_input_cols,
|
658
697
|
expected_output_cols_type="float",
|
659
698
|
)
|
660
|
-
expected_output_cols = self.
|
699
|
+
expected_output_cols, _ = self._align_expected_output(
|
661
700
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
662
701
|
)
|
663
702
|
|
@@ -723,7 +762,7 @@ class HuberRegressor(BaseTransformer):
|
|
723
762
|
drop_input_cols=self._drop_input_cols,
|
724
763
|
expected_output_cols_type="float",
|
725
764
|
)
|
726
|
-
expected_output_cols = self.
|
765
|
+
expected_output_cols, _ = self._align_expected_output(
|
727
766
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
728
767
|
)
|
729
768
|
elif isinstance(dataset, pd.DataFrame):
|
@@ -786,7 +825,7 @@ class HuberRegressor(BaseTransformer):
|
|
786
825
|
drop_input_cols=self._drop_input_cols,
|
787
826
|
expected_output_cols_type="float",
|
788
827
|
)
|
789
|
-
expected_output_cols = self.
|
828
|
+
expected_output_cols, _ = self._align_expected_output(
|
790
829
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
791
830
|
)
|
792
831
|
|
@@ -851,7 +890,7 @@ class HuberRegressor(BaseTransformer):
|
|
851
890
|
drop_input_cols = self._drop_input_cols,
|
852
891
|
expected_output_cols_type="float",
|
853
892
|
)
|
854
|
-
expected_output_cols = self.
|
893
|
+
expected_output_cols, _ = self._align_expected_output(
|
855
894
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
856
895
|
)
|
857
896
|
|
@@ -4,14 +4,12 @@
|
|
4
4
|
#
|
5
5
|
import inspect
|
6
6
|
import os
|
7
|
-
import
|
8
|
-
from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
|
9
|
-
from typing_extensions import TypeGuard
|
7
|
+
from typing import Iterable, Optional, Union, List, Any, Dict, Set, Tuple
|
10
8
|
from uuid import uuid4
|
11
9
|
|
12
10
|
import cloudpickle as cp
|
13
|
-
import pandas as pd
|
14
11
|
import numpy as np
|
12
|
+
import pandas as pd
|
15
13
|
from numpy import typing as npt
|
16
14
|
|
17
15
|
|
@@ -24,12 +22,11 @@ from snowflake.ml.modeling.framework.base import BaseTransformer, _process_cols
|
|
24
22
|
from snowflake.ml._internal import telemetry
|
25
23
|
from snowflake.ml._internal.exceptions import error_codes, exceptions, modeling_error_messages
|
26
24
|
from snowflake.ml._internal.env_utils import SNOWML_SPROC_ENV
|
27
|
-
from snowflake.ml._internal.utils import
|
25
|
+
from snowflake.ml._internal.utils import identifier
|
28
26
|
from snowflake.snowpark import DataFrame, Session
|
29
27
|
from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
|
30
28
|
from snowflake.ml.modeling._internal.model_trainer_builder import ModelTrainerBuilder
|
31
29
|
from snowflake.ml.modeling._internal.transformer_protocols import (
|
32
|
-
ModelTransformHandlers,
|
33
30
|
BatchInferenceKwargsTypedDict,
|
34
31
|
ScoreKwargsTypedDict
|
35
32
|
)
|
@@ -534,12 +531,23 @@ class Lars(BaseTransformer):
|
|
534
531
|
autogenerated=self._autogenerated,
|
535
532
|
subproject=_SUBPROJECT,
|
536
533
|
)
|
537
|
-
|
538
|
-
|
539
|
-
expected_output_cols_list=(
|
540
|
-
self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
|
541
|
-
),
|
534
|
+
expected_output_cols = (
|
535
|
+
self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
|
542
536
|
)
|
537
|
+
if isinstance(dataset, DataFrame):
|
538
|
+
expected_output_cols, example_output_pd_df = self._align_expected_output(
|
539
|
+
"fit_predict", dataset, expected_output_cols, output_cols_prefix
|
540
|
+
)
|
541
|
+
output_result, fitted_estimator = model_trainer.train_fit_predict(
|
542
|
+
drop_input_cols=self._drop_input_cols,
|
543
|
+
expected_output_cols_list=expected_output_cols,
|
544
|
+
example_output_pd_df=example_output_pd_df,
|
545
|
+
)
|
546
|
+
else:
|
547
|
+
output_result, fitted_estimator = model_trainer.train_fit_predict(
|
548
|
+
drop_input_cols=self._drop_input_cols,
|
549
|
+
expected_output_cols_list=expected_output_cols,
|
550
|
+
)
|
543
551
|
self._sklearn_object = fitted_estimator
|
544
552
|
self._is_fitted = True
|
545
553
|
return output_result
|
@@ -618,12 +626,41 @@ class Lars(BaseTransformer):
|
|
618
626
|
|
619
627
|
return rv
|
620
628
|
|
621
|
-
def
|
622
|
-
self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str
|
623
|
-
) -> List[str]:
|
629
|
+
def _align_expected_output(
|
630
|
+
self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str,
|
631
|
+
) -> Tuple[List[str], pd.DataFrame]:
|
632
|
+
""" Run 1 line of data with the desired method, and return one tuple that consists of the output column names
|
633
|
+
and output dataframe with 1 line.
|
634
|
+
If the method is fit_predict, run 2 lines of data.
|
635
|
+
"""
|
624
636
|
# in case the inferred output column names dimension is different
|
625
637
|
# we use one line of snowpark dataframe and put it into sklearn estimator using pandas
|
626
|
-
|
638
|
+
|
639
|
+
# For fit_predict method, a minimum of 2 is required by MinCovDet, BayesianGaussianMixture
|
640
|
+
# so change the minimum of number of rows to 2
|
641
|
+
num_examples = 2
|
642
|
+
statement_params = telemetry.get_function_usage_statement_params(
|
643
|
+
project=_PROJECT,
|
644
|
+
subproject=_SUBPROJECT,
|
645
|
+
function_name=telemetry.get_statement_params_full_func_name(
|
646
|
+
inspect.currentframe(), Lars.__class__.__name__
|
647
|
+
),
|
648
|
+
api_calls=[Session.call],
|
649
|
+
custom_tags={"autogen": True} if self._autogenerated else None,
|
650
|
+
)
|
651
|
+
if output_cols_prefix == "fit_predict_":
|
652
|
+
if hasattr(self._sklearn_object, "n_clusters"):
|
653
|
+
# cluster classes such as BisectingKMeansTest requires # of examples >= n_clusters
|
654
|
+
num_examples = self._sklearn_object.n_clusters
|
655
|
+
elif hasattr(self._sklearn_object, "min_samples"):
|
656
|
+
# OPTICS default min_samples 5, which requires at least 5 lines of data
|
657
|
+
num_examples = self._sklearn_object.min_samples
|
658
|
+
elif hasattr(self._sklearn_object, "n_neighbors") and hasattr(self._sklearn_object, "n_samples"):
|
659
|
+
# LocalOutlierFactor expects n_neighbors <= n_samples
|
660
|
+
num_examples = self._sklearn_object.n_neighbors
|
661
|
+
sample_pd_df = dataset.select(self.input_cols).limit(num_examples).to_pandas(statement_params=statement_params)
|
662
|
+
else:
|
663
|
+
sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas(statement_params=statement_params)
|
627
664
|
|
628
665
|
# Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
|
629
666
|
# seen during the fit.
|
@@ -635,12 +672,14 @@ class Lars(BaseTransformer):
|
|
635
672
|
output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
|
636
673
|
if self.sample_weight_col:
|
637
674
|
output_df_columns_set -= set(self.sample_weight_col)
|
675
|
+
|
638
676
|
# if the dimension of inferred output column names is correct; use it
|
639
677
|
if len(expected_output_cols_list) == len(output_df_columns_set):
|
640
|
-
return expected_output_cols_list
|
678
|
+
return expected_output_cols_list, output_df_pd
|
641
679
|
# otherwise, use the sklearn estimator's output
|
642
680
|
else:
|
643
|
-
|
681
|
+
expected_output_cols_list = sorted(list(output_df_columns_set), key=lambda x: output_df_columns.index(x))
|
682
|
+
return expected_output_cols_list, output_df_pd[expected_output_cols_list]
|
644
683
|
|
645
684
|
@available_if(original_estimator_has_callable("predict_proba")) # type: ignore[misc]
|
646
685
|
@telemetry.send_api_usage_telemetry(
|
@@ -686,7 +725,7 @@ class Lars(BaseTransformer):
|
|
686
725
|
drop_input_cols=self._drop_input_cols,
|
687
726
|
expected_output_cols_type="float",
|
688
727
|
)
|
689
|
-
expected_output_cols = self.
|
728
|
+
expected_output_cols, _ = self._align_expected_output(
|
690
729
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
691
730
|
)
|
692
731
|
|
@@ -752,7 +791,7 @@ class Lars(BaseTransformer):
|
|
752
791
|
drop_input_cols=self._drop_input_cols,
|
753
792
|
expected_output_cols_type="float",
|
754
793
|
)
|
755
|
-
expected_output_cols = self.
|
794
|
+
expected_output_cols, _ = self._align_expected_output(
|
756
795
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
757
796
|
)
|
758
797
|
elif isinstance(dataset, pd.DataFrame):
|
@@ -815,7 +854,7 @@ class Lars(BaseTransformer):
|
|
815
854
|
drop_input_cols=self._drop_input_cols,
|
816
855
|
expected_output_cols_type="float",
|
817
856
|
)
|
818
|
-
expected_output_cols = self.
|
857
|
+
expected_output_cols, _ = self._align_expected_output(
|
819
858
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
820
859
|
)
|
821
860
|
|
@@ -880,7 +919,7 @@ class Lars(BaseTransformer):
|
|
880
919
|
drop_input_cols = self._drop_input_cols,
|
881
920
|
expected_output_cols_type="float",
|
882
921
|
)
|
883
|
-
expected_output_cols = self.
|
922
|
+
expected_output_cols, _ = self._align_expected_output(
|
884
923
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
885
924
|
)
|
886
925
|
|