snowflake-ml-python 1.6.1__py3-none-any.whl → 1.6.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- snowflake/ml/_internal/telemetry.py +142 -20
- snowflake/ml/_internal/utils/identifier.py +48 -11
- snowflake/ml/_internal/utils/snowflake_env.py +23 -13
- snowflake/ml/_internal/utils/sql_identifier.py +1 -1
- snowflake/ml/_internal/utils/table_manager.py +19 -1
- snowflake/ml/_internal/utils/uri.py +2 -2
- snowflake/ml/data/data_connector.py +33 -7
- snowflake/ml/data/torch_utils.py +68 -0
- snowflake/ml/dataset/dataset.py +1 -3
- snowflake/ml/feature_store/feature_store.py +41 -17
- snowflake/ml/feature_store/feature_view.py +2 -2
- snowflake/ml/fileset/embedded_stage_fs.py +1 -1
- snowflake/ml/fileset/fileset.py +1 -1
- snowflake/ml/fileset/sfcfs.py +9 -3
- snowflake/ml/model/_client/model/model_version_impl.py +22 -7
- snowflake/ml/model/_client/ops/model_ops.py +39 -3
- snowflake/ml/model/_client/ops/service_ops.py +198 -7
- snowflake/ml/model/_client/service/model_deployment_spec.py +4 -5
- snowflake/ml/model/_client/service/model_deployment_spec_schema.py +1 -2
- snowflake/ml/model/_client/sql/service.py +85 -18
- snowflake/ml/model/_deploy_client/image_builds/server_image_builder.py +1 -1
- snowflake/ml/model/_deploy_client/snowservice/deploy.py +3 -3
- snowflake/ml/model/_model_composer/model_composer.py +2 -0
- snowflake/ml/model/_model_composer/model_manifest/model_manifest.py +3 -8
- snowflake/ml/model/_packager/model_handlers/_utils.py +46 -14
- snowflake/ml/model/_packager/model_handlers/catboost.py +17 -15
- snowflake/ml/model/_packager/model_handlers/huggingface_pipeline.py +23 -15
- snowflake/ml/model/_packager/model_handlers/lightgbm.py +15 -57
- snowflake/ml/model/_packager/model_handlers/llm.py +4 -2
- snowflake/ml/model/_packager/model_handlers/model_objective_utils.py +116 -0
- snowflake/ml/model/_packager/model_handlers/sklearn.py +36 -24
- snowflake/ml/model/_packager/model_handlers/snowmlmodel.py +119 -6
- snowflake/ml/model/_packager/model_handlers/torchscript.py +2 -2
- snowflake/ml/model/_packager/model_handlers/xgboost.py +48 -48
- snowflake/ml/model/_packager/model_meta/model_meta.py +10 -7
- snowflake/ml/model/_packager/model_meta/model_meta_schema.py +0 -8
- snowflake/ml/model/_packager/model_packager.py +2 -0
- snowflake/ml/model/_signatures/pytorch_handler.py +1 -1
- snowflake/ml/model/_signatures/utils.py +9 -0
- snowflake/ml/model/models/llm.py +3 -1
- snowflake/ml/model/type_hints.py +9 -1
- snowflake/ml/modeling/_internal/constants.py +1 -0
- snowflake/ml/modeling/_internal/local_implementations/pandas_handlers.py +5 -5
- snowflake/ml/modeling/_internal/local_implementations/pandas_trainer.py +9 -6
- snowflake/ml/modeling/_internal/model_specifications.py +2 -0
- snowflake/ml/modeling/_internal/model_trainer.py +1 -0
- snowflake/ml/modeling/_internal/snowpark_implementations/distributed_hpo_trainer.py +2 -2
- snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_handlers.py +5 -5
- snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_trainer.py +113 -160
- snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +60 -21
- snowflake/ml/modeling/cluster/affinity_propagation.py +60 -21
- snowflake/ml/modeling/cluster/agglomerative_clustering.py +60 -21
- snowflake/ml/modeling/cluster/birch.py +60 -21
- snowflake/ml/modeling/cluster/bisecting_k_means.py +60 -21
- snowflake/ml/modeling/cluster/dbscan.py +60 -21
- snowflake/ml/modeling/cluster/feature_agglomeration.py +60 -21
- snowflake/ml/modeling/cluster/k_means.py +60 -21
- snowflake/ml/modeling/cluster/mean_shift.py +60 -21
- snowflake/ml/modeling/cluster/mini_batch_k_means.py +60 -21
- snowflake/ml/modeling/cluster/optics.py +60 -21
- snowflake/ml/modeling/cluster/spectral_biclustering.py +60 -21
- snowflake/ml/modeling/cluster/spectral_clustering.py +60 -21
- snowflake/ml/modeling/cluster/spectral_coclustering.py +60 -21
- snowflake/ml/modeling/compose/column_transformer.py +60 -21
- snowflake/ml/modeling/compose/transformed_target_regressor.py +60 -21
- snowflake/ml/modeling/covariance/elliptic_envelope.py +60 -21
- snowflake/ml/modeling/covariance/empirical_covariance.py +60 -21
- snowflake/ml/modeling/covariance/graphical_lasso.py +60 -21
- snowflake/ml/modeling/covariance/graphical_lasso_cv.py +60 -21
- snowflake/ml/modeling/covariance/ledoit_wolf.py +60 -21
- snowflake/ml/modeling/covariance/min_cov_det.py +60 -21
- snowflake/ml/modeling/covariance/oas.py +60 -21
- snowflake/ml/modeling/covariance/shrunk_covariance.py +60 -21
- snowflake/ml/modeling/decomposition/dictionary_learning.py +60 -21
- snowflake/ml/modeling/decomposition/factor_analysis.py +60 -21
- snowflake/ml/modeling/decomposition/fast_ica.py +60 -21
- snowflake/ml/modeling/decomposition/incremental_pca.py +60 -21
- snowflake/ml/modeling/decomposition/kernel_pca.py +60 -21
- snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +60 -21
- snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +60 -21
- snowflake/ml/modeling/decomposition/pca.py +60 -21
- snowflake/ml/modeling/decomposition/sparse_pca.py +60 -21
- snowflake/ml/modeling/decomposition/truncated_svd.py +60 -21
- snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +60 -21
- snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +60 -21
- snowflake/ml/modeling/ensemble/ada_boost_classifier.py +60 -21
- snowflake/ml/modeling/ensemble/ada_boost_regressor.py +60 -21
- snowflake/ml/modeling/ensemble/bagging_classifier.py +60 -21
- snowflake/ml/modeling/ensemble/bagging_regressor.py +60 -21
- snowflake/ml/modeling/ensemble/extra_trees_classifier.py +60 -21
- snowflake/ml/modeling/ensemble/extra_trees_regressor.py +60 -21
- snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +60 -21
- snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +60 -21
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +60 -21
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +60 -21
- snowflake/ml/modeling/ensemble/isolation_forest.py +60 -21
- snowflake/ml/modeling/ensemble/random_forest_classifier.py +60 -21
- snowflake/ml/modeling/ensemble/random_forest_regressor.py +60 -21
- snowflake/ml/modeling/ensemble/stacking_regressor.py +60 -21
- snowflake/ml/modeling/ensemble/voting_classifier.py +60 -21
- snowflake/ml/modeling/ensemble/voting_regressor.py +60 -21
- snowflake/ml/modeling/feature_selection/generic_univariate_select.py +60 -21
- snowflake/ml/modeling/feature_selection/select_fdr.py +60 -21
- snowflake/ml/modeling/feature_selection/select_fpr.py +60 -21
- snowflake/ml/modeling/feature_selection/select_fwe.py +60 -21
- snowflake/ml/modeling/feature_selection/select_k_best.py +60 -21
- snowflake/ml/modeling/feature_selection/select_percentile.py +60 -21
- snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +60 -21
- snowflake/ml/modeling/feature_selection/variance_threshold.py +60 -21
- snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +60 -21
- snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +60 -21
- snowflake/ml/modeling/impute/iterative_imputer.py +60 -21
- snowflake/ml/modeling/impute/knn_imputer.py +60 -21
- snowflake/ml/modeling/impute/missing_indicator.py +60 -21
- snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +60 -21
- snowflake/ml/modeling/kernel_approximation/nystroem.py +60 -21
- snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +60 -21
- snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +60 -21
- snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +60 -21
- snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +60 -21
- snowflake/ml/modeling/lightgbm/lgbm_classifier.py +60 -21
- snowflake/ml/modeling/lightgbm/lgbm_regressor.py +60 -21
- snowflake/ml/modeling/linear_model/ard_regression.py +60 -21
- snowflake/ml/modeling/linear_model/bayesian_ridge.py +60 -21
- snowflake/ml/modeling/linear_model/elastic_net.py +60 -21
- snowflake/ml/modeling/linear_model/elastic_net_cv.py +60 -21
- snowflake/ml/modeling/linear_model/gamma_regressor.py +60 -21
- snowflake/ml/modeling/linear_model/huber_regressor.py +60 -21
- snowflake/ml/modeling/linear_model/lars.py +60 -21
- snowflake/ml/modeling/linear_model/lars_cv.py +60 -21
- snowflake/ml/modeling/linear_model/lasso.py +60 -21
- snowflake/ml/modeling/linear_model/lasso_cv.py +60 -21
- snowflake/ml/modeling/linear_model/lasso_lars.py +60 -21
- snowflake/ml/modeling/linear_model/lasso_lars_cv.py +60 -21
- snowflake/ml/modeling/linear_model/lasso_lars_ic.py +60 -21
- snowflake/ml/modeling/linear_model/linear_regression.py +60 -21
- snowflake/ml/modeling/linear_model/logistic_regression.py +60 -21
- snowflake/ml/modeling/linear_model/logistic_regression_cv.py +60 -21
- snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +60 -21
- snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +60 -21
- snowflake/ml/modeling/linear_model/multi_task_lasso.py +60 -21
- snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +60 -21
- snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +60 -21
- snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +60 -21
- snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +60 -21
- snowflake/ml/modeling/linear_model/perceptron.py +60 -21
- snowflake/ml/modeling/linear_model/poisson_regressor.py +60 -21
- snowflake/ml/modeling/linear_model/ransac_regressor.py +60 -21
- snowflake/ml/modeling/linear_model/ridge.py +60 -21
- snowflake/ml/modeling/linear_model/ridge_classifier.py +60 -21
- snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +60 -21
- snowflake/ml/modeling/linear_model/ridge_cv.py +60 -21
- snowflake/ml/modeling/linear_model/sgd_classifier.py +60 -21
- snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +60 -21
- snowflake/ml/modeling/linear_model/sgd_regressor.py +60 -21
- snowflake/ml/modeling/linear_model/theil_sen_regressor.py +60 -21
- snowflake/ml/modeling/linear_model/tweedie_regressor.py +60 -21
- snowflake/ml/modeling/manifold/isomap.py +60 -21
- snowflake/ml/modeling/manifold/mds.py +60 -21
- snowflake/ml/modeling/manifold/spectral_embedding.py +60 -21
- snowflake/ml/modeling/manifold/tsne.py +60 -21
- snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +60 -21
- snowflake/ml/modeling/mixture/gaussian_mixture.py +60 -21
- snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +60 -21
- snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +60 -21
- snowflake/ml/modeling/multiclass/output_code_classifier.py +60 -21
- snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +60 -21
- snowflake/ml/modeling/naive_bayes/categorical_nb.py +60 -21
- snowflake/ml/modeling/naive_bayes/complement_nb.py +60 -21
- snowflake/ml/modeling/naive_bayes/gaussian_nb.py +60 -21
- snowflake/ml/modeling/naive_bayes/multinomial_nb.py +60 -21
- snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +60 -21
- snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +60 -21
- snowflake/ml/modeling/neighbors/kernel_density.py +60 -21
- snowflake/ml/modeling/neighbors/local_outlier_factor.py +60 -21
- snowflake/ml/modeling/neighbors/nearest_centroid.py +60 -21
- snowflake/ml/modeling/neighbors/nearest_neighbors.py +60 -21
- snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +60 -21
- snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +60 -21
- snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +60 -21
- snowflake/ml/modeling/neural_network/bernoulli_rbm.py +60 -21
- snowflake/ml/modeling/neural_network/mlp_classifier.py +60 -21
- snowflake/ml/modeling/neural_network/mlp_regressor.py +60 -21
- snowflake/ml/modeling/parameters/disable_model_tracer.py +5 -0
- snowflake/ml/modeling/pipeline/pipeline.py +1 -12
- snowflake/ml/modeling/preprocessing/polynomial_features.py +60 -21
- snowflake/ml/modeling/semi_supervised/label_propagation.py +60 -21
- snowflake/ml/modeling/semi_supervised/label_spreading.py +60 -21
- snowflake/ml/modeling/svm/linear_svc.py +60 -21
- snowflake/ml/modeling/svm/linear_svr.py +60 -21
- snowflake/ml/modeling/svm/nu_svc.py +60 -21
- snowflake/ml/modeling/svm/nu_svr.py +60 -21
- snowflake/ml/modeling/svm/svc.py +60 -21
- snowflake/ml/modeling/svm/svr.py +60 -21
- snowflake/ml/modeling/tree/decision_tree_classifier.py +60 -21
- snowflake/ml/modeling/tree/decision_tree_regressor.py +60 -21
- snowflake/ml/modeling/tree/extra_tree_classifier.py +60 -21
- snowflake/ml/modeling/tree/extra_tree_regressor.py +60 -21
- snowflake/ml/modeling/xgboost/xgb_classifier.py +63 -23
- snowflake/ml/modeling/xgboost/xgb_regressor.py +63 -23
- snowflake/ml/modeling/xgboost/xgbrf_classifier.py +63 -23
- snowflake/ml/modeling/xgboost/xgbrf_regressor.py +63 -23
- snowflake/ml/registry/_manager/model_manager.py +4 -0
- snowflake/ml/registry/model_registry.py +1 -1
- snowflake/ml/registry/registry.py +1 -2
- snowflake/ml/version.py +1 -1
- {snowflake_ml_python-1.6.1.dist-info → snowflake_ml_python-1.6.2.dist-info}/METADATA +23 -4
- {snowflake_ml_python-1.6.1.dist-info → snowflake_ml_python-1.6.2.dist-info}/RECORD +211 -209
- {snowflake_ml_python-1.6.1.dist-info → snowflake_ml_python-1.6.2.dist-info}/WHEEL +1 -1
- snowflake/ml/data/torch_dataset.py +0 -33
- {snowflake_ml_python-1.6.1.dist-info → snowflake_ml_python-1.6.2.dist-info}/LICENSE.txt +0 -0
- {snowflake_ml_python-1.6.1.dist-info → snowflake_ml_python-1.6.2.dist-info}/top_level.txt +0 -0
@@ -4,14 +4,12 @@
|
|
4
4
|
#
|
5
5
|
import inspect
|
6
6
|
import os
|
7
|
-
import
|
8
|
-
from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
|
9
|
-
from typing_extensions import TypeGuard
|
7
|
+
from typing import Iterable, Optional, Union, List, Any, Dict, Set, Tuple
|
10
8
|
from uuid import uuid4
|
11
9
|
|
12
10
|
import cloudpickle as cp
|
13
|
-
import pandas as pd
|
14
11
|
import numpy as np
|
12
|
+
import pandas as pd
|
15
13
|
from numpy import typing as npt
|
16
14
|
|
17
15
|
|
@@ -24,12 +22,11 @@ from snowflake.ml.modeling.framework.base import BaseTransformer, _process_cols
|
|
24
22
|
from snowflake.ml._internal import telemetry
|
25
23
|
from snowflake.ml._internal.exceptions import error_codes, exceptions, modeling_error_messages
|
26
24
|
from snowflake.ml._internal.env_utils import SNOWML_SPROC_ENV
|
27
|
-
from snowflake.ml._internal.utils import
|
25
|
+
from snowflake.ml._internal.utils import identifier
|
28
26
|
from snowflake.snowpark import DataFrame, Session
|
29
27
|
from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
|
30
28
|
from snowflake.ml.modeling._internal.model_trainer_builder import ModelTrainerBuilder
|
31
29
|
from snowflake.ml.modeling._internal.transformer_protocols import (
|
32
|
-
ModelTransformHandlers,
|
33
30
|
BatchInferenceKwargsTypedDict,
|
34
31
|
ScoreKwargsTypedDict
|
35
32
|
)
|
@@ -556,12 +553,23 @@ class LassoLarsCV(BaseTransformer):
|
|
556
553
|
autogenerated=self._autogenerated,
|
557
554
|
subproject=_SUBPROJECT,
|
558
555
|
)
|
559
|
-
|
560
|
-
|
561
|
-
expected_output_cols_list=(
|
562
|
-
self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
|
563
|
-
),
|
556
|
+
expected_output_cols = (
|
557
|
+
self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
|
564
558
|
)
|
559
|
+
if isinstance(dataset, DataFrame):
|
560
|
+
expected_output_cols, example_output_pd_df = self._align_expected_output(
|
561
|
+
"fit_predict", dataset, expected_output_cols, output_cols_prefix
|
562
|
+
)
|
563
|
+
output_result, fitted_estimator = model_trainer.train_fit_predict(
|
564
|
+
drop_input_cols=self._drop_input_cols,
|
565
|
+
expected_output_cols_list=expected_output_cols,
|
566
|
+
example_output_pd_df=example_output_pd_df,
|
567
|
+
)
|
568
|
+
else:
|
569
|
+
output_result, fitted_estimator = model_trainer.train_fit_predict(
|
570
|
+
drop_input_cols=self._drop_input_cols,
|
571
|
+
expected_output_cols_list=expected_output_cols,
|
572
|
+
)
|
565
573
|
self._sklearn_object = fitted_estimator
|
566
574
|
self._is_fitted = True
|
567
575
|
return output_result
|
@@ -640,12 +648,41 @@ class LassoLarsCV(BaseTransformer):
|
|
640
648
|
|
641
649
|
return rv
|
642
650
|
|
643
|
-
def
|
644
|
-
self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str
|
645
|
-
) -> List[str]:
|
651
|
+
def _align_expected_output(
|
652
|
+
self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str,
|
653
|
+
) -> Tuple[List[str], pd.DataFrame]:
|
654
|
+
""" Run 1 line of data with the desired method, and return one tuple that consists of the output column names
|
655
|
+
and output dataframe with 1 line.
|
656
|
+
If the method is fit_predict, run 2 lines of data.
|
657
|
+
"""
|
646
658
|
# in case the inferred output column names dimension is different
|
647
659
|
# we use one line of snowpark dataframe and put it into sklearn estimator using pandas
|
648
|
-
|
660
|
+
|
661
|
+
# For fit_predict method, a minimum of 2 is required by MinCovDet, BayesianGaussianMixture
|
662
|
+
# so change the minimum of number of rows to 2
|
663
|
+
num_examples = 2
|
664
|
+
statement_params = telemetry.get_function_usage_statement_params(
|
665
|
+
project=_PROJECT,
|
666
|
+
subproject=_SUBPROJECT,
|
667
|
+
function_name=telemetry.get_statement_params_full_func_name(
|
668
|
+
inspect.currentframe(), LassoLarsCV.__class__.__name__
|
669
|
+
),
|
670
|
+
api_calls=[Session.call],
|
671
|
+
custom_tags={"autogen": True} if self._autogenerated else None,
|
672
|
+
)
|
673
|
+
if output_cols_prefix == "fit_predict_":
|
674
|
+
if hasattr(self._sklearn_object, "n_clusters"):
|
675
|
+
# cluster classes such as BisectingKMeansTest requires # of examples >= n_clusters
|
676
|
+
num_examples = self._sklearn_object.n_clusters
|
677
|
+
elif hasattr(self._sklearn_object, "min_samples"):
|
678
|
+
# OPTICS default min_samples 5, which requires at least 5 lines of data
|
679
|
+
num_examples = self._sklearn_object.min_samples
|
680
|
+
elif hasattr(self._sklearn_object, "n_neighbors") and hasattr(self._sklearn_object, "n_samples"):
|
681
|
+
# LocalOutlierFactor expects n_neighbors <= n_samples
|
682
|
+
num_examples = self._sklearn_object.n_neighbors
|
683
|
+
sample_pd_df = dataset.select(self.input_cols).limit(num_examples).to_pandas(statement_params=statement_params)
|
684
|
+
else:
|
685
|
+
sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas(statement_params=statement_params)
|
649
686
|
|
650
687
|
# Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
|
651
688
|
# seen during the fit.
|
@@ -657,12 +694,14 @@ class LassoLarsCV(BaseTransformer):
|
|
657
694
|
output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
|
658
695
|
if self.sample_weight_col:
|
659
696
|
output_df_columns_set -= set(self.sample_weight_col)
|
697
|
+
|
660
698
|
# if the dimension of inferred output column names is correct; use it
|
661
699
|
if len(expected_output_cols_list) == len(output_df_columns_set):
|
662
|
-
return expected_output_cols_list
|
700
|
+
return expected_output_cols_list, output_df_pd
|
663
701
|
# otherwise, use the sklearn estimator's output
|
664
702
|
else:
|
665
|
-
|
703
|
+
expected_output_cols_list = sorted(list(output_df_columns_set), key=lambda x: output_df_columns.index(x))
|
704
|
+
return expected_output_cols_list, output_df_pd[expected_output_cols_list]
|
666
705
|
|
667
706
|
@available_if(original_estimator_has_callable("predict_proba")) # type: ignore[misc]
|
668
707
|
@telemetry.send_api_usage_telemetry(
|
@@ -708,7 +747,7 @@ class LassoLarsCV(BaseTransformer):
|
|
708
747
|
drop_input_cols=self._drop_input_cols,
|
709
748
|
expected_output_cols_type="float",
|
710
749
|
)
|
711
|
-
expected_output_cols = self.
|
750
|
+
expected_output_cols, _ = self._align_expected_output(
|
712
751
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
713
752
|
)
|
714
753
|
|
@@ -774,7 +813,7 @@ class LassoLarsCV(BaseTransformer):
|
|
774
813
|
drop_input_cols=self._drop_input_cols,
|
775
814
|
expected_output_cols_type="float",
|
776
815
|
)
|
777
|
-
expected_output_cols = self.
|
816
|
+
expected_output_cols, _ = self._align_expected_output(
|
778
817
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
779
818
|
)
|
780
819
|
elif isinstance(dataset, pd.DataFrame):
|
@@ -837,7 +876,7 @@ class LassoLarsCV(BaseTransformer):
|
|
837
876
|
drop_input_cols=self._drop_input_cols,
|
838
877
|
expected_output_cols_type="float",
|
839
878
|
)
|
840
|
-
expected_output_cols = self.
|
879
|
+
expected_output_cols, _ = self._align_expected_output(
|
841
880
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
842
881
|
)
|
843
882
|
|
@@ -902,7 +941,7 @@ class LassoLarsCV(BaseTransformer):
|
|
902
941
|
drop_input_cols = self._drop_input_cols,
|
903
942
|
expected_output_cols_type="float",
|
904
943
|
)
|
905
|
-
expected_output_cols = self.
|
944
|
+
expected_output_cols, _ = self._align_expected_output(
|
906
945
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
907
946
|
)
|
908
947
|
|
@@ -4,14 +4,12 @@
|
|
4
4
|
#
|
5
5
|
import inspect
|
6
6
|
import os
|
7
|
-
import
|
8
|
-
from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
|
9
|
-
from typing_extensions import TypeGuard
|
7
|
+
from typing import Iterable, Optional, Union, List, Any, Dict, Set, Tuple
|
10
8
|
from uuid import uuid4
|
11
9
|
|
12
10
|
import cloudpickle as cp
|
13
|
-
import pandas as pd
|
14
11
|
import numpy as np
|
12
|
+
import pandas as pd
|
15
13
|
from numpy import typing as npt
|
16
14
|
|
17
15
|
|
@@ -24,12 +22,11 @@ from snowflake.ml.modeling.framework.base import BaseTransformer, _process_cols
|
|
24
22
|
from snowflake.ml._internal import telemetry
|
25
23
|
from snowflake.ml._internal.exceptions import error_codes, exceptions, modeling_error_messages
|
26
24
|
from snowflake.ml._internal.env_utils import SNOWML_SPROC_ENV
|
27
|
-
from snowflake.ml._internal.utils import
|
25
|
+
from snowflake.ml._internal.utils import identifier
|
28
26
|
from snowflake.snowpark import DataFrame, Session
|
29
27
|
from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
|
30
28
|
from snowflake.ml.modeling._internal.model_trainer_builder import ModelTrainerBuilder
|
31
29
|
from snowflake.ml.modeling._internal.transformer_protocols import (
|
32
|
-
ModelTransformHandlers,
|
33
30
|
BatchInferenceKwargsTypedDict,
|
34
31
|
ScoreKwargsTypedDict
|
35
32
|
)
|
@@ -539,12 +536,23 @@ class LassoLarsIC(BaseTransformer):
|
|
539
536
|
autogenerated=self._autogenerated,
|
540
537
|
subproject=_SUBPROJECT,
|
541
538
|
)
|
542
|
-
|
543
|
-
|
544
|
-
expected_output_cols_list=(
|
545
|
-
self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
|
546
|
-
),
|
539
|
+
expected_output_cols = (
|
540
|
+
self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
|
547
541
|
)
|
542
|
+
if isinstance(dataset, DataFrame):
|
543
|
+
expected_output_cols, example_output_pd_df = self._align_expected_output(
|
544
|
+
"fit_predict", dataset, expected_output_cols, output_cols_prefix
|
545
|
+
)
|
546
|
+
output_result, fitted_estimator = model_trainer.train_fit_predict(
|
547
|
+
drop_input_cols=self._drop_input_cols,
|
548
|
+
expected_output_cols_list=expected_output_cols,
|
549
|
+
example_output_pd_df=example_output_pd_df,
|
550
|
+
)
|
551
|
+
else:
|
552
|
+
output_result, fitted_estimator = model_trainer.train_fit_predict(
|
553
|
+
drop_input_cols=self._drop_input_cols,
|
554
|
+
expected_output_cols_list=expected_output_cols,
|
555
|
+
)
|
548
556
|
self._sklearn_object = fitted_estimator
|
549
557
|
self._is_fitted = True
|
550
558
|
return output_result
|
@@ -623,12 +631,41 @@ class LassoLarsIC(BaseTransformer):
|
|
623
631
|
|
624
632
|
return rv
|
625
633
|
|
626
|
-
def
|
627
|
-
self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str
|
628
|
-
) -> List[str]:
|
634
|
+
def _align_expected_output(
|
635
|
+
self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str,
|
636
|
+
) -> Tuple[List[str], pd.DataFrame]:
|
637
|
+
""" Run 1 line of data with the desired method, and return one tuple that consists of the output column names
|
638
|
+
and output dataframe with 1 line.
|
639
|
+
If the method is fit_predict, run 2 lines of data.
|
640
|
+
"""
|
629
641
|
# in case the inferred output column names dimension is different
|
630
642
|
# we use one line of snowpark dataframe and put it into sklearn estimator using pandas
|
631
|
-
|
643
|
+
|
644
|
+
# For fit_predict method, a minimum of 2 is required by MinCovDet, BayesianGaussianMixture
|
645
|
+
# so change the minimum of number of rows to 2
|
646
|
+
num_examples = 2
|
647
|
+
statement_params = telemetry.get_function_usage_statement_params(
|
648
|
+
project=_PROJECT,
|
649
|
+
subproject=_SUBPROJECT,
|
650
|
+
function_name=telemetry.get_statement_params_full_func_name(
|
651
|
+
inspect.currentframe(), LassoLarsIC.__class__.__name__
|
652
|
+
),
|
653
|
+
api_calls=[Session.call],
|
654
|
+
custom_tags={"autogen": True} if self._autogenerated else None,
|
655
|
+
)
|
656
|
+
if output_cols_prefix == "fit_predict_":
|
657
|
+
if hasattr(self._sklearn_object, "n_clusters"):
|
658
|
+
# cluster classes such as BisectingKMeansTest requires # of examples >= n_clusters
|
659
|
+
num_examples = self._sklearn_object.n_clusters
|
660
|
+
elif hasattr(self._sklearn_object, "min_samples"):
|
661
|
+
# OPTICS default min_samples 5, which requires at least 5 lines of data
|
662
|
+
num_examples = self._sklearn_object.min_samples
|
663
|
+
elif hasattr(self._sklearn_object, "n_neighbors") and hasattr(self._sklearn_object, "n_samples"):
|
664
|
+
# LocalOutlierFactor expects n_neighbors <= n_samples
|
665
|
+
num_examples = self._sklearn_object.n_neighbors
|
666
|
+
sample_pd_df = dataset.select(self.input_cols).limit(num_examples).to_pandas(statement_params=statement_params)
|
667
|
+
else:
|
668
|
+
sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas(statement_params=statement_params)
|
632
669
|
|
633
670
|
# Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
|
634
671
|
# seen during the fit.
|
@@ -640,12 +677,14 @@ class LassoLarsIC(BaseTransformer):
|
|
640
677
|
output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
|
641
678
|
if self.sample_weight_col:
|
642
679
|
output_df_columns_set -= set(self.sample_weight_col)
|
680
|
+
|
643
681
|
# if the dimension of inferred output column names is correct; use it
|
644
682
|
if len(expected_output_cols_list) == len(output_df_columns_set):
|
645
|
-
return expected_output_cols_list
|
683
|
+
return expected_output_cols_list, output_df_pd
|
646
684
|
# otherwise, use the sklearn estimator's output
|
647
685
|
else:
|
648
|
-
|
686
|
+
expected_output_cols_list = sorted(list(output_df_columns_set), key=lambda x: output_df_columns.index(x))
|
687
|
+
return expected_output_cols_list, output_df_pd[expected_output_cols_list]
|
649
688
|
|
650
689
|
@available_if(original_estimator_has_callable("predict_proba")) # type: ignore[misc]
|
651
690
|
@telemetry.send_api_usage_telemetry(
|
@@ -691,7 +730,7 @@ class LassoLarsIC(BaseTransformer):
|
|
691
730
|
drop_input_cols=self._drop_input_cols,
|
692
731
|
expected_output_cols_type="float",
|
693
732
|
)
|
694
|
-
expected_output_cols = self.
|
733
|
+
expected_output_cols, _ = self._align_expected_output(
|
695
734
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
696
735
|
)
|
697
736
|
|
@@ -757,7 +796,7 @@ class LassoLarsIC(BaseTransformer):
|
|
757
796
|
drop_input_cols=self._drop_input_cols,
|
758
797
|
expected_output_cols_type="float",
|
759
798
|
)
|
760
|
-
expected_output_cols = self.
|
799
|
+
expected_output_cols, _ = self._align_expected_output(
|
761
800
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
762
801
|
)
|
763
802
|
elif isinstance(dataset, pd.DataFrame):
|
@@ -820,7 +859,7 @@ class LassoLarsIC(BaseTransformer):
|
|
820
859
|
drop_input_cols=self._drop_input_cols,
|
821
860
|
expected_output_cols_type="float",
|
822
861
|
)
|
823
|
-
expected_output_cols = self.
|
862
|
+
expected_output_cols, _ = self._align_expected_output(
|
824
863
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
825
864
|
)
|
826
865
|
|
@@ -885,7 +924,7 @@ class LassoLarsIC(BaseTransformer):
|
|
885
924
|
drop_input_cols = self._drop_input_cols,
|
886
925
|
expected_output_cols_type="float",
|
887
926
|
)
|
888
|
-
expected_output_cols = self.
|
927
|
+
expected_output_cols, _ = self._align_expected_output(
|
889
928
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
890
929
|
)
|
891
930
|
|
@@ -4,14 +4,12 @@
|
|
4
4
|
#
|
5
5
|
import inspect
|
6
6
|
import os
|
7
|
-
import
|
8
|
-
from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
|
9
|
-
from typing_extensions import TypeGuard
|
7
|
+
from typing import Iterable, Optional, Union, List, Any, Dict, Set, Tuple
|
10
8
|
from uuid import uuid4
|
11
9
|
|
12
10
|
import cloudpickle as cp
|
13
|
-
import pandas as pd
|
14
11
|
import numpy as np
|
12
|
+
import pandas as pd
|
15
13
|
from numpy import typing as npt
|
16
14
|
|
17
15
|
|
@@ -24,12 +22,11 @@ from snowflake.ml.modeling.framework.base import BaseTransformer, _process_cols
|
|
24
22
|
from snowflake.ml._internal import telemetry
|
25
23
|
from snowflake.ml._internal.exceptions import error_codes, exceptions, modeling_error_messages
|
26
24
|
from snowflake.ml._internal.env_utils import SNOWML_SPROC_ENV
|
27
|
-
from snowflake.ml._internal.utils import
|
25
|
+
from snowflake.ml._internal.utils import identifier
|
28
26
|
from snowflake.snowpark import DataFrame, Session
|
29
27
|
from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
|
30
28
|
from snowflake.ml.modeling._internal.model_trainer_builder import ModelTrainerBuilder
|
31
29
|
from snowflake.ml.modeling._internal.transformer_protocols import (
|
32
|
-
ModelTransformHandlers,
|
33
30
|
BatchInferenceKwargsTypedDict,
|
34
31
|
ScoreKwargsTypedDict
|
35
32
|
)
|
@@ -492,12 +489,23 @@ class LinearRegression(BaseTransformer):
|
|
492
489
|
autogenerated=self._autogenerated,
|
493
490
|
subproject=_SUBPROJECT,
|
494
491
|
)
|
495
|
-
|
496
|
-
|
497
|
-
expected_output_cols_list=(
|
498
|
-
self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
|
499
|
-
),
|
492
|
+
expected_output_cols = (
|
493
|
+
self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
|
500
494
|
)
|
495
|
+
if isinstance(dataset, DataFrame):
|
496
|
+
expected_output_cols, example_output_pd_df = self._align_expected_output(
|
497
|
+
"fit_predict", dataset, expected_output_cols, output_cols_prefix
|
498
|
+
)
|
499
|
+
output_result, fitted_estimator = model_trainer.train_fit_predict(
|
500
|
+
drop_input_cols=self._drop_input_cols,
|
501
|
+
expected_output_cols_list=expected_output_cols,
|
502
|
+
example_output_pd_df=example_output_pd_df,
|
503
|
+
)
|
504
|
+
else:
|
505
|
+
output_result, fitted_estimator = model_trainer.train_fit_predict(
|
506
|
+
drop_input_cols=self._drop_input_cols,
|
507
|
+
expected_output_cols_list=expected_output_cols,
|
508
|
+
)
|
501
509
|
self._sklearn_object = fitted_estimator
|
502
510
|
self._is_fitted = True
|
503
511
|
return output_result
|
@@ -576,12 +584,41 @@ class LinearRegression(BaseTransformer):
|
|
576
584
|
|
577
585
|
return rv
|
578
586
|
|
579
|
-
def
|
580
|
-
self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str
|
581
|
-
) -> List[str]:
|
587
|
+
def _align_expected_output(
|
588
|
+
self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str,
|
589
|
+
) -> Tuple[List[str], pd.DataFrame]:
|
590
|
+
""" Run 1 line of data with the desired method, and return one tuple that consists of the output column names
|
591
|
+
and output dataframe with 1 line.
|
592
|
+
If the method is fit_predict, run 2 lines of data.
|
593
|
+
"""
|
582
594
|
# in case the inferred output column names dimension is different
|
583
595
|
# we use one line of snowpark dataframe and put it into sklearn estimator using pandas
|
584
|
-
|
596
|
+
|
597
|
+
# For fit_predict method, a minimum of 2 is required by MinCovDet, BayesianGaussianMixture
|
598
|
+
# so change the minimum of number of rows to 2
|
599
|
+
num_examples = 2
|
600
|
+
statement_params = telemetry.get_function_usage_statement_params(
|
601
|
+
project=_PROJECT,
|
602
|
+
subproject=_SUBPROJECT,
|
603
|
+
function_name=telemetry.get_statement_params_full_func_name(
|
604
|
+
inspect.currentframe(), LinearRegression.__class__.__name__
|
605
|
+
),
|
606
|
+
api_calls=[Session.call],
|
607
|
+
custom_tags={"autogen": True} if self._autogenerated else None,
|
608
|
+
)
|
609
|
+
if output_cols_prefix == "fit_predict_":
|
610
|
+
if hasattr(self._sklearn_object, "n_clusters"):
|
611
|
+
# cluster classes such as BisectingKMeansTest requires # of examples >= n_clusters
|
612
|
+
num_examples = self._sklearn_object.n_clusters
|
613
|
+
elif hasattr(self._sklearn_object, "min_samples"):
|
614
|
+
# OPTICS default min_samples 5, which requires at least 5 lines of data
|
615
|
+
num_examples = self._sklearn_object.min_samples
|
616
|
+
elif hasattr(self._sklearn_object, "n_neighbors") and hasattr(self._sklearn_object, "n_samples"):
|
617
|
+
# LocalOutlierFactor expects n_neighbors <= n_samples
|
618
|
+
num_examples = self._sklearn_object.n_neighbors
|
619
|
+
sample_pd_df = dataset.select(self.input_cols).limit(num_examples).to_pandas(statement_params=statement_params)
|
620
|
+
else:
|
621
|
+
sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas(statement_params=statement_params)
|
585
622
|
|
586
623
|
# Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
|
587
624
|
# seen during the fit.
|
@@ -593,12 +630,14 @@ class LinearRegression(BaseTransformer):
|
|
593
630
|
output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
|
594
631
|
if self.sample_weight_col:
|
595
632
|
output_df_columns_set -= set(self.sample_weight_col)
|
633
|
+
|
596
634
|
# if the dimension of inferred output column names is correct; use it
|
597
635
|
if len(expected_output_cols_list) == len(output_df_columns_set):
|
598
|
-
return expected_output_cols_list
|
636
|
+
return expected_output_cols_list, output_df_pd
|
599
637
|
# otherwise, use the sklearn estimator's output
|
600
638
|
else:
|
601
|
-
|
639
|
+
expected_output_cols_list = sorted(list(output_df_columns_set), key=lambda x: output_df_columns.index(x))
|
640
|
+
return expected_output_cols_list, output_df_pd[expected_output_cols_list]
|
602
641
|
|
603
642
|
@available_if(original_estimator_has_callable("predict_proba")) # type: ignore[misc]
|
604
643
|
@telemetry.send_api_usage_telemetry(
|
@@ -644,7 +683,7 @@ class LinearRegression(BaseTransformer):
|
|
644
683
|
drop_input_cols=self._drop_input_cols,
|
645
684
|
expected_output_cols_type="float",
|
646
685
|
)
|
647
|
-
expected_output_cols = self.
|
686
|
+
expected_output_cols, _ = self._align_expected_output(
|
648
687
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
649
688
|
)
|
650
689
|
|
@@ -710,7 +749,7 @@ class LinearRegression(BaseTransformer):
|
|
710
749
|
drop_input_cols=self._drop_input_cols,
|
711
750
|
expected_output_cols_type="float",
|
712
751
|
)
|
713
|
-
expected_output_cols = self.
|
752
|
+
expected_output_cols, _ = self._align_expected_output(
|
714
753
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
715
754
|
)
|
716
755
|
elif isinstance(dataset, pd.DataFrame):
|
@@ -773,7 +812,7 @@ class LinearRegression(BaseTransformer):
|
|
773
812
|
drop_input_cols=self._drop_input_cols,
|
774
813
|
expected_output_cols_type="float",
|
775
814
|
)
|
776
|
-
expected_output_cols = self.
|
815
|
+
expected_output_cols, _ = self._align_expected_output(
|
777
816
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
778
817
|
)
|
779
818
|
|
@@ -838,7 +877,7 @@ class LinearRegression(BaseTransformer):
|
|
838
877
|
drop_input_cols = self._drop_input_cols,
|
839
878
|
expected_output_cols_type="float",
|
840
879
|
)
|
841
|
-
expected_output_cols = self.
|
880
|
+
expected_output_cols, _ = self._align_expected_output(
|
842
881
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
843
882
|
)
|
844
883
|
|
@@ -4,14 +4,12 @@
|
|
4
4
|
#
|
5
5
|
import inspect
|
6
6
|
import os
|
7
|
-
import
|
8
|
-
from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
|
9
|
-
from typing_extensions import TypeGuard
|
7
|
+
from typing import Iterable, Optional, Union, List, Any, Dict, Set, Tuple
|
10
8
|
from uuid import uuid4
|
11
9
|
|
12
10
|
import cloudpickle as cp
|
13
|
-
import pandas as pd
|
14
11
|
import numpy as np
|
12
|
+
import pandas as pd
|
15
13
|
from numpy import typing as npt
|
16
14
|
|
17
15
|
|
@@ -24,12 +22,11 @@ from snowflake.ml.modeling.framework.base import BaseTransformer, _process_cols
|
|
24
22
|
from snowflake.ml._internal import telemetry
|
25
23
|
from snowflake.ml._internal.exceptions import error_codes, exceptions, modeling_error_messages
|
26
24
|
from snowflake.ml._internal.env_utils import SNOWML_SPROC_ENV
|
27
|
-
from snowflake.ml._internal.utils import
|
25
|
+
from snowflake.ml._internal.utils import identifier
|
28
26
|
from snowflake.snowpark import DataFrame, Session
|
29
27
|
from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
|
30
28
|
from snowflake.ml.modeling._internal.model_trainer_builder import ModelTrainerBuilder
|
31
29
|
from snowflake.ml.modeling._internal.transformer_protocols import (
|
32
|
-
ModelTransformHandlers,
|
33
30
|
BatchInferenceKwargsTypedDict,
|
34
31
|
ScoreKwargsTypedDict
|
35
32
|
)
|
@@ -606,12 +603,23 @@ class LogisticRegression(BaseTransformer):
|
|
606
603
|
autogenerated=self._autogenerated,
|
607
604
|
subproject=_SUBPROJECT,
|
608
605
|
)
|
609
|
-
|
610
|
-
|
611
|
-
expected_output_cols_list=(
|
612
|
-
self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
|
613
|
-
),
|
606
|
+
expected_output_cols = (
|
607
|
+
self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
|
614
608
|
)
|
609
|
+
if isinstance(dataset, DataFrame):
|
610
|
+
expected_output_cols, example_output_pd_df = self._align_expected_output(
|
611
|
+
"fit_predict", dataset, expected_output_cols, output_cols_prefix
|
612
|
+
)
|
613
|
+
output_result, fitted_estimator = model_trainer.train_fit_predict(
|
614
|
+
drop_input_cols=self._drop_input_cols,
|
615
|
+
expected_output_cols_list=expected_output_cols,
|
616
|
+
example_output_pd_df=example_output_pd_df,
|
617
|
+
)
|
618
|
+
else:
|
619
|
+
output_result, fitted_estimator = model_trainer.train_fit_predict(
|
620
|
+
drop_input_cols=self._drop_input_cols,
|
621
|
+
expected_output_cols_list=expected_output_cols,
|
622
|
+
)
|
615
623
|
self._sklearn_object = fitted_estimator
|
616
624
|
self._is_fitted = True
|
617
625
|
return output_result
|
@@ -690,12 +698,41 @@ class LogisticRegression(BaseTransformer):
|
|
690
698
|
|
691
699
|
return rv
|
692
700
|
|
693
|
-
def
|
694
|
-
self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str
|
695
|
-
) -> List[str]:
|
701
|
+
def _align_expected_output(
|
702
|
+
self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str,
|
703
|
+
) -> Tuple[List[str], pd.DataFrame]:
|
704
|
+
""" Run 1 line of data with the desired method, and return one tuple that consists of the output column names
|
705
|
+
and output dataframe with 1 line.
|
706
|
+
If the method is fit_predict, run 2 lines of data.
|
707
|
+
"""
|
696
708
|
# in case the inferred output column names dimension is different
|
697
709
|
# we use one line of snowpark dataframe and put it into sklearn estimator using pandas
|
698
|
-
|
710
|
+
|
711
|
+
# For fit_predict method, a minimum of 2 is required by MinCovDet, BayesianGaussianMixture
|
712
|
+
# so change the minimum of number of rows to 2
|
713
|
+
num_examples = 2
|
714
|
+
statement_params = telemetry.get_function_usage_statement_params(
|
715
|
+
project=_PROJECT,
|
716
|
+
subproject=_SUBPROJECT,
|
717
|
+
function_name=telemetry.get_statement_params_full_func_name(
|
718
|
+
inspect.currentframe(), LogisticRegression.__class__.__name__
|
719
|
+
),
|
720
|
+
api_calls=[Session.call],
|
721
|
+
custom_tags={"autogen": True} if self._autogenerated else None,
|
722
|
+
)
|
723
|
+
if output_cols_prefix == "fit_predict_":
|
724
|
+
if hasattr(self._sklearn_object, "n_clusters"):
|
725
|
+
# cluster classes such as BisectingKMeansTest requires # of examples >= n_clusters
|
726
|
+
num_examples = self._sklearn_object.n_clusters
|
727
|
+
elif hasattr(self._sklearn_object, "min_samples"):
|
728
|
+
# OPTICS default min_samples 5, which requires at least 5 lines of data
|
729
|
+
num_examples = self._sklearn_object.min_samples
|
730
|
+
elif hasattr(self._sklearn_object, "n_neighbors") and hasattr(self._sklearn_object, "n_samples"):
|
731
|
+
# LocalOutlierFactor expects n_neighbors <= n_samples
|
732
|
+
num_examples = self._sklearn_object.n_neighbors
|
733
|
+
sample_pd_df = dataset.select(self.input_cols).limit(num_examples).to_pandas(statement_params=statement_params)
|
734
|
+
else:
|
735
|
+
sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas(statement_params=statement_params)
|
699
736
|
|
700
737
|
# Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
|
701
738
|
# seen during the fit.
|
@@ -707,12 +744,14 @@ class LogisticRegression(BaseTransformer):
|
|
707
744
|
output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
|
708
745
|
if self.sample_weight_col:
|
709
746
|
output_df_columns_set -= set(self.sample_weight_col)
|
747
|
+
|
710
748
|
# if the dimension of inferred output column names is correct; use it
|
711
749
|
if len(expected_output_cols_list) == len(output_df_columns_set):
|
712
|
-
return expected_output_cols_list
|
750
|
+
return expected_output_cols_list, output_df_pd
|
713
751
|
# otherwise, use the sklearn estimator's output
|
714
752
|
else:
|
715
|
-
|
753
|
+
expected_output_cols_list = sorted(list(output_df_columns_set), key=lambda x: output_df_columns.index(x))
|
754
|
+
return expected_output_cols_list, output_df_pd[expected_output_cols_list]
|
716
755
|
|
717
756
|
@available_if(original_estimator_has_callable("predict_proba")) # type: ignore[misc]
|
718
757
|
@telemetry.send_api_usage_telemetry(
|
@@ -760,7 +799,7 @@ class LogisticRegression(BaseTransformer):
|
|
760
799
|
drop_input_cols=self._drop_input_cols,
|
761
800
|
expected_output_cols_type="float",
|
762
801
|
)
|
763
|
-
expected_output_cols = self.
|
802
|
+
expected_output_cols, _ = self._align_expected_output(
|
764
803
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
765
804
|
)
|
766
805
|
|
@@ -828,7 +867,7 @@ class LogisticRegression(BaseTransformer):
|
|
828
867
|
drop_input_cols=self._drop_input_cols,
|
829
868
|
expected_output_cols_type="float",
|
830
869
|
)
|
831
|
-
expected_output_cols = self.
|
870
|
+
expected_output_cols, _ = self._align_expected_output(
|
832
871
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
833
872
|
)
|
834
873
|
elif isinstance(dataset, pd.DataFrame):
|
@@ -893,7 +932,7 @@ class LogisticRegression(BaseTransformer):
|
|
893
932
|
drop_input_cols=self._drop_input_cols,
|
894
933
|
expected_output_cols_type="float",
|
895
934
|
)
|
896
|
-
expected_output_cols = self.
|
935
|
+
expected_output_cols, _ = self._align_expected_output(
|
897
936
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
898
937
|
)
|
899
938
|
|
@@ -958,7 +997,7 @@ class LogisticRegression(BaseTransformer):
|
|
958
997
|
drop_input_cols = self._drop_input_cols,
|
959
998
|
expected_output_cols_type="float",
|
960
999
|
)
|
961
|
-
expected_output_cols = self.
|
1000
|
+
expected_output_cols, _ = self._align_expected_output(
|
962
1001
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
963
1002
|
)
|
964
1003
|
|