snowflake-ml-python 1.6.0__py3-none-any.whl → 1.6.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- snowflake/cortex/_complete.py +7 -33
- snowflake/ml/_internal/env_utils.py +11 -5
- snowflake/ml/_internal/exceptions/modeling_error_messages.py +4 -1
- snowflake/ml/_internal/telemetry.py +156 -20
- snowflake/ml/_internal/utils/identifier.py +48 -11
- snowflake/ml/_internal/utils/pkg_version_utils.py +8 -22
- snowflake/ml/_internal/utils/snowflake_env.py +23 -13
- snowflake/ml/_internal/utils/sql_identifier.py +1 -1
- snowflake/ml/_internal/utils/table_manager.py +19 -1
- snowflake/ml/_internal/utils/uri.py +2 -2
- snowflake/ml/data/_internal/arrow_ingestor.py +66 -10
- snowflake/ml/data/data_connector.py +88 -9
- snowflake/ml/data/data_ingestor.py +18 -1
- snowflake/ml/data/{_internal/ingestor_utils.py → ingestor_utils.py} +5 -1
- snowflake/ml/data/torch_utils.py +68 -0
- snowflake/ml/dataset/dataset.py +1 -3
- snowflake/ml/dataset/dataset_metadata.py +3 -1
- snowflake/ml/dataset/dataset_reader.py +9 -3
- snowflake/ml/feature_store/examples/airline_features/entities.py +16 -0
- snowflake/ml/feature_store/examples/airline_features/features/plane_features.py +31 -0
- snowflake/ml/feature_store/examples/airline_features/features/weather_features.py +42 -0
- snowflake/ml/feature_store/examples/airline_features/source.yaml +7 -0
- snowflake/ml/feature_store/examples/citibike_trip_features/features/station_feature.py +10 -4
- snowflake/ml/feature_store/examples/citibike_trip_features/features/trip_feature.py +6 -0
- snowflake/ml/feature_store/examples/citibike_trip_features/source.yaml +3 -0
- snowflake/ml/feature_store/examples/example_helper.py +69 -31
- snowflake/ml/feature_store/examples/new_york_taxi_features/entities.py +3 -3
- snowflake/ml/feature_store/examples/new_york_taxi_features/features/{dropoff_features.py → location_features.py} +14 -9
- snowflake/ml/feature_store/examples/new_york_taxi_features/features/trip_features.py +36 -0
- snowflake/ml/feature_store/examples/new_york_taxi_features/source.yaml +5 -1
- snowflake/ml/feature_store/examples/source_data/airline.yaml +4 -0
- snowflake/ml/feature_store/examples/source_data/citibike_trips.yaml +1 -1
- snowflake/ml/feature_store/examples/wine_quality_features/entities.py +3 -3
- snowflake/ml/feature_store/examples/wine_quality_features/features/managed_wine_features.py +13 -6
- snowflake/ml/feature_store/examples/wine_quality_features/features/static_wine_features.py +8 -5
- snowflake/ml/feature_store/examples/wine_quality_features/source.yaml +3 -0
- snowflake/ml/feature_store/feature_store.py +100 -41
- snowflake/ml/feature_store/feature_view.py +149 -5
- snowflake/ml/fileset/embedded_stage_fs.py +1 -1
- snowflake/ml/fileset/fileset.py +1 -1
- snowflake/ml/fileset/sfcfs.py +9 -3
- snowflake/ml/model/_client/model/model_impl.py +11 -2
- snowflake/ml/model/_client/model/model_version_impl.py +186 -20
- snowflake/ml/model/_client/ops/model_ops.py +144 -30
- snowflake/ml/model/_client/ops/service_ops.py +312 -0
- snowflake/ml/model/_client/service/model_deployment_spec.py +94 -0
- snowflake/ml/model/_client/service/model_deployment_spec_schema.py +30 -0
- snowflake/ml/model/_client/sql/model_version.py +13 -4
- snowflake/ml/model/_client/sql/service.py +196 -0
- snowflake/ml/model/_deploy_client/image_builds/server_image_builder.py +1 -1
- snowflake/ml/model/_deploy_client/snowservice/deploy.py +3 -3
- snowflake/ml/model/_model_composer/model_composer.py +5 -0
- snowflake/ml/model/_model_composer/model_manifest/model_manifest.py +13 -10
- snowflake/ml/model/_model_composer/model_manifest/model_manifest_schema.py +3 -0
- snowflake/ml/model/_packager/model_env/model_env.py +7 -2
- snowflake/ml/model/_packager/model_handlers/_base.py +29 -12
- snowflake/ml/model/_packager/model_handlers/_utils.py +46 -14
- snowflake/ml/model/_packager/model_handlers/catboost.py +25 -16
- snowflake/ml/model/_packager/model_handlers/custom.py +6 -2
- snowflake/ml/model/_packager/model_handlers/huggingface_pipeline.py +32 -20
- snowflake/ml/model/_packager/model_handlers/lightgbm.py +23 -56
- snowflake/ml/model/_packager/model_handlers/llm.py +11 -5
- snowflake/ml/model/_packager/model_handlers/mlflow.py +8 -3
- snowflake/ml/model/_packager/model_handlers/model_objective_utils.py +116 -0
- snowflake/ml/model/_packager/model_handlers/pytorch.py +8 -3
- snowflake/ml/model/_packager/model_handlers/sentence_transformers.py +8 -3
- snowflake/ml/model/_packager/model_handlers/sklearn.py +99 -4
- snowflake/ml/model/_packager/model_handlers/snowmlmodel.py +123 -5
- snowflake/ml/model/_packager/model_handlers/tensorflow.py +9 -4
- snowflake/ml/model/_packager/model_handlers/torchscript.py +10 -5
- snowflake/ml/model/_packager/model_handlers/xgboost.py +56 -47
- snowflake/ml/model/_packager/model_meta/model_meta.py +35 -2
- snowflake/ml/model/_packager/model_meta/model_meta_schema.py +11 -0
- snowflake/ml/model/_packager/model_packager.py +4 -1
- snowflake/ml/model/_packager/model_runtime/model_runtime.py +4 -2
- snowflake/ml/model/_signatures/pytorch_handler.py +1 -1
- snowflake/ml/model/_signatures/utils.py +9 -0
- snowflake/ml/model/models/llm.py +3 -1
- snowflake/ml/model/type_hints.py +10 -4
- snowflake/ml/modeling/_internal/constants.py +1 -0
- snowflake/ml/modeling/_internal/local_implementations/pandas_handlers.py +5 -5
- snowflake/ml/modeling/_internal/local_implementations/pandas_trainer.py +9 -6
- snowflake/ml/modeling/_internal/model_specifications.py +2 -0
- snowflake/ml/modeling/_internal/model_trainer.py +1 -0
- snowflake/ml/modeling/_internal/snowpark_implementations/distributed_hpo_trainer.py +2 -2
- snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_handlers.py +5 -5
- snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_trainer.py +113 -160
- snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +60 -21
- snowflake/ml/modeling/cluster/affinity_propagation.py +60 -21
- snowflake/ml/modeling/cluster/agglomerative_clustering.py +60 -21
- snowflake/ml/modeling/cluster/birch.py +60 -21
- snowflake/ml/modeling/cluster/bisecting_k_means.py +60 -21
- snowflake/ml/modeling/cluster/dbscan.py +60 -21
- snowflake/ml/modeling/cluster/feature_agglomeration.py +60 -21
- snowflake/ml/modeling/cluster/k_means.py +60 -21
- snowflake/ml/modeling/cluster/mean_shift.py +60 -21
- snowflake/ml/modeling/cluster/mini_batch_k_means.py +60 -21
- snowflake/ml/modeling/cluster/optics.py +60 -21
- snowflake/ml/modeling/cluster/spectral_biclustering.py +60 -21
- snowflake/ml/modeling/cluster/spectral_clustering.py +60 -21
- snowflake/ml/modeling/cluster/spectral_coclustering.py +60 -21
- snowflake/ml/modeling/compose/column_transformer.py +60 -21
- snowflake/ml/modeling/compose/transformed_target_regressor.py +60 -21
- snowflake/ml/modeling/covariance/elliptic_envelope.py +60 -21
- snowflake/ml/modeling/covariance/empirical_covariance.py +60 -21
- snowflake/ml/modeling/covariance/graphical_lasso.py +60 -21
- snowflake/ml/modeling/covariance/graphical_lasso_cv.py +60 -21
- snowflake/ml/modeling/covariance/ledoit_wolf.py +60 -21
- snowflake/ml/modeling/covariance/min_cov_det.py +60 -21
- snowflake/ml/modeling/covariance/oas.py +60 -21
- snowflake/ml/modeling/covariance/shrunk_covariance.py +60 -21
- snowflake/ml/modeling/decomposition/dictionary_learning.py +60 -21
- snowflake/ml/modeling/decomposition/factor_analysis.py +60 -21
- snowflake/ml/modeling/decomposition/fast_ica.py +60 -21
- snowflake/ml/modeling/decomposition/incremental_pca.py +60 -21
- snowflake/ml/modeling/decomposition/kernel_pca.py +60 -21
- snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +60 -21
- snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +60 -21
- snowflake/ml/modeling/decomposition/pca.py +60 -21
- snowflake/ml/modeling/decomposition/sparse_pca.py +60 -21
- snowflake/ml/modeling/decomposition/truncated_svd.py +60 -21
- snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +60 -21
- snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +60 -21
- snowflake/ml/modeling/ensemble/ada_boost_classifier.py +60 -21
- snowflake/ml/modeling/ensemble/ada_boost_regressor.py +60 -21
- snowflake/ml/modeling/ensemble/bagging_classifier.py +60 -21
- snowflake/ml/modeling/ensemble/bagging_regressor.py +60 -21
- snowflake/ml/modeling/ensemble/extra_trees_classifier.py +60 -21
- snowflake/ml/modeling/ensemble/extra_trees_regressor.py +60 -21
- snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +60 -21
- snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +60 -21
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +60 -21
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +60 -21
- snowflake/ml/modeling/ensemble/isolation_forest.py +60 -21
- snowflake/ml/modeling/ensemble/random_forest_classifier.py +60 -21
- snowflake/ml/modeling/ensemble/random_forest_regressor.py +60 -21
- snowflake/ml/modeling/ensemble/stacking_regressor.py +60 -21
- snowflake/ml/modeling/ensemble/voting_classifier.py +60 -21
- snowflake/ml/modeling/ensemble/voting_regressor.py +60 -21
- snowflake/ml/modeling/feature_selection/generic_univariate_select.py +60 -21
- snowflake/ml/modeling/feature_selection/select_fdr.py +60 -21
- snowflake/ml/modeling/feature_selection/select_fpr.py +60 -21
- snowflake/ml/modeling/feature_selection/select_fwe.py +60 -21
- snowflake/ml/modeling/feature_selection/select_k_best.py +60 -21
- snowflake/ml/modeling/feature_selection/select_percentile.py +60 -21
- snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +60 -21
- snowflake/ml/modeling/feature_selection/variance_threshold.py +60 -21
- snowflake/ml/modeling/framework/base.py +28 -19
- snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +60 -21
- snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +60 -21
- snowflake/ml/modeling/impute/iterative_imputer.py +60 -21
- snowflake/ml/modeling/impute/knn_imputer.py +60 -21
- snowflake/ml/modeling/impute/missing_indicator.py +60 -21
- snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +60 -21
- snowflake/ml/modeling/kernel_approximation/nystroem.py +60 -21
- snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +60 -21
- snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +60 -21
- snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +60 -21
- snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +60 -21
- snowflake/ml/modeling/lightgbm/lgbm_classifier.py +60 -21
- snowflake/ml/modeling/lightgbm/lgbm_regressor.py +60 -21
- snowflake/ml/modeling/linear_model/ard_regression.py +60 -21
- snowflake/ml/modeling/linear_model/bayesian_ridge.py +60 -21
- snowflake/ml/modeling/linear_model/elastic_net.py +60 -21
- snowflake/ml/modeling/linear_model/elastic_net_cv.py +60 -21
- snowflake/ml/modeling/linear_model/gamma_regressor.py +60 -21
- snowflake/ml/modeling/linear_model/huber_regressor.py +60 -21
- snowflake/ml/modeling/linear_model/lars.py +60 -21
- snowflake/ml/modeling/linear_model/lars_cv.py +60 -21
- snowflake/ml/modeling/linear_model/lasso.py +60 -21
- snowflake/ml/modeling/linear_model/lasso_cv.py +60 -21
- snowflake/ml/modeling/linear_model/lasso_lars.py +60 -21
- snowflake/ml/modeling/linear_model/lasso_lars_cv.py +60 -21
- snowflake/ml/modeling/linear_model/lasso_lars_ic.py +60 -21
- snowflake/ml/modeling/linear_model/linear_regression.py +60 -21
- snowflake/ml/modeling/linear_model/logistic_regression.py +60 -21
- snowflake/ml/modeling/linear_model/logistic_regression_cv.py +60 -21
- snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +60 -21
- snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +60 -21
- snowflake/ml/modeling/linear_model/multi_task_lasso.py +60 -21
- snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +60 -21
- snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +60 -21
- snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +60 -21
- snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +60 -21
- snowflake/ml/modeling/linear_model/perceptron.py +60 -21
- snowflake/ml/modeling/linear_model/poisson_regressor.py +60 -21
- snowflake/ml/modeling/linear_model/ransac_regressor.py +60 -21
- snowflake/ml/modeling/linear_model/ridge.py +60 -21
- snowflake/ml/modeling/linear_model/ridge_classifier.py +60 -21
- snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +60 -21
- snowflake/ml/modeling/linear_model/ridge_cv.py +60 -21
- snowflake/ml/modeling/linear_model/sgd_classifier.py +60 -21
- snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +60 -21
- snowflake/ml/modeling/linear_model/sgd_regressor.py +60 -21
- snowflake/ml/modeling/linear_model/theil_sen_regressor.py +60 -21
- snowflake/ml/modeling/linear_model/tweedie_regressor.py +60 -21
- snowflake/ml/modeling/manifold/isomap.py +60 -21
- snowflake/ml/modeling/manifold/mds.py +60 -21
- snowflake/ml/modeling/manifold/spectral_embedding.py +60 -21
- snowflake/ml/modeling/manifold/tsne.py +60 -21
- snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +60 -21
- snowflake/ml/modeling/mixture/gaussian_mixture.py +60 -21
- snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +60 -21
- snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +60 -21
- snowflake/ml/modeling/multiclass/output_code_classifier.py +60 -21
- snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +60 -21
- snowflake/ml/modeling/naive_bayes/categorical_nb.py +60 -21
- snowflake/ml/modeling/naive_bayes/complement_nb.py +60 -21
- snowflake/ml/modeling/naive_bayes/gaussian_nb.py +60 -21
- snowflake/ml/modeling/naive_bayes/multinomial_nb.py +60 -21
- snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +60 -21
- snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +60 -21
- snowflake/ml/modeling/neighbors/kernel_density.py +60 -21
- snowflake/ml/modeling/neighbors/local_outlier_factor.py +60 -21
- snowflake/ml/modeling/neighbors/nearest_centroid.py +60 -21
- snowflake/ml/modeling/neighbors/nearest_neighbors.py +60 -21
- snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +60 -21
- snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +60 -21
- snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +60 -21
- snowflake/ml/modeling/neural_network/bernoulli_rbm.py +60 -21
- snowflake/ml/modeling/neural_network/mlp_classifier.py +60 -21
- snowflake/ml/modeling/neural_network/mlp_regressor.py +60 -21
- snowflake/ml/modeling/parameters/disable_model_tracer.py +5 -0
- snowflake/ml/modeling/pipeline/pipeline.py +4 -12
- snowflake/ml/modeling/preprocessing/polynomial_features.py +60 -21
- snowflake/ml/modeling/semi_supervised/label_propagation.py +60 -21
- snowflake/ml/modeling/semi_supervised/label_spreading.py +60 -21
- snowflake/ml/modeling/svm/linear_svc.py +60 -21
- snowflake/ml/modeling/svm/linear_svr.py +60 -21
- snowflake/ml/modeling/svm/nu_svc.py +60 -21
- snowflake/ml/modeling/svm/nu_svr.py +60 -21
- snowflake/ml/modeling/svm/svc.py +60 -21
- snowflake/ml/modeling/svm/svr.py +60 -21
- snowflake/ml/modeling/tree/decision_tree_classifier.py +60 -21
- snowflake/ml/modeling/tree/decision_tree_regressor.py +60 -21
- snowflake/ml/modeling/tree/extra_tree_classifier.py +60 -21
- snowflake/ml/modeling/tree/extra_tree_regressor.py +60 -21
- snowflake/ml/modeling/xgboost/xgb_classifier.py +63 -23
- snowflake/ml/modeling/xgboost/xgb_regressor.py +63 -23
- snowflake/ml/modeling/xgboost/xgbrf_classifier.py +63 -23
- snowflake/ml/modeling/xgboost/xgbrf_regressor.py +63 -23
- snowflake/ml/registry/_manager/model_manager.py +20 -2
- snowflake/ml/registry/model_registry.py +1 -1
- snowflake/ml/registry/registry.py +1 -2
- snowflake/ml/utils/sql_client.py +22 -0
- snowflake/ml/version.py +1 -1
- {snowflake_ml_python-1.6.0.dist-info → snowflake_ml_python-1.6.2.dist-info}/METADATA +55 -3
- {snowflake_ml_python-1.6.0.dist-info → snowflake_ml_python-1.6.2.dist-info}/RECORD +251 -238
- {snowflake_ml_python-1.6.0.dist-info → snowflake_ml_python-1.6.2.dist-info}/WHEEL +1 -1
- snowflake/ml/feature_store/examples/new_york_taxi_features/features/pickup_features.py +0 -58
- {snowflake_ml_python-1.6.0.dist-info → snowflake_ml_python-1.6.2.dist-info}/LICENSE.txt +0 -0
- {snowflake_ml_python-1.6.0.dist-info → snowflake_ml_python-1.6.2.dist-info}/top_level.txt +0 -0
@@ -4,14 +4,12 @@
|
|
4
4
|
#
|
5
5
|
import inspect
|
6
6
|
import os
|
7
|
-
import
|
8
|
-
from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
|
9
|
-
from typing_extensions import TypeGuard
|
7
|
+
from typing import Iterable, Optional, Union, List, Any, Dict, Set, Tuple
|
10
8
|
from uuid import uuid4
|
11
9
|
|
12
10
|
import cloudpickle as cp
|
13
|
-
import pandas as pd
|
14
11
|
import numpy as np
|
12
|
+
import pandas as pd
|
15
13
|
from numpy import typing as npt
|
16
14
|
|
17
15
|
|
@@ -24,12 +22,11 @@ from snowflake.ml.modeling.framework.base import BaseTransformer, _process_cols
|
|
24
22
|
from snowflake.ml._internal import telemetry
|
25
23
|
from snowflake.ml._internal.exceptions import error_codes, exceptions, modeling_error_messages
|
26
24
|
from snowflake.ml._internal.env_utils import SNOWML_SPROC_ENV
|
27
|
-
from snowflake.ml._internal.utils import
|
25
|
+
from snowflake.ml._internal.utils import identifier
|
28
26
|
from snowflake.snowpark import DataFrame, Session
|
29
27
|
from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
|
30
28
|
from snowflake.ml.modeling._internal.model_trainer_builder import ModelTrainerBuilder
|
31
29
|
from snowflake.ml.modeling._internal.transformer_protocols import (
|
32
|
-
ModelTransformHandlers,
|
33
30
|
BatchInferenceKwargsTypedDict,
|
34
31
|
ScoreKwargsTypedDict
|
35
32
|
)
|
@@ -606,12 +603,23 @@ class LogisticRegression(BaseTransformer):
|
|
606
603
|
autogenerated=self._autogenerated,
|
607
604
|
subproject=_SUBPROJECT,
|
608
605
|
)
|
609
|
-
|
610
|
-
|
611
|
-
expected_output_cols_list=(
|
612
|
-
self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
|
613
|
-
),
|
606
|
+
expected_output_cols = (
|
607
|
+
self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
|
614
608
|
)
|
609
|
+
if isinstance(dataset, DataFrame):
|
610
|
+
expected_output_cols, example_output_pd_df = self._align_expected_output(
|
611
|
+
"fit_predict", dataset, expected_output_cols, output_cols_prefix
|
612
|
+
)
|
613
|
+
output_result, fitted_estimator = model_trainer.train_fit_predict(
|
614
|
+
drop_input_cols=self._drop_input_cols,
|
615
|
+
expected_output_cols_list=expected_output_cols,
|
616
|
+
example_output_pd_df=example_output_pd_df,
|
617
|
+
)
|
618
|
+
else:
|
619
|
+
output_result, fitted_estimator = model_trainer.train_fit_predict(
|
620
|
+
drop_input_cols=self._drop_input_cols,
|
621
|
+
expected_output_cols_list=expected_output_cols,
|
622
|
+
)
|
615
623
|
self._sklearn_object = fitted_estimator
|
616
624
|
self._is_fitted = True
|
617
625
|
return output_result
|
@@ -690,12 +698,41 @@ class LogisticRegression(BaseTransformer):
|
|
690
698
|
|
691
699
|
return rv
|
692
700
|
|
693
|
-
def
|
694
|
-
self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str
|
695
|
-
) -> List[str]:
|
701
|
+
def _align_expected_output(
|
702
|
+
self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str,
|
703
|
+
) -> Tuple[List[str], pd.DataFrame]:
|
704
|
+
""" Run 1 line of data with the desired method, and return one tuple that consists of the output column names
|
705
|
+
and output dataframe with 1 line.
|
706
|
+
If the method is fit_predict, run 2 lines of data.
|
707
|
+
"""
|
696
708
|
# in case the inferred output column names dimension is different
|
697
709
|
# we use one line of snowpark dataframe and put it into sklearn estimator using pandas
|
698
|
-
|
710
|
+
|
711
|
+
# For fit_predict method, a minimum of 2 is required by MinCovDet, BayesianGaussianMixture
|
712
|
+
# so change the minimum of number of rows to 2
|
713
|
+
num_examples = 2
|
714
|
+
statement_params = telemetry.get_function_usage_statement_params(
|
715
|
+
project=_PROJECT,
|
716
|
+
subproject=_SUBPROJECT,
|
717
|
+
function_name=telemetry.get_statement_params_full_func_name(
|
718
|
+
inspect.currentframe(), LogisticRegression.__class__.__name__
|
719
|
+
),
|
720
|
+
api_calls=[Session.call],
|
721
|
+
custom_tags={"autogen": True} if self._autogenerated else None,
|
722
|
+
)
|
723
|
+
if output_cols_prefix == "fit_predict_":
|
724
|
+
if hasattr(self._sklearn_object, "n_clusters"):
|
725
|
+
# cluster classes such as BisectingKMeansTest requires # of examples >= n_clusters
|
726
|
+
num_examples = self._sklearn_object.n_clusters
|
727
|
+
elif hasattr(self._sklearn_object, "min_samples"):
|
728
|
+
# OPTICS default min_samples 5, which requires at least 5 lines of data
|
729
|
+
num_examples = self._sklearn_object.min_samples
|
730
|
+
elif hasattr(self._sklearn_object, "n_neighbors") and hasattr(self._sklearn_object, "n_samples"):
|
731
|
+
# LocalOutlierFactor expects n_neighbors <= n_samples
|
732
|
+
num_examples = self._sklearn_object.n_neighbors
|
733
|
+
sample_pd_df = dataset.select(self.input_cols).limit(num_examples).to_pandas(statement_params=statement_params)
|
734
|
+
else:
|
735
|
+
sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas(statement_params=statement_params)
|
699
736
|
|
700
737
|
# Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
|
701
738
|
# seen during the fit.
|
@@ -707,12 +744,14 @@ class LogisticRegression(BaseTransformer):
|
|
707
744
|
output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
|
708
745
|
if self.sample_weight_col:
|
709
746
|
output_df_columns_set -= set(self.sample_weight_col)
|
747
|
+
|
710
748
|
# if the dimension of inferred output column names is correct; use it
|
711
749
|
if len(expected_output_cols_list) == len(output_df_columns_set):
|
712
|
-
return expected_output_cols_list
|
750
|
+
return expected_output_cols_list, output_df_pd
|
713
751
|
# otherwise, use the sklearn estimator's output
|
714
752
|
else:
|
715
|
-
|
753
|
+
expected_output_cols_list = sorted(list(output_df_columns_set), key=lambda x: output_df_columns.index(x))
|
754
|
+
return expected_output_cols_list, output_df_pd[expected_output_cols_list]
|
716
755
|
|
717
756
|
@available_if(original_estimator_has_callable("predict_proba")) # type: ignore[misc]
|
718
757
|
@telemetry.send_api_usage_telemetry(
|
@@ -760,7 +799,7 @@ class LogisticRegression(BaseTransformer):
|
|
760
799
|
drop_input_cols=self._drop_input_cols,
|
761
800
|
expected_output_cols_type="float",
|
762
801
|
)
|
763
|
-
expected_output_cols = self.
|
802
|
+
expected_output_cols, _ = self._align_expected_output(
|
764
803
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
765
804
|
)
|
766
805
|
|
@@ -828,7 +867,7 @@ class LogisticRegression(BaseTransformer):
|
|
828
867
|
drop_input_cols=self._drop_input_cols,
|
829
868
|
expected_output_cols_type="float",
|
830
869
|
)
|
831
|
-
expected_output_cols = self.
|
870
|
+
expected_output_cols, _ = self._align_expected_output(
|
832
871
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
833
872
|
)
|
834
873
|
elif isinstance(dataset, pd.DataFrame):
|
@@ -893,7 +932,7 @@ class LogisticRegression(BaseTransformer):
|
|
893
932
|
drop_input_cols=self._drop_input_cols,
|
894
933
|
expected_output_cols_type="float",
|
895
934
|
)
|
896
|
-
expected_output_cols = self.
|
935
|
+
expected_output_cols, _ = self._align_expected_output(
|
897
936
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
898
937
|
)
|
899
938
|
|
@@ -958,7 +997,7 @@ class LogisticRegression(BaseTransformer):
|
|
958
997
|
drop_input_cols = self._drop_input_cols,
|
959
998
|
expected_output_cols_type="float",
|
960
999
|
)
|
961
|
-
expected_output_cols = self.
|
1000
|
+
expected_output_cols, _ = self._align_expected_output(
|
962
1001
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
963
1002
|
)
|
964
1003
|
|
@@ -4,14 +4,12 @@
|
|
4
4
|
#
|
5
5
|
import inspect
|
6
6
|
import os
|
7
|
-
import
|
8
|
-
from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
|
9
|
-
from typing_extensions import TypeGuard
|
7
|
+
from typing import Iterable, Optional, Union, List, Any, Dict, Set, Tuple
|
10
8
|
from uuid import uuid4
|
11
9
|
|
12
10
|
import cloudpickle as cp
|
13
|
-
import pandas as pd
|
14
11
|
import numpy as np
|
12
|
+
import pandas as pd
|
15
13
|
from numpy import typing as npt
|
16
14
|
|
17
15
|
|
@@ -24,12 +22,11 @@ from snowflake.ml.modeling.framework.base import BaseTransformer, _process_cols
|
|
24
22
|
from snowflake.ml._internal import telemetry
|
25
23
|
from snowflake.ml._internal.exceptions import error_codes, exceptions, modeling_error_messages
|
26
24
|
from snowflake.ml._internal.env_utils import SNOWML_SPROC_ENV
|
27
|
-
from snowflake.ml._internal.utils import
|
25
|
+
from snowflake.ml._internal.utils import identifier
|
28
26
|
from snowflake.snowpark import DataFrame, Session
|
29
27
|
from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
|
30
28
|
from snowflake.ml.modeling._internal.model_trainer_builder import ModelTrainerBuilder
|
31
29
|
from snowflake.ml.modeling._internal.transformer_protocols import (
|
32
|
-
ModelTransformHandlers,
|
33
30
|
BatchInferenceKwargsTypedDict,
|
34
31
|
ScoreKwargsTypedDict
|
35
32
|
)
|
@@ -627,12 +624,23 @@ class LogisticRegressionCV(BaseTransformer):
|
|
627
624
|
autogenerated=self._autogenerated,
|
628
625
|
subproject=_SUBPROJECT,
|
629
626
|
)
|
630
|
-
|
631
|
-
|
632
|
-
expected_output_cols_list=(
|
633
|
-
self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
|
634
|
-
),
|
627
|
+
expected_output_cols = (
|
628
|
+
self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
|
635
629
|
)
|
630
|
+
if isinstance(dataset, DataFrame):
|
631
|
+
expected_output_cols, example_output_pd_df = self._align_expected_output(
|
632
|
+
"fit_predict", dataset, expected_output_cols, output_cols_prefix
|
633
|
+
)
|
634
|
+
output_result, fitted_estimator = model_trainer.train_fit_predict(
|
635
|
+
drop_input_cols=self._drop_input_cols,
|
636
|
+
expected_output_cols_list=expected_output_cols,
|
637
|
+
example_output_pd_df=example_output_pd_df,
|
638
|
+
)
|
639
|
+
else:
|
640
|
+
output_result, fitted_estimator = model_trainer.train_fit_predict(
|
641
|
+
drop_input_cols=self._drop_input_cols,
|
642
|
+
expected_output_cols_list=expected_output_cols,
|
643
|
+
)
|
636
644
|
self._sklearn_object = fitted_estimator
|
637
645
|
self._is_fitted = True
|
638
646
|
return output_result
|
@@ -711,12 +719,41 @@ class LogisticRegressionCV(BaseTransformer):
|
|
711
719
|
|
712
720
|
return rv
|
713
721
|
|
714
|
-
def
|
715
|
-
self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str
|
716
|
-
) -> List[str]:
|
722
|
+
def _align_expected_output(
|
723
|
+
self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str,
|
724
|
+
) -> Tuple[List[str], pd.DataFrame]:
|
725
|
+
""" Run 1 line of data with the desired method, and return one tuple that consists of the output column names
|
726
|
+
and output dataframe with 1 line.
|
727
|
+
If the method is fit_predict, run 2 lines of data.
|
728
|
+
"""
|
717
729
|
# in case the inferred output column names dimension is different
|
718
730
|
# we use one line of snowpark dataframe and put it into sklearn estimator using pandas
|
719
|
-
|
731
|
+
|
732
|
+
# For fit_predict method, a minimum of 2 is required by MinCovDet, BayesianGaussianMixture
|
733
|
+
# so change the minimum of number of rows to 2
|
734
|
+
num_examples = 2
|
735
|
+
statement_params = telemetry.get_function_usage_statement_params(
|
736
|
+
project=_PROJECT,
|
737
|
+
subproject=_SUBPROJECT,
|
738
|
+
function_name=telemetry.get_statement_params_full_func_name(
|
739
|
+
inspect.currentframe(), LogisticRegressionCV.__class__.__name__
|
740
|
+
),
|
741
|
+
api_calls=[Session.call],
|
742
|
+
custom_tags={"autogen": True} if self._autogenerated else None,
|
743
|
+
)
|
744
|
+
if output_cols_prefix == "fit_predict_":
|
745
|
+
if hasattr(self._sklearn_object, "n_clusters"):
|
746
|
+
# cluster classes such as BisectingKMeansTest requires # of examples >= n_clusters
|
747
|
+
num_examples = self._sklearn_object.n_clusters
|
748
|
+
elif hasattr(self._sklearn_object, "min_samples"):
|
749
|
+
# OPTICS default min_samples 5, which requires at least 5 lines of data
|
750
|
+
num_examples = self._sklearn_object.min_samples
|
751
|
+
elif hasattr(self._sklearn_object, "n_neighbors") and hasattr(self._sklearn_object, "n_samples"):
|
752
|
+
# LocalOutlierFactor expects n_neighbors <= n_samples
|
753
|
+
num_examples = self._sklearn_object.n_neighbors
|
754
|
+
sample_pd_df = dataset.select(self.input_cols).limit(num_examples).to_pandas(statement_params=statement_params)
|
755
|
+
else:
|
756
|
+
sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas(statement_params=statement_params)
|
720
757
|
|
721
758
|
# Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
|
722
759
|
# seen during the fit.
|
@@ -728,12 +765,14 @@ class LogisticRegressionCV(BaseTransformer):
|
|
728
765
|
output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
|
729
766
|
if self.sample_weight_col:
|
730
767
|
output_df_columns_set -= set(self.sample_weight_col)
|
768
|
+
|
731
769
|
# if the dimension of inferred output column names is correct; use it
|
732
770
|
if len(expected_output_cols_list) == len(output_df_columns_set):
|
733
|
-
return expected_output_cols_list
|
771
|
+
return expected_output_cols_list, output_df_pd
|
734
772
|
# otherwise, use the sklearn estimator's output
|
735
773
|
else:
|
736
|
-
|
774
|
+
expected_output_cols_list = sorted(list(output_df_columns_set), key=lambda x: output_df_columns.index(x))
|
775
|
+
return expected_output_cols_list, output_df_pd[expected_output_cols_list]
|
737
776
|
|
738
777
|
@available_if(original_estimator_has_callable("predict_proba")) # type: ignore[misc]
|
739
778
|
@telemetry.send_api_usage_telemetry(
|
@@ -781,7 +820,7 @@ class LogisticRegressionCV(BaseTransformer):
|
|
781
820
|
drop_input_cols=self._drop_input_cols,
|
782
821
|
expected_output_cols_type="float",
|
783
822
|
)
|
784
|
-
expected_output_cols = self.
|
823
|
+
expected_output_cols, _ = self._align_expected_output(
|
785
824
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
786
825
|
)
|
787
826
|
|
@@ -849,7 +888,7 @@ class LogisticRegressionCV(BaseTransformer):
|
|
849
888
|
drop_input_cols=self._drop_input_cols,
|
850
889
|
expected_output_cols_type="float",
|
851
890
|
)
|
852
|
-
expected_output_cols = self.
|
891
|
+
expected_output_cols, _ = self._align_expected_output(
|
853
892
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
854
893
|
)
|
855
894
|
elif isinstance(dataset, pd.DataFrame):
|
@@ -914,7 +953,7 @@ class LogisticRegressionCV(BaseTransformer):
|
|
914
953
|
drop_input_cols=self._drop_input_cols,
|
915
954
|
expected_output_cols_type="float",
|
916
955
|
)
|
917
|
-
expected_output_cols = self.
|
956
|
+
expected_output_cols, _ = self._align_expected_output(
|
918
957
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
919
958
|
)
|
920
959
|
|
@@ -979,7 +1018,7 @@ class LogisticRegressionCV(BaseTransformer):
|
|
979
1018
|
drop_input_cols = self._drop_input_cols,
|
980
1019
|
expected_output_cols_type="float",
|
981
1020
|
)
|
982
|
-
expected_output_cols = self.
|
1021
|
+
expected_output_cols, _ = self._align_expected_output(
|
983
1022
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
984
1023
|
)
|
985
1024
|
|
@@ -4,14 +4,12 @@
|
|
4
4
|
#
|
5
5
|
import inspect
|
6
6
|
import os
|
7
|
-
import
|
8
|
-
from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
|
9
|
-
from typing_extensions import TypeGuard
|
7
|
+
from typing import Iterable, Optional, Union, List, Any, Dict, Set, Tuple
|
10
8
|
from uuid import uuid4
|
11
9
|
|
12
10
|
import cloudpickle as cp
|
13
|
-
import pandas as pd
|
14
11
|
import numpy as np
|
12
|
+
import pandas as pd
|
15
13
|
from numpy import typing as npt
|
16
14
|
|
17
15
|
|
@@ -24,12 +22,11 @@ from snowflake.ml.modeling.framework.base import BaseTransformer, _process_cols
|
|
24
22
|
from snowflake.ml._internal import telemetry
|
25
23
|
from snowflake.ml._internal.exceptions import error_codes, exceptions, modeling_error_messages
|
26
24
|
from snowflake.ml._internal.env_utils import SNOWML_SPROC_ENV
|
27
|
-
from snowflake.ml._internal.utils import
|
25
|
+
from snowflake.ml._internal.utils import identifier
|
28
26
|
from snowflake.snowpark import DataFrame, Session
|
29
27
|
from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
|
30
28
|
from snowflake.ml.modeling._internal.model_trainer_builder import ModelTrainerBuilder
|
31
29
|
from snowflake.ml.modeling._internal.transformer_protocols import (
|
32
|
-
ModelTransformHandlers,
|
33
30
|
BatchInferenceKwargsTypedDict,
|
34
31
|
ScoreKwargsTypedDict
|
35
32
|
)
|
@@ -525,12 +522,23 @@ class MultiTaskElasticNet(BaseTransformer):
|
|
525
522
|
autogenerated=self._autogenerated,
|
526
523
|
subproject=_SUBPROJECT,
|
527
524
|
)
|
528
|
-
|
529
|
-
|
530
|
-
expected_output_cols_list=(
|
531
|
-
self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
|
532
|
-
),
|
525
|
+
expected_output_cols = (
|
526
|
+
self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
|
533
527
|
)
|
528
|
+
if isinstance(dataset, DataFrame):
|
529
|
+
expected_output_cols, example_output_pd_df = self._align_expected_output(
|
530
|
+
"fit_predict", dataset, expected_output_cols, output_cols_prefix
|
531
|
+
)
|
532
|
+
output_result, fitted_estimator = model_trainer.train_fit_predict(
|
533
|
+
drop_input_cols=self._drop_input_cols,
|
534
|
+
expected_output_cols_list=expected_output_cols,
|
535
|
+
example_output_pd_df=example_output_pd_df,
|
536
|
+
)
|
537
|
+
else:
|
538
|
+
output_result, fitted_estimator = model_trainer.train_fit_predict(
|
539
|
+
drop_input_cols=self._drop_input_cols,
|
540
|
+
expected_output_cols_list=expected_output_cols,
|
541
|
+
)
|
534
542
|
self._sklearn_object = fitted_estimator
|
535
543
|
self._is_fitted = True
|
536
544
|
return output_result
|
@@ -609,12 +617,41 @@ class MultiTaskElasticNet(BaseTransformer):
|
|
609
617
|
|
610
618
|
return rv
|
611
619
|
|
612
|
-
def
|
613
|
-
self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str
|
614
|
-
) -> List[str]:
|
620
|
+
def _align_expected_output(
|
621
|
+
self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str,
|
622
|
+
) -> Tuple[List[str], pd.DataFrame]:
|
623
|
+
""" Run 1 line of data with the desired method, and return one tuple that consists of the output column names
|
624
|
+
and output dataframe with 1 line.
|
625
|
+
If the method is fit_predict, run 2 lines of data.
|
626
|
+
"""
|
615
627
|
# in case the inferred output column names dimension is different
|
616
628
|
# we use one line of snowpark dataframe and put it into sklearn estimator using pandas
|
617
|
-
|
629
|
+
|
630
|
+
# For fit_predict method, a minimum of 2 is required by MinCovDet, BayesianGaussianMixture
|
631
|
+
# so change the minimum of number of rows to 2
|
632
|
+
num_examples = 2
|
633
|
+
statement_params = telemetry.get_function_usage_statement_params(
|
634
|
+
project=_PROJECT,
|
635
|
+
subproject=_SUBPROJECT,
|
636
|
+
function_name=telemetry.get_statement_params_full_func_name(
|
637
|
+
inspect.currentframe(), MultiTaskElasticNet.__class__.__name__
|
638
|
+
),
|
639
|
+
api_calls=[Session.call],
|
640
|
+
custom_tags={"autogen": True} if self._autogenerated else None,
|
641
|
+
)
|
642
|
+
if output_cols_prefix == "fit_predict_":
|
643
|
+
if hasattr(self._sklearn_object, "n_clusters"):
|
644
|
+
# cluster classes such as BisectingKMeansTest requires # of examples >= n_clusters
|
645
|
+
num_examples = self._sklearn_object.n_clusters
|
646
|
+
elif hasattr(self._sklearn_object, "min_samples"):
|
647
|
+
# OPTICS default min_samples 5, which requires at least 5 lines of data
|
648
|
+
num_examples = self._sklearn_object.min_samples
|
649
|
+
elif hasattr(self._sklearn_object, "n_neighbors") and hasattr(self._sklearn_object, "n_samples"):
|
650
|
+
# LocalOutlierFactor expects n_neighbors <= n_samples
|
651
|
+
num_examples = self._sklearn_object.n_neighbors
|
652
|
+
sample_pd_df = dataset.select(self.input_cols).limit(num_examples).to_pandas(statement_params=statement_params)
|
653
|
+
else:
|
654
|
+
sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas(statement_params=statement_params)
|
618
655
|
|
619
656
|
# Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
|
620
657
|
# seen during the fit.
|
@@ -626,12 +663,14 @@ class MultiTaskElasticNet(BaseTransformer):
|
|
626
663
|
output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
|
627
664
|
if self.sample_weight_col:
|
628
665
|
output_df_columns_set -= set(self.sample_weight_col)
|
666
|
+
|
629
667
|
# if the dimension of inferred output column names is correct; use it
|
630
668
|
if len(expected_output_cols_list) == len(output_df_columns_set):
|
631
|
-
return expected_output_cols_list
|
669
|
+
return expected_output_cols_list, output_df_pd
|
632
670
|
# otherwise, use the sklearn estimator's output
|
633
671
|
else:
|
634
|
-
|
672
|
+
expected_output_cols_list = sorted(list(output_df_columns_set), key=lambda x: output_df_columns.index(x))
|
673
|
+
return expected_output_cols_list, output_df_pd[expected_output_cols_list]
|
635
674
|
|
636
675
|
@available_if(original_estimator_has_callable("predict_proba")) # type: ignore[misc]
|
637
676
|
@telemetry.send_api_usage_telemetry(
|
@@ -677,7 +716,7 @@ class MultiTaskElasticNet(BaseTransformer):
|
|
677
716
|
drop_input_cols=self._drop_input_cols,
|
678
717
|
expected_output_cols_type="float",
|
679
718
|
)
|
680
|
-
expected_output_cols = self.
|
719
|
+
expected_output_cols, _ = self._align_expected_output(
|
681
720
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
682
721
|
)
|
683
722
|
|
@@ -743,7 +782,7 @@ class MultiTaskElasticNet(BaseTransformer):
|
|
743
782
|
drop_input_cols=self._drop_input_cols,
|
744
783
|
expected_output_cols_type="float",
|
745
784
|
)
|
746
|
-
expected_output_cols = self.
|
785
|
+
expected_output_cols, _ = self._align_expected_output(
|
747
786
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
748
787
|
)
|
749
788
|
elif isinstance(dataset, pd.DataFrame):
|
@@ -806,7 +845,7 @@ class MultiTaskElasticNet(BaseTransformer):
|
|
806
845
|
drop_input_cols=self._drop_input_cols,
|
807
846
|
expected_output_cols_type="float",
|
808
847
|
)
|
809
|
-
expected_output_cols = self.
|
848
|
+
expected_output_cols, _ = self._align_expected_output(
|
810
849
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
811
850
|
)
|
812
851
|
|
@@ -871,7 +910,7 @@ class MultiTaskElasticNet(BaseTransformer):
|
|
871
910
|
drop_input_cols = self._drop_input_cols,
|
872
911
|
expected_output_cols_type="float",
|
873
912
|
)
|
874
|
-
expected_output_cols = self.
|
913
|
+
expected_output_cols, _ = self._align_expected_output(
|
875
914
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
876
915
|
)
|
877
916
|
|
@@ -4,14 +4,12 @@
|
|
4
4
|
#
|
5
5
|
import inspect
|
6
6
|
import os
|
7
|
-
import
|
8
|
-
from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
|
9
|
-
from typing_extensions import TypeGuard
|
7
|
+
from typing import Iterable, Optional, Union, List, Any, Dict, Set, Tuple
|
10
8
|
from uuid import uuid4
|
11
9
|
|
12
10
|
import cloudpickle as cp
|
13
|
-
import pandas as pd
|
14
11
|
import numpy as np
|
12
|
+
import pandas as pd
|
15
13
|
from numpy import typing as npt
|
16
14
|
|
17
15
|
|
@@ -24,12 +22,11 @@ from snowflake.ml.modeling.framework.base import BaseTransformer, _process_cols
|
|
24
22
|
from snowflake.ml._internal import telemetry
|
25
23
|
from snowflake.ml._internal.exceptions import error_codes, exceptions, modeling_error_messages
|
26
24
|
from snowflake.ml._internal.env_utils import SNOWML_SPROC_ENV
|
27
|
-
from snowflake.ml._internal.utils import
|
25
|
+
from snowflake.ml._internal.utils import identifier
|
28
26
|
from snowflake.snowpark import DataFrame, Session
|
29
27
|
from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
|
30
28
|
from snowflake.ml.modeling._internal.model_trainer_builder import ModelTrainerBuilder
|
31
29
|
from snowflake.ml.modeling._internal.transformer_protocols import (
|
32
|
-
ModelTransformHandlers,
|
33
30
|
BatchInferenceKwargsTypedDict,
|
34
31
|
ScoreKwargsTypedDict
|
35
32
|
)
|
@@ -566,12 +563,23 @@ class MultiTaskElasticNetCV(BaseTransformer):
|
|
566
563
|
autogenerated=self._autogenerated,
|
567
564
|
subproject=_SUBPROJECT,
|
568
565
|
)
|
569
|
-
|
570
|
-
|
571
|
-
expected_output_cols_list=(
|
572
|
-
self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
|
573
|
-
),
|
566
|
+
expected_output_cols = (
|
567
|
+
self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
|
574
568
|
)
|
569
|
+
if isinstance(dataset, DataFrame):
|
570
|
+
expected_output_cols, example_output_pd_df = self._align_expected_output(
|
571
|
+
"fit_predict", dataset, expected_output_cols, output_cols_prefix
|
572
|
+
)
|
573
|
+
output_result, fitted_estimator = model_trainer.train_fit_predict(
|
574
|
+
drop_input_cols=self._drop_input_cols,
|
575
|
+
expected_output_cols_list=expected_output_cols,
|
576
|
+
example_output_pd_df=example_output_pd_df,
|
577
|
+
)
|
578
|
+
else:
|
579
|
+
output_result, fitted_estimator = model_trainer.train_fit_predict(
|
580
|
+
drop_input_cols=self._drop_input_cols,
|
581
|
+
expected_output_cols_list=expected_output_cols,
|
582
|
+
)
|
575
583
|
self._sklearn_object = fitted_estimator
|
576
584
|
self._is_fitted = True
|
577
585
|
return output_result
|
@@ -650,12 +658,41 @@ class MultiTaskElasticNetCV(BaseTransformer):
|
|
650
658
|
|
651
659
|
return rv
|
652
660
|
|
653
|
-
def
|
654
|
-
self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str
|
655
|
-
) -> List[str]:
|
661
|
+
def _align_expected_output(
|
662
|
+
self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str,
|
663
|
+
) -> Tuple[List[str], pd.DataFrame]:
|
664
|
+
""" Run 1 line of data with the desired method, and return one tuple that consists of the output column names
|
665
|
+
and output dataframe with 1 line.
|
666
|
+
If the method is fit_predict, run 2 lines of data.
|
667
|
+
"""
|
656
668
|
# in case the inferred output column names dimension is different
|
657
669
|
# we use one line of snowpark dataframe and put it into sklearn estimator using pandas
|
658
|
-
|
670
|
+
|
671
|
+
# For fit_predict method, a minimum of 2 is required by MinCovDet, BayesianGaussianMixture
|
672
|
+
# so change the minimum of number of rows to 2
|
673
|
+
num_examples = 2
|
674
|
+
statement_params = telemetry.get_function_usage_statement_params(
|
675
|
+
project=_PROJECT,
|
676
|
+
subproject=_SUBPROJECT,
|
677
|
+
function_name=telemetry.get_statement_params_full_func_name(
|
678
|
+
inspect.currentframe(), MultiTaskElasticNetCV.__class__.__name__
|
679
|
+
),
|
680
|
+
api_calls=[Session.call],
|
681
|
+
custom_tags={"autogen": True} if self._autogenerated else None,
|
682
|
+
)
|
683
|
+
if output_cols_prefix == "fit_predict_":
|
684
|
+
if hasattr(self._sklearn_object, "n_clusters"):
|
685
|
+
# cluster classes such as BisectingKMeansTest requires # of examples >= n_clusters
|
686
|
+
num_examples = self._sklearn_object.n_clusters
|
687
|
+
elif hasattr(self._sklearn_object, "min_samples"):
|
688
|
+
# OPTICS default min_samples 5, which requires at least 5 lines of data
|
689
|
+
num_examples = self._sklearn_object.min_samples
|
690
|
+
elif hasattr(self._sklearn_object, "n_neighbors") and hasattr(self._sklearn_object, "n_samples"):
|
691
|
+
# LocalOutlierFactor expects n_neighbors <= n_samples
|
692
|
+
num_examples = self._sklearn_object.n_neighbors
|
693
|
+
sample_pd_df = dataset.select(self.input_cols).limit(num_examples).to_pandas(statement_params=statement_params)
|
694
|
+
else:
|
695
|
+
sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas(statement_params=statement_params)
|
659
696
|
|
660
697
|
# Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
|
661
698
|
# seen during the fit.
|
@@ -667,12 +704,14 @@ class MultiTaskElasticNetCV(BaseTransformer):
|
|
667
704
|
output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
|
668
705
|
if self.sample_weight_col:
|
669
706
|
output_df_columns_set -= set(self.sample_weight_col)
|
707
|
+
|
670
708
|
# if the dimension of inferred output column names is correct; use it
|
671
709
|
if len(expected_output_cols_list) == len(output_df_columns_set):
|
672
|
-
return expected_output_cols_list
|
710
|
+
return expected_output_cols_list, output_df_pd
|
673
711
|
# otherwise, use the sklearn estimator's output
|
674
712
|
else:
|
675
|
-
|
713
|
+
expected_output_cols_list = sorted(list(output_df_columns_set), key=lambda x: output_df_columns.index(x))
|
714
|
+
return expected_output_cols_list, output_df_pd[expected_output_cols_list]
|
676
715
|
|
677
716
|
@available_if(original_estimator_has_callable("predict_proba")) # type: ignore[misc]
|
678
717
|
@telemetry.send_api_usage_telemetry(
|
@@ -718,7 +757,7 @@ class MultiTaskElasticNetCV(BaseTransformer):
|
|
718
757
|
drop_input_cols=self._drop_input_cols,
|
719
758
|
expected_output_cols_type="float",
|
720
759
|
)
|
721
|
-
expected_output_cols = self.
|
760
|
+
expected_output_cols, _ = self._align_expected_output(
|
722
761
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
723
762
|
)
|
724
763
|
|
@@ -784,7 +823,7 @@ class MultiTaskElasticNetCV(BaseTransformer):
|
|
784
823
|
drop_input_cols=self._drop_input_cols,
|
785
824
|
expected_output_cols_type="float",
|
786
825
|
)
|
787
|
-
expected_output_cols = self.
|
826
|
+
expected_output_cols, _ = self._align_expected_output(
|
788
827
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
789
828
|
)
|
790
829
|
elif isinstance(dataset, pd.DataFrame):
|
@@ -847,7 +886,7 @@ class MultiTaskElasticNetCV(BaseTransformer):
|
|
847
886
|
drop_input_cols=self._drop_input_cols,
|
848
887
|
expected_output_cols_type="float",
|
849
888
|
)
|
850
|
-
expected_output_cols = self.
|
889
|
+
expected_output_cols, _ = self._align_expected_output(
|
851
890
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
852
891
|
)
|
853
892
|
|
@@ -912,7 +951,7 @@ class MultiTaskElasticNetCV(BaseTransformer):
|
|
912
951
|
drop_input_cols = self._drop_input_cols,
|
913
952
|
expected_output_cols_type="float",
|
914
953
|
)
|
915
|
-
expected_output_cols = self.
|
954
|
+
expected_output_cols, _ = self._align_expected_output(
|
916
955
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
917
956
|
)
|
918
957
|
|