snowflake-ml-python 1.6.1__py3-none-any.whl → 1.6.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- snowflake/ml/_internal/telemetry.py +142 -20
- snowflake/ml/_internal/utils/identifier.py +48 -11
- snowflake/ml/_internal/utils/snowflake_env.py +23 -13
- snowflake/ml/_internal/utils/sql_identifier.py +1 -1
- snowflake/ml/_internal/utils/table_manager.py +19 -1
- snowflake/ml/_internal/utils/uri.py +2 -2
- snowflake/ml/data/data_connector.py +33 -7
- snowflake/ml/data/torch_utils.py +68 -0
- snowflake/ml/dataset/dataset.py +1 -3
- snowflake/ml/feature_store/feature_store.py +41 -17
- snowflake/ml/feature_store/feature_view.py +2 -2
- snowflake/ml/fileset/embedded_stage_fs.py +1 -1
- snowflake/ml/fileset/fileset.py +1 -1
- snowflake/ml/fileset/sfcfs.py +9 -3
- snowflake/ml/model/_client/model/model_version_impl.py +22 -7
- snowflake/ml/model/_client/ops/model_ops.py +39 -3
- snowflake/ml/model/_client/ops/service_ops.py +198 -7
- snowflake/ml/model/_client/service/model_deployment_spec.py +4 -5
- snowflake/ml/model/_client/service/model_deployment_spec_schema.py +1 -2
- snowflake/ml/model/_client/sql/service.py +85 -18
- snowflake/ml/model/_deploy_client/image_builds/server_image_builder.py +1 -1
- snowflake/ml/model/_deploy_client/snowservice/deploy.py +3 -3
- snowflake/ml/model/_model_composer/model_composer.py +2 -0
- snowflake/ml/model/_model_composer/model_manifest/model_manifest.py +3 -8
- snowflake/ml/model/_packager/model_handlers/_utils.py +46 -14
- snowflake/ml/model/_packager/model_handlers/catboost.py +17 -15
- snowflake/ml/model/_packager/model_handlers/huggingface_pipeline.py +23 -15
- snowflake/ml/model/_packager/model_handlers/lightgbm.py +15 -57
- snowflake/ml/model/_packager/model_handlers/llm.py +4 -2
- snowflake/ml/model/_packager/model_handlers/model_objective_utils.py +116 -0
- snowflake/ml/model/_packager/model_handlers/sklearn.py +36 -24
- snowflake/ml/model/_packager/model_handlers/snowmlmodel.py +119 -6
- snowflake/ml/model/_packager/model_handlers/torchscript.py +2 -2
- snowflake/ml/model/_packager/model_handlers/xgboost.py +48 -48
- snowflake/ml/model/_packager/model_meta/model_meta.py +10 -7
- snowflake/ml/model/_packager/model_meta/model_meta_schema.py +0 -8
- snowflake/ml/model/_packager/model_packager.py +2 -0
- snowflake/ml/model/_signatures/pytorch_handler.py +1 -1
- snowflake/ml/model/_signatures/utils.py +9 -0
- snowflake/ml/model/models/llm.py +3 -1
- snowflake/ml/model/type_hints.py +9 -1
- snowflake/ml/modeling/_internal/constants.py +1 -0
- snowflake/ml/modeling/_internal/local_implementations/pandas_handlers.py +5 -5
- snowflake/ml/modeling/_internal/local_implementations/pandas_trainer.py +9 -6
- snowflake/ml/modeling/_internal/model_specifications.py +2 -0
- snowflake/ml/modeling/_internal/model_trainer.py +1 -0
- snowflake/ml/modeling/_internal/snowpark_implementations/distributed_hpo_trainer.py +2 -2
- snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_handlers.py +5 -5
- snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_trainer.py +113 -160
- snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +60 -21
- snowflake/ml/modeling/cluster/affinity_propagation.py +60 -21
- snowflake/ml/modeling/cluster/agglomerative_clustering.py +60 -21
- snowflake/ml/modeling/cluster/birch.py +60 -21
- snowflake/ml/modeling/cluster/bisecting_k_means.py +60 -21
- snowflake/ml/modeling/cluster/dbscan.py +60 -21
- snowflake/ml/modeling/cluster/feature_agglomeration.py +60 -21
- snowflake/ml/modeling/cluster/k_means.py +60 -21
- snowflake/ml/modeling/cluster/mean_shift.py +60 -21
- snowflake/ml/modeling/cluster/mini_batch_k_means.py +60 -21
- snowflake/ml/modeling/cluster/optics.py +60 -21
- snowflake/ml/modeling/cluster/spectral_biclustering.py +60 -21
- snowflake/ml/modeling/cluster/spectral_clustering.py +60 -21
- snowflake/ml/modeling/cluster/spectral_coclustering.py +60 -21
- snowflake/ml/modeling/compose/column_transformer.py +60 -21
- snowflake/ml/modeling/compose/transformed_target_regressor.py +60 -21
- snowflake/ml/modeling/covariance/elliptic_envelope.py +60 -21
- snowflake/ml/modeling/covariance/empirical_covariance.py +60 -21
- snowflake/ml/modeling/covariance/graphical_lasso.py +60 -21
- snowflake/ml/modeling/covariance/graphical_lasso_cv.py +60 -21
- snowflake/ml/modeling/covariance/ledoit_wolf.py +60 -21
- snowflake/ml/modeling/covariance/min_cov_det.py +60 -21
- snowflake/ml/modeling/covariance/oas.py +60 -21
- snowflake/ml/modeling/covariance/shrunk_covariance.py +60 -21
- snowflake/ml/modeling/decomposition/dictionary_learning.py +60 -21
- snowflake/ml/modeling/decomposition/factor_analysis.py +60 -21
- snowflake/ml/modeling/decomposition/fast_ica.py +60 -21
- snowflake/ml/modeling/decomposition/incremental_pca.py +60 -21
- snowflake/ml/modeling/decomposition/kernel_pca.py +60 -21
- snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +60 -21
- snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +60 -21
- snowflake/ml/modeling/decomposition/pca.py +60 -21
- snowflake/ml/modeling/decomposition/sparse_pca.py +60 -21
- snowflake/ml/modeling/decomposition/truncated_svd.py +60 -21
- snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +60 -21
- snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +60 -21
- snowflake/ml/modeling/ensemble/ada_boost_classifier.py +60 -21
- snowflake/ml/modeling/ensemble/ada_boost_regressor.py +60 -21
- snowflake/ml/modeling/ensemble/bagging_classifier.py +60 -21
- snowflake/ml/modeling/ensemble/bagging_regressor.py +60 -21
- snowflake/ml/modeling/ensemble/extra_trees_classifier.py +60 -21
- snowflake/ml/modeling/ensemble/extra_trees_regressor.py +60 -21
- snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +60 -21
- snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +60 -21
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +60 -21
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +60 -21
- snowflake/ml/modeling/ensemble/isolation_forest.py +60 -21
- snowflake/ml/modeling/ensemble/random_forest_classifier.py +60 -21
- snowflake/ml/modeling/ensemble/random_forest_regressor.py +60 -21
- snowflake/ml/modeling/ensemble/stacking_regressor.py +60 -21
- snowflake/ml/modeling/ensemble/voting_classifier.py +60 -21
- snowflake/ml/modeling/ensemble/voting_regressor.py +60 -21
- snowflake/ml/modeling/feature_selection/generic_univariate_select.py +60 -21
- snowflake/ml/modeling/feature_selection/select_fdr.py +60 -21
- snowflake/ml/modeling/feature_selection/select_fpr.py +60 -21
- snowflake/ml/modeling/feature_selection/select_fwe.py +60 -21
- snowflake/ml/modeling/feature_selection/select_k_best.py +60 -21
- snowflake/ml/modeling/feature_selection/select_percentile.py +60 -21
- snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +60 -21
- snowflake/ml/modeling/feature_selection/variance_threshold.py +60 -21
- snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +60 -21
- snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +60 -21
- snowflake/ml/modeling/impute/iterative_imputer.py +60 -21
- snowflake/ml/modeling/impute/knn_imputer.py +60 -21
- snowflake/ml/modeling/impute/missing_indicator.py +60 -21
- snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +60 -21
- snowflake/ml/modeling/kernel_approximation/nystroem.py +60 -21
- snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +60 -21
- snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +60 -21
- snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +60 -21
- snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +60 -21
- snowflake/ml/modeling/lightgbm/lgbm_classifier.py +60 -21
- snowflake/ml/modeling/lightgbm/lgbm_regressor.py +60 -21
- snowflake/ml/modeling/linear_model/ard_regression.py +60 -21
- snowflake/ml/modeling/linear_model/bayesian_ridge.py +60 -21
- snowflake/ml/modeling/linear_model/elastic_net.py +60 -21
- snowflake/ml/modeling/linear_model/elastic_net_cv.py +60 -21
- snowflake/ml/modeling/linear_model/gamma_regressor.py +60 -21
- snowflake/ml/modeling/linear_model/huber_regressor.py +60 -21
- snowflake/ml/modeling/linear_model/lars.py +60 -21
- snowflake/ml/modeling/linear_model/lars_cv.py +60 -21
- snowflake/ml/modeling/linear_model/lasso.py +60 -21
- snowflake/ml/modeling/linear_model/lasso_cv.py +60 -21
- snowflake/ml/modeling/linear_model/lasso_lars.py +60 -21
- snowflake/ml/modeling/linear_model/lasso_lars_cv.py +60 -21
- snowflake/ml/modeling/linear_model/lasso_lars_ic.py +60 -21
- snowflake/ml/modeling/linear_model/linear_regression.py +60 -21
- snowflake/ml/modeling/linear_model/logistic_regression.py +60 -21
- snowflake/ml/modeling/linear_model/logistic_regression_cv.py +60 -21
- snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +60 -21
- snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +60 -21
- snowflake/ml/modeling/linear_model/multi_task_lasso.py +60 -21
- snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +60 -21
- snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +60 -21
- snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +60 -21
- snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +60 -21
- snowflake/ml/modeling/linear_model/perceptron.py +60 -21
- snowflake/ml/modeling/linear_model/poisson_regressor.py +60 -21
- snowflake/ml/modeling/linear_model/ransac_regressor.py +60 -21
- snowflake/ml/modeling/linear_model/ridge.py +60 -21
- snowflake/ml/modeling/linear_model/ridge_classifier.py +60 -21
- snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +60 -21
- snowflake/ml/modeling/linear_model/ridge_cv.py +60 -21
- snowflake/ml/modeling/linear_model/sgd_classifier.py +60 -21
- snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +60 -21
- snowflake/ml/modeling/linear_model/sgd_regressor.py +60 -21
- snowflake/ml/modeling/linear_model/theil_sen_regressor.py +60 -21
- snowflake/ml/modeling/linear_model/tweedie_regressor.py +60 -21
- snowflake/ml/modeling/manifold/isomap.py +60 -21
- snowflake/ml/modeling/manifold/mds.py +60 -21
- snowflake/ml/modeling/manifold/spectral_embedding.py +60 -21
- snowflake/ml/modeling/manifold/tsne.py +60 -21
- snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +60 -21
- snowflake/ml/modeling/mixture/gaussian_mixture.py +60 -21
- snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +60 -21
- snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +60 -21
- snowflake/ml/modeling/multiclass/output_code_classifier.py +60 -21
- snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +60 -21
- snowflake/ml/modeling/naive_bayes/categorical_nb.py +60 -21
- snowflake/ml/modeling/naive_bayes/complement_nb.py +60 -21
- snowflake/ml/modeling/naive_bayes/gaussian_nb.py +60 -21
- snowflake/ml/modeling/naive_bayes/multinomial_nb.py +60 -21
- snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +60 -21
- snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +60 -21
- snowflake/ml/modeling/neighbors/kernel_density.py +60 -21
- snowflake/ml/modeling/neighbors/local_outlier_factor.py +60 -21
- snowflake/ml/modeling/neighbors/nearest_centroid.py +60 -21
- snowflake/ml/modeling/neighbors/nearest_neighbors.py +60 -21
- snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +60 -21
- snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +60 -21
- snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +60 -21
- snowflake/ml/modeling/neural_network/bernoulli_rbm.py +60 -21
- snowflake/ml/modeling/neural_network/mlp_classifier.py +60 -21
- snowflake/ml/modeling/neural_network/mlp_regressor.py +60 -21
- snowflake/ml/modeling/parameters/disable_model_tracer.py +5 -0
- snowflake/ml/modeling/pipeline/pipeline.py +1 -12
- snowflake/ml/modeling/preprocessing/polynomial_features.py +60 -21
- snowflake/ml/modeling/semi_supervised/label_propagation.py +60 -21
- snowflake/ml/modeling/semi_supervised/label_spreading.py +60 -21
- snowflake/ml/modeling/svm/linear_svc.py +60 -21
- snowflake/ml/modeling/svm/linear_svr.py +60 -21
- snowflake/ml/modeling/svm/nu_svc.py +60 -21
- snowflake/ml/modeling/svm/nu_svr.py +60 -21
- snowflake/ml/modeling/svm/svc.py +60 -21
- snowflake/ml/modeling/svm/svr.py +60 -21
- snowflake/ml/modeling/tree/decision_tree_classifier.py +60 -21
- snowflake/ml/modeling/tree/decision_tree_regressor.py +60 -21
- snowflake/ml/modeling/tree/extra_tree_classifier.py +60 -21
- snowflake/ml/modeling/tree/extra_tree_regressor.py +60 -21
- snowflake/ml/modeling/xgboost/xgb_classifier.py +63 -23
- snowflake/ml/modeling/xgboost/xgb_regressor.py +63 -23
- snowflake/ml/modeling/xgboost/xgbrf_classifier.py +63 -23
- snowflake/ml/modeling/xgboost/xgbrf_regressor.py +63 -23
- snowflake/ml/registry/_manager/model_manager.py +4 -0
- snowflake/ml/registry/model_registry.py +1 -1
- snowflake/ml/registry/registry.py +1 -2
- snowflake/ml/version.py +1 -1
- {snowflake_ml_python-1.6.1.dist-info → snowflake_ml_python-1.6.2.dist-info}/METADATA +23 -4
- {snowflake_ml_python-1.6.1.dist-info → snowflake_ml_python-1.6.2.dist-info}/RECORD +211 -209
- {snowflake_ml_python-1.6.1.dist-info → snowflake_ml_python-1.6.2.dist-info}/WHEEL +1 -1
- snowflake/ml/data/torch_dataset.py +0 -33
- {snowflake_ml_python-1.6.1.dist-info → snowflake_ml_python-1.6.2.dist-info}/LICENSE.txt +0 -0
- {snowflake_ml_python-1.6.1.dist-info → snowflake_ml_python-1.6.2.dist-info}/top_level.txt +0 -0
@@ -4,14 +4,12 @@
|
|
4
4
|
#
|
5
5
|
import inspect
|
6
6
|
import os
|
7
|
-
import
|
8
|
-
from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
|
9
|
-
from typing_extensions import TypeGuard
|
7
|
+
from typing import Iterable, Optional, Union, List, Any, Dict, Set, Tuple
|
10
8
|
from uuid import uuid4
|
11
9
|
|
12
10
|
import cloudpickle as cp
|
13
|
-
import pandas as pd
|
14
11
|
import numpy as np
|
12
|
+
import pandas as pd
|
15
13
|
from numpy import typing as npt
|
16
14
|
|
17
15
|
|
@@ -24,12 +22,11 @@ from snowflake.ml.modeling.framework.base import BaseTransformer, _process_cols
|
|
24
22
|
from snowflake.ml._internal import telemetry
|
25
23
|
from snowflake.ml._internal.exceptions import error_codes, exceptions, modeling_error_messages
|
26
24
|
from snowflake.ml._internal.env_utils import SNOWML_SPROC_ENV
|
27
|
-
from snowflake.ml._internal.utils import
|
25
|
+
from snowflake.ml._internal.utils import identifier
|
28
26
|
from snowflake.snowpark import DataFrame, Session
|
29
27
|
from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
|
30
28
|
from snowflake.ml.modeling._internal.model_trainer_builder import ModelTrainerBuilder
|
31
29
|
from snowflake.ml.modeling._internal.transformer_protocols import (
|
32
|
-
ModelTransformHandlers,
|
33
30
|
BatchInferenceKwargsTypedDict,
|
34
31
|
ScoreKwargsTypedDict
|
35
32
|
)
|
@@ -660,12 +657,23 @@ class MLPClassifier(BaseTransformer):
|
|
660
657
|
autogenerated=self._autogenerated,
|
661
658
|
subproject=_SUBPROJECT,
|
662
659
|
)
|
663
|
-
|
664
|
-
|
665
|
-
expected_output_cols_list=(
|
666
|
-
self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
|
667
|
-
),
|
660
|
+
expected_output_cols = (
|
661
|
+
self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
|
668
662
|
)
|
663
|
+
if isinstance(dataset, DataFrame):
|
664
|
+
expected_output_cols, example_output_pd_df = self._align_expected_output(
|
665
|
+
"fit_predict", dataset, expected_output_cols, output_cols_prefix
|
666
|
+
)
|
667
|
+
output_result, fitted_estimator = model_trainer.train_fit_predict(
|
668
|
+
drop_input_cols=self._drop_input_cols,
|
669
|
+
expected_output_cols_list=expected_output_cols,
|
670
|
+
example_output_pd_df=example_output_pd_df,
|
671
|
+
)
|
672
|
+
else:
|
673
|
+
output_result, fitted_estimator = model_trainer.train_fit_predict(
|
674
|
+
drop_input_cols=self._drop_input_cols,
|
675
|
+
expected_output_cols_list=expected_output_cols,
|
676
|
+
)
|
669
677
|
self._sklearn_object = fitted_estimator
|
670
678
|
self._is_fitted = True
|
671
679
|
return output_result
|
@@ -744,12 +752,41 @@ class MLPClassifier(BaseTransformer):
|
|
744
752
|
|
745
753
|
return rv
|
746
754
|
|
747
|
-
def
|
748
|
-
self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str
|
749
|
-
) -> List[str]:
|
755
|
+
def _align_expected_output(
|
756
|
+
self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str,
|
757
|
+
) -> Tuple[List[str], pd.DataFrame]:
|
758
|
+
""" Run 1 line of data with the desired method, and return one tuple that consists of the output column names
|
759
|
+
and output dataframe with 1 line.
|
760
|
+
If the method is fit_predict, run 2 lines of data.
|
761
|
+
"""
|
750
762
|
# in case the inferred output column names dimension is different
|
751
763
|
# we use one line of snowpark dataframe and put it into sklearn estimator using pandas
|
752
|
-
|
764
|
+
|
765
|
+
# For fit_predict method, a minimum of 2 is required by MinCovDet, BayesianGaussianMixture
|
766
|
+
# so change the minimum of number of rows to 2
|
767
|
+
num_examples = 2
|
768
|
+
statement_params = telemetry.get_function_usage_statement_params(
|
769
|
+
project=_PROJECT,
|
770
|
+
subproject=_SUBPROJECT,
|
771
|
+
function_name=telemetry.get_statement_params_full_func_name(
|
772
|
+
inspect.currentframe(), MLPClassifier.__class__.__name__
|
773
|
+
),
|
774
|
+
api_calls=[Session.call],
|
775
|
+
custom_tags={"autogen": True} if self._autogenerated else None,
|
776
|
+
)
|
777
|
+
if output_cols_prefix == "fit_predict_":
|
778
|
+
if hasattr(self._sklearn_object, "n_clusters"):
|
779
|
+
# cluster classes such as BisectingKMeansTest requires # of examples >= n_clusters
|
780
|
+
num_examples = self._sklearn_object.n_clusters
|
781
|
+
elif hasattr(self._sklearn_object, "min_samples"):
|
782
|
+
# OPTICS default min_samples 5, which requires at least 5 lines of data
|
783
|
+
num_examples = self._sklearn_object.min_samples
|
784
|
+
elif hasattr(self._sklearn_object, "n_neighbors") and hasattr(self._sklearn_object, "n_samples"):
|
785
|
+
# LocalOutlierFactor expects n_neighbors <= n_samples
|
786
|
+
num_examples = self._sklearn_object.n_neighbors
|
787
|
+
sample_pd_df = dataset.select(self.input_cols).limit(num_examples).to_pandas(statement_params=statement_params)
|
788
|
+
else:
|
789
|
+
sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas(statement_params=statement_params)
|
753
790
|
|
754
791
|
# Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
|
755
792
|
# seen during the fit.
|
@@ -761,12 +798,14 @@ class MLPClassifier(BaseTransformer):
|
|
761
798
|
output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
|
762
799
|
if self.sample_weight_col:
|
763
800
|
output_df_columns_set -= set(self.sample_weight_col)
|
801
|
+
|
764
802
|
# if the dimension of inferred output column names is correct; use it
|
765
803
|
if len(expected_output_cols_list) == len(output_df_columns_set):
|
766
|
-
return expected_output_cols_list
|
804
|
+
return expected_output_cols_list, output_df_pd
|
767
805
|
# otherwise, use the sklearn estimator's output
|
768
806
|
else:
|
769
|
-
|
807
|
+
expected_output_cols_list = sorted(list(output_df_columns_set), key=lambda x: output_df_columns.index(x))
|
808
|
+
return expected_output_cols_list, output_df_pd[expected_output_cols_list]
|
770
809
|
|
771
810
|
@available_if(original_estimator_has_callable("predict_proba")) # type: ignore[misc]
|
772
811
|
@telemetry.send_api_usage_telemetry(
|
@@ -814,7 +853,7 @@ class MLPClassifier(BaseTransformer):
|
|
814
853
|
drop_input_cols=self._drop_input_cols,
|
815
854
|
expected_output_cols_type="float",
|
816
855
|
)
|
817
|
-
expected_output_cols = self.
|
856
|
+
expected_output_cols, _ = self._align_expected_output(
|
818
857
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
819
858
|
)
|
820
859
|
|
@@ -882,7 +921,7 @@ class MLPClassifier(BaseTransformer):
|
|
882
921
|
drop_input_cols=self._drop_input_cols,
|
883
922
|
expected_output_cols_type="float",
|
884
923
|
)
|
885
|
-
expected_output_cols = self.
|
924
|
+
expected_output_cols, _ = self._align_expected_output(
|
886
925
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
887
926
|
)
|
888
927
|
elif isinstance(dataset, pd.DataFrame):
|
@@ -945,7 +984,7 @@ class MLPClassifier(BaseTransformer):
|
|
945
984
|
drop_input_cols=self._drop_input_cols,
|
946
985
|
expected_output_cols_type="float",
|
947
986
|
)
|
948
|
-
expected_output_cols = self.
|
987
|
+
expected_output_cols, _ = self._align_expected_output(
|
949
988
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
950
989
|
)
|
951
990
|
|
@@ -1010,7 +1049,7 @@ class MLPClassifier(BaseTransformer):
|
|
1010
1049
|
drop_input_cols = self._drop_input_cols,
|
1011
1050
|
expected_output_cols_type="float",
|
1012
1051
|
)
|
1013
|
-
expected_output_cols = self.
|
1052
|
+
expected_output_cols, _ = self._align_expected_output(
|
1014
1053
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
1015
1054
|
)
|
1016
1055
|
|
@@ -4,14 +4,12 @@
|
|
4
4
|
#
|
5
5
|
import inspect
|
6
6
|
import os
|
7
|
-
import
|
8
|
-
from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
|
9
|
-
from typing_extensions import TypeGuard
|
7
|
+
from typing import Iterable, Optional, Union, List, Any, Dict, Set, Tuple
|
10
8
|
from uuid import uuid4
|
11
9
|
|
12
10
|
import cloudpickle as cp
|
13
|
-
import pandas as pd
|
14
11
|
import numpy as np
|
12
|
+
import pandas as pd
|
15
13
|
from numpy import typing as npt
|
16
14
|
|
17
15
|
|
@@ -24,12 +22,11 @@ from snowflake.ml.modeling.framework.base import BaseTransformer, _process_cols
|
|
24
22
|
from snowflake.ml._internal import telemetry
|
25
23
|
from snowflake.ml._internal.exceptions import error_codes, exceptions, modeling_error_messages
|
26
24
|
from snowflake.ml._internal.env_utils import SNOWML_SPROC_ENV
|
27
|
-
from snowflake.ml._internal.utils import
|
25
|
+
from snowflake.ml._internal.utils import identifier
|
28
26
|
from snowflake.snowpark import DataFrame, Session
|
29
27
|
from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
|
30
28
|
from snowflake.ml.modeling._internal.model_trainer_builder import ModelTrainerBuilder
|
31
29
|
from snowflake.ml.modeling._internal.transformer_protocols import (
|
32
|
-
ModelTransformHandlers,
|
33
30
|
BatchInferenceKwargsTypedDict,
|
34
31
|
ScoreKwargsTypedDict
|
35
32
|
)
|
@@ -656,12 +653,23 @@ class MLPRegressor(BaseTransformer):
|
|
656
653
|
autogenerated=self._autogenerated,
|
657
654
|
subproject=_SUBPROJECT,
|
658
655
|
)
|
659
|
-
|
660
|
-
|
661
|
-
expected_output_cols_list=(
|
662
|
-
self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
|
663
|
-
),
|
656
|
+
expected_output_cols = (
|
657
|
+
self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
|
664
658
|
)
|
659
|
+
if isinstance(dataset, DataFrame):
|
660
|
+
expected_output_cols, example_output_pd_df = self._align_expected_output(
|
661
|
+
"fit_predict", dataset, expected_output_cols, output_cols_prefix
|
662
|
+
)
|
663
|
+
output_result, fitted_estimator = model_trainer.train_fit_predict(
|
664
|
+
drop_input_cols=self._drop_input_cols,
|
665
|
+
expected_output_cols_list=expected_output_cols,
|
666
|
+
example_output_pd_df=example_output_pd_df,
|
667
|
+
)
|
668
|
+
else:
|
669
|
+
output_result, fitted_estimator = model_trainer.train_fit_predict(
|
670
|
+
drop_input_cols=self._drop_input_cols,
|
671
|
+
expected_output_cols_list=expected_output_cols,
|
672
|
+
)
|
665
673
|
self._sklearn_object = fitted_estimator
|
666
674
|
self._is_fitted = True
|
667
675
|
return output_result
|
@@ -740,12 +748,41 @@ class MLPRegressor(BaseTransformer):
|
|
740
748
|
|
741
749
|
return rv
|
742
750
|
|
743
|
-
def
|
744
|
-
self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str
|
745
|
-
) -> List[str]:
|
751
|
+
def _align_expected_output(
|
752
|
+
self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str,
|
753
|
+
) -> Tuple[List[str], pd.DataFrame]:
|
754
|
+
""" Run 1 line of data with the desired method, and return one tuple that consists of the output column names
|
755
|
+
and output dataframe with 1 line.
|
756
|
+
If the method is fit_predict, run 2 lines of data.
|
757
|
+
"""
|
746
758
|
# in case the inferred output column names dimension is different
|
747
759
|
# we use one line of snowpark dataframe and put it into sklearn estimator using pandas
|
748
|
-
|
760
|
+
|
761
|
+
# For fit_predict method, a minimum of 2 is required by MinCovDet, BayesianGaussianMixture
|
762
|
+
# so change the minimum of number of rows to 2
|
763
|
+
num_examples = 2
|
764
|
+
statement_params = telemetry.get_function_usage_statement_params(
|
765
|
+
project=_PROJECT,
|
766
|
+
subproject=_SUBPROJECT,
|
767
|
+
function_name=telemetry.get_statement_params_full_func_name(
|
768
|
+
inspect.currentframe(), MLPRegressor.__class__.__name__
|
769
|
+
),
|
770
|
+
api_calls=[Session.call],
|
771
|
+
custom_tags={"autogen": True} if self._autogenerated else None,
|
772
|
+
)
|
773
|
+
if output_cols_prefix == "fit_predict_":
|
774
|
+
if hasattr(self._sklearn_object, "n_clusters"):
|
775
|
+
# cluster classes such as BisectingKMeansTest requires # of examples >= n_clusters
|
776
|
+
num_examples = self._sklearn_object.n_clusters
|
777
|
+
elif hasattr(self._sklearn_object, "min_samples"):
|
778
|
+
# OPTICS default min_samples 5, which requires at least 5 lines of data
|
779
|
+
num_examples = self._sklearn_object.min_samples
|
780
|
+
elif hasattr(self._sklearn_object, "n_neighbors") and hasattr(self._sklearn_object, "n_samples"):
|
781
|
+
# LocalOutlierFactor expects n_neighbors <= n_samples
|
782
|
+
num_examples = self._sklearn_object.n_neighbors
|
783
|
+
sample_pd_df = dataset.select(self.input_cols).limit(num_examples).to_pandas(statement_params=statement_params)
|
784
|
+
else:
|
785
|
+
sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas(statement_params=statement_params)
|
749
786
|
|
750
787
|
# Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
|
751
788
|
# seen during the fit.
|
@@ -757,12 +794,14 @@ class MLPRegressor(BaseTransformer):
|
|
757
794
|
output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
|
758
795
|
if self.sample_weight_col:
|
759
796
|
output_df_columns_set -= set(self.sample_weight_col)
|
797
|
+
|
760
798
|
# if the dimension of inferred output column names is correct; use it
|
761
799
|
if len(expected_output_cols_list) == len(output_df_columns_set):
|
762
|
-
return expected_output_cols_list
|
800
|
+
return expected_output_cols_list, output_df_pd
|
763
801
|
# otherwise, use the sklearn estimator's output
|
764
802
|
else:
|
765
|
-
|
803
|
+
expected_output_cols_list = sorted(list(output_df_columns_set), key=lambda x: output_df_columns.index(x))
|
804
|
+
return expected_output_cols_list, output_df_pd[expected_output_cols_list]
|
766
805
|
|
767
806
|
@available_if(original_estimator_has_callable("predict_proba")) # type: ignore[misc]
|
768
807
|
@telemetry.send_api_usage_telemetry(
|
@@ -808,7 +847,7 @@ class MLPRegressor(BaseTransformer):
|
|
808
847
|
drop_input_cols=self._drop_input_cols,
|
809
848
|
expected_output_cols_type="float",
|
810
849
|
)
|
811
|
-
expected_output_cols = self.
|
850
|
+
expected_output_cols, _ = self._align_expected_output(
|
812
851
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
813
852
|
)
|
814
853
|
|
@@ -874,7 +913,7 @@ class MLPRegressor(BaseTransformer):
|
|
874
913
|
drop_input_cols=self._drop_input_cols,
|
875
914
|
expected_output_cols_type="float",
|
876
915
|
)
|
877
|
-
expected_output_cols = self.
|
916
|
+
expected_output_cols, _ = self._align_expected_output(
|
878
917
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
879
918
|
)
|
880
919
|
elif isinstance(dataset, pd.DataFrame):
|
@@ -937,7 +976,7 @@ class MLPRegressor(BaseTransformer):
|
|
937
976
|
drop_input_cols=self._drop_input_cols,
|
938
977
|
expected_output_cols_type="float",
|
939
978
|
)
|
940
|
-
expected_output_cols = self.
|
979
|
+
expected_output_cols, _ = self._align_expected_output(
|
941
980
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
942
981
|
)
|
943
982
|
|
@@ -1002,7 +1041,7 @@ class MLPRegressor(BaseTransformer):
|
|
1002
1041
|
drop_input_cols = self._drop_input_cols,
|
1003
1042
|
expected_output_cols_type="float",
|
1004
1043
|
)
|
1005
|
-
expected_output_cols = self.
|
1044
|
+
expected_output_cols, _ = self._align_expected_output(
|
1006
1045
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
1007
1046
|
)
|
1008
1047
|
|
@@ -418,9 +418,6 @@ class Pipeline(base.BaseTransformer):
|
|
418
418
|
|
419
419
|
Returns:
|
420
420
|
Fitted pipeline.
|
421
|
-
|
422
|
-
Raises:
|
423
|
-
ValueError: A pipeline incompatible with sklearn is used on MLRS
|
424
421
|
"""
|
425
422
|
|
426
423
|
self._validate_steps()
|
@@ -437,8 +434,6 @@ class Pipeline(base.BaseTransformer):
|
|
437
434
|
lineage_utils.set_data_sources(self, data_sources)
|
438
435
|
|
439
436
|
if self._can_be_trained_in_ml_runtime(dataset):
|
440
|
-
if not self._is_convertible_to_sklearn:
|
441
|
-
raise ValueError("This pipeline cannot be converted to an sklearn pipeline.")
|
442
437
|
self._fit_ml_runtime(dataset)
|
443
438
|
|
444
439
|
elif squash and isinstance(dataset, snowpark.DataFrame):
|
@@ -611,14 +606,8 @@ class Pipeline(base.BaseTransformer):
|
|
611
606
|
|
612
607
|
Returns:
|
613
608
|
Output dataset.
|
614
|
-
|
615
|
-
Raises:
|
616
|
-
ValueError: An sklearn object has not been fit and stored before calling this function.
|
617
609
|
"""
|
618
|
-
if os.environ.get(IN_ML_RUNTIME_ENV_VAR):
|
619
|
-
if self._sklearn_object is None:
|
620
|
-
raise ValueError("Model must be fit before inference.")
|
621
|
-
|
610
|
+
if os.environ.get(IN_ML_RUNTIME_ENV_VAR) and self._sklearn_object is not None:
|
622
611
|
expected_output_cols = self._infer_output_cols()
|
623
612
|
handler = ModelTransformerBuilder.build(
|
624
613
|
dataset=dataset,
|
@@ -4,14 +4,12 @@
|
|
4
4
|
#
|
5
5
|
import inspect
|
6
6
|
import os
|
7
|
-
import
|
8
|
-
from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
|
9
|
-
from typing_extensions import TypeGuard
|
7
|
+
from typing import Iterable, Optional, Union, List, Any, Dict, Set, Tuple
|
10
8
|
from uuid import uuid4
|
11
9
|
|
12
10
|
import cloudpickle as cp
|
13
|
-
import pandas as pd
|
14
11
|
import numpy as np
|
12
|
+
import pandas as pd
|
15
13
|
from numpy import typing as npt
|
16
14
|
|
17
15
|
|
@@ -24,12 +22,11 @@ from snowflake.ml.modeling.framework.base import BaseTransformer, _process_cols
|
|
24
22
|
from snowflake.ml._internal import telemetry
|
25
23
|
from snowflake.ml._internal.exceptions import error_codes, exceptions, modeling_error_messages
|
26
24
|
from snowflake.ml._internal.env_utils import SNOWML_SPROC_ENV
|
27
|
-
from snowflake.ml._internal.utils import
|
25
|
+
from snowflake.ml._internal.utils import identifier
|
28
26
|
from snowflake.snowpark import DataFrame, Session
|
29
27
|
from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
|
30
28
|
from snowflake.ml.modeling._internal.model_trainer_builder import ModelTrainerBuilder
|
31
29
|
from snowflake.ml.modeling._internal.transformer_protocols import (
|
32
|
-
ModelTransformHandlers,
|
33
30
|
BatchInferenceKwargsTypedDict,
|
34
31
|
ScoreKwargsTypedDict
|
35
32
|
)
|
@@ -497,12 +494,23 @@ class PolynomialFeatures(BaseTransformer):
|
|
497
494
|
autogenerated=self._autogenerated,
|
498
495
|
subproject=_SUBPROJECT,
|
499
496
|
)
|
500
|
-
|
501
|
-
|
502
|
-
expected_output_cols_list=(
|
503
|
-
self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
|
504
|
-
),
|
497
|
+
expected_output_cols = (
|
498
|
+
self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
|
505
499
|
)
|
500
|
+
if isinstance(dataset, DataFrame):
|
501
|
+
expected_output_cols, example_output_pd_df = self._align_expected_output(
|
502
|
+
"fit_predict", dataset, expected_output_cols, output_cols_prefix
|
503
|
+
)
|
504
|
+
output_result, fitted_estimator = model_trainer.train_fit_predict(
|
505
|
+
drop_input_cols=self._drop_input_cols,
|
506
|
+
expected_output_cols_list=expected_output_cols,
|
507
|
+
example_output_pd_df=example_output_pd_df,
|
508
|
+
)
|
509
|
+
else:
|
510
|
+
output_result, fitted_estimator = model_trainer.train_fit_predict(
|
511
|
+
drop_input_cols=self._drop_input_cols,
|
512
|
+
expected_output_cols_list=expected_output_cols,
|
513
|
+
)
|
506
514
|
self._sklearn_object = fitted_estimator
|
507
515
|
self._is_fitted = True
|
508
516
|
return output_result
|
@@ -583,12 +591,41 @@ class PolynomialFeatures(BaseTransformer):
|
|
583
591
|
|
584
592
|
return rv
|
585
593
|
|
586
|
-
def
|
587
|
-
self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str
|
588
|
-
) -> List[str]:
|
594
|
+
def _align_expected_output(
|
595
|
+
self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str,
|
596
|
+
) -> Tuple[List[str], pd.DataFrame]:
|
597
|
+
""" Run 1 line of data with the desired method, and return one tuple that consists of the output column names
|
598
|
+
and output dataframe with 1 line.
|
599
|
+
If the method is fit_predict, run 2 lines of data.
|
600
|
+
"""
|
589
601
|
# in case the inferred output column names dimension is different
|
590
602
|
# we use one line of snowpark dataframe and put it into sklearn estimator using pandas
|
591
|
-
|
603
|
+
|
604
|
+
# For fit_predict method, a minimum of 2 is required by MinCovDet, BayesianGaussianMixture
|
605
|
+
# so change the minimum of number of rows to 2
|
606
|
+
num_examples = 2
|
607
|
+
statement_params = telemetry.get_function_usage_statement_params(
|
608
|
+
project=_PROJECT,
|
609
|
+
subproject=_SUBPROJECT,
|
610
|
+
function_name=telemetry.get_statement_params_full_func_name(
|
611
|
+
inspect.currentframe(), PolynomialFeatures.__class__.__name__
|
612
|
+
),
|
613
|
+
api_calls=[Session.call],
|
614
|
+
custom_tags={"autogen": True} if self._autogenerated else None,
|
615
|
+
)
|
616
|
+
if output_cols_prefix == "fit_predict_":
|
617
|
+
if hasattr(self._sklearn_object, "n_clusters"):
|
618
|
+
# cluster classes such as BisectingKMeansTest requires # of examples >= n_clusters
|
619
|
+
num_examples = self._sklearn_object.n_clusters
|
620
|
+
elif hasattr(self._sklearn_object, "min_samples"):
|
621
|
+
# OPTICS default min_samples 5, which requires at least 5 lines of data
|
622
|
+
num_examples = self._sklearn_object.min_samples
|
623
|
+
elif hasattr(self._sklearn_object, "n_neighbors") and hasattr(self._sklearn_object, "n_samples"):
|
624
|
+
# LocalOutlierFactor expects n_neighbors <= n_samples
|
625
|
+
num_examples = self._sklearn_object.n_neighbors
|
626
|
+
sample_pd_df = dataset.select(self.input_cols).limit(num_examples).to_pandas(statement_params=statement_params)
|
627
|
+
else:
|
628
|
+
sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas(statement_params=statement_params)
|
592
629
|
|
593
630
|
# Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
|
594
631
|
# seen during the fit.
|
@@ -600,12 +637,14 @@ class PolynomialFeatures(BaseTransformer):
|
|
600
637
|
output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
|
601
638
|
if self.sample_weight_col:
|
602
639
|
output_df_columns_set -= set(self.sample_weight_col)
|
640
|
+
|
603
641
|
# if the dimension of inferred output column names is correct; use it
|
604
642
|
if len(expected_output_cols_list) == len(output_df_columns_set):
|
605
|
-
return expected_output_cols_list
|
643
|
+
return expected_output_cols_list, output_df_pd
|
606
644
|
# otherwise, use the sklearn estimator's output
|
607
645
|
else:
|
608
|
-
|
646
|
+
expected_output_cols_list = sorted(list(output_df_columns_set), key=lambda x: output_df_columns.index(x))
|
647
|
+
return expected_output_cols_list, output_df_pd[expected_output_cols_list]
|
609
648
|
|
610
649
|
@available_if(original_estimator_has_callable("predict_proba")) # type: ignore[misc]
|
611
650
|
@telemetry.send_api_usage_telemetry(
|
@@ -651,7 +690,7 @@ class PolynomialFeatures(BaseTransformer):
|
|
651
690
|
drop_input_cols=self._drop_input_cols,
|
652
691
|
expected_output_cols_type="float",
|
653
692
|
)
|
654
|
-
expected_output_cols = self.
|
693
|
+
expected_output_cols, _ = self._align_expected_output(
|
655
694
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
656
695
|
)
|
657
696
|
|
@@ -717,7 +756,7 @@ class PolynomialFeatures(BaseTransformer):
|
|
717
756
|
drop_input_cols=self._drop_input_cols,
|
718
757
|
expected_output_cols_type="float",
|
719
758
|
)
|
720
|
-
expected_output_cols = self.
|
759
|
+
expected_output_cols, _ = self._align_expected_output(
|
721
760
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
722
761
|
)
|
723
762
|
elif isinstance(dataset, pd.DataFrame):
|
@@ -780,7 +819,7 @@ class PolynomialFeatures(BaseTransformer):
|
|
780
819
|
drop_input_cols=self._drop_input_cols,
|
781
820
|
expected_output_cols_type="float",
|
782
821
|
)
|
783
|
-
expected_output_cols = self.
|
822
|
+
expected_output_cols, _ = self._align_expected_output(
|
784
823
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
785
824
|
)
|
786
825
|
|
@@ -845,7 +884,7 @@ class PolynomialFeatures(BaseTransformer):
|
|
845
884
|
drop_input_cols = self._drop_input_cols,
|
846
885
|
expected_output_cols_type="float",
|
847
886
|
)
|
848
|
-
expected_output_cols = self.
|
887
|
+
expected_output_cols, _ = self._align_expected_output(
|
849
888
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
850
889
|
)
|
851
890
|
|
@@ -4,14 +4,12 @@
|
|
4
4
|
#
|
5
5
|
import inspect
|
6
6
|
import os
|
7
|
-
import
|
8
|
-
from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
|
9
|
-
from typing_extensions import TypeGuard
|
7
|
+
from typing import Iterable, Optional, Union, List, Any, Dict, Set, Tuple
|
10
8
|
from uuid import uuid4
|
11
9
|
|
12
10
|
import cloudpickle as cp
|
13
|
-
import pandas as pd
|
14
11
|
import numpy as np
|
12
|
+
import pandas as pd
|
15
13
|
from numpy import typing as npt
|
16
14
|
|
17
15
|
|
@@ -24,12 +22,11 @@ from snowflake.ml.modeling.framework.base import BaseTransformer, _process_cols
|
|
24
22
|
from snowflake.ml._internal import telemetry
|
25
23
|
from snowflake.ml._internal.exceptions import error_codes, exceptions, modeling_error_messages
|
26
24
|
from snowflake.ml._internal.env_utils import SNOWML_SPROC_ENV
|
27
|
-
from snowflake.ml._internal.utils import
|
25
|
+
from snowflake.ml._internal.utils import identifier
|
28
26
|
from snowflake.snowpark import DataFrame, Session
|
29
27
|
from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
|
30
28
|
from snowflake.ml.modeling._internal.model_trainer_builder import ModelTrainerBuilder
|
31
29
|
from snowflake.ml.modeling._internal.transformer_protocols import (
|
32
|
-
ModelTransformHandlers,
|
33
30
|
BatchInferenceKwargsTypedDict,
|
34
31
|
ScoreKwargsTypedDict
|
35
32
|
)
|
@@ -501,12 +498,23 @@ class LabelPropagation(BaseTransformer):
|
|
501
498
|
autogenerated=self._autogenerated,
|
502
499
|
subproject=_SUBPROJECT,
|
503
500
|
)
|
504
|
-
|
505
|
-
|
506
|
-
expected_output_cols_list=(
|
507
|
-
self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
|
508
|
-
),
|
501
|
+
expected_output_cols = (
|
502
|
+
self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
|
509
503
|
)
|
504
|
+
if isinstance(dataset, DataFrame):
|
505
|
+
expected_output_cols, example_output_pd_df = self._align_expected_output(
|
506
|
+
"fit_predict", dataset, expected_output_cols, output_cols_prefix
|
507
|
+
)
|
508
|
+
output_result, fitted_estimator = model_trainer.train_fit_predict(
|
509
|
+
drop_input_cols=self._drop_input_cols,
|
510
|
+
expected_output_cols_list=expected_output_cols,
|
511
|
+
example_output_pd_df=example_output_pd_df,
|
512
|
+
)
|
513
|
+
else:
|
514
|
+
output_result, fitted_estimator = model_trainer.train_fit_predict(
|
515
|
+
drop_input_cols=self._drop_input_cols,
|
516
|
+
expected_output_cols_list=expected_output_cols,
|
517
|
+
)
|
510
518
|
self._sklearn_object = fitted_estimator
|
511
519
|
self._is_fitted = True
|
512
520
|
return output_result
|
@@ -585,12 +593,41 @@ class LabelPropagation(BaseTransformer):
|
|
585
593
|
|
586
594
|
return rv
|
587
595
|
|
588
|
-
def
|
589
|
-
self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str
|
590
|
-
) -> List[str]:
|
596
|
+
def _align_expected_output(
|
597
|
+
self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str,
|
598
|
+
) -> Tuple[List[str], pd.DataFrame]:
|
599
|
+
""" Run 1 line of data with the desired method, and return one tuple that consists of the output column names
|
600
|
+
and output dataframe with 1 line.
|
601
|
+
If the method is fit_predict, run 2 lines of data.
|
602
|
+
"""
|
591
603
|
# in case the inferred output column names dimension is different
|
592
604
|
# we use one line of snowpark dataframe and put it into sklearn estimator using pandas
|
593
|
-
|
605
|
+
|
606
|
+
# For fit_predict method, a minimum of 2 is required by MinCovDet, BayesianGaussianMixture
|
607
|
+
# so change the minimum of number of rows to 2
|
608
|
+
num_examples = 2
|
609
|
+
statement_params = telemetry.get_function_usage_statement_params(
|
610
|
+
project=_PROJECT,
|
611
|
+
subproject=_SUBPROJECT,
|
612
|
+
function_name=telemetry.get_statement_params_full_func_name(
|
613
|
+
inspect.currentframe(), LabelPropagation.__class__.__name__
|
614
|
+
),
|
615
|
+
api_calls=[Session.call],
|
616
|
+
custom_tags={"autogen": True} if self._autogenerated else None,
|
617
|
+
)
|
618
|
+
if output_cols_prefix == "fit_predict_":
|
619
|
+
if hasattr(self._sklearn_object, "n_clusters"):
|
620
|
+
# cluster classes such as BisectingKMeansTest requires # of examples >= n_clusters
|
621
|
+
num_examples = self._sklearn_object.n_clusters
|
622
|
+
elif hasattr(self._sklearn_object, "min_samples"):
|
623
|
+
# OPTICS default min_samples 5, which requires at least 5 lines of data
|
624
|
+
num_examples = self._sklearn_object.min_samples
|
625
|
+
elif hasattr(self._sklearn_object, "n_neighbors") and hasattr(self._sklearn_object, "n_samples"):
|
626
|
+
# LocalOutlierFactor expects n_neighbors <= n_samples
|
627
|
+
num_examples = self._sklearn_object.n_neighbors
|
628
|
+
sample_pd_df = dataset.select(self.input_cols).limit(num_examples).to_pandas(statement_params=statement_params)
|
629
|
+
else:
|
630
|
+
sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas(statement_params=statement_params)
|
594
631
|
|
595
632
|
# Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
|
596
633
|
# seen during the fit.
|
@@ -602,12 +639,14 @@ class LabelPropagation(BaseTransformer):
|
|
602
639
|
output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
|
603
640
|
if self.sample_weight_col:
|
604
641
|
output_df_columns_set -= set(self.sample_weight_col)
|
642
|
+
|
605
643
|
# if the dimension of inferred output column names is correct; use it
|
606
644
|
if len(expected_output_cols_list) == len(output_df_columns_set):
|
607
|
-
return expected_output_cols_list
|
645
|
+
return expected_output_cols_list, output_df_pd
|
608
646
|
# otherwise, use the sklearn estimator's output
|
609
647
|
else:
|
610
|
-
|
648
|
+
expected_output_cols_list = sorted(list(output_df_columns_set), key=lambda x: output_df_columns.index(x))
|
649
|
+
return expected_output_cols_list, output_df_pd[expected_output_cols_list]
|
611
650
|
|
612
651
|
@available_if(original_estimator_has_callable("predict_proba")) # type: ignore[misc]
|
613
652
|
@telemetry.send_api_usage_telemetry(
|
@@ -655,7 +694,7 @@ class LabelPropagation(BaseTransformer):
|
|
655
694
|
drop_input_cols=self._drop_input_cols,
|
656
695
|
expected_output_cols_type="float",
|
657
696
|
)
|
658
|
-
expected_output_cols = self.
|
697
|
+
expected_output_cols, _ = self._align_expected_output(
|
659
698
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
660
699
|
)
|
661
700
|
|
@@ -723,7 +762,7 @@ class LabelPropagation(BaseTransformer):
|
|
723
762
|
drop_input_cols=self._drop_input_cols,
|
724
763
|
expected_output_cols_type="float",
|
725
764
|
)
|
726
|
-
expected_output_cols = self.
|
765
|
+
expected_output_cols, _ = self._align_expected_output(
|
727
766
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
728
767
|
)
|
729
768
|
elif isinstance(dataset, pd.DataFrame):
|
@@ -786,7 +825,7 @@ class LabelPropagation(BaseTransformer):
|
|
786
825
|
drop_input_cols=self._drop_input_cols,
|
787
826
|
expected_output_cols_type="float",
|
788
827
|
)
|
789
|
-
expected_output_cols = self.
|
828
|
+
expected_output_cols, _ = self._align_expected_output(
|
790
829
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
791
830
|
)
|
792
831
|
|
@@ -851,7 +890,7 @@ class LabelPropagation(BaseTransformer):
|
|
851
890
|
drop_input_cols = self._drop_input_cols,
|
852
891
|
expected_output_cols_type="float",
|
853
892
|
)
|
854
|
-
expected_output_cols = self.
|
893
|
+
expected_output_cols, _ = self._align_expected_output(
|
855
894
|
inference_method, dataset, expected_output_cols, output_cols_prefix
|
856
895
|
)
|
857
896
|
|