snowflake-ml-python 1.5.1__py3-none-any.whl → 1.5.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- snowflake/cortex/_sentiment.py +7 -4
- snowflake/ml/_internal/utils/temp_file_utils.py +5 -2
- snowflake/ml/feature_store/access_manager.py +34 -30
- snowflake/ml/feature_store/feature_store.py +1 -1
- snowflake/ml/feature_store/feature_view.py +12 -11
- snowflake/ml/fileset/snowfs.py +2 -31
- snowflake/ml/model/_client/ops/model_ops.py +43 -0
- snowflake/ml/model/_client/sql/model_version.py +53 -1
- snowflake/ml/model/_model_composer/model_composer.py +6 -2
- snowflake/ml/model/_packager/model_meta/model_meta.py +1 -3
- snowflake/ml/model/_packager/model_runtime/model_runtime.py +3 -27
- snowflake/ml/modeling/_internal/snowpark_implementations/distributed_hpo_trainer.py +58 -139
- snowflake/ml/modeling/_internal/snowpark_implementations/distributed_search_udf_file.py +159 -0
- snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +8 -1
- snowflake/ml/modeling/cluster/affinity_propagation.py +8 -1
- snowflake/ml/modeling/cluster/agglomerative_clustering.py +8 -1
- snowflake/ml/modeling/cluster/birch.py +8 -1
- snowflake/ml/modeling/cluster/bisecting_k_means.py +8 -1
- snowflake/ml/modeling/cluster/dbscan.py +8 -1
- snowflake/ml/modeling/cluster/feature_agglomeration.py +8 -1
- snowflake/ml/modeling/cluster/k_means.py +8 -1
- snowflake/ml/modeling/cluster/mean_shift.py +8 -1
- snowflake/ml/modeling/cluster/mini_batch_k_means.py +8 -1
- snowflake/ml/modeling/cluster/optics.py +8 -1
- snowflake/ml/modeling/cluster/spectral_biclustering.py +8 -1
- snowflake/ml/modeling/cluster/spectral_clustering.py +8 -1
- snowflake/ml/modeling/cluster/spectral_coclustering.py +8 -1
- snowflake/ml/modeling/compose/column_transformer.py +8 -1
- snowflake/ml/modeling/compose/transformed_target_regressor.py +8 -1
- snowflake/ml/modeling/covariance/elliptic_envelope.py +8 -1
- snowflake/ml/modeling/covariance/empirical_covariance.py +8 -1
- snowflake/ml/modeling/covariance/graphical_lasso.py +8 -1
- snowflake/ml/modeling/covariance/graphical_lasso_cv.py +8 -1
- snowflake/ml/modeling/covariance/ledoit_wolf.py +8 -1
- snowflake/ml/modeling/covariance/min_cov_det.py +8 -1
- snowflake/ml/modeling/covariance/oas.py +8 -1
- snowflake/ml/modeling/covariance/shrunk_covariance.py +8 -1
- snowflake/ml/modeling/decomposition/dictionary_learning.py +8 -1
- snowflake/ml/modeling/decomposition/factor_analysis.py +8 -1
- snowflake/ml/modeling/decomposition/fast_ica.py +8 -1
- snowflake/ml/modeling/decomposition/incremental_pca.py +8 -1
- snowflake/ml/modeling/decomposition/kernel_pca.py +8 -1
- snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +8 -1
- snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +8 -1
- snowflake/ml/modeling/decomposition/pca.py +8 -1
- snowflake/ml/modeling/decomposition/sparse_pca.py +8 -1
- snowflake/ml/modeling/decomposition/truncated_svd.py +8 -1
- snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +8 -1
- snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +8 -1
- snowflake/ml/modeling/ensemble/ada_boost_classifier.py +8 -1
- snowflake/ml/modeling/ensemble/ada_boost_regressor.py +8 -1
- snowflake/ml/modeling/ensemble/bagging_classifier.py +8 -1
- snowflake/ml/modeling/ensemble/bagging_regressor.py +8 -1
- snowflake/ml/modeling/ensemble/extra_trees_classifier.py +8 -1
- snowflake/ml/modeling/ensemble/extra_trees_regressor.py +8 -1
- snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +8 -1
- snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +8 -1
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +8 -1
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +8 -1
- snowflake/ml/modeling/ensemble/isolation_forest.py +8 -1
- snowflake/ml/modeling/ensemble/random_forest_classifier.py +8 -1
- snowflake/ml/modeling/ensemble/random_forest_regressor.py +8 -1
- snowflake/ml/modeling/ensemble/stacking_regressor.py +8 -1
- snowflake/ml/modeling/ensemble/voting_classifier.py +8 -1
- snowflake/ml/modeling/ensemble/voting_regressor.py +8 -1
- snowflake/ml/modeling/feature_selection/generic_univariate_select.py +8 -1
- snowflake/ml/modeling/feature_selection/select_fdr.py +8 -1
- snowflake/ml/modeling/feature_selection/select_fpr.py +8 -1
- snowflake/ml/modeling/feature_selection/select_fwe.py +8 -1
- snowflake/ml/modeling/feature_selection/select_k_best.py +8 -1
- snowflake/ml/modeling/feature_selection/select_percentile.py +8 -1
- snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +8 -1
- snowflake/ml/modeling/feature_selection/variance_threshold.py +8 -1
- snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +8 -1
- snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +8 -1
- snowflake/ml/modeling/impute/iterative_imputer.py +8 -1
- snowflake/ml/modeling/impute/knn_imputer.py +8 -1
- snowflake/ml/modeling/impute/missing_indicator.py +8 -1
- snowflake/ml/modeling/impute/simple_imputer.py +21 -2
- snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +8 -1
- snowflake/ml/modeling/kernel_approximation/nystroem.py +8 -1
- snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +8 -1
- snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +8 -1
- snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +8 -1
- snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +8 -1
- snowflake/ml/modeling/lightgbm/lgbm_classifier.py +8 -1
- snowflake/ml/modeling/lightgbm/lgbm_regressor.py +8 -1
- snowflake/ml/modeling/linear_model/ard_regression.py +8 -1
- snowflake/ml/modeling/linear_model/bayesian_ridge.py +8 -1
- snowflake/ml/modeling/linear_model/elastic_net.py +8 -1
- snowflake/ml/modeling/linear_model/elastic_net_cv.py +8 -1
- snowflake/ml/modeling/linear_model/gamma_regressor.py +8 -1
- snowflake/ml/modeling/linear_model/huber_regressor.py +8 -1
- snowflake/ml/modeling/linear_model/lars.py +8 -1
- snowflake/ml/modeling/linear_model/lars_cv.py +8 -1
- snowflake/ml/modeling/linear_model/lasso.py +8 -1
- snowflake/ml/modeling/linear_model/lasso_cv.py +8 -1
- snowflake/ml/modeling/linear_model/lasso_lars.py +8 -1
- snowflake/ml/modeling/linear_model/lasso_lars_cv.py +8 -1
- snowflake/ml/modeling/linear_model/lasso_lars_ic.py +8 -1
- snowflake/ml/modeling/linear_model/linear_regression.py +8 -1
- snowflake/ml/modeling/linear_model/logistic_regression.py +8 -1
- snowflake/ml/modeling/linear_model/logistic_regression_cv.py +8 -1
- snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +8 -1
- snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +8 -1
- snowflake/ml/modeling/linear_model/multi_task_lasso.py +8 -1
- snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +8 -1
- snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +8 -1
- snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +8 -1
- snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +8 -1
- snowflake/ml/modeling/linear_model/perceptron.py +8 -1
- snowflake/ml/modeling/linear_model/poisson_regressor.py +8 -1
- snowflake/ml/modeling/linear_model/ransac_regressor.py +8 -1
- snowflake/ml/modeling/linear_model/ridge.py +8 -1
- snowflake/ml/modeling/linear_model/ridge_classifier.py +8 -1
- snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +8 -1
- snowflake/ml/modeling/linear_model/ridge_cv.py +8 -1
- snowflake/ml/modeling/linear_model/sgd_classifier.py +8 -1
- snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +8 -1
- snowflake/ml/modeling/linear_model/sgd_regressor.py +8 -1
- snowflake/ml/modeling/linear_model/theil_sen_regressor.py +8 -1
- snowflake/ml/modeling/linear_model/tweedie_regressor.py +8 -1
- snowflake/ml/modeling/manifold/isomap.py +8 -1
- snowflake/ml/modeling/manifold/mds.py +8 -1
- snowflake/ml/modeling/manifold/spectral_embedding.py +8 -1
- snowflake/ml/modeling/manifold/tsne.py +8 -1
- snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +8 -1
- snowflake/ml/modeling/mixture/gaussian_mixture.py +8 -1
- snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +8 -1
- snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +8 -1
- snowflake/ml/modeling/multiclass/output_code_classifier.py +8 -1
- snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +8 -1
- snowflake/ml/modeling/naive_bayes/categorical_nb.py +8 -1
- snowflake/ml/modeling/naive_bayes/complement_nb.py +8 -1
- snowflake/ml/modeling/naive_bayes/gaussian_nb.py +8 -1
- snowflake/ml/modeling/naive_bayes/multinomial_nb.py +8 -1
- snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +8 -1
- snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +8 -1
- snowflake/ml/modeling/neighbors/kernel_density.py +8 -1
- snowflake/ml/modeling/neighbors/local_outlier_factor.py +8 -1
- snowflake/ml/modeling/neighbors/nearest_centroid.py +8 -1
- snowflake/ml/modeling/neighbors/nearest_neighbors.py +8 -1
- snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +8 -1
- snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +8 -1
- snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +8 -1
- snowflake/ml/modeling/neural_network/bernoulli_rbm.py +8 -1
- snowflake/ml/modeling/neural_network/mlp_classifier.py +8 -1
- snowflake/ml/modeling/neural_network/mlp_regressor.py +8 -1
- snowflake/ml/modeling/parameters/enable_anonymous_sproc.py +5 -0
- snowflake/ml/modeling/preprocessing/polynomial_features.py +8 -1
- snowflake/ml/modeling/semi_supervised/label_propagation.py +8 -1
- snowflake/ml/modeling/semi_supervised/label_spreading.py +8 -1
- snowflake/ml/modeling/svm/linear_svc.py +8 -1
- snowflake/ml/modeling/svm/linear_svr.py +8 -1
- snowflake/ml/modeling/svm/nu_svc.py +8 -1
- snowflake/ml/modeling/svm/nu_svr.py +8 -1
- snowflake/ml/modeling/svm/svc.py +8 -1
- snowflake/ml/modeling/svm/svr.py +8 -1
- snowflake/ml/modeling/tree/decision_tree_classifier.py +8 -1
- snowflake/ml/modeling/tree/decision_tree_regressor.py +8 -1
- snowflake/ml/modeling/tree/extra_tree_classifier.py +8 -1
- snowflake/ml/modeling/tree/extra_tree_regressor.py +8 -1
- snowflake/ml/modeling/xgboost/xgb_classifier.py +8 -1
- snowflake/ml/modeling/xgboost/xgb_regressor.py +8 -1
- snowflake/ml/modeling/xgboost/xgbrf_classifier.py +8 -1
- snowflake/ml/modeling/xgboost/xgbrf_regressor.py +8 -1
- snowflake/ml/registry/_manager/model_manager.py +59 -1
- snowflake/ml/registry/registry.py +10 -1
- snowflake/ml/version.py +1 -1
- {snowflake_ml_python-1.5.1.dist-info → snowflake_ml_python-1.5.2.dist-info}/METADATA +13 -1
- {snowflake_ml_python-1.5.1.dist-info → snowflake_ml_python-1.5.2.dist-info}/RECORD +174 -172
- {snowflake_ml_python-1.5.1.dist-info → snowflake_ml_python-1.5.2.dist-info}/LICENSE.txt +0 -0
- {snowflake_ml_python-1.5.1.dist-info → snowflake_ml_python-1.5.2.dist-info}/WHEEL +0 -0
- {snowflake_ml_python-1.5.1.dist-info → snowflake_ml_python-1.5.2.dist-info}/top_level.txt +0 -0
snowflake/ml/modeling/_internal/snowpark_implementations/distributed_hpo_trainer.py

@@ -4,11 +4,10 @@ import io
 import os
 import posixpath
 import sys
-from typing import Any, Dict, List, Optional,
+from typing import Any, Dict, List, Optional, Tuple, Union
 
 import cloudpickle as cp
 import numpy as np
-import numpy.typing as npt
 from sklearn import model_selection
 from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
 
@@ -36,6 +35,7 @@ from snowflake.snowpark._internal.utils import (
 from snowflake.snowpark.functions import sproc, udtf
 from snowflake.snowpark.row import Row
 from snowflake.snowpark.types import IntegerType, StringType, StructField, StructType
+from snowflake.snowpark.udtf import UDTFRegistration
 
 cp.register_pickle_by_value(inspect.getmodule(get_temp_file_path))
 cp.register_pickle_by_value(inspect.getmodule(identifier.get_inferred_name))
@@ -698,7 +698,6 @@ class DistributedHPOTrainer(SnowparkModelTrainer):
 ) -> Union[model_selection.GridSearchCV, model_selection.RandomizedSearchCV]:
     from itertools import product
 
-    import cachetools
     from sklearn.base import clone, is_classifier
     from sklearn.calibration import check_cv
 
@@ -719,9 +718,11 @@ class DistributedHPOTrainer(SnowparkModelTrainer):
 # Create a temp file and dump the estimator to that file.
 estimator_file_name = get_temp_file_path()
 params_to_evaluate = list(param_grid)
-
-
-
+CONSTANTS: Dict[str, Any] = dict()
+CONSTANTS["dataset_snowpark_cols"] = dataset.columns
+CONSTANTS["n_candidates"] = len(params_to_evaluate)
+CONSTANTS["_N_JOBS"] = estimator.n_jobs
+CONSTANTS["_PRE_DISPATCH"] = estimator.pre_dispatch
 
 with open(estimator_file_name, mode="w+b") as local_estimator_file_obj:
     cp.dump(dict(estimator=estimator, param_grid=params_to_evaluate), local_estimator_file_obj)
@@ -743,6 +744,9 @@ class DistributedHPOTrainer(SnowparkModelTrainer):
     api_calls=[udtf],
     custom_tags=dict([("hpo_memory_efficient", True)]),
 )
+from snowflake.ml.modeling._internal.snowpark_implementations.distributed_search_udf_file import (
+    execute_template,
+)
 
 # Put locally serialized estimator on stage.
 session.file.put(
@@ -753,6 +757,7 @@ class DistributedHPOTrainer(SnowparkModelTrainer):
 )
 estimator_location = os.path.basename(estimator_file_name)
 imports.append(f"@{temp_stage_name}/{estimator_location}")
+CONSTANTS["estimator_location"] = estimator_location
 
 search_sproc_name = random_name_for_temp_object(TempObjectType.PROCEDURE)
 random_udtf_name = random_name_for_temp_object(TempObjectType.FUNCTION)
@@ -783,7 +788,6 @@ class DistributedHPOTrainer(SnowparkModelTrainer):
 ) -> str:
     import os
     import time
-    from typing import Iterator
 
     import cloudpickle as cp
     import pandas as pd
@@ -905,145 +909,60 @@ class DistributedHPOTrainer(SnowparkModelTrainer):
 fit_and_score_kwargs_location = os.path.basename(local_fit_and_score_kwargs_file_name)
 imports.append(f"@{temp_stage_name}/{fit_and_score_kwargs_location}")
 
-
-
+CONSTANTS["input_cols"] = input_cols
+CONSTANTS["label_cols"] = label_cols
+CONSTANTS["DATA_LENGTH"] = DATA_LENGTH
+CONSTANTS["n_splits"] = n_splits
+CONSTANTS["indices_location"] = indices_location
+CONSTANTS["base_estimator_location"] = base_estimator_location
+CONSTANTS["fit_and_score_kwargs_location"] = fit_and_score_kwargs_location
 
-
-
-
-
-    npt.NDArray[Any],
-    npt.NDArray[Any],
-    List[List[int]],
-    List[Dict[str, Any]],
-    object,
-    Dict[str, Any],
-]:
-    import pyarrow.parquet as pq
+# (6) store the constants
+local_constant_file_name = get_temp_file_path(prefix="constant")
+with open(local_constant_file_name, mode="w+b") as local_indices_file_obj:
+    cp.dump(CONSTANTS, local_indices_file_obj)
 
-
-
-
-
-
-
-
-
-
-df = pd.concat(partial_df, ignore_index=True)
-df.columns = [identifier.get_inferred_name(col_) for col_ in df.columns]
-
-# load parameter grid
-local_estimator_file_path = os.path.join(
-    sys._xoptions["snowflake_import_directory"], f"{estimator_location}"
-)
-with open(local_estimator_file_path, mode="rb") as local_estimator_file_obj:
-    estimator_objects = cp.load(local_estimator_file_obj)
-    params_to_evaluate = estimator_objects["param_grid"]
+# Put locally serialized indices on stage.
+session.file.put(
+    local_constant_file_name,
+    temp_stage_name,
+    auto_compress=False,
+    overwrite=True,
+)
+constant_location = os.path.basename(local_constant_file_name)
+imports.append(f"@{temp_stage_name}/{constant_location}")
 
-
-
-    sys._xoptions["snowflake_import_directory"], f"{indices_location}"
-)
-with open(local_indices_file_path, mode="rb") as local_indices_file_obj:
-    indices = cp.load(local_indices_file_obj)
+cross_validator_indices_length = int(len(cross_validator_indices))
+parameter_grid_length = len(param_grid)
 
-
-local_base_estimator_file_path = os.path.join(
-    sys._xoptions["snowflake_import_directory"], f"{base_estimator_location}"
-)
-with open(local_base_estimator_file_path, mode="rb") as local_base_estimator_file_obj:
-    base_estimator = cp.load(local_base_estimator_file_obj)
+assert estimator is not None
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+# Instantiate UDTFRegistration with the session object
+udtf_registration = UDTFRegistration(session)
+
+import tempfile
+
+with tempfile.NamedTemporaryFile(mode="w", suffix=".py", delete=False, encoding="utf-8") as f:
+    udf_code = execute_template
+    f.file.write(udf_code)
+    f.file.flush()
+
+# Register the UDTF function from the file
+udtf_registration.register_from_file(
+    file_path=f.name,
+    handler_name="SearchCV",
+    name=random_udtf_name,
+    output_schema=StructType(
+        [StructField("FIRST_IDX", IntegerType()), StructField("EACH_CV_RESULTS", StringType())]
+    ),
+    input_types=[IntegerType(), IntegerType(), IntegerType()],
+    replace=True,
+    imports=imports, # type: ignore[arg-type]
+    is_permanent=False,
+    packages=required_deps, # type: ignore[arg-type]
+    statement_params=udtf_statement_params,
 )
 
-# Note Table functions (UDTFs) have a limit of 500 input arguments and 500 output columns.
-class SearchCV:
-    def __init__(self) -> None:
-        X, y, indices, params_to_evaluate, base_estimator, fit_and_score_kwargs = _load_data_into_udf()
-        self.X = X
-        self.y = y
-        self.test_indices = indices
-        self.params_to_evaluate = params_to_evaluate
-        self.base_estimator = base_estimator
-        self.fit_and_score_kwargs = fit_and_score_kwargs
-        self.fit_score_params: List[Any] = []
-        self.cv_indices_set: Set[int] = set()
-
-    def process(self, idx: int, params_idx: int, cv_idx: int) -> None:
-        self.fit_score_params.extend([[idx, params_idx, cv_idx]])
-        self.cv_indices_set.add(cv_idx)
-
-    def end_partition(self) -> Iterator[Tuple[int, str]]:
-        from sklearn.base import clone
-        from sklearn.model_selection._validation import _fit_and_score
-        from sklearn.utils.parallel import Parallel, delayed
-
-        cached_train_test_indices = {}
-        # Calculate the full index here to avoid duplicate calculation (which consumes a lot of memory)
-        full_index = np.arange(DATA_LENGTH)
-        for i in self.cv_indices_set:
-            cached_train_test_indices[i] = [
-                np.setdiff1d(full_index, self.test_indices[i]),
-                self.test_indices[i],
-            ]
-
-        parallel = Parallel(n_jobs=_N_JOBS, pre_dispatch=_PRE_DISPATCH)
-
-        out = parallel(
-            delayed(_fit_and_score)(
-                clone(self.base_estimator),
-                self.X,
-                self.y,
-                train=cached_train_test_indices[split_idx][0],
-                test=cached_train_test_indices[split_idx][1],
-                parameters=self.params_to_evaluate[cand_idx],
-                split_progress=(split_idx, n_splits),
-                candidate_progress=(cand_idx, n_candidates),
-                **self.fit_and_score_kwargs, # load sample weight here
-            )
-            for _, cand_idx, split_idx in self.fit_score_params
-        )
-
-        binary_cv_results = None
-        with io.BytesIO() as f:
-            cp.dump(out, f)
-            f.seek(0)
-            binary_cv_results = f.getvalue().hex()
-        yield (
-            self.fit_score_params[0][0],
-            binary_cv_results,
-        )
-
-session.udtf.register(
-    SearchCV,
-    output_schema=StructType(
-        [StructField("FIRST_IDX", IntegerType()), StructField("EACH_CV_RESULTS", StringType())]
-    ),
-    input_types=[IntegerType(), IntegerType(), IntegerType()],
-    name=random_udtf_name,
-    packages=required_deps, # type: ignore[arg-type]
-    replace=True,
-    is_permanent=False,
-    imports=imports, # type: ignore[arg-type]
-    statement_params=udtf_statement_params,
-)
-
 HP_TUNING = F.table_function(random_udtf_name)
 
 # param_indices is for the index for each parameter grid;
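Alongside the new UDTF registration path, the trainer now bundles its scalar parameters into a single CONSTANTS dict, pickles it with cloudpickle, and stages it uncompressed so the handler can find it in its import directory (the template in the new distributed_search_udf_file.py below looks for a file whose name starts with "constant"). A minimal sketch of that stage-and-import step, assuming an open Snowpark session; stage_constants and the demo file name are illustrative, not package APIs:

import os
import tempfile

import cloudpickle as cp
from snowflake.snowpark import Session


def stage_constants(session: Session, stage_name: str, constants: dict) -> str:
    # Serialize the constants once and upload them uncompressed; the returned
    # stage path can be appended to the UDTF's imports so the handler sees a
    # file whose name starts with "constant" in its import directory.
    local_path = os.path.join(tempfile.mkdtemp(), "constant_demo.pkl")
    with open(local_path, mode="w+b") as f:
        cp.dump(constants, f)
    session.file.put(local_path, stage_name, auto_compress=False, overwrite=True)
    return f"@{stage_name}/{os.path.basename(local_path)}"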
snowflake/ml/modeling/_internal/snowpark_implementations/distributed_search_udf_file.py (new file)

@@ -0,0 +1,159 @@
+"""
+Description:
+This is the helper file for distributed_hpo_trainer.py to create UDTF by `register_from_file`.
+Performance Benefits:
+The performance benefits come from two aspects,
+1. register_from_file can reduce duplicating loading data by only loading data once in each node
+2. register_from_file enable user to load data in global variable, whereas writing UDF in python script cannot.
+Developer Tips:
+Because this script is now a string, so there's no type hinting, linting, etc. It is highly recommended
+to develop in a python script, test the type hinting, and then convert it into a string.
+"""
+
+execute_template = """
+from typing import Tuple, Any, List, Dict, Set, Iterator
+import os
+import sys
+import pandas as pd
+import numpy as np
+import numpy.typing as npt
+import cloudpickle as cp
+import io
+
+
+def _load_data_into_udf() -> Tuple[
+    npt.NDArray[Any],
+    npt.NDArray[Any],
+    List[List[int]],
+    List[Dict[str, Any]],
+    object,
+    Dict[str, Any],
+    Dict[str, Any],
+]:
+    import pyarrow.parquet as pq
+
+    data_files = [
+        filename
+        for filename in os.listdir(sys._xoptions["snowflake_import_directory"])
+        if filename.startswith("dataset")
+    ]
+    partial_df = [
+        pq.read_table(os.path.join(sys._xoptions["snowflake_import_directory"], file_name)).to_pandas()
+        for file_name in data_files
+    ]
+    df = pd.concat(partial_df, ignore_index=True)
+    constant_file_path = None
+    for filename in os.listdir(sys._xoptions["snowflake_import_directory"]):
+        if filename.startswith("constant"):
+            constant_file_path = os.path.join(sys._xoptions["snowflake_import_directory"], f"{filename}")
+    if constant_file_path is None:
+        raise ValueError("UDTF cannot find the constant location, abort!")
+    with open(constant_file_path, mode="rb") as constant_file_obj:
+        CONSTANTS = cp.load(constant_file_obj)
+    df.columns = CONSTANTS['dataset_snowpark_cols']
+
+    # load parameter grid
+    local_estimator_file_path = os.path.join(
+        sys._xoptions["snowflake_import_directory"],
+        f"{CONSTANTS['estimator_location']}"
+    )
+    with open(local_estimator_file_path, mode="rb") as local_estimator_file_obj:
+        estimator_objects = cp.load(local_estimator_file_obj)
+    params_to_evaluate = estimator_objects["param_grid"]
+
+    # load indices
+    local_indices_file_path = os.path.join(
+        sys._xoptions["snowflake_import_directory"],
+        f"{CONSTANTS['indices_location']}"
+    )
+    with open(local_indices_file_path, mode="rb") as local_indices_file_obj:
+        indices = cp.load(local_indices_file_obj)
+
+    # load base estimator
+    local_base_estimator_file_path = os.path.join(
+        sys._xoptions["snowflake_import_directory"], f"{CONSTANTS['base_estimator_location']}"
+    )
+    with open(local_base_estimator_file_path, mode="rb") as local_base_estimator_file_obj:
+        base_estimator = cp.load(local_base_estimator_file_obj)
+
+    # load fit_and_score_kwargs
+    local_fit_and_score_kwargs_file_path = os.path.join(
+        sys._xoptions["snowflake_import_directory"], f"{CONSTANTS['fit_and_score_kwargs_location']}"
+    )
+    with open(local_fit_and_score_kwargs_file_path, mode="rb") as local_fit_and_score_kwargs_file_obj:
+        fit_and_score_kwargs = cp.load(local_fit_and_score_kwargs_file_obj)
+
+    # convert dataframe to numpy would save memory consumption
+    return (
+        df[CONSTANTS['input_cols']].to_numpy(),
+        df[CONSTANTS['label_cols']].squeeze().to_numpy(),
+        indices,
+        params_to_evaluate,
+        base_estimator,
+        fit_and_score_kwargs,
+        CONSTANTS
+    )
+
+
+global_load_data = _load_data_into_udf()
+
+
+# Note Table functions (UDTFs) have a limit of 500 input arguments and 500 output columns.
+class SearchCV:
+    def __init__(self) -> None:
+        X, y, indices, params_to_evaluate, base_estimator, fit_and_score_kwargs, CONSTANTS = global_load_data
+        self.X = X
+        self.y = y
+        self.test_indices = indices
+        self.params_to_evaluate = params_to_evaluate
+        self.base_estimator = base_estimator
+        self.fit_and_score_kwargs = fit_and_score_kwargs
+        self.fit_score_params: List[Any] = []
+        self.CONSTANTS = CONSTANTS
+        self.cv_indices_set: Set[int] = set()
+
+    def process(self, idx: int, params_idx: int, cv_idx: int) -> None:
+        self.fit_score_params.extend([[idx, params_idx, cv_idx]])
+        self.cv_indices_set.add(cv_idx)
+
+    def end_partition(self) -> Iterator[Tuple[int, str]]:
+        from sklearn.base import clone
+        from sklearn.model_selection._validation import _fit_and_score
+        from sklearn.utils.parallel import Parallel, delayed
+
+        cached_train_test_indices = {}
+        # Calculate the full index here to avoid duplicate calculation (which consumes a lot of memory)
+        full_index = np.arange(self.CONSTANTS['DATA_LENGTH'])
+        for i in self.cv_indices_set:
+            cached_train_test_indices[i] = [
+                np.setdiff1d(full_index, self.test_indices[i]),
+                self.test_indices[i],
+            ]
+
+        parallel = Parallel(n_jobs=self.CONSTANTS['_N_JOBS'], pre_dispatch=self.CONSTANTS['_PRE_DISPATCH'])
+
+        out = parallel(
+            delayed(_fit_and_score)(
+                clone(self.base_estimator),
+                self.X,
+                self.y,
+                train=cached_train_test_indices[split_idx][0],
+                test=cached_train_test_indices[split_idx][1],
+                parameters=self.params_to_evaluate[cand_idx],
+                split_progress=(split_idx, self.CONSTANTS['n_splits']),
+                candidate_progress=(cand_idx, self.CONSTANTS['n_candidates']),
+                **self.fit_and_score_kwargs, # load sample weight here
+            )
+            for _, cand_idx, split_idx in self.fit_score_params
+        )
+
+        binary_cv_results = None
+        with io.BytesIO() as f:
+            cp.dump(out, f)
+            f.seek(0)
+            binary_cv_results = f.getvalue().hex()
+        yield (
+            self.fit_score_params[0][0],
+            binary_cv_results,
+        )
+"""
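The docstring of the new helper explains why the handler is kept as a string: `register_from_file` lets the UDTF module load staged data once per node into a module-level global, which inline per-call UDF code cannot do. As a rough, self-contained sketch of that registration pattern (handler_source, MyHandler, and register_string_handler are illustrative names, not part of the package; only register_from_file and the Snowpark types appear in the diff above):

import tempfile

from snowflake.snowpark import Session
from snowflake.snowpark.types import IntegerType, StringType, StructField, StructType

# Illustrative handler kept as a string, mirroring the execute_template idea:
# module-level statements run once per node at import time, so expensive data
# loading happens once and is shared by every handler instance on that node.
handler_source = '''
GLOBAL_STATE = "loaded once per node"

class MyHandler:
    def process(self, idx: int):
        yield (idx, GLOBAL_STATE)
'''


def register_string_handler(session: Session, udtf_name: str) -> None:
    # Write the string to a .py file and register the class it defines as a UDTF.
    with tempfile.NamedTemporaryFile(mode="w", suffix=".py", delete=False, encoding="utf-8") as f:
        f.write(handler_source)
        f.flush()
    session.udtf.register_from_file(
        file_path=f.name,
        handler_name="MyHandler",
        name=udtf_name,
        output_schema=StructType(
            [StructField("IDX", IntegerType()), StructField("STATE", StringType())]
        ),
        input_types=[IntegerType()],
        replace=True,
        is_permanent=False,
    )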
snowflake/ml/modeling/calibration/calibrated_classifier_cv.py

@@ -629,7 +629,14 @@ class CalibratedClassifierCV(BaseTransformer):
 ) -> List[str]:
     # in case the inferred output column names dimension is different
     # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-
+    sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+    # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+    # seen during the fit.
+    snowpark_column_names = dataset.select(self.input_cols).columns
+    sample_pd_df.columns = snowpark_column_names
+
+    output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
     output_df_columns = list(output_df_pd.columns)
     output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
     if self.sample_weight_col:
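The same seven-line change is applied to every estimator wrapper below: the one-row pandas sample used for output-column inference first has its column names restored to the Snowflake identifiers (and ordering) seen during fit before being passed to the wrapped method. A standalone sketch of that pattern, assuming a Snowpark DataFrame; infer_added_columns and predict_fn are illustrative names, not package APIs:

from typing import Callable, List

import pandas as pd
from snowflake.snowpark import DataFrame


def infer_added_columns(
    dataset: DataFrame,
    input_cols: List[str],
    predict_fn: Callable[[pd.DataFrame], pd.DataFrame],
) -> List[str]:
    # Pull a single row locally; to_pandas() can normalize column names, so put
    # back the Snowflake identifiers the estimator saw at fit time.
    sample_pd_df = dataset.select(input_cols).limit(1).to_pandas()
    sample_pd_df.columns = dataset.select(input_cols).columns

    # Run the wrapped method on the sample and keep only the columns it added.
    output_df_pd = predict_fn(sample_pd_df)
    existing = set(dataset.columns)
    return [col for col in output_df_pd.columns if col not in existing]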
snowflake/ml/modeling/cluster/affinity_propagation.py

@@ -606,7 +606,14 @@ class AffinityPropagation(BaseTransformer):
 ) -> List[str]:
     # in case the inferred output column names dimension is different
     # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-
+    sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+    # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+    # seen during the fit.
+    snowpark_column_names = dataset.select(self.input_cols).columns
+    sample_pd_df.columns = snowpark_column_names
+
+    output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
     output_df_columns = list(output_df_pd.columns)
     output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
     if self.sample_weight_col:
snowflake/ml/modeling/cluster/agglomerative_clustering.py

@@ -637,7 +637,14 @@ class AgglomerativeClustering(BaseTransformer):
 ) -> List[str]:
     # in case the inferred output column names dimension is different
     # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-
+    sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+    # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+    # seen during the fit.
+    snowpark_column_names = dataset.select(self.input_cols).columns
+    sample_pd_df.columns = snowpark_column_names
+
+    output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
     output_df_columns = list(output_df_pd.columns)
     output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
     if self.sample_weight_col:
snowflake/ml/modeling/cluster/birch.py

@@ -601,7 +601,14 @@ class Birch(BaseTransformer):
 ) -> List[str]:
     # in case the inferred output column names dimension is different
     # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-
+    sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+    # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+    # seen during the fit.
+    snowpark_column_names = dataset.select(self.input_cols).columns
+    sample_pd_df.columns = snowpark_column_names
+
+    output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
     output_df_columns = list(output_df_pd.columns)
     output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
     if self.sample_weight_col:
snowflake/ml/modeling/cluster/bisecting_k_means.py

@@ -650,7 +650,14 @@ class BisectingKMeans(BaseTransformer):
 ) -> List[str]:
     # in case the inferred output column names dimension is different
     # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-
+    sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+    # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+    # seen during the fit.
+    snowpark_column_names = dataset.select(self.input_cols).columns
+    sample_pd_df.columns = snowpark_column_names
+
+    output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
     output_df_columns = list(output_df_pd.columns)
     output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
     if self.sample_weight_col:
snowflake/ml/modeling/cluster/dbscan.py

@@ -612,7 +612,14 @@ class DBSCAN(BaseTransformer):
 ) -> List[str]:
     # in case the inferred output column names dimension is different
     # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-
+    sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+    # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+    # seen during the fit.
+    snowpark_column_names = dataset.select(self.input_cols).columns
+    sample_pd_df.columns = snowpark_column_names
+
+    output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
     output_df_columns = list(output_df_pd.columns)
     output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
     if self.sample_weight_col:
snowflake/ml/modeling/cluster/feature_agglomeration.py

@@ -648,7 +648,14 @@ class FeatureAgglomeration(BaseTransformer):
 ) -> List[str]:
     # in case the inferred output column names dimension is different
     # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-
+    sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+    # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+    # seen during the fit.
+    snowpark_column_names = dataset.select(self.input_cols).columns
+    sample_pd_df.columns = snowpark_column_names
+
+    output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
     output_df_columns = list(output_df_pd.columns)
     output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
     if self.sample_weight_col:
snowflake/ml/modeling/cluster/k_means.py

@@ -645,7 +645,14 @@ class KMeans(BaseTransformer):
 ) -> List[str]:
     # in case the inferred output column names dimension is different
     # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-
+    sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+    # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+    # seen during the fit.
+    snowpark_column_names = dataset.select(self.input_cols).columns
+    sample_pd_df.columns = snowpark_column_names
+
+    output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
     output_df_columns = list(output_df_pd.columns)
     output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
     if self.sample_weight_col:
snowflake/ml/modeling/cluster/mean_shift.py

@@ -617,7 +617,14 @@ class MeanShift(BaseTransformer):
 ) -> List[str]:
     # in case the inferred output column names dimension is different
     # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-
+    sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+    # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+    # seen during the fit.
+    snowpark_column_names = dataset.select(self.input_cols).columns
+    sample_pd_df.columns = snowpark_column_names
+
+    output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
     output_df_columns = list(output_df_pd.columns)
     output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
     if self.sample_weight_col:
snowflake/ml/modeling/cluster/mini_batch_k_means.py

@@ -671,7 +671,14 @@ class MiniBatchKMeans(BaseTransformer):
 ) -> List[str]:
     # in case the inferred output column names dimension is different
     # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-
+    sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+    # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+    # seen during the fit.
+    snowpark_column_names = dataset.select(self.input_cols).columns
+    sample_pd_df.columns = snowpark_column_names
+
+    output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
     output_df_columns = list(output_df_pd.columns)
     output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
     if self.sample_weight_col:
snowflake/ml/modeling/cluster/optics.py

@@ -685,7 +685,14 @@ class OPTICS(BaseTransformer):
 ) -> List[str]:
     # in case the inferred output column names dimension is different
     # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-
+    sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+    # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+    # seen during the fit.
+    snowpark_column_names = dataset.select(self.input_cols).columns
+    sample_pd_df.columns = snowpark_column_names
+
+    output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
     output_df_columns = list(output_df_pd.columns)
     output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
     if self.sample_weight_col:
snowflake/ml/modeling/cluster/spectral_biclustering.py

@@ -621,7 +621,14 @@ class SpectralBiclustering(BaseTransformer):
 ) -> List[str]:
     # in case the inferred output column names dimension is different
     # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-
+    sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+    # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+    # seen during the fit.
+    snowpark_column_names = dataset.select(self.input_cols).columns
+    sample_pd_df.columns = snowpark_column_names
+
+    output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
     output_df_columns = list(output_df_pd.columns)
     output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
     if self.sample_weight_col:
snowflake/ml/modeling/cluster/spectral_clustering.py

@@ -681,7 +681,14 @@ class SpectralClustering(BaseTransformer):
 ) -> List[str]:
     # in case the inferred output column names dimension is different
     # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-
+    sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+    # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+    # seen during the fit.
+    snowpark_column_names = dataset.select(self.input_cols).columns
+    sample_pd_df.columns = snowpark_column_names
+
+    output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
     output_df_columns = list(output_df_pd.columns)
     output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
     if self.sample_weight_col: