snowflake-ml-python 1.5.1__py3-none-any.whl → 1.5.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (174)
  1. snowflake/cortex/_sentiment.py +7 -4
  2. snowflake/ml/_internal/utils/temp_file_utils.py +5 -2
  3. snowflake/ml/feature_store/access_manager.py +34 -30
  4. snowflake/ml/feature_store/feature_store.py +1 -1
  5. snowflake/ml/feature_store/feature_view.py +12 -11
  6. snowflake/ml/fileset/snowfs.py +2 -31
  7. snowflake/ml/model/_client/ops/model_ops.py +43 -0
  8. snowflake/ml/model/_client/sql/model_version.py +53 -1
  9. snowflake/ml/model/_model_composer/model_composer.py +6 -2
  10. snowflake/ml/model/_packager/model_meta/model_meta.py +1 -3
  11. snowflake/ml/model/_packager/model_runtime/model_runtime.py +3 -27
  12. snowflake/ml/modeling/_internal/snowpark_implementations/distributed_hpo_trainer.py +58 -139
  13. snowflake/ml/modeling/_internal/snowpark_implementations/distributed_search_udf_file.py +159 -0
  14. snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +8 -1
  15. snowflake/ml/modeling/cluster/affinity_propagation.py +8 -1
  16. snowflake/ml/modeling/cluster/agglomerative_clustering.py +8 -1
  17. snowflake/ml/modeling/cluster/birch.py +8 -1
  18. snowflake/ml/modeling/cluster/bisecting_k_means.py +8 -1
  19. snowflake/ml/modeling/cluster/dbscan.py +8 -1
  20. snowflake/ml/modeling/cluster/feature_agglomeration.py +8 -1
  21. snowflake/ml/modeling/cluster/k_means.py +8 -1
  22. snowflake/ml/modeling/cluster/mean_shift.py +8 -1
  23. snowflake/ml/modeling/cluster/mini_batch_k_means.py +8 -1
  24. snowflake/ml/modeling/cluster/optics.py +8 -1
  25. snowflake/ml/modeling/cluster/spectral_biclustering.py +8 -1
  26. snowflake/ml/modeling/cluster/spectral_clustering.py +8 -1
  27. snowflake/ml/modeling/cluster/spectral_coclustering.py +8 -1
  28. snowflake/ml/modeling/compose/column_transformer.py +8 -1
  29. snowflake/ml/modeling/compose/transformed_target_regressor.py +8 -1
  30. snowflake/ml/modeling/covariance/elliptic_envelope.py +8 -1
  31. snowflake/ml/modeling/covariance/empirical_covariance.py +8 -1
  32. snowflake/ml/modeling/covariance/graphical_lasso.py +8 -1
  33. snowflake/ml/modeling/covariance/graphical_lasso_cv.py +8 -1
  34. snowflake/ml/modeling/covariance/ledoit_wolf.py +8 -1
  35. snowflake/ml/modeling/covariance/min_cov_det.py +8 -1
  36. snowflake/ml/modeling/covariance/oas.py +8 -1
  37. snowflake/ml/modeling/covariance/shrunk_covariance.py +8 -1
  38. snowflake/ml/modeling/decomposition/dictionary_learning.py +8 -1
  39. snowflake/ml/modeling/decomposition/factor_analysis.py +8 -1
  40. snowflake/ml/modeling/decomposition/fast_ica.py +8 -1
  41. snowflake/ml/modeling/decomposition/incremental_pca.py +8 -1
  42. snowflake/ml/modeling/decomposition/kernel_pca.py +8 -1
  43. snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +8 -1
  44. snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +8 -1
  45. snowflake/ml/modeling/decomposition/pca.py +8 -1
  46. snowflake/ml/modeling/decomposition/sparse_pca.py +8 -1
  47. snowflake/ml/modeling/decomposition/truncated_svd.py +8 -1
  48. snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +8 -1
  49. snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +8 -1
  50. snowflake/ml/modeling/ensemble/ada_boost_classifier.py +8 -1
  51. snowflake/ml/modeling/ensemble/ada_boost_regressor.py +8 -1
  52. snowflake/ml/modeling/ensemble/bagging_classifier.py +8 -1
  53. snowflake/ml/modeling/ensemble/bagging_regressor.py +8 -1
  54. snowflake/ml/modeling/ensemble/extra_trees_classifier.py +8 -1
  55. snowflake/ml/modeling/ensemble/extra_trees_regressor.py +8 -1
  56. snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +8 -1
  57. snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +8 -1
  58. snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +8 -1
  59. snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +8 -1
  60. snowflake/ml/modeling/ensemble/isolation_forest.py +8 -1
  61. snowflake/ml/modeling/ensemble/random_forest_classifier.py +8 -1
  62. snowflake/ml/modeling/ensemble/random_forest_regressor.py +8 -1
  63. snowflake/ml/modeling/ensemble/stacking_regressor.py +8 -1
  64. snowflake/ml/modeling/ensemble/voting_classifier.py +8 -1
  65. snowflake/ml/modeling/ensemble/voting_regressor.py +8 -1
  66. snowflake/ml/modeling/feature_selection/generic_univariate_select.py +8 -1
  67. snowflake/ml/modeling/feature_selection/select_fdr.py +8 -1
  68. snowflake/ml/modeling/feature_selection/select_fpr.py +8 -1
  69. snowflake/ml/modeling/feature_selection/select_fwe.py +8 -1
  70. snowflake/ml/modeling/feature_selection/select_k_best.py +8 -1
  71. snowflake/ml/modeling/feature_selection/select_percentile.py +8 -1
  72. snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +8 -1
  73. snowflake/ml/modeling/feature_selection/variance_threshold.py +8 -1
  74. snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +8 -1
  75. snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +8 -1
  76. snowflake/ml/modeling/impute/iterative_imputer.py +8 -1
  77. snowflake/ml/modeling/impute/knn_imputer.py +8 -1
  78. snowflake/ml/modeling/impute/missing_indicator.py +8 -1
  79. snowflake/ml/modeling/impute/simple_imputer.py +21 -2
  80. snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +8 -1
  81. snowflake/ml/modeling/kernel_approximation/nystroem.py +8 -1
  82. snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +8 -1
  83. snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +8 -1
  84. snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +8 -1
  85. snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +8 -1
  86. snowflake/ml/modeling/lightgbm/lgbm_classifier.py +8 -1
  87. snowflake/ml/modeling/lightgbm/lgbm_regressor.py +8 -1
  88. snowflake/ml/modeling/linear_model/ard_regression.py +8 -1
  89. snowflake/ml/modeling/linear_model/bayesian_ridge.py +8 -1
  90. snowflake/ml/modeling/linear_model/elastic_net.py +8 -1
  91. snowflake/ml/modeling/linear_model/elastic_net_cv.py +8 -1
  92. snowflake/ml/modeling/linear_model/gamma_regressor.py +8 -1
  93. snowflake/ml/modeling/linear_model/huber_regressor.py +8 -1
  94. snowflake/ml/modeling/linear_model/lars.py +8 -1
  95. snowflake/ml/modeling/linear_model/lars_cv.py +8 -1
  96. snowflake/ml/modeling/linear_model/lasso.py +8 -1
  97. snowflake/ml/modeling/linear_model/lasso_cv.py +8 -1
  98. snowflake/ml/modeling/linear_model/lasso_lars.py +8 -1
  99. snowflake/ml/modeling/linear_model/lasso_lars_cv.py +8 -1
  100. snowflake/ml/modeling/linear_model/lasso_lars_ic.py +8 -1
  101. snowflake/ml/modeling/linear_model/linear_regression.py +8 -1
  102. snowflake/ml/modeling/linear_model/logistic_regression.py +8 -1
  103. snowflake/ml/modeling/linear_model/logistic_regression_cv.py +8 -1
  104. snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +8 -1
  105. snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +8 -1
  106. snowflake/ml/modeling/linear_model/multi_task_lasso.py +8 -1
  107. snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +8 -1
  108. snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +8 -1
  109. snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +8 -1
  110. snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +8 -1
  111. snowflake/ml/modeling/linear_model/perceptron.py +8 -1
  112. snowflake/ml/modeling/linear_model/poisson_regressor.py +8 -1
  113. snowflake/ml/modeling/linear_model/ransac_regressor.py +8 -1
  114. snowflake/ml/modeling/linear_model/ridge.py +8 -1
  115. snowflake/ml/modeling/linear_model/ridge_classifier.py +8 -1
  116. snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +8 -1
  117. snowflake/ml/modeling/linear_model/ridge_cv.py +8 -1
  118. snowflake/ml/modeling/linear_model/sgd_classifier.py +8 -1
  119. snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +8 -1
  120. snowflake/ml/modeling/linear_model/sgd_regressor.py +8 -1
  121. snowflake/ml/modeling/linear_model/theil_sen_regressor.py +8 -1
  122. snowflake/ml/modeling/linear_model/tweedie_regressor.py +8 -1
  123. snowflake/ml/modeling/manifold/isomap.py +8 -1
  124. snowflake/ml/modeling/manifold/mds.py +8 -1
  125. snowflake/ml/modeling/manifold/spectral_embedding.py +8 -1
  126. snowflake/ml/modeling/manifold/tsne.py +8 -1
  127. snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +8 -1
  128. snowflake/ml/modeling/mixture/gaussian_mixture.py +8 -1
  129. snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +8 -1
  130. snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +8 -1
  131. snowflake/ml/modeling/multiclass/output_code_classifier.py +8 -1
  132. snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +8 -1
  133. snowflake/ml/modeling/naive_bayes/categorical_nb.py +8 -1
  134. snowflake/ml/modeling/naive_bayes/complement_nb.py +8 -1
  135. snowflake/ml/modeling/naive_bayes/gaussian_nb.py +8 -1
  136. snowflake/ml/modeling/naive_bayes/multinomial_nb.py +8 -1
  137. snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +8 -1
  138. snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +8 -1
  139. snowflake/ml/modeling/neighbors/kernel_density.py +8 -1
  140. snowflake/ml/modeling/neighbors/local_outlier_factor.py +8 -1
  141. snowflake/ml/modeling/neighbors/nearest_centroid.py +8 -1
  142. snowflake/ml/modeling/neighbors/nearest_neighbors.py +8 -1
  143. snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +8 -1
  144. snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +8 -1
  145. snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +8 -1
  146. snowflake/ml/modeling/neural_network/bernoulli_rbm.py +8 -1
  147. snowflake/ml/modeling/neural_network/mlp_classifier.py +8 -1
  148. snowflake/ml/modeling/neural_network/mlp_regressor.py +8 -1
  149. snowflake/ml/modeling/parameters/enable_anonymous_sproc.py +5 -0
  150. snowflake/ml/modeling/preprocessing/polynomial_features.py +8 -1
  151. snowflake/ml/modeling/semi_supervised/label_propagation.py +8 -1
  152. snowflake/ml/modeling/semi_supervised/label_spreading.py +8 -1
  153. snowflake/ml/modeling/svm/linear_svc.py +8 -1
  154. snowflake/ml/modeling/svm/linear_svr.py +8 -1
  155. snowflake/ml/modeling/svm/nu_svc.py +8 -1
  156. snowflake/ml/modeling/svm/nu_svr.py +8 -1
  157. snowflake/ml/modeling/svm/svc.py +8 -1
  158. snowflake/ml/modeling/svm/svr.py +8 -1
  159. snowflake/ml/modeling/tree/decision_tree_classifier.py +8 -1
  160. snowflake/ml/modeling/tree/decision_tree_regressor.py +8 -1
  161. snowflake/ml/modeling/tree/extra_tree_classifier.py +8 -1
  162. snowflake/ml/modeling/tree/extra_tree_regressor.py +8 -1
  163. snowflake/ml/modeling/xgboost/xgb_classifier.py +8 -1
  164. snowflake/ml/modeling/xgboost/xgb_regressor.py +8 -1
  165. snowflake/ml/modeling/xgboost/xgbrf_classifier.py +8 -1
  166. snowflake/ml/modeling/xgboost/xgbrf_regressor.py +8 -1
  167. snowflake/ml/registry/_manager/model_manager.py +59 -1
  168. snowflake/ml/registry/registry.py +10 -1
  169. snowflake/ml/version.py +1 -1
  170. {snowflake_ml_python-1.5.1.dist-info → snowflake_ml_python-1.5.2.dist-info}/METADATA +13 -1
  171. {snowflake_ml_python-1.5.1.dist-info → snowflake_ml_python-1.5.2.dist-info}/RECORD +174 -172
  172. {snowflake_ml_python-1.5.1.dist-info → snowflake_ml_python-1.5.2.dist-info}/LICENSE.txt +0 -0
  173. {snowflake_ml_python-1.5.1.dist-info → snowflake_ml_python-1.5.2.dist-info}/WHEEL +0 -0
  174. {snowflake_ml_python-1.5.1.dist-info → snowflake_ml_python-1.5.2.dist-info}/top_level.txt +0 -0
snowflake/ml/modeling/_internal/snowpark_implementations/distributed_hpo_trainer.py

@@ -4,11 +4,10 @@ import io
  import os
  import posixpath
  import sys
- from typing import Any, Dict, List, Optional, Set, Tuple, Union
+ from typing import Any, Dict, List, Optional, Tuple, Union
 
  import cloudpickle as cp
  import numpy as np
- import numpy.typing as npt
  from sklearn import model_selection
  from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
 

@@ -36,6 +35,7 @@ from snowflake.snowpark._internal.utils import (
  from snowflake.snowpark.functions import sproc, udtf
  from snowflake.snowpark.row import Row
  from snowflake.snowpark.types import IntegerType, StringType, StructField, StructType
+ from snowflake.snowpark.udtf import UDTFRegistration
 
  cp.register_pickle_by_value(inspect.getmodule(get_temp_file_path))
  cp.register_pickle_by_value(inspect.getmodule(identifier.get_inferred_name))

@@ -698,7 +698,6 @@ class DistributedHPOTrainer(SnowparkModelTrainer):
  ) -> Union[model_selection.GridSearchCV, model_selection.RandomizedSearchCV]:
  from itertools import product
 
- import cachetools
  from sklearn.base import clone, is_classifier
  from sklearn.calibration import check_cv
 

@@ -719,9 +718,11 @@ class DistributedHPOTrainer(SnowparkModelTrainer):
  # Create a temp file and dump the estimator to that file.
  estimator_file_name = get_temp_file_path()
  params_to_evaluate = list(param_grid)
- n_candidates = len(params_to_evaluate)
- _N_JOBS = estimator.n_jobs
- _PRE_DISPATCH = estimator.pre_dispatch
+ CONSTANTS: Dict[str, Any] = dict()
+ CONSTANTS["dataset_snowpark_cols"] = dataset.columns
+ CONSTANTS["n_candidates"] = len(params_to_evaluate)
+ CONSTANTS["_N_JOBS"] = estimator.n_jobs
+ CONSTANTS["_PRE_DISPATCH"] = estimator.pre_dispatch
 
  with open(estimator_file_name, mode="w+b") as local_estimator_file_obj:
  cp.dump(dict(estimator=estimator, param_grid=params_to_evaluate), local_estimator_file_obj)

@@ -743,6 +744,9 @@ class DistributedHPOTrainer(SnowparkModelTrainer):
  api_calls=[udtf],
  custom_tags=dict([("hpo_memory_efficient", True)]),
  )
+ from snowflake.ml.modeling._internal.snowpark_implementations.distributed_search_udf_file import (
+ execute_template,
+ )
 
  # Put locally serialized estimator on stage.
  session.file.put(

@@ -753,6 +757,7 @@ class DistributedHPOTrainer(SnowparkModelTrainer):
  )
  estimator_location = os.path.basename(estimator_file_name)
  imports.append(f"@{temp_stage_name}/{estimator_location}")
+ CONSTANTS["estimator_location"] = estimator_location
 
  search_sproc_name = random_name_for_temp_object(TempObjectType.PROCEDURE)
  random_udtf_name = random_name_for_temp_object(TempObjectType.FUNCTION)
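For orientation, the hunks above collect the per-run values into a single CONSTANTS dict and record the stage path of each serialized artifact that is attached to the UDTF via `imports`. The general round trip is: cloudpickle an object to a local temp file, `session.file.put` it onto the temp stage, append its stage path to `imports`, and read it back inside the UDTF from the import directory. A minimal sketch of that round trip, assuming an existing Snowpark `session`, `temp_stage_name`, and `imports` list as in the surrounding code (the file name here is illustrative):

import os
import tempfile

import cloudpickle as cp

# Producer side: serialize a constants dict and upload it to the temp stage.
constants = {"n_candidates": 4, "_N_JOBS": -1}
local_path = os.path.join(tempfile.mkdtemp(), "constant_example.pkl")
with open(local_path, mode="w+b") as fh:
    cp.dump(constants, fh)
# session.file.put(local_path, temp_stage_name, auto_compress=False, overwrite=True)
# imports.append(f"@{temp_stage_name}/{os.path.basename(local_path)}")

# Consumer side (inside the registered UDTF): every staged file listed in `imports`
# is materialized in the import directory exposed via sys._xoptions.
# import sys
# with open(os.path.join(sys._xoptions["snowflake_import_directory"],
#                        "constant_example.pkl"), mode="rb") as fh:
#     constants = cp.load(fh)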
@@ -783,7 +788,6 @@ class DistributedHPOTrainer(SnowparkModelTrainer):
  ) -> str:
  import os
  import time
- from typing import Iterator
 
  import cloudpickle as cp
  import pandas as pd

@@ -905,145 +909,60 @@ class DistributedHPOTrainer(SnowparkModelTrainer):
  fit_and_score_kwargs_location = os.path.basename(local_fit_and_score_kwargs_file_name)
  imports.append(f"@{temp_stage_name}/{fit_and_score_kwargs_location}")
 
- cross_validator_indices_length = int(len(cross_validator_indices))
- parameter_grid_length = len(param_grid)
+ CONSTANTS["input_cols"] = input_cols
+ CONSTANTS["label_cols"] = label_cols
+ CONSTANTS["DATA_LENGTH"] = DATA_LENGTH
+ CONSTANTS["n_splits"] = n_splits
+ CONSTANTS["indices_location"] = indices_location
+ CONSTANTS["base_estimator_location"] = base_estimator_location
+ CONSTANTS["fit_and_score_kwargs_location"] = fit_and_score_kwargs_location
 
- assert estimator is not None
-
- @cachetools.cached(cache={})
- def _load_data_into_udf() -> Tuple[
- npt.NDArray[Any],
- npt.NDArray[Any],
- List[List[int]],
- List[Dict[str, Any]],
- object,
- Dict[str, Any],
- ]:
- import pyarrow.parquet as pq
+ # (6) store the constants
+ local_constant_file_name = get_temp_file_path(prefix="constant")
+ with open(local_constant_file_name, mode="w+b") as local_indices_file_obj:
+ cp.dump(CONSTANTS, local_indices_file_obj)
 
- data_files = [
- filename
- for filename in os.listdir(sys._xoptions["snowflake_import_directory"])
- if filename.startswith(dataset_file_name)
- ]
- partial_df = [
- pq.read_table(os.path.join(sys._xoptions["snowflake_import_directory"], file_name)).to_pandas()
- for file_name in data_files
- ]
- df = pd.concat(partial_df, ignore_index=True)
- df.columns = [identifier.get_inferred_name(col_) for col_ in df.columns]
-
- # load parameter grid
- local_estimator_file_path = os.path.join(
- sys._xoptions["snowflake_import_directory"], f"{estimator_location}"
- )
- with open(local_estimator_file_path, mode="rb") as local_estimator_file_obj:
- estimator_objects = cp.load(local_estimator_file_obj)
- params_to_evaluate = estimator_objects["param_grid"]
+ # Put locally serialized indices on stage.
+ session.file.put(
+ local_constant_file_name,
+ temp_stage_name,
+ auto_compress=False,
+ overwrite=True,
+ )
+ constant_location = os.path.basename(local_constant_file_name)
+ imports.append(f"@{temp_stage_name}/{constant_location}")
 
- # load indices
- local_indices_file_path = os.path.join(
- sys._xoptions["snowflake_import_directory"], f"{indices_location}"
- )
- with open(local_indices_file_path, mode="rb") as local_indices_file_obj:
- indices = cp.load(local_indices_file_obj)
+ cross_validator_indices_length = int(len(cross_validator_indices))
+ parameter_grid_length = len(param_grid)
 
- # load base estimator
- local_base_estimator_file_path = os.path.join(
- sys._xoptions["snowflake_import_directory"], f"{base_estimator_location}"
- )
- with open(local_base_estimator_file_path, mode="rb") as local_base_estimator_file_obj:
- base_estimator = cp.load(local_base_estimator_file_obj)
+ assert estimator is not None
 
- # load fit_and_score_kwargs
- local_fit_and_score_kwargs_file_path = os.path.join(
- sys._xoptions["snowflake_import_directory"], f"{fit_and_score_kwargs_location}"
- )
- with open(local_fit_and_score_kwargs_file_path, mode="rb") as local_fit_and_score_kwargs_file_obj:
- fit_and_score_kwargs = cp.load(local_fit_and_score_kwargs_file_obj)
-
- # convert dataframe to numpy would save memory consumption
- return (
- df[input_cols].to_numpy(),
- df[label_cols].squeeze().to_numpy(),
- indices,
- params_to_evaluate,
- base_estimator,
- fit_and_score_kwargs,
+ # Instantiate UDTFRegistration with the session object
+ udtf_registration = UDTFRegistration(session)
+
+ import tempfile
+
+ with tempfile.NamedTemporaryFile(mode="w", suffix=".py", delete=False, encoding="utf-8") as f:
+ udf_code = execute_template
+ f.file.write(udf_code)
+ f.file.flush()
+
+ # Register the UDTF function from the file
+ udtf_registration.register_from_file(
+ file_path=f.name,
+ handler_name="SearchCV",
+ name=random_udtf_name,
+ output_schema=StructType(
+ [StructField("FIRST_IDX", IntegerType()), StructField("EACH_CV_RESULTS", StringType())]
+ ),
+ input_types=[IntegerType(), IntegerType(), IntegerType()],
+ replace=True,
+ imports=imports, # type: ignore[arg-type]
+ is_permanent=False,
+ packages=required_deps, # type: ignore[arg-type]
+ statement_params=udtf_statement_params,
  )
 
- # Note Table functions (UDTFs) have a limit of 500 input arguments and 500 output columns.
- class SearchCV:
- def __init__(self) -> None:
- X, y, indices, params_to_evaluate, base_estimator, fit_and_score_kwargs = _load_data_into_udf()
- self.X = X
- self.y = y
- self.test_indices = indices
- self.params_to_evaluate = params_to_evaluate
- self.base_estimator = base_estimator
- self.fit_and_score_kwargs = fit_and_score_kwargs
- self.fit_score_params: List[Any] = []
- self.cv_indices_set: Set[int] = set()
-
- def process(self, idx: int, params_idx: int, cv_idx: int) -> None:
- self.fit_score_params.extend([[idx, params_idx, cv_idx]])
- self.cv_indices_set.add(cv_idx)
-
- def end_partition(self) -> Iterator[Tuple[int, str]]:
- from sklearn.base import clone
- from sklearn.model_selection._validation import _fit_and_score
- from sklearn.utils.parallel import Parallel, delayed
-
- cached_train_test_indices = {}
- # Calculate the full index here to avoid duplicate calculation (which consumes a lot of memory)
- full_index = np.arange(DATA_LENGTH)
- for i in self.cv_indices_set:
- cached_train_test_indices[i] = [
- np.setdiff1d(full_index, self.test_indices[i]),
- self.test_indices[i],
- ]
-
- parallel = Parallel(n_jobs=_N_JOBS, pre_dispatch=_PRE_DISPATCH)
-
- out = parallel(
- delayed(_fit_and_score)(
- clone(self.base_estimator),
- self.X,
- self.y,
- train=cached_train_test_indices[split_idx][0],
- test=cached_train_test_indices[split_idx][1],
- parameters=self.params_to_evaluate[cand_idx],
- split_progress=(split_idx, n_splits),
- candidate_progress=(cand_idx, n_candidates),
- **self.fit_and_score_kwargs, # load sample weight here
- )
- for _, cand_idx, split_idx in self.fit_score_params
- )
-
- binary_cv_results = None
- with io.BytesIO() as f:
- cp.dump(out, f)
- f.seek(0)
- binary_cv_results = f.getvalue().hex()
- yield (
- self.fit_score_params[0][0],
- binary_cv_results,
- )
-
- session.udtf.register(
- SearchCV,
- output_schema=StructType(
- [StructField("FIRST_IDX", IntegerType()), StructField("EACH_CV_RESULTS", StringType())]
- ),
- input_types=[IntegerType(), IntegerType(), IntegerType()],
- name=random_udtf_name,
- packages=required_deps, # type: ignore[arg-type]
- replace=True,
- is_permanent=False,
- imports=imports, # type: ignore[arg-type]
- statement_params=udtf_statement_params,
- )
-
  HP_TUNING = F.table_function(random_udtf_name)
 
  # param_indices is for the index for each parameter grid;
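The net effect of the hunks above is that the UDTF handler is no longer defined inline and registered with `session.udtf.register`; its source is instead written to a temporary .py file and registered through Snowpark's `UDTFRegistration.register_from_file`. A stripped-down sketch of that registration pattern, assuming an existing `session`; the `EchoHandler` class and the `IDX`/`VALUE` column names are made up for illustration:

import tempfile

from snowflake.snowpark import Session
from snowflake.snowpark.types import IntegerType, StringType, StructField, StructType
from snowflake.snowpark.udtf import UDTFRegistration

# Handler source kept as a plain string, mirroring how execute_template is handled above.
handler_source = '''
class EchoHandler:
    def process(self, idx: int):
        # Emit one row per input row: (index, index as text).
        yield (idx, str(idx))
'''


def register_echo_udtf(session: Session, name: str) -> None:
    # Write the handler to a temporary .py file, then register it as a temporary UDTF.
    with tempfile.NamedTemporaryFile(mode="w", suffix=".py", delete=False, encoding="utf-8") as f:
        f.write(handler_source)
        f.flush()
        UDTFRegistration(session).register_from_file(
            file_path=f.name,
            handler_name="EchoHandler",
            name=name,
            output_schema=StructType(
                [StructField("IDX", IntegerType()), StructField("VALUE", StringType())]
            ),
            input_types=[IntegerType()],
            replace=True,
            is_permanent=False,
        )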
snowflake/ml/modeling/_internal/snowpark_implementations/distributed_search_udf_file.py (new file)

@@ -0,0 +1,159 @@
+ """
+ Description:
+ This is the helper file for distributed_hpo_trainer.py to create UDTF by `register_from_file`.
+ Performance Benefits:
+ The performance benefits come from two aspects,
+ 1. register_from_file can reduce duplicating loading data by only loading data once in each node
+ 2. register_from_file enable user to load data in global variable, whereas writing UDF in python script cannot.
+ Developer Tips:
+ Because this script is now a string, so there's no type hinting, linting, etc. It is highly recommended
+ to develop in a python script, test the type hinting, and then convert it into a string.
+ """
+
+ execute_template = """
+ from typing import Tuple, Any, List, Dict, Set, Iterator
+ import os
+ import sys
+ import pandas as pd
+ import numpy as np
+ import numpy.typing as npt
+ import cloudpickle as cp
+ import io
+
+
+ def _load_data_into_udf() -> Tuple[
+ npt.NDArray[Any],
+ npt.NDArray[Any],
+ List[List[int]],
+ List[Dict[str, Any]],
+ object,
+ Dict[str, Any],
+ Dict[str, Any],
+ ]:
+ import pyarrow.parquet as pq
+
+ data_files = [
+ filename
+ for filename in os.listdir(sys._xoptions["snowflake_import_directory"])
+ if filename.startswith("dataset")
+ ]
+ partial_df = [
+ pq.read_table(os.path.join(sys._xoptions["snowflake_import_directory"], file_name)).to_pandas()
+ for file_name in data_files
+ ]
+ df = pd.concat(partial_df, ignore_index=True)
+ constant_file_path = None
+ for filename in os.listdir(sys._xoptions["snowflake_import_directory"]):
+ if filename.startswith("constant"):
+ constant_file_path = os.path.join(sys._xoptions["snowflake_import_directory"], f"{filename}")
+ if constant_file_path is None:
+ raise ValueError("UDTF cannot find the constant location, abort!")
+ with open(constant_file_path, mode="rb") as constant_file_obj:
+ CONSTANTS = cp.load(constant_file_obj)
+ df.columns = CONSTANTS['dataset_snowpark_cols']
+
+ # load parameter grid
+ local_estimator_file_path = os.path.join(
+ sys._xoptions["snowflake_import_directory"],
+ f"{CONSTANTS['estimator_location']}"
+ )
+ with open(local_estimator_file_path, mode="rb") as local_estimator_file_obj:
+ estimator_objects = cp.load(local_estimator_file_obj)
+ params_to_evaluate = estimator_objects["param_grid"]
+
+ # load indices
+ local_indices_file_path = os.path.join(
+ sys._xoptions["snowflake_import_directory"],
+ f"{CONSTANTS['indices_location']}"
+ )
+ with open(local_indices_file_path, mode="rb") as local_indices_file_obj:
+ indices = cp.load(local_indices_file_obj)
+
+ # load base estimator
+ local_base_estimator_file_path = os.path.join(
+ sys._xoptions["snowflake_import_directory"], f"{CONSTANTS['base_estimator_location']}"
+ )
+ with open(local_base_estimator_file_path, mode="rb") as local_base_estimator_file_obj:
+ base_estimator = cp.load(local_base_estimator_file_obj)
+
+ # load fit_and_score_kwargs
+ local_fit_and_score_kwargs_file_path = os.path.join(
+ sys._xoptions["snowflake_import_directory"], f"{CONSTANTS['fit_and_score_kwargs_location']}"
+ )
+ with open(local_fit_and_score_kwargs_file_path, mode="rb") as local_fit_and_score_kwargs_file_obj:
+ fit_and_score_kwargs = cp.load(local_fit_and_score_kwargs_file_obj)
+
+ # convert dataframe to numpy would save memory consumption
+ return (
+ df[CONSTANTS['input_cols']].to_numpy(),
+ df[CONSTANTS['label_cols']].squeeze().to_numpy(),
+ indices,
+ params_to_evaluate,
+ base_estimator,
+ fit_and_score_kwargs,
+ CONSTANTS
+ )
+
+
+ global_load_data = _load_data_into_udf()
+
+
+ # Note Table functions (UDTFs) have a limit of 500 input arguments and 500 output columns.
+ class SearchCV:
+ def __init__(self) -> None:
+ X, y, indices, params_to_evaluate, base_estimator, fit_and_score_kwargs, CONSTANTS = global_load_data
+ self.X = X
+ self.y = y
+ self.test_indices = indices
+ self.params_to_evaluate = params_to_evaluate
+ self.base_estimator = base_estimator
+ self.fit_and_score_kwargs = fit_and_score_kwargs
+ self.fit_score_params: List[Any] = []
+ self.CONSTANTS = CONSTANTS
+ self.cv_indices_set: Set[int] = set()
+
+ def process(self, idx: int, params_idx: int, cv_idx: int) -> None:
+ self.fit_score_params.extend([[idx, params_idx, cv_idx]])
+ self.cv_indices_set.add(cv_idx)
+
+ def end_partition(self) -> Iterator[Tuple[int, str]]:
+ from sklearn.base import clone
+ from sklearn.model_selection._validation import _fit_and_score
+ from sklearn.utils.parallel import Parallel, delayed
+
+ cached_train_test_indices = {}
+ # Calculate the full index here to avoid duplicate calculation (which consumes a lot of memory)
+ full_index = np.arange(self.CONSTANTS['DATA_LENGTH'])
+ for i in self.cv_indices_set:
+ cached_train_test_indices[i] = [
+ np.setdiff1d(full_index, self.test_indices[i]),
+ self.test_indices[i],
+ ]
+
+ parallel = Parallel(n_jobs=self.CONSTANTS['_N_JOBS'], pre_dispatch=self.CONSTANTS['_PRE_DISPATCH'])
+
+ out = parallel(
+ delayed(_fit_and_score)(
+ clone(self.base_estimator),
+ self.X,
+ self.y,
+ train=cached_train_test_indices[split_idx][0],
+ test=cached_train_test_indices[split_idx][1],
+ parameters=self.params_to_evaluate[cand_idx],
+ split_progress=(split_idx, self.CONSTANTS['n_splits']),
+ candidate_progress=(cand_idx, self.CONSTANTS['n_candidates']),
+ **self.fit_and_score_kwargs, # load sample weight here
+ )
+ for _, cand_idx, split_idx in self.fit_score_params
+ )
+
+ binary_cv_results = None
+ with io.BytesIO() as f:
+ cp.dump(out, f)
+ f.seek(0)
+ binary_cv_results = f.getvalue().hex()
+ yield (
+ self.fit_score_params[0][0],
+ binary_cv_results,
+ )
+ """
snowflake/ml/modeling/calibration/calibrated_classifier_cv.py

@@ -629,7 +629,14 @@ class CalibratedClassifierCV(BaseTransformer):
  ) -> List[str]:
  # in case the inferred output column names dimension is different
  # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
- output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+ sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+ # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+ # seen during the fit.
+ snowpark_column_names = dataset.select(self.input_cols).columns
+ sample_pd_df.columns = snowpark_column_names
+
+ output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
  output_df_columns = list(output_df_pd.columns)
  output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
  if self.sample_weight_col:
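This same change (+8/−1 in the file list) is applied to every modeling estimator listed above; the hunks that follow repeat it per class. The intent: `to_pandas()` on the one-row sample may label columns differently from the identifiers the Snowpark DataFrame reports, so the fix re-applies `dataset.select(self.input_cols).columns` before handing the frame to the local sklearn call. A hypothetical pandas-only illustration of why the rename matters (column names are invented):

import pandas as pd

# Identifiers as the Snowpark DataFrame might report them for quoted columns.
snowpark_column_names = ['"feat_1"', '"feat_2"']
# Labels as a to_pandas() sample might carry them.
sample_pd_df = pd.DataFrame([[0.1, 0.2]], columns=["FEAT_1", "FEAT_2"])

sample_pd_df.columns = snowpark_column_names  # the fix: re-apply Snowpark's identifiers
print(sample_pd_df['"feat_1"'])               # downstream lookups by identifier now resolve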
snowflake/ml/modeling/cluster/affinity_propagation.py

@@ -606,7 +606,14 @@ class AffinityPropagation(BaseTransformer):
  ) -> List[str]:
  # in case the inferred output column names dimension is different
  # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
- output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+ sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+ # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+ # seen during the fit.
+ snowpark_column_names = dataset.select(self.input_cols).columns
+ sample_pd_df.columns = snowpark_column_names
+
+ output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
  output_df_columns = list(output_df_pd.columns)
  output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
  if self.sample_weight_col:

snowflake/ml/modeling/cluster/agglomerative_clustering.py

@@ -637,7 +637,14 @@ class AgglomerativeClustering(BaseTransformer):
  ) -> List[str]:
  # in case the inferred output column names dimension is different
  # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
- output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+ sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+ # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+ # seen during the fit.
+ snowpark_column_names = dataset.select(self.input_cols).columns
+ sample_pd_df.columns = snowpark_column_names
+
+ output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
  output_df_columns = list(output_df_pd.columns)
  output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
  if self.sample_weight_col:

snowflake/ml/modeling/cluster/birch.py

@@ -601,7 +601,14 @@ class Birch(BaseTransformer):
  ) -> List[str]:
  # in case the inferred output column names dimension is different
  # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
- output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+ sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+ # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+ # seen during the fit.
+ snowpark_column_names = dataset.select(self.input_cols).columns
+ sample_pd_df.columns = snowpark_column_names
+
+ output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
  output_df_columns = list(output_df_pd.columns)
  output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
  if self.sample_weight_col:

snowflake/ml/modeling/cluster/bisecting_k_means.py

@@ -650,7 +650,14 @@ class BisectingKMeans(BaseTransformer):
  ) -> List[str]:
  # in case the inferred output column names dimension is different
  # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
- output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+ sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+ # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+ # seen during the fit.
+ snowpark_column_names = dataset.select(self.input_cols).columns
+ sample_pd_df.columns = snowpark_column_names
+
+ output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
  output_df_columns = list(output_df_pd.columns)
  output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
  if self.sample_weight_col:

snowflake/ml/modeling/cluster/dbscan.py

@@ -612,7 +612,14 @@ class DBSCAN(BaseTransformer):
  ) -> List[str]:
  # in case the inferred output column names dimension is different
  # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
- output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+ sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+ # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+ # seen during the fit.
+ snowpark_column_names = dataset.select(self.input_cols).columns
+ sample_pd_df.columns = snowpark_column_names
+
+ output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
  output_df_columns = list(output_df_pd.columns)
  output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
  if self.sample_weight_col:

snowflake/ml/modeling/cluster/feature_agglomeration.py

@@ -648,7 +648,14 @@ class FeatureAgglomeration(BaseTransformer):
  ) -> List[str]:
  # in case the inferred output column names dimension is different
  # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
- output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+ sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+ # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+ # seen during the fit.
+ snowpark_column_names = dataset.select(self.input_cols).columns
+ sample_pd_df.columns = snowpark_column_names
+
+ output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
  output_df_columns = list(output_df_pd.columns)
  output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
  if self.sample_weight_col:

snowflake/ml/modeling/cluster/k_means.py

@@ -645,7 +645,14 @@ class KMeans(BaseTransformer):
  ) -> List[str]:
  # in case the inferred output column names dimension is different
  # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
- output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+ sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+ # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+ # seen during the fit.
+ snowpark_column_names = dataset.select(self.input_cols).columns
+ sample_pd_df.columns = snowpark_column_names
+
+ output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
  output_df_columns = list(output_df_pd.columns)
  output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
  if self.sample_weight_col:

snowflake/ml/modeling/cluster/mean_shift.py

@@ -617,7 +617,14 @@ class MeanShift(BaseTransformer):
  ) -> List[str]:
  # in case the inferred output column names dimension is different
  # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
- output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+ sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+ # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+ # seen during the fit.
+ snowpark_column_names = dataset.select(self.input_cols).columns
+ sample_pd_df.columns = snowpark_column_names
+
+ output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
  output_df_columns = list(output_df_pd.columns)
  output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
  if self.sample_weight_col:

snowflake/ml/modeling/cluster/mini_batch_k_means.py

@@ -671,7 +671,14 @@ class MiniBatchKMeans(BaseTransformer):
  ) -> List[str]:
  # in case the inferred output column names dimension is different
  # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
- output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+ sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+ # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+ # seen during the fit.
+ snowpark_column_names = dataset.select(self.input_cols).columns
+ sample_pd_df.columns = snowpark_column_names
+
+ output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
  output_df_columns = list(output_df_pd.columns)
  output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
  if self.sample_weight_col:

snowflake/ml/modeling/cluster/optics.py

@@ -685,7 +685,14 @@ class OPTICS(BaseTransformer):
  ) -> List[str]:
  # in case the inferred output column names dimension is different
  # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
- output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+ sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+ # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+ # seen during the fit.
+ snowpark_column_names = dataset.select(self.input_cols).columns
+ sample_pd_df.columns = snowpark_column_names
+
+ output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
  output_df_columns = list(output_df_pd.columns)
  output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
  if self.sample_weight_col:

snowflake/ml/modeling/cluster/spectral_biclustering.py

@@ -621,7 +621,14 @@ class SpectralBiclustering(BaseTransformer):
  ) -> List[str]:
  # in case the inferred output column names dimension is different
  # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
- output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+ sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+ # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+ # seen during the fit.
+ snowpark_column_names = dataset.select(self.input_cols).columns
+ sample_pd_df.columns = snowpark_column_names
+
+ output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
  output_df_columns = list(output_df_pd.columns)
  output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
  if self.sample_weight_col:

snowflake/ml/modeling/cluster/spectral_clustering.py

@@ -681,7 +681,14 @@ class SpectralClustering(BaseTransformer):
  ) -> List[str]:
  # in case the inferred output column names dimension is different
  # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
- output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+ sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+ # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+ # seen during the fit.
+ snowpark_column_names = dataset.select(self.input_cols).columns
+ sample_pd_df.columns = snowpark_column_names
+
+ output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
  output_df_columns = list(output_df_pd.columns)
  output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
  if self.sample_weight_col: