snowflake-ml-python 1.5.1__py3-none-any.whl → 1.5.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (207)
  1. snowflake/cortex/_complete.py +26 -5
  2. snowflake/cortex/_sentiment.py +7 -4
  3. snowflake/cortex/_sse_client.py +81 -0
  4. snowflake/cortex/_util.py +105 -8
  5. snowflake/ml/_internal/lineage/lineage_utils.py +34 -25
  6. snowflake/ml/_internal/utils/temp_file_utils.py +5 -2
  7. snowflake/ml/dataset/dataset.py +15 -12
  8. snowflake/ml/dataset/dataset_factory.py +3 -4
  9. snowflake/ml/feature_store/access_manager.py +34 -30
  10. snowflake/ml/feature_store/feature_store.py +3 -3
  11. snowflake/ml/feature_store/feature_view.py +12 -11
  12. snowflake/ml/fileset/snowfs.py +2 -31
  13. snowflake/ml/model/_client/ops/model_ops.py +43 -0
  14. snowflake/ml/model/_client/sql/model_version.py +55 -3
  15. snowflake/ml/model/_model_composer/model_composer.py +7 -3
  16. snowflake/ml/model/_model_composer/model_method/infer_table_function.py_template +3 -1
  17. snowflake/ml/model/_packager/model_meta/_core_requirements.py +1 -1
  18. snowflake/ml/model/_packager/model_meta/model_meta.py +1 -3
  19. snowflake/ml/model/_packager/model_runtime/_snowml_inference_alternative_requirements.py +1 -1
  20. snowflake/ml/model/_packager/model_runtime/model_runtime.py +3 -27
  21. snowflake/ml/model/_signatures/builtins_handler.py +2 -1
  22. snowflake/ml/model/_signatures/core.py +13 -1
  23. snowflake/ml/model/_signatures/pandas_handler.py +2 -0
  24. snowflake/ml/model/_signatures/snowpark_handler.py +3 -3
  25. snowflake/ml/model/model_signature.py +2 -0
  26. snowflake/ml/model/type_hints.py +1 -0
  27. snowflake/ml/modeling/_internal/estimator_utils.py +58 -1
  28. snowflake/ml/modeling/_internal/snowpark_implementations/distributed_hpo_trainer.py +196 -242
  29. snowflake/ml/modeling/_internal/snowpark_implementations/distributed_search_udf_file.py +161 -0
  30. snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_handlers.py +38 -18
  31. snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_trainer.py +82 -134
  32. snowflake/ml/modeling/_internal/snowpark_implementations/xgboost_external_memory_trainer.py +21 -17
  33. snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +9 -2
  34. snowflake/ml/modeling/cluster/affinity_propagation.py +9 -2
  35. snowflake/ml/modeling/cluster/agglomerative_clustering.py +9 -2
  36. snowflake/ml/modeling/cluster/birch.py +9 -2
  37. snowflake/ml/modeling/cluster/bisecting_k_means.py +9 -2
  38. snowflake/ml/modeling/cluster/dbscan.py +9 -2
  39. snowflake/ml/modeling/cluster/feature_agglomeration.py +9 -2
  40. snowflake/ml/modeling/cluster/k_means.py +9 -2
  41. snowflake/ml/modeling/cluster/mean_shift.py +9 -2
  42. snowflake/ml/modeling/cluster/mini_batch_k_means.py +9 -2
  43. snowflake/ml/modeling/cluster/optics.py +9 -2
  44. snowflake/ml/modeling/cluster/spectral_biclustering.py +9 -2
  45. snowflake/ml/modeling/cluster/spectral_clustering.py +9 -2
  46. snowflake/ml/modeling/cluster/spectral_coclustering.py +9 -2
  47. snowflake/ml/modeling/compose/column_transformer.py +9 -2
  48. snowflake/ml/modeling/compose/transformed_target_regressor.py +9 -2
  49. snowflake/ml/modeling/covariance/elliptic_envelope.py +9 -2
  50. snowflake/ml/modeling/covariance/empirical_covariance.py +9 -2
  51. snowflake/ml/modeling/covariance/graphical_lasso.py +9 -2
  52. snowflake/ml/modeling/covariance/graphical_lasso_cv.py +9 -2
  53. snowflake/ml/modeling/covariance/ledoit_wolf.py +9 -2
  54. snowflake/ml/modeling/covariance/min_cov_det.py +9 -2
  55. snowflake/ml/modeling/covariance/oas.py +9 -2
  56. snowflake/ml/modeling/covariance/shrunk_covariance.py +9 -2
  57. snowflake/ml/modeling/decomposition/dictionary_learning.py +9 -2
  58. snowflake/ml/modeling/decomposition/factor_analysis.py +9 -2
  59. snowflake/ml/modeling/decomposition/fast_ica.py +9 -2
  60. snowflake/ml/modeling/decomposition/incremental_pca.py +9 -2
  61. snowflake/ml/modeling/decomposition/kernel_pca.py +9 -2
  62. snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +9 -2
  63. snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +9 -2
  64. snowflake/ml/modeling/decomposition/pca.py +9 -2
  65. snowflake/ml/modeling/decomposition/sparse_pca.py +9 -2
  66. snowflake/ml/modeling/decomposition/truncated_svd.py +9 -2
  67. snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +9 -2
  68. snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +9 -2
  69. snowflake/ml/modeling/ensemble/ada_boost_classifier.py +9 -2
  70. snowflake/ml/modeling/ensemble/ada_boost_regressor.py +9 -2
  71. snowflake/ml/modeling/ensemble/bagging_classifier.py +9 -2
  72. snowflake/ml/modeling/ensemble/bagging_regressor.py +9 -2
  73. snowflake/ml/modeling/ensemble/extra_trees_classifier.py +9 -2
  74. snowflake/ml/modeling/ensemble/extra_trees_regressor.py +9 -2
  75. snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +9 -2
  76. snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +9 -2
  77. snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +9 -2
  78. snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +9 -2
  79. snowflake/ml/modeling/ensemble/isolation_forest.py +9 -2
  80. snowflake/ml/modeling/ensemble/random_forest_classifier.py +9 -2
  81. snowflake/ml/modeling/ensemble/random_forest_regressor.py +9 -2
  82. snowflake/ml/modeling/ensemble/stacking_regressor.py +9 -2
  83. snowflake/ml/modeling/ensemble/voting_classifier.py +9 -2
  84. snowflake/ml/modeling/ensemble/voting_regressor.py +9 -2
  85. snowflake/ml/modeling/feature_selection/generic_univariate_select.py +9 -2
  86. snowflake/ml/modeling/feature_selection/select_fdr.py +9 -2
  87. snowflake/ml/modeling/feature_selection/select_fpr.py +9 -2
  88. snowflake/ml/modeling/feature_selection/select_fwe.py +9 -2
  89. snowflake/ml/modeling/feature_selection/select_k_best.py +9 -2
  90. snowflake/ml/modeling/feature_selection/select_percentile.py +9 -2
  91. snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +9 -2
  92. snowflake/ml/modeling/feature_selection/variance_threshold.py +9 -2
  93. snowflake/ml/modeling/framework/base.py +3 -8
  94. snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +9 -2
  95. snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +9 -2
  96. snowflake/ml/modeling/impute/iterative_imputer.py +9 -2
  97. snowflake/ml/modeling/impute/knn_imputer.py +9 -2
  98. snowflake/ml/modeling/impute/missing_indicator.py +9 -2
  99. snowflake/ml/modeling/impute/simple_imputer.py +28 -5
  100. snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +9 -2
  101. snowflake/ml/modeling/kernel_approximation/nystroem.py +9 -2
  102. snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +9 -2
  103. snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +9 -2
  104. snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +9 -2
  105. snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +9 -2
  106. snowflake/ml/modeling/lightgbm/lgbm_classifier.py +9 -2
  107. snowflake/ml/modeling/lightgbm/lgbm_regressor.py +9 -2
  108. snowflake/ml/modeling/linear_model/ard_regression.py +9 -2
  109. snowflake/ml/modeling/linear_model/bayesian_ridge.py +9 -2
  110. snowflake/ml/modeling/linear_model/elastic_net.py +9 -2
  111. snowflake/ml/modeling/linear_model/elastic_net_cv.py +9 -2
  112. snowflake/ml/modeling/linear_model/gamma_regressor.py +9 -2
  113. snowflake/ml/modeling/linear_model/huber_regressor.py +9 -2
  114. snowflake/ml/modeling/linear_model/lars.py +9 -2
  115. snowflake/ml/modeling/linear_model/lars_cv.py +9 -2
  116. snowflake/ml/modeling/linear_model/lasso.py +9 -2
  117. snowflake/ml/modeling/linear_model/lasso_cv.py +9 -2
  118. snowflake/ml/modeling/linear_model/lasso_lars.py +9 -2
  119. snowflake/ml/modeling/linear_model/lasso_lars_cv.py +9 -2
  120. snowflake/ml/modeling/linear_model/lasso_lars_ic.py +9 -2
  121. snowflake/ml/modeling/linear_model/linear_regression.py +9 -2
  122. snowflake/ml/modeling/linear_model/logistic_regression.py +9 -2
  123. snowflake/ml/modeling/linear_model/logistic_regression_cv.py +9 -2
  124. snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +9 -2
  125. snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +9 -2
  126. snowflake/ml/modeling/linear_model/multi_task_lasso.py +9 -2
  127. snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +9 -2
  128. snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +9 -2
  129. snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +9 -2
  130. snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +9 -2
  131. snowflake/ml/modeling/linear_model/perceptron.py +9 -2
  132. snowflake/ml/modeling/linear_model/poisson_regressor.py +9 -2
  133. snowflake/ml/modeling/linear_model/ransac_regressor.py +9 -2
  134. snowflake/ml/modeling/linear_model/ridge.py +9 -2
  135. snowflake/ml/modeling/linear_model/ridge_classifier.py +9 -2
  136. snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +9 -2
  137. snowflake/ml/modeling/linear_model/ridge_cv.py +9 -2
  138. snowflake/ml/modeling/linear_model/sgd_classifier.py +9 -2
  139. snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +9 -2
  140. snowflake/ml/modeling/linear_model/sgd_regressor.py +9 -2
  141. snowflake/ml/modeling/linear_model/theil_sen_regressor.py +9 -2
  142. snowflake/ml/modeling/linear_model/tweedie_regressor.py +9 -2
  143. snowflake/ml/modeling/manifold/isomap.py +9 -2
  144. snowflake/ml/modeling/manifold/mds.py +9 -2
  145. snowflake/ml/modeling/manifold/spectral_embedding.py +9 -2
  146. snowflake/ml/modeling/manifold/tsne.py +9 -2
  147. snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +9 -2
  148. snowflake/ml/modeling/mixture/gaussian_mixture.py +9 -2
  149. snowflake/ml/modeling/model_selection/grid_search_cv.py +1 -5
  150. snowflake/ml/modeling/model_selection/randomized_search_cv.py +1 -5
  151. snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +9 -2
  152. snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +9 -2
  153. snowflake/ml/modeling/multiclass/output_code_classifier.py +9 -2
  154. snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +9 -2
  155. snowflake/ml/modeling/naive_bayes/categorical_nb.py +9 -2
  156. snowflake/ml/modeling/naive_bayes/complement_nb.py +9 -2
  157. snowflake/ml/modeling/naive_bayes/gaussian_nb.py +9 -2
  158. snowflake/ml/modeling/naive_bayes/multinomial_nb.py +9 -2
  159. snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +9 -2
  160. snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +9 -2
  161. snowflake/ml/modeling/neighbors/kernel_density.py +9 -2
  162. snowflake/ml/modeling/neighbors/local_outlier_factor.py +9 -2
  163. snowflake/ml/modeling/neighbors/nearest_centroid.py +9 -2
  164. snowflake/ml/modeling/neighbors/nearest_neighbors.py +9 -2
  165. snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +9 -2
  166. snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +9 -2
  167. snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +9 -2
  168. snowflake/ml/modeling/neural_network/bernoulli_rbm.py +9 -2
  169. snowflake/ml/modeling/neural_network/mlp_classifier.py +9 -2
  170. snowflake/ml/modeling/neural_network/mlp_regressor.py +9 -2
  171. snowflake/ml/modeling/parameters/enable_anonymous_sproc.py +5 -0
  172. snowflake/ml/modeling/pipeline/pipeline.py +5 -0
  173. snowflake/ml/modeling/preprocessing/binarizer.py +7 -3
  174. snowflake/ml/modeling/preprocessing/k_bins_discretizer.py +7 -2
  175. snowflake/ml/modeling/preprocessing/label_encoder.py +8 -7
  176. snowflake/ml/modeling/preprocessing/max_abs_scaler.py +7 -3
  177. snowflake/ml/modeling/preprocessing/min_max_scaler.py +7 -4
  178. snowflake/ml/modeling/preprocessing/normalizer.py +7 -3
  179. snowflake/ml/modeling/preprocessing/one_hot_encoder.py +10 -2
  180. snowflake/ml/modeling/preprocessing/ordinal_encoder.py +8 -5
  181. snowflake/ml/modeling/preprocessing/polynomial_features.py +9 -2
  182. snowflake/ml/modeling/preprocessing/robust_scaler.py +7 -4
  183. snowflake/ml/modeling/preprocessing/standard_scaler.py +7 -3
  184. snowflake/ml/modeling/semi_supervised/label_propagation.py +9 -2
  185. snowflake/ml/modeling/semi_supervised/label_spreading.py +9 -2
  186. snowflake/ml/modeling/svm/linear_svc.py +9 -2
  187. snowflake/ml/modeling/svm/linear_svr.py +9 -2
  188. snowflake/ml/modeling/svm/nu_svc.py +9 -2
  189. snowflake/ml/modeling/svm/nu_svr.py +9 -2
  190. snowflake/ml/modeling/svm/svc.py +9 -2
  191. snowflake/ml/modeling/svm/svr.py +9 -2
  192. snowflake/ml/modeling/tree/decision_tree_classifier.py +9 -2
  193. snowflake/ml/modeling/tree/decision_tree_regressor.py +9 -2
  194. snowflake/ml/modeling/tree/extra_tree_classifier.py +9 -2
  195. snowflake/ml/modeling/tree/extra_tree_regressor.py +9 -2
  196. snowflake/ml/modeling/xgboost/xgb_classifier.py +9 -2
  197. snowflake/ml/modeling/xgboost/xgb_regressor.py +9 -2
  198. snowflake/ml/modeling/xgboost/xgbrf_classifier.py +9 -2
  199. snowflake/ml/modeling/xgboost/xgbrf_regressor.py +9 -2
  200. snowflake/ml/registry/_manager/model_manager.py +59 -1
  201. snowflake/ml/registry/registry.py +10 -1
  202. snowflake/ml/version.py +1 -1
  203. {snowflake_ml_python-1.5.1.dist-info → snowflake_ml_python-1.5.3.dist-info}/METADATA +32 -4
  204. {snowflake_ml_python-1.5.1.dist-info → snowflake_ml_python-1.5.3.dist-info}/RECORD +207 -204
  205. {snowflake_ml_python-1.5.1.dist-info → snowflake_ml_python-1.5.3.dist-info}/LICENSE.txt +0 -0
  206. {snowflake_ml_python-1.5.1.dist-info → snowflake_ml_python-1.5.3.dist-info}/WHEEL +0 -0
  207. {snowflake_ml_python-1.5.1.dist-info → snowflake_ml_python-1.5.3.dist-info}/top_level.txt +0 -0
@@ -248,7 +248,7 @@ class VotingRegressor(BaseTransformer):
                 inspect.currentframe(), VotingRegressor.__class__.__name__
             ),
             api_calls=[Session.call],
-            custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
+            custom_tags={"autogen": True} if self._autogenerated else None,
         )
         pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
         pd_df.columns = dataset.columns
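This one-line change is applied to every autogenerated estimator in this release: the `dict()` constructor over a list of tuples is replaced with a plain dict literal. The two spellings are equivalent; a standalone illustration (not from the package):

```python
# Both expressions build the same one-entry dictionary; the literal form is
# more idiomatic and skips building an intermediate list of tuples.
old_style = dict([("autogen", True)])
new_style = {"autogen": True}
assert old_style == new_style
```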
@@ -585,7 +585,14 @@ class VotingRegressor(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-        output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
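The other change repeated across the estimators rewrites `_get_output_column_names`: the one-row sample is now restricted to `self.input_cols`, and its pandas labels are reset from the Snowpark schema so the sklearn object sees the same column names, in the same order, as during fit. A minimal sketch of the renaming step with hypothetical column names (the mismatch between `to_pandas()` labels and quoted Snowflake identifiers is my reading of the comment, not spelled out in the diff):

```python
import pandas as pd

# Hypothetical: to_pandas() materialized the column under a normalized label,
# while the Snowpark schema carries the quoted identifier seen during fit.
sample_pd_df = pd.DataFrame({"MYFEATURE": [1.0]})
snowpark_column_names = ['"myFeature"']  # e.g. dataset.select(input_cols).columns
sample_pd_df.columns = snowpark_column_names  # rename/reorder to match fit
assert list(sample_pd_df.columns) == ['"myFeature"']
```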
@@ -238,7 +238,7 @@ class GenericUnivariateSelect(BaseTransformer):
                 inspect.currentframe(), GenericUnivariateSelect.__class__.__name__
             ),
             api_calls=[Session.call],
-            custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
+            custom_tags={"autogen": True} if self._autogenerated else None,
         )
         pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
         pd_df.columns = dataset.columns
@@ -573,7 +573,14 @@ class GenericUnivariateSelect(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-        output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
@@ -234,7 +234,7 @@ class SelectFdr(BaseTransformer):
                 inspect.currentframe(), SelectFdr.__class__.__name__
             ),
             api_calls=[Session.call],
-            custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
+            custom_tags={"autogen": True} if self._autogenerated else None,
         )
         pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
         pd_df.columns = dataset.columns
@@ -569,7 +569,14 @@ class SelectFdr(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-        output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
@@ -234,7 +234,7 @@ class SelectFpr(BaseTransformer):
                 inspect.currentframe(), SelectFpr.__class__.__name__
             ),
             api_calls=[Session.call],
-            custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
+            custom_tags={"autogen": True} if self._autogenerated else None,
         )
         pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
         pd_df.columns = dataset.columns
@@ -569,7 +569,14 @@ class SelectFpr(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-        output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
@@ -234,7 +234,7 @@ class SelectFwe(BaseTransformer):
                 inspect.currentframe(), SelectFwe.__class__.__name__
             ),
             api_calls=[Session.call],
-            custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
+            custom_tags={"autogen": True} if self._autogenerated else None,
         )
         pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
         pd_df.columns = dataset.columns
@@ -569,7 +569,14 @@ class SelectFwe(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-        output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
@@ -235,7 +235,7 @@ class SelectKBest(BaseTransformer):
                 inspect.currentframe(), SelectKBest.__class__.__name__
             ),
             api_calls=[Session.call],
-            custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
+            custom_tags={"autogen": True} if self._autogenerated else None,
         )
         pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
         pd_df.columns = dataset.columns
@@ -570,7 +570,14 @@ class SelectKBest(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-        output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
@@ -234,7 +234,7 @@ class SelectPercentile(BaseTransformer):
                 inspect.currentframe(), SelectPercentile.__class__.__name__
             ),
             api_calls=[Session.call],
-            custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
+            custom_tags={"autogen": True} if self._autogenerated else None,
         )
         pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
         pd_df.columns = dataset.columns
@@ -569,7 +569,14 @@ class SelectPercentile(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-        output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
@@ -292,7 +292,7 @@ class SequentialFeatureSelector(BaseTransformer):
                 inspect.currentframe(), SequentialFeatureSelector.__class__.__name__
             ),
             api_calls=[Session.call],
-            custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
+            custom_tags={"autogen": True} if self._autogenerated else None,
         )
         pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
         pd_df.columns = dataset.columns
@@ -627,7 +627,14 @@ class SequentialFeatureSelector(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-        output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
@@ -225,7 +225,7 @@ class VarianceThreshold(BaseTransformer):
                 inspect.currentframe(), VarianceThreshold.__class__.__name__
             ),
             api_calls=[Session.call],
-            custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
+            custom_tags={"autogen": True} if self._autogenerated else None,
         )
         pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
         pd_df.columns = dataset.columns
@@ -560,7 +560,14 @@ class VarianceThreshold(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-        output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
@@ -16,7 +16,7 @@ from snowflake.ml._internal.exceptions import (
     exceptions,
     modeling_error_messages,
 )
-from snowflake.ml._internal.lineage import data_source, lineage_utils
+from snowflake.ml._internal.lineage import lineage_utils
 from snowflake.ml._internal.utils import identifier, parallelize
 from snowflake.ml.modeling.framework import _utils
 from snowflake.snowpark import functions as F
@@ -386,7 +386,6 @@ class BaseEstimator(Base):
         self.file_names = file_names
         self.custom_states = custom_states
         self.sample_weight_col = sample_weight_col
-        self._data_sources: Optional[List[data_source.DataSource]] = None

         self.start_time = datetime.now().strftime(_utils.DATETIME_FORMAT)[:-3]

@@ -421,18 +420,14 @@ class BaseEstimator(Base):
         """
         return []

-    def _get_data_sources(self) -> Optional[List[data_source.DataSource]]:
-        return self._data_sources
-
     @telemetry.send_api_usage_telemetry(
         project=PROJECT,
         subproject=SUBPROJECT,
     )
     def fit(self, dataset: Union[snowpark.DataFrame, pd.DataFrame]) -> "BaseEstimator":
         """Runs universal logics for all fit implementations."""
-        self._data_sources = getattr(dataset, lineage_utils.DATA_SOURCES_ATTR, None)
-        if self._data_sources:
-            assert all(isinstance(ds, data_source.DataSource) for ds in self._data_sources)
+        data_sources = lineage_utils.get_data_sources(dataset)
+        lineage_utils.set_data_sources(self, data_sources)
         return self._fit(dataset)

     @abstractmethod
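`BaseEstimator.fit` now delegates lineage bookkeeping to `lineage_utils.get_data_sources` and `lineage_utils.set_data_sources` instead of reading `DATA_SOURCES_ATTR` off the dataset directly. The helpers' bodies are not part of this hunk (they live in `lineage_utils.py`, changed above); a plausible sketch of such attribute-backed accessors, offered purely as an assumption:

```python
from typing import Any, List, Optional

# DATA_SOURCES_ATTR exists in lineage_utils (the removed code references it);
# its string value here is an assumption for illustration.
DATA_SOURCES_ATTR = "_data_sources"

def get_data_sources(obj: Any) -> Optional[List[Any]]:
    # Return the lineage list if the object carries one, else None.
    return getattr(obj, DATA_SOURCES_ATTR, None)

def set_data_sources(obj: Any, data_sources: Optional[List[Any]]) -> None:
    # Attach the lineage list so downstream consumers (e.g. model logging)
    # can recover where the training data came from.
    if data_sources:
        setattr(obj, DATA_SOURCES_ATTR, data_sources)
```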
@@ -320,7 +320,7 @@ class GaussianProcessClassifier(BaseTransformer):
                 inspect.currentframe(), GaussianProcessClassifier.__class__.__name__
             ),
             api_calls=[Session.call],
-            custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
+            custom_tags={"autogen": True} if self._autogenerated else None,
         )
         pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
         pd_df.columns = dataset.columns
@@ -653,7 +653,14 @@ class GaussianProcessClassifier(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-        output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
@@ -311,7 +311,7 @@ class GaussianProcessRegressor(BaseTransformer):
                 inspect.currentframe(), GaussianProcessRegressor.__class__.__name__
             ),
             api_calls=[Session.call],
-            custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
+            custom_tags={"autogen": True} if self._autogenerated else None,
         )
         pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
         pd_df.columns = dataset.columns
@@ -644,7 +644,14 @@ class GaussianProcessRegressor(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-        output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
@@ -353,7 +353,7 @@ class IterativeImputer(BaseTransformer):
                 inspect.currentframe(), IterativeImputer.__class__.__name__
             ),
             api_calls=[Session.call],
-            custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
+            custom_tags={"autogen": True} if self._autogenerated else None,
         )
         pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
         pd_df.columns = dataset.columns
@@ -688,7 +688,14 @@ class IterativeImputer(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-        output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
@@ -279,7 +279,7 @@ class KNNImputer(BaseTransformer):
                 inspect.currentframe(), KNNImputer.__class__.__name__
             ),
             api_calls=[Session.call],
-            custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
+            custom_tags={"autogen": True} if self._autogenerated else None,
         )
         pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
         pd_df.columns = dataset.columns
@@ -614,7 +614,14 @@ class KNNImputer(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-        output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
@@ -253,7 +253,7 @@ class MissingIndicator(BaseTransformer):
                 inspect.currentframe(), MissingIndicator.__class__.__name__
             ),
             api_calls=[Session.call],
-            custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
+            custom_tags={"autogen": True} if self._autogenerated else None,
         )
         pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
         pd_df.columns = dataset.columns
@@ -588,7 +588,14 @@ class MissingIndicator(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-        output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
@@ -102,10 +102,14 @@ class SimpleImputer(base.BaseTransformer):
             For string or object data types, `fill_value` must be a string. If `None`, `fill_value` will be 0 when
             imputing numerical data and `missing_value` for strings and object data types.
         input_cols: Optional[Union[str, List[str]]]
-            Columns to use as inputs during fit and transform.
+            The name(s) of one or more columns in the input DataFrame containing feature(s) to be imputed. Input
+            columns must be specified before fit with this argument or after initialization with the
+            `set_input_cols` method. This argument is optional for API consistency.
         output_cols: Optional[Union[str, List[str]]]
-            A string or list of strings representing column names that will store the output of transform operation.
-            The length of `output_cols` must equal the length of `input_cols`.
+            The name(s) to assign output columns in the output DataFrame. The number of
+            output columns specified must equal the number of input columns. Output columns must be specified before
+            transform with this argument or after initialization with the `set_output_cols` method. This argument is
+            optional for API consistency.
         passthrough_cols: A string or a list of strings indicating column names to be excluded from any
             operations (such as train, transform, or inference). These specified column(s)
             will remain untouched throughout the process. This option is helpful in scenarios
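The reworded docstring makes explicit that columns can be wired up either at construction or later through setters. A usage sketch under that reading (column names and strategy are illustrative):

```python
from snowflake.ml.modeling.impute import SimpleImputer

# Passing columns at construction...
imputer = SimpleImputer(strategy="mean", input_cols=["AGE"], output_cols=["AGE_IMPUTED"])

# ...or after initialization, via the setters the docstring points to.
imputer2 = SimpleImputer(strategy="mean")
imputer2.set_input_cols(["AGE"])
imputer2.set_output_cols(["AGE_IMPUTED"])
```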
@@ -158,6 +162,7 @@ class SimpleImputer(base.BaseTransformer):

         self.fill_value = fill_value
         self.missing_values = missing_values
+        self.statistics_: Dict[str, Any] = {}
         # TODO(hayu): [SNOW-752265] Support SimpleImputer keep_empty_features.
         # Add back when `keep_empty_features` is supported.
         # self.keep_empty_features = keep_empty_features
@@ -229,8 +234,27 @@ class SimpleImputer(base.BaseTransformer):

         return input_col_datatypes

+    def _fit(self, dataset: Union[snowpark.DataFrame, pd.DataFrame]) -> "SimpleImputer":
+        if isinstance(dataset, snowpark.DataFrame):
+            return self._fit_snowpark(dataset)
+        else:
+            return self._fit_sklearn(dataset)
+
+    def _fit_sklearn(self, dataset: pd.DataFrame) -> "SimpleImputer":
+        dataset = self._use_input_cols_only(dataset)
+        sklearn_simple_imputer = self._create_sklearn_object()
+        sklearn_simple_imputer = sklearn_simple_imputer.fit(dataset)
+        self._sklearn_object = sklearn_simple_imputer
+        for input_col, fill_value in zip(self.input_cols, sklearn_simple_imputer.statistics_.tolist()):
+            self.statistics_[input_col] = fill_value
+        self._sklearn_fit_dtype = sklearn_simple_imputer._fit_dtype
+        self.n_features_in_ = len(self.input_cols)
+        self.feature_names_in_ = self.input_cols
+        self._is_fitted = True
+        return self
+
     @telemetry.send_api_usage_telemetry(project=base.PROJECT, subproject=_SUBPROJECT)
-    def fit(self, dataset: snowpark.DataFrame) -> "SimpleImputer":
+    def _fit_snowpark(self, dataset: snowpark.DataFrame) -> "SimpleImputer":
         """
         Compute values to impute for the dataset according to the strategy.

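With the old public `fit` renamed to `_fit_snowpark` and the new `_fit` dispatcher in place, a pandas DataFrame now takes a pure-sklearn path (the base class `fit` routes through `_fit`, per the `base.py` hunk above). A brief sketch of what that enables, with made-up data:

```python
import pandas as pd
from snowflake.ml.modeling.impute import SimpleImputer

df = pd.DataFrame({"AGE": [30.0, None, 50.0]})
imputer = SimpleImputer(strategy="mean", input_cols=["AGE"], output_cols=["AGE"])
imputer.fit(df)             # pandas input dispatches to _fit_sklearn
print(imputer.statistics_)  # expected: {"AGE": 40.0}
```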
@@ -245,7 +269,6 @@ class SimpleImputer(base.BaseTransformer):
         # In order to fit, the input columns should have the same type.
         input_col_datatypes = self._get_dataset_input_col_datatypes(dataset)

-        self.statistics_: Dict[str, Any] = {}
         statement_params = telemetry.get_statement_params(base.PROJECT, _SUBPROJECT, self.__class__.__name__)

         if self.strategy == "constant":
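Moving the `statistics_` initialization out of the fit path and into `__init__` (the `@@ -158,6 +162,7 @@` hunk above) means the attribute now exists on unfitted instances instead of appearing only after fit. A small behavioral sketch (illustrative):

```python
from snowflake.ml.modeling.impute import SimpleImputer

imputer = SimpleImputer(strategy="mean", input_cols=["AGE"], output_cols=["AGE"])
# statistics_ is an empty dict before fit rather than a missing attribute;
# fit populates it with one fill value per input column.
print(imputer.statistics_)  # {}
```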
@@ -228,7 +228,7 @@ class AdditiveChi2Sampler(BaseTransformer):
                 inspect.currentframe(), AdditiveChi2Sampler.__class__.__name__
             ),
             api_calls=[Session.call],
-            custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
+            custom_tags={"autogen": True} if self._autogenerated else None,
         )
         pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
         pd_df.columns = dataset.columns
@@ -563,7 +563,14 @@ class AdditiveChi2Sampler(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-        output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
@@ -276,7 +276,7 @@ class Nystroem(BaseTransformer):
                 inspect.currentframe(), Nystroem.__class__.__name__
             ),
             api_calls=[Session.call],
-            custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
+            custom_tags={"autogen": True} if self._autogenerated else None,
         )
         pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
         pd_df.columns = dataset.columns
@@ -611,7 +611,14 @@ class Nystroem(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-        output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
@@ -252,7 +252,7 @@ class PolynomialCountSketch(BaseTransformer):
                 inspect.currentframe(), PolynomialCountSketch.__class__.__name__
             ),
             api_calls=[Session.call],
-            custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
+            custom_tags={"autogen": True} if self._autogenerated else None,
         )
         pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
         pd_df.columns = dataset.columns
@@ -587,7 +587,14 @@ class PolynomialCountSketch(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-        output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
@@ -239,7 +239,7 @@ class RBFSampler(BaseTransformer):
                 inspect.currentframe(), RBFSampler.__class__.__name__
             ),
             api_calls=[Session.call],
-            custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
+            custom_tags={"autogen": True} if self._autogenerated else None,
         )
         pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
         pd_df.columns = dataset.columns
@@ -574,7 +574,14 @@ class RBFSampler(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-        output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
@@ -237,7 +237,7 @@ class SkewedChi2Sampler(BaseTransformer):
                 inspect.currentframe(), SkewedChi2Sampler.__class__.__name__
             ),
             api_calls=[Session.call],
-            custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
+            custom_tags={"autogen": True} if self._autogenerated else None,
         )
         pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
         pd_df.columns = dataset.columns
@@ -572,7 +572,14 @@ class SkewedChi2Sampler(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-        output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
@@ -273,7 +273,7 @@ class KernelRidge(BaseTransformer):
                 inspect.currentframe(), KernelRidge.__class__.__name__
             ),
             api_calls=[Session.call],
-            custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
+            custom_tags={"autogen": True} if self._autogenerated else None,
         )
         pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
         pd_df.columns = dataset.columns
@@ -606,7 +606,14 @@ class KernelRidge(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-        output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col: