snowflake-ml-python 1.5.1__py3-none-any.whl → 1.5.3__py3-none-any.whl

This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.
Files changed (207)
  1. snowflake/cortex/_complete.py +26 -5
  2. snowflake/cortex/_sentiment.py +7 -4
  3. snowflake/cortex/_sse_client.py +81 -0
  4. snowflake/cortex/_util.py +105 -8
  5. snowflake/ml/_internal/lineage/lineage_utils.py +34 -25
  6. snowflake/ml/_internal/utils/temp_file_utils.py +5 -2
  7. snowflake/ml/dataset/dataset.py +15 -12
  8. snowflake/ml/dataset/dataset_factory.py +3 -4
  9. snowflake/ml/feature_store/access_manager.py +34 -30
  10. snowflake/ml/feature_store/feature_store.py +3 -3
  11. snowflake/ml/feature_store/feature_view.py +12 -11
  12. snowflake/ml/fileset/snowfs.py +2 -31
  13. snowflake/ml/model/_client/ops/model_ops.py +43 -0
  14. snowflake/ml/model/_client/sql/model_version.py +55 -3
  15. snowflake/ml/model/_model_composer/model_composer.py +7 -3
  16. snowflake/ml/model/_model_composer/model_method/infer_table_function.py_template +3 -1
  17. snowflake/ml/model/_packager/model_meta/_core_requirements.py +1 -1
  18. snowflake/ml/model/_packager/model_meta/model_meta.py +1 -3
  19. snowflake/ml/model/_packager/model_runtime/_snowml_inference_alternative_requirements.py +1 -1
  20. snowflake/ml/model/_packager/model_runtime/model_runtime.py +3 -27
  21. snowflake/ml/model/_signatures/builtins_handler.py +2 -1
  22. snowflake/ml/model/_signatures/core.py +13 -1
  23. snowflake/ml/model/_signatures/pandas_handler.py +2 -0
  24. snowflake/ml/model/_signatures/snowpark_handler.py +3 -3
  25. snowflake/ml/model/model_signature.py +2 -0
  26. snowflake/ml/model/type_hints.py +1 -0
  27. snowflake/ml/modeling/_internal/estimator_utils.py +58 -1
  28. snowflake/ml/modeling/_internal/snowpark_implementations/distributed_hpo_trainer.py +196 -242
  29. snowflake/ml/modeling/_internal/snowpark_implementations/distributed_search_udf_file.py +161 -0
  30. snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_handlers.py +38 -18
  31. snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_trainer.py +82 -134
  32. snowflake/ml/modeling/_internal/snowpark_implementations/xgboost_external_memory_trainer.py +21 -17
  33. snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +9 -2
  34. snowflake/ml/modeling/cluster/affinity_propagation.py +9 -2
  35. snowflake/ml/modeling/cluster/agglomerative_clustering.py +9 -2
  36. snowflake/ml/modeling/cluster/birch.py +9 -2
  37. snowflake/ml/modeling/cluster/bisecting_k_means.py +9 -2
  38. snowflake/ml/modeling/cluster/dbscan.py +9 -2
  39. snowflake/ml/modeling/cluster/feature_agglomeration.py +9 -2
  40. snowflake/ml/modeling/cluster/k_means.py +9 -2
  41. snowflake/ml/modeling/cluster/mean_shift.py +9 -2
  42. snowflake/ml/modeling/cluster/mini_batch_k_means.py +9 -2
  43. snowflake/ml/modeling/cluster/optics.py +9 -2
  44. snowflake/ml/modeling/cluster/spectral_biclustering.py +9 -2
  45. snowflake/ml/modeling/cluster/spectral_clustering.py +9 -2
  46. snowflake/ml/modeling/cluster/spectral_coclustering.py +9 -2
  47. snowflake/ml/modeling/compose/column_transformer.py +9 -2
  48. snowflake/ml/modeling/compose/transformed_target_regressor.py +9 -2
  49. snowflake/ml/modeling/covariance/elliptic_envelope.py +9 -2
  50. snowflake/ml/modeling/covariance/empirical_covariance.py +9 -2
  51. snowflake/ml/modeling/covariance/graphical_lasso.py +9 -2
  52. snowflake/ml/modeling/covariance/graphical_lasso_cv.py +9 -2
  53. snowflake/ml/modeling/covariance/ledoit_wolf.py +9 -2
  54. snowflake/ml/modeling/covariance/min_cov_det.py +9 -2
  55. snowflake/ml/modeling/covariance/oas.py +9 -2
  56. snowflake/ml/modeling/covariance/shrunk_covariance.py +9 -2
  57. snowflake/ml/modeling/decomposition/dictionary_learning.py +9 -2
  58. snowflake/ml/modeling/decomposition/factor_analysis.py +9 -2
  59. snowflake/ml/modeling/decomposition/fast_ica.py +9 -2
  60. snowflake/ml/modeling/decomposition/incremental_pca.py +9 -2
  61. snowflake/ml/modeling/decomposition/kernel_pca.py +9 -2
  62. snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +9 -2
  63. snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +9 -2
  64. snowflake/ml/modeling/decomposition/pca.py +9 -2
  65. snowflake/ml/modeling/decomposition/sparse_pca.py +9 -2
  66. snowflake/ml/modeling/decomposition/truncated_svd.py +9 -2
  67. snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +9 -2
  68. snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +9 -2
  69. snowflake/ml/modeling/ensemble/ada_boost_classifier.py +9 -2
  70. snowflake/ml/modeling/ensemble/ada_boost_regressor.py +9 -2
  71. snowflake/ml/modeling/ensemble/bagging_classifier.py +9 -2
  72. snowflake/ml/modeling/ensemble/bagging_regressor.py +9 -2
  73. snowflake/ml/modeling/ensemble/extra_trees_classifier.py +9 -2
  74. snowflake/ml/modeling/ensemble/extra_trees_regressor.py +9 -2
  75. snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +9 -2
  76. snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +9 -2
  77. snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +9 -2
  78. snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +9 -2
  79. snowflake/ml/modeling/ensemble/isolation_forest.py +9 -2
  80. snowflake/ml/modeling/ensemble/random_forest_classifier.py +9 -2
  81. snowflake/ml/modeling/ensemble/random_forest_regressor.py +9 -2
  82. snowflake/ml/modeling/ensemble/stacking_regressor.py +9 -2
  83. snowflake/ml/modeling/ensemble/voting_classifier.py +9 -2
  84. snowflake/ml/modeling/ensemble/voting_regressor.py +9 -2
  85. snowflake/ml/modeling/feature_selection/generic_univariate_select.py +9 -2
  86. snowflake/ml/modeling/feature_selection/select_fdr.py +9 -2
  87. snowflake/ml/modeling/feature_selection/select_fpr.py +9 -2
  88. snowflake/ml/modeling/feature_selection/select_fwe.py +9 -2
  89. snowflake/ml/modeling/feature_selection/select_k_best.py +9 -2
  90. snowflake/ml/modeling/feature_selection/select_percentile.py +9 -2
  91. snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +9 -2
  92. snowflake/ml/modeling/feature_selection/variance_threshold.py +9 -2
  93. snowflake/ml/modeling/framework/base.py +3 -8
  94. snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +9 -2
  95. snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +9 -2
  96. snowflake/ml/modeling/impute/iterative_imputer.py +9 -2
  97. snowflake/ml/modeling/impute/knn_imputer.py +9 -2
  98. snowflake/ml/modeling/impute/missing_indicator.py +9 -2
  99. snowflake/ml/modeling/impute/simple_imputer.py +28 -5
  100. snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +9 -2
  101. snowflake/ml/modeling/kernel_approximation/nystroem.py +9 -2
  102. snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +9 -2
  103. snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +9 -2
  104. snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +9 -2
  105. snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +9 -2
  106. snowflake/ml/modeling/lightgbm/lgbm_classifier.py +9 -2
  107. snowflake/ml/modeling/lightgbm/lgbm_regressor.py +9 -2
  108. snowflake/ml/modeling/linear_model/ard_regression.py +9 -2
  109. snowflake/ml/modeling/linear_model/bayesian_ridge.py +9 -2
  110. snowflake/ml/modeling/linear_model/elastic_net.py +9 -2
  111. snowflake/ml/modeling/linear_model/elastic_net_cv.py +9 -2
  112. snowflake/ml/modeling/linear_model/gamma_regressor.py +9 -2
  113. snowflake/ml/modeling/linear_model/huber_regressor.py +9 -2
  114. snowflake/ml/modeling/linear_model/lars.py +9 -2
  115. snowflake/ml/modeling/linear_model/lars_cv.py +9 -2
  116. snowflake/ml/modeling/linear_model/lasso.py +9 -2
  117. snowflake/ml/modeling/linear_model/lasso_cv.py +9 -2
  118. snowflake/ml/modeling/linear_model/lasso_lars.py +9 -2
  119. snowflake/ml/modeling/linear_model/lasso_lars_cv.py +9 -2
  120. snowflake/ml/modeling/linear_model/lasso_lars_ic.py +9 -2
  121. snowflake/ml/modeling/linear_model/linear_regression.py +9 -2
  122. snowflake/ml/modeling/linear_model/logistic_regression.py +9 -2
  123. snowflake/ml/modeling/linear_model/logistic_regression_cv.py +9 -2
  124. snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +9 -2
  125. snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +9 -2
  126. snowflake/ml/modeling/linear_model/multi_task_lasso.py +9 -2
  127. snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +9 -2
  128. snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +9 -2
  129. snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +9 -2
  130. snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +9 -2
  131. snowflake/ml/modeling/linear_model/perceptron.py +9 -2
  132. snowflake/ml/modeling/linear_model/poisson_regressor.py +9 -2
  133. snowflake/ml/modeling/linear_model/ransac_regressor.py +9 -2
  134. snowflake/ml/modeling/linear_model/ridge.py +9 -2
  135. snowflake/ml/modeling/linear_model/ridge_classifier.py +9 -2
  136. snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +9 -2
  137. snowflake/ml/modeling/linear_model/ridge_cv.py +9 -2
  138. snowflake/ml/modeling/linear_model/sgd_classifier.py +9 -2
  139. snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +9 -2
  140. snowflake/ml/modeling/linear_model/sgd_regressor.py +9 -2
  141. snowflake/ml/modeling/linear_model/theil_sen_regressor.py +9 -2
  142. snowflake/ml/modeling/linear_model/tweedie_regressor.py +9 -2
  143. snowflake/ml/modeling/manifold/isomap.py +9 -2
  144. snowflake/ml/modeling/manifold/mds.py +9 -2
  145. snowflake/ml/modeling/manifold/spectral_embedding.py +9 -2
  146. snowflake/ml/modeling/manifold/tsne.py +9 -2
  147. snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +9 -2
  148. snowflake/ml/modeling/mixture/gaussian_mixture.py +9 -2
  149. snowflake/ml/modeling/model_selection/grid_search_cv.py +1 -5
  150. snowflake/ml/modeling/model_selection/randomized_search_cv.py +1 -5
  151. snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +9 -2
  152. snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +9 -2
  153. snowflake/ml/modeling/multiclass/output_code_classifier.py +9 -2
  154. snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +9 -2
  155. snowflake/ml/modeling/naive_bayes/categorical_nb.py +9 -2
  156. snowflake/ml/modeling/naive_bayes/complement_nb.py +9 -2
  157. snowflake/ml/modeling/naive_bayes/gaussian_nb.py +9 -2
  158. snowflake/ml/modeling/naive_bayes/multinomial_nb.py +9 -2
  159. snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +9 -2
  160. snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +9 -2
  161. snowflake/ml/modeling/neighbors/kernel_density.py +9 -2
  162. snowflake/ml/modeling/neighbors/local_outlier_factor.py +9 -2
  163. snowflake/ml/modeling/neighbors/nearest_centroid.py +9 -2
  164. snowflake/ml/modeling/neighbors/nearest_neighbors.py +9 -2
  165. snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +9 -2
  166. snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +9 -2
  167. snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +9 -2
  168. snowflake/ml/modeling/neural_network/bernoulli_rbm.py +9 -2
  169. snowflake/ml/modeling/neural_network/mlp_classifier.py +9 -2
  170. snowflake/ml/modeling/neural_network/mlp_regressor.py +9 -2
  171. snowflake/ml/modeling/parameters/enable_anonymous_sproc.py +5 -0
  172. snowflake/ml/modeling/pipeline/pipeline.py +5 -0
  173. snowflake/ml/modeling/preprocessing/binarizer.py +7 -3
  174. snowflake/ml/modeling/preprocessing/k_bins_discretizer.py +7 -2
  175. snowflake/ml/modeling/preprocessing/label_encoder.py +8 -7
  176. snowflake/ml/modeling/preprocessing/max_abs_scaler.py +7 -3
  177. snowflake/ml/modeling/preprocessing/min_max_scaler.py +7 -4
  178. snowflake/ml/modeling/preprocessing/normalizer.py +7 -3
  179. snowflake/ml/modeling/preprocessing/one_hot_encoder.py +10 -2
  180. snowflake/ml/modeling/preprocessing/ordinal_encoder.py +8 -5
  181. snowflake/ml/modeling/preprocessing/polynomial_features.py +9 -2
  182. snowflake/ml/modeling/preprocessing/robust_scaler.py +7 -4
  183. snowflake/ml/modeling/preprocessing/standard_scaler.py +7 -3
  184. snowflake/ml/modeling/semi_supervised/label_propagation.py +9 -2
  185. snowflake/ml/modeling/semi_supervised/label_spreading.py +9 -2
  186. snowflake/ml/modeling/svm/linear_svc.py +9 -2
  187. snowflake/ml/modeling/svm/linear_svr.py +9 -2
  188. snowflake/ml/modeling/svm/nu_svc.py +9 -2
  189. snowflake/ml/modeling/svm/nu_svr.py +9 -2
  190. snowflake/ml/modeling/svm/svc.py +9 -2
  191. snowflake/ml/modeling/svm/svr.py +9 -2
  192. snowflake/ml/modeling/tree/decision_tree_classifier.py +9 -2
  193. snowflake/ml/modeling/tree/decision_tree_regressor.py +9 -2
  194. snowflake/ml/modeling/tree/extra_tree_classifier.py +9 -2
  195. snowflake/ml/modeling/tree/extra_tree_regressor.py +9 -2
  196. snowflake/ml/modeling/xgboost/xgb_classifier.py +9 -2
  197. snowflake/ml/modeling/xgboost/xgb_regressor.py +9 -2
  198. snowflake/ml/modeling/xgboost/xgbrf_classifier.py +9 -2
  199. snowflake/ml/modeling/xgboost/xgbrf_regressor.py +9 -2
  200. snowflake/ml/registry/_manager/model_manager.py +59 -1
  201. snowflake/ml/registry/registry.py +10 -1
  202. snowflake/ml/version.py +1 -1
  203. {snowflake_ml_python-1.5.1.dist-info → snowflake_ml_python-1.5.3.dist-info}/METADATA +32 -4
  204. {snowflake_ml_python-1.5.1.dist-info → snowflake_ml_python-1.5.3.dist-info}/RECORD +207 -204
  205. {snowflake_ml_python-1.5.1.dist-info → snowflake_ml_python-1.5.3.dist-info}/LICENSE.txt +0 -0
  206. {snowflake_ml_python-1.5.1.dist-info → snowflake_ml_python-1.5.3.dist-info}/WHEEL +0 -0
  207. {snowflake_ml_python-1.5.1.dist-info → snowflake_ml_python-1.5.3.dist-info}/top_level.txt +0 -0
@@ -262,7 +262,7 @@ class Birch(BaseTransformer):
262
262
  inspect.currentframe(), Birch.__class__.__name__
263
263
  ),
264
264
  api_calls=[Session.call],
265
- custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
265
+ custom_tags={"autogen": True} if self._autogenerated else None,
266
266
  )
267
267
  pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
268
268
  pd_df.columns = dataset.columns
@@ -601,7 +601,14 @@ class Birch(BaseTransformer):
601
601
  ) -> List[str]:
602
602
  # in case the inferred output column names dimension is different
603
603
  # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
604
- output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
604
+ sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
605
+
606
+ # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
607
+ # seen during the fit.
608
+ snowpark_column_names = dataset.select(self.input_cols).columns
609
+ sample_pd_df.columns = snowpark_column_names
610
+
611
+ output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
605
612
  output_df_columns = list(output_df_pd.columns)
606
613
  output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
607
614
  if self.sample_weight_col:
@@ -311,7 +311,7 @@ class BisectingKMeans(BaseTransformer):
311
311
  inspect.currentframe(), BisectingKMeans.__class__.__name__
312
312
  ),
313
313
  api_calls=[Session.call],
314
- custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
314
+ custom_tags={"autogen": True} if self._autogenerated else None,
315
315
  )
316
316
  pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
317
317
  pd_df.columns = dataset.columns
@@ -650,7 +650,14 @@ class BisectingKMeans(BaseTransformer):
650
650
  ) -> List[str]:
651
651
  # in case the inferred output column names dimension is different
652
652
  # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
653
- output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
653
+ sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
654
+
655
+ # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
656
+ # seen during the fit.
657
+ snowpark_column_names = dataset.select(self.input_cols).columns
658
+ sample_pd_df.columns = snowpark_column_names
659
+
660
+ output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
654
661
  output_df_columns = list(output_df_pd.columns)
655
662
  output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
656
663
  if self.sample_weight_col:
@@ -279,7 +279,7 @@ class DBSCAN(BaseTransformer):
279
279
  inspect.currentframe(), DBSCAN.__class__.__name__
280
280
  ),
281
281
  api_calls=[Session.call],
282
- custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
282
+ custom_tags={"autogen": True} if self._autogenerated else None,
283
283
  )
284
284
  pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
285
285
  pd_df.columns = dataset.columns
@@ -612,7 +612,14 @@ class DBSCAN(BaseTransformer):
612
612
  ) -> List[str]:
613
613
  # in case the inferred output column names dimension is different
614
614
  # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
615
- output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
615
+ sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
616
+
617
+ # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
618
+ # seen during the fit.
619
+ snowpark_column_names = dataset.select(self.input_cols).columns
620
+ sample_pd_df.columns = snowpark_column_names
621
+
622
+ output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
616
623
  output_df_columns = list(output_df_pd.columns)
617
624
  output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
618
625
  if self.sample_weight_col:
@@ -311,7 +311,7 @@ class FeatureAgglomeration(BaseTransformer):
311
311
  inspect.currentframe(), FeatureAgglomeration.__class__.__name__
312
312
  ),
313
313
  api_calls=[Session.call],
314
- custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
314
+ custom_tags={"autogen": True} if self._autogenerated else None,
315
315
  )
316
316
  pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
317
317
  pd_df.columns = dataset.columns
@@ -648,7 +648,14 @@ class FeatureAgglomeration(BaseTransformer):
648
648
  ) -> List[str]:
649
649
  # in case the inferred output column names dimension is different
650
650
  # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
651
- output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
651
+ sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
652
+
653
+ # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
654
+ # seen during the fit.
655
+ snowpark_column_names = dataset.select(self.input_cols).columns
656
+ sample_pd_df.columns = snowpark_column_names
657
+
658
+ output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
652
659
  output_df_columns = list(output_df_pd.columns)
653
660
  output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
654
661
  if self.sample_weight_col:
@@ -306,7 +306,7 @@ class KMeans(BaseTransformer):
306
306
  inspect.currentframe(), KMeans.__class__.__name__
307
307
  ),
308
308
  api_calls=[Session.call],
309
- custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
309
+ custom_tags={"autogen": True} if self._autogenerated else None,
310
310
  )
311
311
  pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
312
312
  pd_df.columns = dataset.columns
@@ -645,7 +645,14 @@ class KMeans(BaseTransformer):
645
645
  ) -> List[str]:
646
646
  # in case the inferred output column names dimension is different
647
647
  # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
648
- output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
648
+ sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
649
+
650
+ # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
651
+ # seen during the fit.
652
+ snowpark_column_names = dataset.select(self.input_cols).columns
653
+ sample_pd_df.columns = snowpark_column_names
654
+
655
+ output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
649
656
  output_df_columns = list(output_df_pd.columns)
650
657
  output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
651
658
  if self.sample_weight_col:
@@ -282,7 +282,7 @@ class MeanShift(BaseTransformer):
282
282
  inspect.currentframe(), MeanShift.__class__.__name__
283
283
  ),
284
284
  api_calls=[Session.call],
285
- custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
285
+ custom_tags={"autogen": True} if self._autogenerated else None,
286
286
  )
287
287
  pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
288
288
  pd_df.columns = dataset.columns
@@ -617,7 +617,14 @@ class MeanShift(BaseTransformer):
617
617
  ) -> List[str]:
618
618
  # in case the inferred output column names dimension is different
619
619
  # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
620
- output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
620
+ sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
621
+
622
+ # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
623
+ # seen during the fit.
624
+ snowpark_column_names = dataset.select(self.input_cols).columns
625
+ sample_pd_df.columns = snowpark_column_names
626
+
627
+ output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
621
628
  output_df_columns = list(output_df_pd.columns)
622
629
  output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
623
630
  if self.sample_weight_col:
@@ -332,7 +332,7 @@ class MiniBatchKMeans(BaseTransformer):
332
332
  inspect.currentframe(), MiniBatchKMeans.__class__.__name__
333
333
  ),
334
334
  api_calls=[Session.call],
335
- custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
335
+ custom_tags={"autogen": True} if self._autogenerated else None,
336
336
  )
337
337
  pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
338
338
  pd_df.columns = dataset.columns
@@ -671,7 +671,14 @@ class MiniBatchKMeans(BaseTransformer):
671
671
  ) -> List[str]:
672
672
  # in case the inferred output column names dimension is different
673
673
  # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
674
- output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
674
+ sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
675
+
676
+ # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
677
+ # seen during the fit.
678
+ snowpark_column_names = dataset.select(self.input_cols).columns
679
+ sample_pd_df.columns = snowpark_column_names
680
+
681
+ output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
675
682
  output_df_columns = list(output_df_pd.columns)
676
683
  output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
677
684
  if self.sample_weight_col:
@@ -352,7 +352,7 @@ class OPTICS(BaseTransformer):
352
352
  inspect.currentframe(), OPTICS.__class__.__name__
353
353
  ),
354
354
  api_calls=[Session.call],
355
- custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
355
+ custom_tags={"autogen": True} if self._autogenerated else None,
356
356
  )
357
357
  pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
358
358
  pd_df.columns = dataset.columns
@@ -685,7 +685,14 @@ class OPTICS(BaseTransformer):
685
685
  ) -> List[str]:
686
686
  # in case the inferred output column names dimension is different
687
687
  # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
688
- output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
688
+ sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
689
+
690
+ # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
691
+ # seen during the fit.
692
+ snowpark_column_names = dataset.select(self.input_cols).columns
693
+ sample_pd_df.columns = snowpark_column_names
694
+
695
+ output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
689
696
  output_df_columns = list(output_df_pd.columns)
690
697
  output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
691
698
  if self.sample_weight_col:
@@ -290,7 +290,7 @@ class SpectralBiclustering(BaseTransformer):
290
290
  inspect.currentframe(), SpectralBiclustering.__class__.__name__
291
291
  ),
292
292
  api_calls=[Session.call],
293
- custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
293
+ custom_tags={"autogen": True} if self._autogenerated else None,
294
294
  )
295
295
  pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
296
296
  pd_df.columns = dataset.columns
@@ -621,7 +621,14 @@ class SpectralBiclustering(BaseTransformer):
621
621
  ) -> List[str]:
622
622
  # in case the inferred output column names dimension is different
623
623
  # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
624
- output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
624
+ sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
625
+
626
+ # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
627
+ # seen during the fit.
628
+ snowpark_column_names = dataset.select(self.input_cols).columns
629
+ sample_pd_df.columns = snowpark_column_names
630
+
631
+ output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
625
632
  output_df_columns = list(output_df_pd.columns)
626
633
  output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
627
634
  if self.sample_weight_col:
@@ -348,7 +348,7 @@ class SpectralClustering(BaseTransformer):
348
348
  inspect.currentframe(), SpectralClustering.__class__.__name__
349
349
  ),
350
350
  api_calls=[Session.call],
351
- custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
351
+ custom_tags={"autogen": True} if self._autogenerated else None,
352
352
  )
353
353
  pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
354
354
  pd_df.columns = dataset.columns
@@ -681,7 +681,14 @@ class SpectralClustering(BaseTransformer):
681
681
  ) -> List[str]:
682
682
  # in case the inferred output column names dimension is different
683
683
  # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
684
- output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
684
+ sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
685
+
686
+ # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
687
+ # seen during the fit.
688
+ snowpark_column_names = dataset.select(self.input_cols).columns
689
+ sample_pd_df.columns = snowpark_column_names
690
+
691
+ output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
685
692
  output_df_columns = list(output_df_pd.columns)
686
693
  output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
687
694
  if self.sample_weight_col:
@@ -269,7 +269,7 @@ class SpectralCoclustering(BaseTransformer):
269
269
  inspect.currentframe(), SpectralCoclustering.__class__.__name__
270
270
  ),
271
271
  api_calls=[Session.call],
272
- custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
272
+ custom_tags={"autogen": True} if self._autogenerated else None,
273
273
  )
274
274
  pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
275
275
  pd_df.columns = dataset.columns
@@ -600,7 +600,14 @@ class SpectralCoclustering(BaseTransformer):
600
600
  ) -> List[str]:
601
601
  # in case the inferred output column names dimension is different
602
602
  # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
603
- output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
603
+ sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
604
+
605
+ # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
606
+ # seen during the fit.
607
+ snowpark_column_names = dataset.select(self.input_cols).columns
608
+ sample_pd_df.columns = snowpark_column_names
609
+
610
+ output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
604
611
  output_df_columns = list(output_df_pd.columns)
605
612
  output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
606
613
  if self.sample_weight_col:
@@ -299,7 +299,7 @@ class ColumnTransformer(BaseTransformer):
299
299
  inspect.currentframe(), ColumnTransformer.__class__.__name__
300
300
  ),
301
301
  api_calls=[Session.call],
302
- custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
302
+ custom_tags={"autogen": True} if self._autogenerated else None,
303
303
  )
304
304
  pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
305
305
  pd_df.columns = dataset.columns
@@ -634,7 +634,14 @@ class ColumnTransformer(BaseTransformer):
634
634
  ) -> List[str]:
635
635
  # in case the inferred output column names dimension is different
636
636
  # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
637
- output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
637
+ sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
638
+
639
+ # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
640
+ # seen during the fit.
641
+ snowpark_column_names = dataset.select(self.input_cols).columns
642
+ sample_pd_df.columns = snowpark_column_names
643
+
644
+ output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
638
645
  output_df_columns = list(output_df_pd.columns)
639
646
  output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
640
647
  if self.sample_weight_col:
@@ -260,7 +260,7 @@ class TransformedTargetRegressor(BaseTransformer):
260
260
  inspect.currentframe(), TransformedTargetRegressor.__class__.__name__
261
261
  ),
262
262
  api_calls=[Session.call],
263
- custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
263
+ custom_tags={"autogen": True} if self._autogenerated else None,
264
264
  )
265
265
  pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
266
266
  pd_df.columns = dataset.columns
@@ -593,7 +593,14 @@ class TransformedTargetRegressor(BaseTransformer):
593
593
  ) -> List[str]:
594
594
  # in case the inferred output column names dimension is different
595
595
  # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
596
- output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
596
+ sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
597
+
598
+ # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
599
+ # seen during the fit.
600
+ snowpark_column_names = dataset.select(self.input_cols).columns
601
+ sample_pd_df.columns = snowpark_column_names
602
+
603
+ output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
597
604
  output_df_columns = list(output_df_pd.columns)
598
605
  output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
599
606
  if self.sample_weight_col:
@@ -255,7 +255,7 @@ class EllipticEnvelope(BaseTransformer):
255
255
  inspect.currentframe(), EllipticEnvelope.__class__.__name__
256
256
  ),
257
257
  api_calls=[Session.call],
258
- custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
258
+ custom_tags={"autogen": True} if self._autogenerated else None,
259
259
  )
260
260
  pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
261
261
  pd_df.columns = dataset.columns
@@ -590,7 +590,14 @@ class EllipticEnvelope(BaseTransformer):
590
590
  ) -> List[str]:
591
591
  # in case the inferred output column names dimension is different
592
592
  # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
593
- output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
593
+ sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
594
+
595
+ # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
596
+ # seen during the fit.
597
+ snowpark_column_names = dataset.select(self.input_cols).columns
598
+ sample_pd_df.columns = snowpark_column_names
599
+
600
+ output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
594
601
  output_df_columns = list(output_df_pd.columns)
595
602
  output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
596
603
  if self.sample_weight_col:
@@ -231,7 +231,7 @@ class EmpiricalCovariance(BaseTransformer):
231
231
  inspect.currentframe(), EmpiricalCovariance.__class__.__name__
232
232
  ),
233
233
  api_calls=[Session.call],
234
- custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
234
+ custom_tags={"autogen": True} if self._autogenerated else None,
235
235
  )
236
236
  pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
237
237
  pd_df.columns = dataset.columns
@@ -562,7 +562,14 @@ class EmpiricalCovariance(BaseTransformer):
562
562
  ) -> List[str]:
563
563
  # in case the inferred output column names dimension is different
564
564
  # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
565
- output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
565
+ sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
566
+
567
+ # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
568
+ # seen during the fit.
569
+ snowpark_column_names = dataset.select(self.input_cols).columns
570
+ sample_pd_df.columns = snowpark_column_names
571
+
572
+ output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
566
573
  output_df_columns = list(output_df_pd.columns)
567
574
  output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
568
575
  if self.sample_weight_col:
@@ -279,7 +279,7 @@ class GraphicalLasso(BaseTransformer):
279
279
  inspect.currentframe(), GraphicalLasso.__class__.__name__
280
280
  ),
281
281
  api_calls=[Session.call],
282
- custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
282
+ custom_tags={"autogen": True} if self._autogenerated else None,
283
283
  )
284
284
  pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
285
285
  pd_df.columns = dataset.columns
@@ -610,7 +610,14 @@ class GraphicalLasso(BaseTransformer):
610
610
  ) -> List[str]:
611
611
  # in case the inferred output column names dimension is different
612
612
  # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
613
- output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
613
+ sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
614
+
615
+ # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
616
+ # seen during the fit.
617
+ snowpark_column_names = dataset.select(self.input_cols).columns
618
+ sample_pd_df.columns = snowpark_column_names
619
+
620
+ output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
614
621
  output_df_columns = list(output_df_pd.columns)
615
622
  output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
616
623
  if self.sample_weight_col:
@@ -305,7 +305,7 @@ class GraphicalLassoCV(BaseTransformer):
305
305
  inspect.currentframe(), GraphicalLassoCV.__class__.__name__
306
306
  ),
307
307
  api_calls=[Session.call],
308
- custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
308
+ custom_tags={"autogen": True} if self._autogenerated else None,
309
309
  )
310
310
  pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
311
311
  pd_df.columns = dataset.columns
@@ -636,7 +636,14 @@ class GraphicalLassoCV(BaseTransformer):
636
636
  ) -> List[str]:
637
637
  # in case the inferred output column names dimension is different
638
638
  # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
639
- output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
639
+ sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
640
+
641
+ # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
642
+ # seen during the fit.
643
+ snowpark_column_names = dataset.select(self.input_cols).columns
644
+ sample_pd_df.columns = snowpark_column_names
645
+
646
+ output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
640
647
  output_df_columns = list(output_df_pd.columns)
641
648
  output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
642
649
  if self.sample_weight_col:
@@ -238,7 +238,7 @@ class LedoitWolf(BaseTransformer):
238
238
  inspect.currentframe(), LedoitWolf.__class__.__name__
239
239
  ),
240
240
  api_calls=[Session.call],
241
- custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
241
+ custom_tags={"autogen": True} if self._autogenerated else None,
242
242
  )
243
243
  pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
244
244
  pd_df.columns = dataset.columns
@@ -569,7 +569,14 @@ class LedoitWolf(BaseTransformer):
569
569
  ) -> List[str]:
570
570
  # in case the inferred output column names dimension is different
571
571
  # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
572
- output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
572
+ sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
573
+
574
+ # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
575
+ # seen during the fit.
576
+ snowpark_column_names = dataset.select(self.input_cols).columns
577
+ sample_pd_df.columns = snowpark_column_names
578
+
579
+ output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
573
580
  output_df_columns = list(output_df_pd.columns)
574
581
  output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
575
582
  if self.sample_weight_col:
@@ -250,7 +250,7 @@ class MinCovDet(BaseTransformer):
250
250
  inspect.currentframe(), MinCovDet.__class__.__name__
251
251
  ),
252
252
  api_calls=[Session.call],
253
- custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
253
+ custom_tags={"autogen": True} if self._autogenerated else None,
254
254
  )
255
255
  pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
256
256
  pd_df.columns = dataset.columns
@@ -581,7 +581,14 @@ class MinCovDet(BaseTransformer):
581
581
  ) -> List[str]:
582
582
  # in case the inferred output column names dimension is different
583
583
  # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
584
- output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
584
+ sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
585
+
586
+ # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
587
+ # seen during the fit.
588
+ snowpark_column_names = dataset.select(self.input_cols).columns
589
+ sample_pd_df.columns = snowpark_column_names
590
+
591
+ output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
585
592
  output_df_columns = list(output_df_pd.columns)
586
593
  output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
587
594
  if self.sample_weight_col:
@@ -231,7 +231,7 @@ class OAS(BaseTransformer):
231
231
  inspect.currentframe(), OAS.__class__.__name__
232
232
  ),
233
233
  api_calls=[Session.call],
234
- custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
234
+ custom_tags={"autogen": True} if self._autogenerated else None,
235
235
  )
236
236
  pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
237
237
  pd_df.columns = dataset.columns
@@ -562,7 +562,14 @@ class OAS(BaseTransformer):
562
562
  ) -> List[str]:
563
563
  # in case the inferred output column names dimension is different
564
564
  # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
565
- output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
565
+ sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
566
+
567
+ # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
568
+ # seen during the fit.
569
+ snowpark_column_names = dataset.select(self.input_cols).columns
570
+ sample_pd_df.columns = snowpark_column_names
571
+
572
+ output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
566
573
  output_df_columns = list(output_df_pd.columns)
567
574
  output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
568
575
  if self.sample_weight_col:
@@ -237,7 +237,7 @@ class ShrunkCovariance(BaseTransformer):
237
237
  inspect.currentframe(), ShrunkCovariance.__class__.__name__
238
238
  ),
239
239
  api_calls=[Session.call],
240
- custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
240
+ custom_tags={"autogen": True} if self._autogenerated else None,
241
241
  )
242
242
  pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
243
243
  pd_df.columns = dataset.columns
@@ -568,7 +568,14 @@ class ShrunkCovariance(BaseTransformer):
568
568
  ) -> List[str]:
569
569
  # in case the inferred output column names dimension is different
570
570
  # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
571
- output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
571
+ sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
572
+
573
+ # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
574
+ # seen during the fit.
575
+ snowpark_column_names = dataset.select(self.input_cols).columns
576
+ sample_pd_df.columns = snowpark_column_names
577
+
578
+ output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
572
579
  output_df_columns = list(output_df_pd.columns)
573
580
  output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
574
581
  if self.sample_weight_col:
@@ -343,7 +343,7 @@ class DictionaryLearning(BaseTransformer):
343
343
  inspect.currentframe(), DictionaryLearning.__class__.__name__
344
344
  ),
345
345
  api_calls=[Session.call],
346
- custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
346
+ custom_tags={"autogen": True} if self._autogenerated else None,
347
347
  )
348
348
  pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
349
349
  pd_df.columns = dataset.columns
@@ -678,7 +678,14 @@ class DictionaryLearning(BaseTransformer):
678
678
  ) -> List[str]:
679
679
  # in case the inferred output column names dimension is different
680
680
  # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
681
- output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
681
+ sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
682
+
683
+ # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
684
+ # seen during the fit.
685
+ snowpark_column_names = dataset.select(self.input_cols).columns
686
+ sample_pd_df.columns = snowpark_column_names
687
+
688
+ output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
682
689
  output_df_columns = list(output_df_pd.columns)
683
690
  output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
684
691
  if self.sample_weight_col:
@@ -280,7 +280,7 @@ class FactorAnalysis(BaseTransformer):
280
280
  inspect.currentframe(), FactorAnalysis.__class__.__name__
281
281
  ),
282
282
  api_calls=[Session.call],
283
- custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
283
+ custom_tags={"autogen": True} if self._autogenerated else None,
284
284
  )
285
285
  pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
286
286
  pd_df.columns = dataset.columns
@@ -615,7 +615,14 @@ class FactorAnalysis(BaseTransformer):
615
615
  ) -> List[str]:
616
616
  # in case the inferred output column names dimension is different
617
617
  # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
618
- output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
618
+ sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
619
+
620
+ # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
621
+ # seen during the fit.
622
+ snowpark_column_names = dataset.select(self.input_cols).columns
623
+ sample_pd_df.columns = snowpark_column_names
624
+
625
+ output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
619
626
  output_df_columns = list(output_df_pd.columns)
620
627
  output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
621
628
  if self.sample_weight_col:
@@ -298,7 +298,7 @@ class FastICA(BaseTransformer):
298
298
  inspect.currentframe(), FastICA.__class__.__name__
299
299
  ),
300
300
  api_calls=[Session.call],
301
- custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
301
+ custom_tags={"autogen": True} if self._autogenerated else None,
302
302
  )
303
303
  pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
304
304
  pd_df.columns = dataset.columns
@@ -633,7 +633,14 @@ class FastICA(BaseTransformer):
633
633
  ) -> List[str]:
634
634
  # in case the inferred output column names dimension is different
635
635
  # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
636
- output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
636
+ sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
637
+
638
+ # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
639
+ # seen during the fit.
640
+ snowpark_column_names = dataset.select(self.input_cols).columns
641
+ sample_pd_df.columns = snowpark_column_names
642
+
643
+ output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
637
644
  output_df_columns = list(output_df_pd.columns)
638
645
  output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
639
646
  if self.sample_weight_col: