snowflake-ml-python 1.5.0__py3-none-any.whl → 1.5.2__py3-none-any.whl

This diff compares the contents of two publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.
Files changed (197)
  1. snowflake/cortex/_sentiment.py +7 -4
  2. snowflake/ml/_internal/env_utils.py +6 -0
  3. snowflake/ml/_internal/lineage/lineage_utils.py +95 -0
  4. snowflake/ml/_internal/telemetry.py +1 -0
  5. snowflake/ml/_internal/utils/identifier.py +1 -1
  6. snowflake/ml/_internal/utils/sql_identifier.py +14 -1
  7. snowflake/ml/_internal/utils/temp_file_utils.py +5 -2
  8. snowflake/ml/dataset/__init__.py +2 -1
  9. snowflake/ml/dataset/dataset.py +4 -3
  10. snowflake/ml/dataset/dataset_reader.py +5 -8
  11. snowflake/ml/feature_store/__init__.py +6 -0
  12. snowflake/ml/feature_store/access_manager.py +283 -0
  13. snowflake/ml/feature_store/feature_store.py +160 -100
  14. snowflake/ml/feature_store/feature_view.py +30 -19
  15. snowflake/ml/fileset/embedded_stage_fs.py +15 -12
  16. snowflake/ml/fileset/snowfs.py +2 -30
  17. snowflake/ml/fileset/stage_fs.py +25 -7
  18. snowflake/ml/model/_client/model/model_impl.py +46 -39
  19. snowflake/ml/model/_client/model/model_version_impl.py +24 -2
  20. snowflake/ml/model/_client/ops/metadata_ops.py +27 -4
  21. snowflake/ml/model/_client/ops/model_ops.py +174 -16
  22. snowflake/ml/model/_client/sql/_base.py +34 -0
  23. snowflake/ml/model/_client/sql/model.py +32 -39
  24. snowflake/ml/model/_client/sql/model_version.py +111 -42
  25. snowflake/ml/model/_client/sql/stage.py +6 -32
  26. snowflake/ml/model/_client/sql/tag.py +32 -56
  27. snowflake/ml/model/_model_composer/model_composer.py +8 -4
  28. snowflake/ml/model/_packager/model_handlers/mlflow.py +2 -1
  29. snowflake/ml/model/_packager/model_meta/model_meta.py +1 -3
  30. snowflake/ml/model/_packager/model_runtime/model_runtime.py +3 -27
  31. snowflake/ml/modeling/_internal/snowpark_implementations/distributed_hpo_trainer.py +90 -142
  32. snowflake/ml/modeling/_internal/snowpark_implementations/distributed_search_udf_file.py +159 -0
  33. snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_trainer.py +81 -3
  34. snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +8 -1
  35. snowflake/ml/modeling/cluster/affinity_propagation.py +8 -1
  36. snowflake/ml/modeling/cluster/agglomerative_clustering.py +8 -1
  37. snowflake/ml/modeling/cluster/birch.py +8 -1
  38. snowflake/ml/modeling/cluster/bisecting_k_means.py +8 -1
  39. snowflake/ml/modeling/cluster/dbscan.py +8 -1
  40. snowflake/ml/modeling/cluster/feature_agglomeration.py +8 -1
  41. snowflake/ml/modeling/cluster/k_means.py +8 -1
  42. snowflake/ml/modeling/cluster/mean_shift.py +8 -1
  43. snowflake/ml/modeling/cluster/mini_batch_k_means.py +8 -1
  44. snowflake/ml/modeling/cluster/optics.py +8 -1
  45. snowflake/ml/modeling/cluster/spectral_biclustering.py +8 -1
  46. snowflake/ml/modeling/cluster/spectral_clustering.py +8 -1
  47. snowflake/ml/modeling/cluster/spectral_coclustering.py +8 -1
  48. snowflake/ml/modeling/compose/column_transformer.py +8 -1
  49. snowflake/ml/modeling/compose/transformed_target_regressor.py +8 -1
  50. snowflake/ml/modeling/covariance/elliptic_envelope.py +8 -1
  51. snowflake/ml/modeling/covariance/empirical_covariance.py +8 -1
  52. snowflake/ml/modeling/covariance/graphical_lasso.py +8 -1
  53. snowflake/ml/modeling/covariance/graphical_lasso_cv.py +8 -1
  54. snowflake/ml/modeling/covariance/ledoit_wolf.py +8 -1
  55. snowflake/ml/modeling/covariance/min_cov_det.py +8 -1
  56. snowflake/ml/modeling/covariance/oas.py +8 -1
  57. snowflake/ml/modeling/covariance/shrunk_covariance.py +8 -1
  58. snowflake/ml/modeling/decomposition/dictionary_learning.py +8 -1
  59. snowflake/ml/modeling/decomposition/factor_analysis.py +8 -1
  60. snowflake/ml/modeling/decomposition/fast_ica.py +8 -1
  61. snowflake/ml/modeling/decomposition/incremental_pca.py +8 -1
  62. snowflake/ml/modeling/decomposition/kernel_pca.py +8 -1
  63. snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +8 -1
  64. snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +8 -1
  65. snowflake/ml/modeling/decomposition/pca.py +8 -1
  66. snowflake/ml/modeling/decomposition/sparse_pca.py +8 -1
  67. snowflake/ml/modeling/decomposition/truncated_svd.py +8 -1
  68. snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +8 -1
  69. snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +8 -1
  70. snowflake/ml/modeling/ensemble/ada_boost_classifier.py +8 -1
  71. snowflake/ml/modeling/ensemble/ada_boost_regressor.py +8 -1
  72. snowflake/ml/modeling/ensemble/bagging_classifier.py +8 -1
  73. snowflake/ml/modeling/ensemble/bagging_regressor.py +8 -1
  74. snowflake/ml/modeling/ensemble/extra_trees_classifier.py +8 -1
  75. snowflake/ml/modeling/ensemble/extra_trees_regressor.py +8 -1
  76. snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +8 -1
  77. snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +8 -1
  78. snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +8 -1
  79. snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +8 -1
  80. snowflake/ml/modeling/ensemble/isolation_forest.py +8 -1
  81. snowflake/ml/modeling/ensemble/random_forest_classifier.py +8 -1
  82. snowflake/ml/modeling/ensemble/random_forest_regressor.py +8 -1
  83. snowflake/ml/modeling/ensemble/stacking_regressor.py +8 -1
  84. snowflake/ml/modeling/ensemble/voting_classifier.py +8 -1
  85. snowflake/ml/modeling/ensemble/voting_regressor.py +8 -1
  86. snowflake/ml/modeling/feature_selection/generic_univariate_select.py +8 -1
  87. snowflake/ml/modeling/feature_selection/select_fdr.py +8 -1
  88. snowflake/ml/modeling/feature_selection/select_fpr.py +8 -1
  89. snowflake/ml/modeling/feature_selection/select_fwe.py +8 -1
  90. snowflake/ml/modeling/feature_selection/select_k_best.py +8 -1
  91. snowflake/ml/modeling/feature_selection/select_percentile.py +8 -1
  92. snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +8 -1
  93. snowflake/ml/modeling/feature_selection/variance_threshold.py +8 -1
  94. snowflake/ml/modeling/framework/base.py +4 -3
  95. snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +8 -1
  96. snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +8 -1
  97. snowflake/ml/modeling/impute/iterative_imputer.py +8 -1
  98. snowflake/ml/modeling/impute/knn_imputer.py +8 -1
  99. snowflake/ml/modeling/impute/missing_indicator.py +8 -1
  100. snowflake/ml/modeling/impute/simple_imputer.py +21 -2
  101. snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +8 -1
  102. snowflake/ml/modeling/kernel_approximation/nystroem.py +8 -1
  103. snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +8 -1
  104. snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +8 -1
  105. snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +8 -1
  106. snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +8 -1
  107. snowflake/ml/modeling/lightgbm/lgbm_classifier.py +8 -1
  108. snowflake/ml/modeling/lightgbm/lgbm_regressor.py +8 -1
  109. snowflake/ml/modeling/linear_model/ard_regression.py +8 -1
  110. snowflake/ml/modeling/linear_model/bayesian_ridge.py +8 -1
  111. snowflake/ml/modeling/linear_model/elastic_net.py +8 -1
  112. snowflake/ml/modeling/linear_model/elastic_net_cv.py +8 -1
  113. snowflake/ml/modeling/linear_model/gamma_regressor.py +8 -1
  114. snowflake/ml/modeling/linear_model/huber_regressor.py +8 -1
  115. snowflake/ml/modeling/linear_model/lars.py +8 -1
  116. snowflake/ml/modeling/linear_model/lars_cv.py +8 -1
  117. snowflake/ml/modeling/linear_model/lasso.py +8 -1
  118. snowflake/ml/modeling/linear_model/lasso_cv.py +8 -1
  119. snowflake/ml/modeling/linear_model/lasso_lars.py +8 -1
  120. snowflake/ml/modeling/linear_model/lasso_lars_cv.py +8 -1
  121. snowflake/ml/modeling/linear_model/lasso_lars_ic.py +8 -1
  122. snowflake/ml/modeling/linear_model/linear_regression.py +8 -1
  123. snowflake/ml/modeling/linear_model/logistic_regression.py +8 -1
  124. snowflake/ml/modeling/linear_model/logistic_regression_cv.py +8 -1
  125. snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +8 -1
  126. snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +8 -1
  127. snowflake/ml/modeling/linear_model/multi_task_lasso.py +8 -1
  128. snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +8 -1
  129. snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +8 -1
  130. snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +8 -1
  131. snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +8 -1
  132. snowflake/ml/modeling/linear_model/perceptron.py +8 -1
  133. snowflake/ml/modeling/linear_model/poisson_regressor.py +8 -1
  134. snowflake/ml/modeling/linear_model/ransac_regressor.py +8 -1
  135. snowflake/ml/modeling/linear_model/ridge.py +8 -1
  136. snowflake/ml/modeling/linear_model/ridge_classifier.py +8 -1
  137. snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +8 -1
  138. snowflake/ml/modeling/linear_model/ridge_cv.py +8 -1
  139. snowflake/ml/modeling/linear_model/sgd_classifier.py +8 -1
  140. snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +8 -1
  141. snowflake/ml/modeling/linear_model/sgd_regressor.py +8 -1
  142. snowflake/ml/modeling/linear_model/theil_sen_regressor.py +8 -1
  143. snowflake/ml/modeling/linear_model/tweedie_regressor.py +8 -1
  144. snowflake/ml/modeling/manifold/isomap.py +8 -1
  145. snowflake/ml/modeling/manifold/mds.py +8 -1
  146. snowflake/ml/modeling/manifold/spectral_embedding.py +8 -1
  147. snowflake/ml/modeling/manifold/tsne.py +8 -1
  148. snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +8 -1
  149. snowflake/ml/modeling/mixture/gaussian_mixture.py +8 -1
  150. snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +8 -1
  151. snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +8 -1
  152. snowflake/ml/modeling/multiclass/output_code_classifier.py +8 -1
  153. snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +8 -1
  154. snowflake/ml/modeling/naive_bayes/categorical_nb.py +8 -1
  155. snowflake/ml/modeling/naive_bayes/complement_nb.py +8 -1
  156. snowflake/ml/modeling/naive_bayes/gaussian_nb.py +8 -1
  157. snowflake/ml/modeling/naive_bayes/multinomial_nb.py +8 -1
  158. snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +8 -1
  159. snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +8 -1
  160. snowflake/ml/modeling/neighbors/kernel_density.py +8 -1
  161. snowflake/ml/modeling/neighbors/local_outlier_factor.py +8 -1
  162. snowflake/ml/modeling/neighbors/nearest_centroid.py +8 -1
  163. snowflake/ml/modeling/neighbors/nearest_neighbors.py +8 -1
  164. snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +8 -1
  165. snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +8 -1
  166. snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +8 -1
  167. snowflake/ml/modeling/neural_network/bernoulli_rbm.py +8 -1
  168. snowflake/ml/modeling/neural_network/mlp_classifier.py +8 -1
  169. snowflake/ml/modeling/neural_network/mlp_regressor.py +8 -1
  170. snowflake/ml/modeling/parameters/enable_anonymous_sproc.py +5 -0
  171. snowflake/ml/modeling/pipeline/pipeline.py +27 -7
  172. snowflake/ml/modeling/preprocessing/polynomial_features.py +8 -1
  173. snowflake/ml/modeling/semi_supervised/label_propagation.py +8 -1
  174. snowflake/ml/modeling/semi_supervised/label_spreading.py +8 -1
  175. snowflake/ml/modeling/svm/linear_svc.py +8 -1
  176. snowflake/ml/modeling/svm/linear_svr.py +8 -1
  177. snowflake/ml/modeling/svm/nu_svc.py +8 -1
  178. snowflake/ml/modeling/svm/nu_svr.py +8 -1
  179. snowflake/ml/modeling/svm/svc.py +8 -1
  180. snowflake/ml/modeling/svm/svr.py +8 -1
  181. snowflake/ml/modeling/tree/decision_tree_classifier.py +8 -1
  182. snowflake/ml/modeling/tree/decision_tree_regressor.py +8 -1
  183. snowflake/ml/modeling/tree/extra_tree_classifier.py +8 -1
  184. snowflake/ml/modeling/tree/extra_tree_regressor.py +8 -1
  185. snowflake/ml/modeling/xgboost/xgb_classifier.py +8 -1
  186. snowflake/ml/modeling/xgboost/xgb_regressor.py +8 -1
  187. snowflake/ml/modeling/xgboost/xgbrf_classifier.py +8 -1
  188. snowflake/ml/modeling/xgboost/xgbrf_regressor.py +8 -1
  189. snowflake/ml/registry/_manager/model_manager.py +95 -8
  190. snowflake/ml/registry/registry.py +10 -1
  191. snowflake/ml/version.py +1 -1
  192. {snowflake_ml_python-1.5.0.dist-info → snowflake_ml_python-1.5.2.dist-info}/METADATA +66 -10
  193. {snowflake_ml_python-1.5.0.dist-info → snowflake_ml_python-1.5.2.dist-info}/RECORD +196 -192
  194. snowflake/ml/_internal/lineage/dataset_dataframe.py +0 -44
  195. {snowflake_ml_python-1.5.0.dist-info → snowflake_ml_python-1.5.2.dist-info}/LICENSE.txt +0 -0
  196. {snowflake_ml_python-1.5.0.dist-info → snowflake_ml_python-1.5.2.dist-info}/WHEEL +0 -0
  197. {snowflake_ml_python-1.5.0.dist-info → snowflake_ml_python-1.5.2.dist-info}/top_level.txt +0 -0
@@ -638,7 +638,14 @@ class BaggingClassifier(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-        output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
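The hunk above is the same seven-line fix, repeated verbatim (modulo line numbers) in every generated estimator below: Snowpark's to_pandas() can label columns differently from the quoted Snowflake identifiers the estimator saw at fit time, so the one-row sample frame used for output-column inference is now restricted to self.input_cols and its labels are rewritten positionally. A minimal pandas-only sketch of the idea, with hypothetical column names:

import pandas as pd

# Hypothetical quoted Snowflake identifiers seen at fit time.
fit_time_columns = ['"feature_a"', '"feature_b"']

# What a to_pandas() call might plausibly return for the same two columns.
sample_pd_df = pd.DataFrame([[1.0, 2.0]], columns=["FEATURE_A", "FEATURE_B"])

# The fix mirrored in the hunk: overwrite the labels positionally so they
# match the fit-time identifiers (and their order).
sample_pd_df.columns = fit_time_columns
assert list(sample_pd_df.columns) == fit_time_columns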
@@ -638,7 +638,14 @@ class BaggingRegressor(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-        output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
@@ -741,7 +741,14 @@ class ExtraTreesClassifier(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-        output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
@@ -720,7 +720,14 @@ class ExtraTreesRegressor(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-        output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
@@ -753,7 +753,14 @@ class GradientBoostingClassifier(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-        output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
@@ -762,7 +762,14 @@ class GradientBoostingRegressor(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
        # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-        output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
@@ -734,7 +734,14 @@ class HistGradientBoostingClassifier(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-        output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
@@ -725,7 +725,14 @@ class HistGradientBoostingRegressor(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-        output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
@@ -627,7 +627,14 @@ class IsolationForest(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-        output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
@@ -737,7 +737,14 @@ class RandomForestClassifier(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-        output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
@@ -716,7 +716,14 @@ class RandomForestRegressor(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-        output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
@@ -621,7 +621,14 @@ class StackingRegressor(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-        output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
@@ -603,7 +603,14 @@ class VotingClassifier(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-        output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
@@ -585,7 +585,14 @@ class VotingRegressor(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-        output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
@@ -573,7 +573,14 @@ class GenericUnivariateSelect(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-        output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
@@ -569,7 +569,14 @@ class SelectFdr(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-        output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
@@ -569,7 +569,14 @@ class SelectFpr(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-        output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
@@ -569,7 +569,14 @@ class SelectFwe(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-        output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
@@ -570,7 +570,14 @@ class SelectKBest(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-        output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
@@ -569,7 +569,14 @@ class SelectPercentile(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-        output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
@@ -627,7 +627,14 @@ class SequentialFeatureSelector(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-        output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
@@ -560,7 +560,14 @@ class VarianceThreshold(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-        output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
@@ -16,7 +16,7 @@ from snowflake.ml._internal.exceptions import (
     exceptions,
     modeling_error_messages,
 )
-from snowflake.ml._internal.lineage import data_source, dataset_dataframe
+from snowflake.ml._internal.lineage import data_source, lineage_utils
 from snowflake.ml._internal.utils import identifier, parallelize
 from snowflake.ml.modeling.framework import _utils
 from snowflake.snowpark import functions as F
@@ -430,8 +430,9 @@ class BaseEstimator(Base):
     )
     def fit(self, dataset: Union[snowpark.DataFrame, pd.DataFrame]) -> "BaseEstimator":
        """Runs universal logics for all fit implementations."""
-        if isinstance(dataset, dataset_dataframe.DatasetDataFrame):
-            self._data_sources = dataset._get_sources()
+        self._data_sources = getattr(dataset, lineage_utils.DATA_SOURCES_ATTR, None)
+        if self._data_sources:
+            assert all(isinstance(ds, data_source.DataSource) for ds in self._data_sources)
         return self._fit(dataset)

     @abstractmethod
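The base-estimator change above replaces an isinstance check against the removed DatasetDataFrame wrapper with duck typing: any DataFrame that carries the lineage attribute named by lineage_utils.DATA_SOURCES_ATTR now contributes data sources to the fitted estimator. A minimal self-contained sketch of that pattern, with placeholder names standing in for the real attribute constant and DataSource type:

from dataclasses import dataclass
from typing import Any, List, Optional

@dataclass
class DataSource:
    # Stand-in for snowflake.ml._internal.lineage.data_source.DataSource.
    fully_qualified_name: str

# Placeholder for the real lineage_utils.DATA_SOURCES_ATTR constant.
DATA_SOURCES_ATTR = "_data_sources"

def capture_sources(dataset: Any) -> Optional[List[DataSource]]:
    # Duck typing: works on any object carrying the attribute, so no
    # dedicated DataFrame subclass is required.
    sources = getattr(dataset, DATA_SOURCES_ATTR, None)
    if sources:
        assert all(isinstance(ds, DataSource) for ds in sources)
    return sources

class Frame:
    # Any dataframe-like object can carry lineage under this scheme.
    pass

df = Frame()
setattr(df, DATA_SOURCES_ATTR, [DataSource("DB.SCHEMA.MY_DATASET")])
print(capture_sources(df))  # [DataSource(fully_qualified_name='DB.SCHEMA.MY_DATASET')]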
@@ -653,7 +653,14 @@ class GaussianProcessClassifier(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-        output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
@@ -644,7 +644,14 @@ class GaussianProcessRegressor(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-        output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
@@ -688,7 +688,14 @@ class IterativeImputer(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-        output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
@@ -614,7 +614,14 @@ class KNNImputer(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-        output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
@@ -588,7 +588,14 @@ class MissingIndicator(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-        output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
@@ -158,6 +158,7 @@ class SimpleImputer(base.BaseTransformer):

         self.fill_value = fill_value
         self.missing_values = missing_values
+        self.statistics_: Dict[str, Any] = {}
         # TODO(hayu): [SNOW-752265] Support SimpleImputer keep_empty_features.
         # Add back when `keep_empty_features` is supported.
         # self.keep_empty_features = keep_empty_features
@@ -229,8 +230,27 @@ class SimpleImputer(base.BaseTransformer):

         return input_col_datatypes

+    def fit(self, dataset: Union[snowpark.DataFrame, pd.DataFrame]) -> "SimpleImputer":
+        if isinstance(dataset, snowpark.DataFrame):
+            return self._fit_snowpark(dataset)
+        else:
+            return self._fit_sklearn(dataset)
+
+    def _fit_sklearn(self, dataset: pd.DataFrame) -> "SimpleImputer":
+        dataset = self._use_input_cols_only(dataset)
+        sklearn_simple_imputer = self._create_sklearn_object()
+        sklearn_simple_imputer = sklearn_simple_imputer.fit(dataset)
+        self._sklearn_object = sklearn_simple_imputer
+        for input_col, fill_value in zip(self.input_cols, sklearn_simple_imputer.statistics_.tolist()):
+            self.statistics_[input_col] = fill_value
+        self._sklearn_fit_dtype = sklearn_simple_imputer._fit_dtype
+        self.n_features_in_ = len(self.input_cols)
+        self.feature_names_in_ = self.input_cols
+        self._is_fitted = True
+        return self
+
     @telemetry.send_api_usage_telemetry(project=base.PROJECT, subproject=_SUBPROJECT)
-    def fit(self, dataset: snowpark.DataFrame) -> "SimpleImputer":
+    def _fit_snowpark(self, dataset: snowpark.DataFrame) -> "SimpleImputer":
        """
        Compute values to impute for the dataset according to the strategy.

@@ -245,7 +265,6 @@ class SimpleImputer(base.BaseTransformer):
         # In order to fit, the input columns should have the same type.
         input_col_datatypes = self._get_dataset_input_col_datatypes(dataset)

-        self.statistics_: Dict[str, Any] = {}
         statement_params = telemetry.get_statement_params(base.PROJECT, _SUBPROJECT, self.__class__.__name__)

         if self.strategy == "constant":
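Taken together, the three SimpleImputer hunks split fit into a type dispatch: Snowpark DataFrames keep the existing SQL-based path (renamed _fit_snowpark), while pandas DataFrames are delegated to scikit-learn, with statistics_ now initialized in __init__ so both paths populate the same mapping. A minimal sketch of the pandas path using only scikit-learn, with hypothetical column names:

import pandas as pd
from sklearn.impute import SimpleImputer

input_cols = ["AGE", "INCOME"]
df = pd.DataFrame({"AGE": [30.0, None, 50.0], "INCOME": [1000.0, 2000.0, None]})

# Fit sklearn's imputer, then key its per-column statistics_ array by input
# column, as _fit_sklearn does above.
imputer = SimpleImputer(strategy="mean").fit(df[input_cols])
statistics_ = dict(zip(input_cols, imputer.statistics_.tolist()))
print(statistics_)  # {'AGE': 40.0, 'INCOME': 1500.0}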
@@ -563,7 +563,14 @@ class AdditiveChi2Sampler(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-        output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
@@ -611,7 +611,14 @@ class Nystroem(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-        output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
@@ -587,7 +587,14 @@ class PolynomialCountSketch(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-        output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
@@ -574,7 +574,14 @@ class RBFSampler(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-        output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
@@ -572,7 +572,14 @@ class SkewedChi2Sampler(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-        output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col: