snowflake-ml-python 1.5.1__py3-none-any.whl → 1.5.2__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (174)
  1. snowflake/cortex/_sentiment.py +7 -4
  2. snowflake/ml/_internal/utils/temp_file_utils.py +5 -2
  3. snowflake/ml/feature_store/access_manager.py +34 -30
  4. snowflake/ml/feature_store/feature_store.py +1 -1
  5. snowflake/ml/feature_store/feature_view.py +12 -11
  6. snowflake/ml/fileset/snowfs.py +2 -31
  7. snowflake/ml/model/_client/ops/model_ops.py +43 -0
  8. snowflake/ml/model/_client/sql/model_version.py +53 -1
  9. snowflake/ml/model/_model_composer/model_composer.py +6 -2
  10. snowflake/ml/model/_packager/model_meta/model_meta.py +1 -3
  11. snowflake/ml/model/_packager/model_runtime/model_runtime.py +3 -27
  12. snowflake/ml/modeling/_internal/snowpark_implementations/distributed_hpo_trainer.py +58 -139
  13. snowflake/ml/modeling/_internal/snowpark_implementations/distributed_search_udf_file.py +159 -0
  14. snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +8 -1
  15. snowflake/ml/modeling/cluster/affinity_propagation.py +8 -1
  16. snowflake/ml/modeling/cluster/agglomerative_clustering.py +8 -1
  17. snowflake/ml/modeling/cluster/birch.py +8 -1
  18. snowflake/ml/modeling/cluster/bisecting_k_means.py +8 -1
  19. snowflake/ml/modeling/cluster/dbscan.py +8 -1
  20. snowflake/ml/modeling/cluster/feature_agglomeration.py +8 -1
  21. snowflake/ml/modeling/cluster/k_means.py +8 -1
  22. snowflake/ml/modeling/cluster/mean_shift.py +8 -1
  23. snowflake/ml/modeling/cluster/mini_batch_k_means.py +8 -1
  24. snowflake/ml/modeling/cluster/optics.py +8 -1
  25. snowflake/ml/modeling/cluster/spectral_biclustering.py +8 -1
  26. snowflake/ml/modeling/cluster/spectral_clustering.py +8 -1
  27. snowflake/ml/modeling/cluster/spectral_coclustering.py +8 -1
  28. snowflake/ml/modeling/compose/column_transformer.py +8 -1
  29. snowflake/ml/modeling/compose/transformed_target_regressor.py +8 -1
  30. snowflake/ml/modeling/covariance/elliptic_envelope.py +8 -1
  31. snowflake/ml/modeling/covariance/empirical_covariance.py +8 -1
  32. snowflake/ml/modeling/covariance/graphical_lasso.py +8 -1
  33. snowflake/ml/modeling/covariance/graphical_lasso_cv.py +8 -1
  34. snowflake/ml/modeling/covariance/ledoit_wolf.py +8 -1
  35. snowflake/ml/modeling/covariance/min_cov_det.py +8 -1
  36. snowflake/ml/modeling/covariance/oas.py +8 -1
  37. snowflake/ml/modeling/covariance/shrunk_covariance.py +8 -1
  38. snowflake/ml/modeling/decomposition/dictionary_learning.py +8 -1
  39. snowflake/ml/modeling/decomposition/factor_analysis.py +8 -1
  40. snowflake/ml/modeling/decomposition/fast_ica.py +8 -1
  41. snowflake/ml/modeling/decomposition/incremental_pca.py +8 -1
  42. snowflake/ml/modeling/decomposition/kernel_pca.py +8 -1
  43. snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +8 -1
  44. snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +8 -1
  45. snowflake/ml/modeling/decomposition/pca.py +8 -1
  46. snowflake/ml/modeling/decomposition/sparse_pca.py +8 -1
  47. snowflake/ml/modeling/decomposition/truncated_svd.py +8 -1
  48. snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +8 -1
  49. snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +8 -1
  50. snowflake/ml/modeling/ensemble/ada_boost_classifier.py +8 -1
  51. snowflake/ml/modeling/ensemble/ada_boost_regressor.py +8 -1
  52. snowflake/ml/modeling/ensemble/bagging_classifier.py +8 -1
  53. snowflake/ml/modeling/ensemble/bagging_regressor.py +8 -1
  54. snowflake/ml/modeling/ensemble/extra_trees_classifier.py +8 -1
  55. snowflake/ml/modeling/ensemble/extra_trees_regressor.py +8 -1
  56. snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +8 -1
  57. snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +8 -1
  58. snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +8 -1
  59. snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +8 -1
  60. snowflake/ml/modeling/ensemble/isolation_forest.py +8 -1
  61. snowflake/ml/modeling/ensemble/random_forest_classifier.py +8 -1
  62. snowflake/ml/modeling/ensemble/random_forest_regressor.py +8 -1
  63. snowflake/ml/modeling/ensemble/stacking_regressor.py +8 -1
  64. snowflake/ml/modeling/ensemble/voting_classifier.py +8 -1
  65. snowflake/ml/modeling/ensemble/voting_regressor.py +8 -1
  66. snowflake/ml/modeling/feature_selection/generic_univariate_select.py +8 -1
  67. snowflake/ml/modeling/feature_selection/select_fdr.py +8 -1
  68. snowflake/ml/modeling/feature_selection/select_fpr.py +8 -1
  69. snowflake/ml/modeling/feature_selection/select_fwe.py +8 -1
  70. snowflake/ml/modeling/feature_selection/select_k_best.py +8 -1
  71. snowflake/ml/modeling/feature_selection/select_percentile.py +8 -1
  72. snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +8 -1
  73. snowflake/ml/modeling/feature_selection/variance_threshold.py +8 -1
  74. snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +8 -1
  75. snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +8 -1
  76. snowflake/ml/modeling/impute/iterative_imputer.py +8 -1
  77. snowflake/ml/modeling/impute/knn_imputer.py +8 -1
  78. snowflake/ml/modeling/impute/missing_indicator.py +8 -1
  79. snowflake/ml/modeling/impute/simple_imputer.py +21 -2
  80. snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +8 -1
  81. snowflake/ml/modeling/kernel_approximation/nystroem.py +8 -1
  82. snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +8 -1
  83. snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +8 -1
  84. snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +8 -1
  85. snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +8 -1
  86. snowflake/ml/modeling/lightgbm/lgbm_classifier.py +8 -1
  87. snowflake/ml/modeling/lightgbm/lgbm_regressor.py +8 -1
  88. snowflake/ml/modeling/linear_model/ard_regression.py +8 -1
  89. snowflake/ml/modeling/linear_model/bayesian_ridge.py +8 -1
  90. snowflake/ml/modeling/linear_model/elastic_net.py +8 -1
  91. snowflake/ml/modeling/linear_model/elastic_net_cv.py +8 -1
  92. snowflake/ml/modeling/linear_model/gamma_regressor.py +8 -1
  93. snowflake/ml/modeling/linear_model/huber_regressor.py +8 -1
  94. snowflake/ml/modeling/linear_model/lars.py +8 -1
  95. snowflake/ml/modeling/linear_model/lars_cv.py +8 -1
  96. snowflake/ml/modeling/linear_model/lasso.py +8 -1
  97. snowflake/ml/modeling/linear_model/lasso_cv.py +8 -1
  98. snowflake/ml/modeling/linear_model/lasso_lars.py +8 -1
  99. snowflake/ml/modeling/linear_model/lasso_lars_cv.py +8 -1
  100. snowflake/ml/modeling/linear_model/lasso_lars_ic.py +8 -1
  101. snowflake/ml/modeling/linear_model/linear_regression.py +8 -1
  102. snowflake/ml/modeling/linear_model/logistic_regression.py +8 -1
  103. snowflake/ml/modeling/linear_model/logistic_regression_cv.py +8 -1
  104. snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +8 -1
  105. snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +8 -1
  106. snowflake/ml/modeling/linear_model/multi_task_lasso.py +8 -1
  107. snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +8 -1
  108. snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +8 -1
  109. snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +8 -1
  110. snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +8 -1
  111. snowflake/ml/modeling/linear_model/perceptron.py +8 -1
  112. snowflake/ml/modeling/linear_model/poisson_regressor.py +8 -1
  113. snowflake/ml/modeling/linear_model/ransac_regressor.py +8 -1
  114. snowflake/ml/modeling/linear_model/ridge.py +8 -1
  115. snowflake/ml/modeling/linear_model/ridge_classifier.py +8 -1
  116. snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +8 -1
  117. snowflake/ml/modeling/linear_model/ridge_cv.py +8 -1
  118. snowflake/ml/modeling/linear_model/sgd_classifier.py +8 -1
  119. snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +8 -1
  120. snowflake/ml/modeling/linear_model/sgd_regressor.py +8 -1
  121. snowflake/ml/modeling/linear_model/theil_sen_regressor.py +8 -1
  122. snowflake/ml/modeling/linear_model/tweedie_regressor.py +8 -1
  123. snowflake/ml/modeling/manifold/isomap.py +8 -1
  124. snowflake/ml/modeling/manifold/mds.py +8 -1
  125. snowflake/ml/modeling/manifold/spectral_embedding.py +8 -1
  126. snowflake/ml/modeling/manifold/tsne.py +8 -1
  127. snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +8 -1
  128. snowflake/ml/modeling/mixture/gaussian_mixture.py +8 -1
  129. snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +8 -1
  130. snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +8 -1
  131. snowflake/ml/modeling/multiclass/output_code_classifier.py +8 -1
  132. snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +8 -1
  133. snowflake/ml/modeling/naive_bayes/categorical_nb.py +8 -1
  134. snowflake/ml/modeling/naive_bayes/complement_nb.py +8 -1
  135. snowflake/ml/modeling/naive_bayes/gaussian_nb.py +8 -1
  136. snowflake/ml/modeling/naive_bayes/multinomial_nb.py +8 -1
  137. snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +8 -1
  138. snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +8 -1
  139. snowflake/ml/modeling/neighbors/kernel_density.py +8 -1
  140. snowflake/ml/modeling/neighbors/local_outlier_factor.py +8 -1
  141. snowflake/ml/modeling/neighbors/nearest_centroid.py +8 -1
  142. snowflake/ml/modeling/neighbors/nearest_neighbors.py +8 -1
  143. snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +8 -1
  144. snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +8 -1
  145. snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +8 -1
  146. snowflake/ml/modeling/neural_network/bernoulli_rbm.py +8 -1
  147. snowflake/ml/modeling/neural_network/mlp_classifier.py +8 -1
  148. snowflake/ml/modeling/neural_network/mlp_regressor.py +8 -1
  149. snowflake/ml/modeling/parameters/enable_anonymous_sproc.py +5 -0
  150. snowflake/ml/modeling/preprocessing/polynomial_features.py +8 -1
  151. snowflake/ml/modeling/semi_supervised/label_propagation.py +8 -1
  152. snowflake/ml/modeling/semi_supervised/label_spreading.py +8 -1
  153. snowflake/ml/modeling/svm/linear_svc.py +8 -1
  154. snowflake/ml/modeling/svm/linear_svr.py +8 -1
  155. snowflake/ml/modeling/svm/nu_svc.py +8 -1
  156. snowflake/ml/modeling/svm/nu_svr.py +8 -1
  157. snowflake/ml/modeling/svm/svc.py +8 -1
  158. snowflake/ml/modeling/svm/svr.py +8 -1
  159. snowflake/ml/modeling/tree/decision_tree_classifier.py +8 -1
  160. snowflake/ml/modeling/tree/decision_tree_regressor.py +8 -1
  161. snowflake/ml/modeling/tree/extra_tree_classifier.py +8 -1
  162. snowflake/ml/modeling/tree/extra_tree_regressor.py +8 -1
  163. snowflake/ml/modeling/xgboost/xgb_classifier.py +8 -1
  164. snowflake/ml/modeling/xgboost/xgb_regressor.py +8 -1
  165. snowflake/ml/modeling/xgboost/xgbrf_classifier.py +8 -1
  166. snowflake/ml/modeling/xgboost/xgbrf_regressor.py +8 -1
  167. snowflake/ml/registry/_manager/model_manager.py +59 -1
  168. snowflake/ml/registry/registry.py +10 -1
  169. snowflake/ml/version.py +1 -1
  170. {snowflake_ml_python-1.5.1.dist-info → snowflake_ml_python-1.5.2.dist-info}/METADATA +13 -1
  171. {snowflake_ml_python-1.5.1.dist-info → snowflake_ml_python-1.5.2.dist-info}/RECORD +174 -172
  172. {snowflake_ml_python-1.5.1.dist-info → snowflake_ml_python-1.5.2.dist-info}/LICENSE.txt +0 -0
  173. {snowflake_ml_python-1.5.1.dist-info → snowflake_ml_python-1.5.2.dist-info}/WHEEL +0 -0
  174. {snowflake_ml_python-1.5.1.dist-info → snowflake_ml_python-1.5.2.dist-info}/top_level.txt +0 -0
@@ -621,7 +621,14 @@ class StackingRegressor(BaseTransformer):
621
621
  ) -> List[str]:
622
622
  # in case the inferred output column names dimension is different
623
623
  # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
624
- output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
624
+ sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
625
+
626
+ # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
627
+ # seen during the fit.
628
+ snowpark_column_names = dataset.select(self.input_cols).columns
629
+ sample_pd_df.columns = snowpark_column_names
630
+
631
+ output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
625
632
  output_df_columns = list(output_df_pd.columns)
626
633
  output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
627
634
  if self.sample_weight_col:
@@ -603,7 +603,14 @@ class VotingClassifier(BaseTransformer):
603
603
  ) -> List[str]:
604
604
  # in case the inferred output column names dimension is different
605
605
  # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
606
- output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
606
+ sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
607
+
608
+ # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
609
+ # seen during the fit.
610
+ snowpark_column_names = dataset.select(self.input_cols).columns
611
+ sample_pd_df.columns = snowpark_column_names
612
+
613
+ output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
607
614
  output_df_columns = list(output_df_pd.columns)
608
615
  output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
609
616
  if self.sample_weight_col:
@@ -585,7 +585,14 @@ class VotingRegressor(BaseTransformer):
585
585
  ) -> List[str]:
586
586
  # in case the inferred output column names dimension is different
587
587
  # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
588
- output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
588
+ sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
589
+
590
+ # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
591
+ # seen during the fit.
592
+ snowpark_column_names = dataset.select(self.input_cols).columns
593
+ sample_pd_df.columns = snowpark_column_names
594
+
595
+ output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
589
596
  output_df_columns = list(output_df_pd.columns)
590
597
  output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
591
598
  if self.sample_weight_col:
@@ -573,7 +573,14 @@ class GenericUnivariateSelect(BaseTransformer):
573
573
  ) -> List[str]:
574
574
  # in case the inferred output column names dimension is different
575
575
  # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
576
- output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
576
+ sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
577
+
578
+ # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
579
+ # seen during the fit.
580
+ snowpark_column_names = dataset.select(self.input_cols).columns
581
+ sample_pd_df.columns = snowpark_column_names
582
+
583
+ output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
577
584
  output_df_columns = list(output_df_pd.columns)
578
585
  output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
579
586
  if self.sample_weight_col:
@@ -569,7 +569,14 @@ class SelectFdr(BaseTransformer):
569
569
  ) -> List[str]:
570
570
  # in case the inferred output column names dimension is different
571
571
  # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
572
- output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
572
+ sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
573
+
574
+ # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
575
+ # seen during the fit.
576
+ snowpark_column_names = dataset.select(self.input_cols).columns
577
+ sample_pd_df.columns = snowpark_column_names
578
+
579
+ output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
573
580
  output_df_columns = list(output_df_pd.columns)
574
581
  output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
575
582
  if self.sample_weight_col:
@@ -569,7 +569,14 @@ class SelectFpr(BaseTransformer):
569
569
  ) -> List[str]:
570
570
  # in case the inferred output column names dimension is different
571
571
  # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
572
- output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
572
+ sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
573
+
574
+ # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
575
+ # seen during the fit.
576
+ snowpark_column_names = dataset.select(self.input_cols).columns
577
+ sample_pd_df.columns = snowpark_column_names
578
+
579
+ output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
573
580
  output_df_columns = list(output_df_pd.columns)
574
581
  output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
575
582
  if self.sample_weight_col:
@@ -569,7 +569,14 @@ class SelectFwe(BaseTransformer):
569
569
  ) -> List[str]:
570
570
  # in case the inferred output column names dimension is different
571
571
  # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
572
- output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
572
+ sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
573
+
574
+ # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
575
+ # seen during the fit.
576
+ snowpark_column_names = dataset.select(self.input_cols).columns
577
+ sample_pd_df.columns = snowpark_column_names
578
+
579
+ output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
573
580
  output_df_columns = list(output_df_pd.columns)
574
581
  output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
575
582
  if self.sample_weight_col:
@@ -570,7 +570,14 @@ class SelectKBest(BaseTransformer):
570
570
  ) -> List[str]:
571
571
  # in case the inferred output column names dimension is different
572
572
  # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
573
- output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
573
+ sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
574
+
575
+ # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
576
+ # seen during the fit.
577
+ snowpark_column_names = dataset.select(self.input_cols).columns
578
+ sample_pd_df.columns = snowpark_column_names
579
+
580
+ output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
574
581
  output_df_columns = list(output_df_pd.columns)
575
582
  output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
576
583
  if self.sample_weight_col:
@@ -569,7 +569,14 @@ class SelectPercentile(BaseTransformer):
569
569
  ) -> List[str]:
570
570
  # in case the inferred output column names dimension is different
571
571
  # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
572
- output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
572
+ sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
573
+
574
+ # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
575
+ # seen during the fit.
576
+ snowpark_column_names = dataset.select(self.input_cols).columns
577
+ sample_pd_df.columns = snowpark_column_names
578
+
579
+ output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
573
580
  output_df_columns = list(output_df_pd.columns)
574
581
  output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
575
582
  if self.sample_weight_col:
@@ -627,7 +627,14 @@ class SequentialFeatureSelector(BaseTransformer):
627
627
  ) -> List[str]:
628
628
  # in case the inferred output column names dimension is different
629
629
  # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
630
- output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
630
+ sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
631
+
632
+ # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
633
+ # seen during the fit.
634
+ snowpark_column_names = dataset.select(self.input_cols).columns
635
+ sample_pd_df.columns = snowpark_column_names
636
+
637
+ output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
631
638
  output_df_columns = list(output_df_pd.columns)
632
639
  output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
633
640
  if self.sample_weight_col:
@@ -560,7 +560,14 @@ class VarianceThreshold(BaseTransformer):
560
560
  ) -> List[str]:
561
561
  # in case the inferred output column names dimension is different
562
562
  # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
563
- output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
563
+ sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
564
+
565
+ # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
566
+ # seen during the fit.
567
+ snowpark_column_names = dataset.select(self.input_cols).columns
568
+ sample_pd_df.columns = snowpark_column_names
569
+
570
+ output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
564
571
  output_df_columns = list(output_df_pd.columns)
565
572
  output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
566
573
  if self.sample_weight_col:
@@ -653,7 +653,14 @@ class GaussianProcessClassifier(BaseTransformer):
653
653
  ) -> List[str]:
654
654
  # in case the inferred output column names dimension is different
655
655
  # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
656
- output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
656
+ sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
657
+
658
+ # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
659
+ # seen during the fit.
660
+ snowpark_column_names = dataset.select(self.input_cols).columns
661
+ sample_pd_df.columns = snowpark_column_names
662
+
663
+ output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
657
664
  output_df_columns = list(output_df_pd.columns)
658
665
  output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
659
666
  if self.sample_weight_col:
@@ -644,7 +644,14 @@ class GaussianProcessRegressor(BaseTransformer):
644
644
  ) -> List[str]:
645
645
  # in case the inferred output column names dimension is different
646
646
  # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
647
- output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
647
+ sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
648
+
649
+ # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
650
+ # seen during the fit.
651
+ snowpark_column_names = dataset.select(self.input_cols).columns
652
+ sample_pd_df.columns = snowpark_column_names
653
+
654
+ output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
648
655
  output_df_columns = list(output_df_pd.columns)
649
656
  output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
650
657
  if self.sample_weight_col:
@@ -688,7 +688,14 @@ class IterativeImputer(BaseTransformer):
688
688
  ) -> List[str]:
689
689
  # in case the inferred output column names dimension is different
690
690
  # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
691
- output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
691
+ sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
692
+
693
+ # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
694
+ # seen during the fit.
695
+ snowpark_column_names = dataset.select(self.input_cols).columns
696
+ sample_pd_df.columns = snowpark_column_names
697
+
698
+ output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
692
699
  output_df_columns = list(output_df_pd.columns)
693
700
  output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
694
701
  if self.sample_weight_col:
@@ -614,7 +614,14 @@ class KNNImputer(BaseTransformer):
614
614
  ) -> List[str]:
615
615
  # in case the inferred output column names dimension is different
616
616
  # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
617
- output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
617
+ sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
618
+
619
+ # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
620
+ # seen during the fit.
621
+ snowpark_column_names = dataset.select(self.input_cols).columns
622
+ sample_pd_df.columns = snowpark_column_names
623
+
624
+ output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
618
625
  output_df_columns = list(output_df_pd.columns)
619
626
  output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
620
627
  if self.sample_weight_col:
@@ -588,7 +588,14 @@ class MissingIndicator(BaseTransformer):
588
588
  ) -> List[str]:
589
589
  # in case the inferred output column names dimension is different
590
590
  # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
591
- output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
591
+ sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
592
+
593
+ # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
594
+ # seen during the fit.
595
+ snowpark_column_names = dataset.select(self.input_cols).columns
596
+ sample_pd_df.columns = snowpark_column_names
597
+
598
+ output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
592
599
  output_df_columns = list(output_df_pd.columns)
593
600
  output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
594
601
  if self.sample_weight_col:
@@ -158,6 +158,7 @@ class SimpleImputer(base.BaseTransformer):
158
158
 
159
159
  self.fill_value = fill_value
160
160
  self.missing_values = missing_values
161
+ self.statistics_: Dict[str, Any] = {}
161
162
  # TODO(hayu): [SNOW-752265] Support SimpleImputer keep_empty_features.
162
163
  # Add back when `keep_empty_features` is supported.
163
164
  # self.keep_empty_features = keep_empty_features
@@ -229,8 +230,27 @@ class SimpleImputer(base.BaseTransformer):
229
230
 
230
231
  return input_col_datatypes
231
232
 
233
+ def fit(self, dataset: Union[snowpark.DataFrame, pd.DataFrame]) -> "SimpleImputer":
234
+ if isinstance(dataset, snowpark.DataFrame):
235
+ return self._fit_snowpark(dataset)
236
+ else:
237
+ return self._fit_sklearn(dataset)
238
+
239
+ def _fit_sklearn(self, dataset: pd.DataFrame) -> "SimpleImputer":
240
+ dataset = self._use_input_cols_only(dataset)
241
+ sklearn_simple_imputer = self._create_sklearn_object()
242
+ sklearn_simple_imputer = sklearn_simple_imputer.fit(dataset)
243
+ self._sklearn_object = sklearn_simple_imputer
244
+ for input_col, fill_value in zip(self.input_cols, sklearn_simple_imputer.statistics_.tolist()):
245
+ self.statistics_[input_col] = fill_value
246
+ self._sklearn_fit_dtype = sklearn_simple_imputer._fit_dtype
247
+ self.n_features_in_ = len(self.input_cols)
248
+ self.feature_names_in_ = self.input_cols
249
+ self._is_fitted = True
250
+ return self
251
+
232
252
  @telemetry.send_api_usage_telemetry(project=base.PROJECT, subproject=_SUBPROJECT)
233
- def fit(self, dataset: snowpark.DataFrame) -> "SimpleImputer":
253
+ def _fit_snowpark(self, dataset: snowpark.DataFrame) -> "SimpleImputer":
234
254
  """
235
255
  Compute values to impute for the dataset according to the strategy.
236
256
 
@@ -245,7 +265,6 @@ class SimpleImputer(base.BaseTransformer):
245
265
  # In order to fit, the input columns should have the same type.
246
266
  input_col_datatypes = self._get_dataset_input_col_datatypes(dataset)
247
267
 
248
- self.statistics_: Dict[str, Any] = {}
249
268
  statement_params = telemetry.get_statement_params(base.PROJECT, _SUBPROJECT, self.__class__.__name__)
250
269
 
251
270
  if self.strategy == "constant":
@@ -563,7 +563,14 @@ class AdditiveChi2Sampler(BaseTransformer):
563
563
  ) -> List[str]:
564
564
  # in case the inferred output column names dimension is different
565
565
  # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
566
- output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
566
+ sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
567
+
568
+ # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
569
+ # seen during the fit.
570
+ snowpark_column_names = dataset.select(self.input_cols).columns
571
+ sample_pd_df.columns = snowpark_column_names
572
+
573
+ output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
567
574
  output_df_columns = list(output_df_pd.columns)
568
575
  output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
569
576
  if self.sample_weight_col:
@@ -611,7 +611,14 @@ class Nystroem(BaseTransformer):
611
611
  ) -> List[str]:
612
612
  # in case the inferred output column names dimension is different
613
613
  # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
614
- output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
614
+ sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
615
+
616
+ # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
617
+ # seen during the fit.
618
+ snowpark_column_names = dataset.select(self.input_cols).columns
619
+ sample_pd_df.columns = snowpark_column_names
620
+
621
+ output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
615
622
  output_df_columns = list(output_df_pd.columns)
616
623
  output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
617
624
  if self.sample_weight_col:
@@ -587,7 +587,14 @@ class PolynomialCountSketch(BaseTransformer):
587
587
  ) -> List[str]:
588
588
  # in case the inferred output column names dimension is different
589
589
  # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
590
- output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
590
+ sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
591
+
592
+ # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
593
+ # seen during the fit.
594
+ snowpark_column_names = dataset.select(self.input_cols).columns
595
+ sample_pd_df.columns = snowpark_column_names
596
+
597
+ output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
591
598
  output_df_columns = list(output_df_pd.columns)
592
599
  output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
593
600
  if self.sample_weight_col:
@@ -574,7 +574,14 @@ class RBFSampler(BaseTransformer):
574
574
  ) -> List[str]:
575
575
  # in case the inferred output column names dimension is different
576
576
  # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
577
- output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
577
+ sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
578
+
579
+ # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
580
+ # seen during the fit.
581
+ snowpark_column_names = dataset.select(self.input_cols).columns
582
+ sample_pd_df.columns = snowpark_column_names
583
+
584
+ output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
578
585
  output_df_columns = list(output_df_pd.columns)
579
586
  output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
580
587
  if self.sample_weight_col:
@@ -572,7 +572,14 @@ class SkewedChi2Sampler(BaseTransformer):
572
572
  ) -> List[str]:
573
573
  # in case the inferred output column names dimension is different
574
574
  # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
575
- output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
575
+ sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
576
+
577
+ # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
578
+ # seen during the fit.
579
+ snowpark_column_names = dataset.select(self.input_cols).columns
580
+ sample_pd_df.columns = snowpark_column_names
581
+
582
+ output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
576
583
  output_df_columns = list(output_df_pd.columns)
577
584
  output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
578
585
  if self.sample_weight_col:
@@ -606,7 +606,14 @@ class KernelRidge(BaseTransformer):
606
606
  ) -> List[str]:
607
607
  # in case the inferred output column names dimension is different
608
608
  # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
609
- output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
609
+ sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
610
+
611
+ # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
612
+ # seen during the fit.
613
+ snowpark_column_names = dataset.select(self.input_cols).columns
614
+ sample_pd_df.columns = snowpark_column_names
615
+
616
+ output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
610
617
  output_df_columns = list(output_df_pd.columns)
611
618
  output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
612
619
  if self.sample_weight_col:
@@ -595,7 +595,14 @@ class LGBMClassifier(BaseTransformer):
595
595
  ) -> List[str]:
596
596
  # in case the inferred output column names dimension is different
597
597
  # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
598
- output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
598
+ sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
599
+
600
+ # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
601
+ # seen during the fit.
602
+ snowpark_column_names = dataset.select(self.input_cols).columns
603
+ sample_pd_df.columns = snowpark_column_names
604
+
605
+ output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
599
606
  output_df_columns = list(output_df_pd.columns)
600
607
  output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
601
608
  if self.sample_weight_col:
@@ -595,7 +595,14 @@ class LGBMRegressor(BaseTransformer):
595
595
  ) -> List[str]:
596
596
  # in case the inferred output column names dimension is different
597
597
  # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
598
- output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
598
+ sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
599
+
600
+ # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
601
+ # seen during the fit.
602
+ snowpark_column_names = dataset.select(self.input_cols).columns
603
+ sample_pd_df.columns = snowpark_column_names
604
+
605
+ output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
599
606
  output_df_columns = list(output_df_pd.columns)
600
607
  output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
601
608
  if self.sample_weight_col:
@@ -620,7 +620,14 @@ class ARDRegression(BaseTransformer):
620
620
  ) -> List[str]:
621
621
  # in case the inferred output column names dimension is different
622
622
  # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
623
- output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
623
+ sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
624
+
625
+ # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
626
+ # seen during the fit.
627
+ snowpark_column_names = dataset.select(self.input_cols).columns
628
+ sample_pd_df.columns = snowpark_column_names
629
+
630
+ output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
624
631
  output_df_columns = list(output_df_pd.columns)
625
632
  output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
626
633
  if self.sample_weight_col:
@@ -631,7 +631,14 @@ class BayesianRidge(BaseTransformer):
631
631
  ) -> List[str]:
632
632
  # in case the inferred output column names dimension is different
633
633
  # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
634
- output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
634
+ sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
635
+
636
+ # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
637
+ # seen during the fit.
638
+ snowpark_column_names = dataset.select(self.input_cols).columns
639
+ sample_pd_df.columns = snowpark_column_names
640
+
641
+ output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
635
642
  output_df_columns = list(output_df_pd.columns)
636
643
  output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
637
644
  if self.sample_weight_col:
@@ -630,7 +630,14 @@ class ElasticNet(BaseTransformer):
630
630
  ) -> List[str]:
631
631
  # in case the inferred output column names dimension is different
632
632
  # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
633
- output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
633
+ sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
634
+
635
+ # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
636
+ # seen during the fit.
637
+ snowpark_column_names = dataset.select(self.input_cols).columns
638
+ sample_pd_df.columns = snowpark_column_names
639
+
640
+ output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
634
641
  output_df_columns = list(output_df_pd.columns)
635
642
  output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
636
643
  if self.sample_weight_col:
@@ -666,7 +666,14 @@ class ElasticNetCV(BaseTransformer):
666
666
  ) -> List[str]:
667
667
  # in case the inferred output column names dimension is different
668
668
  # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
669
- output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
669
+ sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
670
+
671
+ # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
672
+ # seen during the fit.
673
+ snowpark_column_names = dataset.select(self.input_cols).columns
674
+ sample_pd_df.columns = snowpark_column_names
675
+
676
+ output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
670
677
  output_df_columns = list(output_df_pd.columns)
671
678
  output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
672
679
  if self.sample_weight_col:
@@ -611,7 +611,14 @@ class GammaRegressor(BaseTransformer):
611
611
  ) -> List[str]:
612
612
  # in case the inferred output column names dimension is different
613
613
  # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
614
- output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
614
+ sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
615
+
616
+ # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
617
+ # seen during the fit.
618
+ snowpark_column_names = dataset.select(self.input_cols).columns
619
+ sample_pd_df.columns = snowpark_column_names
620
+
621
+ output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
615
622
  output_df_columns = list(output_df_pd.columns)
616
623
  output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
617
624
  if self.sample_weight_col:
@@ -594,7 +594,14 @@ class HuberRegressor(BaseTransformer):
594
594
  ) -> List[str]:
595
595
  # in case the inferred output column names dimension is different
596
596
  # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
597
- output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
597
+ sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
598
+
599
+ # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
600
+ # seen during the fit.
601
+ snowpark_column_names = dataset.select(self.input_cols).columns
602
+ sample_pd_df.columns = snowpark_column_names
603
+
604
+ output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
598
605
  output_df_columns = list(output_df_pd.columns)
599
606
  output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
600
607
  if self.sample_weight_col:
@@ -623,7 +623,14 @@ class Lars(BaseTransformer):
623
623
  ) -> List[str]:
624
624
  # in case the inferred output column names dimension is different
625
625
  # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
626
- output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
626
+ sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
627
+
628
+ # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
629
+ # seen during the fit.
630
+ snowpark_column_names = dataset.select(self.input_cols).columns
631
+ sample_pd_df.columns = snowpark_column_names
632
+
633
+ output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
627
634
  output_df_columns = list(output_df_pd.columns)
628
635
  output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
629
636
  if self.sample_weight_col:
@@ -631,7 +631,14 @@ class LarsCV(BaseTransformer):
631
631
  ) -> List[str]:
632
632
  # in case the inferred output column names dimension is different
633
633
  # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
634
- output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
634
+ sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
635
+
636
+ # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
637
+ # seen during the fit.
638
+ snowpark_column_names = dataset.select(self.input_cols).columns
639
+ sample_pd_df.columns = snowpark_column_names
640
+
641
+ output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
635
642
  output_df_columns = list(output_df_pd.columns)
636
643
  output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
637
644
  if self.sample_weight_col:
@@ -624,7 +624,14 @@ class Lasso(BaseTransformer):
624
624
  ) -> List[str]:
625
625
  # in case the inferred output column names dimension is different
626
626
  # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
627
- output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
627
+ sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
628
+
629
+ # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
630
+ # seen during the fit.
631
+ snowpark_column_names = dataset.select(self.input_cols).columns
632
+ sample_pd_df.columns = snowpark_column_names
633
+
634
+ output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
628
635
  output_df_columns = list(output_df_pd.columns)
629
636
  output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
630
637
  if self.sample_weight_col: