snowflake-ml-python 1.5.0__py3-none-any.whl → 1.5.2__py3-none-any.whl

This diff compares the contents of two publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in the public registry.
Files changed (197)
  1. snowflake/cortex/_sentiment.py +7 -4
  2. snowflake/ml/_internal/env_utils.py +6 -0
  3. snowflake/ml/_internal/lineage/lineage_utils.py +95 -0
  4. snowflake/ml/_internal/telemetry.py +1 -0
  5. snowflake/ml/_internal/utils/identifier.py +1 -1
  6. snowflake/ml/_internal/utils/sql_identifier.py +14 -1
  7. snowflake/ml/_internal/utils/temp_file_utils.py +5 -2
  8. snowflake/ml/dataset/__init__.py +2 -1
  9. snowflake/ml/dataset/dataset.py +4 -3
  10. snowflake/ml/dataset/dataset_reader.py +5 -8
  11. snowflake/ml/feature_store/__init__.py +6 -0
  12. snowflake/ml/feature_store/access_manager.py +283 -0
  13. snowflake/ml/feature_store/feature_store.py +160 -100
  14. snowflake/ml/feature_store/feature_view.py +30 -19
  15. snowflake/ml/fileset/embedded_stage_fs.py +15 -12
  16. snowflake/ml/fileset/snowfs.py +2 -30
  17. snowflake/ml/fileset/stage_fs.py +25 -7
  18. snowflake/ml/model/_client/model/model_impl.py +46 -39
  19. snowflake/ml/model/_client/model/model_version_impl.py +24 -2
  20. snowflake/ml/model/_client/ops/metadata_ops.py +27 -4
  21. snowflake/ml/model/_client/ops/model_ops.py +174 -16
  22. snowflake/ml/model/_client/sql/_base.py +34 -0
  23. snowflake/ml/model/_client/sql/model.py +32 -39
  24. snowflake/ml/model/_client/sql/model_version.py +111 -42
  25. snowflake/ml/model/_client/sql/stage.py +6 -32
  26. snowflake/ml/model/_client/sql/tag.py +32 -56
  27. snowflake/ml/model/_model_composer/model_composer.py +8 -4
  28. snowflake/ml/model/_packager/model_handlers/mlflow.py +2 -1
  29. snowflake/ml/model/_packager/model_meta/model_meta.py +1 -3
  30. snowflake/ml/model/_packager/model_runtime/model_runtime.py +3 -27
  31. snowflake/ml/modeling/_internal/snowpark_implementations/distributed_hpo_trainer.py +90 -142
  32. snowflake/ml/modeling/_internal/snowpark_implementations/distributed_search_udf_file.py +159 -0
  33. snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_trainer.py +81 -3
  34. snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +8 -1
  35. snowflake/ml/modeling/cluster/affinity_propagation.py +8 -1
  36. snowflake/ml/modeling/cluster/agglomerative_clustering.py +8 -1
  37. snowflake/ml/modeling/cluster/birch.py +8 -1
  38. snowflake/ml/modeling/cluster/bisecting_k_means.py +8 -1
  39. snowflake/ml/modeling/cluster/dbscan.py +8 -1
  40. snowflake/ml/modeling/cluster/feature_agglomeration.py +8 -1
  41. snowflake/ml/modeling/cluster/k_means.py +8 -1
  42. snowflake/ml/modeling/cluster/mean_shift.py +8 -1
  43. snowflake/ml/modeling/cluster/mini_batch_k_means.py +8 -1
  44. snowflake/ml/modeling/cluster/optics.py +8 -1
  45. snowflake/ml/modeling/cluster/spectral_biclustering.py +8 -1
  46. snowflake/ml/modeling/cluster/spectral_clustering.py +8 -1
  47. snowflake/ml/modeling/cluster/spectral_coclustering.py +8 -1
  48. snowflake/ml/modeling/compose/column_transformer.py +8 -1
  49. snowflake/ml/modeling/compose/transformed_target_regressor.py +8 -1
  50. snowflake/ml/modeling/covariance/elliptic_envelope.py +8 -1
  51. snowflake/ml/modeling/covariance/empirical_covariance.py +8 -1
  52. snowflake/ml/modeling/covariance/graphical_lasso.py +8 -1
  53. snowflake/ml/modeling/covariance/graphical_lasso_cv.py +8 -1
  54. snowflake/ml/modeling/covariance/ledoit_wolf.py +8 -1
  55. snowflake/ml/modeling/covariance/min_cov_det.py +8 -1
  56. snowflake/ml/modeling/covariance/oas.py +8 -1
  57. snowflake/ml/modeling/covariance/shrunk_covariance.py +8 -1
  58. snowflake/ml/modeling/decomposition/dictionary_learning.py +8 -1
  59. snowflake/ml/modeling/decomposition/factor_analysis.py +8 -1
  60. snowflake/ml/modeling/decomposition/fast_ica.py +8 -1
  61. snowflake/ml/modeling/decomposition/incremental_pca.py +8 -1
  62. snowflake/ml/modeling/decomposition/kernel_pca.py +8 -1
  63. snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +8 -1
  64. snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +8 -1
  65. snowflake/ml/modeling/decomposition/pca.py +8 -1
  66. snowflake/ml/modeling/decomposition/sparse_pca.py +8 -1
  67. snowflake/ml/modeling/decomposition/truncated_svd.py +8 -1
  68. snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +8 -1
  69. snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +8 -1
  70. snowflake/ml/modeling/ensemble/ada_boost_classifier.py +8 -1
  71. snowflake/ml/modeling/ensemble/ada_boost_regressor.py +8 -1
  72. snowflake/ml/modeling/ensemble/bagging_classifier.py +8 -1
  73. snowflake/ml/modeling/ensemble/bagging_regressor.py +8 -1
  74. snowflake/ml/modeling/ensemble/extra_trees_classifier.py +8 -1
  75. snowflake/ml/modeling/ensemble/extra_trees_regressor.py +8 -1
  76. snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +8 -1
  77. snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +8 -1
  78. snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +8 -1
  79. snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +8 -1
  80. snowflake/ml/modeling/ensemble/isolation_forest.py +8 -1
  81. snowflake/ml/modeling/ensemble/random_forest_classifier.py +8 -1
  82. snowflake/ml/modeling/ensemble/random_forest_regressor.py +8 -1
  83. snowflake/ml/modeling/ensemble/stacking_regressor.py +8 -1
  84. snowflake/ml/modeling/ensemble/voting_classifier.py +8 -1
  85. snowflake/ml/modeling/ensemble/voting_regressor.py +8 -1
  86. snowflake/ml/modeling/feature_selection/generic_univariate_select.py +8 -1
  87. snowflake/ml/modeling/feature_selection/select_fdr.py +8 -1
  88. snowflake/ml/modeling/feature_selection/select_fpr.py +8 -1
  89. snowflake/ml/modeling/feature_selection/select_fwe.py +8 -1
  90. snowflake/ml/modeling/feature_selection/select_k_best.py +8 -1
  91. snowflake/ml/modeling/feature_selection/select_percentile.py +8 -1
  92. snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +8 -1
  93. snowflake/ml/modeling/feature_selection/variance_threshold.py +8 -1
  94. snowflake/ml/modeling/framework/base.py +4 -3
  95. snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +8 -1
  96. snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +8 -1
  97. snowflake/ml/modeling/impute/iterative_imputer.py +8 -1
  98. snowflake/ml/modeling/impute/knn_imputer.py +8 -1
  99. snowflake/ml/modeling/impute/missing_indicator.py +8 -1
  100. snowflake/ml/modeling/impute/simple_imputer.py +21 -2
  101. snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +8 -1
  102. snowflake/ml/modeling/kernel_approximation/nystroem.py +8 -1
  103. snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +8 -1
  104. snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +8 -1
  105. snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +8 -1
  106. snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +8 -1
  107. snowflake/ml/modeling/lightgbm/lgbm_classifier.py +8 -1
  108. snowflake/ml/modeling/lightgbm/lgbm_regressor.py +8 -1
  109. snowflake/ml/modeling/linear_model/ard_regression.py +8 -1
  110. snowflake/ml/modeling/linear_model/bayesian_ridge.py +8 -1
  111. snowflake/ml/modeling/linear_model/elastic_net.py +8 -1
  112. snowflake/ml/modeling/linear_model/elastic_net_cv.py +8 -1
  113. snowflake/ml/modeling/linear_model/gamma_regressor.py +8 -1
  114. snowflake/ml/modeling/linear_model/huber_regressor.py +8 -1
  115. snowflake/ml/modeling/linear_model/lars.py +8 -1
  116. snowflake/ml/modeling/linear_model/lars_cv.py +8 -1
  117. snowflake/ml/modeling/linear_model/lasso.py +8 -1
  118. snowflake/ml/modeling/linear_model/lasso_cv.py +8 -1
  119. snowflake/ml/modeling/linear_model/lasso_lars.py +8 -1
  120. snowflake/ml/modeling/linear_model/lasso_lars_cv.py +8 -1
  121. snowflake/ml/modeling/linear_model/lasso_lars_ic.py +8 -1
  122. snowflake/ml/modeling/linear_model/linear_regression.py +8 -1
  123. snowflake/ml/modeling/linear_model/logistic_regression.py +8 -1
  124. snowflake/ml/modeling/linear_model/logistic_regression_cv.py +8 -1
  125. snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +8 -1
  126. snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +8 -1
  127. snowflake/ml/modeling/linear_model/multi_task_lasso.py +8 -1
  128. snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +8 -1
  129. snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +8 -1
  130. snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +8 -1
  131. snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +8 -1
  132. snowflake/ml/modeling/linear_model/perceptron.py +8 -1
  133. snowflake/ml/modeling/linear_model/poisson_regressor.py +8 -1
  134. snowflake/ml/modeling/linear_model/ransac_regressor.py +8 -1
  135. snowflake/ml/modeling/linear_model/ridge.py +8 -1
  136. snowflake/ml/modeling/linear_model/ridge_classifier.py +8 -1
  137. snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +8 -1
  138. snowflake/ml/modeling/linear_model/ridge_cv.py +8 -1
  139. snowflake/ml/modeling/linear_model/sgd_classifier.py +8 -1
  140. snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +8 -1
  141. snowflake/ml/modeling/linear_model/sgd_regressor.py +8 -1
  142. snowflake/ml/modeling/linear_model/theil_sen_regressor.py +8 -1
  143. snowflake/ml/modeling/linear_model/tweedie_regressor.py +8 -1
  144. snowflake/ml/modeling/manifold/isomap.py +8 -1
  145. snowflake/ml/modeling/manifold/mds.py +8 -1
  146. snowflake/ml/modeling/manifold/spectral_embedding.py +8 -1
  147. snowflake/ml/modeling/manifold/tsne.py +8 -1
  148. snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +8 -1
  149. snowflake/ml/modeling/mixture/gaussian_mixture.py +8 -1
  150. snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +8 -1
  151. snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +8 -1
  152. snowflake/ml/modeling/multiclass/output_code_classifier.py +8 -1
  153. snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +8 -1
  154. snowflake/ml/modeling/naive_bayes/categorical_nb.py +8 -1
  155. snowflake/ml/modeling/naive_bayes/complement_nb.py +8 -1
  156. snowflake/ml/modeling/naive_bayes/gaussian_nb.py +8 -1
  157. snowflake/ml/modeling/naive_bayes/multinomial_nb.py +8 -1
  158. snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +8 -1
  159. snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +8 -1
  160. snowflake/ml/modeling/neighbors/kernel_density.py +8 -1
  161. snowflake/ml/modeling/neighbors/local_outlier_factor.py +8 -1
  162. snowflake/ml/modeling/neighbors/nearest_centroid.py +8 -1
  163. snowflake/ml/modeling/neighbors/nearest_neighbors.py +8 -1
  164. snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +8 -1
  165. snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +8 -1
  166. snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +8 -1
  167. snowflake/ml/modeling/neural_network/bernoulli_rbm.py +8 -1
  168. snowflake/ml/modeling/neural_network/mlp_classifier.py +8 -1
  169. snowflake/ml/modeling/neural_network/mlp_regressor.py +8 -1
  170. snowflake/ml/modeling/parameters/enable_anonymous_sproc.py +5 -0
  171. snowflake/ml/modeling/pipeline/pipeline.py +27 -7
  172. snowflake/ml/modeling/preprocessing/polynomial_features.py +8 -1
  173. snowflake/ml/modeling/semi_supervised/label_propagation.py +8 -1
  174. snowflake/ml/modeling/semi_supervised/label_spreading.py +8 -1
  175. snowflake/ml/modeling/svm/linear_svc.py +8 -1
  176. snowflake/ml/modeling/svm/linear_svr.py +8 -1
  177. snowflake/ml/modeling/svm/nu_svc.py +8 -1
  178. snowflake/ml/modeling/svm/nu_svr.py +8 -1
  179. snowflake/ml/modeling/svm/svc.py +8 -1
  180. snowflake/ml/modeling/svm/svr.py +8 -1
  181. snowflake/ml/modeling/tree/decision_tree_classifier.py +8 -1
  182. snowflake/ml/modeling/tree/decision_tree_regressor.py +8 -1
  183. snowflake/ml/modeling/tree/extra_tree_classifier.py +8 -1
  184. snowflake/ml/modeling/tree/extra_tree_regressor.py +8 -1
  185. snowflake/ml/modeling/xgboost/xgb_classifier.py +8 -1
  186. snowflake/ml/modeling/xgboost/xgb_regressor.py +8 -1
  187. snowflake/ml/modeling/xgboost/xgbrf_classifier.py +8 -1
  188. snowflake/ml/modeling/xgboost/xgbrf_regressor.py +8 -1
  189. snowflake/ml/registry/_manager/model_manager.py +95 -8
  190. snowflake/ml/registry/registry.py +10 -1
  191. snowflake/ml/version.py +1 -1
  192. {snowflake_ml_python-1.5.0.dist-info → snowflake_ml_python-1.5.2.dist-info}/METADATA +66 -10
  193. {snowflake_ml_python-1.5.0.dist-info → snowflake_ml_python-1.5.2.dist-info}/RECORD +196 -192
  194. snowflake/ml/_internal/lineage/dataset_dataframe.py +0 -44
  195. {snowflake_ml_python-1.5.0.dist-info → snowflake_ml_python-1.5.2.dist-info}/LICENSE.txt +0 -0
  196. {snowflake_ml_python-1.5.0.dist-info → snowflake_ml_python-1.5.2.dist-info}/WHEEL +0 -0
  197. {snowflake_ml_python-1.5.0.dist-info → snowflake_ml_python-1.5.2.dist-info}/top_level.txt +0 -0
@@ -618,7 +618,14 @@ class TheilSenRegressor(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-        output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
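
This same seven-line replacement is stamped into every generated estimator class below (and into the preprocessing and semi-supervised hunks at the end): the one-row sample is now restricted to the fitted input columns, and its pandas column labels are reset to the Snowpark identifiers before the local sklearn call. A minimal sketch of the intent, assuming a Snowpark `dataset` and a fitted `transformer` exposing `input_cols`; the free-standing helper is hypothetical and only restates the hunk's logic outside the generated class:

    def infer_output_sample(transformer, dataset, method: str, output_cols_prefix: str):
        # Keep only the fitted input columns so extra columns in `dataset`
        # never reach the local sklearn estimator.
        sample_pd_df = dataset.select(transformer.input_cols).limit(1).to_pandas()
        # to_pandas() may not preserve the Snowflake identifier spelling, so
        # copy the Snowpark-side column names back onto the pandas frame;
        # the select() above already fixed the column order seen during fit.
        sample_pd_df.columns = dataset.select(transformer.input_cols).columns
        # Dispatch to e.g. transformer.predict or transformer.transform.
        return getattr(transformer, method)(sample_pd_df, output_cols_prefix)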
@@ -644,7 +644,14 @@ class TweedieRegressor(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-        output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
@@ -642,7 +642,14 @@ class Isomap(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-        output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
@@ -623,7 +623,14 @@ class MDS(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-        output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
@@ -625,7 +625,14 @@ class SpectralEmbedding(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-        output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
@@ -684,7 +684,14 @@ class TSNE(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-        output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
@@ -689,7 +689,14 @@ class BayesianGaussianMixture(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-        output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
@@ -662,7 +662,14 @@ class GaussianMixture(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-        output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
@@ -572,7 +572,14 @@ class OneVsOneClassifier(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-        output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
@@ -581,7 +581,14 @@ class OneVsRestClassifier(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-        output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
@@ -584,7 +584,14 @@ class OutputCodeClassifier(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-        output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
@@ -584,7 +584,14 @@ class BernoulliNB(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-        output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
@@ -590,7 +590,14 @@ class CategoricalNB(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-        output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
@@ -584,7 +584,14 @@ class ComplementNB(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-        output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
@@ -565,7 +565,14 @@ class GaussianNB(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-        output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
@@ -578,7 +578,14 @@ class MultinomialNB(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
        # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-        output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
@@ -635,7 +635,14 @@ class KNeighborsClassifier(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-        output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
@@ -637,7 +637,14 @@ class KNeighborsRegressor(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-        output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
@@ -612,7 +612,14 @@ class KernelDensity(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-        output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
@@ -644,7 +644,14 @@ class LocalOutlierFactor(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-        output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
@@ -575,7 +575,14 @@ class NearestCentroid(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-        output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
@@ -623,7 +623,14 @@ class NearestNeighbors(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-        output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
@@ -648,7 +648,14 @@ class NeighborhoodComponentsAnalysis(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-        output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
@@ -647,7 +647,14 @@ class RadiusNeighborsClassifier(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-        output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
@@ -637,7 +637,14 @@ class RadiusNeighborsRegressor(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-        output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
@@ -596,7 +596,14 @@ class BernoulliRBM(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-        output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
@@ -749,7 +749,14 @@ class MLPClassifier(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-        output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
@@ -745,7 +745,14 @@ class MLPRegressor(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-        output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
@@ -0,0 +1,5 @@
+"""Enables the anonymous stored procedures for running modeling fit"""
+
+from snowflake.ml.modeling._internal.snowpark_implementations import snowpark_trainer
+
+snowpark_trainer._ENABLE_ANONYMOUS_SPROC = True
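
The new module is activated purely by import side effect. A hedged usage sketch (the estimator and column names are illustrative, not part of the diff):

    # Importing the module flips the module-level flag for this process;
    # it must run before any fit() call that should use anonymous sprocs.
    import snowflake.ml.modeling.parameters.enable_anonymous_sproc  # noqa: F401

    from snowflake.ml.modeling.linear_model import LinearRegression

    lr = LinearRegression(input_cols=["X1", "X2"], label_cols=["Y"])
    # lr.fit(snowpark_df) would now train via an anonymous stored procedure,
    # per snowpark_trainer._ENABLE_ANONYMOUS_SPROC set above.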
@@ -115,7 +115,7 @@ class Pipeline(base.BaseTransformer):
         self._feature_names_in: List[np.ndarray[Any, np.dtype[Any]]] = []
         self._n_features_in: List[int] = []
         self._transformers_to_input_indices: Dict[str, List[int]] = {}
-        self._is_convertible_to_sklearn = True
+        self._modifies_label_or_sample_weight = True

         self._model_signature_dict: Optional[Dict[str, ModelSignature]] = None

@@ -126,6 +126,9 @@ class Pipeline(base.BaseTransformer):
         self._deps = list(deps)
         self._sklearn_object = None
         self.label_cols = self._get_label_cols()
+        self._is_convertible_to_sklearn = self._is_convertible_to_sklearn_object()
+
+        self._send_pipeline_configuration_telemetry()

     @staticmethod
     def _is_estimator(obj: object) -> bool:
@@ -228,7 +231,7 @@ class Pipeline(base.BaseTransformer):
         return [c for c in columns if c not in target_cols]

     def _append_step_feature_consumption_info(self, step_name: str, all_cols: List[str], input_cols: List[str]) -> None:
-        if self._is_convertible_to_sklearn:
+        if self._modifies_label_or_sample_weight:
             all_cols = self._get_sanitized_list_of_columns(all_cols)
             self._feature_names_in.append(np.asarray(all_cols, dtype=object))
             self._n_features_in.append(len(all_cols))
@@ -248,7 +251,7 @@ class Pipeline(base.BaseTransformer):
        self, dataset: Union[snowpark.DataFrame, pd.DataFrame]
    ) -> Union[snowpark.DataFrame, pd.DataFrame]:
        self._reset()
-        self._is_convertible_to_sklearn = not self._is_pipeline_modifying_label_or_sample_weight()
+        self._modifies_label_or_sample_weight = not self._is_pipeline_modifying_label_or_sample_weight()
        transformed_dataset = dataset
        for name, trans in self._get_transformers():
            self._append_step_feature_consumption_info(
@@ -425,7 +428,7 @@ class Pipeline(base.BaseTransformer):
         )

         if self._can_be_trained_in_ml_runtime(dataset):
-            if not self._is_convertible_to_sklearn_object():
+            if not self._is_convertible_to_sklearn:
                 raise ValueError("This pipeline cannot be converted to an sklearn pipeline.")
             self._fit_ml_runtime(dataset)

@@ -947,7 +950,7 @@ class Pipeline(base.BaseTransformer):
        if not os.environ.get(IN_ML_RUNTIME_ENV_VAR):
            return False

-        return self._is_convertible_to_sklearn_object()
+        return self._is_convertible_to_sklearn

    @staticmethod
    def _wrap_transformer_in_column_transformer(
@@ -1003,7 +1006,7 @@ class Pipeline(base.BaseTransformer):
        if not self._is_fitted:
            return self._create_unfitted_sklearn_object()

-        if not self._is_convertible_to_sklearn:
+        if not self._modifies_label_or_sample_weight:
            raise exceptions.SnowflakeMLException(
                error_code=error_codes.METHOD_NOT_ALLOWED,
                original_exception=ValueError(
@@ -1109,7 +1112,24 @@ class Pipeline(base.BaseTransformer):
            else:
                return self._create_sklearn_object()
        else:
-            if self._is_convertible_to_sklearn_object():
+            if self._is_convertible_to_sklearn:
                return self._create_unfitted_sklearn_object()
            else:
                raise ValueError("This pipeline can not be converted to an sklearn pipeline.")
+
+    def _send_pipeline_configuration_telemetry(self) -> None:
+        """Track information about the pipeline setup. Currently, we want to track:
+        - Whether the pipeline is convertible to an sklearn pipeline
+        - Whether the pipeline is being used in the SPCS ml runtime.
+        """
+
+        telemetry_data = {
+            "pipeline_is_convertible_to_sklearn": self._is_convertible_to_sklearn,
+            "in_spcs_ml_runtime": bool(os.environ.get(IN_ML_RUNTIME_ENV_VAR)),
+        }
+        telemetry.send_custom_usage(
+            project=_PROJECT,
+            subproject=_SUBPROJECT,
+            telemetry_type=telemetry.TelemetryField.TYPE_SNOWML_PIPELINE_USAGE.value,
+            data=telemetry_data,
+        )
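
For reference, a self-contained sketch of the payload that the new _send_pipeline_configuration_telemetry method assembles; the environment-variable value is a stand-in, since the hunk only shows the constant IN_ML_RUNTIME_ENV_VAR:

    import os

    IN_ML_RUNTIME_ENV_VAR = "IN_SPCS_ML_RUNTIME"  # assumed value, for illustration only

    def build_pipeline_telemetry(is_convertible_to_sklearn: bool) -> dict:
        # Mirrors the dict passed to telemetry.send_custom_usage in the hunk above.
        return {
            "pipeline_is_convertible_to_sklearn": is_convertible_to_sklearn,
            "in_spcs_ml_runtime": bool(os.environ.get(IN_ML_RUNTIME_ENV_VAR)),
        }

    print(build_pipeline_telemetry(True))
    # e.g. {'pipeline_is_convertible_to_sklearn': True, 'in_spcs_ml_runtime': False}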
@@ -586,7 +586,14 @@ class PolynomialFeatures(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-        output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
@@ -590,7 +590,14 @@ class LabelPropagation(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-        output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col:
@@ -599,7 +599,14 @@ class LabelSpreading(BaseTransformer):
     ) -> List[str]:
         # in case the inferred output column names dimension is different
         # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
-        output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+        sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
+
+        # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
+        # seen during the fit.
+        snowpark_column_names = dataset.select(self.input_cols).columns
+        sample_pd_df.columns = snowpark_column_names
+
+        output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
         output_df_columns = list(output_df_pd.columns)
         output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
         if self.sample_weight_col: