snowflake-ml-python 1.5.1__py3-none-any.whl → 1.5.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (207)
  1. snowflake/cortex/_complete.py +26 -5
  2. snowflake/cortex/_sentiment.py +7 -4
  3. snowflake/cortex/_sse_client.py +81 -0
  4. snowflake/cortex/_util.py +105 -8
  5. snowflake/ml/_internal/lineage/lineage_utils.py +34 -25
  6. snowflake/ml/_internal/utils/temp_file_utils.py +5 -2
  7. snowflake/ml/dataset/dataset.py +15 -12
  8. snowflake/ml/dataset/dataset_factory.py +3 -4
  9. snowflake/ml/feature_store/access_manager.py +34 -30
  10. snowflake/ml/feature_store/feature_store.py +3 -3
  11. snowflake/ml/feature_store/feature_view.py +12 -11
  12. snowflake/ml/fileset/snowfs.py +2 -31
  13. snowflake/ml/model/_client/ops/model_ops.py +43 -0
  14. snowflake/ml/model/_client/sql/model_version.py +55 -3
  15. snowflake/ml/model/_model_composer/model_composer.py +7 -3
  16. snowflake/ml/model/_model_composer/model_method/infer_table_function.py_template +3 -1
  17. snowflake/ml/model/_packager/model_meta/_core_requirements.py +1 -1
  18. snowflake/ml/model/_packager/model_meta/model_meta.py +1 -3
  19. snowflake/ml/model/_packager/model_runtime/_snowml_inference_alternative_requirements.py +1 -1
  20. snowflake/ml/model/_packager/model_runtime/model_runtime.py +3 -27
  21. snowflake/ml/model/_signatures/builtins_handler.py +2 -1
  22. snowflake/ml/model/_signatures/core.py +13 -1
  23. snowflake/ml/model/_signatures/pandas_handler.py +2 -0
  24. snowflake/ml/model/_signatures/snowpark_handler.py +3 -3
  25. snowflake/ml/model/model_signature.py +2 -0
  26. snowflake/ml/model/type_hints.py +1 -0
  27. snowflake/ml/modeling/_internal/estimator_utils.py +58 -1
  28. snowflake/ml/modeling/_internal/snowpark_implementations/distributed_hpo_trainer.py +196 -242
  29. snowflake/ml/modeling/_internal/snowpark_implementations/distributed_search_udf_file.py +161 -0
  30. snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_handlers.py +38 -18
  31. snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_trainer.py +82 -134
  32. snowflake/ml/modeling/_internal/snowpark_implementations/xgboost_external_memory_trainer.py +21 -17
  33. snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +9 -2
  34. snowflake/ml/modeling/cluster/affinity_propagation.py +9 -2
  35. snowflake/ml/modeling/cluster/agglomerative_clustering.py +9 -2
  36. snowflake/ml/modeling/cluster/birch.py +9 -2
  37. snowflake/ml/modeling/cluster/bisecting_k_means.py +9 -2
  38. snowflake/ml/modeling/cluster/dbscan.py +9 -2
  39. snowflake/ml/modeling/cluster/feature_agglomeration.py +9 -2
  40. snowflake/ml/modeling/cluster/k_means.py +9 -2
  41. snowflake/ml/modeling/cluster/mean_shift.py +9 -2
  42. snowflake/ml/modeling/cluster/mini_batch_k_means.py +9 -2
  43. snowflake/ml/modeling/cluster/optics.py +9 -2
  44. snowflake/ml/modeling/cluster/spectral_biclustering.py +9 -2
  45. snowflake/ml/modeling/cluster/spectral_clustering.py +9 -2
  46. snowflake/ml/modeling/cluster/spectral_coclustering.py +9 -2
  47. snowflake/ml/modeling/compose/column_transformer.py +9 -2
  48. snowflake/ml/modeling/compose/transformed_target_regressor.py +9 -2
  49. snowflake/ml/modeling/covariance/elliptic_envelope.py +9 -2
  50. snowflake/ml/modeling/covariance/empirical_covariance.py +9 -2
  51. snowflake/ml/modeling/covariance/graphical_lasso.py +9 -2
  52. snowflake/ml/modeling/covariance/graphical_lasso_cv.py +9 -2
  53. snowflake/ml/modeling/covariance/ledoit_wolf.py +9 -2
  54. snowflake/ml/modeling/covariance/min_cov_det.py +9 -2
  55. snowflake/ml/modeling/covariance/oas.py +9 -2
  56. snowflake/ml/modeling/covariance/shrunk_covariance.py +9 -2
  57. snowflake/ml/modeling/decomposition/dictionary_learning.py +9 -2
  58. snowflake/ml/modeling/decomposition/factor_analysis.py +9 -2
  59. snowflake/ml/modeling/decomposition/fast_ica.py +9 -2
  60. snowflake/ml/modeling/decomposition/incremental_pca.py +9 -2
  61. snowflake/ml/modeling/decomposition/kernel_pca.py +9 -2
  62. snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +9 -2
  63. snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +9 -2
  64. snowflake/ml/modeling/decomposition/pca.py +9 -2
  65. snowflake/ml/modeling/decomposition/sparse_pca.py +9 -2
  66. snowflake/ml/modeling/decomposition/truncated_svd.py +9 -2
  67. snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +9 -2
  68. snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +9 -2
  69. snowflake/ml/modeling/ensemble/ada_boost_classifier.py +9 -2
  70. snowflake/ml/modeling/ensemble/ada_boost_regressor.py +9 -2
  71. snowflake/ml/modeling/ensemble/bagging_classifier.py +9 -2
  72. snowflake/ml/modeling/ensemble/bagging_regressor.py +9 -2
  73. snowflake/ml/modeling/ensemble/extra_trees_classifier.py +9 -2
  74. snowflake/ml/modeling/ensemble/extra_trees_regressor.py +9 -2
  75. snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +9 -2
  76. snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +9 -2
  77. snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +9 -2
  78. snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +9 -2
  79. snowflake/ml/modeling/ensemble/isolation_forest.py +9 -2
  80. snowflake/ml/modeling/ensemble/random_forest_classifier.py +9 -2
  81. snowflake/ml/modeling/ensemble/random_forest_regressor.py +9 -2
  82. snowflake/ml/modeling/ensemble/stacking_regressor.py +9 -2
  83. snowflake/ml/modeling/ensemble/voting_classifier.py +9 -2
  84. snowflake/ml/modeling/ensemble/voting_regressor.py +9 -2
  85. snowflake/ml/modeling/feature_selection/generic_univariate_select.py +9 -2
  86. snowflake/ml/modeling/feature_selection/select_fdr.py +9 -2
  87. snowflake/ml/modeling/feature_selection/select_fpr.py +9 -2
  88. snowflake/ml/modeling/feature_selection/select_fwe.py +9 -2
  89. snowflake/ml/modeling/feature_selection/select_k_best.py +9 -2
  90. snowflake/ml/modeling/feature_selection/select_percentile.py +9 -2
  91. snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +9 -2
  92. snowflake/ml/modeling/feature_selection/variance_threshold.py +9 -2
  93. snowflake/ml/modeling/framework/base.py +3 -8
  94. snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +9 -2
  95. snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +9 -2
  96. snowflake/ml/modeling/impute/iterative_imputer.py +9 -2
  97. snowflake/ml/modeling/impute/knn_imputer.py +9 -2
  98. snowflake/ml/modeling/impute/missing_indicator.py +9 -2
  99. snowflake/ml/modeling/impute/simple_imputer.py +28 -5
  100. snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +9 -2
  101. snowflake/ml/modeling/kernel_approximation/nystroem.py +9 -2
  102. snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +9 -2
  103. snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +9 -2
  104. snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +9 -2
  105. snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +9 -2
  106. snowflake/ml/modeling/lightgbm/lgbm_classifier.py +9 -2
  107. snowflake/ml/modeling/lightgbm/lgbm_regressor.py +9 -2
  108. snowflake/ml/modeling/linear_model/ard_regression.py +9 -2
  109. snowflake/ml/modeling/linear_model/bayesian_ridge.py +9 -2
  110. snowflake/ml/modeling/linear_model/elastic_net.py +9 -2
  111. snowflake/ml/modeling/linear_model/elastic_net_cv.py +9 -2
  112. snowflake/ml/modeling/linear_model/gamma_regressor.py +9 -2
  113. snowflake/ml/modeling/linear_model/huber_regressor.py +9 -2
  114. snowflake/ml/modeling/linear_model/lars.py +9 -2
  115. snowflake/ml/modeling/linear_model/lars_cv.py +9 -2
  116. snowflake/ml/modeling/linear_model/lasso.py +9 -2
  117. snowflake/ml/modeling/linear_model/lasso_cv.py +9 -2
  118. snowflake/ml/modeling/linear_model/lasso_lars.py +9 -2
  119. snowflake/ml/modeling/linear_model/lasso_lars_cv.py +9 -2
  120. snowflake/ml/modeling/linear_model/lasso_lars_ic.py +9 -2
  121. snowflake/ml/modeling/linear_model/linear_regression.py +9 -2
  122. snowflake/ml/modeling/linear_model/logistic_regression.py +9 -2
  123. snowflake/ml/modeling/linear_model/logistic_regression_cv.py +9 -2
  124. snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +9 -2
  125. snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +9 -2
  126. snowflake/ml/modeling/linear_model/multi_task_lasso.py +9 -2
  127. snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +9 -2
  128. snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +9 -2
  129. snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +9 -2
  130. snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +9 -2
  131. snowflake/ml/modeling/linear_model/perceptron.py +9 -2
  132. snowflake/ml/modeling/linear_model/poisson_regressor.py +9 -2
  133. snowflake/ml/modeling/linear_model/ransac_regressor.py +9 -2
  134. snowflake/ml/modeling/linear_model/ridge.py +9 -2
  135. snowflake/ml/modeling/linear_model/ridge_classifier.py +9 -2
  136. snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +9 -2
  137. snowflake/ml/modeling/linear_model/ridge_cv.py +9 -2
  138. snowflake/ml/modeling/linear_model/sgd_classifier.py +9 -2
  139. snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +9 -2
  140. snowflake/ml/modeling/linear_model/sgd_regressor.py +9 -2
  141. snowflake/ml/modeling/linear_model/theil_sen_regressor.py +9 -2
  142. snowflake/ml/modeling/linear_model/tweedie_regressor.py +9 -2
  143. snowflake/ml/modeling/manifold/isomap.py +9 -2
  144. snowflake/ml/modeling/manifold/mds.py +9 -2
  145. snowflake/ml/modeling/manifold/spectral_embedding.py +9 -2
  146. snowflake/ml/modeling/manifold/tsne.py +9 -2
  147. snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +9 -2
  148. snowflake/ml/modeling/mixture/gaussian_mixture.py +9 -2
  149. snowflake/ml/modeling/model_selection/grid_search_cv.py +1 -5
  150. snowflake/ml/modeling/model_selection/randomized_search_cv.py +1 -5
  151. snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +9 -2
  152. snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +9 -2
  153. snowflake/ml/modeling/multiclass/output_code_classifier.py +9 -2
  154. snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +9 -2
  155. snowflake/ml/modeling/naive_bayes/categorical_nb.py +9 -2
  156. snowflake/ml/modeling/naive_bayes/complement_nb.py +9 -2
  157. snowflake/ml/modeling/naive_bayes/gaussian_nb.py +9 -2
  158. snowflake/ml/modeling/naive_bayes/multinomial_nb.py +9 -2
  159. snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +9 -2
  160. snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +9 -2
  161. snowflake/ml/modeling/neighbors/kernel_density.py +9 -2
  162. snowflake/ml/modeling/neighbors/local_outlier_factor.py +9 -2
  163. snowflake/ml/modeling/neighbors/nearest_centroid.py +9 -2
  164. snowflake/ml/modeling/neighbors/nearest_neighbors.py +9 -2
  165. snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +9 -2
  166. snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +9 -2
  167. snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +9 -2
  168. snowflake/ml/modeling/neural_network/bernoulli_rbm.py +9 -2
  169. snowflake/ml/modeling/neural_network/mlp_classifier.py +9 -2
  170. snowflake/ml/modeling/neural_network/mlp_regressor.py +9 -2
  171. snowflake/ml/modeling/parameters/enable_anonymous_sproc.py +5 -0
  172. snowflake/ml/modeling/pipeline/pipeline.py +5 -0
  173. snowflake/ml/modeling/preprocessing/binarizer.py +7 -3
  174. snowflake/ml/modeling/preprocessing/k_bins_discretizer.py +7 -2
  175. snowflake/ml/modeling/preprocessing/label_encoder.py +8 -7
  176. snowflake/ml/modeling/preprocessing/max_abs_scaler.py +7 -3
  177. snowflake/ml/modeling/preprocessing/min_max_scaler.py +7 -4
  178. snowflake/ml/modeling/preprocessing/normalizer.py +7 -3
  179. snowflake/ml/modeling/preprocessing/one_hot_encoder.py +10 -2
  180. snowflake/ml/modeling/preprocessing/ordinal_encoder.py +8 -5
  181. snowflake/ml/modeling/preprocessing/polynomial_features.py +9 -2
  182. snowflake/ml/modeling/preprocessing/robust_scaler.py +7 -4
  183. snowflake/ml/modeling/preprocessing/standard_scaler.py +7 -3
  184. snowflake/ml/modeling/semi_supervised/label_propagation.py +9 -2
  185. snowflake/ml/modeling/semi_supervised/label_spreading.py +9 -2
  186. snowflake/ml/modeling/svm/linear_svc.py +9 -2
  187. snowflake/ml/modeling/svm/linear_svr.py +9 -2
  188. snowflake/ml/modeling/svm/nu_svc.py +9 -2
  189. snowflake/ml/modeling/svm/nu_svr.py +9 -2
  190. snowflake/ml/modeling/svm/svc.py +9 -2
  191. snowflake/ml/modeling/svm/svr.py +9 -2
  192. snowflake/ml/modeling/tree/decision_tree_classifier.py +9 -2
  193. snowflake/ml/modeling/tree/decision_tree_regressor.py +9 -2
  194. snowflake/ml/modeling/tree/extra_tree_classifier.py +9 -2
  195. snowflake/ml/modeling/tree/extra_tree_regressor.py +9 -2
  196. snowflake/ml/modeling/xgboost/xgb_classifier.py +9 -2
  197. snowflake/ml/modeling/xgboost/xgb_regressor.py +9 -2
  198. snowflake/ml/modeling/xgboost/xgbrf_classifier.py +9 -2
  199. snowflake/ml/modeling/xgboost/xgbrf_regressor.py +9 -2
  200. snowflake/ml/registry/_manager/model_manager.py +59 -1
  201. snowflake/ml/registry/registry.py +10 -1
  202. snowflake/ml/version.py +1 -1
  203. {snowflake_ml_python-1.5.1.dist-info → snowflake_ml_python-1.5.3.dist-info}/METADATA +32 -4
  204. {snowflake_ml_python-1.5.1.dist-info → snowflake_ml_python-1.5.3.dist-info}/RECORD +207 -204
  205. {snowflake_ml_python-1.5.1.dist-info → snowflake_ml_python-1.5.3.dist-info}/LICENSE.txt +0 -0
  206. {snowflake_ml_python-1.5.1.dist-info → snowflake_ml_python-1.5.3.dist-info}/WHEEL +0 -0
  207. {snowflake_ml_python-1.5.1.dist-info → snowflake_ml_python-1.5.3.dist-info}/top_level.txt +0 -0
@@ -257,7 +257,7 @@ class CategoricalNB(BaseTransformer):
257
257
  inspect.currentframe(), CategoricalNB.__class__.__name__
258
258
  ),
259
259
  api_calls=[Session.call],
260
- custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
260
+ custom_tags={"autogen": True} if self._autogenerated else None,
261
261
  )
262
262
  pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
263
263
  pd_df.columns = dataset.columns
@@ -590,7 +590,14 @@ class CategoricalNB(BaseTransformer):
590
590
  ) -> List[str]:
591
591
  # in case the inferred output column names dimension is different
592
592
  # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
593
- output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
593
+ sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
594
+
595
+ # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
596
+ # seen during the fit.
597
+ snowpark_column_names = dataset.select(self.input_cols).columns
598
+ sample_pd_df.columns = snowpark_column_names
599
+
600
+ output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
594
601
  output_df_columns = list(output_df_pd.columns)
595
602
  output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
596
603
  if self.sample_weight_col:
@@ -251,7 +251,7 @@ class ComplementNB(BaseTransformer):
251
251
  inspect.currentframe(), ComplementNB.__class__.__name__
252
252
  ),
253
253
  api_calls=[Session.call],
254
- custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
254
+ custom_tags={"autogen": True} if self._autogenerated else None,
255
255
  )
256
256
  pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
257
257
  pd_df.columns = dataset.columns
@@ -584,7 +584,14 @@ class ComplementNB(BaseTransformer):
584
584
  ) -> List[str]:
585
585
  # in case the inferred output column names dimension is different
586
586
  # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
587
- output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
587
+ sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
588
+
589
+ # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
590
+ # seen during the fit.
591
+ snowpark_column_names = dataset.select(self.input_cols).columns
592
+ sample_pd_df.columns = snowpark_column_names
593
+
594
+ output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
588
595
  output_df_columns = list(output_df_pd.columns)
589
596
  output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
590
597
  if self.sample_weight_col:
@@ -232,7 +232,7 @@ class GaussianNB(BaseTransformer):
232
232
  inspect.currentframe(), GaussianNB.__class__.__name__
233
233
  ),
234
234
  api_calls=[Session.call],
235
- custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
235
+ custom_tags={"autogen": True} if self._autogenerated else None,
236
236
  )
237
237
  pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
238
238
  pd_df.columns = dataset.columns
@@ -565,7 +565,14 @@ class GaussianNB(BaseTransformer):
565
565
  ) -> List[str]:
566
566
  # in case the inferred output column names dimension is different
567
567
  # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
568
- output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
568
+ sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
569
+
570
+ # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
571
+ # seen during the fit.
572
+ snowpark_column_names = dataset.select(self.input_cols).columns
573
+ sample_pd_df.columns = snowpark_column_names
574
+
575
+ output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
569
576
  output_df_columns = list(output_df_pd.columns)
570
577
  output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
571
578
  if self.sample_weight_col:
@@ -245,7 +245,7 @@ class MultinomialNB(BaseTransformer):
245
245
  inspect.currentframe(), MultinomialNB.__class__.__name__
246
246
  ),
247
247
  api_calls=[Session.call],
248
- custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
248
+ custom_tags={"autogen": True} if self._autogenerated else None,
249
249
  )
250
250
  pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
251
251
  pd_df.columns = dataset.columns
@@ -578,7 +578,14 @@ class MultinomialNB(BaseTransformer):
578
578
  ) -> List[str]:
579
579
  # in case the inferred output column names dimension is different
580
580
  # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
581
- output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
581
+ sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
582
+
583
+ # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
584
+ # seen during the fit.
585
+ snowpark_column_names = dataset.select(self.input_cols).columns
586
+ sample_pd_df.columns = snowpark_column_names
587
+
588
+ output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
582
589
  output_df_columns = list(output_df_pd.columns)
583
590
  output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
584
591
  if self.sample_weight_col:
@@ -302,7 +302,7 @@ class KNeighborsClassifier(BaseTransformer):
302
302
  inspect.currentframe(), KNeighborsClassifier.__class__.__name__
303
303
  ),
304
304
  api_calls=[Session.call],
305
- custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
305
+ custom_tags={"autogen": True} if self._autogenerated else None,
306
306
  )
307
307
  pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
308
308
  pd_df.columns = dataset.columns
@@ -635,7 +635,14 @@ class KNeighborsClassifier(BaseTransformer):
635
635
  ) -> List[str]:
636
636
  # in case the inferred output column names dimension is different
637
637
  # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
638
- output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
638
+ sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
639
+
640
+ # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
641
+ # seen during the fit.
642
+ snowpark_column_names = dataset.select(self.input_cols).columns
643
+ sample_pd_df.columns = snowpark_column_names
644
+
645
+ output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
639
646
  output_df_columns = list(output_df_pd.columns)
640
647
  output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
641
648
  if self.sample_weight_col:
@@ -304,7 +304,7 @@ class KNeighborsRegressor(BaseTransformer):
304
304
  inspect.currentframe(), KNeighborsRegressor.__class__.__name__
305
305
  ),
306
306
  api_calls=[Session.call],
307
- custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
307
+ custom_tags={"autogen": True} if self._autogenerated else None,
308
308
  )
309
309
  pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
310
310
  pd_df.columns = dataset.columns
@@ -637,7 +637,14 @@ class KNeighborsRegressor(BaseTransformer):
637
637
  ) -> List[str]:
638
638
  # in case the inferred output column names dimension is different
639
639
  # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
640
- output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
640
+ sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
641
+
642
+ # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
643
+ # seen during the fit.
644
+ snowpark_column_names = dataset.select(self.input_cols).columns
645
+ sample_pd_df.columns = snowpark_column_names
646
+
647
+ output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
641
648
  output_df_columns = list(output_df_pd.columns)
642
649
  output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
643
650
  if self.sample_weight_col:
@@ -281,7 +281,7 @@ class KernelDensity(BaseTransformer):
281
281
  inspect.currentframe(), KernelDensity.__class__.__name__
282
282
  ),
283
283
  api_calls=[Session.call],
284
- custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
284
+ custom_tags={"autogen": True} if self._autogenerated else None,
285
285
  )
286
286
  pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
287
287
  pd_df.columns = dataset.columns
@@ -612,7 +612,14 @@ class KernelDensity(BaseTransformer):
612
612
  ) -> List[str]:
613
613
  # in case the inferred output column names dimension is different
614
614
  # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
615
- output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
615
+ sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
616
+
617
+ # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
618
+ # seen during the fit.
619
+ snowpark_column_names = dataset.select(self.input_cols).columns
620
+ sample_pd_df.columns = snowpark_column_names
621
+
622
+ output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
616
623
  output_df_columns = list(output_df_pd.columns)
617
624
  output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
618
625
  if self.sample_weight_col:
@@ -309,7 +309,7 @@ class LocalOutlierFactor(BaseTransformer):
309
309
  inspect.currentframe(), LocalOutlierFactor.__class__.__name__
310
310
  ),
311
311
  api_calls=[Session.call],
312
- custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
312
+ custom_tags={"autogen": True} if self._autogenerated else None,
313
313
  )
314
314
  pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
315
315
  pd_df.columns = dataset.columns
@@ -644,7 +644,14 @@ class LocalOutlierFactor(BaseTransformer):
644
644
  ) -> List[str]:
645
645
  # in case the inferred output column names dimension is different
646
646
  # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
647
- output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
647
+ sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
648
+
649
+ # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
650
+ # seen during the fit.
651
+ snowpark_column_names = dataset.select(self.input_cols).columns
652
+ sample_pd_df.columns = snowpark_column_names
653
+
654
+ output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
648
655
  output_df_columns = list(output_df_pd.columns)
649
656
  output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
650
657
  if self.sample_weight_col:
@@ -242,7 +242,7 @@ class NearestCentroid(BaseTransformer):
242
242
  inspect.currentframe(), NearestCentroid.__class__.__name__
243
243
  ),
244
244
  api_calls=[Session.call],
245
- custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
245
+ custom_tags={"autogen": True} if self._autogenerated else None,
246
246
  )
247
247
  pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
248
248
  pd_df.columns = dataset.columns
@@ -575,7 +575,14 @@ class NearestCentroid(BaseTransformer):
575
575
  ) -> List[str]:
576
576
  # in case the inferred output column names dimension is different
577
577
  # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
578
- output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
578
+ sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
579
+
580
+ # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
581
+ # seen during the fit.
582
+ snowpark_column_names = dataset.select(self.input_cols).columns
583
+ sample_pd_df.columns = snowpark_column_names
584
+
585
+ output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
579
586
  output_df_columns = list(output_df_pd.columns)
580
587
  output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
581
588
  if self.sample_weight_col:
@@ -292,7 +292,7 @@ class NearestNeighbors(BaseTransformer):
292
292
  inspect.currentframe(), NearestNeighbors.__class__.__name__
293
293
  ),
294
294
  api_calls=[Session.call],
295
- custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
295
+ custom_tags={"autogen": True} if self._autogenerated else None,
296
296
  )
297
297
  pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
298
298
  pd_df.columns = dataset.columns
@@ -623,7 +623,14 @@ class NearestNeighbors(BaseTransformer):
623
623
  ) -> List[str]:
624
624
  # in case the inferred output column names dimension is different
625
625
  # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
626
- output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
626
+ sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
627
+
628
+ # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
629
+ # seen during the fit.
630
+ snowpark_column_names = dataset.select(self.input_cols).columns
631
+ sample_pd_df.columns = snowpark_column_names
632
+
633
+ output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
627
634
  output_df_columns = list(output_df_pd.columns)
628
635
  output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
629
636
  if self.sample_weight_col:
@@ -313,7 +313,7 @@ class NeighborhoodComponentsAnalysis(BaseTransformer):
313
313
  inspect.currentframe(), NeighborhoodComponentsAnalysis.__class__.__name__
314
314
  ),
315
315
  api_calls=[Session.call],
316
- custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
316
+ custom_tags={"autogen": True} if self._autogenerated else None,
317
317
  )
318
318
  pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
319
319
  pd_df.columns = dataset.columns
@@ -648,7 +648,14 @@ class NeighborhoodComponentsAnalysis(BaseTransformer):
648
648
  ) -> List[str]:
649
649
  # in case the inferred output column names dimension is different
650
650
  # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
651
- output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
651
+ sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
652
+
653
+ # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
654
+ # seen during the fit.
655
+ snowpark_column_names = dataset.select(self.input_cols).columns
656
+ sample_pd_df.columns = snowpark_column_names
657
+
658
+ output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
652
659
  output_df_columns = list(output_df_pd.columns)
653
660
  output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
654
661
  if self.sample_weight_col:
@@ -314,7 +314,7 @@ class RadiusNeighborsClassifier(BaseTransformer):
314
314
  inspect.currentframe(), RadiusNeighborsClassifier.__class__.__name__
315
315
  ),
316
316
  api_calls=[Session.call],
317
- custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
317
+ custom_tags={"autogen": True} if self._autogenerated else None,
318
318
  )
319
319
  pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
320
320
  pd_df.columns = dataset.columns
@@ -647,7 +647,14 @@ class RadiusNeighborsClassifier(BaseTransformer):
647
647
  ) -> List[str]:
648
648
  # in case the inferred output column names dimension is different
649
649
  # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
650
- output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
650
+ sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
651
+
652
+ # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
653
+ # seen during the fit.
654
+ snowpark_column_names = dataset.select(self.input_cols).columns
655
+ sample_pd_df.columns = snowpark_column_names
656
+
657
+ output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
651
658
  output_df_columns = list(output_df_pd.columns)
652
659
  output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
653
660
  if self.sample_weight_col:
@@ -304,7 +304,7 @@ class RadiusNeighborsRegressor(BaseTransformer):
304
304
  inspect.currentframe(), RadiusNeighborsRegressor.__class__.__name__
305
305
  ),
306
306
  api_calls=[Session.call],
307
- custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
307
+ custom_tags={"autogen": True} if self._autogenerated else None,
308
308
  )
309
309
  pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
310
310
  pd_df.columns = dataset.columns
@@ -637,7 +637,14 @@ class RadiusNeighborsRegressor(BaseTransformer):
637
637
  ) -> List[str]:
638
638
  # in case the inferred output column names dimension is different
639
639
  # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
640
- output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
640
+ sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
641
+
642
+ # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
643
+ # seen during the fit.
644
+ snowpark_column_names = dataset.select(self.input_cols).columns
645
+ sample_pd_df.columns = snowpark_column_names
646
+
647
+ output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
641
648
  output_df_columns = list(output_df_pd.columns)
642
649
  output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
643
650
  if self.sample_weight_col:
@@ -261,7 +261,7 @@ class BernoulliRBM(BaseTransformer):
261
261
  inspect.currentframe(), BernoulliRBM.__class__.__name__
262
262
  ),
263
263
  api_calls=[Session.call],
264
- custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
264
+ custom_tags={"autogen": True} if self._autogenerated else None,
265
265
  )
266
266
  pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
267
267
  pd_df.columns = dataset.columns
@@ -596,7 +596,14 @@ class BernoulliRBM(BaseTransformer):
596
596
  ) -> List[str]:
597
597
  # in case the inferred output column names dimension is different
598
598
  # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
599
- output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
599
+ sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
600
+
601
+ # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
602
+ # seen during the fit.
603
+ snowpark_column_names = dataset.select(self.input_cols).columns
604
+ sample_pd_df.columns = snowpark_column_names
605
+
606
+ output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
600
607
  output_df_columns = list(output_df_pd.columns)
601
608
  output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
602
609
  if self.sample_weight_col:
@@ -416,7 +416,7 @@ class MLPClassifier(BaseTransformer):
416
416
  inspect.currentframe(), MLPClassifier.__class__.__name__
417
417
  ),
418
418
  api_calls=[Session.call],
419
- custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
419
+ custom_tags={"autogen": True} if self._autogenerated else None,
420
420
  )
421
421
  pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
422
422
  pd_df.columns = dataset.columns
@@ -749,7 +749,14 @@ class MLPClassifier(BaseTransformer):
749
749
  ) -> List[str]:
750
750
  # in case the inferred output column names dimension is different
751
751
  # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
752
- output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
752
+ sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
753
+
754
+ # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
755
+ # seen during the fit.
756
+ snowpark_column_names = dataset.select(self.input_cols).columns
757
+ sample_pd_df.columns = snowpark_column_names
758
+
759
+ output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
753
760
  output_df_columns = list(output_df_pd.columns)
754
761
  output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
755
762
  if self.sample_weight_col:
@@ -412,7 +412,7 @@ class MLPRegressor(BaseTransformer):
412
412
  inspect.currentframe(), MLPRegressor.__class__.__name__
413
413
  ),
414
414
  api_calls=[Session.call],
415
- custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
415
+ custom_tags={"autogen": True} if self._autogenerated else None,
416
416
  )
417
417
  pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
418
418
  pd_df.columns = dataset.columns
@@ -745,7 +745,14 @@ class MLPRegressor(BaseTransformer):
745
745
  ) -> List[str]:
746
746
  # in case the inferred output column names dimension is different
747
747
  # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
748
- output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
748
+ sample_pd_df = dataset.select(self.input_cols).limit(1).to_pandas()
749
+
750
+ # Rename the pandas df column names to snowflake identifiers and reorder columns to match the order
751
+ # seen during the fit.
752
+ snowpark_column_names = dataset.select(self.input_cols).columns
753
+ sample_pd_df.columns = snowpark_column_names
754
+
755
+ output_df_pd = getattr(self, method)(sample_pd_df, output_cols_prefix)
749
756
  output_df_columns = list(output_df_pd.columns)
750
757
  output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
751
758
  if self.sample_weight_col:
@@ -0,0 +1,5 @@
1
+ """Enables the anonymous stored procedures for running modeling fit"""
2
+
3
+ from snowflake.ml.modeling._internal.snowpark_implementations import snowpark_trainer
4
+
5
+ snowpark_trainer._ENABLE_ANONYMOUS_SPROC = True
@@ -17,6 +17,7 @@ from sklearn.utils import metaestimators
17
17
  from snowflake import snowpark
18
18
  from snowflake.ml._internal import file_utils, telemetry
19
19
  from snowflake.ml._internal.exceptions import error_codes, exceptions
20
+ from snowflake.ml._internal.lineage import lineage_utils
20
21
  from snowflake.ml._internal.utils import snowpark_dataframe_utils, temp_file_utils
21
22
  from snowflake.ml.model.model_signature import ModelSignature, _infer_signature
22
23
  from snowflake.ml.modeling._internal.model_transformer_builder import (
@@ -427,6 +428,10 @@ class Pipeline(base.BaseTransformer):
427
428
  else dataset
428
429
  )
429
430
 
431
+ # Extract lineage information here since we're overriding fit() directly
432
+ data_sources = lineage_utils.get_data_sources(dataset)
433
+ lineage_utils.set_data_sources(self, data_sources)
434
+
430
435
  if self._can_be_trained_in_ml_runtime(dataset):
431
436
  if not self._is_convertible_to_sklearn:
432
437
  raise ValueError("This pipeline cannot be converted to an sklearn pipeline.")
@@ -25,11 +25,15 @@ class Binarizer(base.BaseTransformer):
25
25
  Feature values below or equal to this are replaced by 0, above it by 1. Default value is 0.0.
26
26
 
27
27
  input_cols: Optional[Union[str, Iterable[str]]], default=None
28
- The name(s) of one or more columns in a DataFrame containing a feature to be binarized.
28
+ The name(s) of one or more columns in the input DataFrame containing feature(s) to be binarized. Input
29
+ columns must be specified before transform with this argument or after initialization with the
30
+ `set_input_cols` method. This argument is optional for API consistency.
29
31
 
30
32
  output_cols: Optional[Union[str, Iterable[str]]], default=None
31
- The name(s) of one or more columns in a DataFrame in which results will be stored. The number of
32
- columns specified must match the number of input columns.
33
+ The name(s) to assign output columns in the output DataFrame. The number of
34
+ columns specified must equal the number of input columns. Output columns must be specified before transform
35
+ with this argument or after initialization with the `set_output_cols` method. This argument is optional for
36
+ API consistency.
33
37
 
34
38
  passthrough_cols: Optional[Union[str, Iterable[str]]], default=None
35
39
  A string or a list of strings indicating column names to be excluded from any
@@ -74,10 +74,15 @@ class KBinsDiscretizer(base.BaseTransformer):
74
74
  - 'quantile': All bins in each feature have the same number of points.
75
75
 
76
76
  input_cols: str or Iterable [column_name], default=None
77
- Single or multiple input columns.
77
+ The name(s) of one or more columns in the input DataFrame containing feature(s) to be discretized.
78
+ Input columns must be specified before fit with this argument or after initialization with the
79
+ `set_input_cols` method. This argument is optional for API consistency.
78
80
 
79
81
  output_cols: str or Iterable [column_name], default=None
80
- Single or multiple output columns.
82
+ The name(s) to assign output columns in the output DataFrame. The number of
83
+ columns specified must equal the number of input columns. Output columns must be specified before transform
84
+ with this argument or after initialization with the `set_output_cols` method. This argument is optional for
85
+ API consistency.
81
86
 
82
87
  passthrough_cols: A string or a list of strings indicating column names to be excluded from any
83
88
  operations (such as train, transform, or inference). These specified column(s)
@@ -25,11 +25,12 @@ class LabelEncoder(base.BaseTransformer):
25
25
 
26
26
  Args:
27
27
  input_cols: Optional[Union[str, List[str]]]
28
- The name of a column in a DataFrame to be encoded. May be a string or a list containing one string.
28
+ The name of a column or a list containing one column name to be encoded in the input DataFrame. There must
29
+ be exactly one input column specified before fit. This argument is optional for API consistency.
29
30
 
30
31
  output_cols: Optional[Union[str, List[str]]]
31
- The name of a column in a DataFrame where the results will be stored. May be a string or a list
32
- containing one string.
32
+ The name of a column or a list containing one column name where the results will be stored. There must be
33
+ exactly one output column specified before transform. This argument is optional for API consistency.
33
34
 
34
35
  passthrough_cols: Optional[Union[str, List[str]]]
35
36
  A string or a list of strings indicating column names to be excluded from any
@@ -54,11 +55,11 @@ class LabelEncoder(base.BaseTransformer):
54
55
 
55
56
  Args:
56
57
  input_cols: Optional[Union[str, List[str]]]
57
- The name of a column in a DataFrame to be encoded. May be a string or a list containing one
58
- string.
58
+ The name of a column or a list containing one column name to be encoded in the input DataFrame. There
59
+ must be exactly one input column specified before fit. This argument is optional for API consistency.
59
60
  output_cols: Optional[Union[str, List[str]]]
60
- The name of a column in a DataFrame where the results will be stored. May be a string or a list
61
- containing one string.
61
+ The name of a column or a list containing one column name where the results will be stored. There must
62
+ be exactly one output column specified before transform. This argument is optional for API consistency.
62
63
  passthrough_cols: Optional[Union[str, List[str]]]
63
64
  A string or a list of strings indicating column names to be excluded from any
64
65
  operations (such as train, transform, or inference). These specified column(s)
@@ -28,11 +28,15 @@ class MaxAbsScaler(base.BaseTransformer):
28
28
 
29
29
  Args:
30
30
  input_cols: Optional[Union[str, List[str]]], default=None
31
- The name(s) of one or more columns in a DataFrame containing a feature to be scaled.
31
+ The name(s) of one or more columns in the input DataFrame containing feature(s) to be scaled. Input
32
+ columns must be specified before fit with this argument or after initialization with the
33
+ `set_input_cols` method. This argument is optional for API consistency.
32
34
 
33
35
  output_cols: Optional[Union[str, List[str]]], default=None
34
- The name(s) of one or more columns in a DataFrame in which results will be stored. The number of
35
- columns specified must match the number of input columns.
36
+ The name(s) to assign output columns in the output DataFrame. The number of
37
+ columns specified must equal the number of input columns. Output columns must be specified before transform
38
+ with this argument or after initialization with the `set_output_cols` method. This argument is optional for
39
+ API consistency.
36
40
 
37
41
  passthrough_cols: Optional[Union[str, List[str]]], default=None
38
42
  A string or a list of strings indicating column names to be excluded from any
@@ -29,12 +29,15 @@ class MinMaxScaler(base.BaseTransformer):
29
29
  Whether to clip transformed values of held-out data to the specified feature range (default is True).
30
30
 
31
31
  input_cols: Optional[Union[str, List[str]]], default=None
32
- The name(s) of one or more columns in a DataFrame containing a feature to be scaled. Each specified
33
- input column is scaled independently and stored in the corresponding output column.
32
+ The name(s) of one or more columns in the input DataFrame containing feature(s) to be scaled. Input
33
+ columns must be specified before fit with this argument or after initialization with the
34
+ `set_input_cols` method. This argument is optional for API consistency.
34
35
 
35
36
  output_cols: Optional[Union[str, List[str]]], default=None
36
- The name(s) of one or more columns in a DataFrame in which results will be stored. The number of
37
- columns specified must match the number of input columns.
37
+ The name(s) to assign output columns in the output DataFrame. The number of
38
+ columns specified must equal the number of input columns. Output columns must be specified before transform
39
+ with this argument or after initialization with the `set_output_cols` method. This argument is optional for
40
+ API consistency.
38
41
 
39
42
  passthrough_cols: Optional[Union[str, List[str]]], default=None
40
43
  A string or a list of strings indicating column names to be excluded from any
@@ -28,11 +28,15 @@ class Normalizer(base.BaseTransformer):
28
28
  values. It must be one of 'l1', 'l2', or 'max'.
29
29
 
30
30
  input_cols: Optional[Union[str, List[str]]]
31
- Columns to use as inputs during transform.
31
+ The name(s) of one or more columns in the input DataFrame containing feature(s) to be normalized. Input
32
+ columns must be specified before transform with this argument or after initialization with the
33
+ `set_input_cols` method. This argument is optional for API consistency.
32
34
 
33
35
  output_cols: Optional[Union[str, List[str]]]
34
- A string or list of strings representing column names that will store the output of transform operation.
35
- The length of `output_cols` must equal the length of `input_cols`.
36
+ The name(s) to assign output columns in the output DataFrame. The number of
37
+ columns specified must equal the number of input columns. Output columns must be specified before transform
38
+ with this argument or after initialization with the `set_output_cols` method. This argument is optional for
39
+ API consistency.
36
40
 
37
41
  passthrough_cols: Optional[Union[str, List[str]]]
38
42
  A string or a list of strings indicating column names to be excluded from any
@@ -157,10 +157,18 @@ class OneHotEncoder(base.BaseTransformer):
157
157
  there is no limit to the number of output features.
158
158
 
159
159
  input_cols: Optional[Union[str, List[str]]], default=None
160
- Single or multiple input columns.
160
+ The name(s) of one or more columns in the input DataFrame containing feature(s) to be encoded. Input
161
+ columns must be specified before fit with this argument or after initialization with the
162
+ `set_input_cols` method. This argument is optional for API consistency.
161
163
 
162
164
  output_cols: Optional[Union[str, List[str]]], default=None
163
- Single or multiple output columns.
165
+ The prefix to be used for encoded output for each input column. The number of
166
+ output column prefixes specified must match the number of input columns. Output column prefixes must be
167
+ specified before transform with this argument or after initialization with the `set_output_cols` method.
168
+
169
+ Note: Dense output column names are case-sensitive and resolve identifiers following Snowflake rules, e.g.
170
+ `"PREFIX_a"`, `PREFIX_A`, `"prefix_A"`. Therefore, there is no need to provide double-quoted column names
171
+ as that would result in invalid identifiers.
164
172
 
165
173
  passthrough_cols: Optional[Union[str, List[str]]]
166
174
  A string or a list of strings indicating column names to be excluded from any