snowflake-ml-python 1.5.2__py3-none-any.whl → 1.5.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (196)
  1. snowflake/cortex/_complete.py +26 -5
  2. snowflake/cortex/_sse_client.py +81 -0
  3. snowflake/cortex/_util.py +105 -8
  4. snowflake/ml/_internal/lineage/lineage_utils.py +34 -25
  5. snowflake/ml/dataset/dataset.py +15 -12
  6. snowflake/ml/dataset/dataset_factory.py +3 -4
  7. snowflake/ml/feature_store/feature_store.py +2 -2
  8. snowflake/ml/model/_client/sql/model_version.py +2 -2
  9. snowflake/ml/model/_model_composer/model_composer.py +2 -2
  10. snowflake/ml/model/_model_composer/model_method/infer_table_function.py_template +3 -1
  11. snowflake/ml/model/_packager/model_meta/_core_requirements.py +1 -1
  12. snowflake/ml/model/_packager/model_runtime/_snowml_inference_alternative_requirements.py +1 -1
  13. snowflake/ml/model/_signatures/builtins_handler.py +2 -1
  14. snowflake/ml/model/_signatures/core.py +13 -1
  15. snowflake/ml/model/_signatures/pandas_handler.py +2 -0
  16. snowflake/ml/model/_signatures/snowpark_handler.py +3 -3
  17. snowflake/ml/model/model_signature.py +2 -0
  18. snowflake/ml/model/type_hints.py +1 -0
  19. snowflake/ml/modeling/_internal/estimator_utils.py +58 -1
  20. snowflake/ml/modeling/_internal/snowpark_implementations/distributed_hpo_trainer.py +156 -121
  21. snowflake/ml/modeling/_internal/snowpark_implementations/distributed_search_udf_file.py +2 -0
  22. snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_handlers.py +38 -18
  23. snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_trainer.py +82 -134
  24. snowflake/ml/modeling/_internal/snowpark_implementations/xgboost_external_memory_trainer.py +21 -17
  25. snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +1 -1
  26. snowflake/ml/modeling/cluster/affinity_propagation.py +1 -1
  27. snowflake/ml/modeling/cluster/agglomerative_clustering.py +1 -1
  28. snowflake/ml/modeling/cluster/birch.py +1 -1
  29. snowflake/ml/modeling/cluster/bisecting_k_means.py +1 -1
  30. snowflake/ml/modeling/cluster/dbscan.py +1 -1
  31. snowflake/ml/modeling/cluster/feature_agglomeration.py +1 -1
  32. snowflake/ml/modeling/cluster/k_means.py +1 -1
  33. snowflake/ml/modeling/cluster/mean_shift.py +1 -1
  34. snowflake/ml/modeling/cluster/mini_batch_k_means.py +1 -1
  35. snowflake/ml/modeling/cluster/optics.py +1 -1
  36. snowflake/ml/modeling/cluster/spectral_biclustering.py +1 -1
  37. snowflake/ml/modeling/cluster/spectral_clustering.py +1 -1
  38. snowflake/ml/modeling/cluster/spectral_coclustering.py +1 -1
  39. snowflake/ml/modeling/compose/column_transformer.py +1 -1
  40. snowflake/ml/modeling/compose/transformed_target_regressor.py +1 -1
  41. snowflake/ml/modeling/covariance/elliptic_envelope.py +1 -1
  42. snowflake/ml/modeling/covariance/empirical_covariance.py +1 -1
  43. snowflake/ml/modeling/covariance/graphical_lasso.py +1 -1
  44. snowflake/ml/modeling/covariance/graphical_lasso_cv.py +1 -1
  45. snowflake/ml/modeling/covariance/ledoit_wolf.py +1 -1
  46. snowflake/ml/modeling/covariance/min_cov_det.py +1 -1
  47. snowflake/ml/modeling/covariance/oas.py +1 -1
  48. snowflake/ml/modeling/covariance/shrunk_covariance.py +1 -1
  49. snowflake/ml/modeling/decomposition/dictionary_learning.py +1 -1
  50. snowflake/ml/modeling/decomposition/factor_analysis.py +1 -1
  51. snowflake/ml/modeling/decomposition/fast_ica.py +1 -1
  52. snowflake/ml/modeling/decomposition/incremental_pca.py +1 -1
  53. snowflake/ml/modeling/decomposition/kernel_pca.py +1 -1
  54. snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +1 -1
  55. snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +1 -1
  56. snowflake/ml/modeling/decomposition/pca.py +1 -1
  57. snowflake/ml/modeling/decomposition/sparse_pca.py +1 -1
  58. snowflake/ml/modeling/decomposition/truncated_svd.py +1 -1
  59. snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +1 -1
  60. snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +1 -1
  61. snowflake/ml/modeling/ensemble/ada_boost_classifier.py +1 -1
  62. snowflake/ml/modeling/ensemble/ada_boost_regressor.py +1 -1
  63. snowflake/ml/modeling/ensemble/bagging_classifier.py +1 -1
  64. snowflake/ml/modeling/ensemble/bagging_regressor.py +1 -1
  65. snowflake/ml/modeling/ensemble/extra_trees_classifier.py +1 -1
  66. snowflake/ml/modeling/ensemble/extra_trees_regressor.py +1 -1
  67. snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +1 -1
  68. snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +1 -1
  69. snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +1 -1
  70. snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +1 -1
  71. snowflake/ml/modeling/ensemble/isolation_forest.py +1 -1
  72. snowflake/ml/modeling/ensemble/random_forest_classifier.py +1 -1
  73. snowflake/ml/modeling/ensemble/random_forest_regressor.py +1 -1
  74. snowflake/ml/modeling/ensemble/stacking_regressor.py +1 -1
  75. snowflake/ml/modeling/ensemble/voting_classifier.py +1 -1
  76. snowflake/ml/modeling/ensemble/voting_regressor.py +1 -1
  77. snowflake/ml/modeling/feature_selection/generic_univariate_select.py +1 -1
  78. snowflake/ml/modeling/feature_selection/select_fdr.py +1 -1
  79. snowflake/ml/modeling/feature_selection/select_fpr.py +1 -1
  80. snowflake/ml/modeling/feature_selection/select_fwe.py +1 -1
  81. snowflake/ml/modeling/feature_selection/select_k_best.py +1 -1
  82. snowflake/ml/modeling/feature_selection/select_percentile.py +1 -1
  83. snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +1 -1
  84. snowflake/ml/modeling/feature_selection/variance_threshold.py +1 -1
  85. snowflake/ml/modeling/framework/base.py +3 -8
  86. snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +1 -1
  87. snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +1 -1
  88. snowflake/ml/modeling/impute/iterative_imputer.py +1 -1
  89. snowflake/ml/modeling/impute/knn_imputer.py +1 -1
  90. snowflake/ml/modeling/impute/missing_indicator.py +1 -1
  91. snowflake/ml/modeling/impute/simple_imputer.py +8 -4
  92. snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +1 -1
  93. snowflake/ml/modeling/kernel_approximation/nystroem.py +1 -1
  94. snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +1 -1
  95. snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +1 -1
  96. snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +1 -1
  97. snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +1 -1
  98. snowflake/ml/modeling/lightgbm/lgbm_classifier.py +1 -1
  99. snowflake/ml/modeling/lightgbm/lgbm_regressor.py +1 -1
  100. snowflake/ml/modeling/linear_model/ard_regression.py +1 -1
  101. snowflake/ml/modeling/linear_model/bayesian_ridge.py +1 -1
  102. snowflake/ml/modeling/linear_model/elastic_net.py +1 -1
  103. snowflake/ml/modeling/linear_model/elastic_net_cv.py +1 -1
  104. snowflake/ml/modeling/linear_model/gamma_regressor.py +1 -1
  105. snowflake/ml/modeling/linear_model/huber_regressor.py +1 -1
  106. snowflake/ml/modeling/linear_model/lars.py +1 -1
  107. snowflake/ml/modeling/linear_model/lars_cv.py +1 -1
  108. snowflake/ml/modeling/linear_model/lasso.py +1 -1
  109. snowflake/ml/modeling/linear_model/lasso_cv.py +1 -1
  110. snowflake/ml/modeling/linear_model/lasso_lars.py +1 -1
  111. snowflake/ml/modeling/linear_model/lasso_lars_cv.py +1 -1
  112. snowflake/ml/modeling/linear_model/lasso_lars_ic.py +1 -1
  113. snowflake/ml/modeling/linear_model/linear_regression.py +1 -1
  114. snowflake/ml/modeling/linear_model/logistic_regression.py +1 -1
  115. snowflake/ml/modeling/linear_model/logistic_regression_cv.py +1 -1
  116. snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +1 -1
  117. snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +1 -1
  118. snowflake/ml/modeling/linear_model/multi_task_lasso.py +1 -1
  119. snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +1 -1
  120. snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +1 -1
  121. snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +1 -1
  122. snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +1 -1
  123. snowflake/ml/modeling/linear_model/perceptron.py +1 -1
  124. snowflake/ml/modeling/linear_model/poisson_regressor.py +1 -1
  125. snowflake/ml/modeling/linear_model/ransac_regressor.py +1 -1
  126. snowflake/ml/modeling/linear_model/ridge.py +1 -1
  127. snowflake/ml/modeling/linear_model/ridge_classifier.py +1 -1
  128. snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +1 -1
  129. snowflake/ml/modeling/linear_model/ridge_cv.py +1 -1
  130. snowflake/ml/modeling/linear_model/sgd_classifier.py +1 -1
  131. snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +1 -1
  132. snowflake/ml/modeling/linear_model/sgd_regressor.py +1 -1
  133. snowflake/ml/modeling/linear_model/theil_sen_regressor.py +1 -1
  134. snowflake/ml/modeling/linear_model/tweedie_regressor.py +1 -1
  135. snowflake/ml/modeling/manifold/isomap.py +1 -1
  136. snowflake/ml/modeling/manifold/mds.py +1 -1
  137. snowflake/ml/modeling/manifold/spectral_embedding.py +1 -1
  138. snowflake/ml/modeling/manifold/tsne.py +1 -1
  139. snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +1 -1
  140. snowflake/ml/modeling/mixture/gaussian_mixture.py +1 -1
  141. snowflake/ml/modeling/model_selection/grid_search_cv.py +1 -5
  142. snowflake/ml/modeling/model_selection/randomized_search_cv.py +1 -5
  143. snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +1 -1
  144. snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +1 -1
  145. snowflake/ml/modeling/multiclass/output_code_classifier.py +1 -1
  146. snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +1 -1
  147. snowflake/ml/modeling/naive_bayes/categorical_nb.py +1 -1
  148. snowflake/ml/modeling/naive_bayes/complement_nb.py +1 -1
  149. snowflake/ml/modeling/naive_bayes/gaussian_nb.py +1 -1
  150. snowflake/ml/modeling/naive_bayes/multinomial_nb.py +1 -1
  151. snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +1 -1
  152. snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +1 -1
  153. snowflake/ml/modeling/neighbors/kernel_density.py +1 -1
  154. snowflake/ml/modeling/neighbors/local_outlier_factor.py +1 -1
  155. snowflake/ml/modeling/neighbors/nearest_centroid.py +1 -1
  156. snowflake/ml/modeling/neighbors/nearest_neighbors.py +1 -1
  157. snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +1 -1
  158. snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +1 -1
  159. snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +1 -1
  160. snowflake/ml/modeling/neural_network/bernoulli_rbm.py +1 -1
  161. snowflake/ml/modeling/neural_network/mlp_classifier.py +1 -1
  162. snowflake/ml/modeling/neural_network/mlp_regressor.py +1 -1
  163. snowflake/ml/modeling/pipeline/pipeline.py +5 -0
  164. snowflake/ml/modeling/preprocessing/binarizer.py +7 -3
  165. snowflake/ml/modeling/preprocessing/k_bins_discretizer.py +7 -2
  166. snowflake/ml/modeling/preprocessing/label_encoder.py +8 -7
  167. snowflake/ml/modeling/preprocessing/max_abs_scaler.py +7 -3
  168. snowflake/ml/modeling/preprocessing/min_max_scaler.py +7 -4
  169. snowflake/ml/modeling/preprocessing/normalizer.py +7 -3
  170. snowflake/ml/modeling/preprocessing/one_hot_encoder.py +10 -2
  171. snowflake/ml/modeling/preprocessing/ordinal_encoder.py +8 -5
  172. snowflake/ml/modeling/preprocessing/polynomial_features.py +1 -1
  173. snowflake/ml/modeling/preprocessing/robust_scaler.py +7 -4
  174. snowflake/ml/modeling/preprocessing/standard_scaler.py +7 -3
  175. snowflake/ml/modeling/semi_supervised/label_propagation.py +1 -1
  176. snowflake/ml/modeling/semi_supervised/label_spreading.py +1 -1
  177. snowflake/ml/modeling/svm/linear_svc.py +1 -1
  178. snowflake/ml/modeling/svm/linear_svr.py +1 -1
  179. snowflake/ml/modeling/svm/nu_svc.py +1 -1
  180. snowflake/ml/modeling/svm/nu_svr.py +1 -1
  181. snowflake/ml/modeling/svm/svc.py +1 -1
  182. snowflake/ml/modeling/svm/svr.py +1 -1
  183. snowflake/ml/modeling/tree/decision_tree_classifier.py +1 -1
  184. snowflake/ml/modeling/tree/decision_tree_regressor.py +1 -1
  185. snowflake/ml/modeling/tree/extra_tree_classifier.py +1 -1
  186. snowflake/ml/modeling/tree/extra_tree_regressor.py +1 -1
  187. snowflake/ml/modeling/xgboost/xgb_classifier.py +1 -1
  188. snowflake/ml/modeling/xgboost/xgb_regressor.py +1 -1
  189. snowflake/ml/modeling/xgboost/xgbrf_classifier.py +1 -1
  190. snowflake/ml/modeling/xgboost/xgbrf_regressor.py +1 -1
  191. snowflake/ml/version.py +1 -1
  192. {snowflake_ml_python-1.5.2.dist-info → snowflake_ml_python-1.5.3.dist-info}/METADATA +21 -5
  193. {snowflake_ml_python-1.5.2.dist-info → snowflake_ml_python-1.5.3.dist-info}/RECORD +196 -195
  194. {snowflake_ml_python-1.5.2.dist-info → snowflake_ml_python-1.5.3.dist-info}/LICENSE.txt +0 -0
  195. {snowflake_ml_python-1.5.2.dist-info → snowflake_ml_python-1.5.3.dist-info}/WHEEL +0 -0
  196. {snowflake_ml_python-1.5.2.dist-info → snowflake_ml_python-1.5.3.dist-info}/top_level.txt +0 -0
@@ -277,7 +277,7 @@ class RidgeClassifierCV(BaseTransformer):
277
277
  inspect.currentframe(), RidgeClassifierCV.__class__.__name__
278
278
  ),
279
279
  api_calls=[Session.call],
280
- custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
280
+ custom_tags={"autogen": True} if self._autogenerated else None,
281
281
  )
282
282
  pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
283
283
  pd_df.columns = dataset.columns
@@ -298,7 +298,7 @@ class RidgeCV(BaseTransformer):
298
298
  inspect.currentframe(), RidgeCV.__class__.__name__
299
299
  ),
300
300
  api_calls=[Session.call],
301
- custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
301
+ custom_tags={"autogen": True} if self._autogenerated else None,
302
302
  )
303
303
  pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
304
304
  pd_df.columns = dataset.columns
@@ -417,7 +417,7 @@ class SGDClassifier(BaseTransformer):
417
417
  inspect.currentframe(), SGDClassifier.__class__.__name__
418
418
  ),
419
419
  api_calls=[Session.call],
420
- custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
420
+ custom_tags={"autogen": True} if self._autogenerated else None,
421
421
  )
422
422
  pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
423
423
  pd_df.columns = dataset.columns
@@ -315,7 +315,7 @@ class SGDOneClassSVM(BaseTransformer):
315
315
  inspect.currentframe(), SGDOneClassSVM.__class__.__name__
316
316
  ),
317
317
  api_calls=[Session.call],
318
- custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
318
+ custom_tags={"autogen": True} if self._autogenerated else None,
319
319
  )
320
320
  pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
321
321
  pd_df.columns = dataset.columns
@@ -383,7 +383,7 @@ class SGDRegressor(BaseTransformer):
383
383
  inspect.currentframe(), SGDRegressor.__class__.__name__
384
384
  ),
385
385
  api_calls=[Session.call],
386
- custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
386
+ custom_tags={"autogen": True} if self._autogenerated else None,
387
387
  )
388
388
  pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
389
389
  pd_df.columns = dataset.columns
@@ -285,7 +285,7 @@ class TheilSenRegressor(BaseTransformer):
285
285
  inspect.currentframe(), TheilSenRegressor.__class__.__name__
286
286
  ),
287
287
  api_calls=[Session.call],
288
- custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
288
+ custom_tags={"autogen": True} if self._autogenerated else None,
289
289
  )
290
290
  pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
291
291
  pd_df.columns = dataset.columns
@@ -311,7 +311,7 @@ class TweedieRegressor(BaseTransformer):
311
311
  inspect.currentframe(), TweedieRegressor.__class__.__name__
312
312
  ),
313
313
  api_calls=[Session.call],
314
- custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
314
+ custom_tags={"autogen": True} if self._autogenerated else None,
315
315
  )
316
316
  pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
317
317
  pd_df.columns = dataset.columns
@@ -307,7 +307,7 @@ class Isomap(BaseTransformer):
307
307
  inspect.currentframe(), Isomap.__class__.__name__
308
308
  ),
309
309
  api_calls=[Session.call],
310
- custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
310
+ custom_tags={"autogen": True} if self._autogenerated else None,
311
311
  )
312
312
  pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
313
313
  pd_df.columns = dataset.columns
@@ -290,7 +290,7 @@ class MDS(BaseTransformer):
290
290
  inspect.currentframe(), MDS.__class__.__name__
291
291
  ),
292
292
  api_calls=[Session.call],
293
- custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
293
+ custom_tags={"autogen": True} if self._autogenerated else None,
294
294
  )
295
295
  pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
296
296
  pd_df.columns = dataset.columns
@@ -292,7 +292,7 @@ class SpectralEmbedding(BaseTransformer):
292
292
  inspect.currentframe(), SpectralEmbedding.__class__.__name__
293
293
  ),
294
294
  api_calls=[Session.call],
295
- custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
295
+ custom_tags={"autogen": True} if self._autogenerated else None,
296
296
  )
297
297
  pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
298
298
  pd_df.columns = dataset.columns
@@ -351,7 +351,7 @@ class TSNE(BaseTransformer):
351
351
  inspect.currentframe(), TSNE.__class__.__name__
352
352
  ),
353
353
  api_calls=[Session.call],
354
- custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
354
+ custom_tags={"autogen": True} if self._autogenerated else None,
355
355
  )
356
356
  pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
357
357
  pd_df.columns = dataset.columns
@@ -354,7 +354,7 @@ class BayesianGaussianMixture(BaseTransformer):
354
354
  inspect.currentframe(), BayesianGaussianMixture.__class__.__name__
355
355
  ),
356
356
  api_calls=[Session.call],
357
- custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
357
+ custom_tags={"autogen": True} if self._autogenerated else None,
358
358
  )
359
359
  pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
360
360
  pd_df.columns = dataset.columns
@@ -327,7 +327,7 @@ class GaussianMixture(BaseTransformer):
327
327
  inspect.currentframe(), GaussianMixture.__class__.__name__
328
328
  ),
329
329
  api_calls=[Session.call],
330
- custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
330
+ custom_tags={"autogen": True} if self._autogenerated else None,
331
331
  )
332
332
  pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
333
333
  pd_df.columns = dataset.columns
@@ -285,11 +285,7 @@ class GridSearchCV(BaseTransformer):
285
285
  )
286
286
  return selected_cols
287
287
 
288
- @telemetry.send_api_usage_telemetry(
289
- project=_PROJECT,
290
- subproject=_SUBPROJECT,
291
- )
292
- def fit(self, dataset: Union[DataFrame, pd.DataFrame]) -> "GridSearchCV":
288
+ def _fit(self, dataset: Union[DataFrame, pd.DataFrame]) -> "GridSearchCV":
293
289
  """Run fit with all sets of parameters
294
290
  For more details on this function, see [sklearn.model_selection.GridSearchCV.fit]
295
291
  (https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html#sklearn.model_selection.GridSearchCV.fit)
@@ -298,11 +298,7 @@ class RandomizedSearchCV(BaseTransformer):
298
298
  )
299
299
  return selected_cols
300
300
 
301
- @telemetry.send_api_usage_telemetry(
302
- project=_PROJECT,
303
- subproject=_SUBPROJECT,
304
- )
305
- def fit(self, dataset: Union[DataFrame, pd.DataFrame]) -> "RandomizedSearchCV":
301
+ def _fit(self, dataset: Union[DataFrame, pd.DataFrame]) -> "RandomizedSearchCV":
306
302
  """Run fit with all sets of parameters
307
303
  For more details on this function, see [sklearn.model_selection.RandomizedSearchCV.fit]
308
304
  (https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html#sklearn.model_selection.RandomizedSearchCV.fit)
@@ -239,7 +239,7 @@ class OneVsOneClassifier(BaseTransformer):
239
239
  inspect.currentframe(), OneVsOneClassifier.__class__.__name__
240
240
  ),
241
241
  api_calls=[Session.call],
242
- custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
242
+ custom_tags={"autogen": True} if self._autogenerated else None,
243
243
  )
244
244
  pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
245
245
  pd_df.columns = dataset.columns
@@ -248,7 +248,7 @@ class OneVsRestClassifier(BaseTransformer):
248
248
  inspect.currentframe(), OneVsRestClassifier.__class__.__name__
249
249
  ),
250
250
  api_calls=[Session.call],
251
- custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
251
+ custom_tags={"autogen": True} if self._autogenerated else None,
252
252
  )
253
253
  pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
254
254
  pd_df.columns = dataset.columns
@@ -251,7 +251,7 @@ class OutputCodeClassifier(BaseTransformer):
251
251
  inspect.currentframe(), OutputCodeClassifier.__class__.__name__
252
252
  ),
253
253
  api_calls=[Session.call],
254
- custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
254
+ custom_tags={"autogen": True} if self._autogenerated else None,
255
255
  )
256
256
  pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
257
257
  pd_df.columns = dataset.columns
@@ -251,7 +251,7 @@ class BernoulliNB(BaseTransformer):
251
251
  inspect.currentframe(), BernoulliNB.__class__.__name__
252
252
  ),
253
253
  api_calls=[Session.call],
254
- custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
254
+ custom_tags={"autogen": True} if self._autogenerated else None,
255
255
  )
256
256
  pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
257
257
  pd_df.columns = dataset.columns
@@ -257,7 +257,7 @@ class CategoricalNB(BaseTransformer):
257
257
  inspect.currentframe(), CategoricalNB.__class__.__name__
258
258
  ),
259
259
  api_calls=[Session.call],
260
- custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
260
+ custom_tags={"autogen": True} if self._autogenerated else None,
261
261
  )
262
262
  pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
263
263
  pd_df.columns = dataset.columns
@@ -251,7 +251,7 @@ class ComplementNB(BaseTransformer):
251
251
  inspect.currentframe(), ComplementNB.__class__.__name__
252
252
  ),
253
253
  api_calls=[Session.call],
254
- custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
254
+ custom_tags={"autogen": True} if self._autogenerated else None,
255
255
  )
256
256
  pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
257
257
  pd_df.columns = dataset.columns
@@ -232,7 +232,7 @@ class GaussianNB(BaseTransformer):
232
232
  inspect.currentframe(), GaussianNB.__class__.__name__
233
233
  ),
234
234
  api_calls=[Session.call],
235
- custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
235
+ custom_tags={"autogen": True} if self._autogenerated else None,
236
236
  )
237
237
  pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
238
238
  pd_df.columns = dataset.columns
@@ -245,7 +245,7 @@ class MultinomialNB(BaseTransformer):
245
245
  inspect.currentframe(), MultinomialNB.__class__.__name__
246
246
  ),
247
247
  api_calls=[Session.call],
248
- custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
248
+ custom_tags={"autogen": True} if self._autogenerated else None,
249
249
  )
250
250
  pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
251
251
  pd_df.columns = dataset.columns
@@ -302,7 +302,7 @@ class KNeighborsClassifier(BaseTransformer):
302
302
  inspect.currentframe(), KNeighborsClassifier.__class__.__name__
303
303
  ),
304
304
  api_calls=[Session.call],
305
- custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
305
+ custom_tags={"autogen": True} if self._autogenerated else None,
306
306
  )
307
307
  pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
308
308
  pd_df.columns = dataset.columns
@@ -304,7 +304,7 @@ class KNeighborsRegressor(BaseTransformer):
304
304
  inspect.currentframe(), KNeighborsRegressor.__class__.__name__
305
305
  ),
306
306
  api_calls=[Session.call],
307
- custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
307
+ custom_tags={"autogen": True} if self._autogenerated else None,
308
308
  )
309
309
  pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
310
310
  pd_df.columns = dataset.columns
@@ -281,7 +281,7 @@ class KernelDensity(BaseTransformer):
281
281
  inspect.currentframe(), KernelDensity.__class__.__name__
282
282
  ),
283
283
  api_calls=[Session.call],
284
- custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
284
+ custom_tags={"autogen": True} if self._autogenerated else None,
285
285
  )
286
286
  pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
287
287
  pd_df.columns = dataset.columns
@@ -309,7 +309,7 @@ class LocalOutlierFactor(BaseTransformer):
309
309
  inspect.currentframe(), LocalOutlierFactor.__class__.__name__
310
310
  ),
311
311
  api_calls=[Session.call],
312
- custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
312
+ custom_tags={"autogen": True} if self._autogenerated else None,
313
313
  )
314
314
  pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
315
315
  pd_df.columns = dataset.columns
@@ -242,7 +242,7 @@ class NearestCentroid(BaseTransformer):
242
242
  inspect.currentframe(), NearestCentroid.__class__.__name__
243
243
  ),
244
244
  api_calls=[Session.call],
245
- custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
245
+ custom_tags={"autogen": True} if self._autogenerated else None,
246
246
  )
247
247
  pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
248
248
  pd_df.columns = dataset.columns
@@ -292,7 +292,7 @@ class NearestNeighbors(BaseTransformer):
292
292
  inspect.currentframe(), NearestNeighbors.__class__.__name__
293
293
  ),
294
294
  api_calls=[Session.call],
295
- custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
295
+ custom_tags={"autogen": True} if self._autogenerated else None,
296
296
  )
297
297
  pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
298
298
  pd_df.columns = dataset.columns
@@ -313,7 +313,7 @@ class NeighborhoodComponentsAnalysis(BaseTransformer):
313
313
  inspect.currentframe(), NeighborhoodComponentsAnalysis.__class__.__name__
314
314
  ),
315
315
  api_calls=[Session.call],
316
- custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
316
+ custom_tags={"autogen": True} if self._autogenerated else None,
317
317
  )
318
318
  pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
319
319
  pd_df.columns = dataset.columns
@@ -314,7 +314,7 @@ class RadiusNeighborsClassifier(BaseTransformer):
314
314
  inspect.currentframe(), RadiusNeighborsClassifier.__class__.__name__
315
315
  ),
316
316
  api_calls=[Session.call],
317
- custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
317
+ custom_tags={"autogen": True} if self._autogenerated else None,
318
318
  )
319
319
  pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
320
320
  pd_df.columns = dataset.columns
@@ -304,7 +304,7 @@ class RadiusNeighborsRegressor(BaseTransformer):
304
304
  inspect.currentframe(), RadiusNeighborsRegressor.__class__.__name__
305
305
  ),
306
306
  api_calls=[Session.call],
307
- custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
307
+ custom_tags={"autogen": True} if self._autogenerated else None,
308
308
  )
309
309
  pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
310
310
  pd_df.columns = dataset.columns
@@ -261,7 +261,7 @@ class BernoulliRBM(BaseTransformer):
261
261
  inspect.currentframe(), BernoulliRBM.__class__.__name__
262
262
  ),
263
263
  api_calls=[Session.call],
264
- custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
264
+ custom_tags={"autogen": True} if self._autogenerated else None,
265
265
  )
266
266
  pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
267
267
  pd_df.columns = dataset.columns
@@ -416,7 +416,7 @@ class MLPClassifier(BaseTransformer):
416
416
  inspect.currentframe(), MLPClassifier.__class__.__name__
417
417
  ),
418
418
  api_calls=[Session.call],
419
- custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
419
+ custom_tags={"autogen": True} if self._autogenerated else None,
420
420
  )
421
421
  pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
422
422
  pd_df.columns = dataset.columns
@@ -412,7 +412,7 @@ class MLPRegressor(BaseTransformer):
412
412
  inspect.currentframe(), MLPRegressor.__class__.__name__
413
413
  ),
414
414
  api_calls=[Session.call],
415
- custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
415
+ custom_tags={"autogen": True} if self._autogenerated else None,
416
416
  )
417
417
  pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
418
418
  pd_df.columns = dataset.columns
@@ -17,6 +17,7 @@ from sklearn.utils import metaestimators
17
17
  from snowflake import snowpark
18
18
  from snowflake.ml._internal import file_utils, telemetry
19
19
  from snowflake.ml._internal.exceptions import error_codes, exceptions
20
+ from snowflake.ml._internal.lineage import lineage_utils
20
21
  from snowflake.ml._internal.utils import snowpark_dataframe_utils, temp_file_utils
21
22
  from snowflake.ml.model.model_signature import ModelSignature, _infer_signature
22
23
  from snowflake.ml.modeling._internal.model_transformer_builder import (
@@ -427,6 +428,10 @@ class Pipeline(base.BaseTransformer):
427
428
  else dataset
428
429
  )
429
430
 
431
+ # Extract lineage information here since we're overriding fit() directly
432
+ data_sources = lineage_utils.get_data_sources(dataset)
433
+ lineage_utils.set_data_sources(self, data_sources)
434
+
430
435
  if self._can_be_trained_in_ml_runtime(dataset):
431
436
  if not self._is_convertible_to_sklearn:
432
437
  raise ValueError("This pipeline cannot be converted to an sklearn pipeline.")
@@ -25,11 +25,15 @@ class Binarizer(base.BaseTransformer):
25
25
  Feature values below or equal to this are replaced by 0, above it by 1. Default value is 0.0.
26
26
 
27
27
  input_cols: Optional[Union[str, Iterable[str]]], default=None
28
- The name(s) of one or more columns in a DataFrame containing a feature to be binarized.
28
+ The name(s) of one or more columns in the input DataFrame containing feature(s) to be binarized. Input
29
+ columns must be specified before transform with this argument or after initialization with the
30
+ `set_input_cols` method. This argument is optional for API consistency.
29
31
 
30
32
  output_cols: Optional[Union[str, Iterable[str]]], default=None
31
- The name(s) of one or more columns in a DataFrame in which results will be stored. The number of
32
- columns specified must match the number of input columns.
33
+ The name(s) to assign output columns in the output DataFrame. The number of
34
+ columns specified must equal the number of input columns. Output columns must be specified before transform
35
+ with this argument or after initialization with the `set_output_cols` method. This argument is optional for
36
+ API consistency.
33
37
 
34
38
  passthrough_cols: Optional[Union[str, Iterable[str]]], default=None
35
39
  A string or a list of strings indicating column names to be excluded from any
@@ -74,10 +74,15 @@ class KBinsDiscretizer(base.BaseTransformer):
74
74
  - 'quantile': All bins in each feature have the same number of points.
75
75
 
76
76
  input_cols: str or Iterable [column_name], default=None
77
- Single or multiple input columns.
77
+ The name(s) of one or more columns in the input DataFrame containing feature(s) to be discretized.
78
+ Input columns must be specified before fit with this argument or after initialization with the
79
+ `set_input_cols` method. This argument is optional for API consistency.
78
80
 
79
81
  output_cols: str or Iterable [column_name], default=None
80
- Single or multiple output columns.
82
+ The name(s) to assign output columns in the output DataFrame. The number of
83
+ columns specified must equal the number of input columns. Output columns must be specified before transform
84
+ with this argument or after initialization with the `set_output_cols` method. This argument is optional for
85
+ API consistency.
81
86
 
82
87
  passthrough_cols: A string or a list of strings indicating column names to be excluded from any
83
88
  operations (such as train, transform, or inference). These specified column(s)
@@ -25,11 +25,12 @@ class LabelEncoder(base.BaseTransformer):
25
25
 
26
26
  Args:
27
27
  input_cols: Optional[Union[str, List[str]]]
28
- The name of a column in a DataFrame to be encoded. May be a string or a list containing one string.
28
+ The name of a column or a list containing one column name to be encoded in the input DataFrame. There must
29
+ be exactly one input column specified before fit. This argument is optional for API consistency.
29
30
 
30
31
  output_cols: Optional[Union[str, List[str]]]
31
- The name of a column in a DataFrame where the results will be stored. May be a string or a list
32
- containing one string.
32
+ The name of a column or a list containing one column name where the results will be stored. There must be
33
+ exactly one output column specified before transform. This argument is optional for API consistency.
33
34
 
34
35
  passthrough_cols: Optional[Union[str, List[str]]]
35
36
  A string or a list of strings indicating column names to be excluded from any
@@ -54,11 +55,11 @@ class LabelEncoder(base.BaseTransformer):
54
55
 
55
56
  Args:
56
57
  input_cols: Optional[Union[str, List[str]]]
57
- The name of a column in a DataFrame to be encoded. May be a string or a list containing one
58
- string.
58
+ The name of a column or a list containing one column name to be encoded in the input DataFrame. There
59
+ must be exactly one input column specified before fit. This argument is optional for API consistency.
59
60
  output_cols: Optional[Union[str, List[str]]]
60
- The name of a column in a DataFrame where the results will be stored. May be a string or a list
61
- containing one string.
61
+ The name of a column or a list containing one column name where the results will be stored. There must
62
+ be exactly one output column specified before transform. This argument is optional for API consistency.
62
63
  passthrough_cols: Optional[Union[str, List[str]]]
63
64
  A string or a list of strings indicating column names to be excluded from any
64
65
  operations (such as train, transform, or inference). These specified column(s)
@@ -28,11 +28,15 @@ class MaxAbsScaler(base.BaseTransformer):
28
28
 
29
29
  Args:
30
30
  input_cols: Optional[Union[str, List[str]]], default=None
31
- The name(s) of one or more columns in a DataFrame containing a feature to be scaled.
31
+ The name(s) of one or more columns in the input DataFrame containing feature(s) to be scaled. Input
32
+ columns must be specified before fit with this argument or after initialization with the
33
+ `set_input_cols` method. This argument is optional for API consistency.
32
34
 
33
35
  output_cols: Optional[Union[str, List[str]]], default=None
34
- The name(s) of one or more columns in a DataFrame in which results will be stored. The number of
35
- columns specified must match the number of input columns.
36
+ The name(s) to assign output columns in the output DataFrame. The number of
37
+ columns specified must equal the number of input columns. Output columns must be specified before transform
38
+ with this argument or after initialization with the `set_output_cols` method. This argument is optional for
39
+ API consistency.
36
40
 
37
41
  passthrough_cols: Optional[Union[str, List[str]]], default=None
38
42
  A string or a list of strings indicating column names to be excluded from any
@@ -29,12 +29,15 @@ class MinMaxScaler(base.BaseTransformer):
29
29
  Whether to clip transformed values of held-out data to the specified feature range (default is True).
30
30
 
31
31
  input_cols: Optional[Union[str, List[str]]], default=None
32
- The name(s) of one or more columns in a DataFrame containing a feature to be scaled. Each specified
33
- input column is scaled independently and stored in the corresponding output column.
32
+ The name(s) of one or more columns in the input DataFrame containing feature(s) to be scaled. Input
33
+ columns must be specified before fit with this argument or after initialization with the
34
+ `set_input_cols` method. This argument is optional for API consistency.
34
35
 
35
36
  output_cols: Optional[Union[str, List[str]]], default=None
36
- The name(s) of one or more columns in a DataFrame in which results will be stored. The number of
37
- columns specified must match the number of input columns.
37
+ The name(s) to assign output columns in the output DataFrame. The number of
38
+ columns specified must equal the number of input columns. Output columns must be specified before transform
39
+ with this argument or after initialization with the `set_output_cols` method. This argument is optional for
40
+ API consistency.
38
41
 
39
42
  passthrough_cols: Optional[Union[str, List[str]]], default=None
40
43
  A string or a list of strings indicating column names to be excluded from any
@@ -28,11 +28,15 @@ class Normalizer(base.BaseTransformer):
28
28
  values. It must be one of 'l1', 'l2', or 'max'.
29
29
 
30
30
  input_cols: Optional[Union[str, List[str]]]
31
- Columns to use as inputs during transform.
31
+ The name(s) of one or more columns in the input DataFrame containing feature(s) to be normalized. Input
32
+ columns must be specified before transform with this argument or after initialization with the
33
+ `set_input_cols` method. This argument is optional for API consistency.
32
34
 
33
35
  output_cols: Optional[Union[str, List[str]]]
34
- A string or list of strings representing column names that will store the output of transform operation.
35
- The length of `output_cols` must equal the length of `input_cols`.
36
+ The name(s) to assign output columns in the output DataFrame. The number of
37
+ columns specified must equal the number of input columns. Output columns must be specified before transform
38
+ with this argument or after initialization with the `set_output_cols` method. This argument is optional for
39
+ API consistency.
36
40
 
37
41
  passthrough_cols: Optional[Union[str, List[str]]]
38
42
  A string or a list of strings indicating column names to be excluded from any
@@ -157,10 +157,18 @@ class OneHotEncoder(base.BaseTransformer):
157
157
  there is no limit to the number of output features.
158
158
 
159
159
  input_cols: Optional[Union[str, List[str]]], default=None
160
- Single or multiple input columns.
160
+ The name(s) of one or more columns in the input DataFrame containing feature(s) to be encoded. Input
161
+ columns must be specified before fit with this argument or after initialization with the
162
+ `set_input_cols` method. This argument is optional for API consistency.
161
163
 
162
164
  output_cols: Optional[Union[str, List[str]]], default=None
163
- Single or multiple output columns.
165
+ The prefix to be used for encoded output for each input column. The number of
166
+ output column prefixes specified must match the number of input columns. Output column prefixes must be
167
+ specified before transform with this argument or after initialization with the `set_output_cols` method.
168
+
169
+ Note: Dense output column names are case-sensitive and resolve identifiers following Snowflake rules, e.g.
170
+ `"PREFIX_a"`, `PREFIX_A`, `"prefix_A"`. Therefore, there is no need to provide double-quoted column names
171
+ as that would result in invalid identifiers.
164
172
 
165
173
  passthrough_cols: Optional[Union[str, List[str]]]
166
174
  A string or a list of strings indicating column names to be excluded from any
@@ -67,11 +67,14 @@ class OrdinalEncoder(base.BaseTransformer):
67
67
  The value to be used to encode unknown categories.
68
68
 
69
69
  input_cols: Optional[Union[str, List[str]]], default=None
70
- The name(s) of one or more columns in a DataFrame containing a feature to be encoded.
70
+ The name(s) of one or more columns in the input DataFrame containing feature(s) to be encoded. Input
71
+ columns must be specified before fit with this argument or after initialization with the
72
+ `set_input_cols` method. This argument is optional for API consistency.
71
73
 
72
74
  output_cols: Optional[Union[str, List[str]]], default=None
73
- The name(s) of one or more columns in a DataFrame in which results will be stored. The number of
74
- columns specified must match the number of input columns.
75
+ The prefix to be used for encoded output for each input column. The number of
76
+ output column prefixes specified must equal the number of input columns. Output column prefixes must be
77
+ specified before transform with this argument or after initialization with the `set_output_cols` method.
75
78
 
76
79
  passthrough_cols: Optional[Union[str, List[str]]], default=None
77
80
  A string or a list of strings indicating column names to be excluded from any
@@ -247,7 +250,7 @@ class OrdinalEncoder(base.BaseTransformer):
247
250
  # columns: COLUMN_NAME, CATEGORY, INDEX
248
251
  state_df = self._get_category_index_state_df(dataset)
249
252
  # save the dataframe on server side so that transform doesn't need to upload
250
- state_df.write.save_as_table( # type: ignore[call-overload]
253
+ state_df.write.save_as_table(
251
254
  self._vocab_table_name,
252
255
  mode="overwrite",
253
256
  table_type="temporary",
@@ -520,7 +523,7 @@ class OrdinalEncoder(base.BaseTransformer):
520
523
  )
521
524
 
522
525
  batch_table_name = snowpark_utils.random_name_for_temp_object(snowpark_utils.TempObjectType.TABLE)
523
- transformed_dataset.write.save_as_table( # type: ignore[call-overload]
526
+ transformed_dataset.write.save_as_table(
524
527
  batch_table_name,
525
528
  mode="overwrite",
526
529
  table_type="temporary",
@@ -251,7 +251,7 @@ class PolynomialFeatures(BaseTransformer):
251
251
  inspect.currentframe(), PolynomialFeatures.__class__.__name__
252
252
  ),
253
253
  api_calls=[Session.call],
254
- custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
254
+ custom_tags={"autogen": True} if self._autogenerated else None,
255
255
  )
256
256
  pd_df: pd.DataFrame = dataset.to_pandas(statement_params=statement_params)
257
257
  pd_df.columns = dataset.columns