snowflake-ml-python 1.3.0__py3-none-any.whl → 1.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (211) hide show
  1. snowflake/ml/_internal/file_utils.py +3 -3
  2. snowflake/ml/_internal/human_readable_id/adjectives.txt +128 -0
  3. snowflake/ml/_internal/human_readable_id/animals.txt +128 -0
  4. snowflake/ml/_internal/human_readable_id/hrid_generator.py +40 -0
  5. snowflake/ml/_internal/human_readable_id/hrid_generator_base.py +135 -0
  6. snowflake/ml/_internal/telemetry.py +11 -2
  7. snowflake/ml/_internal/utils/formatting.py +1 -1
  8. snowflake/ml/feature_store/feature_store.py +15 -106
  9. snowflake/ml/fileset/sfcfs.py +4 -3
  10. snowflake/ml/fileset/stage_fs.py +18 -0
  11. snowflake/ml/model/_api.py +9 -9
  12. snowflake/ml/model/_client/model/model_version_impl.py +20 -15
  13. snowflake/ml/model/_deploy_client/image_builds/docker_context.py +3 -9
  14. snowflake/ml/model/_deploy_client/image_builds/server_image_builder.py +3 -5
  15. snowflake/ml/model/_deploy_client/snowservice/deploy.py +7 -6
  16. snowflake/ml/model/_model_composer/model_composer.py +10 -8
  17. snowflake/ml/model/_model_composer/model_method/function_generator.py +1 -1
  18. snowflake/ml/model/_model_composer/model_method/infer_table_function.py_template +2 -1
  19. snowflake/ml/model/_model_composer/model_method/model_method.py +2 -2
  20. snowflake/ml/model/_model_composer/model_runtime/_runtime_requirements.py +1 -1
  21. snowflake/ml/model/_packager/model_handlers/_base.py +2 -2
  22. snowflake/ml/model/_packager/model_handlers/_utils.py +5 -5
  23. snowflake/ml/model/_packager/model_handlers/custom.py +7 -7
  24. snowflake/ml/model/_packager/model_handlers/huggingface_pipeline.py +2 -2
  25. snowflake/ml/model/_packager/model_handlers/llm.py +1 -1
  26. snowflake/ml/model/_packager/model_handlers/mlflow.py +1 -1
  27. snowflake/ml/model/_packager/model_handlers/pytorch.py +13 -10
  28. snowflake/ml/model/_packager/model_handlers/sentence_transformers.py +214 -0
  29. snowflake/ml/model/_packager/model_handlers/sklearn.py +6 -6
  30. snowflake/ml/model/_packager/model_handlers/snowmlmodel.py +15 -3
  31. snowflake/ml/model/_packager/model_handlers/tensorflow.py +8 -8
  32. snowflake/ml/model/_packager/model_handlers/torchscript.py +7 -7
  33. snowflake/ml/model/_packager/model_handlers/xgboost.py +8 -8
  34. snowflake/ml/model/_packager/model_meta/_core_requirements.py +1 -1
  35. snowflake/ml/model/_packager/model_packager.py +8 -6
  36. snowflake/ml/model/custom_model.py +3 -1
  37. snowflake/ml/model/type_hints.py +13 -0
  38. snowflake/ml/modeling/_internal/estimator_utils.py +61 -1
  39. snowflake/ml/modeling/_internal/local_implementations/pandas_handlers.py +4 -43
  40. snowflake/ml/modeling/_internal/local_implementations/pandas_trainer.py +4 -4
  41. snowflake/ml/modeling/_internal/ml_runtime_implementations/ml_runtime_handlers.py +21 -17
  42. snowflake/ml/modeling/_internal/model_specifications.py +3 -1
  43. snowflake/ml/modeling/_internal/model_trainer.py +2 -2
  44. snowflake/ml/modeling/_internal/snowpark_implementations/distributed_hpo_trainer.py +547 -1
  45. snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_handlers.py +67 -114
  46. snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_trainer.py +9 -9
  47. snowflake/ml/modeling/_internal/transformer_protocols.py +2 -3
  48. snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +33 -61
  49. snowflake/ml/modeling/cluster/affinity_propagation.py +33 -61
  50. snowflake/ml/modeling/cluster/agglomerative_clustering.py +33 -61
  51. snowflake/ml/modeling/cluster/birch.py +33 -61
  52. snowflake/ml/modeling/cluster/bisecting_k_means.py +33 -61
  53. snowflake/ml/modeling/cluster/dbscan.py +33 -61
  54. snowflake/ml/modeling/cluster/feature_agglomeration.py +33 -61
  55. snowflake/ml/modeling/cluster/k_means.py +33 -61
  56. snowflake/ml/modeling/cluster/mean_shift.py +33 -61
  57. snowflake/ml/modeling/cluster/mini_batch_k_means.py +33 -61
  58. snowflake/ml/modeling/cluster/optics.py +33 -61
  59. snowflake/ml/modeling/cluster/spectral_biclustering.py +33 -61
  60. snowflake/ml/modeling/cluster/spectral_clustering.py +33 -61
  61. snowflake/ml/modeling/cluster/spectral_coclustering.py +33 -61
  62. snowflake/ml/modeling/compose/column_transformer.py +33 -61
  63. snowflake/ml/modeling/compose/transformed_target_regressor.py +33 -61
  64. snowflake/ml/modeling/covariance/elliptic_envelope.py +33 -61
  65. snowflake/ml/modeling/covariance/empirical_covariance.py +33 -61
  66. snowflake/ml/modeling/covariance/graphical_lasso.py +33 -61
  67. snowflake/ml/modeling/covariance/graphical_lasso_cv.py +33 -61
  68. snowflake/ml/modeling/covariance/ledoit_wolf.py +33 -61
  69. snowflake/ml/modeling/covariance/min_cov_det.py +33 -61
  70. snowflake/ml/modeling/covariance/oas.py +33 -61
  71. snowflake/ml/modeling/covariance/shrunk_covariance.py +33 -61
  72. snowflake/ml/modeling/decomposition/dictionary_learning.py +33 -61
  73. snowflake/ml/modeling/decomposition/factor_analysis.py +33 -61
  74. snowflake/ml/modeling/decomposition/fast_ica.py +33 -61
  75. snowflake/ml/modeling/decomposition/incremental_pca.py +33 -61
  76. snowflake/ml/modeling/decomposition/kernel_pca.py +33 -61
  77. snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +33 -61
  78. snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +33 -61
  79. snowflake/ml/modeling/decomposition/pca.py +33 -61
  80. snowflake/ml/modeling/decomposition/sparse_pca.py +33 -61
  81. snowflake/ml/modeling/decomposition/truncated_svd.py +33 -61
  82. snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +33 -61
  83. snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +33 -61
  84. snowflake/ml/modeling/ensemble/ada_boost_classifier.py +33 -61
  85. snowflake/ml/modeling/ensemble/ada_boost_regressor.py +33 -61
  86. snowflake/ml/modeling/ensemble/bagging_classifier.py +33 -61
  87. snowflake/ml/modeling/ensemble/bagging_regressor.py +33 -61
  88. snowflake/ml/modeling/ensemble/extra_trees_classifier.py +33 -61
  89. snowflake/ml/modeling/ensemble/extra_trees_regressor.py +33 -61
  90. snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +33 -61
  91. snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +33 -61
  92. snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +33 -61
  93. snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +33 -61
  94. snowflake/ml/modeling/ensemble/isolation_forest.py +33 -61
  95. snowflake/ml/modeling/ensemble/random_forest_classifier.py +33 -61
  96. snowflake/ml/modeling/ensemble/random_forest_regressor.py +33 -61
  97. snowflake/ml/modeling/ensemble/stacking_regressor.py +33 -61
  98. snowflake/ml/modeling/ensemble/voting_classifier.py +33 -61
  99. snowflake/ml/modeling/ensemble/voting_regressor.py +33 -61
  100. snowflake/ml/modeling/feature_selection/generic_univariate_select.py +33 -61
  101. snowflake/ml/modeling/feature_selection/select_fdr.py +33 -61
  102. snowflake/ml/modeling/feature_selection/select_fpr.py +33 -61
  103. snowflake/ml/modeling/feature_selection/select_fwe.py +33 -61
  104. snowflake/ml/modeling/feature_selection/select_k_best.py +33 -61
  105. snowflake/ml/modeling/feature_selection/select_percentile.py +33 -61
  106. snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +33 -61
  107. snowflake/ml/modeling/feature_selection/variance_threshold.py +33 -61
  108. snowflake/ml/modeling/framework/base.py +55 -5
  109. snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +33 -61
  110. snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +33 -61
  111. snowflake/ml/modeling/impute/iterative_imputer.py +33 -61
  112. snowflake/ml/modeling/impute/knn_imputer.py +33 -61
  113. snowflake/ml/modeling/impute/missing_indicator.py +33 -61
  114. snowflake/ml/modeling/impute/simple_imputer.py +4 -15
  115. snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +33 -61
  116. snowflake/ml/modeling/kernel_approximation/nystroem.py +33 -61
  117. snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +33 -61
  118. snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +33 -61
  119. snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +33 -61
  120. snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +33 -61
  121. snowflake/ml/modeling/lightgbm/lgbm_classifier.py +36 -63
  122. snowflake/ml/modeling/lightgbm/lgbm_regressor.py +36 -63
  123. snowflake/ml/modeling/linear_model/ard_regression.py +33 -61
  124. snowflake/ml/modeling/linear_model/bayesian_ridge.py +33 -61
  125. snowflake/ml/modeling/linear_model/elastic_net.py +33 -61
  126. snowflake/ml/modeling/linear_model/elastic_net_cv.py +33 -61
  127. snowflake/ml/modeling/linear_model/gamma_regressor.py +33 -61
  128. snowflake/ml/modeling/linear_model/huber_regressor.py +33 -61
  129. snowflake/ml/modeling/linear_model/lars.py +33 -61
  130. snowflake/ml/modeling/linear_model/lars_cv.py +33 -61
  131. snowflake/ml/modeling/linear_model/lasso.py +33 -61
  132. snowflake/ml/modeling/linear_model/lasso_cv.py +33 -61
  133. snowflake/ml/modeling/linear_model/lasso_lars.py +33 -61
  134. snowflake/ml/modeling/linear_model/lasso_lars_cv.py +33 -61
  135. snowflake/ml/modeling/linear_model/lasso_lars_ic.py +33 -61
  136. snowflake/ml/modeling/linear_model/linear_regression.py +33 -61
  137. snowflake/ml/modeling/linear_model/logistic_regression.py +33 -61
  138. snowflake/ml/modeling/linear_model/logistic_regression_cv.py +33 -61
  139. snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +33 -61
  140. snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +33 -61
  141. snowflake/ml/modeling/linear_model/multi_task_lasso.py +33 -61
  142. snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +33 -61
  143. snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +33 -61
  144. snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +33 -61
  145. snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +33 -61
  146. snowflake/ml/modeling/linear_model/perceptron.py +33 -61
  147. snowflake/ml/modeling/linear_model/poisson_regressor.py +33 -61
  148. snowflake/ml/modeling/linear_model/ransac_regressor.py +33 -61
  149. snowflake/ml/modeling/linear_model/ridge.py +33 -61
  150. snowflake/ml/modeling/linear_model/ridge_classifier.py +33 -61
  151. snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +33 -61
  152. snowflake/ml/modeling/linear_model/ridge_cv.py +33 -61
  153. snowflake/ml/modeling/linear_model/sgd_classifier.py +33 -61
  154. snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +33 -61
  155. snowflake/ml/modeling/linear_model/sgd_regressor.py +33 -61
  156. snowflake/ml/modeling/linear_model/theil_sen_regressor.py +33 -61
  157. snowflake/ml/modeling/linear_model/tweedie_regressor.py +33 -61
  158. snowflake/ml/modeling/manifold/isomap.py +33 -61
  159. snowflake/ml/modeling/manifold/mds.py +33 -61
  160. snowflake/ml/modeling/manifold/spectral_embedding.py +33 -61
  161. snowflake/ml/modeling/manifold/tsne.py +33 -61
  162. snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +33 -61
  163. snowflake/ml/modeling/mixture/gaussian_mixture.py +33 -61
  164. snowflake/ml/modeling/model_selection/grid_search_cv.py +39 -57
  165. snowflake/ml/modeling/model_selection/randomized_search_cv.py +26 -57
  166. snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +33 -61
  167. snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +33 -61
  168. snowflake/ml/modeling/multiclass/output_code_classifier.py +33 -61
  169. snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +33 -61
  170. snowflake/ml/modeling/naive_bayes/categorical_nb.py +33 -61
  171. snowflake/ml/modeling/naive_bayes/complement_nb.py +33 -61
  172. snowflake/ml/modeling/naive_bayes/gaussian_nb.py +33 -61
  173. snowflake/ml/modeling/naive_bayes/multinomial_nb.py +33 -61
  174. snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +33 -61
  175. snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +33 -61
  176. snowflake/ml/modeling/neighbors/kernel_density.py +33 -61
  177. snowflake/ml/modeling/neighbors/local_outlier_factor.py +33 -61
  178. snowflake/ml/modeling/neighbors/nearest_centroid.py +33 -61
  179. snowflake/ml/modeling/neighbors/nearest_neighbors.py +33 -61
  180. snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +33 -61
  181. snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +33 -61
  182. snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +33 -61
  183. snowflake/ml/modeling/neural_network/bernoulli_rbm.py +33 -61
  184. snowflake/ml/modeling/neural_network/mlp_classifier.py +33 -61
  185. snowflake/ml/modeling/neural_network/mlp_regressor.py +33 -61
  186. snowflake/ml/modeling/preprocessing/polynomial_features.py +33 -61
  187. snowflake/ml/modeling/semi_supervised/label_propagation.py +33 -61
  188. snowflake/ml/modeling/semi_supervised/label_spreading.py +33 -61
  189. snowflake/ml/modeling/svm/linear_svc.py +33 -61
  190. snowflake/ml/modeling/svm/linear_svr.py +33 -61
  191. snowflake/ml/modeling/svm/nu_svc.py +33 -61
  192. snowflake/ml/modeling/svm/nu_svr.py +33 -61
  193. snowflake/ml/modeling/svm/svc.py +33 -61
  194. snowflake/ml/modeling/svm/svr.py +33 -61
  195. snowflake/ml/modeling/tree/decision_tree_classifier.py +33 -61
  196. snowflake/ml/modeling/tree/decision_tree_regressor.py +33 -61
  197. snowflake/ml/modeling/tree/extra_tree_classifier.py +33 -61
  198. snowflake/ml/modeling/tree/extra_tree_regressor.py +33 -61
  199. snowflake/ml/modeling/xgboost/xgb_classifier.py +33 -61
  200. snowflake/ml/modeling/xgboost/xgb_regressor.py +33 -61
  201. snowflake/ml/modeling/xgboost/xgbrf_classifier.py +33 -61
  202. snowflake/ml/modeling/xgboost/xgbrf_regressor.py +33 -61
  203. snowflake/ml/registry/_manager/model_manager.py +6 -2
  204. snowflake/ml/registry/model_registry.py +100 -27
  205. snowflake/ml/registry/registry.py +6 -2
  206. snowflake/ml/version.py +1 -1
  207. {snowflake_ml_python-1.3.0.dist-info → snowflake_ml_python-1.4.0.dist-info}/METADATA +43 -7
  208. {snowflake_ml_python-1.3.0.dist-info → snowflake_ml_python-1.4.0.dist-info}/RECORD +211 -206
  209. {snowflake_ml_python-1.3.0.dist-info → snowflake_ml_python-1.4.0.dist-info}/LICENSE.txt +0 -0
  210. {snowflake_ml_python-1.3.0.dist-info → snowflake_ml_python-1.4.0.dist-info}/WHEEL +0 -0
  211. {snowflake_ml_python-1.3.0.dist-info → snowflake_ml_python-1.4.0.dist-info}/top_level.txt +0 -0
@@ -266,18 +266,24 @@ class SelectFwe(BaseTransformer):
266
266
  self._get_model_signatures(dataset)
267
267
  return self
268
268
 
269
- def _get_pass_through_columns(self, dataset: DataFrame) -> List[str]:
270
- if self._drop_input_cols:
271
- return []
272
- else:
273
- return list(set(dataset.columns) - set(self.output_cols))
274
-
275
269
  def _batch_inference_validate_snowpark(
276
270
  self,
277
271
  dataset: DataFrame,
278
272
  inference_method: str,
279
273
  ) -> List[str]:
280
- """Util method to run validate that batch inference can be run on a snowpark dataframe.
274
+ """Util method to run validate that batch inference can be run on a snowpark dataframe and
275
+ return the available package that exists in the snowflake anaconda channel
276
+
277
+ Args:
278
+ dataset: snowpark dataframe
279
+ inference_method: the inference method such as predict, score...
280
+
281
+ Raises:
282
+ SnowflakeMLException: If the estimator is not fitted, raise error
283
+ SnowflakeMLException: If the session is None, raise error
284
+
285
+ Returns:
286
+ A list of available package that exists in the snowflake anaconda channel
281
287
  """
282
288
  if not self._is_fitted:
283
289
  raise exceptions.SnowflakeMLException(
@@ -349,7 +355,7 @@ class SelectFwe(BaseTransformer):
349
355
  transform_kwargs = dict(
350
356
  session = dataset._session,
351
357
  dependencies = self._deps,
352
- pass_through_cols = self._get_pass_through_columns(dataset),
358
+ drop_input_cols = self._drop_input_cols,
353
359
  expected_output_cols_type = expected_type_inferred,
354
360
  )
355
361
 
@@ -411,16 +417,16 @@ class SelectFwe(BaseTransformer):
411
417
  # from (n_samples, n_estimators) to (n_samples, n_estimators * n_classes) (and everything in between)
412
418
  # based on init param values. We will convert that to pandas dataframe of shape (n_samples, 1) with
413
419
  # each row containing a list of values.
414
- expected_dtype = "ARRAY"
420
+ expected_dtype = "array"
415
421
 
416
422
  # If we were unable to assign a type to this transform in the factory, infer the type here.
417
423
  if expected_dtype == "":
418
- # If this is a clustering transformer, if the number of output columns does not equal the number of clusters the response will be an "ARRAY"
424
+ # If this is a clustering transformer, if the number of output columns does not equal the number of clusters the response will be an "array"
419
425
  if hasattr(self._sklearn_object, "n_clusters") and getattr(self._sklearn_object, "n_clusters") != len(self.output_cols):
420
- expected_dtype = "ARRAY"
421
- # If this is a decomposition transformer, if the number of output columns does not equal the number of components the response will be an "ARRAY"
426
+ expected_dtype = "array"
427
+ # If this is a decomposition transformer, if the number of output columns does not equal the number of components the response will be an "array"
422
428
  elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
423
- expected_dtype = "ARRAY"
429
+ expected_dtype = "array"
424
430
  else:
425
431
  output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
426
432
  # We can only infer the output types from the input types if the following two statemetns are true:
@@ -438,7 +444,7 @@ class SelectFwe(BaseTransformer):
438
444
  transform_kwargs = dict(
439
445
  session = dataset._session,
440
446
  dependencies = self._deps,
441
- pass_through_cols = self._get_pass_through_columns(dataset),
447
+ drop_input_cols = self._drop_input_cols,
442
448
  expected_output_cols_type = expected_dtype,
443
449
  )
444
450
 
@@ -489,7 +495,7 @@ class SelectFwe(BaseTransformer):
489
495
  subproject=_SUBPROJECT,
490
496
  )
491
497
  output_result, fitted_estimator = model_trainer.train_fit_predict(
492
- pass_through_columns=self._get_pass_through_columns(dataset),
498
+ drop_input_cols=self._drop_input_cols,
493
499
  expected_output_cols_list=self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix),
494
500
  )
495
501
  self._sklearn_object = fitted_estimator
@@ -507,44 +513,6 @@ class SelectFwe(BaseTransformer):
507
513
  assert self._sklearn_object is not None
508
514
  return self._sklearn_object.embedding_
509
515
 
510
-
511
- def _get_output_column_names(self, output_cols_prefix: str, output_cols: Optional[List[str]] = None) -> List[str]:
512
- """ Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
513
- Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
514
- """
515
- output_cols_prefix = identifier.resolve_identifier(output_cols_prefix)
516
- if output_cols:
517
- output_cols = [
518
- identifier.concat_names([output_cols_prefix, identifier.resolve_identifier(c)])
519
- for c in output_cols
520
- ]
521
- elif getattr(self._sklearn_object, "classes_", None) is None:
522
- output_cols = [output_cols_prefix]
523
- elif self._sklearn_object is not None:
524
- classes = self._sklearn_object.classes_
525
- if isinstance(classes, numpy.ndarray):
526
- output_cols = [f'{output_cols_prefix}{str(c)}' for c in classes.tolist()]
527
- elif isinstance(classes, list) and len(classes) > 0 and isinstance(classes[0], numpy.ndarray):
528
- # If the estimator is a multioutput estimator, classes_ will be a list of ndarrays.
529
- output_cols = []
530
- for i, cl in enumerate(classes):
531
- # For binary classification, there is only one output column for each class
532
- # ndarray as the two classes are complementary.
533
- if len(cl) == 2:
534
- output_cols.append(f'{output_cols_prefix}{i}_{cl[0]}')
535
- else:
536
- output_cols.extend([
537
- f'{output_cols_prefix}{i}_{c}' for c in cl.tolist()
538
- ])
539
- else:
540
- output_cols = []
541
-
542
- # Make sure column names are valid snowflake identifiers.
543
- assert output_cols is not None # Make MyPy happy
544
- rv = [identifier.rename_to_valid_snowflake_identifier(c) for c in output_cols]
545
-
546
- return rv
547
-
548
516
  @available_if(original_estimator_has_callable("predict_proba")) # type: ignore[misc]
549
517
  @telemetry.send_api_usage_telemetry(
550
518
  project=_PROJECT,
@@ -584,7 +552,7 @@ class SelectFwe(BaseTransformer):
584
552
  transform_kwargs = dict(
585
553
  session=dataset._session,
586
554
  dependencies=self._deps,
587
- pass_through_cols=self._get_pass_through_columns(dataset),
555
+ drop_input_cols = self._drop_input_cols,
588
556
  expected_output_cols_type="float",
589
557
  )
590
558
 
@@ -649,7 +617,7 @@ class SelectFwe(BaseTransformer):
649
617
  transform_kwargs = dict(
650
618
  session=dataset._session,
651
619
  dependencies=self._deps,
652
- pass_through_cols=self._get_pass_through_columns(dataset),
620
+ drop_input_cols = self._drop_input_cols,
653
621
  expected_output_cols_type="float",
654
622
  )
655
623
  elif isinstance(dataset, pd.DataFrame):
@@ -710,7 +678,7 @@ class SelectFwe(BaseTransformer):
710
678
  transform_kwargs = dict(
711
679
  session=dataset._session,
712
680
  dependencies=self._deps,
713
- pass_through_cols=self._get_pass_through_columns(dataset),
681
+ drop_input_cols = self._drop_input_cols,
714
682
  expected_output_cols_type="float",
715
683
  )
716
684
 
@@ -775,7 +743,7 @@ class SelectFwe(BaseTransformer):
775
743
  transform_kwargs = dict(
776
744
  session=dataset._session,
777
745
  dependencies=self._deps,
778
- pass_through_cols=self._get_pass_through_columns(dataset),
746
+ drop_input_cols = self._drop_input_cols,
779
747
  expected_output_cols_type="float",
780
748
  )
781
749
 
@@ -829,13 +797,17 @@ class SelectFwe(BaseTransformer):
829
797
  transform_kwargs: ScoreKwargsTypedDict = dict()
830
798
 
831
799
  if isinstance(dataset, DataFrame):
800
+ self._deps = self._batch_inference_validate_snowpark(
801
+ dataset=dataset,
802
+ inference_method="score",
803
+ )
832
804
  selected_cols = self._get_active_columns()
833
805
  if len(selected_cols) > 0:
834
806
  dataset = dataset.select(selected_cols)
835
807
  assert isinstance(dataset._session, Session) # keep mypy happy
836
808
  transform_kwargs = dict(
837
809
  session=dataset._session,
838
- dependencies=["snowflake-snowpark-python"] + self._get_dependencies(),
810
+ dependencies=["snowflake-snowpark-python"] + self._deps,
839
811
  score_sproc_imports=['sklearn'],
840
812
  )
841
813
  elif isinstance(dataset, pd.DataFrame):
@@ -909,9 +881,9 @@ class SelectFwe(BaseTransformer):
909
881
  transform_kwargs = dict(
910
882
  session = dataset._session,
911
883
  dependencies = self._deps,
912
- pass_through_cols = self._get_pass_through_columns(dataset),
913
- expected_output_cols_type = "array",
914
- n_neighbors = n_neighbors,
884
+ drop_input_cols = self._drop_input_cols,
885
+ expected_output_cols_type="array",
886
+ n_neighbors = n_neighbors,
915
887
  return_distance = return_distance
916
888
  )
917
889
  elif isinstance(dataset, pd.DataFrame):
@@ -267,18 +267,24 @@ class SelectKBest(BaseTransformer):
267
267
  self._get_model_signatures(dataset)
268
268
  return self
269
269
 
270
- def _get_pass_through_columns(self, dataset: DataFrame) -> List[str]:
271
- if self._drop_input_cols:
272
- return []
273
- else:
274
- return list(set(dataset.columns) - set(self.output_cols))
275
-
276
270
  def _batch_inference_validate_snowpark(
277
271
  self,
278
272
  dataset: DataFrame,
279
273
  inference_method: str,
280
274
  ) -> List[str]:
281
- """Util method to run validate that batch inference can be run on a snowpark dataframe.
275
+ """Util method to run validate that batch inference can be run on a snowpark dataframe and
276
+ return the available package that exists in the snowflake anaconda channel
277
+
278
+ Args:
279
+ dataset: snowpark dataframe
280
+ inference_method: the inference method such as predict, score...
281
+
282
+ Raises:
283
+ SnowflakeMLException: If the estimator is not fitted, raise error
284
+ SnowflakeMLException: If the session is None, raise error
285
+
286
+ Returns:
287
+ A list of available package that exists in the snowflake anaconda channel
282
288
  """
283
289
  if not self._is_fitted:
284
290
  raise exceptions.SnowflakeMLException(
@@ -350,7 +356,7 @@ class SelectKBest(BaseTransformer):
350
356
  transform_kwargs = dict(
351
357
  session = dataset._session,
352
358
  dependencies = self._deps,
353
- pass_through_cols = self._get_pass_through_columns(dataset),
359
+ drop_input_cols = self._drop_input_cols,
354
360
  expected_output_cols_type = expected_type_inferred,
355
361
  )
356
362
 
@@ -412,16 +418,16 @@ class SelectKBest(BaseTransformer):
412
418
  # from (n_samples, n_estimators) to (n_samples, n_estimators * n_classes) (and everything in between)
413
419
  # based on init param values. We will convert that to pandas dataframe of shape (n_samples, 1) with
414
420
  # each row containing a list of values.
415
- expected_dtype = "ARRAY"
421
+ expected_dtype = "array"
416
422
 
417
423
  # If we were unable to assign a type to this transform in the factory, infer the type here.
418
424
  if expected_dtype == "":
419
- # If this is a clustering transformer, if the number of output columns does not equal the number of clusters the response will be an "ARRAY"
425
+ # If this is a clustering transformer, if the number of output columns does not equal the number of clusters the response will be an "array"
420
426
  if hasattr(self._sklearn_object, "n_clusters") and getattr(self._sklearn_object, "n_clusters") != len(self.output_cols):
421
- expected_dtype = "ARRAY"
422
- # If this is a decomposition transformer, if the number of output columns does not equal the number of components the response will be an "ARRAY"
427
+ expected_dtype = "array"
428
+ # If this is a decomposition transformer, if the number of output columns does not equal the number of components the response will be an "array"
423
429
  elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
424
- expected_dtype = "ARRAY"
430
+ expected_dtype = "array"
425
431
  else:
426
432
  output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
427
433
  # We can only infer the output types from the input types if the following two statemetns are true:
@@ -439,7 +445,7 @@ class SelectKBest(BaseTransformer):
439
445
  transform_kwargs = dict(
440
446
  session = dataset._session,
441
447
  dependencies = self._deps,
442
- pass_through_cols = self._get_pass_through_columns(dataset),
448
+ drop_input_cols = self._drop_input_cols,
443
449
  expected_output_cols_type = expected_dtype,
444
450
  )
445
451
 
@@ -490,7 +496,7 @@ class SelectKBest(BaseTransformer):
490
496
  subproject=_SUBPROJECT,
491
497
  )
492
498
  output_result, fitted_estimator = model_trainer.train_fit_predict(
493
- pass_through_columns=self._get_pass_through_columns(dataset),
499
+ drop_input_cols=self._drop_input_cols,
494
500
  expected_output_cols_list=self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix),
495
501
  )
496
502
  self._sklearn_object = fitted_estimator
@@ -508,44 +514,6 @@ class SelectKBest(BaseTransformer):
508
514
  assert self._sklearn_object is not None
509
515
  return self._sklearn_object.embedding_
510
516
 
511
-
512
- def _get_output_column_names(self, output_cols_prefix: str, output_cols: Optional[List[str]] = None) -> List[str]:
513
- """ Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
514
- Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
515
- """
516
- output_cols_prefix = identifier.resolve_identifier(output_cols_prefix)
517
- if output_cols:
518
- output_cols = [
519
- identifier.concat_names([output_cols_prefix, identifier.resolve_identifier(c)])
520
- for c in output_cols
521
- ]
522
- elif getattr(self._sklearn_object, "classes_", None) is None:
523
- output_cols = [output_cols_prefix]
524
- elif self._sklearn_object is not None:
525
- classes = self._sklearn_object.classes_
526
- if isinstance(classes, numpy.ndarray):
527
- output_cols = [f'{output_cols_prefix}{str(c)}' for c in classes.tolist()]
528
- elif isinstance(classes, list) and len(classes) > 0 and isinstance(classes[0], numpy.ndarray):
529
- # If the estimator is a multioutput estimator, classes_ will be a list of ndarrays.
530
- output_cols = []
531
- for i, cl in enumerate(classes):
532
- # For binary classification, there is only one output column for each class
533
- # ndarray as the two classes are complementary.
534
- if len(cl) == 2:
535
- output_cols.append(f'{output_cols_prefix}{i}_{cl[0]}')
536
- else:
537
- output_cols.extend([
538
- f'{output_cols_prefix}{i}_{c}' for c in cl.tolist()
539
- ])
540
- else:
541
- output_cols = []
542
-
543
- # Make sure column names are valid snowflake identifiers.
544
- assert output_cols is not None # Make MyPy happy
545
- rv = [identifier.rename_to_valid_snowflake_identifier(c) for c in output_cols]
546
-
547
- return rv
548
-
549
517
  @available_if(original_estimator_has_callable("predict_proba")) # type: ignore[misc]
550
518
  @telemetry.send_api_usage_telemetry(
551
519
  project=_PROJECT,
@@ -585,7 +553,7 @@ class SelectKBest(BaseTransformer):
585
553
  transform_kwargs = dict(
586
554
  session=dataset._session,
587
555
  dependencies=self._deps,
588
- pass_through_cols=self._get_pass_through_columns(dataset),
556
+ drop_input_cols = self._drop_input_cols,
589
557
  expected_output_cols_type="float",
590
558
  )
591
559
 
@@ -650,7 +618,7 @@ class SelectKBest(BaseTransformer):
650
618
  transform_kwargs = dict(
651
619
  session=dataset._session,
652
620
  dependencies=self._deps,
653
- pass_through_cols=self._get_pass_through_columns(dataset),
621
+ drop_input_cols = self._drop_input_cols,
654
622
  expected_output_cols_type="float",
655
623
  )
656
624
  elif isinstance(dataset, pd.DataFrame):
@@ -711,7 +679,7 @@ class SelectKBest(BaseTransformer):
711
679
  transform_kwargs = dict(
712
680
  session=dataset._session,
713
681
  dependencies=self._deps,
714
- pass_through_cols=self._get_pass_through_columns(dataset),
682
+ drop_input_cols = self._drop_input_cols,
715
683
  expected_output_cols_type="float",
716
684
  )
717
685
 
@@ -776,7 +744,7 @@ class SelectKBest(BaseTransformer):
776
744
  transform_kwargs = dict(
777
745
  session=dataset._session,
778
746
  dependencies=self._deps,
779
- pass_through_cols=self._get_pass_through_columns(dataset),
747
+ drop_input_cols = self._drop_input_cols,
780
748
  expected_output_cols_type="float",
781
749
  )
782
750
 
@@ -830,13 +798,17 @@ class SelectKBest(BaseTransformer):
830
798
  transform_kwargs: ScoreKwargsTypedDict = dict()
831
799
 
832
800
  if isinstance(dataset, DataFrame):
801
+ self._deps = self._batch_inference_validate_snowpark(
802
+ dataset=dataset,
803
+ inference_method="score",
804
+ )
833
805
  selected_cols = self._get_active_columns()
834
806
  if len(selected_cols) > 0:
835
807
  dataset = dataset.select(selected_cols)
836
808
  assert isinstance(dataset._session, Session) # keep mypy happy
837
809
  transform_kwargs = dict(
838
810
  session=dataset._session,
839
- dependencies=["snowflake-snowpark-python"] + self._get_dependencies(),
811
+ dependencies=["snowflake-snowpark-python"] + self._deps,
840
812
  score_sproc_imports=['sklearn'],
841
813
  )
842
814
  elif isinstance(dataset, pd.DataFrame):
@@ -910,9 +882,9 @@ class SelectKBest(BaseTransformer):
910
882
  transform_kwargs = dict(
911
883
  session = dataset._session,
912
884
  dependencies = self._deps,
913
- pass_through_cols = self._get_pass_through_columns(dataset),
914
- expected_output_cols_type = "array",
915
- n_neighbors = n_neighbors,
885
+ drop_input_cols = self._drop_input_cols,
886
+ expected_output_cols_type="array",
887
+ n_neighbors = n_neighbors,
916
888
  return_distance = return_distance
917
889
  )
918
890
  elif isinstance(dataset, pd.DataFrame):
@@ -266,18 +266,24 @@ class SelectPercentile(BaseTransformer):
266
266
  self._get_model_signatures(dataset)
267
267
  return self
268
268
 
269
- def _get_pass_through_columns(self, dataset: DataFrame) -> List[str]:
270
- if self._drop_input_cols:
271
- return []
272
- else:
273
- return list(set(dataset.columns) - set(self.output_cols))
274
-
275
269
  def _batch_inference_validate_snowpark(
276
270
  self,
277
271
  dataset: DataFrame,
278
272
  inference_method: str,
279
273
  ) -> List[str]:
280
- """Util method to run validate that batch inference can be run on a snowpark dataframe.
274
+ """Util method to run validate that batch inference can be run on a snowpark dataframe and
275
+ return the available package that exists in the snowflake anaconda channel
276
+
277
+ Args:
278
+ dataset: snowpark dataframe
279
+ inference_method: the inference method such as predict, score...
280
+
281
+ Raises:
282
+ SnowflakeMLException: If the estimator is not fitted, raise error
283
+ SnowflakeMLException: If the session is None, raise error
284
+
285
+ Returns:
286
+ A list of available package that exists in the snowflake anaconda channel
281
287
  """
282
288
  if not self._is_fitted:
283
289
  raise exceptions.SnowflakeMLException(
@@ -349,7 +355,7 @@ class SelectPercentile(BaseTransformer):
349
355
  transform_kwargs = dict(
350
356
  session = dataset._session,
351
357
  dependencies = self._deps,
352
- pass_through_cols = self._get_pass_through_columns(dataset),
358
+ drop_input_cols = self._drop_input_cols,
353
359
  expected_output_cols_type = expected_type_inferred,
354
360
  )
355
361
 
@@ -411,16 +417,16 @@ class SelectPercentile(BaseTransformer):
411
417
  # from (n_samples, n_estimators) to (n_samples, n_estimators * n_classes) (and everything in between)
412
418
  # based on init param values. We will convert that to pandas dataframe of shape (n_samples, 1) with
413
419
  # each row containing a list of values.
414
- expected_dtype = "ARRAY"
420
+ expected_dtype = "array"
415
421
 
416
422
  # If we were unable to assign a type to this transform in the factory, infer the type here.
417
423
  if expected_dtype == "":
418
- # If this is a clustering transformer, if the number of output columns does not equal the number of clusters the response will be an "ARRAY"
424
+ # If this is a clustering transformer, if the number of output columns does not equal the number of clusters the response will be an "array"
419
425
  if hasattr(self._sklearn_object, "n_clusters") and getattr(self._sklearn_object, "n_clusters") != len(self.output_cols):
420
- expected_dtype = "ARRAY"
421
- # If this is a decomposition transformer, if the number of output columns does not equal the number of components the response will be an "ARRAY"
426
+ expected_dtype = "array"
427
+ # If this is a decomposition transformer, if the number of output columns does not equal the number of components the response will be an "array"
422
428
  elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
423
- expected_dtype = "ARRAY"
429
+ expected_dtype = "array"
424
430
  else:
425
431
  output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
426
432
  # We can only infer the output types from the input types if the following two statemetns are true:
@@ -438,7 +444,7 @@ class SelectPercentile(BaseTransformer):
438
444
  transform_kwargs = dict(
439
445
  session = dataset._session,
440
446
  dependencies = self._deps,
441
- pass_through_cols = self._get_pass_through_columns(dataset),
447
+ drop_input_cols = self._drop_input_cols,
442
448
  expected_output_cols_type = expected_dtype,
443
449
  )
444
450
 
@@ -489,7 +495,7 @@ class SelectPercentile(BaseTransformer):
489
495
  subproject=_SUBPROJECT,
490
496
  )
491
497
  output_result, fitted_estimator = model_trainer.train_fit_predict(
492
- pass_through_columns=self._get_pass_through_columns(dataset),
498
+ drop_input_cols=self._drop_input_cols,
493
499
  expected_output_cols_list=self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix),
494
500
  )
495
501
  self._sklearn_object = fitted_estimator
@@ -507,44 +513,6 @@ class SelectPercentile(BaseTransformer):
507
513
  assert self._sklearn_object is not None
508
514
  return self._sklearn_object.embedding_
509
515
 
510
-
511
- def _get_output_column_names(self, output_cols_prefix: str, output_cols: Optional[List[str]] = None) -> List[str]:
512
- """ Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
513
- Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
514
- """
515
- output_cols_prefix = identifier.resolve_identifier(output_cols_prefix)
516
- if output_cols:
517
- output_cols = [
518
- identifier.concat_names([output_cols_prefix, identifier.resolve_identifier(c)])
519
- for c in output_cols
520
- ]
521
- elif getattr(self._sklearn_object, "classes_", None) is None:
522
- output_cols = [output_cols_prefix]
523
- elif self._sklearn_object is not None:
524
- classes = self._sklearn_object.classes_
525
- if isinstance(classes, numpy.ndarray):
526
- output_cols = [f'{output_cols_prefix}{str(c)}' for c in classes.tolist()]
527
- elif isinstance(classes, list) and len(classes) > 0 and isinstance(classes[0], numpy.ndarray):
528
- # If the estimator is a multioutput estimator, classes_ will be a list of ndarrays.
529
- output_cols = []
530
- for i, cl in enumerate(classes):
531
- # For binary classification, there is only one output column for each class
532
- # ndarray as the two classes are complementary.
533
- if len(cl) == 2:
534
- output_cols.append(f'{output_cols_prefix}{i}_{cl[0]}')
535
- else:
536
- output_cols.extend([
537
- f'{output_cols_prefix}{i}_{c}' for c in cl.tolist()
538
- ])
539
- else:
540
- output_cols = []
541
-
542
- # Make sure column names are valid snowflake identifiers.
543
- assert output_cols is not None # Make MyPy happy
544
- rv = [identifier.rename_to_valid_snowflake_identifier(c) for c in output_cols]
545
-
546
- return rv
547
-
548
516
  @available_if(original_estimator_has_callable("predict_proba")) # type: ignore[misc]
549
517
  @telemetry.send_api_usage_telemetry(
550
518
  project=_PROJECT,
@@ -584,7 +552,7 @@ class SelectPercentile(BaseTransformer):
584
552
  transform_kwargs = dict(
585
553
  session=dataset._session,
586
554
  dependencies=self._deps,
587
- pass_through_cols=self._get_pass_through_columns(dataset),
555
+ drop_input_cols = self._drop_input_cols,
588
556
  expected_output_cols_type="float",
589
557
  )
590
558
 
@@ -649,7 +617,7 @@ class SelectPercentile(BaseTransformer):
649
617
  transform_kwargs = dict(
650
618
  session=dataset._session,
651
619
  dependencies=self._deps,
652
- pass_through_cols=self._get_pass_through_columns(dataset),
620
+ drop_input_cols = self._drop_input_cols,
653
621
  expected_output_cols_type="float",
654
622
  )
655
623
  elif isinstance(dataset, pd.DataFrame):
@@ -710,7 +678,7 @@ class SelectPercentile(BaseTransformer):
710
678
  transform_kwargs = dict(
711
679
  session=dataset._session,
712
680
  dependencies=self._deps,
713
- pass_through_cols=self._get_pass_through_columns(dataset),
681
+ drop_input_cols = self._drop_input_cols,
714
682
  expected_output_cols_type="float",
715
683
  )
716
684
 
@@ -775,7 +743,7 @@ class SelectPercentile(BaseTransformer):
775
743
  transform_kwargs = dict(
776
744
  session=dataset._session,
777
745
  dependencies=self._deps,
778
- pass_through_cols=self._get_pass_through_columns(dataset),
746
+ drop_input_cols = self._drop_input_cols,
779
747
  expected_output_cols_type="float",
780
748
  )
781
749
 
@@ -829,13 +797,17 @@ class SelectPercentile(BaseTransformer):
829
797
  transform_kwargs: ScoreKwargsTypedDict = dict()
830
798
 
831
799
  if isinstance(dataset, DataFrame):
800
+ self._deps = self._batch_inference_validate_snowpark(
801
+ dataset=dataset,
802
+ inference_method="score",
803
+ )
832
804
  selected_cols = self._get_active_columns()
833
805
  if len(selected_cols) > 0:
834
806
  dataset = dataset.select(selected_cols)
835
807
  assert isinstance(dataset._session, Session) # keep mypy happy
836
808
  transform_kwargs = dict(
837
809
  session=dataset._session,
838
- dependencies=["snowflake-snowpark-python"] + self._get_dependencies(),
810
+ dependencies=["snowflake-snowpark-python"] + self._deps,
839
811
  score_sproc_imports=['sklearn'],
840
812
  )
841
813
  elif isinstance(dataset, pd.DataFrame):
@@ -909,9 +881,9 @@ class SelectPercentile(BaseTransformer):
909
881
  transform_kwargs = dict(
910
882
  session = dataset._session,
911
883
  dependencies = self._deps,
912
- pass_through_cols = self._get_pass_through_columns(dataset),
913
- expected_output_cols_type = "array",
914
- n_neighbors = n_neighbors,
884
+ drop_input_cols = self._drop_input_cols,
885
+ expected_output_cols_type="array",
886
+ n_neighbors = n_neighbors,
915
887
  return_distance = return_distance
916
888
  )
917
889
  elif isinstance(dataset, pd.DataFrame):