snowflake-ml-python 1.3.0__py3-none-any.whl → 1.4.0__py3-none-any.whl

This diff shows the changes between two publicly released versions of this package, as published to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in the registry.
Files changed (211)
  1. snowflake/ml/_internal/file_utils.py +3 -3
  2. snowflake/ml/_internal/human_readable_id/adjectives.txt +128 -0
  3. snowflake/ml/_internal/human_readable_id/animals.txt +128 -0
  4. snowflake/ml/_internal/human_readable_id/hrid_generator.py +40 -0
  5. snowflake/ml/_internal/human_readable_id/hrid_generator_base.py +135 -0
  6. snowflake/ml/_internal/telemetry.py +11 -2
  7. snowflake/ml/_internal/utils/formatting.py +1 -1
  8. snowflake/ml/feature_store/feature_store.py +15 -106
  9. snowflake/ml/fileset/sfcfs.py +4 -3
  10. snowflake/ml/fileset/stage_fs.py +18 -0
  11. snowflake/ml/model/_api.py +9 -9
  12. snowflake/ml/model/_client/model/model_version_impl.py +20 -15
  13. snowflake/ml/model/_deploy_client/image_builds/docker_context.py +3 -9
  14. snowflake/ml/model/_deploy_client/image_builds/server_image_builder.py +3 -5
  15. snowflake/ml/model/_deploy_client/snowservice/deploy.py +7 -6
  16. snowflake/ml/model/_model_composer/model_composer.py +10 -8
  17. snowflake/ml/model/_model_composer/model_method/function_generator.py +1 -1
  18. snowflake/ml/model/_model_composer/model_method/infer_table_function.py_template +2 -1
  19. snowflake/ml/model/_model_composer/model_method/model_method.py +2 -2
  20. snowflake/ml/model/_model_composer/model_runtime/_runtime_requirements.py +1 -1
  21. snowflake/ml/model/_packager/model_handlers/_base.py +2 -2
  22. snowflake/ml/model/_packager/model_handlers/_utils.py +5 -5
  23. snowflake/ml/model/_packager/model_handlers/custom.py +7 -7
  24. snowflake/ml/model/_packager/model_handlers/huggingface_pipeline.py +2 -2
  25. snowflake/ml/model/_packager/model_handlers/llm.py +1 -1
  26. snowflake/ml/model/_packager/model_handlers/mlflow.py +1 -1
  27. snowflake/ml/model/_packager/model_handlers/pytorch.py +13 -10
  28. snowflake/ml/model/_packager/model_handlers/sentence_transformers.py +214 -0
  29. snowflake/ml/model/_packager/model_handlers/sklearn.py +6 -6
  30. snowflake/ml/model/_packager/model_handlers/snowmlmodel.py +15 -3
  31. snowflake/ml/model/_packager/model_handlers/tensorflow.py +8 -8
  32. snowflake/ml/model/_packager/model_handlers/torchscript.py +7 -7
  33. snowflake/ml/model/_packager/model_handlers/xgboost.py +8 -8
  34. snowflake/ml/model/_packager/model_meta/_core_requirements.py +1 -1
  35. snowflake/ml/model/_packager/model_packager.py +8 -6
  36. snowflake/ml/model/custom_model.py +3 -1
  37. snowflake/ml/model/type_hints.py +13 -0
  38. snowflake/ml/modeling/_internal/estimator_utils.py +61 -1
  39. snowflake/ml/modeling/_internal/local_implementations/pandas_handlers.py +4 -43
  40. snowflake/ml/modeling/_internal/local_implementations/pandas_trainer.py +4 -4
  41. snowflake/ml/modeling/_internal/ml_runtime_implementations/ml_runtime_handlers.py +21 -17
  42. snowflake/ml/modeling/_internal/model_specifications.py +3 -1
  43. snowflake/ml/modeling/_internal/model_trainer.py +2 -2
  44. snowflake/ml/modeling/_internal/snowpark_implementations/distributed_hpo_trainer.py +547 -1
  45. snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_handlers.py +67 -114
  46. snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_trainer.py +9 -9
  47. snowflake/ml/modeling/_internal/transformer_protocols.py +2 -3
  48. snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +33 -61
  49. snowflake/ml/modeling/cluster/affinity_propagation.py +33 -61
  50. snowflake/ml/modeling/cluster/agglomerative_clustering.py +33 -61
  51. snowflake/ml/modeling/cluster/birch.py +33 -61
  52. snowflake/ml/modeling/cluster/bisecting_k_means.py +33 -61
  53. snowflake/ml/modeling/cluster/dbscan.py +33 -61
  54. snowflake/ml/modeling/cluster/feature_agglomeration.py +33 -61
  55. snowflake/ml/modeling/cluster/k_means.py +33 -61
  56. snowflake/ml/modeling/cluster/mean_shift.py +33 -61
  57. snowflake/ml/modeling/cluster/mini_batch_k_means.py +33 -61
  58. snowflake/ml/modeling/cluster/optics.py +33 -61
  59. snowflake/ml/modeling/cluster/spectral_biclustering.py +33 -61
  60. snowflake/ml/modeling/cluster/spectral_clustering.py +33 -61
  61. snowflake/ml/modeling/cluster/spectral_coclustering.py +33 -61
  62. snowflake/ml/modeling/compose/column_transformer.py +33 -61
  63. snowflake/ml/modeling/compose/transformed_target_regressor.py +33 -61
  64. snowflake/ml/modeling/covariance/elliptic_envelope.py +33 -61
  65. snowflake/ml/modeling/covariance/empirical_covariance.py +33 -61
  66. snowflake/ml/modeling/covariance/graphical_lasso.py +33 -61
  67. snowflake/ml/modeling/covariance/graphical_lasso_cv.py +33 -61
  68. snowflake/ml/modeling/covariance/ledoit_wolf.py +33 -61
  69. snowflake/ml/modeling/covariance/min_cov_det.py +33 -61
  70. snowflake/ml/modeling/covariance/oas.py +33 -61
  71. snowflake/ml/modeling/covariance/shrunk_covariance.py +33 -61
  72. snowflake/ml/modeling/decomposition/dictionary_learning.py +33 -61
  73. snowflake/ml/modeling/decomposition/factor_analysis.py +33 -61
  74. snowflake/ml/modeling/decomposition/fast_ica.py +33 -61
  75. snowflake/ml/modeling/decomposition/incremental_pca.py +33 -61
  76. snowflake/ml/modeling/decomposition/kernel_pca.py +33 -61
  77. snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +33 -61
  78. snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +33 -61
  79. snowflake/ml/modeling/decomposition/pca.py +33 -61
  80. snowflake/ml/modeling/decomposition/sparse_pca.py +33 -61
  81. snowflake/ml/modeling/decomposition/truncated_svd.py +33 -61
  82. snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +33 -61
  83. snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +33 -61
  84. snowflake/ml/modeling/ensemble/ada_boost_classifier.py +33 -61
  85. snowflake/ml/modeling/ensemble/ada_boost_regressor.py +33 -61
  86. snowflake/ml/modeling/ensemble/bagging_classifier.py +33 -61
  87. snowflake/ml/modeling/ensemble/bagging_regressor.py +33 -61
  88. snowflake/ml/modeling/ensemble/extra_trees_classifier.py +33 -61
  89. snowflake/ml/modeling/ensemble/extra_trees_regressor.py +33 -61
  90. snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +33 -61
  91. snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +33 -61
  92. snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +33 -61
  93. snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +33 -61
  94. snowflake/ml/modeling/ensemble/isolation_forest.py +33 -61
  95. snowflake/ml/modeling/ensemble/random_forest_classifier.py +33 -61
  96. snowflake/ml/modeling/ensemble/random_forest_regressor.py +33 -61
  97. snowflake/ml/modeling/ensemble/stacking_regressor.py +33 -61
  98. snowflake/ml/modeling/ensemble/voting_classifier.py +33 -61
  99. snowflake/ml/modeling/ensemble/voting_regressor.py +33 -61
  100. snowflake/ml/modeling/feature_selection/generic_univariate_select.py +33 -61
  101. snowflake/ml/modeling/feature_selection/select_fdr.py +33 -61
  102. snowflake/ml/modeling/feature_selection/select_fpr.py +33 -61
  103. snowflake/ml/modeling/feature_selection/select_fwe.py +33 -61
  104. snowflake/ml/modeling/feature_selection/select_k_best.py +33 -61
  105. snowflake/ml/modeling/feature_selection/select_percentile.py +33 -61
  106. snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +33 -61
  107. snowflake/ml/modeling/feature_selection/variance_threshold.py +33 -61
  108. snowflake/ml/modeling/framework/base.py +55 -5
  109. snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +33 -61
  110. snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +33 -61
  111. snowflake/ml/modeling/impute/iterative_imputer.py +33 -61
  112. snowflake/ml/modeling/impute/knn_imputer.py +33 -61
  113. snowflake/ml/modeling/impute/missing_indicator.py +33 -61
  114. snowflake/ml/modeling/impute/simple_imputer.py +4 -15
  115. snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +33 -61
  116. snowflake/ml/modeling/kernel_approximation/nystroem.py +33 -61
  117. snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +33 -61
  118. snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +33 -61
  119. snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +33 -61
  120. snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +33 -61
  121. snowflake/ml/modeling/lightgbm/lgbm_classifier.py +36 -63
  122. snowflake/ml/modeling/lightgbm/lgbm_regressor.py +36 -63
  123. snowflake/ml/modeling/linear_model/ard_regression.py +33 -61
  124. snowflake/ml/modeling/linear_model/bayesian_ridge.py +33 -61
  125. snowflake/ml/modeling/linear_model/elastic_net.py +33 -61
  126. snowflake/ml/modeling/linear_model/elastic_net_cv.py +33 -61
  127. snowflake/ml/modeling/linear_model/gamma_regressor.py +33 -61
  128. snowflake/ml/modeling/linear_model/huber_regressor.py +33 -61
  129. snowflake/ml/modeling/linear_model/lars.py +33 -61
  130. snowflake/ml/modeling/linear_model/lars_cv.py +33 -61
  131. snowflake/ml/modeling/linear_model/lasso.py +33 -61
  132. snowflake/ml/modeling/linear_model/lasso_cv.py +33 -61
  133. snowflake/ml/modeling/linear_model/lasso_lars.py +33 -61
  134. snowflake/ml/modeling/linear_model/lasso_lars_cv.py +33 -61
  135. snowflake/ml/modeling/linear_model/lasso_lars_ic.py +33 -61
  136. snowflake/ml/modeling/linear_model/linear_regression.py +33 -61
  137. snowflake/ml/modeling/linear_model/logistic_regression.py +33 -61
  138. snowflake/ml/modeling/linear_model/logistic_regression_cv.py +33 -61
  139. snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +33 -61
  140. snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +33 -61
  141. snowflake/ml/modeling/linear_model/multi_task_lasso.py +33 -61
  142. snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +33 -61
  143. snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +33 -61
  144. snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +33 -61
  145. snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +33 -61
  146. snowflake/ml/modeling/linear_model/perceptron.py +33 -61
  147. snowflake/ml/modeling/linear_model/poisson_regressor.py +33 -61
  148. snowflake/ml/modeling/linear_model/ransac_regressor.py +33 -61
  149. snowflake/ml/modeling/linear_model/ridge.py +33 -61
  150. snowflake/ml/modeling/linear_model/ridge_classifier.py +33 -61
  151. snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +33 -61
  152. snowflake/ml/modeling/linear_model/ridge_cv.py +33 -61
  153. snowflake/ml/modeling/linear_model/sgd_classifier.py +33 -61
  154. snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +33 -61
  155. snowflake/ml/modeling/linear_model/sgd_regressor.py +33 -61
  156. snowflake/ml/modeling/linear_model/theil_sen_regressor.py +33 -61
  157. snowflake/ml/modeling/linear_model/tweedie_regressor.py +33 -61
  158. snowflake/ml/modeling/manifold/isomap.py +33 -61
  159. snowflake/ml/modeling/manifold/mds.py +33 -61
  160. snowflake/ml/modeling/manifold/spectral_embedding.py +33 -61
  161. snowflake/ml/modeling/manifold/tsne.py +33 -61
  162. snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +33 -61
  163. snowflake/ml/modeling/mixture/gaussian_mixture.py +33 -61
  164. snowflake/ml/modeling/model_selection/grid_search_cv.py +39 -57
  165. snowflake/ml/modeling/model_selection/randomized_search_cv.py +26 -57
  166. snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +33 -61
  167. snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +33 -61
  168. snowflake/ml/modeling/multiclass/output_code_classifier.py +33 -61
  169. snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +33 -61
  170. snowflake/ml/modeling/naive_bayes/categorical_nb.py +33 -61
  171. snowflake/ml/modeling/naive_bayes/complement_nb.py +33 -61
  172. snowflake/ml/modeling/naive_bayes/gaussian_nb.py +33 -61
  173. snowflake/ml/modeling/naive_bayes/multinomial_nb.py +33 -61
  174. snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +33 -61
  175. snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +33 -61
  176. snowflake/ml/modeling/neighbors/kernel_density.py +33 -61
  177. snowflake/ml/modeling/neighbors/local_outlier_factor.py +33 -61
  178. snowflake/ml/modeling/neighbors/nearest_centroid.py +33 -61
  179. snowflake/ml/modeling/neighbors/nearest_neighbors.py +33 -61
  180. snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +33 -61
  181. snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +33 -61
  182. snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +33 -61
  183. snowflake/ml/modeling/neural_network/bernoulli_rbm.py +33 -61
  184. snowflake/ml/modeling/neural_network/mlp_classifier.py +33 -61
  185. snowflake/ml/modeling/neural_network/mlp_regressor.py +33 -61
  186. snowflake/ml/modeling/preprocessing/polynomial_features.py +33 -61
  187. snowflake/ml/modeling/semi_supervised/label_propagation.py +33 -61
  188. snowflake/ml/modeling/semi_supervised/label_spreading.py +33 -61
  189. snowflake/ml/modeling/svm/linear_svc.py +33 -61
  190. snowflake/ml/modeling/svm/linear_svr.py +33 -61
  191. snowflake/ml/modeling/svm/nu_svc.py +33 -61
  192. snowflake/ml/modeling/svm/nu_svr.py +33 -61
  193. snowflake/ml/modeling/svm/svc.py +33 -61
  194. snowflake/ml/modeling/svm/svr.py +33 -61
  195. snowflake/ml/modeling/tree/decision_tree_classifier.py +33 -61
  196. snowflake/ml/modeling/tree/decision_tree_regressor.py +33 -61
  197. snowflake/ml/modeling/tree/extra_tree_classifier.py +33 -61
  198. snowflake/ml/modeling/tree/extra_tree_regressor.py +33 -61
  199. snowflake/ml/modeling/xgboost/xgb_classifier.py +33 -61
  200. snowflake/ml/modeling/xgboost/xgb_regressor.py +33 -61
  201. snowflake/ml/modeling/xgboost/xgbrf_classifier.py +33 -61
  202. snowflake/ml/modeling/xgboost/xgbrf_regressor.py +33 -61
  203. snowflake/ml/registry/_manager/model_manager.py +6 -2
  204. snowflake/ml/registry/model_registry.py +100 -27
  205. snowflake/ml/registry/registry.py +6 -2
  206. snowflake/ml/version.py +1 -1
  207. {snowflake_ml_python-1.3.0.dist-info → snowflake_ml_python-1.4.0.dist-info}/METADATA +43 -7
  208. {snowflake_ml_python-1.3.0.dist-info → snowflake_ml_python-1.4.0.dist-info}/RECORD +211 -206
  209. {snowflake_ml_python-1.3.0.dist-info → snowflake_ml_python-1.4.0.dist-info}/LICENSE.txt +0 -0
  210. {snowflake_ml_python-1.3.0.dist-info → snowflake_ml_python-1.4.0.dist-info}/WHEEL +0 -0
  211. {snowflake_ml_python-1.3.0.dist-info → snowflake_ml_python-1.4.0.dist-info}/top_level.txt +0 -0
snowflake/ml/modeling/feature_selection/generic_univariate_select.py
@@ -270,18 +270,24 @@ class GenericUnivariateSelect(BaseTransformer):
         self._get_model_signatures(dataset)
         return self
 
-    def _get_pass_through_columns(self, dataset: DataFrame) -> List[str]:
-        if self._drop_input_cols:
-            return []
-        else:
-            return list(set(dataset.columns) - set(self.output_cols))
-
     def _batch_inference_validate_snowpark(
         self,
         dataset: DataFrame,
         inference_method: str,
     ) -> List[str]:
-        """Util method to run validate that batch inference can be run on a snowpark dataframe.
+        """Util method to run validate that batch inference can be run on a snowpark dataframe and
+        return the available package that exists in the snowflake anaconda channel
+
+        Args:
+            dataset: snowpark dataframe
+            inference_method: the inference method such as predict, score...
+
+        Raises:
+            SnowflakeMLException: If the estimator is not fitted, raise error
+            SnowflakeMLException: If the session is None, raise error
+
+        Returns:
+            A list of available package that exists in the snowflake anaconda channel
         """
         if not self._is_fitted:
             raise exceptions.SnowflakeMLException(
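
The expanded docstring records a real contract change: _batch_inference_validate_snowpark now returns the list of dependency packages it validated against the Snowflake Anaconda channel instead of only raising on failure. The score hunk further down shows the new call-site pattern; recombined here for readability (the lines are taken from this diff):

    # Validate that batch inference can run on the Snowpark DataFrame and
    # capture the packages resolved from the Snowflake Anaconda channel.
    self._deps = self._batch_inference_validate_snowpark(
        dataset=dataset,
        inference_method="score",
    )
    # The resolved packages then feed the scoring stored procedure:
    dependencies = ["snowflake-snowpark-python"] + self._deps
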
@@ -353,7 +359,7 @@ class GenericUnivariateSelect(BaseTransformer):
             transform_kwargs = dict(
                 session = dataset._session,
                 dependencies = self._deps,
-                pass_through_cols = self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type = expected_type_inferred,
             )
 
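
This is the first of many hunks in this file that swap the precomputed pass_through_cols argument for the raw drop_input_cols flag. The deleted _get_pass_through_columns helper (first hunk above) derived the pass-through set from that flag, so the handlers receiving drop_input_cols can presumably rederive it themselves. A minimal standalone sketch of the relocated logic, copied from the removed helper (the free-function form is illustrative, not the actual 1.4.0 handler code):

    from typing import List

    def pass_through_columns(drop_input_cols: bool, dataset_columns: List[str], output_cols: List[str]) -> List[str]:
        # Same rule as the removed BaseTransformer._get_pass_through_columns:
        # pass through every dataset column that is not an output column,
        # unless the caller asked to drop the input columns entirely.
        if drop_input_cols:
            return []
        return list(set(dataset_columns) - set(output_cols))
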
@@ -415,16 +421,16 @@ class GenericUnivariateSelect(BaseTransformer):
         # from (n_samples, n_estimators) to (n_samples, n_estimators * n_classes) (and everything in between)
         # based on init param values. We will convert that to pandas dataframe of shape (n_samples, 1) with
         # each row containing a list of values.
-        expected_dtype = "ARRAY"
+        expected_dtype = "array"
 
         # If we were unable to assign a type to this transform in the factory, infer the type here.
         if expected_dtype == "":
-            # If this is a clustering transformer, if the number of output columns does not equal the number of clusters the response will be an "ARRAY"
+            # If this is a clustering transformer, if the number of output columns does not equal the number of clusters the response will be an "array"
             if hasattr(self._sklearn_object, "n_clusters") and getattr(self._sklearn_object, "n_clusters") != len(self.output_cols):
-                expected_dtype = "ARRAY"
-            # If this is a decomposition transformer, if the number of output columns does not equal the number of components the response will be an "ARRAY"
+                expected_dtype = "array"
+            # If this is a decomposition transformer, if the number of output columns does not equal the number of components the response will be an "array"
             elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
-                expected_dtype = "ARRAY"
+                expected_dtype = "array"
             else:
                 output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
                 # We can only infer the output types from the input types if the following two statemetns are true:
@@ -442,7 +448,7 @@ class GenericUnivariateSelect(BaseTransformer):
             transform_kwargs = dict(
                 session = dataset._session,
                 dependencies = self._deps,
-                pass_through_cols = self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type = expected_dtype,
             )
 
@@ -493,7 +499,7 @@ class GenericUnivariateSelect(BaseTransformer):
             subproject=_SUBPROJECT,
         )
         output_result, fitted_estimator = model_trainer.train_fit_predict(
-            pass_through_columns=self._get_pass_through_columns(dataset),
+            drop_input_cols=self._drop_input_cols,
             expected_output_cols_list=self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix),
         )
         self._sklearn_object = fitted_estimator
@@ -511,44 +517,6 @@ class GenericUnivariateSelect(BaseTransformer):
         assert self._sklearn_object is not None
         return self._sklearn_object.embedding_
 
-
-    def _get_output_column_names(self, output_cols_prefix: str, output_cols: Optional[List[str]] = None) -> List[str]:
-        """ Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
-        Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
-        """
-        output_cols_prefix = identifier.resolve_identifier(output_cols_prefix)
-        if output_cols:
-            output_cols = [
-                identifier.concat_names([output_cols_prefix, identifier.resolve_identifier(c)])
-                for c in output_cols
-            ]
-        elif getattr(self._sklearn_object, "classes_", None) is None:
-            output_cols = [output_cols_prefix]
-        elif self._sklearn_object is not None:
-            classes = self._sklearn_object.classes_
-            if isinstance(classes, numpy.ndarray):
-                output_cols = [f'{output_cols_prefix}{str(c)}' for c in classes.tolist()]
-            elif isinstance(classes, list) and len(classes) > 0 and isinstance(classes[0], numpy.ndarray):
-                # If the estimator is a multioutput estimator, classes_ will be a list of ndarrays.
-                output_cols = []
-                for i, cl in enumerate(classes):
-                    # For binary classification, there is only one output column for each class
-                    # ndarray as the two classes are complementary.
-                    if len(cl) == 2:
-                        output_cols.append(f'{output_cols_prefix}{i}_{cl[0]}')
-                    else:
-                        output_cols.extend([
-                            f'{output_cols_prefix}{i}_{c}' for c in cl.tolist()
-                        ])
-        else:
-            output_cols = []
-
-        # Make sure column names are valid snowflake identifiers.
-        assert output_cols is not None  # Make MyPy happy
-        rv = [identifier.rename_to_valid_snowflake_identifier(c) for c in output_cols]
-
-        return rv
-
     @available_if(original_estimator_has_callable("predict_proba"))  # type: ignore[misc]
     @telemetry.send_api_usage_telemetry(
         project=_PROJECT,
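
_get_output_column_names is deleted from each generated wrapper in this release; given that snowflake/ml/modeling/_internal/estimator_utils.py gains 61 lines (file 38 in the list above), the helper was most likely centralized there rather than dropped. For reference, a runnable sketch of the naming scheme the removed code implemented for the plain-classifier case (class values and prefix are illustrative):

    import numpy

    # A fitted classifier exposes classes_; the removed helper emitted one
    # output column per class, each prefixed with e.g. "PREDICT_PROBA_".
    classes = numpy.array([0, 1, 2])
    output_cols = [f"PREDICT_PROBA_{c}" for c in classes.tolist()]
    print(output_cols)  # ['PREDICT_PROBA_0', 'PREDICT_PROBA_1', 'PREDICT_PROBA_2']
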
@@ -588,7 +556,7 @@ class GenericUnivariateSelect(BaseTransformer):
             transform_kwargs = dict(
                 session=dataset._session,
                 dependencies=self._deps,
-                pass_through_cols=self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type="float",
             )
 
@@ -653,7 +621,7 @@ class GenericUnivariateSelect(BaseTransformer):
             transform_kwargs = dict(
                 session=dataset._session,
                 dependencies=self._deps,
-                pass_through_cols=self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type="float",
             )
         elif isinstance(dataset, pd.DataFrame):
@@ -714,7 +682,7 @@ class GenericUnivariateSelect(BaseTransformer):
             transform_kwargs = dict(
                 session=dataset._session,
                 dependencies=self._deps,
-                pass_through_cols=self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type="float",
             )
 
@@ -779,7 +747,7 @@ class GenericUnivariateSelect(BaseTransformer):
             transform_kwargs = dict(
                 session=dataset._session,
                 dependencies=self._deps,
-                pass_through_cols=self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type="float",
             )
 
@@ -833,13 +801,17 @@ class GenericUnivariateSelect(BaseTransformer):
         transform_kwargs: ScoreKwargsTypedDict = dict()
 
         if isinstance(dataset, DataFrame):
+            self._deps = self._batch_inference_validate_snowpark(
+                dataset=dataset,
+                inference_method="score",
+            )
             selected_cols = self._get_active_columns()
             if len(selected_cols) > 0:
                 dataset = dataset.select(selected_cols)
             assert isinstance(dataset._session, Session)  # keep mypy happy
             transform_kwargs = dict(
                 session=dataset._session,
-                dependencies=["snowflake-snowpark-python"] + self._get_dependencies(),
+                dependencies=["snowflake-snowpark-python"] + self._deps,
                 score_sproc_imports=['sklearn'],
             )
         elif isinstance(dataset, pd.DataFrame):
@@ -913,9 +885,9 @@ class GenericUnivariateSelect(BaseTransformer):
             transform_kwargs = dict(
                 session = dataset._session,
                 dependencies = self._deps,
-                pass_through_cols = self._get_pass_through_columns(dataset),
-                expected_output_cols_type = "array",
-                n_neighbors = n_neighbors,
+                drop_input_cols = self._drop_input_cols,
+                expected_output_cols_type="array",
+                n_neighbors = n_neighbors,
                 return_distance = return_distance
             )
         elif isinstance(dataset, pd.DataFrame):
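
The two diffs below apply the identical change set to SelectFdr and SelectFpr; judging from the uniform +33 −61 stats in the file list, the same generated-code refactor lands in nearly every other autogenerated modeling wrapper in this release.
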
snowflake/ml/modeling/feature_selection/select_fdr.py
@@ -266,18 +266,24 @@ class SelectFdr(BaseTransformer):
         self._get_model_signatures(dataset)
         return self
 
-    def _get_pass_through_columns(self, dataset: DataFrame) -> List[str]:
-        if self._drop_input_cols:
-            return []
-        else:
-            return list(set(dataset.columns) - set(self.output_cols))
-
     def _batch_inference_validate_snowpark(
         self,
         dataset: DataFrame,
         inference_method: str,
     ) -> List[str]:
-        """Util method to run validate that batch inference can be run on a snowpark dataframe.
+        """Util method to run validate that batch inference can be run on a snowpark dataframe and
+        return the available package that exists in the snowflake anaconda channel
+
+        Args:
+            dataset: snowpark dataframe
+            inference_method: the inference method such as predict, score...
+
+        Raises:
+            SnowflakeMLException: If the estimator is not fitted, raise error
+            SnowflakeMLException: If the session is None, raise error
+
+        Returns:
+            A list of available package that exists in the snowflake anaconda channel
         """
         if not self._is_fitted:
             raise exceptions.SnowflakeMLException(
@@ -349,7 +355,7 @@ class SelectFdr(BaseTransformer):
             transform_kwargs = dict(
                 session = dataset._session,
                 dependencies = self._deps,
-                pass_through_cols = self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type = expected_type_inferred,
             )
 
@@ -411,16 +417,16 @@ class SelectFdr(BaseTransformer):
         # from (n_samples, n_estimators) to (n_samples, n_estimators * n_classes) (and everything in between)
        # based on init param values. We will convert that to pandas dataframe of shape (n_samples, 1) with
         # each row containing a list of values.
-        expected_dtype = "ARRAY"
+        expected_dtype = "array"
 
         # If we were unable to assign a type to this transform in the factory, infer the type here.
         if expected_dtype == "":
-            # If this is a clustering transformer, if the number of output columns does not equal the number of clusters the response will be an "ARRAY"
+            # If this is a clustering transformer, if the number of output columns does not equal the number of clusters the response will be an "array"
             if hasattr(self._sklearn_object, "n_clusters") and getattr(self._sklearn_object, "n_clusters") != len(self.output_cols):
-                expected_dtype = "ARRAY"
-            # If this is a decomposition transformer, if the number of output columns does not equal the number of components the response will be an "ARRAY"
+                expected_dtype = "array"
+            # If this is a decomposition transformer, if the number of output columns does not equal the number of components the response will be an "array"
             elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
-                expected_dtype = "ARRAY"
+                expected_dtype = "array"
             else:
                 output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
                 # We can only infer the output types from the input types if the following two statemetns are true:
@@ -438,7 +444,7 @@ class SelectFdr(BaseTransformer):
             transform_kwargs = dict(
                 session = dataset._session,
                 dependencies = self._deps,
-                pass_through_cols = self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type = expected_dtype,
             )
 
@@ -489,7 +495,7 @@ class SelectFdr(BaseTransformer):
             subproject=_SUBPROJECT,
         )
         output_result, fitted_estimator = model_trainer.train_fit_predict(
-            pass_through_columns=self._get_pass_through_columns(dataset),
+            drop_input_cols=self._drop_input_cols,
             expected_output_cols_list=self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix),
         )
         self._sklearn_object = fitted_estimator
@@ -507,44 +513,6 @@ class SelectFdr(BaseTransformer):
         assert self._sklearn_object is not None
         return self._sklearn_object.embedding_
 
-
-    def _get_output_column_names(self, output_cols_prefix: str, output_cols: Optional[List[str]] = None) -> List[str]:
-        """ Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
-        Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
-        """
-        output_cols_prefix = identifier.resolve_identifier(output_cols_prefix)
-        if output_cols:
-            output_cols = [
-                identifier.concat_names([output_cols_prefix, identifier.resolve_identifier(c)])
-                for c in output_cols
-            ]
-        elif getattr(self._sklearn_object, "classes_", None) is None:
-            output_cols = [output_cols_prefix]
-        elif self._sklearn_object is not None:
-            classes = self._sklearn_object.classes_
-            if isinstance(classes, numpy.ndarray):
-                output_cols = [f'{output_cols_prefix}{str(c)}' for c in classes.tolist()]
-            elif isinstance(classes, list) and len(classes) > 0 and isinstance(classes[0], numpy.ndarray):
-                # If the estimator is a multioutput estimator, classes_ will be a list of ndarrays.
-                output_cols = []
-                for i, cl in enumerate(classes):
-                    # For binary classification, there is only one output column for each class
-                    # ndarray as the two classes are complementary.
-                    if len(cl) == 2:
-                        output_cols.append(f'{output_cols_prefix}{i}_{cl[0]}')
-                    else:
-                        output_cols.extend([
-                            f'{output_cols_prefix}{i}_{c}' for c in cl.tolist()
-                        ])
-        else:
-            output_cols = []
-
-        # Make sure column names are valid snowflake identifiers.
-        assert output_cols is not None  # Make MyPy happy
-        rv = [identifier.rename_to_valid_snowflake_identifier(c) for c in output_cols]
-
-        return rv
-
     @available_if(original_estimator_has_callable("predict_proba"))  # type: ignore[misc]
     @telemetry.send_api_usage_telemetry(
         project=_PROJECT,
@@ -584,7 +552,7 @@ class SelectFdr(BaseTransformer):
             transform_kwargs = dict(
                 session=dataset._session,
                 dependencies=self._deps,
-                pass_through_cols=self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type="float",
             )
 
@@ -649,7 +617,7 @@ class SelectFdr(BaseTransformer):
             transform_kwargs = dict(
                 session=dataset._session,
                 dependencies=self._deps,
-                pass_through_cols=self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type="float",
             )
         elif isinstance(dataset, pd.DataFrame):
@@ -710,7 +678,7 @@ class SelectFdr(BaseTransformer):
             transform_kwargs = dict(
                 session=dataset._session,
                 dependencies=self._deps,
-                pass_through_cols=self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type="float",
             )
 
@@ -775,7 +743,7 @@ class SelectFdr(BaseTransformer):
             transform_kwargs = dict(
                 session=dataset._session,
                 dependencies=self._deps,
-                pass_through_cols=self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type="float",
             )
 
@@ -829,13 +797,17 @@ class SelectFdr(BaseTransformer):
         transform_kwargs: ScoreKwargsTypedDict = dict()
 
         if isinstance(dataset, DataFrame):
+            self._deps = self._batch_inference_validate_snowpark(
+                dataset=dataset,
+                inference_method="score",
+            )
             selected_cols = self._get_active_columns()
             if len(selected_cols) > 0:
                 dataset = dataset.select(selected_cols)
             assert isinstance(dataset._session, Session)  # keep mypy happy
             transform_kwargs = dict(
                 session=dataset._session,
-                dependencies=["snowflake-snowpark-python"] + self._get_dependencies(),
+                dependencies=["snowflake-snowpark-python"] + self._deps,
                 score_sproc_imports=['sklearn'],
             )
         elif isinstance(dataset, pd.DataFrame):
@@ -909,9 +881,9 @@ class SelectFdr(BaseTransformer):
             transform_kwargs = dict(
                 session = dataset._session,
                 dependencies = self._deps,
-                pass_through_cols = self._get_pass_through_columns(dataset),
-                expected_output_cols_type = "array",
-                n_neighbors = n_neighbors,
+                drop_input_cols = self._drop_input_cols,
+                expected_output_cols_type="array",
+                n_neighbors = n_neighbors,
                 return_distance = return_distance
             )
         elif isinstance(dataset, pd.DataFrame):
snowflake/ml/modeling/feature_selection/select_fpr.py
@@ -266,18 +266,24 @@ class SelectFpr(BaseTransformer):
         self._get_model_signatures(dataset)
         return self
 
-    def _get_pass_through_columns(self, dataset: DataFrame) -> List[str]:
-        if self._drop_input_cols:
-            return []
-        else:
-            return list(set(dataset.columns) - set(self.output_cols))
-
     def _batch_inference_validate_snowpark(
         self,
         dataset: DataFrame,
         inference_method: str,
     ) -> List[str]:
-        """Util method to run validate that batch inference can be run on a snowpark dataframe.
+        """Util method to run validate that batch inference can be run on a snowpark dataframe and
+        return the available package that exists in the snowflake anaconda channel
+
+        Args:
+            dataset: snowpark dataframe
+            inference_method: the inference method such as predict, score...
+
+        Raises:
+            SnowflakeMLException: If the estimator is not fitted, raise error
+            SnowflakeMLException: If the session is None, raise error
+
+        Returns:
+            A list of available package that exists in the snowflake anaconda channel
         """
         if not self._is_fitted:
             raise exceptions.SnowflakeMLException(
@@ -349,7 +355,7 @@ class SelectFpr(BaseTransformer):
             transform_kwargs = dict(
                 session = dataset._session,
                 dependencies = self._deps,
-                pass_through_cols = self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type = expected_type_inferred,
             )
 
@@ -411,16 +417,16 @@ class SelectFpr(BaseTransformer):
         # from (n_samples, n_estimators) to (n_samples, n_estimators * n_classes) (and everything in between)
         # based on init param values. We will convert that to pandas dataframe of shape (n_samples, 1) with
         # each row containing a list of values.
-        expected_dtype = "ARRAY"
+        expected_dtype = "array"
 
         # If we were unable to assign a type to this transform in the factory, infer the type here.
         if expected_dtype == "":
-            # If this is a clustering transformer, if the number of output columns does not equal the number of clusters the response will be an "ARRAY"
+            # If this is a clustering transformer, if the number of output columns does not equal the number of clusters the response will be an "array"
             if hasattr(self._sklearn_object, "n_clusters") and getattr(self._sklearn_object, "n_clusters") != len(self.output_cols):
-                expected_dtype = "ARRAY"
-            # If this is a decomposition transformer, if the number of output columns does not equal the number of components the response will be an "ARRAY"
+                expected_dtype = "array"
+            # If this is a decomposition transformer, if the number of output columns does not equal the number of components the response will be an "array"
             elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
-                expected_dtype = "ARRAY"
+                expected_dtype = "array"
             else:
                 output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
                 # We can only infer the output types from the input types if the following two statemetns are true:
@@ -438,7 +444,7 @@ class SelectFpr(BaseTransformer):
             transform_kwargs = dict(
                 session = dataset._session,
                 dependencies = self._deps,
-                pass_through_cols = self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type = expected_dtype,
             )
 
@@ -489,7 +495,7 @@ class SelectFpr(BaseTransformer):
             subproject=_SUBPROJECT,
         )
         output_result, fitted_estimator = model_trainer.train_fit_predict(
-            pass_through_columns=self._get_pass_through_columns(dataset),
+            drop_input_cols=self._drop_input_cols,
             expected_output_cols_list=self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix),
         )
         self._sklearn_object = fitted_estimator
@@ -507,44 +513,6 @@ class SelectFpr(BaseTransformer):
         assert self._sklearn_object is not None
         return self._sklearn_object.embedding_
 
-
-    def _get_output_column_names(self, output_cols_prefix: str, output_cols: Optional[List[str]] = None) -> List[str]:
-        """ Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
-        Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
-        """
-        output_cols_prefix = identifier.resolve_identifier(output_cols_prefix)
-        if output_cols:
-            output_cols = [
-                identifier.concat_names([output_cols_prefix, identifier.resolve_identifier(c)])
-                for c in output_cols
-            ]
-        elif getattr(self._sklearn_object, "classes_", None) is None:
-            output_cols = [output_cols_prefix]
-        elif self._sklearn_object is not None:
-            classes = self._sklearn_object.classes_
-            if isinstance(classes, numpy.ndarray):
-                output_cols = [f'{output_cols_prefix}{str(c)}' for c in classes.tolist()]
-            elif isinstance(classes, list) and len(classes) > 0 and isinstance(classes[0], numpy.ndarray):
-                # If the estimator is a multioutput estimator, classes_ will be a list of ndarrays.
-                output_cols = []
-                for i, cl in enumerate(classes):
-                    # For binary classification, there is only one output column for each class
-                    # ndarray as the two classes are complementary.
-                    if len(cl) == 2:
-                        output_cols.append(f'{output_cols_prefix}{i}_{cl[0]}')
-                    else:
-                        output_cols.extend([
-                            f'{output_cols_prefix}{i}_{c}' for c in cl.tolist()
-                        ])
-        else:
-            output_cols = []
-
-        # Make sure column names are valid snowflake identifiers.
-        assert output_cols is not None  # Make MyPy happy
-        rv = [identifier.rename_to_valid_snowflake_identifier(c) for c in output_cols]
-
-        return rv
-
     @available_if(original_estimator_has_callable("predict_proba"))  # type: ignore[misc]
     @telemetry.send_api_usage_telemetry(
         project=_PROJECT,
@@ -584,7 +552,7 @@ class SelectFpr(BaseTransformer):
             transform_kwargs = dict(
                 session=dataset._session,
                 dependencies=self._deps,
-                pass_through_cols=self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type="float",
             )
 
@@ -649,7 +617,7 @@ class SelectFpr(BaseTransformer):
             transform_kwargs = dict(
                 session=dataset._session,
                 dependencies=self._deps,
-                pass_through_cols=self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type="float",
             )
         elif isinstance(dataset, pd.DataFrame):
@@ -710,7 +678,7 @@ class SelectFpr(BaseTransformer):
             transform_kwargs = dict(
                 session=dataset._session,
                 dependencies=self._deps,
-                pass_through_cols=self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type="float",
             )
 
@@ -775,7 +743,7 @@ class SelectFpr(BaseTransformer):
             transform_kwargs = dict(
                 session=dataset._session,
                 dependencies=self._deps,
-                pass_through_cols=self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type="float",
             )
 
@@ -829,13 +797,17 @@ class SelectFpr(BaseTransformer):
         transform_kwargs: ScoreKwargsTypedDict = dict()
 
         if isinstance(dataset, DataFrame):
+            self._deps = self._batch_inference_validate_snowpark(
+                dataset=dataset,
+                inference_method="score",
+            )
             selected_cols = self._get_active_columns()
             if len(selected_cols) > 0:
                 dataset = dataset.select(selected_cols)
             assert isinstance(dataset._session, Session)  # keep mypy happy
             transform_kwargs = dict(
                 session=dataset._session,
-                dependencies=["snowflake-snowpark-python"] + self._get_dependencies(),
+                dependencies=["snowflake-snowpark-python"] + self._deps,
                 score_sproc_imports=['sklearn'],
             )
         elif isinstance(dataset, pd.DataFrame):
@@ -909,9 +881,9 @@ class SelectFpr(BaseTransformer):
             transform_kwargs = dict(
                 session = dataset._session,
                 dependencies = self._deps,
-                pass_through_cols = self._get_pass_through_columns(dataset),
-                expected_output_cols_type = "array",
-                n_neighbors = n_neighbors,
+                drop_input_cols = self._drop_input_cols,
+                expected_output_cols_type="array",
+                n_neighbors = n_neighbors,
                 return_distance = return_distance
             )
         elif isinstance(dataset, pd.DataFrame):