snowflake-ml-python 1.3.0__py3-none-any.whl → 1.4.0__py3-none-any.whl

This diff compares publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
Files changed (211)
  1. snowflake/ml/_internal/file_utils.py +3 -3
  2. snowflake/ml/_internal/human_readable_id/adjectives.txt +128 -0
  3. snowflake/ml/_internal/human_readable_id/animals.txt +128 -0
  4. snowflake/ml/_internal/human_readable_id/hrid_generator.py +40 -0
  5. snowflake/ml/_internal/human_readable_id/hrid_generator_base.py +135 -0
  6. snowflake/ml/_internal/telemetry.py +11 -2
  7. snowflake/ml/_internal/utils/formatting.py +1 -1
  8. snowflake/ml/feature_store/feature_store.py +15 -106
  9. snowflake/ml/fileset/sfcfs.py +4 -3
  10. snowflake/ml/fileset/stage_fs.py +18 -0
  11. snowflake/ml/model/_api.py +9 -9
  12. snowflake/ml/model/_client/model/model_version_impl.py +20 -15
  13. snowflake/ml/model/_deploy_client/image_builds/docker_context.py +3 -9
  14. snowflake/ml/model/_deploy_client/image_builds/server_image_builder.py +3 -5
  15. snowflake/ml/model/_deploy_client/snowservice/deploy.py +7 -6
  16. snowflake/ml/model/_model_composer/model_composer.py +10 -8
  17. snowflake/ml/model/_model_composer/model_method/function_generator.py +1 -1
  18. snowflake/ml/model/_model_composer/model_method/infer_table_function.py_template +2 -1
  19. snowflake/ml/model/_model_composer/model_method/model_method.py +2 -2
  20. snowflake/ml/model/_model_composer/model_runtime/_runtime_requirements.py +1 -1
  21. snowflake/ml/model/_packager/model_handlers/_base.py +2 -2
  22. snowflake/ml/model/_packager/model_handlers/_utils.py +5 -5
  23. snowflake/ml/model/_packager/model_handlers/custom.py +7 -7
  24. snowflake/ml/model/_packager/model_handlers/huggingface_pipeline.py +2 -2
  25. snowflake/ml/model/_packager/model_handlers/llm.py +1 -1
  26. snowflake/ml/model/_packager/model_handlers/mlflow.py +1 -1
  27. snowflake/ml/model/_packager/model_handlers/pytorch.py +13 -10
  28. snowflake/ml/model/_packager/model_handlers/sentence_transformers.py +214 -0
  29. snowflake/ml/model/_packager/model_handlers/sklearn.py +6 -6
  30. snowflake/ml/model/_packager/model_handlers/snowmlmodel.py +15 -3
  31. snowflake/ml/model/_packager/model_handlers/tensorflow.py +8 -8
  32. snowflake/ml/model/_packager/model_handlers/torchscript.py +7 -7
  33. snowflake/ml/model/_packager/model_handlers/xgboost.py +8 -8
  34. snowflake/ml/model/_packager/model_meta/_core_requirements.py +1 -1
  35. snowflake/ml/model/_packager/model_packager.py +8 -6
  36. snowflake/ml/model/custom_model.py +3 -1
  37. snowflake/ml/model/type_hints.py +13 -0
  38. snowflake/ml/modeling/_internal/estimator_utils.py +61 -1
  39. snowflake/ml/modeling/_internal/local_implementations/pandas_handlers.py +4 -43
  40. snowflake/ml/modeling/_internal/local_implementations/pandas_trainer.py +4 -4
  41. snowflake/ml/modeling/_internal/ml_runtime_implementations/ml_runtime_handlers.py +21 -17
  42. snowflake/ml/modeling/_internal/model_specifications.py +3 -1
  43. snowflake/ml/modeling/_internal/model_trainer.py +2 -2
  44. snowflake/ml/modeling/_internal/snowpark_implementations/distributed_hpo_trainer.py +547 -1
  45. snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_handlers.py +67 -114
  46. snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_trainer.py +9 -9
  47. snowflake/ml/modeling/_internal/transformer_protocols.py +2 -3
  48. snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +33 -61
  49. snowflake/ml/modeling/cluster/affinity_propagation.py +33 -61
  50. snowflake/ml/modeling/cluster/agglomerative_clustering.py +33 -61
  51. snowflake/ml/modeling/cluster/birch.py +33 -61
  52. snowflake/ml/modeling/cluster/bisecting_k_means.py +33 -61
  53. snowflake/ml/modeling/cluster/dbscan.py +33 -61
  54. snowflake/ml/modeling/cluster/feature_agglomeration.py +33 -61
  55. snowflake/ml/modeling/cluster/k_means.py +33 -61
  56. snowflake/ml/modeling/cluster/mean_shift.py +33 -61
  57. snowflake/ml/modeling/cluster/mini_batch_k_means.py +33 -61
  58. snowflake/ml/modeling/cluster/optics.py +33 -61
  59. snowflake/ml/modeling/cluster/spectral_biclustering.py +33 -61
  60. snowflake/ml/modeling/cluster/spectral_clustering.py +33 -61
  61. snowflake/ml/modeling/cluster/spectral_coclustering.py +33 -61
  62. snowflake/ml/modeling/compose/column_transformer.py +33 -61
  63. snowflake/ml/modeling/compose/transformed_target_regressor.py +33 -61
  64. snowflake/ml/modeling/covariance/elliptic_envelope.py +33 -61
  65. snowflake/ml/modeling/covariance/empirical_covariance.py +33 -61
  66. snowflake/ml/modeling/covariance/graphical_lasso.py +33 -61
  67. snowflake/ml/modeling/covariance/graphical_lasso_cv.py +33 -61
  68. snowflake/ml/modeling/covariance/ledoit_wolf.py +33 -61
  69. snowflake/ml/modeling/covariance/min_cov_det.py +33 -61
  70. snowflake/ml/modeling/covariance/oas.py +33 -61
  71. snowflake/ml/modeling/covariance/shrunk_covariance.py +33 -61
  72. snowflake/ml/modeling/decomposition/dictionary_learning.py +33 -61
  73. snowflake/ml/modeling/decomposition/factor_analysis.py +33 -61
  74. snowflake/ml/modeling/decomposition/fast_ica.py +33 -61
  75. snowflake/ml/modeling/decomposition/incremental_pca.py +33 -61
  76. snowflake/ml/modeling/decomposition/kernel_pca.py +33 -61
  77. snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +33 -61
  78. snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +33 -61
  79. snowflake/ml/modeling/decomposition/pca.py +33 -61
  80. snowflake/ml/modeling/decomposition/sparse_pca.py +33 -61
  81. snowflake/ml/modeling/decomposition/truncated_svd.py +33 -61
  82. snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +33 -61
  83. snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +33 -61
  84. snowflake/ml/modeling/ensemble/ada_boost_classifier.py +33 -61
  85. snowflake/ml/modeling/ensemble/ada_boost_regressor.py +33 -61
  86. snowflake/ml/modeling/ensemble/bagging_classifier.py +33 -61
  87. snowflake/ml/modeling/ensemble/bagging_regressor.py +33 -61
  88. snowflake/ml/modeling/ensemble/extra_trees_classifier.py +33 -61
  89. snowflake/ml/modeling/ensemble/extra_trees_regressor.py +33 -61
  90. snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +33 -61
  91. snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +33 -61
  92. snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +33 -61
  93. snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +33 -61
  94. snowflake/ml/modeling/ensemble/isolation_forest.py +33 -61
  95. snowflake/ml/modeling/ensemble/random_forest_classifier.py +33 -61
  96. snowflake/ml/modeling/ensemble/random_forest_regressor.py +33 -61
  97. snowflake/ml/modeling/ensemble/stacking_regressor.py +33 -61
  98. snowflake/ml/modeling/ensemble/voting_classifier.py +33 -61
  99. snowflake/ml/modeling/ensemble/voting_regressor.py +33 -61
  100. snowflake/ml/modeling/feature_selection/generic_univariate_select.py +33 -61
  101. snowflake/ml/modeling/feature_selection/select_fdr.py +33 -61
  102. snowflake/ml/modeling/feature_selection/select_fpr.py +33 -61
  103. snowflake/ml/modeling/feature_selection/select_fwe.py +33 -61
  104. snowflake/ml/modeling/feature_selection/select_k_best.py +33 -61
  105. snowflake/ml/modeling/feature_selection/select_percentile.py +33 -61
  106. snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +33 -61
  107. snowflake/ml/modeling/feature_selection/variance_threshold.py +33 -61
  108. snowflake/ml/modeling/framework/base.py +55 -5
  109. snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +33 -61
  110. snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +33 -61
  111. snowflake/ml/modeling/impute/iterative_imputer.py +33 -61
  112. snowflake/ml/modeling/impute/knn_imputer.py +33 -61
  113. snowflake/ml/modeling/impute/missing_indicator.py +33 -61
  114. snowflake/ml/modeling/impute/simple_imputer.py +4 -15
  115. snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +33 -61
  116. snowflake/ml/modeling/kernel_approximation/nystroem.py +33 -61
  117. snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +33 -61
  118. snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +33 -61
  119. snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +33 -61
  120. snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +33 -61
  121. snowflake/ml/modeling/lightgbm/lgbm_classifier.py +36 -63
  122. snowflake/ml/modeling/lightgbm/lgbm_regressor.py +36 -63
  123. snowflake/ml/modeling/linear_model/ard_regression.py +33 -61
  124. snowflake/ml/modeling/linear_model/bayesian_ridge.py +33 -61
  125. snowflake/ml/modeling/linear_model/elastic_net.py +33 -61
  126. snowflake/ml/modeling/linear_model/elastic_net_cv.py +33 -61
  127. snowflake/ml/modeling/linear_model/gamma_regressor.py +33 -61
  128. snowflake/ml/modeling/linear_model/huber_regressor.py +33 -61
  129. snowflake/ml/modeling/linear_model/lars.py +33 -61
  130. snowflake/ml/modeling/linear_model/lars_cv.py +33 -61
  131. snowflake/ml/modeling/linear_model/lasso.py +33 -61
  132. snowflake/ml/modeling/linear_model/lasso_cv.py +33 -61
  133. snowflake/ml/modeling/linear_model/lasso_lars.py +33 -61
  134. snowflake/ml/modeling/linear_model/lasso_lars_cv.py +33 -61
  135. snowflake/ml/modeling/linear_model/lasso_lars_ic.py +33 -61
  136. snowflake/ml/modeling/linear_model/linear_regression.py +33 -61
  137. snowflake/ml/modeling/linear_model/logistic_regression.py +33 -61
  138. snowflake/ml/modeling/linear_model/logistic_regression_cv.py +33 -61
  139. snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +33 -61
  140. snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +33 -61
  141. snowflake/ml/modeling/linear_model/multi_task_lasso.py +33 -61
  142. snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +33 -61
  143. snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +33 -61
  144. snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +33 -61
  145. snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +33 -61
  146. snowflake/ml/modeling/linear_model/perceptron.py +33 -61
  147. snowflake/ml/modeling/linear_model/poisson_regressor.py +33 -61
  148. snowflake/ml/modeling/linear_model/ransac_regressor.py +33 -61
  149. snowflake/ml/modeling/linear_model/ridge.py +33 -61
  150. snowflake/ml/modeling/linear_model/ridge_classifier.py +33 -61
  151. snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +33 -61
  152. snowflake/ml/modeling/linear_model/ridge_cv.py +33 -61
  153. snowflake/ml/modeling/linear_model/sgd_classifier.py +33 -61
  154. snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +33 -61
  155. snowflake/ml/modeling/linear_model/sgd_regressor.py +33 -61
  156. snowflake/ml/modeling/linear_model/theil_sen_regressor.py +33 -61
  157. snowflake/ml/modeling/linear_model/tweedie_regressor.py +33 -61
  158. snowflake/ml/modeling/manifold/isomap.py +33 -61
  159. snowflake/ml/modeling/manifold/mds.py +33 -61
  160. snowflake/ml/modeling/manifold/spectral_embedding.py +33 -61
  161. snowflake/ml/modeling/manifold/tsne.py +33 -61
  162. snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +33 -61
  163. snowflake/ml/modeling/mixture/gaussian_mixture.py +33 -61
  164. snowflake/ml/modeling/model_selection/grid_search_cv.py +39 -57
  165. snowflake/ml/modeling/model_selection/randomized_search_cv.py +26 -57
  166. snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +33 -61
  167. snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +33 -61
  168. snowflake/ml/modeling/multiclass/output_code_classifier.py +33 -61
  169. snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +33 -61
  170. snowflake/ml/modeling/naive_bayes/categorical_nb.py +33 -61
  171. snowflake/ml/modeling/naive_bayes/complement_nb.py +33 -61
  172. snowflake/ml/modeling/naive_bayes/gaussian_nb.py +33 -61
  173. snowflake/ml/modeling/naive_bayes/multinomial_nb.py +33 -61
  174. snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +33 -61
  175. snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +33 -61
  176. snowflake/ml/modeling/neighbors/kernel_density.py +33 -61
  177. snowflake/ml/modeling/neighbors/local_outlier_factor.py +33 -61
  178. snowflake/ml/modeling/neighbors/nearest_centroid.py +33 -61
  179. snowflake/ml/modeling/neighbors/nearest_neighbors.py +33 -61
  180. snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +33 -61
  181. snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +33 -61
  182. snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +33 -61
  183. snowflake/ml/modeling/neural_network/bernoulli_rbm.py +33 -61
  184. snowflake/ml/modeling/neural_network/mlp_classifier.py +33 -61
  185. snowflake/ml/modeling/neural_network/mlp_regressor.py +33 -61
  186. snowflake/ml/modeling/preprocessing/polynomial_features.py +33 -61
  187. snowflake/ml/modeling/semi_supervised/label_propagation.py +33 -61
  188. snowflake/ml/modeling/semi_supervised/label_spreading.py +33 -61
  189. snowflake/ml/modeling/svm/linear_svc.py +33 -61
  190. snowflake/ml/modeling/svm/linear_svr.py +33 -61
  191. snowflake/ml/modeling/svm/nu_svc.py +33 -61
  192. snowflake/ml/modeling/svm/nu_svr.py +33 -61
  193. snowflake/ml/modeling/svm/svc.py +33 -61
  194. snowflake/ml/modeling/svm/svr.py +33 -61
  195. snowflake/ml/modeling/tree/decision_tree_classifier.py +33 -61
  196. snowflake/ml/modeling/tree/decision_tree_regressor.py +33 -61
  197. snowflake/ml/modeling/tree/extra_tree_classifier.py +33 -61
  198. snowflake/ml/modeling/tree/extra_tree_regressor.py +33 -61
  199. snowflake/ml/modeling/xgboost/xgb_classifier.py +33 -61
  200. snowflake/ml/modeling/xgboost/xgb_regressor.py +33 -61
  201. snowflake/ml/modeling/xgboost/xgbrf_classifier.py +33 -61
  202. snowflake/ml/modeling/xgboost/xgbrf_regressor.py +33 -61
  203. snowflake/ml/registry/_manager/model_manager.py +6 -2
  204. snowflake/ml/registry/model_registry.py +100 -27
  205. snowflake/ml/registry/registry.py +6 -2
  206. snowflake/ml/version.py +1 -1
  207. {snowflake_ml_python-1.3.0.dist-info → snowflake_ml_python-1.4.0.dist-info}/METADATA +43 -7
  208. {snowflake_ml_python-1.3.0.dist-info → snowflake_ml_python-1.4.0.dist-info}/RECORD +211 -206
  209. {snowflake_ml_python-1.3.0.dist-info → snowflake_ml_python-1.4.0.dist-info}/LICENSE.txt +0 -0
  210. {snowflake_ml_python-1.3.0.dist-info → snowflake_ml_python-1.4.0.dist-info}/WHEEL +0 -0
  211. {snowflake_ml_python-1.3.0.dist-info → snowflake_ml_python-1.4.0.dist-info}/top_level.txt +0 -0
snowflake/ml/modeling/impute/missing_indicator.py

@@ -285,18 +285,24 @@ class MissingIndicator(BaseTransformer):
         self._get_model_signatures(dataset)
         return self
 
-    def _get_pass_through_columns(self, dataset: DataFrame) -> List[str]:
-        if self._drop_input_cols:
-            return []
-        else:
-            return list(set(dataset.columns) - set(self.output_cols))
-
     def _batch_inference_validate_snowpark(
         self,
         dataset: DataFrame,
         inference_method: str,
     ) -> List[str]:
-        """Util method to run validate that batch inference can be run on a snowpark dataframe.
+        """Util method to run validate that batch inference can be run on a snowpark dataframe and
+        return the available package that exists in the snowflake anaconda channel
+
+        Args:
+            dataset: snowpark dataframe
+            inference_method: the inference method such as predict, score...
+
+        Raises:
+            SnowflakeMLException: If the estimator is not fitted, raise error
+            SnowflakeMLException: If the session is None, raise error
+
+        Returns:
+            A list of available package that exists in the snowflake anaconda channel
         """
         if not self._is_fitted:
             raise exceptions.SnowflakeMLException(
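Note on the hunk above: the per-estimator `_get_pass_through_columns` helper is removed, and the following hunks show every inference call site forwarding the raw `drop_input_cols` flag instead, leaving the handler layer to derive the pass-through columns. A minimal sketch of the before/after contract, with illustrative function names that are not part of the package's API:

    from typing import List

    def pass_through_cols_v130(columns: List[str], output_cols: List[str], drop_input_cols: bool) -> List[str]:
        # 1.3.0: each generated estimator precomputed the pass-through list itself.
        if drop_input_cols:
            return []
        return list(set(columns) - set(output_cols))

    def pass_through_cols_v140(columns: List[str], output_cols: List[str], drop_input_cols: bool) -> List[str]:
        # 1.4.0 (assumed handler-side equivalent): the estimator sends only the
        # drop_input_cols flag; the handler derives the same set of columns.
        return [] if drop_input_cols else [c for c in columns if c not in set(output_cols)]

    # Both contracts keep non-output columns unless drop_input_cols is set.
    cols = ["FEATURE_1", "FEATURE_2", "OUTPUT_1"]
    assert set(pass_through_cols_v130(cols, ["OUTPUT_1"], False)) == {"FEATURE_1", "FEATURE_2"}
    assert pass_through_cols_v140(cols, ["OUTPUT_1"], True) == []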
@@ -368,7 +374,7 @@ class MissingIndicator(BaseTransformer):
             transform_kwargs = dict(
                 session = dataset._session,
                 dependencies = self._deps,
-                pass_through_cols = self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type = expected_type_inferred,
             )
 
@@ -430,16 +436,16 @@ class MissingIndicator(BaseTransformer):
             # from (n_samples, n_estimators) to (n_samples, n_estimators * n_classes) (and everything in between)
             # based on init param values. We will convert that to pandas dataframe of shape (n_samples, 1) with
             # each row containing a list of values.
-            expected_dtype = "ARRAY"
+            expected_dtype = "array"
 
         # If we were unable to assign a type to this transform in the factory, infer the type here.
         if expected_dtype == "":
-            # If this is a clustering transformer, if the number of output columns does not equal the number of clusters the response will be an "ARRAY"
+            # If this is a clustering transformer, if the number of output columns does not equal the number of clusters the response will be an "array"
             if hasattr(self._sklearn_object, "n_clusters") and getattr(self._sklearn_object, "n_clusters") != len(self.output_cols):
-                expected_dtype = "ARRAY"
-            # If this is a decomposition transformer, if the number of output columns does not equal the number of components the response will be an "ARRAY"
+                expected_dtype = "array"
+            # If this is a decomposition transformer, if the number of output columns does not equal the number of components the response will be an "array"
             elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
-                expected_dtype = "ARRAY"
+                expected_dtype = "array"
             else:
                 output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
                 # We can only infer the output types from the input types if the following two statemetns are true:
@@ -457,7 +463,7 @@ class MissingIndicator(BaseTransformer):
             transform_kwargs = dict(
                 session = dataset._session,
                 dependencies = self._deps,
-                pass_through_cols = self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type = expected_dtype,
             )
 
@@ -508,7 +514,7 @@ class MissingIndicator(BaseTransformer):
                 subproject=_SUBPROJECT,
             )
             output_result, fitted_estimator = model_trainer.train_fit_predict(
-                pass_through_columns=self._get_pass_through_columns(dataset),
+                drop_input_cols=self._drop_input_cols,
                 expected_output_cols_list=self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix),
             )
             self._sklearn_object = fitted_estimator
@@ -526,44 +532,6 @@ class MissingIndicator(BaseTransformer):
         assert self._sklearn_object is not None
         return self._sklearn_object.embedding_
 
-
-    def _get_output_column_names(self, output_cols_prefix: str, output_cols: Optional[List[str]] = None) -> List[str]:
-        """ Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
-        Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
-        """
-        output_cols_prefix = identifier.resolve_identifier(output_cols_prefix)
-        if output_cols:
-            output_cols = [
-                identifier.concat_names([output_cols_prefix, identifier.resolve_identifier(c)])
-                for c in output_cols
-            ]
-        elif getattr(self._sklearn_object, "classes_", None) is None:
-            output_cols = [output_cols_prefix]
-        elif self._sklearn_object is not None:
-            classes = self._sklearn_object.classes_
-            if isinstance(classes, numpy.ndarray):
-                output_cols = [f'{output_cols_prefix}{str(c)}' for c in classes.tolist()]
-            elif isinstance(classes, list) and len(classes) > 0 and isinstance(classes[0], numpy.ndarray):
-                # If the estimator is a multioutput estimator, classes_ will be a list of ndarrays.
-                output_cols = []
-                for i, cl in enumerate(classes):
-                    # For binary classification, there is only one output column for each class
-                    # ndarray as the two classes are complementary.
-                    if len(cl) == 2:
-                        output_cols.append(f'{output_cols_prefix}{i}_{cl[0]}')
-                    else:
-                        output_cols.extend([
-                            f'{output_cols_prefix}{i}_{c}' for c in cl.tolist()
-                        ])
-        else:
-            output_cols = []
-
-        # Make sure column names are valid snowflake identifiers.
-        assert output_cols is not None # Make MyPy happy
-        rv = [identifier.rename_to_valid_snowflake_identifier(c) for c in output_cols]
-
-        return rv
-
     @available_if(original_estimator_has_callable("predict_proba")) # type: ignore[misc]
     @telemetry.send_api_usage_telemetry(
         project=_PROJECT,
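The `_get_output_column_names` helper deleted above documents how per-class output columns were named for predict_proba() and similar methods: the prefix plus the class label, with an index-prefixed variant for multioutput classifiers whose binary outputs collapse into one column. A condensed, self-contained sketch of that naming scheme, omitting the Snowflake identifier normalization:

    from typing import List, Optional, Union

    import numpy as np

    def output_column_names(prefix: str, classes: Optional[Union[np.ndarray, list]]) -> List[str]:
        # Condensed from the removed helper; identifier resolution is omitted.
        if classes is None:
            return [prefix]  # non-classifiers get a single prefixed column
        if isinstance(classes, np.ndarray):
            return [f"{prefix}{c}" for c in classes.tolist()]
        if isinstance(classes, list) and classes and isinstance(classes[0], np.ndarray):
            cols: List[str] = []
            for i, cl in enumerate(classes):  # multioutput: one ndarray per output
                if len(cl) == 2:
                    # binary output: the two class columns are complementary
                    cols.append(f"{prefix}{i}_{cl[0]}")
                else:
                    cols.extend(f"{prefix}{i}_{c}" for c in cl.tolist())
            return cols
        return []

    assert output_column_names("PREDICT_PROBA_", np.array([0, 1])) == ["PREDICT_PROBA_0", "PREDICT_PROBA_1"]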
@@ -603,7 +571,7 @@ class MissingIndicator(BaseTransformer):
             transform_kwargs = dict(
                 session=dataset._session,
                 dependencies=self._deps,
-                pass_through_cols=self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type="float",
             )
 
@@ -668,7 +636,7 @@ class MissingIndicator(BaseTransformer):
             transform_kwargs = dict(
                 session=dataset._session,
                 dependencies=self._deps,
-                pass_through_cols=self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type="float",
             )
         elif isinstance(dataset, pd.DataFrame):
@@ -729,7 +697,7 @@ class MissingIndicator(BaseTransformer):
             transform_kwargs = dict(
                 session=dataset._session,
                 dependencies=self._deps,
-                pass_through_cols=self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type="float",
             )
 
@@ -794,7 +762,7 @@ class MissingIndicator(BaseTransformer):
             transform_kwargs = dict(
                 session=dataset._session,
                 dependencies=self._deps,
-                pass_through_cols=self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type="float",
             )
 
@@ -848,13 +816,17 @@ class MissingIndicator(BaseTransformer):
         transform_kwargs: ScoreKwargsTypedDict = dict()
 
         if isinstance(dataset, DataFrame):
+            self._deps = self._batch_inference_validate_snowpark(
+                dataset=dataset,
+                inference_method="score",
+            )
             selected_cols = self._get_active_columns()
             if len(selected_cols) > 0:
                 dataset = dataset.select(selected_cols)
             assert isinstance(dataset._session, Session) # keep mypy happy
             transform_kwargs = dict(
                 session=dataset._session,
-                dependencies=["snowflake-snowpark-python"] + self._get_dependencies(),
+                dependencies=["snowflake-snowpark-python"] + self._deps,
                 score_sproc_imports=['sklearn'],
             )
         elif isinstance(dataset, pd.DataFrame):
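The score() hunk above gives Snowpark datasets the same validation step as the other batch-inference methods: `_batch_inference_validate_snowpark` resolves the dependency list against the Snowflake Anaconda channel and caches it on `self._deps`, which replaces the old `self._get_dependencies()` call when the score sproc is built. A rough runnable sketch of the flow, with stand-in names and an illustrative return value:

    from typing import Any, List

    class ScoringSketch:
        # Not the real class: a stand-in showing the new score() dependency flow.
        def __init__(self) -> None:
            self._is_fitted = True
            self._deps: List[str] = []

        def _batch_inference_validate_snowpark(self, dataset: Any, inference_method: str) -> List[str]:
            # Stand-in for the real check: verify fitted state/session, then
            # return the packages resolvable from the Snowflake Anaconda channel.
            if not self._is_fitted:
                raise RuntimeError("estimator is not fitted")
            return ["scikit-learn"]  # illustrative value

        def score(self, dataset: Any) -> List[str]:
            self._deps = self._batch_inference_validate_snowpark(dataset, "score")
            return ["snowflake-snowpark-python"] + self._deps

    print(ScoringSketch().score(dataset=None))  # ['snowflake-snowpark-python', 'scikit-learn']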
@@ -928,9 +900,9 @@ class MissingIndicator(BaseTransformer):
             transform_kwargs = dict(
                 session = dataset._session,
                 dependencies = self._deps,
-                pass_through_cols = self._get_pass_through_columns(dataset),
-                expected_output_cols_type = "array",
-                n_neighbors = n_neighbors,
+                drop_input_cols = self._drop_input_cols,
+                expected_output_cols_type="array",
+                n_neighbors = n_neighbors,
                 return_distance = return_distance
             )
         elif isinstance(dataset, pd.DataFrame):
snowflake/ml/modeling/impute/simple_imputer.py

@@ -74,8 +74,6 @@ _NUMERIC_TYPES = [
 ]
 
 
-# TODO(thoyt): Implement logic for `add_indicator` parameter and `indicator_` attribute.Requires
-# `snowflake.ml.impute.MissingIndicator` to be implemented.
 class SimpleImputer(base.BaseTransformer):
     """
     Univariate imputer for completing missing values with simple strategies.
@@ -96,7 +94,8 @@ class SimpleImputer(base.BaseTransformer):
         * If "most_frequent", replace missing using the most frequent value along each column.
           Can be used with strings or numeric data.
           If there is more than one such value, only the smallest is returned.
-        * If "constant", replace the missing values with `fill_value`. Can be used with strings or numeric data.
+        * If "constant", replace the missing values with `fill_value`, including columns that are entirely
+          null. Can be used with strings or numeric data.
 
     fill_value: Optional[str]
         When `strategy == "constant"`, `fill_value` is used to replace all occurrences of `missing_values`.
@@ -262,18 +261,8 @@ class SimpleImputer(base.BaseTransformer):
                     break
 
             for input_col in self.input_cols:
-                # Check whether input column is empty if necessary.
-                if (
-                    # TODO(hayu): [SNOW-752265] Support SimpleImputer keep_empty_features.
-                    # Add back when `keep_empty_features` is supported.
-                    # not self.keep_empty_features
-                    # and dataset.filter(F.col(input_col).is_not_null()).count(statement_params=statement_params) == 0
-                    dataset.filter(F.col(input_col).is_not_null()).count(statement_params=statement_params)
-                    == 0
-                ):
-                    self.statistics_[input_col] = np.nan
-                else:
-                    self.statistics_[input_col] = self.fill_value
+                self.statistics_[input_col] = self.fill_value
+
         else:
             state = STRATEGY_TO_STATE_DICT[self.strategy]
             assert state is not None
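Behavioral consequence of the fit change above: with `strategy == "constant"`, a column that is entirely null now gets `fill_value` as its statistic instead of `np.nan`, matching the updated docstring. A small pandas sketch of the difference (illustrative only; the real code path runs on Snowpark DataFrames):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"A": [1.0, np.nan, 3.0], "B": [np.nan, np.nan, np.nan]})
    fill_value = 0.0

    # 1.3.0: statistics_["B"] was np.nan because column B is entirely null,
    # so B stayed null after transform. 1.4.0: fill_value applies unconditionally.
    statistics = {col: fill_value for col in df.columns}
    print(df.fillna(statistics))
    #      A    B
    # 0  1.0  0.0
    # 1  0.0  0.0
    # 2  3.0  0.0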
snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py

@@ -260,18 +260,24 @@ class AdditiveChi2Sampler(BaseTransformer):
         self._get_model_signatures(dataset)
         return self
 
-    def _get_pass_through_columns(self, dataset: DataFrame) -> List[str]:
-        if self._drop_input_cols:
-            return []
-        else:
-            return list(set(dataset.columns) - set(self.output_cols))
-
     def _batch_inference_validate_snowpark(
         self,
         dataset: DataFrame,
         inference_method: str,
     ) -> List[str]:
-        """Util method to run validate that batch inference can be run on a snowpark dataframe.
+        """Util method to run validate that batch inference can be run on a snowpark dataframe and
+        return the available package that exists in the snowflake anaconda channel
+
+        Args:
+            dataset: snowpark dataframe
+            inference_method: the inference method such as predict, score...
+
+        Raises:
+            SnowflakeMLException: If the estimator is not fitted, raise error
+            SnowflakeMLException: If the session is None, raise error
+
+        Returns:
+            A list of available package that exists in the snowflake anaconda channel
         """
         if not self._is_fitted:
             raise exceptions.SnowflakeMLException(
@@ -343,7 +349,7 @@ class AdditiveChi2Sampler(BaseTransformer):
             transform_kwargs = dict(
                 session = dataset._session,
                 dependencies = self._deps,
-                pass_through_cols = self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type = expected_type_inferred,
             )
 
@@ -405,16 +411,16 @@ class AdditiveChi2Sampler(BaseTransformer):
             # from (n_samples, n_estimators) to (n_samples, n_estimators * n_classes) (and everything in between)
             # based on init param values. We will convert that to pandas dataframe of shape (n_samples, 1) with
             # each row containing a list of values.
-            expected_dtype = "ARRAY"
+            expected_dtype = "array"
 
         # If we were unable to assign a type to this transform in the factory, infer the type here.
         if expected_dtype == "":
-            # If this is a clustering transformer, if the number of output columns does not equal the number of clusters the response will be an "ARRAY"
+            # If this is a clustering transformer, if the number of output columns does not equal the number of clusters the response will be an "array"
             if hasattr(self._sklearn_object, "n_clusters") and getattr(self._sklearn_object, "n_clusters") != len(self.output_cols):
-                expected_dtype = "ARRAY"
-            # If this is a decomposition transformer, if the number of output columns does not equal the number of components the response will be an "ARRAY"
+                expected_dtype = "array"
+            # If this is a decomposition transformer, if the number of output columns does not equal the number of components the response will be an "array"
             elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
-                expected_dtype = "ARRAY"
+                expected_dtype = "array"
             else:
                 output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
                 # We can only infer the output types from the input types if the following two statemetns are true:
@@ -432,7 +438,7 @@ class AdditiveChi2Sampler(BaseTransformer):
             transform_kwargs = dict(
                 session = dataset._session,
                 dependencies = self._deps,
-                pass_through_cols = self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type = expected_dtype,
             )
 
@@ -483,7 +489,7 @@ class AdditiveChi2Sampler(BaseTransformer):
                 subproject=_SUBPROJECT,
             )
             output_result, fitted_estimator = model_trainer.train_fit_predict(
-                pass_through_columns=self._get_pass_through_columns(dataset),
+                drop_input_cols=self._drop_input_cols,
                 expected_output_cols_list=self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix),
             )
             self._sklearn_object = fitted_estimator
@@ -501,44 +507,6 @@ class AdditiveChi2Sampler(BaseTransformer):
         assert self._sklearn_object is not None
         return self._sklearn_object.embedding_
 
-
-    def _get_output_column_names(self, output_cols_prefix: str, output_cols: Optional[List[str]] = None) -> List[str]:
-        """ Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
-        Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
-        """
-        output_cols_prefix = identifier.resolve_identifier(output_cols_prefix)
-        if output_cols:
-            output_cols = [
-                identifier.concat_names([output_cols_prefix, identifier.resolve_identifier(c)])
-                for c in output_cols
-            ]
-        elif getattr(self._sklearn_object, "classes_", None) is None:
-            output_cols = [output_cols_prefix]
-        elif self._sklearn_object is not None:
-            classes = self._sklearn_object.classes_
-            if isinstance(classes, numpy.ndarray):
-                output_cols = [f'{output_cols_prefix}{str(c)}' for c in classes.tolist()]
-            elif isinstance(classes, list) and len(classes) > 0 and isinstance(classes[0], numpy.ndarray):
-                # If the estimator is a multioutput estimator, classes_ will be a list of ndarrays.
-                output_cols = []
-                for i, cl in enumerate(classes):
-                    # For binary classification, there is only one output column for each class
-                    # ndarray as the two classes are complementary.
-                    if len(cl) == 2:
-                        output_cols.append(f'{output_cols_prefix}{i}_{cl[0]}')
-                    else:
-                        output_cols.extend([
-                            f'{output_cols_prefix}{i}_{c}' for c in cl.tolist()
-                        ])
-        else:
-            output_cols = []
-
-        # Make sure column names are valid snowflake identifiers.
-        assert output_cols is not None # Make MyPy happy
-        rv = [identifier.rename_to_valid_snowflake_identifier(c) for c in output_cols]
-
-        return rv
-
     @available_if(original_estimator_has_callable("predict_proba")) # type: ignore[misc]
     @telemetry.send_api_usage_telemetry(
         project=_PROJECT,
@@ -578,7 +546,7 @@ class AdditiveChi2Sampler(BaseTransformer):
             transform_kwargs = dict(
                 session=dataset._session,
                 dependencies=self._deps,
-                pass_through_cols=self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type="float",
             )
 
@@ -643,7 +611,7 @@ class AdditiveChi2Sampler(BaseTransformer):
             transform_kwargs = dict(
                 session=dataset._session,
                 dependencies=self._deps,
-                pass_through_cols=self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type="float",
             )
         elif isinstance(dataset, pd.DataFrame):
@@ -704,7 +672,7 @@ class AdditiveChi2Sampler(BaseTransformer):
             transform_kwargs = dict(
                 session=dataset._session,
                 dependencies=self._deps,
-                pass_through_cols=self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type="float",
             )
 
@@ -769,7 +737,7 @@ class AdditiveChi2Sampler(BaseTransformer):
             transform_kwargs = dict(
                 session=dataset._session,
                 dependencies=self._deps,
-                pass_through_cols=self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type="float",
             )
 
@@ -823,13 +791,17 @@ class AdditiveChi2Sampler(BaseTransformer):
         transform_kwargs: ScoreKwargsTypedDict = dict()
 
         if isinstance(dataset, DataFrame):
+            self._deps = self._batch_inference_validate_snowpark(
+                dataset=dataset,
+                inference_method="score",
+            )
             selected_cols = self._get_active_columns()
             if len(selected_cols) > 0:
                 dataset = dataset.select(selected_cols)
             assert isinstance(dataset._session, Session) # keep mypy happy
             transform_kwargs = dict(
                 session=dataset._session,
-                dependencies=["snowflake-snowpark-python"] + self._get_dependencies(),
+                dependencies=["snowflake-snowpark-python"] + self._deps,
                 score_sproc_imports=['sklearn'],
             )
         elif isinstance(dataset, pd.DataFrame):
@@ -903,9 +875,9 @@ class AdditiveChi2Sampler(BaseTransformer):
             transform_kwargs = dict(
                 session = dataset._session,
                 dependencies = self._deps,
-                pass_through_cols = self._get_pass_through_columns(dataset),
-                expected_output_cols_type = "array",
-                n_neighbors = n_neighbors,
+                drop_input_cols = self._drop_input_cols,
+                expected_output_cols_type="array",
+                n_neighbors = n_neighbors,
                 return_distance = return_distance
             )
         elif isinstance(dataset, pd.DataFrame):
snowflake/ml/modeling/kernel_approximation/nystroem.py

@@ -308,18 +308,24 @@ class Nystroem(BaseTransformer):
         self._get_model_signatures(dataset)
         return self
 
-    def _get_pass_through_columns(self, dataset: DataFrame) -> List[str]:
-        if self._drop_input_cols:
-            return []
-        else:
-            return list(set(dataset.columns) - set(self.output_cols))
-
     def _batch_inference_validate_snowpark(
         self,
         dataset: DataFrame,
         inference_method: str,
     ) -> List[str]:
-        """Util method to run validate that batch inference can be run on a snowpark dataframe.
+        """Util method to run validate that batch inference can be run on a snowpark dataframe and
+        return the available package that exists in the snowflake anaconda channel
+
+        Args:
+            dataset: snowpark dataframe
+            inference_method: the inference method such as predict, score...
+
+        Raises:
+            SnowflakeMLException: If the estimator is not fitted, raise error
+            SnowflakeMLException: If the session is None, raise error
+
+        Returns:
+            A list of available package that exists in the snowflake anaconda channel
         """
         if not self._is_fitted:
             raise exceptions.SnowflakeMLException(
@@ -391,7 +397,7 @@ class Nystroem(BaseTransformer):
             transform_kwargs = dict(
                 session = dataset._session,
                 dependencies = self._deps,
-                pass_through_cols = self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type = expected_type_inferred,
             )
 
@@ -453,16 +459,16 @@ class Nystroem(BaseTransformer):
             # from (n_samples, n_estimators) to (n_samples, n_estimators * n_classes) (and everything in between)
             # based on init param values. We will convert that to pandas dataframe of shape (n_samples, 1) with
             # each row containing a list of values.
-            expected_dtype = "ARRAY"
+            expected_dtype = "array"
 
         # If we were unable to assign a type to this transform in the factory, infer the type here.
         if expected_dtype == "":
-            # If this is a clustering transformer, if the number of output columns does not equal the number of clusters the response will be an "ARRAY"
+            # If this is a clustering transformer, if the number of output columns does not equal the number of clusters the response will be an "array"
             if hasattr(self._sklearn_object, "n_clusters") and getattr(self._sklearn_object, "n_clusters") != len(self.output_cols):
-                expected_dtype = "ARRAY"
-            # If this is a decomposition transformer, if the number of output columns does not equal the number of components the response will be an "ARRAY"
+                expected_dtype = "array"
+            # If this is a decomposition transformer, if the number of output columns does not equal the number of components the response will be an "array"
             elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
-                expected_dtype = "ARRAY"
+                expected_dtype = "array"
             else:
                 output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
                 # We can only infer the output types from the input types if the following two statemetns are true:
@@ -480,7 +486,7 @@ class Nystroem(BaseTransformer):
             transform_kwargs = dict(
                 session = dataset._session,
                 dependencies = self._deps,
-                pass_through_cols = self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type = expected_dtype,
             )
 
@@ -531,7 +537,7 @@ class Nystroem(BaseTransformer):
                 subproject=_SUBPROJECT,
             )
             output_result, fitted_estimator = model_trainer.train_fit_predict(
-                pass_through_columns=self._get_pass_through_columns(dataset),
+                drop_input_cols=self._drop_input_cols,
                 expected_output_cols_list=self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix),
             )
             self._sklearn_object = fitted_estimator
@@ -549,44 +555,6 @@ class Nystroem(BaseTransformer):
         assert self._sklearn_object is not None
         return self._sklearn_object.embedding_
 
-
-    def _get_output_column_names(self, output_cols_prefix: str, output_cols: Optional[List[str]] = None) -> List[str]:
-        """ Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
-        Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
-        """
-        output_cols_prefix = identifier.resolve_identifier(output_cols_prefix)
-        if output_cols:
-            output_cols = [
-                identifier.concat_names([output_cols_prefix, identifier.resolve_identifier(c)])
-                for c in output_cols
-            ]
-        elif getattr(self._sklearn_object, "classes_", None) is None:
-            output_cols = [output_cols_prefix]
-        elif self._sklearn_object is not None:
-            classes = self._sklearn_object.classes_
-            if isinstance(classes, numpy.ndarray):
-                output_cols = [f'{output_cols_prefix}{str(c)}' for c in classes.tolist()]
-            elif isinstance(classes, list) and len(classes) > 0 and isinstance(classes[0], numpy.ndarray):
-                # If the estimator is a multioutput estimator, classes_ will be a list of ndarrays.
-                output_cols = []
-                for i, cl in enumerate(classes):
-                    # For binary classification, there is only one output column for each class
-                    # ndarray as the two classes are complementary.
-                    if len(cl) == 2:
-                        output_cols.append(f'{output_cols_prefix}{i}_{cl[0]}')
-                    else:
-                        output_cols.extend([
-                            f'{output_cols_prefix}{i}_{c}' for c in cl.tolist()
-                        ])
-        else:
-            output_cols = []
-
-        # Make sure column names are valid snowflake identifiers.
-        assert output_cols is not None # Make MyPy happy
-        rv = [identifier.rename_to_valid_snowflake_identifier(c) for c in output_cols]
-
-        return rv
-
     @available_if(original_estimator_has_callable("predict_proba")) # type: ignore[misc]
     @telemetry.send_api_usage_telemetry(
         project=_PROJECT,
@@ -626,7 +594,7 @@ class Nystroem(BaseTransformer):
             transform_kwargs = dict(
                 session=dataset._session,
                 dependencies=self._deps,
-                pass_through_cols=self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type="float",
             )
 
@@ -691,7 +659,7 @@ class Nystroem(BaseTransformer):
             transform_kwargs = dict(
                 session=dataset._session,
                 dependencies=self._deps,
-                pass_through_cols=self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type="float",
             )
         elif isinstance(dataset, pd.DataFrame):
@@ -752,7 +720,7 @@ class Nystroem(BaseTransformer):
             transform_kwargs = dict(
                 session=dataset._session,
                 dependencies=self._deps,
-                pass_through_cols=self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type="float",
             )
 
@@ -817,7 +785,7 @@ class Nystroem(BaseTransformer):
             transform_kwargs = dict(
                 session=dataset._session,
                 dependencies=self._deps,
-                pass_through_cols=self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type="float",
             )
 
@@ -871,13 +839,17 @@ class Nystroem(BaseTransformer):
         transform_kwargs: ScoreKwargsTypedDict = dict()
 
        if isinstance(dataset, DataFrame):
+            self._deps = self._batch_inference_validate_snowpark(
+                dataset=dataset,
+                inference_method="score",
+            )
             selected_cols = self._get_active_columns()
             if len(selected_cols) > 0:
                 dataset = dataset.select(selected_cols)
             assert isinstance(dataset._session, Session) # keep mypy happy
             transform_kwargs = dict(
                 session=dataset._session,
-                dependencies=["snowflake-snowpark-python"] + self._get_dependencies(),
+                dependencies=["snowflake-snowpark-python"] + self._deps,
                 score_sproc_imports=['sklearn'],
             )
         elif isinstance(dataset, pd.DataFrame):
@@ -951,9 +923,9 @@ class Nystroem(BaseTransformer):
             transform_kwargs = dict(
                 session = dataset._session,
                 dependencies = self._deps,
-                pass_through_cols = self._get_pass_through_columns(dataset),
-                expected_output_cols_type = "array",
-                n_neighbors = n_neighbors,
+                drop_input_cols = self._drop_input_cols,
+                expected_output_cols_type="array",
+                n_neighbors = n_neighbors,
                 return_distance = return_distance
             )
         elif isinstance(dataset, pd.DataFrame):