snowflake-ml-python 1.3.0__py3-none-any.whl → 1.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (211)
  1. snowflake/ml/_internal/file_utils.py +3 -3
  2. snowflake/ml/_internal/human_readable_id/adjectives.txt +128 -0
  3. snowflake/ml/_internal/human_readable_id/animals.txt +128 -0
  4. snowflake/ml/_internal/human_readable_id/hrid_generator.py +40 -0
  5. snowflake/ml/_internal/human_readable_id/hrid_generator_base.py +135 -0
  6. snowflake/ml/_internal/telemetry.py +11 -2
  7. snowflake/ml/_internal/utils/formatting.py +1 -1
  8. snowflake/ml/feature_store/feature_store.py +15 -106
  9. snowflake/ml/fileset/sfcfs.py +4 -3
  10. snowflake/ml/fileset/stage_fs.py +18 -0
  11. snowflake/ml/model/_api.py +9 -9
  12. snowflake/ml/model/_client/model/model_version_impl.py +20 -15
  13. snowflake/ml/model/_deploy_client/image_builds/docker_context.py +3 -9
  14. snowflake/ml/model/_deploy_client/image_builds/server_image_builder.py +3 -5
  15. snowflake/ml/model/_deploy_client/snowservice/deploy.py +7 -6
  16. snowflake/ml/model/_model_composer/model_composer.py +10 -8
  17. snowflake/ml/model/_model_composer/model_method/function_generator.py +1 -1
  18. snowflake/ml/model/_model_composer/model_method/infer_table_function.py_template +2 -1
  19. snowflake/ml/model/_model_composer/model_method/model_method.py +2 -2
  20. snowflake/ml/model/_model_composer/model_runtime/_runtime_requirements.py +1 -1
  21. snowflake/ml/model/_packager/model_handlers/_base.py +2 -2
  22. snowflake/ml/model/_packager/model_handlers/_utils.py +5 -5
  23. snowflake/ml/model/_packager/model_handlers/custom.py +7 -7
  24. snowflake/ml/model/_packager/model_handlers/huggingface_pipeline.py +2 -2
  25. snowflake/ml/model/_packager/model_handlers/llm.py +1 -1
  26. snowflake/ml/model/_packager/model_handlers/mlflow.py +1 -1
  27. snowflake/ml/model/_packager/model_handlers/pytorch.py +13 -10
  28. snowflake/ml/model/_packager/model_handlers/sentence_transformers.py +214 -0
  29. snowflake/ml/model/_packager/model_handlers/sklearn.py +6 -6
  30. snowflake/ml/model/_packager/model_handlers/snowmlmodel.py +15 -3
  31. snowflake/ml/model/_packager/model_handlers/tensorflow.py +8 -8
  32. snowflake/ml/model/_packager/model_handlers/torchscript.py +7 -7
  33. snowflake/ml/model/_packager/model_handlers/xgboost.py +8 -8
  34. snowflake/ml/model/_packager/model_meta/_core_requirements.py +1 -1
  35. snowflake/ml/model/_packager/model_packager.py +8 -6
  36. snowflake/ml/model/custom_model.py +3 -1
  37. snowflake/ml/model/type_hints.py +13 -0
  38. snowflake/ml/modeling/_internal/estimator_utils.py +61 -1
  39. snowflake/ml/modeling/_internal/local_implementations/pandas_handlers.py +4 -43
  40. snowflake/ml/modeling/_internal/local_implementations/pandas_trainer.py +4 -4
  41. snowflake/ml/modeling/_internal/ml_runtime_implementations/ml_runtime_handlers.py +21 -17
  42. snowflake/ml/modeling/_internal/model_specifications.py +3 -1
  43. snowflake/ml/modeling/_internal/model_trainer.py +2 -2
  44. snowflake/ml/modeling/_internal/snowpark_implementations/distributed_hpo_trainer.py +547 -1
  45. snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_handlers.py +67 -114
  46. snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_trainer.py +9 -9
  47. snowflake/ml/modeling/_internal/transformer_protocols.py +2 -3
  48. snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +33 -61
  49. snowflake/ml/modeling/cluster/affinity_propagation.py +33 -61
  50. snowflake/ml/modeling/cluster/agglomerative_clustering.py +33 -61
  51. snowflake/ml/modeling/cluster/birch.py +33 -61
  52. snowflake/ml/modeling/cluster/bisecting_k_means.py +33 -61
  53. snowflake/ml/modeling/cluster/dbscan.py +33 -61
  54. snowflake/ml/modeling/cluster/feature_agglomeration.py +33 -61
  55. snowflake/ml/modeling/cluster/k_means.py +33 -61
  56. snowflake/ml/modeling/cluster/mean_shift.py +33 -61
  57. snowflake/ml/modeling/cluster/mini_batch_k_means.py +33 -61
  58. snowflake/ml/modeling/cluster/optics.py +33 -61
  59. snowflake/ml/modeling/cluster/spectral_biclustering.py +33 -61
  60. snowflake/ml/modeling/cluster/spectral_clustering.py +33 -61
  61. snowflake/ml/modeling/cluster/spectral_coclustering.py +33 -61
  62. snowflake/ml/modeling/compose/column_transformer.py +33 -61
  63. snowflake/ml/modeling/compose/transformed_target_regressor.py +33 -61
  64. snowflake/ml/modeling/covariance/elliptic_envelope.py +33 -61
  65. snowflake/ml/modeling/covariance/empirical_covariance.py +33 -61
  66. snowflake/ml/modeling/covariance/graphical_lasso.py +33 -61
  67. snowflake/ml/modeling/covariance/graphical_lasso_cv.py +33 -61
  68. snowflake/ml/modeling/covariance/ledoit_wolf.py +33 -61
  69. snowflake/ml/modeling/covariance/min_cov_det.py +33 -61
  70. snowflake/ml/modeling/covariance/oas.py +33 -61
  71. snowflake/ml/modeling/covariance/shrunk_covariance.py +33 -61
  72. snowflake/ml/modeling/decomposition/dictionary_learning.py +33 -61
  73. snowflake/ml/modeling/decomposition/factor_analysis.py +33 -61
  74. snowflake/ml/modeling/decomposition/fast_ica.py +33 -61
  75. snowflake/ml/modeling/decomposition/incremental_pca.py +33 -61
  76. snowflake/ml/modeling/decomposition/kernel_pca.py +33 -61
  77. snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +33 -61
  78. snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +33 -61
  79. snowflake/ml/modeling/decomposition/pca.py +33 -61
  80. snowflake/ml/modeling/decomposition/sparse_pca.py +33 -61
  81. snowflake/ml/modeling/decomposition/truncated_svd.py +33 -61
  82. snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +33 -61
  83. snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +33 -61
  84. snowflake/ml/modeling/ensemble/ada_boost_classifier.py +33 -61
  85. snowflake/ml/modeling/ensemble/ada_boost_regressor.py +33 -61
  86. snowflake/ml/modeling/ensemble/bagging_classifier.py +33 -61
  87. snowflake/ml/modeling/ensemble/bagging_regressor.py +33 -61
  88. snowflake/ml/modeling/ensemble/extra_trees_classifier.py +33 -61
  89. snowflake/ml/modeling/ensemble/extra_trees_regressor.py +33 -61
  90. snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +33 -61
  91. snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +33 -61
  92. snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +33 -61
  93. snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +33 -61
  94. snowflake/ml/modeling/ensemble/isolation_forest.py +33 -61
  95. snowflake/ml/modeling/ensemble/random_forest_classifier.py +33 -61
  96. snowflake/ml/modeling/ensemble/random_forest_regressor.py +33 -61
  97. snowflake/ml/modeling/ensemble/stacking_regressor.py +33 -61
  98. snowflake/ml/modeling/ensemble/voting_classifier.py +33 -61
  99. snowflake/ml/modeling/ensemble/voting_regressor.py +33 -61
  100. snowflake/ml/modeling/feature_selection/generic_univariate_select.py +33 -61
  101. snowflake/ml/modeling/feature_selection/select_fdr.py +33 -61
  102. snowflake/ml/modeling/feature_selection/select_fpr.py +33 -61
  103. snowflake/ml/modeling/feature_selection/select_fwe.py +33 -61
  104. snowflake/ml/modeling/feature_selection/select_k_best.py +33 -61
  105. snowflake/ml/modeling/feature_selection/select_percentile.py +33 -61
  106. snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +33 -61
  107. snowflake/ml/modeling/feature_selection/variance_threshold.py +33 -61
  108. snowflake/ml/modeling/framework/base.py +55 -5
  109. snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +33 -61
  110. snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +33 -61
  111. snowflake/ml/modeling/impute/iterative_imputer.py +33 -61
  112. snowflake/ml/modeling/impute/knn_imputer.py +33 -61
  113. snowflake/ml/modeling/impute/missing_indicator.py +33 -61
  114. snowflake/ml/modeling/impute/simple_imputer.py +4 -15
  115. snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +33 -61
  116. snowflake/ml/modeling/kernel_approximation/nystroem.py +33 -61
  117. snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +33 -61
  118. snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +33 -61
  119. snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +33 -61
  120. snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +33 -61
  121. snowflake/ml/modeling/lightgbm/lgbm_classifier.py +36 -63
  122. snowflake/ml/modeling/lightgbm/lgbm_regressor.py +36 -63
  123. snowflake/ml/modeling/linear_model/ard_regression.py +33 -61
  124. snowflake/ml/modeling/linear_model/bayesian_ridge.py +33 -61
  125. snowflake/ml/modeling/linear_model/elastic_net.py +33 -61
  126. snowflake/ml/modeling/linear_model/elastic_net_cv.py +33 -61
  127. snowflake/ml/modeling/linear_model/gamma_regressor.py +33 -61
  128. snowflake/ml/modeling/linear_model/huber_regressor.py +33 -61
  129. snowflake/ml/modeling/linear_model/lars.py +33 -61
  130. snowflake/ml/modeling/linear_model/lars_cv.py +33 -61
  131. snowflake/ml/modeling/linear_model/lasso.py +33 -61
  132. snowflake/ml/modeling/linear_model/lasso_cv.py +33 -61
  133. snowflake/ml/modeling/linear_model/lasso_lars.py +33 -61
  134. snowflake/ml/modeling/linear_model/lasso_lars_cv.py +33 -61
  135. snowflake/ml/modeling/linear_model/lasso_lars_ic.py +33 -61
  136. snowflake/ml/modeling/linear_model/linear_regression.py +33 -61
  137. snowflake/ml/modeling/linear_model/logistic_regression.py +33 -61
  138. snowflake/ml/modeling/linear_model/logistic_regression_cv.py +33 -61
  139. snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +33 -61
  140. snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +33 -61
  141. snowflake/ml/modeling/linear_model/multi_task_lasso.py +33 -61
  142. snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +33 -61
  143. snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +33 -61
  144. snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +33 -61
  145. snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +33 -61
  146. snowflake/ml/modeling/linear_model/perceptron.py +33 -61
  147. snowflake/ml/modeling/linear_model/poisson_regressor.py +33 -61
  148. snowflake/ml/modeling/linear_model/ransac_regressor.py +33 -61
  149. snowflake/ml/modeling/linear_model/ridge.py +33 -61
  150. snowflake/ml/modeling/linear_model/ridge_classifier.py +33 -61
  151. snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +33 -61
  152. snowflake/ml/modeling/linear_model/ridge_cv.py +33 -61
  153. snowflake/ml/modeling/linear_model/sgd_classifier.py +33 -61
  154. snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +33 -61
  155. snowflake/ml/modeling/linear_model/sgd_regressor.py +33 -61
  156. snowflake/ml/modeling/linear_model/theil_sen_regressor.py +33 -61
  157. snowflake/ml/modeling/linear_model/tweedie_regressor.py +33 -61
  158. snowflake/ml/modeling/manifold/isomap.py +33 -61
  159. snowflake/ml/modeling/manifold/mds.py +33 -61
  160. snowflake/ml/modeling/manifold/spectral_embedding.py +33 -61
  161. snowflake/ml/modeling/manifold/tsne.py +33 -61
  162. snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +33 -61
  163. snowflake/ml/modeling/mixture/gaussian_mixture.py +33 -61
  164. snowflake/ml/modeling/model_selection/grid_search_cv.py +39 -57
  165. snowflake/ml/modeling/model_selection/randomized_search_cv.py +26 -57
  166. snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +33 -61
  167. snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +33 -61
  168. snowflake/ml/modeling/multiclass/output_code_classifier.py +33 -61
  169. snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +33 -61
  170. snowflake/ml/modeling/naive_bayes/categorical_nb.py +33 -61
  171. snowflake/ml/modeling/naive_bayes/complement_nb.py +33 -61
  172. snowflake/ml/modeling/naive_bayes/gaussian_nb.py +33 -61
  173. snowflake/ml/modeling/naive_bayes/multinomial_nb.py +33 -61
  174. snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +33 -61
  175. snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +33 -61
  176. snowflake/ml/modeling/neighbors/kernel_density.py +33 -61
  177. snowflake/ml/modeling/neighbors/local_outlier_factor.py +33 -61
  178. snowflake/ml/modeling/neighbors/nearest_centroid.py +33 -61
  179. snowflake/ml/modeling/neighbors/nearest_neighbors.py +33 -61
  180. snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +33 -61
  181. snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +33 -61
  182. snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +33 -61
  183. snowflake/ml/modeling/neural_network/bernoulli_rbm.py +33 -61
  184. snowflake/ml/modeling/neural_network/mlp_classifier.py +33 -61
  185. snowflake/ml/modeling/neural_network/mlp_regressor.py +33 -61
  186. snowflake/ml/modeling/preprocessing/polynomial_features.py +33 -61
  187. snowflake/ml/modeling/semi_supervised/label_propagation.py +33 -61
  188. snowflake/ml/modeling/semi_supervised/label_spreading.py +33 -61
  189. snowflake/ml/modeling/svm/linear_svc.py +33 -61
  190. snowflake/ml/modeling/svm/linear_svr.py +33 -61
  191. snowflake/ml/modeling/svm/nu_svc.py +33 -61
  192. snowflake/ml/modeling/svm/nu_svr.py +33 -61
  193. snowflake/ml/modeling/svm/svc.py +33 -61
  194. snowflake/ml/modeling/svm/svr.py +33 -61
  195. snowflake/ml/modeling/tree/decision_tree_classifier.py +33 -61
  196. snowflake/ml/modeling/tree/decision_tree_regressor.py +33 -61
  197. snowflake/ml/modeling/tree/extra_tree_classifier.py +33 -61
  198. snowflake/ml/modeling/tree/extra_tree_regressor.py +33 -61
  199. snowflake/ml/modeling/xgboost/xgb_classifier.py +33 -61
  200. snowflake/ml/modeling/xgboost/xgb_regressor.py +33 -61
  201. snowflake/ml/modeling/xgboost/xgbrf_classifier.py +33 -61
  202. snowflake/ml/modeling/xgboost/xgbrf_regressor.py +33 -61
  203. snowflake/ml/registry/_manager/model_manager.py +6 -2
  204. snowflake/ml/registry/model_registry.py +100 -27
  205. snowflake/ml/registry/registry.py +6 -2
  206. snowflake/ml/version.py +1 -1
  207. {snowflake_ml_python-1.3.0.dist-info → snowflake_ml_python-1.4.0.dist-info}/METADATA +43 -7
  208. {snowflake_ml_python-1.3.0.dist-info → snowflake_ml_python-1.4.0.dist-info}/RECORD +211 -206
  209. {snowflake_ml_python-1.3.0.dist-info → snowflake_ml_python-1.4.0.dist-info}/LICENSE.txt +0 -0
  210. {snowflake_ml_python-1.3.0.dist-info → snowflake_ml_python-1.4.0.dist-info}/WHEEL +0 -0
  211. {snowflake_ml_python-1.3.0.dist-info → snowflake_ml_python-1.4.0.dist-info}/top_level.txt +0 -0
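
Among the additions, the new snowflake/ml/_internal/human_readable_id package (two 128-entry word lists plus a generator) suggests that human-readable IDs are built by combining one word from each list. A toy sketch of the idea, not the library's implementation:

```python
import secrets

ADJECTIVES = ["brave", "calm", "eager"]  # stand-in for adjectives.txt (128 entries)
ANIMALS = ["lynx", "otter", "falcon"]    # stand-in for animals.txt (128 entries)

def generate_hrid() -> str:
    # Compose a human-readable ID from one word per list; two 128-entry
    # lists give 128 * 128 = 16,384 distinct combinations.
    return f"{secrets.choice(ADJECTIVES)}_{secrets.choice(ANIMALS)}"

print(generate_hrid())  # e.g. "calm_falcon"
```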
Representative diffs for three of the generated modeling classes follow; the same refactor produces the repeated +33 -61 deltas across the modeling files listed above.

snowflake/ml/modeling/cluster/bisecting_k_means.py

@@ -343,18 +343,24 @@ class BisectingKMeans(BaseTransformer):
         self._get_model_signatures(dataset)
         return self
 
-    def _get_pass_through_columns(self, dataset: DataFrame) -> List[str]:
-        if self._drop_input_cols:
-            return []
-        else:
-            return list(set(dataset.columns) - set(self.output_cols))
-
     def _batch_inference_validate_snowpark(
         self,
         dataset: DataFrame,
         inference_method: str,
     ) -> List[str]:
-        """Util method to run validate that batch inference can be run on a snowpark dataframe.
+        """Util method to run validate that batch inference can be run on a snowpark dataframe and
+        return the available package that exists in the snowflake anaconda channel
+
+        Args:
+            dataset: snowpark dataframe
+            inference_method: the inference method such as predict, score...
+
+        Raises:
+            SnowflakeMLException: If the estimator is not fitted, raise error
+            SnowflakeMLException: If the session is None, raise error
+
+        Returns:
+            A list of available package that exists in the snowflake anaconda channel
         """
         if not self._is_fitted:
             raise exceptions.SnowflakeMLException(
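
The expanded docstring records a real behavior change: the validator now also returns the estimator's dependencies as resolved against the Snowflake Anaconda channel, and callers cache that list in self._deps (see the score hunk below). A minimal sketch of the new contract, with stand-in names rather than the library's internals:

```python
from typing import Callable, List

class NotFittedError(RuntimeError):
    """Stand-in for the library's SnowflakeMLException (illustrative only)."""

def validate_batch_inference(
    is_fitted: bool,
    session: object,
    dependencies: List[str],
    available_in_channel: Callable[[str], bool],
) -> List[str]:
    # The two preconditions named in the docstring's Raises section.
    if not is_fitted:
        raise NotFittedError("estimator is not fitted")
    if session is None:
        raise NotFittedError("session must not be None")
    # New in 1.4.0: also return the subset of declared dependencies that the
    # Snowflake Anaconda channel can serve, for later use as self._deps.
    return [pkg for pkg in dependencies if available_in_channel(pkg)]

# The returned list feeds the dependencies= entries in transform_kwargs.
deps = validate_batch_inference(
    is_fitted=True,
    session=object(),  # placeholder for a snowpark Session
    dependencies=["scikit-learn==1.3.2", "xgboost==1.7.6"],
    available_in_channel=lambda pkg: True,  # pretend everything is available
)
print(deps)
```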
@@ -428,7 +434,7 @@ class BisectingKMeans(BaseTransformer):
             transform_kwargs = dict(
                 session = dataset._session,
                 dependencies = self._deps,
-                pass_through_cols = self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type = expected_type_inferred,
             )
 
@@ -490,16 +496,16 @@ class BisectingKMeans(BaseTransformer):
             # from (n_samples, n_estimators) to (n_samples, n_estimators * n_classes) (and everything in between)
             # based on init param values. We will convert that to pandas dataframe of shape (n_samples, 1) with
             # each row containing a list of values.
-            expected_dtype = "ARRAY"
+            expected_dtype = "array"
 
             # If we were unable to assign a type to this transform in the factory, infer the type here.
             if expected_dtype == "":
-                # If this is a clustering transformer, if the number of output columns does not equal the number of clusters the response will be an "ARRAY"
+                # If this is a clustering transformer, if the number of output columns does not equal the number of clusters the response will be an "array"
                 if hasattr(self._sklearn_object, "n_clusters") and getattr(self._sklearn_object, "n_clusters") != len(self.output_cols):
-                    expected_dtype = "ARRAY"
-                # If this is a decomposition transformer, if the number of output columns does not equal the number of components the response will be an "ARRAY"
+                    expected_dtype = "array"
+                # If this is a decomposition transformer, if the number of output columns does not equal the number of components the response will be an "array"
                 elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
-                    expected_dtype = "ARRAY"
+                    expected_dtype = "array"
                 else:
                     output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
                     # We can only infer the output types from the input types if the following two statemetns are true:
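
The "ARRAY" literal is normalized to lowercase "array", matching the casing used for the other expected_output_cols_type values ("float", "array") in these hunks. The fallback logic itself is unchanged; a condensed, self-contained sketch of what it decides:

```python
from typing import Optional

def expected_output_dtype(
    n_clusters: Optional[int], n_components: Optional[int], n_output_cols: int
) -> str:
    # Condensed sketch of the fallback chain above: when the output column
    # count cannot match the estimator's cluster/component count, each row
    # must hold a list of values, i.e. an "array" column.
    if n_clusters is not None and n_clusters != n_output_cols:
        return "array"
    if n_components is not None and n_components != n_output_cols:
        return "array"
    return ""  # left empty for input-type-based inference

print(expected_output_dtype(n_clusters=8, n_components=None, n_output_cols=1))
```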
@@ -517,7 +523,7 @@ class BisectingKMeans(BaseTransformer):
             transform_kwargs = dict(
                 session = dataset._session,
                 dependencies = self._deps,
-                pass_through_cols = self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type = expected_dtype,
             )
 
@@ -570,7 +576,7 @@ class BisectingKMeans(BaseTransformer):
             subproject=_SUBPROJECT,
         )
         output_result, fitted_estimator = model_trainer.train_fit_predict(
-            pass_through_columns=self._get_pass_through_columns(dataset),
+            drop_input_cols=self._drop_input_cols,
             expected_output_cols_list=self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix),
         )
         self._sklearn_object = fitted_estimator
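
Across all of the inference and train_fit_predict paths, the precomputed pass_through_cols list is replaced by the raw drop_input_cols flag, moving the column arithmetic out of each generated class and into the shared handlers. A sketch of the equivalence, assuming the handlers reproduce the deleted helper's logic:

```python
from typing import List

def pass_through_columns(
    dataset_columns: List[str], output_cols: List[str], drop_input_cols: bool
) -> List[str]:
    # Logic of the deleted _get_pass_through_columns helper: drop everything,
    # or pass through every column that is not an output column.
    if drop_input_cols:
        return []
    return list(set(dataset_columns) - set(output_cols))

# 1.3.0 computed this list in each generated class; 1.4.0 passes only the
# boolean and lets the snowpark/pandas handlers derive it when needed.
print(pass_through_columns(["ID", "AGE", "INCOME"], ["OUTPUT_CLUSTER"], False))
```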
@@ -588,44 +594,6 @@ class BisectingKMeans(BaseTransformer):
         assert self._sklearn_object is not None
         return self._sklearn_object.embedding_
 
-
-    def _get_output_column_names(self, output_cols_prefix: str, output_cols: Optional[List[str]] = None) -> List[str]:
-        """ Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
-        Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
-        """
-        output_cols_prefix = identifier.resolve_identifier(output_cols_prefix)
-        if output_cols:
-            output_cols = [
-                identifier.concat_names([output_cols_prefix, identifier.resolve_identifier(c)])
-                for c in output_cols
-            ]
-        elif getattr(self._sklearn_object, "classes_", None) is None:
-            output_cols = [output_cols_prefix]
-        elif self._sklearn_object is not None:
-            classes = self._sklearn_object.classes_
-            if isinstance(classes, numpy.ndarray):
-                output_cols = [f'{output_cols_prefix}{str(c)}' for c in classes.tolist()]
-            elif isinstance(classes, list) and len(classes) > 0 and isinstance(classes[0], numpy.ndarray):
-                # If the estimator is a multioutput estimator, classes_ will be a list of ndarrays.
-                output_cols = []
-                for i, cl in enumerate(classes):
-                    # For binary classification, there is only one output column for each class
-                    # ndarray as the two classes are complementary.
-                    if len(cl) == 2:
-                        output_cols.append(f'{output_cols_prefix}{i}_{cl[0]}')
-                    else:
-                        output_cols.extend([
-                            f'{output_cols_prefix}{i}_{c}' for c in cl.tolist()
-                        ])
-            else:
-                output_cols = []
-
-        # Make sure column names are valid snowflake identifiers.
-        assert output_cols is not None  # Make MyPy happy
-        rv = [identifier.rename_to_valid_snowflake_identifier(c) for c in output_cols]
-
-        return rv
-
     @available_if(original_estimator_has_callable("predict_proba"))  # type: ignore[misc]
     @telemetry.send_api_usage_telemetry(
         project=_PROJECT,
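
The per-class _get_output_column_names copy is deleted outright; given the +55 -5 delta on snowflake/ml/modeling/framework/base.py in the file list, it has presumably been consolidated into the shared base class. For reference, a condensed sketch of the naming rules the removed code implemented (identifier resolution omitted):

```python
from typing import List, Optional, Union
import numpy

def output_column_names(
    prefix: str, classes: Optional[Union[numpy.ndarray, list]]
) -> List[str]:
    if classes is None:
        return [prefix]  # not a classifier: the prefix is the only column
    if isinstance(classes, numpy.ndarray):
        return [f"{prefix}{c}" for c in classes.tolist()]
    if isinstance(classes, list) and classes and isinstance(classes[0], numpy.ndarray):
        cols: List[str] = []
        for i, cl in enumerate(classes):  # multioutput: one ndarray per output
            if len(cl) == 2:
                # Binary output: one column suffices, the two classes are complementary.
                cols.append(f"{prefix}{i}_{cl[0]}")
            else:
                cols.extend(f"{prefix}{i}_{c}" for c in cl.tolist())
        return cols
    return []

print(output_column_names("PREDICT_PROBA_", numpy.array([0, 1, 2])))
```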
@@ -665,7 +633,7 @@ class BisectingKMeans(BaseTransformer):
             transform_kwargs = dict(
                 session=dataset._session,
                 dependencies=self._deps,
-                pass_through_cols=self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type="float",
             )
 
@@ -730,7 +698,7 @@ class BisectingKMeans(BaseTransformer):
             transform_kwargs = dict(
                 session=dataset._session,
                 dependencies=self._deps,
-                pass_through_cols=self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type="float",
             )
         elif isinstance(dataset, pd.DataFrame):
@@ -791,7 +759,7 @@ class BisectingKMeans(BaseTransformer):
             transform_kwargs = dict(
                 session=dataset._session,
                 dependencies=self._deps,
-                pass_through_cols=self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type="float",
             )
 
@@ -856,7 +824,7 @@ class BisectingKMeans(BaseTransformer):
             transform_kwargs = dict(
                 session=dataset._session,
                 dependencies=self._deps,
-                pass_through_cols=self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type="float",
             )
 
@@ -912,13 +880,17 @@ class BisectingKMeans(BaseTransformer):
         transform_kwargs: ScoreKwargsTypedDict = dict()
 
         if isinstance(dataset, DataFrame):
+            self._deps = self._batch_inference_validate_snowpark(
+                dataset=dataset,
+                inference_method="score",
+            )
             selected_cols = self._get_active_columns()
             if len(selected_cols) > 0:
                 dataset = dataset.select(selected_cols)
             assert isinstance(dataset._session, Session)  # keep mypy happy
             transform_kwargs = dict(
                 session=dataset._session,
-                dependencies=["snowflake-snowpark-python"] + self._get_dependencies(),
+                dependencies=["snowflake-snowpark-python"] + self._deps,
                 score_sproc_imports=['sklearn'],
             )
         elif isinstance(dataset, pd.DataFrame):
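
score() previously assembled its stored-procedure dependency list directly from self._get_dependencies(); it now calls the validator first and reuses the channel-checked list it returns. A small stub showing the flow (the estimator class here is a stand-in, not the library's API):

```python
class _EstimatorStub:
    """Illustrative stand-in for a generated estimator."""

    def _batch_inference_validate_snowpark(self, dataset, inference_method):
        # Pretend these dependencies were validated against the channel.
        return ["scikit-learn==1.3.2"]

def score_transform_kwargs(estimator, session):
    # New in 1.4.0: validate first and cache the channel-resolved packages,
    # then build the sproc dependencies from the validated list.
    estimator._deps = estimator._batch_inference_validate_snowpark(
        dataset=None, inference_method="score"
    )
    return dict(
        session=session,
        dependencies=["snowflake-snowpark-python"] + estimator._deps,
        score_sproc_imports=["sklearn"],
    )

print(score_transform_kwargs(_EstimatorStub(), session=object()))
```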
@@ -992,9 +964,9 @@ class BisectingKMeans(BaseTransformer):
             transform_kwargs = dict(
                 session = dataset._session,
                 dependencies = self._deps,
-                pass_through_cols = self._get_pass_through_columns(dataset),
-                expected_output_cols_type = "array",
-                n_neighbors = n_neighbors,
+                drop_input_cols = self._drop_input_cols,
+                expected_output_cols_type="array",
+                n_neighbors = n_neighbors,
                 return_distance = return_distance
             )
         elif isinstance(dataset, pd.DataFrame):
snowflake/ml/modeling/cluster/dbscan.py

@@ -311,18 +311,24 @@ class DBSCAN(BaseTransformer):
         self._get_model_signatures(dataset)
         return self
 
-    def _get_pass_through_columns(self, dataset: DataFrame) -> List[str]:
-        if self._drop_input_cols:
-            return []
-        else:
-            return list(set(dataset.columns) - set(self.output_cols))
-
     def _batch_inference_validate_snowpark(
         self,
         dataset: DataFrame,
         inference_method: str,
     ) -> List[str]:
-        """Util method to run validate that batch inference can be run on a snowpark dataframe.
+        """Util method to run validate that batch inference can be run on a snowpark dataframe and
+        return the available package that exists in the snowflake anaconda channel
+
+        Args:
+            dataset: snowpark dataframe
+            inference_method: the inference method such as predict, score...
+
+        Raises:
+            SnowflakeMLException: If the estimator is not fitted, raise error
+            SnowflakeMLException: If the session is None, raise error
+
+        Returns:
+            A list of available package that exists in the snowflake anaconda channel
         """
         if not self._is_fitted:
             raise exceptions.SnowflakeMLException(
@@ -394,7 +400,7 @@ class DBSCAN(BaseTransformer):
             transform_kwargs = dict(
                 session = dataset._session,
                 dependencies = self._deps,
-                pass_through_cols = self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type = expected_type_inferred,
             )
 
@@ -454,16 +460,16 @@ class DBSCAN(BaseTransformer):
             # from (n_samples, n_estimators) to (n_samples, n_estimators * n_classes) (and everything in between)
             # based on init param values. We will convert that to pandas dataframe of shape (n_samples, 1) with
             # each row containing a list of values.
-            expected_dtype = "ARRAY"
+            expected_dtype = "array"
 
             # If we were unable to assign a type to this transform in the factory, infer the type here.
             if expected_dtype == "":
-                # If this is a clustering transformer, if the number of output columns does not equal the number of clusters the response will be an "ARRAY"
+                # If this is a clustering transformer, if the number of output columns does not equal the number of clusters the response will be an "array"
                 if hasattr(self._sklearn_object, "n_clusters") and getattr(self._sklearn_object, "n_clusters") != len(self.output_cols):
-                    expected_dtype = "ARRAY"
-                # If this is a decomposition transformer, if the number of output columns does not equal the number of components the response will be an "ARRAY"
+                    expected_dtype = "array"
+                # If this is a decomposition transformer, if the number of output columns does not equal the number of components the response will be an "array"
                 elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
-                    expected_dtype = "ARRAY"
+                    expected_dtype = "array"
                 else:
                     output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
                     # We can only infer the output types from the input types if the following two statemetns are true:
@@ -481,7 +487,7 @@ class DBSCAN(BaseTransformer):
             transform_kwargs = dict(
                 session = dataset._session,
                 dependencies = self._deps,
-                pass_through_cols = self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type = expected_dtype,
             )
 
@@ -534,7 +540,7 @@ class DBSCAN(BaseTransformer):
             subproject=_SUBPROJECT,
         )
         output_result, fitted_estimator = model_trainer.train_fit_predict(
-            pass_through_columns=self._get_pass_through_columns(dataset),
+            drop_input_cols=self._drop_input_cols,
             expected_output_cols_list=self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix),
         )
         self._sklearn_object = fitted_estimator
@@ -552,44 +558,6 @@ class DBSCAN(BaseTransformer):
         assert self._sklearn_object is not None
         return self._sklearn_object.embedding_
 
-
-    def _get_output_column_names(self, output_cols_prefix: str, output_cols: Optional[List[str]] = None) -> List[str]:
-        """ Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
-        Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
-        """
-        output_cols_prefix = identifier.resolve_identifier(output_cols_prefix)
-        if output_cols:
-            output_cols = [
-                identifier.concat_names([output_cols_prefix, identifier.resolve_identifier(c)])
-                for c in output_cols
-            ]
-        elif getattr(self._sklearn_object, "classes_", None) is None:
-            output_cols = [output_cols_prefix]
-        elif self._sklearn_object is not None:
-            classes = self._sklearn_object.classes_
-            if isinstance(classes, numpy.ndarray):
-                output_cols = [f'{output_cols_prefix}{str(c)}' for c in classes.tolist()]
-            elif isinstance(classes, list) and len(classes) > 0 and isinstance(classes[0], numpy.ndarray):
-                # If the estimator is a multioutput estimator, classes_ will be a list of ndarrays.
-                output_cols = []
-                for i, cl in enumerate(classes):
-                    # For binary classification, there is only one output column for each class
-                    # ndarray as the two classes are complementary.
-                    if len(cl) == 2:
-                        output_cols.append(f'{output_cols_prefix}{i}_{cl[0]}')
-                    else:
-                        output_cols.extend([
-                            f'{output_cols_prefix}{i}_{c}' for c in cl.tolist()
-                        ])
-            else:
-                output_cols = []
-
-        # Make sure column names are valid snowflake identifiers.
-        assert output_cols is not None  # Make MyPy happy
-        rv = [identifier.rename_to_valid_snowflake_identifier(c) for c in output_cols]
-
-        return rv
-
     @available_if(original_estimator_has_callable("predict_proba"))  # type: ignore[misc]
     @telemetry.send_api_usage_telemetry(
         project=_PROJECT,
@@ -629,7 +597,7 @@ class DBSCAN(BaseTransformer):
             transform_kwargs = dict(
                 session=dataset._session,
                 dependencies=self._deps,
-                pass_through_cols=self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type="float",
             )
 
@@ -694,7 +662,7 @@ class DBSCAN(BaseTransformer):
             transform_kwargs = dict(
                 session=dataset._session,
                 dependencies=self._deps,
-                pass_through_cols=self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type="float",
             )
         elif isinstance(dataset, pd.DataFrame):
@@ -755,7 +723,7 @@ class DBSCAN(BaseTransformer):
             transform_kwargs = dict(
                 session=dataset._session,
                 dependencies=self._deps,
-                pass_through_cols=self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type="float",
             )
 
@@ -820,7 +788,7 @@ class DBSCAN(BaseTransformer):
             transform_kwargs = dict(
                 session=dataset._session,
                 dependencies=self._deps,
-                pass_through_cols=self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type="float",
             )
 
@@ -874,13 +842,17 @@ class DBSCAN(BaseTransformer):
         transform_kwargs: ScoreKwargsTypedDict = dict()
 
         if isinstance(dataset, DataFrame):
+            self._deps = self._batch_inference_validate_snowpark(
+                dataset=dataset,
+                inference_method="score",
+            )
             selected_cols = self._get_active_columns()
             if len(selected_cols) > 0:
                 dataset = dataset.select(selected_cols)
             assert isinstance(dataset._session, Session)  # keep mypy happy
             transform_kwargs = dict(
                 session=dataset._session,
-                dependencies=["snowflake-snowpark-python"] + self._get_dependencies(),
+                dependencies=["snowflake-snowpark-python"] + self._deps,
                 score_sproc_imports=['sklearn'],
             )
         elif isinstance(dataset, pd.DataFrame):
@@ -954,9 +926,9 @@ class DBSCAN(BaseTransformer):
             transform_kwargs = dict(
                 session = dataset._session,
                 dependencies = self._deps,
-                pass_through_cols = self._get_pass_through_columns(dataset),
-                expected_output_cols_type = "array",
-                n_neighbors = n_neighbors,
+                drop_input_cols = self._drop_input_cols,
+                expected_output_cols_type="array",
+                n_neighbors = n_neighbors,
                 return_distance = return_distance
             )
         elif isinstance(dataset, pd.DataFrame):
snowflake/ml/modeling/cluster/feature_agglomeration.py

@@ -343,18 +343,24 @@ class FeatureAgglomeration(BaseTransformer):
         self._get_model_signatures(dataset)
         return self
 
-    def _get_pass_through_columns(self, dataset: DataFrame) -> List[str]:
-        if self._drop_input_cols:
-            return []
-        else:
-            return list(set(dataset.columns) - set(self.output_cols))
-
     def _batch_inference_validate_snowpark(
         self,
         dataset: DataFrame,
         inference_method: str,
     ) -> List[str]:
-        """Util method to run validate that batch inference can be run on a snowpark dataframe.
+        """Util method to run validate that batch inference can be run on a snowpark dataframe and
+        return the available package that exists in the snowflake anaconda channel
+
+        Args:
+            dataset: snowpark dataframe
+            inference_method: the inference method such as predict, score...
+
+        Raises:
+            SnowflakeMLException: If the estimator is not fitted, raise error
+            SnowflakeMLException: If the session is None, raise error
+
+        Returns:
+            A list of available package that exists in the snowflake anaconda channel
         """
         if not self._is_fitted:
             raise exceptions.SnowflakeMLException(
@@ -426,7 +432,7 @@ class FeatureAgglomeration(BaseTransformer):
             transform_kwargs = dict(
                 session = dataset._session,
                 dependencies = self._deps,
-                pass_through_cols = self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type = expected_type_inferred,
             )
 
@@ -488,16 +494,16 @@ class FeatureAgglomeration(BaseTransformer):
             # from (n_samples, n_estimators) to (n_samples, n_estimators * n_classes) (and everything in between)
             # based on init param values. We will convert that to pandas dataframe of shape (n_samples, 1) with
             # each row containing a list of values.
-            expected_dtype = "ARRAY"
+            expected_dtype = "array"
 
             # If we were unable to assign a type to this transform in the factory, infer the type here.
             if expected_dtype == "":
-                # If this is a clustering transformer, if the number of output columns does not equal the number of clusters the response will be an "ARRAY"
+                # If this is a clustering transformer, if the number of output columns does not equal the number of clusters the response will be an "array"
                 if hasattr(self._sklearn_object, "n_clusters") and getattr(self._sklearn_object, "n_clusters") != len(self.output_cols):
-                    expected_dtype = "ARRAY"
-                # If this is a decomposition transformer, if the number of output columns does not equal the number of components the response will be an "ARRAY"
+                    expected_dtype = "array"
+                # If this is a decomposition transformer, if the number of output columns does not equal the number of components the response will be an "array"
                 elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
-                    expected_dtype = "ARRAY"
+                    expected_dtype = "array"
                 else:
                     output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
                     # We can only infer the output types from the input types if the following two statemetns are true:
@@ -515,7 +521,7 @@ class FeatureAgglomeration(BaseTransformer):
             transform_kwargs = dict(
                 session = dataset._session,
                 dependencies = self._deps,
-                pass_through_cols = self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type = expected_dtype,
             )
 
@@ -568,7 +574,7 @@ class FeatureAgglomeration(BaseTransformer):
             subproject=_SUBPROJECT,
         )
         output_result, fitted_estimator = model_trainer.train_fit_predict(
-            pass_through_columns=self._get_pass_through_columns(dataset),
+            drop_input_cols=self._drop_input_cols,
             expected_output_cols_list=self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix),
         )
         self._sklearn_object = fitted_estimator
@@ -586,44 +592,6 @@ class FeatureAgglomeration(BaseTransformer):
         assert self._sklearn_object is not None
         return self._sklearn_object.embedding_
 
-
-    def _get_output_column_names(self, output_cols_prefix: str, output_cols: Optional[List[str]] = None) -> List[str]:
-        """ Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
-        Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
-        """
-        output_cols_prefix = identifier.resolve_identifier(output_cols_prefix)
-        if output_cols:
-            output_cols = [
-                identifier.concat_names([output_cols_prefix, identifier.resolve_identifier(c)])
-                for c in output_cols
-            ]
-        elif getattr(self._sklearn_object, "classes_", None) is None:
-            output_cols = [output_cols_prefix]
-        elif self._sklearn_object is not None:
-            classes = self._sklearn_object.classes_
-            if isinstance(classes, numpy.ndarray):
-                output_cols = [f'{output_cols_prefix}{str(c)}' for c in classes.tolist()]
-            elif isinstance(classes, list) and len(classes) > 0 and isinstance(classes[0], numpy.ndarray):
-                # If the estimator is a multioutput estimator, classes_ will be a list of ndarrays.
-                output_cols = []
-                for i, cl in enumerate(classes):
-                    # For binary classification, there is only one output column for each class
-                    # ndarray as the two classes are complementary.
-                    if len(cl) == 2:
-                        output_cols.append(f'{output_cols_prefix}{i}_{cl[0]}')
-                    else:
-                        output_cols.extend([
-                            f'{output_cols_prefix}{i}_{c}' for c in cl.tolist()
-                        ])
-            else:
-                output_cols = []
-
-        # Make sure column names are valid snowflake identifiers.
-        assert output_cols is not None  # Make MyPy happy
-        rv = [identifier.rename_to_valid_snowflake_identifier(c) for c in output_cols]
-
-        return rv
-
     @available_if(original_estimator_has_callable("predict_proba"))  # type: ignore[misc]
     @telemetry.send_api_usage_telemetry(
         project=_PROJECT,
@@ -663,7 +631,7 @@ class FeatureAgglomeration(BaseTransformer):
             transform_kwargs = dict(
                 session=dataset._session,
                 dependencies=self._deps,
-                pass_through_cols=self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type="float",
             )
 
@@ -728,7 +696,7 @@ class FeatureAgglomeration(BaseTransformer):
             transform_kwargs = dict(
                 session=dataset._session,
                 dependencies=self._deps,
-                pass_through_cols=self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type="float",
             )
         elif isinstance(dataset, pd.DataFrame):
@@ -789,7 +757,7 @@ class FeatureAgglomeration(BaseTransformer):
             transform_kwargs = dict(
                 session=dataset._session,
                 dependencies=self._deps,
-                pass_through_cols=self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type="float",
             )
 
@@ -854,7 +822,7 @@ class FeatureAgglomeration(BaseTransformer):
             transform_kwargs = dict(
                 session=dataset._session,
                 dependencies=self._deps,
-                pass_through_cols=self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type="float",
             )
 
@@ -908,13 +876,17 @@ class FeatureAgglomeration(BaseTransformer):
         transform_kwargs: ScoreKwargsTypedDict = dict()
 
        if isinstance(dataset, DataFrame):
+            self._deps = self._batch_inference_validate_snowpark(
+                dataset=dataset,
+                inference_method="score",
+            )
             selected_cols = self._get_active_columns()
             if len(selected_cols) > 0:
                 dataset = dataset.select(selected_cols)
             assert isinstance(dataset._session, Session)  # keep mypy happy
             transform_kwargs = dict(
                 session=dataset._session,
-                dependencies=["snowflake-snowpark-python"] + self._get_dependencies(),
+                dependencies=["snowflake-snowpark-python"] + self._deps,
                 score_sproc_imports=['sklearn'],
             )
         elif isinstance(dataset, pd.DataFrame):
@@ -988,9 +960,9 @@ class FeatureAgglomeration(BaseTransformer):
             transform_kwargs = dict(
                 session = dataset._session,
                 dependencies = self._deps,
-                pass_through_cols = self._get_pass_through_columns(dataset),
-                expected_output_cols_type = "array",
-                n_neighbors = n_neighbors,
+                drop_input_cols = self._drop_input_cols,
+                expected_output_cols_type="array",
+                n_neighbors = n_neighbors,
                 return_distance = return_distance
             )
         elif isinstance(dataset, pd.DataFrame):