snowflake-ml-python 1.3.0__py3-none-any.whl → 1.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (211) hide show
  1. snowflake/ml/_internal/file_utils.py +3 -3
  2. snowflake/ml/_internal/human_readable_id/adjectives.txt +128 -0
  3. snowflake/ml/_internal/human_readable_id/animals.txt +128 -0
  4. snowflake/ml/_internal/human_readable_id/hrid_generator.py +40 -0
  5. snowflake/ml/_internal/human_readable_id/hrid_generator_base.py +135 -0
  6. snowflake/ml/_internal/telemetry.py +11 -2
  7. snowflake/ml/_internal/utils/formatting.py +1 -1
  8. snowflake/ml/feature_store/feature_store.py +15 -106
  9. snowflake/ml/fileset/sfcfs.py +4 -3
  10. snowflake/ml/fileset/stage_fs.py +18 -0
  11. snowflake/ml/model/_api.py +9 -9
  12. snowflake/ml/model/_client/model/model_version_impl.py +20 -15
  13. snowflake/ml/model/_deploy_client/image_builds/docker_context.py +3 -9
  14. snowflake/ml/model/_deploy_client/image_builds/server_image_builder.py +3 -5
  15. snowflake/ml/model/_deploy_client/snowservice/deploy.py +7 -6
  16. snowflake/ml/model/_model_composer/model_composer.py +10 -8
  17. snowflake/ml/model/_model_composer/model_method/function_generator.py +1 -1
  18. snowflake/ml/model/_model_composer/model_method/infer_table_function.py_template +2 -1
  19. snowflake/ml/model/_model_composer/model_method/model_method.py +2 -2
  20. snowflake/ml/model/_model_composer/model_runtime/_runtime_requirements.py +1 -1
  21. snowflake/ml/model/_packager/model_handlers/_base.py +2 -2
  22. snowflake/ml/model/_packager/model_handlers/_utils.py +5 -5
  23. snowflake/ml/model/_packager/model_handlers/custom.py +7 -7
  24. snowflake/ml/model/_packager/model_handlers/huggingface_pipeline.py +2 -2
  25. snowflake/ml/model/_packager/model_handlers/llm.py +1 -1
  26. snowflake/ml/model/_packager/model_handlers/mlflow.py +1 -1
  27. snowflake/ml/model/_packager/model_handlers/pytorch.py +13 -10
  28. snowflake/ml/model/_packager/model_handlers/sentence_transformers.py +214 -0
  29. snowflake/ml/model/_packager/model_handlers/sklearn.py +6 -6
  30. snowflake/ml/model/_packager/model_handlers/snowmlmodel.py +15 -3
  31. snowflake/ml/model/_packager/model_handlers/tensorflow.py +8 -8
  32. snowflake/ml/model/_packager/model_handlers/torchscript.py +7 -7
  33. snowflake/ml/model/_packager/model_handlers/xgboost.py +8 -8
  34. snowflake/ml/model/_packager/model_meta/_core_requirements.py +1 -1
  35. snowflake/ml/model/_packager/model_packager.py +8 -6
  36. snowflake/ml/model/custom_model.py +3 -1
  37. snowflake/ml/model/type_hints.py +13 -0
  38. snowflake/ml/modeling/_internal/estimator_utils.py +61 -1
  39. snowflake/ml/modeling/_internal/local_implementations/pandas_handlers.py +4 -43
  40. snowflake/ml/modeling/_internal/local_implementations/pandas_trainer.py +4 -4
  41. snowflake/ml/modeling/_internal/ml_runtime_implementations/ml_runtime_handlers.py +21 -17
  42. snowflake/ml/modeling/_internal/model_specifications.py +3 -1
  43. snowflake/ml/modeling/_internal/model_trainer.py +2 -2
  44. snowflake/ml/modeling/_internal/snowpark_implementations/distributed_hpo_trainer.py +547 -1
  45. snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_handlers.py +67 -114
  46. snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_trainer.py +9 -9
  47. snowflake/ml/modeling/_internal/transformer_protocols.py +2 -3
  48. snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +33 -61
  49. snowflake/ml/modeling/cluster/affinity_propagation.py +33 -61
  50. snowflake/ml/modeling/cluster/agglomerative_clustering.py +33 -61
  51. snowflake/ml/modeling/cluster/birch.py +33 -61
  52. snowflake/ml/modeling/cluster/bisecting_k_means.py +33 -61
  53. snowflake/ml/modeling/cluster/dbscan.py +33 -61
  54. snowflake/ml/modeling/cluster/feature_agglomeration.py +33 -61
  55. snowflake/ml/modeling/cluster/k_means.py +33 -61
  56. snowflake/ml/modeling/cluster/mean_shift.py +33 -61
  57. snowflake/ml/modeling/cluster/mini_batch_k_means.py +33 -61
  58. snowflake/ml/modeling/cluster/optics.py +33 -61
  59. snowflake/ml/modeling/cluster/spectral_biclustering.py +33 -61
  60. snowflake/ml/modeling/cluster/spectral_clustering.py +33 -61
  61. snowflake/ml/modeling/cluster/spectral_coclustering.py +33 -61
  62. snowflake/ml/modeling/compose/column_transformer.py +33 -61
  63. snowflake/ml/modeling/compose/transformed_target_regressor.py +33 -61
  64. snowflake/ml/modeling/covariance/elliptic_envelope.py +33 -61
  65. snowflake/ml/modeling/covariance/empirical_covariance.py +33 -61
  66. snowflake/ml/modeling/covariance/graphical_lasso.py +33 -61
  67. snowflake/ml/modeling/covariance/graphical_lasso_cv.py +33 -61
  68. snowflake/ml/modeling/covariance/ledoit_wolf.py +33 -61
  69. snowflake/ml/modeling/covariance/min_cov_det.py +33 -61
  70. snowflake/ml/modeling/covariance/oas.py +33 -61
  71. snowflake/ml/modeling/covariance/shrunk_covariance.py +33 -61
  72. snowflake/ml/modeling/decomposition/dictionary_learning.py +33 -61
  73. snowflake/ml/modeling/decomposition/factor_analysis.py +33 -61
  74. snowflake/ml/modeling/decomposition/fast_ica.py +33 -61
  75. snowflake/ml/modeling/decomposition/incremental_pca.py +33 -61
  76. snowflake/ml/modeling/decomposition/kernel_pca.py +33 -61
  77. snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +33 -61
  78. snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +33 -61
  79. snowflake/ml/modeling/decomposition/pca.py +33 -61
  80. snowflake/ml/modeling/decomposition/sparse_pca.py +33 -61
  81. snowflake/ml/modeling/decomposition/truncated_svd.py +33 -61
  82. snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +33 -61
  83. snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +33 -61
  84. snowflake/ml/modeling/ensemble/ada_boost_classifier.py +33 -61
  85. snowflake/ml/modeling/ensemble/ada_boost_regressor.py +33 -61
  86. snowflake/ml/modeling/ensemble/bagging_classifier.py +33 -61
  87. snowflake/ml/modeling/ensemble/bagging_regressor.py +33 -61
  88. snowflake/ml/modeling/ensemble/extra_trees_classifier.py +33 -61
  89. snowflake/ml/modeling/ensemble/extra_trees_regressor.py +33 -61
  90. snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +33 -61
  91. snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +33 -61
  92. snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +33 -61
  93. snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +33 -61
  94. snowflake/ml/modeling/ensemble/isolation_forest.py +33 -61
  95. snowflake/ml/modeling/ensemble/random_forest_classifier.py +33 -61
  96. snowflake/ml/modeling/ensemble/random_forest_regressor.py +33 -61
  97. snowflake/ml/modeling/ensemble/stacking_regressor.py +33 -61
  98. snowflake/ml/modeling/ensemble/voting_classifier.py +33 -61
  99. snowflake/ml/modeling/ensemble/voting_regressor.py +33 -61
  100. snowflake/ml/modeling/feature_selection/generic_univariate_select.py +33 -61
  101. snowflake/ml/modeling/feature_selection/select_fdr.py +33 -61
  102. snowflake/ml/modeling/feature_selection/select_fpr.py +33 -61
  103. snowflake/ml/modeling/feature_selection/select_fwe.py +33 -61
  104. snowflake/ml/modeling/feature_selection/select_k_best.py +33 -61
  105. snowflake/ml/modeling/feature_selection/select_percentile.py +33 -61
  106. snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +33 -61
  107. snowflake/ml/modeling/feature_selection/variance_threshold.py +33 -61
  108. snowflake/ml/modeling/framework/base.py +55 -5
  109. snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +33 -61
  110. snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +33 -61
  111. snowflake/ml/modeling/impute/iterative_imputer.py +33 -61
  112. snowflake/ml/modeling/impute/knn_imputer.py +33 -61
  113. snowflake/ml/modeling/impute/missing_indicator.py +33 -61
  114. snowflake/ml/modeling/impute/simple_imputer.py +4 -15
  115. snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +33 -61
  116. snowflake/ml/modeling/kernel_approximation/nystroem.py +33 -61
  117. snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +33 -61
  118. snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +33 -61
  119. snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +33 -61
  120. snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +33 -61
  121. snowflake/ml/modeling/lightgbm/lgbm_classifier.py +36 -63
  122. snowflake/ml/modeling/lightgbm/lgbm_regressor.py +36 -63
  123. snowflake/ml/modeling/linear_model/ard_regression.py +33 -61
  124. snowflake/ml/modeling/linear_model/bayesian_ridge.py +33 -61
  125. snowflake/ml/modeling/linear_model/elastic_net.py +33 -61
  126. snowflake/ml/modeling/linear_model/elastic_net_cv.py +33 -61
  127. snowflake/ml/modeling/linear_model/gamma_regressor.py +33 -61
  128. snowflake/ml/modeling/linear_model/huber_regressor.py +33 -61
  129. snowflake/ml/modeling/linear_model/lars.py +33 -61
  130. snowflake/ml/modeling/linear_model/lars_cv.py +33 -61
  131. snowflake/ml/modeling/linear_model/lasso.py +33 -61
  132. snowflake/ml/modeling/linear_model/lasso_cv.py +33 -61
  133. snowflake/ml/modeling/linear_model/lasso_lars.py +33 -61
  134. snowflake/ml/modeling/linear_model/lasso_lars_cv.py +33 -61
  135. snowflake/ml/modeling/linear_model/lasso_lars_ic.py +33 -61
  136. snowflake/ml/modeling/linear_model/linear_regression.py +33 -61
  137. snowflake/ml/modeling/linear_model/logistic_regression.py +33 -61
  138. snowflake/ml/modeling/linear_model/logistic_regression_cv.py +33 -61
  139. snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +33 -61
  140. snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +33 -61
  141. snowflake/ml/modeling/linear_model/multi_task_lasso.py +33 -61
  142. snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +33 -61
  143. snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +33 -61
  144. snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +33 -61
  145. snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +33 -61
  146. snowflake/ml/modeling/linear_model/perceptron.py +33 -61
  147. snowflake/ml/modeling/linear_model/poisson_regressor.py +33 -61
  148. snowflake/ml/modeling/linear_model/ransac_regressor.py +33 -61
  149. snowflake/ml/modeling/linear_model/ridge.py +33 -61
  150. snowflake/ml/modeling/linear_model/ridge_classifier.py +33 -61
  151. snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +33 -61
  152. snowflake/ml/modeling/linear_model/ridge_cv.py +33 -61
  153. snowflake/ml/modeling/linear_model/sgd_classifier.py +33 -61
  154. snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +33 -61
  155. snowflake/ml/modeling/linear_model/sgd_regressor.py +33 -61
  156. snowflake/ml/modeling/linear_model/theil_sen_regressor.py +33 -61
  157. snowflake/ml/modeling/linear_model/tweedie_regressor.py +33 -61
  158. snowflake/ml/modeling/manifold/isomap.py +33 -61
  159. snowflake/ml/modeling/manifold/mds.py +33 -61
  160. snowflake/ml/modeling/manifold/spectral_embedding.py +33 -61
  161. snowflake/ml/modeling/manifold/tsne.py +33 -61
  162. snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +33 -61
  163. snowflake/ml/modeling/mixture/gaussian_mixture.py +33 -61
  164. snowflake/ml/modeling/model_selection/grid_search_cv.py +39 -57
  165. snowflake/ml/modeling/model_selection/randomized_search_cv.py +26 -57
  166. snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +33 -61
  167. snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +33 -61
  168. snowflake/ml/modeling/multiclass/output_code_classifier.py +33 -61
  169. snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +33 -61
  170. snowflake/ml/modeling/naive_bayes/categorical_nb.py +33 -61
  171. snowflake/ml/modeling/naive_bayes/complement_nb.py +33 -61
  172. snowflake/ml/modeling/naive_bayes/gaussian_nb.py +33 -61
  173. snowflake/ml/modeling/naive_bayes/multinomial_nb.py +33 -61
  174. snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +33 -61
  175. snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +33 -61
  176. snowflake/ml/modeling/neighbors/kernel_density.py +33 -61
  177. snowflake/ml/modeling/neighbors/local_outlier_factor.py +33 -61
  178. snowflake/ml/modeling/neighbors/nearest_centroid.py +33 -61
  179. snowflake/ml/modeling/neighbors/nearest_neighbors.py +33 -61
  180. snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +33 -61
  181. snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +33 -61
  182. snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +33 -61
  183. snowflake/ml/modeling/neural_network/bernoulli_rbm.py +33 -61
  184. snowflake/ml/modeling/neural_network/mlp_classifier.py +33 -61
  185. snowflake/ml/modeling/neural_network/mlp_regressor.py +33 -61
  186. snowflake/ml/modeling/preprocessing/polynomial_features.py +33 -61
  187. snowflake/ml/modeling/semi_supervised/label_propagation.py +33 -61
  188. snowflake/ml/modeling/semi_supervised/label_spreading.py +33 -61
  189. snowflake/ml/modeling/svm/linear_svc.py +33 -61
  190. snowflake/ml/modeling/svm/linear_svr.py +33 -61
  191. snowflake/ml/modeling/svm/nu_svc.py +33 -61
  192. snowflake/ml/modeling/svm/nu_svr.py +33 -61
  193. snowflake/ml/modeling/svm/svc.py +33 -61
  194. snowflake/ml/modeling/svm/svr.py +33 -61
  195. snowflake/ml/modeling/tree/decision_tree_classifier.py +33 -61
  196. snowflake/ml/modeling/tree/decision_tree_regressor.py +33 -61
  197. snowflake/ml/modeling/tree/extra_tree_classifier.py +33 -61
  198. snowflake/ml/modeling/tree/extra_tree_regressor.py +33 -61
  199. snowflake/ml/modeling/xgboost/xgb_classifier.py +33 -61
  200. snowflake/ml/modeling/xgboost/xgb_regressor.py +33 -61
  201. snowflake/ml/modeling/xgboost/xgbrf_classifier.py +33 -61
  202. snowflake/ml/modeling/xgboost/xgbrf_regressor.py +33 -61
  203. snowflake/ml/registry/_manager/model_manager.py +6 -2
  204. snowflake/ml/registry/model_registry.py +100 -27
  205. snowflake/ml/registry/registry.py +6 -2
  206. snowflake/ml/version.py +1 -1
  207. {snowflake_ml_python-1.3.0.dist-info → snowflake_ml_python-1.4.0.dist-info}/METADATA +43 -7
  208. {snowflake_ml_python-1.3.0.dist-info → snowflake_ml_python-1.4.0.dist-info}/RECORD +211 -206
  209. {snowflake_ml_python-1.3.0.dist-info → snowflake_ml_python-1.4.0.dist-info}/LICENSE.txt +0 -0
  210. {snowflake_ml_python-1.3.0.dist-info → snowflake_ml_python-1.4.0.dist-info}/WHEEL +0 -0
  211. {snowflake_ml_python-1.3.0.dist-info → snowflake_ml_python-1.4.0.dist-info}/top_level.txt +0 -0
@@ -461,18 +461,24 @@ class GradientBoostingRegressor(BaseTransformer):
461
461
  self._get_model_signatures(dataset)
462
462
  return self
463
463
 
464
- def _get_pass_through_columns(self, dataset: DataFrame) -> List[str]:
465
- if self._drop_input_cols:
466
- return []
467
- else:
468
- return list(set(dataset.columns) - set(self.output_cols))
469
-
470
464
  def _batch_inference_validate_snowpark(
471
465
  self,
472
466
  dataset: DataFrame,
473
467
  inference_method: str,
474
468
  ) -> List[str]:
475
- """Util method to run validate that batch inference can be run on a snowpark dataframe.
469
+ """Util method to run validate that batch inference can be run on a snowpark dataframe and
470
+ return the available package that exists in the snowflake anaconda channel
471
+
472
+ Args:
473
+ dataset: snowpark dataframe
474
+ inference_method: the inference method such as predict, score...
475
+
476
+ Raises:
477
+ SnowflakeMLException: If the estimator is not fitted, raise error
478
+ SnowflakeMLException: If the session is None, raise error
479
+
480
+ Returns:
481
+ A list of available package that exists in the snowflake anaconda channel
476
482
  """
477
483
  if not self._is_fitted:
478
484
  raise exceptions.SnowflakeMLException(
@@ -546,7 +552,7 @@ class GradientBoostingRegressor(BaseTransformer):
546
552
  transform_kwargs = dict(
547
553
  session = dataset._session,
548
554
  dependencies = self._deps,
549
- pass_through_cols = self._get_pass_through_columns(dataset),
555
+ drop_input_cols = self._drop_input_cols,
550
556
  expected_output_cols_type = expected_type_inferred,
551
557
  )
552
558
 
@@ -606,16 +612,16 @@ class GradientBoostingRegressor(BaseTransformer):
606
612
  # from (n_samples, n_estimators) to (n_samples, n_estimators * n_classes) (and everything in between)
607
613
  # based on init param values. We will convert that to pandas dataframe of shape (n_samples, 1) with
608
614
  # each row containing a list of values.
609
- expected_dtype = "ARRAY"
615
+ expected_dtype = "array"
610
616
 
611
617
  # If we were unable to assign a type to this transform in the factory, infer the type here.
612
618
  if expected_dtype == "":
613
- # If this is a clustering transformer, if the number of output columns does not equal the number of clusters the response will be an "ARRAY"
619
+ # If this is a clustering transformer, if the number of output columns does not equal the number of clusters the response will be an "array"
614
620
  if hasattr(self._sklearn_object, "n_clusters") and getattr(self._sklearn_object, "n_clusters") != len(self.output_cols):
615
- expected_dtype = "ARRAY"
616
- # If this is a decomposition transformer, if the number of output columns does not equal the number of components the response will be an "ARRAY"
621
+ expected_dtype = "array"
622
+ # If this is a decomposition transformer, if the number of output columns does not equal the number of components the response will be an "array"
617
623
  elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
618
- expected_dtype = "ARRAY"
624
+ expected_dtype = "array"
619
625
  else:
620
626
  output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
621
627
  # We can only infer the output types from the input types if the following two statemetns are true:
@@ -633,7 +639,7 @@ class GradientBoostingRegressor(BaseTransformer):
633
639
  transform_kwargs = dict(
634
640
  session = dataset._session,
635
641
  dependencies = self._deps,
636
- pass_through_cols = self._get_pass_through_columns(dataset),
642
+ drop_input_cols = self._drop_input_cols,
637
643
  expected_output_cols_type = expected_dtype,
638
644
  )
639
645
 
@@ -684,7 +690,7 @@ class GradientBoostingRegressor(BaseTransformer):
684
690
  subproject=_SUBPROJECT,
685
691
  )
686
692
  output_result, fitted_estimator = model_trainer.train_fit_predict(
687
- pass_through_columns=self._get_pass_through_columns(dataset),
693
+ drop_input_cols=self._drop_input_cols,
688
694
  expected_output_cols_list=self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix),
689
695
  )
690
696
  self._sklearn_object = fitted_estimator
@@ -702,44 +708,6 @@ class GradientBoostingRegressor(BaseTransformer):
702
708
  assert self._sklearn_object is not None
703
709
  return self._sklearn_object.embedding_
704
710
 
705
-
706
- def _get_output_column_names(self, output_cols_prefix: str, output_cols: Optional[List[str]] = None) -> List[str]:
707
- """ Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
708
- Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
709
- """
710
- output_cols_prefix = identifier.resolve_identifier(output_cols_prefix)
711
- if output_cols:
712
- output_cols = [
713
- identifier.concat_names([output_cols_prefix, identifier.resolve_identifier(c)])
714
- for c in output_cols
715
- ]
716
- elif getattr(self._sklearn_object, "classes_", None) is None:
717
- output_cols = [output_cols_prefix]
718
- elif self._sklearn_object is not None:
719
- classes = self._sklearn_object.classes_
720
- if isinstance(classes, numpy.ndarray):
721
- output_cols = [f'{output_cols_prefix}{str(c)}' for c in classes.tolist()]
722
- elif isinstance(classes, list) and len(classes) > 0 and isinstance(classes[0], numpy.ndarray):
723
- # If the estimator is a multioutput estimator, classes_ will be a list of ndarrays.
724
- output_cols = []
725
- for i, cl in enumerate(classes):
726
- # For binary classification, there is only one output column for each class
727
- # ndarray as the two classes are complementary.
728
- if len(cl) == 2:
729
- output_cols.append(f'{output_cols_prefix}{i}_{cl[0]}')
730
- else:
731
- output_cols.extend([
732
- f'{output_cols_prefix}{i}_{c}' for c in cl.tolist()
733
- ])
734
- else:
735
- output_cols = []
736
-
737
- # Make sure column names are valid snowflake identifiers.
738
- assert output_cols is not None # Make MyPy happy
739
- rv = [identifier.rename_to_valid_snowflake_identifier(c) for c in output_cols]
740
-
741
- return rv
742
-
743
711
  @available_if(original_estimator_has_callable("predict_proba")) # type: ignore[misc]
744
712
  @telemetry.send_api_usage_telemetry(
745
713
  project=_PROJECT,
@@ -779,7 +747,7 @@ class GradientBoostingRegressor(BaseTransformer):
779
747
  transform_kwargs = dict(
780
748
  session=dataset._session,
781
749
  dependencies=self._deps,
782
- pass_through_cols=self._get_pass_through_columns(dataset),
750
+ drop_input_cols = self._drop_input_cols,
783
751
  expected_output_cols_type="float",
784
752
  )
785
753
 
@@ -844,7 +812,7 @@ class GradientBoostingRegressor(BaseTransformer):
844
812
  transform_kwargs = dict(
845
813
  session=dataset._session,
846
814
  dependencies=self._deps,
847
- pass_through_cols=self._get_pass_through_columns(dataset),
815
+ drop_input_cols = self._drop_input_cols,
848
816
  expected_output_cols_type="float",
849
817
  )
850
818
  elif isinstance(dataset, pd.DataFrame):
@@ -905,7 +873,7 @@ class GradientBoostingRegressor(BaseTransformer):
905
873
  transform_kwargs = dict(
906
874
  session=dataset._session,
907
875
  dependencies=self._deps,
908
- pass_through_cols=self._get_pass_through_columns(dataset),
876
+ drop_input_cols = self._drop_input_cols,
909
877
  expected_output_cols_type="float",
910
878
  )
911
879
 
@@ -970,7 +938,7 @@ class GradientBoostingRegressor(BaseTransformer):
970
938
  transform_kwargs = dict(
971
939
  session=dataset._session,
972
940
  dependencies=self._deps,
973
- pass_through_cols=self._get_pass_through_columns(dataset),
941
+ drop_input_cols = self._drop_input_cols,
974
942
  expected_output_cols_type="float",
975
943
  )
976
944
 
@@ -1026,13 +994,17 @@ class GradientBoostingRegressor(BaseTransformer):
1026
994
  transform_kwargs: ScoreKwargsTypedDict = dict()
1027
995
 
1028
996
  if isinstance(dataset, DataFrame):
997
+ self._deps = self._batch_inference_validate_snowpark(
998
+ dataset=dataset,
999
+ inference_method="score",
1000
+ )
1029
1001
  selected_cols = self._get_active_columns()
1030
1002
  if len(selected_cols) > 0:
1031
1003
  dataset = dataset.select(selected_cols)
1032
1004
  assert isinstance(dataset._session, Session) # keep mypy happy
1033
1005
  transform_kwargs = dict(
1034
1006
  session=dataset._session,
1035
- dependencies=["snowflake-snowpark-python"] + self._get_dependencies(),
1007
+ dependencies=["snowflake-snowpark-python"] + self._deps,
1036
1008
  score_sproc_imports=['sklearn'],
1037
1009
  )
1038
1010
  elif isinstance(dataset, pd.DataFrame):
@@ -1106,9 +1078,9 @@ class GradientBoostingRegressor(BaseTransformer):
1106
1078
  transform_kwargs = dict(
1107
1079
  session = dataset._session,
1108
1080
  dependencies = self._deps,
1109
- pass_through_cols = self._get_pass_through_columns(dataset),
1110
- expected_output_cols_type = "array",
1111
- n_neighbors = n_neighbors,
1081
+ drop_input_cols = self._drop_input_cols,
1082
+ expected_output_cols_type="array",
1083
+ n_neighbors = n_neighbors,
1112
1084
  return_distance = return_distance
1113
1085
  )
1114
1086
  elif isinstance(dataset, pd.DataFrame):
@@ -433,18 +433,24 @@ class HistGradientBoostingClassifier(BaseTransformer):
433
433
  self._get_model_signatures(dataset)
434
434
  return self
435
435
 
436
- def _get_pass_through_columns(self, dataset: DataFrame) -> List[str]:
437
- if self._drop_input_cols:
438
- return []
439
- else:
440
- return list(set(dataset.columns) - set(self.output_cols))
441
-
442
436
  def _batch_inference_validate_snowpark(
443
437
  self,
444
438
  dataset: DataFrame,
445
439
  inference_method: str,
446
440
  ) -> List[str]:
447
- """Util method to run validate that batch inference can be run on a snowpark dataframe.
441
+ """Util method to run validate that batch inference can be run on a snowpark dataframe and
442
+ return the available package that exists in the snowflake anaconda channel
443
+
444
+ Args:
445
+ dataset: snowpark dataframe
446
+ inference_method: the inference method such as predict, score...
447
+
448
+ Raises:
449
+ SnowflakeMLException: If the estimator is not fitted, raise error
450
+ SnowflakeMLException: If the session is None, raise error
451
+
452
+ Returns:
453
+ A list of available package that exists in the snowflake anaconda channel
448
454
  """
449
455
  if not self._is_fitted:
450
456
  raise exceptions.SnowflakeMLException(
@@ -518,7 +524,7 @@ class HistGradientBoostingClassifier(BaseTransformer):
518
524
  transform_kwargs = dict(
519
525
  session = dataset._session,
520
526
  dependencies = self._deps,
521
- pass_through_cols = self._get_pass_through_columns(dataset),
527
+ drop_input_cols = self._drop_input_cols,
522
528
  expected_output_cols_type = expected_type_inferred,
523
529
  )
524
530
 
@@ -578,16 +584,16 @@ class HistGradientBoostingClassifier(BaseTransformer):
578
584
  # from (n_samples, n_estimators) to (n_samples, n_estimators * n_classes) (and everything in between)
579
585
  # based on init param values. We will convert that to pandas dataframe of shape (n_samples, 1) with
580
586
  # each row containing a list of values.
581
- expected_dtype = "ARRAY"
587
+ expected_dtype = "array"
582
588
 
583
589
  # If we were unable to assign a type to this transform in the factory, infer the type here.
584
590
  if expected_dtype == "":
585
- # If this is a clustering transformer, if the number of output columns does not equal the number of clusters the response will be an "ARRAY"
591
+ # If this is a clustering transformer, if the number of output columns does not equal the number of clusters the response will be an "array"
586
592
  if hasattr(self._sklearn_object, "n_clusters") and getattr(self._sklearn_object, "n_clusters") != len(self.output_cols):
587
- expected_dtype = "ARRAY"
588
- # If this is a decomposition transformer, if the number of output columns does not equal the number of components the response will be an "ARRAY"
593
+ expected_dtype = "array"
594
+ # If this is a decomposition transformer, if the number of output columns does not equal the number of components the response will be an "array"
589
595
  elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
590
- expected_dtype = "ARRAY"
596
+ expected_dtype = "array"
591
597
  else:
592
598
  output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
593
599
  # We can only infer the output types from the input types if the following two statemetns are true:
@@ -605,7 +611,7 @@ class HistGradientBoostingClassifier(BaseTransformer):
605
611
  transform_kwargs = dict(
606
612
  session = dataset._session,
607
613
  dependencies = self._deps,
608
- pass_through_cols = self._get_pass_through_columns(dataset),
614
+ drop_input_cols = self._drop_input_cols,
609
615
  expected_output_cols_type = expected_dtype,
610
616
  )
611
617
 
@@ -656,7 +662,7 @@ class HistGradientBoostingClassifier(BaseTransformer):
656
662
  subproject=_SUBPROJECT,
657
663
  )
658
664
  output_result, fitted_estimator = model_trainer.train_fit_predict(
659
- pass_through_columns=self._get_pass_through_columns(dataset),
665
+ drop_input_cols=self._drop_input_cols,
660
666
  expected_output_cols_list=self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix),
661
667
  )
662
668
  self._sklearn_object = fitted_estimator
@@ -674,44 +680,6 @@ class HistGradientBoostingClassifier(BaseTransformer):
674
680
  assert self._sklearn_object is not None
675
681
  return self._sklearn_object.embedding_
676
682
 
677
-
678
- def _get_output_column_names(self, output_cols_prefix: str, output_cols: Optional[List[str]] = None) -> List[str]:
679
- """ Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
680
- Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
681
- """
682
- output_cols_prefix = identifier.resolve_identifier(output_cols_prefix)
683
- if output_cols:
684
- output_cols = [
685
- identifier.concat_names([output_cols_prefix, identifier.resolve_identifier(c)])
686
- for c in output_cols
687
- ]
688
- elif getattr(self._sklearn_object, "classes_", None) is None:
689
- output_cols = [output_cols_prefix]
690
- elif self._sklearn_object is not None:
691
- classes = self._sklearn_object.classes_
692
- if isinstance(classes, numpy.ndarray):
693
- output_cols = [f'{output_cols_prefix}{str(c)}' for c in classes.tolist()]
694
- elif isinstance(classes, list) and len(classes) > 0 and isinstance(classes[0], numpy.ndarray):
695
- # If the estimator is a multioutput estimator, classes_ will be a list of ndarrays.
696
- output_cols = []
697
- for i, cl in enumerate(classes):
698
- # For binary classification, there is only one output column for each class
699
- # ndarray as the two classes are complementary.
700
- if len(cl) == 2:
701
- output_cols.append(f'{output_cols_prefix}{i}_{cl[0]}')
702
- else:
703
- output_cols.extend([
704
- f'{output_cols_prefix}{i}_{c}' for c in cl.tolist()
705
- ])
706
- else:
707
- output_cols = []
708
-
709
- # Make sure column names are valid snowflake identifiers.
710
- assert output_cols is not None # Make MyPy happy
711
- rv = [identifier.rename_to_valid_snowflake_identifier(c) for c in output_cols]
712
-
713
- return rv
714
-
715
683
  @available_if(original_estimator_has_callable("predict_proba")) # type: ignore[misc]
716
684
  @telemetry.send_api_usage_telemetry(
717
685
  project=_PROJECT,
@@ -753,7 +721,7 @@ class HistGradientBoostingClassifier(BaseTransformer):
753
721
  transform_kwargs = dict(
754
722
  session=dataset._session,
755
723
  dependencies=self._deps,
756
- pass_through_cols=self._get_pass_through_columns(dataset),
724
+ drop_input_cols = self._drop_input_cols,
757
725
  expected_output_cols_type="float",
758
726
  )
759
727
 
@@ -820,7 +788,7 @@ class HistGradientBoostingClassifier(BaseTransformer):
820
788
  transform_kwargs = dict(
821
789
  session=dataset._session,
822
790
  dependencies=self._deps,
823
- pass_through_cols=self._get_pass_through_columns(dataset),
791
+ drop_input_cols = self._drop_input_cols,
824
792
  expected_output_cols_type="float",
825
793
  )
826
794
  elif isinstance(dataset, pd.DataFrame):
@@ -883,7 +851,7 @@ class HistGradientBoostingClassifier(BaseTransformer):
883
851
  transform_kwargs = dict(
884
852
  session=dataset._session,
885
853
  dependencies=self._deps,
886
- pass_through_cols=self._get_pass_through_columns(dataset),
854
+ drop_input_cols = self._drop_input_cols,
887
855
  expected_output_cols_type="float",
888
856
  )
889
857
 
@@ -948,7 +916,7 @@ class HistGradientBoostingClassifier(BaseTransformer):
948
916
  transform_kwargs = dict(
949
917
  session=dataset._session,
950
918
  dependencies=self._deps,
951
- pass_through_cols=self._get_pass_through_columns(dataset),
919
+ drop_input_cols = self._drop_input_cols,
952
920
  expected_output_cols_type="float",
953
921
  )
954
922
 
@@ -1004,13 +972,17 @@ class HistGradientBoostingClassifier(BaseTransformer):
1004
972
  transform_kwargs: ScoreKwargsTypedDict = dict()
1005
973
 
1006
974
  if isinstance(dataset, DataFrame):
975
+ self._deps = self._batch_inference_validate_snowpark(
976
+ dataset=dataset,
977
+ inference_method="score",
978
+ )
1007
979
  selected_cols = self._get_active_columns()
1008
980
  if len(selected_cols) > 0:
1009
981
  dataset = dataset.select(selected_cols)
1010
982
  assert isinstance(dataset._session, Session) # keep mypy happy
1011
983
  transform_kwargs = dict(
1012
984
  session=dataset._session,
1013
- dependencies=["snowflake-snowpark-python"] + self._get_dependencies(),
985
+ dependencies=["snowflake-snowpark-python"] + self._deps,
1014
986
  score_sproc_imports=['sklearn'],
1015
987
  )
1016
988
  elif isinstance(dataset, pd.DataFrame):
@@ -1084,9 +1056,9 @@ class HistGradientBoostingClassifier(BaseTransformer):
1084
1056
  transform_kwargs = dict(
1085
1057
  session = dataset._session,
1086
1058
  dependencies = self._deps,
1087
- pass_through_cols = self._get_pass_through_columns(dataset),
1088
- expected_output_cols_type = "array",
1089
- n_neighbors = n_neighbors,
1059
+ drop_input_cols = self._drop_input_cols,
1060
+ expected_output_cols_type="array",
1061
+ n_neighbors = n_neighbors,
1090
1062
  return_distance = return_distance
1091
1063
  )
1092
1064
  elif isinstance(dataset, pd.DataFrame):
@@ -424,18 +424,24 @@ class HistGradientBoostingRegressor(BaseTransformer):
424
424
  self._get_model_signatures(dataset)
425
425
  return self
426
426
 
427
- def _get_pass_through_columns(self, dataset: DataFrame) -> List[str]:
428
- if self._drop_input_cols:
429
- return []
430
- else:
431
- return list(set(dataset.columns) - set(self.output_cols))
432
-
433
427
  def _batch_inference_validate_snowpark(
434
428
  self,
435
429
  dataset: DataFrame,
436
430
  inference_method: str,
437
431
  ) -> List[str]:
438
- """Util method to run validate that batch inference can be run on a snowpark dataframe.
432
+ """Util method to run validate that batch inference can be run on a snowpark dataframe and
433
+ return the available package that exists in the snowflake anaconda channel
434
+
435
+ Args:
436
+ dataset: snowpark dataframe
437
+ inference_method: the inference method such as predict, score...
438
+
439
+ Raises:
440
+ SnowflakeMLException: If the estimator is not fitted, raise error
441
+ SnowflakeMLException: If the session is None, raise error
442
+
443
+ Returns:
444
+ A list of available package that exists in the snowflake anaconda channel
439
445
  """
440
446
  if not self._is_fitted:
441
447
  raise exceptions.SnowflakeMLException(
@@ -509,7 +515,7 @@ class HistGradientBoostingRegressor(BaseTransformer):
509
515
  transform_kwargs = dict(
510
516
  session = dataset._session,
511
517
  dependencies = self._deps,
512
- pass_through_cols = self._get_pass_through_columns(dataset),
518
+ drop_input_cols = self._drop_input_cols,
513
519
  expected_output_cols_type = expected_type_inferred,
514
520
  )
515
521
 
@@ -569,16 +575,16 @@ class HistGradientBoostingRegressor(BaseTransformer):
569
575
  # from (n_samples, n_estimators) to (n_samples, n_estimators * n_classes) (and everything in between)
570
576
  # based on init param values. We will convert that to pandas dataframe of shape (n_samples, 1) with
571
577
  # each row containing a list of values.
572
- expected_dtype = "ARRAY"
578
+ expected_dtype = "array"
573
579
 
574
580
  # If we were unable to assign a type to this transform in the factory, infer the type here.
575
581
  if expected_dtype == "":
576
- # If this is a clustering transformer, if the number of output columns does not equal the number of clusters the response will be an "ARRAY"
582
+ # If this is a clustering transformer, if the number of output columns does not equal the number of clusters the response will be an "array"
577
583
  if hasattr(self._sklearn_object, "n_clusters") and getattr(self._sklearn_object, "n_clusters") != len(self.output_cols):
578
- expected_dtype = "ARRAY"
579
- # If this is a decomposition transformer, if the number of output columns does not equal the number of components the response will be an "ARRAY"
584
+ expected_dtype = "array"
585
+ # If this is a decomposition transformer, if the number of output columns does not equal the number of components the response will be an "array"
580
586
  elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
581
- expected_dtype = "ARRAY"
587
+ expected_dtype = "array"
582
588
  else:
583
589
  output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
584
590
  # We can only infer the output types from the input types if the following two statemetns are true:
@@ -596,7 +602,7 @@ class HistGradientBoostingRegressor(BaseTransformer):
596
602
  transform_kwargs = dict(
597
603
  session = dataset._session,
598
604
  dependencies = self._deps,
599
- pass_through_cols = self._get_pass_through_columns(dataset),
605
+ drop_input_cols = self._drop_input_cols,
600
606
  expected_output_cols_type = expected_dtype,
601
607
  )
602
608
 
@@ -647,7 +653,7 @@ class HistGradientBoostingRegressor(BaseTransformer):
647
653
  subproject=_SUBPROJECT,
648
654
  )
649
655
  output_result, fitted_estimator = model_trainer.train_fit_predict(
650
- pass_through_columns=self._get_pass_through_columns(dataset),
656
+ drop_input_cols=self._drop_input_cols,
651
657
  expected_output_cols_list=self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix),
652
658
  )
653
659
  self._sklearn_object = fitted_estimator
@@ -665,44 +671,6 @@ class HistGradientBoostingRegressor(BaseTransformer):
665
671
  assert self._sklearn_object is not None
666
672
  return self._sklearn_object.embedding_
667
673
 
668
-
669
- def _get_output_column_names(self, output_cols_prefix: str, output_cols: Optional[List[str]] = None) -> List[str]:
670
- """ Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
671
- Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
672
- """
673
- output_cols_prefix = identifier.resolve_identifier(output_cols_prefix)
674
- if output_cols:
675
- output_cols = [
676
- identifier.concat_names([output_cols_prefix, identifier.resolve_identifier(c)])
677
- for c in output_cols
678
- ]
679
- elif getattr(self._sklearn_object, "classes_", None) is None:
680
- output_cols = [output_cols_prefix]
681
- elif self._sklearn_object is not None:
682
- classes = self._sklearn_object.classes_
683
- if isinstance(classes, numpy.ndarray):
684
- output_cols = [f'{output_cols_prefix}{str(c)}' for c in classes.tolist()]
685
- elif isinstance(classes, list) and len(classes) > 0 and isinstance(classes[0], numpy.ndarray):
686
- # If the estimator is a multioutput estimator, classes_ will be a list of ndarrays.
687
- output_cols = []
688
- for i, cl in enumerate(classes):
689
- # For binary classification, there is only one output column for each class
690
- # ndarray as the two classes are complementary.
691
- if len(cl) == 2:
692
- output_cols.append(f'{output_cols_prefix}{i}_{cl[0]}')
693
- else:
694
- output_cols.extend([
695
- f'{output_cols_prefix}{i}_{c}' for c in cl.tolist()
696
- ])
697
- else:
698
- output_cols = []
699
-
700
- # Make sure column names are valid snowflake identifiers.
701
- assert output_cols is not None # Make MyPy happy
702
- rv = [identifier.rename_to_valid_snowflake_identifier(c) for c in output_cols]
703
-
704
- return rv
705
-
706
674
  @available_if(original_estimator_has_callable("predict_proba")) # type: ignore[misc]
707
675
  @telemetry.send_api_usage_telemetry(
708
676
  project=_PROJECT,
@@ -742,7 +710,7 @@ class HistGradientBoostingRegressor(BaseTransformer):
742
710
  transform_kwargs = dict(
743
711
  session=dataset._session,
744
712
  dependencies=self._deps,
745
- pass_through_cols=self._get_pass_through_columns(dataset),
713
+ drop_input_cols = self._drop_input_cols,
746
714
  expected_output_cols_type="float",
747
715
  )
748
716
 
@@ -807,7 +775,7 @@ class HistGradientBoostingRegressor(BaseTransformer):
807
775
  transform_kwargs = dict(
808
776
  session=dataset._session,
809
777
  dependencies=self._deps,
810
- pass_through_cols=self._get_pass_through_columns(dataset),
778
+ drop_input_cols = self._drop_input_cols,
811
779
  expected_output_cols_type="float",
812
780
  )
813
781
  elif isinstance(dataset, pd.DataFrame):
@@ -868,7 +836,7 @@ class HistGradientBoostingRegressor(BaseTransformer):
868
836
  transform_kwargs = dict(
869
837
  session=dataset._session,
870
838
  dependencies=self._deps,
871
- pass_through_cols=self._get_pass_through_columns(dataset),
839
+ drop_input_cols = self._drop_input_cols,
872
840
  expected_output_cols_type="float",
873
841
  )
874
842
 
@@ -933,7 +901,7 @@ class HistGradientBoostingRegressor(BaseTransformer):
933
901
  transform_kwargs = dict(
934
902
  session=dataset._session,
935
903
  dependencies=self._deps,
936
- pass_through_cols=self._get_pass_through_columns(dataset),
904
+ drop_input_cols = self._drop_input_cols,
937
905
  expected_output_cols_type="float",
938
906
  )
939
907
 
@@ -989,13 +957,17 @@ class HistGradientBoostingRegressor(BaseTransformer):
989
957
  transform_kwargs: ScoreKwargsTypedDict = dict()
990
958
 
991
959
  if isinstance(dataset, DataFrame):
960
+ self._deps = self._batch_inference_validate_snowpark(
961
+ dataset=dataset,
962
+ inference_method="score",
963
+ )
992
964
  selected_cols = self._get_active_columns()
993
965
  if len(selected_cols) > 0:
994
966
  dataset = dataset.select(selected_cols)
995
967
  assert isinstance(dataset._session, Session) # keep mypy happy
996
968
  transform_kwargs = dict(
997
969
  session=dataset._session,
998
- dependencies=["snowflake-snowpark-python"] + self._get_dependencies(),
970
+ dependencies=["snowflake-snowpark-python"] + self._deps,
999
971
  score_sproc_imports=['sklearn'],
1000
972
  )
1001
973
  elif isinstance(dataset, pd.DataFrame):
@@ -1069,9 +1041,9 @@ class HistGradientBoostingRegressor(BaseTransformer):
1069
1041
  transform_kwargs = dict(
1070
1042
  session = dataset._session,
1071
1043
  dependencies = self._deps,
1072
- pass_through_cols = self._get_pass_through_columns(dataset),
1073
- expected_output_cols_type = "array",
1074
- n_neighbors = n_neighbors,
1044
+ drop_input_cols = self._drop_input_cols,
1045
+ expected_output_cols_type="array",
1046
+ n_neighbors = n_neighbors,
1075
1047
  return_distance = return_distance
1076
1048
  )
1077
1049
  elif isinstance(dataset, pd.DataFrame):