snowflake-ml-python 1.3.0__py3-none-any.whl → 1.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (211)
  1. snowflake/ml/_internal/file_utils.py +3 -3
  2. snowflake/ml/_internal/human_readable_id/adjectives.txt +128 -0
  3. snowflake/ml/_internal/human_readable_id/animals.txt +128 -0
  4. snowflake/ml/_internal/human_readable_id/hrid_generator.py +40 -0
  5. snowflake/ml/_internal/human_readable_id/hrid_generator_base.py +135 -0
  6. snowflake/ml/_internal/telemetry.py +11 -2
  7. snowflake/ml/_internal/utils/formatting.py +1 -1
  8. snowflake/ml/feature_store/feature_store.py +15 -106
  9. snowflake/ml/fileset/sfcfs.py +4 -3
  10. snowflake/ml/fileset/stage_fs.py +18 -0
  11. snowflake/ml/model/_api.py +9 -9
  12. snowflake/ml/model/_client/model/model_version_impl.py +20 -15
  13. snowflake/ml/model/_deploy_client/image_builds/docker_context.py +3 -9
  14. snowflake/ml/model/_deploy_client/image_builds/server_image_builder.py +3 -5
  15. snowflake/ml/model/_deploy_client/snowservice/deploy.py +7 -6
  16. snowflake/ml/model/_model_composer/model_composer.py +10 -8
  17. snowflake/ml/model/_model_composer/model_method/function_generator.py +1 -1
  18. snowflake/ml/model/_model_composer/model_method/infer_table_function.py_template +2 -1
  19. snowflake/ml/model/_model_composer/model_method/model_method.py +2 -2
  20. snowflake/ml/model/_model_composer/model_runtime/_runtime_requirements.py +1 -1
  21. snowflake/ml/model/_packager/model_handlers/_base.py +2 -2
  22. snowflake/ml/model/_packager/model_handlers/_utils.py +5 -5
  23. snowflake/ml/model/_packager/model_handlers/custom.py +7 -7
  24. snowflake/ml/model/_packager/model_handlers/huggingface_pipeline.py +2 -2
  25. snowflake/ml/model/_packager/model_handlers/llm.py +1 -1
  26. snowflake/ml/model/_packager/model_handlers/mlflow.py +1 -1
  27. snowflake/ml/model/_packager/model_handlers/pytorch.py +13 -10
  28. snowflake/ml/model/_packager/model_handlers/sentence_transformers.py +214 -0
  29. snowflake/ml/model/_packager/model_handlers/sklearn.py +6 -6
  30. snowflake/ml/model/_packager/model_handlers/snowmlmodel.py +15 -3
  31. snowflake/ml/model/_packager/model_handlers/tensorflow.py +8 -8
  32. snowflake/ml/model/_packager/model_handlers/torchscript.py +7 -7
  33. snowflake/ml/model/_packager/model_handlers/xgboost.py +8 -8
  34. snowflake/ml/model/_packager/model_meta/_core_requirements.py +1 -1
  35. snowflake/ml/model/_packager/model_packager.py +8 -6
  36. snowflake/ml/model/custom_model.py +3 -1
  37. snowflake/ml/model/type_hints.py +13 -0
  38. snowflake/ml/modeling/_internal/estimator_utils.py +61 -1
  39. snowflake/ml/modeling/_internal/local_implementations/pandas_handlers.py +4 -43
  40. snowflake/ml/modeling/_internal/local_implementations/pandas_trainer.py +4 -4
  41. snowflake/ml/modeling/_internal/ml_runtime_implementations/ml_runtime_handlers.py +21 -17
  42. snowflake/ml/modeling/_internal/model_specifications.py +3 -1
  43. snowflake/ml/modeling/_internal/model_trainer.py +2 -2
  44. snowflake/ml/modeling/_internal/snowpark_implementations/distributed_hpo_trainer.py +547 -1
  45. snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_handlers.py +67 -114
  46. snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_trainer.py +9 -9
  47. snowflake/ml/modeling/_internal/transformer_protocols.py +2 -3
  48. snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +33 -61
  49. snowflake/ml/modeling/cluster/affinity_propagation.py +33 -61
  50. snowflake/ml/modeling/cluster/agglomerative_clustering.py +33 -61
  51. snowflake/ml/modeling/cluster/birch.py +33 -61
  52. snowflake/ml/modeling/cluster/bisecting_k_means.py +33 -61
  53. snowflake/ml/modeling/cluster/dbscan.py +33 -61
  54. snowflake/ml/modeling/cluster/feature_agglomeration.py +33 -61
  55. snowflake/ml/modeling/cluster/k_means.py +33 -61
  56. snowflake/ml/modeling/cluster/mean_shift.py +33 -61
  57. snowflake/ml/modeling/cluster/mini_batch_k_means.py +33 -61
  58. snowflake/ml/modeling/cluster/optics.py +33 -61
  59. snowflake/ml/modeling/cluster/spectral_biclustering.py +33 -61
  60. snowflake/ml/modeling/cluster/spectral_clustering.py +33 -61
  61. snowflake/ml/modeling/cluster/spectral_coclustering.py +33 -61
  62. snowflake/ml/modeling/compose/column_transformer.py +33 -61
  63. snowflake/ml/modeling/compose/transformed_target_regressor.py +33 -61
  64. snowflake/ml/modeling/covariance/elliptic_envelope.py +33 -61
  65. snowflake/ml/modeling/covariance/empirical_covariance.py +33 -61
  66. snowflake/ml/modeling/covariance/graphical_lasso.py +33 -61
  67. snowflake/ml/modeling/covariance/graphical_lasso_cv.py +33 -61
  68. snowflake/ml/modeling/covariance/ledoit_wolf.py +33 -61
  69. snowflake/ml/modeling/covariance/min_cov_det.py +33 -61
  70. snowflake/ml/modeling/covariance/oas.py +33 -61
  71. snowflake/ml/modeling/covariance/shrunk_covariance.py +33 -61
  72. snowflake/ml/modeling/decomposition/dictionary_learning.py +33 -61
  73. snowflake/ml/modeling/decomposition/factor_analysis.py +33 -61
  74. snowflake/ml/modeling/decomposition/fast_ica.py +33 -61
  75. snowflake/ml/modeling/decomposition/incremental_pca.py +33 -61
  76. snowflake/ml/modeling/decomposition/kernel_pca.py +33 -61
  77. snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +33 -61
  78. snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +33 -61
  79. snowflake/ml/modeling/decomposition/pca.py +33 -61
  80. snowflake/ml/modeling/decomposition/sparse_pca.py +33 -61
  81. snowflake/ml/modeling/decomposition/truncated_svd.py +33 -61
  82. snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +33 -61
  83. snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +33 -61
  84. snowflake/ml/modeling/ensemble/ada_boost_classifier.py +33 -61
  85. snowflake/ml/modeling/ensemble/ada_boost_regressor.py +33 -61
  86. snowflake/ml/modeling/ensemble/bagging_classifier.py +33 -61
  87. snowflake/ml/modeling/ensemble/bagging_regressor.py +33 -61
  88. snowflake/ml/modeling/ensemble/extra_trees_classifier.py +33 -61
  89. snowflake/ml/modeling/ensemble/extra_trees_regressor.py +33 -61
  90. snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +33 -61
  91. snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +33 -61
  92. snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +33 -61
  93. snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +33 -61
  94. snowflake/ml/modeling/ensemble/isolation_forest.py +33 -61
  95. snowflake/ml/modeling/ensemble/random_forest_classifier.py +33 -61
  96. snowflake/ml/modeling/ensemble/random_forest_regressor.py +33 -61
  97. snowflake/ml/modeling/ensemble/stacking_regressor.py +33 -61
  98. snowflake/ml/modeling/ensemble/voting_classifier.py +33 -61
  99. snowflake/ml/modeling/ensemble/voting_regressor.py +33 -61
  100. snowflake/ml/modeling/feature_selection/generic_univariate_select.py +33 -61
  101. snowflake/ml/modeling/feature_selection/select_fdr.py +33 -61
  102. snowflake/ml/modeling/feature_selection/select_fpr.py +33 -61
  103. snowflake/ml/modeling/feature_selection/select_fwe.py +33 -61
  104. snowflake/ml/modeling/feature_selection/select_k_best.py +33 -61
  105. snowflake/ml/modeling/feature_selection/select_percentile.py +33 -61
  106. snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +33 -61
  107. snowflake/ml/modeling/feature_selection/variance_threshold.py +33 -61
  108. snowflake/ml/modeling/framework/base.py +55 -5
  109. snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +33 -61
  110. snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +33 -61
  111. snowflake/ml/modeling/impute/iterative_imputer.py +33 -61
  112. snowflake/ml/modeling/impute/knn_imputer.py +33 -61
  113. snowflake/ml/modeling/impute/missing_indicator.py +33 -61
  114. snowflake/ml/modeling/impute/simple_imputer.py +4 -15
  115. snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +33 -61
  116. snowflake/ml/modeling/kernel_approximation/nystroem.py +33 -61
  117. snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +33 -61
  118. snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +33 -61
  119. snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +33 -61
  120. snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +33 -61
  121. snowflake/ml/modeling/lightgbm/lgbm_classifier.py +36 -63
  122. snowflake/ml/modeling/lightgbm/lgbm_regressor.py +36 -63
  123. snowflake/ml/modeling/linear_model/ard_regression.py +33 -61
  124. snowflake/ml/modeling/linear_model/bayesian_ridge.py +33 -61
  125. snowflake/ml/modeling/linear_model/elastic_net.py +33 -61
  126. snowflake/ml/modeling/linear_model/elastic_net_cv.py +33 -61
  127. snowflake/ml/modeling/linear_model/gamma_regressor.py +33 -61
  128. snowflake/ml/modeling/linear_model/huber_regressor.py +33 -61
  129. snowflake/ml/modeling/linear_model/lars.py +33 -61
  130. snowflake/ml/modeling/linear_model/lars_cv.py +33 -61
  131. snowflake/ml/modeling/linear_model/lasso.py +33 -61
  132. snowflake/ml/modeling/linear_model/lasso_cv.py +33 -61
  133. snowflake/ml/modeling/linear_model/lasso_lars.py +33 -61
  134. snowflake/ml/modeling/linear_model/lasso_lars_cv.py +33 -61
  135. snowflake/ml/modeling/linear_model/lasso_lars_ic.py +33 -61
  136. snowflake/ml/modeling/linear_model/linear_regression.py +33 -61
  137. snowflake/ml/modeling/linear_model/logistic_regression.py +33 -61
  138. snowflake/ml/modeling/linear_model/logistic_regression_cv.py +33 -61
  139. snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +33 -61
  140. snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +33 -61
  141. snowflake/ml/modeling/linear_model/multi_task_lasso.py +33 -61
  142. snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +33 -61
  143. snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +33 -61
  144. snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +33 -61
  145. snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +33 -61
  146. snowflake/ml/modeling/linear_model/perceptron.py +33 -61
  147. snowflake/ml/modeling/linear_model/poisson_regressor.py +33 -61
  148. snowflake/ml/modeling/linear_model/ransac_regressor.py +33 -61
  149. snowflake/ml/modeling/linear_model/ridge.py +33 -61
  150. snowflake/ml/modeling/linear_model/ridge_classifier.py +33 -61
  151. snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +33 -61
  152. snowflake/ml/modeling/linear_model/ridge_cv.py +33 -61
  153. snowflake/ml/modeling/linear_model/sgd_classifier.py +33 -61
  154. snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +33 -61
  155. snowflake/ml/modeling/linear_model/sgd_regressor.py +33 -61
  156. snowflake/ml/modeling/linear_model/theil_sen_regressor.py +33 -61
  157. snowflake/ml/modeling/linear_model/tweedie_regressor.py +33 -61
  158. snowflake/ml/modeling/manifold/isomap.py +33 -61
  159. snowflake/ml/modeling/manifold/mds.py +33 -61
  160. snowflake/ml/modeling/manifold/spectral_embedding.py +33 -61
  161. snowflake/ml/modeling/manifold/tsne.py +33 -61
  162. snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +33 -61
  163. snowflake/ml/modeling/mixture/gaussian_mixture.py +33 -61
  164. snowflake/ml/modeling/model_selection/grid_search_cv.py +39 -57
  165. snowflake/ml/modeling/model_selection/randomized_search_cv.py +26 -57
  166. snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +33 -61
  167. snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +33 -61
  168. snowflake/ml/modeling/multiclass/output_code_classifier.py +33 -61
  169. snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +33 -61
  170. snowflake/ml/modeling/naive_bayes/categorical_nb.py +33 -61
  171. snowflake/ml/modeling/naive_bayes/complement_nb.py +33 -61
  172. snowflake/ml/modeling/naive_bayes/gaussian_nb.py +33 -61
  173. snowflake/ml/modeling/naive_bayes/multinomial_nb.py +33 -61
  174. snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +33 -61
  175. snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +33 -61
  176. snowflake/ml/modeling/neighbors/kernel_density.py +33 -61
  177. snowflake/ml/modeling/neighbors/local_outlier_factor.py +33 -61
  178. snowflake/ml/modeling/neighbors/nearest_centroid.py +33 -61
  179. snowflake/ml/modeling/neighbors/nearest_neighbors.py +33 -61
  180. snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +33 -61
  181. snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +33 -61
  182. snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +33 -61
  183. snowflake/ml/modeling/neural_network/bernoulli_rbm.py +33 -61
  184. snowflake/ml/modeling/neural_network/mlp_classifier.py +33 -61
  185. snowflake/ml/modeling/neural_network/mlp_regressor.py +33 -61
  186. snowflake/ml/modeling/preprocessing/polynomial_features.py +33 -61
  187. snowflake/ml/modeling/semi_supervised/label_propagation.py +33 -61
  188. snowflake/ml/modeling/semi_supervised/label_spreading.py +33 -61
  189. snowflake/ml/modeling/svm/linear_svc.py +33 -61
  190. snowflake/ml/modeling/svm/linear_svr.py +33 -61
  191. snowflake/ml/modeling/svm/nu_svc.py +33 -61
  192. snowflake/ml/modeling/svm/nu_svr.py +33 -61
  193. snowflake/ml/modeling/svm/svc.py +33 -61
  194. snowflake/ml/modeling/svm/svr.py +33 -61
  195. snowflake/ml/modeling/tree/decision_tree_classifier.py +33 -61
  196. snowflake/ml/modeling/tree/decision_tree_regressor.py +33 -61
  197. snowflake/ml/modeling/tree/extra_tree_classifier.py +33 -61
  198. snowflake/ml/modeling/tree/extra_tree_regressor.py +33 -61
  199. snowflake/ml/modeling/xgboost/xgb_classifier.py +33 -61
  200. snowflake/ml/modeling/xgboost/xgb_regressor.py +33 -61
  201. snowflake/ml/modeling/xgboost/xgbrf_classifier.py +33 -61
  202. snowflake/ml/modeling/xgboost/xgbrf_regressor.py +33 -61
  203. snowflake/ml/registry/_manager/model_manager.py +6 -2
  204. snowflake/ml/registry/model_registry.py +100 -27
  205. snowflake/ml/registry/registry.py +6 -2
  206. snowflake/ml/version.py +1 -1
  207. {snowflake_ml_python-1.3.0.dist-info → snowflake_ml_python-1.4.0.dist-info}/METADATA +43 -7
  208. {snowflake_ml_python-1.3.0.dist-info → snowflake_ml_python-1.4.0.dist-info}/RECORD +211 -206
  209. {snowflake_ml_python-1.3.0.dist-info → snowflake_ml_python-1.4.0.dist-info}/LICENSE.txt +0 -0
  210. {snowflake_ml_python-1.3.0.dist-info → snowflake_ml_python-1.4.0.dist-info}/WHEEL +0 -0
  211. {snowflake_ml_python-1.3.0.dist-info → snowflake_ml_python-1.4.0.dist-info}/top_level.txt +0 -0
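
Most of the churn in this release is mechanical: every generated estimator wrapper under snowflake/ml/modeling/ receives the same edit, which is why the +33 -61 counts repeat across roughly 200 files. The hunks below, taken from ensemble/extra_trees_classifier.py, ensemble/extra_trees_regressor.py, and ensemble/gradient_boosting_classifier.py, are representative: the per-estimator _get_pass_through_columns helper is deleted and batch-inference calls forward the boolean drop_input_cols flag instead of a precomputed column list, _get_output_column_names is dropped from the generated classes, and score() now validates its dependencies against the Snowflake Anaconda channel. A minimal sketch of why the old helper and the new flag carry the same information (only the helper body is verbatim from the diff; the surrounding names are illustrative):

    from typing import List, Sequence

    def pass_through_columns(dataset_columns: Sequence[str], output_cols: Sequence[str], drop_input_cols: bool) -> List[str]:
        # Verbatim logic of the deleted _get_pass_through_columns helper.
        if drop_input_cols:
            return []
        return list(set(dataset_columns) - set(output_cols))

    # 1.3.0 wrappers shipped the derived column list in transform_kwargs:
    cols_1_3 = pass_through_columns(["A", "B", "OUTPUT"], ["OUTPUT"], drop_input_cols=False)
    assert sorted(cols_1_3) == ["A", "B"]

    # 1.4.0 wrappers ship only the flag (transform_kwargs["drop_input_cols"]) and
    # leave the column derivation to the shared handlers, which already hold the
    # dataset and the output column names.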
@@ -440,18 +440,24 @@ class ExtraTreesClassifier(BaseTransformer):
          self._get_model_signatures(dataset)
          return self
 
-    def _get_pass_through_columns(self, dataset: DataFrame) -> List[str]:
-        if self._drop_input_cols:
-            return []
-        else:
-            return list(set(dataset.columns) - set(self.output_cols))
-
     def _batch_inference_validate_snowpark(
         self,
         dataset: DataFrame,
         inference_method: str,
     ) -> List[str]:
-        """Util method to run validate that batch inference can be run on a snowpark dataframe.
+        """Util method to run validate that batch inference can be run on a snowpark dataframe and
+        return the available package that exists in the snowflake anaconda channel
+
+        Args:
+            dataset: snowpark dataframe
+            inference_method: the inference method such as predict, score...
+
+        Raises:
+            SnowflakeMLException: If the estimator is not fitted, raise error
+            SnowflakeMLException: If the session is None, raise error
+
+        Returns:
+            A list of available package that exists in the snowflake anaconda channel
         """
         if not self._is_fitted:
             raise exceptions.SnowflakeMLException(
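
The docstring expansion above tracks a behavioral change: _batch_inference_validate_snowpark now returns the list of packages it resolved against the Snowflake Anaconda channel, so callers can reuse that validated list as the dependency set for the inference stored procedure. Only the score() hunk further down shows the assignment explicitly; the predict-path hunks imply the same pattern through their use of self._deps. A hedged sketch of the calling pattern, condensed from these hunks rather than a complete method:

    # Inside a generated estimator wrapper, on the Snowpark DataFrame path:
    if isinstance(dataset, DataFrame):
        # Raises SnowflakeMLException if the estimator is unfitted or the
        # session is None; returns the channel-validated packages.
        self._deps = self._batch_inference_validate_snowpark(
            dataset=dataset,
            inference_method="predict",
        )
        transform_kwargs = dict(
            session=dataset._session,
            dependencies=self._deps,  # validated packages
            drop_input_cols=self._drop_input_cols,
            expected_output_cols_type=expected_type_inferred,
        )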
@@ -525,7 +531,7 @@ class ExtraTreesClassifier(BaseTransformer):
             transform_kwargs = dict(
                 session = dataset._session,
                 dependencies = self._deps,
-                pass_through_cols = self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type = expected_type_inferred,
             )
 
@@ -585,16 +591,16 @@ class ExtraTreesClassifier(BaseTransformer):
             # from (n_samples, n_estimators) to (n_samples, n_estimators * n_classes) (and everything in between)
             # based on init param values. We will convert that to pandas dataframe of shape (n_samples, 1) with
             # each row containing a list of values.
-            expected_dtype = "ARRAY"
+            expected_dtype = "array"
 
         # If we were unable to assign a type to this transform in the factory, infer the type here.
         if expected_dtype == "":
-            # If this is a clustering transformer, if the number of output columns does not equal the number of clusters the response will be an "ARRAY"
+            # If this is a clustering transformer, if the number of output columns does not equal the number of clusters the response will be an "array"
             if hasattr(self._sklearn_object, "n_clusters") and getattr(self._sklearn_object, "n_clusters") != len(self.output_cols):
-                expected_dtype = "ARRAY"
-            # If this is a decomposition transformer, if the number of output columns does not equal the number of components the response will be an "ARRAY"
+                expected_dtype = "array"
+            # If this is a decomposition transformer, if the number of output columns does not equal the number of components the response will be an "array"
             elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
-                expected_dtype = "ARRAY"
+                expected_dtype = "array"
             else:
                 output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
                 # We can only infer the output types from the input types if the following two statemetns are true:
@@ -612,7 +618,7 @@ class ExtraTreesClassifier(BaseTransformer):
             transform_kwargs = dict(
                 session = dataset._session,
                 dependencies = self._deps,
-                pass_through_cols = self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type = expected_dtype,
             )
 
@@ -663,7 +669,7 @@ class ExtraTreesClassifier(BaseTransformer):
             subproject=_SUBPROJECT,
         )
         output_result, fitted_estimator = model_trainer.train_fit_predict(
-            pass_through_columns=self._get_pass_through_columns(dataset),
+            drop_input_cols=self._drop_input_cols,
             expected_output_cols_list=self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix),
         )
         self._sklearn_object = fitted_estimator
@@ -681,44 +687,6 @@ class ExtraTreesClassifier(BaseTransformer):
         assert self._sklearn_object is not None
         return self._sklearn_object.embedding_
 
-
-    def _get_output_column_names(self, output_cols_prefix: str, output_cols: Optional[List[str]] = None) -> List[str]:
-        """ Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
-        Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
-        """
-        output_cols_prefix = identifier.resolve_identifier(output_cols_prefix)
-        if output_cols:
-            output_cols = [
-                identifier.concat_names([output_cols_prefix, identifier.resolve_identifier(c)])
-                for c in output_cols
-            ]
-        elif getattr(self._sklearn_object, "classes_", None) is None:
-            output_cols = [output_cols_prefix]
-        elif self._sklearn_object is not None:
-            classes = self._sklearn_object.classes_
-            if isinstance(classes, numpy.ndarray):
-                output_cols = [f'{output_cols_prefix}{str(c)}' for c in classes.tolist()]
-            elif isinstance(classes, list) and len(classes) > 0 and isinstance(classes[0], numpy.ndarray):
-                # If the estimator is a multioutput estimator, classes_ will be a list of ndarrays.
-                output_cols = []
-                for i, cl in enumerate(classes):
-                    # For binary classification, there is only one output column for each class
-                    # ndarray as the two classes are complementary.
-                    if len(cl) == 2:
-                        output_cols.append(f'{output_cols_prefix}{i}_{cl[0]}')
-                    else:
-                        output_cols.extend([
-                            f'{output_cols_prefix}{i}_{c}' for c in cl.tolist()
-                        ])
-        else:
-            output_cols = []
-
-        # Make sure column names are valid snowflake identifiers.
-        assert output_cols is not None # Make MyPy happy
-        rv = [identifier.rename_to_valid_snowflake_identifier(c) for c in output_cols]
-
-        return rv
-
     @available_if(original_estimator_has_callable("predict_proba")) # type: ignore[misc]
     @telemetry.send_api_usage_telemetry(
         project=_PROJECT,
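
The _get_output_column_names helper removed above is not necessarily gone from the package; the changes to snowflake/ml/modeling/framework/base.py in the file list suggest it was hoisted into shared code, though the new location is outside this excerpt. For reference, its observable behavior on the common single-output classifier case, condensed from the deleted lines (identifier resolution and the multioutput branch omitted):

    import numpy

    def output_column_names(prefix: str, classes_) -> list:
        # One column per class for a classifier; just the prefix when the
        # estimator exposes no classes_ attribute.
        if classes_ is None:
            return [prefix]
        if isinstance(classes_, numpy.ndarray):
            return [f"{prefix}{c}" for c in classes_.tolist()]
        return []

    # A binary classifier with classes_ == array([0, 1]) yields one
    # probability column per class:
    assert output_column_names("PREDICT_PROBA_", numpy.array([0, 1])) == [
        "PREDICT_PROBA_0",
        "PREDICT_PROBA_1",
    ]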
@@ -760,7 +728,7 @@ class ExtraTreesClassifier(BaseTransformer):
             transform_kwargs = dict(
                 session=dataset._session,
                 dependencies=self._deps,
-                pass_through_cols=self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type="float",
             )
 
@@ -827,7 +795,7 @@ class ExtraTreesClassifier(BaseTransformer):
             transform_kwargs = dict(
                 session=dataset._session,
                 dependencies=self._deps,
-                pass_through_cols=self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type="float",
             )
         elif isinstance(dataset, pd.DataFrame):
@@ -888,7 +856,7 @@ class ExtraTreesClassifier(BaseTransformer):
             transform_kwargs = dict(
                 session=dataset._session,
                 dependencies=self._deps,
-                pass_through_cols=self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type="float",
             )
 
@@ -953,7 +921,7 @@ class ExtraTreesClassifier(BaseTransformer):
             transform_kwargs = dict(
                 session=dataset._session,
                 dependencies=self._deps,
-                pass_through_cols=self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type="float",
             )
 
@@ -1009,13 +977,17 @@ class ExtraTreesClassifier(BaseTransformer):
         transform_kwargs: ScoreKwargsTypedDict = dict()
 
         if isinstance(dataset, DataFrame):
+            self._deps = self._batch_inference_validate_snowpark(
+                dataset=dataset,
+                inference_method="score",
+            )
             selected_cols = self._get_active_columns()
             if len(selected_cols) > 0:
                 dataset = dataset.select(selected_cols)
             assert isinstance(dataset._session, Session) # keep mypy happy
             transform_kwargs = dict(
                 session=dataset._session,
-                dependencies=["snowflake-snowpark-python"] + self._get_dependencies(),
+                dependencies=["snowflake-snowpark-python"] + self._deps,
                 score_sproc_imports=['sklearn'],
             )
         elif isinstance(dataset, pd.DataFrame):
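
The score() hunk above changes where the sproc dependency list comes from: 1.3.0 forwarded self._get_dependencies() verbatim, while 1.4.0 forwards self._deps, the list already validated against the Snowflake Anaconda channel. A toy illustration of the difference; validated_deps is a hypothetical stand-in for the resolution done inside _batch_inference_validate_snowpark, not a library function:

    from typing import List

    def validated_deps(declared: List[str], channel: set) -> List[str]:
        # Hypothetical stand-in: keep only requirements whose package name
        # resolves in the channel.
        return [d for d in declared if d.split("==")[0] in channel]

    declared = ["scikit-learn==1.3.0", "private-pkg==0.1"]
    channel = {"scikit-learn", "xgboost"}

    deps_1_3 = ["snowflake-snowpark-python"] + declared  # forwarded as-is
    deps_1_4 = ["snowflake-snowpark-python"] + validated_deps(declared, channel)
    assert deps_1_4 == ["snowflake-snowpark-python", "scikit-learn==1.3.0"]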
@@ -1089,9 +1061,9 @@ class ExtraTreesClassifier(BaseTransformer):
             transform_kwargs = dict(
                 session = dataset._session,
                 dependencies = self._deps,
-                pass_through_cols = self._get_pass_through_columns(dataset),
-                expected_output_cols_type = "array",
-                n_neighbors = n_neighbors,
+                drop_input_cols = self._drop_input_cols,
+                expected_output_cols_type="array",
+                n_neighbors = n_neighbors,
                 return_distance = return_distance
             )
         elif isinstance(dataset, pd.DataFrame):
@@ -419,18 +419,24 @@ class ExtraTreesRegressor(BaseTransformer):
          self._get_model_signatures(dataset)
          return self
 
-    def _get_pass_through_columns(self, dataset: DataFrame) -> List[str]:
-        if self._drop_input_cols:
-            return []
-        else:
-            return list(set(dataset.columns) - set(self.output_cols))
-
     def _batch_inference_validate_snowpark(
         self,
         dataset: DataFrame,
         inference_method: str,
     ) -> List[str]:
-        """Util method to run validate that batch inference can be run on a snowpark dataframe.
+        """Util method to run validate that batch inference can be run on a snowpark dataframe and
+        return the available package that exists in the snowflake anaconda channel
+
+        Args:
+            dataset: snowpark dataframe
+            inference_method: the inference method such as predict, score...
+
+        Raises:
+            SnowflakeMLException: If the estimator is not fitted, raise error
+            SnowflakeMLException: If the session is None, raise error
+
+        Returns:
+            A list of available package that exists in the snowflake anaconda channel
         """
         if not self._is_fitted:
             raise exceptions.SnowflakeMLException(
@@ -504,7 +510,7 @@ class ExtraTreesRegressor(BaseTransformer):
             transform_kwargs = dict(
                 session = dataset._session,
                 dependencies = self._deps,
-                pass_through_cols = self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type = expected_type_inferred,
             )
 
@@ -564,16 +570,16 @@ class ExtraTreesRegressor(BaseTransformer):
             # from (n_samples, n_estimators) to (n_samples, n_estimators * n_classes) (and everything in between)
             # based on init param values. We will convert that to pandas dataframe of shape (n_samples, 1) with
             # each row containing a list of values.
-            expected_dtype = "ARRAY"
+            expected_dtype = "array"
 
         # If we were unable to assign a type to this transform in the factory, infer the type here.
         if expected_dtype == "":
-            # If this is a clustering transformer, if the number of output columns does not equal the number of clusters the response will be an "ARRAY"
+            # If this is a clustering transformer, if the number of output columns does not equal the number of clusters the response will be an "array"
             if hasattr(self._sklearn_object, "n_clusters") and getattr(self._sklearn_object, "n_clusters") != len(self.output_cols):
-                expected_dtype = "ARRAY"
-            # If this is a decomposition transformer, if the number of output columns does not equal the number of components the response will be an "ARRAY"
+                expected_dtype = "array"
+            # If this is a decomposition transformer, if the number of output columns does not equal the number of components the response will be an "array"
             elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
-                expected_dtype = "ARRAY"
+                expected_dtype = "array"
             else:
                 output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
                 # We can only infer the output types from the input types if the following two statemetns are true:
@@ -591,7 +597,7 @@ class ExtraTreesRegressor(BaseTransformer):
             transform_kwargs = dict(
                 session = dataset._session,
                 dependencies = self._deps,
-                pass_through_cols = self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type = expected_dtype,
             )
 
@@ -642,7 +648,7 @@ class ExtraTreesRegressor(BaseTransformer):
             subproject=_SUBPROJECT,
         )
         output_result, fitted_estimator = model_trainer.train_fit_predict(
-            pass_through_columns=self._get_pass_through_columns(dataset),
+            drop_input_cols=self._drop_input_cols,
             expected_output_cols_list=self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix),
         )
         self._sklearn_object = fitted_estimator
@@ -660,44 +666,6 @@ class ExtraTreesRegressor(BaseTransformer):
         assert self._sklearn_object is not None
         return self._sklearn_object.embedding_
 
-
-    def _get_output_column_names(self, output_cols_prefix: str, output_cols: Optional[List[str]] = None) -> List[str]:
-        """ Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
-        Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
-        """
-        output_cols_prefix = identifier.resolve_identifier(output_cols_prefix)
-        if output_cols:
-            output_cols = [
-                identifier.concat_names([output_cols_prefix, identifier.resolve_identifier(c)])
-                for c in output_cols
-            ]
-        elif getattr(self._sklearn_object, "classes_", None) is None:
-            output_cols = [output_cols_prefix]
-        elif self._sklearn_object is not None:
-            classes = self._sklearn_object.classes_
-            if isinstance(classes, numpy.ndarray):
-                output_cols = [f'{output_cols_prefix}{str(c)}' for c in classes.tolist()]
-            elif isinstance(classes, list) and len(classes) > 0 and isinstance(classes[0], numpy.ndarray):
-                # If the estimator is a multioutput estimator, classes_ will be a list of ndarrays.
-                output_cols = []
-                for i, cl in enumerate(classes):
-                    # For binary classification, there is only one output column for each class
-                    # ndarray as the two classes are complementary.
-                    if len(cl) == 2:
-                        output_cols.append(f'{output_cols_prefix}{i}_{cl[0]}')
-                    else:
-                        output_cols.extend([
-                            f'{output_cols_prefix}{i}_{c}' for c in cl.tolist()
-                        ])
-        else:
-            output_cols = []
-
-        # Make sure column names are valid snowflake identifiers.
-        assert output_cols is not None # Make MyPy happy
-        rv = [identifier.rename_to_valid_snowflake_identifier(c) for c in output_cols]
-
-        return rv
-
     @available_if(original_estimator_has_callable("predict_proba")) # type: ignore[misc]
     @telemetry.send_api_usage_telemetry(
         project=_PROJECT,
@@ -737,7 +705,7 @@ class ExtraTreesRegressor(BaseTransformer):
             transform_kwargs = dict(
                 session=dataset._session,
                 dependencies=self._deps,
-                pass_through_cols=self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type="float",
             )
 
@@ -802,7 +770,7 @@ class ExtraTreesRegressor(BaseTransformer):
             transform_kwargs = dict(
                 session=dataset._session,
                 dependencies=self._deps,
-                pass_through_cols=self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type="float",
             )
         elif isinstance(dataset, pd.DataFrame):
@@ -863,7 +831,7 @@ class ExtraTreesRegressor(BaseTransformer):
             transform_kwargs = dict(
                 session=dataset._session,
                 dependencies=self._deps,
-                pass_through_cols=self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type="float",
             )
 
@@ -928,7 +896,7 @@ class ExtraTreesRegressor(BaseTransformer):
             transform_kwargs = dict(
                 session=dataset._session,
                 dependencies=self._deps,
-                pass_through_cols=self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type="float",
             )
 
@@ -984,13 +952,17 @@ class ExtraTreesRegressor(BaseTransformer):
         transform_kwargs: ScoreKwargsTypedDict = dict()
 
         if isinstance(dataset, DataFrame):
+            self._deps = self._batch_inference_validate_snowpark(
+                dataset=dataset,
+                inference_method="score",
+            )
             selected_cols = self._get_active_columns()
             if len(selected_cols) > 0:
                 dataset = dataset.select(selected_cols)
             assert isinstance(dataset._session, Session) # keep mypy happy
             transform_kwargs = dict(
                 session=dataset._session,
-                dependencies=["snowflake-snowpark-python"] + self._get_dependencies(),
+                dependencies=["snowflake-snowpark-python"] + self._deps,
                 score_sproc_imports=['sklearn'],
             )
         elif isinstance(dataset, pd.DataFrame):
@@ -1064,9 +1036,9 @@ class ExtraTreesRegressor(BaseTransformer):
             transform_kwargs = dict(
                 session = dataset._session,
                 dependencies = self._deps,
-                pass_through_cols = self._get_pass_through_columns(dataset),
-                expected_output_cols_type = "array",
-                n_neighbors = n_neighbors,
+                drop_input_cols = self._drop_input_cols,
+                expected_output_cols_type="array",
+                n_neighbors = n_neighbors,
                 return_distance = return_distance
             )
         elif isinstance(dataset, pd.DataFrame):
@@ -452,18 +452,24 @@ class GradientBoostingClassifier(BaseTransformer):
          self._get_model_signatures(dataset)
          return self
 
-    def _get_pass_through_columns(self, dataset: DataFrame) -> List[str]:
-        if self._drop_input_cols:
-            return []
-        else:
-            return list(set(dataset.columns) - set(self.output_cols))
-
     def _batch_inference_validate_snowpark(
         self,
         dataset: DataFrame,
         inference_method: str,
     ) -> List[str]:
-        """Util method to run validate that batch inference can be run on a snowpark dataframe.
+        """Util method to run validate that batch inference can be run on a snowpark dataframe and
+        return the available package that exists in the snowflake anaconda channel
+
+        Args:
+            dataset: snowpark dataframe
+            inference_method: the inference method such as predict, score...
+
+        Raises:
+            SnowflakeMLException: If the estimator is not fitted, raise error
+            SnowflakeMLException: If the session is None, raise error
+
+        Returns:
+            A list of available package that exists in the snowflake anaconda channel
         """
         if not self._is_fitted:
             raise exceptions.SnowflakeMLException(
@@ -537,7 +543,7 @@ class GradientBoostingClassifier(BaseTransformer):
             transform_kwargs = dict(
                 session = dataset._session,
                 dependencies = self._deps,
-                pass_through_cols = self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type = expected_type_inferred,
             )
 
@@ -597,16 +603,16 @@ class GradientBoostingClassifier(BaseTransformer):
             # from (n_samples, n_estimators) to (n_samples, n_estimators * n_classes) (and everything in between)
             # based on init param values. We will convert that to pandas dataframe of shape (n_samples, 1) with
             # each row containing a list of values.
-            expected_dtype = "ARRAY"
+            expected_dtype = "array"
 
         # If we were unable to assign a type to this transform in the factory, infer the type here.
         if expected_dtype == "":
-            # If this is a clustering transformer, if the number of output columns does not equal the number of clusters the response will be an "ARRAY"
+            # If this is a clustering transformer, if the number of output columns does not equal the number of clusters the response will be an "array"
             if hasattr(self._sklearn_object, "n_clusters") and getattr(self._sklearn_object, "n_clusters") != len(self.output_cols):
-                expected_dtype = "ARRAY"
-            # If this is a decomposition transformer, if the number of output columns does not equal the number of components the response will be an "ARRAY"
+                expected_dtype = "array"
+            # If this is a decomposition transformer, if the number of output columns does not equal the number of components the response will be an "array"
             elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
-                expected_dtype = "ARRAY"
+                expected_dtype = "array"
             else:
                 output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
                 # We can only infer the output types from the input types if the following two statemetns are true:
@@ -624,7 +630,7 @@ class GradientBoostingClassifier(BaseTransformer):
             transform_kwargs = dict(
                 session = dataset._session,
                 dependencies = self._deps,
-                pass_through_cols = self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type = expected_dtype,
             )
 
@@ -675,7 +681,7 @@ class GradientBoostingClassifier(BaseTransformer):
             subproject=_SUBPROJECT,
         )
         output_result, fitted_estimator = model_trainer.train_fit_predict(
-            pass_through_columns=self._get_pass_through_columns(dataset),
+            drop_input_cols=self._drop_input_cols,
             expected_output_cols_list=self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix),
         )
         self._sklearn_object = fitted_estimator
@@ -693,44 +699,6 @@ class GradientBoostingClassifier(BaseTransformer):
         assert self._sklearn_object is not None
         return self._sklearn_object.embedding_
 
-
-    def _get_output_column_names(self, output_cols_prefix: str, output_cols: Optional[List[str]] = None) -> List[str]:
-        """ Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
-        Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
-        """
-        output_cols_prefix = identifier.resolve_identifier(output_cols_prefix)
-        if output_cols:
-            output_cols = [
-                identifier.concat_names([output_cols_prefix, identifier.resolve_identifier(c)])
-                for c in output_cols
-            ]
-        elif getattr(self._sklearn_object, "classes_", None) is None:
-            output_cols = [output_cols_prefix]
-        elif self._sklearn_object is not None:
-            classes = self._sklearn_object.classes_
-            if isinstance(classes, numpy.ndarray):
-                output_cols = [f'{output_cols_prefix}{str(c)}' for c in classes.tolist()]
-            elif isinstance(classes, list) and len(classes) > 0 and isinstance(classes[0], numpy.ndarray):
-                # If the estimator is a multioutput estimator, classes_ will be a list of ndarrays.
-                output_cols = []
-                for i, cl in enumerate(classes):
-                    # For binary classification, there is only one output column for each class
-                    # ndarray as the two classes are complementary.
-                    if len(cl) == 2:
-                        output_cols.append(f'{output_cols_prefix}{i}_{cl[0]}')
-                    else:
-                        output_cols.extend([
-                            f'{output_cols_prefix}{i}_{c}' for c in cl.tolist()
-                        ])
-        else:
-            output_cols = []
-
-        # Make sure column names are valid snowflake identifiers.
-        assert output_cols is not None # Make MyPy happy
-        rv = [identifier.rename_to_valid_snowflake_identifier(c) for c in output_cols]
-
-        return rv
-
     @available_if(original_estimator_has_callable("predict_proba")) # type: ignore[misc]
     @telemetry.send_api_usage_telemetry(
         project=_PROJECT,
@@ -772,7 +740,7 @@ class GradientBoostingClassifier(BaseTransformer):
             transform_kwargs = dict(
                 session=dataset._session,
                 dependencies=self._deps,
-                pass_through_cols=self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type="float",
             )
 
@@ -839,7 +807,7 @@ class GradientBoostingClassifier(BaseTransformer):
             transform_kwargs = dict(
                 session=dataset._session,
                 dependencies=self._deps,
-                pass_through_cols=self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type="float",
             )
         elif isinstance(dataset, pd.DataFrame):
@@ -902,7 +870,7 @@ class GradientBoostingClassifier(BaseTransformer):
             transform_kwargs = dict(
                 session=dataset._session,
                 dependencies=self._deps,
-                pass_through_cols=self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type="float",
             )
 
@@ -967,7 +935,7 @@ class GradientBoostingClassifier(BaseTransformer):
             transform_kwargs = dict(
                 session=dataset._session,
                 dependencies=self._deps,
-                pass_through_cols=self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type="float",
             )
 
@@ -1023,13 +991,17 @@ class GradientBoostingClassifier(BaseTransformer):
         transform_kwargs: ScoreKwargsTypedDict = dict()
 
         if isinstance(dataset, DataFrame):
+            self._deps = self._batch_inference_validate_snowpark(
+                dataset=dataset,
+                inference_method="score",
+            )
             selected_cols = self._get_active_columns()
             if len(selected_cols) > 0:
                 dataset = dataset.select(selected_cols)
             assert isinstance(dataset._session, Session) # keep mypy happy
             transform_kwargs = dict(
                 session=dataset._session,
-                dependencies=["snowflake-snowpark-python"] + self._get_dependencies(),
+                dependencies=["snowflake-snowpark-python"] + self._deps,
                 score_sproc_imports=['sklearn'],
             )
         elif isinstance(dataset, pd.DataFrame):
@@ -1103,9 +1075,9 @@ class GradientBoostingClassifier(BaseTransformer):
             transform_kwargs = dict(
                 session = dataset._session,
                 dependencies = self._deps,
-                pass_through_cols = self._get_pass_through_columns(dataset),
-                expected_output_cols_type = "array",
-                n_neighbors = n_neighbors,
+                drop_input_cols = self._drop_input_cols,
+                expected_output_cols_type="array",
+                n_neighbors = n_neighbors,
                 return_distance = return_distance
             )
         elif isinstance(dataset, pd.DataFrame):