snowflake-ml-python 1.3.0__py3-none-any.whl → 1.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (211)
  1. snowflake/ml/_internal/file_utils.py +3 -3
  2. snowflake/ml/_internal/human_readable_id/adjectives.txt +128 -0
  3. snowflake/ml/_internal/human_readable_id/animals.txt +128 -0
  4. snowflake/ml/_internal/human_readable_id/hrid_generator.py +40 -0
  5. snowflake/ml/_internal/human_readable_id/hrid_generator_base.py +135 -0
  6. snowflake/ml/_internal/telemetry.py +11 -2
  7. snowflake/ml/_internal/utils/formatting.py +1 -1
  8. snowflake/ml/feature_store/feature_store.py +15 -106
  9. snowflake/ml/fileset/sfcfs.py +4 -3
  10. snowflake/ml/fileset/stage_fs.py +18 -0
  11. snowflake/ml/model/_api.py +9 -9
  12. snowflake/ml/model/_client/model/model_version_impl.py +20 -15
  13. snowflake/ml/model/_deploy_client/image_builds/docker_context.py +3 -9
  14. snowflake/ml/model/_deploy_client/image_builds/server_image_builder.py +3 -5
  15. snowflake/ml/model/_deploy_client/snowservice/deploy.py +7 -6
  16. snowflake/ml/model/_model_composer/model_composer.py +10 -8
  17. snowflake/ml/model/_model_composer/model_method/function_generator.py +1 -1
  18. snowflake/ml/model/_model_composer/model_method/infer_table_function.py_template +2 -1
  19. snowflake/ml/model/_model_composer/model_method/model_method.py +2 -2
  20. snowflake/ml/model/_model_composer/model_runtime/_runtime_requirements.py +1 -1
  21. snowflake/ml/model/_packager/model_handlers/_base.py +2 -2
  22. snowflake/ml/model/_packager/model_handlers/_utils.py +5 -5
  23. snowflake/ml/model/_packager/model_handlers/custom.py +7 -7
  24. snowflake/ml/model/_packager/model_handlers/huggingface_pipeline.py +2 -2
  25. snowflake/ml/model/_packager/model_handlers/llm.py +1 -1
  26. snowflake/ml/model/_packager/model_handlers/mlflow.py +1 -1
  27. snowflake/ml/model/_packager/model_handlers/pytorch.py +13 -10
  28. snowflake/ml/model/_packager/model_handlers/sentence_transformers.py +214 -0
  29. snowflake/ml/model/_packager/model_handlers/sklearn.py +6 -6
  30. snowflake/ml/model/_packager/model_handlers/snowmlmodel.py +15 -3
  31. snowflake/ml/model/_packager/model_handlers/tensorflow.py +8 -8
  32. snowflake/ml/model/_packager/model_handlers/torchscript.py +7 -7
  33. snowflake/ml/model/_packager/model_handlers/xgboost.py +8 -8
  34. snowflake/ml/model/_packager/model_meta/_core_requirements.py +1 -1
  35. snowflake/ml/model/_packager/model_packager.py +8 -6
  36. snowflake/ml/model/custom_model.py +3 -1
  37. snowflake/ml/model/type_hints.py +13 -0
  38. snowflake/ml/modeling/_internal/estimator_utils.py +61 -1
  39. snowflake/ml/modeling/_internal/local_implementations/pandas_handlers.py +4 -43
  40. snowflake/ml/modeling/_internal/local_implementations/pandas_trainer.py +4 -4
  41. snowflake/ml/modeling/_internal/ml_runtime_implementations/ml_runtime_handlers.py +21 -17
  42. snowflake/ml/modeling/_internal/model_specifications.py +3 -1
  43. snowflake/ml/modeling/_internal/model_trainer.py +2 -2
  44. snowflake/ml/modeling/_internal/snowpark_implementations/distributed_hpo_trainer.py +547 -1
  45. snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_handlers.py +67 -114
  46. snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_trainer.py +9 -9
  47. snowflake/ml/modeling/_internal/transformer_protocols.py +2 -3
  48. snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +33 -61
  49. snowflake/ml/modeling/cluster/affinity_propagation.py +33 -61
  50. snowflake/ml/modeling/cluster/agglomerative_clustering.py +33 -61
  51. snowflake/ml/modeling/cluster/birch.py +33 -61
  52. snowflake/ml/modeling/cluster/bisecting_k_means.py +33 -61
  53. snowflake/ml/modeling/cluster/dbscan.py +33 -61
  54. snowflake/ml/modeling/cluster/feature_agglomeration.py +33 -61
  55. snowflake/ml/modeling/cluster/k_means.py +33 -61
  56. snowflake/ml/modeling/cluster/mean_shift.py +33 -61
  57. snowflake/ml/modeling/cluster/mini_batch_k_means.py +33 -61
  58. snowflake/ml/modeling/cluster/optics.py +33 -61
  59. snowflake/ml/modeling/cluster/spectral_biclustering.py +33 -61
  60. snowflake/ml/modeling/cluster/spectral_clustering.py +33 -61
  61. snowflake/ml/modeling/cluster/spectral_coclustering.py +33 -61
  62. snowflake/ml/modeling/compose/column_transformer.py +33 -61
  63. snowflake/ml/modeling/compose/transformed_target_regressor.py +33 -61
  64. snowflake/ml/modeling/covariance/elliptic_envelope.py +33 -61
  65. snowflake/ml/modeling/covariance/empirical_covariance.py +33 -61
  66. snowflake/ml/modeling/covariance/graphical_lasso.py +33 -61
  67. snowflake/ml/modeling/covariance/graphical_lasso_cv.py +33 -61
  68. snowflake/ml/modeling/covariance/ledoit_wolf.py +33 -61
  69. snowflake/ml/modeling/covariance/min_cov_det.py +33 -61
  70. snowflake/ml/modeling/covariance/oas.py +33 -61
  71. snowflake/ml/modeling/covariance/shrunk_covariance.py +33 -61
  72. snowflake/ml/modeling/decomposition/dictionary_learning.py +33 -61
  73. snowflake/ml/modeling/decomposition/factor_analysis.py +33 -61
  74. snowflake/ml/modeling/decomposition/fast_ica.py +33 -61
  75. snowflake/ml/modeling/decomposition/incremental_pca.py +33 -61
  76. snowflake/ml/modeling/decomposition/kernel_pca.py +33 -61
  77. snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +33 -61
  78. snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +33 -61
  79. snowflake/ml/modeling/decomposition/pca.py +33 -61
  80. snowflake/ml/modeling/decomposition/sparse_pca.py +33 -61
  81. snowflake/ml/modeling/decomposition/truncated_svd.py +33 -61
  82. snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +33 -61
  83. snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +33 -61
  84. snowflake/ml/modeling/ensemble/ada_boost_classifier.py +33 -61
  85. snowflake/ml/modeling/ensemble/ada_boost_regressor.py +33 -61
  86. snowflake/ml/modeling/ensemble/bagging_classifier.py +33 -61
  87. snowflake/ml/modeling/ensemble/bagging_regressor.py +33 -61
  88. snowflake/ml/modeling/ensemble/extra_trees_classifier.py +33 -61
  89. snowflake/ml/modeling/ensemble/extra_trees_regressor.py +33 -61
  90. snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +33 -61
  91. snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +33 -61
  92. snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +33 -61
  93. snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +33 -61
  94. snowflake/ml/modeling/ensemble/isolation_forest.py +33 -61
  95. snowflake/ml/modeling/ensemble/random_forest_classifier.py +33 -61
  96. snowflake/ml/modeling/ensemble/random_forest_regressor.py +33 -61
  97. snowflake/ml/modeling/ensemble/stacking_regressor.py +33 -61
  98. snowflake/ml/modeling/ensemble/voting_classifier.py +33 -61
  99. snowflake/ml/modeling/ensemble/voting_regressor.py +33 -61
  100. snowflake/ml/modeling/feature_selection/generic_univariate_select.py +33 -61
  101. snowflake/ml/modeling/feature_selection/select_fdr.py +33 -61
  102. snowflake/ml/modeling/feature_selection/select_fpr.py +33 -61
  103. snowflake/ml/modeling/feature_selection/select_fwe.py +33 -61
  104. snowflake/ml/modeling/feature_selection/select_k_best.py +33 -61
  105. snowflake/ml/modeling/feature_selection/select_percentile.py +33 -61
  106. snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +33 -61
  107. snowflake/ml/modeling/feature_selection/variance_threshold.py +33 -61
  108. snowflake/ml/modeling/framework/base.py +55 -5
  109. snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +33 -61
  110. snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +33 -61
  111. snowflake/ml/modeling/impute/iterative_imputer.py +33 -61
  112. snowflake/ml/modeling/impute/knn_imputer.py +33 -61
  113. snowflake/ml/modeling/impute/missing_indicator.py +33 -61
  114. snowflake/ml/modeling/impute/simple_imputer.py +4 -15
  115. snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +33 -61
  116. snowflake/ml/modeling/kernel_approximation/nystroem.py +33 -61
  117. snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +33 -61
  118. snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +33 -61
  119. snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +33 -61
  120. snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +33 -61
  121. snowflake/ml/modeling/lightgbm/lgbm_classifier.py +36 -63
  122. snowflake/ml/modeling/lightgbm/lgbm_regressor.py +36 -63
  123. snowflake/ml/modeling/linear_model/ard_regression.py +33 -61
  124. snowflake/ml/modeling/linear_model/bayesian_ridge.py +33 -61
  125. snowflake/ml/modeling/linear_model/elastic_net.py +33 -61
  126. snowflake/ml/modeling/linear_model/elastic_net_cv.py +33 -61
  127. snowflake/ml/modeling/linear_model/gamma_regressor.py +33 -61
  128. snowflake/ml/modeling/linear_model/huber_regressor.py +33 -61
  129. snowflake/ml/modeling/linear_model/lars.py +33 -61
  130. snowflake/ml/modeling/linear_model/lars_cv.py +33 -61
  131. snowflake/ml/modeling/linear_model/lasso.py +33 -61
  132. snowflake/ml/modeling/linear_model/lasso_cv.py +33 -61
  133. snowflake/ml/modeling/linear_model/lasso_lars.py +33 -61
  134. snowflake/ml/modeling/linear_model/lasso_lars_cv.py +33 -61
  135. snowflake/ml/modeling/linear_model/lasso_lars_ic.py +33 -61
  136. snowflake/ml/modeling/linear_model/linear_regression.py +33 -61
  137. snowflake/ml/modeling/linear_model/logistic_regression.py +33 -61
  138. snowflake/ml/modeling/linear_model/logistic_regression_cv.py +33 -61
  139. snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +33 -61
  140. snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +33 -61
  141. snowflake/ml/modeling/linear_model/multi_task_lasso.py +33 -61
  142. snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +33 -61
  143. snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +33 -61
  144. snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +33 -61
  145. snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +33 -61
  146. snowflake/ml/modeling/linear_model/perceptron.py +33 -61
  147. snowflake/ml/modeling/linear_model/poisson_regressor.py +33 -61
  148. snowflake/ml/modeling/linear_model/ransac_regressor.py +33 -61
  149. snowflake/ml/modeling/linear_model/ridge.py +33 -61
  150. snowflake/ml/modeling/linear_model/ridge_classifier.py +33 -61
  151. snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +33 -61
  152. snowflake/ml/modeling/linear_model/ridge_cv.py +33 -61
  153. snowflake/ml/modeling/linear_model/sgd_classifier.py +33 -61
  154. snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +33 -61
  155. snowflake/ml/modeling/linear_model/sgd_regressor.py +33 -61
  156. snowflake/ml/modeling/linear_model/theil_sen_regressor.py +33 -61
  157. snowflake/ml/modeling/linear_model/tweedie_regressor.py +33 -61
  158. snowflake/ml/modeling/manifold/isomap.py +33 -61
  159. snowflake/ml/modeling/manifold/mds.py +33 -61
  160. snowflake/ml/modeling/manifold/spectral_embedding.py +33 -61
  161. snowflake/ml/modeling/manifold/tsne.py +33 -61
  162. snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +33 -61
  163. snowflake/ml/modeling/mixture/gaussian_mixture.py +33 -61
  164. snowflake/ml/modeling/model_selection/grid_search_cv.py +39 -57
  165. snowflake/ml/modeling/model_selection/randomized_search_cv.py +26 -57
  166. snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +33 -61
  167. snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +33 -61
  168. snowflake/ml/modeling/multiclass/output_code_classifier.py +33 -61
  169. snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +33 -61
  170. snowflake/ml/modeling/naive_bayes/categorical_nb.py +33 -61
  171. snowflake/ml/modeling/naive_bayes/complement_nb.py +33 -61
  172. snowflake/ml/modeling/naive_bayes/gaussian_nb.py +33 -61
  173. snowflake/ml/modeling/naive_bayes/multinomial_nb.py +33 -61
  174. snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +33 -61
  175. snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +33 -61
  176. snowflake/ml/modeling/neighbors/kernel_density.py +33 -61
  177. snowflake/ml/modeling/neighbors/local_outlier_factor.py +33 -61
  178. snowflake/ml/modeling/neighbors/nearest_centroid.py +33 -61
  179. snowflake/ml/modeling/neighbors/nearest_neighbors.py +33 -61
  180. snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +33 -61
  181. snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +33 -61
  182. snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +33 -61
  183. snowflake/ml/modeling/neural_network/bernoulli_rbm.py +33 -61
  184. snowflake/ml/modeling/neural_network/mlp_classifier.py +33 -61
  185. snowflake/ml/modeling/neural_network/mlp_regressor.py +33 -61
  186. snowflake/ml/modeling/preprocessing/polynomial_features.py +33 -61
  187. snowflake/ml/modeling/semi_supervised/label_propagation.py +33 -61
  188. snowflake/ml/modeling/semi_supervised/label_spreading.py +33 -61
  189. snowflake/ml/modeling/svm/linear_svc.py +33 -61
  190. snowflake/ml/modeling/svm/linear_svr.py +33 -61
  191. snowflake/ml/modeling/svm/nu_svc.py +33 -61
  192. snowflake/ml/modeling/svm/nu_svr.py +33 -61
  193. snowflake/ml/modeling/svm/svc.py +33 -61
  194. snowflake/ml/modeling/svm/svr.py +33 -61
  195. snowflake/ml/modeling/tree/decision_tree_classifier.py +33 -61
  196. snowflake/ml/modeling/tree/decision_tree_regressor.py +33 -61
  197. snowflake/ml/modeling/tree/extra_tree_classifier.py +33 -61
  198. snowflake/ml/modeling/tree/extra_tree_regressor.py +33 -61
  199. snowflake/ml/modeling/xgboost/xgb_classifier.py +33 -61
  200. snowflake/ml/modeling/xgboost/xgb_regressor.py +33 -61
  201. snowflake/ml/modeling/xgboost/xgbrf_classifier.py +33 -61
  202. snowflake/ml/modeling/xgboost/xgbrf_regressor.py +33 -61
  203. snowflake/ml/registry/_manager/model_manager.py +6 -2
  204. snowflake/ml/registry/model_registry.py +100 -27
  205. snowflake/ml/registry/registry.py +6 -2
  206. snowflake/ml/version.py +1 -1
  207. {snowflake_ml_python-1.3.0.dist-info → snowflake_ml_python-1.4.0.dist-info}/METADATA +43 -7
  208. {snowflake_ml_python-1.3.0.dist-info → snowflake_ml_python-1.4.0.dist-info}/RECORD +211 -206
  209. {snowflake_ml_python-1.3.0.dist-info → snowflake_ml_python-1.4.0.dist-info}/LICENSE.txt +0 -0
  210. {snowflake_ml_python-1.3.0.dist-info → snowflake_ml_python-1.4.0.dist-info}/WHEEL +0 -0
  211. {snowflake_ml_python-1.3.0.dist-info → snowflake_ml_python-1.4.0.dist-info}/top_level.txt +0 -0
@@ -324,18 +324,24 @@ class IsolationForest(BaseTransformer):
         self._get_model_signatures(dataset)
         return self
 
-    def _get_pass_through_columns(self, dataset: DataFrame) -> List[str]:
-        if self._drop_input_cols:
-            return []
-        else:
-            return list(set(dataset.columns) - set(self.output_cols))
-
     def _batch_inference_validate_snowpark(
         self,
         dataset: DataFrame,
         inference_method: str,
     ) -> List[str]:
-        """Util method to run validate that batch inference can be run on a snowpark dataframe.
+        """Util method to run validate that batch inference can be run on a snowpark dataframe and
+        return the available package that exists in the snowflake anaconda channel
+
+        Args:
+            dataset: snowpark dataframe
+            inference_method: the inference method such as predict, score...
+
+        Raises:
+            SnowflakeMLException: If the estimator is not fitted, raise error
+            SnowflakeMLException: If the session is None, raise error
+
+        Returns:
+            A list of available package that exists in the snowflake anaconda channel
         """
         if not self._is_fitted:
             raise exceptions.SnowflakeMLException(
@@ -409,7 +415,7 @@ class IsolationForest(BaseTransformer):
             transform_kwargs = dict(
                 session = dataset._session,
                 dependencies = self._deps,
-                pass_through_cols = self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type = expected_type_inferred,
             )
 
@@ -469,16 +475,16 @@ class IsolationForest(BaseTransformer):
                 # from (n_samples, n_estimators) to (n_samples, n_estimators * n_classes) (and everything in between)
                 # based on init param values. We will convert that to pandas dataframe of shape (n_samples, 1) with
                 # each row containing a list of values.
-                expected_dtype = "ARRAY"
+                expected_dtype = "array"
 
             # If we were unable to assign a type to this transform in the factory, infer the type here.
             if expected_dtype == "":
-                # If this is a clustering transformer, if the number of output columns does not equal the number of clusters the response will be an "ARRAY"
+                # If this is a clustering transformer, if the number of output columns does not equal the number of clusters the response will be an "array"
                 if hasattr(self._sklearn_object, "n_clusters") and getattr(self._sklearn_object, "n_clusters") != len(self.output_cols):
-                    expected_dtype = "ARRAY"
-                # If this is a decomposition transformer, if the number of output columns does not equal the number of components the response will be an "ARRAY"
+                    expected_dtype = "array"
+                # If this is a decomposition transformer, if the number of output columns does not equal the number of components the response will be an "array"
                 elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
-                    expected_dtype = "ARRAY"
+                    expected_dtype = "array"
                 else:
                     output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
                     # We can only infer the output types from the input types if the following two statemetns are true:
@@ -496,7 +502,7 @@ class IsolationForest(BaseTransformer):
             transform_kwargs = dict(
                 session = dataset._session,
                 dependencies = self._deps,
-                pass_through_cols = self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type = expected_dtype,
             )
 
@@ -549,7 +555,7 @@ class IsolationForest(BaseTransformer):
                 subproject=_SUBPROJECT,
             )
             output_result, fitted_estimator = model_trainer.train_fit_predict(
-                pass_through_columns=self._get_pass_through_columns(dataset),
+                drop_input_cols=self._drop_input_cols,
                 expected_output_cols_list=self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix),
             )
             self._sklearn_object = fitted_estimator
@@ -567,44 +573,6 @@ class IsolationForest(BaseTransformer):
         assert self._sklearn_object is not None
         return self._sklearn_object.embedding_
 
-
-    def _get_output_column_names(self, output_cols_prefix: str, output_cols: Optional[List[str]] = None) -> List[str]:
-        """ Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
-        Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
-        """
-        output_cols_prefix = identifier.resolve_identifier(output_cols_prefix)
-        if output_cols:
-            output_cols = [
-                identifier.concat_names([output_cols_prefix, identifier.resolve_identifier(c)])
-                for c in output_cols
-            ]
-        elif getattr(self._sklearn_object, "classes_", None) is None:
-            output_cols = [output_cols_prefix]
-        elif self._sklearn_object is not None:
-            classes = self._sklearn_object.classes_
-            if isinstance(classes, numpy.ndarray):
-                output_cols = [f'{output_cols_prefix}{str(c)}' for c in classes.tolist()]
-            elif isinstance(classes, list) and len(classes) > 0 and isinstance(classes[0], numpy.ndarray):
-                # If the estimator is a multioutput estimator, classes_ will be a list of ndarrays.
-                output_cols = []
-                for i, cl in enumerate(classes):
-                    # For binary classification, there is only one output column for each class
-                    # ndarray as the two classes are complementary.
-                    if len(cl) == 2:
-                        output_cols.append(f'{output_cols_prefix}{i}_{cl[0]}')
-                    else:
-                        output_cols.extend([
-                            f'{output_cols_prefix}{i}_{c}' for c in cl.tolist()
-                        ])
-            else:
-                output_cols = []
-
-        # Make sure column names are valid snowflake identifiers.
-        assert output_cols is not None # Make MyPy happy
-        rv = [identifier.rename_to_valid_snowflake_identifier(c) for c in output_cols]
-
-        return rv
-
     @available_if(original_estimator_has_callable("predict_proba")) # type: ignore[misc]
     @telemetry.send_api_usage_telemetry(
         project=_PROJECT,
@@ -644,7 +612,7 @@ class IsolationForest(BaseTransformer):
             transform_kwargs = dict(
                 session=dataset._session,
                 dependencies=self._deps,
-                pass_through_cols=self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type="float",
             )
 
@@ -709,7 +677,7 @@ class IsolationForest(BaseTransformer):
             transform_kwargs = dict(
                 session=dataset._session,
                 dependencies=self._deps,
-                pass_through_cols=self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type="float",
             )
         elif isinstance(dataset, pd.DataFrame):
@@ -772,7 +740,7 @@ class IsolationForest(BaseTransformer):
             transform_kwargs = dict(
                 session=dataset._session,
                 dependencies=self._deps,
-                pass_through_cols=self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type="float",
             )
 
@@ -839,7 +807,7 @@ class IsolationForest(BaseTransformer):
             transform_kwargs = dict(
                 session=dataset._session,
                 dependencies=self._deps,
-                pass_through_cols=self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type="float",
             )
 
@@ -893,13 +861,17 @@ class IsolationForest(BaseTransformer):
         transform_kwargs: ScoreKwargsTypedDict = dict()
 
         if isinstance(dataset, DataFrame):
+            self._deps = self._batch_inference_validate_snowpark(
+                dataset=dataset,
+                inference_method="score",
+            )
             selected_cols = self._get_active_columns()
             if len(selected_cols) > 0:
                 dataset = dataset.select(selected_cols)
             assert isinstance(dataset._session, Session) # keep mypy happy
             transform_kwargs = dict(
                 session=dataset._session,
-                dependencies=["snowflake-snowpark-python"] + self._get_dependencies(),
+                dependencies=["snowflake-snowpark-python"] + self._deps,
                 score_sproc_imports=['sklearn'],
             )
         elif isinstance(dataset, pd.DataFrame):
@@ -973,9 +945,9 @@ class IsolationForest(BaseTransformer):
             transform_kwargs = dict(
                 session = dataset._session,
                 dependencies = self._deps,
-                pass_through_cols = self._get_pass_through_columns(dataset),
-                expected_output_cols_type = "array",
-                n_neighbors = n_neighbors,
+                drop_input_cols = self._drop_input_cols,
+                expected_output_cols_type="array",
+                n_neighbors = n_neighbors,
                 return_distance = return_distance
             )
         elif isinstance(dataset, pd.DataFrame):
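
Note: the hunks above capture the refactor that repeats across every modeling estimator in this release: the per-estimator _get_pass_through_columns() helper is removed, and the inference handlers now receive the boolean drop_input_cols flag directly. A minimal runnable sketch of the equivalence, reconstructed from the deleted helper's body shown above (the standalone function name is hypothetical, not a package API):

from typing import List

def pass_through_columns(
    dataset_columns: List[str],
    output_cols: List[str],
    drop_input_cols: bool,
) -> List[str]:
    # Same logic as the deleted _get_pass_through_columns(): either drop the
    # inputs entirely, or pass through every column that is not an output.
    if drop_input_cols:
        return []
    return list(set(dataset_columns) - set(output_cols))

# With the flag unset, non-output columns survive inference; with it set, they don't.
print(sorted(pass_through_columns(["A", "B"], ["OUTPUT"], False)))  # ['A', 'B']
print(pass_through_columns(["A", "B"], ["OUTPUT"], True))           # []

Passing the flag instead of a precomputed column list presumably lets each handler derive the pass-through set from the dataframe it actually executes against; the diff itself only records the signature change.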
@@ -436,18 +436,24 @@ class RandomForestClassifier(BaseTransformer):
         self._get_model_signatures(dataset)
         return self
 
-    def _get_pass_through_columns(self, dataset: DataFrame) -> List[str]:
-        if self._drop_input_cols:
-            return []
-        else:
-            return list(set(dataset.columns) - set(self.output_cols))
-
     def _batch_inference_validate_snowpark(
         self,
         dataset: DataFrame,
         inference_method: str,
     ) -> List[str]:
-        """Util method to run validate that batch inference can be run on a snowpark dataframe.
+        """Util method to run validate that batch inference can be run on a snowpark dataframe and
+        return the available package that exists in the snowflake anaconda channel
+
+        Args:
+            dataset: snowpark dataframe
+            inference_method: the inference method such as predict, score...
+
+        Raises:
+            SnowflakeMLException: If the estimator is not fitted, raise error
+            SnowflakeMLException: If the session is None, raise error
+
+        Returns:
+            A list of available package that exists in the snowflake anaconda channel
         """
         if not self._is_fitted:
             raise exceptions.SnowflakeMLException(
@@ -521,7 +527,7 @@ class RandomForestClassifier(BaseTransformer):
             transform_kwargs = dict(
                 session = dataset._session,
                 dependencies = self._deps,
-                pass_through_cols = self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type = expected_type_inferred,
             )
 
@@ -581,16 +587,16 @@ class RandomForestClassifier(BaseTransformer):
                 # from (n_samples, n_estimators) to (n_samples, n_estimators * n_classes) (and everything in between)
                 # based on init param values. We will convert that to pandas dataframe of shape (n_samples, 1) with
                 # each row containing a list of values.
-                expected_dtype = "ARRAY"
+                expected_dtype = "array"
 
             # If we were unable to assign a type to this transform in the factory, infer the type here.
             if expected_dtype == "":
-                # If this is a clustering transformer, if the number of output columns does not equal the number of clusters the response will be an "ARRAY"
+                # If this is a clustering transformer, if the number of output columns does not equal the number of clusters the response will be an "array"
                 if hasattr(self._sklearn_object, "n_clusters") and getattr(self._sklearn_object, "n_clusters") != len(self.output_cols):
-                    expected_dtype = "ARRAY"
-                # If this is a decomposition transformer, if the number of output columns does not equal the number of components the response will be an "ARRAY"
+                    expected_dtype = "array"
+                # If this is a decomposition transformer, if the number of output columns does not equal the number of components the response will be an "array"
                 elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
-                    expected_dtype = "ARRAY"
+                    expected_dtype = "array"
                 else:
                     output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
                     # We can only infer the output types from the input types if the following two statemetns are true:
@@ -608,7 +614,7 @@ class RandomForestClassifier(BaseTransformer):
             transform_kwargs = dict(
                 session = dataset._session,
                 dependencies = self._deps,
-                pass_through_cols = self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type = expected_dtype,
             )
 
@@ -659,7 +665,7 @@ class RandomForestClassifier(BaseTransformer):
                 subproject=_SUBPROJECT,
             )
             output_result, fitted_estimator = model_trainer.train_fit_predict(
-                pass_through_columns=self._get_pass_through_columns(dataset),
+                drop_input_cols=self._drop_input_cols,
                 expected_output_cols_list=self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix),
             )
             self._sklearn_object = fitted_estimator
@@ -677,44 +683,6 @@ class RandomForestClassifier(BaseTransformer):
         assert self._sklearn_object is not None
         return self._sklearn_object.embedding_
 
-
-    def _get_output_column_names(self, output_cols_prefix: str, output_cols: Optional[List[str]] = None) -> List[str]:
-        """ Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
-        Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
-        """
-        output_cols_prefix = identifier.resolve_identifier(output_cols_prefix)
-        if output_cols:
-            output_cols = [
-                identifier.concat_names([output_cols_prefix, identifier.resolve_identifier(c)])
-                for c in output_cols
-            ]
-        elif getattr(self._sklearn_object, "classes_", None) is None:
-            output_cols = [output_cols_prefix]
-        elif self._sklearn_object is not None:
-            classes = self._sklearn_object.classes_
-            if isinstance(classes, numpy.ndarray):
-                output_cols = [f'{output_cols_prefix}{str(c)}' for c in classes.tolist()]
-            elif isinstance(classes, list) and len(classes) > 0 and isinstance(classes[0], numpy.ndarray):
-                # If the estimator is a multioutput estimator, classes_ will be a list of ndarrays.
-                output_cols = []
-                for i, cl in enumerate(classes):
-                    # For binary classification, there is only one output column for each class
-                    # ndarray as the two classes are complementary.
-                    if len(cl) == 2:
-                        output_cols.append(f'{output_cols_prefix}{i}_{cl[0]}')
-                    else:
-                        output_cols.extend([
-                            f'{output_cols_prefix}{i}_{c}' for c in cl.tolist()
-                        ])
-            else:
-                output_cols = []
-
-        # Make sure column names are valid snowflake identifiers.
-        assert output_cols is not None # Make MyPy happy
-        rv = [identifier.rename_to_valid_snowflake_identifier(c) for c in output_cols]
-
-        return rv
-
     @available_if(original_estimator_has_callable("predict_proba")) # type: ignore[misc]
     @telemetry.send_api_usage_telemetry(
         project=_PROJECT,
@@ -756,7 +724,7 @@ class RandomForestClassifier(BaseTransformer):
             transform_kwargs = dict(
                 session=dataset._session,
                 dependencies=self._deps,
-                pass_through_cols=self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type="float",
            )
 
@@ -823,7 +791,7 @@ class RandomForestClassifier(BaseTransformer):
             transform_kwargs = dict(
                 session=dataset._session,
                 dependencies=self._deps,
-                pass_through_cols=self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type="float",
             )
         elif isinstance(dataset, pd.DataFrame):
@@ -884,7 +852,7 @@ class RandomForestClassifier(BaseTransformer):
             transform_kwargs = dict(
                 session=dataset._session,
                 dependencies=self._deps,
-                pass_through_cols=self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type="float",
             )
 
@@ -949,7 +917,7 @@ class RandomForestClassifier(BaseTransformer):
             transform_kwargs = dict(
                 session=dataset._session,
                 dependencies=self._deps,
-                pass_through_cols=self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type="float",
             )
 
@@ -1005,13 +973,17 @@ class RandomForestClassifier(BaseTransformer):
         transform_kwargs: ScoreKwargsTypedDict = dict()
 
         if isinstance(dataset, DataFrame):
+            self._deps = self._batch_inference_validate_snowpark(
+                dataset=dataset,
+                inference_method="score",
+            )
             selected_cols = self._get_active_columns()
             if len(selected_cols) > 0:
                 dataset = dataset.select(selected_cols)
             assert isinstance(dataset._session, Session) # keep mypy happy
             transform_kwargs = dict(
                 session=dataset._session,
-                dependencies=["snowflake-snowpark-python"] + self._get_dependencies(),
+                dependencies=["snowflake-snowpark-python"] + self._deps,
                 score_sproc_imports=['sklearn'],
             )
         elif isinstance(dataset, pd.DataFrame):
@@ -1085,9 +1057,9 @@ class RandomForestClassifier(BaseTransformer):
             transform_kwargs = dict(
                 session = dataset._session,
                 dependencies = self._deps,
-                pass_through_cols = self._get_pass_through_columns(dataset),
-                expected_output_cols_type = "array",
-                n_neighbors = n_neighbors,
+                drop_input_cols = self._drop_input_cols,
+                expected_output_cols_type="array",
+                n_neighbors = n_neighbors,
                 return_distance = return_distance
             )
         elif isinstance(dataset, pd.DataFrame):
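
Note: the score() hunks make the same two-part change in each file: score() now calls _batch_inference_validate_snowpark() up front, caches the returned packages (those resolved against the Snowflake Anaconda channel) on self._deps, and reuses them in place of the old _get_dependencies() call when assembling the stored-procedure dependencies. A runnable sketch of that control flow with the estimator internals stubbed out (the class and stub bodies here are illustrative, not the library's):

from typing import List

class EstimatorSketch:
    """Stub mirroring the new score()-time dependency flow shown above."""

    def __init__(self, resolved_packages: List[str]) -> None:
        self._deps: List[str] = []
        # Stand-in for packages resolved against the Snowflake Anaconda channel.
        self._resolved_packages = resolved_packages

    def _batch_inference_validate_snowpark(self, inference_method: str) -> List[str]:
        # The real method also checks fitted state and an active session
        # (see the Raises section of the new docstring); omitted here.
        return self._resolved_packages

    def score_dependencies(self) -> List[str]:
        # New in 1.4.0: validate first and cache the resolved packages...
        self._deps = self._batch_inference_validate_snowpark(inference_method="score")
        # ...then reuse the cache instead of recomputing dependencies.
        return ["snowflake-snowpark-python"] + self._deps

print(EstimatorSketch(["scikit-learn==1.3.2"]).score_dependencies())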
@@ -415,18 +415,24 @@ class RandomForestRegressor(BaseTransformer):
         self._get_model_signatures(dataset)
         return self
 
-    def _get_pass_through_columns(self, dataset: DataFrame) -> List[str]:
-        if self._drop_input_cols:
-            return []
-        else:
-            return list(set(dataset.columns) - set(self.output_cols))
-
     def _batch_inference_validate_snowpark(
         self,
         dataset: DataFrame,
         inference_method: str,
     ) -> List[str]:
-        """Util method to run validate that batch inference can be run on a snowpark dataframe.
+        """Util method to run validate that batch inference can be run on a snowpark dataframe and
+        return the available package that exists in the snowflake anaconda channel
+
+        Args:
+            dataset: snowpark dataframe
+            inference_method: the inference method such as predict, score...
+
+        Raises:
+            SnowflakeMLException: If the estimator is not fitted, raise error
+            SnowflakeMLException: If the session is None, raise error
+
+        Returns:
+            A list of available package that exists in the snowflake anaconda channel
         """
         if not self._is_fitted:
             raise exceptions.SnowflakeMLException(
@@ -500,7 +506,7 @@ class RandomForestRegressor(BaseTransformer):
             transform_kwargs = dict(
                 session = dataset._session,
                 dependencies = self._deps,
-                pass_through_cols = self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type = expected_type_inferred,
             )
 
@@ -560,16 +566,16 @@ class RandomForestRegressor(BaseTransformer):
                 # from (n_samples, n_estimators) to (n_samples, n_estimators * n_classes) (and everything in between)
                 # based on init param values. We will convert that to pandas dataframe of shape (n_samples, 1) with
                 # each row containing a list of values.
-                expected_dtype = "ARRAY"
+                expected_dtype = "array"
 
             # If we were unable to assign a type to this transform in the factory, infer the type here.
             if expected_dtype == "":
-                # If this is a clustering transformer, if the number of output columns does not equal the number of clusters the response will be an "ARRAY"
+                # If this is a clustering transformer, if the number of output columns does not equal the number of clusters the response will be an "array"
                 if hasattr(self._sklearn_object, "n_clusters") and getattr(self._sklearn_object, "n_clusters") != len(self.output_cols):
-                    expected_dtype = "ARRAY"
-                # If this is a decomposition transformer, if the number of output columns does not equal the number of components the response will be an "ARRAY"
+                    expected_dtype = "array"
+                # If this is a decomposition transformer, if the number of output columns does not equal the number of components the response will be an "array"
                 elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
-                    expected_dtype = "ARRAY"
+                    expected_dtype = "array"
                 else:
                     output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
                     # We can only infer the output types from the input types if the following two statemetns are true:
@@ -587,7 +593,7 @@ class RandomForestRegressor(BaseTransformer):
             transform_kwargs = dict(
                 session = dataset._session,
                 dependencies = self._deps,
-                pass_through_cols = self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type = expected_dtype,
             )
 
@@ -638,7 +644,7 @@ class RandomForestRegressor(BaseTransformer):
                 subproject=_SUBPROJECT,
            )
            output_result, fitted_estimator = model_trainer.train_fit_predict(
-                pass_through_columns=self._get_pass_through_columns(dataset),
+                drop_input_cols=self._drop_input_cols,
                 expected_output_cols_list=self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix),
             )
             self._sklearn_object = fitted_estimator
@@ -656,44 +662,6 @@ class RandomForestRegressor(BaseTransformer):
         assert self._sklearn_object is not None
         return self._sklearn_object.embedding_
 
-
-    def _get_output_column_names(self, output_cols_prefix: str, output_cols: Optional[List[str]] = None) -> List[str]:
-        """ Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
-        Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
-        """
-        output_cols_prefix = identifier.resolve_identifier(output_cols_prefix)
-        if output_cols:
-            output_cols = [
-                identifier.concat_names([output_cols_prefix, identifier.resolve_identifier(c)])
-                for c in output_cols
-            ]
-        elif getattr(self._sklearn_object, "classes_", None) is None:
-            output_cols = [output_cols_prefix]
-        elif self._sklearn_object is not None:
-            classes = self._sklearn_object.classes_
-            if isinstance(classes, numpy.ndarray):
-                output_cols = [f'{output_cols_prefix}{str(c)}' for c in classes.tolist()]
-            elif isinstance(classes, list) and len(classes) > 0 and isinstance(classes[0], numpy.ndarray):
-                # If the estimator is a multioutput estimator, classes_ will be a list of ndarrays.
-                output_cols = []
-                for i, cl in enumerate(classes):
-                    # For binary classification, there is only one output column for each class
-                    # ndarray as the two classes are complementary.
-                    if len(cl) == 2:
-                        output_cols.append(f'{output_cols_prefix}{i}_{cl[0]}')
-                    else:
-                        output_cols.extend([
-                            f'{output_cols_prefix}{i}_{c}' for c in cl.tolist()
-                        ])
-            else:
-                output_cols = []
-
-        # Make sure column names are valid snowflake identifiers.
-        assert output_cols is not None # Make MyPy happy
-        rv = [identifier.rename_to_valid_snowflake_identifier(c) for c in output_cols]
-
-        return rv
-
     @available_if(original_estimator_has_callable("predict_proba")) # type: ignore[misc]
     @telemetry.send_api_usage_telemetry(
         project=_PROJECT,
@@ -733,7 +701,7 @@ class RandomForestRegressor(BaseTransformer):
             transform_kwargs = dict(
                 session=dataset._session,
                 dependencies=self._deps,
-                pass_through_cols=self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type="float",
             )
 
@@ -798,7 +766,7 @@ class RandomForestRegressor(BaseTransformer):
             transform_kwargs = dict(
                 session=dataset._session,
                 dependencies=self._deps,
-                pass_through_cols=self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type="float",
             )
         elif isinstance(dataset, pd.DataFrame):
@@ -859,7 +827,7 @@ class RandomForestRegressor(BaseTransformer):
             transform_kwargs = dict(
                 session=dataset._session,
                 dependencies=self._deps,
-                pass_through_cols=self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type="float",
             )
 
@@ -924,7 +892,7 @@ class RandomForestRegressor(BaseTransformer):
             transform_kwargs = dict(
                 session=dataset._session,
                 dependencies=self._deps,
-                pass_through_cols=self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type="float",
             )
 
@@ -980,13 +948,17 @@ class RandomForestRegressor(BaseTransformer):
         transform_kwargs: ScoreKwargsTypedDict = dict()
 
         if isinstance(dataset, DataFrame):
+            self._deps = self._batch_inference_validate_snowpark(
+                dataset=dataset,
+                inference_method="score",
+            )
             selected_cols = self._get_active_columns()
             if len(selected_cols) > 0:
                 dataset = dataset.select(selected_cols)
             assert isinstance(dataset._session, Session) # keep mypy happy
             transform_kwargs = dict(
                 session=dataset._session,
-                dependencies=["snowflake-snowpark-python"] + self._get_dependencies(),
+                dependencies=["snowflake-snowpark-python"] + self._deps,
                 score_sproc_imports=['sklearn'],
             )
         elif isinstance(dataset, pd.DataFrame):
@@ -1060,9 +1032,9 @@ class RandomForestRegressor(BaseTransformer):
             transform_kwargs = dict(
                 session = dataset._session,
                 dependencies = self._deps,
-                pass_through_cols = self._get_pass_through_columns(dataset),
-                expected_output_cols_type = "array",
-                n_neighbors = n_neighbors,
+                drop_input_cols = self._drop_input_cols,
+                expected_output_cols_type="array",
+                n_neighbors = n_neighbors,
                 return_distance = return_distance
             )
         elif isinstance(dataset, pd.DataFrame):
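
Note: the largest deletion in each estimator is _get_output_column_names(), which named the output columns for predict_proba(), decision_function(), and similar methods. The file list at the top shows snowflake/ml/modeling/_internal/estimator_utils.py gaining 61 lines, which suggests (the hunks shown here do not confirm it) that this logic was consolidated there rather than dropped. A self-contained sketch of the deleted naming rules, with Snowflake identifier resolution stubbed out for runnability:

from typing import Any, List
import numpy

def output_column_names(classes_: Any, output_cols_prefix: str) -> List[str]:
    # Mirrors the deleted _get_output_column_names() branches; identifier
    # resolution and renaming to valid Snowflake identifiers are omitted.
    if classes_ is None:
        # Not a classifier: a single column named after the prefix.
        return [output_cols_prefix]
    if isinstance(classes_, numpy.ndarray):
        # Ordinary classifier: one column per class label.
        return [f"{output_cols_prefix}{c}" for c in classes_.tolist()]
    if isinstance(classes_, list) and classes_ and isinstance(classes_[0], numpy.ndarray):
        # Multioutput classifier: classes_ is a list of ndarrays.
        cols: List[str] = []
        for i, cl in enumerate(classes_):
            if len(cl) == 2:
                # Binary output: one column suffices, the two classes being complementary.
                cols.append(f"{output_cols_prefix}{i}_{cl[0]}")
            else:
                cols.extend(f"{output_cols_prefix}{i}_{c}" for c in cl.tolist())
        return cols
    return []

print(output_column_names(numpy.array([0, 1]), "PREDICT_PROBA_"))  # ['PREDICT_PROBA_0', 'PREDICT_PROBA_1']
print(output_column_names(None, "SCORE_"))                         # ['SCORE_']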