snowflake-ml-python 1.3.0__py3-none-any.whl → 1.4.0__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (211)
  1. snowflake/ml/_internal/file_utils.py +3 -3
  2. snowflake/ml/_internal/human_readable_id/adjectives.txt +128 -0
  3. snowflake/ml/_internal/human_readable_id/animals.txt +128 -0
  4. snowflake/ml/_internal/human_readable_id/hrid_generator.py +40 -0
  5. snowflake/ml/_internal/human_readable_id/hrid_generator_base.py +135 -0
  6. snowflake/ml/_internal/telemetry.py +11 -2
  7. snowflake/ml/_internal/utils/formatting.py +1 -1
  8. snowflake/ml/feature_store/feature_store.py +15 -106
  9. snowflake/ml/fileset/sfcfs.py +4 -3
  10. snowflake/ml/fileset/stage_fs.py +18 -0
  11. snowflake/ml/model/_api.py +9 -9
  12. snowflake/ml/model/_client/model/model_version_impl.py +20 -15
  13. snowflake/ml/model/_deploy_client/image_builds/docker_context.py +3 -9
  14. snowflake/ml/model/_deploy_client/image_builds/server_image_builder.py +3 -5
  15. snowflake/ml/model/_deploy_client/snowservice/deploy.py +7 -6
  16. snowflake/ml/model/_model_composer/model_composer.py +10 -8
  17. snowflake/ml/model/_model_composer/model_method/function_generator.py +1 -1
  18. snowflake/ml/model/_model_composer/model_method/infer_table_function.py_template +2 -1
  19. snowflake/ml/model/_model_composer/model_method/model_method.py +2 -2
  20. snowflake/ml/model/_model_composer/model_runtime/_runtime_requirements.py +1 -1
  21. snowflake/ml/model/_packager/model_handlers/_base.py +2 -2
  22. snowflake/ml/model/_packager/model_handlers/_utils.py +5 -5
  23. snowflake/ml/model/_packager/model_handlers/custom.py +7 -7
  24. snowflake/ml/model/_packager/model_handlers/huggingface_pipeline.py +2 -2
  25. snowflake/ml/model/_packager/model_handlers/llm.py +1 -1
  26. snowflake/ml/model/_packager/model_handlers/mlflow.py +1 -1
  27. snowflake/ml/model/_packager/model_handlers/pytorch.py +13 -10
  28. snowflake/ml/model/_packager/model_handlers/sentence_transformers.py +214 -0
  29. snowflake/ml/model/_packager/model_handlers/sklearn.py +6 -6
  30. snowflake/ml/model/_packager/model_handlers/snowmlmodel.py +15 -3
  31. snowflake/ml/model/_packager/model_handlers/tensorflow.py +8 -8
  32. snowflake/ml/model/_packager/model_handlers/torchscript.py +7 -7
  33. snowflake/ml/model/_packager/model_handlers/xgboost.py +8 -8
  34. snowflake/ml/model/_packager/model_meta/_core_requirements.py +1 -1
  35. snowflake/ml/model/_packager/model_packager.py +8 -6
  36. snowflake/ml/model/custom_model.py +3 -1
  37. snowflake/ml/model/type_hints.py +13 -0
  38. snowflake/ml/modeling/_internal/estimator_utils.py +61 -1
  39. snowflake/ml/modeling/_internal/local_implementations/pandas_handlers.py +4 -43
  40. snowflake/ml/modeling/_internal/local_implementations/pandas_trainer.py +4 -4
  41. snowflake/ml/modeling/_internal/ml_runtime_implementations/ml_runtime_handlers.py +21 -17
  42. snowflake/ml/modeling/_internal/model_specifications.py +3 -1
  43. snowflake/ml/modeling/_internal/model_trainer.py +2 -2
  44. snowflake/ml/modeling/_internal/snowpark_implementations/distributed_hpo_trainer.py +547 -1
  45. snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_handlers.py +67 -114
  46. snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_trainer.py +9 -9
  47. snowflake/ml/modeling/_internal/transformer_protocols.py +2 -3
  48. snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +33 -61
  49. snowflake/ml/modeling/cluster/affinity_propagation.py +33 -61
  50. snowflake/ml/modeling/cluster/agglomerative_clustering.py +33 -61
  51. snowflake/ml/modeling/cluster/birch.py +33 -61
  52. snowflake/ml/modeling/cluster/bisecting_k_means.py +33 -61
  53. snowflake/ml/modeling/cluster/dbscan.py +33 -61
  54. snowflake/ml/modeling/cluster/feature_agglomeration.py +33 -61
  55. snowflake/ml/modeling/cluster/k_means.py +33 -61
  56. snowflake/ml/modeling/cluster/mean_shift.py +33 -61
  57. snowflake/ml/modeling/cluster/mini_batch_k_means.py +33 -61
  58. snowflake/ml/modeling/cluster/optics.py +33 -61
  59. snowflake/ml/modeling/cluster/spectral_biclustering.py +33 -61
  60. snowflake/ml/modeling/cluster/spectral_clustering.py +33 -61
  61. snowflake/ml/modeling/cluster/spectral_coclustering.py +33 -61
  62. snowflake/ml/modeling/compose/column_transformer.py +33 -61
  63. snowflake/ml/modeling/compose/transformed_target_regressor.py +33 -61
  64. snowflake/ml/modeling/covariance/elliptic_envelope.py +33 -61
  65. snowflake/ml/modeling/covariance/empirical_covariance.py +33 -61
  66. snowflake/ml/modeling/covariance/graphical_lasso.py +33 -61
  67. snowflake/ml/modeling/covariance/graphical_lasso_cv.py +33 -61
  68. snowflake/ml/modeling/covariance/ledoit_wolf.py +33 -61
  69. snowflake/ml/modeling/covariance/min_cov_det.py +33 -61
  70. snowflake/ml/modeling/covariance/oas.py +33 -61
  71. snowflake/ml/modeling/covariance/shrunk_covariance.py +33 -61
  72. snowflake/ml/modeling/decomposition/dictionary_learning.py +33 -61
  73. snowflake/ml/modeling/decomposition/factor_analysis.py +33 -61
  74. snowflake/ml/modeling/decomposition/fast_ica.py +33 -61
  75. snowflake/ml/modeling/decomposition/incremental_pca.py +33 -61
  76. snowflake/ml/modeling/decomposition/kernel_pca.py +33 -61
  77. snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +33 -61
  78. snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +33 -61
  79. snowflake/ml/modeling/decomposition/pca.py +33 -61
  80. snowflake/ml/modeling/decomposition/sparse_pca.py +33 -61
  81. snowflake/ml/modeling/decomposition/truncated_svd.py +33 -61
  82. snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +33 -61
  83. snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +33 -61
  84. snowflake/ml/modeling/ensemble/ada_boost_classifier.py +33 -61
  85. snowflake/ml/modeling/ensemble/ada_boost_regressor.py +33 -61
  86. snowflake/ml/modeling/ensemble/bagging_classifier.py +33 -61
  87. snowflake/ml/modeling/ensemble/bagging_regressor.py +33 -61
  88. snowflake/ml/modeling/ensemble/extra_trees_classifier.py +33 -61
  89. snowflake/ml/modeling/ensemble/extra_trees_regressor.py +33 -61
  90. snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +33 -61
  91. snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +33 -61
  92. snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +33 -61
  93. snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +33 -61
  94. snowflake/ml/modeling/ensemble/isolation_forest.py +33 -61
  95. snowflake/ml/modeling/ensemble/random_forest_classifier.py +33 -61
  96. snowflake/ml/modeling/ensemble/random_forest_regressor.py +33 -61
  97. snowflake/ml/modeling/ensemble/stacking_regressor.py +33 -61
  98. snowflake/ml/modeling/ensemble/voting_classifier.py +33 -61
  99. snowflake/ml/modeling/ensemble/voting_regressor.py +33 -61
  100. snowflake/ml/modeling/feature_selection/generic_univariate_select.py +33 -61
  101. snowflake/ml/modeling/feature_selection/select_fdr.py +33 -61
  102. snowflake/ml/modeling/feature_selection/select_fpr.py +33 -61
  103. snowflake/ml/modeling/feature_selection/select_fwe.py +33 -61
  104. snowflake/ml/modeling/feature_selection/select_k_best.py +33 -61
  105. snowflake/ml/modeling/feature_selection/select_percentile.py +33 -61
  106. snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +33 -61
  107. snowflake/ml/modeling/feature_selection/variance_threshold.py +33 -61
  108. snowflake/ml/modeling/framework/base.py +55 -5
  109. snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +33 -61
  110. snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +33 -61
  111. snowflake/ml/modeling/impute/iterative_imputer.py +33 -61
  112. snowflake/ml/modeling/impute/knn_imputer.py +33 -61
  113. snowflake/ml/modeling/impute/missing_indicator.py +33 -61
  114. snowflake/ml/modeling/impute/simple_imputer.py +4 -15
  115. snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +33 -61
  116. snowflake/ml/modeling/kernel_approximation/nystroem.py +33 -61
  117. snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +33 -61
  118. snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +33 -61
  119. snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +33 -61
  120. snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +33 -61
  121. snowflake/ml/modeling/lightgbm/lgbm_classifier.py +36 -63
  122. snowflake/ml/modeling/lightgbm/lgbm_regressor.py +36 -63
  123. snowflake/ml/modeling/linear_model/ard_regression.py +33 -61
  124. snowflake/ml/modeling/linear_model/bayesian_ridge.py +33 -61
  125. snowflake/ml/modeling/linear_model/elastic_net.py +33 -61
  126. snowflake/ml/modeling/linear_model/elastic_net_cv.py +33 -61
  127. snowflake/ml/modeling/linear_model/gamma_regressor.py +33 -61
  128. snowflake/ml/modeling/linear_model/huber_regressor.py +33 -61
  129. snowflake/ml/modeling/linear_model/lars.py +33 -61
  130. snowflake/ml/modeling/linear_model/lars_cv.py +33 -61
  131. snowflake/ml/modeling/linear_model/lasso.py +33 -61
  132. snowflake/ml/modeling/linear_model/lasso_cv.py +33 -61
  133. snowflake/ml/modeling/linear_model/lasso_lars.py +33 -61
  134. snowflake/ml/modeling/linear_model/lasso_lars_cv.py +33 -61
  135. snowflake/ml/modeling/linear_model/lasso_lars_ic.py +33 -61
  136. snowflake/ml/modeling/linear_model/linear_regression.py +33 -61
  137. snowflake/ml/modeling/linear_model/logistic_regression.py +33 -61
  138. snowflake/ml/modeling/linear_model/logistic_regression_cv.py +33 -61
  139. snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +33 -61
  140. snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +33 -61
  141. snowflake/ml/modeling/linear_model/multi_task_lasso.py +33 -61
  142. snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +33 -61
  143. snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +33 -61
  144. snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +33 -61
  145. snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +33 -61
  146. snowflake/ml/modeling/linear_model/perceptron.py +33 -61
  147. snowflake/ml/modeling/linear_model/poisson_regressor.py +33 -61
  148. snowflake/ml/modeling/linear_model/ransac_regressor.py +33 -61
  149. snowflake/ml/modeling/linear_model/ridge.py +33 -61
  150. snowflake/ml/modeling/linear_model/ridge_classifier.py +33 -61
  151. snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +33 -61
  152. snowflake/ml/modeling/linear_model/ridge_cv.py +33 -61
  153. snowflake/ml/modeling/linear_model/sgd_classifier.py +33 -61
  154. snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +33 -61
  155. snowflake/ml/modeling/linear_model/sgd_regressor.py +33 -61
  156. snowflake/ml/modeling/linear_model/theil_sen_regressor.py +33 -61
  157. snowflake/ml/modeling/linear_model/tweedie_regressor.py +33 -61
  158. snowflake/ml/modeling/manifold/isomap.py +33 -61
  159. snowflake/ml/modeling/manifold/mds.py +33 -61
  160. snowflake/ml/modeling/manifold/spectral_embedding.py +33 -61
  161. snowflake/ml/modeling/manifold/tsne.py +33 -61
  162. snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +33 -61
  163. snowflake/ml/modeling/mixture/gaussian_mixture.py +33 -61
  164. snowflake/ml/modeling/model_selection/grid_search_cv.py +39 -57
  165. snowflake/ml/modeling/model_selection/randomized_search_cv.py +26 -57
  166. snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +33 -61
  167. snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +33 -61
  168. snowflake/ml/modeling/multiclass/output_code_classifier.py +33 -61
  169. snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +33 -61
  170. snowflake/ml/modeling/naive_bayes/categorical_nb.py +33 -61
  171. snowflake/ml/modeling/naive_bayes/complement_nb.py +33 -61
  172. snowflake/ml/modeling/naive_bayes/gaussian_nb.py +33 -61
  173. snowflake/ml/modeling/naive_bayes/multinomial_nb.py +33 -61
  174. snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +33 -61
  175. snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +33 -61
  176. snowflake/ml/modeling/neighbors/kernel_density.py +33 -61
  177. snowflake/ml/modeling/neighbors/local_outlier_factor.py +33 -61
  178. snowflake/ml/modeling/neighbors/nearest_centroid.py +33 -61
  179. snowflake/ml/modeling/neighbors/nearest_neighbors.py +33 -61
  180. snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +33 -61
  181. snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +33 -61
  182. snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +33 -61
  183. snowflake/ml/modeling/neural_network/bernoulli_rbm.py +33 -61
  184. snowflake/ml/modeling/neural_network/mlp_classifier.py +33 -61
  185. snowflake/ml/modeling/neural_network/mlp_regressor.py +33 -61
  186. snowflake/ml/modeling/preprocessing/polynomial_features.py +33 -61
  187. snowflake/ml/modeling/semi_supervised/label_propagation.py +33 -61
  188. snowflake/ml/modeling/semi_supervised/label_spreading.py +33 -61
  189. snowflake/ml/modeling/svm/linear_svc.py +33 -61
  190. snowflake/ml/modeling/svm/linear_svr.py +33 -61
  191. snowflake/ml/modeling/svm/nu_svc.py +33 -61
  192. snowflake/ml/modeling/svm/nu_svr.py +33 -61
  193. snowflake/ml/modeling/svm/svc.py +33 -61
  194. snowflake/ml/modeling/svm/svr.py +33 -61
  195. snowflake/ml/modeling/tree/decision_tree_classifier.py +33 -61
  196. snowflake/ml/modeling/tree/decision_tree_regressor.py +33 -61
  197. snowflake/ml/modeling/tree/extra_tree_classifier.py +33 -61
  198. snowflake/ml/modeling/tree/extra_tree_regressor.py +33 -61
  199. snowflake/ml/modeling/xgboost/xgb_classifier.py +33 -61
  200. snowflake/ml/modeling/xgboost/xgb_regressor.py +33 -61
  201. snowflake/ml/modeling/xgboost/xgbrf_classifier.py +33 -61
  202. snowflake/ml/modeling/xgboost/xgbrf_regressor.py +33 -61
  203. snowflake/ml/registry/_manager/model_manager.py +6 -2
  204. snowflake/ml/registry/model_registry.py +100 -27
  205. snowflake/ml/registry/registry.py +6 -2
  206. snowflake/ml/version.py +1 -1
  207. {snowflake_ml_python-1.3.0.dist-info → snowflake_ml_python-1.4.0.dist-info}/METADATA +43 -7
  208. {snowflake_ml_python-1.3.0.dist-info → snowflake_ml_python-1.4.0.dist-info}/RECORD +211 -206
  209. {snowflake_ml_python-1.3.0.dist-info → snowflake_ml_python-1.4.0.dist-info}/LICENSE.txt +0 -0
  210. {snowflake_ml_python-1.3.0.dist-info → snowflake_ml_python-1.4.0.dist-info}/WHEEL +0 -0
  211. {snowflake_ml_python-1.3.0.dist-info → snowflake_ml_python-1.4.0.dist-info}/top_level.txt +0 -0
snowflake/ml/modeling/cluster/optics.py

@@ -384,18 +384,24 @@ class OPTICS(BaseTransformer):
         self._get_model_signatures(dataset)
         return self
 
-    def _get_pass_through_columns(self, dataset: DataFrame) -> List[str]:
-        if self._drop_input_cols:
-            return []
-        else:
-            return list(set(dataset.columns) - set(self.output_cols))
-
     def _batch_inference_validate_snowpark(
         self,
         dataset: DataFrame,
         inference_method: str,
     ) -> List[str]:
-        """Util method to run validate that batch inference can be run on a snowpark dataframe.
+        """Util method to run validate that batch inference can be run on a snowpark dataframe and
+        return the available package that exists in the snowflake anaconda channel
+
+        Args:
+            dataset: snowpark dataframe
+            inference_method: the inference method such as predict, score...
+
+        Raises:
+            SnowflakeMLException: If the estimator is not fitted, raise error
+            SnowflakeMLException: If the session is None, raise error
+
+        Returns:
+            A list of available package that exists in the snowflake anaconda channel
        """
        if not self._is_fitted:
            raise exceptions.SnowflakeMLException(
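
Across the three estimators in this excerpt (and the other generated estimators in the file list, which all show the same +33 -61 pattern), 1.4.0 deletes the per-estimator `_get_pass_through_columns` helper and instead forwards a `drop_input_cols` boolean to the shared inference handlers. A minimal sketch of the equivalence, under the assumption that the handler now derives the pass-through columns itself from the flag (the standalone function below is illustrative, not the actual handler API):

    from typing import List

    def pass_through_columns(dataset_columns: List[str], output_cols: List[str], drop_input_cols: bool) -> List[str]:
        # 1.3.0 computed this inside every estimator via _get_pass_through_columns;
        # 1.4.0 forwards only the boolean and leaves the column arithmetic to the
        # handlers, where the dataset actually lives.
        if drop_input_cols:
            return []
        return list(set(dataset_columns) - set(output_cols))

    print(sorted(pass_through_columns(["A", "B", "OUTPUT"], ["OUTPUT"], drop_input_cols=False)))
    # ['A', 'B']

In the kwargs below, `pass_through_cols = self._get_pass_through_columns(dataset)` accordingly becomes `drop_input_cols = self._drop_input_cols` throughout.
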
@@ -467,7 +473,7 @@ class OPTICS(BaseTransformer):
             transform_kwargs = dict(
                 session = dataset._session,
                 dependencies = self._deps,
-                pass_through_cols = self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type = expected_type_inferred,
             )
 
@@ -527,16 +533,16 @@ class OPTICS(BaseTransformer):
             # from (n_samples, n_estimators) to (n_samples, n_estimators * n_classes) (and everything in between)
             # based on init param values. We will convert that to pandas dataframe of shape (n_samples, 1) with
             # each row containing a list of values.
-            expected_dtype = "ARRAY"
+            expected_dtype = "array"
 
             # If we were unable to assign a type to this transform in the factory, infer the type here.
             if expected_dtype == "":
-                # If this is a clustering transformer, if the number of output columns does not equal the number of clusters the response will be an "ARRAY"
+                # If this is a clustering transformer, if the number of output columns does not equal the number of clusters the response will be an "array"
                 if hasattr(self._sklearn_object, "n_clusters") and getattr(self._sklearn_object, "n_clusters") != len(self.output_cols):
-                    expected_dtype = "ARRAY"
-                # If this is a decomposition transformer, if the number of output columns does not equal the number of components the response will be an "ARRAY"
+                    expected_dtype = "array"
+                # If this is a decomposition transformer, if the number of output columns does not equal the number of components the response will be an "array"
                 elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
-                    expected_dtype = "ARRAY"
+                    expected_dtype = "array"
                 else:
                     output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
                     # We can only infer the output types from the input types if the following two statemetns are true:
@@ -554,7 +560,7 @@ class OPTICS(BaseTransformer):
             transform_kwargs = dict(
                 session = dataset._session,
                 dependencies = self._deps,
-                pass_through_cols = self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type = expected_dtype,
             )
 
@@ -607,7 +613,7 @@ class OPTICS(BaseTransformer):
             subproject=_SUBPROJECT,
         )
         output_result, fitted_estimator = model_trainer.train_fit_predict(
-            pass_through_columns=self._get_pass_through_columns(dataset),
+            drop_input_cols=self._drop_input_cols,
             expected_output_cols_list=self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix),
         )
         self._sklearn_object = fitted_estimator
@@ -625,44 +631,6 @@ class OPTICS(BaseTransformer):
         assert self._sklearn_object is not None
         return self._sklearn_object.embedding_
 
-
-    def _get_output_column_names(self, output_cols_prefix: str, output_cols: Optional[List[str]] = None) -> List[str]:
-        """ Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
-        Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
-        """
-        output_cols_prefix = identifier.resolve_identifier(output_cols_prefix)
-        if output_cols:
-            output_cols = [
-                identifier.concat_names([output_cols_prefix, identifier.resolve_identifier(c)])
-                for c in output_cols
-            ]
-        elif getattr(self._sklearn_object, "classes_", None) is None:
-            output_cols = [output_cols_prefix]
-        elif self._sklearn_object is not None:
-            classes = self._sklearn_object.classes_
-            if isinstance(classes, numpy.ndarray):
-                output_cols = [f'{output_cols_prefix}{str(c)}' for c in classes.tolist()]
-            elif isinstance(classes, list) and len(classes) > 0 and isinstance(classes[0], numpy.ndarray):
-                # If the estimator is a multioutput estimator, classes_ will be a list of ndarrays.
-                output_cols = []
-                for i, cl in enumerate(classes):
-                    # For binary classification, there is only one output column for each class
-                    # ndarray as the two classes are complementary.
-                    if len(cl) == 2:
-                        output_cols.append(f'{output_cols_prefix}{i}_{cl[0]}')
-                    else:
-                        output_cols.extend([
-                            f'{output_cols_prefix}{i}_{c}' for c in cl.tolist()
-                        ])
-            else:
-                output_cols = []
-
-        # Make sure column names are valid snowflake identifiers.
-        assert output_cols is not None  # Make MyPy happy
-        rv = [identifier.rename_to_valid_snowflake_identifier(c) for c in output_cols]
-
-        return rv
-
     @available_if(original_estimator_has_callable("predict_proba"))  # type: ignore[misc]
     @telemetry.send_api_usage_telemetry(
         project=_PROJECT,
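
`_get_output_column_names` likewise disappears from every generated estimator in 1.4.0; given the `+55 -5` change to `snowflake/ml/modeling/framework/base.py` in the file list, the logic has presumably been consolidated into the shared base class rather than removed outright. For reference, a standalone sketch of the naming scheme the deleted helper implemented (simplified: identifier resolution and the multioutput `classes_`-list case are omitted):

    from typing import List, Optional

    import numpy as np

    def output_column_names(prefix: str, classes: Optional[np.ndarray]) -> List[str]:
        # Non-classifiers (no classes_ attribute) get a single column named by the prefix.
        if classes is None:
            return [prefix]
        # Classifiers get one column per class, e.g. for predict_proba().
        return [f"{prefix}{c}" for c in classes.tolist()]

    print(output_column_names("PREDICT_PROBA_", np.array([0, 1])))
    # ['PREDICT_PROBA_0', 'PREDICT_PROBA_1']
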
@@ -702,7 +670,7 @@ class OPTICS(BaseTransformer):
             transform_kwargs = dict(
                 session=dataset._session,
                 dependencies=self._deps,
-                pass_through_cols=self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type="float",
             )
 
@@ -767,7 +735,7 @@ class OPTICS(BaseTransformer):
             transform_kwargs = dict(
                 session=dataset._session,
                 dependencies=self._deps,
-                pass_through_cols=self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type="float",
             )
         elif isinstance(dataset, pd.DataFrame):
@@ -828,7 +796,7 @@ class OPTICS(BaseTransformer):
             transform_kwargs = dict(
                 session=dataset._session,
                 dependencies=self._deps,
-                pass_through_cols=self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type="float",
             )
 
@@ -893,7 +861,7 @@ class OPTICS(BaseTransformer):
             transform_kwargs = dict(
                 session=dataset._session,
                 dependencies=self._deps,
-                pass_through_cols=self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type="float",
             )
 
@@ -947,13 +915,17 @@ class OPTICS(BaseTransformer):
         transform_kwargs: ScoreKwargsTypedDict = dict()
 
         if isinstance(dataset, DataFrame):
+            self._deps = self._batch_inference_validate_snowpark(
+                dataset=dataset,
+                inference_method="score",
+            )
             selected_cols = self._get_active_columns()
             if len(selected_cols) > 0:
                 dataset = dataset.select(selected_cols)
             assert isinstance(dataset._session, Session)  # keep mypy happy
             transform_kwargs = dict(
                 session=dataset._session,
-                dependencies=["snowflake-snowpark-python"] + self._get_dependencies(),
+                dependencies=["snowflake-snowpark-python"] + self._deps,
                 score_sproc_imports=['sklearn'],
             )
         elif isinstance(dataset, pd.DataFrame):
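
In 1.3.0 the `score()` path built its dependency list from `self._get_dependencies()` without any up-front check. In 1.4.0 it first calls `_batch_inference_validate_snowpark`, which verifies the estimator is fitted and a session exists, and returns the packages resolved against the Snowflake Anaconda channel; that validated list is cached on `self._deps` and reused when building the scoring stored procedure. A tiny runnable sketch of the new dependency assembly (the helper name is hypothetical, illustrative only):

    from typing import List

    def score_sproc_dependencies(validated_packages: List[str]) -> List[str]:
        # 1.4.0 seeds the score sproc from packages already confirmed to exist
        # in the Snowflake Anaconda channel, instead of the raw dependency list.
        return ["snowflake-snowpark-python"] + validated_packages

    print(score_sproc_dependencies(["scikit-learn==1.3.0"]))
    # ['snowflake-snowpark-python', 'scikit-learn==1.3.0']
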
@@ -1027,9 +999,9 @@ class OPTICS(BaseTransformer):
             transform_kwargs = dict(
                 session = dataset._session,
                 dependencies = self._deps,
-                pass_through_cols = self._get_pass_through_columns(dataset),
-                expected_output_cols_type = "array",
-                n_neighbors = n_neighbors,
+                drop_input_cols = self._drop_input_cols,
+                expected_output_cols_type="array",
+                n_neighbors = n_neighbors,
                 return_distance = return_distance
             )
         elif isinstance(dataset, pd.DataFrame):
snowflake/ml/modeling/cluster/spectral_biclustering.py

@@ -322,18 +322,24 @@ class SpectralBiclustering(BaseTransformer):
         self._get_model_signatures(dataset)
         return self
 
-    def _get_pass_through_columns(self, dataset: DataFrame) -> List[str]:
-        if self._drop_input_cols:
-            return []
-        else:
-            return list(set(dataset.columns) - set(self.output_cols))
-
     def _batch_inference_validate_snowpark(
         self,
         dataset: DataFrame,
         inference_method: str,
     ) -> List[str]:
-        """Util method to run validate that batch inference can be run on a snowpark dataframe.
+        """Util method to run validate that batch inference can be run on a snowpark dataframe and
+        return the available package that exists in the snowflake anaconda channel
+
+        Args:
+            dataset: snowpark dataframe
+            inference_method: the inference method such as predict, score...
+
+        Raises:
+            SnowflakeMLException: If the estimator is not fitted, raise error
+            SnowflakeMLException: If the session is None, raise error
+
+        Returns:
+            A list of available package that exists in the snowflake anaconda channel
        """
        if not self._is_fitted:
            raise exceptions.SnowflakeMLException(
@@ -405,7 +411,7 @@ class SpectralBiclustering(BaseTransformer):
             transform_kwargs = dict(
                 session = dataset._session,
                 dependencies = self._deps,
-                pass_through_cols = self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type = expected_type_inferred,
             )
 
@@ -465,16 +471,16 @@ class SpectralBiclustering(BaseTransformer):
             # from (n_samples, n_estimators) to (n_samples, n_estimators * n_classes) (and everything in between)
             # based on init param values. We will convert that to pandas dataframe of shape (n_samples, 1) with
             # each row containing a list of values.
-            expected_dtype = "ARRAY"
+            expected_dtype = "array"
 
             # If we were unable to assign a type to this transform in the factory, infer the type here.
             if expected_dtype == "":
-                # If this is a clustering transformer, if the number of output columns does not equal the number of clusters the response will be an "ARRAY"
+                # If this is a clustering transformer, if the number of output columns does not equal the number of clusters the response will be an "array"
                 if hasattr(self._sklearn_object, "n_clusters") and getattr(self._sklearn_object, "n_clusters") != len(self.output_cols):
-                    expected_dtype = "ARRAY"
-                # If this is a decomposition transformer, if the number of output columns does not equal the number of components the response will be an "ARRAY"
+                    expected_dtype = "array"
+                # If this is a decomposition transformer, if the number of output columns does not equal the number of components the response will be an "array"
                 elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
-                    expected_dtype = "ARRAY"
+                    expected_dtype = "array"
                 else:
                     output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
                     # We can only infer the output types from the input types if the following two statemetns are true:
@@ -492,7 +498,7 @@ class SpectralBiclustering(BaseTransformer):
             transform_kwargs = dict(
                 session = dataset._session,
                 dependencies = self._deps,
-                pass_through_cols = self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type = expected_dtype,
             )
 
@@ -543,7 +549,7 @@ class SpectralBiclustering(BaseTransformer):
             subproject=_SUBPROJECT,
         )
         output_result, fitted_estimator = model_trainer.train_fit_predict(
-            pass_through_columns=self._get_pass_through_columns(dataset),
+            drop_input_cols=self._drop_input_cols,
             expected_output_cols_list=self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix),
         )
         self._sklearn_object = fitted_estimator
@@ -561,44 +567,6 @@ class SpectralBiclustering(BaseTransformer):
         assert self._sklearn_object is not None
         return self._sklearn_object.embedding_
 
-
-    def _get_output_column_names(self, output_cols_prefix: str, output_cols: Optional[List[str]] = None) -> List[str]:
-        """ Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
-        Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
-        """
-        output_cols_prefix = identifier.resolve_identifier(output_cols_prefix)
-        if output_cols:
-            output_cols = [
-                identifier.concat_names([output_cols_prefix, identifier.resolve_identifier(c)])
-                for c in output_cols
-            ]
-        elif getattr(self._sklearn_object, "classes_", None) is None:
-            output_cols = [output_cols_prefix]
-        elif self._sklearn_object is not None:
-            classes = self._sklearn_object.classes_
-            if isinstance(classes, numpy.ndarray):
-                output_cols = [f'{output_cols_prefix}{str(c)}' for c in classes.tolist()]
-            elif isinstance(classes, list) and len(classes) > 0 and isinstance(classes[0], numpy.ndarray):
-                # If the estimator is a multioutput estimator, classes_ will be a list of ndarrays.
-                output_cols = []
-                for i, cl in enumerate(classes):
-                    # For binary classification, there is only one output column for each class
-                    # ndarray as the two classes are complementary.
-                    if len(cl) == 2:
-                        output_cols.append(f'{output_cols_prefix}{i}_{cl[0]}')
-                    else:
-                        output_cols.extend([
-                            f'{output_cols_prefix}{i}_{c}' for c in cl.tolist()
-                        ])
-            else:
-                output_cols = []
-
-        # Make sure column names are valid snowflake identifiers.
-        assert output_cols is not None  # Make MyPy happy
-        rv = [identifier.rename_to_valid_snowflake_identifier(c) for c in output_cols]
-
-        return rv
-
     @available_if(original_estimator_has_callable("predict_proba"))  # type: ignore[misc]
     @telemetry.send_api_usage_telemetry(
         project=_PROJECT,
@@ -638,7 +606,7 @@ class SpectralBiclustering(BaseTransformer):
             transform_kwargs = dict(
                 session=dataset._session,
                 dependencies=self._deps,
-                pass_through_cols=self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type="float",
             )
 
@@ -703,7 +671,7 @@ class SpectralBiclustering(BaseTransformer):
             transform_kwargs = dict(
                 session=dataset._session,
                 dependencies=self._deps,
-                pass_through_cols=self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type="float",
             )
         elif isinstance(dataset, pd.DataFrame):
@@ -764,7 +732,7 @@ class SpectralBiclustering(BaseTransformer):
             transform_kwargs = dict(
                 session=dataset._session,
                 dependencies=self._deps,
-                pass_through_cols=self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type="float",
             )
 
@@ -829,7 +797,7 @@ class SpectralBiclustering(BaseTransformer):
             transform_kwargs = dict(
                 session=dataset._session,
                 dependencies=self._deps,
-                pass_through_cols=self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type="float",
             )
 
@@ -883,13 +851,17 @@ class SpectralBiclustering(BaseTransformer):
         transform_kwargs: ScoreKwargsTypedDict = dict()
 
        if isinstance(dataset, DataFrame):
+            self._deps = self._batch_inference_validate_snowpark(
+                dataset=dataset,
+                inference_method="score",
+            )
             selected_cols = self._get_active_columns()
             if len(selected_cols) > 0:
                 dataset = dataset.select(selected_cols)
             assert isinstance(dataset._session, Session)  # keep mypy happy
             transform_kwargs = dict(
                 session=dataset._session,
-                dependencies=["snowflake-snowpark-python"] + self._get_dependencies(),
+                dependencies=["snowflake-snowpark-python"] + self._deps,
                 score_sproc_imports=['sklearn'],
             )
         elif isinstance(dataset, pd.DataFrame):
@@ -963,9 +935,9 @@ class SpectralBiclustering(BaseTransformer):
             transform_kwargs = dict(
                 session = dataset._session,
                 dependencies = self._deps,
-                pass_through_cols = self._get_pass_through_columns(dataset),
-                expected_output_cols_type = "array",
-                n_neighbors = n_neighbors,
+                drop_input_cols = self._drop_input_cols,
+                expected_output_cols_type="array",
+                n_neighbors = n_neighbors,
                 return_distance = return_distance
             )
         elif isinstance(dataset, pd.DataFrame):
snowflake/ml/modeling/cluster/spectral_clustering.py

@@ -380,18 +380,24 @@ class SpectralClustering(BaseTransformer):
         self._get_model_signatures(dataset)
         return self
 
-    def _get_pass_through_columns(self, dataset: DataFrame) -> List[str]:
-        if self._drop_input_cols:
-            return []
-        else:
-            return list(set(dataset.columns) - set(self.output_cols))
-
     def _batch_inference_validate_snowpark(
         self,
         dataset: DataFrame,
         inference_method: str,
     ) -> List[str]:
-        """Util method to run validate that batch inference can be run on a snowpark dataframe.
+        """Util method to run validate that batch inference can be run on a snowpark dataframe and
+        return the available package that exists in the snowflake anaconda channel
+
+        Args:
+            dataset: snowpark dataframe
+            inference_method: the inference method such as predict, score...
+
+        Raises:
+            SnowflakeMLException: If the estimator is not fitted, raise error
+            SnowflakeMLException: If the session is None, raise error
+
+        Returns:
+            A list of available package that exists in the snowflake anaconda channel
        """
        if not self._is_fitted:
            raise exceptions.SnowflakeMLException(
@@ -463,7 +469,7 @@ class SpectralClustering(BaseTransformer):
             transform_kwargs = dict(
                 session = dataset._session,
                 dependencies = self._deps,
-                pass_through_cols = self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type = expected_type_inferred,
             )
 
@@ -523,16 +529,16 @@ class SpectralClustering(BaseTransformer):
             # from (n_samples, n_estimators) to (n_samples, n_estimators * n_classes) (and everything in between)
             # based on init param values. We will convert that to pandas dataframe of shape (n_samples, 1) with
             # each row containing a list of values.
-            expected_dtype = "ARRAY"
+            expected_dtype = "array"
 
             # If we were unable to assign a type to this transform in the factory, infer the type here.
             if expected_dtype == "":
-                # If this is a clustering transformer, if the number of output columns does not equal the number of clusters the response will be an "ARRAY"
+                # If this is a clustering transformer, if the number of output columns does not equal the number of clusters the response will be an "array"
                 if hasattr(self._sklearn_object, "n_clusters") and getattr(self._sklearn_object, "n_clusters") != len(self.output_cols):
-                    expected_dtype = "ARRAY"
-                # If this is a decomposition transformer, if the number of output columns does not equal the number of components the response will be an "ARRAY"
+                    expected_dtype = "array"
+                # If this is a decomposition transformer, if the number of output columns does not equal the number of components the response will be an "array"
                 elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
-                    expected_dtype = "ARRAY"
+                    expected_dtype = "array"
                 else:
                     output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
                     # We can only infer the output types from the input types if the following two statemetns are true:
@@ -550,7 +556,7 @@ class SpectralClustering(BaseTransformer):
             transform_kwargs = dict(
                 session = dataset._session,
                 dependencies = self._deps,
-                pass_through_cols = self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type = expected_dtype,
             )
 
@@ -603,7 +609,7 @@ class SpectralClustering(BaseTransformer):
             subproject=_SUBPROJECT,
         )
         output_result, fitted_estimator = model_trainer.train_fit_predict(
-            pass_through_columns=self._get_pass_through_columns(dataset),
+            drop_input_cols=self._drop_input_cols,
             expected_output_cols_list=self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix),
         )
         self._sklearn_object = fitted_estimator
@@ -621,44 +627,6 @@ class SpectralClustering(BaseTransformer):
         assert self._sklearn_object is not None
         return self._sklearn_object.embedding_
 
-
-    def _get_output_column_names(self, output_cols_prefix: str, output_cols: Optional[List[str]] = None) -> List[str]:
-        """ Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
-        Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
-        """
-        output_cols_prefix = identifier.resolve_identifier(output_cols_prefix)
-        if output_cols:
-            output_cols = [
-                identifier.concat_names([output_cols_prefix, identifier.resolve_identifier(c)])
-                for c in output_cols
-            ]
-        elif getattr(self._sklearn_object, "classes_", None) is None:
-            output_cols = [output_cols_prefix]
-        elif self._sklearn_object is not None:
-            classes = self._sklearn_object.classes_
-            if isinstance(classes, numpy.ndarray):
-                output_cols = [f'{output_cols_prefix}{str(c)}' for c in classes.tolist()]
-            elif isinstance(classes, list) and len(classes) > 0 and isinstance(classes[0], numpy.ndarray):
-                # If the estimator is a multioutput estimator, classes_ will be a list of ndarrays.
-                output_cols = []
-                for i, cl in enumerate(classes):
-                    # For binary classification, there is only one output column for each class
-                    # ndarray as the two classes are complementary.
-                    if len(cl) == 2:
-                        output_cols.append(f'{output_cols_prefix}{i}_{cl[0]}')
-                    else:
-                        output_cols.extend([
-                            f'{output_cols_prefix}{i}_{c}' for c in cl.tolist()
-                        ])
-            else:
-                output_cols = []
-
-        # Make sure column names are valid snowflake identifiers.
-        assert output_cols is not None  # Make MyPy happy
-        rv = [identifier.rename_to_valid_snowflake_identifier(c) for c in output_cols]
-
-        return rv
-
     @available_if(original_estimator_has_callable("predict_proba"))  # type: ignore[misc]
     @telemetry.send_api_usage_telemetry(
         project=_PROJECT,
@@ -698,7 +666,7 @@ class SpectralClustering(BaseTransformer):
             transform_kwargs = dict(
                 session=dataset._session,
                 dependencies=self._deps,
-                pass_through_cols=self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type="float",
             )
 
@@ -763,7 +731,7 @@ class SpectralClustering(BaseTransformer):
             transform_kwargs = dict(
                 session=dataset._session,
                 dependencies=self._deps,
-                pass_through_cols=self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type="float",
             )
         elif isinstance(dataset, pd.DataFrame):
@@ -824,7 +792,7 @@ class SpectralClustering(BaseTransformer):
             transform_kwargs = dict(
                 session=dataset._session,
                 dependencies=self._deps,
-                pass_through_cols=self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type="float",
             )
 
@@ -889,7 +857,7 @@ class SpectralClustering(BaseTransformer):
             transform_kwargs = dict(
                 session=dataset._session,
                 dependencies=self._deps,
-                pass_through_cols=self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type="float",
             )
 
@@ -943,13 +911,17 @@ class SpectralClustering(BaseTransformer):
         transform_kwargs: ScoreKwargsTypedDict = dict()
 
        if isinstance(dataset, DataFrame):
+            self._deps = self._batch_inference_validate_snowpark(
+                dataset=dataset,
+                inference_method="score",
+            )
             selected_cols = self._get_active_columns()
             if len(selected_cols) > 0:
                 dataset = dataset.select(selected_cols)
             assert isinstance(dataset._session, Session)  # keep mypy happy
             transform_kwargs = dict(
                 session=dataset._session,
-                dependencies=["snowflake-snowpark-python"] + self._get_dependencies(),
+                dependencies=["snowflake-snowpark-python"] + self._deps,
                 score_sproc_imports=['sklearn'],
             )
         elif isinstance(dataset, pd.DataFrame):
@@ -1023,9 +995,9 @@ class SpectralClustering(BaseTransformer):
             transform_kwargs = dict(
                 session = dataset._session,
                 dependencies = self._deps,
-                pass_through_cols = self._get_pass_through_columns(dataset),
-                expected_output_cols_type = "array",
-                n_neighbors = n_neighbors,
+                drop_input_cols = self._drop_input_cols,
+                expected_output_cols_type="array",
+                n_neighbors = n_neighbors,
                 return_distance = return_distance
             )
         elif isinstance(dataset, pd.DataFrame):