snowflake-ml-python 1.3.0__py3-none-any.whl → 1.4.0__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
Files changed (211)
  1. snowflake/ml/_internal/file_utils.py +3 -3
  2. snowflake/ml/_internal/human_readable_id/adjectives.txt +128 -0
  3. snowflake/ml/_internal/human_readable_id/animals.txt +128 -0
  4. snowflake/ml/_internal/human_readable_id/hrid_generator.py +40 -0
  5. snowflake/ml/_internal/human_readable_id/hrid_generator_base.py +135 -0
  6. snowflake/ml/_internal/telemetry.py +11 -2
  7. snowflake/ml/_internal/utils/formatting.py +1 -1
  8. snowflake/ml/feature_store/feature_store.py +15 -106
  9. snowflake/ml/fileset/sfcfs.py +4 -3
  10. snowflake/ml/fileset/stage_fs.py +18 -0
  11. snowflake/ml/model/_api.py +9 -9
  12. snowflake/ml/model/_client/model/model_version_impl.py +20 -15
  13. snowflake/ml/model/_deploy_client/image_builds/docker_context.py +3 -9
  14. snowflake/ml/model/_deploy_client/image_builds/server_image_builder.py +3 -5
  15. snowflake/ml/model/_deploy_client/snowservice/deploy.py +7 -6
  16. snowflake/ml/model/_model_composer/model_composer.py +10 -8
  17. snowflake/ml/model/_model_composer/model_method/function_generator.py +1 -1
  18. snowflake/ml/model/_model_composer/model_method/infer_table_function.py_template +2 -1
  19. snowflake/ml/model/_model_composer/model_method/model_method.py +2 -2
  20. snowflake/ml/model/_model_composer/model_runtime/_runtime_requirements.py +1 -1
  21. snowflake/ml/model/_packager/model_handlers/_base.py +2 -2
  22. snowflake/ml/model/_packager/model_handlers/_utils.py +5 -5
  23. snowflake/ml/model/_packager/model_handlers/custom.py +7 -7
  24. snowflake/ml/model/_packager/model_handlers/huggingface_pipeline.py +2 -2
  25. snowflake/ml/model/_packager/model_handlers/llm.py +1 -1
  26. snowflake/ml/model/_packager/model_handlers/mlflow.py +1 -1
  27. snowflake/ml/model/_packager/model_handlers/pytorch.py +13 -10
  28. snowflake/ml/model/_packager/model_handlers/sentence_transformers.py +214 -0
  29. snowflake/ml/model/_packager/model_handlers/sklearn.py +6 -6
  30. snowflake/ml/model/_packager/model_handlers/snowmlmodel.py +15 -3
  31. snowflake/ml/model/_packager/model_handlers/tensorflow.py +8 -8
  32. snowflake/ml/model/_packager/model_handlers/torchscript.py +7 -7
  33. snowflake/ml/model/_packager/model_handlers/xgboost.py +8 -8
  34. snowflake/ml/model/_packager/model_meta/_core_requirements.py +1 -1
  35. snowflake/ml/model/_packager/model_packager.py +8 -6
  36. snowflake/ml/model/custom_model.py +3 -1
  37. snowflake/ml/model/type_hints.py +13 -0
  38. snowflake/ml/modeling/_internal/estimator_utils.py +61 -1
  39. snowflake/ml/modeling/_internal/local_implementations/pandas_handlers.py +4 -43
  40. snowflake/ml/modeling/_internal/local_implementations/pandas_trainer.py +4 -4
  41. snowflake/ml/modeling/_internal/ml_runtime_implementations/ml_runtime_handlers.py +21 -17
  42. snowflake/ml/modeling/_internal/model_specifications.py +3 -1
  43. snowflake/ml/modeling/_internal/model_trainer.py +2 -2
  44. snowflake/ml/modeling/_internal/snowpark_implementations/distributed_hpo_trainer.py +547 -1
  45. snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_handlers.py +67 -114
  46. snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_trainer.py +9 -9
  47. snowflake/ml/modeling/_internal/transformer_protocols.py +2 -3
  48. snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +33 -61
  49. snowflake/ml/modeling/cluster/affinity_propagation.py +33 -61
  50. snowflake/ml/modeling/cluster/agglomerative_clustering.py +33 -61
  51. snowflake/ml/modeling/cluster/birch.py +33 -61
  52. snowflake/ml/modeling/cluster/bisecting_k_means.py +33 -61
  53. snowflake/ml/modeling/cluster/dbscan.py +33 -61
  54. snowflake/ml/modeling/cluster/feature_agglomeration.py +33 -61
  55. snowflake/ml/modeling/cluster/k_means.py +33 -61
  56. snowflake/ml/modeling/cluster/mean_shift.py +33 -61
  57. snowflake/ml/modeling/cluster/mini_batch_k_means.py +33 -61
  58. snowflake/ml/modeling/cluster/optics.py +33 -61
  59. snowflake/ml/modeling/cluster/spectral_biclustering.py +33 -61
  60. snowflake/ml/modeling/cluster/spectral_clustering.py +33 -61
  61. snowflake/ml/modeling/cluster/spectral_coclustering.py +33 -61
  62. snowflake/ml/modeling/compose/column_transformer.py +33 -61
  63. snowflake/ml/modeling/compose/transformed_target_regressor.py +33 -61
  64. snowflake/ml/modeling/covariance/elliptic_envelope.py +33 -61
  65. snowflake/ml/modeling/covariance/empirical_covariance.py +33 -61
  66. snowflake/ml/modeling/covariance/graphical_lasso.py +33 -61
  67. snowflake/ml/modeling/covariance/graphical_lasso_cv.py +33 -61
  68. snowflake/ml/modeling/covariance/ledoit_wolf.py +33 -61
  69. snowflake/ml/modeling/covariance/min_cov_det.py +33 -61
  70. snowflake/ml/modeling/covariance/oas.py +33 -61
  71. snowflake/ml/modeling/covariance/shrunk_covariance.py +33 -61
  72. snowflake/ml/modeling/decomposition/dictionary_learning.py +33 -61
  73. snowflake/ml/modeling/decomposition/factor_analysis.py +33 -61
  74. snowflake/ml/modeling/decomposition/fast_ica.py +33 -61
  75. snowflake/ml/modeling/decomposition/incremental_pca.py +33 -61
  76. snowflake/ml/modeling/decomposition/kernel_pca.py +33 -61
  77. snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +33 -61
  78. snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +33 -61
  79. snowflake/ml/modeling/decomposition/pca.py +33 -61
  80. snowflake/ml/modeling/decomposition/sparse_pca.py +33 -61
  81. snowflake/ml/modeling/decomposition/truncated_svd.py +33 -61
  82. snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +33 -61
  83. snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +33 -61
  84. snowflake/ml/modeling/ensemble/ada_boost_classifier.py +33 -61
  85. snowflake/ml/modeling/ensemble/ada_boost_regressor.py +33 -61
  86. snowflake/ml/modeling/ensemble/bagging_classifier.py +33 -61
  87. snowflake/ml/modeling/ensemble/bagging_regressor.py +33 -61
  88. snowflake/ml/modeling/ensemble/extra_trees_classifier.py +33 -61
  89. snowflake/ml/modeling/ensemble/extra_trees_regressor.py +33 -61
  90. snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +33 -61
  91. snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +33 -61
  92. snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +33 -61
  93. snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +33 -61
  94. snowflake/ml/modeling/ensemble/isolation_forest.py +33 -61
  95. snowflake/ml/modeling/ensemble/random_forest_classifier.py +33 -61
  96. snowflake/ml/modeling/ensemble/random_forest_regressor.py +33 -61
  97. snowflake/ml/modeling/ensemble/stacking_regressor.py +33 -61
  98. snowflake/ml/modeling/ensemble/voting_classifier.py +33 -61
  99. snowflake/ml/modeling/ensemble/voting_regressor.py +33 -61
  100. snowflake/ml/modeling/feature_selection/generic_univariate_select.py +33 -61
  101. snowflake/ml/modeling/feature_selection/select_fdr.py +33 -61
  102. snowflake/ml/modeling/feature_selection/select_fpr.py +33 -61
  103. snowflake/ml/modeling/feature_selection/select_fwe.py +33 -61
  104. snowflake/ml/modeling/feature_selection/select_k_best.py +33 -61
  105. snowflake/ml/modeling/feature_selection/select_percentile.py +33 -61
  106. snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +33 -61
  107. snowflake/ml/modeling/feature_selection/variance_threshold.py +33 -61
  108. snowflake/ml/modeling/framework/base.py +55 -5
  109. snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +33 -61
  110. snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +33 -61
  111. snowflake/ml/modeling/impute/iterative_imputer.py +33 -61
  112. snowflake/ml/modeling/impute/knn_imputer.py +33 -61
  113. snowflake/ml/modeling/impute/missing_indicator.py +33 -61
  114. snowflake/ml/modeling/impute/simple_imputer.py +4 -15
  115. snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +33 -61
  116. snowflake/ml/modeling/kernel_approximation/nystroem.py +33 -61
  117. snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +33 -61
  118. snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +33 -61
  119. snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +33 -61
  120. snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +33 -61
  121. snowflake/ml/modeling/lightgbm/lgbm_classifier.py +36 -63
  122. snowflake/ml/modeling/lightgbm/lgbm_regressor.py +36 -63
  123. snowflake/ml/modeling/linear_model/ard_regression.py +33 -61
  124. snowflake/ml/modeling/linear_model/bayesian_ridge.py +33 -61
  125. snowflake/ml/modeling/linear_model/elastic_net.py +33 -61
  126. snowflake/ml/modeling/linear_model/elastic_net_cv.py +33 -61
  127. snowflake/ml/modeling/linear_model/gamma_regressor.py +33 -61
  128. snowflake/ml/modeling/linear_model/huber_regressor.py +33 -61
  129. snowflake/ml/modeling/linear_model/lars.py +33 -61
  130. snowflake/ml/modeling/linear_model/lars_cv.py +33 -61
  131. snowflake/ml/modeling/linear_model/lasso.py +33 -61
  132. snowflake/ml/modeling/linear_model/lasso_cv.py +33 -61
  133. snowflake/ml/modeling/linear_model/lasso_lars.py +33 -61
  134. snowflake/ml/modeling/linear_model/lasso_lars_cv.py +33 -61
  135. snowflake/ml/modeling/linear_model/lasso_lars_ic.py +33 -61
  136. snowflake/ml/modeling/linear_model/linear_regression.py +33 -61
  137. snowflake/ml/modeling/linear_model/logistic_regression.py +33 -61
  138. snowflake/ml/modeling/linear_model/logistic_regression_cv.py +33 -61
  139. snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +33 -61
  140. snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +33 -61
  141. snowflake/ml/modeling/linear_model/multi_task_lasso.py +33 -61
  142. snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +33 -61
  143. snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +33 -61
  144. snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +33 -61
  145. snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +33 -61
  146. snowflake/ml/modeling/linear_model/perceptron.py +33 -61
  147. snowflake/ml/modeling/linear_model/poisson_regressor.py +33 -61
  148. snowflake/ml/modeling/linear_model/ransac_regressor.py +33 -61
  149. snowflake/ml/modeling/linear_model/ridge.py +33 -61
  150. snowflake/ml/modeling/linear_model/ridge_classifier.py +33 -61
  151. snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +33 -61
  152. snowflake/ml/modeling/linear_model/ridge_cv.py +33 -61
  153. snowflake/ml/modeling/linear_model/sgd_classifier.py +33 -61
  154. snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +33 -61
  155. snowflake/ml/modeling/linear_model/sgd_regressor.py +33 -61
  156. snowflake/ml/modeling/linear_model/theil_sen_regressor.py +33 -61
  157. snowflake/ml/modeling/linear_model/tweedie_regressor.py +33 -61
  158. snowflake/ml/modeling/manifold/isomap.py +33 -61
  159. snowflake/ml/modeling/manifold/mds.py +33 -61
  160. snowflake/ml/modeling/manifold/spectral_embedding.py +33 -61
  161. snowflake/ml/modeling/manifold/tsne.py +33 -61
  162. snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +33 -61
  163. snowflake/ml/modeling/mixture/gaussian_mixture.py +33 -61
  164. snowflake/ml/modeling/model_selection/grid_search_cv.py +39 -57
  165. snowflake/ml/modeling/model_selection/randomized_search_cv.py +26 -57
  166. snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +33 -61
  167. snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +33 -61
  168. snowflake/ml/modeling/multiclass/output_code_classifier.py +33 -61
  169. snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +33 -61
  170. snowflake/ml/modeling/naive_bayes/categorical_nb.py +33 -61
  171. snowflake/ml/modeling/naive_bayes/complement_nb.py +33 -61
  172. snowflake/ml/modeling/naive_bayes/gaussian_nb.py +33 -61
  173. snowflake/ml/modeling/naive_bayes/multinomial_nb.py +33 -61
  174. snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +33 -61
  175. snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +33 -61
  176. snowflake/ml/modeling/neighbors/kernel_density.py +33 -61
  177. snowflake/ml/modeling/neighbors/local_outlier_factor.py +33 -61
  178. snowflake/ml/modeling/neighbors/nearest_centroid.py +33 -61
  179. snowflake/ml/modeling/neighbors/nearest_neighbors.py +33 -61
  180. snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +33 -61
  181. snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +33 -61
  182. snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +33 -61
  183. snowflake/ml/modeling/neural_network/bernoulli_rbm.py +33 -61
  184. snowflake/ml/modeling/neural_network/mlp_classifier.py +33 -61
  185. snowflake/ml/modeling/neural_network/mlp_regressor.py +33 -61
  186. snowflake/ml/modeling/preprocessing/polynomial_features.py +33 -61
  187. snowflake/ml/modeling/semi_supervised/label_propagation.py +33 -61
  188. snowflake/ml/modeling/semi_supervised/label_spreading.py +33 -61
  189. snowflake/ml/modeling/svm/linear_svc.py +33 -61
  190. snowflake/ml/modeling/svm/linear_svr.py +33 -61
  191. snowflake/ml/modeling/svm/nu_svc.py +33 -61
  192. snowflake/ml/modeling/svm/nu_svr.py +33 -61
  193. snowflake/ml/modeling/svm/svc.py +33 -61
  194. snowflake/ml/modeling/svm/svr.py +33 -61
  195. snowflake/ml/modeling/tree/decision_tree_classifier.py +33 -61
  196. snowflake/ml/modeling/tree/decision_tree_regressor.py +33 -61
  197. snowflake/ml/modeling/tree/extra_tree_classifier.py +33 -61
  198. snowflake/ml/modeling/tree/extra_tree_regressor.py +33 -61
  199. snowflake/ml/modeling/xgboost/xgb_classifier.py +33 -61
  200. snowflake/ml/modeling/xgboost/xgb_regressor.py +33 -61
  201. snowflake/ml/modeling/xgboost/xgbrf_classifier.py +33 -61
  202. snowflake/ml/modeling/xgboost/xgbrf_regressor.py +33 -61
  203. snowflake/ml/registry/_manager/model_manager.py +6 -2
  204. snowflake/ml/registry/model_registry.py +100 -27
  205. snowflake/ml/registry/registry.py +6 -2
  206. snowflake/ml/version.py +1 -1
  207. {snowflake_ml_python-1.3.0.dist-info → snowflake_ml_python-1.4.0.dist-info}/METADATA +43 -7
  208. {snowflake_ml_python-1.3.0.dist-info → snowflake_ml_python-1.4.0.dist-info}/RECORD +211 -206
  209. {snowflake_ml_python-1.3.0.dist-info → snowflake_ml_python-1.4.0.dist-info}/LICENSE.txt +0 -0
  210. {snowflake_ml_python-1.3.0.dist-info → snowflake_ml_python-1.4.0.dist-info}/WHEEL +0 -0
  211. {snowflake_ml_python-1.3.0.dist-info → snowflake_ml_python-1.4.0.dist-info}/top_level.txt +0 -0
snowflake/ml/modeling/kernel_ridge/kernel_ridge.py
@@ -305,18 +305,24 @@ class KernelRidge(BaseTransformer):
         self._get_model_signatures(dataset)
         return self
 
-    def _get_pass_through_columns(self, dataset: DataFrame) -> List[str]:
-        if self._drop_input_cols:
-            return []
-        else:
-            return list(set(dataset.columns) - set(self.output_cols))
-
     def _batch_inference_validate_snowpark(
         self,
         dataset: DataFrame,
         inference_method: str,
     ) -> List[str]:
-        """Util method to run validate that batch inference can be run on a snowpark dataframe.
+        """Util method to run validate that batch inference can be run on a snowpark dataframe and
+        return the available package that exists in the snowflake anaconda channel
+
+        Args:
+            dataset: snowpark dataframe
+            inference_method: the inference method such as predict, score...
+
+        Raises:
+            SnowflakeMLException: If the estimator is not fitted, raise error
+            SnowflakeMLException: If the session is None, raise error
+
+        Returns:
+            A list of available package that exists in the snowflake anaconda channel
         """
         if not self._is_fitted:
             raise exceptions.SnowflakeMLException(
@@ -390,7 +396,7 @@ class KernelRidge(BaseTransformer):
             transform_kwargs = dict(
                 session = dataset._session,
                 dependencies = self._deps,
-                pass_through_cols = self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type = expected_type_inferred,
             )
 
@@ -450,16 +456,16 @@ class KernelRidge(BaseTransformer):
             # from (n_samples, n_estimators) to (n_samples, n_estimators * n_classes) (and everything in between)
             # based on init param values. We will convert that to pandas dataframe of shape (n_samples, 1) with
             # each row containing a list of values.
-            expected_dtype = "ARRAY"
+            expected_dtype = "array"
 
         # If we were unable to assign a type to this transform in the factory, infer the type here.
         if expected_dtype == "":
-            # If this is a clustering transformer, if the number of output columns does not equal the number of clusters the response will be an "ARRAY"
+            # If this is a clustering transformer, if the number of output columns does not equal the number of clusters the response will be an "array"
             if hasattr(self._sklearn_object, "n_clusters") and getattr(self._sklearn_object, "n_clusters") != len(self.output_cols):
-                expected_dtype = "ARRAY"
-            # If this is a decomposition transformer, if the number of output columns does not equal the number of components the response will be an "ARRAY"
+                expected_dtype = "array"
+            # If this is a decomposition transformer, if the number of output columns does not equal the number of components the response will be an "array"
             elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
-                expected_dtype = "ARRAY"
+                expected_dtype = "array"
             else:
                 output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
                 # We can only infer the output types from the input types if the following two statemetns are true:
@@ -477,7 +483,7 @@ class KernelRidge(BaseTransformer):
             transform_kwargs = dict(
                 session = dataset._session,
                 dependencies = self._deps,
-                pass_through_cols = self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type = expected_dtype,
             )
 
@@ -528,7 +534,7 @@ class KernelRidge(BaseTransformer):
                 subproject=_SUBPROJECT,
             )
             output_result, fitted_estimator = model_trainer.train_fit_predict(
-                pass_through_columns=self._get_pass_through_columns(dataset),
+                drop_input_cols=self._drop_input_cols,
                 expected_output_cols_list=self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix),
             )
             self._sklearn_object = fitted_estimator
@@ -546,44 +552,6 @@ class KernelRidge(BaseTransformer):
         assert self._sklearn_object is not None
         return self._sklearn_object.embedding_
 
-
-    def _get_output_column_names(self, output_cols_prefix: str, output_cols: Optional[List[str]] = None) -> List[str]:
-        """ Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
-            Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
-        """
-        output_cols_prefix = identifier.resolve_identifier(output_cols_prefix)
-        if output_cols:
-            output_cols = [
-                identifier.concat_names([output_cols_prefix, identifier.resolve_identifier(c)])
-                for c in output_cols
-            ]
-        elif getattr(self._sklearn_object, "classes_", None) is None:
-            output_cols = [output_cols_prefix]
-        elif self._sklearn_object is not None:
-            classes = self._sklearn_object.classes_
-            if isinstance(classes, numpy.ndarray):
-                output_cols = [f'{output_cols_prefix}{str(c)}' for c in classes.tolist()]
-            elif isinstance(classes, list) and len(classes) > 0 and isinstance(classes[0], numpy.ndarray):
-                # If the estimator is a multioutput estimator, classes_ will be a list of ndarrays.
-                output_cols = []
-                for i, cl in enumerate(classes):
-                    # For binary classification, there is only one output column for each class
-                    # ndarray as the two classes are complementary.
-                    if len(cl) == 2:
-                        output_cols.append(f'{output_cols_prefix}{i}_{cl[0]}')
-                    else:
-                        output_cols.extend([
-                            f'{output_cols_prefix}{i}_{c}' for c in cl.tolist()
-                        ])
-        else:
-            output_cols = []
-
-        # Make sure column names are valid snowflake identifiers.
-        assert output_cols is not None # Make MyPy happy
-        rv = [identifier.rename_to_valid_snowflake_identifier(c) for c in output_cols]
-
-        return rv
-
     @available_if(original_estimator_has_callable("predict_proba")) # type: ignore[misc]
     @telemetry.send_api_usage_telemetry(
         project=_PROJECT,
@@ -623,7 +591,7 @@ class KernelRidge(BaseTransformer):
             transform_kwargs = dict(
                 session=dataset._session,
                 dependencies=self._deps,
-                pass_through_cols=self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type="float",
             )
 
@@ -688,7 +656,7 @@ class KernelRidge(BaseTransformer):
             transform_kwargs = dict(
                 session=dataset._session,
                 dependencies=self._deps,
-                pass_through_cols=self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type="float",
             )
         elif isinstance(dataset, pd.DataFrame):
@@ -749,7 +717,7 @@ class KernelRidge(BaseTransformer):
             transform_kwargs = dict(
                 session=dataset._session,
                 dependencies=self._deps,
-                pass_through_cols=self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type="float",
             )
 
@@ -814,7 +782,7 @@ class KernelRidge(BaseTransformer):
             transform_kwargs = dict(
                 session=dataset._session,
                 dependencies=self._deps,
-                pass_through_cols=self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type="float",
             )
 
@@ -870,13 +838,17 @@ class KernelRidge(BaseTransformer):
         transform_kwargs: ScoreKwargsTypedDict = dict()
 
        if isinstance(dataset, DataFrame):
+            self._deps = self._batch_inference_validate_snowpark(
+                dataset=dataset,
+                inference_method="score",
+            )
             selected_cols = self._get_active_columns()
             if len(selected_cols) > 0:
                 dataset = dataset.select(selected_cols)
             assert isinstance(dataset._session, Session) # keep mypy happy
             transform_kwargs = dict(
                 session=dataset._session,
-                dependencies=["snowflake-snowpark-python"] + self._get_dependencies(),
+                dependencies=["snowflake-snowpark-python"] + self._deps,
                 score_sproc_imports=['sklearn'],
             )
         elif isinstance(dataset, pd.DataFrame):
@@ -950,9 +922,9 @@ class KernelRidge(BaseTransformer):
             transform_kwargs = dict(
                 session = dataset._session,
                 dependencies = self._deps,
-                pass_through_cols = self._get_pass_through_columns(dataset),
-                expected_output_cols_type = "array",
-                n_neighbors = n_neighbors,
+                drop_input_cols = self._drop_input_cols,
+                expected_output_cols_type="array",
+                n_neighbors = n_neighbors,
                 return_distance = return_distance
             )
         elif isinstance(dataset, pd.DataFrame):
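
A note on the recurring pass_through_cols → drop_input_cols change in the KernelRidge hunks above (the same edit repeats across all generated estimators in this release): 1.4.0 deletes the per-estimator _get_pass_through_columns helper and passes the boolean drop_input_cols flag to the inference handlers, which now derive the pass-through columns themselves. A minimal Python sketch of that derivation, mirroring the body of the removed helper (the standalone function name here is hypothetical):

from typing import List

def pass_through_columns(dataset_columns: List[str], output_cols: List[str], drop_input_cols: bool) -> List[str]:
    # Mirrors the removed _get_pass_through_columns: pass nothing through
    # when input columns are dropped, otherwise keep every dataset column
    # that is not an output column.
    if drop_input_cols:
        return []
    return list(set(dataset_columns) - set(output_cols))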
snowflake/ml/modeling/lightgbm/lgbm_classifier.py
@@ -16,6 +16,7 @@ from numpy import typing as npt
 
 
 import numpy
+import sklearn
 import lightgbm
 from sklearn.utils.metaestimators import available_if
 
@@ -160,7 +161,7 @@ class LGBMClassifier(BaseTransformer):
         self.set_sample_weight_col(sample_weight_col)
         self._use_external_memory_version = False
         self._batch_size = -1
-        deps: Set[str] = set([f'numpy=={np.__version__}', f'lightgbm=={lightgbm.__version__}', f'cloudpickle=={cp.__version__}'])
+        deps: Set[str] = set([f'numpy=={np.__version__}', f'lightgbm=={lightgbm.__version__}', f'cloudpickle=={cp.__version__}', f'scikit-learn=={sklearn.__version__}'])
 
         self._deps = list(deps)
 
@@ -293,18 +294,24 @@ class LGBMClassifier(BaseTransformer):
         self._get_model_signatures(dataset)
         return self
 
-    def _get_pass_through_columns(self, dataset: DataFrame) -> List[str]:
-        if self._drop_input_cols:
-            return []
-        else:
-            return list(set(dataset.columns) - set(self.output_cols))
-
     def _batch_inference_validate_snowpark(
         self,
         dataset: DataFrame,
         inference_method: str,
     ) -> List[str]:
-        """Util method to run validate that batch inference can be run on a snowpark dataframe.
+        """Util method to run validate that batch inference can be run on a snowpark dataframe and
+        return the available package that exists in the snowflake anaconda channel
+
+        Args:
+            dataset: snowpark dataframe
+            inference_method: the inference method such as predict, score...
+
+        Raises:
+            SnowflakeMLException: If the estimator is not fitted, raise error
+            SnowflakeMLException: If the session is None, raise error
+
+        Returns:
+            A list of available package that exists in the snowflake anaconda channel
         """
         if not self._is_fitted:
             raise exceptions.SnowflakeMLException(
@@ -378,7 +385,7 @@ class LGBMClassifier(BaseTransformer):
             transform_kwargs = dict(
                 session = dataset._session,
                 dependencies = self._deps,
-                pass_through_cols = self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type = expected_type_inferred,
             )
 
@@ -438,16 +445,16 @@ class LGBMClassifier(BaseTransformer):
             # from (n_samples, n_estimators) to (n_samples, n_estimators * n_classes) (and everything in between)
             # based on init param values. We will convert that to pandas dataframe of shape (n_samples, 1) with
             # each row containing a list of values.
-            expected_dtype = "ARRAY"
+            expected_dtype = "array"
 
         # If we were unable to assign a type to this transform in the factory, infer the type here.
         if expected_dtype == "":
-            # If this is a clustering transformer, if the number of output columns does not equal the number of clusters the response will be an "ARRAY"
+            # If this is a clustering transformer, if the number of output columns does not equal the number of clusters the response will be an "array"
             if hasattr(self._sklearn_object, "n_clusters") and getattr(self._sklearn_object, "n_clusters") != len(self.output_cols):
-                expected_dtype = "ARRAY"
-            # If this is a decomposition transformer, if the number of output columns does not equal the number of components the response will be an "ARRAY"
+                expected_dtype = "array"
+            # If this is a decomposition transformer, if the number of output columns does not equal the number of components the response will be an "array"
             elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
-                expected_dtype = "ARRAY"
+                expected_dtype = "array"
             else:
                 output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
                 # We can only infer the output types from the input types if the following two statemetns are true:
@@ -465,7 +472,7 @@ class LGBMClassifier(BaseTransformer):
             transform_kwargs = dict(
                 session = dataset._session,
                 dependencies = self._deps,
-                pass_through_cols = self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type = expected_dtype,
             )
 
@@ -516,7 +523,7 @@ class LGBMClassifier(BaseTransformer):
                 subproject=_SUBPROJECT,
             )
             output_result, fitted_estimator = model_trainer.train_fit_predict(
-                pass_through_columns=self._get_pass_through_columns(dataset),
+                drop_input_cols=self._drop_input_cols,
                 expected_output_cols_list=self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix),
             )
             self._sklearn_object = fitted_estimator
@@ -534,44 +541,6 @@ class LGBMClassifier(BaseTransformer):
         assert self._sklearn_object is not None
         return self._sklearn_object.embedding_
 
-
-    def _get_output_column_names(self, output_cols_prefix: str, output_cols: Optional[List[str]] = None) -> List[str]:
-        """ Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
-            Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
-        """
-        output_cols_prefix = identifier.resolve_identifier(output_cols_prefix)
-        if output_cols:
-            output_cols = [
-                identifier.concat_names([output_cols_prefix, identifier.resolve_identifier(c)])
-                for c in output_cols
-            ]
-        elif getattr(self._sklearn_object, "classes_", None) is None:
-            output_cols = [output_cols_prefix]
-        elif self._sklearn_object is not None:
-            classes = self._sklearn_object.classes_
-            if isinstance(classes, numpy.ndarray):
-                output_cols = [f'{output_cols_prefix}{str(c)}' for c in classes.tolist()]
-            elif isinstance(classes, list) and len(classes) > 0 and isinstance(classes[0], numpy.ndarray):
-                # If the estimator is a multioutput estimator, classes_ will be a list of ndarrays.
-                output_cols = []
-                for i, cl in enumerate(classes):
-                    # For binary classification, there is only one output column for each class
-                    # ndarray as the two classes are complementary.
-                    if len(cl) == 2:
-                        output_cols.append(f'{output_cols_prefix}{i}_{cl[0]}')
-                    else:
-                        output_cols.extend([
-                            f'{output_cols_prefix}{i}_{c}' for c in cl.tolist()
-                        ])
-        else:
-            output_cols = []
-
-        # Make sure column names are valid snowflake identifiers.
-        assert output_cols is not None # Make MyPy happy
-        rv = [identifier.rename_to_valid_snowflake_identifier(c) for c in output_cols]
-
-        return rv
-
     @available_if(original_estimator_has_callable("predict_proba")) # type: ignore[misc]
     @telemetry.send_api_usage_telemetry(
         project=_PROJECT,
@@ -613,7 +582,7 @@ class LGBMClassifier(BaseTransformer):
             transform_kwargs = dict(
                 session=dataset._session,
                 dependencies=self._deps,
-                pass_through_cols=self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type="float",
             )
 
@@ -680,7 +649,7 @@ class LGBMClassifier(BaseTransformer):
             transform_kwargs = dict(
                 session=dataset._session,
                 dependencies=self._deps,
-                pass_through_cols=self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type="float",
             )
         elif isinstance(dataset, pd.DataFrame):
@@ -741,7 +710,7 @@ class LGBMClassifier(BaseTransformer):
             transform_kwargs = dict(
                 session=dataset._session,
                 dependencies=self._deps,
-                pass_through_cols=self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type="float",
             )
 
@@ -806,7 +775,7 @@ class LGBMClassifier(BaseTransformer):
             transform_kwargs = dict(
                 session=dataset._session,
                 dependencies=self._deps,
-                pass_through_cols=self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type="float",
             )
 
@@ -862,14 +831,18 @@ class LGBMClassifier(BaseTransformer):
         transform_kwargs: ScoreKwargsTypedDict = dict()
 
         if isinstance(dataset, DataFrame):
+            self._deps = self._batch_inference_validate_snowpark(
+                dataset=dataset,
+                inference_method="score",
+            )
             selected_cols = self._get_active_columns()
             if len(selected_cols) > 0:
                 dataset = dataset.select(selected_cols)
             assert isinstance(dataset._session, Session) # keep mypy happy
             transform_kwargs = dict(
                 session=dataset._session,
-                dependencies=["snowflake-snowpark-python"] + self._get_dependencies(),
-                score_sproc_imports=['lightgbm'],
+                dependencies=["snowflake-snowpark-python"] + self._deps,
+                score_sproc_imports=['lightgbm', 'sklearn'],
             )
         elif isinstance(dataset, pd.DataFrame):
             # pandas_handler.score() does not require any extra kwargs.
@@ -942,9 +915,9 @@ class LGBMClassifier(BaseTransformer):
             transform_kwargs = dict(
                 session = dataset._session,
                 dependencies = self._deps,
-                pass_through_cols = self._get_pass_through_columns(dataset),
-                expected_output_cols_type = "array",
-                n_neighbors = n_neighbors,
+                drop_input_cols = self._drop_input_cols,
+                expected_output_cols_type="array",
+                n_neighbors = n_neighbors,
                 return_distance = return_distance
             )
         elif isinstance(dataset, pd.DataFrame):
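
The LGBMClassifier hunks above pin scikit-learn alongside the existing numpy, lightgbm, and cloudpickle dependencies and add 'sklearn' to score_sproc_imports, presumably because the scoring stored procedure now imports sklearn. A sketch of the version-pinned dependency set as built in __init__ after this change, using the same imports as the generated module:

import cloudpickle as cp
import lightgbm
import numpy as np
import sklearn

# Version-pinned dependencies constructed in LGBMClassifier.__init__ as of
# 1.4.0; the scikit-learn pin is the new entry.
deps = {
    f'numpy=={np.__version__}',
    f'lightgbm=={lightgbm.__version__}',
    f'cloudpickle=={cp.__version__}',
    f'scikit-learn=={sklearn.__version__}',
}
print(sorted(deps))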
snowflake/ml/modeling/lightgbm/lgbm_regressor.py
@@ -16,6 +16,7 @@ from numpy import typing as npt
 
 
 import numpy
+import sklearn
 import lightgbm
 from sklearn.utils.metaestimators import available_if
 
@@ -160,7 +161,7 @@ class LGBMRegressor(BaseTransformer):
         self.set_sample_weight_col(sample_weight_col)
         self._use_external_memory_version = False
         self._batch_size = -1
-        deps: Set[str] = set([f'numpy=={np.__version__}', f'lightgbm=={lightgbm.__version__}', f'cloudpickle=={cp.__version__}'])
+        deps: Set[str] = set([f'numpy=={np.__version__}', f'lightgbm=={lightgbm.__version__}', f'cloudpickle=={cp.__version__}', f'scikit-learn=={sklearn.__version__}'])
 
         self._deps = list(deps)
 
@@ -293,18 +294,24 @@ class LGBMRegressor(BaseTransformer):
         self._get_model_signatures(dataset)
         return self
 
-    def _get_pass_through_columns(self, dataset: DataFrame) -> List[str]:
-        if self._drop_input_cols:
-            return []
-        else:
-            return list(set(dataset.columns) - set(self.output_cols))
-
     def _batch_inference_validate_snowpark(
         self,
         dataset: DataFrame,
         inference_method: str,
     ) -> List[str]:
-        """Util method to run validate that batch inference can be run on a snowpark dataframe.
+        """Util method to run validate that batch inference can be run on a snowpark dataframe and
+        return the available package that exists in the snowflake anaconda channel
+
+        Args:
+            dataset: snowpark dataframe
+            inference_method: the inference method such as predict, score...
+
+        Raises:
+            SnowflakeMLException: If the estimator is not fitted, raise error
+            SnowflakeMLException: If the session is None, raise error
+
+        Returns:
+            A list of available package that exists in the snowflake anaconda channel
         """
         if not self._is_fitted:
             raise exceptions.SnowflakeMLException(
@@ -378,7 +385,7 @@ class LGBMRegressor(BaseTransformer):
             transform_kwargs = dict(
                 session = dataset._session,
                 dependencies = self._deps,
-                pass_through_cols = self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type = expected_type_inferred,
             )
 
@@ -438,16 +445,16 @@ class LGBMRegressor(BaseTransformer):
             # from (n_samples, n_estimators) to (n_samples, n_estimators * n_classes) (and everything in between)
             # based on init param values. We will convert that to pandas dataframe of shape (n_samples, 1) with
             # each row containing a list of values.
-            expected_dtype = "ARRAY"
+            expected_dtype = "array"
 
         # If we were unable to assign a type to this transform in the factory, infer the type here.
         if expected_dtype == "":
-            # If this is a clustering transformer, if the number of output columns does not equal the number of clusters the response will be an "ARRAY"
+            # If this is a clustering transformer, if the number of output columns does not equal the number of clusters the response will be an "array"
             if hasattr(self._sklearn_object, "n_clusters") and getattr(self._sklearn_object, "n_clusters") != len(self.output_cols):
-                expected_dtype = "ARRAY"
-            # If this is a decomposition transformer, if the number of output columns does not equal the number of components the response will be an "ARRAY"
+                expected_dtype = "array"
+            # If this is a decomposition transformer, if the number of output columns does not equal the number of components the response will be an "array"
             elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
-                expected_dtype = "ARRAY"
+                expected_dtype = "array"
             else:
                 output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
                 # We can only infer the output types from the input types if the following two statemetns are true:
@@ -465,7 +472,7 @@ class LGBMRegressor(BaseTransformer):
             transform_kwargs = dict(
                 session = dataset._session,
                 dependencies = self._deps,
-                pass_through_cols = self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type = expected_dtype,
             )
 
@@ -516,7 +523,7 @@ class LGBMRegressor(BaseTransformer):
                 subproject=_SUBPROJECT,
             )
             output_result, fitted_estimator = model_trainer.train_fit_predict(
-                pass_through_columns=self._get_pass_through_columns(dataset),
+                drop_input_cols=self._drop_input_cols,
                 expected_output_cols_list=self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix),
             )
             self._sklearn_object = fitted_estimator
@@ -534,44 +541,6 @@ class LGBMRegressor(BaseTransformer):
         assert self._sklearn_object is not None
         return self._sklearn_object.embedding_
 
-
-    def _get_output_column_names(self, output_cols_prefix: str, output_cols: Optional[List[str]] = None) -> List[str]:
-        """ Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
-            Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
-        """
-        output_cols_prefix = identifier.resolve_identifier(output_cols_prefix)
-        if output_cols:
-            output_cols = [
-                identifier.concat_names([output_cols_prefix, identifier.resolve_identifier(c)])
-                for c in output_cols
-            ]
-        elif getattr(self._sklearn_object, "classes_", None) is None:
-            output_cols = [output_cols_prefix]
-        elif self._sklearn_object is not None:
-            classes = self._sklearn_object.classes_
-            if isinstance(classes, numpy.ndarray):
-                output_cols = [f'{output_cols_prefix}{str(c)}' for c in classes.tolist()]
-            elif isinstance(classes, list) and len(classes) > 0 and isinstance(classes[0], numpy.ndarray):
-                # If the estimator is a multioutput estimator, classes_ will be a list of ndarrays.
-                output_cols = []
-                for i, cl in enumerate(classes):
-                    # For binary classification, there is only one output column for each class
-                    # ndarray as the two classes are complementary.
-                    if len(cl) == 2:
-                        output_cols.append(f'{output_cols_prefix}{i}_{cl[0]}')
-                    else:
-                        output_cols.extend([
-                            f'{output_cols_prefix}{i}_{c}' for c in cl.tolist()
-                        ])
-        else:
-            output_cols = []
-
-        # Make sure column names are valid snowflake identifiers.
-        assert output_cols is not None # Make MyPy happy
-        rv = [identifier.rename_to_valid_snowflake_identifier(c) for c in output_cols]
-
-        return rv
-
     @available_if(original_estimator_has_callable("predict_proba")) # type: ignore[misc]
     @telemetry.send_api_usage_telemetry(
         project=_PROJECT,
@@ -611,7 +580,7 @@ class LGBMRegressor(BaseTransformer):
             transform_kwargs = dict(
                 session=dataset._session,
                 dependencies=self._deps,
-                pass_through_cols=self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type="float",
             )
 
@@ -676,7 +645,7 @@ class LGBMRegressor(BaseTransformer):
             transform_kwargs = dict(
                 session=dataset._session,
                 dependencies=self._deps,
-                pass_through_cols=self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type="float",
             )
         elif isinstance(dataset, pd.DataFrame):
@@ -737,7 +706,7 @@ class LGBMRegressor(BaseTransformer):
             transform_kwargs = dict(
                 session=dataset._session,
                 dependencies=self._deps,
-                pass_through_cols=self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type="float",
             )
 
@@ -802,7 +771,7 @@ class LGBMRegressor(BaseTransformer):
             transform_kwargs = dict(
                 session=dataset._session,
                 dependencies=self._deps,
-                pass_through_cols=self._get_pass_through_columns(dataset),
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type="float",
             )
 
@@ -858,14 +827,18 @@ class LGBMRegressor(BaseTransformer):
         transform_kwargs: ScoreKwargsTypedDict = dict()
 
         if isinstance(dataset, DataFrame):
+            self._deps = self._batch_inference_validate_snowpark(
+                dataset=dataset,
+                inference_method="score",
+            )
             selected_cols = self._get_active_columns()
             if len(selected_cols) > 0:
                 dataset = dataset.select(selected_cols)
             assert isinstance(dataset._session, Session) # keep mypy happy
             transform_kwargs = dict(
                 session=dataset._session,
-                dependencies=["snowflake-snowpark-python"] + self._get_dependencies(),
-                score_sproc_imports=['lightgbm'],
+                dependencies=["snowflake-snowpark-python"] + self._deps,
+                score_sproc_imports=['lightgbm', 'sklearn'],
             )
         elif isinstance(dataset, pd.DataFrame):
             # pandas_handler.score() does not require any extra kwargs.
@@ -938,9 +911,9 @@ class LGBMRegressor(BaseTransformer):
             transform_kwargs = dict(
                 session = dataset._session,
                 dependencies = self._deps,
-                pass_through_cols = self._get_pass_through_columns(dataset),
-                expected_output_cols_type = "array",
-                n_neighbors = n_neighbors,
+                drop_input_cols = self._drop_input_cols,
+                expected_output_cols_type="array",
+                n_neighbors = n_neighbors,
                 return_distance = return_distance
            )
         elif isinstance(dataset, pd.DataFrame):
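
The _get_output_column_names helper deleted from every generated estimator in this release is not dropped outright; the framework/base.py +55 -5 entry in the file list suggests it was consolidated into the shared base class. As a reference for the naming scheme the removed code implemented, here is a minimal sketch of the plain-classifier case (hypothetical standalone function; the real method also resolves and validates Snowflake identifiers):

import numpy

def output_column_names(output_cols_prefix: str, classes: numpy.ndarray) -> list:
    # One output column per class label, as in the removed helper: a prefix
    # of "PREDICT_PROBA_" over classes [0, 1] yields
    # ["PREDICT_PROBA_0", "PREDICT_PROBA_1"].
    return [f'{output_cols_prefix}{str(c)}' for c in classes.tolist()]

print(output_column_names("PREDICT_PROBA_", numpy.array([0, 1])))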