snowflake-ml-python 1.7.2__py3-none-any.whl → 1.7.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (237)
  1. snowflake/cortex/__init__.py +16 -8
  2. snowflake/cortex/_classify_text.py +12 -1
  3. snowflake/cortex/_complete.py +101 -13
  4. snowflake/cortex/_embed_text_1024.py +9 -2
  5. snowflake/cortex/_embed_text_768.py +9 -2
  6. snowflake/cortex/_extract_answer.py +9 -2
  7. snowflake/cortex/_sentiment.py +9 -2
  8. snowflake/cortex/_summarize.py +9 -2
  9. snowflake/cortex/_translate.py +9 -2
  10. snowflake/ml/_internal/env_utils.py +7 -52
  11. snowflake/ml/_internal/platform_capabilities.py +87 -0
  12. snowflake/ml/_internal/utils/identifier.py +4 -2
  13. snowflake/ml/data/__init__.py +3 -0
  14. snowflake/ml/data/_internal/arrow_ingestor.py +4 -4
  15. snowflake/ml/data/data_connector.py +53 -11
  16. snowflake/ml/data/data_ingestor.py +2 -1
  17. snowflake/ml/data/torch_utils.py +18 -5
  18. snowflake/ml/dataset/dataset.py +0 -1
  19. snowflake/ml/feature_store/examples/example_helper.py +2 -1
  20. snowflake/ml/fileset/fileset.py +24 -18
  21. snowflake/ml/jobs/__init__.py +21 -0
  22. snowflake/ml/jobs/_utils/constants.py +51 -0
  23. snowflake/ml/jobs/_utils/payload_utils.py +352 -0
  24. snowflake/ml/jobs/_utils/spec_utils.py +298 -0
  25. snowflake/ml/jobs/_utils/types.py +39 -0
  26. snowflake/ml/jobs/decorators.py +91 -0
  27. snowflake/ml/jobs/job.py +113 -0
  28. snowflake/ml/jobs/manager.py +298 -0
  29. snowflake/ml/model/_client/model/model_version_impl.py +5 -3
  30. snowflake/ml/model/_client/ops/model_ops.py +13 -8
  31. snowflake/ml/model/_client/ops/service_ops.py +1 -11
  32. snowflake/ml/model/_client/sql/model_version.py +11 -0
  33. snowflake/ml/model/_client/sql/service.py +13 -6
  34. snowflake/ml/model/_model_composer/model_composer.py +8 -3
  35. snowflake/ml/model/_model_composer/model_manifest/model_manifest.py +20 -1
  36. snowflake/ml/model/_model_composer/model_manifest/model_manifest_schema.py +1 -0
  37. snowflake/ml/model/_model_composer/model_method/constants.py +1 -0
  38. snowflake/ml/model/_model_composer/model_method/function_generator.py +2 -0
  39. snowflake/ml/model/_model_composer/model_method/infer_function.py_template +1 -1
  40. snowflake/ml/model/_model_composer/model_method/infer_partitioned.py_template +1 -1
  41. snowflake/ml/model/_model_composer/model_method/infer_table_function.py_template +1 -1
  42. snowflake/ml/model/_model_composer/model_method/model_method.py +9 -1
  43. snowflake/ml/model/_model_composer/model_user_file/model_user_file.py +27 -0
  44. snowflake/ml/model/_packager/model_handlers/_utils.py +39 -5
  45. snowflake/ml/model/_packager/model_handlers/catboost.py +3 -3
  46. snowflake/ml/model/_packager/model_handlers/custom.py +1 -2
  47. snowflake/ml/model/_packager/model_handlers/huggingface_pipeline.py +6 -1
  48. snowflake/ml/model/_packager/model_handlers/lightgbm.py +5 -3
  49. snowflake/ml/model/_packager/model_handlers/sentence_transformers.py +55 -20
  50. snowflake/ml/model/_packager/model_handlers/sklearn.py +9 -10
  51. snowflake/ml/model/_packager/model_handlers/snowmlmodel.py +66 -28
  52. snowflake/ml/model/_packager/model_handlers/tensorflow.py +70 -17
  53. snowflake/ml/model/_packager/model_handlers/xgboost.py +3 -3
  54. snowflake/ml/model/_packager/model_meta/model_meta.py +3 -0
  55. snowflake/ml/model/_packager/model_meta/model_meta_schema.py +6 -1
  56. snowflake/ml/model/_packager/model_runtime/_snowml_inference_alternative_requirements.py +2 -2
  57. snowflake/ml/model/_packager/model_task/model_task_utils.py +3 -2
  58. snowflake/ml/model/_signatures/base_handler.py +1 -2
  59. snowflake/ml/model/_signatures/builtins_handler.py +2 -2
  60. snowflake/ml/model/_signatures/numpy_handler.py +6 -7
  61. snowflake/ml/model/_signatures/pandas_handler.py +3 -3
  62. snowflake/ml/model/_signatures/pytorch_handler.py +2 -5
  63. snowflake/ml/model/_signatures/snowpark_handler.py +11 -5
  64. snowflake/ml/model/_signatures/tensorflow_handler.py +2 -7
  65. snowflake/ml/model/model_signature.py +17 -4
  66. snowflake/ml/model/type_hints.py +1 -0
  67. snowflake/ml/modeling/_internal/model_trainer_builder.py +0 -8
  68. snowflake/ml/modeling/_internal/model_transformer_builder.py +0 -13
  69. snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +6 -3
  70. snowflake/ml/modeling/cluster/affinity_propagation.py +6 -3
  71. snowflake/ml/modeling/cluster/agglomerative_clustering.py +6 -3
  72. snowflake/ml/modeling/cluster/birch.py +6 -3
  73. snowflake/ml/modeling/cluster/bisecting_k_means.py +6 -3
  74. snowflake/ml/modeling/cluster/dbscan.py +6 -3
  75. snowflake/ml/modeling/cluster/feature_agglomeration.py +6 -3
  76. snowflake/ml/modeling/cluster/k_means.py +6 -3
  77. snowflake/ml/modeling/cluster/mean_shift.py +6 -3
  78. snowflake/ml/modeling/cluster/mini_batch_k_means.py +6 -3
  79. snowflake/ml/modeling/cluster/optics.py +6 -3
  80. snowflake/ml/modeling/cluster/spectral_biclustering.py +6 -3
  81. snowflake/ml/modeling/cluster/spectral_clustering.py +6 -3
  82. snowflake/ml/modeling/cluster/spectral_coclustering.py +6 -3
  83. snowflake/ml/modeling/compose/column_transformer.py +6 -3
  84. snowflake/ml/modeling/compose/transformed_target_regressor.py +6 -3
  85. snowflake/ml/modeling/covariance/elliptic_envelope.py +6 -3
  86. snowflake/ml/modeling/covariance/empirical_covariance.py +6 -3
  87. snowflake/ml/modeling/covariance/graphical_lasso.py +6 -3
  88. snowflake/ml/modeling/covariance/graphical_lasso_cv.py +6 -3
  89. snowflake/ml/modeling/covariance/ledoit_wolf.py +6 -3
  90. snowflake/ml/modeling/covariance/min_cov_det.py +6 -3
  91. snowflake/ml/modeling/covariance/oas.py +6 -3
  92. snowflake/ml/modeling/covariance/shrunk_covariance.py +6 -3
  93. snowflake/ml/modeling/decomposition/dictionary_learning.py +6 -3
  94. snowflake/ml/modeling/decomposition/factor_analysis.py +6 -3
  95. snowflake/ml/modeling/decomposition/fast_ica.py +6 -3
  96. snowflake/ml/modeling/decomposition/incremental_pca.py +6 -3
  97. snowflake/ml/modeling/decomposition/kernel_pca.py +6 -3
  98. snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +6 -3
  99. snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +6 -3
  100. snowflake/ml/modeling/decomposition/pca.py +6 -3
  101. snowflake/ml/modeling/decomposition/sparse_pca.py +6 -3
  102. snowflake/ml/modeling/decomposition/truncated_svd.py +6 -3
  103. snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +6 -3
  104. snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +6 -3
  105. snowflake/ml/modeling/ensemble/ada_boost_classifier.py +6 -3
  106. snowflake/ml/modeling/ensemble/ada_boost_regressor.py +6 -3
  107. snowflake/ml/modeling/ensemble/bagging_classifier.py +6 -3
  108. snowflake/ml/modeling/ensemble/bagging_regressor.py +6 -3
  109. snowflake/ml/modeling/ensemble/extra_trees_classifier.py +6 -3
  110. snowflake/ml/modeling/ensemble/extra_trees_regressor.py +6 -3
  111. snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +6 -3
  112. snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +6 -3
  113. snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +6 -3
  114. snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +6 -3
  115. snowflake/ml/modeling/ensemble/isolation_forest.py +6 -3
  116. snowflake/ml/modeling/ensemble/random_forest_classifier.py +6 -3
  117. snowflake/ml/modeling/ensemble/random_forest_regressor.py +6 -3
  118. snowflake/ml/modeling/ensemble/stacking_regressor.py +6 -3
  119. snowflake/ml/modeling/ensemble/voting_classifier.py +6 -3
  120. snowflake/ml/modeling/ensemble/voting_regressor.py +6 -3
  121. snowflake/ml/modeling/feature_selection/generic_univariate_select.py +6 -3
  122. snowflake/ml/modeling/feature_selection/select_fdr.py +6 -3
  123. snowflake/ml/modeling/feature_selection/select_fpr.py +6 -3
  124. snowflake/ml/modeling/feature_selection/select_fwe.py +6 -3
  125. snowflake/ml/modeling/feature_selection/select_k_best.py +6 -3
  126. snowflake/ml/modeling/feature_selection/select_percentile.py +6 -3
  127. snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +6 -3
  128. snowflake/ml/modeling/feature_selection/variance_threshold.py +6 -3
  129. snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +6 -3
  130. snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +6 -3
  131. snowflake/ml/modeling/impute/iterative_imputer.py +6 -3
  132. snowflake/ml/modeling/impute/knn_imputer.py +6 -3
  133. snowflake/ml/modeling/impute/missing_indicator.py +6 -3
  134. snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +6 -3
  135. snowflake/ml/modeling/kernel_approximation/nystroem.py +6 -3
  136. snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +6 -3
  137. snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +6 -3
  138. snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +6 -3
  139. snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +6 -3
  140. snowflake/ml/modeling/lightgbm/lgbm_classifier.py +6 -3
  141. snowflake/ml/modeling/lightgbm/lgbm_regressor.py +6 -3
  142. snowflake/ml/modeling/linear_model/ard_regression.py +6 -3
  143. snowflake/ml/modeling/linear_model/bayesian_ridge.py +6 -3
  144. snowflake/ml/modeling/linear_model/elastic_net.py +6 -3
  145. snowflake/ml/modeling/linear_model/elastic_net_cv.py +6 -3
  146. snowflake/ml/modeling/linear_model/gamma_regressor.py +6 -3
  147. snowflake/ml/modeling/linear_model/huber_regressor.py +6 -3
  148. snowflake/ml/modeling/linear_model/lars.py +6 -3
  149. snowflake/ml/modeling/linear_model/lars_cv.py +6 -3
  150. snowflake/ml/modeling/linear_model/lasso.py +6 -3
  151. snowflake/ml/modeling/linear_model/lasso_cv.py +6 -3
  152. snowflake/ml/modeling/linear_model/lasso_lars.py +6 -3
  153. snowflake/ml/modeling/linear_model/lasso_lars_cv.py +6 -3
  154. snowflake/ml/modeling/linear_model/lasso_lars_ic.py +6 -3
  155. snowflake/ml/modeling/linear_model/linear_regression.py +6 -3
  156. snowflake/ml/modeling/linear_model/logistic_regression.py +6 -3
  157. snowflake/ml/modeling/linear_model/logistic_regression_cv.py +6 -3
  158. snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +6 -3
  159. snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +6 -3
  160. snowflake/ml/modeling/linear_model/multi_task_lasso.py +6 -3
  161. snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +6 -3
  162. snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +6 -3
  163. snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +6 -3
  164. snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +6 -3
  165. snowflake/ml/modeling/linear_model/perceptron.py +6 -3
  166. snowflake/ml/modeling/linear_model/poisson_regressor.py +6 -3
  167. snowflake/ml/modeling/linear_model/ransac_regressor.py +6 -3
  168. snowflake/ml/modeling/linear_model/ridge.py +6 -3
  169. snowflake/ml/modeling/linear_model/ridge_classifier.py +6 -3
  170. snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +6 -3
  171. snowflake/ml/modeling/linear_model/ridge_cv.py +6 -3
  172. snowflake/ml/modeling/linear_model/sgd_classifier.py +6 -3
  173. snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +6 -3
  174. snowflake/ml/modeling/linear_model/sgd_regressor.py +6 -3
  175. snowflake/ml/modeling/linear_model/theil_sen_regressor.py +6 -3
  176. snowflake/ml/modeling/linear_model/tweedie_regressor.py +6 -3
  177. snowflake/ml/modeling/manifold/isomap.py +6 -3
  178. snowflake/ml/modeling/manifold/mds.py +6 -3
  179. snowflake/ml/modeling/manifold/spectral_embedding.py +6 -3
  180. snowflake/ml/modeling/manifold/tsne.py +6 -3
  181. snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +6 -3
  182. snowflake/ml/modeling/mixture/gaussian_mixture.py +6 -3
  183. snowflake/ml/modeling/model_selection/grid_search_cv.py +17 -2
  184. snowflake/ml/modeling/model_selection/randomized_search_cv.py +17 -2
  185. snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +6 -3
  186. snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +6 -3
  187. snowflake/ml/modeling/multiclass/output_code_classifier.py +6 -3
  188. snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +6 -3
  189. snowflake/ml/modeling/naive_bayes/categorical_nb.py +6 -3
  190. snowflake/ml/modeling/naive_bayes/complement_nb.py +6 -3
  191. snowflake/ml/modeling/naive_bayes/gaussian_nb.py +6 -3
  192. snowflake/ml/modeling/naive_bayes/multinomial_nb.py +6 -3
  193. snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +6 -3
  194. snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +6 -3
  195. snowflake/ml/modeling/neighbors/kernel_density.py +6 -3
  196. snowflake/ml/modeling/neighbors/local_outlier_factor.py +6 -3
  197. snowflake/ml/modeling/neighbors/nearest_centroid.py +6 -3
  198. snowflake/ml/modeling/neighbors/nearest_neighbors.py +6 -3
  199. snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +6 -3
  200. snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +6 -3
  201. snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +6 -3
  202. snowflake/ml/modeling/neural_network/bernoulli_rbm.py +6 -3
  203. snowflake/ml/modeling/neural_network/mlp_classifier.py +6 -3
  204. snowflake/ml/modeling/neural_network/mlp_regressor.py +6 -3
  205. snowflake/ml/modeling/pipeline/pipeline.py +16 -178
  206. snowflake/ml/modeling/preprocessing/polynomial_features.py +6 -3
  207. snowflake/ml/modeling/semi_supervised/label_propagation.py +6 -3
  208. snowflake/ml/modeling/semi_supervised/label_spreading.py +6 -3
  209. snowflake/ml/modeling/svm/linear_svc.py +6 -3
  210. snowflake/ml/modeling/svm/linear_svr.py +6 -3
  211. snowflake/ml/modeling/svm/nu_svc.py +6 -3
  212. snowflake/ml/modeling/svm/nu_svr.py +6 -3
  213. snowflake/ml/modeling/svm/svc.py +6 -3
  214. snowflake/ml/modeling/svm/svr.py +6 -3
  215. snowflake/ml/modeling/tree/decision_tree_classifier.py +6 -3
  216. snowflake/ml/modeling/tree/decision_tree_regressor.py +6 -3
  217. snowflake/ml/modeling/tree/extra_tree_classifier.py +6 -3
  218. snowflake/ml/modeling/tree/extra_tree_regressor.py +6 -3
  219. snowflake/ml/modeling/xgboost/xgb_classifier.py +167 -91
  220. snowflake/ml/modeling/xgboost/xgb_regressor.py +166 -88
  221. snowflake/ml/modeling/xgboost/xgbrf_classifier.py +166 -88
  222. snowflake/ml/modeling/xgboost/xgbrf_regressor.py +166 -88
  223. snowflake/ml/monitoring/_client/model_monitor_sql_client.py +4 -4
  224. snowflake/ml/registry/_manager/model_manager.py +70 -33
  225. snowflake/ml/registry/registry.py +41 -22
  226. snowflake/ml/version.py +1 -1
  227. {snowflake_ml_python-1.7.2.dist-info → snowflake_ml_python-1.7.4.dist-info}/METADATA +63 -19
  228. {snowflake_ml_python-1.7.2.dist-info → snowflake_ml_python-1.7.4.dist-info}/RECORD +231 -226
  229. {snowflake_ml_python-1.7.2.dist-info → snowflake_ml_python-1.7.4.dist-info}/WHEEL +1 -1
  230. snowflake/ml/_internal/utils/retryable_http.py +0 -39
  231. snowflake/ml/fileset/parquet_parser.py +0 -170
  232. snowflake/ml/fileset/tf_dataset.py +0 -88
  233. snowflake/ml/fileset/torch_datapipe.py +0 -57
  234. snowflake/ml/modeling/_internal/ml_runtime_implementations/ml_runtime_handlers.py +0 -151
  235. snowflake/ml/modeling/_internal/ml_runtime_implementations/ml_runtime_trainer.py +0 -66
  236. {snowflake_ml_python-1.7.2.dist-info → snowflake_ml_python-1.7.4.dist-info}/LICENSE.txt +0 -0
  237. {snowflake_ml_python-1.7.2.dist-info → snowflake_ml_python-1.7.4.dist-info}/top_level.txt +0 -0
snowflake/ml/modeling/svm/svr.py
@@ -37,6 +37,7 @@ from snowflake.ml.model.model_signature import (
     FeatureSpec,
     ModelSignature,
     _infer_signature,
+    _truncate_data,
     _rename_signature_with_snowflake_identifiers,
 )
 
@@ -57,6 +58,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.svm".replace("sklearn.",
 
 DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]
 
+INFER_SIGNATURE_MAX_ROWS = 100
+
 class SVR(BaseTransformer):
     r"""Epsilon-Support Vector Regression
     For more details on this class, see [sklearn.svm.SVR]
@@ -470,7 +473,7 @@ class SVR(BaseTransformer):
         elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
             expected_dtype = "array"
         else:
-            output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+            output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
         # We can only infer the output types from the input types if the following two statemetns are true:
         # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
         # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
@@ -1127,7 +1130,7 @@ class SVR(BaseTransformer):
 
         PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
 
-        inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
+        inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
         outputs: List[BaseFeatureSpec] = []
         if hasattr(self, "predict"):
             # keep mypy happy
@@ -1135,7 +1138,7 @@ class SVR(BaseTransformer):
             # For classifier, the type of predict is the same as the type of label
             if self._sklearn_object._estimator_type == "classifier":
                 # label columns is the desired type for output
-                outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
+                outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
             # rename the output columns
             outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
             self._model_signature_dict["predict"] = ModelSignature(
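
The hunks above are the substance of this file's +6/-3: signature inference now runs over at most INFER_SIGNATURE_MAX_ROWS (100) rows instead of the full training dataset, and the identical pattern repeats in every generated estimator diff that follows. As a rough sketch of the idea only (the real _truncate_data lives in snowflake/ml/model/model_signature.py, entry 65 in the file list above, and also handles Snowpark DataFrames; the helper name and body here are illustrative assumptions, not the library's implementation):

    import pandas as pd

    INFER_SIGNATURE_MAX_ROWS = 100  # row cap introduced in 1.7.4

    def truncate_for_inference(data: pd.DataFrame, max_rows: int = INFER_SIGNATURE_MAX_ROWS) -> pd.DataFrame:
        # Signature inference only needs a representative sample of rows to
        # derive column types, so rows beyond the cap are simply dropped.
        return data if len(data) <= max_rows else data.head(max_rows)
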
snowflake/ml/modeling/tree/decision_tree_classifier.py
@@ -37,6 +37,7 @@ from snowflake.ml.model.model_signature import (
     FeatureSpec,
     ModelSignature,
     _infer_signature,
+    _truncate_data,
     _rename_signature_with_snowflake_identifiers,
 )
 
@@ -57,6 +58,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.tree".replace("sklearn."
 
 DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]
 
+INFER_SIGNATURE_MAX_ROWS = 100
+
 class DecisionTreeClassifier(BaseTransformer):
     r"""A decision tree classifier
     For more details on this class, see [sklearn.tree.DecisionTreeClassifier]
@@ -554,7 +557,7 @@ class DecisionTreeClassifier(BaseTransformer):
         elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
             expected_dtype = "array"
         else:
-            output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+            output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
         # We can only infer the output types from the input types if the following two statemetns are true:
         # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
         # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
@@ -1215,7 +1218,7 @@ class DecisionTreeClassifier(BaseTransformer):
 
         PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
 
-        inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
+        inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
         outputs: List[BaseFeatureSpec] = []
         if hasattr(self, "predict"):
             # keep mypy happy
@@ -1223,7 +1226,7 @@ class DecisionTreeClassifier(BaseTransformer):
             # For classifier, the type of predict is the same as the type of label
            if self._sklearn_object._estimator_type == "classifier":
                 # label columns is the desired type for output
-                outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
+                outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
             # rename the output columns
             outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
             self._model_signature_dict["predict"] = ModelSignature(
snowflake/ml/modeling/tree/decision_tree_regressor.py
@@ -37,6 +37,7 @@ from snowflake.ml.model.model_signature import (
     FeatureSpec,
     ModelSignature,
     _infer_signature,
+    _truncate_data,
     _rename_signature_with_snowflake_identifiers,
 )
 
@@ -57,6 +58,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.tree".replace("sklearn."
 
 DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]
 
+INFER_SIGNATURE_MAX_ROWS = 100
+
 class DecisionTreeRegressor(BaseTransformer):
     r"""A decision tree regressor
     For more details on this class, see [sklearn.tree.DecisionTreeRegressor]
@@ -533,7 +536,7 @@ class DecisionTreeRegressor(BaseTransformer):
         elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
             expected_dtype = "array"
         else:
-            output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+            output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
         # We can only infer the output types from the input types if the following two statemetns are true:
         # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
         # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
@@ -1190,7 +1193,7 @@ class DecisionTreeRegressor(BaseTransformer):
 
         PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
 
-        inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
+        inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
         outputs: List[BaseFeatureSpec] = []
         if hasattr(self, "predict"):
             # keep mypy happy
@@ -1198,7 +1201,7 @@ class DecisionTreeRegressor(BaseTransformer):
             # For classifier, the type of predict is the same as the type of label
             if self._sklearn_object._estimator_type == "classifier":
                 # label columns is the desired type for output
-                outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
+                outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
             # rename the output columns
             outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
             self._model_signature_dict["predict"] = ModelSignature(
snowflake/ml/modeling/tree/extra_tree_classifier.py
@@ -37,6 +37,7 @@ from snowflake.ml.model.model_signature import (
     FeatureSpec,
     ModelSignature,
     _infer_signature,
+    _truncate_data,
     _rename_signature_with_snowflake_identifiers,
 )
 
@@ -57,6 +58,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.tree".replace("sklearn."
 
 DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]
 
+INFER_SIGNATURE_MAX_ROWS = 100
+
 class ExtraTreeClassifier(BaseTransformer):
     r"""An extremely randomized tree classifier
     For more details on this class, see [sklearn.tree.ExtraTreeClassifier]
@@ -546,7 +549,7 @@ class ExtraTreeClassifier(BaseTransformer):
         elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
             expected_dtype = "array"
         else:
-            output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+            output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
         # We can only infer the output types from the input types if the following two statemetns are true:
         # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
         # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
@@ -1207,7 +1210,7 @@ class ExtraTreeClassifier(BaseTransformer):
 
         PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
 
-        inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
+        inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
         outputs: List[BaseFeatureSpec] = []
         if hasattr(self, "predict"):
             # keep mypy happy
@@ -1215,7 +1218,7 @@ class ExtraTreeClassifier(BaseTransformer):
             # For classifier, the type of predict is the same as the type of label
             if self._sklearn_object._estimator_type == "classifier":
                 # label columns is the desired type for output
-                outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
+                outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
             # rename the output columns
             outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
             self._model_signature_dict["predict"] = ModelSignature(
snowflake/ml/modeling/tree/extra_tree_regressor.py
@@ -37,6 +37,7 @@ from snowflake.ml.model.model_signature import (
     FeatureSpec,
     ModelSignature,
     _infer_signature,
+    _truncate_data,
     _rename_signature_with_snowflake_identifiers,
 )
 
@@ -57,6 +58,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.tree".replace("sklearn."
 
 DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]
 
+INFER_SIGNATURE_MAX_ROWS = 100
+
 class ExtraTreeRegressor(BaseTransformer):
     r"""An extremely randomized tree regressor
     For more details on this class, see [sklearn.tree.ExtraTreeRegressor]
@@ -525,7 +528,7 @@ class ExtraTreeRegressor(BaseTransformer):
         elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
             expected_dtype = "array"
         else:
-            output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+            output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
         # We can only infer the output types from the input types if the following two statemetns are true:
         # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
         # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
@@ -1182,7 +1185,7 @@ class ExtraTreeRegressor(BaseTransformer):
 
         PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
 
-        inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
+        inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
         outputs: List[BaseFeatureSpec] = []
         if hasattr(self, "predict"):
             # keep mypy happy
@@ -1190,7 +1193,7 @@ class ExtraTreeRegressor(BaseTransformer):
             # For classifier, the type of predict is the same as the type of label
             if self._sklearn_object._estimator_type == "classifier":
                 # label columns is the desired type for output
-                outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
+                outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
             # rename the output columns
             outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
             self._model_signature_dict["predict"] = ModelSignature(
snowflake/ml/modeling/xgboost/xgb_classifier.py
@@ -37,6 +37,7 @@ from snowflake.ml.model.model_signature import (
     FeatureSpec,
     ModelSignature,
     _infer_signature,
+    _truncate_data,
     _rename_signature_with_snowflake_identifiers,
 )
 
@@ -57,6 +58,8 @@ _SUBPROJECT = "".join([s.capitalize() for s in "xgboost".replace("sklearn.", "")
 
 DATAFRAME_TYPE = Union[DataFrame, pd.DataFrame]
 
+INFER_SIGNATURE_MAX_ROWS = 100
+
 class XGBClassifier(BaseTransformer):
     r"""Implementation of the scikit-learn API for XGBoost classification
     For more details on this class, see [xgboost.XGBClassifier]
@@ -125,111 +128,171 @@ class XGBClassifier(BaseTransformer):
         can seriously hurt performance in gradient boosting. Set the batch_size as large as possible
         based on the available memory.
 
-    n_estimators: int
+    n_estimators: Optional[int]
         Number of boosting rounds.
 
-    max_depth: Optional[int]
+    max_depth: typing.Optional[int]
+
         Maximum tree depth for base learners.
-    max_leaves :
+
+    max_leaves: typing.Optional[int]
+
         Maximum number of leaves; 0 indicates no limit.
-    max_bin :
+
+    max_bin: typing.Optional[int]
+
         If using histogram-based algorithm, maximum number of bins per feature
-    grow_policy :
-        Tree growing policy. 0: favor splitting at nodes closest to the node, i.e. grow
-        depth-wise. 1: favor splitting at nodes with highest loss change.
-    learning_rate: Optional[float]
+
+    grow_policy: typing.Optional[str]
+
+        Tree growing policy.
+
+        - depthwise: Favors splitting at nodes closest to the node,
+        - lossguide: Favors splitting at nodes with highest loss change.
+
+    learning_rate: typing.Optional[float]
+
         Boosting learning rate (xgb's "eta")
-    verbosity: Optional[int]
+
+    verbosity: typing.Optional[int]
+
         The degree of verbosity. Valid values are 0 (silent) - 3 (debug).
-    objective: typing.Union[str, typing.Callable[[numpy.ndarray, numpy.ndarray], typing.Tuple[numpy.ndarray, numpy.ndarray]], NoneType]
-        Specify the learning task and the corresponding learning objective or
-        a custom objective function to be used (see note below).
-    booster: Optional[str]
-        Specify which booster to use: gbtree, gblinear or dart.
-    tree_method: Optional[str]
+
+    objective: typing.Union[str, xgboost.sklearn._SklObjWProto, typing.Callable[[typing.Any, typing.Any], typing.Tuple[numpy.ndarray, numpy.ndarray]], NoneType]
+
+        Specify the learning task and the corresponding learning objective or a custom
+        objective function to be used.
+
+        For custom objective, see :doc:`/tutorials/custom_metric_obj` and
+        :ref:`custom-obj-metric` for more information, along with the end note for
+        function signatures.
+
+    booster: typing.Optional[str]
+
+        Specify which booster to use: ``gbtree``, ``gblinear`` or ``dart``.
+
+    tree_method: typing.Optional[str]
+
         Specify which tree method to use. Default to auto. If this parameter is set to
         default, XGBoost will choose the most conservative option available. It's
         recommended to study this option from the parameters document :doc:`tree method
         </treemethod>`
-    n_jobs: Optional[int]
+
+    n_jobs: typing.Optional[int]
+
         Number of parallel threads used to run xgboost. When used with other
         Scikit-Learn algorithms like grid search, you may choose which algorithm to
         parallelize and balance the threads. Creating thread contention will
        significantly slow down both algorithms.
-    gamma: Optional[float]
-        (min_split_loss) Minimum loss reduction required to make a further partition on a
-        leaf node of the tree.
-    min_child_weight: Optional[float]
+
+    gamma: typing.Optional[float]
+
+        (min_split_loss) Minimum loss reduction required to make a further partition on
+        a leaf node of the tree.
+
+    min_child_weight: typing.Optional[float]
+
         Minimum sum of instance weight(hessian) needed in a child.
-    max_delta_step: Optional[float]
+
+    max_delta_step: typing.Optional[float]
+
         Maximum delta step we allow each tree's weight estimation to be.
-    subsample: Optional[float]
+
+    subsample: typing.Optional[float]
+
         Subsample ratio of the training instance.
-    sampling_method :
-        Sampling method. Used only by `gpu_hist` tree method.
-        - `uniform`: select random training instances uniformly.
-        - `gradient_based` select random training instances with higher probability when
-          the gradient and hessian are larger. (cf. CatBoost)
-    colsample_bytree: Optional[float]
+
+    sampling_method: typing.Optional[str]
+
+        Sampling method. Used only by the GPU version of ``hist`` tree method.
+
+        - ``uniform``: Select random training instances uniformly.
+        - ``gradient_based``: Select random training instances with higher probability
+          when the gradient and hessian are larger. (cf. CatBoost)
+
+    colsample_bytree: typing.Optional[float]
+
         Subsample ratio of columns when constructing each tree.
-    colsample_bylevel: Optional[float]
+
+    colsample_bylevel: typing.Optional[float]
+
         Subsample ratio of columns for each level.
-    colsample_bynode: Optional[float]
+
+    colsample_bynode: typing.Optional[float]
+
         Subsample ratio of columns for each split.
-    reg_alpha: Optional[float]
+
+    reg_alpha: typing.Optional[float]
+
         L1 regularization term on weights (xgb's alpha).
-    reg_lambda: Optional[float]
+
+    reg_lambda: typing.Optional[float]
+
         L2 regularization term on weights (xgb's lambda).
-    scale_pos_weight: Optional[float]
+
+    scale_pos_weight: typing.Optional[float]
         Balancing of positive and negative weights.
-    base_score: Optional[float]
+
+    base_score: typing.Optional[float]
+
         The initial prediction score of all instances, global bias.
-    random_state: Optional[Union[numpy.random.RandomState, int]]
+
+    random_state: typing.Union[numpy.random.mtrand.RandomState, numpy.random._generator.Generator, int, NoneType]
+
         Random number seed.
 
         Using gblinear booster with shotgun updater is nondeterministic as
         it uses Hogwild algorithm.
 
-    missing: float, default np.nan
-        Value in the data which needs to be present as a missing value.
-    num_parallel_tree: Optional[int]
+    missing: float
+
+        Value in the data which needs to be present as a missing value. Default to
+        :py:data:`numpy.nan`.
+
+    num_parallel_tree: typing.Optional[int]
+
         Used for boosting random forest.
-    monotone_constraints: Optional[Union[Dict[str, int], str]]
+
+    monotone_constraints: typing.Union[typing.Dict[str, int], str, NoneType]
+
         Constraint of variable monotonicity. See :doc:`tutorial </tutorials/monotonic>`
         for more information.
-    interaction_constraints: Optional[Union[str, List[Tuple[str]]]]
+
+    interaction_constraints: typing.Union[str, typing.List[typing.Tuple[str]], NoneType]
+
         Constraints for interaction representing permitted interactions. The
         constraints must be specified in the form of a nested list, e.g. ``[[0, 1], [2,
         3, 4]]``, where each inner list is a group of indices of features that are
        allowed to interact with each other. See :doc:`tutorial
         </tutorials/feature_interaction_constraint>` for more information
-    importance_type: Optional[str]
+
+    importance_type: typing.Optional[str]
+
         The feature importance type for the feature_importances\_ property:
 
         * For tree model, it's either "gain", "weight", "cover", "total_gain" or
           "total_cover".
-        * For linear model, only "weight" is defined and it's the normalized coefficients
-          without bias.
+        * For linear model, only "weight" is defined and it's the normalized
+          coefficients without bias.
+
+    device: typing.Optional[str]
+
+        Device ordinal, available options are `cpu`, `cuda`, and `gpu`.
+
+    validate_parameters: typing.Optional[bool]
 
-    gpu_id: Optional[int]
-        Device ordinal.
-    validate_parameters: Optional[bool]
         Give warnings for unknown parameter.
-    predictor: Optional[str]
-        Force XGBoost to use specific predictor, available choices are [cpu_predictor,
-        gpu_predictor].
+
     enable_categorical: bool
 
-        Experimental support for categorical data. When enabled, cudf/pandas.DataFrame
-        should be used to specify categorical data type. Also, JSON/UBJSON
-        serialization format is required.
+        See the same parameter of :py:class:`DMatrix` for details.
 
-    feature_types: FeatureTypes
+    feature_types: typing.Optional[typing.Sequence[str]]
 
         Used for specifying feature types without constructing a dataframe. See
         :py:class:`DMatrix` for details.
 
-    max_cat_to_onehot: Optional[int]
+    max_cat_to_onehot: typing.Optional[int]
 
         A threshold for deciding whether XGBoost should use one-hot encoding based split
         for categorical data. When number of categories is lesser than the threshold
@@ -238,36 +301,41 @@ class XGBClassifier(BaseTransformer):
         categorical feature support. See :doc:`Categorical Data
         </tutorials/categorical>` and :ref:`cat-param` for details.
 
-    max_cat_threshold: Optional[int]
+    max_cat_threshold: typing.Optional[int]
 
         Maximum number of categories considered for each split. Used only by
         partition-based splits for preventing over-fitting. Also, `enable_categorical`
         needs to be set to have categorical feature support. See :doc:`Categorical Data
         </tutorials/categorical>` and :ref:`cat-param` for details.
 
-    eval_metric: Optional[Union[str, List[str], Callable]]
+    multi_strategy: typing.Optional[str]
+
+        The strategy used for training multi-target models, including multi-target
+        regression and multi-class classification. See :doc:`/tutorials/multioutput` for
+        more information.
+
+        - ``one_output_per_tree``: One model for each target.
+        - ``multi_output_tree``: Use multi-target trees.
+
+    eval_metric: typing.Union[str, typing.List[str], typing.Callable, NoneType]
 
         Metric used for monitoring the training result and early stopping. It can be a
         string or list of strings as names of predefined metric in XGBoost (See
-        doc/parameter.rst), one of the metrics in :py:mod:`sklearn.metrics`, or any other
-        user defined metric that looks like `sklearn.metrics`.
+        doc/parameter.rst), one of the metrics in :py:mod:`sklearn.metrics`, or any
+        other user defined metric that looks like `sklearn.metrics`.
 
         If custom objective is also provided, then custom metric should implement the
         corresponding reverse link function.
 
         Unlike the `scoring` parameter commonly used in scikit-learn, when a callable
-        object is provided, it's assumed to be a cost function and by default XGBoost will
-        minimize the result during early stopping.
-
-        For advanced usage on Early stopping like directly choosing to maximize instead of
-        minimize, see :py:obj:`xgboost.callback.EarlyStopping`.
+        object is provided, it's assumed to be a cost function and by default XGBoost
+        will minimize the result during early stopping.
 
-        See :doc:`Custom Objective and Evaluation Metric </tutorials/custom_metric_obj>`
-        for more.
+        For advanced usage on Early stopping like directly choosing to maximize instead
+        of minimize, see :py:obj:`xgboost.callback.EarlyStopping`.
 
-        This parameter replaces `eval_metric` in :py:meth:`fit` method. The old one
-        receives un-transformed prediction regardless of whether custom objective is
-        being used.
+        See :doc:`/tutorials/custom_metric_obj` and :ref:`custom-obj-metric` for more
+        information.
 
         from sklearn.datasets import load_diabetes
         from sklearn.metrics import mean_absolute_error
@@ -278,24 +346,29 @@ class XGBClassifier(BaseTransformer):
         )
         reg.fit(X, y, eval_set=[(X, y)])
 
-    early_stopping_rounds: Optional[int]
+    early_stopping_rounds: typing.Optional[int]
 
-        Activates early stopping. Validation metric needs to improve at least once in
-        every **early_stopping_rounds** round(s) to continue training. Requires at least
-        one item in **eval_set** in :py:meth:`fit`.
+        - Activates early stopping. Validation metric needs to improve at least once in
+          every **early_stopping_rounds** round(s) to continue training. Requires at
+          least one item in **eval_set** in :py:meth:`fit`.
 
-        The method returns the model from the last iteration (not the best one). If
-        there's more than one item in **eval_set**, the last entry will be used for early
-        stopping. If there's more than one metric in **eval_metric**, the last metric
-        will be used for early stopping.
+        - If early stopping occurs, the model will have two additional attributes:
+          :py:attr:`best_score` and :py:attr:`best_iteration`. These are used by the
+          :py:meth:`predict` and :py:meth:`apply` methods to determine the optimal
+          number of trees during inference. If users want to access the full model
+          (including trees built after early stopping), they can specify the
+          `iteration_range` in these inference methods. In addition, other utilities
+          like model plotting can also use the entire model.
 
-        If early stopping occurs, the model will have three additional fields:
-        :py:attr:`best_score`, :py:attr:`best_iteration` and
-        :py:attr:`best_ntree_limit`.
+        - If you prefer to discard the trees after `best_iteration`, consider using the
+          callback function :py:class:`xgboost.callback.EarlyStopping`.
 
-        This parameter replaces `early_stopping_rounds` in :py:meth:`fit` method.
+        - If there's more than one item in **eval_set**, the last entry will be used for
+          early stopping. If there's more than one metric in **eval_metric**, the last
+          metric will be used for early stopping.
+
+    callbacks: typing.Optional[typing.List[xgboost.callback.TrainingCallback]]
 
-    callbacks: Optional[List[TrainingCallback]]
         List of callback functions that are applied at end of each iteration.
         It is possible to use predefined callbacks by using
         :ref:`Callback API <callback_api>`.
@@ -307,9 +380,11 @@ class XGBClassifier(BaseTransformer):
         for params in parameters_grid:
             # be sure to (re)initialize the callbacks before each run
             callbacks = [xgb.callback.LearningRateScheduler(custom_rates)]
-            xgboost.train(params, Xy, callbacks=callbacks)
+            reg = xgboost.XGBRegressor(**params, callbacks=callbacks)
+            reg.fit(X, y)
+
+    kwargs: typing.Optional[typing.Any]
 
-    kwargs: dict, optional
         Keyword arguments for XGBoost Booster object. Full documentation of parameters
         can be found :doc:`here </parameter>`.
         Attempting to set a parameter via the constructor args and \*\*kwargs
@@ -320,13 +395,16 @@ class XGBClassifier(BaseTransformer):
     with scikit-learn.
 
     A custom objective function can be provided for the ``objective``
-    parameter. In this case, it should have the signature
-    ``objective(y_true, y_pred) -> grad, hess``:
+    parameter. In this case, it should have the signature ``objective(y_true,
+    y_pred) -> [grad, hess]`` or ``objective(y_true, y_pred, *, sample_weight)
+    -> [grad, hess]``:
 
         y_true: array_like of shape [n_samples]
             The target values
         y_pred: array_like of shape [n_samples]
            The predicted values
+        sample_weight :
+            Optional sample weights.
 
         grad: array_like of shape [n_samples]
            The value of the gradient for each sample point.
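
To make the signature documented in the hunk above concrete, a minimal custom objective might look like the sketch below. It uses plain squared error, which suits a regressor rather than this class's binary:logistic default, and returns the per-sample gradient and hessian arrays XGBoost expects; the function name is illustrative:

    import numpy as np

    def squared_error_objective(y_true: np.ndarray, y_pred: np.ndarray):
        # Gradient and hessian of 0.5 * (y_pred - y_true) ** 2 with respect
        # to y_pred, evaluated per sample as described above.
        grad = y_pred - y_true
        hess = np.ones_like(y_pred)
        return grad, hess

    # e.g. passed at construction time: xgboost.XGBRegressor(objective=squared_error_objective)
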
@@ -338,7 +416,6 @@ class XGBClassifier(BaseTransformer):
         self,
         *,
         objective="binary:logistic",
-        use_label_encoder=None,
         input_cols: Optional[Union[str, Iterable[str]]] = None,
         output_cols: Optional[Union[str, Iterable[str]]] = None,
         label_cols: Optional[Union[str, Iterable[str]]] = None,
@@ -363,8 +440,7 @@ class XGBClassifier(BaseTransformer):
 
         self._deps = list(deps)
 
-        init_args = {'objective':(objective, "binary:logistic", False),
-                     'use_label_encoder':(use_label_encoder, None, False),}
+        init_args = {'objective':(objective, "binary:logistic", False),}
         cleaned_up_init_args = validate_sklearn_args(
             args=init_args,
             klass=xgboost.XGBClassifier
@@ -628,7 +704,7 @@ class XGBClassifier(BaseTransformer):
         elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
             expected_dtype = "array"
         else:
-            output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+            output_types = [signature.as_snowpark_type() for signature in _infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True)]
         # We can only infer the output types from the input types if the following two statemetns are true:
         # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
         # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
@@ -1289,7 +1365,7 @@ class XGBClassifier(BaseTransformer):
 
         PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
 
-        inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
+        inputs = list(_infer_signature(_truncate_data(dataset[self.input_cols], INFER_SIGNATURE_MAX_ROWS), "input", use_snowflake_identifiers=True))
         outputs: List[BaseFeatureSpec] = []
         if hasattr(self, "predict"):
             # keep mypy happy
@@ -1297,7 +1373,7 @@ class XGBClassifier(BaseTransformer):
             # For classifier, the type of predict is the same as the type of label
             if self._sklearn_object._estimator_type == "classifier":
                 # label columns is the desired type for output
-                outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
+                outputs = list(_infer_signature(_truncate_data(dataset[self.label_cols], INFER_SIGNATURE_MAX_ROWS), "output", use_snowflake_identifiers=True))
             # rename the output columns
             outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
             self._model_signature_dict["predict"] = ModelSignature(
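
Taken together, the xgb_classifier.py docstring changes track xgboost's move of eval_metric, early_stopping_rounds, and callbacks from fit() keyword arguments to constructor parameters. A short usage sketch against plain xgboost, with synthetic data for illustration only:

    import numpy as np
    import xgboost as xgb

    rng = np.random.default_rng(0)
    X = rng.normal(size=(200, 4))
    y = (X[:, 0] > 0).astype(int)

    clf = xgb.XGBClassifier(
        n_estimators=500,
        eval_metric="logloss",     # constructor-time, per the updated docstring
        early_stopping_rounds=10,  # requires at least one eval_set item in fit()
    )
    clf.fit(X, y, eval_set=[(X, y)])
    # Populated when early stopping is active, as documented above:
    print(clf.best_iteration, clf.best_score)
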