scikit-learn-intelex 2025.6.0__py39-none-manylinux_2_28_x86_64.whl → 2025.7.0__py39-none-manylinux_2_28_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (81)
  1. daal4py/_daal4py.cpython-39-x86_64-linux-gnu.so +0 -0
  2. daal4py/mb/__init__.py +2 -2
  3. daal4py/mb/gbt_convertors.py +258 -2
  4. daal4py/mb/tree_based_builders.py +30 -5
  5. daal4py/mpi_transceiver.cpython-39-x86_64-linux-gnu.so +0 -0
  6. daal4py/sklearn/cluster/dbscan.py +2 -2
  7. daal4py/sklearn/ensemble/_forest.py +2 -2
  8. daal4py/sklearn/linear_model/logistic_path.py +21 -7
  9. daal4py/sklearn/manifold/_t_sne.py +8 -2
  10. daal4py/sklearn/metrics/_pairwise.py +1 -1
  11. daal4py/sklearn/svm/svm.py +1 -1
  12. daal4py/sklearn/utils/validation.py +15 -16
  13. onedal/__init__.py +26 -1
  14. onedal/_config.py +5 -4
  15. onedal/_device_offload.py +72 -83
  16. onedal/_onedal_py_dpc.cpython-39-x86_64-linux-gnu.so +0 -0
  17. onedal/_onedal_py_host.cpython-39-x86_64-linux-gnu.so +0 -0
  18. onedal/_onedal_py_spmd_dpc.cpython-39-x86_64-linux-gnu.so +0 -0
  19. onedal/basic_statistics/basic_statistics.py +69 -5
  20. onedal/basic_statistics/incremental_basic_statistics.py +19 -19
  21. onedal/cluster/kmeans.py +17 -1
  22. onedal/common/_backend.py +62 -37
  23. onedal/common/hyperparameters.py +3 -0
  24. onedal/common/tests/test_sycl.py +1 -1
  25. onedal/covariance/covariance.py +2 -2
  26. onedal/covariance/incremental_covariance.py +8 -16
  27. onedal/datatypes/__init__.py +8 -1
  28. onedal/datatypes/_data_conversion.py +25 -32
  29. onedal/datatypes/_sycl_usm.py +78 -0
  30. onedal/datatypes/tests/common.py +8 -3
  31. onedal/datatypes/tests/test_data.py +45 -2
  32. onedal/decomposition/incremental_pca.py +8 -17
  33. onedal/decomposition/pca.py +6 -4
  34. onedal/ensemble/forest.py +13 -5
  35. onedal/linear_model/incremental_linear_model.py +34 -32
  36. onedal/linear_model/linear_model.py +22 -30
  37. onedal/linear_model/logistic_regression.py +9 -5
  38. onedal/primitives/kernel_functions.py +64 -17
  39. onedal/spmd/decomposition/incremental_pca.py +0 -6
  40. onedal/svm/svm.py +0 -12
  41. onedal/tests/test_common.py +1 -1
  42. onedal/tests/utils/_dataframes_support.py +23 -6
  43. onedal/tests/utils/_device_selection.py +1 -1
  44. onedal/utils/_array_api.py +28 -26
  45. onedal/utils/_sycl_queue_manager.py +57 -31
  46. onedal/utils/_third_party.py +170 -0
  47. onedal/utils/validation.py +11 -3
  48. {scikit_learn_intelex-2025.6.0.dist-info → scikit_learn_intelex-2025.7.0.dist-info}/METADATA +2 -2
  49. {scikit_learn_intelex-2025.6.0.dist-info → scikit_learn_intelex-2025.7.0.dist-info}/RECORD +80 -79
  50. sklearnex/_config.py +17 -8
  51. sklearnex/_device_offload.py +33 -23
  52. sklearnex/_utils.py +23 -1
  53. sklearnex/base.py +1 -1
  54. sklearnex/basic_statistics/basic_statistics.py +5 -8
  55. sklearnex/basic_statistics/incremental_basic_statistics.py +17 -13
  56. sklearnex/covariance/incremental_covariance.py +12 -5
  57. sklearnex/decomposition/pca.py +16 -9
  58. sklearnex/decomposition/tests/test_pca.py +58 -1
  59. sklearnex/dispatcher.py +12 -1
  60. sklearnex/ensemble/_forest.py +9 -3
  61. sklearnex/linear_model/incremental_linear.py +14 -5
  62. sklearnex/linear_model/incremental_ridge.py +14 -7
  63. sklearnex/linear_model/logistic_regression.py +3 -4
  64. sklearnex/linear_model/ridge.py +9 -0
  65. sklearnex/manifold/tests/test_tsne.py +1 -1
  66. sklearnex/neighbors/_lof.py +1 -1
  67. sklearnex/preview/covariance/tests/test_covariance.py +59 -6
  68. sklearnex/spmd/covariance/incremental_covariance.py +0 -8
  69. sklearnex/spmd/decomposition/incremental_pca.py +0 -7
  70. sklearnex/spmd/linear_model/incremental_linear_model.py +0 -7
  71. sklearnex/svm/_common.py +1 -1
  72. sklearnex/tests/test_common.py +41 -2
  73. sklearnex/tests/test_config.py +22 -0
  74. sklearnex/tests/test_memory_usage.py +0 -8
  75. sklearnex/tests/test_n_jobs_support.py +1 -1
  76. sklearnex/tests/utils/base.py +1 -1
  77. sklearnex/utils/validation.py +10 -10
  78. onedal/utils/_dpep_helpers.py +0 -71
  79. {scikit_learn_intelex-2025.6.0.dist-info → scikit_learn_intelex-2025.7.0.dist-info}/LICENSE.txt +0 -0
  80. {scikit_learn_intelex-2025.6.0.dist-info → scikit_learn_intelex-2025.7.0.dist-info}/WHEEL +0 -0
  81. {scikit_learn_intelex-2025.6.0.dist-info → scikit_learn_intelex-2025.7.0.dist-info}/top_level.txt +0 -0
daal4py/mb/__init__.py CHANGED
@@ -29,8 +29,8 @@ def convert_model(model) -> "GBTDAALModel | LogisticDAALModel":
     prediction methods.
 
     It supports gradient-boosted decision tree ensembles (GBT) from the libraries
-    ``xgboost``, ``lightgbm``, and ``catboost``; and logistic regression (binary
-    and multinomial) models from scikit-learn.
+    ``xgboost``, ``lightgbm``, ``catboost``, and ``treelite``; and logistic regression
+    (binary and multinomial) models from scikit-learn.
 
     See the documentation of the classes :obj:`daal4py.mb.GBTDAALModel` and
     :obj:`daal4py.mb.LogisticDAALModel` for more details.
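
Example (not part of the diff): a minimal sketch of the conversion entry point described above, using an XGBoost booster. The `predict` call assumes the converted object exposes the prediction methods the docstring mentions; data and parameters are illustrative.

    import numpy as np
    import xgboost as xgb

    import daal4py.mb as mb

    rng = np.random.default_rng(0)
    X = rng.standard_normal((200, 5))
    y = rng.integers(0, 2, size=200)

    booster = xgb.train(
        {"objective": "binary:logistic"},  # a GBT objective the converter supports
        xgb.DMatrix(X, label=y),
        num_boost_round=10,
    )
    daal_model = mb.convert_model(booster)  # -> GBTDAALModel
    print(daal_model.predict(X[:5]))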
daal4py/mb/gbt_convertors.py CHANGED
@@ -15,6 +15,7 @@
 # ===============================================================================
 
 import json
+import warnings
 from collections import deque
 from copy import deepcopy
 from tempfile import NamedTemporaryFile
@@ -197,6 +198,52 @@ class Node:
             right_child=right_child,
         )
 
+    @staticmethod
+    def from_treelite_dict(dict_all_nodes: list[dict[str, Any]], node_id: int) -> "Node":
+        this_node = dict_all_nodes[node_id]
+        is_leaf = "leaf_value" in this_node
+        default_left = this_node.get("default_left", False)
+
+        n_children = 0
+        if "left_child" in this_node:
+            left_child = Node.from_treelite_dict(dict_all_nodes, this_node["left_child"])
+            n_children += 1 + left_child.n_children
+        else:
+            left_child = None
+        if "right_child" in this_node:
+            right_child = Node.from_treelite_dict(
+                dict_all_nodes, this_node["right_child"]
+            )
+            n_children += 1 + right_child.n_children
+        else:
+            right_child = None
+
+        value = this_node["leaf_value"] if is_leaf else this_node["threshold"]
+        if not is_leaf:
+            comp = this_node["comparison_op"]
+            if comp == "<=":
+                value = float(np.nextafter(value, np.inf))
+            elif comp in [">", ">="]:
+                left_child, right_child = right_child, left_child
+                default_left = not default_left
+                if comp == ">":
+                    value = float(np.nextafter(value, -np.inf))
+            elif comp != "<":
+                raise TypeError(
+                    f"Model to convert contains unsupported split type: {comp}."
+                )
+
+        return Node(
+            cover=this_node.get("sum_hess", 0.0),
+            is_leaf=is_leaf,
+            default_left=default_left,
+            feature=this_node.get("split_feature_id"),
+            value=value,
+            n_children=n_children,
+            left_child=left_child,
+            right_child=right_child,
+        )
+
     def get_value_closest_float_downward(self) -> np.float64:
         """Get the closest exact fp value smaller than self.value"""
         return np.nextafter(np.single(self.value), np.single(-np.inf))
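
Example (not part of the diff): the flat node-dictionary shape consumed by `Node.from_treelite_dict` above, reconstructed from the keys the method reads; the concrete values and the import path are illustrative assumptions.

    # Keys mirror those accessed above ("comparison_op", "threshold",
    # "left_child", "right_child", "leaf_value", "sum_hess", ...).
    from daal4py.mb.gbt_convertors import Node  # module path per the file list

    nodes = [
        {   # node 0: internal "<" split on feature 2
            "node_id": 0,
            "split_feature_id": 2,
            "comparison_op": "<",
            "threshold": 0.5,
            "default_left": True,
            "left_child": 1,
            "right_child": 2,
            "sum_hess": 10.0,
        },
        {"node_id": 1, "leaf_value": -0.4, "sum_hess": 6.0},  # leaf
        {"node_id": 2, "leaf_value": 0.7, "sum_hess": 4.0},   # leaf
    ]

    root = Node.from_treelite_dict(nodes, 0)  # recursion starts at the root id
    print(root.n_children)  # -> 2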
@@ -310,6 +357,14 @@ class TreeList(list):
 
         return tl
 
+    @staticmethod
+    def from_treelite_dict(tl_json: Dict[str, Any]) -> "TreeList":
+        tl = TreeList()
+        for tree_id, tree_dict in enumerate(tl_json["trees"]):
+            root_node = Node.from_treelite_dict(tree_dict["nodes"], 0)
+            tl.append(TreeView(tree_id=tree_id, root_node=root_node))
+        return tl
+
     def __setitem__(self):
         raise NotImplementedError(
             "Use TreeList.from_*() methods to initialize a TreeList"
@@ -421,7 +476,9 @@ def get_gbt_model_from_lightgbm(model: Any, booster=None) -> Any:
     if "is_linear=1" in model_str:
         raise TypeError("Linear trees are not supported.")
     if "[boosting: dart]" in model_str:
-        raise TypeError("'Dart' booster is not supported.")
+        raise TypeError(
+            "'Dart' booster is not supported. Try converting to 'treelite' first."
+        )
     if "[boosting: rf]" in model_str:
         raise TypeError("Random forest boosters are not supported.")
     if ("[objective: lambdarank]" in model_str) or (
@@ -476,7 +533,9 @@ def get_gbt_model_from_xgboost(booster: Any, xgb_config=None) -> Any:
         xgb_config = get_xgboost_params(booster)
 
     if xgb_config["learner"]["learner_train_param"]["booster"] != "gbtree":
-        raise TypeError("Only 'gbtree' booster type is supported.")
+        raise TypeError(
+            "Only 'gbtree' booster type is supported. For DART, try converting to 'treelite' first."
+        )
 
     n_targets = xgb_config["learner"]["learner_model_param"].get("num_target")
     if n_targets is not None and int(n_targets) > 1:
@@ -920,3 +979,200 @@ def get_gbt_model_from_catboost(booster: Any) -> Any:
     if not add_intercept_to_each_node:
         intercept = booster.get_scale_and_bias()[1]
     return mb.model(base_score=intercept), shap_ready
+
+
+def get_gbt_model_from_treelite(
+    tl_model: "treelite.model.Model",
+) -> tuple[Any, int, int, bool]:
+    model_json = json.loads(tl_model.dump_as_json())
+    task_type = model_json["task_type"]
+    if task_type not in ["kBinaryClf", "kRegressor", "kMultiClf", "kIsolationForest"]:
+        raise TypeError(f"Model to convert is of unsupported type: {task_type}")
+    if model_json["num_target"] > 1:
+        raise TypeError("Multi-target models are not supported.")
+    if model_json["postprocessor"] == "multiclass_ova":
+        raise TypeError(
+            "Multi-class classification models that use One-Vs-All are not supported."
+        )
+    for tree in model_json["trees"]:
+        if tree["has_categorical_split"]:
+            raise TypeError("Models with categorical features are not supported.")
+    num_trees = tl_model.num_tree
+    if not num_trees:
+        raise TypeError("Model to convert contains no trees.")
+
+    # Note: the daal4py module always adds up the scores, but some models
+    # might average them instead. In such case, this turns the trees into
+    # additive ones by dividing the predictions by the number of nodes beforehand.
+    if model_json["average_tree_output"]:
+        divide_treelite_leaf_values_by_const(model_json, num_trees)
+
+    base_score = model_json["base_scores"]
+    num_class = model_json["num_class"][0]
+    num_feature = model_json["num_feature"]
+
+    if task_type == "kBinaryClf":
+        num_class = 2
+        if base_score:
+            base_score = list(1 / (1 + np.exp(-np.array(base_score))))
+
+    if num_class > 2:
+        shap_ready = False
+    else:
+        shap_ready = True
+        for tree in model_json["trees"]:
+            if not tree["nodes"][0].get("sum_hess", False):
+                shap_ready = False
+                break
+
+    # In the case of random forests for classification, it might work
+    # by averaging predictions without any link function, whereas
+    # daal4py assumes a logit link. In such case, it's not possible to
+    # convert them to daal4py's logic, but the model can still be used
+    # as a regressor that always outputs something between 0 and 1.
+    is_regression = "Clf" not in task_type
+    if not is_regression and model_json["postprocessor"] == "identity_multiclass":
+        is_regression = True
+        warnings.warn(
+            "Attempting to convert classification model which is not"
+            " based on gradient boosting. Will output a regression"
+            " model instead."
+        )
+
+    looks_like_random_forest = (
+        model_json["postprocessor"] == "identity_multiclass"
+        and len(model_json["base_scores"]) > 1
+        and task_type == "kMultiClf"
+    )
+    if looks_like_random_forest:
+        if num_class > 2 or len(base_score) > 2:
+            raise TypeError("Multi-class random forests are not supported.")
+        if len(model_json["num_class"]) > 1:
+            raise TypeError("Multi-output random forests are not supported.")
+        if len(base_score) == 2 and base_score[0]:
+            raise TypeError("Random forests with base scores are not supported.")
+
+    # In the case of binary random forests, it will always have leaf values
+    # for 2 classes, which is redundant as they sum to 1. daal4py requires
+    # only values for the positive class, so they need to be converted.
+    if looks_like_random_forest:
+        leave_only_last_treelite_leaf_value(model_json)
+        base_score = base_score[-1]
+
+    # In the case of multi-class classification models, if converted
+    # from xgboost, the order of the trees will be the same - i.e.
+    # sequences of one tree of each class, followed by another such
+    # sequence. But treelite could in theory also support building
+    # models where the trees are in a different order, in which case
+    # they will need to be reordered to match xgboost, since that's
+    # how daal4py handles them. And if there is an uneven number of
+    # trees per class, then will need to make up extra trees with
+    # zeros to accommodate it.
+    if task_type == "kMultiClf" and not looks_like_random_forest:
+        num_trees = len(model_json["trees"])
+        if (num_trees % num_class) != 0:
+            shap_ready = False
+            class_ids, num_trees_per_class = np.unique(
+                model_json["class_id"], return_counts=True
+            )
+            max_tree_per_class = num_trees_per_class.max()
+            num_tree_add_per_class = max_tree_per_class - num_trees_per_class
+            for class_ind in range(num_class):
+                for tree in range(num_tree_add_per_class[class_ind]):
+                    add_empty_tree_to_treelite_json(model_json, class_ind)
+
+        tree_class_orders = model_json["class_id"]
+        sequential_ids = np.arange(num_class)
+        num_trees = len(model_json["trees"])
+        assert (num_trees % num_class) == 0
+        if not np.array_equal(
+            tree_class_orders, np.tile(sequential_ids, int(num_trees / num_class))
+        ):
+            argsorted_class_indices = np.argsort(tree_class_orders)
+            per_class_indices = np.split(argsorted_class_indices, num_class)
+            correct_order = np.vstack(per_class_indices).reshape(-1, order="F")
+            model_json["trees"] = [model_json["trees"][ix] for ix in correct_order]
+            model_json["class_id"] = [model_json["class_id"][ix] for ix in correct_order]
+
+    # In the case of multi-class classification with base scores,
+    # since daal4py only supports scalar intercepts, this follows the
+    # same strategy as in catboost of dividing the intercepts equally
+    # among the number of trees
+    if task_type == "kMultiClf" and not looks_like_random_forest:
+        add_intercept_to_treelite_leafs(model_json, base_score)
+        base_score = None
+
+    if isinstance(base_score, list):
+        if len(base_score) == 1:
+            base_score = base_score[0]
+        else:
+            raise TypeError("Model to convert is malformed.")
+
+    tree_list = TreeList.from_treelite_dict(model_json)
+    return (
+        get_gbt_model_from_tree_list(
+            tree_list,
+            n_iterations=num_trees
+            / (
+                num_class
+                if task_type == "kMultiClf" and not looks_like_random_forest
+                else 1
+            ),
+            is_regression=is_regression,
+            n_features=num_feature,
+            n_classes=num_class,
+            base_score=base_score,
+        ),
+        num_class,
+        num_feature,
+        shap_ready,
+    )
+
+
+def divide_treelite_leaf_values_by_const(
+    tl_json: dict[str, Any], divisor: "int | float"
+) -> None:
+    for tree in tl_json["trees"]:
+        for node in tree["nodes"]:
+            if "leaf_value" in node:
+                if isinstance(node["leaf_value"], (list, tuple)):
+                    node["leaf_value"] = list(np.array(node["leaf_value"]) / divisor)
+                else:
+                    node["leaf_value"] /= divisor
+
+
+def leave_only_last_treelite_leaf_value(tl_json: dict[str, Any]) -> None:
+    for tree in tl_json["trees"]:
+        for node in tree["nodes"]:
+            if "leaf_value" in node:
+                assert len(node["leaf_value"]) == 2
+                node["leaf_value"] = node["leaf_value"][-1]
+
+
+def add_intercept_to_treelite_leafs(
+    tl_json: dict[str, Any], base_score: list[float]
+) -> None:
+    num_trees_per_class = len(tl_json["trees"]) / tl_json["num_class"][0]
+    for tree_index, tree in enumerate(tl_json["trees"]):
+        leaf_add = base_score[tl_json["class_id"][tree_index]] / num_trees_per_class
+        for node in tree["nodes"]:
+            if "leaf_value" in node:
+                node["leaf_value"] += leaf_add
+
+
+def add_empty_tree_to_treelite_json(tl_json: dict[str, Any], class_add: int) -> None:
+    tl_json["class_id"].append(class_add)
+    tl_json["trees"].append(
+        {
+            "num_nodes": 1,
+            "has_categorical_split": False,
+            "nodes": [
+                {
+                    "node_id": 0,
+                    "leaf_value": 0.0,
+                    "data_count": 0,
+                    "sum_hess": 0.0,
+                },
+            ],
+        }
+    )
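
Example (not part of the diff): the round trip the new converter enables. `treelite.sklearn.import_model` is the route the error messages above point to; the dataset, model size, and `predict` call are illustrative assumptions.

    import numpy as np
    import treelite
    from sklearn.ensemble import GradientBoostingRegressor

    import daal4py.mb as mb

    rng = np.random.default_rng(0)
    X = rng.standard_normal((200, 4))
    y = X @ rng.standard_normal(4)

    skl_model = GradientBoostingRegressor(n_estimators=20).fit(X, y)
    tl_model = treelite.sklearn.import_model(skl_model)  # treelite.model.Model
    daal_model = mb.convert_model(tl_model)              # dispatched to the treelite path
    print(daal_model.predict(X[:5]))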
daal4py/mb/tree_based_builders.py CHANGED
@@ -37,6 +37,7 @@ from .gbt_convertors import (
     get_catboost_params,
     get_gbt_model_from_catboost,
     get_gbt_model_from_lightgbm,
+    get_gbt_model_from_treelite,
     get_gbt_model_from_xgboost,
     get_lightgbm_params,
     get_xgboost_params,
@@ -63,7 +64,9 @@ def getFPType(X):
 
 class GBTDAALBaseModel:
     def __init__(self):
-        self.model_type: Optional[Literal["xgboost", "catboost", "lightgbm"]] = None
+        self.model_type: Optional[
+            Literal["xgboost", "catboost", "lightgbm", "treelite"]
+        ] = None
 
     @property
     def _is_regression(self):
@@ -86,6 +89,8 @@ class GBTDAALBaseModel:
         if self.n_classes_ <= 2:
             if objective_fun in ["binary:logistic", "binary:logitraw"]:
                 self.n_classes_ = 2
+            elif self.n_classes_ == 0:
+                self.n_classes_ = 1
 
         self.n_features_in_ = int(params["learner"]["learner_model_param"]["num_feature"])
 
@@ -113,6 +118,11 @@ class GBTDAALBaseModel:
         self.daal_model_, self.supports_shap_ = get_gbt_model_from_catboost(booster)
         self._get_params_from_catboost(catboost_params)
 
+    def _convert_model_from_treelite(self, tl_model):
+        self.daal_model_, self.n_classes_, self.n_features_in_, self.supports_shap_ = (
+            get_gbt_model_from_treelite(tl_model)
+        )
+
     def _convert_model(self, model):
         (submodule_name, class_name) = (
             model.__class__.__module__,
@@ -147,6 +157,14 @@ class GBTDAALBaseModel:
         # Build GBTDAALModel from CatBoost
         elif (submodule_name, class_name) == ("catboost.core", "CatBoost"):
             self._convert_model_from_catboost(model)
+        elif (submodule_name, class_name) == ("treelite.model", "Model"):
+            self._convert_model_from_treelite(model)
+        elif submodule_name.startswith("sklearn.ensemble"):
+            raise TypeError(
+                "Cannot convert scikit-learn models. Try converting to treelite "
+                "with 'treelite.sklearn.import_model' and then converting the "
+                "resulting TreeLite object."
+            )
         else:
             raise TypeError(f"Unknown model format {submodule_name}.{class_name}")
 
@@ -303,14 +321,21 @@ class GBTDAALModel(GBTDAALBaseModel):
 
     Can be created from model objects that meet all of the following criteria:
 
-    - Were produced from one of the following libraries: ``xgboost``, ``lightgbm``, or ``catboost``.
-      It can work with either the base booster classes of those libraries or with their
-      scikit-learn-compatible classes.
+    - Were produced from one of the following libraries: ``xgboost``, ``lightgbm``, ``catboost``,
+      or ``treelite`` (with some limitations). It can work with either the base booster classes
+      of those libraries or with their scikit-learn-compatible classes.
     - Do not use categorical features.
     - Are for regression or classification (e.g. no ranking). In the case of XGBoost objective
      ``binary:logitraw``, it will create a classification model out of it, and in the case of
       objective ``reg:logistic``, will create a regression model.
     - Are not multi-output models. Note that multi-class classification **is** supported.
+    - Are not multi-class random forests (multi-class gradient boosters are supported).
+
+    Note that while models from packages such as scikit-learn are not supported directly,
+    they can still be converted to this class by first converting them to TreeLite and
+    then converting to :obj:`GBTDAALModel` from that TreeLite model. In such case, note that
+    models corresponding to random forest binary classifiers will be treated as regressors
+    that predict probabilities.
 
     Parameters
     ----------
@@ -330,7 +355,7 @@ class GBTDAALModel(GBTDAALBaseModel):
 
     def __init__(self, model):
         self._convert_model(model)
-        for type_str in ("xgboost", "lightgbm", "catboost"):
+        for type_str in ("xgboost", "lightgbm", "catboost", "treelite"):
             if type_str in str(type(model)):
                 self.model_type = type_str
                 break
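
Example (not part of the diff): a sketch of the new error path for raw scikit-learn ensembles; the exact behavior is an assumption based on the dispatch shown above.

    from sklearn.ensemble import RandomForestClassifier

    import daal4py.mb as mb

    rf = RandomForestClassifier(n_estimators=2).fit([[0.0], [1.0]], [0, 1])
    try:
        mb.convert_model(rf)
    except TypeError as err:
        print(err)  # points to treelite.sklearn.import_model as the supported route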
daal4py/sklearn/cluster/dbscan.py CHANGED
@@ -36,10 +36,10 @@ def _daal_dbscan(X, eps=0.5, min_samples=5, sample_weight=None):
     ww = make2d(sample_weight) if sample_weight is not None else None
     XX = make2d(X)
 
-    fpt = getFPType(XX)
+    fpt = getFPType(XX)  # codespell:ignore fpt
     alg = daal4py.dbscan(
         method="defaultDense",
-        fptype=fpt,
+        fptype=fpt,  # codespell:ignore fpt
         epsilon=float(eps),
         minObservations=int(min_samples),
         memorySavingMode=False,
daal4py/sklearn/ensemble/_forest.py CHANGED
@@ -679,8 +679,8 @@ class RandomForestClassifier(RandomForestClassifier_original, RandomForestBase):
         dfc_predictionResult = dfc_algorithm.compute(X, self.daal_model_)
 
         pred = dfc_predictionResult.probabilities
-
-        return pred
+        # TODO: fix probabilities out of [0, 1] interval on oneDAL side
+        return pred.clip(0.0, 1.0)
 
     def _daal_fit_classifier(self, X, y, sample_weight=None):
         y = check_array(y, ensure_2d=False, dtype=None)
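
Example (not part of the diff): what the added clip does. `ndarray.clip` bounds every element to the given interval, so probabilities that drift marginally outside [0, 1] through floating-point accumulation become valid again; the sample values are illustrative.

    import numpy as np

    pred = np.array([[-1e-7, 1.0000001], [0.3, 0.7]])
    print(pred.clip(0.0, 1.0))  # [[0.  1. ]
                                #  [0.3 0.7]]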
daal4py/sklearn/linear_model/logistic_path.py CHANGED
@@ -359,14 +359,16 @@ def __logistic_regression_path(
         y_bin = np.ones(y.shape, dtype=X.dtype)
         # for compute_class_weight
 
-        if solver in ["lbfgs", "newton-cg"]:
+        if solver == "liblinear" or (
+            not sklearn_check_version("1.6") and solver not in ["lbfgs", "newton-cg"]
+        ):
+            mask_classes = np.array([-1, 1])
+            y_bin[~mask] = -1.0
+        else:
             # HalfBinomialLoss, used for those solvers, represents y in [0, 1] instead
             # of in [-1, 1].
             mask_classes = np.array([0, 1])
             y_bin[~mask] = 0.0
-        else:
-            mask_classes = np.array([-1, 1])
-            y_bin[~mask] = -1.0
     else:
         mask_classes = np.array([-1, 1])
         mask = y == pos_class
@@ -388,7 +390,11 @@ def __logistic_regression_path(
 
     else:
         if sklearn_check_version("1.1"):
-            if solver in ["sag", "saga", "lbfgs", "newton-cg"]:
+            if sklearn_check_version("1.6"):
+                solver_list = ["sag", "saga", "lbfgs", "newton-cg", "newton-cholesky"]
+            else:
+                solver_list = ["sag", "saga", "lbfgs", "newton-cg"]
+            if solver in solver_list:
                 # SAG, lbfgs and newton-cg multinomial solvers need LabelEncoder,
                 # not LabelBinarizer, i.e. y as a 1d-array of integers.
                 # LabelEncoder also saves memory compared to LabelBinarizer, especially
@@ -488,7 +494,11 @@ def __logistic_regression_path(
 
     if multi_class == "multinomial":
         # fmin_l_bfgs_b and newton-cg accepts only ravelled parameters.
-        if solver in ["lbfgs", "newton-cg"]:
+        if sklearn_check_version("1.6"):
+            solver_list = ["lbfgs", "newton-cg", "newton-cholesky"]
+        else:
+            solver_list = ["lbfgs", "newton-cg"]
+        if solver in solver_list:
            if _dal_ready and classes.size == 2:
                 w0 = w0[-1:, :]
             if sklearn_check_version("1.1"):
@@ -753,7 +763,11 @@ def __logistic_regression_path(
     else:
         n_classes = max(2, classes.size)
         if sklearn_check_version("1.1"):
-            if solver in ["lbfgs", "newton-cg"]:
+            if sklearn_check_version("1.6"):
+                solver_list = ["lbfgs", "newton-cg", "newton-cholesky"]
+            else:
+                solver_list = ["lbfgs", "newton-cg"]
+            if solver in solver_list:
                 multi_w0 = np.reshape(w0, (n_classes, -1), order="F")
             else:
                 multi_w0 = w0
daal4py/sklearn/manifold/_t_sne.py CHANGED
@@ -66,7 +66,13 @@ class TSNE(BaseTSNE):
             [n_samples],
             [P.nnz],
             [self.n_iter_without_progress],
-            [self._max_iter if sklearn_check_version("1.5") else self.n_iter],
+            [
+                (
+                    self.max_iter
+                    if sklearn_check_version("1.7")
+                    else (self._max_iter if sklearn_check_version("1.5") else self.n_iter)
+                )
+            ],
         ]
 
         # Pass params to daal4py backend
@@ -130,7 +136,7 @@ class TSNE(BaseTSNE):
 
         if isinstance(self._init, str) and self._init == "pca" and issparse(X):
             raise TypeError(
-                "PCA initialization is currently not suported "
+                "PCA initialization is currently not supported "
                 "with the sparse input matrix. Use "
                 'init="random" instead.'
             )
daal4py/sklearn/metrics/_pairwise.py CHANGED
@@ -18,6 +18,7 @@ import warnings
 from functools import partial
 
 import numpy as np
+from joblib import effective_n_jobs
 from sklearn.exceptions import DataConversionWarning
 from sklearn.metrics import pairwise_distances as pairwise_distances_original
 from sklearn.metrics.pairwise import (
@@ -28,7 +29,6 @@ from sklearn.metrics.pairwise import (
     _parallel_pairwise,
     check_pairwise_arrays,
 )
-from sklearn.utils._joblib import effective_n_jobs
 from sklearn.utils.validation import check_non_negative
 
 try:
daal4py/sklearn/svm/svm.py CHANGED
@@ -158,7 +158,7 @@ def _daal4py_kf(kernel, X_fptype, gamma=1.0, is_sparse=False):
         kf = daal4py.kernel_function_linear(fptype=X_fptype, method=method)
     else:
         raise ValueError(
-            "_daal4py_fit received unexpected kernel specifiction {}.".format(kernel)
+            "_daal4py_fit received unexpected kernel specification {}.".format(kernel)
         )
 
     return kf
daal4py/sklearn/utils/validation.py CHANGED
@@ -72,25 +72,24 @@ def _assert_all_finite(
 
     # Data with small size has too big relative overhead
     # TODO: tune threshold size
-    if hasattr(X, "size"):
-        if X.size < 32768:
-            if sklearn_check_version("1.1"):
-                _sklearn_assert_all_finite(
-                    X,
-                    allow_nan=allow_nan,
-                    msg_dtype=msg_dtype,
-                    estimator_name=estimator_name,
-                    input_name=input_name,
-                )
-            else:
-                _sklearn_assert_all_finite(X, allow_nan=allow_nan, msg_dtype=msg_dtype)
-            return
-
     is_df = is_DataFrame(X)
+    if not (is_df or isinstance(X, np.ndarray)) or X.size < 32768:
+        if sklearn_check_version("1.1"):
+            _sklearn_assert_all_finite(
+                X,
+                allow_nan=allow_nan,
+                msg_dtype=msg_dtype,
+                estimator_name=estimator_name,
+                input_name=input_name,
+            )
+        else:
+            _sklearn_assert_all_finite(X, allow_nan=allow_nan, msg_dtype=msg_dtype)
+        return
+
     num_of_types = get_number_of_types(X)
 
     # if X is heterogeneous pandas.DataFrame then
-    # covert it to a list of arrays
+    # convert it to a list of arrays
     if is_df and num_of_types > 1:
         lst = []
         for idx in X:
@@ -330,7 +329,7 @@ def _daal_check_array(
     has_pd_integer_array = False
     if hasattr(array, "dtypes") and hasattr(array.dtypes, "__array__"):
         # throw warning if columns are sparse. If all columns are sparse, then
-        # array.sparse exists and sparsity will be perserved (later).
+        # array.sparse exists and sparsity will be preserved (later).
         with suppress(ImportError):
             from pandas import SparseDtype
 
onedal/__init__.py CHANGED
@@ -21,9 +21,34 @@ from daal4py.sklearn._utils import daal_check_version
 
 
 class Backend:
-    """Encapsulates the oneDAL pybind11 modules and provides a unified interface to it together with additional properties about dpc/spmd policies"""
 
     def __init__(self, backend_module, is_dpc, is_spmd):
+        """A unified interface to an available oneDAL pybind11 module.
+
+        This class encapsulates a oneDAL pybind11 module allowing for
+        dynamic access of module objects. This simplifies method and
+        attribute access in sklearnex without aliasing in sys.modules.
+        It contains additional attributes for inspection of the pybind11
+        module type (i.e. dpc or spmd) for use in policy creation.
+
+        Parameters
+        ----------
+        backend_module : oneDAL pybind11 module
+            Pybind11 module to be encapsulated.
+
+        is_dpc : bool
+            Flag describing if the module is Data Parallel C++-enabled.
+
+        is_spmd : bool
+            Flag describing if the module is single program, multiple
+            data enabled.
+
+        Returns
+        -------
+        self : Backend
+            Encapsulated oneDAL pybind11 module.
+        """
+
         self.backend = backend_module
         self.is_dpc = is_dpc
         self.is_spmd = is_spmd
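
Example (not part of the diff): a generic, hypothetical sketch of the encapsulation pattern the new docstring describes, i.e. wrapping a module so attribute lookups are forwarded dynamically with capability flags kept alongside. The forwarding shown here is an assumption for illustration, not the actual onedal implementation.

    class ModuleWrapper:
        def __init__(self, module, is_dpc, is_spmd):
            self.backend = module
            self.is_dpc = is_dpc
            self.is_spmd = is_spmd

        def __getattr__(self, name):
            # Only called for names not found on the wrapper itself;
            # fall through to the wrapped pybind11 module.
            return getattr(self.backend, name)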
onedal/_config.py CHANGED
@@ -59,13 +59,14 @@ def _get_onedal_threadlocal_config():
 
 
 def _get_config(copy=True):
-    """Retrieve current values for configuration set
-    by :func:`sklearnex.set_config`
+    """Retrieve current configuration set by :func:`sklearnex.set_config`
+
     Parameters
     ----------
     copy : bool, default=True
-        If False, a mutable view of the configuration is returned. Each thread
-        has a separate copy of the configuration.
+        If 'False', a mutable view of the configuration is returned. Each
+        thread has a separate copy of the configuration.
+
     Returns
     -------
     config : dict
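
Example (not part of the diff): the public API this private helper backs. A short, hedged sketch of reading and scoping configuration through sklearnex; the option names shown are assumptions about the current set of config keys.

    from sklearnex import config_context, get_config, set_config

    print(get_config())  # copy=True behavior: a snapshot dict of current settings

    with config_context(target_offload="auto"):
        pass  # configuration change scoped to this block (per thread)

    set_config(allow_fallback_to_host=False)  # thread-local, persistent change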