scikit-learn-intelex 2025.6.0__py39-none-manylinux_2_28_x86_64.whl → 2025.7.0__py39-none-manylinux_2_28_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- daal4py/_daal4py.cpython-39-x86_64-linux-gnu.so +0 -0
- daal4py/mb/__init__.py +2 -2
- daal4py/mb/gbt_convertors.py +258 -2
- daal4py/mb/tree_based_builders.py +30 -5
- daal4py/mpi_transceiver.cpython-39-x86_64-linux-gnu.so +0 -0
- daal4py/sklearn/cluster/dbscan.py +2 -2
- daal4py/sklearn/ensemble/_forest.py +2 -2
- daal4py/sklearn/linear_model/logistic_path.py +21 -7
- daal4py/sklearn/manifold/_t_sne.py +8 -2
- daal4py/sklearn/metrics/_pairwise.py +1 -1
- daal4py/sklearn/svm/svm.py +1 -1
- daal4py/sklearn/utils/validation.py +15 -16
- onedal/__init__.py +26 -1
- onedal/_config.py +5 -4
- onedal/_device_offload.py +72 -83
- onedal/_onedal_py_dpc.cpython-39-x86_64-linux-gnu.so +0 -0
- onedal/_onedal_py_host.cpython-39-x86_64-linux-gnu.so +0 -0
- onedal/_onedal_py_spmd_dpc.cpython-39-x86_64-linux-gnu.so +0 -0
- onedal/basic_statistics/basic_statistics.py +69 -5
- onedal/basic_statistics/incremental_basic_statistics.py +19 -19
- onedal/cluster/kmeans.py +17 -1
- onedal/common/_backend.py +62 -37
- onedal/common/hyperparameters.py +3 -0
- onedal/common/tests/test_sycl.py +1 -1
- onedal/covariance/covariance.py +2 -2
- onedal/covariance/incremental_covariance.py +8 -16
- onedal/datatypes/__init__.py +8 -1
- onedal/datatypes/_data_conversion.py +25 -32
- onedal/datatypes/_sycl_usm.py +78 -0
- onedal/datatypes/tests/common.py +8 -3
- onedal/datatypes/tests/test_data.py +45 -2
- onedal/decomposition/incremental_pca.py +8 -17
- onedal/decomposition/pca.py +6 -4
- onedal/ensemble/forest.py +13 -5
- onedal/linear_model/incremental_linear_model.py +34 -32
- onedal/linear_model/linear_model.py +22 -30
- onedal/linear_model/logistic_regression.py +9 -5
- onedal/primitives/kernel_functions.py +64 -17
- onedal/spmd/decomposition/incremental_pca.py +0 -6
- onedal/svm/svm.py +0 -12
- onedal/tests/test_common.py +1 -1
- onedal/tests/utils/_dataframes_support.py +23 -6
- onedal/tests/utils/_device_selection.py +1 -1
- onedal/utils/_array_api.py +28 -26
- onedal/utils/_sycl_queue_manager.py +57 -31
- onedal/utils/_third_party.py +170 -0
- onedal/utils/validation.py +11 -3
- {scikit_learn_intelex-2025.6.0.dist-info → scikit_learn_intelex-2025.7.0.dist-info}/METADATA +2 -2
- {scikit_learn_intelex-2025.6.0.dist-info → scikit_learn_intelex-2025.7.0.dist-info}/RECORD +80 -79
- sklearnex/_config.py +17 -8
- sklearnex/_device_offload.py +33 -23
- sklearnex/_utils.py +23 -1
- sklearnex/base.py +1 -1
- sklearnex/basic_statistics/basic_statistics.py +5 -8
- sklearnex/basic_statistics/incremental_basic_statistics.py +17 -13
- sklearnex/covariance/incremental_covariance.py +12 -5
- sklearnex/decomposition/pca.py +16 -9
- sklearnex/decomposition/tests/test_pca.py +58 -1
- sklearnex/dispatcher.py +12 -1
- sklearnex/ensemble/_forest.py +9 -3
- sklearnex/linear_model/incremental_linear.py +14 -5
- sklearnex/linear_model/incremental_ridge.py +14 -7
- sklearnex/linear_model/logistic_regression.py +3 -4
- sklearnex/linear_model/ridge.py +9 -0
- sklearnex/manifold/tests/test_tsne.py +1 -1
- sklearnex/neighbors/_lof.py +1 -1
- sklearnex/preview/covariance/tests/test_covariance.py +59 -6
- sklearnex/spmd/covariance/incremental_covariance.py +0 -8
- sklearnex/spmd/decomposition/incremental_pca.py +0 -7
- sklearnex/spmd/linear_model/incremental_linear_model.py +0 -7
- sklearnex/svm/_common.py +1 -1
- sklearnex/tests/test_common.py +41 -2
- sklearnex/tests/test_config.py +22 -0
- sklearnex/tests/test_memory_usage.py +0 -8
- sklearnex/tests/test_n_jobs_support.py +1 -1
- sklearnex/tests/utils/base.py +1 -1
- sklearnex/utils/validation.py +10 -10
- onedal/utils/_dpep_helpers.py +0 -71
- {scikit_learn_intelex-2025.6.0.dist-info → scikit_learn_intelex-2025.7.0.dist-info}/LICENSE.txt +0 -0
- {scikit_learn_intelex-2025.6.0.dist-info → scikit_learn_intelex-2025.7.0.dist-info}/WHEEL +0 -0
- {scikit_learn_intelex-2025.6.0.dist-info → scikit_learn_intelex-2025.7.0.dist-info}/top_level.txt +0 -0
Binary file
daal4py/mb/__init__.py
CHANGED

@@ -29,8 +29,8 @@ def convert_model(model) -> "GBTDAALModel | LogisticDAALModel":
     prediction methods.

     It supports gradient-boosted decision tree ensembles (GBT) from the libraries
-    ``xgboost``, ``lightgbm``, and ``
-    and multinomial) models from scikit-learn.
+    ``xgboost``, ``lightgbm``, ``catboost``, and ``treelite``; and logistic regression
+    (binary and multinomial) models from scikit-learn.

     See the documentation of the classes :obj:`daal4py.mb.GBTDAALModel` and
     :obj:`daal4py.mb.LogisticDAALModel` for more details.
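The docstring above now lists ``catboost`` and ``treelite`` among the sources accepted by ``convert_model``. A rough, hedged sketch of a typical call follows; the toy data, estimator parameters, and the ``predict`` call are illustrative assumptions, not taken from this diff.

import numpy as np
import xgboost as xgb

import daal4py.mb as mb

# Toy data, illustration only.
X = np.random.rand(100, 5)
y = np.random.rand(100)

booster = xgb.XGBRegressor(n_estimators=10, max_depth=3).fit(X, y)

# convert_model dispatches on the input type; a GBT booster is expected to
# come back as a GBTDAALModel exposing prediction methods.
daal_model = mb.convert_model(booster)
print(daal_model.predict(X[:5]))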
daal4py/mb/gbt_convertors.py
CHANGED

@@ -15,6 +15,7 @@
 # ===============================================================================

 import json
+import warnings
 from collections import deque
 from copy import deepcopy
 from tempfile import NamedTemporaryFile

@@ -197,6 +198,52 @@ class Node:
             right_child=right_child,
         )

+    @staticmethod
+    def from_treelite_dict(dict_all_nodes: list[dict[str, Any]], node_id: int) -> "Node":
+        this_node = dict_all_nodes[node_id]
+        is_leaf = "leaf_value" in this_node
+        default_left = this_node.get("default_left", False)
+
+        n_children = 0
+        if "left_child" in this_node:
+            left_child = Node.from_treelite_dict(dict_all_nodes, this_node["left_child"])
+            n_children += 1 + left_child.n_children
+        else:
+            left_child = None
+        if "right_child" in this_node:
+            right_child = Node.from_treelite_dict(
+                dict_all_nodes, this_node["right_child"]
+            )
+            n_children += 1 + right_child.n_children
+        else:
+            right_child = None
+
+        value = this_node["leaf_value"] if is_leaf else this_node["threshold"]
+        if not is_leaf:
+            comp = this_node["comparison_op"]
+            if comp == "<=":
+                value = float(np.nextafter(value, np.inf))
+            elif comp in [">", ">="]:
+                left_child, right_child = right_child, left_child
+                default_left = not default_left
+                if comp == ">":
+                    value = float(np.nextafter(value, -np.inf))
+            elif comp != "<":
+                raise TypeError(
+                    f"Model to convert contains unsupported split type: {comp}."
+                )
+
+        return Node(
+            cover=this_node.get("sum_hess", 0.0),
+            is_leaf=is_leaf,
+            default_left=default_left,
+            feature=this_node.get("split_feature_id"),
+            value=value,
+            n_children=n_children,
+            left_child=left_child,
+            right_child=right_child,
+        )
+
     def get_value_closest_float_downward(self) -> np.float64:
         """Get the closest exact fp value smaller than self.value"""
         return np.nextafter(np.single(self.value), np.single(-np.inf))

@@ -310,6 +357,14 @@ class TreeList(list):

         return tl

+    @staticmethod
+    def from_treelite_dict(tl_json: Dict[str, Any]) -> "TreeList":
+        tl = TreeList()
+        for tree_id, tree_dict in enumerate(tl_json["trees"]):
+            root_node = Node.from_treelite_dict(tree_dict["nodes"], 0)
+            tl.append(TreeView(tree_id=tree_id, root_node=root_node))
+        return tl
+
     def __setitem__(self):
         raise NotImplementedError(
             "Use TreeList.from_*() methods to initialize a TreeList"

@@ -421,7 +476,9 @@ def get_gbt_model_from_lightgbm(model: Any, booster=None) -> Any:
     if "is_linear=1" in model_str:
         raise TypeError("Linear trees are not supported.")
     if "[boosting: dart]" in model_str:
-        raise TypeError(
+        raise TypeError(
+            "'Dart' booster is not supported. Try converting to 'treelite' first."
+        )
     if "[boosting: rf]" in model_str:
         raise TypeError("Random forest boosters are not supported.")
     if ("[objective: lambdarank]" in model_str) or (

@@ -476,7 +533,9 @@ def get_gbt_model_from_xgboost(booster: Any, xgb_config=None) -> Any:
         xgb_config = get_xgboost_params(booster)

     if xgb_config["learner"]["learner_train_param"]["booster"] != "gbtree":
-        raise TypeError(
+        raise TypeError(
+            "Only 'gbtree' booster type is supported. For DART, try converting to 'treelite' first."
+        )

     n_targets = xgb_config["learner"]["learner_model_param"].get("num_target")
     if n_targets is not None and int(n_targets) > 1:

@@ -920,3 +979,200 @@ def get_gbt_model_from_catboost(booster: Any) -> Any:
     if not add_intercept_to_each_node:
         intercept = booster.get_scale_and_bias()[1]
     return mb.model(base_score=intercept), shap_ready
+
+
+def get_gbt_model_from_treelite(
+    tl_model: "treelite.model.Model",
+) -> tuple[Any, int, int, bool]:
+    model_json = json.loads(tl_model.dump_as_json())
+    task_type = model_json["task_type"]
+    if task_type not in ["kBinaryClf", "kRegressor", "kMultiClf", "kIsolationForest"]:
+        raise TypeError(f"Model to convert is of unsupported type: {task_type}")
+    if model_json["num_target"] > 1:
+        raise TypeError("Multi-target models are not supported.")
+    if model_json["postprocessor"] == "multiclass_ova":
+        raise TypeError(
+            "Multi-class classification models that use One-Vs-All are not supported."
+        )
+    for tree in model_json["trees"]:
+        if tree["has_categorical_split"]:
+            raise TypeError("Models with categorical features are not supported.")
+    num_trees = tl_model.num_tree
+    if not num_trees:
+        raise TypeError("Model to convert contains no trees.")
+
+    # Note: the daal4py module always adds up the scores, but some models
+    # might average them instead. In such case, this turns the trees into
+    # additive ones by dividing the predictions by the number of nodes beforehand.
+    if model_json["average_tree_output"]:
+        divide_treelite_leaf_values_by_const(model_json, num_trees)
+
+    base_score = model_json["base_scores"]
+    num_class = model_json["num_class"][0]
+    num_feature = model_json["num_feature"]
+
+    if task_type == "kBinaryClf":
+        num_class = 2
+        if base_score:
+            base_score = list(1 / (1 + np.exp(-np.array(base_score))))
+
+    if num_class > 2:
+        shap_ready = False
+    else:
+        shap_ready = True
+        for tree in model_json["trees"]:
+            if not tree["nodes"][0].get("sum_hess", False):
+                shap_ready = False
+                break
+
+    # In the case of random forests for classification, it might work
+    # by averaging predictions without any link function, whereas
+    # daal4py assumes a logit link. In such case, it's not possible to
+    # convert them to daal4py's logic, but the model can still be used
+    # as a regressor that always outputs something between 0 and 1.
+    is_regression = "Clf" not in task_type
+    if not is_regression and model_json["postprocessor"] == "identity_multiclass":
+        is_regression = True
+        warnings.warn(
+            "Attempting to convert classification model which is not"
+            " based on gradient boosting. Will output a regression"
+            " model instead."
+        )
+
+    looks_like_random_forest = (
+        model_json["postprocessor"] == "identity_multiclass"
+        and len(model_json["base_scores"]) > 1
+        and task_type == "kMultiClf"
+    )
+    if looks_like_random_forest:
+        if num_class > 2 or len(base_score) > 2:
+            raise TypeError("Multi-class random forests are not supported.")
+        if len(model_json["num_class"]) > 1:
+            raise TypeError("Multi-output random forests are not supported.")
+        if len(base_score) == 2 and base_score[0]:
+            raise TypeError("Random forests with base scores are not supported.")
+
+    # In the case of binary random forests, it will always have leaf values
+    # for 2 classes, which is redundant as they sum to 1. daal4py requires
+    # only values for the positive class, so they need to be converted.
+    if looks_like_random_forest:
+        leave_only_last_treelite_leaf_value(model_json)
+        base_score = base_score[-1]
+
+    # In the case of multi-class classification models, if converted
+    # from xgboost, the order of the trees will be the same - i.e.
+    # sequences of one tree of each class, followed by another such
+    # sequence. But treelite could in theory also support building
+    # models where the trees are in a different order, in which case
+    # they will need to be reordered to match xgboost, since that's
+    # how daal4py handles them. And if there is an uneven number of
+    # trees per class, then will need to make up extra trees with
+    # zeros to accommodate it.
+    if task_type == "kMultiClf" and not looks_like_random_forest:
+        num_trees = len(model_json["trees"])
+        if (num_trees % num_class) != 0:
+            shap_ready = False
+            class_ids, num_trees_per_class = np.unique(
+                model_json["class_id"], return_counts=True
+            )
+            max_tree_per_class = num_trees_per_class.max()
+            num_tree_add_per_class = max_tree_per_class - num_trees_per_class
+            for class_ind in range(num_class):
+                for tree in range(num_tree_add_per_class[class_ind]):
+                    add_empty_tree_to_treelite_json(model_json, class_ind)

+        tree_class_orders = model_json["class_id"]
+        sequential_ids = np.arange(num_class)
+        num_trees = len(model_json["trees"])
+        assert (num_trees % num_class) == 0
+        if not np.array_equal(
+            tree_class_orders, np.tile(sequential_ids, int(num_trees / num_class))
+        ):
+            argsorted_class_indices = np.argsort(tree_class_orders)
+            per_class_indices = np.split(argsorted_class_indices, num_class)
+            correct_order = np.vstack(per_class_indices).reshape(-1, order="F")
+            model_json["trees"] = [model_json["trees"][ix] for ix in correct_order]
+            model_json["class_id"] = [model_json["class_id"][ix] for ix in correct_order]
+
+    # In the case of multi-class classification with base scores,
+    # since daal4py only supports scalar intercepts, this follows the
+    # same strategy as in catboost of dividing the intercepts equally
+    # among the number of trees
+    if task_type == "kMultiClf" and not looks_like_random_forest:
+        add_intercept_to_treelite_leafs(model_json, base_score)
+        base_score = None
+
+    if isinstance(base_score, list):
+        if len(base_score) == 1:
+            base_score = base_score[0]
+        else:
+            raise TypeError("Model to convert is malformed.")
+
+    tree_list = TreeList.from_treelite_dict(model_json)
+    return (
+        get_gbt_model_from_tree_list(
+            tree_list,
+            n_iterations=num_trees
+            / (
+                num_class
+                if task_type == "kMultiClf" and not looks_like_random_forest
+                else 1
+            ),
+            is_regression=is_regression,
+            n_features=num_feature,
+            n_classes=num_class,
+            base_score=base_score,
+        ),
+        num_class,
+        num_feature,
+        shap_ready,
+    )
+
+
+def divide_treelite_leaf_values_by_const(
+    tl_json: dict[str, Any], divisor: "int | float"
+) -> None:
+    for tree in tl_json["trees"]:
+        for node in tree["nodes"]:
+            if "leaf_value" in node:
+                if isinstance(node["leaf_value"], (list, tuple)):
+                    node["leaf_value"] = list(np.array(node["leaf_value"]) / divisor)
+                else:
+                    node["leaf_value"] /= divisor
+
+
+def leave_only_last_treelite_leaf_value(tl_json: dict[str, Any]) -> None:
+    for tree in tl_json["trees"]:
+        for node in tree["nodes"]:
+            if "leaf_value" in node:
+                assert len(node["leaf_value"]) == 2
+                node["leaf_value"] = node["leaf_value"][-1]
+
+
+def add_intercept_to_treelite_leafs(
+    tl_json: dict[str, Any], base_score: list[float]
+) -> None:
+    num_trees_per_class = len(tl_json["trees"]) / tl_json["num_class"][0]
+    for tree_index, tree in enumerate(tl_json["trees"]):
+        leaf_add = base_score[tl_json["class_id"][tree_index]] / num_trees_per_class
+        for node in tree["nodes"]:
+            if "leaf_value" in node:
+                node["leaf_value"] += leaf_add
+
+
+def add_empty_tree_to_treelite_json(tl_json: dict[str, Any], class_add: int) -> None:
+    tl_json["class_id"].append(class_add)
+    tl_json["trees"].append(
+        {
+            "num_nodes": 1,
+            "has_categorical_split": False,
+            "nodes": [
+                {
+                    "node_id": 0,
+                    "leaf_value": 0.0,
+                    "data_count": 0,
+                    "sum_hess": 0.0,
+                },
+            ],
+        }
+    )
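The comment inside ``get_gbt_model_from_treelite`` notes that daal4py always sums tree outputs, while some treelite models average them; ``divide_treelite_leaf_values_by_const`` therefore pre-divides every leaf value by the number of trees. A minimal numpy sketch of why that rescaling is equivalent (made-up leaf values, illustration only):

import numpy as np

# Illustrative leaf outputs of three trees for one sample.
leaf_values = np.array([0.9, 0.3, 0.6])

averaged = leaf_values.mean()                      # what an averaging model reports
additive = (leaf_values / len(leaf_values)).sum()  # what daal4py computes after rescaling

assert np.isclose(averaged, additive)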
daal4py/mb/tree_based_builders.py
CHANGED

@@ -37,6 +37,7 @@ from .gbt_convertors import (
     get_catboost_params,
     get_gbt_model_from_catboost,
     get_gbt_model_from_lightgbm,
+    get_gbt_model_from_treelite,
     get_gbt_model_from_xgboost,
     get_lightgbm_params,
     get_xgboost_params,

@@ -63,7 +64,9 @@ def getFPType(X):

 class GBTDAALBaseModel:
     def __init__(self):
-        self.model_type: Optional[
+        self.model_type: Optional[
+            Literal["xgboost", "catboost", "lightgbm", "treelite"]
+        ] = None

     @property
     def _is_regression(self):

@@ -86,6 +89,8 @@ class GBTDAALBaseModel:
         if self.n_classes_ <= 2:
             if objective_fun in ["binary:logistic", "binary:logitraw"]:
                 self.n_classes_ = 2
+            elif self.n_classes_ == 0:
+                self.n_classes_ = 1

         self.n_features_in_ = int(params["learner"]["learner_model_param"]["num_feature"])

@@ -113,6 +118,11 @@ class GBTDAALBaseModel:
         self.daal_model_, self.supports_shap_ = get_gbt_model_from_catboost(booster)
         self._get_params_from_catboost(catboost_params)

+    def _convert_model_from_treelite(self, tl_model):
+        self.daal_model_, self.n_classes_, self.n_features_in_, self.supports_shap_ = (
+            get_gbt_model_from_treelite(tl_model)
+        )
+
     def _convert_model(self, model):
         (submodule_name, class_name) = (
             model.__class__.__module__,

@@ -147,6 +157,14 @@ class GBTDAALBaseModel:
         # Build GBTDAALModel from CatBoost
         elif (submodule_name, class_name) == ("catboost.core", "CatBoost"):
             self._convert_model_from_catboost(model)
+        elif (submodule_name, class_name) == ("treelite.model", "Model"):
+            self._convert_model_from_treelite(model)
+        elif submodule_name.startswith("sklearn.ensemble"):
+            raise TypeError(
+                "Cannot convert scikit-learn models. Try converting to treelite "
+                "with 'treelite.sklearn.import_model' and then converting the "
+                "resulting TreeLite object."
+            )
         else:
             raise TypeError(f"Unknown model format {submodule_name}.{class_name}")

@@ -303,14 +321,21 @@ class GBTDAALModel(GBTDAALBaseModel):

     Can be created from model objects that meet all of the following criteria:

-    - Were produced from one of the following libraries: ``xgboost``, ``lightgbm``,
-      It can work with either the base booster classes
-      scikit-learn-compatible classes.
+    - Were produced from one of the following libraries: ``xgboost``, ``lightgbm``, ``catboost``,
+      or ``treelite`` (with some limitations). It can work with either the base booster classes
+      of those libraries or with their scikit-learn-compatible classes.
     - Do not use categorical features.
     - Are for regression or classification (e.g. no ranking). In the case of XGBoost objective
       ``binary:logitraw``, it will create a classification model out of it, and in the case of
       objective ``reg:logistic``, will create a regression model.
     - Are not multi-output models. Note that multi-class classification **is** supported.
+    - Are not multi-class random forests (multi-class gradient boosters are supported).
+
+    Note that while models from packages such as scikit-learn are not supported directly,
+    they can still be converted to this class by first converting them to TreeLite and
+    then converting to :obj:`GBTDAALModel` from that TreeLite model. In such case, note that
+    models corresponding to random forest binary classifiers will be treated as regressors
+    that predict probabilities.

     Parameters
     ----------

@@ -330,7 +355,7 @@ class GBTDAALModel(GBTDAALBaseModel):

     def __init__(self, model):
         self._convert_model(model)
-        for type_str in ("xgboost", "lightgbm", "catboost"):
+        for type_str in ("xgboost", "lightgbm", "catboost", "treelite"):
             if type_str in str(type(model)):
                 self.model_type = type_str
                 break
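The new ``sklearn.ensemble`` branch above points users at treelite as the conversion path for scikit-learn forests. A hedged sketch of that two-step route follows; the estimator, data, and the ``predict`` call are illustrative, and it assumes ``treelite.sklearn.import_model`` accepts the fitted estimator, as the added error message suggests.

import numpy as np
import treelite.sklearn
from sklearn.ensemble import RandomForestRegressor

import daal4py.mb as mb

# Toy data, illustration only.
X = np.random.rand(200, 4)
y = np.random.rand(200)

rf = RandomForestRegressor(n_estimators=20, max_depth=4).fit(X, y)

# Step 1: scikit-learn -> treelite, using the helper named in the new TypeError message.
tl_model = treelite.sklearn.import_model(rf)

# Step 2: treelite -> daal4py model builder object.
daal_model = mb.convert_model(tl_model)
print(daal_model.predict(X[:5]))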
Binary file
daal4py/sklearn/cluster/dbscan.py
CHANGED

@@ -36,10 +36,10 @@ def _daal_dbscan(X, eps=0.5, min_samples=5, sample_weight=None):
     ww = make2d(sample_weight) if sample_weight is not None else None
     XX = make2d(X)

-    fpt = getFPType(XX)
+    fpt = getFPType(XX)  # codespell:ignore fpt
     alg = daal4py.dbscan(
         method="defaultDense",
-        fptype=fpt,
+        fptype=fpt,  # codespell:ignore fpt
         epsilon=float(eps),
         minObservations=int(min_samples),
         memorySavingMode=False,
daal4py/sklearn/ensemble/_forest.py
CHANGED

@@ -679,8 +679,8 @@ class RandomForestClassifier(RandomForestClassifier_original, RandomForestBase):
         dfc_predictionResult = dfc_algorithm.compute(X, self.daal_model_)

         pred = dfc_predictionResult.probabilities
-
-        return pred
+        # TODO: fix probabilities out of [0, 1] interval on oneDAL side
+        return pred.clip(0.0, 1.0)

     def _daal_fit_classifier(self, X, y, sample_weight=None):
         y = check_array(y, ensure_2d=False, dtype=None)
daal4py/sklearn/linear_model/logistic_path.py
CHANGED

@@ -359,14 +359,16 @@ def __logistic_regression_path(
         y_bin = np.ones(y.shape, dtype=X.dtype)
         # for compute_class_weight

-        if solver
+        if solver == "liblinear" or (
+            not sklearn_check_version("1.6") and solver not in ["lbfgs", "newton-cg"]
+        ):
+            mask_classes = np.array([-1, 1])
+            y_bin[~mask] = -1.0
+        else:
             # HalfBinomialLoss, used for those solvers, represents y in [0, 1] instead
             # of in [-1, 1].
             mask_classes = np.array([0, 1])
             y_bin[~mask] = 0.0
-        else:
-            mask_classes = np.array([-1, 1])
-            y_bin[~mask] = -1.0
     else:
         mask_classes = np.array([-1, 1])
         mask = y == pos_class

@@ -388,7 +390,11 @@ def __logistic_regression_path(

     else:
         if sklearn_check_version("1.1"):
-            if
+            if sklearn_check_version("1.6"):
+                solver_list = ["sag", "saga", "lbfgs", "newton-cg", "newton-cholesky"]
+            else:
+                solver_list = ["sag", "saga", "lbfgs", "newton-cg"]
+            if solver in solver_list:
                 # SAG, lbfgs and newton-cg multinomial solvers need LabelEncoder,
                 # not LabelBinarizer, i.e. y as a 1d-array of integers.
                 # LabelEncoder also saves memory compared to LabelBinarizer, especially

@@ -488,7 +494,11 @@ def __logistic_regression_path(

     if multi_class == "multinomial":
         # fmin_l_bfgs_b and newton-cg accepts only ravelled parameters.
-        if
+        if sklearn_check_version("1.6"):
+            solver_list = ["lbfgs", "newton-cg", "newton-cholesky"]
+        else:
+            solver_list = ["lbfgs", "newton-cg"]
+        if solver in solver_list:
             if _dal_ready and classes.size == 2:
                 w0 = w0[-1:, :]
             if sklearn_check_version("1.1"):

@@ -753,7 +763,11 @@ def __logistic_regression_path(
         else:
             n_classes = max(2, classes.size)
             if sklearn_check_version("1.1"):
-                if
+                if sklearn_check_version("1.6"):
+                    solver_list = ["lbfgs", "newton-cg", "newton-cholesky"]
+                else:
+                    solver_list = ["lbfgs", "newton-cg"]
+                if solver in solver_list:
                     multi_w0 = np.reshape(w0, (n_classes, -1), order="F")
                 else:
                     multi_w0 = w0
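The reshuffled branch in the first hunk decides how binary targets are encoded per solver: liblinear (and, before scikit-learn 1.6, anything other than lbfgs/newton-cg) keeps labels in {-1, +1}, while the HalfBinomialLoss-based solvers expect {0, 1}. A small standalone sketch of the two encodings (made-up labels, not code from the patch):

import numpy as np

y = np.array([0, 2, 2, 0, 2])
pos_class = 2
mask = y == pos_class

# Encoding expected by the HalfBinomialLoss-based solvers (lbfgs, newton-cg, ...): {0, 1}
y_bin_01 = np.ones(y.shape, dtype=np.float64)
y_bin_01[~mask] = 0.0

# Encoding expected by liblinear: {-1, +1}
y_bin_pm1 = np.ones(y.shape, dtype=np.float64)
y_bin_pm1[~mask] = -1.0

print(y_bin_01)   # [0. 1. 1. 0. 1.]
print(y_bin_pm1)  # [-1.  1.  1. -1.  1.]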
daal4py/sklearn/manifold/_t_sne.py
CHANGED

@@ -66,7 +66,13 @@ class TSNE(BaseTSNE):
             [n_samples],
             [P.nnz],
             [self.n_iter_without_progress],
-            [
+            [
+                (
+                    self.max_iter
+                    if sklearn_check_version("1.7")
+                    else (self._max_iter if sklearn_check_version("1.5") else self.n_iter)
+                )
+            ],
         ]

         # Pass params to daal4py backend

@@ -130,7 +136,7 @@ class TSNE(BaseTSNE):

         if isinstance(self._init, str) and self._init == "pca" and issparse(X):
             raise TypeError(
-                "PCA initialization is currently not
+                "PCA initialization is currently not supported "
                 "with the sparse input matrix. Use "
                 'init="random" instead.'
             )
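The first hunk picks the iteration-count attribute based on the installed scikit-learn: ``max_iter`` on 1.7+, the private ``_max_iter`` on 1.5/1.6, and the legacy ``n_iter`` before that. A hedged helper sketch expressing the same fallback chain (the helper name is invented for illustration; ``sklearn_check_version`` is passed in so the snippet stays self-contained):

def resolve_tsne_iterations(tsne, sklearn_check_version):
    # Prefer the public attribute on newer scikit-learn releases, then fall
    # back to the older spellings used by earlier versions.
    if sklearn_check_version("1.7"):
        return tsne.max_iter
    if sklearn_check_version("1.5"):
        return tsne._max_iter
    return tsne.n_iter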
daal4py/sklearn/metrics/_pairwise.py
CHANGED

@@ -18,6 +18,7 @@ import warnings
 from functools import partial

 import numpy as np
+from joblib import effective_n_jobs
 from sklearn.exceptions import DataConversionWarning
 from sklearn.metrics import pairwise_distances as pairwise_distances_original
 from sklearn.metrics.pairwise import (

@@ -28,7 +29,6 @@ from sklearn.metrics.pairwise import (
     _parallel_pairwise,
     check_pairwise_arrays,
 )
-from sklearn.utils._joblib import effective_n_jobs
 from sklearn.utils.validation import check_non_negative

 try:
daal4py/sklearn/svm/svm.py
CHANGED

@@ -158,7 +158,7 @@ def _daal4py_kf(kernel, X_fptype, gamma=1.0, is_sparse=False):
         kf = daal4py.kernel_function_linear(fptype=X_fptype, method=method)
     else:
         raise ValueError(
-            "_daal4py_fit received unexpected kernel
+            "_daal4py_fit received unexpected kernel specification {}.".format(kernel)
         )

     return kf
daal4py/sklearn/utils/validation.py
CHANGED

@@ -72,25 +72,24 @@ def _assert_all_finite(

     # Data with small size has too big relative overhead
     # TODO: tune threshold size
-    if hasattr(X, "size"):
-        if X.size < 32768:
-            if sklearn_check_version("1.1"):
-                _sklearn_assert_all_finite(
-                    X,
-                    allow_nan=allow_nan,
-                    msg_dtype=msg_dtype,
-                    estimator_name=estimator_name,
-                    input_name=input_name,
-                )
-            else:
-                _sklearn_assert_all_finite(X, allow_nan=allow_nan, msg_dtype=msg_dtype)
-            return
-
     is_df = is_DataFrame(X)
+    if not (is_df or isinstance(X, np.ndarray)) or X.size < 32768:
+        if sklearn_check_version("1.1"):
+            _sklearn_assert_all_finite(
+                X,
+                allow_nan=allow_nan,
+                msg_dtype=msg_dtype,
+                estimator_name=estimator_name,
+                input_name=input_name,
+            )
+        else:
+            _sklearn_assert_all_finite(X, allow_nan=allow_nan, msg_dtype=msg_dtype)
+        return
+
     num_of_types = get_number_of_types(X)

     # if X is heterogeneous pandas.DataFrame then
-    #
+    # convert it to a list of arrays
     if is_df and num_of_types > 1:
         lst = []
         for idx in X:

@@ -330,7 +329,7 @@ def _daal_check_array(
     has_pd_integer_array = False
     if hasattr(array, "dtypes") and hasattr(array.dtypes, "__array__"):
         # throw warning if columns are sparse. If all columns are sparse, then
-        # array.sparse exists and sparsity will be
+        # array.sparse exists and sparsity will be preserved (later).
        with suppress(ImportError):
            from pandas import SparseDtype

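The rewritten guard in the first hunk routes any non-ndarray/non-DataFrame input, or anything smaller than 32768 elements, to stock scikit-learn's finiteness check, since the accelerated path only pays off on larger inputs. A simplified, hedged sketch of that dispatch condition (the helper and constant names are illustrative; the real code calls scikit-learn's private ``_assert_all_finite``):

import numpy as np
import pandas as pd

SMALL_INPUT_THRESHOLD = 32768  # element count used by the patched guard

def use_sklearn_fallback(X) -> bool:
    # Fall back for unsupported container types, or for small inputs where
    # the accelerated finiteness check has too much relative overhead.
    is_df = isinstance(X, pd.DataFrame)
    if not (is_df or isinstance(X, np.ndarray)):
        return True
    return X.size < SMALL_INPUT_THRESHOLD

print(use_sklearn_fallback(np.zeros((10, 10))))     # True: only 100 elements
print(use_sklearn_fallback(np.zeros((1000, 100))))  # False: 100000 elements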
onedal/__init__.py
CHANGED

@@ -21,9 +21,34 @@ from daal4py.sklearn._utils import daal_check_version


 class Backend:
-    """Encapsulates the oneDAL pybind11 modules and provides a unified interface to it together with additional properties about dpc/spmd policies"""

     def __init__(self, backend_module, is_dpc, is_spmd):
+        """A unified interface to an available oneDAL pybind11 module.
+
+        This class encapsulates a oneDAL pybind11 module allowing for
+        dynamic access of module objects. This simplifies method and
+        attribute access in sklearnex without aliasing in sys.modules.
+        It contains additional attributes for inspection of the pybind11
+        module type (i.e. dpc or spmd) for use in policy creation.
+
+        Parameters
+        ----------
+        backend_module : oneDAL pybind11 module
+            Pybind11 module to be encapsulated.
+
+        is_dpc : bool
+            Flag describing if the module is Data Parallel C++-enabled.
+
+        is_spmd : bool
+            Flag describing if the module is single program, multiple
+            data enabled.
+
+        Returns
+        -------
+        self : Backend
+            Encapsulated oneDAL pybind11 module.
+        """
+
         self.backend = backend_module
         self.is_dpc = is_dpc
         self.is_spmd = is_spmd
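The docstring added above describes ``Backend`` as a thin wrapper around whichever oneDAL pybind11 module was imported, plus ``is_dpc``/``is_spmd`` flags used for policy creation. A minimal, hedged sketch of that wrapping pattern follows; attribute forwarding via ``__getattr__`` is an illustrative assumption about the mechanism, not copied from the package.

import math  # stands in for the compiled pybind11 extension


class ModuleBackend:
    """Illustrative stand-in for onedal.Backend: wrap an imported module."""

    def __init__(self, backend_module, is_dpc: bool, is_spmd: bool):
        self.backend = backend_module
        self.is_dpc = is_dpc
        self.is_spmd = is_spmd

    def __getattr__(self, name):
        # Forward anything not defined on the wrapper to the wrapped module,
        # so callers can use the wrapper as if it were the module itself.
        return getattr(self.backend, name)


backend = ModuleBackend(math, is_dpc=False, is_spmd=False)
print(backend.sqrt(16.0), backend.is_dpc)  # 4.0 False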
onedal/_config.py
CHANGED

@@ -59,13 +59,14 @@ def _get_onedal_threadlocal_config():


 def _get_config(copy=True):
-    """Retrieve current
-
+    """Retrieve current configuration set by :func:`sklearnex.set_config`
+
     Parameters
     ----------
     copy : bool, default=True
-        If False, a mutable view of the configuration is returned. Each
-        has a separate copy of the configuration.
+        If 'False', a mutable view of the configuration is returned. Each
+        thread has a separate copy of the configuration.
+
     Returns
     -------
     config : dict