scikit-learn-intelex 2024.5.0__py39-none-manylinux1_x86_64.whl → 2024.7.0__py39-none-manylinux1_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of scikit-learn-intelex might be problematic. Click here for more details.
- {scikit_learn_intelex-2024.5.0.dist-info → scikit_learn_intelex-2024.7.0.dist-info}/METADATA +2 -2
- scikit_learn_intelex-2024.7.0.dist-info/RECORD +122 -0
- sklearnex/_config.py +3 -15
- sklearnex/_device_offload.py +9 -168
- sklearnex/basic_statistics/basic_statistics.py +127 -1
- sklearnex/basic_statistics/tests/test_basic_statistics.py +251 -0
- sklearnex/basic_statistics/tests/test_incremental_basic_statistics.py +1 -1
- sklearnex/cluster/dbscan.py +3 -1
- sklearnex/cluster/k_means.py +8 -0
- sklearnex/cluster/tests/test_dbscan.py +8 -6
- sklearnex/cluster/tests/test_kmeans.py +15 -3
- sklearnex/conftest.py +11 -1
- sklearnex/covariance/incremental_covariance.py +64 -13
- sklearnex/covariance/tests/test_incremental_covariance.py +35 -0
- sklearnex/decomposition/pca.py +25 -1
- sklearnex/decomposition/tests/test_pca.py +4 -2
- sklearnex/dispatcher.py +109 -1
- sklearnex/ensemble/_forest.py +121 -57
- sklearnex/ensemble/tests/test_forest.py +7 -0
- sklearnex/glob/dispatcher.py +16 -2
- sklearnex/linear_model/coordinate_descent.py +13 -0
- sklearnex/linear_model/incremental_linear.py +102 -25
- sklearnex/linear_model/linear.py +25 -39
- sklearnex/linear_model/logistic_regression.py +92 -74
- sklearnex/linear_model/ridge.py +7 -0
- sklearnex/linear_model/tests/test_incremental_linear.py +10 -10
- sklearnex/linear_model/tests/test_linear.py +30 -5
- sklearnex/linear_model/tests/test_logreg.py +45 -3
- sklearnex/manifold/t_sne.py +4 -0
- sklearnex/metrics/pairwise.py +5 -0
- sklearnex/metrics/ranking.py +3 -0
- sklearnex/model_selection/split.py +3 -0
- sklearnex/neighbors/_lof.py +9 -0
- sklearnex/neighbors/common.py +45 -1
- sklearnex/neighbors/knn_classification.py +1 -20
- sklearnex/neighbors/knn_regression.py +25 -20
- sklearnex/neighbors/knn_unsupervised.py +31 -7
- sklearnex/preview/__init__.py +1 -1
- sklearnex/preview/decomposition/__init__.py +19 -0
- sklearnex/preview/decomposition/incremental_pca.py +228 -0
- sklearnex/preview/decomposition/tests/test_incremental_pca.py +266 -0
- sklearnex/preview/linear_model/__init__.py +19 -0
- sklearnex/preview/linear_model/ridge.py +419 -0
- sklearnex/preview/linear_model/tests/test_ridge.py +102 -0
- sklearnex/spmd/basic_statistics/tests/test_basic_statistics_spmd.py +107 -0
- sklearnex/spmd/cluster/tests/test_dbscan_spmd.py +97 -0
- sklearnex/spmd/cluster/tests/test_kmeans_spmd.py +172 -0
- sklearnex/spmd/covariance/tests/test_covariance_spmd.py +107 -0
- sklearnex/spmd/decomposition/tests/test_pca_spmd.py +128 -0
- sklearnex/spmd/ensemble/tests/test_forest_spmd.py +265 -0
- sklearnex/spmd/linear_model/tests/test_linear_regression_spmd.py +145 -0
- sklearnex/spmd/linear_model/tests/test_logistic_regression_spmd.py +163 -0
- sklearnex/spmd/neighbors/tests/test_neighbors_spmd.py +288 -0
- sklearnex/svm/_common.py +163 -20
- sklearnex/svm/nusvc.py +40 -4
- sklearnex/svm/nusvr.py +31 -2
- sklearnex/svm/svc.py +40 -4
- sklearnex/svm/svr.py +31 -2
- sklearnex/svm/tests/test_svm.py +12 -20
- sklearnex/tests/_utils.py +185 -30
- sklearnex/tests/_utils_spmd.py +185 -0
- sklearnex/tests/test_common.py +54 -0
- sklearnex/tests/test_config.py +4 -0
- sklearnex/tests/test_memory_usage.py +185 -126
- sklearnex/tests/test_monkeypatch.py +12 -4
- sklearnex/tests/test_patching.py +21 -25
- sklearnex/tests/test_run_to_run_stability.py +295 -0
- sklearnex/utils/_namespace.py +1 -1
- scikit_learn_intelex-2024.5.0.dist-info/RECORD +0 -104
- sklearnex/tests/test_run_to_run_stability_tests.py +0 -428
- {scikit_learn_intelex-2024.5.0.dist-info → scikit_learn_intelex-2024.7.0.dist-info}/LICENSE.txt +0 -0
- {scikit_learn_intelex-2024.5.0.dist-info → scikit_learn_intelex-2024.7.0.dist-info}/WHEEL +0 -0
- {scikit_learn_intelex-2024.5.0.dist-info → scikit_learn_intelex-2024.7.0.dist-info}/top_level.txt +0 -0
sklearnex/svm/svc.py
CHANGED
|
@@ -85,6 +85,17 @@ class SVC(sklearn_SVC, BaseSVC):
|
|
|
85
85
|
def fit(self, X, y, sample_weight=None):
|
|
86
86
|
if sklearn_check_version("1.2"):
|
|
87
87
|
self._validate_params()
|
|
88
|
+
elif self.C <= 0:
|
|
89
|
+
# else if added to correct issues with
|
|
90
|
+
# sklearn tests:
|
|
91
|
+
# svm/tests/test_sparse.py::test_error
|
|
92
|
+
# svm/tests/test_svm.py::test_bad_input
|
|
93
|
+
# for sklearn versions < 1.2 (i.e. without
|
|
94
|
+
# validate_params parameter checking)
|
|
95
|
+
# Without this, a segmentation fault with
|
|
96
|
+
# Windows fatal exception: access violation
|
|
97
|
+
# occurs
|
|
98
|
+
raise ValueError("C <= 0")
|
|
88
99
|
if sklearn_check_version("1.0"):
|
|
89
100
|
self._check_feature_names(X, reset=True)
|
|
90
101
|
dispatch(
|
|
@@ -96,8 +107,9 @@ class SVC(sklearn_SVC, BaseSVC):
|
|
|
96
107
|
},
|
|
97
108
|
X,
|
|
98
109
|
y,
|
|
99
|
-
sample_weight,
|
|
110
|
+
sample_weight=sample_weight,
|
|
100
111
|
)
|
|
112
|
+
|
|
101
113
|
return self
|
|
102
114
|
|
|
103
115
|
@wrap_output_data
|
|
@@ -270,12 +282,30 @@ class SVC(sklearn_SVC, BaseSVC):
|
|
|
270
282
|
return patching_status
|
|
271
283
|
raise RuntimeError(f"Unknown method {method_name} in {class_name}")
|
|
272
284
|
|
|
285
|
+
def _get_sample_weight(self, X, y, sample_weight=None):
|
|
286
|
+
sample_weight = super()._get_sample_weight(X, y, sample_weight)
|
|
287
|
+
if sample_weight is None:
|
|
288
|
+
return sample_weight
|
|
289
|
+
|
|
290
|
+
if np.any(sample_weight <= 0) and len(np.unique(y[sample_weight > 0])) != len(
|
|
291
|
+
self.classes_
|
|
292
|
+
):
|
|
293
|
+
raise ValueError(
|
|
294
|
+
"Invalid input - all samples with positive weights "
|
|
295
|
+
"belong to the same class"
|
|
296
|
+
if sklearn_check_version("1.2")
|
|
297
|
+
else "Invalid input - all samples with positive weights "
|
|
298
|
+
"have the same label."
|
|
299
|
+
)
|
|
300
|
+
return sample_weight
|
|
301
|
+
|
|
273
302
|
def _onedal_fit(self, X, y, sample_weight=None, queue=None):
|
|
303
|
+
X, _, weights = self._onedal_fit_checks(X, y, sample_weight)
|
|
274
304
|
onedal_params = {
|
|
275
305
|
"C": self.C,
|
|
276
306
|
"kernel": self.kernel,
|
|
277
307
|
"degree": self.degree,
|
|
278
|
-
"gamma": self.
|
|
308
|
+
"gamma": self._compute_gamma_sigma(X),
|
|
279
309
|
"coef0": self.coef0,
|
|
280
310
|
"tol": self.tol,
|
|
281
311
|
"shrinking": self.shrinking,
|
|
@@ -287,10 +317,16 @@ class SVC(sklearn_SVC, BaseSVC):
|
|
|
287
317
|
}
|
|
288
318
|
|
|
289
319
|
self._onedal_estimator = onedal_SVC(**onedal_params)
|
|
290
|
-
self._onedal_estimator.fit(X, y,
|
|
320
|
+
self._onedal_estimator.fit(X, y, weights, queue=queue)
|
|
291
321
|
|
|
292
322
|
if self.probability:
|
|
293
|
-
self._fit_proba(
|
|
323
|
+
self._fit_proba(
|
|
324
|
+
X,
|
|
325
|
+
y,
|
|
326
|
+
sample_weight=sample_weight,
|
|
327
|
+
queue=queue,
|
|
328
|
+
)
|
|
329
|
+
|
|
294
330
|
self._save_attributes()
|
|
295
331
|
|
|
296
332
|
def _onedal_predict(self, X, queue=None):
|
sklearnex/svm/svr.py
CHANGED
|
@@ -65,6 +65,17 @@ class SVR(sklearn_SVR, BaseSVR):
|
|
|
65
65
|
def fit(self, X, y, sample_weight=None):
|
|
66
66
|
if sklearn_check_version("1.2"):
|
|
67
67
|
self._validate_params()
|
|
68
|
+
elif self.C <= 0:
|
|
69
|
+
# else if added to correct issues with
|
|
70
|
+
# sklearn tests:
|
|
71
|
+
# svm/tests/test_sparse.py::test_error
|
|
72
|
+
# svm/tests/test_svm.py::test_bad_input
|
|
73
|
+
# for sklearn versions < 1.2 (i.e. without
|
|
74
|
+
# validate_params parameter checking)
|
|
75
|
+
# Without this, a segmentation fault with
|
|
76
|
+
# Windows fatal exception: access violation
|
|
77
|
+
# occurs
|
|
78
|
+
raise ValueError("C <= 0")
|
|
68
79
|
if sklearn_check_version("1.0"):
|
|
69
80
|
self._check_feature_names(X, reset=True)
|
|
70
81
|
dispatch(
|
|
@@ -76,7 +87,7 @@ class SVR(sklearn_SVR, BaseSVR):
|
|
|
76
87
|
},
|
|
77
88
|
X,
|
|
78
89
|
y,
|
|
79
|
-
sample_weight,
|
|
90
|
+
sample_weight=sample_weight,
|
|
80
91
|
)
|
|
81
92
|
|
|
82
93
|
return self
|
|
@@ -95,13 +106,30 @@ class SVR(sklearn_SVR, BaseSVR):
|
|
|
95
106
|
X,
|
|
96
107
|
)
|
|
97
108
|
|
|
109
|
+
@wrap_output_data
|
|
110
|
+
def score(self, X, y, sample_weight=None):
|
|
111
|
+
if sklearn_check_version("1.0"):
|
|
112
|
+
self._check_feature_names(X, reset=False)
|
|
113
|
+
return dispatch(
|
|
114
|
+
self,
|
|
115
|
+
"score",
|
|
116
|
+
{
|
|
117
|
+
"onedal": self.__class__._onedal_score,
|
|
118
|
+
"sklearn": sklearn_SVR.score,
|
|
119
|
+
},
|
|
120
|
+
X,
|
|
121
|
+
y,
|
|
122
|
+
sample_weight=sample_weight,
|
|
123
|
+
)
|
|
124
|
+
|
|
98
125
|
def _onedal_fit(self, X, y, sample_weight=None, queue=None):
|
|
126
|
+
X, _, sample_weight = self._onedal_fit_checks(X, y, sample_weight)
|
|
99
127
|
onedal_params = {
|
|
100
128
|
"C": self.C,
|
|
101
129
|
"epsilon": self.epsilon,
|
|
102
130
|
"kernel": self.kernel,
|
|
103
131
|
"degree": self.degree,
|
|
104
|
-
"gamma": self.
|
|
132
|
+
"gamma": self._compute_gamma_sigma(X),
|
|
105
133
|
"coef0": self.coef0,
|
|
106
134
|
"tol": self.tol,
|
|
107
135
|
"shrinking": self.shrinking,
|
|
@@ -118,3 +146,4 @@ class SVR(sklearn_SVR, BaseSVR):
|
|
|
118
146
|
|
|
119
147
|
fit.__doc__ = sklearn_SVR.fit.__doc__
|
|
120
148
|
predict.__doc__ = sklearn_SVR.predict.__doc__
|
|
149
|
+
score.__doc__ = sklearn_SVR.score.__doc__
|
sklearnex/svm/tests/test_svm.py
CHANGED
|
@@ -25,12 +25,10 @@ from onedal.tests.utils._dataframes_support import (
|
|
|
25
25
|
)
|
|
26
26
|
|
|
27
27
|
|
|
28
|
-
|
|
29
|
-
# investigate failure for `dpnp.ndarrays` and `dpctl.tensors` on `GPU`
|
|
30
|
-
@pytest.mark.parametrize(
|
|
31
|
-
"dataframe,queue", get_dataframes_and_queues(device_filter_="cpu")
|
|
32
|
-
)
|
|
28
|
+
@pytest.mark.parametrize("dataframe,queue", get_dataframes_and_queues())
|
|
33
29
|
def test_sklearnex_import_svc(dataframe, queue):
|
|
30
|
+
if queue and queue.sycl_device.is_gpu:
|
|
31
|
+
pytest.skip("SVC fit for the GPU sycl_queue is buggy.")
|
|
34
32
|
from sklearnex.svm import SVC
|
|
35
33
|
|
|
36
34
|
X = np.array([[-2, -1], [-1, -1], [-1, -2], [+1, +1], [+1, +2], [+2, +1]])
|
|
@@ -43,12 +41,10 @@ def test_sklearnex_import_svc(dataframe, queue):
|
|
|
43
41
|
assert_allclose(_as_numpy(svc.support_), [1, 3])
|
|
44
42
|
|
|
45
43
|
|
|
46
|
-
|
|
47
|
-
# investigate failure for `dpnp.ndarrays` and `dpctl.tensors` on `GPU`
|
|
48
|
-
@pytest.mark.parametrize(
|
|
49
|
-
"dataframe,queue", get_dataframes_and_queues(device_filter_="cpu")
|
|
50
|
-
)
|
|
44
|
+
@pytest.mark.parametrize("dataframe,queue", get_dataframes_and_queues())
|
|
51
45
|
def test_sklearnex_import_nusvc(dataframe, queue):
|
|
46
|
+
if queue and queue.sycl_device.is_gpu:
|
|
47
|
+
pytest.skip("NuSVC fit for the GPU sycl_queue is buggy.")
|
|
52
48
|
from sklearnex.svm import NuSVC
|
|
53
49
|
|
|
54
50
|
X = np.array([[-2, -1], [-1, -1], [-1, -2], [+1, +1], [+1, +2], [+2, +1]])
|
|
@@ -63,12 +59,10 @@ def test_sklearnex_import_nusvc(dataframe, queue):
|
|
|
63
59
|
assert_allclose(_as_numpy(svc.support_), [0, 1, 3, 4])
|
|
64
60
|
|
|
65
61
|
|
|
66
|
-
|
|
67
|
-
# investigate failure for `dpnp.ndarrays` and `dpctl.tensors` on `GPU`
|
|
68
|
-
@pytest.mark.parametrize(
|
|
69
|
-
"dataframe,queue", get_dataframes_and_queues(device_filter_="cpu")
|
|
70
|
-
)
|
|
62
|
+
@pytest.mark.parametrize("dataframe,queue", get_dataframes_and_queues())
|
|
71
63
|
def test_sklearnex_import_svr(dataframe, queue):
|
|
64
|
+
if queue and queue.sycl_device.is_gpu:
|
|
65
|
+
pytest.skip("SVR fit for the GPU sycl_queue is buggy.")
|
|
72
66
|
from sklearnex.svm import SVR
|
|
73
67
|
|
|
74
68
|
X = np.array([[-2, -1], [-1, -1], [-1, -2], [+1, +1], [+1, +2], [+2, +1]])
|
|
@@ -81,12 +75,10 @@ def test_sklearnex_import_svr(dataframe, queue):
|
|
|
81
75
|
assert_allclose(_as_numpy(svc.support_), [1, 3])
|
|
82
76
|
|
|
83
77
|
|
|
84
|
-
|
|
85
|
-
# investigate failure for `dpnp.ndarrays` and `dpctl.tensors` on `GPU`
|
|
86
|
-
@pytest.mark.parametrize(
|
|
87
|
-
"dataframe,queue", get_dataframes_and_queues(device_filter_="cpu")
|
|
88
|
-
)
|
|
78
|
+
@pytest.mark.parametrize("dataframe,queue", get_dataframes_and_queues())
|
|
89
79
|
def test_sklearnex_import_nusvr(dataframe, queue):
|
|
80
|
+
if queue and queue.sycl_device.is_gpu:
|
|
81
|
+
pytest.skip("NuSVR fit for the GPU sycl_queue is buggy.")
|
|
90
82
|
from sklearnex.svm import NuSVR
|
|
91
83
|
|
|
92
84
|
X = np.array([[-2, -1], [-1, -1], [-1, -2], [+1, +1], [+1, +2], [+2, +1]])
|
sklearnex/tests/_utils.py
CHANGED
|
@@ -14,9 +14,11 @@
|
|
|
14
14
|
# limitations under the License.
|
|
15
15
|
# ==============================================================================
|
|
16
16
|
|
|
17
|
-
from
|
|
17
|
+
from functools import partial
|
|
18
|
+
from inspect import getattr_static, isclass, signature
|
|
18
19
|
|
|
19
20
|
import numpy as np
|
|
21
|
+
from scipy import sparse as sp
|
|
20
22
|
from sklearn import clone
|
|
21
23
|
from sklearn.base import (
|
|
22
24
|
BaseEstimator,
|
|
@@ -42,6 +44,22 @@ from sklearnex.svm import SVC, NuSVC
|
|
|
42
44
|
|
|
43
45
|
|
|
44
46
|
def _load_all_models(with_sklearnex=True, estimator=True):
|
|
47
|
+
"""Convert sklearnex patch_map into a dictionary of estimators or functions
|
|
48
|
+
|
|
49
|
+
Parameters
|
|
50
|
+
----------
|
|
51
|
+
with_sklearnex: bool (default=True)
|
|
52
|
+
Discover estimators and methods with sklearnex patching enabled (True)
|
|
53
|
+
or disabled (False) from the sklearnex patch_map
|
|
54
|
+
|
|
55
|
+
estimator: bool (default=True)
|
|
56
|
+
yield estimators (True) or functions (False)
|
|
57
|
+
|
|
58
|
+
Returns
|
|
59
|
+
-------
|
|
60
|
+
dict: {name:estimator}
|
|
61
|
+
estimator is a class or function from sklearn or sklearnex
|
|
62
|
+
"""
|
|
45
63
|
# ensure that patch state is correct as dictated by patch_sklearn boolean
|
|
46
64
|
# and return it to the previous state no matter what occurs.
|
|
47
65
|
already_patched_map = sklearn_is_patched(return_map=True)
|
|
@@ -89,11 +107,17 @@ mixin_map = [
|
|
|
89
107
|
|
|
90
108
|
|
|
91
109
|
class _sklearn_clone_dict(dict):
|
|
110
|
+
"""Special dict type for returning state-free sklearn/sklearnex estimators
|
|
111
|
+
with the same parameters"""
|
|
92
112
|
|
|
93
113
|
def __getitem__(self, key):
|
|
94
114
|
return clone(super().__getitem__(key))
|
|
95
115
|
|
|
96
116
|
|
|
117
|
+
# Special dictionary of sklearnex estimators which must be specifically tested, this
|
|
118
|
+
# could be because of supported non-default parameters, blocked support via sklearn's
|
|
119
|
+
# 'available_if' decorator, or not being a native sklearn estimator (i.e. those not in
|
|
120
|
+
# the default PATCHED_MODELS dictionary)
|
|
97
121
|
SPECIAL_INSTANCES = _sklearn_clone_dict(
|
|
98
122
|
{
|
|
99
123
|
str(i): i
|
|
@@ -110,27 +134,58 @@ SPECIAL_INSTANCES = _sklearn_clone_dict(
|
|
|
110
134
|
)
|
|
111
135
|
|
|
112
136
|
|
|
113
|
-
def gen_models_info(algorithms):
|
|
137
|
+
def gen_models_info(algorithms, required_inputs=["X", "y"]):
|
|
138
|
+
"""Generate estimator-attribute pairs for pytest test collection.
|
|
139
|
+
|
|
140
|
+
Parameters
|
|
141
|
+
----------
|
|
142
|
+
algorithms : iterable (list, tuple, 1D array-like object)
|
|
143
|
+
Iterable of valid sklearnex estimators or keys from PATCHED_MODELS
|
|
144
|
+
|
|
145
|
+
required_inputs : list, tuple of strings or None
|
|
146
|
+
list of required args/kwargs for callable attribute (only non-private,
|
|
147
|
+
non-BaseEstimator attributes). At least one must be present; None
|
|
148
|
+
signifies taking all non-private attributes, callable or not.
|
|
149
|
+
|
|
150
|
+
Returns
|
|
151
|
+
-------
|
|
152
|
+
list of 2-element tuples: (estimator, string)
|
|
153
|
+
Returns a list of valid methods or attributes without "fit"
|
|
154
|
+
"""
|
|
114
155
|
output = []
|
|
115
|
-
for
|
|
156
|
+
for estimator in algorithms:
|
|
116
157
|
|
|
117
|
-
if
|
|
118
|
-
est = PATCHED_MODELS[
|
|
119
|
-
elif
|
|
120
|
-
est =
|
|
158
|
+
if estimator in PATCHED_MODELS:
|
|
159
|
+
est = PATCHED_MODELS[estimator]
|
|
160
|
+
elif isinstance(algorithms[estimator], BaseEstimator):
|
|
161
|
+
est = algorithms[estimator].__class__
|
|
121
162
|
else:
|
|
122
|
-
raise KeyError(f"Unrecognized sklearnex estimator: {
|
|
163
|
+
raise KeyError(f"Unrecognized sklearnex estimator: {estimator}")
|
|
123
164
|
|
|
124
|
-
methods
|
|
125
|
-
candidates = set(
|
|
126
|
-
|
|
127
|
-
)
|
|
165
|
+
# remove BaseEstimator methods (get_params, set_params)
|
|
166
|
+
candidates = set(dir(est)) - set(dir(BaseEstimator))
|
|
167
|
+
# remove private methods
|
|
168
|
+
candidates = set([attr for attr in candidates if not attr.startswith("_")])
|
|
169
|
+
# required to enable other methods
|
|
170
|
+
candidates = candidates - {"fit"}
|
|
128
171
|
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
172
|
+
# allow only callable methods with any of the required inputs
|
|
173
|
+
if required_inputs:
|
|
174
|
+
methods = []
|
|
175
|
+
for attr in candidates:
|
|
176
|
+
attribute = getattr_static(est, attr)
|
|
177
|
+
if callable(attribute):
|
|
178
|
+
params = signature(attribute).parameters
|
|
179
|
+
if any([inp in params for inp in required_inputs]):
|
|
180
|
+
methods += [attr]
|
|
181
|
+
else:
|
|
182
|
+
methods = candidates
|
|
132
183
|
|
|
133
|
-
output +=
|
|
184
|
+
output += (
|
|
185
|
+
[(estimator, method) for method in methods]
|
|
186
|
+
if methods
|
|
187
|
+
else [(estimator, None)]
|
|
188
|
+
)
|
|
134
189
|
|
|
135
190
|
# In the case that no methods are available, set method to None.
|
|
136
191
|
# This will allow estimators without mixins to still test the fit
|
|
@@ -138,24 +193,124 @@ def gen_models_info(algorithms):
|
|
|
138
193
|
return output
|
|
139
194
|
|
|
140
195
|
|
|
141
|
-
def
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
196
|
+
def call_method(estimator, method, X, y, **kwargs):
|
|
197
|
+
"""Generalized interface to call most sklearn estimator methods
|
|
198
|
+
|
|
199
|
+
Parameters
|
|
200
|
+
----------
|
|
201
|
+
estimator : sklearn or sklearnex estimator instance
|
|
202
|
+
|
|
203
|
+
method: string
|
|
204
|
+
Valid callable method to estimator
|
|
205
|
+
|
|
206
|
+
X: array-like
|
|
207
|
+
data
|
|
208
|
+
|
|
209
|
+
y: array-like (for 'score', 'partial_fit', and 'path')
|
|
210
|
+
X-dependent data
|
|
211
|
+
|
|
212
|
+
**kwargs: keyword dict
|
|
213
|
+
keyword arguments to estimator.method
|
|
214
|
+
|
|
215
|
+
Returns
|
|
216
|
+
-------
|
|
217
|
+
return value from estimator.method
|
|
218
|
+
"""
|
|
219
|
+
# useful for repository wide testing
|
|
220
|
+
if method == "inverse_transform":
|
|
221
|
+
# PCA's inverse_transform takes (n_samples, n_components)
|
|
222
|
+
data = (
|
|
223
|
+
(X[:, : estimator.n_components_],)
|
|
224
|
+
if X.shape[1] != estimator.n_components_
|
|
225
|
+
else (X,)
|
|
226
|
+
)
|
|
227
|
+
elif method not in ["score", "partial_fit", "path"]:
|
|
228
|
+
data = (X,)
|
|
229
|
+
else:
|
|
230
|
+
data = (X, y)
|
|
231
|
+
return getattr(estimator, method)(*data, **kwargs)
|
|
232
|
+
|
|
233
|
+
|
|
234
|
+
def _gen_dataset_type(est):
|
|
235
|
+
# est should be an estimator or estimator class
|
|
236
|
+
# dataset initialized to classification, but will be swapped
|
|
237
|
+
# for other types as necessary. Private method.
|
|
238
|
+
dataset = "classification"
|
|
239
|
+
estimator = est.__class__ if isinstance(est, BaseEstimator) else est
|
|
240
|
+
|
|
145
241
|
for mixin, _, data in mixin_map:
|
|
146
|
-
if issubclass(
|
|
242
|
+
if issubclass(estimator, mixin) and data is not None:
|
|
147
243
|
dataset = data
|
|
244
|
+
return dataset
|
|
245
|
+
|
|
246
|
+
|
|
247
|
+
_dataset_dict = {
|
|
248
|
+
"classification": [partial(load_iris, return_X_y=True)],
|
|
249
|
+
"regression": [partial(load_diabetes, return_X_y=True)],
|
|
250
|
+
}
|
|
251
|
+
|
|
252
|
+
|
|
253
|
+
def gen_dataset(
|
|
254
|
+
est,
|
|
255
|
+
datasets=_dataset_dict,
|
|
256
|
+
sparse=False,
|
|
257
|
+
queue=None,
|
|
258
|
+
target_df=None,
|
|
259
|
+
dtype=None,
|
|
260
|
+
):
|
|
261
|
+
"""Generate dataset for pytest testing.
|
|
262
|
+
|
|
263
|
+
Parameters
|
|
264
|
+
----------
|
|
265
|
+
est : sklearn or sklearnex estimator class
|
|
266
|
+
Must inherit an sklearn Mixin or sklearn's BaseEstimator
|
|
267
|
+
|
|
268
|
+
datasets: dataset dict
|
|
269
|
+
Dictionary with keys "classification" and/or "regression"
|
|
270
|
+
Value must be a list of objects which yield X, y array
|
|
271
|
+
objects when called, ideally using a lambda or
|
|
272
|
+
functools.partial.
|
|
273
|
+
|
|
274
|
+
sparse: bool (default False)
|
|
275
|
+
Convert X data to a scipy.sparse csr_matrix format.
|
|
276
|
+
|
|
277
|
+
queue: SYCL queue or None
|
|
278
|
+
Queue necessary for device offloading following the
|
|
279
|
+
SYCL 2020 standard, usually generated by dpctl.
|
|
280
|
+
|
|
281
|
+
target_df: string or None
|
|
282
|
+
dataframe type for returned dataset, as dictated by
|
|
283
|
+
onedal's _convert_to_dataframe.
|
|
284
|
+
|
|
285
|
+
dtype: numpy dtype or None
|
|
286
|
+
target datatype for returned datasets (see DTYPES).
|
|
287
|
+
|
|
288
|
+
Returns
|
|
289
|
+
-------
|
|
290
|
+
list of 2-element list X,y: (array-like, array-like)
|
|
291
|
+
list of datasets for analysis
|
|
292
|
+
"""
|
|
293
|
+
dataset_type = _gen_dataset_type(est)
|
|
294
|
+
output = []
|
|
148
295
|
# load data
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
X, y =
|
|
153
|
-
|
|
154
|
-
|
|
296
|
+
flag = dtype is None
|
|
297
|
+
|
|
298
|
+
for func in datasets[dataset_type]:
|
|
299
|
+
X, y = func()
|
|
300
|
+
if flag:
|
|
301
|
+
dtype = X.dtype if hasattr(X, "dtype") else np.float64
|
|
155
302
|
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
303
|
+
if sparse:
|
|
304
|
+
X = sp.csr_matrix(X)
|
|
305
|
+
else:
|
|
306
|
+
X = _convert_to_dataframe(
|
|
307
|
+
X, sycl_queue=queue, target_df=target_df, dtype=dtype
|
|
308
|
+
)
|
|
309
|
+
y = _convert_to_dataframe(
|
|
310
|
+
y, sycl_queue=queue, target_df=target_df, dtype=dtype
|
|
311
|
+
)
|
|
312
|
+
output += [[X, y]]
|
|
313
|
+
return output
|
|
159
314
|
|
|
160
315
|
|
|
161
316
|
DTYPES = [
|
|
@@ -0,0 +1,185 @@
|
|
|
1
|
+
# ==============================================================================
|
|
2
|
+
# Copyright 2024 Intel Corporation
|
|
3
|
+
#
|
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
# you may not use this file except in compliance with the License.
|
|
6
|
+
# You may obtain a copy of the License at
|
|
7
|
+
#
|
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
#
|
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
# See the License for the specific language governing permissions and
|
|
14
|
+
# limitations under the License.
|
|
15
|
+
# ==============================================================================
|
|
16
|
+
|
|
17
|
+
import numpy as np
|
|
18
|
+
from numpy.testing import assert_allclose
|
|
19
|
+
from sklearn.datasets import make_blobs, make_classification, make_regression
|
|
20
|
+
from sklearn.model_selection import train_test_split
|
|
21
|
+
|
|
22
|
+
from onedal.tests.utils._dataframes_support import _as_numpy
|
|
23
|
+
|
|
24
|
+
try:
|
|
25
|
+
import dpctl
|
|
26
|
+
from dpctl import SyclQueue
|
|
27
|
+
from mpi4py import MPI
|
|
28
|
+
|
|
29
|
+
mpi_libs_available = True
|
|
30
|
+
gpu_is_available = dpctl.has_gpu_devices()
|
|
31
|
+
except (ImportError, ModuleNotFoundError):
|
|
32
|
+
mpi_libs_available = False
|
|
33
|
+
|
|
34
|
+
_mpi_libs_and_gpu_available = mpi_libs_available and gpu_is_available
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def _get_local_tensor(full_data):
|
|
38
|
+
"""Splits data across ranks.
|
|
39
|
+
|
|
40
|
+
Called on each rank to extract the subset of data assigned to that rank.
|
|
41
|
+
|
|
42
|
+
Args:
|
|
43
|
+
full_data (numpy or dpctl array): The entire set of data
|
|
44
|
+
|
|
45
|
+
Returns:
|
|
46
|
+
local_data (numpy or dpctl array): The subset of data used by the rank
|
|
47
|
+
"""
|
|
48
|
+
|
|
49
|
+
# create sycl queue and gather communicator details
|
|
50
|
+
q = SyclQueue("gpu")
|
|
51
|
+
comm = MPI.COMM_WORLD
|
|
52
|
+
rank = comm.Get_rank()
|
|
53
|
+
size = comm.Get_size()
|
|
54
|
+
|
|
55
|
+
# divide data across ranks and move to dpt tensor
|
|
56
|
+
data_rows = full_data.shape[0]
|
|
57
|
+
local_start = rank * data_rows // size
|
|
58
|
+
local_end = (1 + rank) * data_rows // size
|
|
59
|
+
local_data = full_data[local_start:local_end]
|
|
60
|
+
|
|
61
|
+
return local_data
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def _generate_regression_data(n_samples, n_features, dtype=np.float64, random_state=42):
|
|
65
|
+
# Generates regression data and divides between train and test
|
|
66
|
+
X, y = make_regression(
|
|
67
|
+
n_samples=n_samples, n_features=n_features, random_state=random_state
|
|
68
|
+
)
|
|
69
|
+
X = X.astype(dtype)
|
|
70
|
+
y = y.astype(dtype)
|
|
71
|
+
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state)
|
|
72
|
+
return X_train, X_test, y_train, y_test
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def _generate_classification_data(
|
|
76
|
+
n_samples, n_features, n_classes=2, dtype=np.float64, random_state=42
|
|
77
|
+
):
|
|
78
|
+
# Generates classification data and divides between train and test
|
|
79
|
+
X, y = make_classification(
|
|
80
|
+
n_samples=n_samples,
|
|
81
|
+
n_features=n_features,
|
|
82
|
+
n_classes=n_classes,
|
|
83
|
+
n_informative=int(0.5 * n_classes + 1),
|
|
84
|
+
random_state=random_state,
|
|
85
|
+
)
|
|
86
|
+
X = X.astype(dtype)
|
|
87
|
+
y = y.astype(dtype)
|
|
88
|
+
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state)
|
|
89
|
+
return X_train, X_test, y_train, y_test
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def _generate_statistic_data(n_samples, n_features, dtype=np.float64, random_state=42):
|
|
93
|
+
# Generates statistical data
|
|
94
|
+
gen = np.random.default_rng(random_state)
|
|
95
|
+
data = gen.uniform(low=-0.3, high=+0.7, size=(n_samples, n_features)).astype(dtype)
|
|
96
|
+
return data
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def _generate_clustering_data(
|
|
100
|
+
n_samples, n_features, centers=None, dtype=np.float64, random_state=42
|
|
101
|
+
):
|
|
102
|
+
# Generates clustering data and divides between train and test
|
|
103
|
+
X, _ = make_blobs(
|
|
104
|
+
n_samples=n_samples,
|
|
105
|
+
centers=centers,
|
|
106
|
+
n_features=n_features,
|
|
107
|
+
random_state=random_state,
|
|
108
|
+
)
|
|
109
|
+
X = X.astype(dtype)
|
|
110
|
+
X_train, X_test = train_test_split(X, random_state=random_state)
|
|
111
|
+
return X_train, X_test
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def _spmd_assert_allclose(spmd_result, batch_result, **kwargs):
|
|
115
|
+
"""Calls assert_allclose on spmd and batch results.
|
|
116
|
+
|
|
117
|
+
Called on each rank to compare the spmd result specific to that rank and
|
|
118
|
+
subset of batch result that corresponds to that rank.
|
|
119
|
+
|
|
120
|
+
Args:
|
|
121
|
+
spmd_result (numpy or dpctl array): The result for the subset of data on the rank the function is called from, computed by the spmd estimator
|
|
122
|
+
batch_result (numpy array): The result for all data, computed by the batch estimator
|
|
123
|
+
|
|
124
|
+
Raises:
|
|
125
|
+
AssertionError: If all results are not adequately close.
|
|
126
|
+
"""
|
|
127
|
+
|
|
128
|
+
# extract chunk from batch result to match with local spmd result
|
|
129
|
+
local_batch_result = _get_local_tensor(batch_result)
|
|
130
|
+
|
|
131
|
+
assert_allclose(_as_numpy(spmd_result), _as_numpy(local_batch_result), **kwargs)
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
def _assert_unordered_allclose(spmd_result, batch_result, localize=False, **kwargs):
|
|
135
|
+
"""Checks if rows in spmd and batch results are aligned, even if not in the same order.
|
|
136
|
+
|
|
137
|
+
Called to verify correct unordered results are present. Useful to check KMeans centers
|
|
138
|
+
or KNN neighbors, where order does not matter. Sorts inputs to handle unordering. Also
|
|
139
|
+
capable of handling localization.
|
|
140
|
+
|
|
141
|
+
Args:
|
|
142
|
+
spmd_result (numpy or dpctl array): Result computed by the spmd estimator
|
|
143
|
+
batch_result (numpy array): Result computed by batch estimator
|
|
144
|
+
localize (bool): Whether or not spmd result is specific to the rank, in which case batch result needs to be localized
|
|
145
|
+
|
|
146
|
+
Raises:
|
|
147
|
+
AssertionError: If results do not match.
|
|
148
|
+
"""
|
|
149
|
+
|
|
150
|
+
sorted_spmd_result = spmd_result[np.argsort(np.linalg.norm(spmd_result, axis=1))]
|
|
151
|
+
if localize:
|
|
152
|
+
local_batch_result = _get_local_tensor(batch_result)
|
|
153
|
+
sorted_batch_result = local_batch_result[
|
|
154
|
+
np.argsort(np.linalg.norm(local_batch_result, axis=1))
|
|
155
|
+
]
|
|
156
|
+
else:
|
|
157
|
+
sorted_batch_result = batch_result[
|
|
158
|
+
np.argsort(np.linalg.norm(batch_result, axis=1))
|
|
159
|
+
]
|
|
160
|
+
|
|
161
|
+
assert_allclose(_as_numpy(sorted_spmd_result), sorted_batch_result, **kwargs)
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
def _assert_kmeans_labels_allclose(
|
|
165
|
+
spmd_labels, batch_labels, spmd_centers, batch_centers, **kwargs
|
|
166
|
+
):
|
|
167
|
+
"""Checks if labels for spmd and batch results are aligned, even cluster indices don't match.
|
|
168
|
+
|
|
169
|
+
Called to verify labels are assigned the same way on spmd and batch. Uses raw labels (which
|
|
170
|
+
may not match) to identify cluster center and ensure results match.
|
|
171
|
+
|
|
172
|
+
Args:
|
|
173
|
+
spmd_labels (numpy or dpctl array): The labels for the subset of data on the rank the function is called from, computed by the spmd estimator
|
|
174
|
+
batch_labels (numpy array): The labels for all data, computed by the batch estimator
|
|
175
|
+
spmd_centers (numpy or dpctl array): Centers computed by the spmd estimator
|
|
176
|
+
batch_centers (numpy array): Centers computed by batch estimator
|
|
177
|
+
|
|
178
|
+
Raises:
|
|
179
|
+
AssertionError: If clusters are not correctly assigned.
|
|
180
|
+
"""
|
|
181
|
+
|
|
182
|
+
local_batch_labels = _get_local_tensor(batch_labels)
|
|
183
|
+
assert_allclose(
|
|
184
|
+
spmd_centers[_as_numpy(spmd_labels)], batch_centers[local_batch_labels], **kwargs
|
|
185
|
+
)
|