scikit-learn-intelex 2024.5.0__py311-none-manylinux1_x86_64.whl → 2024.7.0__py311-none-manylinux1_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of scikit-learn-intelex might be problematic. Click here for more details.

Files changed (73) hide show
  1. {scikit_learn_intelex-2024.5.0.dist-info → scikit_learn_intelex-2024.7.0.dist-info}/METADATA +2 -2
  2. scikit_learn_intelex-2024.7.0.dist-info/RECORD +122 -0
  3. sklearnex/_config.py +3 -15
  4. sklearnex/_device_offload.py +9 -168
  5. sklearnex/basic_statistics/basic_statistics.py +127 -1
  6. sklearnex/basic_statistics/tests/test_basic_statistics.py +251 -0
  7. sklearnex/basic_statistics/tests/test_incremental_basic_statistics.py +1 -1
  8. sklearnex/cluster/dbscan.py +3 -1
  9. sklearnex/cluster/k_means.py +8 -0
  10. sklearnex/cluster/tests/test_dbscan.py +8 -6
  11. sklearnex/cluster/tests/test_kmeans.py +15 -3
  12. sklearnex/conftest.py +11 -1
  13. sklearnex/covariance/incremental_covariance.py +64 -13
  14. sklearnex/covariance/tests/test_incremental_covariance.py +35 -0
  15. sklearnex/decomposition/pca.py +25 -1
  16. sklearnex/decomposition/tests/test_pca.py +4 -2
  17. sklearnex/dispatcher.py +109 -1
  18. sklearnex/ensemble/_forest.py +121 -57
  19. sklearnex/ensemble/tests/test_forest.py +7 -0
  20. sklearnex/glob/dispatcher.py +16 -2
  21. sklearnex/linear_model/coordinate_descent.py +13 -0
  22. sklearnex/linear_model/incremental_linear.py +102 -25
  23. sklearnex/linear_model/linear.py +25 -39
  24. sklearnex/linear_model/logistic_regression.py +92 -74
  25. sklearnex/linear_model/ridge.py +7 -0
  26. sklearnex/linear_model/tests/test_incremental_linear.py +10 -10
  27. sklearnex/linear_model/tests/test_linear.py +30 -5
  28. sklearnex/linear_model/tests/test_logreg.py +45 -3
  29. sklearnex/manifold/t_sne.py +4 -0
  30. sklearnex/metrics/pairwise.py +5 -0
  31. sklearnex/metrics/ranking.py +3 -0
  32. sklearnex/model_selection/split.py +3 -0
  33. sklearnex/neighbors/_lof.py +9 -0
  34. sklearnex/neighbors/common.py +45 -1
  35. sklearnex/neighbors/knn_classification.py +1 -20
  36. sklearnex/neighbors/knn_regression.py +25 -20
  37. sklearnex/neighbors/knn_unsupervised.py +31 -7
  38. sklearnex/preview/__init__.py +1 -1
  39. sklearnex/preview/decomposition/__init__.py +19 -0
  40. sklearnex/preview/decomposition/incremental_pca.py +228 -0
  41. sklearnex/preview/decomposition/tests/test_incremental_pca.py +266 -0
  42. sklearnex/preview/linear_model/__init__.py +19 -0
  43. sklearnex/preview/linear_model/ridge.py +419 -0
  44. sklearnex/preview/linear_model/tests/test_ridge.py +102 -0
  45. sklearnex/spmd/basic_statistics/tests/test_basic_statistics_spmd.py +107 -0
  46. sklearnex/spmd/cluster/tests/test_dbscan_spmd.py +97 -0
  47. sklearnex/spmd/cluster/tests/test_kmeans_spmd.py +172 -0
  48. sklearnex/spmd/covariance/tests/test_covariance_spmd.py +107 -0
  49. sklearnex/spmd/decomposition/tests/test_pca_spmd.py +128 -0
  50. sklearnex/spmd/ensemble/tests/test_forest_spmd.py +265 -0
  51. sklearnex/spmd/linear_model/tests/test_linear_regression_spmd.py +145 -0
  52. sklearnex/spmd/linear_model/tests/test_logistic_regression_spmd.py +163 -0
  53. sklearnex/spmd/neighbors/tests/test_neighbors_spmd.py +288 -0
  54. sklearnex/svm/_common.py +163 -20
  55. sklearnex/svm/nusvc.py +40 -4
  56. sklearnex/svm/nusvr.py +31 -2
  57. sklearnex/svm/svc.py +40 -4
  58. sklearnex/svm/svr.py +31 -2
  59. sklearnex/svm/tests/test_svm.py +12 -20
  60. sklearnex/tests/_utils.py +185 -30
  61. sklearnex/tests/_utils_spmd.py +185 -0
  62. sklearnex/tests/test_common.py +54 -0
  63. sklearnex/tests/test_config.py +4 -0
  64. sklearnex/tests/test_memory_usage.py +185 -126
  65. sklearnex/tests/test_monkeypatch.py +12 -4
  66. sklearnex/tests/test_patching.py +21 -25
  67. sklearnex/tests/test_run_to_run_stability.py +295 -0
  68. sklearnex/utils/_namespace.py +1 -1
  69. scikit_learn_intelex-2024.5.0.dist-info/RECORD +0 -104
  70. sklearnex/tests/test_run_to_run_stability_tests.py +0 -428
  71. {scikit_learn_intelex-2024.5.0.dist-info → scikit_learn_intelex-2024.7.0.dist-info}/LICENSE.txt +0 -0
  72. {scikit_learn_intelex-2024.5.0.dist-info → scikit_learn_intelex-2024.7.0.dist-info}/WHEEL +0 -0
  73. {scikit_learn_intelex-2024.5.0.dist-info → scikit_learn_intelex-2024.7.0.dist-info}/top_level.txt +0 -0
sklearnex/svm/svc.py CHANGED
@@ -85,6 +85,17 @@ class SVC(sklearn_SVC, BaseSVC):
85
85
  def fit(self, X, y, sample_weight=None):
86
86
  if sklearn_check_version("1.2"):
87
87
  self._validate_params()
88
+ elif self.C <= 0:
89
+ # else if added to correct issues with
90
+ # sklearn tests:
91
+ # svm/tests/test_sparse.py::test_error
92
+ # svm/tests/test_svm.py::test_bad_input
93
+ # for sklearn versions < 1.2 (i.e. without
94
+ # validate_params parameter checking)
95
+ # Without this, a segmentation fault with
96
+ # Windows fatal exception: access violation
97
+ # occurs
98
+ raise ValueError("C <= 0")
88
99
  if sklearn_check_version("1.0"):
89
100
  self._check_feature_names(X, reset=True)
90
101
  dispatch(
@@ -96,8 +107,9 @@ class SVC(sklearn_SVC, BaseSVC):
96
107
  },
97
108
  X,
98
109
  y,
99
- sample_weight,
110
+ sample_weight=sample_weight,
100
111
  )
112
+
101
113
  return self
102
114
 
103
115
  @wrap_output_data
@@ -270,12 +282,30 @@ class SVC(sklearn_SVC, BaseSVC):
270
282
  return patching_status
271
283
  raise RuntimeError(f"Unknown method {method_name} in {class_name}")
272
284
 
285
+ def _get_sample_weight(self, X, y, sample_weight=None):
286
+ sample_weight = super()._get_sample_weight(X, y, sample_weight)
287
+ if sample_weight is None:
288
+ return sample_weight
289
+
290
+ if np.any(sample_weight <= 0) and len(np.unique(y[sample_weight > 0])) != len(
291
+ self.classes_
292
+ ):
293
+ raise ValueError(
294
+ "Invalid input - all samples with positive weights "
295
+ "belong to the same class"
296
+ if sklearn_check_version("1.2")
297
+ else "Invalid input - all samples with positive weights "
298
+ "have the same label."
299
+ )
300
+ return sample_weight
301
+
273
302
  def _onedal_fit(self, X, y, sample_weight=None, queue=None):
303
+ X, _, weights = self._onedal_fit_checks(X, y, sample_weight)
274
304
  onedal_params = {
275
305
  "C": self.C,
276
306
  "kernel": self.kernel,
277
307
  "degree": self.degree,
278
- "gamma": self.gamma,
308
+ "gamma": self._compute_gamma_sigma(X),
279
309
  "coef0": self.coef0,
280
310
  "tol": self.tol,
281
311
  "shrinking": self.shrinking,
@@ -287,10 +317,16 @@ class SVC(sklearn_SVC, BaseSVC):
287
317
  }
288
318
 
289
319
  self._onedal_estimator = onedal_SVC(**onedal_params)
290
- self._onedal_estimator.fit(X, y, sample_weight, queue=queue)
320
+ self._onedal_estimator.fit(X, y, weights, queue=queue)
291
321
 
292
322
  if self.probability:
293
- self._fit_proba(X, y, sample_weight, queue=queue)
323
+ self._fit_proba(
324
+ X,
325
+ y,
326
+ sample_weight=sample_weight,
327
+ queue=queue,
328
+ )
329
+
294
330
  self._save_attributes()
295
331
 
296
332
  def _onedal_predict(self, X, queue=None):
sklearnex/svm/svr.py CHANGED
@@ -65,6 +65,17 @@ class SVR(sklearn_SVR, BaseSVR):
65
65
  def fit(self, X, y, sample_weight=None):
66
66
  if sklearn_check_version("1.2"):
67
67
  self._validate_params()
68
+ elif self.C <= 0:
69
+ # else if added to correct issues with
70
+ # sklearn tests:
71
+ # svm/tests/test_sparse.py::test_error
72
+ # svm/tests/test_svm.py::test_bad_input
73
+ # for sklearn versions < 1.2 (i.e. without
74
+ # validate_params parameter checking)
75
+ # Without this, a segmentation fault with
76
+ # Windows fatal exception: access violation
77
+ # occurs
78
+ raise ValueError("C <= 0")
68
79
  if sklearn_check_version("1.0"):
69
80
  self._check_feature_names(X, reset=True)
70
81
  dispatch(
@@ -76,7 +87,7 @@ class SVR(sklearn_SVR, BaseSVR):
76
87
  },
77
88
  X,
78
89
  y,
79
- sample_weight,
90
+ sample_weight=sample_weight,
80
91
  )
81
92
 
82
93
  return self
@@ -95,13 +106,30 @@ class SVR(sklearn_SVR, BaseSVR):
95
106
  X,
96
107
  )
97
108
 
109
+ @wrap_output_data
110
+ def score(self, X, y, sample_weight=None):
111
+ if sklearn_check_version("1.0"):
112
+ self._check_feature_names(X, reset=False)
113
+ return dispatch(
114
+ self,
115
+ "score",
116
+ {
117
+ "onedal": self.__class__._onedal_score,
118
+ "sklearn": sklearn_SVR.score,
119
+ },
120
+ X,
121
+ y,
122
+ sample_weight=sample_weight,
123
+ )
124
+
98
125
  def _onedal_fit(self, X, y, sample_weight=None, queue=None):
126
+ X, _, sample_weight = self._onedal_fit_checks(X, y, sample_weight)
99
127
  onedal_params = {
100
128
  "C": self.C,
101
129
  "epsilon": self.epsilon,
102
130
  "kernel": self.kernel,
103
131
  "degree": self.degree,
104
- "gamma": self.gamma,
132
+ "gamma": self._compute_gamma_sigma(X),
105
133
  "coef0": self.coef0,
106
134
  "tol": self.tol,
107
135
  "shrinking": self.shrinking,
@@ -118,3 +146,4 @@ class SVR(sklearn_SVR, BaseSVR):
118
146
 
119
147
  fit.__doc__ = sklearn_SVR.fit.__doc__
120
148
  predict.__doc__ = sklearn_SVR.predict.__doc__
149
+ score.__doc__ = sklearn_SVR.score.__doc__
@@ -25,12 +25,10 @@ from onedal.tests.utils._dataframes_support import (
25
25
  )
26
26
 
27
27
 
28
- # TODO:
29
- # investigate failure for `dpnp.ndarrays` and `dpctl.tensors` on `GPU`
30
- @pytest.mark.parametrize(
31
- "dataframe,queue", get_dataframes_and_queues(device_filter_="cpu")
32
- )
28
+ @pytest.mark.parametrize("dataframe,queue", get_dataframes_and_queues())
33
29
  def test_sklearnex_import_svc(dataframe, queue):
30
+ if queue and queue.sycl_device.is_gpu:
31
+ pytest.skip("SVC fit for the GPU sycl_queue is buggy.")
34
32
  from sklearnex.svm import SVC
35
33
 
36
34
  X = np.array([[-2, -1], [-1, -1], [-1, -2], [+1, +1], [+1, +2], [+2, +1]])
@@ -43,12 +41,10 @@ def test_sklearnex_import_svc(dataframe, queue):
43
41
  assert_allclose(_as_numpy(svc.support_), [1, 3])
44
42
 
45
43
 
46
- # TODO:
47
- # investigate failure for `dpnp.ndarrays` and `dpctl.tensors` on `GPU`
48
- @pytest.mark.parametrize(
49
- "dataframe,queue", get_dataframes_and_queues(device_filter_="cpu")
50
- )
44
+ @pytest.mark.parametrize("dataframe,queue", get_dataframes_and_queues())
51
45
  def test_sklearnex_import_nusvc(dataframe, queue):
46
+ if queue and queue.sycl_device.is_gpu:
47
+ pytest.skip("NuSVC fit for the GPU sycl_queue is buggy.")
52
48
  from sklearnex.svm import NuSVC
53
49
 
54
50
  X = np.array([[-2, -1], [-1, -1], [-1, -2], [+1, +1], [+1, +2], [+2, +1]])
@@ -63,12 +59,10 @@ def test_sklearnex_import_nusvc(dataframe, queue):
63
59
  assert_allclose(_as_numpy(svc.support_), [0, 1, 3, 4])
64
60
 
65
61
 
66
- # TODO:
67
- # investigate failure for `dpnp.ndarrays` and `dpctl.tensors` on `GPU`
68
- @pytest.mark.parametrize(
69
- "dataframe,queue", get_dataframes_and_queues(device_filter_="cpu")
70
- )
62
+ @pytest.mark.parametrize("dataframe,queue", get_dataframes_and_queues())
71
63
  def test_sklearnex_import_svr(dataframe, queue):
64
+ if queue and queue.sycl_device.is_gpu:
65
+ pytest.skip("SVR fit for the GPU sycl_queue is buggy.")
72
66
  from sklearnex.svm import SVR
73
67
 
74
68
  X = np.array([[-2, -1], [-1, -1], [-1, -2], [+1, +1], [+1, +2], [+2, +1]])
@@ -81,12 +75,10 @@ def test_sklearnex_import_svr(dataframe, queue):
81
75
  assert_allclose(_as_numpy(svc.support_), [1, 3])
82
76
 
83
77
 
84
- # TODO:
85
- # investigate failure for `dpnp.ndarrays` and `dpctl.tensors` on `GPU`
86
- @pytest.mark.parametrize(
87
- "dataframe,queue", get_dataframes_and_queues(device_filter_="cpu")
88
- )
78
+ @pytest.mark.parametrize("dataframe,queue", get_dataframes_and_queues())
89
79
  def test_sklearnex_import_nusvr(dataframe, queue):
80
+ if queue and queue.sycl_device.is_gpu:
81
+ pytest.skip("NuSVR fit for the GPU sycl_queue is buggy.")
90
82
  from sklearnex.svm import NuSVR
91
83
 
92
84
  X = np.array([[-2, -1], [-1, -1], [-1, -2], [+1, +1], [+1, +2], [+2, +1]])
sklearnex/tests/_utils.py CHANGED
@@ -14,9 +14,11 @@
14
14
  # limitations under the License.
15
15
  # ==============================================================================
16
16
 
17
- from inspect import isclass
17
+ from functools import partial
18
+ from inspect import getattr_static, isclass, signature
18
19
 
19
20
  import numpy as np
21
+ from scipy import sparse as sp
20
22
  from sklearn import clone
21
23
  from sklearn.base import (
22
24
  BaseEstimator,
@@ -42,6 +44,22 @@ from sklearnex.svm import SVC, NuSVC
42
44
 
43
45
 
44
46
  def _load_all_models(with_sklearnex=True, estimator=True):
47
+ """Convert sklearnex patch_map into a dictionary of estimators or functions
48
+
49
+ Parameters
50
+ ----------
51
+ with_sklearnex: bool (default=True)
52
+ Discover estimators and methods with sklearnex patching enabled (True)
53
+ or disabled (False) from the sklearnex patch_map
54
+
55
+ estimator: bool (default=True)
56
+ yield estimators (True) or functions (False)
57
+
58
+ Returns
59
+ -------
60
+ dict: {name:estimator}
61
+ estimator is a class or function from sklearn or sklearnex
62
+ """
45
63
  # ensure that patch state is correct as dictated by patch_sklearn boolean
46
64
  # and return it to the previous state no matter what occurs.
47
65
  already_patched_map = sklearn_is_patched(return_map=True)
@@ -89,11 +107,17 @@ mixin_map = [
89
107
 
90
108
 
91
109
  class _sklearn_clone_dict(dict):
110
+ """Special dict type for returning state-free sklearn/sklearnex estimators
111
+ with the same parameters"""
92
112
 
93
113
  def __getitem__(self, key):
94
114
  return clone(super().__getitem__(key))
95
115
 
96
116
 
117
+ # Special dictionary of sklearnex estimators which must be specifically tested, this
118
+ # could be because of supported non-default parameters, blocked support via sklearn's
119
+ # 'available_if' decorator, or not being a native sklearn estimator (i.e. those not in
120
+ # the default PATCHED_MODELS dictionary)
97
121
  SPECIAL_INSTANCES = _sklearn_clone_dict(
98
122
  {
99
123
  str(i): i
@@ -110,27 +134,58 @@ SPECIAL_INSTANCES = _sklearn_clone_dict(
110
134
  )
111
135
 
112
136
 
113
- def gen_models_info(algorithms):
137
+ def gen_models_info(algorithms, required_inputs=["X", "y"]):
138
+ """Generate estimator-attribute pairs for pytest test collection.
139
+
140
+ Parameters
141
+ ----------
142
+ algorithms : iterable (list, tuple, 1D array-like object)
143
+ Iterable of valid sklearnex estimators or keys from PATCHED_MODELS
144
+
145
+ required_inputs : list, tuple of strings or None
146
+ list of required args/kwargs for callable attribute (only non-private,
147
+ non-BaseEstimator attributes). Only one must be present, None
148
+ signifies taking all non-private attributes, callable or not.
149
+
150
+ Returns
151
+ -------
152
+ list of 2-element tuples: (estimator, string)
153
+ Returns a list of valid methods or attributes without "fit"
154
+ """
114
155
  output = []
115
- for i in algorithms:
156
+ for estimator in algorithms:
116
157
 
117
- if i in PATCHED_MODELS:
118
- est = PATCHED_MODELS[i]
119
- elif i in SPECIAL_INSTANCES:
120
- est = SPECIAL_INSTANCES[i].__class__
158
+ if estimator in PATCHED_MODELS:
159
+ est = PATCHED_MODELS[estimator]
160
+ elif isinstance(algorithms[estimator], BaseEstimator):
161
+ est = algorithms[estimator].__class__
121
162
  else:
122
- raise KeyError(f"Unrecognized sklearnex estimator: {i}")
163
+ raise KeyError(f"Unrecognized sklearnex estimator: {estimator}")
123
164
 
124
- methods = set()
125
- candidates = set(
126
- [i for i in dir(est) if not i.startswith("_") and not i.endswith("_")]
127
- )
165
+ # remove BaseEstimator methods (get_params, set_params)
166
+ candidates = set(dir(est)) - set(dir(BaseEstimator))
167
+ # remove private methods
168
+ candidates = set([attr for attr in candidates if not attr.startswith("_")])
169
+ # required to enable other methods
170
+ candidates = candidates - {"fit"}
128
171
 
129
- for mixin, method, _ in mixin_map:
130
- if issubclass(est, mixin):
131
- methods |= candidates & set(method)
172
+ # allow only callable methods with any of the required inputs
173
+ if required_inputs:
174
+ methods = []
175
+ for attr in candidates:
176
+ attribute = getattr_static(est, attr)
177
+ if callable(attribute):
178
+ params = signature(attribute).parameters
179
+ if any([inp in params for inp in required_inputs]):
180
+ methods += [attr]
181
+ else:
182
+ methods = candidates
132
183
 
133
- output += [[i, j] for j in methods] if methods else [[i, None]]
184
+ output += (
185
+ [(estimator, method) for method in methods]
186
+ if methods
187
+ else [(estimator, None)]
188
+ )
134
189
 
135
190
  # In the case that no methods are available, set method to None.
136
191
  # This will allow estimators without mixins to still test the fit
@@ -138,24 +193,124 @@ def gen_models_info(algorithms):
138
193
  return output
139
194
 
140
195
 
141
- def gen_dataset(estimator, queue=None, target_df=None, dtype=np.float64):
142
- dataset = None
143
- name = estimator.__class__.__name__
144
- est = PATCHED_MODELS[name]
196
+ def call_method(estimator, method, X, y, **kwargs):
197
+ """Generalized interface to call most sklearn estimator methods
198
+
199
+ Parameters
200
+ ----------
201
+ estimator : sklearn or sklearnex estimator instance
202
+
203
+ method: string
204
+ Valid callable method to estimator
205
+
206
+ X: array-like
207
+ data
208
+
209
+ y: array-like (for 'score', 'partial_fit', and 'path')
210
+ X-dependent data
211
+
212
+ **kwargs: keyword dict
213
+ keyword arguments to estimator.method
214
+
215
+ Returns
216
+ -------
217
+ return value from estimator.method
218
+ """
219
+ # useful for repository wide testing
220
+ if method == "inverse_transform":
221
+ # PCA's inverse_transform takes (n_samples, n_components)
222
+ data = (
223
+ (X[:, : estimator.n_components_],)
224
+ if X.shape[1] != estimator.n_components_
225
+ else (X,)
226
+ )
227
+ elif method not in ["score", "partial_fit", "path"]:
228
+ data = (X,)
229
+ else:
230
+ data = (X, y)
231
+ return getattr(estimator, method)(*data, **kwargs)
232
+
233
+
234
+ def _gen_dataset_type(est):
235
+ # est should be an estimator or estimator class
236
+ # dataset initialized to classification, but will be swapped
237
+ # for other types as necessary. Private method.
238
+ dataset = "classification"
239
+ estimator = est.__class__ if isinstance(est, BaseEstimator) else est
240
+
145
241
  for mixin, _, data in mixin_map:
146
- if issubclass(est, mixin) and data is not None:
242
+ if issubclass(estimator, mixin) and data is not None:
147
243
  dataset = data
244
+ return dataset
245
+
246
+
247
+ _dataset_dict = {
248
+ "classification": [partial(load_iris, return_X_y=True)],
249
+ "regression": [partial(load_diabetes, return_X_y=True)],
250
+ }
251
+
252
+
253
+ def gen_dataset(
254
+ est,
255
+ datasets=_dataset_dict,
256
+ sparse=False,
257
+ queue=None,
258
+ target_df=None,
259
+ dtype=None,
260
+ ):
261
+ """Generate dataset for pytest testing.
262
+
263
+ Parameters
264
+ ----------
265
+ est : sklearn or sklearnex estimator class
266
+ Must inherit an sklearn Mixin or sklearn's BaseEstimator
267
+
268
+ datasets: dataset dict
269
+ Dictionary with keys "classification" and/or "regression"
270
+ Value must be a list of objects which yield X, y array
271
+ objects when called, ideally using a lambda or
272
+ functools.partial.
273
+
274
+ sparse: bool (default False)
275
+ Convert X data to a scipy.sparse csr_matrix format.
276
+
277
+ queue: SYCL queue or None
278
+ Queue necessary for device offloading following the
279
+ SYCL 2020 standard, usually generated by dpctl.
280
+
281
+ target_df: string or None
282
+ dataframe type for returned dataset, as dictated by
283
+ onedal's _convert_to_dataframe.
284
+
285
+ dtype: numpy dtype or None
286
+ target datatype for returned datasets (see DTYPES).
287
+
288
+ Returns
289
+ -------
290
+ list of 2-element list X,y: (array-like, array-like)
291
+ list of datasets for analysis
292
+ """
293
+ dataset_type = _gen_dataset_type(est)
294
+ output = []
148
295
  # load data
149
- if dataset == "classification" or dataset is None:
150
- X, y = load_iris(return_X_y=True)
151
- elif dataset == "regression":
152
- X, y = load_diabetes(return_X_y=True)
153
- else:
154
- raise ValueError("Unknown dataset type")
296
+ flag = dtype is None
297
+
298
+ for func in datasets[dataset_type]:
299
+ X, y = func()
300
+ if flag:
301
+ dtype = X.dtype if hasattr(X, "dtype") else np.float64
155
302
 
156
- X = _convert_to_dataframe(X, sycl_queue=queue, target_df=target_df, dtype=dtype)
157
- y = _convert_to_dataframe(y, sycl_queue=queue, target_df=target_df, dtype=dtype)
158
- return X, y
303
+ if sparse:
304
+ X = sp.csr_matrix(X)
305
+ else:
306
+ X = _convert_to_dataframe(
307
+ X, sycl_queue=queue, target_df=target_df, dtype=dtype
308
+ )
309
+ y = _convert_to_dataframe(
310
+ y, sycl_queue=queue, target_df=target_df, dtype=dtype
311
+ )
312
+ output += [[X, y]]
313
+ return output
159
314
 
160
315
 
161
316
  DTYPES = [
@@ -0,0 +1,185 @@
1
+ # ==============================================================================
2
+ # Copyright 2024 Intel Corporation
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ # ==============================================================================
16
+
17
+ import numpy as np
18
+ from numpy.testing import assert_allclose
19
+ from sklearn.datasets import make_blobs, make_classification, make_regression
20
+ from sklearn.model_selection import train_test_split
21
+
22
+ from onedal.tests.utils._dataframes_support import _as_numpy
23
+
24
+ try:
25
+ import dpctl
26
+ from dpctl import SyclQueue
27
+ from mpi4py import MPI
28
+
29
+ mpi_libs_available = True
30
+ gpu_is_available = dpctl.has_gpu_devices()
31
+ except (ImportError, ModuleNotFoundError):
32
+ mpi_libs_available = False
33
+
34
+ _mpi_libs_and_gpu_available = mpi_libs_available and gpu_is_available
35
+
36
+
37
+ def _get_local_tensor(full_data):
38
+ """Splits data across ranks.
39
+
40
+ Called on each rank to extract the subset of data assigned to that rank.
41
+
42
+ Args:
43
+ full_data (numpy or dpctl array): The entire set of data
44
+
45
+ Returns:
46
+ local_data (numpy or dpctl array): The subset of data used by the rank
47
+ """
48
+
49
+ # create sycl queue and gather communicator details
50
+ q = SyclQueue("gpu")
51
+ comm = MPI.COMM_WORLD
52
+ rank = comm.Get_rank()
53
+ size = comm.Get_size()
54
+
55
+ # divide data across ranks and move to dpt tensor
56
+ data_rows = full_data.shape[0]
57
+ local_start = rank * data_rows // size
58
+ local_end = (1 + rank) * data_rows // size
59
+ local_data = full_data[local_start:local_end]
60
+
61
+ return local_data
62
+
63
+
64
+ def _generate_regression_data(n_samples, n_features, dtype=np.float64, random_state=42):
65
+ # Generates regression data and divides between train and test
66
+ X, y = make_regression(
67
+ n_samples=n_samples, n_features=n_features, random_state=random_state
68
+ )
69
+ X = X.astype(dtype)
70
+ y = y.astype(dtype)
71
+ X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state)
72
+ return X_train, X_test, y_train, y_test
73
+
74
+
75
+ def _generate_classification_data(
76
+ n_samples, n_features, n_classes=2, dtype=np.float64, random_state=42
77
+ ):
78
+ # Generates classification data and divides between train and test
79
+ X, y = make_classification(
80
+ n_samples=n_samples,
81
+ n_features=n_features,
82
+ n_classes=n_classes,
83
+ n_informative=int(0.5 * n_classes + 1),
84
+ random_state=random_state,
85
+ )
86
+ X = X.astype(dtype)
87
+ y = y.astype(dtype)
88
+ X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state)
89
+ return X_train, X_test, y_train, y_test
90
+
91
+
92
+ def _generate_statistic_data(n_samples, n_features, dtype=np.float64, random_state=42):
93
+ # Generates statistical data
94
+ gen = np.random.default_rng(random_state)
95
+ data = gen.uniform(low=-0.3, high=+0.7, size=(n_samples, n_features)).astype(dtype)
96
+ return data
97
+
98
+
99
+ def _generate_clustering_data(
100
+ n_samples, n_features, centers=None, dtype=np.float64, random_state=42
101
+ ):
102
+ # Generates clustering data and divides between train and test
103
+ X, _ = make_blobs(
104
+ n_samples=n_samples,
105
+ centers=centers,
106
+ n_features=n_features,
107
+ random_state=random_state,
108
+ )
109
+ X = X.astype(dtype)
110
+ X_train, X_test = train_test_split(X, random_state=random_state)
111
+ return X_train, X_test
112
+
113
+
114
+ def _spmd_assert_allclose(spmd_result, batch_result, **kwargs):
115
+ """Calls assert_allclose on spmd and batch results.
116
+
117
+ Called on each rank to compare the spmd result specific to that rank and
118
+ subset of batch result that corresponds to that rank.
119
+
120
+ Args:
121
+ spmd_result (numpy or dpctl array): The result for the subset of data on the rank the function is called from, computed by the spmd estimator
122
+ batch_result (numpy array): The result for all data, computed by the batch estimator
123
+
124
+ Raises:
125
+ AssertionError: If all results are not adequately close.
126
+ """
127
+
128
+ # extract chunk from batch result to match with local spmd result
129
+ local_batch_result = _get_local_tensor(batch_result)
130
+
131
+ assert_allclose(_as_numpy(spmd_result), _as_numpy(local_batch_result), **kwargs)
132
+
133
+
134
+ def _assert_unordered_allclose(spmd_result, batch_result, localize=False, **kwargs):
135
+ """Checks if rows in spmd and batch results are aligned, even if not in the same order.
136
+
137
+ Called to verify correct unordered results are present. Useful to check KMeans centers
138
+ or KNN neighbors, where order does not matter. Sorts inputs to handle unordering. Also
139
+ capable of handling localization.
140
+
141
+ Args:
142
+ spmd_result (numpy or dpctl array): Result computed by the spmd estimator
143
+ batch_result (numpy array): Result computed by batch estimator
144
+ localize (bool): Whether or not spmd result is specific to the rank, in which case batch result needs to be localized
145
+
146
+ Raises:
147
+ AssertionError: If results do not match.
148
+ """
149
+
150
+ sorted_spmd_result = spmd_result[np.argsort(np.linalg.norm(spmd_result, axis=1))]
151
+ if localize:
152
+ local_batch_result = _get_local_tensor(batch_result)
153
+ sorted_batch_result = local_batch_result[
154
+ np.argsort(np.linalg.norm(local_batch_result, axis=1))
155
+ ]
156
+ else:
157
+ sorted_batch_result = batch_result[
158
+ np.argsort(np.linalg.norm(batch_result, axis=1))
159
+ ]
160
+
161
+ assert_allclose(_as_numpy(sorted_spmd_result), sorted_batch_result, **kwargs)
162
+
163
+
164
+ def _assert_kmeans_labels_allclose(
165
+ spmd_labels, batch_labels, spmd_centers, batch_centers, **kwargs
166
+ ):
167
+ """Checks if labels for spmd and batch results are aligned, even if cluster indices don't match.
168
+
169
+ Called to verify labels are assigned the same way on spmd and batch. Uses raw labels (which
170
+ may not match) to identify cluster center and ensure results match.
171
+
172
+ Args:
173
+ spmd_labels (numpy or dpctl array): The labels for the subset of data on the rank the function is called from, computed by the spmd estimator
174
+ batch_labels (numpy array): The labels for all data, computed by the batch estimator
175
+ spmd_centers (numpy or dpctl array): Centers computed by the spmd estimator
176
+ batch_centers (numpy array): Centers computed by batch estimator
177
+
178
+ Raises:
179
+ AssertionError: If clusters are not correctly assigned.
180
+ """
181
+
182
+ local_batch_labels = _get_local_tensor(batch_labels)
183
+ assert_allclose(
184
+ spmd_centers[_as_numpy(spmd_labels)], batch_centers[local_batch_labels], **kwargs
185
+ )