scikit-learn-intelex 2024.5.0__py312-none-manylinux1_x86_64.whl → 2024.6.0__py312-none-manylinux1_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35):
  1. {scikit_learn_intelex-2024.5.0.dist-info → scikit_learn_intelex-2024.6.0.dist-info}/METADATA +2 -2
  2. {scikit_learn_intelex-2024.5.0.dist-info → scikit_learn_intelex-2024.6.0.dist-info}/RECORD +34 -30
  3. sklearnex/cluster/dbscan.py +3 -0
  4. sklearnex/cluster/tests/test_dbscan.py +8 -6
  5. sklearnex/conftest.py +11 -1
  6. sklearnex/decomposition/tests/test_pca.py +4 -2
  7. sklearnex/dispatcher.py +15 -1
  8. sklearnex/ensemble/_forest.py +114 -23
  9. sklearnex/ensemble/tests/test_forest.py +13 -3
  10. sklearnex/glob/dispatcher.py +16 -2
  11. sklearnex/linear_model/incremental_linear.py +102 -25
  12. sklearnex/linear_model/linear.py +25 -7
  13. sklearnex/linear_model/logistic_regression.py +13 -15
  14. sklearnex/linear_model/tests/test_incremental_linear.py +10 -10
  15. sklearnex/linear_model/tests/test_linear.py +2 -2
  16. sklearnex/neighbors/knn_regression.py +24 -0
  17. sklearnex/preview/__init__.py +1 -1
  18. sklearnex/preview/decomposition/__init__.py +19 -0
  19. sklearnex/preview/decomposition/incremental_pca.py +228 -0
  20. sklearnex/preview/decomposition/tests/test_incremental_pca.py +266 -0
  21. sklearnex/svm/_common.py +165 -20
  22. sklearnex/svm/nusvc.py +40 -4
  23. sklearnex/svm/nusvr.py +31 -2
  24. sklearnex/svm/svc.py +40 -4
  25. sklearnex/svm/svr.py +31 -2
  26. sklearnex/tests/_utils.py +49 -17
  27. sklearnex/tests/test_common.py +54 -0
  28. sklearnex/tests/test_memory_usage.py +185 -126
  29. sklearnex/tests/test_patching.py +5 -12
  30. sklearnex/tests/test_run_to_run_stability.py +283 -0
  31. sklearnex/utils/_namespace.py +1 -1
  32. sklearnex/tests/test_run_to_run_stability_tests.py +0 -428
  33. {scikit_learn_intelex-2024.5.0.dist-info → scikit_learn_intelex-2024.6.0.dist-info}/LICENSE.txt +0 -0
  34. {scikit_learn_intelex-2024.5.0.dist-info → scikit_learn_intelex-2024.6.0.dist-info}/WHEEL +0 -0
  35. {scikit_learn_intelex-2024.5.0.dist-info → scikit_learn_intelex-2024.6.0.dist-info}/top_level.txt +0 -0
@@ -14,126 +14,113 @@
14
14
  # limitations under the License.
15
15
  # ==============================================================================
16
16
 
17
-
18
17
  import gc
19
18
  import logging
19
+ import os
20
20
  import tracemalloc
21
21
  import types
22
+ import warnings
23
+ from inspect import isclass
22
24
 
23
25
  import numpy as np
24
26
  import pandas as pd
25
27
  import pytest
26
28
  from scipy.stats import pearsonr
27
- from sklearn.base import BaseEstimator
29
+ from sklearn.base import BaseEstimator, clone
28
30
  from sklearn.datasets import make_classification
29
31
  from sklearn.model_selection import KFold
30
32
 
31
- from sklearnex import get_patch_map
32
- from sklearnex.metrics import pairwise_distances, roc_auc_score
33
- from sklearnex.model_selection import train_test_split
34
- from sklearnex.utils import _assert_all_finite
35
-
36
-
37
- class TrainTestSplitEstimator:
38
- def __init__(self):
39
- pass
40
-
41
- def fit(self, x, y):
42
- train_test_split(x, y)
43
-
44
-
45
- class FiniteCheckEstimator:
46
- def __init__(self):
47
- pass
48
-
49
- def fit(self, x, y):
50
- _assert_all_finite(x)
51
- _assert_all_finite(y)
52
-
53
-
54
- class PairwiseDistancesEstimator:
55
- def fit(self, x, y):
56
- pairwise_distances(x, metric=self.metric)
57
-
58
-
59
- class CosineDistancesEstimator(PairwiseDistancesEstimator):
60
- def __init__(self):
61
- self.metric = "cosine"
62
-
63
-
64
- class CorrelationDistancesEstimator(PairwiseDistancesEstimator):
65
- def __init__(self):
66
- self.metric = "correlation"
67
-
68
-
69
- class RocAucEstimator:
70
- def __init__(self):
71
- pass
72
-
73
- def fit(self, x, y):
74
- print(roc_auc_score(y, np.zeros(shape=y.shape, dtype=np.int32)))
75
-
76
-
77
- # add all daal4py estimators enabled in patching (except banned)
78
-
79
-
80
- def get_patched_estimators(ban_list, output_list):
81
- patched_estimators = get_patch_map().values()
82
- for listing in patched_estimators:
83
- estimator, name = listing[0][0][2], listing[0][0][1]
84
- if not isinstance(estimator, types.FunctionType):
85
- if name not in ban_list:
86
- if issubclass(estimator, BaseEstimator):
87
- if hasattr(estimator, "fit"):
88
- output_list.append(estimator)
89
-
33
+ from onedal import _is_dpc_backend
34
+ from onedal.tests.utils._dataframes_support import (
35
+ _convert_to_dataframe,
36
+ get_dataframes_and_queues,
37
+ )
38
+ from onedal.tests.utils._device_selection import get_queues, is_dpctl_available
39
+ from sklearnex import config_context
40
+ from sklearnex.tests._utils import PATCHED_FUNCTIONS, PATCHED_MODELS, SPECIAL_INSTANCES
41
+ from sklearnex.utils import get_namespace
90
42
 
91
- def remove_duplicated_estimators(estimators_list):
92
- estimators_map = {}
93
- for estimator in estimators_list:
94
- full_name = f"{estimator.__module__}.{estimator.__name__}"
95
- estimators_map[full_name] = estimator
96
- return estimators_map.values()
43
+ if _is_dpc_backend:
44
+ from onedal import _backend
97
45
 
98
46
 
99
- BANNED_ESTIMATORS = (
47
+ CPU_SKIP_LIST = (
48
+ "TSNE", # too slow for using in testing on common data size
49
+ "config_context", # does not malloc
50
+ "get_config", # does not malloc
51
+ "set_config", # does not malloc
52
+ "SVC(probability=True)", # memory leak fortran numpy (investigate _fit_proba)
53
+ "NuSVC(probability=True)", # memory leak fortran numpy (investigate _fit_proba)
100
54
  "IncrementalEmpiricalCovariance", # dataframe_f issues
101
55
  "IncrementalLinearRegression", # TODO fix memory leak issue in private CI for data_shape = (1000, 100), data_transform_function = dataframe_f
102
- "TSNE", # too slow for using in testing on common data size
56
+ "IncrementalPCA", # TODO fix memory leak issue in private CI for data_shape = (1000, 100), data_transform_function = dataframe_f
57
+ "LogisticRegression(solver='newton-cg')", # memory leak fortran (1000, 100)
103
58
  )
104
- estimators = [
105
- TrainTestSplitEstimator,
106
- FiniteCheckEstimator,
107
- CosineDistancesEstimator,
108
- CorrelationDistancesEstimator,
109
- RocAucEstimator,
110
- ]
111
- get_patched_estimators(BANNED_ESTIMATORS, estimators)
112
- estimators = remove_duplicated_estimators(estimators)
113
59
 
60
+ GPU_SKIP_LIST = (
61
+ "TSNE", # too slow for using in testing on common data size
62
+ "RandomForestRegressor", # too slow for using in testing on common data size
63
+ "KMeans", # does not support GPU offloading
64
+ "config_context", # does not malloc
65
+ "get_config", # does not malloc
66
+ "set_config", # does not malloc
67
+ "Ridge", # does not support GPU offloading (fails silently)
68
+ "ElasticNet", # does not support GPU offloading (fails silently)
69
+ "Lasso", # does not support GPU offloading (fails silently)
70
+ "SVR", # does not support GPU offloading (fails silently)
71
+ "NuSVR", # does not support GPU offloading (fails silently)
72
+ "NuSVC", # does not support GPU offloading (fails silently)
73
+ "LogisticRegression", # default parameters not supported, see solver=newton-cg
74
+ "NuSVC(probability=True)", # does not support GPU offloading (fails silently)
75
+ "IncrementalLinearRegression", # issue with potrf with the specific dataset
76
+ "LinearRegression", # issue with potrf with the specific dataset
77
+ )
114
78
 
115
- def ndarray_c(x, y):
116
- return np.ascontiguousarray(x), y
117
79
 
80
+ def gen_functions(functions):
81
+ func_dict = functions.copy()
118
82
 
119
- def ndarray_f(x, y):
120
- return np.asfortranarray(x), y
83
+ roc_auc_score = func_dict.pop("roc_auc_score")
84
+ func_dict["roc_auc_score"] = lambda x, y: roc_auc_score(y, y)
121
85
 
86
+ pairwise_distances = func_dict.pop("pairwise_distances")
87
+ func_dict["pairwise_distances(metric='cosine')"] = lambda x, y: pairwise_distances(
88
+ x, metric="cosine"
89
+ )
90
+ func_dict["pairwise_distances(metric='correlation')"] = (
91
+ lambda x, y: pairwise_distances(x, metric="correlation")
92
+ )
122
93
 
123
- def dataframe_c(x, y):
124
- return pd.DataFrame(np.ascontiguousarray(x)), pd.Series(y)
94
+ _assert_all_finite = func_dict.pop("_assert_all_finite")
95
+ func_dict["_assert_all_finite"] = lambda x, y: [
96
+ _assert_all_finite(x),
97
+ _assert_all_finite(y),
98
+ ]
99
+ return func_dict
125
100
 
126
101
 
127
- def dataframe_f(x, y):
128
- return pd.DataFrame(np.asfortranarray(x)), pd.Series(y)
102
+ FUNCTIONS = gen_functions(PATCHED_FUNCTIONS)
129
103
 
104
+ CPU_ESTIMATORS = {
105
+ k: v
106
+ for k, v in {**PATCHED_MODELS, **SPECIAL_INSTANCES, **FUNCTIONS}.items()
107
+ if not k in CPU_SKIP_LIST
108
+ }
130
109
 
131
- data_transforms = [ndarray_c, ndarray_f, dataframe_c, dataframe_f]
110
+ GPU_ESTIMATORS = {
111
+ k: v
112
+ for k, v in {**PATCHED_MODELS, **SPECIAL_INSTANCES}.items()
113
+ if not k in GPU_SKIP_LIST
114
+ }
132
115
 
133
- data_shapes = [(1000, 100), (2000, 50)]
116
+ data_shapes = [
117
+ pytest.param((1000, 100), id="(1000, 100)"),
118
+ pytest.param((2000, 50), id="(2000, 50)"),
119
+ ]
134
120
 
135
121
  EXTRA_MEMORY_THRESHOLD = 0.15
136
122
  N_SPLITS = 10
123
+ ORDER_DICT = {"F": np.asfortranarray, "C": np.ascontiguousarray}
137
124
 
138
125
 
139
126
  def gen_clsf_data(n_samples, n_features):
@@ -147,45 +134,82 @@ def gen_clsf_data(n_samples, n_features):
147
134
  )
148
135
 
149
136
 
150
- def split_train_inference(kf, x, y, estimator):
137
+ def get_traced_memory(queue=None):
138
+ if _is_dpc_backend and queue and queue.sycl_device.is_gpu:
139
+ return _backend.get_used_memory(queue)
140
+ else:
141
+ return tracemalloc.get_traced_memory()[0]
142
+
143
+
144
+ def take(x, index, axis=0, queue=None):
145
+ xp, array_api = get_namespace(x)
146
+ if array_api:
147
+ return xp.take(x, xp.asarray(index, device=queue), axis=axis)
148
+ else:
149
+ return x.take(index, axis=axis)
150
+
151
+
152
+ def split_train_inference(kf, x, y, estimator, queue=None):
151
153
  mem_tracks = []
152
154
  for train_index, test_index in kf.split(x):
153
- if isinstance(x, np.ndarray):
154
- x_train, x_test = x[train_index], x[test_index]
155
- y_train, y_test = y[train_index], y[test_index]
156
- elif isinstance(x, pd.core.frame.DataFrame):
157
- x_train, x_test = x.iloc[train_index], x.iloc[test_index]
158
- y_train, y_test = y.iloc[train_index], y.iloc[test_index]
159
- # TODO: add parameters for all estimators to prevent
160
- # fallback to stock scikit-learn with default parameters
161
-
162
- alg = estimator()
163
- alg.fit(x_train, y_train)
164
- if hasattr(alg, "predict"):
165
- alg.predict(x_test)
166
- elif hasattr(alg, "transform"):
167
- alg.transform(x_test)
168
- elif hasattr(alg, "kneighbors"):
169
- alg.kneighbors(x_test)
170
- del alg, x_train, x_test, y_train, y_test
171
- mem_tracks.append(tracemalloc.get_traced_memory()[0])
155
+ x_train = take(x, train_index, queue=queue)
156
+ y_train = take(y, train_index, queue=queue)
157
+ x_test = take(x, test_index, queue=queue)
158
+ y_test = take(y, test_index, queue=queue)
159
+
160
+ if isclass(estimator) and issubclass(estimator, BaseEstimator):
161
+ alg = estimator()
162
+ flag = True
163
+ elif isinstance(estimator, BaseEstimator):
164
+ alg = clone(estimator)
165
+ flag = True
166
+ else:
167
+ flag = False
168
+
169
+ if flag:
170
+ alg.fit(x_train, y_train)
171
+ if hasattr(alg, "predict"):
172
+ alg.predict(x_test)
173
+ elif hasattr(alg, "transform"):
174
+ alg.transform(x_test)
175
+ elif hasattr(alg, "kneighbors"):
176
+ alg.kneighbors(x_test)
177
+ del alg
178
+ else:
179
+ estimator(x_train, y_train)
180
+
181
+ del x_train, x_test, y_train, y_test, flag
182
+ mem_tracks.append(get_traced_memory(queue))
172
183
  return mem_tracks
173
184
 
174
185
 
175
- def _kfold_function_template(estimator, data_transform_function, data_shape):
186
+ def _kfold_function_template(estimator, dataframe, data_shape, queue=None, func=None):
176
187
  tracemalloc.start()
177
188
 
178
189
  n_samples, n_features = data_shape
179
- x, y, data_memory_size = gen_clsf_data(n_samples, n_features)
190
+ X, y, data_memory_size = gen_clsf_data(n_samples, n_features)
180
191
  kf = KFold(n_splits=N_SPLITS)
181
- x, y = data_transform_function(x, y)
192
+ if func:
193
+ X = func(X)
194
+
195
+ X = _convert_to_dataframe(X, sycl_queue=queue, target_df=dataframe)
196
+ y = _convert_to_dataframe(y, sycl_queue=queue, target_df=dataframe)
182
197
 
183
- mem_before, _ = tracemalloc.get_traced_memory()
184
- mem_tracks = split_train_inference(kf, x, y, estimator)
198
+ mem_before = get_traced_memory(queue)
199
+ mem_tracks = split_train_inference(kf, X, y, estimator, queue=queue)
185
200
  mem_iter_diffs = np.array(mem_tracks[1:]) - np.array(mem_tracks[:-1])
186
201
  mem_incr_mean, mem_incr_std = mem_iter_diffs.mean(), mem_iter_diffs.std()
187
202
  mem_incr_mean, mem_incr_std = round(mem_incr_mean), round(mem_incr_std)
188
- mem_iter_corr, _ = pearsonr(mem_tracks, list(range(len(mem_tracks))))
203
+ with warnings.catch_warnings():
204
+ # In the case that the memory usage is constant, this will raise
205
+ # a ConstantInputWarning error in pearsonr from scipy, this can
206
+ # be ignored.
207
+ warnings.filterwarnings(
208
+ "ignore",
209
+ message="An input array is constant; the correlation coefficient is not defined",
210
+ )
211
+ mem_iter_corr, _ = pearsonr(mem_tracks, list(range(len(mem_tracks))))
212
+
189
213
  if mem_iter_corr > 0.95:
190
214
  logging.warning(
191
215
  "Memory usage is steadily increasing with iterations "
@@ -194,12 +218,17 @@ def _kfold_function_template(estimator, data_transform_function, data_shape):
194
218
  "Memory usage increase per iteration: "
195
219
  f"{mem_incr_mean}±{mem_incr_std} bytes"
196
220
  )
197
- mem_before_gc, _ = tracemalloc.get_traced_memory()
221
+ mem_before_gc = get_traced_memory(queue)
198
222
  mem_diff = mem_before_gc - mem_before
223
+ if isinstance(estimator, BaseEstimator):
224
+ name = str(estimator)
225
+ else:
226
+ name = estimator.__name__
227
+
199
228
  message = (
200
229
  "Size of extra allocated memory {} using garbage collector "
201
230
  f"is greater than {EXTRA_MEMORY_THRESHOLD * 100}% of input data"
202
- f"\n\tAlgorithm: {estimator.__name__}"
231
+ f"\n\tAlgorithm: {name}"
203
232
  f"\n\tInput data size: {data_memory_size} bytes"
204
233
  "\n\tExtra allocated memory size: {} bytes"
205
234
  " / {} %"
@@ -211,21 +240,51 @@ def _kfold_function_template(estimator, data_transform_function, data_shape):
211
240
  )
212
241
  )
213
242
  gc.collect()
214
- mem_after, _ = tracemalloc.get_traced_memory()
243
+ mem_after = get_traced_memory(queue)
215
244
  tracemalloc.stop()
216
245
  mem_diff = mem_after - mem_before
217
246
 
218
- assert mem_diff < EXTRA_MEMORY_THRESHOLD * data_memory_size, message.format(
219
- "after", mem_diff, round((mem_diff) / data_memory_size * 100, 2)
220
- )
247
+ # GPU offloading with SYCL contains a program/kernel cache which should
248
+ # be controllable via a KernelProgramCache object in the SYCL context.
249
+ # The programs and kernels are stored on the GPU, but cannot be cleared
250
+ # as this class is not available for access in all oneDAL DPC++ runtimes.
251
+ # Therefore, until this is implemented this test must be skipped for gpu
252
+ # as it looks like a memory leak (at least there is no way to discern a
253
+ # leak on the first run).
254
+ if queue is None or queue.sycl_device.is_cpu:
255
+ assert mem_diff < EXTRA_MEMORY_THRESHOLD * data_memory_size, message.format(
256
+ "after", mem_diff, round((mem_diff) / data_memory_size * 100, 2)
257
+ )
221
258
 
222
259
 
223
- # disable fallback check as logging impacts memory use
260
+ @pytest.mark.parametrize("order", ["F", "C"])
261
+ @pytest.mark.parametrize(
262
+ "dataframe,queue", get_dataframes_and_queues("numpy,pandas,dpctl", "cpu")
263
+ )
264
+ @pytest.mark.parametrize("estimator", CPU_ESTIMATORS.keys())
265
+ @pytest.mark.parametrize("data_shape", data_shapes)
266
+ def test_memory_leaks(estimator, dataframe, queue, order, data_shape):
267
+ func = ORDER_DICT[order]
268
+ if estimator == "_assert_all_finite" and queue is not None:
269
+ pytest.skip(f"{estimator} is not designed for device offloading")
270
+
271
+ _kfold_function_template(
272
+ CPU_ESTIMATORS[estimator], dataframe, data_shape, queue, func
273
+ )
224
274
 
225
275
 
226
- @pytest.mark.allow_sklearn_fallback
227
- @pytest.mark.parametrize("data_transform_function", data_transforms)
228
- @pytest.mark.parametrize("estimator", estimators)
276
+ @pytest.mark.skipif(
277
+ os.getenv("ZES_ENABLE_SYSMAN") is None or not is_dpctl_available("gpu"),
278
+ reason="SYCL device memory leak check requires the level zero sysman",
279
+ )
280
+ @pytest.mark.parametrize("queue", get_queues("gpu"))
281
+ @pytest.mark.parametrize("estimator", GPU_ESTIMATORS.keys())
282
+ @pytest.mark.parametrize("order", ["F", "C"])
229
283
  @pytest.mark.parametrize("data_shape", data_shapes)
230
- def test_memory_leaks(estimator, data_transform_function, data_shape):
231
- _kfold_function_template(estimator, data_transform_function, data_shape)
284
+ def test_gpu_memory_leaks(estimator, queue, order, data_shape):
285
+ func = ORDER_DICT[order]
286
+ if "ExtraTrees" in estimator and data_shape == (2000, 50):
287
+ pytest.skip("Avoid a segmentation fault in Extra Trees algorithms")
288
+
289
+ with config_context(target_offload=queue):
290
+ _kfold_function_template(GPU_ESTIMATORS[estimator], None, data_shape, queue, func)
@@ -148,23 +148,16 @@ def test_standard_estimator_patching(caplog, dataframe, queue, dtype, estimator,
148
148
  and dtype in [np.uint32, np.uint64]
149
149
  ):
150
150
  pytest.skip("Windows segmentation fault for Ridge.predict for unsigned ints")
151
- elif estimator == "IncrementalLinearRegression" and dtype in [
152
- np.int8,
153
- np.int16,
154
- np.int32,
155
- np.int64,
156
- np.uint8,
157
- np.uint16,
158
- np.uint32,
159
- np.uint64,
160
- ]:
151
+ elif estimator == "IncrementalLinearRegression" and np.issubdtype(
152
+ dtype, np.integer
153
+ ):
161
154
  pytest.skip(
162
155
  "IncrementalLinearRegression fails on oneDAL side with int types because dataset is filled by zeroes"
163
156
  )
164
157
  elif method and not hasattr(est, method):
165
158
  pytest.skip(f"sklearn available_if prevents testing {estimator}.{method}")
166
159
 
167
- X, y = gen_dataset(est, queue=queue, target_df=dataframe, dtype=dtype)
160
+ X, y = gen_dataset(est, queue=queue, target_df=dataframe, dtype=dtype)[0]
168
161
  est.fit(X, y)
169
162
 
170
163
  if method:
@@ -196,7 +189,7 @@ def test_special_estimator_patching(caplog, dataframe, queue, dtype, estimator,
196
189
  elif dtype == np.float64 and queue and not queue.sycl_device.has_aspect_fp64:
197
190
  pytest.skip("Hardware does not support fp64 SYCL testing")
198
191
 
199
- X, y = gen_dataset(est, queue=queue, target_df=dataframe, dtype=dtype)
192
+ X, y = gen_dataset(est, queue=queue, target_df=dataframe, dtype=dtype)[0]
200
193
  est.fit(X, y)
201
194
 
202
195
  if method and not hasattr(est, method):