scikit-learn-intelex 2025.4.0__py313-none-manylinux_2_28_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of scikit-learn-intelex might be problematic. Click here for more details.

Files changed (282) hide show
  1. daal4py/__init__.py +73 -0
  2. daal4py/__main__.py +58 -0
  3. daal4py/_daal4py.cpython-313-x86_64-linux-gnu.so +0 -0
  4. daal4py/doc/third-party-programs.txt +424 -0
  5. daal4py/mb/__init__.py +19 -0
  6. daal4py/mb/model_builders.py +377 -0
  7. daal4py/mpi_transceiver.cpython-313-x86_64-linux-gnu.so +0 -0
  8. daal4py/sklearn/__init__.py +40 -0
  9. daal4py/sklearn/_n_jobs_support.py +248 -0
  10. daal4py/sklearn/_utils.py +245 -0
  11. daal4py/sklearn/cluster/__init__.py +20 -0
  12. daal4py/sklearn/cluster/dbscan.py +165 -0
  13. daal4py/sklearn/cluster/k_means.py +597 -0
  14. daal4py/sklearn/cluster/tests/test_dbscan.py +109 -0
  15. daal4py/sklearn/decomposition/__init__.py +19 -0
  16. daal4py/sklearn/decomposition/_pca.py +524 -0
  17. daal4py/sklearn/ensemble/AdaBoostClassifier.py +196 -0
  18. daal4py/sklearn/ensemble/GBTDAAL.py +337 -0
  19. daal4py/sklearn/ensemble/__init__.py +27 -0
  20. daal4py/sklearn/ensemble/_forest.py +1397 -0
  21. daal4py/sklearn/ensemble/tests/test_decision_forest.py +206 -0
  22. daal4py/sklearn/linear_model/__init__.py +29 -0
  23. daal4py/sklearn/linear_model/_coordinate_descent.py +848 -0
  24. daal4py/sklearn/linear_model/_linear.py +272 -0
  25. daal4py/sklearn/linear_model/_ridge.py +325 -0
  26. daal4py/sklearn/linear_model/coordinate_descent.py +17 -0
  27. daal4py/sklearn/linear_model/linear.py +17 -0
  28. daal4py/sklearn/linear_model/logistic_loss.py +195 -0
  29. daal4py/sklearn/linear_model/logistic_path.py +1026 -0
  30. daal4py/sklearn/linear_model/ridge.py +17 -0
  31. daal4py/sklearn/linear_model/tests/test_linear.py +208 -0
  32. daal4py/sklearn/linear_model/tests/test_ridge.py +69 -0
  33. daal4py/sklearn/manifold/__init__.py +19 -0
  34. daal4py/sklearn/manifold/_t_sne.py +405 -0
  35. daal4py/sklearn/metrics/__init__.py +20 -0
  36. daal4py/sklearn/metrics/_pairwise.py +236 -0
  37. daal4py/sklearn/metrics/_ranking.py +210 -0
  38. daal4py/sklearn/model_selection/__init__.py +19 -0
  39. daal4py/sklearn/model_selection/_split.py +309 -0
  40. daal4py/sklearn/model_selection/tests/test_split.py +56 -0
  41. daal4py/sklearn/monkeypatch/__init__.py +0 -0
  42. daal4py/sklearn/monkeypatch/dispatcher.py +232 -0
  43. daal4py/sklearn/monkeypatch/tests/_models_info.py +161 -0
  44. daal4py/sklearn/monkeypatch/tests/test_monkeypatch.py +71 -0
  45. daal4py/sklearn/monkeypatch/tests/test_patching.py +90 -0
  46. daal4py/sklearn/monkeypatch/tests/utils/_launch_algorithms.py +117 -0
  47. daal4py/sklearn/neighbors/__init__.py +21 -0
  48. daal4py/sklearn/neighbors/_base.py +503 -0
  49. daal4py/sklearn/neighbors/_classification.py +139 -0
  50. daal4py/sklearn/neighbors/_regression.py +74 -0
  51. daal4py/sklearn/neighbors/_unsupervised.py +55 -0
  52. daal4py/sklearn/neighbors/tests/test_kneighbors.py +113 -0
  53. daal4py/sklearn/svm/__init__.py +19 -0
  54. daal4py/sklearn/svm/svm.py +734 -0
  55. daal4py/sklearn/utils/__init__.py +21 -0
  56. daal4py/sklearn/utils/base.py +75 -0
  57. daal4py/sklearn/utils/tests/test_utils.py +51 -0
  58. daal4py/sklearn/utils/validation.py +696 -0
  59. onedal/__init__.py +83 -0
  60. onedal/_config.py +54 -0
  61. onedal/_device_offload.py +204 -0
  62. onedal/_onedal_py_dpc.cpython-313-x86_64-linux-gnu.so +0 -0
  63. onedal/_onedal_py_host.cpython-313-x86_64-linux-gnu.so +0 -0
  64. onedal/_onedal_py_spmd_dpc.cpython-313-x86_64-linux-gnu.so +0 -0
  65. onedal/basic_statistics/__init__.py +20 -0
  66. onedal/basic_statistics/basic_statistics.py +107 -0
  67. onedal/basic_statistics/incremental_basic_statistics.py +175 -0
  68. onedal/basic_statistics/tests/test_basic_statistics.py +242 -0
  69. onedal/basic_statistics/tests/test_incremental_basic_statistics.py +279 -0
  70. onedal/basic_statistics/tests/utils.py +50 -0
  71. onedal/cluster/__init__.py +27 -0
  72. onedal/cluster/dbscan.py +105 -0
  73. onedal/cluster/kmeans.py +557 -0
  74. onedal/cluster/kmeans_init.py +112 -0
  75. onedal/cluster/tests/test_dbscan.py +125 -0
  76. onedal/cluster/tests/test_kmeans.py +88 -0
  77. onedal/cluster/tests/test_kmeans_init.py +93 -0
  78. onedal/common/_base.py +38 -0
  79. onedal/common/_estimator_checks.py +47 -0
  80. onedal/common/_mixin.py +62 -0
  81. onedal/common/_policy.py +55 -0
  82. onedal/common/_spmd_policy.py +30 -0
  83. onedal/common/hyperparameters.py +125 -0
  84. onedal/common/tests/test_policy.py +76 -0
  85. onedal/common/tests/test_sycl.py +128 -0
  86. onedal/covariance/__init__.py +20 -0
  87. onedal/covariance/covariance.py +122 -0
  88. onedal/covariance/incremental_covariance.py +161 -0
  89. onedal/covariance/tests/test_covariance.py +50 -0
  90. onedal/covariance/tests/test_incremental_covariance.py +190 -0
  91. onedal/datatypes/__init__.py +19 -0
  92. onedal/datatypes/_data_conversion.py +121 -0
  93. onedal/datatypes/tests/common.py +126 -0
  94. onedal/datatypes/tests/test_data.py +475 -0
  95. onedal/decomposition/__init__.py +20 -0
  96. onedal/decomposition/incremental_pca.py +214 -0
  97. onedal/decomposition/pca.py +186 -0
  98. onedal/decomposition/tests/test_incremental_pca.py +285 -0
  99. onedal/ensemble/__init__.py +29 -0
  100. onedal/ensemble/forest.py +736 -0
  101. onedal/ensemble/tests/test_random_forest.py +97 -0
  102. onedal/linear_model/__init__.py +27 -0
  103. onedal/linear_model/incremental_linear_model.py +292 -0
  104. onedal/linear_model/linear_model.py +325 -0
  105. onedal/linear_model/logistic_regression.py +247 -0
  106. onedal/linear_model/tests/test_incremental_linear_regression.py +213 -0
  107. onedal/linear_model/tests/test_incremental_ridge_regression.py +171 -0
  108. onedal/linear_model/tests/test_linear_regression.py +259 -0
  109. onedal/linear_model/tests/test_logistic_regression.py +95 -0
  110. onedal/linear_model/tests/test_ridge.py +95 -0
  111. onedal/neighbors/__init__.py +19 -0
  112. onedal/neighbors/neighbors.py +763 -0
  113. onedal/neighbors/tests/test_knn_classification.py +49 -0
  114. onedal/primitives/__init__.py +27 -0
  115. onedal/primitives/get_tree.py +25 -0
  116. onedal/primitives/kernel_functions.py +152 -0
  117. onedal/primitives/tests/test_kernel_functions.py +159 -0
  118. onedal/spmd/__init__.py +25 -0
  119. onedal/spmd/_base.py +30 -0
  120. onedal/spmd/basic_statistics/__init__.py +20 -0
  121. onedal/spmd/basic_statistics/basic_statistics.py +30 -0
  122. onedal/spmd/basic_statistics/incremental_basic_statistics.py +71 -0
  123. onedal/spmd/cluster/__init__.py +28 -0
  124. onedal/spmd/cluster/dbscan.py +23 -0
  125. onedal/spmd/cluster/kmeans.py +56 -0
  126. onedal/spmd/covariance/__init__.py +20 -0
  127. onedal/spmd/covariance/covariance.py +26 -0
  128. onedal/spmd/covariance/incremental_covariance.py +83 -0
  129. onedal/spmd/decomposition/__init__.py +20 -0
  130. onedal/spmd/decomposition/incremental_pca.py +124 -0
  131. onedal/spmd/decomposition/pca.py +26 -0
  132. onedal/spmd/ensemble/__init__.py +19 -0
  133. onedal/spmd/ensemble/forest.py +28 -0
  134. onedal/spmd/linear_model/__init__.py +21 -0
  135. onedal/spmd/linear_model/incremental_linear_model.py +101 -0
  136. onedal/spmd/linear_model/linear_model.py +30 -0
  137. onedal/spmd/linear_model/logistic_regression.py +38 -0
  138. onedal/spmd/neighbors/__init__.py +19 -0
  139. onedal/spmd/neighbors/neighbors.py +75 -0
  140. onedal/svm/__init__.py +19 -0
  141. onedal/svm/svm.py +556 -0
  142. onedal/svm/tests/test_csr_svm.py +351 -0
  143. onedal/svm/tests/test_nusvc.py +204 -0
  144. onedal/svm/tests/test_nusvr.py +210 -0
  145. onedal/svm/tests/test_svc.py +176 -0
  146. onedal/svm/tests/test_svr.py +243 -0
  147. onedal/tests/test_common.py +57 -0
  148. onedal/tests/utils/_dataframes_support.py +162 -0
  149. onedal/tests/utils/_device_selection.py +102 -0
  150. onedal/utils/__init__.py +49 -0
  151. onedal/utils/_array_api.py +81 -0
  152. onedal/utils/_dpep_helpers.py +56 -0
  153. onedal/utils/tests/test_validation.py +142 -0
  154. onedal/utils/validation.py +464 -0
  155. scikit_learn_intelex-2025.4.0.dist-info/LICENSE.txt +202 -0
  156. scikit_learn_intelex-2025.4.0.dist-info/METADATA +190 -0
  157. scikit_learn_intelex-2025.4.0.dist-info/RECORD +282 -0
  158. scikit_learn_intelex-2025.4.0.dist-info/WHEEL +5 -0
  159. scikit_learn_intelex-2025.4.0.dist-info/top_level.txt +3 -0
  160. sklearnex/__init__.py +66 -0
  161. sklearnex/__main__.py +58 -0
  162. sklearnex/_config.py +116 -0
  163. sklearnex/_device_offload.py +126 -0
  164. sklearnex/_utils.py +177 -0
  165. sklearnex/basic_statistics/__init__.py +20 -0
  166. sklearnex/basic_statistics/basic_statistics.py +261 -0
  167. sklearnex/basic_statistics/incremental_basic_statistics.py +352 -0
  168. sklearnex/basic_statistics/tests/test_basic_statistics.py +405 -0
  169. sklearnex/basic_statistics/tests/test_incremental_basic_statistics.py +455 -0
  170. sklearnex/cluster/__init__.py +20 -0
  171. sklearnex/cluster/dbscan.py +197 -0
  172. sklearnex/cluster/k_means.py +397 -0
  173. sklearnex/cluster/tests/test_dbscan.py +38 -0
  174. sklearnex/cluster/tests/test_kmeans.py +157 -0
  175. sklearnex/conftest.py +82 -0
  176. sklearnex/covariance/__init__.py +19 -0
  177. sklearnex/covariance/incremental_covariance.py +405 -0
  178. sklearnex/covariance/tests/test_incremental_covariance.py +287 -0
  179. sklearnex/decomposition/__init__.py +19 -0
  180. sklearnex/decomposition/pca.py +427 -0
  181. sklearnex/decomposition/tests/test_pca.py +58 -0
  182. sklearnex/dispatcher.py +534 -0
  183. sklearnex/doc/third-party-programs.txt +424 -0
  184. sklearnex/ensemble/__init__.py +29 -0
  185. sklearnex/ensemble/_forest.py +2029 -0
  186. sklearnex/ensemble/tests/test_forest.py +140 -0
  187. sklearnex/glob/__main__.py +72 -0
  188. sklearnex/glob/dispatcher.py +101 -0
  189. sklearnex/linear_model/__init__.py +32 -0
  190. sklearnex/linear_model/coordinate_descent.py +30 -0
  191. sklearnex/linear_model/incremental_linear.py +495 -0
  192. sklearnex/linear_model/incremental_ridge.py +432 -0
  193. sklearnex/linear_model/linear.py +346 -0
  194. sklearnex/linear_model/logistic_regression.py +415 -0
  195. sklearnex/linear_model/ridge.py +390 -0
  196. sklearnex/linear_model/tests/test_incremental_linear.py +267 -0
  197. sklearnex/linear_model/tests/test_incremental_ridge.py +214 -0
  198. sklearnex/linear_model/tests/test_linear.py +142 -0
  199. sklearnex/linear_model/tests/test_logreg.py +134 -0
  200. sklearnex/linear_model/tests/test_ridge.py +256 -0
  201. sklearnex/manifold/__init__.py +19 -0
  202. sklearnex/manifold/t_sne.py +26 -0
  203. sklearnex/manifold/tests/test_tsne.py +250 -0
  204. sklearnex/metrics/__init__.py +23 -0
  205. sklearnex/metrics/pairwise.py +22 -0
  206. sklearnex/metrics/ranking.py +20 -0
  207. sklearnex/metrics/tests/test_metrics.py +39 -0
  208. sklearnex/model_selection/__init__.py +21 -0
  209. sklearnex/model_selection/split.py +22 -0
  210. sklearnex/model_selection/tests/test_model_selection.py +34 -0
  211. sklearnex/neighbors/__init__.py +27 -0
  212. sklearnex/neighbors/_lof.py +236 -0
  213. sklearnex/neighbors/common.py +310 -0
  214. sklearnex/neighbors/knn_classification.py +231 -0
  215. sklearnex/neighbors/knn_regression.py +207 -0
  216. sklearnex/neighbors/knn_unsupervised.py +178 -0
  217. sklearnex/neighbors/tests/test_neighbors.py +82 -0
  218. sklearnex/preview/__init__.py +17 -0
  219. sklearnex/preview/covariance/__init__.py +19 -0
  220. sklearnex/preview/covariance/covariance.py +142 -0
  221. sklearnex/preview/covariance/tests/test_covariance.py +66 -0
  222. sklearnex/preview/decomposition/__init__.py +19 -0
  223. sklearnex/preview/decomposition/incremental_pca.py +244 -0
  224. sklearnex/preview/decomposition/tests/test_incremental_pca.py +336 -0
  225. sklearnex/spmd/__init__.py +25 -0
  226. sklearnex/spmd/basic_statistics/__init__.py +20 -0
  227. sklearnex/spmd/basic_statistics/basic_statistics.py +21 -0
  228. sklearnex/spmd/basic_statistics/incremental_basic_statistics.py +30 -0
  229. sklearnex/spmd/basic_statistics/tests/test_basic_statistics_spmd.py +107 -0
  230. sklearnex/spmd/basic_statistics/tests/test_incremental_basic_statistics_spmd.py +306 -0
  231. sklearnex/spmd/cluster/__init__.py +30 -0
  232. sklearnex/spmd/cluster/dbscan.py +50 -0
  233. sklearnex/spmd/cluster/kmeans.py +21 -0
  234. sklearnex/spmd/cluster/tests/test_dbscan_spmd.py +97 -0
  235. sklearnex/spmd/cluster/tests/test_kmeans_spmd.py +173 -0
  236. sklearnex/spmd/covariance/__init__.py +20 -0
  237. sklearnex/spmd/covariance/covariance.py +21 -0
  238. sklearnex/spmd/covariance/incremental_covariance.py +37 -0
  239. sklearnex/spmd/covariance/tests/test_covariance_spmd.py +107 -0
  240. sklearnex/spmd/covariance/tests/test_incremental_covariance_spmd.py +184 -0
  241. sklearnex/spmd/decomposition/__init__.py +20 -0
  242. sklearnex/spmd/decomposition/incremental_pca.py +30 -0
  243. sklearnex/spmd/decomposition/pca.py +21 -0
  244. sklearnex/spmd/decomposition/tests/test_incremental_pca_spmd.py +269 -0
  245. sklearnex/spmd/decomposition/tests/test_pca_spmd.py +128 -0
  246. sklearnex/spmd/ensemble/__init__.py +19 -0
  247. sklearnex/spmd/ensemble/forest.py +71 -0
  248. sklearnex/spmd/ensemble/tests/test_forest_spmd.py +265 -0
  249. sklearnex/spmd/linear_model/__init__.py +21 -0
  250. sklearnex/spmd/linear_model/incremental_linear_model.py +35 -0
  251. sklearnex/spmd/linear_model/linear_model.py +21 -0
  252. sklearnex/spmd/linear_model/logistic_regression.py +21 -0
  253. sklearnex/spmd/linear_model/tests/test_incremental_linear_spmd.py +331 -0
  254. sklearnex/spmd/linear_model/tests/test_linear_regression_spmd.py +145 -0
  255. sklearnex/spmd/linear_model/tests/test_logistic_regression_spmd.py +162 -0
  256. sklearnex/spmd/neighbors/__init__.py +19 -0
  257. sklearnex/spmd/neighbors/neighbors.py +25 -0
  258. sklearnex/spmd/neighbors/tests/test_neighbors_spmd.py +288 -0
  259. sklearnex/svm/__init__.py +29 -0
  260. sklearnex/svm/_common.py +339 -0
  261. sklearnex/svm/nusvc.py +371 -0
  262. sklearnex/svm/nusvr.py +170 -0
  263. sklearnex/svm/svc.py +399 -0
  264. sklearnex/svm/svr.py +167 -0
  265. sklearnex/svm/tests/test_svm.py +93 -0
  266. sklearnex/tests/test_common.py +491 -0
  267. sklearnex/tests/test_config.py +123 -0
  268. sklearnex/tests/test_hyperparameters.py +43 -0
  269. sklearnex/tests/test_memory_usage.py +347 -0
  270. sklearnex/tests/test_monkeypatch.py +269 -0
  271. sklearnex/tests/test_n_jobs_support.py +108 -0
  272. sklearnex/tests/test_parallel.py +48 -0
  273. sklearnex/tests/test_patching.py +377 -0
  274. sklearnex/tests/test_run_to_run_stability.py +326 -0
  275. sklearnex/tests/utils/__init__.py +48 -0
  276. sklearnex/tests/utils/base.py +436 -0
  277. sklearnex/tests/utils/spmd.py +198 -0
  278. sklearnex/utils/__init__.py +19 -0
  279. sklearnex/utils/_array_api.py +82 -0
  280. sklearnex/utils/parallel.py +59 -0
  281. sklearnex/utils/tests/test_validation.py +238 -0
  282. sklearnex/utils/validation.py +208 -0
@@ -0,0 +1,214 @@
1
+ # ==============================================================================
2
+ # Copyright 2024 Intel Corporation
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ # ==============================================================================
16
+
17
+ import numpy as np
18
+
19
+ from daal4py.sklearn._utils import get_dtype
20
+
21
+ from ..datatypes import from_table, to_table
22
+ from ..utils import _check_array
23
+ from .pca import BasePCA
24
+
25
+
26
class IncrementalPCA(BasePCA):
    """
    Incremental estimator for PCA based on oneDAL implementation.
    Allows computing PCA when the data are split into batches.

    Parameters
    ----------
    n_components : int, default=None
        Number of components to keep. If ``n_components`` is ``None``,
        then ``n_components`` is set to ``min(n_samples, n_features)``.

    is_deterministic : bool, default=True
        When True the ``components_`` vectors are chosen in deterministic
        way, otherwise some of them can be oppositely directed.

    method : string, default='cov'
        Method used on oneDAL side to compute result.

    whiten : bool, default=False
        When True the ``components_`` vectors are divided
        by ``n_samples`` times ``components_`` to ensure uncorrelated outputs
        with unit component-wise variances.

        Whitening will remove some information from the transformed signal
        (the relative variance scales of the components) but can sometimes
        improve the predictive accuracy of the downstream estimators by
        making data respect some hard-wired assumptions.

    Attributes
    ----------
    components_ : ndarray of shape (n_components, n_features)
        Principal axes in feature space, representing the directions of
        maximum variance in the data. Equivalently, the right singular
        vectors of the centered input data, parallel to its eigenvectors.
        The components are sorted by decreasing ``explained_variance_``.

    explained_variance_ : ndarray of shape (n_components,)
        Variance explained by each of the selected components.

    explained_variance_ratio_ : ndarray of shape (n_components,)
        Percentage of variance explained by each of the selected components.
        If all components are stored, the sum of explained variances is equal
        to 1.0.

    singular_values_ : ndarray of shape (n_components,)
        The singular values corresponding to each of the selected components.
        The singular values are equal to the 2-norms of the ``n_components``
        variables in the lower-dimensional space.

    mean_ : ndarray of shape (n_features,)
        Per-feature empirical mean, aggregate over calls to ``partial_fit``.

    var_ : ndarray of shape (n_features,)
        Per-feature empirical variance, aggregate over calls to
        ``partial_fit``.

    noise_variance_ : float
        Equal to the average of (min(n_features, n_samples) - n_components)
        smallest eigenvalues of the covariance matrix of X.

    """

    def __init__(
        self,
        n_components=None,
        is_deterministic=True,
        method="cov",
        whiten=False,
    ):
        self.n_components = n_components
        self.method = method
        self.is_deterministic = is_deterministic
        self.whiten = whiten
        self._reset()

    def _reset(self):
        """Discard accumulated state and start from an empty partial result."""
        self._need_to_finalize = False
        module = self._get_backend("decomposition", "dim_reduction")
        if hasattr(self, "components_"):
            del self.components_
        # ``partial_fit`` only computes ``_dtype``/``_params`` when ``_dtype``
        # is absent, so both must be dropped together here; otherwise a refit
        # after reset would reuse parameters derived from the previous data.
        self.__dict__.pop("_dtype", None)
        self.__dict__.pop("_params", None)
        self._partial_result = module.partial_train_result()

    def __getstate__(self):
        # Since finalize_fit can't be dispatched without directly provided queue
        # and the dispatching policy can't be serialized, the computation is finalized
        # here and the policy is not saved in serialized data.

        self.finalize_fit()
        data = self.__dict__.copy()
        data.pop("_queue", None)

        return data

    def partial_fit(self, X, queue=None):
        """Incremental fit with X. All of X is processed as a single batch.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data, where `n_samples` is the number of samples and
            `n_features` is the number of features.

        queue : dpctl.SyclQueue, default=None
            Queue used to build the execution policy and to convert ``X``
            into a oneDAL table. It is stored on the instance so that
            ``finalize_fit`` can reuse it when called without a queue.

        Returns
        -------
        self : object
            Returns the instance itself.
        """
        X = _check_array(X)
        n_samples, n_features = X.shape

        # ``components_`` is absent only before the first batch (or right
        # after ``_reset``); its presence marks every later batch.
        first_pass = not hasattr(self, "components_")
        if first_pass:
            self.components_ = None
            self.n_samples_seen_ = n_samples
            self.n_features_in_ = n_features
        else:
            self.n_samples_seen_ += n_samples

        if self.n_components is None:
            if self.components_ is None:
                self.n_components_ = min(n_samples, n_features)
            else:
                self.n_components_ = self.components_.shape[0]
        else:
            self.n_components_ = self.n_components

        self._queue = queue

        policy = self._get_policy(queue, X)
        X_table = to_table(X, queue=queue)

        # Backend parameters are derived once, from the first batch seen.
        if not hasattr(self, "_dtype"):
            self._dtype = X_table.dtype
            self._params = self._get_onedal_params(X_table)

        self._partial_result = self._get_backend(
            "decomposition",
            "dim_reduction",
            "partial_train",
            policy,
            self._params,
            self._partial_result,
            X_table,
        )
        self._need_to_finalize = True
        return self

    def finalize_fit(self, queue=None):
        """
        Finalizes principal components computation and obtains resulting
        attributes from the current `_partial_result`.

        Does nothing when no batch has been added since the last finalization.

        Parameters
        ----------
        queue : dpctl.SyclQueue, default=None
            Queue used to build the execution policy. When ``None``, the
            queue stored by the most recent ``partial_fit`` call is used.

        Returns
        -------
        self : object
            Returns the instance itself.
        """
        if self._need_to_finalize:
            module = self._get_backend("decomposition", "dim_reduction")
            if queue is not None:
                policy = self._get_policy(queue)
            else:
                policy = self._get_policy(self._queue)
            result = module.finalize_train(policy, self._params, self._partial_result)
            self.mean_ = from_table(result.means).ravel()
            self.var_ = from_table(result.variances).ravel()
            self.components_ = from_table(result.eigenvectors)
            self.singular_values_ = np.nan_to_num(
                from_table(result.singular_values).ravel()
            )
            self.explained_variance_ = np.maximum(
                from_table(result.eigenvalues).ravel(), 0
            )
            self.explained_variance_ratio_ = from_table(
                result.explained_variances_ratio
            ).ravel()
            self.noise_variance_ = self._compute_noise_variance(
                self.n_components_, min(self.n_samples_seen_, self.n_features_in_)
            )
            self._need_to_finalize = False
        return self
@@ -0,0 +1,186 @@
1
+ # ==============================================================================
2
+ # Copyright 2023 Intel Corporation
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ # ==============================================================================
16
+
17
+ import numbers
18
+ from abc import ABCMeta
19
+
20
+ import numpy as np
21
+ from sklearn.decomposition._pca import _infer_dimension
22
+ from sklearn.utils.extmath import stable_cumsum
23
+
24
+ from ..common._base import BaseEstimator
25
+ from ..datatypes import from_table, to_table
26
+
27
+
28
class BasePCA(BaseEstimator, metaclass=ABCMeta):
    """
    Base class for PCA oneDAL implementation.

    Holds the shared hyper-parameters, the helpers that resolve and validate
    ``n_components``, the noise-variance computation, and ``predict``.
    """

    def __init__(
        self,
        n_components=None,
        is_deterministic=True,
        method="cov",
        whiten=False,
    ):
        self.n_components = n_components
        self.method = method
        self.is_deterministic = is_deterministic
        self.whiten = whiten

    def _get_onedal_params(self, data, stage=None):
        """Build the parameter dict passed to the oneDAL backend.

        Parameters
        ----------
        data : object exposing ``shape`` and ``dtype``
            Input the parameters are derived from.
        stage : {None, "predict"}, default=None
            ``None`` resolves the number of components for training from
            ``data.shape``; ``"predict"`` reuses the fitted ``n_components_``.

        Raises
        ------
        ValueError
            If ``stage`` is neither ``None`` nor ``"predict"``.
        """
        if stage is None:
            n_components = self._resolve_n_components_for_training(data.shape)
        elif stage == "predict":
            n_components = self.n_components_
        else:
            # Without this branch ``n_components`` would be unbound below.
            raise ValueError(
                "stage=%r is not supported, expected None or 'predict'" % (stage,)
            )
        return {
            "fptype": data.dtype,
            "method": self.method,
            "n_components": n_components,
            "is_deterministic": self.is_deterministic,
            "whiten": self.whiten,
        }

    def _validate_n_components(self, n_components, n_samples, n_features):
        """Raise ``ValueError`` if ``n_components`` is invalid for the data shape.

        ``None`` is treated as ``min(n_samples, n_features)``; ``"mle"``
        requires ``n_samples >= n_features``; any other value must lie in
        ``[0, min(n_samples, n_features)]`` and be integral when ``>= 1``.
        """
        if n_components is None:
            n_components = min(n_samples, n_features)
        if n_components == "mle":
            if n_samples < n_features:
                raise ValueError(
                    "n_components='mle' is only supported if n_samples >= n_features"
                )
        elif not 0 <= n_components <= min(n_samples, n_features):
            raise ValueError(
                "n_components=%r must be between 0 and "
                "min(n_samples, n_features)=%r with "
                "svd_solver='full'" % (n_components, min(n_samples, n_features))
            )
        elif n_components >= 1:
            if not isinstance(n_components, numbers.Integral):
                raise ValueError(
                    "n_components=%r must be of type int "
                    "when greater than or equal to 1, "
                    "was of type=%r" % (n_components, type(n_components))
                )

    def _resolve_n_components_for_training(self, shape_tuple):
        """Return the number of components to request from the backend.

        ``None``, ``"mle"`` and floats in ``(0, 1]`` all request
        ``min(shape_tuple)`` components; any other value is returned as is.
        """
        if self.n_components is None or self.n_components == "mle":
            return min(shape_tuple)
        elif (
            isinstance(self.n_components, float)
            and self.n_components > 0.0
            and self.n_components <= 1.0
        ):
            return min(shape_tuple)
        else:
            return self.n_components

    def _resolve_n_components_for_result(self, shape_tuple):
        """Return the number of components to keep in the fitted result.

        Reads ``explained_variance_`` (for ``"mle"``) or
        ``explained_variance_ratio_`` (for a float strictly between 0 and 1),
        so those attributes must already be set in these cases.
        """
        if self.n_components is None:
            return min(shape_tuple)
        elif self.n_components == "mle":
            return _infer_dimension(self.explained_variance_, shape_tuple[0])
        elif 0.0 < self.n_components < 1.0:
            ratio_cumsum = stable_cumsum(self.explained_variance_ratio_)
            return np.searchsorted(ratio_cumsum, self.n_components, side="right") + 1
        elif isinstance(self.n_components, float) and self.n_components == 1.0:
            return min(shape_tuple)
        else:
            return self.n_components

    def _compute_noise_variance(self, n_components, n_sf_min):
        """Compute the noise variance from the fitted variances.

        Parameters
        ----------
        n_components : int
            Number of components kept.
        n_sf_min : int
            Upper bound on the number of components; callers in this file
            pass ``min(n_samples, n_features)``.

        Returns
        -------
        float
            ``0.0`` when ``n_components >= n_sf_min``; the mean of the
            trailing entries of ``explained_variance_`` when it holds exactly
            ``n_sf_min`` values; otherwise the residual variance (total
            per-feature variance minus explained variance) averaged over the
            ``n_sf_min - n_components`` dropped components.

        Raises
        ------
        AttributeError
            If the residual branch is taken and neither ``variances_`` nor
            ``var_`` has been set.
        """
        if n_components >= n_sf_min:
            return 0.0

        n_eigenvalues = len(self.explained_variance_)
        if n_eigenvalues == n_sf_min:
            return self.explained_variance_[n_components:].mean()
        if n_eigenvalues < n_sf_min:
            # TODO Rename variances_ to var_ to align with sklearn/sklearnex IncrementalPCA
            if hasattr(self, "variances_"):
                resid_var = self.variances_.sum()
            elif hasattr(self, "var_"):
                resid_var = self.var_.sum()
            else:
                # Previously surfaced as an UnboundLocalError on ``resid_var``.
                raise AttributeError(
                    "Neither 'variances_' nor 'var_' is set; "
                    "noise variance cannot be computed before fitting"
                )

            resid_var -= self.explained_variance_.sum()
            return resid_var / (n_sf_min - n_components)
        # NOTE(review): more eigenvalues than ``n_sf_min`` — the original code
        # fell through and returned None implicitly here; kept as is. Confirm
        # whether this case can occur and what value it should produce.
        return None

    def _create_model(self):
        """Build a backend model from the fitted attributes and cache it.

        Eigenvalues are attached only when ``whiten`` is enabled.
        """
        m = self._get_backend("decomposition", "dim_reduction", "model")
        m.eigenvectors = to_table(self.components_)
        m.means = to_table(self.mean_)
        if self.whiten:
            m.eigenvalues = to_table(self.explained_variance_)
        self._onedal_model = m
        return m

    def predict(self, X, queue=None):
        """Run backend inference on ``X`` with the fitted model.

        Parameters
        ----------
        X : array-like
            Data to transform.
        queue : dpctl.SyclQueue, default=None
            Queue used to build the policy and to convert ``X`` to a table.

        Returns
        -------
        The ``transformed_data`` of the backend result, converted back with
        ``from_table``.
        """
        policy = self._get_policy(queue, X)
        model = self._create_model()
        X_table = to_table(X, queue=queue)
        params = self._get_onedal_params(X_table, stage="predict")

        result = self._get_backend(
            "decomposition", "dim_reduction", "infer", policy, params, model, X_table
        )
        return from_table(result.transformed_data)
140
+
141
+
142
class PCA(BasePCA):
    """Batch PCA estimator backed by the oneDAL ``dim_reduction`` backend."""

    def fit(self, X, y=None, queue=None):
        """Fit the model on ``X`` in a single pass.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data.
        y : Ignored
            Not used by this method.
        queue : dpctl.SyclQueue, default=None
            Queue used to build the policy and to convert ``X`` to a table.

        Returns
        -------
        self : object
            Returns the instance itself.

        Raises
        ------
        ValueError
            If ``n_components`` is not valid for the shape of ``X``
            (see ``_validate_n_components``).
        """
        n_samples, n_features = X.shape
        n_sf_min = min(n_samples, n_features)
        self._validate_n_components(self.n_components, n_samples, n_features)

        policy = self._get_policy(queue, X)
        # TODO: investigate why np.ndarray with OWNDATA=FALSE flag
        # fails to be converted to oneDAL table
        if isinstance(X, np.ndarray) and not X.flags["OWNDATA"]:
            X = X.copy()

        X = to_table(X, queue=queue)
        params = self._get_onedal_params(X)
        result = self._get_backend(
            "decomposition", "dim_reduction", "train", policy, params, X
        )

        self.mean_ = from_table(result.means).ravel()
        self.variances_ = from_table(result.variances)
        self.components_ = from_table(result.eigenvectors)
        self.singular_values_ = from_table(result.singular_values).ravel()
        # Clamp eigenvalues at zero so no explained variance is negative.
        self.explained_variance_ = np.maximum(from_table(result.eigenvalues).ravel(), 0)
        self.explained_variance_ratio_ = from_table(
            result.explained_variances_ratio
        ).ravel()
        self.n_samples_ = n_samples
        self.n_features_ = n_features

        n_components = self._resolve_n_components_for_result(X.shape)
        self.n_components_ = n_components
        # Must run before truncation: it inspects the full ``explained_variance_``.
        self.noise_variance_ = self._compute_noise_variance(n_components, n_sf_min)

        # The backend was asked for ``params["n_components"]`` components; keep
        # only the leading ones when the resolved count is smaller.
        if n_components < params["n_components"]:
            self.explained_variance_ = self.explained_variance_[:n_components]
            self.components_ = self.components_[:n_components]
            self.singular_values_ = self.singular_values_[:n_components]
            self.explained_variance_ratio_ = self.explained_variance_ratio_[:n_components]

        return self
@@ -0,0 +1,285 @@
1
+ # ==============================================================================
2
+ # Copyright 2024 Intel Corporation
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ # ==============================================================================
16
+
17
+ import numpy as np
18
+ import pytest
19
+ from numpy.testing import assert_allclose
20
+
21
+ from daal4py.sklearn._utils import daal_check_version
22
+ from onedal.datatypes import from_table
23
+ from onedal.decomposition import IncrementalPCA
24
+ from onedal.tests.utils._device_selection import get_queues
25
+
26
+
27
@pytest.mark.parametrize("queue", get_queues())
@pytest.mark.parametrize("is_deterministic", [True, False])
@pytest.mark.parametrize("whiten", [True, False])
@pytest.mark.parametrize("num_blocks", [1, 2, 3])
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_on_gold_data(queue, is_deterministic, whiten, num_blocks, dtype):
    """Check IncrementalPCA against hard-coded reference values.

    A fixed 6x2 data set is split into ``num_blocks`` chunks, fed through
    ``partial_fit`` and finalized. The fitted attributes and the output of
    ``predict`` are then compared with precomputed expectations.

    When results are not expected to match exactly (non-deterministic mode,
    or ``daal_check_version((2024, "P", 500))`` is false), each component is
    only required to match its reference up to sign, and the matching column
    of the transformed data is compared with the same sign applied.
    """
    X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
    X = X.astype(dtype=dtype)
    X_split = np.array_split(X, num_blocks)
    incpca = IncrementalPCA(is_deterministic=is_deterministic, whiten=whiten)

    for block in X_split:
        incpca.partial_fit(block, queue=queue)

    result = incpca.finalize_fit()

    transformed_data = incpca.predict(X, queue=queue)

    expected_n_components_ = 2
    expected_components_ = np.array([[0.83849224, 0.54491354], [-0.54491354, 0.83849224]])
    expected_singular_values_ = np.array([6.30061232, 0.54980396])
    expected_mean_ = np.array([0, 0])
    expected_var_ = np.array([5.6, 2.4])
    expected_explained_variance_ = np.array([7.93954312, 0.06045688])
    expected_explained_variance_ratio_ = np.array([0.99244289, 0.00755711])
    expected_transformed_data = (
        np.array(
            [
                [-0.49096647, -1.19399271],
                [-0.78854479, 1.02218579],
                [-1.27951125, -0.17180692],
                [0.49096647, 1.19399271],
                [0.78854479, -1.02218579],
                [1.27951125, 0.17180692],
            ]
        )
        if whiten
        else np.array(
            [
                [-1.38340578, -0.2935787],
                [-2.22189802, 0.25133484],
                [-3.6053038, -0.04224385],
                [1.38340578, 0.2935787],
                [2.22189802, -0.25133484],
                [3.6053038, 0.04224385],
            ]
        )
    )

    # Single-precision results get a looser tolerance.
    tol = 1e-7
    if transformed_data.dtype == np.float32:
        tol = 7e-6 if whiten else 1e-6

    assert result.n_components_ == expected_n_components_

    assert_allclose(result.singular_values_, expected_singular_values_, atol=tol)
    assert_allclose(result.mean_, expected_mean_, atol=tol)
    assert_allclose(result.var_, expected_var_, atol=tol)
    assert_allclose(result.explained_variance_, expected_explained_variance_, atol=tol)
    assert_allclose(
        result.explained_variance_ratio_, expected_explained_variance_ratio_, atol=tol
    )
    if is_deterministic and daal_check_version((2024, "P", 500)):
        assert_allclose(result.components_, expected_components_, atol=tol)
        assert_allclose(transformed_data, expected_transformed_data, atol=tol)
    else:
        for i in range(result.n_components_):
            dot_product = np.dot(result.components_[i], expected_components_[i])
            # Unit vectors that agree up to sign have |dot product| == 1.
            assert np.abs(np.abs(dot_product) - 1.0) < tol

            # Component i produces COLUMN i of the transformed data (one row
            # per sample), so a flipped component negates that column.
            projected = transformed_data[:, i]
            if dot_product < 0:
                projected = -projected
            assert_allclose(projected, expected_transformed_data[:, i], atol=tol)
107
+
108
+
109
@pytest.mark.parametrize("queue", get_queues())
@pytest.mark.parametrize("n_components", [None, 1, 5])
@pytest.mark.parametrize("whiten", [True, False])
@pytest.mark.parametrize("num_blocks", [1, 10])
@pytest.mark.parametrize("row_count", [100, 1000])
@pytest.mark.parametrize("column_count", [10, 100])
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_on_random_data(
    queue, n_components, whiten, num_blocks, row_count, column_count, dtype
):
    """Validate IncrementalPCA on seeded uniform data against a NumPy reference.

    The reference comes from an eigendecomposition of the sample covariance
    matrix. Singular values, components (up to sign), mean, variance,
    explained variance (and ratio) and the transformed data are compared.
    """
    rng = np.random.default_rng(78)
    X = rng.uniform(low=-0.3, high=+0.7, size=(row_count, column_count))
    X = X.astype(dtype=dtype)
    incpca = IncrementalPCA(n_components=n_components, whiten=whiten)

    for chunk in np.array_split(X, num_blocks):
        incpca.partial_fit(chunk, queue=queue)

    incpca.finalize_fit()

    transformed_data = incpca.predict(X, queue=queue)
    if transformed_data.dtype == np.float32:
        tol = 3e-3
    else:
        tol = 2e-6

    # From here on, use the component count the estimator settled on.
    n_components = incpca.n_components_
    n_samples_seen = incpca.n_samples_seen_
    assert n_samples_seen == X.shape[0]
    assert incpca.n_features_in_ == X.shape[1]

    components = incpca.components_

    # Reference decomposition of the sample covariance matrix.
    centered_data = X - np.mean(X, axis=0)
    covariance = centered_data.T @ centered_data / (n_samples_seen - 1)
    eig_vals, eig_vecs = np.linalg.eig(covariance)
    eig_vals = np.nan_to_num(eig_vals)
    eig_vals[eig_vals < 0] = 0
    descending = np.argsort(eig_vals)[::-1]
    sorted_eigenvalues = eig_vals[descending]
    sorted_eigenvectors = eig_vecs[:, descending]

    expected_singular_values = np.sqrt(sorted_eigenvalues * (n_samples_seen - 1))[
        :n_components
    ]
    expected_components = sorted_eigenvectors.T[:n_components]

    assert_allclose(incpca.singular_values_, expected_singular_values, atol=tol)
    for idx in range(n_components):
        actual = components[idx]
        # Every component has unit length ...
        assert np.abs(np.dot(actual, actual) - 1.0) < tol
        # ... and is parallel to the reference eigenvector, up to sign.
        alignment = np.abs(np.dot(actual, expected_components[idx]))
        assert np.abs(alignment - 1.0) < tol

    assert_allclose(incpca.mean_, np.mean(X, axis=0), atol=tol)
    assert_allclose(incpca.var_, np.var(X, ddof=1, axis=0), atol=tol)

    expected_explained_variance = sorted_eigenvalues[:n_components]
    assert_allclose(incpca.explained_variance_, expected_explained_variance, atol=tol)

    expected_explained_variance_ratio = expected_explained_variance / np.sum(
        sorted_eigenvalues
    )
    assert_allclose(
        incpca.explained_variance_ratio_, expected_explained_variance_ratio, atol=tol
    )

    if len(sorted_eigenvalues) > n_components:
        expected_noise_variance = np.mean(sorted_eigenvalues[n_components:])
    else:
        expected_noise_variance = 0.0
    # TODO Fix noise variance computation (It is necessary to update C++ side)
    # assert np.abs(incpca.noise_variance_ - expected_noise_variance) < tol

    expected_transformed_data = centered_data @ components.T
    if whiten:
        scale = np.sqrt(incpca.explained_variance_)
        # Dividing by inf zeroes out directions with (near-)zero variance
        # instead of amplifying them.
        scale[scale < np.finfo(scale.dtype).eps] = np.inf
        expected_transformed_data /= scale

    # The comparison is skipped only for whitened GPU runs when
    # daal_check_version((2024, "P", 500)) is false.
    skip_transform_check = (
        not daal_check_version((2024, "P", 500))
        and whiten
        and queue is not None
        and queue.sycl_device.device_type.name == "gpu"
    )
    if not skip_transform_check:
        assert_allclose(transformed_data, expected_transformed_data, atol=tol)
200
+
201
+
202
@pytest.mark.parametrize("queue", get_queues())
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_incremental_estimator_pickle(queue, dtype):
    """Check that IncrementalPCA survives pickling at every stage of fitting.

    The estimator is round-tripped through ``pickle`` before any data is
    seen, after a ``partial_fit`` call, and after ``finalize_fit``. The
    original and the restored estimator must end up with matching partial
    results and matching fitted attributes.
    """
    import pickle

    from onedal.decomposition import IncrementalPCA

    incpca = IncrementalPCA()

    # Check that estimator can be serialized without any data.
    dump = pickle.dumps(incpca)
    incpca_loaded = pickle.loads(dump)
    seed = 77
    gen = np.random.default_rng(seed)
    X = gen.uniform(low=-0.3, high=+0.7, size=(10, 10))
    X = X.astype(dtype)
    X_split = np.array_split(X, 2)
    incpca.partial_fit(X_split[0], queue=queue)
    incpca_loaded.partial_fit(X_split[0], queue=queue)
    assert incpca._need_to_finalize
    assert incpca_loaded._need_to_finalize

    # Check that estimator can be serialized after partial_fit call.
    dump = pickle.dumps(incpca)
    incpca_loaded = pickle.loads(dump)
    # Finalize is called during serialization to make sure partial results
    # are finalized correctly, so both the dumped estimator and its restored
    # copy report that nothing is pending.
    assert not incpca._need_to_finalize
    assert not incpca_loaded._need_to_finalize

    partial = incpca._partial_result
    partial_loaded = incpca_loaded._partial_result
    for table_name in ("partial_n_rows", "partial_crossproduct", "partial_sum"):
        assert_allclose(
            from_table(getattr(partial, table_name)),
            from_table(getattr(partial_loaded, table_name)),
            err_msg=table_name,
        )

    # The count is used as a range() bound, so compare it exactly.
    auxiliary_table_count = partial.auxiliary_table_count
    assert auxiliary_table_count == partial_loaded.auxiliary_table_count

    for i in range(auxiliary_table_count):
        aux_table = partial.get_auxiliary_table(i)
        aux_table_loaded = partial_loaded.get_auxiliary_table(i)
        assert_allclose(from_table(aux_table), from_table(aux_table_loaded))

    incpca.partial_fit(X_split[1], queue=queue)
    incpca_loaded.partial_fit(X_split[1], queue=queue)
    assert incpca._need_to_finalize
    assert incpca_loaded._need_to_finalize

    # Only the restored estimator is serialized here, so only it is finalized.
    dump = pickle.dumps(incpca_loaded)
    incpca_loaded = pickle.loads(dump)

    assert incpca._need_to_finalize
    assert not incpca_loaded._need_to_finalize

    incpca.finalize_fit()
    incpca_loaded.finalize_fit()

    # Check that finalized estimator can be serialized.
    dump = pickle.dumps(incpca_loaded)
    incpca_loaded = pickle.loads(dump)

    fitted_attributes = (
        "singular_values_",
        "n_samples_seen_",
        "n_features_in_",
        "mean_",
        "var_",
        "explained_variance_",
        "components_",
        "explained_variance_ratio_",
    )
    for attribute in fitted_attributes:
        assert_allclose(
            getattr(incpca, attribute),
            getattr(incpca_loaded, attribute),
            atol=1e-6,
            err_msg=attribute,
        )