PyPI - scikit-learn-intelex - Versions diffs - 2025.4.0__py313-none-manylinux_2_28_x86_64.whl - Mend

scikit-learn-intelex 2025.4.0__py313-none-manylinux_2_28_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of scikit-learn-intelex might be problematic. Click here for more details.

Files changed (282) hide show

daal4py/__init__.py +73 -0
daal4py/__main__.py +58 -0
daal4py/_daal4py.cpython-313-x86_64-linux-gnu.so +0 -0
daal4py/doc/third-party-programs.txt +424 -0
daal4py/mb/__init__.py +19 -0
daal4py/mb/model_builders.py +377 -0
daal4py/mpi_transceiver.cpython-313-x86_64-linux-gnu.so +0 -0
daal4py/sklearn/__init__.py +40 -0
daal4py/sklearn/_n_jobs_support.py +248 -0
daal4py/sklearn/_utils.py +245 -0
daal4py/sklearn/cluster/__init__.py +20 -0
daal4py/sklearn/cluster/dbscan.py +165 -0
daal4py/sklearn/cluster/k_means.py +597 -0
daal4py/sklearn/cluster/tests/test_dbscan.py +109 -0
daal4py/sklearn/decomposition/__init__.py +19 -0
daal4py/sklearn/decomposition/_pca.py +524 -0
daal4py/sklearn/ensemble/AdaBoostClassifier.py +196 -0
daal4py/sklearn/ensemble/GBTDAAL.py +337 -0
daal4py/sklearn/ensemble/__init__.py +27 -0
daal4py/sklearn/ensemble/_forest.py +1397 -0
daal4py/sklearn/ensemble/tests/test_decision_forest.py +206 -0
daal4py/sklearn/linear_model/__init__.py +29 -0
daal4py/sklearn/linear_model/_coordinate_descent.py +848 -0
daal4py/sklearn/linear_model/_linear.py +272 -0
daal4py/sklearn/linear_model/_ridge.py +325 -0
daal4py/sklearn/linear_model/coordinate_descent.py +17 -0
daal4py/sklearn/linear_model/linear.py +17 -0
daal4py/sklearn/linear_model/logistic_loss.py +195 -0
daal4py/sklearn/linear_model/logistic_path.py +1026 -0
daal4py/sklearn/linear_model/ridge.py +17 -0
daal4py/sklearn/linear_model/tests/test_linear.py +208 -0
daal4py/sklearn/linear_model/tests/test_ridge.py +69 -0
daal4py/sklearn/manifold/__init__.py +19 -0
daal4py/sklearn/manifold/_t_sne.py +405 -0
daal4py/sklearn/metrics/__init__.py +20 -0
daal4py/sklearn/metrics/_pairwise.py +236 -0
daal4py/sklearn/metrics/_ranking.py +210 -0
daal4py/sklearn/model_selection/__init__.py +19 -0
daal4py/sklearn/model_selection/_split.py +309 -0
daal4py/sklearn/model_selection/tests/test_split.py +56 -0
daal4py/sklearn/monkeypatch/__init__.py +0 -0
daal4py/sklearn/monkeypatch/dispatcher.py +232 -0
daal4py/sklearn/monkeypatch/tests/_models_info.py +161 -0
daal4py/sklearn/monkeypatch/tests/test_monkeypatch.py +71 -0
daal4py/sklearn/monkeypatch/tests/test_patching.py +90 -0
daal4py/sklearn/monkeypatch/tests/utils/_launch_algorithms.py +117 -0
daal4py/sklearn/neighbors/__init__.py +21 -0
daal4py/sklearn/neighbors/_base.py +503 -0
daal4py/sklearn/neighbors/_classification.py +139 -0
daal4py/sklearn/neighbors/_regression.py +74 -0
daal4py/sklearn/neighbors/_unsupervised.py +55 -0
daal4py/sklearn/neighbors/tests/test_kneighbors.py +113 -0
daal4py/sklearn/svm/__init__.py +19 -0
daal4py/sklearn/svm/svm.py +734 -0
daal4py/sklearn/utils/__init__.py +21 -0
daal4py/sklearn/utils/base.py +75 -0
daal4py/sklearn/utils/tests/test_utils.py +51 -0
daal4py/sklearn/utils/validation.py +696 -0
onedal/__init__.py +83 -0
onedal/_config.py +54 -0
onedal/_device_offload.py +204 -0
onedal/_onedal_py_dpc.cpython-313-x86_64-linux-gnu.so +0 -0
onedal/_onedal_py_host.cpython-313-x86_64-linux-gnu.so +0 -0
onedal/_onedal_py_spmd_dpc.cpython-313-x86_64-linux-gnu.so +0 -0
onedal/basic_statistics/__init__.py +20 -0
onedal/basic_statistics/basic_statistics.py +107 -0
onedal/basic_statistics/incremental_basic_statistics.py +175 -0
onedal/basic_statistics/tests/test_basic_statistics.py +242 -0
onedal/basic_statistics/tests/test_incremental_basic_statistics.py +279 -0
onedal/basic_statistics/tests/utils.py +50 -0
onedal/cluster/__init__.py +27 -0
onedal/cluster/dbscan.py +105 -0
onedal/cluster/kmeans.py +557 -0
onedal/cluster/kmeans_init.py +112 -0
onedal/cluster/tests/test_dbscan.py +125 -0
onedal/cluster/tests/test_kmeans.py +88 -0
onedal/cluster/tests/test_kmeans_init.py +93 -0
onedal/common/_base.py +38 -0
onedal/common/_estimator_checks.py +47 -0
onedal/common/_mixin.py +62 -0
onedal/common/_policy.py +55 -0
onedal/common/_spmd_policy.py +30 -0
onedal/common/hyperparameters.py +125 -0
onedal/common/tests/test_policy.py +76 -0
onedal/common/tests/test_sycl.py +128 -0
onedal/covariance/__init__.py +20 -0
onedal/covariance/covariance.py +122 -0
onedal/covariance/incremental_covariance.py +161 -0
onedal/covariance/tests/test_covariance.py +50 -0
onedal/covariance/tests/test_incremental_covariance.py +190 -0
onedal/datatypes/__init__.py +19 -0
onedal/datatypes/_data_conversion.py +121 -0
onedal/datatypes/tests/common.py +126 -0
onedal/datatypes/tests/test_data.py +475 -0
onedal/decomposition/__init__.py +20 -0
onedal/decomposition/incremental_pca.py +214 -0
onedal/decomposition/pca.py +186 -0
onedal/decomposition/tests/test_incremental_pca.py +285 -0
onedal/ensemble/__init__.py +29 -0
onedal/ensemble/forest.py +736 -0
onedal/ensemble/tests/test_random_forest.py +97 -0
onedal/linear_model/__init__.py +27 -0
onedal/linear_model/incremental_linear_model.py +292 -0
onedal/linear_model/linear_model.py +325 -0
onedal/linear_model/logistic_regression.py +247 -0
onedal/linear_model/tests/test_incremental_linear_regression.py +213 -0
onedal/linear_model/tests/test_incremental_ridge_regression.py +171 -0
onedal/linear_model/tests/test_linear_regression.py +259 -0
onedal/linear_model/tests/test_logistic_regression.py +95 -0
onedal/linear_model/tests/test_ridge.py +95 -0
onedal/neighbors/__init__.py +19 -0
onedal/neighbors/neighbors.py +763 -0
onedal/neighbors/tests/test_knn_classification.py +49 -0
onedal/primitives/__init__.py +27 -0
onedal/primitives/get_tree.py +25 -0
onedal/primitives/kernel_functions.py +152 -0
onedal/primitives/tests/test_kernel_functions.py +159 -0
onedal/spmd/__init__.py +25 -0
onedal/spmd/_base.py +30 -0
onedal/spmd/basic_statistics/__init__.py +20 -0
onedal/spmd/basic_statistics/basic_statistics.py +30 -0
onedal/spmd/basic_statistics/incremental_basic_statistics.py +71 -0
onedal/spmd/cluster/__init__.py +28 -0
onedal/spmd/cluster/dbscan.py +23 -0
onedal/spmd/cluster/kmeans.py +56 -0
onedal/spmd/covariance/__init__.py +20 -0
onedal/spmd/covariance/covariance.py +26 -0
onedal/spmd/covariance/incremental_covariance.py +83 -0
onedal/spmd/decomposition/__init__.py +20 -0
onedal/spmd/decomposition/incremental_pca.py +124 -0
onedal/spmd/decomposition/pca.py +26 -0
onedal/spmd/ensemble/__init__.py +19 -0
onedal/spmd/ensemble/forest.py +28 -0
onedal/spmd/linear_model/__init__.py +21 -0
onedal/spmd/linear_model/incremental_linear_model.py +101 -0
onedal/spmd/linear_model/linear_model.py +30 -0
onedal/spmd/linear_model/logistic_regression.py +38 -0
onedal/spmd/neighbors/__init__.py +19 -0
onedal/spmd/neighbors/neighbors.py +75 -0
onedal/svm/__init__.py +19 -0
onedal/svm/svm.py +556 -0
onedal/svm/tests/test_csr_svm.py +351 -0
onedal/svm/tests/test_nusvc.py +204 -0
onedal/svm/tests/test_nusvr.py +210 -0
onedal/svm/tests/test_svc.py +176 -0
onedal/svm/tests/test_svr.py +243 -0
onedal/tests/test_common.py +57 -0
onedal/tests/utils/_dataframes_support.py +162 -0
onedal/tests/utils/_device_selection.py +102 -0
onedal/utils/__init__.py +49 -0
onedal/utils/_array_api.py +81 -0
onedal/utils/_dpep_helpers.py +56 -0
onedal/utils/tests/test_validation.py +142 -0
onedal/utils/validation.py +464 -0
scikit_learn_intelex-2025.4.0.dist-info/LICENSE.txt +202 -0
scikit_learn_intelex-2025.4.0.dist-info/METADATA +190 -0
scikit_learn_intelex-2025.4.0.dist-info/RECORD +282 -0
scikit_learn_intelex-2025.4.0.dist-info/WHEEL +5 -0
scikit_learn_intelex-2025.4.0.dist-info/top_level.txt +3 -0
sklearnex/__init__.py +66 -0
sklearnex/__main__.py +58 -0
sklearnex/_config.py +116 -0
sklearnex/_device_offload.py +126 -0
sklearnex/_utils.py +177 -0
sklearnex/basic_statistics/__init__.py +20 -0
sklearnex/basic_statistics/basic_statistics.py +261 -0
sklearnex/basic_statistics/incremental_basic_statistics.py +352 -0
sklearnex/basic_statistics/tests/test_basic_statistics.py +405 -0
sklearnex/basic_statistics/tests/test_incremental_basic_statistics.py +455 -0
sklearnex/cluster/__init__.py +20 -0
sklearnex/cluster/dbscan.py +197 -0
sklearnex/cluster/k_means.py +397 -0
sklearnex/cluster/tests/test_dbscan.py +38 -0
sklearnex/cluster/tests/test_kmeans.py +157 -0
sklearnex/conftest.py +82 -0
sklearnex/covariance/__init__.py +19 -0
sklearnex/covariance/incremental_covariance.py +405 -0
sklearnex/covariance/tests/test_incremental_covariance.py +287 -0
sklearnex/decomposition/__init__.py +19 -0
sklearnex/decomposition/pca.py +427 -0
sklearnex/decomposition/tests/test_pca.py +58 -0
sklearnex/dispatcher.py +534 -0
sklearnex/doc/third-party-programs.txt +424 -0
sklearnex/ensemble/__init__.py +29 -0
sklearnex/ensemble/_forest.py +2029 -0
sklearnex/ensemble/tests/test_forest.py +140 -0
sklearnex/glob/__main__.py +72 -0
sklearnex/glob/dispatcher.py +101 -0
sklearnex/linear_model/__init__.py +32 -0
sklearnex/linear_model/coordinate_descent.py +30 -0
sklearnex/linear_model/incremental_linear.py +495 -0
sklearnex/linear_model/incremental_ridge.py +432 -0
sklearnex/linear_model/linear.py +346 -0
sklearnex/linear_model/logistic_regression.py +415 -0
sklearnex/linear_model/ridge.py +390 -0
sklearnex/linear_model/tests/test_incremental_linear.py +267 -0
sklearnex/linear_model/tests/test_incremental_ridge.py +214 -0
sklearnex/linear_model/tests/test_linear.py +142 -0
sklearnex/linear_model/tests/test_logreg.py +134 -0
sklearnex/linear_model/tests/test_ridge.py +256 -0
sklearnex/manifold/__init__.py +19 -0
sklearnex/manifold/t_sne.py +26 -0
sklearnex/manifold/tests/test_tsne.py +250 -0
sklearnex/metrics/__init__.py +23 -0
sklearnex/metrics/pairwise.py +22 -0
sklearnex/metrics/ranking.py +20 -0
sklearnex/metrics/tests/test_metrics.py +39 -0
sklearnex/model_selection/__init__.py +21 -0
sklearnex/model_selection/split.py +22 -0
sklearnex/model_selection/tests/test_model_selection.py +34 -0
sklearnex/neighbors/__init__.py +27 -0
sklearnex/neighbors/_lof.py +236 -0
sklearnex/neighbors/common.py +310 -0
sklearnex/neighbors/knn_classification.py +231 -0
sklearnex/neighbors/knn_regression.py +207 -0
sklearnex/neighbors/knn_unsupervised.py +178 -0
sklearnex/neighbors/tests/test_neighbors.py +82 -0
sklearnex/preview/__init__.py +17 -0
sklearnex/preview/covariance/__init__.py +19 -0
sklearnex/preview/covariance/covariance.py +142 -0
sklearnex/preview/covariance/tests/test_covariance.py +66 -0
sklearnex/preview/decomposition/__init__.py +19 -0
sklearnex/preview/decomposition/incremental_pca.py +244 -0
sklearnex/preview/decomposition/tests/test_incremental_pca.py +336 -0
sklearnex/spmd/__init__.py +25 -0
sklearnex/spmd/basic_statistics/__init__.py +20 -0
sklearnex/spmd/basic_statistics/basic_statistics.py +21 -0
sklearnex/spmd/basic_statistics/incremental_basic_statistics.py +30 -0
sklearnex/spmd/basic_statistics/tests/test_basic_statistics_spmd.py +107 -0
sklearnex/spmd/basic_statistics/tests/test_incremental_basic_statistics_spmd.py +306 -0
sklearnex/spmd/cluster/__init__.py +30 -0
sklearnex/spmd/cluster/dbscan.py +50 -0
sklearnex/spmd/cluster/kmeans.py +21 -0
sklearnex/spmd/cluster/tests/test_dbscan_spmd.py +97 -0
sklearnex/spmd/cluster/tests/test_kmeans_spmd.py +173 -0
sklearnex/spmd/covariance/__init__.py +20 -0
sklearnex/spmd/covariance/covariance.py +21 -0
sklearnex/spmd/covariance/incremental_covariance.py +37 -0
sklearnex/spmd/covariance/tests/test_covariance_spmd.py +107 -0
sklearnex/spmd/covariance/tests/test_incremental_covariance_spmd.py +184 -0
sklearnex/spmd/decomposition/__init__.py +20 -0
sklearnex/spmd/decomposition/incremental_pca.py +30 -0
sklearnex/spmd/decomposition/pca.py +21 -0
sklearnex/spmd/decomposition/tests/test_incremental_pca_spmd.py +269 -0
sklearnex/spmd/decomposition/tests/test_pca_spmd.py +128 -0
sklearnex/spmd/ensemble/__init__.py +19 -0
sklearnex/spmd/ensemble/forest.py +71 -0
sklearnex/spmd/ensemble/tests/test_forest_spmd.py +265 -0
sklearnex/spmd/linear_model/__init__.py +21 -0
sklearnex/spmd/linear_model/incremental_linear_model.py +35 -0
sklearnex/spmd/linear_model/linear_model.py +21 -0
sklearnex/spmd/linear_model/logistic_regression.py +21 -0
sklearnex/spmd/linear_model/tests/test_incremental_linear_spmd.py +331 -0
sklearnex/spmd/linear_model/tests/test_linear_regression_spmd.py +145 -0
sklearnex/spmd/linear_model/tests/test_logistic_regression_spmd.py +162 -0
sklearnex/spmd/neighbors/__init__.py +19 -0
sklearnex/spmd/neighbors/neighbors.py +25 -0
sklearnex/spmd/neighbors/tests/test_neighbors_spmd.py +288 -0
sklearnex/svm/__init__.py +29 -0
sklearnex/svm/_common.py +339 -0
sklearnex/svm/nusvc.py +371 -0
sklearnex/svm/nusvr.py +170 -0
sklearnex/svm/svc.py +399 -0
sklearnex/svm/svr.py +167 -0
sklearnex/svm/tests/test_svm.py +93 -0
sklearnex/tests/test_common.py +491 -0
sklearnex/tests/test_config.py +123 -0
sklearnex/tests/test_hyperparameters.py +43 -0
sklearnex/tests/test_memory_usage.py +347 -0
sklearnex/tests/test_monkeypatch.py +269 -0
sklearnex/tests/test_n_jobs_support.py +108 -0
sklearnex/tests/test_parallel.py +48 -0
sklearnex/tests/test_patching.py +377 -0
sklearnex/tests/test_run_to_run_stability.py +326 -0
sklearnex/tests/utils/__init__.py +48 -0
sklearnex/tests/utils/base.py +436 -0
sklearnex/tests/utils/spmd.py +198 -0
sklearnex/utils/__init__.py +19 -0
sklearnex/utils/_array_api.py +82 -0
sklearnex/utils/parallel.py +59 -0
sklearnex/utils/tests/test_validation.py +238 -0
sklearnex/utils/validation.py +208 -0

onedal/decomposition/incremental_pca.py ADDED Viewed

@@ -0,0 +1,214 @@
+# ==============================================================================
+# Copyright 2024 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+import numpy as np
+from daal4py.sklearn._utils import get_dtype
+from ..datatypes import from_table, to_table
+from ..utils import _check_array
+from .pca import BasePCA
+class IncrementalPCA(BasePCA):
+    """
+    Incremental estimator for PCA based on oneDAL implementation.
+    Allows computing PCA when the data are split into batches.
+    Parameters
+    ----------
+    n_components : int, default=None
+        Number of components to keep. If ``n_components`` is ``None``,
+        then ``n_components`` is set to ``min(n_samples, n_features)``.
+    is_deterministic : bool, default=True
+        When True the ``components_`` vectors are chosen in deterministic
+        way, otherwise some of them can be oppositely directed.
+    method : string, default='cov'
+        Method used on oneDAL side to compute result.
+    whiten : bool, default=False
+        When True the ``components_`` vectors are divided
+        by ``n_samples`` times ``components_`` to ensure uncorrelated outputs
+        with unit component-wise variances.
+        Whitening will remove some information from the transformed signal
+        (the relative variance scales of the components) but can sometimes
+        improve the predictive accuracy of the downstream estimators by
+        making data respect some hard-wired assumptions.
+    Attributes
+    ----------
+        components_ : ndarray of shape (n_components, n_features)
+            Principal axes in feature space, representing the directions of
+            maximum variance in the data. Equivalently, the right singular
+            vectors of the centered input data, parallel to its eigenvectors.
+            The components are sorted by decreasing ``explained_variance_``.
+        explained_variance_ : ndarray of shape (n_components,)
+            Variance explained by each of the selected components.
+        explained_variance_ratio_ : ndarray of shape (n_components,)
+            Percentage of variance explained by each of the selected components.
+            If all components are stored, the sum of explained variances is equal
+            to 1.0.
+        singular_values_ : ndarray of shape (n_components,)
+            The singular values corresponding to each of the selected components.
+            The singular values are equal to the 2-norms of the ``n_components``
+            variables in the lower-dimensional space.
+        mean_ : ndarray of shape (n_features,)
+            Per-feature empirical mean, aggregate over calls to ``partial_fit``.
+        var_ : ndarray of shape (n_features,)
+            Per-feature empirical variance, aggregate over calls to
+            ``partial_fit``.
+        noise_variance_ : float
+            Equal to the average of (min(n_features, n_samples) - n_components)
+            smallest eigenvalues of the covariance matrix of X.
+    """
+    def __init__(
+        self,
+        n_components=None,
+        is_deterministic=True,
+        method="cov",
+        whiten=False,
+    ):
+        self.n_components = n_components
+        self.method = method
+        self.is_deterministic = is_deterministic
+        self.whiten = whiten
+        self._reset()
+    def _reset(self):
+        self._need_to_finalize = False
+        module = self._get_backend("decomposition", "dim_reduction")
+        if hasattr(self, "components_"):
+            del self.components_
+        self._partial_result = module.partial_train_result()
+    def __getstate__(self):
+        # Since finalize_fit can't be dispatched without directly provided queue
+        # and the dispatching policy can't be serialized, the computation is finalized
+        # here and the policy is not saved in serialized data.
+        self.finalize_fit()
+        data = self.__dict__.copy()
+        data.pop("_queue", None)
+        return data
+    def partial_fit(self, X, queue):
+        """Incremental fit with X. All of X is processed as a single batch.
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            Training data, where `n_samples` is the number of samples and
+            `n_features` is the number of features.
+        y : Ignored
+            Not used, present for API consistency by convention.
+        Returns
+        -------
+        self : object
+            Returns the instance itself.
+        """
+        X = _check_array(X)
+        n_samples, n_features = X.shape
+        first_pass = not hasattr(self, "components_")
+        if first_pass:
+            self.components_ = None
+            self.n_samples_seen_ = n_samples
+            self.n_features_in_ = n_features
+        else:
+            self.n_samples_seen_ += n_samples
+        if self.n_components is None:
+            if self.components_ is None:
+                self.n_components_ = min(n_samples, n_features)
+            else:
+                self.n_components_ = self.components_.shape[0]
+        else:
+            self.n_components_ = self.n_components
+        self._queue = queue
+        policy = self._get_policy(queue, X)
+        X_table = to_table(X, queue=queue)
+        if not hasattr(self, "_dtype"):
+            self._dtype = X_table.dtype
+            self._params = self._get_onedal_params(X_table)
+        self._partial_result = self._get_backend(
+            "decomposition",
+            "dim_reduction",
+            "partial_train",
+            policy,
+            self._params,
+            self._partial_result,
+            X_table,
+        )
+        self._need_to_finalize = True
+        return self
+    def finalize_fit(self, queue=None):
+        """
+        Finalizes principal components computation and obtains resulting
+        attributes from the current `_partial_result`.
+        Parameters
+        ----------
+        queue : dpctl.SyclQueue
+            Not used here, added for API conformance
+        Returns
+        -------
+        self : object
+            Returns the instance itself.
+        """
+        if self._need_to_finalize:
+            module = self._get_backend("decomposition", "dim_reduction")
+            if queue is not None:
+                policy = self._get_policy(queue)
+            else:
+                policy = self._get_policy(self._queue)
+            result = module.finalize_train(policy, self._params, self._partial_result)
+            self.mean_ = from_table(result.means).ravel()
+            self.var_ = from_table(result.variances).ravel()
+            self.components_ = from_table(result.eigenvectors)
+            self.singular_values_ = np.nan_to_num(
+                from_table(result.singular_values).ravel()
+            )
+            self.explained_variance_ = np.maximum(
+                from_table(result.eigenvalues).ravel(), 0
+            )
+            self.explained_variance_ratio_ = from_table(
+                result.explained_variances_ratio
+            ).ravel()
+            self.noise_variance_ = self._compute_noise_variance(
+                self.n_components_, min(self.n_samples_seen_, self.n_features_in_)
+            )
+        self._need_to_finalize = False
+        return self

onedal/decomposition/pca.py ADDED Viewed

@@ -0,0 +1,186 @@
+# ==============================================================================
+# Copyright 2023 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+import numbers
+from abc import ABCMeta
+import numpy as np
+from sklearn.decomposition._pca import _infer_dimension
+from sklearn.utils.extmath import stable_cumsum
+from ..common._base import BaseEstimator
+from ..datatypes import from_table, to_table
+class BasePCA(BaseEstimator, metaclass=ABCMeta):
+    """
+    Base class for PCA oneDAL implementation.
+    """
+    def __init__(
+        self,
+        n_components=None,
+        is_deterministic=True,
+        method="cov",
+        whiten=False,
+    ):
+        self.n_components = n_components
+        self.method = method
+        self.is_deterministic = is_deterministic
+        self.whiten = whiten
+    def _get_onedal_params(self, data, stage=None):
+        if stage is None:
+            n_components = self._resolve_n_components_for_training(data.shape)
+        elif stage == "predict":
+            n_components = self.n_components_
+        return {
+            "fptype": data.dtype,
+            "method": self.method,
+            "n_components": n_components,
+            "is_deterministic": self.is_deterministic,
+            "whiten": self.whiten,
+        }
+    def _validate_n_components(self, n_components, n_samples, n_features):
+        if n_components is None:
+            n_components = min(n_samples, n_features)
+        if n_components == "mle":
+            if n_samples < n_features:
+                raise ValueError(
+                    "n_components='mle' is only supported if n_samples >= n_features"
+                )
+        elif not 0 <= n_components <= min(n_samples, n_features):
+            raise ValueError(
+                "n_components=%r must be between 0 and "
+                "min(n_samples, n_features)=%r with "
+                "svd_solver='full'" % (n_components, min(n_samples, n_features))
+            )
+        elif n_components >= 1:
+            if not isinstance(n_components, numbers.Integral):
+                raise ValueError(
+                    "n_components=%r must be of type int "
+                    "when greater than or equal to 1, "
+                    "was of type=%r" % (n_components, type(n_components))
+                )
+    def _resolve_n_components_for_training(self, shape_tuple):
+        if self.n_components is None or self.n_components == "mle":
+            return min(shape_tuple)
+        elif (
+            isinstance(self.n_components, float)
+            and self.n_components > 0.0
+            and self.n_components <= 1.0
+        ):
+            return min(shape_tuple)
+        else:
+            return self.n_components
+    def _resolve_n_components_for_result(self, shape_tuple):
+        if self.n_components is None:
+            return min(shape_tuple)
+        elif self.n_components == "mle":
+            return _infer_dimension(self.explained_variance_, shape_tuple[0])
+        elif 0.0 < self.n_components < 1.0:
+            ratio_cumsum = stable_cumsum(self.explained_variance_ratio_)
+            return np.searchsorted(ratio_cumsum, self.n_components, side="right") + 1
+        elif isinstance(self.n_components, float) and self.n_components == 1.0:
+            return min(shape_tuple)
+        else:
+            return self.n_components
+    def _compute_noise_variance(self, n_components, n_sf_min):
+        if n_components < n_sf_min:
+            if len(self.explained_variance_) == n_sf_min:
+                return self.explained_variance_[n_components:].mean()
+            elif len(self.explained_variance_) < n_sf_min:
+                # TODO Rename variances_ to var_ to align with sklearn/sklearnex IncrementalPCA
+                if hasattr(self, "variances_"):
+                    resid_var = self.variances_.sum()
+                elif hasattr(self, "var_"):
+                    resid_var = self.var_.sum()
+                resid_var -= self.explained_variance_.sum()
+                return resid_var / (n_sf_min - n_components)
+        else:
+            return 0.0
+    def _create_model(self):
+        m = self._get_backend("decomposition", "dim_reduction", "model")
+        m.eigenvectors = to_table(self.components_)
+        m.means = to_table(self.mean_)
+        if self.whiten:
+            m.eigenvalues = to_table(self.explained_variance_)
+        self._onedal_model = m
+        return m
+    def predict(self, X, queue=None):
+        policy = self._get_policy(queue, X)
+        model = self._create_model()
+        X_table = to_table(X, queue=queue)
+        params = self._get_onedal_params(X_table, stage="predict")
+        result = self._get_backend(
+            "decomposition", "dim_reduction", "infer", policy, params, model, X_table
+        )
+        return from_table(result.transformed_data)
+class PCA(BasePCA):
+    def fit(self, X, y=None, queue=None):
+        n_samples, n_features = X.shape
+        n_sf_min = min(n_samples, n_features)
+        self._validate_n_components(self.n_components, n_samples, n_features)
+        policy = self._get_policy(queue, X)
+        # TODO: investigate why np.ndarray with OWNDATA=FALSE flag
+        # fails to be converted to oneDAL table
+        if isinstance(X, np.ndarray) and not X.flags["OWNDATA"]:
+            X = X.copy()
+        X = to_table(X, queue=queue)
+        params = self._get_onedal_params(X)
+        result = self._get_backend(
+            "decomposition", "dim_reduction", "train", policy, params, X
+        )
+        self.mean_ = from_table(result.means).ravel()
+        self.variances_ = from_table(result.variances)
+        self.components_ = from_table(result.eigenvectors)
+        self.singular_values_ = from_table(result.singular_values).ravel()
+        self.explained_variance_ = np.maximum(from_table(result.eigenvalues).ravel(), 0)
+        self.explained_variance_ratio_ = from_table(
+            result.explained_variances_ratio
+        ).ravel()
+        self.n_samples_ = n_samples
+        self.n_features_ = n_features
+        U = None
+        S = self.singular_values_
+        Vt = self.components_
+        n_components = self._resolve_n_components_for_result(X.shape)
+        self.n_components_ = n_components
+        self.noise_variance_ = self._compute_noise_variance(n_components, n_sf_min)
+        if n_components < params["n_components"]:
+            self.explained_variance_ = self.explained_variance_[:n_components]
+            self.components_ = self.components_[:n_components]
+            self.singular_values_ = self.singular_values_[:n_components]
+            self.explained_variance_ratio_ = self.explained_variance_ratio_[:n_components]
+        return self

onedal/decomposition/tests/test_incremental_pca.py ADDED Viewed

@@ -0,0 +1,285 @@
+# ==============================================================================
+# Copyright 2024 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+import numpy as np
+import pytest
+from numpy.testing import assert_allclose
+from daal4py.sklearn._utils import daal_check_version
+from onedal.datatypes import from_table
+from onedal.decomposition import IncrementalPCA
+from onedal.tests.utils._device_selection import get_queues
+@pytest.mark.parametrize("queue", get_queues())
+@pytest.mark.parametrize("is_deterministic", [True, False])
+@pytest.mark.parametrize("whiten", [True, False])
+@pytest.mark.parametrize("num_blocks", [1, 2, 3])
+@pytest.mark.parametrize("dtype", [np.float32, np.float64])
+def test_on_gold_data(queue, is_deterministic, whiten, num_blocks, dtype):
+    X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
+    X = X.astype(dtype=dtype)
+    X_split = np.array_split(X, num_blocks)
+    incpca = IncrementalPCA(is_deterministic=is_deterministic, whiten=whiten)
+    for i in range(num_blocks):
+        incpca.partial_fit(X_split[i], queue=queue)
+    result = incpca.finalize_fit()
+    transformed_data = incpca.predict(X, queue=queue)
+    expected_n_components_ = 2
+    expected_components_ = np.array([[0.83849224, 0.54491354], [-0.54491354, 0.83849224]])
+    expected_singular_values_ = np.array([6.30061232, 0.54980396])
+    expected_mean_ = np.array([0, 0])
+    expected_var_ = np.array([5.6, 2.4])
+    expected_explained_variance_ = np.array([7.93954312, 0.06045688])
+    expected_explained_variance_ratio_ = np.array([0.99244289, 0.00755711])
+    expected_transformed_data = (
+        np.array(
+            [
+                [-0.49096647, -1.19399271],
+                [-0.78854479, 1.02218579],
+                [-1.27951125, -0.17180692],
+                [0.49096647, 1.19399271],
+                [0.78854479, -1.02218579],
+                [1.27951125, 0.17180692],
+            ]
+        )
+        if whiten
+        else np.array(
+            [
+                [-1.38340578, -0.2935787],
+                [-2.22189802, 0.25133484],
+                [-3.6053038, -0.04224385],
+                [1.38340578, 0.2935787],
+                [2.22189802, -0.25133484],
+                [3.6053038, 0.04224385],
+            ]
+        )
+    )
+    tol = 1e-7
+    if transformed_data.dtype == np.float32:
+        tol = 7e-6 if whiten else 1e-6
+    assert result.n_components_ == expected_n_components_
+    assert_allclose(result.singular_values_, expected_singular_values_, atol=tol)
+    assert_allclose(result.mean_, expected_mean_, atol=tol)
+    assert_allclose(result.var_, expected_var_, atol=tol)
+    assert_allclose(result.explained_variance_, expected_explained_variance_, atol=tol)
+    assert_allclose(
+        result.explained_variance_ratio_, expected_explained_variance_ratio_, atol=tol
+    )
+    if is_deterministic and daal_check_version((2024, "P", 500)):
+        assert_allclose(result.components_, expected_components_, atol=tol)
+        assert_allclose(transformed_data, expected_transformed_data, atol=tol)
+    else:
+        for i in range(result.n_components_):
+            abs_dot_product = np.abs(
+                np.dot(result.components_[i], expected_components_[i])
+            )
+            assert np.abs(abs_dot_product - 1.0) < tol
+            if np.dot(result.components_[i], expected_components_[i]) < 0:
+                assert_allclose(
+                    -transformed_data[i], expected_transformed_data[i], atol=tol
+                )
+            else:
+                assert_allclose(
+                    transformed_data[i], expected_transformed_data[i], atol=tol
+                )
+@pytest.mark.parametrize("queue", get_queues())
+@pytest.mark.parametrize("n_components", [None, 1, 5])
+@pytest.mark.parametrize("whiten", [True, False])
+@pytest.mark.parametrize("num_blocks", [1, 10])
+@pytest.mark.parametrize("row_count", [100, 1000])
+@pytest.mark.parametrize("column_count", [10, 100])
+@pytest.mark.parametrize("dtype", [np.float32, np.float64])
+def test_on_random_data(
+    queue, n_components, whiten, num_blocks, row_count, column_count, dtype
+):
+    seed = 78
+    gen = np.random.default_rng(seed)
+    X = gen.uniform(low=-0.3, high=+0.7, size=(row_count, column_count))
+    X = X.astype(dtype=dtype)
+    X_split = np.array_split(X, num_blocks)
+    incpca = IncrementalPCA(n_components=n_components, whiten=whiten)
+    for i in range(num_blocks):
+        incpca.partial_fit(X_split[i], queue=queue)
+    incpca.finalize_fit()
+    transformed_data = incpca.predict(X, queue=queue)
+    tol = 3e-3 if transformed_data.dtype == np.float32 else 2e-6
+    n_components = incpca.n_components_
+    expected_n_samples_seen = X.shape[0]
+    expected_n_features_in = X.shape[1]
+    n_samples_seen = incpca.n_samples_seen_
+    n_features_in = incpca.n_features_in_
+    assert n_samples_seen == expected_n_samples_seen
+    assert n_features_in == expected_n_features_in
+    components = incpca.components_
+    singular_values = incpca.singular_values_
+    centered_data = X - np.mean(X, axis=0)
+    cov_eigenvalues, cov_eigenvectors = np.linalg.eig(
+        centered_data.T @ centered_data / (n_samples_seen - 1)
+    )
+    cov_eigenvalues = np.nan_to_num(cov_eigenvalues)
+    cov_eigenvalues[cov_eigenvalues < 0] = 0
+    eigenvalues_order = np.argsort(cov_eigenvalues)[::-1]
+    sorted_eigenvalues = cov_eigenvalues[eigenvalues_order]
+    sorted_eigenvectors = cov_eigenvectors[:, eigenvalues_order]
+    expected_singular_values = np.sqrt(sorted_eigenvalues * (n_samples_seen - 1))[
+        :n_components
+    ]
+    expected_components = sorted_eigenvectors.T[:n_components]
+    assert_allclose(singular_values, expected_singular_values, atol=tol)
+    for i in range(n_components):
+        component_length = np.dot(components[i], components[i])
+        assert np.abs(component_length - 1.0) < tol
+        abs_dot_product = np.abs(np.dot(components[i], expected_components[i]))
+        assert np.abs(abs_dot_product - 1.0) < tol
+    expected_mean = np.mean(X, axis=0)
+    assert_allclose(incpca.mean_, expected_mean, atol=tol)
+    expected_var_ = np.var(X, ddof=1, axis=0)
+    assert_allclose(incpca.var_, expected_var_, atol=tol)
+    expected_explained_variance = sorted_eigenvalues[:n_components]
+    assert_allclose(incpca.explained_variance_, expected_explained_variance, atol=tol)
+    expected_explained_variance_ratio = expected_explained_variance / np.sum(
+        sorted_eigenvalues
+    )
+    assert_allclose(
+        incpca.explained_variance_ratio_, expected_explained_variance_ratio, atol=tol
+    )
+    expected_noise_variance = (
+        np.mean(sorted_eigenvalues[n_components:])
+        if len(sorted_eigenvalues) > n_components
+        else 0.0
+    )
+    # TODO Fix noise variance computation (It is necessary to update C++ side)
+    # assert np.abs(incpca.noise_variance_ - expected_noise_variance) < tol
+    expected_transformed_data = centered_data @ components.T
+    if whiten:
+        scale = np.sqrt(incpca.explained_variance_)
+        min_scale = np.finfo(scale.dtype).eps
+        scale[scale < min_scale] = np.inf
+        expected_transformed_data /= scale
+    if daal_check_version((2024, "P", 500)) or not (
+        whiten and queue is not None and queue.sycl_device.device_type.name == "gpu"
+    ):
+        assert_allclose(transformed_data, expected_transformed_data, atol=tol)
+@pytest.mark.parametrize("queue", get_queues())
+@pytest.mark.parametrize("dtype", [np.float32, np.float64])
+def test_incremental_estimator_pickle(queue, dtype):
+    import pickle
+    from onedal.decomposition import IncrementalPCA
+    incpca = IncrementalPCA()
+    # Check that estimator can be serialized without any data.
+    dump = pickle.dumps(incpca)
+    incpca_loaded = pickle.loads(dump)
+    seed = 77
+    gen = np.random.default_rng(seed)
+    X = gen.uniform(low=-0.3, high=+0.7, size=(10, 10))
+    X = X.astype(dtype)
+    X_split = np.array_split(X, 2)
+    incpca.partial_fit(X_split[0], queue=queue)
+    incpca_loaded.partial_fit(X_split[0], queue=queue)
+    assert incpca._need_to_finalize == True
+    assert incpca_loaded._need_to_finalize == True
+    # Check that estimator can be serialized after partial_fit call.
+    dump = pickle.dumps(incpca)
+    incpca_loaded = pickle.loads(dump)
+    assert incpca._need_to_finalize == False
+    # Finalize is called during serialization to make sure partial results are finalized correctly.
+    assert incpca_loaded._need_to_finalize == False
+    partial_n_rows = from_table(incpca._partial_result.partial_n_rows)
+    partial_n_rows_loaded = from_table(incpca_loaded._partial_result.partial_n_rows)
+    assert_allclose(partial_n_rows, partial_n_rows_loaded)
+    partial_crossproduct = from_table(incpca._partial_result.partial_crossproduct)
+    partial_crossproduct_loaded = from_table(
+        incpca_loaded._partial_result.partial_crossproduct
+    )
+    assert_allclose(partial_crossproduct, partial_crossproduct_loaded)
+    partial_sum = from_table(incpca._partial_result.partial_sum)
+    partial_sum_loaded = from_table(incpca_loaded._partial_result.partial_sum)
+    assert_allclose(partial_sum, partial_sum_loaded)
+    auxiliary_table_count = incpca._partial_result.auxiliary_table_count
+    auxiliary_table_count_loaded = incpca_loaded._partial_result.auxiliary_table_count
+    assert_allclose(auxiliary_table_count, auxiliary_table_count_loaded)
+    for i in range(auxiliary_table_count):
+        aux_table = incpca._partial_result.get_auxiliary_table(i)
+        aux_table_loaded = incpca_loaded._partial_result.get_auxiliary_table(i)
+        assert_allclose(from_table(aux_table), from_table(aux_table_loaded))
+    incpca.partial_fit(X_split[1], queue=queue)
+    incpca_loaded.partial_fit(X_split[1], queue=queue)
+    assert incpca._need_to_finalize == True
+    assert incpca_loaded._need_to_finalize == True
+    dump = pickle.dumps(incpca_loaded)
+    incpca_loaded = pickle.loads(dump)
+    assert incpca._need_to_finalize == True
+    assert incpca_loaded._need_to_finalize == False
+    incpca.finalize_fit()
+    incpca_loaded.finalize_fit()
+    # Check that finalized estimator can be serialized.
+    dump = pickle.dumps(incpca_loaded)
+    incpca_loaded = pickle.loads(dump)
+    assert_allclose(incpca.singular_values_, incpca_loaded.singular_values_, atol=1e-6)
+    assert_allclose(incpca.n_samples_seen_, incpca_loaded.n_samples_seen_, atol=1e-6)
+    assert_allclose(incpca.n_features_in_, incpca_loaded.n_features_in_, atol=1e-6)
+    assert_allclose(incpca.mean_, incpca_loaded.mean_, atol=1e-6)
+    assert_allclose(incpca.var_, incpca_loaded.var_, atol=1e-6)
+    assert_allclose(
+        incpca.explained_variance_, incpca_loaded.explained_variance_, atol=1e-6
+    )
+    assert_allclose(incpca.components_, incpca_loaded.components_, atol=1e-6)
+    assert_allclose(
+        incpca.explained_variance_ratio_,
+        incpca_loaded.explained_variance_ratio_,
+        atol=1e-6,
+    )