PyPI - scikit-learn-intelex - Versions diffs - 2025.1.0__py310-none-manylinux_2_28_x86_64.whl - Mend

scikit-learn-intelex 2025.1.0__py310-none-manylinux_2_28_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of scikit-learn-intelex might be problematic. Click here for more details.

Files changed (280) hide show

daal4py/__init__.py +73 -0
daal4py/__main__.py +58 -0
daal4py/_daal4py.cpython-310-x86_64-linux-gnu.so +0 -0
daal4py/doc/third-party-programs.txt +424 -0
daal4py/mb/__init__.py +19 -0
daal4py/mb/model_builders.py +377 -0
daal4py/mpi_transceiver.cpython-310-x86_64-linux-gnu.so +0 -0
daal4py/sklearn/__init__.py +40 -0
daal4py/sklearn/_n_jobs_support.py +248 -0
daal4py/sklearn/_utils.py +245 -0
daal4py/sklearn/cluster/__init__.py +20 -0
daal4py/sklearn/cluster/dbscan.py +165 -0
daal4py/sklearn/cluster/k_means.py +597 -0
daal4py/sklearn/cluster/tests/test_dbscan.py +109 -0
daal4py/sklearn/decomposition/__init__.py +19 -0
daal4py/sklearn/decomposition/_pca.py +524 -0
daal4py/sklearn/ensemble/AdaBoostClassifier.py +196 -0
daal4py/sklearn/ensemble/GBTDAAL.py +337 -0
daal4py/sklearn/ensemble/__init__.py +27 -0
daal4py/sklearn/ensemble/_forest.py +1397 -0
daal4py/sklearn/ensemble/tests/test_decision_forest.py +206 -0
daal4py/sklearn/linear_model/__init__.py +29 -0
daal4py/sklearn/linear_model/_coordinate_descent.py +848 -0
daal4py/sklearn/linear_model/_linear.py +272 -0
daal4py/sklearn/linear_model/_ridge.py +325 -0
daal4py/sklearn/linear_model/coordinate_descent.py +17 -0
daal4py/sklearn/linear_model/linear.py +17 -0
daal4py/sklearn/linear_model/logistic_loss.py +195 -0
daal4py/sklearn/linear_model/logistic_path.py +1026 -0
daal4py/sklearn/linear_model/ridge.py +17 -0
daal4py/sklearn/linear_model/tests/test_linear.py +208 -0
daal4py/sklearn/linear_model/tests/test_ridge.py +69 -0
daal4py/sklearn/manifold/__init__.py +19 -0
daal4py/sklearn/manifold/_t_sne.py +405 -0
daal4py/sklearn/metrics/__init__.py +20 -0
daal4py/sklearn/metrics/_pairwise.py +236 -0
daal4py/sklearn/metrics/_ranking.py +210 -0
daal4py/sklearn/model_selection/__init__.py +19 -0
daal4py/sklearn/model_selection/_split.py +309 -0
daal4py/sklearn/model_selection/tests/test_split.py +56 -0
daal4py/sklearn/monkeypatch/__init__.py +0 -0
daal4py/sklearn/monkeypatch/dispatcher.py +232 -0
daal4py/sklearn/monkeypatch/tests/_models_info.py +161 -0
daal4py/sklearn/monkeypatch/tests/test_monkeypatch.py +71 -0
daal4py/sklearn/monkeypatch/tests/test_patching.py +90 -0
daal4py/sklearn/monkeypatch/tests/utils/_launch_algorithms.py +117 -0
daal4py/sklearn/neighbors/__init__.py +21 -0
daal4py/sklearn/neighbors/_base.py +503 -0
daal4py/sklearn/neighbors/_classification.py +139 -0
daal4py/sklearn/neighbors/_regression.py +74 -0
daal4py/sklearn/neighbors/_unsupervised.py +55 -0
daal4py/sklearn/neighbors/tests/test_kneighbors.py +113 -0
daal4py/sklearn/svm/__init__.py +19 -0
daal4py/sklearn/svm/svm.py +734 -0
daal4py/sklearn/utils/__init__.py +21 -0
daal4py/sklearn/utils/base.py +75 -0
daal4py/sklearn/utils/tests/test_utils.py +51 -0
daal4py/sklearn/utils/validation.py +693 -0
onedal/__init__.py +83 -0
onedal/_config.py +54 -0
onedal/_device_offload.py +222 -0
onedal/_onedal_py_dpc.cpython-310-x86_64-linux-gnu.so +0 -0
onedal/_onedal_py_host.cpython-310-x86_64-linux-gnu.so +0 -0
onedal/_onedal_py_spmd_dpc.cpython-310-x86_64-linux-gnu.so +0 -0
onedal/basic_statistics/__init__.py +20 -0
onedal/basic_statistics/basic_statistics.py +107 -0
onedal/basic_statistics/incremental_basic_statistics.py +160 -0
onedal/basic_statistics/tests/test_basic_statistics.py +298 -0
onedal/basic_statistics/tests/test_incremental_basic_statistics.py +196 -0
onedal/cluster/__init__.py +27 -0
onedal/cluster/dbscan.py +110 -0
onedal/cluster/kmeans.py +564 -0
onedal/cluster/kmeans_init.py +115 -0
onedal/cluster/tests/test_dbscan.py +125 -0
onedal/cluster/tests/test_kmeans.py +88 -0
onedal/cluster/tests/test_kmeans_init.py +93 -0
onedal/common/_base.py +38 -0
onedal/common/_estimator_checks.py +47 -0
onedal/common/_mixin.py +62 -0
onedal/common/_policy.py +59 -0
onedal/common/_spmd_policy.py +30 -0
onedal/common/hyperparameters.py +125 -0
onedal/common/tests/test_policy.py +76 -0
onedal/covariance/__init__.py +20 -0
onedal/covariance/covariance.py +125 -0
onedal/covariance/incremental_covariance.py +146 -0
onedal/covariance/tests/test_covariance.py +50 -0
onedal/covariance/tests/test_incremental_covariance.py +122 -0
onedal/datatypes/__init__.py +19 -0
onedal/datatypes/_data_conversion.py +154 -0
onedal/datatypes/tests/common.py +126 -0
onedal/datatypes/tests/test_data.py +414 -0
onedal/decomposition/__init__.py +20 -0
onedal/decomposition/incremental_pca.py +204 -0
onedal/decomposition/pca.py +186 -0
onedal/decomposition/tests/test_incremental_pca.py +198 -0
onedal/ensemble/__init__.py +29 -0
onedal/ensemble/forest.py +727 -0
onedal/ensemble/tests/test_random_forest.py +97 -0
onedal/linear_model/__init__.py +27 -0
onedal/linear_model/incremental_linear_model.py +258 -0
onedal/linear_model/linear_model.py +329 -0
onedal/linear_model/logistic_regression.py +249 -0
onedal/linear_model/tests/test_incremental_linear_regression.py +168 -0
onedal/linear_model/tests/test_incremental_ridge_regression.py +107 -0
onedal/linear_model/tests/test_linear_regression.py +250 -0
onedal/linear_model/tests/test_logistic_regression.py +95 -0
onedal/linear_model/tests/test_ridge.py +95 -0
onedal/neighbors/__init__.py +19 -0
onedal/neighbors/neighbors.py +767 -0
onedal/neighbors/tests/test_knn_classification.py +49 -0
onedal/primitives/__init__.py +27 -0
onedal/primitives/get_tree.py +25 -0
onedal/primitives/kernel_functions.py +153 -0
onedal/primitives/tests/test_kernel_functions.py +159 -0
onedal/spmd/__init__.py +25 -0
onedal/spmd/_base.py +30 -0
onedal/spmd/basic_statistics/__init__.py +20 -0
onedal/spmd/basic_statistics/basic_statistics.py +30 -0
onedal/spmd/basic_statistics/incremental_basic_statistics.py +69 -0
onedal/spmd/cluster/__init__.py +28 -0
onedal/spmd/cluster/dbscan.py +23 -0
onedal/spmd/cluster/kmeans.py +56 -0
onedal/spmd/covariance/__init__.py +20 -0
onedal/spmd/covariance/covariance.py +26 -0
onedal/spmd/covariance/incremental_covariance.py +82 -0
onedal/spmd/decomposition/__init__.py +20 -0
onedal/spmd/decomposition/incremental_pca.py +117 -0
onedal/spmd/decomposition/pca.py +26 -0
onedal/spmd/ensemble/__init__.py +19 -0
onedal/spmd/ensemble/forest.py +28 -0
onedal/spmd/linear_model/__init__.py +21 -0
onedal/spmd/linear_model/incremental_linear_model.py +97 -0
onedal/spmd/linear_model/linear_model.py +30 -0
onedal/spmd/linear_model/logistic_regression.py +38 -0
onedal/spmd/neighbors/__init__.py +19 -0
onedal/spmd/neighbors/neighbors.py +75 -0
onedal/svm/__init__.py +19 -0
onedal/svm/svm.py +556 -0
onedal/svm/tests/test_csr_svm.py +351 -0
onedal/svm/tests/test_nusvc.py +204 -0
onedal/svm/tests/test_nusvr.py +210 -0
onedal/svm/tests/test_svc.py +176 -0
onedal/svm/tests/test_svr.py +243 -0
onedal/tests/test_common.py +57 -0
onedal/tests/utils/_dataframes_support.py +162 -0
onedal/tests/utils/_device_selection.py +102 -0
onedal/utils/__init__.py +49 -0
onedal/utils/_array_api.py +81 -0
onedal/utils/_dpep_helpers.py +56 -0
onedal/utils/validation.py +440 -0
scikit_learn_intelex-2025.1.0.dist-info/LICENSE.txt +202 -0
scikit_learn_intelex-2025.1.0.dist-info/METADATA +231 -0
scikit_learn_intelex-2025.1.0.dist-info/RECORD +280 -0
scikit_learn_intelex-2025.1.0.dist-info/WHEEL +5 -0
scikit_learn_intelex-2025.1.0.dist-info/top_level.txt +3 -0
sklearnex/__init__.py +66 -0
sklearnex/__main__.py +58 -0
sklearnex/_config.py +116 -0
sklearnex/_device_offload.py +126 -0
sklearnex/_utils.py +132 -0
sklearnex/basic_statistics/__init__.py +20 -0
sklearnex/basic_statistics/basic_statistics.py +230 -0
sklearnex/basic_statistics/incremental_basic_statistics.py +345 -0
sklearnex/basic_statistics/tests/test_basic_statistics.py +270 -0
sklearnex/basic_statistics/tests/test_incremental_basic_statistics.py +404 -0
sklearnex/cluster/__init__.py +20 -0
sklearnex/cluster/dbscan.py +197 -0
sklearnex/cluster/k_means.py +395 -0
sklearnex/cluster/tests/test_dbscan.py +38 -0
sklearnex/cluster/tests/test_kmeans.py +159 -0
sklearnex/conftest.py +82 -0
sklearnex/covariance/__init__.py +19 -0
sklearnex/covariance/incremental_covariance.py +398 -0
sklearnex/covariance/tests/test_incremental_covariance.py +237 -0
sklearnex/decomposition/__init__.py +19 -0
sklearnex/decomposition/pca.py +425 -0
sklearnex/decomposition/tests/test_pca.py +58 -0
sklearnex/dispatcher.py +543 -0
sklearnex/doc/third-party-programs.txt +424 -0
sklearnex/ensemble/__init__.py +29 -0
sklearnex/ensemble/_forest.py +2029 -0
sklearnex/ensemble/tests/test_forest.py +135 -0
sklearnex/glob/__main__.py +72 -0
sklearnex/glob/dispatcher.py +101 -0
sklearnex/linear_model/__init__.py +32 -0
sklearnex/linear_model/coordinate_descent.py +30 -0
sklearnex/linear_model/incremental_linear.py +482 -0
sklearnex/linear_model/incremental_ridge.py +425 -0
sklearnex/linear_model/linear.py +341 -0
sklearnex/linear_model/logistic_regression.py +413 -0
sklearnex/linear_model/ridge.py +24 -0
sklearnex/linear_model/tests/test_incremental_linear.py +207 -0
sklearnex/linear_model/tests/test_incremental_ridge.py +153 -0
sklearnex/linear_model/tests/test_linear.py +167 -0
sklearnex/linear_model/tests/test_logreg.py +134 -0
sklearnex/manifold/__init__.py +19 -0
sklearnex/manifold/t_sne.py +21 -0
sklearnex/manifold/tests/test_tsne.py +26 -0
sklearnex/metrics/__init__.py +23 -0
sklearnex/metrics/pairwise.py +22 -0
sklearnex/metrics/ranking.py +20 -0
sklearnex/metrics/tests/test_metrics.py +39 -0
sklearnex/model_selection/__init__.py +21 -0
sklearnex/model_selection/split.py +22 -0
sklearnex/model_selection/tests/test_model_selection.py +34 -0
sklearnex/neighbors/__init__.py +27 -0
sklearnex/neighbors/_lof.py +236 -0
sklearnex/neighbors/common.py +310 -0
sklearnex/neighbors/knn_classification.py +231 -0
sklearnex/neighbors/knn_regression.py +207 -0
sklearnex/neighbors/knn_unsupervised.py +178 -0
sklearnex/neighbors/tests/test_neighbors.py +82 -0
sklearnex/preview/__init__.py +17 -0
sklearnex/preview/covariance/__init__.py +19 -0
sklearnex/preview/covariance/covariance.py +138 -0
sklearnex/preview/covariance/tests/test_covariance.py +66 -0
sklearnex/preview/decomposition/__init__.py +19 -0
sklearnex/preview/decomposition/incremental_pca.py +233 -0
sklearnex/preview/decomposition/tests/test_incremental_pca.py +266 -0
sklearnex/preview/linear_model/__init__.py +19 -0
sklearnex/preview/linear_model/ridge.py +424 -0
sklearnex/preview/linear_model/tests/test_ridge.py +102 -0
sklearnex/spmd/__init__.py +25 -0
sklearnex/spmd/basic_statistics/__init__.py +20 -0
sklearnex/spmd/basic_statistics/basic_statistics.py +21 -0
sklearnex/spmd/basic_statistics/incremental_basic_statistics.py +30 -0
sklearnex/spmd/basic_statistics/tests/test_basic_statistics_spmd.py +107 -0
sklearnex/spmd/basic_statistics/tests/test_incremental_basic_statistics_spmd.py +307 -0
sklearnex/spmd/cluster/__init__.py +30 -0
sklearnex/spmd/cluster/dbscan.py +50 -0
sklearnex/spmd/cluster/kmeans.py +21 -0
sklearnex/spmd/cluster/tests/test_dbscan_spmd.py +97 -0
sklearnex/spmd/cluster/tests/test_kmeans_spmd.py +172 -0
sklearnex/spmd/covariance/__init__.py +20 -0
sklearnex/spmd/covariance/covariance.py +21 -0
sklearnex/spmd/covariance/incremental_covariance.py +37 -0
sklearnex/spmd/covariance/tests/test_covariance_spmd.py +107 -0
sklearnex/spmd/covariance/tests/test_incremental_covariance_spmd.py +184 -0
sklearnex/spmd/decomposition/__init__.py +20 -0
sklearnex/spmd/decomposition/incremental_pca.py +30 -0
sklearnex/spmd/decomposition/pca.py +21 -0
sklearnex/spmd/decomposition/tests/test_incremental_pca_spmd.py +269 -0
sklearnex/spmd/decomposition/tests/test_pca_spmd.py +128 -0
sklearnex/spmd/ensemble/__init__.py +19 -0
sklearnex/spmd/ensemble/forest.py +71 -0
sklearnex/spmd/ensemble/tests/test_forest_spmd.py +265 -0
sklearnex/spmd/linear_model/__init__.py +21 -0
sklearnex/spmd/linear_model/incremental_linear_model.py +35 -0
sklearnex/spmd/linear_model/linear_model.py +21 -0
sklearnex/spmd/linear_model/logistic_regression.py +21 -0
sklearnex/spmd/linear_model/tests/test_incremental_linear_spmd.py +329 -0
sklearnex/spmd/linear_model/tests/test_linear_regression_spmd.py +145 -0
sklearnex/spmd/linear_model/tests/test_logistic_regression_spmd.py +162 -0
sklearnex/spmd/neighbors/__init__.py +19 -0
sklearnex/spmd/neighbors/neighbors.py +25 -0
sklearnex/spmd/neighbors/tests/test_neighbors_spmd.py +288 -0
sklearnex/svm/__init__.py +29 -0
sklearnex/svm/_common.py +339 -0
sklearnex/svm/nusvc.py +371 -0
sklearnex/svm/nusvr.py +170 -0
sklearnex/svm/svc.py +399 -0
sklearnex/svm/svr.py +167 -0
sklearnex/svm/tests/test_svm.py +93 -0
sklearnex/tests/test_common.py +390 -0
sklearnex/tests/test_config.py +123 -0
sklearnex/tests/test_memory_usage.py +379 -0
sklearnex/tests/test_monkeypatch.py +276 -0
sklearnex/tests/test_n_jobs_support.py +108 -0
sklearnex/tests/test_parallel.py +48 -0
sklearnex/tests/test_patching.py +385 -0
sklearnex/tests/test_run_to_run_stability.py +321 -0
sklearnex/tests/utils/__init__.py +44 -0
sklearnex/tests/utils/base.py +371 -0
sklearnex/tests/utils/spmd.py +198 -0
sklearnex/utils/__init__.py +19 -0
sklearnex/utils/_array_api.py +82 -0
sklearnex/utils/parallel.py +59 -0
sklearnex/utils/tests/test_finite.py +89 -0
sklearnex/utils/validation.py +17 -0

onedal/cluster/kmeans.py ADDED Viewed

@@ -0,0 +1,564 @@
+# ==============================================================================
+# Copyright 2023 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+import logging
+import warnings
+from abc import ABC
+import numpy as np
+from daal4py.sklearn._utils import daal_check_version, get_dtype
+from onedal import _backend
+from onedal.basic_statistics import BasicStatistics
+if daal_check_version((2023, "P", 200)):
+    from .kmeans_init import KMeansInit
+from sklearn.cluster._kmeans import _kmeans_plusplus
+from sklearn.exceptions import ConvergenceWarning
+from sklearn.metrics.pairwise import euclidean_distances
+from sklearn.utils import check_random_state
+from ..common._base import BaseEstimator as onedal_BaseEstimator
+from ..common._mixin import ClusterMixin, TransformerMixin
+from ..datatypes import _convert_to_supported, from_table, to_table
+from ..utils import _check_array, _is_arraylike_not_scalar, _is_csr
+class _BaseKMeans(onedal_BaseEstimator, TransformerMixin, ClusterMixin, ABC):
+    def __init__(
+        self,
+        n_clusters,
+        *,
+        init,
+        n_init,
+        max_iter,
+        tol,
+        verbose,
+        random_state,
+        n_local_trials=None,
+    ):
+        self.n_clusters = n_clusters
+        self.init = init
+        self.max_iter = max_iter
+        self.tol = tol
+        self.n_init = n_init
+        self.verbose = verbose
+        self.random_state = random_state
+        self.n_local_trials = n_local_trials
+    def _validate_center_shape(self, X, centers):
+        """Check if centers is compatible with X and n_clusters."""
+        if centers.shape[0] != self.n_clusters:
+            raise ValueError(
+                f"The shape of the initial centers {centers.shape} does not "
+                f"match the number of clusters {self.n_clusters}."
+            )
+        if centers.shape[1] != X.shape[1]:
+            raise ValueError(
+                f"The shape of the initial centers {centers.shape} does not "
+                f"match the number of features of the data {X.shape[1]}."
+            )
+    def _get_kmeans_init(self, cluster_count, seed, algorithm):
+        return KMeansInit(cluster_count=cluster_count, seed=seed, algorithm=algorithm)
+    # Get appropriate backend (required for SPMD)
+    def _get_basic_statistics_backend(self, result_options):
+        return BasicStatistics(result_options)
+    def _tolerance(self, X_table, rtol, is_csr, policy, dtype):
+        """Compute absolute tolerance from the relative tolerance"""
+        if rtol == 0.0:
+            return rtol
+        dummy = to_table(None)
+        bs = self._get_basic_statistics_backend("variance")
+        res = bs._compute_raw(X_table, dummy, policy, dtype, is_csr)
+        mean_var = from_table(res["variance"]).mean()
+        return mean_var * rtol
+    def _check_params_vs_input(
+        self, X_table, is_csr, policy, default_n_init=10, dtype=np.float32
+    ):
+        # n_clusters
+        if X_table.shape[0] < self.n_clusters:
+            raise ValueError(
+                f"n_samples={X_table.shape[0]} should be >= n_clusters={self.n_clusters}."
+            )
+        # tol
+        self._tol = self._tolerance(X_table, self.tol, is_csr, policy, dtype)
+        # n-init
+        # TODO(1.4): Remove
+        self._n_init = self.n_init
+        if self._n_init == "warn":
+            warnings.warn(
+                (
+                    "The default value of `n_init` will change from "
+                    f"{default_n_init} to 'auto' in 1.4. Set the value of `n_init`"
+                    " explicitly to suppress the warning"
+                ),
+                FutureWarning,
+                stacklevel=2,
+            )
+            self._n_init = default_n_init
+        if self._n_init == "auto":
+            if isinstance(self.init, str) and self.init == "k-means++":
+                self._n_init = 1
+            elif isinstance(self.init, str) and self.init == "random":
+                self._n_init = default_n_init
+            elif callable(self.init):
+                self._n_init = default_n_init
+            else:  # array-like
+                self._n_init = 1
+        if _is_arraylike_not_scalar(self.init) and self._n_init != 1:
+            warnings.warn(
+                (
+                    "Explicit initial center position passed: performing only"
+                    f" one init in {self.__class__.__name__} instead of "
+                    f"n_init={self._n_init}."
+                ),
+                RuntimeWarning,
+                stacklevel=2,
+            )
+            self._n_init = 1
+        assert self.algorithm == "lloyd"
+    def _get_onedal_params(self, is_csr=False, dtype=np.float32, result_options=None):
+        thr = self._tol if hasattr(self, "_tol") else self.tol
+        return {
+            "fptype": "float" if dtype == np.float32 else "double",
+            "method": "lloyd_csr" if is_csr else "by_default",
+            "seed": -1,
+            "max_iteration_count": self.max_iter,
+            "cluster_count": self.n_clusters,
+            "accuracy_threshold": thr,
+            "result_options": "" if result_options is None else result_options,
+        }
+    def _init_centroids_onedal(
+        self,
+        X_table,
+        init,
+        random_seed,
+        policy,
+        is_csr,
+        dtype=np.float32,
+        n_centroids=None,
+    ):
+        n_clusters = self.n_clusters if n_centroids is None else n_centroids
+        # Use host policy for KMeans init, only for csr data
+        # as oneDAL KMeansInit for CSR data is not implemented on GPU
+        if is_csr:
+            init_policy = self._get_policy(None, None)
+            logging.getLogger("sklearnex").info("Running Sparse KMeansInit on CPU")
+        else:
+            init_policy = policy
+        if isinstance(init, str) and init == "k-means++":
+            if not is_csr:
+                alg = self._get_kmeans_init(
+                    cluster_count=n_clusters,
+                    seed=random_seed,
+                    algorithm="plus_plus_dense",
+                )
+            else:
+                alg = self._get_kmeans_init(
+                    cluster_count=n_clusters, seed=random_seed, algorithm="plus_plus_csr"
+                )
+            centers_table = alg.compute_raw(X_table, init_policy, dtype)
+        elif isinstance(init, str) and init == "random":
+            if not is_csr:
+                alg = self._get_kmeans_init(
+                    cluster_count=n_clusters, seed=random_seed, algorithm="random_dense"
+                )
+            else:
+                alg = self._get_kmeans_init(
+                    cluster_count=n_clusters, seed=random_seed, algorithm="random_csr"
+                )
+            centers_table = alg.compute_raw(X_table, init_policy, dtype)
+        elif _is_arraylike_not_scalar(init):
+            if _is_csr(init):
+                # oneDAL KMeans only supports Dense Centroids
+                centers = init.toarray()
+            else:
+                centers = np.asarray(init)
+            assert centers.shape[0] == n_clusters
+            assert centers.shape[1] == X_table.column_count
+            # KMeans is implemented on both CPU and GPU for Dense and CSR data
+            # The original policy can be used here
+            centers = _convert_to_supported(policy, centers)
+            centers_table = to_table(centers)
+        else:
+            raise TypeError("Unsupported type of the `init` value")
+        return centers_table
+    def _init_centroids_sklearn(self, X, init, random_state, policy, dtype=np.float32):
+        # For oneDAL versions < 2023.2 or callable init,
+        # using the scikit-learn implementation
+        logging.getLogger("sklearnex").info("Computing KMeansInit with Stock sklearn")
+        n_samples = X.shape[0]
+        if isinstance(init, str) and init == "k-means++":
+            centers, _ = _kmeans_plusplus(
+                X,
+                self.n_clusters,
+                random_state=random_state,
+            )
+        elif isinstance(init, str) and init == "random":
+            seeds = random_state.choice(n_samples, size=self.n_clusters, replace=False)
+            centers = X[seeds]
+        elif callable(init):
+            cc_arr = init(X, self.n_clusters, random_state)
+            cc_arr = np.ascontiguousarray(cc_arr, dtype=dtype)
+            self._validate_center_shape(X, cc_arr)
+            centers = cc_arr
+        elif _is_arraylike_not_scalar(init):
+            centers = init
+        else:
+            raise ValueError(
+                f"init should be either 'k-means++', 'random', a ndarray or a "
+                f"callable, got '{ init }' instead."
+            )
+        centers = _convert_to_supported(policy, centers)
+        return to_table(centers)
+    def _fit_backend(
+        self, X_table, centroids_table, module, policy, dtype=np.float32, is_csr=False
+    ):
+        params = self._get_onedal_params(is_csr, dtype)
+        meta = _backend.get_table_metadata(X_table)
+        assert meta.get_npy_dtype(0) == dtype
+        result = module.train(policy, params, X_table, centroids_table)
+        return (
+            result.responses,
+            result.objective_function_value,
+            result.model,
+            result.iteration_count,
+        )
+    def _fit(self, X, module, queue=None):
+        policy = self._get_policy(queue, X)
+        is_csr = _is_csr(X)
+        X = _check_array(
+            X, dtype=[np.float64, np.float32], accept_sparse="csr", force_all_finite=False
+        )
+        X = _convert_to_supported(policy, X)
+        dtype = get_dtype(X)
+        X_table = to_table(X)
+        self._check_params_vs_input(X_table, is_csr, policy, dtype=dtype)
+        params = self._get_onedal_params(is_csr, dtype)
+        self.n_features_in_ = X_table.column_count
+        best_model, best_n_iter = None, None
+        best_inertia, best_labels = None, None
+        def is_better_iteration(inertia, labels):
+            if best_inertia is None:
+                return True
+            else:
+                mod = self._get_backend("kmeans_common", None, None)
+                better_inertia = inertia < best_inertia
+                same_clusters = mod._is_same_clustering(
+                    labels, best_labels, self.n_clusters
+                )
+                return better_inertia and not same_clusters
+        random_state = check_random_state(self.random_state)
+        init = self.init
+        init_is_array_like = _is_arraylike_not_scalar(init)
+        if init_is_array_like:
+            init = _check_array(
+                init, dtype=dtype, accept_sparse="csr", copy=True, order="C"
+            )
+            self._validate_center_shape(X, init)
+        use_onedal_init = daal_check_version((2023, "P", 200)) and not callable(self.init)
+        for _ in range(self._n_init):
+            if use_onedal_init:
+                random_seed = random_state.randint(np.iinfo("i").max)
+                centroids_table = self._init_centroids_onedal(
+                    X_table, init, random_seed, policy, is_csr, dtype=dtype
+                )
+            else:
+                centroids_table = self._init_centroids_sklearn(
+                    X, init, random_state, policy, dtype=dtype
+                )
+            if self.verbose:
+                print("Initialization complete")
+            labels, inertia, model, n_iter = self._fit_backend(
+                X_table, centroids_table, module, policy, dtype, is_csr
+            )
+            if self.verbose:
+                print("Iteration {}, inertia {}.".format(n_iter, inertia))
+            if is_better_iteration(inertia, labels):
+                best_model, best_n_iter = model, n_iter
+                best_inertia, best_labels = inertia, labels
+        # Types without conversion
+        self.model_ = best_model
+        # Simple types
+        self.n_iter_ = best_n_iter
+        self.inertia_ = best_inertia
+        # Complex type conversion
+        self.labels_ = from_table(best_labels).ravel()
+        distinct_clusters = len(np.unique(self.labels_))
+        if distinct_clusters < self.n_clusters:
+            warnings.warn(
+                "Number of distinct clusters ({}) found smaller than "
+                "n_clusters ({}). Possibly due to duplicate points "
+                "in X.".format(distinct_clusters, self.n_clusters),
+                ConvergenceWarning,
+                stacklevel=2,
+            )
+        return self
+    @property
+    def cluster_centers_(self):
+        if not hasattr(self, "_cluster_centers_"):
+            if hasattr(self, "model_"):
+                centroids = self.model_.centroids
+                self._cluster_centers_ = from_table(centroids)
+            else:
+                raise NameError("This model have not been trained")
+        return self._cluster_centers_
+    @cluster_centers_.setter
+    def cluster_centers_(self, cluster_centers):
+        self._cluster_centers_ = np.asarray(cluster_centers)
+        self.n_iter_ = 0
+        self.inertia_ = 0
+        self.model_ = self._get_backend("kmeans", "clustering", "model")
+        self.model_.centroids = to_table(self._cluster_centers_)
+        self.n_features_in_ = self.model_.centroids.column_count
+        self.labels_ = np.arange(self.model_.centroids.row_count)
+        return self
+    @cluster_centers_.deleter
+    def cluster_centers_(self):
+        del self._cluster_centers_
+    def _predict(self, X, module, queue=None, result_options=None):
+        is_csr = _is_csr(X)
+        policy = self._get_policy(queue, X)
+        X = _convert_to_supported(policy, X)
+        X_table, dtype = to_table(X), X.dtype
+        params = self._get_onedal_params(is_csr, dtype, result_options)
+        result = module.infer(policy, params, self.model_, X_table)
+        if (
+            result_options == "compute_exact_objective_function"
+        ):  # This is only set for score function
+            return result.objective_function_value * (-1)
+        else:
+            return from_table(result.responses).ravel()
+    def _score(self, X, module, queue=None):
+        result_options = "compute_exact_objective_function"
+        return self._predict(
+            X, self._get_backend("kmeans", "clustering", None), queue, result_options
+        )
+    def _transform(self, X):
+        return euclidean_distances(X, self.cluster_centers_)
+class KMeans(_BaseKMeans):
+    def __init__(
+        self,
+        n_clusters=8,
+        *,
+        init="k-means++",
+        n_init="auto",
+        max_iter=300,
+        tol=1e-4,
+        verbose=0,
+        random_state=None,
+        copy_x=True,
+        algorithm="lloyd",
+    ):
+        super().__init__(
+            n_clusters=n_clusters,
+            init=init,
+            n_init=n_init,
+            max_iter=max_iter,
+            tol=tol,
+            verbose=verbose,
+            random_state=random_state,
+        )
+        self.copy_x = copy_x
+        self.algorithm = algorithm
+        assert self.algorithm == "lloyd"
+    def fit(self, X, y=None, queue=None):
+        return super()._fit(X, self._get_backend("kmeans", "clustering", None), queue)
+    def predict(self, X, queue=None):
+        """Predict the closest cluster each sample in X belongs to.
+        In the vector quantization literature, `cluster_centers_` is called
+        the code book and each value returned by `predict` is the index of
+        the closest code in the code book.
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            New data to predict.
+        Returns
+        -------
+        labels : ndarray of shape (n_samples,)
+            Index of the cluster each sample belongs to.
+        """
+        return super()._predict(X, self._get_backend("kmeans", "clustering", None), queue)
+    def fit_predict(self, X, y=None, queue=None):
+        """Compute cluster centers and predict cluster index for each sample.
+        Convenience method; equivalent to calling fit(X) followed by
+        predict(X).
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            New data to transform.
+        y : Ignored
+            Not used, present here for API consistency by convention.
+        Returns
+        -------
+        labels : ndarray of shape (n_samples,)
+            Index of the cluster each sample belongs to.
+        """
+        return self.fit(X, queue=queue).labels_
+    def fit_transform(self, X, y=None, queue=None):
+        """Compute clustering and transform X to cluster-distance space.
+        Equivalent to fit(X).transform(X), but more efficiently implemented.
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            New data to transform.
+        y : Ignored
+            Not used, present here for API consistency by convention.
+        Returns
+        -------
+        X_new : ndarray of shape (n_samples, n_clusters)
+            X transformed in the new space.
+        """
+        return self.fit(X, queue=queue)._transform(X)
+    def transform(self, X):
+        """Transform X to a cluster-distance space.
+        In the new space, each dimension is the distance to the cluster
+        centers. Note that even if X is sparse, the array returned by
+        `transform` will typically be dense.
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            New data to transform.
+        Returns
+        -------
+        X_new : ndarray of shape (n_samples, n_clusters)
+            X transformed in the new space.
+        """
+        return self._transform(X)
+    def score(self, X, queue=None):
+        """Opposite of the value of X on the K-means objective.
+        Parameters
+        ----------
+        X: {array-like, sparse matrix} of shape (n_samples, n_features)
+            New data.
+        Returns
+        -------
+        score: float
+            Opposite of the value of X on the K-means objective.
+        """
+        return super()._score(X, self._get_backend("kmeans", "clustering", None), queue)
+def k_means(
+    X,
+    n_clusters,
+    *,
+    init="k-means++",
+    n_init="auto",
+    max_iter=300,
+    verbose=False,
+    tol=1e-4,
+    random_state=None,
+    copy_x=True,
+    algorithm="lloyd",
+    return_n_iter=False,
+    queue=None,
+):
+    est = KMeans(
+        n_clusters=n_clusters,
+        init=init,
+        n_init=n_init,
+        max_iter=max_iter,
+        verbose=verbose,
+        tol=tol,
+        random_state=random_state,
+        copy_x=copy_x,
+        algorithm=algorithm,
+    ).fit(X, queue=queue)
+    if return_n_iter:
+        return est.cluster_centers_, est.labels_, est.inertia_, est.n_iter_
+    else:
+        return est.cluster_centers_, est.labels_, est.inertia_

onedal/cluster/kmeans_init.py ADDED Viewed

@@ -0,0 +1,115 @@
+# ==============================================================================
+# Copyright 2023 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+import numpy as np
+from scipy.sparse import issparse
+from sklearn.utils import check_random_state
+from daal4py.sklearn._utils import daal_check_version, get_dtype
+from ..common._base import BaseEstimator as onedal_BaseEstimator
+from ..datatypes import _convert_to_supported, from_table, to_table
+from ..utils import _check_array
+if daal_check_version((2023, "P", 200)):
+    class KMeansInit(onedal_BaseEstimator):
+        """
+        KMeansInit oneDAL implementation.
+        """
+        def __init__(
+            self,
+            cluster_count,
+            seed=777,
+            local_trials_count=None,
+            algorithm="plus_plus_dense",
+        ):
+            self.cluster_count = cluster_count
+            self.seed = seed
+            self.local_trials_count = local_trials_count
+            self.algorithm = algorithm
+            if local_trials_count is None:
+                self.local_trials_count = 2 + int(np.log(cluster_count))
+            else:
+                self.local_trials_count = local_trials_count
+        def _get_onedal_params(self, dtype=np.float32):
+            return {
+                "fptype": "float" if dtype == np.float32 else "double",
+                "local_trials_count": self.local_trials_count,
+                "method": self.algorithm,
+                "seed": self.seed,
+                "cluster_count": self.cluster_count,
+            }
+        def _get_params_and_input(self, X, policy):
+            X = _check_array(
+                X,
+                dtype=[np.float64, np.float32],
+                accept_sparse="csr",
+                force_all_finite=False,
+            )
+            X = _convert_to_supported(policy, X)
+            dtype = get_dtype(X)
+            params = self._get_onedal_params(dtype)
+            return (params, to_table(X), dtype)
+        def _compute_raw(self, X_table, module, policy, dtype=np.float32):
+            params = self._get_onedal_params(dtype)
+            result = module.compute(policy, params, X_table)
+            return result.centroids
+        def _compute(self, X, module, queue):
+            policy = self._get_policy(queue, X)
+            # oneDAL KMeans Init for sparse data does not have GPU support
+            if issparse(X):
+                policy = self._get_policy(None, None)
+            _, X_table, dtype = self._get_params_and_input(X, policy)
+            centroids = self._compute_raw(X_table, module, policy, dtype)
+            return from_table(centroids)
+        def compute_raw(self, X_table, policy, dtype=np.float32):
+            return self._compute_raw(
+                X_table, self._get_backend("kmeans_init", "init", None), policy, dtype
+            )
+        def compute(self, X, queue=None):
+            return self._compute(X, self._get_backend("kmeans_init", "init", None), queue)
+    def kmeans_plusplus(
+        X,
+        n_clusters,
+        *,
+        x_squared_norms=None,
+        random_state=None,
+        n_local_trials=None,
+        queue=None,
+    ):
+        random_seed = check_random_state(random_state).tomaxint()
+        return (
+            KMeansInit(
+                n_clusters, seed=random_seed, local_trials_count=n_local_trials
+            ).compute(X, queue),
+            np.full(n_clusters, -1),
+        )