coreLearn 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- coreLearn/__init__.py +17 -0
- coreLearn/base.py +25 -0
- coreLearn/distances.py +125 -0
- coreLearn/evaluator.py +143 -0
- coreLearn/knn.py +210 -0
- coreLearn/linear_regression.py +205 -0
- coreLearn/tests/__init__.py +0 -0
- coreLearn/tests/test_distances.py +187 -0
- coreLearn/tests/test_evaluator.py +104 -0
- coreLearn/tests/test_knn.py +101 -0
- coreLearn/tests/test_linear_regression.py +154 -0
- corelearn-0.1.0.dist-info/METADATA +482 -0
- corelearn-0.1.0.dist-info/RECORD +15 -0
- corelearn-0.1.0.dist-info/WHEEL +5 -0
- corelearn-0.1.0.dist-info/top_level.txt +1 -0
coreLearn/__init__.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
"""coreLearn — public API."""
|
|
2
|
+
|
|
3
|
+
from .knn import KNNClassifier
|
|
4
|
+
from .linear_regression import LinearRegression
|
|
5
|
+
from .evaluator import Evaluator, accuracy, mae, mse, rmse, precision, recall, f1_score
|
|
6
|
+
from .distances import DistanceMetric, DistanceMetricFactory
|
|
7
|
+
|
|
8
|
+
__version__ = "0.1.0"
|
|
9
|
+
|
|
10
|
+
__all__ = [
|
|
11
|
+
"KNNClassifier",
|
|
12
|
+
"LinearRegression",
|
|
13
|
+
"Evaluator",
|
|
14
|
+
"accuracy", "mae", "mse", "rmse",
|
|
15
|
+
"precision", "recall", "f1_score",
|
|
16
|
+
"DistanceMetric", "DistanceMetricFactory",
|
|
17
|
+
]
|
coreLearn/base.py
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
from abc import ABC, abstractmethod
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class BaseModel(ABC):
    """
    Common interface shared by every model in the package.

    Follows the **Template Method** pattern: ``fit_predict`` fixes the order
    of operations (train first, then predict) while concrete subclasses
    provide the actual ``fit`` and ``predict`` steps.
    """

    @abstractmethod
    def fit(self, X, y) -> "BaseModel":
        """Learn from feature matrix ``X`` and label vector ``y``."""
        ...

    @abstractmethod
    def predict(self, X) -> list:
        """Produce predictions for feature matrix ``X``."""
        ...

    def fit_predict(self, X_train, y_train, X_test) -> list:
        """Template method: fit on ``X_train``/``y_train``, then predict ``X_test``."""
        self.fit(X_train, y_train)
        predictions = self.predict(X_test)
        return predictions
|
coreLearn/distances.py
ADDED
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
from abc import ABC, abstractmethod
|
|
2
|
+
|
|
3
|
+
import numpy as np
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class DistanceMetric(ABC):
    """
    Base class for distance functions between two points.

    Concrete metrics implement ``compute()`` only. Instances are also
    callable, so ``metric(a, b)`` is shorthand for ``metric.compute(a, b)``.
    """

    @abstractmethod
    def compute(self, a: list, b: list) -> float:
        """Return the distance between points ``a`` and ``b``."""

    def __call__(self, a: list, b: list) -> float:
        """Delegate to ``compute`` so the instance can be used like a function."""
        distance = self.compute(a, b)
        return distance
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class EuclideanDistance(DistanceMetric):
    """
    Euclidean (L2) distance: √Σ(aᵢ − bᵢ)²

    A general-purpose choice for continuous, scaled features.
    """

    def compute(self, a: list, b: list) -> float:
        """Return the square root of the summed squared coordinate differences."""
        delta = np.array(a) - np.array(b)
        squared_total = np.sum(delta ** 2)
        return float(np.sqrt(squared_total))
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class ManhattanDistance(DistanceMetric):
    """
    Manhattan (L1) distance: Σ|aᵢ − bᵢ|

    Often preferred to Euclidean distance on grid-like spaces or when
    robustness to outliers matters.
    """

    def compute(self, a: list, b: list) -> float:
        """Return the sum of absolute coordinate differences."""
        delta = np.array(a) - np.array(b)
        absolute_total = np.sum(np.abs(delta))
        return float(absolute_total)
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
class DistanceMetricFactory:
    """
    **Factory Pattern** — builds ``DistanceMetric`` instances from a name.

    Consumers such as ``KNNClassifier`` pass a metric name to this factory
    instead of importing the concrete classes (``EuclideanDistance``, etc.).
    As a result:

    * a new metric can be added without touching ``KNNClassifier``;
    * metric classes stay decoupled from the code that uses them.

    Usage::

        metric = DistanceMetricFactory.create("euclidean")
        dist   = metric([1, 2], [4, 6])   # → 5.0

    Registering a new metric::

        class ChebyshevDistance(DistanceMetric):
            def compute(self, a, b):
                return float(max(abs(x - y) for x, y in zip(a, b)))

        DistanceMetricFactory.register("chebyshev", ChebyshevDistance)
    """

    # Name -> metric class. Shared by all callers; ``register`` mutates it.
    _registry: dict[str, type[DistanceMetric]] = {
        "euclidean": EuclideanDistance,
        "manhattan": ManhattanDistance,
    }

    @classmethod
    def create(cls, name: str) -> "DistanceMetric":
        """
        Build a fresh ``DistanceMetric`` instance for ``name``.

        Parameters
        ----------
        name : str
            Registered metric name — ``available()`` lists the options.

        Raises
        ------
        ValueError
            If ``name`` has not been registered.
        """
        metric_class = cls._registry.get(name)
        if metric_class is None:
            raise ValueError(
                f"Unknown distance metric: '{name}'. "
                f"Available: {cls.available()}"
            )
        return metric_class()

    @classmethod
    def available(cls) -> list[str]:
        """Return the names of all registered metrics."""
        return list(cls._registry.keys())

    @classmethod
    def register(cls, name: str, metric_class: type[DistanceMetric]) -> None:
        """
        Register ``metric_class`` under ``name``, replacing any existing entry.

        Parameters
        ----------
        name         : str                  — Lookup key.
        metric_class : type[DistanceMetric] — Concrete class (not an instance).

        Raises
        ------
        TypeError
            If ``metric_class`` is not a subclass of ``DistanceMetric``.
        """
        # isinstance is checked first so issubclass never sees a non-class.
        if not isinstance(metric_class, type) or not issubclass(metric_class, DistanceMetric):
            raise TypeError(
                f"{metric_class} must be a subclass of DistanceMetric."
            )
        cls._registry[name] = metric_class
|
coreLearn/evaluator.py
ADDED
|
@@ -0,0 +1,143 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
# ---------------------------------------------------------------------------
|
|
5
|
+
# Regression metrics
|
|
6
|
+
# ---------------------------------------------------------------------------
|
|
7
|
+
|
|
8
|
+
def mae(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """
    Mean Absolute Error: the average of ``|y_true - y_pred|``.

    Raises
    ------
    ValueError
        If ``y_true`` is empty or the two inputs differ in length.
    """
    actual = np.array([*y_true], dtype=float)
    predicted = np.array([*y_pred], dtype=float)
    n_actual, n_predicted = len(actual), len(predicted)
    if n_actual == 0:
        raise ValueError("y_true must not be empty.")
    if n_actual != n_predicted:
        raise ValueError(f"y_true and y_pred must have the same length: {n_actual} != {n_predicted}")
    return float(np.abs(actual - predicted).mean())
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def mse(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """
    Mean Squared Error: the average of ``(y_true - y_pred) ** 2``.

    Raises
    ------
    ValueError
        If ``y_true`` is empty or the two inputs differ in length.
    """
    actual = np.array([*y_true], dtype=float)
    predicted = np.array([*y_pred], dtype=float)
    n_actual, n_predicted = len(actual), len(predicted)
    if n_actual == 0:
        raise ValueError("y_true must not be empty.")
    if n_actual != n_predicted:
        raise ValueError(f"y_true and y_pred must have the same length: {n_actual} != {n_predicted}")
    residuals = actual - predicted
    return float(np.mean(residuals ** 2))
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def rmse(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """Root Mean Squared Error: the square root of ``mse(y_true, y_pred)``."""
    mean_squared = mse(y_true, y_pred)
    return float(np.sqrt(mean_squared))
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
# ---------------------------------------------------------------------------
|
|
36
|
+
# Classification metrics
|
|
37
|
+
# ---------------------------------------------------------------------------
|
|
38
|
+
|
|
39
|
+
def accuracy(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """
    Share of positions where the predicted label equals the true label.

    Raises
    ------
    ValueError
        If ``y_true`` is empty or the two inputs differ in length.
    """
    truth: list = [*y_true]
    guesses: list = [*y_pred]
    total = len(truth)
    if total == 0:
        raise ValueError("y_true must not be empty.")
    if total != len(guesses):
        raise ValueError(f"y_true and y_pred must have the same length: {total} != {len(guesses)}")
    matches = [expected == got for expected, got in zip(truth, guesses)]
    return sum(matches) / total
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def precision(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """
    Macro-averaged precision across all classes present in ``y_true``.

    For each class ``c`` the score is ``TP / (TP + FP)``, or ``0.0`` when the
    class was never predicted. The result is the unweighted mean of the
    per-class scores.

    Raises
    ------
    ValueError
        If ``y_true`` is empty or the two inputs differ in shape.
    """
    yt: np.ndarray = np.asarray(y_true)
    yp: np.ndarray = np.asarray(y_pred)
    # Validate like the other metrics: an empty input would average to NaN,
    # and a shape mismatch could broadcast silently to a wrong result.
    if yt.size == 0:
        raise ValueError("y_true must not be empty.")
    if yt.shape != yp.shape:
        raise ValueError(f"y_true and y_pred must have the same shape: {yt.shape} != {yp.shape}")

    scores: list[float] = []
    for c in np.unique(yt):
        predicted_c = yp == c
        tp: int = int(np.sum(predicted_c & (yt == c)))
        fp: int = int(np.sum(predicted_c & (yt != c)))
        scores.append(tp / (tp + fp) if (tp + fp) > 0 else 0.0)
    return float(np.mean(scores))
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def recall(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """
    Macro-averaged recall across all classes present in ``y_true``.

    For each class ``c`` the score is ``TP / (TP + FN)``, or ``0.0`` when the
    denominator is zero. The result is the unweighted mean of the per-class
    scores.

    Raises
    ------
    ValueError
        If ``y_true`` is empty or the two inputs differ in shape.
    """
    yt: np.ndarray = np.asarray(y_true)
    yp: np.ndarray = np.asarray(y_pred)
    # Validate like the other metrics: an empty input would average to NaN,
    # and a shape mismatch could broadcast silently to a wrong result.
    if yt.size == 0:
        raise ValueError("y_true must not be empty.")
    if yt.shape != yp.shape:
        raise ValueError(f"y_true and y_pred must have the same shape: {yt.shape} != {yp.shape}")

    scores: list[float] = []
    for c in np.unique(yt):
        actual_c = yt == c
        tp: int = int(np.sum((yp == c) & actual_c))
        fn: int = int(np.sum((yp != c) & actual_c))
        scores.append(tp / (tp + fn) if (tp + fn) > 0 else 0.0)
    return float(np.mean(scores))
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def f1_score(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """
    Macro-averaged F1 score across all classes present in ``y_true``.

    For each class the F1 score is the harmonic mean of that class's
    precision and recall (``0.0`` when both are zero). The result is the
    unweighted mean of the per-class scores.

    Raises
    ------
    ValueError
        If ``y_true`` is empty or the two inputs differ in shape.
    """
    yt: np.ndarray = np.asarray(y_true)
    yp: np.ndarray = np.asarray(y_pred)
    # Validate like the other metrics: an empty input would average to NaN,
    # and a shape mismatch could broadcast silently to a wrong result.
    if yt.size == 0:
        raise ValueError("y_true must not be empty.")
    if yt.shape != yp.shape:
        raise ValueError(f"y_true and y_pred must have the same shape: {yt.shape} != {yp.shape}")

    scores: list[float] = []
    for c in np.unique(yt):
        tp: int = int(np.sum((yp == c) & (yt == c)))
        fp: int = int(np.sum((yp == c) & (yt != c)))
        fn: int = int(np.sum((yp != c) & (yt == c)))
        p: float = tp / (tp + fp) if (tp + fp) > 0 else 0.0
        r: float = tp / (tp + fn) if (tp + fn) > 0 else 0.0
        scores.append(2 * p * r / (p + r) if (p + r) > 0 else 0.0)
    return float(np.mean(scores))
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
# ---------------------------------------------------------------------------
|
|
93
|
+
# Evaluator
|
|
94
|
+
# ---------------------------------------------------------------------------
|
|
95
|
+
|
|
96
|
+
class Evaluator:
    """
    Runs every registered metric of one kind: regression or classification.

    The built-in metrics are registered up front. Further metrics can be
    added at runtime through ``register()`` without editing this class
    (Open/Closed Principle).
    """

    # Both registries are class-level and shared; ``register`` mutates them.
    _regression_metrics: dict[str, callable] = {
        "mae": mae,
        "mse": mse,
        "rmse": rmse,
    }

    _classification_metrics: dict[str, callable] = {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1_score,
    }

    @classmethod
    def register(cls, name: str, fn: callable, kind: str = "regression") -> None:
        """
        Add (or replace) a metric function in one of the registries.

        Parameters
        ----------
        name : key under which the metric's result is reported
        fn   : callable with signature (y_true, y_pred) -> float
        kind : 'regression' (default) or 'classification'

        Raises
        ------
        ValueError
            If ``kind`` is neither 'regression' nor 'classification'.
        """
        if kind not in ("regression", "classification"):
            raise ValueError(f"Unknown kind '{kind}'. Use 'regression' or 'classification'.")
        if kind == "regression":
            registry = cls._regression_metrics
        else:
            registry = cls._classification_metrics
        registry[name] = fn

    @classmethod
    def evaluate_regression(cls, y_true: np.ndarray, y_pred: np.ndarray) -> dict[str, float]:
        """Apply every registered regression metric and return name -> score."""
        report: dict[str, float] = {}
        for metric_name, metric_fn in cls._regression_metrics.items():
            report[metric_name] = metric_fn(y_true, y_pred)
        return report

    @classmethod
    def evaluate_classification(cls, y_true: np.ndarray, y_pred: np.ndarray) -> dict[str, float]:
        """Apply every registered classification metric and return name -> score."""
        report: dict[str, float] = {}
        for metric_name, metric_fn in cls._classification_metrics.items():
            report[metric_name] = metric_fn(y_true, y_pred)
        return report
|
coreLearn/knn.py
ADDED
|
@@ -0,0 +1,210 @@
|
|
|
1
|
+
from concurrent.futures import ProcessPoolExecutor
|
|
2
|
+
|
|
3
|
+
import numpy as np
|
|
4
|
+
|
|
5
|
+
from .base import BaseModel
|
|
6
|
+
from .distances import DistanceMetricFactory
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
# ---------------------------------------------------------------------------
|
|
10
|
+
# Helpers
|
|
11
|
+
# ---------------------------------------------------------------------------
|
|
12
|
+
|
|
13
|
+
def _majority_vote(labels: list):
|
|
14
|
+
"""Return the most frequent label in the list."""
|
|
15
|
+
counts: dict = {}
|
|
16
|
+
for label in labels:
|
|
17
|
+
counts[label] = counts.get(label, 0) + 1
|
|
18
|
+
return max(counts, key=lambda k: counts[k])
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def _predict_worker(args: tuple):
    """
    Classify one sample; entry point for ``ProcessPoolExecutor`` workers.

    Kept at module level (not nested) so multiprocessing can pickle it and
    ship it to worker processes.

    ``args`` is a ``(tree, sample, k, metric)`` tuple. The tree and metric
    reach the worker as pickled copies, so nothing is shared between
    processes.
    """
    tree, sample, k, metric = args
    return _majority_vote(tree.nearest_k(sample, k, metric))
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
# ---------------------------------------------------------------------------
|
|
37
|
+
# KD-Tree
|
|
38
|
+
# ---------------------------------------------------------------------------
|
|
39
|
+
|
|
40
|
+
class KDNode:
    """A single node in a KD-Tree: one point, its label and two subtrees."""

    def __init__(self, point: list, label, left=None, right=None):
        self.point = point
        self.label = label
        self.left = left
        self.right = right


class KDTree:
    """
    Binary space-partitioning tree for fast nearest-neighbour lookups.

    Both ``_build`` and ``_search`` are **recursive**, satisfying the
    Recursion learning outcome.

    Parameters
    ----------
    points : list of equal-length coordinate sequences
    labels : list of labels, paired with ``points`` by position
    """

    def __init__(self, points: list, labels: list):
        data = list(zip(points, labels))
        self.root = self._build(data, depth=0)

    # -- recursive build --

    def _build(self, data: list, depth: int):
        """
        Split ``data`` along alternating axes and return the subtree root.

        ``data`` is a list of ``(point, label)`` pairs and is sorted in place.
        Returns ``None`` for an empty list.
        """
        if not data:
            return None
        n_dims = len(data[0][0])
        axis = depth % n_dims
        data.sort(key=lambda item: item[0][axis])
        mid = len(data) // 2  # the median along `axis` becomes this node
        return KDNode(
            point=data[mid][0],
            label=data[mid][1],
            left=self._build(data[:mid], depth + 1),
            right=self._build(data[mid + 1:], depth + 1),
        )

    # -- public query --

    def nearest_k(self, target: list, k: int, metric) -> list:
        """
        Return the labels of the ``k`` nearest neighbours to *target*.

        Labels are ordered from nearest to farthest. ``metric`` is any
        callable ``metric(a, b) -> float``.
        """
        best: list = []
        self._search(self.root, target, k, metric, depth=0, best=best)
        # `_search` keeps `best` sorted by distance, so no final sort is needed.
        return [label for _, label in best[:k]]

    # -- recursive search --

    def _search(self, node, target: list, k: int,
                metric, depth: int, best: list) -> None:
        """
        Visit ``node``'s subtree, updating ``best`` in place.

        ``best`` holds at most ``k`` ``(distance, label)`` pairs and is kept
        sorted by ascending distance at all times, so ``best[-1]`` is always
        the current farthest candidate. Both the replacement test and the
        pruning test below rely on that invariant.
        """
        if node is None:
            return
        dist = metric(target, node.point)

        if len(best) < k or dist < best[-1][0]:
            if len(best) >= k:
                best.pop()  # evict the current farthest candidate
            best.append((dist, node.label))
            # Sort on distance only so labels are never compared on ties.
            # Re-sorting while the list is still filling is required too:
            # otherwise best[-1] may not be the worst entry and a true
            # neighbour can be evicted later.
            best.sort(key=lambda pair: pair[0])

        axis = depth % len(target)
        diff = target[axis] - node.point[axis]
        near, far = (node.left, node.right) if diff <= 0 else (node.right, node.left)

        self._search(near, target, k, metric, depth + 1, best)
        # The far side can only help if the splitting plane is closer than the
        # worst candidate so far.
        # NOTE(review): this assumes `metric` is never smaller than a
        # single-axis difference (true for Euclidean/Manhattan) — confirm for
        # custom metrics.
        if len(best) < k or abs(diff) < best[-1][0]:
            self._search(far, target, k, metric, depth + 1, best)
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
# ---------------------------------------------------------------------------
|
|
113
|
+
# KNN Classifier
|
|
114
|
+
# ---------------------------------------------------------------------------
|
|
115
|
+
|
|
116
|
+
class KNNClassifier(BaseModel):
    """
    K-Nearest Neighbours classifier.

    Uses a KD-Tree for efficient lookups (**Recursion**) and
    ``ProcessPoolExecutor`` for **parallel** prediction across test samples
    (**Concurrency**). Unlike threading, each worker runs in a separate
    process with its own GIL — enabling true CPU-bound parallelism.

    Parameters
    ----------
    k        : number of neighbours
    distance : 'euclidean' (default) or 'manhattan'
    n_jobs   : number of parallel worker processes during predict
               (1 = no multiprocessing, sequential)

    Raises
    ------
    ValueError
        If ``k`` or ``n_jobs`` is not a positive integer, or ``distance`` is
        not a registered metric name.
    """

    def __init__(self, k: int = 5, distance: str = "euclidean", n_jobs: int = 1):
        if not isinstance(k, int) or k < 1:
            raise ValueError(f"k must be a positive integer, got: {k!r}.")
        if not isinstance(n_jobs, int) or n_jobs < 1:
            raise ValueError(f"n_jobs must be a positive integer, got: {n_jobs!r}.")
        self.k = k
        self.n_jobs = n_jobs
        self._metric = DistanceMetricFactory.create(distance)
        self._tree: KDTree | None = None  # built by fit()
        self._n_train: int = 0
        self._n_features: int = 0

    def fit(self, X, y) -> "KNNClassifier":
        """
        Build the internal KD-Tree from training data.

        Parameters
        ----------
        X : 2-D array-like of shape (n_samples, n_features), convertible to float
        y : iterable of labels, one per row of ``X``

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``X`` is not 2-D, ``X`` or ``y`` is empty, their lengths differ,
            or ``k`` exceeds the number of training samples.
        """
        X = np.array(X, dtype=float)
        y = list(y)

        if X.ndim != 2:
            raise ValueError(
                f"X must be 2-D (n_samples, n_features), got shape {X.shape}."
            )
        if len(X) == 0:
            raise ValueError("Training data X must not be empty.")
        if len(y) == 0:
            raise ValueError("Label vector y must not be empty.")
        if len(X) != len(y):
            raise ValueError(
                f"X and y must have the same number of samples: {len(X)} != {len(y)}."
            )
        if self.k > len(X):
            raise ValueError(
                f"k ({self.k}) cannot be greater than the number of "
                f"training samples ({len(X)})."
            )

        self._n_train = len(X)
        self._n_features = X.shape[1]
        self._tree = KDTree(X.tolist(), y)
        return self

    def _predict_one(self, x: list):
        """Classify a single sample by majority vote among k neighbours."""
        neighbours = self._tree.nearest_k(x, self.k, self._metric)
        return _majority_vote(neighbours)

    def predict(self, X) -> list:
        """
        Predict class labels for all samples in X.

        n_jobs=1 : sequential prediction (no process overhead).
        n_jobs>1 : samples are distributed across ``n_jobs`` worker processes
                   via ``ProcessPoolExecutor``. Each process receives a
                   pickled copy of the KD-Tree and metric — no shared memory,
                   no data race, true CPU-level parallelism.

        Returns
        -------
        list of predicted labels, in the same order as the rows of ``X``
        (an empty list when ``X`` is empty).

        Raises
        ------
        RuntimeError
            If called before ``fit()``.
        ValueError
            If ``X`` is not 2-D or its feature count differs from training.
        """
        if self._tree is None:
            raise RuntimeError("Call fit() before predict().")
        raw = list(X) if not isinstance(X, np.ndarray) else X
        if len(raw) == 0:
            return []
        X = np.array(X, dtype=float)
        if X.ndim != 2:
            raise ValueError(
                f"X must be 2-D (n_samples, n_features), got shape {X.shape}."
            )
        if X.shape[1] != self._n_features:
            raise ValueError(
                f"Feature count mismatch: model was trained with {self._n_features} "
                f"features, but X has {X.shape[1]}."
            )
        samples: list = X.tolist()

        if self.n_jobs == 1:
            return [self._predict_one(x) for x in samples]

        args = [(self._tree, x, self.k, self._metric) for x in samples]
        # Send samples in one batch per worker (ceil division). Every task
        # tuple references the same tree object, so batching lets it be
        # serialised once per batch instead of once per sample. Result order
        # is unchanged.
        chunksize = -(-len(args) // self.n_jobs)
        with ProcessPoolExecutor(max_workers=self.n_jobs) as executor:
            return list(executor.map(_predict_worker, args, chunksize=chunksize))
|