coreLearn 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
coreLearn/__init__.py ADDED
@@ -0,0 +1,17 @@
1
+ """coreLearn — public API."""
2
+
3
+ from .knn import KNNClassifier
4
+ from .linear_regression import LinearRegression
5
+ from .evaluator import Evaluator, accuracy, mae, mse, rmse, precision, recall, f1_score
6
+ from .distances import DistanceMetric, DistanceMetricFactory
7
+
8
+ __version__ = "0.1.0"
9
+
10
+ __all__ = [
11
+ "KNNClassifier",
12
+ "LinearRegression",
13
+ "Evaluator",
14
+ "accuracy", "mae", "mse", "rmse",
15
+ "precision", "recall", "f1_score",
16
+ "DistanceMetric", "DistanceMetricFactory",
17
+ ]
coreLearn/base.py ADDED
@@ -0,0 +1,25 @@
1
+ from abc import ABC, abstractmethod
2
+
3
+
4
class BaseModel(ABC):
    """
    Common interface shared by every model in the package.

    ``fit`` and ``predict`` are abstract and must be supplied by subclasses.
    ``fit_predict`` is a **Template Method**: it fixes the order of the two
    steps (train first, then predict) while subclasses provide the steps.
    """

    @abstractmethod
    def fit(self, X, y) -> "BaseModel":
        """Learn from feature matrix ``X`` and label vector ``y``."""

    @abstractmethod
    def predict(self, X) -> list:
        """Produce predictions for feature matrix ``X``."""

    def fit_predict(self, X_train, y_train, X_test) -> list:
        """Train on ``X_train``/``y_train``, then return predictions for ``X_test``."""
        # The value returned by fit() is intentionally not used: prediction is
        # always invoked on this very instance.
        self.fit(X_train, y_train)
        predictions = self.predict(X_test)
        return predictions
coreLearn/distances.py ADDED
@@ -0,0 +1,125 @@
1
+ from abc import ABC, abstractmethod
2
+
3
+ import numpy as np
4
+
5
+
6
class DistanceMetric(ABC):
    """
    Abstract distance metric.

    Subclasses only need to implement ``compute()``.
    ``__call__`` support lets an instance be used like a function:
    ``metric(a, b)`` → ``metric.compute(a, b)``
    """

    @abstractmethod
    def compute(self, a: list, b: list) -> float:
        """Compute and return the distance between two points."""

    def __call__(self, a: list, b: list) -> float:
        """Allow the object to be called as a function."""
        return self.compute(a, b)

    @staticmethod
    def _difference(a, b) -> np.ndarray:
        """
        Return ``a - b`` as a float array after checking the shapes match.

        Both points are converted to ``float`` first, so unsigned-integer
        inputs cannot wrap around on subtraction (e.g. uint8 ``1 - 2``
        would otherwise give ``255``).

        Raises
        ------
        ValueError
            If the two points do not have the same shape. Without this
            check NumPy would silently broadcast e.g. ``[1]`` against
            ``[1, 2, 3]`` and return a meaningless distance.
        """
        a_arr = np.asarray(a, dtype=float)
        b_arr = np.asarray(b, dtype=float)
        if a_arr.shape != b_arr.shape:
            raise ValueError(
                f"Points must have the same shape: {a_arr.shape} != {b_arr.shape}."
            )
        return a_arr - b_arr


class EuclideanDistance(DistanceMetric):
    """
    Euclidean (L2) distance: √Σ(aᵢ − bᵢ)²

    General-purpose; suitable for continuous, scaled data.
    """

    def compute(self, a: list, b: list) -> float:
        """Return the L2 distance between ``a`` and ``b``.

        Raises ``ValueError`` if the points differ in shape.
        """
        diff = self._difference(a, b)
        return float(np.sqrt(np.sum(diff ** 2)))


class ManhattanDistance(DistanceMetric):
    """
    Manhattan (L1) distance: Σ|aᵢ − bᵢ|

    Preferred over Euclidean for grid-like spaces or when
    robustness to outliers is desired.
    """

    def compute(self, a: list, b: list) -> float:
        """Return the L1 distance between ``a`` and ``b``.

        Raises ``ValueError`` if the points differ in shape.
        """
        diff = self._difference(a, b)
        return float(np.sum(np.abs(diff)))
47
+
48
+
49
class DistanceMetricFactory:
    """
    **Factory Pattern** — creates ``DistanceMetric`` instances by name.

    Consumers pass a metric name to this factory instead of importing the
    concrete classes (``EuclideanDistance``, etc.) directly. Benefits:

    * Adding a new metric requires no changes to consumer code.
    * Decouples metric classes from consumer code (loose coupling).

    The registry is a class-level dict, so registrations are shared by the
    whole process.

    Usage::

        metric = DistanceMetricFactory.create("euclidean")
        dist = metric([1, 2], [4, 6])   # → 5.0

    Registering a new metric::

        class ChebyshevDistance(DistanceMetric):
            def compute(self, a, b):
                return float(max(abs(x - y) for x, y in zip(a, b)))

        DistanceMetricFactory.register("chebyshev", ChebyshevDistance)
    """

    _registry: dict[str, type[DistanceMetric]] = {
        "euclidean": EuclideanDistance,
        "manhattan": ManhattanDistance,
    }

    @classmethod
    def create(cls, name: str) -> "DistanceMetric":
        """
        Instantiate and return a ``DistanceMetric`` for the given name.

        Parameters
        ----------
        name : str
            Metric name — see ``available()`` for registered options.

        Raises
        ------
        ValueError
            If an unknown name is provided.
        """
        metric_class = cls._registry.get(name)
        if metric_class is None:
            raise ValueError(
                f"Unknown distance metric: '{name}'. "
                f"Available: {cls.available()}"
            )
        return metric_class()

    @classmethod
    def available(cls) -> list[str]:
        """Return a list of all registered metric names."""
        return list(cls._registry)

    @classmethod
    def register(cls, name: str, metric_class: type[DistanceMetric]) -> None:
        """
        Add a new distance metric class to the registry.

        Registering under an existing name replaces the previous entry.

        Parameters
        ----------
        name : str — Lookup key.
        metric_class : type[DistanceMetric] — Concrete class (not yet instantiated).

        Raises
        ------
        TypeError
            If ``metric_class`` is not a subclass of ``DistanceMetric``, or
            if it is still abstract (it has unimplemented abstract methods).
        """
        if not (isinstance(metric_class, type) and
                issubclass(metric_class, DistanceMetric)):
            raise TypeError(
                f"{metric_class} must be a subclass of DistanceMetric."
            )
        # Fail fast: an abstract class would be accepted here but could never
        # be instantiated by create(). ABCMeta records the names of the
        # still-abstract methods in ``__abstractmethods__``.
        missing = getattr(metric_class, "__abstractmethods__", None)
        if missing:
            raise TypeError(
                f"{metric_class} is abstract and cannot be registered; "
                f"unimplemented methods: {sorted(missing)}."
            )
        cls._registry[name] = metric_class
coreLearn/evaluator.py ADDED
@@ -0,0 +1,143 @@
1
+ import numpy as np
2
+
3
+
4
+ # ---------------------------------------------------------------------------
5
+ # Regression metrics
6
+ # ---------------------------------------------------------------------------
7
+
8
def mae(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """
    Mean Absolute Error: the average of ``|y_true - y_pred|``.

    Raises
    ------
    ValueError
        If ``y_true`` is empty or the two inputs differ in length.
    """
    actual = np.array(list(y_true), dtype=float)
    predicted = np.array(list(y_pred), dtype=float)
    n_actual, n_predicted = len(actual), len(predicted)

    if n_actual == 0:
        raise ValueError("y_true must not be empty.")
    if n_actual != n_predicted:
        raise ValueError(
            f"y_true and y_pred must have the same length: {n_actual} != {n_predicted}"
        )

    absolute_errors = np.abs(actual - predicted)
    return float(absolute_errors.mean())
17
+
18
+
19
def mse(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """
    Mean Squared Error: the average of ``(y_true - y_pred) ** 2``.

    Raises
    ------
    ValueError
        If ``y_true`` is empty or the two inputs differ in length.
    """
    actual = np.array(list(y_true), dtype=float)
    predicted = np.array(list(y_pred), dtype=float)
    n_actual, n_predicted = len(actual), len(predicted)

    if n_actual == 0:
        raise ValueError("y_true must not be empty.")
    if n_actual != n_predicted:
        raise ValueError(
            f"y_true and y_pred must have the same length: {n_actual} != {n_predicted}"
        )

    residuals = actual - predicted
    return float((residuals ** 2).mean())
28
+
29
+
30
def rmse(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """
    Root Mean Squared Error: the square root of ``mse(y_true, y_pred)``.

    Input validation is delegated to ``mse``, so its errors propagate
    unchanged.
    """
    mean_squared = mse(y_true, y_pred)
    return float(np.sqrt(mean_squared))
33
+
34
+
35
+ # ---------------------------------------------------------------------------
36
+ # Classification metrics
37
+ # ---------------------------------------------------------------------------
38
+
39
def accuracy(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """
    Share of positions at which the predicted label equals the true label.

    Raises
    ------
    ValueError
        If ``y_true`` is empty or the two inputs differ in length.
    """
    truth = list(y_true)
    guesses = list(y_pred)
    total = len(truth)

    if total == 0:
        raise ValueError("y_true must not be empty.")
    if total != len(guesses):
        raise ValueError(
            f"y_true and y_pred must have the same length: {total} != {len(guesses)}"
        )

    # Accumulate with binary ``+`` starting from 0, exactly as ``sum`` does.
    hits = 0
    for expected, predicted in zip(truth, guesses):
        hits = hits + (expected == predicted)
    return hits / total
48
+
49
+
50
def precision(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """
    Macro-averaged precision across all classes.

    For every class present in ``y_true`` the per-class precision
    ``TP / (TP + FP)`` is computed (0.0 when the class is never predicted),
    and the unweighted mean of those values is returned. Labels that occur
    only in ``y_pred`` do not contribute a class of their own.

    Raises
    ------
    ValueError
        If ``y_true`` is empty or the two inputs differ in length.
    """
    # list() first so one-shot iterables are materialised instead of being
    # wrapped as a single 0-d object array.
    yt: np.ndarray = np.array(list(y_true))
    yp: np.ndarray = np.array(list(y_pred))
    if len(yt) == 0:
        raise ValueError("y_true must not be empty.")
    if len(yt) != len(yp):
        raise ValueError(f"y_true and y_pred must have the same length: {len(yt)} != {len(yp)}")

    scores: list[float] = []
    for c in np.unique(yt):
        predicted_c = yp == c
        tp: int = int(np.sum(predicted_c & (yt == c)))
        # TP + FP is simply the number of times class c was predicted.
        n_predicted: int = int(np.sum(predicted_c))
        scores.append(tp / n_predicted if n_predicted > 0 else 0.0)
    return float(np.mean(scores))
61
+
62
+
63
def recall(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """
    Macro-averaged recall across all classes.

    For every class present in ``y_true`` the per-class recall
    ``TP / (TP + FN)`` is computed, and the unweighted mean of those values
    is returned.

    Raises
    ------
    ValueError
        If ``y_true`` is empty or the two inputs differ in length.
    """
    # list() first so one-shot iterables are materialised instead of being
    # wrapped as a single 0-d object array.
    yt: np.ndarray = np.array(list(y_true))
    yp: np.ndarray = np.array(list(y_pred))
    if len(yt) == 0:
        raise ValueError("y_true must not be empty.")
    if len(yt) != len(yp):
        raise ValueError(f"y_true and y_pred must have the same length: {len(yt)} != {len(yp)}")

    scores: list[float] = []
    for c in np.unique(yt):
        actual_c = yt == c
        tp: int = int(np.sum((yp == c) & actual_c))
        # TP + FN is simply the number of true occurrences of class c.
        n_actual: int = int(np.sum(actual_c))
        scores.append(tp / n_actual if n_actual > 0 else 0.0)
    return float(np.mean(scores))
74
+
75
+
76
def f1_score(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """
    Macro-averaged F1 score.

    For every class present in ``y_true`` the per-class F1 (harmonic mean of
    that class's precision and recall, 0.0 when both are zero) is computed,
    and the unweighted mean of those values is returned.

    Raises
    ------
    ValueError
        If ``y_true`` is empty or the two inputs differ in length.
    """
    # list() first so one-shot iterables are materialised instead of being
    # wrapped as a single 0-d object array.
    yt: np.ndarray = np.array(list(y_true))
    yp: np.ndarray = np.array(list(y_pred))
    if len(yt) == 0:
        raise ValueError("y_true must not be empty.")
    if len(yt) != len(yp):
        raise ValueError(f"y_true and y_pred must have the same length: {len(yt)} != {len(yp)}")

    scores: list[float] = []
    for c in np.unique(yt):
        predicted_c = yp == c
        actual_c = yt == c
        tp: int = int(np.sum(predicted_c & actual_c))
        fp: int = int(np.sum(predicted_c & ~actual_c))
        fn: int = int(np.sum(~predicted_c & actual_c))
        p: float = tp / (tp + fp) if (tp + fp) > 0 else 0.0
        r: float = tp / (tp + fn) if (tp + fn) > 0 else 0.0
        scores.append(2 * p * r / (p + r) if (p + r) > 0 else 0.0)
    return float(np.mean(scores))
90
+
91
+
92
+ # ---------------------------------------------------------------------------
93
+ # Evaluator
94
+ # ---------------------------------------------------------------------------
95
+
96
class Evaluator:
    """
    Runs registered metrics by kind: regression or classification.

    Built-in metrics are pre-registered per kind. New metrics can be added
    at runtime via register() without modifying this class (Open/Closed Principle).

    Both registries are class-level dicts, so a registration is visible to
    every user of the class in the current process.
    """

    _regression_metrics: dict[str, callable] = {
        "mae": mae,
        "mse": mse,
        "rmse": rmse,
    }

    _classification_metrics: dict[str, callable] = {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1_score,
    }

    @classmethod
    def register(cls, name: str, fn: callable, kind: str = "regression") -> None:
        """
        Register a new metric function.

        Registering under an existing name replaces the previous entry.

        Parameters
        ----------
        name : metric name used as dict key
        fn   : callable with signature (y_true, y_pred) -> float
        kind : 'regression' (default) or 'classification'

        Raises
        ------
        TypeError
            If ``fn`` is not callable.
        ValueError
            If ``kind`` is neither 'regression' nor 'classification'.
        """
        # Reject non-callables up front; otherwise the mistake would only
        # surface later, inside evaluate_regression/evaluate_classification.
        if not callable(fn):
            raise TypeError(f"fn must be callable, got: {fn!r}.")
        if kind == "regression":
            cls._regression_metrics[name] = fn
        elif kind == "classification":
            cls._classification_metrics[name] = fn
        else:
            raise ValueError(f"Unknown kind '{kind}'. Use 'regression' or 'classification'.")

    @classmethod
    def evaluate_regression(cls, y_true: np.ndarray, y_pred: np.ndarray) -> dict[str, float]:
        """Run all registered regression metrics (mae, mse, rmse, ...).

        Returns a dict mapping each metric name to its value.
        """
        return {name: fn(y_true, y_pred) for name, fn in cls._regression_metrics.items()}

    @classmethod
    def evaluate_classification(cls, y_true: np.ndarray, y_pred: np.ndarray) -> dict[str, float]:
        """Run all registered classification metrics (accuracy, precision, recall, f1, ...).

        Returns a dict mapping each metric name to its value.
        """
        return {name: fn(y_true, y_pred) for name, fn in cls._classification_metrics.items()}
coreLearn/knn.py ADDED
@@ -0,0 +1,210 @@
1
+ from concurrent.futures import ProcessPoolExecutor
2
+
3
+ import numpy as np
4
+
5
+ from .base import BaseModel
6
+ from .distances import DistanceMetricFactory
7
+
8
+
9
+ # ---------------------------------------------------------------------------
10
+ # Helpers
11
+ # ---------------------------------------------------------------------------
12
+
13
+ def _majority_vote(labels: list):
14
+ """Return the most frequent label in the list."""
15
+ counts: dict = {}
16
+ for label in labels:
17
+ counts[label] = counts.get(label, 0) + 1
18
+ return max(counts, key=lambda k: counts[k])
19
+
20
+
21
def _predict_worker(args: tuple):
    """
    Classify one sample; entry point executed inside a worker process.

    Defined at module level (not nested) so it can be pickled and sent to
    the processes of a ``ProcessPoolExecutor``.

    ``args`` is a 4-tuple ``(tree, sample, k, metric)``: the ``k`` nearest
    labels are looked up via ``tree.nearest_k`` and the winner of a majority
    vote among them is returned.
    """
    kd_tree, point, n_neighbours, distance_fn = args
    return _majority_vote(kd_tree.nearest_k(point, n_neighbours, distance_fn))
34
+
35
+
36
+ # ---------------------------------------------------------------------------
37
+ # KD-Tree
38
+ # ---------------------------------------------------------------------------
39
+
40
class KDNode:
    """A single node in a KD-Tree: one training point, its label, two subtrees."""

    def __init__(self, point: list, label, left=None, right=None):
        self.point = point
        self.label = label
        self.left = left
        self.right = right


class KDTree:
    """
    Binary space-partitioning tree for fast nearest-neighbour lookups.

    Both ``_build`` and ``_search`` are recursive. The tree is built by
    median splits, so its depth grows logarithmically with the number of
    points.
    """

    def __init__(self, points: list, labels: list):
        """Build the tree from parallel sequences of points and labels."""
        data = list(zip(points, labels))
        self.root = self._build(data, depth=0)

    # -- recursive build --

    def _build(self, data: list, depth: int):
        """Split data along alternating axes and return the root KDNode.

        ``data`` is a list of ``(point, label)`` pairs and is sorted in place.
        Returns ``None`` for an empty list.
        """
        if not data:
            return None
        axis = depth % len(data[0][0])
        data.sort(key=lambda item: item[0][axis])
        mid = len(data) // 2  # median element becomes this subtree's root
        return KDNode(
            point=data[mid][0],
            label=data[mid][1],
            left=self._build(data[:mid], depth + 1),
            right=self._build(data[mid + 1:], depth + 1),
        )

    # -- public query --

    def nearest_k(self, target: list, k: int, metric) -> list:
        """
        Return labels of the k nearest neighbours to *target*.

        Labels are ordered from nearest to farthest. Fewer than ``k`` labels
        are returned when the tree holds fewer than ``k`` points.

        Parameters
        ----------
        target : query point
        k      : number of neighbours to return (must be >= 1)
        metric : callable ``metric(a, b) -> float``. Pruning compares the
                 per-axis gap with the metric value, so the metric must
                 never be smaller than the absolute difference along any
                 single axis (true for Euclidean and Manhattan distance).

        Raises
        ------
        ValueError
            If ``k`` is smaller than 1.
        """
        if k < 1:
            raise ValueError(f"k must be a positive integer, got: {k!r}.")
        best: list = []
        self._search(self.root, target, k, metric, depth=0, best=best)
        return [label for _, label in best]

    # -- recursive search --

    def _search(self, node, target: list, k: int,
                metric, depth: int, best: list) -> None:
        """
        Recursively prune branches using the splitting-plane distance.

        ``best`` holds at most ``k`` ``(distance, label)`` pairs and is kept
        sorted by ascending distance at all times, so ``best[-1]`` is always
        the current worst candidate. Both the replacement test and the
        pruning test below rely on that invariant.
        """
        if node is None:
            return
        dist = metric(target, node.point)

        if len(best) < k:
            best.append((dist, node.label))
            # Re-sort after every insertion (by distance only, so labels are
            # never compared); otherwise best[-1] would be the most recently
            # added candidate rather than the farthest one.
            best.sort(key=lambda pair: pair[0])
        elif dist < best[-1][0]:
            best[-1] = (dist, node.label)
            best.sort(key=lambda pair: pair[0])

        axis = depth % len(target)
        diff = target[axis] - node.point[axis]
        near, far = (node.left, node.right) if diff <= 0 else (node.right, node.left)

        self._search(near, target, k, metric, depth + 1, best)
        # The far side can only contain a closer point if the splitting plane
        # is nearer than the current worst candidate (or we still need more).
        if len(best) < k or abs(diff) < best[-1][0]:
            self._search(far, target, k, metric, depth + 1, best)
110
+
111
+
112
+ # ---------------------------------------------------------------------------
113
+ # KNN Classifier
114
+ # ---------------------------------------------------------------------------
115
+
116
class KNNClassifier(BaseModel):
    """
    K-Nearest Neighbours classifier.

    Uses a KD-Tree for neighbour lookups and, when ``n_jobs > 1``, a
    ``ProcessPoolExecutor`` to classify test samples in separate worker
    processes.

    Parameters
    ----------
    k        : number of neighbours
    distance : name of a metric registered with ``DistanceMetricFactory``
               ('euclidean' by default)
    n_jobs   : number of parallel worker processes during predict
               (1 = no multiprocessing, sequential)

    Raises
    ------
    ValueError
        If ``k`` or ``n_jobs`` is not a positive integer, or ``distance``
        is not a registered metric name.
    """

    def __init__(self, k: int = 5, distance: str = "euclidean", n_jobs: int = 1):
        if not isinstance(k, int) or k < 1:
            raise ValueError(f"k must be a positive integer, got: {k!r}.")
        if not isinstance(n_jobs, int) or n_jobs < 1:
            raise ValueError(f"n_jobs must be a positive integer, got: {n_jobs!r}.")
        self.k = k
        self.n_jobs = n_jobs
        self._metric = DistanceMetricFactory.create(distance)
        self._tree: KDTree | None = None  # built by fit()
        self._n_train: int = 0
        self._n_features: int = 0

    @staticmethod
    def _materialize(data):
        """Return ``data`` unchanged if it is sized, else collect it into a list.

        One-shot iterables (generators, iterators) can be consumed only once
        and cannot be handed to ``np.array`` directly; sized containers are
        passed through untouched.
        """
        return data if hasattr(data, "__len__") else list(data)

    def fit(self, X, y) -> "KNNClassifier":
        """
        Build the internal KD-Tree from training data.

        Raises
        ------
        ValueError
            If ``X`` is not 2-D, ``X`` or ``y`` is empty, their lengths
            differ, or ``k`` exceeds the number of training samples.
        """
        X = np.array(self._materialize(X), dtype=float)
        y = list(y)

        if X.ndim != 2:
            raise ValueError(
                f"X must be 2-D (n_samples, n_features), got shape {X.shape}."
            )
        if len(X) == 0:
            raise ValueError("Training data X must not be empty.")
        if len(y) == 0:
            raise ValueError("Label vector y must not be empty.")
        if len(X) != len(y):
            raise ValueError(
                f"X and y must have the same number of samples: {len(X)} != {len(y)}."
            )
        if self.k > len(X):
            raise ValueError(
                f"k ({self.k}) cannot be greater than the number of "
                f"training samples ({len(X)})."
            )

        self._n_train = len(X)
        self._n_features = X.shape[1]
        self._tree = KDTree(X.tolist(), y)
        return self

    def _predict_one(self, x: list):
        """Classify a single sample by majority vote among k neighbours."""
        neighbours = self._tree.nearest_k(x, self.k, self._metric)
        return _majority_vote(neighbours)

    def predict(self, X) -> list:
        """
        Predict class labels for all samples in X.

        n_jobs=1  : sequential prediction (no process overhead).
        n_jobs>1  : samples are distributed across ``n_jobs`` worker
                    processes via ``ProcessPoolExecutor``. Samples are sent
                    in chunks, and the KD-Tree and metric travel with each
                    chunk as pickled copies.

        Returns an empty list for empty input.

        Raises
        ------
        RuntimeError
            If called before ``fit()``.
        ValueError
            If ``X`` is not 2-D or its feature count differs from training.
        """
        if self._tree is None:
            raise RuntimeError("Call fit() before predict().")
        # Materialise one-shot iterables once and keep using that result:
        # measuring a throw-away copy would leave the iterator exhausted by
        # the time the array is built.
        X = self._materialize(X)
        if len(X) == 0:
            return []
        X = np.array(X, dtype=float)
        if X.ndim != 2:
            raise ValueError(
                f"X must be 2-D (n_samples, n_features), got shape {X.shape}."
            )
        if X.shape[1] != self._n_features:
            raise ValueError(
                f"Feature count mismatch: model was trained with {self._n_features} "
                f"features, but X has {X.shape[1]}."
            )
        samples: list = X.tolist()

        if self.n_jobs == 1:
            return [self._predict_one(x) for x in samples]

        tasks = [(self._tree, x, self.k, self._metric) for x in samples]
        # With the default chunksize of 1 every sample is pickled on its own,
        # dragging a full copy of the tree along each time. Batching samples
        # lets the tree be serialised once per chunk instead. Roughly four
        # chunks per worker keeps the load balanced (ceil division below).
        chunksize = max(1, -(-len(tasks) // (self.n_jobs * 4)))
        with ProcessPoolExecutor(max_workers=self.n_jobs) as executor:
            return list(executor.map(_predict_worker, tasks, chunksize=chunksize))