scratchkit 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mlscratch/__init__.py +56 -0
- mlscratch/__main__.py +118 -0
- mlscratch/bayesian/__init__.py +53 -0
- mlscratch/bayesian/bayesian_linear_regression.py +171 -0
- mlscratch/bayesian/bayesian_network.py +248 -0
- mlscratch/bayesian/bayesian_nn.py +315 -0
- mlscratch/bayesian/gaussian_process.py +207 -0
- mlscratch/bayesian/hmm.py +277 -0
- mlscratch/bayesian/init.py +52 -0
- mlscratch/bayesian/kalman_filter.py +182 -0
- mlscratch/bayesian/naive_bayes.py +209 -0
- mlscratch/metrics/__init__.py +59 -0
- mlscratch/metrics/classification.py +365 -0
- mlscratch/metrics/regression.py +79 -0
- mlscratch/neural/__init__.py +121 -0
- mlscratch/neural/attention.py +420 -0
- mlscratch/neural/autoencoder.py +543 -0
- mlscratch/neural/boltzmann.py +231 -0
- mlscratch/neural/cnn.py +593 -0
- mlscratch/neural/cvnn.py +322 -0
- mlscratch/neural/gan.py +364 -0
- mlscratch/neural/hopfield.py +193 -0
- mlscratch/neural/perceptron.py +398 -0
- mlscratch/neural/rbf_network.py +230 -0
- mlscratch/neural/recurrent.py +569 -0
- mlscratch/preprocessing/__init__.py +38 -0
- mlscratch/preprocessing/encoders.py +140 -0
- mlscratch/preprocessing/model_selection.py +119 -0
- mlscratch/preprocessing/polynomial.py +105 -0
- mlscratch/preprocessing/scalers.py +220 -0
- mlscratch/py.typed +0 -0
- mlscratch/reinforcement/__init__.py +59 -0
- mlscratch/reinforcement/ddpg.py +363 -0
- mlscratch/reinforcement/dqn.py +319 -0
- mlscratch/reinforcement/ppo.py +452 -0
- mlscratch/reinforcement/q_learning.py +352 -0
- mlscratch/reinforcement/sac.py +382 -0
- mlscratch/reinforcement/utils.py +594 -0
- mlscratch/supervised/__init__.py +76 -0
- mlscratch/supervised/_validation.py +50 -0
- mlscratch/supervised/adaboost.py +255 -0
- mlscratch/supervised/decision_tree.py +495 -0
- mlscratch/supervised/gradient_boosting.py +354 -0
- mlscratch/supervised/knn.py +234 -0
- mlscratch/supervised/lasso_regression.py +125 -0
- mlscratch/supervised/linear_models.py +459 -0
- mlscratch/supervised/linear_regression.py +197 -0
- mlscratch/supervised/logistic_regression.py +119 -0
- mlscratch/supervised/naive_bayes.py +113 -0
- mlscratch/supervised/random_forest.py +321 -0
- mlscratch/supervised/ridge_regression.py +93 -0
- mlscratch/supervised/svm.py +356 -0
- mlscratch/unsupervised/__init__.py +39 -0
- mlscratch/unsupervised/apriori.py +178 -0
- mlscratch/unsupervised/dbscan.py +141 -0
- mlscratch/unsupervised/gmm.py +204 -0
- mlscratch/unsupervised/hierarchical_clustering.py +137 -0
- mlscratch/unsupervised/ica.py +167 -0
- mlscratch/unsupervised/kmeans.py +135 -0
- mlscratch/unsupervised/kmedoids.py +133 -0
- mlscratch/unsupervised/pca.py +103 -0
- mlscratch/unsupervised/tsne.py +200 -0
- scratchkit-0.2.0.dist-info/METADATA +241 -0
- scratchkit-0.2.0.dist-info/RECORD +68 -0
- scratchkit-0.2.0.dist-info/WHEEL +5 -0
- scratchkit-0.2.0.dist-info/entry_points.txt +2 -0
- scratchkit-0.2.0.dist-info/licenses/LICENSE +201 -0
- scratchkit-0.2.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,141 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Density-Based Spatial Clustering of Applications with Noise (DBSCAN)
|
|
3
|
+
=====================================================================
|
|
4
|
+
Groups together points that are closely packed (high density regions)
|
|
5
|
+
and marks points in low-density regions as outliers (noise).
|
|
6
|
+
|
|
7
|
+
Key ideas
|
|
8
|
+
---------
|
|
9
|
+
- eps : neighbourhood radius
|
|
10
|
+
- min_samples : minimum number of points to form a dense region (core point)
|
|
11
|
+
- Core point : has >= min_samples neighbours within eps
|
|
12
|
+
- Border point : within eps of a core point, but not a core point itself
|
|
13
|
+
- Noise point : neither core nor border
|
|
14
|
+
|
|
15
|
+
Time complexity : O(n^2) with the naive distance matrix approach used here.
|
|
16
|
+
Only numpy and basic Python stdlib are used.
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
import numpy as np
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class DBSCAN:
    """
    DBSCAN clustering.

    Parameters
    ----------
    eps : float
        Maximum distance between two samples to be considered neighbours.
    min_samples : int
        Minimum number of samples in a neighbourhood for a point to be
        labelled a core point (includes the point itself).

    Attributes
    ----------
    labels_ : ndarray of shape (n_samples,) or None
        Cluster label per sample after ``fit``: -1 for noise, integers
        >= 1 for clusters. ``None`` before ``fit`` is called.
    core_sample_indices_ : ndarray of int or None
        Sorted indices of the core samples found by ``fit``.
    """

    NOISE = -1
    UNVISITED = 0

    def __init__(self, eps: float = 0.5, min_samples: int = 5):
        self.eps = eps
        self.min_samples = min_samples
        self.labels_ = None  # cluster label per sample (-1 = noise)
        self.core_sample_indices_ = None

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _as_samples(X) -> np.ndarray:
        """Coerce X to a float array; a 1-D input becomes one feature per sample."""
        X = np.asarray(X, dtype=float)
        if X.ndim == 1:
            X = X.reshape(-1, 1)
        return X

    def _euclidean_distance(self, a: np.ndarray, b: np.ndarray) -> float:
        """Euclidean distance between two 1-D vectors."""
        return np.sqrt(np.sum((a - b) ** 2))

    def _region_query(self, X: np.ndarray, point_idx: int) -> list:
        """
        Return indices of all points within eps of X[point_idx].

        The query point itself is always included (its distance is 0).
        Indices are returned in ascending order as plain Python ints.
        """
        X = self._as_samples(X)
        delta = X - X[point_idx]
        # Sum over every non-sample axis so each row collapses to one distance.
        sq_dist = np.sum(delta ** 2, axis=tuple(range(1, X.ndim)))
        return np.flatnonzero(np.sqrt(sq_dist) <= self.eps).tolist()

    def _claim(
        self,
        labels: np.ndarray,
        candidates: list,
        cluster_id: int,
        queue: list,
    ) -> None:
        """Assign candidates to cluster_id and queue those not yet queried."""
        for idx in candidates:
            state = labels[idx]
            if state == self.UNVISITED:
                # Labelling at enqueue time guarantees each point is queued
                # (and therefore region-queried) at most once.
                labels[idx] = cluster_id
                queue.append(idx)
            elif state == self.NOISE:
                # Already queried and found non-core: it becomes a border
                # point of this cluster and needs no further expansion.
                labels[idx] = cluster_id

    def _expand_cluster(
        self,
        X: np.ndarray,
        labels: np.ndarray,
        point_idx: int,
        neighbours: list,
        cluster_id: int,
        core_mask: "np.ndarray | None" = None,
    ) -> None:
        """
        Grow a cluster starting from the core point point_idx.

        Parameters
        ----------
        X : ndarray of shape (n_samples, n_features)
        labels : ndarray of shape (n_samples,)
            Label array, modified in place.
        point_idx : int
            Index of the core point that seeds the cluster.
        neighbours : list
            Result of ``_region_query(X, point_idx)``.
        cluster_id : int
            Label to assign to every point reached.
        core_mask : ndarray of bool, optional
            If given, entries are set to True in place for every additional
            core point discovered while expanding.
        """
        labels[point_idx] = cluster_id
        queue = []
        self._claim(labels, neighbours, cluster_id, queue)

        head = 0
        while head < len(queue):
            current = queue[head]
            head += 1
            current_neighbours = self._region_query(X, current)

            # Only core points propagate the cluster to their neighbours.
            if len(current_neighbours) >= self.min_samples:
                if core_mask is not None:
                    core_mask[current] = True
                self._claim(labels, current_neighbours, cluster_id, queue)

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def fit(self, X: np.ndarray) -> "DBSCAN":
        """
        Fit DBSCAN on dataset X.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            A 1-D input is treated as n_samples single-feature points.

        Returns
        -------
        self
        """
        X = self._as_samples(X)
        n_samples = len(X)
        labels = np.full(n_samples, self.UNVISITED, dtype=int)
        is_core = np.zeros(n_samples, dtype=bool)
        cluster_id = 0

        for idx in range(n_samples):
            if labels[idx] != self.UNVISITED:
                continue  # already processed

            neighbours = self._region_query(X, idx)

            if len(neighbours) < self.min_samples:
                labels[idx] = self.NOISE  # may later be claimed as a border point
            else:
                cluster_id += 1
                is_core[idx] = True
                self._expand_cluster(
                    X, labels, idx, neighbours, cluster_id, is_core
                )

        self.labels_ = labels
        # Every point is region-queried exactly once above, so the core mask
        # is complete without a second O(n^2) pass over the data.
        self.core_sample_indices_ = np.flatnonzero(is_core).astype(int)
        return self

    def fit_predict(self, X: np.ndarray) -> np.ndarray:
        """
        Fit and return cluster labels.

        Returns
        -------
        labels : ndarray of shape (n_samples,)
            -1 for noise, integers >= 1 for clusters.
        """
        self.fit(X)
        return self.labels_
|
|
@@ -0,0 +1,204 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Gaussian Mixture Model (GMM) via Expectation-Maximization
|
|
3
|
+
==========================================================
|
|
4
|
+
Models data as a weighted mixture of K multivariate Gaussian distributions.
|
|
5
|
+
Parameters (means, covariances, mixing weights) are found by iterating the
|
|
6
|
+
E-step and M-step until the log-likelihood converges.
|
|
7
|
+
|
|
8
|
+
E-step : compute responsibilities r[i,k] = P(z=k | x_i)
|
|
9
|
+
M-step : update pi_k, mu_k, Sigma_k using the responsibilities
|
|
10
|
+
|
|
11
|
+
Only numpy is used.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
import numpy as np
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class GaussianMixtureModel:
|
|
18
|
+
"""
|
|
19
|
+
Gaussian Mixture Model fitted by the EM algorithm.
|
|
20
|
+
|
|
21
|
+
Parameters
|
|
22
|
+
----------
|
|
23
|
+
n_components : int
|
|
24
|
+
Number of mixture components (clusters).
|
|
25
|
+
max_iter : int
|
|
26
|
+
Maximum number of EM iterations.
|
|
27
|
+
tol : float
|
|
28
|
+
Convergence tolerance on the log-likelihood change.
|
|
29
|
+
reg_covar : float
|
|
30
|
+
Small value added to the diagonal of each covariance matrix for
|
|
31
|
+
numerical stability.
|
|
32
|
+
random_state : int or None
|
|
33
|
+
Seed for reproducible centroid initialisation.
|
|
34
|
+
"""
|
|
35
|
+
|
|
36
|
+
def __init__(
|
|
37
|
+
self,
|
|
38
|
+
n_components: int = 3,
|
|
39
|
+
max_iter: int = 100,
|
|
40
|
+
tol: float = 1e-4,
|
|
41
|
+
reg_covar: float = 1e-6,
|
|
42
|
+
random_state: int | None = None,
|
|
43
|
+
):
|
|
44
|
+
self.n_components = n_components
|
|
45
|
+
self.max_iter = max_iter
|
|
46
|
+
self.tol = tol
|
|
47
|
+
self.reg_covar = reg_covar
|
|
48
|
+
self.random_state = random_state
|
|
49
|
+
|
|
50
|
+
# Learned parameters
|
|
51
|
+
self.weights_ = None # (K,)
|
|
52
|
+
self.means_ = None # (K, n_features)
|
|
53
|
+
self.covariances_ = None # (K, n_features, n_features)
|
|
54
|
+
self.converged_ = False
|
|
55
|
+
self.n_iter_ = 0
|
|
56
|
+
self.lower_bound_ = -np.inf
|
|
57
|
+
|
|
58
|
+
# ------------------------------------------------------------------
|
|
59
|
+
# Internal helpers
|
|
60
|
+
# ------------------------------------------------------------------
|
|
61
|
+
|
|
62
|
+
def _multivariate_gaussian(
|
|
63
|
+
self, X: np.ndarray, mean: np.ndarray, cov: np.ndarray
|
|
64
|
+
) -> np.ndarray:
|
|
65
|
+
"""
|
|
66
|
+
Evaluate the multivariate Gaussian PDF for each row of X.
|
|
67
|
+
|
|
68
|
+
Returns
|
|
69
|
+
-------
|
|
70
|
+
pdf : ndarray of shape (n_samples,)
|
|
71
|
+
"""
|
|
72
|
+
n_features = X.shape[1]
|
|
73
|
+
diff = X - mean # (n, d)
|
|
74
|
+
try:
|
|
75
|
+
cov_inv = np.linalg.inv(cov)
|
|
76
|
+
cov_det = np.linalg.det(cov)
|
|
77
|
+
except np.linalg.LinAlgError:
|
|
78
|
+
cov_inv = np.linalg.pinv(cov)
|
|
79
|
+
cov_det = np.linalg.det(cov + np.eye(n_features) * self.reg_covar)
|
|
80
|
+
|
|
81
|
+
cov_det = max(cov_det, 1e-300) # guard against log(0)
|
|
82
|
+
norm = 1.0 / (np.sqrt((2 * np.pi) ** n_features * cov_det))
|
|
83
|
+
exponent = -0.5 * np.einsum("ij,jk,ik->i", diff, cov_inv, diff)
|
|
84
|
+
return norm * np.exp(exponent)
|
|
85
|
+
|
|
86
|
+
def _e_step(self, X: np.ndarray) -> np.ndarray:
|
|
87
|
+
"""
|
|
88
|
+
Compute responsibilities r[i, k] = P(z=k | x_i).
|
|
89
|
+
|
|
90
|
+
Returns
|
|
91
|
+
-------
|
|
92
|
+
r : ndarray of shape (n_samples, n_components)
|
|
93
|
+
"""
|
|
94
|
+
n_samples = X.shape[0]
|
|
95
|
+
r = np.zeros((n_samples, self.n_components))
|
|
96
|
+
|
|
97
|
+
for k in range(self.n_components):
|
|
98
|
+
r[:, k] = self.weights_[k] * self._multivariate_gaussian(
|
|
99
|
+
X, self.means_[k], self.covariances_[k]
|
|
100
|
+
)
|
|
101
|
+
|
|
102
|
+
row_sums = r.sum(axis=1, keepdims=True)
|
|
103
|
+
row_sums = np.where(row_sums == 0, 1e-300, row_sums)
|
|
104
|
+
r /= row_sums
|
|
105
|
+
return r
|
|
106
|
+
|
|
107
|
+
def _m_step(self, X: np.ndarray, r: np.ndarray) -> None:
|
|
108
|
+
"""Update parameters from responsibilities."""
|
|
109
|
+
n_samples, n_features = X.shape
|
|
110
|
+
N_k = r.sum(axis=0) # effective number per component
|
|
111
|
+
|
|
112
|
+
self.weights_ = N_k / n_samples
|
|
113
|
+
|
|
114
|
+
for k in range(self.n_components):
|
|
115
|
+
if N_k[k] < 1e-8:
|
|
116
|
+
continue
|
|
117
|
+
self.means_[k] = (r[:, k] @ X) / N_k[k]
|
|
118
|
+
|
|
119
|
+
diff = X - self.means_[k] # (n, d)
|
|
120
|
+
weighted_diff = r[:, k:k+1] * diff # (n, d)
|
|
121
|
+
self.covariances_[k] = (
|
|
122
|
+
weighted_diff.T @ diff / N_k[k]
|
|
123
|
+
+ np.eye(n_features) * self.reg_covar
|
|
124
|
+
)
|
|
125
|
+
|
|
126
|
+
def _log_likelihood(self, X: np.ndarray) -> float:
|
|
127
|
+
"""Compute the log-likelihood of the data under current parameters."""
|
|
128
|
+
n_samples = X.shape[0]
|
|
129
|
+
ll = 0.0
|
|
130
|
+
for i in range(n_samples):
|
|
131
|
+
point_ll = sum(
|
|
132
|
+
self.weights_[k]
|
|
133
|
+
* self._multivariate_gaussian(
|
|
134
|
+
X[i:i+1], self.means_[k], self.covariances_[k]
|
|
135
|
+
)[0]
|
|
136
|
+
for k in range(self.n_components)
|
|
137
|
+
)
|
|
138
|
+
ll += np.log(max(point_ll, 1e-300))
|
|
139
|
+
return ll
|
|
140
|
+
|
|
141
|
+
# ------------------------------------------------------------------
|
|
142
|
+
# Public API
|
|
143
|
+
# ------------------------------------------------------------------
|
|
144
|
+
|
|
145
|
+
def fit(self, X: np.ndarray) -> "GaussianMixtureModel":
|
|
146
|
+
"""
|
|
147
|
+
Fit GMM parameters to X via EM.
|
|
148
|
+
|
|
149
|
+
Parameters
|
|
150
|
+
----------
|
|
151
|
+
X : ndarray of shape (n_samples, n_features)
|
|
152
|
+
"""
|
|
153
|
+
rng = np.random.default_rng(self.random_state)
|
|
154
|
+
n_samples, n_features = X.shape
|
|
155
|
+
K = self.n_components
|
|
156
|
+
|
|
157
|
+
# Initialise parameters
|
|
158
|
+
self.weights_ = np.full(K, 1.0 / K)
|
|
159
|
+
# Pick K random data points as initial means
|
|
160
|
+
idx = rng.choice(n_samples, K, replace=False)
|
|
161
|
+
self.means_ = X[idx].copy().astype(float)
|
|
162
|
+
# Identity covariances scaled by data variance
|
|
163
|
+
var = np.var(X, axis=0).mean()
|
|
164
|
+
self.covariances_ = np.array(
|
|
165
|
+
[np.eye(n_features) * var for _ in range(K)]
|
|
166
|
+
)
|
|
167
|
+
|
|
168
|
+
prev_ll = -np.inf
|
|
169
|
+
for iteration in range(self.max_iter):
|
|
170
|
+
# E-step
|
|
171
|
+
r = self._e_step(X)
|
|
172
|
+
# M-step
|
|
173
|
+
self._m_step(X, r)
|
|
174
|
+
# Check convergence
|
|
175
|
+
ll = self._log_likelihood(X)
|
|
176
|
+
if abs(ll - prev_ll) < self.tol:
|
|
177
|
+
self.converged_ = True
|
|
178
|
+
break
|
|
179
|
+
prev_ll = ll
|
|
180
|
+
self.n_iter_ = iteration + 1
|
|
181
|
+
|
|
182
|
+
self.lower_bound_ = prev_ll
|
|
183
|
+
return self
|
|
184
|
+
|
|
185
|
+
def predict(self, X: np.ndarray) -> np.ndarray:
|
|
186
|
+
"""
|
|
187
|
+
Assign each sample to the most likely component.
|
|
188
|
+
|
|
189
|
+
Returns
|
|
190
|
+
-------
|
|
191
|
+
labels : ndarray of shape (n_samples,)
|
|
192
|
+
"""
|
|
193
|
+
r = self._e_step(X)
|
|
194
|
+
return np.argmax(r, axis=1)
|
|
195
|
+
|
|
196
|
+
def predict_proba(self, X: np.ndarray) -> np.ndarray:
|
|
197
|
+
"""
|
|
198
|
+
Return responsibility (soft membership) for each component.
|
|
199
|
+
|
|
200
|
+
Returns
|
|
201
|
+
-------
|
|
202
|
+
r : ndarray of shape (n_samples, n_components)
|
|
203
|
+
"""
|
|
204
|
+
return self._e_step(X)
|
|
@@ -0,0 +1,137 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Hierarchical Agglomerative Clustering (HAC)
|
|
3
|
+
============================================
|
|
4
|
+
Builds a hierarchy of clusters bottom-up: each sample starts as its own
|
|
5
|
+
cluster; at each step the two closest clusters are merged, until all
|
|
6
|
+
samples belong to one cluster or a stopping criterion is met.
|
|
7
|
+
|
|
8
|
+
Linkage criteria implemented
|
|
9
|
+
-----------------------------
|
|
10
|
+
- 'single' : distance = min pairwise distance between clusters
|
|
11
|
+
- 'complete' : distance = max pairwise distance between clusters
|
|
12
|
+
- 'average' : distance = mean pairwise distance between clusters
|
|
13
|
+
- 'ward' : distance = increase in total within-cluster variance on merge
|
|
14
|
+
|
|
15
|
+
Only numpy is used.
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
import numpy as np
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class AgglomerativeClustering:
    """
    Hierarchical agglomerative clustering.

    Parameters
    ----------
    n_clusters : int
        Target number of clusters to cut the dendrogram to.
    linkage : str
        One of {'single', 'complete', 'average', 'ward'}.

    Attributes
    ----------
    labels_ : ndarray of shape (n_samples,) or None
        Cluster index per sample after ``fit``; ``None`` before.

    Raises
    ------
    ValueError
        If ``linkage`` is not one of the supported criteria.
    """

    def __init__(self, n_clusters: int = 2, linkage: str = "ward"):
        if linkage not in {"single", "complete", "average", "ward"}:
            raise ValueError(
                f"linkage must be one of 'single', 'complete', 'average', "
                f"'ward'. Got '{linkage}'."
            )
        self.n_clusters = n_clusters
        self.linkage = linkage
        self.labels_ = None

    # ------------------------------------------------------------------
    # Distance helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _as_samples(X) -> np.ndarray:
        """Coerce X to a float array; a 1-D input becomes one feature per sample."""
        X = np.asarray(X, dtype=float)
        if X.ndim == 1:
            X = X.reshape(-1, 1)
        return X

    @staticmethod
    def _pairwise_distances(X: np.ndarray) -> np.ndarray:
        """Return symmetric Euclidean distance matrix (n x n)."""
        X = np.asarray(X, dtype=float)
        if X.ndim == 1:
            X = X.reshape(-1, 1)
        n = len(X)
        D = np.zeros((n, n))
        feature_axes = tuple(range(1, X.ndim))
        # One vectorised row at a time keeps memory at O(n * d) instead of the
        # O(n^2 * d) a fully broadcast difference tensor would need.
        for i in range(n):
            D[i] = np.sqrt(np.sum((X - X[i]) ** 2, axis=feature_axes))
        return D

    def _cluster_distance(
        self,
        c1: list,
        c2: list,
        X: np.ndarray,
        D: np.ndarray,
    ) -> float:
        """
        Compute linkage distance between two clusters.

        Parameters
        ----------
        c1, c2 : list of int
            Sample indices belonging to each cluster.
        X : ndarray of shape (n_samples, n_features)
            Data matrix (used by 'ward' only).
        D : ndarray of shape (n_samples, n_samples)
            Pairwise distance matrix (used by the other linkages).
        """
        if self.linkage == "ward":
            # Increase in total within-cluster sum of squares on merge, via
            # the closed form  |A||B| / (|A| + |B|) * ||mean_A - mean_B||^2.
            X = np.asarray(X, dtype=float)
            n1, n2 = len(c1), len(c2)
            gap = X[c1].mean(axis=0) - X[c2].mean(axis=0)
            return float(n1 * n2 / (n1 + n2) * np.sum(gap ** 2))

        cross = D[np.ix_(c1, c2)]  # all distances between the two clusters
        if self.linkage == "single":
            return float(cross.min())
        if self.linkage == "complete":
            return float(cross.max())
        if self.linkage == "average":
            return float(cross.mean())
        raise ValueError(f"Unknown linkage '{self.linkage}'")

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def fit(self, X: np.ndarray) -> "AgglomerativeClustering":
        """
        Perform hierarchical clustering on X.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            A 1-D input is treated as n_samples single-feature points.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``n_clusters`` is smaller than 1.
        """
        if self.n_clusters < 1:
            # Otherwise the merge loop would run on a single remaining cluster
            # and fail with an IndexError.
            raise ValueError(
                f"n_clusters must be at least 1. Got {self.n_clusters}."
            )

        X = self._as_samples(X)
        n_samples = len(X)
        D = self._pairwise_distances(X)

        # Each sample starts as its own cluster (stored as list of indices)
        clusters = [[i] for i in range(n_samples)]

        while len(clusters) > self.n_clusters:
            min_dist = np.inf
            merge_i, merge_j = 0, 1

            # Find closest pair of clusters (first pair wins on ties)
            for i in range(len(clusters)):
                for j in range(i + 1, len(clusters)):
                    d = self._cluster_distance(clusters[i], clusters[j], X, D)
                    if d < min_dist:
                        min_dist = d
                        merge_i, merge_j = i, j

            # Merge: drop both parents and append the union at the end
            merged = clusters[merge_i] + clusters[merge_j]
            clusters = [
                c for idx, c in enumerate(clusters)
                if idx not in (merge_i, merge_j)
            ]
            clusters.append(merged)

        # Assign labels
        labels = np.empty(n_samples, dtype=int)
        for cluster_id, indices in enumerate(clusters):
            labels[indices] = cluster_id

        self.labels_ = labels
        return self

    def fit_predict(self, X: np.ndarray) -> np.ndarray:
        """Fit and return cluster labels."""
        self.fit(X)
        return self.labels_
|
|
@@ -0,0 +1,167 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Independent Component Analysis (ICA) — FastICA
|
|
3
|
+
================================================
|
|
4
|
+
Separates a multivariate signal into additive, statistically independent
|
|
5
|
+
components (blind source separation). This is the FastICA algorithm using
|
|
6
|
+
a fixed-point iteration to maximise non-Gaussianity (measured via kurtosis
|
|
7
|
+
or negentropy via the logcosh or exp contrast functions).
|
|
8
|
+
|
|
9
|
+
Steps
|
|
10
|
+
-----
|
|
11
|
+
1. Whiten X: remove correlations so components have unit variance.
|
|
12
|
+
2. For each component, run a fixed-point update on a weight vector w:
|
|
13
|
+
w ← E[X g(w^T X)] − E[g'(w^T X)] w
|
|
14
|
+
then normalise w and orthogonalise against previous components.
|
|
15
|
+
3. The independent components are S = W X_white.
|
|
16
|
+
|
|
17
|
+
Only numpy is used.
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
import numpy as np
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class FastICA:
|
|
24
|
+
"""
|
|
25
|
+
FastICA — Independent Component Analysis.
|
|
26
|
+
|
|
27
|
+
Parameters
|
|
28
|
+
----------
|
|
29
|
+
n_components : int or None
|
|
30
|
+
Number of independent components to extract.
|
|
31
|
+
If None, uses min(n_samples, n_features).
|
|
32
|
+
max_iter : int
|
|
33
|
+
Maximum iterations per component.
|
|
34
|
+
tol : float
|
|
35
|
+
Convergence tolerance (change in w between iterations).
|
|
36
|
+
fun : str
|
|
37
|
+
Contrast function: 'logcosh' (default) or 'exp'.
|
|
38
|
+
random_state : int or None
|
|
39
|
+
Seed for weight initialisation.
|
|
40
|
+
"""
|
|
41
|
+
|
|
42
|
+
def __init__(
|
|
43
|
+
self,
|
|
44
|
+
n_components: int | None = None,
|
|
45
|
+
max_iter: int = 200,
|
|
46
|
+
tol: float = 1e-4,
|
|
47
|
+
fun: str = "logcosh",
|
|
48
|
+
random_state: int | None = None,
|
|
49
|
+
):
|
|
50
|
+
if fun not in {"logcosh", "exp"}:
|
|
51
|
+
raise ValueError("fun must be 'logcosh' or 'exp'.")
|
|
52
|
+
self.n_components = n_components
|
|
53
|
+
self.max_iter = max_iter
|
|
54
|
+
self.tol = tol
|
|
55
|
+
self.fun = fun
|
|
56
|
+
self.random_state = random_state
|
|
57
|
+
|
|
58
|
+
self.components_ = None # (n_components, n_features) unmixing matrix
|
|
59
|
+
self.mixing_ = None # pseudo-inverse of components_
|
|
60
|
+
self.mean_ = None # for centering
|
|
61
|
+
self.whitening_ = None # whitening matrix
|
|
62
|
+
|
|
63
|
+
# ------------------------------------------------------------------
|
|
64
|
+
# Contrast functions (g) and their derivatives (g')
|
|
65
|
+
# ------------------------------------------------------------------
|
|
66
|
+
|
|
67
|
+
def _g_and_gprime(self, u: np.ndarray):
|
|
68
|
+
"""Return g(u) and g'(u) element-wise for the chosen contrast."""
|
|
69
|
+
if self.fun == "logcosh":
|
|
70
|
+
g = np.tanh(u)
|
|
71
|
+
g_prime = 1.0 - g ** 2
|
|
72
|
+
else: # exp
|
|
73
|
+
exp_u = np.exp(-0.5 * u ** 2)
|
|
74
|
+
g = u * exp_u
|
|
75
|
+
g_prime = (1.0 - u ** 2) * exp_u
|
|
76
|
+
return g, g_prime
|
|
77
|
+
|
|
78
|
+
# ------------------------------------------------------------------
|
|
79
|
+
# Whitening
|
|
80
|
+
# ------------------------------------------------------------------
|
|
81
|
+
|
|
82
|
+
def _whiten(self, X: np.ndarray):
|
|
83
|
+
"""
|
|
84
|
+
Whiten data: zero-mean, unit variance, uncorrelated.
|
|
85
|
+
|
|
86
|
+
Returns
|
|
87
|
+
-------
|
|
88
|
+
X_white : ndarray (n_features, n_samples) — note transposed
|
|
89
|
+
W_white : whitening matrix
|
|
90
|
+
"""
|
|
91
|
+
# X is (n_samples, n_features); work in (n_features, n_samples) form
|
|
92
|
+
X_c = X.T # (d, n)
|
|
93
|
+
cov = np.cov(X_c, rowvar=True) # (d, d)
|
|
94
|
+
eigvals, eigvecs = np.linalg.eigh(cov)
|
|
95
|
+
# Guard against near-zero eigenvalues
|
|
96
|
+
eigvals = np.maximum(eigvals, 1e-10)
|
|
97
|
+
W_white = eigvecs @ np.diag(1.0 / np.sqrt(eigvals)) @ eigvecs.T
|
|
98
|
+
X_white = W_white @ X_c # (d, n)
|
|
99
|
+
return X_white, W_white
|
|
100
|
+
|
|
101
|
+
# ------------------------------------------------------------------
|
|
102
|
+
# Public API
|
|
103
|
+
# ------------------------------------------------------------------
|
|
104
|
+
|
|
105
|
+
def fit(self, X: np.ndarray) -> "FastICA":
|
|
106
|
+
"""
|
|
107
|
+
Fit ICA on X.
|
|
108
|
+
|
|
109
|
+
Parameters
|
|
110
|
+
----------
|
|
111
|
+
X : ndarray of shape (n_samples, n_features)
|
|
112
|
+
"""
|
|
113
|
+
rng = np.random.default_rng(self.random_state)
|
|
114
|
+
n_samples, n_features = X.shape
|
|
115
|
+
n_components = self.n_components or min(n_samples, n_features)
|
|
116
|
+
|
|
117
|
+
self.mean_ = X.mean(axis=0)
|
|
118
|
+
X_centered = X - self.mean_
|
|
119
|
+
|
|
120
|
+
X_white, self.whitening_ = self._whiten(X_centered) # (d, n)
|
|
121
|
+
|
|
122
|
+
W = np.zeros((n_components, n_features)) # unmixing in white space
|
|
123
|
+
|
|
124
|
+
for p in range(n_components):
|
|
125
|
+
w = rng.standard_normal(n_features)
|
|
126
|
+
w /= np.linalg.norm(w) + 1e-12
|
|
127
|
+
|
|
128
|
+
for _ in range(self.max_iter):
|
|
129
|
+
u = w @ X_white # (n,)
|
|
130
|
+
g_u, g_prime_u = self._g_and_gprime(u)
|
|
131
|
+
|
|
132
|
+
# Fixed-point update
|
|
133
|
+
w_new = (X_white * g_u).mean(axis=1) - g_prime_u.mean() * w
|
|
134
|
+
|
|
135
|
+
# Deflation: subtract projections onto previous components
|
|
136
|
+
for j in range(p):
|
|
137
|
+
w_new -= (w_new @ W[j]) * W[j]
|
|
138
|
+
|
|
139
|
+
w_new /= np.linalg.norm(w_new) + 1e-12
|
|
140
|
+
|
|
141
|
+
if abs(abs(w_new @ w) - 1.0) < self.tol:
|
|
142
|
+
w = w_new
|
|
143
|
+
break
|
|
144
|
+
w = w_new
|
|
145
|
+
|
|
146
|
+
W[p] = w
|
|
147
|
+
|
|
148
|
+
# Recover components in original (non-whitened) space
|
|
149
|
+
self.components_ = W @ self.whitening_ # (n_components, n_features)
|
|
150
|
+
self.mixing_ = np.linalg.pinv(self.components_)
|
|
151
|
+
return self
|
|
152
|
+
|
|
153
|
+
def transform(self, X: np.ndarray) -> np.ndarray:
|
|
154
|
+
"""
|
|
155
|
+
Recover independent components from X.
|
|
156
|
+
|
|
157
|
+
Returns
|
|
158
|
+
-------
|
|
159
|
+
S : ndarray of shape (n_samples, n_components)
|
|
160
|
+
"""
|
|
161
|
+
X_centered = X - self.mean_
|
|
162
|
+
return X_centered @ self.components_.T
|
|
163
|
+
|
|
164
|
+
def fit_transform(self, X: np.ndarray) -> np.ndarray:
|
|
165
|
+
"""Fit and return independent components."""
|
|
166
|
+
self.fit(X)
|
|
167
|
+
return self.transform(X)
|