scratchkit 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mlscratch/__init__.py +56 -0
- mlscratch/__main__.py +118 -0
- mlscratch/bayesian/__init__.py +53 -0
- mlscratch/bayesian/bayesian_linear_regression.py +171 -0
- mlscratch/bayesian/bayesian_network.py +248 -0
- mlscratch/bayesian/bayesian_nn.py +315 -0
- mlscratch/bayesian/gaussian_process.py +207 -0
- mlscratch/bayesian/hmm.py +277 -0
- mlscratch/bayesian/init.py +52 -0
- mlscratch/bayesian/kalman_filter.py +182 -0
- mlscratch/bayesian/naive_bayes.py +209 -0
- mlscratch/metrics/__init__.py +59 -0
- mlscratch/metrics/classification.py +365 -0
- mlscratch/metrics/regression.py +79 -0
- mlscratch/neural/__init__.py +121 -0
- mlscratch/neural/attention.py +420 -0
- mlscratch/neural/autoencoder.py +543 -0
- mlscratch/neural/boltzmann.py +231 -0
- mlscratch/neural/cnn.py +593 -0
- mlscratch/neural/cvnn.py +322 -0
- mlscratch/neural/gan.py +364 -0
- mlscratch/neural/hopfield.py +193 -0
- mlscratch/neural/perceptron.py +398 -0
- mlscratch/neural/rbf_network.py +230 -0
- mlscratch/neural/recurrent.py +569 -0
- mlscratch/preprocessing/__init__.py +38 -0
- mlscratch/preprocessing/encoders.py +140 -0
- mlscratch/preprocessing/model_selection.py +119 -0
- mlscratch/preprocessing/polynomial.py +105 -0
- mlscratch/preprocessing/scalers.py +220 -0
- mlscratch/py.typed +0 -0
- mlscratch/reinforcement/__init__.py +59 -0
- mlscratch/reinforcement/ddpg.py +363 -0
- mlscratch/reinforcement/dqn.py +319 -0
- mlscratch/reinforcement/ppo.py +452 -0
- mlscratch/reinforcement/q_learning.py +352 -0
- mlscratch/reinforcement/sac.py +382 -0
- mlscratch/reinforcement/utils.py +594 -0
- mlscratch/supervised/__init__.py +76 -0
- mlscratch/supervised/_validation.py +50 -0
- mlscratch/supervised/adaboost.py +255 -0
- mlscratch/supervised/decision_tree.py +495 -0
- mlscratch/supervised/gradient_boosting.py +354 -0
- mlscratch/supervised/knn.py +234 -0
- mlscratch/supervised/lasso_regression.py +125 -0
- mlscratch/supervised/linear_models.py +459 -0
- mlscratch/supervised/linear_regression.py +197 -0
- mlscratch/supervised/logistic_regression.py +119 -0
- mlscratch/supervised/naive_bayes.py +113 -0
- mlscratch/supervised/random_forest.py +321 -0
- mlscratch/supervised/ridge_regression.py +93 -0
- mlscratch/supervised/svm.py +356 -0
- mlscratch/unsupervised/__init__.py +39 -0
- mlscratch/unsupervised/apriori.py +178 -0
- mlscratch/unsupervised/dbscan.py +141 -0
- mlscratch/unsupervised/gmm.py +204 -0
- mlscratch/unsupervised/hierarchical_clustering.py +137 -0
- mlscratch/unsupervised/ica.py +167 -0
- mlscratch/unsupervised/kmeans.py +135 -0
- mlscratch/unsupervised/kmedoids.py +133 -0
- mlscratch/unsupervised/pca.py +103 -0
- mlscratch/unsupervised/tsne.py +200 -0
- scratchkit-0.2.0.dist-info/METADATA +241 -0
- scratchkit-0.2.0.dist-info/RECORD +68 -0
- scratchkit-0.2.0.dist-info/WHEEL +5 -0
- scratchkit-0.2.0.dist-info/entry_points.txt +2 -0
- scratchkit-0.2.0.dist-info/licenses/LICENSE +201 -0
- scratchkit-0.2.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
r"""
|
|
2
|
+
K-Means Clustering
|
|
3
|
+
===================
|
|
4
|
+
|
|
5
|
+
A classic unsupervised clustering algorithm using Lloyd's iteration with
|
|
6
|
+
K-Means++ initialization.
|
|
7
|
+
|
|
8
|
+
The objective minimized is:
|
|
9
|
+
|
|
10
|
+
.. math::
|
|
11
|
+
J = \sum_{i=1}^n \min_{1 \leq k \leq K} \|x_i - \mu_k\|^2
|
|
12
|
+
|
|
13
|
+
Complexity
|
|
14
|
+
----------
|
|
15
|
+
- Training: O(n K d \cdot n\_iter)
|
|
16
|
+
- Inference: O(n K d)
|
|
17
|
+
- Space: O(K d)
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
from __future__ import annotations
|
|
21
|
+
|
|
22
|
+
import numpy as np
|
|
23
|
+
from numpy.typing import ArrayLike, NDArray
|
|
24
|
+
|
|
25
|
+
FloatArray = NDArray[np.float64]
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def _validate_input(X: ArrayLike) -> FloatArray:
|
|
29
|
+
X_arr = np.asarray(X, dtype=float)
|
|
30
|
+
if X_arr.ndim != 2:
|
|
31
|
+
raise ValueError("X must be a 2D array of shape (n_samples, n_features).")
|
|
32
|
+
return X_arr
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class KMeans:
|
|
36
|
+
"""K-Means clustering with optional K-Means++ initialization.
|
|
37
|
+
|
|
38
|
+
Parameters
|
|
39
|
+
----------
|
|
40
|
+
n_clusters : int
|
|
41
|
+
The number of clusters to form.
|
|
42
|
+
max_iter : int, default=300
|
|
43
|
+
Maximum number of iterations of the k-means algorithm for a single run.
|
|
44
|
+
tol : float, default=1e-4
|
|
45
|
+
Convergence tolerance. The algorithm stops when centroid movement is
|
|
46
|
+
less than this threshold.
|
|
47
|
+
random_state : int | None, default=None
|
|
48
|
+
Seed for centroid initialization.
|
|
49
|
+
"""
|
|
50
|
+
|
|
51
|
+
def __init__(
|
|
52
|
+
self,
|
|
53
|
+
n_clusters: int = 8,
|
|
54
|
+
max_iter: int = 300,
|
|
55
|
+
tol: float = 1e-4,
|
|
56
|
+
random_state: int | None = None,
|
|
57
|
+
) -> None:
|
|
58
|
+
self.n_clusters = int(n_clusters)
|
|
59
|
+
self.max_iter = int(max_iter)
|
|
60
|
+
self.tol = float(tol)
|
|
61
|
+
self.random_state = random_state
|
|
62
|
+
self.cluster_centers_: FloatArray | None = None
|
|
63
|
+
self.labels_: NDArray[np.int64] | None = None
|
|
64
|
+
self.inertia_: float | None = None
|
|
65
|
+
self.n_iter_: int | None = None
|
|
66
|
+
|
|
67
|
+
def fit(self, X: ArrayLike) -> "KMeans":
|
|
68
|
+
X_arr = _validate_input(X)
|
|
69
|
+
n_samples, n_features = X_arr.shape
|
|
70
|
+
if self.n_clusters <= 0 or self.n_clusters > n_samples:
|
|
71
|
+
raise ValueError("n_clusters must be between 1 and n_samples.")
|
|
72
|
+
|
|
73
|
+
rng = np.random.default_rng(self.random_state)
|
|
74
|
+
centers = self._initialize_centroids(X_arr, rng)
|
|
75
|
+
|
|
76
|
+
for iteration in range(1, self.max_iter + 1):
|
|
77
|
+
labels = self._assign_clusters(X_arr, centers)
|
|
78
|
+
new_centers = self._compute_centers(X_arr, labels, n_features)
|
|
79
|
+
|
|
80
|
+
shift = np.linalg.norm(centers - new_centers, axis=1).max()
|
|
81
|
+
centers = new_centers
|
|
82
|
+
if shift <= self.tol:
|
|
83
|
+
break
|
|
84
|
+
|
|
85
|
+
self.cluster_centers_ = centers
|
|
86
|
+
self.labels_ = labels
|
|
87
|
+
self.inertia_ = float(self._compute_inertia(X_arr, centers, labels))
|
|
88
|
+
self.n_iter_ = iteration
|
|
89
|
+
return self
|
|
90
|
+
|
|
91
|
+
def predict(self, X: ArrayLike) -> NDArray[np.int64]:
|
|
92
|
+
if self.cluster_centers_ is None:
|
|
93
|
+
raise RuntimeError("Call fit() before predict().")
|
|
94
|
+
X_arr = _validate_input(X)
|
|
95
|
+
if X_arr.shape[1] != self.cluster_centers_.shape[1]:
|
|
96
|
+
raise ValueError("X has a different number of features than the training data.")
|
|
97
|
+
return self._assign_clusters(X_arr, self.cluster_centers_)
|
|
98
|
+
|
|
99
|
+
def _initialize_centroids(self, X: FloatArray, rng: np.random.Generator) -> FloatArray:
|
|
100
|
+
centers = np.empty((self.n_clusters, X.shape[1]), dtype=float)
|
|
101
|
+
first_idx = rng.integers(X.shape[0])
|
|
102
|
+
centers[0] = X[first_idx]
|
|
103
|
+
|
|
104
|
+
distances = np.full(X.shape[0], np.inf, dtype=float)
|
|
105
|
+
for i in range(1, self.n_clusters):
|
|
106
|
+
squared_distances = np.sum((X - centers[i - 1]) ** 2, axis=1)
|
|
107
|
+
distances = np.minimum(distances, squared_distances)
|
|
108
|
+
probabilities = distances / distances.sum()
|
|
109
|
+
cumulative = np.cumsum(probabilities)
|
|
110
|
+
chosen = rng.random()
|
|
111
|
+
centers[i] = X[np.searchsorted(cumulative, chosen)]
|
|
112
|
+
|
|
113
|
+
return centers
|
|
114
|
+
|
|
115
|
+
def _assign_clusters(self, X: FloatArray, centers: FloatArray) -> NDArray[np.int64]:
|
|
116
|
+
distances = np.linalg.norm(X[:, np.newaxis, :] - centers[np.newaxis, :, :], axis=2)
|
|
117
|
+
return np.argmin(distances, axis=1).astype(np.int64)
|
|
118
|
+
|
|
119
|
+
def _compute_centers(
|
|
120
|
+
self, X: FloatArray, labels: NDArray[np.int64], n_features: int
|
|
121
|
+
) -> FloatArray:
|
|
122
|
+
centers = np.zeros((self.n_clusters, n_features), dtype=float)
|
|
123
|
+
for cluster_index in range(self.n_clusters):
|
|
124
|
+
members = X[labels == cluster_index]
|
|
125
|
+
if members.size == 0:
|
|
126
|
+
centers[cluster_index] = X[np.random.default_rng(self.random_state).integers(X.shape[0])]
|
|
127
|
+
else:
|
|
128
|
+
centers[cluster_index] = members.mean(axis=0)
|
|
129
|
+
return centers
|
|
130
|
+
|
|
131
|
+
def _compute_inertia(
|
|
132
|
+
self, X: FloatArray, centers: FloatArray, labels: NDArray[np.int64]
|
|
133
|
+
) -> float:
|
|
134
|
+
diff = X - centers[labels]
|
|
135
|
+
return float(np.sum(diff ** 2))
|
|
@@ -0,0 +1,133 @@
|
|
|
1
|
+
"""
|
|
2
|
+
K-Medoids Clustering (PAM — Partitioning Around Medoids)
|
|
3
|
+
=========================================================
|
|
4
|
+
Similar to K-Means but the cluster representatives (medoids) must be actual
|
|
5
|
+
data points, making the algorithm more robust to outliers and compatible
|
|
6
|
+
with non-Euclidean distances.
|
|
7
|
+
|
|
8
|
+
Algorithm (simplified PAM)
|
|
9
|
+
--------------------------
|
|
10
|
+
1. Randomly initialise K medoids from the dataset.
|
|
11
|
+
2. Assign every point to its nearest medoid.
|
|
12
|
+
3. For each cluster, try every non-medoid point as a new medoid;
|
|
13
|
+
keep the swap if it reduces the total cluster cost.
|
|
14
|
+
4. Repeat steps 2-3 until no swap improves the cost.
|
|
15
|
+
|
|
16
|
+
Only numpy and Python stdlib are used.
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
import numpy as np
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class KMedoids:
|
|
23
|
+
"""
|
|
24
|
+
K-Medoids clustering.
|
|
25
|
+
|
|
26
|
+
Parameters
|
|
27
|
+
----------
|
|
28
|
+
n_clusters : int
|
|
29
|
+
Number of clusters / medoids.
|
|
30
|
+
max_iter : int
|
|
31
|
+
Maximum number of swap iterations.
|
|
32
|
+
random_state : int or None
|
|
33
|
+
Seed for reproducible medoid initialisation.
|
|
34
|
+
"""
|
|
35
|
+
|
|
36
|
+
def __init__(
|
|
37
|
+
self,
|
|
38
|
+
n_clusters: int = 3,
|
|
39
|
+
max_iter: int = 100,
|
|
40
|
+
random_state: int | None = None,
|
|
41
|
+
):
|
|
42
|
+
self.n_clusters = n_clusters
|
|
43
|
+
self.max_iter = max_iter
|
|
44
|
+
self.random_state = random_state
|
|
45
|
+
self.medoid_indices_ = None # indices into X of the medoids
|
|
46
|
+
self.labels_ = None
|
|
47
|
+
self.inertia_ = None
|
|
48
|
+
|
|
49
|
+
# ------------------------------------------------------------------
|
|
50
|
+
# Helpers
|
|
51
|
+
# ------------------------------------------------------------------
|
|
52
|
+
|
|
53
|
+
@staticmethod
|
|
54
|
+
def _pairwise_distances(X: np.ndarray) -> np.ndarray:
|
|
55
|
+
"""Return Euclidean distance matrix of shape (n, n)."""
|
|
56
|
+
n = len(X)
|
|
57
|
+
D = np.zeros((n, n))
|
|
58
|
+
for i in range(n):
|
|
59
|
+
for j in range(i + 1, n):
|
|
60
|
+
d = np.sqrt(np.sum((X[i] - X[j]) ** 2))
|
|
61
|
+
D[i, j] = D[j, i] = d
|
|
62
|
+
return D
|
|
63
|
+
|
|
64
|
+
def _assign_labels(self, D: np.ndarray, medoids: list) -> np.ndarray:
|
|
65
|
+
"""Assign each point to its nearest medoid."""
|
|
66
|
+
dist_to_medoids = D[:, medoids] # (n, K)
|
|
67
|
+
return np.argmin(dist_to_medoids, axis=1)
|
|
68
|
+
|
|
69
|
+
def _total_cost(self, D: np.ndarray, medoids: list, labels: np.ndarray) -> float:
|
|
70
|
+
"""Sum of distances from each point to its medoid."""
|
|
71
|
+
cost = 0.0
|
|
72
|
+
for k, m in enumerate(medoids):
|
|
73
|
+
members = np.where(labels == k)[0]
|
|
74
|
+
cost += D[members, m].sum()
|
|
75
|
+
return float(cost)
|
|
76
|
+
|
|
77
|
+
# ------------------------------------------------------------------
|
|
78
|
+
# Public API
|
|
79
|
+
# ------------------------------------------------------------------
|
|
80
|
+
|
|
81
|
+
def fit(self, X: np.ndarray) -> "KMedoids":
|
|
82
|
+
"""
|
|
83
|
+
Fit K-Medoids to X.
|
|
84
|
+
|
|
85
|
+
Parameters
|
|
86
|
+
----------
|
|
87
|
+
X : ndarray of shape (n_samples, n_features)
|
|
88
|
+
"""
|
|
89
|
+
rng = np.random.default_rng(self.random_state)
|
|
90
|
+
n_samples = len(X)
|
|
91
|
+
D = self._pairwise_distances(X)
|
|
92
|
+
|
|
93
|
+
# 1. Initialise medoids
|
|
94
|
+
medoids = rng.choice(n_samples, self.n_clusters, replace=False).tolist()
|
|
95
|
+
|
|
96
|
+
for _ in range(self.max_iter):
|
|
97
|
+
labels = self._assign_labels(D, medoids)
|
|
98
|
+
current_cost = self._total_cost(D, medoids, labels)
|
|
99
|
+
improved = False
|
|
100
|
+
|
|
101
|
+
for k in range(self.n_clusters):
|
|
102
|
+
cluster_members = np.where(labels == k)[0].tolist()
|
|
103
|
+
for candidate in cluster_members:
|
|
104
|
+
if candidate in medoids:
|
|
105
|
+
continue
|
|
106
|
+
new_medoids = medoids.copy()
|
|
107
|
+
new_medoids[k] = candidate
|
|
108
|
+
new_labels = self._assign_labels(D, new_medoids)
|
|
109
|
+
new_cost = self._total_cost(D, new_medoids, new_labels)
|
|
110
|
+
|
|
111
|
+
if new_cost < current_cost:
|
|
112
|
+
medoids = new_medoids
|
|
113
|
+
labels = new_labels
|
|
114
|
+
current_cost = new_cost
|
|
115
|
+
improved = True
|
|
116
|
+
|
|
117
|
+
if not improved:
|
|
118
|
+
break
|
|
119
|
+
|
|
120
|
+
self.medoid_indices_ = np.array(medoids, dtype=int)
|
|
121
|
+
self.labels_ = self._assign_labels(D, medoids)
|
|
122
|
+
self.inertia_ = self._total_cost(D, medoids, self.labels_)
|
|
123
|
+
return self
|
|
124
|
+
|
|
125
|
+
def fit_predict(self, X: np.ndarray) -> np.ndarray:
|
|
126
|
+
"""Fit and return cluster labels."""
|
|
127
|
+
self.fit(X)
|
|
128
|
+
return self.labels_
|
|
129
|
+
|
|
130
|
+
@property
|
|
131
|
+
def cluster_centers_(self) -> np.ndarray:
|
|
132
|
+
"""Return the actual medoid data points."""
|
|
133
|
+
return None # set by fit via fit_predict path; use medoid_indices_
|
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Principal Component Analysis (PCA)
|
|
3
|
+
====================================
|
|
4
|
+
Linear dimensionality reduction via eigen-decomposition of the covariance
|
|
5
|
+
matrix. Projects data onto the directions of maximum variance.
|
|
6
|
+
|
|
7
|
+
Key steps
|
|
8
|
+
---------
|
|
9
|
+
1. Centre the data (subtract column means).
|
|
10
|
+
2. Compute the covariance matrix C = X^T X / (n - 1).
|
|
11
|
+
3. Eigendecompose C to get eigenvalues and eigenvectors.
|
|
12
|
+
4. Sort eigenvectors by descending eigenvalue.
|
|
13
|
+
5. Project: X_reduced = X_centered @ W, where W holds the top-k eigenvectors.
|
|
14
|
+
|
|
15
|
+
Only numpy is used; no scipy or sklearn.
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
import numpy as np
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class PCA:
|
|
22
|
+
"""
|
|
23
|
+
Principal Component Analysis.
|
|
24
|
+
|
|
25
|
+
Parameters
|
|
26
|
+
----------
|
|
27
|
+
n_components : int or None
|
|
28
|
+
Number of components to keep. If None, all components are kept.
|
|
29
|
+
"""
|
|
30
|
+
|
|
31
|
+
def __init__(self, n_components: int | None = None):
|
|
32
|
+
self.n_components = n_components
|
|
33
|
+
self.components_ = None # shape (n_components, n_features)
|
|
34
|
+
self.explained_variance_ = None
|
|
35
|
+
self.explained_variance_ratio_ = None
|
|
36
|
+
self.mean_ = None
|
|
37
|
+
|
|
38
|
+
# ------------------------------------------------------------------
|
|
39
|
+
# Fit
|
|
40
|
+
# ------------------------------------------------------------------
|
|
41
|
+
|
|
42
|
+
def fit(self, X: np.ndarray) -> "PCA":
|
|
43
|
+
"""
|
|
44
|
+
Compute principal components from X.
|
|
45
|
+
|
|
46
|
+
Parameters
|
|
47
|
+
----------
|
|
48
|
+
X : ndarray of shape (n_samples, n_features)
|
|
49
|
+
"""
|
|
50
|
+
n_samples, n_features = X.shape
|
|
51
|
+
|
|
52
|
+
# 1. Centre
|
|
53
|
+
self.mean_ = np.mean(X, axis=0)
|
|
54
|
+
X_centered = X - self.mean_
|
|
55
|
+
|
|
56
|
+
# 2. Covariance matrix (unbiased, divide by n-1)
|
|
57
|
+
cov = np.dot(X_centered.T, X_centered) / (n_samples - 1)
|
|
58
|
+
|
|
59
|
+
# 3. Eigen-decomposition
|
|
60
|
+
eigenvalues, eigenvectors = np.linalg.eigh(cov)
|
|
61
|
+
|
|
62
|
+
# 4. Sort descending by eigenvalue
|
|
63
|
+
order = np.argsort(eigenvalues)[::-1]
|
|
64
|
+
eigenvalues = eigenvalues[order]
|
|
65
|
+
eigenvectors = eigenvectors[:, order] # columns are eigenvectors
|
|
66
|
+
|
|
67
|
+
# 5. Keep top-k
|
|
68
|
+
k = self.n_components if self.n_components is not None else n_features
|
|
69
|
+
self.components_ = eigenvectors[:, :k].T # (k, n_features)
|
|
70
|
+
self.explained_variance_ = eigenvalues[:k]
|
|
71
|
+
total_var = np.sum(eigenvalues)
|
|
72
|
+
self.explained_variance_ratio_ = (
|
|
73
|
+
self.explained_variance_ / total_var if total_var > 0
|
|
74
|
+
else np.zeros(k)
|
|
75
|
+
)
|
|
76
|
+
return self
|
|
77
|
+
|
|
78
|
+
# ------------------------------------------------------------------
|
|
79
|
+
# Transform / inverse_transform
|
|
80
|
+
# ------------------------------------------------------------------
|
|
81
|
+
|
|
82
|
+
def transform(self, X: np.ndarray) -> np.ndarray:
|
|
83
|
+
"""
|
|
84
|
+
Project X onto the principal components.
|
|
85
|
+
|
|
86
|
+
Returns
|
|
87
|
+
-------
|
|
88
|
+
X_new : ndarray of shape (n_samples, n_components)
|
|
89
|
+
"""
|
|
90
|
+
X_centered = X - self.mean_
|
|
91
|
+
return np.dot(X_centered, self.components_.T)
|
|
92
|
+
|
|
93
|
+
def fit_transform(self, X: np.ndarray) -> np.ndarray:
|
|
94
|
+
"""Fit and immediately transform X."""
|
|
95
|
+
self.fit(X)
|
|
96
|
+
return self.transform(X)
|
|
97
|
+
|
|
98
|
+
def inverse_transform(self, X_reduced: np.ndarray) -> np.ndarray:
|
|
99
|
+
"""
|
|
100
|
+
Map data from reduced space back to original feature space
|
|
101
|
+
(approximate reconstruction).
|
|
102
|
+
"""
|
|
103
|
+
return np.dot(X_reduced, self.components_) + self.mean_
|
|
@@ -0,0 +1,200 @@
|
|
|
1
|
+
"""
|
|
2
|
+
t-Distributed Stochastic Neighbour Embedding (t-SNE)
|
|
3
|
+
=====================================================
|
|
4
|
+
Non-linear dimensionality reduction that preserves local structure.
|
|
5
|
+
Converts high-dimensional Euclidean distances into conditional
|
|
6
|
+
probabilities (Gaussian in high-dim, Student-t in low-dim) and minimises
|
|
7
|
+
the KL divergence between the two distributions via gradient descent.
|
|
8
|
+
|
|
9
|
+
Key steps
|
|
10
|
+
---------
|
|
11
|
+
1. Compute pairwise affinities p_{j|i} in the high-dimensional space using
|
|
12
|
+
a Gaussian kernel; perplexity controls the effective number of neighbours.
|
|
13
|
+
2. Symmetrise: p_{ij} = (p_{j|i} + p_{i|j}) / 2n.
|
|
14
|
+
3. Initialise low-dimensional embedding Y randomly.
|
|
15
|
+
4. Compute q_{ij} in Y using a Student-t kernel (df=1).
|
|
16
|
+
5. Gradient descent on KL(P || Q) with momentum.
|
|
17
|
+
|
|
18
|
+
Reference: van der Maaten & Hinton (2008).
|
|
19
|
+
Only numpy is used.
|
|
20
|
+
"""
|
|
21
|
+
|
|
22
|
+
import numpy as np
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class TSNE:
|
|
26
|
+
"""
|
|
27
|
+
t-SNE dimensionality reduction.
|
|
28
|
+
|
|
29
|
+
Parameters
|
|
30
|
+
----------
|
|
31
|
+
n_components : int
|
|
32
|
+
Dimension of the embedding (almost always 2 or 3).
|
|
33
|
+
perplexity : float
|
|
34
|
+
Effective number of neighbours; typical values 5–50.
|
|
35
|
+
n_iter : int
|
|
36
|
+
Number of gradient-descent iterations.
|
|
37
|
+
learning_rate : float
|
|
38
|
+
Step size for gradient descent.
|
|
39
|
+
momentum : float
|
|
40
|
+
Momentum coefficient for gradient updates.
|
|
41
|
+
random_state : int or None
|
|
42
|
+
Seed for reproducibility.
|
|
43
|
+
"""
|
|
44
|
+
|
|
45
|
+
def __init__(
|
|
46
|
+
self,
|
|
47
|
+
n_components: int = 2,
|
|
48
|
+
perplexity: float = 30.0,
|
|
49
|
+
n_iter: int = 1000,
|
|
50
|
+
learning_rate: float = 200.0,
|
|
51
|
+
momentum: float = 0.9,
|
|
52
|
+
random_state: int | None = None,
|
|
53
|
+
):
|
|
54
|
+
self.n_components = n_components
|
|
55
|
+
self.perplexity = perplexity
|
|
56
|
+
self.n_iter = n_iter
|
|
57
|
+
self.learning_rate = learning_rate
|
|
58
|
+
self.momentum = momentum
|
|
59
|
+
self.random_state = random_state
|
|
60
|
+
self.embedding_ = None
|
|
61
|
+
|
|
62
|
+
# ------------------------------------------------------------------
|
|
63
|
+
# High-dimensional affinities
|
|
64
|
+
# ------------------------------------------------------------------
|
|
65
|
+
|
|
66
|
+
def _pairwise_sq_distances(self, X: np.ndarray) -> np.ndarray:
|
|
67
|
+
"""Return matrix of squared Euclidean distances."""
|
|
68
|
+
sum_sq = np.sum(X ** 2, axis=1, keepdims=True)
|
|
69
|
+
D_sq = sum_sq + sum_sq.T - 2.0 * (X @ X.T)
|
|
70
|
+
np.fill_diagonal(D_sq, 0.0)
|
|
71
|
+
return np.maximum(D_sq, 0.0)
|
|
72
|
+
|
|
73
|
+
def _conditional_probabilities(
|
|
74
|
+
self, D_sq: np.ndarray, sigma: float, i: int
|
|
75
|
+
) -> np.ndarray:
|
|
76
|
+
"""Compute p_{j|i} for a given bandwidth sigma."""
|
|
77
|
+
d = D_sq[i].copy()
|
|
78
|
+
d[i] = np.inf # exclude self
|
|
79
|
+
exp_d = np.exp(-d / (2.0 * sigma ** 2))
|
|
80
|
+
denom = exp_d.sum()
|
|
81
|
+
return exp_d / (denom + 1e-12)
|
|
82
|
+
|
|
83
|
+
def _binary_search_sigma(
|
|
84
|
+
self, D_sq: np.ndarray, i: int, target_perp: float,
|
|
85
|
+
tol: float = 1e-5, max_iter: int = 50
|
|
86
|
+
) -> float:
|
|
87
|
+
"""Find sigma_i such that perplexity(p_{.|i}) == target_perp."""
|
|
88
|
+
sigma_low, sigma_high = 1e-10, 1e5
|
|
89
|
+
sigma = 1.0
|
|
90
|
+
|
|
91
|
+
for _ in range(max_iter):
|
|
92
|
+
p = self._conditional_probabilities(D_sq, sigma, i)
|
|
93
|
+
# Shannon entropy
|
|
94
|
+
p_safe = np.maximum(p, 1e-12)
|
|
95
|
+
H = -np.sum(p_safe * np.log2(p_safe))
|
|
96
|
+
perp = 2.0 ** H
|
|
97
|
+
|
|
98
|
+
if abs(perp - target_perp) < tol:
|
|
99
|
+
break
|
|
100
|
+
if perp < target_perp:
|
|
101
|
+
sigma_low = sigma
|
|
102
|
+
sigma = (sigma + sigma_high) / 2.0
|
|
103
|
+
else:
|
|
104
|
+
sigma_high = sigma
|
|
105
|
+
sigma = (sigma + sigma_low) / 2.0
|
|
106
|
+
|
|
107
|
+
return sigma
|
|
108
|
+
|
|
109
|
+
def _compute_P(self, X: np.ndarray) -> np.ndarray:
|
|
110
|
+
"""Compute symmetric joint probabilities P."""
|
|
111
|
+
n = len(X)
|
|
112
|
+
D_sq = self._pairwise_sq_distances(X)
|
|
113
|
+
P = np.zeros((n, n))
|
|
114
|
+
|
|
115
|
+
for i in range(n):
|
|
116
|
+
sigma = self._binary_search_sigma(D_sq, i, self.perplexity)
|
|
117
|
+
P[i] = self._conditional_probabilities(D_sq, sigma, i)
|
|
118
|
+
|
|
119
|
+
# Symmetrise and normalise
|
|
120
|
+
P = (P + P.T) / (2.0 * n)
|
|
121
|
+
P = np.maximum(P, 1e-12)
|
|
122
|
+
return P
|
|
123
|
+
|
|
124
|
+
# ------------------------------------------------------------------
|
|
125
|
+
# Low-dimensional affinities
|
|
126
|
+
# ------------------------------------------------------------------
|
|
127
|
+
|
|
128
|
+
def _compute_Q(self, Y: np.ndarray) -> tuple:
|
|
129
|
+
"""
|
|
130
|
+
Compute Student-t affinities in the embedding.
|
|
131
|
+
|
|
132
|
+
Returns
|
|
133
|
+
-------
|
|
134
|
+
Q : normalised affinities
|
|
135
|
+
num : unnormalised numerator (needed for gradient)
|
|
136
|
+
"""
|
|
137
|
+
D_sq = self._pairwise_sq_distances(Y)
|
|
138
|
+
num = 1.0 / (1.0 + D_sq)
|
|
139
|
+
np.fill_diagonal(num, 0.0)
|
|
140
|
+
denom = num.sum()
|
|
141
|
+
Q = num / (denom + 1e-12)
|
|
142
|
+
Q = np.maximum(Q, 1e-12)
|
|
143
|
+
return Q, num
|
|
144
|
+
|
|
145
|
+
# ------------------------------------------------------------------
|
|
146
|
+
# Public API
|
|
147
|
+
# ------------------------------------------------------------------
|
|
148
|
+
|
|
149
|
+
def fit_transform(self, X: np.ndarray) -> np.ndarray:
|
|
150
|
+
"""
|
|
151
|
+
Fit t-SNE and return 2-D (or n_components-D) embedding.
|
|
152
|
+
|
|
153
|
+
Parameters
|
|
154
|
+
----------
|
|
155
|
+
X : ndarray of shape (n_samples, n_features)
|
|
156
|
+
|
|
157
|
+
Returns
|
|
158
|
+
-------
|
|
159
|
+
Y : ndarray of shape (n_samples, n_components)
|
|
160
|
+
"""
|
|
161
|
+
rng = np.random.default_rng(self.random_state)
|
|
162
|
+
n = len(X)
|
|
163
|
+
|
|
164
|
+
# Step 1: compute high-dim affinities
|
|
165
|
+
P = self._compute_P(X)
|
|
166
|
+
# Early exaggeration (first 250 iters)
|
|
167
|
+
P_exag = P * 4.0
|
|
168
|
+
|
|
169
|
+
# Step 2: random initialisation of embedding
|
|
170
|
+
Y = rng.standard_normal((n, self.n_components)) * 1e-4
|
|
171
|
+
velocity = np.zeros_like(Y)
|
|
172
|
+
|
|
173
|
+
for t in range(self.n_iter):
|
|
174
|
+
p_use = P_exag if t < 250 else P
|
|
175
|
+
Q, num = self._compute_Q(Y)
|
|
176
|
+
|
|
177
|
+
# Gradient of KL divergence
|
|
178
|
+
PQ_diff = p_use - Q # (n, n)
|
|
179
|
+
grad = np.zeros_like(Y)
|
|
180
|
+
for i in range(n):
|
|
181
|
+
# dC/dY_i = 4 * sum_j (p_ij - q_ij) * (y_i - y_j) * (1 + ||y_i-y_j||^2)^-1
|
|
182
|
+
diff = Y[i] - Y # (n, n_components)
|
|
183
|
+
grad[i] = 4.0 * (PQ_diff[i] * num[i] @ diff.reshape(n, -1)).sum(axis=0) \
|
|
184
|
+
if self.n_components == 1 \
|
|
185
|
+
else 4.0 * np.dot(PQ_diff[i] * num[i], diff)
|
|
186
|
+
|
|
187
|
+
# Momentum update
|
|
188
|
+
velocity = self.momentum * velocity - self.learning_rate * grad
|
|
189
|
+
Y = Y + velocity
|
|
190
|
+
|
|
191
|
+
# Centre embedding
|
|
192
|
+
Y -= Y.mean(axis=0)
|
|
193
|
+
|
|
194
|
+
self.embedding_ = Y
|
|
195
|
+
return Y
|
|
196
|
+
|
|
197
|
+
def fit(self, X: np.ndarray) -> "TSNE":
|
|
198
|
+
"""Fit t-SNE (embedding stored in self.embedding_)."""
|
|
199
|
+
self.fit_transform(X)
|
|
200
|
+
return self
|