createsonline 0.1.26__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- createsonline/__init__.py +46 -0
- createsonline/admin/__init__.py +7 -0
- createsonline/admin/content.py +526 -0
- createsonline/admin/crud.py +805 -0
- createsonline/admin/field_builder.py +559 -0
- createsonline/admin/integration.py +482 -0
- createsonline/admin/interface.py +2562 -0
- createsonline/admin/model_creator.py +513 -0
- createsonline/admin/model_manager.py +388 -0
- createsonline/admin/modern_dashboard.py +498 -0
- createsonline/admin/permissions.py +264 -0
- createsonline/admin/user_forms.py +594 -0
- createsonline/ai/__init__.py +202 -0
- createsonline/ai/fields.py +1226 -0
- createsonline/ai/orm.py +325 -0
- createsonline/ai/services.py +1244 -0
- createsonline/app.py +506 -0
- createsonline/auth/__init__.py +8 -0
- createsonline/auth/management.py +228 -0
- createsonline/auth/models.py +552 -0
- createsonline/cli/__init__.py +5 -0
- createsonline/cli/commands/__init__.py +122 -0
- createsonline/cli/commands/database.py +416 -0
- createsonline/cli/commands/info.py +173 -0
- createsonline/cli/commands/initdb.py +218 -0
- createsonline/cli/commands/project.py +545 -0
- createsonline/cli/commands/serve.py +173 -0
- createsonline/cli/commands/shell.py +93 -0
- createsonline/cli/commands/users.py +148 -0
- createsonline/cli/main.py +2041 -0
- createsonline/cli/manage.py +274 -0
- createsonline/config/__init__.py +9 -0
- createsonline/config/app.py +2577 -0
- createsonline/config/database.py +179 -0
- createsonline/config/docs.py +384 -0
- createsonline/config/errors.py +160 -0
- createsonline/config/orm.py +43 -0
- createsonline/config/request.py +93 -0
- createsonline/config/settings.py +176 -0
- createsonline/data/__init__.py +23 -0
- createsonline/data/dataframe.py +925 -0
- createsonline/data/io.py +453 -0
- createsonline/data/series.py +557 -0
- createsonline/database/__init__.py +60 -0
- createsonline/database/abstraction.py +440 -0
- createsonline/database/assistant.py +585 -0
- createsonline/database/fields.py +442 -0
- createsonline/database/migrations.py +132 -0
- createsonline/database/models.py +604 -0
- createsonline/database.py +438 -0
- createsonline/http/__init__.py +28 -0
- createsonline/http/client.py +535 -0
- createsonline/ml/__init__.py +55 -0
- createsonline/ml/classification.py +552 -0
- createsonline/ml/clustering.py +680 -0
- createsonline/ml/metrics.py +542 -0
- createsonline/ml/neural.py +560 -0
- createsonline/ml/preprocessing.py +784 -0
- createsonline/ml/regression.py +501 -0
- createsonline/performance/__init__.py +19 -0
- createsonline/performance/cache.py +444 -0
- createsonline/performance/compression.py +335 -0
- createsonline/performance/core.py +419 -0
- createsonline/project_init.py +789 -0
- createsonline/routing.py +528 -0
- createsonline/security/__init__.py +34 -0
- createsonline/security/core.py +811 -0
- createsonline/security/encryption.py +349 -0
- createsonline/server.py +295 -0
- createsonline/static/css/admin.css +263 -0
- createsonline/static/css/common.css +358 -0
- createsonline/static/css/dashboard.css +89 -0
- createsonline/static/favicon.ico +0 -0
- createsonline/static/icons/icon-128x128.png +0 -0
- createsonline/static/icons/icon-128x128.webp +0 -0
- createsonline/static/icons/icon-16x16.png +0 -0
- createsonline/static/icons/icon-16x16.webp +0 -0
- createsonline/static/icons/icon-180x180.png +0 -0
- createsonline/static/icons/icon-180x180.webp +0 -0
- createsonline/static/icons/icon-192x192.png +0 -0
- createsonline/static/icons/icon-192x192.webp +0 -0
- createsonline/static/icons/icon-256x256.png +0 -0
- createsonline/static/icons/icon-256x256.webp +0 -0
- createsonline/static/icons/icon-32x32.png +0 -0
- createsonline/static/icons/icon-32x32.webp +0 -0
- createsonline/static/icons/icon-384x384.png +0 -0
- createsonline/static/icons/icon-384x384.webp +0 -0
- createsonline/static/icons/icon-48x48.png +0 -0
- createsonline/static/icons/icon-48x48.webp +0 -0
- createsonline/static/icons/icon-512x512.png +0 -0
- createsonline/static/icons/icon-512x512.webp +0 -0
- createsonline/static/icons/icon-64x64.png +0 -0
- createsonline/static/icons/icon-64x64.webp +0 -0
- createsonline/static/image/android-chrome-192x192.png +0 -0
- createsonline/static/image/android-chrome-512x512.png +0 -0
- createsonline/static/image/apple-touch-icon.png +0 -0
- createsonline/static/image/favicon-16x16.png +0 -0
- createsonline/static/image/favicon-32x32.png +0 -0
- createsonline/static/image/favicon.ico +0 -0
- createsonline/static/image/favicon.svg +17 -0
- createsonline/static/image/icon-128x128.png +0 -0
- createsonline/static/image/icon-128x128.webp +0 -0
- createsonline/static/image/icon-16x16.png +0 -0
- createsonline/static/image/icon-16x16.webp +0 -0
- createsonline/static/image/icon-180x180.png +0 -0
- createsonline/static/image/icon-180x180.webp +0 -0
- createsonline/static/image/icon-192x192.png +0 -0
- createsonline/static/image/icon-192x192.webp +0 -0
- createsonline/static/image/icon-256x256.png +0 -0
- createsonline/static/image/icon-256x256.webp +0 -0
- createsonline/static/image/icon-32x32.png +0 -0
- createsonline/static/image/icon-32x32.webp +0 -0
- createsonline/static/image/icon-384x384.png +0 -0
- createsonline/static/image/icon-384x384.webp +0 -0
- createsonline/static/image/icon-48x48.png +0 -0
- createsonline/static/image/icon-48x48.webp +0 -0
- createsonline/static/image/icon-512x512.png +0 -0
- createsonline/static/image/icon-512x512.webp +0 -0
- createsonline/static/image/icon-64x64.png +0 -0
- createsonline/static/image/icon-64x64.webp +0 -0
- createsonline/static/image/logo-header-h100.png +0 -0
- createsonline/static/image/logo-header-h100.webp +0 -0
- createsonline/static/image/logo-header-h200@2x.png +0 -0
- createsonline/static/image/logo-header-h200@2x.webp +0 -0
- createsonline/static/image/logo.png +0 -0
- createsonline/static/js/admin.js +274 -0
- createsonline/static/site.webmanifest +35 -0
- createsonline/static/templates/admin/base.html +87 -0
- createsonline/static/templates/admin/dashboard.html +217 -0
- createsonline/static/templates/admin/model_form.html +270 -0
- createsonline/static/templates/admin/model_list.html +202 -0
- createsonline/static/test_script.js +15 -0
- createsonline/static/test_styles.css +59 -0
- createsonline/static_files.py +365 -0
- createsonline/templates/404.html +100 -0
- createsonline/templates/admin_login.html +169 -0
- createsonline/templates/base.html +102 -0
- createsonline/templates/index.html +151 -0
- createsonline/templates.py +205 -0
- createsonline/testing.py +322 -0
- createsonline/utils.py +448 -0
- createsonline/validation/__init__.py +49 -0
- createsonline/validation/fields.py +598 -0
- createsonline/validation/models.py +504 -0
- createsonline/validation/validators.py +561 -0
- createsonline/views.py +184 -0
- createsonline-0.1.26.dist-info/METADATA +46 -0
- createsonline-0.1.26.dist-info/RECORD +152 -0
- createsonline-0.1.26.dist-info/WHEEL +5 -0
- createsonline-0.1.26.dist-info/entry_points.txt +2 -0
- createsonline-0.1.26.dist-info/licenses/LICENSE +21 -0
- createsonline-0.1.26.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,680 @@
|
|
|
1
|
+
"""
|
|
2
|
+
CREATESONLINE Clustering Algorithms
|
|
3
|
+
|
|
4
|
+
Pure Python clustering implementations.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import numpy as np
|
|
8
|
+
from typing import Optional, Union, List
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class KMeans:
    """
    K-Means clustering (Lloyd's algorithm).

    Pure Python implementation with numpy for matrix operations.

    Attributes set by ``fit``:
        cluster_centers_: Final centroids, shape (n_clusters, n_features)
        labels_: Index of the nearest final centroid for every training sample
        inertia_: Sum of squared distances of samples to their centroid
        n_iter_: Number of assign/update iterations that were actually run
        fitted: True once ``fit`` has completed
    """

    def __init__(self, n_clusters: int = 8, max_iter: int = 300, tol: float = 1e-4, random_state: Optional[int] = None):
        """
        Initialize K-Means clustering

        Args:
            n_clusters: Number of clusters
            max_iter: Maximum number of iterations
            tol: Tolerance for convergence (total centroid movement per iteration)
            random_state: Random seed for reproducibility
        """
        self.n_clusters = n_clusters
        self.max_iter = max_iter
        self.tol = tol
        self.random_state = random_state

        self.cluster_centers_ = None
        self.labels_ = None
        self.inertia_ = None
        self.n_iter_ = 0
        self.fitted = False

    @staticmethod
    def _as_2d(X: Union[np.ndarray, list]) -> np.ndarray:
        """Return X as an ndarray, treating 1-D input as a single feature column"""
        X = np.asarray(X)
        if X.ndim == 1:
            X = X.reshape(-1, 1)
        return X

    def _distances_to(self, X: np.ndarray, centroids: np.ndarray) -> np.ndarray:
        """Euclidean distance from every sample to every centroid, shape (n_samples, n_centroids)"""
        X = self._as_2d(X)
        distances = np.empty((X.shape[0], len(centroids)))
        # Loop over the (few) centroids, vectorise over the (many) samples.
        for j, centroid in enumerate(centroids):
            distances[:, j] = np.linalg.norm(X - centroid, axis=1)
        return distances

    def _init_centroids(self, X: np.ndarray) -> np.ndarray:
        """
        Pick initial centroids from the training samples.

        Samples are drawn without replacement so that two clusters never start
        on the same row; replacement is only used when more clusters than
        samples are requested. When ``random_state`` is set a private generator
        is used, so the global numpy random state is left untouched.

        Raises:
            ValueError: If X has no samples
        """
        n_samples = X.shape[0]
        if n_samples == 0:
            raise ValueError("Cannot initialize centroids from an empty dataset")

        if self.random_state is None:
            rng = np.random
        else:
            rng = np.random.RandomState(self.random_state)

        with_replacement = self.n_clusters > n_samples
        chosen = rng.choice(n_samples, size=self.n_clusters, replace=with_replacement)
        return X[chosen].astype(float)

    def _assign_clusters(self, X: np.ndarray, centroids: np.ndarray) -> np.ndarray:
        """Assign each point to the nearest centroid (first one wins on ties)"""
        return np.argmin(self._distances_to(X, centroids), axis=1).astype(int)

    def _update_centroids(self, X: np.ndarray, labels: np.ndarray,
                          previous_centroids: Optional[np.ndarray] = None) -> np.ndarray:
        """
        Recompute each centroid as the mean of the samples assigned to it.

        Args:
            X: Training data (n_samples, n_features)
            labels: Current cluster index of every sample
            previous_centroids: Centroids used to produce ``labels``. A cluster
                that received no samples keeps its row from here. When omitted,
                ``self.cluster_centers_`` is used, and zeros if that is unset.

        Returns:
            New centroids (n_clusters, n_features)
        """
        if previous_centroids is None:
            previous_centroids = self.cluster_centers_

        centroids = np.zeros((self.n_clusters, X.shape[1]))
        for k in range(self.n_clusters):
            members = X[labels == k]
            if len(members) > 0:
                centroids[k] = members.mean(axis=0)
            elif previous_centroids is not None:
                # Empty cluster: leave the centroid where it was.
                centroids[k] = previous_centroids[k]
        return centroids

    def _calculate_inertia(self, X: np.ndarray, labels: np.ndarray, centroids: np.ndarray) -> float:
        """Calculate within-cluster sum of squares (inertia)"""
        X = self._as_2d(X)
        residuals = X - np.asarray(centroids)[labels]
        return float(np.sum(residuals ** 2))

    def fit(self, X: Union[np.ndarray, list]) -> 'KMeans':
        """
        Fit K-Means clustering

        Args:
            X: Training data (n_samples, n_features)

        Returns:
            Self for method chaining

        Raises:
            ValueError: If n_clusters is smaller than 1 or X has no samples
        """
        X = self._as_2d(X)
        if self.n_clusters < 1:
            raise ValueError(f"n_clusters must be at least 1, got {self.n_clusters}")

        centroids = self._init_centroids(X)

        n_iter = 0
        for n_iter in range(1, self.max_iter + 1):
            labels = self._assign_clusters(X, centroids)
            new_centroids = self._update_centroids(X, labels, centroids)

            # Total distance moved by all centroids in this iteration
            centroid_shift = np.sum(np.linalg.norm(new_centroids - centroids, axis=1))
            centroids = new_centroids
            if centroid_shift < self.tol:
                break

        # Final assignment so labels_ always matches cluster_centers_, even
        # when max_iter is exhausted or is zero.
        labels = self._assign_clusters(X, centroids)

        self.cluster_centers_ = centroids
        self.labels_ = labels
        self.inertia_ = self._calculate_inertia(X, labels, centroids)
        self.n_iter_ = n_iter
        self.fitted = True

        return self

    def predict(self, X: Union[np.ndarray, list]) -> np.ndarray:
        """
        Predict cluster labels for new data

        Args:
            X: Data to predict (n_samples, n_features)

        Returns:
            Cluster labels (n_samples,)

        Raises:
            RuntimeError: If the model has not been fitted
        """
        if not self.fitted:
            raise RuntimeError("Model must be fitted before making predictions")

        return self._assign_clusters(self._as_2d(X), self.cluster_centers_)

    def fit_predict(self, X: Union[np.ndarray, list]) -> np.ndarray:
        """
        Fit the model and predict cluster labels

        Args:
            X: Training data (n_samples, n_features)

        Returns:
            Cluster labels (n_samples,)
        """
        self.fit(X)
        return self.labels_

    def transform(self, X: Union[np.ndarray, list]) -> np.ndarray:
        """
        Transform data to cluster-distance space

        Args:
            X: Data to transform (n_samples, n_features)

        Returns:
            Distances to each cluster center (n_samples, n_clusters)

        Raises:
            RuntimeError: If the model has not been fitted
        """
        if not self.fitted:
            raise RuntimeError("Model must be fitted before transforming")

        return self._distances_to(self._as_2d(X), self.cluster_centers_)

    def score(self, X: Union[np.ndarray, list]) -> float:
        """
        Calculate the negative inertia (higher is better)

        Args:
            X: Data to score

        Returns:
            Negative inertia

        Raises:
            RuntimeError: If the model has not been fitted
        """
        X = self._as_2d(X)
        labels = self.predict(X)
        return -self._calculate_inertia(X, labels, self.cluster_centers_)
|
|
197
|
+
|
|
198
|
+
|
|
199
|
+
class DBScan:
    """
    DBSCAN (Density-Based Spatial Clustering) implementation

    Points with at least ``min_samples`` neighbours (themselves included)
    within ``eps`` are core points. Clusters are grown outwards from core
    points; points reachable from no core point are labelled -1 (noise).

    Attributes set by ``fit``:
        labels_: Cluster index per sample, -1 for noise
        core_sample_indices_: Integer indices of core points, in discovery order
        fitted: True once ``fit`` has completed
    """

    def __init__(self, eps: float = 0.5, min_samples: int = 5, metric: str = 'euclidean'):
        """
        Initialize DBSCAN clustering

        Args:
            eps: Maximum distance between two samples to be considered neighbors
            min_samples: Minimum number of samples in a neighborhood for a core point
            metric: Distance metric ('euclidean', 'manhattan')
        """
        self.eps = eps
        self.min_samples = min_samples
        self.metric = metric

        self.labels_ = None
        self.core_sample_indices_ = None
        self.fitted = False

    def _distance(self, x1: np.ndarray, x2: np.ndarray) -> Union[float, np.ndarray]:
        """
        Calculate distance between points along the last axis.

        For two 1-D points this is a scalar; inputs broadcast, so a
        (n_samples, n_features) array against a single point yields one
        distance per row.

        Raises:
            ValueError: If ``metric`` is not 'euclidean' or 'manhattan'
        """
        diff = np.atleast_1d(np.asarray(x1) - np.asarray(x2))
        if self.metric == 'euclidean':
            return np.linalg.norm(diff, axis=-1)
        elif self.metric == 'manhattan':
            return np.sum(np.abs(diff), axis=-1)
        else:
            raise ValueError(f"Unknown metric: {self.metric}")

    def _get_neighbors(self, X: np.ndarray, point_idx: int) -> List[int]:
        """Get indices of all samples within eps of the given sample (itself included)"""
        within_eps = self._distance(X, X[point_idx]) <= self.eps
        return np.flatnonzero(within_eps).tolist()

    def fit(self, X: Union[np.ndarray, list]) -> 'DBScan':
        """
        Fit DBSCAN clustering

        Args:
            X: Training data (n_samples, n_features)

        Returns:
            Self for method chaining

        Raises:
            ValueError: If ``metric`` is unknown (raised on the first distance computation)
        """
        X = np.asarray(X)
        if X.ndim == 1:
            X = X.reshape(-1, 1)

        n_samples = X.shape[0]
        labels = np.full(n_samples, -1, dtype=int)  # -1 indicates noise
        visited = np.zeros(n_samples, dtype=bool)
        core_indices = []
        cluster_id = 0

        for point_idx in range(n_samples):
            if visited[point_idx]:
                continue

            visited[point_idx] = True
            neighbors = self._get_neighbors(X, point_idx)

            if len(neighbors) < self.min_samples:
                # Noise for now; may still be claimed as a border point later.
                continue

            # Point is a core point: start a new cluster from it
            core_indices.append(point_idx)
            labels[point_idx] = cluster_id

            # Queue of points to examine. `queued` stops an index from being
            # appended more than once, which keeps the queue O(n_samples).
            seed_set = list(neighbors)
            queued = np.zeros(n_samples, dtype=bool)
            queued[seed_set] = True

            position = 0
            while position < len(seed_set):
                neighbor_idx = seed_set[position]
                position += 1

                if not visited[neighbor_idx]:
                    visited[neighbor_idx] = True
                    neighbor_neighbors = self._get_neighbors(X, neighbor_idx)

                    if len(neighbor_neighbors) >= self.min_samples:
                        # Neighbor is also a core point: its neighborhood joins the cluster
                        core_indices.append(neighbor_idx)
                        for candidate in neighbor_neighbors:
                            if not queued[candidate]:
                                queued[candidate] = True
                                seed_set.append(candidate)

                if labels[neighbor_idx] == -1:  # Not yet assigned to a cluster
                    labels[neighbor_idx] = cluster_id

            cluster_id += 1

        self.labels_ = labels
        # Explicit integer dtype so the result is usable as an index even when empty.
        self.core_sample_indices_ = np.array(core_indices, dtype=int)
        self.fitted = True
        return self

    def fit_predict(self, X: Union[np.ndarray, list]) -> np.ndarray:
        """
        Fit the model and return cluster labels

        Args:
            X: Training data (n_samples, n_features)

        Returns:
            Cluster labels (n_samples,) - -1 indicates noise
        """
        self.fit(X)
        return self.labels_
|
|
317
|
+
|
|
318
|
+
|
|
319
|
+
class AgglomerativeClustering:
    """
    Agglomerative (Hierarchical) Clustering implementation

    Starts with every sample in its own cluster and repeatedly merges the two
    clusters with the smallest linkage distance until ``n_clusters`` remain.

    Attributes set by ``fit``:
        labels_: Cluster index per sample
        n_clusters_: Number of clusters actually produced
        fitted: True once ``fit`` has completed
    """

    def __init__(self, n_clusters: int = 2, linkage: str = 'ward', metric: str = 'euclidean'):
        """
        Initialize Agglomerative Clustering

        Args:
            n_clusters: Number of clusters to find
            linkage: Linkage criterion ('ward', 'complete', 'average', 'single')
            metric: Distance metric ('euclidean', 'manhattan'); not used by 'ward'
        """
        self.n_clusters = n_clusters
        self.linkage = linkage
        self.metric = metric

        self.labels_ = None
        self.n_clusters_ = None
        self.fitted = False

    def _distance(self, x1: np.ndarray, x2: np.ndarray) -> Union[float, np.ndarray]:
        """
        Calculate distance between points along the last axis.

        For two 1-D points this is a scalar; inputs broadcast, so two stacks
        of points can be compared in a single call.

        Raises:
            ValueError: If ``metric`` is not 'euclidean' or 'manhattan'
        """
        diff = np.atleast_1d(np.asarray(x1) - np.asarray(x2))
        if self.metric == 'euclidean':
            return np.linalg.norm(diff, axis=-1)
        elif self.metric == 'manhattan':
            return np.sum(np.abs(diff), axis=-1)
        else:
            raise ValueError(f"Unknown metric: {self.metric}")

    def _cluster_distance(self, cluster1: List[int], cluster2: List[int], X: np.ndarray) -> float:
        """
        Calculate distance between two clusters based on linkage criterion

        Args:
            cluster1: Sample indices of the first cluster
            cluster2: Sample indices of the second cluster
            X: Training data (n_samples, n_features)

        Returns:
            Linkage distance between the two clusters

        Raises:
            ValueError: If ``linkage`` (or, for non-ward linkages, ``metric``) is unknown
        """
        if self.linkage == 'ward':
            # Increase in within-cluster sum of squares caused by the merge.
            # Closed form: n1*n2/(n1+n2) * ||centroid1 - centroid2||^2.
            n1, n2 = len(cluster1), len(cluster2)
            gap = np.mean(X[cluster1], axis=0) - np.mean(X[cluster2], axis=0)
            return float(n1 * n2 / (n1 + n2) * np.dot(gap, gap))

        if self.linkage not in ('single', 'complete', 'average'):
            raise ValueError(f"Unknown linkage: {self.linkage}")

        # All point-to-point distances between the clusters, shape (n1, n2)
        pairwise = self._distance(X[cluster1][:, np.newaxis, :], X[cluster2][np.newaxis, :, :])

        if self.linkage == 'single':
            return float(pairwise.min())
        if self.linkage == 'complete':
            return float(pairwise.max())
        return float(pairwise.mean())

    def fit(self, X: Union[np.ndarray, list]) -> 'AgglomerativeClustering':
        """
        Fit Agglomerative Clustering

        Args:
            X: Training data (n_samples, n_features)

        Returns:
            Self for method chaining

        Raises:
            ValueError: If n_clusters is smaller than 1, or linkage/metric is unknown
        """
        X = np.asarray(X)
        if X.ndim == 1:
            X = X.reshape(-1, 1)

        if self.n_clusters < 1:
            # Without this check the merge loop could never reach its target.
            raise ValueError(f"n_clusters must be at least 1, got {self.n_clusters}")

        n_samples = X.shape[0]

        # Initialize each point as its own cluster
        clusters = [[i] for i in range(n_samples)]

        # Merge clusters until we have the desired number
        while len(clusters) > self.n_clusters:
            best_dist = float('inf')
            best_pair = None

            # Find the two closest clusters
            for i in range(len(clusters)):
                for j in range(i + 1, len(clusters)):
                    dist = self._cluster_distance(clusters[i], clusters[j], X)
                    if dist < best_dist:
                        best_dist = dist
                        best_pair = (i, j)

            if best_pair is None:
                # No comparable pair (e.g. every distance is NaN or inf):
                # stop instead of looping forever.
                break

            keep, absorb = best_pair
            clusters[keep].extend(clusters[absorb])
            clusters.pop(absorb)  # absorb > keep, so `keep` stays valid

        # Assign labels
        labels = np.zeros(n_samples, dtype=int)
        for cluster_id, cluster_points in enumerate(clusters):
            labels[cluster_points] = cluster_id

        self.labels_ = labels
        self.n_clusters_ = len(clusters)
        self.fitted = True
        return self

    def fit_predict(self, X: Union[np.ndarray, list]) -> np.ndarray:
        """
        Fit the model and return cluster labels

        Args:
            X: Training data (n_samples, n_features)

        Returns:
            Cluster labels (n_samples,)
        """
        self.fit(X)
        return self.labels_
|
|
465
|
+
|
|
466
|
+
|
|
467
|
+
class GaussianMixture:
    """
    Gaussian Mixture Model implementation using Expectation-Maximization

    Each component has a weight, a mean and a full covariance matrix.

    Attributes set by ``fit``:
        weights_: Mixing weight of each component (n_components,)
        means_: Component means (n_components, n_features)
        covariances_: Component covariances (n_components, n_features, n_features)
        labels_: Most likely component for every training sample
        n_iter_: Number of EM iterations that were run
        converged_: True if the log-likelihood change dropped below ``tol``
        fitted: True once ``fit`` has completed
    """

    def __init__(self, n_components: int = 1, max_iter: int = 100, tol: float = 1e-3, random_state: Optional[int] = None):
        """
        Initialize Gaussian Mixture Model

        Args:
            n_components: Number of mixture components
            max_iter: Maximum number of EM iterations
            tol: Tolerance for convergence (change in total log-likelihood)
            random_state: Random seed for reproducibility
        """
        self.n_components = n_components
        self.max_iter = max_iter
        self.tol = tol
        self.random_state = random_state

        self.weights_ = None
        self.means_ = None
        self.covariances_ = None
        self.labels_ = None
        self.n_iter_ = 0
        self.converged_ = False
        self.fitted = False

    @staticmethod
    def _as_2d(X: Union[np.ndarray, list]) -> np.ndarray:
        """Return X as an ndarray, treating 1-D input as a single feature column"""
        X = np.asarray(X)
        if X.ndim == 1:
            X = X.reshape(-1, 1)
        return X

    def _initialize_parameters(self, X: np.ndarray):
        """
        Initialize GMM parameters.

        Weights are uniform, covariances are identity matrices and means are
        training samples drawn without replacement (with replacement only when
        there are fewer samples than components). When ``random_state`` is set
        a private generator is used, so the global numpy random state is left
        untouched.
        """
        n_samples, n_features = X.shape

        if self.random_state is None:
            rng = np.random
        else:
            rng = np.random.RandomState(self.random_state)

        with_replacement = self.n_components > n_samples
        chosen = rng.choice(n_samples, size=self.n_components, replace=with_replacement)

        self.weights_ = np.ones(self.n_components) / self.n_components
        self.means_ = X[chosen].astype(float)
        self.covariances_ = np.array([np.eye(n_features) for _ in range(self.n_components)])

    def _multivariate_gaussian(self, X: np.ndarray, mean: np.ndarray, cov: np.ndarray) -> np.ndarray:
        """
        Calculate multivariate Gaussian probability density

        Args:
            X: Samples (n_samples, n_features)
            mean: Component mean (n_features,)
            cov: Component covariance (n_features, n_features)

        Returns:
            Density of every sample under the component (n_samples,)
        """
        n_features = X.shape[1]

        # Add small regularization to diagonal for numerical stability
        cov_reg = cov + 1e-6 * np.eye(n_features)

        try:
            cov_inv = np.linalg.inv(cov_reg)
            cov_det = np.linalg.det(cov_reg)
        except np.linalg.LinAlgError:
            # Singular covariance: fall back to a unit spherical Gaussian
            cov_inv = np.eye(n_features)
            cov_det = 1.0

        if cov_det <= 0:
            cov_det = 1e-6

        diff = X - mean
        # Row-wise quadratic form diff_i^T * cov_inv * diff_i
        exponent = -0.5 * np.sum((diff @ cov_inv) * diff, axis=1)
        normalization = 1.0 / np.sqrt((2 * np.pi) ** n_features * cov_det)

        return normalization * np.exp(exponent)

    def _weighted_densities(self, X: np.ndarray) -> np.ndarray:
        """Weight times density of every sample under every component (n_samples, n_components)"""
        weighted = np.zeros((X.shape[0], self.n_components))
        for k in range(self.n_components):
            weighted[:, k] = self.weights_[k] * self._multivariate_gaussian(
                X, self.means_[k], self.covariances_[k]
            )
        return weighted

    def _e_step(self, X: np.ndarray):
        """
        Compute responsibilities and the total log-likelihood of X.

        Returns:
            (responsibilities, log_likelihood); each responsibility row sums to 1
            unless every component density underflowed to zero.
        """
        weighted = self._weighted_densities(X)
        totals = np.sum(weighted, axis=1, keepdims=True)
        totals[totals == 0] = 1e-15  # Avoid division by zero / log(0)

        # The likelihood must come from the unnormalized mixture density:
        # after normalization every row sums to 1 and carries no information.
        log_likelihood = float(np.sum(np.log(totals)))
        return weighted / totals, log_likelihood

    def fit(self, X: Union[np.ndarray, list]) -> 'GaussianMixture':
        """
        Fit Gaussian Mixture Model using EM algorithm

        Args:
            X: Training data (n_samples, n_features)

        Returns:
            Self for method chaining

        Raises:
            ValueError: If n_components is smaller than 1 or X has no samples
        """
        X = self._as_2d(X)
        if self.n_components < 1:
            raise ValueError(f"n_components must be at least 1, got {self.n_components}")

        n_samples, n_features = X.shape
        if n_samples == 0:
            raise ValueError("Cannot fit a mixture model on an empty dataset")

        self._initialize_parameters(X)

        ridge = 1e-6 * np.eye(n_features)
        prev_log_likelihood = -np.inf
        self.n_iter_ = 0
        self.converged_ = False

        for iteration in range(1, self.max_iter + 1):
            self.n_iter_ = iteration

            # E-step: responsibilities under the current parameters
            responsibilities, log_likelihood = self._e_step(X)

            # Stop before updating, so the parameters stay the ones just evaluated
            if abs(log_likelihood - prev_log_likelihood) < self.tol:
                self.converged_ = True
                break
            prev_log_likelihood = log_likelihood

            # M-step: re-estimate weights, means and covariances
            N_k = np.sum(responsibilities, axis=0)
            self.weights_ = N_k / n_samples

            for k in range(self.n_components):
                if N_k[k] > 0:
                    resp_k = responsibilities[:, k:k + 1]
                    self.means_[k] = np.sum(resp_k * X, axis=0) / N_k[k]

                    diff = X - self.means_[k]
                    # Ridge keeps the covariance invertible when a component collapses
                    self.covariances_[k] = ((resp_k * diff).T @ diff) / N_k[k] + ridge

        # Labels from the final parameters (also covers max_iter == 0)
        self.labels_ = np.argmax(self._weighted_densities(X), axis=1)
        self.fitted = True

        return self

    def predict(self, X: Union[np.ndarray, list]) -> np.ndarray:
        """
        Predict cluster labels for new data

        Args:
            X: Data to predict (n_samples, n_features)

        Returns:
            Cluster labels (n_samples,)

        Raises:
            RuntimeError: If the model has not been fitted
        """
        if not self.fitted:
            raise RuntimeError("Model must be fitted before making predictions")

        return np.argmax(self._weighted_densities(self._as_2d(X)), axis=1)

    def predict_proba(self, X: Union[np.ndarray, list]) -> np.ndarray:
        """
        Predict cluster probabilities for new data

        Args:
            X: Data to predict (n_samples, n_features)

        Returns:
            Cluster probabilities (n_samples, n_components)

        Raises:
            RuntimeError: If the model has not been fitted
        """
        if not self.fitted:
            raise RuntimeError("Model must be fitted before making predictions")

        responsibilities, _ = self._e_step(self._as_2d(X))
        return responsibilities

    def fit_predict(self, X: Union[np.ndarray, list]) -> np.ndarray:
        """
        Fit the model and return cluster labels

        Args:
            X: Training data (n_samples, n_features)

        Returns:
            Cluster labels (n_samples,)
        """
        self.fit(X)
        return self.labels_
|