ilovetools 0.2.2__tar.gz → 0.2.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ilovetools-0.2.2/ilovetools.egg-info → ilovetools-0.2.3}/PKG-INFO +1 -1
- {ilovetools-0.2.2 → ilovetools-0.2.3}/ilovetools/__init__.py +1 -1
- {ilovetools-0.2.2 → ilovetools-0.2.3}/ilovetools/ml/__init__.py +67 -0
- ilovetools-0.2.3/ilovetools/ml/clustering.py +1107 -0
- {ilovetools-0.2.2 → ilovetools-0.2.3/ilovetools.egg-info}/PKG-INFO +1 -1
- {ilovetools-0.2.2 → ilovetools-0.2.3}/ilovetools.egg-info/SOURCES.txt +1 -0
- {ilovetools-0.2.2 → ilovetools-0.2.3}/pyproject.toml +1 -1
- {ilovetools-0.2.2 → ilovetools-0.2.3}/LICENSE +0 -0
- {ilovetools-0.2.2 → ilovetools-0.2.3}/MANIFEST.in +0 -0
- {ilovetools-0.2.2 → ilovetools-0.2.3}/README.md +0 -0
- {ilovetools-0.2.2 → ilovetools-0.2.3}/ilovetools/ai/__init__.py +0 -0
- {ilovetools-0.2.2 → ilovetools-0.2.3}/ilovetools/ai/embeddings.py +0 -0
- {ilovetools-0.2.2 → ilovetools-0.2.3}/ilovetools/ai/inference.py +0 -0
- {ilovetools-0.2.2 → ilovetools-0.2.3}/ilovetools/ai/llm_helpers.py +0 -0
- {ilovetools-0.2.2 → ilovetools-0.2.3}/ilovetools/audio/__init__.py +0 -0
- {ilovetools-0.2.2 → ilovetools-0.2.3}/ilovetools/automation/__init__.py +0 -0
- {ilovetools-0.2.2 → ilovetools-0.2.3}/ilovetools/conversion/__init__.py +0 -0
- {ilovetools-0.2.2 → ilovetools-0.2.3}/ilovetools/data/__init__.py +0 -0
- {ilovetools-0.2.2 → ilovetools-0.2.3}/ilovetools/data/feature_engineering.py +0 -0
- {ilovetools-0.2.2 → ilovetools-0.2.3}/ilovetools/data/preprocessing.py +0 -0
- {ilovetools-0.2.2 → ilovetools-0.2.3}/ilovetools/database/__init__.py +0 -0
- {ilovetools-0.2.2 → ilovetools-0.2.3}/ilovetools/datetime/__init__.py +0 -0
- {ilovetools-0.2.2 → ilovetools-0.2.3}/ilovetools/files/__init__.py +0 -0
- {ilovetools-0.2.2 → ilovetools-0.2.3}/ilovetools/image/__init__.py +0 -0
- {ilovetools-0.2.2 → ilovetools-0.2.3}/ilovetools/ml/cross_validation.py +0 -0
- {ilovetools-0.2.2 → ilovetools-0.2.3}/ilovetools/ml/dimensionality.py +0 -0
- {ilovetools-0.2.2 → ilovetools-0.2.3}/ilovetools/ml/ensemble.py +0 -0
- {ilovetools-0.2.2 → ilovetools-0.2.3}/ilovetools/ml/feature_selection.py +0 -0
- {ilovetools-0.2.2 → ilovetools-0.2.3}/ilovetools/ml/imbalanced.py +0 -0
- {ilovetools-0.2.2 → ilovetools-0.2.3}/ilovetools/ml/interpretation.py +0 -0
- {ilovetools-0.2.2 → ilovetools-0.2.3}/ilovetools/ml/metrics.py +0 -0
- {ilovetools-0.2.2 → ilovetools-0.2.3}/ilovetools/ml/pipeline.py +0 -0
- {ilovetools-0.2.2 → ilovetools-0.2.3}/ilovetools/ml/timeseries.py +0 -0
- {ilovetools-0.2.2 → ilovetools-0.2.3}/ilovetools/ml/tuning.py +0 -0
- {ilovetools-0.2.2 → ilovetools-0.2.3}/ilovetools/security/__init__.py +0 -0
- {ilovetools-0.2.2 → ilovetools-0.2.3}/ilovetools/text/__init__.py +0 -0
- {ilovetools-0.2.2 → ilovetools-0.2.3}/ilovetools/utils/__init__.py +0 -0
- {ilovetools-0.2.2 → ilovetools-0.2.3}/ilovetools/validation/__init__.py +0 -0
- {ilovetools-0.2.2 → ilovetools-0.2.3}/ilovetools/web/__init__.py +0 -0
- {ilovetools-0.2.2 → ilovetools-0.2.3}/ilovetools.egg-info/dependency_links.txt +0 -0
- {ilovetools-0.2.2 → ilovetools-0.2.3}/ilovetools.egg-info/top_level.txt +0 -0
- {ilovetools-0.2.2 → ilovetools-0.2.3}/requirements.txt +0 -0
- {ilovetools-0.2.2 → ilovetools-0.2.3}/setup.cfg +0 -0
- {ilovetools-0.2.2 → ilovetools-0.2.3}/setup.py +0 -0
- {ilovetools-0.2.2 → ilovetools-0.2.3}/tests/__init__.py +0 -0
PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ilovetools
-Version: 0.2.2
+Version: 0.2.3
 Summary: A comprehensive Python utility library with modular tools for AI/ML, data processing, and daily programming needs
 Home-page: https://github.com/AliMehdi512/ilovetools
 Author: Ali Mehdi
ilovetools/ml/__init__.py

@@ -282,6 +282,41 @@ from .timeseries import (
     forecast_acc,
 )
 
+from .clustering import (
+    # Full names
+    kmeans_clustering,
+    hierarchical_clustering,
+    dbscan_clustering,
+    elbow_method,
+    silhouette_score,
+    euclidean_distance,
+    manhattan_distance,
+    cosine_similarity_distance,
+    initialize_centroids,
+    assign_clusters,
+    update_centroids,
+    calculate_inertia,
+    dendrogram_data,
+    cluster_purity,
+    davies_bouldin_index,
+    # Abbreviated aliases
+    kmeans,
+    hierarchical,
+    dbscan,
+    elbow,
+    silhouette,
+    euclidean,
+    manhattan,
+    cosine_dist,
+    init_centroids,
+    assign,
+    update,
+    inertia,
+    dendrogram,
+    purity,
+    davies_bouldin,
+)
+
 __all__ = [
     # Metrics (full names)
     'accuracy_score',

@@ -533,4 +568,36 @@ __all__ = [
     'lag',
     'ts_cv',
     'forecast_acc',
+    # Clustering (full names)
+    'kmeans_clustering',
+    'hierarchical_clustering',
+    'dbscan_clustering',
+    'elbow_method',
+    'silhouette_score',
+    'euclidean_distance',
+    'manhattan_distance',
+    'cosine_similarity_distance',
+    'initialize_centroids',
+    'assign_clusters',
+    'update_centroids',
+    'calculate_inertia',
+    'dendrogram_data',
+    'cluster_purity',
+    'davies_bouldin_index',
+    # Clustering (aliases)
+    'kmeans',
+    'hierarchical',
+    'dbscan',
+    'elbow',
+    'silhouette',
+    'euclidean',
+    'manhattan',
+    'cosine_dist',
+    'init_centroids',
+    'assign',
+    'update',
+    'inertia',
+    'dendrogram',
+    'purity',
+    'davies_bouldin',
 ]
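The re-exports above make each clustering helper importable from `ilovetools.ml` under both its full name and its short alias, which resolve to the same function object. A minimal usage sketch, assuming ilovetools 0.2.3 is installed:

    # Minimal sketch, assuming ilovetools 0.2.3 is installed.
    from ilovetools.ml import kmeans_clustering, kmeans

    assert kmeans is kmeans_clustering  # the alias is the same object, not a wrapper

    # Exact label values depend on the random centroid initialization, so they
    # may come out as [0, 0, 1, 1] or [1, 1, 0, 0] between runs.
    result = kmeans([[1, 2], [2, 3], [8, 9], [9, 10]], k=2)
    print(result['labels'])
    print(result['inertia'])  # within-cluster sum of squares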
ilovetools/ml/clustering.py (new file)

@@ -0,0 +1,1107 @@
"""
Clustering algorithm utilities.

Each function has TWO names: a full descriptive name plus an abbreviated alias.
"""

from typing import Any, Dict, List
import math
import random

__all__ = [
    # Full names
    'kmeans_clustering',
    'hierarchical_clustering',
    'dbscan_clustering',
    'elbow_method',
    'silhouette_score',
    'euclidean_distance',
    'manhattan_distance',
    'cosine_similarity_distance',
    'initialize_centroids',
    'assign_clusters',
    'update_centroids',
    'calculate_inertia',
    'dendrogram_data',
    'cluster_purity',
    'davies_bouldin_index',
    # Abbreviated aliases
    'kmeans',
    'hierarchical',
    'dbscan',
    'elbow',
    'silhouette',
    'euclidean',
    'manhattan',
    'cosine_dist',
    'init_centroids',
    'assign',
    'update',
    'inertia',
    'dendrogram',
    'purity',
    'davies_bouldin',
]


def euclidean_distance(
    point1: List[float],
    point2: List[float]
) -> float:
    """
    Calculate the Euclidean distance between two points.

    Alias: euclidean()

    Args:
        point1: First point coordinates
        point2: Second point coordinates

    Returns:
        float: Euclidean distance

    Examples:
        >>> from ilovetools.ml import euclidean  # Short alias

        >>> p1 = [1, 2, 3]
        >>> p2 = [4, 5, 6]
        >>> dist = euclidean(p1, p2)
        >>> print(round(dist, 2))
        5.2

        >>> from ilovetools.ml import euclidean_distance  # Full name
        >>> dist = euclidean_distance(p1, p2)

    Notes:
        - Most common distance metric
        - Straight-line distance
        - Sensitive to scale
        - Works in any dimension
    """
    if len(point1) != len(point2):
        raise ValueError("Points must have same dimensions")

    return math.sqrt(sum((a - b) ** 2 for a, b in zip(point1, point2)))


# Create alias
euclidean = euclidean_distance


def manhattan_distance(
    point1: List[float],
    point2: List[float]
) -> float:
    """
    Calculate the Manhattan (L1) distance between two points.

    Alias: manhattan()

    Args:
        point1: First point coordinates
        point2: Second point coordinates

    Returns:
        float: Manhattan distance

    Examples:
        >>> from ilovetools.ml import manhattan  # Short alias

        >>> p1 = [1, 2, 3]
        >>> p2 = [4, 5, 6]
        >>> dist = manhattan(p1, p2)
        >>> print(dist)
        9.0

        >>> from ilovetools.ml import manhattan_distance  # Full name
        >>> dist = manhattan_distance(p1, p2)

    Notes:
        - Grid-based distance
        - Sum of absolute differences
        - Less sensitive to outliers
        - Good for high dimensions
    """
    if len(point1) != len(point2):
        raise ValueError("Points must have same dimensions")

    return float(sum(abs(a - b) for a, b in zip(point1, point2)))


# Create alias
manhattan = manhattan_distance


def cosine_similarity_distance(
    point1: List[float],
    point2: List[float]
) -> float:
    """
    Calculate the cosine distance between two points.

    Alias: cosine_dist()

    Args:
        point1: First point coordinates
        point2: Second point coordinates

    Returns:
        float: Cosine distance (1 - cosine similarity)

    Examples:
        >>> from ilovetools.ml import cosine_dist  # Short alias

        >>> p1 = [1, 2, 3]
        >>> p2 = [2, 4, 6]
        >>> dist = cosine_dist(p1, p2)
        >>> print(round(dist, 4))
        0.0

        >>> from ilovetools.ml import cosine_similarity_distance  # Full name
        >>> dist = cosine_similarity_distance(p1, p2)

    Notes:
        - Measures the angle between vectors
        - Range: 0 to 2
        - Good for text/sparse data
        - Ignores magnitude
    """
    if len(point1) != len(point2):
        raise ValueError("Points must have same dimensions")

    # Dot product and vector magnitudes
    dot_product = sum(a * b for a, b in zip(point1, point2))
    mag1 = math.sqrt(sum(a ** 2 for a in point1))
    mag2 = math.sqrt(sum(b ** 2 for b in point2))

    # A zero vector has no direction; treat it as maximally dissimilar
    if mag1 == 0 or mag2 == 0:
        return 1.0

    # Cosine similarity, then cosine distance
    similarity = dot_product / (mag1 * mag2)
    return 1 - similarity


# Create alias
cosine_dist = cosine_similarity_distance


def initialize_centroids(
    data: List[List[float]],
    k: int,
    method: str = 'random'
) -> List[List[float]]:
    """
    Initialize cluster centroids.

    Alias: init_centroids()

    Args:
        data: Dataset
        k: Number of clusters
        method: 'random' or 'kmeans++'

    Returns:
        list: Initial centroids

    Examples:
        >>> from ilovetools.ml import init_centroids  # Short alias

        >>> data = [[1, 2], [2, 3], [3, 4], [8, 9], [9, 10]]
        >>> centroids = init_centroids(data, k=2)
        >>> print(len(centroids))
        2

        >>> from ilovetools.ml import initialize_centroids  # Full name
        >>> centroids = initialize_centroids(data, k=2)

    Notes:
        - Random: pick k random points
        - KMeans++: spread initial centroids apart
        - Affects convergence speed
        - Critical for K-Means
    """
    if k <= 0 or k > len(data):
        raise ValueError("k must be between 1 and the number of data points")

    if method == 'random':
        # Random initialization: k distinct points from the data
        indices = random.sample(range(len(data)), k)
        return [data[i][:] for i in indices]

    elif method == 'kmeans++':
        centroids = []

        # Choose the first centroid uniformly at random
        centroids.append(data[random.randint(0, len(data) - 1)][:])

        # Choose each remaining centroid with probability proportional to the
        # squared distance from the nearest existing centroid
        for _ in range(k - 1):
            distances = []
            for point in data:
                min_dist = min(euclidean_distance(point, c) for c in centroids)
                distances.append(min_dist ** 2)

            total = sum(distances)
            if total == 0:
                # Every point coincides with an existing centroid; fall back to
                # any unchosen point, else reuse a random point
                remaining = [p for p in data if p not in centroids]
                if remaining:
                    centroids.append(remaining[0][:])
                else:
                    centroids.append(data[random.randint(0, len(data) - 1)][:])
            else:
                probs = [d / total for d in distances]
                cumsum = []
                total_prob = 0.0
                for p in probs:
                    total_prob += p
                    cumsum.append(total_prob)

                r = random.random()
                for i, cum_p in enumerate(cumsum):
                    if r <= cum_p:
                        centroids.append(data[i][:])
                        break
                else:
                    # Guard against floating-point rounding in the cumulative sum
                    centroids.append(data[-1][:])

        return centroids

    else:
        raise ValueError("Method must be 'random' or 'kmeans++'")


# Create alias
init_centroids = initialize_centroids


def assign_clusters(
    data: List[List[float]],
    centroids: List[List[float]],
    distance_metric: str = 'euclidean'
) -> List[int]:
    """
    Assign each point to its nearest centroid.

    Alias: assign()

    Args:
        data: Dataset
        centroids: Cluster centroids
        distance_metric: 'euclidean', 'manhattan', or 'cosine'

    Returns:
        list: Cluster assignments

    Examples:
        >>> from ilovetools.ml import assign  # Short alias

        >>> data = [[1, 2], [2, 3], [8, 9], [9, 10]]
        >>> centroids = [[1.5, 2.5], [8.5, 9.5]]
        >>> labels = assign(data, centroids)
        >>> print(labels)
        [0, 0, 1, 1]

        >>> from ilovetools.ml import assign_clusters  # Full name
        >>> labels = assign_clusters(data, centroids)

    Notes:
        - Assigns each point to the nearest centroid
        - Uses the specified distance metric
        - Core step in K-Means
        - Fast operation
    """
    # Choose distance function
    if distance_metric == 'euclidean':
        dist_func = euclidean_distance
    elif distance_metric == 'manhattan':
        dist_func = manhattan_distance
    elif distance_metric == 'cosine':
        dist_func = cosine_similarity_distance
    else:
        raise ValueError("Invalid distance metric")

    labels = []
    for point in data:
        # Index of the nearest centroid
        distances = [dist_func(point, c) for c in centroids]
        labels.append(distances.index(min(distances)))

    return labels


# Create alias
assign = assign_clusters


def update_centroids(
    data: List[List[float]],
    labels: List[int],
    k: int
) -> List[List[float]]:
    """
    Update centroids based on the current cluster assignments.

    Alias: update()

    Args:
        data: Dataset
        labels: Cluster assignments
        k: Number of clusters

    Returns:
        list: Updated centroids

    Examples:
        >>> from ilovetools.ml import update  # Short alias

        >>> data = [[1, 2], [2, 3], [8, 9], [9, 10]]
        >>> labels = [0, 0, 1, 1]
        >>> centroids = update(data, labels, k=2)
        >>> print(len(centroids))
        2

        >>> from ilovetools.ml import update_centroids  # Full name
        >>> centroids = update_centroids(data, labels, k=2)

    Notes:
        - Sets each centroid to the mean of its cluster
        - Core step in K-Means
        - Moves centroids toward cluster centers
        - Part of an iterative process
    """
    dimensions = len(data[0])
    centroids = []

    for cluster_id in range(k):
        # Points currently assigned to this cluster
        cluster_points = [data[i] for i in range(len(data)) if labels[i] == cluster_id]

        if not cluster_points:
            # Empty cluster: re-seed with a random data point
            centroids.append(data[random.randint(0, len(data) - 1)][:])
        else:
            # Component-wise mean of the cluster's points
            centroid = []
            for dim in range(dimensions):
                mean = sum(point[dim] for point in cluster_points) / len(cluster_points)
                centroid.append(mean)
            centroids.append(centroid)

    return centroids


# Create alias
update = update_centroids


def calculate_inertia(
    data: List[List[float]],
    labels: List[int],
    centroids: List[List[float]]
) -> float:
    """
    Calculate the within-cluster sum of squares (inertia).

    Alias: inertia()

    Args:
        data: Dataset
        labels: Cluster assignments
        centroids: Cluster centroids

    Returns:
        float: Inertia value

    Examples:
        >>> from ilovetools.ml import inertia  # Short alias

        >>> data = [[1, 2], [2, 3], [8, 9], [9, 10]]
        >>> labels = [0, 0, 1, 1]
        >>> centroids = [[1.5, 2.5], [8.5, 9.5]]
        >>> score = inertia(data, labels, centroids)
        >>> print(round(score, 2))
        2.0

        >>> from ilovetools.ml import calculate_inertia  # Full name
        >>> score = calculate_inertia(data, labels, centroids)

    Notes:
        - Lower is better
        - Measures cluster compactness
        - Used in the elbow method
        - Always decreases as K grows
    """
    total = 0.0
    for i, point in enumerate(data):
        # Squared distance from each point to its assigned centroid
        dist = euclidean_distance(point, centroids[labels[i]])
        total += dist ** 2

    return total


# Create alias
inertia = calculate_inertia


def kmeans_clustering(
    data: List[List[float]],
    k: int,
    max_iterations: int = 100,
    distance_metric: str = 'euclidean',
    init_method: str = 'kmeans++'
) -> Dict[str, Any]:
    """
    K-Means clustering algorithm.

    Alias: kmeans()

    Args:
        data: Dataset
        k: Number of clusters
        max_iterations: Maximum number of iterations
        distance_metric: Distance metric to use
        init_method: Centroid initialization method

    Returns:
        dict: Clustering results

    Examples:
        >>> from ilovetools.ml import kmeans  # Short alias

        >>> data = [[1, 2], [2, 3], [3, 4], [8, 9], [9, 10], [10, 11]]
        >>> result = kmeans(data, k=2)
        >>> print(len(result['labels']))
        6
        >>> print(len(result['centroids']))
        2

        >>> from ilovetools.ml import kmeans_clustering  # Full name
        >>> result = kmeans_clustering(data, k=2)

    Notes:
        - Most popular clustering algorithm
        - Fast and scalable
        - Requires K to be specified
        - Sensitive to initialization
    """
    if max_iterations < 1:
        raise ValueError("max_iterations must be at least 1")

    # Initialize centroids
    centroids = initialize_centroids(data, k, method=init_method)

    for iteration in range(max_iterations):
        # Alternate the two K-Means steps: assign, then re-center
        labels = assign_clusters(data, centroids, distance_metric)
        new_centroids = update_centroids(data, labels, k)

        # Converged when no centroid moved more than a small tolerance
        converged = True
        for old, new in zip(centroids, new_centroids):
            if euclidean_distance(old, new) > 1e-6:
                converged = False
                break

        centroids = new_centroids

        if converged:
            break

    inertia_value = calculate_inertia(data, labels, centroids)

    return {
        'labels': labels,
        'centroids': centroids,
        'inertia': inertia_value,
        'iterations': iteration + 1,
    }


# Create alias
kmeans = kmeans_clustering


def elbow_method(
    data: List[List[float]],
    max_k: int = 10,
    distance_metric: str = 'euclidean'
) -> Dict[str, Any]:
    """
    Elbow method for finding an optimal K.

    Alias: elbow()

    Args:
        data: Dataset
        max_k: Maximum K to try
        distance_metric: Distance metric to use

    Returns:
        dict: Inertia values for each K

    Examples:
        >>> from ilovetools.ml import elbow  # Short alias

        >>> data = [[1, 2], [2, 3], [3, 4], [8, 9], [9, 10], [10, 11]]
        >>> result = elbow(data, max_k=4)
        >>> print(len(result['k_values']))
        4
        >>> print(len(result['inertias']))
        4

        >>> from ilovetools.ml import elbow_method  # Full name
        >>> result = elbow_method(data, max_k=4)

    Notes:
        - Helps choose K
        - Plot inertia vs. K
        - Look for the elbow point
        - Interpretation is subjective
    """
    k_values = list(range(1, max_k + 1))
    inertias = []

    for k in k_values:
        if k > len(data):
            break
        result = kmeans_clustering(data, k, distance_metric=distance_metric)
        inertias.append(result['inertia'])

    return {
        'k_values': k_values[:len(inertias)],
        'inertias': inertias,
    }


# Create alias
elbow = elbow_method


def silhouette_score(
    data: List[List[float]],
    labels: List[int],
    distance_metric: str = 'euclidean'
) -> float:
    """
    Calculate the silhouette score for a clustering.

    Alias: silhouette()

    Args:
        data: Dataset
        labels: Cluster assignments
        distance_metric: Distance metric to use

    Returns:
        float: Silhouette score (-1 to 1)

    Examples:
        >>> from ilovetools.ml import silhouette  # Short alias

        >>> data = [[1, 2], [2, 3], [8, 9], [9, 10]]
        >>> labels = [0, 0, 1, 1]
        >>> score = silhouette(data, labels)
        >>> print(round(score, 2))
        0.86

        >>> from ilovetools.ml import silhouette_score  # Full name
        >>> score = silhouette_score(data, labels)

    Notes:
        - Range: -1 to 1
        - Higher is better
        - Measures cluster quality
        - Considers both separation and cohesion
    """
    # Choose distance function
    if distance_metric == 'euclidean':
        dist_func = euclidean_distance
    elif distance_metric == 'manhattan':
        dist_func = manhattan_distance
    elif distance_metric == 'cosine':
        dist_func = cosine_similarity_distance
    else:
        raise ValueError("Invalid distance metric")

    n = len(data)
    silhouette_values = []

    for i in range(n):
        cluster_i = labels[i]

        # a(i): mean distance to the other points in the same cluster
        same_cluster = [j for j in range(n) if labels[j] == cluster_i and j != i]
        if not same_cluster:
            silhouette_values.append(0)
            continue

        a_i = sum(dist_func(data[i], data[j]) for j in same_cluster) / len(same_cluster)

        # b(i): mean distance to the points of the nearest other cluster
        unique_clusters = set(labels)
        unique_clusters.discard(cluster_i)

        if not unique_clusters:
            silhouette_values.append(0)
            continue

        b_i = float('inf')
        for cluster_j in unique_clusters:
            other_cluster = [j for j in range(n) if labels[j] == cluster_j]
            mean_dist = sum(dist_func(data[i], data[j]) for j in other_cluster) / len(other_cluster)
            b_i = min(b_i, mean_dist)

        # Silhouette value for point i
        s_i = (b_i - a_i) / max(a_i, b_i)
        silhouette_values.append(s_i)

    return sum(silhouette_values) / len(silhouette_values)


# Create alias
silhouette = silhouette_score


def hierarchical_clustering(
    data: List[List[float]],
    n_clusters: int,
    linkage: str = 'average',
    distance_metric: str = 'euclidean'
) -> Dict[str, Any]:
    """
    Hierarchical (agglomerative) clustering.

    Alias: hierarchical()

    Args:
        data: Dataset
        n_clusters: Number of clusters
        linkage: 'single', 'complete', or 'average'
        distance_metric: Distance metric to use

    Returns:
        dict: Clustering results

    Examples:
        >>> from ilovetools.ml import hierarchical  # Short alias

        >>> data = [[1, 2], [2, 3], [8, 9], [9, 10]]
        >>> result = hierarchical(data, n_clusters=2)
        >>> print(len(result['labels']))
        4

        >>> from ilovetools.ml import hierarchical_clustering  # Full name
        >>> result = hierarchical_clustering(data, n_clusters=2)

    Notes:
        - Builds a tree structure
        - No need to fix K up front
        - Good for small datasets
        - Computationally expensive
    """
    if n_clusters < 1:
        raise ValueError("n_clusters must be at least 1")

    # Choose distance function
    if distance_metric == 'euclidean':
        dist_func = euclidean_distance
    elif distance_metric == 'manhattan':
        dist_func = manhattan_distance
    elif distance_metric == 'cosine':
        dist_func = cosine_similarity_distance
    else:
        raise ValueError("Invalid distance metric")

    n = len(data)

    # Start with each point in its own cluster
    clusters = [[i] for i in range(n)]

    # Merge the closest pair until n_clusters remain
    while len(clusters) > n_clusters:
        min_dist = float('inf')
        merge_i, merge_j = 0, 1

        for i in range(len(clusters)):
            for j in range(i + 1, len(clusters)):
                # Inter-cluster distance under the chosen linkage
                if linkage == 'single':
                    # Minimum pairwise distance
                    dist = min(dist_func(data[p1], data[p2])
                               for p1 in clusters[i] for p2 in clusters[j])
                elif linkage == 'complete':
                    # Maximum pairwise distance
                    dist = max(dist_func(data[p1], data[p2])
                               for p1 in clusters[i] for p2 in clusters[j])
                else:  # average
                    # Mean pairwise distance
                    distances = [dist_func(data[p1], data[p2])
                                 for p1 in clusters[i] for p2 in clusters[j]]
                    dist = sum(distances) / len(distances)

                if dist < min_dist:
                    min_dist = dist
                    merge_i, merge_j = i, j

        # Merge the closest pair
        clusters[merge_i].extend(clusters[merge_j])
        clusters.pop(merge_j)

    # Flatten cluster membership into per-point labels
    labels = [0] * n
    for cluster_id, cluster in enumerate(clusters):
        for point_id in cluster:
            labels[point_id] = cluster_id

    return {
        'labels': labels,
        'n_clusters': len(clusters),
    }


# Create alias
hierarchical = hierarchical_clustering


def dbscan_clustering(
    data: List[List[float]],
    eps: float,
    min_samples: int = 5,
    distance_metric: str = 'euclidean'
) -> Dict[str, Any]:
    """
    DBSCAN density-based clustering.

    Alias: dbscan()

    Args:
        data: Dataset
        eps: Maximum distance for the neighborhood
        min_samples: Minimum points for a core point
        distance_metric: Distance metric to use

    Returns:
        dict: Clustering results

    Examples:
        >>> from ilovetools.ml import dbscan  # Short alias

        >>> data = [[1, 2], [2, 3], [3, 4], [8, 9], [9, 10], [10, 11]]
        >>> result = dbscan(data, eps=2.0, min_samples=2)
        >>> print(len(result['labels']))
        6

        >>> from ilovetools.ml import dbscan_clustering  # Full name
        >>> result = dbscan_clustering(data, eps=2.0, min_samples=2)

    Notes:
        - Density-based clustering
        - Finds arbitrarily shaped clusters
        - Handles noise (label -1)
        - No need to specify K
    """
    # Choose distance function
    if distance_metric == 'euclidean':
        dist_func = euclidean_distance
    elif distance_metric == 'manhattan':
        dist_func = manhattan_distance
    elif distance_metric == 'cosine':
        dist_func = cosine_similarity_distance
    else:
        raise ValueError("Invalid distance metric")

    n = len(data)
    labels = [-1] * n  # -1 means noise / not yet assigned
    cluster_id = 0

    for i in range(n):
        if labels[i] != -1:
            continue

        # Neighborhood of point i (includes i itself)
        neighbors = [j for j in range(n) if dist_func(data[i], data[j]) <= eps]

        # Not a core point: leave as noise for now (it may still join a
        # cluster later as a border point)
        if len(neighbors) < min_samples:
            continue

        # Start a new cluster at this core point and grow it from the seed set
        labels[i] = cluster_id
        seed_set = neighbors[:]
        while seed_set:
            current = seed_set.pop(0)

            if labels[current] != -1:
                continue  # already assigned to a cluster

            labels[current] = cluster_id

            # Only core points propagate the cluster further
            current_neighbors = [j for j in range(n)
                                 if dist_func(data[current], data[j]) <= eps]
            if len(current_neighbors) >= min_samples:
                seed_set.extend(current_neighbors)

        cluster_id += 1

    # Count noise points
    noise_count = sum(1 for label in labels if label == -1)

    return {
        'labels': labels,
        'n_clusters': cluster_id,
        'noise_points': noise_count,
    }


# Create alias
dbscan = dbscan_clustering


def dendrogram_data(
    data: List[List[float]],
    linkage: str = 'average',
    distance_metric: str = 'euclidean'
) -> List[Dict[str, Any]]:
    """
    Generate dendrogram data for hierarchical clustering.

    Alias: dendrogram()

    Args:
        data: Dataset
        linkage: Linkage method
        distance_metric: Distance metric to use

    Returns:
        list: Merge history

    Examples:
        >>> from ilovetools.ml import dendrogram  # Short alias

        >>> data = [[1, 2], [2, 3], [8, 9], [9, 10]]
        >>> merges = dendrogram(data)
        >>> print(len(merges))
        3

        >>> from ilovetools.ml import dendrogram_data  # Full name
        >>> merges = dendrogram_data(data)

    Notes:
        - Records the full merge history
        - Describes a tree structure
        - Cut at any level to get clusters
        - Useful for visualizing the hierarchy
    """
    # Choose distance function
    if distance_metric == 'euclidean':
        dist_func = euclidean_distance
    elif distance_metric == 'manhattan':
        dist_func = manhattan_distance
    elif distance_metric == 'cosine':
        dist_func = cosine_similarity_distance
    else:
        raise ValueError("Invalid distance metric")

    n = len(data)
    clusters = [[i] for i in range(n)]
    merges = []

    while len(clusters) > 1:
        # Find the closest pair of clusters
        min_dist = float('inf')
        merge_i, merge_j = 0, 1

        for i in range(len(clusters)):
            for j in range(i + 1, len(clusters)):
                # Inter-cluster distance under the chosen linkage
                if linkage == 'single':
                    dist = min(dist_func(data[p1], data[p2])
                               for p1 in clusters[i] for p2 in clusters[j])
                elif linkage == 'complete':
                    dist = max(dist_func(data[p1], data[p2])
                               for p1 in clusters[i] for p2 in clusters[j])
                else:  # average
                    distances = [dist_func(data[p1], data[p2])
                                 for p1 in clusters[i] for p2 in clusters[j]]
                    dist = sum(distances) / len(distances)

                if dist < min_dist:
                    min_dist = dist
                    merge_i, merge_j = i, j

        # Record the merge before applying it
        merges.append({
            'cluster1': clusters[merge_i][:],
            'cluster2': clusters[merge_j][:],
            'distance': min_dist,
            'size': len(clusters[merge_i]) + len(clusters[merge_j]),
        })

        # Merge clusters
        clusters[merge_i].extend(clusters[merge_j])
        clusters.pop(merge_j)

    return merges


# Create alias
dendrogram = dendrogram_data


def cluster_purity(
    labels_true: List[int],
    labels_pred: List[int]
) -> float:
    """
    Calculate the cluster purity score.

    Alias: purity()

    Args:
        labels_true: True labels
        labels_pred: Predicted cluster labels

    Returns:
        float: Purity score (0 to 1)

    Examples:
        >>> from ilovetools.ml import purity  # Short alias

        >>> true_labels = [0, 0, 1, 1, 2, 2]
        >>> pred_labels = [0, 0, 1, 1, 1, 1]
        >>> score = purity(true_labels, pred_labels)
        >>> print(round(score, 2))
        0.67

        >>> from ilovetools.ml import cluster_purity  # Full name
        >>> score = cluster_purity(true_labels, pred_labels)

    Notes:
        - Range: 0 to 1
        - Higher is better
        - Measures cluster quality
        - Requires true labels
    """
    if len(labels_true) != len(labels_pred):
        raise ValueError("Label arrays must have same length")

    clusters = set(labels_pred)

    correct = 0
    for cluster in clusters:
        # True labels of the points assigned to this cluster
        cluster_indices = [i for i in range(len(labels_pred)) if labels_pred[i] == cluster]
        cluster_true_labels = [labels_true[i] for i in cluster_indices]

        # Count the points matching the cluster's majority true label
        if cluster_true_labels:
            most_common = max(set(cluster_true_labels), key=cluster_true_labels.count)
            correct += cluster_true_labels.count(most_common)

    return correct / len(labels_true)


# Create alias
purity = cluster_purity


def davies_bouldin_index(
    data: List[List[float]],
    labels: List[int]
) -> float:
    """
    Calculate the Davies-Bouldin index.

    Alias: davies_bouldin()

    Args:
        data: Dataset
        labels: Cluster assignments

    Returns:
        float: Davies-Bouldin index (lower is better)

    Examples:
        >>> from ilovetools.ml import davies_bouldin  # Short alias

        >>> data = [[1, 2], [2, 3], [8, 9], [9, 10]]
        >>> labels = [0, 0, 1, 1]
        >>> score = davies_bouldin(data, labels)
        >>> print(round(score, 2))
        0.14

        >>> from ilovetools.ml import davies_bouldin_index  # Full name
        >>> score = davies_bouldin_index(data, labels)

    Notes:
        - Lower is better
        - Measures cluster separation
        - No true labels needed
        - Considers intra- and inter-cluster distances
    """
    clusters = sorted(set(labels))
    k = len(clusters)

    if k <= 1:
        return 0.0

    dimensions = len(data[0])

    # Per-cluster centroid and mean distance of members to that centroid
    centroids = []
    avg_distances = []
    for cluster_id in clusters:
        cluster_points = [data[i] for i in range(len(data)) if labels[i] == cluster_id]
        centroid = [sum(p[d] for p in cluster_points) / len(cluster_points)
                    for d in range(dimensions)]
        centroids.append(centroid)
        avg_distances.append(sum(euclidean_distance(p, centroid) for p in cluster_points)
                             / len(cluster_points))

    # DB index: mean over clusters of the worst-case similarity ratio
    db_values = []
    for i in range(k):
        max_ratio = 0.0
        for j in range(k):
            if i != j:
                separation = euclidean_distance(centroids[i], centroids[j])
                if separation > 0:
                    ratio = (avg_distances[i] + avg_distances[j]) / separation
                    max_ratio = max(max_ratio, ratio)
        db_values.append(max_ratio)

    return sum(db_values) / k


# Create alias
davies_bouldin = davies_bouldin_index
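Taken together, the new module supports a simple choose-K-then-evaluate workflow using only the functions defined above. A short sketch with illustrative toy data (results can vary slightly across runs because centroid initialization is random):

    from ilovetools.ml.clustering import (
        elbow_method, kmeans_clustering, silhouette_score, davies_bouldin_index,
    )

    data = [[1, 2], [2, 3], [3, 4], [8, 9], [9, 10], [10, 11]]  # illustrative toy data

    # 1. Scan K = 1..4 and look for the elbow in the inertia curve
    scan = elbow_method(data, max_k=4)
    for k, wcss in zip(scan['k_values'], scan['inertias']):
        print(f"k={k}: inertia={wcss:.2f}")

    # 2. Fit the chosen K, then evaluate without ground-truth labels:
    #    silhouette is higher-is-better, Davies-Bouldin is lower-is-better
    result = kmeans_clustering(data, k=2)
    print('silhouette:', round(silhouette_score(data, result['labels']), 2))
    print('davies-bouldin:', round(davies_bouldin_index(data, result['labels']), 2))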
ilovetools.egg-info/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ilovetools
-Version: 0.2.2
+Version: 0.2.3
 Summary: A comprehensive Python utility library with modular tools for AI/ML, data processing, and daily programming needs
 Home-page: https://github.com/AliMehdi512/ilovetools
 Author: Ali Mehdi
pyproject.toml

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "ilovetools"
-version = "0.2.2"
+version = "0.2.3"
 description = "A comprehensive Python utility library with modular tools for AI/ML, data processing, and daily programming needs"
 readme = "README.md"
 requires-python = ">=3.8"