ilovetools-0.2.3-py3-none-any.whl

@@ -0,0 +1,1107 @@
+ """
+ Clustering algorithm utilities
+ Each function has TWO names: full descriptive name + abbreviated alias
+ """
+
+ from typing import Any, Dict, List
+ import math
+ import random
+
+ __all__ = [
+     # Full names
+     'kmeans_clustering',
+     'hierarchical_clustering',
+     'dbscan_clustering',
+     'elbow_method',
+     'silhouette_score',
+     'euclidean_distance',
+     'manhattan_distance',
+     'cosine_similarity_distance',
+     'initialize_centroids',
+     'assign_clusters',
+     'update_centroids',
+     'calculate_inertia',
+     'dendrogram_data',
+     'cluster_purity',
+     'davies_bouldin_index',
+     # Abbreviated aliases
+     'kmeans',
+     'hierarchical',
+     'dbscan',
+     'elbow',
+     'silhouette',
+     'euclidean',
+     'manhattan',
+     'cosine_dist',
+     'init_centroids',
+     'assign',
+     'update',
+     'inertia',
+     'dendrogram',
+     'purity',
+     'davies_bouldin',
+ ]
+
+
+ def euclidean_distance(
+     point1: List[float],
+     point2: List[float]
+ ) -> float:
+     """
+     Calculate Euclidean distance between two points.
+
+     Alias: euclidean()
+
+     Args:
+         point1: First point coordinates
+         point2: Second point coordinates
+
+     Returns:
+         float: Euclidean distance
+
+     Examples:
+         >>> from ilovetools.ml import euclidean  # Short alias
+
+         >>> p1 = [1, 2, 3]
+         >>> p2 = [4, 5, 6]
+         >>> dist = euclidean(p1, p2)
+         >>> print(round(dist, 2))
+         5.2
+
+         >>> from ilovetools.ml import euclidean_distance  # Full name
+         >>> dist = euclidean_distance(p1, p2)
+
+     Notes:
+         - Most common distance metric
+         - Straight-line distance
+         - Sensitive to scale
+         - Works in any dimension
+     """
+     if len(point1) != len(point2):
+         raise ValueError("Points must have same dimensions")
+
+     return math.sqrt(sum((a - b) ** 2 for a, b in zip(point1, point2)))
+
+
+ # Create alias
+ euclidean = euclidean_distance
+
+
+ def manhattan_distance(
+     point1: List[float],
+     point2: List[float]
+ ) -> float:
+     """
+     Calculate Manhattan (L1) distance between two points.
+
+     Alias: manhattan()
+
+     Args:
+         point1: First point coordinates
+         point2: Second point coordinates
+
+     Returns:
+         float: Manhattan distance
+
+     Examples:
+         >>> from ilovetools.ml import manhattan  # Short alias
+
+         >>> p1 = [1, 2, 3]
+         >>> p2 = [4, 5, 6]
+         >>> dist = manhattan(p1, p2)
+         >>> print(dist)
+         9
+
+         >>> from ilovetools.ml import manhattan_distance  # Full name
+         >>> dist = manhattan_distance(p1, p2)
+
+     Notes:
+         - Grid-based distance
+         - Sum of absolute differences
+         - Less sensitive to outliers
+         - Good for high dimensions
+     """
+     if len(point1) != len(point2):
+         raise ValueError("Points must have same dimensions")
+
+     return sum(abs(a - b) for a, b in zip(point1, point2))
+
+
+ # Create alias
+ manhattan = manhattan_distance
+
+
+ def cosine_similarity_distance(
+     point1: List[float],
+     point2: List[float]
+ ) -> float:
+     """
+     Calculate cosine distance between two points.
+
+     Alias: cosine_dist()
+
+     Args:
+         point1: First point coordinates
+         point2: Second point coordinates
+
+     Returns:
+         float: Cosine distance (1 - cosine similarity)
+
+     Examples:
+         >>> from ilovetools.ml import cosine_dist  # Short alias
+
+         >>> p1 = [1, 2, 3]
+         >>> p2 = [2, 4, 6]
+         >>> dist = cosine_dist(p1, p2)
+         >>> print(round(dist, 4))
+         0.0
+
+         >>> from ilovetools.ml import cosine_similarity_distance  # Full name
+         >>> dist = cosine_similarity_distance(p1, p2)
+
+     Notes:
+         - Measures angle between vectors
+         - Range: 0 to 2
+         - Good for text/sparse data
+         - Ignores magnitude
+     """
+     if len(point1) != len(point2):
+         raise ValueError("Points must have same dimensions")
+
+     # Calculate dot product
+     dot_product = sum(a * b for a, b in zip(point1, point2))
+
+     # Calculate magnitudes
+     mag1 = math.sqrt(sum(a ** 2 for a in point1))
+     mag2 = math.sqrt(sum(b ** 2 for b in point2))
+
+     if mag1 == 0 or mag2 == 0:
+         return 1.0
+
+     # Cosine similarity
+     similarity = dot_product / (mag1 * mag2)
+
+     # Cosine distance
+     return 1 - similarity
+
+
+ # Create alias
+ cosine_dist = cosine_similarity_distance
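+
+
+ # Editorial sketch (not part of the released module): the three metrics
+ # disagree on the same pair of points, which is why `distance_metric`
+ # matters downstream. For p1 = [1, 2, 3] and p2 = [2, 4, 6]:
+ #     euclidean(p1, p2)    -> sqrt(1 + 4 + 9) ~ 3.74
+ #     manhattan(p1, p2)    -> 1 + 2 + 3 = 6
+ #     cosine_dist(p1, p2)  -> 0.0 (parallel vectors; magnitude is ignored)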
+
+
+ def initialize_centroids(
+     data: List[List[float]],
+     k: int,
+     method: str = 'random'
+ ) -> List[List[float]]:
+     """
+     Initialize cluster centroids.
+
+     Alias: init_centroids()
+
+     Args:
+         data: Dataset
+         k: Number of clusters
+         method: 'random' or 'kmeans++'
+
+     Returns:
+         list: Initial centroids
+
+     Examples:
+         >>> from ilovetools.ml import init_centroids  # Short alias
+
+         >>> data = [[1, 2], [2, 3], [3, 4], [8, 9], [9, 10]]
+         >>> centroids = init_centroids(data, k=2)
+         >>> print(len(centroids))
+         2
+
+         >>> from ilovetools.ml import initialize_centroids  # Full name
+         >>> centroids = initialize_centroids(data, k=2)
+
+     Notes:
+         - 'random': pick k random data points
+         - 'kmeans++': spread centroids apart via distance-weighted sampling
+         - Affects convergence speed
+         - Critical for K-Means
+     """
+     if k <= 0 or k > len(data):
+         raise ValueError("K must be between 1 and number of data points")
+
+     if method == 'random':
+         # Random initialization
+         indices = random.sample(range(len(data)), k)
+         return [data[i][:] for i in indices]
+
+     elif method == 'kmeans++':
+         # K-Means++ initialization
+         centroids = []
+
+         # Choose first centroid randomly
+         centroids.append(data[random.randint(0, len(data) - 1)][:])
+
+         # Choose remaining centroids
+         for _ in range(k - 1):
+             distances = []
+             for point in data:
+                 # Find minimum distance to existing centroids
+                 min_dist = min(euclidean_distance(point, c) for c in centroids)
+                 distances.append(min_dist ** 2)
+
+             # Choose next centroid with probability proportional to squared distance
+             total = sum(distances)
+             if total == 0:
+                 # Every point coincides with an existing centroid; fall back
+                 # to any point not already chosen
+                 remaining = [p for p in data if p not in centroids]
+                 if remaining:
+                     centroids.append(remaining[0][:])
+             else:
+                 probs = [d / total for d in distances]
+                 cumsum = []
+                 total_prob = 0
+                 for p in probs:
+                     total_prob += p
+                     cumsum.append(total_prob)
+
+                 r = random.random()
+                 for i, cum_p in enumerate(cumsum):
+                     if r <= cum_p:
+                         centroids.append(data[i][:])
+                         break
+                 else:
+                     # Floating-point round-off can leave cumsum[-1] just
+                     # below 1.0; fall back to the last point
+                     centroids.append(data[-1][:])
+
+         return centroids
+
+     else:
+         raise ValueError("Method must be 'random' or 'kmeans++'")
+
+
+ # Create alias
+ init_centroids = initialize_centroids
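+
+
+ # Editorial sketch (not from the package): the 'kmeans++' branch samples
+ # the next centroid by inverse-CDF lookup over squared distances. E.g. with
+ # squared distances [0.0, 2.0, 8.0, 50.0] the cumulative probabilities are
+ # [0.0, 0.033, 0.167, 1.0], so a uniform r = 0.5 selects the far point at
+ # index 3; distant points are the likeliest picks.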
+
+
+ def assign_clusters(
+     data: List[List[float]],
+     centroids: List[List[float]],
+     distance_metric: str = 'euclidean'
+ ) -> List[int]:
+     """
+     Assign each point to nearest centroid.
+
+     Alias: assign()
+
+     Args:
+         data: Dataset
+         centroids: Cluster centroids
+         distance_metric: 'euclidean', 'manhattan', or 'cosine'
+
+     Returns:
+         list: Cluster assignments
+
+     Examples:
+         >>> from ilovetools.ml import assign  # Short alias
+
+         >>> data = [[1, 2], [2, 3], [8, 9], [9, 10]]
+         >>> centroids = [[1.5, 2.5], [8.5, 9.5]]
+         >>> labels = assign(data, centroids)
+         >>> print(labels)
+         [0, 0, 1, 1]
+
+         >>> from ilovetools.ml import assign_clusters  # Full name
+         >>> labels = assign_clusters(data, centroids)
+
+     Notes:
+         - Assigns to nearest centroid
+         - Uses specified distance metric
+         - Core step in K-Means
+         - Fast operation
+     """
+     # Choose distance function
+     if distance_metric == 'euclidean':
+         dist_func = euclidean_distance
+     elif distance_metric == 'manhattan':
+         dist_func = manhattan_distance
+     elif distance_metric == 'cosine':
+         dist_func = cosine_similarity_distance
+     else:
+         raise ValueError("Invalid distance metric")
+
+     labels = []
+     for point in data:
+         # Find nearest centroid
+         distances = [dist_func(point, c) for c in centroids]
+         nearest = distances.index(min(distances))
+         labels.append(nearest)
+
+     return labels
+
+
+ # Create alias
+ assign = assign_clusters
+
+
+ def update_centroids(
+     data: List[List[float]],
+     labels: List[int],
+     k: int
+ ) -> List[List[float]]:
+     """
+     Update centroids based on cluster assignments.
+
+     Alias: update()
+
+     Args:
+         data: Dataset
+         labels: Cluster assignments
+         k: Number of clusters
+
+     Returns:
+         list: Updated centroids
+
+     Examples:
+         >>> from ilovetools.ml import update  # Short alias
+
+         >>> data = [[1, 2], [2, 3], [8, 9], [9, 10]]
+         >>> labels = [0, 0, 1, 1]
+         >>> centroids = update(data, labels, k=2)
+         >>> print(len(centroids))
+         2
+
+         >>> from ilovetools.ml import update_centroids  # Full name
+         >>> centroids = update_centroids(data, labels, k=2)
+
+     Notes:
+         - Calculate mean of each cluster
+         - Core step in K-Means
+         - Moves centroids to center
+         - Iterative process
+     """
+     dimensions = len(data[0])
+     centroids = []
+
+     for cluster_id in range(k):
+         # Get points in this cluster
+         cluster_points = [data[i] for i in range(len(data)) if labels[i] == cluster_id]
+
+         if not cluster_points:
+             # Empty cluster: reseed it with a random data point
+             centroids.append(data[random.randint(0, len(data) - 1)][:])
+         else:
+             # Calculate mean
+             centroid = []
+             for dim in range(dimensions):
+                 mean = sum(point[dim] for point in cluster_points) / len(cluster_points)
+                 centroid.append(mean)
+             centroids.append(centroid)
+
+     return centroids
+
+
+ # Create alias
+ update = update_centroids
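+
+
+ # Editorial sketch (not from the package): one hand-rolled K-Means
+ # iteration out of the primitives above, equivalent to a single pass of
+ # the loop inside kmeans_clustering() below:
+ #     data = [[1, 2], [2, 3], [8, 9], [9, 10]]
+ #     centroids = init_centroids(data, k=2)
+ #     labels = assign(data, centroids)       # nearest-centroid step
+ #     centroids = update(data, labels, k=2)  # mean-recomputation step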
+
+
+ def calculate_inertia(
+     data: List[List[float]],
+     labels: List[int],
+     centroids: List[List[float]]
+ ) -> float:
+     """
+     Calculate within-cluster sum of squares (inertia).
+
+     Alias: inertia()
+
+     Args:
+         data: Dataset
+         labels: Cluster assignments
+         centroids: Cluster centroids
+
+     Returns:
+         float: Inertia value
+
+     Examples:
+         >>> from ilovetools.ml import inertia  # Short alias
+
+         >>> data = [[1, 2], [2, 3], [8, 9], [9, 10]]
+         >>> labels = [0, 0, 1, 1]
+         >>> centroids = [[1.5, 2.5], [8.5, 9.5]]
+         >>> score = inertia(data, labels, centroids)
+         >>> print(round(score, 2))
+         2.0
+
+         >>> from ilovetools.ml import calculate_inertia  # Full name
+         >>> score = calculate_inertia(data, labels, centroids)
+
+     Notes:
+         - Lower is better
+         - Measures compactness
+         - Used in elbow method
+         - Always decreases with more K
+     """
+     total = 0.0
+     for i, point in enumerate(data):
+         centroid = centroids[labels[i]]
+         dist = euclidean_distance(point, centroid)
+         total += dist ** 2
+
+     return total
+
+
+ # Create alias
+ inertia = calculate_inertia
+
+
+ def kmeans_clustering(
+     data: List[List[float]],
+     k: int,
+     max_iterations: int = 100,
+     distance_metric: str = 'euclidean',
+     init_method: str = 'kmeans++'
+ ) -> Dict[str, Any]:
+     """
+     K-Means clustering algorithm.
+
+     Alias: kmeans()
+
+     Args:
+         data: Dataset
+         k: Number of clusters
+         max_iterations: Maximum iterations
+         distance_metric: Distance metric to use
+         init_method: Centroid initialization method
+
+     Returns:
+         dict: Clustering results
+
+     Examples:
+         >>> from ilovetools.ml import kmeans  # Short alias
+
+         >>> data = [[1, 2], [2, 3], [3, 4], [8, 9], [9, 10], [10, 11]]
+         >>> result = kmeans(data, k=2)
+         >>> print(len(result['labels']))
+         6
+         >>> print(len(result['centroids']))
+         2
+
+         >>> from ilovetools.ml import kmeans_clustering  # Full name
+         >>> result = kmeans_clustering(data, k=2)
+
+     Notes:
+         - Most popular clustering
+         - Fast and scalable
+         - Requires K specification
+         - Sensitive to initialization
+     """
+     # Initialize centroids
+     centroids = initialize_centroids(data, k, method=init_method)
+
+     for iteration in range(max_iterations):
+         # Assign clusters
+         labels = assign_clusters(data, centroids, distance_metric)
+
+         # Update centroids
+         new_centroids = update_centroids(data, labels, k)
+
+         # Check convergence
+         converged = True
+         for old, new in zip(centroids, new_centroids):
+             if euclidean_distance(old, new) > 1e-6:
+                 converged = False
+                 break
+
+         centroids = new_centroids
+
+         if converged:
+             break
+
+     # Calculate inertia
+     inertia_value = calculate_inertia(data, labels, centroids)
+
+     return {
+         'labels': labels,
+         'centroids': centroids,
+         'inertia': inertia_value,
+         'iterations': iteration + 1,
+     }
+
+
+ # Create alias
+ kmeans = kmeans_clustering
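+
+
+ # Editorial sketch (not provided by this module): K-Means is sensitive to
+ # its random initialization, so a common pattern is to restart several
+ # times and keep the lowest-inertia run:
+ #     best = min((kmeans(data, k=2) for _ in range(10)),
+ #                key=lambda r: r['inertia'])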
+
+
+ def elbow_method(
+     data: List[List[float]],
+     max_k: int = 10,
+     distance_metric: str = 'euclidean'
+ ) -> Dict[str, Any]:
+     """
+     Elbow method to find optimal K.
+
+     Alias: elbow()
+
+     Args:
+         data: Dataset
+         max_k: Maximum K to try
+         distance_metric: Distance metric to use
+
+     Returns:
+         dict: Inertia values for each K
+
+     Examples:
+         >>> from ilovetools.ml import elbow  # Short alias
+
+         >>> data = [[1, 2], [2, 3], [3, 4], [8, 9], [9, 10], [10, 11]]
+         >>> result = elbow(data, max_k=4)
+         >>> print(len(result['k_values']))
+         4
+         >>> print(len(result['inertias']))
+         4
+
+         >>> from ilovetools.ml import elbow_method  # Full name
+         >>> result = elbow_method(data, max_k=4)
+
+     Notes:
+         - Find optimal K
+         - Plot inertia vs K
+         - Look for elbow point
+         - Subjective interpretation
+     """
+     k_values = list(range(1, max_k + 1))
+     inertias = []
+
+     for k in k_values:
+         if k > len(data):
+             break
+         result = kmeans_clustering(data, k, distance_metric=distance_metric)
+         inertias.append(result['inertia'])
+
+     return {
+         'k_values': k_values[:len(inertias)],
+         'inertias': inertias,
+     }
+
+
+ # Create alias
+ elbow = elbow_method
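+
+
+ # Editorial sketch (a heuristic, not part of this module): one crude way
+ # to pick the elbow automatically is the largest second difference of the
+ # inertia curve:
+ #     r = elbow_method(data, max_k=6)
+ #     inertias = r['inertias']
+ #     drops = [inertias[i - 1] - 2 * inertias[i] + inertias[i + 1]
+ #              for i in range(1, len(inertias) - 1)]
+ #     best_k = r['k_values'][drops.index(max(drops)) + 1]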
+
+
+ def silhouette_score(
+     data: List[List[float]],
+     labels: List[int],
+     distance_metric: str = 'euclidean'
+ ) -> float:
+     """
+     Calculate silhouette score for clustering.
+
+     Alias: silhouette()
+
+     Args:
+         data: Dataset
+         labels: Cluster assignments
+         distance_metric: Distance metric to use
+
+     Returns:
+         float: Silhouette score (-1 to 1)
+
+     Examples:
+         >>> from ilovetools.ml import silhouette  # Short alias
+
+         >>> data = [[1, 2], [2, 3], [8, 9], [9, 10]]
+         >>> labels = [0, 0, 1, 1]
+         >>> score = silhouette(data, labels)
+         >>> print(round(score, 2))
+         0.86
+
+         >>> from ilovetools.ml import silhouette_score  # Full name
+         >>> score = silhouette_score(data, labels)
+
+     Notes:
+         - Range: -1 to 1
+         - Higher is better
+         - Measures cluster quality
+         - Considers separation and cohesion
+     """
+     # Choose distance function
+     if distance_metric == 'euclidean':
+         dist_func = euclidean_distance
+     elif distance_metric == 'manhattan':
+         dist_func = manhattan_distance
+     elif distance_metric == 'cosine':
+         dist_func = cosine_similarity_distance
+     else:
+         raise ValueError("Invalid distance metric")
+
+     n = len(data)
+     silhouette_values = []
+
+     for i in range(n):
+         # Get cluster of point i
+         cluster_i = labels[i]
+
+         # Calculate a(i): mean distance to points in same cluster
+         same_cluster = [j for j in range(n) if labels[j] == cluster_i and j != i]
+         if not same_cluster:
+             silhouette_values.append(0)
+             continue
+
+         a_i = sum(dist_func(data[i], data[j]) for j in same_cluster) / len(same_cluster)
+
+         # Calculate b(i): mean distance to points in nearest other cluster
+         unique_clusters = set(labels)
+         unique_clusters.discard(cluster_i)
+
+         if not unique_clusters:
+             silhouette_values.append(0)
+             continue
+
+         b_i = float('inf')
+         for cluster_j in unique_clusters:
+             other_cluster = [j for j in range(n) if labels[j] == cluster_j]
+             mean_dist = sum(dist_func(data[i], data[j]) for j in other_cluster) / len(other_cluster)
+             b_i = min(b_i, mean_dist)
+
+         # Silhouette value: s(i) = (b(i) - a(i)) / max(a(i), b(i))
+         s_i = (b_i - a_i) / max(a_i, b_i)
+         silhouette_values.append(s_i)
+
+     return sum(silhouette_values) / len(silhouette_values)
+
+
+ # Create alias
+ silhouette = silhouette_score
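+
+
+ # Editorial check of the docstring example above: for point [1, 2],
+ # a(i) = dist([1, 2], [2, 3]) = sqrt(2) ~ 1.41 and b(i) is the mean
+ # distance to the other cluster, ~10.61, so s(i) ~ (10.61 - 1.41) / 10.61
+ # ~ 0.87; averaging all four points gives ~0.86.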
+
+
+ def hierarchical_clustering(
+     data: List[List[float]],
+     n_clusters: int,
+     linkage: str = 'average',
+     distance_metric: str = 'euclidean'
+ ) -> Dict[str, Any]:
+     """
+     Hierarchical clustering (agglomerative).
+
+     Alias: hierarchical()
+
+     Args:
+         data: Dataset
+         n_clusters: Number of clusters
+         linkage: 'single', 'complete', or 'average'
+         distance_metric: Distance metric to use
+
+     Returns:
+         dict: Clustering results
+
+     Examples:
+         >>> from ilovetools.ml import hierarchical  # Short alias
+
+         >>> data = [[1, 2], [2, 3], [8, 9], [9, 10]]
+         >>> result = hierarchical(data, n_clusters=2)
+         >>> print(len(result['labels']))
+         4
+
+         >>> from ilovetools.ml import hierarchical_clustering  # Full name
+         >>> result = hierarchical_clustering(data, n_clusters=2)
+
+     Notes:
+         - Creates tree structure
+         - No need to specify K initially
+         - Good for small datasets
+         - Computationally expensive
+     """
+     # Choose distance function
+     if distance_metric == 'euclidean':
+         dist_func = euclidean_distance
+     elif distance_metric == 'manhattan':
+         dist_func = manhattan_distance
+     elif distance_metric == 'cosine':
+         dist_func = cosine_similarity_distance
+     else:
+         raise ValueError("Invalid distance metric")
+
+     n = len(data)
+
+     # Initialize: each point is its own cluster
+     clusters = [[i] for i in range(n)]
+
+     # Merge until we have n_clusters
+     while len(clusters) > n_clusters:
+         # Find closest pair of clusters
+         min_dist = float('inf')
+         merge_i, merge_j = 0, 1
+
+         for i in range(len(clusters)):
+             for j in range(i + 1, len(clusters)):
+                 # Calculate distance between clusters
+                 if linkage == 'single':
+                     # Minimum distance
+                     dist = min(dist_func(data[p1], data[p2])
+                                for p1 in clusters[i] for p2 in clusters[j])
+                 elif linkage == 'complete':
+                     # Maximum distance
+                     dist = max(dist_func(data[p1], data[p2])
+                                for p1 in clusters[i] for p2 in clusters[j])
+                 else:  # average
+                     # Average distance
+                     distances = [dist_func(data[p1], data[p2])
+                                  for p1 in clusters[i] for p2 in clusters[j]]
+                     dist = sum(distances) / len(distances)
+
+                 if dist < min_dist:
+                     min_dist = dist
+                     merge_i, merge_j = i, j
+
+         # Merge clusters
+         clusters[merge_i].extend(clusters[merge_j])
+         clusters.pop(merge_j)
+
+     # Create labels
+     labels = [0] * n
+     for cluster_id, cluster in enumerate(clusters):
+         for point_id in cluster:
+             labels[point_id] = cluster_id
+
+     return {
+         'labels': labels,
+         'n_clusters': len(clusters),
+     }
+
+
+ # Create alias
+ hierarchical = hierarchical_clustering
+
+
+ def dbscan_clustering(
+     data: List[List[float]],
+     eps: float,
+     min_samples: int = 5,
+     distance_metric: str = 'euclidean'
+ ) -> Dict[str, Any]:
+     """
+     DBSCAN density-based clustering.
+
+     Alias: dbscan()
+
+     Args:
+         data: Dataset
+         eps: Maximum distance for neighborhood
+         min_samples: Minimum points for core point
+         distance_metric: Distance metric to use
+
+     Returns:
+         dict: Clustering results
+
+     Examples:
+         >>> from ilovetools.ml import dbscan  # Short alias
+
+         >>> data = [[1, 2], [2, 3], [3, 4], [8, 9], [9, 10], [10, 11]]
+         >>> result = dbscan(data, eps=2.0, min_samples=2)
+         >>> print(len(result['labels']))
+         6
+
+         >>> from ilovetools.ml import dbscan_clustering  # Full name
+         >>> result = dbscan_clustering(data, eps=2.0, min_samples=2)
+
+     Notes:
+         - Density-based clustering
+         - Finds arbitrary shapes
+         - Handles noise (label -1)
+         - No need to specify K
+     """
+     # Choose distance function
+     if distance_metric == 'euclidean':
+         dist_func = euclidean_distance
+     elif distance_metric == 'manhattan':
+         dist_func = manhattan_distance
+     elif distance_metric == 'cosine':
+         dist_func = cosine_similarity_distance
+     else:
+         raise ValueError("Invalid distance metric")
+
+     n = len(data)
+     labels = [-1] * n  # -1 means noise / not yet claimed
+     cluster_id = 0
+
+     for i in range(n):
+         if labels[i] != -1:
+             continue
+
+         # Find neighbors
+         neighbors = []
+         for j in range(n):
+             if dist_func(data[i], data[j]) <= eps:
+                 neighbors.append(j)
+
+         # Check if core point
+         if len(neighbors) < min_samples:
+             continue  # Noise point (may later be claimed as a border point)
+
+         # Start new cluster
+         labels[i] = cluster_id
+
+         # Expand cluster
+         seed_set = neighbors[:]
+         while seed_set:
+             current = seed_set.pop(0)
+
+             if labels[current] != -1:
+                 continue  # Already claimed by this or an earlier cluster
+
+             labels[current] = cluster_id
+
+             # Find neighbors of current
+             current_neighbors = []
+             for j in range(n):
+                 if dist_func(data[current], data[j]) <= eps:
+                     current_neighbors.append(j)
+
+             # If current is also a core point, grow the frontier
+             if len(current_neighbors) >= min_samples:
+                 seed_set.extend(current_neighbors)
+
+         cluster_id += 1
+
+     # Count noise points
+     noise_count = sum(1 for label in labels if label == -1)
+
+     return {
+         'labels': labels,
+         'n_clusters': cluster_id,
+         'noise_points': noise_count,
+     }
+
+
+ # Create alias
+ dbscan = dbscan_clustering
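+
+
+ # Editorial sketch (a heuristic, not part of this module): eps is usually
+ # the hard parameter to set. A common approach is the k-distance plot:
+ # sort each point's distance to its min_samples-th nearest neighbor and
+ # look for the knee:
+ #     kth = sorted(
+ #         sorted(euclidean(p, q) for q in data if q is not p)[min_samples - 1]
+ #         for p in data
+ #     )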
+
+
+ def dendrogram_data(
+     data: List[List[float]],
+     linkage: str = 'average',
+     distance_metric: str = 'euclidean'
+ ) -> List[Dict[str, Any]]:
+     """
+     Generate dendrogram data for hierarchical clustering.
+
+     Alias: dendrogram()
+
+     Args:
+         data: Dataset
+         linkage: Linkage method
+         distance_metric: Distance metric to use
+
+     Returns:
+         list: Merge history
+
+     Examples:
+         >>> from ilovetools.ml import dendrogram  # Short alias
+
+         >>> data = [[1, 2], [2, 3], [8, 9], [9, 10]]
+         >>> merges = dendrogram(data)
+         >>> print(len(merges))
+         3
+
+         >>> from ilovetools.ml import dendrogram_data  # Full name
+         >>> merges = dendrogram_data(data)
+
+     Notes:
+         - Shows merge history
+         - Tree structure
+         - Cut at desired level
+         - Visualize hierarchy
+     """
+     # Choose distance function
+     if distance_metric == 'euclidean':
+         dist_func = euclidean_distance
+     elif distance_metric == 'manhattan':
+         dist_func = manhattan_distance
+     elif distance_metric == 'cosine':
+         dist_func = cosine_similarity_distance
+     else:
+         raise ValueError("Invalid distance metric")
+
+     n = len(data)
+     clusters = [[i] for i in range(n)]
+     merges = []
+
+     while len(clusters) > 1:
+         # Find closest pair
+         min_dist = float('inf')
+         merge_i, merge_j = 0, 1
+
+         for i in range(len(clusters)):
+             for j in range(i + 1, len(clusters)):
+                 # Calculate distance
+                 if linkage == 'single':
+                     dist = min(dist_func(data[p1], data[p2])
+                                for p1 in clusters[i] for p2 in clusters[j])
+                 elif linkage == 'complete':
+                     dist = max(dist_func(data[p1], data[p2])
+                                for p1 in clusters[i] for p2 in clusters[j])
+                 else:  # average
+                     distances = [dist_func(data[p1], data[p2])
+                                  for p1 in clusters[i] for p2 in clusters[j]]
+                     dist = sum(distances) / len(distances)
+
+                 if dist < min_dist:
+                     min_dist = dist
+                     merge_i, merge_j = i, j
+
+         # Record merge
+         merges.append({
+             'cluster1': clusters[merge_i][:],
+             'cluster2': clusters[merge_j][:],
+             'distance': min_dist,
+             'size': len(clusters[merge_i]) + len(clusters[merge_j]),
+         })
+
+         # Merge clusters
+         clusters[merge_i].extend(clusters[merge_j])
+         clusters.pop(merge_j)
+
+     return merges
+
+
+ # Create alias
+ dendrogram = dendrogram_data
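+
+
+ # Editorial sketch (not part of this module): because single, complete,
+ # and average linkage merge at non-decreasing distances, a flat clustering
+ # can be recovered by "cutting" the history at a threshold:
+ #     kept = [m for m in dendrogram_data(data) if m['distance'] <= 2.0]
+ # Each kept merge joins two groups; points untouched by any kept merge
+ # remain singletons.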
+
+
+ def cluster_purity(
+     labels_true: List[int],
+     labels_pred: List[int]
+ ) -> float:
+     """
+     Calculate cluster purity score.
+
+     Alias: purity()
+
+     Args:
+         labels_true: True labels
+         labels_pred: Predicted cluster labels
+
+     Returns:
+         float: Purity score (0 to 1)
+
+     Examples:
+         >>> from ilovetools.ml import purity  # Short alias
+
+         >>> true_labels = [0, 0, 1, 1, 2, 2]
+         >>> pred_labels = [0, 0, 1, 1, 1, 1]
+         >>> score = purity(true_labels, pred_labels)
+         >>> print(round(score, 2))
+         0.67
+
+         >>> from ilovetools.ml import cluster_purity  # Full name
+         >>> score = cluster_purity(true_labels, pred_labels)
+
+     Notes:
+         - Range: 0 to 1
+         - Higher is better
+         - Measures cluster quality
+         - Requires true labels
+     """
+     if len(labels_true) != len(labels_pred):
+         raise ValueError("Label arrays must have same length")
+
+     # Get unique clusters
+     clusters = set(labels_pred)
+
+     correct = 0
+     for cluster in clusters:
+         # Get points in this cluster
+         cluster_indices = [i for i in range(len(labels_pred)) if labels_pred[i] == cluster]
+
+         # Get true labels for these points
+         cluster_true_labels = [labels_true[i] for i in cluster_indices]
+
+         # Find most common true label
+         if cluster_true_labels:
+             most_common = max(set(cluster_true_labels), key=cluster_true_labels.count)
+             correct += cluster_true_labels.count(most_common)
+
+     return correct / len(labels_true)
+
+
+ # Create alias
+ purity = cluster_purity
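+
+
+ # Editorial caveat: purity is trivially inflated by over-clustering; one
+ # cluster per point scores a perfect 1.0:
+ #     purity([0, 1, 0, 1], [0, 1, 2, 3])  # -> 1.0
+ # Compare runs at the same K, or pair purity with an unsupervised index
+ # such as davies_bouldin_index() below.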
+
+
+ def davies_bouldin_index(
+     data: List[List[float]],
+     labels: List[int]
+ ) -> float:
+     """
+     Calculate Davies-Bouldin index.
+
+     Alias: davies_bouldin()
+
+     Args:
+         data: Dataset
+         labels: Cluster assignments
+
+     Returns:
+         float: Davies-Bouldin index (lower is better)
+
+     Examples:
+         >>> from ilovetools.ml import davies_bouldin  # Short alias
+
+         >>> data = [[1, 2], [2, 3], [8, 9], [9, 10]]
+         >>> labels = [0, 0, 1, 1]
+         >>> score = davies_bouldin(data, labels)
+         >>> print(round(score, 2))
+         0.14
+
+         >>> from ilovetools.ml import davies_bouldin_index  # Full name
+         >>> score = davies_bouldin_index(data, labels)
+
+     Notes:
+         - Lower is better
+         - Measures cluster separation
+         - No true labels needed
+         - Considers intra- and inter-cluster distances
+     """
+     # Get unique clusters (label values need not be 0..k-1)
+     clusters = list(set(labels))
+     k = len(clusters)
+
+     if k <= 1:
+         return 0.0
+
+     # Calculate centroids, one per entry of `clusters`, in the same order
+     centroids = []
+     for cluster_id in clusters:
+         cluster_points = [data[i] for i in range(len(data)) if labels[i] == cluster_id]
+         if cluster_points:
+             dimensions = len(data[0])
+             centroid = [sum(p[d] for p in cluster_points) / len(cluster_points) for d in range(dimensions)]
+             centroids.append(centroid)
+
+     # Calculate average distance from each cluster's points to its centroid
+     avg_distances = []
+     for idx, cluster_id in enumerate(clusters):
+         cluster_points = [data[i] for i in range(len(data)) if labels[i] == cluster_id]
+         if len(cluster_points) > 0:
+             # Index positionally, not by label value
+             centroid = centroids[idx]
+             avg_dist = sum(euclidean_distance(p, centroid) for p in cluster_points) / len(cluster_points)
+             avg_distances.append(avg_dist)
+         else:
+             avg_distances.append(0)
+
+     # Calculate DB index
+     db_values = []
+     for i in range(k):
+         max_ratio = 0
+         for j in range(k):
+             if i != j:
+                 numerator = avg_distances[i] + avg_distances[j]
+                 denominator = euclidean_distance(centroids[i], centroids[j])
+                 if denominator > 0:
+                     ratio = numerator / denominator
+                     max_ratio = max(max_ratio, ratio)
+         db_values.append(max_ratio)
+
+     return sum(db_values) / k if k > 0 else 0.0
+
+
+ # Create alias
+ davies_bouldin = davies_bouldin_index
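+
+
+ # Editorial sketch (not part of the released module): a minimal
+ # end-to-end evaluation built only from this module's own functions:
+ #     data = [[1, 2], [2, 3], [3, 4], [8, 9], [9, 10], [10, 11]]
+ #     result = kmeans(data, k=2)
+ #     print(silhouette(data, result['labels']))      # higher is better
+ #     print(davies_bouldin(data, result['labels']))  # lower is better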