ilovetools 0.2.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,797 @@
+ """
+ Imbalanced data handling utilities
+ Each function has TWO names: full descriptive name + abbreviated alias
+ """
+
+ from typing import List, Dict, Any, Tuple, Optional
+ import random
+
+ __all__ = [
+     # Full names
+     'random_oversampling',
+     'random_undersampling',
+     'smote_oversampling',
+     'tomek_links_undersampling',
+     'class_weight_calculator',
+     'stratified_sampling',
+     'compute_class_distribution',
+     'balance_dataset',
+     'minority_class_identifier',
+     'imbalance_ratio',
+     'synthetic_sample_generator',
+     'near_miss_undersampling',
+     # Abbreviated aliases
+     'random_oversample',
+     'random_undersample',
+     'smote',
+     'tomek_links',
+     'class_weights',
+     'stratified_sample',
+     'class_dist',
+     'balance_data',
+     'minority_class',
+     'imbalance_ratio_alias',
+     'synthetic_sample',
+     'near_miss',
+ ]
+
+
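+ # A minimal quick-start sketch, assuming the package exposes these
+ # helpers as ilovetools.ml (the import path the docstrings below use):
+ #
+ #     from ilovetools.ml import class_dist, balance_data
+ #
+ #     stats = class_dist(y)                  # inspect the imbalance first
+ #     if stats['imbalance_ratio'] > 3.0:     # >3.0 = highly imbalanced
+ #         X, y = balance_data(X, y, method='smote')
+
+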
+ def random_oversampling(
+     X: List[List[float]],
+     y: List[int],
+     target_ratio: float = 1.0
+ ) -> Tuple[List[List[float]], List[int]]:
+     """
+     Randomly oversample minority class.
+
+     Alias: random_oversample()
+
+     Args:
+         X: Feature data
+         y: Labels
+         target_ratio: Desired minority/majority ratio (1.0 = balanced)
+
+     Returns:
+         tuple: (X_resampled, y_resampled)
+
+     Examples:
+         >>> from ilovetools.ml import random_oversample  # Short alias
+
+         >>> X = [[1, 2], [2, 3], [3, 4], [4, 5], [5, 6]]
+         >>> y = [0, 0, 0, 0, 1]  # Imbalanced: 4 vs 1
+         >>>
+         >>> X_res, y_res = random_oversample(X, y, target_ratio=1.0)
+         >>> print(len([label for label in y_res if label == 0]))
+         4
+         >>> print(len([label for label in y_res if label == 1]))
+         4
+
+         >>> from ilovetools.ml import random_oversampling  # Full name
+         >>> X_res, y_res = random_oversampling(X, y)
+
+     Notes:
+         - Duplicates minority samples randomly
+         - Simple but effective
+         - May cause overfitting
+         - Good starting point
+     """
+     # Separate by class
+     class_indices = {}
+     for idx, label in enumerate(y):
+         if label not in class_indices:
+             class_indices[label] = []
+         class_indices[label].append(idx)
+
+     # Find majority class size
+     max_size = max(len(indices) for indices in class_indices.values())
+     target_size = int(max_size * target_ratio)
+
+     X_resampled = []
+     y_resampled = []
+
+     for label, indices in class_indices.items():
+         # Add all original samples
+         for idx in indices:
+             X_resampled.append(X[idx])
+             y_resampled.append(y[idx])
+
+         # Oversample if needed
+         if len(indices) < target_size:
+             n_samples = target_size - len(indices)
+             for _ in range(n_samples):
+                 idx = random.choice(indices)
+                 X_resampled.append(X[idx])
+                 y_resampled.append(y[idx])
+
+     return X_resampled, y_resampled
+
+
+ # Create alias
+ random_oversample = random_oversampling
+
+
+ def random_undersampling(
+     X: List[List[float]],
+     y: List[int],
+     target_ratio: float = 1.0
+ ) -> Tuple[List[List[float]], List[int]]:
+     """
+     Randomly undersample majority class.
+
+     Alias: random_undersample()
+
+     Args:
+         X: Feature data
+         y: Labels
+         target_ratio: Desired minority/majority ratio (1.0 = balanced)
+
+     Returns:
+         tuple: (X_resampled, y_resampled)
+
+     Examples:
+         >>> from ilovetools.ml import random_undersample  # Short alias
+
+         >>> X = [[1, 2], [2, 3], [3, 4], [4, 5], [5, 6]]
+         >>> y = [0, 0, 0, 0, 1]  # Imbalanced: 4 vs 1
+         >>>
+         >>> X_res, y_res = random_undersample(X, y, target_ratio=1.0)
+         >>> print(len([label for label in y_res if label == 0]))
+         1
+         >>> print(len([label for label in y_res if label == 1]))
+         1
+
+         >>> from ilovetools.ml import random_undersampling  # Full name
+         >>> X_res, y_res = random_undersampling(X, y)
+
+     Notes:
+         - Removes majority samples randomly
+         - Loses information
+         - Faster training
+         - Good for large datasets
+     """
+     # Separate by class
+     class_indices = {}
+     for idx, label in enumerate(y):
+         if label not in class_indices:
+             class_indices[label] = []
+         class_indices[label].append(idx)
+
+     # Find minority class size
+     min_size = min(len(indices) for indices in class_indices.values())
+     target_size = int(min_size / target_ratio)
+
+     X_resampled = []
+     y_resampled = []
+
+     for label, indices in class_indices.items():
+         # Undersample if needed
+         if len(indices) > target_size:
+             selected_indices = random.sample(indices, target_size)
+         else:
+             selected_indices = indices
+
+         for idx in selected_indices:
+             X_resampled.append(X[idx])
+             y_resampled.append(y[idx])
+
+     return X_resampled, y_resampled
+
+
+ # Create alias
+ random_undersample = random_undersampling
+
+
+ def smote_oversampling(
+     X: List[List[float]],
+     y: List[int],
+     k_neighbors: int = 5,
+     target_ratio: float = 1.0
+ ) -> Tuple[List[List[float]], List[int]]:
+     """
+     SMOTE (Synthetic Minority Over-sampling Technique).
+
+     Alias: smote()
+
+     Args:
+         X: Feature data
+         y: Labels
+         k_neighbors: Number of neighbor candidates
+         target_ratio: Desired minority/majority ratio
+
+     Returns:
+         tuple: (X_resampled, y_resampled)
+
+     Examples:
+         >>> from ilovetools.ml import smote  # Short alias
+
+         >>> X = [[1, 2], [2, 3], [3, 4], [4, 5], [5, 6]]
+         >>> y = [0, 0, 0, 0, 1]
+         >>>
+         >>> X_res, y_res = smote(X, y, k_neighbors=2)
+         >>> print(len(y_res) > len(y))
+         True
+
+         >>> from ilovetools.ml import smote_oversampling  # Full name
+         >>> X_res, y_res = smote_oversampling(X, y)
+
+     Notes:
+         - Creates synthetic samples
+         - Interpolates between neighbors
+         - Reduces overfitting
+         - Industry standard
+     """
+     # Separate by class
+     class_indices = {}
+     for idx, label in enumerate(y):
+         if label not in class_indices:
+             class_indices[label] = []
+         class_indices[label].append(idx)
+
+     # Find majority class size
+     max_size = max(len(indices) for indices in class_indices.values())
+     target_size = int(max_size * target_ratio)
+
+     X_resampled = list(X)
+     y_resampled = list(y)
+
+     for label, indices in class_indices.items():
+         if len(indices) < target_size:
+             n_samples = target_size - len(indices)
+
+             for _ in range(n_samples):
+                 # Select random sample from minority class
+                 idx = random.choice(indices)
+                 sample = X[idx]
+
+                 # Pick a same-class neighbor, excluding the sample itself
+                 # when possible. This simplified version draws k random
+                 # candidates; full SMOTE would use the k nearest neighbors.
+                 candidates = [j for j in indices if j != idx] or indices
+                 neighbors = random.sample(candidates, min(k_neighbors, len(candidates)))
+                 neighbor_idx = random.choice(neighbors)
+                 neighbor = X[neighbor_idx]
+
+                 # Create synthetic sample (interpolation)
+                 alpha = random.random()
+                 synthetic = [
+                     sample[i] + alpha * (neighbor[i] - sample[i])
+                     for i in range(len(sample))
+                 ]
+
+                 X_resampled.append(synthetic)
+                 y_resampled.append(label)
+
+     return X_resampled, y_resampled
+
+
+ # Create alias
+ smote = smote_oversampling
+
+
+ def tomek_links_undersampling(
+     X: List[List[float]],
+     y: List[int]
+ ) -> Tuple[List[List[float]], List[int]]:
+     """
+     Remove Tomek links (borderline samples).
+
+     Alias: tomek_links()
+
+     Args:
+         X: Feature data
+         y: Labels
+
+     Returns:
+         tuple: (X_cleaned, y_cleaned)
+
+     Examples:
+         >>> from ilovetools.ml import tomek_links  # Short alias
+
+         >>> X = [[1, 2], [2, 3], [3, 4], [4, 5]]
+         >>> y = [0, 0, 1, 1]
+         >>>
+         >>> X_clean, y_clean = tomek_links(X, y)
+         >>> print(len(X_clean) <= len(X))
+         True
+
+         >>> from ilovetools.ml import tomek_links_undersampling  # Full name
+         >>> X_clean, y_clean = tomek_links_undersampling(X, y)
+
+     Notes:
+         - Removes noisy samples
+         - Cleans decision boundary
+         - Often combined with SMOTE
+         - Improves model performance
+     """
+     def euclidean_distance(p1, p2):
+         return sum((a - b) ** 2 for a, b in zip(p1, p2)) ** 0.5
+
+     # Find Tomek links: opposite-class pairs that are each other's
+     # nearest neighbors (simplified O(n^2) version)
+     tomek_indices = set()
+
+     for i in range(len(X)):
+         # Find nearest neighbor with different class
+         min_dist = float('inf')
+         nearest_idx = -1
+
+         for j in range(len(X)):
+             if i != j and y[i] != y[j]:
+                 dist = euclidean_distance(X[i], X[j])
+                 if dist < min_dist:
+                     min_dist = dist
+                     nearest_idx = j
+
+         if nearest_idx != -1:
+             # The pair is a Tomek link only if no third sample is closer
+             # to either endpoint than the endpoints are to each other
+             is_tomek = True
+             for k in range(len(X)):
+                 if k != i and k != nearest_idx:
+                     if (euclidean_distance(X[i], X[k]) < min_dist or
+                             euclidean_distance(X[nearest_idx], X[k]) < min_dist):
+                         is_tomek = False
+                         break
+
+             if is_tomek:
+                 # Remove majority class sample
+                 if sum(1 for label in y if label == y[i]) > sum(1 for label in y if label == y[nearest_idx]):
+                     tomek_indices.add(i)
+                 else:
+                     tomek_indices.add(nearest_idx)
+
+     # Remove Tomek links
+     X_cleaned = [X[i] for i in range(len(X)) if i not in tomek_indices]
+     y_cleaned = [y[i] for i in range(len(y)) if i not in tomek_indices]
+
+     return X_cleaned, y_cleaned
+
+
+ # Create alias
+ tomek_links = tomek_links_undersampling
+
+
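+ # As the Notes above say, Tomek-link cleaning is often combined with
+ # SMOTE (the SMOTE-Tomek pipeline): oversample first, then strip the
+ # borderline pairs the oversampling may have created. A minimal sketch
+ # using this module's own functions:
+ #
+ #     X_res, y_res = smote_oversampling(X, y)
+ #     X_clean, y_clean = tomek_links_undersampling(X_res, y_res)
+
+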
+ def class_weight_calculator(y: List[int]) -> Dict[int, float]:
+     """
+     Calculate class weights for imbalanced data.
+
+     Alias: class_weights()
+
+     Args:
+         y: Labels
+
+     Returns:
+         dict: Class weights
+
+     Examples:
+         >>> from ilovetools.ml import class_weights  # Short alias
+
+         >>> y = [0, 0, 0, 0, 1]
+         >>> weights = class_weights(y)
+         >>> print(weights[0] < weights[1])
+         True
+
+         >>> from ilovetools.ml import class_weight_calculator  # Full name
+         >>> weights = class_weight_calculator(y)
+
+     Notes:
+         - Inverse of class frequency
+         - Use in model training
+         - Penalizes minority errors more
+         - Sklearn-compatible
+     """
+     # Count samples per class
+     class_counts = {}
+     for label in y:
+         class_counts[label] = class_counts.get(label, 0) + 1
+
+     # Calculate weights (inverse frequency)
+     n_samples = len(y)
+     n_classes = len(class_counts)
+
+     weights = {}
+     for label, count in class_counts.items():
+         weights[label] = n_samples / (n_classes * count)
+
+     return weights
+
+
+ # Create alias
+ class_weights = class_weight_calculator
+
+
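+ # The n_samples / (n_classes * count) formula above is the same
+ # heuristic scikit-learn uses for class_weight='balanced', so the
+ # returned dict can be passed to estimators that accept per-class
+ # weights. A hedged sketch (scikit-learn is not a dependency of this
+ # module):
+ #
+ #     from sklearn.linear_model import LogisticRegression
+ #     clf = LogisticRegression(class_weight=class_weights(y_train))
+
+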
+ def stratified_sampling(
+     X: List[List[float]],
+     y: List[int],
+     sample_size: int
+ ) -> Tuple[List[List[float]], List[int]]:
+     """
+     Stratified sampling maintaining class distribution.
+
+     Alias: stratified_sample()
+
+     Args:
+         X: Feature data
+         y: Labels
+         sample_size: Number of samples to draw
+
+     Returns:
+         tuple: (X_sample, y_sample)
+
+     Examples:
+         >>> from ilovetools.ml import stratified_sample  # Short alias
+
+         >>> X = [[1, 2], [2, 3], [3, 4], [4, 5], [5, 6]]
+         >>> y = [0, 0, 0, 1, 1]
+         >>>
+         >>> X_sample, y_sample = stratified_sample(X, y, sample_size=3)
+         >>> print(len(X_sample))
+         3
+
+         >>> from ilovetools.ml import stratified_sampling  # Full name
+         >>> X_sample, y_sample = stratified_sampling(X, y, 3)
+
+     Notes:
+         - Maintains class proportions
+         - Better for train/test split
+         - Reduces sampling bias
+         - Essential for imbalanced data
+     """
+     # Separate by class
+     class_indices = {}
+     for idx, label in enumerate(y):
+         if label not in class_indices:
+             class_indices[label] = []
+         class_indices[label].append(idx)
+
+     # Allocate samples per class with the largest-remainder method so
+     # the total matches sample_size (plain truncation can come up short)
+     raw_quotas = {
+         label: sample_size * len(indices) / len(y)
+         for label, indices in class_indices.items()
+     }
+     n_per_class = {label: int(quota) for label, quota in raw_quotas.items()}
+     remainder = sample_size - sum(n_per_class.values())
+     for label in sorted(raw_quotas, key=lambda l: raw_quotas[l] - int(raw_quotas[l]), reverse=True):
+         if remainder <= 0:
+             break
+         n_per_class[label] += 1
+         remainder -= 1
+
+     X_sample = []
+     y_sample = []
+
+     for label, n_samples in n_per_class.items():
+         indices = class_indices[label]
+         if n_samples > 0:
+             selected = random.sample(indices, min(n_samples, len(indices)))
+             for idx in selected:
+                 X_sample.append(X[idx])
+                 y_sample.append(y[idx])
+
+     return X_sample, y_sample
+
+
+ # Create alias
+ stratified_sample = stratified_sampling
+
+
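+ # Worked allocation example for the largest-remainder step above, using
+ # the docstring's data: y = [0, 0, 0, 1, 1] with sample_size=3 gives raw
+ # quotas 1.8 and 1.2; the floors 1 and 1 leave one slot, which goes to
+ # class 0 (larger fractional part), so the draw is 2 samples of class 0
+ # and 1 of class 1 - exactly sample_size in total.
+
+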
+ def compute_class_distribution(y: List[int]) -> Dict[str, Any]:
+     """
+     Compute class distribution statistics.
+
+     Alias: class_dist()
+
+     Args:
+         y: Labels
+
+     Returns:
+         dict: Distribution statistics
+
+     Examples:
+         >>> from ilovetools.ml import class_dist  # Short alias
+
+         >>> y = [0, 0, 0, 0, 1]
+         >>> dist = class_dist(y)
+         >>> print(dist['counts'])
+         {0: 4, 1: 1}
+         >>> print(dist['imbalance_ratio'])
+         4.0
+
+         >>> from ilovetools.ml import compute_class_distribution  # Full name
+         >>> dist = compute_class_distribution(y)
+
+     Notes:
+         - Understand data imbalance
+         - Plan resampling strategy
+         - Monitor class distribution
+         - Essential first step
+     """
+     # Count samples per class
+     class_counts = {}
+     for label in y:
+         class_counts[label] = class_counts.get(label, 0) + 1
+
+     # Calculate proportions
+     total = len(y)
+     class_proportions = {
+         label: count / total
+         for label, count in class_counts.items()
+     }
+
+     # Find majority and minority
+     majority_class = max(class_counts, key=class_counts.get)
+     minority_class = min(class_counts, key=class_counts.get)
+
+     # Calculate imbalance ratio
+     imbalance_ratio = class_counts[majority_class] / class_counts[minority_class]
+
+     return {
+         'counts': class_counts,
+         'proportions': class_proportions,
+         'majority_class': majority_class,
+         'minority_class': minority_class,
+         'imbalance_ratio': imbalance_ratio,
+         'total_samples': total,
+     }
+
+
+ # Create alias
+ class_dist = compute_class_distribution
+
+
+ def balance_dataset(
+     X: List[List[float]],
+     y: List[int],
+     method: str = 'oversample',
+     target_ratio: float = 1.0
+ ) -> Tuple[List[List[float]], List[int]]:
+     """
+     Balance dataset using specified method.
+
+     Alias: balance_data()
+
+     Args:
+         X: Feature data
+         y: Labels
+         method: 'oversample', 'undersample', or 'smote'
+         target_ratio: Desired balance ratio
+
+     Returns:
+         tuple: (X_balanced, y_balanced)
+
+     Examples:
+         >>> from ilovetools.ml import balance_data  # Short alias
+
+         >>> X = [[1, 2], [2, 3], [3, 4], [4, 5], [5, 6]]
+         >>> y = [0, 0, 0, 0, 1]
+         >>>
+         >>> X_bal, y_bal = balance_data(X, y, method='oversample')
+         >>> print(len(y_bal) >= len(y))
+         True
+
+         >>> from ilovetools.ml import balance_dataset  # Full name
+         >>> X_bal, y_bal = balance_dataset(X, y, method='smote')
+
+     Notes:
+         - Unified interface
+         - Multiple methods
+         - Easy to switch
+         - Production ready
+     """
+     if method == 'oversample':
+         return random_oversampling(X, y, target_ratio)
+     elif method == 'undersample':
+         return random_undersampling(X, y, target_ratio)
+     elif method == 'smote':
+         return smote_oversampling(X, y, target_ratio=target_ratio)
+     else:
+         raise ValueError(f"Unknown method: {method}")
+
+
+ # Create alias
+ balance_data = balance_dataset
+
+
+ def minority_class_identifier(y: List[int]) -> int:
+     """
+     Identify minority class label.
+
+     Alias: minority_class()
+
+     Args:
+         y: Labels
+
+     Returns:
+         int: Minority class label
+
+     Examples:
+         >>> from ilovetools.ml import minority_class  # Short alias
+
+         >>> y = [0, 0, 0, 0, 1]
+         >>> minority = minority_class(y)
+         >>> print(minority)
+         1
+
+         >>> from ilovetools.ml import minority_class_identifier  # Full name
+         >>> minority = minority_class_identifier(y)
+
+     Notes:
+         - Quick identification
+         - Useful for filtering
+         - Essential for resampling
+         - Simple utility
+     """
+     class_counts = {}
+     for label in y:
+         class_counts[label] = class_counts.get(label, 0) + 1
+
+     return min(class_counts, key=class_counts.get)
+
+
+ # Create alias
+ minority_class = minority_class_identifier
+
+
+ def imbalance_ratio(y: List[int]) -> float:
+     """
+     Calculate imbalance ratio (majority/minority).
+
+     Alias: imbalance_ratio_alias()
+
+     Args:
+         y: Labels
+
+     Returns:
+         float: Imbalance ratio
+
+     Examples:
+         >>> from ilovetools.ml import imbalance_ratio
+
+         >>> y = [0, 0, 0, 0, 1]
+         >>> ratio = imbalance_ratio(y)
+         >>> print(ratio)
+         4.0
+
+         >>> y = [0, 0, 1, 1]
+         >>> ratio = imbalance_ratio(y)
+         >>> print(ratio)
+         1.0
+
+     Notes:
+         - Quick assessment
+         - 1.0 = balanced
+         - >3.0 = highly imbalanced
+         - Guide resampling strategy
+     """
+     class_counts = {}
+     for label in y:
+         class_counts[label] = class_counts.get(label, 0) + 1
+
+     majority_count = max(class_counts.values())
+     minority_count = min(class_counts.values())
+
+     return majority_count / minority_count
+
+
+ # Create alias (different name to avoid conflict)
+ imbalance_ratio_alias = imbalance_ratio
+
+
+ def synthetic_sample_generator(
+     sample: List[float],
+     neighbor: List[float],
+     alpha: Optional[float] = None
+ ) -> List[float]:
+     """
+     Generate synthetic sample between two samples.
+
+     Alias: synthetic_sample()
+
+     Args:
+         sample: First sample
+         neighbor: Second sample
+         alpha: Interpolation factor (None = random)
+
+     Returns:
+         list: Synthetic sample
+
+     Examples:
+         >>> from ilovetools.ml import synthetic_sample  # Short alias
+
+         >>> sample = [1.0, 2.0]
+         >>> neighbor = [3.0, 4.0]
+         >>> synthetic = synthetic_sample(sample, neighbor, alpha=0.5)
+         >>> print(synthetic)
+         [2.0, 3.0]
+
+         >>> from ilovetools.ml import synthetic_sample_generator  # Full name
+         >>> synthetic = synthetic_sample_generator(sample, neighbor)
+
+     Notes:
+         - Core of SMOTE
+         - Linear interpolation
+         - Creates diversity
+         - Reduces overfitting
+     """
+     if alpha is None:
+         alpha = random.random()
+
+     return [
+         sample[i] + alpha * (neighbor[i] - sample[i])
+         for i in range(len(sample))
+     ]
+
+
+ # Create alias
+ synthetic_sample = synthetic_sample_generator
+
+
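+ # Interpolation check for the docstring example above: with alpha=0.5,
+ # sample [1.0, 2.0] and neighbor [3.0, 4.0] give
+ # [1.0 + 0.5*(3.0 - 1.0), 2.0 + 0.5*(4.0 - 2.0)] = [2.0, 3.0].
+
+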
+ def near_miss_undersampling(
+     X: List[List[float]],
+     y: List[int],
+     version: int = 1
+ ) -> Tuple[List[List[float]], List[int]]:
+     """
+     NearMiss undersampling algorithm.
+
+     Alias: near_miss()
+
+     Args:
+         X: Feature data
+         y: Labels
+         version: NearMiss version; only version 1 (keep the majority
+             samples closest to the minority class) is currently
+             implemented, and other values fall back to it
+
+     Returns:
+         tuple: (X_resampled, y_resampled)
+
+     Examples:
+         >>> from ilovetools.ml import near_miss  # Short alias
+
+         >>> X = [[1, 2], [2, 3], [3, 4], [4, 5], [5, 6]]
+         >>> y = [0, 0, 0, 0, 1]
+         >>>
+         >>> X_res, y_res = near_miss(X, y, version=1)
+         >>> print(len(y_res) < len(y))
+         True
+
+         >>> from ilovetools.ml import near_miss_undersampling  # Full name
+         >>> X_res, y_res = near_miss_undersampling(X, y)
+
+     Notes:
+         - Intelligent undersampling
+         - Keeps informative samples
+         - Better than random
+         - Only NearMiss-1 is implemented here
+     """
+     def euclidean_distance(p1, p2):
+         return sum((a - b) ** 2 for a, b in zip(p1, p2)) ** 0.5
+
+     # Separate by class
+     class_indices = {}
+     for idx, label in enumerate(y):
+         if label not in class_indices:
+             class_indices[label] = []
+         class_indices[label].append(idx)
+
+     # Find majority and minority
+     majority_label = max(class_indices, key=lambda k: len(class_indices[k]))
+     minority_label = min(class_indices, key=lambda k: len(class_indices[k]))
+
+     majority_indices = class_indices[majority_label]
+     minority_indices = class_indices[minority_label]
+
+     # NearMiss-1: keep the majority samples closest to the minority
+     # class (simplified here to average distance over all minority
+     # samples rather than the k nearest)
+     target_size = len(minority_indices)
+
+     distances = []
+     for maj_idx in majority_indices:
+         avg_dist = sum(
+             euclidean_distance(X[maj_idx], X[min_idx])
+             for min_idx in minority_indices
+         ) / len(minority_indices)
+         distances.append((maj_idx, avg_dist))
+
+     # Select samples with smallest average distance
+     distances.sort(key=lambda x: x[1])
+     selected_majority = [idx for idx, _ in distances[:target_size]]
+
+     # Combine with minority class
+     X_resampled = [X[i] for i in minority_indices + selected_majority]
+     y_resampled = [y[i] for i in minority_indices + selected_majority]
+
+     return X_resampled, y_resampled
+
+
+ # Create alias
+ near_miss = near_miss_undersampling