ilovetools 0.1.6__py3-none-any.whl → 0.1.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,971 @@
+ """
+ Feature selection utilities for ML workflows
+ Each function has TWO names: full descriptive name + abbreviated alias
+ """
+
+ from typing import List, Dict, Any, Callable, Optional, Tuple
+ import random
+
+ __all__ = [
+     # Full names
+     'correlation_filter',
+     'variance_threshold_filter',
+     'chi_square_filter',
+     'mutual_information_filter',
+     'recursive_feature_elimination',
+     'forward_feature_selection',
+     'backward_feature_elimination',
+     'feature_importance_ranking',
+     'l1_feature_selection',
+     'univariate_feature_selection',
+     'select_k_best_features',
+     'remove_correlated_features',
+     # Abbreviated aliases
+     'corr_filter',
+     'var_filter',
+     'chi2_filter',
+     'mi_filter',
+     'rfe',
+     'forward_select',
+     'backward_select',
+     'feat_importance',
+     'l1_select',
+     'univariate_select',
+     'select_k_best',
+     'remove_corr',
+ ]
+
+
+ def correlation_filter(
+     X: List[List[float]],
+     feature_names: Optional[List[str]] = None,
+     threshold: float = 0.9
+ ) -> Tuple[List[int], List[str]]:
+     """
+     Remove highly correlated features.
+
+     Alias: corr_filter()
+
+     Args:
+         X: Feature matrix [n_samples, n_features]
+         feature_names: Optional feature names
+         threshold: Correlation threshold (default: 0.9)
+
+     Returns:
+         tuple: (selected_indices, selected_names)
+
+     Examples:
+         >>> from ilovetools.ml import corr_filter  # Short alias
+
+         >>> X = [
+         ...     [1, 2, 2.1],
+         ...     [2, 4, 4.2],
+         ...     [3, 6, 6.3],
+         ...     [4, 8, 8.4]
+         ... ]
+         >>> feature_names = ['A', 'B', 'C']
+         >>>
+         >>> # A, B and C are all perfectly correlated, so only the first is kept
+         >>> indices, names = corr_filter(X, feature_names, threshold=0.9)
+         >>> print(f"Selected: {names}")
+         Selected: ['A']
+
+         >>> from ilovetools.ml import correlation_filter  # Full name
+         >>> indices, names = correlation_filter(X, feature_names)
+
+     Notes:
+         - Removes redundant features
+         - Keeps the first feature of each correlated pair
+         - Fast filter method
+         - Use before training
+     """
+     n_features = len(X[0])
+
+     if feature_names is None:
+         feature_names = [f"feature_{i}" for i in range(n_features)]
+
+     # Calculate correlation matrix
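+     # Pearson correlation for each pair of columns:
+     #     r_ij = sum((x_i - mean_i) * (x_j - mean_j)) / (n * std_i * std_j)
+     # using population standard deviations; the absolute value |r_ij| is stored.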
+     corr_matrix = []
+     for i in range(n_features):
+         row = []
+         for j in range(n_features):
+             if i == j:
+                 row.append(1.0)
+             else:
+                 # Calculate correlation
+                 col_i = [sample[i] for sample in X]
+                 col_j = [sample[j] for sample in X]
+
+                 mean_i = sum(col_i) / len(col_i)
+                 mean_j = sum(col_j) / len(col_j)
+
+                 numerator = sum((col_i[k] - mean_i) * (col_j[k] - mean_j)
+                                 for k in range(len(col_i)))
+
+                 std_i = (sum((x - mean_i) ** 2 for x in col_i) / len(col_i)) ** 0.5
+                 std_j = (sum((x - mean_j) ** 2 for x in col_j) / len(col_j)) ** 0.5
+
+                 if std_i == 0 or std_j == 0:
+                     corr = 0.0
+                 else:
+                     corr = numerator / (len(col_i) * std_i * std_j)
+
+                 row.append(abs(corr))
+         corr_matrix.append(row)
+
+     # Find features to keep
+     to_remove = set()
+     for i in range(n_features):
+         if i in to_remove:
+             continue
+         for j in range(i + 1, n_features):
+             if j in to_remove:
+                 continue
+             if corr_matrix[i][j] > threshold:
+                 to_remove.add(j)
+
+     selected_indices = [i for i in range(n_features) if i not in to_remove]
+     selected_names = [feature_names[i] for i in selected_indices]
+
+     return selected_indices, selected_names
+
+
+ # Create alias
+ corr_filter = correlation_filter
+
+
+ def variance_threshold_filter(
+     X: List[List[float]],
+     feature_names: Optional[List[str]] = None,
+     threshold: float = 0.0
+ ) -> Tuple[List[int], List[str]]:
+     """
+     Remove low-variance features.
+
+     Alias: var_filter()
+
+     Args:
+         X: Feature matrix [n_samples, n_features]
+         feature_names: Optional feature names
+         threshold: Variance threshold (default: 0.0)
+
+     Returns:
+         tuple: (selected_indices, selected_names)
+
+     Examples:
+         >>> from ilovetools.ml import var_filter  # Short alias
+
+         >>> X = [
+         ...     [1, 5, 0],
+         ...     [2, 6, 0],
+         ...     [3, 7, 0],
+         ...     [4, 8, 0]
+         ... ]
+         >>> feature_names = ['A', 'B', 'C']
+         >>>
+         >>> # Feature C has zero variance (constant)
+         >>> indices, names = var_filter(X, feature_names, threshold=0.1)
+         >>> print(f"Selected: {names}")
+         Selected: ['A', 'B']
+
+         >>> from ilovetools.ml import variance_threshold_filter  # Full name
+         >>> indices, names = variance_threshold_filter(X, feature_names)
+
+     Notes:
+         - Removes constant/near-constant features
+         - Very fast filter method
+         - Run first in pipeline
+         - Threshold 0.0 removes only constants
+     """
+     n_features = len(X[0])
+
+     if feature_names is None:
+         feature_names = [f"feature_{i}" for i in range(n_features)]
+
+     selected_indices = []
+     selected_names = []
+
+     for i in range(n_features):
+         col = [row[i] for row in X]
+         mean = sum(col) / len(col)
+         variance = sum((x - mean) ** 2 for x in col) / len(col)
+
+         if variance > threshold:
+             selected_indices.append(i)
+             selected_names.append(feature_names[i])
+
+     return selected_indices, selected_names
+
+
+ # Create alias
+ var_filter = variance_threshold_filter
+
+
+ def chi_square_filter(
+     X: List[List[float]],
+     y: List[int],
+     feature_names: Optional[List[str]] = None,
+     k: int = 10
+ ) -> Tuple[List[int], List[str], List[float]]:
+     """
+     Chi-square test for categorical features.
+
+     Alias: chi2_filter()
+
+     Args:
+         X: Feature matrix [n_samples, n_features]
+         y: Target labels (categorical)
+         feature_names: Optional feature names
+         k: Number of top features to select
+
+     Returns:
+         tuple: (selected_indices, selected_names, scores)
+
+     Examples:
+         >>> from ilovetools.ml import chi2_filter  # Short alias
+
+         >>> X = [
+         ...     [1, 0, 1],
+         ...     [0, 1, 1],
+         ...     [1, 1, 0],
+         ...     [0, 0, 0]
+         ... ]
+         >>> y = [1, 1, 0, 0]
+         >>> feature_names = ['A', 'B', 'C']
+         >>>
+         >>> indices, names, scores = chi2_filter(X, y, feature_names, k=2)
+         >>> print(f"Selected: {names}")
+         >>> print(f"Scores: {[f'{s:.2f}' for s in scores]}")
+
+         >>> from ilovetools.ml import chi_square_filter  # Full name
+         >>> indices, names, scores = chi_square_filter(X, y, feature_names)
+
+     Notes:
+         - For categorical/binary features
+         - Measures independence from target
+         - Fast filter method
+         - Higher score = more important
+     """
+     n_features = len(X[0])
+
+     if feature_names is None:
+         feature_names = [f"feature_{i}" for i in range(n_features)]
+
+     # Calculate chi-square scores
+     scores = []
+     for i in range(n_features):
+         col = [row[i] for row in X]
+
+         # Simple chi-square approximation
+         # Group by class and calculate observed vs expected
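+         # chi2 = sum over classes of (observed - expected)^2 / expected, where
+         # "observed" is the feature's total within a class and "expected" is the
+         # feature's overall total times that class's share of the samples.
+         # Note: only the binary labels 0 and 1 are counted below.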
+         class_0_sum = sum(col[j] for j in range(len(col)) if y[j] == 0)
+         class_1_sum = sum(col[j] for j in range(len(col)) if y[j] == 1)
+
+         class_0_count = sum(1 for label in y if label == 0)
+         class_1_count = sum(1 for label in y if label == 1)
+
+         total = sum(col)
+
+         if total == 0 or class_0_count == 0 or class_1_count == 0:
+             scores.append(0.0)
+             continue
+
+         expected_0 = total * class_0_count / len(y)
+         expected_1 = total * class_1_count / len(y)
+
+         chi2 = 0.0
+         if expected_0 > 0:
+             chi2 += (class_0_sum - expected_0) ** 2 / expected_0
+         if expected_1 > 0:
+             chi2 += (class_1_sum - expected_1) ** 2 / expected_1
+
+         scores.append(chi2)
+
+     # Select top k features
+     indexed_scores = [(i, score) for i, score in enumerate(scores)]
+     indexed_scores.sort(key=lambda x: x[1], reverse=True)
+
+     selected_indices = [i for i, _ in indexed_scores[:k]]
+     selected_names = [feature_names[i] for i in selected_indices]
+     selected_scores = [scores[i] for i in selected_indices]
+
+     return selected_indices, selected_names, selected_scores
+
+
+ # Create alias
+ chi2_filter = chi_square_filter
+
+
+ def mutual_information_filter(
+     X: List[List[float]],
+     y: List,
+     feature_names: Optional[List[str]] = None,
+     k: int = 10
+ ) -> Tuple[List[int], List[str], List[float]]:
+     """
+     Mutual information for feature selection.
+
+     Alias: mi_filter()
+
+     Args:
+         X: Feature matrix [n_samples, n_features]
+         y: Target values
+         feature_names: Optional feature names
+         k: Number of top features to select
+
+     Returns:
+         tuple: (selected_indices, selected_names, scores)
+
+     Examples:
+         >>> from ilovetools.ml import mi_filter  # Short alias
+
+         >>> X = [
+         ...     [1, 2, 3],
+         ...     [2, 4, 6],
+         ...     [3, 6, 9],
+         ...     [4, 8, 12]
+         ... ]
+         >>> y = [1, 2, 3, 4]
+         >>> feature_names = ['A', 'B', 'C']
+         >>>
+         >>> indices, names, scores = mi_filter(X, y, feature_names, k=2)
+         >>> print(f"Selected: {names}")
+
+         >>> from ilovetools.ml import mutual_information_filter  # Full name
+         >>> indices, names, scores = mutual_information_filter(X, y, feature_names)
+
+     Notes:
+         - Measures dependency on the target
+         - True mutual information captures non-linear relationships
+         - Approximated here by absolute Pearson correlation (linear dependence only)
+         - Higher score = more informative
+     """
+     n_features = len(X[0])
+
+     if feature_names is None:
+         feature_names = [f"feature_{i}" for i in range(n_features)]
+
+     # Calculate MI scores (simplified correlation-based approximation)
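+     # Note: exact mutual information would require estimating probability
+     # distributions; |Pearson r| with the target is used as a stand-in here,
+     # so in practice only linear dependence is measured.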
+     scores = []
+     for i in range(n_features):
+         col = [row[i] for row in X]
+
+         # Calculate correlation with target
+         mean_x = sum(col) / len(col)
+         mean_y = sum(y) / len(y)
+
+         numerator = sum((col[j] - mean_x) * (y[j] - mean_y) for j in range(len(col)))
+
+         std_x = (sum((x - mean_x) ** 2 for x in col) / len(col)) ** 0.5
+         std_y = (sum((y_val - mean_y) ** 2 for y_val in y) / len(y)) ** 0.5
+
+         if std_x == 0 or std_y == 0:
+             mi_score = 0.0
+         else:
+             corr = numerator / (len(col) * std_x * std_y)
+             mi_score = abs(corr)  # Simplified MI approximation
+
+         scores.append(mi_score)
+
+     # Select top k features
+     indexed_scores = [(i, score) for i, score in enumerate(scores)]
+     indexed_scores.sort(key=lambda x: x[1], reverse=True)
+
+     selected_indices = [i for i, _ in indexed_scores[:k]]
+     selected_names = [feature_names[i] for i in selected_indices]
+     selected_scores = [scores[i] for i in selected_indices]
+
+     return selected_indices, selected_names, selected_scores
+
+
+ # Create alias
+ mi_filter = mutual_information_filter
+
+
+ def recursive_feature_elimination(
+     X: List[List[float]],
+     y: List,
+     model_func: Callable,
+     metric_func: Callable,
+     feature_names: Optional[List[str]] = None,
+     n_features_to_select: int = 5
+ ) -> Tuple[List[int], List[str], List[float]]:
+     """
+     Recursive Feature Elimination (RFE).
+
+     Alias: rfe()
+
+     Args:
+         X: Feature matrix [n_samples, n_features]
+         y: Target values
+         model_func: Function(X_train, y_train, X_test) -> predictions
+         metric_func: Function(y_true, y_pred) -> score (higher is better)
+         feature_names: Optional feature names
+         n_features_to_select: Number of features to keep
+
+     Returns:
+         tuple: (selected_indices, selected_names, scores_history)
+
+     Examples:
+         >>> from ilovetools.ml import rfe  # Short alias
+
+         >>> X = [[1, 2, 3], [2, 4, 6], [3, 6, 9], [4, 8, 12]]
+         >>> y = [1, 2, 3, 4]
+         >>>
+         >>> def model(X_tr, y_tr, X_te):
+         ...     avg = sum(y_tr) / len(y_tr)
+         ...     return [avg] * len(X_te)
+         >>>
+         >>> def metric(y_true, y_pred):
+         ...     return -sum(abs(y_true[i] - y_pred[i]) for i in range(len(y_true)))
+         >>>
+         >>> indices, names, history = rfe(X, y, model, metric, n_features_to_select=2)
+         >>> print(f"Selected: {names}")
+
+         >>> from ilovetools.ml import recursive_feature_elimination  # Full name
+         >>> indices, names, history = recursive_feature_elimination(X, y, model, metric)
+
+     Notes:
+         - Wrapper method (uses the model)
+         - Removes the least important feature each iteration
+         - Considers feature interactions
+         - Computationally expensive
+     """
+     n_features = len(X[0])
+
+     if feature_names is None:
+         feature_names = [f"feature_{i}" for i in range(n_features)]
+
+     remaining_indices = list(range(n_features))
+     scores_history = []
+
+     while len(remaining_indices) > n_features_to_select:
+         # Evaluate each feature's contribution
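+         # Leave-one-out scoring: each remaining feature is scored by how well
+         # the model does on the subset that excludes it; a high score without
+         # the feature means it contributes little.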
+         feature_scores = []
+
+         for idx in remaining_indices:
+             # Create subset without this feature
+             subset_indices = [i for i in remaining_indices if i != idx]
+             X_subset = [[row[i] for i in subset_indices] for row in X]
+
+             # Train and evaluate
+             predictions = model_func(X_subset, y, X_subset)
+             score = metric_func(y, predictions)
+             feature_scores.append((idx, score))
+
+         # Remove the least important feature: the one whose removal leaves the
+         # best score (assumes metric_func returns higher-is-better values)
+         least_important_idx = max(feature_scores, key=lambda x: x[1])[0]
+         remaining_indices.remove(least_important_idx)
+
+         # Record score
+         X_current = [[row[i] for i in remaining_indices] for row in X]
+         predictions = model_func(X_current, y, X_current)
+         current_score = metric_func(y, predictions)
+         scores_history.append(current_score)
+
+     selected_names = [feature_names[i] for i in remaining_indices]
+
+     return remaining_indices, selected_names, scores_history
+
+
+ # Create alias
+ rfe = recursive_feature_elimination
+
+
+ def forward_feature_selection(
+     X: List[List[float]],
+     y: List,
+     model_func: Callable,
+     metric_func: Callable,
+     feature_names: Optional[List[str]] = None,
+     n_features_to_select: int = 5
+ ) -> Tuple[List[int], List[str], List[float]]:
+     """
+     Forward Feature Selection.
+
+     Alias: forward_select()
+
+     Args:
+         X: Feature matrix [n_samples, n_features]
+         y: Target values
+         model_func: Function(X_train, y_train, X_test) -> predictions
+         metric_func: Function(y_true, y_pred) -> score (higher is better)
+         feature_names: Optional feature names
+         n_features_to_select: Number of features to select
+
+     Returns:
+         tuple: (selected_indices, selected_names, scores_history)
+
+     Examples:
+         >>> from ilovetools.ml import forward_select  # Short alias
+
+         >>> X = [[1, 2, 3], [2, 4, 6], [3, 6, 9], [4, 8, 12]]
+         >>> y = [1, 2, 3, 4]
+         >>>
+         >>> def model(X_tr, y_tr, X_te):
+         ...     avg = sum(y_tr) / len(y_tr)
+         ...     return [avg] * len(X_te)
+         >>>
+         >>> def metric(y_true, y_pred):
+         ...     return -sum(abs(y_true[i] - y_pred[i]) for i in range(len(y_true)))
+         >>>
+         >>> indices, names, history = forward_select(X, y, model, metric, n_features_to_select=2)
+         >>> print(f"Selected: {names}")
+
+         >>> from ilovetools.ml import forward_feature_selection  # Full name
+         >>> indices, names, history = forward_feature_selection(X, y, model, metric)
+
+     Notes:
+         - Wrapper method
+         - Adds best feature iteratively
+         - Greedy approach
+         - Good for small feature sets
+     """
+     n_features = len(X[0])
+
+     if feature_names is None:
+         feature_names = [f"feature_{i}" for i in range(n_features)]
+
+     selected_indices = []
+     remaining_indices = list(range(n_features))
+     scores_history = []
+
+     for _ in range(min(n_features_to_select, n_features)):
+         best_score = float('-inf')
+         best_idx = None
+
+         for idx in remaining_indices:
+             # Try adding this feature
+             trial_indices = selected_indices + [idx]
+             X_subset = [[row[i] for i in trial_indices] for row in X]
+
+             # Evaluate
+             predictions = model_func(X_subset, y, X_subset)
+             score = metric_func(y, predictions)
+
+             if score > best_score:
+                 best_score = score
+                 best_idx = idx
+
+         if best_idx is not None:
+             selected_indices.append(best_idx)
+             remaining_indices.remove(best_idx)
+             scores_history.append(best_score)
+
+     selected_names = [feature_names[i] for i in selected_indices]
+
+     return selected_indices, selected_names, scores_history
+
+
+ # Create alias
+ forward_select = forward_feature_selection
+
+
+ def backward_feature_elimination(
+     X: List[List[float]],
+     y: List,
+     model_func: Callable,
+     metric_func: Callable,
+     feature_names: Optional[List[str]] = None,
+     n_features_to_select: int = 5
+ ) -> Tuple[List[int], List[str], List[float]]:
+     """
+     Backward Feature Elimination.
+
+     Alias: backward_select()
+
+     Starts from the full feature set and removes the least useful feature one
+     step at a time.
+
+     Args:
+         X: Feature matrix [n_samples, n_features]
+         y: Target values
+         model_func: Function(X_train, y_train, X_test) -> predictions
+         metric_func: Function(y_true, y_pred) -> score (higher is better)
+         feature_names: Optional feature names
+         n_features_to_select: Number of features to keep
+
+     Returns:
+         tuple: (selected_indices, selected_names, scores_history)
+
+     Examples:
+         >>> from ilovetools.ml import backward_select  # Short alias
+
+         >>> X = [[1, 2, 3], [2, 4, 6], [3, 6, 9], [4, 8, 12]]
+         >>> y = [1, 2, 3, 4]
+         >>>
+         >>> def model(X_tr, y_tr, X_te):
+         ...     avg = sum(y_tr) / len(y_tr)
+         ...     return [avg] * len(X_te)
+         >>>
+         >>> def metric(y_true, y_pred):
+         ...     return -sum(abs(y_true[i] - y_pred[i]) for i in range(len(y_true)))
+         >>>
+         >>> indices, names, history = backward_select(X, y, model, metric, n_features_to_select=2)
+
+         >>> from ilovetools.ml import backward_feature_elimination  # Full name
+         >>> indices, names, history = backward_feature_elimination(X, y, model, metric)
+
+     Notes:
+         - Wrapper method
+         - Starts with all features
+         - Removes the least important feature each step
+         - Currently shares its implementation with recursive_feature_elimination
+     """
+     # Same implementation as RFE for simplicity
+     return recursive_feature_elimination(
+         X, y, model_func, metric_func, feature_names, n_features_to_select
+     )
+
+
+ # Create alias
+ backward_select = backward_feature_elimination
+
+
+ def feature_importance_ranking(
+     importances: List[float],
+     feature_names: Optional[List[str]] = None,
+     k: Optional[int] = None
+ ) -> Tuple[List[int], List[str], List[float]]:
+     """
+     Rank features by importance scores.
+
+     Alias: feat_importance()
+
+     Args:
+         importances: Feature importance scores
+         feature_names: Optional feature names
+         k: Number of top features to select (None = all)
+
+     Returns:
+         tuple: (selected_indices, selected_names, selected_scores)
+
+     Examples:
+         >>> from ilovetools.ml import feat_importance  # Short alias
+
+         >>> importances = [0.1, 0.5, 0.3, 0.8, 0.2]
+         >>> feature_names = ['A', 'B', 'C', 'D', 'E']
+         >>>
+         >>> indices, names, scores = feat_importance(importances, feature_names, k=3)
+         >>> print(f"Top 3: {names}")
+         Top 3: ['D', 'B', 'C']
+         >>> print(f"Scores: {scores}")
+         Scores: [0.8, 0.5, 0.3]
+
+         >>> from ilovetools.ml import feature_importance_ranking  # Full name
+         >>> indices, names, scores = feature_importance_ranking(importances, feature_names)
+
+     Notes:
+         - Works with any importance scores (Random Forest, XGBoost, etc.)
+         - Simple and effective
+         - Use after training
+     """
+     n_features = len(importances)
+
+     if feature_names is None:
+         feature_names = [f"feature_{i}" for i in range(n_features)]
+
+     # Sort by importance
+     indexed_importances = [(i, imp) for i, imp in enumerate(importances)]
+     indexed_importances.sort(key=lambda x: x[1], reverse=True)
+
+     if k is None:
+         k = n_features
+
+     selected_indices = [i for i, _ in indexed_importances[:k]]
+     selected_names = [feature_names[i] for i in selected_indices]
+     selected_scores = [importances[i] for i in selected_indices]
+
+     return selected_indices, selected_names, selected_scores
+
+
+ # Create alias
+ feat_importance = feature_importance_ranking
+
+
+ def l1_feature_selection(
+     X: List[List[float]],
+     y: List[float],
+     feature_names: Optional[List[str]] = None,
+     alpha: float = 0.1
+ ) -> Tuple[List[int], List[str], List[float]]:
+     """
+     L1 regularization for feature selection (Lasso).
+
+     Alias: l1_select()
+
+     Args:
+         X: Feature matrix [n_samples, n_features]
+         y: Target values
+         feature_names: Optional feature names
+         alpha: Regularization strength (higher = more sparse)
+
+     Returns:
+         tuple: (selected_indices, selected_names, coefficients)
+
+     Examples:
+         >>> from ilovetools.ml import l1_select  # Short alias
+
+         >>> X = [[1, 2, 3], [2, 4, 6], [3, 6, 9], [4, 8, 12]]
+         >>> y = [1.0, 2.0, 3.0, 4.0]
+         >>> feature_names = ['A', 'B', 'C']
+         >>>
+         >>> indices, names, coefs = l1_select(X, y, feature_names, alpha=0.1)
+         >>> print(f"Selected: {names}")
+         >>> print(f"Coefficients: {[f'{c:.2f}' for c in coefs]}")
+
+         >>> from ilovetools.ml import l1_feature_selection  # Full name
+         >>> indices, names, coefs = l1_feature_selection(X, y, feature_names)
+
+     Notes:
+         - Embedded-style method
+         - Shrinks coefficients to zero
+         - Automatic feature selection
+         - Higher alpha = fewer features
+         - Approximated here by soft-thresholding univariate correlations (no full Lasso fit)
+     """
+     n_features = len(X[0])
+
+     if feature_names is None:
+         feature_names = [f"feature_{i}" for i in range(n_features)]
+
+     # Simple L1 approximation using correlation-based weights
+     coefficients = []
+
+     for i in range(n_features):
+         col = [row[i] for row in X]
+
+         # Calculate correlation with target
+         mean_x = sum(col) / len(col)
+         mean_y = sum(y) / len(y)
+
+         numerator = sum((col[j] - mean_x) * (y[j] - mean_y) for j in range(len(col)))
+
+         std_x = (sum((x - mean_x) ** 2 for x in col) / len(col)) ** 0.5
+         std_y = (sum((y_val - mean_y) ** 2 for y_val in y) / len(y)) ** 0.5
+
+         if std_x == 0 or std_y == 0:
+             coef = 0.0
+         else:
+             corr = numerator / (len(col) * std_x * std_y)
+             # Apply soft thresholding (L1 penalty)
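+             # Soft-threshold operator: coef = sign(r) * (|r| - alpha) when
+             # |r| > alpha, otherwise 0; this is the same shrinkage Lasso
+             # applies, here acting on the univariate correlation r.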
+             if abs(corr) > alpha:
+                 coef = corr - alpha * (1 if corr > 0 else -1)
+             else:
+                 coef = 0.0
+
+         coefficients.append(coef)
+
+     # Select non-zero coefficients
+     selected_indices = [i for i, coef in enumerate(coefficients) if abs(coef) > 1e-10]
+     selected_names = [feature_names[i] for i in selected_indices]
+     selected_coefs = [coefficients[i] for i in selected_indices]
+
+     return selected_indices, selected_names, selected_coefs
+
+
+ # Create alias
+ l1_select = l1_feature_selection
+
+
+ def univariate_feature_selection(
+     X: List[List[float]],
+     y: List,
+     feature_names: Optional[List[str]] = None,
+     method: str = 'correlation',
+     k: int = 10
+ ) -> Tuple[List[int], List[str], List[float]]:
+     """
+     Univariate feature selection.
+
+     Alias: univariate_select()
+
+     Args:
+         X: Feature matrix [n_samples, n_features]
+         y: Target values
+         feature_names: Optional feature names
+         method: 'correlation', 'variance', or 'mutual_info'
+         k: Number of features to select
+
+     Returns:
+         tuple: (selected_indices, selected_names, scores)
+
+     Examples:
+         >>> from ilovetools.ml import univariate_select  # Short alias
+
+         >>> X = [[1, 2, 3], [2, 4, 6], [3, 6, 9], [4, 8, 12]]
+         >>> y = [1, 2, 3, 4]
+         >>> feature_names = ['A', 'B', 'C']
+         >>>
+         >>> indices, names, scores = univariate_select(X, y, feature_names, method='correlation', k=2)
+         >>> print(f"Selected: {names}")
+
+         >>> from ilovetools.ml import univariate_feature_selection  # Full name
+         >>> indices, names, scores = univariate_feature_selection(X, y, feature_names)
+
+     Notes:
+         - Tests each feature independently
+         - Fast filter method
+         - Ignores feature interactions
+         - Good starting point
+     """
+     if method == 'correlation' or method == 'mutual_info':
+         return mutual_information_filter(X, y, feature_names, k)
+     elif method == 'variance':
+         indices, names = variance_threshold_filter(X, feature_names, threshold=0.0)
+         scores = [1.0] * len(indices)  # Dummy scores
+         return indices[:k], names[:k], scores[:k]
+     else:
+         raise ValueError(f"Unknown method: {method}")
+
+
+ # Create alias
+ univariate_select = univariate_feature_selection
+
+
+ def select_k_best_features(
+     X: List[List[float]],
+     y: List,
+     feature_names: Optional[List[str]] = None,
+     k: int = 10,
+     method: str = 'auto'
+ ) -> Tuple[List[int], List[str]]:
+     """
+     Select k best features automatically.
+
+     Alias: select_k_best()
+
+     Args:
+         X: Feature matrix [n_samples, n_features]
+         y: Target values
+         feature_names: Optional feature names
+         k: Number of features to select
+         method: 'auto', 'correlation', 'chi2', or 'mutual_info'
+
+     Returns:
+         tuple: (selected_indices, selected_names)
+
+     Examples:
+         >>> from ilovetools.ml import select_k_best  # Short alias
+
+         >>> X = [[1, 2, 3, 4], [2, 4, 6, 8], [3, 6, 9, 12], [4, 8, 12, 16]]
+         >>> y = [1, 2, 3, 4]
+         >>> feature_names = ['A', 'B', 'C', 'D']
+         >>>
+         >>> indices, names = select_k_best(X, y, feature_names, k=2)
+         >>> print(f"Selected: {names}")
+
+         >>> from ilovetools.ml import select_k_best_features  # Full name
+         >>> indices, names = select_k_best_features(X, y, feature_names)
+
+     Notes:
+         - Automatic method selection
+         - Fast and simple
+         - Good default choice
+         - Use for quick feature reduction
+     """
+     if method == 'auto':
+         # Check if y is categorical (for chi2) or continuous
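+         # As implemented, chi_square_filter only counts the binary labels 0 and 1,
+         # so the 'chi2' branch is only meaningful for 0/1 targets; other label
+         # encodings effectively receive zero scores.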
+         unique_y = len(set(y))
+         if unique_y <= 10:  # Likely categorical
+             method = 'chi2'
+         else:
+             method = 'mutual_info'
+
+     if method == 'chi2':
+         indices, names, _ = chi_square_filter(X, y, feature_names, k)
+     elif method == 'mutual_info' or method == 'correlation':
+         indices, names, _ = mutual_information_filter(X, y, feature_names, k)
+     else:
+         raise ValueError(f"Unknown method: {method}")
+
+     return indices, names
+
+
+ # Create alias
+ select_k_best = select_k_best_features
+
+
+ def remove_correlated_features(
+     X: List[List[float]],
+     feature_names: Optional[List[str]] = None,
+     threshold: float = 0.95
+ ) -> Tuple[List[int], List[str], List[Tuple[str, str, float]]]:
+     """
+     Remove highly correlated features and return correlation pairs.
+
+     Alias: remove_corr()
+
+     Args:
+         X: Feature matrix [n_samples, n_features]
+         feature_names: Optional feature names
+         threshold: Correlation threshold (default: 0.95)
+
+     Returns:
+         tuple: (selected_indices, selected_names, removed_pairs)
+
+     Examples:
+         >>> from ilovetools.ml import remove_corr  # Short alias
+
+         >>> X = [
+         ...     [1, 2, 2.05],
+         ...     [2, 4, 4.1],
+         ...     [3, 6, 6.15],
+         ...     [4, 8, 8.2]
+         ... ]
+         >>> feature_names = ['A', 'B', 'C']
+         >>>
+         >>> indices, names, pairs = remove_corr(X, feature_names, threshold=0.95)
+         >>> print(f"Kept: {names}")
+         >>> print(f"Removed pairs: {[(p[0], p[1], f'{p[2]:.2f}') for p in pairs]}")
+
+         >>> from ilovetools.ml import remove_correlated_features  # Full name
+         >>> indices, names, pairs = remove_correlated_features(X, feature_names)
+
+     Notes:
+         - Returns which features were correlated
+         - Helps understand redundancy
+         - Use before training
+         - Threshold 0.95 is common
+     """
+     indices, names = correlation_filter(X, feature_names, threshold)
+
+     # Find removed pairs
+     n_features = len(X[0])
+     if feature_names is None:
+         feature_names = [f"feature_{i}" for i in range(n_features)]
+
+     removed_pairs = []
+     removed_indices = set(range(n_features)) - set(indices)
+
+     # Calculate correlations for removed features
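+     # Each removed feature is reported against the first kept feature whose
+     # absolute correlation with it exceeds the threshold, then the search for
+     # that removed feature stops.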
+     for removed_idx in removed_indices:
+         col_removed = [row[removed_idx] for row in X]
+
+         for kept_idx in indices:
+             col_kept = [row[kept_idx] for row in X]
+
+             # Calculate correlation
+             mean_r = sum(col_removed) / len(col_removed)
+             mean_k = sum(col_kept) / len(col_kept)
+
+             numerator = sum((col_removed[i] - mean_r) * (col_kept[i] - mean_k)
+                             for i in range(len(col_removed)))
+
+             std_r = (sum((x - mean_r) ** 2 for x in col_removed) / len(col_removed)) ** 0.5
+             std_k = (sum((x - mean_k) ** 2 for x in col_kept) / len(col_kept)) ** 0.5
+
+             if std_r > 0 and std_k > 0:
+                 corr = numerator / (len(col_removed) * std_r * std_k)
+                 if abs(corr) > threshold:
+                     removed_pairs.append((
+                         feature_names[kept_idx],
+                         feature_names[removed_idx],
+                         abs(corr)
+                     ))
+                     break
+
+     return indices, names, removed_pairs
+
+
+ # Create alias
+ remove_corr = remove_correlated_features