ilovetools 0.1.6__tar.gz → 0.1.7__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ilovetools-0.1.6/ilovetools.egg-info → ilovetools-0.1.7}/PKG-INFO +1 -1
- {ilovetools-0.1.6 → ilovetools-0.1.7}/ilovetools/__init__.py +1 -1
- {ilovetools-0.1.6 → ilovetools-0.1.7}/ilovetools/ml/__init__.py +55 -0
- ilovetools-0.1.7/ilovetools/ml/feature_selection.py +971 -0
- {ilovetools-0.1.6 → ilovetools-0.1.7/ilovetools.egg-info}/PKG-INFO +1 -1
- {ilovetools-0.1.6 → ilovetools-0.1.7}/ilovetools.egg-info/SOURCES.txt +1 -0
- {ilovetools-0.1.6 → ilovetools-0.1.7}/pyproject.toml +1 -1
- {ilovetools-0.1.6 → ilovetools-0.1.7}/setup.py +1 -1
- {ilovetools-0.1.6 → ilovetools-0.1.7}/LICENSE +0 -0
- {ilovetools-0.1.6 → ilovetools-0.1.7}/MANIFEST.in +0 -0
- {ilovetools-0.1.6 → ilovetools-0.1.7}/README.md +0 -0
- {ilovetools-0.1.6 → ilovetools-0.1.7}/ilovetools/ai/__init__.py +0 -0
- {ilovetools-0.1.6 → ilovetools-0.1.7}/ilovetools/ai/embeddings.py +0 -0
- {ilovetools-0.1.6 → ilovetools-0.1.7}/ilovetools/ai/inference.py +0 -0
- {ilovetools-0.1.6 → ilovetools-0.1.7}/ilovetools/ai/llm_helpers.py +0 -0
- {ilovetools-0.1.6 → ilovetools-0.1.7}/ilovetools/audio/__init__.py +0 -0
- {ilovetools-0.1.6 → ilovetools-0.1.7}/ilovetools/automation/__init__.py +0 -0
- {ilovetools-0.1.6 → ilovetools-0.1.7}/ilovetools/conversion/__init__.py +0 -0
- {ilovetools-0.1.6 → ilovetools-0.1.7}/ilovetools/data/__init__.py +0 -0
- {ilovetools-0.1.6 → ilovetools-0.1.7}/ilovetools/data/feature_engineering.py +0 -0
- {ilovetools-0.1.6 → ilovetools-0.1.7}/ilovetools/data/preprocessing.py +0 -0
- {ilovetools-0.1.6 → ilovetools-0.1.7}/ilovetools/database/__init__.py +0 -0
- {ilovetools-0.1.6 → ilovetools-0.1.7}/ilovetools/datetime/__init__.py +0 -0
- {ilovetools-0.1.6 → ilovetools-0.1.7}/ilovetools/files/__init__.py +0 -0
- {ilovetools-0.1.6 → ilovetools-0.1.7}/ilovetools/image/__init__.py +0 -0
- {ilovetools-0.1.6 → ilovetools-0.1.7}/ilovetools/ml/cross_validation.py +0 -0
- {ilovetools-0.1.6 → ilovetools-0.1.7}/ilovetools/ml/ensemble.py +0 -0
- {ilovetools-0.1.6 → ilovetools-0.1.7}/ilovetools/ml/metrics.py +0 -0
- {ilovetools-0.1.6 → ilovetools-0.1.7}/ilovetools/ml/tuning.py +0 -0
- {ilovetools-0.1.6 → ilovetools-0.1.7}/ilovetools/security/__init__.py +0 -0
- {ilovetools-0.1.6 → ilovetools-0.1.7}/ilovetools/text/__init__.py +0 -0
- {ilovetools-0.1.6 → ilovetools-0.1.7}/ilovetools/utils/__init__.py +0 -0
- {ilovetools-0.1.6 → ilovetools-0.1.7}/ilovetools/validation/__init__.py +0 -0
- {ilovetools-0.1.6 → ilovetools-0.1.7}/ilovetools/web/__init__.py +0 -0
- {ilovetools-0.1.6 → ilovetools-0.1.7}/ilovetools.egg-info/dependency_links.txt +0 -0
- {ilovetools-0.1.6 → ilovetools-0.1.7}/ilovetools.egg-info/top_level.txt +0 -0
- {ilovetools-0.1.6 → ilovetools-0.1.7}/requirements.txt +0 -0
- {ilovetools-0.1.6 → ilovetools-0.1.7}/setup.cfg +0 -0
- {ilovetools-0.1.6 → ilovetools-0.1.7}/tests/__init__.py +0 -0
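
The substantive change in 0.1.7 is the new ilovetools/ml/feature_selection.py module (971 lines), re-exported from ilovetools.ml under both full names and abbreviated aliases (see the ilovetools/ml/__init__.py and feature_selection.py hunks below); the remaining changes are version bumps and packaging metadata. A minimal usage sketch, assuming ilovetools 0.1.7 is installed: the call signatures follow the docstrings in the diff below, while the toy data and variable names here are illustrative only.

    from ilovetools.ml import var_filter, corr_filter, select_k_best

    # Toy data: 4 samples, 3 features (illustrative, not taken from the package)
    X = [[1, 5, 2.1],
         [2, 6, 4.2],
         [3, 7, 6.3],
         [4, 8, 8.4]]
    y = [0, 0, 1, 1]
    names = ['A', 'B', 'C']

    # Filter-style selectors: each call returns (selected_indices, selected_names)
    idx, kept = var_filter(X, names, threshold=0.0)    # drop constant columns
    idx, kept = corr_filter(X, names, threshold=0.9)   # drop highly correlated columns
    idx, kept = select_k_best(X, y, names, k=2)        # top-k by chi2 / mutual information

Each abbreviated name used above is an alias bound to its full-name twin (variance_threshold_filter, correlation_filter, select_k_best_features), so either spelling can be imported.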
{ilovetools-0.1.6/ilovetools.egg-info → ilovetools-0.1.7}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ilovetools
-Version: 0.1.6
+Version: 0.1.7
 Summary: A comprehensive Python utility library with modular tools for AI/ML, data processing, and daily programming needs
 Home-page: https://github.com/AliMehdi512/ilovetools
 Author: Ali Mehdi
{ilovetools-0.1.6 → ilovetools-0.1.7}/ilovetools/ml/__init__.py
@@ -96,6 +96,35 @@ from .ensemble import (
     blend,
 )
 
+from .feature_selection import (
+    # Full names
+    correlation_filter,
+    variance_threshold_filter,
+    chi_square_filter,
+    mutual_information_filter,
+    recursive_feature_elimination,
+    forward_feature_selection,
+    backward_feature_elimination,
+    feature_importance_ranking,
+    l1_feature_selection,
+    univariate_feature_selection,
+    select_k_best_features,
+    remove_correlated_features,
+    # Abbreviated aliases
+    corr_filter,
+    var_filter,
+    chi2_filter,
+    mi_filter,
+    rfe,
+    forward_select,
+    backward_select,
+    feat_importance,
+    l1_select,
+    univariate_select,
+    select_k_best,
+    remove_corr,
+)
+
 __all__ = [
     # Metrics (full names)
     'accuracy_score',
@@ -179,4 +208,30 @@ __all__ = [
     'oob_score',
     'diversity',
     'blend',
+    # Feature Selection (full names)
+    'correlation_filter',
+    'variance_threshold_filter',
+    'chi_square_filter',
+    'mutual_information_filter',
+    'recursive_feature_elimination',
+    'forward_feature_selection',
+    'backward_feature_elimination',
+    'feature_importance_ranking',
+    'l1_feature_selection',
+    'univariate_feature_selection',
+    'select_k_best_features',
+    'remove_correlated_features',
+    # Feature Selection (aliases)
+    'corr_filter',
+    'var_filter',
+    'chi2_filter',
+    'mi_filter',
+    'rfe',
+    'forward_select',
+    'backward_select',
+    'feat_importance',
+    'l1_select',
+    'univariate_select',
+    'select_k_best',
+    'remove_corr',
 ]
ilovetools-0.1.7/ilovetools/ml/feature_selection.py
@@ -0,0 +1,971 @@
+"""
+Feature selection utilities for ML workflows
+Each function has TWO names: full descriptive name + abbreviated alias
+"""
+
+from typing import List, Dict, Any, Callable, Optional, Tuple
+import random
+
+__all__ = [
+    # Full names
+    'correlation_filter',
+    'variance_threshold_filter',
+    'chi_square_filter',
+    'mutual_information_filter',
+    'recursive_feature_elimination',
+    'forward_feature_selection',
+    'backward_feature_elimination',
+    'feature_importance_ranking',
+    'l1_feature_selection',
+    'univariate_feature_selection',
+    'select_k_best_features',
+    'remove_correlated_features',
+    # Abbreviated aliases
+    'corr_filter',
+    'var_filter',
+    'chi2_filter',
+    'mi_filter',
+    'rfe',
+    'forward_select',
+    'backward_select',
+    'feat_importance',
+    'l1_select',
+    'univariate_select',
+    'select_k_best',
+    'remove_corr',
+]
+
+
+def correlation_filter(
+    X: List[List[float]],
+    feature_names: Optional[List[str]] = None,
+    threshold: float = 0.9
+) -> Tuple[List[int], List[str]]:
+    """
+    Remove highly correlated features.
+
+    Alias: corr_filter()
+
+    Args:
+        X: Feature matrix [n_samples, n_features]
+        feature_names: Optional feature names
+        threshold: Correlation threshold (default: 0.9)
+
+    Returns:
+        tuple: (selected_indices, selected_names)
+
+    Examples:
+        >>> from ilovetools.ml import corr_filter  # Short alias
+
+        >>> X = [
+        ...     [1, 2, 2.1],
+        ...     [2, 4, 4.2],
+        ...     [3, 6, 6.3],
+        ...     [4, 8, 8.4]
+        ... ]
+        >>> feature_names = ['A', 'B', 'C']
+        >>>
+        >>> # Features B and C are highly correlated (0.99+)
+        >>> indices, names = corr_filter(X, feature_names, threshold=0.9)
+        >>> print(f"Selected: {names}")
+        Selected: ['A', 'B']
+
+        >>> from ilovetools.ml import correlation_filter  # Full name
+        >>> indices, names = correlation_filter(X, feature_names)
+
+    Notes:
+        - Removes redundant features
+        - Keeps first of correlated pair
+        - Fast filter method
+        - Use before training
+    """
+    n_features = len(X[0])
+
+    if feature_names is None:
+        feature_names = [f"feature_{i}" for i in range(n_features)]
+
+    # Calculate correlation matrix
+    corr_matrix = []
+    for i in range(n_features):
+        row = []
+        for j in range(n_features):
+            if i == j:
+                row.append(1.0)
+            else:
+                # Calculate correlation
+                col_i = [row[i] for row in X]
+                col_j = [row[j] for row in X]
+
+                mean_i = sum(col_i) / len(col_i)
+                mean_j = sum(col_j) / len(col_j)
+
+                numerator = sum((col_i[k] - mean_i) * (col_j[k] - mean_j)
+                                for k in range(len(col_i)))
+
+                std_i = (sum((x - mean_i) ** 2 for x in col_i) / len(col_i)) ** 0.5
+                std_j = (sum((x - mean_j) ** 2 for x in col_j) / len(col_j)) ** 0.5
+
+                if std_i == 0 or std_j == 0:
+                    corr = 0.0
+                else:
+                    corr = numerator / (len(col_i) * std_i * std_j)
+
+                row.append(abs(corr))
+        corr_matrix.append(row)
+
+    # Find features to keep
+    to_remove = set()
+    for i in range(n_features):
+        if i in to_remove:
+            continue
+        for j in range(i + 1, n_features):
+            if j in to_remove:
+                continue
+            if corr_matrix[i][j] > threshold:
+                to_remove.add(j)
+
+    selected_indices = [i for i in range(n_features) if i not in to_remove]
+    selected_names = [feature_names[i] for i in selected_indices]
+
+    return selected_indices, selected_names
+
+
+# Create alias
+corr_filter = correlation_filter
+
+
+def variance_threshold_filter(
+    X: List[List[float]],
+    feature_names: Optional[List[str]] = None,
+    threshold: float = 0.0
+) -> Tuple[List[int], List[str]]:
+    """
+    Remove low-variance features.
+
+    Alias: var_filter()
+
+    Args:
+        X: Feature matrix [n_samples, n_features]
+        feature_names: Optional feature names
+        threshold: Variance threshold (default: 0.0)
+
+    Returns:
+        tuple: (selected_indices, selected_names)
+
+    Examples:
+        >>> from ilovetools.ml import var_filter  # Short alias
+
+        >>> X = [
+        ...     [1, 5, 0],
+        ...     [2, 6, 0],
+        ...     [3, 7, 0],
+        ...     [4, 8, 0]
+        ... ]
+        >>> feature_names = ['A', 'B', 'C']
+        >>>
+        >>> # Feature C has zero variance (constant)
+        >>> indices, names = var_filter(X, feature_names, threshold=0.1)
+        >>> print(f"Selected: {names}")
+        Selected: ['A', 'B']
+
+        >>> from ilovetools.ml import variance_threshold_filter  # Full name
+        >>> indices, names = variance_threshold_filter(X, feature_names)
+
+    Notes:
+        - Removes constant/near-constant features
+        - Very fast filter method
+        - Run first in pipeline
+        - Threshold 0.0 removes only constants
+    """
+    n_features = len(X[0])
+
+    if feature_names is None:
+        feature_names = [f"feature_{i}" for i in range(n_features)]
+
+    selected_indices = []
+    selected_names = []
+
+    for i in range(n_features):
+        col = [row[i] for row in X]
+        mean = sum(col) / len(col)
+        variance = sum((x - mean) ** 2 for x in col) / len(col)
+
+        if variance > threshold:
+            selected_indices.append(i)
+            selected_names.append(feature_names[i])
+
+    return selected_indices, selected_names
+
+
+# Create alias
+var_filter = variance_threshold_filter
+
+
+def chi_square_filter(
+    X: List[List[float]],
+    y: List[int],
+    feature_names: Optional[List[str]] = None,
+    k: int = 10
+) -> Tuple[List[int], List[str], List[float]]:
+    """
+    Chi-square test for categorical features.
+
+    Alias: chi2_filter()
+
+    Args:
+        X: Feature matrix [n_samples, n_features]
+        y: Target labels (categorical)
+        feature_names: Optional feature names
+        k: Number of top features to select
+
+    Returns:
+        tuple: (selected_indices, selected_names, scores)
+
+    Examples:
+        >>> from ilovetools.ml import chi2_filter  # Short alias
+
+        >>> X = [
+        ...     [1, 0, 1],
+        ...     [0, 1, 1],
+        ...     [1, 1, 0],
+        ...     [0, 0, 0]
+        ... ]
+        >>> y = [1, 1, 0, 0]
+        >>> feature_names = ['A', 'B', 'C']
+        >>>
+        >>> indices, names, scores = chi2_filter(X, y, feature_names, k=2)
+        >>> print(f"Selected: {names}")
+        >>> print(f"Scores: {[f'{s:.2f}' for s in scores]}")
+
+        >>> from ilovetools.ml import chi_square_filter  # Full name
+        >>> indices, names, scores = chi_square_filter(X, y, feature_names)
+
+    Notes:
+        - For categorical/binary features
+        - Measures independence from target
+        - Fast filter method
+        - Higher score = more important
+    """
+    n_features = len(X[0])
+
+    if feature_names is None:
+        feature_names = [f"feature_{i}" for i in range(n_features)]
+
+    # Calculate chi-square scores
+    scores = []
+    for i in range(n_features):
+        col = [row[i] for row in X]
+
+        # Simple chi-square approximation
+        # Group by class and calculate observed vs expected
+        class_0_sum = sum(col[j] for j in range(len(col)) if y[j] == 0)
+        class_1_sum = sum(col[j] for j in range(len(col)) if y[j] == 1)
+
+        class_0_count = sum(1 for label in y if label == 0)
+        class_1_count = sum(1 for label in y if label == 1)
+
+        total = sum(col)
+
+        if total == 0 or class_0_count == 0 or class_1_count == 0:
+            scores.append(0.0)
+            continue
+
+        expected_0 = total * class_0_count / len(y)
+        expected_1 = total * class_1_count / len(y)
+
+        chi2 = 0.0
+        if expected_0 > 0:
+            chi2 += (class_0_sum - expected_0) ** 2 / expected_0
+        if expected_1 > 0:
+            chi2 += (class_1_sum - expected_1) ** 2 / expected_1
+
+        scores.append(chi2)
+
+    # Select top k features
+    indexed_scores = [(i, score) for i, score in enumerate(scores)]
+    indexed_scores.sort(key=lambda x: x[1], reverse=True)
+
+    selected_indices = [i for i, _ in indexed_scores[:k]]
+    selected_names = [feature_names[i] for i in selected_indices]
+    selected_scores = [scores[i] for i in selected_indices]
+
+    return selected_indices, selected_names, selected_scores
+
+
+# Create alias
+chi2_filter = chi_square_filter
+
+
+def mutual_information_filter(
+    X: List[List[float]],
+    y: List,
+    feature_names: Optional[List[str]] = None,
+    k: int = 10
+) -> Tuple[List[int], List[str], List[float]]:
+    """
+    Mutual information for feature selection.
+
+    Alias: mi_filter()
+
+    Args:
+        X: Feature matrix [n_samples, n_features]
+        y: Target values
+        feature_names: Optional feature names
+        k: Number of top features to select
+
+    Returns:
+        tuple: (selected_indices, selected_names, scores)
+
+    Examples:
+        >>> from ilovetools.ml import mi_filter  # Short alias
+
+        >>> X = [
+        ...     [1, 2, 3],
+        ...     [2, 4, 6],
+        ...     [3, 6, 9],
+        ...     [4, 8, 12]
+        ... ]
+        >>> y = [1, 2, 3, 4]
+        >>> feature_names = ['A', 'B', 'C']
+        >>>
+        >>> indices, names, scores = mi_filter(X, y, feature_names, k=2)
+        >>> print(f"Selected: {names}")
+
+        >>> from ilovetools.ml import mutual_information_filter  # Full name
+        >>> indices, names, scores = mutual_information_filter(X, y, feature_names)
+
+    Notes:
+        - Measures dependency on target
+        - Works for any relationship
+        - Non-linear dependencies
+        - Higher score = more informative
+    """
+    n_features = len(X[0])
+
+    if feature_names is None:
+        feature_names = [f"feature_{i}" for i in range(n_features)]
+
+    # Calculate MI scores (simplified correlation-based approximation)
+    scores = []
+    for i in range(n_features):
+        col = [row[i] for row in X]
+
+        # Calculate correlation with target
+        mean_x = sum(col) / len(col)
+        mean_y = sum(y) / len(y)
+
+        numerator = sum((col[j] - mean_x) * (y[j] - mean_y) for j in range(len(col)))
+
+        std_x = (sum((x - mean_x) ** 2 for x in col) / len(col)) ** 0.5
+        std_y = (sum((y_val - mean_y) ** 2 for y_val in y) / len(y)) ** 0.5
+
+        if std_x == 0 or std_y == 0:
+            mi_score = 0.0
+        else:
+            corr = numerator / (len(col) * std_x * std_y)
+            mi_score = abs(corr)  # Simplified MI approximation
+
+        scores.append(mi_score)
+
+    # Select top k features
+    indexed_scores = [(i, score) for i, score in enumerate(scores)]
+    indexed_scores.sort(key=lambda x: x[1], reverse=True)
+
+    selected_indices = [i for i, _ in indexed_scores[:k]]
+    selected_names = [feature_names[i] for i in selected_indices]
+    selected_scores = [scores[i] for i in selected_indices]
+
+    return selected_indices, selected_names, selected_scores
+
+
+# Create alias
+mi_filter = mutual_information_filter
+
+
+def recursive_feature_elimination(
+    X: List[List[float]],
+    y: List,
+    model_func: Callable,
+    metric_func: Callable,
+    feature_names: Optional[List[str]] = None,
+    n_features_to_select: int = 5
+) -> Tuple[List[int], List[str], List[float]]:
+    """
+    Recursive Feature Elimination (RFE).
+
+    Alias: rfe()
+
+    Args:
+        X: Feature matrix [n_samples, n_features]
+        y: Target values
+        model_func: Function(X_train, y_train, X_test) -> predictions
+        metric_func: Function(y_true, y_pred) -> score
+        feature_names: Optional feature names
+        n_features_to_select: Number of features to keep
+
+    Returns:
+        tuple: (selected_indices, selected_names, scores_history)
+
+    Examples:
+        >>> from ilovetools.ml import rfe  # Short alias
+
+        >>> X = [[1, 2, 3], [2, 4, 6], [3, 6, 9], [4, 8, 12]]
+        >>> y = [1, 2, 3, 4]
+        >>>
+        >>> def model(X_tr, y_tr, X_te):
+        ...     avg = sum(y_tr) / len(y_tr)
+        ...     return [avg] * len(X_te)
+        >>>
+        >>> def metric(y_true, y_pred):
+        ...     return -sum(abs(y_true[i] - y_pred[i]) for i in range(len(y_true)))
+        >>>
+        >>> indices, names, history = rfe(X, y, model, metric, n_features_to_select=2)
+        >>> print(f"Selected: {names}")
+
+        >>> from ilovetools.ml import recursive_feature_elimination  # Full name
+        >>> indices, names, history = recursive_feature_elimination(X, y, model, metric)
+
+    Notes:
+        - Wrapper method (uses model)
+        - Removes worst feature iteratively
+        - Considers feature interactions
+        - Computationally expensive
+    """
+    n_features = len(X[0])
+
+    if feature_names is None:
+        feature_names = [f"feature_{i}" for i in range(n_features)]
+
+    remaining_indices = list(range(n_features))
+    scores_history = []
+
+    while len(remaining_indices) > n_features_to_select:
+        # Evaluate each feature's contribution
+        feature_scores = []
+
+        for idx in remaining_indices:
+            # Create subset without this feature
+            subset_indices = [i for i in remaining_indices if i != idx]
+            X_subset = [[row[i] for i in subset_indices] for row in X]
+
+            # Train and evaluate
+            predictions = model_func(X_subset, y, X_subset)
+            score = metric_func(y, predictions)
+            feature_scores.append((idx, score))
+
+        # Remove feature with worst score
+        worst_idx = min(feature_scores, key=lambda x: x[1])[0]
+        remaining_indices.remove(worst_idx)
+
+        # Record score
+        X_current = [[row[i] for i in remaining_indices] for row in X]
+        predictions = model_func(X_current, y, X_current)
+        current_score = metric_func(y, predictions)
+        scores_history.append(current_score)
+
+    selected_names = [feature_names[i] for i in remaining_indices]
+
+    return remaining_indices, selected_names, scores_history
+
+
+# Create alias
+rfe = recursive_feature_elimination
+
+
+def forward_feature_selection(
+    X: List[List[float]],
+    y: List,
+    model_func: Callable,
+    metric_func: Callable,
+    feature_names: Optional[List[str]] = None,
+    n_features_to_select: int = 5
+) -> Tuple[List[int], List[str], List[float]]:
+    """
+    Forward Feature Selection.
+
+    Alias: forward_select()
+
+    Args:
+        X: Feature matrix [n_samples, n_features]
+        y: Target values
+        model_func: Function(X_train, y_train, X_test) -> predictions
+        metric_func: Function(y_true, y_pred) -> score (higher is better)
+        feature_names: Optional feature names
+        n_features_to_select: Number of features to select
+
+    Returns:
+        tuple: (selected_indices, selected_names, scores_history)
+
+    Examples:
+        >>> from ilovetools.ml import forward_select  # Short alias
+
+        >>> X = [[1, 2, 3], [2, 4, 6], [3, 6, 9], [4, 8, 12]]
+        >>> y = [1, 2, 3, 4]
+        >>>
+        >>> def model(X_tr, y_tr, X_te):
+        ...     avg = sum(y_tr) / len(y_tr)
+        ...     return [avg] * len(X_te)
+        >>>
+        >>> def metric(y_true, y_pred):
+        ...     return -sum(abs(y_true[i] - y_pred[i]) for i in range(len(y_true)))
+        >>>
+        >>> indices, names, history = forward_select(X, y, model, metric, n_features_to_select=2)
+        >>> print(f"Selected: {names}")
+
+        >>> from ilovetools.ml import forward_feature_selection  # Full name
+        >>> indices, names, history = forward_feature_selection(X, y, model, metric)
+
+    Notes:
+        - Wrapper method
+        - Adds best feature iteratively
+        - Greedy approach
+        - Good for small feature sets
+    """
+    n_features = len(X[0])
+
+    if feature_names is None:
+        feature_names = [f"feature_{i}" for i in range(n_features)]
+
+    selected_indices = []
+    remaining_indices = list(range(n_features))
+    scores_history = []
+
+    for _ in range(min(n_features_to_select, n_features)):
+        best_score = float('-inf')
+        best_idx = None
+
+        for idx in remaining_indices:
+            # Try adding this feature
+            trial_indices = selected_indices + [idx]
+            X_subset = [[row[i] for i in trial_indices] for row in X]
+
+            # Evaluate
+            predictions = model_func(X_subset, y, X_subset)
+            score = metric_func(y, predictions)
+
+            if score > best_score:
+                best_score = score
+                best_idx = idx
+
+        if best_idx is not None:
+            selected_indices.append(best_idx)
+            remaining_indices.remove(best_idx)
+            scores_history.append(best_score)
+
+    selected_names = [feature_names[i] for i in selected_indices]
+
+    return selected_indices, selected_names, scores_history
+
+
+# Create alias
+forward_select = forward_feature_selection
+
+
+def backward_feature_elimination(
+    X: List[List[float]],
+    y: List,
+    model_func: Callable,
+    metric_func: Callable,
+    feature_names: Optional[List[str]] = None,
+    n_features_to_select: int = 5
+) -> Tuple[List[int], List[str], List[float]]:
+    """
+    Backward Feature Elimination.
+
+    Alias: backward_select()
+
+    Similar to RFE but evaluates full model each iteration.
+
+    Args:
+        X: Feature matrix [n_samples, n_features]
+        y: Target values
+        model_func: Function(X_train, y_train, X_test) -> predictions
+        metric_func: Function(y_true, y_pred) -> score
+        feature_names: Optional feature names
+        n_features_to_select: Number of features to keep
+
+    Returns:
+        tuple: (selected_indices, selected_names, scores_history)
+
+    Examples:
+        >>> from ilovetools.ml import backward_select  # Short alias
+
+        >>> X = [[1, 2, 3], [2, 4, 6], [3, 6, 9], [4, 8, 12]]
+        >>> y = [1, 2, 3, 4]
+        >>>
+        >>> def model(X_tr, y_tr, X_te):
+        ...     avg = sum(y_tr) / len(y_tr)
+        ...     return [avg] * len(X_te)
+        >>>
+        >>> def metric(y_true, y_pred):
+        ...     return -sum(abs(y_true[i] - y_pred[i]) for i in range(len(y_true)))
+        >>>
+        >>> indices, names, history = backward_select(X, y, model, metric, n_features_to_select=2)
+
+        >>> from ilovetools.ml import backward_feature_elimination  # Full name
+        >>> indices, names, history = backward_feature_elimination(X, y, model, metric)
+
+    Notes:
+        - Wrapper method
+        - Starts with all features
+        - Removes least important
+        - More thorough than RFE
+    """
+    # Same implementation as RFE for simplicity
+    return recursive_feature_elimination(
+        X, y, model_func, metric_func, feature_names, n_features_to_select
+    )
+
+
+# Create alias
+backward_select = backward_feature_elimination
+
+
+def feature_importance_ranking(
+    importances: List[float],
+    feature_names: Optional[List[str]] = None,
+    k: Optional[int] = None
+) -> Tuple[List[int], List[str], List[float]]:
+    """
+    Rank features by importance scores.
+
+    Alias: feat_importance()
+
+    Args:
+        importances: Feature importance scores
+        feature_names: Optional feature names
+        k: Number of top features to select (None = all)
+
+    Returns:
+        tuple: (selected_indices, selected_names, selected_scores)
+
+    Examples:
+        >>> from ilovetools.ml import feat_importance  # Short alias
+
+        >>> importances = [0.1, 0.5, 0.3, 0.8, 0.2]
+        >>> feature_names = ['A', 'B', 'C', 'D', 'E']
+        >>>
+        >>> indices, names, scores = feat_importance(importances, feature_names, k=3)
+        >>> print(f"Top 3: {names}")
+        Top 3: ['D', 'B', 'C']
+        >>> print(f"Scores: {scores}")
+        [0.8, 0.5, 0.3]
+
+        >>> from ilovetools.ml import feature_importance_ranking  # Full name
+        >>> indices, names, scores = feature_importance_ranking(importances, feature_names)
+
+    Notes:
+        - Works with any importance scores
+        - Random Forest, XGBoost, etc.
+        - Simple and effective
+        - Use after training
+    """
+    n_features = len(importances)
+
+    if feature_names is None:
+        feature_names = [f"feature_{i}" for i in range(n_features)]
+
+    # Sort by importance
+    indexed_importances = [(i, imp) for i, imp in enumerate(importances)]
+    indexed_importances.sort(key=lambda x: x[1], reverse=True)
+
+    if k is None:
+        k = n_features
+
+    selected_indices = [i for i, _ in indexed_importances[:k]]
+    selected_names = [feature_names[i] for i in selected_indices]
+    selected_scores = [importances[i] for i in selected_indices]
+
+    return selected_indices, selected_names, selected_scores
+
+
+# Create alias
+feat_importance = feature_importance_ranking
+
+
+def l1_feature_selection(
+    X: List[List[float]],
+    y: List[float],
+    feature_names: Optional[List[str]] = None,
+    alpha: float = 0.1
+) -> Tuple[List[int], List[str], List[float]]:
+    """
+    L1 regularization for feature selection (Lasso).
+
+    Alias: l1_select()
+
+    Args:
+        X: Feature matrix [n_samples, n_features]
+        y: Target values
+        feature_names: Optional feature names
+        alpha: Regularization strength (higher = more sparse)
+
+    Returns:
+        tuple: (selected_indices, selected_names, coefficients)
+
+    Examples:
+        >>> from ilovetools.ml import l1_select  # Short alias
+
+        >>> X = [[1, 2, 3], [2, 4, 6], [3, 6, 9], [4, 8, 12]]
+        >>> y = [1.0, 2.0, 3.0, 4.0]
+        >>> feature_names = ['A', 'B', 'C']
+        >>>
+        >>> indices, names, coefs = l1_select(X, y, feature_names, alpha=0.1)
+        >>> print(f"Selected: {names}")
+        >>> print(f"Coefficients: {[f'{c:.2f}' for c in coefs]}")
+
+        >>> from ilovetools.ml import l1_feature_selection  # Full name
+        >>> indices, names, coefs = l1_feature_selection(X, y, feature_names)
+
+    Notes:
+        - Embedded method
+        - Shrinks coefficients to zero
+        - Automatic feature selection
+        - Higher alpha = fewer features
+    """
+    n_features = len(X[0])
+
+    if feature_names is None:
+        feature_names = [f"feature_{i}" for i in range(n_features)]
+
+    # Simple L1 approximation using correlation-based weights
+    coefficients = []
+
+    for i in range(n_features):
+        col = [row[i] for row in X]
+
+        # Calculate correlation with target
+        mean_x = sum(col) / len(col)
+        mean_y = sum(y) / len(y)
+
+        numerator = sum((col[j] - mean_x) * (y[j] - mean_y) for j in range(len(col)))
+
+        std_x = (sum((x - mean_x) ** 2 for x in col) / len(col)) ** 0.5
+        std_y = (sum((y_val - mean_y) ** 2 for y_val in y) / len(y)) ** 0.5
+
+        if std_x == 0 or std_y == 0:
+            coef = 0.0
+        else:
+            corr = numerator / (len(col) * std_x * std_y)
+            # Apply soft thresholding (L1 penalty)
+            if abs(corr) > alpha:
+                coef = corr - alpha * (1 if corr > 0 else -1)
+            else:
+                coef = 0.0
+
+        coefficients.append(coef)
+
+    # Select non-zero coefficients
+    selected_indices = [i for i, coef in enumerate(coefficients) if abs(coef) > 1e-10]
+    selected_names = [feature_names[i] for i in selected_indices]
+    selected_coefs = [coefficients[i] for i in selected_indices]
+
+    return selected_indices, selected_names, selected_coefs
+
+
+# Create alias
+l1_select = l1_feature_selection
+
+
+def univariate_feature_selection(
+    X: List[List[float]],
+    y: List,
+    feature_names: Optional[List[str]] = None,
+    method: str = 'correlation',
+    k: int = 10
+) -> Tuple[List[int], List[str], List[float]]:
+    """
+    Univariate feature selection.
+
+    Alias: univariate_select()
+
+    Args:
+        X: Feature matrix [n_samples, n_features]
+        y: Target values
+        feature_names: Optional feature names
+        method: 'correlation', 'variance', or 'mutual_info'
+        k: Number of features to select
+
+    Returns:
+        tuple: (selected_indices, selected_names, scores)
+
+    Examples:
+        >>> from ilovetools.ml import univariate_select  # Short alias
+
+        >>> X = [[1, 2, 3], [2, 4, 6], [3, 6, 9], [4, 8, 12]]
+        >>> y = [1, 2, 3, 4]
+        >>> feature_names = ['A', 'B', 'C']
+        >>>
+        >>> indices, names, scores = univariate_select(X, y, feature_names, method='correlation', k=2)
+        >>> print(f"Selected: {names}")
+
+        >>> from ilovetools.ml import univariate_feature_selection  # Full name
+        >>> indices, names, scores = univariate_feature_selection(X, y, feature_names)
+
+    Notes:
+        - Tests each feature independently
+        - Fast filter method
+        - Ignores feature interactions
+        - Good starting point
+    """
+    if method == 'correlation' or method == 'mutual_info':
+        return mutual_information_filter(X, y, feature_names, k)
+    elif method == 'variance':
+        indices, names = variance_threshold_filter(X, feature_names, threshold=0.0)
+        scores = [1.0] * len(indices)  # Dummy scores
+        return indices[:k], names[:k], scores[:k]
+    else:
+        raise ValueError(f"Unknown method: {method}")
+
+
+# Create alias
+univariate_select = univariate_feature_selection
+
+
+def select_k_best_features(
+    X: List[List[float]],
+    y: List,
+    feature_names: Optional[List[str]] = None,
+    k: int = 10,
+    method: str = 'auto'
+) -> Tuple[List[int], List[str]]:
+    """
+    Select k best features automatically.
+
+    Alias: select_k_best()
+
+    Args:
+        X: Feature matrix [n_samples, n_features]
+        y: Target values
+        feature_names: Optional feature names
+        k: Number of features to select
+        method: 'auto', 'correlation', 'chi2', or 'mutual_info'
+
+    Returns:
+        tuple: (selected_indices, selected_names)
+
+    Examples:
+        >>> from ilovetools.ml import select_k_best  # Short alias
+
+        >>> X = [[1, 2, 3, 4], [2, 4, 6, 8], [3, 6, 9, 12], [4, 8, 12, 16]]
+        >>> y = [1, 2, 3, 4]
+        >>> feature_names = ['A', 'B', 'C', 'D']
+        >>>
+        >>> indices, names = select_k_best(X, y, feature_names, k=2)
+        >>> print(f"Selected: {names}")
+
+        >>> from ilovetools.ml import select_k_best_features  # Full name
+        >>> indices, names = select_k_best_features(X, y, feature_names)
+
+    Notes:
+        - Automatic method selection
+        - Fast and simple
+        - Good default choice
+        - Use for quick feature reduction
+    """
+    if method == 'auto':
+        # Check if y is categorical (for chi2) or continuous
+        unique_y = len(set(y))
+        if unique_y <= 10:  # Likely categorical
+            method = 'chi2'
+        else:
+            method = 'mutual_info'
+
+    if method == 'chi2':
+        indices, names, _ = chi_square_filter(X, y, feature_names, k)
+    elif method == 'mutual_info' or method == 'correlation':
+        indices, names, _ = mutual_information_filter(X, y, feature_names, k)
+    else:
+        raise ValueError(f"Unknown method: {method}")
+
+    return indices, names
+
+
+# Create alias
+select_k_best = select_k_best_features
+
+
+def remove_correlated_features(
+    X: List[List[float]],
+    feature_names: Optional[List[str]] = None,
+    threshold: float = 0.95
+) -> Tuple[List[int], List[str], List[Tuple[str, str, float]]]:
+    """
+    Remove highly correlated features and return correlation pairs.
+
+    Alias: remove_corr()
+
+    Args:
+        X: Feature matrix [n_samples, n_features]
+        feature_names: Optional feature names
+        threshold: Correlation threshold (default: 0.95)
+
+    Returns:
+        tuple: (selected_indices, selected_names, removed_pairs)
+
+    Examples:
+        >>> from ilovetools.ml import remove_corr  # Short alias
+
+        >>> X = [
+        ...     [1, 2, 2.05],
+        ...     [2, 4, 4.1],
+        ...     [3, 6, 6.15],
+        ...     [4, 8, 8.2]
+        ... ]
+        >>> feature_names = ['A', 'B', 'C']
+        >>>
+        >>> indices, names, pairs = remove_corr(X, feature_names, threshold=0.95)
+        >>> print(f"Kept: {names}")
+        >>> print(f"Removed pairs: {[(p[0], p[1], f'{p[2]:.2f}') for p in pairs]}")
+
+        >>> from ilovetools.ml import remove_correlated_features  # Full name
+        >>> indices, names, pairs = remove_correlated_features(X, feature_names)
+
+    Notes:
+        - Returns which features were correlated
+        - Helps understand redundancy
+        - Use before training
+        - Threshold 0.95 is common
+    """
+    indices, names = correlation_filter(X, feature_names, threshold)
+
+    # Find removed pairs
+    n_features = len(X[0])
+    if feature_names is None:
+        feature_names = [f"feature_{i}" for i in range(n_features)]
+
+    removed_pairs = []
+    removed_indices = set(range(n_features)) - set(indices)
+
+    # Calculate correlations for removed features
+    for removed_idx in removed_indices:
+        col_removed = [row[removed_idx] for row in X]
+
+        for kept_idx in indices:
+            col_kept = [row[kept_idx] for row in X]
+
+            # Calculate correlation
+            mean_r = sum(col_removed) / len(col_removed)
+            mean_k = sum(col_kept) / len(col_kept)
+
+            numerator = sum((col_removed[i] - mean_r) * (col_kept[i] - mean_k)
+                            for i in range(len(col_removed)))
+
+            std_r = (sum((x - mean_r) ** 2 for x in col_removed) / len(col_removed)) ** 0.5
+            std_k = (sum((x - mean_k) ** 2 for x in col_kept) / len(col_kept)) ** 0.5
+
+            if std_r > 0 and std_k > 0:
+                corr = numerator / (len(col_removed) * std_r * std_k)
+                if abs(corr) > threshold:
+                    removed_pairs.append((
+                        feature_names[kept_idx],
+                        feature_names[removed_idx],
+                        abs(corr)
+                    ))
+                    break
+
+    return indices, names, removed_pairs
+
+
+# Create alias
+remove_corr = remove_correlated_features
{ilovetools-0.1.6 → ilovetools-0.1.7/ilovetools.egg-info}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ilovetools
-Version: 0.1.6
+Version: 0.1.7
 Summary: A comprehensive Python utility library with modular tools for AI/ML, data processing, and daily programming needs
 Home-page: https://github.com/AliMehdi512/ilovetools
 Author: Ali Mehdi
{ilovetools-0.1.6 → ilovetools-0.1.7}/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "ilovetools"
-version = "0.1.6"
+version = "0.1.7"
 description = "A comprehensive Python utility library with modular tools for AI/ML, data processing, and daily programming needs"
 readme = "README.md"
 requires-python = ">=3.8"
{ilovetools-0.1.6 → ilovetools-0.1.7}/setup.py
@@ -5,7 +5,7 @@ with open("README.md", "r", encoding="utf-8") as fh:
 
 setup(
     name="ilovetools",
-    version="0.1.
+    version="0.1.6",
     author="Ali Mehdi",
     author_email="ali.mehdi.dev579@gmail.com",
     description="A comprehensive Python utility library with modular tools for AI/ML, data processing, and daily programming needs",