ilovetools 0.1.9__tar.gz → 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ilovetools-0.1.9/ilovetools.egg-info → ilovetools-0.2.0}/PKG-INFO +1 -1
- {ilovetools-0.1.9 → ilovetools-0.2.0}/ilovetools/__init__.py +1 -1
- {ilovetools-0.1.9 → ilovetools-0.2.0}/ilovetools/ml/__init__.py +55 -0
- ilovetools-0.2.0/ilovetools/ml/imbalanced.py +797 -0
- {ilovetools-0.1.9 → ilovetools-0.2.0/ilovetools.egg-info}/PKG-INFO +1 -1
- {ilovetools-0.1.9 → ilovetools-0.2.0}/ilovetools.egg-info/SOURCES.txt +1 -0
- {ilovetools-0.1.9 → ilovetools-0.2.0}/pyproject.toml +1 -1
- {ilovetools-0.1.9 → ilovetools-0.2.0}/setup.py +1 -1
- {ilovetools-0.1.9 → ilovetools-0.2.0}/LICENSE +0 -0
- {ilovetools-0.1.9 → ilovetools-0.2.0}/MANIFEST.in +0 -0
- {ilovetools-0.1.9 → ilovetools-0.2.0}/README.md +0 -0
- {ilovetools-0.1.9 → ilovetools-0.2.0}/ilovetools/ai/__init__.py +0 -0
- {ilovetools-0.1.9 → ilovetools-0.2.0}/ilovetools/ai/embeddings.py +0 -0
- {ilovetools-0.1.9 → ilovetools-0.2.0}/ilovetools/ai/inference.py +0 -0
- {ilovetools-0.1.9 → ilovetools-0.2.0}/ilovetools/ai/llm_helpers.py +0 -0
- {ilovetools-0.1.9 → ilovetools-0.2.0}/ilovetools/audio/__init__.py +0 -0
- {ilovetools-0.1.9 → ilovetools-0.2.0}/ilovetools/automation/__init__.py +0 -0
- {ilovetools-0.1.9 → ilovetools-0.2.0}/ilovetools/conversion/__init__.py +0 -0
- {ilovetools-0.1.9 → ilovetools-0.2.0}/ilovetools/data/__init__.py +0 -0
- {ilovetools-0.1.9 → ilovetools-0.2.0}/ilovetools/data/feature_engineering.py +0 -0
- {ilovetools-0.1.9 → ilovetools-0.2.0}/ilovetools/data/preprocessing.py +0 -0
- {ilovetools-0.1.9 → ilovetools-0.2.0}/ilovetools/database/__init__.py +0 -0
- {ilovetools-0.1.9 → ilovetools-0.2.0}/ilovetools/datetime/__init__.py +0 -0
- {ilovetools-0.1.9 → ilovetools-0.2.0}/ilovetools/files/__init__.py +0 -0
- {ilovetools-0.1.9 → ilovetools-0.2.0}/ilovetools/image/__init__.py +0 -0
- {ilovetools-0.1.9 → ilovetools-0.2.0}/ilovetools/ml/cross_validation.py +0 -0
- {ilovetools-0.1.9 → ilovetools-0.2.0}/ilovetools/ml/ensemble.py +0 -0
- {ilovetools-0.1.9 → ilovetools-0.2.0}/ilovetools/ml/feature_selection.py +0 -0
- {ilovetools-0.1.9 → ilovetools-0.2.0}/ilovetools/ml/interpretation.py +0 -0
- {ilovetools-0.1.9 → ilovetools-0.2.0}/ilovetools/ml/metrics.py +0 -0
- {ilovetools-0.1.9 → ilovetools-0.2.0}/ilovetools/ml/pipeline.py +0 -0
- {ilovetools-0.1.9 → ilovetools-0.2.0}/ilovetools/ml/tuning.py +0 -0
- {ilovetools-0.1.9 → ilovetools-0.2.0}/ilovetools/security/__init__.py +0 -0
- {ilovetools-0.1.9 → ilovetools-0.2.0}/ilovetools/text/__init__.py +0 -0
- {ilovetools-0.1.9 → ilovetools-0.2.0}/ilovetools/utils/__init__.py +0 -0
- {ilovetools-0.1.9 → ilovetools-0.2.0}/ilovetools/validation/__init__.py +0 -0
- {ilovetools-0.1.9 → ilovetools-0.2.0}/ilovetools/web/__init__.py +0 -0
- {ilovetools-0.1.9 → ilovetools-0.2.0}/ilovetools.egg-info/dependency_links.txt +0 -0
- {ilovetools-0.1.9 → ilovetools-0.2.0}/ilovetools.egg-info/top_level.txt +0 -0
- {ilovetools-0.1.9 → ilovetools-0.2.0}/requirements.txt +0 -0
- {ilovetools-0.1.9 → ilovetools-0.2.0}/setup.cfg +0 -0
- {ilovetools-0.1.9 → ilovetools-0.2.0}/tests/__init__.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: ilovetools
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 0.2.0
|
|
4
4
|
Summary: A comprehensive Python utility library with modular tools for AI/ML, data processing, and daily programming needs
|
|
5
5
|
Home-page: https://github.com/AliMehdi512/ilovetools
|
|
6
6
|
Author: Ali Mehdi
|
|
@@ -183,6 +183,35 @@ from .pipeline import (
|
|
|
183
183
|
pipe_summary,
|
|
184
184
|
)
|
|
185
185
|
|
|
186
|
+
from .imbalanced import (
|
|
187
|
+
# Full names
|
|
188
|
+
random_oversampling,
|
|
189
|
+
random_undersampling,
|
|
190
|
+
smote_oversampling,
|
|
191
|
+
tomek_links_undersampling,
|
|
192
|
+
class_weight_calculator,
|
|
193
|
+
stratified_sampling,
|
|
194
|
+
compute_class_distribution,
|
|
195
|
+
balance_dataset,
|
|
196
|
+
minority_class_identifier,
|
|
197
|
+
imbalance_ratio,
|
|
198
|
+
synthetic_sample_generator,
|
|
199
|
+
near_miss_undersampling,
|
|
200
|
+
# Abbreviated aliases
|
|
201
|
+
random_oversample,
|
|
202
|
+
random_undersample,
|
|
203
|
+
smote,
|
|
204
|
+
tomek_links,
|
|
205
|
+
class_weights,
|
|
206
|
+
stratified_sample,
|
|
207
|
+
class_dist,
|
|
208
|
+
balance_data,
|
|
209
|
+
minority_class,
|
|
210
|
+
imbalance_ratio_alias,
|
|
211
|
+
synthetic_sample,
|
|
212
|
+
near_miss,
|
|
213
|
+
)
|
|
214
|
+
|
|
186
215
|
__all__ = [
|
|
187
216
|
# Metrics (full names)
|
|
188
217
|
'accuracy_score',
|
|
@@ -344,4 +373,30 @@ __all__ = [
|
|
|
344
373
|
'set_params',
|
|
345
374
|
'clone_pipe',
|
|
346
375
|
'pipe_summary',
|
|
376
|
+
# Imbalanced (full names)
|
|
377
|
+
'random_oversampling',
|
|
378
|
+
'random_undersampling',
|
|
379
|
+
'smote_oversampling',
|
|
380
|
+
'tomek_links_undersampling',
|
|
381
|
+
'class_weight_calculator',
|
|
382
|
+
'stratified_sampling',
|
|
383
|
+
'compute_class_distribution',
|
|
384
|
+
'balance_dataset',
|
|
385
|
+
'minority_class_identifier',
|
|
386
|
+
'imbalance_ratio',
|
|
387
|
+
'synthetic_sample_generator',
|
|
388
|
+
'near_miss_undersampling',
|
|
389
|
+
# Imbalanced (aliases)
|
|
390
|
+
'random_oversample',
|
|
391
|
+
'random_undersample',
|
|
392
|
+
'smote',
|
|
393
|
+
'tomek_links',
|
|
394
|
+
'class_weights',
|
|
395
|
+
'stratified_sample',
|
|
396
|
+
'class_dist',
|
|
397
|
+
'balance_data',
|
|
398
|
+
'minority_class',
|
|
399
|
+
'imbalance_ratio_alias',
|
|
400
|
+
'synthetic_sample',
|
|
401
|
+
'near_miss',
|
|
347
402
|
]
|
|
@@ -0,0 +1,797 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Imbalanced data handling utilities
|
|
3
|
+
Each function has TWO names: full descriptive name + abbreviated alias
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from typing import List, Dict, Any, Tuple, Optional
|
|
7
|
+
import random
|
|
8
|
+
|
|
9
|
+
__all__ = [
|
|
10
|
+
# Full names
|
|
11
|
+
'random_oversampling',
|
|
12
|
+
'random_undersampling',
|
|
13
|
+
'smote_oversampling',
|
|
14
|
+
'tomek_links_undersampling',
|
|
15
|
+
'class_weight_calculator',
|
|
16
|
+
'stratified_sampling',
|
|
17
|
+
'compute_class_distribution',
|
|
18
|
+
'balance_dataset',
|
|
19
|
+
'minority_class_identifier',
|
|
20
|
+
'imbalance_ratio',
|
|
21
|
+
'synthetic_sample_generator',
|
|
22
|
+
'near_miss_undersampling',
|
|
23
|
+
# Abbreviated aliases
|
|
24
|
+
'random_oversample',
|
|
25
|
+
'random_undersample',
|
|
26
|
+
'smote',
|
|
27
|
+
'tomek_links',
|
|
28
|
+
'class_weights',
|
|
29
|
+
'stratified_sample',
|
|
30
|
+
'class_dist',
|
|
31
|
+
'balance_data',
|
|
32
|
+
'minority_class',
|
|
33
|
+
'imbalance_ratio_alias',
|
|
34
|
+
'synthetic_sample',
|
|
35
|
+
'near_miss',
|
|
36
|
+
]
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def random_oversampling(
    X: List[List[float]],
    y: List[int],
    target_ratio: float = 1.0
) -> Tuple[List[List[float]], List[int]]:
    """
    Randomly oversample under-represented classes by duplicating samples.

    Alias: random_oversample()

    Args:
        X: Feature data
        y: Labels
        target_ratio: Desired minority/majority ratio (1.0 = balanced)

    Returns:
        tuple: (X_resampled, y_resampled)

    Examples:
        >>> X = [[1, 2], [2, 3], [3, 4], [4, 5], [5, 6]]
        >>> y = [0, 0, 0, 0, 1]
        >>> X_res, y_res = random_oversampling(X, y, target_ratio=1.0)
        >>> y_res.count(0), y_res.count(1)
        (4, 4)

    Notes:
        - Duplicates existing minority samples; no synthetic points
        - Simple and fast, but duplicates can encourage overfitting
    """
    # Group sample positions by class label (insertion order preserved).
    by_label: Dict[int, List[int]] = {}
    for position, label in enumerate(y):
        by_label.setdefault(label, []).append(position)

    # Every class is topped up toward target_ratio of the largest class.
    largest = max(len(members) for members in by_label.values())
    desired = int(largest * target_ratio)

    X_out: List[List[float]] = []
    y_out: List[int] = []

    for label, members in by_label.items():
        # Keep every original sample of this class.
        X_out.extend(X[pos] for pos in members)
        y_out.extend(y[pos] for pos in members)

        # Duplicate random members until the class reaches the target size.
        shortfall = desired - len(members)
        for _ in range(max(0, shortfall)):
            pick = random.choice(members)
            X_out.append(X[pick])
            y_out.append(y[pick])

    return X_out, y_out


# Create alias
random_oversample = random_oversampling
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def random_undersampling(
    X: List[List[float]],
    y: List[int],
    target_ratio: float = 1.0
) -> Tuple[List[List[float]], List[int]]:
    """
    Randomly drop samples from over-represented classes.

    Alias: random_undersample()

    Args:
        X: Feature data
        y: Labels
        target_ratio: Desired minority/majority ratio (1.0 = balanced)

    Returns:
        tuple: (X_resampled, y_resampled)

    Examples:
        >>> X = [[1, 2], [2, 3], [3, 4], [4, 5], [5, 6]]
        >>> y = [0, 0, 0, 0, 1]
        >>> X_res, y_res = random_undersampling(X, y, target_ratio=1.0)
        >>> y_res.count(0), y_res.count(1)
        (1, 1)

    Notes:
        - Discards information from the majority class
        - Speeds up training; best suited to large datasets
    """
    # Group sample positions by class label.
    by_label: Dict[int, List[int]] = {}
    for position, label in enumerate(y):
        by_label.setdefault(label, []).append(position)

    # Every class is capped relative to the smallest class.
    smallest = min(len(members) for members in by_label.values())
    cap = int(smallest / target_ratio)

    X_out: List[List[float]] = []
    y_out: List[int] = []

    for label, members in by_label.items():
        # Sample without replacement when the class exceeds the cap.
        kept = random.sample(members, cap) if len(members) > cap else members
        for pos in kept:
            X_out.append(X[pos])
            y_out.append(y[pos])

    return X_out, y_out


# Create alias
random_undersample = random_undersampling
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
def smote_oversampling(
    X: List[List[float]],
    y: List[int],
    k_neighbors: int = 5,
    target_ratio: float = 1.0
) -> Tuple[List[List[float]], List[int]]:
    """
    SMOTE (Synthetic Minority Over-sampling Technique).

    Alias: smote()

    Generates synthetic minority samples by linear interpolation between a
    random minority sample and one of its k nearest same-class neighbors.
    (Fix: previous implementation picked "neighbors" uniformly at random,
    so k_neighbors had no effect; neighbors are now chosen by Euclidean
    distance, matching the SMOTE algorithm.)

    Args:
        X: Feature data
        y: Labels
        k_neighbors: Number of nearest same-class neighbors to draw from
        target_ratio: Desired minority/majority ratio (1.0 = balanced)

    Returns:
        tuple: (X_resampled, y_resampled) — originals first, synthetics appended

    Examples:
        >>> X = [[1, 2], [2, 3], [3, 4], [4, 5], [5, 6]]
        >>> y = [0, 0, 0, 0, 1]
        >>> X_res, y_res = smote_oversampling(X, y, k_neighbors=2)
        >>> len(y_res) > len(y)
        True

    Notes:
        - Synthetic points lie on segments between real samples
        - Reduces the overfitting risk of plain duplication
        - A class with a single sample is duplicated (no neighbor exists)
    """
    def _sq_dist(a: List[float], b: List[float]) -> float:
        # Squared Euclidean distance; sqrt is unnecessary for ranking.
        return sum((u - v) ** 2 for u, v in zip(a, b))

    # Group sample indices by class label.
    class_indices: Dict[int, List[int]] = {}
    for idx, label in enumerate(y):
        class_indices.setdefault(label, []).append(idx)

    # Target count per class, relative to the largest class.
    max_size = max(len(indices) for indices in class_indices.values())
    target_size = int(max_size * target_ratio)

    X_resampled = list(X)
    y_resampled = list(y)

    for label, indices in class_indices.items():
        n_needed = target_size - len(indices)
        if n_needed <= 0:
            continue
        for _ in range(n_needed):
            idx = random.choice(indices)
            sample = X[idx]

            others = [j for j in indices if j != idx]
            if not others:
                # Single-sample class: nothing to interpolate with.
                X_resampled.append(list(sample))
                y_resampled.append(label)
                continue

            # True k nearest neighbors within the same class.
            others.sort(key=lambda j: _sq_dist(sample, X[j]))
            neighbor = X[random.choice(others[:max(1, k_neighbors)])]

            # Linear interpolation between sample and neighbor.
            alpha = random.random()
            synthetic = [s + alpha * (n - s) for s, n in zip(sample, neighbor)]

            X_resampled.append(synthetic)
            y_resampled.append(label)

    return X_resampled, y_resampled


# Create alias
smote = smote_oversampling
|
|
266
|
+
|
|
267
|
+
|
|
268
|
+
def tomek_links_undersampling(
    X: List[List[float]],
    y: List[int]
) -> Tuple[List[List[float]], List[int]]:
    """
    Remove Tomek links (borderline samples).

    Alias: tomek_links()

    Args:
        X: Feature data
        y: Labels

    Returns:
        tuple: (X_cleaned, y_cleaned)

    Examples:
        >>> from ilovetools.ml import tomek_links  # Short alias

        >>> X = [[1, 2], [2, 3], [3, 4], [4, 5]]
        >>> y = [0, 0, 1, 1]
        >>>
        >>> X_clean, y_clean = tomek_links(X, y)
        >>> print(len(X_clean) <= len(X))
        True

        >>> from ilovetools.ml import tomek_links_undersampling  # Full name
        >>> X_clean, y_clean = tomek_links_undersampling(X, y)

    Notes:
        - Removes noisy samples
        - Cleans decision boundary
        - Often combined with SMOTE
        - Improves model performance
        - O(n^2) distance computations per sample (O(n^3) worst case overall);
          intended for modest dataset sizes
        - NOTE(review): simplified variant — the nearest neighbor is searched
          only among opposite-class points, whereas a strict Tomek link requires
          the pair to be mutual nearest neighbors over ALL points; confirm this
          relaxation is intended
    """
    def euclidean_distance(p1, p2):
        # Plain Euclidean distance between two equal-length vectors.
        return sum((a - b) ** 2 for a, b in zip(p1, p2)) ** 0.5

    # Find Tomek links (simplified version)
    # Indices slated for removal; a set so a pair found from both ends is
    # only removed once.
    tomek_indices = set()

    for i in range(len(X)):
        # Find nearest neighbor with different class
        min_dist = float('inf')
        nearest_idx = -1

        for j in range(len(X)):
            if i != j and y[i] != y[j]:
                dist = euclidean_distance(X[i], X[j])
                if dist < min_dist:
                    min_dist = dist
                    nearest_idx = j

        if nearest_idx != -1:
            # Check if they are each other's nearest neighbors
            # (i.e. no third point sits closer to nearest_idx than i does).
            is_tomek = True
            for k in range(len(X)):
                if k != i and k != nearest_idx:
                    if euclidean_distance(X[nearest_idx], X[k]) < min_dist:
                        is_tomek = False
                        break

            if is_tomek:
                # Remove majority class sample
                # Tie in class counts removes the neighbor (the else branch).
                if sum(1 for label in y if label == y[i]) > sum(1 for label in y if label == y[nearest_idx]):
                    tomek_indices.add(i)
                else:
                    tomek_indices.add(nearest_idx)

    # Remove Tomek links
    X_cleaned = [X[i] for i in range(len(X)) if i not in tomek_indices]
    y_cleaned = [y[i] for i in range(len(y)) if i not in tomek_indices]

    return X_cleaned, y_cleaned


# Create alias
tomek_links = tomek_links_undersampling
|
|
346
|
+
|
|
347
|
+
|
|
348
|
+
def class_weight_calculator(y: List[int]) -> Dict[int, float]:
    """
    Calculate per-class weights for imbalanced data.

    Alias: class_weights()

    Weight of a class is n_samples / (n_classes * class_count), i.e. the
    inverse of its relative frequency — rarer classes get larger weights.

    Args:
        y: Labels

    Returns:
        dict: Mapping from class label to weight

    Examples:
        >>> weights = class_weight_calculator([0, 0, 0, 0, 1])
        >>> weights[0] < weights[1]
        True

    Notes:
        - Same formula as sklearn's 'balanced' class weights
        - Feed into a model's class_weight / sample-weight options
    """
    # Tally how often each label occurs.
    frequency: Dict[int, int] = {}
    for label in y:
        frequency[label] = frequency.get(label, 0) + 1

    total = len(y)
    n_distinct = len(frequency)

    # Inverse-frequency weighting.
    return {
        label: total / (n_distinct * seen)
        for label, seen in frequency.items()
    }


# Create alias
class_weights = class_weight_calculator
|
|
395
|
+
|
|
396
|
+
|
|
397
|
+
def stratified_sampling(
    X: List[List[float]],
    y: List[int],
    sample_size: int
) -> Tuple[List[List[float]], List[int]]:
    """
    Stratified sampling maintaining class distribution.

    Alias: stratified_sample()

    Args:
        X: Feature data
        y: Labels
        sample_size: Number of samples to draw

    Returns:
        tuple: (X_sample, y_sample)

    Examples:
        >>> X = [[1, 2], [2, 3], [3, 4], [4, 5], [5, 6]]
        >>> y = [0, 0, 0, 1, 1]
        >>> X_sample, y_sample = stratified_sampling(X, y, 3)
        >>> len(X_sample)
        3

    Notes:
        - Maintains class proportions as closely as integer counts allow
        - Fix: per-class counts were truncated with int(), so the result
          could be smaller than sample_size (e.g. 2 instead of 3 above);
          leftover slots are now given to the classes with the largest
          fractional entitlement
        - If sample_size exceeds len(y), each class is capped at its size
    """
    # Group sample indices by class label.
    class_indices: Dict[int, List[int]] = {}
    for idx, label in enumerate(y):
        class_indices.setdefault(label, []).append(idx)

    total = len(y)

    # Floor allocation per class, remembering fractional remainders.
    allocations: Dict[int, int] = {}
    remainders: List[Tuple[float, int]] = []
    for label, indices in class_indices.items():
        exact = sample_size * len(indices) / total
        base = int(exact)
        allocations[label] = base
        remainders.append((exact - base, label))

    # Hand leftover slots to the classes with the largest fractional parts
    # so the totals add up to sample_size.
    shortfall = sample_size - sum(allocations.values())
    remainders.sort(key=lambda pair: -pair[0])
    for _, label in remainders[:shortfall]:
        if allocations[label] < len(class_indices[label]):
            allocations[label] += 1

    X_sample: List[List[float]] = []
    y_sample: List[int] = []

    for label, indices in class_indices.items():
        # Cap at the class size; sample without replacement.
        n_samples = min(allocations[label], len(indices))
        if n_samples > 0:
            for idx in random.sample(indices, n_samples):
                X_sample.append(X[idx])
                y_sample.append(y[idx])

    return X_sample, y_sample


# Create alias
stratified_sample = stratified_sampling
|
|
465
|
+
|
|
466
|
+
|
|
467
|
+
def compute_class_distribution(y: List[int]) -> Dict[str, Any]:
    """
    Compute class distribution statistics for a label vector.

    Alias: class_dist()

    Args:
        y: Labels

    Returns:
        dict with keys:
            'counts': samples per class
            'proportions': fraction of samples per class
            'majority_class' / 'minority_class': most / least frequent label
            'imbalance_ratio': majority count / minority count
            'total_samples': len(y)

    Examples:
        >>> dist = compute_class_distribution([0, 0, 0, 0, 1])
        >>> dist['counts']
        {0: 4, 1: 1}
        >>> dist['imbalance_ratio']
        4.0

    Notes:
        - A quick first look before choosing a resampling strategy
    """
    # Tally occurrences per label.
    tally: Dict[int, int] = {}
    for label in y:
        tally[label] = tally.get(label, 0) + 1

    n_total = len(y)
    fractions = {label: seen / n_total for label, seen in tally.items()}

    # Most and least frequent labels (ties resolved by first occurrence).
    biggest = max(tally, key=tally.get)
    smallest = min(tally, key=tally.get)

    return {
        'counts': tally,
        'proportions': fractions,
        'majority_class': biggest,
        'minority_class': smallest,
        'imbalance_ratio': tally[biggest] / tally[smallest],
        'total_samples': n_total,
    }


# Create alias
class_dist = compute_class_distribution
|
|
529
|
+
|
|
530
|
+
|
|
531
|
+
def balance_dataset(
    X: List[List[float]],
    y: List[int],
    method: str = 'oversample',
    target_ratio: float = 1.0
) -> Tuple[List[List[float]], List[int]]:
    """
    Balance a dataset with the chosen resampling strategy.

    Alias: balance_data()

    Args:
        X: Feature data
        y: Labels
        method: One of 'oversample', 'undersample', 'smote'
        target_ratio: Desired balance ratio forwarded to the strategy

    Returns:
        tuple: (X_balanced, y_balanced)

    Raises:
        ValueError: If method is not one of the supported strategies.

    Examples:
        >>> X = [[1, 2], [2, 3], [3, 4], [4, 5], [5, 6]]
        >>> y = [0, 0, 0, 0, 1]
        >>> X_bal, y_bal = balance_dataset(X, y, method='oversample')
        >>> len(y_bal) >= len(y)
        True

    Notes:
        - Thin dispatcher over the individual resampling functions
    """
    # Dispatch to the concrete strategy; keep the error message stable.
    if method == 'oversample':
        return random_oversampling(X, y, target_ratio)
    if method == 'undersample':
        return random_undersampling(X, y, target_ratio)
    if method == 'smote':
        return smote_oversampling(X, y, target_ratio=target_ratio)
    raise ValueError(f"Unknown method: {method}")


# Create alias
balance_data = balance_dataset
|
|
582
|
+
|
|
583
|
+
|
|
584
|
+
def minority_class_identifier(y: List[int]) -> int:
    """
    Identify the least frequent class label.

    Alias: minority_class()

    Args:
        y: Labels

    Returns:
        int: Minority class label (ties resolved by first occurrence)

    Examples:
        >>> minority_class_identifier([0, 0, 0, 0, 1])
        1

    Notes:
        - Handy precursor to any resampling step
    """
    # Tally occurrences per label, preserving first-seen order for ties.
    frequency: Dict[int, int] = {}
    for label in y:
        frequency[label] = frequency.get(label, 0) + 1

    rarest = min(frequency.items(), key=lambda pair: pair[1])
    return rarest[0]


# Create alias
minority_class = minority_class_identifier
|
|
622
|
+
|
|
623
|
+
|
|
624
|
+
def imbalance_ratio(y: List[int]) -> float:
    """
    Ratio of the largest class count to the smallest.

    Alias: imbalance_ratio_alias()

    Args:
        y: Labels

    Returns:
        float: Imbalance ratio (1.0 means perfectly balanced)

    Examples:
        >>> imbalance_ratio([0, 0, 0, 0, 1])
        4.0
        >>> imbalance_ratio([0, 0, 1, 1])
        1.0

    Notes:
        - Values above ~3.0 usually warrant resampling or class weights
    """
    # Tally occurrences per label.
    frequency: Dict[int, int] = {}
    for label in y:
        frequency[label] = frequency.get(label, 0) + 1

    counts = frequency.values()
    return max(counts) / min(counts)


# Create alias (different name to avoid conflict)
imbalance_ratio_alias = imbalance_ratio
|
|
667
|
+
|
|
668
|
+
|
|
669
|
+
def synthetic_sample_generator(
    sample: List[float],
    neighbor: List[float],
    alpha: Optional[float] = None
) -> List[float]:
    """
    Interpolate a synthetic sample between two samples.

    Alias: synthetic_sample()

    Computes sample + alpha * (neighbor - sample) component-wise; this is
    the interpolation step at the heart of SMOTE.

    Args:
        sample: First sample
        neighbor: Second sample
        alpha: Interpolation factor in [0, 1); None draws one at random

    Returns:
        list: Synthetic sample

    Examples:
        >>> synthetic_sample_generator([1.0, 2.0], [3.0, 4.0], alpha=0.5)
        [2.0, 3.0]

    Notes:
        - alpha=0 reproduces sample; alpha=1 reproduces neighbor
    """
    # Draw a random blend factor when none is supplied.
    blend = random.random() if alpha is None else alpha

    # Component-wise linear interpolation (indexing keeps the original
    # behavior of raising if neighbor is shorter than sample).
    return [
        sample[i] + blend * (neighbor[i] - sample[i])
        for i in range(len(sample))
    ]


# Create alias
synthetic_sample = synthetic_sample_generator
|
|
716
|
+
|
|
717
|
+
|
|
718
|
+
def near_miss_undersampling(
    X: List[List[float]],
    y: List[int],
    version: int = 1
) -> Tuple[List[List[float]], List[int]]:
    """
    NearMiss undersampling algorithm.

    Alias: near_miss()

    Args:
        X: Feature data
        y: Labels
        version: NearMiss version (1, 2, or 3)

    Returns:
        tuple: (X_resampled, y_resampled)

    Examples:
        >>> from ilovetools.ml import near_miss  # Short alias

        >>> X = [[1, 2], [2, 3], [3, 4], [4, 5], [5, 6]]
        >>> y = [0, 0, 0, 0, 1]
        >>>
        >>> X_res, y_res = near_miss(X, y, version=1)
        >>> print(len(y_res) < len(y))
        True

        >>> from ilovetools.ml import near_miss_undersampling  # Full name
        >>> X_res, y_res = near_miss_undersampling(X, y)

    Notes:
        - Intelligent undersampling
        - Keeps informative samples
        - Better than random
        - NOTE(review): the version parameter is accepted but never read —
          only NearMiss-1 is implemented regardless of its value; confirm
          whether versions 2/3 should raise or be implemented
        - NOTE(review): only the single largest and single smallest classes
          are kept; with 3+ classes the remaining classes are dropped from
          the output entirely — verify this is intended for multiclass input
    """
    def euclidean_distance(p1, p2):
        # Plain Euclidean distance between two equal-length vectors.
        return sum((a - b) ** 2 for a, b in zip(p1, p2)) ** 0.5

    # Separate by class
    class_indices = {}
    for idx, label in enumerate(y):
        if label not in class_indices:
            class_indices[label] = []
        class_indices[label].append(idx)

    # Find majority and minority
    majority_label = max(class_indices, key=lambda k: len(class_indices[k]))
    minority_label = min(class_indices, key=lambda k: len(class_indices[k]))

    majority_indices = class_indices[majority_label]
    minority_indices = class_indices[minority_label]

    # NearMiss-1: Select majority samples closest to minority
    selected_majority = []
    target_size = len(minority_indices)

    # Calculate average distance to minority class
    # (O(|majority| * |minority|) distance evaluations)
    distances = []
    for maj_idx in majority_indices:
        avg_dist = sum(
            euclidean_distance(X[maj_idx], X[min_idx])
            for min_idx in minority_indices
        ) / len(minority_indices)
        distances.append((maj_idx, avg_dist))

    # Select samples with smallest average distance
    distances.sort(key=lambda x: x[1])
    selected_majority = [idx for idx, _ in distances[:target_size]]

    # Combine with minority class
    # Output order: all minority samples first, then selected majority.
    X_resampled = [X[i] for i in minority_indices + selected_majority]
    y_resampled = [y[i] for i in minority_indices + selected_majority]

    return X_resampled, y_resampled


# Create alias
near_miss = near_miss_undersampling
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: ilovetools
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 0.2.0
|
|
4
4
|
Summary: A comprehensive Python utility library with modular tools for AI/ML, data processing, and daily programming needs
|
|
5
5
|
Home-page: https://github.com/AliMehdi512/ilovetools
|
|
6
6
|
Author: Ali Mehdi
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "ilovetools"
|
|
7
|
-
version = "0.
|
|
7
|
+
version = "0.2.0"
|
|
8
8
|
description = "A comprehensive Python utility library with modular tools for AI/ML, data processing, and daily programming needs"
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.8"
|
|
@@ -5,7 +5,7 @@ with open("README.md", "r", encoding="utf-8") as fh:
|
|
|
5
5
|
|
|
6
6
|
setup(
|
|
7
7
|
name="ilovetools",
|
|
8
|
-
version="0.1.
|
|
8
|
+
version="0.1.9",
|
|
9
9
|
author="Ali Mehdi",
|
|
10
10
|
author_email="ali.mehdi.dev579@gmail.com",
|
|
11
11
|
description="A comprehensive Python utility library with modular tools for AI/ML, data processing, and daily programming needs",
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|