ilovetools 0.1.5__py3-none-any.whl → 0.1.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ilovetools/__init__.py +1 -1
- ilovetools/ml/__init__.py +110 -0
- ilovetools/ml/ensemble.py +872 -0
- ilovetools/ml/feature_selection.py +971 -0
- {ilovetools-0.1.5.dist-info → ilovetools-0.1.7.dist-info}/METADATA +1 -1
- {ilovetools-0.1.5.dist-info → ilovetools-0.1.7.dist-info}/RECORD +9 -7
- {ilovetools-0.1.5.dist-info → ilovetools-0.1.7.dist-info}/WHEEL +0 -0
- {ilovetools-0.1.5.dist-info → ilovetools-0.1.7.dist-info}/licenses/LICENSE +0 -0
- {ilovetools-0.1.5.dist-info → ilovetools-0.1.7.dist-info}/top_level.txt +0 -0
ilovetools/ml/ensemble.py
@@ -0,0 +1,872 @@
"""
Ensemble methods utilities for ML workflows

Each function has TWO names: full descriptive name + abbreviated alias
"""

from typing import List, Dict, Any, Callable, Optional, Tuple
import random

__all__ = [
    # Full names
    'voting_classifier',
    'voting_regressor',
    'bagging_predictions',
    'boosting_sequential',
    'stacking_ensemble',
    'weighted_average_ensemble',
    'majority_vote',
    'soft_vote',
    'bootstrap_sample',
    'out_of_bag_score',
    'ensemble_diversity',
    'blend_predictions',
    # Abbreviated aliases
    'vote_clf',
    'vote_reg',
    'bagging',
    'boosting',
    'stacking',
    'weighted_avg',
    'hard_vote',
    'soft_vote_alias',
    'bootstrap',
    'oob_score',
    'diversity',
    'blend',
]

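# A minimal sketch of the dual-naming scheme described above: each alias is
# bound to the same function object as its full name (import path assumed to
# be ilovetools.ml, as in the docstring examples throughout this module).
#
#     >>> from ilovetools.ml import voting_classifier, vote_clf
#     >>> vote_clf is voting_classifier
#     True
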
def voting_classifier(
    predictions: List[List[int]],
    method: str = 'hard',
    weights: Optional[List[float]] = None
) -> List[int]:
    """
    Combine multiple classifier predictions using voting.

    Alias: vote_clf()

    Args:
        predictions: List of prediction arrays from different models
        method: 'hard' (majority vote) or 'soft' (average probabilities)
        weights: Optional weights for each model

    Returns:
        list: Combined predictions

    Examples:
        >>> from ilovetools.ml import vote_clf  # Short alias

        # Hard voting (majority)
        >>> model1_pred = [0, 1, 1, 0, 1]
        >>> model2_pred = [0, 1, 0, 0, 1]
        >>> model3_pred = [1, 1, 1, 0, 1]
        >>> predictions = [model1_pred, model2_pred, model3_pred]
        >>> result = vote_clf(predictions, method='hard')
        >>> print(result)
        [0, 1, 1, 0, 1]

        # Weighted voting
        >>> weights = [0.5, 0.3, 0.2]  # Trust model1 more
        >>> result = vote_clf(predictions, weights=weights)

        >>> from ilovetools.ml import voting_classifier  # Full name
        >>> result = voting_classifier(predictions, method='hard')

    Notes:
        - Hard voting: Majority class wins
        - Soft voting: Average probabilities (need predict_proba)
        - Weighted: Give more importance to better models
        - Odd number of models avoids ties
    """
    if not predictions:
        raise ValueError("predictions cannot be empty")

    n_samples = len(predictions[0])
    n_models = len(predictions)

    if weights is None:
        weights = [1.0] * n_models

    if len(weights) != n_models:
        raise ValueError("weights must match number of models")

    # NOTE: inputs here are class labels, so voting is always the
    # hard/weighted kind; `method` is accepted but not used in the body.
    # For probability-based voting see soft_vote().
    result = []

    for i in range(n_samples):
        votes = {}
        for model_idx, model_preds in enumerate(predictions):
            pred = model_preds[i]
            weight = weights[model_idx]
            votes[pred] = votes.get(pred, 0) + weight

        # Get class with highest weighted vote
        final_pred = max(votes.items(), key=lambda x: x[1])[0]
        result.append(final_pred)

    return result


# Create alias
vote_clf = voting_classifier


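# Tie-breaking sketch for the note above ("odd number of models avoids
# ties"): on an even split, max() returns the first class to reach the top
# weight, so the earlier-listed model wins. Illustrative only:
#
#     >>> vote_clf([[0], [1]])   # two models, one sample, 1.0 vote each
#     [0]
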
def voting_regressor(
    predictions: List[List[float]],
    method: str = 'mean',
    weights: Optional[List[float]] = None
) -> List[float]:
    """
    Combine multiple regressor predictions.

    Alias: vote_reg()

    Args:
        predictions: List of prediction arrays from different models
        method: 'mean', 'median', or 'weighted'
        weights: Optional weights for weighted average

    Returns:
        list: Combined predictions

    Examples:
        >>> from ilovetools.ml import vote_reg  # Short alias

        # Mean averaging
        >>> model1_pred = [100, 200, 300]
        >>> model2_pred = [110, 190, 310]
        >>> model3_pred = [105, 195, 305]
        >>> predictions = [model1_pred, model2_pred, model3_pred]
        >>> result = vote_reg(predictions, method='mean')
        >>> print(result)
        [105.0, 195.0, 305.0]

        # Weighted average
        >>> weights = [0.5, 0.3, 0.2]
        >>> result = vote_reg(predictions, method='weighted', weights=weights)

        # Median (robust to outliers)
        >>> result = vote_reg(predictions, method='median')

        >>> from ilovetools.ml import voting_regressor  # Full name
        >>> result = voting_regressor(predictions, method='mean')

    Notes:
        - Mean: Simple average
        - Median: Robust to outliers
        - Weighted: Trust better models more
        - Use median for noisy predictions
    """
    if not predictions:
        raise ValueError("predictions cannot be empty")

    n_samples = len(predictions[0])
    n_models = len(predictions)

    result = []

    for i in range(n_samples):
        values = [model_preds[i] for model_preds in predictions]

        if method == 'mean':
            combined = sum(values) / len(values)
        elif method == 'median':
            sorted_values = sorted(values)
            mid = len(sorted_values) // 2
            if len(sorted_values) % 2 == 0:
                combined = (sorted_values[mid - 1] + sorted_values[mid]) / 2
            else:
                combined = sorted_values[mid]
        elif method == 'weighted':
            if weights is None:
                raise ValueError("weights required for weighted method")
            if len(weights) != n_models:
                raise ValueError("weights must match number of models")
            combined = sum(v * w for v, w in zip(values, weights)) / sum(weights)
        else:
            raise ValueError(f"Unknown method: {method}")

        result.append(combined)

    return result


# Create alias
vote_reg = voting_regressor


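# Why the docstring recommends median for noisy predictions (a small,
# hand-checked illustration): one outlier model drags the mean but barely
# moves the median.
#
#     >>> preds = [[100], [102], [500]]      # third model is way off
#     >>> vote_reg(preds, method='mean')     # (100 + 102 + 500) / 3
#     [234.0]
#     >>> vote_reg(preds, method='median')
#     [102]
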
def bagging_predictions(
    X: List,
    y: List,
    model_func: Callable,
    n_models: int = 10,
    sample_size: float = 1.0,
    random_state: Optional[int] = None
) -> Tuple[List[Any], List[List]]:
    """
    Bootstrap Aggregating (Bagging) ensemble.

    Alias: bagging()

    Train multiple models on bootstrap samples and average predictions.

    Args:
        X: Feature data
        y: Target data
        model_func: Function(X_train, y_train, X_test) -> predictions
        n_models: Number of models to train. Default: 10
        sample_size: Proportion of data for each bootstrap. Default: 1.0
        random_state: Random seed for reproducibility

    Returns:
        tuple: (final_predictions, all_model_predictions)

    Examples:
        >>> from ilovetools.ml import bagging  # Short alias

        >>> X = [[1], [2], [3], [4], [5]]
        >>> y = [1, 2, 3, 4, 5]
        >>>
        >>> def simple_model(X_tr, y_tr, X_te):
        ...     avg = sum(y_tr) / len(y_tr)
        ...     return [avg] * len(X_te)
        >>>
        >>> final_pred, all_preds = bagging(X, y, simple_model, n_models=5)
        >>> print(f"Trained {len(all_preds)} models")
        Trained 5 models

        >>> from ilovetools.ml import bagging_predictions  # Full name
        >>> final_pred, all_preds = bagging_predictions(X, y, simple_model)

    Notes:
        - Reduces variance (overfitting)
        - Each model sees different data
        - Random Forest uses bagging
        - More models = more stable
    """
    if random_state is not None:
        random.seed(random_state)

    n_samples = len(X)
    bootstrap_size = int(n_samples * sample_size)

    all_predictions = []

    for _ in range(n_models):
        # Bootstrap sample (with replacement)
        indices = [random.randint(0, n_samples - 1) for _ in range(bootstrap_size)]
        X_bootstrap = [X[i] for i in indices]
        y_bootstrap = [y[i] for i in indices]

        # Train model and predict on original data
        predictions = model_func(X_bootstrap, y_bootstrap, X)
        all_predictions.append(predictions)

    # Average predictions
    final_predictions = []
    for i in range(n_samples):
        avg = sum(preds[i] for preds in all_predictions) / n_models
        final_predictions.append(avg)

    return final_predictions, all_predictions


# Create alias
bagging = bagging_predictions


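# Where the "~63% unique samples" figure in the bagging notes comes from
# (a back-of-envelope check, not part of the API): a bootstrap draw of n
# samples misses any given row with probability (1 - 1/n)**n -> 1/e, so
# roughly 1 - 1/e, i.e. about 63.2% of rows, appear at least once.
#
#     >>> n = 1_000_000
#     >>> round(1 - (1 - 1 / n) ** n, 3)
#     0.632
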
def boosting_sequential(
    X: List,
    y: List,
    model_func: Callable,
    n_models: int = 10,
    learning_rate: float = 0.1
) -> Tuple[List[float], List[List]]:
    """
    Sequential Boosting ensemble.

    Alias: boosting()

    Train models sequentially, each focusing on previous errors.

    Args:
        X: Feature data
        y: Target data
        model_func: Function(X_train, y_train, weights) -> predictions
        n_models: Number of models to train. Default: 10
        learning_rate: Shrinkage parameter. Default: 0.1

    Returns:
        tuple: (final_predictions, all_model_predictions)

    Examples:
        >>> from ilovetools.ml import boosting  # Short alias

        >>> X = [[1], [2], [3], [4], [5]]
        >>> y = [1.0, 2.0, 3.0, 4.0, 5.0]
        >>>
        >>> def simple_model(X_tr, y_tr, weights):
        ...     # Weighted average
        ...     total_weight = sum(weights)
        ...     weighted_sum = sum(y * w for y, w in zip(y_tr, weights))
        ...     avg = weighted_sum / total_weight
        ...     return [avg] * len(X_tr)
        >>>
        >>> final_pred, all_preds = boosting(X, y, simple_model, n_models=3)

        >>> from ilovetools.ml import boosting_sequential  # Full name
        >>> final_pred, all_preds = boosting_sequential(X, y, simple_model)

    Notes:
        - Reduces bias (underfitting)
        - Each model fixes previous errors
        - XGBoost, AdaBoost use boosting
        - Lower learning_rate = more models needed
    """
    n_samples = len(X)

    # Initialize weights uniformly
    weights = [1.0 / n_samples] * n_samples

    all_predictions = []
    final_predictions = [0.0] * n_samples

    for _ in range(n_models):
        # Train model with current weights
        predictions = model_func(X, y, weights)
        all_predictions.append(predictions)

        # Update final predictions
        for i in range(n_samples):
            final_predictions[i] += learning_rate * predictions[i]

        # Calculate errors
        errors = [abs(y[i] - final_predictions[i]) for i in range(n_samples)]

        # Update weights (focus on high error samples)
        total_error = sum(errors)
        if total_error > 0:
            weights = [e / total_error for e in errors]
        else:
            weights = [1.0 / n_samples] * n_samples

    return final_predictions, all_predictions


# Create alias
boosting = boosting_sequential


def stacking_ensemble(
    base_predictions: List[List],
    y_true: List,
    meta_model_func: Callable
) -> List:
    """
    Stacking ensemble with meta-model.

    Alias: stacking()

    Train meta-model to combine base model predictions.

    Args:
        base_predictions: List of prediction arrays from base models
        y_true: True target values
        meta_model_func: Function(X_meta, y_meta) -> meta_model

    Returns:
        list: Meta-model predictions

    Examples:
        >>> from ilovetools.ml import stacking  # Short alias

        >>> # Base model predictions
        >>> model1_pred = [1.0, 2.0, 3.0, 4.0, 5.0]
        >>> model2_pred = [1.1, 1.9, 3.1, 3.9, 5.1]
        >>> model3_pred = [0.9, 2.1, 2.9, 4.1, 4.9]
        >>> base_preds = [model1_pred, model2_pred, model3_pred]
        >>> y_true = [1.0, 2.0, 3.0, 4.0, 5.0]
        >>>
        >>> def meta_model(X_meta, y_meta):
        ...     # Simple weighted average learner
        ...     def predict(X_test):
        ...         return [sum(x) / len(x) for x in X_test]
        ...     return predict
        >>>
        >>> meta_preds = stacking(base_preds, y_true, meta_model)

        >>> from ilovetools.ml import stacking_ensemble  # Full name
        >>> meta_preds = stacking_ensemble(base_preds, y_true, meta_model)

    Notes:
        - Most powerful ensemble method
        - Meta-model learns optimal combination
        - Kaggle winners use stacking
        - Requires more data and compute
    """
    if not base_predictions:
        raise ValueError("base_predictions cannot be empty")

    n_samples = len(base_predictions[0])
    n_models = len(base_predictions)

    # Create meta-features (transpose predictions)
    X_meta = []
    for i in range(n_samples):
        meta_features = [base_preds[i] for base_preds in base_predictions]
        X_meta.append(meta_features)

    # Train meta-model
    meta_model = meta_model_func(X_meta, y_true)

    # Get meta-predictions
    meta_predictions = meta_model(X_meta)

    return meta_predictions


# Create alias
stacking = stacking_ensemble


def weighted_average_ensemble(
    predictions: List[List[float]],
    weights: List[float]
) -> List[float]:
    """
    Weighted average of predictions.

    Alias: weighted_avg()

    Args:
        predictions: List of prediction arrays
        weights: Weight for each model

    Returns:
        list: Weighted average predictions

    Examples:
        >>> from ilovetools.ml import weighted_avg  # Short alias

        >>> model1 = [100, 200, 300]
        >>> model2 = [110, 190, 310]
        >>> model3 = [105, 195, 305]
        >>> predictions = [model1, model2, model3]
        >>> weights = [0.5, 0.3, 0.2]  # Trust model1 most
        >>> result = weighted_avg(predictions, weights)
        >>> print(result)
        [104.0, 196.0, 304.0]

        >>> from ilovetools.ml import weighted_average_ensemble  # Full name
        >>> result = weighted_average_ensemble(predictions, weights)

    Notes:
        - Give more weight to better models
        - Weights should sum to 1.0
        - Use CV to find optimal weights
        - Simple but effective
    """
    if len(predictions) != len(weights):
        raise ValueError("predictions and weights must have same length")

    n_samples = len(predictions[0])
    result = []

    for i in range(n_samples):
        weighted_sum = sum(preds[i] * w for preds, w in zip(predictions, weights))
        result.append(weighted_sum)

    return result


# Create alias
weighted_avg = weighted_average_ensemble


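# weighted_average_ensemble() does not normalize weights internally, so
# un-normalized weights rescale the output. A minimal sketch of normalizing
# first (the raw scores are assumed to come from CV, per the note above):
#
#     >>> raw = [2.0, 1.0, 1.0]
#     >>> weights = [w / sum(raw) for w in raw]
#     >>> weighted_avg([[100], [110], [105]], weights)
#     [103.75]
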
def majority_vote(predictions: List[List[int]]) -> List[int]:
    """
    Hard voting (majority vote) for classification.

    Alias: hard_vote()

    Args:
        predictions: List of prediction arrays

    Returns:
        list: Majority vote predictions

    Examples:
        >>> from ilovetools.ml import hard_vote  # Short alias

        >>> model1 = [0, 1, 1, 0, 1]
        >>> model2 = [0, 1, 0, 0, 1]
        >>> model3 = [1, 1, 1, 0, 1]
        >>> predictions = [model1, model2, model3]
        >>> result = hard_vote(predictions)
        >>> print(result)
        [0, 1, 1, 0, 1]

        >>> from ilovetools.ml import majority_vote  # Full name
        >>> result = majority_vote(predictions)

    Notes:
        - Simple majority wins
        - Use odd number of models
        - Fast and interpretable
        - Good for balanced models
    """
    n_samples = len(predictions[0])
    result = []

    for i in range(n_samples):
        votes = [preds[i] for preds in predictions]
        # Count votes
        vote_counts = {}
        for vote in votes:
            vote_counts[vote] = vote_counts.get(vote, 0) + 1
        # Get majority
        majority = max(vote_counts.items(), key=lambda x: x[1])[0]
        result.append(majority)

    return result


# Create alias
hard_vote = majority_vote


def soft_vote(
    probabilities: List[List[List[float]]],
    weights: Optional[List[float]] = None
) -> List[int]:
    """
    Soft voting using predicted probabilities.

    Alias: soft_vote_alias()

    Args:
        probabilities: List of probability arrays [n_models][n_samples][n_classes]
        weights: Optional weights for each model

    Returns:
        list: Predicted classes based on averaged probabilities

    Examples:
        >>> from ilovetools.ml import soft_vote_alias  # Alias

        # Binary classification probabilities
        >>> model1_proba = [[0.8, 0.2], [0.3, 0.7], [0.6, 0.4]]
        >>> model2_proba = [[0.7, 0.3], [0.4, 0.6], [0.5, 0.5]]
        >>> probabilities = [model1_proba, model2_proba]
        >>> result = soft_vote_alias(probabilities)
        >>> print(result)
        [0, 1, 0]

        >>> from ilovetools.ml import soft_vote  # Primary name
        >>> result = soft_vote(probabilities)

    Notes:
        - Uses probability information
        - More nuanced than hard voting
        - Requires predict_proba
        - Better for uncertain predictions
    """
    n_models = len(probabilities)
    n_samples = len(probabilities[0])
    n_classes = len(probabilities[0][0])

    if weights is None:
        weights = [1.0] * n_models

    result = []

    for i in range(n_samples):
        # Average probabilities across models
        avg_proba = [0.0] * n_classes
        for model_idx, model_proba in enumerate(probabilities):
            for class_idx in range(n_classes):
                avg_proba[class_idx] += model_proba[i][class_idx] * weights[model_idx]

        # Normalize
        total = sum(avg_proba)
        avg_proba = [p / total for p in avg_proba]

        # Get class with highest probability
        predicted_class = avg_proba.index(max(avg_proba))
        result.append(predicted_class)

    return result


# Create alias
soft_vote_alias = soft_vote


def bootstrap_sample(
    X: List,
    y: List,
    sample_size: Optional[int] = None,
    random_state: Optional[int] = None
) -> Tuple[List, List, List[int]]:
    """
    Create bootstrap sample (sampling with replacement).

    Alias: bootstrap()

    Args:
        X: Feature data
        y: Target data
        sample_size: Size of bootstrap sample. Default: len(X)
        random_state: Random seed

    Returns:
        tuple: (X_bootstrap, y_bootstrap, indices)

    Examples:
        >>> from ilovetools.ml import bootstrap  # Short alias

        >>> X = [1, 2, 3, 4, 5]
        >>> y = [10, 20, 30, 40, 50]
        >>> X_boot, y_boot, indices = bootstrap(X, y, random_state=42)
        >>> print(f"Bootstrap size: {len(X_boot)}")
        Bootstrap size: 5
        >>> print(f"Unique samples: {len(set(indices))}")

        >>> from ilovetools.ml import bootstrap_sample  # Full name
        >>> X_boot, y_boot, indices = bootstrap_sample(X, y)

    Notes:
        - Sampling with replacement
        - Some samples appear multiple times
        - ~63% unique samples on average
        - Foundation of bagging
    """
    if random_state is not None:
        random.seed(random_state)

    n = len(X)
    if sample_size is None:
        sample_size = n

    indices = [random.randint(0, n - 1) for _ in range(sample_size)]
    X_bootstrap = [X[i] for i in indices]
    y_bootstrap = [y[i] for i in indices]

    return X_bootstrap, y_bootstrap, indices


# Create alias
bootstrap = bootstrap_sample


def out_of_bag_score(
    X: List,
    y: List,
    model_func: Callable,
    n_models: int = 10,
    random_state: Optional[int] = None
) -> float:
    """
    Calculate Out-of-Bag (OOB) score for bagging.

    Alias: oob_score()

    Args:
        X: Feature data
        y: Target data
        model_func: Function(X_train, y_train, X_test) -> predictions
        n_models: Number of bootstrap models. Default: 10
        random_state: Random seed

    Returns:
        float: OOB score (accuracy for classification)

    Examples:
        >>> from ilovetools.ml import oob_score  # Short alias

        >>> X = [[1], [2], [3], [4], [5]]
        >>> y = [1, 2, 3, 4, 5]
        >>>
        >>> def model(X_tr, y_tr, X_te):
        ...     avg = sum(y_tr) / len(y_tr)
        ...     return [avg] * len(X_te)
        >>>
        >>> score = oob_score(X, y, model, n_models=5, random_state=42)
        >>> print(f"OOB Score: {score:.2f}")

        >>> from ilovetools.ml import out_of_bag_score  # Full name
        >>> score = out_of_bag_score(X, y, model)

    Notes:
        - Free validation without separate test set
        - Uses samples not in bootstrap
        - ~37% samples are OOB per model
        - Good estimate of generalization
    """
    if random_state is not None:
        random.seed(random_state)

    n_samples = len(X)
    oob_predictions = [[] for _ in range(n_samples)]

    for _ in range(n_models):
        # Bootstrap sample
        indices = [random.randint(0, n_samples - 1) for _ in range(n_samples)]
        X_bootstrap = [X[i] for i in indices]
        y_bootstrap = [y[i] for i in indices]

        # Find OOB samples
        oob_indices = [i for i in range(n_samples) if i not in indices]

        if not oob_indices:
            continue

        X_oob = [X[i] for i in oob_indices]

        # Predict on OOB samples
        predictions = model_func(X_bootstrap, y_bootstrap, X_oob)

        # Store OOB predictions
        for idx, pred in zip(oob_indices, predictions):
            oob_predictions[idx].append(pred)

    # Calculate OOB score
    correct = 0
    total = 0

    for i, preds in enumerate(oob_predictions):
        if preds:  # Has OOB predictions
            avg_pred = sum(preds) / len(preds)
            if abs(avg_pred - y[i]) < 0.5:  # For classification
                correct += 1
            total += 1

    return correct / total if total > 0 else 0.0


# Create alias
oob_score = out_of_bag_score


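# The "~37% samples are OOB per model" figure above is the complement of
# bagging's ~63%: a row is out-of-bag when all n draws miss it, which
# happens with probability (1 - 1/n)**n -> 1/e. A quick numeric check
# (illustrative only):
#
#     >>> n = 1_000_000
#     >>> round((1 - 1 / n) ** n, 3)
#     0.368
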
def ensemble_diversity(
    predictions: List[List[int]]
) -> float:
    """
    Calculate diversity among ensemble models.

    Alias: diversity()

    Higher diversity = Better ensemble potential

    Args:
        predictions: List of prediction arrays

    Returns:
        float: Diversity score (0.0 to 1.0)

    Examples:
        >>> from ilovetools.ml import diversity  # Short alias

        # High diversity (different predictions)
        >>> model1 = [0, 1, 0, 1, 0]
        >>> model2 = [1, 0, 1, 0, 1]
        >>> model3 = [0, 0, 1, 1, 0]
        >>> predictions = [model1, model2, model3]
        >>> div = diversity(predictions)
        >>> print(f"Diversity: {div:.2%}")

        # Low diversity (similar predictions)
        >>> model1 = [0, 1, 0, 1, 0]
        >>> model2 = [0, 1, 0, 1, 0]
        >>> model3 = [0, 1, 0, 1, 1]
        >>> predictions = [model1, model2, model3]
        >>> div = diversity(predictions)

        >>> from ilovetools.ml import ensemble_diversity  # Full name
        >>> div = ensemble_diversity(predictions)

    Notes:
        - High diversity = Models make different errors
        - Low diversity = Models too similar
        - Aim for diverse but accurate models
        - Use different algorithms for diversity
    """
    n_models = len(predictions)
    n_samples = len(predictions[0])

    if n_models < 2:
        return 0.0

    # Calculate pairwise disagreement
    total_disagreement = 0
    pairs = 0

    for i in range(n_models):
        for j in range(i + 1, n_models):
            disagreement = sum(1 for k in range(n_samples)
                               if predictions[i][k] != predictions[j][k])
            total_disagreement += disagreement / n_samples
            pairs += 1

    return total_disagreement / pairs if pairs > 0 else 0.0


# Create alias
diversity = ensemble_diversity


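# Hand-checking ensemble_diversity() on two models (a minimal sketch):
# diversity is the mean pairwise disagreement rate, so two models that
# differ on 2 of 4 samples score 0.5.
#
#     >>> diversity([[0, 1, 0, 1], [0, 1, 1, 0]])   # differ at indices 2, 3
#     0.5
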
def blend_predictions(
    train_predictions: List[List],
    test_predictions: List[List],
    y_train: List,
    blend_func: Callable
) -> List:
    """
    Blend predictions using a blending function.

    Alias: blend()

    Args:
        train_predictions: Base model predictions on training set
        test_predictions: Base model predictions on test set
        y_train: Training labels
        blend_func: Function to learn blending weights

    Returns:
        list: Blended test predictions

    Examples:
        >>> from ilovetools.ml import blend  # Short alias

        >>> train_preds = [[1, 2, 3], [1.1, 1.9, 3.1]]
        >>> test_preds = [[4, 5], [3.9, 5.1]]
        >>> y_train = [1, 2, 3]
        >>>
        >>> def simple_blend(train_p, y_tr):
        ...     # Learn to average
        ...     def predict(test_p):
        ...         return [sum(p) / len(p) for p in zip(*test_p)]
        ...     return predict
        >>>
        >>> result = blend(train_preds, test_preds, y_train, simple_blend)

        >>> from ilovetools.ml import blend_predictions  # Full name
        >>> result = blend_predictions(train_preds, test_preds, y_train, simple_blend)

    Notes:
        - Similar to stacking but simpler
        - Uses holdout set for blending
        - Less prone to overfitting than stacking
        - Popular in Kaggle competitions
    """
    # Learn blending function on training predictions
    blender = blend_func(train_predictions, y_train)

    # Apply to test predictions
    blended_predictions = blender(test_predictions)

    return blended_predictions


# Create alias
blend = blend_predictions