ilovetools-0.2.3-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ilovetools/__init__.py +42 -0
- ilovetools/ai/__init__.py +13 -0
- ilovetools/ai/embeddings.py +270 -0
- ilovetools/ai/inference.py +5 -0
- ilovetools/ai/llm_helpers.py +141 -0
- ilovetools/audio/__init__.py +5 -0
- ilovetools/automation/__init__.py +5 -0
- ilovetools/conversion/__init__.py +5 -0
- ilovetools/data/__init__.py +27 -0
- ilovetools/data/feature_engineering.py +497 -0
- ilovetools/data/preprocessing.py +234 -0
- ilovetools/database/__init__.py +5 -0
- ilovetools/datetime/__init__.py +5 -0
- ilovetools/files/__init__.py +5 -0
- ilovetools/image/__init__.py +5 -0
- ilovetools/ml/__init__.py +603 -0
- ilovetools/ml/clustering.py +1107 -0
- ilovetools/ml/cross_validation.py +612 -0
- ilovetools/ml/dimensionality.py +1001 -0
- ilovetools/ml/ensemble.py +872 -0
- ilovetools/ml/feature_selection.py +971 -0
- ilovetools/ml/imbalanced.py +797 -0
- ilovetools/ml/interpretation.py +915 -0
- ilovetools/ml/metrics.py +601 -0
- ilovetools/ml/pipeline.py +711 -0
- ilovetools/ml/timeseries.py +984 -0
- ilovetools/ml/tuning.py +781 -0
- ilovetools/security/__init__.py +5 -0
- ilovetools/text/__init__.py +5 -0
- ilovetools/utils/__init__.py +5 -0
- ilovetools/validation/__init__.py +5 -0
- ilovetools/web/__init__.py +5 -0
- ilovetools-0.2.3.dist-info/METADATA +143 -0
- ilovetools-0.2.3.dist-info/RECORD +38 -0
- ilovetools-0.2.3.dist-info/WHEEL +5 -0
- ilovetools-0.2.3.dist-info/licenses/LICENSE +21 -0
- ilovetools-0.2.3.dist-info/top_level.txt +2 -0
- tests/__init__.py +3 -0
ilovetools/ml/tuning.py
ADDED
@@ -0,0 +1,781 @@
"""
Hyperparameter tuning utilities for ML workflows
Each function has TWO names: full descriptive name + abbreviated alias
"""

from typing import List, Dict, Any, Callable, Optional, Tuple
import random
import itertools

__all__ = [
    # Full names
    'grid_search_cv',
    'random_search_cv',
    'generate_param_grid',
    'extract_best_params',
    'format_cv_results',
    'learning_curve_data',
    'validation_curve_data',
    'early_stopping_monitor',
    'compare_models_cv',
    'bayesian_search_simple',
    # Abbreviated aliases
    'gridsearch',
    'randomsearch',
    'param_grid',
    'best_params',
    'cv_results',
    'learning_curve',
    'val_curve',
    'early_stop',
    'compare_models',
    'bayesopt',
]

|
|
37
|
+
X: List,
|
|
38
|
+
y: List,
|
|
39
|
+
model_func: Callable,
|
|
40
|
+
param_grid: Dict[str, List],
|
|
41
|
+
metric_func: Callable,
|
|
42
|
+
cv_splits: int = 5
|
|
43
|
+
) -> Dict[str, Any]:
|
|
44
|
+
"""
|
|
45
|
+
Grid Search Cross-Validation for hyperparameter tuning.
|
|
46
|
+
|
|
47
|
+
Alias: gridsearch()
|
|
48
|
+
|
|
49
|
+
Exhaustively searches all parameter combinations.
|
|
50
|
+
|
|
51
|
+
Args:
|
|
52
|
+
X: Feature data
|
|
53
|
+
y: Target data
|
|
54
|
+
model_func: Function(params, X_train, y_train, X_val) -> predictions
|
|
55
|
+
param_grid: Dictionary of parameter lists
|
|
56
|
+
metric_func: Function(y_true, y_pred) -> score
|
|
57
|
+
cv_splits: Number of CV folds. Default: 5
|
|
58
|
+
|
|
59
|
+
Returns:
|
|
60
|
+
dict: Best parameters, best score, all results
|
|
61
|
+
|
|
62
|
+
Examples:
|
|
63
|
+
>>> from ilovetools.ml import gridsearch # Short alias
|
|
64
|
+
>>> X = [[1], [2], [3], [4], [5]]
|
|
65
|
+
>>> y = [1, 2, 3, 4, 5]
|
|
66
|
+
>>>
|
|
67
|
+
>>> def model(params, X_tr, y_tr, X_val):
|
|
68
|
+
... # Simple model with threshold param
|
|
69
|
+
... threshold = params['threshold']
|
|
70
|
+
... avg = sum(y_tr) / len(y_tr)
|
|
71
|
+
... return [avg + threshold] * len(X_val)
|
|
72
|
+
>>>
|
|
73
|
+
>>> def metric(y_true, y_pred):
|
|
74
|
+
... return -sum(abs(t - p) for t, p in zip(y_true, y_pred)) / len(y_true)
|
|
75
|
+
>>>
|
|
76
|
+
>>> param_grid = {'threshold': [0, 0.5, 1.0]}
|
|
77
|
+
>>> results = gridsearch(X, y, model, param_grid, metric, cv_splits=3)
|
|
78
|
+
>>> print(results['best_params'])
|
|
79
|
+
|
|
80
|
+
>>> from ilovetools.ml import grid_search_cv # Full name
|
|
81
|
+
>>> results = grid_search_cv(X, y, model, param_grid, metric)
|
|
82
|
+
|
|
83
|
+
Notes:
|
|
84
|
+
- Tries all combinations
|
|
85
|
+
- Exhaustive but slow
|
|
86
|
+
- Good for small parameter spaces
|
|
87
|
+
- Guaranteed to find best in grid
|
|
88
|
+
"""
|
|
89
|
+
from .cross_validation import k_fold_cross_validation
|
|
90
|
+
|
|
91
|
+
# Generate all parameter combinations
|
|
92
|
+
param_names = list(param_grid.keys())
|
|
93
|
+
param_values = [param_grid[name] for name in param_names]
|
|
94
|
+
param_combinations = list(itertools.product(*param_values))
|
|
95
|
+
|
|
96
|
+
results = []
|
|
97
|
+
|
|
98
|
+
for combo in param_combinations:
|
|
99
|
+
params = dict(zip(param_names, combo))
|
|
100
|
+
|
|
101
|
+
# Perform CV
|
|
102
|
+
splits = k_fold_cross_validation(X, y, k=cv_splits)
|
|
103
|
+
scores = []
|
|
104
|
+
|
|
105
|
+
for train_idx, val_idx in splits:
|
|
106
|
+
X_train = [X[i] for i in train_idx]
|
|
107
|
+
y_train = [y[i] for i in train_idx]
|
|
108
|
+
X_val = [X[i] for i in val_idx]
|
|
109
|
+
y_val = [y[i] for i in val_idx]
|
|
110
|
+
|
|
111
|
+
y_pred = model_func(params, X_train, y_train, X_val)
|
|
112
|
+
score = metric_func(y_val, y_pred)
|
|
113
|
+
scores.append(score)
|
|
114
|
+
|
|
115
|
+
mean_score = sum(scores) / len(scores)
|
|
116
|
+
|
|
117
|
+
results.append({
|
|
118
|
+
'params': params,
|
|
119
|
+
'mean_score': mean_score,
|
|
120
|
+
'scores': scores
|
|
121
|
+
})
|
|
122
|
+
|
|
123
|
+
# Find best
|
|
124
|
+
best_result = max(results, key=lambda x: x['mean_score'])
|
|
125
|
+
|
|
126
|
+
return {
|
|
127
|
+
'best_params': best_result['params'],
|
|
128
|
+
'best_score': best_result['mean_score'],
|
|
129
|
+
'all_results': results,
|
|
130
|
+
'n_combinations': len(param_combinations)
|
|
131
|
+
}
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
# Create alias
|
|
135
|
+
gridsearch = grid_search_cv
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
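Because grid_search_cv re-fits the model on every fold of every combination, the total work is (product of grid sizes) x cv_splits fits. A quick standalone estimate before committing to a run (illustrative sketch, not part of the package):

import itertools

grid = {'threshold': [0, 0.5, 1.0], 'alpha': [0.1, 0.5, 1.0, 2.0]}
n_combinations = len(list(itertools.product(*grid.values())))
print(n_combinations * 5)  # 12 combinations x 5 folds = 60 model fits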
def random_search_cv(
    X: List,
    y: List,
    model_func: Callable,
    param_distributions: Dict[str, List],
    metric_func: Callable,
    n_iter: int = 10,
    cv_splits: int = 5,
    random_state: Optional[int] = None
) -> Dict[str, Any]:
    """
    Random Search Cross-Validation for hyperparameter tuning.

    Alias: randomsearch()

    Randomly samples parameter combinations. Faster than grid search.

    Args:
        X: Feature data
        y: Target data
        model_func: Function(params, X_train, y_train, X_val) -> predictions
        param_distributions: Dictionary of parameter lists
        metric_func: Function(y_true, y_pred) -> score
        n_iter: Number of random combinations to try. Default: 10
        cv_splits: Number of CV folds. Default: 5
        random_state: Random seed for reproducibility

    Returns:
        dict: Best parameters, best score, all results

    Examples:
        >>> from ilovetools.ml import randomsearch  # Short alias
        >>> X = [[1], [2], [3], [4], [5]]
        >>> y = [1, 2, 3, 4, 5]
        >>>
        >>> def model(params, X_tr, y_tr, X_val):
        ...     alpha = params['alpha']
        ...     avg = sum(y_tr) / len(y_tr)
        ...     return [avg * alpha] * len(X_val)
        >>>
        >>> def metric(y_true, y_pred):
        ...     return -sum(abs(t - p) for t, p in zip(y_true, y_pred)) / len(y_true)
        >>>
        >>> param_dist = {'alpha': [0.5, 0.8, 1.0, 1.2, 1.5]}
        >>> results = randomsearch(X, y, model, param_dist, metric, n_iter=3)

        >>> from ilovetools.ml import random_search_cv  # Full name
        >>> results = random_search_cv(X, y, model, param_dist, metric)

    Notes:
        - Faster than grid search
        - Often finds good params quickly
        - Good for large parameter spaces
        - May miss optimal combination
    """
    from .cross_validation import k_fold_cross_validation

    if random_state is not None:
        random.seed(random_state)

    param_names = list(param_distributions.keys())
    results = []

    for _ in range(n_iter):
        # Random sample
        params = {name: random.choice(param_distributions[name]) for name in param_names}

        # Perform CV
        splits = k_fold_cross_validation(X, y, k=cv_splits)
        scores = []

        for train_idx, val_idx in splits:
            X_train = [X[i] for i in train_idx]
            y_train = [y[i] for i in train_idx]
            X_val = [X[i] for i in val_idx]
            y_val = [y[i] for i in val_idx]

            y_pred = model_func(params, X_train, y_train, X_val)
            score = metric_func(y_val, y_pred)
            scores.append(score)

        mean_score = sum(scores) / len(scores)

        results.append({
            'params': params,
            'mean_score': mean_score,
            'scores': scores
        })

    # Find best
    best_result = max(results, key=lambda x: x['mean_score'])

    return {
        'best_params': best_result['params'],
        'best_score': best_result['mean_score'],
        'all_results': results,
        'n_iterations': n_iter
    }


# Create alias
randomsearch = random_search_cv

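One caveat worth knowing: random_search_cv seeds Python's global random module when random_state is given, so it can affect other code drawing from random in the same process. With a fixed seed, repeated calls sample the same candidates (usage sketch; assumes the wheel is installed and that the package's CV splitter draws from the same seeded random module):

from ilovetools.ml import randomsearch

X = [[i] for i in range(10)]
y = [2.0 * i for i in range(10)]

def model(params, X_tr, y_tr, X_val):
    avg = sum(y_tr) / len(y_tr)
    return [avg * params['alpha']] * len(X_val)

def neg_mae(y_true, y_pred):
    return -sum(abs(t - p) for t, p in zip(y_true, y_pred)) / len(y_true)

dist = {'alpha': [0.5, 0.8, 1.0, 1.2, 1.5]}
run1 = randomsearch(X, y, model, dist, neg_mae, n_iter=3, random_state=42)
run2 = randomsearch(X, y, model, dist, neg_mae, n_iter=3, random_state=42)
print(run1['best_params'] == run2['best_params'])  # True: same seed, same draws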
def generate_param_grid(
    param_ranges: Dict[str, Tuple[float, float, int]]
) -> Dict[str, List[float]]:
    """
    Generate parameter grid from ranges.

    Alias: param_grid()

    Creates evenly spaced parameter values.

    Args:
        param_ranges: Dict of (min, max, n_values) tuples

    Returns:
        dict: Parameter grid

    Examples:
        >>> from ilovetools.ml import param_grid  # Short alias
        >>> ranges = {
        ...     'learning_rate': (0.001, 0.1, 5),
        ...     'max_depth': (3, 10, 4)
        ... }
        >>> grid = param_grid(ranges)
        >>> print(grid)  # max_depth values shown rounded to 3 decimals
        {'learning_rate': [0.001, 0.02575, 0.0505, 0.07525, 0.1], 'max_depth': [3.0, 5.333, 7.667, 10.0]}

        >>> from ilovetools.ml import generate_param_grid  # Full name
        >>> grid = generate_param_grid(ranges)

    Notes:
        - Creates evenly spaced values
        - Useful for continuous parameters
        - Combine with grid_search_cv
        - Adjust n_values for granularity
    """
    grid = {}

    for param_name, (min_val, max_val, n_values) in param_ranges.items():
        if n_values == 1:
            grid[param_name] = [min_val]
        else:
            step = (max_val - min_val) / (n_values - 1)
            grid[param_name] = [min_val + i * step for i in range(n_values)]

    return grid


# Create alias
param_grid = generate_param_grid

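generate_param_grid is the natural front end for grid_search_cv: describe each parameter as (min, max, n_values) and pass the expanded grid straight to the search. A small sketch of the hand-off (assumes the wheel is installed):

from ilovetools.ml import param_grid

grid = param_grid({'threshold': (0.0, 1.0, 5)})
print(grid['threshold'])  # [0.0, 0.25, 0.5, 0.75, 1.0]
# grid can now be passed as the param_grid argument of grid_search_cv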
def extract_best_params(search_results: Dict[str, Any]) -> Dict[str, Any]:
    """
    Extract best parameters from search results.

    Alias: best_params()

    Args:
        search_results: Results from grid_search_cv or random_search_cv

    Returns:
        dict: Best parameters

    Examples:
        >>> from ilovetools.ml import best_params  # Short alias
        >>> results = {'best_params': {'alpha': 0.5}, 'best_score': 0.95}
        >>> params = best_params(results)
        >>> print(params)
        {'alpha': 0.5}

        >>> from ilovetools.ml import extract_best_params  # Full name
        >>> params = extract_best_params(results)

    Notes:
        - Simple extraction utility
        - Works with any search method
        - Returns clean parameter dict
        - Use for model training
    """
    return search_results.get('best_params', {})


# Create alias
best_params = extract_best_params

def format_cv_results(search_results: Dict[str, Any], top_n: int = 5) -> List[Dict]:
    """
    Format CV results for easy viewing.

    Alias: cv_results()

    Args:
        search_results: Results from search
        top_n: Number of top results to return. Default: 5

    Returns:
        list: Formatted top results

    Examples:
        >>> from ilovetools.ml import cv_results  # Short alias
        >>> results = {
        ...     'all_results': [
        ...         {'params': {'a': 1}, 'mean_score': 0.8},
        ...         {'params': {'a': 2}, 'mean_score': 0.9}
        ...     ]
        ... }
        >>> top = cv_results(results, top_n=2)

        >>> from ilovetools.ml import format_cv_results  # Full name
        >>> top = format_cv_results(results)

    Notes:
        - Shows top performing combinations
        - Sorted by score
        - Easy comparison
        - Use for analysis
    """
    all_results = search_results.get('all_results', [])
    sorted_results = sorted(all_results, key=lambda x: x['mean_score'], reverse=True)

    return sorted_results[:top_n]


# Create alias
cv_results = format_cv_results

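Since format_cv_results returns the rows already sorted best-first, a search result turns into a small leaderboard in a few lines (sketch using hand-built results; assumes the wheel is installed):

from ilovetools.ml import cv_results

search_out = {'all_results': [
    {'params': {'alpha': 0.5}, 'mean_score': -1.2},
    {'params': {'alpha': 1.0}, 'mean_score': -0.4},
    {'params': {'alpha': 1.5}, 'mean_score': -0.9},
]}
for rank, row in enumerate(cv_results(search_out, top_n=2), start=1):
    print(rank, row['params'], row['mean_score'])
# 1 {'alpha': 1.0} -0.4
# 2 {'alpha': 1.5} -0.9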
def learning_curve_data(
    X: List,
    y: List,
    model_func: Callable,
    metric_func: Callable,
    train_sizes: Optional[List[float]] = None
) -> Dict[str, List]:
    """
    Generate learning curve data.

    Alias: learning_curve()

    Shows how model performance changes with training set size.

    Args:
        X: Feature data
        y: Target data
        model_func: Function(X_train, y_train, X_val) -> predictions
        metric_func: Function(y_true, y_pred) -> score
        train_sizes: List of training set proportions. Default: [0.2, 0.4, 0.6, 0.8, 1.0]

    Returns:
        dict: Training sizes, train scores, validation scores

    Examples:
        >>> from ilovetools.ml import learning_curve  # Short alias
        >>> X = list(range(20))
        >>> y = [i * 2 for i in X]
        >>>
        >>> def model(X_tr, y_tr, X_val):
        ...     avg = sum(y_tr) / len(y_tr)
        ...     return [avg] * len(X_val)
        >>>
        >>> def metric(y_true, y_pred):
        ...     return -sum(abs(t - p) for t, p in zip(y_true, y_pred)) / len(y_true)
        >>>
        >>> curve = learning_curve(X, y, model, metric)

        >>> from ilovetools.ml import learning_curve_data  # Full name
        >>> curve = learning_curve_data(X, y, model, metric)

    Notes:
        - Diagnose overfitting/underfitting
        - Shows if more data helps
        - Plot train vs val scores
        - Use for model selection
    """
    if train_sizes is None:
        train_sizes = [0.2, 0.4, 0.6, 0.8, 1.0]

    from .cross_validation import holdout_validation_split

    train_scores = []
    val_scores = []

    for size in train_sizes:
        # Split data; floor test_size at 0.1 so the validation set is never
        # empty (size == 1.0 would otherwise request test_size == 0 and
        # crash most metrics on an empty fold)
        X_train, X_val, y_train, y_val = holdout_validation_split(
            X, y, test_size=max(1 - size, 0.1)
        )

        # Train and evaluate
        y_train_pred = model_func(X_train, y_train, X_train)
        y_val_pred = model_func(X_train, y_train, X_val)

        train_score = metric_func(y_train, y_train_pred)
        val_score = metric_func(y_val, y_val_pred)

        train_scores.append(train_score)
        val_scores.append(val_score)

    return {
        'train_sizes': train_sizes,
        'train_scores': train_scores,
        'val_scores': val_scores
    }


# Create alias
learning_curve = learning_curve_data

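The returned dict is plotting-ready; matplotlib is one option for eyeballing the gap between the two curves (it is not a dependency of this package). A sketch reusing the docstring's toy model:

from ilovetools.ml import learning_curve
import matplotlib.pyplot as plt  # optional, install separately

X = list(range(20))
y = [2 * i for i in X]

def model(X_tr, y_tr, X_val):
    avg = sum(y_tr) / len(y_tr)
    return [avg] * len(X_val)

def neg_mae(y_true, y_pred):
    return -sum(abs(t - p) for t, p in zip(y_true, y_pred)) / len(y_true)

curve = learning_curve(X, y, model, neg_mae)
plt.plot(curve['train_sizes'], curve['train_scores'], label='train')
plt.plot(curve['train_sizes'], curve['val_scores'], label='validation')
plt.xlabel('training fraction')
plt.ylabel('score (negative MAE)')
plt.legend()
plt.show()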
def validation_curve_data(
    X: List,
    y: List,
    model_func: Callable,
    metric_func: Callable,
    param_name: str,
    param_range: List
) -> Dict[str, List]:
    """
    Generate validation curve data.

    Alias: val_curve()

    Shows how model performance changes with a hyperparameter.

    Args:
        X: Feature data
        y: Target data
        model_func: Function(param_value, X_train, y_train, X_val) -> predictions
        metric_func: Function(y_true, y_pred) -> score
        param_name: Name of parameter to vary
        param_range: List of parameter values to try

    Returns:
        dict: Parameter values, train scores, validation scores

    Examples:
        >>> from ilovetools.ml import val_curve  # Short alias
        >>> X = list(range(10))
        >>> y = [i * 2 for i in X]
        >>>
        >>> def model(param_val, X_tr, y_tr, X_val):
        ...     avg = sum(y_tr) / len(y_tr)
        ...     return [avg * param_val] * len(X_val)
        >>>
        >>> def metric(y_true, y_pred):
        ...     return -sum(abs(t - p) for t, p in zip(y_true, y_pred)) / len(y_true)
        >>>
        >>> curve = val_curve(X, y, model, metric, 'alpha', [0.5, 1.0, 1.5])

        >>> from ilovetools.ml import validation_curve_data  # Full name
        >>> curve = validation_curve_data(X, y, model, metric, 'alpha', [0.5, 1.0])

    Notes:
        - Visualize hyperparameter impact
        - Find optimal parameter value
        - Detect overfitting
        - Use for tuning guidance
    """
    from .cross_validation import holdout_validation_split

    train_scores = []
    val_scores = []

    for param_value in param_range:
        # Split data
        X_train, X_val, y_train, y_val = holdout_validation_split(X, y)

        # Train and evaluate
        y_train_pred = model_func(param_value, X_train, y_train, X_train)
        y_val_pred = model_func(param_value, X_train, y_train, X_val)

        train_score = metric_func(y_train, y_train_pred)
        val_score = metric_func(y_val, y_val_pred)

        train_scores.append(train_score)
        val_scores.append(val_score)

    return {
        'param_name': param_name,
        'param_range': param_range,
        'train_scores': train_scores,
        'val_scores': val_scores
    }


# Create alias
val_curve = validation_curve_data

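Picking the winning value off a validation curve is a one-liner over val_scores; a sketch continuing the docstring example above (X, y, model, metric as defined there):

from ilovetools.ml import val_curve

curve = val_curve(X, y, model, metric, 'alpha', [0.5, 1.0, 1.5])
best_idx = max(range(len(curve['val_scores'])), key=curve['val_scores'].__getitem__)
print(curve['param_name'], '=', curve['param_range'][best_idx])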
def early_stopping_monitor(
    scores: List[float],
    patience: int = 5,
    min_delta: float = 0.001
) -> bool:
    """
    Monitor for early stopping.

    Alias: early_stop()

    Stops training if no improvement for patience epochs.

    Args:
        scores: List of validation scores (higher is better)
        patience: Number of epochs to wait. Default: 5
        min_delta: Minimum improvement threshold. Default: 0.001

    Returns:
        bool: True if should stop, False otherwise

    Examples:
        >>> from ilovetools.ml import early_stop  # Short alias
        >>> scores = [0.7, 0.75, 0.78, 0.78, 0.78, 0.78]
        >>> should_stop = early_stop(scores, patience=3)
        >>> print(should_stop)
        True

        >>> from ilovetools.ml import early_stopping_monitor  # Full name
        >>> should_stop = early_stopping_monitor(scores, patience=5)

    Notes:
        - Prevents overfitting
        - Saves training time
        - Common in neural networks
        - Adjust patience for stability
    """
    if len(scores) < patience + 1:
        return False

    best_score = max(scores[:-patience])
    recent_scores = scores[-patience:]

    # Check if any recent score improved
    for score in recent_scores:
        if score > best_score + min_delta:
            return False

    return True


# Create alias
early_stop = early_stopping_monitor

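In practice the monitor is called once per epoch on the growing score history. A standalone toy loop whose metric plateaus, to show the stop firing shortly after improvement stalls (assumes the wheel is installed):

from ilovetools.ml import early_stop

val_scores = []
for epoch in range(100):
    score = min(0.9, 0.5 + 0.05 * epoch)  # toy metric that plateaus at 0.9
    val_scores.append(score)
    if early_stop(val_scores, patience=3):
        print('stopping at epoch', epoch)  # fires a few epochs past the plateau
        break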
def compare_models_cv(
    X: List,
    y: List,
    models: Dict[str, Callable],
    metric_func: Callable,
    cv_splits: int = 5
) -> Dict[str, Dict]:
    """
    Compare multiple models using cross-validation.

    Alias: compare_models()

    Args:
        X: Feature data
        y: Target data
        models: Dict of model_name: model_func
        metric_func: Function(y_true, y_pred) -> score
        cv_splits: Number of CV folds. Default: 5

    Returns:
        dict: Results for each model

    Examples:
        >>> from ilovetools.ml import compare_models  # Short alias
        >>> X = [[1], [2], [3], [4], [5]]
        >>> y = [1, 2, 3, 4, 5]
        >>>
        >>> def model1(X_tr, y_tr, X_val):
        ...     avg = sum(y_tr) / len(y_tr)
        ...     return [avg] * len(X_val)
        >>>
        >>> def model2(X_tr, y_tr, X_val):
        ...     avg = sum(y_tr) / len(y_tr)
        ...     return [avg + 0.5] * len(X_val)
        >>>
        >>> def metric(y_true, y_pred):
        ...     return -sum(abs(t - p) for t, p in zip(y_true, y_pred)) / len(y_true)
        >>>
        >>> models = {'Model1': model1, 'Model2': model2}
        >>> results = compare_models(X, y, models, metric)

        >>> from ilovetools.ml import compare_models_cv  # Full name
        >>> results = compare_models_cv(X, y, models, metric)

    Notes:
        - Compare multiple algorithms
        - Fair comparison with same CV splits
        - Returns mean and std for each
        - Use for model selection
    """
    from .cross_validation import k_fold_cross_validation

    splits = k_fold_cross_validation(X, y, k=cv_splits)
    results = {}

    for model_name, model_func in models.items():
        scores = []

        for train_idx, val_idx in splits:
            X_train = [X[i] for i in train_idx]
            y_train = [y[i] for i in train_idx]
            X_val = [X[i] for i in val_idx]
            y_val = [y[i] for i in val_idx]

            y_pred = model_func(X_train, y_train, X_val)
            score = metric_func(y_val, y_pred)
            scores.append(score)

        mean_score = sum(scores) / len(scores)
        results[model_name] = {
            'mean_score': mean_score,
            'std_score': (sum((s - mean_score) ** 2 for s in scores) / len(scores)) ** 0.5,
            'scores': scores
        }

    return results


# Create alias
compare_models = compare_models_cv

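The per-model dicts sort directly into a ranking; a sketch continuing the docstring's model1/model2 setup (X, y, models, metric as defined there):

from ilovetools.ml import compare_models

results = compare_models(X, y, models, metric)
ranked = sorted(results.items(), key=lambda kv: kv[1]['mean_score'], reverse=True)
for name, stats in ranked:
    print(f"{name}: {stats['mean_score']:.3f} +/- {stats['std_score']:.3f}")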
def bayesian_search_simple(
    X: List,
    y: List,
    model_func: Callable,
    param_bounds: Dict[str, Tuple[float, float]],
    metric_func: Callable,
    n_iter: int = 10,
    cv_splits: int = 5
) -> Dict[str, Any]:
    """
    Simple Bayesian optimization for hyperparameter tuning.

    Alias: bayesopt()

    Uses past results to guide search. More efficient than random search.

    Args:
        X: Feature data
        y: Target data
        model_func: Function(params, X_train, y_train, X_val) -> predictions
        param_bounds: Dict of (min, max) tuples
        metric_func: Function(y_true, y_pred) -> score
        n_iter: Number of iterations. Default: 10
        cv_splits: Number of CV folds. Default: 5

    Returns:
        dict: Best parameters, best score, all results

    Examples:
        >>> from ilovetools.ml import bayesopt  # Short alias
        >>> X = [[1], [2], [3], [4], [5]]
        >>> y = [1, 2, 3, 4, 5]
        >>>
        >>> def model(params, X_tr, y_tr, X_val):
        ...     alpha = params['alpha']
        ...     avg = sum(y_tr) / len(y_tr)
        ...     return [avg * alpha] * len(X_val)
        >>>
        >>> def metric(y_true, y_pred):
        ...     return -sum(abs(t - p) for t, p in zip(y_true, y_pred)) / len(y_true)
        >>>
        >>> bounds = {'alpha': (0.5, 1.5)}
        >>> results = bayesopt(X, y, model, bounds, metric, n_iter=5)

        >>> from ilovetools.ml import bayesian_search_simple  # Full name
        >>> results = bayesian_search_simple(X, y, model, bounds, metric)

    Notes:
        - More efficient than random search
        - Learns from past evaluations
        - Good for expensive models
        - Simplified implementation
    """
    from .cross_validation import k_fold_cross_validation

    results = []

    # Initial random samples
    n_random = min(3, n_iter)

    for i in range(n_iter):
        if i < n_random:
            # Random sampling initially
            params = {
                name: random.uniform(bounds[0], bounds[1])
                for name, bounds in param_bounds.items()
            }
        else:
            # Exploit best region
            best_params = max(results, key=lambda x: x['mean_score'])['params']
            params = {
                name: best_params[name] + random.uniform(-0.1, 0.1) * (bounds[1] - bounds[0])
                for name, bounds in param_bounds.items()
            }
            # Clip to bounds
            params = {
                name: max(param_bounds[name][0], min(param_bounds[name][1], val))
                for name, val in params.items()
            }

        # Evaluate
        splits = k_fold_cross_validation(X, y, k=cv_splits)
        scores = []

        for train_idx, val_idx in splits:
            X_train = [X[i] for i in train_idx]
            y_train = [y[i] for i in train_idx]
            X_val = [X[i] for i in val_idx]
            y_val = [y[i] for i in val_idx]

            y_pred = model_func(params, X_train, y_train, X_val)
            score = metric_func(y_val, y_pred)
            scores.append(score)

        mean_score = sum(scores) / len(scores)

        results.append({
            'params': params,
            'mean_score': mean_score,
            'scores': scores
        })

    # Find best
    best_result = max(results, key=lambda x: x['mean_score'])

    return {
        'best_params': best_result['params'],
        'best_score': best_result['mean_score'],
        'all_results': results,
        'n_iterations': n_iter
    }


# Create alias
bayesopt = bayesian_search_simple
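Chaining the search into extract_best_params closes the loop from tuning to training. An end-to-end sketch (assumes the wheel is installed; exact numbers vary run to run since this sampler takes no seed):

from ilovetools.ml import bayesopt, best_params

X = [[i] for i in range(1, 11)]
y = [float(i) for i in range(1, 11)]

def model(params, X_tr, y_tr, X_val):
    avg = sum(y_tr) / len(y_tr)
    return [avg * params['alpha']] * len(X_val)

def neg_mae(y_true, y_pred):
    return -sum(abs(t - p) for t, p in zip(y_true, y_pred)) / len(y_true)

out = bayesopt(X, y, model, {'alpha': (0.5, 1.5)}, neg_mae, n_iter=8)
print(best_params(out))  # e.g. {'alpha': 1.0...}; the mean predictor favors alpha near 1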