ilovetools 0.2.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ilovetools/__init__.py +42 -0
- ilovetools/ai/__init__.py +13 -0
- ilovetools/ai/embeddings.py +270 -0
- ilovetools/ai/inference.py +5 -0
- ilovetools/ai/llm_helpers.py +141 -0
- ilovetools/audio/__init__.py +5 -0
- ilovetools/automation/__init__.py +5 -0
- ilovetools/conversion/__init__.py +5 -0
- ilovetools/data/__init__.py +27 -0
- ilovetools/data/feature_engineering.py +497 -0
- ilovetools/data/preprocessing.py +234 -0
- ilovetools/database/__init__.py +5 -0
- ilovetools/datetime/__init__.py +5 -0
- ilovetools/files/__init__.py +5 -0
- ilovetools/image/__init__.py +5 -0
- ilovetools/ml/__init__.py +603 -0
- ilovetools/ml/clustering.py +1107 -0
- ilovetools/ml/cross_validation.py +612 -0
- ilovetools/ml/dimensionality.py +1001 -0
- ilovetools/ml/ensemble.py +872 -0
- ilovetools/ml/feature_selection.py +971 -0
- ilovetools/ml/imbalanced.py +797 -0
- ilovetools/ml/interpretation.py +915 -0
- ilovetools/ml/metrics.py +601 -0
- ilovetools/ml/pipeline.py +711 -0
- ilovetools/ml/timeseries.py +984 -0
- ilovetools/ml/tuning.py +781 -0
- ilovetools/security/__init__.py +5 -0
- ilovetools/text/__init__.py +5 -0
- ilovetools/utils/__init__.py +5 -0
- ilovetools/validation/__init__.py +5 -0
- ilovetools/web/__init__.py +5 -0
- ilovetools-0.2.3.dist-info/METADATA +143 -0
- ilovetools-0.2.3.dist-info/RECORD +38 -0
- ilovetools-0.2.3.dist-info/WHEEL +5 -0
- ilovetools-0.2.3.dist-info/licenses/LICENSE +21 -0
- ilovetools-0.2.3.dist-info/top_level.txt +2 -0
- tests/__init__.py +3 -0
ilovetools/ml/cross_validation.py
@@ -0,0 +1,612 @@
"""
Cross-validation utilities for ML workflows
Each function has TWO names: full descriptive name + abbreviated alias
"""

from typing import List, Tuple, Dict, Callable, Any, Optional
import random

__all__ = [
    # Full names
    'k_fold_cross_validation',
    'stratified_k_fold',
    'time_series_split',
    'leave_one_out_cv',
    'shuffle_split_cv',
    'cross_validate_score',
    'holdout_validation_split',
    'train_val_test_split',
    # Abbreviated aliases
    'kfold',
    'skfold',
    'tssplit',
    'loocv',
    'shuffle_cv',
    'cv_score',
    'holdout',
    'tvt_split',
]

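Every utility below is exported under both its full name and its alias, so the two import styles are interchangeable. A quick sanity check of the aliasing, assuming the wheel is installed and ilovetools/ml/__init__.py re-exports these names as the docstrings suggest:

from ilovetools.ml import kfold, k_fold_cross_validation

# The alias is the same function object, not a wrapper.
assert kfold is k_fold_cross_validation
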
def k_fold_cross_validation(
    X: List,
    y: List,
    k: int = 5,
    shuffle: bool = True,
    random_state: Optional[int] = None
) -> List[Tuple[List[int], List[int]]]:
    """
    K-Fold Cross-Validation split.

    Alias: kfold()

    Splits data into K folds. Each fold is used once as validation
    while the remaining K-1 folds form the training set.

    Args:
        X: Feature data
        y: Target data
        k: Number of folds. Default: 5
        shuffle: Shuffle data before splitting. Default: True
        random_state: Random seed for reproducibility

    Returns:
        list: List of (train_indices, val_indices) tuples

    Examples:
        >>> from ilovetools.ml import kfold  # Short alias
        >>> X = list(range(10))
        >>> y = [0, 1, 0, 1, 0, 1, 0, 1, 0, 1]
        >>> folds = kfold(X, y, k=5)
        >>> len(folds)
        5
        >>> train_idx, val_idx = folds[0]
        >>> len(train_idx), len(val_idx)
        (8, 2)

        >>> from ilovetools.ml import k_fold_cross_validation  # Full name
        >>> folds = k_fold_cross_validation(X, y, k=3)

    Notes:
        - Most common CV method
        - Use k=5 or k=10 typically
        - Larger k = more training data per fold
        - Smaller k = faster computation
    """
    if len(X) != len(y):
        raise ValueError("X and y must have same length")

    n = len(X)
    indices = list(range(n))

    if shuffle:
        if random_state is not None:
            random.seed(random_state)
        random.shuffle(indices)

    fold_size = n // k
    folds = []

    for i in range(k):
        start = i * fold_size
        # The last fold absorbs the remainder when n is not divisible by k
        end = start + fold_size if i < k - 1 else n

        val_indices = indices[start:end]
        train_indices = indices[:start] + indices[end:]

        folds.append((train_indices, val_indices))

    return folds


# Create alias
kfold = k_fold_cross_validation

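A minimal end-to-end sketch of consuming the index pairs kfold returns; the slicing loop is illustrative, not part of the package:

from ilovetools.ml import kfold

X = list(range(10))
y = [0, 1] * 5

# Fixing random_state makes the shuffled folds reproducible.
for train_idx, val_idx in kfold(X, y, k=5, random_state=42):
    X_train = [X[i] for i in train_idx]
    y_train = [y[i] for i in train_idx]
    X_val = [X[i] for i in val_idx]
    y_val = [y[i] for i in val_idx]
    # fit a model on (X_train, y_train), then score it on (X_val, y_val)
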
def stratified_k_fold(
    X: List,
    y: List,
    k: int = 5,
    shuffle: bool = True,
    random_state: Optional[int] = None
) -> List[Tuple[List[int], List[int]]]:
    """
    Stratified K-Fold Cross-Validation split.

    Alias: skfold()

    Like K-Fold but maintains class distribution in each fold.
    Essential for imbalanced datasets.

    Args:
        X: Feature data
        y: Target data (class labels)
        k: Number of folds. Default: 5
        shuffle: Shuffle data before splitting. Default: True
        random_state: Random seed for reproducibility

    Returns:
        list: List of (train_indices, val_indices) tuples

    Examples:
        >>> from ilovetools.ml import skfold  # Short alias
        >>> X = list(range(10))
        >>> y = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]  # Balanced
        >>> folds = skfold(X, y, k=5)

        >>> from ilovetools.ml import stratified_k_fold  # Full name
        >>> folds = stratified_k_fold(X, y, k=3)

        # Imbalanced dataset
        >>> y_imbalanced = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1]  # 80-20 split
        >>> folds = skfold(X, y_imbalanced, k=2)
        # With k=2, each fold keeps the 80-20 ratio (4 zeros, 1 one per fold)

    Notes:
        - Use for imbalanced datasets
        - Maintains class distribution (each class needs at least k samples
          to appear in every fold; per-class leftovers go to the last fold)
        - More reliable than regular K-Fold
        - Slightly slower than K-Fold
    """
    if len(X) != len(y):
        raise ValueError("X and y must have same length")

    # Group indices by class
    class_indices = {}
    for idx, label in enumerate(y):
        if label not in class_indices:
            class_indices[label] = []
        class_indices[label].append(idx)

    # Shuffle within each class
    if shuffle:
        if random_state is not None:
            random.seed(random_state)
        for label in class_indices:
            random.shuffle(class_indices[label])

    # Create folds maintaining class distribution
    folds = [[] for _ in range(k)]

    for label, indices in class_indices.items():
        fold_size = len(indices) // k
        for i in range(k):
            start = i * fold_size
            end = start + fold_size if i < k - 1 else len(indices)
            folds[i].extend(indices[start:end])

    # Convert to train/val splits
    result = []
    all_indices = list(range(len(X)))

    for val_indices in folds:
        train_indices = [idx for idx in all_indices if idx not in val_indices]
        result.append((train_indices, val_indices))

    return result


# Create alias
skfold = stratified_k_fold

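A sketch verifying the stratification, using a 2:1 class mix that divides evenly across k=5 folds; the per-class counting is my illustrative check, not package code:

from ilovetools.ml import skfold

X = list(range(15))
y = [0] * 10 + [1] * 5  # 2:1 class imbalance

for train_idx, val_idx in skfold(X, y, k=5, random_state=0):
    counts = [sum(1 for i in val_idx if y[i] == c) for c in (0, 1)]
    print(counts)  # [2, 1] in every fold: the 2:1 mix is preserved
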
def time_series_split(
    X: List,
    y: List,
    n_splits: int = 5,
    test_size: Optional[int] = None
) -> List[Tuple[List[int], List[int]]]:
    """
    Time Series Cross-Validation split.

    Alias: tssplit()

    Respects temporal order. Training set always comes before test set.
    No future data leakage!

    Args:
        X: Feature data (time-ordered)
        y: Target data (time-ordered)
        n_splits: Number of splits. Default: 5
        test_size: Number of samples per test set. If None, defaults to
            len(X) // (n_splits + 1); the training window expands either way

    Returns:
        list: List of (train_indices, test_indices) tuples

    Examples:
        >>> from ilovetools.ml import tssplit  # Short alias
        >>> X = list(range(10))
        >>> y = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
        >>> splits = tssplit(X, y, n_splits=3)
        >>> len(splits)
        3

        >>> from ilovetools.ml import time_series_split  # Full name
        >>> splits = time_series_split(X, y, n_splits=5)

        # Stock price prediction
        >>> prices = [100, 102, 101, 105, 103, 107, 110, 108, 112, 115]
        >>> dates = list(range(len(prices)))
        >>> splits = tssplit(dates, prices, n_splits=3)
        # Each split: train on past, test on future

    Notes:
        - Essential for time series data
        - Prevents future data leakage
        - Training set grows over time
        - Use for: Stock prices, weather, sales
    """
    if len(X) != len(y):
        raise ValueError("X and y must have same length")

    n = len(X)

    if test_size is None:
        test_size = n // (n_splits + 1)

    splits = []

    for i in range(n_splits):
        test_start = (i + 1) * test_size
        test_end = test_start + test_size

        if test_end > n:
            break

        # Everything before the test window is training data
        train_indices = list(range(test_start))
        test_indices = list(range(test_start, test_end))

        splits.append((train_indices, test_indices))

    return splits


# Create alias
tssplit = time_series_split

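A sketch of the expanding window this produces; the index layout in the comments follows from test_size defaulting to 10 // (3 + 1) == 2:

from ilovetools.ml import tssplit

X = list(range(10))
y = list(range(1, 11))

for train_idx, test_idx in tssplit(X, y, n_splits=3):
    print(train_idx, test_idx)
# [0, 1] [2, 3]
# [0, 1, 2, 3] [4, 5]
# [0, 1, 2, 3, 4, 5] [6, 7]
# Note the tail (indices 8-9) is never used as a test window here.
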
def leave_one_out_cv(X: List, y: List) -> List[Tuple[List[int], List[int]]]:
    """
    Leave-One-Out Cross-Validation.

    Alias: loocv()

    Each sample is used once as validation, rest as training.
    Maximum training data but computationally expensive.

    Args:
        X: Feature data
        y: Target data

    Returns:
        list: List of (train_indices, val_indices) tuples

    Examples:
        >>> from ilovetools.ml import loocv  # Short alias
        >>> X = [1, 2, 3, 4, 5]
        >>> y = [10, 20, 30, 40, 50]
        >>> splits = loocv(X, y)
        >>> len(splits)
        5
        >>> train_idx, val_idx = splits[0]
        >>> len(train_idx), len(val_idx)
        (4, 1)

        >>> from ilovetools.ml import leave_one_out_cv  # Full name
        >>> splits = leave_one_out_cv(X, y)

    Notes:
        - Maximum training data per fold
        - Very computationally expensive
        - Use for small datasets only
        - n_splits = n_samples
    """
    if len(X) != len(y):
        raise ValueError("X and y must have same length")

    n = len(X)
    splits = []

    for i in range(n):
        train_indices = list(range(i)) + list(range(i + 1, n))
        val_indices = [i]
        splits.append((train_indices, val_indices))

    return splits


# Create alias
loocv = leave_one_out_cv

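A sketch of scoring a toy mean predictor with LOOCV; the predictor and error loop are illustrative, not part of the package:

from ilovetools.ml import loocv

X = [1, 2, 3, 4, 5]
y = [10, 20, 30, 40, 50]

errors = []
for train_idx, val_idx in loocv(X, y):
    # Toy model: predict the mean of the training targets
    pred = sum(y[i] for i in train_idx) / len(train_idx)
    errors.append(abs(y[val_idx[0]] - pred))

print(sum(errors) / len(errors))  # LOOCV mean absolute error
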
def shuffle_split_cv(
    X: List,
    y: List,
    n_splits: int = 10,
    test_size: float = 0.2,
    random_state: Optional[int] = None
) -> List[Tuple[List[int], List[int]]]:
    """
    Shuffle Split Cross-Validation.

    Alias: shuffle_cv()

    Random permutation CV. Creates random train/test splits.

    Args:
        X: Feature data
        y: Target data
        n_splits: Number of splits. Default: 10
        test_size: Proportion of test set. Default: 0.2
        random_state: Random seed for reproducibility

    Returns:
        list: List of (train_indices, test_indices) tuples

    Examples:
        >>> from ilovetools.ml import shuffle_cv  # Short alias
        >>> X = list(range(10))
        >>> y = [0, 1] * 5
        >>> splits = shuffle_cv(X, y, n_splits=5, test_size=0.3)
        >>> len(splits)
        5

        >>> from ilovetools.ml import shuffle_split_cv  # Full name
        >>> splits = shuffle_split_cv(X, y, n_splits=3)

    Notes:
        - Random train/test splits
        - Samples can appear in multiple test sets
        - Good for large datasets
        - More flexible than K-Fold
    """
    if len(X) != len(y):
        raise ValueError("X and y must have same length")

    n = len(X)
    n_test = int(n * test_size)

    if random_state is not None:
        random.seed(random_state)

    splits = []

    for _ in range(n_splits):
        indices = list(range(n))
        random.shuffle(indices)

        test_indices = indices[:n_test]
        train_indices = indices[n_test:]

        splits.append((train_indices, test_indices))

    return splits


# Create alias
shuffle_cv = shuffle_split_cv

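A sketch showing the key difference from K-Fold: because each split reshuffles independently, test sets can overlap (the set arithmetic is my illustration):

from ilovetools.ml import shuffle_cv

X = list(range(10))
y = [0, 1] * 5

splits = shuffle_cv(X, y, n_splits=4, test_size=0.3, random_state=7)
test_sets = [set(test_idx) for _, test_idx in splits]
print(test_sets[0] & test_sets[1])  # may be non-empty, unlike kfold
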
def cross_validate_score(
    X: List,
    y: List,
    model_func: Callable,
    metric_func: Callable,
    cv_method: str = 'kfold',
    k: int = 5
) -> Dict[str, Any]:
    """
    Perform cross-validation and return scores.

    Alias: cv_score()

    Args:
        X: Feature data
        y: Target data
        model_func: Callable (X_train, y_train, X_val) -> predictions for X_val
        metric_func: Callable (y_true, y_pred) -> score
        cv_method: CV method ('kfold', 'stratified', 'timeseries');
            any other value falls back to 'kfold'
        k: Number of folds

    Returns:
        dict: CV results with scores and statistics

    Examples:
        >>> from ilovetools.ml import cv_score  # Short alias
        >>> X = [[1], [2], [3], [4], [5]]
        >>> y = [1, 2, 3, 4, 5]
        >>>
        >>> def simple_model(X_train, y_train, X_val):
        ...     # Simple average predictor
        ...     avg = sum(y_train) / len(y_train)
        ...     return [avg] * len(X_val)
        >>>
        >>> def mae_metric(y_true, y_pred):
        ...     return sum(abs(t - p) for t, p in zip(y_true, y_pred)) / len(y_true)
        >>>
        >>> results = cv_score(X, y, simple_model, mae_metric, k=3)
        >>> print(results['mean_score'])

        >>> from ilovetools.ml import cross_validate_score  # Full name
        >>> results = cross_validate_score(X, y, simple_model, mae_metric)

    Notes:
        - Automates CV workflow
        - Returns mean, std, all scores
        - Flexible with any model/metric
        - Easy model comparison
    """
    # Get CV splits
    if cv_method == 'kfold':
        splits = k_fold_cross_validation(X, y, k=k)
    elif cv_method == 'stratified':
        splits = stratified_k_fold(X, y, k=k)
    elif cv_method == 'timeseries':
        splits = time_series_split(X, y, n_splits=k)
    else:
        # Unknown method: fall back to plain k-fold
        splits = k_fold_cross_validation(X, y, k=k)

    scores = []

    for train_idx, val_idx in splits:
        X_train = [X[i] for i in train_idx]
        y_train = [y[i] for i in train_idx]
        X_val = [X[i] for i in val_idx]
        y_val = [y[i] for i in val_idx]

        # Train and predict
        y_pred = model_func(X_train, y_train, X_val)

        # Calculate metric
        score = metric_func(y_val, y_pred)
        scores.append(score)

    mean_score = sum(scores) / len(scores)
    std_score = (sum((s - mean_score) ** 2 for s in scores) / len(scores)) ** 0.5

    return {
        'scores': scores,
        'mean_score': mean_score,
        'std_score': std_score,
        'n_splits': len(splits)
    }


# Create alias
cv_score = cross_validate_score

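Since the returned dict makes model comparison easy, here is a sketch comparing two toy predictors under the same metric; both model functions are illustrative stand-ins:

from ilovetools.ml import cv_score

X = [[1], [2], [3], [4], [5], [6]]
y = [2, 4, 6, 8, 10, 12]

def mean_model(X_train, y_train, X_val):
    avg = sum(y_train) / len(y_train)
    return [avg] * len(X_val)

def last_value_model(X_train, y_train, X_val):
    return [y_train[-1]] * len(X_val)

def mae(y_true, y_pred):
    return sum(abs(t - p) for t, p in zip(y_true, y_pred)) / len(y_true)

for model in (mean_model, last_value_model):
    res = cv_score(X, y, model, mae, cv_method='kfold', k=3)
    print(model.__name__, res['mean_score'], res['std_score'])
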
def holdout_validation_split(
    X: List,
    y: List,
    test_size: float = 0.2,
    random_state: Optional[int] = None
) -> Tuple[List, List, List, List]:
    """
    Simple holdout validation split.

    Alias: holdout()

    Single train/test split. Fast but less reliable than CV.

    Args:
        X: Feature data
        y: Target data
        test_size: Proportion of test set. Default: 0.2
        random_state: Random seed for reproducibility

    Returns:
        tuple: (X_train, X_test, y_train, y_test)

    Examples:
        >>> from ilovetools.ml import holdout  # Short alias
        >>> X = list(range(10))
        >>> y = [0, 1] * 5
        >>> X_train, X_test, y_train, y_test = holdout(X, y, test_size=0.3)
        >>> len(X_train), len(X_test)
        (7, 3)

        >>> from ilovetools.ml import holdout_validation_split  # Full name
        >>> X_train, X_test, y_train, y_test = holdout_validation_split(X, y)

    Notes:
        - Fastest validation method
        - Less reliable than CV
        - Use for quick experiments
        - Good for large datasets
    """
    if len(X) != len(y):
        raise ValueError("X and y must have same length")

    n = len(X)
    n_test = int(n * test_size)

    indices = list(range(n))

    if random_state is not None:
        random.seed(random_state)
    random.shuffle(indices)

    test_indices = indices[:n_test]
    train_indices = indices[n_test:]

    X_train = [X[i] for i in train_indices]
    X_test = [X[i] for i in test_indices]
    y_train = [y[i] for i in train_indices]
    y_test = [y[i] for i in test_indices]

    return X_train, X_test, y_train, y_test


# Create alias
holdout = holdout_validation_split

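A sketch of the reproducibility contract: the same random_state yields the same split on every call (the equality check is my illustration):

from ilovetools.ml import holdout

X = list(range(100))
y = [i % 2 for i in range(100)]

a = holdout(X, y, test_size=0.2, random_state=1)
b = holdout(X, y, test_size=0.2, random_state=1)
assert a == b  # identical (X_train, X_test, y_train, y_test)
print(len(a[0]), len(a[1]))  # 80 20
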
def train_val_test_split(
    X: List,
    y: List,
    val_size: float = 0.2,
    test_size: float = 0.2,
    random_state: Optional[int] = None
) -> Tuple[List, List, List, List, List, List]:
    """
    Three-way split: train, validation, test.

    Alias: tvt_split()

    Creates separate train, validation, and test sets.

    Args:
        X: Feature data
        y: Target data
        val_size: Proportion of validation set. Default: 0.2
        test_size: Proportion of test set. Default: 0.2
        random_state: Random seed for reproducibility

    Returns:
        tuple: (X_train, X_val, X_test, y_train, y_val, y_test)

    Examples:
        >>> from ilovetools.ml import tvt_split  # Short alias
        >>> X = list(range(10))
        >>> y = [0, 1] * 5
        >>> X_tr, X_val, X_te, y_tr, y_val, y_te = tvt_split(X, y)
        >>> len(X_tr), len(X_val), len(X_te)
        (6, 2, 2)

        >>> from ilovetools.ml import train_val_test_split  # Full name
        >>> splits = train_val_test_split(X, y, val_size=0.15, test_size=0.15)

    Notes:
        - Standard ML workflow split
        - Train: Model training
        - Val: Hyperparameter tuning
        - Test: Final evaluation
        - Typical: 60-20-20 or 70-15-15
    """
    if len(X) != len(y):
        raise ValueError("X and y must have same length")

    n = len(X)
    n_test = int(n * test_size)
    n_val = int(n * val_size)

    indices = list(range(n))

    if random_state is not None:
        random.seed(random_state)
    random.shuffle(indices)

    test_indices = indices[:n_test]
    val_indices = indices[n_test:n_test + n_val]
    train_indices = indices[n_test + n_val:]

    X_train = [X[i] for i in train_indices]
    X_val = [X[i] for i in val_indices]
    X_test = [X[i] for i in test_indices]
    y_train = [y[i] for i in train_indices]
    y_val = [y[i] for i in val_indices]
    y_test = [y[i] for i in test_indices]

    return X_train, X_val, X_test, y_train, y_val, y_test


# Create alias
tvt_split = train_val_test_split
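
A closing sketch of the default 60-20-20 behavior on a slightly larger toy set:

from ilovetools.ml import tvt_split

X = list(range(20))
y = [0, 1] * 10

X_tr, X_val, X_te, y_tr, y_val, y_te = tvt_split(X, y, random_state=3)
print(len(X_tr), len(X_val), len(X_te))  # 12 4 4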