ilovetools 0.2.3__py3-none-any.whl

@@ -0,0 +1,612 @@
+ """
+ Cross-validation utilities for ML workflows
+ 
+ Each function has TWO names: full descriptive name + abbreviated alias
+ """
+ 
+ from typing import List, Tuple, Dict, Callable, Any, Optional
+ import random
+ 
+ __all__ = [
+     # Full names
+     'k_fold_cross_validation',
+     'stratified_k_fold',
+     'time_series_split',
+     'leave_one_out_cv',
+     'shuffle_split_cv',
+     'cross_validate_score',
+     'holdout_validation_split',
+     'train_val_test_split',
+     # Abbreviated aliases
+     'kfold',
+     'skfold',
+     'tssplit',
+     'loocv',
+     'shuffle_cv',
+     'cv_score',
+     'holdout',
+     'tvt_split',
+ ]
+ 
+ 
+ def k_fold_cross_validation(
+     X: List,
+     y: List,
+     k: int = 5,
+     shuffle: bool = True,
+     random_state: Optional[int] = None
+ ) -> List[Tuple[List[int], List[int]]]:
+     """
+     K-Fold Cross-Validation split.
+ 
+     Alias: kfold()
+ 
+     Splits the data into K folds. Each fold is used once as the validation
+     set while the remaining K-1 folds form the training set.
+ 
+     Args:
+         X: Feature data
+         y: Target data
+         k: Number of folds. Default: 5
+         shuffle: Shuffle data before splitting. Default: True
+         random_state: Random seed for reproducibility
+ 
+     Returns:
+         list: List of (train_indices, val_indices) tuples
+ 
+     Examples:
+         >>> from ilovetools.ml import kfold  # Short alias
+         >>> X = list(range(10))
+         >>> y = [0, 1, 0, 1, 0, 1, 0, 1, 0, 1]
+         >>> folds = kfold(X, y, k=5)
+         >>> len(folds)
+         5
+         >>> train_idx, val_idx = folds[0]
+         >>> len(train_idx), len(val_idx)
+         (8, 2)
+ 
+         >>> from ilovetools.ml import k_fold_cross_validation  # Full name
+         >>> folds = k_fold_cross_validation(X, y, k=3)
+ 
+     Notes:
+         - Most common CV method
+         - k=5 or k=10 is typical
+         - Larger k = more training data per fold
+         - Smaller k = faster computation
+     """
+     if len(X) != len(y):
+         raise ValueError("X and y must have same length")
+ 
+     n = len(X)
+     indices = list(range(n))
+ 
+     if shuffle:
+         if random_state is not None:
+             random.seed(random_state)
+         random.shuffle(indices)
+ 
+     fold_size = n // k
+     folds = []
+ 
+     for i in range(k):
+         start = i * fold_size
+         # The last fold absorbs the remainder when n is not divisible by k
+         end = start + fold_size if i < k - 1 else n
+ 
+         val_indices = indices[start:end]
+         train_indices = indices[:start] + indices[end:]
+ 
+         folds.append((train_indices, val_indices))
+ 
+     return folds
+ 
+ 
+ # Create alias
+ kfold = k_fold_cross_validation
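+ 
+ 
+ # Minimal usage sketch: the folds are index lists, so callers slice the
+ # data themselves. The running-mean predictor below is a throwaway
+ # stand-in for a real model's fit/predict step, not part of the API.
+ def _demo_kfold():
+     X = list(range(10))
+     y = [0, 1] * 5
+     for train_idx, val_idx in kfold(X, y, k=5, random_state=0):
+         y_train = [y[i] for i in train_idx]
+         avg = sum(y_train) / len(y_train)   # "fit": mean of training targets
+         preds = [avg] * len(val_idx)        # "predict": constant guess
+         assert len(preds) == len(val_idx)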
+ 
+ 
+ def stratified_k_fold(
+     X: List,
+     y: List,
+     k: int = 5,
+     shuffle: bool = True,
+     random_state: Optional[int] = None
+ ) -> List[Tuple[List[int], List[int]]]:
+     """
+     Stratified K-Fold Cross-Validation split.
+ 
+     Alias: skfold()
+ 
+     Like K-Fold, but preserves the class distribution in each fold.
+     Essential for imbalanced datasets.
+ 
+     Args:
+         X: Feature data
+         y: Target data (class labels)
+         k: Number of folds. Default: 5
+         shuffle: Shuffle data before splitting. Default: True
+         random_state: Random seed for reproducibility
+ 
+     Returns:
+         list: List of (train_indices, val_indices) tuples
+ 
+     Examples:
+         >>> from ilovetools.ml import skfold  # Short alias
+         >>> X = list(range(10))
+         >>> y = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]  # Balanced
+         >>> folds = skfold(X, y, k=5)
+ 
+         >>> from ilovetools.ml import stratified_k_fold  # Full name
+         >>> folds = stratified_k_fold(X, y, k=3)
+ 
+         >>> # Imbalanced dataset: each fold keeps roughly the 80-20 ratio
+         >>> y_imbalanced = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1]
+         >>> folds = skfold(X, y_imbalanced, k=5)
+ 
+     Notes:
+         - Use for imbalanced datasets
+         - Preserves class distribution (up to rounding per fold)
+         - More reliable than regular K-Fold for classification
+         - Slightly slower than K-Fold
+     """
+     if len(X) != len(y):
+         raise ValueError("X and y must have same length")
+ 
+     # Group indices by class
+     class_indices = {}
+     for idx, label in enumerate(y):
+         if label not in class_indices:
+             class_indices[label] = []
+         class_indices[label].append(idx)
+ 
+     # Shuffle within each class
+     if shuffle:
+         if random_state is not None:
+             random.seed(random_state)
+         for label in class_indices:
+             random.shuffle(class_indices[label])
+ 
+     # Create folds maintaining class distribution
+     folds = [[] for _ in range(k)]
+ 
+     for label, indices in class_indices.items():
+         fold_size = len(indices) // k
+         for i in range(k):
+             start = i * fold_size
+             # The last fold absorbs the remainder for this class
+             end = start + fold_size if i < k - 1 else len(indices)
+             folds[i].extend(indices[start:end])
+ 
+     # Convert to train/val splits (use a set for O(1) membership checks)
+     result = []
+     all_indices = list(range(len(X)))
+ 
+     for val_indices in folds:
+         val_set = set(val_indices)
+         train_indices = [idx for idx in all_indices if idx not in val_set]
+         result.append((train_indices, val_indices))
+ 
+     return result
+ 
+ 
+ # Create alias
+ skfold = stratified_k_fold
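+ 
+ 
+ # Minimal check sketch: count the labels in each validation fold to see
+ # the class ratio being preserved. Counter is from the standard library.
+ def _demo_skfold():
+     from collections import Counter
+     X = list(range(10))
+     y = [0] * 8 + [1] * 2  # 80-20 imbalance
+     for _, val_idx in skfold(X, y, k=2, random_state=0):
+         print(Counter(y[i] for i in val_idx))  # Counter({0: 4, 1: 1}) per fold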
+ 
+ 
+ def time_series_split(
+     X: List,
+     y: List,
+     n_splits: int = 5,
+     test_size: Optional[int] = None
+ ) -> List[Tuple[List[int], List[int]]]:
+     """
+     Time Series Cross-Validation split.
+ 
+     Alias: tssplit()
+ 
+     Respects temporal order: the training set always precedes the test
+     set, so no future data leaks into training.
+ 
+     Args:
+         X: Feature data (time-ordered)
+         y: Target data (time-ordered)
+         n_splits: Number of splits. Default: 5
+         test_size: Size of each test set. If None, defaults to
+             len(X) // (n_splits + 1)
+ 
+     Returns:
+         list: List of (train_indices, test_indices) tuples
+ 
+     Examples:
+         >>> from ilovetools.ml import tssplit  # Short alias
+         >>> X = list(range(10))
+         >>> y = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+         >>> splits = tssplit(X, y, n_splits=3)
+         >>> len(splits)
+         3
+ 
+         >>> from ilovetools.ml import time_series_split  # Full name
+         >>> splits = time_series_split(X, y, n_splits=5)
+ 
+         >>> # Stock price prediction: train on the past, test on the future
+         >>> prices = [100, 102, 101, 105, 103, 107, 110, 108, 112, 115]
+         >>> dates = list(range(len(prices)))
+         >>> splits = tssplit(dates, prices, n_splits=3)
+ 
+     Notes:
+         - Essential for time series data
+         - Prevents future data leakage
+         - Training set grows over time (expanding window)
+         - Use for: stock prices, weather, sales
+     """
+     if len(X) != len(y):
+         raise ValueError("X and y must have same length")
+ 
+     n = len(X)
+ 
+     if test_size is None:
+         test_size = n // (n_splits + 1)
+ 
+     splits = []
+ 
+     for i in range(n_splits):
+         test_start = (i + 1) * test_size
+         test_end = test_start + test_size
+ 
+         # Stop early if the next test window would run past the data,
+         # so fewer than n_splits splits may be returned
+         if test_end > n:
+             break
+ 
+         train_indices = list(range(test_start))
+         test_indices = list(range(test_start, test_end))
+ 
+         splits.append((train_indices, test_indices))
+ 
+     return splits
+ 
+ 
+ # Create alias
+ tssplit = time_series_split
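+ 
+ 
+ # Minimal sketch showing the expanding window: each successive split
+ # trains on a longer prefix and tests on the block immediately after it.
+ def _demo_tssplit():
+     X = list(range(10))
+     y = list(range(10))
+     for train_idx, test_idx in tssplit(X, y, n_splits=3):
+         print(len(train_idx), test_idx)
+     # 2 [2, 3]
+     # 4 [4, 5]
+     # 6 [6, 7]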
+ 
+ 
+ def leave_one_out_cv(X: List, y: List) -> List[Tuple[List[int], List[int]]]:
+     """
+     Leave-One-Out Cross-Validation.
+ 
+     Alias: loocv()
+ 
+     Each sample is used once as the validation set, with the rest as
+     training data. Maximum training data, but computationally expensive.
+ 
+     Args:
+         X: Feature data
+         y: Target data
+ 
+     Returns:
+         list: List of (train_indices, val_indices) tuples
+ 
+     Examples:
+         >>> from ilovetools.ml import loocv  # Short alias
+         >>> X = [1, 2, 3, 4, 5]
+         >>> y = [10, 20, 30, 40, 50]
+         >>> splits = loocv(X, y)
+         >>> len(splits)
+         5
+         >>> train_idx, val_idx = splits[0]
+         >>> len(train_idx), len(val_idx)
+         (4, 1)
+ 
+         >>> from ilovetools.ml import leave_one_out_cv  # Full name
+         >>> splits = leave_one_out_cv(X, y)
+ 
+     Notes:
+         - Maximum training data per fold
+         - Very computationally expensive
+         - Use for small datasets only
+         - n_splits = n_samples
+     """
+     if len(X) != len(y):
+         raise ValueError("X and y must have same length")
+ 
+     n = len(X)
+     splits = []
+ 
+     for i in range(n):
+         # Hold out sample i; train on everything else
+         train_indices = list(range(i)) + list(range(i + 1, n))
+         val_indices = [i]
+         splits.append((train_indices, val_indices))
+ 
+     return splits
+ 
+ 
+ # Create alias
+ loocv = leave_one_out_cv
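+ 
+ 
+ # Minimal sketch: LOO is the k = n_samples special case of K-Fold, so the
+ # two functions agree when shuffling is disabled.
+ def _demo_loocv():
+     X = [1, 2, 3, 4, 5]
+     y = [10, 20, 30, 40, 50]
+     assert loocv(X, y) == kfold(X, y, k=len(X), shuffle=False)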
+ 
+ 
+ def shuffle_split_cv(
+     X: List,
+     y: List,
+     n_splits: int = 10,
+     test_size: float = 0.2,
+     random_state: Optional[int] = None
+ ) -> List[Tuple[List[int], List[int]]]:
+     """
+     Shuffle Split Cross-Validation.
+ 
+     Alias: shuffle_cv()
+ 
+     Random permutation CV: creates independent random train/test splits.
+ 
+     Args:
+         X: Feature data
+         y: Target data
+         n_splits: Number of splits. Default: 10
+         test_size: Proportion of the test set. Default: 0.2
+         random_state: Random seed for reproducibility
+ 
+     Returns:
+         list: List of (train_indices, test_indices) tuples
+ 
+     Examples:
+         >>> from ilovetools.ml import shuffle_cv  # Short alias
+         >>> X = list(range(10))
+         >>> y = [0, 1] * 5
+         >>> splits = shuffle_cv(X, y, n_splits=5, test_size=0.3)
+         >>> len(splits)
+         5
+ 
+         >>> from ilovetools.ml import shuffle_split_cv  # Full name
+         >>> splits = shuffle_split_cv(X, y, n_splits=3)
+ 
+     Notes:
+         - Random train/test splits
+         - Samples can appear in multiple test sets
+         - Good for large datasets
+         - More flexible than K-Fold (n_splits and test_size are independent)
+     """
+     if len(X) != len(y):
+         raise ValueError("X and y must have same length")
+ 
+     n = len(X)
+     n_test = int(n * test_size)
+ 
+     if random_state is not None:
+         random.seed(random_state)
+ 
+     splits = []
+ 
+     for _ in range(n_splits):
+         indices = list(range(n))
+         random.shuffle(indices)
+ 
+         test_indices = indices[:n_test]
+         train_indices = indices[n_test:]
+ 
+         splits.append((train_indices, test_indices))
+ 
+     return splits
+ 
+ 
+ # Create alias
+ shuffle_cv = shuffle_split_cv
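+ 
+ 
+ # Minimal sketch of the note above: unlike K-Fold, the same sample can
+ # land in several test sets (here 15 test slots over 10 samples, so
+ # repeats are guaranteed).
+ def _demo_shuffle_cv():
+     from collections import Counter
+     X = list(range(10))
+     y = [0, 1] * 5
+     counts = Counter()
+     for _, test_idx in shuffle_cv(X, y, n_splits=5, test_size=0.3, random_state=0):
+         counts.update(test_idx)
+     print(counts)  # some indices occur in more than one test set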
+ 
+ 
+ def cross_validate_score(
+     X: List,
+     y: List,
+     model_func: Callable,
+     metric_func: Callable,
+     cv_method: str = 'kfold',
+     k: int = 5
+ ) -> Dict[str, Any]:
+     """
+     Perform cross-validation and return scores.
+ 
+     Alias: cv_score()
+ 
+     Args:
+         X: Feature data
+         y: Target data
+         model_func: Function that trains on (X_train, y_train) and returns
+             predictions for X_val
+         metric_func: Function that computes a metric from (y_true, y_pred)
+         cv_method: CV method ('kfold', 'stratified', 'timeseries');
+             unknown values fall back to 'kfold'
+         k: Number of folds
+ 
+     Returns:
+         dict: CV results with scores and statistics
+ 
+     Examples:
+         >>> from ilovetools.ml import cv_score  # Short alias
+         >>> X = [[1], [2], [3], [4], [5]]
+         >>> y = [1, 2, 3, 4, 5]
+         >>>
+         >>> def simple_model(X_train, y_train, X_val):
+         ...     # Simple average predictor
+         ...     avg = sum(y_train) / len(y_train)
+         ...     return [avg] * len(X_val)
+         >>>
+         >>> def mae_metric(y_true, y_pred):
+         ...     return sum(abs(t - p) for t, p in zip(y_true, y_pred)) / len(y_true)
+         >>>
+         >>> results = cv_score(X, y, simple_model, mae_metric, k=3)
+         >>> print(results['mean_score'])
+ 
+         >>> from ilovetools.ml import cross_validate_score  # Full name
+         >>> results = cross_validate_score(X, y, simple_model, mae_metric)
+ 
+     Notes:
+         - Automates the CV workflow
+         - Returns mean, standard deviation, and all fold scores
+         - Works with any model/metric functions
+         - Useful for model comparison
+     """
+     # Get CV splits
+     if cv_method == 'kfold':
+         splits = k_fold_cross_validation(X, y, k=k)
+     elif cv_method == 'stratified':
+         splits = stratified_k_fold(X, y, k=k)
+     elif cv_method == 'timeseries':
+         splits = time_series_split(X, y, n_splits=k)
+     else:
+         # Unknown method: fall back to plain K-Fold
+         splits = k_fold_cross_validation(X, y, k=k)
+ 
+     scores = []
+ 
+     for train_idx, val_idx in splits:
+         X_train = [X[i] for i in train_idx]
+         y_train = [y[i] for i in train_idx]
+         X_val = [X[i] for i in val_idx]
+         y_val = [y[i] for i in val_idx]
+ 
+         # Train and predict
+         y_pred = model_func(X_train, y_train, X_val)
+ 
+         # Calculate metric
+         score = metric_func(y_val, y_pred)
+         scores.append(score)
+ 
+     # Compute the mean once and reuse it for the (population) std
+     mean_score = sum(scores) / len(scores)
+ 
+     return {
+         'scores': scores,
+         'mean_score': mean_score,
+         'std_score': (sum((s - mean_score) ** 2 for s in scores) / len(scores)) ** 0.5,
+         'n_splits': len(splits)
+     }
+ 
+ 
+ # Create alias
+ cv_score = cross_validate_score
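+ 
+ 
+ # Minimal comparison sketch: same data and metric, one throwaway model;
+ # the mean/std pair is what you would compare across candidate models.
+ # mean_model and mae below are illustrative stand-ins, not library code.
+ def _demo_cv_score():
+     X = [[1], [2], [3], [4], [5], [6]]
+     y = [1, 2, 3, 4, 5, 6]
+ 
+     def mean_model(X_train, y_train, X_val):
+         avg = sum(y_train) / len(y_train)
+         return [avg] * len(X_val)
+ 
+     def mae(y_true, y_pred):
+         return sum(abs(t - p) for t, p in zip(y_true, y_pred)) / len(y_true)
+ 
+     results = cv_score(X, y, mean_model, mae, k=3)
+     print(results['mean_score'], results['std_score'])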
+ 
+ 
+ def holdout_validation_split(
+     X: List,
+     y: List,
+     test_size: float = 0.2,
+     random_state: Optional[int] = None
+ ) -> Tuple[List, List, List, List]:
+     """
+     Simple holdout validation split.
+ 
+     Alias: holdout()
+ 
+     A single train/test split. Fast, but less reliable than CV.
+ 
+     Args:
+         X: Feature data
+         y: Target data
+         test_size: Proportion of the test set. Default: 0.2
+         random_state: Random seed for reproducibility
+ 
+     Returns:
+         tuple: (X_train, X_test, y_train, y_test)
+ 
+     Examples:
+         >>> from ilovetools.ml import holdout  # Short alias
+         >>> X = list(range(10))
+         >>> y = [0, 1] * 5
+         >>> X_train, X_test, y_train, y_test = holdout(X, y, test_size=0.3)
+         >>> len(X_train), len(X_test)
+         (7, 3)
+ 
+         >>> from ilovetools.ml import holdout_validation_split  # Full name
+         >>> X_train, X_test, y_train, y_test = holdout_validation_split(X, y)
+ 
+     Notes:
+         - Fastest validation method
+         - Less reliable than CV
+         - Use for quick experiments
+         - Good for large datasets
+     """
+     if len(X) != len(y):
+         raise ValueError("X and y must have same length")
+ 
+     n = len(X)
+     n_test = int(n * test_size)
+ 
+     indices = list(range(n))
+ 
+     if random_state is not None:
+         random.seed(random_state)
+     random.shuffle(indices)
+ 
+     test_indices = indices[:n_test]
+     train_indices = indices[n_test:]
+ 
+     X_train = [X[i] for i in train_indices]
+     X_test = [X[i] for i in test_indices]
+     y_train = [y[i] for i in train_indices]
+     y_test = [y[i] for i in test_indices]
+ 
+     return X_train, X_test, y_train, y_test
+ 
+ 
+ # Create alias
+ holdout = holdout_validation_split
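+ 
+ 
+ # Minimal sketch: a fixed random_state makes the split reproducible, so
+ # two calls with the same seed return identical partitions.
+ def _demo_holdout():
+     X = list(range(10))
+     y = [0, 1] * 5
+     a = holdout(X, y, test_size=0.3, random_state=42)
+     b = holdout(X, y, test_size=0.3, random_state=42)
+     assert a == b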
+ 
+ 
+ def train_val_test_split(
+     X: List,
+     y: List,
+     val_size: float = 0.2,
+     test_size: float = 0.2,
+     random_state: Optional[int] = None
+ ) -> Tuple[List, List, List, List, List, List]:
+     """
+     Three-way split: train, validation, test.
+ 
+     Alias: tvt_split()
+ 
+     Creates separate train, validation, and test sets.
+ 
+     Args:
+         X: Feature data
+         y: Target data
+         val_size: Proportion of the validation set. Default: 0.2
+         test_size: Proportion of the test set. Default: 0.2
+         random_state: Random seed for reproducibility
+ 
+     Returns:
+         tuple: (X_train, X_val, X_test, y_train, y_val, y_test)
+ 
+     Examples:
+         >>> from ilovetools.ml import tvt_split  # Short alias
+         >>> X = list(range(10))
+         >>> y = [0, 1] * 5
+         >>> X_tr, X_val, X_te, y_tr, y_val, y_te = tvt_split(X, y)
+         >>> len(X_tr), len(X_val), len(X_te)
+         (6, 2, 2)
+ 
+         >>> from ilovetools.ml import train_val_test_split  # Full name
+         >>> splits = train_val_test_split(X, y, val_size=0.15, test_size=0.15)
+ 
+     Notes:
+         - Standard ML workflow split
+         - Train: model training
+         - Val: hyperparameter tuning
+         - Test: final evaluation
+         - Typical ratios: 60-20-20 or 70-15-15
+     """
+     if len(X) != len(y):
+         raise ValueError("X and y must have same length")
+ 
+     n = len(X)
+     n_test = int(n * test_size)
+     n_val = int(n * val_size)
+ 
+     indices = list(range(n))
+ 
+     if random_state is not None:
+         random.seed(random_state)
+     random.shuffle(indices)
+ 
+     # Carve off test first, then validation; the remainder is training data
+     test_indices = indices[:n_test]
+     val_indices = indices[n_test:n_test + n_val]
+     train_indices = indices[n_test + n_val:]
+ 
+     X_train = [X[i] for i in train_indices]
+     X_val = [X[i] for i in val_indices]
+     X_test = [X[i] for i in test_indices]
+     y_train = [y[i] for i in train_indices]
+     y_val = [y[i] for i in val_indices]
+     y_test = [y[i] for i in test_indices]
+ 
+     return X_train, X_val, X_test, y_train, y_val, y_test
+ 
+ 
+ # Create alias
+ tvt_split = train_val_test_split
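+ 
+ 
+ # Minimal end-to-end sketch of the standard 60-20-20 workflow described
+ # in the docstring above.
+ def _demo_tvt_split():
+     X = list(range(10))
+     y = [0, 1] * 5
+     X_tr, X_val, X_te, y_tr, y_val, y_te = tvt_split(X, y, random_state=0)
+     print(len(X_tr), len(X_val), len(X_te))  # 6 2 2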