ilovetools 0.2.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,234 @@
1
+ """
2
+ Data preprocessing utilities
3
+ """
4
+
5
+ import random
6
+ from typing import Tuple, List, Union, Optional
7
+ import numpy as np
8
+
9
+ __all__ = ['train_test_split', 'normalize_data', 'standardize_data']
10
+
11
+
12
def train_test_split(
    X: Union[List, np.ndarray],
    y: Optional[Union[List, np.ndarray]] = None,
    test_size: float = 0.2,
    random_state: Optional[int] = None,
    shuffle: bool = True,
    stratify: bool = False
) -> Union[Tuple[List, List], Tuple[List, List, List, List]]:
    """
    Split arrays or lists into random train and test subsets.

    Implements the fundamental train-test split pattern without requiring
    scikit-learn. Supports stratified splitting to maintain class
    distribution. numpy arrays are converted to plain lists before
    splitting, so the returned containers are always lists.

    Args:
        X: Features array/list to split
        y: Target array/list to split (optional)
        test_size: Proportion of dataset for test set (0.0 to 1.0). Default: 0.2
        random_state: Random seed for reproducibility. Default: None
        shuffle: Whether to shuffle data before splitting. Default: True
        stratify: Maintain class distribution in splits (requires y). Default: False

    Returns:
        If y is None: (X_train, X_test)
        If y is provided: (X_train, X_test, y_train, y_test)

    Examples:
        >>> from ilovetools.data import train_test_split
        >>> X = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]]
        >>> y = [0, 1, 0, 1, 0]
        >>> X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
        >>> len(X_train), len(X_test)
        (4, 1)

        >>> data = list(range(100))
        >>> train, test = train_test_split(data, test_size=0.2)
        >>> len(train), len(test)
        (80, 20)

    Notes:
        - Always split data BEFORE any preprocessing to avoid data leakage
        - Use random_state for reproducible results
        - Stratified splitting ensures balanced class distribution
        - In a non-stratified split of a very small dataset,
          int(n_samples * test_size) may be 0 and the test set empty
        - In a stratified split every class contributes at least one test
          sample, so a class with a single example ends up ONLY in the
          test set

    Raises:
        ValueError: If test_size is not between 0 and 1
        ValueError: If stratify=True but y is None
        ValueError: If X and y have different lengths

    References:
        - scikit-learn train_test_split: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html
    """
    # --- validation ---------------------------------------------------
    if not 0 < test_size < 1:
        raise ValueError(f"test_size must be between 0 and 1, got {test_size}")

    if stratify and y is None:
        raise ValueError("stratify=True requires y to be provided")

    # Normalize numpy inputs to plain lists so indexing/extending below
    # always returns lists.
    if isinstance(X, np.ndarray):
        X = X.tolist()
    if y is not None and isinstance(y, np.ndarray):
        y = y.tolist()

    if y is not None and len(X) != len(y):
        raise ValueError(f"X and y must have same length. Got X: {len(X)}, y: {len(y)}")

    # Use a dedicated RNG instance instead of random.seed(): the original
    # implementation reseeded the process-global `random` module, silently
    # affecting every other consumer of `random` in the program. For a
    # given seed, random.Random(seed).shuffle produces the same sequence
    # as random.seed(seed); random.shuffle, so seeded results are unchanged.
    rng = random.Random(random_state)

    if stratify:
        return _stratified_split(X, y, test_size, shuffle, rng)
    return _plain_split(X, y, test_size, shuffle, rng)


def _stratified_split(X, y, test_size, shuffle, rng):
    """Split (X, y) keeping each class's train/test proportion ~ test_size."""
    X_train, X_test = [], []
    y_train, y_test = [], []

    # Group sample indices by class label, preserving first-seen order.
    class_indices = {}
    for idx, label in enumerate(y):
        class_indices.setdefault(label, []).append(idx)

    # Split each class proportionally; every class contributes at least
    # one test sample.
    for class_idx in class_indices.values():
        if shuffle:
            rng.shuffle(class_idx)

        n_class_test = max(1, int(len(class_idx) * test_size))

        test_idx = class_idx[:n_class_test]
        train_idx = class_idx[n_class_test:]

        X_test.extend([X[i] for i in test_idx])
        y_test.extend([y[i] for i in test_idx])
        X_train.extend([X[i] for i in train_idx])
        y_train.extend([y[i] for i in train_idx])

    return X_train, X_test, y_train, y_test


def _plain_split(X, y, test_size, shuffle, rng):
    """Unstratified split; first n_test (possibly shuffled) indices go to test."""
    n_samples = len(X)
    n_test = int(n_samples * test_size)

    indices = list(range(n_samples))
    if shuffle:
        rng.shuffle(indices)

    test_indices = indices[:n_test]
    train_indices = indices[n_test:]

    X_train = [X[i] for i in train_indices]
    X_test = [X[i] for i in test_indices]

    if y is not None:
        y_train = [y[i] for i in train_indices]
        y_test = [y[i] for i in test_indices]
        return X_train, X_test, y_train, y_test
    return X_train, X_test
176
+
177
+
178
def normalize_data(data: Union[List[float], np.ndarray]) -> List[float]:
    """
    Rescale data into the [0, 1] range using min-max scaling.

    Args:
        data: List or array of numerical values

    Returns:
        list: Normalized values between 0 and 1. When every value is
        identical (zero range), returns all zeros instead of dividing
        by zero.

    Example:
        >>> from ilovetools.data import normalize_data
        >>> data = [1, 2, 3, 4, 5]
        >>> normalized = normalize_data(data)
        >>> print(normalized)
        [0.0, 0.25, 0.5, 0.75, 1.0]
    """
    # Work on a plain list regardless of input container.
    if isinstance(data, np.ndarray):
        data = data.tolist()

    lo, hi = min(data), max(data)
    span = hi - lo

    # Constant input: scaling is undefined, map everything to 0.0.
    if span == 0:
        return [0.0] * len(data)

    return [(value - lo) / span for value in data]
205
+
206
+
207
def standardize_data(data: Union[List[float], np.ndarray]) -> List[float]:
    """
    Standardize data to mean=0 and std=1 (Z-score normalization).

    Uses the population standard deviation (divides the variance by N,
    not N-1).

    Args:
        data: List or array of numerical values

    Returns:
        list: Standardized values with mean=0, std=1. When every value
        is identical (zero variance), returns all zeros instead of
        dividing by zero.

    Example:
        >>> from ilovetools.data import standardize_data
        >>> data = [1, 2, 3, 4, 5]
        >>> standardized = standardize_data(data)
        >>> print(standardized)
        [-1.414, -0.707, 0.0, 0.707, 1.414]
    """
    # Work on a plain list regardless of input container.
    if isinstance(data, np.ndarray):
        data = data.tolist()

    n = len(data)
    mu = sum(data) / n

    # Precompute deviations once; they drive both the variance and the
    # final z-scores.
    deviations = [value - mu for value in data]
    sigma = (sum(d ** 2 for d in deviations) / n) ** 0.5

    # Constant input: z-score is undefined, map everything to 0.0.
    if sigma == 0:
        return [0.0] * n

    return [d / sigma for d in deviations]
@@ -0,0 +1,5 @@
1
+ """
2
+ Database connection and query utilities
3
+ """
4
+
5
+ __all__ = []
@@ -0,0 +1,5 @@
1
+ """
2
+ Date and time utilities
3
+ """
4
+
5
+ __all__ = []
@@ -0,0 +1,5 @@
1
+ """
2
+ File operations and management utilities
3
+ """
4
+
5
+ __all__ = []
@@ -0,0 +1,5 @@
1
+ """
2
+ Image processing utilities
3
+ """
4
+
5
+ __all__ = []