ilovetools 0.2.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ilovetools/__init__.py +42 -0
- ilovetools/ai/__init__.py +13 -0
- ilovetools/ai/embeddings.py +270 -0
- ilovetools/ai/inference.py +5 -0
- ilovetools/ai/llm_helpers.py +141 -0
- ilovetools/audio/__init__.py +5 -0
- ilovetools/automation/__init__.py +5 -0
- ilovetools/conversion/__init__.py +5 -0
- ilovetools/data/__init__.py +27 -0
- ilovetools/data/feature_engineering.py +497 -0
- ilovetools/data/preprocessing.py +234 -0
- ilovetools/database/__init__.py +5 -0
- ilovetools/datetime/__init__.py +5 -0
- ilovetools/files/__init__.py +5 -0
- ilovetools/image/__init__.py +5 -0
- ilovetools/ml/__init__.py +603 -0
- ilovetools/ml/clustering.py +1107 -0
- ilovetools/ml/cross_validation.py +612 -0
- ilovetools/ml/dimensionality.py +1001 -0
- ilovetools/ml/ensemble.py +872 -0
- ilovetools/ml/feature_selection.py +971 -0
- ilovetools/ml/imbalanced.py +797 -0
- ilovetools/ml/interpretation.py +915 -0
- ilovetools/ml/metrics.py +601 -0
- ilovetools/ml/pipeline.py +711 -0
- ilovetools/ml/timeseries.py +984 -0
- ilovetools/ml/tuning.py +781 -0
- ilovetools/security/__init__.py +5 -0
- ilovetools/text/__init__.py +5 -0
- ilovetools/utils/__init__.py +5 -0
- ilovetools/validation/__init__.py +5 -0
- ilovetools/web/__init__.py +5 -0
- ilovetools-0.2.3.dist-info/METADATA +143 -0
- ilovetools-0.2.3.dist-info/RECORD +38 -0
- ilovetools-0.2.3.dist-info/WHEEL +5 -0
- ilovetools-0.2.3.dist-info/licenses/LICENSE +21 -0
- ilovetools-0.2.3.dist-info/top_level.txt +2 -0
- tests/__init__.py +3 -0

ilovetools/data/preprocessing.py
@@ -0,0 +1,234 @@
"""
Data preprocessing utilities
"""

import random
from typing import Tuple, List, Union, Optional
import numpy as np

__all__ = ['train_test_split', 'normalize_data', 'standardize_data']


def train_test_split(
    X: Union[List, np.ndarray],
    y: Optional[Union[List, np.ndarray]] = None,
    test_size: float = 0.2,
    random_state: Optional[int] = None,
    shuffle: bool = True,
    stratify: bool = False
) -> Union[Tuple[List, List], Tuple[List, List, List, List]]:
    """
    Split arrays or lists into random train and test subsets.

    Perfect for ML workflows - implements the fundamental train-test split
    pattern without requiring scikit-learn. Supports stratified splitting
    to maintain class distribution.

    Args:
        X: Features array/list to split
        y: Target array/list to split (optional)
        test_size: Proportion of dataset for test set (0.0 to 1.0). Default: 0.2
        random_state: Random seed for reproducibility. Default: None
        shuffle: Whether to shuffle data before splitting. Default: True
        stratify: Maintain class distribution in splits (requires y). Default: False

    Returns:
        If y is None: (X_train, X_test)
        If y is provided: (X_train, X_test, y_train, y_test)

    Examples:
        >>> from ilovetools.data import train_test_split

        # Basic split
        >>> X = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]]
        >>> y = [0, 1, 0, 1, 0]
        >>> X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
        >>> len(X_train), len(X_test)
        (4, 1)

        # With random seed for reproducibility
        >>> X_train, X_test, y_train, y_test = train_test_split(
        ...     X, y, test_size=0.3, random_state=42
        ... )

        # Stratified split (maintains class distribution)
        >>> X_train, X_test, y_train, y_test = train_test_split(
        ...     X, y, test_size=0.2, stratify=True, random_state=42
        ... )

        # Split features only (no labels)
        >>> data = list(range(100))
        >>> train, test = train_test_split(data, test_size=0.2)
        >>> len(train), len(test)
        (80, 20)

        # Real-world example: Email spam detection
        >>> emails = ["email1", "email2", "email3", "email4", "email5"]
        >>> labels = [1, 0, 1, 0, 1]  # 1=spam, 0=not spam
        >>> X_train, X_test, y_train, y_test = train_test_split(
        ...     emails, labels, test_size=0.2, random_state=42
        ... )

        # 70-30 split
        >>> X_train, X_test, y_train, y_test = train_test_split(
        ...     X, y, test_size=0.3
        ... )

        # 60-20-20 split (train-val-test)
        >>> X_temp, X_test, y_temp, y_test = train_test_split(
        ...     X, y, test_size=0.2, random_state=42
        ... )
        >>> X_train, X_val, y_train, y_val = train_test_split(
        ...     X_temp, y_temp, test_size=0.25, random_state=42  # 0.25 * 0.8 = 0.2
        ... )

    Notes:
        - Always split data BEFORE any preprocessing to avoid data leakage
        - Use random_state for reproducible results
        - Stratified splitting ensures balanced class distribution
        - Common splits: 80-20, 70-30, 60-20-20 (train-val-test)
        - Test data should NEVER be seen during training

    Raises:
        ValueError: If test_size is not between 0 and 1
        ValueError: If stratify=True but y is None
        ValueError: If X and y have different lengths

    References:
        - scikit-learn train_test_split: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html
        - ML best practices: https://developers.google.com/machine-learning/crash-course/training-and-test-sets/splitting-data
    """

    # Validation
    if not 0 < test_size < 1:
        raise ValueError(f"test_size must be between 0 and 1, got {test_size}")

    if stratify and y is None:
        raise ValueError("stratify=True requires y to be provided")

    # Convert to lists if numpy arrays
    if isinstance(X, np.ndarray):
        X = X.tolist()
    if y is not None and isinstance(y, np.ndarray):
        y = y.tolist()

    # Check lengths match
    if y is not None and len(X) != len(y):
        raise ValueError(f"X and y must have same length. Got X: {len(X)}, y: {len(y)}")

    n_samples = len(X)
    n_test = int(n_samples * test_size)
    n_train = n_samples - n_test

    # Set random seed
    if random_state is not None:
        random.seed(random_state)

    # Create indices
    indices = list(range(n_samples))

    if stratify and y is not None:
        # Stratified split - maintain class distribution
        X_train, X_test = [], []
        y_train, y_test = [], []

        # Group indices by class
        class_indices = {}
        for idx, label in enumerate(y):
            if label not in class_indices:
                class_indices[label] = []
            class_indices[label].append(idx)

        # Split each class proportionally
        for label, class_idx in class_indices.items():
            if shuffle:
                random.shuffle(class_idx)

            n_class_test = max(1, int(len(class_idx) * test_size))

            test_idx = class_idx[:n_class_test]
            train_idx = class_idx[n_class_test:]

            X_test.extend([X[i] for i in test_idx])
            y_test.extend([y[i] for i in test_idx])
            X_train.extend([X[i] for i in train_idx])
            y_train.extend([y[i] for i in train_idx])

        return X_train, X_test, y_train, y_test

    else:
        # Regular split
        if shuffle:
            random.shuffle(indices)

        test_indices = indices[:n_test]
        train_indices = indices[n_test:]

        X_train = [X[i] for i in train_indices]
        X_test = [X[i] for i in test_indices]

        if y is not None:
            y_train = [y[i] for i in train_indices]
            y_test = [y[i] for i in test_indices]
            return X_train, X_test, y_train, y_test
        else:
            return X_train, X_test


def normalize_data(data: Union[List[float], np.ndarray]) -> List[float]:
    """
    Normalize data to range [0, 1] using min-max scaling.

    Args:
        data: List or array of numerical values

    Returns:
        list: Normalized values between 0 and 1

    Example:
        >>> from ilovetools.data import normalize_data
        >>> data = [1, 2, 3, 4, 5]
        >>> normalized = normalize_data(data)
        >>> print(normalized)
        [0.0, 0.25, 0.5, 0.75, 1.0]
    """
    if isinstance(data, np.ndarray):
        data = data.tolist()

    min_val = min(data)
    max_val = max(data)

    if max_val == min_val:
        return [0.0] * len(data)

    return [(x - min_val) / (max_val - min_val) for x in data]


def standardize_data(data: Union[List[float], np.ndarray]) -> List[float]:
    """
    Standardize data to have mean=0 and std=1 (Z-score normalization).

    Args:
        data: List or array of numerical values

    Returns:
        list: Standardized values with mean=0, std=1

    Example:
        >>> from ilovetools.data import standardize_data
        >>> data = [1, 2, 3, 4, 5]
        >>> standardized = standardize_data(data)
        >>> print(standardized)
        [-1.414, -0.707, 0.0, 0.707, 1.414]
    """
    if isinstance(data, np.ndarray):
        data = data.tolist()

    mean = sum(data) / len(data)
    variance = sum((x - mean) ** 2 for x in data) / len(data)
    std = variance ** 0.5

    if std == 0:
        return [0.0] * len(data)

    return [(x - mean) / std for x in data]
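
For readers evaluating this release, a minimal sketch of the new train_test_split in action, using made-up data and only the parameters documented above. One behavior worth noticing: the stratified branch reserves at least one test sample per class via max(1, ...), so very small classes can push the test set above the requested proportion.

from ilovetools.data import train_test_split

# Toy dataset: 10 samples, imbalanced labels (7 zeros, 3 ones)
X = [[i, i + 1] for i in range(10)]
y = [0] * 7 + [1] * 3

# Plain shuffled split, seeded for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
print(len(X_train), len(X_test))  # 8 2

# Stratified split keeps both classes represented in the test set:
# class 0 contributes max(1, int(7 * 0.2)) = 1 sample, class 1 max(1, int(3 * 0.2)) = 1
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=True, random_state=0
)
print(sorted(set(y_test)))  # [0, 1]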
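
A quick check of the two scalers, matching the docstring examples. Note that standardize_data divides by len(data) rather than len(data) - 1, i.e. it uses the population standard deviation, so its output differs slightly from a sample-std implementation, and the docstring's printed values are rounded.

from ilovetools.data import normalize_data, standardize_data

data = [1, 2, 3, 4, 5]
print(normalize_data(data))    # [0.0, 0.25, 0.5, 0.75, 1.0]
print(standardize_data(data))  # approx [-1.414, -0.707, 0.0, 0.707, 1.414]

# Constant input is handled by returning all zeros rather than dividing by zero
print(normalize_data([7, 7, 7]))  # [0.0, 0.0, 0.0]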
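
The docstring's warning about splitting before preprocessing matters here because normalize_data computes min/max from whatever list it receives; the package exposes no fit/transform separation. A hedged pattern (our own sketch, not an API this package provides) is to derive the scaling parameters from the training split and apply them to both splits by hand:

from ilovetools.data import train_test_split

train, test = train_test_split(list(range(100)), test_size=0.2, random_state=42)

# Fit min-max parameters on the training split only...
lo, hi = min(train), max(train)

# ...then apply the same transform to both splits, so no test-set statistics
# leak into the scaling. Test values may fall slightly outside [0, 1].
train_scaled = [(x - lo) / (hi - lo) for x in train]
test_scaled = [(x - lo) / (hi - lo) for x in test]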