ilovetools 0.1.2__tar.gz → 0.1.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ilovetools-0.1.2/ilovetools.egg-info → ilovetools-0.1.4}/PKG-INFO +1 -1
- {ilovetools-0.1.2 → ilovetools-0.1.4}/ilovetools/__init__.py +3 -1
- ilovetools-0.1.4/ilovetools/data/__init__.py +27 -0
- ilovetools-0.1.4/ilovetools/data/feature_engineering.py +497 -0
- ilovetools-0.1.4/ilovetools/ml/__init__.py +31 -0
- ilovetools-0.1.4/ilovetools/ml/metrics.py +589 -0
- {ilovetools-0.1.2 → ilovetools-0.1.4/ilovetools.egg-info}/PKG-INFO +1 -1
- {ilovetools-0.1.2 → ilovetools-0.1.4}/ilovetools.egg-info/SOURCES.txt +3 -0
- {ilovetools-0.1.2 → ilovetools-0.1.4}/pyproject.toml +1 -1
- {ilovetools-0.1.2 → ilovetools-0.1.4}/setup.py +1 -1
- ilovetools-0.1.2/ilovetools/data/__init__.py +0 -11
- {ilovetools-0.1.2 → ilovetools-0.1.4}/LICENSE +0 -0
- {ilovetools-0.1.2 → ilovetools-0.1.4}/MANIFEST.in +0 -0
- {ilovetools-0.1.2 → ilovetools-0.1.4}/README.md +0 -0
- {ilovetools-0.1.2 → ilovetools-0.1.4}/ilovetools/ai/__init__.py +0 -0
- {ilovetools-0.1.2 → ilovetools-0.1.4}/ilovetools/ai/embeddings.py +0 -0
- {ilovetools-0.1.2 → ilovetools-0.1.4}/ilovetools/ai/inference.py +0 -0
- {ilovetools-0.1.2 → ilovetools-0.1.4}/ilovetools/ai/llm_helpers.py +0 -0
- {ilovetools-0.1.2 → ilovetools-0.1.4}/ilovetools/audio/__init__.py +0 -0
- {ilovetools-0.1.2 → ilovetools-0.1.4}/ilovetools/automation/__init__.py +0 -0
- {ilovetools-0.1.2 → ilovetools-0.1.4}/ilovetools/conversion/__init__.py +0 -0
- {ilovetools-0.1.2 → ilovetools-0.1.4}/ilovetools/data/preprocessing.py +0 -0
- {ilovetools-0.1.2 → ilovetools-0.1.4}/ilovetools/database/__init__.py +0 -0
- {ilovetools-0.1.2 → ilovetools-0.1.4}/ilovetools/datetime/__init__.py +0 -0
- {ilovetools-0.1.2 → ilovetools-0.1.4}/ilovetools/files/__init__.py +0 -0
- {ilovetools-0.1.2 → ilovetools-0.1.4}/ilovetools/image/__init__.py +0 -0
- {ilovetools-0.1.2 → ilovetools-0.1.4}/ilovetools/security/__init__.py +0 -0
- {ilovetools-0.1.2 → ilovetools-0.1.4}/ilovetools/text/__init__.py +0 -0
- {ilovetools-0.1.2 → ilovetools-0.1.4}/ilovetools/utils/__init__.py +0 -0
- {ilovetools-0.1.2 → ilovetools-0.1.4}/ilovetools/validation/__init__.py +0 -0
- {ilovetools-0.1.2 → ilovetools-0.1.4}/ilovetools/web/__init__.py +0 -0
- {ilovetools-0.1.2 → ilovetools-0.1.4}/ilovetools.egg-info/dependency_links.txt +0 -0
- {ilovetools-0.1.2 → ilovetools-0.1.4}/ilovetools.egg-info/top_level.txt +0 -0
- {ilovetools-0.1.2 → ilovetools-0.1.4}/requirements.txt +0 -0
- {ilovetools-0.1.2 → ilovetools-0.1.4}/setup.cfg +0 -0
- {ilovetools-0.1.2 → ilovetools-0.1.4}/tests/__init__.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: ilovetools
|
|
3
|
-
Version: 0.1.
|
|
3
|
+
Version: 0.1.4
|
|
4
4
|
Summary: A comprehensive Python utility library with modular tools for AI/ML, data processing, and daily programming needs
|
|
5
5
|
Home-page: https://github.com/AliMehdi512/ilovetools
|
|
6
6
|
Author: Ali Mehdi
|
|
@@ -2,13 +2,14 @@
|
|
|
2
2
|
ilovetools - A comprehensive Python utility library
|
|
3
3
|
"""
|
|
4
4
|
|
|
5
|
-
__version__ = "0.1.
|
|
5
|
+
__version__ = "0.1.4"
|
|
6
6
|
__author__ = "Ali Mehdi"
|
|
7
7
|
__email__ = "ali.mehdi.dev579@gmail.com"
|
|
8
8
|
|
|
9
9
|
# Import all modules for easy access
|
|
10
10
|
from . import ai
|
|
11
11
|
from . import data
|
|
12
|
+
from . import ml
|
|
12
13
|
from . import files
|
|
13
14
|
from . import text
|
|
14
15
|
from . import image
|
|
@@ -25,6 +26,7 @@ from . import utils
|
|
|
25
26
|
__all__ = [
|
|
26
27
|
"ai",
|
|
27
28
|
"data",
|
|
29
|
+
"ml",
|
|
28
30
|
"files",
|
|
29
31
|
"text",
|
|
30
32
|
"image",
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Data processing and manipulation utilities
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from .preprocessing import train_test_split, normalize_data, standardize_data
|
|
6
|
+
from .feature_engineering import (
|
|
7
|
+
create_polynomial_features,
|
|
8
|
+
bin_numerical_feature,
|
|
9
|
+
one_hot_encode,
|
|
10
|
+
label_encode,
|
|
11
|
+
extract_datetime_features,
|
|
12
|
+
handle_missing_values,
|
|
13
|
+
create_interaction_features
|
|
14
|
+
)
|
|
15
|
+
|
|
16
|
+
__all__ = [
|
|
17
|
+
'train_test_split',
|
|
18
|
+
'normalize_data',
|
|
19
|
+
'standardize_data',
|
|
20
|
+
'create_polynomial_features',
|
|
21
|
+
'bin_numerical_feature',
|
|
22
|
+
'one_hot_encode',
|
|
23
|
+
'label_encode',
|
|
24
|
+
'extract_datetime_features',
|
|
25
|
+
'handle_missing_values',
|
|
26
|
+
'create_interaction_features',
|
|
27
|
+
]
|
|
@@ -0,0 +1,497 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Feature engineering utilities for ML workflows
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from typing import List, Union, Dict, Tuple, Optional
|
|
6
|
+
import numpy as np
|
|
7
|
+
from datetime import datetime
|
|
8
|
+
|
|
9
|
+
__all__ = [
|
|
10
|
+
'create_polynomial_features',
|
|
11
|
+
'bin_numerical_feature',
|
|
12
|
+
'one_hot_encode',
|
|
13
|
+
'label_encode',
|
|
14
|
+
'extract_datetime_features',
|
|
15
|
+
'handle_missing_values',
|
|
16
|
+
'create_interaction_features'
|
|
17
|
+
]
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def create_polynomial_features(
    data: Union[List[float], np.ndarray],
    degree: int = 2,
    include_bias: bool = False
) -> List[List[float]]:
    """Expand each value into its polynomial powers.

    Maps every value x to [x, x^2, ..., x^degree], optionally prefixed
    with a constant bias term of 1. Useful for letting linear models
    capture non-linear (curved) relationships.

    Args:
        data: Numerical values (list or numpy array).
        degree: Highest power to generate. Default: 2.
        include_bias: Prepend a constant 1 to each row. Default: False.

    Returns:
        list: One row of polynomial features per input value.

    Examples:
        >>> create_polynomial_features([2, 3, 4], degree=3)
        [[2, 4, 8], [3, 9, 27], [4, 16, 64]]
        >>> create_polynomial_features([20], degree=2, include_bias=True)
        [[1, 20, 400]]

    Notes:
        - High degrees risk overfitting.
        - Normalize features after polynomial expansion.
    """
    values = data.tolist() if isinstance(data, np.ndarray) else data
    bias = [1] if include_bias else []
    # Powers run 1..degree; the optional bias column comes first.
    return [bias + [v ** d for d in range(1, degree + 1)] for v in values]
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def bin_numerical_feature(
    data: Union[List[float], np.ndarray],
    bins: Union[int, List[float]] = 5,
    labels: Optional[List[str]] = None
) -> List[Union[int, str]]:
    """
    Bin continuous numerical data into discrete categories.

    Bins are left-inclusive / right-exclusive. Values at or beyond the
    last edge are clamped into the last bin (so the maximum value is
    always binned) and values below the first edge clamp to the first
    bin, instead of silently landing in bin 0.

    Args:
        data: List or array of numerical values
        bins: Number of equal-width bins OR list of bin edges
        labels: Optional labels for bins. If None, returns bin indices

    Returns:
        list: Binned values (indices or labels)

    Raises:
        ValueError: If labels are provided but their count does not
            match the number of bins.

    Examples:
        >>> from ilovetools.data import bin_numerical_feature

        # Age groups
        >>> ages = [5, 15, 25, 35, 45, 55, 65, 75]
        >>> bin_numerical_feature(
        ...     ages,
        ...     bins=[0, 18, 35, 60, 100],
        ...     labels=["Child", "Young Adult", "Adult", "Senior"]
        ... )
        ['Child', 'Child', 'Young Adult', 'Adult', 'Adult', 'Adult', 'Senior', 'Senior']

        # Equal-width bins
        >>> bin_numerical_feature([45, 67, 89, 92, 78, 56, 34, 88], bins=3)
        [0, 1, 2, 2, 2, 1, 0, 2]

    Notes:
        - Useful for creating categorical features
        - Helps models learn threshold effects
        - Choose bins based on domain knowledge
    """
    if isinstance(data, np.ndarray):
        data = data.tolist()

    if isinstance(bins, int):
        # Equal-width edges spanning the observed range.
        min_val = min(data)
        max_val = max(data)
        bin_width = (max_val - min_val) / bins
        bin_edges = [min_val + i * bin_width for i in range(bins + 1)]
    else:
        bin_edges = bins

    num_bins = len(bin_edges) - 1
    if labels is not None and len(labels) != num_bins:
        raise ValueError("labels must have exactly one entry per bin")

    result = []
    for value in data:
        # Default to the last bin so values >= the final edge are kept
        # (this also makes the maximum value land in the last bin
        # without the old +0.001 epsilon hack).
        bin_idx = num_bins - 1
        for i in range(num_bins):
            if value < bin_edges[i + 1]:
                bin_idx = i
                break
        result.append(labels[bin_idx] if labels else bin_idx)

    return result
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
def one_hot_encode(
    data: List[str],
    categories: Optional[List[str]] = None
) -> Dict[str, List[int]]:
    """One-hot encode categorical data.

    Each category becomes a key whose value is a binary indicator list
    aligned with the input order.

    Args:
        data: Categorical values to encode.
        categories: Optional full category set; defaults to the sorted
            unique values found in ``data``.

    Returns:
        dict: Category name -> binary indicator list.

    Examples:
        >>> one_hot_encode(["Red", "Blue", "Green", "Red", "Blue"])
        {'Blue': [0, 1, 0, 0, 1], 'Green': [0, 0, 1, 0, 0], 'Red': [1, 0, 0, 1, 0]}

    Notes:
        - Standard encoding for nominal (unordered) categories.
        - Produces one sparse binary feature per category.
    """
    if categories is None:
        categories = sorted(set(data))
    # int(bool) yields the same 0/1 indicators as an explicit conditional.
    return {cat: [int(value == cat) for value in data] for cat in categories}
|
|
216
|
+
|
|
217
|
+
|
|
218
|
+
def label_encode(data: List[str]) -> Tuple[List[int], Dict[str, int]]:
    """Encode categories as consecutive integers (alphabetical order).

    Args:
        data: Categorical values.

    Returns:
        tuple: (encoded_data, label_mapping). The mapping assigns 0..k-1
        to the sorted unique categories and can be inverted for decoding.

    Examples:
        >>> label_encode(["Small", "Large", "Medium", "Small", "Large"])
        ([2, 0, 1, 2, 0], {'Large': 0, 'Medium': 1, 'Small': 2})

        # Decode back
        >>> encoded, mapping = label_encode(["a", "b", "a"])
        >>> inverse = {v: k for k, v in mapping.items()}
        >>> [inverse[code] for code in encoded]
        ['a', 'b', 'a']

    Notes:
        - More memory efficient than one-hot encoding.
        - Implies an ordering; best suited to ordinal categories.
    """
    mapping: Dict[str, int] = {}
    for value in sorted(set(data)):
        mapping[value] = len(mapping)
    return [mapping[value] for value in data], mapping
|
|
261
|
+
|
|
262
|
+
|
|
263
|
+
def extract_datetime_features(
    timestamps: List[str],
    format: str = "%Y-%m-%d %H:%M:%S"
) -> Dict[str, List[int]]:
    """Derive calendar features from datetime strings.

    Parses every timestamp and collects year, month, day, hour, minute,
    day_of_week (0=Monday .. 6=Sunday) and is_weekend (1 for Sat/Sun),
    each as a list aligned with the input order.

    Args:
        timestamps: Datetime strings to parse.
        format: strptime format string. Default: "%Y-%m-%d %H:%M:%S".

    Returns:
        dict: Feature name -> list of integer values.

    Raises:
        ValueError: If a timestamp does not match ``format``.

    Examples:
        >>> feats = extract_datetime_features(["2024-03-15 14:30:00"])
        >>> feats['hour'], feats['day_of_week'], feats['is_weekend']
        ([14], [4], [0])

        # Different format
        >>> extract_datetime_features(["15/03/2024"], format="%d/%m/%Y")['year']
        [2024]

    Notes:
        - Common features for time-series / seasonality modelling.
    """
    parsed = [datetime.strptime(ts, format) for ts in timestamps]
    return {
        'year': [dt.year for dt in parsed],
        'month': [dt.month for dt in parsed],
        'day': [dt.day for dt in parsed],
        'hour': [dt.hour for dt in parsed],
        'minute': [dt.minute for dt in parsed],
        'day_of_week': [dt.weekday() for dt in parsed],
        'is_weekend': [int(dt.weekday() >= 5) for dt in parsed],
    }
|
|
332
|
+
|
|
333
|
+
|
|
334
|
+
def handle_missing_values(
    data: List[Optional[float]],
    strategy: str = "mean"
) -> List[float]:
    """
    Handle missing values in numerical data.

    Fills None values using various strategies. Essential preprocessing
    step for ML models.

    Args:
        data: List with potential None values
        strategy: Fill strategy - "mean", "median", "mode", "forward",
            "backward", "zero". Unknown strategies fall back to "mean".

    Returns:
        list: Data with missing values filled

    Examples:
        >>> handle_missing_values([1.0, 2.0, None, 4.0, 5.0], strategy="mean")
        [1.0, 2.0, 3.0, 4.0, 5.0]

        >>> handle_missing_values([1.0, None, None, 4.0], strategy="forward")
        [1.0, 1.0, 1.0, 4.0]

        >>> handle_missing_values([1.0, None, None, 4.0], strategy="backward")
        [1.0, 4.0, 4.0, 4.0]

        >>> handle_missing_values([1.0, None, 3.0], strategy="zero")
        [1.0, 0.0, 3.0]

    Notes:
        - Mean: sensitive to outliers; Median: robust to outliers.
        - Forward/Backward: for time-series; leading/trailing gaps that
          have no neighbour to copy from are filled with the mean.
        - All-None input returns zeros.
    """
    valid_values = [x for x in data if x is not None]

    if not valid_values:
        return [0.0] * len(data)

    # Central fill value; also serves as the fallback for forward/backward
    # fills when there is no earlier/later observation to copy.
    if strategy == "median":
        sorted_vals = sorted(valid_values)
        n = len(sorted_vals)
        fill_value = sorted_vals[n // 2] if n % 2 else (sorted_vals[n // 2 - 1] + sorted_vals[n // 2]) / 2
    elif strategy == "mode":
        fill_value = max(set(valid_values), key=valid_values.count)
    elif strategy == "zero":
        fill_value = 0.0
    else:
        # "mean", "forward", "backward" and unknown strategies.
        fill_value = sum(valid_values) / len(valid_values)

    if strategy == "forward":
        result = []
        last_valid = fill_value
        for value in data:
            if value is None:
                result.append(last_valid)
            else:
                result.append(value)
                last_valid = value
        return result

    if strategy == "backward":
        # Walk the ORIGINAL data right-to-left so gaps copy the next
        # observation (the old implementation pre-filled with the mean,
        # which made the backward pass a no-op).
        reversed_out = []
        next_valid = fill_value
        for value in reversed(data):
            if value is None:
                reversed_out.append(next_valid)
            else:
                reversed_out.append(value)
                next_valid = value
        return reversed_out[::-1]

    return [fill_value if value is None else value for value in data]
|
|
427
|
+
|
|
428
|
+
|
|
429
|
+
def create_interaction_features(
    feature1: Union[List[float], np.ndarray],
    feature2: Union[List[float], np.ndarray],
    operation: str = "multiply"
) -> List[float]:
    """Combine two features element-wise into an interaction feature.

    Args:
        feature1: First feature.
        feature2: Second feature (same length as feature1).
        operation: "multiply", "add", "subtract" or "divide"; anything
            else falls back to multiplication.

    Returns:
        list: Element-wise combination of the two features.

    Raises:
        ValueError: If the features differ in length.

    Examples:
        >>> create_interaction_features([1, 2, 3], [4, 5, 6], "add")
        [5, 7, 9]
        >>> create_interaction_features([300000, 450000], [1500, 2000], "divide")
        [200.0, 225.0]

    Notes:
        - Division by zero yields 0.0 rather than raising.
        - Useful for joint effects (e.g. quantity * price).
    """
    if isinstance(feature1, np.ndarray):
        feature1 = feature1.tolist()
    if isinstance(feature2, np.ndarray):
        feature2 = feature2.tolist()

    if len(feature1) != len(feature2):
        raise ValueError("Features must have same length")

    # Dispatch table; unknown operations fall back to multiplication.
    combiners = {
        "add": lambda a, b: a + b,
        "subtract": lambda a, b: a - b,
        "divide": lambda a, b: a / b if b != 0 else 0.0,
    }
    combine = combiners.get(operation, lambda a, b: a * b)
    return [combine(a, b) for a, b in zip(feature1, feature2)]
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Machine Learning utilities module
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from .metrics import (
|
|
6
|
+
accuracy_score,
|
|
7
|
+
precision_score,
|
|
8
|
+
recall_score,
|
|
9
|
+
f1_score,
|
|
10
|
+
confusion_matrix,
|
|
11
|
+
classification_report,
|
|
12
|
+
mean_squared_error,
|
|
13
|
+
mean_absolute_error,
|
|
14
|
+
root_mean_squared_error,
|
|
15
|
+
r2_score,
|
|
16
|
+
roc_auc_score
|
|
17
|
+
)
|
|
18
|
+
|
|
19
|
+
__all__ = [
|
|
20
|
+
'accuracy_score',
|
|
21
|
+
'precision_score',
|
|
22
|
+
'recall_score',
|
|
23
|
+
'f1_score',
|
|
24
|
+
'confusion_matrix',
|
|
25
|
+
'classification_report',
|
|
26
|
+
'mean_squared_error',
|
|
27
|
+
'mean_absolute_error',
|
|
28
|
+
'root_mean_squared_error',
|
|
29
|
+
'r2_score',
|
|
30
|
+
'roc_auc_score',
|
|
31
|
+
]
|
|
@@ -0,0 +1,589 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Model evaluation metrics for ML workflows
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from typing import List, Tuple, Dict, Union, Optional
|
|
6
|
+
import numpy as np
|
|
7
|
+
|
|
8
|
+
__all__ = [
|
|
9
|
+
'accuracy_score',
|
|
10
|
+
'precision_score',
|
|
11
|
+
'recall_score',
|
|
12
|
+
'f1_score',
|
|
13
|
+
'confusion_matrix',
|
|
14
|
+
'classification_report',
|
|
15
|
+
'mean_squared_error',
|
|
16
|
+
'mean_absolute_error',
|
|
17
|
+
'root_mean_squared_error',
|
|
18
|
+
'r2_score',
|
|
19
|
+
'roc_auc_score'
|
|
20
|
+
]
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def accuracy_score(y_true: List, y_pred: List) -> float:
    """
    Calculate accuracy score for classification.

    Accuracy = (Correct Predictions) / (Total Predictions)

    Args:
        y_true: True labels
        y_pred: Predicted labels

    Returns:
        float: Accuracy score (0.0 to 1.0)

    Raises:
        ValueError: If the inputs differ in length or are empty.

    Examples:
        >>> accuracy_score([1, 0, 1, 1, 0], [1, 0, 1, 1, 0])
        1.0
        >>> accuracy_score([1, 0, 1, 1, 0], [1, 0, 1, 0, 0])
        0.8

    Notes:
        - Use for balanced datasets; misleading on imbalanced ones.
        - Range: 0.0 (worst) to 1.0 (best)
    """
    if len(y_true) != len(y_pred):
        raise ValueError("y_true and y_pred must have same length")
    # Guard against empty input explicitly instead of surfacing a
    # confusing ZeroDivisionError from the final division.
    if not y_true:
        raise ValueError("y_true and y_pred must not be empty")

    correct = sum(1 for true, pred in zip(y_true, y_pred) if true == pred)
    return correct / len(y_true)
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def precision_score(y_true: List, y_pred: List, positive_label: int = 1) -> float:
    """Calculate precision for binary classification.

    Precision = TP / (TP + FP): of all positive predictions, how many
    were correct?

    Args:
        y_true: True labels.
        y_pred: Predicted labels.
        positive_label: Label treated as the positive class. Default: 1.

    Returns:
        float: Precision in [0.0, 1.0]; 0.0 when nothing was predicted
        positive.

    Raises:
        ValueError: If the inputs differ in length.

    Examples:
        >>> precision_score([1, 0, 1, 1, 0], [1, 1, 1, 1, 0])
        0.75

    Notes:
        - Use when false positives are costly (spam, fraud alerts).
    """
    if len(y_true) != len(y_pred):
        raise ValueError("y_true and y_pred must have same length")

    # True labels of every sample the model flagged as positive.
    flagged = [true for true, pred in zip(y_true, y_pred) if pred == positive_label]
    if not flagged:
        return 0.0

    hits = sum(1 for true in flagged if true == positive_label)
    return hits / len(flagged)
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def recall_score(y_true: List, y_pred: List, positive_label: int = 1) -> float:
    """Calculate recall (sensitivity) for binary classification.

    Recall = TP / (TP + FN): of all actual positives, how many did we
    catch?

    Args:
        y_true: True labels.
        y_pred: Predicted labels.
        positive_label: Label treated as the positive class. Default: 1.

    Returns:
        float: Recall in [0.0, 1.0]; 0.0 when there are no actual
        positives.

    Raises:
        ValueError: If the inputs differ in length.

    Examples:
        >>> recall_score([1, 0, 1, 1, 0], [1, 0, 0, 1, 0])
        0.6666666666666666

    Notes:
        - Use when false negatives are costly (disease detection).
    """
    if len(y_true) != len(y_pred):
        raise ValueError("y_true and y_pred must have same length")

    # Predictions for every sample that is actually positive.
    actual_pos = [pred for true, pred in zip(y_true, y_pred) if true == positive_label]
    if not actual_pos:
        return 0.0

    caught = sum(1 for pred in actual_pos if pred == positive_label)
    return caught / len(actual_pos)
|
|
183
|
+
|
|
184
|
+
|
|
185
|
+
def f1_score(y_true: List, y_pred: List, positive_label: int = 1) -> float:
    """Calculate the F1 score (harmonic mean of precision and recall).

    F1 = 2 * (Precision * Recall) / (Precision + Recall)

    Args:
        y_true: True labels.
        y_pred: Predicted labels.
        positive_label: Label treated as the positive class. Default: 1.

    Returns:
        float: F1 score in [0.0, 1.0]; 0.0 when both precision and
        recall are zero.

    Examples:
        >>> f1_score([1, 0, 1, 1, 0, 0, 1, 0], [1, 0, 1, 1, 0, 0, 1, 1])
        0.8888888888888888

    Notes:
        - Best single metric for imbalanced datasets.
        - Use when both false positives and false negatives matter.
    """
    p = precision_score(y_true, y_pred, positive_label)
    r = recall_score(y_true, y_pred, positive_label)
    denominator = p + r
    # Convention: F1 is 0 when both components are 0.
    return 0.0 if denominator == 0 else 2 * p * r / denominator
|
|
228
|
+
|
|
229
|
+
|
|
230
|
+
def confusion_matrix(y_true: List, y_pred: List) -> List[List[int]]:
    """
    Build the 2x2 confusion matrix for binary (0/1) classification.

    Layout of the returned matrix:
        [[TN, FP],
         [FN, TP]]

    Args:
        y_true: True labels
        y_pred: Predicted labels

    Returns:
        list: 2x2 confusion matrix

    Raises:
        ValueError: If inputs have different lengths

    Examples:
        >>> from ilovetools.ml import confusion_matrix

        # Perfect predictions
        >>> confusion_matrix([1, 0, 1, 1, 0], [1, 0, 1, 1, 0])
        [[2, 0], [0, 3]]

        # With errors
        >>> cm = confusion_matrix([1, 0, 1, 1, 0, 0, 1, 0],
        ...                       [1, 0, 1, 0, 0, 1, 1, 0])
        >>> cm
        [[3, 1], [1, 3]]

        # Interpret results
        >>> tn, fp, fn, tp = cm[0][0], cm[0][1], cm[1][0], cm[1][1]

    Notes:
        - Foundation of classification metrics
        - Shows all types of errors
        - Format: [[TN, FP], [FN, TP]]
        - Pairs whose labels are not 0 or 1 are ignored
    """
    if len(y_true) != len(y_pred):
        raise ValueError("y_true and y_pred must have same length")

    # Single pass over the pairs instead of four separate scans.
    tn = fp = fn = tp = 0
    for actual, predicted in zip(y_true, y_pred):
        if actual == 0 and predicted == 0:
            tn += 1
        elif actual == 0 and predicted == 1:
            fp += 1
        elif actual == 1 and predicted == 0:
            fn += 1
        elif actual == 1 and predicted == 1:
            tp += 1

    return [[tn, fp], [fn, tp]]
|
|
284
|
+
|
|
285
|
+
|
|
286
|
+
def classification_report(y_true: List, y_pred: List) -> Dict[str, float]:
    """
    Generate a comprehensive classification report.

    Bundles accuracy, precision, recall and F1 score into a single
    dictionary, each computed with this module's metric functions
    (positive label defaults to 1).

    Args:
        y_true: True labels
        y_pred: Predicted labels

    Returns:
        dict: Keys 'accuracy', 'precision', 'recall', 'f1_score' -> float

    Examples:
        >>> from ilovetools.ml import classification_report

        >>> report = classification_report([1, 0, 1, 1, 0, 0, 1, 0],
        ...                                [1, 0, 1, 0, 0, 1, 1, 0])
        >>> report
        {'accuracy': 0.75, 'precision': 0.75, 'recall': 0.75, 'f1_score': 0.75}

        # Pretty print
        >>> for metric, value in report.items():
        ...     print(f"{metric}: {value:.2%}")
        accuracy: 75.00%
        precision: 75.00%
        recall: 75.00%
        f1_score: 75.00%

    Notes:
        - Comprehensive overview of model performance
        - All metrics in one call
        - Easy to compare models
        - Returns dictionary for flexibility
    """
    report: Dict[str, float] = {}
    report['accuracy'] = accuracy_score(y_true, y_pred)
    report['precision'] = precision_score(y_true, y_pred)
    report['recall'] = recall_score(y_true, y_pred)
    report['f1_score'] = f1_score(y_true, y_pred)
    return report
|
|
328
|
+
|
|
329
|
+
|
|
330
|
+
def mean_squared_error(y_true: List[float], y_pred: List[float]) -> float:
    """
    Calculate Mean Squared Error for regression.

    MSE = mean of (actual - predicted)^2 over all pairs.

    Args:
        y_true: True values
        y_pred: Predicted values

    Returns:
        float: MSE value

    Raises:
        ValueError: If inputs have different lengths

    Examples:
        >>> from ilovetools.ml import mean_squared_error

        # Perfect predictions
        >>> mean_squared_error([1.0, 2.0, 3.0, 4.0], [1.0, 2.0, 3.0, 4.0])
        0.0

        # With errors
        >>> mse = mean_squared_error([100, 200, 300, 400], [110, 190, 310, 390])
        >>> print(f"MSE: {mse:.2f}")
        MSE: 100.00

    Notes:
        - Penalizes large errors heavily
        - Not in original units (squared)
        - Sensitive to outliers
        - Lower is better
    """
    n = len(y_true)
    if n != len(y_pred):
        raise ValueError("y_true and y_pred must have same length")

    # Accumulate squared residuals lazily; no intermediate list needed.
    total = sum((actual - predicted) ** 2
                for actual, predicted in zip(y_true, y_pred))
    return total / n
|
|
375
|
+
|
|
376
|
+
|
|
377
|
+
def mean_absolute_error(y_true: List[float], y_pred: List[float]) -> float:
    """
    Calculate Mean Absolute Error for regression.

    MAE = mean of |actual - predicted| over all pairs.

    Args:
        y_true: True values
        y_pred: Predicted values

    Returns:
        float: MAE value

    Raises:
        ValueError: If inputs have different lengths

    Examples:
        >>> from ilovetools.ml import mean_absolute_error

        # Perfect predictions
        >>> mean_absolute_error([1.0, 2.0, 3.0, 4.0], [1.0, 2.0, 3.0, 4.0])
        0.0

        # With errors
        >>> mae = mean_absolute_error([100, 200, 300, 400], [110, 190, 310, 390])
        >>> print(f"MAE: ${mae:.2f}")
        MAE: $10.00

        # House price prediction
        >>> mae = mean_absolute_error([250000, 300000, 350000],
        ...                           [245000, 310000, 340000])
        >>> print(f"Average error: ${mae:,.0f}")
        Average error: $8,333

    Notes:
        - Easy to interpret
        - Same units as target variable
        - Less sensitive to outliers than MSE
        - Lower is better
    """
    n = len(y_true)
    if n != len(y_pred):
        raise ValueError("y_true and y_pred must have same length")

    # Accumulate absolute residuals lazily; no intermediate list needed.
    total = sum(abs(actual - predicted)
                for actual, predicted in zip(y_true, y_pred))
    return total / n
|
|
424
|
+
|
|
425
|
+
|
|
426
|
+
def root_mean_squared_error(y_true: List[float], y_pred: List[float]) -> float:
    """
    Calculate Root Mean Squared Error for regression.

    RMSE = sqrt(MSE). Length validation and averaging are delegated to
    mean_squared_error.

    Args:
        y_true: True values
        y_pred: Predicted values

    Returns:
        float: RMSE value

    Raises:
        ValueError: If inputs have different lengths (from mean_squared_error)

    Examples:
        >>> from ilovetools.ml import root_mean_squared_error

        # Perfect predictions
        >>> root_mean_squared_error([1.0, 2.0, 3.0, 4.0], [1.0, 2.0, 3.0, 4.0])
        0.0

        # With errors
        >>> rmse = root_mean_squared_error([100, 200, 300, 400],
        ...                                [110, 190, 310, 390])
        >>> print(f"RMSE: {rmse:.2f}")
        RMSE: 10.00

    Notes:
        - Most common regression metric
        - Same units as target variable
        - Penalizes large errors
        - Lower is better
    """
    return mean_squared_error(y_true, y_pred) ** 0.5
|
|
470
|
+
|
|
471
|
+
|
|
472
|
+
def r2_score(y_true: List[float], y_pred: List[float]) -> float:
    """
    Calculate R-squared (coefficient of determination) for regression.

    R^2 = 1 - (SS_res / SS_tot): the proportion of variance in y_true
    explained by the predictions.

    Args:
        y_true: True values
        y_pred: Predicted values

    Returns:
        float: R^2 value (-inf to 1.0, higher is better); 0.0 when
        y_true has zero variance

    Raises:
        ValueError: If inputs have different lengths

    Examples:
        >>> from ilovetools.ml import r2_score

        # Perfect predictions
        >>> r2_score([1.0, 2.0, 3.0, 4.0], [1.0, 2.0, 3.0, 4.0])
        1.0

        # Good predictions
        >>> r2 = r2_score([100, 200, 300, 400, 500], [110, 190, 310, 390, 510])
        >>> print(f"R2: {r2:.2%}")
        R2: 99.50%

        # Interpretation
        >>> print(f"Model explains {0.85:.0%} of variance")
        Model explains 85% of variance

    Notes:
        - Range: -inf to 1.0 (1.0 is perfect)
        - 0.0 = Model as good as mean baseline
        - Negative = Model worse than mean
        - Easy to interpret as percentage
    """
    n = len(y_true)
    if n != len(y_pred):
        raise ValueError("y_true and y_pred must have same length")

    y_bar = sum(y_true) / n

    total_var = sum((actual - y_bar) ** 2 for actual in y_true)
    residual = sum((actual - predicted) ** 2
                   for actual, predicted in zip(y_true, y_pred))

    # Constant target: SS_tot is zero, R^2 is undefined; report 0.0.
    if total_var == 0:
        return 0.0

    return 1 - (residual / total_var)
|
|
525
|
+
|
|
526
|
+
|
|
527
|
+
def roc_auc_score(y_true: List[int], y_scores: List[float]) -> float:
    """
    Calculate ROC AUC score for binary classification.

    AUC = Area Under the ROC Curve. Equivalent to the probability that a
    randomly chosen positive example is scored higher than a randomly
    chosen negative one (the normalized Mann-Whitney U statistic). Tied
    scores receive half credit, matching the standard definition.

    Args:
        y_true: True binary labels (0 or 1)
        y_scores: Predicted probabilities or scores

    Returns:
        float: AUC score (0.0 to 1.0); 0.5 when only one class is present

    Raises:
        ValueError: If inputs have different lengths

    Examples:
        >>> from ilovetools.ml import roc_auc_score

        # Perfect separation
        >>> roc_auc_score([0, 0, 1, 1], [0.1, 0.2, 0.8, 0.9])
        1.0

        # Good separation
        >>> auc = roc_auc_score([0, 0, 1, 1, 0, 1],
        ...                     [0.2, 0.6, 0.7, 0.8, 0.4, 0.5])
        >>> print(f"AUC: {auc:.2%}")
        AUC: 88.89%

        # Tied scores get half credit
        >>> roc_auc_score([0, 1], [0.5, 0.5])
        0.5

    Notes:
        - 1.0 = Perfect classifier
        - 0.5 = Random guessing
        - < 0.5 = Worse than random
        - Threshold-independent metric
    """
    if len(y_true) != len(y_scores):
        raise ValueError("y_true and y_scores must have same length")

    n = len(y_true)
    n_pos = sum(y_true)
    n_neg = n - n_pos

    # AUC is undefined when only one class is present; report chance level.
    if n_pos == 0 or n_neg == 0:
        return 0.5

    # Rank-based (Mann-Whitney) computation. Tied scores share their
    # average rank, which awards exactly 0.5 credit per tied (pos, neg)
    # pair. (A naive descending scan over sorted (score, label) pairs
    # counts ties as full wins, overstating AUC.)
    order = sorted(range(n), key=lambda i: y_scores[i])
    ranks = [0.0] * n
    i = 0
    while i < n:
        # Extend j over the run of equal scores starting at i.
        j = i
        while j + 1 < n and y_scores[order[j + 1]] == y_scores[order[i]]:
            j += 1
        avg_rank = (i + j) / 2 + 1  # 1-based average rank of the tie group
        for k in range(i, j + 1):
            ranks[order[k]] = avg_rank
        i = j + 1

    rank_sum_pos = sum(rank for rank, label in zip(ranks, y_true) if label == 1)
    # U statistic, normalized by the number of (positive, negative) pairs.
    return (rank_sum_pos - n_pos * (n_pos + 1) / 2) / (n_pos * n_neg)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: ilovetools
|
|
3
|
-
Version: 0.1.
|
|
3
|
+
Version: 0.1.4
|
|
4
4
|
Summary: A comprehensive Python utility library with modular tools for AI/ML, data processing, and daily programming needs
|
|
5
5
|
Home-page: https://github.com/AliMehdi512/ilovetools
|
|
6
6
|
Author: Ali Mehdi
|
|
@@ -17,11 +17,14 @@ ilovetools/audio/__init__.py
|
|
|
17
17
|
ilovetools/automation/__init__.py
|
|
18
18
|
ilovetools/conversion/__init__.py
|
|
19
19
|
ilovetools/data/__init__.py
|
|
20
|
+
ilovetools/data/feature_engineering.py
|
|
20
21
|
ilovetools/data/preprocessing.py
|
|
21
22
|
ilovetools/database/__init__.py
|
|
22
23
|
ilovetools/datetime/__init__.py
|
|
23
24
|
ilovetools/files/__init__.py
|
|
24
25
|
ilovetools/image/__init__.py
|
|
26
|
+
ilovetools/ml/__init__.py
|
|
27
|
+
ilovetools/ml/metrics.py
|
|
25
28
|
ilovetools/security/__init__.py
|
|
26
29
|
ilovetools/text/__init__.py
|
|
27
30
|
ilovetools/utils/__init__.py
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "ilovetools"
|
|
7
|
-
version = "0.1.
|
|
7
|
+
version = "0.1.4"
|
|
8
8
|
description = "A comprehensive Python utility library with modular tools for AI/ML, data processing, and daily programming needs"
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.8"
|
|
@@ -5,7 +5,7 @@ with open("README.md", "r", encoding="utf-8") as fh:
|
|
|
5
5
|
|
|
6
6
|
setup(
|
|
7
7
|
name="ilovetools",
|
|
8
|
-
version="0.1.
|
|
8
|
+
version="0.1.4",
|
|
9
9
|
author="Ali Mehdi",
|
|
10
10
|
author_email="ali.mehdi.dev579@gmail.com",
|
|
11
11
|
description="A comprehensive Python utility library with modular tools for AI/ML, data processing, and daily programming needs",
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|