ilovetools 0.2.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,497 @@
1
+ """
2
+ Feature engineering utilities for ML workflows
3
+ """
4
+
5
+ from typing import List, Union, Dict, Tuple, Optional
6
+ import numpy as np
7
+ from datetime import datetime
8
+
9
# Public API of this feature-engineering module.
__all__ = [
    'create_polynomial_features',
    'bin_numerical_feature',
    'one_hot_encode',
    'label_encode',
    'extract_datetime_features',
    'handle_missing_values',
    'create_interaction_features'
]
18
+
19
+
20
def create_polynomial_features(
    data: Union[List[float], np.ndarray],
    degree: int = 2,
    include_bias: bool = False
) -> List[List[float]]:
    """Expand each value into its polynomial powers.

    Maps every input value x to the row [x, x**2, ..., x**degree],
    optionally prefixed with a bias term of 1. Lets linear models fit
    curved (non-linear) relationships.

    Args:
        data: Numerical values as a list or numpy array.
        degree: Highest power to generate. Default: 2.
        include_bias: Prepend a constant 1 to every row. Default: False.

    Returns:
        list: One list of polynomial features per input value.

    Examples:
        >>> create_polynomial_features([2, 3, 4], degree=3)
        [[2, 4, 8], [3, 9, 27], [4, 16, 64]]

        >>> create_polynomial_features([20], degree=2, include_bias=True)
        [[1, 20, 400]]

    Notes:
        - Captures non-linear patterns for regression problems.
        - High degrees risk overfitting; normalize after expansion.
    """
    # Work on a plain Python list regardless of the input container.
    if isinstance(data, np.ndarray):
        data = data.tolist()

    poly_rows = []
    for x in data:
        row = [1] if include_bias else []
        # Powers 1..degree, in increasing order.
        row.extend(x ** power for power in range(1, degree + 1))
        poly_rows.append(row)

    return poly_rows
82
+
83
+
84
def bin_numerical_feature(
    data: Union[List[float], np.ndarray],
    bins: Union[int, List[float]] = 5,
    labels: Optional[List[str]] = None
) -> List[Union[int, str]]:
    """Bin continuous numerical data into discrete categories.

    Bins are left-inclusive / right-exclusive ([edge_i, edge_{i+1})),
    except the last bin, which also includes its right edge. Values
    outside the edge range are clamped into the nearest (first or last)
    bin — previously they silently fell into bin 0.

    Args:
        data: Numerical values as a list or numpy array.
        bins: Number of equal-width bins, or an explicit list of
            monotonically increasing bin edges.
        labels: Optional labels for bins. If None, returns bin indices.

    Returns:
        list: Binned values (indices or labels), parallel to the input.

    Examples:
        >>> bin_numerical_feature(
        ...     [5, 15, 25, 35, 45, 55, 65, 75],
        ...     bins=[0, 18, 35, 60, 100],
        ...     labels=["Child", "Young Adult", "Adult", "Senior"]
        ... )
        ['Child', 'Child', 'Young Adult', 'Adult', 'Adult', 'Adult', 'Senior', 'Senior']

        >>> bin_numerical_feature([45, 67, 89, 92, 78, 56, 34, 88], bins=3)
        [0, 1, 2, 2, 2, 1, 0, 2]

    Notes:
        - Useful for creating categorical features from continuous data.
        - Helps models learn threshold effects.
        - Choose explicit edges from domain knowledge when possible.
    """
    if isinstance(data, np.ndarray):
        data = data.tolist()

    if isinstance(bins, int):
        # Equal-width edges spanning the observed range.
        min_val = min(data)
        max_val = max(data)
        bin_width = (max_val - min_val) / bins
        bin_edges = [min_val + i * bin_width for i in range(bins + 1)]
    else:
        bin_edges = list(bins)

    n_bins = len(bin_edges) - 1

    result = []
    for value in data:
        if value >= bin_edges[-1]:
            # Clamp values at/above the last edge into the last bin
            # (this also makes the maximum of the data fall in-range
            # without the old +0.001 edge fudge).
            bin_idx = n_bins - 1
        elif value < bin_edges[0]:
            # Clamp values below the first edge into the first bin.
            bin_idx = 0
        else:
            bin_idx = 0
            for i in range(n_bins):
                # Interval test: [edge_i, edge_{i+1})
                if bin_edges[i] <= value < bin_edges[i + 1]:
                    bin_idx = i
                    break

        result.append(labels[bin_idx] if labels else bin_idx)

    return result
163
+
164
+
165
def one_hot_encode(
    data: List[str],
    categories: Optional[List[str]] = None
) -> Dict[str, List[int]]:
    """One-hot encode a list of categorical values.

    Produces one binary column per category; entry i of a column is 1
    exactly when data[i] equals that category.

    Args:
        data: Categorical values to encode.
        categories: Full set of categories to emit columns for. When
            omitted, the sorted unique values of ``data`` are used.

    Returns:
        dict: Maps each category name to its binary indicator column.

    Examples:
        >>> one_hot_encode(["Red", "Blue", "Red"])
        {'Blue': [0, 1, 0], 'Red': [1, 0, 1]}

        >>> one_hot_encode(["S", "M"], categories=["XS", "S", "M"])
        {'XS': [0, 0], 'S': [1, 0], 'M': [0, 1]}

    Notes:
        - Standard encoding for nominal (unordered) categories.
        - Creates sparse features (mostly zeros); one column per category.
        - Values absent from ``categories`` encode as all-zero rows.
    """
    if categories is None:
        categories = sorted(set(data))

    # One indicator column per category, in category order.
    return {
        category: [int(value == category) for value in data]
        for category in categories
    }
216
+
217
+
218
def label_encode(data: List[str]) -> Tuple[List[int], Dict[str, int]]:
    """Encode categorical values as integer labels.

    Unique values are sorted alphabetically and numbered from 0, so the
    encoding is deterministic for a given set of values.

    Args:
        data: Categorical values to encode.

    Returns:
        tuple: (encoded values, mapping from value to integer label).

    Examples:
        >>> label_encode(["Small", "Large", "Medium", "Small"])
        ([2, 0, 1, 2], {'Large': 0, 'Medium': 1, 'Small': 2})

        Decode back:
        >>> encoded, mapping = label_encode(["a", "b"])
        >>> inverse = {code: label for label, code in mapping.items()}
        >>> [inverse[c] for c in encoded]
        ['a', 'b']

    Notes:
        - More memory efficient than one-hot encoding.
        - Best for ordinal categories; a model may assume order exists.
    """
    mapping = {label: code for code, label in enumerate(sorted(set(data)))}
    return [mapping[label] for label in data], mapping
261
+
262
+
263
def extract_datetime_features(
    timestamps: List[str],
    format: str = "%Y-%m-%d %H:%M:%S"
) -> Dict[str, List[int]]:
    """Parse timestamp strings into numeric temporal features.

    Each timestamp contributes one entry to every feature list: year,
    month, day, hour, minute, day_of_week (0=Monday .. 6=Sunday), and
    is_weekend (1 for Saturday/Sunday, else 0).

    Args:
        timestamps: Datetime strings to parse.
        format: strptime-style format string.
            Default: "%Y-%m-%d %H:%M:%S".

    Returns:
        dict: Feature name -> list of integer values, parallel to input.

    Examples:
        >>> feats = extract_datetime_features(["2024-03-16 09:15:00"])
        >>> (feats['day_of_week'], feats['is_weekend'])
        ([5], [1])

        >>> extract_datetime_features(["15/03/2024"], format="%d/%m/%Y")['day']
        [15]

    Notes:
        - Raises ValueError when a timestamp does not match ``format``.
        - Hour/weekday/weekend features help models capture seasonality.
    """
    feature_names = ('year', 'month', 'day', 'hour', 'minute',
                     'day_of_week', 'is_weekend')
    features: Dict[str, List[int]] = {name: [] for name in feature_names}

    for stamp in timestamps:
        parsed = datetime.strptime(stamp, format)
        weekday = parsed.weekday()  # 0=Monday .. 6=Sunday
        features['year'].append(parsed.year)
        features['month'].append(parsed.month)
        features['day'].append(parsed.day)
        features['hour'].append(parsed.hour)
        features['minute'].append(parsed.minute)
        features['day_of_week'].append(weekday)
        features['is_weekend'].append(int(weekday >= 5))

    return features
332
+
333
+
334
def handle_missing_values(
    data: List[Optional[float]],
    strategy: str = "mean"
) -> List[float]:
    """Fill None entries in numerical data.

    Supported strategies:
        - "mean": average of the non-missing values (also the fallback
          for unrecognized strategy names).
        - "median": middle value, robust to outliers.
        - "mode": most frequent value.
        - "zero": constant 0.0.
        - "forward": carry the previous observed value forward; leading
          gaps fall back to the mean.
        - "backward": carry the next observed value backward; trailing
          gaps fall back to the mean.

    Args:
        data: Values with possible None gaps.
        strategy: Fill strategy name. Default: "mean".

    Returns:
        list: Data with every None replaced.

    Examples:
        >>> handle_missing_values([1.0, 2.0, None, 4.0, 5.0], strategy="mean")
        [1.0, 2.0, 3.0, 4.0, 5.0]

        >>> handle_missing_values([1.0, None, None, 4.0], strategy="forward")
        [1.0, 1.0, 1.0, 4.0]

        >>> handle_missing_values([1.0, None, None, 4.0], strategy="backward")
        [1.0, 4.0, 4.0, 4.0]

        >>> handle_missing_values([1.0, None, 3.0], strategy="zero")
        [1.0, 0.0, 3.0]

    Notes:
        - Most ML models can't handle missing values.
        - Mean is sensitive to outliers; median is robust.
        - Forward/backward fills suit time-series data.
        - If every value is missing, returns all zeros.
    """
    valid_values = [x for x in data if x is not None]

    # Nothing to impute from: fall back to zeros.
    if not valid_values:
        return [0.0] * len(data)

    if strategy == "median":
        sorted_vals = sorted(valid_values)
        n = len(sorted_vals)
        fill_value = sorted_vals[n // 2] if n % 2 else (sorted_vals[n // 2 - 1] + sorted_vals[n // 2]) / 2
    elif strategy == "mode":
        fill_value = max(set(valid_values), key=valid_values.count)
    elif strategy == "zero":
        fill_value = 0.0
    else:
        # "mean", "forward", "backward", and unknown strategies all use
        # the mean; forward/backward use it only for edge gaps that have
        # no observed neighbor.
        fill_value = sum(valid_values) / len(valid_values)

    if strategy == "backward":
        # Walk right-to-left so each gap takes the NEXT observed value.
        # (Previously the forward loop had already replaced every None
        # with the mean before this pass ran, so backward fill was dead
        # code and behaved like mean imputation — fixed.)
        filled_reversed = []
        next_valid = fill_value
        for value in reversed(data):
            if value is None:
                filled_reversed.append(next_valid)
            else:
                filled_reversed.append(value)
                next_valid = value
        return list(reversed(filled_reversed))

    result = []
    last_valid = fill_value
    for value in data:
        if value is None:
            # "forward" carries the last observed value; every other
            # strategy uses the precomputed fill value.
            result.append(last_valid if strategy == "forward" else fill_value)
        else:
            result.append(value)
            last_valid = value

    return result
427
+
428
+
429
def create_interaction_features(
    feature1: Union[List[float], np.ndarray],
    feature2: Union[List[float], np.ndarray],
    operation: str = "multiply"
) -> List[float]:
    """Combine two features element-wise into an interaction feature.

    Captures the joint effect of two features — e.g. quantity * price,
    or price / sqft.

    Args:
        feature1: First feature values.
        feature2: Second feature values (same length as feature1).
        operation: One of "multiply", "add", "subtract", "divide".

    Returns:
        list: Element-wise combination of the two features.

    Raises:
        ValueError: If the features differ in length, or the operation
            name is not recognized. (Previously an unknown operation
            silently fell back to multiplication, hiding caller typos.)

    Examples:
        >>> create_interaction_features([1, 2, 3], [4, 5, 6], "add")
        [5, 7, 9]

        >>> create_interaction_features([300000, 450000], [1500, 2000], "divide")
        [200.0, 225.0]

    Notes:
        - Division by zero yields 0.0 rather than raising.
        - Interaction terms can significantly improve model performance.
    """
    if isinstance(feature1, np.ndarray):
        feature1 = feature1.tolist()
    if isinstance(feature2, np.ndarray):
        feature2 = feature2.tolist()

    if len(feature1) != len(feature2):
        raise ValueError("Features must have same length")

    # Dispatch table keeps the per-element loop branch-free and makes
    # the set of supported operations explicit.
    operations = {
        "multiply": lambda a, b: a * b,
        "add": lambda a, b: a + b,
        "subtract": lambda a, b: a - b,
        "divide": lambda a, b: a / b if b != 0 else 0.0,
    }
    if operation not in operations:
        raise ValueError(
            f"Unknown operation: {operation!r}. "
            "Expected 'multiply', 'add', 'subtract', or 'divide'."
        )

    combine = operations[operation]
    return [combine(v1, v2) for v1, v2 in zip(feature1, feature2)]