ilovetools 0.1.1__tar.gz → 0.1.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ilovetools-0.1.1/ilovetools.egg-info → ilovetools-0.1.3}/PKG-INFO +1 -1
- {ilovetools-0.1.1 → ilovetools-0.1.3}/ilovetools/__init__.py +1 -1
- {ilovetools-0.1.1 → ilovetools-0.1.3}/ilovetools/ai/__init__.py +3 -1
- ilovetools-0.1.3/ilovetools/ai/embeddings.py +270 -0
- ilovetools-0.1.3/ilovetools/data/__init__.py +27 -0
- ilovetools-0.1.3/ilovetools/data/feature_engineering.py +497 -0
- ilovetools-0.1.3/ilovetools/data/preprocessing.py +234 -0
- {ilovetools-0.1.1 → ilovetools-0.1.3/ilovetools.egg-info}/PKG-INFO +1 -1
- {ilovetools-0.1.1 → ilovetools-0.1.3}/ilovetools.egg-info/SOURCES.txt +2 -0
- {ilovetools-0.1.1 → ilovetools-0.1.3}/pyproject.toml +1 -1
- {ilovetools-0.1.1 → ilovetools-0.1.3}/setup.py +1 -1
- ilovetools-0.1.1/ilovetools/ai/embeddings.py +0 -5
- ilovetools-0.1.1/ilovetools/data/__init__.py +0 -5
- {ilovetools-0.1.1 → ilovetools-0.1.3}/LICENSE +0 -0
- {ilovetools-0.1.1 → ilovetools-0.1.3}/MANIFEST.in +0 -0
- {ilovetools-0.1.1 → ilovetools-0.1.3}/README.md +0 -0
- {ilovetools-0.1.1 → ilovetools-0.1.3}/ilovetools/ai/inference.py +0 -0
- {ilovetools-0.1.1 → ilovetools-0.1.3}/ilovetools/ai/llm_helpers.py +0 -0
- {ilovetools-0.1.1 → ilovetools-0.1.3}/ilovetools/audio/__init__.py +0 -0
- {ilovetools-0.1.1 → ilovetools-0.1.3}/ilovetools/automation/__init__.py +0 -0
- {ilovetools-0.1.1 → ilovetools-0.1.3}/ilovetools/conversion/__init__.py +0 -0
- {ilovetools-0.1.1 → ilovetools-0.1.3}/ilovetools/database/__init__.py +0 -0
- {ilovetools-0.1.1 → ilovetools-0.1.3}/ilovetools/datetime/__init__.py +0 -0
- {ilovetools-0.1.1 → ilovetools-0.1.3}/ilovetools/files/__init__.py +0 -0
- {ilovetools-0.1.1 → ilovetools-0.1.3}/ilovetools/image/__init__.py +0 -0
- {ilovetools-0.1.1 → ilovetools-0.1.3}/ilovetools/security/__init__.py +0 -0
- {ilovetools-0.1.1 → ilovetools-0.1.3}/ilovetools/text/__init__.py +0 -0
- {ilovetools-0.1.1 → ilovetools-0.1.3}/ilovetools/utils/__init__.py +0 -0
- {ilovetools-0.1.1 → ilovetools-0.1.3}/ilovetools/validation/__init__.py +0 -0
- {ilovetools-0.1.1 → ilovetools-0.1.3}/ilovetools/web/__init__.py +0 -0
- {ilovetools-0.1.1 → ilovetools-0.1.3}/ilovetools.egg-info/dependency_links.txt +0 -0
- {ilovetools-0.1.1 → ilovetools-0.1.3}/ilovetools.egg-info/top_level.txt +0 -0
- {ilovetools-0.1.1 → ilovetools-0.1.3}/requirements.txt +0 -0
- {ilovetools-0.1.1 → ilovetools-0.1.3}/setup.cfg +0 -0
- {ilovetools-0.1.1 → ilovetools-0.1.3}/tests/__init__.py +0 -0
{ilovetools-0.1.1/ilovetools.egg-info → ilovetools-0.1.3}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ilovetools
-Version: 0.1.1
+Version: 0.1.3
 Summary: A comprehensive Python utility library with modular tools for AI/ML, data processing, and daily programming needs
 Home-page: https://github.com/AliMehdi512/ilovetools
 Author: Ali Mehdi

{ilovetools-0.1.1 → ilovetools-0.1.3}/ilovetools/ai/__init__.py
@@ -3,9 +3,11 @@ AI & Machine Learning utilities module
 """
 
 from .llm_helpers import token_counter
-from .embeddings import
+from .embeddings import similarity_search, cosine_similarity
 from .inference import *
 
 __all__ = [
     'token_counter',
+    'similarity_search',
+    'cosine_similarity',
 ]

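With this change both helpers can be imported straight from the ai subpackage. A minimal sketch (not part of the release; the output follows from the implementation in embeddings.py below — orthogonal vectors have a dot product of 0):

>>> from ilovetools.ai import cosine_similarity, similarity_search
>>> cosine_similarity([1.0, 0.0], [0.0, 1.0])
0.0
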
ilovetools-0.1.3/ilovetools/ai/embeddings.py
@@ -0,0 +1,270 @@
+"""
+Embedding utilities for text and vector operations
+"""
+
+import numpy as np
+from typing import List, Union, Tuple, Dict
+import re
+
+__all__ = ['similarity_search', 'cosine_similarity']
+
+
+def cosine_similarity(vec1: Union[List[float], np.ndarray], vec2: Union[List[float], np.ndarray]) -> float:
+    """
+    Calculate cosine similarity between two vectors.
+
+    Args:
+        vec1: First vector
+        vec2: Second vector
+
+    Returns:
+        float: Cosine similarity score between -1 and 1
+    """
+    vec1 = np.array(vec1)
+    vec2 = np.array(vec2)
+
+    dot_product = np.dot(vec1, vec2)
+    norm1 = np.linalg.norm(vec1)
+    norm2 = np.linalg.norm(vec2)
+
+    if norm1 == 0 or norm2 == 0:
+        return 0.0
+
+    return float(dot_product / (norm1 * norm2))
+
+
+def similarity_search(
+    query: str,
+    documents: List[str],
+    top_k: int = 5,
+    method: str = "tfidf",
+    return_scores: bool = True
+) -> Union[List[str], List[Tuple[str, float]]]:
+    """
+    Find most similar documents to a query using various similarity methods.
+
+    This function performs semantic similarity search without requiring external APIs
+    or heavy ML models. Perfect for quick document retrieval, search functionality,
+    and finding relevant content.
+
+    Args:
+        query (str): Search query text
+        documents (list): List of document strings to search through
+        top_k (int): Number of top results to return. Default: 5
+        method (str): Similarity method to use:
+            - "tfidf": TF-IDF based similarity (default, fast)
+            - "jaccard": Jaccard similarity (word overlap)
+            - "levenshtein": Edit distance based similarity
+            - "ngram": N-gram based similarity
+        return_scores (bool): If True, returns (document, score) tuples.
+            If False, returns only documents. Default: True
+
+    Returns:
+        list: Top-k most similar documents
+            - If return_scores=True: [(doc, score), ...]
+            - If return_scores=False: [doc, ...]
+
+    Examples:
+        >>> from ilovetools.ai import similarity_search
+
+        # Basic usage
+        >>> docs = [
+        ...     "Python is a programming language",
+        ...     "Machine learning with Python",
+        ...     "Java programming basics",
+        ...     "Deep learning and AI"
+        ... ]
+        >>> results = similarity_search("Python ML", docs, top_k=2)
+        >>> print(results)
+        [('Machine learning with Python', 0.85), ('Python is a programming language', 0.72)]
+
+        # Without scores
+        >>> results = similarity_search("Python ML", docs, return_scores=False)
+        >>> print(results)
+        ['Machine learning with Python', 'Python is a programming language']
+
+        # Different methods
+        >>> results = similarity_search("Python", docs, method="jaccard")
+        >>> results = similarity_search("Python", docs, method="levenshtein")
+
+        # Real-world use case: FAQ search
+        >>> faqs = [
+        ...     "How do I reset my password?",
+        ...     "What is the refund policy?",
+        ...     "How to contact support?",
+        ...     "Where is my order?"
+        ... ]
+        >>> user_query = "forgot password"
+        >>> answer = similarity_search(user_query, faqs, top_k=1, return_scores=False)[0]
+        >>> print(answer)
+        'How do I reset my password?'
+
+    Notes:
+        - TF-IDF method is fastest and works well for most cases
+        - Jaccard is good for short texts and keyword matching
+        - Levenshtein is useful for typo-tolerant search
+        - No external dependencies or API calls required
+        - Works offline and is very fast
+
+    Performance:
+        - TF-IDF: O(n*m) where n=docs, m=avg words
+        - Jaccard: O(n*m)
+        - Levenshtein: O(n*m^2)
+    """
+
+    if not documents:
+        return []
+
+    if top_k > len(documents):
+        top_k = len(documents)
+
+    # Normalize query
+    query_lower = query.lower()
+
+    if method == "tfidf":
+        scores = _tfidf_similarity(query_lower, documents)
+    elif method == "jaccard":
+        scores = _jaccard_similarity(query_lower, documents)
+    elif method == "levenshtein":
+        scores = _levenshtein_similarity(query_lower, documents)
+    elif method == "ngram":
+        scores = _ngram_similarity(query_lower, documents)
+    else:
+        raise ValueError(f"Unknown method: {method}. Use 'tfidf', 'jaccard', 'levenshtein', or 'ngram'")
+
+    # Sort by score (descending)
+    doc_scores = list(zip(documents, scores))
+    doc_scores.sort(key=lambda x: x[1], reverse=True)
+
+    # Get top-k results
+    top_results = doc_scores[:top_k]
+
+    if return_scores:
+        return top_results
+    else:
+        return [doc for doc, _ in top_results]
+
+
+def _tfidf_similarity(query: str, documents: List[str]) -> List[float]:
+    """TF-IDF based similarity calculation."""
+    # Tokenize
+    query_words = set(re.findall(r'\w+', query.lower()))
+
+    if not query_words:
+        return [0.0] * len(documents)
+
+    # Calculate document frequencies
+    doc_freq = {}
+    for doc in documents:
+        doc_words = set(re.findall(r'\w+', doc.lower()))
+        for word in doc_words:
+            doc_freq[word] = doc_freq.get(word, 0) + 1
+
+    num_docs = len(documents)
+    scores = []
+
+    for doc in documents:
+        doc_words = re.findall(r'\w+', doc.lower())
+        doc_word_set = set(doc_words)
+
+        # Calculate TF-IDF score
+        score = 0.0
+        for word in query_words:
+            if word in doc_word_set:
+                # TF: term frequency in document
+                tf = doc_words.count(word) / len(doc_words) if doc_words else 0
+                # IDF: inverse document frequency
+                idf = np.log(num_docs / (doc_freq.get(word, 0) + 1))
+                score += tf * idf
+
+        scores.append(score)
+
+    return scores
+
+
+def _jaccard_similarity(query: str, documents: List[str]) -> List[float]:
+    """Jaccard similarity based on word overlap."""
+    query_words = set(re.findall(r'\w+', query.lower()))
+
+    if not query_words:
+        return [0.0] * len(documents)
+
+    scores = []
+    for doc in documents:
+        doc_words = set(re.findall(r'\w+', doc.lower()))
+
+        if not doc_words:
+            scores.append(0.0)
+            continue
+
+        intersection = len(query_words & doc_words)
+        union = len(query_words | doc_words)
+
+        score = intersection / union if union > 0 else 0.0
+        scores.append(score)
+
+    return scores
+
+
+def _levenshtein_distance(s1: str, s2: str) -> int:
+    """Calculate Levenshtein distance between two strings."""
+    if len(s1) < len(s2):
+        return _levenshtein_distance(s2, s1)
+
+    if len(s2) == 0:
+        return len(s1)
+
+    previous_row = range(len(s2) + 1)
+    for i, c1 in enumerate(s1):
+        current_row = [i + 1]
+        for j, c2 in enumerate(s2):
+            insertions = previous_row[j + 1] + 1
+            deletions = current_row[j] + 1
+            substitutions = previous_row[j] + (c1 != c2)
+            current_row.append(min(insertions, deletions, substitutions))
+        previous_row = current_row
+
+    return previous_row[-1]
+
+
+def _levenshtein_similarity(query: str, documents: List[str]) -> List[float]:
+    """Levenshtein distance based similarity."""
+    scores = []
+    for doc in documents:
+        doc_lower = doc.lower()
+        distance = _levenshtein_distance(query, doc_lower)
+        max_len = max(len(query), len(doc_lower))
+
+        # Convert distance to similarity (0 to 1)
+        similarity = 1 - (distance / max_len) if max_len > 0 else 0.0
+        scores.append(similarity)
+
+    return scores
+
+
+def _ngram_similarity(query: str, documents: List[str], n: int = 2) -> List[float]:
+    """N-gram based similarity."""
+    def get_ngrams(text: str, n: int) -> set:
+        text = text.lower()
+        return set(text[i:i+n] for i in range(len(text) - n + 1))
+
+    query_ngrams = get_ngrams(query, n)
+
+    if not query_ngrams:
+        return [0.0] * len(documents)
+
+    scores = []
+    for doc in documents:
+        doc_ngrams = get_ngrams(doc, n)
+
+        if not doc_ngrams:
+            scores.append(0.0)
+            continue
+
+        intersection = len(query_ngrams & doc_ngrams)
+        union = len(query_ngrams | doc_ngrams)
+
+        score = intersection / union if union > 0 else 0.0
+        scores.append(score)
+
+    return scores

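A quick sketch of the new search helper against a tiny corpus (illustrative only; the corpus is made up, and the result follows from the Jaccard implementation above):

>>> docs = ["install the package", "reset your password", "contact support"]
>>> similarity_search("password reset", docs, top_k=1, method="jaccard", return_scores=False)
['reset your password']
>>> # With return_scores=True (the default) each hit is a (document, score) tuple;
>>> # here the Jaccard score is 2 shared words out of 3 distinct words, i.e. 2/3.
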
ilovetools-0.1.3/ilovetools/data/__init__.py
@@ -0,0 +1,27 @@
+"""
+Data processing and manipulation utilities
+"""
+
+from .preprocessing import train_test_split, normalize_data, standardize_data
+from .feature_engineering import (
+    create_polynomial_features,
+    bin_numerical_feature,
+    one_hot_encode,
+    label_encode,
+    extract_datetime_features,
+    handle_missing_values,
+    create_interaction_features
+)
+
+__all__ = [
+    'train_test_split',
+    'normalize_data',
+    'standardize_data',
+    'create_polynomial_features',
+    'bin_numerical_feature',
+    'one_hot_encode',
+    'label_encode',
+    'extract_datetime_features',
+    'handle_missing_values',
+    'create_interaction_features',
+]

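All of the new data helpers are re-exported at the subpackage level, so a single import line is enough (sketch, not part of the release):

>>> from ilovetools.data import train_test_split, one_hot_encode, handle_missing_values
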
ilovetools-0.1.3/ilovetools/data/feature_engineering.py
@@ -0,0 +1,497 @@
+"""
+Feature engineering utilities for ML workflows
+"""
+
+from typing import List, Union, Dict, Tuple, Optional
+import numpy as np
+from datetime import datetime
+
+__all__ = [
+    'create_polynomial_features',
+    'bin_numerical_feature',
+    'one_hot_encode',
+    'label_encode',
+    'extract_datetime_features',
+    'handle_missing_values',
+    'create_interaction_features'
+]
+
+
+def create_polynomial_features(
+    data: Union[List[float], np.ndarray],
+    degree: int = 2,
+    include_bias: bool = False
+) -> List[List[float]]:
+    """
+    Create polynomial features from numerical data.
+
+    Transforms [x] into [x, x^2, x^3, ...] to capture non-linear relationships.
+    Essential for models that need to learn curved patterns.
+
+    Args:
+        data: List or array of numerical values
+        degree: Maximum polynomial degree. Default: 2
+        include_bias: Include bias term (column of 1s). Default: False
+
+    Returns:
+        list: Polynomial features as list of lists
+
+    Examples:
+        >>> from ilovetools.data import create_polynomial_features
+
+        # Basic usage
+        >>> ages = [20, 25, 30, 35, 40]
+        >>> poly_features = create_polynomial_features(ages, degree=2)
+        >>> print(poly_features)
+        [[20, 400], [25, 625], [30, 900], [35, 1225], [40, 1600]]
+
+        # With bias term
+        >>> poly_features = create_polynomial_features(ages, degree=2, include_bias=True)
+        >>> print(poly_features[0])
+        [1, 20, 400]
+
+        # Degree 3
+        >>> poly_features = create_polynomial_features([2, 3, 4], degree=3)
+        >>> print(poly_features)
+        [[2, 4, 8], [3, 9, 27], [4, 16, 64]]
+
+        # Real-world: Age features for insurance pricing
+        >>> customer_ages = [25, 35, 45, 55, 65]
+        >>> age_features = create_polynomial_features(customer_ages, degree=2)
+        # Now model can learn: premium = a*age + b*age^2
+
+    Notes:
+        - Useful for capturing non-linear relationships
+        - Common in regression problems
+        - Be careful with high degrees (overfitting risk)
+        - Normalize features after polynomial expansion
+    """
+    if isinstance(data, np.ndarray):
+        data = data.tolist()
+
+    result = []
+    for value in data:
+        features = []
+        if include_bias:
+            features.append(1)
+        for d in range(1, degree + 1):
+            features.append(value ** d)
+        result.append(features)
+
+    return result
+
+
+def bin_numerical_feature(
+    data: Union[List[float], np.ndarray],
+    bins: Union[int, List[float]] = 5,
+    labels: Optional[List[str]] = None
+) -> List[Union[int, str]]:
+    """
+    Bin continuous numerical data into discrete categories.
+
+    Converts continuous values into groups/bins. Useful for creating
+    categorical features from numerical data.
+
+    Args:
+        data: List or array of numerical values
+        bins: Number of equal-width bins OR list of bin edges
+        labels: Optional labels for bins. If None, returns bin indices
+
+    Returns:
+        list: Binned values (indices or labels)
+
+    Examples:
+        >>> from ilovetools.data import bin_numerical_feature
+
+        # Age groups
+        >>> ages = [5, 15, 25, 35, 45, 55, 65, 75]
+        >>> age_groups = bin_numerical_feature(
+        ...     ages,
+        ...     bins=[0, 18, 35, 60, 100],
+        ...     labels=["Child", "Young Adult", "Adult", "Senior"]
+        ... )
+        >>> print(age_groups)
+        ['Child', 'Child', 'Young Adult', 'Adult', 'Adult', 'Senior', 'Senior', 'Senior']
+
+        # Income brackets
+        >>> incomes = [25000, 45000, 65000, 85000, 120000]
+        >>> income_brackets = bin_numerical_feature(
+        ...     incomes,
+        ...     bins=[0, 40000, 80000, 150000],
+        ...     labels=["Low", "Medium", "High"]
+        ... )
+
+        # Equal-width bins
+        >>> scores = [45, 67, 89, 92, 78, 56, 34, 88]
+        >>> score_bins = bin_numerical_feature(scores, bins=3)
+        >>> print(score_bins)
+        [0, 1, 2, 2, 2, 1, 0, 2]
+
+    Notes:
+        - Useful for creating categorical features
+        - Helps models learn threshold effects
+        - Can reduce noise in continuous data
+        - Choose bins based on domain knowledge
+    """
+    if isinstance(data, np.ndarray):
+        data = data.tolist()
+
+    if isinstance(bins, int):
+        # Create equal-width bins
+        min_val = min(data)
+        max_val = max(data)
+        bin_width = (max_val - min_val) / bins
+        bin_edges = [min_val + i * bin_width for i in range(bins + 1)]
+        bin_edges[-1] += 0.001  # Ensure max value is included
+    else:
+        bin_edges = bins
+
+    result = []
+    for value in data:
+        bin_idx = 0
+        for i in range(len(bin_edges) - 1):
+            if bin_edges[i] <= value < bin_edges[i + 1]:
+                bin_idx = i
+                break
+
+        if labels:
+            result.append(labels[bin_idx])
+        else:
+            result.append(bin_idx)
+
+    return result
+
+
+def one_hot_encode(
+    data: List[str],
+    categories: Optional[List[str]] = None
+) -> Dict[str, List[int]]:
+    """
+    One-hot encode categorical data.
+
+    Converts categories into binary columns. Each category becomes
+    a separate binary feature.
+
+    Args:
+        data: List of categorical values
+        categories: Optional list of all possible categories
+
+    Returns:
+        dict: Dictionary with category names as keys, binary lists as values
+
+    Examples:
+        >>> from ilovetools.data import one_hot_encode
+
+        # Basic encoding
+        >>> colors = ["Red", "Blue", "Green", "Red", "Blue"]
+        >>> encoded = one_hot_encode(colors)
+        >>> print(encoded)
+        {'Red': [1, 0, 0, 1, 0], 'Blue': [0, 1, 0, 0, 1], 'Green': [0, 0, 1, 0, 0]}
+
+        # With predefined categories
+        >>> sizes = ["S", "M", "L", "M"]
+        >>> encoded = one_hot_encode(sizes, categories=["XS", "S", "M", "L", "XL"])
+
+        # Real-world: Product categories
+        >>> products = ["Electronics", "Clothing", "Electronics", "Food"]
+        >>> encoded = one_hot_encode(products)
+        # Use in ML: Each category becomes a feature
+
+    Notes:
+        - Standard encoding for categorical features
+        - Creates sparse features (mostly zeros)
+        - Number of features = number of categories
+        - Use for nominal categories (no order)
+    """
+    if categories is None:
+        categories = sorted(list(set(data)))
+
+    result = {cat: [] for cat in categories}
+
+    for value in data:
+        for cat in categories:
+            result[cat].append(1 if value == cat else 0)
+
+    return result
+
+
+def label_encode(data: List[str]) -> Tuple[List[int], Dict[str, int]]:
+    """
+    Label encode categorical data to integers.
+
+    Converts categories to integer labels. Useful for ordinal categories
+    or when one-hot encoding creates too many features.
+
+    Args:
+        data: List of categorical values
+
+    Returns:
+        tuple: (encoded_data, label_mapping)
+
+    Examples:
+        >>> from ilovetools.data import label_encode
+
+        # Basic encoding
+        >>> sizes = ["Small", "Large", "Medium", "Small", "Large"]
+        >>> encoded, mapping = label_encode(sizes)
+        >>> print(encoded)
+        [2, 0, 1, 2, 0]
+        >>> print(mapping)
+        {'Large': 0, 'Medium': 1, 'Small': 2}
+
+        # Education levels (ordinal)
+        >>> education = ["High School", "Bachelor", "Master", "Bachelor"]
+        >>> encoded, mapping = label_encode(education)
+
+        # Decode back
+        >>> reverse_mapping = {v: k for k, v in mapping.items()}
+        >>> original = [reverse_mapping[code] for code in encoded]
+
+    Notes:
+        - More memory efficient than one-hot encoding
+        - Use for ordinal categories (has order)
+        - Model may assume order exists
+        - Returns mapping for decoding
+    """
+    unique_values = sorted(list(set(data)))
+    mapping = {val: idx for idx, val in enumerate(unique_values)}
+    encoded = [mapping[val] for val in data]
+
+    return encoded, mapping
+
+
+def extract_datetime_features(
+    timestamps: List[str],
+    format: str = "%Y-%m-%d %H:%M:%S"
+) -> Dict[str, List[int]]:
+    """
+    Extract useful features from datetime strings.
+
+    Converts timestamps into multiple temporal features like hour,
+    day of week, month, etc. Essential for time-series ML.
+
+    Args:
+        timestamps: List of datetime strings
+        format: Datetime format string. Default: "%Y-%m-%d %H:%M:%S"
+
+    Returns:
+        dict: Dictionary with feature names and values
+
+    Examples:
+        >>> from ilovetools.data import extract_datetime_features
+
+        # Basic usage
+        >>> dates = [
+        ...     "2024-03-15 14:30:00",
+        ...     "2024-03-16 09:15:00",
+        ...     "2024-03-17 18:45:00"
+        ... ]
+        >>> features = extract_datetime_features(dates)
+        >>> print(features.keys())
+        dict_keys(['year', 'month', 'day', 'hour', 'minute', 'day_of_week', 'is_weekend'])
+
+        # E-commerce: Purchase patterns
+        >>> purchase_times = ["2024-12-25 10:30:00", "2024-12-26 15:45:00"]
+        >>> features = extract_datetime_features(purchase_times)
+        >>> print(features['is_weekend'])
+        [0, 0]
+        >>> print(features['hour'])
+        [10, 15]
+
+        # Different format
+        >>> dates = ["15/03/2024", "16/03/2024"]
+        >>> features = extract_datetime_features(dates, format="%d/%m/%Y")
+
+    Notes:
+        - Captures temporal patterns
+        - Essential for time-series forecasting
+        - Helps model learn seasonality
+        - Common features: hour, day, month, is_weekend
+    """
+    result = {
+        'year': [],
+        'month': [],
+        'day': [],
+        'hour': [],
+        'minute': [],
+        'day_of_week': [],  # 0=Monday, 6=Sunday
+        'is_weekend': []
+    }
+
+    for ts in timestamps:
+        dt = datetime.strptime(ts, format)
+        result['year'].append(dt.year)
+        result['month'].append(dt.month)
+        result['day'].append(dt.day)
+        result['hour'].append(dt.hour)
+        result['minute'].append(dt.minute)
+        result['day_of_week'].append(dt.weekday())
+        result['is_weekend'].append(1 if dt.weekday() >= 5 else 0)
+
+    return result
+
+
+def handle_missing_values(
+    data: List[Optional[float]],
+    strategy: str = "mean"
+) -> List[float]:
+    """
+    Handle missing values in numerical data.
+
+    Fills None/NaN values using various strategies. Essential
+    preprocessing step for ML models.
+
+    Args:
+        data: List with potential None values
+        strategy: Fill strategy - "mean", "median", "mode", "forward", "backward", "zero"
+
+    Returns:
+        list: Data with missing values filled
+
+    Examples:
+        >>> from ilovetools.data import handle_missing_values
+
+        # Mean imputation
+        >>> data = [1.0, 2.0, None, 4.0, 5.0]
+        >>> filled = handle_missing_values(data, strategy="mean")
+        >>> print(filled)
+        [1.0, 2.0, 3.0, 4.0, 5.0]
+
+        # Median imputation
+        >>> data = [1.0, 2.0, None, 100.0]
+        >>> filled = handle_missing_values(data, strategy="median")
+
+        # Forward fill
+        >>> data = [1.0, None, None, 4.0]
+        >>> filled = handle_missing_values(data, strategy="forward")
+        >>> print(filled)
+        [1.0, 1.0, 1.0, 4.0]
+
+        # Zero fill
+        >>> data = [1.0, None, 3.0]
+        >>> filled = handle_missing_values(data, strategy="zero")
+        >>> print(filled)
+        [1.0, 0.0, 3.0]
+
+    Notes:
+        - Most ML models can't handle missing values
+        - Choose strategy based on data distribution
+        - Mean: Sensitive to outliers
+        - Median: Robust to outliers
+        - Forward/Backward: For time-series data
+    """
+    valid_values = [x for x in data if x is not None]
+
+    if not valid_values:
+        return [0.0] * len(data)
+
+    if strategy == "mean":
+        fill_value = sum(valid_values) / len(valid_values)
+    elif strategy == "median":
+        sorted_vals = sorted(valid_values)
+        n = len(sorted_vals)
+        fill_value = sorted_vals[n // 2] if n % 2 else (sorted_vals[n // 2 - 1] + sorted_vals[n // 2]) / 2
+    elif strategy == "mode":
+        fill_value = max(set(valid_values), key=valid_values.count)
+    elif strategy == "zero":
+        fill_value = 0.0
+    else:
+        fill_value = sum(valid_values) / len(valid_values)
+
+    result = []
+    last_valid = fill_value
+
+    for value in data:
+        if value is None:
+            if strategy == "forward":
+                result.append(last_valid)
+            else:
+                result.append(fill_value)
+        else:
+            result.append(value)
+            last_valid = value
+
+    # Backward fill if needed
+    if strategy == "backward":
+        result_reversed = []
+        last_valid = fill_value
+        for value in reversed(result):
+            if value is None:
+                result_reversed.append(last_valid)
+            else:
+                result_reversed.append(value)
+                last_valid = value
+        result = list(reversed(result_reversed))
+
+    return result
+
+
+def create_interaction_features(
+    feature1: Union[List[float], np.ndarray],
+    feature2: Union[List[float], np.ndarray],
+    operation: str = "multiply"
+) -> List[float]:
+    """
+    Create interaction features between two features.
+
+    Combines two features to capture their joint effect. Useful when
+    features interact in meaningful ways.
+
+    Args:
+        feature1: First feature
+        feature2: Second feature
+        operation: "multiply", "add", "subtract", "divide"
+
+    Returns:
+        list: Interaction feature values
+
+    Examples:
+        >>> from ilovetools.data import create_interaction_features
+
+        # Multiply interaction
+        >>> height = [170, 180, 160, 175]
+        >>> weight = [70, 85, 60, 80]
+        >>> bmi_proxy = create_interaction_features(height, weight, "multiply")
+
+        # Real-world: Price per square foot
+        >>> prices = [300000, 450000, 250000]
+        >>> sqft = [1500, 2000, 1200]
+        >>> price_per_sqft = create_interaction_features(prices, sqft, "divide")
+        >>> print(price_per_sqft)
+        [200.0, 225.0, 208.33]
+
+        # Add interaction
+        >>> feature1 = [1, 2, 3]
+        >>> feature2 = [4, 5, 6]
+        >>> combined = create_interaction_features(feature1, feature2, "add")
+        >>> print(combined)
+        [5, 7, 9]
+
+    Notes:
+        - Captures feature interactions
+        - Common in real estate (price * sqft)
+        - Useful in e-commerce (quantity * price)
+        - Can significantly improve model performance
+    """
+    if isinstance(feature1, np.ndarray):
+        feature1 = feature1.tolist()
+    if isinstance(feature2, np.ndarray):
+        feature2 = feature2.tolist()
+
+    if len(feature1) != len(feature2):
+        raise ValueError("Features must have same length")
+
+    result = []
+    for v1, v2 in zip(feature1, feature2):
+        if operation == "multiply":
+            result.append(v1 * v2)
+        elif operation == "add":
+            result.append(v1 + v2)
+        elif operation == "subtract":
+            result.append(v1 - v2)
+        elif operation == "divide":
+            result.append(v1 / v2 if v2 != 0 else 0.0)
+        else:
+            result.append(v1 * v2)
+
+    return result

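A few deterministic sketches of the encoders added in this module (inputs are made up; outputs follow directly from the code above):

>>> one_hot_encode(["red", "green", "red"])
{'green': [0, 1, 0], 'red': [1, 0, 1]}
>>> label_encode(["S", "M", "S", "L"])
([2, 1, 2, 0], {'L': 0, 'M': 1, 'S': 2})
>>> create_interaction_features([2, 3], [10, 10], operation="multiply")
[20, 30]
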
ilovetools-0.1.3/ilovetools/data/preprocessing.py
@@ -0,0 +1,234 @@
+"""
+Data preprocessing utilities
+"""
+
+import random
+from typing import Tuple, List, Union, Optional
+import numpy as np
+
+__all__ = ['train_test_split', 'normalize_data', 'standardize_data']
+
+
+def train_test_split(
+    X: Union[List, np.ndarray],
+    y: Optional[Union[List, np.ndarray]] = None,
+    test_size: float = 0.2,
+    random_state: Optional[int] = None,
+    shuffle: bool = True,
+    stratify: bool = False
+) -> Union[Tuple[List, List], Tuple[List, List, List, List]]:
+    """
+    Split arrays or lists into random train and test subsets.
+
+    Perfect for ML workflows - implements the fundamental train-test split
+    pattern without requiring scikit-learn. Supports stratified splitting
+    to maintain class distribution.
+
+    Args:
+        X: Features array/list to split
+        y: Target array/list to split (optional)
+        test_size: Proportion of dataset for test set (0.0 to 1.0). Default: 0.2
+        random_state: Random seed for reproducibility. Default: None
+        shuffle: Whether to shuffle data before splitting. Default: True
+        stratify: Maintain class distribution in splits (requires y). Default: False
+
+    Returns:
+        If y is None: (X_train, X_test)
+        If y is provided: (X_train, X_test, y_train, y_test)
+
+    Examples:
+        >>> from ilovetools.data import train_test_split
+
+        # Basic split
+        >>> X = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]]
+        >>> y = [0, 1, 0, 1, 0]
+        >>> X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
+        >>> len(X_train), len(X_test)
+        (4, 1)
+
+        # With random seed for reproducibility
+        >>> X_train, X_test, y_train, y_test = train_test_split(
+        ...     X, y, test_size=0.3, random_state=42
+        ... )
+
+        # Stratified split (maintains class distribution)
+        >>> X_train, X_test, y_train, y_test = train_test_split(
+        ...     X, y, test_size=0.2, stratify=True, random_state=42
+        ... )
+
+        # Split features only (no labels)
+        >>> data = list(range(100))
+        >>> train, test = train_test_split(data, test_size=0.2)
+        >>> len(train), len(test)
+        (80, 20)
+
+        # Real-world example: Email spam detection
+        >>> emails = ["email1", "email2", "email3", "email4", "email5"]
+        >>> labels = [1, 0, 1, 0, 1]  # 1=spam, 0=not spam
+        >>> X_train, X_test, y_train, y_test = train_test_split(
+        ...     emails, labels, test_size=0.2, random_state=42
+        ... )
+
+        # 70-30 split
+        >>> X_train, X_test, y_train, y_test = train_test_split(
+        ...     X, y, test_size=0.3
+        ... )
+
+        # 60-20-20 split (train-val-test)
+        >>> X_temp, X_test, y_temp, y_test = train_test_split(
+        ...     X, y, test_size=0.2, random_state=42
+        ... )
+        >>> X_train, X_val, y_train, y_val = train_test_split(
+        ...     X_temp, y_temp, test_size=0.25, random_state=42  # 0.25 * 0.8 = 0.2
+        ... )
+
+    Notes:
+        - Always split data BEFORE any preprocessing to avoid data leakage
+        - Use random_state for reproducible results
+        - Stratified splitting ensures balanced class distribution
+        - Common splits: 80-20, 70-30, 60-20-20 (train-val-test)
+        - Test data should NEVER be seen during training
+
+    Raises:
+        ValueError: If test_size is not between 0 and 1
+        ValueError: If stratify=True but y is None
+        ValueError: If X and y have different lengths
+
+    References:
+        - scikit-learn train_test_split: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html
+        - ML best practices: https://developers.google.com/machine-learning/crash-course/training-and-test-sets/splitting-data
+    """
+
+    # Validation
+    if not 0 < test_size < 1:
+        raise ValueError(f"test_size must be between 0 and 1, got {test_size}")
+
+    if stratify and y is None:
+        raise ValueError("stratify=True requires y to be provided")
+
+    # Convert to lists if numpy arrays
+    if isinstance(X, np.ndarray):
+        X = X.tolist()
+    if y is not None and isinstance(y, np.ndarray):
+        y = y.tolist()
+
+    # Check lengths match
+    if y is not None and len(X) != len(y):
+        raise ValueError(f"X and y must have same length. Got X: {len(X)}, y: {len(y)}")
+
+    n_samples = len(X)
+    n_test = int(n_samples * test_size)
+    n_train = n_samples - n_test
+
+    # Set random seed
+    if random_state is not None:
+        random.seed(random_state)
+
+    # Create indices
+    indices = list(range(n_samples))
+
+    if stratify and y is not None:
+        # Stratified split - maintain class distribution
+        X_train, X_test = [], []
+        y_train, y_test = [], []
+
+        # Group indices by class
+        class_indices = {}
+        for idx, label in enumerate(y):
+            if label not in class_indices:
+                class_indices[label] = []
+            class_indices[label].append(idx)
+
+        # Split each class proportionally
+        for label, class_idx in class_indices.items():
+            if shuffle:
+                random.shuffle(class_idx)
+
+            n_class_test = max(1, int(len(class_idx) * test_size))
+
+            test_idx = class_idx[:n_class_test]
+            train_idx = class_idx[n_class_test:]
+
+            X_test.extend([X[i] for i in test_idx])
+            y_test.extend([y[i] for i in test_idx])
+            X_train.extend([X[i] for i in train_idx])
+            y_train.extend([y[i] for i in train_idx])
+
+        return X_train, X_test, y_train, y_test
+
+    else:
+        # Regular split
+        if shuffle:
+            random.shuffle(indices)
+
+        test_indices = indices[:n_test]
+        train_indices = indices[n_test:]
+
+        X_train = [X[i] for i in train_indices]
+        X_test = [X[i] for i in test_indices]
+
+        if y is not None:
+            y_train = [y[i] for i in train_indices]
+            y_test = [y[i] for i in test_indices]
+            return X_train, X_test, y_train, y_test
+        else:
+            return X_train, X_test
+
+
+def normalize_data(data: Union[List[float], np.ndarray]) -> List[float]:
+    """
+    Normalize data to range [0, 1] using min-max scaling.
+
+    Args:
+        data: List or array of numerical values
+
+    Returns:
+        list: Normalized values between 0 and 1
+
+    Example:
+        >>> from ilovetools.data import normalize_data
+        >>> data = [1, 2, 3, 4, 5]
+        >>> normalized = normalize_data(data)
+        >>> print(normalized)
+        [0.0, 0.25, 0.5, 0.75, 1.0]
+    """
+    if isinstance(data, np.ndarray):
+        data = data.tolist()
+
+    min_val = min(data)
+    max_val = max(data)
+
+    if max_val == min_val:
+        return [0.0] * len(data)
+
+    return [(x - min_val) / (max_val - min_val) for x in data]
+
+
+def standardize_data(data: Union[List[float], np.ndarray]) -> List[float]:
+    """
+    Standardize data to have mean=0 and std=1 (Z-score normalization).
+
+    Args:
+        data: List or array of numerical values
+
+    Returns:
+        list: Standardized values with mean=0, std=1
+
+    Example:
+        >>> from ilovetools.data import standardize_data
+        >>> data = [1, 2, 3, 4, 5]
+        >>> standardized = standardize_data(data)
+        >>> print(standardized)
+        [-1.414, -0.707, 0.0, 0.707, 1.414]
+    """
+    if isinstance(data, np.ndarray):
+        data = data.tolist()
+
+    mean = sum(data) / len(data)
+    variance = sum((x - mean) ** 2 for x in data) / len(data)
+    std = variance ** 0.5
+
+    if std == 0:
+        return [0.0] * len(data)
+
+    return [(x - mean) / std for x in data]

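A short sketch of the preprocessing helpers (made deterministic by passing shuffle=False; with the default shuffle=True the split is randomized):

>>> normalize_data([10, 20, 30])
[0.0, 0.5, 1.0]
>>> train, test = train_test_split(list(range(10)), test_size=0.3, shuffle=False)
>>> len(train), len(test)
(7, 3)
>>> test  # with shuffle=False the test slice is taken from the front of the data
[0, 1, 2]
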
{ilovetools-0.1.1 → ilovetools-0.1.3/ilovetools.egg-info}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ilovetools
-Version: 0.1.1
+Version: 0.1.3
 Summary: A comprehensive Python utility library with modular tools for AI/ML, data processing, and daily programming needs
 Home-page: https://github.com/AliMehdi512/ilovetools
 Author: Ali Mehdi

{ilovetools-0.1.1 → ilovetools-0.1.3}/ilovetools.egg-info/SOURCES.txt
@@ -17,6 +17,8 @@ ilovetools/audio/__init__.py
 ilovetools/automation/__init__.py
 ilovetools/conversion/__init__.py
 ilovetools/data/__init__.py
+ilovetools/data/feature_engineering.py
+ilovetools/data/preprocessing.py
 ilovetools/database/__init__.py
 ilovetools/datetime/__init__.py
 ilovetools/files/__init__.py

{ilovetools-0.1.1 → ilovetools-0.1.3}/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "ilovetools"
-version = "0.1.1"
+version = "0.1.3"
 description = "A comprehensive Python utility library with modular tools for AI/ML, data processing, and daily programming needs"
 readme = "README.md"
 requires-python = ">=3.8"

{ilovetools-0.1.1 → ilovetools-0.1.3}/setup.py
@@ -5,7 +5,7 @@ with open("README.md", "r", encoding="utf-8") as fh:
 
 setup(
     name="ilovetools",
-    version="0.1.
+    version="0.1.2",
     author="Ali Mehdi",
     author_email="ali.mehdi.dev579@gmail.com",
     description="A comprehensive Python utility library with modular tools for AI/ML, data processing, and daily programming needs",

All remaining files (listed above with +0 -0) are unchanged between 0.1.1 and 0.1.3.