ilovetools 0.1.1__tar.gz → 0.1.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35) hide show
  1. {ilovetools-0.1.1/ilovetools.egg-info → ilovetools-0.1.3}/PKG-INFO +1 -1
  2. {ilovetools-0.1.1 → ilovetools-0.1.3}/ilovetools/__init__.py +1 -1
  3. {ilovetools-0.1.1 → ilovetools-0.1.3}/ilovetools/ai/__init__.py +3 -1
  4. ilovetools-0.1.3/ilovetools/ai/embeddings.py +270 -0
  5. ilovetools-0.1.3/ilovetools/data/__init__.py +27 -0
  6. ilovetools-0.1.3/ilovetools/data/feature_engineering.py +497 -0
  7. ilovetools-0.1.3/ilovetools/data/preprocessing.py +234 -0
  8. {ilovetools-0.1.1 → ilovetools-0.1.3/ilovetools.egg-info}/PKG-INFO +1 -1
  9. {ilovetools-0.1.1 → ilovetools-0.1.3}/ilovetools.egg-info/SOURCES.txt +2 -0
  10. {ilovetools-0.1.1 → ilovetools-0.1.3}/pyproject.toml +1 -1
  11. {ilovetools-0.1.1 → ilovetools-0.1.3}/setup.py +1 -1
  12. ilovetools-0.1.1/ilovetools/ai/embeddings.py +0 -5
  13. ilovetools-0.1.1/ilovetools/data/__init__.py +0 -5
  14. {ilovetools-0.1.1 → ilovetools-0.1.3}/LICENSE +0 -0
  15. {ilovetools-0.1.1 → ilovetools-0.1.3}/MANIFEST.in +0 -0
  16. {ilovetools-0.1.1 → ilovetools-0.1.3}/README.md +0 -0
  17. {ilovetools-0.1.1 → ilovetools-0.1.3}/ilovetools/ai/inference.py +0 -0
  18. {ilovetools-0.1.1 → ilovetools-0.1.3}/ilovetools/ai/llm_helpers.py +0 -0
  19. {ilovetools-0.1.1 → ilovetools-0.1.3}/ilovetools/audio/__init__.py +0 -0
  20. {ilovetools-0.1.1 → ilovetools-0.1.3}/ilovetools/automation/__init__.py +0 -0
  21. {ilovetools-0.1.1 → ilovetools-0.1.3}/ilovetools/conversion/__init__.py +0 -0
  22. {ilovetools-0.1.1 → ilovetools-0.1.3}/ilovetools/database/__init__.py +0 -0
  23. {ilovetools-0.1.1 → ilovetools-0.1.3}/ilovetools/datetime/__init__.py +0 -0
  24. {ilovetools-0.1.1 → ilovetools-0.1.3}/ilovetools/files/__init__.py +0 -0
  25. {ilovetools-0.1.1 → ilovetools-0.1.3}/ilovetools/image/__init__.py +0 -0
  26. {ilovetools-0.1.1 → ilovetools-0.1.3}/ilovetools/security/__init__.py +0 -0
  27. {ilovetools-0.1.1 → ilovetools-0.1.3}/ilovetools/text/__init__.py +0 -0
  28. {ilovetools-0.1.1 → ilovetools-0.1.3}/ilovetools/utils/__init__.py +0 -0
  29. {ilovetools-0.1.1 → ilovetools-0.1.3}/ilovetools/validation/__init__.py +0 -0
  30. {ilovetools-0.1.1 → ilovetools-0.1.3}/ilovetools/web/__init__.py +0 -0
  31. {ilovetools-0.1.1 → ilovetools-0.1.3}/ilovetools.egg-info/dependency_links.txt +0 -0
  32. {ilovetools-0.1.1 → ilovetools-0.1.3}/ilovetools.egg-info/top_level.txt +0 -0
  33. {ilovetools-0.1.1 → ilovetools-0.1.3}/requirements.txt +0 -0
  34. {ilovetools-0.1.1 → ilovetools-0.1.3}/setup.cfg +0 -0
  35. {ilovetools-0.1.1 → ilovetools-0.1.3}/tests/__init__.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: ilovetools
3
- Version: 0.1.1
3
+ Version: 0.1.3
4
4
  Summary: A comprehensive Python utility library with modular tools for AI/ML, data processing, and daily programming needs
5
5
  Home-page: https://github.com/AliMehdi512/ilovetools
6
6
  Author: Ali Mehdi
@@ -2,7 +2,7 @@
2
2
  ilovetools - A comprehensive Python utility library
3
3
  """
4
4
 
5
- __version__ = "0.1.1"
5
+ __version__ = "0.1.3"
6
6
  __author__ = "Ali Mehdi"
7
7
  __email__ = "ali.mehdi.dev579@gmail.com"
8
8
 
@@ -3,9 +3,11 @@ AI & Machine Learning utilities module
3
3
  """
4
4
 
5
5
  from .llm_helpers import token_counter
6
- from .embeddings import *
6
+ from .embeddings import similarity_search, cosine_similarity
7
7
  from .inference import *
8
8
 
9
9
  __all__ = [
10
10
  'token_counter',
11
+ 'similarity_search',
12
+ 'cosine_similarity',
11
13
  ]
@@ -0,0 +1,270 @@
1
+ """
2
+ Embedding utilities for text and vector operations
3
+ """
4
+
5
+ import numpy as np
6
+ from typing import List, Union, Tuple, Dict
7
+ import re
8
+
9
+ __all__ = ['similarity_search', 'cosine_similarity']
10
+
11
+
12
def cosine_similarity(vec1: Union[List[float], np.ndarray], vec2: Union[List[float], np.ndarray]) -> float:
    """
    Compute the cosine similarity of two vectors.

    Args:
        vec1: First vector
        vec2: Second vector

    Returns:
        float: Similarity in [-1, 1]; 0.0 when either vector has zero norm
    """
    a = np.asarray(vec1)
    b = np.asarray(vec2)

    denom = np.linalg.norm(a) * np.linalg.norm(b)
    # Guard against division by zero for degenerate (all-zero) vectors.
    if denom == 0:
        return 0.0

    return float(np.dot(a, b) / denom)
+
35
+
36
def similarity_search(
    query: str,
    documents: List[str],
    top_k: int = 5,
    method: str = "tfidf",
    return_scores: bool = True
) -> Union[List[str], List[Tuple[str, float]]]:
    """
    Rank documents by similarity to a query and return the best matches.

    Lightweight, offline document retrieval — no external APIs or heavy ML
    models. Useful for search boxes, FAQ matching and content lookup.

    Args:
        query (str): Search query text
        documents (list): Candidate document strings
        top_k (int): Maximum number of results. Default: 5
        method (str): One of "tfidf" (default, fast), "jaccard" (word
            overlap), "levenshtein" (typo-tolerant), or "ngram"
        return_scores (bool): Return (document, score) tuples when True,
            bare documents when False. Default: True

    Returns:
        list: Top-k documents, most similar first

    Raises:
        ValueError: If *method* is not one of the supported names
    """
    if not documents:
        return []

    top_k = min(top_k, len(documents))

    scorers = {
        "tfidf": _tfidf_similarity,
        "jaccard": _jaccard_similarity,
        "levenshtein": _levenshtein_similarity,
        "ngram": _ngram_similarity,
    }
    if method not in scorers:
        raise ValueError(f"Unknown method: {method}. Use 'tfidf', 'jaccard', 'levenshtein', or 'ngram'")

    # Every scorer expects a lower-cased query.
    scores = scorers[method](query.lower(), documents)

    # Stable sort: ties keep their original document order.
    ranked = sorted(zip(documents, scores), key=lambda pair: pair[1], reverse=True)
    top_results = ranked[:top_k]

    if return_scores:
        return top_results
    return [doc for doc, _ in top_results]
146
+
147
+
148
+ def _tfidf_similarity(query: str, documents: List[str]) -> List[float]:
149
+ """TF-IDF based similarity calculation."""
150
+ # Tokenize
151
+ query_words = set(re.findall(r'\w+', query.lower()))
152
+
153
+ if not query_words:
154
+ return [0.0] * len(documents)
155
+
156
+ # Calculate document frequencies
157
+ doc_freq = {}
158
+ for doc in documents:
159
+ doc_words = set(re.findall(r'\w+', doc.lower()))
160
+ for word in doc_words:
161
+ doc_freq[word] = doc_freq.get(word, 0) + 1
162
+
163
+ num_docs = len(documents)
164
+ scores = []
165
+
166
+ for doc in documents:
167
+ doc_words = re.findall(r'\w+', doc.lower())
168
+ doc_word_set = set(doc_words)
169
+
170
+ # Calculate TF-IDF score
171
+ score = 0.0
172
+ for word in query_words:
173
+ if word in doc_word_set:
174
+ # TF: term frequency in document
175
+ tf = doc_words.count(word) / len(doc_words) if doc_words else 0
176
+ # IDF: inverse document frequency
177
+ idf = np.log(num_docs / (doc_freq.get(word, 0) + 1))
178
+ score += tf * idf
179
+
180
+ scores.append(score)
181
+
182
+ return scores
183
+
184
+
185
+ def _jaccard_similarity(query: str, documents: List[str]) -> List[float]:
186
+ """Jaccard similarity based on word overlap."""
187
+ query_words = set(re.findall(r'\w+', query.lower()))
188
+
189
+ if not query_words:
190
+ return [0.0] * len(documents)
191
+
192
+ scores = []
193
+ for doc in documents:
194
+ doc_words = set(re.findall(r'\w+', doc.lower()))
195
+
196
+ if not doc_words:
197
+ scores.append(0.0)
198
+ continue
199
+
200
+ intersection = len(query_words & doc_words)
201
+ union = len(query_words | doc_words)
202
+
203
+ score = intersection / union if union > 0 else 0.0
204
+ scores.append(score)
205
+
206
+ return scores
207
+
208
+
209
+ def _levenshtein_distance(s1: str, s2: str) -> int:
210
+ """Calculate Levenshtein distance between two strings."""
211
+ if len(s1) < len(s2):
212
+ return _levenshtein_distance(s2, s1)
213
+
214
+ if len(s2) == 0:
215
+ return len(s1)
216
+
217
+ previous_row = range(len(s2) + 1)
218
+ for i, c1 in enumerate(s1):
219
+ current_row = [i + 1]
220
+ for j, c2 in enumerate(s2):
221
+ insertions = previous_row[j + 1] + 1
222
+ deletions = current_row[j] + 1
223
+ substitutions = previous_row[j] + (c1 != c2)
224
+ current_row.append(min(insertions, deletions, substitutions))
225
+ previous_row = current_row
226
+
227
+ return previous_row[-1]
228
+
229
+
230
def _levenshtein_similarity(query: str, documents: List[str]) -> List[float]:
    """Similarity derived from normalised edit distance (1.0 = identical)."""
    results = []
    for doc in documents:
        candidate = doc.lower()
        longest = max(len(query), len(candidate))

        if longest == 0:
            # Both strings empty: define similarity as 0, matching the
            # original's guard against division by zero.
            results.append(0.0)
        else:
            dist = _levenshtein_distance(query, candidate)
            results.append(1 - dist / longest)

    return results
243
+
244
+
245
+ def _ngram_similarity(query: str, documents: List[str], n: int = 2) -> List[float]:
246
+ """N-gram based similarity."""
247
+ def get_ngrams(text: str, n: int) -> set:
248
+ text = text.lower()
249
+ return set(text[i:i+n] for i in range(len(text) - n + 1))
250
+
251
+ query_ngrams = get_ngrams(query, n)
252
+
253
+ if not query_ngrams:
254
+ return [0.0] * len(documents)
255
+
256
+ scores = []
257
+ for doc in documents:
258
+ doc_ngrams = get_ngrams(doc, n)
259
+
260
+ if not doc_ngrams:
261
+ scores.append(0.0)
262
+ continue
263
+
264
+ intersection = len(query_ngrams & doc_ngrams)
265
+ union = len(query_ngrams | doc_ngrams)
266
+
267
+ score = intersection / union if union > 0 else 0.0
268
+ scores.append(score)
269
+
270
+ return scores
@@ -0,0 +1,27 @@
1
+ """
2
+ Data processing and manipulation utilities
3
+ """
4
+
5
+ from .preprocessing import train_test_split, normalize_data, standardize_data
6
+ from .feature_engineering import (
7
+ create_polynomial_features,
8
+ bin_numerical_feature,
9
+ one_hot_encode,
10
+ label_encode,
11
+ extract_datetime_features,
12
+ handle_missing_values,
13
+ create_interaction_features
14
+ )
15
+
16
+ __all__ = [
17
+ 'train_test_split',
18
+ 'normalize_data',
19
+ 'standardize_data',
20
+ 'create_polynomial_features',
21
+ 'bin_numerical_feature',
22
+ 'one_hot_encode',
23
+ 'label_encode',
24
+ 'extract_datetime_features',
25
+ 'handle_missing_values',
26
+ 'create_interaction_features',
27
+ ]
@@ -0,0 +1,497 @@
1
+ """
2
+ Feature engineering utilities for ML workflows
3
+ """
4
+
5
+ from typing import List, Union, Dict, Tuple, Optional
6
+ import numpy as np
7
+ from datetime import datetime
8
+
9
+ __all__ = [
10
+ 'create_polynomial_features',
11
+ 'bin_numerical_feature',
12
+ 'one_hot_encode',
13
+ 'label_encode',
14
+ 'extract_datetime_features',
15
+ 'handle_missing_values',
16
+ 'create_interaction_features'
17
+ ]
18
+
19
+
20
def create_polynomial_features(
    data: Union[List[float], np.ndarray],
    degree: int = 2,
    include_bias: bool = False
) -> List[List[float]]:
    """
    Expand each value into its powers [x, x^2, ..., x^degree].

    Lets linear models capture non-linear (curved) relationships.

    Args:
        data: Numerical values to expand
        degree: Highest power to generate. Default: 2
        include_bias: Prepend a constant 1 column. Default: False

    Returns:
        list: One list of powers per input value

    Examples:
        >>> create_polynomial_features([2, 3, 4], degree=3)
        [[2, 4, 8], [3, 9, 27], [4, 16, 64]]
        >>> create_polynomial_features([20], degree=2, include_bias=True)
        [[1, 20, 400]]

    Notes:
        - High degrees risk overfitting; normalise after expansion.
    """
    values = data.tolist() if isinstance(data, np.ndarray) else data

    prefix = [1] if include_bias else []
    return [prefix + [v ** d for d in range(1, degree + 1)] for v in values]
82
+
83
+
84
def bin_numerical_feature(
    data: Union[List[float], np.ndarray],
    bins: Union[int, List[float]] = 5,
    labels: Optional[List[str]] = None
) -> List[Union[int, str]]:
    """
    Bin continuous numerical data into discrete categories.

    Bins are half-open [edge_i, edge_{i+1}) except the last bin, which is
    closed on the right so the maximum value is included. Out-of-range
    values are clamped: below the first edge -> first bin, at/above the
    last edge -> last bin.

    Bug fix: previously a value equal to (or beyond) the last explicit bin
    edge silently fell into bin 0, and equal-width binning relied on a
    fragile `+= 0.001` epsilon on the top edge. The right-closed last bin
    handles both correctly.

    Args:
        data: List or array of numerical values
        bins: Number of equal-width bins OR list of bin edges
        labels: Optional labels for bins. If None, returns bin indices

    Returns:
        list: Binned values (indices or labels)

    Examples:
        >>> bin_numerical_feature([5, 100], bins=[0, 18, 35, 60, 100],
        ...                       labels=["Child", "YA", "Adult", "Senior"])
        ['Child', 'Senior']
    """
    if isinstance(data, np.ndarray):
        data = data.tolist()

    if isinstance(bins, int):
        # Build equal-width edges spanning the observed range.
        min_val = min(data)
        max_val = max(data)
        bin_width = (max_val - min_val) / bins
        bin_edges = [min_val + i * bin_width for i in range(bins + 1)]
    else:
        bin_edges = list(bins)

    last_bin = len(bin_edges) - 2

    result = []
    for value in data:
        if value >= bin_edges[-1]:
            # Right-closed last bin: the maximum (and anything above the
            # top edge) lands in the final bin instead of bin 0.
            bin_idx = last_bin
        else:
            bin_idx = 0  # values below the first edge clamp to bin 0
            for i in range(len(bin_edges) - 1):
                if bin_edges[i] <= value < bin_edges[i + 1]:
                    bin_idx = i
                    break

        result.append(labels[bin_idx] if labels else bin_idx)

    return result
163
+
164
+
165
def one_hot_encode(
    data: List[str],
    categories: Optional[List[str]] = None
) -> Dict[str, List[int]]:
    """
    One-hot encode categorical values into parallel binary columns.

    Each category becomes a key mapping to a 0/1 list aligned with *data*.

    Args:
        data: List of categorical values
        categories: Optional list of all possible categories; when omitted,
            the sorted distinct values of *data* are used

    Returns:
        dict: {category: [0/1, ...]} with one entry per category

    Examples:
        >>> one_hot_encode(["Red", "Blue", "Red"])
        {'Blue': [0, 1, 0], 'Red': [1, 0, 1]}

    Notes:
        - Intended for nominal (unordered) categories.
        - A value absent from *categories* produces an all-zero row.
    """
    if categories is None:
        categories = sorted(set(data))

    return {
        cat: [int(value == cat) for value in data]
        for cat in categories
    }
216
+
217
+
218
def label_encode(data: List[str]) -> Tuple[List[int], Dict[str, int]]:
    """
    Map categorical values to integer codes.

    Codes are assigned in alphabetical order of the distinct values. More
    memory-efficient than one-hot encoding; best for ordinal categories,
    since models may assume the codes are ordered.

    Args:
        data: List of categorical values

    Returns:
        tuple: (encoded_data, mapping from category to code)

    Examples:
        >>> label_encode(["Small", "Large", "Medium"])
        ([2, 0, 1], {'Large': 0, 'Medium': 1, 'Small': 2})
        >>> # Decode with the inverse mapping:
        >>> # reverse = {v: k for k, v in mapping.items()}
    """
    mapping = {value: code for code, value in enumerate(sorted(set(data)))}
    return [mapping[value] for value in data], mapping
261
+
262
+
263
def extract_datetime_features(
    timestamps: List[str],
    format: str = "%Y-%m-%d %H:%M:%S"
) -> Dict[str, List[int]]:
    """
    Parse datetime strings into columns of temporal features.

    Produces, per timestamp: year, month, day, hour, minute,
    day_of_week (0=Monday ... 6=Sunday) and is_weekend (1 for Sat/Sun).
    Useful for time-series models that need to learn seasonality.

    Args:
        timestamps: List of datetime strings
        format: strptime-style format. Default: "%Y-%m-%d %H:%M:%S"

    Returns:
        dict: Feature name -> list of integer values (aligned with input)

    Examples:
        >>> feats = extract_datetime_features(["2024-03-16 09:15:00"])
        >>> feats['day_of_week'], feats['is_weekend']
        ([5], [1])
    """
    feature_names = ('year', 'month', 'day', 'hour', 'minute', 'day_of_week', 'is_weekend')
    columns = {name: [] for name in feature_names}

    for stamp in timestamps:
        parsed = datetime.strptime(stamp, format)
        weekday = parsed.weekday()

        columns['year'].append(parsed.year)
        columns['month'].append(parsed.month)
        columns['day'].append(parsed.day)
        columns['hour'].append(parsed.hour)
        columns['minute'].append(parsed.minute)
        columns['day_of_week'].append(weekday)
        # Saturday (5) and Sunday (6) count as weekend.
        columns['is_weekend'].append(int(weekday >= 5))

    return columns
332
+
333
+
334
def handle_missing_values(
    data: List[Optional[float]],
    strategy: str = "mean"
) -> List[float]:
    """
    Fill None entries in a numeric list.

    Strategies:
        - "mean": fill with the mean of observed values (default; also the
          fallback for unknown strategy names)
        - "median": fill with the median (robust to outliers)
        - "mode": fill with the most frequent observed value
        - "zero": fill with 0.0
        - "forward": propagate the last observed value (leading Nones get
          the mean)
        - "backward": propagate the next observed value (trailing Nones get
          the mean)

    Bug fix: "backward" previously degenerated to mean-imputation — the
    forward pass had already replaced every None before the backward pass
    ran, so that pass never saw a gap. Backward fill now runs as a real
    reversed pass over the raw data.

    Args:
        data: List with potential None values
        strategy: Fill strategy name

    Returns:
        list: Data with missing values filled (all-None input yields zeros)
    """
    valid_values = [x for x in data if x is not None]

    # Nothing observed: no statistic is defined, fall back to zeros.
    if not valid_values:
        return [0.0] * len(data)

    if strategy == "median":
        ordered = sorted(valid_values)
        n = len(ordered)
        fill_value = ordered[n // 2] if n % 2 else (ordered[n // 2 - 1] + ordered[n // 2]) / 2
    elif strategy == "mode":
        fill_value = max(set(valid_values), key=valid_values.count)
    elif strategy == "zero":
        fill_value = 0.0
    else:
        # "mean", "forward", "backward" and unknown strategies: the mean is
        # the fallback for gaps that cannot be filled directionally.
        fill_value = sum(valid_values) / len(valid_values)

    if strategy == "backward":
        # Walk from the end so each gap takes the *next* observed value.
        filled_reversed = []
        next_valid = fill_value  # trailing Nones have no successor
        for value in reversed(data):
            if value is None:
                filled_reversed.append(next_valid)
            else:
                filled_reversed.append(value)
                next_valid = value
        return list(reversed(filled_reversed))

    result = []
    last_valid = fill_value  # leading Nones have no predecessor
    for value in data:
        if value is None:
            result.append(last_valid if strategy == "forward" else fill_value)
        else:
            result.append(value)
            last_valid = value

    return result
427
+
428
+
429
def create_interaction_features(
    feature1: Union[List[float], np.ndarray],
    feature2: Union[List[float], np.ndarray],
    operation: str = "multiply"
) -> List[float]:
    """
    Combine two equal-length features element-wise.

    Captures joint effects between features (e.g. price * quantity,
    price / square-footage).

    Args:
        feature1: First feature
        feature2: Second feature
        operation: "multiply" (default), "add", "subtract", or "divide"
            (division by zero yields 0.0). Any unrecognised name falls
            back to multiplication.

    Returns:
        list: Combined feature values

    Raises:
        ValueError: If the features differ in length.

    Examples:
        >>> create_interaction_features([1, 2, 3], [4, 5, 6], "add")
        [5, 7, 9]
    """
    if isinstance(feature1, np.ndarray):
        feature1 = feature1.tolist()
    if isinstance(feature2, np.ndarray):
        feature2 = feature2.tolist()

    if len(feature1) != len(feature2):
        raise ValueError("Features must have same length")

    def safe_div(a, b):
        # Division by zero is defined as 0.0 rather than raising.
        return a / b if b != 0 else 0.0

    operations = {
        "add": lambda a, b: a + b,
        "subtract": lambda a, b: a - b,
        "divide": safe_div,
    }
    # Unknown operations fall back to multiplication, matching the
    # original behaviour.
    combine = operations.get(operation, lambda a, b: a * b)

    return [combine(a, b) for a, b in zip(feature1, feature2)]
@@ -0,0 +1,234 @@
1
+ """
2
+ Data preprocessing utilities
3
+ """
4
+
5
+ import random
6
+ from typing import Tuple, List, Union, Optional
7
+ import numpy as np
8
+
9
+ __all__ = ['train_test_split', 'normalize_data', 'standardize_data']
10
+
11
+
12
def train_test_split(
    X: Union[List, np.ndarray],
    y: Optional[Union[List, np.ndarray]] = None,
    test_size: float = 0.2,
    random_state: Optional[int] = None,
    shuffle: bool = True,
    stratify: bool = False
) -> Union[Tuple[List, List], Tuple[List, List, List, List]]:
    """
    Split arrays or lists into random train and test subsets.

    Implements the fundamental train-test split pattern without requiring
    scikit-learn. Supports stratified splitting to maintain the class
    distribution of ``y`` in both subsets.

    Args:
        X: Features array/list to split.
        y: Target array/list to split (optional).
        test_size: Proportion of dataset for the test set (0.0 to 1.0).
            Default: 0.2.
        random_state: Seed for reproducible shuffling. Default: None.
        shuffle: Whether to shuffle data before splitting. Default: True.
        stratify: Maintain class distribution in splits (requires y).
            Default: False.

    Returns:
        If y is None: (X_train, X_test)
        If y is provided: (X_train, X_test, y_train, y_test)

    Raises:
        ValueError: If test_size is not strictly between 0 and 1.
        ValueError: If stratify=True but y is None.
        ValueError: If X and y have different lengths.

    Examples:
        >>> from ilovetools.data import train_test_split
        >>> X = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]]
        >>> y = [0, 1, 0, 1, 0]
        >>> X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
        >>> len(X_train), len(X_test)
        (4, 1)

        # Reproducible split
        >>> X_train, X_test, y_train, y_test = train_test_split(
        ...     X, y, test_size=0.3, random_state=42
        ... )

        # Features only (no labels)
        >>> train, test = train_test_split(list(range(100)), test_size=0.2)
        >>> len(train), len(test)
        (80, 20)

    Notes:
        - Always split data BEFORE any preprocessing to avoid data leakage.
        - With shuffle=False the test set is taken from the FRONT of the data.
        - In the stratified path, each class contributes at least one test
          sample, and the returned rows are grouped by class (not re-shuffled).
        - Common splits: 80-20, 70-30, 60-20-20 (train-val-test).
    """
    # --- validation -------------------------------------------------------
    if not 0 < test_size < 1:
        raise ValueError(f"test_size must be between 0 and 1, got {test_size}")

    if stratify and y is None:
        raise ValueError("stratify=True requires y to be provided")

    # Normalize inputs to plain lists so indexing behaves uniformly.
    if isinstance(X, np.ndarray):
        X = X.tolist()
    if y is not None and isinstance(y, np.ndarray):
        y = y.tolist()

    if y is not None and len(X) != len(y):
        raise ValueError(f"X and y must have same length. Got X: {len(X)}, y: {len(y)}")

    n_samples = len(X)
    n_test = int(n_samples * test_size)

    # Private RNG: never mutates the global random state, yet produces the
    # exact same shuffles as random.seed(seed)/random.shuffle for a given seed.
    rng = random.Random(random_state)

    if stratify and y is not None:
        # Stratified split: carve each class proportionally so the class
        # distribution is (approximately) preserved in train and test.
        X_train, X_test = [], []
        y_train, y_test = [], []

        # Group sample indices by class label (insertion order preserved).
        class_indices = {}
        for idx, label in enumerate(y):
            class_indices.setdefault(label, []).append(idx)

        for class_idx in class_indices.values():
            if shuffle:
                rng.shuffle(class_idx)

            # Guarantee at least one test sample per class.
            n_class_test = max(1, int(len(class_idx) * test_size))

            test_idx = class_idx[:n_class_test]
            train_idx = class_idx[n_class_test:]

            X_test.extend(X[i] for i in test_idx)
            y_test.extend(y[i] for i in test_idx)
            X_train.extend(X[i] for i in train_idx)
            y_train.extend(y[i] for i in train_idx)

        return X_train, X_test, y_train, y_test

    # --- regular (non-stratified) split ----------------------------------
    indices = list(range(n_samples))
    if shuffle:
        rng.shuffle(indices)

    test_indices = indices[:n_test]
    train_indices = indices[n_test:]

    X_train = [X[i] for i in train_indices]
    X_test = [X[i] for i in test_indices]

    if y is None:
        return X_train, X_test

    y_train = [y[i] for i in train_indices]
    y_test = [y[i] for i in test_indices]
    return X_train, X_test, y_train, y_test
176
+
177
+
178
def normalize_data(data: Union[List[float], np.ndarray]) -> List[float]:
    """
    Normalize data to range [0, 1] using min-max scaling.

    Each value is mapped to (x - min) / (max - min). If all values are
    identical, a list of zeros is returned instead of dividing by zero.

    Args:
        data: List or array of numerical values

    Returns:
        list: Normalized values between 0 and 1

    Example:
        >>> from ilovetools.data import normalize_data
        >>> normalize_data([1, 2, 3, 4, 5])
        [0.0, 0.25, 0.5, 0.75, 1.0]
    """
    values = data.tolist() if isinstance(data, np.ndarray) else data

    lo = min(values)
    hi = max(values)
    span = hi - lo

    # Constant input: min-max scaling is undefined, fall back to zeros.
    if span == 0:
        return [0.0] * len(values)

    return [(v - lo) / span for v in values]
205
+
206
+
207
def standardize_data(data: Union[List[float], np.ndarray]) -> List[float]:
    """
    Standardize data to have mean=0 and std=1 (Z-score normalization).

    Uses the population standard deviation (divides by N, not N-1). If all
    values are identical (std == 0), a list of zeros is returned.

    Args:
        data: List or array of numerical values

    Returns:
        list: Standardized values with mean=0, std=1

    Example:
        >>> from ilovetools.data import standardize_data
        >>> standardize_data([1, 2, 3, 4, 5])  # values rounded for display
        [-1.414, -0.707, 0.0, 0.707, 1.414]
    """
    values = data.tolist() if isinstance(data, np.ndarray) else data

    n = len(values)
    mu = sum(values) / n
    sigma = (sum((v - mu) ** 2 for v in values) / n) ** 0.5

    # Constant input: z-score is undefined, fall back to zeros.
    if sigma == 0:
        return [0.0] * n

    return [(v - mu) / sigma for v in values]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: ilovetools
3
- Version: 0.1.1
3
+ Version: 0.1.3
4
4
  Summary: A comprehensive Python utility library with modular tools for AI/ML, data processing, and daily programming needs
5
5
  Home-page: https://github.com/AliMehdi512/ilovetools
6
6
  Author: Ali Mehdi
@@ -17,6 +17,8 @@ ilovetools/audio/__init__.py
17
17
  ilovetools/automation/__init__.py
18
18
  ilovetools/conversion/__init__.py
19
19
  ilovetools/data/__init__.py
20
+ ilovetools/data/feature_engineering.py
21
+ ilovetools/data/preprocessing.py
20
22
  ilovetools/database/__init__.py
21
23
  ilovetools/datetime/__init__.py
22
24
  ilovetools/files/__init__.py
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "ilovetools"
7
- version = "0.1.1"
7
+ version = "0.1.3"
8
8
  description = "A comprehensive Python utility library with modular tools for AI/ML, data processing, and daily programming needs"
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.8"
@@ -5,7 +5,7 @@ with open("README.md", "r", encoding="utf-8") as fh:
5
5
 
6
6
  setup(
7
7
  name="ilovetools",
8
- version="0.1.1",
8
+ version="0.1.3",
9
9
  author="Ali Mehdi",
10
10
  author_email="ali.mehdi.dev579@gmail.com",
11
11
  description="A comprehensive Python utility library with modular tools for AI/ML, data processing, and daily programming needs",
@@ -1,5 +0,0 @@
1
- """
2
- Embedding utilities for text and vector operations
3
- """
4
-
5
- __all__ = []
@@ -1,5 +0,0 @@
1
- """
2
- Data processing and manipulation utilities
3
- """
4
-
5
- __all__ = []
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes