ilovetools 0.1.1__tar.gz → 0.1.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ilovetools-0.1.1/ilovetools.egg-info → ilovetools-0.1.2}/PKG-INFO +1 -1
- {ilovetools-0.1.1 → ilovetools-0.1.2}/ilovetools/__init__.py +1 -1
- {ilovetools-0.1.1 → ilovetools-0.1.2}/ilovetools/ai/__init__.py +3 -1
- ilovetools-0.1.2/ilovetools/ai/embeddings.py +270 -0
- ilovetools-0.1.2/ilovetools/data/__init__.py +11 -0
- ilovetools-0.1.2/ilovetools/data/preprocessing.py +234 -0
- {ilovetools-0.1.1 → ilovetools-0.1.2/ilovetools.egg-info}/PKG-INFO +1 -1
- {ilovetools-0.1.1 → ilovetools-0.1.2}/ilovetools.egg-info/SOURCES.txt +1 -0
- {ilovetools-0.1.1 → ilovetools-0.1.2}/pyproject.toml +1 -1
- {ilovetools-0.1.1 → ilovetools-0.1.2}/setup.py +1 -1
- ilovetools-0.1.1/ilovetools/ai/embeddings.py +0 -5
- ilovetools-0.1.1/ilovetools/data/__init__.py +0 -5
- {ilovetools-0.1.1 → ilovetools-0.1.2}/LICENSE +0 -0
- {ilovetools-0.1.1 → ilovetools-0.1.2}/MANIFEST.in +0 -0
- {ilovetools-0.1.1 → ilovetools-0.1.2}/README.md +0 -0
- {ilovetools-0.1.1 → ilovetools-0.1.2}/ilovetools/ai/inference.py +0 -0
- {ilovetools-0.1.1 → ilovetools-0.1.2}/ilovetools/ai/llm_helpers.py +0 -0
- {ilovetools-0.1.1 → ilovetools-0.1.2}/ilovetools/audio/__init__.py +0 -0
- {ilovetools-0.1.1 → ilovetools-0.1.2}/ilovetools/automation/__init__.py +0 -0
- {ilovetools-0.1.1 → ilovetools-0.1.2}/ilovetools/conversion/__init__.py +0 -0
- {ilovetools-0.1.1 → ilovetools-0.1.2}/ilovetools/database/__init__.py +0 -0
- {ilovetools-0.1.1 → ilovetools-0.1.2}/ilovetools/datetime/__init__.py +0 -0
- {ilovetools-0.1.1 → ilovetools-0.1.2}/ilovetools/files/__init__.py +0 -0
- {ilovetools-0.1.1 → ilovetools-0.1.2}/ilovetools/image/__init__.py +0 -0
- {ilovetools-0.1.1 → ilovetools-0.1.2}/ilovetools/security/__init__.py +0 -0
- {ilovetools-0.1.1 → ilovetools-0.1.2}/ilovetools/text/__init__.py +0 -0
- {ilovetools-0.1.1 → ilovetools-0.1.2}/ilovetools/utils/__init__.py +0 -0
- {ilovetools-0.1.1 → ilovetools-0.1.2}/ilovetools/validation/__init__.py +0 -0
- {ilovetools-0.1.1 → ilovetools-0.1.2}/ilovetools/web/__init__.py +0 -0
- {ilovetools-0.1.1 → ilovetools-0.1.2}/ilovetools.egg-info/dependency_links.txt +0 -0
- {ilovetools-0.1.1 → ilovetools-0.1.2}/ilovetools.egg-info/top_level.txt +0 -0
- {ilovetools-0.1.1 → ilovetools-0.1.2}/requirements.txt +0 -0
- {ilovetools-0.1.1 → ilovetools-0.1.2}/setup.cfg +0 -0
- {ilovetools-0.1.1 → ilovetools-0.1.2}/tests/__init__.py +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ilovetools
-Version: 0.1.1
+Version: 0.1.2
 Summary: A comprehensive Python utility library with modular tools for AI/ML, data processing, and daily programming needs
 Home-page: https://github.com/AliMehdi512/ilovetools
 Author: Ali Mehdi
ilovetools/ai/__init__.py
@@ -3,9 +3,11 @@ AI & Machine Learning utilities module
 """
 
 from .llm_helpers import token_counter
-from .embeddings import
+from .embeddings import similarity_search, cosine_similarity
 from .inference import *
 
 __all__ = [
     'token_counter',
+    'similarity_search',
+    'cosine_similarity',
 ]
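For context, a minimal sketch of using the two names newly re-exported from `ilovetools.ai` in 0.1.2 (the import path and signatures come from the hunks in this diff; the sample data is illustrative):

```python
# Illustrative usage of the exports added in 0.1.2 (sample data is made up).
from ilovetools.ai import cosine_similarity, similarity_search

# cosine_similarity accepts plain lists or numpy arrays.
print(cosine_similarity([1.0, 0.0, 1.0], [1.0, 1.0, 0.0]))  # 0.5

# similarity_search is defined in the new embeddings.py shown below.
docs = ["alpha beta", "beta gamma", "delta"]
print(similarity_search("beta", docs, top_k=1))
```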
ilovetools/ai/embeddings.py (new file)
@@ -0,0 +1,270 @@
+"""
+Embedding utilities for text and vector operations
+"""
+
+import numpy as np
+from typing import List, Union, Tuple, Dict
+import re
+
+__all__ = ['similarity_search', 'cosine_similarity']
+
+
+def cosine_similarity(vec1: Union[List[float], np.ndarray], vec2: Union[List[float], np.ndarray]) -> float:
+    """
+    Calculate cosine similarity between two vectors.
+
+    Args:
+        vec1: First vector
+        vec2: Second vector
+
+    Returns:
+        float: Cosine similarity score between -1 and 1
+    """
+    vec1 = np.array(vec1)
+    vec2 = np.array(vec2)
+
+    dot_product = np.dot(vec1, vec2)
+    norm1 = np.linalg.norm(vec1)
+    norm2 = np.linalg.norm(vec2)
+
+    if norm1 == 0 or norm2 == 0:
+        return 0.0
+
+    return float(dot_product / (norm1 * norm2))
+
+
+def similarity_search(
+    query: str,
+    documents: List[str],
+    top_k: int = 5,
+    method: str = "tfidf",
+    return_scores: bool = True
+) -> Union[List[str], List[Tuple[str, float]]]:
+    """
+    Find most similar documents to a query using various similarity methods.
+
+    This function performs semantic similarity search without requiring external APIs
+    or heavy ML models. Perfect for quick document retrieval, search functionality,
+    and finding relevant content.
+
+    Args:
+        query (str): Search query text
+        documents (list): List of document strings to search through
+        top_k (int): Number of top results to return. Default: 5
+        method (str): Similarity method to use:
+            - "tfidf": TF-IDF based similarity (default, fast)
+            - "jaccard": Jaccard similarity (word overlap)
+            - "levenshtein": Edit distance based similarity
+            - "ngram": N-gram based similarity
+        return_scores (bool): If True, returns (document, score) tuples.
+            If False, returns only documents. Default: True
+
+    Returns:
+        list: Top-k most similar documents
+            - If return_scores=True: [(doc, score), ...]
+            - If return_scores=False: [doc, ...]
+
+    Examples:
+        >>> from ilovetools.ai import similarity_search
+
+        # Basic usage
+        >>> docs = [
+        ...     "Python is a programming language",
+        ...     "Machine learning with Python",
+        ...     "Java programming basics",
+        ...     "Deep learning and AI"
+        ... ]
+        >>> results = similarity_search("Python ML", docs, top_k=2)
+        >>> print(results)
+        [('Machine learning with Python', 0.85), ('Python is a programming language', 0.72)]
+
+        # Without scores
+        >>> results = similarity_search("Python ML", docs, return_scores=False)
+        >>> print(results)
+        ['Machine learning with Python', 'Python is a programming language']
+
+        # Different methods
+        >>> results = similarity_search("Python", docs, method="jaccard")
+        >>> results = similarity_search("Python", docs, method="levenshtein")
+
+        # Real-world use case: FAQ search
+        >>> faqs = [
+        ...     "How do I reset my password?",
+        ...     "What is the refund policy?",
+        ...     "How to contact support?",
+        ...     "Where is my order?"
+        ... ]
+        >>> user_query = "forgot password"
+        >>> answer = similarity_search(user_query, faqs, top_k=1, return_scores=False)[0]
+        >>> print(answer)
+        'How do I reset my password?'
+
+    Notes:
+        - TF-IDF method is fastest and works well for most cases
+        - Jaccard is good for short texts and keyword matching
+        - Levenshtein is useful for typo-tolerant search
+        - No external dependencies or API calls required
+        - Works offline and is very fast
+
+    Performance:
+        - TF-IDF: O(n*m) where n=docs, m=avg words
+        - Jaccard: O(n*m)
+        - Levenshtein: O(n*m^2)
+    """
+
+    if not documents:
+        return []
+
+    if top_k > len(documents):
+        top_k = len(documents)
+
+    # Normalize query
+    query_lower = query.lower()
+
+    if method == "tfidf":
+        scores = _tfidf_similarity(query_lower, documents)
+    elif method == "jaccard":
+        scores = _jaccard_similarity(query_lower, documents)
+    elif method == "levenshtein":
+        scores = _levenshtein_similarity(query_lower, documents)
+    elif method == "ngram":
+        scores = _ngram_similarity(query_lower, documents)
+    else:
+        raise ValueError(f"Unknown method: {method}. Use 'tfidf', 'jaccard', 'levenshtein', or 'ngram'")
+
+    # Sort by score (descending)
+    doc_scores = list(zip(documents, scores))
+    doc_scores.sort(key=lambda x: x[1], reverse=True)
+
+    # Get top-k results
+    top_results = doc_scores[:top_k]
+
+    if return_scores:
+        return top_results
+    else:
+        return [doc for doc, _ in top_results]
+
+
+def _tfidf_similarity(query: str, documents: List[str]) -> List[float]:
+    """TF-IDF based similarity calculation."""
+    # Tokenize
+    query_words = set(re.findall(r'\w+', query.lower()))
+
+    if not query_words:
+        return [0.0] * len(documents)
+
+    # Calculate document frequencies
+    doc_freq = {}
+    for doc in documents:
+        doc_words = set(re.findall(r'\w+', doc.lower()))
+        for word in doc_words:
+            doc_freq[word] = doc_freq.get(word, 0) + 1
+
+    num_docs = len(documents)
+    scores = []
+
+    for doc in documents:
+        doc_words = re.findall(r'\w+', doc.lower())
+        doc_word_set = set(doc_words)
+
+        # Calculate TF-IDF score
+        score = 0.0
+        for word in query_words:
+            if word in doc_word_set:
+                # TF: term frequency in document
+                tf = doc_words.count(word) / len(doc_words) if doc_words else 0
+                # IDF: inverse document frequency
+                idf = np.log(num_docs / (doc_freq.get(word, 0) + 1))
+                score += tf * idf
+
+        scores.append(score)
+
+    return scores
+
+
+def _jaccard_similarity(query: str, documents: List[str]) -> List[float]:
+    """Jaccard similarity based on word overlap."""
+    query_words = set(re.findall(r'\w+', query.lower()))
+
+    if not query_words:
+        return [0.0] * len(documents)
+
+    scores = []
+    for doc in documents:
+        doc_words = set(re.findall(r'\w+', doc.lower()))
+
+        if not doc_words:
+            scores.append(0.0)
+            continue
+
+        intersection = len(query_words & doc_words)
+        union = len(query_words | doc_words)
+
+        score = intersection / union if union > 0 else 0.0
+        scores.append(score)
+
+    return scores
+
+
+def _levenshtein_distance(s1: str, s2: str) -> int:
+    """Calculate Levenshtein distance between two strings."""
+    if len(s1) < len(s2):
+        return _levenshtein_distance(s2, s1)
+
+    if len(s2) == 0:
+        return len(s1)
+
+    previous_row = range(len(s2) + 1)
+    for i, c1 in enumerate(s1):
+        current_row = [i + 1]
+        for j, c2 in enumerate(s2):
+            insertions = previous_row[j + 1] + 1
+            deletions = current_row[j] + 1
+            substitutions = previous_row[j] + (c1 != c2)
+            current_row.append(min(insertions, deletions, substitutions))
+        previous_row = current_row
+
+    return previous_row[-1]
+
+
+def _levenshtein_similarity(query: str, documents: List[str]) -> List[float]:
+    """Levenshtein distance based similarity."""
+    scores = []
+    for doc in documents:
+        doc_lower = doc.lower()
+        distance = _levenshtein_distance(query, doc_lower)
+        max_len = max(len(query), len(doc_lower))
+
+        # Convert distance to similarity (0 to 1)
+        similarity = 1 - (distance / max_len) if max_len > 0 else 0.0
+        scores.append(similarity)
+
+    return scores
+
+
+def _ngram_similarity(query: str, documents: List[str], n: int = 2) -> List[float]:
+    """N-gram based similarity."""
+    def get_ngrams(text: str, n: int) -> set:
+        text = text.lower()
+        return set(text[i:i+n] for i in range(len(text) - n + 1))
+
+    query_ngrams = get_ngrams(query, n)
+
+    if not query_ngrams:
+        return [0.0] * len(documents)
+
+    scores = []
+    for doc in documents:
+        doc_ngrams = get_ngrams(doc, n)
+
+        if not doc_ngrams:
+            scores.append(0.0)
+            continue
+
+        intersection = len(query_ngrams & doc_ngrams)
+        union = len(query_ngrams | doc_ngrams)
+
+        score = intersection / union if union > 0 else 0.0
+        scores.append(score)
+
+    return scores
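As a quick, illustrative comparison of the four `method` options above (made-up FAQ data; exact scores depend on the heuristics defined in this file):

```python
from ilovetools.ai import similarity_search

faqs = [
    "How do I reset my password?",
    "What is the refund policy?",
    "How to contact support?",
]

# A deliberately misspelled query; each method ranks the FAQs with a different heuristic.
for method in ("tfidf", "jaccard", "levenshtein", "ngram"):
    best_doc, score = similarity_search("forgot pasword", faqs, top_k=1, method=method)[0]
    print(f"{method:12s} -> {best_doc!r} (score={score:.2f})")
```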
ilovetools/data/preprocessing.py (new file)
@@ -0,0 +1,234 @@
+"""
+Data preprocessing utilities
+"""
+
+import random
+from typing import Tuple, List, Union, Optional
+import numpy as np
+
+__all__ = ['train_test_split', 'normalize_data', 'standardize_data']
+
+
+def train_test_split(
+    X: Union[List, np.ndarray],
+    y: Optional[Union[List, np.ndarray]] = None,
+    test_size: float = 0.2,
+    random_state: Optional[int] = None,
+    shuffle: bool = True,
+    stratify: bool = False
+) -> Union[Tuple[List, List], Tuple[List, List, List, List]]:
+    """
+    Split arrays or lists into random train and test subsets.
+
+    Perfect for ML workflows - implements the fundamental train-test split
+    pattern without requiring scikit-learn. Supports stratified splitting
+    to maintain class distribution.
+
+    Args:
+        X: Features array/list to split
+        y: Target array/list to split (optional)
+        test_size: Proportion of dataset for test set (0.0 to 1.0). Default: 0.2
+        random_state: Random seed for reproducibility. Default: None
+        shuffle: Whether to shuffle data before splitting. Default: True
+        stratify: Maintain class distribution in splits (requires y). Default: False
+
+    Returns:
+        If y is None: (X_train, X_test)
+        If y is provided: (X_train, X_test, y_train, y_test)
+
+    Examples:
+        >>> from ilovetools.data import train_test_split
+
+        # Basic split
+        >>> X = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]]
+        >>> y = [0, 1, 0, 1, 0]
+        >>> X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
+        >>> len(X_train), len(X_test)
+        (4, 1)
+
+        # With random seed for reproducibility
+        >>> X_train, X_test, y_train, y_test = train_test_split(
+        ...     X, y, test_size=0.3, random_state=42
+        ... )
+
+        # Stratified split (maintains class distribution)
+        >>> X_train, X_test, y_train, y_test = train_test_split(
+        ...     X, y, test_size=0.2, stratify=True, random_state=42
+        ... )
+
+        # Split features only (no labels)
+        >>> data = list(range(100))
+        >>> train, test = train_test_split(data, test_size=0.2)
+        >>> len(train), len(test)
+        (80, 20)
+
+        # Real-world example: Email spam detection
+        >>> emails = ["email1", "email2", "email3", "email4", "email5"]
+        >>> labels = [1, 0, 1, 0, 1]  # 1=spam, 0=not spam
+        >>> X_train, X_test, y_train, y_test = train_test_split(
+        ...     emails, labels, test_size=0.2, random_state=42
+        ... )
+
+        # 70-30 split
+        >>> X_train, X_test, y_train, y_test = train_test_split(
+        ...     X, y, test_size=0.3
+        ... )
+
+        # 60-20-20 split (train-val-test)
+        >>> X_temp, X_test, y_temp, y_test = train_test_split(
+        ...     X, y, test_size=0.2, random_state=42
+        ... )
+        >>> X_train, X_val, y_train, y_val = train_test_split(
+        ...     X_temp, y_temp, test_size=0.25, random_state=42  # 0.25 * 0.8 = 0.2
+        ... )
+
+    Notes:
+        - Always split data BEFORE any preprocessing to avoid data leakage
+        - Use random_state for reproducible results
+        - Stratified splitting ensures balanced class distribution
+        - Common splits: 80-20, 70-30, 60-20-20 (train-val-test)
+        - Test data should NEVER be seen during training
+
+    Raises:
+        ValueError: If test_size is not between 0 and 1
+        ValueError: If stratify=True but y is None
+        ValueError: If X and y have different lengths
+
+    References:
+        - scikit-learn train_test_split: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html
+        - ML best practices: https://developers.google.com/machine-learning/crash-course/training-and-test-sets/splitting-data
+    """
+
+    # Validation
+    if not 0 < test_size < 1:
+        raise ValueError(f"test_size must be between 0 and 1, got {test_size}")
+
+    if stratify and y is None:
+        raise ValueError("stratify=True requires y to be provided")
+
+    # Convert to lists if numpy arrays
+    if isinstance(X, np.ndarray):
+        X = X.tolist()
+    if y is not None and isinstance(y, np.ndarray):
+        y = y.tolist()
+
+    # Check lengths match
+    if y is not None and len(X) != len(y):
+        raise ValueError(f"X and y must have same length. Got X: {len(X)}, y: {len(y)}")
+
+    n_samples = len(X)
+    n_test = int(n_samples * test_size)
+    n_train = n_samples - n_test
+
+    # Set random seed
+    if random_state is not None:
+        random.seed(random_state)
+
+    # Create indices
+    indices = list(range(n_samples))
+
+    if stratify and y is not None:
+        # Stratified split - maintain class distribution
+        X_train, X_test = [], []
+        y_train, y_test = [], []
+
+        # Group indices by class
+        class_indices = {}
+        for idx, label in enumerate(y):
+            if label not in class_indices:
+                class_indices[label] = []
+            class_indices[label].append(idx)
+
+        # Split each class proportionally
+        for label, class_idx in class_indices.items():
+            if shuffle:
+                random.shuffle(class_idx)
+
+            n_class_test = max(1, int(len(class_idx) * test_size))
+
+            test_idx = class_idx[:n_class_test]
+            train_idx = class_idx[n_class_test:]
+
+            X_test.extend([X[i] for i in test_idx])
+            y_test.extend([y[i] for i in test_idx])
+            X_train.extend([X[i] for i in train_idx])
+            y_train.extend([y[i] for i in train_idx])
+
+        return X_train, X_test, y_train, y_test
+
+    else:
+        # Regular split
+        if shuffle:
+            random.shuffle(indices)
+
+        test_indices = indices[:n_test]
+        train_indices = indices[n_test:]
+
+        X_train = [X[i] for i in train_indices]
+        X_test = [X[i] for i in test_indices]
+
+        if y is not None:
+            y_train = [y[i] for i in train_indices]
+            y_test = [y[i] for i in test_indices]
+            return X_train, X_test, y_train, y_test
+        else:
+            return X_train, X_test
+
+
+def normalize_data(data: Union[List[float], np.ndarray]) -> List[float]:
+    """
+    Normalize data to range [0, 1] using min-max scaling.
+
+    Args:
+        data: List or array of numerical values
+
+    Returns:
+        list: Normalized values between 0 and 1
+
+    Example:
+        >>> from ilovetools.data import normalize_data
+        >>> data = [1, 2, 3, 4, 5]
+        >>> normalized = normalize_data(data)
+        >>> print(normalized)
+        [0.0, 0.25, 0.5, 0.75, 1.0]
+    """
+    if isinstance(data, np.ndarray):
+        data = data.tolist()
+
+    min_val = min(data)
+    max_val = max(data)
+
+    if max_val == min_val:
+        return [0.0] * len(data)
+
+    return [(x - min_val) / (max_val - min_val) for x in data]
+
+
+def standardize_data(data: Union[List[float], np.ndarray]) -> List[float]:
+    """
+    Standardize data to have mean=0 and std=1 (Z-score normalization).
+
+    Args:
+        data: List or array of numerical values
+
+    Returns:
+        list: Standardized values with mean=0, std=1
+
+    Example:
+        >>> from ilovetools.data import standardize_data
+        >>> data = [1, 2, 3, 4, 5]
+        >>> standardized = standardize_data(data)
+        >>> print(standardized)
+        [-1.414, -0.707, 0.0, 0.707, 1.414]
+    """
+    if isinstance(data, np.ndarray):
+        data = data.tolist()
+
+    mean = sum(data) / len(data)
+    variance = sum((x - mean) ** 2 for x in data) / len(data)
+    std = variance ** 0.5
+
+    if std == 0:
+        return [0.0] * len(data)
+
+    return [(x - mean) / std for x in data]
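For reference, a minimal end-to-end sketch combining the three helpers defined above (it assumes they are re-exported from `ilovetools.data`, which matches the docstring examples and the updated data/__init__.py listed in this diff; the data is illustrative):

```python
from ilovetools.data import train_test_split, normalize_data, standardize_data

X = [[x] for x in range(10)]
y = [0, 1] * 5  # balanced binary labels

# Stratified 80/20 split keeps the 50/50 class ratio in both subsets.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=True, random_state=42
)

# Scale a single feature column in two different ways.
col = [row[0] for row in X_train]
print(normalize_data(col))    # min-max scaled into [0, 1]
print(standardize_data(col))  # z-scores with mean 0 and std 1
```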
ilovetools.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ilovetools
-Version: 0.1.1
+Version: 0.1.2
 Summary: A comprehensive Python utility library with modular tools for AI/ML, data processing, and daily programming needs
 Home-page: https://github.com/AliMehdi512/ilovetools
 Author: Ali Mehdi
ilovetools.egg-info/SOURCES.txt
@@ -17,6 +17,7 @@ ilovetools/audio/__init__.py
 ilovetools/automation/__init__.py
 ilovetools/conversion/__init__.py
 ilovetools/data/__init__.py
+ilovetools/data/preprocessing.py
 ilovetools/database/__init__.py
 ilovetools/datetime/__init__.py
 ilovetools/files/__init__.py
pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "ilovetools"
-version = "0.1.1"
+version = "0.1.2"
 description = "A comprehensive Python utility library with modular tools for AI/ML, data processing, and daily programming needs"
 readme = "README.md"
 requires-python = ">=3.8"
setup.py
@@ -5,7 +5,7 @@ with open("README.md", "r", encoding="utf-8") as fh:
 
 setup(
     name="ilovetools",
-    version="0.1.1",
+    version="0.1.2",
     author="Ali Mehdi",
     author_email="ali.mehdi.dev579@gmail.com",
     description="A comprehensive Python utility library with modular tools for AI/ML, data processing, and daily programming needs",
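A quick post-upgrade sanity check (a sketch only: it assumes the package is installed from PyPI and exposes a `__version__` string, which would be consistent with the one-line change to ilovetools/__init__.py listed above but is not shown in this diff):

```python
# Assumed check after `pip install --upgrade ilovetools`.
import ilovetools
from ilovetools.ai import similarity_search

print(getattr(ilovetools, "__version__", "unknown"))  # expected "0.1.2" if __version__ exists
print(similarity_search("hello", ["hello world", "goodbye"], top_k=1))
```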
The remaining 22 files (marked +0 -0 in the list above) are unchanged between 0.1.1 and 0.1.2.