ilovetools 0.1.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ilovetools/__init__.py +42 -0
- ilovetools/ai/__init__.py +13 -0
- ilovetools/ai/embeddings.py +270 -0
- ilovetools/ai/inference.py +5 -0
- ilovetools/ai/llm_helpers.py +141 -0
- ilovetools/audio/__init__.py +5 -0
- ilovetools/automation/__init__.py +5 -0
- ilovetools/conversion/__init__.py +5 -0
- ilovetools/data/__init__.py +27 -0
- ilovetools/data/feature_engineering.py +497 -0
- ilovetools/data/preprocessing.py +234 -0
- ilovetools/database/__init__.py +5 -0
- ilovetools/datetime/__init__.py +5 -0
- ilovetools/files/__init__.py +5 -0
- ilovetools/image/__init__.py +5 -0
- ilovetools/ml/__init__.py +182 -0
- ilovetools/ml/cross_validation.py +612 -0
- ilovetools/ml/ensemble.py +872 -0
- ilovetools/ml/metrics.py +601 -0
- ilovetools/ml/tuning.py +781 -0
- ilovetools/security/__init__.py +5 -0
- ilovetools/text/__init__.py +5 -0
- ilovetools/utils/__init__.py +5 -0
- ilovetools/validation/__init__.py +5 -0
- ilovetools/web/__init__.py +5 -0
- ilovetools-0.1.6.dist-info/METADATA +143 -0
- ilovetools-0.1.6.dist-info/RECORD +31 -0
- ilovetools-0.1.6.dist-info/WHEEL +5 -0
- ilovetools-0.1.6.dist-info/licenses/LICENSE +21 -0
- ilovetools-0.1.6.dist-info/top_level.txt +2 -0
- tests/__init__.py +3 -0
ilovetools/__init__.py
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
"""
|
|
2
|
+
ilovetools - A comprehensive Python utility library
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
__version__ = "0.1.6"
|
|
6
|
+
__author__ = "Ali Mehdi"
|
|
7
|
+
__email__ = "ali.mehdi.dev579@gmail.com"
|
|
8
|
+
|
|
9
|
+
# Import all modules for easy access
|
|
10
|
+
from . import ai
|
|
11
|
+
from . import data
|
|
12
|
+
from . import ml
|
|
13
|
+
from . import files
|
|
14
|
+
from . import text
|
|
15
|
+
from . import image
|
|
16
|
+
from . import audio
|
|
17
|
+
from . import web
|
|
18
|
+
from . import security
|
|
19
|
+
from . import database
|
|
20
|
+
from . import datetime
|
|
21
|
+
from . import validation
|
|
22
|
+
from . import conversion
|
|
23
|
+
from . import automation
|
|
24
|
+
from . import utils
|
|
25
|
+
|
|
26
|
+
__all__ = [
|
|
27
|
+
"ai",
|
|
28
|
+
"data",
|
|
29
|
+
"ml",
|
|
30
|
+
"files",
|
|
31
|
+
"text",
|
|
32
|
+
"image",
|
|
33
|
+
"audio",
|
|
34
|
+
"web",
|
|
35
|
+
"security",
|
|
36
|
+
"database",
|
|
37
|
+
"datetime",
|
|
38
|
+
"validation",
|
|
39
|
+
"conversion",
|
|
40
|
+
"automation",
|
|
41
|
+
"utils",
|
|
42
|
+
]
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
"""
|
|
2
|
+
AI & Machine Learning utilities module
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from .llm_helpers import token_counter
|
|
6
|
+
from .embeddings import similarity_search, cosine_similarity
|
|
7
|
+
from .inference import *
|
|
8
|
+
|
|
9
|
+
__all__ = [
|
|
10
|
+
'token_counter',
|
|
11
|
+
'similarity_search',
|
|
12
|
+
'cosine_similarity',
|
|
13
|
+
]
|
|
@@ -0,0 +1,270 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Embedding utilities for text and vector operations
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import numpy as np
|
|
6
|
+
from typing import List, Union, Tuple, Dict
|
|
7
|
+
import re
|
|
8
|
+
|
|
9
|
+
__all__ = ['similarity_search', 'cosine_similarity']
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def cosine_similarity(vec1: Union[List[float], np.ndarray], vec2: Union[List[float], np.ndarray]) -> float:
    """
    Compute the cosine similarity of two vectors.

    Args:
        vec1: First vector (list of floats or numpy array).
        vec2: Second vector (list of floats or numpy array).

    Returns:
        float: Similarity in the range [-1, 1]. Returns 0.0 when either
        vector has zero magnitude (similarity is undefined there).
    """
    a = np.asarray(vec1)
    b = np.asarray(vec2)

    # Product of magnitudes is zero iff either norm is zero.
    magnitude_product = np.linalg.norm(a) * np.linalg.norm(b)
    if magnitude_product == 0:
        return 0.0

    return float(np.dot(a, b) / magnitude_product)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def similarity_search(
    query: str,
    documents: List[str],
    top_k: int = 5,
    method: str = "tfidf",
    return_scores: bool = True
) -> Union[List[str], List[Tuple[str, float]]]:
    """
    Find the documents most similar to a query.

    Lightweight, offline similarity search over a list of strings — no
    external APIs or ML models required. Useful for quick retrieval,
    FAQ matching, and typo-tolerant search.

    Args:
        query (str): Search query text.
        documents (list): Document strings to search through.
        top_k (int): Number of top results to return. Default: 5.
        method (str): Scoring method:
            - "tfidf": TF-IDF weighting (default, fast)
            - "jaccard": word-overlap Jaccard similarity
            - "levenshtein": edit-distance similarity (typo tolerant)
            - "ngram": character n-gram similarity
        return_scores (bool): If True (default), return (document, score)
            tuples; otherwise return documents only.

    Returns:
        list: Top-k results, best first.
            - return_scores=True:  [(doc, score), ...]
            - return_scores=False: [doc, ...]

    Raises:
        ValueError: If ``method`` is not one of the supported names.

    Examples:
        >>> docs = ["Machine learning with Python", "Java programming basics"]
        >>> similarity_search("Python ML", docs, top_k=1, return_scores=False)
        ['Machine learning with Python']
    """
    if not documents:
        return []

    # Never request more results than there are documents.
    k = min(top_k, len(documents))

    # Helpers lowercase their inputs as well; normalizing here keeps the
    # query treatment uniform across methods.
    normalized_query = query.lower()

    scorers = {
        "tfidf": _tfidf_similarity,
        "jaccard": _jaccard_similarity,
        "levenshtein": _levenshtein_similarity,
        "ngram": _ngram_similarity,
    }
    scorer = scorers.get(method)
    if scorer is None:
        raise ValueError(f"Unknown method: {method}. Use 'tfidf', 'jaccard', 'levenshtein', or 'ngram'")

    scores = scorer(normalized_query, documents)

    # Stable sort: ties keep original document order.
    ranked = sorted(zip(documents, scores), key=lambda pair: pair[1], reverse=True)
    top_results = ranked[:k]

    if return_scores:
        return top_results
    return [doc for doc, _ in top_results]
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
def _tfidf_similarity(query: str, documents: List[str]) -> List[float]:
|
|
149
|
+
"""TF-IDF based similarity calculation."""
|
|
150
|
+
# Tokenize
|
|
151
|
+
query_words = set(re.findall(r'\w+', query.lower()))
|
|
152
|
+
|
|
153
|
+
if not query_words:
|
|
154
|
+
return [0.0] * len(documents)
|
|
155
|
+
|
|
156
|
+
# Calculate document frequencies
|
|
157
|
+
doc_freq = {}
|
|
158
|
+
for doc in documents:
|
|
159
|
+
doc_words = set(re.findall(r'\w+', doc.lower()))
|
|
160
|
+
for word in doc_words:
|
|
161
|
+
doc_freq[word] = doc_freq.get(word, 0) + 1
|
|
162
|
+
|
|
163
|
+
num_docs = len(documents)
|
|
164
|
+
scores = []
|
|
165
|
+
|
|
166
|
+
for doc in documents:
|
|
167
|
+
doc_words = re.findall(r'\w+', doc.lower())
|
|
168
|
+
doc_word_set = set(doc_words)
|
|
169
|
+
|
|
170
|
+
# Calculate TF-IDF score
|
|
171
|
+
score = 0.0
|
|
172
|
+
for word in query_words:
|
|
173
|
+
if word in doc_word_set:
|
|
174
|
+
# TF: term frequency in document
|
|
175
|
+
tf = doc_words.count(word) / len(doc_words) if doc_words else 0
|
|
176
|
+
# IDF: inverse document frequency
|
|
177
|
+
idf = np.log(num_docs / (doc_freq.get(word, 0) + 1))
|
|
178
|
+
score += tf * idf
|
|
179
|
+
|
|
180
|
+
scores.append(score)
|
|
181
|
+
|
|
182
|
+
return scores
|
|
183
|
+
|
|
184
|
+
|
|
185
|
+
def _jaccard_similarity(query: str, documents: List[str]) -> List[float]:
|
|
186
|
+
"""Jaccard similarity based on word overlap."""
|
|
187
|
+
query_words = set(re.findall(r'\w+', query.lower()))
|
|
188
|
+
|
|
189
|
+
if not query_words:
|
|
190
|
+
return [0.0] * len(documents)
|
|
191
|
+
|
|
192
|
+
scores = []
|
|
193
|
+
for doc in documents:
|
|
194
|
+
doc_words = set(re.findall(r'\w+', doc.lower()))
|
|
195
|
+
|
|
196
|
+
if not doc_words:
|
|
197
|
+
scores.append(0.0)
|
|
198
|
+
continue
|
|
199
|
+
|
|
200
|
+
intersection = len(query_words & doc_words)
|
|
201
|
+
union = len(query_words | doc_words)
|
|
202
|
+
|
|
203
|
+
score = intersection / union if union > 0 else 0.0
|
|
204
|
+
scores.append(score)
|
|
205
|
+
|
|
206
|
+
return scores
|
|
207
|
+
|
|
208
|
+
|
|
209
|
+
def _levenshtein_distance(s1: str, s2: str) -> int:
|
|
210
|
+
"""Calculate Levenshtein distance between two strings."""
|
|
211
|
+
if len(s1) < len(s2):
|
|
212
|
+
return _levenshtein_distance(s2, s1)
|
|
213
|
+
|
|
214
|
+
if len(s2) == 0:
|
|
215
|
+
return len(s1)
|
|
216
|
+
|
|
217
|
+
previous_row = range(len(s2) + 1)
|
|
218
|
+
for i, c1 in enumerate(s1):
|
|
219
|
+
current_row = [i + 1]
|
|
220
|
+
for j, c2 in enumerate(s2):
|
|
221
|
+
insertions = previous_row[j + 1] + 1
|
|
222
|
+
deletions = current_row[j] + 1
|
|
223
|
+
substitutions = previous_row[j] + (c1 != c2)
|
|
224
|
+
current_row.append(min(insertions, deletions, substitutions))
|
|
225
|
+
previous_row = current_row
|
|
226
|
+
|
|
227
|
+
return previous_row[-1]
|
|
228
|
+
|
|
229
|
+
|
|
230
|
+
def _levenshtein_similarity(query: str, documents: List[str]) -> List[float]:
    """
    Similarity from normalized Levenshtein distance (1.0 = identical).

    similarity = 1 - distance / max(len(query), len(doc)); two empty
    strings score 0.0 by the original convention.
    """
    results = []
    for doc in documents:
        candidate = doc.lower()
        edit_dist = _levenshtein_distance(query, candidate)
        longest = max(len(query), len(candidate))

        # Map distance [0, longest] onto similarity [1, 0].
        if longest > 0:
            results.append(1 - edit_dist / longest)
        else:
            results.append(0.0)

    return results
|
|
243
|
+
|
|
244
|
+
|
|
245
|
+
def _ngram_similarity(query: str, documents: List[str], n: int = 2) -> List[float]:
|
|
246
|
+
"""N-gram based similarity."""
|
|
247
|
+
def get_ngrams(text: str, n: int) -> set:
|
|
248
|
+
text = text.lower()
|
|
249
|
+
return set(text[i:i+n] for i in range(len(text) - n + 1))
|
|
250
|
+
|
|
251
|
+
query_ngrams = get_ngrams(query, n)
|
|
252
|
+
|
|
253
|
+
if not query_ngrams:
|
|
254
|
+
return [0.0] * len(documents)
|
|
255
|
+
|
|
256
|
+
scores = []
|
|
257
|
+
for doc in documents:
|
|
258
|
+
doc_ngrams = get_ngrams(doc, n)
|
|
259
|
+
|
|
260
|
+
if not doc_ngrams:
|
|
261
|
+
scores.append(0.0)
|
|
262
|
+
continue
|
|
263
|
+
|
|
264
|
+
intersection = len(query_ngrams & doc_ngrams)
|
|
265
|
+
union = len(query_ngrams | doc_ngrams)
|
|
266
|
+
|
|
267
|
+
score = intersection / union if union > 0 else 0.0
|
|
268
|
+
scores.append(score)
|
|
269
|
+
|
|
270
|
+
return scores
|
|
@@ -0,0 +1,141 @@
|
|
|
1
|
+
"""
|
|
2
|
+
LLM helper utilities for working with language models
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import re
|
|
6
|
+
from typing import Union, List
|
|
7
|
+
|
|
8
|
+
__all__ = ['token_counter']
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def token_counter(
    text: Union[str, List[str]],
    model: str = "gpt-3.5-turbo",
    detailed: bool = False
) -> Union[int, dict]:
    """
    Estimate token count for text input across different LLM models.

    Provides approximate token estimation for various language models
    without requiring API calls. Useful for cost estimation and context
    window management.

    Args:
        text (str or list): Input text, or list of texts (joined with spaces).
        model (str): Model name for token estimation. Supported:
            - "gpt-3.5-turbo", "gpt-4", "gpt-4-turbo" (OpenAI)
            - "claude-3", "claude-2" (Anthropic)
            - "llama-2", "llama-3" (Meta)
            - "gemini-pro" (Google)
            Unknown models fall back to the GPT ratio. Default: "gpt-3.5-turbo"
        detailed (bool): If True, return a breakdown dict. Default: False

    Returns:
        int: Estimated token count (if detailed=False)
        dict: Breakdown with 'tokens', 'characters', 'words', 'model',
            'cost_estimate_1k', 'estimated_cost' (if detailed=True)

    Examples:
        >>> token_counter("Hello, how are you?")
        6
        >>> token_counter("Hello, how are you?", model="gpt-4")
        6
        >>> token_counter("Hello, how are you?", detailed=True)['tokens']
        6

    Notes:
        - Estimation is heuristic (roughly 4 characters per token) and
          typically lands near real tokenizer counts for English text.
        - No API calls required - works offline.

    References:
        - OpenAI Tokenization: https://platform.openai.com/tokenizer
        - Token pricing: https://openai.com/pricing
    """

    # Handle list input by joining into one string.
    if isinstance(text, list):
        text = " ".join(text)

    # Tokens-per-character ratios per model family (~4 chars per token).
    # Bug fixed: the previous values (~0.75) were characters-per-token
    # fractions applied as tokens-per-character, over-counting ~3x
    # (e.g. "Hello, how are you?" came out as 16 tokens, not the
    # documented 6).
    model_ratios = {
        "gpt-3.5-turbo": 0.25,  # ~4 chars per token
        "gpt-4": 0.25,
        "gpt-4-turbo": 0.25,
        "claude-3": 0.24,       # slightly more efficient tokenizer
        "claude-2": 0.24,
        "llama-2": 0.26,        # slightly less efficient
        "llama-3": 0.253,
        "gemini-pro": 0.247,
    }

    # Cost per 1K tokens (USD) - approximate.
    model_costs = {
        "gpt-3.5-turbo": 0.0015,
        "gpt-4": 0.03,
        "gpt-4-turbo": 0.01,
        "claude-3": 0.015,
        "claude-2": 0.008,
        "llama-2": 0.0,  # Open source
        "llama-3": 0.0,
        "gemini-pro": 0.00025,
    }

    # Get ratio for model (default to GPT-family tokens/char).
    ratio = model_ratios.get(model.lower(), 0.25)

    char_count = len(text)
    word_count = len(text.split())

    # Base estimate from character count.
    base_tokens = char_count * ratio

    # Spaces often delimit separate tokens; weight them lightly.
    space_count = text.count(' ')

    # Punctuation and symbols frequently tokenize on their own.
    special_chars = len(re.findall(r'[^\w\s]', text))

    estimated_tokens = int(base_tokens + (space_count * 0.3) + (special_chars * 0.5))

    if detailed:
        return {
            'tokens': estimated_tokens,
            'characters': char_count,
            'words': word_count,
            'model': model,
            'cost_estimate_1k': model_costs.get(model.lower(), 0.0),
            'estimated_cost': (estimated_tokens / 1000) * model_costs.get(model.lower(), 0.0)
        }

    return estimated_tokens
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Data processing and manipulation utilities
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from .preprocessing import train_test_split, normalize_data, standardize_data
|
|
6
|
+
from .feature_engineering import (
|
|
7
|
+
create_polynomial_features,
|
|
8
|
+
bin_numerical_feature,
|
|
9
|
+
one_hot_encode,
|
|
10
|
+
label_encode,
|
|
11
|
+
extract_datetime_features,
|
|
12
|
+
handle_missing_values,
|
|
13
|
+
create_interaction_features
|
|
14
|
+
)
|
|
15
|
+
|
|
16
|
+
__all__ = [
|
|
17
|
+
'train_test_split',
|
|
18
|
+
'normalize_data',
|
|
19
|
+
'standardize_data',
|
|
20
|
+
'create_polynomial_features',
|
|
21
|
+
'bin_numerical_feature',
|
|
22
|
+
'one_hot_encode',
|
|
23
|
+
'label_encode',
|
|
24
|
+
'extract_datetime_features',
|
|
25
|
+
'handle_missing_values',
|
|
26
|
+
'create_interaction_features',
|
|
27
|
+
]
|