lecrapaud 0.4.0__py3-none-any.whl → 0.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of lecrapaud might be problematic.
- lecrapaud/__init__.py +1 -0
- lecrapaud/api.py +277 -0
- lecrapaud/config.py +10 -0
- lecrapaud/db/__init__.py +1 -0
- lecrapaud/db/alembic/env.py +2 -2
- lecrapaud/db/alembic/versions/2025_05_31_1834-52b809a34371_make_nullablee.py +24 -12
- lecrapaud/db/alembic/versions/2025_06_17_1652-c45f5e49fa2c_make_fields_nullable.py +89 -0
- lecrapaud/db/alembic.ini +116 -0
- lecrapaud/db/models/__init__.py +10 -10
- lecrapaud/db/models/base.py +176 -1
- lecrapaud/db/models/dataset.py +25 -20
- lecrapaud/db/models/feature.py +5 -6
- lecrapaud/db/models/feature_selection.py +3 -4
- lecrapaud/db/models/feature_selection_rank.py +3 -4
- lecrapaud/db/models/model.py +3 -4
- lecrapaud/db/models/model_selection.py +15 -8
- lecrapaud/db/models/model_training.py +15 -7
- lecrapaud/db/models/score.py +9 -6
- lecrapaud/db/models/target.py +16 -8
- lecrapaud/db/session.py +66 -0
- lecrapaud/experiment.py +64 -0
- lecrapaud/feature_engineering.py +747 -1022
- lecrapaud/feature_selection.py +915 -998
- lecrapaud/integrations/openai_integration.py +225 -0
- lecrapaud/jobs/__init__.py +2 -2
- lecrapaud/jobs/config.py +1 -1
- lecrapaud/jobs/scheduler.py +1 -1
- lecrapaud/jobs/tasks.py +6 -6
- lecrapaud/model_selection.py +1060 -960
- lecrapaud/search_space.py +4 -0
- lecrapaud/utils.py +2 -2
- lecrapaud-0.4.1.dist-info/METADATA +171 -0
- {lecrapaud-0.4.0.dist-info → lecrapaud-0.4.1.dist-info}/RECORD +36 -35
- {lecrapaud-0.4.0.dist-info → lecrapaud-0.4.1.dist-info}/WHEEL +1 -1
- lecrapaud/db/crud.py +0 -179
- lecrapaud/db/services.py +0 -0
- lecrapaud/db/setup.py +0 -58
- lecrapaud/predictions.py +0 -292
- lecrapaud/training.py +0 -151
- lecrapaud-0.4.0.dist-info/METADATA +0 -103
- /lecrapaud/{directory_management.py → directories.py} +0 -0
- {lecrapaud-0.4.0.dist-info → lecrapaud-0.4.1.dist-info}/LICENSE +0 -0
lecrapaud/integrations/openai_integration.py
ADDED
```diff
@@ -0,0 +1,225 @@
+import re
+from openai import OpenAI
+import tiktoken
+from lecrapaud.utils import logger
+from lecrapaud.config import OPENAI_API_KEY
+
+# OpenAI’s max tokens per request for embeddings
+MAX_TOKENS = 8192
+OPENAI_EMBEDDING_MODEL = "text-embedding-3-small"
+OPEN_AI_MODEL = "gpt-4o-2024-08-06"
+OPEN_AI_TOKENIZER = "cl100k_base"
+OPEN_AI_EMBEDDING_DIM = 1536  # 3072 if embedding model is text-embedding-3-large
+TPM_LIMIT = 5000000
+TPR_LIMIT = 300_000  # known empirically because of a error message
+MAX_LENGHT_ARRAY_FOR_BULK_EMBEDDINGS = 2048
+
+
+def get_openai_client():
+    if not OPENAI_API_KEY:
+        raise ValueError(
+            "Please set an OPENAI_API_KEY environment variable to use embeddings"
+        )
+    return OpenAI(api_key=OPENAI_API_KEY)
+
+
+def get_openai_embedding(document: str | dict) -> list[float]:
+    """embed a string into a vector using latest openai model, text-embedding-3-small
+
+    :param document: the string to be embedded
+    :return: the embedded vector
+    """
+    client = get_openai_client()
+
+    if isinstance(document, dict):
+        document = dict_to_markdown_headers_nested(document)
+    if not isinstance(document, str):
+        raise ValueError("document must be a string or dict")
+
+    try:
+        res = client.embeddings.create(input=document, model=OPENAI_EMBEDDING_MODEL)
+    except Exception as e:
+        if f"This model's maximum context length is {MAX_TOKENS} tokens" in str(e):
+            raise Exception(
+                f"get_embedding: the document is too long to be vectorized, it is longer than {MAX_TOKENS} tokens"
+            )
+        else:
+            raise Exception(e)
+
+    return res.data[0].embedding
+
+
+def get_openai_embeddings(
+    documents: list[str | dict], dimensions=None
+) -> list[list[float]]:
+    """embed a string into a vector using latest openai model, text-embedding-3-small
+
+    :param document: an array of documents
+    :return: a array of embedded vector
+    """
+    _documents = documents.copy()
+    client = get_openai_client()
+    dimensions = dimensions or OPEN_AI_EMBEDDING_DIM
+
+    if not isinstance(documents, list):
+        raise ValueError("documents must be a list")
+
+    for i, doc in enumerate(documents):
+        if isinstance(doc, dict):
+            doc = dict_to_markdown_headers_nested(doc)
+            _documents[i] = doc
+        if not isinstance(doc, str):
+            raise ValueError("documents must be a list of strings or dict")
+
+    try:
+        max_token = min(max_number_of_tokens(_documents), MAX_TOKENS)
+        docs_per_batch = min(
+            TPM_LIMIT // max_token,
+            TPR_LIMIT // max_token,
+            MAX_LENGHT_ARRAY_FOR_BULK_EMBEDDINGS,
+        )  # TODO: un peu plus de marge ?
+
+        embeddings = []
+        for i, chunk in enumerate(
+            [
+                _documents[i : i + docs_per_batch]
+                for i in range(0, len(_documents), docs_per_batch)
+            ]
+        ):
+            logger.debug(f"Embedding chunk {i+1} with {len(chunk)} documents...")
+            res = client.embeddings.create(
+                input=[doc for doc in chunk],
+                model=OPENAI_EMBEDDING_MODEL,
+                dimensions=dimensions,
+            )
+            chunk_embeddings = [data.embedding for data in res.data]
+            embeddings.extend(chunk_embeddings)
+
+        return embeddings
+
+    except Exception as e:
+        if f"This model's maximum context length is {MAX_TOKENS} tokens" in str(e):
+            raise Exception(
+                f"get_embedding: the document is too long to be vectorized, it is longer than {MAX_TOKENS} tokens"
+            )
+        else:
+            raise Exception(e)
+
+
+def max_number_of_tokens(list):
+    return max([num_tokens_from_string(str(item)) for item in list])
+
+
+def num_tokens_from_string(string: str, encoding_name: str = OPEN_AI_TOKENIZER) -> int:
+    """Count the number of token in string
+
+    :param string: the string
+    :param encoding_name: the encoding model
+    :return: the number of tokens
+    """
+    if not string:
+        return 0
+    tokenizer = tiktoken.get_encoding(encoding_name)
+    num_tokens = len(tokenizer.encode(string))
+    return num_tokens
+
+
+def chunk_text_words(text, max_tokens=MAX_TOKENS):
+    """Splits text into chunks of max_tokens or less."""
+    words = text.split()
+
+    chunks = []
+    current_chunk = []
+    current_tokens = 0
+
+    for word in words:
+        word_tokens = num_tokens_from_string(word)  # Count tokens for word
+        if current_tokens + word_tokens > max_tokens:
+            chunks.append(" ".join(current_chunk))
+            current_chunk = []
+            current_tokens = 0
+
+        current_chunk.append(word)
+        current_tokens += word_tokens
+
+    if current_chunk:
+        chunks.append(" ".join(current_chunk))
+
+    return chunks
+
+
+def chunk_text_sentences(text, max_tokens=MAX_TOKENS):
+    # Sentence-split using regex (can also use nltk.sent_tokenize if preferred)
+    # TODO: should we do a sliding window for chunking ?
+    sentences = re.split(r"(?<=[.!?])\s+", text)
+
+    chunks = []
+    current_chunk = ""
+    current_tokens = 0
+
+    for sentence in sentences:
+        sentence_tokens = num_tokens_from_string(sentence)
+
+        if current_tokens + sentence_tokens <= max_tokens:
+            current_chunk += " " + sentence if current_chunk else sentence
+            current_tokens += sentence_tokens
+        else:
+            if current_chunk:
+                chunks.append(current_chunk.strip())
+            # Sentence too long to fit, need to split it
+            if sentence_tokens > max_tokens:
+                words = sentence.split()
+                sub_chunk = ""
+                sub_tokens = 0
+                for word in words:
+                    word_tokens = num_tokens_from_string(word + " ")
+                    if sub_tokens + word_tokens > max_tokens:
+                        chunks.append(sub_chunk.strip())
+                        sub_chunk = word
+                        sub_tokens = word_tokens
+                    else:
+                        sub_chunk += " " + word if sub_chunk else word
+                        sub_tokens += word_tokens
+                if sub_chunk:
+                    chunks.append(sub_chunk.strip())
+                current_chunk = ""
+                current_tokens = 0
+            else:
+                current_chunk = sentence
+                current_tokens = sentence_tokens
+
+    if current_chunk:
+        chunks.append(current_chunk.strip())
+
+    return chunks
+
+
+def truncate_text(text, max_tokens=MAX_TOKENS):
+    """Limits text to max_tokens or less by truncating."""
+    words = text.split()
+    truncated_text = []
+    current_length = 0
+
+    for word in words:
+        token_length = num_tokens_from_string(word)  # Count tokens for word
+        if current_length + token_length > max_tokens:
+            break  # Stop once limit is reached
+
+        truncated_text.append(word)
+        current_length += token_length
+
+    return " ".join(truncated_text)
+
+
+def dict_to_markdown_headers_nested(d: dict, level: int = 1) -> str:
+    lines = []
+    for key, value in d.items():
+        header = "#" * level + f" {key}"
+        if isinstance(value, dict):
+            lines.append(header)
+            lines.append(dict_to_markdown_headers_nested(value, level + 1))
+        else:
+            lines.append(header)
+            lines.append(str(value).strip())
+        lines.append("")  # Blank line between sections
+    return "\n".join(lines)
```
lecrapaud/jobs/__init__.py
CHANGED
lecrapaud/jobs/config.py
CHANGED
lecrapaud/jobs/scheduler.py
CHANGED
lecrapaud/jobs/tasks.py
CHANGED
```diff
@@ -1,11 +1,11 @@
-from
+from lecrapaud.jobs import app
 
 # from honeybadger import honeybadger
-from
-from
-from
-from
-from
+from lecrapaud.send_daily_emails import send_daily_emails
+from lecrapaud.config import DATASET_ID, RECEIVER_EMAIL
+from lecrapaud.training import run_training
+from lecrapaud.constants import stock_list_3
+from lecrapaud.search_space import get_models_idx
 
 
 @app.task(
```