lecrapaud-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of lecrapaud might be problematic.

Files changed (63)
  1. lecrapaud/__init__.py +1 -0
  2. lecrapaud/api.py +271 -0
  3. lecrapaud/config.py +25 -0
  4. lecrapaud/db/__init__.py +1 -0
  5. lecrapaud/db/alembic/README +1 -0
  6. lecrapaud/db/alembic/env.py +78 -0
  7. lecrapaud/db/alembic/script.py.mako +26 -0
  8. lecrapaud/db/alembic/versions/2025_04_06_1738-7390745388e4_initial_setup.py +295 -0
  9. lecrapaud/db/alembic/versions/2025_04_06_1755-40cd8d3e798e_unique_constraint_for_data.py +30 -0
  10. lecrapaud/db/alembic/versions/2025_05_23_1724-2360941fa0bd_longer_string.py +52 -0
  11. lecrapaud/db/alembic/versions/2025_05_27_1159-b96396dcfaff_add_env_to_trading_tables.py +34 -0
  12. lecrapaud/db/alembic/versions/2025_05_27_1337-40cbfc215f7c_fix_nb_character_on_portfolio.py +39 -0
  13. lecrapaud/db/alembic/versions/2025_05_27_1526-3de994115317_to_datetime.py +36 -0
  14. lecrapaud/db/alembic/versions/2025_05_27_2003-25c227c684f8_add_fees_to_transactions.py +30 -0
  15. lecrapaud/db/alembic/versions/2025_05_27_2047-6b6f2d38e9bc_double_instead_of_float.py +132 -0
  16. lecrapaud/db/alembic/versions/2025_05_31_1111-c175e4a36d68_generalise_stock_to_group.py +36 -0
  17. lecrapaud/db/alembic/versions/2025_05_31_1256-5681095bfc27_create_investment_run_and_portfolio_.py +62 -0
  18. lecrapaud/db/alembic/versions/2025_05_31_1806-339927587383_add_investment_run_id.py +107 -0
  19. lecrapaud/db/alembic/versions/2025_05_31_1834-52b809a34371_make_nullablee.py +38 -0
  20. lecrapaud/db/alembic/versions/2025_05_31_1849-3b8550297e8e_change_date_to_datetime.py +44 -0
  21. lecrapaud/db/alembic/versions/2025_05_31_1852-e6b8c95d8243_add_date_to_portfolio_history.py +30 -0
  22. lecrapaud/db/alembic/versions/2025_06_10_1136-db8cdd83563a_addnewsandoptiontodata.py +32 -0
  23. lecrapaud/db/alembic/versions/2025_06_17_1652-c45f5e49fa2c_make_fields_nullable.py +89 -0
  24. lecrapaud/db/models/__init__.py +11 -0
  25. lecrapaud/db/models/base.py +181 -0
  26. lecrapaud/db/models/dataset.py +129 -0
  27. lecrapaud/db/models/feature.py +45 -0
  28. lecrapaud/db/models/feature_selection.py +125 -0
  29. lecrapaud/db/models/feature_selection_rank.py +79 -0
  30. lecrapaud/db/models/model.py +40 -0
  31. lecrapaud/db/models/model_selection.py +63 -0
  32. lecrapaud/db/models/model_training.py +62 -0
  33. lecrapaud/db/models/score.py +65 -0
  34. lecrapaud/db/models/target.py +67 -0
  35. lecrapaud/db/session.py +45 -0
  36. lecrapaud/directory_management.py +28 -0
  37. lecrapaud/experiment.py +64 -0
  38. lecrapaud/feature_engineering.py +846 -0
  39. lecrapaud/feature_selection.py +1167 -0
  40. lecrapaud/integrations/openai_integration.py +225 -0
  41. lecrapaud/jobs/__init__.py +13 -0
  42. lecrapaud/jobs/config.py +17 -0
  43. lecrapaud/jobs/scheduler.py +36 -0
  44. lecrapaud/jobs/tasks.py +57 -0
  45. lecrapaud/model_selection.py +1671 -0
  46. lecrapaud/predictions.py +292 -0
  47. lecrapaud/preprocessing.py +984 -0
  48. lecrapaud/search_space.py +848 -0
  49. lecrapaud/services/__init__.py +0 -0
  50. lecrapaud/services/embedding_categorical.py +71 -0
  51. lecrapaud/services/indicators.py +309 -0
  52. lecrapaud/speed_tests/experiments.py +139 -0
  53. lecrapaud/speed_tests/test-gpu-bilstm.ipynb +261 -0
  54. lecrapaud/speed_tests/test-gpu-resnet.ipynb +166 -0
  55. lecrapaud/speed_tests/test-gpu-transformers.ipynb +254 -0
  56. lecrapaud/speed_tests/tests.ipynb +145 -0
  57. lecrapaud/speed_tests/trash.py +37 -0
  58. lecrapaud/training.py +239 -0
  59. lecrapaud/utils.py +246 -0
  60. lecrapaud-0.1.0.dist-info/LICENSE +201 -0
  61. lecrapaud-0.1.0.dist-info/METADATA +105 -0
  62. lecrapaud-0.1.0.dist-info/RECORD +63 -0
  63. lecrapaud-0.1.0.dist-info/WHEEL +4 -0
lecrapaud/integrations/openai_integration.py
@@ -0,0 +1,225 @@
+ import re
+ from openai import OpenAI
+ import tiktoken
+ from lecrapaud.utils import logger
+ from lecrapaud.config import OPENAI_API_KEY
+
+ # OpenAI’s max tokens per request for embeddings
+ MAX_TOKENS = 8192
+ OPENAI_EMBEDDING_MODEL = "text-embedding-3-small"
+ OPEN_AI_MODEL = "gpt-4o-2024-08-06"
+ OPEN_AI_TOKENIZER = "cl100k_base"
+ OPEN_AI_EMBEDDING_DIM = 1536  # 3072 if embedding model is text-embedding-3-large
+ TPM_LIMIT = 5000000
+ TPR_LIMIT = 300_000  # known empirically from an error message
+ MAX_LENGHT_ARRAY_FOR_BULK_EMBEDDINGS = 2048
+
+
+ def get_openai_client():
+     if not OPENAI_API_KEY:
+         raise ValueError(
+             "Please set an OPENAI_API_KEY environment variable to use embeddings"
+         )
+     return OpenAI(api_key=OPENAI_API_KEY)
+
+
+ def get_openai_embedding(document: str | dict) -> list[float]:
+     """embed a string into a vector using latest openai model, text-embedding-3-small
+
+     :param document: the string to be embedded
+     :return: the embedded vector
+     """
+     client = get_openai_client()
+
+     if isinstance(document, dict):
+         document = dict_to_markdown_headers_nested(document)
+     if not isinstance(document, str):
+         raise ValueError("document must be a string or dict")
+
+     try:
+         res = client.embeddings.create(input=document, model=OPENAI_EMBEDDING_MODEL)
+     except Exception as e:
+         if f"This model's maximum context length is {MAX_TOKENS} tokens" in str(e):
+             raise Exception(
+                 f"get_embedding: the document is too long to be vectorized, it is longer than {MAX_TOKENS} tokens"
+             )
+         else:
+             raise Exception(e)
+
+     return res.data[0].embedding
+
+
+ def get_openai_embeddings(
+     documents: list[str | dict], dimensions=None
+ ) -> list[list[float]]:
+     """embed a list of documents into vectors using latest openai model, text-embedding-3-small
+
+     :param documents: an array of documents
+     :return: an array of embedded vectors
+     """
+     _documents = documents.copy()
+     client = get_openai_client()
+     dimensions = dimensions or OPEN_AI_EMBEDDING_DIM
+
+     if not isinstance(documents, list):
+         raise ValueError("documents must be a list")
+
+     for i, doc in enumerate(documents):
+         if isinstance(doc, dict):
+             doc = dict_to_markdown_headers_nested(doc)
+             _documents[i] = doc
+         if not isinstance(doc, str):
+             raise ValueError("documents must be a list of strings or dict")
+
+     try:
+         max_token = min(max_number_of_tokens(_documents), MAX_TOKENS)
+         docs_per_batch = min(
+             TPM_LIMIT // max_token,
+             TPR_LIMIT // max_token,
+             MAX_LENGHT_ARRAY_FOR_BULK_EMBEDDINGS,
+         )  # TODO: leave a bit more margin?
+
+         embeddings = []
+         for i, chunk in enumerate(
+             [
+                 _documents[i : i + docs_per_batch]
+                 for i in range(0, len(_documents), docs_per_batch)
+             ]
+         ):
+             logger.debug(f"Embedding chunk {i+1} with {len(chunk)} documents...")
+             res = client.embeddings.create(
+                 input=[doc for doc in chunk],
+                 model=OPENAI_EMBEDDING_MODEL,
+                 dimensions=dimensions,
+             )
+             chunk_embeddings = [data.embedding for data in res.data]
+             embeddings.extend(chunk_embeddings)
+
+         return embeddings
+
+     except Exception as e:
+         if f"This model's maximum context length is {MAX_TOKENS} tokens" in str(e):
+             raise Exception(
+                 f"get_embedding: the document is too long to be vectorized, it is longer than {MAX_TOKENS} tokens"
+             )
+         else:
+             raise Exception(e)
+
+
+ def max_number_of_tokens(list):
+     return max([num_tokens_from_string(str(item)) for item in list])
+
+
+ def num_tokens_from_string(string: str, encoding_name: str = OPEN_AI_TOKENIZER) -> int:
+     """Count the number of tokens in a string
+
+     :param string: the string
+     :param encoding_name: the encoding model
+     :return: the number of tokens
+     """
+     if not string:
+         return 0
+     tokenizer = tiktoken.get_encoding(encoding_name)
+     num_tokens = len(tokenizer.encode(string))
+     return num_tokens
+
+
+ def chunk_text_words(text, max_tokens=MAX_TOKENS):
+     """Splits text into chunks of max_tokens or less."""
+     words = text.split()
+
+     chunks = []
+     current_chunk = []
+     current_tokens = 0
+
+     for word in words:
+         word_tokens = num_tokens_from_string(word)  # Count tokens for word
+         if current_tokens + word_tokens > max_tokens:
+             chunks.append(" ".join(current_chunk))
+             current_chunk = []
+             current_tokens = 0
+
+         current_chunk.append(word)
+         current_tokens += word_tokens
+
+     if current_chunk:
+         chunks.append(" ".join(current_chunk))
+
+     return chunks
+
+
+ def chunk_text_sentences(text, max_tokens=MAX_TOKENS):
+     # Sentence-split using regex (can also use nltk.sent_tokenize if preferred)
+     # TODO: should we do a sliding window for chunking?
+     sentences = re.split(r"(?<=[.!?])\s+", text)
+
+     chunks = []
+     current_chunk = ""
+     current_tokens = 0
+
+     for sentence in sentences:
+         sentence_tokens = num_tokens_from_string(sentence)
+
+         if current_tokens + sentence_tokens <= max_tokens:
+             current_chunk += " " + sentence if current_chunk else sentence
+             current_tokens += sentence_tokens
+         else:
+             if current_chunk:
+                 chunks.append(current_chunk.strip())
+             # Sentence too long to fit, need to split it
+             if sentence_tokens > max_tokens:
+                 words = sentence.split()
+                 sub_chunk = ""
+                 sub_tokens = 0
+                 for word in words:
+                     word_tokens = num_tokens_from_string(word + " ")
+                     if sub_tokens + word_tokens > max_tokens:
+                         chunks.append(sub_chunk.strip())
+                         sub_chunk = word
+                         sub_tokens = word_tokens
+                     else:
+                         sub_chunk += " " + word if sub_chunk else word
+                         sub_tokens += word_tokens
+                 if sub_chunk:
+                     chunks.append(sub_chunk.strip())
+                 current_chunk = ""
+                 current_tokens = 0
+             else:
+                 current_chunk = sentence
+                 current_tokens = sentence_tokens
+
+     if current_chunk:
+         chunks.append(current_chunk.strip())
+
+     return chunks
+
+
+ def truncate_text(text, max_tokens=MAX_TOKENS):
+     """Limits text to max_tokens or less by truncating."""
+     words = text.split()
+     truncated_text = []
+     current_length = 0
+
+     for word in words:
+         token_length = num_tokens_from_string(word)  # Count tokens for word
+         if current_length + token_length > max_tokens:
+             break  # Stop once limit is reached
+
+         truncated_text.append(word)
+         current_length += token_length
+
+     return " ".join(truncated_text)
+
+
+ def dict_to_markdown_headers_nested(d: dict, level: int = 1) -> str:
+     lines = []
+     for key, value in d.items():
+         header = "#" * level + f" {key}"
+         if isinstance(value, dict):
+             lines.append(header)
+             lines.append(dict_to_markdown_headers_nested(value, level + 1))
+         else:
+             lines.append(header)
+             lines.append(str(value).strip())
+         lines.append("")  # Blank line between sections
+     return "\n".join(lines)
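For orientation, here is a minimal usage sketch of the embedding helpers above; it is not part of the package diff and assumes lecrapaud is installed with OPENAI_API_KEY set in the environment (document contents are illustrative only).

    from lecrapaud.integrations.openai_integration import (
        get_openai_embeddings,
        chunk_text_sentences,
        num_tokens_from_string,
    )

    docs = [
        "Le Crapaud selects features before training models.",                      # plain string
        {"note": {"title": "nested", "body": "dicts are flattened to markdown"}},   # dict input
    ]
    vectors = get_openai_embeddings(docs)   # one 1536-dimensional vector per document
    assert len(vectors) == len(docs)

    long_text = "This is a sentence. " * 5000
    pieces = chunk_text_sentences(long_text)     # each chunk stays under MAX_TOKENS
    print(num_tokens_from_string(pieces[0]))     # token count of the first chunk

Batch size is derived from TPM_LIMIT and TPR_LIMIT and capped at MAX_LENGHT_ARRAY_FOR_BULK_EMBEDDINGS, so large document lists are sent in several requests.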

lecrapaud/jobs/__init__.py
@@ -0,0 +1,13 @@
+ from celery import Celery, signals
+ from lecrapaud.jobs import config
+ from lecrapaud.utils import setup_logger
+
+
+ @signals.setup_logging.connect
+ def configure_celery_logging(**kwargs):
+     setup_logger()
+
+
+ app = Celery("src")
+ app.config_from_object(config)
+ app.autodiscover_tasks(["src.jobs"])
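As a rough sketch (not part of the package), the app defined above could be driven programmatically as follows; it assumes Celery is installed and the Redis broker from lecrapaud/jobs/config.py is reachable.

    from lecrapaud.jobs import app

    if __name__ == "__main__":
        # Programmatic equivalent of `celery -A lecrapaud.jobs worker --loglevel=INFO`
        app.worker_main(["worker", "--loglevel=INFO"])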

lecrapaud/jobs/config.py
@@ -0,0 +1,17 @@
+ from lecrapaud.config import REDIS_URL
+
+ REDIS_URL = REDIS_URL + "/1"
+ broker_url = REDIS_URL
+ result_backend = REDIS_URL
+
+ # For RedBeat
+ redbeat_redis_url = REDIS_URL
+ beat_scheduler = "redbeat.RedBeatScheduler"
+
+ timezone = "UTC"
+
+ task_acks_late = True
+ task_reject_on_worker_lost = True
+ worker_prefetch_multiplier = 1
+ task_acks_on_failure_or_timeout = False
+ worker_concurrency = 1

lecrapaud/jobs/scheduler.py
@@ -0,0 +1,36 @@
+ from redbeat.schedulers import RedBeatSchedulerEntry
+ from celery.schedules import crontab
+ from lecrapaud.jobs.tasks import app
+
+
+ def schedule_tasks():
+     schedule_tasks_list = [
+         {
+             "name": "task_send_daily_emails",
+             "task": "src.jobs.tasks.task_send_daily_emails",
+             "schedule": crontab(minute=00, hour=12),
+         },
+         {
+             "name": "task_training_experiment",
+             "task": "src.jobs.tasks.task_training_experiment",
+             "schedule": crontab(minute=45, hour=00),
+         },
+     ]
+
+     for task in schedule_tasks_list:
+         entry = RedBeatSchedulerEntry(**task, app=app)
+         entry.save()
+
+
+ def unschedule_tasks():
+     unschedule_task_keys = [
+         "redbeat:task_send_daily_emails",
+         "redbeat:task_train_models",
+     ]
+
+     for key in unschedule_task_keys:
+         try:
+             entry = RedBeatSchedulerEntry.from_key(key, app=app)
+             entry.delete()
+         except KeyError:
+             pass
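A brief sketch (again, not part of the diff) of how these scheduling helpers might be invoked, assuming Redis is running and a beat process uses redbeat.RedBeatScheduler as configured above:

    from lecrapaud.jobs.scheduler import schedule_tasks, unschedule_tasks

    schedule_tasks()     # persists the two RedBeat entries to Redis
    # ... later, to remove them again:
    unschedule_tasks()   # deletes entries by key; missing keys raise KeyError, which is swallowed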

lecrapaud/jobs/tasks.py
@@ -0,0 +1,57 @@
+ from lecrapaud.jobs import app
+
+ # from honeybadger import honeybadger
+ from lecrapaud.send_daily_emails import send_daily_emails
+ from lecrapaud.config import DATASET_ID, RECEIVER_EMAIL
+ from lecrapaud.training import run_training
+ from lecrapaud.constants import stock_list_3
+ from lecrapaud.search_space import get_models_idx
+
+
+ @app.task(
+     bind=True,
+     autoretry_for=(Exception,),
+     retry_backoff=True,
+     retry_kwargs={"max_retries": 5},
+     acks_late=True,
+ )
+ def task_send_daily_emails(self):
+     try:
+         print(f"[Attempt #{self.request.retries}] task_send_daily_emails")
+         dataset_id = int(DATASET_ID)
+         email = RECEIVER_EMAIL
+         return send_daily_emails(email, dataset_id)
+     except Exception as e:
+         print(e)
+         # honeybadger.notify(e)
+         raise
+
+
+ @app.task(
+     bind=True,
+     autoretry_for=(Exception,),
+     retry_backoff=True,
+     retry_kwargs={"max_retries": 5},
+     acks_late=True,
+ )
+ def task_training_experiment(self):
+     try:
+         print(f"[Attempt #{self.request.retries}] task_training_experiment")
+         run_training(
+             years_of_data=20,
+             list_of_groups=stock_list_3,
+             targets_numbers=range(1, 15),
+             percentile=20,
+             corr_threshold=80,
+             max_features=25,
+             models_idx=get_models_idx("linear", "xgb"),
+             number_of_trials=20,
+             perform_hyperoptimization=True,
+             perform_crossval=False,
+             preserve_model=False,
+             session_name="20y_stock_list_3_linear_xgb",
+         )
+     except Exception as e:
+         print(e)
+         # honeybadger.notify(e)
+         raise
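Finally, a minimal sketch of enqueuing these tasks from application code; it assumes a worker is consuming from the configured Redis broker and that DATASET_ID and RECEIVER_EMAIL are set in the environment.

    from lecrapaud.jobs.tasks import task_send_daily_emails, task_training_experiment

    # Asynchronous dispatch through the broker; retries and backoff come from the task decorators above.
    result = task_send_daily_emails.delay()
    print(result.id)

    # Synchronous, in-process execution, useful when debugging:
    task_training_experiment.apply()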