lecrapaud-0.4.0-py3-none-any.whl
This diff shows the content of a publicly released package version as it appears in its public registry.
Potentially problematic release: this version of lecrapaud might be problematic.
- lecrapaud/__init__.py +0 -0
- lecrapaud/config.py +16 -0
- lecrapaud/db/__init__.py +0 -0
- lecrapaud/db/alembic/README +1 -0
- lecrapaud/db/alembic/env.py +78 -0
- lecrapaud/db/alembic/script.py.mako +26 -0
- lecrapaud/db/alembic/versions/2025_04_06_1738-7390745388e4_initial_setup.py +295 -0
- lecrapaud/db/alembic/versions/2025_04_06_1755-40cd8d3e798e_unique_constraint_for_data.py +30 -0
- lecrapaud/db/alembic/versions/2025_05_23_1724-2360941fa0bd_longer_string.py +52 -0
- lecrapaud/db/alembic/versions/2025_05_27_1159-b96396dcfaff_add_env_to_trading_tables.py +34 -0
- lecrapaud/db/alembic/versions/2025_05_27_1337-40cbfc215f7c_fix_nb_character_on_portfolio.py +39 -0
- lecrapaud/db/alembic/versions/2025_05_27_1526-3de994115317_to_datetime.py +36 -0
- lecrapaud/db/alembic/versions/2025_05_27_2003-25c227c684f8_add_fees_to_transactions.py +30 -0
- lecrapaud/db/alembic/versions/2025_05_27_2047-6b6f2d38e9bc_double_instead_of_float.py +132 -0
- lecrapaud/db/alembic/versions/2025_05_31_1111-c175e4a36d68_generalise_stock_to_group.py +36 -0
- lecrapaud/db/alembic/versions/2025_05_31_1256-5681095bfc27_create_investment_run_and_portfolio_.py +62 -0
- lecrapaud/db/alembic/versions/2025_05_31_1806-339927587383_add_investment_run_id.py +107 -0
- lecrapaud/db/alembic/versions/2025_05_31_1834-52b809a34371_make_nullablee.py +38 -0
- lecrapaud/db/alembic/versions/2025_05_31_1849-3b8550297e8e_change_date_to_datetime.py +44 -0
- lecrapaud/db/alembic/versions/2025_05_31_1852-e6b8c95d8243_add_date_to_portfolio_history.py +30 -0
- lecrapaud/db/alembic/versions/2025_06_10_1136-db8cdd83563a_addnewsandoptiontodata.py +32 -0
- lecrapaud/db/crud.py +179 -0
- lecrapaud/db/models/__init__.py +11 -0
- lecrapaud/db/models/base.py +6 -0
- lecrapaud/db/models/dataset.py +124 -0
- lecrapaud/db/models/feature.py +46 -0
- lecrapaud/db/models/feature_selection.py +126 -0
- lecrapaud/db/models/feature_selection_rank.py +80 -0
- lecrapaud/db/models/model.py +41 -0
- lecrapaud/db/models/model_selection.py +56 -0
- lecrapaud/db/models/model_training.py +54 -0
- lecrapaud/db/models/score.py +62 -0
- lecrapaud/db/models/target.py +59 -0
- lecrapaud/db/services.py +0 -0
- lecrapaud/db/setup.py +58 -0
- lecrapaud/directory_management.py +28 -0
- lecrapaud/feature_engineering.py +1119 -0
- lecrapaud/feature_selection.py +1229 -0
- lecrapaud/jobs/__init__.py +13 -0
- lecrapaud/jobs/config.py +17 -0
- lecrapaud/jobs/scheduler.py +36 -0
- lecrapaud/jobs/tasks.py +57 -0
- lecrapaud/model_selection.py +1571 -0
- lecrapaud/predictions.py +292 -0
- lecrapaud/search_space.py +844 -0
- lecrapaud/services/__init__.py +0 -0
- lecrapaud/services/embedding_categorical.py +71 -0
- lecrapaud/services/indicators.py +309 -0
- lecrapaud/speed_tests/experiments.py +139 -0
- lecrapaud/speed_tests/test-gpu-bilstm.ipynb +261 -0
- lecrapaud/speed_tests/test-gpu-resnet.ipynb +166 -0
- lecrapaud/speed_tests/test-gpu-transformers.ipynb +254 -0
- lecrapaud/speed_tests/tests.ipynb +145 -0
- lecrapaud/speed_tests/trash.py +37 -0
- lecrapaud/training.py +151 -0
- lecrapaud/utils.py +246 -0
- lecrapaud-0.4.0.dist-info/LICENSE +201 -0
- lecrapaud-0.4.0.dist-info/METADATA +103 -0
- lecrapaud-0.4.0.dist-info/RECORD +60 -0
- lecrapaud-0.4.0.dist-info/WHEEL +4 -0
lecrapaud/predictions.py
ADDED
@@ -0,0 +1,292 @@

import keras
import pickle
import pandas as pd
from pathlib import Path
import joblib
from datetime import timedelta, datetime
import logging

from src.search_space import ml_models, dl_recurrent_models
from src.data_sourcing import get_filtered_data
from src.feature_engineering import feature_engineering
from src.feature_selection import (
    encode_categorical_features,
    reshape_df,
    TARGETS_CLF,
    reshape_time_series,
)
from src.model_selection import predict, evaluate
from src.utils import logger
from src.db.models import Dataset
from src.config import LOGGING_LEVEL

MODELS_LIST = ml_models + dl_recurrent_models


def run_prediction(
    dataset_id: str,
    targets_numbers: list[int],
    test: bool = True,
    date: datetime = None,
    verbose: int = 0,
):
    """Run predictions on several TARGETS using the best model found for each."""
    if verbose == 0:
        logger.setLevel(logging.WARNING)

    logger.warning("Running prediction...")

    dataset = Dataset.get(dataset_id)
    dataset_dir = dataset.path
    preprocessing_dir = f"{dataset_dir}/preprocessing"
    list_of_groups = dataset.list_of_groups

    # Load the best model, threshold, features, and scalers for every target
    features_dict = {}
    scaler_y_dict = {}
    model_dict = {}
    threshold_dict = {}
    for target_number in targets_numbers:
        (
            model_dict[target_number],
            threshold_dict[target_number],
            features_dict[target_number],
            scaler_y_dict[target_number],
            all_features,
            scaler_x,
        ) = load_model(dataset, target_number)

    # Get data for backtesting
    if test:
        train_data_dir = f"{dataset_dir}/data"
        data_for_pred = joblib.load(f"{train_data_dir}/test.pkl")
        data_for_pred_scaled = joblib.load(f"{train_data_dir}/test_scaled.pkl")

        if any(
            config["recurrent"]
            for config in MODELS_LIST
            if config["model_name"] in model_dict.values()
        ):
            train_scaled = joblib.load(f"{train_data_dir}/train_scaled.pkl")
            val_scaled = joblib.load(f"{train_data_dir}/val_scaled.pkl")
            test_scaled = joblib.load(f"{train_data_dir}/test_scaled.pkl")
            reshaped_data = reshape_time_series(
                train_scaled, val_scaled, test_scaled, all_features, timesteps=120
            )
            data_for_pred_reshaped = reshaped_data["x_train_reshaped"]

        most_recent_data = joblib.load(f"{train_data_dir}/full.pkl")
        most_recent_data = most_recent_data.loc[data_for_pred.index]

        scores_clf = []
        scores_reg = []
    # Get data for predicting the future
    else:
        # TODO: if date is a bit older, we need more than 0 years of data
        most_recent_data = get_filtered_data(
            years_of_data=0, list_of_groups=list_of_groups
        )

        most_recent_data = feature_engineering(
            most_recent_data, for_training=False, save_as_csv=True
        )

        data_for_pred = encode_categorical_features(
            most_recent_data, save_dir=preprocessing_dir, fit=False
        )

        data_for_pred_scaled = pd.DataFrame(
            scaler_x.transform(data_for_pred[all_features]),
            columns=list(data_for_pred[all_features].columns),
            index=data_for_pred.index,
        )

        # TODO: don't we need 120 days of data for each stock?
        if any(
            config["recurrent"]
            for config in MODELS_LIST
            if config["model_name"] in model_dict.values()
        ):
            # Count the number of rows per stock
            counts = data_for_pred["STOCK"].value_counts()

            # Find stocks with insufficient history
            insufficient_stocks = counts[counts < 120]

            if not insufficient_stocks.empty:
                raise ValueError(
                    f"Insufficient history for stocks: {', '.join(insufficient_stocks.index)}"
                )

            data_for_pred_reshaped = reshape_df(
                data_for_pred_scaled[all_features], data_for_pred["STOCK"], 120
            )

    # Make predictions
    for target_number in targets_numbers:

        # Prepare variables and data
        target_type = "classification" if target_number in TARGETS_CLF else "regression"
        features = features_dict[target_number]
        model = model_dict[target_number]
        threshold = threshold_dict[target_number]

        config = [
            config for config in MODELS_LIST if config["model_name"] == model.model_name
        ]
        if not config:
            raise Exception(f"Model {model.model_name} was not found in search space.")
        config = config[0]

        need_scaling = config["need_scaling"] and target_type == "regression"
        if config["recurrent"]:
            features_idx = [i for i, e in enumerate(all_features) if e in set(features)]
            x_pred = data_for_pred_reshaped[:, :, features_idx]
        else:
            x_pred = (
                data_for_pred_scaled[features]
                if need_scaling
                else data_for_pred[features]
            )

        # Predict
        y_pred = predict(model, x_pred, target_type, config, threshold)

        # Fix for recurrent models: the prediction has no index because x_pred is a 3D numpy array
        if config["recurrent"]:
            y_pred.index = (
                most_recent_data.index
            )  # TODO: not sure this works for an old dataset not aligned with data_for_training in the test use case (done, this is why we decode the test set)

        # Unscale the prediction
        if need_scaling or config["recurrent"]:
            scaler_y = scaler_y_dict[target_number]
            y_pred = pd.Series(
                scaler_y.inverse_transform(y_pred.values.reshape(-1, 1)).flatten(),
                index=most_recent_data.index,
            )
            y_pred.name = "PRED"

        # Evaluate if test
        if test:
            prediction = pd.concat(
                [most_recent_data[f"TARGET_{target_number}"], y_pred], axis=1
            )
            prediction.rename(
                columns={f"TARGET_{target_number}": "TARGET"}, inplace=True
            )
            score = evaluate(prediction, target_type)
            score["TARGET"] = f"TARGET_{target_number}"
            if target_type == "classification":
                scores_clf.append(score)
            else:
                scores_reg.append(score)

        if isinstance(y_pred, pd.DataFrame):
            y_pred.rename(
                columns={"PRED": f"TARGET_{target_number}_PRED"}, inplace=True
            )
            most_recent_data = pd.concat(
                [most_recent_data, y_pred[f"TARGET_{target_number}_PRED"]], axis=1
            )
        else:
            y_pred.name = f"TARGET_{target_number}_PRED"
            most_recent_data = pd.concat([most_recent_data, y_pred], axis=1)

    # Return results either for the test set or for tomorrow's prediction
    result = most_recent_data

    if verbose == 0:
        logger.setLevel(LOGGING_LEVEL)

    if test:
        logger.info("Test results on test set")
        scores_reg = pd.DataFrame(scores_reg).set_index("TARGET")
        scores_clf = pd.DataFrame(scores_clf).set_index("TARGET")
        return result, scores_reg, scores_clf, prediction
    elif date:
        date = date.replace(hour=0, minute=0, second=0, microsecond=0)
        tomorrow = date + timedelta(days=1)
        logger.info(f"Prediction for: {tomorrow.date()}")
        result = result[result["DATE"] == date]
        return result, None, None, None
    else:
        date = datetime.today()
        max_date = result["DATE"].max()
        if max_date.date() != date.date():
            logger.info(
                f"The maximum date found in the dataset is {max_date} and not {date}"
            )
        tomorrow = max_date + timedelta(days=1)
        logger.info(f"Prediction for tomorrow: {tomorrow.date()}")

        # Filter the DataFrame for the last date
        filtered_result = result[result["DATE"] == max_date]

        return filtered_result, None, None, None

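# Illustrative usage sketch (editor's addition, not part of the packaged file).
# The dataset id and target numbers are hypothetical, and the call assumes the
# src.* modules imported above are importable at runtime:
#
#     result, scores_reg, scores_clf, prediction = run_prediction(
#         dataset_id="1",
#         targets_numbers=[1, 2],
#         test=True,    # backtest on the stored test set
#         verbose=1,
#     )
#     print(scores_reg)  # one evaluation row per regression target
#     print(scores_clf)  # one evaluation row per classification target
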
# Helpers
def load_model(dataset: Dataset, target_number: int):
    dataset_dir = dataset.path
    training_target_dir = f"{dataset_dir}/TARGET_{target_number}"
    preprocessing_dir = f"{dataset_dir}/preprocessing"

    # Search for files that contain '.best' or '.keras' in the name
    scores_tracking = pd.read_csv(f"{training_target_dir}/scores_tracking.csv")
    training_target_dir = Path(training_target_dir)
    best_files = list(training_target_dir.glob("*.best*")) + list(
        training_target_dir.glob("*.keras*")
    )
    threshold = (
        scores_tracking["THRESHOLD"].values[0]
        if "THRESHOLD" in scores_tracking.columns
        else None
    )

    # If any files are found, try loading the first one (or process as needed)
    if best_files:
        file_path = best_files[0]  # Assuming you want to open the first matching file
        try:
            # Attempt to load the file as a scikit-learn, XGBoost, or LightGBM model (pickle format)
            model = joblib.load(file_path)
            logger.info(f"Loaded model {model.model_name} and threshold {threshold}")
        except (pickle.UnpicklingError, EOFError):
            # If it's not a pickle file, try loading it as a Keras model
            try:
                model = keras.models.load_model(file_path)
                logger.info(
                    f"Loaded model {model.model_name} and threshold {threshold}"
                )
            except Exception as e:
                raise FileNotFoundError(
                    f"Model could not be loaded from path: {file_path}: {e}"
                )
    else:
        raise FileNotFoundError(
            f"No files with '.best' or '.keras' found in the specified folder: {training_target_dir}"
        )

    if dataset.name == "data_28_X_X":
        features = joblib.load(
            f"{preprocessing_dir}/features_{target_number}.pkl"
        )  # we keep this for backward compatibility
    else:
        features = dataset.get_features(target_number)

    scaler_y = None
    if target_number not in TARGETS_CLF:
        scaler_y = joblib.load(f"{preprocessing_dir}/scaler_y_{target_number}.pkl")

    if dataset.name == "data_28_X_X":
        all_features = joblib.load(
            f"{preprocessing_dir}/all_features.pkl"
        )  # we keep this for backward compatibility
    else:
        all_features = dataset.get_all_features()
    scaler_x = joblib.load(f"{preprocessing_dir}/scaler_x.pkl")

    return model, threshold, features, scaler_y, all_features, scaler_x
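
For reference, a minimal sketch of calling the load_model helper directly; the dataset id and target number below are hypothetical, and the call assumes the dataset's directories contain the trained artifacts described above:

# Hypothetical direct call; the dataset id and target number are made up.
dataset = Dataset.get("1")
(
    model,          # best estimator found under {dataset.path}/TARGET_1
    threshold,      # classification threshold, or None if not tracked
    features,       # features selected for this target
    scaler_y,       # None for classification targets
    all_features,   # full feature list expected by scaler_x
    scaler_x,       # fitted input scaler
) = load_model(dataset, target_number=1)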