lecrapaud 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lecrapaud/__init__.py +1 -0
- lecrapaud/api.py +271 -0
- lecrapaud/config.py +25 -0
- lecrapaud/db/__init__.py +1 -0
- lecrapaud/db/alembic/README +1 -0
- lecrapaud/db/alembic/env.py +78 -0
- lecrapaud/db/alembic/script.py.mako +26 -0
- lecrapaud/db/alembic/versions/2025_04_06_1738-7390745388e4_initial_setup.py +295 -0
- lecrapaud/db/alembic/versions/2025_04_06_1755-40cd8d3e798e_unique_constraint_for_data.py +30 -0
- lecrapaud/db/alembic/versions/2025_05_23_1724-2360941fa0bd_longer_string.py +52 -0
- lecrapaud/db/alembic/versions/2025_05_27_1159-b96396dcfaff_add_env_to_trading_tables.py +34 -0
- lecrapaud/db/alembic/versions/2025_05_27_1337-40cbfc215f7c_fix_nb_character_on_portfolio.py +39 -0
- lecrapaud/db/alembic/versions/2025_05_27_1526-3de994115317_to_datetime.py +36 -0
- lecrapaud/db/alembic/versions/2025_05_27_2003-25c227c684f8_add_fees_to_transactions.py +30 -0
- lecrapaud/db/alembic/versions/2025_05_27_2047-6b6f2d38e9bc_double_instead_of_float.py +132 -0
- lecrapaud/db/alembic/versions/2025_05_31_1111-c175e4a36d68_generalise_stock_to_group.py +36 -0
- lecrapaud/db/alembic/versions/2025_05_31_1256-5681095bfc27_create_investment_run_and_portfolio_.py +62 -0
- lecrapaud/db/alembic/versions/2025_05_31_1806-339927587383_add_investment_run_id.py +107 -0
- lecrapaud/db/alembic/versions/2025_05_31_1834-52b809a34371_make_nullablee.py +38 -0
- lecrapaud/db/alembic/versions/2025_05_31_1849-3b8550297e8e_change_date_to_datetime.py +44 -0
- lecrapaud/db/alembic/versions/2025_05_31_1852-e6b8c95d8243_add_date_to_portfolio_history.py +30 -0
- lecrapaud/db/alembic/versions/2025_06_10_1136-db8cdd83563a_addnewsandoptiontodata.py +32 -0
- lecrapaud/db/alembic/versions/2025_06_17_1652-c45f5e49fa2c_make_fields_nullable.py +89 -0
- lecrapaud/db/models/__init__.py +11 -0
- lecrapaud/db/models/base.py +181 -0
- lecrapaud/db/models/dataset.py +129 -0
- lecrapaud/db/models/feature.py +45 -0
- lecrapaud/db/models/feature_selection.py +125 -0
- lecrapaud/db/models/feature_selection_rank.py +79 -0
- lecrapaud/db/models/model.py +40 -0
- lecrapaud/db/models/model_selection.py +63 -0
- lecrapaud/db/models/model_training.py +62 -0
- lecrapaud/db/models/score.py +65 -0
- lecrapaud/db/models/target.py +67 -0
- lecrapaud/db/session.py +45 -0
- lecrapaud/directory_management.py +28 -0
- lecrapaud/experiment.py +64 -0
- lecrapaud/feature_engineering.py +846 -0
- lecrapaud/feature_selection.py +1167 -0
- lecrapaud/integrations/openai_integration.py +225 -0
- lecrapaud/jobs/__init__.py +13 -0
- lecrapaud/jobs/config.py +17 -0
- lecrapaud/jobs/scheduler.py +36 -0
- lecrapaud/jobs/tasks.py +57 -0
- lecrapaud/model_selection.py +1671 -0
- lecrapaud/predictions.py +292 -0
- lecrapaud/preprocessing.py +984 -0
- lecrapaud/search_space.py +848 -0
- lecrapaud/services/__init__.py +0 -0
- lecrapaud/services/embedding_categorical.py +71 -0
- lecrapaud/services/indicators.py +309 -0
- lecrapaud/speed_tests/experiments.py +139 -0
- lecrapaud/speed_tests/test-gpu-bilstm.ipynb +261 -0
- lecrapaud/speed_tests/test-gpu-resnet.ipynb +166 -0
- lecrapaud/speed_tests/test-gpu-transformers.ipynb +254 -0
- lecrapaud/speed_tests/tests.ipynb +145 -0
- lecrapaud/speed_tests/trash.py +37 -0
- lecrapaud/training.py +239 -0
- lecrapaud/utils.py +246 -0
- lecrapaud-0.1.0.dist-info/LICENSE +201 -0
- lecrapaud-0.1.0.dist-info/METADATA +105 -0
- lecrapaud-0.1.0.dist-info/RECORD +63 -0
- lecrapaud-0.1.0.dist-info/WHEEL +4 -0
@@ -0,0 +1,1671 @@
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
import time
import os
import json
import warnings
import joblib
import glob
from pathlib import Path
import pickle

os.environ["COVERAGE_FILE"] = str(Path(".coverage").resolve())

# ML models
from sklearn.model_selection import TimeSeriesSplit
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import (
    mean_absolute_percentage_error,
    root_mean_squared_error,
    mean_absolute_error,
    r2_score,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    log_loss,
    roc_auc_score,
    roc_curve,
    auc,
    precision_recall_curve,
    average_precision_score,
    confusion_matrix,
)
from sklearn.preprocessing import LabelBinarizer
import lightgbm as lgb
import xgboost as xgb

# DL models
import tensorflow as tf
import keras
from keras.callbacks import EarlyStopping, TensorBoard
from keras.metrics import (
    Precision,
    Recall,
    F1Score,
)
from keras.losses import BinaryCrossentropy, CategoricalCrossentropy
from keras.optimizers import Adam

K = tf.keras.backend
from tensorboardX import SummaryWriter

# Optimization
import ray
from ray.tune import Tuner, TuneConfig, with_parameters
from ray.train import RunConfig
from ray.tune.search.hyperopt import HyperOptSearch
from ray.tune.search.bayesopt import BayesOptSearch
from ray.tune.logger import TBXLoggerCallback
from ray.tune.schedulers import ASHAScheduler
from ray.air import session

# Internal library
from lecrapaud.search_space import all_models
from lecrapaud.directory_management import clean_directory
from lecrapaud.utils import copy_any, contains_best, logger, serialize_for_json
from lecrapaud.config import PYTHON_ENV
from lecrapaud.feature_selection import load_train_data
from lecrapaud.db import (
    Model,
    ModelSelection,
    ModelTraining,
    Score,
    Target,
    Dataset,
)

# Reproducible results
keras.utils.set_random_seed(42)
np.random.seed(42)
tf.config.experimental.enable_op_determinism()


# test configuration
def test_hardware():
    devices = tf.config.list_physical_devices()
    logger.info(f"\nDevices: {devices}")

    gpus = tf.config.list_physical_devices("GPU")
    if gpus:
        details = tf.config.experimental.get_device_details(gpus[0])
        logger.info(f"GPU details: {details}")


# Suppress specific warning messages related to file system monitor
# logging.getLogger("ray").setLevel(logging.CRITICAL)
# logging.getLogger("ray.train").setLevel(logging.CRITICAL)
# logging.getLogger("ray.tune").setLevel(logging.CRITICAL)
# logging.getLogger("ray.autoscaler").setLevel(logging.CRITICAL)
# logging.getLogger("ray.raylet").setLevel(logging.CRITICAL)
# logging.getLogger("ray.monitor").setLevel(logging.CRITICAL)
# logging.getLogger("ray.dashboard").setLevel(logging.CRITICAL)
# logging.getLogger("ray.gcs_server").setLevel(logging.CRITICAL)

warnings.filterwarnings("ignore", category=UserWarning, module="pydantic")

class ModelEngine:

    def __init__(
        self,
        model_name: str = None,
        target_type: str = None,
        path: str = None,
        search_params: dict = None,
        create_model=None,
        plot: bool = False,
        log_dir: str = None,
    ):
        self.path = path
        self._model = None
        self.threshold = None

        if path:
            self.load()
        else:
            self.model_name = model_name
            self.target_type = target_type

            config = [
                config for config in all_models if config["model_name"] == self.model_name
            ]
            if len(config) == 0:
                raise ValueError(
                    f"Model {self.model_name} is not supported by this library. "
                    f"Choose a model from the list of supported models: "
                    f"{', '.join(model['model_name'] for model in all_models)}"
                )
            config = config[0]

            self.recurrent = config["recurrent"]
            self.need_scaling = config["need_scaling"]
            self.search_params = search_params or {}
            self.create_model = create_model
            self.plot = plot
            self.log_dir = log_dir

            if self.need_scaling and self.target_type == "regression":
                self.scaler_y = joblib.load(f"{self.path}/scaler_y.pkl")
            else:
                self.scaler_y = None

    def fit(self, *args):
        if self.recurrent:
            fit = self.fit_recurrent
        elif (self.create_model == "lgb") or (self.create_model == "xgb"):
            fit = self.fit_boosting
        else:
            fit = self.fit_sklearn
        model = fit(*args)
        return model

    # Functions to fit & evaluate models
    def fit_sklearn(self, x_train, y_train, x_val, y_val, params):

        # Create & compile the model
        model = self.create_model(**params)

        # Train the model
        logger.info("Fitting the model...")
        logger.info(f"x_train shape: {x_train.shape}, x_val shape: {x_val.shape}")
        logger.info(f"y_train shape: {y_train.shape}, y_val shape: {y_val.shape}")

        model.fit(x_train, y_train)

        if (
            self.target_type == "classification"
            and "loss" in model.get_params().keys()
            and "hinge" in model.get_params()["loss"]
        ):
            # This is for SVC models with hinge loss.
            # Use CalibratedClassifierCV for classifiers that do not natively
            # output well-calibrated probability estimates.
            # TODO: investigate whether we should also calibrate random forest,
            # gradient boosting, and bagging models
            logger.info(
                f"Re-calibrating {self.model_name} to get predicted probabilities..."
            )
            calibrator = CalibratedClassifierCV(model, cv="prefit", n_jobs=-1)
            model = calibrator.fit(x_train, y_train)

        # set model_name after calibration
        model.model_name = self.model_name
        model.target_type = self.target_type

        logger.info(f"Successfully created a {model.model_name} at {datetime.now()}")

        self._model = model

        return model

    def fit_boosting(self, x_train, y_train, x_val, y_val, params):
        """
        This uses the LightGBM or XGBoost C++ libraries.
        """
        lightGBM = self.create_model == "lgb"

        # Datasets
        boosting_dataset = lgb.Dataset if lightGBM else xgb.DMatrix
        train_data = boosting_dataset(x_train, label=y_train)
        val_data = boosting_dataset(x_val, label=y_val)

        # Create a TensorBoardX writer
        writer = SummaryWriter(self.log_dir)
        evals_result = {}

        # Training
        labels = np.unique(y_train)
        num_class = (
            labels.size
            if self.target_type == "classification" and labels.size > 2
            else 1
        )
        logger.info("Fitting the model...")
        logger.info(f"x_train shape: {x_train.shape}, x_val shape: {x_val.shape}")
        logger.info(f"y_train shape: {y_train.shape}, y_val shape: {y_val.shape}")

        if lightGBM:

            def tensorboard_callback(env):
                for i, metric in enumerate(env.evaluation_result_list):
                    metric_name, _, metric_value, _ = metric
                    writer.add_scalar(
                        f"LightGBM/{metric_name}", metric_value, env.iteration
                    )

            loss = (
                "regression"
                if self.target_type == "regression"
                else ("binary" if num_class <= 2 else "multiclass")
            )
            eval_metric = (
                "rmse"
                if self.target_type == "regression"
                else ("binary_logloss" if num_class <= 2 else "multi_logloss")
            )
            model = lgb.train(
                params={
                    **params["model_params"],
                    "objective": loss,
                    "metric": eval_metric,
                    "num_class": num_class,
                },
                num_boost_round=params["num_boost_round"],
                train_set=train_data,
                valid_sets=[train_data, val_data],
                valid_names=["train", "val"],
                callbacks=[
                    lgb.early_stopping(stopping_rounds=params["early_stopping_rounds"]),
                    lgb.record_evaluation(evals_result),
                    tensorboard_callback,
                ],
            )
        else:

            class TensorBoardCallback(xgb.callback.TrainingCallback):

                def __init__(self, log_dir: str):
                    self.writer = SummaryWriter(log_dir=log_dir)

                def after_iteration(
                    self,
                    model,
                    epoch: int,
                    evals_log: xgb.callback.TrainingCallback.EvalsLog,
                ) -> bool:
                    if not evals_log:
                        return False

                    for data, metric in evals_log.items():
                        for metric_name, log in metric.items():
                            score = (
                                log[-1][0] if isinstance(log[-1], tuple) else log[-1]
                            )
                            self.writer.add_scalar(f"XGBoost/{data}", score, epoch)

                    return False

            tensorboard_callback = TensorBoardCallback(self.log_dir)

            loss = (
                "reg:squarederror"
                if self.target_type == "regression"
                else ("binary:logistic" if num_class <= 2 else "multi:softprob")
            )
            eval_metric = (
                "rmse"
                if self.target_type == "regression"
                else ("logloss" if num_class <= 2 else "mlogloss")
            )
            model = xgb.train(
                params={
                    **params["model_params"],
                    "objective": loss,
                    "eval_metric": eval_metric,
                    "num_class": num_class,
                },
                num_boost_round=params["num_boost_round"],
                dtrain=train_data,
                evals=[(val_data, "val"), (train_data, "train")],
                callbacks=[
                    xgb.callback.EarlyStopping(
                        rounds=params["early_stopping_rounds"], save_best=True
                    ),
                    xgb.callback.EvaluationMonitor(),  # shows evaluation results at each iteration
                    tensorboard_callback,
                ],
                evals_result=evals_result,  # record evaluation results
                verbose_eval=0,
            )

        model.model_name = self.create_model
        model.target_type = self.target_type
        logger.info(f"Successfully created a {model.model_name} at {datetime.now()}")

        # Close the writer after training is done
        writer.close()

        if self.plot:
            # Plot loss per epoch
            train_loss = evals_result["train"][eval_metric]
            val_loss = evals_result["val"][eval_metric]
            logs = pd.DataFrame({"train": train_loss, "val": val_loss})

            plt.figure(figsize=(14, 4))
            plt.plot(logs.loc[:, "train"], lw=2, label="Training loss")
            plt.plot(logs.loc[:, "val"], lw=2, label="Validation loss")
            plt.xlabel("Epoch")
            plt.ylabel("Loss")
            plt.legend()
            plt.show()

        self._model = model

        return model

    def fit_recurrent(self, x_train, y_train, x_val, y_val, params):

        # metrics functions
        def rmse_tf(y_true, y_pred):
            y_true, y_pred = unscale_tf(y_true, y_pred)
            results = K.sqrt(K.mean(K.square(y_pred - y_true)))
            return results

        def mae_tf(y_true, y_pred):
            y_true, y_pred = unscale_tf(y_true, y_pred)
            results = K.mean(K.abs(y_pred - y_true))
            return results

        def unscale_tf(y_true, y_pred):
            if self.target_type == "regression":
                scale = K.constant(self.scaler_y.scale_[0])
                mean = K.constant(self.scaler_y.mean_[0])

                y_true = y_true * scale  # element-wise; the Keras backend has no K.mul
                y_true = K.bias_add(y_true, mean)

                y_pred = y_pred * scale
                y_pred = K.bias_add(y_pred, mean)
            return y_true, y_pred

        # Create the model
        labels = np.unique(y_train[:, 0])
        num_class = labels.size if self.target_type == "classification" else None
        input_shape = (x_train.shape[1], x_train.shape[2])
        model = self.create_model(params, input_shape, self.target_type, num_class)
        model.model_name = self.model_name
        model.target_type = self.target_type

        # Compile the model
        loss = (
            rmse_tf
            if self.target_type == "regression"
            else (
                BinaryCrossentropy(from_logits=False)
                if num_class <= 2
                else CategoricalCrossentropy(from_logits=False)
            )
        )
        optimizer = Adam(
            learning_rate=params["learning_rate"], clipnorm=params["clipnorm"]
        )
        metrics = (
            [mae_tf]
            if self.target_type == "regression"
            else (
                ["accuracy", Precision(), Recall()]
                if num_class <= 2
                else ["categorical_accuracy"]
            )
        )
        model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

        # Callbacks
        tensorboard_callback = TensorBoard(log_dir=self.log_dir)
        early_stopping_callback = EarlyStopping(
            monitor="val_loss",
            patience=3,
            restore_best_weights=True,
            start_from_epoch=5,
        )

        # Custom callbacks
        class PrintTrainableWeights(keras.callbacks.Callback):
            def on_epoch_end(self, epoch, logs={}):
                logger.info(model.trainable_variables)

        class GradientCalcCallback(keras.callbacks.Callback):
            def __init__(self):
                self.epoch_gradient = []

            def get_gradient_func(self, model):
                # grads = K.gradients(model.total_loss, model.trainable_weights)
                grads = K.gradients(model.loss, model.trainable_weights)
                # inputs = model.model.inputs + model.targets + model.sample_weights
                # use the line below if the one above doesn't work for you
                # inputs = model.model._feed_inputs + model.model._feed_targets + model.model._feed_sample_weights
                inputs = (
                    model._feed_inputs
                    + model._feed_targets
                    + model._feed_sample_weights
                )
                func = K.function(inputs, grads)
                return func

            def on_epoch_end(self, epoch, logs=None):
                get_gradient = self.get_gradient_func(model)
                grads = get_gradient([x_val, y_val[:, 0], np.ones(len(y_val[:, 0]))])
                self.epoch_gradient.append(grads)

        # Train the model
        if self.target_type == "classification" and num_class > 2:
            lb = LabelBinarizer(sparse_output=False)  # change to True for a sparse matrix
            lb.fit(labels)
            y_train = lb.transform(y_train[:, 0].flatten())
            y_val = lb.transform(y_val[:, 0].flatten())
        else:
            y_train = y_train[:, 0].flatten()
            y_val = y_val[:, 0].flatten()

        logger.info("Fitting the model...")
        logger.info(f"x_train shape: {x_train.shape}, x_val shape: {x_val.shape}")
        logger.info(f"y_train shape: {y_train.shape}, y_val shape: {y_val.shape}")

        history = model.fit(
            x_train,
            y_train,
            batch_size=params["batch_size"],
            verbose=0,
            epochs=params["epochs"],
            shuffle=False,
            validation_data=(x_val, y_val),
            callbacks=[early_stopping_callback, tensorboard_callback],
        )

        logger.info(f"Successfully created a {model.model_name} at {datetime.now()}")
        # logger.info(pd.DataFrame(gradient.epoch_gradient))

        if self.plot:
            # Plot loss per epoch
            logs = pd.DataFrame(history.history)

            plt.figure(figsize=(14, 4))
            plt.plot(logs.loc[:, "loss"], lw=2, label="Training loss")
            plt.plot(logs.loc[:, "val_loss"], lw=2, label="Validation loss")
            plt.xlabel("Epoch")
            plt.ylabel("Loss")
            plt.legend()
            plt.show()

        self._model = model

        return model

    def predict(
        self,
        data: pd.DataFrame,
        threshold: float = 0.5,
    ):
        """Get predictions from the model. Supports scikit-learn, Keras, and
        boosting models such as XGBoost and LightGBM.

        Args:
            - data: the data to predict on
            - threshold: the classification threshold
        """
        if not self._model:
            raise Exception(
                "Model is not fitted, cannot predict. Run model.fit() first, or pass a fitted model to the `model` parameter when creating the Model object."
            )
        model = self._model

        if self.threshold and threshold == 0.5:
            threshold = self.threshold

        if self.recurrent or model.model_name in ["lgb", "xgb"]:
            # keras, lgb & xgb
            if model.model_name == "lgb":
                # Direct prediction for LightGBM
                pred = model.predict(data)
            elif model.model_name == "xgb":
                # Convert the data to DMatrix for XGBoost
                d_data = xgb.DMatrix(data)
                pred = model.predict(d_data)
            else:
                # Reshape (flatten) for keras if not multiclass
                pred = model.predict(data)
                if pred.shape[1] == 1:
                    pred = pred.reshape(-1)

            if self.target_type == "classification":
                num_class = pred.shape[1] if len(pred.shape) > 1 else 2

                if num_class <= 2:
                    # For binary classification, concatenate the predicted probabilities for both classes
                    pred_df = pd.DataFrame(
                        {
                            0: 1 - pred,  # probability of class 0
                            1: pred,  # probability of class 1
                        },
                    )
                else:
                    # For multi-class classification, use the predicted probabilities for each class
                    pred_df = pd.DataFrame(pred, columns=range(num_class))

                # Get final predictions (argmax for multi-class, threshold for binary)
                if num_class == 2:
                    pred_df["PRED"] = np.where(
                        pred_df[1] >= threshold, 1, 0
                    )  # class 1 if prob >= threshold
                else:
                    pred_df["PRED"] = pred_df.idxmax(
                        axis=1
                    )  # class with the highest probability for multiclass

                # Reorder columns to show the predicted class first, then the probabilities
                pred = pred_df[["PRED"] + list(range(num_class))]

            else:
                pred = pd.Series(pred, name="PRED")

            # set index for lgb and xgb (for keras, as we use np arrays, the index is set outside)
            if model.model_name in ["lgb", "xgb"]:
                pred.index = data.index
        else:
            # scikit-learn
            pred = pd.Series(model.predict(data), index=data.index, name="PRED")
            if self.target_type == "classification":
                pred_proba = pd.DataFrame(
                    model.predict_proba(data),
                    index=data.index,
                    columns=[
                        int(c) if isinstance(c, float) and c.is_integer() else c
                        for c in model.classes_
                    ],
                )

                # Apply threshold for binary classification
                if len(model.classes_) == 2:
                    positive_class = model.classes_[1]  # assuming classes are ordered
                    pred = (pred_proba[positive_class] >= threshold).astype(int)
                    pred.name = "PRED"

                pred = pd.concat([pred, pred_proba], axis=1)

        return pred

    def save(self, path):
        if self.recurrent:
            path += "/" + self.model_name + ".keras"
            self._model.save(path)
        else:
            path += "/" + self.model_name + ".best"
            joblib.dump(self._model, path)
        self.path = path
        return path

    def load(self):
        if not self.path:
            raise ValueError("Path is not set, cannot load model")

        training_target_dir = Path(self.path)

        # Load threshold
        scores_tracking = pd.read_csv(f"{training_target_dir}/scores_tracking.csv")
        self.threshold = (
            scores_tracking["THRESHOLD"].values[0]
            if "THRESHOLD" in scores_tracking.columns
            else None
        )

        # Search for files that contain '.best' or '.keras' in the name
        best_files = list(training_target_dir.glob("*.best*")) + list(
            training_target_dir.glob("*.keras*")
        )
        # If any files are found, try loading the first one (or process as needed)
        if best_files:
            file_path = best_files[0]  # open the first matching file
            try:
                # Attempt to load the file as a scikit-learn, XGBoost, or LightGBM model (pickle format)
                self._model = joblib.load(file_path)
                logger.info(
                    f"Loaded model {self._model.model_name} and threshold {self.threshold}"
                )
            except (pickle.UnpicklingError, EOFError):
                # If it's not a pickle file, try loading it as a Keras model
                try:
                    self._model = keras.models.load_model(file_path)
                    logger.info(
                        f"Loaded model {self._model.model_name} and threshold {self.threshold}"
                    )
                except Exception as e:
                    raise FileNotFoundError(
                        f"Model could not be loaded from path: {file_path}: {e}"
                    )
        else:
            raise FileNotFoundError(
                f"No files with '.best' or '.keras' found in the specified folder: {training_target_dir}"
            )

        self.model_name = self._model.model_name
        self.target_type = self._model.target_type

    def __getattr__(self, attr):
        return getattr(self._model, attr)


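A minimal usage sketch of the engine above (an editorial illustration, not shipped with the package): it assumes all_models contains a "random_forest" entry with recurrent=False and need_scaling=False so that the scikit-learn path is taken; the lambda and its parameters are placeholders.

    import numpy as np
    import pandas as pd
    from sklearn.ensemble import RandomForestClassifier
    from lecrapaud.model_selection import ModelEngine

    # Toy data: 200 rows, 5 features, binary target derived from F0
    rng = np.random.default_rng(0)
    X = pd.DataFrame(rng.normal(size=(200, 5)), columns=[f"F{i}" for i in range(5)])
    y = pd.Series((X["F0"] > 0).astype(int), name="TARGET")

    engine = ModelEngine(
        model_name="random_forest",  # assumed entry of all_models
        target_type="classification",
        create_model=lambda **p: RandomForestClassifier(**p),
    )
    engine.fit(X[:150], y[:150], X[150:], y[150:], {"n_estimators": 100})
    pred = engine.predict(X[150:])  # DataFrame with PRED and per-class probabilities
    path = engine.save("/tmp")      # writes /tmp/random_forest.best via joblib
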
def trainable(
    params,
    x_train,
    y_train,
    x_val,
    y_val,
    model_name,
    target_type,
    session_name,
    target_number,
    create_model,
    type_name="hyperopts",
    plot=False,
):
    """Standalone version of train_model that doesn't depend on self"""
    # Create model engine
    model = ModelEngine(
        model_name=model_name,
        target_type=target_type,
        create_model=create_model,
        plot=plot,
    )

    logger.info(
        f"TARGET_{target_number} - Training a {model.model_name} at {datetime.now()} : {session_name}, TARGET_{target_number}"
    )

    if model.recurrent:
        timesteps = params["timesteps"]
        x_train = x_train[:, -timesteps:, :]
        x_val = x_val[:, -timesteps:, :]

    # Compile and fit model on the train set
    start = time.time()
    model.fit(x_train, y_train, x_val, y_val, params)
    stop = time.time()

    # Prediction on the validation set
    y_pred = model.predict(x_val)

    # fix for recurrent models: x_val has no index because it is a 3D np array
    if model.recurrent:
        y_val = pd.DataFrame(y_val, columns=["TARGET", "index"]).set_index("index")
        y_pred.index = y_val.index

    prediction = pd.concat([y_val, y_pred], axis=1)

    # Unscale the data
    if (
        model.need_scaling
        and model.target_type == "regression"
        and model.scaler_y is not None
    ):
        # scaler_y needs a 2D array with shape (-1, 1)
        prediction.loc[:, "TARGET"] = model.scaler_y.inverse_transform(
            prediction[["TARGET"]].values
        )
        prediction.loc[:, "PRED"] = model.scaler_y.inverse_transform(
            prediction[["PRED"]].values
        )

    # Evaluate model
    score = {
        "DATE": datetime.now(),
        "SESSION": session_name,
        "TRAIN_DATA": x_train.shape[0],
        "VAL_DATA": x_val.shape[0],
        "FEATURES": x_train.shape[-1],
        "MODEL_NAME": model.model_name,
        "TYPE": type_name,
        "TRAINING_TIME": stop - start,
        "EVAL_DATA_STD": prediction["TARGET"].std(),
    }

    score.update(evaluate(prediction, target_type))

    if type_name == "hyperopts":
        session.report(metrics=score)
        return score

    return score, model, prediction


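The function above serves two call paths: Ray Tune invokes it with a sampled `params` config and consumes the metrics reported through session.report, while ModelSelectionEngine.train_model calls it directly and unpacks the returned (score, model, prediction) tuple. A sketch of the Tune path, reusing the toy data from the previous sketch (illustrative; the search space and model name are assumptions):

    from ray import tune
    from ray.tune import Tuner, TuneConfig, with_parameters
    from lecrapaud.model_selection import trainable

    tuner = Tuner(
        trainable=with_parameters(
            trainable,
            x_train=X[:150], y_train=y[:150], x_val=X[150:], y_val=y[150:],
            model_name="random_forest",  # assumed all_models entry
            target_type="classification",
            session_name="demo",
            target_number=1,
            create_model=lambda **p: RandomForestClassifier(**p),
        ),
        param_space={"n_estimators": tune.randint(50, 300)},
        tune_config=TuneConfig(metric="LOGLOSS", mode="min", num_samples=5),
    )
    results = tuner.fit()  # each trial reports its score dict via session.report
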
class ModelSelectionEngine:

    def __init__(
        self,
        data,
        reshaped_data,
        target_number,
        target_clf,
        dataset,
        models_idx,
        time_series,
        date_column,
        group_column,
        **kwargs,
    ):
        self.data = data
        self.reshaped_data = reshaped_data
        self.target_number = target_number
        self.dataset = dataset
        self.target_clf = target_clf
        self.models_idx = models_idx
        self.time_series = time_series
        self.date_column = date_column
        self.group_column = group_column

        self.target_type = (
            "classification" if self.target_number in self.target_clf else "regression"
        )
        self.dataset_dir = self.dataset.path
        self.dataset_id = self.dataset.id
        self.data_dir = f"{self.dataset_dir}/data"
        self.preprocessing_dir = f"{self.dataset_dir}/preprocessing"
        self.training_target_dir = f"{self.dataset_dir}/TARGET_{self.target_number}"
        self.metric = "RMSE" if self.target_type == "regression" else "LOGLOSS"
        self.features = self.dataset.get_features(self.target_number)
        self.all_features = self.dataset.get_all_features(
            date_column=self.date_column, group_column=self.group_column
        )

    # Main training function
    def run(
        self,
        session_name,
        perform_hyperopt=True,
        number_of_trials=20,
        perform_crossval=False,
        plot=True,
        clean_dir=False,  # TODO: unused now that feature_selection lives in the target directory
        preserve_model=True,
    ):
        """
        Selects the best models based on a target variable, optionally performing hyperparameter optimization
        and cross-validation, and manages outputs in a session-specific directory.
        """
        self.session_name = session_name
        self.plot = plot
        self.number_of_trials = number_of_trials

        if self.dataset_id is None:
            raise ValueError("Please provide a dataset.")

        if self.data:
            self.train = self.data["train"]
            self.val = self.data["val"]
            self.test = self.data["test"]
            self.train_scaled = self.data["train_scaled"]
            self.val_scaled = self.data["val_scaled"]
            self.test_scaled = self.data["test_scaled"]
        else:
            (
                self.train,
                self.val,
                self.test,
                self.train_scaled,
                self.val_scaled,
                self.test_scaled,
            ) = load_train_data(self.dataset_dir, self.target_number, self.target_clf)

        if (
            any(all_models[i].get("recurrent") for i in self.models_idx)
            and not self.time_series
        ):
            raise ValueError(
                "You need to set time_series to true to use recurrent models, or remove recurrent models from the chosen models_idx"
            )

        if (
            any(all_models[i].get("recurrent") for i in self.models_idx)
            and self.time_series
        ):
            if self.reshaped_data is None:
                raise ValueError("reshaped_data is not provided.")

            logger.info("Loading reshaped data...")
            self.x_train_reshaped = self.reshaped_data["x_train_reshaped"]
            self.y_train_reshaped = self.reshaped_data["y_train_reshaped"]
            self.x_val_reshaped = self.reshaped_data["x_val_reshaped"]
            self.y_val_reshaped = self.reshaped_data["y_val_reshaped"]

        # create model selection in db
        target = Target.find_by(name=f"TARGET_{self.target_number}")
        model_selection = ModelSelection.upsert(
            match_fields=["target_id", "dataset_id"],
            target_id=target.id,
            dataset_id=self.dataset_id,
        )

        # recurrent models start at index 9  # len(list_models)
        for i in self.models_idx:
            config = all_models[i]
            recurrent = config["recurrent"]
            need_scaling = config["need_scaling"]
            model_name = config["model_name"]

            if recurrent is False and config[self.target_type] is None:
                continue  # for naive Bayes models that cannot be used in regression

            self.results_dir = f"{self.training_target_dir}/{model_name}"
            if not os.path.exists(f"{self.results_dir}"):
                os.makedirs(f"{self.results_dir}")
            elif preserve_model and contains_best(self.results_dir):
                continue
            elif perform_hyperopt:
                clean_directory(self.results_dir)

            logger.info(f"Training a {model_name}")
            model = Model.upsert(
                match_fields=["name", "type"],
                name=model_name,
                type=self.target_type,
            )
            model_training = ModelTraining.upsert(
                match_fields=["model_id", "model_selection_id"],
                model_id=model.id,
                model_selection_id=model_selection.id,
            )

            # getting data
            if recurrent:
                # Clear cluster from previous Keras session graphs.
                K.clear_session()

                features_idx = [
                    i
                    for i, e in enumerate(self.all_features)
                    if e in set(self.features)
                ]
                # TODO: verify that features_idx are the right ones, because scaling can re-arrange columns...
                self.x_train = self.x_train_reshaped[:, :, features_idx]
                self.y_train = self.y_train_reshaped[:, [self.target_number, 0]]
                self.x_val = self.x_val_reshaped[:, :, features_idx]
                self.y_val = self.y_val_reshaped[:, [self.target_number, 0]]
            else:
                config = config[self.target_type]

                if need_scaling and self.target_type == "regression":
                    self.x_train = self.train_scaled[self.features]
                    self.y_train = self.train_scaled[
                        f"TARGET_{self.target_number}"
                    ].rename("TARGET")
                    self.x_val = self.val_scaled[self.features]
                    self.y_val = self.val_scaled[f"TARGET_{self.target_number}"].rename(
                        "TARGET"
                    )
                    self.x_test = self.test_scaled[self.features]
                    self.y_test = self.test_scaled[
                        f"TARGET_{self.target_number}"
                    ].rename("TARGET")
                else:
                    self.x_train = self.train[self.features]
                    self.y_train = self.train[f"TARGET_{self.target_number}"].rename(
                        "TARGET"
                    )
                    self.x_val = self.val[self.features]
                    self.y_val = self.val[f"TARGET_{self.target_number}"].rename(
                        "TARGET"
                    )
                    self.x_test = self.test[self.features]
                    self.y_test = self.test[f"TARGET_{self.target_number}"].rename(
                        "TARGET"
                    )

            log_dir = get_log_dir(self.training_target_dir, model_name)
            # instantiate model
            # (recurrent and need_scaling are derived from the all_models config inside ModelEngine)
            model = ModelEngine(
                model_name=model_name,
                search_params=config["search_params"],
                target_type=self.target_type,
                create_model=config["create_model"],
                plot=self.plot,
                log_dir=log_dir,
            )

            start = time.time()
            # Tuning hyperparameters
            if perform_hyperopt:
                best_params = self.hyperoptimize(model)

                # save best params
                best_params_file = f"{self.training_target_dir}/best_params.json"
                try:
                    with open(best_params_file, "r") as f:
                        json_dict = json.load(f)
                except FileNotFoundError:
                    json_dict = {}

                json_dict[model.model_name] = serialize_for_json(best_params)
                with open(best_params_file, "w") as f:
                    json.dump(json_dict, f, indent=4)
            else:
                try:
                    with open(f"{self.training_target_dir}/best_params.json") as f:
                        json_dict = json.load(f)
                        best_params = json_dict[model_name]
                except Exception:
                    raise FileNotFoundError(
                        f"Could not find {model_name} in current data. Try running a hyperparameter optimization by setting `perform_hyperopt` to true"
                    )

            # Perform cross-validation of the best model on k-folds of train + val set
            if perform_crossval:
                x_train_val = pd.concat([self.x_train, self.x_val, self.x_test], axis=0)
                y_train_val = pd.concat([self.y_train, self.y_val, self.y_test], axis=0)
                n_splits = 4
                n_samples = len(x_train_val)
                test_size = int(n_samples / (n_splits + 4))
                tscv = TimeSeriesSplit(n_splits=n_splits, test_size=test_size)

                # Store the scores
                cross_validation_scores = []

                for i, (train_index, val_index) in enumerate(tscv.split(x_train_val)):
                    self.type_name = f"crossval_fold_{i}"

                    if self.time_series:
                        date_series = self.train[self.date_column].copy()

                        if need_scaling:
                            date_series = date_series.map(pd.Timestamp.fromordinal)

                        # Use the actual train/val indices to extract the date ranges
                        train_start = date_series.iloc[train_index[0]]
                        train_end = date_series.iloc[train_index[-1]]
                        val_start = date_series.iloc[val_index[0]]
                        val_end = date_series.iloc[val_index[-1]]

                        logger.info(
                            f"[Fold {i}] Train: {len(train_index)} samples from {train_start.date()} to {train_end.date()} | "
                            f"Validation: {len(val_index)} samples from {val_start.date()} to {val_end.date()}"
                        )
                    else:
                        logger.info(
                            f"[Fold {i}] Train: {len(train_index)} samples | Validation: {len(val_index)} samples"
                        )

                    # Train the model and get the score
                    if recurrent:
                        cross_validation_score, _, _ = self.train_model(
                            params=best_params,
                            x_train=x_train_val[train_index],
                            y_train=y_train_val[train_index],
                            x_val=x_train_val[val_index],
                            y_val=y_train_val[val_index],
                            model=model,
                        )
                    else:
                        cross_validation_score, _, _ = self.train_model(
                            params=best_params,
                            x_train=x_train_val.iloc[train_index],
                            y_train=y_train_val.iloc[train_index],
                            x_val=x_train_val.iloc[val_index],
                            y_val=y_train_val.iloc[val_index],
                            model=model,
                        )

                    # Append score to the list
                    cross_validation_scores.append(cross_validation_score)

                # Calculate and log the mean score
                cross_validation_mean_score = pd.DataFrame(cross_validation_scores)[
                    self.metric
                ].mean()
                logger.info(
                    f"Best model mean cross-validation score on entire dataset: {cross_validation_mean_score}"
                )

                # Retrain on the entire training set, but keep the score from the cross-validation folds
                best_score, best_model, best_pred = self.train_model(
                    params=best_params,
                    x_train=pd.concat([self.x_train, self.x_val], axis=0),
                    y_train=pd.concat([self.y_train, self.y_val], axis=0),
                    x_val=self.x_test,
                    y_val=self.y_test,
                    model=model,
                )
                best_score[self.metric] = cross_validation_mean_score
            else:
                # Evaluate on validation set
                self.type_name = "validation"
                best_score, best_model, best_pred = self.train_model(
                    params=best_params,
                    x_train=pd.concat([self.x_train, self.x_val], axis=0),
                    y_train=pd.concat([self.y_train, self.y_val], axis=0),
                    x_val=self.x_test,
                    y_val=self.y_test,
                    model=model,
                )

            logger.info(f"Best model scores on test set: {best_score}")

            # Save validation predictions
            best_pred.to_csv(
                f"{self.results_dir}/pred_val.csv",
                index=True,
                header=True,
                index_label="ID",
            )

            # Save best model
            model_path = best_model.save(self.results_dir)

            model_path = Path(model_path).resolve()
            best_score["MODEL_PATH"] = model_path

            # Track scores
            scores_tracking_path = f"{self.training_target_dir}/scores_tracking.csv"
            best_score_df = pd.DataFrame([best_score])

            if os.path.exists(scores_tracking_path):
                existing_scores = pd.read_csv(scores_tracking_path)
                common_cols = existing_scores.columns.intersection(
                    best_score_df.columns
                )
                best_score_df = best_score_df[common_cols]
                scores_tracking = pd.concat(
                    [existing_scores, best_score_df], ignore_index=True
                )
            else:
                scores_tracking = best_score_df

            scores_tracking.sort_values(self.metric, ascending=True, inplace=True)
            scores_tracking.to_csv(scores_tracking_path, index=False)

            # Save model training metadata
            stop = time.time()
            training_time = stop - start
            model_training.best_params = best_params
            model_training.model_path = model_path
            model_training.training_time = training_time
            model_training.save()

            # Store metrics in DB
            drop_cols = [
                "DATE",
                "SESSION",
                "TRAIN_DATA",
                "VAL_DATA",
                "FEATURES",
                "MODEL_NAME",
                "MODEL_PATH",
            ]
            best_score = {k: v for k, v in best_score.items() if k not in drop_cols}
            score_data = {k.lower(): v for k, v in best_score.items()}

            Score.upsert(
                match_fields=["model_training_id"],
                model_training_id=model_training.id,
                **score_data,
            )

            logger.info(f"Model training finished in {training_time:.2f} seconds")

        # find the best model type
        scores_tracking_path = f"{self.training_target_dir}/scores_tracking.csv"
        scores_tracking = pd.read_csv(scores_tracking_path)
        best_score_overall = scores_tracking.iloc[0, :]
        best_model_name = best_score_overall["MODEL_NAME"]

        # Remove any .best or .keras files
        for file_path in glob.glob(
            os.path.join(self.training_target_dir, "*.best")
        ) + glob.glob(os.path.join(self.training_target_dir, "*.keras")):
            os.remove(file_path)
        # Copy the best model to the root training folder for this target
        best_model_path = Path(
            f"{self.training_target_dir}/{os.path.basename(best_score_overall['MODEL_PATH'])}"
        ).resolve()
        copy_any(
            best_score_overall["MODEL_PATH"],
            best_model_path,
        )

        with open(f"{self.training_target_dir}/best_params.json", "r") as f:
            best_model_params = json.load(f)[best_model_name]

        # save model_selection results to db
        model_selection = ModelSelection.get(model_selection.id)
        model_selection.best_model_id = Model.find_by(
            name=best_score_overall["MODEL_NAME"], type=self.target_type
        ).id
        model_selection.best_model_params = best_model_params
        model_selection.best_model_path = best_model_path
        model_selection.save()

        logger.info(f"Best model overall is: {best_score_overall}")

    def hyperoptimize(self, model: ModelEngine):
        self.type_name = "hyperopts"

        def collect_error_logs(training_target_dir: str, storage_path: str):
            output_error_file = f"{training_target_dir}/errors.log"

            with open(output_error_file, "a") as outfile:
                # Walk through the ray_results directory
                for root, dirs, files in os.walk(storage_path):
                    # Check if 'error.txt' exists in the current directory
                    if "error.txt" in files:
                        error_file_path = os.path.join(root, "error.txt")
                        logger.info(f"Processing error file: {error_file_path}")
                        # Read and append the content of the error.txt file
                        with open(error_file_path, "r") as infile:
                            outfile.write(f"\n\n=== Error from {error_file_path} ===\n")
                            outfile.write(infile.read())
            logger.info(f"All errors written to {output_error_file}")

        logger.info("Start tuning hyperparameters...")

        storage_path = f"{self.results_dir}/ray_results"

        tuner = Tuner(
            trainable=with_parameters(
                trainable,
                x_train=self.x_train,
                y_train=self.y_train,
                x_val=self.x_val,
                y_val=self.y_val,
                model_name=model.model_name,
                target_type=self.target_type,
                session_name=self.session_name,
                target_number=self.target_number,
                create_model=model.create_model,
                type_name="hyperopts",
                plot=model.plot,
            ),
            param_space=model.search_params,
            tune_config=TuneConfig(
                metric=self.metric,
                mode="min",
                search_alg=HyperOptSearch(),
                num_samples=self.number_of_trials,
                scheduler=ASHAScheduler(max_t=100, grace_period=10),
            ),
            run_config=RunConfig(
                stop={"training_iteration": 100},
                storage_path=storage_path,
                callbacks=[TBXLoggerCallback()],
            ),
        )
        try:
            results = tuner.fit()

            # both RMSE and LOGLOSS are minimized, so take the min here as well
            best_result = results.get_best_result(self.metric, "min")
            best_params = best_result.config
            best_score = best_result.metrics

            # log results
            logger.info(f"Best hyperparameters found were:\n{best_params}")
            logger.info(f"Best scores found were:\n{best_score}")
            logger.info(
                f"Markdown table with all trials:\n{results.get_dataframe().to_markdown()}"
            )
            # Collect errors into a single file
            collect_error_logs(
                training_target_dir=self.training_target_dir, storage_path=storage_path
            )

        finally:
            ray.shutdown()

        return best_params

    def train_model(self, params, x_train, y_train, x_val, y_val, model: ModelEngine):
        # Use the standalone training function to avoid duplication.
        # Here the data is passed directly (not as Ray object references).
        return trainable(
            params,
            x_train,
            y_train,
            x_val,
            y_val,
            model.model_name,
            self.target_type,
            self.session_name,
            self.target_number,
            model.create_model,
            self.type_name,
            model.plot,
        )


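A sketch of driving a full selection run with the class above (illustrative: the Dataset record, its lookup, and the constructor values are hypothetical and depend on data prepared by the feature-selection step):

    from lecrapaud.db import Dataset
    from lecrapaud.model_selection import ModelSelectionEngine

    dataset = Dataset.find_by(name="my_dataset")  # hypothetical existing record
    selection = ModelSelectionEngine(
        data=None,           # None -> data is loaded via load_train_data(dataset.path, ...)
        reshaped_data=None,  # only needed for recurrent models with time_series=True
        target_number=1,
        target_clf=[1],      # TARGET_1 is treated as classification
        dataset=dataset,
        models_idx=[0, 1],   # indices into all_models
        time_series=False,
        date_column=None,
        group_column=None,
    )
    selection.run("demo_session", perform_hyperopt=True, number_of_trials=10)
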
def evaluate(prediction: pd.DataFrame, target_type: str):
    """
    Evaluate model performance.

    Args:
        - prediction: the prediction dataframe containing TARGET and PRED columns, as well as predicted probabilities for each class for classification tasks
        - target_type: classification or regression
    """
    score = {}
    y_true = prediction["TARGET"]
    y_pred = prediction["PRED"]

    if target_type == "regression":
        # Main metrics
        score["RMSE"] = root_mean_squared_error(y_true, y_pred)
        score["MAE"] = mean_absolute_error(y_true, y_pred)
        score["MAPE"] = mean_absolute_percentage_error(y_true, y_pred)
        score["R2"] = r2_score(y_true, y_pred)

        # Robustness: avoid division by zero
        std_target = y_true.std()
        mean_target = y_true.mean()
        median_target = y_true.median()

        # RMSE / STD
        score["RMSE_STD_RATIO"] = (
            float(100 * score["RMSE"] / std_target) if std_target else 1000
        )

        # Median absolute deviation (MAD)
        mam = (y_true - mean_target).abs().median()  # median abs deviation around the mean
        mad = (y_true - median_target).abs().median()  # median abs deviation around the median
        score["MAM"] = mam
        score["MAD"] = mad
        score["MAE_MAM_RATIO"] = (
            float(100 * score["MAE"] / mam) if mam else 1000
        )  # MAE relative to the median absolute deviation around the mean
        score["MAE_MAD_RATIO"] = (
            float(100 * score["MAE"] / mad) if mad else 1000
        )  # MAE relative to the MAD; more stable and less sensitive to outliers

    else:

        labels = np.unique(y_true)
        num_classes = labels.size
        y_pred_proba = (
            prediction[1] if num_classes == 2 else prediction.iloc[:, 2:].values
        )
        if num_classes > 2:
            lb = LabelBinarizer(sparse_output=False)  # change to True for a sparse matrix
            lb.fit(labels)
            y_true_onehot = lb.transform(y_true)
            y_pred_onehot = lb.transform(y_pred)

        score["LOGLOSS"] = log_loss(y_true, y_pred_proba)
        score["ACCURACY"] = accuracy_score(y_true, y_pred)
        score["PRECISION"] = precision_score(
            y_true,
            y_pred,
            average=("binary" if num_classes == 2 else "macro"),
        )
        score["RECALL"] = recall_score(
            y_true,
            y_pred,
            average=("binary" if num_classes == 2 else "macro"),
        )
        score["F1"] = f1_score(
            y_true,
            y_pred,
            average=("binary" if num_classes == 2 else "macro"),
        )
        score["ROC_AUC"] = float(roc_auc_score(y_true, y_pred_proba, multi_class="ovr"))
        (
            score["THRESHOLD"],
            score["PRECISION_AT_THRESHOLD"],
            score["RECALL_AT_THRESHOLD"],
        ) = (
            find_best_precision_threshold(prediction)
            if num_classes == 2
            else (None, None, None)
        )
    return score

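A quick worked example for the regression branch of evaluate (illustrative): with the four points below the RMSE is sqrt((0.01 + 0.01 + 0.04 + 0.04) / 4) ~ 0.158, and RMSE_STD_RATIO expresses that as a percentage of the target's standard deviation.

    import pandas as pd
    from lecrapaud.model_selection import evaluate

    toy = pd.DataFrame({"TARGET": [1.0, 2.0, 3.0, 4.0], "PRED": [1.1, 1.9, 3.2, 3.8]})
    toy_score = evaluate(toy, "regression")
    print(toy_score["RMSE"])            # ~0.158
    print(toy_score["RMSE_STD_RATIO"])  # ~12.2 (percent of the target std, ~1.29)
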
# utils
def get_log_dir(training_target_dir: str, model_name="test_model"):
    """Generates a structured log directory path for TensorBoard."""
    timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M")
    log_dir = (
        Path(training_target_dir + "/tensorboard") / model_name / f"run_{timestamp}"
    )
    log_dir.mkdir(parents=True, exist_ok=True)  # create directories if they don't exist
    return str(log_dir)


def print_scores(training_target_dir: str):
    """
    Monitor scores.
    """
    scores_tracking = pd.read_csv(f"{training_target_dir}/scores_tracking.csv")
    return scores_tracking

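The directories produced by get_log_dir are ordinary TensorBoard run folders, so a training run can be inspected with the standard CLI (a sketch; the dataset path is a placeholder):

    from lecrapaud.model_selection import get_log_dir

    log_dir = get_log_dir("dataset/TARGET_1", model_name="lgb")
    # then, from a shell: tensorboard --logdir dataset/TARGET_1/tensorboard
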
1320
|
+
# plots
|
|
1321
|
+
def plot_evaluation_for_classification(prediction: dict):
|
|
1322
|
+
"""
|
|
1323
|
+
Args
|
|
1324
|
+
prediction (pd.DataFrame): Should be a df with TARGET, PRED, 0, 1 columns for y_true value (TARGET), y_pred (PRED), and probabilities (for classification only : 0 and 1)
|
|
1325
|
+
"""
|
|
1326
|
+
y_true = prediction["TARGET"]
|
|
1327
|
+
y_pred = prediction["PRED"]
|
|
1328
|
+
y_pred_proba = prediction[1] if 1 in prediction.columns else prediction["1"]
|
|
1329
|
+
|
|
1330
|
+
# Plot confusion matrix
|
|
1331
|
+
plot_confusion_matrix(y_true, y_pred)
|
|
1332
|
+
|
|
1333
|
+
# Compute ROC curve and ROC area
|
|
1334
|
+
fpr, tpr, thresholds = roc_curve(y_true, y_pred_proba)
|
|
1335
|
+
roc_auc = auc(fpr, tpr)
|
|
1336
|
+
|
|
1337
|
+
plt.figure(figsize=(8, 8))
|
|
1338
|
+
plt.plot(
|
|
1339
|
+
fpr, tpr, color="darkorange", lw=2, label="ROC curve (area = %0.2f)" % roc_auc
|
|
1340
|
+
)
|
|
1341
|
+
plt.plot([0, 1], [0, 1], color="navy", lw=2, linestyle="--")
|
|
1342
|
+
plt.xlim([0.0, 1.0])
|
|
1343
|
+
plt.ylim([0.0, 1.05])
|
|
1344
|
+
plt.xlabel("False Positive Rate")
|
|
1345
|
+
plt.ylabel("True Positive Rate")
|
|
1346
|
+
plt.title("ROC Curve")
|
|
1347
|
+
plt.legend(loc="lower right")
|
|
1348
|
+
plt.show()
|
|
1349
|
+
|
|
1350
|
+
# Compute precision-recall curve
|
|
1351
|
+
precision, recall, _ = precision_recall_curve(y_true, y_pred_proba)
|
|
1352
|
+
average_precision = average_precision_score(y_true, y_pred_proba)
|
|
1353
|
+
|
|
1354
|
+
plt.figure(figsize=(8, 8))
|
|
1355
|
+
plt.step(recall, precision, color="b", alpha=0.2, where="post")
|
|
1356
|
+
plt.fill_between(recall, precision, step="post", alpha=0.2, color="b")
|
|
1357
|
+
plt.xlabel("Recall")
|
|
1358
|
+
plt.ylabel("Precision")
|
|
1359
|
+
plt.ylim([0.0, 1.05])
|
|
1360
|
+
plt.xlim([0.0, 1.0])
|
|
1361
|
+
plt.title("Precision-Recall Curve: AP={0:0.2f}".format(average_precision))
|
|
1362
|
+
plt.show()
|
|
1363
|
+
|
|
1364
|
+
|
|
1365
|
+
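A minimal smoke test for the plotting helper above; the column layout (TARGET, PRED, probability column 1) is the only contract it relies on. Toy data, purely illustrative:

import pandas as pd

toy = pd.DataFrame(
    {
        "TARGET": [0, 1, 1, 0, 1],
        "PRED": [0, 1, 0, 0, 1],
        1: [0.1, 0.8, 0.45, 0.3, 0.9],
    }
)
plot_evaluation_for_classification(toy)  # draws confusion matrix, ROC and PR curves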
+def plot_confusion_matrix(y_true, y_pred):
+    unique_labels = np.unique(np.concatenate((y_true, y_pred)))
+    cm = confusion_matrix(y_true, y_pred)
+
+    labels = np.sort(unique_labels)  # Sort labels in numerical order
+
+    plt.figure(figsize=(10, 7))
+    sns.heatmap(cm, annot=True, fmt="d", cmap="viridis")
+    plt.xlabel("Predicted", fontsize=12)
+    plt.ylabel("True", fontsize=12)
+    plt.title("Confusion Matrix", fontsize=14)
+
+    # Offset ticks by 0.5 so each label sits at the centre of its heatmap cell
+    plt.xticks(ticks=np.arange(len(labels)) + 0.5, labels=labels, fontsize=10)
+    plt.yticks(ticks=np.arange(len(labels)) + 0.5, labels=labels, fontsize=10)
+
+    plt.show()
+
+
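The helper accepts any label-encoded arrays directly, e.g.:

plot_confusion_matrix([0, 1, 1, 0], [0, 1, 0, 0])  # 2x2 matrix for a binary task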
+# thresholds
+def find_max_f1_threshold(prediction):
+    """
+    Finds the threshold that maximizes the F1 score for a binary classification task.
+
+    Parameters:
+    - prediction: DataFrame with 'TARGET' and 1 (predicted probabilities) columns
+
+    Returns:
+    - best_threshold: The threshold that maximizes the F1 score
+    - best_precision: The precision at that threshold
+    - best_recall: The recall at that threshold
+    """
+    y_true = prediction["TARGET"]
+    y_pred_proba = prediction[1] if 1 in prediction.columns else prediction["1"]
+
+    # Compute precision, recall, and thresholds
+    precision, recall, thresholds = precision_recall_curve(y_true, y_pred_proba)
+
+    # Drop the padded final point (precision=1, recall=0), which has no threshold,
+    # so that element i aligns with thresholds[i]
+    precision = precision[:-1]
+    recall = recall[:-1]
+
+    # Filter out trivial cases (precision or recall = 0)
+    valid = (precision > 0) & (recall > 0)
+    if not np.any(valid):
+        raise ValueError("No valid threshold with non-zero precision and recall")
+
+    precision = precision[valid]
+    recall = recall[valid]
+    thresholds = thresholds[valid]
+
+    # Compute F1 scores for each threshold
+    f1_scores = 2 * (precision * recall) / (precision + recall + 1e-10)
+
+    best_index = np.argmax(f1_scores)
+
+    best_threshold = thresholds[best_index]
+    best_precision = precision[best_index]
+    best_recall = recall[best_index]
+
+    return best_threshold, best_precision, best_recall
+
+
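All of the threshold helpers in this block lean on the same scikit-learn convention, worth making explicit: `precision_recall_curve` returns precision/recall arrays of length `len(thresholds) + 1`, where element i corresponds to `thresholds[i]` and the final (precision=1, recall=0) point has no threshold. A quick check:

import numpy as np
from sklearn.metrics import precision_recall_curve

y_true = np.array([0, 0, 1, 1])
y_proba = np.array([0.1, 0.4, 0.35, 0.8])
precision, recall, thresholds = precision_recall_curve(y_true, y_proba)
assert len(precision) == len(recall) == len(thresholds) + 1
assert precision[-1] == 1.0 and recall[-1] == 0.0  # padded final point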
+def find_best_f1_threshold(prediction, fscore_target: float):
+    """
+    Finds the highest threshold achieving at least the given F1 score target.
+
+    Parameters:
+    - prediction: DataFrame with 'TARGET' and 1 (or '1') probability columns
+    - fscore_target: Desired minimum F1 score (between 0 and 1)
+
+    Returns:
+    - best_threshold: The highest threshold meeting the F1 target
+    - best_precision: Precision at that threshold
+    - best_recall: Recall at that threshold
+    - best_f1: Actual F1 score at that threshold
+    """
+    y_true = prediction["TARGET"]
+    y_pred_proba = prediction[1] if 1 in prediction.columns else prediction["1"]
+
+    precision, recall, thresholds = precision_recall_curve(y_true, y_pred_proba)
+
+    # Align precision/recall with thresholds by dropping the padded final point
+    precision = precision[:-1]
+    recall = recall[:-1]
+    f1_scores = 2 * (precision * recall) / (precision + recall + 1e-10)
+
+    # Filter for thresholds meeting the F1 target
+    valid_indices = [i for i, f1 in enumerate(f1_scores) if f1 >= fscore_target]
+
+    if not valid_indices:
+        raise ValueError(f"Could not find a threshold with F1 >= {fscore_target:.2f}")
+
+    # Pick the highest threshold among valid ones (thresholds are increasing)
+    best_index = valid_indices[-1]
+
+    return (
+        thresholds[best_index],
+        precision[best_index],
+        recall[best_index],
+        f1_scores[best_index],
+    )
+
+
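Hypothetical usage on a validation frame (toy data; in the pipeline the frame would come from a model's predictions):

import pandas as pd

val_prediction = pd.DataFrame(
    {"TARGET": [0, 1, 1, 0, 1, 0], 1: [0.2, 0.7, 0.6, 0.4, 0.9, 0.1]}
)
thr, prec, rec, f1 = find_best_f1_threshold(val_prediction, fscore_target=0.6)
print(f"threshold={thr:.2f} precision={prec:.2f} recall={rec:.2f} f1={f1:.2f}")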
+def find_max_precision_threshold_without_trivial_case(prediction: pd.DataFrame):
+    """
+    Finds the threshold that maximizes precision, excluding the trivial precision
+    of 1.0 that can occur when (almost) everything is predicted as the negative
+    class (0).
+
+    Parameters:
+    - prediction: DataFrame with 'TARGET' (true labels) and 1 (predicted probabilities) columns
+
+    Returns:
+    - best_threshold: the probability threshold that maximizes precision
+    - best_precision: the precision achieved at this threshold
+    - best_recall: the recall achieved at this threshold
+    """
+    y_true = prediction["TARGET"]
+    y_pred_proba = prediction[1] if 1 in prediction.columns else prediction["1"]
+
+    # Compute precision, recall, and thresholds
+    precision, recall, thresholds = precision_recall_curve(y_true, y_pred_proba)
+
+    # Drop the padded final point of precision and recall to align with thresholds
+    precision = precision[:-1]
+    recall = recall[:-1]
+
+    # Filter out precision == 1.0 (which may correspond to predicting only 0s)
+    valid_indices = np.where(precision < 1.0)[0]
+    if len(valid_indices) == 0:
+        raise ValueError("No valid precision values less than 1.0")
+
+    precision = precision[valid_indices]
+    recall = recall[valid_indices]
+    thresholds = thresholds[valid_indices]
+
+    # Find the index of the maximum precision
+    best_index = np.argmax(precision)
+
+    # Return the corresponding threshold, precision, and recall
+    best_threshold = thresholds[best_index]
+    best_precision = precision[best_index]
+    best_recall = recall[best_index]
+
+    return best_threshold, best_precision, best_recall
+
+
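The difference from `find_best_precision_threshold` below: this helper maximizes precision outright (after excluding the trivial 1.0 plateau), while the next one takes a target and keeps the most conservative threshold that still meets it. A toy call:

import pandas as pd

val_prediction = pd.DataFrame(
    {"TARGET": [0, 1, 1, 0, 1, 0], 1: [0.2, 0.7, 0.6, 0.4, 0.9, 0.1]}
)
thr, prec, rec = find_max_precision_threshold_without_trivial_case(val_prediction)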
+def find_best_precision_threshold(prediction, precision_target: float = 0.80):
+    """
+    Finds the highest threshold that achieves at least the given precision target.
+
+    Parameters:
+        prediction (pd.DataFrame): DataFrame with columns 'TARGET' and 1 (or '1') for predicted probabilities
+        precision_target (float): Desired minimum precision (between 0 and 1)
+
+    Returns:
+        tuple: (threshold, precision, recall) achieving the desired precision
+    """
+    y_true = prediction["TARGET"]
+    y_pred_proba = prediction[1] if 1 in prediction.columns else prediction["1"]
+
+    precision, recall, thresholds = precision_recall_curve(y_true, y_pred_proba)
+
+    # Align lengths: precision/recall carry one extra trailing point with no threshold
+    precision = precision[:-1]
+    recall = recall[:-1]
+
+    valid_indices = [i for i, p in enumerate(precision) if p >= precision_target]
+
+    if not valid_indices:
+        raise ValueError(
+            f"Could not find a threshold with precision >= {precision_target}"
+        )
+
+    best_idx = valid_indices[-1]  # Highest threshold with precision >= target
+
+    return thresholds[best_idx], precision[best_idx], recall[best_idx]
+
+
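A hypothetical call, matching how the scoring block earlier in this file uses it (default precision target of 0.80):

import pandas as pd

val_prediction = pd.DataFrame(
    {"TARGET": [0, 1, 1, 0, 1, 0], 1: [0.2, 0.7, 0.6, 0.4, 0.9, 0.1]}
)
thr, prec, rec = find_best_precision_threshold(val_prediction, precision_target=0.80)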
+def find_best_recall_threshold(prediction, recall_target: float = 0.98):
+    """
+    Finds the highest threshold that achieves at least the given recall target.
+
+    Parameters:
+        prediction (pd.DataFrame): DataFrame with columns 'TARGET' and 1 (or '1') for predicted probabilities
+        recall_target (float): Desired minimum recall (between 0 and 1)
+
+    Returns:
+        tuple: (threshold, precision, recall) achieving the desired recall, or
+        (None, None, None) if the target is not reachable
+    """
+    y_true = prediction["TARGET"]
+    y_pred_proba = prediction[1] if 1 in prediction.columns else prediction["1"]
+
+    precision, recall, thresholds = precision_recall_curve(y_true, y_pred_proba)
+
+    # `thresholds` has one element fewer than precision and recall
+    recall = recall[:-1]  # Drop the padded final point to align with thresholds
+    precision = precision[:-1]
+
+    valid_indices = [i for i, r in enumerate(recall) if r >= recall_target]
+
+    if not valid_indices:
+        logger.warning(f"Could not find a threshold with recall >= {recall_target}")
+        return None, None, None
+
+    best_idx = valid_indices[-1]  # Highest threshold with recall >= target
+
+    return thresholds[best_idx], precision[best_idx], recall[best_idx]
+
+
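Same shape of call for the recall variant; note it degrades to (None, None, None) with a warning instead of raising:

import pandas as pd

val_prediction = pd.DataFrame(
    {"TARGET": [0, 1, 1, 0, 1, 0], 1: [0.2, 0.7, 0.6, 0.4, 0.9, 0.1]}
)
thr, prec, rec = find_best_recall_threshold(val_prediction, recall_target=0.98)
if thr is None:
    print("recall target unreachable on this data")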
+def plot_threshold(prediction, threshold, precision, recall):
+    y_pred_proba = prediction[1] if 1 in prediction.columns else prediction["1"]
+    y_true = prediction["TARGET"]
+
+    predicted_positive = (y_pred_proba >= threshold).sum()
+    predicted_negative = (y_pred_proba < threshold).sum()
+    f1 = 2 * (precision * recall) / (precision + recall + 1e-10)  # scalar F1 at this threshold
+    per_predicted_positive = predicted_positive / len(y_pred_proba)
+    per_predicted_negative = predicted_negative / len(y_pred_proba)
+
+    logger.info(
+        f"""Threshold: {threshold*100:.2f}
+Precision: {precision*100:.2f}
+Recall: {recall*100:.2f}
+F1-score: {f1*100:.2f}
+% of scores over {threshold}: {predicted_positive}/{len(y_pred_proba)} = {per_predicted_positive*100:.2f}%
+% of scores under {threshold}: {predicted_negative}/{len(y_pred_proba)} = {per_predicted_negative*100:.2f}%"""
+    )
+
+    # Visualizing the scores of positive and negative classes
+    plt.figure(figsize=(10, 6))
+    sns.histplot(
+        y_pred_proba[y_true == 1],
+        color="blue",
+        label="Positive Class",
+        bins=30,
+        kde=True,
+        alpha=0.6,
+    )
+    sns.histplot(
+        y_pred_proba[y_true == 0],
+        color="red",
+        label="Negative Class",
+        bins=30,
+        kde=True,
+        alpha=0.6,
+    )
+    plt.axvline(
+        x=threshold,
+        color="green",
+        linestyle="--",
+        label=f"Threshold at {round(threshold, 3)}",
+    )
+    plt.title("Distribution of Predicted Probabilities")
+    plt.xlabel("Predicted Probabilities")
+    plt.ylabel("Frequency")
+    plt.legend()
+    plt.show()
+    return threshold
+
+
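A sketch chaining the pieces: pick a threshold with one of the helpers above, then visualize where it cuts the two probability distributions (toy data again):

import pandas as pd

val_prediction = pd.DataFrame(
    {"TARGET": [0, 1, 1, 0, 1, 0], 1: [0.2, 0.7, 0.6, 0.4, 0.9, 0.1]}
)
thr, prec, rec = find_best_precision_threshold(val_prediction, precision_target=0.80)
plot_threshold(val_prediction, thr, prec, rec)  # logs the stats and plots the histograms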
+# OLD - to sort out
+def get_pred_distribution(training_target_dir: str, model_name="linear"):
+    """
+    Look at prediction distributions.
+    """
+    prediction = pd.read_csv(
+        f"{training_target_dir}/{model_name}/pred_val.csv",
+        index_col="ID",
+    )
+    return prediction.describe()
+
+
+def plot_feature_importance(training_target_dir: str, model_name="linear"):
+    """
+    Monitor the feature importance ranking to filter out irrelevant features.
+    """
+    model = joblib.load(f"{training_target_dir}/{model_name}/{model_name}.best")
+    if hasattr(model, "feature_importances_"):
+        feature_importances_ = model.feature_importances_.flatten()
+    elif hasattr(model, "feature_importance"):
+        # e.g. LightGBM boosters expose feature_importance() as a method
+        feature_importances_ = model.feature_importance().flatten()
+    elif hasattr(model, "coefs_"):
+        feature_importances_ = np.mean(model.coefs_[0], axis=1).flatten()
+    elif hasattr(model, "coef_"):
+        feature_importances_ = model.coef_.flatten()
+    else:
+        feature_importances_ = []
+
+    sns.barplot(
+        data=feature_importances_,
+        orient="h",
+    )
+
+
+def print_model_estimators(training_target_dir: str, model_name="linear"):
+    """
+    Look at a specific trained model.
+    """
+    model = joblib.load(f"{training_target_dir}/{model_name}/{model_name}.best")
+    for estimator in model.estimators_[:100]:  # cap avoids IndexError with fewer than 100 estimators
+        logger.info(estimator.get_depth())
+
+
+def get_model_info(model):
+    # Keras-style models: parameter count and architecture summary
+    model.count_params()
+    model.summary()
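These legacy helpers assume the training loop persisted each tuned estimator with joblib under `<training_target_dir>/<model_name>/<model_name>.best`. A sketch of inspecting one outside the pipeline (the path is illustrative):

import joblib

model = joblib.load("results/TARGET_1/random_forest/random_forest.best")  # hypothetical path
if hasattr(model, "estimators_"):
    print(len(model.estimators_), "estimators")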
|