lecrapaud-0.4.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of lecrapaud might be problematic.
- lecrapaud/__init__.py +0 -0
- lecrapaud/config.py +16 -0
- lecrapaud/db/__init__.py +0 -0
- lecrapaud/db/alembic/README +1 -0
- lecrapaud/db/alembic/env.py +78 -0
- lecrapaud/db/alembic/script.py.mako +26 -0
- lecrapaud/db/alembic/versions/2025_04_06_1738-7390745388e4_initial_setup.py +295 -0
- lecrapaud/db/alembic/versions/2025_04_06_1755-40cd8d3e798e_unique_constraint_for_data.py +30 -0
- lecrapaud/db/alembic/versions/2025_05_23_1724-2360941fa0bd_longer_string.py +52 -0
- lecrapaud/db/alembic/versions/2025_05_27_1159-b96396dcfaff_add_env_to_trading_tables.py +34 -0
- lecrapaud/db/alembic/versions/2025_05_27_1337-40cbfc215f7c_fix_nb_character_on_portfolio.py +39 -0
- lecrapaud/db/alembic/versions/2025_05_27_1526-3de994115317_to_datetime.py +36 -0
- lecrapaud/db/alembic/versions/2025_05_27_2003-25c227c684f8_add_fees_to_transactions.py +30 -0
- lecrapaud/db/alembic/versions/2025_05_27_2047-6b6f2d38e9bc_double_instead_of_float.py +132 -0
- lecrapaud/db/alembic/versions/2025_05_31_1111-c175e4a36d68_generalise_stock_to_group.py +36 -0
- lecrapaud/db/alembic/versions/2025_05_31_1256-5681095bfc27_create_investment_run_and_portfolio_.py +62 -0
- lecrapaud/db/alembic/versions/2025_05_31_1806-339927587383_add_investment_run_id.py +107 -0
- lecrapaud/db/alembic/versions/2025_05_31_1834-52b809a34371_make_nullablee.py +38 -0
- lecrapaud/db/alembic/versions/2025_05_31_1849-3b8550297e8e_change_date_to_datetime.py +44 -0
- lecrapaud/db/alembic/versions/2025_05_31_1852-e6b8c95d8243_add_date_to_portfolio_history.py +30 -0
- lecrapaud/db/alembic/versions/2025_06_10_1136-db8cdd83563a_addnewsandoptiontodata.py +32 -0
- lecrapaud/db/crud.py +179 -0
- lecrapaud/db/models/__init__.py +11 -0
- lecrapaud/db/models/base.py +6 -0
- lecrapaud/db/models/dataset.py +124 -0
- lecrapaud/db/models/feature.py +46 -0
- lecrapaud/db/models/feature_selection.py +126 -0
- lecrapaud/db/models/feature_selection_rank.py +80 -0
- lecrapaud/db/models/model.py +41 -0
- lecrapaud/db/models/model_selection.py +56 -0
- lecrapaud/db/models/model_training.py +54 -0
- lecrapaud/db/models/score.py +62 -0
- lecrapaud/db/models/target.py +59 -0
- lecrapaud/db/services.py +0 -0
- lecrapaud/db/setup.py +58 -0
- lecrapaud/directory_management.py +28 -0
- lecrapaud/feature_engineering.py +1119 -0
- lecrapaud/feature_selection.py +1229 -0
- lecrapaud/jobs/__init__.py +13 -0
- lecrapaud/jobs/config.py +17 -0
- lecrapaud/jobs/scheduler.py +36 -0
- lecrapaud/jobs/tasks.py +57 -0
- lecrapaud/model_selection.py +1571 -0
- lecrapaud/predictions.py +292 -0
- lecrapaud/search_space.py +844 -0
- lecrapaud/services/__init__.py +0 -0
- lecrapaud/services/embedding_categorical.py +71 -0
- lecrapaud/services/indicators.py +309 -0
- lecrapaud/speed_tests/experiments.py +139 -0
- lecrapaud/speed_tests/test-gpu-bilstm.ipynb +261 -0
- lecrapaud/speed_tests/test-gpu-resnet.ipynb +166 -0
- lecrapaud/speed_tests/test-gpu-transformers.ipynb +254 -0
- lecrapaud/speed_tests/tests.ipynb +145 -0
- lecrapaud/speed_tests/trash.py +37 -0
- lecrapaud/training.py +151 -0
- lecrapaud/utils.py +246 -0
- lecrapaud-0.4.0.dist-info/LICENSE +201 -0
- lecrapaud-0.4.0.dist-info/METADATA +103 -0
- lecrapaud-0.4.0.dist-info/RECORD +60 -0
- lecrapaud-0.4.0.dist-info/WHEEL +4 -0
lecrapaud/model_selection.py
@@ -0,0 +1,1571 @@
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
import time
import os
import json
import warnings
import joblib
import glob
from pathlib import Path

os.environ["COVERAGE_FILE"] = str(Path(".coverage").resolve())

# ML models
from sklearn.model_selection import TimeSeriesSplit
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import (
    mean_absolute_percentage_error,
    root_mean_squared_error,
    mean_absolute_error,
    r2_score,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    log_loss,
    roc_auc_score,
    roc_curve,
    auc,
    precision_recall_curve,
    average_precision_score,
    confusion_matrix,
)
from sklearn.preprocessing import LabelBinarizer
import lightgbm as lgb
import xgboost as xgb

# DL models
import tensorflow as tf
import keras
from keras.callbacks import EarlyStopping, TensorBoard
from keras.metrics import (
    Precision,
    Recall,
    F1Score,
)
from keras.losses import BinaryCrossentropy, CategoricalCrossentropy
from keras.optimizers import Adam

K = tf.keras.backend
from tensorboardX import SummaryWriter

# Optimization
import ray
from ray.tune import Tuner, TuneConfig, with_parameters
from ray.train import RunConfig
from ray.tune.search.hyperopt import HyperOptSearch
from ray.tune.search.bayesopt import BayesOptSearch
from ray.tune.logger import TBXLoggerCallback
from ray.tune.schedulers import ASHAScheduler
from ray.air import session

# Internal library
from src.search_space import ml_models, dl_recurrent_models
from src.directory_management import clean_directory
from src.utils import copy_any, contains_best, logger, serialize_for_json
from src.config import PYTHON_ENV
from src.feature_selection import TARGETS_CLF, DATE_COLUMN, load_train_data
from src.db.models import Model, ModelSelection, ModelTraining, Score, Target, Dataset

# Reproducible result
keras.utils.set_random_seed(42)
np.random.seed(42)
tf.config.experimental.enable_op_determinism()


# test configuration
def test_hardware():
    devices = tf.config.list_physical_devices()
    logger.info("\nDevices: ", devices)

    gpus = tf.config.list_physical_devices("GPU")
    if gpus:
        details = tf.config.experimental.get_device_details(gpus[0])
        logger.info("GPU details: ", details)


# Suppress specific warning messages related to file system monitor
# logging.getLogger("ray").setLevel(logging.CRITICAL)
# logging.getLogger("ray.train").setLevel(logging.CRITICAL)
# logging.getLogger("ray.tune").setLevel(logging.CRITICAL)
# logging.getLogger("ray.autoscaler").setLevel(logging.CRITICAL)
# logging.getLogger("ray.raylet").setLevel(logging.CRITICAL)
# logging.getLogger("ray.monitor").setLevel(logging.CRITICAL)
# logging.getLogger("ray.dashboard").setLevel(logging.CRITICAL)
# logging.getLogger("ray.gcs_server").setLevel(logging.CRITICAL)

warnings.filterwarnings("ignore", category=UserWarning, module="pydantic")


# Metrics
def rmse_tf(y_true, y_pred):
    y_true, y_pred = unscale_tf(y_true, y_pred)
    results = K.sqrt(K.mean(K.square(y_pred - y_true)))
    return results


def mae_tf(y_true, y_pred):
    y_true, y_pred = unscale_tf(y_true, y_pred)
    results = K.mean(K.abs(y_pred - y_true))
    return results


def unscale_tf(y_true, y_pred):
    if _target_type == "regression":
        scale = K.constant(_scaler_y.scale_[0])
        mean = K.constant(_scaler_y.mean_[0])

        y_true = K.mul(y_true, scale)
        y_true = K.bias_add(y_true, mean)

        y_pred = K.mul(y_pred, scale)
        y_pred = K.bias_add(y_pred, mean)
    return y_true, y_pred


def recall_tf(y_true, y_pred):
    y_true = K.ones_like(y_true)
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    all_positives = K.sum(K.round(K.clip(y_true, 0, 1)))

    recall = true_positives / (all_positives + K.epsilon())
    return recall


def precision_tf(y_true, y_pred):
    y_true = K.ones_like(y_true)
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))

    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision


def f1_score_tf(y_true, y_pred):
    precision = precision_tf(y_true, y_pred)
    recall = recall_tf(y_true, y_pred)
    return 2 * ((precision * recall) / (precision + recall + K.epsilon()))


def get_log_dir(training_target_dir: str, model_name="test_model"):
    """Generates a structured log directory path for TensorBoard."""
    timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M")
    log_dir = (
        Path(training_target_dir + "/tensorboard") / model_name / f"run_{timestamp}"
    )
    log_dir.mkdir(parents=True, exist_ok=True)  # Create directories if they don't exist
    return str(log_dir)


# Functions to fit & evaluate models
def fit_sklearn(x_train, y_train, x_val, y_val, create_model, params, config):

    # Create & Compile the model
    model = create_model(**params)

    # Train the model
    logger.info("Fitting the model...")
    logger.info(f"x_train shape: {x_train.shape}, x_val shape: {x_val.shape}")
    logger.info(f"y_train shape: {y_train.shape}, y_val shape: {y_val.shape}")

    model.fit(x_train, y_train)

    if (
        _target_type == "classification"
        and "loss" in model.get_params().keys()
        and "hinge" in model.get_params()["loss"]
    ):
        # This is for SVC models with hinge loss
        # You should use CalibratedClassifierCV when you are working with classifiers that do not natively output well-calibrated probability estimates.
        # TODO: investigate if we should use calibration for random forest, gradient boosting models, and bagging models
        logger.info(
            f"Re-Calibrating {config['model_name']} to get predict probabilities..."
        )
        calibrator = CalibratedClassifierCV(model, cv="prefit", n_jobs=-1)
        model = calibrator.fit(x_train, y_train)

    # set model_name after calibrator
    model.model_name = config["model_name"]

    logger.info(f"Successfully created a {model.model_name} at {datetime.now()}")

    return model


def fit_boosting(x_train, y_train, x_val, y_val, create_model, params, config):
    """
    This is using the LightGBM or XGBoost C++ libraries
    """
    lightGBM = create_model == "lgb"

    # Datasets
    Dataset = lgb.Dataset if lightGBM else xgb.DMatrix
    train_data = Dataset(x_train, label=y_train)
    val_data = Dataset(x_val, label=y_val)

    # Callbacks
    log_dir = get_log_dir(_training_target_dir, create_model)

    # Create a TensorBoardX writer
    writer = SummaryWriter(log_dir)
    evals_result = {}

    # Training
    labels = np.unique(y_train)
    num_class = (
        labels.size if _target_type == "classification" and labels.size > 2 else 1
    )
    logger.info("Fitting the model...")
    logger.info(f"x_train shape: {x_train.shape}, x_val shape: {x_val.shape}")
    logger.info(f"y_train shape: {y_train.shape}, y_val shape: {y_val.shape}")

    if lightGBM:

        def tensorboard_callback(env):
            for i, metric in enumerate(env.evaluation_result_list):
                metric_name, _, metric_value, _ = metric
                writer.add_scalar(
                    f"LightGBM/{metric_name}", metric_value, env.iteration
                )

        loss = (
            "regression"
            if _target_type == "regression"
            else ("binary" if num_class <= 2 else "multiclass")
        )
        eval_metric = (
            "rmse"
            if _target_type == "regression"
            else ("binary_logloss" if num_class <= 2 else "multi_logloss")
        )
        model = lgb.train(
            params={
                **params["model_params"],
                "objective": loss,
                "metric": eval_metric,
                "num_class": num_class,
            },
            num_boost_round=params["num_boost_round"],
            train_set=train_data,
            valid_sets=[train_data, val_data],
            valid_names=["train", "val"],
            callbacks=[
                lgb.early_stopping(stopping_rounds=params["early_stopping_rounds"]),
                lgb.record_evaluation(evals_result),
                tensorboard_callback,
            ],
        )
    else:

        class TensorBoardCallback(xgb.callback.TrainingCallback):

            def __init__(self, log_dir: str):
                self.writer = SummaryWriter(log_dir=log_dir)

            def after_iteration(
                self,
                model,
                epoch: int,
                evals_log: xgb.callback.TrainingCallback.EvalsLog,
            ) -> bool:
                if not evals_log:
                    return False

                for data, metric in evals_log.items():
                    for metric_name, log in metric.items():
                        score = log[-1][0] if isinstance(log[-1], tuple) else log[-1]
                        self.writer.add_scalar(f"XGBoost/{data}", score, epoch)

                return False

        tensorboard_callback = TensorBoardCallback(log_dir)

        loss = (
            "reg:squarederror"
            if _target_type == "regression"
            else ("binary:logistic" if num_class <= 2 else "multi:softprob")
        )
        eval_metric = (
            "rmse"
            if _target_type == "regression"
            else ("logloss" if num_class <= 2 else "mlogloss")
        )
        model = xgb.train(
            params={
                **params["model_params"],
                "objective": loss,
                "eval_metric": eval_metric,
                "num_class": num_class,
            },
            num_boost_round=params["num_boost_round"],
            dtrain=train_data,
            evals=[(val_data, "val"), (train_data, "train")],
            callbacks=[
                xgb.callback.EarlyStopping(
                    rounds=params["early_stopping_rounds"], save_best=True
                ),
                xgb.callback.EvaluationMonitor(),  # This shows evaluation results at each iteration
                tensorboard_callback,
            ],
            evals_result=evals_result,  # Record evaluation result
            verbose_eval=0,
        )

    model.model_name = create_model
    logger.info(f"Successfully created a {model.model_name} at {datetime.now()}")

    # Close the writer after training is done
    writer.close()

    if _plot:
        # Plot loss per epoch
        train_loss = evals_result["train"][eval_metric]
        val_loss = evals_result["val"][eval_metric]
        logs = pd.DataFrame({"train": train_loss, "val": val_loss})

        plt.figure(figsize=(14, 4))
        plt.plot(logs.loc[:, "train"], lw=2, label="Training loss")
        plt.plot(logs.loc[:, "val"], lw=2, label="Validation loss")
        plt.xlabel("Epoch")
        plt.ylabel("Loss")
        plt.legend()
        plt.show()

    return model


def fit_recurrent(x_train, y_train, x_val, y_val, create_model, params, config):

    # Create the model
    labels = np.unique(y_train[:, 0])
    num_class = labels.size if _target_type == "classification" else None
    input_shape = (x_train.shape[1], x_train.shape[2])
    model = create_model(params, input_shape, _target_type, num_class)

    # Compile the model
    loss = (
        rmse_tf
        if _target_type == "regression"
        else (
            BinaryCrossentropy(from_logits=False)
            if num_class <= 2
            else CategoricalCrossentropy(from_logits=False)
        )
    )
    optimizer = Adam(learning_rate=params["learning_rate"], clipnorm=params["clipnorm"])
    metrics = (
        [mae_tf]
        if _target_type == "regression"
        else (
            ["accuracy", Precision(), Recall()]
            if num_class <= 2
            else ["categorical_accuracy"]
        )
    )
    model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

    # Callbacks
    log_dir = get_log_dir(_training_target_dir, model.model_name)

    tensorboard_callback = TensorBoard(log_dir=log_dir)
    early_stopping_callback = EarlyStopping(
        monitor="val_loss", patience=3, restore_best_weights=True, start_from_epoch=5
    )

    # Custom callbacks
    class PrintTrainableWeights(keras.callbacks.Callback):
        def on_epoch_end(self, epoch, logs={}):
            logger.info(model.trainable_variables)

    class GradientCalcCallback(keras.callbacks.Callback):
        def __init__(self):
            self.epoch_gradient = []

        def get_gradient_func(self, model):
            # grads = K.gradients(model.total_loss, model.trainable_weights)
            grads = K.gradients(model.loss, model.trainable_weights)
            # inputs = model.model.inputs + model.targets + model.sample_weights
            # use below line of code if above line doesn't work for you
            # inputs = model.model._feed_inputs + model.model._feed_targets + model.model._feed_sample_weights
            inputs = (
                model._feed_inputs + model._feed_targets + model._feed_sample_weights
            )
            func = K.function(inputs, grads)
            return func

        def on_epoch_end(self, epoch, logs=None):
            get_gradient = self.get_gradient_func(model)
            grads = get_gradient([x_val, y_val[:, 0], np.ones(len(y_val[:, 0]))])
            self.epoch_gradient.append(grads)

    # Train the model
    if _target_type == "classification" and num_class > 2:
        lb = LabelBinarizer(sparse_output=False)  # Change to True for sparse matrix
        lb.fit(labels)
        y_train = lb.transform(y_train[:, 0].flatten())
        y_val = lb.transform(y_val[:, 0].flatten())
    else:
        y_train = y_train[:, 0].flatten()
        y_val = y_val[:, 0].flatten()

    logger.info("Fitting the model...")
    logger.info(f"x_train shape: {x_train.shape}, x_val shape: {x_val.shape}")
    logger.info(f"y_train shape: {y_train.shape}, y_val shape: {y_val.shape}")

    history = model.fit(
        x_train,
        y_train,
        batch_size=params["batch_size"],
        verbose=0,
        epochs=params["epochs"],
        shuffle=False,
        validation_data=(x_val, y_val),
        callbacks=[early_stopping_callback, tensorboard_callback],
    )

    logger.info(f"Successfully created a {model.model_name} at {datetime.now()}")
    # logger.info(pd.DataFrame(gradiant.epoch_gradient))

    if _plot:
        # Plot loss per epoch
        logs = pd.DataFrame(history.history)

        plt.figure(figsize=(14, 4))
        plt.plot(logs.loc[:, "loss"], lw=2, label="Training loss")
        plt.plot(logs.loc[:, "val_loss"], lw=2, label="Validation loss")
        plt.xlabel("Epoch")
        plt.ylabel("Loss")
        plt.legend()
        plt.show()

    return model


def predict(
    model, data: pd.DataFrame, target_type: str, config: dict, threshold: float = 0.5
):
    """Function to get predictions from a model. Supports sklearn, keras and boosting models such as XGBoost and LightGBM

    Args:
        - model: the trained model used to predict values
        - data: the data for prediction
        - target_type: classification or regression
        - config: dict containing model config
    """
    if config["recurrent"] or model.model_name in ["lgb", "xgb"]:
        # keras, lgb & xgb
        if model.model_name == "lgb":
            # Direct prediction for LightGBM
            pred = model.predict(data)
        elif model.model_name == "xgb":
            # Convert val_data to DMatrix for XGBoost
            d_data = xgb.DMatrix(data)
            pred = model.predict(d_data)
        else:
            # Reshape (flatten) for keras if not multiclass
            pred = model.predict(data)
            if pred.shape[1] == 1:
                pred = pred.reshape(-1)

        if target_type == "classification":
            num_class = pred.shape[1] if len(pred.shape) > 1 else 2

            if num_class <= 2:
                # For binary classification, concatenate the predicted probabilities for both classes
                pred_df = pd.DataFrame(
                    {
                        0: 1 - pred,  # Probability of class 0
                        1: pred,  # Probability of class 1
                    },
                )
            else:
                # For multi-class classification, use the predicted probabilities for each class
                pred_df = pd.DataFrame(pred, columns=range(num_class))

            # Get final predictions (argmax for multi-class, threshold for binary)
            if num_class == 2:
                pred_df["PRED"] = np.where(
                    pred_df[1] >= threshold, 1, 0
                )  # Class 1 if prob >= threshold
            else:
                pred_df["PRED"] = pred_df.idxmax(
                    axis=1
                )  # Class with highest probability for multiclasses

            # Reorder columns to show predicted class first, then probabilities
            pred = pred_df[["PRED"] + list(range(num_class))]

        else:
            pred = pd.Series(pred, name="PRED")

        # set index for lgb and xgb (for keras, as we use np array, we need to set index outside)
        if model.model_name in ["lgb", "xgb"]:
            pred.index = data.index
    else:
        # sklearn
        pred = pd.Series(model.predict(data), index=data.index, name="PRED")
        if target_type == "classification":
            pred_proba = pd.DataFrame(
                model.predict_proba(data),
                index=data.index,
                columns=[
                    int(c) if isinstance(c, float) and c.is_integer() else c
                    for c in model.classes_
                ],
            )

            # Apply threshold for binary classification
            if len(model.classes_) == 2:
                positive_class = model.classes_[1]  # Assuming classes are ordered
                pred = (pred_proba[positive_class] >= threshold).astype(int)
                pred.name = "PRED"

            pred = pd.concat([pred, pred_proba], axis=1)

    return pred


def evaluate(prediction: pd.DataFrame, target_type: str):
    """
    Function to evaluate model performance

    Args:
        - prediction: the prediction dataframe containing TARGET and PRED columns, as well as predicted probabilities for each class for classification tasks
        - target_type: classification or regression
    """
    score = {}
    y_true = prediction["TARGET"]
    y_pred = prediction["PRED"]

    if target_type == "regression":
        # Main metrics
        score["RMSE"] = root_mean_squared_error(y_true, y_pred)
        score["MAE"] = mean_absolute_error(y_true, y_pred)
        score["MAPE"] = mean_absolute_percentage_error(y_true, y_pred)
        score["R2"] = r2_score(y_true, y_pred)

        # Robustness: avoid division by zero
        std_target = y_true.std()
        mean_target = y_true.mean()
        median_target = y_true.median()

        # RMSE / STD
        score["RMSE_STD_RATIO"] = (
            float(100 * score["RMSE"] / std_target) if std_target else 1000
        )

        # Median absolute deviation (MAD)
        mam = (y_true - mean_target).abs().median()  # Median Abs around Mean
        mad = (y_true - median_target).abs().median()  # Median Abs around Median
        score["MAM"] = mam
        score["MAD"] = mad
        score["MAE_MAM_RATIO"] = (
            float(100 * score["MAE"] / mam) if mam else 1000
        )  # MAE / MAM: more stable, less sensitive to outliers
        score["MAE_MAD_RATIO"] = (
            float(100 * score["MAE"] / mad) if mad else 1000
        )  # MAE / MAD (median absolute deviation around the median): less robust to outliers

    else:

        labels = np.unique(y_true)
        num_classes = labels.size
        y_pred_proba = (
            prediction[1] if num_classes == 2 else prediction.iloc[:, 2:].values
        )
        if num_classes > 2:
            lb = LabelBinarizer(sparse_output=False)  # Change to True for sparse matrix
            lb.fit(labels)
            y_true_onehot = lb.transform(y_true)
            y_pred_onehot = lb.transform(y_pred)

        score["LOGLOSS"] = log_loss(y_true, y_pred_proba)
        score["ACCURACY"] = accuracy_score(y_true, y_pred)
        score["PRECISION"] = precision_score(
            y_true,
            y_pred,
            average=("binary" if num_classes == 2 else "macro"),
        )
        score["RECALL"] = recall_score(
            y_true,
            y_pred,
            average=("binary" if num_classes == 2 else "macro"),
        )
        score["F1"] = f1_score(
            y_true,
            y_pred,
            average=("binary" if num_classes == 2 else "macro"),
        )
        score["ROC_AUC"] = float(roc_auc_score(y_true, y_pred_proba, multi_class="ovr"))
        (
            score["THRESHOLD"],
            score["PRECISION_AT_THRESHOLD"],
            score["RECALL_AT_THRESHOLD"],
        ) = (
            find_best_precision_threshold(prediction)
            if num_classes == 2
            else (None, None, None)
        )
    return score


def train_model(params, x_train, y_train, x_val, y_val, config):
    if "_type_name" in config.keys() and config["_type_name"] == "hyperopts":
        global _target_number
        global _target_type
        global _session_name
        global _plot
        global _type_name
        global _scaler_y
        global _training_target_dir
        _target_number = config["_target_number"]
        _target_type = config["_target_type"]
        _session_name = config["_session_name"]
        _plot = config["_plot"]
        _type_name = config["_type_name"]
        _scaler_y = config["_scaler_y"]
        _training_target_dir = config["_training_target_dir"]

        # warnings.filterwarnings("ignore", category=UserWarning, module="pydantic")
        # logging.getLogger("ray").setLevel(logging.CRITICAL)
        # logging.getLogger("ray.tune").setLevel(logging.CRITICAL)
        # logging.getLogger("ray.raylet").setLevel(logging.CRITICAL)
        # logging.getLogger("raylet").setLevel(logging.CRITICAL)

    logger.info(
        f"TARGET_{_target_number} - Training a {config['model_name']} at {datetime.now()} : {_session_name}, TARGET_{_target_number}"
    )

    recurrent = config["recurrent"]
    create_model = config["create_model"]

    if recurrent:
        timesteps = params["timesteps"]
        x_train = x_train[:, -timesteps:, :]
        x_val = x_val[:, -timesteps:, :]

    # Compile and fit model on train set
    start = time.time()
    if recurrent:
        fit = fit_recurrent
    elif (create_model == "lgb") or (create_model == "xgb"):
        fit = fit_boosting
    else:
        fit = fit_sklearn
    model = fit(
        x_train,
        y_train,
        x_val,
        y_val,
        create_model,
        params=params,
        config=config,
    )
    stop = time.time()

    # Prediction on val set
    y_pred = predict(model, x_val, _target_type, config)

    # fix for recurrent model because x_val has no index as it is a 3D np array
    if config["recurrent"]:
        y_val = pd.DataFrame(y_val, columns=["TARGET", "index"]).set_index("index")
        y_pred.index = y_val.index

    prediction = pd.concat([y_val, y_pred], axis=1)

    # Unscale the data
    if config["need_scaling"] and _target_type == "regression":
        # scaler_y needs 2D array with shape (-1, 1)
        prediction.loc[:, "TARGET"] = _scaler_y.inverse_transform(
            prediction[["TARGET"]].values
        )
        prediction.loc[:, "PRED"] = _scaler_y.inverse_transform(
            prediction[["PRED"]].values
        )

    # Evaluate model
    score = {
        "DATE": datetime.now(),
        "SESSION": _session_name,
        "TRAIN_DATA": x_train.shape[0],
        "VAL_DATA": x_val.shape[0],
        "FEATURES": x_train.shape[-1],
        "MODEL_NAME": model.model_name,
        "TYPE": _type_name,
        "TRAINING_TIME": stop - start,
        "EVAL_DATA_STD": prediction["TARGET"].std(),
    }

    score.update(evaluate(prediction, _target_type))

    if _type_name == "hyperopts":
        session.report(metrics=score)
        ray.tune.report(metrics=score)
        return score

    return score, model, prediction


# Main training function
def model_selection(
    dataset_id: int,
    models_idx: list,
    target_number: int,
    session_name,
    perform_hyperoptimization=True,
    perform_crossval=False,
    number_of_trials=20,
    plot=True,
    clean_dir=False,  # TODO: This is now unused because feature_selection is in the target directory
    preserve_model=True,
    reshaped_data=None,
    data=None,
):
    """
    Selects the best models based on a target variable, optionally performing hyperparameter optimization
    and cross-validation, and manages outputs in a session-specific directory.

    Args:
        models_idx (list):
            A list of indices or identifiers representing the models to evaluate.
            Each identifier corresponds to a predefined or available model.

        target_number (int):
            The number of the target variable (e.g., column index or predefined target) to predict.
            This determines the dataset's output variable for training and evaluation.

        session_name (str):
            A name for the current session, used to organize and store results
            (e.g., logs, metrics, trained models) in a session-specific directory.

        perform_hyperoptimization (bool, optional):
            Whether to perform hyperparameter optimization for the models.
            If `True`, the function will attempt to tune the hyperparameters of each model.
            Defaults to `True`.

        perform_crossval (bool, optional):
            Whether to perform cross-validation to evaluate model performance.
            If `True`, the function will use cross-validation to compute metrics.
            Defaults to `False`.

        number_of_trials (int, optional):
            The number of trials to run for hyperparameter optimization.
            Ignored if `perform_hyperoptimization` is `False`.
            Defaults to `20`.

        plot (bool, optional):
            Whether to enable plotting during the process.
            If `True`, plots will be displayed.
            Defaults to `True`.

        clean_dir (bool, optional):
            Whether to clean the entire target training directory before starting the process.
            If `True`, any existing files in the target training directory will be removed.
            Defaults to `False`.

        preserve_model (bool, optional):
            Whether to skip the search for a model when a best model already exists in its directory.
            If `True`, existing best models won't be erased and the search is skipped for them.
            Defaults to `True`.

    Returns:
        None
            The function runs the model selection process and outputs results
            (e.g., logs, metrics, and optionally models) to the session directory.
    """
    global _target_number
    global _target_type
    global _session_name
    global _plot
    global _type_name
    global _scaler_y
    global _training_target_dir

    global_vars = [
        "_target_number",
        "_target_type",
        "_session_name",
        "_plot",
        "_type_name",
        "_scaler_y",
        "_training_target_dir",
    ]

    _target_number = target_number
    _target_type = "classification" if target_number in TARGETS_CLF else "regression"
    _session_name = session_name
    _plot = plot

    if dataset_id is None:
        raise ValueError("dataset_id is not provided.")

    dataset = Dataset.get(dataset_id)
    dataset_dir = dataset.path

    training_target_dir = f"{dataset_dir}/TARGET_{_target_number}"
    _training_target_dir = training_target_dir

    metric = "RMSE" if _target_type == "regression" else "LOGLOSS"

    # load features, scalers and data
    features = dataset.get_features(target_number)
    all_features = dataset.get_all_features()

    if data:
        train = data["train"]
        val = data["val"]
        train_scaled = data["train_scaled"]
        val_scaled = data["val_scaled"]
        _scaler_y = (
            data["scalers_y"][f"scaler_y_{target_number}"]
            if _target_type == "regression"
            else None
        )
    else:
        train, val, train_scaled, val_scaled, _scaler_y = load_train_data(
            dataset_dir, target_number, _target_type
        )

    list_models = ml_models + dl_recurrent_models

    if any(list_models[i].get("recurrent") for i in models_idx):
        if reshaped_data is None:
            raise ValueError("reshaped_data is not provided.")

        logger.info("Loading reshaped data...")
        x_train_reshaped = reshaped_data["x_train_reshaped"]
        y_train_reshaped = reshaped_data["y_train_reshaped"]
        x_val_reshaped = reshaped_data["x_val_reshaped"]
        y_val_reshaped = reshaped_data["y_val_reshaped"]

    # create model selection in db
    target = Target.find_by(name=f"TARGET_{target_number}")
    model_selection = ModelSelection.upsert(
        match_fields=["target_id", "dataset_id"],
        target_id=target.id,
        dataset_id=dataset.id,
    )

    # recurrent models start at 9 # len(list_models)
    for i in models_idx:
        config = list_models[i]
        if config["recurrent"] is False and config[_target_type] is None:
            continue  # for naive bayes models that cannot be used in regression

        results_dir = f"{training_target_dir}/{config['model_name']}"
        if not os.path.exists(f"{results_dir}"):
            os.makedirs(f"{results_dir}")
        elif preserve_model and contains_best(results_dir):
            continue
        elif perform_hyperoptimization:
            clean_directory(results_dir)

        logger.info(f"Training a {config['model_name']}")
        model = Model.upsert(
            match_fields=["name", "type"],
            name=config["model_name"],
            type=_target_type,
        )
        model_training = ModelTraining.upsert(
            match_fields=["model_id", "model_selection_id"],
            model_id=model.id,
            model_selection_id=model_selection.id,
        )

        # getting data
        if config["recurrent"]:
            # Clear cluster from previous Keras session graphs.
            K.clear_session()

            features_idx = [i for i, e in enumerate(all_features) if e in set(features)]
            # TODO: Verify that features_idx are the right ones, because scaling can re-arrange columns...
            x_train = x_train_reshaped[:, :, features_idx]
            y_train = y_train_reshaped[:, [target_number, 0]]
            x_val = x_val_reshaped[:, :, features_idx]
            y_val = y_val_reshaped[:, [target_number, 0]]
        else:
            new_config = config[_target_type]
            new_config["model_name"] = config["model_name"]
            new_config["recurrent"] = config["recurrent"]
            new_config["need_scaling"] = config["need_scaling"]
            config = new_config

            if config["need_scaling"] and _target_type == "regression":
                x_train = train_scaled[features]
                y_train = train_scaled[f"TARGET_{target_number}"].rename("TARGET")
                x_val = val_scaled[features]
                y_val = val_scaled[f"TARGET_{target_number}"].rename("TARGET")
            else:
                x_train = train[features]
                y_train = train[f"TARGET_{target_number}"].rename("TARGET")
                x_val = val[features]
                y_val = val[f"TARGET_{target_number}"].rename("TARGET")

        start = time.time()
        # Tuning hyperparameters
        if perform_hyperoptimization:
            _type_name = "hyperopts"

            for var in global_vars:
                config[var] = globals()[var]

            logger.info("Start tuning hyperparameters...")

            storage_path = f"{results_dir}/ray_results"
            # ray.shutdown()
            # ray.init(
            #     runtime_env={
            #         "working_dir": ".",  # or your project path
            #         "env_vars": {"PYTHONPATH": "."}
            #     }
            # )
            tuner = Tuner(
                trainable=with_parameters(
                    train_model,
                    x_train=x_train,
                    y_train=y_train,
                    x_val=x_val,
                    y_val=y_val,
                    config=config,
                ),
                param_space=config["search_params"],
                tune_config=TuneConfig(
                    metric=metric,
                    mode="min",
                    search_alg=HyperOptSearch(),
                    num_samples=number_of_trials,
                    scheduler=ASHAScheduler(max_t=100, grace_period=10),
                ),
                run_config=RunConfig(
                    stop={"training_iteration": 100},
                    storage_path=storage_path,
                    # name=datetime.now().strftime("%d-%m-%Y") + "-" + session_name,
                    callbacks=[TBXLoggerCallback()],
                    # log_to_file=("stdout.log", "stderr.log"),  # deprecated
                    # verbose=0,
                ),
            )
            try:
                results = tuner.fit()

                best_result = results.get_best_result(metric, "max")
                best_params = best_result.config
                best_score = best_result.metrics

                # log results
                logger.info(f"Best hyperparameters found were:\n{best_params}")
                logger.info(f"Best Scores found were:\n{best_score}")

                df_results = results.get_dataframe()
                logger.info(
                    f"Markdown table with all trials :\n{df_results.to_markdown()}"
                )

                # save best params
                best_params_file = f"{training_target_dir}/best_params.json"
                try:
                    with open(best_params_file, "r") as f:
                        json_dict = json.load(f)
                except FileNotFoundError:
                    json_dict = {}

                json_dict[config["model_name"]] = serialize_for_json(best_params)
                with open(best_params_file, "w") as f:
                    json.dump(json_dict, f, indent=4)

            except Exception as e:
                logger.error(e)
                ray.shutdown()
                raise Exception(e)

            ray.shutdown()

            # Collect errors in single file
            collect_error_logs(
                training_target_dir=training_target_dir, storage_path=storage_path
            )

            # Clean up
            for var in global_vars:
                del config[var]
        else:
            try:
                with open(f"{training_target_dir}/best_params.json") as f:
                    json_dict = json.load(f)
                    best_params = json_dict[config["model_name"]]
            except Exception:
                raise FileNotFoundError(
                    f"Could not find {config['model_name']} in current data. Try to run a hyperoptimization by setting `perform_hyperoptimization` to true"
                )

        # Perform cross-validation of the best model on k-folds of train + val set
        if perform_crossval:
            x_train_val = pd.concat([x_train, x_val], axis=0)
            y_train_val = pd.concat([y_train, y_val], axis=0)
            n_splits = 4
            n_samples = len(x_train_val)
            test_size = int(n_samples / (n_splits + 4))
            tscv = TimeSeriesSplit(n_splits=n_splits, test_size=test_size)

            # Store the scores
            cross_validation_scores = []

            for i, (train_index, val_index) in enumerate(tscv.split(x_train_val)):
                _type_name = f"crossval_fold_{i}"

                if DATE_COLUMN:
                    date_column = train[DATE_COLUMN].copy()

                    if config.get("need_scaling"):
                        date_column = date_column.map(pd.Timestamp.fromordinal)

                    # Now you can use the actual train/val indices to extract ranges
                    train_start = date_column.iloc[train_index[0]]
                    train_end = date_column.iloc[train_index[-1]]
                    val_start = date_column.iloc[val_index[0]]
                    val_end = date_column.iloc[val_index[-1]]

                    logger.info(
                        f"[Fold {i}] Train: {len(train_index)} samples from {train_start.date()} to {train_end.date()} | "
                        f"Validation: {len(val_index)} samples from {val_start.date()} to {val_end.date()}"
                    )
                else:
                    logger.info(
                        f"[Fold {i}] Train: {len(train_index)} samples | Validation: {len(val_index)} samples"
                    )

                # Train the model and get the score
                if config["recurrent"]:
                    cross_validation_score, _, _ = train_model(
                        params=best_params,
                        x_train=x_train_val[train_index],
                        y_train=y_train_val[train_index],
                        x_val=x_train_val[val_index],
                        y_val=y_train_val[val_index],
                        config=config,
                    )
                else:
                    cross_validation_score, _, _ = train_model(
                        params=best_params,
                        x_train=x_train_val.iloc[train_index],
                        y_train=y_train_val.iloc[train_index],
                        x_val=x_train_val.iloc[val_index],
                        y_val=y_train_val.iloc[val_index],
                        config=config,
                    )

                # Append score to the list
                cross_validation_scores.append(cross_validation_score)

            # Calculate and log the mean score
            cross_validation_mean_score = pd.DataFrame(cross_validation_scores)[
                metric
            ].mean()
            logger.info(
                f"Best model mean cross-validation score: {cross_validation_mean_score}"
            )

            # Retrain on entire training set, but keep score on cross-validation folds
            best_score, best_model, best_pred = train_model(
                params=best_params,
                x_train=x_train,
                y_train=y_train,
                x_val=x_val,
                y_val=y_val,
                config=config,
            )
            best_score = cross_validation_mean_score
        else:
            # Evaluate on validation set
            _type_name = "validation"
            best_score, best_model, best_pred = train_model(
                params=best_params,
                x_train=x_train,
                y_train=y_train,
                x_val=x_val,
                y_val=y_val,
                config=config,
            )

        logger.info(f"Best model scores on validation set: {best_score}")

        # Save validation predictions
        best_pred.to_csv(
            f"{results_dir}/pred_val.csv",
            index=True,
            header=True,
            index_label="ID",
        )

        # Save best model
        if config["recurrent"]:
            model_path = f"{results_dir}/{best_model.model_name}.keras"
            best_model.save(model_path)
        else:
            model_path = f"{results_dir}/{best_model.model_name}.best"
            joblib.dump(best_model, model_path)

        model_path = Path(model_path).resolve()
        best_score["MODEL_PATH"] = model_path

        # Track scores
        scores_tracking_path = f"{training_target_dir}/scores_tracking.csv"
        best_score_df = pd.DataFrame([best_score])

        if os.path.exists(scores_tracking_path):
            existing_scores = pd.read_csv(scores_tracking_path)
            common_cols = existing_scores.columns.intersection(best_score_df.columns)
            best_score_df = best_score_df[common_cols]
            scores_tracking = pd.concat(
                [existing_scores, best_score_df], ignore_index=True
            )
        else:
            scores_tracking = best_score_df

        scores_tracking.sort_values(metric, ascending=True, inplace=True)
        scores_tracking.to_csv(scores_tracking_path, index=False)

        # Save model training metadata
        stop = time.time()
        training_time = stop - start
        model_training.best_params = best_params
        model_training.model_path = model_path
        model_training.training_time = training_time
        model_training.save()

        # Store metrics in DB
        drop_cols = [
            "DATE",
            "SESSION",
            "TRAIN_DATA",
            "VAL_DATA",
            "FEATURES",
            "MODEL_NAME",
            "MODEL_PATH",
        ]
        best_score = {k: v for k, v in best_score.items() if k not in drop_cols}
        score_data = {k.lower(): v for k, v in best_score.items()}

        Score.upsert(
            match_fields=["model_training_id"],
            model_training_id=model_training.id,
            **score_data,
        )

        logger.info(f"Model training finished in {training_time:.2f} seconds")

    # find best model type
    scores_tracking_path = f"{training_target_dir}/scores_tracking.csv"
    scores_tracking = pd.read_csv(scores_tracking_path)
    best_score_overall = scores_tracking.iloc[0, :]
    best_model_name = best_score_overall["MODEL_NAME"]

    # Remove any .best or .keras files
    for file_path in glob.glob(os.path.join(training_target_dir, "*.best")) + glob.glob(
        os.path.join(training_target_dir, "*.keras")
    ):
        os.remove(file_path)
    # Copy the best model in root training folder for this target
    best_model_path = Path(
        f"{training_target_dir}/{os.path.basename(best_score_overall['MODEL_PATH'])}"
    ).resolve()
    copy_any(
        best_score_overall["MODEL_PATH"],
        best_model_path,
    )

    with open(f"{training_target_dir}/best_params.json", "r") as f:
        best_model_params = json.load(f)[best_model_name]

    # save model_selection results to db
    model_selection = ModelSelection.get(model_selection.id)
    model_selection.best_model_id = Model.find_by(
        name=best_score_overall["MODEL_NAME"], type=_target_type
    ).id
    model_selection.best_model_params = best_model_params
    model_selection.best_model_path = best_model_path
    model_selection.save()

    logger.info(f"Best model overall is : {best_score_overall}")


def collect_error_logs(training_target_dir: str, storage_path: str):

    output_error_file = f"{training_target_dir}/errors.log"

    with open(output_error_file, "a") as outfile:
        # Walk through the ray_results directory
        for root, dirs, files in os.walk(storage_path):
            # Check if 'error.txt' exists in the current directory
            if "error.txt" in files:
                error_file_path = os.path.join(root, "error.txt")
                logger.info(f"Processing error file: {error_file_path}")
                # Read and append the content of the error.txt file
                with open(error_file_path, "r") as infile:
                    outfile.write(f"\n\n=== Error from {error_file_path} ===\n")
                    outfile.write(infile.read())
    logger.info(f"All errors written to {output_error_file}")


def plot_evaluation_for_classification(prediction: dict):
    """
    Args
        prediction (pd.DataFrame): Should be a df with TARGET, PRED, 0, 1 columns for y_true value (TARGET), y_pred (PRED), and probabilities (for classification only: 0 and 1)
    """
    y_true = prediction["TARGET"]
    y_pred = prediction["PRED"]
    y_pred_proba = prediction[1] if 1 in prediction.columns else prediction["1"]

    # Plot confusion matrix
    plot_confusion_matrix(y_true, y_pred)

    # Compute ROC curve and ROC area
    fpr, tpr, thresholds = roc_curve(y_true, y_pred_proba)
    roc_auc = auc(fpr, tpr)

    plt.figure(figsize=(8, 8))
    plt.plot(
        fpr, tpr, color="darkorange", lw=2, label="ROC curve (area = %0.2f)" % roc_auc
    )
    plt.plot([0, 1], [0, 1], color="navy", lw=2, linestyle="--")
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title("ROC Curve")
    plt.legend(loc="lower right")
    plt.show()

    # Compute precision-recall curve
    precision, recall, _ = precision_recall_curve(y_true, y_pred_proba)
    average_precision = average_precision_score(y_true, y_pred_proba)

    plt.figure(figsize=(8, 8))
    plt.step(recall, precision, color="b", alpha=0.2, where="post")
    plt.fill_between(recall, precision, step="post", alpha=0.2, color="b")
    plt.xlabel("Recall")
    plt.ylabel("Precision")
    plt.ylim([0.0, 1.05])
    plt.xlim([0.0, 1.0])
    plt.title("Precision-Recall Curve: AP={0:0.2f}".format(average_precision))
    plt.show()


def plot_confusion_matrix(y_true, y_pred):
    unique_labels = np.unique(np.concatenate((y_true, y_pred)))
    cm = confusion_matrix(y_true, y_pred)

    labels = np.sort(unique_labels)  # Sort labels based on numerical order

    plt.figure(figsize=(10, 7))
    sns.heatmap(cm, annot=True, fmt="d", cmap="viridis")
    plt.xlabel("Predicted", fontsize=12)
    plt.ylabel("True", fontsize=12)
    plt.title("Confusion Matrix", fontsize=14)

    plt.xticks(ticks=np.arange(len(labels)), labels=labels, fontsize=10)
    plt.yticks(ticks=np.arange(len(labels)), labels=labels, fontsize=10)

    plt.show()
# THRESHOLD
def find_max_f1_threshold(prediction):
    """
    Finds the threshold that maximizes the F1 score for a binary classification task.

    Parameters:
    - prediction: DataFrame with 'TARGET' and '1' (predicted probabilities) columns

    Returns:
    - best_threshold: The threshold that maximizes the F1 score
    - best_precision: The precision at that threshold
    - best_recall: The recall at that threshold
    """
    y_true = prediction["TARGET"]
    y_pred_proba = prediction[1] if 1 in prediction.columns else prediction["1"]

    # Compute precision, recall, and thresholds
    precision, recall, thresholds = precision_recall_curve(y_true, y_pred_proba)

    # Drop the first element to align with thresholds
    precision = precision[1:]
    recall = recall[1:]

    # Filter out trivial cases (precision or recall = 0)
    valid = (precision > 0) & (recall > 0)
    if not np.any(valid):
        raise ValueError("No valid threshold with non-zero precision and recall")

    precision = precision[valid]
    recall = recall[valid]
    thresholds = thresholds[valid]

    # Compute F1 scores for each threshold
    f1_scores = 2 * (precision * recall) / (precision + recall + 1e-10)

    best_index = np.argmax(f1_scores)

    best_threshold = thresholds[best_index]
    best_precision = precision[best_index]
    best_recall = recall[best_index]

    return best_threshold, best_precision, best_recall

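# Minimal usage sketch (illustrative only, with made-up probabilities): the threshold
# helpers in this block all expect a frame holding the true labels in "TARGET" and the
# positive-class probabilities in a column named 1 (or "1").
#
#     import pandas as pd
#     pred_val = pd.DataFrame(
#         {"TARGET": [0, 0, 1, 1, 1], "1": [0.10, 0.40, 0.35, 0.80, 0.95]}
#     )
#     thr, prec, rec = find_max_f1_threshold(pred_val)
#     y_hat = (pred_val["1"] >= thr).astype(int)
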
def find_best_f1_threshold(prediction, fscore_target: float):
    """
    Finds the highest threshold achieving at least the given F1 score target.

    Parameters:
    - prediction: DataFrame with 'TARGET' and '1' (or 1 as int) columns
    - fscore_target: Desired minimum F1 score (between 0 and 1)

    Returns:
    - best_threshold: The highest threshold meeting the F1 target
    - best_precision: Precision at that threshold
    - best_recall: Recall at that threshold
    - best_f1: Actual F1 score at that threshold
    """
    y_true = prediction["TARGET"]
    y_pred_proba = prediction[1] if 1 in prediction.columns else prediction["1"]

    precision, recall, thresholds = precision_recall_curve(y_true, y_pred_proba)

    # Align precision/recall with thresholds
    precision = precision[1:]
    recall = recall[1:]
    f1_scores = 2 * (precision * recall) / (precision + recall + 1e-10)

    # Filter for thresholds meeting the F1 target
    valid_indices = [i for i, f1 in enumerate(f1_scores) if f1 >= fscore_target]

    if not valid_indices:
        raise ValueError(f"Could not find a threshold with F1 >= {fscore_target:.2f}")

    # Pick the highest threshold among valid ones
    best_index = valid_indices[-1]

    return (
        thresholds[best_index],
        precision[best_index],
        recall[best_index],
        f1_scores[best_index],
    )

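# Illustrative sketch only (the 0.60 target and the 0.5 fallback are assumptions, not
# package defaults): unlike find_max_f1_threshold, this variant returns a 4-tuple and
# raises ValueError when no threshold reaches the requested F1, so callers may want to
# catch that case.
#
#     try:
#         thr, prec, rec, f1 = find_best_f1_threshold(pred_val, fscore_target=0.60)
#     except ValueError:
#         thr = 0.5  # fall back to a default cut-off
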
def find_max_precision_threshold_without_trivial_case(prediction):
    """
    Finds the threshold that maximizes precision while excluding the trivial
    case of precision == 1.0, which typically means every sample is predicted
    as the negative class (0).

    Parameters:
    - prediction: DataFrame with 'TARGET' (true labels) and '1' (predicted probabilities) columns

    Returns:
    - best_threshold: the probability threshold that maximizes precision
    - best_precision: the precision achieved at this threshold
    - best_recall: the recall achieved at this threshold
    """
    y_true = prediction["TARGET"]
    y_pred_proba = prediction[1] if 1 in prediction.columns else prediction["1"]

    # Compute precision, recall, and thresholds
    precision, recall, thresholds = precision_recall_curve(y_true, y_pred_proba)

    # Drop the first element of precision and recall to align with thresholds
    precision = precision[1:]
    recall = recall[1:]

    # Filter out precision == 1.0 (which might correspond to predicting only 0s)
    valid_indices = np.where(precision < 1.0)[0]
    if len(valid_indices) == 0:
        raise ValueError("No valid precision values less than 1.0")

    precision = precision[valid_indices]
    recall = recall[valid_indices]
    thresholds = thresholds[valid_indices]

    # Find the index of the maximum precision
    best_index = np.argmax(precision)

    # Return the corresponding threshold, precision, and recall
    best_threshold = thresholds[best_index]
    best_precision = precision[best_index]
    best_recall = recall[best_index]

    return best_threshold, best_precision, best_recall

def find_best_precision_threshold(prediction, precision_target: float = 0.80):
    """
    Finds the highest threshold that achieves at least the given precision target.

    Parameters:
    prediction (pd.DataFrame): DataFrame with columns 'TARGET' and '1' (or 1) for predicted probabilities
    precision_target (float): Desired minimum precision (between 0 and 1)

    Returns:
    tuple: (threshold, precision, recall) achieving the desired precision
    """
    y_true = prediction["TARGET"]
    y_pred_proba = prediction[1] if 1 in prediction.columns else prediction["1"]

    precision, recall, thresholds = precision_recall_curve(y_true, y_pred_proba)

    # Align lengths: thresholds has N-1 entries compared to precision/recall
    precision = precision[1:]  # Shift to match thresholds
    recall = recall[1:]

    valid_indices = [i for i, p in enumerate(precision) if p >= precision_target]

    if not valid_indices:
        raise ValueError(
            f"Could not find a threshold with precision >= {precision_target}"
        )

    best_idx = valid_indices[-1]  # Highest threshold with precision >= target

    return thresholds[best_idx], precision[best_idx], recall[best_idx]

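# Sketch of a typical call (the 0.90 target is an example value, not the default):
# trade recall away for precision by asking for the highest threshold that still
# keeps precision at or above the target.
#
#     thr, prec, rec = find_best_precision_threshold(pred_val, precision_target=0.90)
#     logger.info(f"threshold={thr:.3f} precision={prec:.3f} recall={rec:.3f}")
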
def find_best_recall_threshold(prediction, recall_target: float = 0.98):
    """
    Finds the highest threshold that achieves at least the given recall target.

    Parameters:
    prediction (pd.DataFrame): DataFrame with columns 'TARGET' and '1' (or 1) for predicted probabilities
    recall_target (float): Desired minimum recall (between 0 and 1)

    Returns:
    tuple: (threshold, precision, recall) achieving the desired recall,
    or (None, None, None) if the target is not reachable
    """
    y_true = prediction["TARGET"]
    y_pred_proba = prediction[1] if 1 in prediction.columns else prediction["1"]

    precision, recall, thresholds = precision_recall_curve(y_true, y_pred_proba)

    # `thresholds` has length N-1 compared to precision and recall
    recall = recall[1:]  # Drop first element to align with thresholds
    precision = precision[1:]

    valid_indices = [i for i, r in enumerate(recall) if r >= recall_target]

    if not valid_indices:
        logger.warning(f"Could not find a threshold with recall >= {recall_target}")
        return None, None, None

    best_idx = valid_indices[-1]  # Highest threshold with recall >= target

    return thresholds[best_idx], precision[best_idx], recall[best_idx]

def plot_threshold(prediction, threshold, precision, recall):
    y_pred_proba = prediction[1] if 1 in prediction.columns else prediction["1"]
    y_true = prediction["TARGET"]

    predicted_positive = (y_pred_proba >= threshold).sum()
    predicted_negative = (y_pred_proba < threshold).sum()
    f1_score = 2 * (precision * recall) / (precision + recall + 1e-10)
    per_predicted_positive = predicted_positive / len(y_pred_proba)
    per_predicted_negative = predicted_negative / len(y_pred_proba)

    logger.info(
        f"""Threshold: {threshold*100:.2f}
Precision: {precision*100:.2f}
Recall: {recall*100:.2f}
F1-score: {f1_score*100:.2f}
% of scores over {threshold}: {predicted_positive}/{len(y_pred_proba)} = {per_predicted_positive*100:.2f}%
% of scores under {threshold}: {predicted_negative}/{len(y_pred_proba)} = {per_predicted_negative*100:.2f}%"""
    )

    # Visualizing the scores of positive and negative classes
    plt.figure(figsize=(10, 6))
    sns.histplot(
        y_pred_proba[y_true == 1],
        color="blue",
        label="Positive Class",
        bins=30,
        kde=True,
        alpha=0.6,
    )
    sns.histplot(
        y_pred_proba[y_true == 0],
        color="red",
        label="Negative Class",
        bins=30,
        kde=True,
        alpha=0.6,
    )
    plt.axvline(
        x=threshold,
        color="green",
        linestyle="--",
        label=f"Threshold at {round(threshold, 3)}",
    )
    plt.title("Distribution of Predicted Probabilities")
    plt.xlabel("Predicted Probabilities")
    plt.ylabel("Frequency")
    plt.legend()
    plt.show()
    return threshold

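# End-to-end sketch (illustrative; the 0.95 recall target is an assumption): pick a
# threshold with one of the helpers above, then inspect it with plot_threshold, which
# logs the metrics and shows the per-class probability distributions.
#
#     thr, prec, rec = find_best_recall_threshold(pred_val, recall_target=0.95)
#     if thr is not None:
#         plot_threshold(pred_val, thr, prec, rec)
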
def print_scores(training_target_dir: str):
    """
    Monitor scores tracked during training
    """
    scores_tracking = pd.read_csv(f"{training_target_dir}/scores_tracking.csv")
    return scores_tracking

# OLD - to sort out
def get_pred_distribution(training_target_dir: str, model_name="linear"):
    """
    Look at prediction distributions
    """
    prediction = pd.read_csv(
        f"{training_target_dir}/{model_name}/pred_val.csv",
        index_col="ID",
    )
    return prediction.describe()

def plot_feature_importance(training_target_dir: str, model_name="linear"):
    """
    Monitor feature importance ranking to filter out irrelevant features
    """
    model = joblib.load(f"{training_target_dir}/{model_name}/{model_name}.best")
    if hasattr(model, "feature_importances_"):
        feature_importances_ = model.feature_importances_.flatten()
    elif hasattr(model, "feature_importance"):
        importance = model.feature_importance
        if callable(importance):  # e.g. LightGBM boosters expose this as a method
            importance = importance()
        feature_importances_ = np.asarray(importance).flatten()
    elif hasattr(model, "coefs_"):
        feature_importances_ = np.mean(model.coefs_[0], axis=1).flatten()
    elif hasattr(model, "coef_"):
        feature_importances_ = model.coef_.flatten()
    else:
        feature_importances_ = []

    sns.barplot(
        data=feature_importances_,
        orient="h",
    )

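# Plotting the raw array loses the feature names; a sketch of a more readable variant
# (feature_names is hypothetical here and would come from the dataset the model was
# trained on):
#
#     importances = pd.Series(feature_importances_, index=feature_names).sort_values()
#     sns.barplot(x=importances.values, y=importances.index, orient="h")
#     plt.show()
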
def print_model_estimators(training_target_dir: str, model_name="linear"):
    """
    Look at a specific trained model
    """
    model = joblib.load(f"{training_target_dir}/{model_name}/{model_name}.best")
    for estimator in model.estimators_[:100]:
        logger.info(estimator.get_depth())


def get_model_info(model):
    logger.info(model.count_params())
    model.summary()