lecrapaud 0.4.0__py3-none-any.whl → 0.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lecrapaud/__init__.py +1 -0
- lecrapaud/api.py +277 -0
- lecrapaud/config.py +10 -0
- lecrapaud/db/__init__.py +1 -0
- lecrapaud/db/alembic/env.py +2 -2
- lecrapaud/db/alembic/versions/2025_05_31_1834-52b809a34371_make_nullablee.py +24 -12
- lecrapaud/db/alembic/versions/2025_06_17_1652-c45f5e49fa2c_make_fields_nullable.py +89 -0
- lecrapaud/db/alembic.ini +116 -0
- lecrapaud/db/models/__init__.py +10 -10
- lecrapaud/db/models/base.py +176 -1
- lecrapaud/db/models/dataset.py +25 -20
- lecrapaud/db/models/feature.py +5 -6
- lecrapaud/db/models/feature_selection.py +3 -4
- lecrapaud/db/models/feature_selection_rank.py +3 -4
- lecrapaud/db/models/model.py +3 -4
- lecrapaud/db/models/model_selection.py +15 -8
- lecrapaud/db/models/model_training.py +15 -7
- lecrapaud/db/models/score.py +9 -6
- lecrapaud/db/models/target.py +16 -8
- lecrapaud/db/session.py +66 -0
- lecrapaud/experiment.py +64 -0
- lecrapaud/feature_engineering.py +747 -1022
- lecrapaud/feature_selection.py +915 -998
- lecrapaud/integrations/openai_integration.py +225 -0
- lecrapaud/jobs/__init__.py +2 -2
- lecrapaud/jobs/config.py +1 -1
- lecrapaud/jobs/scheduler.py +1 -1
- lecrapaud/jobs/tasks.py +6 -6
- lecrapaud/model_selection.py +1060 -960
- lecrapaud/search_space.py +4 -0
- lecrapaud/utils.py +2 -2
- lecrapaud-0.4.1.dist-info/METADATA +171 -0
- {lecrapaud-0.4.0.dist-info → lecrapaud-0.4.1.dist-info}/RECORD +36 -35
- {lecrapaud-0.4.0.dist-info → lecrapaud-0.4.1.dist-info}/WHEEL +1 -1
- lecrapaud/db/crud.py +0 -179
- lecrapaud/db/services.py +0 -0
- lecrapaud/db/setup.py +0 -58
- lecrapaud/predictions.py +0 -292
- lecrapaud/training.py +0 -151
- lecrapaud-0.4.0.dist-info/METADATA +0 -103
- /lecrapaud/{directory_management.py → directories.py} +0 -0
- {lecrapaud-0.4.0.dist-info → lecrapaud-0.4.1.dist-info}/LICENSE +0 -0
lecrapaud/model_selection.py
CHANGED
@@ -10,6 +10,7 @@ import warnings
 import joblib
 import glob
 from pathlib import Path
+import pickle

 os.environ["COVERAGE_FILE"] = str(Path(".coverage").resolve())

@@ -63,12 +64,19 @@ from ray.tune.schedulers import ASHAScheduler
 from ray.air import session

 # Internal library
-from
-from
-from
-from
-from
-from
+from lecrapaud.search_space import all_models
+from lecrapaud.directories import clean_directory
+from lecrapaud.utils import copy_any, contains_best, logger, serialize_for_json
+from lecrapaud.config import PYTHON_ENV
+from lecrapaud.feature_selection import load_train_data
+from lecrapaud.db import (
+    Model,
+    ModelSelection,
+    ModelTraining,
+    Score,
+    Target,
+    Dataset,
+)

 # Reproducible result
 keras.utils.set_random_seed(42)
@@ -100,1116 +108,1216 @@ def test_hardware():
|
|
|
100
108
|
warnings.filterwarnings("ignore", category=UserWarning, module="pydantic")
|
|
101
109
|
|
|
102
110
|
|
|
103
|
-
|
|
104
|
-
def rmse_tf(y_true, y_pred):
|
|
105
|
-
y_true, y_pred = unscale_tf(y_true, y_pred)
|
|
106
|
-
results = K.sqrt(K.mean(K.square(y_pred - y_true)))
|
|
107
|
-
return results
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
def mae_tf(y_true, y_pred):
|
|
111
|
-
y_true, y_pred = unscale_tf(y_true, y_pred)
|
|
112
|
-
results = K.mean(K.abs(y_pred - y_true))
|
|
113
|
-
return results
|
|
111
|
+
class ModelEngine:
|
|
114
112
|
|
|
113
|
+
def __init__(
|
|
114
|
+
self,
|
|
115
|
+
model_name: str = None,
|
|
116
|
+
target_type: str = None,
|
|
117
|
+
path: str = None,
|
|
118
|
+
search_params: dict = {},
|
|
119
|
+
create_model=None,
|
|
120
|
+
plot: bool = False,
|
|
121
|
+
log_dir: str = None,
|
|
122
|
+
):
|
|
123
|
+
self.path = path
|
|
124
|
+
if path:
|
|
125
|
+
self.load()
|
|
126
|
+
else:
|
|
127
|
+
self.model_name = model_name
|
|
128
|
+
self.target_type = target_type
|
|
115
129
|
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
130
|
+
config = [
|
|
131
|
+
config for config in all_models if config["model_name"] == self.model_name
|
|
132
|
+
]
|
|
133
|
+
if config is None or len(config) == 0:
|
|
134
|
+
Exception(
|
|
135
|
+
f"Model {self.model_name} is not supported by this library."
|
|
136
|
+
f"Choose a model from the list of supported models: {[model['model_name'] for model in all_models].join(', ')}"
|
|
137
|
+
)
|
|
138
|
+
config = config[0]
|
|
120
139
|
|
|
121
|
-
|
|
122
|
-
|
|
140
|
+
self.recurrent = config["recurrent"]
|
|
141
|
+
self.need_scaling = config["need_scaling"]
|
|
142
|
+
self.search_params = search_params
|
|
143
|
+
self.create_model = create_model
|
|
144
|
+
self.plot = plot
|
|
145
|
+
self.log_dir = log_dir
|
|
123
146
|
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
147
|
+
if self.need_scaling and self.target_type == "regression":
|
|
148
|
+
self.scaler_y = joblib.load(f"{self.path}/scaler_y.pkl")
|
|
149
|
+
else:
|
|
150
|
+
self.scaler_y = None
|
|
127
151
|
|
|
152
|
+
self.threshold = None
|
|
128
153
|
|
|
129
|
-
def
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
154
|
+
def fit(self, *args):
|
|
155
|
+
if self.recurrent:
|
|
156
|
+
fit = self.fit_recurrent
|
|
157
|
+
elif (self.create_model == "lgb") or (self.create_model == "xgb"):
|
|
158
|
+
fit = self.fit_boosting
|
|
159
|
+
else:
|
|
160
|
+
fit = self.fit_sklearn
|
|
161
|
+
model = fit(*args)
|
|
162
|
+
return model
|
|
163
|
+
|
|
164
|
+
# Functions to fit & evaluate models
|
|
165
|
+
def fit_sklearn(self, x_train, y_train, x_val, y_val, params):
|
|
166
|
+
|
|
167
|
+
# Create & Compile the model
|
|
168
|
+
model = self.create_model(**params)
|
|
169
|
+
|
|
170
|
+
# Train the model
|
|
171
|
+
logger.info("Fitting the model...")
|
|
172
|
+
logger.info(f"x_train shape: {x_train.shape}, x_val shape: {x_val.shape}")
|
|
173
|
+
logger.info(f"y_train shape: {y_train.shape}, y_val shape: {y_val.shape}")
|
|
174
|
+
|
|
175
|
+
model.fit(x_train, y_train)
|
|
176
|
+
|
|
177
|
+
if (
|
|
178
|
+
self.target_type == "classification"
|
|
179
|
+
and "loss" in model.get_params().keys()
|
|
180
|
+
and "hinge" in model.get_params()["loss"]
|
|
181
|
+
):
|
|
182
|
+
# This is for SVC models with hinge loss
|
|
183
|
+
# You should use CalibratedClassifierCV when you are working with classifiers that do not natively output well-calibrated probability estimates.
|
|
184
|
+
# TODO: investigate if we should use calibration for random forest, gradiant boosting models, and bagging models
|
|
185
|
+
logger.info(
|
|
186
|
+
f"Re-Calibrating {self.model_name} to get predict probabilities..."
|
|
187
|
+
)
|
|
188
|
+
calibrator = CalibratedClassifierCV(model, cv="prefit", n_jobs=-1)
|
|
189
|
+
model = calibrator.fit(x_train, y_train)
|
|
133
190
|
|
|
134
|
-
|
|
135
|
-
|
|
191
|
+
# set model_name after calibrator
|
|
192
|
+
model.model_name = self.model_name
|
|
193
|
+
model.target_type = self.target_type
|
|
136
194
|
|
|
195
|
+
logger.info(f"Successfully created a {model.model_name} at {datetime.now()}")
|
|
137
196
|
|
|
138
|
-
|
|
139
|
-
y_true = K.ones_like(y_true)
|
|
140
|
-
true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
|
|
197
|
+
self._model = model
|
|
141
198
|
|
|
142
|
-
|
|
143
|
-
precision = true_positives / (predicted_positives + K.epsilon())
|
|
144
|
-
return precision
|
|
199
|
+
return model
|
|
145
200
|
|
|
201
|
+
def fit_boosting(self, x_train, y_train, x_val, y_val, params):
|
|
202
|
+
"""
|
|
203
|
+
This is using lightGBM or XGboost C++ librairies
|
|
204
|
+
"""
|
|
205
|
+
lightGBM = self.create_model == "lgb"
|
|
146
206
|
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
207
|
+
# Datasets
|
|
208
|
+
boosting_dataset = lgb.Dataset if lightGBM else xgb.DMatrix
|
|
209
|
+
train_data = boosting_dataset(x_train, label=y_train)
|
|
210
|
+
val_data = boosting_dataset(x_val, label=y_val)
|
|
151
211
|
|
|
212
|
+
# Create a TensorBoardX writer
|
|
213
|
+
writer = SummaryWriter(self.log_dir)
|
|
214
|
+
evals_result = {}
|
|
152
215
|
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
216
|
+
# Training
|
|
217
|
+
labels = np.unique(y_train)
|
|
218
|
+
num_class = (
|
|
219
|
+
labels.size
|
|
220
|
+
if self.target_type == "classification" and labels.size > 2
|
|
221
|
+
else 1
|
|
222
|
+
)
|
|
223
|
+
logger.info("Fitting the model...")
|
|
224
|
+
logger.info(f"x_train shape: {x_train.shape}, x_val shape: {x_val.shape}")
|
|
225
|
+
logger.info(f"y_train shape: {y_train.shape}, y_val shape: {y_val.shape}")
|
|
162
226
|
|
|
163
|
-
|
|
164
|
-
def fit_sklearn(x_train, y_train, x_val, y_val, create_model, params, config):
|
|
227
|
+
if lightGBM:
|
|
165
228
|
|
|
166
|
-
|
|
167
|
-
|
|
229
|
+
def tensorboard_callback(env):
|
|
230
|
+
for i, metric in enumerate(env.evaluation_result_list):
|
|
231
|
+
metric_name, _, metric_value, _ = metric
|
|
232
|
+
writer.add_scalar(
|
|
233
|
+
f"LightGBM/{metric_name}", metric_value, env.iteration
|
|
234
|
+
)
|
|
168
235
|
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
236
|
+
loss = (
|
|
237
|
+
"regression"
|
|
238
|
+
if self.target_type == "regression"
|
|
239
|
+
else ("binary" if num_class <= 2 else "multiclass")
|
|
240
|
+
)
|
|
241
|
+
eval_metric = (
|
|
242
|
+
"rmse"
|
|
243
|
+
if self.target_type == "regression"
|
|
244
|
+
else ("binary_logloss" if num_class <= 2 else "multi_logloss")
|
|
245
|
+
)
|
|
246
|
+
model = lgb.train(
|
|
247
|
+
params={
|
|
248
|
+
**params["model_params"],
|
|
249
|
+
"objective": loss,
|
|
250
|
+
"metric": eval_metric,
|
|
251
|
+
"num_class": num_class,
|
|
252
|
+
},
|
|
253
|
+
num_boost_round=params["num_boost_round"],
|
|
254
|
+
train_set=train_data,
|
|
255
|
+
valid_sets=[train_data, val_data],
|
|
256
|
+
valid_names=["train", "val"],
|
|
257
|
+
callbacks=[
|
|
258
|
+
lgb.early_stopping(stopping_rounds=params["early_stopping_rounds"]),
|
|
259
|
+
lgb.record_evaluation(evals_result),
|
|
260
|
+
tensorboard_callback,
|
|
261
|
+
],
|
|
262
|
+
)
|
|
263
|
+
else:
|
|
173
264
|
|
|
174
|
-
|
|
265
|
+
class TensorBoardCallback(xgb.callback.TrainingCallback):
|
|
175
266
|
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
and "loss" in model.get_params().keys()
|
|
179
|
-
and "hinge" in model.get_params()["loss"]
|
|
180
|
-
):
|
|
181
|
-
# This is for SVC models with hinge loss
|
|
182
|
-
# You should use CalibratedClassifierCV when you are working with classifiers that do not natively output well-calibrated probability estimates.
|
|
183
|
-
# TODO: investigate if we should use calibration for random forest, gradiant boosting models, and bagging models
|
|
184
|
-
logger.info(
|
|
185
|
-
f"Re-Calibrating {config["model_name"]} to get predict probabilities..."
|
|
186
|
-
)
|
|
187
|
-
calibrator = CalibratedClassifierCV(model, cv="prefit", n_jobs=-1)
|
|
188
|
-
model = calibrator.fit(x_train, y_train)
|
|
267
|
+
def __init__(self, log_dir: str):
|
|
268
|
+
self.writer = SummaryWriter(log_dir=log_dir)
|
|
189
269
|
|
|
190
|
-
|
|
191
|
-
|
|
270
|
+
def after_iteration(
|
|
271
|
+
self,
|
|
272
|
+
model,
|
|
273
|
+
epoch: int,
|
|
274
|
+
evals_log: xgb.callback.TrainingCallback.EvalsLog,
|
|
275
|
+
) -> bool:
|
|
276
|
+
if not evals_log:
|
|
277
|
+
return False
|
|
192
278
|
|
|
193
|
-
|
|
279
|
+
for data, metric in evals_log.items():
|
|
280
|
+
for metric_name, log in metric.items():
|
|
281
|
+
score = (
|
|
282
|
+
log[-1][0] if isinstance(log[-1], tuple) else log[-1]
|
|
283
|
+
)
|
|
284
|
+
self.writer.add_scalar(f"XGBoost/{data}", score, epoch)
|
|
194
285
|
|
|
195
|
-
|
|
286
|
+
return False
|
|
196
287
|
|
|
288
|
+
tensorboard_callback = TensorBoardCallback(self.log_dir)
|
|
197
289
|
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
290
|
+
loss = (
|
|
291
|
+
"reg:squarederror"
|
|
292
|
+
if self.target_type == "regression"
|
|
293
|
+
else ("binary:logistic" if num_class <= 2 else "multi:softprob")
|
|
294
|
+
)
|
|
295
|
+
eval_metric = (
|
|
296
|
+
"rmse"
|
|
297
|
+
if self.target_type == "regression"
|
|
298
|
+
else ("logloss" if num_class <= 2 else "mlogloss")
|
|
299
|
+
)
|
|
300
|
+
model = xgb.train(
|
|
301
|
+
params={
|
|
302
|
+
**params["model_params"],
|
|
303
|
+
"objective": loss,
|
|
304
|
+
"eval_metric": eval_metric,
|
|
305
|
+
"num_class": num_class,
|
|
306
|
+
},
|
|
307
|
+
num_boost_round=params["num_boost_round"],
|
|
308
|
+
dtrain=train_data,
|
|
309
|
+
evals=[(val_data, "val"), (train_data, "train")],
|
|
310
|
+
callbacks=[
|
|
311
|
+
xgb.callback.EarlyStopping(
|
|
312
|
+
rounds=params["early_stopping_rounds"], save_best=True
|
|
313
|
+
),
|
|
314
|
+
xgb.callback.EvaluationMonitor(), # This shows evaluation results at each iteration
|
|
315
|
+
tensorboard_callback,
|
|
316
|
+
],
|
|
317
|
+
evals_result=evals_result, # Record evaluation result
|
|
318
|
+
verbose_eval=0,
|
|
319
|
+
)
|
|
203
320
|
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
val_data = Dataset(x_val, label=y_val)
|
|
321
|
+
model.model_name = self.create_model
|
|
322
|
+
model.target_type = self.target_type
|
|
323
|
+
logger.info(f"Successfully created a {model.model_name} at {datetime.now()}")
|
|
208
324
|
|
|
209
|
-
|
|
210
|
-
|
|
325
|
+
# Close the writer after training is done
|
|
326
|
+
writer.close()
|
|
211
327
|
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
328
|
+
if self.plot:
|
|
329
|
+
# Plot loss per epoch
|
|
330
|
+
train_loss = evals_result["train"][eval_metric]
|
|
331
|
+
val_loss = evals_result["val"][eval_metric]
|
|
332
|
+
logs = pd.DataFrame({"train": train_loss, "val": val_loss})
|
|
215
333
|
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
logger.info(f"y_train shape: {y_train.shape}, y_val shape: {y_val.shape}")
|
|
334
|
+
plt.figure(figsize=(14, 4))
|
|
335
|
+
plt.plot(logs.loc[:, "train"], lw=2, label="Training loss")
|
|
336
|
+
plt.plot(logs.loc[:, "val"], lw=2, label="Validation loss")
|
|
337
|
+
plt.xlabel("Epoch")
|
|
338
|
+
plt.ylabel("Loss")
|
|
339
|
+
plt.legend()
|
|
340
|
+
plt.show()
|
|
224
341
|
|
|
225
|
-
|
|
342
|
+
self._model = model
|
|
226
343
|
|
|
227
|
-
|
|
228
|
-
for i, metric in enumerate(env.evaluation_result_list):
|
|
229
|
-
metric_name, _, metric_value, _ = metric
|
|
230
|
-
writer.add_scalar(
|
|
231
|
-
f"LightGBM/{metric_name}", metric_value, env.iteration
|
|
232
|
-
)
|
|
344
|
+
return model
|
|
233
345
|
|
|
234
|
-
|
|
235
|
-
"regression"
|
|
236
|
-
if _target_type == "regression"
|
|
237
|
-
else ("binary" if num_class <= 2 else "multiclass")
|
|
238
|
-
)
|
|
239
|
-
eval_metric = (
|
|
240
|
-
"rmse"
|
|
241
|
-
if _target_type == "regression"
|
|
242
|
-
else ("binary_logloss" if num_class <= 2 else "multi_logloss")
|
|
243
|
-
)
|
|
244
|
-
model = lgb.train(
|
|
245
|
-
params={
|
|
246
|
-
**params["model_params"],
|
|
247
|
-
"objective": loss,
|
|
248
|
-
"metric": eval_metric,
|
|
249
|
-
"num_class": num_class,
|
|
250
|
-
},
|
|
251
|
-
num_boost_round=params["num_boost_round"],
|
|
252
|
-
train_set=train_data,
|
|
253
|
-
valid_sets=[train_data, val_data],
|
|
254
|
-
valid_names=["train", "val"],
|
|
255
|
-
callbacks=[
|
|
256
|
-
lgb.early_stopping(stopping_rounds=params["early_stopping_rounds"]),
|
|
257
|
-
lgb.record_evaluation(evals_result),
|
|
258
|
-
tensorboard_callback,
|
|
259
|
-
],
|
|
260
|
-
)
|
|
261
|
-
else:
|
|
346
|
+
def fit_recurrent(self, x_train, y_train, x_val, y_val, params):
|
|
262
347
|
|
|
263
|
-
|
|
348
|
+
# metrics functions
|
|
349
|
+
def rmse_tf(y_true, y_pred):
|
|
350
|
+
y_true, y_pred = unscale_tf(y_true, y_pred)
|
|
351
|
+
results = K.sqrt(K.mean(K.square(y_pred - y_true)))
|
|
352
|
+
return results
|
|
264
353
|
|
|
265
|
-
|
|
266
|
-
|
|
354
|
+
def mae_tf(y_true, y_pred):
|
|
355
|
+
y_true, y_pred = unscale_tf(y_true, y_pred)
|
|
356
|
+
results = K.mean(K.abs(y_pred - y_true))
|
|
357
|
+
return results
|
|
267
358
|
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
evals_log: xgb.callback.TrainingCallback.EvalsLog,
|
|
273
|
-
) -> bool:
|
|
274
|
-
if not evals_log:
|
|
275
|
-
return False
|
|
359
|
+
def unscale_tf(y_true, y_pred):
|
|
360
|
+
if self.target_type == "regression":
|
|
361
|
+
scale = K.constant(self.scaler_y.scale_[0])
|
|
362
|
+
mean = K.constant(self.scaler_y.mean_[0])
|
|
276
363
|
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
score = log[-1][0] if isinstance(log[-1], tuple) else log[-1]
|
|
280
|
-
self.writer.add_scalar(f"XGBoost/{data}", score, epoch)
|
|
364
|
+
y_true = K.mul(y_true, scale)
|
|
365
|
+
y_true = K.bias_add(y_true, mean)
|
|
281
366
|
|
|
282
|
-
|
|
367
|
+
y_pred = K.mul(y_pred, scale)
|
|
368
|
+
y_pred = K.bias_add(y_pred, mean)
|
|
369
|
+
return y_true, y_pred
|
|
283
370
|
|
|
284
|
-
|
|
371
|
+
# Create the model
|
|
372
|
+
labels = np.unique(y_train[:, 0])
|
|
373
|
+
num_class = labels.size if self.target_type == "classification" else None
|
|
374
|
+
input_shape = (x_train.shape[1], x_train.shape[2])
|
|
375
|
+
model = self.create_model(params, input_shape, self.target_type, num_class)
|
|
376
|
+
model.target_type = self.target_type
|
|
285
377
|
|
|
378
|
+
# Compile the model
|
|
286
379
|
loss = (
|
|
287
|
-
|
|
288
|
-
if
|
|
289
|
-
else (
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
else ("logloss" if num_class <= 2 else "mlogloss")
|
|
380
|
+
rmse_tf
|
|
381
|
+
if self.target_type == "regression"
|
|
382
|
+
else (
|
|
383
|
+
BinaryCrossentropy(from_logits=False)
|
|
384
|
+
if num_class <= 2
|
|
385
|
+
else CategoricalCrossentropy(from_logits=False)
|
|
386
|
+
)
|
|
295
387
|
)
|
|
296
|
-
|
|
297
|
-
params=
|
|
298
|
-
**params["model_params"],
|
|
299
|
-
"objective": loss,
|
|
300
|
-
"eval_metric": eval_metric,
|
|
301
|
-
"num_class": num_class,
|
|
302
|
-
},
|
|
303
|
-
num_boost_round=params["num_boost_round"],
|
|
304
|
-
dtrain=train_data,
|
|
305
|
-
evals=[(val_data, "val"), (train_data, "train")],
|
|
306
|
-
callbacks=[
|
|
307
|
-
xgb.callback.EarlyStopping(
|
|
308
|
-
rounds=params["early_stopping_rounds"], save_best=True
|
|
309
|
-
),
|
|
310
|
-
xgb.callback.EvaluationMonitor(), # This shows evaluation results at each iteration
|
|
311
|
-
tensorboard_callback,
|
|
312
|
-
],
|
|
313
|
-
evals_result=evals_result, # Record evaluation result
|
|
314
|
-
verbose_eval=0,
|
|
388
|
+
optimizer = Adam(
|
|
389
|
+
learning_rate=params["learning_rate"], clipnorm=params["clipnorm"]
|
|
315
390
|
)
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
# Plot loss per epoch
|
|
325
|
-
train_loss = evals_result["train"][eval_metric]
|
|
326
|
-
val_loss = evals_result["val"][eval_metric]
|
|
327
|
-
logs = pd.DataFrame({"train": train_loss, "val": val_loss})
|
|
328
|
-
|
|
329
|
-
plt.figure(figsize=(14, 4))
|
|
330
|
-
plt.plot(logs.loc[:, "train"], lw=2, label="Training loss")
|
|
331
|
-
plt.plot(logs.loc[:, "val"], lw=2, label="Validation loss")
|
|
332
|
-
plt.xlabel("Epoch")
|
|
333
|
-
plt.ylabel("Loss")
|
|
334
|
-
plt.legend()
|
|
335
|
-
plt.show()
|
|
336
|
-
|
|
337
|
-
return model
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
def fit_recurrent(x_train, y_train, x_val, y_val, create_model, params, config):
|
|
341
|
-
|
|
342
|
-
# Create the model
|
|
343
|
-
labels = np.unique(y_train[:, 0])
|
|
344
|
-
num_class = labels.size if _target_type == "classification" else None
|
|
345
|
-
input_shape = (x_train.shape[1], x_train.shape[2])
|
|
346
|
-
model = create_model(params, input_shape, _target_type, num_class)
|
|
347
|
-
|
|
348
|
-
# Compile the model
|
|
349
|
-
loss = (
|
|
350
|
-
rmse_tf
|
|
351
|
-
if _target_type == "regression"
|
|
352
|
-
else (
|
|
353
|
-
BinaryCrossentropy(from_logits=False)
|
|
354
|
-
if num_class <= 2
|
|
355
|
-
else CategoricalCrossentropy(from_logits=False)
|
|
391
|
+
metrics = (
|
|
392
|
+
[mae_tf]
|
|
393
|
+
if self.target_type == "regression"
|
|
394
|
+
else (
|
|
395
|
+
["accuracy", Precision(), Recall()]
|
|
396
|
+
if num_class <= 2
|
|
397
|
+
else ["categorical_accuracy"]
|
|
398
|
+
)
|
|
356
399
|
)
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
|
|
400
|
+
model.compile(optimizer=optimizer, loss=loss, metrics=metrics)
|
|
401
|
+
|
|
402
|
+
# Callbacks
|
|
403
|
+
tensorboard_callback = TensorBoard(log_dir=self.log_dir)
|
|
404
|
+
early_stopping_callback = EarlyStopping(
|
|
405
|
+
monitor="val_loss",
|
|
406
|
+
patience=3,
|
|
407
|
+
restore_best_weights=True,
|
|
408
|
+
start_from_epoch=5,
|
|
366
409
|
)
|
|
367
|
-
)
|
|
368
|
-
model.compile(optimizer=optimizer, loss=loss, metrics=metrics)
|
|
369
|
-
|
|
370
|
-
# Callbacks
|
|
371
|
-
log_dir = get_log_dir(_training_target_dir, model.model_name)
|
|
372
|
-
|
|
373
|
-
tensorboard_callback = TensorBoard(log_dir=log_dir)
|
|
374
|
-
early_stopping_callback = EarlyStopping(
|
|
375
|
-
monitor="val_loss", patience=3, restore_best_weights=True, start_from_epoch=5
|
|
376
|
-
)
|
|
377
|
-
|
|
378
|
-
# Custom callbacks
|
|
379
|
-
class PrintTrainableWeights(keras.callbacks.Callback):
|
|
380
|
-
def on_epoch_end(self, epoch, logs={}):
|
|
381
|
-
logger.info(model.trainable_variables)
|
|
382
|
-
|
|
383
|
-
class GradientCalcCallback(keras.callbacks.Callback):
|
|
384
|
-
def __init__(self):
|
|
385
|
-
self.epoch_gradient = []
|
|
386
|
-
|
|
387
|
-
def get_gradient_func(self, model):
|
|
388
|
-
# grads = K.gradients(model.total_loss, model.trainable_weights)
|
|
389
|
-
grads = K.gradients(model.loss, model.trainable_weights)
|
|
390
|
-
# inputs = model.model.inputs + model.targets + model.sample_weights
|
|
391
|
-
# use below line of code if above line doesn't work for you
|
|
392
|
-
# inputs = model.model._feed_inputs + model.model._feed_targets + model.model._feed_sample_weights
|
|
393
|
-
inputs = (
|
|
394
|
-
model._feed_inputs + model._feed_targets + model._feed_sample_weights
|
|
395
|
-
)
|
|
396
|
-
func = K.function(inputs, grads)
|
|
397
|
-
return func
|
|
398
|
-
|
|
399
|
-
def on_epoch_end(self, epoch, logs=None):
|
|
400
|
-
get_gradient = self.get_gradient_func(model)
|
|
401
|
-
grads = get_gradient([x_val, y_val[:, 0], np.ones(len(y_val[:, 0]))])
|
|
402
|
-
self.epoch_gradient.append(grads)
|
|
403
|
-
|
|
404
|
-
# Train the model
|
|
405
|
-
if _target_type == "classification" and num_class > 2:
|
|
406
|
-
lb = LabelBinarizer(sparse_output=False) # Change to True for sparse matrix
|
|
407
|
-
lb.fit(labels)
|
|
408
|
-
y_train = lb.transform(y_train[:, 0].flatten())
|
|
409
|
-
y_val = lb.transform(y_val[:, 0].flatten())
|
|
410
|
-
else:
|
|
411
|
-
y_train = y_train[:, 0].flatten()
|
|
412
|
-
y_val = y_val[:, 0].flatten()
|
|
413
|
-
|
|
414
|
-
logger.info("Fitting the model...")
|
|
415
|
-
logger.info(f"x_train shape: {x_train.shape}, x_val shape: {x_val.shape}")
|
|
416
|
-
logger.info(f"y_train shape: {y_train.shape}, y_val shape: {y_val.shape}")
|
|
417
|
-
|
|
418
|
-
history = model.fit(
|
|
419
|
-
x_train,
|
|
420
|
-
y_train,
|
|
421
|
-
batch_size=params["batch_size"],
|
|
422
|
-
verbose=0,
|
|
423
|
-
epochs=params["epochs"],
|
|
424
|
-
shuffle=False,
|
|
425
|
-
validation_data=(x_val, y_val),
|
|
426
|
-
callbacks=[early_stopping_callback, tensorboard_callback],
|
|
427
|
-
)
|
|
428
410
|
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
|
|
438
|
-
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
|
|
442
|
-
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
model, data: pd.DataFrame, target_type: str, config: dict, threshold: float = 0.5
|
|
449
|
-
):
|
|
450
|
-
"""Function to get prediction from model. Support sklearn, keras and boosting models such as xgboost and lgboost
|
|
451
|
-
|
|
452
|
-
Args:
|
|
453
|
-
- model: the train model to predict value
|
|
454
|
-
- data: the data for prediction
|
|
455
|
-
- target_type: classification or regression
|
|
456
|
-
- config: dict containing model config
|
|
457
|
-
"""
|
|
458
|
-
if config["recurrent"] or model.model_name in ["lgb", "xgb"]:
|
|
459
|
-
# keras, lgb & xgb
|
|
460
|
-
if model.model_name == "lgb":
|
|
461
|
-
# Direct prediction for LightGBM
|
|
462
|
-
pred = model.predict(data)
|
|
463
|
-
elif model.model_name == "xgb":
|
|
464
|
-
# Convert val_data to DMatrix for XGBoost
|
|
465
|
-
d_data = xgb.DMatrix(data)
|
|
466
|
-
pred = model.predict(d_data)
|
|
467
|
-
else:
|
|
468
|
-
# Reshape (flatten) for keras if not multiclass
|
|
469
|
-
pred = model.predict(data)
|
|
470
|
-
if pred.shape[1] == 1:
|
|
471
|
-
pred = pred.reshape(-1)
|
|
472
|
-
|
|
473
|
-
if target_type == "classification":
|
|
474
|
-
num_class = pred.shape[1] if len(pred.shape) > 1 else 2
|
|
475
|
-
|
|
476
|
-
if num_class <= 2:
|
|
477
|
-
# For binary classification, concatenate the predicted probabilities for both classes
|
|
478
|
-
pred_df = pd.DataFrame(
|
|
479
|
-
{
|
|
480
|
-
0: 1 - pred, # Probability of class 0
|
|
481
|
-
1: pred, # Probability of class 1
|
|
482
|
-
},
|
|
411
|
+
# Custom callbacks
|
|
412
|
+
class PrintTrainableWeights(keras.callbacks.Callback):
|
|
413
|
+
def on_epoch_end(self, epoch, logs={}):
|
|
414
|
+
logger.info(model.trainable_variables)
|
|
415
|
+
|
|
416
|
+
class GradientCalcCallback(keras.callbacks.Callback):
|
|
417
|
+
def __init__(self):
|
|
418
|
+
self.epoch_gradient = []
|
|
419
|
+
|
|
420
|
+
def get_gradient_func(self, model):
|
|
421
|
+
# grads = K.gradients(model.total_loss, model.trainable_weights)
|
|
422
|
+
grads = K.gradients(model.loss, model.trainable_weights)
|
|
423
|
+
# inputs = model.model.inputs + model.targets + model.sample_weights
|
|
424
|
+
# use below line of code if above line doesn't work for you
|
|
425
|
+
# inputs = model.model._feed_inputs + model.model._feed_targets + model.model._feed_sample_weights
|
|
426
|
+
inputs = (
|
|
427
|
+
model._feed_inputs
|
|
428
|
+
+ model._feed_targets
|
|
429
|
+
+ model._feed_sample_weights
|
|
483
430
|
)
|
|
484
|
-
|
|
485
|
-
|
|
486
|
-
pred_df = pd.DataFrame(pred, columns=range(num_class))
|
|
487
|
-
|
|
488
|
-
# Get final predictions (argmax for multi-class, threshold for binary)
|
|
489
|
-
if num_class == 2:
|
|
490
|
-
pred_df["PRED"] = np.where(
|
|
491
|
-
pred_df[1] >= threshold, 1, 0
|
|
492
|
-
) # Class 1 if prob >= threshold
|
|
493
|
-
else:
|
|
494
|
-
pred_df["PRED"] = pred_df.idxmax(
|
|
495
|
-
axis=1
|
|
496
|
-
) # Class with highest probability for multiclasses
|
|
431
|
+
func = K.function(inputs, grads)
|
|
432
|
+
return func
|
|
497
433
|
|
|
498
|
-
|
|
499
|
-
|
|
434
|
+
def on_epoch_end(self, epoch, logs=None):
|
|
435
|
+
get_gradient = self.get_gradient_func(model)
|
|
436
|
+
grads = get_gradient([x_val, y_val[:, 0], np.ones(len(y_val[:, 0]))])
|
|
437
|
+
self.epoch_gradient.append(grads)
|
|
500
438
|
|
|
439
|
+
# Train the model
|
|
440
|
+
if self.target_type == "classification" and num_class > 2:
|
|
441
|
+
lb = LabelBinarizer(sparse_output=False) # Change to True for sparse matrix
|
|
442
|
+
lb.fit(labels)
|
|
443
|
+
y_train = lb.transform(y_train[:, 0].flatten())
|
|
444
|
+
y_val = lb.transform(y_val[:, 0].flatten())
|
|
501
445
|
else:
|
|
502
|
-
|
|
446
|
+
y_train = y_train[:, 0].flatten()
|
|
447
|
+
y_val = y_val[:, 0].flatten()
|
|
448
|
+
|
|
449
|
+
logger.info("Fitting the model...")
|
|
450
|
+
logger.info(f"x_train shape: {x_train.shape}, x_val shape: {x_val.shape}")
|
|
451
|
+
logger.info(f"y_train shape: {y_train.shape}, y_val shape: {y_val.shape}")
|
|
452
|
+
|
|
453
|
+
history = model.fit(
|
|
454
|
+
x_train,
|
|
455
|
+
y_train,
|
|
456
|
+
batch_size=params["batch_size"],
|
|
457
|
+
verbose=0,
|
|
458
|
+
epochs=params["epochs"],
|
|
459
|
+
shuffle=False,
|
|
460
|
+
validation_data=(x_val, y_val),
|
|
461
|
+
callbacks=[early_stopping_callback, tensorboard_callback],
|
|
462
|
+
)
|
|
503
463
|
|
|
504
|
-
|
|
505
|
-
|
|
506
|
-
pred.index = data.index
|
|
507
|
-
else:
|
|
508
|
-
# sk learn
|
|
509
|
-
pred = pd.Series(model.predict(data), index=data.index, name="PRED")
|
|
510
|
-
if target_type == "classification":
|
|
511
|
-
pred_proba = pd.DataFrame(
|
|
512
|
-
model.predict_proba(data),
|
|
513
|
-
index=data.index,
|
|
514
|
-
columns=[
|
|
515
|
-
int(c) if isinstance(c, float) and c.is_integer() else c
|
|
516
|
-
for c in model.classes_
|
|
517
|
-
],
|
|
518
|
-
)
|
|
464
|
+
logger.info(f"Successfully created a {model.model_name} at {datetime.now()}")
|
|
465
|
+
# logger.info(pd.DataFrame(gradiant.epoch_gradient))
|
|
519
466
|
|
|
520
|
-
|
|
521
|
-
|
|
522
|
-
|
|
523
|
-
pred = (pred_proba[positive_class] >= threshold).astype(int)
|
|
524
|
-
pred.name = "PRED"
|
|
467
|
+
if self.plot:
|
|
468
|
+
# Plot loss per epoch
|
|
469
|
+
logs = pd.DataFrame(history.history)
|
|
525
470
|
|
|
526
|
-
|
|
471
|
+
plt.figure(figsize=(14, 4))
|
|
472
|
+
plt.plot(logs.loc[:, "loss"], lw=2, label="Training loss")
|
|
473
|
+
plt.plot(logs.loc[:, "val_loss"], lw=2, label="Validation loss")
|
|
474
|
+
plt.xlabel("Epoch")
|
|
475
|
+
plt.ylabel("Loss")
|
|
476
|
+
plt.legend()
|
|
477
|
+
plt.show()
|
|
527
478
|
|
|
528
|
-
|
|
479
|
+
self._model = model
|
|
529
480
|
|
|
481
|
+
return model
|
|
530
482
|
|
|
531
|
-
def
|
|
532
|
-
|
|
533
|
-
|
|
483
|
+
def predict(
|
|
484
|
+
self,
|
|
485
|
+
data: pd.DataFrame,
|
|
486
|
+
threshold: float = 0.5,
|
|
487
|
+
):
|
|
488
|
+
"""Function to get prediction from model. Support sklearn, keras and boosting models such as xgboost and lgboost
|
|
489
|
+
|
|
490
|
+
Args:
|
|
491
|
+
- data: the data for prediction
|
|
492
|
+
- threshold: the threshold for classification
|
|
493
|
+
"""
|
|
494
|
+
if not self._model:
|
|
495
|
+
raise Exception(
|
|
496
|
+
"Model is not fitted, cannot predict, run model.fit() first, or pass a fitted model when creating the Model object to the `model` parameter."
|
|
497
|
+
)
|
|
498
|
+
model = self._model
|
|
499
|
+
|
|
500
|
+
if self.threshold and threshold == 0.5:
|
|
501
|
+
threshold = self.threshold
|
|
502
|
+
|
|
503
|
+
if self.recurrent or model.model_name in ["lgb", "xgb"]:
|
|
504
|
+
# keras, lgb & xgb
|
|
505
|
+
if model.model_name == "lgb":
|
|
506
|
+
# Direct prediction for LightGBM
|
|
507
|
+
pred = model.predict(data)
|
|
508
|
+
elif model.model_name == "xgb":
|
|
509
|
+
# Convert val_data to DMatrix for XGBoost
|
|
510
|
+
d_data = xgb.DMatrix(data)
|
|
511
|
+
pred = model.predict(d_data)
|
|
512
|
+
else:
|
|
513
|
+
# Reshape (flatten) for keras if not multiclass
|
|
514
|
+
pred = model.predict(data)
|
|
515
|
+
if pred.shape[1] == 1:
|
|
516
|
+
pred = pred.reshape(-1)
|
|
517
|
+
|
|
518
|
+
if self.target_type == "classification":
|
|
519
|
+
num_class = pred.shape[1] if len(pred.shape) > 1 else 2
|
|
520
|
+
|
|
521
|
+
if num_class <= 2:
|
|
522
|
+
# For binary classification, concatenate the predicted probabilities for both classes
|
|
523
|
+
pred_df = pd.DataFrame(
|
|
524
|
+
{
|
|
525
|
+
0: 1 - pred, # Probability of class 0
|
|
526
|
+
1: pred, # Probability of class 1
|
|
527
|
+
},
|
|
528
|
+
)
|
|
529
|
+
else:
|
|
530
|
+
# For multi-class classification, use the predicted probabilities for each class
|
|
531
|
+
pred_df = pd.DataFrame(pred, columns=range(num_class))
|
|
532
|
+
|
|
533
|
+
# Get final predictions (argmax for multi-class, threshold for binary)
|
|
534
|
+
if num_class == 2:
|
|
535
|
+
pred_df["PRED"] = np.where(
|
|
536
|
+
pred_df[1] >= threshold, 1, 0
|
|
537
|
+
) # Class 1 if prob >= threshold
|
|
538
|
+
else:
|
|
539
|
+
pred_df["PRED"] = pred_df.idxmax(
|
|
540
|
+
axis=1
|
|
541
|
+
) # Class with highest probability for multiclasses
|
|
534
542
|
|
|
535
|
-
|
|
536
|
-
|
|
537
|
-
- target_type: classification or regression
|
|
538
|
-
"""
|
|
539
|
-
score = {}
|
|
540
|
-
y_true = prediction["TARGET"]
|
|
541
|
-
y_pred = prediction["PRED"]
|
|
543
|
+
# Reorder columns to show predicted class first, then probabilities
|
|
544
|
+
pred = pred_df[["PRED"] + list(range(num_class))]
|
|
542
545
|
|
|
543
|
-
|
|
544
|
-
|
|
545
|
-
score["RMSE"] = root_mean_squared_error(y_true, y_pred)
|
|
546
|
-
score["MAE"] = mean_absolute_error(y_true, y_pred)
|
|
547
|
-
score["MAPE"] = mean_absolute_percentage_error(y_true, y_pred)
|
|
548
|
-
score["R2"] = r2_score(y_true, y_pred)
|
|
546
|
+
else:
|
|
547
|
+
pred = pd.Series(pred, name="PRED")
|
|
549
548
|
|
|
550
|
-
|
|
551
|
-
|
|
552
|
-
|
|
553
|
-
|
|
549
|
+
# set index for lgb and xgb (for keras, as we use np array, we need to set index outside)
|
|
550
|
+
if model.model_name in ["lgb", "xgb"]:
|
|
551
|
+
pred.index = data.index
|
|
552
|
+
else:
|
|
553
|
+
# sk learn
|
|
554
|
+
pred = pd.Series(model.predict(data), index=data.index, name="PRED")
|
|
555
|
+
if self.target_type == "classification":
|
|
556
|
+
pred_proba = pd.DataFrame(
|
|
557
|
+
model.predict_proba(data),
|
|
558
|
+
index=data.index,
|
|
559
|
+
columns=[
|
|
560
|
+
int(c) if isinstance(c, float) and c.is_integer() else c
|
|
561
|
+
for c in model.classes_
|
|
562
|
+
],
|
|
563
|
+
)
|
|
554
564
|
|
|
555
|
-
|
|
556
|
-
|
|
557
|
-
|
|
558
|
-
|
|
565
|
+
# Apply threshold for binary classification
|
|
566
|
+
if len(model.classes_) == 2:
|
|
567
|
+
positive_class = model.classes_[1] # Assuming classes are ordered
|
|
568
|
+
pred = (pred_proba[positive_class] >= threshold).astype(int)
|
|
569
|
+
pred.name = "PRED"
|
|
559
570
|
|
|
560
|
-
|
|
561
|
-
mam = (y_true - mean_target).abs().median() # Median Abs around Mean
|
|
562
|
-
mad = (y_true - median_target).abs().median() # Median Abs around Median
|
|
563
|
-
score["MAM"] = mam
|
|
564
|
-
score["MAD"] = mad
|
|
565
|
-
score["MAE_MAM_RATIO"] = (
|
|
566
|
-
float(100 * score["MAE"] / mam) if mam else 1000
|
|
567
|
-
) # MAE / MAD → Plus stable, moins sensible aux outliers.
|
|
568
|
-
score["MAE_MAD_RATIO"] = (
|
|
569
|
-
float(100 * score["MAE"] / mad) if mad else 1000
|
|
570
|
-
) # MAE / Médiane des écarts absolus autour de la moyenne: Moins robuste aux outliers
|
|
571
|
+
pred = pd.concat([pred, pred_proba], axis=1)
|
|
571
572
|
|
|
572
|
-
|
|
573
|
+
return pred
|
|
573
574
|
|
|
574
|
-
|
|
575
|
-
|
|
576
|
-
|
|
577
|
-
|
|
575
|
+
def save(self, path):
|
|
576
|
+
if self.recurrent:
|
|
577
|
+
path += "/" + self.model_name + ".keras"
|
|
578
|
+
self._model.save(path)
|
|
579
|
+
else:
|
|
580
|
+
path += "/" + self.model_name + ".best"
|
|
581
|
+
joblib.dump(self._model, path)
|
|
582
|
+
self.path = path
|
|
583
|
+
return path
|
|
584
|
+
|
|
585
|
+
def load(self):
|
|
586
|
+
if not self.path:
|
|
587
|
+
raise ValueError("Path is not set, cannot load model")
|
|
588
|
+
|
|
589
|
+
training_target_dir = Path(self.path)
|
|
590
|
+
|
|
591
|
+
# Load threshold
|
|
592
|
+
scores_tracking = pd.read_csv(f"{training_target_dir}/scores_tracking.csv")
|
|
593
|
+
self.threshold = (
|
|
594
|
+
scores_tracking["THRESHOLD"].values[0]
|
|
595
|
+
if "THRESHOLD" in scores_tracking.columns
|
|
596
|
+
else None
|
|
578
597
|
)
|
|
579
|
-
if num_classes > 2:
|
|
580
|
-
lb = LabelBinarizer(sparse_output=False) # Change to True for sparse matrix
|
|
581
|
-
lb.fit(labels)
|
|
582
|
-
y_true_onhot = lb.transform(y_true)
|
|
583
|
-
y_pred_onehot = lb.transform(y_pred)
|
|
584
598
|
|
|
585
|
-
|
|
586
|
-
|
|
587
|
-
|
|
588
|
-
y_true,
|
|
589
|
-
y_pred,
|
|
590
|
-
average=("binary" if num_classes == 2 else "macro"),
|
|
591
|
-
)
|
|
592
|
-
score["RECALL"] = recall_score(
|
|
593
|
-
y_true,
|
|
594
|
-
y_pred,
|
|
595
|
-
average=("binary" if num_classes == 2 else "macro"),
|
|
596
|
-
)
|
|
597
|
-
score["F1"] = f1_score(
|
|
598
|
-
y_true,
|
|
599
|
-
y_pred,
|
|
600
|
-
average=("binary" if num_classes == 2 else "macro"),
|
|
601
|
-
)
|
|
602
|
-
score["ROC_AUC"] = float(roc_auc_score(y_true, y_pred_proba, multi_class="ovr"))
|
|
603
|
-
(
|
|
604
|
-
score["THRESHOLD"],
|
|
605
|
-
score["PRECISION_AT_THRESHOLD"],
|
|
606
|
-
score["RECALL_AT_THRESHOLD"],
|
|
607
|
-
) = (
|
|
608
|
-
find_best_precision_threshold(prediction)
|
|
609
|
-
if num_classes == 2
|
|
610
|
-
else (None, None, None)
|
|
599
|
+
# Search for files that contain '.best' or '.keras' in the name
|
|
600
|
+
best_files = list(training_target_dir.glob("*.best*")) + list(
|
|
601
|
+
training_target_dir.glob("*.keras*")
|
|
611
602
|
)
|
|
612
|
-
|
|
603
|
+
# If any files are found, try loading the first one (or process as needed)
|
|
604
|
+
if best_files:
|
|
605
|
+
file_path = best_files[
|
|
606
|
+
0
|
|
607
|
+
] # Assuming you want to open the first matching file
|
|
608
|
+
try:
|
|
609
|
+
# Attempt to load the file as a scikit-learn, XGBoost, or LightGBM model (Pickle format)
|
|
610
|
+
self._model = joblib.load(file_path)
|
|
611
|
+
logger.info(
|
|
612
|
+
f"Loaded model {self._model.model_name} and threshold {self.threshold}"
|
|
613
|
+
)
|
|
614
|
+
except (pickle.UnpicklingError, EOFError):
|
|
615
|
+
# If it's not a pickle file, try loading it as a Keras model
|
|
616
|
+
try:
|
|
617
|
+
# Attempt to load the file as a Keras model
|
|
618
|
+
self._model = keras.models.load_model(file_path)
|
|
619
|
+
logger.info(
|
|
620
|
+
f"Loaded model {self._model.model_name} and threshold {self.threshold}"
|
|
621
|
+
)
|
|
622
|
+
except Exception as e:
|
|
623
|
+
raise FileNotFoundError(
|
|
624
|
+
f"Model could not be loaded from path: {file_path}: {e}"
|
|
625
|
+
)
|
|
626
|
+
else:
|
|
627
|
+
raise FileNotFoundError(
|
|
628
|
+
f"No files with '.best' or '.keras' found in the specified folder: {training_target_dir}"
|
|
629
|
+
)
|
|
613
630
|
|
|
631
|
+
self.model_name = self._model.model_name
|
|
632
|
+
self.target_type = self._model.target_type
|
|
614
633
|
|
|
615
|
-
def train_model(params, x_train, y_train, x_val, y_val, config):
|
|
616
|
-
if "_type_name" in config.keys() and config["_type_name"] == "hyperopts":
|
|
617
|
-
global _target_number
|
|
618
|
-
global _target_type
|
|
619
|
-
global _session_name
|
|
620
|
-
global _plot
|
|
621
|
-
global _type_name
|
|
622
|
-
global _scaler_y
|
|
623
|
-
global _training_target_dir
|
|
624
|
-
_target_number = config["_target_number"]
|
|
625
|
-
_target_type = config["_target_type"]
|
|
626
|
-
_session_name = config["_session_name"]
|
|
627
|
-
_plot = config["_plot"]
|
|
628
|
-
_type_name = config["_type_name"]
|
|
629
|
-
_scaler_y = config["_scaler_y"]
|
|
630
|
-
_training_target_dir = config["_training_target_dir"]
|
|
631
|
-
|
|
632
|
-
# warnings.filterwarnings("ignore", category=UserWarning, module="pydantic")
|
|
633
|
-
# logging.getLogger("ray").setLevel(logging.CRITICAL)
|
|
634
|
-
# logging.getLogger("ray.tune").setLevel(logging.CRITICAL)
|
|
635
|
-
# logging.getLogger("ray.raylet").setLevel(logging.CRITICAL)
|
|
636
|
-
# logging.getLogger("raylet").setLevel(logging.CRITICAL)
|
|
637
634
|
|
|
638
|
-
|
|
639
|
-
|
|
635
|
+
def trainable(
|
|
636
|
+
params,
|
|
637
|
+
x_train,
|
|
638
|
+
y_train,
|
|
639
|
+
x_val,
|
|
640
|
+
y_val,
|
|
641
|
+
model_name,
|
|
642
|
+
target_type,
|
|
643
|
+
session_name,
|
|
644
|
+
target_number,
|
|
645
|
+
create_model,
|
|
646
|
+
type_name="hyperopts",
|
|
647
|
+
plot=False,
|
|
648
|
+
):
|
|
649
|
+
"""Standalone version of train_model that doesn't depend on self"""
|
|
650
|
+
# Create model engine
|
|
651
|
+
model = ModelEngine(
|
|
652
|
+
model_name=model_name,
|
|
653
|
+
target_type=target_type,
|
|
654
|
+
create_model=create_model,
|
|
655
|
+
plot=plot,
|
|
640
656
|
)
|
|
641
657
|
|
|
642
|
-
|
|
643
|
-
|
|
658
|
+
logger.info(
|
|
659
|
+
f"TARGET_{target_number} - Training a {model.model_name} at {datetime.now()} : {session_name}, TARGET_{target_number}"
|
|
660
|
+
)
|
|
644
661
|
|
|
645
|
-
if recurrent:
|
|
662
|
+
if model.recurrent:
|
|
646
663
|
timesteps = params["timesteps"]
|
|
647
664
|
x_train = x_train[:, -timesteps:, :]
|
|
648
665
|
x_val = x_val[:, -timesteps:, :]
|
|
649
666
|
|
|
650
667
|
# Compile and fit model on train set
|
|
651
668
|
start = time.time()
|
|
652
|
-
|
|
653
|
-
fit = fit_recurrent
|
|
654
|
-
elif (create_model == "lgb") or (create_model == "xgb"):
|
|
655
|
-
fit = fit_boosting
|
|
656
|
-
else:
|
|
657
|
-
fit = fit_sklearn
|
|
658
|
-
model = fit(
|
|
659
|
-
x_train,
|
|
660
|
-
y_train,
|
|
661
|
-
x_val,
|
|
662
|
-
y_val,
|
|
663
|
-
create_model,
|
|
664
|
-
params=params,
|
|
665
|
-
config=config,
|
|
666
|
-
)
|
|
669
|
+
model.fit(x_train, y_train, x_val, y_val, params)
|
|
667
670
|
stop = time.time()
|
|
668
671
|
|
|
669
672
|
# Prediction on val set
|
|
670
|
-
y_pred = predict(
|
|
673
|
+
y_pred = model.predict(x_val)
|
|
671
674
|
|
|
672
675
|
# fix for recurrent model because x_val has no index as it is a 3D np array
|
|
673
|
-
if
|
|
676
|
+
if model.recurrent:
|
|
674
677
|
y_val = pd.DataFrame(y_val, columns=["TARGET", "index"]).set_index("index")
|
|
675
678
|
y_pred.index = y_val.index
|
|
676
679
|
|
|
677
680
|
prediction = pd.concat([y_val, y_pred], axis=1)
|
|
678
681
|
|
|
679
682
|
# Unscale the data
|
|
680
|
-
if
|
|
683
|
+
if (
|
|
684
|
+
model.need_scaling
|
|
685
|
+
and model.target_type == "regression"
|
|
686
|
+
and model.scaler_y is not None
|
|
687
|
+
):
|
|
681
688
|
# scaler_y needs 2D array with shape (-1, 1)
|
|
682
|
-
prediction.loc[:, "TARGET"] =
|
|
689
|
+
prediction.loc[:, "TARGET"] = model.scaler_y.inverse_transform(
|
|
683
690
|
prediction[["TARGET"]].values
|
|
684
691
|
)
|
|
685
|
-
prediction.loc[:, "PRED"] =
|
|
692
|
+
prediction.loc[:, "PRED"] = model.scaler_y.inverse_transform(
|
|
686
693
|
prediction[["PRED"]].values
|
|
687
694
|
)
|
|
688
695
|
|
|
689
696
|
# Evaluate model
|
|
690
697
|
score = {
|
|
691
698
|
"DATE": datetime.now(),
|
|
692
|
-
"SESSION":
|
|
699
|
+
"SESSION": session_name,
|
|
693
700
|
"TRAIN_DATA": x_train.shape[0],
|
|
694
701
|
"VAL_DATA": x_val.shape[0],
|
|
695
702
|
"FEATURES": x_train.shape[-1],
|
|
696
703
|
"MODEL_NAME": model.model_name,
|
|
697
|
-
"TYPE":
|
|
704
|
+
"TYPE": type_name,
|
|
698
705
|
"TRAINING_TIME": stop - start,
|
|
699
706
|
"EVAL_DATA_STD": prediction["TARGET"].std(),
|
|
700
707
|
}
|
|
701
708
|
|
|
702
|
-
score.update(evaluate(prediction,
|
|
709
|
+
score.update(evaluate(prediction, target_type))
|
|
703
710
|
|
|
704
|
-
if
|
|
711
|
+
if type_name == "hyperopts":
|
|
705
712
|
session.report(metrics=score)
|
|
706
|
-
ray.tune.report(metrics=score)
|
|
707
713
|
return score
|
|
708
714
|
|
|
709
715
|
return score, model, prediction
|
|
710
716
|
|
|
711
717
|
|
|
712
|
-
|
|
713
|
-
|
|
714
|
-
|
|
715
|
-
|
|
716
|
-
|
|
717
|
-
|
|
718
|
-
|
|
719
|
-
|
|
720
|
-
|
|
721
|
-
|
|
722
|
-
|
|
723
|
-
|
|
724
|
-
|
|
725
|
-
|
|
726
|
-
):
|
|
727
|
-
|
|
728
|
-
|
|
729
|
-
|
|
730
|
-
|
|
731
|
-
|
|
732
|
-
models_idx
|
|
733
|
-
|
|
734
|
-
|
|
735
|
-
|
|
736
|
-
|
|
737
|
-
|
|
738
|
-
|
|
739
|
-
|
|
740
|
-
session_name (str):
|
|
741
|
-
A name for the current session, used to organize and store results
|
|
742
|
-
(e.g., logs, metrics, trained models) in a session-specific directory.
|
|
743
|
-
|
|
744
|
-
perform_hyperoptimization (bool, optional):
|
|
745
|
-
Whether to perform hyperparameter optimization for the models.
|
|
746
|
-
If `True`, the function will attempt to tune the hyperparameters of each model.
|
|
747
|
-
Defaults to `True`.
|
|
748
|
-
|
|
749
|
-
perform_crossval (bool, optional):
|
|
750
|
-
Whether to perform cross-validation to evaluate model performance.
|
|
751
|
-
If `True`, the function will use cross-validation to compute metrics.
|
|
752
|
-
Defaults to `True`.
|
|
753
|
-
|
|
754
|
-
number_of_trials (int, optional):
|
|
755
|
-
The number of trials to run for hyperparameter optimization.
|
|
756
|
-
Ignored if `perform_hyperoptimization` is `False`.
|
|
757
|
-
Defaults to `20`.
|
|
758
|
-
|
|
759
|
-
plot (bool, optional):
|
|
760
|
-
Whether to enable plotting during the process.
|
|
761
|
-
If `True`, plot will be displayed.
|
|
762
|
-
Defaults to `True`.
|
|
763
|
-
|
|
764
|
-
clean_dir (bool, optional):
|
|
765
|
-
Whether to clean the entire target training directory before starting the process.
|
|
766
|
-
If `True`, any existing files in the target training directory will be removed.
|
|
767
|
-
Defaults to `False`.
|
|
768
|
-
|
|
769
|
-
preserve_model (bool, optional):
|
|
770
|
-
Whether to run the search even if there is already a best model in the directory.
|
|
771
|
-
If `False`, previous best models won't be erased and the search will be skipped.
|
|
772
|
-
Defaults to `False`.
|
|
773
|
-
|
|
774
|
-
Returns:
|
|
775
|
-
None
|
|
776
|
-
The function runs the model selection process and outputs results
|
|
777
|
-
(e.g., logs, metrics, and optionally models) to the session directory.
|
|
778
|
-
"""
|
|
779
|
-
global _target_number
|
|
780
|
-
global _target_type
|
|
781
|
-
global _session_name
|
|
782
|
-
global _plot
|
|
783
|
-
global _type_name
|
|
784
|
-
global _scaler_y
|
|
785
|
-
global _training_target_dir
|
|
786
|
-
|
|
787
|
-
global_vars = [
|
|
788
|
-
"_target_number",
|
|
789
|
-
"_target_type",
|
|
790
|
-
"_session_name",
|
|
791
|
-
"_plot",
|
|
792
|
-
"_type_name",
|
|
793
|
-
"_scaler_y",
|
|
794
|
-
"_training_target_dir",
|
|
795
|
-
]
|
|
796
|
-
|
|
797
|
-
_target_number = target_number
|
|
798
|
-
_target_type = "classification" if target_number in TARGETS_CLF else "regression"
|
|
799
|
-
_session_name = session_name
|
|
800
|
-
_plot = plot
|
|
801
|
-
|
|
802
|
-
if dataset_id is None:
|
|
803
|
-
raise ValueError("dataset_id is not provided.")
|
|
804
|
-
|
|
805
|
-
dataset = Dataset.get(dataset_id)
|
|
806
|
-
dataset_dir = dataset.path
|
|
807
|
-
|
|
808
|
-
training_target_dir = f"{dataset_dir}/TARGET_{_target_number}"
|
|
809
|
-
_training_target_dir = training_target_dir
|
|
810
|
-
|
|
811
|
-
metric = "RMSE" if _target_type == "regression" else "LOGLOSS"
|
|
812
|
-
|
|
813
|
-
# load features, scalers and data
|
|
814
|
-
features = dataset.get_features(target_number)
|
|
815
|
-
all_features = dataset.get_all_features()
|
|
816
|
-
|
|
817
|
-
if data:
|
|
818
|
-
train = data["train"]
|
|
819
|
-
val = data["val"]
|
|
820
|
-
train_scaled = data["train_scaled"]
|
|
821
|
-
val_scaled = data["val_scaled"]
|
|
822
|
-
_scaler_y = (
|
|
823
|
-
data["scalers_y"][f"scaler_y_{target_number}"]
|
|
824
|
-
if _target_type == "regression"
|
|
825
|
-
else None
|
|
718
|
+
class ModelSelectionEngine:
|
|
719
|
+
|
|
720
|
+
def __init__(
|
|
721
|
+
self,
|
|
722
|
+
data,
|
|
723
|
+
reshaped_data,
|
|
724
|
+
target_number,
|
|
725
|
+
target_clf,
|
|
726
|
+
dataset,
|
|
727
|
+
models_idx,
|
|
728
|
+
time_series,
|
|
729
|
+
date_column,
|
|
730
|
+
group_column,
|
|
731
|
+
**kwargs,
|
|
732
|
+
):
|
|
733
|
+
self.data = data
|
|
734
|
+
self.reshaped_data = reshaped_data
|
|
735
|
+
self.target_number = target_number
|
|
736
|
+
self.dataset = dataset
|
|
737
|
+
self.target_clf = target_clf
|
|
738
|
+
self.models_idx = models_idx
|
|
739
|
+
self.time_series = time_series
|
|
740
|
+
self.date_column = date_column
|
|
741
|
+
self.group_column = group_column
|
|
742
|
+
|
|
743
|
+
self.target_type = (
|
|
744
|
+
"classification" if self.target_number in self.target_clf else "regression"
|
|
826
745
|
)
|
|
827
|
-
|
|
828
|
-
|
|
829
|
-
|
|
746
|
+
self.dataset_dir = self.dataset.path
|
|
747
|
+
self.dataset_id = self.dataset.id
|
|
748
|
+
self.data_dir = f"{self.dataset_dir}/data"
|
|
749
|
+
self.preprocessing_dir = f"{self.dataset_dir}/preprocessing"
|
|
750
|
+
self.training_target_dir = f"{self.dataset_dir}/TARGET_{self.target_number}"
|
|
751
|
+
self.metric = "RMSE" if self.target_type == "regression" else "LOGLOSS"
|
|
752
|
+
self.features = self.dataset.get_features(self.target_number)
|
|
753
|
+
self.all_features = self.dataset.get_all_features(
|
|
754
|
+
date_column=self.date_column, group_column=self.group_column
|
|
830
755
|
)
|
|
831
756
|
|
|
832
|
-
|
|
833
|
-
|
|
834
|
-
|
|
835
|
-
|
|
836
|
-
|
|
837
|
-
|
|
838
|
-
|
|
839
|
-
|
|
840
|
-
|
|
841
|
-
|
|
842
|
-
|
|
843
|
-
|
|
844
|
-
|
|
845
|
-
|
|
846
|
-
|
|
847
|
-
|
|
848
|
-
|
|
849
|
-
|
|
850
|
-
|
|
757
|
+
# Main training function
|
|
758
|
+
def run(
|
|
759
|
+
self,
|
|
760
|
+
session_name,
|
|
761
|
+
perform_hyperopt=True,
|
|
762
|
+
number_of_trials=20,
|
|
763
|
+
perform_crossval=False,
|
|
764
|
+
plot=True,
|
|
765
|
+
clean_dir=False, # TODO: This has been unused because now feature_selection is in the target directory
|
|
766
|
+
preserve_model=True,
|
|
767
|
+
):
|
|
768
|
+
"""
|
|
769
|
+
Selects the best models based on a target variable, optionally performing hyperparameter optimization
|
|
770
|
+
and cross-validation, and manages outputs in a session-specific directory.
|
|
771
|
+
"""
|
|
772
|
+
self.session_name = session_name
|
|
773
|
+
self.plot = plot
|
|
774
|
+
self.number_of_trials = number_of_trials
|
|
775
|
+
|
|
776
|
+
if self.dataset_id is None:
|
|
777
|
+
raise ValueError("Please provide a dataset.")
|
|
778
|
+
|
|
779
|
+
if self.data:
|
|
780
|
+
train = self.data["train"]
|
|
781
|
+
val = self.data["val"]
|
|
782
|
+
test = self.data["test"]
|
|
783
|
+
train_scaled = self.data["train_scaled"]
|
|
784
|
+
val_scaled = self.data["val_scaled"]
|
|
785
|
+
test_scaled = self.data["test_scaled"]
|
|
786
|
+
else:
|
|
787
|
+
(
|
|
788
|
+
train,
|
|
789
|
+
val,
|
|
790
|
+
test,
|
|
791
|
+
train_scaled,
|
|
792
|
+
val_scaled,
|
|
793
|
+
test_scaled,
|
|
794
|
+
) = load_train_data(self.dataset_dir, self.target_number, self.target_clf)
|
|
795
|
+
|
|
796
|
+
if (
|
|
797
|
+
any(all_models[i].get("recurrent") for i in self.models_idx)
|
|
798
|
+
and not self.time_series
|
|
799
|
+
):
|
|
800
|
+
ValueError(
|
|
801
|
+
"You need to set time_series to true to use recurrent model, or remove recurrent models from models_idx chosen"
|
|
802
|
+
)
|
|
851
803
|
|
|
852
|
-
|
|
853
|
-
|
|
854
|
-
|
|
855
|
-
|
|
856
|
-
|
|
857
|
-
|
|
858
|
-
|
|
859
|
-
|
|
860
|
-
|
|
861
|
-
|
|
862
|
-
|
|
863
|
-
|
|
864
|
-
|
|
865
|
-
|
|
866
|
-
|
|
867
|
-
model
|
|
868
|
-
|
|
869
|
-
|
|
870
|
-
|
|
871
|
-
|
|
872
|
-
|
|
873
|
-
match_fields=["model_id", "model_selection_id"],
|
|
874
|
-
model_id=model.id,
|
|
875
|
-
model_selection_id=model_selection.id,
|
|
804
|
+
if (
|
|
805
|
+
any(all_models[i].get("recurrent") for i in self.models_idx)
|
|
806
|
+
and self.time_series
|
|
807
|
+
):
|
|
808
|
+
if self.reshaped_data is None:
|
|
809
|
+
raise ValueError("reshaped_data is not provided.")
|
|
810
|
+
|
|
811
|
+
logger.info("Loading reshaped data...")
|
|
812
|
+
x_train_reshaped = self.reshaped_data["x_train_reshaped"]
|
|
813
|
+
y_train_reshaped = self.reshaped_data["y_train_reshaped"]
|
|
814
|
+
x_val_reshaped = self.reshaped_data["x_val_reshaped"]
|
|
815
|
+
y_val_reshaped = self.reshaped_data["y_val_reshaped"]
|
|
816
|
+
x_test_reshaped = self.reshaped_data["x_test_reshaped"]
|
|
817
|
+
y_test_reshaped = self.reshaped_data["y_test_reshaped"]
|
|
818
|
+
|
|
819
|
+
# create model selection in db
|
|
820
|
+
target = Target.find_by(name=f"TARGET_{self.target_number}")
|
|
821
|
+
model_selection = ModelSelection.upsert(
|
|
822
|
+
match_fields=["target_id", "dataset_id"],
|
|
823
|
+
target_id=target.id,
|
|
824
|
+
dataset_id=self.dataset_id,
|
|
876
825
|
)
|
|
877
826
|
|
|
878
|
-
#
|
|
879
|
-
|
|
880
|
-
|
|
881
|
-
|
|
882
|
-
|
|
883
|
-
|
|
884
|
-
|
|
885
|
-
|
|
886
|
-
|
|
887
|
-
|
|
888
|
-
|
|
889
|
-
|
|
890
|
-
|
|
891
|
-
|
|
892
|
-
|
|
893
|
-
|
|
894
|
-
|
|
895
|
-
|
|
896
|
-
|
|
897
|
-
|
|
898
|
-
|
|
899
|
-
|
|
900
|
-
|
|
901
|
-
|
|
902
|
-
|
|
903
|
-
|
|
904
|
-
|
|
905
|
-
|
|
906
|
-
|
|
907
|
-
start = time.time()
|
|
908
|
-
# Tuning hyperparameters
|
|
909
|
-
if perform_hyperoptimization:
|
|
910
|
-
_type_name = "hyperopts"
|
|
911
|
-
|
|
912
|
-
for var in global_vars:
|
|
913
|
-
config[var] = globals()[var]
|
|
914
|
-
|
|
915
|
-
logger.info("Start tuning hyperparameters...")
|
|
916
|
-
|
|
917
|
-
storage_path = f"{results_dir}/ray_results"
|
|
918
|
-
# ray.shutdown()
|
|
919
|
-
# ray.init(
|
|
920
|
-
# runtime_env={
|
|
921
|
-
# "working_dir": ".", # or your project path
|
|
922
|
-
# "env_vars": {"PYTHONPATH": "."}
|
|
923
|
-
# }
|
|
924
|
-
# )
|
|
925
|
-
tuner = Tuner(
|
|
926
|
-
trainable=with_parameters(
|
|
927
|
-
train_model,
|
|
928
|
-
x_train=x_train,
|
|
929
|
-
y_train=y_train,
|
|
930
|
-
x_val=x_val,
|
|
931
|
-
y_val=y_val,
|
|
932
|
-
config=config,
|
|
933
|
-
),
|
|
934
|
-
param_space=config["search_params"],
|
|
935
|
-
tune_config=TuneConfig(
|
|
936
|
-
metric=metric,
|
|
937       -                 mode="min",
938       -                 search_alg=HyperOptSearch(),
939       -                 num_samples=number_of_trials,
940       -                 scheduler=ASHAScheduler(max_t=100, grace_period=10),
941       -             ),
942       -             run_config=RunConfig(
943       -                 stop={"training_iteration": 100},
944       -                 storage_path=storage_path,
945       -                 # name=datetime.now().strftime("%d-%m-%Y") + "-" + session_name,
946       -                 callbacks=[TBXLoggerCallback()],
947       -                 # log_to_file=("stdout.log", "stderr.log"), # depreciated
948       -                 # verbose=0,
949       -             ),
      827 +         # recurrent models start at 9  # len(list_models)
      828 +         for i in self.models_idx:
      829 +             config = all_models[i]
      830 +             recurrent = config["recurrent"]
      831 +             need_scaling = config["need_scaling"]
      832 +             model_name = config["model_name"]
      833 +
      834 +             if recurrent is False and config[self.target_type] is None:
      835 +                 continue  # for Naive Bayes models that cannot be used in regression
      836 +
      837 +             self.results_dir = f"{self.training_target_dir}/{model_name}"
      838 +             if not os.path.exists(f"{self.results_dir}"):
      839 +                 os.makedirs(f"{self.results_dir}")
      840 +             elif preserve_model and contains_best(self.results_dir):
      841 +                 continue
      842 +             elif perform_hyperopt:
      843 +                 clean_directory(self.results_dir)
      844 +
      845 +             logger.info(f"Training a {model_name}")
      846 +             model = Model.upsert(
      847 +                 match_fields=["name", "type"],
      848 +                 name=model_name,
      849 +                 type=self.target_type,
      850 +             )
      851 +             model_training = ModelTraining.upsert(
      852 +                 match_fields=["model_id", "model_selection_id"],
      853 +                 model_id=model.id,
      854 +                 model_selection_id=model_selection.id,
950   855               )
951       -         try:
952       -             results = tuner.fit()
953   856
954-956   -         (3 blank lines)
      857 +             # getting data
      858 +             if recurrent:
      859 +                 # Clear cluster from previous Keras session graphs.
      860 +                 K.clear_session()
      861 +
      862 +                 features_idx = [
      863 +                     i
      864 +                     for i, e in enumerate(self.all_features)
      865 +                     if e in set(self.features)
      866 +                 ]
      867 +                 # TODO: Verify that features_idx are the right ones, because scaling can re-arrange columns...
      868 +                 x_train = x_train_reshaped[:, :, features_idx]
      869 +                 y_train = y_train_reshaped[:, [self.target_number, 0]]
      870 +                 x_val = x_val_reshaped[:, :, features_idx]
      871 +                 y_val = y_val_reshaped[:, [self.target_number, 0]]
      872 +                 x_test = x_test_reshaped[:, :, features_idx]
      873 +                 y_test = y_test_reshaped[:, [self.target_number, 0]]
      874 +             else:
      875 +                 config = config[self.target_type]
957   876
958-960   -         (3 blank lines)
      877 +                 if need_scaling and self.target_type == "regression":
      878 +                     x_train = train_scaled[self.features]
      879 +                     y_train = train_scaled[f"TARGET_{self.target_number}"].rename(
      880 +                         "TARGET"
      881 +                     )
      882 +                     x_val = val_scaled[self.features]
      883 +                     y_val = val_scaled[f"TARGET_{self.target_number}"].rename("TARGET")
      884 +                     x_test = test_scaled[self.features]
      885 +                     y_test = test_scaled[f"TARGET_{self.target_number}"].rename(
      886 +                         "TARGET"
      887 +                     )
      888 +                 else:
      889 +                     x_train = train[self.features]
      890 +                     y_train = train[f"TARGET_{self.target_number}"].rename("TARGET")
      891 +                     x_val = val[self.features]
      892 +                     y_val = val[f"TARGET_{self.target_number}"].rename("TARGET")
      893 +                     x_test = test[self.features]
      894 +                     y_test = test[f"TARGET_{self.target_number}"].rename("TARGET")
      895 +
      896 +             log_dir = get_log_dir(self.training_target_dir, model_name)
      897 +             # instantiate model
      898 +             model = ModelEngine(
      899 +                 model_name=model_name,
      900 +                 search_params=config["search_params"],
      901 +                 target_type=self.target_type,
      902 +                 create_model=config["create_model"],
      903 +                 plot=self.plot,
      904 +                 log_dir=log_dir,
      905 +             )
961   906
962-964   -         (3 blank lines)
965       -         )
      907 +             start = time.time()
      908 +             # Tuning hyperparameters
      909 +             if perform_hyperopt:
      910 +                 best_params = self.hyperoptimize(x_train, y_train, x_val, y_val, model)
966   911
967   912                   # save best params
968       -         best_params_file = f"{training_target_dir}/best_params.json"
      913 +                 best_params_file = f"{self.training_target_dir}/best_params.json"
969   914                   try:
970   915                       with open(best_params_file, "r") as f:
971   916                           json_dict = json.load(f)
972   917                   except FileNotFoundError:
973   918                       json_dict = {}
974   919
975       -         json_dict[
      920 +                 json_dict[model.model_name] = serialize_for_json(best_params)
976   921                   with open(best_params_file, "w") as f:
977   922                       json.dump(json_dict, f, indent=4)
      923 +             else:
      924 +                 try:
      925 +                     with open(f"{self.training_target_dir}/best_params.json") as f:
      926 +                         json_dict = json.load(f)
      927 +                         best_params = json_dict[model_name]
      928 +                 except Exception:
      929 +                     raise FileNotFoundError(
      930 +                         f"Could not find {model_name} in current data. Try to run a hyperoptimization by setting `perform_hyperopt` to true"
      931 +                     )
978   932
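The perform_hyperopt branch above persists one entry per model into a shared best_params.json, and the else branch reloads it (raising if the model was never tuned). A minimal standalone sketch of that read-merge-write pattern (the file name, model name, and params below are illustrative, not from the package):

    import json

    def save_best_params(path: str, model_name: str, params: dict) -> None:
        try:
            with open(path) as f:          # reuse the existing dict if the file exists
                all_params = json.load(f)
        except FileNotFoundError:
            all_params = {}                # first model tuned for this target
        all_params[model_name] = params    # one entry per model name
        with open(path, "w") as f:
            json.dump(all_params, f, indent=4)

    save_best_params("best_params.json", "xgboost", {"max_depth": 6, "eta": 0.1})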
979-1001  -         (23 blank lines)
      933 +             # Perform cross-validation of the best model on k-folds of the train + val + test set
      934 +             if perform_crossval:
      935 +                 x_train_val = pd.concat([x_train, x_val, x_test], axis=0)
      936 +                 y_train_val = pd.concat([y_train, y_val, y_test], axis=0)
      937 +                 n_splits = 4
      938 +                 n_samples = len(x_train_val)
      939 +                 test_size = int(n_samples / (n_splits + 4))
      940 +                 tscv = TimeSeriesSplit(n_splits=n_splits, test_size=test_size)
      941 +
      942 +                 # Store the scores
      943 +                 cross_validation_scores = []
      944 +
      945 +                 for i, (train_index, val_index) in enumerate(tscv.split(x_train_val)):
      946 +                     self.type_name = f"crossval_fold_{i}"
      947 +
      948 +                     if self.time_series:
      949 +                         date_series = train[self.date_column].copy()
      950 +
      951 +                         if need_scaling:
      952 +                             date_series = date_series.map(pd.Timestamp.fromordinal)
      953 +
      954 +                         # Now you can use the actual train/val indices to extract ranges
      955 +                         train_start = date_series.iloc[train_index[0]]
      956 +                         train_end = date_series.iloc[train_index[-1]]
      957 +                         val_start = date_series.iloc[val_index[0]]
      958 +                         val_end = date_series.iloc[val_index[-1]]
      959 +
      960 +                         logger.info(
      961 +                             f"[Fold {i}] Train: {len(train_index)} samples from {train_start.date()} to {train_end.date()} | "
      962 +                             f"Validation: {len(val_index)} samples from {val_start.date()} to {val_end.date()}"
      963 +                         )
      964 +                     else:
      965 +                         logger.info(
      966 +                             f"[Fold {i}] Train: {len(train_index)} samples | Validation: {len(val_index)} samples"
      967 +                         )
      968 +
      969 +                     # Train the model and get the score
      970 +                     if recurrent:
      971 +                         cross_validation_score, _, _ = self.train_model(
      972 +                             params=best_params,
      973 +                             x_train=x_train_val[train_index],
      974 +                             y_train=y_train_val[train_index],
      975 +                             x_val=x_train_val[val_index],
      976 +                             y_val=y_train_val[val_index],
      977 +                             model=model,
      978 +                         )
      979 +                     else:
      980 +                         cross_validation_score, _, _ = self.train_model(
      981 +                             params=best_params,
      982 +                             x_train=x_train_val.iloc[train_index],
      983 +                             y_train=y_train_val.iloc[train_index],
      984 +                             x_val=x_train_val.iloc[val_index],
      985 +                             y_val=y_train_val.iloc[val_index],
      986 +                             model=model,
      987 +                         )
      988 +
      989 +                     # Append score to the list
      990 +                     cross_validation_scores.append(cross_validation_score)
      991 +
      992 +                 # Calculate and log the mean score
      993 +                 cross_validation_mean_score = pd.DataFrame(cross_validation_scores)[
      994 +                     self.metric
      995 +                 ].mean()
      996 +                 logger.info(
      997 +                     f"Best model mean cross-validation score on entire dataset: {cross_validation_mean_score}"
1002  998                   )
1003  999
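For intuition about the fold geometry above, a small self-contained sketch (the sample count is invented) of how TimeSeriesSplit with a fixed test_size produces forward-moving validation windows over a growing training window:

    import numpy as np
    from sklearn.model_selection import TimeSeriesSplit

    X = np.arange(800).reshape(-1, 1)      # 800 ordered samples (hypothetical)
    n_splits = 4
    test_size = len(X) // (n_splits + 4)   # same sizing rule as above -> 100
    tscv = TimeSeriesSplit(n_splits=n_splits, test_size=test_size)
    for i, (train_idx, val_idx) in enumerate(tscv.split(X)):
        # training window grows; validation window slides forward in time
        print(f"fold {i}: train=[0..{train_idx[-1]}], val=[{val_idx[0]}..{val_idx[-1]}]")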
1004-1020 -         (17 blank lines)
     1000 +                 # Retrain on entire training set, but keep score on cross-validation folds
     1001 +                 best_score, best_model, best_pred = self.train_model(
     1002 +                     params=best_params,
     1003 +                     x_train=pd.concat([x_train, x_val], axis=0),
     1004 +                     y_train=pd.concat([y_train, y_val], axis=0),
     1005 +                     x_val=x_test,
     1006 +                     y_val=y_test,
     1007 +                     model=model,
     1008 +                 )
     1009 +                 best_score[self.metric] = cross_validation_mean_score
     1010 +             else:
     1011 +                 # Evaluate on validation set
     1012 +                 self.type_name = "validation"
     1013 +                 best_score, best_model, best_pred = self.train_model(
     1014 +                     params=best_params,
     1015 +                     x_train=pd.concat([x_train, x_val], axis=0),
     1016 +                     y_train=pd.concat([y_train, y_val], axis=0),
     1017 +                     x_val=x_test,
     1018 +                     y_val=y_test,
     1019 +                     model=model,
     1020 +                 )
1021  1021
1022      -
1023      -             date_column = date_column.map(pd.Timestamp.fromordinal)
     1022 +             logger.info(f"Best model scores on test set: {best_score}")
1024  1023
1025-1029 -         (5 blank lines)
     1024 +             # Save validation predictions
     1025 +             best_pred.to_csv(
     1026 +                 f"{self.results_dir}/pred_val.csv",
     1027 +                 index=True,
     1028 +                 header=True,
     1029 +                 index_label="ID",
     1030 +             )
1030  1031
1031      -
1032      -
1033      -                     f"Validation: {len(val_index)} samples from {val_start.date()} to {val_end.date()}"
1034      -                 )
1035      -             else:
1036      -                 logger.info(
1037      -                     f"[Fold {i}] Train: {len(train_index)} samples | Validation: {len(val_index)} samples"
1038      -                 )
     1032 +             # Save best model
     1033 +             model_path = best_model.save(self.results_dir)
1039  1034
1040      -
1041      -
1042      -                 cross_validation_score, _, _ = train_model(
1043      -                     params=best_params,
1044      -                     x_train=x_train_val[train_index],
1045      -                     y_train=y_train_val[train_index],
1046      -                     x_val=x_train_val[val_index],
1047      -                     y_val=y_train_val[val_index],
1048      -                     config=config,
1049      -                 )
1050      -             else:
1051      -                 cross_validation_score, _, _ = train_model(
1052      -                     params=best_params,
1053      -                     x_train=x_train_val.iloc[train_index],
1054      -                     y_train=y_train_val.iloc[train_index],
1055      -                     x_val=x_train_val.iloc[val_index],
1056      -                     y_val=y_train_val.iloc[val_index],
1057      -                     config=config,
1058      -                 )
     1035 +             model_path = Path(model_path).resolve()
     1036 +             best_score["MODEL_PATH"] = model_path
1059  1037
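Saving is now delegated to best_model.save(...); the inline logic it replaces (visible in the removed block further down: native .keras files for recurrent models, joblib pickles otherwise) suggests a helper along these lines. A sketch only, assuming the wrapper exposes model_name and a Keras-style save:

    import joblib
    from pathlib import Path

    def save_model(model, results_dir: str, recurrent: bool) -> str:
        if recurrent:
            path = f"{results_dir}/{model.model_name}.keras"
            model.save(path)            # Keras native format
        else:
            path = f"{results_dir}/{model.model_name}.best"
            joblib.dump(model, path)    # pickle everything else via joblib
        return str(Path(path).resolve())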
1060      -
1061      -
     1038 +             # Track scores
     1039 +             scores_tracking_path = f"{self.training_target_dir}/scores_tracking.csv"
     1040 +             best_score_df = pd.DataFrame([best_score])
1062  1041
1063-1068 -         (6 blank lines)
     1042 +             if os.path.exists(scores_tracking_path):
     1043 +                 existing_scores = pd.read_csv(scores_tracking_path)
     1044 +                 common_cols = existing_scores.columns.intersection(
     1045 +                     best_score_df.columns
     1046 +                 )
     1047 +                 best_score_df = best_score_df[common_cols]
     1048 +                 scores_tracking = pd.concat(
     1049 +                     [existing_scores, best_score_df], ignore_index=True
     1050 +                 )
     1051 +             else:
     1052 +                 scores_tracking = best_score_df
     1053 +
     1054 +             scores_tracking.sort_values(self.metric, ascending=True, inplace=True)
     1055 +             scores_tracking.to_csv(scores_tracking_path, index=False)
     1056 +
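The same append-and-rank pattern as a standalone sketch (path, column, and metric names are illustrative): a new row is cut down to the columns already present in the CSV so older runs stay comparable, and the file is kept sorted best-first (lowest metric):

    import os
    import pandas as pd

    def track_score(path: str, row: dict, metric: str) -> pd.DataFrame:
        new = pd.DataFrame([row])
        if os.path.exists(path):
            old = pd.read_csv(path)
            new = new[old.columns.intersection(new.columns)]  # keep shared columns only
            tracking = pd.concat([old, new], ignore_index=True)
        else:
            tracking = new
        tracking.sort_values(metric, ascending=True, inplace=True)
        tracking.to_csv(path, index=False)
        return tracking

    track_score("scores_tracking.csv", {"MODEL_NAME": "xgboost", "RMSE": 0.42}, "RMSE")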
     1057 +             # Save model training metadata
     1058 +             stop = time.time()
     1059 +             training_time = stop - start
     1060 +             model_training.best_params = best_params
     1061 +             model_training.model_path = model_path
     1062 +             model_training.training_time = training_time
     1063 +             model_training.save()
     1064 +
     1065 +             # Store metrics in DB
     1066 +             drop_cols = [
     1067 +                 "DATE",
     1068 +                 "SESSION",
     1069 +                 "TRAIN_DATA",
     1070 +                 "VAL_DATA",
     1071 +                 "FEATURES",
     1072 +                 "MODEL_NAME",
     1073 +                 "MODEL_PATH",
     1074 +             ]
     1075 +             best_score = {k: v for k, v in best_score.items() if k not in drop_cols}
     1076 +             score_data = {k.lower(): v for k, v in best_score.items()}
     1077 +
     1078 +             Score.upsert(
     1079 +                 match_fields=["model_training_id"],
     1080 +                 model_training_id=model_training.id,
     1081 +                 **score_data,
1069  1082               )
1070  1083
1071-1073 -         (3 blank lines)
     1084 +             logger.info(f"Model training finished in {training_time:.2f} seconds")
     1085 +
     1086 +         # find best model type
     1087 +         scores_tracking_path = f"{self.training_target_dir}/scores_tracking.csv"
     1088 +         scores_tracking = pd.read_csv(scores_tracking_path)
     1089 +         best_score_overall = scores_tracking.iloc[0, :]
     1090 +         best_model_name = best_score_overall["MODEL_NAME"]
     1091 +
     1092 +         # Remove any .best or .keras files
     1093 +         for file_path in glob.glob(
     1094 +             os.path.join(self.training_target_dir, "*.best")
     1095 +         ) + glob.glob(os.path.join(self.training_target_dir, "*.keras")):
     1096 +             os.remove(file_path)
     1097 +         # Copy the best model into the root training folder for this target
     1098 +         best_model_path = Path(
     1099 +             f"{self.training_target_dir}/{os.path.basename(best_score_overall['MODEL_PATH'])}"
     1100 +         ).resolve()
     1101 +         copy_any(
     1102 +             best_score_overall["MODEL_PATH"],
     1103 +             best_model_path,
     1104 +         )
     1105 +
     1106 +         with open(f"{self.training_target_dir}/best_params.json", "r") as f:
     1107 +             best_model_params = json.load(f)[best_model_name]
     1108 +
     1109 +         # save model_selection results to db
     1110 +         model_selection = ModelSelection.get(model_selection.id)
     1111 +         model_selection.best_model_id = Model.find_by(
     1112 +             name=best_score_overall["MODEL_NAME"], type=self.target_type
     1113 +         ).id
     1114 +         model_selection.best_model_params = best_model_params
     1115 +         model_selection.best_model_path = best_model_path
     1116 +         model_selection.save()
     1117 +
     1118 +         logger.info(f"Best model overall is: {best_score_overall}")
     1119 +
     1120 +     def hyperoptimize(self, x_train, y_train, x_val, y_val, model: ModelEngine):
     1121 +         self.type_name = "hyperopts"
     1122 +
     1123 +         def collect_error_logs(training_target_dir: str, storage_path: str):
     1124 +             output_error_file = f"{training_target_dir}/errors.log"
     1125 +
     1126 +             with open(output_error_file, "a") as outfile:
     1127 +                 # Walk through the ray_results directory
     1128 +                 for root, dirs, files in os.walk(storage_path):
     1129 +                     # Check if 'error.txt' exists in the current directory
     1130 +                     if "error.txt" in files:
     1131 +                         error_file_path = os.path.join(root, "error.txt")
     1132 +                         logger.info(f"Processing error file: {error_file_path}")
     1133 +                         # Read and append the content of the error.txt file
     1134 +                         with open(error_file_path, "r") as infile:
     1135 +                             outfile.write(f"\n\n=== Error from {error_file_path} ===\n")
     1136 +                             outfile.write(infile.read())
     1137 +             logger.info(f"All errors written to {output_error_file}")
     1138 +
     1139 +         logger.info("Start tuning hyperparameters...")
     1140 +
     1141 +         storage_path = f"{self.results_dir}/ray_results"
     1142 +
     1143 +         tuner = Tuner(
     1144 +             trainable=with_parameters(
     1145 +                 trainable,
1074  1146                   x_train=x_train,
1075  1147                   y_train=y_train,
1076  1148                   x_val=x_val,
1077  1149                   y_val=y_val,
1078      -
     1150 +                 model_name=model.model_name,
     1151 +                 target_type=self.target_type,
     1152 +                 session_name=self.session_name,
     1153 +                 target_number=self.target_number,
     1154 +                 create_model=model.create_model,
     1155 +                 type_name="hyperopts",
     1156 +                 plot=model.plot,
     1157 +             ),
     1158 +             param_space=model.search_params,
     1159 +             tune_config=TuneConfig(
     1160 +                 metric=self.metric,
     1161 +                 mode="min",
     1162 +                 search_alg=HyperOptSearch(),
     1163 +                 num_samples=self.number_of_trials,
     1164 +                 scheduler=ASHAScheduler(max_t=100, grace_period=10),
     1165 +             ),
     1166 +             run_config=RunConfig(
     1167 +                 stop={"training_iteration": 100},
     1168 +                 storage_path=storage_path,
     1169 +                 callbacks=[TBXLoggerCallback()],
     1170 +             ),
     1171 +         )
     1172 +         try:
     1173 +             results = tuner.fit()
     1174 +
     1175 +             best_result = results.get_best_result(self.metric, "min")
     1176 +             best_params = best_result.config
     1177 +             best_score = best_result.metrics
     1178 +
     1179 +             # log results
     1180 +             logger.info(f"Best hyperparameters found were:\n{best_params}")
     1181 +             logger.info(f"Best scores found were:\n{best_score}")
     1182 +             logger.info(
     1183 +                 f"Markdown table with all trials:\n{results.get_dataframe().to_markdown()}"
1079  1184               )
1080      -
1081      -
1082      -
1083      -         _type_name = "validation"
1084      -         best_score, best_model, best_pred = train_model(
1085      -             params=best_params,
1086      -             x_train=x_train,
1087      -             y_train=y_train,
1088      -             x_val=x_val,
1089      -             y_val=y_val,
1090      -             config=config,
     1185 +             # Collect errors in single file
     1186 +             collect_error_logs(
     1187 +                 training_target_dir=self.training_target_dir, storage_path=storage_path
1091  1188               )
1092  1189
1093      -
     1190 +         except Exception as e:
     1191 +             raise Exception(e)
1094  1192
1095-1100 -         (6 blank lines)
     1193 +         finally:
     1194 +             ray.shutdown()
     1195 +
     1196 +         return best_params
     1197 +
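For reference, a minimal self-contained loop in the same shape as the Tuner above, using the Ray 2.x API this file imports; the toy objective and search space are invented for illustration:

    from ray import tune
    from ray.air import RunConfig, session
    from ray.tune import Tuner, TuneConfig

    def toy_trainable(config):
        for step in range(5):                 # report one metric value per iteration
            session.report({"RMSE": (config["x"] - 3) ** 2 + 1.0 / (step + 1)})

    tuner = Tuner(
        toy_trainable,
        param_space={"x": tune.uniform(-10, 10)},
        tune_config=TuneConfig(metric="RMSE", mode="min", num_samples=8),
        run_config=RunConfig(stop={"training_iteration": 5}),
    )
    best = tuner.fit().get_best_result()      # metric/mode default to the TuneConfig
    print(best.config["x"], best.metrics["RMSE"])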
     1198 +     def train_model(self, params, x_train, y_train, x_val, y_val, model: ModelEngine):
     1199 +         # Use the standalone training function to avoid duplication
     1200 +         # For train_model, we pass the data directly (not as Ray references)
     1201 +         return trainable(
     1202 +             params,
     1203 +             x_train,
     1204 +             y_train,
     1205 +             x_val,
     1206 +             y_val,
     1207 +             model.model_name,
     1208 +             self.target_type,
     1209 +             self.session_name,
     1210 +             self.target_number,
     1211 +             model.create_model,
     1212 +             self.type_name,
     1213 +             model.plot,
1101  1214           )
1102  1215
1103      -     # Save best model
1104      -     if config["recurrent"]:
1105      -         model_path = f"{results_dir}/{best_model.model_name}.keras"
1106      -         best_model.save(model_path)
1107      -     else:
1108      -         model_path = f"{results_dir}/{best_model.model_name}.best"
1109      -         joblib.dump(best_model, model_path)
1110      -
1111      -     model_path = Path(model_path).resolve()
1112      -     best_score["MODEL_PATH"] = model_path
1113      -
1114      -     # Track scores
1115      -     scores_tracking_path = f"{training_target_dir}/scores_tracking.csv"
1116      -     best_score_df = pd.DataFrame([best_score])
1117      -
1118      -     if os.path.exists(scores_tracking_path):
1119      -         existing_scores = pd.read_csv(scores_tracking_path)
1120      -         common_cols = existing_scores.columns.intersection(best_score_df.columns)
1121      -         best_score_df = best_score_df[common_cols]
1122      -         scores_tracking = pd.concat(
1123      -             [existing_scores, best_score_df], ignore_index=True
1124      -         )
1125      -     else:
1126      -         scores_tracking = best_score_df
1127      -
1128      -     scores_tracking.sort_values(metric, ascending=True, inplace=True)
1129      -     scores_tracking.to_csv(scores_tracking_path, index=False)
1130      -
1131      -     # Save model training metadata
1132      -     stop = time.time()
1133      -     training_time = stop - start
1134      -     model_training.best_params = best_params
1135      -     model_training.model_path = model_path
1136      -     model_training.training_time = training_time
1137      -     model_training.save()
1138      -
1139      -     # Store metrics in DB
1140      -     drop_cols = [
1141      -         "DATE",
1142      -         "SESSION",
1143      -         "TRAIN_DATA",
1144      -         "VAL_DATA",
1145      -         "FEATURES",
1146      -         "MODEL_NAME",
1147      -         "MODEL_PATH",
1148      -     ]
1149      -     best_score = {k: v for k, v in best_score.items() if k not in drop_cols}
1150      -     score_data = {k.lower(): v for k, v in best_score.items()}
1151  1216
1152-1156 -     (5 blank lines)
     1217 + def evaluate(prediction: pd.DataFrame, target_type: str):
     1218 +     """
     1219 +     Function to evaluate model performance
     1220 +
     1221 +     Args:
     1222 +     - prediction: the prediction dataframe containing TARGET and PRED columns, as well as predicted probabilities for each class for classification tasks
     1223 +     - target_type: classification or regression
     1224 +     """
     1225 +     score = {}
     1226 +     y_true = prediction["TARGET"]
     1227 +     y_pred = prediction["PRED"]
1157  1228
1158      -
     1229 +     if target_type == "regression":
     1230 +         # Main metrics
     1231 +         score["RMSE"] = root_mean_squared_error(y_true, y_pred)
     1232 +         score["MAE"] = mean_absolute_error(y_true, y_pred)
     1233 +         score["MAPE"] = mean_absolute_percentage_error(y_true, y_pred)
     1234 +         score["R2"] = r2_score(y_true, y_pred)
1159  1235
1160-1163 -     (4 blank lines)
1164      -     best_model_name = best_score_overall["MODEL_NAME"]
     1236 +         # Robustness: avoid division by zero
     1237 +         std_target = y_true.std()
     1238 +         mean_target = y_true.mean()
     1239 +         median_target = y_true.median()
1165  1240
1166-1169 -     (4 blank lines)
1170      -         os.remove(file_path)
1171      -     # Copy the best model in root training folder for this target
1172      -     best_model_path = Path(
1173      -         f"{training_target_dir}/{os.path.basename(best_score_overall['MODEL_PATH'])}"
1174      -     ).resolve()
1175      -     copy_any(
1176      -         best_score_overall["MODEL_PATH"],
1177      -         best_model_path,
1178      -     )
     1241 +         # RMSE / STD
     1242 +         score["RMSE_STD_RATIO"] = (
     1243 +             float(100 * score["RMSE"] / std_target) if std_target else 1000
     1244 +         )
1179  1245
1180      -
1181      -
     1246 +         # Median absolute deviations (MAM, MAD)
     1247 +         mam = (y_true - mean_target).abs().median()  # median absolute deviation around the mean
     1248 +         mad = (y_true - median_target).abs().median()  # median absolute deviation around the median
     1249 +         score["MAM"] = mam
     1250 +         score["MAD"] = mad
     1251 +         score["MAE_MAM_RATIO"] = (
     1252 +             float(100 * score["MAE"] / mam) if mam else 1000
     1253 +         )  # MAE / deviation around the mean: less robust to outliers
     1254 +         score["MAE_MAD_RATIO"] = (
     1255 +             float(100 * score["MAE"] / mad) if mad else 1000
     1256 +         )  # MAE / deviation around the median: more stable, less sensitive to outliers
1182  1257
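A quick worked example (invented values) of why the two denominators behave differently: one outlier drags the mean, so deviations measured around the mean blow up while deviations around the median stay small.

    import pandas as pd

    y = pd.Series([1.0, 2.0, 2.5, 3.0, 100.0])   # one outlier
    mam = (y - y.mean()).abs().median()           # mean is 21.7  -> mam = 19.7
    mad = (y - y.median()).abs().median()         # median is 2.5 -> mad = 0.5
    print(mam, mad)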
1183-1190 -     (8 blank lines)
     1258 +     else:
     1259 +
     1260 +         labels = np.unique(y_true)
     1261 +         num_classes = labels.size
     1262 +         y_pred_proba = (
     1263 +             prediction[1] if num_classes == 2 else prediction.iloc[:, 2:].values
     1264 +         )
     1265 +         if num_classes > 2:
     1266 +             lb = LabelBinarizer(sparse_output=False)  # Change to True for sparse matrix
     1267 +             lb.fit(labels)
     1268 +             y_true_onhot = lb.transform(y_true)
     1269 +             y_pred_onehot = lb.transform(y_pred)
1191  1270
1192      -
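LabelBinarizer one-hot encodes the labels for the multi-class branch; a tiny standalone illustration:

    from sklearn.preprocessing import LabelBinarizer

    lb = LabelBinarizer(sparse_output=False)
    lb.fit([0, 1, 2])              # three classes
    print(lb.transform([2, 0, 1]))
    # [[0 0 1]
    #  [1 0 0]
    #  [0 1 0]]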
     1271 +         score["LOGLOSS"] = log_loss(y_true, y_pred_proba)
     1272 +         score["ACCURACY"] = accuracy_score(y_true, y_pred)
     1273 +         score["PRECISION"] = precision_score(
     1274 +             y_true,
     1275 +             y_pred,
     1276 +             average=("binary" if num_classes == 2 else "macro"),
     1277 +         )
     1278 +         score["RECALL"] = recall_score(
     1279 +             y_true,
     1280 +             y_pred,
     1281 +             average=("binary" if num_classes == 2 else "macro"),
     1282 +         )
     1283 +         score["F1"] = f1_score(
     1284 +             y_true,
     1285 +             y_pred,
     1286 +             average=("binary" if num_classes == 2 else "macro"),
     1287 +         )
     1288 +         score["ROC_AUC"] = float(roc_auc_score(y_true, y_pred_proba, multi_class="ovr"))
     1289 +         (
     1290 +             score["THRESHOLD"],
     1291 +             score["PRECISION_AT_THRESHOLD"],
     1292 +             score["RECALL_AT_THRESHOLD"],
     1293 +         ) = (
     1294 +             find_best_precision_threshold(prediction)
     1295 +             if num_classes == 2
     1296 +             else (None, None, None)
     1297 +         )
     1298 +     return score
1193  1299
1194  1300
1195      -
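A usage sketch for the new evaluate() on a toy binary task, assuming it is imported from lecrapaud.model_selection and that the per-class probability columns are named 0 and 1, as the code above expects:

    import pandas as pd
    from lecrapaud.model_selection import evaluate

    prediction = pd.DataFrame({
        "TARGET": [0, 1, 1, 0],
        "PRED":   [0, 1, 0, 0],
        0: [0.8, 0.3, 0.6, 0.7],   # P(class 0)
        1: [0.2, 0.7, 0.4, 0.3],   # P(class 1)
    })
    scores = evaluate(prediction, target_type="classification")
    print(scores["ACCURACY"], scores["F1"], scores["ROC_AUC"])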
     1301 + # utils
     1302 + def get_log_dir(training_target_dir: str, model_name="test_model"):
     1303 +     """Generates a structured log directory path for TensorBoard."""
     1304 +     timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M")
     1305 +     log_dir = (
     1306 +         Path(training_target_dir + "/tensorboard") / model_name / f"run_{timestamp}"
     1307 +     )
     1308 +     log_dir.mkdir(parents=True, exist_ok=True)  # Create directories if they don't exist
     1309 +     return str(log_dir)
1196  1310
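For instance (hypothetical path), get_log_dir yields one folder per model and run, so TensorBoard can watch the common parent:

    log_dir = get_log_dir("results/target_1", model_name="lstm")
    # e.g. results/target_1/tensorboard/lstm/run_2025-06-17_16-52
    # then: tensorboard --logdir results/target_1/tensorboard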
1197      -     output_error_file = f"{training_target_dir}/errors.log"
1198  1311
1199-1204 -     (6 blank lines)
1205      -             logger.info(f"Processing error file: {error_file_path}")
1206      -             # Read and append the content of the error.txt file
1207      -             with open(error_file_path, "r") as infile:
1208      -                 outfile.write(f"\n\n=== Error from {error_file_path} ===\n")
1209      -                 outfile.write(infile.read())
1210      -     logger.info(f"All errors written to {output_error_file}")
     1312 + def print_scores(training_target_dir: str):
     1313 +     """
     1314 +     Monitor scores
     1315 +     """
     1316 +     scores_tracking = pd.read_csv(f"{training_target_dir}/scores_tracking.csv")
     1317 +     return scores_tracking
1211  1318
1212  1319
     1320 + # plots
1213  1321   def plot_evaluation_for_classification(prediction: dict):
1214  1322       """
1215  1323       Args
@@ -1272,7 +1380,7 @@ def plot_confusion_matrix(y_true, y_pred):
1272  1380       plt.show()
1273  1381
1274  1382
1275      - #
     1383 + # thresholds
1276  1384   def find_max_f1_threshold(prediction):
1277  1385       """
1278  1386       Finds the threshold that maximizes the F1 score for a binary classification task.
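One common way to implement such a threshold search with scikit-learn (a sketch, not necessarily this function's actual body):

    import numpy as np
    from sklearn.metrics import precision_recall_curve

    def max_f1_threshold(y_true, y_proba):
        precision, recall, thresholds = precision_recall_curve(y_true, y_proba)
        f1 = 2 * precision * recall / np.clip(precision + recall, 1e-12, None)
        return thresholds[np.argmax(f1[:-1])]  # last precision/recall pair has no threshold

    print(max_f1_threshold([0, 0, 1, 1], [0.1, 0.4, 0.35, 0.8]))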
@@ -1515,14 +1623,6 @@ def plot_threshold(prediction, threshold, precision, recall):
1515  1623       return threshold
1516  1624
1517  1625
1518      - def print_scores(training_target_dir: str):
1519      -     """
1520      -     Monitor scores
1521      -     """
1522      -     scores_tracking = pd.read_csv(f"{training_target_dir}/scores_tracking.csv")
1523      -     return scores_tracking
1524      -
1525      -
1526  1626   # OLD - to sort out
1527  1627   def get_pred_distribution(training_target_dir: str, model_name="linear"):
1528  1628       """