lecrapaud 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of lecrapaud might be problematic.

Files changed (63)
  1. lecrapaud/__init__.py +1 -0
  2. lecrapaud/api.py +271 -0
  3. lecrapaud/config.py +25 -0
  4. lecrapaud/db/__init__.py +1 -0
  5. lecrapaud/db/alembic/README +1 -0
  6. lecrapaud/db/alembic/env.py +78 -0
  7. lecrapaud/db/alembic/script.py.mako +26 -0
  8. lecrapaud/db/alembic/versions/2025_04_06_1738-7390745388e4_initial_setup.py +295 -0
  9. lecrapaud/db/alembic/versions/2025_04_06_1755-40cd8d3e798e_unique_constraint_for_data.py +30 -0
  10. lecrapaud/db/alembic/versions/2025_05_23_1724-2360941fa0bd_longer_string.py +52 -0
  11. lecrapaud/db/alembic/versions/2025_05_27_1159-b96396dcfaff_add_env_to_trading_tables.py +34 -0
  12. lecrapaud/db/alembic/versions/2025_05_27_1337-40cbfc215f7c_fix_nb_character_on_portfolio.py +39 -0
  13. lecrapaud/db/alembic/versions/2025_05_27_1526-3de994115317_to_datetime.py +36 -0
  14. lecrapaud/db/alembic/versions/2025_05_27_2003-25c227c684f8_add_fees_to_transactions.py +30 -0
  15. lecrapaud/db/alembic/versions/2025_05_27_2047-6b6f2d38e9bc_double_instead_of_float.py +132 -0
  16. lecrapaud/db/alembic/versions/2025_05_31_1111-c175e4a36d68_generalise_stock_to_group.py +36 -0
  17. lecrapaud/db/alembic/versions/2025_05_31_1256-5681095bfc27_create_investment_run_and_portfolio_.py +62 -0
  18. lecrapaud/db/alembic/versions/2025_05_31_1806-339927587383_add_investment_run_id.py +107 -0
  19. lecrapaud/db/alembic/versions/2025_05_31_1834-52b809a34371_make_nullablee.py +38 -0
  20. lecrapaud/db/alembic/versions/2025_05_31_1849-3b8550297e8e_change_date_to_datetime.py +44 -0
  21. lecrapaud/db/alembic/versions/2025_05_31_1852-e6b8c95d8243_add_date_to_portfolio_history.py +30 -0
  22. lecrapaud/db/alembic/versions/2025_06_10_1136-db8cdd83563a_addnewsandoptiontodata.py +32 -0
  23. lecrapaud/db/alembic/versions/2025_06_17_1652-c45f5e49fa2c_make_fields_nullable.py +89 -0
  24. lecrapaud/db/models/__init__.py +11 -0
  25. lecrapaud/db/models/base.py +181 -0
  26. lecrapaud/db/models/dataset.py +129 -0
  27. lecrapaud/db/models/feature.py +45 -0
  28. lecrapaud/db/models/feature_selection.py +125 -0
  29. lecrapaud/db/models/feature_selection_rank.py +79 -0
  30. lecrapaud/db/models/model.py +40 -0
  31. lecrapaud/db/models/model_selection.py +63 -0
  32. lecrapaud/db/models/model_training.py +62 -0
  33. lecrapaud/db/models/score.py +65 -0
  34. lecrapaud/db/models/target.py +67 -0
  35. lecrapaud/db/session.py +45 -0
  36. lecrapaud/directory_management.py +28 -0
  37. lecrapaud/experiment.py +64 -0
  38. lecrapaud/feature_engineering.py +846 -0
  39. lecrapaud/feature_selection.py +1167 -0
  40. lecrapaud/integrations/openai_integration.py +225 -0
  41. lecrapaud/jobs/__init__.py +13 -0
  42. lecrapaud/jobs/config.py +17 -0
  43. lecrapaud/jobs/scheduler.py +36 -0
  44. lecrapaud/jobs/tasks.py +57 -0
  45. lecrapaud/model_selection.py +1671 -0
  46. lecrapaud/predictions.py +292 -0
  47. lecrapaud/preprocessing.py +984 -0
  48. lecrapaud/search_space.py +848 -0
  49. lecrapaud/services/__init__.py +0 -0
  50. lecrapaud/services/embedding_categorical.py +71 -0
  51. lecrapaud/services/indicators.py +309 -0
  52. lecrapaud/speed_tests/experiments.py +139 -0
  53. lecrapaud/speed_tests/test-gpu-bilstm.ipynb +261 -0
  54. lecrapaud/speed_tests/test-gpu-resnet.ipynb +166 -0
  55. lecrapaud/speed_tests/test-gpu-transformers.ipynb +254 -0
  56. lecrapaud/speed_tests/tests.ipynb +145 -0
  57. lecrapaud/speed_tests/trash.py +37 -0
  58. lecrapaud/training.py +239 -0
  59. lecrapaud/utils.py +246 -0
  60. lecrapaud-0.1.0.dist-info/LICENSE +201 -0
  61. lecrapaud-0.1.0.dist-info/METADATA +105 -0
  62. lecrapaud-0.1.0.dist-info/RECORD +63 -0
  63. lecrapaud-0.1.0.dist-info/WHEEL +4 -0
@@ -0,0 +1,1671 @@
+ import pandas as pd
+ import numpy as np
+ from datetime import datetime
+ import matplotlib.pyplot as plt
+ import seaborn as sns
+ import time
+ import os
+ import json
+ import warnings
+ import joblib
+ import glob
+ from pathlib import Path
+ import pickle
+
+ os.environ["COVERAGE_FILE"] = str(Path(".coverage").resolve())
+
+ # ML models
+ from sklearn.model_selection import TimeSeriesSplit
+ from sklearn.calibration import CalibratedClassifierCV
+ from sklearn.metrics import (
+     mean_absolute_percentage_error,
+     root_mean_squared_error,
+     mean_absolute_error,
+     r2_score,
+     accuracy_score,
+     precision_score,
+     recall_score,
+     f1_score,
+     log_loss,
+     roc_auc_score,
+     roc_curve,
+     auc,
+     precision_recall_curve,
+     average_precision_score,
+     confusion_matrix,
+ )
+ from sklearn.preprocessing import LabelBinarizer
+ import lightgbm as lgb
+ import xgboost as xgb
+
+ # DL models
+ import tensorflow as tf
+ import keras
+ from keras.callbacks import EarlyStopping, TensorBoard
+ from keras.metrics import (
+     Precision,
+     Recall,
+     F1Score,
+ )
+ from keras.losses import BinaryCrossentropy, CategoricalCrossentropy
+ from keras.optimizers import Adam
+
+ K = tf.keras.backend
+ from tensorboardX import SummaryWriter
+
+ # Optimization
+ import ray
+ from ray.tune import Tuner, TuneConfig, with_parameters
+ from ray.train import RunConfig
+ from ray.tune.search.hyperopt import HyperOptSearch
+ from ray.tune.search.bayesopt import BayesOptSearch
+ from ray.tune.logger import TBXLoggerCallback
+ from ray.tune.schedulers import ASHAScheduler
+ from ray.air import session
+
+ # Internal library
+ from lecrapaud.search_space import all_models
+ from lecrapaud.directory_management import clean_directory
+ from lecrapaud.utils import copy_any, contains_best, logger, serialize_for_json
+ from lecrapaud.config import PYTHON_ENV
+ from lecrapaud.feature_selection import load_train_data
+ from lecrapaud.db import (
+     Model,
+     ModelSelection,
+     ModelTraining,
+     Score,
+     Target,
+     Dataset,
+ )
+
+ # Reproducible results
+ keras.utils.set_random_seed(42)
+ np.random.seed(42)
+ tf.config.experimental.enable_op_determinism()
+
+
+ # test configuration
+ def test_hardware():
+     devices = tf.config.list_physical_devices()
+     logger.info(f"\nDevices: {devices}")
+
+     gpus = tf.config.list_physical_devices("GPU")
+     if gpus:
+         details = tf.config.experimental.get_device_details(gpus[0])
+         logger.info(f"GPU details: {details}")
+
+
+ # Suppress specific warning messages related to the file system monitor
+ # logging.getLogger("ray").setLevel(logging.CRITICAL)
+ # logging.getLogger("ray.train").setLevel(logging.CRITICAL)
+ # logging.getLogger("ray.tune").setLevel(logging.CRITICAL)
+ # logging.getLogger("ray.autoscaler").setLevel(logging.CRITICAL)
+ # logging.getLogger("ray.raylet").setLevel(logging.CRITICAL)
+ # logging.getLogger("ray.monitor").setLevel(logging.CRITICAL)
+ # logging.getLogger("ray.dashboard").setLevel(logging.CRITICAL)
+ # logging.getLogger("ray.gcs_server").setLevel(logging.CRITICAL)
+
+ warnings.filterwarnings("ignore", category=UserWarning, module="pydantic")
+
+
+ class ModelEngine:
+
+     def __init__(
+         self,
+         model_name: str = None,
+         target_type: str = None,
+         path: str = None,
+         search_params: dict = {},
+         create_model=None,
+         scaler_y=None,  # optional pre-fitted target scaler
+         plot: bool = False,
+         log_dir: str = None,
+     ):
+         self._model = None
+         self.threshold = None
+         self.path = path
+         if path:
+             self.load()
+         else:
+             self.model_name = model_name
+             self.target_type = target_type
+
+         configs = [
+             config for config in all_models if config["model_name"] == self.model_name
+         ]
+         if len(configs) == 0:
+             raise Exception(
+                 f"Model {self.model_name} is not supported by this library. "
+                 f"Choose a model from the list of supported models: "
+                 f"{', '.join(model['model_name'] for model in all_models)}"
+             )
+         config = configs[0]
+
+         self.recurrent = config["recurrent"]
+         self.need_scaling = config["need_scaling"]
+         self.search_params = search_params
+         self.create_model = create_model
+         self.plot = plot
+         self.log_dir = log_dir
+
+         if scaler_y is not None:
+             self.scaler_y = scaler_y
+         elif self.need_scaling and self.target_type == "regression" and self.path:
+             self.scaler_y = joblib.load(f"{self.path}/scaler_y.pkl")
+         else:
+             self.scaler_y = None
+
+     def fit(self, *args):
+         if self.recurrent:
+             fit = self.fit_recurrent
+         elif (self.create_model == "lgb") or (self.create_model == "xgb"):
+             fit = self.fit_boosting
+         else:
+             fit = self.fit_sklearn
+         model = fit(*args)
+         return model
+
+     # Functions to fit & evaluate models
+     def fit_sklearn(self, x_train, y_train, x_val, y_val, params):
+
+         # Create & compile the model
+         model = self.create_model(**params)
+
+         # Train the model
+         logger.info("Fitting the model...")
+         logger.info(f"x_train shape: {x_train.shape}, x_val shape: {x_val.shape}")
+         logger.info(f"y_train shape: {y_train.shape}, y_val shape: {y_val.shape}")
+
+         model.fit(x_train, y_train)
+
+         if (
+             self.target_type == "classification"
+             and "loss" in model.get_params().keys()
+             and "hinge" in model.get_params()["loss"]
+         ):
+             # This is for SVC models with hinge loss.
+             # Use CalibratedClassifierCV for classifiers that do not natively
+             # output well-calibrated probability estimates.
+             # TODO: investigate whether calibration also helps random forest,
+             # gradient boosting, and bagging models
+             logger.info(
+                 f"Re-calibrating {self.model_name} to get predicted probabilities..."
+             )
+             calibrator = CalibratedClassifierCV(model, cv="prefit", n_jobs=-1)
+             model = calibrator.fit(x_train, y_train)
+
+         # set model_name after the calibrator, so it survives the wrapping
+         model.model_name = self.model_name
+         model.target_type = self.target_type
+
+         logger.info(f"Successfully created a {model.model_name} at {datetime.now()}")
+
+         self._model = model
+
+         return model
+
+     def fit_boosting(self, x_train, y_train, x_val, y_val, params):
+         """
+         This uses the LightGBM or XGBoost C++ libraries
+         """
+         lightGBM = self.create_model == "lgb"
+
+         # Datasets
+         boosting_dataset = lgb.Dataset if lightGBM else xgb.DMatrix
+         train_data = boosting_dataset(x_train, label=y_train)
+         val_data = boosting_dataset(x_val, label=y_val)
+
+         # Create a TensorBoardX writer
+         writer = SummaryWriter(self.log_dir)
+         evals_result = {}
+
+         # Training
+         labels = np.unique(y_train)
+         num_class = (
+             labels.size
+             if self.target_type == "classification" and labels.size > 2
+             else 1
+         )
+         logger.info("Fitting the model...")
+         logger.info(f"x_train shape: {x_train.shape}, x_val shape: {x_val.shape}")
+         logger.info(f"y_train shape: {y_train.shape}, y_val shape: {y_val.shape}")
+
+         if lightGBM:
+
+             def tensorboard_callback(env):
+                 for i, metric in enumerate(env.evaluation_result_list):
+                     metric_name, _, metric_value, _ = metric
+                     writer.add_scalar(
+                         f"LightGBM/{metric_name}", metric_value, env.iteration
+                     )
+
+             loss = (
+                 "regression"
+                 if self.target_type == "regression"
+                 else ("binary" if num_class <= 2 else "multiclass")
+             )
+             eval_metric = (
+                 "rmse"
+                 if self.target_type == "regression"
+                 else ("binary_logloss" if num_class <= 2 else "multi_logloss")
+             )
+             model = lgb.train(
+                 params={
+                     **params["model_params"],
+                     "objective": loss,
+                     "metric": eval_metric,
+                     "num_class": num_class,
+                 },
+                 num_boost_round=params["num_boost_round"],
+                 train_set=train_data,
+                 valid_sets=[train_data, val_data],
+                 valid_names=["train", "val"],
+                 callbacks=[
+                     lgb.early_stopping(stopping_rounds=params["early_stopping_rounds"]),
+                     lgb.record_evaluation(evals_result),
+                     tensorboard_callback,
+                 ],
+             )
+         else:
+
+             class TensorBoardCallback(xgb.callback.TrainingCallback):
+
+                 def __init__(self, log_dir: str):
+                     self.writer = SummaryWriter(log_dir=log_dir)
+
+                 def after_iteration(
+                     self,
+                     model,
+                     epoch: int,
+                     evals_log: xgb.callback.TrainingCallback.EvalsLog,
+                 ) -> bool:
+                     if not evals_log:
+                         return False
+
+                     for data, metric in evals_log.items():
+                         for metric_name, log in metric.items():
+                             score = (
+                                 log[-1][0] if isinstance(log[-1], tuple) else log[-1]
+                             )
+                             self.writer.add_scalar(f"XGBoost/{data}", score, epoch)
+
+                     return False
+
+             tensorboard_callback = TensorBoardCallback(self.log_dir)
+
+             loss = (
+                 "reg:squarederror"
+                 if self.target_type == "regression"
+                 else ("binary:logistic" if num_class <= 2 else "multi:softprob")
+             )
+             eval_metric = (
+                 "rmse"
+                 if self.target_type == "regression"
+                 else ("logloss" if num_class <= 2 else "mlogloss")
+             )
+             model = xgb.train(
+                 params={
+                     **params["model_params"],
+                     "objective": loss,
+                     "eval_metric": eval_metric,
+                     "num_class": num_class,
+                 },
+                 num_boost_round=params["num_boost_round"],
+                 dtrain=train_data,
+                 evals=[(val_data, "val"), (train_data, "train")],
+                 callbacks=[
+                     xgb.callback.EarlyStopping(
+                         rounds=params["early_stopping_rounds"], save_best=True
+                     ),
+                     xgb.callback.EvaluationMonitor(),  # Shows evaluation results at each iteration
+                     tensorboard_callback,
+                 ],
+                 evals_result=evals_result,  # Record evaluation results
+                 verbose_eval=0,
+             )
+
+         model.model_name = self.create_model
+         model.target_type = self.target_type
+         logger.info(f"Successfully created a {model.model_name} at {datetime.now()}")
+
+         # Close the writer after training is done
+         writer.close()
+
+         if self.plot:
+             # Plot loss per epoch
+             train_loss = evals_result["train"][eval_metric]
+             val_loss = evals_result["val"][eval_metric]
+             logs = pd.DataFrame({"train": train_loss, "val": val_loss})
+
+             plt.figure(figsize=(14, 4))
+             plt.plot(logs.loc[:, "train"], lw=2, label="Training loss")
+             plt.plot(logs.loc[:, "val"], lw=2, label="Validation loss")
+             plt.xlabel("Epoch")
+             plt.ylabel("Loss")
+             plt.legend()
+             plt.show()
+
+         self._model = model
+
+         return model
+
+     def fit_recurrent(self, x_train, y_train, x_val, y_val, params):
+
+         # metrics functions
+         def rmse_tf(y_true, y_pred):
+             y_true, y_pred = unscale_tf(y_true, y_pred)
+             results = K.sqrt(K.mean(K.square(y_pred - y_true)))
+             return results
+
+         def mae_tf(y_true, y_pred):
+             y_true, y_pred = unscale_tf(y_true, y_pred)
+             results = K.mean(K.abs(y_pred - y_true))
+             return results
+
+         def unscale_tf(y_true, y_pred):
+             if self.target_type == "regression":
+                 scale = K.constant(self.scaler_y.scale_[0])
+                 mean = K.constant(self.scaler_y.mean_[0])
+
+                 # element-wise un-scaling: y * scale + mean
+                 y_true = y_true * scale + mean
+                 y_pred = y_pred * scale + mean
+             return y_true, y_pred
+
+         # Create the model
+         labels = np.unique(y_train[:, 0])
+         num_class = labels.size if self.target_type == "classification" else None
+         input_shape = (x_train.shape[1], x_train.shape[2])
+         model = self.create_model(params, input_shape, self.target_type, num_class)
+         model.model_name = self.model_name
+         model.target_type = self.target_type
+
+         # Compile the model
+         loss = (
+             rmse_tf
+             if self.target_type == "regression"
+             else (
+                 BinaryCrossentropy(from_logits=False)
+                 if num_class <= 2
+                 else CategoricalCrossentropy(from_logits=False)
+             )
+         )
+         optimizer = Adam(
+             learning_rate=params["learning_rate"], clipnorm=params["clipnorm"]
+         )
+         metrics = (
+             [mae_tf]
+             if self.target_type == "regression"
+             else (
+                 ["accuracy", Precision(), Recall()]
+                 if num_class <= 2
+                 else ["categorical_accuracy"]
+             )
+         )
+         model.compile(optimizer=optimizer, loss=loss, metrics=metrics)
+
+         # Callbacks
+         tensorboard_callback = TensorBoard(log_dir=self.log_dir)
+         early_stopping_callback = EarlyStopping(
+             monitor="val_loss",
+             patience=3,
+             restore_best_weights=True,
+             start_from_epoch=5,
+         )
+
+         # Custom callbacks (defined for debugging; not passed to fit below)
+         class PrintTrainableWeights(keras.callbacks.Callback):
+             def on_epoch_end(self, epoch, logs=None):
+                 logger.info(model.trainable_variables)
+
+         class GradientCalcCallback(keras.callbacks.Callback):
+             # NOTE: relies on legacy TF1-style APIs (K.gradients, _feed_inputs)
+             def __init__(self):
+                 self.epoch_gradient = []
+
+             def get_gradient_func(self, model):
+                 # grads = K.gradients(model.total_loss, model.trainable_weights)
+                 grads = K.gradients(model.loss, model.trainable_weights)
+                 # inputs = model.model.inputs + model.targets + model.sample_weights
+                 # use the line below if the one above doesn't work for you
+                 # inputs = model.model._feed_inputs + model.model._feed_targets + model.model._feed_sample_weights
+                 inputs = (
+                     model._feed_inputs
+                     + model._feed_targets
+                     + model._feed_sample_weights
+                 )
+                 func = K.function(inputs, grads)
+                 return func
+
+             def on_epoch_end(self, epoch, logs=None):
+                 get_gradient = self.get_gradient_func(model)
+                 grads = get_gradient([x_val, y_val[:, 0], np.ones(len(y_val[:, 0]))])
+                 self.epoch_gradient.append(grads)
+
+         # Train the model
+         if self.target_type == "classification" and num_class > 2:
+             lb = LabelBinarizer(sparse_output=False)  # Change to True for sparse matrix
+             lb.fit(labels)
+             y_train = lb.transform(y_train[:, 0].flatten())
+             y_val = lb.transform(y_val[:, 0].flatten())
+         else:
+             y_train = y_train[:, 0].flatten()
+             y_val = y_val[:, 0].flatten()
+
+         logger.info("Fitting the model...")
+         logger.info(f"x_train shape: {x_train.shape}, x_val shape: {x_val.shape}")
+         logger.info(f"y_train shape: {y_train.shape}, y_val shape: {y_val.shape}")
+
+         history = model.fit(
+             x_train,
+             y_train,
+             batch_size=params["batch_size"],
+             verbose=0,
+             epochs=params["epochs"],
+             shuffle=False,
+             validation_data=(x_val, y_val),
+             callbacks=[early_stopping_callback, tensorboard_callback],
+         )
+
+         logger.info(f"Successfully created a {model.model_name} at {datetime.now()}")
+         # logger.info(pd.DataFrame(gradient.epoch_gradient))
+
+         if self.plot:
+             # Plot loss per epoch
+             logs = pd.DataFrame(history.history)
+
+             plt.figure(figsize=(14, 4))
+             plt.plot(logs.loc[:, "loss"], lw=2, label="Training loss")
+             plt.plot(logs.loc[:, "val_loss"], lw=2, label="Validation loss")
+             plt.xlabel("Epoch")
+             plt.ylabel("Loss")
+             plt.legend()
+             plt.show()
+
+         self._model = model
+
+         return model
+
+     def predict(
+         self,
+         data: pd.DataFrame,
+         threshold: float = 0.5,
+     ):
+         """Gets predictions from the model. Supports scikit-learn, Keras, and boosting models such as XGBoost and LightGBM.
+
+         Args:
+             - data: the data to predict on
+             - threshold: the classification threshold
+         """
+         if not self._model:
+             raise Exception(
+                 "Model is not fitted, cannot predict; run model.fit() first, or pass a fitted model when creating the Model object via the `model` parameter."
+             )
+         model = self._model
+
+         if self.threshold and threshold == 0.5:
+             threshold = self.threshold
+
+         if self.recurrent or model.model_name in ["lgb", "xgb"]:
+             # keras, lgb & xgb
+             if model.model_name == "lgb":
+                 # Direct prediction for LightGBM
+                 pred = model.predict(data)
+             elif model.model_name == "xgb":
+                 # Convert the data to DMatrix for XGBoost
+                 d_data = xgb.DMatrix(data)
+                 pred = model.predict(d_data)
+             else:
+                 # Reshape (flatten) for keras if not multiclass
+                 pred = model.predict(data)
+                 if pred.shape[1] == 1:
+                     pred = pred.reshape(-1)
+
+             if self.target_type == "classification":
+                 num_class = pred.shape[1] if len(pred.shape) > 1 else 2
+
+                 if num_class <= 2:
+                     # For binary classification, concatenate the predicted probabilities for both classes
+                     pred_df = pd.DataFrame(
+                         {
+                             0: 1 - pred,  # Probability of class 0
+                             1: pred,  # Probability of class 1
+                         },
+                     )
+                 else:
+                     # For multi-class classification, use the predicted probabilities for each class
+                     pred_df = pd.DataFrame(pred, columns=range(num_class))
+
+                 # Get final predictions (argmax for multi-class, threshold for binary)
+                 if num_class == 2:
+                     pred_df["PRED"] = np.where(
+                         pred_df[1] >= threshold, 1, 0
+                     )  # Class 1 if prob >= threshold
+                 else:
+                     pred_df["PRED"] = pred_df.idxmax(
+                         axis=1
+                     )  # Class with the highest probability for multiclass
+
+                 # Reorder columns to show the predicted class first, then probabilities
+                 pred = pred_df[["PRED"] + list(range(num_class))]
+
+             else:
+                 pred = pd.Series(pred, name="PRED")
+
+             # set index for lgb and xgb (for keras, as we use np arrays, the index is set outside)
+             if model.model_name in ["lgb", "xgb"]:
+                 pred.index = data.index
+         else:
+             # scikit-learn
+             pred = pd.Series(model.predict(data), index=data.index, name="PRED")
+             if self.target_type == "classification":
+                 pred_proba = pd.DataFrame(
+                     model.predict_proba(data),
+                     index=data.index,
+                     columns=[
+                         int(c) if isinstance(c, float) and c.is_integer() else c
+                         for c in model.classes_
+                     ],
+                 )
+
+                 # Apply threshold for binary classification
+                 if len(model.classes_) == 2:
+                     positive_class = model.classes_[1]  # Assuming classes are ordered
+                     pred = (pred_proba[positive_class] >= threshold).astype(int)
+                     pred.name = "PRED"
+
+                 pred = pd.concat([pred, pred_proba], axis=1)
+
+         return pred
+
+     def save(self, path):
+         if self.recurrent:
+             path += "/" + self.model_name + ".keras"
+             self._model.save(path)
+         else:
+             path += "/" + self.model_name + ".best"
+             joblib.dump(self._model, path)
+         self.path = path
+         return path
+
+     def load(self):
+         if not self.path:
+             raise ValueError("Path is not set, cannot load model")
+
+         training_target_dir = Path(self.path)
+
+         # Load threshold
+         scores_tracking = pd.read_csv(f"{training_target_dir}/scores_tracking.csv")
+         self.threshold = (
+             scores_tracking["THRESHOLD"].values[0]
+             if "THRESHOLD" in scores_tracking.columns
+             else None
+         )
+
+         # Search for files that contain '.best' or '.keras' in the name
+         best_files = list(training_target_dir.glob("*.best*")) + list(
+             training_target_dir.glob("*.keras*")
+         )
+         # If any files are found, try loading the first one (or process as needed)
+         if best_files:
+             file_path = best_files[0]  # open the first matching file
+             try:
+                 # Attempt to load the file as a scikit-learn, XGBoost, or LightGBM model (pickle format)
+                 self._model = joblib.load(file_path)
+                 logger.info(
+                     f"Loaded model {self._model.model_name} and threshold {self.threshold}"
+                 )
+             except (pickle.UnpicklingError, EOFError):
+                 # If it's not a pickle file, try loading it as a Keras model
+                 try:
+                     self._model = keras.models.load_model(file_path)
+                     logger.info(
+                         f"Loaded model {self._model.model_name} and threshold {self.threshold}"
+                     )
+                 except Exception as e:
+                     raise FileNotFoundError(
+                         f"Model could not be loaded from path {file_path}: {e}"
+                     )
+         else:
+             raise FileNotFoundError(
+                 f"No files with '.best' or '.keras' found in the specified folder: {training_target_dir}"
+             )
+
+         self.model_name = self._model.model_name
+         self.target_type = self._model.target_type
+
+     def __getattr__(self, attr):
+         return getattr(self._model, attr)
+
+
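Editor's note: the sketch below is not part of the released file; it illustrates the intended ModelEngine lifecycle on synthetic data. The name "random_forest" is an assumption (it must match an entry in lecrapaud.search_space.all_models with recurrent=False), and create_model may be any factory invoked as create_model(**params).

# Hedged usage sketch (editorial; "random_forest" is a hypothetical registry entry)
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from lecrapaud.model_selection import ModelEngine

rng = np.random.default_rng(42)
x = pd.DataFrame(rng.normal(size=(200, 5)), columns=[f"F{i}" for i in range(5)])
y = (x.sum(axis=1) + rng.normal(scale=0.1, size=200)).rename("TARGET")
x_train, x_val, y_train, y_val = x[:150], x[150:], y[:150], y[150:]

engine = ModelEngine(
    model_name="random_forest",          # assumed entry in all_models
    target_type="regression",
    create_model=RandomForestRegressor,  # called as create_model(**params)
)
engine.fit(x_train, y_train, x_val, y_val, {"n_estimators": 200})
pred = engine.predict(x_val)             # pd.Series named "PRED", indexed like x_val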
+ def trainable(
+     params,
+     x_train,
+     y_train,
+     x_val,
+     y_val,
+     model_name,
+     target_type,
+     session_name,
+     target_number,
+     create_model,
+     type_name="hyperopts",
+     plot=False,
+ ):
+     """Standalone version of train_model that doesn't depend on self"""
+     # Create model engine
+     model = ModelEngine(
+         model_name=model_name,
+         target_type=target_type,
+         create_model=create_model,
+         plot=plot,
+     )
+
+     logger.info(
+         f"TARGET_{target_number} - Training a {model.model_name} at {datetime.now()} ({session_name})"
+     )
+
+     if model.recurrent:
+         timesteps = params["timesteps"]
+         x_train = x_train[:, -timesteps:, :]
+         x_val = x_val[:, -timesteps:, :]
+
+     # Compile and fit model on train set
+     start = time.time()
+     model.fit(x_train, y_train, x_val, y_val, params)
+     stop = time.time()
+
+     # Prediction on val set
+     y_pred = model.predict(x_val)
+
+     # fix for recurrent models: x_val has no index because it is a 3D np array
+     if model.recurrent:
+         y_val = pd.DataFrame(y_val, columns=["TARGET", "index"]).set_index("index")
+         y_pred.index = y_val.index
+
+     prediction = pd.concat([y_val, y_pred], axis=1)
+
+     # Unscale the data
+     if (
+         model.need_scaling
+         and model.target_type == "regression"
+         and model.scaler_y is not None
+     ):
+         # scaler_y needs a 2D array with shape (-1, 1)
+         prediction.loc[:, "TARGET"] = model.scaler_y.inverse_transform(
+             prediction[["TARGET"]].values
+         )
+         prediction.loc[:, "PRED"] = model.scaler_y.inverse_transform(
+             prediction[["PRED"]].values
+         )
+
+     # Evaluate model
+     score = {
+         "DATE": datetime.now(),
+         "SESSION": session_name,
+         "TRAIN_DATA": x_train.shape[0],
+         "VAL_DATA": x_val.shape[0],
+         "FEATURES": x_train.shape[-1],
+         "MODEL_NAME": model.model_name,
+         "TYPE": type_name,
+         "TRAINING_TIME": stop - start,
+         "EVAL_DATA_STD": prediction["TARGET"].std(),
+     }
+
+     score.update(evaluate(prediction, target_type))
+
+     if type_name == "hyperopts":
+         session.report(metrics=score)
+         return score
+
+     return score, model, prediction
+
+
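When type_name is anything other than "hyperopts", trainable returns the (score, model, prediction) triple that ModelSelectionEngine.train_model consumes. A hedged direct call (editorial), reusing the toy data and the assumed "random_forest" registry entry from the previous sketch:

score, model, prediction = trainable(
    {"n_estimators": 200},
    x_train, y_train, x_val, y_val,
    model_name="random_forest",          # assumed entry in all_models
    target_type="regression",
    session_name="demo",
    target_number=1,
    create_model=RandomForestRegressor,
    type_name="validation",              # anything but "hyperopts" returns the triple
)
print(score["RMSE"], score["TRAINING_TIME"])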
+ class ModelSelectionEngine:
+
+     def __init__(
+         self,
+         data,
+         reshaped_data,
+         target_number,
+         target_clf,
+         dataset,
+         models_idx,
+         time_series,
+         date_column,
+         group_column,
+         **kwargs,
+     ):
+         self.data = data
+         self.reshaped_data = reshaped_data
+         self.target_number = target_number
+         self.dataset = dataset
+         self.target_clf = target_clf
+         self.models_idx = models_idx
+         self.time_series = time_series
+         self.date_column = date_column
+         self.group_column = group_column
+         # optional target scaler, forwarded to ModelEngine for models that need scaling
+         self.scaler_y = kwargs.get("scaler_y")
+
+         self.target_type = (
+             "classification" if self.target_number in self.target_clf else "regression"
+         )
+         self.dataset_dir = self.dataset.path
+         self.dataset_id = self.dataset.id
+         self.data_dir = f"{self.dataset_dir}/data"
+         self.preprocessing_dir = f"{self.dataset_dir}/preprocessing"
+         self.training_target_dir = f"{self.dataset_dir}/TARGET_{self.target_number}"
+         self.metric = "RMSE" if self.target_type == "regression" else "LOGLOSS"
+         self.features = self.dataset.get_features(self.target_number)
+         self.all_features = self.dataset.get_all_features(
+             date_column=self.date_column, group_column=self.group_column
+         )
+
+     # Main training function
+     def run(
+         self,
+         session_name,
+         perform_hyperopt=True,
+         number_of_trials=20,
+         perform_crossval=False,
+         plot=True,
+         clean_dir=False,  # TODO: unused now that feature_selection lives in the target directory
+         preserve_model=True,
+     ):
+         """
+         Selects the best models based on a target variable, optionally performing hyperparameter optimization
+         and cross-validation, and manages outputs in a session-specific directory.
+         """
+         self.session_name = session_name
+         self.plot = plot
+         self.number_of_trials = number_of_trials
+
+         if self.dataset_id is None:
+             raise ValueError("Please provide a dataset.")
+
+         if self.data:
+             self.train = self.data["train"]
+             self.val = self.data["val"]
+             self.test = self.data["test"]
+             self.train_scaled = self.data["train_scaled"]
+             self.val_scaled = self.data["val_scaled"]
+             self.test_scaled = self.data["test_scaled"]
+         else:
+             (
+                 self.train,
+                 self.val,
+                 self.test,
+                 self.train_scaled,
+                 self.val_scaled,
+                 self.test_scaled,
+             ) = load_train_data(self.dataset_dir, self.target_number, self.target_clf)
+
+         if (
+             any(all_models[i].get("recurrent") for i in self.models_idx)
+             and not self.time_series
+         ):
+             raise ValueError(
+                 "You need to set time_series to true to use recurrent models, or remove recurrent models from the chosen models_idx"
+             )
+
+         if (
+             any(all_models[i].get("recurrent") for i in self.models_idx)
+             and self.time_series
+         ):
+             if self.reshaped_data is None:
+                 raise ValueError("reshaped_data is not provided.")
+
+             logger.info("Loading reshaped data...")
+             self.x_train_reshaped = self.reshaped_data["x_train_reshaped"]
+             self.y_train_reshaped = self.reshaped_data["y_train_reshaped"]
+             self.x_val_reshaped = self.reshaped_data["x_val_reshaped"]
+             self.y_val_reshaped = self.reshaped_data["y_val_reshaped"]
+
+         # create model selection in db
+         target = Target.find_by(name=f"TARGET_{self.target_number}")
+         model_selection = ModelSelection.upsert(
+             match_fields=["target_id", "dataset_id"],
+             target_id=target.id,
+             dataset_id=self.dataset_id,
+         )
+
+         # recurrent models start at index 9 of all_models
+         for i in self.models_idx:
+             config = all_models[i]
+             recurrent = config["recurrent"]
+             need_scaling = config["need_scaling"]
+             model_name = config["model_name"]
+
+             if recurrent is False and config[self.target_type] is None:
+                 continue  # for naive bayes models that cannot be used in regression
+
+             self.results_dir = f"{self.training_target_dir}/{model_name}"
+             if not os.path.exists(f"{self.results_dir}"):
+                 os.makedirs(f"{self.results_dir}")
+             elif preserve_model and contains_best(self.results_dir):
+                 continue
+             elif perform_hyperopt:
+                 clean_directory(self.results_dir)
+
+             logger.info(f"Training a {model_name}")
+             model = Model.upsert(
+                 match_fields=["name", "type"],
+                 name=model_name,
+                 type=self.target_type,
+             )
+             model_training = ModelTraining.upsert(
+                 match_fields=["model_id", "model_selection_id"],
+                 model_id=model.id,
+                 model_selection_id=model_selection.id,
+             )
+
+             # getting data
+             if recurrent:
+                 # Clear cluster from previous Keras session graphs.
+                 K.clear_session()
+
+                 features_idx = [
+                     i
+                     for i, e in enumerate(self.all_features)
+                     if e in set(self.features)
+                 ]
+                 # TODO: Verify that features_idx are the right ones, because scaling can re-arrange columns...
+                 # TODO: no reshaped test set is available; x_test/y_test are only defined for non-recurrent models
+                 self.x_train = self.x_train_reshaped[:, :, features_idx]
+                 self.y_train = self.y_train_reshaped[:, [self.target_number, 0]]
+                 self.x_val = self.x_val_reshaped[:, :, features_idx]
+                 self.y_val = self.y_val_reshaped[:, [self.target_number, 0]]
+             else:
+                 config = config[self.target_type]
+
+                 if need_scaling and self.target_type == "regression":
+                     self.x_train = self.train_scaled[self.features]
+                     self.y_train = self.train_scaled[
+                         f"TARGET_{self.target_number}"
+                     ].rename("TARGET")
+                     self.x_val = self.val_scaled[self.features]
+                     self.y_val = self.val_scaled[
+                         f"TARGET_{self.target_number}"
+                     ].rename("TARGET")
+                     self.x_test = self.test_scaled[self.features]
+                     self.y_test = self.test_scaled[
+                         f"TARGET_{self.target_number}"
+                     ].rename("TARGET")
+                 else:
+                     self.x_train = self.train[self.features]
+                     self.y_train = self.train[f"TARGET_{self.target_number}"].rename(
+                         "TARGET"
+                     )
+                     self.x_val = self.val[self.features]
+                     self.y_val = self.val[f"TARGET_{self.target_number}"].rename(
+                         "TARGET"
+                     )
+                     self.x_test = self.test[self.features]
+                     self.y_test = self.test[f"TARGET_{self.target_number}"].rename(
+                         "TARGET"
+                     )
+
+             log_dir = get_log_dir(self.training_target_dir, model_name)
+             # instantiate model (recurrent and need_scaling are derived from the config)
+             model = ModelEngine(
+                 model_name=model_name,
+                 search_params=config["search_params"],
+                 target_type=self.target_type,
+                 create_model=config["create_model"],
+                 scaler_y=self.scaler_y,
+                 plot=self.plot,
+                 log_dir=log_dir,
+             )
+
+             start = time.time()
+             # Tuning hyperparameters
+             if perform_hyperopt:
+                 best_params = self.hyperoptimize(model)
+
+                 # save best params
+                 best_params_file = f"{self.training_target_dir}/best_params.json"
+                 try:
+                     with open(best_params_file, "r") as f:
+                         json_dict = json.load(f)
+                 except FileNotFoundError:
+                     json_dict = {}
+
+                 json_dict[model.model_name] = serialize_for_json(best_params)
+                 with open(best_params_file, "w") as f:
+                     json.dump(json_dict, f, indent=4)
+             else:
+                 try:
+                     with open(f"{self.training_target_dir}/best_params.json") as f:
+                         json_dict = json.load(f)
+                         best_params = json_dict[model_name]
+                 except Exception:
+                     raise FileNotFoundError(
+                         f"Could not find {model_name} in current data. Try running a hyperoptimization by setting `perform_hyperopt` to true"
+                     )
+
+             # Perform cross-validation of the best model on k-folds of train + val set
+             if perform_crossval:
+                 x_train_val = pd.concat([self.x_train, self.x_val, self.x_test], axis=0)
+                 y_train_val = pd.concat([self.y_train, self.y_val, self.y_test], axis=0)
+                 n_splits = 4
+                 n_samples = len(x_train_val)
+                 test_size = int(n_samples / (n_splits + 4))
+                 tscv = TimeSeriesSplit(n_splits=n_splits, test_size=test_size)
+
+                 # Store the scores
+                 cross_validation_scores = []
+
+                 for i, (train_index, val_index) in enumerate(tscv.split(x_train_val)):
+                     self.type_name = f"crossval_fold_{i}"
+
+                     if self.time_series:
+                         date_series = self.train[self.date_column].copy()
+
+                         if need_scaling:
+                             date_series = date_series.map(pd.Timestamp.fromordinal)
+
+                         # Use the actual train/val indices to extract date ranges
+                         train_start = date_series.iloc[train_index[0]]
+                         train_end = date_series.iloc[train_index[-1]]
+                         val_start = date_series.iloc[val_index[0]]
+                         val_end = date_series.iloc[val_index[-1]]
+
+                         logger.info(
+                             f"[Fold {i}] Train: {len(train_index)} samples from {train_start.date()} to {train_end.date()} | "
+                             f"Validation: {len(val_index)} samples from {val_start.date()} to {val_end.date()}"
+                         )
+                     else:
+                         logger.info(
+                             f"[Fold {i}] Train: {len(train_index)} samples | Validation: {len(val_index)} samples"
+                         )
+
+                     # Train the model and get the score
+                     if recurrent:
+                         cross_validation_score, _, _ = self.train_model(
+                             params=best_params,
+                             x_train=x_train_val[train_index],
+                             y_train=y_train_val[train_index],
+                             x_val=x_train_val[val_index],
+                             y_val=y_train_val[val_index],
+                             model=model,
+                         )
+                     else:
+                         cross_validation_score, _, _ = self.train_model(
+                             params=best_params,
+                             x_train=x_train_val.iloc[train_index],
+                             y_train=y_train_val.iloc[train_index],
+                             x_val=x_train_val.iloc[val_index],
+                             y_val=y_train_val.iloc[val_index],
+                             model=model,
+                         )
+
+                     # Append score to the list
+                     cross_validation_scores.append(cross_validation_score)
+
+                 # Calculate and log the mean score
+                 cross_validation_mean_score = pd.DataFrame(cross_validation_scores)[
+                     self.metric
+                 ].mean()
+                 logger.info(
+                     f"Best model mean cross-validation score on entire dataset: {cross_validation_mean_score}"
+                 )
+
+                 # Retrain on the entire training set, but keep the score from the cross-validation folds
+                 best_score, best_model, best_pred = self.train_model(
+                     params=best_params,
+                     x_train=pd.concat([self.x_train, self.x_val], axis=0),
+                     y_train=pd.concat([self.y_train, self.y_val], axis=0),
+                     x_val=self.x_test,
+                     y_val=self.y_test,
+                     model=model,
+                 )
+                 # override the headline metric with the cross-validation mean
+                 best_score[self.metric] = cross_validation_mean_score
+             else:
+                 # Evaluate on validation set
+                 self.type_name = "validation"
+                 best_score, best_model, best_pred = self.train_model(
+                     params=best_params,
+                     x_train=pd.concat([self.x_train, self.x_val], axis=0),
+                     y_train=pd.concat([self.y_train, self.y_val], axis=0),
+                     x_val=self.x_test,
+                     y_val=self.y_test,
+                     model=model,
+                 )
+
+             logger.info(f"Best model scores on test set: {best_score}")
+
+             # Save validation predictions
+             best_pred.to_csv(
+                 f"{self.results_dir}/pred_val.csv",
+                 index=True,
+                 header=True,
+                 index_label="ID",
+             )
+
+             # Save best model
+             model_path = best_model.save(self.results_dir)
+
+             model_path = Path(model_path).resolve()
+             best_score["MODEL_PATH"] = model_path
+
+             # Track scores
+             scores_tracking_path = f"{self.training_target_dir}/scores_tracking.csv"
+             best_score_df = pd.DataFrame([best_score])
+
+             if os.path.exists(scores_tracking_path):
+                 existing_scores = pd.read_csv(scores_tracking_path)
+                 common_cols = existing_scores.columns.intersection(
+                     best_score_df.columns
+                 )
+                 best_score_df = best_score_df[common_cols]
+                 scores_tracking = pd.concat(
+                     [existing_scores, best_score_df], ignore_index=True
+                 )
+             else:
+                 scores_tracking = best_score_df
+
+             scores_tracking.sort_values(self.metric, ascending=True, inplace=True)
+             scores_tracking.to_csv(scores_tracking_path, index=False)
+
+             # Save model training metadata
+             stop = time.time()
+             training_time = stop - start
+             model_training.best_params = best_params
+             model_training.model_path = model_path
+             model_training.training_time = training_time
+             model_training.save()
+
+             # Store metrics in DB
+             drop_cols = [
+                 "DATE",
+                 "SESSION",
+                 "TRAIN_DATA",
+                 "VAL_DATA",
+                 "FEATURES",
+                 "MODEL_NAME",
+                 "MODEL_PATH",
+             ]
+             best_score = {k: v for k, v in best_score.items() if k not in drop_cols}
+             score_data = {k.lower(): v for k, v in best_score.items()}
+
+             Score.upsert(
+                 match_fields=["model_training_id"],
+                 model_training_id=model_training.id,
+                 **score_data,
+             )
+
+             logger.info(f"Model training finished in {training_time:.2f} seconds")
+
+         # find best model type
+         scores_tracking_path = f"{self.training_target_dir}/scores_tracking.csv"
+         scores_tracking = pd.read_csv(scores_tracking_path)
+         best_score_overall = scores_tracking.iloc[0, :]
+         best_model_name = best_score_overall["MODEL_NAME"]
+
+         # Remove any .best or .keras files
+         for file_path in glob.glob(
+             os.path.join(self.training_target_dir, "*.best")
+         ) + glob.glob(os.path.join(self.training_target_dir, "*.keras")):
+             os.remove(file_path)
+         # Copy the best model into the root training folder for this target
+         best_model_path = Path(
+             f"{self.training_target_dir}/{os.path.basename(best_score_overall['MODEL_PATH'])}"
+         ).resolve()
+         copy_any(
+             best_score_overall["MODEL_PATH"],
+             best_model_path,
+         )
+
+         with open(f"{self.training_target_dir}/best_params.json", "r") as f:
+             best_model_params = json.load(f)[best_model_name]
+
+         # save model_selection results to db
+         model_selection = ModelSelection.get(model_selection.id)
+         model_selection.best_model_id = Model.find_by(
+             name=best_score_overall["MODEL_NAME"], type=self.target_type
+         ).id
+         model_selection.best_model_params = best_model_params
+         model_selection.best_model_path = best_model_path
+         model_selection.save()
+
+         logger.info(f"Best model overall is: {best_score_overall}")
+
+     def hyperoptimize(self, model: ModelEngine):
+         self.type_name = "hyperopts"
+
+         def collect_error_logs(training_target_dir: str, storage_path: str):
+             output_error_file = f"{training_target_dir}/errors.log"
+
+             with open(output_error_file, "a") as outfile:
+                 # Walk through the ray_results directory
+                 for root, dirs, files in os.walk(storage_path):
+                     # Check if 'error.txt' exists in the current directory
+                     if "error.txt" in files:
+                         error_file_path = os.path.join(root, "error.txt")
+                         logger.info(f"Processing error file: {error_file_path}")
+                         # Read and append the content of the error.txt file
+                         with open(error_file_path, "r") as infile:
+                             outfile.write(f"\n\n=== Error from {error_file_path} ===\n")
+                             outfile.write(infile.read())
+             logger.info(f"All errors written to {output_error_file}")
+
+         logger.info("Start tuning hyperparameters...")
+
+         storage_path = f"{self.results_dir}/ray_results"
+
+         tuner = Tuner(
+             trainable=with_parameters(
+                 trainable,
+                 x_train=self.x_train,
+                 y_train=self.y_train,
+                 x_val=self.x_val,
+                 y_val=self.y_val,
+                 model_name=model.model_name,
+                 target_type=self.target_type,
+                 session_name=self.session_name,
+                 target_number=self.target_number,
+                 create_model=model.create_model,
+                 type_name="hyperopts",
+                 plot=model.plot,
+             ),
+             param_space=model.search_params,
+             tune_config=TuneConfig(
+                 metric=self.metric,
+                 mode="min",
+                 search_alg=HyperOptSearch(),
+                 num_samples=self.number_of_trials,
+                 scheduler=ASHAScheduler(max_t=100, grace_period=10),
+             ),
+             run_config=RunConfig(
+                 stop={"training_iteration": 100},
+                 storage_path=storage_path,
+                 callbacks=[TBXLoggerCallback()],
+             ),
+         )
+         try:
+             results = tuner.fit()
+
+             # "min": lower RMSE/LOGLOSS is better, matching the TuneConfig mode above
+             best_result = results.get_best_result(self.metric, "min")
+             best_params = best_result.config
+             best_score = best_result.metrics
+
+             # log results
+             logger.info(f"Best hyperparameters found were:\n{best_params}")
+             logger.info(f"Best scores found were:\n{best_score}")
+             logger.info(
+                 f"Markdown table with all trials:\n{results.get_dataframe().to_markdown()}"
+             )
+             # Collect errors in a single file
+             collect_error_logs(
+                 training_target_dir=self.training_target_dir, storage_path=storage_path
+             )
+
+         finally:
+             ray.shutdown()
+
+         return best_params
+
+     def train_model(self, params, x_train, y_train, x_val, y_val, model: ModelEngine):
+         # Use the standalone training function to avoid duplication.
+         # For train_model, we pass the data directly (not as Ray references).
+         return trainable(
+             params,
+             x_train,
+             y_train,
+             x_val,
+             y_val,
+             model.model_name,
+             self.target_type,
+             self.session_name,
+             self.target_number,
+             model.create_model,
+             self.type_name,
+             model.plot,
+         )
+
+
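A hedged sketch of driving the selection loop (editorial, not part of the diff). The `dataset` and `data` placeholders are assumptions inferred from the constructor above; they would normally come from lecrapaud's feature-selection pipeline.

# Hypothetical driver; `dataset` is a lecrapaud.db Dataset (provides .path, .id,
# .get_features(), .get_all_features()), and `data` holds the six expected frames.
selector = ModelSelectionEngine(
    data=data,                  # {"train": ..., "val": ..., "test": ..., "*_scaled": ...}
    reshaped_data=None,         # only required when models_idx includes recurrent models
    target_number=1,
    target_clf=[],              # TARGET_1 not listed, so it is treated as regression
    dataset=dataset,
    models_idx=[0, 1],          # indices into all_models
    time_series=False,
    date_column=None,
    group_column=None,
)
selector.run("demo_session", perform_hyperopt=True, number_of_trials=20)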
+ def evaluate(prediction: pd.DataFrame, target_type: str):
+     """
+     Function to evaluate model performance
+
+     Args:
+         - prediction: the prediction dataframe containing TARGET and PRED columns, as well as predicted probabilities for each class for classification tasks
+         - target_type: classification or regression
+     """
+     score = {}
+     y_true = prediction["TARGET"]
+     y_pred = prediction["PRED"]
+
+     if target_type == "regression":
+         # Main metrics
+         score["RMSE"] = root_mean_squared_error(y_true, y_pred)
+         score["MAE"] = mean_absolute_error(y_true, y_pred)
+         score["MAPE"] = mean_absolute_percentage_error(y_true, y_pred)
+         score["R2"] = r2_score(y_true, y_pred)
+
+         # Robustness: avoid division by zero
+         std_target = y_true.std()
+         mean_target = y_true.mean()
+         median_target = y_true.median()
+
+         # RMSE / STD
+         score["RMSE_STD_RATIO"] = (
+             float(100 * score["RMSE"] / std_target) if std_target else 1000
+         )
+
+         # Median absolute deviations
+         mam = (y_true - mean_target).abs().median()  # median abs deviation around the mean
+         mad = (y_true - median_target).abs().median()  # median abs deviation around the median
+         score["MAM"] = mam
+         score["MAD"] = mad
+         score["MAE_MAM_RATIO"] = (
+             float(100 * score["MAE"] / mam) if mam else 1000
+         )  # MAE vs. deviation around the mean: less robust to outliers
+         score["MAE_MAD_RATIO"] = (
+             float(100 * score["MAE"] / mad) if mad else 1000
+         )  # MAE vs. deviation around the median: more stable, less sensitive to outliers
+
+     else:
+
+         labels = np.unique(y_true)
+         num_classes = labels.size
+         y_pred_proba = (
+             prediction[1] if num_classes == 2 else prediction.iloc[:, 2:].values
+         )
+         if num_classes > 2:
+             lb = LabelBinarizer(sparse_output=False)  # Change to True for sparse matrix
+             lb.fit(labels)
+             y_true_onehot = lb.transform(y_true)
+             y_pred_onehot = lb.transform(y_pred)
+
+         score["LOGLOSS"] = log_loss(y_true, y_pred_proba)
+         score["ACCURACY"] = accuracy_score(y_true, y_pred)
+         score["PRECISION"] = precision_score(
+             y_true,
+             y_pred,
+             average=("binary" if num_classes == 2 else "macro"),
+         )
+         score["RECALL"] = recall_score(
+             y_true,
+             y_pred,
+             average=("binary" if num_classes == 2 else "macro"),
+         )
+         score["F1"] = f1_score(
+             y_true,
+             y_pred,
+             average=("binary" if num_classes == 2 else "macro"),
+         )
+         score["ROC_AUC"] = float(roc_auc_score(y_true, y_pred_proba, multi_class="ovr"))
+         (
+             score["THRESHOLD"],
+             score["PRECISION_AT_THRESHOLD"],
+             score["RECALL_AT_THRESHOLD"],
+         ) = (
+             find_best_precision_threshold(prediction)
+             if num_classes == 2
+             else (None, None, None)
+         )
+     return score
+
+
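For a concrete sense of the regression branch, a minimal editorial example:

# evaluate() on a tiny regression prediction frame
toy = pd.DataFrame({"TARGET": [1.0, 2.0, 3.0, 4.0], "PRED": [1.1, 1.9, 3.2, 3.8]})
metrics = evaluate(toy, "regression")
print(metrics["RMSE"], metrics["MAE"], metrics["R2"], metrics["RMSE_STD_RATIO"])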
+ # utils
+ def get_log_dir(training_target_dir: str, model_name="test_model"):
+     """Generates a structured log directory path for TensorBoard."""
+     timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M")
+     log_dir = Path(training_target_dir) / "tensorboard" / model_name / f"run_{timestamp}"
+     log_dir.mkdir(parents=True, exist_ok=True)  # Create directories if they don't exist
+     return str(log_dir)
+
+
+ def print_scores(training_target_dir: str):
+     """
+     Monitor scores
+     """
+     scores_tracking = pd.read_csv(f"{training_target_dir}/scores_tracking.csv")
+     return scores_tracking
+
+
+ # plots
+ def plot_evaluation_for_classification(prediction: pd.DataFrame):
+     """
+     Args:
+         prediction (pd.DataFrame): a df with TARGET and PRED columns for the true and predicted values, plus class probability columns (for classification only: 0 and 1)
+     """
+     y_true = prediction["TARGET"]
+     y_pred = prediction["PRED"]
+     y_pred_proba = prediction[1] if 1 in prediction.columns else prediction["1"]
+
+     # Plot confusion matrix
+     plot_confusion_matrix(y_true, y_pred)
+
+     # Compute ROC curve and ROC area
+     fpr, tpr, thresholds = roc_curve(y_true, y_pred_proba)
+     roc_auc = auc(fpr, tpr)
+
+     plt.figure(figsize=(8, 8))
+     plt.plot(
+         fpr, tpr, color="darkorange", lw=2, label="ROC curve (area = %0.2f)" % roc_auc
+     )
+     plt.plot([0, 1], [0, 1], color="navy", lw=2, linestyle="--")
+     plt.xlim([0.0, 1.0])
+     plt.ylim([0.0, 1.05])
+     plt.xlabel("False Positive Rate")
+     plt.ylabel("True Positive Rate")
+     plt.title("ROC Curve")
+     plt.legend(loc="lower right")
+     plt.show()
+
+     # Compute precision-recall curve
+     precision, recall, _ = precision_recall_curve(y_true, y_pred_proba)
+     average_precision = average_precision_score(y_true, y_pred_proba)
+
+     plt.figure(figsize=(8, 8))
+     plt.step(recall, precision, color="b", alpha=0.2, where="post")
+     plt.fill_between(recall, precision, step="post", alpha=0.2, color="b")
+     plt.xlabel("Recall")
+     plt.ylabel("Precision")
+     plt.ylim([0.0, 1.05])
+     plt.xlim([0.0, 1.0])
+     plt.title("Precision-Recall Curve: AP={0:0.2f}".format(average_precision))
+     plt.show()
+
+
+ def plot_confusion_matrix(y_true, y_pred):
+     unique_labels = np.unique(np.concatenate((y_true, y_pred)))
+     labels = np.sort(unique_labels)  # Sort labels in numerical order
+     cm = confusion_matrix(y_true, y_pred, labels=labels)  # align rows/cols with tick labels
+
+     plt.figure(figsize=(10, 7))
+     sns.heatmap(cm, annot=True, fmt="d", cmap="viridis")
+     plt.xlabel("Predicted", fontsize=12)
+     plt.ylabel("True", fontsize=12)
+     plt.title("Confusion Matrix", fontsize=14)
+
+     plt.xticks(ticks=np.arange(len(labels)), labels=labels, fontsize=10)
+     plt.yticks(ticks=np.arange(len(labels)), labels=labels, fontsize=10)
+
+     plt.show()
+
+
+ # thresholds
+ def find_max_f1_threshold(prediction):
+     """
+     Finds the threshold that maximizes the F1 score for a binary classification task.
+
+     Parameters:
+         - prediction: DataFrame with 'TARGET' and '1' (predicted probabilities) columns
+
+     Returns:
+         - best_threshold: The threshold that maximizes the F1 score
+         - best_precision: The precision at that threshold
+         - best_recall: The recall at that threshold
+     """
+     y_true = prediction["TARGET"]
+     y_pred_proba = prediction[1] if 1 in prediction.columns else prediction["1"]
+
+     # Compute precision, recall, and thresholds
+     precision, recall, thresholds = precision_recall_curve(y_true, y_pred_proba)
+
+     # Drop the first element to align with thresholds
+     precision = precision[1:]
+     recall = recall[1:]
+
+     # Filter out trivial cases (precision or recall = 0)
+     valid = (precision > 0) & (recall > 0)
+     if not np.any(valid):
+         raise ValueError("No valid threshold with non-zero precision and recall")
+
+     precision = precision[valid]
+     recall = recall[valid]
+     thresholds = thresholds[valid]
+
+     # Compute F1 scores for each threshold
+     f1_scores = 2 * (precision * recall) / (precision + recall + 1e-10)
+
+     best_index = np.argmax(f1_scores)
+
+     best_threshold = thresholds[best_index]
+     best_precision = precision[best_index]
+     best_recall = recall[best_index]
+
+     return best_threshold, best_precision, best_recall
+
+
+ def find_best_f1_threshold(prediction, fscore_target: float):
+     """
+     Finds the highest threshold achieving at least the given F1 score target.
+
+     Parameters:
+         - prediction: DataFrame with 'TARGET' and '1' (or 1 as int) columns
+         - fscore_target: Desired minimum F1 score (between 0 and 1)
+
+     Returns:
+         - best_threshold: The highest threshold meeting the F1 target
+         - best_precision: Precision at that threshold
+         - best_recall: Recall at that threshold
+         - best_f1: Actual F1 score at that threshold
+     """
+     y_true = prediction["TARGET"]
+     y_pred_proba = prediction[1] if 1 in prediction.columns else prediction["1"]
+
+     precision, recall, thresholds = precision_recall_curve(y_true, y_pred_proba)
+
+     # Align precision/recall with thresholds
+     precision = precision[1:]
+     recall = recall[1:]
+     f1_scores = 2 * (precision * recall) / (precision + recall + 1e-10)
+
+     # Filter for thresholds meeting the F1 target
+     valid_indices = [i for i, f1 in enumerate(f1_scores) if f1 >= fscore_target]
+
+     if not valid_indices:
+         raise ValueError(f"Could not find a threshold with F1 >= {fscore_target:.2f}")
+
+     # Pick the highest threshold among valid ones
+     best_index = valid_indices[-1]
+
+     return (
+         thresholds[best_index],
+         precision[best_index],
+         recall[best_index],
+         f1_scores[best_index],
+     )
+
+
+ def find_max_precision_threshold_without_trivial_case(prediction: pd.DataFrame):
+     """
+     Finds the threshold that maximizes precision without reaching a precision of 1,
+     which would indicate that all predictions are classified as the negative class (0).
+
+     Parameters:
+         - prediction: DataFrame with 'TARGET' (true labels) and '1' (predicted probabilities) columns
+
+     Returns:
+         - best_threshold: the probability threshold that maximizes precision
+         - best_precision: the precision achieved at this threshold
+         - best_recall: the recall achieved at this threshold
+     """
+     y_true = prediction["TARGET"]
+     y_pred_proba = prediction[1] if 1 in prediction.columns else prediction["1"]
+
+     # Compute precision, recall, and thresholds
+     precision, recall, thresholds = precision_recall_curve(y_true, y_pred_proba)
+
+     # Drop the first element of precision and recall to align with thresholds
+     precision = precision[1:]
+     recall = recall[1:]
+
+     # Filter out precision == 1.0 (which might correspond to predicting only 0s)
+     valid_indices = np.where(precision < 1.0)[0]
+     if len(valid_indices) == 0:
+         raise ValueError("No valid precision values less than 1.0")
+
+     precision = precision[valid_indices]
+     recall = recall[valid_indices]
+     thresholds = thresholds[valid_indices]
+
+     # Find the index of the maximum precision
+     best_index = np.argmax(precision)
+
+     # Return the corresponding threshold, precision, and recall
+     best_threshold = thresholds[best_index]
+     best_precision = precision[best_index]
+     best_recall = recall[best_index]
+
+     return best_threshold, best_precision, best_recall
+
+
+ def find_best_precision_threshold(prediction, precision_target: float = 0.80):
+     """
+     Finds the highest threshold that achieves at least the given precision target.
+
+     Parameters:
+         prediction (pd.DataFrame): DataFrame with columns 'TARGET' and '1' (or 1 as int) for predicted probabilities
+         precision_target (float): Desired minimum precision (between 0 and 1)
+
+     Returns:
+         tuple: (threshold, precision, recall) achieving the desired precision
+     """
+     y_true = prediction["TARGET"]
+     y_pred_proba = prediction[1] if 1 in prediction.columns else prediction["1"]
+
+     precision, recall, thresholds = precision_recall_curve(y_true, y_pred_proba)
+
+     # Align lengths: thresholds has N-1 entries compared to precision/recall
+     precision = precision[1:]  # Shift to match thresholds
+     recall = recall[1:]
+
+     valid_indices = [i for i, p in enumerate(precision) if p >= precision_target]
+
+     if not valid_indices:
+         raise ValueError(
+             f"Could not find a threshold with precision >= {precision_target}"
+         )
+
+     best_idx = valid_indices[-1]  # Highest threshold with precision >= target
+
+     return thresholds[best_idx], precision[best_idx], recall[best_idx]
+
+
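The threshold helpers all consume the same prediction frame produced by ModelEngine.predict (a TARGET column plus a column 1 of positive-class probabilities). A small editorial example:

# Picking an operating point from predicted probabilities (column 1 = P(class 1))
pred_df = pd.DataFrame({
    "TARGET": [0, 0, 0, 1, 1, 1, 0, 1],
    1: [0.10, 0.40, 0.35, 0.80, 0.65, 0.90, 0.70, 0.20],
})
thr, prec, rec = find_best_precision_threshold(pred_df, precision_target=0.75)
thr_f1, prec_f1, rec_f1 = find_max_f1_threshold(pred_df)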
+ def find_best_recall_threshold(prediction, recall_target: float = 0.98):
+     """
+     Finds the highest threshold that achieves at least the given recall target.
+
+     Parameters:
+         prediction (pd.DataFrame): DataFrame with columns 'TARGET' and '1' (or 1 as int) for predicted probabilities
+         recall_target (float): Desired minimum recall (between 0 and 1)
+
+     Returns:
+         tuple: (threshold, precision, recall) achieving the desired recall, or (None, None, None) if not reachable
+     """
+     y_true = prediction["TARGET"]
+     y_pred_proba = prediction[1] if 1 in prediction.columns else prediction["1"]
+
+     precision, recall, thresholds = precision_recall_curve(y_true, y_pred_proba)
+
+     # `thresholds` has length N-1 compared to precision and recall
+     recall = recall[1:]  # Drop first element to align with thresholds
+     precision = precision[1:]
+
+     valid_indices = [i for i, r in enumerate(recall) if r >= recall_target]
+
+     if not valid_indices:
+         logger.warning(f"Could not find a threshold with recall >= {recall_target}")
+         return None, None, None
+
+     best_idx = valid_indices[-1]  # Highest threshold with recall >= target
+
+     return thresholds[best_idx], precision[best_idx], recall[best_idx]
+
+
+ def plot_threshold(prediction, threshold, precision, recall):
+     y_pred_proba = prediction[1] if 1 in prediction.columns else prediction["1"]
+     y_true = prediction["TARGET"]
+
+     predicted_positive = (y_pred_proba >= threshold).sum()
+     predicted_negative = (y_pred_proba < threshold).sum()
+     f1_scores = 2 * (precision * recall) / (precision + recall + 1e-10)
+     per_predicted_positive = predicted_positive / len(y_pred_proba)
+     per_predicted_negative = predicted_negative / len(y_pred_proba)
+
+     logger.info(
+         f"""Threshold: {threshold*100:.2f}
+     Precision: {precision*100:.2f}
+     Recall: {recall*100:.2f}
+     F1-score: {f1_scores*100:.2f}
+     % of scores over {threshold}: {predicted_positive}/{len(y_pred_proba)} = {per_predicted_positive*100:.2f}%
+     % of scores under {threshold}: {predicted_negative}/{len(y_pred_proba)} = {per_predicted_negative*100:.2f}%"""
+     )
+
+     # Visualizing the scores of positive and negative classes
+     plt.figure(figsize=(10, 6))
+     sns.histplot(
+         y_pred_proba[y_true == 1],
+         color="blue",
+         label="Positive Class",
+         bins=30,
+         kde=True,
+         alpha=0.6,
+     )
+     sns.histplot(
+         y_pred_proba[y_true == 0],
+         color="red",
+         label="Negative Class",
+         bins=30,
+         kde=True,
+         alpha=0.6,
+     )
+     plt.axvline(
+         x=threshold,
+         color="green",
+         linestyle="--",
+         label=f"Threshold at {round(threshold, 3)}",
+     )
+     plt.title("Distribution of Predicted Probabilities")
+     plt.xlabel("Predicted Probabilities")
+     plt.ylabel("Frequency")
+     plt.legend()
+     plt.show()
+     return threshold
+
+
+ # OLD - to sort out
+ def get_pred_distribution(training_target_dir: str, model_name="linear"):
+     """
+     Look at prediction distributions
+     """
+     prediction = pd.read_csv(
+         f"{training_target_dir}/{model_name}/pred_val.csv",
+         index_col="ID",
+     )
+     return prediction.describe()
+
+
+ def plot_feature_importance(training_target_dir: str, model_name="linear"):
+     """
+     Monitor the feature importance ranking to filter out irrelevant features
+     """
+     model = joblib.load(f"{training_target_dir}/{model_name}/{model_name}.best")
+     if hasattr(model, "feature_importances_"):
+         feature_importances_ = model.feature_importances_.flatten()
+     elif hasattr(model, "feature_importance"):
+         # LightGBM Booster exposes feature_importance() as a method
+         feature_importances_ = np.asarray(model.feature_importance()).flatten()
+     elif hasattr(model, "coefs_"):
+         feature_importances_ = np.mean(model.coefs_[0], axis=1).flatten()
+     elif hasattr(model, "coef_"):
+         feature_importances_ = model.coef_.flatten()
+     else:
+         feature_importances_ = []
+
+     sns.barplot(
+         data=feature_importances_,
+         orient="h",
+     )
+
+
+ def print_model_estimators(training_target_dir: str, model_name="linear"):
+     """
+     Look at a specific trained model
+     """
+     model = joblib.load(f"{training_target_dir}/{model_name}/{model_name}.best")
+     for estimator in model.estimators_:
+         logger.info(estimator.get_depth())
+
+
+ def get_model_info(model):
+     logger.info(f"Number of parameters: {model.count_params()}")
+     model.summary()