lecrapaud 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.



Files changed (60)
  1. lecrapaud/__init__.py +0 -0
  2. lecrapaud/config.py +16 -0
  3. lecrapaud/db/__init__.py +0 -0
  4. lecrapaud/db/alembic/README +1 -0
  5. lecrapaud/db/alembic/env.py +78 -0
  6. lecrapaud/db/alembic/script.py.mako +26 -0
  7. lecrapaud/db/alembic/versions/2025_04_06_1738-7390745388e4_initial_setup.py +295 -0
  8. lecrapaud/db/alembic/versions/2025_04_06_1755-40cd8d3e798e_unique_constraint_for_data.py +30 -0
  9. lecrapaud/db/alembic/versions/2025_05_23_1724-2360941fa0bd_longer_string.py +52 -0
  10. lecrapaud/db/alembic/versions/2025_05_27_1159-b96396dcfaff_add_env_to_trading_tables.py +34 -0
  11. lecrapaud/db/alembic/versions/2025_05_27_1337-40cbfc215f7c_fix_nb_character_on_portfolio.py +39 -0
  12. lecrapaud/db/alembic/versions/2025_05_27_1526-3de994115317_to_datetime.py +36 -0
  13. lecrapaud/db/alembic/versions/2025_05_27_2003-25c227c684f8_add_fees_to_transactions.py +30 -0
  14. lecrapaud/db/alembic/versions/2025_05_27_2047-6b6f2d38e9bc_double_instead_of_float.py +132 -0
  15. lecrapaud/db/alembic/versions/2025_05_31_1111-c175e4a36d68_generalise_stock_to_group.py +36 -0
  16. lecrapaud/db/alembic/versions/2025_05_31_1256-5681095bfc27_create_investment_run_and_portfolio_.py +62 -0
  17. lecrapaud/db/alembic/versions/2025_05_31_1806-339927587383_add_investment_run_id.py +107 -0
  18. lecrapaud/db/alembic/versions/2025_05_31_1834-52b809a34371_make_nullablee.py +38 -0
  19. lecrapaud/db/alembic/versions/2025_05_31_1849-3b8550297e8e_change_date_to_datetime.py +44 -0
  20. lecrapaud/db/alembic/versions/2025_05_31_1852-e6b8c95d8243_add_date_to_portfolio_history.py +30 -0
  21. lecrapaud/db/alembic/versions/2025_06_10_1136-db8cdd83563a_addnewsandoptiontodata.py +32 -0
  22. lecrapaud/db/crud.py +179 -0
  23. lecrapaud/db/models/__init__.py +11 -0
  24. lecrapaud/db/models/base.py +6 -0
  25. lecrapaud/db/models/dataset.py +124 -0
  26. lecrapaud/db/models/feature.py +46 -0
  27. lecrapaud/db/models/feature_selection.py +126 -0
  28. lecrapaud/db/models/feature_selection_rank.py +80 -0
  29. lecrapaud/db/models/model.py +41 -0
  30. lecrapaud/db/models/model_selection.py +56 -0
  31. lecrapaud/db/models/model_training.py +54 -0
  32. lecrapaud/db/models/score.py +62 -0
  33. lecrapaud/db/models/target.py +59 -0
  34. lecrapaud/db/services.py +0 -0
  35. lecrapaud/db/setup.py +58 -0
  36. lecrapaud/directory_management.py +28 -0
  37. lecrapaud/feature_engineering.py +1119 -0
  38. lecrapaud/feature_selection.py +1229 -0
  39. lecrapaud/jobs/__init__.py +13 -0
  40. lecrapaud/jobs/config.py +17 -0
  41. lecrapaud/jobs/scheduler.py +36 -0
  42. lecrapaud/jobs/tasks.py +57 -0
  43. lecrapaud/model_selection.py +1571 -0
  44. lecrapaud/predictions.py +292 -0
  45. lecrapaud/search_space.py +844 -0
  46. lecrapaud/services/__init__.py +0 -0
  47. lecrapaud/services/embedding_categorical.py +71 -0
  48. lecrapaud/services/indicators.py +309 -0
  49. lecrapaud/speed_tests/experiments.py +139 -0
  50. lecrapaud/speed_tests/test-gpu-bilstm.ipynb +261 -0
  51. lecrapaud/speed_tests/test-gpu-resnet.ipynb +166 -0
  52. lecrapaud/speed_tests/test-gpu-transformers.ipynb +254 -0
  53. lecrapaud/speed_tests/tests.ipynb +145 -0
  54. lecrapaud/speed_tests/trash.py +37 -0
  55. lecrapaud/training.py +151 -0
  56. lecrapaud/utils.py +246 -0
  57. lecrapaud-0.4.0.dist-info/LICENSE +201 -0
  58. lecrapaud-0.4.0.dist-info/METADATA +103 -0
  59. lecrapaud-0.4.0.dist-info/RECORD +60 -0
  60. lecrapaud-0.4.0.dist-info/WHEEL +4 -0
@@ -0,0 +1,1571 @@
1
+ import pandas as pd
2
+ import numpy as np
3
+ from datetime import datetime
4
+ import matplotlib.pyplot as plt
5
+ import seaborn as sns
6
+ import time
7
+ import os
8
+ import json
9
+ import warnings
10
+ import joblib
11
+ import glob
12
+ from pathlib import Path
13
+
14
+ os.environ["COVERAGE_FILE"] = str(Path(".coverage").resolve())
15
+
16
+ # ML models
17
+ from sklearn.model_selection import TimeSeriesSplit
18
+ from sklearn.calibration import CalibratedClassifierCV
19
+ from sklearn.metrics import (
20
+ mean_absolute_percentage_error,
21
+ root_mean_squared_error,
22
+ mean_absolute_error,
23
+ r2_score,
24
+ accuracy_score,
25
+ precision_score,
26
+ recall_score,
27
+ f1_score,
28
+ log_loss,
29
+ roc_auc_score,
30
+ roc_curve,
31
+ auc,
32
+ precision_recall_curve,
33
+ average_precision_score,
34
+ confusion_matrix,
35
+ )
36
+ from sklearn.preprocessing import LabelBinarizer
37
+ import lightgbm as lgb
38
+ import xgboost as xgb
39
+
40
+ # DL models
41
+ import tensorflow as tf
42
+ import keras
43
+ from keras.callbacks import EarlyStopping, TensorBoard
44
+ from keras.metrics import (
45
+ Precision,
46
+ Recall,
47
+ F1Score,
48
+ )
49
+ from keras.losses import BinaryCrossentropy, CategoricalCrossentropy
50
+ from keras.optimizers import Adam
51
+
52
+ K = tf.keras.backend
53
+ from tensorboardX import SummaryWriter
54
+
55
+ # Optimization
56
+ import ray
57
+ from ray.tune import Tuner, TuneConfig, with_parameters
58
+ from ray.train import RunConfig
59
+ from ray.tune.search.hyperopt import HyperOptSearch
60
+ from ray.tune.search.bayesopt import BayesOptSearch
61
+ from ray.tune.logger import TBXLoggerCallback
62
+ from ray.tune.schedulers import ASHAScheduler
63
+ from ray.air import session
64
+
65
+ # Internal library
66
+ from src.search_space import ml_models, dl_recurrent_models
67
+ from src.directory_management import clean_directory
68
+ from src.utils import copy_any, contains_best, logger, serialize_for_json
69
+ from src.config import PYTHON_ENV
70
+ from src.feature_selection import TARGETS_CLF, DATE_COLUMN, load_train_data
71
+ from src.db.models import Model, ModelSelection, ModelTraining, Score, Target, Dataset
72
+
73
+ # Reproducible result
74
+ keras.utils.set_random_seed(42)
75
+ np.random.seed(42)
76
+ tf.config.experimental.enable_op_determinism()
77
+
78
+
79
+ # test configuration
80
+ def test_hardware():
81
+ devices = tf.config.list_physical_devices()
82
+ logger.info("\nDevices: ", devices)
83
+
84
+ gpus = tf.config.list_physical_devices("GPU")
85
+ if gpus:
86
+ details = tf.config.experimental.get_device_details(gpus[0])
87
+ logger.info("GPU details: ", details)
88
+
89
+
90
+ # Suppress specific warning messages related to file system monitor
91
+ # logging.getLogger("ray").setLevel(logging.CRITICAL)
92
+ # logging.getLogger("ray.train").setLevel(logging.CRITICAL)
93
+ # logging.getLogger("ray.tune").setLevel(logging.CRITICAL)
94
+ # logging.getLogger("ray.autoscaler").setLevel(logging.CRITICAL)
95
+ # logging.getLogger("ray.raylet").setLevel(logging.CRITICAL)
96
+ # logging.getLogger("ray.monitor").setLevel(logging.CRITICAL)
97
+ # logging.getLogger("ray.dashboard").setLevel(logging.CRITICAL)
98
+ # logging.getLogger("ray.gcs_server").setLevel(logging.CRITICAL)
99
+
100
+ warnings.filterwarnings("ignore", category=UserWarning, module="pydantic")
101
+
102
+
103
+ # Metrics
104
+ def rmse_tf(y_true, y_pred):
105
+ y_true, y_pred = unscale_tf(y_true, y_pred)
106
+ results = K.sqrt(K.mean(K.square(y_pred - y_true)))
107
+ return results
108
+
109
+
110
+ def mae_tf(y_true, y_pred):
111
+ y_true, y_pred = unscale_tf(y_true, y_pred)
112
+ results = K.mean(K.abs(y_pred - y_true))
113
+ return results
114
+
115
+
116
+ def unscale_tf(y_true, y_pred):
117
+ if _target_type == "regression":
118
+ scale = K.constant(_scaler_y.scale_[0])
119
+ mean = K.constant(_scaler_y.mean_[0])
120
+
121
+ y_true = K.mul(y_true, scale)
122
+ y_true = K.bias_add(y_true, mean)
123
+
124
+ y_pred = K.mul(y_pred, scale)
125
+ y_pred = K.bias_add(y_pred, mean)
126
+ return y_true, y_pred
127
+
128
+
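For reference, unscale_tf reproduces the inverse transform of the fitted y-scaler: a scaled value times scale_ plus mean_. A minimal NumPy sketch of the same arithmetic, assuming _scaler_y is a scikit-learn StandardScaler fitted on a 1-D target (scale_ and mean_ are the only attributes used above):

    import numpy as np
    from sklearn.preprocessing import StandardScaler

    # Toy 1-D target, fitted the same way the pipeline would fit _scaler_y
    y = np.array([[1.0], [2.0], [3.0], [4.0]])
    scaler_y = StandardScaler().fit(y)
    y_scaled = scaler_y.transform(y)

    # Manual unscaling: multiply by scale_ and add mean_, as unscale_tf does with Keras ops
    y_unscaled = y_scaled * scaler_y.scale_[0] + scaler_y.mean_[0]
    assert np.allclose(y_unscaled, scaler_y.inverse_transform(y_scaled))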
129
+ def recall_tf(y_true, y_pred):
130
+ y_true = K.ones_like(y_true)
131
+ true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
132
+ all_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
133
+
134
+ recall = true_positives / (all_positives + K.epsilon())
135
+ return recall
136
+
137
+
138
+ def precision_tf(y_true, y_pred):
139
+ y_true = K.ones_like(y_true)
140
+ true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
141
+
142
+ predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
143
+ precision = true_positives / (predicted_positives + K.epsilon())
144
+ return precision
145
+
146
+
147
+ def f1_score_tf(y_true, y_pred):
148
+ precision = precision_tf(y_true, y_pred)
149
+ recall = recall_tf(y_true, y_pred)
150
+ return 2 * ((precision * recall) / (precision + recall + K.epsilon()))
151
+
152
+
153
+ def get_log_dir(training_target_dir: str, model_name="test_model"):
154
+ """Generates a structured log directory path for TensorBoard."""
155
+ timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M")
156
+ log_dir = (
157
+ Path(training_target_dir + "/tensorboard") / model_name / f"run_{timestamp}"
158
+ )
159
+ log_dir.mkdir(parents=True, exist_ok=True) # Create directories if they don't exist
160
+ return str(log_dir)
161
+
162
+
163
+ # Functions to fit & evaluate models
164
+ def fit_sklearn(x_train, y_train, x_val, y_val, create_model, params, config):
165
+
166
+ # Create & Compile the model
167
+ model = create_model(**params)
168
+
169
+ # Train the model
170
+ logger.info("Fitting the model...")
171
+ logger.info(f"x_train shape: {x_train.shape}, x_val shape: {x_val.shape}")
172
+ logger.info(f"y_train shape: {y_train.shape}, y_val shape: {y_val.shape}")
173
+
174
+ model.fit(x_train, y_train)
175
+
176
+ if (
177
+ _target_type == "classification"
178
+ and "loss" in model.get_params().keys()
179
+ and "hinge" in model.get_params()["loss"]
180
+ ):
181
+ # This is for SVC models with hinge loss
182
+ # You should use CalibratedClassifierCV when you are working with classifiers that do not natively output well-calibrated probability estimates.
183
+ # TODO: investigate if we should use calibration for random forest, gradient boosting models, and bagging models
184
+ logger.info(
185
+ f"Re-Calibrating {config["model_name"]} to get predict probabilities..."
186
+ )
187
+ calibrator = CalibratedClassifierCV(model, cv="prefit", n_jobs=-1)
188
+ model = calibrator.fit(x_train, y_train)
189
+
190
+ # set model_name after calibrator
191
+ model.model_name = config["model_name"]
192
+
193
+ logger.info(f"Successfully created a {model.model_name} at {datetime.now()}")
194
+
195
+ return model
196
+
197
+
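To illustrate the hinge-loss branch above: estimators trained with a hinge loss expose decision_function but no predict_proba, so the already-fitted model is wrapped in a prefit calibrator. A small sketch of that pattern on synthetic data, assuming scikit-learn's SGDClassifier stands in for whatever hinge-loss model the search space provides:

    import numpy as np
    from sklearn.linear_model import SGDClassifier
    from sklearn.calibration import CalibratedClassifierCV

    rng = np.random.default_rng(42)
    x_train = rng.normal(size=(200, 5))
    y_train = (x_train[:, 0] > 0).astype(int)

    svm = SGDClassifier(loss="hinge", random_state=42).fit(x_train, y_train)
    # hinge loss -> no predict_proba, so calibrate the prefit model to obtain probabilities
    calibrated = CalibratedClassifierCV(svm, cv="prefit", n_jobs=-1).fit(x_train, y_train)
    probabilities = calibrated.predict_proba(x_train[:5])  # shape (5, 2)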
198
+ def fit_boosting(x_train, y_train, x_val, y_val, create_model, params, config):
199
+ """
200
+ This uses the LightGBM or XGBoost C++ libraries.
201
+ """
202
+ lightGBM = create_model == "lgb"
203
+
204
+ # Datasets
205
+ Dataset = lgb.Dataset if lightGBM else xgb.DMatrix
206
+ train_data = Dataset(x_train, label=y_train)
207
+ val_data = Dataset(x_val, label=y_val)
208
+
209
+ # Callbacks
210
+ log_dir = get_log_dir(_training_target_dir, create_model)
211
+
212
+ # Create a TensorBoardX writer
213
+ writer = SummaryWriter(log_dir)
214
+ evals_result = {}
215
+
216
+ # Training
217
+ labels = np.unique(y_train)
218
+ num_class = (
219
+ labels.size if _target_type == "classification" and labels.size > 2 else 1
220
+ )
221
+ logger.info("Fitting the model...")
222
+ logger.info(f"x_train shape: {x_train.shape}, x_val shape: {x_val.shape}")
223
+ logger.info(f"y_train shape: {y_train.shape}, y_val shape: {y_val.shape}")
224
+
225
+ if lightGBM:
226
+
227
+ def tensorboard_callback(env):
228
+ for i, metric in enumerate(env.evaluation_result_list):
229
+ metric_name, _, metric_value, _ = metric
230
+ writer.add_scalar(
231
+ f"LightGBM/{metric_name}", metric_value, env.iteration
232
+ )
233
+
234
+ loss = (
235
+ "regression"
236
+ if _target_type == "regression"
237
+ else ("binary" if num_class <= 2 else "multiclass")
238
+ )
239
+ eval_metric = (
240
+ "rmse"
241
+ if _target_type == "regression"
242
+ else ("binary_logloss" if num_class <= 2 else "multi_logloss")
243
+ )
244
+ model = lgb.train(
245
+ params={
246
+ **params["model_params"],
247
+ "objective": loss,
248
+ "metric": eval_metric,
249
+ "num_class": num_class,
250
+ },
251
+ num_boost_round=params["num_boost_round"],
252
+ train_set=train_data,
253
+ valid_sets=[train_data, val_data],
254
+ valid_names=["train", "val"],
255
+ callbacks=[
256
+ lgb.early_stopping(stopping_rounds=params["early_stopping_rounds"]),
257
+ lgb.record_evaluation(evals_result),
258
+ tensorboard_callback,
259
+ ],
260
+ )
261
+ else:
262
+
263
+ class TensorBoardCallback(xgb.callback.TrainingCallback):
264
+
265
+ def __init__(self, log_dir: str):
266
+ self.writer = SummaryWriter(log_dir=log_dir)
267
+
268
+ def after_iteration(
269
+ self,
270
+ model,
271
+ epoch: int,
272
+ evals_log: xgb.callback.TrainingCallback.EvalsLog,
273
+ ) -> bool:
274
+ if not evals_log:
275
+ return False
276
+
277
+ for data, metric in evals_log.items():
278
+ for metric_name, log in metric.items():
279
+ score = log[-1][0] if isinstance(log[-1], tuple) else log[-1]
280
+ self.writer.add_scalar(f"XGBoost/{data}", score, epoch)
281
+
282
+ return False
283
+
284
+ tensorboard_callback = TensorBoardCallback(log_dir)
285
+
286
+ loss = (
287
+ "reg:squarederror"
288
+ if _target_type == "regression"
289
+ else ("binary:logistic" if num_class <= 2 else "multi:softprob")
290
+ )
291
+ eval_metric = (
292
+ "rmse"
293
+ if _target_type == "regression"
294
+ else ("logloss" if num_class <= 2 else "mlogloss")
295
+ )
296
+ model = xgb.train(
297
+ params={
298
+ **params["model_params"],
299
+ "objective": loss,
300
+ "eval_metric": eval_metric,
301
+ "num_class": num_class,
302
+ },
303
+ num_boost_round=params["num_boost_round"],
304
+ dtrain=train_data,
305
+ evals=[(val_data, "val"), (train_data, "train")],
306
+ callbacks=[
307
+ xgb.callback.EarlyStopping(
308
+ rounds=params["early_stopping_rounds"], save_best=True
309
+ ),
310
+ xgb.callback.EvaluationMonitor(), # This shows evaluation results at each iteration
311
+ tensorboard_callback,
312
+ ],
313
+ evals_result=evals_result, # Record evaluation result
314
+ verbose_eval=0,
315
+ )
316
+
317
+ model.model_name = create_model
318
+ logger.info(f"Successfully created a {model.model_name} at {datetime.now()}")
319
+
320
+ # Close the writer after training is done
321
+ writer.close()
322
+
323
+ if _plot:
324
+ # Plot loss per epoch
325
+ train_loss = evals_result["train"][eval_metric]
326
+ val_loss = evals_result["val"][eval_metric]
327
+ logs = pd.DataFrame({"train": train_loss, "val": val_loss})
328
+
329
+ plt.figure(figsize=(14, 4))
330
+ plt.plot(logs.loc[:, "train"], lw=2, label="Training loss")
331
+ plt.plot(logs.loc[:, "val"], lw=2, label="Validation loss")
332
+ plt.xlabel("Epoch")
333
+ plt.ylabel("Loss")
334
+ plt.legend()
335
+ plt.show()
336
+
337
+ return model
338
+
339
+
340
+ def fit_recurrent(x_train, y_train, x_val, y_val, create_model, params, config):
341
+
342
+ # Create the model
343
+ labels = np.unique(y_train[:, 0])
344
+ num_class = labels.size if _target_type == "classification" else None
345
+ input_shape = (x_train.shape[1], x_train.shape[2])
346
+ model = create_model(params, input_shape, _target_type, num_class)
347
+
348
+ # Compile the model
349
+ loss = (
350
+ rmse_tf
351
+ if _target_type == "regression"
352
+ else (
353
+ BinaryCrossentropy(from_logits=False)
354
+ if num_class <= 2
355
+ else CategoricalCrossentropy(from_logits=False)
356
+ )
357
+ )
358
+ optimizer = Adam(learning_rate=params["learning_rate"], clipnorm=params["clipnorm"])
359
+ metrics = (
360
+ [mae_tf]
361
+ if _target_type == "regression"
362
+ else (
363
+ ["accuracy", Precision(), Recall()]
364
+ if num_class <= 2
365
+ else ["categorical_accuracy"]
366
+ )
367
+ )
368
+ model.compile(optimizer=optimizer, loss=loss, metrics=metrics)
369
+
370
+ # Callbacks
371
+ log_dir = get_log_dir(_training_target_dir, model.model_name)
372
+
373
+ tensorboard_callback = TensorBoard(log_dir=log_dir)
374
+ early_stopping_callback = EarlyStopping(
375
+ monitor="val_loss", patience=3, restore_best_weights=True, start_from_epoch=5
376
+ )
377
+
378
+ # Custom callbacks
379
+ class PrintTrainableWeights(keras.callbacks.Callback):
380
+ def on_epoch_end(self, epoch, logs={}):
381
+ logger.info(model.trainable_variables)
382
+
383
+ class GradientCalcCallback(keras.callbacks.Callback):
384
+ def __init__(self):
385
+ self.epoch_gradient = []
386
+
387
+ def get_gradient_func(self, model):
388
+ # grads = K.gradients(model.total_loss, model.trainable_weights)
389
+ grads = K.gradients(model.loss, model.trainable_weights)
390
+ # inputs = model.model.inputs + model.targets + model.sample_weights
391
+ # use below line of code if above line doesn't work for you
392
+ # inputs = model.model._feed_inputs + model.model._feed_targets + model.model._feed_sample_weights
393
+ inputs = (
394
+ model._feed_inputs + model._feed_targets + model._feed_sample_weights
395
+ )
396
+ func = K.function(inputs, grads)
397
+ return func
398
+
399
+ def on_epoch_end(self, epoch, logs=None):
400
+ get_gradient = self.get_gradient_func(model)
401
+ grads = get_gradient([x_val, y_val[:, 0], np.ones(len(y_val[:, 0]))])
402
+ self.epoch_gradient.append(grads)
403
+
404
+ # Train the model
405
+ if _target_type == "classification" and num_class > 2:
406
+ lb = LabelBinarizer(sparse_output=False) # Change to True for sparse matrix
407
+ lb.fit(labels)
408
+ y_train = lb.transform(y_train[:, 0].flatten())
409
+ y_val = lb.transform(y_val[:, 0].flatten())
410
+ else:
411
+ y_train = y_train[:, 0].flatten()
412
+ y_val = y_val[:, 0].flatten()
413
+
414
+ logger.info("Fitting the model...")
415
+ logger.info(f"x_train shape: {x_train.shape}, x_val shape: {x_val.shape}")
416
+ logger.info(f"y_train shape: {y_train.shape}, y_val shape: {y_val.shape}")
417
+
418
+ history = model.fit(
419
+ x_train,
420
+ y_train,
421
+ batch_size=params["batch_size"],
422
+ verbose=0,
423
+ epochs=params["epochs"],
424
+ shuffle=False,
425
+ validation_data=(x_val, y_val),
426
+ callbacks=[early_stopping_callback, tensorboard_callback],
427
+ )
428
+
429
+ logger.info(f"Successfully created a {model.model_name} at {datetime.now()}")
430
+ # logger.info(pd.DataFrame(gradiant.epoch_gradient))
431
+
432
+ if _plot:
433
+ # Plot loss per epoch
434
+ logs = pd.DataFrame(history.history)
435
+
436
+ plt.figure(figsize=(14, 4))
437
+ plt.plot(logs.loc[:, "loss"], lw=2, label="Training loss")
438
+ plt.plot(logs.loc[:, "val_loss"], lw=2, label="Validation loss")
439
+ plt.xlabel("Epoch")
440
+ plt.ylabel("Loss")
441
+ plt.legend()
442
+ plt.show()
443
+
444
+ return model
445
+
446
+
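For the recurrent path, the arrays handed to train_model are 3-D features of shape (samples, timesteps, features) and a two-column y whose first column holds the target and whose second column holds the original row index (re-attached to the predictions later). A rough sketch with made-up dimensions:

    import numpy as np

    n_samples, max_timesteps, n_features = 128, 30, 12
    x_train = np.zeros((n_samples, max_timesteps, n_features), dtype=np.float32)
    # column 0: TARGET values, column 1: original row index
    y_train = np.column_stack([
        np.random.default_rng(0).normal(size=n_samples),
        np.arange(n_samples),
    ])

    # train_model keeps only the last `timesteps` steps of every sequence
    timesteps = 10
    x_train_short = x_train[:, -timesteps:, :]  # shape (128, 10, 12)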
447
+ def predict(
448
+ model, data: pd.DataFrame, target_type: str, config: dict, threshold: float = 0.5
449
+ ):
450
+ """Function to get prediction from model. Support sklearn, keras and boosting models such as xgboost and lgboost
451
+
452
+ Args:
453
+ - model: the trained model used to make predictions
454
+ - data: the data for prediction
455
+ - target_type: classification or regression
456
+ - config: dict containing model config
457
+ """
458
+ if config["recurrent"] or model.model_name in ["lgb", "xgb"]:
459
+ # keras, lgb & xgb
460
+ if model.model_name == "lgb":
461
+ # Direct prediction for LightGBM
462
+ pred = model.predict(data)
463
+ elif model.model_name == "xgb":
464
+ # Convert val_data to DMatrix for XGBoost
465
+ d_data = xgb.DMatrix(data)
466
+ pred = model.predict(d_data)
467
+ else:
468
+ # Reshape (flatten) for keras if not multiclass
469
+ pred = model.predict(data)
470
+ if pred.shape[1] == 1:
471
+ pred = pred.reshape(-1)
472
+
473
+ if target_type == "classification":
474
+ num_class = pred.shape[1] if len(pred.shape) > 1 else 2
475
+
476
+ if num_class <= 2:
477
+ # For binary classification, concatenate the predicted probabilities for both classes
478
+ pred_df = pd.DataFrame(
479
+ {
480
+ 0: 1 - pred, # Probability of class 0
481
+ 1: pred, # Probability of class 1
482
+ },
483
+ )
484
+ else:
485
+ # For multi-class classification, use the predicted probabilities for each class
486
+ pred_df = pd.DataFrame(pred, columns=range(num_class))
487
+
488
+ # Get final predictions (argmax for multi-class, threshold for binary)
489
+ if num_class == 2:
490
+ pred_df["PRED"] = np.where(
491
+ pred_df[1] >= threshold, 1, 0
492
+ ) # Class 1 if prob >= threshold
493
+ else:
494
+ pred_df["PRED"] = pred_df.idxmax(
495
+ axis=1
496
+ ) # Class with highest probability for multiclasses
497
+
498
+ # Reorder columns to show predicted class first, then probabilities
499
+ pred = pred_df[["PRED"] + list(range(num_class))]
500
+
501
+ else:
502
+ pred = pd.Series(pred, name="PRED")
503
+
504
+ # set index for lgb and xgb (for keras, as we use np array, we need to set index outside)
505
+ if model.model_name in ["lgb", "xgb"]:
506
+ pred.index = data.index
507
+ else:
508
+ # sk learn
509
+ pred = pd.Series(model.predict(data), index=data.index, name="PRED")
510
+ if target_type == "classification":
511
+ pred_proba = pd.DataFrame(
512
+ model.predict_proba(data),
513
+ index=data.index,
514
+ columns=[
515
+ int(c) if isinstance(c, float) and c.is_integer() else c
516
+ for c in model.classes_
517
+ ],
518
+ )
519
+
520
+ # Apply threshold for binary classification
521
+ if len(model.classes_) == 2:
522
+ positive_class = model.classes_[1] # Assuming classes are ordered
523
+ pred = (pred_proba[positive_class] >= threshold).astype(int)
524
+ pred.name = "PRED"
525
+
526
+ pred = pd.concat([pred, pred_proba], axis=1)
527
+
528
+ return pred
529
+
530
+
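A minimal usage sketch for predict with a plain scikit-learn classifier on synthetic data; the model_name attribute and the config keys mimic what fit_sklearn sets, and the import path is an assumption of this example:

    import numpy as np
    import pandas as pd
    from sklearn.linear_model import LogisticRegression
    # from lecrapaud.model_selection import predict  # assumed import path

    rng = np.random.default_rng(0)
    X = pd.DataFrame(rng.normal(size=(100, 3)), columns=["f1", "f2", "f3"])
    y = (X["f1"] > 0).astype(int)

    clf = LogisticRegression().fit(X, y)
    clf.model_name = "logistic"  # predict() reads this attribute to pick the sklearn branch

    pred = predict(clf, X, target_type="classification", config={"recurrent": False}, threshold=0.5)
    # pred is a DataFrame with a PRED column plus one probability column per class (0 and 1)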
531
+ def evaluate(prediction: pd.DataFrame, target_type: str):
532
+ """
533
+ Function to evaluate model performance
534
+
535
+ Args:
536
+ - prediction: the prediction dataframe containing TARGET and PRED columns, as well as predicted probabilities for each class for classification tasks
537
+ - target_type: classification or regression
538
+ """
539
+ score = {}
540
+ y_true = prediction["TARGET"]
541
+ y_pred = prediction["PRED"]
542
+
543
+ if target_type == "regression":
544
+ # Main metrics
545
+ score["RMSE"] = root_mean_squared_error(y_true, y_pred)
546
+ score["MAE"] = mean_absolute_error(y_true, y_pred)
547
+ score["MAPE"] = mean_absolute_percentage_error(y_true, y_pred)
548
+ score["R2"] = r2_score(y_true, y_pred)
549
+
550
+ # Robustness: avoid division by zero
551
+ std_target = y_true.std()
552
+ mean_target = y_true.mean()
553
+ median_target = y_true.median()
554
+
555
+ # RMSE / STD
556
+ score["RMSE_STD_RATIO"] = (
557
+ float(100 * score["RMSE"] / std_target) if std_target else 1000
558
+ )
559
+
560
+ # Median absolute deviation (MAD)
561
+ mam = (y_true - mean_target).abs().median() # Median Abs around Mean
562
+ mad = (y_true - median_target).abs().median() # Median Abs around Median
563
+ score["MAM"] = mam
564
+ score["MAD"] = mad
565
+ score["MAE_MAM_RATIO"] = (
566
+ float(100 * score["MAE"] / mam) if mam else 1000
567
+ ) # MAE / median absolute deviation around the mean: less robust to outliers
568
+ score["MAE_MAD_RATIO"] = (
569
+ float(100 * score["MAE"] / mad) if mad else 1000
570
+ ) # MAE / MAD (median absolute deviation around the median): more stable, less sensitive to outliers
571
+
572
+ else:
573
+
574
+ labels = np.unique(y_true)
575
+ num_classes = labels.size
576
+ y_pred_proba = (
577
+ prediction[1] if num_classes == 2 else prediction.iloc[:, 2:].values
578
+ )
579
+ if num_classes > 2:
580
+ lb = LabelBinarizer(sparse_output=False) # Change to True for sparse matrix
581
+ lb.fit(labels)
582
+ y_true_onehot = lb.transform(y_true)
583
+ y_pred_onehot = lb.transform(y_pred)
584
+
585
+ score["LOGLOSS"] = log_loss(y_true, y_pred_proba)
586
+ score["ACCURACY"] = accuracy_score(y_true, y_pred)
587
+ score["PRECISION"] = precision_score(
588
+ y_true,
589
+ y_pred,
590
+ average=("binary" if num_classes == 2 else "macro"),
591
+ )
592
+ score["RECALL"] = recall_score(
593
+ y_true,
594
+ y_pred,
595
+ average=("binary" if num_classes == 2 else "macro"),
596
+ )
597
+ score["F1"] = f1_score(
598
+ y_true,
599
+ y_pred,
600
+ average=("binary" if num_classes == 2 else "macro"),
601
+ )
602
+ score["ROC_AUC"] = float(roc_auc_score(y_true, y_pred_proba, multi_class="ovr"))
603
+ (
604
+ score["THRESHOLD"],
605
+ score["PRECISION_AT_THRESHOLD"],
606
+ score["RECALL_AT_THRESHOLD"],
607
+ ) = (
608
+ find_best_precision_threshold(prediction)
609
+ if num_classes == 2
610
+ else (None, None, None)
611
+ )
612
+ return score
613
+
614
+
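A quick sketch of evaluate on a toy regression prediction frame; with these four rows the squared errors average 0.025, so RMSE is about 0.158 and R2 is 0.98 (the returned dict also carries the robustness ratios computed above). The import path is an assumption of this example:

    import pandas as pd
    # from lecrapaud.model_selection import evaluate  # assumed import path

    toy_prediction = pd.DataFrame({
        "TARGET": [1.0, 2.0, 3.0, 4.0],
        "PRED":   [1.1, 1.9, 3.2, 3.8],
    })
    scores = evaluate(toy_prediction, target_type="regression")
    # scores["RMSE"] ~= 0.158, scores["MAE"] = 0.15, scores["R2"] = 0.98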
615
+ def train_model(params, x_train, y_train, x_val, y_val, config):
616
+ if "_type_name" in config.keys() and config["_type_name"] == "hyperopts":
617
+ global _target_number
618
+ global _target_type
619
+ global _session_name
620
+ global _plot
621
+ global _type_name
622
+ global _scaler_y
623
+ global _training_target_dir
624
+ _target_number = config["_target_number"]
625
+ _target_type = config["_target_type"]
626
+ _session_name = config["_session_name"]
627
+ _plot = config["_plot"]
628
+ _type_name = config["_type_name"]
629
+ _scaler_y = config["_scaler_y"]
630
+ _training_target_dir = config["_training_target_dir"]
631
+
632
+ # warnings.filterwarnings("ignore", category=UserWarning, module="pydantic")
633
+ # logging.getLogger("ray").setLevel(logging.CRITICAL)
634
+ # logging.getLogger("ray.tune").setLevel(logging.CRITICAL)
635
+ # logging.getLogger("ray.raylet").setLevel(logging.CRITICAL)
636
+ # logging.getLogger("raylet").setLevel(logging.CRITICAL)
637
+
638
+ logger.info(
639
+ f"TARGET_{_target_number} - Training a {config['model_name']} at {datetime.now()} : {_session_name}, TARGET_{_target_number}"
640
+ )
641
+
642
+ recurrent = config["recurrent"]
643
+ create_model = config["create_model"]
644
+
645
+ if recurrent:
646
+ timesteps = params["timesteps"]
647
+ x_train = x_train[:, -timesteps:, :]
648
+ x_val = x_val[:, -timesteps:, :]
649
+
650
+ # Compile and fit model on train set
651
+ start = time.time()
652
+ if recurrent:
653
+ fit = fit_recurrent
654
+ elif (create_model == "lgb") or (create_model == "xgb"):
655
+ fit = fit_boosting
656
+ else:
657
+ fit = fit_sklearn
658
+ model = fit(
659
+ x_train,
660
+ y_train,
661
+ x_val,
662
+ y_val,
663
+ create_model,
664
+ params=params,
665
+ config=config,
666
+ )
667
+ stop = time.time()
668
+
669
+ # Prediction on val set
670
+ y_pred = predict(model, x_val, _target_type, config)
671
+
672
+ # fix for recurrent model because x_val has no index as it is a 3D np array
673
+ if config["recurrent"]:
674
+ y_val = pd.DataFrame(y_val, columns=["TARGET", "index"]).set_index("index")
675
+ y_pred.index = y_val.index
676
+
677
+ prediction = pd.concat([y_val, y_pred], axis=1)
678
+
679
+ # Unscale the data
680
+ if config["need_scaling"] and _target_type == "regression":
681
+ # scaler_y needs 2D array with shape (-1, 1)
682
+ prediction.loc[:, "TARGET"] = _scaler_y.inverse_transform(
683
+ prediction[["TARGET"]].values
684
+ )
685
+ prediction.loc[:, "PRED"] = _scaler_y.inverse_transform(
686
+ prediction[["PRED"]].values
687
+ )
688
+
689
+ # Evaluate model
690
+ score = {
691
+ "DATE": datetime.now(),
692
+ "SESSION": _session_name,
693
+ "TRAIN_DATA": x_train.shape[0],
694
+ "VAL_DATA": x_val.shape[0],
695
+ "FEATURES": x_train.shape[-1],
696
+ "MODEL_NAME": model.model_name,
697
+ "TYPE": _type_name,
698
+ "TRAINING_TIME": stop - start,
699
+ "EVAL_DATA_STD": prediction["TARGET"].std(),
700
+ }
701
+
702
+ score.update(evaluate(prediction, _target_type))
703
+
704
+ if _type_name == "hyperopts":
705
+ session.report(metrics=score)
706
+ ray.tune.report(metrics=score)
707
+ return score
708
+
709
+ return score, model, prediction
710
+
711
+
712
+ # Main training function
713
+ def model_selection(
714
+ dataset_id: int,
715
+ models_idx: list,
716
+ target_number: int,
717
+ session_name,
718
+ perform_hyperoptimization=True,
719
+ perform_crossval=False,
720
+ number_of_trials=20,
721
+ plot=True,
722
+ clean_dir=False, # TODO: currently unused because feature_selection now lives in the target directory
723
+ preserve_model=True,
724
+ reshaped_data=None,
725
+ data=None,
726
+ ):
727
+ """
728
+ Selects the best models based on a target variable, optionally performing hyperparameter optimization
729
+ and cross-validation, and manages outputs in a session-specific directory.
730
+
731
+ Args:
732
+ models_idx (list):
733
+ A list of indices or identifiers representing the models to evaluate.
734
+ Each identifier corresponds to a predefined or available model.
735
+
736
+ target_number (int):
737
+ The number of the target variable (e.g., column index or predefined target) to predict.
738
+ This determines the dataset's output variable for training and evaluation.
739
+
740
+ session_name (str):
741
+ A name for the current session, used to organize and store results
742
+ (e.g., logs, metrics, trained models) in a session-specific directory.
743
+
744
+ perform_hyperoptimization (bool, optional):
745
+ Whether to perform hyperparameter optimization for the models.
746
+ If `True`, the function will attempt to tune the hyperparameters of each model.
747
+ Defaults to `True`.
748
+
749
+ perform_crossval (bool, optional):
750
+ Whether to perform cross-validation to evaluate model performance.
751
+ If `True`, the function will use cross-validation to compute metrics.
752
+ Defaults to `False`.
753
+
754
+ number_of_trials (int, optional):
755
+ The number of trials to run for hyperparameter optimization.
756
+ Ignored if `perform_hyperoptimization` is `False`.
757
+ Defaults to `20`.
758
+
759
+ plot (bool, optional):
760
+ Whether to enable plotting during the process.
761
+ If `True`, plot will be displayed.
762
+ Defaults to `True`.
763
+
764
+ clean_dir (bool, optional):
765
+ Whether to clean the entire target training directory before starting the process.
766
+ If `True`, any existing files in the target training directory will be removed.
767
+ Defaults to `False`.
768
+
769
+ preserve_model (bool, optional):
770
+ Whether to keep an existing best model for a given model type.
771
+ If `True` and a best model is already present in that model's directory, the search for it is skipped; if `False`, the search is re-run.
772
+ Defaults to `True`.
773
+
774
+ Returns:
775
+ None
776
+ The function runs the model selection process and outputs results
777
+ (e.g., logs, metrics, and optionally models) to the session directory.
778
+ """
779
+ global _target_number
780
+ global _target_type
781
+ global _session_name
782
+ global _plot
783
+ global _type_name
784
+ global _scaler_y
785
+ global _training_target_dir
786
+
787
+ global_vars = [
788
+ "_target_number",
789
+ "_target_type",
790
+ "_session_name",
791
+ "_plot",
792
+ "_type_name",
793
+ "_scaler_y",
794
+ "_training_target_dir",
795
+ ]
796
+
797
+ _target_number = target_number
798
+ _target_type = "classification" if target_number in TARGETS_CLF else "regression"
799
+ _session_name = session_name
800
+ _plot = plot
801
+
802
+ if dataset_id is None:
803
+ raise ValueError("dataset_id is not provided.")
804
+
805
+ dataset = Dataset.get(dataset_id)
806
+ dataset_dir = dataset.path
807
+
808
+ training_target_dir = f"{dataset_dir}/TARGET_{_target_number}"
809
+ _training_target_dir = training_target_dir
810
+
811
+ metric = "RMSE" if _target_type == "regression" else "LOGLOSS"
812
+
813
+ # load features, scalers and data
814
+ features = dataset.get_features(target_number)
815
+ all_features = dataset.get_all_features()
816
+
817
+ if data:
818
+ train = data["train"]
819
+ val = data["val"]
820
+ train_scaled = data["train_scaled"]
821
+ val_scaled = data["val_scaled"]
822
+ _scaler_y = (
823
+ data["scalers_y"][f"scaler_y_{target_number}"]
824
+ if _target_type == "regression"
825
+ else None
826
+ )
827
+ else:
828
+ train, val, train_scaled, val_scaled, _scaler_y = load_train_data(
829
+ dataset_dir, target_number, _target_type
830
+ )
831
+
832
+ list_models = ml_models + dl_recurrent_models
833
+
834
+ if any(list_models[i].get("recurrent") for i in models_idx):
835
+ if reshaped_data is None:
836
+ raise ValueError("reshaped_data is not provided.")
837
+
838
+ logger.info("Loading reshaped data...")
839
+ x_train_reshaped = reshaped_data["x_train_reshaped"]
840
+ y_train_reshaped = reshaped_data["y_train_reshaped"]
841
+ x_val_reshaped = reshaped_data["x_val_reshaped"]
842
+ y_val_reshaped = reshaped_data["y_val_reshaped"]
843
+
844
+ # create model selection in db
845
+ target = Target.find_by(name=f"TARGET_{target_number}")
846
+ model_selection = ModelSelection.upsert(
847
+ match_fields=["target_id", "dataset_id"],
848
+ target_id=target.id,
849
+ dataset_id=dataset.id,
850
+ )
851
+
852
+ # recurrent models start at index 9 # len(list_models)
853
+ for i in models_idx:
854
+ config = list_models[i]
855
+ if config["recurrent"] is False and config[_target_type] is None:
856
+ continue # for naive bayes models that cannot be used in regression
857
+
858
+ results_dir = f"{training_target_dir}/{config['model_name']}"
859
+ if not os.path.exists(f"{results_dir}"):
860
+ os.makedirs(f"{results_dir}")
861
+ elif preserve_model and contains_best(results_dir):
862
+ continue
863
+ elif perform_hyperoptimization:
864
+ clean_directory(results_dir)
865
+
866
+ logger.info(f"Training a {config['model_name']}")
867
+ model = Model.upsert(
868
+ match_fields=["name", "type"],
869
+ name=config["model_name"],
870
+ type=_target_type,
871
+ )
872
+ model_training = ModelTraining.upsert(
873
+ match_fields=["model_id", "model_selection_id"],
874
+ model_id=model.id,
875
+ model_selection_id=model_selection.id,
876
+ )
877
+
878
+ # getting data
879
+ if config["recurrent"]:
880
+ # Clear cluster from previous Keras session graphs.
881
+ K.clear_session()
882
+
883
+ features_idx = [i for i, e in enumerate(all_features) if e in set(features)]
884
+ # TODO: Verify that features_idx are the right one, because scaling can re-arrange columns...
885
+ x_train = x_train_reshaped[:, :, features_idx]
886
+ y_train = y_train_reshaped[:, [target_number, 0]]
887
+ x_val = x_val_reshaped[:, :, features_idx]
888
+ y_val = y_val_reshaped[:, [target_number, 0]]
889
+ else:
890
+ new_config = config[_target_type]
891
+ new_config["model_name"] = config["model_name"]
892
+ new_config["recurrent"] = config["recurrent"]
893
+ new_config["need_scaling"] = config["need_scaling"]
894
+ config = new_config
895
+
896
+ if config["need_scaling"] and _target_type == "regression":
897
+ x_train = train_scaled[features]
898
+ y_train = train_scaled[f"TARGET_{target_number}"].rename("TARGET")
899
+ x_val = val_scaled[features]
900
+ y_val = val_scaled[f"TARGET_{target_number}"].rename("TARGET")
901
+ else:
902
+ x_train = train[features]
903
+ y_train = train[f"TARGET_{target_number}"].rename("TARGET")
904
+ x_val = val[features]
905
+ y_val = val[f"TARGET_{target_number}"].rename("TARGET")
906
+
907
+ start = time.time()
908
+ # Tuning hyperparameters
909
+ if perform_hyperoptimization:
910
+ _type_name = "hyperopts"
911
+
912
+ for var in global_vars:
913
+ config[var] = globals()[var]
914
+
915
+ logger.info("Start tuning hyperparameters...")
916
+
917
+ storage_path = f"{results_dir}/ray_results"
918
+ # ray.shutdown()
919
+ # ray.init(
920
+ # runtime_env={
921
+ # "working_dir": ".", # or your project path
922
+ # "env_vars": {"PYTHONPATH": "."}
923
+ # }
924
+ # )
925
+ tuner = Tuner(
926
+ trainable=with_parameters(
927
+ train_model,
928
+ x_train=x_train,
929
+ y_train=y_train,
930
+ x_val=x_val,
931
+ y_val=y_val,
932
+ config=config,
933
+ ),
934
+ param_space=config["search_params"],
935
+ tune_config=TuneConfig(
936
+ metric=metric,
937
+ mode="min",
938
+ search_alg=HyperOptSearch(),
939
+ num_samples=number_of_trials,
940
+ scheduler=ASHAScheduler(max_t=100, grace_period=10),
941
+ ),
942
+ run_config=RunConfig(
943
+ stop={"training_iteration": 100},
944
+ storage_path=storage_path,
945
+ # name=datetime.now().strftime("%d-%m-%Y") + "-" + session_name,
946
+ callbacks=[TBXLoggerCallback()],
947
+ # log_to_file=("stdout.log", "stderr.log"), # deprecated
948
+ # verbose=0,
949
+ ),
950
+ )
951
+ try:
952
+ results = tuner.fit()
953
+
954
+ best_result = results.get_best_result(metric, "min")
955
+ best_params = best_result.config
956
+ best_score = best_result.metrics
957
+
958
+ # log results
959
+ logger.info(f"Best hyperparameters found were:\n{best_params}")
960
+ logger.info(f"Best Scores found were:\n{best_score}")
961
+
962
+ df_results = results.get_dataframe()
963
+ logger.info(
964
+ f"Markdown table with all trials :\n{df_results.to_markdown()}"
965
+ )
966
+
967
+ # save best params
968
+ best_params_file = f"{training_target_dir}/best_params.json"
969
+ try:
970
+ with open(best_params_file, "r") as f:
971
+ json_dict = json.load(f)
972
+ except FileNotFoundError:
973
+ json_dict = {}
974
+
975
+ json_dict[config["model_name"]] = serialize_for_json(best_params)
976
+ with open(best_params_file, "w") as f:
977
+ json.dump(json_dict, f, indent=4)
978
+
979
+ except Exception as e:
980
+ ray.shutdown()
981
+ logger.error(e)
982
+ raise
983
+
984
+ ray.shutdown()
985
+
986
+ # Collect errors in single file
987
+ collect_error_logs(
988
+ training_target_dir=training_target_dir, storage_path=storage_path
989
+ )
990
+
991
+ # Clean up
992
+ for var in global_vars:
993
+ del config[var]
994
+ else:
995
+ try:
996
+ with open(f"{training_target_dir}/best_params.json") as f:
997
+ json_dict = json.load(f)
998
+ best_params = json_dict[config["model_name"]]
999
+ except Exception:
1000
+ raise FileNotFoundError(
1001
+ f"Could not find {config['model_name']} in current data. Try to run an hyperoptimization by setting `perform_hyperoptimization` to true"
1002
+ )
1003
+
1004
+ # Perform cross-validation of the best model on k-folds of train + val set
1005
+ if perform_crossval:
1006
+ x_train_val = pd.concat([x_train, x_val], axis=0)
1007
+ y_train_val = pd.concat([y_train, y_val], axis=0)
1008
+ n_splits = 4
1009
+ n_samples = len(x_train_val)
1010
+ test_size = int(n_samples / (n_splits + 4))
1011
+ tscv = TimeSeriesSplit(n_splits=n_splits, test_size=test_size)
1012
+
1013
+ # Store the scores
1014
+ cross_validation_scores = []
1015
+
1016
+ for i, (train_index, val_index) in enumerate(tscv.split(x_train_val)):
1017
+ _type_name = f"crossval_fold_{i}"
1018
+
1019
+ if DATE_COLUMN:
1020
+ date_column = train[DATE_COLUMN].copy()
1021
+
1022
+ if config.get("need_scaling"):
1023
+ date_column = date_column.map(pd.Timestamp.fromordinal)
1024
+
1025
+ # Now you can use the actual train/val indices to extract ranges
1026
+ train_start = date_column.iloc[train_index[0]]
1027
+ train_end = date_column.iloc[train_index[-1]]
1028
+ val_start = date_column.iloc[val_index[0]]
1029
+ val_end = date_column.iloc[val_index[-1]]
1030
+
1031
+ logger.info(
1032
+ f"[Fold {i}] Train: {len(train_index)} samples from {train_start.date()} to {train_end.date()} | "
1033
+ f"Validation: {len(val_index)} samples from {val_start.date()} to {val_end.date()}"
1034
+ )
1035
+ else:
1036
+ logger.info(
1037
+ f"[Fold {i}] Train: {len(train_index)} samples | Validation: {len(val_index)} samples"
1038
+ )
1039
+
1040
+ # Train the model and get the score
1041
+ if config["recurrent"]:
1042
+ cross_validation_score, _, _ = train_model(
1043
+ params=best_params,
1044
+ x_train=x_train_val[train_index],
1045
+ y_train=y_train_val[train_index],
1046
+ x_val=x_train_val[val_index],
1047
+ y_val=y_train_val[val_index],
1048
+ config=config,
1049
+ )
1050
+ else:
1051
+ cross_validation_score, _, _ = train_model(
1052
+ params=best_params,
1053
+ x_train=x_train_val.iloc[train_index],
1054
+ y_train=y_train_val.iloc[train_index],
1055
+ x_val=x_train_val.iloc[val_index],
1056
+ y_val=y_train_val.iloc[val_index],
1057
+ config=config,
1058
+ )
1059
+
1060
+ # Append score to the list
1061
+ cross_validation_scores.append(cross_validation_score)
1062
+
1063
+ # Calculate and log the mean score
1064
+ cross_validation_mean_score = pd.DataFrame(cross_validation_scores)[
1065
+ metric
1066
+ ].mean()
1067
+ logger.info(
1068
+ f"Best model mean cross-validation score: {cross_validation_mean_score}"
1069
+ )
1070
+
1071
+ # Retrain on entire training set, but keep score on cross-validation folds
1072
+ best_score, best_model, best_pred = train_model(
1073
+ params=best_params,
1074
+ x_train=x_train,
1075
+ y_train=y_train,
1076
+ x_val=x_val,
1077
+ y_val=y_val,
1078
+ config=config,
1079
+ )
1080
+ best_score[metric] = cross_validation_mean_score
1081
+ else:
1082
+ # Evaluate on validation set
1083
+ _type_name = "validation"
1084
+ best_score, best_model, best_pred = train_model(
1085
+ params=best_params,
1086
+ x_train=x_train,
1087
+ y_train=y_train,
1088
+ x_val=x_val,
1089
+ y_val=y_val,
1090
+ config=config,
1091
+ )
1092
+
1093
+ logger.info(f"Best model scores on validation set: {best_score}")
1094
+
1095
+ # Save validation predictions
1096
+ best_pred.to_csv(
1097
+ f"{results_dir}/pred_val.csv",
1098
+ index=True,
1099
+ header=True,
1100
+ index_label="ID",
1101
+ )
1102
+
1103
+ # Save best model
1104
+ if config["recurrent"]:
1105
+ model_path = f"{results_dir}/{best_model.model_name}.keras"
1106
+ best_model.save(model_path)
1107
+ else:
1108
+ model_path = f"{results_dir}/{best_model.model_name}.best"
1109
+ joblib.dump(best_model, model_path)
1110
+
1111
+ model_path = Path(model_path).resolve()
1112
+ best_score["MODEL_PATH"] = model_path
1113
+
1114
+ # Track scores
1115
+ scores_tracking_path = f"{training_target_dir}/scores_tracking.csv"
1116
+ best_score_df = pd.DataFrame([best_score])
1117
+
1118
+ if os.path.exists(scores_tracking_path):
1119
+ existing_scores = pd.read_csv(scores_tracking_path)
1120
+ common_cols = existing_scores.columns.intersection(best_score_df.columns)
1121
+ best_score_df = best_score_df[common_cols]
1122
+ scores_tracking = pd.concat(
1123
+ [existing_scores, best_score_df], ignore_index=True
1124
+ )
1125
+ else:
1126
+ scores_tracking = best_score_df
1127
+
1128
+ scores_tracking.sort_values(metric, ascending=True, inplace=True)
1129
+ scores_tracking.to_csv(scores_tracking_path, index=False)
1130
+
1131
+ # Save model training metadata
1132
+ stop = time.time()
1133
+ training_time = stop - start
1134
+ model_training.best_params = best_params
1135
+ model_training.model_path = model_path
1136
+ model_training.training_time = training_time
1137
+ model_training.save()
1138
+
1139
+ # Store metrics in DB
1140
+ drop_cols = [
1141
+ "DATE",
1142
+ "SESSION",
1143
+ "TRAIN_DATA",
1144
+ "VAL_DATA",
1145
+ "FEATURES",
1146
+ "MODEL_NAME",
1147
+ "MODEL_PATH",
1148
+ ]
1149
+ best_score = {k: v for k, v in best_score.items() if k not in drop_cols}
1150
+ score_data = {k.lower(): v for k, v in best_score.items()}
1151
+
1152
+ Score.upsert(
1153
+ match_fields=["model_training_id"],
1154
+ model_training_id=model_training.id,
1155
+ **score_data,
1156
+ )
1157
+
1158
+ logger.info(f"Model training finished in {training_time:.2f} seconds")
1159
+
1160
+ # find best model type
1161
+ scores_tracking_path = f"{training_target_dir}/scores_tracking.csv"
1162
+ scores_tracking = pd.read_csv(scores_tracking_path)
1163
+ best_score_overall = scores_tracking.iloc[0, :]
1164
+ best_model_name = best_score_overall["MODEL_NAME"]
1165
+
1166
+ # Remove any .best or .keras files
1167
+ for file_path in glob.glob(os.path.join(training_target_dir, "*.best")) + glob.glob(
1168
+ os.path.join(training_target_dir, "*.keras")
1169
+ ):
1170
+ os.remove(file_path)
1171
+ # Copy the best model in root training folder for this target
1172
+ best_model_path = Path(
1173
+ f"{training_target_dir}/{os.path.basename(best_score_overall['MODEL_PATH'])}"
1174
+ ).resolve()
1175
+ copy_any(
1176
+ best_score_overall["MODEL_PATH"],
1177
+ best_model_path,
1178
+ )
1179
+
1180
+ with open(f"{training_target_dir}/best_params.json", "r") as f:
1181
+ best_model_params = json.load(f)[best_model_name]
1182
+
1183
+ # save model_selection results to db
1184
+ model_selection = ModelSelection.get(model_selection.id)
1185
+ model_selection.best_model_id = Model.find_by(
1186
+ name=best_score_overall["MODEL_NAME"], type=_target_type
1187
+ ).id
1188
+ model_selection.best_model_params = best_model_params
1189
+ model_selection.best_model_path = best_model_path
1190
+ model_selection.save()
1191
+
1192
+ logger.info(f"Best model overall is : {best_score_overall}")
1193
+
1194
+
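An illustrative call of model_selection; the dataset id, model indices, target number and session name below are placeholders (they depend on what has been stored in the database and on the ml_models / dl_recurrent_models search space), not values shipped with the package:

    # Hypothetical invocation: dataset 1 must already exist in the DB with its
    # feature selection completed, and indices 0-2 refer to entries of ml_models.
    model_selection(
        dataset_id=1,
        models_idx=[0, 1, 2],
        target_number=3,
        session_name="demo_session",
        perform_hyperoptimization=True,
        perform_crossval=False,
        number_of_trials=20,
        plot=False,
    )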
1195
+ def collect_error_logs(training_target_dir: str, storage_path: str):
1196
+
1197
+ output_error_file = f"{training_target_dir}/errors.log"
1198
+
1199
+ with open(output_error_file, "a") as outfile:
1200
+ # Walk through the ray_results directory
1201
+ for root, dirs, files in os.walk(storage_path):
1202
+ # Check if 'error.txt' exists in the current directory
1203
+ if "error.txt" in files:
1204
+ error_file_path = os.path.join(root, "error.txt")
1205
+ logger.info(f"Processing error file: {error_file_path}")
1206
+ # Read and append the content of the error.txt file
1207
+ with open(error_file_path, "r") as infile:
1208
+ outfile.write(f"\n\n=== Error from {error_file_path} ===\n")
1209
+ outfile.write(infile.read())
1210
+ logger.info(f"All errors written to {output_error_file}")
1211
+
1212
+
1213
+ def plot_evaluation_for_classification(prediction: pd.DataFrame):
1214
+ """
1215
+ Args
1216
+ prediction (pd.DataFrame): DataFrame with TARGET (true labels), PRED (predicted labels), and class-probability columns 0 and 1 (binary classification only)
1217
+ """
1218
+ y_true = prediction["TARGET"]
1219
+ y_pred = prediction["PRED"]
1220
+ y_pred_proba = prediction[1] if 1 in prediction.columns else prediction["1"]
1221
+
1222
+ # Plot confusion matrix
1223
+ plot_confusion_matrix(y_true, y_pred)
1224
+
1225
+ # Compute ROC curve and ROC area
1226
+ fpr, tpr, thresholds = roc_curve(y_true, y_pred_proba)
1227
+ roc_auc = auc(fpr, tpr)
1228
+
1229
+ plt.figure(figsize=(8, 8))
1230
+ plt.plot(
1231
+ fpr, tpr, color="darkorange", lw=2, label="ROC curve (area = %0.2f)" % roc_auc
1232
+ )
1233
+ plt.plot([0, 1], [0, 1], color="navy", lw=2, linestyle="--")
1234
+ plt.xlim([0.0, 1.0])
1235
+ plt.ylim([0.0, 1.05])
1236
+ plt.xlabel("False Positive Rate")
1237
+ plt.ylabel("True Positive Rate")
1238
+ plt.title("ROC Curve")
1239
+ plt.legend(loc="lower right")
1240
+ plt.show()
1241
+
1242
+ # Compute precision-recall curve
1243
+ precision, recall, _ = precision_recall_curve(y_true, y_pred_proba)
1244
+ average_precision = average_precision_score(y_true, y_pred_proba)
1245
+
1246
+ plt.figure(figsize=(8, 8))
1247
+ plt.step(recall, precision, color="b", alpha=0.2, where="post")
1248
+ plt.fill_between(recall, precision, step="post", alpha=0.2, color="b")
1249
+ plt.xlabel("Recall")
1250
+ plt.ylabel("Precision")
1251
+ plt.ylim([0.0, 1.05])
1252
+ plt.xlim([0.0, 1.0])
1253
+ plt.title("Precision-Recall Curve: AP={0:0.2f}".format(average_precision))
1254
+ plt.show()
1255
+
1256
+
1257
+ def plot_confusion_matrix(y_true, y_pred):
1258
+ unique_labels = np.unique(np.concatenate((y_true, y_pred)))
1259
+ cm = confusion_matrix(y_true, y_pred)
1260
+
1261
+ labels = np.sort(unique_labels) # Sort labels based on numerical order
1262
+
1263
+ plt.figure(figsize=(10, 7))
1264
+ sns.heatmap(cm, annot=True, fmt="d", cmap="viridis")
1265
+ plt.xlabel("Predicted", fontsize=12)
1266
+ plt.ylabel("True", fontsize=12)
1267
+ plt.title("Confusion Matrix", fontsize=14)
1268
+
1269
+ plt.xticks(ticks=np.arange(len(labels)), labels=labels, fontsize=10)
1270
+ plt.yticks(ticks=np.arange(len(labels)), labels=labels, fontsize=10)
1271
+
1272
+ plt.show()
1273
+
1274
+
1275
+ # THRESHOLD
1276
+ def find_max_f1_threshold(prediction):
1277
+ """
1278
+ Finds the threshold that maximizes the F1 score for a binary classification task.
1279
+
1280
+ Parameters:
1281
+ - prediction: DataFrame with 'TARGET' and '1' (predicted probabilities) columns
1282
+
1283
+ Returns:
1284
+ - best_threshold: The threshold that maximizes the F1 score
1285
+ - best_precision: The precision at that threshold
1286
+ - best_recall: The recall at that threshold
1287
+ """
1288
+ y_true = prediction["TARGET"]
1289
+ y_pred_proba = prediction[1] if 1 in prediction.columns else prediction["1"]
1290
+
1291
+ # Compute precision, recall, and thresholds
1292
+ precision, recall, thresholds = precision_recall_curve(y_true, y_pred_proba)
1293
+
1294
+ # Drop the first element to align with thresholds
1295
+ precision = precision[1:]
1296
+ recall = recall[1:]
1297
+
1298
+ # Filter out trivial cases (precision or recall = 0)
1299
+ valid = (precision > 0) & (recall > 0)
1300
+ if not np.any(valid):
1301
+ raise ValueError("No valid threshold with non-zero precision and recall")
1302
+
1303
+ precision = precision[valid]
1304
+ recall = recall[valid]
1305
+ thresholds = thresholds[valid]
1306
+
1307
+ # Compute F1 scores for each threshold
1308
+ f1_scores = 2 * (precision * recall) / (precision + recall + 1e-10)
1309
+
1310
+ best_index = np.argmax(f1_scores)
1311
+
1312
+ best_threshold = thresholds[best_index]
1313
+ best_precision = precision[best_index]
1314
+ best_recall = recall[best_index]
1315
+
1316
+ return best_threshold, best_precision, best_recall
1317
+
1318
+
1319
+ def find_best_f1_threshold(prediction, fscore_target: float):
1320
+ """
1321
+ Finds the highest threshold achieving at least the given F1 score target.
1322
+
1323
+ Parameters:
1324
+ - prediction: DataFrame with 'TARGET' and '1' (or 1 as int) columns
1325
+ - fscore_target: Desired minimum F1 score (between 0 and 1)
1326
+
1327
+ Returns:
1328
+ - best_threshold: The highest threshold meeting the F1 target
1329
+ - best_precision: Precision at that threshold
1330
+ - best_recall: Recall at that threshold
1331
+ - best_f1: Actual F1 score at that threshold
1332
+ """
1333
+ y_true = prediction["TARGET"]
1334
+ y_pred_proba = prediction[1] if 1 in prediction.columns else prediction["1"]
1335
+
1336
+ precision, recall, thresholds = precision_recall_curve(y_true, y_pred_proba)
1337
+
1338
+ # Align precision/recall with thresholds
1339
+ precision = precision[1:]
1340
+ recall = recall[1:]
1341
+ f1_scores = 2 * (precision * recall) / (precision + recall + 1e-10)
1342
+
1343
+ # Filter for thresholds meeting F1 target
1344
+ valid_indices = [i for i, f1 in enumerate(f1_scores) if f1 >= fscore_target]
1345
+
1346
+ if not valid_indices:
1347
+ raise ValueError(f"Could not find a threshold with F1 >= {fscore_target:.2f}")
1348
+
1349
+ # Pick the highest threshold among valid ones
1350
+ best_index = valid_indices[-1]
1351
+
1352
+ return (
1353
+ thresholds[best_index],
1354
+ precision[best_index],
1355
+ recall[best_index],
1356
+ f1_scores[best_index],
1357
+ )
1358
+
1359
+
1360
+ def find_max_precision_threshold_without_trivial_case(prediction: pd.DataFrame):
1361
+ """
1362
+ Finds the threshold that maximizes precision without reaching a precision of 1,
1363
+ which indicates all predictions are classified as the negative class (0).
1364
+
1365
+ Parameters:
1366
+ - prediction: DataFrame with a 'TARGET' column (true labels) and predicted probabilities in column 1 (or '1')
1367
+
1368
+ Returns:
1369
+ - threshold: the probability threshold that maximizes precision
1370
+ - actual_recall: the recall achieved at this threshold
1371
+ - actual_precision: the precision achieved at this threshold
1372
+ """
1373
+ y_true = prediction["TARGET"]
1374
+ y_pred_proba = prediction[1] if 1 in prediction.columns else prediction["1"]
1375
+
1376
+ # Compute precision, recall, and thresholds
1377
+ precision, recall, thresholds = precision_recall_curve(y_true, y_pred_proba)
1378
+
1379
+ # Drop the first element of precision and recall to align with thresholds
1380
+ precision = precision[1:]
1381
+ recall = recall[1:]
1382
+
1383
+ # Filter out precision == 1.0 (which might correspond to predicting only 0s)
1384
+ valid_indices = np.where(precision < 1.0)[0]
1385
+ if len(valid_indices) == 0:
1386
+ raise ValueError("No valid precision values less than 1.0")
1387
+
1388
+ precision = precision[valid_indices]
1389
+ recall = recall[valid_indices]
1390
+ thresholds = thresholds[valid_indices]
1391
+
1392
+ # Find the index of the maximum precision
1393
+ best_index = np.argmax(precision)
1394
+
1395
+ # Return the corresponding threshold, precision, and recall
1396
+ best_threshold = thresholds[best_index]
1397
+ best_precision = precision[best_index]
1398
+ best_recall = recall[best_index]
1399
+
1400
+ return best_threshold, best_precision, best_recall
1401
+
1402
+
1403
+ def find_best_precision_threshold(prediction, precision_target: float = 0.80):
1404
+ """
1405
+ Finds the highest threshold that achieves at least the given precision target.
1406
+
1407
+ Parameters:
1408
+ prediction (pd.DataFrame): DataFrame with columns 'TARGET' and '1' or index 1 for predicted probabilities
1409
+ precision_target (float): Desired minimum precision (between 0 and 1)
1410
+
1411
+ Returns:
1412
+ tuple: (threshold, precision, recall) achieving the desired precision
1413
+ """
1414
+ y_true = prediction["TARGET"]
1415
+ y_pred_proba = prediction[1] if 1 in prediction.columns else prediction["1"]
1416
+
1417
+ precision, recall, thresholds = precision_recall_curve(y_true, y_pred_proba)
1418
+
1419
+ # Align lengths: thresholds is N-1 compared to precision/recall
1420
+ thresholds = thresholds
1421
+ precision = precision[1:] # Shift to match thresholds
1422
+ recall = recall[1:]
1423
+
1424
+ valid_indices = [i for i, p in enumerate(precision) if p >= precision_target]
1425
+
1426
+ if not valid_indices:
1427
+ raise ValueError(
1428
+ f"Could not find a threshold with precision >= {precision_target}"
1429
+ )
1430
+
1431
+ best_idx = valid_indices[-1] # Highest threshold with precision >= target
1432
+
1433
+ return thresholds[best_idx], precision[best_idx], recall[best_idx]
1434
+
1435
+
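A small usage sketch for find_best_precision_threshold on synthetic probabilities; the returned threshold is the highest one whose precision on this sample still reaches the target (the numbers and import path are illustrative assumptions):

    import pandas as pd
    # from lecrapaud.model_selection import find_best_precision_threshold  # assumed import path

    toy = pd.DataFrame({
        "TARGET": [0, 0, 1, 0, 1, 1, 0, 1],
        1:        [0.05, 0.20, 0.35, 0.55, 0.60, 0.80, 0.85, 0.90],
    })
    # Highest threshold whose precision on this toy sample is still >= 0.75
    threshold, precision, recall = find_best_precision_threshold(toy, precision_target=0.75)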
1436
+ def find_best_recall_threshold(prediction, recall_target: float = 0.98) -> tuple:
1437
+ """
1438
+ Finds the highest threshold that achieves at least the given recall target.
1439
+
1440
+ Parameters:
1441
+ prediction (pd.DataFrame): DataFrame with a 'TARGET' column and predicted probabilities in column 1 (or '1')
1442
+ recall_target (float): Desired minimum recall (between 0 and 1)
1443
+
1444
+ Returns:
1445
+ tuple: (threshold, precision, recall) achieving the desired recall, or (None, None, None) if not reachable
1446
+ """
1447
+ y_true = prediction["TARGET"]
1448
+ y_pred_proba = prediction[1] if 1 in prediction.columns else prediction["1"]
1449
+
1450
+ precision, recall, thresholds = precision_recall_curve(y_true, y_pred_proba)
1451
+
1452
+ # `thresholds` has length N-1 compared to precision and recall
1453
+ recall = recall[1:] # Drop first element to align with thresholds
1454
+ precision = precision[1:]
1455
+
1456
+ valid_indices = [i for i, r in enumerate(recall) if r >= recall_target]
1457
+
1458
+ if not valid_indices:
1459
+ logger.warning(f"Could not find a threshold with recall >= {recall_target}")
1460
+ return None, None, None
1461
+
1462
+ best_idx = valid_indices[-1] # Highest threshold with recall >= target
1463
+
1464
+ return thresholds[best_idx], precision[best_idx], recall[best_idx]
1465
+
1466
+
1467
+ def plot_threshold(prediction, threshold, precision, recall):
1468
+ y_pred_proba = prediction[1] if 1 in prediction.columns else prediction["1"]
1469
+ y_true = prediction["TARGET"]
1470
+
1471
+ predicted_positive = (y_pred_proba >= threshold).sum()
1472
+ predicted_negative = (y_pred_proba < threshold).sum()
1473
+ f1_scores = 2 * (precision * recall) / (precision + recall + 1e-10)
1474
+ per_predicted_positive = predicted_positive / len(y_pred_proba)
1475
+ per_predicted_negative = predicted_negative / len(y_pred_proba)
1476
+
1477
+ logger.info(
1478
+ f"""Threshold: {threshold*100:.2f}
1479
+ Precision: {precision*100:.2f}
1480
+ Recall: {recall*100:.2f}
1481
+ F1-score: {f1_scores*100:.2f}
1482
+ % of score over {threshold}: {predicted_positive}/{len(y_pred_proba)} = {per_predicted_positive*100:.2f}%
1483
+ % of score under {threshold}: {predicted_negative}/{len(y_pred_proba)} = {per_predicted_negative*100:.2f}%"""
1484
+ )
1485
+
1486
+ # Visualizing the scores of positive and negative classes
1487
+ plt.figure(figsize=(10, 6))
1488
+ sns.histplot(
1489
+ y_pred_proba[y_true == 1],
1490
+ color="blue",
1491
+ label="Positive Class",
1492
+ bins=30,
1493
+ kde=True,
1494
+ alpha=0.6,
1495
+ )
1496
+ sns.histplot(
1497
+ y_pred_proba[y_true == 0],
1498
+ color="red",
1499
+ label="Negative Class",
1500
+ bins=30,
1501
+ kde=True,
1502
+ alpha=0.6,
1503
+ )
1504
+ plt.axvline(
1505
+ x=threshold,
1506
+ color="green",
1507
+ linestyle="--",
1508
+ label=f"Threshold at {round(threshold,3)}",
1509
+ )
1510
+ plt.title("Distribution of Predicted Probabilities")
1511
+ plt.xlabel("Predicted Probabilities")
1512
+ plt.ylabel("Frequency")
1513
+ plt.legend()
1514
+ plt.show()
1515
+ return threshold
1516
+
1517
+
1518
+ def print_scores(training_target_dir: str):
1519
+ """
1520
+ Monitor scores
1521
+ """
1522
+ scores_tracking = pd.read_csv(f"{training_target_dir}/scores_tracking.csv")
1523
+ return scores_tracking
1524
+
1525
+
1526
+ # OLD - to sort out
1527
+ def get_pred_distribution(training_target_dir: str, model_name="linear"):
1528
+ """
1529
+ Look at prediction distributions
1530
+ """
1531
+ prediction = pd.read_csv(
1532
+ f"{training_target_dir}/{model_name}/pred_val.csv",
1533
+ index_col="ID",
1534
+ )
1535
+ return prediction.describe()
1536
+
1537
+
1538
+ def plot_feature_importance(training_target_dir: str, model_name="linear"):
1539
+ """
1540
+ Monitor feature importance ranking to filter out irrelevant features
1541
+ """
1542
+ model = joblib.load(f"{training_target_dir}/{model_name}/{model_name}.best")
1543
+ if hasattr(model, "feature_importances_"):
1544
+ feature_importances_ = model.feature_importances_.flatten()
1545
+ elif hasattr(model, "feature_importance"):
1546
+ feature_importances_ = model.feature_importance.flatten()
1547
+ elif hasattr(model, "coefs_"):
1548
+ feature_importances_ = np.mean(model.coefs_[0], axis=1).flatten()
1549
+ elif hasattr(model, "coef_"):
1550
+ feature_importances_ = model.coef_.flatten()
1551
+ else:
1552
+ feature_importances_ = []
1553
+
1554
+ sns.barplot(
1555
+ data=feature_importances_,
1556
+ orient="h",
1557
+ )
1558
+
1559
+
1560
+ def print_model_estimators(training_target_dir: str, model_name="linear"):
1561
+ """
1562
+ Look at a specific trained model
1563
+ """
1564
+ model = joblib.load(f"{training_target_dir}/{model_name}/{model_name}.best")
1565
+ for estimator in model.estimators_:
1566
+ logger.info(estimator.get_depth())
1567
+
1568
+
1569
+ def get_model_info(model):
1570
+ model.count_params()
1571
+ model.summary()