lecrapaud 0.4.0__py3-none-any.whl → 0.4.1__py3-none-any.whl

This diff shows the contents of two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.

Files changed (42)
  1. lecrapaud/__init__.py +1 -0
  2. lecrapaud/api.py +277 -0
  3. lecrapaud/config.py +10 -0
  4. lecrapaud/db/__init__.py +1 -0
  5. lecrapaud/db/alembic/env.py +2 -2
  6. lecrapaud/db/alembic/versions/2025_05_31_1834-52b809a34371_make_nullablee.py +24 -12
  7. lecrapaud/db/alembic/versions/2025_06_17_1652-c45f5e49fa2c_make_fields_nullable.py +89 -0
  8. lecrapaud/db/alembic.ini +116 -0
  9. lecrapaud/db/models/__init__.py +10 -10
  10. lecrapaud/db/models/base.py +176 -1
  11. lecrapaud/db/models/dataset.py +25 -20
  12. lecrapaud/db/models/feature.py +5 -6
  13. lecrapaud/db/models/feature_selection.py +3 -4
  14. lecrapaud/db/models/feature_selection_rank.py +3 -4
  15. lecrapaud/db/models/model.py +3 -4
  16. lecrapaud/db/models/model_selection.py +15 -8
  17. lecrapaud/db/models/model_training.py +15 -7
  18. lecrapaud/db/models/score.py +9 -6
  19. lecrapaud/db/models/target.py +16 -8
  20. lecrapaud/db/session.py +66 -0
  21. lecrapaud/experiment.py +64 -0
  22. lecrapaud/feature_engineering.py +747 -1022
  23. lecrapaud/feature_selection.py +915 -998
  24. lecrapaud/integrations/openai_integration.py +225 -0
  25. lecrapaud/jobs/__init__.py +2 -2
  26. lecrapaud/jobs/config.py +1 -1
  27. lecrapaud/jobs/scheduler.py +1 -1
  28. lecrapaud/jobs/tasks.py +6 -6
  29. lecrapaud/model_selection.py +1060 -960
  30. lecrapaud/search_space.py +4 -0
  31. lecrapaud/utils.py +2 -2
  32. lecrapaud-0.4.1.dist-info/METADATA +171 -0
  33. {lecrapaud-0.4.0.dist-info → lecrapaud-0.4.1.dist-info}/RECORD +36 -35
  34. {lecrapaud-0.4.0.dist-info → lecrapaud-0.4.1.dist-info}/WHEEL +1 -1
  35. lecrapaud/db/crud.py +0 -179
  36. lecrapaud/db/services.py +0 -0
  37. lecrapaud/db/setup.py +0 -58
  38. lecrapaud/predictions.py +0 -292
  39. lecrapaud/training.py +0 -151
  40. lecrapaud-0.4.0.dist-info/METADATA +0 -103
  41. /lecrapaud/{directory_management.py → directories.py} +0 -0
  42. {lecrapaud-0.4.0.dist-info → lecrapaud-0.4.1.dist-info}/LICENSE +0 -0
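
The substance of the release is the refactor of lecrapaud/model_selection.py shown below: the module-level fit_sklearn / fit_boosting / fit_recurrent / predict functions (and the module globals they shared) move into a new ModelEngine class, and the old model_selection() entry point becomes ModelSelectionEngine.run(). The following is a minimal sketch of how the new class appears to be driven, inferred only from the diff that follows; the model name, factory callable, data variables, and paths are hypothetical placeholders, not documented API:

    from lecrapaud.model_selection import ModelEngine

    # Hypothetical stand-ins: make_rf is a factory callable as consumed by
    # fit_sklearn, and x_train/y_train/x_val/y_val/params come from your pipeline.
    engine = ModelEngine(
        model_name="random_forest",   # assumed to match an entry in search_space.all_models
        target_type="classification",
        create_model=make_rf,
        plot=False,
    )
    engine.fit(x_train, y_train, x_val, y_val, params)  # dispatches to fit_sklearn/fit_boosting/fit_recurrent
    pred = engine.predict(x_val, threshold=0.5)         # "PRED" column plus per-class probabilities
    engine.save("run_dir/TARGET_1/random_forest")       # writes <model_name>.best or <model_name>.keras

    # Reloading uses the path-based constructor, which calls load() and reads the
    # saved threshold from scores_tracking.csv in that directory.
    restored = ModelEngine(path="run_dir/TARGET_1/random_forest")

Note the dispatch rule in fit(): recurrent configs go to fit_recurrent, a create_model equal to the string "lgb" or "xgb" goes to fit_boosting, and anything else is treated as a scikit-learn estimator factory.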
@@ -10,6 +10,7 @@ import warnings
  import joblib
  import glob
  from pathlib import Path
+ import pickle
 
  os.environ["COVERAGE_FILE"] = str(Path(".coverage").resolve())
 
@@ -63,12 +64,19 @@ from ray.tune.schedulers import ASHAScheduler
  from ray.air import session
 
  # Internal library
- from src.search_space import ml_models, dl_recurrent_models
- from src.directory_management import clean_directory
- from src.utils import copy_any, contains_best, logger, serialize_for_json
- from src.config import PYTHON_ENV
- from src.feature_selection import TARGETS_CLF, DATE_COLUMN, load_train_data
- from src.db.models import Model, ModelSelection, ModelTraining, Score, Target, Dataset
+ from lecrapaud.search_space import all_models
+ from lecrapaud.directories import clean_directory
+ from lecrapaud.utils import copy_any, contains_best, logger, serialize_for_json
+ from lecrapaud.config import PYTHON_ENV
+ from lecrapaud.feature_selection import load_train_data
+ from lecrapaud.db import (
+     Model,
+     ModelSelection,
+     ModelTraining,
+     Score,
+     Target,
+     Dataset,
+ )
 
  # Reproducible result
  keras.utils.set_random_seed(42)
@@ -100,1116 +108,1216 @@ def test_hardware():
  warnings.filterwarnings("ignore", category=UserWarning, module="pydantic")
 
 
- # Metrics
- def rmse_tf(y_true, y_pred):
-     y_true, y_pred = unscale_tf(y_true, y_pred)
-     results = K.sqrt(K.mean(K.square(y_pred - y_true)))
-     return results
-
-
- def mae_tf(y_true, y_pred):
-     y_true, y_pred = unscale_tf(y_true, y_pred)
-     results = K.mean(K.abs(y_pred - y_true))
-     return results
+ class ModelEngine:
 
+     def __init__(
+         self,
+         model_name: str = None,
+         target_type: str = None,
+         path: str = None,
+         search_params: dict = {},
+         create_model=None,
+         plot: bool = False,
+         log_dir: str = None,
+     ):
+         self.path = path
+         if path:
+             self.load()
+         else:
+             self.model_name = model_name
+             self.target_type = target_type
 
- def unscale_tf(y_true, y_pred):
-     if _target_type == "regression":
-         scale = K.constant(_scaler_y.scale_[0])
-         mean = K.constant(_scaler_y.mean_[0])
+         config = [
+             config for config in all_models if config["model_name"] == self.model_name
+         ]
+         if config is None or len(config) == 0:
+             Exception(
+                 f"Model {self.model_name} is not supported by this library."
+                 f"Choose a model from the list of supported models: {[model['model_name'] for model in all_models].join(', ')}"
+             )
+         config = config[0]
 
-     y_true = K.mul(y_true, scale)
-     y_true = K.bias_add(y_true, mean)
+         self.recurrent = config["recurrent"]
+         self.need_scaling = config["need_scaling"]
+         self.search_params = search_params
+         self.create_model = create_model
+         self.plot = plot
+         self.log_dir = log_dir
 
-     y_pred = K.mul(y_pred, scale)
-     y_pred = K.bias_add(y_pred, mean)
-     return y_true, y_pred
+         if self.need_scaling and self.target_type == "regression":
+             self.scaler_y = joblib.load(f"{self.path}/scaler_y.pkl")
+         else:
+             self.scaler_y = None
 
+         self.threshold = None
 
- def recall_tf(y_true, y_pred):
-     y_true = K.ones_like(y_true)
-     true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
-     all_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
+     def fit(self, *args):
+         if self.recurrent:
+             fit = self.fit_recurrent
+         elif (self.create_model == "lgb") or (self.create_model == "xgb"):
+             fit = self.fit_boosting
+         else:
+             fit = self.fit_sklearn
+         model = fit(*args)
+         return model
+
+     # Functions to fit & evaluate models
+     def fit_sklearn(self, x_train, y_train, x_val, y_val, params):
+
+         # Create & Compile the model
+         model = self.create_model(**params)
+
+         # Train the model
+         logger.info("Fitting the model...")
+         logger.info(f"x_train shape: {x_train.shape}, x_val shape: {x_val.shape}")
+         logger.info(f"y_train shape: {y_train.shape}, y_val shape: {y_val.shape}")
+
+         model.fit(x_train, y_train)
+
+         if (
+             self.target_type == "classification"
+             and "loss" in model.get_params().keys()
+             and "hinge" in model.get_params()["loss"]
+         ):
+             # This is for SVC models with hinge loss
+             # You should use CalibratedClassifierCV when you are working with classifiers that do not natively output well-calibrated probability estimates.
+             # TODO: investigate if we should use calibration for random forest, gradiant boosting models, and bagging models
+             logger.info(
+                 f"Re-Calibrating {self.model_name} to get predict probabilities..."
+             )
+             calibrator = CalibratedClassifierCV(model, cv="prefit", n_jobs=-1)
+             model = calibrator.fit(x_train, y_train)
 
-     recall = true_positives / (all_positives + K.epsilon())
-     return recall
+         # set model_name after calibrator
+         model.model_name = self.model_name
+         model.target_type = self.target_type
 
+         logger.info(f"Successfully created a {model.model_name} at {datetime.now()}")
 
- def precision_tf(y_true, y_pred):
-     y_true = K.ones_like(y_true)
-     true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
+         self._model = model
 
-     predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
-     precision = true_positives / (predicted_positives + K.epsilon())
-     return precision
+         return model
 
+     def fit_boosting(self, x_train, y_train, x_val, y_val, params):
+         """
+         This is using lightGBM or XGboost C++ librairies
+         """
+         lightGBM = self.create_model == "lgb"
 
- def f1_score_tf(y_true, y_pred):
-     precision = precision_tf(y_true, y_pred)
-     recall = recall_tf(y_true, y_pred)
-     return 2 * ((precision * recall) / (precision + recall + K.epsilon()))
+         # Datasets
+         boosting_dataset = lgb.Dataset if lightGBM else xgb.DMatrix
+         train_data = boosting_dataset(x_train, label=y_train)
+         val_data = boosting_dataset(x_val, label=y_val)
 
+         # Create a TensorBoardX writer
+         writer = SummaryWriter(self.log_dir)
+         evals_result = {}
 
- def get_log_dir(training_target_dir: str, model_name="test_model"):
-     """Generates a structured log directory path for TensorBoard."""
-     timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M")
-     log_dir = (
-         Path(training_target_dir + "/tensorboard") / model_name / f"run_{timestamp}"
-     )
-     log_dir.mkdir(parents=True, exist_ok=True)  # Create directories if they don't exist
-     return str(log_dir)
-
+         # Training
+         labels = np.unique(y_train)
+         num_class = (
+             labels.size
+             if self.target_type == "classification" and labels.size > 2
+             else 1
+         )
+         logger.info("Fitting the model...")
+         logger.info(f"x_train shape: {x_train.shape}, x_val shape: {x_val.shape}")
+         logger.info(f"y_train shape: {y_train.shape}, y_val shape: {y_val.shape}")
 
- # Functions to fit & evaluate models
- def fit_sklearn(x_train, y_train, x_val, y_val, create_model, params, config):
+         if lightGBM:
 
-     # Create & Compile the model
-     model = create_model(**params)
+             def tensorboard_callback(env):
+                 for i, metric in enumerate(env.evaluation_result_list):
+                     metric_name, _, metric_value, _ = metric
+                     writer.add_scalar(
+                         f"LightGBM/{metric_name}", metric_value, env.iteration
+                     )
 
-     # Train the model
-     logger.info("Fitting the model...")
-     logger.info(f"x_train shape: {x_train.shape}, x_val shape: {x_val.shape}")
-     logger.info(f"y_train shape: {y_train.shape}, y_val shape: {y_val.shape}")
+             loss = (
+                 "regression"
+                 if self.target_type == "regression"
+                 else ("binary" if num_class <= 2 else "multiclass")
+             )
+             eval_metric = (
+                 "rmse"
+                 if self.target_type == "regression"
+                 else ("binary_logloss" if num_class <= 2 else "multi_logloss")
+             )
+             model = lgb.train(
+                 params={
+                     **params["model_params"],
+                     "objective": loss,
+                     "metric": eval_metric,
+                     "num_class": num_class,
+                 },
+                 num_boost_round=params["num_boost_round"],
+                 train_set=train_data,
+                 valid_sets=[train_data, val_data],
+                 valid_names=["train", "val"],
+                 callbacks=[
+                     lgb.early_stopping(stopping_rounds=params["early_stopping_rounds"]),
+                     lgb.record_evaluation(evals_result),
+                     tensorboard_callback,
+                 ],
+             )
+         else:
 
-     model.fit(x_train, y_train)
+             class TensorBoardCallback(xgb.callback.TrainingCallback):
 
-     if (
-         _target_type == "classification"
-         and "loss" in model.get_params().keys()
-         and "hinge" in model.get_params()["loss"]
-     ):
-         # This is for SVC models with hinge loss
-         # You should use CalibratedClassifierCV when you are working with classifiers that do not natively output well-calibrated probability estimates.
-         # TODO: investigate if we should use calibration for random forest, gradiant boosting models, and bagging models
-         logger.info(
-             f"Re-Calibrating {config["model_name"]} to get predict probabilities..."
-         )
-         calibrator = CalibratedClassifierCV(model, cv="prefit", n_jobs=-1)
-         model = calibrator.fit(x_train, y_train)
+                 def __init__(self, log_dir: str):
+                     self.writer = SummaryWriter(log_dir=log_dir)
 
-         # set model_name after calibrator
-         model.model_name = config["model_name"]
+                 def after_iteration(
+                     self,
+                     model,
+                     epoch: int,
+                     evals_log: xgb.callback.TrainingCallback.EvalsLog,
+                 ) -> bool:
+                     if not evals_log:
+                         return False
 
-     logger.info(f"Successfully created a {model.model_name} at {datetime.now()}")
+                     for data, metric in evals_log.items():
+                         for metric_name, log in metric.items():
+                             score = (
+                                 log[-1][0] if isinstance(log[-1], tuple) else log[-1]
+                             )
+                             self.writer.add_scalar(f"XGBoost/{data}", score, epoch)
 
-     return model
+                     return False
 
+             tensorboard_callback = TensorBoardCallback(self.log_dir)
 
- def fit_boosting(x_train, y_train, x_val, y_val, create_model, params, config):
-     """
-     This is using lightGBM or XGboost C++ librairies
-     """
-     lightGBM = create_model == "lgb"
+             loss = (
+                 "reg:squarederror"
+                 if self.target_type == "regression"
+                 else ("binary:logistic" if num_class <= 2 else "multi:softprob")
+             )
+             eval_metric = (
+                 "rmse"
+                 if self.target_type == "regression"
+                 else ("logloss" if num_class <= 2 else "mlogloss")
+             )
+             model = xgb.train(
+                 params={
+                     **params["model_params"],
+                     "objective": loss,
+                     "eval_metric": eval_metric,
+                     "num_class": num_class,
+                 },
+                 num_boost_round=params["num_boost_round"],
+                 dtrain=train_data,
+                 evals=[(val_data, "val"), (train_data, "train")],
+                 callbacks=[
+                     xgb.callback.EarlyStopping(
+                         rounds=params["early_stopping_rounds"], save_best=True
+                     ),
+                     xgb.callback.EvaluationMonitor(),  # This shows evaluation results at each iteration
+                     tensorboard_callback,
+                 ],
+                 evals_result=evals_result,  # Record evaluation result
+                 verbose_eval=0,
+             )
 
-     # Datasets
-     Dataset = lgb.Dataset if lightGBM else xgb.DMatrix
-     train_data = Dataset(x_train, label=y_train)
-     val_data = Dataset(x_val, label=y_val)
+         model.model_name = self.create_model
+         model.target_type = self.target_type
+         logger.info(f"Successfully created a {model.model_name} at {datetime.now()}")
 
-     # Callbacks
-     log_dir = get_log_dir(_training_target_dir, create_model)
+         # Close the writer after training is done
+         writer.close()
 
-     # Create a TensorBoardX writer
-     writer = SummaryWriter(log_dir)
-     evals_result = {}
+         if self.plot:
+             # Plot loss per epoch
+             train_loss = evals_result["train"][eval_metric]
+             val_loss = evals_result["val"][eval_metric]
+             logs = pd.DataFrame({"train": train_loss, "val": val_loss})
 
-     # Training
-     labels = np.unique(y_train)
-     num_class = (
-         labels.size if _target_type == "classification" and labels.size > 2 else 1
-     )
-     logger.info("Fitting the model...")
-     logger.info(f"x_train shape: {x_train.shape}, x_val shape: {x_val.shape}")
-     logger.info(f"y_train shape: {y_train.shape}, y_val shape: {y_val.shape}")
+             plt.figure(figsize=(14, 4))
+             plt.plot(logs.loc[:, "train"], lw=2, label="Training loss")
+             plt.plot(logs.loc[:, "val"], lw=2, label="Validation loss")
+             plt.xlabel("Epoch")
+             plt.ylabel("Loss")
+             plt.legend()
+             plt.show()
 
-     if lightGBM:
+         self._model = model
 
-         def tensorboard_callback(env):
-             for i, metric in enumerate(env.evaluation_result_list):
-                 metric_name, _, metric_value, _ = metric
-                 writer.add_scalar(
-                     f"LightGBM/{metric_name}", metric_value, env.iteration
-                 )
+         return model
 
-         loss = (
-             "regression"
-             if _target_type == "regression"
-             else ("binary" if num_class <= 2 else "multiclass")
-         )
-         eval_metric = (
-             "rmse"
-             if _target_type == "regression"
-             else ("binary_logloss" if num_class <= 2 else "multi_logloss")
-         )
-         model = lgb.train(
-             params={
-                 **params["model_params"],
-                 "objective": loss,
-                 "metric": eval_metric,
-                 "num_class": num_class,
-             },
-             num_boost_round=params["num_boost_round"],
-             train_set=train_data,
-             valid_sets=[train_data, val_data],
-             valid_names=["train", "val"],
-             callbacks=[
-                 lgb.early_stopping(stopping_rounds=params["early_stopping_rounds"]),
-                 lgb.record_evaluation(evals_result),
-                 tensorboard_callback,
-             ],
-         )
-     else:
+     def fit_recurrent(self, x_train, y_train, x_val, y_val, params):
 
-         class TensorBoardCallback(xgb.callback.TrainingCallback):
+         # metrics functions
+         def rmse_tf(y_true, y_pred):
+             y_true, y_pred = unscale_tf(y_true, y_pred)
+             results = K.sqrt(K.mean(K.square(y_pred - y_true)))
+             return results
 
-             def __init__(self, log_dir: str):
-                 self.writer = SummaryWriter(log_dir=log_dir)
+         def mae_tf(y_true, y_pred):
+             y_true, y_pred = unscale_tf(y_true, y_pred)
+             results = K.mean(K.abs(y_pred - y_true))
+             return results
 
-             def after_iteration(
-                 self,
-                 model,
-                 epoch: int,
-                 evals_log: xgb.callback.TrainingCallback.EvalsLog,
-             ) -> bool:
-                 if not evals_log:
-                     return False
+         def unscale_tf(y_true, y_pred):
+             if self.target_type == "regression":
+                 scale = K.constant(self.scaler_y.scale_[0])
+                 mean = K.constant(self.scaler_y.mean_[0])
 
-                 for data, metric in evals_log.items():
-                     for metric_name, log in metric.items():
-                         score = log[-1][0] if isinstance(log[-1], tuple) else log[-1]
-                         self.writer.add_scalar(f"XGBoost/{data}", score, epoch)
+             y_true = K.mul(y_true, scale)
+             y_true = K.bias_add(y_true, mean)
 
-                 return False
+             y_pred = K.mul(y_pred, scale)
+             y_pred = K.bias_add(y_pred, mean)
+             return y_true, y_pred
 
-         tensorboard_callback = TensorBoardCallback(log_dir)
+         # Create the model
+         labels = np.unique(y_train[:, 0])
+         num_class = labels.size if self.target_type == "classification" else None
+         input_shape = (x_train.shape[1], x_train.shape[2])
+         model = self.create_model(params, input_shape, self.target_type, num_class)
+         model.target_type = self.target_type
 
+         # Compile the model
          loss = (
-             "reg:squarederror"
-             if _target_type == "regression"
-             else ("binary:logistic" if num_class <= 2 else "multi:softprob")
-         )
-         eval_metric = (
-             "rmse"
-             if _target_type == "regression"
-             else ("logloss" if num_class <= 2 else "mlogloss")
+             rmse_tf
+             if self.target_type == "regression"
+             else (
+                 BinaryCrossentropy(from_logits=False)
+                 if num_class <= 2
+                 else CategoricalCrossentropy(from_logits=False)
+             )
          )
-         model = xgb.train(
-             params={
-                 **params["model_params"],
-                 "objective": loss,
-                 "eval_metric": eval_metric,
-                 "num_class": num_class,
-             },
-             num_boost_round=params["num_boost_round"],
-             dtrain=train_data,
-             evals=[(val_data, "val"), (train_data, "train")],
-             callbacks=[
-                 xgb.callback.EarlyStopping(
-                     rounds=params["early_stopping_rounds"], save_best=True
-                 ),
-                 xgb.callback.EvaluationMonitor(),  # This shows evaluation results at each iteration
-                 tensorboard_callback,
-             ],
-             evals_result=evals_result,  # Record evaluation result
-             verbose_eval=0,
+         optimizer = Adam(
+             learning_rate=params["learning_rate"], clipnorm=params["clipnorm"]
          )
-
-     model.model_name = create_model
-     logger.info(f"Successfully created a {model.model_name} at {datetime.now()}")
-
-     # Close the writer after training is done
-     writer.close()
-
-     if _plot:
-         # Plot loss per epoch
-         train_loss = evals_result["train"][eval_metric]
-         val_loss = evals_result["val"][eval_metric]
-         logs = pd.DataFrame({"train": train_loss, "val": val_loss})
-
-         plt.figure(figsize=(14, 4))
-         plt.plot(logs.loc[:, "train"], lw=2, label="Training loss")
-         plt.plot(logs.loc[:, "val"], lw=2, label="Validation loss")
-         plt.xlabel("Epoch")
-         plt.ylabel("Loss")
-         plt.legend()
-         plt.show()
-
-     return model
-
-
- def fit_recurrent(x_train, y_train, x_val, y_val, create_model, params, config):
-
-     # Create the model
-     labels = np.unique(y_train[:, 0])
-     num_class = labels.size if _target_type == "classification" else None
-     input_shape = (x_train.shape[1], x_train.shape[2])
-     model = create_model(params, input_shape, _target_type, num_class)
-
-     # Compile the model
-     loss = (
-         rmse_tf
-         if _target_type == "regression"
-         else (
-             BinaryCrossentropy(from_logits=False)
-             if num_class <= 2
-             else CategoricalCrossentropy(from_logits=False)
+         metrics = (
+             [mae_tf]
+             if self.target_type == "regression"
+             else (
+                 ["accuracy", Precision(), Recall()]
+                 if num_class <= 2
+                 else ["categorical_accuracy"]
+             )
          )
-     )
-     optimizer = Adam(learning_rate=params["learning_rate"], clipnorm=params["clipnorm"])
-     metrics = (
-         [mae_tf]
-         if _target_type == "regression"
-         else (
-             ["accuracy", Precision(), Recall()]
-             if num_class <= 2
-             else ["categorical_accuracy"]
+         model.compile(optimizer=optimizer, loss=loss, metrics=metrics)
+
+         # Callbacks
+         tensorboard_callback = TensorBoard(log_dir=self.log_dir)
+         early_stopping_callback = EarlyStopping(
+             monitor="val_loss",
+             patience=3,
+             restore_best_weights=True,
+             start_from_epoch=5,
          )
-     )
-     model.compile(optimizer=optimizer, loss=loss, metrics=metrics)
-
-     # Callbacks
-     log_dir = get_log_dir(_training_target_dir, model.model_name)
-
-     tensorboard_callback = TensorBoard(log_dir=log_dir)
-     early_stopping_callback = EarlyStopping(
-         monitor="val_loss", patience=3, restore_best_weights=True, start_from_epoch=5
-     )
-
-     # Custom callbacks
-     class PrintTrainableWeights(keras.callbacks.Callback):
-         def on_epoch_end(self, epoch, logs={}):
-             logger.info(model.trainable_variables)
-
-     class GradientCalcCallback(keras.callbacks.Callback):
-         def __init__(self):
-             self.epoch_gradient = []
-
-         def get_gradient_func(self, model):
-             # grads = K.gradients(model.total_loss, model.trainable_weights)
-             grads = K.gradients(model.loss, model.trainable_weights)
-             # inputs = model.model.inputs + model.targets + model.sample_weights
-             # use below line of code if above line doesn't work for you
-             # inputs = model.model._feed_inputs + model.model._feed_targets + model.model._feed_sample_weights
-             inputs = (
-                 model._feed_inputs + model._feed_targets + model._feed_sample_weights
-             )
-             func = K.function(inputs, grads)
-             return func
-
-         def on_epoch_end(self, epoch, logs=None):
-             get_gradient = self.get_gradient_func(model)
-             grads = get_gradient([x_val, y_val[:, 0], np.ones(len(y_val[:, 0]))])
-             self.epoch_gradient.append(grads)
-
-     # Train the model
-     if _target_type == "classification" and num_class > 2:
-         lb = LabelBinarizer(sparse_output=False)  # Change to True for sparse matrix
-         lb.fit(labels)
-         y_train = lb.transform(y_train[:, 0].flatten())
-         y_val = lb.transform(y_val[:, 0].flatten())
-     else:
-         y_train = y_train[:, 0].flatten()
-         y_val = y_val[:, 0].flatten()
-
-     logger.info("Fitting the model...")
-     logger.info(f"x_train shape: {x_train.shape}, x_val shape: {x_val.shape}")
-     logger.info(f"y_train shape: {y_train.shape}, y_val shape: {y_val.shape}")
-
-     history = model.fit(
-         x_train,
-         y_train,
-         batch_size=params["batch_size"],
-         verbose=0,
-         epochs=params["epochs"],
-         shuffle=False,
-         validation_data=(x_val, y_val),
-         callbacks=[early_stopping_callback, tensorboard_callback],
-     )
 
-     logger.info(f"Successfully created a {model.model_name} at {datetime.now()}")
-     # logger.info(pd.DataFrame(gradiant.epoch_gradient))
-
-     if _plot:
-         # Plot loss per epoch
-         logs = pd.DataFrame(history.history)
-
-         plt.figure(figsize=(14, 4))
-         plt.plot(logs.loc[:, "loss"], lw=2, label="Training loss")
-         plt.plot(logs.loc[:, "val_loss"], lw=2, label="Validation loss")
-         plt.xlabel("Epoch")
-         plt.ylabel("Loss")
-         plt.legend()
-         plt.show()
-
-     return model
-
-
- def predict(
-     model, data: pd.DataFrame, target_type: str, config: dict, threshold: float = 0.5
- ):
-     """Function to get prediction from model. Support sklearn, keras and boosting models such as xgboost and lgboost
-
-     Args:
-     - model: the train model to predict value
-     - data: the data for prediction
-     - target_type: classification or regression
-     - config: dict containing model config
-     """
-     if config["recurrent"] or model.model_name in ["lgb", "xgb"]:
-         # keras, lgb & xgb
-         if model.model_name == "lgb":
-             # Direct prediction for LightGBM
-             pred = model.predict(data)
-         elif model.model_name == "xgb":
-             # Convert val_data to DMatrix for XGBoost
-             d_data = xgb.DMatrix(data)
-             pred = model.predict(d_data)
-         else:
-             # Reshape (flatten) for keras if not multiclass
-             pred = model.predict(data)
-             if pred.shape[1] == 1:
-                 pred = pred.reshape(-1)
-
-         if target_type == "classification":
-             num_class = pred.shape[1] if len(pred.shape) > 1 else 2
-
-             if num_class <= 2:
-                 # For binary classification, concatenate the predicted probabilities for both classes
-                 pred_df = pd.DataFrame(
-                     {
-                         0: 1 - pred,  # Probability of class 0
-                         1: pred,  # Probability of class 1
-                     },
+         # Custom callbacks
+         class PrintTrainableWeights(keras.callbacks.Callback):
+             def on_epoch_end(self, epoch, logs={}):
+                 logger.info(model.trainable_variables)
+
+         class GradientCalcCallback(keras.callbacks.Callback):
+             def __init__(self):
+                 self.epoch_gradient = []
+
+             def get_gradient_func(self, model):
+                 # grads = K.gradients(model.total_loss, model.trainable_weights)
+                 grads = K.gradients(model.loss, model.trainable_weights)
+                 # inputs = model.model.inputs + model.targets + model.sample_weights
+                 # use below line of code if above line doesn't work for you
+                 # inputs = model.model._feed_inputs + model.model._feed_targets + model.model._feed_sample_weights
+                 inputs = (
+                     model._feed_inputs
+                     + model._feed_targets
+                     + model._feed_sample_weights
                  )
-             else:
-                 # For multi-class classification, use the predicted probabilities for each class
-                 pred_df = pd.DataFrame(pred, columns=range(num_class))
-
-             # Get final predictions (argmax for multi-class, threshold for binary)
-             if num_class == 2:
-                 pred_df["PRED"] = np.where(
-                     pred_df[1] >= threshold, 1, 0
-                 )  # Class 1 if prob >= threshold
-             else:
-                 pred_df["PRED"] = pred_df.idxmax(
-                     axis=1
-                 )  # Class with highest probability for multiclasses
+                 func = K.function(inputs, grads)
+                 return func
 
-             # Reorder columns to show predicted class first, then probabilities
-             pred = pred_df[["PRED"] + list(range(num_class))]
+             def on_epoch_end(self, epoch, logs=None):
+                 get_gradient = self.get_gradient_func(model)
+                 grads = get_gradient([x_val, y_val[:, 0], np.ones(len(y_val[:, 0]))])
+                 self.epoch_gradient.append(grads)
 
+         # Train the model
+         if self.target_type == "classification" and num_class > 2:
+             lb = LabelBinarizer(sparse_output=False)  # Change to True for sparse matrix
+             lb.fit(labels)
+             y_train = lb.transform(y_train[:, 0].flatten())
+             y_val = lb.transform(y_val[:, 0].flatten())
          else:
-             pred = pd.Series(pred, name="PRED")
+             y_train = y_train[:, 0].flatten()
+             y_val = y_val[:, 0].flatten()
+
+         logger.info("Fitting the model...")
+         logger.info(f"x_train shape: {x_train.shape}, x_val shape: {x_val.shape}")
+         logger.info(f"y_train shape: {y_train.shape}, y_val shape: {y_val.shape}")
+
+         history = model.fit(
+             x_train,
+             y_train,
+             batch_size=params["batch_size"],
+             verbose=0,
+             epochs=params["epochs"],
+             shuffle=False,
+             validation_data=(x_val, y_val),
+             callbacks=[early_stopping_callback, tensorboard_callback],
+         )
 
-         # set index for lgb and xgb (for keras, as we use np array, we need to set index outside)
-         if model.model_name in ["lgb", "xgb"]:
-             pred.index = data.index
-     else:
-         # sk learn
-         pred = pd.Series(model.predict(data), index=data.index, name="PRED")
-         if target_type == "classification":
-             pred_proba = pd.DataFrame(
-                 model.predict_proba(data),
-                 index=data.index,
-                 columns=[
-                     int(c) if isinstance(c, float) and c.is_integer() else c
-                     for c in model.classes_
-                 ],
-             )
+         logger.info(f"Successfully created a {model.model_name} at {datetime.now()}")
+         # logger.info(pd.DataFrame(gradiant.epoch_gradient))
 
-         # Apply threshold for binary classification
-         if len(model.classes_) == 2:
-             positive_class = model.classes_[1]  # Assuming classes are ordered
-             pred = (pred_proba[positive_class] >= threshold).astype(int)
-             pred.name = "PRED"
+         if self.plot:
+             # Plot loss per epoch
+             logs = pd.DataFrame(history.history)
 
-         pred = pd.concat([pred, pred_proba], axis=1)
+             plt.figure(figsize=(14, 4))
+             plt.plot(logs.loc[:, "loss"], lw=2, label="Training loss")
+             plt.plot(logs.loc[:, "val_loss"], lw=2, label="Validation loss")
+             plt.xlabel("Epoch")
+             plt.ylabel("Loss")
+             plt.legend()
+             plt.show()
 
-     return pred
+         self._model = model
 
+         return model
 
- def evaluate(prediction: pd.DataFrame, target_type: str):
-     """
-     Function to evaluate model performance
+     def predict(
+         self,
+         data: pd.DataFrame,
+         threshold: float = 0.5,
+     ):
+         """Function to get prediction from model. Support sklearn, keras and boosting models such as xgboost and lgboost
+
+         Args:
+         - data: the data for prediction
+         - threshold: the threshold for classification
+         """
+         if not self._model:
+             raise Exception(
+                 "Model is not fitted, cannot predict, run model.fit() first, or pass a fitted model when creating the Model object to the `model` parameter."
+             )
+         model = self._model
+
+         if self.threshold and threshold == 0.5:
+             threshold = self.threshold
+
+         if self.recurrent or model.model_name in ["lgb", "xgb"]:
+             # keras, lgb & xgb
+             if model.model_name == "lgb":
+                 # Direct prediction for LightGBM
+                 pred = model.predict(data)
+             elif model.model_name == "xgb":
+                 # Convert val_data to DMatrix for XGBoost
+                 d_data = xgb.DMatrix(data)
+                 pred = model.predict(d_data)
+             else:
+                 # Reshape (flatten) for keras if not multiclass
+                 pred = model.predict(data)
+                 if pred.shape[1] == 1:
+                     pred = pred.reshape(-1)
+
+             if self.target_type == "classification":
+                 num_class = pred.shape[1] if len(pred.shape) > 1 else 2
+
+                 if num_class <= 2:
+                     # For binary classification, concatenate the predicted probabilities for both classes
+                     pred_df = pd.DataFrame(
+                         {
+                             0: 1 - pred,  # Probability of class 0
+                             1: pred,  # Probability of class 1
+                         },
+                     )
+                 else:
+                     # For multi-class classification, use the predicted probabilities for each class
+                     pred_df = pd.DataFrame(pred, columns=range(num_class))
+
+                 # Get final predictions (argmax for multi-class, threshold for binary)
+                 if num_class == 2:
+                     pred_df["PRED"] = np.where(
+                         pred_df[1] >= threshold, 1, 0
+                     )  # Class 1 if prob >= threshold
+                 else:
+                     pred_df["PRED"] = pred_df.idxmax(
+                         axis=1
+                     )  # Class with highest probability for multiclasses
 
-     Args:
-     - prediction: the prediction dataframe containing TARGET and PRED columns, as well as predicted probablities for each class for classification tasks
-     - target_type: classification or regression
-     """
-     score = {}
-     y_true = prediction["TARGET"]
-     y_pred = prediction["PRED"]
+                 # Reorder columns to show predicted class first, then probabilities
+                 pred = pred_df[["PRED"] + list(range(num_class))]
 
-     if target_type == "regression":
-         # Main metrics
-         score["RMSE"] = root_mean_squared_error(y_true, y_pred)
-         score["MAE"] = mean_absolute_error(y_true, y_pred)
-         score["MAPE"] = mean_absolute_percentage_error(y_true, y_pred)
-         score["R2"] = r2_score(y_true, y_pred)
+             else:
+                 pred = pd.Series(pred, name="PRED")
 
-         # Robustness: avoid division by zero
-         std_target = y_true.std()
-         mean_target = y_true.mean()
-         median_target = y_true.median()
+             # set index for lgb and xgb (for keras, as we use np array, we need to set index outside)
+             if model.model_name in ["lgb", "xgb"]:
+                 pred.index = data.index
+         else:
+             # sk learn
+             pred = pd.Series(model.predict(data), index=data.index, name="PRED")
+             if self.target_type == "classification":
+                 pred_proba = pd.DataFrame(
+                     model.predict_proba(data),
+                     index=data.index,
+                     columns=[
+                         int(c) if isinstance(c, float) and c.is_integer() else c
+                         for c in model.classes_
+                     ],
+                 )
 
-         # RMSE / STD
-         score["RMSE_STD_RATIO"] = (
-             float(100 * score["RMSE"] / std_target) if std_target else 1000
-         )
+                 # Apply threshold for binary classification
+                 if len(model.classes_) == 2:
+                     positive_class = model.classes_[1]  # Assuming classes are ordered
+                     pred = (pred_proba[positive_class] >= threshold).astype(int)
+                     pred.name = "PRED"
 
-         # Median absolute deviation (MAD)
-         mam = (y_true - mean_target).abs().median()  # Median Abs around Mean
-         mad = (y_true - median_target).abs().median()  # Median Abs around Median
-         score["MAM"] = mam
-         score["MAD"] = mad
-         score["MAE_MAM_RATIO"] = (
-             float(100 * score["MAE"] / mam) if mam else 1000
-         )  # MAE / MAD → Plus stable, moins sensible aux outliers.
-         score["MAE_MAD_RATIO"] = (
-             float(100 * score["MAE"] / mad) if mad else 1000
-         )  # MAE / Médiane des écarts absolus autour de la moyenne: Moins robuste aux outliers
+                 pred = pd.concat([pred, pred_proba], axis=1)
 
-     else:
+         return pred
 
-         labels = np.unique(y_true)
-         num_classes = labels.size
-         y_pred_proba = (
-             prediction[1] if num_classes == 2 else prediction.iloc[:, 2:].values
+     def save(self, path):
+         if self.recurrent:
+             path += "/" + self.model_name + ".keras"
+             self._model.save(path)
+         else:
+             path += "/" + self.model_name + ".best"
+             joblib.dump(self._model, path)
+         self.path = path
+         return path
+
+     def load(self):
+         if not self.path:
+             raise ValueError("Path is not set, cannot load model")
+
+         training_target_dir = Path(self.path)
+
+         # Load threshold
+         scores_tracking = pd.read_csv(f"{training_target_dir}/scores_tracking.csv")
+         self.threshold = (
+             scores_tracking["THRESHOLD"].values[0]
+             if "THRESHOLD" in scores_tracking.columns
+             else None
          )
-         if num_classes > 2:
-             lb = LabelBinarizer(sparse_output=False)  # Change to True for sparse matrix
-             lb.fit(labels)
-             y_true_onhot = lb.transform(y_true)
-             y_pred_onehot = lb.transform(y_pred)
 
-         score["LOGLOSS"] = log_loss(y_true, y_pred_proba)
-         score["ACCURACY"] = accuracy_score(y_true, y_pred)
-         score["PRECISION"] = precision_score(
-             y_true,
-             y_pred,
-             average=("binary" if num_classes == 2 else "macro"),
-         )
-         score["RECALL"] = recall_score(
-             y_true,
-             y_pred,
-             average=("binary" if num_classes == 2 else "macro"),
-         )
-         score["F1"] = f1_score(
-             y_true,
-             y_pred,
-             average=("binary" if num_classes == 2 else "macro"),
-         )
-         score["ROC_AUC"] = float(roc_auc_score(y_true, y_pred_proba, multi_class="ovr"))
-         (
-             score["THRESHOLD"],
-             score["PRECISION_AT_THRESHOLD"],
-             score["RECALL_AT_THRESHOLD"],
-         ) = (
-             find_best_precision_threshold(prediction)
-             if num_classes == 2
-             else (None, None, None)
+         # Search for files that contain '.best' or '.keras' in the name
+         best_files = list(training_target_dir.glob("*.best*")) + list(
+             training_target_dir.glob("*.keras*")
          )
-     return score
+         # If any files are found, try loading the first one (or process as needed)
+         if best_files:
+             file_path = best_files[
+                 0
+             ]  # Assuming you want to open the first matching file
+             try:
+                 # Attempt to load the file as a scikit-learn, XGBoost, or LightGBM model (Pickle format)
+                 self._model = joblib.load(file_path)
+                 logger.info(
+                     f"Loaded model {self._model.model_name} and threshold {self.threshold}"
+                 )
+             except (pickle.UnpicklingError, EOFError):
+                 # If it's not a pickle file, try loading it as a Keras model
+                 try:
+                     # Attempt to load the file as a Keras model
+                     self._model = keras.models.load_model(file_path)
+                     logger.info(
+                         f"Loaded model {self._model.model_name} and threshold {self.threshold}"
+                     )
+                 except Exception as e:
+                     raise FileNotFoundError(
+                         f"Model could not be loaded from path: {file_path}: {e}"
+                     )
+         else:
+             raise FileNotFoundError(
+                 f"No files with '.best' or '.keras' found in the specified folder: {training_target_dir}"
+             )
 
+         self.model_name = self._model.model_name
+         self.target_type = self._model.target_type
 
- def train_model(params, x_train, y_train, x_val, y_val, config):
-     if "_type_name" in config.keys() and config["_type_name"] == "hyperopts":
-         global _target_number
-         global _target_type
-         global _session_name
-         global _plot
-         global _type_name
-         global _scaler_y
-         global _training_target_dir
-         _target_number = config["_target_number"]
-         _target_type = config["_target_type"]
-         _session_name = config["_session_name"]
-         _plot = config["_plot"]
-         _type_name = config["_type_name"]
-         _scaler_y = config["_scaler_y"]
-         _training_target_dir = config["_training_target_dir"]
-
-     # warnings.filterwarnings("ignore", category=UserWarning, module="pydantic")
-     # logging.getLogger("ray").setLevel(logging.CRITICAL)
-     # logging.getLogger("ray.tune").setLevel(logging.CRITICAL)
-     # logging.getLogger("ray.raylet").setLevel(logging.CRITICAL)
-     # logging.getLogger("raylet").setLevel(logging.CRITICAL)
 
-     logger.info(
-         f"TARGET_{_target_number} - Training a {config['model_name']} at {datetime.now()} : {_session_name}, TARGET_{_target_number}"
+ def trainable(
+     params,
+     x_train,
+     y_train,
+     x_val,
+     y_val,
+     model_name,
+     target_type,
+     session_name,
+     target_number,
+     create_model,
+     type_name="hyperopts",
+     plot=False,
+ ):
+     """Standalone version of train_model that doesn't depend on self"""
+     # Create model engine
+     model = ModelEngine(
+         model_name=model_name,
+         target_type=target_type,
+         create_model=create_model,
+         plot=plot,
      )
 
-     recurrent = config["recurrent"]
-     create_model = config["create_model"]
+     logger.info(
+         f"TARGET_{target_number} - Training a {model.model_name} at {datetime.now()} : {session_name}, TARGET_{target_number}"
+     )
 
-     if recurrent:
+     if model.recurrent:
          timesteps = params["timesteps"]
          x_train = x_train[:, -timesteps:, :]
          x_val = x_val[:, -timesteps:, :]
 
      # Compile and fit model on train set
      start = time.time()
-     if recurrent:
-         fit = fit_recurrent
-     elif (create_model == "lgb") or (create_model == "xgb"):
-         fit = fit_boosting
-     else:
-         fit = fit_sklearn
-     model = fit(
-         x_train,
-         y_train,
-         x_val,
-         y_val,
-         create_model,
-         params=params,
-         config=config,
-     )
+     model.fit(x_train, y_train, x_val, y_val, params)
      stop = time.time()
 
      # Prediction on val set
-     y_pred = predict(model, x_val, _target_type, config)
+     y_pred = model.predict(x_val)
 
      # fix for recurrent model because x_val has no index as it is a 3D np array
-     if config["recurrent"]:
+     if model.recurrent:
          y_val = pd.DataFrame(y_val, columns=["TARGET", "index"]).set_index("index")
          y_pred.index = y_val.index
 
      prediction = pd.concat([y_val, y_pred], axis=1)
 
      # Unscale the data
-     if config["need_scaling"] and _target_type == "regression":
+     if (
+         model.need_scaling
+         and model.target_type == "regression"
+         and model.scaler_y is not None
+     ):
          # scaler_y needs 2D array with shape (-1, 1)
-         prediction.loc[:, "TARGET"] = _scaler_y.inverse_transform(
+         prediction.loc[:, "TARGET"] = model.scaler_y.inverse_transform(
              prediction[["TARGET"]].values
          )
-         prediction.loc[:, "PRED"] = _scaler_y.inverse_transform(
+         prediction.loc[:, "PRED"] = model.scaler_y.inverse_transform(
              prediction[["PRED"]].values
          )
 
      # Evaluate model
      score = {
          "DATE": datetime.now(),
-         "SESSION": _session_name,
+         "SESSION": session_name,
          "TRAIN_DATA": x_train.shape[0],
          "VAL_DATA": x_val.shape[0],
          "FEATURES": x_train.shape[-1],
          "MODEL_NAME": model.model_name,
-         "TYPE": _type_name,
+         "TYPE": type_name,
          "TRAINING_TIME": stop - start,
          "EVAL_DATA_STD": prediction["TARGET"].std(),
      }
 
-     score.update(evaluate(prediction, _target_type))
+     score.update(evaluate(prediction, target_type))
 
-     if _type_name == "hyperopts":
+     if type_name == "hyperopts":
          session.report(metrics=score)
-         ray.tune.report(metrics=score)
          return score
 
      return score, model, prediction
 
 
712
- # Main training function
713
- def model_selection(
714
- dataset_id: int,
715
- models_idx: list,
716
- target_number: int,
717
- session_name,
718
- perform_hyperoptimization=True,
719
- perform_crossval=False,
720
- number_of_trials=20,
721
- plot=True,
722
- clean_dir=False, # TODO: This has been unused because now feature_selection is in the target directory
723
- preserve_model=True,
724
- reshaped_data=None,
725
- data=None,
726
- ):
727
- """
728
- Selects the best models based on a target variable, optionally performing hyperparameter optimization
729
- and cross-validation, and manages outputs in a session-specific directory.
730
-
731
- Args:
732
- models_idx (list):
733
- A list of indices or identifiers representing the models to evaluate.
734
- Each identifier corresponds to a predefined or available model.
735
-
736
- target_number (int):
737
- The number of the target variable (e.g., column index or predefined target) to predict.
738
- This determines the dataset's output variable for training and evaluation.
739
-
740
- session_name (str):
741
- A name for the current session, used to organize and store results
742
- (e.g., logs, metrics, trained models) in a session-specific directory.
743
-
744
- perform_hyperoptimization (bool, optional):
745
- Whether to perform hyperparameter optimization for the models.
746
- If `True`, the function will attempt to tune the hyperparameters of each model.
747
- Defaults to `True`.
748
-
749
- perform_crossval (bool, optional):
750
- Whether to perform cross-validation to evaluate model performance.
751
- If `True`, the function will use cross-validation to compute metrics.
752
- Defaults to `True`.
753
-
754
- number_of_trials (int, optional):
755
- The number of trials to run for hyperparameter optimization.
756
- Ignored if `perform_hyperoptimization` is `False`.
757
- Defaults to `20`.
758
-
759
- plot (bool, optional):
760
- Whether to enable plotting during the process.
761
- If `True`, plot will be displayed.
762
- Defaults to `True`.
763
-
764
- clean_dir (bool, optional):
765
- Whether to clean the entire target training directory before starting the process.
766
- If `True`, any existing files in the target training directory will be removed.
767
- Defaults to `False`.
768
-
769
- preserve_model (bool, optional):
770
- Whether to run the search even if there is already a best model in the directory.
771
- If `False`, previous best models won't be erased and the search will be skipped.
772
- Defaults to `False`.
773
-
774
- Returns:
775
- None
776
- The function runs the model selection process and outputs results
777
- (e.g., logs, metrics, and optionally models) to the session directory.
778
- """
779
- global _target_number
780
- global _target_type
781
- global _session_name
782
- global _plot
783
- global _type_name
784
- global _scaler_y
785
- global _training_target_dir
786
-
787
- global_vars = [
788
- "_target_number",
789
- "_target_type",
790
- "_session_name",
791
- "_plot",
792
- "_type_name",
793
- "_scaler_y",
794
- "_training_target_dir",
795
- ]
796
-
797
- _target_number = target_number
798
- _target_type = "classification" if target_number in TARGETS_CLF else "regression"
799
- _session_name = session_name
800
- _plot = plot
801
-
802
- if dataset_id is None:
803
- raise ValueError("dataset_id is not provided.")
804
-
805
- dataset = Dataset.get(dataset_id)
806
- dataset_dir = dataset.path
807
-
808
- training_target_dir = f"{dataset_dir}/TARGET_{_target_number}"
809
- _training_target_dir = training_target_dir
810
-
811
- metric = "RMSE" if _target_type == "regression" else "LOGLOSS"
812
-
813
- # load features, scalers and data
814
- features = dataset.get_features(target_number)
815
- all_features = dataset.get_all_features()
816
-
817
- if data:
818
- train = data["train"]
819
- val = data["val"]
820
- train_scaled = data["train_scaled"]
821
- val_scaled = data["val_scaled"]
822
- _scaler_y = (
823
- data["scalers_y"][f"scaler_y_{target_number}"]
824
- if _target_type == "regression"
825
- else None
718
+ class ModelSelectionEngine:
719
+
720
+ def __init__(
721
+ self,
722
+ data,
723
+ reshaped_data,
724
+ target_number,
725
+ target_clf,
726
+ dataset,
727
+ models_idx,
728
+ time_series,
729
+ date_column,
730
+ group_column,
731
+ **kwargs,
732
+ ):
733
+ self.data = data
734
+ self.reshaped_data = reshaped_data
735
+ self.target_number = target_number
736
+ self.dataset = dataset
737
+ self.target_clf = target_clf
738
+ self.models_idx = models_idx
739
+ self.time_series = time_series
740
+ self.date_column = date_column
741
+ self.group_column = group_column
742
+
743
+ self.target_type = (
744
+ "classification" if self.target_number in self.target_clf else "regression"
826
745
  )
827
- else:
828
- train, val, train_scaled, val_scaled, _scaler_y = load_train_data(
829
- dataset_dir, target_number, _target_type
746
+ self.dataset_dir = self.dataset.path
747
+ self.dataset_id = self.dataset.id
748
+ self.data_dir = f"{self.dataset_dir}/data"
749
+ self.preprocessing_dir = f"{self.dataset_dir}/preprocessing"
750
+ self.training_target_dir = f"{self.dataset_dir}/TARGET_{self.target_number}"
751
+ self.metric = "RMSE" if self.target_type == "regression" else "LOGLOSS"
752
+ self.features = self.dataset.get_features(self.target_number)
753
+ self.all_features = self.dataset.get_all_features(
754
+ date_column=self.date_column, group_column=self.group_column
830
755
  )
831
756
 
832
- list_models = ml_models + dl_recurrent_models
833
-
834
- if any(list_models[i].get("recurrent") for i in models_idx):
835
- if reshaped_data is None:
836
- raise ValueError("reshaped_data is not provided.")
837
-
838
- logger.info("Loading reshaped data...")
839
- x_train_reshaped = reshaped_data["x_train_reshaped"]
840
- y_train_reshaped = reshaped_data["y_train_reshaped"]
841
- x_val_reshaped = reshaped_data["x_val_reshaped"]
842
- y_val_reshaped = reshaped_data["y_val_reshaped"]
843
-
844
- # create model selection in db
845
- target = Target.find_by(name=f"TARGET_{target_number}")
846
- model_selection = ModelSelection.upsert(
847
- match_fields=["target_id", "dataset_id"],
848
- target_id=target.id,
849
- dataset_id=dataset.id,
850
- )
757
+ # Main training function
758
+ def run(
759
+ self,
760
+ session_name,
761
+ perform_hyperopt=True,
762
+ number_of_trials=20,
763
+ perform_crossval=False,
764
+ plot=True,
765
+ clean_dir=False, # TODO: This has been unused because now feature_selection is in the target directory
766
+ preserve_model=True,
767
+ ):
768
+ """
769
+ Selects the best models based on a target variable, optionally performing hyperparameter optimization
770
+ and cross-validation, and manages outputs in a session-specific directory.
771
+ """
772
+ self.session_name = session_name
773
+ self.plot = plot
774
+ self.number_of_trials = number_of_trials
775
+
776
+ if self.dataset_id is None:
777
+ raise ValueError("Please provide a dataset.")
778
+
779
+ if self.data:
780
+ train = self.data["train"]
781
+ val = self.data["val"]
782
+ test = self.data["test"]
783
+ train_scaled = self.data["train_scaled"]
784
+ val_scaled = self.data["val_scaled"]
785
+ test_scaled = self.data["test_scaled"]
786
+ else:
787
+ (
788
+ train,
789
+ val,
790
+ test,
791
+ train_scaled,
792
+ val_scaled,
793
+ test_scaled,
794
+ ) = load_train_data(self.dataset_dir, self.target_number, self.target_clf)
795
+
796
+ if (
797
+ any(all_models[i].get("recurrent") for i in self.models_idx)
798
+ and not self.time_series
799
+ ):
800
+ ValueError(
801
+ "You need to set time_series to true to use recurrent model, or remove recurrent models from models_idx chosen"
802
+ )
851
803
 
852
- # recurrent models starts at 9 # len(list_models)
853
- for i in models_idx:
854
- config = list_models[i]
855
- if config["recurrent"] is False and config[_target_type] is None:
856
- continue # for naive bayes models that cannot be used in regression
857
-
858
- results_dir = f"{training_target_dir}/{config['model_name']}"
859
- if not os.path.exists(f"{results_dir}"):
860
- os.makedirs(f"{results_dir}")
861
- elif preserve_model and contains_best(results_dir):
862
- continue
863
- elif perform_hyperoptimization:
864
- clean_directory(results_dir)
865
-
866
- logger.info(f"Training a {config['model_name']}")
867
- model = Model.upsert(
868
- match_fields=["name", "type"],
869
- name=config["model_name"],
870
- type=_target_type,
871
- )
872
- model_training = ModelTraining.upsert(
873
- match_fields=["model_id", "model_selection_id"],
874
- model_id=model.id,
875
- model_selection_id=model_selection.id,
804
+ if (
805
+ any(all_models[i].get("recurrent") for i in self.models_idx)
806
+ and self.time_series
807
+ ):
808
+ if self.reshaped_data is None:
809
+ raise ValueError("reshaped_data is not provided.")
810
+
811
+ logger.info("Loading reshaped data...")
812
+ x_train_reshaped = self.reshaped_data["x_train_reshaped"]
813
+ y_train_reshaped = self.reshaped_data["y_train_reshaped"]
814
+ x_val_reshaped = self.reshaped_data["x_val_reshaped"]
815
+ y_val_reshaped = self.reshaped_data["y_val_reshaped"]
816
+ x_test_reshaped = self.reshaped_data["x_test_reshaped"]
817
+ y_test_reshaped = self.reshaped_data["y_test_reshaped"]
818
+
819
+ # create model selection in db
820
+ target = Target.find_by(name=f"TARGET_{self.target_number}")
821
+ model_selection = ModelSelection.upsert(
822
+ match_fields=["target_id", "dataset_id"],
823
+ target_id=target.id,
824
+ dataset_id=self.dataset_id,
876
825
  )
877
826
 
878
- # getting data
879
- if config["recurrent"]:
880
- # Clear cluster from previous Keras session graphs.
881
- K.clear_session()
882
-
883
- features_idx = [i for i, e in enumerate(all_features) if e in set(features)]
884
- # TODO: Verify that features_idx are the right one, because scaling can re-arrange columns...
885
- x_train = x_train_reshaped[:, :, features_idx]
886
- y_train = y_train_reshaped[:, [target_number, 0]]
887
- x_val = x_val_reshaped[:, :, features_idx]
888
- y_val = y_val_reshaped[:, [target_number, 0]]
889
- else:
890
- new_config = config[_target_type]
891
- new_config["model_name"] = config["model_name"]
892
- new_config["recurrent"] = config["recurrent"]
893
- new_config["need_scaling"] = config["need_scaling"]
894
- config = new_config
895
-
896
- if config["need_scaling"] and _target_type == "regression":
897
- x_train = train_scaled[features]
898
- y_train = train_scaled[f"TARGET_{target_number}"].rename("TARGET")
899
- x_val = val_scaled[features]
900
- y_val = val_scaled[f"TARGET_{target_number}"].rename("TARGET")
901
- else:
902
- x_train = train[features]
903
- y_train = train[f"TARGET_{target_number}"].rename("TARGET")
904
- x_val = val[features]
905
- y_val = val[f"TARGET_{target_number}"].rename("TARGET")
906
-
907
- start = time.time()
908
- # Tuning hyperparameters
909
- if perform_hyperoptimization:
910
- _type_name = "hyperopts"
911
-
912
- for var in global_vars:
913
- config[var] = globals()[var]
914
-
915
- logger.info("Start tuning hyperparameters...")
916
-
917
- storage_path = f"{results_dir}/ray_results"
918
- # ray.shutdown()
919
- # ray.init(
920
- # runtime_env={
921
- # "working_dir": ".", # or your project path
922
- # "env_vars": {"PYTHONPATH": "."}
923
- # }
924
- # )
925
- tuner = Tuner(
926
- trainable=with_parameters(
927
- train_model,
928
- x_train=x_train,
929
- y_train=y_train,
930
- x_val=x_val,
931
- y_val=y_val,
932
- config=config,
933
- ),
934
- param_space=config["search_params"],
935
- tune_config=TuneConfig(
936
- metric=metric,
937
- mode="min",
938
- search_alg=HyperOptSearch(),
939
- num_samples=number_of_trials,
940
- scheduler=ASHAScheduler(max_t=100, grace_period=10),
941
- ),
942
- run_config=RunConfig(
943
- stop={"training_iteration": 100},
944
- storage_path=storage_path,
945
- # name=datetime.now().strftime("%d-%m-%Y") + "-" + session_name,
946
- callbacks=[TBXLoggerCallback()],
947
- # log_to_file=("stdout.log", "stderr.log"), # depreciated
948
- # verbose=0,
949
- ),
827
+ # recurrent models starts at 9 # len(list_models)
828
+ for i in self.models_idx:
829
+ config = all_models[i]
830
+ recurrent = config["recurrent"]
831
+ need_scaling = config["need_scaling"]
832
+ model_name = config["model_name"]
833
+
834
+ if recurrent is False and config[self.target_type] is None:
835
+ continue # for naive bayes models that cannot be used in regression
836
+
837
+ self.results_dir = f"{self.training_target_dir}/{model_name}"
838
+ if not os.path.exists(f"{self.results_dir}"):
839
+ os.makedirs(f"{self.results_dir}")
840
+ elif preserve_model and contains_best(self.results_dir):
841
+ continue
842
+ elif perform_hyperopt:
843
+ clean_directory(self.results_dir)
844
+
845
+ logger.info(f"Training a {model_name}")
846
+ model = Model.upsert(
847
+ match_fields=["name", "type"],
848
+ name=model_name,
849
+ type=self.target_type,
850
+ )
851
+ model_training = ModelTraining.upsert(
852
+ match_fields=["model_id", "model_selection_id"],
853
+ model_id=model.id,
854
+ model_selection_id=model_selection.id,
950
855
  )
951
- try:
952
- results = tuner.fit()
953
856
 
954
- best_result = results.get_best_result(metric, "max")
955
- best_params = best_result.config
956
- best_score = best_result.metrics
857
+ # getting data
858
+ if recurrent:
859
+ # Clear cluster from previous Keras session graphs.
860
+ K.clear_session()
861
+
862
+ features_idx = [
863
+ i
864
+ for i, e in enumerate(self.all_features)
865
+ if e in set(self.features)
866
+ ]
867
+ # TODO: Verify that features_idx are the right one, because scaling can re-arrange columns...
868
+ x_train = x_train_reshaped[:, :, features_idx]
869
+ y_train = y_train_reshaped[:, [self.target_number, 0]]
870
+ x_val = x_val_reshaped[:, :, features_idx]
871
+ y_val = y_val_reshaped[:, [self.target_number, 0]]
872
+ x_test = x_test_reshaped[:, :, features_idx]
873
+ y_test = y_test_reshaped[:, [self.target_number, 0]]
874
+ else:
875
+ config = config[self.target_type]

-                # log results
-                logger.info(f"Best hyperparameters found were:\n{best_params}")
-                logger.info(f"Best Scores found were:\n{best_score}")
+            if need_scaling and self.target_type == "regression":
+                x_train = train_scaled[self.features]
+                y_train = train_scaled[f"TARGET_{self.target_number}"].rename(
+                    "TARGET"
+                )
+                x_val = val_scaled[self.features]
+                y_val = val_scaled[f"TARGET_{self.target_number}"].rename("TARGET")
+                x_test = test_scaled[self.features]
+                y_test = test_scaled[f"TARGET_{self.target_number}"].rename(
+                    "TARGET"
+                )
+            else:
+                x_train = train[self.features]
+                y_train = train[f"TARGET_{self.target_number}"].rename("TARGET")
+                x_val = val[self.features]
+                y_val = val[f"TARGET_{self.target_number}"].rename("TARGET")
+                x_test = test[self.features]
+                y_test = test[f"TARGET_{self.target_number}"].rename("TARGET")
+
+            log_dir = get_log_dir(self.training_target_dir, model_name)
+            # instantiate model
+            model = ModelEngine(
+                model_name=model_name,
+                search_params=config["search_params"],
+                target_type=self.target_type,
+                create_model=config["create_model"],
+                plot=self.plot,
+                log_dir=log_dir,
+            )

-                df_results = results.get_dataframe()
-                logger.info(
-                    f"Markdown table with all trials:\n{df_results.to_markdown()}"
-                )
+            start = time.time()
+            # Tuning hyperparameters
+            if perform_hyperopt:
+                best_params = self.hyperoptimize(x_train, y_train, x_val, y_val, model)

                 # save best params
-                best_params_file = f"{training_target_dir}/best_params.json"
+                best_params_file = f"{self.training_target_dir}/best_params.json"
                 try:
                     with open(best_params_file, "r") as f:
                         json_dict = json.load(f)
                 except FileNotFoundError:
                     json_dict = {}

-                json_dict[config["model_name"]] = serialize_for_json(best_params)
+                json_dict[model.model_name] = serialize_for_json(best_params)
                 with open(best_params_file, "w") as f:
                     json.dump(json_dict, f, indent=4)
+            else:
+                try:
+                    with open(f"{self.training_target_dir}/best_params.json") as f:
+                        json_dict = json.load(f)
+                        best_params = json_dict[model_name]
+                except Exception:
+                    raise FileNotFoundError(
+                        f"Could not find {model_name} in current data. Try running a hyperoptimization by setting `perform_hyperopt` to True"
+                    )
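
The `best_params.json` handling above amounts to a small read-merge-write cache keyed by model name, with a `FileNotFoundError` raised when tuning was skipped but no cached params exist. Extracted into a standalone pair of helpers (the function names are ours, not the library's):

```python
import json
from pathlib import Path

def save_best_params(training_target_dir: str, model_name: str, best_params: dict) -> None:
    """Merge one model's tuned params into the shared best_params.json."""
    path = Path(training_target_dir) / "best_params.json"
    try:
        json_dict = json.loads(path.read_text())
    except FileNotFoundError:
        json_dict = {}
    json_dict[model_name] = best_params
    path.write_text(json.dumps(json_dict, indent=4))

def load_best_params(training_target_dir: str, model_name: str) -> dict:
    """Read one model's cached params back; fail loudly if tuning never ran."""
    path = Path(training_target_dir) / "best_params.json"
    try:
        return json.loads(path.read_text())[model_name]
    except (FileNotFoundError, KeyError):
        raise FileNotFoundError(
            f"No tuned params for {model_name}; run with perform_hyperopt=True first"
        )
```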

-            except Exception as e:
-                ray.shutdown()
-                raise Exception(e)
-                logger.error(e)
-
-            ray.shutdown()
-
-            # Collect errors in single file
-            collect_error_logs(
-                training_target_dir=training_target_dir, storage_path=storage_path
-            )
-
-            # Clean up
-            for var in global_vars:
-                del config[var]
-        else:
-            try:
-                with open(f"{training_target_dir}/best_params.json") as f:
-                    json_dict = json.load(f)
-                    best_params = json_dict[config["model_name"]]
-            except Exception:
-                raise FileNotFoundError(
-                    f"Could not find {config['model_name']} in current data. Try to run an hyperoptimization by setting `perform_hyperoptimization` to true"
+            # Perform cross-validation of the best model on k-folds of train + val set
+            if perform_crossval:
+                x_train_val = pd.concat([x_train, x_val, x_test], axis=0)
+                y_train_val = pd.concat([y_train, y_val, y_test], axis=0)
+                n_splits = 4
+                n_samples = len(x_train_val)
+                test_size = int(n_samples / (n_splits + 4))
+                tscv = TimeSeriesSplit(n_splits=n_splits, test_size=test_size)
+
+                # Store the scores
+                cross_validation_scores = []
+
+                for i, (train_index, val_index) in enumerate(tscv.split(x_train_val)):
+                    self.type_name = f"crossval_fold_{i}"
+
+                    if self.time_series:
+                        date_series = train[self.date_column].copy()
+
+                        if need_scaling:
+                            date_series = date_series.map(pd.Timestamp.fromordinal)
+
+                        # Now you can use the actual train/val indices to extract ranges
+                        train_start = date_series.iloc[train_index[0]]
+                        train_end = date_series.iloc[train_index[-1]]
+                        val_start = date_series.iloc[val_index[0]]
+                        val_end = date_series.iloc[val_index[-1]]
+
+                        logger.info(
+                            f"[Fold {i}] Train: {len(train_index)} samples from {train_start.date()} to {train_end.date()} | "
+                            f"Validation: {len(val_index)} samples from {val_start.date()} to {val_end.date()}"
+                        )
+                    else:
+                        logger.info(
+                            f"[Fold {i}] Train: {len(train_index)} samples | Validation: {len(val_index)} samples"
+                        )
+
+                    # Train the model and get the score
+                    if recurrent:
+                        cross_validation_score, _, _ = self.train_model(
+                            params=best_params,
+                            x_train=x_train_val[train_index],
+                            y_train=y_train_val[train_index],
+                            x_val=x_train_val[val_index],
+                            y_val=y_train_val[val_index],
+                            model=model,
+                        )
+                    else:
+                        cross_validation_score, _, _ = self.train_model(
+                            params=best_params,
+                            x_train=x_train_val.iloc[train_index],
+                            y_train=y_train_val.iloc[train_index],
+                            x_val=x_train_val.iloc[val_index],
+                            y_val=y_train_val.iloc[val_index],
+                            model=model,
+                        )
+
+                    # Append score to the list
+                    cross_validation_scores.append(cross_validation_score)
+
+                # Calculate and log the mean score
+                cross_validation_mean_score = pd.DataFrame(cross_validation_scores)[
+                    self.metric
+                ].mean()
+                logger.info(
+                    f"Best model mean cross-validation score on entire dataset: {cross_validation_mean_score}"
                 )
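
`TimeSeriesSplit` with an explicit `test_size` produces forward-chaining folds: each fold trains on a prefix of the data and validates on the window that immediately follows, which is why the fold logging above can report contiguous date ranges. A runnable illustration of the fold geometry, using the same sizing rule as the code above:

```python
import numpy as np
from sklearn.model_selection import TimeSeriesSplit

n_samples, n_splits = 800, 4
test_size = n_samples // (n_splits + 4)  # same sizing rule as in the diff
X = np.arange(n_samples).reshape(-1, 1)

tscv = TimeSeriesSplit(n_splits=n_splits, test_size=test_size)
for fold, (train_idx, val_idx) in enumerate(tscv.split(X)):
    # each fold trains on everything before its validation window
    print(f"fold {fold}: train=[0..{train_idx[-1]}] val=[{val_idx[0]}..{val_idx[-1]}]")
```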

-        # Perform cross-validation of the best model on k-folds of train + val set
-        if perform_crossval:
-            x_train_val = pd.concat([x_train, x_val], axis=0)
-            y_train_val = pd.concat([y_train, y_val], axis=0)
-            n_splits = 4
-            n_samples = len(x_train_val)
-            test_size = int(n_samples / (n_splits + 4))
-            tscv = TimeSeriesSplit(n_splits=n_splits, test_size=test_size)
-
-            # Store the scores
-            cross_validation_scores = []
-
-            for i, (train_index, val_index) in enumerate(tscv.split(x_train_val)):
-                _type_name = f"crossval_fold_{i}"
-
-                if DATE_COLUMN:
-                    date_column = train[DATE_COLUMN].copy()
+                # Retrain on entire training set, but keep score on cross-validation folds
+                best_score, best_model, best_pred = self.train_model(
+                    params=best_params,
+                    x_train=pd.concat([x_train, x_val], axis=0),
+                    y_train=pd.concat([y_train, y_val], axis=0),
+                    x_val=x_test,
+                    y_val=y_test,
+                    model=model,
+                )
+                best_score = cross_validation_mean_score
+            else:
+                # Evaluate on validation set
+                self.type_name = "validation"
+                best_score, best_model, best_pred = self.train_model(
+                    params=best_params,
+                    x_train=pd.concat([x_train, x_val], axis=0),
+                    y_train=pd.concat([y_train, y_val], axis=0),
+                    x_val=x_test,
+                    y_val=y_test,
+                    model=model,
+                )

-                    if config.get("need_scaling"):
-                        date_column = date_column.map(pd.Timestamp.fromordinal)
+            logger.info(f"Best model scores on test set: {best_score}")

-                    # Now you can use the actual train/val indices to extract ranges
-                    train_start = date_column.iloc[train_index[0]]
-                    train_end = date_column.iloc[train_index[-1]]
-                    val_start = date_column.iloc[val_index[0]]
-                    val_end = date_column.iloc[val_index[-1]]
+            # Save validation predictions
+            best_pred.to_csv(
+                f"{self.results_dir}/pred_val.csv",
+                index=True,
+                header=True,
+                index_label="ID",
+            )

-                    logger.info(
-                        f"[Fold {i}] Train: {len(train_index)} samples from {train_start.date()} to {train_end.date()} | "
-                        f"Validation: {len(val_index)} samples from {val_start.date()} to {val_end.date()}"
-                    )
-                else:
-                    logger.info(
-                        f"[Fold {i}] Train: {len(train_index)} samples | Validation: {len(val_index)} samples"
-                    )
+            # Save best model
+            model_path = best_model.save(self.results_dir)

-                # Train the model and get the score
-                if config["recurrent"]:
-                    cross_validation_score, _, _ = train_model(
-                        params=best_params,
-                        x_train=x_train_val[train_index],
-                        y_train=y_train_val[train_index],
-                        x_val=x_train_val[val_index],
-                        y_val=y_train_val[val_index],
-                        config=config,
-                    )
-                else:
-                    cross_validation_score, _, _ = train_model(
-                        params=best_params,
-                        x_train=x_train_val.iloc[train_index],
-                        y_train=y_train_val.iloc[train_index],
-                        x_val=x_train_val.iloc[val_index],
-                        y_val=y_train_val.iloc[val_index],
-                        config=config,
-                    )
+            model_path = Path(model_path).resolve()
+            best_score["MODEL_PATH"] = model_path

-                # Append score to the list
-                cross_validation_scores.append(cross_validation_score)
+            # Track scores
+            scores_tracking_path = f"{self.training_target_dir}/scores_tracking.csv"
+            best_score_df = pd.DataFrame([best_score])

-            # Calculate and log the mean score
-            cross_validation_mean_score = pd.DataFrame(cross_validation_scores)[
-                metric
-            ].mean()
-            logger.info(
-                f"Best model mean cross-validation score: {cross_validation_mean_score}"
+            if os.path.exists(scores_tracking_path):
+                existing_scores = pd.read_csv(scores_tracking_path)
+                common_cols = existing_scores.columns.intersection(
+                    best_score_df.columns
+                )
+                best_score_df = best_score_df[common_cols]
+                scores_tracking = pd.concat(
+                    [existing_scores, best_score_df], ignore_index=True
+                )
+            else:
+                scores_tracking = best_score_df
+
+            scores_tracking.sort_values(self.metric, ascending=True, inplace=True)
+            scores_tracking.to_csv(scores_tracking_path, index=False)
+
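
The score-tracking logic above appends one row per trained model to `scores_tracking.csv`, keeping only the columns shared with the existing history and re-sorting ascending on the metric (so row 0 is the best model, given a lower-is-better metric). Isolated for clarity (the helper name is ours):

```python
import os
import pandas as pd

def track_score(best_score: dict, scores_tracking_path: str, metric: str) -> pd.DataFrame:
    """Append one score row, aligning columns with any existing history."""
    best_score_df = pd.DataFrame([best_score])
    if os.path.exists(scores_tracking_path):
        existing = pd.read_csv(scores_tracking_path)
        # drop any columns the history does not already know about
        common = existing.columns.intersection(best_score_df.columns)
        scores = pd.concat([existing, best_score_df[common]], ignore_index=True)
    else:
        scores = best_score_df
    scores.sort_values(metric, ascending=True, inplace=True)  # best model first
    scores.to_csv(scores_tracking_path, index=False)
    return scores
```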
+            # Save model training metadata
+            stop = time.time()
+            training_time = stop - start
+            model_training.best_params = best_params
+            model_training.model_path = model_path
+            model_training.training_time = training_time
+            model_training.save()
+
+            # Store metrics in DB
+            drop_cols = [
+                "DATE",
+                "SESSION",
+                "TRAIN_DATA",
+                "VAL_DATA",
+                "FEATURES",
+                "MODEL_NAME",
+                "MODEL_PATH",
+            ]
+            best_score = {k: v for k, v in best_score.items() if k not in drop_cols}
+            score_data = {k.lower(): v for k, v in best_score.items()}
+
+            Score.upsert(
+                match_fields=["model_training_id"],
+                model_training_id=model_training.id,
+                **score_data,
             )

-            # Retrain on entire training set, but keep score on cross-validation folds
-            best_score, best_model, best_pred = train_model(
-                params=best_params,
+            logger.info(f"Model training finished in {training_time:.2f} seconds")
+
+        # find best model type
+        scores_tracking_path = f"{self.training_target_dir}/scores_tracking.csv"
+        scores_tracking = pd.read_csv(scores_tracking_path)
+        best_score_overall = scores_tracking.iloc[0, :]
+        best_model_name = best_score_overall["MODEL_NAME"]
+
+        # Remove any .best or .keras files
+        for file_path in glob.glob(
+            os.path.join(self.training_target_dir, "*.best")
+        ) + glob.glob(os.path.join(self.training_target_dir, "*.keras")):
+            os.remove(file_path)
+        # Copy the best model in root training folder for this target
+        best_model_path = Path(
+            f"{self.training_target_dir}/{os.path.basename(best_score_overall['MODEL_PATH'])}"
+        ).resolve()
+        copy_any(
+            best_score_overall["MODEL_PATH"],
+            best_model_path,
+        )
+
+        with open(f"{self.training_target_dir}/best_params.json", "r") as f:
+            best_model_params = json.load(f)[best_model_name]
+
+        # save model_selection results to db
+        model_selection = ModelSelection.get(model_selection.id)
+        model_selection.best_model_id = Model.find_by(
+            name=best_score_overall["MODEL_NAME"], type=self.target_type
+        ).id
+        model_selection.best_model_params = best_model_params
+        model_selection.best_model_path = best_model_path
+        model_selection.save()
+
+        logger.info(f"Best model overall is: {best_score_overall}")
+
+ def hyperoptimize(self, x_train, y_train, x_val, y_val, model: ModelEngine):
1121
+ self.type_name = "hyperopts"
1122
+
1123
+ def collect_error_logs(training_target_dir: int, storage_path: str):
1124
+ output_error_file = f"{training_target_dir}/errors.log"
1125
+
1126
+ with open(output_error_file, "a") as outfile:
1127
+ # Walk through the ray_results directory
1128
+ for root, dirs, files in os.walk(storage_path):
1129
+ # Check if 'error.txt' exists in the current directory
1130
+ if "error.txt" in files:
1131
+ error_file_path = os.path.join(root, "error.txt")
1132
+ logger.info(f"Processing error file: {error_file_path}")
1133
+ # Read and append the content of the error.txt file
1134
+ with open(error_file_path, "r") as infile:
1135
+ outfile.write(f"\n\n=== Error from {error_file_path} ===\n")
1136
+ outfile.write(infile.read())
1137
+ logger.info(f"All errors written to {output_error_file}")
1138
+
1139
+ logger.info("Start tuning hyperparameters...")
1140
+
1141
+ storage_path = f"{self.results_dir}/ray_results"
1142
+
1143
+ tuner = Tuner(
1144
+ trainable=with_parameters(
1145
+ trainable,
1074
1146
  x_train=x_train,
1075
1147
  y_train=y_train,
1076
1148
  x_val=x_val,
1077
1149
  y_val=y_val,
1078
- config=config,
1150
+ model_name=model.model_name,
1151
+ target_type=self.target_type,
1152
+ session_name=self.session_name,
1153
+ target_number=self.target_number,
1154
+ create_model=model.create_model,
1155
+ type_name="hyperopts",
1156
+ plot=model.plot,
1157
+ ),
1158
+ param_space=model.search_params,
1159
+ tune_config=TuneConfig(
1160
+ metric=self.metric,
1161
+ mode="min",
1162
+ search_alg=HyperOptSearch(),
1163
+ num_samples=self.number_of_trials,
1164
+ scheduler=ASHAScheduler(max_t=100, grace_period=10),
1165
+ ),
1166
+ run_config=RunConfig(
1167
+ stop={"training_iteration": 100},
1168
+ storage_path=storage_path,
1169
+ callbacks=[TBXLoggerCallback()],
1170
+ ),
1171
+ )
1172
+ try:
1173
+ results = tuner.fit()
1174
+
1175
+ best_result = results.get_best_result(self.metric, "max")
1176
+ best_params = best_result.config
1177
+ best_score = best_result.metrics
1178
+
1179
+ # log results
1180
+ logger.info(f"Best hyperparameters found were:\n{best_params}")
1181
+ logger.info(f"Best Scores found were:\n{best_score}")
1182
+ logger.info(
1183
+ f"Markdown table with all trials :\n{results.get_dataframe().to_markdown()}"
1079
1184
  )
1080
- best_score = cross_validation_mean_score
1081
- else:
1082
- # Evaluate on validation set
1083
- _type_name = "validation"
1084
- best_score, best_model, best_pred = train_model(
1085
- params=best_params,
1086
- x_train=x_train,
1087
- y_train=y_train,
1088
- x_val=x_val,
1089
- y_val=y_val,
1090
- config=config,
1185
+ # Collect errors in single file
1186
+ collect_error_logs(
1187
+ training_target_dir=self.training_target_dir, storage_path=storage_path
1091
1188
  )
1092
1189
 
1093
- logger.info(f"Best model scores on validation set: {best_score}")
1190
+ except Exception as e:
1191
+ raise Exception(e)
1094
1192
 
1095
- # Save validation predictions
1096
- best_pred.to_csv(
1097
- f"{results_dir}/pred_val.csv",
1098
- index=True,
1099
- header=True,
1100
- index_label="ID",
1193
+ finally:
1194
+ ray.shutdown()
1195
+
1196
+ return best_params
1197
+
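
Both versions drive tuning through the same Ray Tune stack — a `Tuner` with a `HyperOptSearch` searcher and an `ASHAScheduler` for early stopping — and 0.4.1 mainly replaces the closed-over `config` dict with explicit keyword arguments bound via `with_parameters`. One detail worth flagging: in both versions the tuner is configured with `mode="min"` while results are read back with `get_best_result(self.metric, "max")`, which looks inconsistent. A self-contained sketch of the pattern with a toy objective, assuming the Ray 2.x `ray.air`-era API imported at the top of this file (and keeping min/min consistent):

```python
import ray
from ray import tune
from ray.air import RunConfig, session
from ray.tune import Tuner, TuneConfig, with_parameters
from ray.tune.schedulers import ASHAScheduler
from ray.tune.search.hyperopt import HyperOptSearch

def trainable(params, offset):
    # Toy objective: minimize (x - offset)^2; `offset` is bound by with_parameters.
    loss = (params["x"] - offset) ** 2
    session.report({"loss": loss})

tuner = Tuner(
    trainable=with_parameters(trainable, offset=3.0),  # bind constant/large args once
    param_space={"x": tune.uniform(-10, 10)},
    tune_config=TuneConfig(
        metric="loss",
        mode="min",
        search_alg=HyperOptSearch(),
        num_samples=20,
        scheduler=ASHAScheduler(max_t=100, grace_period=10),
    ),
    run_config=RunConfig(stop={"training_iteration": 100}),
)
try:
    results = tuner.fit()
    best = results.get_best_result("loss", "min")
    print(best.config, best.metrics["loss"])
finally:
    ray.shutdown()
```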
+    def train_model(self, params, x_train, y_train, x_val, y_val, model: ModelEngine):
+        # Use the standalone training function to avoid duplication
+        # For train_model, we pass the data directly (not as Ray references)
+        return trainable(
+            params,
+            x_train,
+            y_train,
+            x_val,
+            y_val,
+            model.model_name,
+            self.target_type,
+            self.session_name,
+            self.target_number,
+            model.create_model,
+            self.type_name,
+            model.plot,
         )

-        # Save best model
-        if config["recurrent"]:
-            model_path = f"{results_dir}/{best_model.model_name}.keras"
-            best_model.save(model_path)
-        else:
-            model_path = f"{results_dir}/{best_model.model_name}.best"
-            joblib.dump(best_model, model_path)
-
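
The removed branch shows how 0.4.0 persisted the winning estimator: Keras-native `.keras` files for recurrent networks, `joblib` pickles with a `.best` suffix otherwise. In 0.4.1 this moves behind `ModelEngine.save`, whose internals are not in this hunk; the underlying pattern, as a standalone sketch:

```python
import joblib
from pathlib import Path

def save_model(model, results_dir: str, model_name: str, recurrent: bool) -> Path:
    """Persist a model: .keras for recurrent nets, a joblib '.best' file otherwise."""
    if recurrent:
        model_path = Path(results_dir) / f"{model_name}.keras"
        model.save(str(model_path))  # Keras native serialization
    else:
        model_path = Path(results_dir) / f"{model_name}.best"
        joblib.dump(model, model_path)  # pickle-based; fine for sklearn-style models
    return model_path.resolve()
```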
-        model_path = Path(model_path).resolve()
-        best_score["MODEL_PATH"] = model_path
-
-        # Track scores
-        scores_tracking_path = f"{training_target_dir}/scores_tracking.csv"
-        best_score_df = pd.DataFrame([best_score])
-
-        if os.path.exists(scores_tracking_path):
-            existing_scores = pd.read_csv(scores_tracking_path)
-            common_cols = existing_scores.columns.intersection(best_score_df.columns)
-            best_score_df = best_score_df[common_cols]
-            scores_tracking = pd.concat(
-                [existing_scores, best_score_df], ignore_index=True
-            )
-        else:
-            scores_tracking = best_score_df
-
-        scores_tracking.sort_values(metric, ascending=True, inplace=True)
-        scores_tracking.to_csv(scores_tracking_path, index=False)
-
-        # Save model training metadata
-        stop = time.time()
-        training_time = stop - start
-        model_training.best_params = best_params
-        model_training.model_path = model_path
-        model_training.training_time = training_time
-        model_training.save()
-
-        # Store metrics in DB
-        drop_cols = [
-            "DATE",
-            "SESSION",
-            "TRAIN_DATA",
-            "VAL_DATA",
-            "FEATURES",
-            "MODEL_NAME",
-            "MODEL_PATH",
-        ]
-        best_score = {k: v for k, v in best_score.items() if k not in drop_cols}
-        score_data = {k.lower(): v for k, v in best_score.items()}

-        Score.upsert(
-            match_fields=["model_training_id"],
-            model_training_id=model_training.id,
-            **score_data,
-        )
+def evaluate(prediction: pd.DataFrame, target_type: str):
+    """
+    Function to evaluate model performance.
+
+    Args:
+    - prediction: the prediction dataframe containing TARGET and PRED columns, as well as predicted probabilities for each class for classification tasks
+    - target_type: classification or regression
+    """
+    score = {}
+    y_true = prediction["TARGET"]
+    y_pred = prediction["PRED"]

-            logger.info(f"Model training finished in {training_time:.2f} seconds")
+    if target_type == "regression":
+        # Main metrics
+        score["RMSE"] = root_mean_squared_error(y_true, y_pred)
+        score["MAE"] = mean_absolute_error(y_true, y_pred)
+        score["MAPE"] = mean_absolute_percentage_error(y_true, y_pred)
+        score["R2"] = r2_score(y_true, y_pred)

-        # find best model type
-        scores_tracking_path = f"{training_target_dir}/scores_tracking.csv"
-        scores_tracking = pd.read_csv(scores_tracking_path)
-        best_score_overall = scores_tracking.iloc[0, :]
-        best_model_name = best_score_overall["MODEL_NAME"]
+        # Robustness: avoid division by zero
+        std_target = y_true.std()
+        mean_target = y_true.mean()
+        median_target = y_true.median()

-        # Remove any .best or .keras files
-        for file_path in glob.glob(os.path.join(training_target_dir, "*.best")) + glob.glob(
-            os.path.join(training_target_dir, "*.keras")
-        ):
-            os.remove(file_path)
-        # Copy the best model in root training folder for this target
-        best_model_path = Path(
-            f"{training_target_dir}/{os.path.basename(best_score_overall['MODEL_PATH'])}"
-        ).resolve()
-        copy_any(
-            best_score_overall["MODEL_PATH"],
-            best_model_path,
-        )
+        # RMSE / STD
+        score["RMSE_STD_RATIO"] = (
+            float(100 * score["RMSE"] / std_target) if std_target else 1000
+        )

-        with open(f"{training_target_dir}/best_params.json", "r") as f:
-            best_model_params = json.load(f)[best_model_name]
+        # Median absolute deviation (MAD)
+        mam = (y_true - mean_target).abs().median()  # median abs deviation around the mean
+        mad = (y_true - median_target).abs().median()  # median abs deviation around the median
+        score["MAM"] = mam
+        score["MAD"] = mad
+        score["MAE_MAM_RATIO"] = (
+            float(100 * score["MAE"] / mam) if mam else 1000
+        )  # MAE normalized by deviation around the mean; less robust to outliers
+        score["MAE_MAD_RATIO"] = (
+            float(100 * score["MAE"] / mad) if mad else 1000
+        )  # MAE normalized by deviation around the median; more stable, less sensitive to outliers
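
The ratios computed above normalize absolute errors by the spread of the target so that scores are comparable across targets: `RMSE_STD_RATIO` is 100 × RMSE / σ(y) (a value near 100 means roughly "no better than predicting the mean"), while MAE is compared to the median absolute deviation around the mean (`MAM`) and around the median (`MAD`). A tiny worked example:

```python
import pandas as pd
from sklearn.metrics import mean_absolute_error, root_mean_squared_error

y_true = pd.Series([1.0, 2.0, 3.0, 4.0, 5.0])
y_pred = pd.Series([1.1, 2.2, 2.8, 4.3, 4.9])

rmse = root_mean_squared_error(y_true, y_pred)   # ~0.195
mae = mean_absolute_error(y_true, y_pred)        # 0.18

std = y_true.std()                               # sample std (ddof=1) ~1.581
mam = (y_true - y_true.mean()).abs().median()    # 1.0
mad = (y_true - y_true.median()).abs().median()  # 1.0

print(round(100 * rmse / std, 1))  # RMSE_STD_RATIO ~ 12.3
print(round(100 * mae / mam, 1))   # MAE_MAM_RATIO = 18.0
print(round(100 * mae / mad, 1))   # MAE_MAD_RATIO = 18.0
```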

-        # save model_selection results to db
-        model_selection = ModelSelection.get(model_selection.id)
-        model_selection.best_model_id = Model.find_by(
-            name=best_score_overall["MODEL_NAME"], type=_target_type
-        ).id
-        model_selection.best_model_params = best_model_params
-        model_selection.best_model_path = best_model_path
-        model_selection.save()
+    else:
+
+        labels = np.unique(y_true)
+        num_classes = labels.size
+        y_pred_proba = (
+            prediction[1] if num_classes == 2 else prediction.iloc[:, 2:].values
+        )
+        if num_classes > 2:
+            lb = LabelBinarizer(sparse_output=False)  # change to True for a sparse matrix
+            lb.fit(labels)
+            y_true_onehot = lb.transform(y_true)
+            y_pred_onehot = lb.transform(y_pred)

-        logger.info(f"Best model overall is : {best_score_overall}")
+        score["LOGLOSS"] = log_loss(y_true, y_pred_proba)
+        score["ACCURACY"] = accuracy_score(y_true, y_pred)
+        score["PRECISION"] = precision_score(
+            y_true,
+            y_pred,
+            average=("binary" if num_classes == 2 else "macro"),
+        )
+        score["RECALL"] = recall_score(
+            y_true,
+            y_pred,
+            average=("binary" if num_classes == 2 else "macro"),
+        )
+        score["F1"] = f1_score(
+            y_true,
+            y_pred,
+            average=("binary" if num_classes == 2 else "macro"),
+        )
+        score["ROC_AUC"] = float(roc_auc_score(y_true, y_pred_proba, multi_class="ovr"))
+        (
+            score["THRESHOLD"],
+            score["PRECISION_AT_THRESHOLD"],
+            score["RECALL_AT_THRESHOLD"],
+        ) = (
+            find_best_precision_threshold(prediction)
+            if num_classes == 2
+            else (None, None, None)
+        )
+    return score
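
`find_best_precision_threshold` is defined outside this hunk, so its exact objective is not visible here. A common way to derive such a `(threshold, precision, recall)` triple for the binary case is scikit-learn's `precision_recall_curve`; the rule below (maximize precision subject to a recall floor) is purely illustrative, not necessarily what the library does:

```python
import numpy as np
from sklearn.metrics import precision_recall_curve

def best_precision_threshold(y_true, y_proba, min_recall: float = 0.2):
    """Pick the threshold maximizing precision subject to a recall floor (illustrative)."""
    precision, recall, thresholds = precision_recall_curve(y_true, y_proba)
    # precision/recall have one more entry than thresholds; drop the final point
    precision, recall = precision[:-1], recall[:-1]
    mask = recall >= min_recall
    if not mask.any():
        return None, None, None
    candidates = np.flatnonzero(mask)
    best = candidates[np.argmax(precision[mask])]
    return float(thresholds[best]), float(precision[best]), float(recall[best])

y_true = np.array([0, 0, 1, 1, 1, 0, 1])
y_proba = np.array([0.1, 0.4, 0.35, 0.8, 0.65, 0.2, 0.9])
print(best_precision_threshold(y_true, y_proba))
```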


-def collect_error_logs(training_target_dir: int, storage_path: str):
+# utils
+def get_log_dir(training_target_dir: str, model_name="test_model"):
+    """Generates a structured log directory path for TensorBoard."""
+    timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M")
+    log_dir = (
+        Path(training_target_dir + "/tensorboard") / model_name / f"run_{timestamp}"
+    )
+    log_dir.mkdir(parents=True, exist_ok=True)  # create directories if they don't exist
+    return str(log_dir)
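
`get_log_dir` gives every model its own timestamped run directory, so pointing TensorBoard at the `tensorboard/` parent shows all runs side by side. A usage example (the path is invented):

```python
log_dir = get_log_dir("data/training/TARGET_1", model_name="lstm")
# e.g. data/training/TARGET_1/tensorboard/lstm/run_2025-06-17_16-52
# Inspect with: tensorboard --logdir data/training/TARGET_1/tensorboard
```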

-    output_error_file = f"{training_target_dir}/errors.log"

-    with open(output_error_file, "a") as outfile:
-        # Walk through the ray_results directory
-        for root, dirs, files in os.walk(storage_path):
-            # Check if 'error.txt' exists in the current directory
-            if "error.txt" in files:
-                error_file_path = os.path.join(root, "error.txt")
-                logger.info(f"Processing error file: {error_file_path}")
-                # Read and append the content of the error.txt file
-                with open(error_file_path, "r") as infile:
-                    outfile.write(f"\n\n=== Error from {error_file_path} ===\n")
-                    outfile.write(infile.read())
-    logger.info(f"All errors written to {output_error_file}")
+def print_scores(training_target_dir: str):
+    """
+    Monitor scores
+    """
+    scores_tracking = pd.read_csv(f"{training_target_dir}/scores_tracking.csv")
+    return scores_tracking


+# plots
 def plot_evaluation_for_classification(prediction: dict):
     """
     Args
@@ -1272,7 +1380,7 @@ def plot_confusion_matrix(y_true, y_pred):
     plt.show()


-# THRESHOLD
+# thresholds
 def find_max_f1_threshold(prediction):
     """
     Finds the threshold that maximizes the F1 score for a binary classification task.
@@ -1515,14 +1623,6 @@ def plot_threshold(prediction, threshold, precision, recall):
     return threshold


-def print_scores(training_target_dir: str):
-    """
-    Monitor scores
-    """
-    scores_tracking = pd.read_csv(f"{training_target_dir}/scores_tracking.csv")
-    return scores_tracking
-
-
 # OLD - to sort out
 def get_pred_distribution(training_target_dir: str, model_name="linear"):
     """