lecrapaud 0.18.7__py3-none-any.whl → 0.22.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lecrapaud/__init__.py +22 -1
- lecrapaud/{api.py → base.py} +331 -241
- lecrapaud/config.py +15 -3
- lecrapaud/db/alembic/versions/2025_08_25_1434-7ed9963e732f_add_best_score_to_model_selection.py +9 -4
- lecrapaud/db/alembic/versions/2025_08_28_1516-c36e9fee22b9_add_avg_precision_to_score.py +34 -0
- lecrapaud/db/alembic/versions/2025_08_28_1622-8b11c1ba982e_change_name_column.py +44 -0
- lecrapaud/db/alembic/versions/2025_10_25_0635-07e303521594_add_unique_constraint_to_score.py +39 -0
- lecrapaud/db/alembic/versions/2025_10_26_1727-033e0f7eca4f_merge_score_and_model_trainings_into_.py +264 -0
- lecrapaud/db/alembic/versions/2025_10_28_2006-0a8fb7826e9b_add_number_of_targets_and_remove_other_.py +75 -0
- lecrapaud/db/models/__init__.py +2 -4
- lecrapaud/db/models/base.py +122 -67
- lecrapaud/db/models/experiment.py +196 -183
- lecrapaud/db/models/feature_selection.py +0 -3
- lecrapaud/db/models/feature_selection_rank.py +0 -18
- lecrapaud/db/models/model_selection.py +2 -2
- lecrapaud/db/models/{score.py → model_selection_score.py} +30 -12
- lecrapaud/db/session.py +33 -4
- lecrapaud/experiment.py +44 -17
- lecrapaud/feature_engineering.py +45 -674
- lecrapaud/feature_preprocessing.py +1202 -0
- lecrapaud/feature_selection.py +145 -332
- lecrapaud/integrations/sentry_integration.py +46 -0
- lecrapaud/misc/tabpfn_tests.ipynb +2 -2
- lecrapaud/mixins.py +247 -0
- lecrapaud/model_preprocessing.py +295 -0
- lecrapaud/model_selection.py +725 -249
- lecrapaud/pipeline.py +548 -0
- lecrapaud/search_space.py +38 -1
- lecrapaud/utils.py +36 -3
- lecrapaud-0.22.6.dist-info/METADATA +423 -0
- lecrapaud-0.22.6.dist-info/RECORD +51 -0
- {lecrapaud-0.18.7.dist-info → lecrapaud-0.22.6.dist-info}/WHEEL +1 -1
- {lecrapaud-0.18.7.dist-info → lecrapaud-0.22.6.dist-info/licenses}/LICENSE +1 -1
- lecrapaud/db/models/model_training.py +0 -64
- lecrapaud/jobs/__init__.py +0 -13
- lecrapaud/jobs/config.py +0 -17
- lecrapaud/jobs/scheduler.py +0 -30
- lecrapaud/jobs/tasks.py +0 -17
- lecrapaud-0.18.7.dist-info/METADATA +0 -248
- lecrapaud-0.18.7.dist-info/RECORD +0 -46
lecrapaud/model_selection.py
CHANGED
@@ -14,10 +14,8 @@ import pickle
 from pydantic import BaseModel
 import ast
 
-os.environ["COVERAGE_FILE"] = str(Path(".coverage").resolve())
-
 # ML models
-from sklearn.model_selection import TimeSeriesSplit
+from sklearn.model_selection import TimeSeriesSplit, StratifiedKFold, KFold
 from sklearn.calibration import CalibratedClassifierCV
 from sklearn.metrics import (
     mean_absolute_percentage_error,
@@ -57,28 +55,45 @@ from tensorboardX import SummaryWriter
 
 # Optimization
 import ray
-from ray.tune import Tuner, TuneConfig, with_parameters
-from ray.train import RunConfig
+from ray.tune import Tuner, TuneConfig, with_parameters, RunConfig
 from ray.tune.search.hyperopt import HyperOptSearch
 from ray.tune.search.bayesopt import BayesOptSearch
 from ray.tune.logger import TBXLoggerCallback
 from ray.tune.schedulers import ASHAScheduler
 from ray.air import session
 
+# HyperOpt standalone
+from hyperopt import hp, fmin, tpe, Trials, STATUS_OK, space_eval
+
 # Internal library
 from lecrapaud.search_space import all_models
 from lecrapaud.directories import clean_directory
 from lecrapaud.utils import copy_any, contains_best, logger, serialize_for_json
-from lecrapaud.config import PYTHON_ENV
+from lecrapaud.config import PYTHON_ENV, LECRAPAUD_OPTIMIZATION_BACKEND
 from lecrapaud.feature_selection import load_train_data
 from lecrapaud.db import (
     Model,
     ModelSelection,
-
-    Score,
+    ModelSelectionScore,
     Target,
     Experiment,
 )
+from lecrapaud.mixins import LeCrapaudEstimatorMixin
+
+os.environ["COVERAGE_FILE"] = str(Path(".coverage").resolve())
+
+# Suppress XGBoost and LightGBM logging
+import logging
+
+logging.getLogger("lightgbm").setLevel(logging.ERROR)
+logging.getLogger("xgboost").setLevel(logging.ERROR)
+
+# Set global verbosity for XGBoost
+xgb.set_config(verbosity=0)
+
+# Suppress warnings
+warnings.filterwarnings("ignore", category=UserWarning)
+warnings.filterwarnings("ignore", category=FutureWarning)
 
 # Reproducible result
 keras.utils.set_random_seed(42)
@@ -110,7 +125,64 @@ def test_hardware():
 warnings.filterwarnings("ignore", category=UserWarning, module="pydantic")
 
 
-class ModelEngine:
+class CatBoostWrapper:
+    """
+    Transparent proxy for a CatBoost model that accepts arbitrary keyword arguments
+    as direct attributes, while forwarding all method calls and properties.
+    """
+
+    __slots__ = ("_model", "_extra_attrs")
+
+    def __init__(self, model, **kwargs):
+        object.__setattr__(self, "_model", model)
+        object.__setattr__(self, "_extra_attrs", {})
+        # Register kwargs as direct attributes
+        for key, value in kwargs.items():
+            setattr(self, key, value)
+
+    # ---- Transparent access ----
+    def __getattr__(self, name):
+        """Forward attribute access to the underlying model if not found."""
+        model = object.__getattribute__(self, "_model")
+        if hasattr(model, name):
+            return getattr(model, name)
+        extra_attrs = object.__getattribute__(self, "_extra_attrs")
+        if name in extra_attrs:
+            return extra_attrs[name]
+        raise AttributeError(f"{type(self).__name__!r} has no attribute {name!r}")
+
+    def __setattr__(self, name, value):
+        """Set to wrapper or forward to model when appropriate."""
+        if name in CatBoostWrapper.__slots__:
+            object.__setattr__(self, name, value)
+            return
+
+        model = object.__getattribute__(self, "_model")
+        if hasattr(model, name):
+            setattr(model, name, value)
+        else:
+            extra_attrs = object.__getattribute__(self, "_extra_attrs")
+            extra_attrs[name] = value
+
+    def __dir__(self):
+        """Merge dir() from wrapper, model, and custom attributes."""
+        base = set(super().__dir__())
+        model_attrs = set(dir(object.__getattribute__(self, "_model")))
+        extra_attrs = set(object.__getattribute__(self, "_extra_attrs").keys())
+        return sorted(base | model_attrs | extra_attrs)
+
+    def __repr__(self):
+        model = object.__getattribute__(self, "_model")
+        extras = object.__getattribute__(self, "_extra_attrs")
+        return f"CatBoostWrapper(model={model.__class__.__name__}, extras={extras})"
+
+    @property
+    def model(self):
+        """Access the raw CatBoost model."""
+        return object.__getattribute__(self, "_model")
+
+
+class BaseModel:
 
     def __init__(
         self,
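The hunk above introduces `CatBoostWrapper`, a transparent proxy. A minimal, self-contained sketch of the same routing logic follows, using a plain stand-in object instead of a real CatBoost model (none of the names below are lecrapaud API; this only illustrates how reads and writes are dispatched):

```python
# Transparent-proxy pattern as used by CatBoostWrapper, with a fake model so it
# runs without catboost installed.
class FakeModel:
    def __init__(self):
        self.iterations = 100

    def predict(self, x):
        return [0] * len(x)


class Proxy:
    __slots__ = ("_model", "_extra_attrs")

    def __init__(self, model, **kwargs):
        object.__setattr__(self, "_model", model)
        object.__setattr__(self, "_extra_attrs", {})
        for key, value in kwargs.items():
            setattr(self, key, value)

    def __getattr__(self, name):
        model = object.__getattribute__(self, "_model")
        if hasattr(model, name):
            return getattr(model, name)      # forwarded to the wrapped model
        extras = object.__getattribute__(self, "_extra_attrs")
        if name in extras:
            return extras[name]              # wrapper-only metadata
        raise AttributeError(name)

    def __setattr__(self, name, value):
        model = object.__getattribute__(self, "_model")
        if hasattr(model, name):
            setattr(model, name, value)      # existing model attrs stay on the model
        else:
            object.__getattribute__(self, "_extra_attrs")[name] = value


p = Proxy(FakeModel(), model_name="catboost", target_type="classification")
assert p.predict([1, 2]) == [0, 0]   # method call forwarded
assert p.model_name == "catboost"    # metadata kept on the wrapper
p.iterations = 500                   # routed to the underlying model
assert p._model.iterations == 500
```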
@@ -157,8 +229,10 @@ class ModelEngine:
     def fit(self, *args):
         if self.recurrent:
             fit = self.fit_recurrent
-        elif (self.
+        elif (self.model_name == "lgb") or (self.model_name == "xgb"):
             fit = self.fit_boosting
+        elif self.model_name == "catboost":
+            fit = self.fit_catboost
         else:
             fit = self.fit_sklearn
         model = fit(*args)
@@ -201,17 +275,113 @@ class ModelEngine:
 
         return model
 
-    def
+    def fit_catboost(self, x_train, y_train, x_val, y_val, params):
         """
-
+        Train CatBoost models with native early stopping and log metrics to TensorBoard.
+        Also supports plotting of the primary eval metric if self.plot is True.
         """
-
+        # Prepare constructor parameters
+        ctor_params = dict(params) if params else {}
+        early_stopping_rounds = ctor_params.pop("early_stopping_rounds", None)
+        # Alias support: num_boost_round -> iterations
+        num_boost_round = ctor_params.pop("num_boost_round", None)
+        if num_boost_round is not None and "iterations" not in ctor_params:
+            ctor_params["iterations"] = num_boost_round
+
+        # Determine classification/regression setup
+        labels = np.unique(y_train)
+        num_class = (
+            labels.size
+            if self.target_type == "classification" and labels.size > 2
+            else 1
+        )
+
+        if self.target_type == "regression":
+            ctor_params.setdefault("loss_function", "RMSE")
+            eval_metric = ctor_params.get("eval_metric", "RMSE")
+        else:
+            if num_class <= 2:
+                ctor_params.setdefault("loss_function", "Logloss")
+                eval_metric = ctor_params.get("eval_metric", "Logloss")
+            else:
+                ctor_params.setdefault("loss_function", "MultiClass")
+                eval_metric = ctor_params.get("eval_metric", "MultiClass")
+        ctor_params.setdefault("eval_metric", eval_metric)
+
+        # Instantiate CatBoost model from provided constructor
+        model = self.create_model(**ctor_params, allow_writing_files=False)
+
+        # Train with eval_set and early stopping
+        logger.info(f"Fitting the model {self.model_name}...")
+        logger.info(f"x_train shape: {x_train.shape}, x_val shape: {x_val.shape}")
+        logger.info(f"y_train shape: {y_train.shape}, y_val shape: {y_val.shape}")
+
+        model.fit(
+            x_train,
+            y_train,
+            eval_set=[(x_val, y_val)],
+            use_best_model=True,
+            early_stopping_rounds=early_stopping_rounds,
+            verbose=False,
+        )
 
-        #
-
-
-
+        # Retrieve evaluation results
+        evals_result = model.get_evals_result()
+        # CatBoost commonly uses 'learn' and 'validation' (or 'validation_0')
+        learn_key = "learn"
+        val_key = None
+        for k in evals_result.keys():
+            if k != learn_key:
+                val_key = k
+                break
+
+        # Ensure eval_metric exists; otherwise fallback to first available metric
+        if eval_metric not in evals_result.get(learn_key, {}):
+            if evals_result.get(learn_key):
+                eval_metric = next(iter(evals_result[learn_key].keys()))
+
+        # TensorBoard logging
+        writer = SummaryWriter(self.log_dir)
+        try:
+            # learn_scores = evals_result.get(learn_key, {}).get(eval_metric, [])
+            val_scores = (
+                evals_result.get(val_key, {}).get(eval_metric, []) if val_key else []
+            )
+            # for i, v in enumerate(learn_scores):
+            #     writer.add_scalar(f"CatBoost/train/{eval_metric}", v, i)
+            for i, v in enumerate(val_scores):
+                writer.add_scalar(f"CatBoost/{eval_metric}", v, i)
+        finally:
+            writer.close()
 
+        # Optional plotting of training progress
+        if self.plot and eval_metric and learn_key in evals_result and val_key:
+            logs = {
+                "train": evals_result[learn_key].get(eval_metric, []),
+                "val": evals_result[val_key].get(eval_metric, []),
+            }
+            plot_training_progress(
+                logs=logs,
+                model_name=self.model_name,
+                target_number=self.target_number,
+                title_suffix=f"Training Progress - {eval_metric}",
+            )
+
+        # Attach metadata for consistency with sklearn path
+        model_wrapped = CatBoostWrapper(
+            model, model_name=self.model_name, target_type=self.target_type
+        )
+        logger.info(
+            f"Successfully created a {model_wrapped.model_name} at {datetime.now()}"
+        )
+
+        self._model = model_wrapped
+        return model_wrapped
+
+    def fit_boosting(self, x_train, y_train, x_val, y_val, params):
+        """
+        This is using lightGBM or XGboost C++ librairies
+        """
         # Create a TensorBoardX writer
         writer = SummaryWriter(self.log_dir)
         evals_result = {}
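For readers unfamiliar with CatBoost's native early-stopping flow that `fit_catboost` wraps, here is a minimal standalone sketch on synthetic data (requires `catboost` and `numpy`; not lecrapaud code):

```python
# CatBoost early stopping + evals-result retrieval, mirroring fit_catboost.
import numpy as np
from catboost import CatBoostClassifier

rng = np.random.default_rng(42)
X_train, X_val = rng.normal(size=(400, 5)), rng.normal(size=(100, 5))
y_train = (X_train[:, 0] > 0).astype(int)
y_val = (X_val[:, 0] > 0).astype(int)

model = CatBoostClassifier(
    iterations=500,             # fit_catboost maps num_boost_round -> iterations
    loss_function="Logloss",
    eval_metric="Logloss",
    allow_writing_files=False,  # same flag the diff passes to avoid side files
)
model.fit(
    X_train,
    y_train,
    eval_set=[(X_val, y_val)],
    use_best_model=True,
    early_stopping_rounds=20,
    verbose=False,
)

# get_evals_result() returns {'learn': {...}, 'validation': {...}}; the diff
# scans for the first non-'learn' key to find the validation curve.
evals = model.get_evals_result()
val_key = next(k for k in evals if k != "learn")
print(len(evals[val_key]["Logloss"]), "boosting rounds recorded")
```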
@@ -223,11 +393,13 @@ class ModelEngine:
             if self.target_type == "classification" and labels.size > 2
             else 1
         )
-        logger.info("Fitting the model...")
+        logger.info(f"Fitting the model {self.model_name}...")
         logger.info(f"x_train shape: {x_train.shape}, x_val shape: {x_val.shape}")
         logger.info(f"y_train shape: {y_train.shape}, y_val shape: {y_val.shape}")
 
-        if
+        if self.model_name == "lgb":
+            train_data = lgb.Dataset(x_train, label=y_train)
+            val_data = lgb.Dataset(x_val, label=y_val)
 
             def tensorboard_callback(env):
                 for i, metric in enumerate(env.evaluation_result_list):
@@ -252,18 +424,25 @@ class ModelEngine:
                     "objective": loss,
                     "metric": eval_metric,
                     "num_class": num_class,
+                    "verbose": -1,
+                    "verbose_eval": False,
                 },
                 num_boost_round=params["num_boost_round"],
                 train_set=train_data,
                 valid_sets=[train_data, val_data],
                 valid_names=["train", "val"],
                 callbacks=[
-                    lgb.early_stopping(
+                    lgb.early_stopping(
+                        stopping_rounds=params["early_stopping_rounds"], verbose=False
+                    ),
                     lgb.record_evaluation(evals_result),
                     tensorboard_callback,
+                    lgb.log_evaluation(period=0),  # Disable evaluation logging
                 ],
             )
         else:
+            train_data = xgb.DMatrix(x_train, label=y_train)
+            val_data = xgb.DMatrix(x_val, label=y_val)
 
             class TensorBoardCallback(xgb.callback.TrainingCallback):
 
@@ -300,6 +479,7 @@ class ModelEngine:
                 if self.target_type == "regression"
                 else ("logloss" if num_class <= 2 else "mlogloss")
             )
+            # XGBoost verbosity already set globally
             model = xgb.train(
                 params={
                     **params["model_params"],
@@ -314,11 +494,11 @@ class ModelEngine:
                     xgb.callback.EarlyStopping(
                         rounds=params["early_stopping_rounds"], save_best=True
                     ),
-
+                    # Removed EvaluationMonitor to suppress logs
                     tensorboard_callback,
                 ],
                 evals_result=evals_result,  # Record evaluation result
-                verbose_eval=
+                verbose_eval=False,  # Disable evaluation logging
             )
 
         model.model_name = self.create_model
@@ -583,6 +763,171 @@ class ModelEngine:
         )
 
 
+def trainable_cv(
+    params,
+    x_train,
+    y_train,
+    x_val,
+    y_val,
+    model_name,
+    target_type,
+    experiment_name,
+    target_number,
+    create_model,
+    n_splits=3,
+    plot=False,
+    log_dir=None,
+    target_clf_thresholds: dict = None,
+    time_series=True,
+    recurrent=False,
+):
+    """Cross-validation version of trainable for hyperopt.
+
+    Uses TimeSeriesSplit for temporal data or StratifiedKFold/KFold for i.i.d. data.
+    Returns pooled metrics (single logloss/RMSE calculated on all concatenated predictions).
+    """
+    # Combine train and validation data for cross-validation
+    if recurrent:
+        x_train_val = np.concatenate([x_train, x_val], axis=0)
+        y_train_val = np.concatenate([y_train, y_val], axis=0)
+    else:
+        x_train_val = pd.concat([x_train, x_val], axis=0)
+        y_train_val = pd.concat([y_train, y_val], axis=0)
+        # Store original index for later use if needed
+        original_index = x_train_val.index.copy()
+        # Reset index for proper iloc indexing with CV splits
+        x_train_val = x_train_val.reset_index(drop=True)
+        y_train_val = y_train_val.reset_index(drop=True)
+
+    # Choose appropriate cross-validation splitter
+    if time_series:
+        # Time series split for temporal data
+        n_samples = len(x_train_val)
+        test_size = int(n_samples / (n_splits + 1))  # Ensure reasonable test size
+        cv_splitter = TimeSeriesSplit(n_splits=n_splits, test_size=test_size)
+    else:
+        # Stratified or regular K-fold for i.i.d. data
+        if target_type == "classification":
+            cv_splitter = StratifiedKFold(
+                n_splits=n_splits, shuffle=True, random_state=42
+            )
+        else:
+            cv_splitter = KFold(n_splits=n_splits, shuffle=True, random_state=42)
+
+    # Store all predictions and true values for pooled metrics
+    all_predictions = []
+    all_y_true = []
+    fold_times = []
+
+    # Get splits based on the CV strategy
+    if time_series or target_type == "regression":
+        splits = cv_splitter.split(x_train_val)
+    else:
+        # For stratified split, we need to pass y
+        if recurrent:
+            # Extract the target from the 2D array (first column is target)
+            y_for_split = y_train_val[:, 0]
+        else:
+            y_for_split = y_train_val
+        splits = cv_splitter.split(x_train_val, y_for_split)
+
+    for fold_idx, (train_idx, val_idx) in enumerate(splits):
+        # Extract fold data
+        if recurrent:
+            x_fold_train = x_train_val[train_idx]
+            y_fold_train = y_train_val[train_idx]
+            x_fold_val = x_train_val[val_idx]
+            y_fold_val = y_train_val[val_idx]
+        else:
+            x_fold_train = x_train_val.iloc[train_idx]
+            y_fold_train = y_train_val.iloc[train_idx]
+            x_fold_val = x_train_val.iloc[val_idx]
+            y_fold_val = y_train_val.iloc[val_idx]
+
+        # Train model for this fold
+        model = BaseModel(
+            model_name=model_name,
+            target_type=target_type,
+            target_number=target_number,
+            create_model=create_model,
+            plot=False,  # Disable individual fold plots
+            log_dir=log_dir,
+        )
+
+        if recurrent:
+            timesteps = params["timesteps"]
+            x_fold_train = x_fold_train[:, -timesteps:, :]
+            x_fold_val = x_fold_val[:, -timesteps:, :]
+
+        # Fit model
+        model.fit(x_fold_train, y_fold_train, x_fold_val, y_fold_val, params)
+
+        # Get predictions
+        y_pred = model.predict(x_fold_val)
+
+        # Handle recurrent model indexing
+        if recurrent:
+            y_fold_val = pd.DataFrame(
+                y_fold_val, columns=["TARGET", "index"]
+            ).set_index("index")
+            y_pred.index = y_fold_val.index
+
+        # Store predictions and true values
+        all_predictions.append(y_pred)
+        all_y_true.append(y_fold_val)
+
+    # Concatenate all fold predictions
+    if target_type == "classification":
+        # For classification, we need to handle probability columns
+        all_pred_df = pd.concat(all_predictions, axis=0)
+        all_y_series = pd.concat(all_y_true, axis=0)
+        # Ensure we have a DataFrame with TARGET column
+        if isinstance(all_y_series, pd.Series):
+            all_y_df = pd.DataFrame({"TARGET": all_y_series})
+        else:
+            all_y_df = all_y_series
+    else:
+        # For regression, just concatenate the predictions
+        all_pred_series = pd.concat(all_predictions, axis=0)
+        all_y_series = pd.concat(all_y_true, axis=0)
+        all_pred_df = pd.DataFrame({"PRED": all_pred_series})
+        all_y_df = pd.DataFrame({"TARGET": all_y_series})
+
+    # Create combined prediction DataFrame
+    prediction = pd.concat([all_y_df[["TARGET"]], all_pred_df], axis=1)
+
+    # Calculate pooled metrics
+    score = {
+        "DATE": datetime.now(),
+        "MODEL_NAME": model_name,
+        "EVAL_DATA_STD": prediction["TARGET"].std(),
+    }
+
+    # Unscale if needed (for regression with scaling)
+    if (
+        model.need_scaling
+        and target_type == "regression"
+        and model.scaler_y is not None
+    ):
+        prediction.loc[:, "TARGET"] = model.scaler_y.inverse_transform(
+            prediction[["TARGET"]].values
+        )
+        prediction.loc[:, "PRED"] = model.scaler_y.inverse_transform(
+            prediction[["PRED"]].values
+        )
+
+    # Evaluate with pooled predictions
+    score.update(evaluate(prediction, target_type, target_clf_thresholds))
+
+    metric = "RMSE" if target_type == "regression" else "LOGLOSS"
+    logger.info(f"{model_name} CV pooled {metric}: {score[metric]:.4f}")
+
+    # Report to Ray if in Ray context
+    if session.get_session():
+        session.report(metrics=score)
+    return score
+
+
 def trainable(
     params,
     x_train,
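The `trainable_cv` docstring above stresses *pooled* metrics: all out-of-fold predictions are concatenated and a single logloss/RMSE is computed, rather than averaging per-fold scores. A small sklearn-only sketch of the distinction (toy data, not lecrapaud code):

```python
# Pooled vs. averaged cross-validation metrics.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold

X, y = make_classification(n_samples=300, random_state=42)
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

fold_losses, oof_proba, oof_true = [], [], []
for train_idx, val_idx in cv.split(X, y):
    clf = LogisticRegression(max_iter=1000).fit(X[train_idx], y[train_idx])
    proba = clf.predict_proba(X[val_idx])
    fold_losses.append(log_loss(y[val_idx], proba))
    oof_proba.append(proba)
    oof_true.append(y[val_idx])

avg_loss = float(np.mean(fold_losses))              # per-fold average
pooled_loss = log_loss(np.concatenate(oof_true),    # one metric on all
                       np.vstack(oof_proba))        # concatenated predictions
print(f"averaged: {avg_loss:.4f}  pooled: {pooled_loss:.4f}")
```

Pooling weights every sample equally regardless of fold size, which is why the docstring calls it a "single logloss/RMSE calculated on all concatenated predictions".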
@@ -594,14 +939,13 @@ def trainable(
     experiment_name,
     target_number,
     create_model,
-    type_name="hyperopts",
     plot=False,
     log_dir=None,
     target_clf_thresholds: dict = None,
 ):
     """Standalone version of train_model that doesn't depend on self"""
     # Create model engine
-    model =
+    model = BaseModel(
         model_name=model_name,
         target_type=target_type,
         target_number=target_number,
@@ -620,9 +964,7 @@ def trainable(
         x_val = x_val[:, -timesteps:, :]
 
     # Compile and fit model on train set
-    start = time.time()
     model.fit(x_train, y_train, x_val, y_val, params)
-    stop = time.time()
 
     # Prediction on val set
     y_pred = model.predict(x_val)
@@ -652,8 +994,6 @@ def trainable(
     score = {
         "DATE": datetime.now(),
         "MODEL_NAME": model.model_name,
-        "TYPE": type_name,
-        "TRAINING_TIME": stop - start,
         "EVAL_DATA_STD": prediction["TARGET"].std(),
     }
 
@@ -662,77 +1002,107 @@ def trainable(
     metric = "RMSE" if target_type == "regression" else "LOGLOSS"
     logger.info(f"{model.model_name} scores on validation set: {score[metric]:.4f}")
 
-    if
+    # Report to Ray if in Ray context
+    if session.get_session():
         session.report(metrics=score)
         return score
 
     return score, model, prediction
 
 
-class ModelSelectionEngine:
+class ModelSelector(LeCrapaudEstimatorMixin):
 
     def __init__(
         self,
-
-
-        target_number,
-        target_clf,
-        experiment,
-        models_idx,
-        time_series,
-        date_column,
-        group_column,
-        target_clf_thresholds,
+        experiment: Experiment = None,
+        target_number: int = None,
         **kwargs,
     ):
-
-
+        # The mixin will automatically set all experiment.context parameters as attributes
+        super().__init__(experiment=experiment, target_number=target_number, **kwargs)
+
+        # Set defaults for required parameters if not provided
+        if not hasattr(self, "target_clf"):
+            self.target_clf = []
+        if not hasattr(self, "models_idx"):
+            self.models_idx = []
+        if not hasattr(self, "time_series"):
+            self.time_series = False
+        if not hasattr(self, "date_column"):
+            self.date_column = None
+        if not hasattr(self, "group_column"):
+            self.group_column = None
+        if not hasattr(self, "target_clf_thresholds"):
+            self.target_clf_thresholds = {}
         self.target_number = target_number
-        self.experiment = experiment
-        self.target_clf = target_clf
-        self.models_idx = models_idx
-        self.time_series = time_series
-        self.date_column = date_column
-        self.group_column = group_column
-        self.target_clf_thresholds = (
-            target_clf_thresholds[target_number]
-            if target_number in target_clf_thresholds.keys()
-            else None
-        )
 
-
-
-
-
-
-
-
-
-
-
-
-
-
+        # Handle target_clf_thresholds for specific target
+        # Handle both string and integer keys for backward compatibility
+        if self.target_number and self.target_clf_thresholds:
+            # Try both integer and string versions of the target number
+            if self.target_number in self.target_clf_thresholds:
+                self.target_clf_thresholds = self.target_clf_thresholds[
+                    self.target_number
+                ]
+            elif str(self.target_number) in self.target_clf_thresholds:
+                self.target_clf_thresholds = self.target_clf_thresholds[
+                    str(self.target_number)
+                ]
+
+        # Derived attributes
+        if self.target_number is not None:
+            self.target_type = (
+                "classification"
+                if self.target_number in self.target_clf
+                else "regression"
+            )
+            self.metric = "RMSE" if self.target_type == "regression" else "LOGLOSS"
+
+        # Set paths and features if experiment is available
+        if self.experiment:
+            self.experiment_dir = self.experiment.path
+            self.experiment_id = self.experiment.id
+            self.data_dir = f"{self.experiment_dir}/data"
+            self.preprocessing_dir = f"{self.experiment_dir}/preprocessing"
+
+            if self.target_number is not None:
+                self.target_dir = f"{self.experiment_dir}/TARGET_{self.target_number}"
+                self.features = self.experiment.get_features(self.target_number)
+
+            self.all_features = self.experiment.get_all_features(
+                date_column=self.date_column, group_column=self.group_column
+            )
 
     # Main training function
-    def
-        self,
-        experiment_name,
-        perform_hyperopt=True,
-        number_of_trials=20,
-        perform_crossval=False,
-        plot=True,
-        clean_dir=False,  # TODO: This has been unused because now feature_selection is in the target directory
-        preserve_model=True,
-        best_params=None,
-    ):
+    def fit(self, X, y=None, reshaped_data=None, best_params=None):
         """
-
-
+        Fit the model selector (train and select best model).
+
+        Args:
+            X: Either a DataFrame or a dict with train/val/test data
+            y: Target values (ignored, uses TARGET columns)
+            reshaped_data: Optional reshaped data for recurrent models
+            best_params: Optional pre-defined best parameters
+
+        Returns:
+            self: Returns self for chaining
         """
-
-
-
+        # Handle both DataFrame and dict inputs
+        if isinstance(X, dict):
+            self.data = X
+            self.reshaped_data = reshaped_data
+        else:
+            # For simple DataFrame input, we expect it to be just training data
+            # This is less common for ModelSelector which typically needs train/val/test
+            raise ValueError("ModelSelector requires a dict with train/val/test data")
+        # Get all parameters from experiment context
+        context = self.experiment.context
+        self.experiment_name = context.get("experiment_name", "")
+        self.plot = context.get("plot", True)
+        self.number_of_trials = context.get("number_of_trials", 20)
+        self.perform_crossval = context.get("perform_crossval", False)
+        self.preserve_model = context.get("preserve_model", True)
+        self.perform_hyperopt = context.get("perform_hyperopt", True)
 
         if self.experiment_id is None:
             raise ValueError("Please provide a experiment.")
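The new `ModelSelector.__init__` relies on `LeCrapaudEstimatorMixin` to copy `experiment.context` entries onto the instance before the `hasattr`-based defaults run. The mixin's internals are not part of this diff, so the following is only a plausible sketch of that pattern (the `ContextMixin` and `FakeExperiment` names are hypothetical, not library API):

```python
# Sketch of a context-to-attributes mixin followed by hasattr defaults,
# assuming the mixin simply sets each context key as an instance attribute.
class ContextMixin:
    def __init__(self, experiment=None, **kwargs):
        self.experiment = experiment
        context = getattr(experiment, "context", {}) or {}
        # context values first, explicit kwargs override them
        for key, value in {**context, **kwargs}.items():
            setattr(self, key, value)


class Selector(ContextMixin):
    def __init__(self, experiment=None, **kwargs):
        super().__init__(experiment=experiment, **kwargs)
        # same defaulting style as ModelSelector.__init__
        if not hasattr(self, "time_series"):
            self.time_series = False
        if not hasattr(self, "models_idx"):
            self.models_idx = []


class FakeExperiment:
    context = {"time_series": True, "number_of_trials": 5}


s = Selector(experiment=FakeExperiment())
assert s.time_series is True   # came from experiment.context
assert s.models_idx == []      # fell back to the hasattr default
```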
@@ -782,12 +1152,11 @@ class ModelSelectionEngine:
         # create model selection in db
         target = Target.find_by(name=f"TARGET_{self.target_number}")
         model_selection = ModelSelection.upsert(
-            match_fields=["target_id", "experiment_id"],
             target_id=target.id,
             experiment_id=self.experiment_id,
         )
 
-        #
+        # STEP 1 : TRAINING MODELS
         for i in self.models_idx:
             config = all_models[i]
             recurrent = config["recurrent"]
@@ -800,24 +1169,16 @@ class ModelSelectionEngine:
             self.results_dir = f"{self.target_dir}/{model_name}"
             if not os.path.exists(f"{self.results_dir}"):
                 os.makedirs(f"{self.results_dir}")
-            elif preserve_model and contains_best(self.results_dir):
+            elif self.preserve_model and contains_best(self.results_dir):
                 continue
-            elif perform_hyperopt:
+            elif self.perform_hyperopt:
                 clean_directory(self.results_dir)
 
-            logger.info(
-
-                match_fields=["name", "type"],
-                name=model_name,
-                type=self.target_type,
-            )
-            model_training = ModelTraining.upsert(
-                match_fields=["model_id", "model_selection_id"],
-                model_id=model.id,
-                model_selection_id=model_selection.id,
+            logger.info(
+                f"{self.experiment_name} - Training a {model_name} at {datetime.now()} for TARGET_{self.target_number}"
             )
 
-            #
+            # Getting data
             if recurrent:
                 # Clear cluster from previous Keras session graphs.
                 K.clear_session()
@@ -827,7 +1188,7 @@ class ModelSelectionEngine:
                     for i, e in enumerate(self.all_features)
                     if e in set(self.features)
                 ]
-                # TODO: Verify that features_idx are the right one, because scaling can re-arrange columns...
+                # TODO: Verify that features_idx are the right one, because scaling can re-arrange columns (should be good)...
                 x_train = x_train_reshaped[:, :, features_idx]
                 y_train = y_train_reshaped[:, [self.target_number, 0]]
                 x_val = x_val_reshaped[:, :, features_idx]
@@ -857,8 +1218,9 @@ class ModelSelectionEngine:
                 y_test = test[f"TARGET_{self.target_number}"].rename("TARGET")
 
             log_dir = get_log_dir(self.target_dir, model_name)
-
-
+
+            # Instantiate model
+            model = BaseModel(
                 target_number=self.target_number,
                 model_name=model_name,
                 search_params=config["search_params"],
@@ -868,9 +1230,9 @@ class ModelSelectionEngine:
                 log_dir=log_dir,
             )
 
-            start = time.time()
             # Tuning hyperparameters
-
+            start = time.time()
+            if self.perform_hyperopt:
                 model_best_params = self.hyperoptimize(
                     x_train, y_train, x_val, y_val, model
                 )
@@ -886,7 +1248,7 @@ class ModelSelectionEngine:
                     f"Could not find {model_name} in current data. Try to run an hyperoptimization by setting `perform_hyperopt` to true, or pass `best_params`"
                 )
 
-            #
+            # Save best params
             best_params_file = f"{self.target_dir}/best_params.json"
             try:
                 with open(best_params_file, "r") as f:
@@ -898,114 +1260,25 @@ class ModelSelectionEngine:
             with open(best_params_file, "w") as f:
                 json.dump(json_dict, f, indent=4)
 
-            #
-            if
-
-
-
-
-
-
-
-
-
-
-                for i, (train_index, val_index) in enumerate(tscv.split(x_train_val)):
-                    self.type_name = f"crossval_fold_{i}"
-
-                    if self.time_series:
-                        date_series = pd.concat(
-                            [
-                                train[self.date_column],
-                                val[self.date_column],
-                                test[self.date_column],
-                            ],
-                            axis=0,
-                        ).reset_index(drop=True)
-
-                        date_series = date_series.map(pd.Timestamp.fromordinal)
-
-                        # Now you can use the actual train/val indices to extract ranges
-                        train_start = date_series.iloc[train_index[0]]
-                        train_end = date_series.iloc[train_index[-1]]
-                        val_start = date_series.iloc[val_index[0]]
-                        val_end = date_series.iloc[val_index[-1]]
-
-                        logger.info(
-                            f"[Fold {i}] Train: {len(train_index)} samples from {train_start.date()} to {train_end.date()} | "
-                            f"Validation: {len(val_index)} samples from {val_start.date()} to {val_end.date()}"
-                        )
-                    else:
-                        logger.info(
-                            f"[Fold {i}] Train: {len(train_index)} samples | Validation: {len(val_index)} samples"
-                        )
-
-                    # Train the model and get the score
-                    if recurrent:
-                        cv_score, _, _ = self.train_model(
-                            params=model_best_params,
-                            x_train=x_train_val[train_index],
-                            y_train=y_train_val[train_index],
-                            x_val=x_train_val[val_index],
-                            y_val=y_train_val[val_index],
-                            model=model,
-                        )
-                    else:
-                        cv_score, _, _ = self.train_model(
-                            params=model_best_params,
-                            x_train=x_train_val.iloc[train_index],
-                            y_train=y_train_val.iloc[train_index],
-                            x_val=x_train_val.iloc[val_index],
-                            y_val=y_train_val.iloc[val_index],
-                            model=model,
-                        )
-
-                    # Append score to the list
-                    cv_scores.append(cv_score)
-
-                # Calculate mean of all numerical metrics across all cross-validation folds
-                cv_scores_df = pd.DataFrame(cv_scores)
-                # Get mean of all numeric columns
-                cv_means = cv_scores_df.mean(numeric_only=True).to_dict()
+            # Always evaluate on test set (no cross-validation here)
+            # The hyperopt already did CV if needed to find best params
+            best_score, best_model, best_pred = self.train_model(
+                params=model_best_params,
+                x_train=pd.concat([x_train, x_val], axis=0),
+                y_train=pd.concat([y_train, y_val], axis=0),
+                x_val=x_test,
+                y_val=y_test,
+                model=model,
+            )
+            stop = time.time()
+            training_time = stop - start
 
-
-
+            logger.info(f"Model training finished in {training_time:.2f} seconds")
+            logger.info(f"👉 {model.model_name} scores on test set:")
+            for metric, value in best_score.items():
+                if isinstance(value, (int, float)):
                     logger.info(f"  {metric}: {value:.4f}")
 
-            # Retrain on entire training set, but keep score on cross-validation folds
-            # Get the test score using the best model
-            test_score, best_model, best_pred = self.train_model(
-                params=model_best_params,
-                x_train=pd.concat([x_train, x_val], axis=0),
-                y_train=pd.concat([y_train, y_val], axis=0),
-                x_val=x_test,
-                y_val=y_test,
-                model=model,
-            )
-
-            # Update all metrics with cross-validation means
-            for metric, value in cv_means.items():
-                if metric in test_score:  # Only update existing metrics
-                    test_score[metric] = value
-            best_score = test_score
-            best_score["TYPE"] = "crossval"
-            else:
-                # Evaluate on test set
-                self.type_name = "testset"
-                best_score, best_model, best_pred = self.train_model(
-                    params=model_best_params,
-                    x_train=pd.concat([x_train, x_val], axis=0),
-                    y_train=pd.concat([y_train, y_val], axis=0),
-                    x_val=x_test,
-                    y_val=y_test,
-                    model=model,
-                )
-
-                logger.info(f"👉 {model.model_name} scores on test set:")
-                for metric, value in best_score.items():
-                    if isinstance(value, (int, float)):
-                        logger.info(f"  {metric}: {value:.4f}")
-
             # Save predictions
             best_pred.to_csv(
                 f"{self.results_dir}/prediction.csv",
@@ -1016,7 +1289,6 @@ class ModelSelectionEngine:
 
             # Save best model
             model_path = best_model.save(self.results_dir)
-
             model_path = Path(model_path).resolve()
             best_score["MODEL_PATH"] = model_path
 
@@ -1039,32 +1311,26 @@ class ModelSelectionEngine:
             scores_tracking.sort_values(self.metric, ascending=True, inplace=True)
             scores_tracking.to_csv(scores_tracking_path, index=False)
 
-            # Save
-            stop = time.time()
-            training_time = stop - start
-            model_training.best_params = model_best_params
-            model_training.model_path = model_path
-            model_training.training_time = training_time
-            model_training.save()
-
-            # Store metrics in DB
+            # Save in db
             drop_cols = [
                 "DATE",
                 "MODEL_NAME",
-                "MODEL_PATH",
             ]
             best_score = {k: v for k, v in best_score.items() if k not in drop_cols}
             score_data = {k.lower(): v for k, v in best_score.items()}
-
-
-
-
+            model = Model.upsert(
+                name=model_name,
+                type=self.target_type,
+            )
+            ModelSelectionScore.upsert(
+                model_id=model.id,
+                model_selection_id=model_selection.id,
+                best_params=serialize_for_json(model_best_params),
+                training_time=training_time,
                 **score_data,
             )
 
-
-
-        # find best model type
+        # STEP 2 :FINDING BEST MODEL OVERALL
         scores_tracking_path = f"{self.target_dir}/scores_tracking.csv"
         scores_tracking = pd.read_csv(scores_tracking_path)
         best_score_overall = scores_tracking.iloc[0, :]
@@ -1075,12 +1341,11 @@ class ModelSelectionEngine:
         else:
             best_thresholds = None
 
-        # Remove any .best or .keras files
+        # Remove any .best or .keras files, and save best model in target_dir
         for file_path in glob.glob(os.path.join(self.target_dir, "*.best")) + glob.glob(
             os.path.join(self.target_dir, "*.keras")
         ):
             os.remove(file_path)
-        # Copy the best model in root training folder for this target
         best_model_path = Path(
             f"{self.target_dir}/{os.path.basename(best_score_overall['MODEL_PATH'])}"
         ).resolve()
@@ -1092,13 +1357,13 @@ class ModelSelectionEngine:
         with open(f"{self.target_dir}/best_params.json", "r") as f:
             best_model_params = json.load(f)[best_model_name]
 
-        # Save
-
+        # Save to db
         model_selection = ModelSelection.get(model_selection.id)
-
+        model = Model.find_by(
             name=best_score_overall["MODEL_NAME"], type=self.target_type
-        )
-        model_selection.
+        )
+        model_selection.best_model_id = model.id
+        model_selection.best_model_params = serialize_for_json(best_model_params)
         model_selection.best_thresholds = best_thresholds
         model_selection.best_model_path = best_model_path
 
@@ -1111,7 +1376,7 @@ class ModelSelectionEngine:
             k: v for k, v in best_score_overall.items() if k not in drop_cols
         }
         score_data = {k.lower(): v for k, v in best_score_overall.items()}
-        model_selection.best_score = score_data
+        model_selection.best_score = serialize_for_json(score_data)
         model_selection.save()
 
         logger.info(f"Best model overall is : {best_score_overall}")
@@ -1119,11 +1384,188 @@ class ModelSelectionEngine:
         # Consolidate best parameters from all targets into a single file
         self.consolidate_best_params()
 
-
-
+        self.best_model_ = BaseModel(
+            path=self.target_dir, target_number=self.target_number
+        )
+        self._set_fitted()
+        return self
+
+    def get_best_model(self):
+        """
+        Get the best trained model.
+
+        Returns:
+            The best model found during training
+        """
+        self._check_is_fitted()
+        return self.best_model_
+
+    def hyperoptimize(self, x_train, y_train, x_val, y_val, model: BaseModel):
+        """Choose between Ray Tune and HyperOpt standalone based on configuration."""
+        if LECRAPAUD_OPTIMIZATION_BACKEND == "hyperopt":
+            return self.hyperoptimize_hyperopt(x_train, y_train, x_val, y_val, model)
+        elif LECRAPAUD_OPTIMIZATION_BACKEND == "ray":
+            return self.hyperoptimize_ray(x_train, y_train, x_val, y_val, model)
+        else:
+            raise ValueError(
+                f"Invalid optimization backend: {LECRAPAUD_OPTIMIZATION_BACKEND}."
+            )
+
+    def hyperoptimize_hyperopt(self, x_train, y_train, x_val, y_val, model: BaseModel):
+        """Hyperparameter optimization using HyperOpt standalone (Celery-friendly)."""
+
+        logger.info("Start tuning hyperparameters with HyperOpt standalone...")
+
+        # Convert Ray search space to HyperOpt search space
+        def convert_search_space(ray_space):
+            """Convert Ray Tune search space to HyperOpt format."""
+            from ray.tune.search.sample import Categorical, Float, Integer
+
+            hp_space = {}
+            for key, value in ray_space.items():
+                if isinstance(value, Float):
+                    if (
+                        hasattr(value, "sampler")
+                        and value.sampler.__class__.__name__ == "LogUniform"
+                    ):
+                        # LogUniform distribution
+                        hp_space[key] = hp.loguniform(
+                            key, np.log(value.lower), np.log(value.upper)
+                        )
+                    else:
+                        # Uniform distribution
+                        hp_space[key] = hp.uniform(key, value.lower, value.upper)
+                elif isinstance(value, Integer):
+                    # Integer uniform distribution
+                    hp_space[key] = hp.randint(key, value.lower, value.upper)
+                elif isinstance(value, Categorical):
+                    # Categorical/choice distribution
+                    hp_space[key] = hp.choice(key, value.categories)
+                elif isinstance(value, dict):
+                    # Nested dict, recurse
+                    hp_space[key] = convert_search_space(value)
+                else:
+                    # Static value or unknown type
+                    hp_space[key] = value
+            return hp_space
+
+        # Create objective function for HyperOpt
+        def objective(params):
+            """Objective function to minimize."""
+            try:
+                # Convert numpy types to native Python types
+                params = serialize_for_json(params)
+
+                # Use existing trainable function based on perform_crossval
+                if self.perform_crossval:
+                    score = trainable_cv(
+                        params,
+                        x_train,
+                        y_train,
+                        x_val,
+                        y_val,
+                        model.model_name,
+                        self.target_type,
+                        self.experiment_name,
+                        self.target_number,
+                        model.create_model,
+                        n_splits=3,
+                        plot=model.plot,
+                        log_dir=model.log_dir,
+                        target_clf_thresholds=self.target_clf_thresholds,
+                        time_series=self.time_series,
+                        recurrent=model.recurrent,
+                    )
+                else:
+                    score, _, _ = trainable(
+                        params,
+                        x_train,
+                        y_train,
+                        x_val,
+                        y_val,
+                        model.model_name,
+                        self.target_type,
+                        self.experiment_name,
+                        self.target_number,
+                        model.create_model,
+                        plot=model.plot,
+                        log_dir=model.log_dir,
+                        target_clf_thresholds=self.target_clf_thresholds,
+                    )
+
+                # HyperOpt minimizes, so return the metric directly
+                loss = score[self.metric]
+
+                # Log trial info
+                logger.info(f"Trial completed - {self.metric}: {loss:.4f}")
+
+                return {
+                    "loss": loss,
+                    "status": STATUS_OK,
+                    "score": score,  # Keep full score dict for analysis
+                }
+
+            except Exception as e:
+                logger.error(f"Trial failed: {str(e)}")
+                return {"loss": float("inf"), "status": STATUS_OK, "error": str(e)}
+
+        # Convert search space
+        hp_search_space = convert_search_space(model.search_params)
+
+        # Run optimization
+        trials = Trials()
+        best_params = fmin(
+            fn=objective,
+            space=hp_search_space,
+            algo=tpe.suggest,
+            max_evals=self.number_of_trials,
+            trials=trials,
+            verbose=True,
+            show_progressbar=True,
+        )
+
+        # Get the actual parameter values (not just indices for hp.choice)
+        best_params = space_eval(hp_search_space, best_params)
+
+        # Convert numpy types to native Python types
+        best_params = serialize_for_json(best_params)
+
+        # Get best score from trials
+        best_trial_idx = np.argmin([t["result"]["loss"] for t in trials.trials])
+        best_score = trials.trials[best_trial_idx]["result"].get("score", {})
+
+        # Log results
+        logger.info(f"Best hyperparameters found were:\n{best_params}")
+        logger.info(f"Best Scores found were:\n{best_score}")
+
+        # Create summary DataFrame for consistency with Ray version
+        results_df = pd.DataFrame(
+            [
+                {
+                    "trial_id": i,
+                    self.metric: t["result"]["loss"],
+                    **{
+                        k: v
+                        for k, v in t["result"].get("score", {}).items()
+                        if isinstance(v, (int, float))
+                    },
+                }
+                for i, t in enumerate(trials.trials)
+                if t["result"]["status"] == STATUS_OK
+            ]
+        )
 
-
-
+        if not results_df.empty:
+            logger.info(f"Markdown table with all trials :\n{results_df.to_markdown()}")
+
+        # Save trial history for analysis
+        trials_path = f"{self.results_dir}/hyperopt_trials.pkl"
+        with open(trials_path, "wb") as f:
+            pickle.dump(trials, f)
+
+        return best_params
+
+    def hyperoptimize_ray(self, x_train, y_train, x_val, y_val, model: BaseModel):
 
         def collect_error_logs(target_dir: int, storage_path: str):
             output_error_file = f"{target_dir}/errors.log"
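Two details of the new HyperOpt path are easy to miss: `fmin` returns raw sampled values (for `hp.choice`, an *index* into the options list), which is why the code calls `space_eval` afterwards. A minimal runnable loop with a toy objective (requires `hyperopt`; not lecrapaud code):

```python
# TPE search with Trials bookkeeping, then space_eval to resolve hp.choice
# indices back into actual parameter values.
from hyperopt import STATUS_OK, Trials, fmin, hp, space_eval, tpe

space = {
    "learning_rate": hp.loguniform("learning_rate", -5, 0),  # e^-5 .. e^0
    "max_depth": hp.choice("max_depth", [4, 6, 8]),
}

def objective(params):
    # stand-in for a real training run returning a loss to minimize
    loss = (params["learning_rate"] - 0.1) ** 2 + params["max_depth"] * 0.01
    return {"loss": loss, "status": STATUS_OK}

trials = Trials()
best = fmin(objective, space, algo=tpe.suggest, max_evals=25,
            trials=trials, show_progressbar=False)

# best is raw: {'max_depth': 1} means index 1, i.e. the value 6;
# space_eval maps it back to {'max_depth': 6, 'learning_rate': ...}.
print(space_eval(space, best))
```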
@@ -1166,9 +1608,22 @@ class ModelSelectionEngine:
                 }
             )
 
+        # Choose between regular trainable or CV version based on perform_crossval flag
+        # perform_crossval controls whether to use CV during hyperopt
+        if self.perform_crossval:
+            trainable_fn = trainable_cv
+            additional_params = {
+                "n_splits": 3,  # Can be made configurable
+                "time_series": self.time_series,  # Controls whether to use TimeSeriesSplit or StratifiedKFold
+                "recurrent": model.recurrent,
+            }
+        else:
+            trainable_fn = trainable
+            additional_params = {}
+
         tuner = Tuner(
             trainable=with_parameters(
-
+                trainable_fn,
                 x_train=x_train,
                 y_train=y_train,
                 x_val=x_val,
@@ -1178,10 +1633,10 @@ class ModelSelectionEngine:
                 experiment_name=self.experiment_name,
                 target_number=self.target_number,
                 create_model=model.create_model,
-                type_name="hyperopts",
                 plot=model.plot,
                 log_dir=model.log_dir,
                 target_clf_thresholds=self.target_clf_thresholds,
+                **additional_params,
             ),
             param_space=model.search_params,
             tune_config=TuneConfig(
@@ -1221,7 +1676,7 @@ class ModelSelectionEngine:
 
         return best_params
 
-    def train_model(self, params, x_train, y_train, x_val, y_val, model:
+    def train_model(self, params, x_train, y_train, x_val, y_val, model: BaseModel):
         # Use the standalone training function to avoid duplication
         # For train_model, we pass the data directly (not as Ray references)
         return trainable(
@@ -1235,7 +1690,6 @@ class ModelSelectionEngine:
             self.experiment_name,
             self.target_number,
             model.create_model,
-            self.type_name,
             model.plot,
             log_dir=model.log_dir,
             target_clf_thresholds=self.target_clf_thresholds,
@@ -1341,11 +1795,11 @@ def evaluate(
         y_pred_proba = (
             prediction[1] if num_classes == 2 else prediction.iloc[:, 2:].values
         )
-        if num_classes > 2:
-
-
-
-
+        # if num_classes > 2:
+        #     lb = LabelBinarizer(sparse_output=False)  # Change to True for sparse matrix
+        #     lb.fit(labels)
+        #     y_true_onhot = lb.transform(y_true)
+        #     y_pred_onehot = lb.transform(y_pred)
 
         score["LOGLOSS"] = log_loss(y_true, y_pred_proba)
         score["ACCURACY"] = accuracy_score(y_true, y_pred)
@@ -1365,6 +1819,9 @@ def evaluate(
             average=("binary" if num_classes == 2 else "macro"),
         )
         score["ROC_AUC"] = float(roc_auc_score(y_true, y_pred_proba, multi_class="ovr"))
+        score["AVG_PRECISION"] = average_precision_score(
+            y_true, y_pred_proba, average="macro"
+        )
 
         # Store the complete thresholds dictionary
         if len(target_clf_thresholds.keys()) > 1:
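The new `AVG_PRECISION` metric (matching the `add_avg_precision_to_score` migration in the file list) is scikit-learn's average precision: the area under the precision-recall curve computed from predicted probabilities. A minimal binary example (toy data; `average="macro"` matters only when per-class scores are averaged):

```python
# Average precision from probabilities, as the new score["AVG_PRECISION"] does.
from sklearn.metrics import average_precision_score

y_true = [0, 0, 1, 1, 1, 0]
y_proba = [0.1, 0.4, 0.35, 0.8, 0.7, 0.2]   # P(class=1)

print(round(average_precision_score(y_true, y_proba, average="macro"), 4))
```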
@@ -1719,6 +2176,20 @@ class Thresholds(BaseModel):
 def find_best_threshold(
     prediction: pd.DataFrame, metric: str = "recall", target_value: float | None = None
 ) -> Thresholds:
+    def _normalize_class_label(cls):
+        if isinstance(cls, (np.integer, int)):
+            return int(cls)
+        if isinstance(cls, (float, np.floating)) and cls.is_integer():
+            return int(cls)
+        if isinstance(cls, str):
+            try:
+                as_float = float(cls)
+                if as_float.is_integer():
+                    return int(as_float)
+            except ValueError:
+                pass
+        return cls
+
     """
     General function to find best threshold optimizing recall, precision, or f1.
 
@@ -1737,10 +2208,15 @@ def find_best_threshold(
     pred_cols = [
         col for col in prediction.columns if col not in ["ID", "TARGET", "PRED"]
     ]
-    classes =
+    classes = (
+        [1]
+        if len(pred_cols) <= 2
+        else sorted({_normalize_class_label(cls) for cls in y_true.unique()}, key=str)
+    )
 
     results = {}
-    for
+    for raw_cls in classes:
+        cls = _normalize_class_label(raw_cls)
         cls_str = str(cls)
         if cls_str not in prediction.columns and cls not in prediction.columns:
             logger.warning(f"Missing predicted probabilities for class '{cls}'")
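The point of `_normalize_class_label` is that integer-like labels of any type collapse to the same `int`, so `"2"`, `2.0`, and `np.int64(2)` no longer produce duplicate threshold entries, while genuinely non-numeric labels pass through unchanged. The helper re-implemented verbatim for a quick check (illustration only):

```python
# Behaviour of the new _normalize_class_label helper from the hunk above.
import numpy as np

def _normalize_class_label(cls):
    if isinstance(cls, (np.integer, int)):
        return int(cls)
    if isinstance(cls, (float, np.floating)) and cls.is_integer():
        return int(cls)
    if isinstance(cls, str):
        try:
            as_float = float(cls)
            if as_float.is_integer():
                return int(as_float)
        except ValueError:
            pass
    return cls

assert _normalize_class_label(np.int64(2)) == 2
assert _normalize_class_label(2.0) == 2
assert _normalize_class_label("2.0") == 2
assert _normalize_class_label("cat") == "cat"   # non-numeric labels pass through
```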