lecrapaud 0.19.0__py3-none-any.whl → 0.22.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lecrapaud/__init__.py +22 -1
- lecrapaud/{api.py → base.py} +331 -241
- lecrapaud/config.py +15 -3
- lecrapaud/db/alembic/versions/2025_10_25_0635-07e303521594_add_unique_constraint_to_score.py +39 -0
- lecrapaud/db/alembic/versions/2025_10_26_1727-033e0f7eca4f_merge_score_and_model_trainings_into_.py +264 -0
- lecrapaud/db/alembic/versions/2025_10_28_2006-0a8fb7826e9b_add_number_of_targets_and_remove_other_.py +75 -0
- lecrapaud/db/models/__init__.py +2 -4
- lecrapaud/db/models/base.py +116 -65
- lecrapaud/db/models/experiment.py +195 -182
- lecrapaud/db/models/feature_selection.py +0 -3
- lecrapaud/db/models/feature_selection_rank.py +0 -18
- lecrapaud/db/models/model_selection.py +2 -2
- lecrapaud/db/models/{score.py → model_selection_score.py} +29 -12
- lecrapaud/db/session.py +4 -0
- lecrapaud/experiment.py +44 -17
- lecrapaud/feature_engineering.py +45 -674
- lecrapaud/feature_preprocessing.py +1202 -0
- lecrapaud/feature_selection.py +145 -332
- lecrapaud/integrations/sentry_integration.py +46 -0
- lecrapaud/misc/tabpfn_tests.ipynb +2 -2
- lecrapaud/mixins.py +247 -0
- lecrapaud/model_preprocessing.py +295 -0
- lecrapaud/model_selection.py +612 -242
- lecrapaud/pipeline.py +548 -0
- lecrapaud/search_space.py +2 -1
- lecrapaud/utils.py +36 -3
- lecrapaud-0.22.6.dist-info/METADATA +423 -0
- lecrapaud-0.22.6.dist-info/RECORD +51 -0
- {lecrapaud-0.19.0.dist-info → lecrapaud-0.22.6.dist-info}/WHEEL +1 -1
- {lecrapaud-0.19.0.dist-info → lecrapaud-0.22.6.dist-info/licenses}/LICENSE +1 -1
- lecrapaud/db/models/model_training.py +0 -64
- lecrapaud/jobs/__init__.py +0 -13
- lecrapaud/jobs/config.py +0 -17
- lecrapaud/jobs/scheduler.py +0 -30
- lecrapaud/jobs/tasks.py +0 -17
- lecrapaud-0.19.0.dist-info/METADATA +0 -249
- lecrapaud-0.19.0.dist-info/RECORD +0 -48
lecrapaud/model_selection.py
CHANGED
@@ -15,7 +15,7 @@ from pydantic import BaseModel
 import ast
 
 # ML models
-from sklearn.model_selection import TimeSeriesSplit
+from sklearn.model_selection import TimeSeriesSplit, StratifiedKFold, KFold
 from sklearn.calibration import CalibratedClassifierCV
 from sklearn.metrics import (
     mean_absolute_percentage_error,
@@ -55,31 +55,46 @@ from tensorboardX import SummaryWriter
 
 # Optimization
 import ray
-from ray.tune import Tuner, TuneConfig, with_parameters
-from ray.train import RunConfig
+from ray.tune import Tuner, TuneConfig, with_parameters, RunConfig
 from ray.tune.search.hyperopt import HyperOptSearch
 from ray.tune.search.bayesopt import BayesOptSearch
 from ray.tune.logger import TBXLoggerCallback
 from ray.tune.schedulers import ASHAScheduler
 from ray.air import session
 
+# HyperOpt standalone
+from hyperopt import hp, fmin, tpe, Trials, STATUS_OK, space_eval
+
 # Internal library
 from lecrapaud.search_space import all_models
 from lecrapaud.directories import clean_directory
 from lecrapaud.utils import copy_any, contains_best, logger, serialize_for_json
-from lecrapaud.config import PYTHON_ENV
+from lecrapaud.config import PYTHON_ENV, LECRAPAUD_OPTIMIZATION_BACKEND
 from lecrapaud.feature_selection import load_train_data
 from lecrapaud.db import (
     Model,
     ModelSelection,
-
-    Score,
+    ModelSelectionScore,
     Target,
     Experiment,
 )
+from lecrapaud.mixins import LeCrapaudEstimatorMixin
 
 os.environ["COVERAGE_FILE"] = str(Path(".coverage").resolve())
 
+# Suppress XGBoost and LightGBM logging
+import logging
+
+logging.getLogger("lightgbm").setLevel(logging.ERROR)
+logging.getLogger("xgboost").setLevel(logging.ERROR)
+
+# Set global verbosity for XGBoost
+xgb.set_config(verbosity=0)
+
+# Suppress warnings
+warnings.filterwarnings("ignore", category=UserWarning)
+warnings.filterwarnings("ignore", category=FutureWarning)
+
 # Reproducible result
 keras.utils.set_random_seed(42)
 np.random.seed(42)
@@ -110,7 +125,64 @@ def test_hardware():
     warnings.filterwarnings("ignore", category=UserWarning, module="pydantic")
 
 
-class ModelEngine:
+class CatBoostWrapper:
+    """
+    Transparent proxy for a CatBoost model that accepts arbitrary keyword arguments
+    as direct attributes, while forwarding all method calls and properties.
+    """
+
+    __slots__ = ("_model", "_extra_attrs")
+
+    def __init__(self, model, **kwargs):
+        object.__setattr__(self, "_model", model)
+        object.__setattr__(self, "_extra_attrs", {})
+        # Register kwargs as direct attributes
+        for key, value in kwargs.items():
+            setattr(self, key, value)
+
+    # ---- Transparent access ----
+    def __getattr__(self, name):
+        """Forward attribute access to the underlying model if not found."""
+        model = object.__getattribute__(self, "_model")
+        if hasattr(model, name):
+            return getattr(model, name)
+        extra_attrs = object.__getattribute__(self, "_extra_attrs")
+        if name in extra_attrs:
+            return extra_attrs[name]
+        raise AttributeError(f"{type(self).__name__!r} has no attribute {name!r}")
+
+    def __setattr__(self, name, value):
+        """Set to wrapper or forward to model when appropriate."""
+        if name in CatBoostWrapper.__slots__:
+            object.__setattr__(self, name, value)
+            return
+
+        model = object.__getattribute__(self, "_model")
+        if hasattr(model, name):
+            setattr(model, name, value)
+        else:
+            extra_attrs = object.__getattribute__(self, "_extra_attrs")
+            extra_attrs[name] = value
+
+    def __dir__(self):
+        """Merge dir() from wrapper, model, and custom attributes."""
+        base = set(super().__dir__())
+        model_attrs = set(dir(object.__getattribute__(self, "_model")))
+        extra_attrs = set(object.__getattribute__(self, "_extra_attrs").keys())
+        return sorted(base | model_attrs | extra_attrs)
+
+    def __repr__(self):
+        model = object.__getattribute__(self, "_model")
+        extras = object.__getattribute__(self, "_extra_attrs")
+        return f"CatBoostWrapper(model={model.__class__.__name__}, extras={extras})"
+
+    @property
+    def model(self):
+        """Access the raw CatBoost model."""
+        return object.__getattribute__(self, "_model")
+
+
+class BaseModel:
 
     def __init__(
         self,
@@ -296,12 +368,15 @@ class ModelEngine:
             )
 
             # Attach metadata for consistency with sklearn path
-
-
-
+            model_wrapped = CatBoostWrapper(
+                model, model_name=self.model_name, target_type=self.target_type
+            )
+            logger.info(
+                f"Successfully created a {model_wrapped.model_name} at {datetime.now()}"
+            )
 
-            self._model =
-            return
+            self._model = model_wrapped
+            return model_wrapped
 
     def fit_boosting(self, x_train, y_train, x_val, y_val, params):
         """
@@ -350,6 +425,7 @@ class ModelEngine:
                     "metric": eval_metric,
                     "num_class": num_class,
                     "verbose": -1,
+                    "verbose_eval": False,
                 },
                 num_boost_round=params["num_boost_round"],
                 train_set=train_data,
@@ -361,6 +437,7 @@ class ModelEngine:
                     ),
                     lgb.record_evaluation(evals_result),
                     tensorboard_callback,
+                    lgb.log_evaluation(period=0),  # Disable evaluation logging
                 ],
             )
         else:
@@ -402,7 +479,7 @@ class ModelEngine:
                 if self.target_type == "regression"
                 else ("logloss" if num_class <= 2 else "mlogloss")
             )
-
+            # XGBoost verbosity already set globally
             model = xgb.train(
                 params={
                     **params["model_params"],
@@ -417,11 +494,11 @@ class ModelEngine:
                     xgb.callback.EarlyStopping(
                         rounds=params["early_stopping_rounds"], save_best=True
                     ),
-
+                    # Removed EvaluationMonitor to suppress logs
                     tensorboard_callback,
                 ],
                 evals_result=evals_result,  # Record evaluation result
-                verbose_eval=
+                verbose_eval=False,  # Disable evaluation logging
             )
 
         model.model_name = self.create_model
@@ -686,6 +763,171 @@ class ModelEngine:
         )
 
 
+def trainable_cv(
+    params,
+    x_train,
+    y_train,
+    x_val,
+    y_val,
+    model_name,
+    target_type,
+    experiment_name,
+    target_number,
+    create_model,
+    n_splits=3,
+    plot=False,
+    log_dir=None,
+    target_clf_thresholds: dict = None,
+    time_series=True,
+    recurrent=False,
+):
+    """Cross-validation version of trainable for hyperopt.
+
+    Uses TimeSeriesSplit for temporal data or StratifiedKFold/KFold for i.i.d. data.
+    Returns pooled metrics (single logloss/RMSE calculated on all concatenated predictions).
+    """
+    # Combine train and validation data for cross-validation
+    if recurrent:
+        x_train_val = np.concatenate([x_train, x_val], axis=0)
+        y_train_val = np.concatenate([y_train, y_val], axis=0)
+    else:
+        x_train_val = pd.concat([x_train, x_val], axis=0)
+        y_train_val = pd.concat([y_train, y_val], axis=0)
+        # Store original index for later use if needed
+        original_index = x_train_val.index.copy()
+        # Reset index for proper iloc indexing with CV splits
+        x_train_val = x_train_val.reset_index(drop=True)
+        y_train_val = y_train_val.reset_index(drop=True)
+
+    # Choose appropriate cross-validation splitter
+    if time_series:
+        # Time series split for temporal data
+        n_samples = len(x_train_val)
+        test_size = int(n_samples / (n_splits + 1))  # Ensure reasonable test size
+        cv_splitter = TimeSeriesSplit(n_splits=n_splits, test_size=test_size)
+    else:
+        # Stratified or regular K-fold for i.i.d. data
+        if target_type == "classification":
+            cv_splitter = StratifiedKFold(
+                n_splits=n_splits, shuffle=True, random_state=42
+            )
+        else:
+            cv_splitter = KFold(n_splits=n_splits, shuffle=True, random_state=42)
+
+    # Store all predictions and true values for pooled metrics
+    all_predictions = []
+    all_y_true = []
+    fold_times = []
+
+    # Get splits based on the CV strategy
+    if time_series or target_type == "regression":
+        splits = cv_splitter.split(x_train_val)
+    else:
+        # For stratified split, we need to pass y
+        if recurrent:
+            # Extract the target from the 2D array (first column is target)
+            y_for_split = y_train_val[:, 0]
+        else:
+            y_for_split = y_train_val
+        splits = cv_splitter.split(x_train_val, y_for_split)
+
+    for fold_idx, (train_idx, val_idx) in enumerate(splits):
+        # Extract fold data
+        if recurrent:
+            x_fold_train = x_train_val[train_idx]
+            y_fold_train = y_train_val[train_idx]
+            x_fold_val = x_train_val[val_idx]
+            y_fold_val = y_train_val[val_idx]
+        else:
+            x_fold_train = x_train_val.iloc[train_idx]
+            y_fold_train = y_train_val.iloc[train_idx]
+            x_fold_val = x_train_val.iloc[val_idx]
+            y_fold_val = y_train_val.iloc[val_idx]
+
+        # Train model for this fold
+        model = BaseModel(
+            model_name=model_name,
+            target_type=target_type,
+            target_number=target_number,
+            create_model=create_model,
+            plot=False,  # Disable individual fold plots
+            log_dir=log_dir,
+        )
+
+        if recurrent:
+            timesteps = params["timesteps"]
+            x_fold_train = x_fold_train[:, -timesteps:, :]
+            x_fold_val = x_fold_val[:, -timesteps:, :]
+
+        # Fit model
+        model.fit(x_fold_train, y_fold_train, x_fold_val, y_fold_val, params)
+
+        # Get predictions
+        y_pred = model.predict(x_fold_val)
+
+        # Handle recurrent model indexing
+        if recurrent:
+            y_fold_val = pd.DataFrame(
+                y_fold_val, columns=["TARGET", "index"]
+            ).set_index("index")
+            y_pred.index = y_fold_val.index
+
+        # Store predictions and true values
+        all_predictions.append(y_pred)
+        all_y_true.append(y_fold_val)
+
+    # Concatenate all fold predictions
+    if target_type == "classification":
+        # For classification, we need to handle probability columns
+        all_pred_df = pd.concat(all_predictions, axis=0)
+        all_y_series = pd.concat(all_y_true, axis=0)
+        # Ensure we have a DataFrame with TARGET column
+        if isinstance(all_y_series, pd.Series):
+            all_y_df = pd.DataFrame({"TARGET": all_y_series})
+        else:
+            all_y_df = all_y_series
+    else:
+        # For regression, just concatenate the predictions
+        all_pred_series = pd.concat(all_predictions, axis=0)
+        all_y_series = pd.concat(all_y_true, axis=0)
+        all_pred_df = pd.DataFrame({"PRED": all_pred_series})
+        all_y_df = pd.DataFrame({"TARGET": all_y_series})
+
+    # Create combined prediction DataFrame
+    prediction = pd.concat([all_y_df[["TARGET"]], all_pred_df], axis=1)
+
+    # Calculate pooled metrics
+    score = {
+        "DATE": datetime.now(),
+        "MODEL_NAME": model_name,
+        "EVAL_DATA_STD": prediction["TARGET"].std(),
+    }
+
+    # Unscale if needed (for regression with scaling)
+    if (
+        model.need_scaling
+        and target_type == "regression"
+        and model.scaler_y is not None
+    ):
+        prediction.loc[:, "TARGET"] = model.scaler_y.inverse_transform(
+            prediction[["TARGET"]].values
+        )
+        prediction.loc[:, "PRED"] = model.scaler_y.inverse_transform(
+            prediction[["PRED"]].values
+        )
+
+    # Evaluate with pooled predictions
+    score.update(evaluate(prediction, target_type, target_clf_thresholds))
+
+    metric = "RMSE" if target_type == "regression" else "LOGLOSS"
+    logger.info(f"{model_name} CV pooled {metric}: {score[metric]:.4f}")
+
+    # Report to Ray if in Ray context
+    if session.get_session():
+        session.report(metrics=score)
+    return score
+
+
 def trainable(
     params,
     x_train,
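`trainable_cv` above pools the out-of-fold predictions and computes a single metric over the concatenation, rather than averaging per-fold scores. The two disagree whenever folds have unequal sizes, as this small self-contained comparison shows (the numbers are made up for illustration):

    import numpy as np
    from sklearn.metrics import mean_squared_error

    # Hypothetical (y_true, y_pred) pairs from three folds of unequal size.
    folds = [
        (np.array([1.0, 2.0]), np.array([1.1, 1.9])),
        (np.array([3.0]), np.array([2.0])),
        (np.array([4.0, 5.0, 6.0]), np.array([4.1, 5.2, 5.8])),
    ]

    # Averaging per-fold RMSEs weights every fold equally, regardless of size.
    mean_of_folds = np.mean([mean_squared_error(y, p) ** 0.5 for y, p in folds])

    # Pooling (the trainable_cv approach) weights every sample equally.
    y_all = np.concatenate([y for y, _ in folds])
    p_all = np.concatenate([p for _, p in folds])
    pooled = mean_squared_error(y_all, p_all) ** 0.5

    print(f"mean of per-fold RMSE: {mean_of_folds:.3f}, pooled RMSE: {pooled:.3f}")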
@@ -697,14 +939,13 @@ def trainable(
     experiment_name,
     target_number,
     create_model,
-    type_name="hyperopts",
     plot=False,
     log_dir=None,
     target_clf_thresholds: dict = None,
 ):
     """Standalone version of train_model that doesn't depend on self"""
     # Create model engine
-    model =
+    model = BaseModel(
         model_name=model_name,
         target_type=target_type,
         target_number=target_number,
@@ -723,9 +964,7 @@ def trainable(
         x_val = x_val[:, -timesteps:, :]
 
     # Compile and fit model on train set
-    start = time.time()
     model.fit(x_train, y_train, x_val, y_val, params)
-    stop = time.time()
 
     # Prediction on val set
     y_pred = model.predict(x_val)
@@ -755,8 +994,6 @@ def trainable(
     score = {
         "DATE": datetime.now(),
        "MODEL_NAME": model.model_name,
-        "TYPE": type_name,
-        "TRAINING_TIME": stop - start,
         "EVAL_DATA_STD": prediction["TARGET"].std(),
     }
 
@@ -765,77 +1002,107 @@ def trainable(
     metric = "RMSE" if target_type == "regression" else "LOGLOSS"
     logger.info(f"{model.model_name} scores on validation set: {score[metric]:.4f}")
 
-    if
+    # Report to Ray if in Ray context
+    if session.get_session():
         session.report(metrics=score)
         return score
 
     return score, model, prediction
 
 
-class ModelSelectionEngine:
+class ModelSelector(LeCrapaudEstimatorMixin):
 
     def __init__(
         self,
-
-
-        target_number,
-        target_clf,
-        experiment,
-        models_idx,
-        time_series,
-        date_column,
-        group_column,
-        target_clf_thresholds,
+        experiment: Experiment = None,
+        target_number: int = None,
         **kwargs,
     ):
-
-
+        # The mixin will automatically set all experiment.context parameters as attributes
+        super().__init__(experiment=experiment, target_number=target_number, **kwargs)
+
+        # Set defaults for required parameters if not provided
+        if not hasattr(self, "target_clf"):
+            self.target_clf = []
+        if not hasattr(self, "models_idx"):
+            self.models_idx = []
+        if not hasattr(self, "time_series"):
+            self.time_series = False
+        if not hasattr(self, "date_column"):
+            self.date_column = None
+        if not hasattr(self, "group_column"):
+            self.group_column = None
+        if not hasattr(self, "target_clf_thresholds"):
+            self.target_clf_thresholds = {}
         self.target_number = target_number
-        self.experiment = experiment
-        self.target_clf = target_clf
-        self.models_idx = models_idx
-        self.time_series = time_series
-        self.date_column = date_column
-        self.group_column = group_column
-        self.target_clf_thresholds = (
-            target_clf_thresholds[target_number]
-            if target_number in target_clf_thresholds.keys()
-            else None
-        )
 
-
-
-
-
-
-
-
-
-
-
-
-
-
+        # Handle target_clf_thresholds for specific target
+        # Handle both string and integer keys for backward compatibility
+        if self.target_number and self.target_clf_thresholds:
+            # Try both integer and string versions of the target number
+            if self.target_number in self.target_clf_thresholds:
+                self.target_clf_thresholds = self.target_clf_thresholds[
+                    self.target_number
+                ]
+            elif str(self.target_number) in self.target_clf_thresholds:
+                self.target_clf_thresholds = self.target_clf_thresholds[
+                    str(self.target_number)
+                ]
+
+        # Derived attributes
+        if self.target_number is not None:
+            self.target_type = (
+                "classification"
+                if self.target_number in self.target_clf
+                else "regression"
+            )
+            self.metric = "RMSE" if self.target_type == "regression" else "LOGLOSS"
+
+        # Set paths and features if experiment is available
+        if self.experiment:
+            self.experiment_dir = self.experiment.path
+            self.experiment_id = self.experiment.id
+            self.data_dir = f"{self.experiment_dir}/data"
+            self.preprocessing_dir = f"{self.experiment_dir}/preprocessing"
+
+            if self.target_number is not None:
+                self.target_dir = f"{self.experiment_dir}/TARGET_{self.target_number}"
+                self.features = self.experiment.get_features(self.target_number)
+
+            self.all_features = self.experiment.get_all_features(
+                date_column=self.date_column, group_column=self.group_column
+            )
 
     # Main training function
-    def
-        self,
-        experiment_name,
-        perform_hyperopt=True,
-        number_of_trials=20,
-        perform_crossval=False,
-        plot=True,
-        clean_dir=False,  # TODO: This has been unused because now feature_selection is in the target directory
-        preserve_model=True,
-        best_params=None,
-    ):
+    def fit(self, X, y=None, reshaped_data=None, best_params=None):
         """
-
-
+        Fit the model selector (train and select best model).
+
+        Args:
+            X: Either a DataFrame or a dict with train/val/test data
+            y: Target values (ignored, uses TARGET columns)
+            reshaped_data: Optional reshaped data for recurrent models
+            best_params: Optional pre-defined best parameters
+
+        Returns:
+            self: Returns self for chaining
         """
-
-
-
+        # Handle both DataFrame and dict inputs
+        if isinstance(X, dict):
+            self.data = X
+            self.reshaped_data = reshaped_data
+        else:
+            # For simple DataFrame input, we expect it to be just training data
+            # This is less common for ModelSelector which typically needs train/val/test
+            raise ValueError("ModelSelector requires a dict with train/val/test data")
+        # Get all parameters from experiment context
+        context = self.experiment.context
+        self.experiment_name = context.get("experiment_name", "")
+        self.plot = context.get("plot", True)
+        self.number_of_trials = context.get("number_of_trials", 20)
+        self.perform_crossval = context.get("perform_crossval", False)
+        self.preserve_model = context.get("preserve_model", True)
+        self.perform_hyperopt = context.get("perform_hyperopt", True)
 
         if self.experiment_id is None:
             raise ValueError("Please provide a experiment.")
@@ -885,12 +1152,11 @@ class ModelSelectionEngine:
         # create model selection in db
         target = Target.find_by(name=f"TARGET_{self.target_number}")
         model_selection = ModelSelection.upsert(
-            match_fields=["target_id", "experiment_id"],
             target_id=target.id,
             experiment_id=self.experiment_id,
         )
 
-        #
+        # STEP 1 : TRAINING MODELS
         for i in self.models_idx:
             config = all_models[i]
             recurrent = config["recurrent"]
@@ -903,24 +1169,16 @@ class ModelSelectionEngine:
             self.results_dir = f"{self.target_dir}/{model_name}"
             if not os.path.exists(f"{self.results_dir}"):
                 os.makedirs(f"{self.results_dir}")
-            elif preserve_model and contains_best(self.results_dir):
+            elif self.preserve_model and contains_best(self.results_dir):
                 continue
-            elif perform_hyperopt:
+            elif self.perform_hyperopt:
                 clean_directory(self.results_dir)
 
-            logger.info(
-
-                match_fields=["name", "type"],
-                name=model_name,
-                type=self.target_type,
-            )
-            model_training = ModelTraining.upsert(
-                match_fields=["model_id", "model_selection_id"],
-                model_id=model.id,
-                model_selection_id=model_selection.id,
+            logger.info(
+                f"{self.experiment_name} - Training a {model_name} at {datetime.now()} for TARGET_{self.target_number}"
             )
 
-            #
+            # Getting data
             if recurrent:
                 # Clear cluster from previous Keras session graphs.
                 K.clear_session()
@@ -930,7 +1188,7 @@ class ModelSelectionEngine:
                     for i, e in enumerate(self.all_features)
                    if e in set(self.features)
                 ]
-                # TODO: Verify that features_idx are the right one, because scaling can re-arrange columns...
+                # TODO: Verify that features_idx are the right one, because scaling can re-arrange columns (should be good)...
                 x_train = x_train_reshaped[:, :, features_idx]
                 y_train = y_train_reshaped[:, [self.target_number, 0]]
                 x_val = x_val_reshaped[:, :, features_idx]
@@ -960,8 +1218,9 @@ class ModelSelectionEngine:
                 y_test = test[f"TARGET_{self.target_number}"].rename("TARGET")
 
             log_dir = get_log_dir(self.target_dir, model_name)
-
-
+
+            # Instantiate model
+            model = BaseModel(
                 target_number=self.target_number,
                 model_name=model_name,
                 search_params=config["search_params"],
@@ -971,9 +1230,9 @@ class ModelSelectionEngine:
                 log_dir=log_dir,
             )
 
-            start = time.time()
             # Tuning hyperparameters
-
+            start = time.time()
+            if self.perform_hyperopt:
                 model_best_params = self.hyperoptimize(
                     x_train, y_train, x_val, y_val, model
                 )
@@ -989,7 +1248,7 @@ class ModelSelectionEngine:
                     f"Could not find {model_name} in current data. Try to run an hyperoptimization by setting `perform_hyperopt` to true, or pass `best_params`"
                 )
 
-            #
+            # Save best params
             best_params_file = f"{self.target_dir}/best_params.json"
             try:
                 with open(best_params_file, "r") as f:
@@ -1001,114 +1260,25 @@ class ModelSelectionEngine:
             with open(best_params_file, "w") as f:
                 json.dump(json_dict, f, indent=4)
 
-            #
-            if
-
-
-
-
-
-
-
-
-
-                for i, (train_index, val_index) in enumerate(tscv.split(x_train_val)):
-                    self.type_name = f"crossval_fold_{i}"
-
-                    if self.time_series:
-                        date_series = pd.concat(
-                            [
-                                train[self.date_column],
-                                val[self.date_column],
-                                test[self.date_column],
-                            ],
-                            axis=0,
-                        ).reset_index(drop=True)
-
-                        date_series = date_series.map(pd.Timestamp.fromordinal)
-
-                        # Now you can use the actual train/val indices to extract ranges
-                        train_start = date_series.iloc[train_index[0]]
-                        train_end = date_series.iloc[train_index[-1]]
-                        val_start = date_series.iloc[val_index[0]]
-                        val_end = date_series.iloc[val_index[-1]]
-
-                        logger.info(
-                            f"[Fold {i}] Train: {len(train_index)} samples from {train_start.date()} to {train_end.date()} | "
-                            f"Validation: {len(val_index)} samples from {val_start.date()} to {val_end.date()}"
-                        )
-                    else:
-                        logger.info(
-                            f"[Fold {i}] Train: {len(train_index)} samples | Validation: {len(val_index)} samples"
-                        )
-
-                    # Train the model and get the score
-                    if recurrent:
-                        cv_score, _, _ = self.train_model(
-                            params=model_best_params,
-                            x_train=x_train_val[train_index],
-                            y_train=y_train_val[train_index],
-                            x_val=x_train_val[val_index],
-                            y_val=y_train_val[val_index],
-                            model=model,
-                        )
-                    else:
-                        cv_score, _, _ = self.train_model(
-                            params=model_best_params,
-                            x_train=x_train_val.iloc[train_index],
-                            y_train=y_train_val.iloc[train_index],
-                            x_val=x_train_val.iloc[val_index],
-                            y_val=y_train_val.iloc[val_index],
-                            model=model,
-                        )
-
-                    # Append score to the list
-                    cv_scores.append(cv_score)
-
-                # Calculate mean of all numerical metrics across all cross-validation folds
-                cv_scores_df = pd.DataFrame(cv_scores)
-                # Get mean of all numeric columns
-                cv_means = cv_scores_df.mean(numeric_only=True).to_dict()
+            # Always evaluate on test set (no cross-validation here)
+            # The hyperopt already did CV if needed to find best params
+            best_score, best_model, best_pred = self.train_model(
+                params=model_best_params,
+                x_train=pd.concat([x_train, x_val], axis=0),
+                y_train=pd.concat([y_train, y_val], axis=0),
+                x_val=x_test,
+                y_val=y_test,
+                model=model,
+            )
+            stop = time.time()
+            training_time = stop - start
 
-
-
+            logger.info(f"Model training finished in {training_time:.2f} seconds")
+            logger.info(f"👉 {model.model_name} scores on test set:")
+            for metric, value in best_score.items():
+                if isinstance(value, (int, float)):
                     logger.info(f"    {metric}: {value:.4f}")
 
-                # Retrain on entire training set, but keep score on cross-validation folds
-                # Get the test score using the best model
-                test_score, best_model, best_pred = self.train_model(
-                    params=model_best_params,
-                    x_train=pd.concat([x_train, x_val], axis=0),
-                    y_train=pd.concat([y_train, y_val], axis=0),
-                    x_val=x_test,
-                    y_val=y_test,
-                    model=model,
-                )
-
-                # Update all metrics with cross-validation means
-                for metric, value in cv_means.items():
-                    if metric in test_score:  # Only update existing metrics
-                        test_score[metric] = value
-                best_score = test_score
-                best_score["TYPE"] = "crossval"
-            else:
-                # Evaluate on test set
-                self.type_name = "testset"
-                best_score, best_model, best_pred = self.train_model(
-                    params=model_best_params,
-                    x_train=pd.concat([x_train, x_val], axis=0),
-                    y_train=pd.concat([y_train, y_val], axis=0),
-                    x_val=x_test,
-                    y_val=y_test,
-                    model=model,
-                )
-
-                logger.info(f"👉 {model.model_name} scores on test set:")
-                for metric, value in best_score.items():
-                    if isinstance(value, (int, float)):
-                        logger.info(f"    {metric}: {value:.4f}")
-
             # Save predictions
             best_pred.to_csv(
                 f"{self.results_dir}/prediction.csv",
@@ -1119,7 +1289,6 @@ class ModelSelectionEngine:
 
             # Save best model
             model_path = best_model.save(self.results_dir)
-
             model_path = Path(model_path).resolve()
             best_score["MODEL_PATH"] = model_path
 
@@ -1142,32 +1311,26 @@ class ModelSelectionEngine:
             scores_tracking.sort_values(self.metric, ascending=True, inplace=True)
             scores_tracking.to_csv(scores_tracking_path, index=False)
 
-            # Save
-            stop = time.time()
-            training_time = stop - start
-            model_training.best_params = model_best_params
-            model_training.model_path = model_path
-            model_training.training_time = training_time
-            model_training.save()
-
-            # Store metrics in DB
+            # Save in db
             drop_cols = [
                 "DATE",
                 "MODEL_NAME",
-                "MODEL_PATH",
             ]
             best_score = {k: v for k, v in best_score.items() if k not in drop_cols}
             score_data = {k.lower(): v for k, v in best_score.items()}
-
-
-
-
+            model = Model.upsert(
+                name=model_name,
+                type=self.target_type,
+            )
+            ModelSelectionScore.upsert(
+                model_id=model.id,
+                model_selection_id=model_selection.id,
+                best_params=serialize_for_json(model_best_params),
+                training_time=training_time,
                 **score_data,
             )
 
-
-
-        # find best model type
+        # STEP 2 :FINDING BEST MODEL OVERALL
         scores_tracking_path = f"{self.target_dir}/scores_tracking.csv"
         scores_tracking = pd.read_csv(scores_tracking_path)
         best_score_overall = scores_tracking.iloc[0, :]
@@ -1178,12 +1341,11 @@ class ModelSelectionEngine:
         else:
             best_thresholds = None
 
-        # Remove any .best or .keras files
+        # Remove any .best or .keras files, and save best model in target_dir
         for file_path in glob.glob(os.path.join(self.target_dir, "*.best")) + glob.glob(
             os.path.join(self.target_dir, "*.keras")
         ):
             os.remove(file_path)
-        # Copy the best model in root training folder for this target
         best_model_path = Path(
             f"{self.target_dir}/{os.path.basename(best_score_overall['MODEL_PATH'])}"
         ).resolve()
@@ -1195,13 +1357,13 @@ class ModelSelectionEngine:
         with open(f"{self.target_dir}/best_params.json", "r") as f:
             best_model_params = json.load(f)[best_model_name]
 
-        # Save
-
+        # Save to db
         model_selection = ModelSelection.get(model_selection.id)
-
+        model = Model.find_by(
             name=best_score_overall["MODEL_NAME"], type=self.target_type
-        )
-        model_selection.
+        )
+        model_selection.best_model_id = model.id
+        model_selection.best_model_params = serialize_for_json(best_model_params)
         model_selection.best_thresholds = best_thresholds
         model_selection.best_model_path = best_model_path
 
@@ -1214,7 +1376,7 @@ class ModelSelectionEngine:
             k: v for k, v in best_score_overall.items() if k not in drop_cols
         }
         score_data = {k.lower(): v for k, v in best_score_overall.items()}
-        model_selection.best_score = score_data
+        model_selection.best_score = serialize_for_json(score_data)
         model_selection.save()
 
         logger.info(f"Best model overall is : {best_score_overall}")
@@ -1222,11 +1384,188 @@ class ModelSelectionEngine:
         # Consolidate best parameters from all targets into a single file
         self.consolidate_best_params()
 
-
-
+        self.best_model_ = BaseModel(
+            path=self.target_dir, target_number=self.target_number
+        )
+        self._set_fitted()
+        return self
+
+    def get_best_model(self):
+        """
+        Get the best trained model.
+
+        Returns:
+            The best model found during training
+        """
+        self._check_is_fitted()
+        return self.best_model_
+
+    def hyperoptimize(self, x_train, y_train, x_val, y_val, model: BaseModel):
+        """Choose between Ray Tune and HyperOpt standalone based on configuration."""
+        if LECRAPAUD_OPTIMIZATION_BACKEND == "hyperopt":
+            return self.hyperoptimize_hyperopt(x_train, y_train, x_val, y_val, model)
+        elif LECRAPAUD_OPTIMIZATION_BACKEND == "ray":
+            return self.hyperoptimize_ray(x_train, y_train, x_val, y_val, model)
+        else:
+            raise ValueError(
+                f"Invalid optimization backend: {LECRAPAUD_OPTIMIZATION_BACKEND}."
+            )
+
+    def hyperoptimize_hyperopt(self, x_train, y_train, x_val, y_val, model: BaseModel):
+        """Hyperparameter optimization using HyperOpt standalone (Celery-friendly)."""
+
+        logger.info("Start tuning hyperparameters with HyperOpt standalone...")
+
+        # Convert Ray search space to HyperOpt search space
+        def convert_search_space(ray_space):
+            """Convert Ray Tune search space to HyperOpt format."""
+            from ray.tune.search.sample import Categorical, Float, Integer
+
+            hp_space = {}
+            for key, value in ray_space.items():
+                if isinstance(value, Float):
+                    if (
+                        hasattr(value, "sampler")
+                        and value.sampler.__class__.__name__ == "LogUniform"
+                    ):
+                        # LogUniform distribution
+                        hp_space[key] = hp.loguniform(
+                            key, np.log(value.lower), np.log(value.upper)
+                        )
+                    else:
+                        # Uniform distribution
+                        hp_space[key] = hp.uniform(key, value.lower, value.upper)
+                elif isinstance(value, Integer):
+                    # Integer uniform distribution
+                    hp_space[key] = hp.randint(key, value.lower, value.upper)
+                elif isinstance(value, Categorical):
+                    # Categorical/choice distribution
+                    hp_space[key] = hp.choice(key, value.categories)
+                elif isinstance(value, dict):
+                    # Nested dict, recurse
+                    hp_space[key] = convert_search_space(value)
+                else:
+                    # Static value or unknown type
+                    hp_space[key] = value
+            return hp_space
+
+        # Create objective function for HyperOpt
+        def objective(params):
+            """Objective function to minimize."""
+            try:
+                # Convert numpy types to native Python types
+                params = serialize_for_json(params)
+
+                # Use existing trainable function based on perform_crossval
+                if self.perform_crossval:
+                    score = trainable_cv(
+                        params,
+                        x_train,
+                        y_train,
+                        x_val,
+                        y_val,
+                        model.model_name,
+                        self.target_type,
+                        self.experiment_name,
+                        self.target_number,
+                        model.create_model,
+                        n_splits=3,
+                        plot=model.plot,
+                        log_dir=model.log_dir,
+                        target_clf_thresholds=self.target_clf_thresholds,
+                        time_series=self.time_series,
+                        recurrent=model.recurrent,
+                    )
+                else:
+                    score, _, _ = trainable(
+                        params,
+                        x_train,
+                        y_train,
+                        x_val,
+                        y_val,
+                        model.model_name,
+                        self.target_type,
+                        self.experiment_name,
+                        self.target_number,
+                        model.create_model,
+                        plot=model.plot,
+                        log_dir=model.log_dir,
+                        target_clf_thresholds=self.target_clf_thresholds,
+                    )
+
+                # HyperOpt minimizes, so return the metric directly
+                loss = score[self.metric]
+
+                # Log trial info
+                logger.info(f"Trial completed - {self.metric}: {loss:.4f}")
+
+                return {
+                    "loss": loss,
+                    "status": STATUS_OK,
+                    "score": score,  # Keep full score dict for analysis
+                }
+
+            except Exception as e:
+                logger.error(f"Trial failed: {str(e)}")
+                return {"loss": float("inf"), "status": STATUS_OK, "error": str(e)}
+
+        # Convert search space
+        hp_search_space = convert_search_space(model.search_params)
+
+        # Run optimization
+        trials = Trials()
+        best_params = fmin(
+            fn=objective,
+            space=hp_search_space,
+            algo=tpe.suggest,
+            max_evals=self.number_of_trials,
+            trials=trials,
+            verbose=True,
+            show_progressbar=True,
+        )
+
+        # Get the actual parameter values (not just indices for hp.choice)
+        best_params = space_eval(hp_search_space, best_params)
+
+        # Convert numpy types to native Python types
+        best_params = serialize_for_json(best_params)
+
+        # Get best score from trials
+        best_trial_idx = np.argmin([t["result"]["loss"] for t in trials.trials])
+        best_score = trials.trials[best_trial_idx]["result"].get("score", {})
+
+        # Log results
+        logger.info(f"Best hyperparameters found were:\n{best_params}")
+        logger.info(f"Best Scores found were:\n{best_score}")
+
+        # Create summary DataFrame for consistency with Ray version
+        results_df = pd.DataFrame(
+            [
+                {
+                    "trial_id": i,
+                    self.metric: t["result"]["loss"],
+                    **{
+                        k: v
+                        for k, v in t["result"].get("score", {}).items()
+                        if isinstance(v, (int, float))
+                    },
+                }
+                for i, t in enumerate(trials.trials)
+                if t["result"]["status"] == STATUS_OK
+            ]
+        )
+
+        if not results_df.empty:
+            logger.info(f"Markdown table with all trials :\n{results_df.to_markdown()}")
 
-
-
+        # Save trial history for analysis
+        trials_path = f"{self.results_dir}/hyperopt_trials.pkl"
+        with open(trials_path, "wb") as f:
+            pickle.dump(trials, f)
+
+        return best_params
+
+    def hyperoptimize_ray(self, x_train, y_train, x_val, y_val, model: BaseModel):
 
         def collect_error_logs(target_dir: int, storage_path: str):
             output_error_file = f"{target_dir}/errors.log"
@@ -1269,9 +1608,22 @@ class ModelSelectionEngine:
                 }
             )
 
+        # Choose between regular trainable or CV version based on perform_crossval flag
+        # perform_crossval controls whether to use CV during hyperopt
+        if self.perform_crossval:
+            trainable_fn = trainable_cv
+            additional_params = {
+                "n_splits": 3,  # Can be made configurable
+                "time_series": self.time_series,  # Controls whether to use TimeSeriesSplit or StratifiedKFold
+                "recurrent": model.recurrent,
+            }
+        else:
+            trainable_fn = trainable
+            additional_params = {}
+
         tuner = Tuner(
             trainable=with_parameters(
-
+                trainable_fn,
                 x_train=x_train,
                 y_train=y_train,
                 x_val=x_val,
@@ -1281,10 +1633,10 @@ class ModelSelectionEngine:
                 experiment_name=self.experiment_name,
                 target_number=self.target_number,
                 create_model=model.create_model,
-                type_name="hyperopts",
                 plot=model.plot,
                 log_dir=model.log_dir,
                 target_clf_thresholds=self.target_clf_thresholds,
+                **additional_params,
             ),
             param_space=model.search_params,
             tune_config=TuneConfig(
@@ -1324,7 +1676,7 @@ class ModelSelectionEngine:
 
         return best_params
 
-    def train_model(self, params, x_train, y_train, x_val, y_val, model:
+    def train_model(self, params, x_train, y_train, x_val, y_val, model: BaseModel):
         # Use the standalone training function to avoid duplication
         # For train_model, we pass the data directly (not as Ray references)
         return trainable(
@@ -1338,7 +1690,6 @@ class ModelSelectionEngine:
             self.experiment_name,
             self.target_number,
             model.create_model,
-            self.type_name,
             model.plot,
             log_dir=model.log_dir,
             target_clf_thresholds=self.target_clf_thresholds,
@@ -1444,11 +1795,11 @@ def evaluate(
         y_pred_proba = (
             prediction[1] if num_classes == 2 else prediction.iloc[:, 2:].values
         )
-        if num_classes > 2:
-
-
-
-
+        # if num_classes > 2:
+        #     lb = LabelBinarizer(sparse_output=False)  # Change to True for sparse matrix
+        #     lb.fit(labels)
+        #     y_true_onhot = lb.transform(y_true)
+        #     y_pred_onehot = lb.transform(y_pred)
 
         score["LOGLOSS"] = log_loss(y_true, y_pred_proba)
         score["ACCURACY"] = accuracy_score(y_true, y_pred)
@@ -1825,6 +2176,20 @@ class Thresholds(BaseModel):
 def find_best_threshold(
     prediction: pd.DataFrame, metric: str = "recall", target_value: float | None = None
 ) -> Thresholds:
+    def _normalize_class_label(cls):
+        if isinstance(cls, (np.integer, int)):
+            return int(cls)
+        if isinstance(cls, (float, np.floating)) and cls.is_integer():
+            return int(cls)
+        if isinstance(cls, str):
+            try:
+                as_float = float(cls)
+                if as_float.is_integer():
+                    return int(as_float)
+            except ValueError:
+                pass
+        return cls
+
     """
     General function to find best threshold optimizing recall, precision, or f1.
 
@@ -1843,10 +2208,15 @@ def find_best_threshold(
     pred_cols = [
         col for col in prediction.columns if col not in ["ID", "TARGET", "PRED"]
     ]
-    classes =
+    classes = (
+        [1]
+        if len(pred_cols) <= 2
+        else sorted({_normalize_class_label(cls) for cls in y_true.unique()}, key=str)
+    )
 
     results = {}
-    for
+    for raw_cls in classes:
+        cls = _normalize_class_label(raw_cls)
         cls_str = str(cls)
         if cls_str not in prediction.columns and cls not in prediction.columns:
             logger.warning(f"Missing predicted probabilities for class '{cls}'")
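Given the `_normalize_class_label` helper added above, integer labels, integer-valued floats, and numeric strings all collapse to the same key, so `1`, `1.0`, and `"1.0"` address the same probability column while non-numeric labels pass through untouched:

    import numpy as np

    assert _normalize_class_label(np.int64(1)) == 1
    assert _normalize_class_label(1.0) == 1
    assert _normalize_class_label("1.0") == 1
    assert _normalize_class_label("2") == 2
    assert _normalize_class_label("positive") == "positive"  # non-numeric passes through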