lecrapaud 0.19.0__py3-none-any.whl → 0.22.6__py3-none-any.whl
This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.
- lecrapaud/__init__.py +22 -1
- lecrapaud/{api.py → base.py} +331 -241
- lecrapaud/config.py +15 -3
- lecrapaud/db/alembic/versions/2025_10_25_0635-07e303521594_add_unique_constraint_to_score.py +39 -0
- lecrapaud/db/alembic/versions/2025_10_26_1727-033e0f7eca4f_merge_score_and_model_trainings_into_.py +264 -0
- lecrapaud/db/alembic/versions/2025_10_28_2006-0a8fb7826e9b_add_number_of_targets_and_remove_other_.py +75 -0
- lecrapaud/db/models/__init__.py +2 -4
- lecrapaud/db/models/base.py +116 -65
- lecrapaud/db/models/experiment.py +195 -182
- lecrapaud/db/models/feature_selection.py +0 -3
- lecrapaud/db/models/feature_selection_rank.py +0 -18
- lecrapaud/db/models/model_selection.py +2 -2
- lecrapaud/db/models/{score.py → model_selection_score.py} +29 -12
- lecrapaud/db/session.py +4 -0
- lecrapaud/experiment.py +44 -17
- lecrapaud/feature_engineering.py +45 -674
- lecrapaud/feature_preprocessing.py +1202 -0
- lecrapaud/feature_selection.py +145 -332
- lecrapaud/integrations/sentry_integration.py +46 -0
- lecrapaud/misc/tabpfn_tests.ipynb +2 -2
- lecrapaud/mixins.py +247 -0
- lecrapaud/model_preprocessing.py +295 -0
- lecrapaud/model_selection.py +612 -242
- lecrapaud/pipeline.py +548 -0
- lecrapaud/search_space.py +2 -1
- lecrapaud/utils.py +36 -3
- lecrapaud-0.22.6.dist-info/METADATA +423 -0
- lecrapaud-0.22.6.dist-info/RECORD +51 -0
- {lecrapaud-0.19.0.dist-info → lecrapaud-0.22.6.dist-info}/WHEEL +1 -1
- {lecrapaud-0.19.0.dist-info → lecrapaud-0.22.6.dist-info/licenses}/LICENSE +1 -1
- lecrapaud/db/models/model_training.py +0 -64
- lecrapaud/jobs/__init__.py +0 -13
- lecrapaud/jobs/config.py +0 -17
- lecrapaud/jobs/scheduler.py +0 -30
- lecrapaud/jobs/tasks.py +0 -17
- lecrapaud-0.19.0.dist-info/METADATA +0 -249
- lecrapaud-0.19.0.dist-info/RECORD +0 -48
lecrapaud/{api.py → base.py}
RENAMED
@@ -5,24 +5,22 @@ It allows for end-to-end ML workflows including data preprocessing, feature engi
 model training, and prediction.

 Basic Usage:
-    # Create a LeCrapaud instance
-    lc = LeCrapaud()
-
     # Create a new experiment
-    experiment =
+    experiment = LeCrapaud(data=data, target_numbers=[1], target_clf=[1])

-    # Train
-
+    # Train the model
+    experiment.fit(data)

     # Make predictions
     predictions, scores_reg, scores_clf = experiment.predict(new_data)

-    #
-
-
-
-
-
+    # Load existing experiment
+    experiment = LeCrapaud(id=123)
+    predictions = experiment.predict(new_data)
+
+    # Class methods for experiment management
+    best_exp = LeCrapaud.get_best_experiment_by_name('my_experiment')
+    all_exps = LeCrapaud.list_experiments('my_experiment')
 """

 import joblib
@@ -34,84 +32,172 @@ import seaborn as sns
 import numpy as np
 import matplotlib.pyplot as plt
 from lecrapaud.db.session import init_db
-from lecrapaud.feature_selection import
+from lecrapaud.feature_selection import FeatureSelector
+from lecrapaud.model_preprocessing import ModelPreprocessor
 from lecrapaud.model_selection import (
-
-
+    ModelSelector,
+    BaseModel,
     evaluate,
     load_model,
     plot_threshold,
     plot_evaluation_for_classification,
 )
-from lecrapaud.feature_engineering import
+from lecrapaud.feature_engineering import FeatureEngineering
+from lecrapaud.feature_preprocessing import FeaturePreprocessor
 from lecrapaud.experiment import create_experiment
 from lecrapaud.db import Experiment
-from lecrapaud.search_space import normalize_models_idx
+from lecrapaud.search_space import normalize_models_idx, all_models
 from lecrapaud.utils import logger
 from lecrapaud.directories import tmp_dir


 class LeCrapaud:
-    """
+    """
+    Unified LeCrapaud class for machine learning experiments.

-    This class provides
+    This class provides both the ML pipeline functionality and experiment management.
+    It can be initialized either with new data to create an experiment or with an
+    experiment ID to load an existing one.

-
-
-
+    Usage:
+        # Create new experiment
+        experiment = LeCrapaud(data=df, target_numbers=[1, 2], ...)

-
-
-        init_db(uri=uri)
+        # Load existing experiment
+        experiment = LeCrapaud(id=123)

-
-
+        # Train the model
+        experiment.fit(data)

-
-
-        **kwargs: Additional arguments to configure the experiment
+        # Make predictions
+        predictions = experiment.predict(new_data)

-
-
-
-
+    Args:
+        id (int, optional): ID of an existing experiment to load
+        data (pd.DataFrame, optional): Input data for a new experiment
+        uri (str, optional): Database connection URI
+        **kwargs: Additional configuration parameters
+    """

-    def
-
+    def __init__(
+        self, id: int = None, data: pd.DataFrame = None, uri: str = None, **kwargs
+    ):
+        """Initialize LeCrapaud with either new or existing experiment."""
+        # Initialize database connection
+        init_db(uri=uri)

-
-
-
+        if id:
+            # Load existing experiment
+            self.experiment = Experiment.get(id)
+            # Context from DB takes precedence over kwargs
+            effective_kwargs = {
+                **self.DEFAULT_PARAMS,
+                **kwargs,
+                **self.experiment.context,
+            }
+        else:
+            if data is None:
+                raise ValueError(
+                    "Either id or data must be provided. Data can be a path to a folder containing trained models"
+                )
+            # New experiment: merge defaults with provided kwargs
+            effective_kwargs = {**self.DEFAULT_PARAMS, **kwargs}

-
-
-
-
+        # Normalize models_idx if present
+        if "models_idx" in effective_kwargs:
+            effective_kwargs["models_idx"] = normalize_models_idx(
+                effective_kwargs["models_idx"]
+            )
+
+        # Set all parameters as instance attributes
+        for key, value in effective_kwargs.items():
+            setattr(self, key, value)

-
+        # Create experiment if new
+        if not id:
+            self.experiment = create_experiment(data=data, **effective_kwargs)
+
+        # Create directories
+        experiment_dir = f"{tmp_dir}/{self.experiment.name}"
+        preprocessing_dir = f"{experiment_dir}/preprocessing"
+        data_dir = f"{experiment_dir}/data"
+        os.makedirs(preprocessing_dir, exist_ok=True)
+        os.makedirs(data_dir, exist_ok=True)
+
+    # Default values for all experiment parameters
+    DEFAULT_PARAMS = {
+        # Feature Engineering
+        "columns_drop": [],
+        "columns_boolean": [],
+        "columns_date": [],
+        "columns_te_groupby": [],
+        "columns_te_target": [],
+        "for_training": True,
+        # Preprocessing
+        "time_series": False,
+        "val_size": 0.2,
+        "test_size": 0.2,
+        "columns_pca": [],
+        "pca_temporal": [],
+        "pca_cross_sectional": [],
+        "columns_onehot": [],
+        "columns_binary": [],
+        "columns_ordinal": [],
+        "columns_frequency": [],
+        # Feature Selection
+        "percentile": 20,
+        "corr_threshold": 80,
+        "max_features": 50,
+        "max_p_value_categorical": 0.05,
+        # Model Selection
+        "target_numbers": [],
+        "target_clf": [],
+        "models_idx": [],
+        "max_timesteps": 120,
+        "perform_hyperopt": True,
+        "number_of_trials": 20,
+        "perform_crossval": False,
+        "plot": True,
+        "preserve_model": True,
+        "target_clf_thresholds": {},
+        # Data structure
+        "date_column": None,
+        "group_column": None,
+    }
+
+    @classmethod
+    def get_default_params(cls):
+        """Get the default parameters for experiments."""
+        return cls.DEFAULT_PARAMS.copy()
+
+    def get_effective_context(self):
+        """Get the effective context (merged defaults + experiment context)."""
+        return {k: getattr(self, k, v) for k, v in self.DEFAULT_PARAMS.items()}
+
+    @classmethod
+    def get_last_experiment_by_name(cls, name: str, **kwargs):
         """Retrieve the last experiment by name."""
-        return
+        return cls(id=Experiment.get_last_by_name(name).id, **kwargs)

-
-
-    ) -> "ExperimentEngine":
+    @classmethod
+    def get_best_experiment_by_name(cls, name: str, **kwargs):
         """Retrieve the best experiment by score."""
-        best_exp = Experiment.get_best_by_score(name=name
+        best_exp = Experiment.get_best_by_score(name=name)
         if not best_exp:
             return None
-        return
+        return cls(id=best_exp.id, **kwargs)

-
-
+    @classmethod
+    def list_experiments(cls, name: str = None, limit: int = 1000):
+        """List all experiments in the database."""
+        return [
+            cls(id=exp.id) for exp in Experiment.get_all_by_name(name=name, limit=limit)
+        ]

-
-
-
-
-            dict: Dictionary containing experiment names as keys and their scores as values
-        """
-        # Get all experiments with the given name pattern
-        experiments = self.list_experiments(name=name)
+    @classmethod
+    def compare_experiment_scores(cls, name: str):
+        """Compare scores of experiments with matching names."""
+        experiments = cls.list_experiments(name=name)

         if not experiments:
             return {"error": f"No experiments found with name containing '{name}'"}
@@ -120,9 +206,7 @@ class LeCrapaud:

         for exp in experiments:
             for model_sel in exp.experiment.model_selections:
-
                 if model_sel.best_score:
-
                     scores = {
                         "rmse": model_sel.best_score["rmse"],
                         "logloss": model_sel.best_score["logloss"],
@@ -131,7 +215,6 @@ class LeCrapaud:
                         "roc_auc": model_sel.best_score["roc_auc"],
                     }
                     target_name = model_sel.target.name
-
                     comparison[exp.experiment.name][target_name] = scores
                 else:
                     logger.warning(
@@ -140,71 +223,143 @@ class LeCrapaud:

         return comparison

-
-
-    ) -> list["ExperimentEngine"]:
-        """List all experiments in the database."""
-        return [
-            ExperimentEngine(id=exp.id)
-            for exp in Experiment.get_all_by_name(name=name, limit=limit)
-        ]
-
-
-class ExperimentEngine:
-    """Engine for managing ML experiments.
-
-    This class handles the complete ML pipeline including feature engineering,
-    model training, and prediction. It can be initialized with either new data
-    or by loading an existing experiment by ID.
-
-    Args:
-        id (int, optional): ID of an existing experiment to load
-        data (pd.DataFrame, optional): Input data for a new experiment
-        **kwargs: Additional configuration parameters
-    """
+    # Main ML Pipeline Methods
+    # ========================

-    def
-        """
-
-            self.experiment = Experiment.get(id)
-            kwargs.update(self.experiment.context)
-            experiment_dir = f"{tmp_dir}/{self.experiment.name}"
-            preprocessing_dir = f"{experiment_dir}/preprocessing"
-            data_dir = f"{experiment_dir}/data"
-            os.makedirs(preprocessing_dir, exist_ok=True)
-            os.makedirs(data_dir, exist_ok=True)
-        else:
-            if data is None:
-                raise ValueError(
-                    "Either id or data must be provided. Data can be a path to a folder containing trained models"
-                )
-            self.experiment = create_experiment(data=data, **kwargs)
+    def fit(self, data, best_params=None):
+        """
+        Fit the complete ML pipeline on the provided data.

-
-
-
-                value = normalize_models_idx(value)
-            setattr(self, key, value)
+        Args:
+            data (pd.DataFrame): Input training data
+            best_params (dict, optional): Pre-defined best parameters

-
+        Returns:
+            self: Returns self for chaining
+        """
         logger.info("Running training...")

-
+        # Step 1: Feature Engineering
+        logger.info("Starting feature engineering...")
+        feature_eng = FeatureEngineering(experiment=self.experiment)
+        feature_eng.fit(data)
+        data_eng = feature_eng.get_data()
         logger.info("Feature engineering done.")

-
+        # Step 2: Feature Preprocessing (split data)
+        logger.info("Starting feature preprocessing...")
+        from lecrapaud.feature_preprocessing import split_data
+
+        train, val, test = split_data(data_eng, experiment=self.experiment)
+
+        # Apply feature preprocessing transformations
+        feature_preprocessor = FeaturePreprocessor(experiment=self.experiment)
+        feature_preprocessor.fit(train)
+        train = feature_preprocessor.transform(train)
+        if val is not None:
+            val = feature_preprocessor.transform(val)
+        if test is not None:
+            test = feature_preprocessor.transform(test)
         logger.info("Feature preprocessing done.")

-
+        # Step 3: Feature Selection (for each target)
+        logger.info("Starting feature selection...")
+        for target_number in self.target_numbers:
+            feature_selector = FeatureSelector(
+                experiment=self.experiment, target_number=target_number
+            )
+            feature_selector.fit(train)
+
+        # Refresh experiment to get updated features
+        self.experiment = Experiment.get(self.experiment.id)
+        all_features = self.experiment.get_all_features(
+            date_column=self.date_column, group_column=self.group_column
+        )
+        joblib.dump(
+            all_features, f"{self.experiment.path}/preprocessing/all_features.pkl"
+        )
         logger.info("Feature selection done.")

-
+        # Step 4: Model Preprocessing (scaling)
+        logger.info("Starting model preprocessing...")
+        model_preprocessor = ModelPreprocessor(experiment=self.experiment)
+
+        # Fit and transform training data, then transform val/test
+        model_preprocessor.fit(train)
+        train_scaled = model_preprocessor.transform(train)
+        val_scaled = model_preprocessor.transform(val) if val is not None else None
+        test_scaled = model_preprocessor.transform(test) if test is not None else None
+
+        # Create data dict for model selection (keep both raw and scaled splits)
+        std_data = {
+            "train": train,
+            "val": val,
+            "test": test,
+            "train_scaled": train_scaled,
+            "val_scaled": val_scaled,
+            "test_scaled": test_scaled,
+        }
+
+        # Handle time series reshaping if needed
+        reshaped_data = None
+        # Check if any model requires recurrent processing
+        need_reshaping = (
+            any(all_models[i].get("recurrent") for i in self.models_idx)
+            and self.time_series
+        )
+
+        if need_reshaping:
+            # Sanity check: make sure we have enough data for max_timesteps
+            if (
+                self.group_column
+                and train_scaled.groupby(self.group_column).size().min()
+                < self.max_timesteps
+            ) or train_scaled.shape[0] < self.max_timesteps:
+                raise ValueError(
+                    f"Not enough data for group_column {self.group_column} to reshape data for recurrent models"
+                )
+
+            from lecrapaud.model_preprocessing import reshape_time_series
+
+            features = self.experiment.get_all_features(
+                date_column=self.date_column, group_column=self.group_column
+            )
+            reshaped_data = reshape_time_series(
+                self.experiment,
+                features,
+                train_scaled,
+                val_scaled,
+                test_scaled,
+                timesteps=self.max_timesteps,
+            )
         logger.info("Model preprocessing done.")

-
+        # Step 5: Model Selection (for each target)
+        logger.info("Starting model selection...")
+        self.models_ = {}
+        for target_number in self.target_numbers:
+            model_selector = ModelSelector(
+                experiment=self.experiment, target_number=target_number
+            )
+            model_selector.fit(
+                std_data, reshaped_data=reshaped_data, best_params=best_params
+            )
+            self.models_[target_number] = model_selector.get_best_model()
         logger.info("Model selection done.")

+        return self
+
     def predict(self, new_data, verbose: int = 0):
+        """
+        Make predictions on new data using the trained pipeline.
+
+        Args:
+            new_data (pd.DataFrame): Input data for prediction
+            verbose (int): Verbosity level (0=warnings only, 1=all logs)
+
+        Returns:
+            tuple: (predictions_df, scores_regression, scores_classification)
+        """
         # for scores if TARGET is in columns
         scores_reg = []
         scores_clf = []
@@ -214,28 +369,67 @@ class ExperimentEngine:

         logger.warning("Running prediction...")

-        #
-
-
-
-        )
-
-
-
+        # Apply the same preprocessing pipeline as training
+        # Step 1: Feature Engineering
+        feature_eng = FeatureEngineering(experiment=self.experiment)
+        feature_eng.fit(new_data)
+        data = feature_eng.get_data()
+
+        # Step 2: Feature Preprocessing (no splitting for prediction)
+        feature_preprocessor = FeaturePreprocessor(experiment=self.experiment)
+        # Load existing transformations and apply
+        data = feature_preprocessor.transform(data)
+
+        # Step 3: Model Preprocessing (scaling)
+        model_preprocessor = ModelPreprocessor(experiment=self.experiment)
+        # Apply existing scaling
+        scaled_data = model_preprocessor.transform(data)
+
+        # Step 4: Time series reshaping if needed
+        reshaped_data = None
+        # Check if any model requires recurrent processing
+        need_reshaping = (
+            any(all_models[i].get("recurrent") for i in self.models_idx)
+            and self.time_series
         )

-
+        if need_reshaping:
+            # Sanity check: make sure we have enough data for max_timesteps
+            if (
+                self.group_column
+                and scaled_data.groupby(self.group_column).size().min()
+                < self.max_timesteps
+            ) or scaled_data.shape[0] < self.max_timesteps:
+                raise ValueError(
+                    f"Not enough data for group_column {self.group_column} to reshape data for recurrent models"
+                )

-
+            from lecrapaud.model_preprocessing import reshape_time_series
+
+            all_features = self.experiment.get_all_features(
+                date_column=self.date_column, group_column=self.group_column
+            )
+            # For prediction, we reshape the entire dataset
+            reshaped_data = reshape_time_series(
+                self.experiment, all_features, scaled_data, timesteps=self.max_timesteps
+            )
+            reshaped_data = reshaped_data[
+                "x_train_reshaped"
+            ]  # Only need X data for prediction
+
+        # Step 5: Predict for each target
+        for target_number in self.target_numbers:
+            # Load the trained model
             target_dir = f"{self.experiment.path}/TARGET_{target_number}"
+            model = BaseModel(path=target_dir, target_number=target_number)
+
+            # Get features for this target
             all_features = self.experiment.get_all_features(
                 date_column=self.date_column, group_column=self.group_column
             )
             features = self.experiment.get_features(target_number)

-
-
-            # getting data
+            # Prepare prediction data
             if model.recurrent:
                 features_idx = [
                     i for i, e in enumerate(all_features) if e in set(features)
@@ -244,16 +438,14 @@ class ExperimentEngine:
             else:
                 x_pred = scaled_data[features] if model.need_scaling else data[features]

-            #
+            # Make prediction
             y_pred = model.predict(x_pred)

-            #
+            # Fix index for recurrent models
             if model.recurrent:
-                y_pred.index =
-                    new_data.index
-                )  # TODO: not sure this will work for old experiment not aligned with data_for_training for test use case (done, this is why we decode the test set)
+                y_pred.index = new_data.index

-            #
+            # Unscale prediction if needed
             if (
                 model.need_scaling
                 and model.target_type == "regression"
@@ -267,7 +459,7 @@ class ExperimentEngine:
             )
             y_pred.name = "PRED"

-            #
+            # Evaluate if target is present in new_data
             target_col = next(
                 (
                     col
@@ -291,126 +483,21 @@ class ExperimentEngine:
             else:
                 scores_reg.append(score)

-            #
+            # Add predictions to the output dataframe
             if isinstance(y_pred, pd.DataFrame):
                 y_pred = y_pred.add_prefix(f"TARGET_{target_number}_")
                 new_data = pd.concat([new_data, y_pred], axis=1)
-
             else:
                 y_pred.name = f"TARGET_{target_number}_PRED"
                 new_data = pd.concat([new_data, y_pred], axis=1)

+        # Format scores
         if len(scores_reg) > 0:
             scores_reg = pd.DataFrame(scores_reg).set_index("TARGET")
         if len(scores_clf) > 0:
             scores_clf = pd.DataFrame(scores_clf).set_index("TARGET")
-        return new_data, scores_reg, scores_clf
-
-    def feature_engineering(self, data, for_training=True):
-        app = FeatureEngineeringEngine(
-            data=data,
-            columns_drop=getattr(self, "columns_drop", []),
-            columns_boolean=getattr(self, "columns_boolean", []),
-            columns_date=getattr(self, "columns_date", []),
-            columns_te_groupby=getattr(self, "columns_te_groupby", []),
-            columns_te_target=getattr(self, "columns_te_target", []),
-            for_training=getattr(self, "for_training", True),
-        )
-        data = app.run()
-        return data
-
-    def preprocess_feature(self, data, for_training=True):
-        app = PreprocessFeature(
-            data=data,
-            experiment=getattr(self, "experiment", None),
-            time_series=getattr(self, "time_series", False),
-            date_column=getattr(self, "date_column", None),
-            group_column=getattr(self, "group_column", None),
-            val_size=getattr(self, "val_size", 0.2),
-            test_size=getattr(self, "test_size", 0.2),
-            columns_pca=getattr(self, "columns_pca", []),
-            pca_temporal=getattr(self, "pca_temporal", []),
-            pca_cross_sectional=getattr(self, "pca_cross_sectional", []),
-            columns_onehot=getattr(self, "columns_onehot", []),
-            columns_binary=getattr(self, "columns_binary", []),
-            columns_ordinal=getattr(self, "columns_ordinal", []),
-            columns_frequency=getattr(self, "columns_frequency", []),
-            target_numbers=getattr(self, "target_numbers", []),
-            target_clf=getattr(self, "target_clf", []),
-        )
-        if for_training:
-            train, val, test = app.run()
-            return train, val, test
-        else:
-            data = app.inference()
-            return data

-
-        for target_number in self.target_numbers:
-            app = FeatureSelectionEngine(
-                train=train,
-                target_number=target_number,
-                experiment=self.experiment,
-                target_clf=self.target_clf,
-            )
-            app.run()
-        self.experiment = Experiment.get(self.experiment.id)
-        all_features = self.experiment.get_all_features(
-            date_column=self.date_column, group_column=self.group_column
-        )
-        joblib.dump(
-            all_features, f"{self.experiment.path}/preprocessing/all_features.pkl"
-        )
-        return all_features
-
-    def preprocess_model(self, train, val=None, test=None, for_training=True):
-        app = PreprocessModel(
-            train=train,
-            val=val,
-            test=test,
-            experiment=getattr(self, "experiment", None),
-            target_numbers=getattr(self, "target_numbers", []),
-            target_clf=getattr(self, "target_clf", []),
-            models_idx=getattr(self, "models_idx", []),
-            time_series=getattr(self, "time_series", False),
-            max_timesteps=getattr(self, "max_timesteps", 120),
-            date_column=getattr(self, "date_column", None),
-            group_column=getattr(self, "group_column", None),
-        )
-        if for_training:
-            data, reshaped_data = app.run()
-            return data, reshaped_data
-        else:
-            data, scaled_data, reshaped_data = app.inference()
-            return data, scaled_data, reshaped_data
-
-    def model_selection(self, data, reshaped_data, best_params=None):
-        for target_number in self.target_numbers:
-            app = ModelSelectionEngine(
-                data=data,
-                reshaped_data=reshaped_data,
-                target_number=target_number,
-                experiment=getattr(self, "experiment", None),
-                target_clf=getattr(self, "target_clf", []),
-                models_idx=getattr(self, "models_idx", []),
-                time_series=getattr(self, "time_series", False),
-                date_column=getattr(self, "date_column", None),
-                group_column=getattr(self, "group_column", None),
-                target_clf_thresholds=getattr(self, "target_clf_thresholds", {}),
-            )
-            if best_params and target_number not in best_params.keys():
-                raise ValueError(
-                    f"Target {target_number} not found in best_params passed as argument"
-                )
-            app.run(
-                self.experiment_name,
-                perform_hyperopt=self.perform_hyperopt,
-                number_of_trials=self.number_of_trials,
-                perform_crossval=self.perform_crossval,
-                plot=self.plot,
-                preserve_model=self.preserve_model,
-                best_params=best_params[target_number] if best_params else None,
-            )
+        return new_data, scores_reg, scores_clf

     def get_scores(self, target_number: int):
         return pd.read_csv(
@@ -475,6 +562,9 @@ class ExperimentEngine:
             # For lightgbm models
             importances = model.feature_importance(importance_type="split")
             importance_type = "Split"
+        elif hasattr(model, "get_feature_importance"):
+            importances = model.get_feature_importance()
+            importance_type = "Feature importance"
         elif hasattr(model, "coef_"):
             # For linear models
             importances = np.abs(model.coef_.flatten())
@@ -557,7 +647,7 @@ class ExperimentEngine:
         if not os.path.exists(params_file):
             raise FileNotFoundError(
                 f"Best parameters file not found at {params_file}. "
-                "Make sure to
+                "Make sure to fit model training first."
             )

         try: