lecrapaud 0.18.7__py3-none-any.whl → 0.22.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lecrapaud/__init__.py +22 -1
- lecrapaud/{api.py → base.py} +331 -241
- lecrapaud/config.py +15 -3
- lecrapaud/db/alembic/versions/2025_08_25_1434-7ed9963e732f_add_best_score_to_model_selection.py +9 -4
- lecrapaud/db/alembic/versions/2025_08_28_1516-c36e9fee22b9_add_avg_precision_to_score.py +34 -0
- lecrapaud/db/alembic/versions/2025_08_28_1622-8b11c1ba982e_change_name_column.py +44 -0
- lecrapaud/db/alembic/versions/2025_10_25_0635-07e303521594_add_unique_constraint_to_score.py +39 -0
- lecrapaud/db/alembic/versions/2025_10_26_1727-033e0f7eca4f_merge_score_and_model_trainings_into_.py +264 -0
- lecrapaud/db/alembic/versions/2025_10_28_2006-0a8fb7826e9b_add_number_of_targets_and_remove_other_.py +75 -0
- lecrapaud/db/models/__init__.py +2 -4
- lecrapaud/db/models/base.py +122 -67
- lecrapaud/db/models/experiment.py +196 -183
- lecrapaud/db/models/feature_selection.py +0 -3
- lecrapaud/db/models/feature_selection_rank.py +0 -18
- lecrapaud/db/models/model_selection.py +2 -2
- lecrapaud/db/models/{score.py → model_selection_score.py} +30 -12
- lecrapaud/db/session.py +33 -4
- lecrapaud/experiment.py +44 -17
- lecrapaud/feature_engineering.py +45 -674
- lecrapaud/feature_preprocessing.py +1202 -0
- lecrapaud/feature_selection.py +145 -332
- lecrapaud/integrations/sentry_integration.py +46 -0
- lecrapaud/misc/tabpfn_tests.ipynb +2 -2
- lecrapaud/mixins.py +247 -0
- lecrapaud/model_preprocessing.py +295 -0
- lecrapaud/model_selection.py +725 -249
- lecrapaud/pipeline.py +548 -0
- lecrapaud/search_space.py +38 -1
- lecrapaud/utils.py +36 -3
- lecrapaud-0.22.6.dist-info/METADATA +423 -0
- lecrapaud-0.22.6.dist-info/RECORD +51 -0
- {lecrapaud-0.18.7.dist-info → lecrapaud-0.22.6.dist-info}/WHEEL +1 -1
- {lecrapaud-0.18.7.dist-info → lecrapaud-0.22.6.dist-info/licenses}/LICENSE +1 -1
- lecrapaud/db/models/model_training.py +0 -64
- lecrapaud/jobs/__init__.py +0 -13
- lecrapaud/jobs/config.py +0 -17
- lecrapaud/jobs/scheduler.py +0 -30
- lecrapaud/jobs/tasks.py +0 -17
- lecrapaud-0.18.7.dist-info/METADATA +0 -248
- lecrapaud-0.18.7.dist-info/RECORD +0 -46
lecrapaud/mixins.py
ADDED
```python
"""
Sklearn-compatible mixins for LeCrapaud components.

This module provides base classes and mixins that ensure components are compatible
with scikit-learn conventions and can be used in sklearn pipelines.
"""

from sklearn.base import BaseEstimator, TransformerMixin
from abc import ABC, abstractmethod
from lecrapaud.db import Experiment


class LeCrapaudTransformerMixin(BaseEstimator, TransformerMixin):
    """
    Base mixin for LeCrapaud transformers that ensures sklearn compatibility.

    This mixin provides the basic structure that all LeCrapaud transformers
    should follow to be compatible with sklearn pipelines.
    """

    def __init__(self, experiment: Experiment = None, **kwargs):
        """
        Initialize the transformer.

        Args:
            experiment: LeCrapaud experiment context
            **kwargs: Additional parameters (take priority over experiment.context)
        """
        self.experiment = experiment

        # First, set parameters from experiment context if available
        if experiment and hasattr(experiment, "context") and experiment.context:
            for key, value in experiment.context.items():
                setattr(self, key, value)

        # Then override with explicit kwargs (kwargs have priority)
        for key, value in kwargs.items():
            setattr(self, key, value)

    def get_params(self, deep=True):
        """
        Get parameters for this estimator (sklearn compatibility).

        Args:
            deep (bool): If True, will return the parameters for this estimator
                and contained subobjects that are estimators.

        Returns:
            dict: Parameter names mapped to their values.
        """
        params = {}
        for key in dir(self):
            if not key.startswith("_") and not callable(getattr(self, key)):
                value = getattr(self, key)
                if deep and hasattr(value, "get_params"):
                    deep_items = value.get_params().items()
                    params.update((f"{key}__{k}", v) for k, v in deep_items)
                params[key] = value
        return params

    def set_params(self, **params):
        """
        Set the parameters of this estimator (sklearn compatibility).

        Args:
            **params: Estimator parameters

        Returns:
            self: Estimator instance
        """
        for key, value in params.items():
            if "__" in key:
                # Handle nested parameters
                obj_name, param_name = key.split("__", 1)
                obj = getattr(self, obj_name)
                obj.set_params(**{param_name: value})
            else:
                setattr(self, key, value)
        return self

    def _set_fitted(self):
        """Mark the transformer as fitted (sklearn compatibility helper)."""
        self.is_fitted_ = True

    def _check_is_fitted(self):
        """Ensure the estimator has been fitted before usage."""
        if not getattr(self, "is_fitted_", False):
            raise ValueError("This estimator has not been fitted yet.")

    def _validate_data(self, X, y=None, reset=True):
        """Basic validation helper mirroring sklearn signature."""
        if X is None:
            raise ValueError("Input X cannot be None")
        return X, y

    @abstractmethod
    def fit(self, X, y=None):
        """
        Fit the transformer.

        Args:
            X: Input data
            y: Target values (optional)

        Returns:
            self: Returns self for chaining
        """
        pass

    @abstractmethod
    def transform(self, X):
        """
        Transform the input data.

        Args:
            X: Input data to transform

        Returns:
            Transformed data
        """
        pass

    def fit_transform(self, X, y=None):
        """
        Fit and transform in one step (provided by TransformerMixin).

        Args:
            X: Input data
            y: Target values (optional)

        Returns:
            Transformed data
        """
        return self.fit(X, y).transform(X)


class LeCrapaudEstimatorMixin(BaseEstimator):
    """
    Base mixin for LeCrapaud estimators (like selectors) that only have fit().
    """

    def __init__(self, experiment: Experiment = None, **kwargs):
        """
        Initialize the estimator.

        Args:
            experiment: LeCrapaud experiment context
            **kwargs: Additional parameters (take priority over experiment.context)
        """
        self.experiment = experiment

        # First, set parameters from experiment context if available
        if experiment and hasattr(experiment, "context") and experiment.context:
            for key, value in experiment.context.items():
                setattr(self, key, value)

        # Then override with explicit kwargs (kwargs have priority)
        for key, value in kwargs.items():
            setattr(self, key, value)

    def get_params(self, deep=True):
        """Get parameters for this estimator (sklearn compatibility)."""
        params = {}
        for key in dir(self):
            if not key.startswith("_") and not callable(getattr(self, key)):
                value = getattr(self, key)
                if deep and hasattr(value, "get_params"):
                    deep_items = value.get_params().items()
                    params.update((f"{key}__{k}", v) for k, v in deep_items)
                params[key] = value
        return params

    def set_params(self, **params):
        """Set the parameters of this estimator (sklearn compatibility)."""
        for key, value in params.items():
            if "__" in key:
                # Handle nested parameters
                obj_name, param_name = key.split("__", 1)
                obj = getattr(self, obj_name)
                obj.set_params(**{param_name: value})
            else:
                setattr(self, key, value)
        return self

    def _set_fitted(self):
        """Mark the estimator as fitted."""
        self.is_fitted_ = True

    def _check_is_fitted(self):
        """Ensure the estimator has been fitted."""
        if not getattr(self, "is_fitted_", False):
            raise ValueError("This estimator has not been fitted yet.")

    def _validate_data(self, X, y=None, reset=True):
        """Basic validation helper mirroring sklearn signature."""
        if X is None:
            raise ValueError("Input X cannot be None")
        return X, y

    @abstractmethod
    def fit(self, X, y=None):
        """
        Fit the estimator.

        Args:
            X: Input data
            y: Target values (optional)

        Returns:
            self: Returns self for chaining
        """
        pass


class LeCrapaudPipelineCompatible:
    """
    Mixin for components that can be used in sklearn Pipeline.

    This ensures proper parameter passing and state management.
    """

    def _validate_data(self, X, y=None, reset=True):
        """
        Validate input data (sklearn convention).

        Args:
            X: Input data
            y: Target data (optional)
            reset (bool): Whether to reset internal state

        Returns:
            tuple: (X, y) validated data
        """
        # Basic validation - can be extended as needed
        if X is None:
            raise ValueError("Input X cannot be None")

        return X, y

    def _check_is_fitted(self):
        """Check if the transformer has been fitted."""
        if not hasattr(self, "is_fitted_") or not self.is_fitted_:
            raise ValueError("This transformer has not been fitted yet.")

    def _set_fitted(self):
        """Mark the transformer as fitted."""
        self.is_fitted_ = True
```
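The point of these mixins is that LeCrapaud components now slot into scikit-learn `Pipeline` objects. A minimal sketch of what that enables, assuming the 0.22.6 wheel is installed; `AddOneTransformer` is a hypothetical subclass written for illustration, not part of the package:

```python
import pandas as pd
from sklearn.pipeline import Pipeline

from lecrapaud.mixins import LeCrapaudTransformerMixin


class AddOneTransformer(LeCrapaudTransformerMixin):
    """Hypothetical toy transformer: shifts every column by one."""

    def fit(self, X, y=None):
        X, y = self._validate_data(X, y)
        self._set_fitted()  # sets is_fitted_, later checked by _check_is_fitted
        return self

    def transform(self, X):
        self._check_is_fitted()  # raises ValueError if fit was never called
        return X + 1


pipe = Pipeline([("add_one", AddOneTransformer())])
print(pipe.fit_transform(pd.DataFrame({"a": [1, 2, 3]})))  # a -> [2, 3, 4]

# Nested "step__param" keys are routed through the mixin's set_params, sklearn-style:
pipe.set_params(add_one__verbose=True)
```

Since `__init__` applies `experiment.context` first and explicit kwargs second, pipeline-level parameter overrides always win over experiment defaults.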
lecrapaud/model_preprocessing.py
ADDED

```python
import pandas as pd
import numpy as np
import joblib
from typing import Optional
import os

from sklearn.preprocessing import StandardScaler

from lecrapaud.utils import logger
from lecrapaud.search_space import all_models
from lecrapaud.mixins import LeCrapaudTransformerMixin
from lecrapaud.db import Experiment


class ModelPreprocessor(LeCrapaudTransformerMixin):

    def __init__(
        self,
        experiment=None,
        **kwargs,
    ):
        # The mixin will automatically set all experiment.context parameters as attributes
        super().__init__(experiment=experiment, **kwargs)

        # Set defaults for required parameters if not provided
        if not hasattr(self, "target_numbers"):
            self.target_numbers = []
        if not hasattr(self, "target_clf"):
            self.target_clf = []
        if not hasattr(self, "models_idx"):
            self.models_idx = []
        if not hasattr(self, "time_series"):
            self.time_series = False
        if not hasattr(self, "max_timesteps"):
            self.max_timesteps = 120
        if not hasattr(self, "group_column"):
            self.group_column = None
        if not hasattr(self, "date_column"):
            self.date_column = None

        # Set paths if experiment is available
        if self.experiment:
            self.experiment_dir = self.experiment.path
            self.data_dir = f"{self.experiment_dir}/data"
            self.preprocessing_dir = f"{self.experiment_dir}/preprocessing"

            self.all_features = self.experiment.get_all_features(
                date_column=self.date_column, group_column=self.group_column
            )

    def fit(self, X, y=None):
        """
        Fit the model preprocessor (learns scaling parameters).

        Args:
            X (pd.DataFrame): Training data
            y: Target values (ignored)

        Returns:
            self: Returns self for chaining
        """
        X, y = self._validate_data(X, y)

        # Filter columns to keep only features and targets
        if hasattr(self, "all_features"):
            columns_to_keep = self.all_features + [
                f"TARGET_{i}" for i in self.target_numbers
            ]
            duplicates = [
                col for col in set(columns_to_keep) if columns_to_keep.count(col) > 1
            ]
            if duplicates:
                raise ValueError(
                    f"Duplicates detected in columns_to_keep: {duplicates}"
                )
            X = X[columns_to_keep]

        # Determine if we need scaling
        self.need_scaling_ = any(
            t not in self.target_clf for t in self.target_numbers
        ) and any(all_models[i].get("need_scaling") for i in self.models_idx)

        if self.need_scaling_:
            logger.info("Fitting scalers...")
            _, self.scaler_x_, self.scalers_y_ = self.scale_data(X)

            # Save scalers if experiment is available
            if self.experiment:
                joblib.dump(self.scaler_x_, f"{self.preprocessing_dir}/scaler_x.pkl")
                # Save target scalers
                for target_number in self.target_numbers:
                    if target_number not in self.target_clf:
                        target_dir = f"{self.experiment_dir}/TARGET_{target_number}"
                        scaler_y = self.scalers_y_[f"scaler_y_{target_number}"]
                        joblib.dump(scaler_y, f"{target_dir}/scaler_y.pkl")

        self._set_fitted()
        return self

    def transform(self, X):
        """
        Transform the input data (apply scaling if fitted).

        Args:
            X (pd.DataFrame): Input data

        Returns:
            pd.DataFrame: Scaled data (or original if no scaling needed)
        """
        # Allow loading persisted artifacts even in a fresh instance
        if not getattr(self, "is_fitted_", False) and self.experiment:
            scaler_path = f"{self.preprocessing_dir}/scaler_x.pkl"
            if os.path.exists(scaler_path):
                self.is_fitted_ = True

        self._check_is_fitted()
        X, _ = self._validate_data(X, reset=False)

        # Filter columns if needed
        if hasattr(self, "all_features"):
            columns_to_keep = self.all_features + [
                f"TARGET_{i}" for i in self.target_numbers if f"TARGET_{i}" in X.columns
            ]
            X = X[columns_to_keep]

        # Load scalers if not in memory
        if not hasattr(self, "scaler_x_") and self.experiment:
            scaler_path = f"{self.preprocessing_dir}/scaler_x.pkl"
            if os.path.exists(scaler_path):
                self.scaler_x_ = joblib.load(scaler_path)

        # Apply scaling if needed
        if (
            hasattr(self, "need_scaling_")
            and self.need_scaling_
            and hasattr(self, "scaler_x_")
        ):
            X_scaled, _, _ = self.scale_data(
                X, scaler_x=self.scaler_x_, scalers_y=getattr(self, "scalers_y_", None)
            )
            return X_scaled

        return X

    # scaling
    def scale_data(
        self,
        df: pd.DataFrame,
        scaler_x=None,
        scalers_y: Optional[list] = None,
    ):
        logger.info("Scale data...")
        X = df.loc[:, ~df.columns.str.contains("^TARGET_")]

        if scaler_x:
            X_scaled = pd.DataFrame(
                scaler_x.transform(X), columns=list(X.columns), index=X.index
            )
        else:
            scaler_x = StandardScaler()  # MinMaxScaler(feature_range=(-1,1))
            X_scaled = pd.DataFrame(
                scaler_x.fit_transform(X), columns=list(X.columns), index=X.index
            )

        # Determine which targets need to be scaled
        targets_numbers_to_scale = [
            i for i in self.target_numbers if i not in self.target_clf
        ]

        # Dictionary to store scaled target data
        scaled_targets = {}

        if scalers_y:
            for target_number in targets_numbers_to_scale:
                y = df[[f"TARGET_{target_number}"]]
                scaled_targets[target_number] = pd.DataFrame(
                    scalers_y[f"scaler_y_{target_number}"].transform(y.values),
                    columns=y.columns,
                    index=y.index,
                )
        else:
            scalers_y = {}
            for target_number in targets_numbers_to_scale:
                scaler_y = StandardScaler()
                y = df[[f"TARGET_{target_number}"]]

                scaled_y = pd.DataFrame(
                    scaler_y.fit_transform(y.values),
                    columns=y.columns,
                    index=y.index,
                )
                target_dir = f"{self.experiment_dir}/TARGET_{target_number}"
                joblib.dump(scaler_y, f"{target_dir}/scaler_y.pkl")

                scalers_y[f"scaler_y_{target_number}"] = scaler_y
                scaled_targets[target_number] = scaled_y

        # Reconstruct y_scaled in the original order
        y_scaled = pd.concat(
            [
                scaled_targets[target_number]
                for target_number in targets_numbers_to_scale
            ],
            axis=1,
        )
        y_not_scaled = df[
            df.columns.intersection([f"TARGET_{i}" for i in self.target_clf])
        ]

        # Ensure the final DataFrame keeps the original order
        df_scaled = pd.concat([X_scaled, y_scaled, y_not_scaled], axis=1)[
            df.columns
        ]  # Reorder columns to match original `df`

        if not df_scaled.columns.equals(df.columns):
            raise Exception("Columns are not in the same order after scaling.")

        return df_scaled, scaler_x, scalers_y
```
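Because the unfitted branch of `scale_data` persists each new target scaler under `self.experiment_dir`, calling it without an `Experiment` only works when pre-fitted target scalers are supplied. A minimal standalone sketch under that assumption, on synthetic data; note that `scalers_y` is annotated `Optional[list]` but is consumed as a dict keyed `scaler_y_{target_number}`:

```python
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

from lecrapaud.model_preprocessing import ModelPreprocessor

# No Experiment: parameters arrive as kwargs via LeCrapaudTransformerMixin.
pre = ModelPreprocessor(target_numbers=[1, 2], target_clf=[2], models_idx=[])

df = pd.DataFrame({
    "feat_a": np.random.randn(50),
    "feat_b": np.random.randn(50) * 10,
    "TARGET_1": np.random.randn(50) * 5,      # regression target: scaled
    "TARGET_2": np.random.randint(0, 2, 50),  # classification target: untouched
})

# A pre-fitted scaler avoids the persistence branch (which needs experiment_dir).
scalers_y = {"scaler_y_1": StandardScaler().fit(df[["TARGET_1"]].values)}
df_scaled, scaler_x, scalers_y = pre.scale_data(df, scalers_y=scalers_y)
print(df_scaled.columns.equals(df.columns))  # True: column order is preserved
```

The module then closes with a module-level helper that turns tabular splits into 3D tensors for recurrent models: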
```python
# Reshape into 3D tensors for recurrent models
def reshape_time_series(
    experiment: Experiment,
    features: list,
    train: pd.DataFrame,
    val: pd.DataFrame = None,
    test: pd.DataFrame = None,
    timesteps: int = 120,
):
    # Always scale for recurrent layers: train should be scaled
    group_column = experiment.context.group_column

    target_columns = train.columns.intersection(
        [f"TARGET_{i}" for i in experiment.context.target_numbers]
    )

    data = pd.concat([train, val, test], axis=0)

    def reshape_df(df: pd.DataFrame, group_series: pd.Series, timesteps: int):
        fill_value = [[[0] * len(df.columns)]]

        def shiftsum(x, timesteps: int):
            tmp = x.copy()
            for i in range(1, timesteps):
                tmp = x.shift(i, fill_value=fill_value) + tmp
            return tmp

        logger.info("Grouping each feature in a unique column with list...")
        df_reshaped = df.apply(list, axis=1).apply(lambda x: [list(x)])
        df_reshaped = pd.concat([df_reshaped, group_series], axis=1)

        logger.info("Grouping features and creating timesteps...")
        df_reshaped = (
            df_reshaped.groupby(group_column)[0]
            .apply(lambda x: shiftsum(x, timesteps))
            .reset_index(group_column, drop=True)
            .rename("RECURRENT_FEATURES")
        )
        df_reshaped = pd.DataFrame(df_reshaped)

        return df_reshaped

    data_reshaped = reshape_df(data[features], data[group_column], timesteps)

    data_reshaped[target_columns] = data[target_columns]

    logger.info("Separating train, val, test data and creating np arrays...")
    train_reshaped = data_reshaped.loc[train.index]

    x_train_reshaped = np.array(train_reshaped["RECURRENT_FEATURES"].values.tolist())
    y_train_reshaped = np.array(train_reshaped[target_columns].reset_index())

    reshaped_data = {
        "x_train_reshaped": x_train_reshaped,
        "y_train_reshaped": y_train_reshaped,
    }

    if val is not None:
        val_reshaped = data_reshaped.loc[val.index]
        x_val_reshaped = np.array(val_reshaped["RECURRENT_FEATURES"].values.tolist())
        y_val_reshaped = np.array(val_reshaped[target_columns].reset_index())
        reshaped_data["x_val_reshaped"] = x_val_reshaped
        reshaped_data["y_val_reshaped"] = y_val_reshaped

    if test is not None:
        test_reshaped = data_reshaped.loc[test.index]
        x_test_reshaped = np.array(test_reshaped["RECURRENT_FEATURES"].values.tolist())
        y_test_reshaped = np.array(test_reshaped[target_columns].reset_index())
        reshaped_data["x_test_reshaped"] = x_test_reshaped
        reshaped_data["y_test_reshaped"] = y_test_reshaped

    return reshaped_data
```
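The `shiftsum` trick above builds each sample's window by concatenating Python lists held inside pandas cells. Functionally it is a per-group sliding window, zero-padded before the start of each group, oldest timestep first and the current row last; the function then returns those windows as `x_*_reshaped` arrays per split alongside the corresponding `y_*_reshaped` target arrays. A plain numpy/pandas illustration of that windowing scheme (a hypothetical helper written for illustration, not part of the package):

```python
import numpy as np
import pandas as pd


def windows_per_group(df, features, group_column, timesteps):
    """For each row, stack the last `timesteps` rows of its group,
    zero-padding before the group starts (oldest step first)."""
    out = []
    for _, g in df.groupby(group_column, sort=False):
        vals = g[features].to_numpy(dtype=float)
        # Prepend timesteps-1 rows of zeros so early rows get full windows
        padded = np.vstack([np.zeros((timesteps - 1, len(features))), vals])
        out.append(np.stack([padded[t : t + timesteps] for t in range(len(vals))]))
    return np.concatenate(out)  # shape: (n_rows, timesteps, n_features)


df = pd.DataFrame(
    {"GROUP": ["a"] * 4 + ["b"] * 3, "f1": range(7), "f2": range(7, 14)}
)
print(windows_per_group(df, ["f1", "f2"], "GROUP", timesteps=3).shape)  # (7, 3, 2)
```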