lecrapaud 0.19.0__py3-none-any.whl → 0.22.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. lecrapaud/__init__.py +22 -1
  2. lecrapaud/{api.py → base.py} +331 -241
  3. lecrapaud/config.py +15 -3
  4. lecrapaud/db/alembic/versions/2025_10_25_0635-07e303521594_add_unique_constraint_to_score.py +39 -0
  5. lecrapaud/db/alembic/versions/2025_10_26_1727-033e0f7eca4f_merge_score_and_model_trainings_into_.py +264 -0
  6. lecrapaud/db/alembic/versions/2025_10_28_2006-0a8fb7826e9b_add_number_of_targets_and_remove_other_.py +75 -0
  7. lecrapaud/db/models/__init__.py +2 -4
  8. lecrapaud/db/models/base.py +116 -65
  9. lecrapaud/db/models/experiment.py +195 -182
  10. lecrapaud/db/models/feature_selection.py +0 -3
  11. lecrapaud/db/models/feature_selection_rank.py +0 -18
  12. lecrapaud/db/models/model_selection.py +2 -2
  13. lecrapaud/db/models/{score.py → model_selection_score.py} +29 -12
  14. lecrapaud/db/session.py +4 -0
  15. lecrapaud/experiment.py +44 -17
  16. lecrapaud/feature_engineering.py +45 -674
  17. lecrapaud/feature_preprocessing.py +1202 -0
  18. lecrapaud/feature_selection.py +145 -332
  19. lecrapaud/integrations/sentry_integration.py +46 -0
  20. lecrapaud/misc/tabpfn_tests.ipynb +2 -2
  21. lecrapaud/mixins.py +247 -0
  22. lecrapaud/model_preprocessing.py +295 -0
  23. lecrapaud/model_selection.py +612 -242
  24. lecrapaud/pipeline.py +548 -0
  25. lecrapaud/search_space.py +2 -1
  26. lecrapaud/utils.py +36 -3
  27. lecrapaud-0.22.6.dist-info/METADATA +423 -0
  28. lecrapaud-0.22.6.dist-info/RECORD +51 -0
  29. {lecrapaud-0.19.0.dist-info → lecrapaud-0.22.6.dist-info}/WHEEL +1 -1
  30. {lecrapaud-0.19.0.dist-info → lecrapaud-0.22.6.dist-info/licenses}/LICENSE +1 -1
  31. lecrapaud/db/models/model_training.py +0 -64
  32. lecrapaud/jobs/__init__.py +0 -13
  33. lecrapaud/jobs/config.py +0 -17
  34. lecrapaud/jobs/scheduler.py +0 -30
  35. lecrapaud/jobs/tasks.py +0 -17
  36. lecrapaud-0.19.0.dist-info/METADATA +0 -249
  37. lecrapaud-0.19.0.dist-info/RECORD +0 -48
lecrapaud/mixins.py ADDED
@@ -0,0 +1,247 @@
+ """
+ Sklearn-compatible mixins for LeCrapaud components.
+
+ This module provides base classes and mixins that ensure components are compatible
+ with scikit-learn conventions and can be used in sklearn pipelines.
+ """
+
+ from sklearn.base import BaseEstimator, TransformerMixin
+ from abc import ABC, abstractmethod
+ from lecrapaud.db import Experiment
+
+
+ class LeCrapaudTransformerMixin(BaseEstimator, TransformerMixin, ABC):
+     """
+     Base mixin for LeCrapaud transformers that ensures sklearn compatibility.
+
+     This mixin provides the basic structure that all LeCrapaud transformers
+     should follow to be compatible with sklearn pipelines.
+     """
+
+     def __init__(self, experiment: Experiment = None, **kwargs):
+         """
+         Initialize the transformer.
+
+         Args:
+             experiment: LeCrapaud experiment context
+             **kwargs: Additional parameters (take priority over experiment.context)
+         """
+         self.experiment = experiment
+
+         # First, set parameters from experiment context if available
+         if experiment and hasattr(experiment, "context") and experiment.context:
+             for key, value in experiment.context.items():
+                 setattr(self, key, value)
+
+         # Then override with explicit kwargs (kwargs have priority)
+         for key, value in kwargs.items():
+             setattr(self, key, value)
+
+     def get_params(self, deep=True):
+         """
+         Get parameters for this estimator (sklearn compatibility).
+
+         Args:
+             deep (bool): If True, will return the parameters for this estimator
+                 and contained subobjects that are estimators.
+
+         Returns:
+             dict: Parameter names mapped to their values.
+         """
+         params = {}
+         for key in dir(self):
+             if not key.startswith("_") and not callable(getattr(self, key)):
+                 value = getattr(self, key)
+                 if deep and hasattr(value, "get_params"):
+                     deep_items = value.get_params().items()
+                     params.update((f"{key}__{k}", v) for k, v in deep_items)
+                 params[key] = value
+         return params
+
+     def set_params(self, **params):
+         """
+         Set the parameters of this estimator (sklearn compatibility).
+
+         Args:
+             **params: Estimator parameters
+
+         Returns:
+             self: Estimator instance
+         """
+         for key, value in params.items():
+             if "__" in key:
+                 # Handle nested parameters
+                 obj_name, param_name = key.split("__", 1)
+                 obj = getattr(self, obj_name)
+                 obj.set_params(**{param_name: value})
+             else:
+                 setattr(self, key, value)
+         return self
+
+     def _set_fitted(self):
+         """Mark the transformer as fitted (sklearn compatibility helper)."""
+         self.is_fitted_ = True
+
+     def _check_is_fitted(self):
+         """Ensure the estimator has been fitted before usage."""
+         if not getattr(self, "is_fitted_", False):
+             raise ValueError("This estimator has not been fitted yet.")
+
+     def _validate_data(self, X, y=None, reset=True):
+         """Basic validation helper mirroring the sklearn signature."""
+         if X is None:
+             raise ValueError("Input X cannot be None")
+         return X, y
+
+     @abstractmethod
+     def fit(self, X, y=None):
+         """
+         Fit the transformer.
+
+         Args:
+             X: Input data
+             y: Target values (optional)
+
+         Returns:
+             self: Returns self for chaining
+         """
+         pass
+
+     @abstractmethod
+     def transform(self, X):
+         """
+         Transform the input data.
+
+         Args:
+             X: Input data to transform
+
+         Returns:
+             Transformed data
+         """
+         pass
+
+     def fit_transform(self, X, y=None):
+         """
+         Fit and transform in one step.
+
+         Args:
+             X: Input data
+             y: Target values (optional)
+
+         Returns:
+             Transformed data
+         """
+         return self.fit(X, y).transform(X)
+
+
+ class LeCrapaudEstimatorMixin(BaseEstimator, ABC):
+     """
+     Base mixin for LeCrapaud estimators (like selectors) that only have fit().
+     """
+
+     def __init__(self, experiment: Experiment = None, **kwargs):
+         """
+         Initialize the estimator.
+
+         Args:
+             experiment: LeCrapaud experiment context
+             **kwargs: Additional parameters (take priority over experiment.context)
+         """
+         self.experiment = experiment
+
+         # First, set parameters from experiment context if available
+         if experiment and hasattr(experiment, "context") and experiment.context:
+             for key, value in experiment.context.items():
+                 setattr(self, key, value)
+
+         # Then override with explicit kwargs (kwargs have priority)
+         for key, value in kwargs.items():
+             setattr(self, key, value)
+
+     def get_params(self, deep=True):
+         """Get parameters for this estimator (sklearn compatibility)."""
+         params = {}
+         for key in dir(self):
+             if not key.startswith("_") and not callable(getattr(self, key)):
+                 value = getattr(self, key)
+                 if deep and hasattr(value, "get_params"):
+                     deep_items = value.get_params().items()
+                     params.update((f"{key}__{k}", v) for k, v in deep_items)
+                 params[key] = value
+         return params
+
+     def set_params(self, **params):
+         """Set the parameters of this estimator (sklearn compatibility)."""
+         for key, value in params.items():
+             if "__" in key:
+                 # Handle nested parameters
+                 obj_name, param_name = key.split("__", 1)
+                 obj = getattr(self, obj_name)
+                 obj.set_params(**{param_name: value})
+             else:
+                 setattr(self, key, value)
+         return self
+
+     def _set_fitted(self):
+         """Mark the estimator as fitted."""
+         self.is_fitted_ = True
+
+     def _check_is_fitted(self):
+         """Ensure the estimator has been fitted."""
+         if not getattr(self, "is_fitted_", False):
+             raise ValueError("This estimator has not been fitted yet.")
+
+     def _validate_data(self, X, y=None, reset=True):
+         """Basic validation helper mirroring the sklearn signature."""
+         if X is None:
+             raise ValueError("Input X cannot be None")
+         return X, y
+
+     @abstractmethod
+     def fit(self, X, y=None):
+         """
+         Fit the estimator.
+
+         Args:
+             X: Input data
+             y: Target values (optional)
+
+         Returns:
+             self: Returns self for chaining
+         """
+         pass
+
+
+ class LeCrapaudPipelineCompatible:
+     """
+     Mixin for components that can be used in sklearn Pipeline.
+
+     This ensures proper parameter passing and state management.
+     """
+
+     def _validate_data(self, X, y=None, reset=True):
+         """
+         Validate input data (sklearn convention).
+
+         Args:
+             X: Input data
+             y: Target data (optional)
+             reset (bool): Whether to reset internal state
+
+         Returns:
+             tuple: (X, y) validated data
+         """
+         # Basic validation - can be extended as needed
+         if X is None:
+             raise ValueError("Input X cannot be None")
+
+         return X, y
+
+     def _check_is_fitted(self):
+         """Check if the transformer has been fitted."""
+         if not hasattr(self, "is_fitted_") or not self.is_fitted_:
+             raise ValueError("This transformer has not been fitted yet.")
+
+     def _set_fitted(self):
+         """Mark the transformer as fitted."""
+         self.is_fitted_ = True
lecrapaud/model_preprocessing.py ADDED
@@ -0,0 +1,295 @@
+ import pandas as pd
+ import numpy as np
+ import joblib
+ from typing import Optional
+ import os
+
+ from sklearn.preprocessing import StandardScaler
+
+ from lecrapaud.utils import logger
+ from lecrapaud.search_space import all_models
+ from lecrapaud.mixins import LeCrapaudTransformerMixin
+ from lecrapaud.db import Experiment
+
+
+ class ModelPreprocessor(LeCrapaudTransformerMixin):
+
+     def __init__(
+         self,
+         experiment=None,
+         **kwargs,
+     ):
+         # The mixin will automatically set all experiment.context parameters as attributes
+         super().__init__(experiment=experiment, **kwargs)
+
+         # Set defaults for required parameters if not provided
+         if not hasattr(self, "target_numbers"):
+             self.target_numbers = []
+         if not hasattr(self, "target_clf"):
+             self.target_clf = []
+         if not hasattr(self, "models_idx"):
+             self.models_idx = []
+         if not hasattr(self, "time_series"):
+             self.time_series = False
+         if not hasattr(self, "max_timesteps"):
+             self.max_timesteps = 120
+         if not hasattr(self, "group_column"):
+             self.group_column = None
+         if not hasattr(self, "date_column"):
+             self.date_column = None
+
+         # Set paths if experiment is available
+         if self.experiment:
+             self.experiment_dir = self.experiment.path
+             self.data_dir = f"{self.experiment_dir}/data"
+             self.preprocessing_dir = f"{self.experiment_dir}/preprocessing"
+
+             self.all_features = self.experiment.get_all_features(
+                 date_column=self.date_column, group_column=self.group_column
+             )
+
+     def fit(self, X, y=None):
+         """
+         Fit the model preprocessor (learns scaling parameters).
+
+         Args:
+             X (pd.DataFrame): Training data
+             y: Target values (ignored)
+
+         Returns:
+             self: Returns self for chaining
+         """
+         X, y = self._validate_data(X, y)
+
+         # Filter columns to keep only features and targets
+         if hasattr(self, "all_features"):
+             columns_to_keep = self.all_features + [
+                 f"TARGET_{i}" for i in self.target_numbers
+             ]
+             duplicates = [
+                 col for col in set(columns_to_keep) if columns_to_keep.count(col) > 1
+             ]
+             if duplicates:
+                 raise ValueError(
+                     f"Duplicate columns detected in columns_to_keep: {duplicates}"
+                 )
+             X = X[columns_to_keep]
+
+         # Determine if we need scaling
+         self.need_scaling_ = any(
+             t not in self.target_clf for t in self.target_numbers
+         ) and any(all_models[i].get("need_scaling") for i in self.models_idx)
+
+         if self.need_scaling_:
+             logger.info("Fitting scalers...")
+             _, self.scaler_x_, self.scalers_y_ = self.scale_data(X)
+
+             # Save scalers if experiment is available
+             if self.experiment:
+                 joblib.dump(self.scaler_x_, f"{self.preprocessing_dir}/scaler_x.pkl")
+                 # Save target scalers
+                 for target_number in self.target_numbers:
+                     if target_number not in self.target_clf:
+                         target_dir = f"{self.experiment_dir}/TARGET_{target_number}"
+                         scaler_y = self.scalers_y_[f"scaler_y_{target_number}"]
+                         joblib.dump(scaler_y, f"{target_dir}/scaler_y.pkl")
+
+         self._set_fitted()
+         return self
+
+     def transform(self, X):
+         """
+         Transform the input data (apply scaling if fitted).
+
+         Args:
+             X (pd.DataFrame): Input data
+
+         Returns:
+             pd.DataFrame: Scaled data (or original if no scaling needed)
+         """
+         # Allow loading persisted artifacts even in a fresh instance
+         if not getattr(self, "is_fitted_", False) and self.experiment:
+             scaler_path = f"{self.preprocessing_dir}/scaler_x.pkl"
+             if os.path.exists(scaler_path):
+                 self.is_fitted_ = True
+
+         self._check_is_fitted()
+         X, _ = self._validate_data(X, reset=False)
+
+         # Filter columns if needed
+         if hasattr(self, "all_features"):
+             columns_to_keep = self.all_features + [
+                 f"TARGET_{i}" for i in self.target_numbers if f"TARGET_{i}" in X.columns
+             ]
+             X = X[columns_to_keep]
+
+         # Load scalers if not in memory
+         if not hasattr(self, "scaler_x_") and self.experiment:
+             scaler_path = f"{self.preprocessing_dir}/scaler_x.pkl"
+             if os.path.exists(scaler_path):
+                 self.scaler_x_ = joblib.load(scaler_path)
+
+         # Apply scaling if needed
+         if (
+             hasattr(self, "need_scaling_")
+             and self.need_scaling_
+             and hasattr(self, "scaler_x_")
+         ):
+             X_scaled, _, _ = self.scale_data(
+                 X, scaler_x=self.scaler_x_, scalers_y=getattr(self, "scalers_y_", None)
+             )
+             return X_scaled
+
+         return X
+
+     # scaling
+     def scale_data(
+         self,
+         df: pd.DataFrame,
+         scaler_x=None,
+         scalers_y: Optional[dict] = None,
+     ):
+         logger.info("Scale data...")
+         X = df.loc[:, ~df.columns.str.contains("^TARGET_")]
+
+         if scaler_x:
+             X_scaled = pd.DataFrame(
+                 scaler_x.transform(X), columns=list(X.columns), index=X.index
+             )
+         else:
+             scaler_x = StandardScaler()  # MinMaxScaler(feature_range=(-1, 1))
+             X_scaled = pd.DataFrame(
+                 scaler_x.fit_transform(X), columns=list(X.columns), index=X.index
+             )
+
+         # Determine which targets need to be scaled
+         targets_numbers_to_scale = [
+             i for i in self.target_numbers if i not in self.target_clf
+         ]
+
+         # Dictionary to store scaled target data
+         scaled_targets = {}
+
+         if scalers_y:
+             for target_number in targets_numbers_to_scale:
+                 y = df[[f"TARGET_{target_number}"]]
+                 scaled_targets[target_number] = pd.DataFrame(
+                     scalers_y[f"scaler_y_{target_number}"].transform(y.values),
+                     columns=y.columns,
+                     index=y.index,
+                 )
+         else:
+             scalers_y = {}
+             for target_number in targets_numbers_to_scale:
+                 scaler_y = StandardScaler()
+                 y = df[[f"TARGET_{target_number}"]]
+
+                 scaled_y = pd.DataFrame(
+                     scaler_y.fit_transform(y.values),
+                     columns=y.columns,
+                     index=y.index,
+                 )
+                 target_dir = f"{self.experiment_dir}/TARGET_{target_number}"
+                 joblib.dump(scaler_y, f"{target_dir}/scaler_y.pkl")
+
+                 scalers_y[f"scaler_y_{target_number}"] = scaler_y
+                 scaled_targets[target_number] = scaled_y
+
+         # Reconstruct y_scaled in the original order
+         y_scaled = pd.concat(
+             [
+                 scaled_targets[target_number]
+                 for target_number in targets_numbers_to_scale
+             ],
+             axis=1,
+         )
+         y_not_scaled = df[
+             df.columns.intersection([f"TARGET_{i}" for i in self.target_clf])
+         ]
+
+         # Ensure the final DataFrame keeps the original column order
+         df_scaled = pd.concat(
+             [X_scaled, y_scaled, y_not_scaled],
+             axis=1,
+         )[
+             df.columns
+         ]  # Reorder columns to match original `df`
+
+         if not df_scaled.columns.equals(df.columns):
+             raise Exception("Columns are not in the same order after scaling.")
+
+         return df_scaled, scaler_x, scalers_y
+
+
+ # Reshape into 3D tensors for recurrent models
+ def reshape_time_series(
+     experiment: Experiment,
+     features: list,
+     train: pd.DataFrame,
+     val: pd.DataFrame = None,
+     test: pd.DataFrame = None,
+     timesteps: int = 120,
+ ):
+     # always scale for recurrent layers: train should be scaled
+     group_column = experiment.context.group_column
+
+     target_columns = train.columns.intersection(
+         [f"TARGET_{i}" for i in experiment.context.target_numbers]
+     )
+
+     data = pd.concat([train, val, test], axis=0)
+
+     def reshape_df(df: pd.DataFrame, group_series: pd.Series, timesteps: int):
+         fill_value = [[[0] * len(df.columns)]]
+
+         def shiftsum(x, timesteps: int):
+             # Concatenate each row's feature list with those of the previous timesteps - 1 rows
+             tmp = x.copy()
+             for i in range(1, timesteps):
+                 tmp = x.shift(i, fill_value=fill_value) + tmp
+             return tmp
+
+         logger.info("Grouping each feature in a unique column with list...")
+         df_reshaped = df.apply(list, axis=1).apply(lambda x: [list(x)])
+         df_reshaped = pd.concat([df_reshaped, group_series], axis=1)
+
+         logger.info("Grouping features and creating timesteps...")
+         df_reshaped = (
+             df_reshaped.groupby(group_column)[0]
+             .apply(lambda x: shiftsum(x, timesteps))
+             .reset_index(group_column, drop=True)
+             .rename("RECURRENT_FEATURES")
+         )
+         df_reshaped = pd.DataFrame(df_reshaped)
+
+         return df_reshaped
+
+     data_reshaped = reshape_df(data[features], data[group_column], timesteps)
+
+     data_reshaped[target_columns] = data[target_columns]
+
+     logger.info("Separating train, val, test data and creating np arrays...")
+     train_reshaped = data_reshaped.loc[train.index]
+
+     x_train_reshaped = np.array(train_reshaped["RECURRENT_FEATURES"].values.tolist())
+     y_train_reshaped = np.array(train_reshaped[target_columns].reset_index())
+
+     reshaped_data = {
+         "x_train_reshaped": x_train_reshaped,
+         "y_train_reshaped": y_train_reshaped,
+     }
+
+     if val is not None:
+         val_reshaped = data_reshaped.loc[val.index]
+         x_val_reshaped = np.array(val_reshaped["RECURRENT_FEATURES"].values.tolist())
+         y_val_reshaped = np.array(val_reshaped[target_columns].reset_index())
+         reshaped_data["x_val_reshaped"] = x_val_reshaped
+         reshaped_data["y_val_reshaped"] = y_val_reshaped
+
+     if test is not None:
+         test_reshaped = data_reshaped.loc[test.index]
+         x_test_reshaped = np.array(test_reshaped["RECURRENT_FEATURES"].values.tolist())
+         y_test_reshaped = np.array(test_reshaped[target_columns].reset_index())
+         reshaped_data["x_test_reshaped"] = x_test_reshaped
+         reshaped_data["y_test_reshaped"] = y_test_reshaped
+
+     return reshaped_data
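For orientation, a minimal sketch (editor's note, not code from the package) of the ModelPreprocessor flow when no `Experiment` is attached, in which case every parameter comes from kwargs. The frame, its column names, and the parameter values below are made up for illustration.

import pandas as pd

from lecrapaud.model_preprocessing import ModelPreprocessor

# Made-up training frame: two features and one regression target.
train = pd.DataFrame(
    {"F_1": [0.1, 0.4, 0.9], "F_2": [10.0, 12.0, 9.0], "TARGET_1": [1.2, 0.7, 1.9]}
)

prep = ModelPreprocessor(
    experiment=None,      # no Experiment: parameters come from kwargs only
    target_numbers=[1],   # TARGET_1 is a regression target...
    target_clf=[],        # ...not a classification one
    models_idx=[],        # no selected model needs scaling, so this is a pass-through
)
train_ready = prep.fit(train).transform(train)  # returns the frame unchanged here

With an experiment attached and at least one selected model whose search-space entry sets `need_scaling`, `fit` would additionally persist `scaler_x.pkl` under the experiment's `preprocessing` directory and one `scaler_y.pkl` per regression target. `reshape_time_series` builds on the same experiment context: each group's rows are windowed into lists of the last `timesteps` feature rows, so `x_train_reshaped` is intended to come out as a 3D array of shape (n_samples, timesteps, n_features), ready for recurrent layers.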