lecrapaud 0.18.7__py3-none-any.whl → 0.22.6__py3-none-any.whl
This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as published in their public registries.
- lecrapaud/__init__.py +22 -1
- lecrapaud/{api.py → base.py} +331 -241
- lecrapaud/config.py +15 -3
- lecrapaud/db/alembic/versions/2025_08_25_1434-7ed9963e732f_add_best_score_to_model_selection.py +9 -4
- lecrapaud/db/alembic/versions/2025_08_28_1516-c36e9fee22b9_add_avg_precision_to_score.py +34 -0
- lecrapaud/db/alembic/versions/2025_08_28_1622-8b11c1ba982e_change_name_column.py +44 -0
- lecrapaud/db/alembic/versions/2025_10_25_0635-07e303521594_add_unique_constraint_to_score.py +39 -0
- lecrapaud/db/alembic/versions/2025_10_26_1727-033e0f7eca4f_merge_score_and_model_trainings_into_.py +264 -0
- lecrapaud/db/alembic/versions/2025_10_28_2006-0a8fb7826e9b_add_number_of_targets_and_remove_other_.py +75 -0
- lecrapaud/db/models/__init__.py +2 -4
- lecrapaud/db/models/base.py +122 -67
- lecrapaud/db/models/experiment.py +196 -183
- lecrapaud/db/models/feature_selection.py +0 -3
- lecrapaud/db/models/feature_selection_rank.py +0 -18
- lecrapaud/db/models/model_selection.py +2 -2
- lecrapaud/db/models/{score.py → model_selection_score.py} +30 -12
- lecrapaud/db/session.py +33 -4
- lecrapaud/experiment.py +44 -17
- lecrapaud/feature_engineering.py +45 -674
- lecrapaud/feature_preprocessing.py +1202 -0
- lecrapaud/feature_selection.py +145 -332
- lecrapaud/integrations/sentry_integration.py +46 -0
- lecrapaud/misc/tabpfn_tests.ipynb +2 -2
- lecrapaud/mixins.py +247 -0
- lecrapaud/model_preprocessing.py +295 -0
- lecrapaud/model_selection.py +725 -249
- lecrapaud/pipeline.py +548 -0
- lecrapaud/search_space.py +38 -1
- lecrapaud/utils.py +36 -3
- lecrapaud-0.22.6.dist-info/METADATA +423 -0
- lecrapaud-0.22.6.dist-info/RECORD +51 -0
- {lecrapaud-0.18.7.dist-info → lecrapaud-0.22.6.dist-info}/WHEEL +1 -1
- {lecrapaud-0.18.7.dist-info → lecrapaud-0.22.6.dist-info/licenses}/LICENSE +1 -1
- lecrapaud/db/models/model_training.py +0 -64
- lecrapaud/jobs/__init__.py +0 -13
- lecrapaud/jobs/config.py +0 -17
- lecrapaud/jobs/scheduler.py +0 -30
- lecrapaud/jobs/tasks.py +0 -17
- lecrapaud-0.18.7.dist-info/METADATA +0 -248
- lecrapaud-0.18.7.dist-info/RECORD +0 -46
lecrapaud/pipeline.py
ADDED
@@ -0,0 +1,548 @@

"""
LeCrapaud Pipeline for sklearn integration.

This module provides a sklearn-compatible pipeline that can be used
in sklearn workflows while incorporating LeCrapaud's custom components.
"""

from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from typing import List, Tuple, Optional, Dict, Any
import pandas as pd

from lecrapaud.db import Experiment
from lecrapaud.feature_engineering import FeatureEngineering
from lecrapaud.feature_preprocessing import FeaturePreprocessor, split_data
from lecrapaud.feature_selection import FeatureSelector
from lecrapaud.model_preprocessing import ModelPreprocessor, reshape_time_series
from lecrapaud.model_selection import ModelSelector
from lecrapaud.search_space import all_models


class DataSplitterTransformer(BaseEstimator, TransformerMixin):
    """
    Transformer that handles train/val/test data splitting for LeCrapaud pipelines.

    This component splits data and can be used in sklearn pipelines while maintaining
    the ability to access individual splits.
    """

    def __init__(
        self,
        experiment: Experiment,
        time_series: bool = False,
        date_column: str = None,
        group_column: str = None,
        val_size: float = 0.2,
        test_size: float = 0.2,
        target_numbers: List[int] = None,
        target_clf: List[int] = None
    ):
        self.experiment = experiment
        self.time_series = time_series
        self.date_column = date_column
        self.group_column = group_column
        self.val_size = val_size
        self.test_size = test_size
        self.target_numbers = target_numbers or []
        self.target_clf = target_clf or []

    def fit(self, X, y=None):
        """Fit the splitter (no-op, just validates parameters)."""
        return self

    def transform(self, X):
        """Transform data by splitting it and returning train split."""
        train, val, test = split_data(X, experiment=self.experiment)

        # Store splits as attributes for later access
        self.train_ = train
        self.val_ = val
        self.test_ = test

        return train

    def get_splits(self):
        """Get all data splits."""
        if not hasattr(self, 'train_'):
            raise ValueError("Must call transform() first to create splits")
        return self.train_, self.val_, self.test_


class DataReshaper(BaseEstimator, TransformerMixin):
    """
    Transformer that handles time series data reshaping for recurrent models.

    This component checks if reshaping is needed and applies it when appropriate.
    """

    def __init__(
        self,
        experiment: Experiment,
        models_idx: List[int] = None,
        time_series: bool = False,
        max_timesteps: int = 120,
        group_column: str = None
    ):
        self.experiment = experiment
        self.models_idx = models_idx or []
        self.time_series = time_series
        self.max_timesteps = max_timesteps
        self.group_column = group_column

    def fit(self, X, y=None):
        """Fit the reshaper (determines if reshaping is needed)."""
        # Check if any model requires recurrent processing
        self.need_reshaping_ = (
            any(all_models[i].get("recurrent") for i in self.models_idx)
            and self.time_series
        )
        return self

    def transform(self, X):
        """Transform data by reshaping for time series if needed."""
        if not self.need_reshaping_:
            return X

        # Sanity check: make sure we have enough data for max_timesteps
        if (
            self.group_column
            and X.groupby(self.group_column).size().min()
            < self.max_timesteps
        ) or X.shape[0] < self.max_timesteps:
            raise ValueError(
                f"Not enough data for group_column {self.group_column} to reshape data for recurrent models"
            )

        # Get features for reshaping
        all_features = self.experiment.get_all_features(
            date_column=getattr(self, 'date_column', None),
            group_column=self.group_column
        )

        # Reshape the data
        reshaped_data = reshape_time_series(
            self.experiment, all_features, X, timesteps=self.max_timesteps
        )

        # Store reshaped data as attribute
        self.reshaped_data_ = reshaped_data

        return X  # Return original data, reshaped data accessible via get_reshaped_data()

    def get_reshaped_data(self):
        """Get the reshaped data."""
        if not hasattr(self, 'reshaped_data_'):
            return None
        return self.reshaped_data_


class FullPipelineTransformer(BaseEstimator, TransformerMixin):
    """
    Complete LeCrapaud pipeline transformer that handles all steps including
    data splitting, preprocessing, and reshaping in a sklearn-compatible way.
    """

    def __init__(
        self,
        experiment: Experiment,
        target_numbers: List[int] = None,
        include_model_selection: bool = False,
        **pipeline_params
    ):
        self.experiment = experiment
        self.target_numbers = target_numbers or []
        self.include_model_selection = include_model_selection
        self.pipeline_params = pipeline_params

        # Extract parameters from experiment context
        if experiment and hasattr(experiment, 'context'):
            for key, value in experiment.context.items():
                if not hasattr(self, key):
                    setattr(self, key, value)

        # Set defaults
        self._set_defaults()

    def _set_defaults(self):
        """Set default values for pipeline parameters."""
        defaults = {
            'time_series': False,
            'date_column': None,
            'group_column': None,
            'val_size': 0.2,
            'test_size': 0.2,
            'target_clf': [],
            'models_idx': [],
            'max_timesteps': 120
        }

        for key, default_value in defaults.items():
            if not hasattr(self, key):
                setattr(self, key, default_value)

    def fit(self, X, y=None):
        """Fit the complete pipeline."""
        # Step 1: Feature Engineering
        self.feature_eng_ = FeatureEngineering(experiment=self.experiment)
        self.feature_eng_.fit(X)
        data_eng = self.feature_eng_.get_data()

        # Step 2: Data Splitting
        self.data_splitter_ = DataSplitterTransformer(experiment=self.experiment)
        train = self.data_splitter_.transform(data_eng)
        val = self.data_splitter_.val_
        test = self.data_splitter_.test_

        # Step 3: Feature Preprocessing
        self.feature_prep_ = FeaturePreprocessor(experiment=self.experiment)
        self.feature_prep_.fit(train)
        train_prep = self.feature_prep_.transform(train)
        val_prep = self.feature_prep_.transform(val) if val is not None else None
        test_prep = self.feature_prep_.transform(test) if test is not None else None

        # Step 4: Feature Selection (for each target)
        self.feature_selectors_ = {}
        for target_number in self.target_numbers:
            selector = FeatureSelector(
                experiment=self.experiment,
                target_number=target_number
            )
            selector.fit(train_prep)
            self.feature_selectors_[target_number] = selector

        # Step 5: Model Preprocessing
        self.model_prep_ = ModelPreprocessor(experiment=self.experiment)
        self.model_prep_.fit(train_prep)
        train_scaled = self.model_prep_.transform(train_prep)
        val_scaled = self.model_prep_.transform(val_prep) if val_prep is not None else None
        test_scaled = self.model_prep_.transform(test_prep) if test_prep is not None else None

        # Step 6: Data Reshaping (if needed)
        self.data_reshaper_ = DataReshaper(
            experiment=self.experiment,
            models_idx=self.models_idx,
            time_series=self.time_series,
            max_timesteps=self.max_timesteps,
            group_column=self.group_column
        )
        self.data_reshaper_.fit(train_scaled)
        self.data_reshaper_.transform(train_scaled)

        # Step 7: Model Selection (optional)
        if self.include_model_selection:
            self.model_selectors_ = {}
            std_data = {"train": train_scaled, "val": val_scaled, "test": test_scaled}
            reshaped_data = self.data_reshaper_.get_reshaped_data()

            for target_number in self.target_numbers:
                model_selector = ModelSelector(
                    experiment=self.experiment,
                    target_number=target_number
                )
                model_selector.fit(std_data, reshaped_data=reshaped_data)
                self.model_selectors_[target_number] = model_selector

        return self

    def transform(self, X):
        """Transform new data through the fitted pipeline."""
        # Apply feature engineering
        self.feature_eng_.fit(X)  # Refit for new data
        data_eng = self.feature_eng_.get_data()

        # Apply feature preprocessing
        data_prep = self.feature_prep_.transform(data_eng)

        # Apply model preprocessing
        data_scaled = self.model_prep_.transform(data_prep)

        # Apply reshaping if needed
        self.data_reshaper_.transform(data_scaled)

        return data_scaled

    def get_training_splits(self):
        """Get the training data splits."""
        if not hasattr(self, 'data_splitter_'):
            raise ValueError("Must call fit() first")
        return self.data_splitter_.get_splits()

    def get_reshaped_data(self):
        """Get the reshaped data for recurrent models."""
        if not hasattr(self, 'data_reshaper_'):
            raise ValueError("Must call fit() first")
        return self.data_reshaper_.get_reshaped_data()

    def get_models(self):
        """Get the trained models."""
        if not hasattr(self, 'model_selectors_'):
            return {}
        return {num: selector.get_best_model() for num, selector in self.model_selectors_.items()}


class PipelineLeCrapaud(Pipeline):
    """
    LeCrapaud pipeline that extends sklearn Pipeline for ML workflows.

    This pipeline provides pre-configured steps for the typical LeCrapaud workflow:
    1. Feature Engineering
    2. Feature Preprocessing
    3. Feature Selection
    4. Model Preprocessing
    5. Model Selection

    It can be used as a drop-in replacement for sklearn Pipeline while
    leveraging LeCrapaud's experiment tracking and domain-specific features.
    """

    def __init__(
        self,
        experiment: Experiment,
        steps: Optional[List[Tuple[str, BaseEstimator]]] = None,
        memory=None,
        verbose=False,
        target_number: Optional[int] = None,
        **kwargs
    ):
        """
        Initialize LeCrapaud pipeline.

        Args:
            experiment: LeCrapaud experiment instance
            steps: List of (name, estimator) tuples. If None, uses default workflow
            memory: Caching parameter (passed to sklearn Pipeline)
            verbose: Whether to output progress info
            target_number: Target number for model selection (if using default steps)
            **kwargs: Additional parameters passed to default estimators
        """
        self.experiment = experiment
        self.target_number = target_number
        self.step_kwargs = kwargs

        if steps is None:
            steps = self._create_default_steps()

        super().__init__(steps=steps, memory=memory, verbose=verbose)

    def _create_default_steps(self) -> List[Tuple[str, BaseEstimator]]:
        """Create default LeCrapaud pipeline steps."""
        steps = [
            ('feature_engineering', FeatureEngineering(
                experiment=self.experiment,
                **self.step_kwargs.get('feature_engineering', {})
            )),
            ('feature_preprocessing', FeaturePreprocessor(
                experiment=self.experiment,
                **self.step_kwargs.get('feature_preprocessing', {})
            )),
            ('feature_selection', FeatureSelector(
                experiment=self.experiment,
                target_number=self.target_number,
                **self.step_kwargs.get('feature_selection', {})
            )),
            ('model_preprocessing', ModelPreprocessor(
                experiment=self.experiment,
                **self.step_kwargs.get('model_preprocessing', {})
            ))
        ]

        # Add model selection if target_number is specified
        if self.target_number is not None:
            steps.append((
                'model_selection',
                ModelSelector(
                    experiment=self.experiment,
                    target_number=self.target_number,
                    **self.step_kwargs.get('model_selection', {})
                )
            ))

        return steps

    @classmethod
    def create_feature_pipeline(
        cls,
        experiment: Experiment,
        include_selection: bool = True,
        target_number: Optional[int] = None,
        **kwargs
    ) -> 'PipelineLeCrapaud':
        """
        Create a pipeline focused on feature processing only.

        Args:
            experiment: LeCrapaud experiment instance
            include_selection: Whether to include feature selection step
            target_number: Target number for feature selection
            **kwargs: Additional parameters for estimators

        Returns:
            PipelineLeCrapaud: Feature processing pipeline
        """
        steps = [
            ('feature_engineering', FeatureEngineering(
                experiment=experiment,
                **kwargs.get('feature_engineering', {})
            )),
            ('feature_preprocessing', FeaturePreprocessor(
                experiment=experiment,
                **kwargs.get('feature_preprocessing', {})
            ))
        ]

        if include_selection and target_number is not None:
            steps.append((
                'feature_selection',
                FeatureSelector(
                    experiment=experiment,
                    target_number=target_number,
                    **kwargs.get('feature_selection', {})
                )
            ))

        return cls(experiment=experiment, steps=steps)

    @classmethod
    def create_model_pipeline(
        cls,
        experiment: Experiment,
        target_number: int,
        **kwargs
    ) -> 'PipelineLeCrapaud':
        """
        Create a pipeline focused on model preprocessing and selection.

        Args:
            experiment: LeCrapaud experiment instance
            target_number: Target number for model selection
            **kwargs: Additional parameters for estimators

        Returns:
            PipelineLeCrapaud: Model pipeline
        """
        steps = [
            ('model_preprocessing', ModelPreprocessor(
                experiment=experiment,
                **kwargs.get('model_preprocessing', {})
            )),
            ('model_selection', ModelSelector(
                experiment=experiment,
                target_number=target_number,
                **kwargs.get('model_selection', {})
            ))
        ]

        return cls(experiment=experiment, steps=steps)

    def get_feature_names_out(self, input_features=None):
        """Get output feature names for transformation."""
        # Try to get from the last transformer that has this method
        for name, estimator in reversed(self.steps):
            if hasattr(estimator, 'get_feature_names_out'):
                return estimator.get_feature_names_out(input_features)
            # For FeatureSelector, try to get selected features
            elif hasattr(estimator, 'get_selected_features'):
                return estimator.get_selected_features()

        return input_features

    def get_experiment(self) -> Experiment:
        """Get the experiment instance."""
        return self.experiment

    def get_step_results(self, step_name: str) -> Any:
        """
        Get results from a specific pipeline step.

        Args:
            step_name: Name of the pipeline step

        Returns:
            Results from the specified step
        """
        if step_name not in self.named_steps:
            raise ValueError(f"Step '{step_name}' not found in pipeline")

        estimator = self.named_steps[step_name]

        # Try common result methods
        if hasattr(estimator, 'get_data'):
            return estimator.get_data()
        elif hasattr(estimator, 'get_selected_features'):
            return estimator.get_selected_features()
        elif hasattr(estimator, 'get_best_model'):
            return estimator.get_best_model()
        else:
            return estimator


class LeCrapaudTransformer(BaseEstimator, TransformerMixin):
    """
    A transformer wrapper that makes any LeCrapaud estimator compatible
    with sklearn transformers, allowing them to be used in standard sklearn pipelines.
    """

    def __init__(self, estimator_class, experiment: Experiment, **estimator_params):
        """
        Initialize the transformer wrapper.

        Args:
            estimator_class: The LeCrapaud estimator class to wrap
            experiment: LeCrapaud experiment instance
            **estimator_params: Parameters to pass to the estimator
        """
        self.estimator_class = estimator_class
        self.experiment = experiment
        self.estimator_params = estimator_params
        self.estimator_ = None

    def fit(self, X, y=None):
        """Fit the wrapped estimator."""
        self.estimator_ = self.estimator_class(
            experiment=self.experiment,
            **self.estimator_params
        )
        self.estimator_.fit(X, y)
        return self

    def transform(self, X):
        """Transform using the fitted estimator."""
        if self.estimator_ is None:
            raise ValueError("Transformer has not been fitted yet.")

        # For estimators that don't have transform, use get_data or return X
        if hasattr(self.estimator_, 'transform'):
            return self.estimator_.transform(X)
        elif hasattr(self.estimator_, 'get_data'):
            return self.estimator_.get_data()
        else:
            return X

    def get_params(self, deep=True):
        """Get parameters for this transformer."""
        params = {
            'estimator_class': self.estimator_class,
            'experiment': self.experiment
        }
        if deep and self.estimator_params:
            for key, value in self.estimator_params.items():
                params[key] = value
        return params

    def set_params(self, **params):
        """Set parameters for this transformer."""
        estimator_params = {}
        base_params = {}

        for key, value in params.items():
            if key in ['estimator_class', 'experiment']:
                base_params[key] = value
            else:
                estimator_params[key] = value

        for key, value in base_params.items():
            setattr(self, key, value)

        self.estimator_params.update(estimator_params)
        return self
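
For orientation, the new classes follow the standard sklearn fit/transform contract. A minimal usage sketch (not from the package itself; raw_df is a hypothetical input DataFrame, and the construction of the Experiment instance is outside this diff):

from lecrapaud.pipeline import PipelineLeCrapaud, FullPipelineTransformer

# Assumed to exist already; the Experiment creation API is not part of this file.
experiment = ...

# Default five-step workflow; passing target_number appends the model_selection step.
pipe = PipelineLeCrapaud(experiment=experiment, target_number=1)
pipe.fit(raw_df)  # raw_df: hypothetical input DataFrame

# Or run everything (splitting, scaling, optional reshaping, model selection)
# through the single transformer:
full = FullPipelineTransformer(
    experiment=experiment,
    target_numbers=[1],
    include_model_selection=True,
)
full.fit(raw_df)
train, val, test = full.get_training_splits()
best_models = full.get_models()  # {target_number: best fitted model}
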
lecrapaud/search_space.py
CHANGED
@@ -15,6 +15,7 @@ from sklearn.naive_bayes import GaussianNB
 # Ensemble models
 from lightgbm import LGBMRegressor, LGBMClassifier
 from xgboost import XGBRegressor, XGBClassifier
+from catboost import CatBoostRegressor, CatBoostClassifier
 from sklearn.ensemble import (
     RandomForestRegressor,
     AdaBoostRegressor,
@@ -50,7 +51,8 @@ from keras.activations import sigmoid
 from ray import tune
 import pandas as pd
 
-# we cannot use tune.sample_from function to make conditionnal search space,
+# we cannot use tune.sample_from function to make conditionnal search space,
+# because hyperopt and bayesian opt need a fixed search space
 
 ml_models = [
     {
@@ -464,6 +466,41 @@ ml_models = [
             },
         },
     },
+    {
+        "model_name": "catboost",
+        "recurrent": False,
+        "need_scaling": False,
+        "classification": {
+            "create_model": CatBoostClassifier,
+            "search_params": {
+                "iterations": tune.randint(50, 1000),
+                "num_boost_round": tune.randint(50, 1000),
+                "early_stopping_rounds": tune.randint(5, 50),
+                "learning_rate": tune.loguniform(1e-4, 0.5),
+                "depth": tune.randint(3, 10),
+                "l2_leaf_reg": tune.loguniform(1e-5, 10),
+                "bagging_temperature": tune.uniform(0.0, 1.0),
+                "rsm": tune.quniform(0.6, 1.0, 0.05),
+                "random_state": 42,
+                "verbose": False,
+            },
+        },
+        "regression": {
+            "create_model": CatBoostRegressor,
+            "search_params": {
+                "iterations": tune.randint(50, 1000),
+                "num_boost_round": tune.randint(50, 1000),
+                "early_stopping_rounds": tune.randint(5, 50),
+                "learning_rate": tune.loguniform(1e-4, 0.5),
+                "depth": tune.randint(3, 10),
+                "l2_leaf_reg": tune.loguniform(1e-5, 10),
+                "bagging_temperature": tune.uniform(0.0, 1.0),
+                "rsm": tune.quniform(0.6, 1.0, 0.05),
+                "random_state": 42,
+                "verbose": False,
+            },
+        },
+    },
 ]
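
As context for the new entry, each ml_models element pairs a model constructor with a fixed Ray Tune search space. A hedged sketch of pulling the catboost spec and instantiating one sampled configuration (illustrative only; this is not how ModelSelector actually consumes these entries):

from lecrapaud.search_space import ml_models

# Locate the new entry and its classification search space.
catboost_spec = next(m for m in ml_models if m["model_name"] == "catboost")
space = catboost_spec["classification"]["search_params"]

# Ray Tune domains expose .sample(); draw one concrete value per parameter.
model = catboost_spec["classification"]["create_model"](
    iterations=space["iterations"].sample(),
    depth=space["depth"].sample(),
    learning_rate=space["learning_rate"].sample(),
    verbose=False,
)
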
lecrapaud/utils.py
CHANGED
@@ -11,7 +11,7 @@ import re
 import string
 
 from lecrapaud.directories import logger_dir
-from lecrapaud.config import LOGGING_LEVEL, PYTHON_ENV
+from lecrapaud.config import LOGGING_LEVEL, PYTHON_ENV
 
 
 _LECRAPAUD_LOGGER_ALREADY_CONFIGURED = False
@@ -59,6 +59,14 @@ def setup_logger():
     file_handler.setLevel(log_level)
     logger.addHandler(file_handler)
 
+    try:
+        from lecrapaud.integrations.sentry_integration import init_sentry
+
+        if init_sentry():
+            logger.info("Sentry logging enabled")
+    except Exception as exc:
+        logger.info(f"Sentry logging disabled: {exc}")
+
     _LECRAPAUD_LOGGER_ALREADY_CONFIGURED = True
     return logger
 
@@ -232,9 +240,28 @@ def remove_accents(text: str) -> str:
 def serialize_for_json(obj):
     """
     Recursively convert any object into a JSON-serializable structure.
-
+    Handles NumPy types, datetime objects, and class instances.
     """
-
+    import numpy as np
+    from datetime import datetime, date
+    import pandas as pd
+
+    # Handle NumPy types
+    if isinstance(obj, (np.integer, np.int64, np.int32, np.int16)):
+        return int(obj)
+    elif isinstance(obj, (np.floating, np.float64, np.float32, np.float16)):
+        return float(obj)
+    elif isinstance(obj, np.ndarray):
+        return obj.tolist()
+    elif isinstance(obj, np.bool_):
+        return bool(obj)
+
+    # Handle datetime types
+    elif isinstance(obj, (datetime, date, pd.Timestamp)):
+        return obj.isoformat()
+
+    # Handle basic Python types
+    elif isinstance(obj, (str, int, float, bool, type(None))):
         return obj
     elif isinstance(obj, dict):
         return {str(k): serialize_for_json(v) for k, v in obj.items()}
@@ -244,6 +271,12 @@ def serialize_for_json(obj):
         # A class/type object like int, str, etc.
         return obj.__name__
     elif hasattr(obj, "__class__"):
+        # For other objects, return their string representation
         return f"{obj.__class__.__name__}()"
     else:
         return str(obj)
+
+
+def strip_timestamp_suffix(name: str) -> str:
+    # Matches an underscore followed by 8 digits, another underscore, then 6 digits at the end
+    return re.sub(r"_\d{8}_\d{6}$", "", name)