lecrapaud 0.16.7__py3-none-any.whl → 0.18.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of lecrapaud might be problematic. Click here for more details.
- lecrapaud/api.py +68 -86
- lecrapaud/db/alembic/versions/2025_08_25_1434-7ed9963e732f_add_best_score_to_model_selection.py +30 -0
- lecrapaud/db/models/experiment.py +52 -39
- lecrapaud/db/models/model_selection.py +1 -0
- lecrapaud/feature_engineering.py +147 -0
- lecrapaud/model_selection.py +32 -1
- {lecrapaud-0.16.7.dist-info → lecrapaud-0.18.0.dist-info}/METADATA +1 -1
- {lecrapaud-0.16.7.dist-info → lecrapaud-0.18.0.dist-info}/RECORD +10 -9
- {lecrapaud-0.16.7.dist-info → lecrapaud-0.18.0.dist-info}/LICENSE +0 -0
- {lecrapaud-0.16.7.dist-info → lecrapaud-0.18.0.dist-info}/WHEEL +0 -0
lecrapaud/api.py
CHANGED
|
@@ -102,63 +102,43 @@ class LeCrapaud:
|
|
|
102
102
|
|
|
103
103
|
def compare_experiment_scores(self, name: str):
|
|
104
104
|
"""Compare scores of experiments with matching names.
|
|
105
|
-
|
|
105
|
+
|
|
106
106
|
Args:
|
|
107
107
|
name (str): Name or partial name of experiments to compare
|
|
108
|
-
|
|
108
|
+
|
|
109
109
|
Returns:
|
|
110
110
|
dict: Dictionary containing experiment names as keys and their scores as values
|
|
111
111
|
"""
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
.
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
if model_sel.scores:
|
|
143
|
-
for score in model_sel.scores:
|
|
144
|
-
if score.type == 'validation': # Use validation scores
|
|
145
|
-
if score.accuracy is not None:
|
|
146
|
-
scores["accuracy"] = score.accuracy
|
|
147
|
-
if score.f1 is not None:
|
|
148
|
-
scores["f1"] = score.f1
|
|
149
|
-
if score.roc_auc is not None:
|
|
150
|
-
scores["roc_auc"] = score.roc_auc
|
|
151
|
-
break
|
|
152
|
-
|
|
153
|
-
comparison[exp.name] = scores
|
|
154
|
-
|
|
155
|
-
return comparison
|
|
156
|
-
|
|
157
|
-
except Exception as e:
|
|
158
|
-
return {"error": f"Error comparing experiment scores: {str(e)}"}
|
|
159
|
-
finally:
|
|
160
|
-
db.close()
|
|
161
|
-
|
|
112
|
+
# Get all experiments with the given name pattern
|
|
113
|
+
experiments = self.list_experiments(name=name)
|
|
114
|
+
|
|
115
|
+
if not experiments:
|
|
116
|
+
return {"error": f"No experiments found with name containing '{name}'"}
|
|
117
|
+
|
|
118
|
+
comparison = {}
|
|
119
|
+
|
|
120
|
+
for exp in experiments:
|
|
121
|
+
for model_sel in exp.experiment.model_selections:
|
|
122
|
+
|
|
123
|
+
if model_sel.best_score:
|
|
124
|
+
|
|
125
|
+
scores = {
|
|
126
|
+
"rmse": model_sel.best_score["rmse"],
|
|
127
|
+
"logloss": model_sel.best_score["logloss"],
|
|
128
|
+
"accuracy": model_sel.best_score["accuracy"],
|
|
129
|
+
"f1": model_sel.best_score["f1"],
|
|
130
|
+
"roc_auc": model_sel.best_score["roc_auc"],
|
|
131
|
+
}
|
|
132
|
+
target_name = model_sel.target.name
|
|
133
|
+
|
|
134
|
+
comparison[exp.experiment.name][target_name] = scores
|
|
135
|
+
else:
|
|
136
|
+
logger.warning(
|
|
137
|
+
f"No best score found for experiment {exp.experiment.name} and target {model_sel.target.name}"
|
|
138
|
+
)
|
|
139
|
+
|
|
140
|
+
return comparison
|
|
141
|
+
|
|
162
142
|
def list_experiments(
|
|
163
143
|
self, name: str = None, limit: int = 1000
|
|
164
144
|
) -> list["ExperimentEngine"]:
|
|
@@ -328,12 +308,12 @@ class ExperimentEngine:
|
|
|
328
308
|
def feature_engineering(self, data, for_training=True):
|
|
329
309
|
app = FeatureEngineeringEngine(
|
|
330
310
|
data=data,
|
|
331
|
-
columns_drop=self
|
|
332
|
-
columns_boolean=self
|
|
333
|
-
columns_date=self
|
|
334
|
-
columns_te_groupby=self
|
|
335
|
-
columns_te_target=self
|
|
336
|
-
for_training=for_training,
|
|
311
|
+
columns_drop=getattr(self, "columns_drop", []),
|
|
312
|
+
columns_boolean=getattr(self, "columns_boolean", []),
|
|
313
|
+
columns_date=getattr(self, "columns_date", []),
|
|
314
|
+
columns_te_groupby=getattr(self, "columns_te_groupby", []),
|
|
315
|
+
columns_te_target=getattr(self, "columns_te_target", []),
|
|
316
|
+
for_training=getattr(self, "for_training", True),
|
|
337
317
|
)
|
|
338
318
|
data = app.run()
|
|
339
319
|
return data
|
|
@@ -341,19 +321,21 @@ class ExperimentEngine:
|
|
|
341
321
|
def preprocess_feature(self, data, for_training=True):
|
|
342
322
|
app = PreprocessFeature(
|
|
343
323
|
data=data,
|
|
344
|
-
experiment=self
|
|
345
|
-
time_series=self
|
|
346
|
-
date_column=self
|
|
347
|
-
group_column=self
|
|
348
|
-
val_size=self
|
|
349
|
-
test_size=self
|
|
350
|
-
columns_pca=self
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
324
|
+
experiment=getattr(self, "experiment", None),
|
|
325
|
+
time_series=getattr(self, "time_series", False),
|
|
326
|
+
date_column=getattr(self, "date_column", None),
|
|
327
|
+
group_column=getattr(self, "group_column", None),
|
|
328
|
+
val_size=getattr(self, "val_size", 0.2),
|
|
329
|
+
test_size=getattr(self, "test_size", 0.2),
|
|
330
|
+
columns_pca=getattr(self, "columns_pca", []),
|
|
331
|
+
pca_temporal=getattr(self, "pca_temporal", []),
|
|
332
|
+
pca_cross_sectional=getattr(self, "pca_cross_sectional", []),
|
|
333
|
+
columns_onehot=getattr(self, "columns_onehot", []),
|
|
334
|
+
columns_binary=getattr(self, "columns_binary", []),
|
|
335
|
+
columns_ordinal=getattr(self, "columns_ordinal", []),
|
|
336
|
+
columns_frequency=getattr(self, "columns_frequency", []),
|
|
337
|
+
target_numbers=getattr(self, "target_numbers", []),
|
|
338
|
+
target_clf=getattr(self, "target_clf", []),
|
|
357
339
|
)
|
|
358
340
|
if for_training:
|
|
359
341
|
train, val, test = app.run()
|
|
@@ -385,14 +367,14 @@ class ExperimentEngine:
|
|
|
385
367
|
train=train,
|
|
386
368
|
val=val,
|
|
387
369
|
test=test,
|
|
388
|
-
experiment=self
|
|
389
|
-
target_numbers=self
|
|
390
|
-
target_clf=self
|
|
391
|
-
models_idx=self
|
|
392
|
-
time_series=self
|
|
393
|
-
max_timesteps=self
|
|
394
|
-
date_column=self
|
|
395
|
-
group_column=self
|
|
370
|
+
experiment=getattr(self, "experiment", None),
|
|
371
|
+
target_numbers=getattr(self, "target_numbers", []),
|
|
372
|
+
target_clf=getattr(self, "target_clf", []),
|
|
373
|
+
models_idx=getattr(self, "models_idx", []),
|
|
374
|
+
time_series=getattr(self, "time_series", False),
|
|
375
|
+
max_timesteps=getattr(self, "max_timesteps", 120),
|
|
376
|
+
date_column=getattr(self, "date_column", None),
|
|
377
|
+
group_column=getattr(self, "group_column", None),
|
|
396
378
|
)
|
|
397
379
|
if for_training:
|
|
398
380
|
data, reshaped_data = app.run()
|
|
@@ -407,13 +389,13 @@ class ExperimentEngine:
|
|
|
407
389
|
data=data,
|
|
408
390
|
reshaped_data=reshaped_data,
|
|
409
391
|
target_number=target_number,
|
|
410
|
-
experiment=self
|
|
411
|
-
target_clf=self
|
|
412
|
-
models_idx=self
|
|
413
|
-
time_series=self
|
|
414
|
-
date_column=self
|
|
415
|
-
group_column=self
|
|
416
|
-
target_clf_thresholds=self
|
|
392
|
+
experiment=getattr(self, "experiment", None),
|
|
393
|
+
target_clf=getattr(self, "target_clf", []),
|
|
394
|
+
models_idx=getattr(self, "models_idx", []),
|
|
395
|
+
time_series=getattr(self, "time_series", False),
|
|
396
|
+
date_column=getattr(self, "date_column", None),
|
|
397
|
+
group_column=getattr(self, "group_column", None),
|
|
398
|
+
target_clf_thresholds=getattr(self, "target_clf_thresholds", {}),
|
|
417
399
|
)
|
|
418
400
|
if best_params and target_number not in best_params.keys():
|
|
419
401
|
raise ValueError(
|
lecrapaud/db/alembic/versions/2025_08_25_1434-7ed9963e732f_add_best_score_to_model_selection.py
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
"""add best_score to model selection
|
|
2
|
+
|
|
3
|
+
Revision ID: 7ed9963e732f
|
|
4
|
+
Revises: 72aa496ca65b
|
|
5
|
+
Create Date: 2025-08-25 14:34:58.866912
|
|
6
|
+
|
|
7
|
+
"""
|
|
8
|
+
from typing import Sequence, Union
|
|
9
|
+
|
|
10
|
+
from alembic import op
|
|
11
|
+
import sqlalchemy as sa
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
# revision identifiers, used by Alembic.
|
|
15
|
+
revision: str = '7ed9963e732f'
|
|
16
|
+
down_revision: Union[str, None] = '72aa496ca65b'
|
|
17
|
+
branch_labels: Union[str, Sequence[str], None] = None
|
|
18
|
+
depends_on: Union[str, Sequence[str], None] = None
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def upgrade() -> None:
|
|
22
|
+
# ### commands auto generated by Alembic - please adjust! ###
|
|
23
|
+
op.add_column('lecrapaud_model_selections', sa.Column('best_score', sa.JSON(), nullable=True))
|
|
24
|
+
# ### end Alembic commands ###
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def downgrade() -> None:
|
|
28
|
+
# ### commands auto generated by Alembic - please adjust! ###
|
|
29
|
+
op.drop_column('lecrapaud_model_selections', 'best_score')
|
|
30
|
+
# ### end Alembic commands ###
|
|
@@ -25,6 +25,7 @@ from lecrapaud.db.models.score import Score
|
|
|
25
25
|
|
|
26
26
|
from lecrapaud.db.models.base import Base, with_db
|
|
27
27
|
from lecrapaud.db.models.utils import create_association_table
|
|
28
|
+
from lecrapaud.utils import logger
|
|
28
29
|
|
|
29
30
|
# jointures
|
|
30
31
|
lecrapaud_experiment_target_association = create_association_table(
|
|
@@ -241,7 +242,8 @@ class Experiment(Base):
|
|
|
241
242
|
# This ensures we're comparing apples to apples by normalizing the scores
|
|
242
243
|
experiments = db.query(cls).filter(cls.name.ilike(f"%{name}%")).all()
|
|
243
244
|
if not experiments:
|
|
244
|
-
|
|
245
|
+
logger.error(f"No experiments found with the given name: {name}")
|
|
246
|
+
return None
|
|
245
247
|
|
|
246
248
|
# Get all scores
|
|
247
249
|
rmse_scores = [e.avg_rmse for e in experiments if e.avg_rmse is not None]
|
|
@@ -250,9 +252,10 @@ class Experiment(Base):
|
|
|
250
252
|
]
|
|
251
253
|
|
|
252
254
|
if not rmse_scores or not logloss_scores:
|
|
253
|
-
|
|
255
|
+
logger.error(
|
|
254
256
|
"No experiments found with both RMSE and LogLoss scores. Maybe try with only one metric."
|
|
255
257
|
)
|
|
258
|
+
return None
|
|
256
259
|
|
|
257
260
|
# Normalize scores (subtract min and divide by range)
|
|
258
261
|
min_rmse = min(rmse_scores)
|
|
@@ -306,80 +309,90 @@ class Experiment(Base):
|
|
|
306
309
|
def best_score(self, target_number: int) -> dict:
|
|
307
310
|
"""
|
|
308
311
|
Returns the scores for the best model of the specified target.
|
|
309
|
-
|
|
312
|
+
|
|
310
313
|
Args:
|
|
311
314
|
target_number (int): The target number to get scores for
|
|
312
|
-
|
|
315
|
+
|
|
313
316
|
Returns:
|
|
314
317
|
dict: A dictionary containing the experiment name, target number, and the best model's scores
|
|
315
318
|
"""
|
|
316
319
|
# Find the target
|
|
317
320
|
target_name = f"TARGET_{target_number}"
|
|
318
321
|
target = next((t for t in self.targets if t.name == target_name), None)
|
|
319
|
-
|
|
322
|
+
|
|
320
323
|
if not target:
|
|
321
324
|
return {
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
325
|
+
"experiment_name": self.name,
|
|
326
|
+
"target_number": target_number,
|
|
327
|
+
"error": f"Target {target_name} not found in this experiment",
|
|
328
|
+
"scores": {},
|
|
326
329
|
}
|
|
327
|
-
|
|
330
|
+
|
|
328
331
|
# Find the best model selection for this target
|
|
329
332
|
best_model_selection = next(
|
|
330
|
-
(ms for ms in self.model_selections if ms.target_id == target.id),
|
|
331
|
-
None
|
|
333
|
+
(ms for ms in self.model_selections if ms.target_id == target.id), None
|
|
332
334
|
)
|
|
333
|
-
|
|
335
|
+
|
|
334
336
|
if not best_model_selection or not best_model_selection.model_trainings:
|
|
335
337
|
return {
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
338
|
+
"experiment_name": self.name,
|
|
339
|
+
"target_number": target_number,
|
|
340
|
+
"error": "No model found for this target",
|
|
341
|
+
"scores": {},
|
|
340
342
|
}
|
|
341
|
-
|
|
343
|
+
|
|
342
344
|
# Get the best model training (assuming the first one is the best)
|
|
343
345
|
best_training = best_model_selection.model_trainings[0]
|
|
344
|
-
|
|
346
|
+
|
|
345
347
|
# Get the validation score for this training
|
|
346
|
-
validation_scores = [s for s in best_training.score if s.type ==
|
|
347
|
-
|
|
348
|
+
validation_scores = [s for s in best_training.score if s.type == "validation"]
|
|
349
|
+
|
|
348
350
|
if not validation_scores:
|
|
349
351
|
return {
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
352
|
+
"experiment_name": self.name,
|
|
353
|
+
"target_number": target_number,
|
|
354
|
+
"error": "No validation scores found for the best model",
|
|
355
|
+
"scores": {},
|
|
354
356
|
}
|
|
355
|
-
|
|
357
|
+
|
|
356
358
|
# Get all available metrics from the first validation score
|
|
357
359
|
score = validation_scores[0]
|
|
358
360
|
available_metrics = [
|
|
359
|
-
|
|
360
|
-
|
|
361
|
+
"rmse",
|
|
362
|
+
"mae",
|
|
363
|
+
"r2",
|
|
364
|
+
"logloss",
|
|
365
|
+
"accuracy",
|
|
366
|
+
"precision",
|
|
367
|
+
"recall",
|
|
368
|
+
"f1",
|
|
369
|
+
"roc_auc",
|
|
361
370
|
]
|
|
362
|
-
|
|
371
|
+
|
|
363
372
|
scores = {}
|
|
364
373
|
for metric in available_metrics:
|
|
365
374
|
value = getattr(score, metric, None)
|
|
366
375
|
if value is not None:
|
|
367
376
|
scores[metric] = value
|
|
368
|
-
|
|
377
|
+
|
|
369
378
|
# Get the model info
|
|
370
379
|
model_info = {
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
380
|
+
"model_type": (
|
|
381
|
+
best_training.model.model_type if best_training.model else "unknown"
|
|
382
|
+
),
|
|
383
|
+
"model_name": (
|
|
384
|
+
best_training.model.name if best_training.model else "unknown"
|
|
385
|
+
),
|
|
386
|
+
"training_time_seconds": best_training.training_time,
|
|
374
387
|
}
|
|
375
|
-
|
|
388
|
+
|
|
376
389
|
return {
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
|
|
390
|
+
"experiment_name": self.name,
|
|
391
|
+
"target_number": target_number,
|
|
392
|
+
"model": model_info,
|
|
393
|
+
"scores": scores,
|
|
381
394
|
}
|
|
382
|
-
|
|
395
|
+
|
|
383
396
|
def get_features(self, target_number: int):
|
|
384
397
|
targets = [t for t in self.targets if t.name == f"TARGET_{target_number}"]
|
|
385
398
|
if targets:
|
lecrapaud/feature_engineering.py
CHANGED
|
@@ -52,6 +52,9 @@ import os
|
|
|
52
52
|
|
|
53
53
|
from sklearn.compose import ColumnTransformer
|
|
54
54
|
from sklearn.decomposition import PCA
|
|
55
|
+
from sklearn.impute import SimpleImputer
|
|
56
|
+
from sklearn.preprocessing import StandardScaler
|
|
57
|
+
from sklearn.pipeline import Pipeline
|
|
55
58
|
from category_encoders import BinaryEncoder, CountEncoder
|
|
56
59
|
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
|
|
57
60
|
from sklearn.model_selection import train_test_split
|
|
@@ -316,6 +319,8 @@ class PreprocessFeature:
|
|
|
316
319
|
val_size: float = 0.2,
|
|
317
320
|
test_size: float = 0.2,
|
|
318
321
|
columns_pca: list[str] = [],
|
|
322
|
+
pca_temporal: list[dict[str, list[str]]] = [],
|
|
323
|
+
pca_cross_sectional: list[dict[str, list[str]]] = [],
|
|
319
324
|
columns_onehot: list[str] = [],
|
|
320
325
|
columns_binary: list[str] = [],
|
|
321
326
|
columns_ordinal: list[str] = [],
|
|
@@ -329,6 +334,8 @@ class PreprocessFeature:
|
|
|
329
334
|
|
|
330
335
|
self.experiment = experiment
|
|
331
336
|
self.columns_pca = [col.upper() for col in columns_pca]
|
|
337
|
+
self.pca_temporal = pca_temporal
|
|
338
|
+
self.pca_cross_sectional = pca_cross_sectional
|
|
332
339
|
self.columns_onehot = [col.upper() for col in columns_onehot]
|
|
333
340
|
self.columns_binary = [col.upper() for col in columns_binary]
|
|
334
341
|
self.columns_ordinal = [col.upper() for col in columns_ordinal]
|
|
@@ -364,6 +371,20 @@ class PreprocessFeature:
|
|
|
364
371
|
|
|
365
372
|
joblib.dump(pcas, f"{self.preprocessing_dir}/pcas.pkl")
|
|
366
373
|
|
|
374
|
+
train, pcas_cross_sectional = self.add_pca_feature_cross_sectional(train)
|
|
375
|
+
val, _ = self.add_pca_feature_cross_sectional(val, pcas=pcas_cross_sectional)
|
|
376
|
+
test, _ = self.add_pca_feature_cross_sectional(test, pcas=pcas_cross_sectional)
|
|
377
|
+
|
|
378
|
+
joblib.dump(
|
|
379
|
+
pcas_cross_sectional, f"{self.preprocessing_dir}/pcas_cross_sectional.pkl"
|
|
380
|
+
)
|
|
381
|
+
|
|
382
|
+
train, pcas_temporal = self.add_pca_feature_temporal(train)
|
|
383
|
+
val, _ = self.add_pca_feature_temporal(val, pcas=pcas_temporal)
|
|
384
|
+
test, _ = self.add_pca_feature_temporal(test, pcas=pcas_temporal)
|
|
385
|
+
|
|
386
|
+
joblib.dump(pcas_temporal, f"{self.preprocessing_dir}/pcas_temporal.pkl")
|
|
387
|
+
|
|
367
388
|
# Save all features before encoding
|
|
368
389
|
joblib.dump(
|
|
369
390
|
list(train.columns),
|
|
@@ -402,6 +423,18 @@ class PreprocessFeature:
|
|
|
402
423
|
pcas = joblib.load(f"{self.preprocessing_dir}/pcas.pkl")
|
|
403
424
|
data, _ = self.add_pca_features(data, pcas=pcas)
|
|
404
425
|
|
|
426
|
+
if os.path.exists(f"{self.preprocessing_dir}/pcas_cross_sectional.pkl"):
|
|
427
|
+
pcas_cross_sectional = joblib.load(
|
|
428
|
+
f"{self.preprocessing_dir}/pcas_cross_sectional.pkl"
|
|
429
|
+
)
|
|
430
|
+
data, _ = self.add_pca_feature_cross_sectional(
|
|
431
|
+
data, pcas=pcas_cross_sectional
|
|
432
|
+
)
|
|
433
|
+
|
|
434
|
+
if os.path.exists(f"{self.preprocessing_dir}/pcas_temporal.pkl"):
|
|
435
|
+
pcas_temporal = joblib.load(f"{self.preprocessing_dir}/pcas_temporal.pkl")
|
|
436
|
+
data, _ = self.add_pca_feature_temporal(data, pcas=pcas_temporal)
|
|
437
|
+
|
|
405
438
|
# Encoding
|
|
406
439
|
transformer = joblib.load(f"{self.preprocessing_dir}/column_transformer.pkl")
|
|
407
440
|
data, _ = self.encode_categorical_features(
|
|
@@ -577,6 +610,120 @@ class PreprocessFeature:
|
|
|
577
610
|
|
|
578
611
|
return df, pcas_dict
|
|
579
612
|
|
|
613
|
+
def add_pca_feature_cross_sectional(
|
|
614
|
+
self,
|
|
615
|
+
df: pd.DataFrame,
|
|
616
|
+
*,
|
|
617
|
+
n_components: int = 5,
|
|
618
|
+
pcas: dict[str, Pipeline] | None = None, # si fourni: transform only
|
|
619
|
+
impute_strategy: str = "median",
|
|
620
|
+
standardize: bool = True,
|
|
621
|
+
) -> tuple[pd.DataFrame, dict[str, Pipeline]]:
|
|
622
|
+
"""
|
|
623
|
+
Construit un pivot (index=index_col, columns=columns_col, values=value_col),
|
|
624
|
+
fit (ou réutilise) un Pipeline Imputer(+Scaler)+PCA, puis merge les scores
|
|
625
|
+
(par index_col) dans df. Renvoie (df_avec_features, pipe).
|
|
626
|
+
"""
|
|
627
|
+
|
|
628
|
+
pcas_dict = {}
|
|
629
|
+
|
|
630
|
+
for pca_cross_sectional in self.pca_cross_sectional:
|
|
631
|
+
name, index_col, columns_col, value_col = (
|
|
632
|
+
pca_cross_sectional[k] for k in ("name", "index", "columns", "value")
|
|
633
|
+
)
|
|
634
|
+
prefix = f"CS_PC_{name}"
|
|
635
|
+
|
|
636
|
+
pivot = df.pivot_table(
|
|
637
|
+
index=index_col, columns=columns_col, values=value_col
|
|
638
|
+
).sort_index()
|
|
639
|
+
|
|
640
|
+
# Pipeline à réutiliser entre train et test
|
|
641
|
+
if pcas is None:
|
|
642
|
+
steps = [("imputer", SimpleImputer(strategy=impute_strategy))]
|
|
643
|
+
if standardize:
|
|
644
|
+
steps.append(
|
|
645
|
+
("scaler", StandardScaler(with_mean=True, with_std=True))
|
|
646
|
+
)
|
|
647
|
+
pca = PCA(n_components=n_components, random_state=0)
|
|
648
|
+
steps.append(("pca", pca))
|
|
649
|
+
pipe = Pipeline(steps)
|
|
650
|
+
pipe.fit(pivot) # <- fit sur TRAIN uniquement
|
|
651
|
+
else:
|
|
652
|
+
pipe = pcas[name] # <- TEST : on réutilise le pipe existant
|
|
653
|
+
|
|
654
|
+
scores = pipe.transform(pivot) # shape: (n_index, n_components)
|
|
655
|
+
cols = [f"{prefix}_{i}" for i in range(n_components)]
|
|
656
|
+
scores_df = pd.DataFrame(scores, index=pivot.index, columns=cols)
|
|
657
|
+
|
|
658
|
+
df = df.merge(scores_df.reset_index(), on=index_col, how="left")
|
|
659
|
+
pcas_dict.update({name: pipe})
|
|
660
|
+
|
|
661
|
+
return df, pcas_dict
|
|
662
|
+
|
|
663
|
+
# ----------------- 2) PCA TEMPORELLE (liste de colonnes lags) ----------------
|
|
664
|
+
def add_pca_feature_temporal(
|
|
665
|
+
self,
|
|
666
|
+
df: pd.DataFrame,
|
|
667
|
+
*,
|
|
668
|
+
n_components: int = 5,
|
|
669
|
+
pcas: dict[str, Pipeline] | None = None, # si fourni: transform only
|
|
670
|
+
impute_strategy: (
|
|
671
|
+
str | None
|
|
672
|
+
) = None, # None = on exige toutes les colonnes présentes
|
|
673
|
+
standardize: bool = True,
|
|
674
|
+
) -> tuple[pd.DataFrame, dict[str, Pipeline]]:
|
|
675
|
+
"""
|
|
676
|
+
Applique une PCA sur une matrice (rows = lignes df, cols = lags).
|
|
677
|
+
Fit le Pipeline sur TRAIN si pcas=None; sinon, utilise pcas et fait transform.
|
|
678
|
+
Ajoute les colonnes f"{prefix}_{i}" dans df. Renvoie (df, pipe).
|
|
679
|
+
"""
|
|
680
|
+
pcas_dict = {}
|
|
681
|
+
|
|
682
|
+
for pca_temporal in self.pca_temporal:
|
|
683
|
+
name, cols = (pca_temporal[k] for k in ("name", "columns"))
|
|
684
|
+
prefix = f"TMP_PC_{name}"
|
|
685
|
+
|
|
686
|
+
# Masque des lignes utilisables
|
|
687
|
+
if impute_strategy is None:
|
|
688
|
+
mask = (
|
|
689
|
+
df[cols].notna().all(axis=1)
|
|
690
|
+
) # on n'impute pas → lignes complètes
|
|
691
|
+
X_fit = df.loc[mask, cols]
|
|
692
|
+
else:
|
|
693
|
+
mask = df[cols].notna().any(axis=1) # on imputera → au moins une valeur
|
|
694
|
+
X_fit = df.loc[mask, cols]
|
|
695
|
+
|
|
696
|
+
# Pipeline
|
|
697
|
+
if pcas is None:
|
|
698
|
+
steps = []
|
|
699
|
+
if impute_strategy is not None:
|
|
700
|
+
steps.append(("imputer", SimpleImputer(strategy=impute_strategy)))
|
|
701
|
+
if standardize:
|
|
702
|
+
steps.append(
|
|
703
|
+
("scaler", StandardScaler(with_mean=True, with_std=True))
|
|
704
|
+
)
|
|
705
|
+
pca = PCA(n_components=n_components, random_state=0)
|
|
706
|
+
steps.append(("pca", pca))
|
|
707
|
+
pipe = Pipeline(steps)
|
|
708
|
+
if not X_fit.empty:
|
|
709
|
+
pipe.fit(X_fit) # <- fit sur TRAIN uniquement
|
|
710
|
+
else:
|
|
711
|
+
pipe = pcas[name] # <- TEST
|
|
712
|
+
|
|
713
|
+
# Transform uniquement sur lignes valides (mask)
|
|
714
|
+
if not df.loc[mask, cols].empty:
|
|
715
|
+
Z = pipe.transform(df.loc[mask, cols])
|
|
716
|
+
for i in range(n_components):
|
|
717
|
+
df.loc[mask, f"{prefix}_{i}"] = Z[:, i]
|
|
718
|
+
else:
|
|
719
|
+
# crée les colonnes vides si aucune ligne valide (cohérence de schéma)
|
|
720
|
+
for i in range(n_components):
|
|
721
|
+
df[f"{prefix}_{i}"] = pd.NA
|
|
722
|
+
|
|
723
|
+
pcas_dict.update({name: pipe})
|
|
724
|
+
|
|
725
|
+
return df, pcas_dict
|
|
726
|
+
|
|
580
727
|
# encoding categorical features
|
|
581
728
|
def encode_categorical_features(
|
|
582
729
|
self,
|
lecrapaud/model_selection.py
CHANGED
|
@@ -1093,6 +1093,7 @@ class ModelSelectionEngine:
|
|
|
1093
1093
|
best_model_params = json.load(f)[best_model_name]
|
|
1094
1094
|
|
|
1095
1095
|
# Save model_selection results to db
|
|
1096
|
+
|
|
1096
1097
|
model_selection = ModelSelection.get(model_selection.id)
|
|
1097
1098
|
model_selection.best_model_id = Model.find_by(
|
|
1098
1099
|
name=best_score_overall["MODEL_NAME"], type=self.target_type
|
|
@@ -1100,6 +1101,17 @@ class ModelSelectionEngine:
|
|
|
1100
1101
|
model_selection.best_model_params = best_model_params
|
|
1101
1102
|
model_selection.best_thresholds = best_thresholds
|
|
1102
1103
|
model_selection.best_model_path = best_model_path
|
|
1104
|
+
|
|
1105
|
+
drop_cols = [
|
|
1106
|
+
"DATE",
|
|
1107
|
+
"MODEL_NAME",
|
|
1108
|
+
"MODEL_PATH",
|
|
1109
|
+
]
|
|
1110
|
+
best_score_overall = {
|
|
1111
|
+
k: v for k, v in best_score_overall.items() if k not in drop_cols
|
|
1112
|
+
}
|
|
1113
|
+
score_data = {k.lower(): v for k, v in best_score_overall.items()}
|
|
1114
|
+
model_selection.best_score = score_data
|
|
1103
1115
|
model_selection.save()
|
|
1104
1116
|
|
|
1105
1117
|
logger.info(f"Best model overall is : {best_score_overall}")
|
|
@@ -1781,7 +1793,17 @@ def find_best_threshold(
|
|
|
1781
1793
|
logger.warning(
|
|
1782
1794
|
f"[Class {cls}] No threshold with precision ≥ {target_value}"
|
|
1783
1795
|
)
|
|
1784
|
-
|
|
1796
|
+
# fallback: meilleure precision parmi ceux avec recall>0
|
|
1797
|
+
cand = np.where(recall > 0)[0]
|
|
1798
|
+
if cand.size:
|
|
1799
|
+
best_idx = cand[int(np.argmax(precision[cand]))]
|
|
1800
|
+
logger.warning(
|
|
1801
|
+
f"[Class {cls}] Fallback to best precision with recall>0: "
|
|
1802
|
+
f"idx={best_idx}, precision={precision[best_idx]:.4f}, recall={recall[best_idx]:.4f}"
|
|
1803
|
+
)
|
|
1804
|
+
else:
|
|
1805
|
+
logger.error(f"[Class {cls}] No threshold achieves recall>0.")
|
|
1806
|
+
best_idx = None
|
|
1785
1807
|
|
|
1786
1808
|
elif metric == "f1":
|
|
1787
1809
|
valid_indices = [i for i, val in enumerate(f1) if val >= target_value]
|
|
@@ -1795,6 +1817,15 @@ def find_best_threshold(
|
|
|
1795
1817
|
else:
|
|
1796
1818
|
best_idx = int(np.argmax(values)) # no constraint, get best value
|
|
1797
1819
|
|
|
1820
|
+
if best_idx is None:
|
|
1821
|
+
results[cls_str] = {
|
|
1822
|
+
"threshold": None,
|
|
1823
|
+
"precision": None,
|
|
1824
|
+
"recall": None,
|
|
1825
|
+
"f1": None,
|
|
1826
|
+
}
|
|
1827
|
+
continue
|
|
1828
|
+
|
|
1798
1829
|
results[cls_str] = {
|
|
1799
1830
|
"threshold": float(thresholds[best_idx]),
|
|
1800
1831
|
"precision": float(precision[best_idx]),
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
lecrapaud/__init__.py,sha256=oCxbtw_nk8rlOXbXbWo0RRMlsh6w-hTiZ6e5PRG_wp0,28
|
|
2
|
-
lecrapaud/api.py,sha256=
|
|
2
|
+
lecrapaud/api.py,sha256=CJJeFvO-5jPRsVpLIgKJ34JpOXqZSs4RowLnPBwxrDs,22463
|
|
3
3
|
lecrapaud/config.py,sha256=itiqC31HB8i2Xo-kn2viCQrg_9tnA07-TJuZ-xdnx44,1126
|
|
4
4
|
lecrapaud/db/__init__.py,sha256=82o9fMfaqKXPh2_rt44EzNRVZV1R4LScEnQYvj_TjK0,34
|
|
5
5
|
lecrapaud/db/alembic/README,sha256=MVlc9TYmr57RbhXET6QxgyCcwWP7w-vLkEsirENqiIQ,38
|
|
@@ -9,15 +9,16 @@ lecrapaud/db/alembic/versions/2025_06_23_1748-f089dfb7e3ba_.py,sha256=hyPW0Mt_B4
|
|
|
9
9
|
lecrapaud/db/alembic/versions/2025_06_24_1216-c62251b129ed_.py,sha256=6Pf36HAXEVrVlnrohAe2O7gVaXpDiv3LLIP_EEgTyA0,917
|
|
10
10
|
lecrapaud/db/alembic/versions/2025_06_24_1711-86457e2f333f_.py,sha256=KjwjYvFaNqYmBLTYel8As37fyaBtNVWTqN_3M7y_2eI,1357
|
|
11
11
|
lecrapaud/db/alembic/versions/2025_06_25_1759-72aa496ca65b_.py,sha256=MiqooJuZ1etExl2he3MniaEv8G0LrmqY-0m22m9xKmc,943
|
|
12
|
+
lecrapaud/db/alembic/versions/2025_08_25_1434-7ed9963e732f_add_best_score_to_model_selection.py,sha256=dzPelNA8N1f8rxUAF9KeoRx3FPvcTKshgcKyq_woe8c,858
|
|
12
13
|
lecrapaud/db/alembic.ini,sha256=Zw2rdwsKV6c7J1SPtoFIPDX08_oTP3MuUKnNxBDiY8I,3796
|
|
13
14
|
lecrapaud/db/models/__init__.py,sha256=Lhyw9fVLdom0Fc6yIP-ip8FjkU1EwVwjae5q2VM815Q,740
|
|
14
15
|
lecrapaud/db/models/base.py,sha256=J9ew-0z_-tnWAwhVvOmVDys2R6jPF_oSca_ny6wpXQE,7606
|
|
15
|
-
lecrapaud/db/models/experiment.py,sha256=
|
|
16
|
+
lecrapaud/db/models/experiment.py,sha256=rgNpCNXMei5VhJDNKxelpwqv7iTxoPJ2kkffGaua2sA,14710
|
|
16
17
|
lecrapaud/db/models/feature.py,sha256=5o77O2FyRObnLOCGNj8kaPSGM3pLv1Ov6mXXHYkmnYY,1136
|
|
17
18
|
lecrapaud/db/models/feature_selection.py,sha256=mk42xuw1Sm_7Pznfg7TNc5_S4hscdw79QgIe3Bt9ZRI,3245
|
|
18
19
|
lecrapaud/db/models/feature_selection_rank.py,sha256=Ydsb_rAT58FoSH13wkGjGPByzsjPx3DITXgJ2jgZmow,2198
|
|
19
20
|
lecrapaud/db/models/model.py,sha256=F0hyMjd4FFHCv6_arIWBEmBCGOfG3b6_uzU8ExtFE90,952
|
|
20
|
-
lecrapaud/db/models/model_selection.py,sha256=
|
|
21
|
+
lecrapaud/db/models/model_selection.py,sha256=tJuICcporf3TxQHbJbHxnKgkaVc02z2kJJoCYS2nDcw,2001
|
|
21
22
|
lecrapaud/db/models/model_training.py,sha256=jAIYPdwBln2jf593soLQ730uYrTfNK8zdG8TesOqmh0,1698
|
|
22
23
|
lecrapaud/db/models/score.py,sha256=fSfXLt6Dm-8Fy9ku0urMT5Fa6zNqn4YqVnEO4o3zKVI,1669
|
|
23
24
|
lecrapaud/db/models/target.py,sha256=DKnfeaLU8eT8J_oh_vuFo5-o1CaoXR13xBbswme6Bgk,1649
|
|
@@ -25,7 +26,7 @@ lecrapaud/db/models/utils.py,sha256=-a-nWWmpJ2XzidIxo2COVUTrGZIPYCfBzjhcszJj_bM,
|
|
|
25
26
|
lecrapaud/db/session.py,sha256=E93WXcFFILFAIeH61ft2Egs7D-6caqs0oi4zCkO5Lq4,2822
|
|
26
27
|
lecrapaud/directories.py,sha256=0LrANuDgbuneSLker60c6q2hmGnQ3mKHIztTGzTx6Gw,826
|
|
27
28
|
lecrapaud/experiment.py,sha256=1xLWjOrqAxJh9CdXOx9ppQuRFRRj0GH-xYZqg-ty9hI,2463
|
|
28
|
-
lecrapaud/feature_engineering.py,sha256=
|
|
29
|
+
lecrapaud/feature_engineering.py,sha256=ey46MqXBC-c-BS6nRA7zo8uafxmDABy5ThIyTfmXoSo,38982
|
|
29
30
|
lecrapaud/feature_selection.py,sha256=6ry-oVPQHbipm1XSE5YsH7AY0lQFt4CFbWiHiRs1nxg,43593
|
|
30
31
|
lecrapaud/integrations/openai_integration.py,sha256=hHLF3fk5Bps8KNbNrEL3NUFa945jwClE6LrLpuMZOd4,7459
|
|
31
32
|
lecrapaud/jobs/__init__.py,sha256=ZkrsyTOR21c_wN7RY8jPhm8jCrL1oCEtTsf3VFIlQiE,292
|
|
@@ -36,10 +37,10 @@ lecrapaud/misc/tabpfn_tests.ipynb,sha256=VkgsCUJ30d8jaL2VaWtQAgb8ngHPNtPgnXLs7QQ
|
|
|
36
37
|
lecrapaud/misc/test-gpu-bilstm.ipynb,sha256=4nLuZRJVe2kn6kEmauhRiz5wkWT9AVrYhI9CEk_dYUY,9608
|
|
37
38
|
lecrapaud/misc/test-gpu-resnet.ipynb,sha256=27Vu7nYwujYeh3fOxBNCnKJn3MXNPKZU-U8oDDUbymg,4944
|
|
38
39
|
lecrapaud/misc/test-gpu-transformers.ipynb,sha256=k6MBSs_Um1h4PykvE-LTBcdpbWLbIFST_xl_AFW2jgI,8444
|
|
39
|
-
lecrapaud/model_selection.py,sha256=
|
|
40
|
+
lecrapaud/model_selection.py,sha256=QOwOsn1WEBzR-2ZpHvhzv9Qz47delkBdNziHy-auY3o,72302
|
|
40
41
|
lecrapaud/search_space.py,sha256=-JkzuMhaomdwiWi4HvVQY5hiw3-oREemJA16tbwEIp4,34854
|
|
41
42
|
lecrapaud/utils.py,sha256=JdBB1NvbNIx4y0Una-kSZdo1_ZEocc5hwyYFIZKHmGg,8305
|
|
42
|
-
lecrapaud-0.
|
|
43
|
-
lecrapaud-0.
|
|
44
|
-
lecrapaud-0.
|
|
45
|
-
lecrapaud-0.
|
|
43
|
+
lecrapaud-0.18.0.dist-info/LICENSE,sha256=MImCryu0AnqhJE_uAZD-PIDKXDKb8sT7v0i1NOYeHTM,11350
|
|
44
|
+
lecrapaud-0.18.0.dist-info/METADATA,sha256=YtnShzFVl8EQOlwPI44gX2T67szZttyD62V1jZ9rFh0,11081
|
|
45
|
+
lecrapaud-0.18.0.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
|
|
46
|
+
lecrapaud-0.18.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|