lecrapaud 0.16.6__tar.gz → 0.17.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of lecrapaud might be problematic. Click here for more details.
- {lecrapaud-0.16.6 → lecrapaud-0.17.0}/PKG-INFO +1 -1
- {lecrapaud-0.16.6 → lecrapaud-0.17.0}/lecrapaud/api.py +64 -0
- {lecrapaud-0.16.6 → lecrapaud-0.17.0}/lecrapaud/db/models/experiment.py +77 -0
- {lecrapaud-0.16.6 → lecrapaud-0.17.0}/lecrapaud/feature_engineering.py +155 -1
- {lecrapaud-0.16.6 → lecrapaud-0.17.0}/lecrapaud/model_selection.py +113 -10
- {lecrapaud-0.16.6 → lecrapaud-0.17.0}/pyproject.toml +1 -1
- {lecrapaud-0.16.6 → lecrapaud-0.17.0}/LICENSE +0 -0
- {lecrapaud-0.16.6 → lecrapaud-0.17.0}/README.md +0 -0
- {lecrapaud-0.16.6 → lecrapaud-0.17.0}/lecrapaud/__init__.py +0 -0
- {lecrapaud-0.16.6 → lecrapaud-0.17.0}/lecrapaud/config.py +0 -0
- {lecrapaud-0.16.6 → lecrapaud-0.17.0}/lecrapaud/db/__init__.py +0 -0
- {lecrapaud-0.16.6 → lecrapaud-0.17.0}/lecrapaud/db/alembic/README +0 -0
- {lecrapaud-0.16.6 → lecrapaud-0.17.0}/lecrapaud/db/alembic/env.py +0 -0
- {lecrapaud-0.16.6 → lecrapaud-0.17.0}/lecrapaud/db/alembic/script.py.mako +0 -0
- {lecrapaud-0.16.6 → lecrapaud-0.17.0}/lecrapaud/db/alembic/versions/2025_06_23_1748-f089dfb7e3ba_.py +0 -0
- {lecrapaud-0.16.6 → lecrapaud-0.17.0}/lecrapaud/db/alembic/versions/2025_06_24_1216-c62251b129ed_.py +0 -0
- {lecrapaud-0.16.6 → lecrapaud-0.17.0}/lecrapaud/db/alembic/versions/2025_06_24_1711-86457e2f333f_.py +0 -0
- {lecrapaud-0.16.6 → lecrapaud-0.17.0}/lecrapaud/db/alembic/versions/2025_06_25_1759-72aa496ca65b_.py +0 -0
- {lecrapaud-0.16.6 → lecrapaud-0.17.0}/lecrapaud/db/alembic.ini +0 -0
- {lecrapaud-0.16.6 → lecrapaud-0.17.0}/lecrapaud/db/models/__init__.py +0 -0
- {lecrapaud-0.16.6 → lecrapaud-0.17.0}/lecrapaud/db/models/base.py +0 -0
- {lecrapaud-0.16.6 → lecrapaud-0.17.0}/lecrapaud/db/models/feature.py +0 -0
- {lecrapaud-0.16.6 → lecrapaud-0.17.0}/lecrapaud/db/models/feature_selection.py +0 -0
- {lecrapaud-0.16.6 → lecrapaud-0.17.0}/lecrapaud/db/models/feature_selection_rank.py +0 -0
- {lecrapaud-0.16.6 → lecrapaud-0.17.0}/lecrapaud/db/models/model.py +0 -0
- {lecrapaud-0.16.6 → lecrapaud-0.17.0}/lecrapaud/db/models/model_selection.py +0 -0
- {lecrapaud-0.16.6 → lecrapaud-0.17.0}/lecrapaud/db/models/model_training.py +0 -0
- {lecrapaud-0.16.6 → lecrapaud-0.17.0}/lecrapaud/db/models/score.py +0 -0
- {lecrapaud-0.16.6 → lecrapaud-0.17.0}/lecrapaud/db/models/target.py +0 -0
- {lecrapaud-0.16.6 → lecrapaud-0.17.0}/lecrapaud/db/models/utils.py +0 -0
- {lecrapaud-0.16.6 → lecrapaud-0.17.0}/lecrapaud/db/session.py +0 -0
- {lecrapaud-0.16.6 → lecrapaud-0.17.0}/lecrapaud/directories.py +0 -0
- {lecrapaud-0.16.6 → lecrapaud-0.17.0}/lecrapaud/experiment.py +0 -0
- {lecrapaud-0.16.6 → lecrapaud-0.17.0}/lecrapaud/feature_selection.py +0 -0
- {lecrapaud-0.16.6 → lecrapaud-0.17.0}/lecrapaud/integrations/openai_integration.py +0 -0
- {lecrapaud-0.16.6 → lecrapaud-0.17.0}/lecrapaud/jobs/__init__.py +0 -0
- {lecrapaud-0.16.6 → lecrapaud-0.17.0}/lecrapaud/jobs/config.py +0 -0
- {lecrapaud-0.16.6 → lecrapaud-0.17.0}/lecrapaud/jobs/scheduler.py +0 -0
- {lecrapaud-0.16.6 → lecrapaud-0.17.0}/lecrapaud/jobs/tasks.py +0 -0
- {lecrapaud-0.16.6 → lecrapaud-0.17.0}/lecrapaud/misc/tabpfn_tests.ipynb +0 -0
- {lecrapaud-0.16.6 → lecrapaud-0.17.0}/lecrapaud/misc/test-gpu-bilstm.ipynb +0 -0
- {lecrapaud-0.16.6 → lecrapaud-0.17.0}/lecrapaud/misc/test-gpu-resnet.ipynb +0 -0
- {lecrapaud-0.16.6 → lecrapaud-0.17.0}/lecrapaud/misc/test-gpu-transformers.ipynb +0 -0
- {lecrapaud-0.16.6 → lecrapaud-0.17.0}/lecrapaud/search_space.py +0 -0
- {lecrapaud-0.16.6 → lecrapaud-0.17.0}/lecrapaud/utils.py +0 -0
|
@@ -100,6 +100,68 @@ class LeCrapaud:
|
|
|
100
100
|
id=Experiment.get_best_by_score(name=name, metric=metric).id, **kwargs
|
|
101
101
|
)
|
|
102
102
|
|
|
103
|
+
def compare_experiment_scores(self, name: str):
    """Compare scores of experiments whose name contains ``name``.

    Args:
        name (str): Name or partial name of the experiments to compare.
            It is matched case-insensitively as a literal substring: ``%``
            and ``_`` inside ``name`` are escaped and do not act as SQL
            wildcards.

    Returns:
        dict: Maps each matching experiment name to a dict with the keys
            ``rmse``, ``logloss``, ``accuracy``, ``f1`` and ``roc_auc``
            (a value is ``None`` when the metric is unavailable). When no
            experiment matches, or when the lookup raises, a dict holding a
            single ``"error"`` message is returned instead.
    """
    from lecrapaud.db import SessionLocal
    from sqlalchemy.orm import joinedload

    # Escape LIKE metacharacters so that e.g. "exp_1" does not also match
    # "expA1"; the backslash itself must be escaped first.
    escaped_name = (
        name.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")
    )

    db = SessionLocal()
    try:
        # Load every matching experiment together with its model selections
        # and their scores in a single query.
        experiments = (
            db.query(Experiment)
            .options(
                joinedload(Experiment.model_selections).joinedload(
                    ModelSelection.scores
                )
            )
            .filter(Experiment.name.ilike(f"%{escaped_name}%", escape="\\"))
            .all()
        )

        if not experiments:
            return {"error": f"No experiments found with name containing '{name}'"}

        comparison = {}

        for exp in experiments:
            scores = {
                "rmse": exp.avg_rmse,
                "logloss": exp.avg_logloss,
                "accuracy": None,
                "f1": None,
                "roc_auc": None,
            }

            # Classification metrics come from the first model selection
            # that has scores; only its validation scores are considered.
            for model_sel in exp.model_selections:
                if not model_sel.scores:
                    continue
                for score in model_sel.scores:
                    if score.type != "validation":
                        continue
                    if score.accuracy is not None:
                        scores["accuracy"] = score.accuracy
                    if score.f1 is not None:
                        scores["f1"] = score.f1
                    if score.roc_auc is not None:
                        scores["roc_auc"] = score.roc_auc
                break

            # NOTE(review): results are keyed by experiment name, so two
            # matching experiments with the same name overwrite each other.
            comparison[exp.name] = scores

        return comparison

    except Exception as e:
        # Errors are reported through the return value rather than raised.
        return {"error": f"Error comparing experiment scores: {str(e)}"}
    finally:
        db.close()
|
|
164
|
+
|
|
103
165
|
def list_experiments(
|
|
104
166
|
self, name: str = None, limit: int = 1000
|
|
105
167
|
) -> list["ExperimentEngine"]:
|
|
@@ -289,6 +351,8 @@ class ExperimentEngine:
|
|
|
289
351
|
val_size=self.val_size,
|
|
290
352
|
test_size=self.test_size,
|
|
291
353
|
columns_pca=self.columns_pca,
|
|
354
|
+
pca_temporal=self.pca_temporal,
|
|
355
|
+
pca_cross_sectional=self.pca_cross_sectional,
|
|
292
356
|
columns_onehot=self.columns_onehot,
|
|
293
357
|
columns_binary=self.columns_binary,
|
|
294
358
|
columns_frequency=self.columns_frequency,
|
|
@@ -303,6 +303,83 @@ class Experiment(Base):
|
|
|
303
303
|
else:
|
|
304
304
|
raise ValueError("Invalid metric. Must be 'rmse', 'logloss', or 'both'.")
|
|
305
305
|
|
|
306
|
+
def best_score(self, target_number: int) -> dict:
    """
    Returns the validation scores for the best model of the specified target.

    Args:
        target_number (int): The target number to get scores for; the target
            looked up is the one named ``TARGET_<target_number>``.

    Returns:
        dict: On success, a dictionary with the keys ``experiment_name``,
            ``target_number``, ``model`` (model type, model name and training
            time) and ``scores`` (only the metrics whose value is not None).
            On failure, a dictionary with ``experiment_name``,
            ``target_number``, an ``error`` message and an empty ``scores``.
    """

    def _error(message: str) -> dict:
        # Single place that builds the failure payload shared by every
        # early exit below.
        return {
            'experiment_name': self.name,
            'target_number': target_number,
            'error': message,
            'scores': {},
        }

    # Find the target
    target_name = f"TARGET_{target_number}"
    target = next((t for t in self.targets if t.name == target_name), None)
    if not target:
        return _error(f'Target {target_name} not found in this experiment')

    # Find the model selection attached to this target
    best_model_selection = next(
        (ms for ms in self.model_selections if ms.target_id == target.id),
        None,
    )
    if not best_model_selection or not best_model_selection.model_trainings:
        return _error('No model found for this target')

    # NOTE(review): the first training is assumed to be the best one; nothing
    # in this method sorts or ranks the trainings - confirm against the schema.
    best_training = best_model_selection.model_trainings[0]

    # Only the first validation score of that training is reported.
    score = next(
        (s for s in best_training.score if s.type == 'validation'), None
    )
    if score is None:
        return _error('No validation scores found for the best model')

    available_metrics = [
        'rmse', 'mae', 'r2', 'logloss', 'accuracy',
        'precision', 'recall', 'f1', 'roc_auc',
    ]
    # Keep only the metrics that exist on the score object and are set.
    scores = {}
    for metric in available_metrics:
        value = getattr(score, metric, None)
        if value is not None:
            scores[metric] = value

    # Get the model info ('unknown' when the training has no model attached)
    model = best_training.model
    model_info = {
        'model_type': model.model_type if model else 'unknown',
        'model_name': model.name if model else 'unknown',
        'training_time_seconds': best_training.training_time,
    }

    return {
        'experiment_name': self.name,
        'target_number': target_number,
        'model': model_info,
        'scores': scores,
    }
|
|
382
|
+
|
|
306
383
|
def get_features(self, target_number: int):
|
|
307
384
|
targets = [t for t in self.targets if t.name == f"TARGET_{target_number}"]
|
|
308
385
|
if targets:
|
|
@@ -52,6 +52,9 @@ import os
|
|
|
52
52
|
|
|
53
53
|
from sklearn.compose import ColumnTransformer
|
|
54
54
|
from sklearn.decomposition import PCA
|
|
55
|
+
from sklearn.impute import SimpleImputer
|
|
56
|
+
from sklearn.preprocessing import StandardScaler
|
|
57
|
+
from sklearn.pipeline import Pipeline
|
|
55
58
|
from category_encoders import BinaryEncoder, CountEncoder
|
|
56
59
|
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
|
|
57
60
|
from sklearn.model_selection import train_test_split
|
|
@@ -316,6 +319,8 @@ class PreprocessFeature:
|
|
|
316
319
|
val_size: float = 0.2,
|
|
317
320
|
test_size: float = 0.2,
|
|
318
321
|
columns_pca: list[str] = [],
|
|
322
|
+
pca_temporal: dict[str, list[str]] = {},
|
|
323
|
+
pca_cross_sectional: dict[str, list[str]] = {},
|
|
319
324
|
columns_onehot: list[str] = [],
|
|
320
325
|
columns_binary: list[str] = [],
|
|
321
326
|
columns_ordinal: list[str] = [],
|
|
@@ -329,6 +334,8 @@ class PreprocessFeature:
|
|
|
329
334
|
|
|
330
335
|
self.experiment = experiment
|
|
331
336
|
self.columns_pca = [col.upper() for col in columns_pca]
|
|
337
|
+
self.pca_temporal = pca_temporal
|
|
338
|
+
self.pca_cross_sectional = pca_cross_sectional
|
|
332
339
|
self.columns_onehot = [col.upper() for col in columns_onehot]
|
|
333
340
|
self.columns_binary = [col.upper() for col in columns_binary]
|
|
334
341
|
self.columns_ordinal = [col.upper() for col in columns_ordinal]
|
|
@@ -364,6 +371,26 @@ class PreprocessFeature:
|
|
|
364
371
|
|
|
365
372
|
joblib.dump(pcas, f"{self.preprocessing_dir}/pcas.pkl")
|
|
366
373
|
|
|
374
|
+
train, pcas_cross_sectional = self.add_pca_feature_cross_sectional(train)
|
|
375
|
+
val, _ = self.add_pca_feature_cross_sectional(val, pcas=pcas_cross_sectional)
|
|
376
|
+
test, _ = self.add_pca_feature_cross_sectional(test, pcas=pcas_cross_sectional)
|
|
377
|
+
|
|
378
|
+
joblib.dump(
|
|
379
|
+
pcas_cross_sectional, f"{self.preprocessing_dir}/pcas_cross_sectional.pkl"
|
|
380
|
+
)
|
|
381
|
+
|
|
382
|
+
train, pcas_temporal = self.add_pca_feature_temporal(train)
|
|
383
|
+
val, _ = self.add_pca_feature_temporal(val, pcas=pcas_temporal)
|
|
384
|
+
test, _ = self.add_pca_feature_temporal(test, pcas=pcas_temporal)
|
|
385
|
+
|
|
386
|
+
joblib.dump(pcas_temporal, f"{self.preprocessing_dir}/pcas_temporal.pkl")
|
|
387
|
+
|
|
388
|
+
# Save all features before encoding
|
|
389
|
+
joblib.dump(
|
|
390
|
+
list(train.columns),
|
|
391
|
+
f"{self.preprocessing_dir}/all_features_before_encoding.pkl",
|
|
392
|
+
)
|
|
393
|
+
|
|
367
394
|
# Encoding
|
|
368
395
|
train, transformer = self.encode_categorical_features(train)
|
|
369
396
|
val, _ = self.encode_categorical_features(
|
|
@@ -382,7 +409,8 @@ class PreprocessFeature:
|
|
|
382
409
|
|
|
383
410
|
# Save all features before selection
|
|
384
411
|
joblib.dump(
|
|
385
|
-
train,
|
|
412
|
+
list(train.columns),
|
|
413
|
+
f"{self.preprocessing_dir}/all_features_before_selection.pkl",
|
|
386
414
|
)
|
|
387
415
|
|
|
388
416
|
return train, val, test
|
|
@@ -395,6 +423,18 @@ class PreprocessFeature:
|
|
|
395
423
|
pcas = joblib.load(f"{self.preprocessing_dir}/pcas.pkl")
|
|
396
424
|
data, _ = self.add_pca_features(data, pcas=pcas)
|
|
397
425
|
|
|
426
|
+
if os.path.exists(f"{self.preprocessing_dir}/pcas_cross_sectional.pkl"):
|
|
427
|
+
pcas_cross_sectional = joblib.load(
|
|
428
|
+
f"{self.preprocessing_dir}/pcas_cross_sectional.pkl"
|
|
429
|
+
)
|
|
430
|
+
data, _ = self.add_pca_feature_cross_sectional(
|
|
431
|
+
data, pcas=pcas_cross_sectional
|
|
432
|
+
)
|
|
433
|
+
|
|
434
|
+
if os.path.exists(f"{self.preprocessing_dir}/pcas_temporal.pkl"):
|
|
435
|
+
pcas_temporal = joblib.load(f"{self.preprocessing_dir}/pcas_temporal.pkl")
|
|
436
|
+
data, _ = self.add_pca_feature_temporal(data, pcas=pcas_temporal)
|
|
437
|
+
|
|
398
438
|
# Encoding
|
|
399
439
|
transformer = joblib.load(f"{self.preprocessing_dir}/column_transformer.pkl")
|
|
400
440
|
data, _ = self.encode_categorical_features(
|
|
@@ -570,6 +610,120 @@ class PreprocessFeature:
|
|
|
570
610
|
|
|
571
611
|
return df, pcas_dict
|
|
572
612
|
|
|
613
|
+
def add_pca_feature_cross_sectional(
    self,
    df: pd.DataFrame,
    *,
    n_components: int = 5,
    pcas: dict[str, Pipeline] | None = None,  # if provided: transform only
    impute_strategy: str = "median",
    standardize: bool = True,
) -> tuple[pd.DataFrame, dict[str, Pipeline]]:
    """
    Add cross-sectional PCA scores to ``df``.

    For every entry of ``self.pca_cross_sectional`` (a mapping with the keys
    ``name``, ``index``, ``columns`` and ``value``), build a pivot table
    (index=index, columns=columns, values=value), fit or reuse an
    imputer (+ scaler) + PCA pipeline on it, and merge the resulting scores
    back into ``df`` on the index column as ``CS_PC_<name>_<i>`` columns.

    Args:
        df: Input frame containing the index, columns and value columns.
        n_components: Number of principal components used when fitting.
        pcas: Previously fitted pipelines keyed by entry name. When given,
            nothing is fitted and the pipelines are only applied.
        impute_strategy: Strategy passed to ``SimpleImputer`` when fitting.
        standardize: Whether a ``StandardScaler`` step is added when fitting.

    Returns:
        The frame with the new columns (same row order and same index as the
        input) and the dict of pipelines keyed by entry name.
    """

    pcas_dict = {}

    for pca_cross_sectional in self.pca_cross_sectional:
        name, index_col, columns_col, value_col = (
            pca_cross_sectional[k] for k in ("name", "index", "columns", "value")
        )
        prefix = f"CS_PC_{name}"

        pivot = df.pivot_table(
            index=index_col, columns=columns_col, values=value_col
        ).sort_index()

        # The pipeline is fitted once and then reused across splits.
        if pcas is None:
            steps = [("imputer", SimpleImputer(strategy=impute_strategy))]
            if standardize:
                steps.append(
                    ("scaler", StandardScaler(with_mean=True, with_std=True))
                )
            pca = PCA(n_components=n_components, random_state=0)
            steps.append(("pca", pca))
            pipe = Pipeline(steps)
            pipe.fit(pivot)  # fit branch: no pipelines were supplied
        else:
            pipe = pcas[name]  # transform-only branch: reuse the fitted pipe
            # The pivot of another split may miss (or add) categories; align
            # it on the columns seen at fit time so transform receives the
            # same features in the same order. Missing ones become NaN.
            fit_columns = getattr(pipe, "feature_names_in_", None)
            if fit_columns is not None:
                pivot = pivot.reindex(columns=list(fit_columns))

        scores = pipe.transform(pivot)  # shape: (n_index, n_output_components)
        # Name the columns after what the pipe actually produced, which may
        # differ from ``n_components`` when a fitted pipe is reused.
        cols = [f"{prefix}_{i}" for i in range(scores.shape[1])]
        scores_df = pd.DataFrame(scores, index=pivot.index, columns=cols)

        original_index = df.index
        df = df.merge(scores_df.reset_index(), on=index_col, how="left")
        # merge() returns a fresh RangeIndex; restore the caller's index so
        # the presence of a PCA entry does not silently re-index the data.
        if len(df) == len(original_index):
            df.index = original_index
        pcas_dict[name] = pipe

    return df, pcas_dict
|
|
662
|
+
|
|
663
|
+
# ----------------- 2) PCA TEMPORELLE (liste de colonnes lags) ----------------
|
|
664
|
+
def add_pca_feature_temporal(
    self,
    df: pd.DataFrame,
    *,
    n_components: int = 5,
    pcas: dict[str, Pipeline] | None = None,  # if provided: transform only
    impute_strategy: (
        str | None
    ) = None,  # None = every column must be present on a row
    standardize: bool = True,
) -> tuple[pd.DataFrame, dict[str, Pipeline]]:
    """
    Add temporal PCA scores to ``df``.

    For every entry of ``self.pca_temporal`` (a mapping with the keys
    ``name`` and ``columns``), run a PCA on the matrix made of those columns
    (one row per row of ``df``) and store the scores in
    ``TMP_PC_<name>_<i>`` columns. ``df`` is modified in place.

    Args:
        df: Input frame containing the listed columns.
        n_components: Number of principal components used when fitting.
        pcas: Previously fitted pipelines keyed by entry name. When given,
            nothing is fitted and the pipelines are only applied.
        impute_strategy: When None, only rows with all listed columns present
            are used; otherwise rows with at least one value are used and a
            ``SimpleImputer`` with this strategy is added when fitting.
        standardize: Whether a ``StandardScaler`` step is added when fitting.

    Returns:
        The frame with the new float columns (NaN on rows that were not
        usable) and the dict of pipelines keyed by entry name.
    """
    pcas_dict = {}

    for pca_temporal in self.pca_temporal:
        name, cols = (pca_temporal[k] for k in ("name", "columns"))
        prefix = f"TMP_PC_{name}"

        # Mask of the usable rows: complete rows when nothing is imputed,
        # rows with at least one value when an imputer fills the gaps.
        present = df[cols].notna()
        if impute_strategy is None:
            mask = present.all(axis=1)
        else:
            mask = present.any(axis=1)
        X_valid = df.loc[mask, cols]

        # Pipeline
        if pcas is None:
            steps = []
            if impute_strategy is not None:
                steps.append(("imputer", SimpleImputer(strategy=impute_strategy)))
            if standardize:
                steps.append(
                    ("scaler", StandardScaler(with_mean=True, with_std=True))
                )
            pca = PCA(n_components=n_components, random_state=0)
            steps.append(("pca", pca))
            pipe = Pipeline(steps)
            if not X_valid.empty:
                pipe.fit(X_valid)  # fit branch: no pipelines were supplied
        else:
            pipe = pcas[name]  # transform-only branch: reuse the fitted pipe

        if not X_valid.empty:
            Z = pipe.transform(X_valid)
            # Trust the width actually produced: a reused pipe may have been
            # fitted with a different number of components.
            n_out = Z.shape[1]
        else:
            Z = None
            # No usable row: fall back on the fitted PCA width when the pipe
            # exposes one, so the schema still matches the other splits.
            fitted_pca = getattr(pipe, "named_steps", {}).get("pca")
            n_out = getattr(fitted_pca, "n_components_", n_components)

        for i in range(n_out):
            col = f"{prefix}_{i}"
            # Always start from a float NaN column: keeps the dtype numeric
            # whether or not rows are usable, and clears any stale values.
            df[col] = float("nan")
            if Z is not None:
                df.loc[mask, col] = Z[:, i]

        pcas_dict[name] = pipe

    return df, pcas_dict
|
|
726
|
+
|
|
573
727
|
# encoding categorical features
|
|
574
728
|
def encode_categorical_features(
|
|
575
729
|
self,
|
|
@@ -1592,20 +1592,104 @@ def plot_evaluation_for_classification(prediction: dict):
|
|
|
1592
1592
|
|
|
1593
1593
|
|
|
1594
1594
|
def plot_confusion_matrix(y_true, y_pred):
    """Plot a confusion matrix and the true-class distribution.

    Draws two vertically stacked subplots and shows the figure: a heatmap of
    the confusion matrix of ``y_true`` against ``y_pred``, and a bar chart of
    the share of each class in ``y_true``.

    Args:
        y_true: True class labels (any array-like).
        y_pred: Predicted class labels (any array-like of the same length).
    """
    # Accept lists / Series as well as arrays.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    # Unique class labels seen in either vector (np.unique returns them sorted)
    labels = np.unique(np.concatenate((y_true, y_pred)))

    # Pass the labels explicitly so matrix rows/columns and tick labels are
    # guaranteed to be in the same order.
    cm = confusion_matrix(y_true, y_pred, labels=labels)

    # Class distribution, counted per label so that it always lines up with
    # ``labels`` (labels need not be 0..k-1, and a class may appear only in
    # the predictions).
    class_dist = np.array([np.sum(y_true == label) for label in labels])
    class_dist_pct = class_dist / len(y_true) * 100

    # Create figure with two subplots stacked vertically
    fig = plt.figure(figsize=(10, 12))

    # Subplot 1: Confusion Matrix
    ax1 = plt.subplot(2, 1, 1)

    # Diverging colormap (blue to white to red)
    cmap = sns.diverging_palette(220, 10, as_cmap=True)

    sns.heatmap(
        cm,
        annot=True,
        fmt="d",
        cmap=cmap,
        center=0,
        linewidths=0.5,
        linecolor="lightgray",
        cbar_kws={"label": "Number of Samples"},
        ax=ax1,
    )

    ax1.set_title("Confusion Matrix", fontsize=14, pad=20, weight="bold")
    ax1.set_xlabel("Predicted Label", fontsize=12, labelpad=10)
    ax1.set_ylabel("True Label", fontsize=12, labelpad=10)

    # Heatmap cells span [i, i + 1], so their centres are at i + 0.5
    ax1.set_xticks(np.arange(len(labels)) + 0.5)
    ax1.set_yticks(np.arange(len(labels)) + 0.5)
    ax1.set_xticklabels(labels, fontsize=10)
    ax1.set_yticklabels(labels, fontsize=10, rotation=0)

    # Grid lines on the cell borders (integer positions), not through the
    # cell centres where the annotations are drawn.
    ax1.set_xticks(np.arange(len(labels) + 1), minor=True)
    ax1.set_yticks(np.arange(len(labels) + 1), minor=True)
    ax1.grid(which="minor", color="w", linestyle="-", linewidth=2)
    ax1.tick_params(which="minor", bottom=False, left=False)

    # Subplot 2: Class Distribution
    ax2 = plt.subplot(2, 1, 2)

    bars = ax2.bar(
        labels.astype(str),
        class_dist_pct,
        color=sns.color_palette("viridis", len(labels)),
    )

    # Add percentage labels on top of bars
    for bar in bars:
        height = bar.get_height()
        ax2.text(
            bar.get_x() + bar.get_width() / 2.0,
            height + 1,
            f"{height:.1f}%",
            ha="center",
            va="bottom",
            fontsize=10,
        )

    ax2.set_title("Class Distribution", fontsize=14, pad=20, weight="bold")
    ax2.set_xlabel("Class", fontsize=12, labelpad=10)
    ax2.set_ylabel("Percentage of Total Samples", fontsize=12, labelpad=10)
    ax2.set_ylim(0, 100)
    ax2.grid(axis="y", linestyle="--", alpha=0.7)

    # Total count annotation, placed below the x axis
    total = len(y_true)
    ax2.text(
        0.5,
        -0.15,
        f"Total samples: {total:,}",
        transform=ax2.transAxes,
        ha="center",
        fontsize=10,
        bbox=dict(
            facecolor="white",
            alpha=0.8,
            edgecolor="lightgray",
            boxstyle="round,pad=0.5",
        ),
    )

    # Leave a small margin at the bottom and top so nothing overlaps
    plt.tight_layout(rect=[0, 0.03, 1, 0.98])
    plt.show()
|
|
1610
1694
|
|
|
1611
1695
|
|
|
@@ -1697,7 +1781,17 @@ def find_best_threshold(
|
|
|
1697
1781
|
logger.warning(
|
|
1698
1782
|
f"[Class {cls}] No threshold with precision ≥ {target_value}"
|
|
1699
1783
|
)
|
|
1700
|
-
|
|
1784
|
+
# fallback: meilleure precision parmi ceux avec recall>0
|
|
1785
|
+
cand = np.where(recall > 0)[0]
|
|
1786
|
+
if cand.size:
|
|
1787
|
+
best_idx = cand[int(np.argmax(precision[cand]))]
|
|
1788
|
+
logger.warning(
|
|
1789
|
+
f"[Class {cls}] Fallback to best precision with recall>0: "
|
|
1790
|
+
f"idx={best_idx}, precision={precision[best_idx]:.4f}, recall={recall[best_idx]:.4f}"
|
|
1791
|
+
)
|
|
1792
|
+
else:
|
|
1793
|
+
logger.error(f"[Class {cls}] No threshold achieves recall>0.")
|
|
1794
|
+
best_idx = None
|
|
1701
1795
|
|
|
1702
1796
|
elif metric == "f1":
|
|
1703
1797
|
valid_indices = [i for i, val in enumerate(f1) if val >= target_value]
|
|
@@ -1711,6 +1805,15 @@ def find_best_threshold(
|
|
|
1711
1805
|
else:
|
|
1712
1806
|
best_idx = int(np.argmax(values)) # no constraint, get best value
|
|
1713
1807
|
|
|
1808
|
+
if best_idx is None:
|
|
1809
|
+
results[cls_str] = {
|
|
1810
|
+
"threshold": None,
|
|
1811
|
+
"precision": None,
|
|
1812
|
+
"recall": None,
|
|
1813
|
+
"f1": None,
|
|
1814
|
+
}
|
|
1815
|
+
continue
|
|
1816
|
+
|
|
1714
1817
|
results[cls_str] = {
|
|
1715
1818
|
"threshold": float(thresholds[best_idx]),
|
|
1716
1819
|
"precision": float(precision[best_idx]),
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{lecrapaud-0.16.6 → lecrapaud-0.17.0}/lecrapaud/db/alembic/versions/2025_06_23_1748-f089dfb7e3ba_.py
RENAMED
|
File without changes
|
{lecrapaud-0.16.6 → lecrapaud-0.17.0}/lecrapaud/db/alembic/versions/2025_06_24_1216-c62251b129ed_.py
RENAMED
|
File without changes
|
{lecrapaud-0.16.6 → lecrapaud-0.17.0}/lecrapaud/db/alembic/versions/2025_06_24_1711-86457e2f333f_.py
RENAMED
|
File without changes
|
{lecrapaud-0.16.6 → lecrapaud-0.17.0}/lecrapaud/db/alembic/versions/2025_06_25_1759-72aa496ca65b_.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|