lecrapaud 0.16.7__py3-none-any.whl → 0.17.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lecrapaud/api.py +20 -15
- lecrapaud/feature_engineering.py +147 -0
- lecrapaud/model_selection.py +20 -1
- {lecrapaud-0.16.7.dist-info → lecrapaud-0.17.0.dist-info}/METADATA +1 -1
- {lecrapaud-0.16.7.dist-info → lecrapaud-0.17.0.dist-info}/RECORD +7 -7
- {lecrapaud-0.16.7.dist-info → lecrapaud-0.17.0.dist-info}/LICENSE +0 -0
- {lecrapaud-0.16.7.dist-info → lecrapaud-0.17.0.dist-info}/WHEEL +0 -0
lecrapaud/api.py
CHANGED
|
@@ -102,46 +102,49 @@ class LeCrapaud:
|
|
|
102
102
|
|
|
103
103
|
def compare_experiment_scores(self, name: str):
|
|
104
104
|
"""Compare scores of experiments with matching names.
|
|
105
|
-
|
|
105
|
+
|
|
106
106
|
Args:
|
|
107
107
|
name (str): Name or partial name of experiments to compare
|
|
108
|
-
|
|
108
|
+
|
|
109
109
|
Returns:
|
|
110
110
|
dict: Dictionary containing experiment names as keys and their scores as values
|
|
111
111
|
"""
|
|
112
112
|
from lecrapaud.db import SessionLocal
|
|
113
113
|
from sqlalchemy.orm import joinedload
|
|
114
|
-
|
|
114
|
+
|
|
115
115
|
db = SessionLocal()
|
|
116
116
|
try:
|
|
117
117
|
# Get all experiments with the given name pattern
|
|
118
118
|
experiments = (
|
|
119
119
|
db.query(Experiment)
|
|
120
|
-
.options(
|
|
121
|
-
|
|
120
|
+
.options(
|
|
121
|
+
joinedload(Experiment.model_selections).joinedload(
|
|
122
|
+
ModelSelection.scores
|
|
123
|
+
)
|
|
124
|
+
)
|
|
122
125
|
.filter(Experiment.name.ilike(f"%{name}%"))
|
|
123
126
|
.all()
|
|
124
127
|
)
|
|
125
|
-
|
|
128
|
+
|
|
126
129
|
if not experiments:
|
|
127
130
|
return {"error": f"No experiments found with name containing '{name}'"}
|
|
128
|
-
|
|
131
|
+
|
|
129
132
|
comparison = {}
|
|
130
|
-
|
|
133
|
+
|
|
131
134
|
for exp in experiments:
|
|
132
135
|
scores = {
|
|
133
136
|
"rmse": exp.avg_rmse,
|
|
134
137
|
"logloss": exp.avg_logloss,
|
|
135
138
|
"accuracy": None,
|
|
136
139
|
"f1": None,
|
|
137
|
-
"roc_auc": None
|
|
140
|
+
"roc_auc": None,
|
|
138
141
|
}
|
|
139
|
-
|
|
142
|
+
|
|
140
143
|
# Get classification metrics from the first model selection with scores
|
|
141
144
|
for model_sel in exp.model_selections:
|
|
142
145
|
if model_sel.scores:
|
|
143
146
|
for score in model_sel.scores:
|
|
144
|
-
if score.type ==
|
|
147
|
+
if score.type == "validation": # Use validation scores
|
|
145
148
|
if score.accuracy is not None:
|
|
146
149
|
scores["accuracy"] = score.accuracy
|
|
147
150
|
if score.f1 is not None:
|
|
@@ -149,16 +152,16 @@ class LeCrapaud:
|
|
|
149
152
|
if score.roc_auc is not None:
|
|
150
153
|
scores["roc_auc"] = score.roc_auc
|
|
151
154
|
break
|
|
152
|
-
|
|
155
|
+
|
|
153
156
|
comparison[exp.name] = scores
|
|
154
|
-
|
|
157
|
+
|
|
155
158
|
return comparison
|
|
156
|
-
|
|
159
|
+
|
|
157
160
|
except Exception as e:
|
|
158
161
|
return {"error": f"Error comparing experiment scores: {str(e)}"}
|
|
159
162
|
finally:
|
|
160
163
|
db.close()
|
|
161
|
-
|
|
164
|
+
|
|
162
165
|
def list_experiments(
|
|
163
166
|
self, name: str = None, limit: int = 1000
|
|
164
167
|
) -> list["ExperimentEngine"]:
|
|
@@ -348,6 +351,8 @@ class ExperimentEngine:
|
|
|
348
351
|
val_size=self.val_size,
|
|
349
352
|
test_size=self.test_size,
|
|
350
353
|
columns_pca=self.columns_pca,
|
|
354
|
+
pca_temporal=self.pca_temporal,
|
|
355
|
+
pca_cross_sectional=self.pca_cross_sectional,
|
|
351
356
|
columns_onehot=self.columns_onehot,
|
|
352
357
|
columns_binary=self.columns_binary,
|
|
353
358
|
columns_frequency=self.columns_frequency,
|
lecrapaud/feature_engineering.py
CHANGED
|
@@ -52,6 +52,9 @@ import os
|
|
|
52
52
|
|
|
53
53
|
from sklearn.compose import ColumnTransformer
|
|
54
54
|
from sklearn.decomposition import PCA
|
|
55
|
+
from sklearn.impute import SimpleImputer
|
|
56
|
+
from sklearn.preprocessing import StandardScaler
|
|
57
|
+
from sklearn.pipeline import Pipeline
|
|
55
58
|
from category_encoders import BinaryEncoder, CountEncoder
|
|
56
59
|
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
|
|
57
60
|
from sklearn.model_selection import train_test_split
|
|
@@ -316,6 +319,8 @@ class PreprocessFeature:
|
|
|
316
319
|
val_size: float = 0.2,
|
|
317
320
|
test_size: float = 0.2,
|
|
318
321
|
columns_pca: list[str] = [],
|
|
322
|
+
pca_temporal: dict[str, list[str]] = {},
|
|
323
|
+
pca_cross_sectional: dict[str, list[str]] = {},
|
|
319
324
|
columns_onehot: list[str] = [],
|
|
320
325
|
columns_binary: list[str] = [],
|
|
321
326
|
columns_ordinal: list[str] = [],
|
|
@@ -329,6 +334,8 @@ class PreprocessFeature:
|
|
|
329
334
|
|
|
330
335
|
self.experiment = experiment
|
|
331
336
|
self.columns_pca = [col.upper() for col in columns_pca]
|
|
337
|
+
self.pca_temporal = pca_temporal
|
|
338
|
+
self.pca_cross_sectional = pca_cross_sectional
|
|
332
339
|
self.columns_onehot = [col.upper() for col in columns_onehot]
|
|
333
340
|
self.columns_binary = [col.upper() for col in columns_binary]
|
|
334
341
|
self.columns_ordinal = [col.upper() for col in columns_ordinal]
|
|
@@ -364,6 +371,20 @@ class PreprocessFeature:
|
|
|
364
371
|
|
|
365
372
|
joblib.dump(pcas, f"{self.preprocessing_dir}/pcas.pkl")
|
|
366
373
|
|
|
374
|
+
train, pcas_cross_sectional = self.add_pca_feature_cross_sectional(train)
|
|
375
|
+
val, _ = self.add_pca_feature_cross_sectional(val, pcas=pcas_cross_sectional)
|
|
376
|
+
test, _ = self.add_pca_feature_cross_sectional(test, pcas=pcas_cross_sectional)
|
|
377
|
+
|
|
378
|
+
joblib.dump(
|
|
379
|
+
pcas_cross_sectional, f"{self.preprocessing_dir}/pcas_cross_sectional.pkl"
|
|
380
|
+
)
|
|
381
|
+
|
|
382
|
+
train, pcas_temporal = self.add_pca_feature_temporal(train)
|
|
383
|
+
val, _ = self.add_pca_feature_temporal(val, pcas=pcas_temporal)
|
|
384
|
+
test, _ = self.add_pca_feature_temporal(test, pcas=pcas_temporal)
|
|
385
|
+
|
|
386
|
+
joblib.dump(pcas_temporal, f"{self.preprocessing_dir}/pcas_temporal.pkl")
|
|
387
|
+
|
|
367
388
|
# Save all features before encoding
|
|
368
389
|
joblib.dump(
|
|
369
390
|
list(train.columns),
|
|
@@ -402,6 +423,18 @@ class PreprocessFeature:
|
|
|
402
423
|
pcas = joblib.load(f"{self.preprocessing_dir}/pcas.pkl")
|
|
403
424
|
data, _ = self.add_pca_features(data, pcas=pcas)
|
|
404
425
|
|
|
426
|
+
if os.path.exists(f"{self.preprocessing_dir}/pcas_cross_sectional.pkl"):
|
|
427
|
+
pcas_cross_sectional = joblib.load(
|
|
428
|
+
f"{self.preprocessing_dir}/pcas_cross_sectional.pkl"
|
|
429
|
+
)
|
|
430
|
+
data, _ = self.add_pca_feature_cross_sectional(
|
|
431
|
+
data, pcas=pcas_cross_sectional
|
|
432
|
+
)
|
|
433
|
+
|
|
434
|
+
if os.path.exists(f"{self.preprocessing_dir}/pcas_temporal.pkl"):
|
|
435
|
+
pcas_temporal = joblib.load(f"{self.preprocessing_dir}/pcas_temporal.pkl")
|
|
436
|
+
data, _ = self.add_pca_feature_temporal(data, pcas=pcas_temporal)
|
|
437
|
+
|
|
405
438
|
# Encoding
|
|
406
439
|
transformer = joblib.load(f"{self.preprocessing_dir}/column_transformer.pkl")
|
|
407
440
|
data, _ = self.encode_categorical_features(
|
|
@@ -577,6 +610,120 @@ class PreprocessFeature:
|
|
|
577
610
|
|
|
578
611
|
return df, pcas_dict
|
|
579
612
|
|
|
613
|
+
def add_pca_feature_cross_sectional(
    self,
    df: pd.DataFrame,
    *,
    n_components: int = 5,
    pcas: "dict[str, Pipeline] | None" = None,  # if provided: transform only
    impute_strategy: str = "median",
    standardize: bool = True,
) -> "tuple[pd.DataFrame, dict[str, Pipeline]]":
    """Add cross-sectional PCA scores to ``df``.

    For every entry of ``self.pca_cross_sectional`` (a mapping with the keys
    ``"name"``, ``"index"``, ``"columns"`` and ``"value"``), the method:

    1. pivots ``df`` (index=``index``, columns=``columns``, values=``value``),
    2. fits an Imputer (+ optional Scaler) + PCA pipeline on the pivot, or
       reuses ``pcas[name]`` when ``pcas`` is provided,
    3. merges the resulting scores back into ``df`` on the ``index`` key as
       columns named ``CS_PC_<name>_<i>``.

    Args:
        df: Frame to enrich. It is not modified; a merged copy is returned.
        n_components: Number of principal components used when fitting.
        pcas: Pipelines keyed by entry name. When given, nothing is fitted
            and the stored pipelines are only used to transform.
        impute_strategy: Strategy passed to ``SimpleImputer`` when fitting.
        standardize: Whether a ``StandardScaler`` step is added when fitting.

    Returns:
        A ``(df_with_features, pipelines_by_name)`` tuple.

    Raises:
        KeyError: If a configuration entry lacks a required key, or if
            ``pcas`` is given but has no pipeline for an entry name.
    """
    pcas_dict = {}

    for spec in self.pca_cross_sectional:
        name, index_col, columns_col, value_col = (
            spec[k] for k in ("name", "index", "columns", "value")
        )
        prefix = f"CS_PC_{name}"

        pivot = df.pivot_table(
            index=index_col, columns=columns_col, values=value_col
        ).sort_index()

        if pcas is None:
            # Fit mode: the pipeline is fitted on the data passed in
            # (the caller is expected to pass the training split).
            steps = [("imputer", SimpleImputer(strategy=impute_strategy))]
            if standardize:
                steps.append(
                    ("scaler", StandardScaler(with_mean=True, with_std=True))
                )
            steps.append(("pca", PCA(n_components=n_components, random_state=0)))
            pipe = Pipeline(steps)
            pipe.fit(pivot)
        else:
            # Transform-only mode: reuse the pipeline fitted earlier.
            pipe = pcas[name]
            # The categories present in this split may differ from those seen
            # at fit time. Align the pivot on the fitted feature names so the
            # pipeline receives the same columns in the same order; columns
            # missing here become NaN and are left to the pipeline's imputer.
            fitted_columns = getattr(pipe, "feature_names_in_", None)
            if fitted_columns is not None:
                pivot = pivot.reindex(columns=list(fitted_columns))

        scores = pipe.transform(pivot)  # shape: (n_index, n_fitted_components)
        # Name the columns after what the pipeline actually produced, which
        # may differ from `n_components` when a pre-fitted pipeline is reused.
        cols = [f"{prefix}_{i}" for i in range(scores.shape[1])]
        scores_df = pd.DataFrame(scores, index=pivot.index, columns=cols)

        # A column-based merge discards the left index; the pivot index is
        # unique, so a left merge keeps row count and order and the original
        # index can be restored safely.
        original_index = df.index
        df = df.merge(scores_df.reset_index(), on=index_col, how="left")
        if len(df) == len(original_index):
            df.index = original_index

        pcas_dict[name] = pipe

    return df, pcas_dict
|
|
662
|
+
|
|
663
|
+
# ----------------- 2) PCA TEMPORELLE (liste de colonnes lags) ----------------
|
|
664
|
+
def add_pca_feature_temporal(
|
|
665
|
+
self,
|
|
666
|
+
df: pd.DataFrame,
|
|
667
|
+
*,
|
|
668
|
+
n_components: int = 5,
|
|
669
|
+
pcas: dict[str, Pipeline] | None = None, # si fourni: transform only
|
|
670
|
+
impute_strategy: (
|
|
671
|
+
str | None
|
|
672
|
+
) = None, # None = on exige toutes les colonnes présentes
|
|
673
|
+
standardize: bool = True,
|
|
674
|
+
) -> tuple[pd.DataFrame, dict[str, Pipeline]]:
|
|
675
|
+
"""
|
|
676
|
+
Applique une PCA sur une matrice (rows = lignes df, cols = lags).
|
|
677
|
+
Fit le Pipeline sur TRAIN si pcas=None; sinon, utilise pcas et fait transform.
|
|
678
|
+
Ajoute les colonnes f"{prefix}_{i}" dans df. Renvoie (df, pipe).
|
|
679
|
+
"""
|
|
680
|
+
pcas_dict = {}
|
|
681
|
+
|
|
682
|
+
for pca_temporal in self.pca_temporal:
|
|
683
|
+
name, cols = (pca_temporal[k] for k in ("name", "columns"))
|
|
684
|
+
prefix = f"TMP_PC_{name}"
|
|
685
|
+
|
|
686
|
+
# Masque des lignes utilisables
|
|
687
|
+
if impute_strategy is None:
|
|
688
|
+
mask = (
|
|
689
|
+
df[cols].notna().all(axis=1)
|
|
690
|
+
) # on n'impute pas → lignes complètes
|
|
691
|
+
X_fit = df.loc[mask, cols]
|
|
692
|
+
else:
|
|
693
|
+
mask = df[cols].notna().any(axis=1) # on imputera → au moins une valeur
|
|
694
|
+
X_fit = df.loc[mask, cols]
|
|
695
|
+
|
|
696
|
+
# Pipeline
|
|
697
|
+
if pcas is None:
|
|
698
|
+
steps = []
|
|
699
|
+
if impute_strategy is not None:
|
|
700
|
+
steps.append(("imputer", SimpleImputer(strategy=impute_strategy)))
|
|
701
|
+
if standardize:
|
|
702
|
+
steps.append(
|
|
703
|
+
("scaler", StandardScaler(with_mean=True, with_std=True))
|
|
704
|
+
)
|
|
705
|
+
pca = PCA(n_components=n_components, random_state=0)
|
|
706
|
+
steps.append(("pca", pca))
|
|
707
|
+
pipe = Pipeline(steps)
|
|
708
|
+
if not X_fit.empty:
|
|
709
|
+
pipe.fit(X_fit) # <- fit sur TRAIN uniquement
|
|
710
|
+
else:
|
|
711
|
+
pipe = pcas[name] # <- TEST
|
|
712
|
+
|
|
713
|
+
# Transform uniquement sur lignes valides (mask)
|
|
714
|
+
if not df.loc[mask, cols].empty:
|
|
715
|
+
Z = pipe.transform(df.loc[mask, cols])
|
|
716
|
+
for i in range(n_components):
|
|
717
|
+
df.loc[mask, f"{prefix}_{i}"] = Z[:, i]
|
|
718
|
+
else:
|
|
719
|
+
# crée les colonnes vides si aucune ligne valide (cohérence de schéma)
|
|
720
|
+
for i in range(n_components):
|
|
721
|
+
df[f"{prefix}_{i}"] = pd.NA
|
|
722
|
+
|
|
723
|
+
pcas_dict.update({name: pipe})
|
|
724
|
+
|
|
725
|
+
return df, pcas_dict
|
|
726
|
+
|
|
580
727
|
# encoding categorical features
|
|
581
728
|
def encode_categorical_features(
|
|
582
729
|
self,
|
lecrapaud/model_selection.py
CHANGED
|
@@ -1781,7 +1781,17 @@ def find_best_threshold(
|
|
|
1781
1781
|
logger.warning(
|
|
1782
1782
|
f"[Class {cls}] No threshold with precision ≥ {target_value}"
|
|
1783
1783
|
)
|
|
1784
|
-
|
|
1784
|
+
# fallback: meilleure precision parmi ceux avec recall>0
|
|
1785
|
+
cand = np.where(recall > 0)[0]
|
|
1786
|
+
if cand.size:
|
|
1787
|
+
best_idx = cand[int(np.argmax(precision[cand]))]
|
|
1788
|
+
logger.warning(
|
|
1789
|
+
f"[Class {cls}] Fallback to best precision with recall>0: "
|
|
1790
|
+
f"idx={best_idx}, precision={precision[best_idx]:.4f}, recall={recall[best_idx]:.4f}"
|
|
1791
|
+
)
|
|
1792
|
+
else:
|
|
1793
|
+
logger.error(f"[Class {cls}] No threshold achieves recall>0.")
|
|
1794
|
+
best_idx = None
|
|
1785
1795
|
|
|
1786
1796
|
elif metric == "f1":
|
|
1787
1797
|
valid_indices = [i for i, val in enumerate(f1) if val >= target_value]
|
|
@@ -1795,6 +1805,15 @@ def find_best_threshold(
|
|
|
1795
1805
|
else:
|
|
1796
1806
|
best_idx = int(np.argmax(values)) # no constraint, get best value
|
|
1797
1807
|
|
|
1808
|
+
if best_idx is None:
|
|
1809
|
+
results[cls_str] = {
|
|
1810
|
+
"threshold": None,
|
|
1811
|
+
"precision": None,
|
|
1812
|
+
"recall": None,
|
|
1813
|
+
"f1": None,
|
|
1814
|
+
}
|
|
1815
|
+
continue
|
|
1816
|
+
|
|
1798
1817
|
results[cls_str] = {
|
|
1799
1818
|
"threshold": float(thresholds[best_idx]),
|
|
1800
1819
|
"precision": float(precision[best_idx]),
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
lecrapaud/__init__.py,sha256=oCxbtw_nk8rlOXbXbWo0RRMlsh6w-hTiZ6e5PRG_wp0,28
|
|
2
|
-
lecrapaud/api.py,sha256=
|
|
2
|
+
lecrapaud/api.py,sha256=fYNkJizvnCdwQelSHlJjcDdBoiAvLm8tKbST1TsMAPc,22669
|
|
3
3
|
lecrapaud/config.py,sha256=itiqC31HB8i2Xo-kn2viCQrg_9tnA07-TJuZ-xdnx44,1126
|
|
4
4
|
lecrapaud/db/__init__.py,sha256=82o9fMfaqKXPh2_rt44EzNRVZV1R4LScEnQYvj_TjK0,34
|
|
5
5
|
lecrapaud/db/alembic/README,sha256=MVlc9TYmr57RbhXET6QxgyCcwWP7w-vLkEsirENqiIQ,38
|
|
@@ -25,7 +25,7 @@ lecrapaud/db/models/utils.py,sha256=-a-nWWmpJ2XzidIxo2COVUTrGZIPYCfBzjhcszJj_bM,
|
|
|
25
25
|
lecrapaud/db/session.py,sha256=E93WXcFFILFAIeH61ft2Egs7D-6caqs0oi4zCkO5Lq4,2822
|
|
26
26
|
lecrapaud/directories.py,sha256=0LrANuDgbuneSLker60c6q2hmGnQ3mKHIztTGzTx6Gw,826
|
|
27
27
|
lecrapaud/experiment.py,sha256=1xLWjOrqAxJh9CdXOx9ppQuRFRRj0GH-xYZqg-ty9hI,2463
|
|
28
|
-
lecrapaud/feature_engineering.py,sha256=
|
|
28
|
+
lecrapaud/feature_engineering.py,sha256=sGdQJIX7efdvNDlBWWOJD9NMZ8MzEyTOHCSRnTkJl5E,38970
|
|
29
29
|
lecrapaud/feature_selection.py,sha256=6ry-oVPQHbipm1XSE5YsH7AY0lQFt4CFbWiHiRs1nxg,43593
|
|
30
30
|
lecrapaud/integrations/openai_integration.py,sha256=hHLF3fk5Bps8KNbNrEL3NUFa945jwClE6LrLpuMZOd4,7459
|
|
31
31
|
lecrapaud/jobs/__init__.py,sha256=ZkrsyTOR21c_wN7RY8jPhm8jCrL1oCEtTsf3VFIlQiE,292
|
|
@@ -36,10 +36,10 @@ lecrapaud/misc/tabpfn_tests.ipynb,sha256=VkgsCUJ30d8jaL2VaWtQAgb8ngHPNtPgnXLs7QQ
|
|
|
36
36
|
lecrapaud/misc/test-gpu-bilstm.ipynb,sha256=4nLuZRJVe2kn6kEmauhRiz5wkWT9AVrYhI9CEk_dYUY,9608
|
|
37
37
|
lecrapaud/misc/test-gpu-resnet.ipynb,sha256=27Vu7nYwujYeh3fOxBNCnKJn3MXNPKZU-U8oDDUbymg,4944
|
|
38
38
|
lecrapaud/misc/test-gpu-transformers.ipynb,sha256=k6MBSs_Um1h4PykvE-LTBcdpbWLbIFST_xl_AFW2jgI,8444
|
|
39
|
-
lecrapaud/model_selection.py,sha256=
|
|
39
|
+
lecrapaud/model_selection.py,sha256=tHGnYeuUC38fBeJcoHunnXDVd6RjuoawdY3peEvqy6I,71954
|
|
40
40
|
lecrapaud/search_space.py,sha256=-JkzuMhaomdwiWi4HvVQY5hiw3-oREemJA16tbwEIp4,34854
|
|
41
41
|
lecrapaud/utils.py,sha256=JdBB1NvbNIx4y0Una-kSZdo1_ZEocc5hwyYFIZKHmGg,8305
|
|
42
|
-
lecrapaud-0.
|
|
43
|
-
lecrapaud-0.
|
|
44
|
-
lecrapaud-0.
|
|
45
|
-
lecrapaud-0.
|
|
42
|
+
lecrapaud-0.17.0.dist-info/LICENSE,sha256=MImCryu0AnqhJE_uAZD-PIDKXDKb8sT7v0i1NOYeHTM,11350
|
|
43
|
+
lecrapaud-0.17.0.dist-info/METADATA,sha256=-SzhIiALD3TcSnAnhxqqX0imJ608yQWWPQ4PeezAAh8,11081
|
|
44
|
+
lecrapaud-0.17.0.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
|
|
45
|
+
lecrapaud-0.17.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|