lecrapaud 0.16.7__tar.gz → 0.17.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of lecrapaud might be problematic.
Files changed (45)
  1. {lecrapaud-0.16.7 → lecrapaud-0.17.0}/PKG-INFO +1 -1
  2. {lecrapaud-0.16.7 → lecrapaud-0.17.0}/lecrapaud/api.py +20 -15
  3. {lecrapaud-0.16.7 → lecrapaud-0.17.0}/lecrapaud/feature_engineering.py +147 -0
  4. {lecrapaud-0.16.7 → lecrapaud-0.17.0}/lecrapaud/model_selection.py +20 -1
  5. {lecrapaud-0.16.7 → lecrapaud-0.17.0}/pyproject.toml +1 -1
  6. {lecrapaud-0.16.7 → lecrapaud-0.17.0}/LICENSE +0 -0
  7. {lecrapaud-0.16.7 → lecrapaud-0.17.0}/README.md +0 -0
  8. {lecrapaud-0.16.7 → lecrapaud-0.17.0}/lecrapaud/__init__.py +0 -0
  9. {lecrapaud-0.16.7 → lecrapaud-0.17.0}/lecrapaud/config.py +0 -0
  10. {lecrapaud-0.16.7 → lecrapaud-0.17.0}/lecrapaud/db/__init__.py +0 -0
  11. {lecrapaud-0.16.7 → lecrapaud-0.17.0}/lecrapaud/db/alembic/README +0 -0
  12. {lecrapaud-0.16.7 → lecrapaud-0.17.0}/lecrapaud/db/alembic/env.py +0 -0
  13. {lecrapaud-0.16.7 → lecrapaud-0.17.0}/lecrapaud/db/alembic/script.py.mako +0 -0
  14. {lecrapaud-0.16.7 → lecrapaud-0.17.0}/lecrapaud/db/alembic/versions/2025_06_23_1748-f089dfb7e3ba_.py +0 -0
  15. {lecrapaud-0.16.7 → lecrapaud-0.17.0}/lecrapaud/db/alembic/versions/2025_06_24_1216-c62251b129ed_.py +0 -0
  16. {lecrapaud-0.16.7 → lecrapaud-0.17.0}/lecrapaud/db/alembic/versions/2025_06_24_1711-86457e2f333f_.py +0 -0
  17. {lecrapaud-0.16.7 → lecrapaud-0.17.0}/lecrapaud/db/alembic/versions/2025_06_25_1759-72aa496ca65b_.py +0 -0
  18. {lecrapaud-0.16.7 → lecrapaud-0.17.0}/lecrapaud/db/alembic.ini +0 -0
  19. {lecrapaud-0.16.7 → lecrapaud-0.17.0}/lecrapaud/db/models/__init__.py +0 -0
  20. {lecrapaud-0.16.7 → lecrapaud-0.17.0}/lecrapaud/db/models/base.py +0 -0
  21. {lecrapaud-0.16.7 → lecrapaud-0.17.0}/lecrapaud/db/models/experiment.py +0 -0
  22. {lecrapaud-0.16.7 → lecrapaud-0.17.0}/lecrapaud/db/models/feature.py +0 -0
  23. {lecrapaud-0.16.7 → lecrapaud-0.17.0}/lecrapaud/db/models/feature_selection.py +0 -0
  24. {lecrapaud-0.16.7 → lecrapaud-0.17.0}/lecrapaud/db/models/feature_selection_rank.py +0 -0
  25. {lecrapaud-0.16.7 → lecrapaud-0.17.0}/lecrapaud/db/models/model.py +0 -0
  26. {lecrapaud-0.16.7 → lecrapaud-0.17.0}/lecrapaud/db/models/model_selection.py +0 -0
  27. {lecrapaud-0.16.7 → lecrapaud-0.17.0}/lecrapaud/db/models/model_training.py +0 -0
  28. {lecrapaud-0.16.7 → lecrapaud-0.17.0}/lecrapaud/db/models/score.py +0 -0
  29. {lecrapaud-0.16.7 → lecrapaud-0.17.0}/lecrapaud/db/models/target.py +0 -0
  30. {lecrapaud-0.16.7 → lecrapaud-0.17.0}/lecrapaud/db/models/utils.py +0 -0
  31. {lecrapaud-0.16.7 → lecrapaud-0.17.0}/lecrapaud/db/session.py +0 -0
  32. {lecrapaud-0.16.7 → lecrapaud-0.17.0}/lecrapaud/directories.py +0 -0
  33. {lecrapaud-0.16.7 → lecrapaud-0.17.0}/lecrapaud/experiment.py +0 -0
  34. {lecrapaud-0.16.7 → lecrapaud-0.17.0}/lecrapaud/feature_selection.py +0 -0
  35. {lecrapaud-0.16.7 → lecrapaud-0.17.0}/lecrapaud/integrations/openai_integration.py +0 -0
  36. {lecrapaud-0.16.7 → lecrapaud-0.17.0}/lecrapaud/jobs/__init__.py +0 -0
  37. {lecrapaud-0.16.7 → lecrapaud-0.17.0}/lecrapaud/jobs/config.py +0 -0
  38. {lecrapaud-0.16.7 → lecrapaud-0.17.0}/lecrapaud/jobs/scheduler.py +0 -0
  39. {lecrapaud-0.16.7 → lecrapaud-0.17.0}/lecrapaud/jobs/tasks.py +0 -0
  40. {lecrapaud-0.16.7 → lecrapaud-0.17.0}/lecrapaud/misc/tabpfn_tests.ipynb +0 -0
  41. {lecrapaud-0.16.7 → lecrapaud-0.17.0}/lecrapaud/misc/test-gpu-bilstm.ipynb +0 -0
  42. {lecrapaud-0.16.7 → lecrapaud-0.17.0}/lecrapaud/misc/test-gpu-resnet.ipynb +0 -0
  43. {lecrapaud-0.16.7 → lecrapaud-0.17.0}/lecrapaud/misc/test-gpu-transformers.ipynb +0 -0
  44. {lecrapaud-0.16.7 → lecrapaud-0.17.0}/lecrapaud/search_space.py +0 -0
  45. {lecrapaud-0.16.7 → lecrapaud-0.17.0}/lecrapaud/utils.py +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: lecrapaud
-Version: 0.16.7
+Version: 0.17.0
 Summary: Framework for machine and deep learning, with regression, classification and time series analysis
 License: Apache License
 Author: Pierre H. Gallet
lecrapaud/api.py
@@ -102,46 +102,49 @@ class LeCrapaud:
 
     def compare_experiment_scores(self, name: str):
         """Compare scores of experiments with matching names.
-
+
         Args:
             name (str): Name or partial name of experiments to compare
-
+
         Returns:
             dict: Dictionary containing experiment names as keys and their scores as values
         """
         from lecrapaud.db import SessionLocal
         from sqlalchemy.orm import joinedload
-
+
         db = SessionLocal()
         try:
             # Get all experiments with the given name pattern
             experiments = (
                 db.query(Experiment)
-                .options(joinedload(Experiment.model_selections)
-                .joinedload(ModelSelection.scores))
+                .options(
+                    joinedload(Experiment.model_selections).joinedload(
+                        ModelSelection.scores
+                    )
+                )
                 .filter(Experiment.name.ilike(f"%{name}%"))
                 .all()
             )
-
+
             if not experiments:
                 return {"error": f"No experiments found with name containing '{name}'"}
-
+
             comparison = {}
-
+
             for exp in experiments:
                 scores = {
                     "rmse": exp.avg_rmse,
                     "logloss": exp.avg_logloss,
                     "accuracy": None,
                     "f1": None,
-                    "roc_auc": None
+                    "roc_auc": None,
                 }
-
+
                 # Get classification metrics from the first model selection with scores
                 for model_sel in exp.model_selections:
                     if model_sel.scores:
                         for score in model_sel.scores:
-                            if score.type == 'validation':  # Use validation scores
+                            if score.type == "validation":  # Use validation scores
                                 if score.accuracy is not None:
                                     scores["accuracy"] = score.accuracy
                                 if score.f1 is not None:
@@ -149,16 +152,16 @@ class LeCrapaud:
                                 if score.roc_auc is not None:
                                     scores["roc_auc"] = score.roc_auc
                         break
-
+
                 comparison[exp.name] = scores
-
+
             return comparison
-
+
         except Exception as e:
             return {"error": f"Error comparing experiment scores: {str(e)}"}
         finally:
             db.close()
-
+
     def list_experiments(
         self, name: str = None, limit: int = 1000
     ) -> list["ExperimentEngine"]:
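
For orientation, a minimal usage sketch of the reworked method (the experiment name and database setup are hypothetical; the return shape follows the docstring above):

    # Hedged sketch: assumes a configured database with recorded experiments.
    from lecrapaud.api import LeCrapaud  # import path assumed from this diff

    client = LeCrapaud()
    comparison = client.compare_experiment_scores("my-experiment")  # ILIKE '%name%' match
    if "error" not in comparison:
        for exp_name, metrics in comparison.items():
            # metrics holds rmse, logloss, accuracy, f1, roc_auc (None when not recorded)
            print(exp_name, metrics["rmse"], metrics["roc_auc"])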
@@ -348,6 +351,8 @@ class ExperimentEngine:
             val_size=self.val_size,
             test_size=self.test_size,
             columns_pca=self.columns_pca,
+            pca_temporal=self.pca_temporal,
+            pca_cross_sectional=self.pca_cross_sectional,
             columns_onehot=self.columns_onehot,
             columns_binary=self.columns_binary,
             columns_frequency=self.columns_frequency,
lecrapaud/feature_engineering.py
@@ -52,6 +52,9 @@ import os
 
 from sklearn.compose import ColumnTransformer
 from sklearn.decomposition import PCA
+from sklearn.impute import SimpleImputer
+from sklearn.preprocessing import StandardScaler
+from sklearn.pipeline import Pipeline
 from category_encoders import BinaryEncoder, CountEncoder
 from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
 from sklearn.model_selection import train_test_split
@@ -316,6 +319,8 @@ class PreprocessFeature:
         val_size: float = 0.2,
         test_size: float = 0.2,
         columns_pca: list[str] = [],
+        pca_temporal: dict[str, list[str]] = {},
+        pca_cross_sectional: dict[str, list[str]] = {},
         columns_onehot: list[str] = [],
         columns_binary: list[str] = [],
         columns_ordinal: list[str] = [],
@@ -329,6 +334,8 @@ class PreprocessFeature:
 
         self.experiment = experiment
         self.columns_pca = [col.upper() for col in columns_pca]
+        self.pca_temporal = pca_temporal
+        self.pca_cross_sectional = pca_cross_sectional
         self.columns_onehot = [col.upper() for col in columns_onehot]
         self.columns_binary = [col.upper() for col in columns_binary]
         self.columns_ordinal = [col.upper() for col in columns_ordinal]
@@ -364,6 +371,20 @@ class PreprocessFeature:
 
         joblib.dump(pcas, f"{self.preprocessing_dir}/pcas.pkl")
 
+        train, pcas_cross_sectional = self.add_pca_feature_cross_sectional(train)
+        val, _ = self.add_pca_feature_cross_sectional(val, pcas=pcas_cross_sectional)
+        test, _ = self.add_pca_feature_cross_sectional(test, pcas=pcas_cross_sectional)
+
+        joblib.dump(
+            pcas_cross_sectional, f"{self.preprocessing_dir}/pcas_cross_sectional.pkl"
+        )
+
+        train, pcas_temporal = self.add_pca_feature_temporal(train)
+        val, _ = self.add_pca_feature_temporal(val, pcas=pcas_temporal)
+        test, _ = self.add_pca_feature_temporal(test, pcas=pcas_temporal)
+
+        joblib.dump(pcas_temporal, f"{self.preprocessing_dir}/pcas_temporal.pkl")
+
         # Save all features before encoding
         joblib.dump(
             list(train.columns),
@@ -402,6 +423,18 @@ class PreprocessFeature:
         pcas = joblib.load(f"{self.preprocessing_dir}/pcas.pkl")
         data, _ = self.add_pca_features(data, pcas=pcas)
 
+        if os.path.exists(f"{self.preprocessing_dir}/pcas_cross_sectional.pkl"):
+            pcas_cross_sectional = joblib.load(
+                f"{self.preprocessing_dir}/pcas_cross_sectional.pkl"
+            )
+            data, _ = self.add_pca_feature_cross_sectional(
+                data, pcas=pcas_cross_sectional
+            )
+
+        if os.path.exists(f"{self.preprocessing_dir}/pcas_temporal.pkl"):
+            pcas_temporal = joblib.load(f"{self.preprocessing_dir}/pcas_temporal.pkl")
+            data, _ = self.add_pca_feature_temporal(data, pcas=pcas_temporal)
+
         # Encoding
         transformer = joblib.load(f"{self.preprocessing_dir}/column_transformer.pkl")
         data, _ = self.encode_categorical_features(
@@ -577,6 +610,120 @@ class PreprocessFeature:
 
         return df, pcas_dict
 
+    def add_pca_feature_cross_sectional(
+        self,
+        df: pd.DataFrame,
+        *,
+        n_components: int = 5,
+        pcas: dict[str, Pipeline] | None = None,  # if provided: transform only
+        impute_strategy: str = "median",
+        standardize: bool = True,
+    ) -> tuple[pd.DataFrame, dict[str, Pipeline]]:
+        """
+        Builds a pivot (index=index_col, columns=columns_col, values=value_col),
+        fits (or reuses) an Imputer(+Scaler)+PCA Pipeline, then merges the scores
+        (keyed by index_col) back into df. Returns (df_with_features, pipe).
+        """
+
+        pcas_dict = {}
+
+        for pca_cross_sectional in self.pca_cross_sectional:
+            name, index_col, columns_col, value_col = (
+                pca_cross_sectional[k] for k in ("name", "index", "columns", "value")
+            )
+            prefix = f"CS_PC_{name}"
+
+            pivot = df.pivot_table(
+                index=index_col, columns=columns_col, values=value_col
+            ).sort_index()
+
+            # Pipeline reused between train and test
+            if pcas is None:
+                steps = [("imputer", SimpleImputer(strategy=impute_strategy))]
+                if standardize:
+                    steps.append(
+                        ("scaler", StandardScaler(with_mean=True, with_std=True))
+                    )
+                pca = PCA(n_components=n_components, random_state=0)
+                steps.append(("pca", pca))
+                pipe = Pipeline(steps)
+                pipe.fit(pivot)  # <- fit on TRAIN only
+            else:
+                pipe = pcas[name]  # <- TEST: reuse the existing pipe
+
+            scores = pipe.transform(pivot)  # shape: (n_index, n_components)
+            cols = [f"{prefix}_{i}" for i in range(n_components)]
+            scores_df = pd.DataFrame(scores, index=pivot.index, columns=cols)
+
+            df = df.merge(scores_df.reset_index(), on=index_col, how="left")
+            pcas_dict.update({name: pipe})
+
+        return df, pcas_dict
+
+    # ----------------- 2) TEMPORAL PCA (list of lag columns) ----------------
+    def add_pca_feature_temporal(
+        self,
+        df: pd.DataFrame,
+        *,
+        n_components: int = 5,
+        pcas: dict[str, Pipeline] | None = None,  # if provided: transform only
+        impute_strategy: (
+            str | None
+        ) = None,  # None = require all columns to be present
+        standardize: bool = True,
+    ) -> tuple[pd.DataFrame, dict[str, Pipeline]]:
+        """
+        Applies a PCA to a matrix (rows = df rows, cols = lags).
+        Fits the Pipeline on TRAIN if pcas=None; otherwise uses pcas and only transforms.
+        Adds the columns f"{prefix}_{i}" to df. Returns (df, pipe).
+        """
+        pcas_dict = {}
+
+        for pca_temporal in self.pca_temporal:
+            name, cols = (pca_temporal[k] for k in ("name", "columns"))
+            prefix = f"TMP_PC_{name}"
+
+            # Mask of usable rows
+            if impute_strategy is None:
+                mask = (
+                    df[cols].notna().all(axis=1)
+                )  # no imputation → complete rows only
+                X_fit = df.loc[mask, cols]
+            else:
+                mask = df[cols].notna().any(axis=1)  # will impute → at least one value
+                X_fit = df.loc[mask, cols]
+
+            # Pipeline
+            if pcas is None:
+                steps = []
+                if impute_strategy is not None:
+                    steps.append(("imputer", SimpleImputer(strategy=impute_strategy)))
+                if standardize:
+                    steps.append(
+                        ("scaler", StandardScaler(with_mean=True, with_std=True))
+                    )
+                pca = PCA(n_components=n_components, random_state=0)
+                steps.append(("pca", pca))
+                pipe = Pipeline(steps)
+                if not X_fit.empty:
+                    pipe.fit(X_fit)  # <- fit on TRAIN only
+            else:
+                pipe = pcas[name]  # <- TEST
+
+            # Transform only on valid rows (mask)
+            if not df.loc[mask, cols].empty:
+                Z = pipe.transform(df.loc[mask, cols])
+                for i in range(n_components):
+                    df.loc[mask, f"{prefix}_{i}"] = Z[:, i]
+            else:
+                # create empty columns if no valid row (schema consistency)
+                for i in range(n_components):
+                    df[f"{prefix}_{i}"] = pd.NA
+
+            pcas_dict.update({name: pipe})
+
+        return df, pcas_dict
+
     # encoding categorical features
     def encode_categorical_features(
         self,
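
The diff itself never shows how pca_cross_sectional and pca_temporal are meant to be populated. Below is a hypothetical configuration sketch, inferred from the loops above: each entry is indexed with string keys ("name", "index", "columns", "value" for cross-sectional; "name", "columns" for temporal), so lists of dicts are assumed here, even though the signature annotates dict[str, list[str]]. All column names are invented.

    # Hypothetical specs; only the required keys come from the code above.
    pca_cross_sectional = [
        {
            "name": "RET",        # output columns CS_PC_RET_0 .. CS_PC_RET_4
            "index": "DATE",      # pivot_table index
            "columns": "TICKER",  # pivot_table columns
            "value": "RETURN",    # pivot_table values
        }
    ]
    pca_temporal = [
        {
            "name": "RET_LAGS",   # output columns TMP_PC_RET_LAGS_0 .. TMP_PC_RET_LAGS_4
            "columns": [f"RETURN_LAG_{i}" for i in range(1, 21)],  # lag columns
        }
    ]

These would be passed through ExperimentEngine (see the api.py hunk above), which forwards them to PreprocessFeature.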
lecrapaud/model_selection.py
@@ -1781,7 +1781,17 @@ def find_best_threshold(
                 logger.warning(
                     f"[Class {cls}] No threshold with precision ≥ {target_value}"
                 )
-                best_idx = int(np.argmax(precision))  # fallback
+                # fallback: best precision among thresholds with recall > 0
+                cand = np.where(recall > 0)[0]
+                if cand.size:
+                    best_idx = cand[int(np.argmax(precision[cand]))]
+                    logger.warning(
+                        f"[Class {cls}] Fallback to best precision with recall>0: "
+                        f"idx={best_idx}, precision={precision[best_idx]:.4f}, recall={recall[best_idx]:.4f}"
+                    )
+                else:
+                    logger.error(f"[Class {cls}] No threshold achieves recall>0.")
+                    best_idx = None
 
             elif metric == "f1":
                 valid_indices = [i for i, val in enumerate(f1) if val >= target_value]
@@ -1795,6 +1805,15 @@ def find_best_threshold(
         else:
             best_idx = int(np.argmax(values))  # no constraint, get best value
 
+        if best_idx is None:
+            results[cls_str] = {
+                "threshold": None,
+                "precision": None,
+                "recall": None,
+                "f1": None,
+            }
+            continue
+
         results[cls_str] = {
             "threshold": float(thresholds[best_idx]),
             "precision": float(precision[best_idx]),
pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "lecrapaud"
-version = "0.16.7"
+version = "0.17.0"
 description = "Framework for machine and deep learning, with regression, classification and time series analysis"
 authors = [
     {name = "Pierre H. Gallet"}