PyPI - lecrapaud - Versions diffs - 0.16.7__py3-none-any.whl → 0.17.0__py3-none-any.whl - Mend

lecrapaud 0.16.7__py3-none-any.whl → 0.17.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

lecrapaud/api.py CHANGED Viewed

@@ -102,46 +102,49 @@ class LeCrapaud:
     def compare_experiment_scores(self, name: str):
         """Compare scores of experiments with matching names.
         Args:
             name (str): Name or partial name of experiments to compare
         Returns:
             dict: Dictionary containing experiment names as keys and their scores as values
         """
         from lecrapaud.db import SessionLocal
         from sqlalchemy.orm import joinedload
         db = SessionLocal()
         try:
             # Get all experiments with the given name pattern
             experiments = (
                 db.query(Experiment)
-                .options(joinedload(Experiment.model_selections)
-                        .joinedload(ModelSelection.scores))
+                .options(
+                    joinedload(Experiment.model_selections).joinedload(
+                        ModelSelection.scores
+                    )
+                )
                 .filter(Experiment.name.ilike(f"%{name}%"))
                 .all()
             )
             if not experiments:
                 return {"error": f"No experiments found with name containing '{name}'"}
             comparison = {}
             for exp in experiments:
                 scores = {
                     "rmse": exp.avg_rmse,
                     "logloss": exp.avg_logloss,
                     "accuracy": None,
                     "f1": None,
-                    "roc_auc": None
+                    "roc_auc": None,
                 }
                 # Get classification metrics from the first model selection with scores
                 for model_sel in exp.model_selections:
                     if model_sel.scores:
                         for score in model_sel.scores:
-                            if score.type == 'validation':  # Use validation scores
+                            if score.type == "validation":  # Use validation scores
                                 if score.accuracy is not None:
                                     scores["accuracy"] = score.accuracy
                                 if score.f1 is not None:
@@ -149,16 +152,16 @@ class LeCrapaud:
                                 if score.roc_auc is not None:
                                     scores["roc_auc"] = score.roc_auc
                                 break
                 comparison[exp.name] = scores
             return comparison
         except Exception as e:
             return {"error": f"Error comparing experiment scores: {str(e)}"}
         finally:
             db.close()
     def list_experiments(
         self, name: str = None, limit: int = 1000
     ) -> list["ExperimentEngine"]:
@@ -348,6 +351,8 @@ class ExperimentEngine:
             val_size=self.val_size,
             test_size=self.test_size,
             columns_pca=self.columns_pca,
+            pca_temporal=self.pca_temporal,
+            pca_cross_sectional=self.pca_cross_sectional,
             columns_onehot=self.columns_onehot,
             columns_binary=self.columns_binary,
             columns_frequency=self.columns_frequency,

lecrapaud/feature_engineering.py CHANGED Viewed

@@ -52,6 +52,9 @@ import os
 from sklearn.compose import ColumnTransformer
 from sklearn.decomposition import PCA
+from sklearn.impute import SimpleImputer
+from sklearn.preprocessing import StandardScaler
+from sklearn.pipeline import Pipeline
 from category_encoders import BinaryEncoder, CountEncoder
 from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
 from sklearn.model_selection import train_test_split
@@ -316,6 +319,8 @@ class PreprocessFeature:
         val_size: float = 0.2,
         test_size: float = 0.2,
         columns_pca: list[str] = [],
+        pca_temporal: dict[str, list[str]] = {},
+        pca_cross_sectional: dict[str, list[str]] = {},
         columns_onehot: list[str] = [],
         columns_binary: list[str] = [],
         columns_ordinal: list[str] = [],
@@ -329,6 +334,8 @@ class PreprocessFeature:
         self.experiment = experiment
         self.columns_pca = [col.upper() for col in columns_pca]
+        self.pca_temporal = pca_temporal
+        self.pca_cross_sectional = pca_cross_sectional
         self.columns_onehot = [col.upper() for col in columns_onehot]
         self.columns_binary = [col.upper() for col in columns_binary]
         self.columns_ordinal = [col.upper() for col in columns_ordinal]
@@ -364,6 +371,20 @@ class PreprocessFeature:
         joblib.dump(pcas, f"{self.preprocessing_dir}/pcas.pkl")
+        train, pcas_cross_sectional = self.add_pca_feature_cross_sectional(train)
+        val, _ = self.add_pca_feature_cross_sectional(val, pcas=pcas_cross_sectional)
+        test, _ = self.add_pca_feature_cross_sectional(test, pcas=pcas_cross_sectional)
+        joblib.dump(
+            pcas_cross_sectional, f"{self.preprocessing_dir}/pcas_cross_sectional.pkl"
+        )
+        train, pcas_temporal = self.add_pca_feature_temporal(train)
+        val, _ = self.add_pca_feature_temporal(val, pcas=pcas_temporal)
+        test, _ = self.add_pca_feature_temporal(test, pcas=pcas_temporal)
+        joblib.dump(pcas_temporal, f"{self.preprocessing_dir}/pcas_temporal.pkl")
         # Save all features before encoding
         joblib.dump(
             list(train.columns),
@@ -402,6 +423,18 @@ class PreprocessFeature:
             pcas = joblib.load(f"{self.preprocessing_dir}/pcas.pkl")
             data, _ = self.add_pca_features(data, pcas=pcas)
+        if os.path.exists(f"{self.preprocessing_dir}/pcas_cross_sectional.pkl"):
+            pcas_cross_sectional = joblib.load(
+                f"{self.preprocessing_dir}/pcas_cross_sectional.pkl"
+            )
+            data, _ = self.add_pca_feature_cross_sectional(
+                data, pcas=pcas_cross_sectional
+            )
+        if os.path.exists(f"{self.preprocessing_dir}/pcas_temporal.pkl"):
+            pcas_temporal = joblib.load(f"{self.preprocessing_dir}/pcas_temporal.pkl")
+            data, _ = self.add_pca_feature_temporal(data, pcas=pcas_temporal)
         # Encoding
         transformer = joblib.load(f"{self.preprocessing_dir}/column_transformer.pkl")
         data, _ = self.encode_categorical_features(
@@ -577,6 +610,120 @@ class PreprocessFeature:
         return df, pcas_dict
+    def add_pca_feature_cross_sectional(
+        self,
+        df: pd.DataFrame,
+        *,
+        n_components: int = 5,
+        pcas: dict[str, Pipeline] | None = None,  # si fourni: transform only
+        impute_strategy: str = "median",
+        standardize: bool = True,
+    ) -> tuple[pd.DataFrame, dict[str, Pipeline]]:
+        """
+        Construit un pivot (index=index_col, columns=columns_col, values=value_col),
+        fit (ou réutilise) un Pipeline Imputer(+Scaler)+PCA, puis merge les scores
+        (par index_col) dans df. Renvoie (df_avec_features, pipe).
+        """
+        pcas_dict = {}
+        for pca_cross_sectional in self.pca_cross_sectional:
+            name, index_col, columns_col, value_col = (
+                pca_cross_sectional[k] for k in ("name", "index", "columns", "value")
+            )
+            prefix = f"CS_PC_{name}"
+            pivot = df.pivot_table(
+                index=index_col, columns=columns_col, values=value_col
+            ).sort_index()
+            # Pipeline à réutiliser entre train et test
+            if pcas is None:
+                steps = [("imputer", SimpleImputer(strategy=impute_strategy))]
+                if standardize:
+                    steps.append(
+                        ("scaler", StandardScaler(with_mean=True, with_std=True))
+                    )
+                pca = PCA(n_components=n_components, random_state=0)
+                steps.append(("pca", pca))
+                pipe = Pipeline(steps)
+                pipe.fit(pivot)  # <- fit sur TRAIN uniquement
+            else:
+                pipe = pcas[name]  # <- TEST : on réutilise le pipe existant
+            scores = pipe.transform(pivot)  # shape: (n_index, n_components)
+            cols = [f"{prefix}_{i}" for i in range(n_components)]
+            scores_df = pd.DataFrame(scores, index=pivot.index, columns=cols)
+            df = df.merge(scores_df.reset_index(), on=index_col, how="left")
+            pcas_dict.update({name: pipe})
+        return df, pcas_dict
+    # ----------------- 2) PCA TEMPORELLE (liste de colonnes lags) ----------------
+    def add_pca_feature_temporal(
+        self,
+        df: pd.DataFrame,
+        *,
+        n_components: int = 5,
+        pcas: dict[str, Pipeline] | None = None,  # si fourni: transform only
+        impute_strategy: (
+            str | None
+        ) = None,  # None = on exige toutes les colonnes présentes
+        standardize: bool = True,
+    ) -> tuple[pd.DataFrame, dict[str, Pipeline]]:
+        """
+        Applique une PCA sur une matrice (rows = lignes df, cols = lags).
+        Fit le Pipeline sur TRAIN si pcas=None; sinon, utilise pcas et fait transform.
+        Ajoute les colonnes f"{prefix}_{i}" dans df. Renvoie (df, pipe).
+        """
+        pcas_dict = {}
+        for pca_temporal in self.pca_temporal:
+            name, cols = (pca_temporal[k] for k in ("name", "columns"))
+            prefix = f"TMP_PC_{name}"
+            # Masque des lignes utilisables
+            if impute_strategy is None:
+                mask = (
+                    df[cols].notna().all(axis=1)
+                )  # on n'impute pas → lignes complètes
+                X_fit = df.loc[mask, cols]
+            else:
+                mask = df[cols].notna().any(axis=1)  # on imputera → au moins une valeur
+                X_fit = df.loc[mask, cols]
+            # Pipeline
+            if pcas is None:
+                steps = []
+                if impute_strategy is not None:
+                    steps.append(("imputer", SimpleImputer(strategy=impute_strategy)))
+                if standardize:
+                    steps.append(
+                        ("scaler", StandardScaler(with_mean=True, with_std=True))
+                    )
+                pca = PCA(n_components=n_components, random_state=0)
+                steps.append(("pca", pca))
+                pipe = Pipeline(steps)
+                if not X_fit.empty:
+                    pipe.fit(X_fit)  # <- fit sur TRAIN uniquement
+            else:
+                pipe = pcas[name]  # <- TEST
+            # Transform uniquement sur lignes valides (mask)
+            if not df.loc[mask, cols].empty:
+                Z = pipe.transform(df.loc[mask, cols])
+                for i in range(n_components):
+                    df.loc[mask, f"{prefix}_{i}"] = Z[:, i]
+            else:
+                # crée les colonnes vides si aucune ligne valide (cohérence de schéma)
+                for i in range(n_components):
+                    df[f"{prefix}_{i}"] = pd.NA
+            pcas_dict.update({name: pipe})
+        return df, pcas_dict
     # encoding categorical features
     def encode_categorical_features(
         self,

lecrapaud/model_selection.py CHANGED Viewed

@@ -1781,7 +1781,17 @@ def find_best_threshold(
                     logger.warning(
                         f"[Class {cls}] No threshold with precision ≥ {target_value}"
                     )
-                    best_idx = int(np.argmax(precision))  # fallback
+                    # fallback: meilleure precision parmi ceux avec recall>0
+                    cand = np.where(recall > 0)[0]
+                    if cand.size:
+                        best_idx = cand[int(np.argmax(precision[cand]))]
+                        logger.warning(
+                            f"[Class {cls}] Fallback to best precision with recall>0: "
+                            f"idx={best_idx}, precision={precision[best_idx]:.4f}, recall={recall[best_idx]:.4f}"
+                        )
+                    else:
+                        logger.error(f"[Class {cls}] No threshold achieves recall>0.")
+                        best_idx = None
             elif metric == "f1":
                 valid_indices = [i for i, val in enumerate(f1) if val >= target_value]
@@ -1795,6 +1805,15 @@ def find_best_threshold(
         else:
             best_idx = int(np.argmax(values))  # no constraint, get best value
+        if best_idx is None:
+            results[cls_str] = {
+                "threshold": None,
+                "precision": None,
+                "recall": None,
+                "f1": None,
+            }
+            continue
         results[cls_str] = {
             "threshold": float(thresholds[best_idx]),
             "precision": float(precision[best_idx]),

{lecrapaud-0.16.7.dist-info → lecrapaud-0.17.0.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: lecrapaud
-Version: 0.16.7
+Version: 0.17.0
 Summary: Framework for machine and deep learning, with regression, classification and time series analysis
 License: Apache License
 Author: Pierre H. Gallet

{lecrapaud-0.16.7.dist-info → lecrapaud-0.17.0.dist-info}/RECORD RENAMED Viewed

@@ -1,5 +1,5 @@
 lecrapaud/__init__.py,sha256=oCxbtw_nk8rlOXbXbWo0RRMlsh6w-hTiZ6e5PRG_wp0,28
-lecrapaud/api.py,sha256=XsdK1jywLOOGcMMtx09KtjLpEzzLpFtXfVjJrQSfcH0,22639
+lecrapaud/api.py,sha256=fYNkJizvnCdwQelSHlJjcDdBoiAvLm8tKbST1TsMAPc,22669
 lecrapaud/config.py,sha256=itiqC31HB8i2Xo-kn2viCQrg_9tnA07-TJuZ-xdnx44,1126
 lecrapaud/db/__init__.py,sha256=82o9fMfaqKXPh2_rt44EzNRVZV1R4LScEnQYvj_TjK0,34
 lecrapaud/db/alembic/README,sha256=MVlc9TYmr57RbhXET6QxgyCcwWP7w-vLkEsirENqiIQ,38
@@ -25,7 +25,7 @@ lecrapaud/db/models/utils.py,sha256=-a-nWWmpJ2XzidIxo2COVUTrGZIPYCfBzjhcszJj_bM,
 lecrapaud/db/session.py,sha256=E93WXcFFILFAIeH61ft2Egs7D-6caqs0oi4zCkO5Lq4,2822
 lecrapaud/directories.py,sha256=0LrANuDgbuneSLker60c6q2hmGnQ3mKHIztTGzTx6Gw,826
 lecrapaud/experiment.py,sha256=1xLWjOrqAxJh9CdXOx9ppQuRFRRj0GH-xYZqg-ty9hI,2463
-lecrapaud/feature_engineering.py,sha256=J7lWp-lQmuMiirT9QeuK5bxl2YutilZ1JGHR12i0V64,32790
+lecrapaud/feature_engineering.py,sha256=sGdQJIX7efdvNDlBWWOJD9NMZ8MzEyTOHCSRnTkJl5E,38970
 lecrapaud/feature_selection.py,sha256=6ry-oVPQHbipm1XSE5YsH7AY0lQFt4CFbWiHiRs1nxg,43593
 lecrapaud/integrations/openai_integration.py,sha256=hHLF3fk5Bps8KNbNrEL3NUFa945jwClE6LrLpuMZOd4,7459
 lecrapaud/jobs/__init__.py,sha256=ZkrsyTOR21c_wN7RY8jPhm8jCrL1oCEtTsf3VFIlQiE,292
@@ -36,10 +36,10 @@ lecrapaud/misc/tabpfn_tests.ipynb,sha256=VkgsCUJ30d8jaL2VaWtQAgb8ngHPNtPgnXLs7QQ
 lecrapaud/misc/test-gpu-bilstm.ipynb,sha256=4nLuZRJVe2kn6kEmauhRiz5wkWT9AVrYhI9CEk_dYUY,9608
 lecrapaud/misc/test-gpu-resnet.ipynb,sha256=27Vu7nYwujYeh3fOxBNCnKJn3MXNPKZU-U8oDDUbymg,4944
 lecrapaud/misc/test-gpu-transformers.ipynb,sha256=k6MBSs_Um1h4PykvE-LTBcdpbWLbIFST_xl_AFW2jgI,8444
-lecrapaud/model_selection.py,sha256=Q7afY0UzFzs2fFEPNXIBxjpabmruxiTmDh5OssPayLk,71139
+lecrapaud/model_selection.py,sha256=tHGnYeuUC38fBeJcoHunnXDVd6RjuoawdY3peEvqy6I,71954
 lecrapaud/search_space.py,sha256=-JkzuMhaomdwiWi4HvVQY5hiw3-oREemJA16tbwEIp4,34854
 lecrapaud/utils.py,sha256=JdBB1NvbNIx4y0Una-kSZdo1_ZEocc5hwyYFIZKHmGg,8305
-lecrapaud-0.16.7.dist-info/LICENSE,sha256=MImCryu0AnqhJE_uAZD-PIDKXDKb8sT7v0i1NOYeHTM,11350
-lecrapaud-0.16.7.dist-info/METADATA,sha256=5NUEvWiw9TIKhDPOlh7WIYXvcsnKErDPMUdayfBfC24,11081
-lecrapaud-0.16.7.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
-lecrapaud-0.16.7.dist-info/RECORD,,
+lecrapaud-0.17.0.dist-info/LICENSE,sha256=MImCryu0AnqhJE_uAZD-PIDKXDKb8sT7v0i1NOYeHTM,11350
+lecrapaud-0.17.0.dist-info/METADATA,sha256=-SzhIiALD3TcSnAnhxqqX0imJ608yQWWPQ4PeezAAh8,11081
+lecrapaud-0.17.0.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
+lecrapaud-0.17.0.dist-info/RECORD,,

{lecrapaud-0.16.7.dist-info → lecrapaud-0.17.0.dist-info}/LICENSE RENAMED Viewed

File without changes

{lecrapaud-0.16.7.dist-info → lecrapaud-0.17.0.dist-info}/WHEEL RENAMED Viewed

File without changes

lecrapaud 0.16.7__py3-none-any.whl → 0.17.0__py3-none-any.whl

lecrapaud 0.16.7__py3-none-any.whl → 0.17.0__py3-none-any.whl