lecrapaud 0.19.0__py3-none-any.whl → 0.22.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lecrapaud/__init__.py +22 -1
- lecrapaud/{api.py → base.py} +331 -241
- lecrapaud/config.py +15 -3
- lecrapaud/db/alembic/versions/2025_10_25_0635-07e303521594_add_unique_constraint_to_score.py +39 -0
- lecrapaud/db/alembic/versions/2025_10_26_1727-033e0f7eca4f_merge_score_and_model_trainings_into_.py +264 -0
- lecrapaud/db/alembic/versions/2025_10_28_2006-0a8fb7826e9b_add_number_of_targets_and_remove_other_.py +75 -0
- lecrapaud/db/models/__init__.py +2 -4
- lecrapaud/db/models/base.py +116 -65
- lecrapaud/db/models/experiment.py +195 -182
- lecrapaud/db/models/feature_selection.py +0 -3
- lecrapaud/db/models/feature_selection_rank.py +0 -18
- lecrapaud/db/models/model_selection.py +2 -2
- lecrapaud/db/models/{score.py → model_selection_score.py} +29 -12
- lecrapaud/db/session.py +4 -0
- lecrapaud/experiment.py +44 -17
- lecrapaud/feature_engineering.py +45 -674
- lecrapaud/feature_preprocessing.py +1202 -0
- lecrapaud/feature_selection.py +145 -332
- lecrapaud/integrations/sentry_integration.py +46 -0
- lecrapaud/misc/tabpfn_tests.ipynb +2 -2
- lecrapaud/mixins.py +247 -0
- lecrapaud/model_preprocessing.py +295 -0
- lecrapaud/model_selection.py +612 -242
- lecrapaud/pipeline.py +548 -0
- lecrapaud/search_space.py +2 -1
- lecrapaud/utils.py +36 -3
- lecrapaud-0.22.6.dist-info/METADATA +423 -0
- lecrapaud-0.22.6.dist-info/RECORD +51 -0
- {lecrapaud-0.19.0.dist-info → lecrapaud-0.22.6.dist-info}/WHEEL +1 -1
- {lecrapaud-0.19.0.dist-info → lecrapaud-0.22.6.dist-info/licenses}/LICENSE +1 -1
- lecrapaud/db/models/model_training.py +0 -64
- lecrapaud/jobs/__init__.py +0 -13
- lecrapaud/jobs/config.py +0 -17
- lecrapaud/jobs/scheduler.py +0 -30
- lecrapaud/jobs/tasks.py +0 -17
- lecrapaud-0.19.0.dist-info/METADATA +0 -249
- lecrapaud-0.19.0.dist-info/RECORD +0 -48
lecrapaud/feature_selection.py
CHANGED
@@ -5,16 +5,12 @@ import matplotlib.pyplot as plt
 import seaborn as sns
 import os
 import time
-from typing import Optional
 from tqdm import tqdm
 import warnings
 from concurrent.futures import ProcessPoolExecutor, as_completed
 import joblib
-import re
 from pathlib import Path

-os.environ["COVERAGE_FILE"] = str(Path(".coverage").resolve())
-
 # feature selection
 from sklearn.feature_selection import (
     f_classif,
@@ -33,7 +29,6 @@ from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
 from sklearn.model_selection import TimeSeriesSplit
 from sklearn.metrics import root_mean_squared_error, log_loss, make_scorer
 from mlxtend.feature_selection import SequentialFeatureSelector
-from sklearn.preprocessing import StandardScaler, MinMaxScaler
 from scipy.stats import spearmanr, kendalltau

 # Internal
@@ -48,6 +43,7 @@ from lecrapaud.db import (
     FeatureSelectionRank,
 )
 from lecrapaud.search_space import all_models
+from lecrapaud.mixins import LeCrapaudEstimatorMixin

 # Annoying Warnings
 warnings.filterwarnings("ignore", category=FutureWarning)
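LeCrapaudEstimatorMixin comes from the new lecrapaud/mixins.py (+247 lines, not part of this diff). Later hunks call _validate_data, _set_fitted, and _check_is_fitted on it; the fitted-state half of such a mixin could look like the minimal sketch below. This is illustrative only, not the package's actual code.

from sklearn.exceptions import NotFittedError

# Sketch of a fitted-state guard implied by _set_fitted()/_check_is_fitted();
# the real LeCrapaudEstimatorMixin in lecrapaud/mixins.py is not shown here.
class FittedStateMixin:
    _is_fitted = False

    def _set_fitted(self):
        # Called at the end of a successful fit()
        self._is_fitted = True

    def _check_is_fitted(self):
        # Called by any accessor that requires a completed fit()
        if not self._is_fitted:
            raise NotFittedError(
                f"This {type(self).__name__} instance is not fitted yet. "
                "Call 'fit' before using this method."
            )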
@@ -72,39 +68,66 @@ def load_train_data(experiment_dir):
     return train, val, test, train_scaled, val_scaled, test_scaled


-class FeatureSelectionEngine:
-    def __init__(self, …
-        …
+class FeatureSelector(LeCrapaudEstimatorMixin):
+    def __init__(self, experiment=None, target_number=None, **kwargs):
+        # The mixin will automatically set all experiment.context parameters as attributes
+        super().__init__(experiment=experiment, target_number=target_number, **kwargs)
+
+        # Set defaults for required parameters if not provided
+        if not hasattr(self, "target_clf"):
+            self.target_clf = []
+        if not hasattr(self, "max_p_value_categorical"):
+            self.max_p_value_categorical = 0.05
+        if not hasattr(self, "percentile"):
+            self.percentile = 20
+        if not hasattr(self, "corr_threshold"):
+            self.corr_threshold = 80
+        if not hasattr(self, "max_features"):
+            self.max_features = 50
+
         self.target_number = target_number
-        self.target_clf = target_clf

-        …
+        # Derived attributes
+        if self.target_number is not None and hasattr(self, "target_clf"):
+            self.target_type = (
+                "classification"
+                if self.target_number in self.target_clf
+                else "regression"
+            )

-        …
+        # Set paths if experiment is available
+        if self.experiment:
+            self.experiment_dir = self.experiment.path
+            self.experiment_id = self.experiment.id
+            self.data_dir = f"{self.experiment_dir}/data"
+            if self.target_number is not None:
+                self.target_dir = f"{self.experiment_dir}/TARGET_{self.target_number}"
+                self.feature_selection_dir = f"{self.target_dir}/feature_selection"
+                os.makedirs(self.feature_selection_dir, exist_ok=True)

     # Main feature selection function
-    def …
-        …
-    ):
-        """Function to do feature selection with a range of different feature selection technics
+    def fit(self, X, y=None, single_process=True):
+        """
+        Fit the feature selector.

         Args:
-            …
+            X (pd.DataFrame): Input features
+            y: Target values (ignored, uses TARGET columns in X)
+            single_process (bool): if True, run all feature selection methods in a single process
+
+        Returns:
+            self: Returns self for chaining (sklearn convention)
         """
+        # Validate data
+        X, y = self._validate_data(X, y)
+
+        # Store train data
+        self.train = X
+
+        # Check that target_number is set
+        if self.target_number is None:
+            raise ValueError("target_number must be set before fitting")
+
         target_number = self.target_number
         target_type = self.target_type

@@ -115,7 +138,6 @@ class FeatureSelectionEngine:
         max_features = self.max_features

         feature_selection = FeatureSelection.upsert(
-            match_fields=["target_id", "experiment_id"],
             target_id=target.id,
             experiment_id=self.experiment_id,
         )
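The explicit match_fields argument is dropped here. Given that this release also reworks lecrapaud/db/models/base.py (+116 -65) and adds unique constraints via the Alembic migrations listed above, a plausible reading is that upsert now infers its match key from the model's declared unique constraints. A sketch of that pattern follows; the names and session handling are assumptions, not the package's actual implementation.

from sqlalchemy import UniqueConstraint

class UpsertMixin:
    """Sketch: upsert keyed on the table's unique constraints."""

    @classmethod
    def upsert(cls, session, **values):
        # Collect the columns covered by declared unique constraints
        # instead of requiring an explicit match_fields argument.
        keys = [
            col.name
            for constraint in cls.__table__.constraints
            if isinstance(constraint, UniqueConstraint)
            for col in constraint.columns
        ]
        filters = {k: values[k] for k in keys if k in values}
        instance = (
            session.query(cls).filter_by(**filters).one_or_none() if filters else None
        )
        if instance is None:
            instance = cls(**values)
            session.add(instance)
        else:
            for key, value in values.items():
                setattr(instance, key, value)
        session.commit()
        return instance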
@@ -276,6 +298,58 @@ class FeatureSelectionEngine:

         features_selected_list = features_selected["features"].values.tolist()

+        # Save ensemble features for all numerical features with global ranking
+        logger.info(
+            "Saving ensemble features with global ranking for all numerical features..."
+        )
+        numerical_features_in_data = self.X_numerical.columns.tolist()
+        ensemble_rows = []
+
+        # Create global ranking for ALL numerical features (1 to n, no null values)
+        all_numerical_scores = pd.concat(results, axis=0)
+        all_numerical_scores = (
+            all_numerical_scores.groupby("features")
+            .agg({"rank": "mean"})  # Average rank across all methods
+            .reset_index()
+        )
+        all_numerical_scores.sort_values("rank", inplace=True)
+        all_numerical_scores["global_rank"] = range(1, len(all_numerical_scores) + 1)
+
+        for feature in numerical_features_in_data:
+            feature_id = feature_map.get(feature)
+            if feature_id:
+                is_selected = feature in features_selected_list
+
+                # Get global rank (no null values - all features get a rank)
+                if feature in all_numerical_scores["features"].values:
+                    global_rank = all_numerical_scores[
+                        all_numerical_scores["features"] == feature
+                    ]["global_rank"].values[0]
+                else:
+                    # Fallback: assign last rank + position for features not in results
+                    global_rank = (
+                        len(all_numerical_scores)
+                        + numerical_features_in_data.index(feature)
+                        + 1
+                    )
+
+                ensemble_rows.append(
+                    {
+                        "feature_selection_id": feature_selection.id,
+                        "feature_id": feature_id,
+                        "method": "ensemble",
+                        "score": None,
+                        "pvalue": None,
+                        "support": (
+                            2 if is_selected else 0
+                        ),  # 2 = in aggregated features
+                        "rank": global_rank,
+                        "training_time": 0,
+                    }
+                )
+
+        FeatureSelectionRank.bulk_upsert(rows=ensemble_rows)
+
         # analysis 1
         features_selected_by_every_methods = set(results[0]["features"].values.tolist())
         for df in results[1:]:
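The new block derives a single global ranking by averaging each feature's rank across all selection methods, then densely re-ranking from 1 to n. A standalone pandas sketch of that aggregation on toy data; the method names and values here are made up for illustration.

import pandas as pd

# Toy per-method rank tables shaped like the `results` DataFrames above
# (columns "features" and "rank").
f_classif_ranks = pd.DataFrame({"features": ["a", "b", "c"], "rank": [1, 2, 3]})
mutual_info_ranks = pd.DataFrame({"features": ["a", "b", "c"], "rank": [2, 1, 3]})
results = [f_classif_ranks, mutual_info_ranks]

# Same aggregation as the diff: mean rank per feature across methods,
# then a dense 1..n global rank with no nulls.
scores = (
    pd.concat(results, axis=0)
    .groupby("features")
    .agg({"rank": "mean"})
    .reset_index()
    .sort_values("rank")
)
scores["global_rank"] = range(1, len(scores) + 1)
print(scores)  # a and b tie at mean rank 1.5; the stable sort breaks the tie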
@@ -303,12 +377,30 @@ class FeatureSelectionEngine:
             header=True,
             index_label="ID",
         )
+
+        # Update support for features after correlation removal (before max)
+        logger.info("Updating ensemble features after correlation removal...")
+        for row in ensemble_rows:
+            feature = Feature.get(row["feature_id"]).name
+            if feature in features:
+                row["support"] = 1  # 1 = survived correlation removal
+
         features = features[:max_features]

         # adding categorical features selected
         features += (
             categorical_features_selected if target_type == "classification" else []
         )
+
+        # Final update for features after max limitation (final selection)
+        logger.info("Finalizing ensemble features...")
+        for row in ensemble_rows:
+            feature = Feature.get(row["feature_id"]).name
+            if feature in features and row["support"] == 1:
+                row["support"] = 2  # 2 = in final selection
+
+        # Re-save all ensemble data with updated support values
+        FeatureSelectionRank.bulk_upsert(rows=ensemble_rows)
         logger.debug(
             f"Final pre-selection: {len(features)} features below {corr_threshold}% out of {len(features_selected_list)} features, and rejected {len(features_correlated)} features, {100*len(features)/len(features_selected_list):.2f}% features selected"
         )
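Taken together, the three writes give the ensemble rows a staged support encoding, per the inline comments: 0 = not in the aggregated per-method selection, 1 = survived correlation removal but fell outside the final cut, 2 = in the final selection (rows cut by correlation removal keep their first-pass value, since only survivors are rewritten). A toy trace of one surviving row, purely illustrative:

# Toy trace of one ensemble row through the three passes above
row = {"support": 0}
row["support"] = 2      # pass 1: in the aggregated per-method selection
row["support"] = 1      # pass 2: survived correlation removal
if row["support"] == 1:
    row["support"] = 2  # pass 3: also within the max_features cap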
@@ -352,7 +444,20 @@ class FeatureSelectionEngine:
         feature_selection.best_features_path = best_features_path
         feature_selection.save()

-        …
+        # Store selected features for later access
+        self.selected_features_ = features
+        self._set_fitted()
+        return self
+
+    def get_selected_features(self):
+        """
+        Get the list of selected features after fitting.
+
+        Returns:
+            list: Selected feature names
+        """
+        self._check_is_fitted()
+        return self.selected_features_

     # Remove correlation
     # ------------------
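With fit returning self and get_selected_features guarded by the fitted check, the class now follows the sklearn estimator protocol. A hypothetical usage sketch inferred from this diff; `experiment` stands for a lecrapaud Experiment with .path and .id, and X is a DataFrame carrying the TARGET_n columns, per the fit docstring.

from lecrapaud.feature_selection import FeatureSelector

# experiment and X are placeholders for objects created elsewhere
selector = FeatureSelector(experiment=experiment, target_number=1)
selector.fit(X)                           # returns self, sklearn-style
print(selector.get_selected_features())  # available only after fit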
@@ -441,13 +546,20 @@ class FeatureSelectionEngine:
         feat_scores["features"] = X.columns
         feat_scores["rank"] = feat_scores["score"].rank(method="first", ascending=False)
         feat_scores["method"] = "Chi2"
+
+        # Apply both percentile and p-value filtering
+        # Keep features that satisfy BOTH conditions: within percentile AND p-value < threshold
+        feat_scores["support"] = feat_scores["support"] & (
+            feat_scores["pvalue"] <= self.max_p_value_categorical
+        )
+
         feat_scores.sort_values("rank", ascending=True, inplace=True)
         stop = time.time()
         training_time = timedelta(seconds=(stop - start)).total_seconds()
         feat_scores["training_time"] = training_time

         logger.debug(
-            f"Chi2 evaluation selected {feat_scores['support'].sum()} features in {training_time:.2f} seconds"
+            f"Chi2 evaluation selected {feat_scores['support'].sum()} features in {training_time:.2f} seconds (percentile={percentile}%, p-value<={self.max_p_value_categorical})"
         )

         feat_scores.to_csv(
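The new gate requires a categorical feature to clear both filters at once: it must sit inside the score percentile and its chi-squared p-value must not exceed the threshold. A self-contained sketch of the same AND-logic with scikit-learn's chi2 scorer; the data and names below are made up, and whether the engine uses SelectPercentile internally is an assumption (its percentile-based support is computed upstream of this hunk).

import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectPercentile, chi2

# Toy non-negative data, as chi2 requires.
rng = np.random.default_rng(0)
X = pd.DataFrame(
    rng.integers(0, 5, size=(200, 10)),
    columns=[f"cat_{i}" for i in range(10)],
)
y = rng.integers(0, 2, size=200)

percentile = 20
max_p_value = 0.05

selector = SelectPercentile(chi2, percentile=percentile).fit(X, y)
pvalues = selector.pvalues_

# Same idea as the diff: a feature must be in the top percentile
# AND have a p-value at or below the threshold.
support = selector.get_support() & (pvalues <= max_p_value)
print(f"kept {support.sum()} of {X.shape[1]} features")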
@@ -796,305 +908,6 @@ class FeatureSelectionEngine:
         return feat_scores


-class PreprocessModel:
-
-    def __init__(
-        self,
-        train,
-        val,
-        test,
-        experiment,
-        target_numbers,
-        target_clf,
-        models_idx,
-        time_series,
-        max_timesteps,
-        group_column,
-        date_column,
-        **kwargs,
-    ):
-        self.train = train
-        self.val = val
-        self.test = test
-        self.experiment = experiment
-        self.target_numbers = target_numbers
-        self.target_clf = target_clf
-        self.models_idx = models_idx
-        self.time_series = time_series
-        self.max_timesteps = max_timesteps
-        self.group_column = group_column
-        self.date_column = date_column
-
-        self.experiment_dir = experiment.path
-        self.data_dir = f"{self.experiment_dir}/data"
-        self.preprocessing_dir = f"{self.experiment_dir}/preprocessing"
-
-        self.all_features = experiment.get_all_features(
-            date_column=date_column, group_column=group_column
-        )
-
-    def run(self):
-        # save data
-        columns_to_keep = self.all_features + [
-            f"TARGET_{i}" for i in self.target_numbers
-        ]
-        duplicates = [
-            col for col in set(columns_to_keep) if columns_to_keep.count(col) > 1
-        ]
-        if duplicates:
-            raise ValueError(f"Duplicates detected in columns_to_keep: {duplicates}")
-        self.train = self.train[columns_to_keep]
-        self.val = self.val[columns_to_keep]
-        self.test = self.test[columns_to_keep]
-        joblib.dump(self.train, f"{self.data_dir}/train.pkl")
-        joblib.dump(self.val, f"{self.data_dir}/val.pkl")
-        joblib.dump(self.test, f"{self.data_dir}/test.pkl")
-
-        # scaling features
-        if any(t not in self.target_clf for t in self.target_numbers) and any(
-            all_models[i].get("need_scaling") for i in self.models_idx
-        ):
-            logger.info("Scaling features...")
-            train_scaled, scaler_x, scalers_y = self.scale_data(self.train)
-            val_scaled, _, _ = self.scale_data(
-                self.val,
-                scaler_x=scaler_x,
-                scalers_y=scalers_y,
-            )
-            test_scaled, _, _ = self.scale_data(
-                self.test,
-                scaler_x=scaler_x,
-                scalers_y=scalers_y,
-            )
-        else:
-            train_scaled = None
-            val_scaled = None
-            test_scaled = None
-            scaler_x = None
-
-        # save data
-        joblib.dump(train_scaled, f"{self.data_dir}/train_scaled.pkl")
-        joblib.dump(val_scaled, f"{self.data_dir}/val_scaled.pkl")
-        joblib.dump(test_scaled, f"{self.data_dir}/test_scaled.pkl")
-        joblib.dump(scaler_x, f"{self.preprocessing_dir}/scaler_x.pkl")
-
-        data = {
-            "train": self.train,
-            "val": self.val,
-            "test": self.test,
-            "train_scaled": train_scaled,
-            "val_scaled": val_scaled,
-            "test_scaled": test_scaled,
-        }
-
-        # reshape data for time series
-        reshaped_data = None
-        if (
-            any(all_models[i].get("recurrent") for i in self.models_idx)
-            and self.time_series
-        ):
-            # reshaping data for recurrent models
-            logger.info("Reshaping data for recurrent models...")
-            reshaped_data = self.reshape_time_series(
-                train_scaled,
-                val_scaled,
-                test_scaled,
-                features=self.all_features,
-                timesteps=self.max_timesteps,
-            )
-
-        return data, reshaped_data
-
-    def inference(self):
-        # self.train is new data here
-        columns_to_keep = self.all_features
-        self.train = self.train[columns_to_keep]
-
-        scaler_x = joblib.load(f"{self.preprocessing_dir}/scaler_x.pkl")
-
-        if scaler_x:
-            scaled_data = scaler_x.transform(self.train)
-            scaled_data = pd.DataFrame(
-                scaled_data, columns=self.train.columns, index=self.train.index
-            )
-        else:
-            scaled_data = self.train
-
-        reshaped_data = None
-        if (
-            any(all_models[i].get("recurrent") for i in self.models_idx)
-            and self.time_series
-        ):
-            # we need to make sure we have max_timesteps of data after grouping by group_column
-            if (
-                self.group_column
-                and scaled_data.groupby(self.group_column).size().min()
-                < self.max_timesteps
-            ) or scaled_data.shape[0] < self.max_timesteps:
-                raise ValueError(
-                    f"Not enough data for group_column {self.group_column} to reshape data for recurrent models"
-                )
-
-            # reshaping data for recurrent models
-            logger.info("Reshaping data for recurrent models...")
-            reshaped_data = self.reshape_time_series(
-                scaled_data,
-                features=self.all_features,
-                timesteps=self.max_timesteps,
-            )
-
-        return self.train, scaled_data, reshaped_data
-
-    # scaling
-    def scale_data(
-        self,
-        df: pd.DataFrame,
-        scaler_x=None,
-        scalers_y: Optional[list] = None,
-    ):
-        logger.info("Scale data...")
-        X = df.loc[:, ~df.columns.str.contains("^TARGET_")]
-
-        if scaler_x:
-            X_scaled = pd.DataFrame(
-                scaler_x.transform(X), columns=list(X.columns), index=X.index
-            )
-        else:
-            scaler_x = StandardScaler()  # MinMaxScaler(feature_range=(-1,1))
-            X_scaled = pd.DataFrame(
-                scaler_x.fit_transform(X), columns=list(X.columns), index=X.index
-            )
-
-        # Determine which targets need to be scaled
-        targets_numbers_to_scale = [
-            i for i in self.target_numbers if i not in self.target_clf
-        ]
-
-        # Dictionary to store scaled target data
-        scaled_targets = {}
-
-        if scalers_y:
-            for target_number in targets_numbers_to_scale:
-                y = df[[f"TARGET_{target_number}"]]
-                scaled_targets[target_number] = pd.DataFrame(
-                    scalers_y[f"scaler_y_{target_number}"].transform(y.values),
-                    columns=y.columns,
-                    index=y.index,
-                )
-        else:
-            scalers_y = {}
-            for target_number in targets_numbers_to_scale:
-                scaler_y = StandardScaler()
-                y = df[[f"TARGET_{target_number}"]]
-
-                scaled_y = pd.DataFrame(
-                    scaler_y.fit_transform(y.values),
-                    columns=y.columns,
-                    index=y.index,
-                )
-                target_dir = f"{self.experiment_dir}/TARGET_{target_number}"
-                joblib.dump(scaler_y, f"{target_dir}/scaler_y.pkl")
-
-                scalers_y[f"scaler_y_{target_number}"] = scaler_y
-                scaled_targets[target_number] = scaled_y
-
-        # Reconstruct y_scaled in the original order
-        y_scaled = pd.concat(
-            [
-                scaled_targets[target_number]
-                for target_number in targets_numbers_to_scale
-            ],
-            axis=1,
-        )
-        y_not_scaled = df[
-            df.columns.intersection([f"TARGET_{i}" for i in self.target_clf])
-        ]
-
-        # Ensure the final DataFrame keeps the original order
-        df_scaled = pd.concat(
-            [X_scaled, y_scaled, y_not_scaled],
-            axis=1,
-        )[
-            df.columns
-        ]  # Reorder columns to match original `df`
-
-        if not df_scaled.columns.equals(df.columns):
-            raise Exception("Columns are not in the same order after scaling.")
-
-        return df_scaled, scaler_x, scalers_y
-
-    # Reshape into 3D tensors for recurrent models
-    def reshape_time_series(
-        self,
-        train: pd.DataFrame,
-        val: pd.DataFrame,
-        test: pd.DataFrame,
-        features: list,
-        timesteps: int = 120,
-    ):
-        # always scale for recurrent layers : train should be scaled
-        group_column = self.group_column
-
-        target_columns = train.columns.intersection(
-            [f"TARGET_{i}" for i in self.target_numbers]
-        )
-
-        data = pd.concat([train, val, test], axis=0)
-
-        def reshape_df(df: pd.DataFrame, group_series: pd.Series, timesteps: int):
-            fill_value = [[[0] * len(df.columns)]]
-
-            def shiftsum(x, timesteps: int):
-                tmp = x.copy()
-                for i in range(1, timesteps):
-                    tmp = x.shift(i, fill_value=fill_value) + tmp
-                return tmp
-
-            logger.info("Grouping each feature in a unique column with list...")
-            df_reshaped = df.apply(list, axis=1).apply(lambda x: [list(x)])
-            df_reshaped = pd.concat([df_reshaped, group_series], axis=1)
-
-            logger.info("Grouping features and creating timesteps...")
-            df_reshaped = (
-                df_reshaped.groupby(group_column)[0]
-                .apply(lambda x: shiftsum(x, timesteps))
-                .reset_index(group_column, drop=True)
-                .rename("RECURRENT_FEATURES")
-            )
-            df_reshaped = pd.DataFrame(df_reshaped)
-
-            return df_reshaped
-
-        data_reshaped = reshape_df(data[features], data[group_column], timesteps)
-
-        data_reshaped[target_columns] = data[target_columns]
-
-        logger.info("Separating train, val, test data and creating np arrays...")
-        train_reshaped = data_reshaped.loc[train.index]
-        val_reshaped = data_reshaped.loc[val.index]
-        test_reshaped = data_reshaped.loc[test.index]
-
-        x_train_reshaped = np.array(
-            train_reshaped["RECURRENT_FEATURES"].values.tolist()
-        )
-        y_train_reshaped = np.array(train_reshaped[target_columns].reset_index())
-        x_val_reshaped = np.array(val_reshaped["RECURRENT_FEATURES"].values.tolist())
-        y_val_reshaped = np.array(val_reshaped[target_columns].reset_index())
-        x_test_reshaped = np.array(test_reshaped["RECURRENT_FEATURES"].values.tolist())
-        y_test_reshaped = np.array(test_reshaped[target_columns].reset_index())
-
-        reshaped_data = {
-            "x_train_reshaped": x_train_reshaped,
-            "y_train_reshaped": y_train_reshaped,
-            "x_val_reshaped": x_val_reshaped,
-            "y_val_reshaped": y_val_reshaped,
-            "x_test_reshaped": x_test_reshaped,
-            "y_test_reshaped": y_test_reshaped,
-        }
-
-        return reshaped_data
-
-
 # utils
 # TODO : can we use this to select the ideal number of features ?
 def feature_selection_analysis(feature_selection_id: int, n_components: int = 5):
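The deleted PreprocessModel class appears to find a new home per the file list: this release adds lecrapaud/model_preprocessing.py (+295 lines, not shown in this diff). Its core scaling pattern, one shared scaler for X plus one persisted StandardScaler per regression target, looks roughly like the sketch below; the function name and paths are illustrative, not the new module's actual API.

import joblib
import pandas as pd
from sklearn.preprocessing import StandardScaler

def scale_train(df: pd.DataFrame, target_numbers, target_clf, experiment_dir):
    # One shared scaler for all feature columns
    X = df.loc[:, ~df.columns.str.contains("^TARGET_")]
    scaler_x = StandardScaler()
    X_scaled = pd.DataFrame(
        scaler_x.fit_transform(X), columns=X.columns, index=X.index
    )

    # One scaler per regression target, persisted next to that target's artifacts
    scalers_y = {}
    for n in (i for i in target_numbers if i not in target_clf):
        scaler_y = StandardScaler()
        y = df[[f"TARGET_{n}"]]
        scaled_y = pd.DataFrame(
            scaler_y.fit_transform(y.values), columns=y.columns, index=y.index
        )
        joblib.dump(scaler_y, f"{experiment_dir}/TARGET_{n}/scaler_y.pkl")
        scalers_y[f"scaler_y_{n}"] = scaler_y
        X_scaled[f"TARGET_{n}"] = scaled_y[f"TARGET_{n}"]
    return X_scaled, scaler_x, scalers_y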
lecrapaud/integrations/sentry_integration.py
ADDED
@@ -0,0 +1,46 @@
+import logging
+from importlib.metadata import version
+
+import sentry_sdk
+from sentry_sdk.integrations.logging import LoggingIntegration
+
+from lecrapaud.config import (
+    LOGGING_LEVEL,
+    PYTHON_ENV,
+    SENTRY_DSN,
+    SENTRY_PROFILES_SAMPLE_RATE,
+    SENTRY_TRACES_SAMPLE_RATE,
+)
+
+
+def _release_version():
+    try:
+        return f"lecrapaud@{version('lecrapaud')}"
+    except Exception:
+        return None
+
+
+def init_sentry():
+    """
+    Initialize Sentry if a DSN is configured.
+    Returns True when enabled, False otherwise.
+    """
+    if not SENTRY_DSN:
+        return False
+
+    sentry_logging = LoggingIntegration(
+        level=getattr(logging, LOGGING_LEVEL.upper(), logging.INFO),
+        event_level=logging.ERROR,
+    )
+
+    sentry_sdk.init(
+        dsn=SENTRY_DSN,
+        environment=PYTHON_ENV,
+        release=_release_version(),
+        integrations=[sentry_logging],
+        traces_sample_rate=SENTRY_TRACES_SAMPLE_RATE,
+        profiles_sample_rate=SENTRY_PROFILES_SAMPLE_RATE,
+        send_default_pii=False,
+    )
+
+    return True
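For callers, enabling this is a one-liner; a usage sketch follows. SENTRY_DSN and the sample rates are read from lecrapaud.config, so presumably they are set via the environment; that wiring is an assumption here.

# Opt-in at application startup; a no-op unless SENTRY_DSN is configured.
from lecrapaud.integrations.sentry_integration import init_sentry

if init_sentry():
    print("Sentry error reporting enabled")
else:
    print("SENTRY_DSN not set; running without Sentry")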
lecrapaud/misc/tabpfn_tests.ipynb
CHANGED
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": …,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
@@ -64,7 +64,7 @@
     "from sklearn.metrics import accuracy_score, roc_auc_score\n",
     "from sklearn.model_selection import train_test_split\n",
     "\n",
-    "from tabpfn import TabPFNClassifier\n",
+    "# from tabpfn import TabPFNClassifier\n",
     "\n",
     "# Load data\n",
     "X, y = load_breast_cancer(return_X_y=True)\n",