lecrapaud 0.4.0__py3-none-any.whl → 0.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of lecrapaud might be problematic.

Files changed (42)
  1. lecrapaud/__init__.py +1 -0
  2. lecrapaud/api.py +277 -0
  3. lecrapaud/config.py +10 -0
  4. lecrapaud/db/__init__.py +1 -0
  5. lecrapaud/db/alembic/env.py +2 -2
  6. lecrapaud/db/alembic/versions/2025_05_31_1834-52b809a34371_make_nullablee.py +24 -12
  7. lecrapaud/db/alembic/versions/2025_06_17_1652-c45f5e49fa2c_make_fields_nullable.py +89 -0
  8. lecrapaud/db/alembic.ini +116 -0
  9. lecrapaud/db/models/__init__.py +10 -10
  10. lecrapaud/db/models/base.py +176 -1
  11. lecrapaud/db/models/dataset.py +25 -20
  12. lecrapaud/db/models/feature.py +5 -6
  13. lecrapaud/db/models/feature_selection.py +3 -4
  14. lecrapaud/db/models/feature_selection_rank.py +3 -4
  15. lecrapaud/db/models/model.py +3 -4
  16. lecrapaud/db/models/model_selection.py +15 -8
  17. lecrapaud/db/models/model_training.py +15 -7
  18. lecrapaud/db/models/score.py +9 -6
  19. lecrapaud/db/models/target.py +16 -8
  20. lecrapaud/db/session.py +66 -0
  21. lecrapaud/experiment.py +64 -0
  22. lecrapaud/feature_engineering.py +747 -1022
  23. lecrapaud/feature_selection.py +915 -998
  24. lecrapaud/integrations/openai_integration.py +225 -0
  25. lecrapaud/jobs/__init__.py +2 -2
  26. lecrapaud/jobs/config.py +1 -1
  27. lecrapaud/jobs/scheduler.py +1 -1
  28. lecrapaud/jobs/tasks.py +6 -6
  29. lecrapaud/model_selection.py +1060 -960
  30. lecrapaud/search_space.py +4 -0
  31. lecrapaud/utils.py +2 -2
  32. lecrapaud-0.4.1.dist-info/METADATA +171 -0
  33. {lecrapaud-0.4.0.dist-info → lecrapaud-0.4.1.dist-info}/RECORD +36 -35
  34. {lecrapaud-0.4.0.dist-info → lecrapaud-0.4.1.dist-info}/WHEEL +1 -1
  35. lecrapaud/db/crud.py +0 -179
  36. lecrapaud/db/services.py +0 -0
  37. lecrapaud/db/setup.py +0 -58
  38. lecrapaud/predictions.py +0 -292
  39. lecrapaud/training.py +0 -151
  40. lecrapaud-0.4.0.dist-info/METADATA +0 -103
  41. /lecrapaud/{directory_management.py → directories.py} +0 -0
  42. {lecrapaud-0.4.0.dist-info → lecrapaud-0.4.1.dist-info}/LICENSE +0 -0
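The renames above (the internal src.* modules becoming the lecrapaud package, directory_management.py becoming directories.py, and db/setup.py giving way to db/session.py) are reflected in the import block of the hunk below. A minimal before/after sketch of that import migration, taken from the hunk itself; whether these modules are intended as public entry points is an assumption:

# 0.4.0 (old internal layout, removed in the hunk below)
# from src.directory_management import tmp_dir, clean_directory
# from src.utils import logger
# from src.config import PYTHON_ENV
# from src.db.setup import get_db

# 0.4.1 (new layout, added in the hunk below)
from lecrapaud.directories import tmp_dir, clean_directory
from lecrapaud.utils import logger
from lecrapaud.config import PYTHON_ENV
from lecrapaud.db import Dataset, Target, Feature, FeatureSelection, FeatureSelectionRank
from lecrapaud.db.session import get_db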
@@ -33,970 +33,1027 @@ from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
33
33
  from sklearn.model_selection import TimeSeriesSplit
34
34
  from sklearn.metrics import root_mean_squared_error, log_loss, make_scorer
35
35
  from mlxtend.feature_selection import SequentialFeatureSelector
36
- from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
37
- from sklearn.compose import ColumnTransformer
38
- import category_encoders as ce
39
- from scipy.stats import spearmanr, kendalltau
40
-
41
- # Scaling
42
36
  from sklearn.preprocessing import StandardScaler, MinMaxScaler
37
+ from scipy.stats import spearmanr, kendalltau
43
38
 
44
39
  # Internal
45
- from src.directory_management import tmp_dir, clean_directory
46
- from src.utils import logger
47
- from src.config import PYTHON_ENV
48
- from src.db.models import (
40
+ from lecrapaud.directories import tmp_dir, clean_directory
41
+ from lecrapaud.utils import logger
42
+ from lecrapaud.config import PYTHON_ENV
43
+ from lecrapaud.db import (
49
44
  Dataset,
50
45
  Target,
51
46
  Feature,
52
47
  FeatureSelection,
53
48
  FeatureSelectionRank,
54
49
  )
55
- from src.db.setup import get_db
56
-
57
- # Variables for targets handling
58
- TARGETS_NUMBER = range(1, 15)
59
- TARGETS_CLF = [2, 4, 6, 8, 9, 10, 11]
60
- TARGETS_MCLF = [11]
61
- GROUPING_COLUMN = "STOCK"
62
- DATE_COLUMN = "DATE"
50
+ from lecrapaud.db.session import get_db
51
+ from lecrapaud.search_space import all_models
63
52
 
64
53
  # Annoying Warnings
65
54
  warnings.filterwarnings("ignore", category=FutureWarning)
66
55
 
67
56
 
68
- def get_dataset_name(
69
- df, corr_threshold: int = 80, percentile: int = 20, max_features: int = 20
70
- ):
71
- number_of_groups = df[GROUPING_COLUMN].nunique()
72
-
73
- # Try to convert DATE column to datetime safely
74
- if pd.api.types.is_integer_dtype(df[DATE_COLUMN]):
75
- df_date = df[DATE_COLUMN].map(pd.Timestamp.fromordinal)
76
- else:
77
- df_date = pd.to_datetime(
78
- df[DATE_COLUMN], errors="coerce"
79
- ) # convert strings, datetime, etc.
80
-
81
- name = f"data_{number_of_groups}_{corr_threshold}_{percentile}_{max_features}_{df_date.min().date()}_{df_date.max().date()}"
82
- if PYTHON_ENV == "Test":
83
- name = f"test_{name}"
84
- return name
85
-
86
-
87
- def create_sets_from_data(
88
- df: pd.DataFrame,
89
- corr_threshold: int = 80,
90
- percentile: int = 20,
91
- max_features: int = 20,
92
- ):
93
-
94
- df.sort_values([DATE_COLUMN, GROUPING_COLUMN], inplace=True)
95
-
96
- # Drop non-useful column for training
97
- if "ISIN" in df.columns:
98
- df.drop(labels=["ISIN"], axis=1, inplace=True)
99
- if "SECURITY" in df.columns:
100
- df.drop(labels=["SECURITY"], axis=1, inplace=True)
57
+ def load_train_data(dataset_dir, target_number, target_type="regression"):
58
+ data_dir = f"{dataset_dir}/data"
101
59
 
102
- dates = df[DATE_COLUMN].unique()
60
+ logger.info("Loading data...")
61
+ train = joblib.load(f"{data_dir}/train.pkl")
62
+ val = joblib.load(f"{data_dir}/val.pkl")
63
+ test = joblib.load(f"{data_dir}/test.pkl")
64
+ try:
65
+ train_scaled = joblib.load(f"{data_dir}/train_scaled.pkl")
66
+ val_scaled = joblib.load(f"{data_dir}/val_scaled.pkl")
67
+ test_scaled = joblib.load(f"{data_dir}/test_scaled.pkl")
68
+ except FileNotFoundError:
69
+ train_scaled = None
70
+ val_scaled = None
71
+ test_scaled = None
72
+
73
+ return train, val, test, train_scaled, val_scaled, test_scaled
74
+
75
+
76
+ class FeatureSelectionEngine:
77
+ def __init__(self, train, dataset, target_number, target_clf, **kwargs):
78
+ self.dataset = dataset
79
+ self.train = train
80
+ self.target_number = target_number
81
+ self.target_clf = target_clf
82
+
83
+ self.target_type = (
84
+ "classification" if self.target_number in self.target_clf else "regression"
85
+ )
86
+ self.percentile = self.dataset.percentile
87
+ self.corr_threshold = self.dataset.corr_threshold
88
+ self.max_features = self.dataset.max_features
89
+
90
+ self.dataset_dir = self.dataset.path
91
+ self.dataset_id = self.dataset.id
92
+ self.data_dir = f"{self.dataset_dir}/data"
93
+ self.preprocessing_dir = f"{self.dataset_dir}/preprocessing"
94
+ self.fs_dir_target = (
95
+ f"{self.dataset_dir}/{f"TARGET_{self.target_number}"}/feature_selection"
96
+ )
97
+ os.makedirs(self.fs_dir_target, exist_ok=True)
98
+
99
+ # Main feature selection function
100
+ def run(
101
+ self,
102
+ single_process: bool = True,
103
+ ):
104
+ """Function to do feature selection with a range of different feature selection technics
105
+
106
+ Args:
107
+ - train (pd.DataFrame): a pandas train set
108
+ - target_number (in): a target, targets need to be name ``TARGET_{n}```
109
+ - single_process (bool): if True, run all feature selection methods in a single process. If False, run them in parallel.
110
+ """
111
+ target_number = self.target_number
112
+ target_type = self.target_type
113
+ fs_dir_target = self.fs_dir_target
114
+
115
+ # Create the feature selection in db
116
+ target = Target.find_by(name=f"TARGET_{target_number}")
117
+ percentile = self.percentile
118
+ corr_threshold = self.corr_threshold
119
+ max_features = self.max_features
120
+
121
+ feature_selection = FeatureSelection.upsert(
122
+ match_fields=["target_id", "dataset_id"],
123
+ target_id=target.id,
124
+ dataset_id=self.dataset_id,
125
+ )
103
126
 
104
- val_first_id = int(len(dates) * 0.6) + 1
105
- test_first_id = int(len(dates) * 0.8) + 1
127
+ if feature_selection.best_features_path:
128
+ return joblib.load(feature_selection.best_features_path)
106
129
 
107
- train = df[df[DATE_COLUMN].isin(dates[:val_first_id])]
108
- val = df[df[DATE_COLUMN].isin(dates[val_first_id:test_first_id])]
109
- test = df[df[DATE_COLUMN].isin(dates[test_first_id:])]
130
+ self.X = self.train.loc[:, ~self.train.columns.str.contains("^TARGET_")]
131
+ self.y = self.train[f"TARGET_{target_number}"]
110
132
 
111
- dates = {}
112
- dates["start_date"] = pd.to_datetime(df[DATE_COLUMN].iat[0])
113
- dates["end_date"] = pd.to_datetime(df[DATE_COLUMN].iat[-1])
114
- for name, data in zip(["train", "val", "test"], [train, val, test]):
115
- dates[f"{name}_start_date"] = pd.to_datetime(data[DATE_COLUMN].iat[0])
116
- dates[f"{name}_end_date"] = pd.to_datetime(data[DATE_COLUMN].iat[-1])
133
+ logger.info(f"Starting feature selection for TARGET_{target_number}...")
134
+ clean_directory(self.fs_dir_target)
117
135
 
118
- logger.info(
119
- f"{len(data['DATE'])} {name} data from {dates[f"{name}_start_date"].strftime('%d/%m/%Y')} to {dates[f"{name}_end_date"].strftime('%d/%m/%Y')}"
136
+ # Let's start by removing extremly correlated features
137
+ # This is needed to reduce nb of feature but also for methods such as anova or chi2 that requires independent features
138
+ # TODO: we could also remove low variance features
139
+ features_uncorrelated, features_correlated = self.remove_correlated_features(
140
+ 90, vizualize=False
120
141
  )
121
-
122
- datasets = {}
123
-
124
- with get_db() as db:
125
- all_targets = Target.get_all(db=db)
126
- matched_targets = [
127
- target for target in all_targets if target.name in train.columns
128
- ]
129
- dataset_name = get_dataset_name(train, corr_threshold, percentile, max_features)
130
- dataset_dir = f"{tmp_dir}/{dataset_name}"
131
- preprocessing_dir = f"{dataset_dir}/preprocessing"
132
- train_data_dir = f"{dataset_dir}/data"
133
- os.makedirs(dataset_dir, exist_ok=True)
134
- os.makedirs(preprocessing_dir, exist_ok=True)
135
- os.makedirs(train_data_dir, exist_ok=True)
136
-
137
- dataset = datasets[name] = Dataset.upsert(
138
- match_fields=["name"],
139
- db=db,
140
- name=dataset_name,
141
- path=Path(dataset_dir).resolve(),
142
- type="training",
143
- size=df.shape[0],
144
- train_size=train.shape[0],
145
- val_size=val.shape[0],
146
- test_size=test.shape[0],
147
- number_of_groups=data[GROUPING_COLUMN].nunique(),
148
- list_of_groups=data[GROUPING_COLUMN].unique().tolist(),
149
- corr_threshold=corr_threshold,
150
- percentile=percentile,
151
- max_features=max_features,
152
- **dates,
153
- targets=matched_targets,
142
+ self.X = self.X[features_uncorrelated]
143
+
144
+ logger.debug(
145
+ f"""
146
+ \nWe first have removed {len(features_correlated)} features with correlation greater than 90%
147
+ \nWe are looking to capture {percentile}% of {len(self.X.columns)} features, i.e. {int(len(self.X.columns)*percentile/100)} features, with different feature selection methods
148
+ \nWe will then remove above {corr_threshold}% correlated features, keeping the one with the best ranks
149
+ \nFinally, we will keep only the {max_features} best ranked features
150
+ """
154
151
  )
155
152
 
156
- # encode categoricals
157
- train = encode_categorical_features(train, fit=True, save_dir=preprocessing_dir)
158
- val = encode_categorical_features(val, save_dir=preprocessing_dir)
159
- test = encode_categorical_features(test, save_dir=preprocessing_dir)
160
-
161
- # save the full data
162
- if PYTHON_ENV != "Test":
163
- joblib.dump(df, f"{train_data_dir}/full.pkl")
153
+ start = time.time()
164
154
 
165
- return train, val, test, dataset
155
+ # handling categorical features (only if classification)
156
+ self.X_categorical, self.X_numerical = get_features_by_types(self.X)
166
157
 
158
+ if target_type == "classification" and self.X_categorical.shape[1] > 0:
159
+ feat_scores = self.select_categorical_features(
160
+ percentile=percentile, save_dir=fs_dir_target
161
+ )
162
+ with get_db() as db:
163
+ for row in feat_scores.itertuples(index=False):
164
+ feature = Feature.find_by(name=row.features, db=db)
165
+ FeatureSelectionRank.upsert(
166
+ ["feature_selection_id", "feature_id", "method"],
167
+ db=db,
168
+ score=row.score,
169
+ pvalue=row.pvalue,
170
+ support=row.support,
171
+ rank=row.rank,
172
+ method=row.method,
173
+ training_time=row.training_time,
174
+ feature_selection_id=feature_selection.id,
175
+ feature_id=feature.id,
176
+ )
177
+ categorical_features_selected = feat_scores[feat_scores["support"]][
178
+ "features"
179
+ ].values.tolist()
180
+
181
+ results = []
182
+ params = {"percentile": percentile, "save_dir": fs_dir_target}
183
+ if single_process:
184
+ results = [
185
+ self.select_feature_by_linear_correlation(**params),
186
+ self.select_feature_by_nonlinear_correlation(**params),
187
+ self.select_feature_by_mi(**params),
188
+ self.select_feature_by_feat_imp(**params),
189
+ self.select_feature_by_rfe(**params),
190
+ # self.select_feature_by_sfs(
191
+ # **params
192
+ # ), # TODO: this is taking too long
193
+ ]
194
+ else:
195
+ # Use ProcessPoolExecutor to run tasks in parallel
196
+ # TODO: not sure it's efficient from previous tests... especially because rfe and sfs methods are doing parallel processing already, this can create overhead
197
+ with ProcessPoolExecutor() as executor:
198
+ # Submit different functions to be executed in parallel
199
+ futures = [
200
+ executor.submit(
201
+ self.select_feature_by_linear_correlation,
202
+ **params,
203
+ ),
204
+ executor.submit(
205
+ self.select_feature_by_nonlinear_correlation,
206
+ **params,
207
+ ),
208
+ executor.submit(
209
+ self.select_feature_by_mi,
210
+ **params,
211
+ ),
212
+ executor.submit(
213
+ self.select_feature_by_feat_imp,
214
+ **params,
215
+ ),
216
+ executor.submit(
217
+ self.select_feature_by_rfe,
218
+ **params,
219
+ ),
220
+ # executor.submit(
221
+ # self.select_feature_by_sfs,
222
+ # **params,
223
+ # ), # TODO: this is taking too long
224
+ ]
225
+
226
+ # Wait for all futures to complete and gather the results
227
+ with tqdm(total=len(futures)) as pbar:
228
+ for future in as_completed(futures):
229
+ results.append(future.result())
230
+ pbar.update(1)
231
+
232
+ logger.info(f"Finished feature selection for target {target_number}")
233
+
234
+ stop = time.time()
235
+
236
+ # Once all tasks are completed, start by inserting results to db
237
+ feat_scores = pd.concat(
238
+ results,
239
+ axis=0,
240
+ )
167
241
 
168
- def encode_categorical_features(df: pd.DataFrame, save_dir: str, fit: bool = False):
242
+ logger.info("Inserting feature selection results to db...")
243
+ rows = []
244
+ with get_db() as db:
245
+ feature_map = {f.name: f.id for f in Feature.get_all(db=db, limit=20000)}
246
+ for row in feat_scores.itertuples(index=False):
247
+ feature_id = feature_map.get(row.features)
248
+ if not feature_id:
249
+ continue # or raise if feature must exist
250
+
251
+ rows.append(
252
+ {
253
+ "feature_selection_id": feature_selection.id,
254
+ "feature_id": feature_id,
255
+ "method": row.method,
256
+ "score": row.score,
257
+ "pvalue": None if pd.isna(row.pvalue) else row.pvalue,
258
+ "support": row.support,
259
+ "rank": row.rank,
260
+ "training_time": row.training_time,
261
+ }
262
+ )
169
263
 
170
- X = df.loc[:, ~df.columns.str.contains("^TARGET_")]
171
- y = df.loc[:, df.columns.str.contains("^TARGET_")]
264
+ if len(rows) == 0:
265
+ raise ValueError(f"No features selected for TARGET_{target_number}")
172
266
 
173
- # 1. Timestamps for 'DATE'
174
- X.loc[:, DATE_COLUMN] = pd.to_datetime(X[DATE_COLUMN]).map(pd.Timestamp.toordinal)
267
+ FeatureSelectionRank.bulk_upsert(rows=rows, db=db)
175
268
 
176
- if fit:
177
- # Define columns for ordinal and binary encoding (we should have all possible values in training set, unless we accept unknown values processing)
178
- ordinal_encoding_features = ["STOCK"]
269
+ # Merge the results
270
+ logger.info("Merging feature selection methods...")
271
+ features_selected = feat_scores[feat_scores["support"]][["features", "rank"]]
272
+ features_selected.sort_values("rank", inplace=True)
273
+ features_selected.drop_duplicates("features", inplace=True)
179
274
 
180
- binary_encoding_features = ["SECTOR", "SUBINDUSTRY", "LOCATION"]
275
+ features_selected_list = features_selected["features"].values.tolist()
181
276
 
182
- # Fit and save the ColumnTransformer with OrdinalEncoder and OneHotEncoder
183
- column_transformer = ColumnTransformer(
184
- transformers=[
185
- (
186
- "ordinal",
187
- OrdinalEncoder(
188
- handle_unknown="use_encoded_value",
189
- unknown_value=-1, # rows with unseen STOCK values will be encoded as -1
190
- ),
191
- ordinal_encoding_features,
192
- ),
193
- (
194
- "binary_encoder",
195
- ce.BinaryEncoder(
196
- handle_unknown="value",
197
- ), # rows with unseen values will be encoded as all-zeros in the binary columns
198
- binary_encoding_features,
199
- ),
200
- ],
201
- remainder="passthrough", # Keep the non-encoded columns like 'DATE'
277
+ # analysis 1
278
+ features_selected_by_every_methods = set(results[0]["features"].values.tolist())
279
+ for df in results[1:]:
280
+ features_selected_by_every_methods &= set(
281
+ df["features"].values.tolist()
282
+ ) # intersection
283
+ features_selected_by_every_methods = list(features_selected_by_every_methods)
284
+ logger.debug(
285
+ f"We selected {len(features_selected_list)} features and {len(features_selected_by_every_methods)} were selected unanimously:"
286
+ )
287
+ logger.debug(features_selected_by_every_methods)
288
+ pd.Series(features_selected_list).to_csv(
289
+ f"{fs_dir_target}/features_before_corr.csv",
290
+ index=True,
291
+ header=True,
292
+ index_label="ID",
202
293
  )
203
- transformed_data = column_transformer.fit_transform(X)
204
- if PYTHON_ENV != "Test":
205
- joblib.dump(column_transformer, f"{save_dir}/column_transformer.pkl")
206
- else:
207
- # Load the ColumnTransformer and apply it
208
- column_transformer = joblib.load(f"{save_dir}/column_transformer.pkl")
209
-
210
- transformed_data = column_transformer.transform(X)
211
-
212
- # Convert to DataFrame for readability and return
213
- transformed_X = pd.DataFrame(
214
- transformed_data,
215
- columns=[
216
- feature.split("__")[1]
217
- for feature in column_transformer.get_feature_names_out()
218
- ],
219
- index=X.index,
220
- )
221
- transformed_X = transformed_X.apply(pd.to_numeric)
222
- for col in [
223
- feature.split("__")[1]
224
- for feature in column_transformer.get_feature_names_out()
225
- if "remainder" not in feature
226
- ] + [DATE_COLUMN]:
227
- transformed_X[col] = transformed_X[col].astype(int)
228
-
229
- # Insert features in db
230
- if fit:
231
- # TODO: in bulk
232
- for feature in transformed_X.columns:
233
- dtype = transformed_X[feature].dtype
234
- if pd.api.types.is_integer_dtype(dtype):
235
- feature_type = "categorical"
236
- elif pd.api.types.is_float_dtype(dtype):
237
- feature_type = "numerical"
238
- else:
239
- feature_type = "other"
240
- Feature.upsert(match_fields=["name"], name=feature, type=feature_type)
241
- for target in y.columns:
242
- type = (
243
- "classification"
244
- if int(target.split("_")[1]) in TARGETS_CLF
245
- else "regression"
246
- )
247
- # TODO: what about description here ?
248
- Target.upsert(match_fields=["name", "type"], name=target, type=type)
249
-
250
- return pd.concat([transformed_X, y], axis=1)
251
294
 
295
+ # removing correlated features
296
+ self.X = self.X[features_selected_list]
297
+ features, features_correlated = self.remove_correlated_features(corr_threshold)
298
+ pd.Series(features).to_csv(
299
+ f"{fs_dir_target}/features_before_max.csv",
300
+ index=True,
301
+ header=True,
302
+ index_label="ID",
303
+ )
304
+ features = features[:max_features]
252
305
 
253
- # only work with all features from feat eng in the right order (unused for now)
254
- def decode_categorical_features(df: pd.DataFrame, save_dir: str):
255
- X = df.loc[:, ~df.columns.str.contains("^TARGET_")]
256
- y = df.loc[:, df.columns.str.contains("^TARGET_")]
257
- index = X.index
258
- original_dtypes = X.dtypes.to_dict()
306
+ # adding categorical features selected
307
+ features += (
308
+ categorical_features_selected if target_type == "classification" else []
309
+ )
310
+ logger.debug(
311
+ f"Final pre-selection: {len(features)} features below {corr_threshold}% out of {len(features_selected_list)} features, and rejected {len(features_correlated)} features, {100*len(features)/len(features_selected_list):.2f}% features selected"
312
+ )
259
313
 
260
- column_transformer = joblib.load(f"{save_dir}/column_transformer.pkl")
314
+ # analysis 2
315
+ features_selected_by_every_methods_uncorrelated = list(
316
+ set(features) & set(features_selected_by_every_methods)
317
+ )
318
+ logger.debug(
319
+ f"In this pre-selection, there is {len(features_selected_by_every_methods_uncorrelated)} features from the {len(features_selected_by_every_methods)} selected unanimously\n"
320
+ )
321
+ logger.debug(
322
+ features_selected[
323
+ features_selected["features"].isin(features)
324
+ ].to_markdown()
325
+ )
261
326
 
262
- X = X.to_numpy()
263
- arrays = []
264
- for name, indices in column_transformer.output_indices_.items():
265
- transformer = column_transformer.named_transformers_.get(name, None)
266
- arr = X[:, indices.start : indices.stop]
327
+ # save to path
328
+ best_features_path = Path(
329
+ f"{self.preprocessing_dir}/features_{target_number}.pkl"
330
+ ).resolve()
331
+ joblib.dump(features, best_features_path)
267
332
 
268
- if transformer in (None, "passthrough", "drop"):
269
- pass
333
+ # save in db
334
+ db_features = Feature.filter(name__in=features)
335
+ # Order matters: to keep the same order in db as in features, map features by name
336
+ feature_by_name = {f.name: f for f in db_features}
337
+ # Reorder them according to original `features` list
338
+ ordered_db_features = [
339
+ feature_by_name[name] for name in features if name in feature_by_name
340
+ ]
270
341
 
271
- else:
272
- arr = transformer.inverse_transform(arr)
342
+ feature_selection = FeatureSelection.get(feature_selection.id)
343
+ feature_selection = feature_selection.add_features(ordered_db_features)
344
+ feature_selection.training_time = stop - start
345
+ feature_selection.best_features_path = best_features_path
346
+ feature_selection.save()
273
347
 
274
- arrays.append(arr)
348
+ return features
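The run() method above caches its result: when feature_selection.best_features_path is already set it returns the joblib-dumped feature list, otherwise it ranks, persists, and returns the selected features. A hypothetical invocation sketch, assuming this file is lecrapaud/feature_selection.py and that a Dataset row with the relevant attributes (path, percentile, corr_threshold, max_features) already exists; the IDs and target lists are placeholders, not taken from the package docs:

# Hypothetical usage sketch (module path, IDs and target lists are assumptions)
from lecrapaud.db import Dataset
from lecrapaud.feature_selection import FeatureSelectionEngine, load_train_data

dataset = Dataset.get(1)  # existing dataset row with path/percentile/corr_threshold/max_features
train, val, test, *_ = load_train_data(dataset.path, target_number=1)
engine = FeatureSelectionEngine(train=train, dataset=dataset, target_number=1, target_clf=[2, 4])
best_features = engine.run(single_process=True)  # selected feature names, also cached on disk and in db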
275
349
 
276
- retarr = np.concatenate(arrays, axis=1)
350
+ # Remove correlation
351
+ # ------------------
277
352
 
278
- columns_ordinal = [
279
- feature.split("__")[1]
280
- for feature in column_transformer.get_feature_names_out()
281
- if feature.split("__")[0] == "ordinal"
282
- ]
283
- columns_binary_encoder = [
284
- feature.split("__")[1]
285
- for feature in column_transformer.get_feature_names_out()
286
- if feature.split("__")[0] == "binary_encoder"
287
- ]
288
- # Remove trailing "_number" using regex
289
- columns_binary_encoder = {
290
- re.sub(r"_\d+$", "", col) for col in columns_binary_encoder
291
- }
292
- columns_binary_encoder = list(columns_binary_encoder)
293
-
294
- columns_remainder = [
295
- feature.split("__")[1]
296
- for feature in column_transformer.get_feature_names_out()
297
- if feature.split("__")[0] == "remainder"
298
- ]
299
- columns = columns_ordinal + columns_binary_encoder + columns_remainder
300
- decoded_X = pd.DataFrame(
301
- retarr,
302
- columns=columns,
303
- index=index,
304
- )
353
+ def remove_correlated_features(self, corr_threshold: int, vizualize: bool = False):
354
+ X = self.X
355
+ features = X.columns
356
+ # Create correlation matrix, select upper triangle & remove features with correlation greater than threshold
357
+ corr_matrix = X[features].corr().abs()
305
358
 
306
- for col in decoded_X.columns:
307
- if col in columns_ordinal or col in columns_binary_encoder:
308
- decoded_X[col] = decoded_X[col].astype(str)
309
- elif col in original_dtypes:
310
- decoded_X[col] = decoded_X[col].astype(original_dtypes[col])
311
-
312
- # revert timestamps to dates
313
- decoded_X.loc[:, DATE_COLUMN] = decoded_X[DATE_COLUMN].map(pd.Timestamp.fromordinal)
314
-
315
- return pd.concat([decoded_X, y], axis=1)
316
-
317
-
318
- # Filter methods
319
- # ----------------
320
-
321
-
322
- # Linear correlation (Person's R for regression and ANOVA for classification)
323
- def select_feature_by_linear_correlation(
324
- X, y, target_type, percentile: int = 20, save_dir: Optional[str] = None
325
- ):
326
- start = time.time()
327
- test_type = "Person’s R" if target_type == "regression" else "ANOVA"
328
- logger.debug(f"Running {test_type}...")
329
-
330
- model = f_regression if target_type == "regression" else f_classif
331
- feat_selector = SelectPercentile(model, percentile=percentile).fit(X, y)
332
- feat_scores = pd.DataFrame()
333
- feat_scores["score"] = feat_selector.scores_
334
- feat_scores["pvalue"] = feat_selector.pvalues_
335
- feat_scores["support"] = feat_selector.get_support()
336
- feat_scores["features"] = X.columns
337
- feat_scores["rank"] = feat_scores["score"].rank(method="first", ascending=False)
338
- feat_scores["method"] = test_type
339
- feat_scores.sort_values("rank", ascending=True, inplace=True)
340
- stop = time.time()
341
- training_time = timedelta(seconds=(stop - start)).total_seconds()
342
- feat_scores["training_time"] = training_time
343
-
344
- logger.debug(
345
- f"{test_type} evaluation selected {feat_scores['support'].sum()} features in {training_time:.2f} seconds"
346
- )
359
+ upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
360
+ features_uncorrelated = [
361
+ column
362
+ for column in upper.columns
363
+ if all(upper[column].dropna() <= corr_threshold / 100)
364
+ ]
365
+ features_correlated = [
366
+ column
367
+ for column in upper.columns
368
+ if any(upper[column] > corr_threshold / 100)
369
+ ]
347
370
 
348
- feat_scores.to_csv(
349
- f"{save_dir}/{test_type}.csv",
350
- index=True,
351
- header=True,
352
- index_label="ID",
353
- )
371
+ if vizualize:
372
+ features_selected_visualization = (
373
+ X[features]
374
+ .corr()
375
+ .where(np.triu(np.ones(len(features)), k=1).astype(bool))
376
+ .fillna(0)
377
+ )
378
+ # Plot the heatmap
379
+ plt.figure(figsize=(10, 8))
380
+ sns.heatmap(
381
+ corr_matrix,
382
+ annot=True,
383
+ cmap="coolwarm",
384
+ center=0,
385
+ linewidths=1,
386
+ linecolor="black",
387
+ )
388
+ plt.title(f"Correlation Matrix")
389
+ plt.show()
390
+
391
+ logger.info(f"\n{features_selected_visualization.describe().to_string()}")
392
+ logger.info(f"\n{features_selected_visualization.to_string()}")
393
+ return features_uncorrelated, features_correlated
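remove_correlated_features above keeps only the upper triangle of the absolute correlation matrix, so for each pair of columns whose correlation exceeds the threshold, the later column is dropped. A standalone sketch of the same filter on toy data (the frame and column names are made up for illustration):

# Minimal sketch of the upper-triangle correlation filter used above (toy data, hypothetical names)
import numpy as np
import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3, 4], "b": [2, 4, 6, 8], "c": [4, 1, 3, 2]})  # "b" duplicates "a"
corr = df.corr().abs()
upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
threshold = 0.9
kept = [c for c in upper.columns if all(upper[c].dropna() <= threshold)]
dropped = [c for c in upper.columns if any(upper[c] > threshold)]
# kept == ["a", "c"], dropped == ["b"]: the later of two highly correlated columns is removed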
394
+
395
+ # Filter methods
396
+ # ----------------
397
+
398
+ def select_categorical_features(self, percentile, save_dir: Optional[str] = None):
399
+ X, y = self.X_categorical, self.y
400
+
401
+ start = time.time()
402
+ logger.debug("Running Chi2 for categorical features...")
403
+ feat_selector = SelectPercentile(chi2, percentile=percentile).fit(X, y)
404
+ feat_scores = pd.DataFrame()
405
+ feat_scores["score"] = feat_selector.scores_
406
+ feat_scores["pvalue"] = feat_selector.pvalues_
407
+ feat_scores["support"] = feat_selector.get_support()
408
+ feat_scores["features"] = X.columns
409
+ feat_scores["rank"] = feat_scores["score"].rank(method="first", ascending=False)
410
+ feat_scores["method"] = "Chi2"
411
+ feat_scores.sort_values("rank", ascending=True, inplace=True)
412
+ stop = time.time()
413
+ training_time = timedelta(seconds=(stop - start)).total_seconds()
414
+ feat_scores["training_time"] = training_time
415
+
416
+ logger.debug(
417
+ f"Chi2 evaluation selected {feat_scores['support'].sum()} features in {training_time:.2f} seconds"
418
+ )
354
419
 
355
- return feat_scores
420
+ feat_scores.to_csv(
421
+ f"{save_dir}/Chi2.csv", index=True, header=True, index_label="ID"
422
+ )
356
423
 
424
+ return feat_scores
425
+
426
+ # Linear correlation (Pearson's R for regression and ANOVA for classification)
427
+ def select_feature_by_linear_correlation(
428
+ self, percentile: int = 20, save_dir: Optional[str] = None
429
+ ):
430
+ X, y, target_type = self.X_numerical, self.y, self.target_type
431
+
432
+ start = time.time()
433
+ test_type = "Person's R" if target_type == "regression" else "ANOVA"
434
+ logger.debug(f"Running {test_type}...")
435
+
436
+ model = f_regression if target_type == "regression" else f_classif
437
+ feat_selector = SelectPercentile(model, percentile=percentile).fit(X, y)
438
+ feat_scores = pd.DataFrame()
439
+ feat_scores["score"] = feat_selector.scores_
440
+ feat_scores["pvalue"] = feat_selector.pvalues_
441
+ feat_scores["support"] = feat_selector.get_support()
442
+ feat_scores["features"] = X.columns
443
+ feat_scores["rank"] = feat_scores["score"].rank(method="first", ascending=False)
444
+ feat_scores["method"] = test_type
445
+ feat_scores.sort_values("rank", ascending=True, inplace=True)
446
+ stop = time.time()
447
+ training_time = timedelta(seconds=(stop - start)).total_seconds()
448
+ feat_scores["training_time"] = training_time
449
+
450
+ logger.debug(
451
+ f"{test_type} evaluation selected {feat_scores['support'].sum()} features in {training_time:.2f} seconds"
452
+ )
357
453
 
358
- # Non-Linear correlation (Spearsman's R for regression and Kendall’s Tau for classification)
359
- def select_feature_by_nonlinear_correlation(
360
- X, y, target_type, percentile: int = 20, save_dir: Optional[str] = None
361
- ):
362
- start = time.time()
454
+ feat_scores.to_csv(
455
+ f"{save_dir}/{test_type}.csv",
456
+ index=True,
457
+ header=True,
458
+ index_label="ID",
459
+ )
363
460
 
364
- def model(X_model, y_model):
365
- X_model = pd.DataFrame(X_model)
366
- y_model = pd.Series(y_model)
461
+ return feat_scores
367
462
 
368
- method = "spearman" if target_type == "regression" else "kendall"
463
+ # Non-linear correlation (Spearman's R for regression and Kendall's Tau for classification)
464
+ def select_feature_by_nonlinear_correlation(
465
+ self, percentile: int = 20, save_dir: Optional[str] = None
466
+ ):
467
+ X, y, target_type = self.X_numerical, self.y, self.target_type
369
468
 
370
- corr_scores = []
371
- p_values = []
469
+ start = time.time()
372
470
 
373
- for col in X_model.columns:
374
- if method == "spearman":
375
- corr, pval = spearmanr(X_model[col], y_model)
376
- else: # Kendall's Tau for classification
377
- corr, pval = kendalltau(X_model[col], y_model)
471
+ def model(X_model, y_model):
472
+ X_model = pd.DataFrame(X_model)
473
+ y_model = pd.Series(y_model)
378
474
 
379
- corr_scores.append(abs(corr)) # Keeping absolute correlation
380
- p_values.append(pval)
475
+ method = "spearman" if target_type == "regression" else "kendall"
381
476
 
382
- return np.array(corr_scores), np.array(p_values)
477
+ corr_scores = []
478
+ p_values = []
383
479
 
384
- test_type = "Spearman’s R" if target_type == "regression" else "Kendall’s Tau"
385
- logger.debug(f"Running {test_type}...")
480
+ for col in X_model.columns:
481
+ if method == "spearman":
482
+ corr, pval = spearmanr(X_model[col], y_model)
483
+ else: # Kendall's Tau for classification
484
+ corr, pval = kendalltau(X_model[col], y_model)
386
485
 
387
- feat_selector = SelectPercentile(model, percentile=percentile).fit(X, y)
388
- feat_scores = pd.DataFrame()
389
- feat_scores["score"] = feat_selector.scores_
390
- feat_scores["pvalue"] = feat_selector.pvalues_
391
- feat_scores["support"] = feat_selector.get_support()
392
- feat_scores["features"] = X.columns
393
- feat_scores["rank"] = feat_scores["score"].rank(method="first", ascending=False)
394
- feat_scores["method"] = test_type
395
- feat_scores.sort_values("rank", ascending=True, inplace=True)
396
- stop = time.time()
397
- training_time = timedelta(seconds=(stop - start)).total_seconds()
398
- feat_scores["training_time"] = training_time
486
+ corr_scores.append(abs(corr)) # Keeping absolute correlation
487
+ p_values.append(pval)
399
488
 
400
- logger.debug(
401
- f"{test_type} evaluation selected {feat_scores['support'].sum()} features in {training_time:.2f} seconds"
402
- )
489
+ return np.array(corr_scores), np.array(p_values)
403
490
 
404
- feat_scores.to_csv(
405
- f"{save_dir}/{test_type}.csv",
406
- index=True,
407
- header=True,
408
- index_label="ID",
409
- )
491
+ test_type = "Spearman's R" if target_type == "regression" else "Kendall's Tau"
492
+ logger.debug(f"Running {test_type}...")
410
493
 
411
- return feat_scores
494
+ feat_selector = SelectPercentile(model, percentile=percentile).fit(X, y)
495
+ feat_scores = pd.DataFrame()
496
+ feat_scores["score"] = feat_selector.scores_
497
+ feat_scores["pvalue"] = feat_selector.pvalues_
498
+ feat_scores["support"] = feat_selector.get_support()
499
+ feat_scores["features"] = X.columns
500
+ feat_scores["rank"] = feat_scores["score"].rank(method="first", ascending=False)
501
+ feat_scores["method"] = test_type
502
+ feat_scores.sort_values("rank", ascending=True, inplace=True)
503
+ stop = time.time()
504
+ training_time = timedelta(seconds=(stop - start)).total_seconds()
505
+ feat_scores["training_time"] = training_time
412
506
 
507
+ logger.debug(
508
+ f"{test_type} evaluation selected {feat_scores['support'].sum()} features in {training_time:.2f} seconds"
509
+ )
413
510
 
414
- # Mutual Information
415
- def select_feature_by_mi(
416
- X, y, target_type, percentile: int = 20, save_dir: Optional[str] = None
417
- ):
418
- start = time.time()
419
- logger.debug("Running Mutual Information...")
420
- model = (
421
- mutual_info_regression if target_type == "regression" else mutual_info_classif
422
- )
423
- feat_selector = SelectPercentile(model, percentile=percentile).fit(X, y)
424
- feat_scores = pd.DataFrame()
425
- feat_scores["score"] = feat_selector.scores_
426
- feat_scores["support"] = feat_selector.get_support()
427
- feat_scores["features"] = X.columns
428
- feat_scores["rank"] = feat_scores["score"].rank(method="first", ascending=False)
429
- feat_scores["method"] = "Mutual Information"
430
- feat_scores.sort_values("rank", ascending=True, inplace=True)
431
- stop = time.time()
432
- training_time = timedelta(seconds=(stop - start)).total_seconds()
433
- feat_scores["training_time"] = training_time
434
-
435
- logger.debug(
436
- f"MI evaluation selected {feat_scores['support'].sum()} features in {training_time:.2f} seconds"
437
- )
511
+ feat_scores.to_csv(
512
+ f"{save_dir}/{test_type}.csv",
513
+ index=True,
514
+ header=True,
515
+ index_label="ID",
516
+ )
438
517
 
439
- feat_scores.to_csv(f"{save_dir}/MI.csv", index=True, header=True, index_label="ID")
440
-
441
- return feat_scores
442
-
443
-
444
- def select_categorical_features(X, y, percentile, save_dir: Optional[str] = None):
445
- start = time.time()
446
- logger.debug("Running Chi2 for categorical features...")
447
- feat_selector = SelectPercentile(chi2, percentile=percentile).fit(X, y)
448
- feat_scores = pd.DataFrame()
449
- feat_scores["score"] = feat_selector.scores_
450
- feat_scores["pvalue"] = feat_selector.pvalues_
451
- feat_scores["support"] = feat_selector.get_support()
452
- feat_scores["features"] = X.columns
453
- feat_scores["rank"] = feat_scores["score"].rank(method="first", ascending=False)
454
- feat_scores["method"] = "Chi2"
455
- feat_scores.sort_values("rank", ascending=True, inplace=True)
456
- stop = time.time()
457
- training_time = timedelta(seconds=(stop - start)).total_seconds()
458
- feat_scores["training_time"] = training_time
459
-
460
- logger.debug(
461
- f"Chi2 evaluation selected {feat_scores['support'].sum()} features in {training_time:.2f} seconds"
462
- )
518
+ return feat_scores
463
519
 
464
- feat_scores.to_csv(
465
- f"{save_dir}/Chi2.csv", index=True, header=True, index_label="ID"
466
- )
520
+ # Mutual Information
521
+ def select_feature_by_mi(
522
+ self, percentile: int = 20, save_dir: Optional[str] = None
523
+ ):
524
+ X, y, target_type = self.X_numerical, self.y, self.target_type
467
525
 
468
- return feat_scores
526
+ start = time.time()
527
+ logger.debug("Running Mutual Information...")
528
+ model = (
529
+ mutual_info_regression
530
+ if target_type == "regression"
531
+ else mutual_info_classif
532
+ )
533
+ feat_selector = SelectPercentile(model, percentile=percentile).fit(X, y)
534
+ feat_scores = pd.DataFrame()
535
+ feat_scores["score"] = feat_selector.scores_
536
+ feat_scores["support"] = feat_selector.get_support()
537
+ feat_scores["features"] = X.columns
538
+ feat_scores["rank"] = feat_scores["score"].rank(method="first", ascending=False)
539
+ feat_scores["method"] = "Mutual Information"
540
+ feat_scores.sort_values("rank", ascending=True, inplace=True)
541
+ stop = time.time()
542
+ training_time = timedelta(seconds=(stop - start)).total_seconds()
543
+ feat_scores["training_time"] = training_time
544
+
545
+ logger.debug(
546
+ f"MI evaluation selected {feat_scores['support'].sum()} features in {training_time:.2f} seconds"
547
+ )
469
548
 
549
+ feat_scores.to_csv(
550
+ f"{save_dir}/MI.csv", index=True, header=True, index_label="ID"
551
+ )
470
552
 
471
- # Intrisic/embeedded method
472
- # ----------------
553
+ return feat_scores
473
554
 
555
+ # Intrinsic/embedded method
556
+ # ----------------
474
557
 
475
- # feature importance
476
- def select_feature_by_feat_imp(
477
- X, y, target_type, percentile: int = 20, save_dir: Optional[str] = None
478
- ):
479
- start = time.time()
480
- logger.debug("Running Feature importance...")
558
+ # feature importance
559
+ def select_feature_by_feat_imp(
560
+ self, percentile: int = 20, save_dir: Optional[str] = None
561
+ ):
562
+ X, y, target_type = self.X_numerical, self.y, self.target_type
481
563
 
482
- params = {"n_estimators": 500, "max_depth": 2**3, "random_state": 42, "n_jobs": -1}
564
+ start = time.time()
565
+ logger.debug("Running Feature importance...")
483
566
 
484
- estimator = (
485
- RandomForestClassifier(**params)
486
- if target_type == "classification"
487
- else RandomForestRegressor(**params)
488
- )
567
+ params = {
568
+ "n_estimators": 500,
569
+ "max_depth": 2**3,
570
+ "random_state": 42,
571
+ "n_jobs": -1,
572
+ }
489
573
 
490
- feat_selector = SelectFromModel(
491
- estimator=estimator,
492
- threshold=-np.inf,
493
- max_features=int(percentile * X.shape[1] / 100),
494
- ).fit(X, y)
495
-
496
- feat_scores = pd.DataFrame()
497
- feat_scores["score"] = feat_selector.estimator_.feature_importances_
498
- feat_scores["support"] = feat_selector.get_support()
499
- feat_scores["features"] = X.columns
500
- feat_scores["rank"] = feat_scores["score"].rank(method="first", ascending=False)
501
- feat_scores["method"] = "FI"
502
- feat_scores.sort_values("rank", ascending=True, inplace=True)
503
-
504
- stop = time.time()
505
- training_time = timedelta(seconds=(stop - start)).total_seconds()
506
- feat_scores["training_time"] = training_time
507
-
508
- logger.debug(
509
- f"Feat importance evaluation selected {feat_scores['support'].sum()} features in {training_time:.2f} seconds"
510
- )
574
+ estimator = (
575
+ RandomForestClassifier(**params)
576
+ if target_type == "classification"
577
+ else RandomForestRegressor(**params)
578
+ )
511
579
 
512
- feat_scores.to_csv(f"{save_dir}/FI.csv", index=True, header=True, index_label="ID")
580
+ feat_selector = SelectFromModel(
581
+ estimator=estimator,
582
+ threshold=-np.inf,
583
+ max_features=int(percentile * X.shape[1] / 100),
584
+ ).fit(X, y)
585
+
586
+ feat_scores = pd.DataFrame()
587
+ feat_scores["score"] = feat_selector.estimator_.feature_importances_
588
+ feat_scores["support"] = feat_selector.get_support()
589
+ feat_scores["features"] = X.columns
590
+ feat_scores["rank"] = feat_scores["score"].rank(method="first", ascending=False)
591
+ feat_scores["method"] = "FI"
592
+ feat_scores.sort_values("rank", ascending=True, inplace=True)
593
+
594
+ stop = time.time()
595
+ training_time = timedelta(seconds=(stop - start)).total_seconds()
596
+ feat_scores["training_time"] = training_time
597
+
598
+ logger.debug(
599
+ f"Feat importance evaluation selected {feat_scores['support'].sum()} features in {training_time:.2f} seconds"
600
+ )
513
601
 
514
- return feat_scores
602
+ feat_scores.to_csv(
603
+ f"{save_dir}/FI.csv", index=True, header=True, index_label="ID"
604
+ )
515
605
 
606
+ return feat_scores
516
607
 
517
- # Wrapper method
518
- # ----------------
608
+ # Wrapper method
609
+ # ----------------
519
610
 
611
+ # recursive feature elimination
612
+ def select_feature_by_rfe(
613
+ self, percentile: int = 20, save_dir: Optional[str] = None
614
+ ):
615
+ X, y, target_type = self.X_numerical, self.y, self.target_type
520
616
 
521
- # recursive feature elimination
522
- def select_feature_by_rfe(
523
- X, y, target_type, percentile: int = 20, save_dir: Optional[str] = None
524
- ):
525
- start = time.time()
526
- logger.debug("Running Recursive Feature Elimination...")
617
+ start = time.time()
618
+ logger.debug("Running Recursive Feature Elimination...")
527
619
 
528
- params = {
529
- "max_depth": 2**3,
530
- "random_state": 42,
531
- }
532
- estimator = (
533
- DecisionTreeClassifier(**params)
534
- if target_type == "classification"
535
- else DecisionTreeRegressor(**params)
536
- )
537
- rfe = RFE(estimator, n_features_to_select=percentile / 100, step=4, verbose=0)
538
- feat_selector = rfe.fit(X, y)
539
-
540
- feat_scores = pd.DataFrame(
541
- {
542
- "score": 0.0, # Default feature importance
543
- "support": feat_selector.get_support(),
544
- "features": X.columns,
545
- "rank": 0,
546
- "method": "RFE",
620
+ params = {
621
+ "max_depth": 2**3,
622
+ "random_state": 42,
547
623
  }
548
- )
549
- feat_scores.loc[
550
- feat_scores["features"].isin(feat_selector.get_feature_names_out()), "score"
551
- ] = list(feat_selector.estimator_.feature_importances_)
552
- feat_scores["rank"] = feat_scores["score"].rank(method="first", ascending=False)
553
- feat_scores.sort_values("rank", ascending=True, inplace=True)
554
-
555
- stop = time.time()
556
- training_time = timedelta(seconds=(stop - start)).total_seconds()
557
- feat_scores["training_time"] = training_time
558
-
559
- logger.debug(
560
- f"RFE evaluation selected {feat_scores['support'].sum()} features in {training_time:.2f} seconds"
561
- )
562
-
563
- feat_scores.to_csv(f"{save_dir}/RFE.csv", index=True, header=True, index_label="ID")
564
-
565
- return feat_scores
624
+ estimator = (
625
+ DecisionTreeClassifier(**params)
626
+ if target_type == "classification"
627
+ else DecisionTreeRegressor(**params)
628
+ )
629
+ rfe = RFE(estimator, n_features_to_select=percentile / 100, step=4, verbose=0)
630
+ feat_selector = rfe.fit(X, y)
631
+
632
+ feat_scores = pd.DataFrame(
633
+ {
634
+ "score": 0.0, # Default feature importance
635
+ "support": feat_selector.get_support(),
636
+ "features": X.columns,
637
+ "rank": 0,
638
+ "method": "RFE",
639
+ }
640
+ )
641
+ feat_scores.loc[
642
+ feat_scores["features"].isin(feat_selector.get_feature_names_out()), "score"
643
+ ] = list(feat_selector.estimator_.feature_importances_)
644
+ feat_scores["rank"] = feat_scores["score"].rank(method="first", ascending=False)
645
+ feat_scores.sort_values("rank", ascending=True, inplace=True)
646
+
647
+ stop = time.time()
648
+ training_time = timedelta(seconds=(stop - start)).total_seconds()
649
+ feat_scores["training_time"] = training_time
650
+
651
+ logger.debug(
652
+ f"RFE evaluation selected {feat_scores['support'].sum()} features in {training_time:.2f} seconds"
653
+ )
566
654
 
655
+ feat_scores.to_csv(
656
+ f"{save_dir}/RFE.csv", index=True, header=True, index_label="ID"
657
+ )
567
658
 
568
- # SequentialFeatureSelector (loss based, possibility to do forwards or backwards selection or removal)
569
- def select_feature_by_sfs(
570
- X, y, target_type, percentile: int = 20, save_dir: Optional[str] = None
571
- ):
572
- start = time.time()
573
- logger.debug("Running Sequential Feature Selection...")
574
- warnings.filterwarnings("ignore", category=FutureWarning)
659
+ return feat_scores
575
660
 
576
- params = {
577
- "max_depth": 2**3,
578
- "random_state": 42,
579
- }
580
- estimator = (
581
- DecisionTreeClassifier(**params)
582
- if target_type == "classification"
583
- else DecisionTreeRegressor(**params)
584
- )
661
+ # SequentialFeatureSelector (loss-based; supports forward or backward selection/elimination)
662
+ def select_feature_by_sfs(
663
+ self, percentile: int = 20, save_dir: Optional[str] = None
664
+ ):
665
+ X, y, target_type = self.X_numerical, self.y, self.target_type
585
666
 
586
- n_splits = 3
587
- n_samples = len(X)
588
- test_size = int(n_samples / (n_splits + 4))
589
- tscv = TimeSeriesSplit(n_splits=n_splits, test_size=test_size)
590
-
591
- score_function = (
592
- make_scorer(
593
- log_loss, response_method="predict_proba"
594
- ) # logloss needs probabilities
595
- if target_type == "classification"
596
- else make_scorer(root_mean_squared_error)
597
- ) # we avoid greater_is_better = False because it make the score negative and mess up ranking
598
-
599
- sfs = SequentialFeatureSelector(
600
- estimator,
601
- k_features=int(percentile * X.shape[1] / 100),
602
- forward=True,
603
- floating=True, # Enables dynamic feature elimination
604
- scoring=score_function,
605
- cv=tscv,
606
- n_jobs=-1,
607
- verbose=0,
608
- )
667
+ start = time.time()
668
+ logger.debug("Running Sequential Feature Selection...")
669
+ warnings.filterwarnings("ignore", category=FutureWarning)
609
670
 
610
- feat_selector = sfs.fit(X, y)
611
-
612
- # Extract selected features and their scores
613
- selected_features = set(feat_selector.k_feature_names_)
614
- feat_subsets = feat_selector.subsets_
615
-
616
- # Create DataFrame for feature scores
617
- feat_scores = pd.DataFrame(
618
- {
619
- "features": X.columns,
620
- "support": X.columns.isin(
621
- selected_features
622
- ), # TODO: comprendre pourquoi le support n'est pas correct (les bons scores ne sont pas toujours choisis)
623
- "score": 1000,
624
- "rank": None,
625
- "method": "SFS",
671
+ params = {
672
+ "max_depth": 2**3,
673
+ "random_state": 42,
626
674
  }
627
- )
628
-
629
- # Sort subsets by score (lower is better)
630
- sorted_subsets = sorted(feat_subsets.items(), key=lambda item: item[1]["avg_score"])
631
-
632
- # Record score per feature (first appearance)
633
- feature_score_map = {}
634
- for step in sorted_subsets:
635
- step = step[1]
636
- for feature in step["feature_names"]:
637
- if feature not in feature_score_map:
638
- feature_score_map[feature] = step["avg_score"]
639
-
640
- # Assign scores
641
- for feature, score in feature_score_map.items():
642
- feat_scores.loc[feat_scores["features"] == feature, "score"] = score
675
+ estimator = (
676
+ DecisionTreeClassifier(**params)
677
+ if target_type == "classification"
678
+ else DecisionTreeRegressor(**params)
679
+ )
643
680
 
644
- # rank by score (lower = better)
645
- feat_scores["rank"] = (
646
- feat_scores["score"].rank(method="first", ascending=True).astype(int)
647
- )
681
+ n_splits = 3
682
+ n_samples = len(X)
683
+ test_size = int(n_samples / (n_splits + 4))
684
+ tscv = TimeSeriesSplit(n_splits=n_splits, test_size=test_size)
685
+
686
+ score_function = (
687
+ make_scorer(
688
+ log_loss, response_method="predict_proba"
689
+ ) # logloss needs probabilities
690
+ if target_type == "classification"
691
+ else make_scorer(root_mean_squared_error)
692
+ ) # we avoid greater_is_better=False because it makes the score negative and messes up ranking
693
+
694
+ sfs = SequentialFeatureSelector(
695
+ estimator,
696
+ k_features=int(percentile * X.shape[1] / 100),
697
+ forward=True,
698
+ floating=True, # Enables dynamic feature elimination
699
+ scoring=score_function,
700
+ cv=tscv,
701
+ n_jobs=-1,
702
+ verbose=0,
703
+ )
648
704
 
649
- feat_scores.sort_values("rank", ascending=True, inplace=True)
705
+ feat_selector = sfs.fit(X, y)
706
+
707
+ # Extract selected features and their scores
708
+ selected_features = set(feat_selector.k_feature_names_)
709
+ feat_subsets = feat_selector.subsets_
710
+
711
+ # Create DataFrame for feature scores
712
+ feat_scores = pd.DataFrame(
713
+ {
714
+ "features": X.columns,
715
+ "support": X.columns.isin(
716
+ selected_features
717
+ ), # TODO: understand why the support is not correct (the best-scoring features are not always selected)
718
+ "score": 1000,
719
+ "rank": None,
720
+ "method": "SFS",
721
+ }
722
+ )
650
723
 
651
- stop = time.time()
652
- training_time = timedelta(seconds=(stop - start)).total_seconds()
653
- feat_scores["training_time"] = training_time
724
+ # Sort subsets by score (lower is better)
725
+ sorted_subsets = sorted(
726
+ feat_subsets.items(), key=lambda item: item[1]["avg_score"]
727
+ )
654
728
 
655
- logger.debug(
656
- f"SFS evaluation selected {feat_scores['support'].sum()} features in {training_time:.2f} seconds"
657
- )
729
+ # Record score per feature (first appearance)
730
+ feature_score_map = {}
731
+ for step in sorted_subsets:
732
+ step = step[1]
733
+ for feature in step["feature_names"]:
734
+ if feature not in feature_score_map:
735
+ feature_score_map[feature] = step["avg_score"]
736
+
737
+ # Assign scores
738
+ for feature, score in feature_score_map.items():
739
+ feat_scores.loc[feat_scores["features"] == feature, "score"] = score
740
+
741
+ # rank by score (lower = better)
742
+ feat_scores["rank"] = (
743
+ feat_scores["score"].rank(method="first", ascending=True).astype(int)
744
+ )
658
745
 
659
- feat_scores.to_csv(f"{save_dir}/SFS.csv", index=True, header=True, index_label="ID")
746
+ feat_scores.sort_values("rank", ascending=True, inplace=True)
660
747
 
661
- return feat_scores
748
+ stop = time.time()
749
+ training_time = timedelta(seconds=(stop - start)).total_seconds()
750
+ feat_scores["training_time"] = training_time
662
751
 
752
+ logger.debug(
753
+ f"SFS evaluation selected {feat_scores['support'].sum()} features in {training_time:.2f} seconds"
754
+ )
663
755
 
664
- # Remove correlation
665
- # ------------------
756
+ feat_scores.to_csv(
757
+ f"{save_dir}/SFS.csv", index=True, header=True, index_label="ID"
758
+ )
666
759
 
760
+ return feat_scores
761
+
762
+
763
+ class PreprocessModel:
764
+
765
+ def __init__(
766
+ self,
767
+ train,
768
+ val,
769
+ test,
770
+ dataset,
771
+ target_numbers,
772
+ target_clf,
773
+ models_idx,
774
+ time_series,
775
+ max_timesteps,
776
+ group_column,
777
+ date_column,
778
+ **kwargs,
779
+ ):
780
+ self.dataset = dataset
781
+ self.target_numbers = target_numbers
782
+ self.target_clf = target_clf
783
+ self.models_idx = models_idx
784
+ self.time_series = time_series
785
+ self.max_timesteps = max_timesteps
786
+ self.group_column = group_column
787
+ self.date_column = date_column
788
+
789
+ self.dataset_dir = dataset.path
790
+ self.data_dir = f"{self.dataset_dir}/data"
791
+ self.preprocessing_dir = f"{self.dataset_dir}/preprocessing"
792
+
793
+ self.all_features = dataset.get_all_features(
794
+ date_column=date_column, group_column=group_column
795
+ )
796
+ columns_to_keep = self.all_features + [
797
+ f"TARGET_{i}" for i in self.target_numbers
798
+ ]
799
+ duplicates = [
800
+ col for col in set(columns_to_keep) if columns_to_keep.count(col) > 1
801
+ ]
802
+ if duplicates:
803
+ raise ValueError(f"Doublons détectés dans columns_to_keep: {duplicates}")
804
+
805
+ self.train = train[columns_to_keep]
806
+ if isinstance(val, pd.DataFrame):
807
+ self.val = val[columns_to_keep]
808
+ if isinstance(test, pd.DataFrame):
809
+ self.test = test[columns_to_keep]
810
+
811
+ def run(self):
812
+ # save data
813
+ joblib.dump(self.train, f"{self.data_dir}/train.pkl")
814
+ joblib.dump(self.val, f"{self.data_dir}/val.pkl")
815
+ joblib.dump(self.test, f"{self.data_dir}/test.pkl")
816
+
817
+ # scaling features
818
+ if any(t not in self.target_clf for t in self.target_numbers) and any(
819
+ all_models[i].get("need_scaling") for i in self.models_idx
820
+ ):
821
+ logger.info("Scaling features...")
822
+ train_scaled, scaler_x, scalers_y = self.scale_data(self.train)
823
+ val_scaled, _, _ = self.scale_data(
824
+ self.val,
825
+ scaler_x=scaler_x,
826
+ scalers_y=scalers_y,
827
+ )
828
+ test_scaled, _, _ = self.scale_data(
829
+ self.test,
830
+ scaler_x=scaler_x,
831
+ scalers_y=scalers_y,
832
+ )
833
+ else:
834
+ train_scaled = None
835
+ val_scaled = None
836
+ test_scaled = None
837
+
838
+ # save data
839
+ joblib.dump(train_scaled, f"{self.data_dir}/train_scaled.pkl")
840
+ joblib.dump(val_scaled, f"{self.data_dir}/val_scaled.pkl")
841
+ joblib.dump(test_scaled, f"{self.data_dir}/test_scaled.pkl")
842
+
843
+ data = {
844
+ "train": self.train,
845
+ "val": self.val,
846
+ "test": self.test,
847
+ "train_scaled": train_scaled,
848
+ "val_scaled": val_scaled,
849
+ "test_scaled": test_scaled,
850
+ "scalers_y": scalers_y,
851
+ }
667
852
 
668
- def remove_correlated_features(
669
- X: pd.DataFrame, features: list, corr_threshold: int, vizualize: bool = False
670
- ):
671
- # Create correlation matrix, select upper triangle & remove features with correlation greater than threshold
672
- corr_matrix = X[features].corr().abs()
853
+ # reshape data for time series
854
+ reshaped_data = None
855
+ if (
856
+ any(all_models[i].get("recurrent") for i in self.models_idx)
857
+ and self.time_series
858
+ ):
859
+ # reshaping data for recurrent models
860
+ logger.info("Reshaping data for recurrent models...")
861
+ reshaped_data = self.reshape_time_series(
862
+ train_scaled,
863
+ val_scaled,
864
+ test_scaled,
865
+ features=self.all_features,
866
+ timesteps=self.max_timesteps,
867
+ )
673
868
 
674
- upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
675
- features_uncorrelated = [
676
- column
677
- for column in upper.columns
678
- if all(upper[column].dropna() <= corr_threshold / 100)
679
- ]
680
- features_correlated = [
681
- column for column in upper.columns if any(upper[column] > corr_threshold / 100)
682
- ]
869
+ return data, reshaped_data
683
870
 
684
- if vizualize:
685
- features_selected_visualization = (
686
- X[features]
687
- .corr()
688
- .where(np.triu(np.ones(len(features)), k=1).astype(bool))
689
- .fillna(0)
871
+ def inference(self):
872
+ # self.train is new data here
873
+ scaler_x = joblib.load(f"{self.preprocessing_dir}/scaler_x.pkl")
874
+ scaled_data = scaler_x.transform(self.train)
875
+ scaled_data = pd.DataFrame(
876
+ scaled_data, columns=self.train.columns, index=self.train.index
690
877
  )
691
- # Plot the heatmap
692
- plt.figure(figsize=(10, 8))
693
- sns.heatmap(
694
- corr_matrix,
695
- annot=True,
696
- cmap="coolwarm",
697
- center=0,
698
- linewidths=1,
699
- linecolor="black",
700
- )
701
- plt.title(f"Correlation Matrix")
702
- plt.show()
703
-
704
- logger.info(f"\n{features_selected_visualization.describe().to_string()}")
705
- logger.info(f"\n{features_selected_visualization.to_string()}")
706
- return features_uncorrelated, features_correlated
707
-
708
-
709
- # Main feature selection function
710
- def feature_selection(
711
- dataset_id: int,
712
- train: pd.DataFrame,
713
- target_number: int,
714
- single_process: bool = False,
715
- ):
716
- """Function to do feature selection with a range of different feature selection technics
717
-
718
- Args:
719
- - train (pd.DataFrame): a pandas train set
720
- - target_number (in): a target, targets need to be name ``TARGET_{n}```
721
- - single_process (bool): if True, run all feature selection methods in a single process. If False, run them in parallel.
722
- """
723
-
724
- # Create the feature selection in db
725
- target = Target.find_by(name=f"TARGET_{target_number}")
726
- dataset = Dataset.get(dataset_id)
727
- percentile = dataset.percentile
728
- corr_threshold = dataset.corr_threshold
729
- max_features = dataset.max_features
730
-
731
- feature_selection = FeatureSelection.upsert(
732
- match_fields=["target_id", "dataset_id"],
733
- target_id=target.id,
734
- dataset_id=dataset.id,
735
- )
736
878
 
737
- X = train.loc[:, ~train.columns.str.contains("^TARGET_")]
738
- y = train[f"TARGET_{target_number}"]
879
+ reshaped_data = None
880
+ if (
881
+ any(all_models[i].get("recurrent") for i in self.models_idx)
882
+ and self.time_series
883
+ ):
884
+ # we need to make sure we have max_timesteps of data after grouping by group_column
885
+ if (
886
+ self.group_column
887
+ and scaled_data.groupby(self.group_column).size().min()
888
+ < self.max_timesteps
889
+ ) or scaled_data.shape[0] < self.max_timesteps:
890
+ raise ValueError(
891
+ f"Not enough data for group_column {self.group_column} to reshape data for recurrent models"
892
+ )
739
893
 
740
- logger.info(f"Starting feature selection for TARGET_{target_number}...")
894
+ # reshaping data for recurrent models
895
+ logger.info("Reshaping data for recurrent models...")
896
+ reshaped_data = self.reshape_time_series(
897
+ scaled_data,
898
+ features=self.all_features,
899
+ timesteps=self.max_timesteps,
900
+ )
741
901
 
742
- target_type = "classification" if target_number in TARGETS_CLF else "regression"
902
+ return self.train, scaled_data, reshaped_data
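The inference path depends on the scaler fitted during training being persisted with joblib and reloaded unchanged, so new rows are transformed exactly like the training data. A small self-contained sketch of that pattern (paths and values are illustrative):

    import joblib
    import pandas as pd
    from sklearn.preprocessing import StandardScaler

    # Training time: fit on the training features and persist the fitted scaler.
    train_X = pd.DataFrame({"f1": [1.0, 2.0, 3.0], "f2": [10.0, 20.0, 30.0]})
    scaler_x = StandardScaler().fit(train_X)
    joblib.dump(scaler_x, "scaler_x.pkl")

    # Inference time: reload and apply the same transformation to unseen rows.
    new_X = pd.DataFrame({"f1": [2.5], "f2": [25.0]})
    scaler_x = joblib.load("scaler_x.pkl")
    new_scaled = pd.DataFrame(
        scaler_x.transform(new_X), columns=new_X.columns, index=new_X.index
    )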
903
+
904
+ # scaling
905
+ def scale_data(
906
+ self,
907
+ df: pd.DataFrame,
908
+ scaler_x=None,
909
+ scalers_y: Optional[list] = None,
910
+ ):
911
+ logger.info("Scale data...")
912
+ X = df.loc[:, ~df.columns.str.contains("^TARGET_")]
913
+
914
+ if scaler_x:
915
+ X_scaled = pd.DataFrame(
916
+ scaler_x.transform(X), columns=list(X.columns), index=X.index
917
+ )
918
+ else:
919
+ scaler_x = StandardScaler() # MinMaxScaler(feature_range=(-1,1))
920
+ X_scaled = pd.DataFrame(
921
+ scaler_x.fit_transform(X), columns=list(X.columns), index=X.index
922
+ )
923
+ joblib.dump(scaler_x, f"{self.preprocessing_dir}/scaler_x.pkl")
743
924
 
744
- fs_dir_target = f"{dataset.path}/{y.name}/feature_selection"
745
- preprocessing_dir = f"{dataset.path}/preprocessing"
746
- os.makedirs(fs_dir_target, exist_ok=True)
747
- clean_directory(fs_dir_target)
925
+ # Determine which targets need to be scaled
926
+ targets_numbers_to_scale = [
927
+ i for i in self.target_numbers if i not in self.target_clf
928
+ ]
748
929
 
749
- # Let's start by removing extremely correlated features
750
- # This is needed to reduce the number of features, but also for methods such as ANOVA or chi2 that require independent features
751
- # TODO: we could also remove low variance features
752
- features_uncorrelated, features_correlated = remove_correlated_features(
753
- X, X.columns, 90, vizualize=False
754
- )
755
- X = X[features_uncorrelated]
756
-
757
- logger.debug(
758
- f"""
759
- \nWe first have removed {len(features_correlated)} features with correlation greater than 90%
760
- \nWe are looking to capture {percentile}% of {len(X.columns)} features, i.e. {int(len(X.columns)*percentile/100)} features, with different feature selection methods
761
- \nWe will then remove features correlated above {corr_threshold}%, keeping the ones with the best ranks
762
- \nFinally, we will keep only the {max_features} best ranked features
763
- """
764
- )
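As a concrete reading of the numbers logged here (illustrative values): with percentile=20 and 300 features surviving the initial 90% correlation cut, each selection method targets int(300 * 20 / 100) = 60 features; the deduplicated, rank-sorted union of the per-method picks is then pruned again at corr_threshold and finally truncated to max_features.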
930
+ # Dictionary to store scaled target data
931
+ scaled_targets = {}
765
932
 
766
- start = time.time()
933
+ if scalers_y:
934
+ for target_number in targets_numbers_to_scale:
935
+ y = df[[f"TARGET_{target_number}"]]
936
+ scaled_targets[target_number] = pd.DataFrame(
937
+ scalers_y[f"scaler_y_{target_number}"].transform(y.values),
938
+ columns=y.columns,
939
+ index=y.index,
940
+ )
941
+ else:
942
+ scalers_y = {}
943
+ for target_number in targets_numbers_to_scale:
944
+ scaler_y = StandardScaler()
945
+ y = df[[f"TARGET_{target_number}"]]
946
+
947
+ scaled_y = pd.DataFrame(
948
+ scaler_y.fit_transform(y.values),
949
+ columns=y.columns,
950
+ index=y.index,
951
+ )
952
+ joblib.dump(
953
+ scaler_y, f"{self.preprocessing_dir}/scaler_y_{target_number}.pkl"
954
+ )
767
955
 
768
- # handling categorical features (only if classification)
769
- categorical_features = X.select_dtypes(include=["int64", "Int64"]).columns.tolist()
770
- X_categorical = X[categorical_features]
956
+ scalers_y[f"scaler_y_{target_number}"] = scaler_y
957
+ scaled_targets[target_number] = scaled_y
771
958
 
772
- if target_type == "classification":
773
- feat_scores = select_categorical_features(
774
- X_categorical, y, percentile, save_dir=fs_dir_target
959
+ # Reconstruct y_scaled in the original order
960
+ y_scaled = pd.concat(
961
+ [
962
+ scaled_targets[target_number]
963
+ for target_number in targets_numbers_to_scale
964
+ ],
965
+ axis=1,
775
966
  )
776
- with get_db() as db:
777
- for row in feat_scores.itertuples(index=False):
778
- feature = Feature.find_by(name=row.features, db=db)
779
- FeatureSelectionRank.upsert(
780
- ["feature_selection_id", "feature_id", "method"],
781
- db=db,
782
- score=row.score,
783
- pvalue=row.pvalue,
784
- support=row.support,
785
- rank=row.rank,
786
- method=row.method,
787
- training_time=row.training_time,
788
- feature_selection_id=feature_selection.id,
789
- feature_id=feature.id,
790
- )
791
- categorical_features_selected = feat_scores[feat_scores["support"] == True][
792
- "features"
793
- ].values.tolist()
794
-
795
- # removing categorical features from X
796
- numerical_features = list(set(X.columns).difference(set(categorical_features)))
797
- X_numerical = X[numerical_features]
798
-
799
- results = []
800
- if single_process:
801
- results = [
802
- select_feature_by_linear_correlation(
803
- X_numerical, y, target_type, percentile, save_dir=fs_dir_target
804
- ),
805
- select_feature_by_nonlinear_correlation(
806
- X_numerical, y, target_type, percentile, save_dir=fs_dir_target
807
- ),
808
- select_feature_by_mi(
809
- X_numerical, y, target_type, percentile, save_dir=fs_dir_target
810
- ),
811
- select_feature_by_feat_imp(
812
- X_numerical, y, target_type, percentile, save_dir=fs_dir_target
813
- ),
814
- select_feature_by_rfe(
815
- X_numerical, y, target_type, percentile, save_dir=fs_dir_target
816
- ),
817
- # select_feature_by_sfs(
818
- # X_numerical, y, target_type, percentile, save_dir=fs_dir_target
819
- # ), # TODO: this is taking too long
967
+ y_not_scaled = df[
968
+ df.columns.intersection([f"TARGET_{i}" for i in self.target_clf])
820
969
  ]
821
- else:
822
- # Use ProcessPoolExecutor to run tasks in parallel
823
- with ProcessPoolExecutor() as executor:
824
- # Submit different functions to be executed in parallel
825
- futures = [
826
- executor.submit(
827
- select_feature_by_linear_correlation,
828
- X_numerical,
829
- y,
830
- target_type,
831
- percentile,
832
- save_dir=fs_dir_target,
833
- ),
834
- executor.submit(
835
- select_feature_by_nonlinear_correlation,
836
- X_numerical,
837
- y,
838
- target_type,
839
- percentile,
840
- save_dir=fs_dir_target,
841
- ),
842
- executor.submit(
843
- select_feature_by_mi,
844
- X_numerical,
845
- y,
846
- target_type,
847
- percentile,
848
- save_dir=fs_dir_target,
849
- ),
850
- executor.submit(
851
- select_feature_by_feat_imp,
852
- X_numerical,
853
- y,
854
- target_type,
855
- percentile,
856
- save_dir=fs_dir_target,
857
- ),
858
- executor.submit(
859
- select_feature_by_rfe,
860
- X_numerical,
861
- y,
862
- target_type,
863
- percentile,
864
- save_dir=fs_dir_target,
865
- ),
866
- executor.submit(
867
- select_feature_by_sfs,
868
- X_numerical,
869
- y,
870
- target_type,
871
- percentile,
872
- save_dir=fs_dir_target,
873
- ),
874
- ]
875
-
876
- # Wait for all futures to complete and gather the results
877
- with tqdm(total=len(futures)) as pbar:
878
- for future in as_completed(futures):
879
- results.append(future.result())
880
- pbar.update(1)
881
- logger.info(f"Finished feature selection for target {target_number}")
882
-
883
- stop = time.time()
884
-
885
- # Once all tasks are completed, start by inserting results to db
886
- feat_scores = pd.concat(
887
- results,
888
- axis=0,
889
- )
890
-
891
- logger.info("Inserting feature selection results to db...")
892
- rows = []
893
-
894
- with get_db() as db:
895
- feature_map = {f.name: f.id for f in Feature.get_all(db=db, limit=20000)}
896
- for row in feat_scores.itertuples(index=False):
897
- feature_id = feature_map.get(row.features)
898
- if not feature_id:
899
- continue # or raise if feature must exist
900
-
901
- rows.append(
902
- {
903
- "feature_selection_id": feature_selection.id,
904
- "feature_id": feature_id,
905
- "method": row.method,
906
- "score": row.score,
907
- "pvalue": None if pd.isna(row.pvalue) else row.pvalue,
908
- "support": row.support,
909
- "rank": row.rank,
910
- "training_time": row.training_time,
911
- }
912
- )
913
-
914
- if len(rows) == 0:
915
- raise ValueError(f"No features selected for TARGET_{target_number}")
916
-
917
- FeatureSelectionRank.bulk_upsert(rows=rows, db=db)
918
-
919
- # Merge the results
920
- features_selected = feat_scores[feat_scores["support"] == True][
921
- ["features", "rank"]
922
- ]
923
- features_selected.sort_values("rank", inplace=True)
924
- features_selected.drop_duplicates("features", inplace=True)
925
970
 
926
- features_selected_list = features_selected["features"].values.tolist()
971
+ # Ensure the final DataFrame keeps the original order
972
+ df_scaled = pd.concat(
973
+ [X_scaled, y_scaled, y_not_scaled],
974
+ axis=1,
975
+ )[
976
+ df.columns
977
+ ] # Reorder columns to match original `df`
978
+
979
+ if not df_scaled.columns.equals(df.columns):
980
+ raise Exception("Columns are not in the same order after scaling.")
981
+
982
+ return df_scaled, scaler_x, scalers_y
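A compact sketch of the target-scaling convention implemented above: each regression target gets its own StandardScaler stored under a scaler_y_{n} key, while classification targets are passed through unscaled. Column names and values below are illustrative:

    import pandas as pd
    from sklearn.preprocessing import StandardScaler

    df = pd.DataFrame({
        "FEAT_A": [0.1, 0.4, 0.9],
        "TARGET_1": [10.0, 12.0, 9.0],  # regression target -> scaled
        "TARGET_2": [0, 1, 1],          # classification target -> left untouched
    })
    regression_targets = [1]

    scalers_y, scaled_targets = {}, {}
    for n in regression_targets:
        y = df[[f"TARGET_{n}"]]
        scaler = StandardScaler()
        scaled_targets[n] = pd.DataFrame(
            scaler.fit_transform(y.values), columns=y.columns, index=y.index
        )
        scalers_y[f"scaler_y_{n}"] = scaler

Reusing the stored scalers_y on validation and test frames keeps the target scale consistent across splits.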
983
+
984
+ # Reshape into 3D tensors for recurrent models
985
+ def reshape_time_series(
986
+ self,
987
+ train: pd.DataFrame,
988
+ val: pd.DataFrame,
989
+ test: pd.DataFrame,
990
+ features: list,
991
+ timesteps: int = 120,
992
+ ):
993
+ # always scale for recurrent layers: train should be scaled
994
+ group_column = self.group_column
995
+
996
+ target_columns = train.columns.intersection(
997
+ [f"TARGET_{i}" for i in self.target_numbers]
998
+ )
927
999
 
928
- logger.info("Merging feature selection methods...")
929
- # features_selected = list(dict.fromkeys(features_selected_by_mi + features_selected_by_nonlinear_correlation + features_selected_by_linear_correlation))
930
- features_selected_by_every_methods = set(results[0]["features"].values.tolist())
1000
+ data = pd.concat([train, val, test], axis=0)
931
1001
 
932
- for df in results[1:]:
933
- features_selected_by_every_methods &= set(
934
- df["features"].values.tolist()
935
- ) # intersection
1002
+ def reshape_df(df: pd.DataFrame, group_series: pd.Series, timesteps: int):
1003
+ fill_value = [[[0] * len(df.columns)]]
936
1004
 
937
- features_selected_by_every_methods = list(features_selected_by_every_methods)
1005
+ def shiftsum(x, timesteps: int):
1006
+ tmp = x.copy()
1007
+ for i in range(1, timesteps):
1008
+ tmp = x.shift(i, fill_value=fill_value) + tmp
1009
+ return tmp
938
1010
 
939
- logger.debug(
940
- f"We selected {len(features_selected_list)} features and {len(features_selected_by_every_methods)} were selected unanimously:"
941
- )
942
- logger.debug(features_selected_by_every_methods)
1011
+ logger.info("Grouping each feature in a unique column with list...")
1012
+ df_reshaped = df.apply(list, axis=1).apply(lambda x: [list(x)])
1013
+ df_reshaped = pd.concat([df_reshaped, group_series], axis=1)
943
1014
 
944
- pd.Series(features_selected_list).to_csv(
945
- f"{fs_dir_target}/features_before_corr.csv",
946
- index=True,
947
- header=True,
948
- index_label="ID",
949
- )
950
- features, features_correlated = remove_correlated_features(
951
- X, features_selected_list, corr_threshold
952
- )
953
- pd.Series(features).to_csv(
954
- f"{fs_dir_target}/features_before_max.csv",
955
- index=True,
956
- header=True,
957
- index_label="ID",
958
- )
959
- features = features[:max_features]
1015
+ logger.info("Grouping method stock and creating timesteps...")
1016
+ df_reshaped = (
1017
+ df_reshaped.groupby(group_column)[0]
1018
+ .apply(lambda x: shiftsum(x, timesteps))
1019
+ .reset_index(group_column, drop=True)
1020
+ .rename("RECURRENT_FEATURES")
1021
+ )
1022
+ df_reshaped = pd.DataFrame(df_reshaped)
960
1023
 
961
- features += categorical_features_selected if target_type == "classification" else []
962
- logger.debug(
963
- f"Final pre-selection: {len(features)} features below {corr_threshold}% out of {len(features_selected_list)} features, and rejected {len(features_correlated)} features, {100*len(features)/len(features_selected_list):.2f}% features selected"
964
- )
1024
+ return df_reshaped
965
1025
 
966
- features_selected_by_every_methods_uncorrelated = list(
967
- set(features) & set(features_selected_by_every_methods)
968
- )
969
- logger.debug(
970
- f"In this pre-selection, there is {len(features_selected_by_every_methods_uncorrelated)} features from the {len(features_selected_by_every_methods)} selected unanimously\n"
971
- )
1026
+ data_reshaped = reshape_df(data[features], data[group_column], timesteps)
972
1027
 
973
- logger.debug(
974
- features_selected[features_selected["features"].isin(features)].to_markdown()
975
- )
1028
+ data_reshaped[target_columns] = data[target_columns]
976
1029
 
977
- best_features_path = Path(
978
- f"{preprocessing_dir}/features_{target_number}.pkl"
979
- ).resolve()
980
- if PYTHON_ENV != "Test":
981
- joblib.dump(features, best_features_path)
1030
+ logger.info("Separating train, val, test data and creating np arrays...")
1031
+ train_reshaped = data_reshaped.loc[train.index]
1032
+ val_reshaped = data_reshaped.loc[val.index]
1033
+ test_reshaped = data_reshaped.loc[test.index]
982
1034
 
983
- db_features = Feature.filter(name__in=features)
984
- # Order matters, to keep the same order in db as in features, we need: map features by name
985
- feature_by_name = {f.name: f for f in db_features}
986
- # Reorder them according to original `features` list
987
- ordered_db_features = [
988
- feature_by_name[name] for name in features if name in feature_by_name
989
- ]
990
-
991
- feature_selection = FeatureSelection.get(feature_selection.id)
992
- feature_selection = feature_selection.add_features(ordered_db_features)
993
- feature_selection.training_time = stop - start
994
- feature_selection.best_features_path = best_features_path
995
- feature_selection.save()
1035
+ x_train_reshaped = np.array(
1036
+ train_reshaped["RECURRENT_FEATURES"].values.tolist()
1037
+ )
1038
+ y_train_reshaped = np.array(train_reshaped[target_columns].reset_index())
1039
+ x_val_reshaped = np.array(val_reshaped["RECURRENT_FEATURES"].values.tolist())
1040
+ y_val_reshaped = np.array(val_reshaped[target_columns].reset_index())
1041
+ x_test_reshaped = np.array(test_reshaped["RECURRENT_FEATURES"].values.tolist())
1042
+ y_test_reshaped = np.array(test_reshaped[target_columns].reset_index())
1043
+
1044
+ reshaped_data = {
1045
+ "x_train_reshaped": x_train_reshaped,
1046
+ "y_train_reshaped": y_train_reshaped,
1047
+ "x_val_reshaped": x_val_reshaped,
1048
+ "y_val_reshaped": y_val_reshaped,
1049
+ "x_test_reshaped": x_test_reshaped,
1050
+ "y_test_reshaped": y_test_reshaped,
1051
+ }
996
1052
 
997
- return features
1053
+ return reshaped_data
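The shift-and-concatenate step above turns each row into a list of timesteps feature vectors, so the stacked x arrays end up with shape (n_samples, timesteps, n_features). A minimal numpy sketch of the same windowing idea, using explicit slicing instead of the pandas shift and zero-padding the first rows as the fill_value does above:

    import numpy as np

    n_samples, timesteps, n_features = 8, 3, 4
    values = np.arange(n_samples * n_features, dtype=float).reshape(n_samples, n_features)

    # Each window holds the current row plus the (timesteps - 1) rows before it.
    padded = np.vstack([np.zeros((timesteps - 1, n_features)), values])
    windows = np.stack([padded[i : i + timesteps] for i in range(n_samples)])

    assert windows.shape == (n_samples, timesteps, n_features)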
998
1054
 
999
1055
 
1056
+ # utils
1000
1057
  # TODO : can we use this to select the ideal number of features ?
1001
1058
  def feature_selection_analysis(feature_selection_id: int, n_components: int = 5):
1002
1059
 
@@ -1072,158 +1129,18 @@ def feature_selection_analysis(feature_selection_id: int, n_components: int = 5)
1072
1129
  plt.show()
1073
1130
 
1074
1131
 
1075
- # scaling
1076
- def scale_data(
1077
- df: pd.DataFrame, save_dir: str, scaler_x=None, scalers_y: Optional[list] = None
1078
- ):
1079
- logger.info("Scale data...")
1080
- X = df.loc[:, ~df.columns.str.contains("^TARGET_")]
1081
-
1082
- if scaler_x:
1083
- X_scaled = pd.DataFrame(
1084
- scaler_x.transform(X), columns=list(X.columns), index=X.index
1085
- )
1086
- else:
1087
- scaler_x = StandardScaler() # MinMaxScaler(feature_range=(-1,1))
1088
- X_scaled = pd.DataFrame(
1089
- scaler_x.fit_transform(X), columns=list(X.columns), index=X.index
1090
- )
1091
- if PYTHON_ENV != "Test":
1092
- joblib.dump(scaler_x, f"{save_dir}/scaler_x.pkl")
1093
-
1094
- # Determine which targets need to be scaled
1095
- targets_numbers_to_scale = [i for i in TARGETS_NUMBER if i not in TARGETS_CLF]
1096
-
1097
- # Dictionary to store scaled target data
1098
- scaled_targets = {}
1099
-
1100
- if scalers_y:
1101
- for target_number in targets_numbers_to_scale:
1102
- y = df[[f"TARGET_{target_number}"]]
1103
- scaled_targets[target_number] = pd.DataFrame(
1104
- scalers_y[f"scaler_y_{target_number}"].transform(y.values),
1105
- columns=y.columns,
1106
- index=y.index,
1107
- )
1108
- else:
1109
- scalers_y = {}
1110
- for target_number in targets_numbers_to_scale:
1111
- scaler_y = StandardScaler()
1112
- y = df[[f"TARGET_{target_number}"]]
1113
-
1114
- scaled_y = pd.DataFrame(
1115
- scaler_y.fit_transform(y.values),
1116
- columns=y.columns,
1117
- index=y.index,
1118
- )
1119
- if PYTHON_ENV != "Test":
1120
- joblib.dump(scaler_y, f"{save_dir}/scaler_y_{target_number}.pkl")
1121
-
1122
- scalers_y[f"scaler_y_{target_number}"] = scaler_y
1123
- scaled_targets[target_number] = scaled_y
1124
-
1125
- # Reconstruct y_scaled in the original order
1126
- y_scaled = pd.concat(
1127
- [scaled_targets[target_number] for target_number in targets_numbers_to_scale],
1128
- axis=1,
1129
- )
1130
- y_not_scaled = df[df.columns.intersection([f"TARGET_{i}" for i in TARGETS_CLF])]
1131
-
1132
- # Ensure the final DataFrame keeps the original order
1133
- df_scaled = pd.concat(
1134
- [X_scaled, y_scaled, y_not_scaled],
1135
- axis=1,
1136
- )[
1137
- df.columns
1138
- ] # Reorder columns to match original `df`
1139
-
1140
- if not df_scaled.columns.equals(df.columns):
1141
- raise Exception("Columns are not in the same order after scaling.")
1142
-
1143
- return df_scaled, scaler_x, scalers_y
1144
-
1145
-
1146
- # Reshape into 3D tensors for recurrent models
1147
- def reshape_time_series(
1148
- train: pd.DataFrame,
1149
- val: pd.DataFrame,
1150
- test: pd.DataFrame,
1151
- features: list,
1152
- timesteps: int = 120,
1153
- ):
1154
- # always scale for recurrent layers : train should be scaled
1155
-
1156
- target_columns = train.columns.intersection([f"TARGET_{i}" for i in TARGETS_NUMBER])
1157
-
1158
- data = pd.concat([train, val, test], axis=0)
1159
-
1160
- data_reshaped = reshape_df(data[features], data[GROUPING_COLUMN], timesteps)
1161
-
1162
- data_reshaped[target_columns] = data[target_columns]
1163
-
1164
- logger.info("Separating train, val, test data and creating np arrays...")
1165
- train_reshaped = data_reshaped.loc[train.index]
1166
- val_reshaped = data_reshaped.loc[val.index]
1167
- test_reshaped = data_reshaped.loc[test.index]
1168
-
1169
- x_train_reshaped = np.array(train_reshaped["RECURRENT_FEATURES"].values.tolist())
1170
- y_train_reshaped = np.array(train_reshaped[target_columns].reset_index())
1171
- x_val_reshaped = np.array(val_reshaped["RECURRENT_FEATURES"].values.tolist())
1172
- y_val_reshaped = np.array(val_reshaped[target_columns].reset_index())
1173
- x_test_reshaped = np.array(test_reshaped["RECURRENT_FEATURES"].values.tolist())
1174
- y_test_reshaped = np.array(test_reshaped[target_columns].reset_index())
1175
-
1176
- reshaped_data = {
1177
- "x_train_reshaped": x_train_reshaped,
1178
- "y_train_reshaped": y_train_reshaped,
1179
- "x_val_reshaped": x_val_reshaped,
1180
- "y_val_reshaped": y_val_reshaped,
1181
- "x_test_reshaped": x_test_reshaped,
1182
- "y_test_reshaped": y_test_reshaped,
1183
- }
1184
-
1185
- return reshaped_data
1186
-
1187
-
1188
- def reshape_df(df: pd.DataFrame, stock_column: pd.DataFrame, timesteps: int):
1189
- fill_value = [[[0] * len(df.columns)]]
1190
-
1191
- def shiftsum(x, timesteps: int):
1192
- tmp = x.copy()
1193
- for i in range(1, timesteps):
1194
- tmp = x.shift(i, fill_value=fill_value) + tmp
1195
- return tmp
1196
-
1197
- logger.info("Grouping each feature in a unique column with list...")
1198
- df_reshaped = df.apply(list, axis=1).apply(lambda x: [list(x)])
1199
- df_reshaped = pd.concat([df_reshaped, stock_column], axis=1)
1200
-
1201
- logger.info("Grouping method stock and creating timesteps...")
1202
- df_reshaped = (
1203
- df_reshaped.groupby(GROUPING_COLUMN)[0]
1204
- .apply(lambda x: shiftsum(x, timesteps))
1205
- .reset_index(GROUPING_COLUMN, drop=True)
1206
- .rename("RECURRENT_FEATURES")
1207
- )
1208
- df_reshaped = pd.DataFrame(df_reshaped)
1209
-
1210
- return df_reshaped
1211
-
1212
-
1213
- def load_train_data(dataset_dir, target_number, target_type="regression"):
1214
- train_data_dir = f"{dataset_dir}/data"
1215
- preprocessing_dir = f"{dataset_dir}/preprocessing"
1216
-
1217
- _scaler_y = (
1218
- joblib.load(f"{preprocessing_dir}/scaler_y_{target_number}.pkl")
1219
- if target_type == "regression"
1220
- else None
1221
- )
1132
+ def get_features_by_types(df: pd.DataFrame, sample_categorical_threshold: int = 15):
1133
+ categorical_features = [
1134
+ col
1135
+ for col in df.columns
1136
+ if df[col].nunique() <= sample_categorical_threshold
1137
+ and df[col].dtype in ["int64", "Int64"]
1138
+ ]
1139
+ df_categorical = df[categorical_features]
1140
+ logger.info(f"Number of categorical features: {len(categorical_features)}")
1222
1141
 
1223
- logger.info("Loading data...")
1224
- train = joblib.load(f"{train_data_dir}/train.pkl")
1225
- val = joblib.load(f"{train_data_dir}/val.pkl")
1226
- train_scaled = joblib.load(f"{train_data_dir}/train_scaled.pkl")
1227
- val_scaled = joblib.load(f"{train_data_dir}/val_scaled.pkl")
1142
+ numerical_features = list(set(df.columns).difference(set(categorical_features)))
1143
+ df_numerical = df[numerical_features]
1144
+ logger.info(f"Number of numerical features: {len(numerical_features)}")
1228
1145
 
1229
- return train, val, train_scaled, val_scaled, _scaler_y
1146
+ return df_categorical, df_numerical
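A small illustration of the split rule implemented by get_features_by_types: an integer-typed column with at most sample_categorical_threshold distinct values counts as categorical, everything else as numerical. The toy columns below are hypothetical:

    import pandas as pd

    threshold = 15
    df = pd.DataFrame({
        "sector_code": pd.Series([1, 2, 2, 3], dtype="int64"),  # few integer levels -> categorical
        "ret_1d": [0.01, -0.02, 0.005, 0.0],                    # continuous float -> numerical
    })

    categorical = [
        col for col in df.columns
        if df[col].nunique() <= threshold and df[col].dtype in ["int64", "Int64"]
    ]
    numerical = [col for col in df.columns if col not in categorical]
    print(categorical, numerical)  # ['sector_code'] ['ret_1d']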