lecrapaud 0.20.1__py3-none-any.whl → 0.21.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of lecrapaud has been flagged as potentially problematic.

lecrapaud/__init__.py CHANGED
@@ -1 +1,5 @@
1
1
  from lecrapaud.api import *
2
+
3
+ # Export default parameters for easy access
4
+ from lecrapaud.api import ExperimentEngine
5
+ DEFAULT_EXPERIMENT_PARAMS = ExperimentEngine.DEFAULT_PARAMS
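The new re-export makes the default experiment parameters reachable from the package root. A minimal sketch of how this could be used (assuming lecrapaud 0.21.0 is installed):

```python
# Minimal sketch: inspecting the exported defaults (lecrapaud 0.21.0)
from lecrapaud import DEFAULT_EXPERIMENT_PARAMS

print(DEFAULT_EXPERIMENT_PARAMS["val_size"])          # 0.2
print(DEFAULT_EXPERIMENT_PARAMS["number_of_trials"])  # 20
```

Note that `DEFAULT_EXPERIMENT_PARAMS` is a direct reference to `ExperimentEngine.DEFAULT_PARAMS`; `ExperimentEngine.get_default_params()` (added below in `api.py`) returns a copy that is safe to mutate.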
lecrapaud/api.py CHANGED
@@ -163,28 +163,96 @@ class ExperimentEngine:
163
163
  **kwargs: Additional configuration parameters
164
164
  """
165
165
 
166
+ # Default values for all experiment parameters
167
+ DEFAULT_PARAMS = {
168
+ # Feature Engineering
169
+ "columns_drop": [],
170
+ "columns_boolean": [],
171
+ "columns_date": [],
172
+ "columns_te_groupby": [],
173
+ "columns_te_target": [],
174
+ "for_training": True,
175
+ # Preprocessing
176
+ "time_series": False,
177
+ "val_size": 0.2,
178
+ "test_size": 0.2,
179
+ "columns_pca": [],
180
+ "pca_temporal": [],
181
+ "pca_cross_sectional": [],
182
+ "columns_onehot": [],
183
+ "columns_binary": [],
184
+ "columns_ordinal": [],
185
+ "columns_frequency": [],
186
+ # Feature Selection
187
+ "percentile": 20,
188
+ "corr_threshold": 80,
189
+ "max_features": 50,
190
+ "max_p_value_categorical": 0.05,
191
+ # Model Selection
192
+ "target_numbers": [],
193
+ "target_clf": [],
194
+ "models_idx": [],
195
+ "max_timesteps": 120,
196
+ "perform_hyperopt": True,
197
+ "number_of_trials": 20,
198
+ "perform_crossval": False,
199
+ "plot": True,
200
+ "preserve_model": True,
201
+ "target_clf_thresholds": {},
202
+ # Data structure
203
+ "date_column": None,
204
+ "group_column": None,
205
+ }
206
+
166
207
  def __init__(self, id: int = None, data: pd.DataFrame = None, **kwargs):
167
208
  """Initialize the experiment engine with either new or existing experiment."""
168
- # Set all kwargs as instance attributes
169
- if "models_idx" in kwargs:
170
- kwargs["models_idx"] = normalize_models_idx(kwargs["models_idx"])
171
- for key, value in kwargs.items():
172
- setattr(self, key, value)
173
209
 
174
210
  if id:
211
+ # Load existing experiment
175
212
  self.experiment = Experiment.get(id)
176
- kwargs.update(self.experiment.context)
177
- experiment_dir = f"{tmp_dir}/{self.experiment.name}"
178
- preprocessing_dir = f"{experiment_dir}/preprocessing"
179
- data_dir = f"{experiment_dir}/data"
180
- os.makedirs(preprocessing_dir, exist_ok=True)
181
- os.makedirs(data_dir, exist_ok=True)
213
+ # Context from DB takes precedence over kwargs
214
+ effective_kwargs = {
215
+ **self.DEFAULT_PARAMS,
216
+ **kwargs,
217
+ **self.experiment.context,
218
+ }
182
219
  else:
183
220
  if data is None:
184
221
  raise ValueError(
185
222
  "Either id or data must be provided. Data can be a path to a folder containing trained models"
186
223
  )
187
- self.experiment = create_experiment(data=data, **kwargs)
224
+ # New experiment: merge defaults with provided kwargs
225
+ effective_kwargs = {**self.DEFAULT_PARAMS, **kwargs}
226
+
227
+ # Normalize models_idx if present
228
+ if "models_idx" in effective_kwargs:
229
+ effective_kwargs["models_idx"] = normalize_models_idx(
230
+ effective_kwargs["models_idx"]
231
+ )
232
+
233
+ # Set all parameters as instance attributes
234
+ for key, value in effective_kwargs.items():
235
+ setattr(self, key, value)
236
+
237
+ # Create experiment if new
238
+ if not id:
239
+ self.experiment = create_experiment(data=data, **effective_kwargs)
240
+
241
+ # Create directories
242
+ experiment_dir = f"{tmp_dir}/{self.experiment.name}"
243
+ preprocessing_dir = f"{experiment_dir}/preprocessing"
244
+ data_dir = f"{experiment_dir}/data"
245
+ os.makedirs(preprocessing_dir, exist_ok=True)
246
+ os.makedirs(data_dir, exist_ok=True)
247
+
248
+ @classmethod
249
+ def get_default_params(cls):
250
+ """Get the default parameters for experiments."""
251
+ return cls.DEFAULT_PARAMS.copy()
252
+
253
+ def get_effective_context(self):
254
+ """Get the effective context (merged defaults + experiment context)."""
255
+ return {k: getattr(self, k, v) for k, v in self.DEFAULT_PARAMS.items()}
188
256
 
189
257
  def train(self, data, best_params=None):
190
258
  logger.info("Running training...")
lecrapaud/config.py CHANGED
@@ -32,6 +32,7 @@ DB_URI: str = (
32
32
  )
33
33
  OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
34
34
  LECRAPAUD_LOGFILE = os.getenv("LECRAPAUD_LOGFILE")
35
- LECRAPAUD_LOCAL = os.getenv("LECRAPAUD_LOCAL", False)
36
35
  LECRAPAUD_TABLE_PREFIX = os.getenv("LECRAPAUD_TABLE_PREFIX", "lecrapaud")
37
- LECRAPAUD_OPTIMIZATION_BACKEND = os.getenv("LECRAPAUD_OPTIMIZATION_BACKEND", "ray").lower()
36
+ LECRAPAUD_OPTIMIZATION_BACKEND = os.getenv(
37
+ "LECRAPAUD_OPTIMIZATION_BACKEND", "ray"
38
+ ).lower()
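With `LECRAPAUD_LOCAL` removed, the remaining settings here are plain environment variables, and the optimization backend value is lowercased at import time. A small sketch of configuring them before importing the package (values are illustrative):

```python
# Sketch: environment-based configuration read by lecrapaud/config.py
import os

os.environ["LECRAPAUD_TABLE_PREFIX"] = "lecrapaud"    # default prefix for DB tables
os.environ["LECRAPAUD_OPTIMIZATION_BACKEND"] = "RAY"  # normalized to "ray" on import

from lecrapaud.config import LECRAPAUD_OPTIMIZATION_BACKEND
print(LECRAPAUD_OPTIMIZATION_BACKEND)  # "ray"
```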
lecrapaud/db/alembic/versions/2025_10_28_2006-0a8fb7826e9b_add_number_of_targets_and_remove_other_.py CHANGED
@@ -5,38 +5,71 @@ Revises: 033e0f7eca4f
5
5
  Create Date: 2025-10-28 20:06:54.792631
6
6
 
7
7
  """
8
+
8
9
  from typing import Sequence, Union
9
10
 
10
11
  from alembic import op
11
12
  import sqlalchemy as sa
12
13
  from sqlalchemy.dialects import mysql
14
+ from lecrapaud.config import LECRAPAUD_TABLE_PREFIX
13
15
 
14
16
  # revision identifiers, used by Alembic.
15
- revision: str = '0a8fb7826e9b'
16
- down_revision: Union[str, None] = '033e0f7eca4f'
17
+ revision: str = "0a8fb7826e9b"
18
+ down_revision: Union[str, None] = "033e0f7eca4f"
17
19
  branch_labels: Union[str, Sequence[str], None] = None
18
20
  depends_on: Union[str, Sequence[str], None] = None
19
21
 
20
22
 
21
23
  def upgrade() -> None:
22
24
  # ### commands auto generated by Alembic - please adjust! ###
23
- op.add_column('lecrapaud_experiments', sa.Column('number_of_targets', sa.Integer(), nullable=True))
24
- op.drop_column('lecrapaud_experiments', 'corr_threshold')
25
- op.drop_column('lecrapaud_experiments', 'max_features')
26
- op.drop_column('lecrapaud_experiments', 'percentile')
27
- op.drop_column('lecrapaud_experiments', 'type')
28
- op.drop_index(op.f('ix_model_selection_scores_id'), table_name='lecrapaud_model_selection_scores')
29
- op.create_index(op.f('ix_lecrapaud_model_selection_scores_id'), 'lecrapaud_model_selection_scores', ['id'], unique=False)
25
+ op.add_column(
26
+ f"{LECRAPAUD_TABLE_PREFIX}_experiments",
27
+ sa.Column("number_of_targets", sa.Integer(), nullable=True),
28
+ )
29
+ op.drop_column(f"{LECRAPAUD_TABLE_PREFIX}_experiments", "corr_threshold")
30
+ op.drop_column(f"{LECRAPAUD_TABLE_PREFIX}_experiments", "max_features")
31
+ op.drop_column(f"{LECRAPAUD_TABLE_PREFIX}_experiments", "percentile")
32
+ op.drop_column(f"{LECRAPAUD_TABLE_PREFIX}_experiments", "type")
33
+ op.drop_index(
34
+ op.f("ix_model_selection_scores_id"),
35
+ table_name=f"{LECRAPAUD_TABLE_PREFIX}_model_selection_scores",
36
+ )
37
+ op.create_index(
38
+ op.f("ix_model_selection_scores_id"),
39
+ f"{LECRAPAUD_TABLE_PREFIX}_model_selection_scores",
40
+ ["id"],
41
+ unique=False,
42
+ )
30
43
  # ### end Alembic commands ###
31
44
 
32
45
 
33
46
  def downgrade() -> None:
34
47
  # ### commands auto generated by Alembic - please adjust! ###
35
- op.drop_index(op.f('ix_lecrapaud_model_selection_scores_id'), table_name='lecrapaud_model_selection_scores')
36
- op.create_index(op.f('ix_model_selection_scores_id'), 'lecrapaud_model_selection_scores', ['id'], unique=False)
37
- op.add_column('lecrapaud_experiments', sa.Column('type', mysql.VARCHAR(length=50), nullable=False))
38
- op.add_column('lecrapaud_experiments', sa.Column('percentile', mysql.FLOAT(), nullable=False))
39
- op.add_column('lecrapaud_experiments', sa.Column('max_features', mysql.INTEGER(), autoincrement=False, nullable=False))
40
- op.add_column('lecrapaud_experiments', sa.Column('corr_threshold', mysql.FLOAT(), nullable=False))
41
- op.drop_column('lecrapaud_experiments', 'number_of_targets')
48
+ op.drop_index(
49
+ op.f("ix_lecrapaud_model_selection_scores_id"),
50
+ table_name=f"{LECRAPAUD_TABLE_PREFIX}_model_selection_scores",
51
+ )
52
+ op.create_index(
53
+ op.f("ix_model_selection_scores_id"),
54
+ f"{LECRAPAUD_TABLE_PREFIX}_model_selection_scores",
55
+ ["id"],
56
+ unique=False,
57
+ )
58
+ op.add_column(
59
+ f"{LECRAPAUD_TABLE_PREFIX}_experiments",
60
+ sa.Column("type", mysql.VARCHAR(length=50), nullable=False),
61
+ )
62
+ op.add_column(
63
+ f"{LECRAPAUD_TABLE_PREFIX}_experiments",
64
+ sa.Column("percentile", mysql.FLOAT(), nullable=False),
65
+ )
66
+ op.add_column(
67
+ f"{LECRAPAUD_TABLE_PREFIX}_experiments",
68
+ sa.Column("max_features", mysql.INTEGER(), autoincrement=False, nullable=False),
69
+ )
70
+ op.add_column(
71
+ f"{LECRAPAUD_TABLE_PREFIX}_experiments",
72
+ sa.Column("corr_threshold", mysql.FLOAT(), nullable=False),
73
+ )
74
+ op.drop_column(f"{LECRAPAUD_TABLE_PREFIX}_experiments", "number_of_targets")
42
75
  # ### end Alembic commands ###
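The migration now builds table names from `LECRAPAUD_TABLE_PREFIX` instead of hard-coding the `lecrapaud_` prefix, so installations running with a custom prefix migrate the right tables. Illustrative only:

```python
# Illustrative: how the prefix resolves inside the migration above
LECRAPAUD_TABLE_PREFIX = "myapp"  # hypothetical value; the default is "lecrapaud"

print(f"{LECRAPAUD_TABLE_PREFIX}_experiments")             # myapp_experiments
print(f"{LECRAPAUD_TABLE_PREFIX}_model_selection_scores")  # myapp_model_selection_scores
```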
lecrapaud/experiment.py CHANGED
@@ -16,15 +16,21 @@ from lecrapaud.db.session import get_db
16
16
 
17
17
  def create_experiment(
18
18
  data: pd.DataFrame | str,
19
- date_column,
20
- group_column,
21
19
  experiment_name,
20
+ date_column=None,
21
+ group_column=None,
22
22
  **kwargs,
23
23
  ):
24
24
  if isinstance(data, str):
25
25
  path = f"{data}/data/full.pkl"
26
26
  data = joblib.load(path)
27
27
 
28
+ if kwargs.get("time_series") and not date_column:
29
+ raise ValueError("date_column must be provided for time series experiments")
30
+
31
+ if experiment_name is None:
32
+ raise ValueError("experiment_name must be provided")
33
+
28
34
  dates = {}
29
35
  if date_column:
30
36
  dates["start_date"] = pd.to_datetime(data[date_column].iat[0])
lecrapaud/feature_engineering.py CHANGED
@@ -94,7 +94,7 @@ class FeatureEngineeringEngine:
94
94
  self.data = data
95
95
  self.experiment = experiment
96
96
  self.for_training = for_training
97
-
97
+
98
98
  # Get all parameters from experiment context
99
99
  self.columns_drop = self.experiment.context.get("columns_drop", [])
100
100
  self.columns_boolean = self.experiment.context.get("columns_boolean", [])
@@ -330,15 +330,19 @@ class PreprocessFeature:
330
330
  self.test_size = context.get("test_size", 0.2)
331
331
  self.target_numbers = context.get("target_numbers", [])
332
332
  self.target_clf = context.get("target_clf", [])
333
-
333
+
334
334
  # Handle list parameters with uppercase conversion
335
335
  self.columns_pca = [col.upper() for col in context.get("columns_pca", [])]
336
336
  self.pca_temporal = context.get("pca_temporal", [])
337
337
  self.pca_cross_sectional = context.get("pca_cross_sectional", [])
338
338
  self.columns_onehot = [col.upper() for col in context.get("columns_onehot", [])]
339
339
  self.columns_binary = [col.upper() for col in context.get("columns_binary", [])]
340
- self.columns_ordinal = [col.upper() for col in context.get("columns_ordinal", [])]
341
- self.columns_frequency = [col.upper() for col in context.get("columns_frequency", [])]
340
+ self.columns_ordinal = [
341
+ col.upper() for col in context.get("columns_ordinal", [])
342
+ ]
343
+ self.columns_frequency = [
344
+ col.upper() for col in context.get("columns_frequency", [])
345
+ ]
342
346
 
343
347
  self.experiment_dir = self.experiment.path
344
348
  self.experiment_id = self.experiment.id
@@ -653,6 +657,221 @@ class PreprocessFeature:
653
657
 
654
658
  return df, pcas_dict
655
659
 
660
+ def add_pca_feature_cross_sectional_time_series(
661
+ self,
662
+ df: pd.DataFrame,
663
+ *,
664
+ n_components: int = 5,
665
+ pcas: dict[str, Pipeline] | None = None, # if provided: transform only
666
+ impute_strategy: str = "median",
667
+ standardize: bool = True,
668
+ lookback_days: int = 365, # number of days to look back for fitting
669
+ refresh_frequency: int = 90, # refit the PCA every X days
670
+ ) -> tuple[pd.DataFrame, dict[str, Pipeline]]:
671
+ """
672
+ Builds a pivot (index=index_col, columns=columns_col, values=value_col),
673
+ fits (or reuses) an Imputer(+Scaler)+PCA Pipeline, then merges the scores
674
+ (by index_col) back into df. Returns (df_with_features, pipe).
675
+
676
+ For time series: the PCA is fitted on past data only to avoid leakage,
677
+ with periodic refreshes.
678
+
679
+ Handles panel data with several time series
680
+ (e.g. several stocks sharing the same dates).
681
+ """
682
+
683
+ pcas_dict = {}
684
+ index_saved = df.index
685
+
686
+ for pca_cross_sectional in self.pca_cross_sectional:
687
+ name, index_col, columns_col, value_col = (
688
+ pca_cross_sectional[k] for k in ("name", "index", "columns", "value")
689
+ )
690
+ prefix = f"CS_PC_{name}"
691
+
692
+ # Check whether this is a time series with index = date
693
+ # Dates are already ordinals after cyclic_encode_date
694
+ is_time_series = self.time_series and index_col == self.date_column
695
+
696
+ if is_time_series:
697
+ # Special case: cross-sectional PCA on panel time series data
698
+ # For example: PCA on the returns of all stocks at each date
699
+ # to capture the market regime
700
+
701
+ all_scores = []
702
+
703
+ # Dates are already ordinals
704
+ unique_dates = sorted(df[index_col].unique())
705
+
706
+ # For inference, use the provided PCA
707
+ if pcas is not None:
708
+ pipe = pcas[name]
709
+ pivot = df.pivot_table(
710
+ index=index_col, columns=columns_col, values=value_col
711
+ ).sort_index()
712
+ scores = pipe.transform(pivot)
713
+ cols = [f"{prefix}_{i}" for i in range(n_components)]
714
+ scores_df = pd.DataFrame(scores, index=pivot.index, columns=cols)
715
+ else:
716
+ # Training: fit the PCA on an expanding window with periodic refresh
717
+ pipe = None
718
+ last_fit_date = None
719
+
720
+ for i, current_date_ordinal in enumerate(unique_dates):
721
+ # Convert the ordinal back to a date for time calculations
722
+ current_date = pd.Timestamp.fromordinal(
723
+ int(current_date_ordinal)
724
+ )
725
+
726
+ # Decide whether the PCA should be refitted
727
+ should_refit = pipe is None or ( # First time
728
+ last_fit_date is not None
729
+ and (current_date - last_fit_date).days >= refresh_frequency
730
+ )
731
+
732
+ if (
733
+ should_refit and i > 30
734
+ ): # Wait for at least 30 days of data
735
+ # Take the data from the last 'lookback_days' days
736
+ lookback_start_date = current_date - pd.Timedelta(
737
+ days=lookback_days
738
+ )
739
+ lookback_start_ordinal = pd.Timestamp.toordinal(
740
+ lookback_start_date
741
+ )
742
+
743
+ # Mask keeping past dates only (to avoid leakage)
744
+ mask_fit = (df[index_col] >= lookback_start_ordinal) & (
745
+ df[index_col] < current_date_ordinal
746
+ )
747
+ df_fit = df[mask_fit]
748
+
749
+ if len(df_fit) > 0:
750
+ # Build the pivot for the lookback period
751
+ pivot_fit = df_fit.pivot_table(
752
+ index=index_col,
753
+ columns=columns_col,
754
+ values=value_col,
755
+ ).sort_index()
756
+
757
+ # Check that there are enough dates and columns
758
+ if (
759
+ len(pivot_fit) >= n_components
760
+ and pivot_fit.shape[1] >= n_components
761
+ ):
762
+ # Create a new pipeline
763
+ steps = [
764
+ (
765
+ "imputer",
766
+ SimpleImputer(strategy=impute_strategy),
767
+ )
768
+ ]
769
+ if standardize:
770
+ steps.append(
771
+ (
772
+ "scaler",
773
+ StandardScaler(
774
+ with_mean=True, with_std=True
775
+ ),
776
+ )
777
+ )
778
+ pca = PCA(n_components=n_components, random_state=0)
779
+ steps.append(("pca", pca))
780
+ pipe = Pipeline(steps)
781
+ pipe.fit(pivot_fit)
782
+ last_fit_date = current_date
783
+
784
+ logger.debug(
785
+ f"PCA {name} refitted at date {current_date.strftime('%Y-%m-%d')} "
786
+ f"using {len(pivot_fit)} dates and {pivot_fit.shape[1]} columns"
787
+ )
788
+
789
+ # Transform the current date only
790
+ if pipe is not None:
791
+ df_current = df[df[index_col] == current_date_ordinal]
792
+ if len(df_current) > 0:
793
+ pivot_current = df_current.pivot_table(
794
+ index=index_col,
795
+ columns=columns_col,
796
+ values=value_col,
797
+ )
798
+ try:
799
+ scores_current = pipe.transform(pivot_current)
800
+ scores_dict = {
801
+ index_col: [current_date_ordinal],
802
+ **{
803
+ f"{prefix}_{j}": [scores_current[0, j]]
804
+ for j in range(n_components)
805
+ },
806
+ }
807
+ all_scores.append(pd.DataFrame(scores_dict))
808
+ except Exception as e:
809
+ # On error (e.g. new columns), fill with missing values
810
+ logger.debug(
811
+ f"PCA transform error at date {current_date}: {str(e)}"
812
+ )
813
+ scores_dict = {
814
+ index_col: [current_date_ordinal],
815
+ **{
816
+ f"{prefix}_{j}": [np.nan]
817
+ for j in range(n_components)
818
+ },
819
+ }
820
+ all_scores.append(pd.DataFrame(scores_dict))
821
+ else:
822
+ # No PCA fitted yet, emit NaNs
823
+ scores_dict = {
824
+ index_col: [current_date_ordinal],
825
+ **{
826
+ f"{prefix}_{j}": [np.nan]
827
+ for j in range(n_components)
828
+ },
829
+ }
830
+ all_scores.append(pd.DataFrame(scores_dict))
831
+
832
+ # Combine all scores
833
+ if all_scores:
834
+ scores_df = pd.concat(all_scores, ignore_index=True)
835
+ else:
836
+ # Create an empty DataFrame with the right columns
837
+ cols = [f"{prefix}_{i}" for i in range(n_components)]
838
+ scores_df = pd.DataFrame(columns=[index_col] + cols)
839
+
840
+ # Merge the scores
841
+ df = df.merge(scores_df, on=index_col, how="left")
842
+ df.index = index_saved
843
+ pcas_dict.update({name: pipe})
844
+
845
+ else:
846
+ # Classic approach (not time series, or index != date)
847
+ pivot = df.pivot_table(
848
+ index=index_col, columns=columns_col, values=value_col
849
+ ).sort_index()
850
+
851
+ # Pipeline reused between train and test
852
+ if pcas is None:
853
+ steps = [("imputer", SimpleImputer(strategy=impute_strategy))]
854
+ if standardize:
855
+ steps.append(
856
+ ("scaler", StandardScaler(with_mean=True, with_std=True))
857
+ )
858
+ pca = PCA(n_components=n_components, random_state=0)
859
+ steps.append(("pca", pca))
860
+ pipe = Pipeline(steps)
861
+ pipe.fit(pivot) # <- fit on TRAIN only
862
+ else:
863
+ pipe = pcas[name] # <- TEST: reuse the existing pipeline
864
+
865
+ scores = pipe.transform(pivot) # shape: (n_index, n_components)
866
+ cols = [f"{prefix}_{i}" for i in range(n_components)]
867
+ scores_df = pd.DataFrame(scores, index=pivot.index, columns=cols)
868
+
869
+ df = df.merge(scores_df.reset_index(), on=index_col, how="left")
870
+ df.index = index_saved
871
+ pcas_dict.update({name: pipe})
872
+
873
+ return df, pcas_dict
874
+
656
875
  # ----------------- 2) PCA TEMPORELLE (liste de colonnes lags) ----------------
657
876
  def add_pca_feature_temporal(
658
877
  self,
lecrapaud/utils.py CHANGED
@@ -11,7 +11,7 @@ import re
11
11
  import string
12
12
 
13
13
  from lecrapaud.directories import logger_dir
14
- from lecrapaud.config import LOGGING_LEVEL, PYTHON_ENV, LECRAPAUD_LOCAL
14
+ from lecrapaud.config import LOGGING_LEVEL, PYTHON_ENV
15
15
 
16
16
 
17
17
  _LECRAPAUD_LOGGER_ALREADY_CONFIGURED = False
@@ -237,7 +237,7 @@ def serialize_for_json(obj):
237
237
  import numpy as np
238
238
  from datetime import datetime, date
239
239
  import pandas as pd
240
-
240
+
241
241
  # Handle NumPy types
242
242
  if isinstance(obj, (np.integer, np.int64, np.int32, np.int16)):
243
243
  return int(obj)
@@ -247,11 +247,11 @@ def serialize_for_json(obj):
247
247
  return obj.tolist()
248
248
  elif isinstance(obj, np.bool_):
249
249
  return bool(obj)
250
-
250
+
251
251
  # Handle datetime types
252
252
  elif isinstance(obj, (datetime, date, pd.Timestamp)):
253
253
  return obj.isoformat()
254
-
254
+
255
255
  # Handle basic Python types
256
256
  elif isinstance(obj, (str, int, float, bool, type(None))):
257
257
  return obj
lecrapaud-0.21.0.dist-info/METADATA ADDED
@@ -0,0 +1,347 @@
1
+ Metadata-Version: 2.4
2
+ Name: lecrapaud
3
+ Version: 0.21.0
4
+ Summary: Framework for machine and deep learning, with regression, classification and time series analysis
5
+ License: Apache License
6
+ License-File: LICENSE
7
+ Author: Pierre H. Gallet
8
+ Requires-Python: ==3.12.*
9
+ Classifier: License :: Other/Proprietary License
10
+ Classifier: Programming Language :: Python :: 3
11
+ Classifier: Programming Language :: Python :: 3.12
12
+ Requires-Dist: catboost (>=1.2.8)
13
+ Requires-Dist: category-encoders (>=2.8.1)
14
+ Requires-Dist: celery (>=5.5.3)
15
+ Requires-Dist: celery-redbeat (>=2.3.2)
16
+ Requires-Dist: ftfy (>=6.3.1)
17
+ Requires-Dist: hyperopt (>=0.2.7)
18
+ Requires-Dist: joblib (>=1.5.1)
19
+ Requires-Dist: keras (>=3.10.0)
20
+ Requires-Dist: keras-tcn (>=3.5.6)
21
+ Requires-Dist: lightgbm (>=4.6.0)
22
+ Requires-Dist: matplotlib (>=3.10.3)
23
+ Requires-Dist: mlxtend (>=0.23.4)
24
+ Requires-Dist: numpy (>=2.1.3)
25
+ Requires-Dist: openai (>=1.88.0)
26
+ Requires-Dist: pandas (>=2.3.0)
27
+ Requires-Dist: pydantic (>=2.9.2)
28
+ Requires-Dist: python-dotenv (>=1.1.0)
29
+ Requires-Dist: scikit-learn (>=1.6.1)
30
+ Requires-Dist: scipy (<1.14.0)
31
+ Requires-Dist: seaborn (>=0.13.2)
32
+ Requires-Dist: sqlalchemy (>=2.0.41)
33
+ Requires-Dist: tensorboardx (>=2.6.4)
34
+ Requires-Dist: tensorflow (>=2.19.0)
35
+ Requires-Dist: tiktoken (>=0.9.0)
36
+ Requires-Dist: tqdm (>=4.67.1)
37
+ Requires-Dist: xgboost (>=3.0.2)
38
+ Description-Content-Type: text/markdown
39
+
40
+ <div align="center">
41
+
42
+ <img src="https://s3.amazonaws.com/pix.iemoji.com/images/emoji/apple/ios-12/256/frog-face.png" width=120 alt="crapaud"/>
43
+
44
+ ## Welcome to LeCrapaud
45
+
46
+ **An all-in-one machine learning framework**
47
+
48
+ [![GitHub stars](https://img.shields.io/github/stars/pierregallet/lecrapaud.svg?style=flat&logo=github&colorB=blue&label=stars)](https://github.com/pierregallet/lecrapaud/stargazers)
49
+ [![PyPI version](https://badge.fury.io/py/lecrapaud.svg)](https://badge.fury.io/py/lecrapaud)
50
+ [![Python versions](https://img.shields.io/pypi/pyversions/lecrapaud.svg)](https://pypi.org/project/lecrapaud)
51
+ [![License](https://img.shields.io/github/license/pierregallet/lecrapaud.svg)](https://github.com/pierregallet/lecrapaud/blob/main/LICENSE)
52
+ [![codecov](https://codecov.io/gh/pierregallet/lecrapaud/branch/main/graph/badge.svg)](https://codecov.io/gh/pierregallet/lecrapaud)
53
+
54
+ </div>
55
+
56
+ ## 🚀 Introduction
57
+
58
+ LeCrapaud is a high-level Python library for end-to-end machine learning workflows on tabular data, with a focus on financial and stock datasets. It provides a simple API to handle feature engineering, model selection, training, and prediction, all in a reproducible and modular way.
59
+
60
+ ## ✨ Key Features
61
+
62
+ - 🧩 Modular pipeline: Feature engineering, preprocessing, selection, and modeling as independent steps
63
+ - 🤖 Automated model selection and hyperparameter optimization
64
+ - 📊 Easy integration with pandas DataFrames
65
+ - 🔬 Supports both regression and classification tasks
66
+ - 🛠️ Simple API for both full pipeline and step-by-step usage
67
+ - 📦 Ready for production and research workflows
68
+
69
+ ## ⚡ Quick Start
70
+
71
+
72
+ ### Install the package
73
+
74
+ ```sh
75
+ pip install lecrapaud
76
+ ```
77
+
78
+ ### How it works
79
+
80
+ This package provides a high-level API to manage experiments for feature engineering, model selection, and prediction on tabular data (e.g. stock data).
81
+
82
+ ### Typical workflow
83
+
84
+ ```python
85
+ from lecrapaud import LeCrapaud
86
+
87
+ # 1. Create the main app
88
+ app = LeCrapaud(uri=uri)
89
+
90
+ # 2. Define your experiment context (see your notebook or api.py for all options)
91
+ context = {
92
+ "data": your_dataframe,
93
+ "columns_drop": [...],
94
+ "columns_date": [...],
95
+ # ... other config options
96
+ }
97
+
98
+ # 3. Create an experiment
99
+ experiment = app.create_experiment(**context)
100
+
101
+ # 4. Run the full training pipeline
102
+ experiment.train(your_dataframe)
103
+
104
+ # 5. Make predictions on new data
105
+ predictions = experiment.predict(new_data)
106
+ ```
107
+
108
+ ### Database Configuration (Required)
109
+
110
+ LeCrapaud requires access to a MySQL database to store experiments and results. You must either:
111
+
112
+ - Pass a valid MySQL URI to the `LeCrapaud` constructor:
113
+ ```python
114
+ app = LeCrapaud(uri="mysql+pymysql://user:password@host:port/dbname")
115
+ ```
116
+ - **OR** set the following environment variables before using the package:
117
+ - `DB_USER`, `DB_PASSWORD`, `DB_HOST`, `DB_PORT`, `DB_NAME`
118
+ - Or set `DB_URI` directly with your full connection string.
119
+
120
+ If neither is provided, database operations will not work.
121
+
122
+ ### Using OpenAI Embeddings (Optional)
123
+
124
+ If you want to use the `columns_pca` embedding feature (for advanced feature engineering), you must set the `OPENAI_API_KEY` environment variable with your OpenAI API key:
125
+
126
+ ```sh
127
+ export OPENAI_API_KEY=sk-...
128
+ ```
129
+
130
+ If this variable is not set, features relying on OpenAI embeddings will not be available.
131
+
132
+ ### Experiment Context Arguments
133
+
134
+ The experiment context is a dictionary containing all configuration parameters for your ML pipeline. Parameters are stored in the experiment's database record and automatically retrieved when loading an existing experiment.
135
+
136
+ #### Required Parameters
137
+
138
+ | Parameter | Type | Description | Example |
139
+ |-------------------|-----------|------------------------------------------------------|------------------------|
140
+ | `data` | DataFrame | Input dataset (required for new experiments only) | `pd.DataFrame(...)` |
141
+ | `experiment_name` | str | Unique name for the experiment | `'stock_prediction'` |
142
+ | `date_column` | str | Name of the date column (required for time series) | `'DATE'` |
143
+ | `group_column` | str | Name of the group column (required for panel data) | `'STOCK'` |
144
+
145
+ #### Feature Engineering Parameters
146
+
147
+ | Parameter | Type | Default | Description |
148
+ |-----------------------|-------|---------|--------------------------------------------------------------------------|
149
+ | `columns_drop` | list | `[]` | Columns to drop during feature engineering |
150
+ | `columns_boolean` | list | `[]` | Columns to convert to boolean features |
151
+ | `columns_date` | list | `[]` | Date columns for cyclic encoding |
152
+ | `columns_te_groupby` | list | `[]` | Groupby columns for target encoding |
153
+ | `columns_te_target` | list | `[]` | Target columns for target encoding |
154
+
155
+ #### Preprocessing Parameters
156
+
157
+ | Parameter | Type | Default | Description |
158
+ |-------------------------|-------|---------|-----------------------------------------------------------------------|
159
+ | `time_series` | bool | `False` | Whether data is time series |
160
+ | `val_size` | float | `0.2` | Validation set size (fraction) |
161
+ | `test_size` | float | `0.2` | Test set size (fraction) |
162
+ | `columns_pca` | list | `[]` | Columns for PCA transformation |
163
+ | `pca_temporal` | list | `[]` | Temporal PCA config (e.g., lag features) |
164
+ | `pca_cross_sectional` | list | `[]` | Cross-sectional PCA config (e.g., market regime) |
165
+ | `columns_onehot` | list | `[]` | Columns for one-hot encoding |
166
+ | `columns_binary` | list | `[]` | Columns for binary encoding |
167
+ | `columns_ordinal` | list | `[]` | Columns for ordinal encoding |
168
+ | `columns_frequency` | list | `[]` | Columns for frequency encoding |
169
+
170
+ #### Feature Selection Parameters
171
+
172
+ | Parameter | Type | Default | Description |
173
+ |-----------------------------|-------|---------|------------------------------------------------------------------|
174
+ | `percentile` | float | `20` | Percentage of features to keep per selection method |
175
+ | `corr_threshold` | float | `80` | Maximum correlation threshold (%) between features |
176
+ | `max_features` | int | `50` | Maximum number of final features |
177
+ | `max_p_value_categorical` | float | `0.05` | Maximum p-value for categorical feature selection (Chi2) |
178
+
179
+ #### Model Selection Parameters
180
+
181
+ | Parameter | Type | Default | Description |
182
+ |------------------------|-------|---------|-----------------------------------------------------------------------|
183
+ | `target_numbers` | list | `[]` | List of target indices to predict |
184
+ | `target_clf` | list | `[]` | Classification target indices |
185
+ | `models_idx` | list | `[]` | Model indices or names to use (e.g., `[1, 'xgb', 'lgb']`) |
186
+ | `max_timesteps` | int | `120` | Maximum timesteps for recurrent models |
187
+ | `perform_hyperopt` | bool | `True` | Whether to perform hyperparameter optimization |
188
+ | `number_of_trials` | int | `20` | Number of hyperopt trials |
189
+ | `perform_crossval` | bool | `False` | Whether to use cross-validation during hyperopt |
190
+ | `plot` | bool | `True` | Whether to generate plots |
191
+ | `preserve_model` | bool | `True` | Whether to save the best model |
192
+ | `target_clf_thresholds`| dict | `{}` | Classification thresholds per target |
193
+
194
+ #### Example Context Configuration
195
+
196
+ ```python
197
+ context = {
198
+ # Required parameters
199
+ "experiment_name": f"stock_{datetime.now().strftime('%Y%m%d_%H%M%S')}",
200
+ "date_column": "DATE",
201
+ "group_column": "STOCK",
202
+
203
+ # Feature selection
204
+ "corr_threshold": 80,
205
+ "max_features": 20,
206
+ "percentile": 20,
207
+ "max_p_value_categorical": 0.05,
208
+
209
+ # Feature engineering
210
+ "columns_drop": ["SECURITY", "ISIN", "ID"],
211
+ "columns_boolean": [],
212
+ "columns_date": ["DATE"],
213
+ "columns_te_groupby": [["SECTOR", "DATE"]],
214
+ "columns_te_target": ["RET", "VOLUME"],
215
+
216
+ # Preprocessing
217
+ "time_series": True,
218
+ "val_size": 0.2,
219
+ "test_size": 0.2,
220
+ "pca_temporal": [
221
+ {"name": "LAST_20_RET", "columns": [f"RET_-{i}" for i in range(1, 21)]},
222
+ ],
223
+ "pca_cross_sectional": [
224
+ {
225
+ "name": "MARKET_REGIME",
226
+ "index": "DATE",
227
+ "columns": "STOCK",
228
+ "value": "RET",
229
+ }
230
+ ],
231
+ "columns_onehot": ["BUY_SIGNAL"],
232
+ "columns_binary": ["SECTOR", "LOCATION"],
233
+ "columns_ordinal": ["STOCK"],
234
+
235
+ # Model selection
236
+ "target_numbers": [1, 2, 3],
237
+ "target_clf": [1],
238
+ "models_idx": ["xgb", "lgb", "catboost"],
239
+ "max_timesteps": 120,
240
+ "perform_hyperopt": True,
241
+ "number_of_trials": 50,
242
+ "perform_crossval": True,
243
+ "plot": True,
244
+ "preserve_model": True,
245
+ "target_clf_thresholds": {1: {"precision": 0.80}},
246
+ }
247
+
248
+ # Create experiment
249
+ experiment = app.create_experiment(data=your_dataframe, **context)
250
+ ```
251
+
252
+ #### Important Notes
253
+
254
+ 1. **Context Persistence**: All context parameters are saved in the database when creating an experiment and automatically restored when loading it.
255
+
256
+ 2. **Parameter Precedence**: When loading an existing experiment, the stored context takes precedence over any parameters passed to the constructor.
257
+
258
+ 3. **PCA Time Series**: For time series data with `pca_cross_sectional` where index equals `date_column`, the system automatically uses an expanding window approach to prevent data leakage.
259
+
260
+ 4. **OpenAI Embeddings**: If using `columns_pca` with text columns, ensure `OPENAI_API_KEY` is set as an environment variable.
261
+
262
+ 5. **Model Indices**: The `models_idx` parameter accepts both integer indices and string names (e.g., `'xgb'`, `'lgb'`, `'catboost'`).
263
+
264
+
265
+
266
+ ### Modular usage
267
+
268
+ You can also use each step independently:
269
+
270
+ ```python
271
+ data_eng = experiment.feature_engineering(data)
272
+ train, val, test = experiment.preprocess_feature(data_eng)
273
+ features = experiment.feature_selection(train)
274
+ std_data, reshaped_data = experiment.preprocess_model(train, val, test)
275
+ experiment.model_selection(std_data, reshaped_data)
276
+ ```
277
+
278
+ ## ⚠️ Using Alembic in Your Project (Important for Integrators)
279
+
280
+ If you use Alembic for migrations in your own project and you share the same database with LeCrapaud, you must ensure that Alembic does **not** attempt to drop or modify LeCrapaud tables (those prefixed with `{LECRAPAUD_TABLE_PREFIX}_`).
281
+
282
+ By default, Alembic's autogenerate feature will propose to drop any table that exists in the database but is not present in your project's models. To prevent this, add the following filter to your `env.py`:
283
+
284
+ ```python
285
+ def include_object(object, name, type_, reflected, compare_to):
286
+ if type_ == "table" and name.startswith(f"{LECRAPAUD_TABLE_PREFIX}_"):
287
+ return False # Ignore LeCrapaud tables
288
+ return True
289
+
290
+ context.configure(
291
+ # ... other options ...
292
+ include_object=include_object,
293
+ )
294
+ ```
295
+
296
+ This will ensure that Alembic ignores all tables created by LeCrapaud when generating migrations for your own project.
297
+
298
+ ---
299
+
300
+ ## 🤝 Contributing
301
+
302
+ ### Reminders for Github usage
303
+
304
+ 1. Creating Github repository
305
+
306
+ ```sh
307
+ $ brew install gh
308
+ $ gh auth login
309
+ $ gh repo create
310
+ ```
311
+
312
+ 2. Initializing git and first commit to distant repository
313
+
314
+ ```sh
315
+ $ git init
316
+ $ git add .
317
+ $ git commit -m 'first commit'
318
+ $ git remote add origin <YOUR_REPO_URL>
319
+ $ git push -u origin master
320
+ ```
321
+
322
+ 3. Use conventional commits
323
+ https://www.conventionalcommits.org/en/v1.0.0/#summary
324
+
325
+ 4. Create environment
326
+
327
+ ```sh
328
+ $ pip install virtualenv
329
+ $ python -m venv .venv
330
+ $ source .venv/bin/activate
331
+ ```
332
+
333
+ 5. Install dependencies
334
+
335
+ ```sh
336
+ $ make install
337
+ ```
338
+
339
+ 6. Deactivate virtualenv (if needed)
340
+
341
+ ```sh
342
+ $ deactivate
343
+ ```
344
+
345
+ ---
346
+
347
+ Pierre Gallet © 2025
lecrapaud-0.21.0.dist-info/RECORD CHANGED
@@ -1,6 +1,6 @@
1
- lecrapaud/__init__.py,sha256=oCxbtw_nk8rlOXbXbWo0RRMlsh6w-hTiZ6e5PRG_wp0,28
2
- lecrapaud/api.py,sha256=IQlH3wcSzxYgvlamfICNMwNsQGoaNxBJUPTlC9M0kBk,20321
3
- lecrapaud/config.py,sha256=QK1MxWsEddXii02Rme31tCGDyMFsfHHF2Zy-lLIOuSY,1218
1
+ lecrapaud/__init__.py,sha256=7Wp_VF08UZP8o-GkpB4_yRjP4twQmpcTc3202OkPmHs,176
2
+ lecrapaud/api.py,sha256=7OL_wbg9hCmlZ0WI6eCDkublntES3f320OZlpuKu8f4,22376
3
+ lecrapaud/config.py,sha256=0NEg61QdLxQ97bVFDDXa6OwlWFEo_z8VIhX5KrD1ik0,1170
4
4
  lecrapaud/db/__init__.py,sha256=82o9fMfaqKXPh2_rt44EzNRVZV1R4LScEnQYvj_TjK0,34
5
5
  lecrapaud/db/alembic/README,sha256=MVlc9TYmr57RbhXET6QxgyCcwWP7w-vLkEsirENqiIQ,38
6
6
  lecrapaud/db/alembic/env.py,sha256=RvTTBa3bDVBxmDtapAfzUoeWBgmVQU3s9U6HmQCAP84,2421
@@ -14,7 +14,7 @@ lecrapaud/db/alembic/versions/2025_08_28_1516-c36e9fee22b9_add_avg_precision_to_
14
14
  lecrapaud/db/alembic/versions/2025_08_28_1622-8b11c1ba982e_change_name_column.py,sha256=g6H2Z9MwB6UEiqdGlBoHBXpO9DTaWkwHt8FS6joVOm0,1191
15
15
  lecrapaud/db/alembic/versions/2025_10_25_0635-07e303521594_add_unique_constraint_to_score.py,sha256=FshOF1t-NWXrBtXT3wMNGFslJ4sWUxzvBEXSymu05cI,1043
16
16
  lecrapaud/db/alembic/versions/2025_10_26_1727-033e0f7eca4f_merge_score_and_model_trainings_into_.py,sha256=htHUD4zPJr-0z_DQfTi8r9RsFVe9m7SL0f7oRIvLIcQ,10999
17
- lecrapaud/db/alembic/versions/2025_10_28_2006-0a8fb7826e9b_add_number_of_targets_and_remove_other_.py,sha256=o3TNHq1GTFjxfk2zHWaUbq91khMJi6Xy6HToO9i54AU,2051
17
+ lecrapaud/db/alembic/versions/2025_10_28_2006-0a8fb7826e9b_add_number_of_targets_and_remove_other_.py,sha256=0NBvOwPqMXpWnDEGiEBk_IeLKmXQ5ZcU-dqHeSEgsRQ,2557
18
18
  lecrapaud/db/alembic.ini,sha256=Zw2rdwsKV6c7J1SPtoFIPDX08_oTP3MuUKnNxBDiY8I,3796
19
19
  lecrapaud/db/models/__init__.py,sha256=-XoCN1eeLihnNxBMl90lXrgrTSDkMbeqgienMqFi5f4,702
20
20
  lecrapaud/db/models/base.py,sha256=0548x4ftd6Oim9BJmtD7Er4izM6u0QCrlTG5560384w,9458
@@ -29,8 +29,8 @@ lecrapaud/db/models/target.py,sha256=DKnfeaLU8eT8J_oh_vuFo5-o1CaoXR13xBbswme6Bgk
29
29
  lecrapaud/db/models/utils.py,sha256=-a-nWWmpJ2XzidIxo2COVUTrGZIPYCfBzjhcszJj_bM,1109
30
30
  lecrapaud/db/session.py,sha256=u9NCwUoV5VbtScRb6HOSQr4oTEjIwj0waP5mGlc1qJg,3735
31
31
  lecrapaud/directories.py,sha256=0LrANuDgbuneSLker60c6q2hmGnQ3mKHIztTGzTx6Gw,826
32
- lecrapaud/experiment.py,sha256=TYECkPqZNVqQQaSg8u5fZ3UvxKYCzc3f-mYVlikCz4s,2234
33
- lecrapaud/feature_engineering.py,sha256=UM-EIOsgYWedqsR9uA-09eaWSb9FofVxoE0rRcDelQ8,39173
32
+ lecrapaud/experiment.py,sha256=LiecZS3P4igO_3nJ4IB-2b25CttQS2RePDnhBNucvdE,2478
33
+ lecrapaud/feature_engineering.py,sha256=lfY14RS303_izt3OcnLhTvsPbWUWZY5ES_0HNcbBezc,50017
34
34
  lecrapaud/feature_selection.py,sha256=Q9xWVmZsvRjX9mJHB_PY_KLXsEAYNLX7txSe0cniY4A,47529
35
35
  lecrapaud/integrations/openai_integration.py,sha256=hHLF3fk5Bps8KNbNrEL3NUFa945jwClE6LrLpuMZOd4,7459
36
36
  lecrapaud/jobs/__init__.py,sha256=ZkrsyTOR21c_wN7RY8jPhm8jCrL1oCEtTsf3VFIlQiE,292
@@ -43,8 +43,8 @@ lecrapaud/misc/test-gpu-resnet.ipynb,sha256=27Vu7nYwujYeh3fOxBNCnKJn3MXNPKZU-U8o
43
43
  lecrapaud/misc/test-gpu-transformers.ipynb,sha256=k6MBSs_Um1h4PykvE-LTBcdpbWLbIFST_xl_AFW2jgI,8444
44
44
  lecrapaud/model_selection.py,sha256=o4_hOEp91_33HtMatVHU7YPc71KZ2hK7wucN63xqWkA,88017
45
45
  lecrapaud/search_space.py,sha256=caCehJklD3-sgmlisJj_GmuB7LJiVvTF71gEjPGDvV4,36336
46
- lecrapaud/utils.py,sha256=vsNBd2Nnhpjo65Ugz2GFJaRhq3U3_eWERfofpevo5Ls,8884
47
- lecrapaud-0.20.1.dist-info/METADATA,sha256=gCEqDJXok9Ti9DQ32XRqU4cH0blMCrSBAOLPTy9viXE,11137
48
- lecrapaud-0.20.1.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
49
- lecrapaud-0.20.1.dist-info/licenses/LICENSE,sha256=MImCryu0AnqhJE_uAZD-PIDKXDKb8sT7v0i1NOYeHTM,11350
50
- lecrapaud-0.20.1.dist-info/RECORD,,
46
+ lecrapaud/utils.py,sha256=0k76HFETO0_NgCYUv8b3RTBLgry6MsDBaHJfpAplxCY,8855
47
+ lecrapaud-0.21.0.dist-info/METADATA,sha256=TziJTM9CXoayu3hwlHqCIiqvWIbvTaZhRv0XbYaLuRQ,14348
48
+ lecrapaud-0.21.0.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
49
+ lecrapaud-0.21.0.dist-info/licenses/LICENSE,sha256=MImCryu0AnqhJE_uAZD-PIDKXDKb8sT7v0i1NOYeHTM,11350
50
+ lecrapaud-0.21.0.dist-info/RECORD,,
lecrapaud-0.20.1.dist-info/METADATA DELETED
@@ -1,250 +0,0 @@
1
- Metadata-Version: 2.4
2
- Name: lecrapaud
3
- Version: 0.20.1
4
- Summary: Framework for machine and deep learning, with regression, classification and time series analysis
5
- License: Apache License
6
- License-File: LICENSE
7
- Author: Pierre H. Gallet
8
- Requires-Python: ==3.12.*
9
- Classifier: License :: Other/Proprietary License
10
- Classifier: Programming Language :: Python :: 3
11
- Classifier: Programming Language :: Python :: 3.12
12
- Requires-Dist: catboost (>=1.2.8)
13
- Requires-Dist: category-encoders (>=2.8.1)
14
- Requires-Dist: celery (>=5.5.3)
15
- Requires-Dist: ftfy (>=6.3.1)
16
- Requires-Dist: joblib (>=1.5.1)
17
- Requires-Dist: keras (>=3.10.0)
18
- Requires-Dist: lightgbm (>=4.6.0)
19
- Requires-Dist: matplotlib (>=3.10.3)
20
- Requires-Dist: mlxtend (>=0.23.4)
21
- Requires-Dist: numpy (>=2.1.3)
22
- Requires-Dist: openai (>=1.88.0)
23
- Requires-Dist: pandas (>=2.3.0)
24
- Requires-Dist: pydantic (>=2.9.2)
25
- Requires-Dist: python-dotenv (>=1.1.0)
26
- Requires-Dist: scikit-learn (>=1.6.1)
27
- Requires-Dist: scipy (<1.14.0)
28
- Requires-Dist: seaborn (>=0.13.2)
29
- Requires-Dist: sqlalchemy (>=2.0.41)
30
- Requires-Dist: tensorboardx (>=2.6.4)
31
- Requires-Dist: tensorflow (>=2.19.0)
32
- Requires-Dist: tiktoken (>=0.9.0)
33
- Requires-Dist: tqdm (>=4.67.1)
34
- Requires-Dist: xgboost (>=3.0.2)
35
- Description-Content-Type: text/markdown
36
-
37
- <div align="center">
38
-
39
- <img src="https://s3.amazonaws.com/pix.iemoji.com/images/emoji/apple/ios-12/256/frog-face.png" width=120 alt="crapaud"/>
40
-
41
- ## Welcome to LeCrapaud
42
-
43
- **An all-in-one machine learning framework**
44
-
45
- [![GitHub stars](https://img.shields.io/github/stars/pierregallet/lecrapaud.svg?style=flat&logo=github&colorB=blue&label=stars)](https://github.com/pierregallet/lecrapaud/stargazers)
46
- [![PyPI version](https://badge.fury.io/py/lecrapaud.svg)](https://badge.fury.io/py/lecrapaud)
47
- [![Python versions](https://img.shields.io/pypi/pyversions/lecrapaud.svg)](https://pypi.org/project/lecrapaud)
48
- [![License](https://img.shields.io/github/license/pierregallet/lecrapaud.svg)](https://github.com/pierregallet/lecrapaud/blob/main/LICENSE)
49
- [![codecov](https://codecov.io/gh/pierregallet/lecrapaud/branch/main/graph/badge.svg)](https://codecov.io/gh/pierregallet/lecrapaud)
50
-
51
- </div>
52
-
53
- ## 🚀 Introduction
54
-
55
- LeCrapaud is a high-level Python library for end-to-end machine learning workflows on tabular data, with a focus on financial and stock datasets. It provides a simple API to handle feature engineering, model selection, training, and prediction, all in a reproducible and modular way.
56
-
57
- ## ✨ Key Features
58
-
59
- - 🧩 Modular pipeline: Feature engineering, preprocessing, selection, and modeling as independent steps
60
- - 🤖 Automated model selection and hyperparameter optimization
61
- - 📊 Easy integration with pandas DataFrames
62
- - 🔬 Supports both regression and classification tasks
63
- - 🛠️ Simple API for both full pipeline and step-by-step usage
64
- - 📦 Ready for production and research workflows
65
-
66
- ## ⚡ Quick Start
67
-
68
-
69
- ### Install the package
70
-
71
- ```sh
72
- pip install lecrapaud
73
- ```
74
-
75
- ### How it works
76
-
77
- This package provides a high-level API to manage experiments for feature engineering, model selection, and prediction on tabular data (e.g. stock data).
78
-
79
- ### Typical workflow
80
-
81
- ```python
82
- from lecrapaud import LeCrapaud
83
-
84
- # 1. Create the main app
85
- app = LeCrapaud(uri=uri)
86
-
87
- # 2. Define your experiment context (see your notebook or api.py for all options)
88
- context = {
89
- "data": your_dataframe,
90
- "columns_drop": [...],
91
- "columns_date": [...],
92
- # ... other config options
93
- }
94
-
95
- # 3. Create an experiment
96
- experiment = app.create_experiment(**context)
97
-
98
- # 4. Run the full training pipeline
99
- experiment.train(your_dataframe)
100
-
101
- # 5. Make predictions on new data
102
- predictions = experiment.predict(new_data)
103
- ```
104
-
105
- ### Database Configuration (Required)
106
-
107
- LeCrapaud requires access to a MySQL database to store experiments and results. You must either:
108
-
109
- - Pass a valid MySQL URI to the `LeCrapaud` constructor:
110
- ```python
111
- app = LeCrapaud(uri="mysql+pymysql://user:password@host:port/dbname")
112
- ```
113
- - **OR** set the following environment variables before using the package:
114
- - `DB_USER`, `DB_PASSWORD`, `DB_HOST`, `DB_PORT`, `DB_NAME`
115
- - Or set `DB_URI` directly with your full connection string.
116
-
117
- If neither is provided, database operations will not work.
118
-
119
- ### Using OpenAI Embeddings (Optional)
120
-
121
- If you want to use the `columns_pca` embedding feature (for advanced feature engineering), you must set the `OPENAI_API_KEY` environment variable with your OpenAI API key:
122
-
123
- ```sh
124
- export OPENAI_API_KEY=sk-...
125
- ```
126
-
127
- If this variable is not set, features relying on OpenAI embeddings will not be available.
128
-
129
- ### Experiment Context Arguments
130
-
131
- Below are the main arguments you can pass to `create_experiment` (or the `Experiment` class):
132
-
133
- | Argument | Type | Description | Example/Default |
134
- | -------------------- | --------- | ---------------------------------------------------------------------------------------- | ------------------ |
135
- | `columns_binary` | list | Columns to treat as binary | `['flag']` |
136
- | `columns_boolean` | list | Columns to treat as boolean | `['is_active']` |
137
- | `columns_date` | list | Columns to treat as dates | `['date']` |
138
- | `columns_drop` | list | Columns to drop during feature engineering | `['col1', 'col2']` |
139
- | `columns_frequency` | list | Columns to frequency encode | `['category']` |
140
- | `columns_onehot` | list | Columns to one-hot encode | `['sector']` |
141
- | `columns_ordinal` | list | Columns to ordinal encode | `['grade']` |
142
- | `columns_pca` | list | Columns to use for PCA/embeddings (requires `OPENAI_API_KEY` if using OpenAI embeddings) | `['text_col']` |
143
- | `columns_te_groupby` | list | Columns for target encoding groupby | `['sector']` |
144
- | `columns_te_target` | list | Columns for target encoding target | `['target']` |
145
- | `data` | DataFrame | Your main dataset (required for new experiment) | `your_dataframe` |
146
- | `date_column` | str | Name of the date column | `'date'` |
147
- | `experiment_name` | str | Name for the training session | `'my_session'` |
148
- | `group_column` | str | Name of the group column | `'stock_id'` |
149
- | `max_timesteps` | int | Max timesteps for time series models | `30` |
150
- | `models_idx` | list | Indices of models to use for model selection | `[0, 1, 2]` |
151
- | `number_of_trials` | int | Number of trials for hyperparameter optimization | `20` |
152
- | `perform_crossval` | bool | Whether to perform cross-validation | `True`/`False` |
153
- | `perform_hyperopt` | bool | Whether to perform hyperparameter optimization | `True`/`False` |
154
- | `plot` | bool | Whether to plot results | `True`/`False` |
155
- | `preserve_model` | bool | Whether to preserve the best model | `True`/`False` |
156
- | `target_clf` | list | List of classification target column indices/names | `[1, 2, 3]` |
157
- | `target_mclf` | list | Multi-class classification targets (not yet implemented) | `[11]` |
158
- | `target_numbers` | list | List of regression target column indices/names | `[1, 2, 3]` |
159
- | `test_size` | int/float | Test set size (count or fraction) | `0.2` |
160
- | `time_series` | bool | Whether the data is time series | `True`/`False` |
161
- | `val_size` | int/float | Validation set size (count or fraction) | `0.2` |
162
-
163
- **Note:**
164
- - Not all arguments are required; defaults may exist for some.
165
- - For `columns_pca` with OpenAI embeddings, you must set the `OPENAI_API_KEY` environment variable.
166
-
167
-
168
-
169
- ### Modular usage
170
-
171
- You can also use each step independently:
172
-
173
- ```python
174
- data_eng = experiment.feature_engineering(data)
175
- train, val, test = experiment.preprocess_feature(data_eng)
176
- features = experiment.feature_selection(train)
177
- std_data, reshaped_data = experiment.preprocess_model(train, val, test)
178
- experiment.model_selection(std_data, reshaped_data)
179
- ```
180
-
181
- ## ⚠️ Using Alembic in Your Project (Important for Integrators)
182
-
183
- If you use Alembic for migrations in your own project and you share the same database with LeCrapaud, you must ensure that Alembic does **not** attempt to drop or modify LeCrapaud tables (those prefixed with `{LECRAPAUD_TABLE_PREFIX}_`).
184
-
185
- By default, Alembic's autogenerate feature will propose to drop any table that exists in the database but is not present in your project's models. To prevent this, add the following filter to your `env.py`:
186
-
187
- ```python
188
- def include_object(object, name, type_, reflected, compare_to):
189
- if type_ == "table" and name.startswith(f"{LECRAPAUD_TABLE_PREFIX}_"):
190
- return False # Ignore LeCrapaud tables
191
- return True
192
-
193
- context.configure(
194
- # ... other options ...
195
- include_object=include_object,
196
- )
197
- ```
198
-
199
- This will ensure that Alembic ignores all tables created by LeCrapaud when generating migrations for your own project.
200
-
201
- ---
202
-
203
- ## 🤝 Contributing
204
-
205
- ### Reminders for Github usage
206
-
207
- 1. Creating Github repository
208
-
209
- ```sh
210
- $ brew install gh
211
- $ gh auth login
212
- $ gh repo create
213
- ```
214
-
215
- 2. Initializing git and first commit to distant repository
216
-
217
- ```sh
218
- $ git init
219
- $ git add .
220
- $ git commit -m 'first commit'
221
- $ git remote add origin <YOUR_REPO_URL>
222
- $ git push -u origin master
223
- ```
224
-
225
- 3. Use conventional commits
226
- https://www.conventionalcommits.org/en/v1.0.0/#summary
227
-
228
- 4. Create environment
229
-
230
- ```sh
231
- $ pip install virtualenv
232
- $ python -m venv .venv
233
- $ source .venv/bin/activate
234
- ```
235
-
236
- 5. Install dependencies
237
-
238
- ```sh
239
- $ make install
240
- ```
241
-
242
- 6. Deactivate virtualenv (if needed)
243
-
244
- ```sh
245
- $ deactivate
246
- ```
247
-
248
- ---
249
-
250
- Pierre Gallet © 2025