lecrapaud 0.20.2__py3-none-any.whl → 0.21.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of lecrapaud might be problematic.

lecrapaud/__init__.py CHANGED
@@ -1 +1,5 @@
1
1
  from lecrapaud.api import *
2
+
3
+ # Export default parameters for easy access
4
+ from lecrapaud.api import ExperimentEngine
5
+ DEFAULT_EXPERIMENT_PARAMS = ExperimentEngine.DEFAULT_PARAMS
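The package now re-exports ExperimentEngine and its default parameter table at the top level. A minimal usage sketch (illustrative only; it assumes lecrapaud 0.21.1 is installed and importable):

import lecrapaud

# DEFAULT_EXPERIMENT_PARAMS is the same dict as ExperimentEngine.DEFAULT_PARAMS
print(lecrapaud.DEFAULT_EXPERIMENT_PARAMS["val_size"])      # 0.2
print(lecrapaud.DEFAULT_EXPERIMENT_PARAMS["max_features"])  # 50

# get_default_params() (added in api.py below) returns a copy that is safe to mutate
defaults = lecrapaud.ExperimentEngine.get_default_params()
defaults["val_size"] = 0.3  # does not touch the class-level defaults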
lecrapaud/api.py CHANGED
@@ -163,28 +163,96 @@ class ExperimentEngine:
163
163
  **kwargs: Additional configuration parameters
164
164
  """
165
165
 
166
+ # Default values for all experiment parameters
167
+ DEFAULT_PARAMS = {
168
+ # Feature Engineering
169
+ "columns_drop": [],
170
+ "columns_boolean": [],
171
+ "columns_date": [],
172
+ "columns_te_groupby": [],
173
+ "columns_te_target": [],
174
+ "for_training": True,
175
+ # Preprocessing
176
+ "time_series": False,
177
+ "val_size": 0.2,
178
+ "test_size": 0.2,
179
+ "columns_pca": [],
180
+ "pca_temporal": [],
181
+ "pca_cross_sectional": [],
182
+ "columns_onehot": [],
183
+ "columns_binary": [],
184
+ "columns_ordinal": [],
185
+ "columns_frequency": [],
186
+ # Feature Selection
187
+ "percentile": 20,
188
+ "corr_threshold": 80,
189
+ "max_features": 50,
190
+ "max_p_value_categorical": 0.05,
191
+ # Model Selection
192
+ "target_numbers": [],
193
+ "target_clf": [],
194
+ "models_idx": [],
195
+ "max_timesteps": 120,
196
+ "perform_hyperopt": True,
197
+ "number_of_trials": 20,
198
+ "perform_crossval": False,
199
+ "plot": True,
200
+ "preserve_model": True,
201
+ "target_clf_thresholds": {},
202
+ # Data structure
203
+ "date_column": None,
204
+ "group_column": None,
205
+ }
206
+
166
207
  def __init__(self, id: int = None, data: pd.DataFrame = None, **kwargs):
167
208
  """Initialize the experiment engine with either new or existing experiment."""
168
- # Set all kwargs as instance attributes
169
- if "models_idx" in kwargs:
170
- kwargs["models_idx"] = normalize_models_idx(kwargs["models_idx"])
171
- for key, value in kwargs.items():
172
- setattr(self, key, value)
173
209
 
174
210
  if id:
211
+ # Load existing experiment
175
212
  self.experiment = Experiment.get(id)
176
- kwargs.update(self.experiment.context)
177
- experiment_dir = f"{tmp_dir}/{self.experiment.name}"
178
- preprocessing_dir = f"{experiment_dir}/preprocessing"
179
- data_dir = f"{experiment_dir}/data"
180
- os.makedirs(preprocessing_dir, exist_ok=True)
181
- os.makedirs(data_dir, exist_ok=True)
213
+ # Context from DB takes precedence over kwargs
214
+ effective_kwargs = {
215
+ **self.DEFAULT_PARAMS,
216
+ **kwargs,
217
+ **self.experiment.context,
218
+ }
182
219
  else:
183
220
  if data is None:
184
221
  raise ValueError(
185
222
  "Either id or data must be provided. Data can be a path to a folder containing trained models"
186
223
  )
187
- self.experiment = create_experiment(data=data, **kwargs)
224
+ # New experiment: merge defaults with provided kwargs
225
+ effective_kwargs = {**self.DEFAULT_PARAMS, **kwargs}
226
+
227
+ # Normalize models_idx if present
228
+ if "models_idx" in effective_kwargs:
229
+ effective_kwargs["models_idx"] = normalize_models_idx(
230
+ effective_kwargs["models_idx"]
231
+ )
232
+
233
+ # Set all parameters as instance attributes
234
+ for key, value in effective_kwargs.items():
235
+ setattr(self, key, value)
236
+
237
+ # Create experiment if new
238
+ if not id:
239
+ self.experiment = create_experiment(data=data, **effective_kwargs)
240
+
241
+ # Create directories
242
+ experiment_dir = f"{tmp_dir}/{self.experiment.name}"
243
+ preprocessing_dir = f"{experiment_dir}/preprocessing"
244
+ data_dir = f"{experiment_dir}/data"
245
+ os.makedirs(preprocessing_dir, exist_ok=True)
246
+ os.makedirs(data_dir, exist_ok=True)
247
+
248
+ @classmethod
249
+ def get_default_params(cls):
250
+ """Get the default parameters for experiments."""
251
+ return cls.DEFAULT_PARAMS.copy()
252
+
253
+ def get_effective_context(self):
254
+ """Get the effective context (merged defaults + experiment context)."""
255
+ return {k: getattr(self, k, v) for k, v in self.DEFAULT_PARAMS.items()}
188
256
 
189
257
  def train(self, data, best_params=None):
190
258
  logger.info("Running training...")
lecrapaud/db/alembic/versions/2025_10_28_2006-0a8fb7826e9b_add_number_of_targets_and_remove_other_.py CHANGED
@@ -5,38 +5,71 @@ Revises: 033e0f7eca4f
5
5
  Create Date: 2025-10-28 20:06:54.792631
6
6
 
7
7
  """
8
+
8
9
  from typing import Sequence, Union
9
10
 
10
11
  from alembic import op
11
12
  import sqlalchemy as sa
12
13
  from sqlalchemy.dialects import mysql
14
+ from lecrapaud.config import LECRAPAUD_TABLE_PREFIX
13
15
 
14
16
  # revision identifiers, used by Alembic.
15
- revision: str = '0a8fb7826e9b'
16
- down_revision: Union[str, None] = '033e0f7eca4f'
17
+ revision: str = "0a8fb7826e9b"
18
+ down_revision: Union[str, None] = "033e0f7eca4f"
17
19
  branch_labels: Union[str, Sequence[str], None] = None
18
20
  depends_on: Union[str, Sequence[str], None] = None
19
21
 
20
22
 
21
23
  def upgrade() -> None:
22
24
  # ### commands auto generated by Alembic - please adjust! ###
23
- op.add_column('lecrapaud_experiments', sa.Column('number_of_targets', sa.Integer(), nullable=True))
24
- op.drop_column('lecrapaud_experiments', 'corr_threshold')
25
- op.drop_column('lecrapaud_experiments', 'max_features')
26
- op.drop_column('lecrapaud_experiments', 'percentile')
27
- op.drop_column('lecrapaud_experiments', 'type')
28
- op.drop_index(op.f('ix_model_selection_scores_id'), table_name='lecrapaud_model_selection_scores')
29
- op.create_index(op.f('ix_lecrapaud_model_selection_scores_id'), 'lecrapaud_model_selection_scores', ['id'], unique=False)
25
+ op.add_column(
26
+ f"{LECRAPAUD_TABLE_PREFIX}_experiments",
27
+ sa.Column("number_of_targets", sa.Integer(), nullable=True),
28
+ )
29
+ op.drop_column(f"{LECRAPAUD_TABLE_PREFIX}_experiments", "corr_threshold")
30
+ op.drop_column(f"{LECRAPAUD_TABLE_PREFIX}_experiments", "max_features")
31
+ op.drop_column(f"{LECRAPAUD_TABLE_PREFIX}_experiments", "percentile")
32
+ op.drop_column(f"{LECRAPAUD_TABLE_PREFIX}_experiments", "type")
33
+ op.drop_index(
34
+ op.f("ix_model_selection_scores_id"),
35
+ table_name=f"{LECRAPAUD_TABLE_PREFIX}_model_selection_scores",
36
+ )
37
+ op.create_index(
38
+ op.f("ix_model_selection_scores_id"),
39
+ f"{LECRAPAUD_TABLE_PREFIX}_model_selection_scores",
40
+ ["id"],
41
+ unique=False,
42
+ )
30
43
  # ### end Alembic commands ###
31
44
 
32
45
 
33
46
  def downgrade() -> None:
34
47
  # ### commands auto generated by Alembic - please adjust! ###
35
- op.drop_index(op.f('ix_lecrapaud_model_selection_scores_id'), table_name='lecrapaud_model_selection_scores')
36
- op.create_index(op.f('ix_model_selection_scores_id'), 'lecrapaud_model_selection_scores', ['id'], unique=False)
37
- op.add_column('lecrapaud_experiments', sa.Column('type', mysql.VARCHAR(length=50), nullable=False))
38
- op.add_column('lecrapaud_experiments', sa.Column('percentile', mysql.FLOAT(), nullable=False))
39
- op.add_column('lecrapaud_experiments', sa.Column('max_features', mysql.INTEGER(), autoincrement=False, nullable=False))
40
- op.add_column('lecrapaud_experiments', sa.Column('corr_threshold', mysql.FLOAT(), nullable=False))
41
- op.drop_column('lecrapaud_experiments', 'number_of_targets')
48
+ op.drop_index(
49
+ op.f("ix_lecrapaud_model_selection_scores_id"),
50
+ table_name=f"{LECRAPAUD_TABLE_PREFIX}_model_selection_scores",
51
+ )
52
+ op.create_index(
53
+ op.f("ix_model_selection_scores_id"),
54
+ f"{LECRAPAUD_TABLE_PREFIX}_model_selection_scores",
55
+ ["id"],
56
+ unique=False,
57
+ )
58
+ op.add_column(
59
+ f"{LECRAPAUD_TABLE_PREFIX}_experiments",
60
+ sa.Column("type", mysql.VARCHAR(length=50), nullable=False),
61
+ )
62
+ op.add_column(
63
+ f"{LECRAPAUD_TABLE_PREFIX}_experiments",
64
+ sa.Column("percentile", mysql.FLOAT(), nullable=False),
65
+ )
66
+ op.add_column(
67
+ f"{LECRAPAUD_TABLE_PREFIX}_experiments",
68
+ sa.Column("max_features", mysql.INTEGER(), autoincrement=False, nullable=False),
69
+ )
70
+ op.add_column(
71
+ f"{LECRAPAUD_TABLE_PREFIX}_experiments",
72
+ sa.Column("corr_threshold", mysql.FLOAT(), nullable=False),
73
+ )
74
+ op.drop_column(f"{LECRAPAUD_TABLE_PREFIX}_experiments", "number_of_targets")
42
75
  # ### end Alembic commands ###
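Table names in this migration are now derived from LECRAPAUD_TABLE_PREFIX rather than hard-coded. A rough illustration, assuming the default prefix matches the previous literals:

# Illustrative only; the real value comes from lecrapaud/config.py.
LECRAPAUD_TABLE_PREFIX = "lecrapaud"
experiments_table = f"{LECRAPAUD_TABLE_PREFIX}_experiments"            # "lecrapaud_experiments"
scores_table = f"{LECRAPAUD_TABLE_PREFIX}_model_selection_scores"      # "lecrapaud_model_selection_scores"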
lecrapaud/experiment.py CHANGED
@@ -28,6 +28,9 @@ def create_experiment(
28
28
  if kwargs.get("time_series") and not date_column:
29
29
  raise ValueError("date_column must be provided for time series experiments")
30
30
 
31
+ if experiment_name is None:
32
+ raise ValueError("experiment_name must be provided")
33
+
31
34
  dates = {}
32
35
  if date_column:
33
36
  dates["start_date"] = pd.to_datetime(data[date_column].iat[0])
lecrapaud/feature_engineering.py CHANGED
@@ -94,7 +94,7 @@ class FeatureEngineeringEngine:
94
94
  self.data = data
95
95
  self.experiment = experiment
96
96
  self.for_training = for_training
97
-
97
+
98
98
  # Get all parameters from experiment context
99
99
  self.columns_drop = self.experiment.context.get("columns_drop", [])
100
100
  self.columns_boolean = self.experiment.context.get("columns_boolean", [])
@@ -330,15 +330,19 @@ class PreprocessFeature:
330
330
  self.test_size = context.get("test_size", 0.2)
331
331
  self.target_numbers = context.get("target_numbers", [])
332
332
  self.target_clf = context.get("target_clf", [])
333
-
333
+
334
334
  # Handle list parameters with uppercase conversion
335
335
  self.columns_pca = [col.upper() for col in context.get("columns_pca", [])]
336
336
  self.pca_temporal = context.get("pca_temporal", [])
337
337
  self.pca_cross_sectional = context.get("pca_cross_sectional", [])
338
338
  self.columns_onehot = [col.upper() for col in context.get("columns_onehot", [])]
339
339
  self.columns_binary = [col.upper() for col in context.get("columns_binary", [])]
340
- self.columns_ordinal = [col.upper() for col in context.get("columns_ordinal", [])]
341
- self.columns_frequency = [col.upper() for col in context.get("columns_frequency", [])]
340
+ self.columns_ordinal = [
341
+ col.upper() for col in context.get("columns_ordinal", [])
342
+ ]
343
+ self.columns_frequency = [
344
+ col.upper() for col in context.get("columns_frequency", [])
345
+ ]
342
346
 
343
347
  self.experiment_dir = self.experiment.path
344
348
  self.experiment_id = self.experiment.id
@@ -601,7 +605,7 @@ class PreprocessFeature:
601
605
 
602
606
  return df, pcas_dict
603
607
 
604
- def add_pca_feature_cross_sectional(
608
+ def add_pca_feature_cross_sectional_old(
605
609
  self,
606
610
  df: pd.DataFrame,
607
611
  *,
@@ -653,8 +657,228 @@ class PreprocessFeature:
653
657
 
654
658
  return df, pcas_dict
655
659
 
660
+ def add_pca_feature_cross_sectional(
661
+ self,
662
+ df: pd.DataFrame,
663
+ *,
664
+ n_components: int = 5,
665
+ pcas: dict[str, Pipeline] | None = None, # si fourni: transform only
666
+ impute_strategy: str = "median",
667
+ standardize: bool = True,
668
+ lookback_days: int = 365, # nombre de jours à regarder en arrière pour le fit
669
+ refresh_frequency: int = 90, # refresh la PCA tous les X jours
670
+ ) -> tuple[pd.DataFrame, dict[str, Pipeline]]:
671
+ """
672
+ Construit un pivot (index=index_col, columns=columns_col, values=value_col),
673
+ fit (ou réutilise) un Pipeline Imputer(+Scaler)+PCA, puis merge les scores
674
+ (par index_col) dans df. Renvoie (df_avec_features, pipe).
675
+
676
+ Pour les séries temporelles : fit la PCA uniquement sur les données passées
677
+ pour éviter le leakage, avec refresh périodique.
678
+
679
+ Gère le cas des données panel où on a plusieurs séries temporelles
680
+ (ex: plusieurs stocks avec les mêmes dates).
681
+ """
682
+
683
+ pcas_dict = {}
684
+ index_saved = df.index
685
+
686
+ for pca_cross_sectional in self.pca_cross_sectional:
687
+ name, index_col, columns_col, value_col = (
688
+ pca_cross_sectional[k] for k in ("name", "index", "columns", "value")
689
+ )
690
+ prefix = f"CS_PC_{name}"
691
+
692
+ # Vérifier si c'est une série temporelle avec index = date
693
+ # Les dates sont déjà en ordinal après cyclic_encode_date
694
+ is_time_series = self.time_series and index_col == self.date_column
695
+
696
+ if is_time_series:
697
+ # Cas spécial : PCA cross-sectional sur des données de panel time series
698
+ # Par exemple : PCA sur les returns de tous les stocks à chaque date
699
+ # pour capturer le régime de marché
700
+
701
+ all_scores = []
702
+
703
+ # Les dates sont déjà en ordinal
704
+ unique_dates = sorted(df[index_col].unique())
705
+
706
+ # Pour l'inference, utiliser la PCA fournie
707
+ if pcas is not None:
708
+ pipe = pcas[name]
709
+ pivot = df.pivot_table(
710
+ index=index_col, columns=columns_col, values=value_col
711
+ ).sort_index()
712
+ scores = pipe.transform(pivot)
713
+ cols = [f"{prefix}_{i}" for i in range(n_components)]
714
+ scores_df = pd.DataFrame(scores, index=pivot.index, columns=cols)
715
+ else:
716
+ # Training : fit PCA de manière expanding avec refresh périodique
717
+ pipe = None
718
+ last_fit_date = None
719
+
720
+ for i, current_date_ordinal in enumerate(unique_dates):
721
+ # Convertir l'ordinal en date pour les calculs de temps
722
+ current_date = pd.Timestamp.fromordinal(
723
+ int(current_date_ordinal)
724
+ )
725
+
726
+ # Déterminer si on doit refitter la PCA
727
+ should_refit = pipe is None or ( # Première fois
728
+ last_fit_date is not None
729
+ and (current_date - last_fit_date).days >= refresh_frequency
730
+ )
731
+
732
+ if (
733
+ should_refit and i > 30
734
+ ): # Attendre au moins 30 jours de données
735
+ # Prendre les données des 'lookback_days' derniers jours
736
+ lookback_start_date = current_date - pd.Timedelta(
737
+ days=lookback_days
738
+ )
739
+ lookback_start_ordinal = pd.Timestamp.toordinal(
740
+ lookback_start_date
741
+ )
742
+
743
+ # Masque pour les dates passées uniquement (éviter le leakage)
744
+ mask_fit = (df[index_col] >= lookback_start_ordinal) & (
745
+ df[index_col] < current_date_ordinal
746
+ )
747
+ df_fit = df[mask_fit]
748
+
749
+ if len(df_fit) > 0:
750
+ # Créer le pivot pour la période de lookback
751
+ pivot_fit = df_fit.pivot_table(
752
+ index=index_col,
753
+ columns=columns_col,
754
+ values=value_col,
755
+ ).sort_index()
756
+
757
+ # Vérifier qu'on a assez de dates et de colonnes
758
+ if (
759
+ len(pivot_fit) >= n_components
760
+ and pivot_fit.shape[1] >= n_components
761
+ ):
762
+ # Créer nouveau pipeline
763
+ steps = [
764
+ (
765
+ "imputer",
766
+ SimpleImputer(strategy=impute_strategy),
767
+ )
768
+ ]
769
+ if standardize:
770
+ steps.append(
771
+ (
772
+ "scaler",
773
+ StandardScaler(
774
+ with_mean=True, with_std=True
775
+ ),
776
+ )
777
+ )
778
+ pca = PCA(n_components=n_components, random_state=0)
779
+ steps.append(("pca", pca))
780
+ pipe = Pipeline(steps)
781
+ pipe.fit(pivot_fit)
782
+ last_fit_date = current_date
783
+
784
+ logger.debug(
785
+ f"PCA {name} refitted at date {current_date.strftime('%Y-%m-%d')} "
786
+ f"using {len(pivot_fit)} dates and {pivot_fit.shape[1]} columns"
787
+ )
788
+
789
+ # Transform pour la date courante uniquement
790
+ if pipe is not None:
791
+ df_current = df[df[index_col] == current_date_ordinal]
792
+ if len(df_current) > 0:
793
+ pivot_current = df_current.pivot_table(
794
+ index=index_col,
795
+ columns=columns_col,
796
+ values=value_col,
797
+ )
798
+ try:
799
+ scores_current = pipe.transform(pivot_current)
800
+ scores_dict = {
801
+ index_col: [current_date_ordinal],
802
+ **{
803
+ f"{prefix}_{j}": [scores_current[0, j]]
804
+ for j in range(n_components)
805
+ },
806
+ }
807
+ all_scores.append(pd.DataFrame(scores_dict))
808
+ except Exception as e:
809
+ # En cas d'erreur (ex: nouvelles colonnes), créer des valeurs manquantes
810
+ logger.debug(
811
+ f"PCA transform error at date {current_date}: {str(e)}"
812
+ )
813
+ scores_dict = {
814
+ index_col: [current_date_ordinal],
815
+ **{
816
+ f"{prefix}_{j}": [np.nan]
817
+ for j in range(n_components)
818
+ },
819
+ }
820
+ all_scores.append(pd.DataFrame(scores_dict))
821
+ else:
822
+ # Pas encore de PCA fittée, créer des NaN
823
+ scores_dict = {
824
+ index_col: [current_date_ordinal],
825
+ **{
826
+ f"{prefix}_{j}": [np.nan]
827
+ for j in range(n_components)
828
+ },
829
+ }
830
+ all_scores.append(pd.DataFrame(scores_dict))
831
+
832
+ # Combiner tous les scores
833
+ if all_scores:
834
+ scores_df = pd.concat(all_scores, ignore_index=True)
835
+ else:
836
+ # Créer un DataFrame vide avec les bonnes colonnes
837
+ cols = [f"{prefix}_{i}" for i in range(n_components)]
838
+ scores_df = pd.DataFrame(columns=[index_col] + cols)
839
+
840
+ # Merger les scores
841
+ df = df.merge(scores_df, on=index_col, how="left")
842
+ df.index = index_saved
843
+
844
+ # Forward fill puis 0 pour éviter les NaN
845
+ pca_cols = [col for col in df.columns if col.startswith(prefix)]
846
+ df[pca_cols] = df[pca_cols].fillna(method='ffill').fillna(0)
847
+
848
+ pcas_dict.update({name: pipe})
849
+
850
+ else:
851
+ # Approche classique (non time series ou index != date)
852
+ pivot = df.pivot_table(
853
+ index=index_col, columns=columns_col, values=value_col
854
+ ).sort_index()
855
+
856
+ # Pipeline à réutiliser entre train et test
857
+ if pcas is None:
858
+ steps = [("imputer", SimpleImputer(strategy=impute_strategy))]
859
+ if standardize:
860
+ steps.append(
861
+ ("scaler", StandardScaler(with_mean=True, with_std=True))
862
+ )
863
+ pca = PCA(n_components=n_components, random_state=0)
864
+ steps.append(("pca", pca))
865
+ pipe = Pipeline(steps)
866
+ pipe.fit(pivot) # <- fit sur TRAIN uniquement
867
+ else:
868
+ pipe = pcas[name] # <- TEST : on réutilise le pipe existant
869
+
870
+ scores = pipe.transform(pivot) # shape: (n_index, n_components)
871
+ cols = [f"{prefix}_{i}" for i in range(n_components)]
872
+ scores_df = pd.DataFrame(scores, index=pivot.index, columns=cols)
873
+
874
+ df = df.merge(scores_df.reset_index(), on=index_col, how="left")
875
+ df.index = index_saved
876
+ pcas_dict.update({name: pipe})
877
+
878
+ return df, pcas_dict
879
+
656
880
  # ----------------- 2) PCA TEMPORELLE (liste de colonnes lags) ----------------
657
- def add_pca_feature_temporal(
881
+ def add_pca_feature_temporal_old(
658
882
  self,
659
883
  df: pd.DataFrame,
660
884
  *,
@@ -717,6 +941,187 @@ class PreprocessFeature:
717
941
 
718
942
  return df, pcas_dict
719
943
 
944
+ def add_pca_feature_temporal(
945
+ self,
946
+ df: pd.DataFrame,
947
+ *,
948
+ n_components: int = 5,
949
+ pcas: dict[str, Pipeline] | None = None,
950
+ impute_strategy: str = "median",
951
+ standardize: bool = True,
952
+ lookback_days: int = 365,
953
+ refresh_frequency: int = 90,
954
+ ) -> tuple[pd.DataFrame, dict[str, Pipeline]]:
955
+ """
956
+ PCA temporelle pour time series avec support panel data.
957
+ Crée automatiquement les colonnes de lags et évite le look-ahead bias.
958
+
959
+ Format pca_temporal simplifié:
960
+ [{"name": "LAST_20_RET", "column": "RET", "lags": 20}]
961
+ """
962
+ pcas_dict = {}
963
+
964
+ for pca_config in self.pca_temporal:
965
+ # Support both old and new format
966
+ if "columns" in pca_config:
967
+ # Old format: use existing columns
968
+ name = pca_config["name"]
969
+ lag_columns = pca_config["columns"]
970
+ base_column = None
971
+ num_lags = len(lag_columns)
972
+ else:
973
+ # New format: create lag columns
974
+ name = pca_config["name"]
975
+ base_column = pca_config["column"].upper()
976
+ num_lags = pca_config.get("lags", 20)
977
+
978
+ # Create lag columns if they don't exist
979
+ if self.group_column:
980
+ # Panel data: create lags by group
981
+ for lag in range(1, num_lags + 1):
982
+ lag_col = f"{base_column}_-{lag}"
983
+ if lag_col not in df.columns:
984
+ df[lag_col] = df.groupby(self.group_column)[base_column].shift(lag)
985
+ else:
986
+ # Simple time series
987
+ for lag in range(1, num_lags + 1):
988
+ lag_col = f"{base_column}_-{lag}"
989
+ if lag_col not in df.columns:
990
+ df[lag_col] = df[base_column].shift(lag)
991
+
992
+ lag_columns = [f"{base_column}_-{i}" for i in range(1, num_lags + 1)]
993
+
994
+ prefix = f"TMP_PC_{name}"
995
+
996
+ # For time series: avoid look-ahead bias
997
+ if self.time_series and self.date_column:
998
+ all_scores = []
999
+ unique_dates = sorted(df[self.date_column].unique())
1000
+
1001
+ if pcas is not None:
1002
+ # Inference: use provided PCA
1003
+ pipe = pcas[name]
1004
+
1005
+ # Apply to all data at once
1006
+ mask = df[lag_columns].notna().all(axis=1)
1007
+ if mask.any():
1008
+ X_transform = df.loc[mask, lag_columns]
1009
+ scores = pipe.transform(X_transform)
1010
+
1011
+ for i in range(n_components):
1012
+ df.loc[mask, f"{prefix}_{i}"] = scores[:, i]
1013
+
1014
+ # Fill NaN with forward fill then 0
1015
+ pca_cols = [f"{prefix}_{i}" for i in range(n_components)]
1016
+ df[pca_cols] = df[pca_cols].fillna(method='ffill').fillna(0)
1017
+
1018
+ else:
1019
+ # Training: expanding window with periodic refresh
1020
+ pipe = None
1021
+ last_fit_date = None
1022
+
1023
+ for current_date_ordinal in unique_dates:
1024
+ current_date = pd.Timestamp.fromordinal(int(current_date_ordinal))
1025
+
1026
+ # Determine if we should refit
1027
+ should_refit = pipe is None or (
1028
+ last_fit_date is not None
1029
+ and (current_date - last_fit_date).days >= refresh_frequency
1030
+ )
1031
+
1032
+ if should_refit and len(df[df[self.date_column] < current_date_ordinal]) > num_lags * 2:
1033
+ # Get historical data for fitting
1034
+ lookback_start = current_date - pd.Timedelta(days=lookback_days)
1035
+ lookback_start_ordinal = pd.Timestamp.toordinal(lookback_start)
1036
+
1037
+ mask_fit = (
1038
+ (df[self.date_column] >= lookback_start_ordinal) &
1039
+ (df[self.date_column] < current_date_ordinal) &
1040
+ df[lag_columns].notna().all(axis=1)
1041
+ )
1042
+
1043
+ if mask_fit.sum() >= n_components:
1044
+ X_fit = df.loc[mask_fit, lag_columns]
1045
+
1046
+ # Create pipeline
1047
+ steps = []
1048
+ if impute_strategy is not None:
1049
+ steps.append(("imputer", SimpleImputer(strategy=impute_strategy)))
1050
+ if standardize:
1051
+ steps.append(("scaler", StandardScaler()))
1052
+ steps.append(("pca", PCA(n_components=n_components, random_state=0)))
1053
+
1054
+ pipe = Pipeline(steps)
1055
+ pipe.fit(X_fit)
1056
+ last_fit_date = current_date
1057
+
1058
+ logger.debug(
1059
+ f"Temporal PCA {name} refitted at {current_date.strftime('%Y-%m-%d')} "
1060
+ f"using {len(X_fit)} samples"
1061
+ )
1062
+
1063
+ # Transform current date data
1064
+ if pipe is not None:
1065
+ mask_current = (
1066
+ (df[self.date_column] == current_date_ordinal) &
1067
+ df[lag_columns].notna().all(axis=1)
1068
+ )
1069
+
1070
+ if mask_current.any():
1071
+ X_current = df.loc[mask_current, lag_columns]
1072
+ scores = pipe.transform(X_current)
1073
+
1074
+ for i in range(n_components):
1075
+ df.loc[mask_current, f"{prefix}_{i}"] = scores[:, i]
1076
+
1077
+ # Fill NaN with forward fill then 0
1078
+ pca_cols = [f"{prefix}_{i}" for i in range(n_components)]
1079
+ for col in pca_cols:
1080
+ if col not in df.columns:
1081
+ df[col] = 0
1082
+ df[pca_cols] = df[pca_cols].fillna(method='ffill').fillna(0)
1083
+
1084
+ pcas_dict[name] = pipe
1085
+
1086
+ else:
1087
+ # Non time-series: use original approach
1088
+ mask = df[lag_columns].notna().all(axis=1)
1089
+
1090
+ if pcas is None and mask.any():
1091
+ X_fit = df.loc[mask, lag_columns]
1092
+
1093
+ steps = []
1094
+ if impute_strategy is not None:
1095
+ steps.append(("imputer", SimpleImputer(strategy=impute_strategy)))
1096
+ if standardize:
1097
+ steps.append(("scaler", StandardScaler()))
1098
+ steps.append(("pca", PCA(n_components=n_components, random_state=0)))
1099
+
1100
+ pipe = Pipeline(steps)
1101
+ pipe.fit(X_fit)
1102
+ pcas_dict[name] = pipe
1103
+ elif pcas is not None:
1104
+ pipe = pcas[name]
1105
+ pcas_dict[name] = pipe
1106
+ else:
1107
+ continue
1108
+
1109
+ if mask.any():
1110
+ X_transform = df.loc[mask, lag_columns]
1111
+ scores = pipe.transform(X_transform)
1112
+
1113
+ for i in range(n_components):
1114
+ df.loc[mask, f"{prefix}_{i}"] = scores[:, i]
1115
+
1116
+ # Fill missing values
1117
+ pca_cols = [f"{prefix}_{i}" for i in range(n_components)]
1118
+ for col in pca_cols:
1119
+ if col not in df.columns:
1120
+ df[col] = 0
1121
+ df[pca_cols] = df[pca_cols].fillna(0)
1122
+
1123
+ return df, pcas_dict
1124
+
720
1125
  # encoding categorical features
721
1126
  def encode_categorical_features(
722
1127
  self,
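The two rewritten PCA helpers read their configuration from the experiment context and, for time series, fit only on past data inside a lookback window that is refreshed every refresh_frequency days to avoid look-ahead bias. A hedged configuration sketch inferred from the code above (the key names come from the code; the values are invented examples):

experiment_params = {
    "pca_cross_sectional": [
        # pivot index/columns/value used to build the cross-sectional matrix
        {"name": "MKT", "index": "DATE", "columns": "TICKER", "value": "RET"},
    ],
    "pca_temporal": [
        {"name": "LAST_20_RET", "column": "RET", "lags": 20},        # new format: lag columns are created on the fly
        # {"name": "LAST_20_RET", "columns": ["RET_-1", "RET_-2"]},  # old format: pre-existing lag columns
    ],
}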
lecrapaud-0.21.1.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: lecrapaud
3
- Version: 0.20.2
3
+ Version: 0.21.1
4
4
  Summary: Framework for machine and deep learning, with regression, classification and time series analysis
5
5
  License: Apache License
6
6
  License-File: LICENSE
@@ -12,9 +12,12 @@ Classifier: Programming Language :: Python :: 3.12
12
12
  Requires-Dist: catboost (>=1.2.8)
13
13
  Requires-Dist: category-encoders (>=2.8.1)
14
14
  Requires-Dist: celery (>=5.5.3)
15
+ Requires-Dist: celery-redbeat (>=2.3.2)
15
16
  Requires-Dist: ftfy (>=6.3.1)
17
+ Requires-Dist: hyperopt (>=0.2.7)
16
18
  Requires-Dist: joblib (>=1.5.1)
17
19
  Requires-Dist: keras (>=3.10.0)
20
+ Requires-Dist: keras-tcn (>=3.5.6)
18
21
  Requires-Dist: lightgbm (>=4.6.0)
19
22
  Requires-Dist: matplotlib (>=3.10.3)
20
23
  Requires-Dist: mlxtend (>=0.23.4)
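Version 0.21.1 also adds celery-redbeat, hyperopt and keras-tcn as runtime requirements. One way to confirm they are declared after installing the wheel (a quick check, not part of the package):

from importlib.metadata import requires

new_deps = ("celery-redbeat", "hyperopt", "keras-tcn")
print([r for r in requires("lecrapaud") if r.startswith(new_deps)])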
lecrapaud-0.21.1.dist-info/RECORD CHANGED
@@ -1,5 +1,5 @@
1
- lecrapaud/__init__.py,sha256=oCxbtw_nk8rlOXbXbWo0RRMlsh6w-hTiZ6e5PRG_wp0,28
2
- lecrapaud/api.py,sha256=IQlH3wcSzxYgvlamfICNMwNsQGoaNxBJUPTlC9M0kBk,20321
1
+ lecrapaud/__init__.py,sha256=7Wp_VF08UZP8o-GkpB4_yRjP4twQmpcTc3202OkPmHs,176
2
+ lecrapaud/api.py,sha256=7OL_wbg9hCmlZ0WI6eCDkublntES3f320OZlpuKu8f4,22376
3
3
  lecrapaud/config.py,sha256=0NEg61QdLxQ97bVFDDXa6OwlWFEo_z8VIhX5KrD1ik0,1170
4
4
  lecrapaud/db/__init__.py,sha256=82o9fMfaqKXPh2_rt44EzNRVZV1R4LScEnQYvj_TjK0,34
5
5
  lecrapaud/db/alembic/README,sha256=MVlc9TYmr57RbhXET6QxgyCcwWP7w-vLkEsirENqiIQ,38
@@ -14,7 +14,7 @@ lecrapaud/db/alembic/versions/2025_08_28_1516-c36e9fee22b9_add_avg_precision_to_
14
14
  lecrapaud/db/alembic/versions/2025_08_28_1622-8b11c1ba982e_change_name_column.py,sha256=g6H2Z9MwB6UEiqdGlBoHBXpO9DTaWkwHt8FS6joVOm0,1191
15
15
  lecrapaud/db/alembic/versions/2025_10_25_0635-07e303521594_add_unique_constraint_to_score.py,sha256=FshOF1t-NWXrBtXT3wMNGFslJ4sWUxzvBEXSymu05cI,1043
16
16
  lecrapaud/db/alembic/versions/2025_10_26_1727-033e0f7eca4f_merge_score_and_model_trainings_into_.py,sha256=htHUD4zPJr-0z_DQfTi8r9RsFVe9m7SL0f7oRIvLIcQ,10999
17
- lecrapaud/db/alembic/versions/2025_10_28_2006-0a8fb7826e9b_add_number_of_targets_and_remove_other_.py,sha256=o3TNHq1GTFjxfk2zHWaUbq91khMJi6Xy6HToO9i54AU,2051
17
+ lecrapaud/db/alembic/versions/2025_10_28_2006-0a8fb7826e9b_add_number_of_targets_and_remove_other_.py,sha256=0NBvOwPqMXpWnDEGiEBk_IeLKmXQ5ZcU-dqHeSEgsRQ,2557
18
18
  lecrapaud/db/alembic.ini,sha256=Zw2rdwsKV6c7J1SPtoFIPDX08_oTP3MuUKnNxBDiY8I,3796
19
19
  lecrapaud/db/models/__init__.py,sha256=-XoCN1eeLihnNxBMl90lXrgrTSDkMbeqgienMqFi5f4,702
20
20
  lecrapaud/db/models/base.py,sha256=0548x4ftd6Oim9BJmtD7Er4izM6u0QCrlTG5560384w,9458
@@ -29,8 +29,8 @@ lecrapaud/db/models/target.py,sha256=DKnfeaLU8eT8J_oh_vuFo5-o1CaoXR13xBbswme6Bgk
29
29
  lecrapaud/db/models/utils.py,sha256=-a-nWWmpJ2XzidIxo2COVUTrGZIPYCfBzjhcszJj_bM,1109
30
30
  lecrapaud/db/session.py,sha256=u9NCwUoV5VbtScRb6HOSQr4oTEjIwj0waP5mGlc1qJg,3735
31
31
  lecrapaud/directories.py,sha256=0LrANuDgbuneSLker60c6q2hmGnQ3mKHIztTGzTx6Gw,826
32
- lecrapaud/experiment.py,sha256=hhi6NdVKtxoyx_AGBB4iNEZZpd9b3rKs23qiLPf-mUk,2384
33
- lecrapaud/feature_engineering.py,sha256=UM-EIOsgYWedqsR9uA-09eaWSb9FofVxoE0rRcDelQ8,39173
32
+ lecrapaud/experiment.py,sha256=LiecZS3P4igO_3nJ4IB-2b25CttQS2RePDnhBNucvdE,2478
33
+ lecrapaud/feature_engineering.py,sha256=SvGrJXv24rVgH0QE5mRwJITcCLfUqgbV2Ep68bBVnJs,58794
34
34
  lecrapaud/feature_selection.py,sha256=Q9xWVmZsvRjX9mJHB_PY_KLXsEAYNLX7txSe0cniY4A,47529
35
35
  lecrapaud/integrations/openai_integration.py,sha256=hHLF3fk5Bps8KNbNrEL3NUFa945jwClE6LrLpuMZOd4,7459
36
36
  lecrapaud/jobs/__init__.py,sha256=ZkrsyTOR21c_wN7RY8jPhm8jCrL1oCEtTsf3VFIlQiE,292
@@ -44,7 +44,7 @@ lecrapaud/misc/test-gpu-transformers.ipynb,sha256=k6MBSs_Um1h4PykvE-LTBcdpbWLbIF
44
44
  lecrapaud/model_selection.py,sha256=o4_hOEp91_33HtMatVHU7YPc71KZ2hK7wucN63xqWkA,88017
45
45
  lecrapaud/search_space.py,sha256=caCehJklD3-sgmlisJj_GmuB7LJiVvTF71gEjPGDvV4,36336
46
46
  lecrapaud/utils.py,sha256=0k76HFETO0_NgCYUv8b3RTBLgry6MsDBaHJfpAplxCY,8855
47
- lecrapaud-0.20.2.dist-info/METADATA,sha256=FUXEVYVCJAoat8HUtsupISlRbK56YVxezYwCH6j4kBE,14239
48
- lecrapaud-0.20.2.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
49
- lecrapaud-0.20.2.dist-info/licenses/LICENSE,sha256=MImCryu0AnqhJE_uAZD-PIDKXDKb8sT7v0i1NOYeHTM,11350
50
- lecrapaud-0.20.2.dist-info/RECORD,,
47
+ lecrapaud-0.21.1.dist-info/METADATA,sha256=rKls8xvjhu9f72jTw2sjBYCmQPw-N02RSScSOjJ1E2g,14348
48
+ lecrapaud-0.21.1.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
49
+ lecrapaud-0.21.1.dist-info/licenses/LICENSE,sha256=MImCryu0AnqhJE_uAZD-PIDKXDKb8sT7v0i1NOYeHTM,11350
50
+ lecrapaud-0.21.1.dist-info/RECORD,,