lecrapaud 0.21.0.tar.gz → 0.21.1.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50)
  1. {lecrapaud-0.21.0 → lecrapaud-0.21.1}/PKG-INFO +1 -1
  2. {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/feature_engineering.py +189 -3
  3. {lecrapaud-0.21.0 → lecrapaud-0.21.1}/pyproject.toml +1 -1
  4. {lecrapaud-0.21.0 → lecrapaud-0.21.1}/LICENSE +0 -0
  5. {lecrapaud-0.21.0 → lecrapaud-0.21.1}/README.md +0 -0
  6. {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/__init__.py +0 -0
  7. {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/api.py +0 -0
  8. {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/config.py +0 -0
  9. {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/db/__init__.py +0 -0
  10. {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/db/alembic/README +0 -0
  11. {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/db/alembic/env.py +0 -0
  12. {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/db/alembic/script.py.mako +0 -0
  13. {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/db/alembic/versions/2025_06_23_1748-f089dfb7e3ba_.py +0 -0
  14. {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/db/alembic/versions/2025_06_24_1216-c62251b129ed_.py +0 -0
  15. {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/db/alembic/versions/2025_06_24_1711-86457e2f333f_.py +0 -0
  16. {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/db/alembic/versions/2025_06_25_1759-72aa496ca65b_.py +0 -0
  17. {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/db/alembic/versions/2025_08_25_1434-7ed9963e732f_add_best_score_to_model_selection.py +0 -0
  18. {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/db/alembic/versions/2025_08_28_1516-c36e9fee22b9_add_avg_precision_to_score.py +0 -0
  19. {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/db/alembic/versions/2025_08_28_1622-8b11c1ba982e_change_name_column.py +0 -0
  20. {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/db/alembic/versions/2025_10_25_0635-07e303521594_add_unique_constraint_to_score.py +0 -0
  21. {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/db/alembic/versions/2025_10_26_1727-033e0f7eca4f_merge_score_and_model_trainings_into_.py +0 -0
  22. {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/db/alembic/versions/2025_10_28_2006-0a8fb7826e9b_add_number_of_targets_and_remove_other_.py +0 -0
  23. {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/db/alembic.ini +0 -0
  24. {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/db/models/__init__.py +0 -0
  25. {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/db/models/base.py +0 -0
  26. {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/db/models/experiment.py +0 -0
  27. {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/db/models/feature.py +0 -0
  28. {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/db/models/feature_selection.py +0 -0
  29. {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/db/models/feature_selection_rank.py +0 -0
  30. {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/db/models/model.py +0 -0
  31. {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/db/models/model_selection.py +0 -0
  32. {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/db/models/model_selection_score.py +0 -0
  33. {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/db/models/target.py +0 -0
  34. {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/db/models/utils.py +0 -0
  35. {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/db/session.py +0 -0
  36. {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/directories.py +0 -0
  37. {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/experiment.py +0 -0
  38. {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/feature_selection.py +0 -0
  39. {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/integrations/openai_integration.py +0 -0
  40. {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/jobs/__init__.py +0 -0
  41. {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/jobs/config.py +0 -0
  42. {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/jobs/scheduler.py +0 -0
  43. {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/jobs/tasks.py +0 -0
  44. {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/misc/tabpfn_tests.ipynb +0 -0
  45. {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/misc/test-gpu-bilstm.ipynb +0 -0
  46. {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/misc/test-gpu-resnet.ipynb +0 -0
  47. {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/misc/test-gpu-transformers.ipynb +0 -0
  48. {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/model_selection.py +0 -0
  49. {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/search_space.py +0 -0
  50. {lecrapaud-0.21.0 → lecrapaud-0.21.1}/lecrapaud/utils.py +0 -0
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: lecrapaud
-Version: 0.21.0
+Version: 0.21.1
 Summary: Framework for machine and deep learning, with regression, classification and time series analysis
 License: Apache License
 License-File: LICENSE
@@ -605,7 +605,7 @@ class PreprocessFeature:

         return df, pcas_dict

-    def add_pca_feature_cross_sectional(
+    def add_pca_feature_cross_sectional_old(
         self,
         df: pd.DataFrame,
         *,
@@ -657,7 +657,7 @@ class PreprocessFeature:

         return df, pcas_dict

-    def add_pca_feature_cross_sectional_time_series(
+    def add_pca_feature_cross_sectional(
         self,
         df: pd.DataFrame,
         *,
@@ -840,6 +840,11 @@ class PreprocessFeature:
                 # Merge the scores
                 df = df.merge(scores_df, on=index_col, how="left")
                 df.index = index_saved
+
+                # Forward fill, then 0, to avoid NaN
+                pca_cols = [col for col in df.columns if col.startswith(prefix)]
+                df[pca_cols] = df[pca_cols].fillna(method='ffill').fillna(0)
+
                 pcas_dict.update({name: pipe})

             else:
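The added fill step forward-fills the newly created PCA score columns and then falls back to 0. Worth noting: fillna(method='ffill') is deprecated since pandas 2.1 in favor of DataFrame.ffill(). A minimal equivalent sketch (the frame and column names are illustrative, not taken from the package):

    import pandas as pd

    # Two hypothetical PCA score columns with gaps.
    df = pd.DataFrame({"PC_A_0": [None, 1.0, None, 2.0],
                       "PC_A_1": [None, None, 3.0, None]})

    pca_cols = [col for col in df.columns if col.startswith("PC_A")]
    # Same effect as .fillna(method='ffill').fillna(0), without the deprecation warning.
    df[pca_cols] = df[pca_cols].ffill().fillna(0)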
@@ -873,7 +878,7 @@ class PreprocessFeature:
         return df, pcas_dict

     # ----------------- 2) TEMPORAL PCA (list of lag columns) ----------------
-    def add_pca_feature_temporal(
+    def add_pca_feature_temporal_old(
         self,
         df: pd.DataFrame,
         *,
@@ -936,6 +941,187 @@ class PreprocessFeature:

         return df, pcas_dict

+    def add_pca_feature_temporal(
+        self,
+        df: pd.DataFrame,
+        *,
+        n_components: int = 5,
+        pcas: dict[str, Pipeline] | None = None,
+        impute_strategy: str = "median",
+        standardize: bool = True,
+        lookback_days: int = 365,
+        refresh_frequency: int = 90,
+    ) -> tuple[pd.DataFrame, dict[str, Pipeline]]:
+        """
+        Temporal PCA for time series, with panel data support.
+        Automatically creates the lag columns and avoids look-ahead bias.
+
+        Simplified pca_temporal format:
+        [{"name": "LAST_20_RET", "column": "RET", "lags": 20}]
+        """
+        pcas_dict = {}
+
+        for pca_config in self.pca_temporal:
+            # Support both old and new format
+            if "columns" in pca_config:
+                # Old format: use existing columns
+                name = pca_config["name"]
+                lag_columns = pca_config["columns"]
+                base_column = None
+                num_lags = len(lag_columns)
+            else:
+                # New format: create lag columns
+                name = pca_config["name"]
+                base_column = pca_config["column"].upper()
+                num_lags = pca_config.get("lags", 20)
+
+                # Create lag columns if they don't exist
+                if self.group_column:
+                    # Panel data: create lags by group
+                    for lag in range(1, num_lags + 1):
+                        lag_col = f"{base_column}_-{lag}"
+                        if lag_col not in df.columns:
+                            df[lag_col] = df.groupby(self.group_column)[base_column].shift(lag)
+                else:
+                    # Simple time series
+                    for lag in range(1, num_lags + 1):
+                        lag_col = f"{base_column}_-{lag}"
+                        if lag_col not in df.columns:
+                            df[lag_col] = df[base_column].shift(lag)
+
+                lag_columns = [f"{base_column}_-{i}" for i in range(1, num_lags + 1)]
+
+            prefix = f"TMP_PC_{name}"
+
+            # For time series: avoid look-ahead bias
+            if self.time_series and self.date_column:
+                all_scores = []
+                unique_dates = sorted(df[self.date_column].unique())
+
+                if pcas is not None:
+                    # Inference: use provided PCA
+                    pipe = pcas[name]
+
+                    # Apply to all data at once
+                    mask = df[lag_columns].notna().all(axis=1)
+                    if mask.any():
+                        X_transform = df.loc[mask, lag_columns]
+                        scores = pipe.transform(X_transform)
+
+                        for i in range(n_components):
+                            df.loc[mask, f"{prefix}_{i}"] = scores[:, i]
+
+                    # Fill NaN with forward fill then 0
+                    pca_cols = [f"{prefix}_{i}" for i in range(n_components)]
+                    df[pca_cols] = df[pca_cols].fillna(method='ffill').fillna(0)
+
+                else:
+                    # Training: expanding window with periodic refresh
+                    pipe = None
+                    last_fit_date = None
+
+                    for current_date_ordinal in unique_dates:
+                        current_date = pd.Timestamp.fromordinal(int(current_date_ordinal))
+
+                        # Determine if we should refit
+                        should_refit = pipe is None or (
+                            last_fit_date is not None
+                            and (current_date - last_fit_date).days >= refresh_frequency
+                        )
+
+                        if should_refit and len(df[df[self.date_column] < current_date_ordinal]) > num_lags * 2:
+                            # Get historical data for fitting
+                            lookback_start = current_date - pd.Timedelta(days=lookback_days)
+                            lookback_start_ordinal = pd.Timestamp.toordinal(lookback_start)
+
+                            mask_fit = (
+                                (df[self.date_column] >= lookback_start_ordinal) &
+                                (df[self.date_column] < current_date_ordinal) &
+                                df[lag_columns].notna().all(axis=1)
+                            )
+
+                            if mask_fit.sum() >= n_components:
+                                X_fit = df.loc[mask_fit, lag_columns]
+
+                                # Create pipeline
+                                steps = []
+                                if impute_strategy is not None:
+                                    steps.append(("imputer", SimpleImputer(strategy=impute_strategy)))
+                                if standardize:
+                                    steps.append(("scaler", StandardScaler()))
+                                steps.append(("pca", PCA(n_components=n_components, random_state=0)))
+
+                                pipe = Pipeline(steps)
+                                pipe.fit(X_fit)
+                                last_fit_date = current_date
+
+                                logger.debug(
+                                    f"Temporal PCA {name} refitted at {current_date.strftime('%Y-%m-%d')} "
+                                    f"using {len(X_fit)} samples"
+                                )
+
+                        # Transform current date data
+                        if pipe is not None:
+                            mask_current = (
+                                (df[self.date_column] == current_date_ordinal) &
+                                df[lag_columns].notna().all(axis=1)
+                            )
+
+                            if mask_current.any():
+                                X_current = df.loc[mask_current, lag_columns]
+                                scores = pipe.transform(X_current)
+
+                                for i in range(n_components):
+                                    df.loc[mask_current, f"{prefix}_{i}"] = scores[:, i]
+
+                    # Fill NaN with forward fill then 0
+                    pca_cols = [f"{prefix}_{i}" for i in range(n_components)]
+                    for col in pca_cols:
+                        if col not in df.columns:
+                            df[col] = 0
+                    df[pca_cols] = df[pca_cols].fillna(method='ffill').fillna(0)
+
+                pcas_dict[name] = pipe
+
+            else:
+                # Non time-series: use original approach
+                mask = df[lag_columns].notna().all(axis=1)
+
+                if pcas is None and mask.any():
+                    X_fit = df.loc[mask, lag_columns]
+
+                    steps = []
+                    if impute_strategy is not None:
+                        steps.append(("imputer", SimpleImputer(strategy=impute_strategy)))
+                    if standardize:
+                        steps.append(("scaler", StandardScaler()))
+                    steps.append(("pca", PCA(n_components=n_components, random_state=0)))
+
+                    pipe = Pipeline(steps)
+                    pipe.fit(X_fit)
+                    pcas_dict[name] = pipe
+                elif pcas is not None:
+                    pipe = pcas[name]
+                    pcas_dict[name] = pipe
+                else:
+                    continue
+
+                if mask.any():
+                    X_transform = df.loc[mask, lag_columns]
+                    scores = pipe.transform(X_transform)
+
+                    for i in range(n_components):
+                        df.loc[mask, f"{prefix}_{i}"] = scores[:, i]
+
+                # Fill missing values
+                pca_cols = [f"{prefix}_{i}" for i in range(n_components)]
+                for col in pca_cols:
+                    if col not in df.columns:
+                        df[col] = 0
+                df[pca_cols] = df[pca_cols].fillna(0)
+
+        return df, pcas_dict
+
     # encoding categorical features
     def encode_categorical_features(
         self,
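For context on what the new add_pca_feature_temporal does: each simplified pca_temporal entry of the form {"name": ..., "column": ..., "lags": ...} tells it to build the lag columns itself, refit an imputer/scaler/PCA pipeline on a trailing lookback window at most every refresh_frequency days, and score only the rows at the current date, so no future observations leak into the fit. Below is a simplified, standalone sketch of that walk-forward loop under stated assumptions: a single toy series, plain Timestamps instead of the ordinal-encoded dates the package uses, and none of the panel/group or old-format handling.

    import numpy as np
    import pandas as pd
    from sklearn.decomposition import PCA
    from sklearn.impute import SimpleImputer
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler

    rng = np.random.default_rng(0)

    # Toy single-asset daily series; RET stands in for any numeric base column.
    df = pd.DataFrame({"DATE": pd.bdate_range("2022-01-03", periods=300),
                       "RET": rng.normal(size=300)})

    # 1) Build the lag columns, as the new method does for a simple time series.
    num_lags, n_components = 20, 5
    lag_cols = [f"RET_-{i}" for i in range(1, num_lags + 1)]
    for i in range(1, num_lags + 1):
        df[f"RET_-{i}"] = df["RET"].shift(i)

    # 2) Walk forward: refit on a trailing window at most every refresh_frequency
    #    days, then transform only the current date, avoiding look-ahead bias.
    lookback_days, refresh_frequency = 365, 90
    pipe, last_fit = None, None
    pc_cols = [f"TMP_PC_LAST_20_RET_{i}" for i in range(n_components)]
    for current in df["DATE"]:
        past = df[df["DATE"] < current].dropna(subset=lag_cols)
        if (pipe is None or (current - last_fit).days >= refresh_frequency) and len(past) > num_lags * 2:
            window = past[past["DATE"] >= current - pd.Timedelta(days=lookback_days)]
            pipe = Pipeline([
                ("imputer", SimpleImputer(strategy="median")),
                ("scaler", StandardScaler()),
                ("pca", PCA(n_components=n_components, random_state=0)),
            ]).fit(window[lag_cols])
            last_fit = current
        if pipe is not None:
            rows = df[df["DATE"] == current].dropna(subset=lag_cols)
            if not rows.empty:
                scores = pipe.transform(rows[lag_cols])
                for i in range(n_components):
                    df.loc[rows.index, pc_cols[i]] = scores[:, i]

    # 3) Forward-fill dates scored before the first fit existed, then default to 0.
    for col in pc_cols:
        if col not in df.columns:
            df[col] = 0.0
    df[pc_cols] = df[pc_cols].ffill().fillna(0)

In the package itself, the inference path (when pcas is passed in) skips this refit loop and applies the stored pipelines to all complete rows at once.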
@@ -1,6 +1,6 @@
 [project]
 name = "lecrapaud"
-version = "0.21.0"
+version = "0.21.1"
 description = "Framework for machine and deep learning, with regression, classification and time series analysis"
 authors = [
     {name = "Pierre H. Gallet"}