lecrapaud 0.4.0__py3-none-any.whl → 0.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of lecrapaud might be problematic.
- lecrapaud/__init__.py +1 -0
- lecrapaud/api.py +277 -0
- lecrapaud/config.py +10 -0
- lecrapaud/db/__init__.py +1 -0
- lecrapaud/db/alembic/env.py +2 -2
- lecrapaud/db/alembic/versions/2025_05_31_1834-52b809a34371_make_nullablee.py +24 -12
- lecrapaud/db/alembic/versions/2025_06_17_1652-c45f5e49fa2c_make_fields_nullable.py +89 -0
- lecrapaud/db/alembic.ini +116 -0
- lecrapaud/db/models/__init__.py +10 -10
- lecrapaud/db/models/base.py +176 -1
- lecrapaud/db/models/dataset.py +25 -20
- lecrapaud/db/models/feature.py +5 -6
- lecrapaud/db/models/feature_selection.py +3 -4
- lecrapaud/db/models/feature_selection_rank.py +3 -4
- lecrapaud/db/models/model.py +3 -4
- lecrapaud/db/models/model_selection.py +15 -8
- lecrapaud/db/models/model_training.py +15 -7
- lecrapaud/db/models/score.py +9 -6
- lecrapaud/db/models/target.py +16 -8
- lecrapaud/db/session.py +66 -0
- lecrapaud/experiment.py +64 -0
- lecrapaud/feature_engineering.py +747 -1022
- lecrapaud/feature_selection.py +915 -998
- lecrapaud/integrations/openai_integration.py +225 -0
- lecrapaud/jobs/__init__.py +2 -2
- lecrapaud/jobs/config.py +1 -1
- lecrapaud/jobs/scheduler.py +1 -1
- lecrapaud/jobs/tasks.py +6 -6
- lecrapaud/model_selection.py +1060 -960
- lecrapaud/search_space.py +4 -0
- lecrapaud/utils.py +2 -2
- lecrapaud-0.4.1.dist-info/METADATA +171 -0
- {lecrapaud-0.4.0.dist-info → lecrapaud-0.4.1.dist-info}/RECORD +36 -35
- {lecrapaud-0.4.0.dist-info → lecrapaud-0.4.1.dist-info}/WHEEL +1 -1
- lecrapaud/db/crud.py +0 -179
- lecrapaud/db/services.py +0 -0
- lecrapaud/db/setup.py +0 -58
- lecrapaud/predictions.py +0 -292
- lecrapaud/training.py +0 -151
- lecrapaud-0.4.0.dist-info/METADATA +0 -103
- /lecrapaud/{directory_management.py → directories.py} +0 -0
- {lecrapaud-0.4.0.dist-info → lecrapaud-0.4.1.dist-info}/LICENSE +0 -0
lecrapaud/feature_engineering.py
CHANGED
@@ -1,1119 +1,844 @@
-
-
+"""
+Feature engineering module for data preprocessing and transformation.

-
-
-
-
-
-
-
-from itertools import product
-import os
-from collections import defaultdict
-
-from src.config import PYTHON_ENV
-from src.utils import logger
-from src.directory_management import data_dir
-from src.services.indicators import (
-    rsi,
-    macd,
-    bollinger_bands,
-    adx,
-    atr,
-    stochastic,
-    mfi,
-    ichimoku_cloud,
-    parabolic_sar,
-    chaikin_money_flow,
-    pivot_points,
-    sma,
-    ema,
-    volatility,
-    cumulative_return,
-    close_diff,
-    obv,
-    pressure,
-)
-from src.db.models import Target
+Process
+-------
+FEAT ENG
+- use business_analysis > get_table_summary to see which fields are more than 90% null
+- use remove_constant_columns to drop constant columns
+- use summarize_dataframe to drop further useless columns (date, id, data not available at prediction time, misc not useful)
+- cast to numeric whatever can be cast to numeric

+- define columns_boolean
+- define groupby_columns_list and target_column for the target encoding
+- create the target(s)
+- define columns_pca
+- define columns_one_hot, columns_binary, columns_ordinal, columns_frequency

-# pd print options
-# pd.set_option("display.max_columns", None)
-# pd.reset_option("display.max_rows")
-# pd.set_option("display.max_colwidth", None)

+Todo
+----
+- DONE: drop meaningless identifier columns
+- DONE: PCA on embedding of deck
+- DONE: maybe cyclic encoding for date columns

-
-
-
-
-    local_max_order: int = 10,
-    threshold: int = 5,
-):
-    """Preprocessing the stock data from yfinance
+- DONE: ordinal/label encode (only 1 column) for tree based method when not too big number of categories
+- DONE: frequency encoding for some categorical columns
+- DONE: one hot encoding for categorical columns
+- DONE: binary encoding if big number of category

-
-
-        top_x_stock (float): the % at which you are considered top ranked stock for the day
-        local_max_order (int): this set up the window to look at on both side of the extrema : the greater, the more 'global' is the extrema.
+- DONE: create other embedding column for textual data ?
+- DONE: create some boolean like has_website, has_linkedin_company_url, etc...

-
-
-    - date variables : we create YEAR, MONTH, DAY, WEEK, WEEKDAY, YEARWEEK and YEARDAY features
-    - return, market return, residual return and similar computation with volume are done to create 6 new features
-    - target variables :
-        - TARGET_1 : next day return
-        - TARGET_2 : categorical return (positive 1, or negative 0)
-        - TARGET_3 : next day ranking from best (1) to worst (n_stock) returns
-        - TARGET_4 : categorical next day top ranking (in top_x_stock) (1), or not (0)
-        - TARGET_5, TARGET_6, TARGET_7, TARGET_8 : same but with residual return
-        - TARGET_9 : categorical with 1 if it's a local maximum and 0 if not
-        - TARGET_10 : categorical with 1 if it's a local minimum and 0 if not
-        - TARGET 11 : We will create trading signals based on proximity to local minima and maxima : need multi-binary loss support
-        - TARGET 12, 13, 14 : return in 9,14,21 days
+- target/mean encoding with a groupby on a very interesting categorical column
+- do "real" target encoding, for example with leave-one-out encoding, on the target variable ?

+- better categorize some stuff like country ? for sourcing we do position, ext_position, company, ext_company, country, source, but only country is relevant here

-    """

-
-
-
-
-
-
-
-
-    first_x_percent = max(int(nb_of_stocks * top_x_stock), 1)
-
-    df["TARGET_1"] = df[target].shift(-1)
-    df["TARGET_2"] = np.select([df["TARGET_1"] <= 0, df["TARGET_1"] > 0], [0, 1])
-    df["TARGET_3"] = df.groupby("DATE")["TARGET_1"].rank(
-        method="first", ascending=False
-    )
-    df["TARGET_4"] = np.select(
-        [
-            df.groupby("DATE")["TARGET_1"].rank(method="first", ascending=False)
-            <= first_x_percent
-        ],
-        [1],
-        default=0,
-    )
-
-    # TARGET 5-8 : We do the same for RESIDUAL_RET
-    target = "RESIDUAL_RET"
-
-    df["TARGET_5"] = df[target].shift(-1)
-    df["TARGET_6"] = np.select([df["TARGET_5"] <= 0, df["TARGET_5"] > 0], [0, 1])
-    df["TARGET_7"] = df.groupby("DATE")["TARGET_5"].rank(
-        method="first", ascending=False
-    )
-    df["TARGET_8"] = np.select(
-        [
-            df.groupby("DATE")["TARGET_5"].rank(method="first", ascending=False)
-            <= first_x_percent
-        ],
-        [1],
-        default=0,
-    )
-
-    # TARGET 9-10 : Let's look at local min and max : it can be interpreted as buy and sell signal respectively
-    target = "CLOSE"
-
-    df["TARGET_9"] = 0
-    df["TARGET_10"] = 0
-
-    # Calculate local maxima and set TARGET_9 to 1 where maxima are found
-    maxima_indices = df.groupby(stock_column)[target].transform(
-        lambda x: x.index.isin(
-            x.iloc[argrelextrema(x.values, np.greater, order=local_max_order)].index
-        )
-    )
+Development
+-----------
+- use PCA to see how many variables explain the variance, for the feature selection max_feature
+- could be nice to get linkedin info of founders (need to search reps in rails first) - and score !
+- add created_from, utm_source, referrer when we will have more data
+- could be nice to get team_count, or dealroom info but at the moment of submission...
+"""

-
-
-
-
-    )
-
-    df.loc[maxima_indices, "TARGET_9"] = 1
-    df.loc[minima_indices, "TARGET_10"] = 1
-
-    # TARGET 11 : We will create trading signals based on proximity to local minima and maxima.
-    df["TARGET_11"] = 2  # Default value for HOLD
-
-    # Function to detect local minima and maxima, and assign signals
-    def assign_signals(group):
-        close_prices = group[target].values
-        dates = group["DATE"].values
-
-        # Detect local maxima and minima using argrelextrema
-        local_maxima_idx = argrelextrema(
-            close_prices, np.greater, order=local_max_order
-        )[0]
-        local_minima_idx = argrelextrema(close_prices, np.less, order=local_max_order)[
-            0
-        ]
-
-        # STRONG BUY (4) for local minima, STRONG SELL (0) for local maxima
-        group.loc[group.index[local_minima_idx], "TARGET_11"] = 4
-        group.loc[group.index[local_maxima_idx], "TARGET_11"] = 0
-
-        # Assign BUY (3) and SELL (1) based on proximity to extrema within the threshold window
-        for idx in local_minima_idx:
-            # Get the actual date of the minima
-            min_date = dates[idx]
-            # Select the rows within the threshold window around the minima date
-            buy_window = group.loc[
-                (group["DATE"] >= min_date - pd.Timedelta(days=threshold))
-                & (group["DATE"] <= min_date + pd.Timedelta(days=threshold))
-            ]
-            group.loc[buy_window.index, "TARGET_11"] = np.where(
-                buy_window["DATE"] == min_date,
-                4,
-                3,  # STRONG BUY at minima, BUY near minima
-            )
-
-        for idx in local_maxima_idx:
-            # Get the actual date of the maxima
-            max_date = dates[idx]
-            # Select the rows within the threshold window around the maxima date
-            sell_window = group.loc[
-                (group["DATE"] >= max_date - pd.Timedelta(days=threshold))
-                & (group["DATE"] <= max_date + pd.Timedelta(days=threshold))
-            ]
-            group.loc[sell_window.index, "TARGET_11"] = np.where(
-                sell_window["DATE"] == max_date,
-                0,
-                1,  # STRONG SELL at maxima, SELL near maxima
-            )
+import pandas as pd
+import numpy as np
+from itertools import product
+import joblib

-
-
-
-
-
-    # TARGET 12, 13, 14 : return in 9,14,21 days
-    df["TARGET_12"] = df.groupby("STOCK")["CLOSE"].pct_change(9).shift(-9)
-    df["TARGET_13"] = df.groupby("STOCK")["CLOSE"].pct_change(14).shift(-14)
-    df["TARGET_14"] = df.groupby("STOCK")["CLOSE"].pct_change(21).shift(-21)
-
-    # Update database
-    # TODO: in bulk
-    Target.upsert(
-        match_fields=["name", "type"],
-        name="TARGET_1",
-        type="regression",
-        description="Next day return",
-    )
-    Target.upsert(
-        match_fields=["name", "type"],
-        name="TARGET_2",
-        type="classification",
-        description="Next day return",
-    )
-    Target.upsert(
-        match_fields=["name", "type"],
-        name="TARGET_3",
-        type="regression",
-        description="Ranking of next day return",
-    )
-    Target.upsert(
-        match_fields=["name", "type"],
-        name="TARGET_4",
-        type="classification",
-        description="Top ranking of next day return",
-    )
-    Target.upsert(
-        match_fields=["name", "type"],
-        name="TARGET_5",
-        type="regression",
-        description="Next day residual return",
-    )
-    Target.upsert(
-        match_fields=["name", "type"],
-        name="TARGET_6",
-        type="classification",
-        description="Next day residual return",
-    )
-    Target.upsert(
-        match_fields=["name", "type"],
-        name="TARGET_7",
-        type="regression",
-        description="Ranking of next day residual return",
-    )
-    Target.upsert(
-        match_fields=["name", "type"],
-        name="TARGET_8",
-        type="classification",
-        description="Top ranking of next day residual return",
-    )
-    Target.upsert(
-        match_fields=["name", "type"],
-        name="TARGET_9",
-        type="classification",
-        description="Local maxima",
-    )
-    Target.upsert(
-        match_fields=["name", "type"],
-        name="TARGET_10",
-        type="classification",
-        description="Local minima",
-    )
-    Target.upsert(
-        match_fields=["name", "type"],
-        name="TARGET_11",
-        type="classification",
-        description="Trading signals based on proximity to local minima and maxima",
-    )
-    Target.upsert(
-        match_fields=["name", "type"],
-        name="TARGET_12",
-        type="regression",
-        description="Return in 9 days",
-    )
-    Target.upsert(
-        match_fields=["name", "type"],
-        name="TARGET_13",
-        type="regression",
-        description="Return in 14 days",
-    )
-    Target.upsert(
-        match_fields=["name", "type"],
-        name="TARGET_14",
-        type="regression",
-        description="Return in 21 days",
-    )
+from sklearn.compose import ColumnTransformer
+from sklearn.decomposition import PCA
+from category_encoders import BinaryEncoder, CountEncoder
+from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
+from sklearn.model_selection import train_test_split

-
+from lecrapaud.integrations.openai_integration import (
+    truncate_text,
+    get_openai_embeddings,
+)
+from lecrapaud.feature_selection import get_features_by_types
+from lecrapaud.utils import logger
+from lecrapaud.db import Target, Feature, Dataset
+from lecrapaud.config import PYTHON_ENV


-
-
-
+# main function
+class FeatureEngineeringEngine:
+    """
+    Feature engineering pipeline
+
+    Params needed
+    -------------
+    data
+    columns_boolean
+    columns_date
+    columns_te_groupby
+    columns_te_target
+    for_training
+    """

-def
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    atm_option = min(option_data, key=lambda x: abs(x["strike"] - spot_price))
-    atm_iv = atm_option["implied_volatility"]
-
-    # IV Skew (25-delta)
-    iv_put_25d = np.mean(
-        [p["implied_volatility"] for p in puts if abs(p["delta"] + 0.25) < 0.05]
-    )
-    iv_call_25d = np.mean(
-        [c["implied_volatility"] for c in calls if abs(c["delta"] - 0.25) < 0.05]
-    )
-    iv_skew_25d = iv_put_25d - iv_call_25d if iv_put_25d and iv_call_25d else None
-
-    # IV Term Structure
-    iv_by_exp = defaultdict(list)
-    for opt in option_data:
-        iv_by_exp[opt["expiration"]].append(opt["implied_volatility"])
-    expiries = sorted(iv_by_exp.keys())
-    if len(expiries) >= 2:
-        iv_term_structure = np.mean(iv_by_exp[expiries[-1]]) - np.mean(
-            iv_by_exp[expiries[0]]
-        )
-    else:
-        iv_term_structure = None
-
-    # Moneyness
-    moneyness = [spot_price / opt["strike"] for opt in option_data if opt["strike"] > 0]
-
-    # % OTM / ITM
-    otm_calls = [c for c in calls if c["strike"] > spot_price]
-    otm_puts = [p for p in puts if p["strike"] < spot_price]
-    otm = len(otm_calls) + len(otm_puts)
-    itm = len(option_data) - otm
-    percent_otm = otm / len(option_data) if option_data else None
-    percent_itm = itm / len(option_data) if option_data else None
-
-    # Weighted Average Strike
-    def weighted_avg_strike(options):
-        total_vol = sum(o["volume"] for o in options)
-        return (
-            sum(o["strike"] * o["volume"] for o in options) / total_vol
-            if total_vol > 0
-            else None
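
Illustrative sketch (not part of the released diff): one way the pipeline documented in the class docstring above might be driven, assuming the class is importable from lecrapaud.feature_engineering, using hypothetical column names, and passing columns_te_groupby as a list of key lists the way generate_target_encodings consumes it.

import pandas as pd
from lecrapaud.feature_engineering import FeatureEngineeringEngine  # assumed import path

df = pd.DataFrame({
    "website": ["https://a.io", None, "https://c.io"],  # hypothetical columns
    "created_at": pd.to_datetime(["2024-01-02", "2024-02-03", "2024-03-04"]),
    "country": ["FR", "FR", "US"],
    "amount": [10.0, 20.0, 30.0],
})
engine = FeatureEngineeringEngine(
    data=df,
    columns_boolean=["website"],
    columns_date=["created_at"],
    columns_te_groupby=[["country"]],
    columns_te_target=["amount"],
    for_training=True,
)
features = engine.run()
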
+    def __init__(
+        self,
+        data: pd.DataFrame,
+        columns_drop: list[str] = [],
+        columns_boolean: list[str] = [],
+        columns_date: list[str] = [],
+        columns_te_groupby: list[str] = [],
+        columns_te_target: list[str] = [],
+        for_training: bool = True,
+        **kwargs,
+    ):
+        self.data = data
+        self.columns_drop = columns_drop
+        self.columns_boolean = columns_boolean
+        self.columns_date = columns_date
+        self.columns_te_groupby = columns_te_groupby
+        self.columns_te_target = columns_te_target
+        self.for_training = for_training
+
+    def run(self) -> pd.DataFrame:
+        # drop columns
+        self.data = self.data.drop(columns=self.columns_drop, errors="ignore")
+
+        # convert object columns to numeric if possible
+        self.data = convert_object_columns_that_are_numeric(self.data)
+
+        # handle boolean features
+        self.data = self.boolean_encode_columns()
+
+        # handle missing values
+        self.data = (
+            self.fillna_at_training()
+            if self.for_training
+            else self.fillna_at_inference()
         )

-
-
-
-    # Option Sentiment Index
-    sentiment_numerator = sum(
-        c["volume"] for c in calls if c["strike"] < spot_price
-    ) - sum(p["volume"] for p in puts if p["strike"] > spot_price)
-    sentiment_index = (
-        sentiment_numerator / (total_put_vol + total_call_vol)
-        if (total_put_vol + total_call_vol) > 0
-        else None
-    )
-
-    return {
-        "put_call_ratio_volume": put_call_ratio_vol,
-        "put_call_ratio_open_interest": put_call_ratio_oi,
-        "open_interest_skew": oi_skew,
-        "total_open_interest": total_oi,
-        "delta_weighted_pcr": delta_weighted_pcr,
-        "atm_iv": atm_iv,
-        "iv_skew_25d": iv_skew_25d,
-        "iv_term_structure": iv_term_structure,
-        "average_moneyness": np.mean(moneyness) if moneyness else None,
-        "percent_otm": percent_otm,
-        "percent_itm": percent_itm,
-        "weighted_avg_strike_calls": avg_strike_calls,
-        "weighted_avg_strike_puts": avg_strike_puts,
-        "option_sentiment_index": sentiment_index,
-    }
-
-
-def apply_indicators(df: pd.DataFrame):
-    """Apply multiple indicators to a grouped dataframe of a single stock."""
-    # Assuming 'df' is the OHLC data for a single stock, apply indicators
-    result = df.copy()
-
-    logger.debug(f"Computing non-period features...")
-
-    # Apply Parabolic SAR
-    result["Parabolic_SAR"] = parabolic_sar(df)
-
-    # Apply Bollinger Bands
-    result["Upper_BB"], result["Middle_BB"], result["Lower_BB"] = bollinger_bands(df)
-
-    # Apply Ichimoku Cloud
-    (
-        result["Tenkan"],
-        result["Kijun"],
-        result["Senkou_A"],
-        result["Senkou_B"],
-        result["Chikou"],
-    ) = ichimoku_cloud(df)
-
-    # Apply Pivot Points (including support and resistance levels)
-    result["Pivot"], result["R1"], result["S1"], result["R2"], result["S2"] = (
-        pivot_points(df)
-    )
-
-    # Other indicators
-    result["CLOSE_DIFF"] = close_diff(df)
-    result["OBV"] = obv(df)
-    result["DOWNWARD_PRESSURE"], result["UPWARD_PRESSURE"] = pressure(df)
-
-    # Apply MACD (Moving Average Convergence Divergence)
-    result["MACD_Line"], result["MACD_Signal"] = macd(df)
-
-    # first buy/sell signal : MACD_SIGNAL_DIFF cross 0 levels
-    result["MACD_SIGNAL_DIFF"] = result["MACD_Line"] - result["MACD_Signal"]
-    result["BUY_1"] = np.where(
-        (result["MACD_SIGNAL_DIFF"] > 0)
-        & (result["MACD_SIGNAL_DIFF"].shift(1) < 0),  # Buy signal (MACD crossover)
-        1,  # Buy
-        np.where(
-            (result["MACD_SIGNAL_DIFF"] < 0)
-            & (
-                result["MACD_SIGNAL_DIFF"].shift(1) > 0
-            ),  # Sell signal (MACD crossunder)
-            -1,  # Sell
-            np.nan,  # Default case
-        ),
-    )
-    result["BUY_1"] = result["BUY_1"].fillna(0)  # TODO: should we fill with 0 (done)
-
-    # second buy/sell signal : MACD_SIGNAL_DIFF cross 30% threshold of maximum value while positive and decreasing, or 30% threshold of minimum value while negative and increasing
-    # Calculate rolling 20-day max and min values for MACD_SIGNAL_DIFF per stock
-    macd_signal_diff_max_20_days = result.groupby("STOCK")[
-        "MACD_SIGNAL_DIFF"
-    ].transform(lambda x: x.rolling(20).max())
-    macd_signal_diff_min_20_days = result.groupby("STOCK")[
-        "MACD_SIGNAL_DIFF"
-    ].transform(lambda x: x.rolling(20).min())
-
-    # Define the buy/sell signal conditions
-    buy_condition = (
-        (result["MACD_SIGNAL_DIFF"] > result["MACD_SIGNAL_DIFF"].shift(1))  # Increasing
-        & (result["MACD_SIGNAL_DIFF"] < 0)  # Negative value
-        & (
-            result["MACD_SIGNAL_DIFF"] > 0.3 * macd_signal_diff_min_20_days
-        )  # Above 30% of minimum
-    )
-
-    sell_condition = (
-        (result["MACD_SIGNAL_DIFF"] < result["MACD_SIGNAL_DIFF"].shift(1))  # Decreasing
-        & (result["MACD_SIGNAL_DIFF"] > 0)  # Positive value
-        & (
-            result["MACD_SIGNAL_DIFF"] < 0.3 * macd_signal_diff_max_20_days
-        )  # Below 30% of maximum
-    )
-
-    # Apply the conditions to calculate buy/sell signals
-    result["BUY_2"] = np.where(
-        buy_condition,
-        np.abs(
-            (result["MACD_SIGNAL_DIFF"] - 0.3 * macd_signal_diff_min_20_days)
-            / (0.3 * macd_signal_diff_min_20_days)
-        ),
-        np.where(
-            sell_condition,
-            -np.abs(
-                (result["MACD_SIGNAL_DIFF"] - 0.3 * macd_signal_diff_max_20_days)
-                / (0.3 * macd_signal_diff_max_20_days)
-            ),
-            0,  # Default
-        ),
-    )
-
-    periods = [
-        9,
-        14,
-        21,
-        50,
-        126,
-        200,
-        252,
-    ]  # 2 weeks, 3 weeks, 1 month and 2.5 months
-    # TODO: we could add more long-term horizons: 126 days (6 months), 200 days (9 months) and 252 days (1 year)
-
-    features = []
-    for period in periods:
-        logger.debug(f"Computing period features for {period} days...")
-
-        features.append(
-            pd.DataFrame(
-                {
-                    f"CUMUL_RET_{period}": cumulative_return(df, period=period),
-                    f"SMA_{period}": sma(df, period=period),
-                    f"EMA_{period}": ema(df, period=period),
-                    f"VOLATILITY_{period}": volatility(df, period=period),
-                    f"ADX_{period}": adx(df, period=period),
-                    f"ATR_{period}": atr(df, period=period),
-                    f"CMF_{period}": chaikin_money_flow(df, period=period),
-                    f"RSI_{period}": rsi(df, period=period),
-                    f"MFI_{period}": mfi(df, period=period),
-                },
-                index=df.index,
-            )
-        )
+        # target encoding
+        self.data = self.generate_target_encodings()

-        #
-
-        features.append(
-            pd.DataFrame(
-                {
-                    f"%K_{period}": k,
-                    f"%D_{period}": d,
-                },
-                index=df.index,
-            )
-        )
-
-    result = pd.concat([result] + features, axis=1)
-
-    # third buy/sell signal : RSI is overbought >0.7 / oversold <0.3
-    result["BUY_3"] = np.where(
-        result["RSI_14"] <= 30,
-        (30 - result["RSI_14"]) / 30,
-        np.where(result["RSI_14"] >= 70, -(result["RSI_14"] - 70) / 30, 0),
-    )
-
-    # fourth buy/sell signal : RSI vs CLOSE divergence
-    # The RSI vs. Close divergence trading signal identifies potential reversals by detecting when the
-    # Relative Strength Index (RSI) and price (Close) move in opposite directions
-    # bullish divergence occurs when the price makes lower lows while RSI makes higher lows (potential uptrend),
-    # and bearish divergence occurs when the price makes higher highs while RSI makes lower highs (potential downtrend)
-
-    # Detect local peaks (RSI Highs) and troughs (RSI Lows) for divergence analysis
-    # Compute local maxima and minima indices
-    rsi_peak_indices = argrelextrema(result["RSI_14"].values, np.greater)[
-        0
-    ]  # RSI highs
-    rsi_trough_indices = argrelextrema(result["RSI_14"].values, np.less)[0]  # RSI lows
-
-    # Create boolean masks for peaks and troughs
-    rsi_peaks_mask = np.zeros(len(result), dtype=bool)
-    rsi_troughs_mask = np.zeros(len(result), dtype=bool)
-
-    rsi_peaks_mask[rsi_peak_indices] = True
-    rsi_troughs_mask[rsi_trough_indices] = True
-
-    # Extract peak and trough rows efficiently
-    rsi_peaks = result.loc[rsi_peaks_mask, ["CLOSE", "RSI_14"]].copy()
-    rsi_troughs = result.loc[rsi_troughs_mask, ["CLOSE", "RSI_14"]].copy()
-
-    # Compute RSI and CLOSE differences to check divergence
-    for i in [1, 2, 3]:
-        # RSI & Price difference from past peaks
-        rsi_peaks[f"RSI_PEAK_DIFF_{i}"] = rsi_peaks["RSI_14"].diff(i)
-        rsi_peaks[f"PRICE_PEAK_DIFF_{i}"] = rsi_peaks["CLOSE"].diff(i)
-
-        # RSI & Price difference from past troughs
-        rsi_troughs[f"RSI_TROUGH_DIFF_{i}"] = rsi_troughs["RSI_14"].diff(i)
-        rsi_troughs[f"PRICE_TROUGH_DIFF_{i}"] = rsi_troughs["CLOSE"].diff(i)
-
-        # Detect bearish divergence (RSI down, price up) and bullish divergence (RSI up, price down)
-        rsi_peaks[f"DIVERGENCE_{i}"] = np.where(
-            (rsi_peaks[f"RSI_PEAK_DIFF_{i}"] < 0)
-            & (rsi_peaks[f"PRICE_PEAK_DIFF_{i}"] > 0),
-            -np.abs(rsi_peaks[f"RSI_PEAK_DIFF_{i}"]),
-            np.where(
-                (rsi_peaks[f"RSI_PEAK_DIFF_{i}"] > 0)
-                & (rsi_peaks[f"PRICE_PEAK_DIFF_{i}"] < 0),
-                -np.abs(rsi_peaks[f"RSI_PEAK_DIFF_{i}"]),
-                0,
-            ),
-        )
+        # Cyclic encode dates
+        self.data = self.cyclic_encode_date()

-
-            (rsi_troughs[f"RSI_TROUGH_DIFF_{i}"] > 0)
-            & (rsi_troughs[f"PRICE_TROUGH_DIFF_{i}"] < 0),
-            np.abs(rsi_troughs[f"RSI_TROUGH_DIFF_{i}"]),
-            np.where(
-                (rsi_troughs[f"RSI_TROUGH_DIFF_{i}"] < 0)
-                & (rsi_troughs[f"PRICE_TROUGH_DIFF_{i}"] > 0),
-                np.abs(rsi_troughs[f"RSI_TROUGH_DIFF_{i}"]),
-                0,
-            ),
-        )
+        return self.data

-
-
-
-        [rsi_peaks[divergence_cols], rsi_troughs[divergence_cols]], axis=0
-    )
+    def cyclic_encode_date(self) -> pd.DataFrame:
+        """
+        Adds cyclic (sine and cosine) encoding for common date parts: day of week, day of month, and month.

-
-
+        Parameters:
+            df (pd.DataFrame): Input dataframe
+            columns (list[str]): List of datetime columns to encode
+            prefix (str): Optional prefix for new columns. If None, uses column names.

-
-
-
+        Returns:
+            pd.DataFrame: Updated dataframe with new cyclic features
+        """

+        df: pd.DataFrame = self.data
+        columns: list[str] = self.columns_date

-
-
-
-
-    save_as_csv: bool = False,
-    analytics: bool = False,
-):
-    """Main function to process the full dataset with multiple stocks
+        def cyclic_encode(series, max_value):
+            sin_values = np.sin(2 * np.pi * series / max_value)
+            cos_values = np.cos(2 * np.pi * series / max_value)
+            return sin_values, cos_values

-
-    - df (pd.DataFrame): the dataframe with ohlc data
-    - for_training (bool): whether to compute targets and for_training as data_for_training, or not.
-    """
+        for col in columns:

-
-
-
-
-
-
-
-
-    df["YEARDAY"] = df["DATE"].dt.dayofyear
-
-    # Cyclic encoding for date-like variables
-    def cyclic_encode(series, max_value):
-        sin_values = np.sin(2 * np.pi * series / max_value)
-        cos_values = np.cos(2 * np.pi * series / max_value)
-        return sin_values, cos_values
-
-    df["MONTH_sin"], df["MONTH_cos"] = cyclic_encode(df["MONTH"], 12)
-    df["DAY_sin"], df["DAY_cos"] = cyclic_encode(df["DAY"], 31)
-    df["WEEK_sin"], df["WEEK_cos"] = cyclic_encode(df["WEEK"], 52)
-    df["WEEKDAY_sin"], df["WEEKDAY_cos"] = cyclic_encode(df["WEEKDAY"], 7)
-    df["YEARDAY_sin"], df["YEARDAY_cos"] = cyclic_encode(df["YEARDAY"], 365)
-
-    # Computing residual RET and relative VOLUME
-    logger.info("Creating RET and VOLUME metrics...")
-    df["RET"] = df.groupby("STOCK")["CLOSE"].pct_change(1)
-    df["MARKET_RET"] = df.groupby("DATE")["RET"].transform("mean")
-    df["RESIDUAL_RET"] = df["RET"] - df["MARKET_RET"]
-
-    df["VOLUME_RATIO"] = (
-        df["VOLUME"]
-        / df.groupby("STOCK")["VOLUME"].rolling(20, min_periods=1).mean().values
-    )
-    df["MARKET_VOLUME_RATIO"] = df.groupby("DATE")["VOLUME_RATIO"].transform("mean")
-    df["RELATIVE_VOLUME"] = df["VOLUME_RATIO"] - df["MARKET_VOLUME_RATIO"]
-
-    logger.info("Creating historical time series metrics...")
-    periods = [
-        1,  # daily
-        2,
-        3,
-        4,
-        5,  # weekly
-        9,
-        14,
-        21,  # monthly
-        50,
-        126,
-        200,
-        252,
-    ]  # need to keep 1, 2, 3, 4, 5 for backward compatibility
-    for METRIC in ["RET", "VOLUME", "RESIDUAL_RET", "RELATIVE_VOLUME"]:
-        for i in periods:
-            df[f"{METRIC}_-{i}"] = df[METRIC].shift(i)
-
-    # Group by "STOCK" and apply the indicators for each stock
-    logger.info("Applying indicators...")
-    grouped_df = df.groupby("STOCK", group_keys=False)
-    preprocessed_df = grouped_df.apply(apply_indicators)
-
-    # Target encoding / Mean encoding for categorical features
-    # it's when you groupby a categorical feature and aggregate a target with a stat such as mean or median
-    logger.info("Computing aggregated features...")
-    statistics = ["mean", "median"]
-    gb_features = [["SECTOR", "DATE"], ["SUBINDUSTRY", "DATE"]]
-
-    # Define your base
-    target_features = ["RET", "VOLUME", "RESIDUAL_RET", "RELATIVE_VOLUME"]
-    periods = [9, 14, 21, 50]
-    indicators = [
-        "CUMUL_RET",
-        "SMA",
-        "EMA",
-        "VOLATILITY",
-        "ATR",
-        "ADX",
-        "%K",
-        "RSI",
-        "MFI",
-    ]
-    target_features += [f"{ind}_{p}" for p in periods for ind in indicators]
-
-    # Prepare to collect new columns
-    new_feature_cols = {}
-
-    # Generate features efficiently
-    for gb_feature, stat, target in product(gb_features, statistics, target_features):
-        col_name = f"{target}_{'_'.join(gb_feature)}_{stat.upper()}"
-        new_feature_cols[col_name] = preprocessed_df.groupby(gb_feature)[
-            target
-        ].transform(stat)
-
-    # Merge all at once to improve performance
-    preprocessed_df = pd.concat(
-        [preprocessed_df, pd.DataFrame(new_feature_cols)], axis=1
-    )
-
-    if for_training:
-        preprocessed_df = targets_creation(preprocessed_df)
-
-    # Descriptive Analysis
-    if analytics:
-        traditional_descriptive_analysis(preprocessed_df)
-    if save_as_csv and PYTHON_ENV == "Development":
-        preprocessed_df_to_csv = preprocessed_df.sort_values(["DATE", "STOCK"])
-        preprocessed_df_to_csv.to_csv(
-            f"{data_dir}/data_for_training.csv",
-            index=False,
-            header=True,
-        )
+            df[col] = pd.to_datetime(df[col]).dt.normalize()
+            df[f"{col}_year"] = df[col].dt.isocalendar().year
+            df[f"{col}_month"] = df[col].dt.month
+            df[f"{col}_day"] = df[col].dt.day
+            df[f"{col}_week"] = df[col].dt.isocalendar().week
+            df[f"{col}_weekday"] = df[col].dt.weekday
+            df[f"{col}_yearday"] = df[col].dt.dayofyear
+            df[col] = pd.to_datetime(df[col]).map(pd.Timestamp.toordinal)

-
-
-
-
-
-
-    ]
-
-
+            df[f"{col}_month_sin"], df[f"{col}_month_cos"] = cyclic_encode(
+                df[f"{col}_month"], 12
+            )
+            df[f"{col}_day_sin"], df[f"{col}_day_cos"] = cyclic_encode(
+                df[f"{col}_day"], 31
+            )
+            df[f"{col}_week_sin"], df[f"{col}_week_cos"] = cyclic_encode(
+                df[f"{col}_week"], 52
+            )
+            df[f"{col}_weekday_sin"], df[f"{col}_weekday_cos"] = cyclic_encode(
+                df[f"{col}_weekday"], 7
+            )
+            df[f"{col}_yearday_sin"], df[f"{col}_yearday_cos"] = cyclic_encode(
+                df[f"{col}_yearday"], 365
+            )

-
-
+            # Drop the original column TODO: not sure if we should drop it for time series
+            # df.drop(col, axis=1, inplace=True)
+
+        return df
+
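
A minimal standalone check of the sine/cosine trick used by cyclic_encode_date above (nothing package-specific is assumed):

import numpy as np
import pandas as pd

months = pd.Series([1, 6, 12])
month_sin = np.sin(2 * np.pi * months / 12)
month_cos = np.cos(2 * np.pi * months / 12)
# January (1) and December (12) land close together in (sin, cos) space,
# which is what makes the encoding "cyclic" for date parts.
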
+    def boolean_encode_columns(self) -> pd.DataFrame:
+        """
+        Applies boolean encoding to a list of columns:
+        - Leaves column as-is if already int with only 0 and 1
+        - Otherwise: sets 1 if value is present (notna), 0 if null/NaN/None
+
+        Parameters:
+            df (pd.DataFrame): Input dataframe
+            columns (list): List of column names to encode
+
+        Returns:
+            pd.DataFrame: Updated dataframe with encoded columns
+        """
+
+        df: pd.DataFrame = self.data
+        columns: list[str] = self.columns_boolean
+
+        for column in columns:
+            col = df[column]
+            if pd.api.types.is_integer_dtype(col) and set(
+                col.dropna().unique()
+            ).issubset({0, 1}):
+                continue  # already valid binary
+            df[column] = col.notna().astype(int)
+        return df
+
+    def generate_target_encodings(self) -> pd.DataFrame:
+        """
+        Generate target encoding features (e.g., mean, median) for specified targets and group-by combinations.
+
+        Parameters:
+            df (pd.DataFrame): Input dataframe
+            columns_te_groupby (list of list): Grouping keys, e.g., [["SECTOR", "DATE"], ["SUBINDUSTRY", "DATE"]]
+            columns_te_target (list): Target columns to aggregate (e.g., ["RET", "VOLUME", "RSI_14"])
+            statistics (list): List of aggregation statistics (e.g., ["mean", "median"])
+
+        Returns:
+            pd.DataFrame: Original dataframe with new encoded columns added
+        """
+
+        df: pd.DataFrame = self.data
+        columns_te_groupby: list[list[str]] = self.columns_te_groupby
+        columns_te_target: list[str] = self.columns_te_target
+        statistics: list[str] = ["mean", "median"]
+
+        df = df.copy()
+        new_feature_cols = {}
+        for group_cols, stat, target_col in product(
+            columns_te_groupby, statistics, columns_te_target
+        ):
+            col_name = f"{target_col}_{'_'.join(group_cols)}_{stat.upper()}"
+            new_feature_cols[col_name] = df.groupby(group_cols)[target_col].transform(
+                stat
+            )

-
-
-
+        # merge all at once to improve performance
+        df = pd.concat([df, pd.DataFrame(new_feature_cols)], axis=1)
+        return df
+
+    def fillna_at_training(self) -> pd.DataFrame:
+        """
+        Fill missing values in a DataFrame:
+        - Numeric columns: fill with mean
+        - Categorical columns: fill with mode
+        Handles both NaN and None.
+
+        Parameters:
+            df (pd.DataFrame): Input DataFrame
+
+        Returns:
+            pd.DataFrame: Cleaned DataFrame with missing values filled
+        """
+
+        df: pd.DataFrame = self.data.copy()
+
+        for col in df.columns:
+            missing_count = df[col].isnull().sum()
+            if missing_count > 0:
+                if pd.api.types.is_numeric_dtype(df[col]):
+                    df[col] = df[col].fillna(df[col].mean())
+                    logger.info(
+                        f"Filled {missing_count} NaN values in numeric column '{col}' with mean."
+                    )
+                else:
+                    mode = df[col].mode()
+                    if not mode.empty:
+                        mode_value = mode[0]
+                        mode_count = (df[col] == mode_value).sum()
+                        if mode_count > 100:
+                            fill_value = mode_value
+                        else:
+                            fill_value = "unknown"
+                    else:
+                        fill_value = "unknown"
+
+                    df[col] = df[col].fillna(fill_value)
+                    logger.info(
+                        f"Filled {missing_count} NaN values in categorical column '{col}' with '{fill_value}'."
+                    )
+
+        return df
+
+    def fillna_at_inference(self) -> pd.DataFrame:
+
+        df: pd.DataFrame = self.data
+
+        missing_cols = df.columns[df.isnull().any()].tolist()
+
+        if missing_cols:
+            numeric_cols = [
+                col for col in missing_cols if pd.api.types.is_numeric_dtype(df[col])
+            ]
+            non_numeric_cols = [col for col in missing_cols if col not in numeric_cols]

-
-
-
+            logger.warning(
+                f"Missing values found in inference data."
+                f"Filling with 0 for numeric columns: {numeric_cols}, "
+                f"and 'unknown' for non-numeric columns: {non_numeric_cols}"
+            )

-
-
-
+            df[numeric_cols] = df[numeric_cols].fillna(0)
+            df[non_numeric_cols] = df[non_numeric_cols].fillna("unknown")
+
+        return df
+
+
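
The groupby/transform pattern behind generate_target_encodings above, shown on a toy frame (column names are made up for the example):

import pandas as pd

toy = pd.DataFrame({"SECTOR": ["tech", "tech", "bank"], "RET": [0.01, 0.03, -0.02]})
# one new column per (group keys, statistic, target) combination
toy["RET_SECTOR_MEAN"] = toy.groupby(["SECTOR"])["RET"].transform("mean")
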
+class PreprocessFeature:
+
+    def __init__(
+        self,
+        data: pd.DataFrame,
+        dataset,
+        time_series: bool = False,
+        date_column: str | None = None,
+        group_column: str | None = None,
+        val_size: float = 0.2,
+        test_size: float = 0.2,
+        columns_pca: list[str] = [],
+        columns_onehot: list[str] = [],
+        columns_binary: list[str] = [],
+        columns_ordinal: list[str] = [],
+        columns_frequency: list[str] = [],
+        target_numbers: list = [],
+        target_clf: list = [],
+        **kwargs,
+    ):
+        self.data = data
+        self.data.columns = self.data.columns.str.upper()
+
+        self.dataset = dataset
+        self.columns_pca = columns_pca
+        self.columns_onehot = columns_onehot
+        self.columns_binary = columns_binary
+        self.columns_ordinal = columns_ordinal
+        self.columns_frequency = columns_frequency
+        self.target_numbers = target_numbers
+        self.target_clf = target_clf
+
+        self.time_series = time_series
+        self.date_column = date_column
+        self.group_column = group_column
+        self.val_size = val_size
+        self.test_size = test_size
+
+        self.dataset_dir = self.dataset.path
+        self.dataset_id = self.dataset.id
+        self.data_dir = f"{self.dataset_dir}/data"
+        self.preprocessing_dir = f"{self.dataset_dir}/preprocessing"
+
+    def run(self):
+        # Split
+        train, val, test = (
+            self.train_val_test_split_time_series()
+            if self.time_series
+            else self.train_val_test_split(
+                stratify_col=f"TARGET_{self.target_numbers[0]}"
+            )
+        )  # TODO: only stratifying first target for now

+        # PCA
+        train, pcas = self.add_pca_features(train)
+        val, _ = self.add_pca_features(test, pcas=pcas)
+        test, _ = self.add_pca_features(val, pcas=pcas)

-
-def print_missing_values(df: pd.DataFrame):
+        joblib.dump(pcas, f"{self.preprocessing_dir}/pcas.pkl")

-
-
-
+        # Encoding
+        train, transformer = self.encode_categorical_features(train)
+        val, _ = self.encode_categorical_features(
+            val,
+            transformer=transformer,
+        )
+        test, _ = self.encode_categorical_features(
+            test,
+            transformer=transformer,
         )
-    else:
-        logger.info("No missing values found")
-
-
-def plot_sector_repartition(df: pd.DataFrame):
-    """Visualise repartition of stock per sectors
-
-    Args:
-        df (pd.DataFrame): a df created with `get_data`
-    """
-    sns.barplot(
-        data=df.groupby("SECTOR")["STOCK"].nunique(),
-        orient="h",
-        order=df.groupby("SECTOR")["STOCK"]
-        .nunique()
-        .sort_values(ascending=False)
-        .index,
-    )

+        joblib.dump(self.data, f"{self.data_dir}/full.pkl")
+        joblib.dump(transformer, f"{self.preprocessing_dir}/column_transformer.pkl")
+        summary = summarize_dataframe(train)
+        summary.to_csv(f"{self.dataset_dir}/feature_summary.csv", index=False)

-
-    with pd.option_context("display.max_rows", None):
+        return train, val, test

-
-
+    def inference(self):
+        # PCA
+        pcas = joblib.load(f"{self.preprocessing_dir}/pcas.pkl")
+        data, _ = self.add_pca_features(self.data, pcas=pcas)

-    #
-
-
-
-
-        .dropna()
-        .sort_values(ascending=False)
+        # Encoding
+        transformer = joblib.load(f"{self.preprocessing_dir}/column_transformer.pkl")
+        data, _ = self.encode_categorical_features(
+            data,
+            transformer=transformer,
         )
+        return data

-
-
-
-
-
-
-        .sort_values(ascending=False)
-    )
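
PreprocessFeature.run() above delegates the non-time-series case to a two-stage stratified split; the same pattern in a self-contained sketch (sizes and target name are arbitrary here):

import pandas as pd
from sklearn.model_selection import train_test_split

df = pd.DataFrame({"x": range(100), "TARGET_1": [0, 1] * 50})
train, temp = train_test_split(df, test_size=0.4, random_state=42, stratify=df["TARGET_1"])
val, test = train_test_split(temp, test_size=0.5, random_state=42, stratify=temp["TARGET_1"])
# roughly 60/20/20, with the TARGET_1 class balance preserved in each part
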
+    def train_val_test_split_time_series(self):
+        df: pd.DataFrame = self.data
+        date_column: str = self.date_column
+        group_column: str = self.group_column
+        val_size: float = self.val_size
+        test_size: float = self.test_size

-
-
+        if not date_column:
+            ValueError("Please specify a date_column for time series")

-
-
+        if group_column:
+            df.sort_values([date_column, group_column], inplace=True)
+        else:
+            df.sort_values(date_column, inplace=True)

-
-        unique_stock_count = (
-            len(df[stock_column].unique()) if stock_column in df.columns else None
-        )
+        dates = df[date_column].unique()

-
-
+        val_first_id = int(len(dates) * (1 - val_size - test_size)) + 1
+        test_first_id = int(len(dates) * (1 - test_size)) + 1

-
-
+        train = df[df[date_column].isin(dates[:val_first_id])]
+        val = df[df[date_column].isin(dates[val_first_id:test_first_id])]
+        test = df[df[date_column].isin(dates[test_first_id:])]

-
+        dates = {}
+        for name, data in zip(["train", "val", "test"], [train, val, test]):
+            dates[f"{name}_start_date"] = (
+                data[date_column].map(pd.Timestamp.fromordinal).iat[0]
+            )
+            dates[f"{name}_end_date"] = (
+                data[date_column].map(pd.Timestamp.fromordinal).iat[-1]
+            )

-        if unique_stock_count is not None:
             logger.info(
-                f"
+                f"{data.shape} {name} data from {dates[f"{name}_start_date"].strftime('%d/%m/%Y')} to {dates[f"{name}_end_date"].strftime('%d/%m/%Y')}"
             )
-        else:
-            logger.info(f"\nColumn '{stock_column}' not found in the DataFrame.")

-
-
+        Dataset.update(
+            match_fields=["id"],
+            id=self.dataset_id,
+            train_size=len(train),
+            val_size=len(val),
+            test_size=len(test),
+            **dates,
         )
-
-
-
+        return (
+            train.reset_index(drop=True),
+            val.reset_index(drop=True),
+            test.reset_index(drop=True),
         )

-
+    def train_val_test_split(
+        self,
+        random_state: int = 42,
+        stratify_col: str | None = None,
+    ) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
+        """
+        Splits a DataFrame into train, validation, and test sets.
+
+        Parameters:
+            df (pd.DataFrame): The full dataset
+            val_size (float): Proportion of validation set (default 0.1)
+            test_size (float): Proportion of test set (default 0.1)
+            random_state (int): Random seed for reproducibility
+            stratify_col (str | None): Optional column to stratify on (for classification tasks)
+
+        Returns:
+            Tuple of (train_df, val_df, test_df)
+        """
+        df: pd.DataFrame = self.data
+        val_size: float = self.val_size
+        test_size: float = self.test_size
+
+        stratify_vals = df[stratify_col] if stratify_col else None
+
+        # First split: train + (val + test)
+        train, temp = train_test_split(
+            df,
+            test_size=val_size + test_size,
+            random_state=random_state,
+            stratify=stratify_vals,
+        )

-
+        # Adjust stratify target for val/test split
+        stratify_temp = temp[stratify_col] if stratify_col else None

+        # Compute val and test sizes relative to temp
+        val_ratio = val_size / (val_size + test_size)

-
-
-
-
-
-    )
-    """
-    Function to visualize local maxima and minima for a given stock in the data.
+        val, test = train_test_split(
+            temp,
+            test_size=1 - val_ratio,
+            random_state=random_state,
+            stratify=stratify_temp,
+        )

-
-
-    - stock: str, the stock identifier to analyze (e.g., 'AAPL', 'GOOG')
-    - days_before_last: int, number of days before the last date in the dataset to visualize
-    - local_max_order: int, the window size for identifying local extrema (default: 5)
-    """
+        for name, data in zip(["train", "val", "test"], [train, val, test]):
+            logger.info(f"{data.shape} {name} data")

-
-
-
-
-    # Find local maxima (argrelextrema with np.greater) for each stock
-    local_max_CLOSE = (
-        data[data["STOCK"] == stock]
-        .set_index("DATE")["CLOSE"]
-        .iloc[
-            argrelextrema(
-                data[data["STOCK"] == stock]["CLOSE"].values,
-                np.greater,
-                order=local_max_order,
-            )
-        ]
-        .reset_index()
-    )
-
-    # Find local minima (argrelextrema with np.less) for each stock
-    local_min_CLOSE = (
-        data[data["STOCK"] == stock]
-        .set_index("DATE")["CLOSE"]
-        .iloc[
-            argrelextrema(
-                data[data["STOCK"] == stock]["CLOSE"].values,
-                np.less,
-                order=local_max_order,
-            )
-        ]
-        .reset_index()
-    )
-
-    # Filter maxima based on stock and date range
-    local_max_CLOSE = local_max_CLOSE[local_max_CLOSE["DATE"] >= start_date]
-
-    # Filter minima based on stock and date range
-    local_min_CLOSE = local_min_CLOSE[local_min_CLOSE["DATE"] >= start_date]
-
-    # logger.info the maxima and minima dates
-    logger.info(
-        f"Maxima Dates for Stock {stock}: {list(local_max_CLOSE['DATE'].values)}"
-    )
-    logger.info(
-        f"Minima Dates for Stock {stock}: {list(local_min_CLOSE['DATE'].values)}"
-    )
-
-    # Plot the stock's CLOSE prices within the specified date range
-    stock_data = data[(data["STOCK"] == stock) & (data["DATE"] >= start_date)][
-        ["CLOSE", "DATE"]
-    ].set_index("DATE")
-
-    plt.figure(figsize=(10, 6))
-    stock_data.plot(color="black", title=f"Stock {stock} Extremas")
-
-    # Add vertical lines for maxima
-    for date in local_max_CLOSE["DATE"].values:
-        plt.axvline(
-            x=date,
-            color="red",
-            label="Maxima" if date == local_max_CLOSE["DATE"].values[0] else "",
+        return (
+            train.reset_index(drop=True),
+            val.reset_index(drop=True),
+            test.reset_index(drop=True),
         )

-    #
-
-
-
-
-
+    # embedding and pca
+    def add_pca_features(
+        self, df: pd.DataFrame, n_components: int = 5, pcas=None
+    ) -> tuple[pd.DataFrame, dict]:
+        """
+        Adds PCA components as new columns to a DataFrame from a column containing numpy arrays.
+        NEED TRAIN/TEST SPLIT BEFORE APPLYING - LIKE ENCODING CATEGORICAL VARIABLES
+
+        Parameters:
+            df (pd.DataFrame): Input DataFrame
+            column (str): Name of the column containing np.ndarray
+            n_components (int): Number of PCA components to keep
+
+        Returns:
+            pd.DataFrame: DataFrame with new PCA columns added
+        """
+        columns: list[str] = self.columns_pca
+
+        pcas_dict = {}
+        for column in columns:
+            # Convert text to embeddings if necessary
+            if not isinstance(df[column].iloc[0], (np.ndarray, list)):
+                sentences = df[column].astype(str).tolist()
+                logger.info(
+                    f"Total sentences to embed for column {column}: {len(sentences)}"
+                )
+
+                # Truncate each sentence
+                truncate_sentences = [truncate_text(sentence) for sentence in sentences]
+
+                # embedding
+                embedding_matrix = get_openai_embeddings(truncate_sentences)
+            else:
+                logger.info(f"Column {column} is already embeddings")
+                # Stack the vectors into a 2D array
+                embedding_matrix = np.vstack(df[column].values)
+
+            # Apply PCA
+            if pcas:
+                pca = pcas[column]
+                pca_features = pca.transform(embedding_matrix)
+            else:
+                pca = PCA(n_components=n_components)
+                pca_features = pca.fit_transform(embedding_matrix)
+
+            # Add PCA columns
+            for i in range(n_components):
+                df[f"{column}_pca_{i+1}"] = pca_features[:, i]
+
+            # Drop the original column
+            df.drop(column, axis=1, inplace=True)
+            pcas_dict.update({column: pca})
+
+        return df, pcas_dict
+
+    # encoding categorical features
+    def encode_categorical_features(
+        self,
+        df: pd.DataFrame,
+        transformer: ColumnTransformer | None = None,
+    ) -> tuple[pd.DataFrame, ColumnTransformer]:
+        """
+        Encodes categorical columns using one-hot, binary, ordinal, and frequency encoding.
+
+        Parameters:
+            df (pd.DataFrame): Input DataFrame
+            columns_onehot (list[str]): creates one binary column per category, for low-cardinality categorical features
+            columns_binary (list[str]): converts categories into binary and splits bits across columns, for mid-to-high cardinality (e.g., 10–100 unique values)
+            columns_ordinal (list[str]): assigns integer ranks to categories, when order matters (e.g., low < medium < high)
+            columns_frequency (list[str]): replaces each category with its frequency count, normalized to a proportion, for high-cardinality features where frequency carries meaning
+            transformer (ColumnTransformer, optional): if provided, applies transform only
+
+        Returns:
+            tuple: (transformed DataFrame, ColumnTransformer)
+        """
+        columns_onehot: list[str] = self.columns_onehot
+        columns_binary: list[str] = self.columns_binary
+        columns_ordinal: list[str] = self.columns_ordinal
+        columns_frequency: list[str] = self.columns_frequency
+
+        X = df.loc[:, ~df.columns.str.contains("^TARGET_")]
+        y = df.loc[:, df.columns.str.contains("^TARGET_")]
+        save_in_db = False
+
+        all_columns = (
+            columns_onehot + columns_binary + columns_ordinal + columns_frequency
         )

-
-
-
-
-
-
-
-
-
-
-
+        if transformer:
+            transformed = transformer.transform(X)
+        else:
+            transformer = ColumnTransformer(
+                transformers=[
+                    (
+                        "onehot",
+                        OneHotEncoder(handle_unknown="ignore", sparse_output=False),
+                        columns_onehot,
+                    ),
+                    (
+                        "ordinal",
+                        OrdinalEncoder(
+                            handle_unknown="use_encoded_value", unknown_value=-1
+                        ),
+                        columns_ordinal,
+                    ),
+                    ("binary", BinaryEncoder(handle_unknown="value"), columns_binary),
+                    ("freq", CountEncoder(normalize=True), columns_frequency),
+                ],
+                remainder="passthrough",
+            )
+            transformed = transformer.fit_transform(X)
+            save_in_db = True

-
-
-    - stock: str, the stock identifier to analyze (e.g., 'AAPL', 'GOOG')
-    - days_before_last: int, number of days before the last date in the dataset to visualize
-    """
+        # Build output column names
+        column_names = []

-
-
-
+        if columns_onehot:
+            column_names.extend(
+                transformer.named_transformers_["onehot"]
+                .get_feature_names_out(columns_onehot)
+                .tolist()
+            )

-
-
+        if columns_ordinal:
+            column_names.extend(columns_ordinal)

-
-
-
+        if columns_binary:
+            column_names.extend(
+                transformer.named_transformers_["binary"]
+                .get_feature_names_out(columns_binary)
+                .tolist()
+            )

-
-
+        if columns_frequency:
+            column_names.extend(columns_frequency)

-
-
-
-            stock_data.loc[stock_data["TARGET_11"] == signal_value, "DATE"],
-            stock_data.loc[stock_data["TARGET_11"] == signal_value, "CLOSE"],
-            color=color,
-            label=f"Signal {signal_value}",
-            s=50,  # Size of the points
-        )
+        # Add passthrough (non-encoded) columns
+        passthrough_columns = [col for col in X.columns if col not in all_columns]
+        column_names.extend(passthrough_columns)
|
|
992
641
|
|
|
993
|
-
|
|
994
|
-
plt.xlabel("Date")
|
|
995
|
-
plt.ylabel("Close Price")
|
|
996
|
-
plt.legend()
|
|
997
|
-
plt.grid(True)
|
|
998
|
-
plt.show()
|
|
999
|
-
|
|
1000
|
-
|
|
1001
|
-
def visualize_data_distribution(
|
|
1002
|
-
data,
|
|
1003
|
-
plot_type="hist",
|
|
1004
|
-
features=None,
|
|
1005
|
-
bins=50,
|
|
1006
|
-
rows=5,
|
|
1007
|
-
cols=5,
|
|
1008
|
-
width_per_plot=4,
|
|
1009
|
-
height_per_plot=3,
|
|
1010
|
-
):
|
|
1011
|
-
"""
|
|
1012
|
-
Function to visualize the data distribution for multiple features in a DataFrame with dynamic figsize,
|
|
1013
|
-
splitting into multiple figures if there are too many features for one figure.
|
|
1014
|
-
|
|
1015
|
-
Parameters:
|
|
1016
|
-
- data: pd.DataFrame, the DataFrame containing the data to visualize.
|
|
1017
|
-
- plot_type: str, the type of plot to use ('hist', 'kde', 'box').
|
|
1018
|
-
- features: list, list of features (columns) to visualize. If None, all numeric features are used.
|
|
1019
|
-
- bins: int, the number of bins for histograms (default: 50).
|
|
1020
|
-
- rows: int, number of rows in the subplot grid (default: 5).
|
|
1021
|
-
- cols: int, number of columns in the subplot grid (default: 5).
|
|
1022
|
-
- width_per_plot: int, the width of each subplot (default: 4).
|
|
1023
|
-
- height_per_plot: int, the height of each subplot (default: 3).
|
|
1024
|
-
"""
|
|
642
|
+
X_transformed = pd.DataFrame(transformed, columns=column_names, index=df.index)
|
|
1025
643
|
|
|
1026
|
-
|
|
1027
|
-
|
|
1028
|
-
features = data.select_dtypes(include=[np.number]).columns.tolist()
|
|
644
|
+
# Try to convert columns to best possible dtypes
|
|
645
|
+
X_transformed = X_transformed.convert_dtypes()
|
|
1029
646
|
|
|
1030
|
-
|
|
1031
|
-
|
|
1032
|
-
|
|
1033
|
-
|
|
1034
|
-
|
|
647
|
+
# Insert features in db
|
|
648
|
+
if save_in_db:
|
|
649
|
+
# TODO: in bulk
|
|
650
|
+
categorical_features, numerical_features = get_features_by_types(
|
|
651
|
+
X_transformed
|
|
652
|
+
)
|
|
653
|
+
for feature in categorical_features:
|
|
654
|
+
Feature.upsert(match_fields=["name"], name=feature, type="categorical")
|
|
655
|
+
for feature in numerical_features:
|
|
656
|
+
Feature.upsert(match_fields=["name"], name=feature, type="numerical")
|
|
657
|
+
for target in y.columns:
|
|
658
|
+
target_number = int(target.split("_")[1])
|
|
659
|
+
type = (
|
|
660
|
+
"classification"
|
|
661
|
+
if target_number in self.target_clf
|
|
662
|
+
else "regression"
|
|
663
|
+
)
|
|
664
|
+
# TODO: what about description here ?
|
|
665
|
+
Target.upsert(match_fields=["name", "type"], name=target, type=type)
|
|
666
|
+
|
|
667
|
+
return pd.concat([X_transformed, y], axis=1), transformer
|
|
668
|
+
|
|
669
|
+
|
|
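Note: `encode_categorical_features` fits the ColumnTransformer when none is passed and returns it, so the exact same encoding can be replayed on held-out data. A minimal usage sketch; the `fe` instance and the DataFrame names are assumptions for illustration only.

    # Fit the encoders on the training split; reuse the returned transformer on the test split
    train_encoded, transformer = fe.encode_categorical_features(train_df)
    test_encoded, _ = fe.encode_categorical_features(test_df, transformer=transformer)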
+# analysis & utils
+def summarize_dataframe(
+    df: pd.DataFrame, sample_categorical_threshold: int = 15
+) -> pd.DataFrame:
+    summary = []
+
+    def is_hashable_series(series: pd.Series) -> bool:
+        try:
+            _ = series.dropna().unique()
+            return True
+        except TypeError:
+            return False

-
-
-        # Subset of features for the current figure
-        subset_features = features[start : start + plots_per_figure]
+    df = convert_object_columns_that_are_numeric(df)
+    df = df.convert_dtypes()

-
-
-
-
-        figsize = (grid_cols * width_per_plot, grid_rows * height_per_plot)
+    for col in df.columns:
+        total_missing = df[col].isna().sum()
+        col_data = df[col].dropna()
+        dtype = col_data.dtype

-
-
-
+        if col_data.empty:
+            summary.append(
+                {
+                    "Column": col,
+                    "Dtype": dtype,
+                    "Type": "unknown",
+                    "Detail": "No non-null values",
+                    "Missing": total_missing,
+                }
+            )
+            continue
+
+        # Case 1: Numeric columns
+        if pd.api.types.is_numeric_dtype(col_data):
+            unique_vals = col_data.nunique()
+
+            if set(col_data.unique()).issubset({0, 1}):
+                col_type = "binary-categorical"
+                detail = "0/1 values only"
+            elif (
+                pd.api.types.is_integer_dtype(col_data)
+                and unique_vals <= sample_categorical_threshold
+            ):
+                col_type = "multi-categorical"
+                top_vals = col_data.value_counts().head(10)
+                detail = ", ".join(f"{k} ({v})" for k, v in top_vals.items())
+            else:
+                col_type = "numeric"
+                q = col_data.quantile([0, 0.25, 0.5, 0.75, 1])
+                detail = (
+                    f"Min: {q.iloc[0]:.2f}, Q1: {q.iloc[1]:.2f}, Median: {q.iloc[2]:.2f}, "
+                    f"Q3: {q.iloc[3]:.2f}, Max: {q.iloc[4]:.2f}"
+                )
+
+        # Case 2: Object or other hashable columns
+        elif is_hashable_series(col_data):
+            unique_vals = col_data.nunique()
+            if unique_vals <= sample_categorical_threshold:
+                col_type = "object-categorical"
+                top_vals = col_data.value_counts().head(10)
+                detail = ", ".join(f"{k} ({v})" for k, v in top_vals.items())
+            else:
+                col_type = "high-cardinality-categorical"
+                detail = f"{unique_vals} unique values"
+
+        # Case 3: Unusable columns
+        else:
+            col_type = "non-hashable"
+            detail = f"Non-hashable type: {type(col_data.iloc[0])}"
+
+        summary.append(
+            {
+                "Column": col,
+                "Dtype": dtype,
+                "Type": col_type,
+                "Detail": detail,
+                "Missing": total_missing,
+            }
+        )

-
-        for i, feature in enumerate(subset_features):
-            ax = axes[i]
+    return pd.DataFrame(summary)

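Note: `summarize_dataframe` returns one row per column with an inferred type, a short detail string, and the missing count. A small self-contained example; the toy data is invented for illustration.

    import pandas as pd

    toy = pd.DataFrame(
        {"price": [1.2, 3.4, None, 5.0], "sector": ["tech", "bank", "tech", "bank"]}
    )
    summary = summarize_dataframe(toy, sample_categorical_threshold=15)
    print(summary[["Column", "Type", "Detail", "Missing"]])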
-            if plot_type == "hist":
-                sns.histplot(data[feature].dropna(), bins=bins, kde=False, ax=ax)
-            elif plot_type == "kde":
-                sns.kdeplot(data[feature].dropna(), ax=ax, fill=True)
-            elif plot_type == "box":
-                sns.boxplot(data[feature].dropna(), ax=ax)

-
-
+def convert_object_columns_that_are_numeric(df: pd.DataFrame) -> pd.DataFrame:
+    """
+    Detect object columns that can be safely converted to numeric (float or int).

-
-
-
+    Returns:
+        The DataFrame with those mostly-numeric object columns converted to numeric dtypes.
+    """

-
-    fig.tight_layout()
+    numeric_candidates = []

-
-
+    for col in df.select_dtypes(include=["object"]).columns:
+        try:
+            converted = pd.to_numeric(df[col], errors="coerce")
+            if converted.notna().sum() / len(df) > 0.9:  # at least 90% convertible
+                numeric_candidates.append(col)
+        except Exception:
+            continue

+    for col in numeric_candidates:
+        df[col] = pd.to_numeric(df[col], errors="coerce")

-
-    """
-    Detect outliers in a DataFrame using the Interquartile Range (IQR) method.
+    return df

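Note: the helper above only converts an object column when at least 90% of its values parse as numbers; non-parsable values in a converted column become NaN. A small example, with toy data invented for illustration.

    import pandas as pd

    toy = pd.DataFrame({"amount": ["1.5", "2.0", "3.25"], "city": ["Paris", "Lyon", "Nice"]})
    toy = convert_object_columns_that_are_numeric(toy)
    print(toy.dtypes)  # "amount" becomes numeric, "city" stays object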
-    Parameters:
-    - data: pd.DataFrame, the DataFrame in which to detect outliers.

-
-
-
-    outliers = pd.DataFrame(index=data.index)
+def traditional_descriptive_analysis(df: pd.DataFrame, group_column: str | None = None):
+    with pd.option_context("display.max_rows", None):
+        results = {}

-
-
-        Q3 = data[column].quantile(0.75)  # 3rd quartile (75th percentile)
-        IQR = Q3 - Q1  # Interquartile range
+        # Shape
+        results["Shape"] = f"{df.shape[0]} rows × {df.shape[1]} columns"

-
-
+        # Duplicated rows
+        results["Duplicated rows"] = int(df.duplicated().sum())

-    #
-
+        # Duplicated columns
+        duplicated_cols = df.T[df.T.duplicated()].index.tolist()
+        results["Duplicated columns"] = (
+            ", ".join(duplicated_cols) if len(duplicated_cols) > 0 else "None"
+        )

-
+        # Missing values
+        missing = df.isnull().sum()
+        missing = missing[missing > 0].sort_values(ascending=False)
+        if len(missing) > 0:
+            results["Missing values"] = missing.to_frame("Missing Count").to_markdown()
+        else:
+            results["Missing values"] = "No missing values"
+
+        # Infinite values
+        inf = df.replace([np.inf, -np.inf], np.nan)
+        inf_count = inf.isnull().sum() - df.isnull().sum()
+        inf_count = inf_count[inf_count > 0].sort_values(ascending=False)
+        if len(inf_count) > 0:
+            results["Infinite values"] = inf_count.to_frame("Inf Count").to_markdown()
+        else:
+            results["Infinite values"] = "No infinite values"

+        # Constant columns
+        constant_cols = [col for col in df.columns if df[col].nunique() == 1]
+        results["Constant columns"] = (
+            ", ".join(constant_cols) if len(constant_cols) > 0 else "None"
+        )

-
-
+        # Data types
+        dtypes = df.dtypes.astype(str).sort_index()
+        results["Data types"] = dtypes.to_frame("Type").to_markdown()

-
-
+        # Unique values in group_column
+        if group_column is not None:
+            if group_column in df.columns:
+                results[f"Unique values in '{group_column}'"] = int(
+                    df[group_column].nunique()
+                )
+            else:
+                results[f"Unique values in '{group_column}'"] = (
+                    f"❌ Column '{group_column}' not found"
+                )

-
-
+        # Log all results
+        for title, content in results.items():
+            print(f"\n### {title}\n{content}")

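Note: `traditional_descriptive_analysis` prints its report (shape, duplicates, missing and infinite values, constant columns, dtypes) rather than returning a value. A usage sketch; the DataFrame and the "STOCK" group column are assumptions for illustration.

    traditional_descriptive_analysis(df, group_column="STOCK")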
-    logger.info("nb of outliers")
-    outliers = detect_outliers_iqr(df.select_dtypes(include=["float64"]), degree=5)

-
-    logger.info(outliers.sum().sort_values(ascending=False))
+def print_missing_values(df: pd.DataFrame):

-
-
-
+    if len(df.isnull().sum().where(df.isnull().sum() != 0).dropna()):
+        logger.info(
+            f"Missing values : \n{df.isnull().sum().where(df.isnull().sum() != 0).dropna().sort_values(ascending=False).to_string()}"
+        )
+    else:
+        logger.info("No missing values found")
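Note: `print_missing_values` logs a per-column count of missing values, or a short message when there are none. Usage sketch; the `df` DataFrame is an assumption for illustration.

    print_missing_values(df)  # logs the sorted missing-value counts, or "No missing values found"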