lecrapaud 0.4.0__py3-none-any.whl → 0.4.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Note: this version of lecrapaud has been flagged as potentially problematic.

Files changed (42)
  1. lecrapaud/__init__.py +1 -0
  2. lecrapaud/api.py +277 -0
  3. lecrapaud/config.py +10 -0
  4. lecrapaud/db/__init__.py +1 -0
  5. lecrapaud/db/alembic/env.py +2 -2
  6. lecrapaud/db/alembic/versions/2025_05_31_1834-52b809a34371_make_nullablee.py +24 -12
  7. lecrapaud/db/alembic/versions/2025_06_17_1652-c45f5e49fa2c_make_fields_nullable.py +89 -0
  8. lecrapaud/db/alembic.ini +116 -0
  9. lecrapaud/db/models/__init__.py +10 -10
  10. lecrapaud/db/models/base.py +176 -1
  11. lecrapaud/db/models/dataset.py +25 -20
  12. lecrapaud/db/models/feature.py +5 -6
  13. lecrapaud/db/models/feature_selection.py +3 -4
  14. lecrapaud/db/models/feature_selection_rank.py +3 -4
  15. lecrapaud/db/models/model.py +3 -4
  16. lecrapaud/db/models/model_selection.py +15 -8
  17. lecrapaud/db/models/model_training.py +15 -7
  18. lecrapaud/db/models/score.py +9 -6
  19. lecrapaud/db/models/target.py +16 -8
  20. lecrapaud/db/session.py +68 -0
  21. lecrapaud/experiment.py +64 -0
  22. lecrapaud/feature_engineering.py +747 -1022
  23. lecrapaud/feature_selection.py +915 -998
  24. lecrapaud/integrations/openai_integration.py +225 -0
  25. lecrapaud/jobs/__init__.py +2 -2
  26. lecrapaud/jobs/config.py +1 -1
  27. lecrapaud/jobs/scheduler.py +1 -1
  28. lecrapaud/jobs/tasks.py +6 -6
  29. lecrapaud/model_selection.py +1060 -960
  30. lecrapaud/search_space.py +4 -0
  31. lecrapaud/utils.py +2 -2
  32. lecrapaud-0.4.2.dist-info/METADATA +177 -0
  33. {lecrapaud-0.4.0.dist-info → lecrapaud-0.4.2.dist-info}/RECORD +36 -35
  34. {lecrapaud-0.4.0.dist-info → lecrapaud-0.4.2.dist-info}/WHEEL +1 -1
  35. lecrapaud/db/crud.py +0 -179
  36. lecrapaud/db/services.py +0 -0
  37. lecrapaud/db/setup.py +0 -58
  38. lecrapaud/predictions.py +0 -292
  39. lecrapaud/training.py +0 -151
  40. lecrapaud-0.4.0.dist-info/METADATA +0 -103
  41. /lecrapaud/{directory_management.py → directories.py} +0 -0
  42. {lecrapaud-0.4.0.dist-info → lecrapaud-0.4.2.dist-info}/LICENSE +0 -0
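The bulk of this release is the rewrite of lecrapaud/feature_engineering.py shown below: the stock-specific script (indicator computation, target creation, plotting helpers) is replaced by two generic classes, FeatureEngineeringEngine and PreprocessFeature. For orientation before reading the diff, here is a minimal usage sketch of the first class, based only on the constructor signature and run() method visible in the diff; the toy dataframe and column names are invented for illustration.

import pandas as pd
from lecrapaud.feature_engineering import FeatureEngineeringEngine

# Hypothetical toy frame; the column names below are not part of the package.
df = pd.DataFrame({
    "internal_id": [1, 2, 3, 4],
    "has_website": ["yes", None, "yes", None],
    "created_at": pd.to_datetime(["2024-01-02", "2024-02-10", "2024-03-05", "2024-03-20"]),
    "country": ["FR", "DE", "FR", "US"],
    "amount": [10.0, None, 30.0, 40.0],
})

engine = FeatureEngineeringEngine(
    data=df,
    columns_drop=["internal_id"],      # dropped outright
    columns_boolean=["has_website"],   # becomes 1 if a value is present, 0 if null
    columns_date=["created_at"],       # cyclic (sin/cos) encoding of date parts
    columns_te_groupby=[["country"]],  # group-by keys for target/mean encoding
    columns_te_target=["amount"],      # columns aggregated per group
    for_training=True,                 # fill NaNs with mean/mode rather than 0/"unknown"
)
engineered = engine.run()

The engineered frame is then meant to be handed to PreprocessFeature (defined further down in the diff), which takes a lecrapaud.db.Dataset record plus split and encoder configuration and returns train/val/test splits; its categorical-encoding step is sketched separately after the diff.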
@@ -1,1119 +1,844 @@
1
- #!/usr/bin/env python
2
- # coding: utf-8
1
+ """
2
+ Feature engineering module for data preprocessing and transformation.
3
3
 
4
- import pandas as pd
5
- import numpy as np
6
- import joblib
7
- from datetime import datetime
8
- import matplotlib.pyplot as plt
9
- import seaborn as sns
10
- from scipy.signal import argrelextrema
11
- from itertools import product
12
- import os
13
- from collections import defaultdict
14
-
15
- from src.config import PYTHON_ENV
16
- from src.utils import logger
17
- from src.directory_management import data_dir
18
- from src.services.indicators import (
19
- rsi,
20
- macd,
21
- bollinger_bands,
22
- adx,
23
- atr,
24
- stochastic,
25
- mfi,
26
- ichimoku_cloud,
27
- parabolic_sar,
28
- chaikin_money_flow,
29
- pivot_points,
30
- sma,
31
- ema,
32
- volatility,
33
- cumulative_return,
34
- close_diff,
35
- obv,
36
- pressure,
37
- )
38
- from src.db.models import Target
4
+ Process
5
+ -------
6
+ FEAT ENG
7
+ - use business_analysis > get_table_summary to see which fields are more than 90% null
8
+ - use remove_constant_columns to drop constant columns
9
+ - use summarize_dataframe to drop further useless columns (dates, ids, data not available at prediction time, misc not useful)
10
+ - cast to numeric whatever can be cast to numeric
39
11
 
12
+ - define columns_boolean
13
+ - define groupby_columns_list and target_column for the target encoding
14
+ - create the target(s)
15
+ - define columns_pca
16
+ - define columns_one_hot, columns_binary, columns_ordinal, columns_frequency
40
17
 
41
- # pd print options
42
- # pd.set_option("display.max_columns", None)
43
- # pd.reset_option("display.max_rows")
44
- # pd.set_option("display.max_colwidth", None)
45
18
 
19
+ Todo
20
+ ----
21
+ - DONE: drop meaningless identifier columns
22
+ - DONE: PCA on embedding of deck
23
+ - DONE: maybe cyclic encoding for date columns
46
24
 
47
- # Main function to create targets
48
- def targets_creation(
49
- df: pd.DataFrame,
50
- top_x_stock: float = 0.1,
51
- local_max_order: int = 10,
52
- threshold: int = 5,
53
- ):
54
- """Preprocessing the stock data from yfinance
25
+ - DONE: ordinal/label encode (only 1 column) for tree based method when not too big number of categories
26
+ - DONE: frequency encoding for some categorical columns
27
+ - DONE: one hot encoding for categorical columns
28
+ - DONE: binary encoding if big number of category
55
29
 
56
- Args:
57
- df (pd.DataFrame): a dataframe obtain with `get_data` function
58
- top_x_stock (float): the % at which you are considered top ranked stock for the day
59
- local_max_order (int): this set up the window to look at on both side of the extrema : the greater, the more 'global' is the extrema.
30
+ - DONE: create other other embedding column for textual data ?
31
+ - DONE: create some boolean like has_website, has_linkedin_company_url, etc...
60
32
 
61
- Returns:
62
- df with more columns:
63
- - date variables : we create YEAR, MONTH, DAY, WEEK, WEEKDAY, YEARWEEK and YEARDAY features
64
- - return, market return, residual return and similar computation with volume are done to create 6 new features
65
- - target variables :
66
- - TARGET_1 : next day return
67
- - TARGET_2 : categorical return (positive 1, or negative 0)
68
- - TARGET_3 : next day ranking from best (1) to worst (n_stock) returns
69
- - TARGET_4 : categorical next day top ranking (in top_x_stock) (1), or not (0)
70
- - TARGET_5, TARGET_6, TARGET_7, TARGET_8 : same but with residual return
71
- - TARGET_9 : categorical with 1 if it's a local maximum and 0 if not
72
- - TARGET_10 : categorical with 1 if it's a local minimum and 0 if not
73
- - TARGET 11 : We will create trading signals based on proximity to local minima and maxima : need multi-binary loss support
74
- - TARGET 12, 13, 14 : return in 9,14,21 days
33
+ - target/mean encoding with a groupby on a very interesting categorical column
34
+ - do "real" target encoding on the target variable, e.g. with leave-one-out encoding?
75
35
 
36
+ - better categorize some fields, e.g. country? for sourcing we have position, ext_position, company, ext_company, country, source, but only country is relevant here
76
37
 
77
- """
78
38
 
79
- # Creating targets
80
- logger.info("Creating target variables...")
81
-
82
- # TARGET 1-4 : We start with target RET
83
- target = "RET"
84
- stock_column = "STOCK"
85
- nb_of_stocks = len(df[stock_column].unique())
86
-
87
- first_x_percent = max(int(nb_of_stocks * top_x_stock), 1)
88
-
89
- df["TARGET_1"] = df[target].shift(-1)
90
- df["TARGET_2"] = np.select([df["TARGET_1"] <= 0, df["TARGET_1"] > 0], [0, 1])
91
- df["TARGET_3"] = df.groupby("DATE")["TARGET_1"].rank(
92
- method="first", ascending=False
93
- )
94
- df["TARGET_4"] = np.select(
95
- [
96
- df.groupby("DATE")["TARGET_1"].rank(method="first", ascending=False)
97
- <= first_x_percent
98
- ],
99
- [1],
100
- default=0,
101
- )
102
-
103
- # TARGET 5-8 : We do the same for RESIDUAL_RET
104
- target = "RESIDUAL_RET"
105
-
106
- df["TARGET_5"] = df[target].shift(-1)
107
- df["TARGET_6"] = np.select([df["TARGET_5"] <= 0, df["TARGET_5"] > 0], [0, 1])
108
- df["TARGET_7"] = df.groupby("DATE")["TARGET_5"].rank(
109
- method="first", ascending=False
110
- )
111
- df["TARGET_8"] = np.select(
112
- [
113
- df.groupby("DATE")["TARGET_5"].rank(method="first", ascending=False)
114
- <= first_x_percent
115
- ],
116
- [1],
117
- default=0,
118
- )
119
-
120
- # TARGET 9-10 : Let's look at local min and max : it can be interpretate as buy and sell signal respectively
121
- target = "CLOSE"
122
-
123
- df["TARGET_9"] = 0
124
- df["TARGET_10"] = 0
125
-
126
- # Calculate local maxima and set TARGET_9 to 1 where maxima are found
127
- maxima_indices = df.groupby(stock_column)[target].transform(
128
- lambda x: x.index.isin(
129
- x.iloc[argrelextrema(x.values, np.greater, order=local_max_order)].index
130
- )
131
- )
39
+ Development
40
+ -----------
41
+ - use PCA to determine how many variables explain the variance, to set max_feature for feature selection
42
+ - could be nice to get linkedin info of founders (need to search reps in rails first) - and score !
43
+ - add created_from, utm_source, referrer once we have more data
44
+ - could be nice to get team_count, or dealroom info but at the moment of submission...
45
+ """
132
46
 
133
- minima_indices = df.groupby(stock_column)[target].transform(
134
- lambda x: x.index.isin(
135
- x.iloc[argrelextrema(x.values, np.less, order=local_max_order)].index
136
- )
137
- )
138
-
139
- df.loc[maxima_indices, "TARGET_9"] = 1
140
- df.loc[minima_indices, "TARGET_10"] = 1
141
-
142
- # TARGET 11 : We will create trading signals based on proximity to local minima and maxima.
143
- df["TARGET_11"] = 2 # Default value for HOLD
144
-
145
- # Function to detect local minima and maxima, and assign signals
146
- def assign_signals(group):
147
- close_prices = group[target].values
148
- dates = group["DATE"].values
149
-
150
- # Detect local maxima and minima using argrelextrema
151
- local_maxima_idx = argrelextrema(
152
- close_prices, np.greater, order=local_max_order
153
- )[0]
154
- local_minima_idx = argrelextrema(close_prices, np.less, order=local_max_order)[
155
- 0
156
- ]
157
-
158
- # STRONG BUY (4) for local minima, STRONG SELL (0) for local maxima
159
- group.loc[group.index[local_minima_idx], "TARGET_11"] = 4
160
- group.loc[group.index[local_maxima_idx], "TARGET_11"] = 0
161
-
162
- # Assign BUY (3) and SELL (1) based on proximity to extrema within the threshold window
163
- for idx in local_minima_idx:
164
- # Get the actual date of the minima
165
- min_date = dates[idx]
166
- # Select the rows within the threshold window around the minima date
167
- buy_window = group.loc[
168
- (group["DATE"] >= min_date - pd.Timedelta(days=threshold))
169
- & (group["DATE"] <= min_date + pd.Timedelta(days=threshold))
170
- ]
171
- group.loc[buy_window.index, "TARGET_11"] = np.where(
172
- buy_window["DATE"] == min_date,
173
- 4,
174
- 3, # STRONG BUY at minima, BUY near minima
175
- )
176
-
177
- for idx in local_maxima_idx:
178
- # Get the actual date of the maxima
179
- max_date = dates[idx]
180
- # Select the rows within the threshold window around the maxima date
181
- sell_window = group.loc[
182
- (group["DATE"] >= max_date - pd.Timedelta(days=threshold))
183
- & (group["DATE"] <= max_date + pd.Timedelta(days=threshold))
184
- ]
185
- group.loc[sell_window.index, "TARGET_11"] = np.where(
186
- sell_window["DATE"] == max_date,
187
- 0,
188
- 1, # STRONG SELL at maxima, SELL near maxima
189
- )
47
+ import pandas as pd
48
+ import numpy as np
49
+ from itertools import product
50
+ import joblib
190
51
 
191
- return group
192
-
193
- # Apply the function to each stock group
194
- df = df.groupby(stock_column, group_keys=False).apply(assign_signals)
195
-
196
- # TARGET 12, 13, 14 : return in 9,14,21 days
197
- df["TARGET_12"] = df.groupby("STOCK")["CLOSE"].pct_change(9).shift(-9)
198
- df["TARGET_13"] = df.groupby("STOCK")["CLOSE"].pct_change(14).shift(-14)
199
- df["TARGET_14"] = df.groupby("STOCK")["CLOSE"].pct_change(21).shift(-21)
200
-
201
- # Update database
202
- # TODO: in bulk
203
- Target.upsert(
204
- match_fields=["name", "type"],
205
- name="TARGET_1",
206
- type="regression",
207
- description="Next day return",
208
- )
209
- Target.upsert(
210
- match_fields=["name", "type"],
211
- name="TARGET_2",
212
- type="classification",
213
- description="Next day return",
214
- )
215
- Target.upsert(
216
- match_fields=["name", "type"],
217
- name="TARGET_3",
218
- type="regression",
219
- description="Ranking of next day return",
220
- )
221
- Target.upsert(
222
- match_fields=["name", "type"],
223
- name="TARGET_4",
224
- type="classification",
225
- description="Top ranking of next day return",
226
- )
227
- Target.upsert(
228
- match_fields=["name", "type"],
229
- name="TARGET_5",
230
- type="regression",
231
- description="Next day residual return",
232
- )
233
- Target.upsert(
234
- match_fields=["name", "type"],
235
- name="TARGET_6",
236
- type="classification",
237
- description="Next day residual return",
238
- )
239
- Target.upsert(
240
- match_fields=["name", "type"],
241
- name="TARGET_7",
242
- type="regression",
243
- description="Ranking of next day residual return",
244
- )
245
- Target.upsert(
246
- match_fields=["name", "type"],
247
- name="TARGET_8",
248
- type="classification",
249
- description="Top ranking of next day residual return",
250
- )
251
- Target.upsert(
252
- match_fields=["name", "type"],
253
- name="TARGET_9",
254
- type="classification",
255
- description="Local maxima",
256
- )
257
- Target.upsert(
258
- match_fields=["name", "type"],
259
- name="TARGET_10",
260
- type="classification",
261
- description="Local minima",
262
- )
263
- Target.upsert(
264
- match_fields=["name", "type"],
265
- name="TARGET_11",
266
- type="classification",
267
- description="Trading signals based on proximity to local minima and maxima",
268
- )
269
- Target.upsert(
270
- match_fields=["name", "type"],
271
- name="TARGET_12",
272
- type="regression",
273
- description="Return in 9 days",
274
- )
275
- Target.upsert(
276
- match_fields=["name", "type"],
277
- name="TARGET_13",
278
- type="regression",
279
- description="Return in 14 days",
280
- )
281
- Target.upsert(
282
- match_fields=["name", "type"],
283
- name="TARGET_14",
284
- type="regression",
285
- description="Return in 21 days",
286
- )
52
+ from sklearn.compose import ColumnTransformer
53
+ from sklearn.decomposition import PCA
54
+ from category_encoders import BinaryEncoder, CountEncoder
55
+ from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
56
+ from sklearn.model_selection import train_test_split
287
57
 
288
- return df
58
+ from lecrapaud.integrations.openai_integration import (
59
+ truncate_text,
60
+ get_openai_embeddings,
61
+ )
62
+ from lecrapaud.feature_selection import get_features_by_types
63
+ from lecrapaud.utils import logger
64
+ from lecrapaud.db import Target, Feature, Dataset
65
+ from lecrapaud.config import PYTHON_ENV
289
66
 
290
67
 
291
- def calculate_option_features(option_data: list[dict], spot_price: float):
292
- puts = [opt for opt in option_data if opt["type"] == "put"]
293
- calls = [opt for opt in option_data if opt["type"] == "call"]
68
+ # main function
69
+ class FeatureEngineeringEngine:
70
+ """
71
+ Feature engineering pipeline
72
+
73
+ Params needed
74
+ -------------
75
+ data
76
+ columns_boolean
77
+ columns_date
78
+ columns_te_groupby
79
+ columns_te_target
80
+ for_training
81
+ """
294
82
 
295
- def safe_float(x):
296
- try:
297
- return float(x)
298
- except:
299
- return 0.0
300
-
301
- # Convert and clean data
302
- for opt in option_data:
303
- for key in ["strike", "volume", "open_interest", "delta", "implied_volatility"]:
304
- opt[key] = safe_float(opt.get(key, 0.0))
305
-
306
- # Put/Call ratios
307
- total_put_vol = sum(p["volume"] for p in puts)
308
- total_call_vol = sum(c["volume"] for c in calls)
309
- total_put_oi = sum(p["open_interest"] for p in puts)
310
- total_call_oi = sum(c["open_interest"] for c in calls)
311
-
312
- put_call_ratio_vol = total_put_vol / total_call_vol if total_call_vol > 0 else None
313
- put_call_ratio_oi = total_put_oi / total_call_oi if total_call_oi > 0 else None
314
-
315
- # Open Interest Skew
316
- oi_skew = sum(c["open_interest"] for c in calls if c["strike"] > spot_price) - sum(
317
- p["open_interest"] for p in puts if p["strike"] < spot_price
318
- )
319
-
320
- # Total Open Interest
321
- total_oi = sum(opt["open_interest"] for opt in option_data)
322
-
323
- # Delta-weighted Put/Call Ratio
324
- dw_put = sum(p["delta"] * p["volume"] for p in puts)
325
- dw_call = sum(c["delta"] * c["volume"] for c in calls)
326
- delta_weighted_pcr = dw_put / dw_call if dw_call > 0 else None
327
-
328
- # ATM IV
329
- atm_option = min(option_data, key=lambda x: abs(x["strike"] - spot_price))
330
- atm_iv = atm_option["implied_volatility"]
331
-
332
- # IV Skew (25-delta)
333
- iv_put_25d = np.mean(
334
- [p["implied_volatility"] for p in puts if abs(p["delta"] + 0.25) < 0.05]
335
- )
336
- iv_call_25d = np.mean(
337
- [c["implied_volatility"] for c in calls if abs(c["delta"] - 0.25) < 0.05]
338
- )
339
- iv_skew_25d = iv_put_25d - iv_call_25d if iv_put_25d and iv_call_25d else None
340
-
341
- # IV Term Structure
342
- iv_by_exp = defaultdict(list)
343
- for opt in option_data:
344
- iv_by_exp[opt["expiration"]].append(opt["implied_volatility"])
345
- expiries = sorted(iv_by_exp.keys())
346
- if len(expiries) >= 2:
347
- iv_term_structure = np.mean(iv_by_exp[expiries[-1]]) - np.mean(
348
- iv_by_exp[expiries[0]]
349
- )
350
- else:
351
- iv_term_structure = None
352
-
353
- # Moneyness
354
- moneyness = [spot_price / opt["strike"] for opt in option_data if opt["strike"] > 0]
355
-
356
- # % OTM / ITM
357
- otm_calls = [c for c in calls if c["strike"] > spot_price]
358
- otm_puts = [p for p in puts if p["strike"] < spot_price]
359
- otm = len(otm_calls) + len(otm_puts)
360
- itm = len(option_data) - otm
361
- percent_otm = otm / len(option_data) if option_data else None
362
- percent_itm = itm / len(option_data) if option_data else None
363
-
364
- # Weighted Average Strike
365
- def weighted_avg_strike(options):
366
- total_vol = sum(o["volume"] for o in options)
367
- return (
368
- sum(o["strike"] * o["volume"] for o in options) / total_vol
369
- if total_vol > 0
370
- else None
83
+ def __init__(
84
+ self,
85
+ data: pd.DataFrame,
86
+ columns_drop: list[str] = [],
87
+ columns_boolean: list[str] = [],
88
+ columns_date: list[str] = [],
89
+ columns_te_groupby: list[str] = [],
90
+ columns_te_target: list[str] = [],
91
+ for_training: bool = True,
92
+ **kwargs,
93
+ ):
94
+ self.data = data
95
+ self.columns_drop = columns_drop
96
+ self.columns_boolean = columns_boolean
97
+ self.columns_date = columns_date
98
+ self.columns_te_groupby = columns_te_groupby
99
+ self.columns_te_target = columns_te_target
100
+ self.for_training = for_training
101
+
102
+ def run(self) -> pd.DataFrame:
103
+ # drop columns
104
+ self.data = self.data.drop(columns=self.columns_drop, errors="ignore")
105
+
106
+ # convert object columns to numeric if possible
107
+ self.data = convert_object_columns_that_are_numeric(self.data)
108
+
109
+ # handle boolean features
110
+ self.data = self.boolean_encode_columns()
111
+
112
+ # handle missing values
113
+ self.data = (
114
+ self.fillna_at_training()
115
+ if self.for_training
116
+ else self.fillna_at_inference()
371
117
  )
372
118
 
373
- avg_strike_calls = weighted_avg_strike(calls)
374
- avg_strike_puts = weighted_avg_strike(puts)
375
-
376
- # Option Sentiment Index
377
- sentiment_numerator = sum(
378
- c["volume"] for c in calls if c["strike"] < spot_price
379
- ) - sum(p["volume"] for p in puts if p["strike"] > spot_price)
380
- sentiment_index = (
381
- sentiment_numerator / (total_put_vol + total_call_vol)
382
- if (total_put_vol + total_call_vol) > 0
383
- else None
384
- )
385
-
386
- return {
387
- "put_call_ratio_volume": put_call_ratio_vol,
388
- "put_call_ratio_open_interest": put_call_ratio_oi,
389
- "open_interest_skew": oi_skew,
390
- "total_open_interest": total_oi,
391
- "delta_weighted_pcr": delta_weighted_pcr,
392
- "atm_iv": atm_iv,
393
- "iv_skew_25d": iv_skew_25d,
394
- "iv_term_structure": iv_term_structure,
395
- "average_moneyness": np.mean(moneyness) if moneyness else None,
396
- "percent_otm": percent_otm,
397
- "percent_itm": percent_itm,
398
- "weighted_avg_strike_calls": avg_strike_calls,
399
- "weighted_avg_strike_puts": avg_strike_puts,
400
- "option_sentiment_index": sentiment_index,
401
- }
402
-
403
-
404
- def apply_indicators(df: pd.DataFrame):
405
- """Apply multiple indicators to a grouped dataframe of a single stock."""
406
- # Assuming 'df' is the OHLC data for a single stock, apply indicators
407
- result = df.copy()
408
-
409
- logger.debug(f"Computing non-period features...")
410
-
411
- # Apply Parabolic SAR
412
- result["Parabolic_SAR"] = parabolic_sar(df)
413
-
414
- # Apply Bollinger Bands
415
- result["Upper_BB"], result["Middle_BB"], result["Lower_BB"] = bollinger_bands(df)
416
-
417
- # Apply Ichimoku Cloud
418
- (
419
- result["Tenkan"],
420
- result["Kijun"],
421
- result["Senkou_A"],
422
- result["Senkou_B"],
423
- result["Chikou"],
424
- ) = ichimoku_cloud(df)
425
-
426
- # Apply Pivot Points (including support and resistance levels)
427
- result["Pivot"], result["R1"], result["S1"], result["R2"], result["S2"] = (
428
- pivot_points(df)
429
- )
430
-
431
- # Other indicators
432
- result["CLOSE_DIFF"] = close_diff(df)
433
- result["OBV"] = obv(df)
434
- result["DOWNWARD_PRESSURE"], result["UPWARD_PRESSURE"] = pressure(df)
435
-
436
- # Apply MACD (Moving Average Convergence Divergence)
437
- result["MACD_Line"], result["MACD_Signal"] = macd(df)
438
-
439
- # first buy/sell signal : MACD_SIGNAL_DIFF cross 0 levels
440
- result["MACD_SIGNAL_DIFF"] = result["MACD_Line"] - result["MACD_Signal"]
441
- result["BUY_1"] = np.where(
442
- (result["MACD_SIGNAL_DIFF"] > 0)
443
- & (result["MACD_SIGNAL_DIFF"].shift(1) < 0), # Buy signal (MACD crossover)
444
- 1, # Buy
445
- np.where(
446
- (result["MACD_SIGNAL_DIFF"] < 0)
447
- & (
448
- result["MACD_SIGNAL_DIFF"].shift(1) > 0
449
- ), # Sell signal (MACD crossunder)
450
- -1, # Sell
451
- np.nan, # Default case
452
- ),
453
- )
454
- result["BUY_1"] = result["BUY_1"].fillna(0) # TODO: should we fill with 0 (done)
455
-
456
- # second buy/sell signal : MACD_SIGNAL_DIFF cross 30% threshold of maximum value while positive and decreasing, or 30% threshold of minimum value while negative and increasing
457
- # Calculate rolling 20-day max and min values for MACD_SIGNAL_DIFF per stock
458
- macd_signal_diff_max_20_days = result.groupby("STOCK")[
459
- "MACD_SIGNAL_DIFF"
460
- ].transform(lambda x: x.rolling(20).max())
461
- macd_signal_diff_min_20_days = result.groupby("STOCK")[
462
- "MACD_SIGNAL_DIFF"
463
- ].transform(lambda x: x.rolling(20).min())
464
-
465
- # Define the buy/sell signal conditions
466
- buy_condition = (
467
- (result["MACD_SIGNAL_DIFF"] > result["MACD_SIGNAL_DIFF"].shift(1)) # Increasing
468
- & (result["MACD_SIGNAL_DIFF"] < 0) # Negative value
469
- & (
470
- result["MACD_SIGNAL_DIFF"] > 0.3 * macd_signal_diff_min_20_days
471
- ) # Above 30% of minimum
472
- )
473
-
474
- sell_condition = (
475
- (result["MACD_SIGNAL_DIFF"] < result["MACD_SIGNAL_DIFF"].shift(1)) # Decreasing
476
- & (result["MACD_SIGNAL_DIFF"] > 0) # Positive value
477
- & (
478
- result["MACD_SIGNAL_DIFF"] < 0.3 * macd_signal_diff_max_20_days
479
- ) # Below 30% of maximum
480
- )
481
-
482
- # Apply the conditions to calculate buy/sell signals
483
- result["BUY_2"] = np.where(
484
- buy_condition,
485
- np.abs(
486
- (result["MACD_SIGNAL_DIFF"] - 0.3 * macd_signal_diff_min_20_days)
487
- / (0.3 * macd_signal_diff_min_20_days)
488
- ),
489
- np.where(
490
- sell_condition,
491
- -np.abs(
492
- (result["MACD_SIGNAL_DIFF"] - 0.3 * macd_signal_diff_max_20_days)
493
- / (0.3 * macd_signal_diff_max_20_days)
494
- ),
495
- 0, # Default
496
- ),
497
- )
498
-
499
- periods = [
500
- 9,
501
- 14,
502
- 21,
503
- 50,
504
- 126,
505
- 200,
506
- 252,
507
- ] # 2 weeks, 3 weeks, 1 month and 2.5 months
508
- # TODO: we could add longer horizons: 126 days (6 months), 200 days (9 months) and 252 days (1 year)
509
-
510
- features = []
511
- for period in periods:
512
- logger.debug(f"Computing period features for {period} days...")
513
-
514
- features.append(
515
- pd.DataFrame(
516
- {
517
- f"CUMUL_RET_{period}": cumulative_return(df, period=period),
518
- f"SMA_{period}": sma(df, period=period),
519
- f"EMA_{period}": ema(df, period=period),
520
- f"VOLATILITY_{period}": volatility(df, period=period),
521
- f"ADX_{period}": adx(df, period=period),
522
- f"ATR_{period}": atr(df, period=period),
523
- f"CMF_{period}": chaikin_money_flow(df, period=period),
524
- f"RSI_{period}": rsi(df, period=period),
525
- f"MFI_{period}": mfi(df, period=period),
526
- },
527
- index=df.index,
528
- )
529
- )
119
+ # target encoding
120
+ self.data = self.generate_target_encodings()
530
121
 
531
- # Stochastic Oscillator returns two series: %K and %D
532
- k, d = stochastic(df, period=period)
533
- features.append(
534
- pd.DataFrame(
535
- {
536
- f"%K_{period}": k,
537
- f"%D_{period}": d,
538
- },
539
- index=df.index,
540
- )
541
- )
542
-
543
- result = pd.concat([result] + features, axis=1)
544
-
545
- # third buy/sell signal : RSI is overbought >0.7 / oversold <0.3
546
- result["BUY_3"] = np.where(
547
- result["RSI_14"] <= 30,
548
- (30 - result["RSI_14"]) / 30,
549
- np.where(result["RSI_14"] >= 70, -(result["RSI_14"] - 70) / 30, 0),
550
- )
551
-
552
- # fourth buy/sell signal : RSI vs CLOSE divergence
553
- # The RSI vs. Close divergence trading signal identifies potential reversals by detecting when the
554
- # Relative Strength Index (RSI) and price (Close) move in opposite directions
555
- # bullish divergence occurs when the price makes lower lows while RSI makes higher lows (potential uptrend),
556
- # and bearish divergence occurs when the price makes higher highs while RSI makes lower highs (potential downtrend)
557
-
558
- # Detect local peaks (RSI Highs) and troughs (RSI Lows) for divergence analysis
559
- # Compute local maxima and minima indices
560
- rsi_peak_indices = argrelextrema(result["RSI_14"].values, np.greater)[
561
- 0
562
- ] # RSI highs
563
- rsi_trough_indices = argrelextrema(result["RSI_14"].values, np.less)[0] # RSI lows
564
-
565
- # Create boolean masks for peaks and troughs
566
- rsi_peaks_mask = np.zeros(len(result), dtype=bool)
567
- rsi_troughs_mask = np.zeros(len(result), dtype=bool)
568
-
569
- rsi_peaks_mask[rsi_peak_indices] = True
570
- rsi_troughs_mask[rsi_trough_indices] = True
571
-
572
- # Extract peak and trough rows efficiently
573
- rsi_peaks = result.loc[rsi_peaks_mask, ["CLOSE", "RSI_14"]].copy()
574
- rsi_troughs = result.loc[rsi_troughs_mask, ["CLOSE", "RSI_14"]].copy()
575
-
576
- # Compute RSI and CLOSE differences to check divergence
577
- for i in [1, 2, 3]:
578
- # RSI & Price difference from past peaks
579
- rsi_peaks[f"RSI_PEAK_DIFF_{i}"] = rsi_peaks["RSI_14"].diff(i)
580
- rsi_peaks[f"PRICE_PEAK_DIFF_{i}"] = rsi_peaks["CLOSE"].diff(i)
581
-
582
- # RSI & Price difference from past troughs
583
- rsi_troughs[f"RSI_TROUGH_DIFF_{i}"] = rsi_troughs["RSI_14"].diff(i)
584
- rsi_troughs[f"PRICE_TROUGH_DIFF_{i}"] = rsi_troughs["CLOSE"].diff(i)
585
-
586
- # Detect bearish divergence (RSI down, price up) and bullish divergence (RSI up, price down)
587
- rsi_peaks[f"DIVERGENCE_{i}"] = np.where(
588
- (rsi_peaks[f"RSI_PEAK_DIFF_{i}"] < 0)
589
- & (rsi_peaks[f"PRICE_PEAK_DIFF_{i}"] > 0),
590
- -np.abs(rsi_peaks[f"RSI_PEAK_DIFF_{i}"]),
591
- np.where(
592
- (rsi_peaks[f"RSI_PEAK_DIFF_{i}"] > 0)
593
- & (rsi_peaks[f"PRICE_PEAK_DIFF_{i}"] < 0),
594
- -np.abs(rsi_peaks[f"RSI_PEAK_DIFF_{i}"]),
595
- 0,
596
- ),
597
- )
122
+ # Cyclic encode dates
123
+ self.data = self.cyclic_encode_date()
598
124
 
599
- rsi_troughs[f"DIVERGENCE_{i}"] = np.where(
600
- (rsi_troughs[f"RSI_TROUGH_DIFF_{i}"] > 0)
601
- & (rsi_troughs[f"PRICE_TROUGH_DIFF_{i}"] < 0),
602
- np.abs(rsi_troughs[f"RSI_TROUGH_DIFF_{i}"]),
603
- np.where(
604
- (rsi_troughs[f"RSI_TROUGH_DIFF_{i}"] < 0)
605
- & (rsi_troughs[f"PRICE_TROUGH_DIFF_{i}"] > 0),
606
- np.abs(rsi_troughs[f"RSI_TROUGH_DIFF_{i}"]),
607
- 0,
608
- ),
609
- )
125
+ return self.data
610
126
 
611
- # Concatenate peak and trough divergences into a single DataFrame
612
- divergence_cols = [f"DIVERGENCE_{i}" for i in [1, 2, 3]]
613
- divergence_data = pd.concat(
614
- [rsi_peaks[divergence_cols], rsi_troughs[divergence_cols]], axis=0
615
- )
127
+ def cyclic_encode_date(self) -> pd.DataFrame:
128
+ """
129
+ Adds cyclic (sine and cosine) encoding for common date parts: day of week, day of month, and month.
616
130
 
617
- # Merge using index alignment
618
- result[divergence_cols] = divergence_data.reindex(result.index, fill_value=0)
131
+ Parameters:
132
+ df (pd.DataFrame): Input dataframe
133
+ columns (list[str]): List of datetime columns to encode
134
+ New columns are prefixed with the source column name.
619
135
 
620
- # Sum divergence signals into BUY_4 for a single signal strength metric
621
- result["BUY_4"] = result[divergence_cols].sum(axis=1)
622
- return result
136
+ Returns:
137
+ pd.DataFrame: Updated dataframe with new cyclic features
138
+ """
623
139
 
140
+ df: pd.DataFrame = self.data
141
+ columns: list[str] = self.columns_date
624
142
 
625
- # Main function to process the full dataset with multiple stocks
626
- def feature_engineering(
627
- df: pd.DataFrame,
628
- for_training: bool = False,
629
- save_as_csv: bool = False,
630
- analytics: bool = False,
631
- ):
632
- """Main function to process the full dataset with multiple stocks
143
+ def cyclic_encode(series, max_value):
144
+ sin_values = np.sin(2 * np.pi * series / max_value)
145
+ cos_values = np.cos(2 * np.pi * series / max_value)
146
+ return sin_values, cos_values
633
147
 
634
- Args:
635
- - df (pd.DataFrame): the dataframe with ohlc data
636
- - for_training (bool): whether to compute targets and for_training as data_for_training, or not.
637
- """
148
+ for col in columns:
638
149
 
639
- # dates
640
- logger.info("Creating date like variables...")
641
- df["DATE"] = pd.to_datetime(df["DATE"]).dt.normalize()
642
- df["YEAR"] = df["DATE"].dt.isocalendar().year
643
- df["MONTH"] = df["DATE"].dt.month
644
- df["DAY"] = df["DATE"].dt.day
645
- df["WEEK"] = df["DATE"].dt.isocalendar().week
646
- df["WEEKDAY"] = df["DATE"].dt.weekday
647
- df["YEARDAY"] = df["DATE"].dt.dayofyear
648
-
649
- # Cyclic encoding for date-like variables
650
- def cyclic_encode(series, max_value):
651
- sin_values = np.sin(2 * np.pi * series / max_value)
652
- cos_values = np.cos(2 * np.pi * series / max_value)
653
- return sin_values, cos_values
654
-
655
- df["MONTH_sin"], df["MONTH_cos"] = cyclic_encode(df["MONTH"], 12)
656
- df["DAY_sin"], df["DAY_cos"] = cyclic_encode(df["DAY"], 31)
657
- df["WEEK_sin"], df["WEEK_cos"] = cyclic_encode(df["WEEK"], 52)
658
- df["WEEKDAY_sin"], df["WEEKDAY_cos"] = cyclic_encode(df["WEEKDAY"], 7)
659
- df["YEARDAY_sin"], df["YEARDAY_cos"] = cyclic_encode(df["YEARDAY"], 365)
660
-
661
- # Computing residual RET and relative VOLUME
662
- logger.info("Creating RET and VOLUME metrics...")
663
- df["RET"] = df.groupby("STOCK")["CLOSE"].pct_change(1)
664
- df["MARKET_RET"] = df.groupby("DATE")["RET"].transform("mean")
665
- df["RESIDUAL_RET"] = df["RET"] - df["MARKET_RET"]
666
-
667
- df["VOLUME_RATIO"] = (
668
- df["VOLUME"]
669
- / df.groupby("STOCK")["VOLUME"].rolling(20, min_periods=1).mean().values
670
- )
671
- df["MARKET_VOLUME_RATIO"] = df.groupby("DATE")["VOLUME_RATIO"].transform("mean")
672
- df["RELATIVE_VOLUME"] = df["VOLUME_RATIO"] - df["MARKET_VOLUME_RATIO"]
673
-
674
- logger.info("Creating historical time series metrics...")
675
- periods = [
676
- 1, # daily
677
- 2,
678
- 3,
679
- 4,
680
- 5, # weekly
681
- 9,
682
- 14,
683
- 21, # monthly
684
- 50,
685
- 126,
686
- 200,
687
- 252,
688
- ] # need to keep 1, 2, 3, 4, 5 for backward compatibility
689
- for METRIC in ["RET", "VOLUME", "RESIDUAL_RET", "RELATIVE_VOLUME"]:
690
- for i in periods:
691
- df[f"{METRIC}_-{i}"] = df[METRIC].shift(i)
692
-
693
- # Group by "STOCK" and apply the indicators for each stock
694
- logger.info("Applying indicators...")
695
- grouped_df = df.groupby("STOCK", group_keys=False)
696
- preprocessed_df = grouped_df.apply(apply_indicators)
697
-
698
- # Target encoding / Mean encoding for categorical features
699
- # it's when you groupby a categorical feature and aggregate a target with a stat such as mean or median
700
- logger.info("Computing aggregated features...")
701
- statistics = ["mean", "median"]
702
- gb_features = [["SECTOR", "DATE"], ["SUBINDUSTRY", "DATE"]]
703
-
704
- # Define your base
705
- target_features = ["RET", "VOLUME", "RESIDUAL_RET", "RELATIVE_VOLUME"]
706
- periods = [9, 14, 21, 50]
707
- indicators = [
708
- "CUMUL_RET",
709
- "SMA",
710
- "EMA",
711
- "VOLATILITY",
712
- "ATR",
713
- "ADX",
714
- "%K",
715
- "RSI",
716
- "MFI",
717
- ]
718
- target_features += [f"{ind}_{p}" for p in periods for ind in indicators]
719
-
720
- # Prepare to collect new columns
721
- new_feature_cols = {}
722
-
723
- # Generate features efficiently
724
- for gb_feature, stat, target in product(gb_features, statistics, target_features):
725
- col_name = f"{target}_{'_'.join(gb_feature)}_{stat.upper()}"
726
- new_feature_cols[col_name] = preprocessed_df.groupby(gb_feature)[
727
- target
728
- ].transform(stat)
729
-
730
- # Merge all at once to improve performance
731
- preprocessed_df = pd.concat(
732
- [preprocessed_df, pd.DataFrame(new_feature_cols)], axis=1
733
- )
734
-
735
- if for_training:
736
- preprocessed_df = targets_creation(preprocessed_df)
737
-
738
- # Descriptive Analysis
739
- if analytics:
740
- traditional_descriptive_analysis(preprocessed_df)
741
- if save_as_csv and PYTHON_ENV == "Development":
742
- preprocessed_df_to_csv = preprocessed_df.sort_values(["DATE", "STOCK"])
743
- preprocessed_df_to_csv.to_csv(
744
- f"{data_dir}/data_for_training.csv",
745
- index=False,
746
- header=True,
747
- )
150
+ df[col] = pd.to_datetime(df[col]).dt.normalize()
151
+ df[f"{col}_year"] = df[col].dt.isocalendar().year
152
+ df[f"{col}_month"] = df[col].dt.month
153
+ df[f"{col}_day"] = df[col].dt.day
154
+ df[f"{col}_week"] = df[col].dt.isocalendar().week
155
+ df[f"{col}_weekday"] = df[col].dt.weekday
156
+ df[f"{col}_yearday"] = df[col].dt.dayofyear
157
+ df[col] = pd.to_datetime(df[col]).map(pd.Timestamp.toordinal)
748
158
 
749
- if for_training:
750
- preprocessed_df.dropna(inplace=True)
751
- else:
752
- preprocessed_df.dropna(
753
- subset=preprocessed_df.loc[
754
- :, ~preprocessed_df.columns.str.contains("^TARGET_")
755
- ].columns,
756
- inplace=True,
757
- )
159
+ df[f"{col}_month_sin"], df[f"{col}_month_cos"] = cyclic_encode(
160
+ df[f"{col}_month"], 12
161
+ )
162
+ df[f"{col}_day_sin"], df[f"{col}_day_cos"] = cyclic_encode(
163
+ df[f"{col}_day"], 31
164
+ )
165
+ df[f"{col}_week_sin"], df[f"{col}_week_cos"] = cyclic_encode(
166
+ df[f"{col}_week"], 52
167
+ )
168
+ df[f"{col}_weekday_sin"], df[f"{col}_weekday_cos"] = cyclic_encode(
169
+ df[f"{col}_weekday"], 7
170
+ )
171
+ df[f"{col}_yearday_sin"], df[f"{col}_yearday_cos"] = cyclic_encode(
172
+ df[f"{col}_yearday"], 365
173
+ )
758
174
 
759
- preprocessed_df.sort_values(["DATE", "STOCK"], inplace=True)
760
- preprocessed_df.reset_index(drop=True, inplace=True)
175
+ # Drop the original column TODO: not sure if we should drop it for time series
176
+ # df.drop(col, axis=1, inplace=True)
177
+
178
+ return df
179
+
180
+ def boolean_encode_columns(self) -> pd.DataFrame:
181
+ """
182
+ Applies boolean encoding to a list of columns:
183
+ - Leaves column as-is if already int with only 0 and 1
184
+ - Otherwise: sets 1 if value is present (notna), 0 if null/NaN/None
185
+
186
+ Parameters:
187
+ df (pd.DataFrame): Input dataframe
188
+ columns (list): List of column names to encode
189
+
190
+ Returns:
191
+ pd.DataFrame: Updated dataframe with encoded columns
192
+ """
193
+
194
+ df: pd.DataFrame = self.data
195
+ columns: list[str] = self.columns_boolean
196
+
197
+ for column in columns:
198
+ col = df[column]
199
+ if pd.api.types.is_integer_dtype(col) and set(
200
+ col.dropna().unique()
201
+ ).issubset({0, 1}):
202
+ continue # already valid binary
203
+ df[column] = col.notna().astype(int)
204
+ return df
205
+
206
+ def generate_target_encodings(self) -> pd.DataFrame:
207
+ """
208
+ Generate target encoding features (e.g., mean, median) for specified targets and group-by combinations.
209
+
210
+ Parameters:
211
+ df (pd.DataFrame): Input dataframe
212
+ columns_te_groupby (list of list): Grouping keys, e.g., [["SECTOR", "DATE"], ["SUBINDUSTRY", "DATE"]]
213
+ columns_te_target (list): Target columns to aggregate (e.g., ["RET", "VOLUME", "RSI_14"])
214
+ statistics (list): List of aggregation statistics (e.g., ["mean", "median"])
215
+
216
+ Returns:
217
+ pd.DataFrame: Original dataframe with new encoded columns added
218
+ """
219
+
220
+ df: pd.DataFrame = self.data
221
+ columns_te_groupby: list[list[str]] = self.columns_te_groupby
222
+ columns_te_target: list[str] = self.columns_te_target
223
+ statistics: list[str] = ["mean", "median"]
224
+
225
+ df = df.copy()
226
+ new_feature_cols = {}
227
+ for group_cols, stat, target_col in product(
228
+ columns_te_groupby, statistics, columns_te_target
229
+ ):
230
+ col_name = f"{target_col}_{'_'.join(group_cols)}_{stat.upper()}"
231
+ new_feature_cols[col_name] = df.groupby(group_cols)[target_col].transform(
232
+ stat
233
+ )
761
234
 
762
- logger.info(
763
- f"{len(preprocessed_df['DATE'])} preprocessed data with shape {preprocessed_df.shape} from {datetime.strftime(preprocessed_df['DATE'].iat[0], '%d/%m/%Y')} to {datetime.strftime(preprocessed_df['DATE'].iat[-1], '%d/%m/%Y')}"
764
- )
235
+ # merge all at once to improve performance
236
+ df = pd.concat([df, pd.DataFrame(new_feature_cols)], axis=1)
237
+ return df
238
+
239
+ def fillna_at_training(self) -> pd.DataFrame:
240
+ """
241
+ Fill missing values in a DataFrame:
242
+ - Numeric columns: fill with mean
243
+ - Categorical columns: fill with mode
244
+ Handles both NaN and None.
245
+
246
+ Parameters:
247
+ df (pd.DataFrame): Input DataFrame
248
+
249
+ Returns:
250
+ pd.DataFrame: Cleaned DataFrame with missing values filled
251
+ """
252
+
253
+ df: pd.DataFrame = self.data.copy()
254
+
255
+ for col in df.columns:
256
+ missing_count = df[col].isnull().sum()
257
+ if missing_count > 0:
258
+ if pd.api.types.is_numeric_dtype(df[col]):
259
+ df[col] = df[col].fillna(df[col].mean())
260
+ logger.info(
261
+ f"Filled {missing_count} NaN values in numeric column '{col}' with mean."
262
+ )
263
+ else:
264
+ mode = df[col].mode()
265
+ if not mode.empty:
266
+ mode_value = mode[0]
267
+ mode_count = (df[col] == mode_value).sum()
268
+ if mode_count > 100:
269
+ fill_value = mode_value
270
+ else:
271
+ fill_value = "unknown"
272
+ else:
273
+ fill_value = "unknown"
274
+
275
+ df[col] = df[col].fillna(fill_value)
276
+ logger.info(
277
+ f"Filled {missing_count} NaN values in categorical column '{col}' with '{fill_value}'."
278
+ )
279
+
280
+ return df
281
+
282
+ def fillna_at_inference(self) -> pd.DataFrame:
283
+
284
+ df: pd.DataFrame = self.data
285
+
286
+ missing_cols = df.columns[df.isnull().any()].tolist()
287
+
288
+ if missing_cols:
289
+ numeric_cols = [
290
+ col for col in missing_cols if pd.api.types.is_numeric_dtype(df[col])
291
+ ]
292
+ non_numeric_cols = [col for col in missing_cols if col not in numeric_cols]
765
293
 
766
- # for_training results if needed
767
- if for_training and PYTHON_ENV == "Development":
768
- joblib.dump(preprocessed_df, f"{data_dir}/data_for_training.pkl")
294
+ logger.warning(
295
+ f"Missing values found in inference data."
296
+ f"Filling with 0 for numeric columns: {numeric_cols}, "
297
+ f"and 'unknown' for non-numeric columns: {non_numeric_cols}"
298
+ )
769
299
 
770
- # Return the fully processed DataFrame with all new features (copy to avoid fragmented memory)
771
- return_df = preprocessed_df.copy()
772
- return return_df
300
+ df[numeric_cols] = df[numeric_cols].fillna(0)
301
+ df[non_numeric_cols] = df[non_numeric_cols].fillna("unknown")
302
+
303
+ return df
304
+
305
+
306
+ class PreprocessFeature:
307
+
308
+ def __init__(
309
+ self,
310
+ data: pd.DataFrame,
311
+ dataset,
312
+ time_series: bool = False,
313
+ date_column: str | None = None,
314
+ group_column: str | None = None,
315
+ val_size: float = 0.2,
316
+ test_size: float = 0.2,
317
+ columns_pca: list[str] = [],
318
+ columns_onehot: list[str] = [],
319
+ columns_binary: list[str] = [],
320
+ columns_ordinal: list[str] = [],
321
+ columns_frequency: list[str] = [],
322
+ target_numbers: list = [],
323
+ target_clf: list = [],
324
+ **kwargs,
325
+ ):
326
+ self.data = data
327
+ self.data.columns = self.data.columns.str.upper()
328
+
329
+ self.dataset = dataset
330
+ self.columns_pca = columns_pca
331
+ self.columns_onehot = columns_onehot
332
+ self.columns_binary = columns_binary
333
+ self.columns_ordinal = columns_ordinal
334
+ self.columns_frequency = columns_frequency
335
+ self.target_numbers = target_numbers
336
+ self.target_clf = target_clf
337
+
338
+ self.time_series = time_series
339
+ self.date_column = date_column
340
+ self.group_column = group_column
341
+ self.val_size = val_size
342
+ self.test_size = test_size
343
+
344
+ self.dataset_dir = self.dataset.path
345
+ self.dataset_id = self.dataset.id
346
+ self.data_dir = f"{self.dataset_dir}/data"
347
+ self.preprocessing_dir = f"{self.dataset_dir}/preprocessing"
348
+
349
+ def run(self):
350
+ # Split
351
+ train, val, test = (
352
+ self.train_val_test_split_time_series()
353
+ if self.time_series
354
+ else self.train_val_test_split(
355
+ stratify_col=f"TARGET_{self.target_numbers[0]}"
356
+ )
357
+ ) # TODO: only stratifying first target for now
773
358
 
359
+ # PCA
360
+ train, pcas = self.add_pca_features(train)
361
+ val, _ = self.add_pca_features(val, pcas=pcas)
362
+ test, _ = self.add_pca_features(test, pcas=pcas)
774
363
 
775
- # Descriptive Analytics functions
776
- def print_missing_values(df: pd.DataFrame):
364
+ joblib.dump(pcas, f"{self.preprocessing_dir}/pcas.pkl")
777
365
 
778
- if len(df.isnull().sum().where(df.isnull().sum() != 0).dropna()):
779
- logger.info(
780
- f"Missing values : \n{df.isnull().sum().where(df.isnull().sum() != 0).dropna().sort_values(ascending=False).to_string()}"
366
+ # Encoding
367
+ train, transformer = self.encode_categorical_features(train)
368
+ val, _ = self.encode_categorical_features(
369
+ val,
370
+ transformer=transformer,
371
+ )
372
+ test, _ = self.encode_categorical_features(
373
+ test,
374
+ transformer=transformer,
781
375
  )
782
- else:
783
- logger.info("No missing values found")
784
-
785
-
786
- def plot_sector_repartition(df: pd.DataFrame):
787
- """Visualise repartition of stock per sectors
788
-
789
- Args:
790
- df (pd.DataFrame): a df created with `get_data`
791
- """
792
- sns.barplot(
793
- data=df.groupby("SECTOR")["STOCK"].nunique(),
794
- orient="h",
795
- order=df.groupby("SECTOR")["STOCK"]
796
- .nunique()
797
- .sort_values(ascending=False)
798
- .index,
799
- )
800
376
 
377
+ joblib.dump(self.data, f"{self.data_dir}/full.pkl")
378
+ joblib.dump(transformer, f"{self.preprocessing_dir}/column_transformer.pkl")
379
+ summary = summarize_dataframe(train)
380
+ summary.to_csv(f"{self.dataset_dir}/feature_summary.csv", index=False)
801
381
 
802
- def traditional_descriptive_analysis(df: pd.DataFrame, stock_column: str = "STOCK"):
803
- with pd.option_context("display.max_rows", None):
382
+ return train, val, test
804
383
 
805
- # Check for duplicates
806
- duplicated_count = df.duplicated().sum()
384
+ def inference(self):
385
+ # PCA
386
+ pcas = joblib.load(f"{self.preprocessing_dir}/pcas.pkl")
387
+ data, _ = self.add_pca_features(self.data, pcas=pcas)
807
388
 
808
- # Check for missing values
809
- missing_values = (
810
- df.isnull()
811
- .sum()
812
- .where(df.isnull().sum() != 0)
813
- .dropna()
814
- .sort_values(ascending=False)
389
+ # Encoding
390
+ transformer = joblib.load(f"{self.preprocessing_dir}/column_transformer.pkl")
391
+ data, _ = self.encode_categorical_features(
392
+ data,
393
+ transformer=transformer,
815
394
  )
395
+ return data
816
396
 
817
- # Check for infinite values
818
- inf_values = (
819
- df.isin([np.inf, -np.inf])
820
- .sum()
821
- .where(df.isin([np.inf, -np.inf]).sum() != 0)
822
- .dropna()
823
- .sort_values(ascending=False)
824
- )
397
+ def train_val_test_split_time_series(self):
398
+ df: pd.DataFrame = self.data
399
+ date_column: str = self.date_column
400
+ group_column: str = self.group_column
401
+ val_size: float = self.val_size
402
+ test_size: float = self.test_size
825
403
 
826
- # Data types of each column
827
- data_types = df.dtypes
404
+ if not date_column:
405
+ raise ValueError("Please specify a date_column for time series")
828
406
 
829
- # Shape of the DataFrame (rows, columns)
830
- shape = df.shape
407
+ if group_column:
408
+ df.sort_values([date_column, group_column], inplace=True)
409
+ else:
410
+ df.sort_values(date_column, inplace=True)
831
411
 
832
- # Number of unique values in the stock column (or any specified column)
833
- unique_stock_count = (
834
- len(df[stock_column].unique()) if stock_column in df.columns else None
835
- )
412
+ dates = df[date_column].unique()
836
413
 
837
- # Constant columns
838
- constant_columns = [col for col in df.columns if df[col].nunique() == 1]
414
+ val_first_id = int(len(dates) * (1 - val_size - test_size)) + 1
415
+ test_first_id = int(len(dates) * (1 - test_size)) + 1
839
416
 
840
- # logger.info results
841
- logger.info(f"Duplicated rows: {duplicated_count}")
417
+ train = df[df[date_column].isin(dates[:val_first_id])]
418
+ val = df[df[date_column].isin(dates[val_first_id:test_first_id])]
419
+ test = df[df[date_column].isin(dates[test_first_id:])]
842
420
 
843
- logger.info(f"\nShape of DataFrame: {shape}")
421
+ dates = {}
422
+ for name, data in zip(["train", "val", "test"], [train, val, test]):
423
+ dates[f"{name}_start_date"] = (
424
+ data[date_column].map(pd.Timestamp.fromordinal).iat[0]
425
+ )
426
+ dates[f"{name}_end_date"] = (
427
+ data[date_column].map(pd.Timestamp.fromordinal).iat[-1]
428
+ )
844
429
 
845
- if unique_stock_count is not None:
846
430
  logger.info(
847
- f"\nNumber of unique values in '{stock_column}': {unique_stock_count}"
431
+ f"{data.shape} {name} data from {dates[f'{name}_start_date'].strftime('%d/%m/%Y')} to {dates[f'{name}_end_date'].strftime('%d/%m/%Y')}"
848
432
  )
849
- else:
850
- logger.info(f"\nColumn '{stock_column}' not found in the DataFrame.")
851
433
 
852
- logger.info(
853
- f"\nInfinite Values: \n{inf_values if not inf_values.empty else 'No infinite values'}"
434
+ Dataset.update(
435
+ match_fields=["id"],
436
+ id=self.dataset_id,
437
+ train_size=len(train),
438
+ val_size=len(val),
439
+ test_size=len(test),
440
+ **dates,
854
441
  )
855
-
856
- logger.info(
857
- f"\nMissing Values: \n{missing_values if not missing_values.empty else 'No missing values'}"
442
+ return (
443
+ train.reset_index(drop=True),
444
+ val.reset_index(drop=True),
445
+ test.reset_index(drop=True),
858
446
  )
859
447
 
860
- logger.info(f"\nConstant columns: \n{constant_columns}")
448
+ def train_val_test_split(
449
+ self,
450
+ random_state: int = 42,
451
+ stratify_col: str | None = None,
452
+ ) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
453
+ """
454
+ Splits a DataFrame into train, validation, and test sets.
455
+
456
+ Parameters:
457
+ df (pd.DataFrame): The full dataset
458
+ val_size (float): Proportion of validation set (default 0.2)
459
+ test_size (float): Proportion of test set (default 0.2)
460
+ random_state (int): Random seed for reproducibility
461
+ stratify_col (str | None): Optional column to stratify on (for classification tasks)
462
+
463
+ Returns:
464
+ Tuple of (train_df, val_df, test_df)
465
+ """
466
+ df: pd.DataFrame = self.data
467
+ val_size: float = self.val_size
468
+ test_size: float = self.test_size
469
+
470
+ stratify_vals = df[stratify_col] if stratify_col else None
471
+
472
+ # First split: train + (val + test)
473
+ train, temp = train_test_split(
474
+ df,
475
+ test_size=val_size + test_size,
476
+ random_state=random_state,
477
+ stratify=stratify_vals,
478
+ )
861
479
 
862
- logger.info(f"\nData Types: \n{data_types}")
480
+ # Adjust stratify target for val/test split
481
+ stratify_temp = temp[stratify_col] if stratify_col else None
863
482
 
483
+ # Compute val and test sizes relative to temp
484
+ val_ratio = val_size / (val_size + test_size)
864
485
 
865
- def visualize_extrema(
866
- data: pd.DataFrame,
867
- stock: str,
868
- days_before_last: int = 200,
869
- local_max_order: int = 10,
870
- ):
871
- """
872
- Function to visualize local maxima and minima for a given stock in the data.
486
+ val, test = train_test_split(
487
+ temp,
488
+ test_size=1 - val_ratio,
489
+ random_state=random_state,
490
+ stratify=stratify_temp,
491
+ )
873
492
 
874
- Parameters:
875
- - data: pd.DataFrame, DataFrame containing columns 'STOCK', 'DATE', 'CLOSE', and 'ID'
876
- - stock: str, the stock identifier to analyze (e.g., 'AAPL', 'GOOG')
877
- - days_before_last: int, number of days before the last date in the dataset to visualize
878
- - local_max_order: int, the window size for identifying local extrema (default: 5)
879
- """
493
+ for name, data in zip(["train", "val", "test"], [train, val, test]):
494
+ logger.info(f"{data.shape} {name} data")
880
495
 
881
- # Calculate the last date in the dataset
882
- last_date = data["DATE"].max()
883
- start_date = last_date - pd.Timedelta(days=days_before_last)
884
-
885
- # Find local maxima (argrelextrema with np.greater) for each stock
886
- local_max_CLOSE = (
887
- data[data["STOCK"] == stock]
888
- .set_index("DATE")["CLOSE"]
889
- .iloc[
890
- argrelextrema(
891
- data[data["STOCK"] == stock]["CLOSE"].values,
892
- np.greater,
893
- order=local_max_order,
894
- )
895
- ]
896
- .reset_index()
897
- )
898
-
899
- # Find local minima (argrelextrema with np.less) for each stock
900
- local_min_CLOSE = (
901
- data[data["STOCK"] == stock]
902
- .set_index("DATE")["CLOSE"]
903
- .iloc[
904
- argrelextrema(
905
- data[data["STOCK"] == stock]["CLOSE"].values,
906
- np.less,
907
- order=local_max_order,
908
- )
909
- ]
910
- .reset_index()
911
- )
912
-
913
- # Filter maxima based on stock and date range
914
- local_max_CLOSE = local_max_CLOSE[local_max_CLOSE["DATE"] >= start_date]
915
-
916
- # Filter minima based on stock and date range
917
- local_min_CLOSE = local_min_CLOSE[local_min_CLOSE["DATE"] >= start_date]
918
-
919
- # logger.info the maxima and minima dates
920
- logger.info(
921
- f"Maxima Dates for Stock {stock}: {list(local_max_CLOSE['DATE'].values)}"
922
- )
923
- logger.info(
924
- f"Minima Dates for Stock {stock}: {list(local_min_CLOSE['DATE'].values)}"
925
- )
926
-
927
- # Plot the stock's CLOSE prices within the specified date range
928
- stock_data = data[(data["STOCK"] == stock) & (data["DATE"] >= start_date)][
929
- ["CLOSE", "DATE"]
930
- ].set_index("DATE")
931
-
932
- plt.figure(figsize=(10, 6))
933
- stock_data.plot(color="black", title=f"Stock {stock} Extremas")
934
-
935
- # Add vertical lines for maxima
936
- for date in local_max_CLOSE["DATE"].values:
937
- plt.axvline(
938
- x=date,
939
- color="red",
940
- label="Maxima" if date == local_max_CLOSE["DATE"].values[0] else "",
496
+ return (
497
+ train.reset_index(drop=True),
498
+ val.reset_index(drop=True),
499
+ test.reset_index(drop=True),
941
500
  )
942
501
 
943
- # Add vertical lines for minima
944
- for date in local_min_CLOSE["DATE"].values:
945
- plt.axvline(
946
- x=date,
947
- color="green",
948
- label="Minima" if date == local_min_CLOSE["DATE"].values[0] else "",
502
+ # embedding and pca
503
+ def add_pca_features(
504
+ self, df: pd.DataFrame, n_components: int = 5, pcas=None
505
+ ) -> tuple[pd.DataFrame, dict]:
506
+ """
507
+ Adds PCA components as new columns to a DataFrame from a column containing numpy arrays.
508
+ NEED TRAIN/TEST SPLIT BEFORE APPLYING - LIKE ENCODING CATEGORICAL VARIABLES
509
+
510
+ Parameters:
511
+ df (pd.DataFrame): Input DataFrame
512
+ column (str): Name of the column containing np.ndarray
513
+ n_components (int): Number of PCA components to keep
514
+
515
+ Returns:
516
+ pd.DataFrame: DataFrame with new PCA columns added
517
+ """
518
+ columns: list[str] = self.columns_pca
519
+
520
+ pcas_dict = {}
521
+ for column in columns:
522
+ # Convert text to embeddings if necessary
523
+ if not isinstance(df[column].iloc[0], (np.ndarray, list)):
524
+ sentences = df[column].astype(str).tolist()
525
+ logger.info(
526
+ f"Total sentences to embed for column {column}: {len(sentences)}"
527
+ )
528
+
529
+ # Truncate each sentence
530
+ truncate_sentences = [truncate_text(sentence) for sentence in sentences]
531
+
532
+ # embedding
533
+ embedding_matrix = get_openai_embeddings(truncate_sentences)
534
+ else:
535
+ logger.info(f"Column {column} is already embeddings")
536
+ # Stack the vectors into a 2D array
537
+ embedding_matrix = np.vstack(df[column].values)
538
+
539
+ # Apply PCA
540
+ if pcas:
541
+ pca = pcas[column]
542
+ pca_features = pca.transform(embedding_matrix)
543
+ else:
544
+ pca = PCA(n_components=n_components)
545
+ pca_features = pca.fit_transform(embedding_matrix)
546
+
547
+ # Add PCA columns
548
+ for i in range(n_components):
549
+ df[f"{column}_pca_{i+1}"] = pca_features[:, i]
550
+
551
+ # Drop the original column
552
+ df.drop(column, axis=1, inplace=True)
553
+ pcas_dict.update({column: pca})
554
+
555
+ return df, pcas_dict
556
+
557
+ # encoding categorical features
558
+ def encode_categorical_features(
559
+ self,
560
+ df: pd.DataFrame,
561
+ transformer: ColumnTransformer | None = None,
562
+ ) -> tuple[pd.DataFrame, ColumnTransformer]:
563
+ """
564
+ Encodes categorical columns using one-hot, binary, ordinal, and frequency encoding.
565
+
566
+ Parameters:
567
+ df (pd.DataFrame): Input DataFrame
568
+ columns_onehot (list[str]): creates one binary column per category; for low-cardinality categorical features
569
+ columns_binary (list[str]): converts categories to binary codes and splits the bits across columns; for mid-to-high cardinality (e.g., 10–100 unique values)
570
+ columns_ordinal (list[str]): assigns integer ranks to categories; for when order matters (e.g., low < medium < high)
571
+ columns_frequency (list[str]): replaces each category with its frequency, normalized to a proportion; for high-cardinality features where frequency is meaningful
572
+ transformer (ColumnTransformer, optional): if provided, applies transform only
573
+
574
+ Returns:
575
+ tuple: (transformed DataFrame, ColumnTransformer)
576
+ """
577
+ columns_onehot: list[str] = self.columns_onehot
578
+ columns_binary: list[str] = self.columns_binary
579
+ columns_ordinal: list[str] = self.columns_ordinal
580
+ columns_frequency: list[str] = self.columns_frequency
581
+
582
+ X = df.loc[:, ~df.columns.str.contains("^TARGET_")]
583
+ y = df.loc[:, df.columns.str.contains("^TARGET_")]
584
+ save_in_db = False
585
+
586
+ all_columns = (
587
+ columns_onehot + columns_binary + columns_ordinal + columns_frequency
949
588
  )
950
589
 
951
- plt.legend()
952
- plt.show()
953
-
954
-
955
- def visualize_trading_signals(
956
- data: pd.DataFrame,
957
- stock: str,
958
- days_before_last: int = 200,
959
- ):
960
- """
961
- Function to visualize trading signals (BUY, SELL, HOLD) for a given stock.
590
+        if transformer:
+            transformed = transformer.transform(X)
+        else:
+            transformer = ColumnTransformer(
+                transformers=[
+                    (
+                        "onehot",
+                        OneHotEncoder(handle_unknown="ignore", sparse_output=False),
+                        columns_onehot,
+                    ),
+                    (
+                        "ordinal",
+                        OrdinalEncoder(
+                            handle_unknown="use_encoded_value", unknown_value=-1
+                        ),
+                        columns_ordinal,
+                    ),
+                    ("binary", BinaryEncoder(handle_unknown="value"), columns_binary),
+                    ("freq", CountEncoder(normalize=True), columns_frequency),
+                ],
+                remainder="passthrough",
+            )
+            transformed = transformer.fit_transform(X)
+            save_in_db = True
 
-    Parameters:
-    - data: pd.DataFrame, DataFrame containing columns 'STOCK', 'DATE', 'CLOSE', and 'TRADING_SIGNAL'
-    - stock: str, the stock identifier to analyze (e.g., 'AAPL', 'GOOG')
-    - days_before_last: int, number of days before the last date in the dataset to visualize
-    """
+        # Build output column names
+        column_names = []
 
-    # Calculate the last date in the dataset
-    last_date = data["DATE"].max()
-    start_date = last_date - pd.Timedelta(days=days_before_last)
+        if columns_onehot:
+            column_names.extend(
+                transformer.named_transformers_["onehot"]
+                .get_feature_names_out(columns_onehot)
+                .tolist()
+            )
 
-    # Filter data for the selected stock and date range
-    stock_data = data[(data["STOCK"] == stock) & (data["DATE"] >= start_date)].copy()
+        if columns_ordinal:
+            column_names.extend(columns_ordinal)
 
-    # Plot the stock's CLOSE prices
-    plt.figure(figsize=(10, 6))
-    plt.plot(stock_data["DATE"], stock_data["CLOSE"], color="black", label="CLOSE")
+        if columns_binary:
+            column_names.extend(
+                transformer.named_transformers_["binary"]
+                .get_feature_names_out(columns_binary)
+                .tolist()
+            )
 
-    # Define the colors for the trading signals
-    colors = {2: "green", 1: "lightgreen", 0: "yellow", -1: "red", -2: "darkred"}
+        if columns_frequency:
+            column_names.extend(columns_frequency)
 
-    # Plot each trading signal with the respective color
-    for signal_value, color in colors.items():
-        plt.scatter(
-            stock_data.loc[stock_data["TARGET_11"] == signal_value, "DATE"],
-            stock_data.loc[stock_data["TARGET_11"] == signal_value, "CLOSE"],
-            color=color,
-            label=f"Signal {signal_value}",
-            s=50,  # Size of the points
-        )
+        # Add passthrough (non-encoded) columns
+        passthrough_columns = [col for col in X.columns if col not in all_columns]
+        column_names.extend(passthrough_columns)
 
-    plt.title(f"Trading Signals for {stock}")
-    plt.xlabel("Date")
-    plt.ylabel("Close Price")
-    plt.legend()
-    plt.grid(True)
-    plt.show()
-
-
-def visualize_data_distribution(
-    data,
-    plot_type="hist",
-    features=None,
-    bins=50,
-    rows=5,
-    cols=5,
-    width_per_plot=4,
-    height_per_plot=3,
-):
-    """
-    Function to visualize the data distribution for multiple features in a DataFrame with dynamic figsize,
-    splitting into multiple figures if there are too many features for one figure.
-
-    Parameters:
-    - data: pd.DataFrame, the DataFrame containing the data to visualize.
-    - plot_type: str, the type of plot to use ('hist', 'kde', 'box').
-    - features: list, list of features (columns) to visualize. If None, all numeric features are used.
-    - bins: int, the number of bins for histograms (default: 50).
-    - rows: int, number of rows in the subplot grid (default: 5).
-    - cols: int, number of columns in the subplot grid (default: 5).
-    - width_per_plot: int, the width of each subplot (default: 4).
-    - height_per_plot: int, the height of each subplot (default: 3).
-    """
+        X_transformed = pd.DataFrame(transformed, columns=column_names, index=df.index)
 
-    # If no features are specified, use all numeric features
-    if features is None:
-        features = data.select_dtypes(include=[np.number]).columns.tolist()
+        # Try to convert columns to best possible dtypes
+        X_transformed = X_transformed.convert_dtypes()
 
-    # Calculate the total number of features
-    total_features = len(features)
-
-    # How many plots can fit into one figure
-    plots_per_figure = rows * cols
+        # Insert features in db
+        if save_in_db:
+            # TODO: in bulk
+            categorical_features, numerical_features = get_features_by_types(
+                X_transformed
+            )
+            for feature in categorical_features:
+                Feature.upsert(match_fields=["name"], name=feature, type="categorical")
+            for feature in numerical_features:
+                Feature.upsert(match_fields=["name"], name=feature, type="numerical")
+            for target in y.columns:
+                target_number = int(target.split("_")[1])
+                type = (
+                    "classification"
+                    if target_number in self.target_clf
+                    else "regression"
+                )
+                # TODO: what about description here ?
+                Target.upsert(match_fields=["name", "type"], name=target, type=type)
+
+        return pd.concat([X_transformed, y], axis=1), transformer
+
+
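For context, a minimal standalone sketch of the same mixed-encoding setup (one-hot, ordinal, binary, and frequency encoders combined in a single ColumnTransformer), fitted once on a training frame and reused unchanged on a validation frame. The toy column names are invented for illustration; in lecrapaud the column groups come from the instance configuration, and BinaryEncoder / CountEncoder come from the category_encoders package.

```python
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from category_encoders import BinaryEncoder, CountEncoder

train = pd.DataFrame(
    {
        "COLOR": ["red", "green", "red", "blue"],    # low cardinality -> one-hot
        "CITY": ["Paris", "Lyon", "Nice", "Paris"],  # mid cardinality -> binary
        "SIZE": ["low", "medium", "high", "low"],    # ordered -> ordinal (pass categories=... for a real order)
        "MERCHANT": ["a", "b", "a", "c"],            # high cardinality -> frequency
    }
)
valid = train.sample(frac=0.5, random_state=0)

encoder = ColumnTransformer(
    transformers=[
        ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False), ["COLOR"]),
        ("ordinal", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1), ["SIZE"]),
        ("binary", BinaryEncoder(handle_unknown="value"), ["CITY"]),
        ("freq", CountEncoder(normalize=True), ["MERCHANT"]),
    ],
    remainder="passthrough",
)

train_encoded = encoder.fit_transform(train)  # fit the encoders once, on training data
valid_encoded = encoder.transform(valid)      # reuse the fitted encoders elsewhere
print(train_encoded.shape, valid_encoded.shape)
```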
+# analysis & utils
+def summarize_dataframe(
+    df: pd.DataFrame, sample_categorical_threshold: int = 15
+) -> pd.DataFrame:
+    summary = []
+
+    def is_hashable_series(series: pd.Series) -> bool:
+        try:
+            _ = series.dropna().unique()
+            return True
+        except TypeError:
+            return False
 
-    # Loop over the features and create new figures as needed
-    for start in range(0, total_features, plots_per_figure):
-        # Subset of features for the current figure
-        subset_features = features[start : start + plots_per_figure]
+    df = convert_object_columns_that_are_numeric(df)
+    df = df.convert_dtypes()
 
-        # Dynamically calculate figure size based on grid size and plot dimensions
-        num_plots = len(subset_features)
-        grid_rows = min(rows, num_plots // cols + (num_plots % cols != 0))
-        grid_cols = min(cols, num_plots)
-        figsize = (grid_cols * width_per_plot, grid_rows * height_per_plot)
+    for col in df.columns:
+        total_missing = df[col].isna().sum()
+        col_data = df[col].dropna()
+        dtype = col_data.dtype
 
-        # Set up the figure and axes for this subset of features
-        fig, axes = plt.subplots(grid_rows, grid_cols, figsize=figsize)
-        axes = axes.flatten()  # Flatten the axes for easy iteration
+        if col_data.empty:
+            summary.append(
+                {
+                    "Column": col,
+                    "Dtype": dtype,
+                    "Type": "unknown",
+                    "Detail": "No non-null values",
+                    "Missing": total_missing,
+                }
+            )
+            continue
+
+        # Case 1: Numeric columns
+        if pd.api.types.is_numeric_dtype(col_data):
+            unique_vals = col_data.nunique()
+
+            if set(col_data.unique()).issubset({0, 1}):
+                col_type = "binary-categorical"
+                detail = "0/1 values only"
+            elif (
+                pd.api.types.is_integer_dtype(col_data)
+                and unique_vals <= sample_categorical_threshold
+            ):
+                col_type = "multi-categorical"
+                top_vals = col_data.value_counts().head(10)
+                detail = ", ".join(f"{k} ({v})" for k, v in top_vals.items())
+            else:
+                col_type = "numeric"
+                q = col_data.quantile([0, 0.25, 0.5, 0.75, 1])
+                detail = (
+                    f"Min: {q.iloc[0]:.2f}, Q1: {q.iloc[1]:.2f}, Median: {q.iloc[2]:.2f}, "
+                    f"Q3: {q.iloc[3]:.2f}, Max: {q.iloc[4]:.2f}"
+                )
+
+        # Case 2: Object or other hashable columns
+        elif is_hashable_series(col_data):
+            unique_vals = col_data.nunique()
+            if unique_vals <= sample_categorical_threshold:
+                col_type = "object-categorical"
+                top_vals = col_data.value_counts().head(10)
+                detail = ", ".join(f"{k} ({v})" for k, v in top_vals.items())
+            else:
+                col_type = "high-cardinality-categorical"
+                detail = f"{unique_vals} unique values"
+
+        # Case 3: Unusable columns
+        else:
+            col_type = "non-hashable"
+            detail = f"Non-hashable type: {type(col_data.iloc[0])}"
+
+        summary.append(
+            {
+                "Column": col,
+                "Dtype": dtype,
+                "Type": col_type,
+                "Detail": detail,
+                "Missing": total_missing,
+            }
+        )
 
-        # Plot each feature
-        for i, feature in enumerate(subset_features):
-            ax = axes[i]
+    return pd.DataFrame(summary)
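A small usage sketch of summarize_dataframe on a toy frame, assuming the module-level helpers in this file are importable from lecrapaud.feature_engineering (the data and the expected type labels are illustrative).

```python
import numpy as np
import pandas as pd
# Import path assumed from this diff's file layout (lecrapaud/feature_engineering.py):
from lecrapaud.feature_engineering import summarize_dataframe

df = pd.DataFrame(
    {
        "price": [10.5, 12.0, np.nan, 9.9, 11.2],          # expected: numeric, with quartile detail
        "is_active": [1, 0, 1, 1, 0],                      # expected: binary-categorical
        "segment": ["retail", "pro", "retail", "retail", None],  # expected: object-categorical
        "user_id": ["u1", "u2", "u3", "u4", "u5"],         # expected: high-cardinality-categorical
    }
)
print(summarize_dataframe(df, sample_categorical_threshold=3))
```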
 
-            if plot_type == "hist":
-                sns.histplot(data[feature].dropna(), bins=bins, kde=False, ax=ax)
-            elif plot_type == "kde":
-                sns.kdeplot(data[feature].dropna(), ax=ax, fill=True)
-            elif plot_type == "box":
-                sns.boxplot(data[feature].dropna(), ax=ax)
 
-            ax.set_xlabel(feature)
-            ax.set_ylabel("Count")
+def convert_object_columns_that_are_numeric(df: pd.DataFrame) -> pd.DataFrame:
+    """
+    Detect object columns that can be safely converted to numeric (float or int).
 
-        # Hide any empty subplots
-        for j in range(i + 1, len(axes)):
-            fig.delaxes(axes[j])
+    Returns:
+        The DataFrame with object columns that are at least 90% numeric coerced to numeric dtypes.
+    """
 
-        # Use tight layout to ensure there's no overlap
-        fig.tight_layout()
+    numeric_candidates = []
 
-        # Show the plot for this figure
-        plt.show()
+    for col in df.select_dtypes(include=["object"]).columns:
+        try:
+            converted = pd.to_numeric(df[col], errors="coerce")
+            if converted.notna().sum() / len(df) > 0.9:  # at least 90% convertible
+                numeric_candidates.append(col)
+        except Exception:
+            continue
 
+    for col in numeric_candidates:
+        df[col] = pd.to_numeric(df[col], errors="coerce")
 
-def detect_outliers_iqr(data, degree: float = 1.5):
-    """
-    Detect outliers in a DataFrame using the Interquartile Range (IQR) method.
+    return df
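The 90%-convertible heuristic above can be isolated with plain pandas; the toy series below are invented for illustration.

```python
import pandas as pd

series = {
    "mostly_numeric": pd.Series(["1.5", "2", "3.25", "oops", "4", "5", "6", "7", "8", "9", "10"]),
    "labels": pd.Series(["a", "b", "c", "a", "b", "c", "a", "b", "c", "a", "b"]),
}

for name, s in series.items():
    converted = pd.to_numeric(s, errors="coerce")  # non-numeric strings become NaN
    ratio = converted.notna().sum() / len(s)
    print(name, f"{ratio:.0%} convertible ->", "convert" if ratio > 0.9 else "keep as object")
```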
 
-    Parameters:
-    - data: pd.DataFrame, the DataFrame in which to detect outliers.
-
-    Returns:
-    - outliers: pd.DataFrame, DataFrame with boolean values indicating outliers for each feature.
-    """
-    outliers = pd.DataFrame(index=data.index)
+def traditional_descriptive_analysis(df: pd.DataFrame, group_column: str | None = None):
+    with pd.option_context("display.max_rows", None):
+        results = {}
 
-    for column in data.select_dtypes(include=[np.number]).columns:
-        Q1 = data[column].quantile(0.25)  # 1st quartile (25th percentile)
-        Q3 = data[column].quantile(0.75)  # 3rd quartile (75th percentile)
-        IQR = Q3 - Q1  # Interquartile range
+        # Shape
+        results["Shape"] = f"{df.shape[0]} rows × {df.shape[1]} columns"
 
-        lower_bound = Q1 - degree * IQR
-        upper_bound = Q3 + degree * IQR
+        # Duplicated rows
+        results["Duplicated rows"] = int(df.duplicated().sum())
 
-        # Detect outliers
-        outliers[column] = (data[column] < lower_bound) | (data[column] > upper_bound)
+        # Duplicated columns
+        duplicated_cols = df.T[df.T.duplicated()].index.tolist()
+        results["Duplicated columns"] = (
+            ", ".join(duplicated_cols) if len(duplicated_cols) > 0 else "None"
+        )
 
-    return outliers
+        # Missing values
+        missing = df.isnull().sum()
+        missing = missing[missing > 0].sort_values(ascending=False)
+        if len(missing) > 0:
+            results["Missing values"] = missing.to_frame("Missing Count").to_markdown()
+        else:
+            results["Missing values"] = "No missing values"
+
+        # Infinite values
+        inf = df.replace([np.inf, -np.inf], np.nan)
+        inf_count = inf.isnull().sum() - df.isnull().sum()
+        inf_count = inf_count[inf_count > 0].sort_values(ascending=False)
+        if len(inf_count) > 0:
+            results["Infinite values"] = inf_count.to_frame("Inf Count").to_markdown()
+        else:
+            results["Infinite values"] = "No infinite values"
 
+        # Constant columns
+        constant_cols = [col for col in df.columns if df[col].nunique() == 1]
+        results["Constant columns"] = (
+            ", ".join(constant_cols) if len(constant_cols) > 0 else "None"
+        )
 
-def plot_distribution(df):
-    logger.info("DATA_DISTRIBUTION")
+        # Data types
+        dtypes = df.dtypes.astype(str).sort_index()
+        results["Data types"] = dtypes.to_frame("Type").to_markdown()
 
-    logger.info("numerical features")
-    visualize_data_distribution(df.select_dtypes(include=["float64"]))
+        # Unique values in group_column
+        if group_column is not None:
+            if group_column in df.columns:
+                results[f"Unique values in '{group_column}'"] = int(
+                    df[group_column].nunique()
+                )
+            else:
+                results[f"Unique values in '{group_column}'"] = (
+                    f"❌ Column '{group_column}' not found"
+                )
 
-    logger.info("categorical features")
-    visualize_data_distribution(df.select_dtypes(include=["int64"]))
+        # Log all results
+        for title, content in results.items():
+            print(f"\n### {title}\n{content}")
 
-    logger.info("nb of outliers")
-    outliers = detect_outliers_iqr(df.select_dtypes(include=["float64"]), degree=5)
 
-    with pd.option_context("display.max_rows", None):
-        logger.info(outliers.sum().sort_values(ascending=False))
+def print_missing_values(df: pd.DataFrame):
 
-    logger.info("zoom on volume outliers")
-    columns = [c for c in df.columns if "VOLUME" in c]
-    visualize_data_distribution(df, features=columns, plot_type="box", cols=3)
+    if len(df.isnull().sum().where(df.isnull().sum() != 0).dropna()):
+        logger.info(
+            f"Missing values : \n{df.isnull().sum().where(df.isnull().sum() != 0).dropna().sort_values(ascending=False).to_string()}"
+        )
+    else:
+        logger.info("No missing values found")