lecrapaud 0.4.0__py3-none-any.whl → 0.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of lecrapaud might be problematic.
- lecrapaud/__init__.py +1 -0
- lecrapaud/api.py +277 -0
- lecrapaud/config.py +10 -0
- lecrapaud/db/__init__.py +1 -0
- lecrapaud/db/alembic/env.py +2 -2
- lecrapaud/db/alembic/versions/2025_05_31_1834-52b809a34371_make_nullablee.py +24 -12
- lecrapaud/db/alembic/versions/2025_06_17_1652-c45f5e49fa2c_make_fields_nullable.py +89 -0
- lecrapaud/db/alembic.ini +116 -0
- lecrapaud/db/models/__init__.py +10 -10
- lecrapaud/db/models/base.py +176 -1
- lecrapaud/db/models/dataset.py +25 -20
- lecrapaud/db/models/feature.py +5 -6
- lecrapaud/db/models/feature_selection.py +3 -4
- lecrapaud/db/models/feature_selection_rank.py +3 -4
- lecrapaud/db/models/model.py +3 -4
- lecrapaud/db/models/model_selection.py +15 -8
- lecrapaud/db/models/model_training.py +15 -7
- lecrapaud/db/models/score.py +9 -6
- lecrapaud/db/models/target.py +16 -8
- lecrapaud/db/session.py +66 -0
- lecrapaud/experiment.py +64 -0
- lecrapaud/feature_engineering.py +747 -1022
- lecrapaud/feature_selection.py +915 -998
- lecrapaud/integrations/openai_integration.py +225 -0
- lecrapaud/jobs/__init__.py +2 -2
- lecrapaud/jobs/config.py +1 -1
- lecrapaud/jobs/scheduler.py +1 -1
- lecrapaud/jobs/tasks.py +6 -6
- lecrapaud/model_selection.py +1060 -960
- lecrapaud/search_space.py +4 -0
- lecrapaud/utils.py +2 -2
- lecrapaud-0.4.1.dist-info/METADATA +171 -0
- {lecrapaud-0.4.0.dist-info → lecrapaud-0.4.1.dist-info}/RECORD +36 -35
- {lecrapaud-0.4.0.dist-info → lecrapaud-0.4.1.dist-info}/WHEEL +1 -1
- lecrapaud/db/crud.py +0 -179
- lecrapaud/db/services.py +0 -0
- lecrapaud/db/setup.py +0 -58
- lecrapaud/predictions.py +0 -292
- lecrapaud/training.py +0 -151
- lecrapaud-0.4.0.dist-info/METADATA +0 -103
- /lecrapaud/{directory_management.py → directories.py} +0 -0
- {lecrapaud-0.4.0.dist-info → lecrapaud-0.4.1.dist-info}/LICENSE +0 -0
lecrapaud/feature_engineering.py
CHANGED
@@ -1,1119 +1,844 @@
-
-
+"""
+Feature engineering module for data preprocessing and transformation.

-
-
-
-
-
-
-
-from itertools import product
-import os
-from collections import defaultdict
-
-from src.config import PYTHON_ENV
-from src.utils import logger
-from src.directory_management import data_dir
-from src.services.indicators import (
-    rsi,
-    macd,
-    bollinger_bands,
-    adx,
-    atr,
-    stochastic,
-    mfi,
-    ichimoku_cloud,
-    parabolic_sar,
-    chaikin_money_flow,
-    pivot_points,
-    sma,
-    ema,
-    volatility,
-    cumulative_return,
-    close_diff,
-    obv,
-    pressure,
-)
-from src.db.models import Target
+Process
+-------
+FEAT ENG
+- use business_analysis > get_table_summary to see which fields are more than 90% null
+- use remove_constant_columns to drop constant columns
+- use summarize_dataframe to drop further useless columns (date, id, data not available at prediction time, misc not useful)
+- cast to numeric whatever can be cast to numeric

+- define columns_boolean
+- define groupby_columns_list and target_column for the target encoding
+- create the target(s)
+- define columns_pca
+- define columns_one_hot, columns_binary, columns_ordinal, columns_frequency

-# pd print options
-# pd.set_option("display.max_columns", None)
-# pd.reset_option("display.max_rows")
-# pd.set_option("display.max_colwidth", None)

+Todo
+----
+- DONE: drop meaningless identifier columns
+- DONE: PCA on embedding of deck
+- DONE: maybe cyclic encoding for date columns

-
-
-
-
-    local_max_order: int = 10,
-    threshold: int = 5,
-):
-    """Preprocessing the stock data from yfinance
+- DONE: ordinal/label encode (only 1 column) for tree based method when not too big number of categories
+- DONE: frequency encoding for some categorical columns
+- DONE: one hot encoding for categorical columns
+- DONE: binary encoding if big number of category

-
-
-        top_x_stock (float): the % at which you are considered top ranked stock for the day
-        local_max_order (int): this set up the window to look at on both side of the extrema : the greater, the more 'global' is the extrema.
+- DONE: create other embedding column for textual data ?
+- DONE: create some boolean like has_website, has_linkedin_company_url, etc...

-
-
-    - date variables : we create YEAR, MONTH, DAY, WEEK, WEEKDAY, YEARWEEK and YEARDAY features
-    - return, market return, residual return and similar computation with volume are done to create 6 new features
-    - target variables :
-        - TARGET_1 : next day return
-        - TARGET_2 : categorical return (positive 1, or negative 0)
-        - TARGET_3 : next day ranking from best (1) to worst (n_stock) returns
-        - TARGET_4 : categorical next day top ranking (in top_x_stock) (1), or not (0)
-        - TARGET_5, TARGET_6, TARGET_7, TARGET_8 : same but with residual return
-        - TARGET_9 : categorical with 1 if it's a local maximum and 0 if not
-        - TARGET_10 : categorical with 1 if it's a local minimum and 0 if not
-        - TARGET 11 : We will create trading signals based on proximity to local minima and maxima : need multi-binary loss support
-        - TARGET 12, 13, 14 : return in 9,14,21 days
+- target/mean encoding with a groupby on a very interesting categorical column
+- do "real" target encoding, for example with leave-one-out encoding, on the target variable ?

+- better categorize some stuff like country ? for sourcing we do position, ext_position, company, ext_company, country, source, but only country is relevant here

-    """

-
-
-
-
-
-
-
-
-    first_x_percent = max(int(nb_of_stocks * top_x_stock), 1)
-
-    df["TARGET_1"] = df[target].shift(-1)
-    df["TARGET_2"] = np.select([df["TARGET_1"] <= 0, df["TARGET_1"] > 0], [0, 1])
-    df["TARGET_3"] = df.groupby("DATE")["TARGET_1"].rank(
-        method="first", ascending=False
-    )
-    df["TARGET_4"] = np.select(
-        [
-            df.groupby("DATE")["TARGET_1"].rank(method="first", ascending=False)
-            <= first_x_percent
-        ],
-        [1],
-        default=0,
-    )
-
-    # TARGET 5-8 : We do the same for RESIDUAL_RET
-    target = "RESIDUAL_RET"
-
-    df["TARGET_5"] = df[target].shift(-1)
-    df["TARGET_6"] = np.select([df["TARGET_5"] <= 0, df["TARGET_5"] > 0], [0, 1])
-    df["TARGET_7"] = df.groupby("DATE")["TARGET_5"].rank(
-        method="first", ascending=False
-    )
-    df["TARGET_8"] = np.select(
-        [
-            df.groupby("DATE")["TARGET_5"].rank(method="first", ascending=False)
-            <= first_x_percent
-        ],
-        [1],
-        default=0,
-    )
-
-    # TARGET 9-10 : Let's look at local min and max : it can be interpreted as buy and sell signal respectively
-    target = "CLOSE"
-
-    df["TARGET_9"] = 0
-    df["TARGET_10"] = 0
-
-    # Calculate local maxima and set TARGET_9 to 1 where maxima are found
-    maxima_indices = df.groupby(stock_column)[target].transform(
-        lambda x: x.index.isin(
-            x.iloc[argrelextrema(x.values, np.greater, order=local_max_order)].index
-        )
-    )
+Development
+-----------
+- use PCA to see how many variables explain the variance, for the feature selection max_feature
+- could be nice to get linkedin info of founders (need to search reps in rails first) - and score !
+- add created_from, utm_source, referrer when we will have more data
+- could be nice to get team_count, or dealroom info but at the moment of submission...
+"""

-
-
-
-
-    )
-
-    df.loc[maxima_indices, "TARGET_9"] = 1
-    df.loc[minima_indices, "TARGET_10"] = 1
-
-    # TARGET 11 : We will create trading signals based on proximity to local minima and maxima.
-    df["TARGET_11"] = 2  # Default value for HOLD
-
-    # Function to detect local minima and maxima, and assign signals
-    def assign_signals(group):
-        close_prices = group[target].values
-        dates = group["DATE"].values
-
-        # Detect local maxima and minima using argrelextrema
-        local_maxima_idx = argrelextrema(
-            close_prices, np.greater, order=local_max_order
-        )[0]
-        local_minima_idx = argrelextrema(close_prices, np.less, order=local_max_order)[
-            0
-        ]
-
-        # STRONG BUY (4) for local minima, STRONG SELL (0) for local maxima
-        group.loc[group.index[local_minima_idx], "TARGET_11"] = 4
-        group.loc[group.index[local_maxima_idx], "TARGET_11"] = 0
-
-        # Assign BUY (3) and SELL (1) based on proximity to extrema within the threshold window
-        for idx in local_minima_idx:
-            # Get the actual date of the minima
-            min_date = dates[idx]
-            # Select the rows within the threshold window around the minima date
-            buy_window = group.loc[
-                (group["DATE"] >= min_date - pd.Timedelta(days=threshold))
-                & (group["DATE"] <= min_date + pd.Timedelta(days=threshold))
-            ]
-            group.loc[buy_window.index, "TARGET_11"] = np.where(
-                buy_window["DATE"] == min_date,
-                4,
-                3,  # STRONG BUY at minima, BUY near minima
-            )
-
-        for idx in local_maxima_idx:
-            # Get the actual date of the maxima
-            max_date = dates[idx]
-            # Select the rows within the threshold window around the maxima date
-            sell_window = group.loc[
-                (group["DATE"] >= max_date - pd.Timedelta(days=threshold))
-                & (group["DATE"] <= max_date + pd.Timedelta(days=threshold))
-            ]
-            group.loc[sell_window.index, "TARGET_11"] = np.where(
-                sell_window["DATE"] == max_date,
-                0,
-                1,  # STRONG SELL at maxima, SELL near maxima
-            )
+import pandas as pd
+import numpy as np
+from itertools import product
+import joblib

-
-
-
-
-
-    # TARGET 12, 13, 14 : return in 9,14,21 days
-    df["TARGET_12"] = df.groupby("STOCK")["CLOSE"].pct_change(9).shift(-9)
-    df["TARGET_13"] = df.groupby("STOCK")["CLOSE"].pct_change(14).shift(-14)
-    df["TARGET_14"] = df.groupby("STOCK")["CLOSE"].pct_change(21).shift(-21)
-
-    # Update database
-    # TODO: in bulk
-    Target.upsert(
-        match_fields=["name", "type"],
-        name="TARGET_1",
-        type="regression",
-        description="Next day return",
-    )
-    Target.upsert(
-        match_fields=["name", "type"],
-        name="TARGET_2",
-        type="classification",
-        description="Next day return",
-    )
-    Target.upsert(
-        match_fields=["name", "type"],
-        name="TARGET_3",
-        type="regression",
-        description="Ranking of next day return",
-    )
-    Target.upsert(
-        match_fields=["name", "type"],
-        name="TARGET_4",
-        type="classification",
-        description="Top ranking of next day return",
-    )
-    Target.upsert(
-        match_fields=["name", "type"],
-        name="TARGET_5",
-        type="regression",
-        description="Next day residual return",
-    )
-    Target.upsert(
-        match_fields=["name", "type"],
-        name="TARGET_6",
-        type="classification",
-        description="Next day residual return",
-    )
-    Target.upsert(
-        match_fields=["name", "type"],
-        name="TARGET_7",
-        type="regression",
-        description="Ranking of next day residual return",
-    )
-    Target.upsert(
-        match_fields=["name", "type"],
-        name="TARGET_8",
-        type="classification",
-        description="Top ranking of next day residual return",
-    )
-    Target.upsert(
-        match_fields=["name", "type"],
-        name="TARGET_9",
-        type="classification",
-        description="Local maxima",
-    )
-    Target.upsert(
-        match_fields=["name", "type"],
-        name="TARGET_10",
-        type="classification",
-        description="Local minima",
-    )
-    Target.upsert(
-        match_fields=["name", "type"],
-        name="TARGET_11",
-        type="classification",
-        description="Trading signals based on proximity to local minima and maxima",
-    )
-    Target.upsert(
-        match_fields=["name", "type"],
-        name="TARGET_12",
-        type="regression",
-        description="Return in 9 days",
-    )
-    Target.upsert(
-        match_fields=["name", "type"],
-        name="TARGET_13",
-        type="regression",
-        description="Return in 14 days",
-    )
-    Target.upsert(
-        match_fields=["name", "type"],
-        name="TARGET_14",
-        type="regression",
-        description="Return in 21 days",
-    )
+from sklearn.compose import ColumnTransformer
+from sklearn.decomposition import PCA
+from category_encoders import BinaryEncoder, CountEncoder
+from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
+from sklearn.model_selection import train_test_split

-
+from lecrapaud.integrations.openai_integration import (
+    truncate_text,
+    get_openai_embeddings,
+)
+from lecrapaud.feature_selection import get_features_by_types
+from lecrapaud.utils import logger
+from lecrapaud.db import Target, Feature, Dataset
+from lecrapaud.config import PYTHON_ENV


-
-
-
+# main function
+class FeatureEngineeringEngine:
+    """
+    Feature engineering pipeline
+
+    Params needed
+    -------------
+    data
+    columns_boolean
+    columns_date
+    columns_te_groupby
+    columns_te_target
+    for_training
+    """

-def
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    atm_option = min(option_data, key=lambda x: abs(x["strike"] - spot_price))
-    atm_iv = atm_option["implied_volatility"]
-
-    # IV Skew (25-delta)
-    iv_put_25d = np.mean(
-        [p["implied_volatility"] for p in puts if abs(p["delta"] + 0.25) < 0.05]
-    )
-    iv_call_25d = np.mean(
-        [c["implied_volatility"] for c in calls if abs(c["delta"] - 0.25) < 0.05]
-    )
-    iv_skew_25d = iv_put_25d - iv_call_25d if iv_put_25d and iv_call_25d else None
-
-    # IV Term Structure
-    iv_by_exp = defaultdict(list)
-    for opt in option_data:
-        iv_by_exp[opt["expiration"]].append(opt["implied_volatility"])
-    expiries = sorted(iv_by_exp.keys())
-    if len(expiries) >= 2:
-        iv_term_structure = np.mean(iv_by_exp[expiries[-1]]) - np.mean(
-            iv_by_exp[expiries[0]]
-        )
-    else:
-        iv_term_structure = None
-
-    # Moneyness
-    moneyness = [spot_price / opt["strike"] for opt in option_data if opt["strike"] > 0]
-
-    # % OTM / ITM
-    otm_calls = [c for c in calls if c["strike"] > spot_price]
-    otm_puts = [p for p in puts if p["strike"] < spot_price]
-    otm = len(otm_calls) + len(otm_puts)
-    itm = len(option_data) - otm
-    percent_otm = otm / len(option_data) if option_data else None
-    percent_itm = itm / len(option_data) if option_data else None
-
-    # Weighted Average Strike
-    def weighted_avg_strike(options):
-        total_vol = sum(o["volume"] for o in options)
-        return (
-            sum(o["strike"] * o["volume"] for o in options) / total_vol
-            if total_vol > 0
-            else None
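
Illustrative sketch (not part of the released diff): one way the pipeline documented in the class docstring above might be driven, assuming the class is importable from lecrapaud.feature_engineering, using hypothetical column names, and passing columns_te_groupby as a list of key lists the way generate_target_encodings consumes it.

import pandas as pd
from lecrapaud.feature_engineering import FeatureEngineeringEngine  # assumed import path

df = pd.DataFrame({
    "website": ["https://a.io", None, "https://c.io"],  # hypothetical columns
    "created_at": pd.to_datetime(["2024-01-02", "2024-02-03", "2024-03-04"]),
    "country": ["FR", "FR", "US"],
    "amount": [10.0, 20.0, 30.0],
})
engine = FeatureEngineeringEngine(
    data=df,
    columns_boolean=["website"],
    columns_date=["created_at"],
    columns_te_groupby=[["country"]],
    columns_te_target=["amount"],
    for_training=True,
)
features = engine.run()
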
+    def __init__(
+        self,
+        data: pd.DataFrame,
+        columns_drop: list[str] = [],
+        columns_boolean: list[str] = [],
+        columns_date: list[str] = [],
+        columns_te_groupby: list[str] = [],
+        columns_te_target: list[str] = [],
+        for_training: bool = True,
+        **kwargs,
+    ):
+        self.data = data
+        self.columns_drop = columns_drop
+        self.columns_boolean = columns_boolean
+        self.columns_date = columns_date
+        self.columns_te_groupby = columns_te_groupby
+        self.columns_te_target = columns_te_target
+        self.for_training = for_training
+
+    def run(self) -> pd.DataFrame:
+        # drop columns
+        self.data = self.data.drop(columns=self.columns_drop, errors="ignore")
+
+        # convert object columns to numeric if possible
+        self.data = convert_object_columns_that_are_numeric(self.data)
+
+        # handle boolean features
+        self.data = self.boolean_encode_columns()
+
+        # handle missing values
+        self.data = (
+            self.fillna_at_training()
+            if self.for_training
+            else self.fillna_at_inference()
         )

-
-
-
-    # Option Sentiment Index
-    sentiment_numerator = sum(
-        c["volume"] for c in calls if c["strike"] < spot_price
-    ) - sum(p["volume"] for p in puts if p["strike"] > spot_price)
-    sentiment_index = (
-        sentiment_numerator / (total_put_vol + total_call_vol)
-        if (total_put_vol + total_call_vol) > 0
-        else None
-    )
-
-    return {
-        "put_call_ratio_volume": put_call_ratio_vol,
-        "put_call_ratio_open_interest": put_call_ratio_oi,
-        "open_interest_skew": oi_skew,
-        "total_open_interest": total_oi,
-        "delta_weighted_pcr": delta_weighted_pcr,
-        "atm_iv": atm_iv,
-        "iv_skew_25d": iv_skew_25d,
-        "iv_term_structure": iv_term_structure,
-        "average_moneyness": np.mean(moneyness) if moneyness else None,
-        "percent_otm": percent_otm,
-        "percent_itm": percent_itm,
-        "weighted_avg_strike_calls": avg_strike_calls,
-        "weighted_avg_strike_puts": avg_strike_puts,
-        "option_sentiment_index": sentiment_index,
-    }
-
-
-def apply_indicators(df: pd.DataFrame):
-    """Apply multiple indicators to a grouped dataframe of a single stock."""
-    # Assuming 'df' is the OHLC data for a single stock, apply indicators
-    result = df.copy()
-
-    logger.debug(f"Computing non-period features...")
-
-    # Apply Parabolic SAR
-    result["Parabolic_SAR"] = parabolic_sar(df)
-
-    # Apply Bollinger Bands
-    result["Upper_BB"], result["Middle_BB"], result["Lower_BB"] = bollinger_bands(df)
-
-    # Apply Ichimoku Cloud
-    (
-        result["Tenkan"],
-        result["Kijun"],
-        result["Senkou_A"],
-        result["Senkou_B"],
-        result["Chikou"],
-    ) = ichimoku_cloud(df)
-
-    # Apply Pivot Points (including support and resistance levels)
-    result["Pivot"], result["R1"], result["S1"], result["R2"], result["S2"] = (
-        pivot_points(df)
-    )
-
-    # Other indicators
-    result["CLOSE_DIFF"] = close_diff(df)
-    result["OBV"] = obv(df)
-    result["DOWNWARD_PRESSURE"], result["UPWARD_PRESSURE"] = pressure(df)
-
-    # Apply MACD (Moving Average Convergence Divergence)
-    result["MACD_Line"], result["MACD_Signal"] = macd(df)
-
-    # first buy/sell signal : MACD_SIGNAL_DIFF cross 0 levels
-    result["MACD_SIGNAL_DIFF"] = result["MACD_Line"] - result["MACD_Signal"]
-    result["BUY_1"] = np.where(
-        (result["MACD_SIGNAL_DIFF"] > 0)
-        & (result["MACD_SIGNAL_DIFF"].shift(1) < 0),  # Buy signal (MACD crossover)
-        1,  # Buy
-        np.where(
-            (result["MACD_SIGNAL_DIFF"] < 0)
-            & (
-                result["MACD_SIGNAL_DIFF"].shift(1) > 0
-            ),  # Sell signal (MACD crossunder)
-            -1,  # Sell
-            np.nan,  # Default case
-        ),
-    )
-    result["BUY_1"] = result["BUY_1"].fillna(0)  # TODO: should we fill with 0 (done)
-
-    # second buy/sell signal : MACD_SIGNAL_DIFF cross 30% threshold of maximum value while positive and decreasing, or 30% threshold of minimum value while negative and increasing
-    # Calculate rolling 20-day max and min values for MACD_SIGNAL_DIFF per stock
-    macd_signal_diff_max_20_days = result.groupby("STOCK")[
-        "MACD_SIGNAL_DIFF"
-    ].transform(lambda x: x.rolling(20).max())
-    macd_signal_diff_min_20_days = result.groupby("STOCK")[
-        "MACD_SIGNAL_DIFF"
-    ].transform(lambda x: x.rolling(20).min())
-
-    # Define the buy/sell signal conditions
-    buy_condition = (
-        (result["MACD_SIGNAL_DIFF"] > result["MACD_SIGNAL_DIFF"].shift(1))  # Increasing
-        & (result["MACD_SIGNAL_DIFF"] < 0)  # Negative value
-        & (
-            result["MACD_SIGNAL_DIFF"] > 0.3 * macd_signal_diff_min_20_days
-        )  # Above 30% of minimum
-    )
-
-    sell_condition = (
-        (result["MACD_SIGNAL_DIFF"] < result["MACD_SIGNAL_DIFF"].shift(1))  # Decreasing
-        & (result["MACD_SIGNAL_DIFF"] > 0)  # Positive value
-        & (
-            result["MACD_SIGNAL_DIFF"] < 0.3 * macd_signal_diff_max_20_days
-        )  # Below 30% of maximum
-    )
-
-    # Apply the conditions to calculate buy/sell signals
-    result["BUY_2"] = np.where(
-        buy_condition,
-        np.abs(
-            (result["MACD_SIGNAL_DIFF"] - 0.3 * macd_signal_diff_min_20_days)
-            / (0.3 * macd_signal_diff_min_20_days)
-        ),
-        np.where(
-            sell_condition,
-            -np.abs(
-                (result["MACD_SIGNAL_DIFF"] - 0.3 * macd_signal_diff_max_20_days)
-                / (0.3 * macd_signal_diff_max_20_days)
-            ),
-            0,  # Default
-        ),
-    )
-
-    periods = [
-        9,
-        14,
-        21,
-        50,
-        126,
-        200,
-        252,
-    ]  # 2 weeks, 3 weeks, 1 month and 2.5 months
-    # TODO: we could add more long-term horizons: 126 days (6 months), 200 days (9 months) and 252 days (1 year)
-
-    features = []
-    for period in periods:
-        logger.debug(f"Computing period features for {period} days...")
-
-        features.append(
-            pd.DataFrame(
-                {
-                    f"CUMUL_RET_{period}": cumulative_return(df, period=period),
-                    f"SMA_{period}": sma(df, period=period),
-                    f"EMA_{period}": ema(df, period=period),
-                    f"VOLATILITY_{period}": volatility(df, period=period),
-                    f"ADX_{period}": adx(df, period=period),
-                    f"ATR_{period}": atr(df, period=period),
-                    f"CMF_{period}": chaikin_money_flow(df, period=period),
-                    f"RSI_{period}": rsi(df, period=period),
-                    f"MFI_{period}": mfi(df, period=period),
-                },
-                index=df.index,
-            )
-        )
+        # target encoding
+        self.data = self.generate_target_encodings()

-        #
-
-        features.append(
-            pd.DataFrame(
-                {
-                    f"%K_{period}": k,
-                    f"%D_{period}": d,
-                },
-                index=df.index,
-            )
-        )
-
-    result = pd.concat([result] + features, axis=1)
-
-    # third buy/sell signal : RSI is overbought >0.7 / oversold <0.3
-    result["BUY_3"] = np.where(
-        result["RSI_14"] <= 30,
-        (30 - result["RSI_14"]) / 30,
-        np.where(result["RSI_14"] >= 70, -(result["RSI_14"] - 70) / 30, 0),
-    )
-
-    # fourth buy/sell signal : RSI vs CLOSE divergence
-    # The RSI vs. Close divergence trading signal identifies potential reversals by detecting when the
-    # Relative Strength Index (RSI) and price (Close) move in opposite directions
-    # bullish divergence occurs when the price makes lower lows while RSI makes higher lows (potential uptrend),
-    # and bearish divergence occurs when the price makes higher highs while RSI makes lower highs (potential downtrend)
-
-    # Detect local peaks (RSI Highs) and troughs (RSI Lows) for divergence analysis
-    # Compute local maxima and minima indices
-    rsi_peak_indices = argrelextrema(result["RSI_14"].values, np.greater)[
-        0
-    ]  # RSI highs
-    rsi_trough_indices = argrelextrema(result["RSI_14"].values, np.less)[0]  # RSI lows
-
-    # Create boolean masks for peaks and troughs
-    rsi_peaks_mask = np.zeros(len(result), dtype=bool)
-    rsi_troughs_mask = np.zeros(len(result), dtype=bool)
-
-    rsi_peaks_mask[rsi_peak_indices] = True
-    rsi_troughs_mask[rsi_trough_indices] = True
-
-    # Extract peak and trough rows efficiently
-    rsi_peaks = result.loc[rsi_peaks_mask, ["CLOSE", "RSI_14"]].copy()
-    rsi_troughs = result.loc[rsi_troughs_mask, ["CLOSE", "RSI_14"]].copy()
-
-    # Compute RSI and CLOSE differences to check divergence
-    for i in [1, 2, 3]:
-        # RSI & Price difference from past peaks
-        rsi_peaks[f"RSI_PEAK_DIFF_{i}"] = rsi_peaks["RSI_14"].diff(i)
-        rsi_peaks[f"PRICE_PEAK_DIFF_{i}"] = rsi_peaks["CLOSE"].diff(i)
-
-        # RSI & Price difference from past troughs
-        rsi_troughs[f"RSI_TROUGH_DIFF_{i}"] = rsi_troughs["RSI_14"].diff(i)
-        rsi_troughs[f"PRICE_TROUGH_DIFF_{i}"] = rsi_troughs["CLOSE"].diff(i)
-
-        # Detect bearish divergence (RSI down, price up) and bullish divergence (RSI up, price down)
-        rsi_peaks[f"DIVERGENCE_{i}"] = np.where(
-            (rsi_peaks[f"RSI_PEAK_DIFF_{i}"] < 0)
-            & (rsi_peaks[f"PRICE_PEAK_DIFF_{i}"] > 0),
-            -np.abs(rsi_peaks[f"RSI_PEAK_DIFF_{i}"]),
-            np.where(
-                (rsi_peaks[f"RSI_PEAK_DIFF_{i}"] > 0)
-                & (rsi_peaks[f"PRICE_PEAK_DIFF_{i}"] < 0),
-                -np.abs(rsi_peaks[f"RSI_PEAK_DIFF_{i}"]),
-                0,
-            ),
-        )
+        # Cyclic encode dates
+        self.data = self.cyclic_encode_date()

-
-            (rsi_troughs[f"RSI_TROUGH_DIFF_{i}"] > 0)
-            & (rsi_troughs[f"PRICE_TROUGH_DIFF_{i}"] < 0),
-            np.abs(rsi_troughs[f"RSI_TROUGH_DIFF_{i}"]),
-            np.where(
-                (rsi_troughs[f"RSI_TROUGH_DIFF_{i}"] < 0)
-                & (rsi_troughs[f"PRICE_TROUGH_DIFF_{i}"] > 0),
-                np.abs(rsi_troughs[f"RSI_TROUGH_DIFF_{i}"]),
-                0,
-            ),
-        )
+        return self.data

-
-
-
-        [rsi_peaks[divergence_cols], rsi_troughs[divergence_cols]], axis=0
-    )
+    def cyclic_encode_date(self) -> pd.DataFrame:
+        """
+        Adds cyclic (sine and cosine) encoding for common date parts: day of week, day of month, and month.

-
-
+        Parameters:
+            df (pd.DataFrame): Input dataframe
+            columns (list[str]): List of datetime columns to encode
+            prefix (str): Optional prefix for new columns. If None, uses column names.

-
-
-
+        Returns:
+            pd.DataFrame: Updated dataframe with new cyclic features
+        """

+        df: pd.DataFrame = self.data
+        columns: list[str] = self.columns_date

-
-
-
-
-    save_as_csv: bool = False,
-    analytics: bool = False,
-):
-    """Main function to process the full dataset with multiple stocks
+        def cyclic_encode(series, max_value):
+            sin_values = np.sin(2 * np.pi * series / max_value)
+            cos_values = np.cos(2 * np.pi * series / max_value)
+            return sin_values, cos_values

-
-    - df (pd.DataFrame): the dataframe with ohlc data
-    - for_training (bool): whether to compute targets and for_training as data_for_training, or not.
-    """
+        for col in columns:

-
-
-
-
-
-
-
-
-    df["YEARDAY"] = df["DATE"].dt.dayofyear
-
-    # Cyclic encoding for date-like variables
-    def cyclic_encode(series, max_value):
-        sin_values = np.sin(2 * np.pi * series / max_value)
-        cos_values = np.cos(2 * np.pi * series / max_value)
-        return sin_values, cos_values
-
-    df["MONTH_sin"], df["MONTH_cos"] = cyclic_encode(df["MONTH"], 12)
-    df["DAY_sin"], df["DAY_cos"] = cyclic_encode(df["DAY"], 31)
-    df["WEEK_sin"], df["WEEK_cos"] = cyclic_encode(df["WEEK"], 52)
-    df["WEEKDAY_sin"], df["WEEKDAY_cos"] = cyclic_encode(df["WEEKDAY"], 7)
-    df["YEARDAY_sin"], df["YEARDAY_cos"] = cyclic_encode(df["YEARDAY"], 365)
-
-    # Computing residual RET and relative VOLUME
-    logger.info("Creating RET and VOLUME metrics...")
-    df["RET"] = df.groupby("STOCK")["CLOSE"].pct_change(1)
-    df["MARKET_RET"] = df.groupby("DATE")["RET"].transform("mean")
-    df["RESIDUAL_RET"] = df["RET"] - df["MARKET_RET"]
-
-    df["VOLUME_RATIO"] = (
-        df["VOLUME"]
-        / df.groupby("STOCK")["VOLUME"].rolling(20, min_periods=1).mean().values
-    )
-    df["MARKET_VOLUME_RATIO"] = df.groupby("DATE")["VOLUME_RATIO"].transform("mean")
-    df["RELATIVE_VOLUME"] = df["VOLUME_RATIO"] - df["MARKET_VOLUME_RATIO"]
-
-    logger.info("Creating historical time series metrics...")
-    periods = [
-        1,  # daily
-        2,
-        3,
-        4,
-        5,  # weekly
-        9,
-        14,
-        21,  # monthly
-        50,
-        126,
-        200,
-        252,
-    ]  # need to keep 1, 2, 3, 4, 5 for backward compatibility
-    for METRIC in ["RET", "VOLUME", "RESIDUAL_RET", "RELATIVE_VOLUME"]:
-        for i in periods:
-            df[f"{METRIC}_-{i}"] = df[METRIC].shift(i)
-
-    # Group by "STOCK" and apply the indicators for each stock
-    logger.info("Applying indicators...")
-    grouped_df = df.groupby("STOCK", group_keys=False)
-    preprocessed_df = grouped_df.apply(apply_indicators)
-
-    # Target encoding / Mean encoding for categorical features
-    # it's when you groupby a categorical feature and aggregate a target with a stat such as mean or median
-    logger.info("Computing aggregated features...")
-    statistics = ["mean", "median"]
-    gb_features = [["SECTOR", "DATE"], ["SUBINDUSTRY", "DATE"]]
-
-    # Define your base
-    target_features = ["RET", "VOLUME", "RESIDUAL_RET", "RELATIVE_VOLUME"]
-    periods = [9, 14, 21, 50]
-    indicators = [
-        "CUMUL_RET",
-        "SMA",
-        "EMA",
-        "VOLATILITY",
-        "ATR",
-        "ADX",
-        "%K",
-        "RSI",
-        "MFI",
-    ]
-    target_features += [f"{ind}_{p}" for p in periods for ind in indicators]
-
-    # Prepare to collect new columns
-    new_feature_cols = {}
-
-    # Generate features efficiently
-    for gb_feature, stat, target in product(gb_features, statistics, target_features):
-        col_name = f"{target}_{'_'.join(gb_feature)}_{stat.upper()}"
-        new_feature_cols[col_name] = preprocessed_df.groupby(gb_feature)[
-            target
-        ].transform(stat)
-
-    # Merge all at once to improve performance
-    preprocessed_df = pd.concat(
-        [preprocessed_df, pd.DataFrame(new_feature_cols)], axis=1
-    )
-
-    if for_training:
-        preprocessed_df = targets_creation(preprocessed_df)
-
-    # Descriptive Analysis
-    if analytics:
-        traditional_descriptive_analysis(preprocessed_df)
-    if save_as_csv and PYTHON_ENV == "Development":
-        preprocessed_df_to_csv = preprocessed_df.sort_values(["DATE", "STOCK"])
-        preprocessed_df_to_csv.to_csv(
-            f"{data_dir}/data_for_training.csv",
-            index=False,
-            header=True,
-        )
+            df[col] = pd.to_datetime(df[col]).dt.normalize()
+            df[f"{col}_year"] = df[col].dt.isocalendar().year
+            df[f"{col}_month"] = df[col].dt.month
+            df[f"{col}_day"] = df[col].dt.day
+            df[f"{col}_week"] = df[col].dt.isocalendar().week
+            df[f"{col}_weekday"] = df[col].dt.weekday
+            df[f"{col}_yearday"] = df[col].dt.dayofyear
+            df[col] = pd.to_datetime(df[col]).map(pd.Timestamp.toordinal)

-
-
-
-
-
-
-    ]
-
-
+            df[f"{col}_month_sin"], df[f"{col}_month_cos"] = cyclic_encode(
+                df[f"{col}_month"], 12
+            )
+            df[f"{col}_day_sin"], df[f"{col}_day_cos"] = cyclic_encode(
+                df[f"{col}_day"], 31
+            )
+            df[f"{col}_week_sin"], df[f"{col}_week_cos"] = cyclic_encode(
+                df[f"{col}_week"], 52
+            )
+            df[f"{col}_weekday_sin"], df[f"{col}_weekday_cos"] = cyclic_encode(
+                df[f"{col}_weekday"], 7
+            )
+            df[f"{col}_yearday_sin"], df[f"{col}_yearday_cos"] = cyclic_encode(
+                df[f"{col}_yearday"], 365
+            )

-
-
+            # Drop the original column TODO: not sure if we should drop it for time series
+            # df.drop(col, axis=1, inplace=True)
+
+        return df
+
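
A minimal standalone check of the sine/cosine trick used by cyclic_encode_date above (nothing package-specific is assumed):

import numpy as np
import pandas as pd

months = pd.Series([1, 6, 12])
month_sin = np.sin(2 * np.pi * months / 12)
month_cos = np.cos(2 * np.pi * months / 12)
# January (1) and December (12) land close together in (sin, cos) space,
# which is what makes the encoding "cyclic" for date parts.
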
+    def boolean_encode_columns(self) -> pd.DataFrame:
+        """
+        Applies boolean encoding to a list of columns:
+        - Leaves column as-is if already int with only 0 and 1
+        - Otherwise: sets 1 if value is present (notna), 0 if null/NaN/None
+
+        Parameters:
+            df (pd.DataFrame): Input dataframe
+            columns (list): List of column names to encode
+
+        Returns:
+            pd.DataFrame: Updated dataframe with encoded columns
+        """
+
+        df: pd.DataFrame = self.data
+        columns: list[str] = self.columns_boolean
+
+        for column in columns:
+            col = df[column]
+            if pd.api.types.is_integer_dtype(col) and set(
+                col.dropna().unique()
+            ).issubset({0, 1}):
+                continue  # already valid binary
+            df[column] = col.notna().astype(int)
+        return df
+
+    def generate_target_encodings(self) -> pd.DataFrame:
+        """
+        Generate target encoding features (e.g., mean, median) for specified targets and group-by combinations.
+
+        Parameters:
+            df (pd.DataFrame): Input dataframe
+            columns_te_groupby (list of list): Grouping keys, e.g., [["SECTOR", "DATE"], ["SUBINDUSTRY", "DATE"]]
+            columns_te_target (list): Target columns to aggregate (e.g., ["RET", "VOLUME", "RSI_14"])
+            statistics (list): List of aggregation statistics (e.g., ["mean", "median"])
+
+        Returns:
+            pd.DataFrame: Original dataframe with new encoded columns added
+        """
+
+        df: pd.DataFrame = self.data
+        columns_te_groupby: list[list[str]] = self.columns_te_groupby
+        columns_te_target: list[str] = self.columns_te_target
+        statistics: list[str] = ["mean", "median"]
+
+        df = df.copy()
+        new_feature_cols = {}
+        for group_cols, stat, target_col in product(
+            columns_te_groupby, statistics, columns_te_target
+        ):
+            col_name = f"{target_col}_{'_'.join(group_cols)}_{stat.upper()}"
+            new_feature_cols[col_name] = df.groupby(group_cols)[target_col].transform(
+                stat
+            )

-
-
-
+        # merge all at once to improve performance
+        df = pd.concat([df, pd.DataFrame(new_feature_cols)], axis=1)
+        return df
+
+    def fillna_at_training(self) -> pd.DataFrame:
+        """
+        Fill missing values in a DataFrame:
+        - Numeric columns: fill with mean
+        - Categorical columns: fill with mode
+        Handles both NaN and None.
+
+        Parameters:
+            df (pd.DataFrame): Input DataFrame
+
+        Returns:
+            pd.DataFrame: Cleaned DataFrame with missing values filled
+        """
+
+        df: pd.DataFrame = self.data.copy()
+
+        for col in df.columns:
+            missing_count = df[col].isnull().sum()
+            if missing_count > 0:
+                if pd.api.types.is_numeric_dtype(df[col]):
+                    df[col] = df[col].fillna(df[col].mean())
+                    logger.info(
+                        f"Filled {missing_count} NaN values in numeric column '{col}' with mean."
+                    )
+                else:
+                    mode = df[col].mode()
+                    if not mode.empty:
+                        mode_value = mode[0]
+                        mode_count = (df[col] == mode_value).sum()
+                        if mode_count > 100:
+                            fill_value = mode_value
+                        else:
+                            fill_value = "unknown"
+                    else:
+                        fill_value = "unknown"
+
+                    df[col] = df[col].fillna(fill_value)
+                    logger.info(
+                        f"Filled {missing_count} NaN values in categorical column '{col}' with '{fill_value}'."
+                    )
+
+        return df
+
+    def fillna_at_inference(self) -> pd.DataFrame:
+
+        df: pd.DataFrame = self.data
+
+        missing_cols = df.columns[df.isnull().any()].tolist()
+
+        if missing_cols:
+            numeric_cols = [
+                col for col in missing_cols if pd.api.types.is_numeric_dtype(df[col])
+            ]
+            non_numeric_cols = [col for col in missing_cols if col not in numeric_cols]

-
-
-
+            logger.warning(
+                f"Missing values found in inference data."
+                f"Filling with 0 for numeric columns: {numeric_cols}, "
+                f"and 'unknown' for non-numeric columns: {non_numeric_cols}"
+            )

-
-
-
+            df[numeric_cols] = df[numeric_cols].fillna(0)
+            df[non_numeric_cols] = df[non_numeric_cols].fillna("unknown")
+
+        return df
+
+
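
The groupby/transform pattern behind generate_target_encodings above, shown on a toy frame (column names are made up for the example):

import pandas as pd

toy = pd.DataFrame({"SECTOR": ["tech", "tech", "bank"], "RET": [0.01, 0.03, -0.02]})
# one new column per (group keys, statistic, target) combination
toy["RET_SECTOR_MEAN"] = toy.groupby(["SECTOR"])["RET"].transform("mean")
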
+class PreprocessFeature:
+
+    def __init__(
+        self,
+        data: pd.DataFrame,
+        dataset,
+        time_series: bool = False,
+        date_column: str | None = None,
+        group_column: str | None = None,
+        val_size: float = 0.2,
+        test_size: float = 0.2,
+        columns_pca: list[str] = [],
+        columns_onehot: list[str] = [],
+        columns_binary: list[str] = [],
+        columns_ordinal: list[str] = [],
+        columns_frequency: list[str] = [],
+        target_numbers: list = [],
+        target_clf: list = [],
+        **kwargs,
+    ):
+        self.data = data
+        self.data.columns = self.data.columns.str.upper()
+
+        self.dataset = dataset
+        self.columns_pca = columns_pca
+        self.columns_onehot = columns_onehot
+        self.columns_binary = columns_binary
+        self.columns_ordinal = columns_ordinal
+        self.columns_frequency = columns_frequency
+        self.target_numbers = target_numbers
+        self.target_clf = target_clf
+
+        self.time_series = time_series
+        self.date_column = date_column
+        self.group_column = group_column
+        self.val_size = val_size
+        self.test_size = test_size
+
+        self.dataset_dir = self.dataset.path
+        self.dataset_id = self.dataset.id
+        self.data_dir = f"{self.dataset_dir}/data"
+        self.preprocessing_dir = f"{self.dataset_dir}/preprocessing"
+
+    def run(self):
+        # Split
+        train, val, test = (
+            self.train_val_test_split_time_series()
+            if self.time_series
+            else self.train_val_test_split(
+                stratify_col=f"TARGET_{self.target_numbers[0]}"
+            )
+        )  # TODO: only stratifying first target for now

+        # PCA
+        train, pcas = self.add_pca_features(train)
+        val, _ = self.add_pca_features(test, pcas=pcas)
+        test, _ = self.add_pca_features(val, pcas=pcas)

-
-def print_missing_values(df: pd.DataFrame):
+        joblib.dump(pcas, f"{self.preprocessing_dir}/pcas.pkl")

-
-
-
+        # Encoding
+        train, transformer = self.encode_categorical_features(train)
+        val, _ = self.encode_categorical_features(
+            val,
+            transformer=transformer,
+        )
+        test, _ = self.encode_categorical_features(
+            test,
+            transformer=transformer,
         )
-    else:
-        logger.info("No missing values found")
-
-
-def plot_sector_repartition(df: pd.DataFrame):
-    """Visualise repartition of stock per sectors
-
-    Args:
-        df (pd.DataFrame): a df created with `get_data`
-    """
-    sns.barplot(
-        data=df.groupby("SECTOR")["STOCK"].nunique(),
-        orient="h",
-        order=df.groupby("SECTOR")["STOCK"]
-        .nunique()
-        .sort_values(ascending=False)
-        .index,
-    )

+        joblib.dump(self.data, f"{self.data_dir}/full.pkl")
+        joblib.dump(transformer, f"{self.preprocessing_dir}/column_transformer.pkl")
+        summary = summarize_dataframe(train)
+        summary.to_csv(f"{self.dataset_dir}/feature_summary.csv", index=False)

-
-    with pd.option_context("display.max_rows", None):
+        return train, val, test

-
-
+    def inference(self):
+        # PCA
+        pcas = joblib.load(f"{self.preprocessing_dir}/pcas.pkl")
+        data, _ = self.add_pca_features(self.data, pcas=pcas)

-    #
-
-
-
-
-        .dropna()
-        .sort_values(ascending=False)
+        # Encoding
+        transformer = joblib.load(f"{self.preprocessing_dir}/column_transformer.pkl")
+        data, _ = self.encode_categorical_features(
+            data,
+            transformer=transformer,
         )
+        return data

-
-
-
-
-
-
-        .sort_values(ascending=False)
-    )
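
PreprocessFeature.run() above delegates the non-time-series case to a two-stage stratified split; the same pattern in a self-contained sketch (sizes and target name are arbitrary here):

import pandas as pd
from sklearn.model_selection import train_test_split

df = pd.DataFrame({"x": range(100), "TARGET_1": [0, 1] * 50})
train, temp = train_test_split(df, test_size=0.4, random_state=42, stratify=df["TARGET_1"])
val, test = train_test_split(temp, test_size=0.5, random_state=42, stratify=temp["TARGET_1"])
# roughly 60/20/20, with the TARGET_1 class balance preserved in each part
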
+    def train_val_test_split_time_series(self):
+        df: pd.DataFrame = self.data
+        date_column: str = self.date_column
+        group_column: str = self.group_column
+        val_size: float = self.val_size
+        test_size: float = self.test_size

-
-
+        if not date_column:
+            ValueError("Please specify a date_column for time series")

-
-
+        if group_column:
+            df.sort_values([date_column, group_column], inplace=True)
+        else:
+            df.sort_values(date_column, inplace=True)

-
-        unique_stock_count = (
-            len(df[stock_column].unique()) if stock_column in df.columns else None
-        )
+        dates = df[date_column].unique()

-
-
+        val_first_id = int(len(dates) * (1 - val_size - test_size)) + 1
+        test_first_id = int(len(dates) * (1 - test_size)) + 1

-
-
+        train = df[df[date_column].isin(dates[:val_first_id])]
+        val = df[df[date_column].isin(dates[val_first_id:test_first_id])]
+        test = df[df[date_column].isin(dates[test_first_id:])]

-
+        dates = {}
+        for name, data in zip(["train", "val", "test"], [train, val, test]):
+            dates[f"{name}_start_date"] = (
+                data[date_column].map(pd.Timestamp.fromordinal).iat[0]
+            )
+            dates[f"{name}_end_date"] = (
+                data[date_column].map(pd.Timestamp.fromordinal).iat[-1]
+            )

-        if unique_stock_count is not None:
             logger.info(
-                f"
+                f"{data.shape} {name} data from {dates[f"{name}_start_date"].strftime('%d/%m/%Y')} to {dates[f"{name}_end_date"].strftime('%d/%m/%Y')}"
             )
-        else:
-            logger.info(f"\nColumn '{stock_column}' not found in the DataFrame.")

-
-
+        Dataset.update(
+            match_fields=["id"],
+            id=self.dataset_id,
+            train_size=len(train),
+            val_size=len(val),
+            test_size=len(test),
+            **dates,
         )
-
-
-
+        return (
+            train.reset_index(drop=True),
+            val.reset_index(drop=True),
+            test.reset_index(drop=True),
         )

-
+    def train_val_test_split(
+        self,
+        random_state: int = 42,
+        stratify_col: str | None = None,
+    ) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
+        """
+        Splits a DataFrame into train, validation, and test sets.
+
+        Parameters:
+            df (pd.DataFrame): The full dataset
+            val_size (float): Proportion of validation set (default 0.1)
+            test_size (float): Proportion of test set (default 0.1)
+            random_state (int): Random seed for reproducibility
+            stratify_col (str | None): Optional column to stratify on (for classification tasks)
+
+        Returns:
+            Tuple of (train_df, val_df, test_df)
+        """
+        df: pd.DataFrame = self.data
+        val_size: float = self.val_size
+        test_size: float = self.test_size
+
+        stratify_vals = df[stratify_col] if stratify_col else None
+
+        # First split: train + (val + test)
+        train, temp = train_test_split(
+            df,
+            test_size=val_size + test_size,
+            random_state=random_state,
+            stratify=stratify_vals,
+        )

-
+        # Adjust stratify target for val/test split
+        stratify_temp = temp[stratify_col] if stratify_col else None

+        # Compute val and test sizes relative to temp
+        val_ratio = val_size / (val_size + test_size)

-
-
-
-
-
-    )
-    """
-    Function to visualize local maxima and minima for a given stock in the data.
+        val, test = train_test_split(
+            temp,
+            test_size=1 - val_ratio,
+            random_state=random_state,
+            stratify=stratify_temp,
+        )

-
-
-    - stock: str, the stock identifier to analyze (e.g., 'AAPL', 'GOOG')
-    - days_before_last: int, number of days before the last date in the dataset to visualize
-    - local_max_order: int, the window size for identifying local extrema (default: 5)
-    """
+        for name, data in zip(["train", "val", "test"], [train, val, test]):
+            logger.info(f"{data.shape} {name} data")

-
-
-
-
-    # Find local maxima (argrelextrema with np.greater) for each stock
-    local_max_CLOSE = (
-        data[data["STOCK"] == stock]
-        .set_index("DATE")["CLOSE"]
-        .iloc[
-            argrelextrema(
-                data[data["STOCK"] == stock]["CLOSE"].values,
-                np.greater,
-                order=local_max_order,
-            )
-        ]
-        .reset_index()
-    )
-
-    # Find local minima (argrelextrema with np.less) for each stock
-    local_min_CLOSE = (
-        data[data["STOCK"] == stock]
-        .set_index("DATE")["CLOSE"]
-        .iloc[
-            argrelextrema(
-                data[data["STOCK"] == stock]["CLOSE"].values,
-                np.less,
-                order=local_max_order,
-            )
-        ]
-        .reset_index()
-    )
-
-    # Filter maxima based on stock and date range
-    local_max_CLOSE = local_max_CLOSE[local_max_CLOSE["DATE"] >= start_date]
-
-    # Filter minima based on stock and date range
-    local_min_CLOSE = local_min_CLOSE[local_min_CLOSE["DATE"] >= start_date]
-
-    # logger.info the maxima and minima dates
-    logger.info(
-        f"Maxima Dates for Stock {stock}: {list(local_max_CLOSE['DATE'].values)}"
-    )
-    logger.info(
-        f"Minima Dates for Stock {stock}: {list(local_min_CLOSE['DATE'].values)}"
-    )
-
-    # Plot the stock's CLOSE prices within the specified date range
-    stock_data = data[(data["STOCK"] == stock) & (data["DATE"] >= start_date)][
-        ["CLOSE", "DATE"]
-    ].set_index("DATE")
-
-    plt.figure(figsize=(10, 6))
-    stock_data.plot(color="black", title=f"Stock {stock} Extremas")
-
-    # Add vertical lines for maxima
-    for date in local_max_CLOSE["DATE"].values:
-        plt.axvline(
-            x=date,
-            color="red",
-            label="Maxima" if date == local_max_CLOSE["DATE"].values[0] else "",
+        return (
+            train.reset_index(drop=True),
+            val.reset_index(drop=True),
+            test.reset_index(drop=True),
         )

-    #
-
-
-
-
-
+    # embedding and pca
+    def add_pca_features(
+        self, df: pd.DataFrame, n_components: int = 5, pcas=None
+    ) -> tuple[pd.DataFrame, dict]:
+        """
+        Adds PCA components as new columns to a DataFrame from a column containing numpy arrays.
+        NEED TRAIN/TEST SPLIT BEFORE APPLYING - LIKE ENCODING CATEGORICAL VARIABLES
+
+        Parameters:
+            df (pd.DataFrame): Input DataFrame
+            column (str): Name of the column containing np.ndarray
+            n_components (int): Number of PCA components to keep
+
+        Returns:
+            pd.DataFrame: DataFrame with new PCA columns added
+        """
+        columns: list[str] = self.columns_pca
+
+        pcas_dict = {}
+        for column in columns:
+            # Convert text to embeddings if necessary
+            if not isinstance(df[column].iloc[0], (np.ndarray, list)):
+                sentences = df[column].astype(str).tolist()
+                logger.info(
+                    f"Total sentences to embed for column {column}: {len(sentences)}"
+                )
+
+                # Truncate each sentence
+                truncate_sentences = [truncate_text(sentence) for sentence in sentences]
+
+                # embedding
+                embedding_matrix = get_openai_embeddings(truncate_sentences)
+            else:
+                logger.info(f"Column {column} is already embeddings")
+                # Stack the vectors into a 2D array
+                embedding_matrix = np.vstack(df[column].values)
+
+            # Apply PCA
+            if pcas:
+                pca = pcas[column]
+                pca_features = pca.transform(embedding_matrix)
+            else:
+                pca = PCA(n_components=n_components)
+                pca_features = pca.fit_transform(embedding_matrix)
+
+            # Add PCA columns
+            for i in range(n_components):
+                df[f"{column}_pca_{i+1}"] = pca_features[:, i]
+
+            # Drop the original column
+            df.drop(column, axis=1, inplace=True)
+            pcas_dict.update({column: pca})
+
+        return df, pcas_dict
+
+    # encoding categorical features
+    def encode_categorical_features(
+        self,
+        df: pd.DataFrame,
+        transformer: ColumnTransformer | None = None,
+    ) -> tuple[pd.DataFrame, ColumnTransformer]:
+        """
+        Encodes categorical columns using one-hot, binary, ordinal, and frequency encoding.
+
+        Parameters:
+            df (pd.DataFrame): Input DataFrame
+            columns_onehot (list[str]): creates one binary column per category, for low-cardinality categorical features
+            columns_binary (list[str]): converts categories into binary and splits bits across columns, for mid-to-high cardinality (e.g., 10–100 unique values)
+            columns_ordinal (list[str]): assigns integer ranks to categories, when order matters (e.g., low < medium < high)
+            columns_frequency (list[str]): replaces each category with its frequency count, normalized to a proportion, for high-cardinality features where frequency carries meaning
+            transformer (ColumnTransformer, optional): if provided, applies transform only
+
+        Returns:
+            tuple: (transformed DataFrame, ColumnTransformer)
+        """
+        columns_onehot: list[str] = self.columns_onehot
+        columns_binary: list[str] = self.columns_binary
+        columns_ordinal: list[str] = self.columns_ordinal
+        columns_frequency: list[str] = self.columns_frequency
+
+        X = df.loc[:, ~df.columns.str.contains("^TARGET_")]
+        y = df.loc[:, df.columns.str.contains("^TARGET_")]
+        save_in_db = False
+
+        all_columns = (
+            columns_onehot + columns_binary + columns_ordinal + columns_frequency
         )

-
-
-
-
-
-
-
-
-
-
-
+        if transformer:
+            transformed = transformer.transform(X)
+        else:
+            transformer = ColumnTransformer(
+                transformers=[
+                    (
+                        "onehot",
+                        OneHotEncoder(handle_unknown="ignore", sparse_output=False),
+                        columns_onehot,
+                    ),
+                    (
+                        "ordinal",
+                        OrdinalEncoder(
+                            handle_unknown="use_encoded_value", unknown_value=-1
+                        ),
+                        columns_ordinal,
+                    ),
+                    ("binary", BinaryEncoder(handle_unknown="value"), columns_binary),
+                    ("freq", CountEncoder(normalize=True), columns_frequency),
+                ],
+                remainder="passthrough",
+            )
+            transformed = transformer.fit_transform(X)
+            save_in_db = True

-
-
-    - stock: str, the stock identifier to analyze (e.g., 'AAPL', 'GOOG')
-    - days_before_last: int, number of days before the last date in the dataset to visualize
-    """
+        # Build output column names
+        column_names = []

-
-
-
+        if columns_onehot:
+            column_names.extend(
+                transformer.named_transformers_["onehot"]
+                .get_feature_names_out(columns_onehot)
+                .tolist()
+            )

-
-
+        if columns_ordinal:
+            column_names.extend(columns_ordinal)

-
-
-
+        if columns_binary:
+            column_names.extend(
+                transformer.named_transformers_["binary"]
+                .get_feature_names_out(columns_binary)
+                .tolist()
+            )

-
-
+        if columns_frequency:
+            column_names.extend(columns_frequency)

-
-
-
-            stock_data.loc[stock_data["TARGET_11"] == signal_value, "DATE"],
-            stock_data.loc[stock_data["TARGET_11"] == signal_value, "CLOSE"],
-            color=color,
-            label=f"Signal {signal_value}",
-            s=50,  # Size of the points
-        )
+        # Add passthrough (non-encoded) columns
+        passthrough_columns = [col for col in X.columns if col not in all_columns]
+        column_names.extend(passthrough_columns)
|
|
992
641
|
|
|
993
|
-
|
|
994
|
-
plt.xlabel("Date")
|
|
995
|
-
plt.ylabel("Close Price")
|
|
996
|
-
plt.legend()
|
|
997
|
-
plt.grid(True)
|
|
998
|
-
plt.show()
|
|
999
|
-
|
|
1000
|
-
|
|
1001
|
-
def visualize_data_distribution(
|
|
1002
|
-
data,
|
|
1003
|
-
plot_type="hist",
|
|
1004
|
-
features=None,
|
|
1005
|
-
bins=50,
|
|
1006
|
-
rows=5,
|
|
1007
|
-
cols=5,
|
|
1008
|
-
width_per_plot=4,
|
|
1009
|
-
height_per_plot=3,
|
|
1010
|
-
):
|
|
1011
|
-
"""
|
|
1012
|
-
Function to visualize the data distribution for multiple features in a DataFrame with dynamic figsize,
|
|
1013
|
-
splitting into multiple figures if there are too many features for one figure.
|
|
1014
|
-
|
|
1015
|
-
Parameters:
|
|
1016
|
-
- data: pd.DataFrame, the DataFrame containing the data to visualize.
|
|
1017
|
-
- plot_type: str, the type of plot to use ('hist', 'kde', 'box').
|
|
1018
|
-
- features: list, list of features (columns) to visualize. If None, all numeric features are used.
|
|
1019
|
-
- bins: int, the number of bins for histograms (default: 50).
|
|
1020
|
-
- rows: int, number of rows in the subplot grid (default: 5).
|
|
1021
|
-
- cols: int, number of columns in the subplot grid (default: 5).
|
|
1022
|
-
- width_per_plot: int, the width of each subplot (default: 4).
|
|
1023
|
-
- height_per_plot: int, the height of each subplot (default: 3).
|
|
1024
|
-
"""
|
|
642
|
+
X_transformed = pd.DataFrame(transformed, columns=column_names, index=df.index)
|
|
1025
643
|
|
|
1026
|
-
|
|
1027
|
-
|
|
1028
|
-
features = data.select_dtypes(include=[np.number]).columns.tolist()
|
|
644
|
+
# Try to convert columns to best possible dtypes
|
|
645
|
+
X_transformed = X_transformed.convert_dtypes()
|
|
1029
646
|
|
|
1030
|
-
|
|
1031
|
-
|
|
1032
|
-
|
|
1033
|
-
|
|
1034
|
-
|
|
647
|
+
# Insert features in db
|
|
648
|
+
if save_in_db:
|
|
649
|
+
# TODO: in bulk
|
|
650
|
+
categorical_features, numerical_features = get_features_by_types(
|
|
651
|
+
X_transformed
|
|
652
|
+
)
|
|
653
|
+
for feature in categorical_features:
|
|
654
|
+
Feature.upsert(match_fields=["name"], name=feature, type="categorical")
|
|
655
|
+
for feature in numerical_features:
|
|
656
|
+
Feature.upsert(match_fields=["name"], name=feature, type="numerical")
|
|
657
|
+
for target in y.columns:
|
|
658
|
+
target_number = int(target.split("_")[1])
|
|
659
|
+
type = (
|
|
660
|
+
"classification"
|
|
661
|
+
if target_number in self.target_clf
|
|
662
|
+
else "regression"
|
|
663
|
+
)
|
|
664
|
+
# TODO: what about description here ?
|
|
665
|
+
Target.upsert(match_fields=["name", "type"], name=target, type=type)
|
|
666
|
+
|
|
667
|
+
return pd.concat([X_transformed, y], axis=1), transformer
|
|
668
|
+
|
|
669
|
+
|
|
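Note: `encode_categorical_features` fits the ColumnTransformer when none is passed and returns it, so the exact same encoding can be replayed on held-out data. A minimal usage sketch; the `fe` instance and the DataFrame names are assumptions for illustration only.

    # Fit the encoders on the training split; reuse the returned transformer on the test split
    train_encoded, transformer = fe.encode_categorical_features(train_df)
    test_encoded, _ = fe.encode_categorical_features(test_df, transformer=transformer)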
+# analysis & utils
+def summarize_dataframe(
+    df: pd.DataFrame, sample_categorical_threshold: int = 15
+) -> pd.DataFrame:
+    summary = []
+
+    def is_hashable_series(series: pd.Series) -> bool:
+        try:
+            _ = series.dropna().unique()
+            return True
+        except TypeError:
+            return False

-
-
-        # Subset of features for the current figure
-        subset_features = features[start : start + plots_per_figure]
+    df = convert_object_columns_that_are_numeric(df)
+    df = df.convert_dtypes()

-
-
-
-
-        figsize = (grid_cols * width_per_plot, grid_rows * height_per_plot)
+    for col in df.columns:
+        total_missing = df[col].isna().sum()
+        col_data = df[col].dropna()
+        dtype = col_data.dtype

-
-
-
+        if col_data.empty:
+            summary.append(
+                {
+                    "Column": col,
+                    "Dtype": dtype,
+                    "Type": "unknown",
+                    "Detail": "No non-null values",
+                    "Missing": total_missing,
+                }
+            )
+            continue
+
+        # Case 1: Numeric columns
+        if pd.api.types.is_numeric_dtype(col_data):
+            unique_vals = col_data.nunique()
+
+            if set(col_data.unique()).issubset({0, 1}):
+                col_type = "binary-categorical"
+                detail = "0/1 values only"
+            elif (
+                pd.api.types.is_integer_dtype(col_data)
+                and unique_vals <= sample_categorical_threshold
+            ):
+                col_type = "multi-categorical"
+                top_vals = col_data.value_counts().head(10)
+                detail = ", ".join(f"{k} ({v})" for k, v in top_vals.items())
+            else:
+                col_type = "numeric"
+                q = col_data.quantile([0, 0.25, 0.5, 0.75, 1])
+                detail = (
+                    f"Min: {q.iloc[0]:.2f}, Q1: {q.iloc[1]:.2f}, Median: {q.iloc[2]:.2f}, "
+                    f"Q3: {q.iloc[3]:.2f}, Max: {q.iloc[4]:.2f}"
+                )
+
+        # Case 2: Object or other hashable columns
+        elif is_hashable_series(col_data):
+            unique_vals = col_data.nunique()
+            if unique_vals <= sample_categorical_threshold:
+                col_type = "object-categorical"
+                top_vals = col_data.value_counts().head(10)
+                detail = ", ".join(f"{k} ({v})" for k, v in top_vals.items())
+            else:
+                col_type = "high-cardinality-categorical"
+                detail = f"{unique_vals} unique values"
+
+        # Case 3: Unusable columns
+        else:
+            col_type = "non-hashable"
+            detail = f"Non-hashable type: {type(col_data.iloc[0])}"
+
+        summary.append(
+            {
+                "Column": col,
+                "Dtype": dtype,
+                "Type": col_type,
+                "Detail": detail,
+                "Missing": total_missing,
+            }
+        )

-
-        for i, feature in enumerate(subset_features):
-            ax = axes[i]
+    return pd.DataFrame(summary)

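Note: `summarize_dataframe` returns one row per column with an inferred type, a short detail string, and the missing count. A small self-contained example; the toy data is invented for illustration.

    import pandas as pd

    toy = pd.DataFrame(
        {"price": [1.2, 3.4, None, 5.0], "sector": ["tech", "bank", "tech", "bank"]}
    )
    summary = summarize_dataframe(toy, sample_categorical_threshold=15)
    print(summary[["Column", "Type", "Detail", "Missing"]])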
-            if plot_type == "hist":
-                sns.histplot(data[feature].dropna(), bins=bins, kde=False, ax=ax)
-            elif plot_type == "kde":
-                sns.kdeplot(data[feature].dropna(), ax=ax, fill=True)
-            elif plot_type == "box":
-                sns.boxplot(data[feature].dropna(), ax=ax)

-
-
+def convert_object_columns_that_are_numeric(df: pd.DataFrame) -> pd.DataFrame:
+    """
+    Detect object columns that can be safely converted to numeric (float or int).

-
-
-
+    Returns:
+        The DataFrame with those mostly-numeric object columns converted to numeric dtypes.
+    """

-
-    fig.tight_layout()
+    numeric_candidates = []

-
-
+    for col in df.select_dtypes(include=["object"]).columns:
+        try:
+            converted = pd.to_numeric(df[col], errors="coerce")
+            if converted.notna().sum() / len(df) > 0.9:  # at least 90% convertible
+                numeric_candidates.append(col)
+        except Exception:
+            continue

+    for col in numeric_candidates:
+        df[col] = pd.to_numeric(df[col], errors="coerce")

-
-    """
-    Detect outliers in a DataFrame using the Interquartile Range (IQR) method.
+    return df

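Note: the helper above only converts an object column when at least 90% of its values parse as numbers; non-parsable values in a converted column become NaN. A small example, with toy data invented for illustration.

    import pandas as pd

    toy = pd.DataFrame({"amount": ["1.5", "2.0", "3.25"], "city": ["Paris", "Lyon", "Nice"]})
    toy = convert_object_columns_that_are_numeric(toy)
    print(toy.dtypes)  # "amount" becomes numeric, "city" stays object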
-    Parameters:
-    - data: pd.DataFrame, the DataFrame in which to detect outliers.

-
-
-
-    outliers = pd.DataFrame(index=data.index)
+def traditional_descriptive_analysis(df: pd.DataFrame, group_column: str | None = None):
+    with pd.option_context("display.max_rows", None):
+        results = {}

-
-
-        Q3 = data[column].quantile(0.75)  # 3rd quartile (75th percentile)
-        IQR = Q3 - Q1  # Interquartile range
+        # Shape
+        results["Shape"] = f"{df.shape[0]} rows × {df.shape[1]} columns"

-
-
+        # Duplicated rows
+        results["Duplicated rows"] = int(df.duplicated().sum())

-    #
-
+        # Duplicated columns
+        duplicated_cols = df.T[df.T.duplicated()].index.tolist()
+        results["Duplicated columns"] = (
+            ", ".join(duplicated_cols) if len(duplicated_cols) > 0 else "None"
+        )

-
+        # Missing values
+        missing = df.isnull().sum()
+        missing = missing[missing > 0].sort_values(ascending=False)
+        if len(missing) > 0:
+            results["Missing values"] = missing.to_frame("Missing Count").to_markdown()
+        else:
+            results["Missing values"] = "No missing values"
+
+        # Infinite values
+        inf = df.replace([np.inf, -np.inf], np.nan)
+        inf_count = inf.isnull().sum() - df.isnull().sum()
+        inf_count = inf_count[inf_count > 0].sort_values(ascending=False)
+        if len(inf_count) > 0:
+            results["Infinite values"] = inf_count.to_frame("Inf Count").to_markdown()
+        else:
+            results["Infinite values"] = "No infinite values"

+        # Constant columns
+        constant_cols = [col for col in df.columns if df[col].nunique() == 1]
+        results["Constant columns"] = (
+            ", ".join(constant_cols) if len(constant_cols) > 0 else "None"
+        )

-
-
+        # Data types
+        dtypes = df.dtypes.astype(str).sort_index()
+        results["Data types"] = dtypes.to_frame("Type").to_markdown()

-
-
+        # Unique values in group_column
+        if group_column is not None:
+            if group_column in df.columns:
+                results[f"Unique values in '{group_column}'"] = int(
+                    df[group_column].nunique()
+                )
+            else:
+                results[f"Unique values in '{group_column}'"] = (
+                    f"❌ Column '{group_column}' not found"
+                )

-
-
+        # Log all results
+        for title, content in results.items():
+            print(f"\n### {title}\n{content}")

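Note: `traditional_descriptive_analysis` prints its report (shape, duplicates, missing and infinite values, constant columns, dtypes) rather than returning a value. A usage sketch; the DataFrame and the "STOCK" group column are assumptions for illustration.

    traditional_descriptive_analysis(df, group_column="STOCK")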
-    logger.info("nb of outliers")
-    outliers = detect_outliers_iqr(df.select_dtypes(include=["float64"]), degree=5)

-
-    logger.info(outliers.sum().sort_values(ascending=False))
+def print_missing_values(df: pd.DataFrame):

-
-
-
+    if len(df.isnull().sum().where(df.isnull().sum() != 0).dropna()):
+        logger.info(
+            f"Missing values : \n{df.isnull().sum().where(df.isnull().sum() != 0).dropna().sort_values(ascending=False).to_string()}"
+        )
+    else:
+        logger.info("No missing values found")
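Note: `print_missing_values` logs a per-column count of missing values, or a short message when there are none. Usage sketch; the `df` DataFrame is an assumption for illustration.

    print_missing_values(df)  # logs the sorted missing-value counts, or "No missing values found"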