lecrapaud 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of lecrapaud might be problematic.
- lecrapaud/api.py +8 -2
- lecrapaud/db/alembic/versions/2025_05_31_1834-52b809a34371_make_nullablee.py +24 -12
- lecrapaud/db/session.py +11 -0
- lecrapaud/experiment.py +1 -1
- lecrapaud/feature_engineering.py +11 -12
- lecrapaud/feature_selection.py +29 -48
- lecrapaud/model_selection.py +59 -59
- lecrapaud/utils.py +1 -1
- {lecrapaud-0.2.0.dist-info → lecrapaud-0.3.0.dist-info}/METADATA +27 -20
- {lecrapaud-0.2.0.dist-info → lecrapaud-0.3.0.dist-info}/RECORD +13 -16
- lecrapaud/predictions.py +0 -292
- lecrapaud/preprocessing.py +0 -984
- lecrapaud/training.py +0 -239
- /lecrapaud/{directory_management.py → directories.py} +0 -0
- {lecrapaud-0.2.0.dist-info → lecrapaud-0.3.0.dist-info}/LICENSE +0 -0
- {lecrapaud-0.2.0.dist-info → lecrapaud-0.3.0.dist-info}/WHEEL +0 -0
lecrapaud/preprocessing.py
DELETED
@@ -1,984 +0,0 @@
#!/usr/bin/env python
# coding: utf-8

import pandas as pd
import numpy as np
import joblib
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.signal import argrelextrema
from itertools import product
import os
from collections import defaultdict

from lecrapaud.config import PYTHON_ENV
from lecrapaud.utils import logger
from lecrapaud.directory_management import data_dir
from lecrapaud.services.indicators import (
    rsi,
    macd,
    bollinger_bands,
    adx,
    atr,
    stochastic,
    mfi,
    ichimoku_cloud,
    parabolic_sar,
    chaikin_money_flow,
    pivot_points,
    sma,
    ema,
    volatility,
    cumulative_return,
    close_diff,
    obv,
    pressure,
)
from lecrapaud.db import Target


# pd print options
# pd.set_option("display.max_columns", None)
# pd.reset_option("display.max_rows")
# pd.set_option("display.max_colwidth", None)


# Main function to create targets
def targets_creation(
    df: pd.DataFrame,
    top_x_stock: float = 0.1,
    local_max_order: int = 10,
    threshold: int = 5,
):
    """Preprocessing the stock data from yfinance

    Args:
        df (pd.DataFrame): a dataframe obtain with `get_data` function
        top_x_stock (float): the % at which you are considered top ranked stock for the day
        local_max_order (int): this set up the window to look at on both side of the extrema : the greater, the more 'global' is the extrema.

    Returns:
        df with more columns:
        - date variables : we create YEAR, MONTH, DAY, WEEK, WEEKDAY, YEARWEEK and YEARDAY features
        - return, market return, residual return and similar computation with volume are done to create 6 new features
        - target variables :
            - TARGET_1 : next day return
            - TARGET_2 : categorical return (positive 1, or negative 0)
            - TARGET_3 : next day ranking from best (1) to worst (n_stock) returns
            - TARGET_4 : categorical next day top ranking (in top_x_stock) (1), or not (0)
            - TARGET_5, TARGET_6, TARGET_7, TARGET_8 : same but with residual return
            - TARGET_9 : categorical with 1 if it's a local maximum and 0 if not
            - TARGET_10 : categorical with 1 if it's a local minimum and 0 if not
            - TARGET 11 : We will create trading signals based on proximity to local minima and maxima : need multi-binary loss support
            - TARGET 12, 13, 14 : return in 9,14,21 days


    """

    # Creating targets
    logger.info("Creating target variables...")

    # TARGET 1-4 : We start with target RET
    target = "RET"
    stock_column = "STOCK"
    nb_of_stocks = len(df[stock_column].unique())

    first_x_percent = max(int(nb_of_stocks * top_x_stock), 1)

    df["TARGET_1"] = df[target].shift(-1)
    df["TARGET_2"] = np.select([df["TARGET_1"] <= 0, df["TARGET_1"] > 0], [0, 1])
    df["TARGET_3"] = df.groupby("DATE")["TARGET_1"].rank(
        method="first", ascending=False
    )
    df["TARGET_4"] = np.select(
        [
            df.groupby("DATE")["TARGET_1"].rank(method="first", ascending=False)
            <= first_x_percent
        ],
        [1],
        default=0,
    )

    # TARGET 5-8 : We do the same for RESIDUAL_RET
    target = "RESIDUAL_RET"

    df["TARGET_5"] = df[target].shift(-1)
    df["TARGET_6"] = np.select([df["TARGET_5"] <= 0, df["TARGET_5"] > 0], [0, 1])
    df["TARGET_7"] = df.groupby("DATE")["TARGET_5"].rank(
        method="first", ascending=False
    )
    df["TARGET_8"] = np.select(
        [
            df.groupby("DATE")["TARGET_5"].rank(method="first", ascending=False)
            <= first_x_percent
        ],
        [1],
        default=0,
    )

    # TARGET 9-10 : Let's look at local min and max : it can be interpretate as buy and sell signal respectively
    target = "CLOSE"

    df["TARGET_9"] = 0
    df["TARGET_10"] = 0

    # Calculate local maxima and set TARGET_9 to 1 where maxima are found
    maxima_indices = df.groupby(stock_column)[target].transform(
        lambda x: x.index.isin(
            x.iloc[argrelextrema(x.values, np.greater, order=local_max_order)].index
        )
    )

    minima_indices = df.groupby(stock_column)[target].transform(
        lambda x: x.index.isin(
            x.iloc[argrelextrema(x.values, np.less, order=local_max_order)].index
        )
    )

    df.loc[maxima_indices, "TARGET_9"] = 1
    df.loc[minima_indices, "TARGET_10"] = 1

    # TARGET 11 : We will create trading signals based on proximity to local minima and maxima.
    df["TARGET_11"] = 2  # Default value for HOLD

    # Function to detect local minima and maxima, and assign signals
    def assign_signals(group):
        close_prices = group[target].values
        dates = group["DATE"].values

        # Detect local maxima and minima using argrelextrema
        local_maxima_idx = argrelextrema(
            close_prices, np.greater, order=local_max_order
        )[0]
        local_minima_idx = argrelextrema(close_prices, np.less, order=local_max_order)[
            0
        ]

        # STRONG BUY (4) for local minima, STRONG SELL (0) for local maxima
        group.loc[group.index[local_minima_idx], "TARGET_11"] = 4
        group.loc[group.index[local_maxima_idx], "TARGET_11"] = 0

        # Assign BUY (3) and SELL (1) based on proximity to extrema within the threshold window
        for idx in local_minima_idx:
            # Get the actual date of the minima
            min_date = dates[idx]
            # Select the rows within the threshold window around the minima date
            buy_window = group.loc[
                (group["DATE"] >= min_date - pd.Timedelta(days=threshold))
                & (group["DATE"] <= min_date + pd.Timedelta(days=threshold))
            ]
            group.loc[buy_window.index, "TARGET_11"] = np.where(
                buy_window["DATE"] == min_date,
                4,
                3,  # STRONG BUY at minima, BUY near minima
            )

        for idx in local_maxima_idx:
            # Get the actual date of the maxima
            max_date = dates[idx]
            # Select the rows within the threshold window around the maxima date
            sell_window = group.loc[
                (group["DATE"] >= max_date - pd.Timedelta(days=threshold))
                & (group["DATE"] <= max_date + pd.Timedelta(days=threshold))
            ]
            group.loc[sell_window.index, "TARGET_11"] = np.where(
                sell_window["DATE"] == max_date,
                0,
                1,  # STRONG SELL at maxima, SELL near maxima
            )

        return group

    # Apply the function to each stock group
    df = df.groupby(stock_column, group_keys=False).apply(assign_signals)

    # TARGET 12, 13, 14 : return in 9,14,21 days
    df["TARGET_12"] = df.groupby("STOCK")["CLOSE"].pct_change(9).shift(-9)
    df["TARGET_13"] = df.groupby("STOCK")["CLOSE"].pct_change(14).shift(-14)
    df["TARGET_14"] = df.groupby("STOCK")["CLOSE"].pct_change(21).shift(-21)

    # Update database
    # TODO: in bulk
    Target.upsert(
        match_fields=["name", "type"],
        name="TARGET_1",
        type="regression",
        description="Next day return",
    )
    Target.upsert(
        match_fields=["name", "type"],
        name="TARGET_2",
        type="classification",
        description="Next day return",
    )
    Target.upsert(
        match_fields=["name", "type"],
        name="TARGET_3",
        type="regression",
        description="Ranking of next day return",
    )
    Target.upsert(
        match_fields=["name", "type"],
        name="TARGET_4",
        type="classification",
        description="Top ranking of next day return",
    )
    Target.upsert(
        match_fields=["name", "type"],
        name="TARGET_5",
        type="regression",
        description="Next day residual return",
    )
    Target.upsert(
        match_fields=["name", "type"],
        name="TARGET_6",
        type="classification",
        description="Next day residual return",
    )
    Target.upsert(
        match_fields=["name", "type"],
        name="TARGET_7",
        type="regression",
        description="Ranking of next day residual return",
    )
    Target.upsert(
        match_fields=["name", "type"],
        name="TARGET_8",
        type="classification",
        description="Top ranking of next day residual return",
    )
    Target.upsert(
        match_fields=["name", "type"],
        name="TARGET_9",
        type="classification",
        description="Local maxima",
    )
    Target.upsert(
        match_fields=["name", "type"],
        name="TARGET_10",
        type="classification",
        description="Local minima",
    )
    Target.upsert(
        match_fields=["name", "type"],
        name="TARGET_11",
        type="classification",
        description="Trading signals based on proximity to local minima and maxima",
    )
    Target.upsert(
        match_fields=["name", "type"],
        name="TARGET_12",
        type="regression",
        description="Return in 9 days",
    )
    Target.upsert(
        match_fields=["name", "type"],
        name="TARGET_13",
        type="regression",
        description="Return in 14 days",
    )
    Target.upsert(
        match_fields=["name", "type"],
        name="TARGET_14",
        type="regression",
        description="Return in 21 days",
    )

    return df


def calculate_option_features(option_data: list[dict], spot_price: float):
    puts = [opt for opt in option_data if opt["type"] == "put"]
    calls = [opt for opt in option_data if opt["type"] == "call"]

    def safe_float(x):
        try:
            return float(x)
        except:
            return 0.0

    # Convert and clean data
    for opt in option_data:
        for key in ["strike", "volume", "open_interest", "delta", "implied_volatility"]:
            opt[key] = safe_float(opt.get(key, 0.0))

    # Put/Call ratios
    total_put_vol = sum(p["volume"] for p in puts)
    total_call_vol = sum(c["volume"] for c in calls)
    total_put_oi = sum(p["open_interest"] for p in puts)
    total_call_oi = sum(c["open_interest"] for c in calls)

    put_call_ratio_vol = total_put_vol / total_call_vol if total_call_vol > 0 else None
    put_call_ratio_oi = total_put_oi / total_call_oi if total_call_oi > 0 else None

    # Open Interest Skew
    oi_skew = sum(c["open_interest"] for c in calls if c["strike"] > spot_price) - sum(
        p["open_interest"] for p in puts if p["strike"] < spot_price
    )

    # Total Open Interest
    total_oi = sum(opt["open_interest"] for opt in option_data)

    # Delta-weighted Put/Call Ratio
    dw_put = sum(p["delta"] * p["volume"] for p in puts)
    dw_call = sum(c["delta"] * c["volume"] for c in calls)
    delta_weighted_pcr = dw_put / dw_call if dw_call > 0 else None

    # ATM IV
    atm_option = min(option_data, key=lambda x: abs(x["strike"] - spot_price))
    atm_iv = atm_option["implied_volatility"]

    # IV Skew (25-delta)
    iv_put_25d = np.mean(
        [p["implied_volatility"] for p in puts if abs(p["delta"] + 0.25) < 0.05]
    )
    iv_call_25d = np.mean(
        [c["implied_volatility"] for c in calls if abs(c["delta"] - 0.25) < 0.05]
    )
    iv_skew_25d = iv_put_25d - iv_call_25d if iv_put_25d and iv_call_25d else None

    # IV Term Structure
    iv_by_exp = defaultdict(list)
    for opt in option_data:
        iv_by_exp[opt["expiration"]].append(opt["implied_volatility"])
    expiries = sorted(iv_by_exp.keys())
    if len(expiries) >= 2:
        iv_term_structure = np.mean(iv_by_exp[expiries[-1]]) - np.mean(
            iv_by_exp[expiries[0]]
        )
    else:
        iv_term_structure = None

    # Moneyness
    moneyness = [spot_price / opt["strike"] for opt in option_data if opt["strike"] > 0]

    # % OTM / ITM
    otm_calls = [c for c in calls if c["strike"] > spot_price]
    otm_puts = [p for p in puts if p["strike"] < spot_price]
    otm = len(otm_calls) + len(otm_puts)
    itm = len(option_data) - otm
    percent_otm = otm / len(option_data) if option_data else None
    percent_itm = itm / len(option_data) if option_data else None

    # Weighted Average Strike
    def weighted_avg_strike(options):
        total_vol = sum(o["volume"] for o in options)
        return (
            sum(o["strike"] * o["volume"] for o in options) / total_vol
            if total_vol > 0
            else None
        )

    avg_strike_calls = weighted_avg_strike(calls)
    avg_strike_puts = weighted_avg_strike(puts)

    # Option Sentiment Index
    sentiment_numerator = sum(
        c["volume"] for c in calls if c["strike"] < spot_price
    ) - sum(p["volume"] for p in puts if p["strike"] > spot_price)
    sentiment_index = (
        sentiment_numerator / (total_put_vol + total_call_vol)
        if (total_put_vol + total_call_vol) > 0
        else None
    )

    return {
        "put_call_ratio_volume": put_call_ratio_vol,
        "put_call_ratio_open_interest": put_call_ratio_oi,
        "open_interest_skew": oi_skew,
        "total_open_interest": total_oi,
        "delta_weighted_pcr": delta_weighted_pcr,
        "atm_iv": atm_iv,
        "iv_skew_25d": iv_skew_25d,
        "iv_term_structure": iv_term_structure,
        "average_moneyness": np.mean(moneyness) if moneyness else None,
        "percent_otm": percent_otm,
        "percent_itm": percent_itm,
        "weighted_avg_strike_calls": avg_strike_calls,
        "weighted_avg_strike_puts": avg_strike_puts,
        "option_sentiment_index": sentiment_index,
    }


def apply_indicators(df: pd.DataFrame):
    """Apply multiple indicators to a grouped dataframe of a single stock."""
    # Assuming 'df' is the OHLC data for a single stock, apply indicators
    result = df.copy()

    logger.debug(f"Computing non-period features...")

    # Apply Parabolic SAR
    result["Parabolic_SAR"] = parabolic_sar(df)

    # Apply Bollinger Bands
    result["Upper_BB"], result["Middle_BB"], result["Lower_BB"] = bollinger_bands(df)

    # Apply Ichimoku Cloud
    (
        result["Tenkan"],
        result["Kijun"],
        result["Senkou_A"],
        result["Senkou_B"],
        result["Chikou"],
    ) = ichimoku_cloud(df)

    # Apply Pivot Points (including support and resistance levels)
    result["Pivot"], result["R1"], result["S1"], result["R2"], result["S2"] = (
        pivot_points(df)
    )

    # Other indicators
    result["CLOSE_DIFF"] = close_diff(df)
    result["OBV"] = obv(df)
    result["DOWNWARD_PRESSURE"], result["UPWARD_PRESSURE"] = pressure(df)

    # Apply MACD (Moving Average Convergence Divergence)
    result["MACD_Line"], result["MACD_Signal"] = macd(df)

    # first buy/sell signal : MACD_SIGNAL_DIFF cross 0 levels
    result["MACD_SIGNAL_DIFF"] = result["MACD_Line"] - result["MACD_Signal"]
    result["BUY_1"] = np.where(
        (result["MACD_SIGNAL_DIFF"] > 0)
        & (result["MACD_SIGNAL_DIFF"].shift(1) < 0),  # Buy signal (MACD crossover)
        1,  # Buy
        np.where(
            (result["MACD_SIGNAL_DIFF"] < 0)
            & (
                result["MACD_SIGNAL_DIFF"].shift(1) > 0
            ),  # Sell signal (MACD crossunder)
            -1,  # Sell
            np.nan,  # Default case
        ),
    )
    result["BUY_1"] = result["BUY_1"].fillna(0)  # TODO: should we fill with 0 (done)

    # second buy/sell signal : MACD_SIGNAL_DIFF cross 30% threshold of maximum value while positive and decreasing, or 30% threshold of minimum value while negative and increasing
    # Calculate rolling 20-day max and min values for MACD_SIGNAL_DIFF per stock
    macd_signal_diff_max_20_days = result.groupby("STOCK")[
        "MACD_SIGNAL_DIFF"
    ].transform(lambda x: x.rolling(20).max())
    macd_signal_diff_min_20_days = result.groupby("STOCK")[
        "MACD_SIGNAL_DIFF"
    ].transform(lambda x: x.rolling(20).min())

    # Define the buy/sell signal conditions
    buy_condition = (
        (result["MACD_SIGNAL_DIFF"] > result["MACD_SIGNAL_DIFF"].shift(1))  # Increasing
        & (result["MACD_SIGNAL_DIFF"] < 0)  # Negative value
        & (
            result["MACD_SIGNAL_DIFF"] > 0.3 * macd_signal_diff_min_20_days
        )  # Above 30% of minimum
    )

    sell_condition = (
        (result["MACD_SIGNAL_DIFF"] < result["MACD_SIGNAL_DIFF"].shift(1))  # Decreasing
        & (result["MACD_SIGNAL_DIFF"] > 0)  # Positive value
        & (
            result["MACD_SIGNAL_DIFF"] < 0.3 * macd_signal_diff_max_20_days
        )  # Below 30% of maximum
    )

    # Apply the conditions to calculate buy/sell signals
    result["BUY_2"] = np.where(
        buy_condition,
        np.abs(
            (result["MACD_SIGNAL_DIFF"] - 0.3 * macd_signal_diff_min_20_days)
            / (0.3 * macd_signal_diff_min_20_days)
        ),
        np.where(
            sell_condition,
            -np.abs(
                (result["MACD_SIGNAL_DIFF"] - 0.3 * macd_signal_diff_max_20_days)
                / (0.3 * macd_signal_diff_max_20_days)
            ),
            0,  # Default
        ),
    )

    periods = [
        9,
        14,
        21,
        50,
        126,
        200,
        252,
    ]  # 2 semaines, 3 semaines, 1 mois et 2.5 mois
    # TODO: on pourrait rajouter plus de long terme : 126 jours (6 mois) et 200 jours (9 mois) et 252 jours (1 an)

    features = []
    for period in periods:
        logger.debug(f"Computing period features for {period} days...")

        features.append(
            pd.DataFrame(
                {
                    f"CUMUL_RET_{period}": cumulative_return(df, period=period),
                    f"SMA_{period}": sma(df, period=period),
                    f"EMA_{period}": ema(df, period=period),
                    f"VOLATILITY_{period}": volatility(df, period=period),
                    f"ADX_{period}": adx(df, period=period),
                    f"ATR_{period}": atr(df, period=period),
                    f"CMF_{period}": chaikin_money_flow(df, period=period),
                    f"RSI_{period}": rsi(df, period=period),
                    f"MFI_{period}": mfi(df, period=period),
                },
                index=df.index,
            )
        )

        # Stochastic Oscillator returns two series: %K and %D
        k, d = stochastic(df, period=period)
        features.append(
            pd.DataFrame(
                {
                    f"%K_{period}": k,
                    f"%D_{period}": d,
                },
                index=df.index,
            )
        )

    result = pd.concat([result] + features, axis=1)

    # third buy/sell signal : RSI is overbought >0.7 / oversold <0.3
    result["BUY_3"] = np.where(
        result["RSI_14"] <= 30,
        (30 - result["RSI_14"]) / 30,
        np.where(result["RSI_14"] >= 70, -(result["RSI_14"] - 70) / 30, 0),
    )

    # fourth buy/sell signal : RSI vs CLOSE divergence
    # The RSI vs. Close divergence trading signal identifies potential reversals by detecting when the
    # Relative Strength Index (RSI) and price (Close) move in opposite directions
    # bullish divergence occurs when the price makes lower lows while RSI makes higher lows (potential uptrend),
    # and bearish divergence occurs when the price makes higher highs while RSI makes lower highs (potential downtrend)

    # Detect local peaks (RSI Highs) and troughs (RSI Lows) for divergence analysis
    # Compute local maxima and minima indices
    rsi_peak_indices = argrelextrema(result["RSI_14"].values, np.greater)[
        0
    ]  # RSI highs
    rsi_trough_indices = argrelextrema(result["RSI_14"].values, np.less)[0]  # RSI lows

    # Create boolean masks for peaks and troughs
    rsi_peaks_mask = np.zeros(len(result), dtype=bool)
    rsi_troughs_mask = np.zeros(len(result), dtype=bool)

    rsi_peaks_mask[rsi_peak_indices] = True
    rsi_troughs_mask[rsi_trough_indices] = True

    # Extract peak and trough rows efficiently
    rsi_peaks = result.loc[rsi_peaks_mask, ["CLOSE", "RSI_14"]].copy()
    rsi_troughs = result.loc[rsi_troughs_mask, ["CLOSE", "RSI_14"]].copy()

    # Compute RSI and CLOSE differences to check divergence
    for i in [1, 2, 3]:
        # RSI & Price difference from past peaks
        rsi_peaks[f"RSI_PEAK_DIFF_{i}"] = rsi_peaks["RSI_14"].diff(i)
        rsi_peaks[f"PRICE_PEAK_DIFF_{i}"] = rsi_peaks["CLOSE"].diff(i)

        # RSI & Price difference from past troughs
        rsi_troughs[f"RSI_TROUGH_DIFF_{i}"] = rsi_troughs["RSI_14"].diff(i)
        rsi_troughs[f"PRICE_TROUGH_DIFF_{i}"] = rsi_troughs["CLOSE"].diff(i)

        # Detect bearish divergence (RSI down, price up) and bullish divergence (RSI up, price down)
        rsi_peaks[f"DIVERGENCE_{i}"] = np.where(
            (rsi_peaks[f"RSI_PEAK_DIFF_{i}"] < 0)
            & (rsi_peaks[f"PRICE_PEAK_DIFF_{i}"] > 0),
            -np.abs(rsi_peaks[f"RSI_PEAK_DIFF_{i}"]),
            np.where(
                (rsi_peaks[f"RSI_PEAK_DIFF_{i}"] > 0)
                & (rsi_peaks[f"PRICE_PEAK_DIFF_{i}"] < 0),
                -np.abs(rsi_peaks[f"RSI_PEAK_DIFF_{i}"]),
                0,
            ),
        )

        rsi_troughs[f"DIVERGENCE_{i}"] = np.where(
            (rsi_troughs[f"RSI_TROUGH_DIFF_{i}"] > 0)
            & (rsi_troughs[f"PRICE_TROUGH_DIFF_{i}"] < 0),
            np.abs(rsi_troughs[f"RSI_TROUGH_DIFF_{i}"]),
            np.where(
                (rsi_troughs[f"RSI_TROUGH_DIFF_{i}"] < 0)
                & (rsi_troughs[f"PRICE_TROUGH_DIFF_{i}"] > 0),
                np.abs(rsi_troughs[f"RSI_TROUGH_DIFF_{i}"]),
                0,
            ),
        )

    # Concatenate peak and trough divergences into a single DataFrame
    divergence_cols = [f"DIVERGENCE_{i}" for i in [1, 2, 3]]
    divergence_data = pd.concat(
        [rsi_peaks[divergence_cols], rsi_troughs[divergence_cols]], axis=0
    )

    # Merge using index alignment
    result[divergence_cols] = divergence_data.reindex(result.index, fill_value=0)

    # Sum divergence signals into BUY_4 for a single signal strength metric
    result["BUY_4"] = result[divergence_cols].sum(axis=1)
    return result


# Main function to process the full dataset with multiple stocks
def preprocessing(
    df: pd.DataFrame,
    for_training: bool = False,
    save_as_csv: bool = False,
):
    """Main function to process the full dataset with multiple stocks

    Args:
        - df (pd.DataFrame): the dataframe with ohlc data
        - for_training (bool): whether to compute targets and for_training as data_for_training, or not.
    """

    # Computing residual RET and relative VOLUME
    logger.info("Creating RET and VOLUME metrics...")
    df["RET"] = df.groupby("STOCK")["CLOSE"].pct_change(1)
    df["MARKET_RET"] = df.groupby("DATE")["RET"].transform("mean")
    df["RESIDUAL_RET"] = df["RET"] - df["MARKET_RET"]

    df["VOLUME_RATIO"] = (
        df["VOLUME"]
        / df.groupby("STOCK")["VOLUME"].rolling(20, min_periods=1).mean().values
    )
    df["MARKET_VOLUME_RATIO"] = df.groupby("DATE")["VOLUME_RATIO"].transform("mean")
    df["RELATIVE_VOLUME"] = df["VOLUME_RATIO"] - df["MARKET_VOLUME_RATIO"]

    logger.info("Creating historical time series metrics...")
    periods = [
        1,  # daily
        2,
        3,
        4,
        5,  # weekly
        9,
        14,
        21,  # monthly
        50,
        126,
        200,
        252,
    ]  # need to keep 1, 2, 3, 4, 5 for backward compatibility
    for METRIC in ["RET", "VOLUME", "RESIDUAL_RET", "RELATIVE_VOLUME"]:
        for i in periods:
            df[f"{METRIC}_-{i}"] = df[METRIC].shift(i)

    # Group by "STOCK" and apply the indicators for each stock
    logger.info("Applying indicators...")
    grouped_df = df.groupby("STOCK", group_keys=False)
    preprocessed_df = grouped_df.apply(apply_indicators)

    # Drop non-useful column for training
    if "ISIN" in df.columns:
        df.drop(labels=["ISIN"], axis=1, inplace=True)
    if "SECURITY" in df.columns:
        df.drop(labels=["SECURITY"], axis=1, inplace=True)

    if for_training:
        preprocessed_df = targets_creation(preprocessed_df)

    if save_as_csv and PYTHON_ENV == "Development":
        preprocessed_df_to_csv = preprocessed_df.sort_values(["DATE", "STOCK"])
        preprocessed_df_to_csv.to_csv(
            f"{data_dir}/data_for_training.csv",
            index=False,
            header=True,
        )

    if for_training:
        preprocessed_df.dropna(inplace=True)

    preprocessed_df.sort_values(["DATE", "STOCK"], inplace=True)
    preprocessed_df.reset_index(drop=True, inplace=True)

    logger.info(
        f"{len(preprocessed_df['DATE'])} preprocessed data with shape {preprocessed_df.shape} from {datetime.strftime(preprocessed_df['DATE'].iat[0], '%d/%m/%Y')} to {datetime.strftime(preprocessed_df['DATE'].iat[-1], '%d/%m/%Y')}"
    )

    # for_training results if needed
    if for_training and PYTHON_ENV == "Development":
        joblib.dump(preprocessed_df, f"{data_dir}/data_for_training.pkl")

    # Return the fully processed DataFrame with all new features (copy to avoid fragmented memory)
    return_df = preprocessed_df.copy()
    return return_df


# Descriptive Analytics functions


def plot_sector_repartition(df: pd.DataFrame):
    """Visualise repartition of stock per sectors

    Args:
        df (pd.DataFrame): a df created with `get_data`
    """
    sns.barplot(
        data=df.groupby("SECTOR")["STOCK"].nunique(),
        orient="h",
        order=df.groupby("SECTOR")["STOCK"]
        .nunique()
        .sort_values(ascending=False)
        .index,
    )


def visualize_extrema(
    data: pd.DataFrame,
    stock: str,
    days_before_last: int = 200,
    local_max_order: int = 10,
):
    """
    Function to visualize local maxima and minima for a given stock in the data.

    Parameters:
    - data: pd.DataFrame, DataFrame containing columns 'STOCK', 'DATE', 'CLOSE', and 'ID'
    - stock: str, the stock identifier to analyze (e.g., 'AAPL', 'GOOG')
    - days_before_last: int, number of days before the last date in the dataset to visualize
    - local_max_order: int, the window size for identifying local extrema (default: 5)
    """

    # Calculate the last date in the dataset
    last_date = data["DATE"].max()
    start_date = last_date - pd.Timedelta(days=days_before_last)

    # Find local maxima (argrelextrema with np.greater) for each stock
    local_max_CLOSE = (
        data[data["STOCK"] == stock]
        .set_index("DATE")["CLOSE"]
        .iloc[
            argrelextrema(
                data[data["STOCK"] == stock]["CLOSE"].values,
                np.greater,
                order=local_max_order,
            )
        ]
        .reset_index()
    )

    # Find local minima (argrelextrema with np.less) for each stock
    local_min_CLOSE = (
        data[data["STOCK"] == stock]
        .set_index("DATE")["CLOSE"]
        .iloc[
            argrelextrema(
                data[data["STOCK"] == stock]["CLOSE"].values,
                np.less,
                order=local_max_order,
            )
        ]
        .reset_index()
    )

    # Filter maxima based on stock and date range
    local_max_CLOSE = local_max_CLOSE[local_max_CLOSE["DATE"] >= start_date]

    # Filter minima based on stock and date range
    local_min_CLOSE = local_min_CLOSE[local_min_CLOSE["DATE"] >= start_date]

    # logger.info the maxima and minima dates
    logger.info(
        f"Maxima Dates for Stock {stock}: {list(local_max_CLOSE['DATE'].values)}"
    )
    logger.info(
        f"Minima Dates for Stock {stock}: {list(local_min_CLOSE['DATE'].values)}"
    )

    # Plot the stock's CLOSE prices within the specified date range
    stock_data = data[(data["STOCK"] == stock) & (data["DATE"] >= start_date)][
        ["CLOSE", "DATE"]
    ].set_index("DATE")

    plt.figure(figsize=(10, 6))
    stock_data.plot(color="black", title=f"Stock {stock} Extremas")

    # Add vertical lines for maxima
    for date in local_max_CLOSE["DATE"].values:
        plt.axvline(
            x=date,
            color="red",
            label="Maxima" if date == local_max_CLOSE["DATE"].values[0] else "",
        )

    # Add vertical lines for minima
    for date in local_min_CLOSE["DATE"].values:
        plt.axvline(
            x=date,
            color="green",
            label="Minima" if date == local_min_CLOSE["DATE"].values[0] else "",
        )

    plt.legend()
    plt.show()


def visualize_trading_signals(
    data: pd.DataFrame,
    stock: str,
    days_before_last: int = 200,
):
    """
    Function to visualize trading signals (BUY, SELL, HOLD) for a given stock.

    Parameters:
    - data: pd.DataFrame, DataFrame containing columns 'STOCK', 'DATE', 'CLOSE', and 'TRADING_SIGNAL'
    - stock: str, the stock identifier to analyze (e.g., 'AAPL', 'GOOG')
    - days_before_last: int, number of days before the last date in the dataset to visualize
    """

    # Calculate the last date in the dataset
    last_date = data["DATE"].max()
    start_date = last_date - pd.Timedelta(days=days_before_last)

    # Filter data for the selected stock and date range
    stock_data = data[(data["STOCK"] == stock) & (data["DATE"] >= start_date)].copy()

    # Plot the stock's CLOSE prices
    plt.figure(figsize=(10, 6))
    plt.plot(stock_data["DATE"], stock_data["CLOSE"], color="black", label="CLOSE")

    # Define the colors for the trading signals
    colors = {2: "green", 1: "lightgreen", 0: "yellow", -1: "red", -2: "darkred"}

    # Plot each trading signal with the respective color
    for signal_value, color in colors.items():
        plt.scatter(
            stock_data.loc[stock_data["TARGET_11"] == signal_value, "DATE"],
            stock_data.loc[stock_data["TARGET_11"] == signal_value, "CLOSE"],
            color=color,
            label=f"Signal {signal_value}",
            s=50,  # Size of the points
        )

    plt.title(f"Trading Signals for {stock}")
    plt.xlabel("Date")
    plt.ylabel("Close Price")
    plt.legend()
    plt.grid(True)
    plt.show()


def visualize_data_distribution(
    data,
    plot_type="hist",
    features=None,
    bins=50,
    rows=5,
    cols=5,
    width_per_plot=4,
    height_per_plot=3,
):
    """
    Function to visualize the data distribution for multiple features in a DataFrame with dynamic figsize,
    splitting into multiple figures if there are too many features for one figure.

    Parameters:
    - data: pd.DataFrame, the DataFrame containing the data to visualize.
    - plot_type: str, the type of plot to use ('hist', 'kde', 'box').
    - features: list, list of features (columns) to visualize. If None, all numeric features are used.
    - bins: int, the number of bins for histograms (default: 50).
    - rows: int, number of rows in the subplot grid (default: 5).
    - cols: int, number of columns in the subplot grid (default: 5).
    - width_per_plot: int, the width of each subplot (default: 4).
    - height_per_plot: int, the height of each subplot (default: 3).
    """

    # If no features are specified, use all numeric features
    if features is None:
        features = data.select_dtypes(include=[np.number]).columns.tolist()

    # Calculate the total number of features
    total_features = len(features)

    # How many plots can fit into one figure
    plots_per_figure = rows * cols

    # Loop over the features and create new figures as needed
    for start in range(0, total_features, plots_per_figure):
        # Subset of features for the current figure
        subset_features = features[start : start + plots_per_figure]

        # Dynamically calculate figure size based on grid size and plot dimensions
        num_plots = len(subset_features)
        grid_rows = min(rows, num_plots // cols + (num_plots % cols != 0))
        grid_cols = min(cols, num_plots)
        figsize = (grid_cols * width_per_plot, grid_rows * height_per_plot)

        # Set up the figure and axes for this subset of features
        fig, axes = plt.subplots(grid_rows, grid_cols, figsize=figsize)
        axes = axes.flatten()  # Flatten the axes for easy iteration

        # Plot each feature
        for i, feature in enumerate(subset_features):
            ax = axes[i]

            if plot_type == "hist":
                sns.histplot(data[feature].dropna(), bins=bins, kde=False, ax=ax)
            elif plot_type == "kde":
                sns.kdeplot(data[feature].dropna(), ax=ax, fill=True)
            elif plot_type == "box":
                sns.boxplot(data[feature].dropna(), ax=ax)

            ax.set_xlabel(feature)
            ax.set_ylabel("Count")

        # Hide any empty subplots
        for j in range(i + 1, len(axes)):
            fig.delaxes(axes[j])

        # Use tight layout to ensure there's no overlap
        fig.tight_layout()

        # Show the plot for this figure
        plt.show()


def detect_outliers_iqr(data, degree: float = 1.5):
    """
    Detect outliers in a DataFrame using the Interquartile Range (IQR) method.

    Parameters:
    - data: pd.DataFrame, the DataFrame in which to detect outliers.

    Returns:
    - outliers: pd.DataFrame, DataFrame with boolean values indicating outliers for each feature.
    """
    outliers = pd.DataFrame(index=data.index)

    for column in data.select_dtypes(include=[np.number]).columns:
        Q1 = data[column].quantile(0.25)  # 1st quartile (25th percentile)
        Q3 = data[column].quantile(0.75)  # 3rd quartile (75th percentile)
        IQR = Q3 - Q1  # Interquartile range

        lower_bound = Q1 - degree * IQR
        upper_bound = Q3 + degree * IQR

        # Detect outliers
        outliers[column] = (data[column] < lower_bound) | (data[column] > upper_bound)

    return outliers


def plot_distribution(df):
    logger.info("DATA_DISTRIBUTION")

    logger.info("numerical features")
    visualize_data_distribution(df.select_dtypes(include=["float64"]))

    logger.info("categorical features")
    visualize_data_distribution(df.select_dtypes(include=["int64"]))

    logger.info("nb of outliers")
    outliers = detect_outliers_iqr(df.select_dtypes(include=["float64"]), degree=5)

    with pd.option_context("display.max_rows", None):
        logger.info(outliers.sum().sort_values(ascending=False))

    logger.info("zoom on volume outliers")
    columns = [c for c in df.columns if "VOLUME" in c]
    visualize_data_distribution(df, features=columns, plot_type="box", cols=3)