lecrapaud 0.2.1__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

@@ -1,984 +0,0 @@
1
- #!/usr/bin/env python
2
- # coding: utf-8
3
-
4
- import pandas as pd
5
- import numpy as np
6
- import joblib
7
- from datetime import datetime
8
- import matplotlib.pyplot as plt
9
- import seaborn as sns
10
- from scipy.signal import argrelextrema
11
- from itertools import product
12
- import os
13
- from collections import defaultdict
14
-
15
- from lecrapaud.config import PYTHON_ENV
16
- from lecrapaud.utils import logger
17
- from lecrapaud.directory_management import data_dir
18
- from lecrapaud.services.indicators import (
19
- rsi,
20
- macd,
21
- bollinger_bands,
22
- adx,
23
- atr,
24
- stochastic,
25
- mfi,
26
- ichimoku_cloud,
27
- parabolic_sar,
28
- chaikin_money_flow,
29
- pivot_points,
30
- sma,
31
- ema,
32
- volatility,
33
- cumulative_return,
34
- close_diff,
35
- obv,
36
- pressure,
37
- )
38
- from lecrapaud.db import Target
39
-
40
-
41
- # pd print options
42
- # pd.set_option("display.max_columns", None)
43
- # pd.reset_option("display.max_rows")
44
- # pd.set_option("display.max_colwidth", None)
45
-
46
-
47
- # Main function to create targets
48
- def targets_creation(
49
- df: pd.DataFrame,
50
- top_x_stock: float = 0.1,
51
- local_max_order: int = 10,
52
- threshold: int = 5,
53
- ):
54
- """Create target variables for stock data from yfinance
55
-
56
- Args:
57
- df (pd.DataFrame): a dataframe obtained with the `get_data` function
58
- top_x_stock (float): the fraction of stocks considered top-ranked for the day
59
- local_max_order (int): the window size to look at on both sides of an extremum: the greater, the more 'global' the extremum.
- threshold (int): number of days around an extremum within which BUY/SELL signals are assigned (TARGET_11).
60
-
61
- Returns:
62
- df with more columns:
63
- - date variables : we create YEAR, MONTH, DAY, WEEK, WEEKDAY, YEARWEEK and YEARDAY features
64
- return, market return, residual return and similar computations on volume are used to create 6 new features
65
- - target variables :
66
- - TARGET_1 : next day return
67
- - TARGET_2 : categorical return (positive 1, or negative 0)
68
- - TARGET_3 : next day ranking from best (1) to worst (n_stock) returns
69
- - TARGET_4 : categorical next day top ranking (in top_x_stock) (1), or not (0)
70
- - TARGET_5, TARGET_6, TARGET_7, TARGET_8 : same but with residual return
71
- - TARGET_9 : categorical with 1 if it's a local maximum and 0 if not
72
- - TARGET_10 : categorical with 1 if it's a local minimum and 0 if not
73
- TARGET_11 : trading signals based on proximity to local minima and maxima (needs multi-binary loss support)
74
- TARGET_12, 13, 14 : return in 9, 14, 21 days
75
-
76
-
77
- """
78
-
79
- # Creating targets
80
- logger.info("Creating target variables...")
81
-
82
- # TARGET 1-4 : We start with target RET
83
- target = "RET"
84
- stock_column = "STOCK"
85
- nb_of_stocks = len(df[stock_column].unique())
86
-
87
- first_x_percent = max(int(nb_of_stocks * top_x_stock), 1)
88
-
89
- df["TARGET_1"] = df[target].shift(-1)
90
- df["TARGET_2"] = np.select([df["TARGET_1"] <= 0, df["TARGET_1"] > 0], [0, 1])
91
- df["TARGET_3"] = df.groupby("DATE")["TARGET_1"].rank(
92
- method="first", ascending=False
93
- )
94
- df["TARGET_4"] = np.select(
95
- [
96
- df.groupby("DATE")["TARGET_1"].rank(method="first", ascending=False)
97
- <= first_x_percent
98
- ],
99
- [1],
100
- default=0,
101
- )
102
-
103
- # TARGET 5-8 : We do the same for RESIDUAL_RET
104
- target = "RESIDUAL_RET"
105
-
106
- df["TARGET_5"] = df[target].shift(-1)
107
- df["TARGET_6"] = np.select([df["TARGET_5"] <= 0, df["TARGET_5"] > 0], [0, 1])
108
- df["TARGET_7"] = df.groupby("DATE")["TARGET_5"].rank(
109
- method="first", ascending=False
110
- )
111
- df["TARGET_8"] = np.select(
112
- [
113
- df.groupby("DATE")["TARGET_5"].rank(method="first", ascending=False)
114
- <= first_x_percent
115
- ],
116
- [1],
117
- default=0,
118
- )
119
-
120
- # TARGET 9-10 : Let's look at local min and max: they can be interpreted as buy and sell signals respectively
121
- target = "CLOSE"
122
-
123
- df["TARGET_9"] = 0
124
- df["TARGET_10"] = 0
125
-
126
- # Calculate local maxima and set TARGET_9 to 1 where maxima are found
127
- maxima_indices = df.groupby(stock_column)[target].transform(
128
- lambda x: x.index.isin(
129
- x.iloc[argrelextrema(x.values, np.greater, order=local_max_order)].index
130
- )
131
- )
132
-
133
- minima_indices = df.groupby(stock_column)[target].transform(
134
- lambda x: x.index.isin(
135
- x.iloc[argrelextrema(x.values, np.less, order=local_max_order)].index
136
- )
137
- )
138
-
139
- df.loc[maxima_indices, "TARGET_9"] = 1
140
- df.loc[minima_indices, "TARGET_10"] = 1
141
-
142
- # TARGET 11 : We will create trading signals based on proximity to local minima and maxima.
143
- df["TARGET_11"] = 2 # Default value for HOLD
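- # Signal encoding: 0 = STRONG SELL, 1 = SELL, 2 = HOLD, 3 = BUY, 4 = STRONG BUY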
144
-
145
- # Function to detect local minima and maxima, and assign signals
146
- def assign_signals(group):
147
- close_prices = group[target].values
148
- dates = group["DATE"].values
149
-
150
- # Detect local maxima and minima using argrelextrema
151
- local_maxima_idx = argrelextrema(
152
- close_prices, np.greater, order=local_max_order
153
- )[0]
154
- local_minima_idx = argrelextrema(close_prices, np.less, order=local_max_order)[
155
- 0
156
- ]
157
-
158
- # STRONG BUY (4) for local minima, STRONG SELL (0) for local maxima
159
- group.loc[group.index[local_minima_idx], "TARGET_11"] = 4
160
- group.loc[group.index[local_maxima_idx], "TARGET_11"] = 0
161
-
162
- # Assign BUY (3) and SELL (1) based on proximity to extrema within the threshold window
163
- for idx in local_minima_idx:
164
- # Get the actual date of the minima
165
- min_date = dates[idx]
166
- # Select the rows within the threshold window around the minima date
167
- buy_window = group.loc[
168
- (group["DATE"] >= min_date - pd.Timedelta(days=threshold))
169
- & (group["DATE"] <= min_date + pd.Timedelta(days=threshold))
170
- ]
171
- group.loc[buy_window.index, "TARGET_11"] = np.where(
172
- buy_window["DATE"] == min_date,
173
- 4,
174
- 3, # STRONG BUY at minima, BUY near minima
175
- )
176
-
177
- for idx in local_maxima_idx:
178
- # Get the actual date of the maxima
179
- max_date = dates[idx]
180
- # Select the rows within the threshold window around the maxima date
181
- sell_window = group.loc[
182
- (group["DATE"] >= max_date - pd.Timedelta(days=threshold))
183
- & (group["DATE"] <= max_date + pd.Timedelta(days=threshold))
184
- ]
185
- group.loc[sell_window.index, "TARGET_11"] = np.where(
186
- sell_window["DATE"] == max_date,
187
- 0,
188
- 1, # STRONG SELL at maxima, SELL near maxima
189
- )
190
-
191
- return group
192
-
193
- # Apply the function to each stock group
194
- df = df.groupby(stock_column, group_keys=False).apply(assign_signals)
195
-
196
- # TARGET 12, 13, 14 : return in 9,14,21 days
197
- df["TARGET_12"] = df.groupby("STOCK")["CLOSE"].pct_change(9).shift(-9)
198
- df["TARGET_13"] = df.groupby("STOCK")["CLOSE"].pct_change(14).shift(-14)
199
- df["TARGET_14"] = df.groupby("STOCK")["CLOSE"].pct_change(21).shift(-21)
200
-
201
- # Update database
202
- # TODO: in bulk
203
- Target.upsert(
204
- match_fields=["name", "type"],
205
- name="TARGET_1",
206
- type="regression",
207
- description="Next day return",
208
- )
209
- Target.upsert(
210
- match_fields=["name", "type"],
211
- name="TARGET_2",
212
- type="classification",
213
- description="Sign of next day return",
214
- )
215
- Target.upsert(
216
- match_fields=["name", "type"],
217
- name="TARGET_3",
218
- type="regression",
219
- description="Ranking of next day return",
220
- )
221
- Target.upsert(
222
- match_fields=["name", "type"],
223
- name="TARGET_4",
224
- type="classification",
225
- description="Top ranking of next day return",
226
- )
227
- Target.upsert(
228
- match_fields=["name", "type"],
229
- name="TARGET_5",
230
- type="regression",
231
- description="Next day residual return",
232
- )
233
- Target.upsert(
234
- match_fields=["name", "type"],
235
- name="TARGET_6",
236
- type="classification",
237
- description="Sign of next day residual return",
238
- )
239
- Target.upsert(
240
- match_fields=["name", "type"],
241
- name="TARGET_7",
242
- type="regression",
243
- description="Ranking of next day residual return",
244
- )
245
- Target.upsert(
246
- match_fields=["name", "type"],
247
- name="TARGET_8",
248
- type="classification",
249
- description="Top ranking of next day residual return",
250
- )
251
- Target.upsert(
252
- match_fields=["name", "type"],
253
- name="TARGET_9",
254
- type="classification",
255
- description="Local maxima",
256
- )
257
- Target.upsert(
258
- match_fields=["name", "type"],
259
- name="TARGET_10",
260
- type="classification",
261
- description="Local minima",
262
- )
263
- Target.upsert(
264
- match_fields=["name", "type"],
265
- name="TARGET_11",
266
- type="classification",
267
- description="Trading signals based on proximity to local minima and maxima",
268
- )
269
- Target.upsert(
270
- match_fields=["name", "type"],
271
- name="TARGET_12",
272
- type="regression",
273
- description="Return in 9 days",
274
- )
275
- Target.upsert(
276
- match_fields=["name", "type"],
277
- name="TARGET_13",
278
- type="regression",
279
- description="Return in 14 days",
280
- )
281
- Target.upsert(
282
- match_fields=["name", "type"],
283
- name="TARGET_14",
284
- type="regression",
285
- description="Return in 21 days",
286
- )
287
-
288
- return df
289
-
290
-
291
- def calculate_option_features(option_data: list[dict], spot_price: float):
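- # option_data is expected to be a list of dicts, one per contract, with keys
- # "type" ("put"/"call"), "strike", "volume", "open_interest", "delta",
- # "implied_volatility" and "expiration"; spot_price is the current underlying price.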
292
- puts = [opt for opt in option_data if opt["type"] == "put"]
293
- calls = [opt for opt in option_data if opt["type"] == "call"]
294
-
295
- def safe_float(x):
296
- try:
297
- return float(x)
298
- except (TypeError, ValueError):
299
- return 0.0
300
-
301
- # Convert and clean data
302
- for opt in option_data:
303
- for key in ["strike", "volume", "open_interest", "delta", "implied_volatility"]:
304
- opt[key] = safe_float(opt.get(key, 0.0))
305
-
306
- # Put/Call ratios
307
- total_put_vol = sum(p["volume"] for p in puts)
308
- total_call_vol = sum(c["volume"] for c in calls)
309
- total_put_oi = sum(p["open_interest"] for p in puts)
310
- total_call_oi = sum(c["open_interest"] for c in calls)
311
-
312
- put_call_ratio_vol = total_put_vol / total_call_vol if total_call_vol > 0 else None
313
- put_call_ratio_oi = total_put_oi / total_call_oi if total_call_oi > 0 else None
314
-
315
- # Open Interest Skew
316
- oi_skew = sum(c["open_interest"] for c in calls if c["strike"] > spot_price) - sum(
317
- p["open_interest"] for p in puts if p["strike"] < spot_price
318
- )
319
-
320
- # Total Open Interest
321
- total_oi = sum(opt["open_interest"] for opt in option_data)
322
-
323
- # Delta-weighted Put/Call Ratio
324
- dw_put = sum(p["delta"] * p["volume"] for p in puts)
325
- dw_call = sum(c["delta"] * c["volume"] for c in calls)
326
- delta_weighted_pcr = dw_put / dw_call if dw_call > 0 else None
327
-
328
- # ATM IV
329
- atm_option = min(option_data, key=lambda x: abs(x["strike"] - spot_price))
330
- atm_iv = atm_option["implied_volatility"]
331
-
332
- # IV Skew (25-delta)
333
- iv_put_25d = np.mean(
334
- [p["implied_volatility"] for p in puts if abs(p["delta"] + 0.25) < 0.05]
335
- )
336
- iv_call_25d = np.mean(
337
- [c["implied_volatility"] for c in calls if abs(c["delta"] - 0.25) < 0.05]
338
- )
339
- iv_skew_25d = iv_put_25d - iv_call_25d if not np.isnan(iv_put_25d) and not np.isnan(iv_call_25d) else None
340
-
341
- # IV Term Structure
342
- iv_by_exp = defaultdict(list)
343
- for opt in option_data:
344
- iv_by_exp[opt["expiration"]].append(opt["implied_volatility"])
345
- expiries = sorted(iv_by_exp.keys())
346
- if len(expiries) >= 2:
347
- iv_term_structure = np.mean(iv_by_exp[expiries[-1]]) - np.mean(
348
- iv_by_exp[expiries[0]]
349
- )
350
- else:
351
- iv_term_structure = None
352
-
353
- # Moneyness
354
- moneyness = [spot_price / opt["strike"] for opt in option_data if opt["strike"] > 0]
355
-
356
- # % OTM / ITM
357
- otm_calls = [c for c in calls if c["strike"] > spot_price]
358
- otm_puts = [p for p in puts if p["strike"] < spot_price]
359
- otm = len(otm_calls) + len(otm_puts)
360
- itm = len(option_data) - otm
361
- percent_otm = otm / len(option_data) if option_data else None
362
- percent_itm = itm / len(option_data) if option_data else None
363
-
364
- # Weighted Average Strike
365
- def weighted_avg_strike(options):
366
- total_vol = sum(o["volume"] for o in options)
367
- return (
368
- sum(o["strike"] * o["volume"] for o in options) / total_vol
369
- if total_vol > 0
370
- else None
371
- )
372
-
373
- avg_strike_calls = weighted_avg_strike(calls)
374
- avg_strike_puts = weighted_avg_strike(puts)
375
-
376
- # Option Sentiment Index
377
- sentiment_numerator = sum(
378
- c["volume"] for c in calls if c["strike"] < spot_price
379
- ) - sum(p["volume"] for p in puts if p["strike"] > spot_price)
380
- sentiment_index = (
381
- sentiment_numerator / (total_put_vol + total_call_vol)
382
- if (total_put_vol + total_call_vol) > 0
383
- else None
384
- )
385
-
386
- return {
387
- "put_call_ratio_volume": put_call_ratio_vol,
388
- "put_call_ratio_open_interest": put_call_ratio_oi,
389
- "open_interest_skew": oi_skew,
390
- "total_open_interest": total_oi,
391
- "delta_weighted_pcr": delta_weighted_pcr,
392
- "atm_iv": atm_iv,
393
- "iv_skew_25d": iv_skew_25d,
394
- "iv_term_structure": iv_term_structure,
395
- "average_moneyness": np.mean(moneyness) if moneyness else None,
396
- "percent_otm": percent_otm,
397
- "percent_itm": percent_itm,
398
- "weighted_avg_strike_calls": avg_strike_calls,
399
- "weighted_avg_strike_puts": avg_strike_puts,
400
- "option_sentiment_index": sentiment_index,
401
- }
402
-
403
-
404
- def apply_indicators(df: pd.DataFrame):
405
- """Apply multiple indicators to a grouped dataframe of a single stock."""
406
- # Assuming 'df' is the OHLC data for a single stock, apply indicators
407
- result = df.copy()
408
-
409
- logger.debug("Computing non-period features...")
410
-
411
- # Apply Parabolic SAR
412
- result["Parabolic_SAR"] = parabolic_sar(df)
413
-
414
- # Apply Bollinger Bands
415
- result["Upper_BB"], result["Middle_BB"], result["Lower_BB"] = bollinger_bands(df)
416
-
417
- # Apply Ichimoku Cloud
418
- (
419
- result["Tenkan"],
420
- result["Kijun"],
421
- result["Senkou_A"],
422
- result["Senkou_B"],
423
- result["Chikou"],
424
- ) = ichimoku_cloud(df)
425
-
426
- # Apply Pivot Points (including support and resistance levels)
427
- result["Pivot"], result["R1"], result["S1"], result["R2"], result["S2"] = (
428
- pivot_points(df)
429
- )
430
-
431
- # Other indicators
432
- result["CLOSE_DIFF"] = close_diff(df)
433
- result["OBV"] = obv(df)
434
- result["DOWNWARD_PRESSURE"], result["UPWARD_PRESSURE"] = pressure(df)
435
-
436
- # Apply MACD (Moving Average Convergence Divergence)
437
- result["MACD_Line"], result["MACD_Signal"] = macd(df)
438
-
439
- # first buy/sell signal : MACD_SIGNAL_DIFF crosses the 0 level
440
- result["MACD_SIGNAL_DIFF"] = result["MACD_Line"] - result["MACD_Signal"]
441
- result["BUY_1"] = np.where(
442
- (result["MACD_SIGNAL_DIFF"] > 0)
443
- & (result["MACD_SIGNAL_DIFF"].shift(1) < 0), # Buy signal (MACD crossover)
444
- 1, # Buy
445
- np.where(
446
- (result["MACD_SIGNAL_DIFF"] < 0)
447
- & (
448
- result["MACD_SIGNAL_DIFF"].shift(1) > 0
449
- ), # Sell signal (MACD crossunder)
450
- -1, # Sell
451
- np.nan, # Default case
452
- ),
453
- )
454
- result["BUY_1"] = result["BUY_1"].fillna(0) # TODO: should we fill with 0 (done)
455
-
456
- # second buy/sell signal : MACD_SIGNAL_DIFF crosses the 30% threshold of its maximum value while positive and decreasing, or the 30% threshold of its minimum value while negative and increasing
457
- # Calculate rolling 20-day max and min values for MACD_SIGNAL_DIFF per stock
458
- macd_signal_diff_max_20_days = result.groupby("STOCK")[
459
- "MACD_SIGNAL_DIFF"
460
- ].transform(lambda x: x.rolling(20).max())
461
- macd_signal_diff_min_20_days = result.groupby("STOCK")[
462
- "MACD_SIGNAL_DIFF"
463
- ].transform(lambda x: x.rolling(20).min())
464
-
465
- # Define the buy/sell signal conditions
466
- buy_condition = (
467
- (result["MACD_SIGNAL_DIFF"] > result["MACD_SIGNAL_DIFF"].shift(1)) # Increasing
468
- & (result["MACD_SIGNAL_DIFF"] < 0) # Negative value
469
- & (
470
- result["MACD_SIGNAL_DIFF"] > 0.3 * macd_signal_diff_min_20_days
471
- ) # Above 30% of minimum
472
- )
473
-
474
- sell_condition = (
475
- (result["MACD_SIGNAL_DIFF"] < result["MACD_SIGNAL_DIFF"].shift(1)) # Decreasing
476
- & (result["MACD_SIGNAL_DIFF"] > 0) # Positive value
477
- & (
478
- result["MACD_SIGNAL_DIFF"] < 0.3 * macd_signal_diff_max_20_days
479
- ) # Below 30% of maximum
480
- )
481
-
482
- # Apply the conditions to calculate buy/sell signals
483
- result["BUY_2"] = np.where(
484
- buy_condition,
485
- np.abs(
486
- (result["MACD_SIGNAL_DIFF"] - 0.3 * macd_signal_diff_min_20_days)
487
- / (0.3 * macd_signal_diff_min_20_days)
488
- ),
489
- np.where(
490
- sell_condition,
491
- -np.abs(
492
- (result["MACD_SIGNAL_DIFF"] - 0.3 * macd_signal_diff_max_20_days)
493
- / (0.3 * macd_signal_diff_max_20_days)
494
- ),
495
- 0, # Default
496
- ),
497
- )
498
-
499
- periods = [
500
- 9,
501
- 14,
502
- 21,
503
- 50,
504
- 126,
505
- 200,
506
- 252,
507
- ] # 2 weeks, 3 weeks, 1 month and 2.5 months
508
- # TODO: we could add more long-term horizons: 126 days (6 months), 200 days (9 months) and 252 days (1 year)
509
-
510
- features = []
511
- for period in periods:
512
- logger.debug(f"Computing period features for {period} days...")
513
-
514
- features.append(
515
- pd.DataFrame(
516
- {
517
- f"CUMUL_RET_{period}": cumulative_return(df, period=period),
518
- f"SMA_{period}": sma(df, period=period),
519
- f"EMA_{period}": ema(df, period=period),
520
- f"VOLATILITY_{period}": volatility(df, period=period),
521
- f"ADX_{period}": adx(df, period=period),
522
- f"ATR_{period}": atr(df, period=period),
523
- f"CMF_{period}": chaikin_money_flow(df, period=period),
524
- f"RSI_{period}": rsi(df, period=period),
525
- f"MFI_{period}": mfi(df, period=period),
526
- },
527
- index=df.index,
528
- )
529
- )
530
-
531
- # Stochastic Oscillator returns two series: %K and %D
532
- k, d = stochastic(df, period=period)
533
- features.append(
534
- pd.DataFrame(
535
- {
536
- f"%K_{period}": k,
537
- f"%D_{period}": d,
538
- },
539
- index=df.index,
540
- )
541
- )
542
-
543
- result = pd.concat([result] + features, axis=1)
544
-
545
- # third buy/sell signal : RSI is overbought >70 / oversold <30
546
- result["BUY_3"] = np.where(
547
- result["RSI_14"] <= 30,
548
- (30 - result["RSI_14"]) / 30,
549
- np.where(result["RSI_14"] >= 70, -(result["RSI_14"] - 70) / 30, 0),
550
- )
551
-
552
- # fourth buy/sell signal : RSI vs CLOSE divergence
553
- # The RSI vs. Close divergence trading signal identifies potential reversals by detecting when the
554
- # Relative Strength Index (RSI) and price (Close) move in opposite directions
555
- # bullish divergence occurs when the price makes lower lows while RSI makes higher lows (potential uptrend),
556
- # and bearish divergence occurs when the price makes higher highs while RSI makes lower highs (potential downtrend)
557
-
558
- # Detect local peaks (RSI Highs) and troughs (RSI Lows) for divergence analysis
559
- # Compute local maxima and minima indices
560
- rsi_peak_indices = argrelextrema(result["RSI_14"].values, np.greater)[
561
- 0
562
- ] # RSI highs
563
- rsi_trough_indices = argrelextrema(result["RSI_14"].values, np.less)[0] # RSI lows
564
-
565
- # Create boolean masks for peaks and troughs
566
- rsi_peaks_mask = np.zeros(len(result), dtype=bool)
567
- rsi_troughs_mask = np.zeros(len(result), dtype=bool)
568
-
569
- rsi_peaks_mask[rsi_peak_indices] = True
570
- rsi_troughs_mask[rsi_trough_indices] = True
571
-
572
- # Extract peak and trough rows efficiently
573
- rsi_peaks = result.loc[rsi_peaks_mask, ["CLOSE", "RSI_14"]].copy()
574
- rsi_troughs = result.loc[rsi_troughs_mask, ["CLOSE", "RSI_14"]].copy()
575
-
576
- # Compute RSI and CLOSE differences to check divergence
577
- for i in [1, 2, 3]:
578
- # RSI & Price difference from past peaks
579
- rsi_peaks[f"RSI_PEAK_DIFF_{i}"] = rsi_peaks["RSI_14"].diff(i)
580
- rsi_peaks[f"PRICE_PEAK_DIFF_{i}"] = rsi_peaks["CLOSE"].diff(i)
581
-
582
- # RSI & Price difference from past troughs
583
- rsi_troughs[f"RSI_TROUGH_DIFF_{i}"] = rsi_troughs["RSI_14"].diff(i)
584
- rsi_troughs[f"PRICE_TROUGH_DIFF_{i}"] = rsi_troughs["CLOSE"].diff(i)
585
-
586
- # Detect bearish divergence (RSI down, price up) and bullish divergence (RSI up, price down)
587
- rsi_peaks[f"DIVERGENCE_{i}"] = np.where(
588
- (rsi_peaks[f"RSI_PEAK_DIFF_{i}"] < 0)
589
- & (rsi_peaks[f"PRICE_PEAK_DIFF_{i}"] > 0),
590
- -np.abs(rsi_peaks[f"RSI_PEAK_DIFF_{i}"]),
591
- np.where(
592
- (rsi_peaks[f"RSI_PEAK_DIFF_{i}"] > 0)
593
- & (rsi_peaks[f"PRICE_PEAK_DIFF_{i}"] < 0),
594
- -np.abs(rsi_peaks[f"RSI_PEAK_DIFF_{i}"]),
595
- 0,
596
- ),
597
- )
598
-
599
- rsi_troughs[f"DIVERGENCE_{i}"] = np.where(
600
- (rsi_troughs[f"RSI_TROUGH_DIFF_{i}"] > 0)
601
- & (rsi_troughs[f"PRICE_TROUGH_DIFF_{i}"] < 0),
602
- np.abs(rsi_troughs[f"RSI_TROUGH_DIFF_{i}"]),
603
- np.where(
604
- (rsi_troughs[f"RSI_TROUGH_DIFF_{i}"] < 0)
605
- & (rsi_troughs[f"PRICE_TROUGH_DIFF_{i}"] > 0),
606
- np.abs(rsi_troughs[f"RSI_TROUGH_DIFF_{i}"]),
607
- 0,
608
- ),
609
- )
610
-
611
- # Concatenate peak and trough divergences into a single DataFrame
612
- divergence_cols = [f"DIVERGENCE_{i}" for i in [1, 2, 3]]
613
- divergence_data = pd.concat(
614
- [rsi_peaks[divergence_cols], rsi_troughs[divergence_cols]], axis=0
615
- )
616
-
617
- # Merge using index alignment
618
- result[divergence_cols] = divergence_data.reindex(result.index, fill_value=0)
619
-
620
- # Sum divergence signals into BUY_4 for a single signal strength metric
621
- result["BUY_4"] = result[divergence_cols].sum(axis=1)
622
- return result
623
-
624
-
625
- # Main function to process the full dataset with multiple stocks
626
- def preprocessing(
627
- df: pd.DataFrame,
628
- for_training: bool = False,
629
- save_as_csv: bool = False,
630
- ):
631
- """Main function to process the full dataset with multiple stocks
632
-
633
- Args:
634
- - df (pd.DataFrame): the dataframe with ohlc data
635
- - for_training (bool): whether to also compute targets and save the result as data_for_training.
- - save_as_csv (bool): whether to also write the result to data_for_training.csv (Development only).
636
- """
637
-
638
- # Computing residual RET and relative VOLUME
639
- logger.info("Creating RET and VOLUME metrics...")
640
- df["RET"] = df.groupby("STOCK")["CLOSE"].pct_change(1)
641
- df["MARKET_RET"] = df.groupby("DATE")["RET"].transform("mean")
642
- df["RESIDUAL_RET"] = df["RET"] - df["MARKET_RET"]
643
-
644
- df["VOLUME_RATIO"] = (
645
- df["VOLUME"]
646
- / df.groupby("STOCK")["VOLUME"].rolling(20, min_periods=1).mean().values
647
- )
648
- df["MARKET_VOLUME_RATIO"] = df.groupby("DATE")["VOLUME_RATIO"].transform("mean")
649
- df["RELATIVE_VOLUME"] = df["VOLUME_RATIO"] - df["MARKET_VOLUME_RATIO"]
650
-
651
- logger.info("Creating historical time series metrics...")
652
- periods = [
653
- 1, # daily
654
- 2,
655
- 3,
656
- 4,
657
- 5, # weekly
658
- 9,
659
- 14,
660
- 21, # monthly
661
- 50,
662
- 126,
663
- 200,
664
- 252,
665
- ] # need to keep 1, 2, 3, 4, 5 for backward compatibility
666
- for METRIC in ["RET", "VOLUME", "RESIDUAL_RET", "RELATIVE_VOLUME"]:
667
- for i in periods:
668
- df[f"{METRIC}_-{i}"] = df[METRIC].shift(i)
669
-
670
- # Group by "STOCK" and apply the indicators for each stock
671
- logger.info("Applying indicators...")
672
- grouped_df = df.groupby("STOCK", group_keys=False)
673
- preprocessed_df = grouped_df.apply(apply_indicators)
674
-
675
- # Drop non-useful columns for training (drop from the preprocessed frame so they are excluded from the output)
677
- if "ISIN" in preprocessed_df.columns:
678
- preprocessed_df.drop(labels=["ISIN"], axis=1, inplace=True)
679
- if "SECURITY" in preprocessed_df.columns:
680
- preprocessed_df.drop(labels=["SECURITY"], axis=1, inplace=True)
680
-
681
- if for_training:
682
- preprocessed_df = targets_creation(preprocessed_df)
683
-
684
- if save_as_csv and PYTHON_ENV == "Development":
685
- preprocessed_df_to_csv = preprocessed_df.sort_values(["DATE", "STOCK"])
686
- preprocessed_df_to_csv.to_csv(
687
- f"{data_dir}/data_for_training.csv",
688
- index=False,
689
- header=True,
690
- )
691
-
692
- if for_training:
693
- preprocessed_df.dropna(inplace=True)
694
-
695
- preprocessed_df.sort_values(["DATE", "STOCK"], inplace=True)
696
- preprocessed_df.reset_index(drop=True, inplace=True)
697
-
698
- logger.info(
699
- f"{len(preprocessed_df['DATE'])} preprocessed data with shape {preprocessed_df.shape} from {datetime.strftime(preprocessed_df['DATE'].iat[0], '%d/%m/%Y')} to {datetime.strftime(preprocessed_df['DATE'].iat[-1], '%d/%m/%Y')}"
700
- )
701
-
702
- # Save results for training if needed
703
- if for_training and PYTHON_ENV == "Development":
704
- joblib.dump(preprocessed_df, f"{data_dir}/data_for_training.pkl")
705
-
706
- # Return the fully processed DataFrame with all new features (copy to avoid fragmented memory)
707
- return_df = preprocessed_df.copy()
708
- return return_df
709
-
710
-
711
- # Descriptive Analytics functions
712
-
713
-
714
- def plot_sector_repartition(df: pd.DataFrame):
715
- """Visualise the distribution of stocks per sector
716
-
717
- Args:
718
- df (pd.DataFrame): a df created with `get_data`
719
- """
720
- sns.barplot(
721
- data=df.groupby("SECTOR")["STOCK"].nunique(),
722
- orient="h",
723
- order=df.groupby("SECTOR")["STOCK"]
724
- .nunique()
725
- .sort_values(ascending=False)
726
- .index,
727
- )
728
-
729
-
730
- def visualize_extrema(
731
- data: pd.DataFrame,
732
- stock: str,
733
- days_before_last: int = 200,
734
- local_max_order: int = 10,
735
- ):
736
- """
737
- Function to visualize local maxima and minima for a given stock in the data.
738
-
739
- Parameters:
740
- - data: pd.DataFrame, DataFrame containing columns 'STOCK', 'DATE', 'CLOSE', and 'ID'
741
- - stock: str, the stock identifier to analyze (e.g., 'AAPL', 'GOOG')
742
- - days_before_last: int, number of days before the last date in the dataset to visualize
743
- - local_max_order: int, the window size for identifying local extrema (default: 10)
744
- """
745
-
746
- # Calculate the last date in the dataset
747
- last_date = data["DATE"].max()
748
- start_date = last_date - pd.Timedelta(days=days_before_last)
749
-
750
- # Find local maxima (argrelextrema with np.greater) for each stock
751
- local_max_CLOSE = (
752
- data[data["STOCK"] == stock]
753
- .set_index("DATE")["CLOSE"]
754
- .iloc[
755
- argrelextrema(
756
- data[data["STOCK"] == stock]["CLOSE"].values,
757
- np.greater,
758
- order=local_max_order,
759
- )
760
- ]
761
- .reset_index()
762
- )
763
-
764
- # Find local minima (argrelextrema with np.less) for each stock
765
- local_min_CLOSE = (
766
- data[data["STOCK"] == stock]
767
- .set_index("DATE")["CLOSE"]
768
- .iloc[
769
- argrelextrema(
770
- data[data["STOCK"] == stock]["CLOSE"].values,
771
- np.less,
772
- order=local_max_order,
773
- )
774
- ]
775
- .reset_index()
776
- )
777
-
778
- # Filter maxima based on stock and date range
779
- local_max_CLOSE = local_max_CLOSE[local_max_CLOSE["DATE"] >= start_date]
780
-
781
- # Filter minima based on stock and date range
782
- local_min_CLOSE = local_min_CLOSE[local_min_CLOSE["DATE"] >= start_date]
783
-
784
- # Log the maxima and minima dates
785
- logger.info(
786
- f"Maxima Dates for Stock {stock}: {list(local_max_CLOSE['DATE'].values)}"
787
- )
788
- logger.info(
789
- f"Minima Dates for Stock {stock}: {list(local_min_CLOSE['DATE'].values)}"
790
- )
791
-
792
- # Plot the stock's CLOSE prices within the specified date range
793
- stock_data = data[(data["STOCK"] == stock) & (data["DATE"] >= start_date)][
794
- ["CLOSE", "DATE"]
795
- ].set_index("DATE")
796
-
797
- plt.figure(figsize=(10, 6))
798
- stock_data.plot(color="black", title=f"Stock {stock} Extremas")
799
-
800
- # Add vertical lines for maxima
801
- for date in local_max_CLOSE["DATE"].values:
802
- plt.axvline(
803
- x=date,
804
- color="red",
805
- label="Maxima" if date == local_max_CLOSE["DATE"].values[0] else "",
806
- )
807
-
808
- # Add vertical lines for minima
809
- for date in local_min_CLOSE["DATE"].values:
810
- plt.axvline(
811
- x=date,
812
- color="green",
813
- label="Minima" if date == local_min_CLOSE["DATE"].values[0] else "",
814
- )
815
-
816
- plt.legend()
817
- plt.show()
818
-
819
-
820
- def visualize_trading_signals(
821
- data: pd.DataFrame,
822
- stock: str,
823
- days_before_last: int = 200,
824
- ):
825
- """
826
- Function to visualize trading signals (BUY, SELL, HOLD) for a given stock.
827
-
828
- Parameters:
829
- - data: pd.DataFrame, DataFrame containing columns 'STOCK', 'DATE', 'CLOSE', and 'TARGET_11'
830
- - stock: str, the stock identifier to analyze (e.g., 'AAPL', 'GOOG')
831
- - days_before_last: int, number of days before the last date in the dataset to visualize
832
- """
833
-
834
- # Calculate the last date in the dataset
835
- last_date = data["DATE"].max()
836
- start_date = last_date - pd.Timedelta(days=days_before_last)
837
-
838
- # Filter data for the selected stock and date range
839
- stock_data = data[(data["STOCK"] == stock) & (data["DATE"] >= start_date)].copy()
840
-
841
- # Plot the stock's CLOSE prices
842
- plt.figure(figsize=(10, 6))
843
- plt.plot(stock_data["DATE"], stock_data["CLOSE"], color="black", label="CLOSE")
844
-
845
- # Define the colors for the trading signals (TARGET_11: 4 = STRONG BUY, 3 = BUY, 2 = HOLD, 1 = SELL, 0 = STRONG SELL)
846
- colors = {4: "green", 3: "lightgreen", 2: "yellow", 1: "red", 0: "darkred"}
847
-
848
- # Plot each trading signal with the respective color
849
- for signal_value, color in colors.items():
850
- plt.scatter(
851
- stock_data.loc[stock_data["TARGET_11"] == signal_value, "DATE"],
852
- stock_data.loc[stock_data["TARGET_11"] == signal_value, "CLOSE"],
853
- color=color,
854
- label=f"Signal {signal_value}",
855
- s=50, # Size of the points
856
- )
857
-
858
- plt.title(f"Trading Signals for {stock}")
859
- plt.xlabel("Date")
860
- plt.ylabel("Close Price")
861
- plt.legend()
862
- plt.grid(True)
863
- plt.show()
864
-
865
-
866
- def visualize_data_distribution(
867
- data,
868
- plot_type="hist",
869
- features=None,
870
- bins=50,
871
- rows=5,
872
- cols=5,
873
- width_per_plot=4,
874
- height_per_plot=3,
875
- ):
876
- """
877
- Function to visualize the data distribution for multiple features in a DataFrame with dynamic figsize,
878
- splitting into multiple figures if there are too many features for one figure.
879
-
880
- Parameters:
881
- - data: pd.DataFrame, the DataFrame containing the data to visualize.
882
- - plot_type: str, the type of plot to use ('hist', 'kde', 'box').
883
- - features: list, list of features (columns) to visualize. If None, all numeric features are used.
884
- - bins: int, the number of bins for histograms (default: 50).
885
- - rows: int, number of rows in the subplot grid (default: 5).
886
- - cols: int, number of columns in the subplot grid (default: 5).
887
- - width_per_plot: int, the width of each subplot (default: 4).
888
- - height_per_plot: int, the height of each subplot (default: 3).
889
- """
890
-
891
- # If no features are specified, use all numeric features
892
- if features is None:
893
- features = data.select_dtypes(include=[np.number]).columns.tolist()
894
-
895
- # Calculate the total number of features
896
- total_features = len(features)
897
-
898
- # How many plots can fit into one figure
899
- plots_per_figure = rows * cols
900
-
901
- # Loop over the features and create new figures as needed
902
- for start in range(0, total_features, plots_per_figure):
903
- # Subset of features for the current figure
904
- subset_features = features[start : start + plots_per_figure]
905
-
906
- # Dynamically calculate figure size based on grid size and plot dimensions
907
- num_plots = len(subset_features)
908
- grid_rows = min(rows, num_plots // cols + (num_plots % cols != 0))
909
- grid_cols = min(cols, num_plots)
910
- figsize = (grid_cols * width_per_plot, grid_rows * height_per_plot)
911
-
912
- # Set up the figure and axes for this subset of features
913
- fig, axes = plt.subplots(grid_rows, grid_cols, figsize=figsize)
914
- axes = axes.flatten() # Flatten the axes for easy iteration
915
-
916
- # Plot each feature
917
- for i, feature in enumerate(subset_features):
918
- ax = axes[i]
919
-
920
- if plot_type == "hist":
921
- sns.histplot(data[feature].dropna(), bins=bins, kde=False, ax=ax)
922
- elif plot_type == "kde":
923
- sns.kdeplot(data[feature].dropna(), ax=ax, fill=True)
924
- elif plot_type == "box":
925
- sns.boxplot(data[feature].dropna(), ax=ax)
926
-
927
- ax.set_xlabel(feature)
928
- ax.set_ylabel("Count")
929
-
930
- # Hide any empty subplots
931
- for j in range(i + 1, len(axes)):
932
- fig.delaxes(axes[j])
933
-
934
- # Use tight layout to ensure there's no overlap
935
- fig.tight_layout()
936
-
937
- # Show the plot for this figure
938
- plt.show()
939
-
940
-
941
- def detect_outliers_iqr(data, degree: float = 1.5):
942
- """
943
- Detect outliers in a DataFrame using the Interquartile Range (IQR) method.
944
-
945
- Parameters:
946
- - data: pd.DataFrame, the DataFrame in which to detect outliers.
- - degree: float, the IQR multiplier used for the lower/upper bounds (default: 1.5).
947
-
948
- Returns:
949
- - outliers: pd.DataFrame, DataFrame with boolean values indicating outliers for each feature.
950
- """
951
- outliers = pd.DataFrame(index=data.index)
952
-
953
- for column in data.select_dtypes(include=[np.number]).columns:
954
- Q1 = data[column].quantile(0.25) # 1st quartile (25th percentile)
955
- Q3 = data[column].quantile(0.75) # 3rd quartile (75th percentile)
956
- IQR = Q3 - Q1 # Interquartile range
957
-
958
- lower_bound = Q1 - degree * IQR
959
- upper_bound = Q3 + degree * IQR
960
-
961
- # Detect outliers
962
- outliers[column] = (data[column] < lower_bound) | (data[column] > upper_bound)
963
-
964
- return outliers
965
-
966
-
967
- def plot_distribution(df):
968
- logger.info("DATA_DISTRIBUTION")
969
-
970
- logger.info("numerical features")
971
- visualize_data_distribution(df.select_dtypes(include=["float64"]))
972
-
973
- logger.info("categorical features")
974
- visualize_data_distribution(df.select_dtypes(include=["int64"]))
975
-
976
- logger.info("nb of outliers")
977
- outliers = detect_outliers_iqr(df.select_dtypes(include=["float64"]), degree=5)
978
-
979
- with pd.option_context("display.max_rows", None):
980
- logger.info(outliers.sum().sort_values(ascending=False))
981
-
982
- logger.info("zoom on volume outliers")
983
- columns = [c for c in df.columns if "VOLUME" in c]
984
- visualize_data_distribution(df, features=columns, plot_type="box", cols=3)
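
For orientation, here is a minimal usage sketch of the entry point defined in the removed file. It is illustrative only: the column names (DATE, STOCK, OPEN, HIGH, LOW, CLOSE, VOLUME) are assumptions inferred from the code above, the module's import path is not shown in this diff (the functions are called as if they were in scope), and for_training=True would additionally build TARGET_1..TARGET_14 and upsert Target rows, which requires a configured database.

import numpy as np
import pandas as pd

# Build a small two-stock OHLCV panel on business days (synthetic data)
dates = pd.bdate_range("2024-01-02", periods=300)
rng = np.random.default_rng(0)
frames = []
for stock in ["AAA", "BBB"]:
    close = 100 * np.cumprod(1 + rng.normal(0, 0.01, len(dates)))
    frames.append(
        pd.DataFrame(
            {
                "DATE": dates,
                "STOCK": stock,
                "OPEN": close * (1 + rng.normal(0, 0.002, len(dates))),
                "HIGH": close * 1.01,
                "LOW": close * 0.99,
                "CLOSE": close,
                "VOLUME": rng.integers(100_000, 1_000_000, len(dates)),
            }
        )
    )
df = pd.concat(frames, ignore_index=True).sort_values(["DATE", "STOCK"])

# Indicators only; pass for_training=True to also create the 14 targets
features = preprocessing(df, for_training=False)
print(features.filter(regex="RSI_|MACD_|BUY_").tail())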