bbstrader 0.2.93__py3-none-any.whl → 0.2.95__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of bbstrader might be problematic.

Files changed (35)
  1. bbstrader/__init__.py +20 -20
  2. bbstrader/__main__.py +50 -50
  3. bbstrader/btengine/__init__.py +54 -54
  4. bbstrader/btengine/scripts.py +157 -157
  5. bbstrader/compat.py +19 -19
  6. bbstrader/config.py +137 -137
  7. bbstrader/core/data.py +22 -22
  8. bbstrader/core/utils.py +146 -146
  9. bbstrader/metatrader/__init__.py +6 -6
  10. bbstrader/metatrader/account.py +1516 -1516
  11. bbstrader/metatrader/copier.py +750 -745
  12. bbstrader/metatrader/rates.py +584 -584
  13. bbstrader/metatrader/risk.py +749 -748
  14. bbstrader/metatrader/scripts.py +81 -81
  15. bbstrader/metatrader/trade.py +1836 -1836
  16. bbstrader/metatrader/utils.py +645 -645
  17. bbstrader/models/__init__.py +10 -10
  18. bbstrader/models/factors.py +312 -312
  19. bbstrader/models/ml.py +1272 -1272
  20. bbstrader/models/optimization.py +182 -182
  21. bbstrader/models/portfolio.py +223 -223
  22. bbstrader/models/risk.py +398 -398
  23. bbstrader/trading/__init__.py +11 -11
  24. bbstrader/trading/execution.py +846 -846
  25. bbstrader/trading/script.py +155 -155
  26. bbstrader/trading/scripts.py +69 -69
  27. bbstrader/trading/strategies.py +860 -860
  28. bbstrader/tseries.py +1842 -1842
  29. {bbstrader-0.2.93.dist-info → bbstrader-0.2.95.dist-info}/LICENSE +21 -21
  30. {bbstrader-0.2.93.dist-info → bbstrader-0.2.95.dist-info}/METADATA +188 -187
  31. bbstrader-0.2.95.dist-info/RECORD +44 -0
  32. bbstrader-0.2.93.dist-info/RECORD +0 -44
  33. {bbstrader-0.2.93.dist-info → bbstrader-0.2.95.dist-info}/WHEEL +0 -0
  34. {bbstrader-0.2.93.dist-info → bbstrader-0.2.95.dist-info}/entry_points.txt +0 -0
  35. {bbstrader-0.2.93.dist-info → bbstrader-0.2.95.dist-info}/top_level.txt +0 -0
bbstrader/tseries.py CHANGED
@@ -1,1842 +1,1842 @@
1
- """
2
- The `tseries` module is designed for conducting
3
- advanced time series analysis in financial markets.
4
- It leverages statistical models and algorithms to perform
5
- tasks such as cointegration testing, volatility modeling,
6
- and filter-based estimation to assist in trading strategy development,
7
- market analysis, and financial data exploration.
8
- """
9
-
10
- import pprint
11
- import warnings
12
- from itertools import combinations
13
- from typing import List, Tuple, Union
14
-
15
- import matplotlib.pyplot as plt
16
- import numpy as np
17
- import pandas as pd
18
- import pmdarima as pm
19
- import seaborn as sns
20
- import statsmodels.api as sm
21
- import statsmodels.tsa.stattools as ts
22
- import yfinance as yf
23
- from arch import arch_model
24
- from filterpy.kalman import KalmanFilter
25
- from hurst import compute_Hc
26
- from pykalman import KalmanFilter as PyKalmanFilter
27
- from scipy.optimize import minimize
28
- from sklearn.linear_model import LogisticRegressionCV
29
- from sklearn.model_selection import GridSearchCV
30
- from sklearn.tree import DecisionTreeClassifier
31
- from statsmodels.graphics.tsaplots import plot_acf
32
- from statsmodels.stats.diagnostic import acorr_ljungbox
33
- from statsmodels.tsa.arima.model import ARIMA
34
- from statsmodels.tsa.stattools import adfuller, coint
35
- from statsmodels.tsa.vector_ar.var_model import VAR
36
- from statsmodels.tsa.vector_ar.vecm import coint_johansen
37
- from tqdm import tqdm
38
-
39
- warnings.filterwarnings("ignore")
40
-
41
-
42
- __all__ = [
43
- "load_and_prepare_data",
44
- "fit_best_arima",
45
- "fit_garch",
46
- "predict_next_return",
47
- "get_prediction",
48
- "get_corr",
49
- "run_cadf_test",
50
- "run_hurst_test",
51
- "run_coint_test",
52
- "run_kalman_filter",
53
- "ArimaGarchModel",
54
- "KalmanFilterModel",
55
- "OrnsteinUhlenbeck",
56
- "remove_correlated_assets",
57
- "check_stationarity",
58
- "remove_stationary_assets",
59
- "select_assets",
60
- "compute_pair_metrics",
61
- "find_cointegrated_pairs",
62
- "analyze_cointegrated_pairs",
63
- "select_candidate_pairs",
64
- "KFSmoother",
65
- "KFHedgeRatio",
66
- ]
67
-
68
- # *******************************************
69
- # ARIMA AND GARCH MODELS *
70
- # *******************************************
71
-
72
-
73
- def load_and_prepare_data(df: pd.DataFrame):
74
- """
75
- Prepares financial time series data for analysis.
76
-
77
- This function takes a pandas DataFrame containing financial data,
78
- calculates logarithmic returns, and the first difference
79
- of these logarithmic returns. It handles missing values
80
- by filling them with zeros.
81
-
82
- Args:
83
- df (pd.DataFrame): DataFrame containing at least
84
- a `Close` column with closing prices of a financial asset.
85
-
86
- Returns:
87
- pd.DataFrame: DataFrame with additional
88
- columns for logarithmic returns (`log_return`)
89
- and the first difference of logarithmic returns (`diff_log_return`),
90
- with `NaN` values filled with `0`.
91
- """
92
- # Load data
93
- data = df.copy()
94
- # Calculate logarithmic returns
95
- data["log_return"] = np.log(data["Close"] / data["Close"].shift(1))
96
- # Differencing if necessary
97
- data["diff_log_return"] = data["log_return"].diff()
98
- # Drop NaN values
99
- data.fillna(0, inplace=True)
100
- return data
101
-
102
-
103
- def fit_best_arima(window_data: Union[pd.Series, np.ndarray]):
104
- """
105
- Identifies and fits the best `ARIMA` model
106
- based on the Akaike Information Criterion `(AIC)`.
107
-
108
- Iterates through different combinations of `p` and `q`
109
- parameters (within specified ranges) for the ARIMA model,
110
- fits them to the provided data, and selects the combination
111
- with the lowest `AIC` value.
112
-
113
- Args:
114
- window_data (pd.Series or np.ndarray):
115
- Time series data to fit the `ARIMA` model on.
116
-
117
- Returns:
118
- ARIMA result object: The fitted `ARIMA` model with the lowest `AIC`.
119
- """
120
- if isinstance(window_data, pd.Series):
121
- window_data = window_data.values
122
-
123
- window_data = window_data[~(np.isnan(window_data) | np.isinf(window_data))]
124
- # Fit ARIMA model with best parameters
125
- model = pm.auto_arima(
126
- window_data,
127
- start_p=1,
128
- start_q=1,
129
- max_p=6,
130
- max_q=6,
131
- seasonal=False,
132
- stepwise=True,
133
- )
134
- final_order = model.order
135
- from arch.utility.exceptions import ConvergenceWarning as ArchWarning
136
- from statsmodels.tools.sm_exceptions import ConvergenceWarning as StatsWarning
137
-
138
- with warnings.catch_warnings():
139
- warnings.filterwarnings("ignore", category=StatsWarning, module="statsmodels")
140
- warnings.filterwarnings("ignore", category=ArchWarning, module="arch")
141
- try:
142
- best_arima_model = ARIMA(
143
- window_data + 1e-5, order=final_order, missing="drop"
144
- ).fit()
145
- return best_arima_model
146
- except np.linalg.LinAlgError:
147
- # Catch specific linear algebra errors
148
- print("LinAlgError occurred, skipping this data point.")
149
- return None
150
- except Exception as e:
151
- # Catch any other unexpected errors and log them
152
- print(f"An error occurred: {e}")
153
- return None
154
-
155
-
156
- def fit_garch(window_data: Union[pd.Series, np.ndarray]):
157
- """
158
- Fits an `ARIMA` model to the data to get residuals,
159
- then fits a `GARCH(1,1)` model on these residuals.
160
-
161
- Utilizes the residuals from the best `ARIMA` model fit to
162
- then model volatility using a `GARCH(1,1)` model.
163
-
164
- Args:
165
- window_data (pd.Series or np.ndarray):
166
- Time series data for which to fit the `ARIMA` and `GARCH` models.
167
-
168
- Returns:
169
- tuple: A tuple containing the `ARIMA` result
170
- object and the `GARCH` result object.
171
- """
172
- arima_result = fit_best_arima(window_data)
173
- if arima_result is None:
174
- return None, None
175
- resid = np.asarray(arima_result.resid)
176
- resid = resid[~(np.isnan(resid) | np.isinf(resid))]
177
- garch_model = arch_model(resid, p=1, q=1, rescale=False)
178
- garch_result = garch_model.fit(disp="off")
179
- return arima_result, garch_result
180
-
181
-
182
- def predict_next_return(arima_result, garch_result):
183
- """
184
- Predicts the next return value using fitted `ARIMA` and `GARCH` models.
185
-
186
- Combines the next period forecast from the `ARIMA` model
187
- with the next period volatility forecast from the `GARCH` model
188
- to predict the next return value.
189
-
190
- Args:
191
- arima_result (ARIMA result object): The fitted `ARIMA` model result.
192
- garch_result (ARCH result object): The fitted `GARCH` model result.
193
-
194
- Returns:
195
- float: The predicted next return, adjusted for predicted volatility.
196
- """
197
- if arima_result is None or garch_result is None:
198
- return 0
199
- # Predict next value with ARIMA
200
- arima_pred = arima_result.forecast(steps=1)
201
- # Predict next volatility with GARCH
202
- garch_pred = garch_result.forecast(horizon=1)
203
- next_volatility = garch_pred.variance.iloc[-1, 0]
204
-
205
- # Combine predictions (return + volatility)
206
- if not isinstance(arima_pred, np.ndarray):
207
- pred = arima_pred.values[0]
208
- else:
209
- pred = arima_pred[0]
210
- return pred + next_volatility
211
-
212
-
213
- def get_prediction(window_data: Union[pd.Series, np.ndarray]):
214
- """
215
- Orchestrator function to get the next period's return prediction.
216
-
217
- This function ties together the process of fitting
218
- both `ARIMA` and `GARCH` models on the provided data
219
- and then predicting the next period's return using these models.
220
-
221
- Args:
222
- window_data (Union[pd.Series , np.ndarray]):
223
- Time series data to fit the models and predict the next return.
224
-
225
- Returns
226
- float: Predicted next return value.
227
- """
228
- arima_result, garch_result = fit_garch(window_data)
229
- prediction = predict_next_return(arima_result, garch_result)
230
- return prediction
231
-
232
-
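Editorial sketch (not part of the packaged file): the helpers above chain together as load_and_prepare_data, fit_best_arima, fit_garch and predict_next_return. A minimal usage example on synthetic closing prices; the random-walk data and the 252-observation window are assumptions made purely for illustration.

import numpy as np
import pandas as pd
from bbstrader.tseries import get_prediction, load_and_prepare_data

# Synthetic daily closes (a random walk), used only to exercise the pipeline
rng = np.random.default_rng(42)
closes = 100 * np.exp(np.cumsum(rng.normal(0, 0.01, 500)))
raw = pd.DataFrame({"Close": closes})

prepared = load_and_prepare_data(raw)                # adds log_return / diff_log_return
window = prepared["diff_log_return"].values[-252:]   # rolling window of differenced log returns

# A positive forecast suggests a LONG bias, a negative one a SHORT bias
forecast = get_prediction(window)
print(f"Next-period forecast (return + volatility): {forecast:.6f}")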
233
- class ArimaGarchModel:
234
- """
235
- This class implements a time series model
236
- that combines `ARIMA (AutoRegressive Integrated Moving Average)`
237
- and `GARCH (Generalized Autoregressive Conditional Heteroskedasticity)` models
238
- to predict future returns based on historical price data.
239
-
240
- The model is implemented in the following steps:
241
- 1. Data Preparation: Load and prepare the historical price data.
242
- 2. Modeling: Fit the ARIMA model to the data and then fit the GARCH model to the residuals.
243
- 3. Prediction: Predict the next return using the ARIMA model and the next volatility using the GARCH model.
244
- 4. Trading Strategy: Execute the trading strategy based on the predictions.
245
- 5. Vectorized Backtesting: Backtest the trading strategy using the historical data.
246
-
247
- Example:
248
- >>> import yfinance as yf
249
- >>> from bbstrader.tseries import ArimaGarchModel
250
- >>> from bbstrader.tseries import load_and_prepare_data
251
-
252
- >>> if __name__ == '__main__':
253
- >>> # ARCH SPY Vectorize Backtest
254
- >>> k = 252
255
- >>> data = yf.download("SPY", start="2010-01-02", end="2015-12-31")
256
- >>> arch = ArimaGarchModel("SPY", data, k=k)
257
- >>> df = load_and_prepare_data(data)
258
- >>> arch.show_arima_garch_results(df['diff_log_return'].values[-k:])
259
- >>> arch.backtest_strategy()
260
- """
261
-
262
- def __init__(self, symbol, data, k: int = 252):
263
- """
264
- Initializes the ArimaGarchStrategy class.
265
-
266
- Args:
267
- symbol (str): The ticker symbol for the financial instrument.
268
- data (pd.DataFrame): `The raw dataset containing at least the 'Close' prices`.
269
- k (int): The window size for rolling prediction in backtesting.
270
- """
271
- self.symbol = symbol
272
- self.data = self.load_and_prepare_data(data)
273
- self.k = k
274
-
275
- # Step 1: Data Preparation
276
- def load_and_prepare_data(self, df):
277
- """
278
- Prepares the dataset by calculating logarithmic returns
279
- and differencing if necessary.
280
-
281
- Args:
282
- df (pd.DataFrame): `The raw dataset containing at least the 'Close' prices`.
283
-
284
- Returns:
285
- pd.DataFrame: The dataset with additional columns
286
- for log returns and differenced log returns.
287
- """
288
- return load_and_prepare_data(df)
289
-
290
- # Step 2: Modeling (ARIMA + GARCH)
291
- def fit_best_arima(self, window_data):
292
- """
293
- Fits the ARIMA model to the provided window of data,
294
- selecting the best model based on AIC.
295
-
296
- Args:
297
- window_data (np.array): The dataset for a specific window period.
298
-
299
- Returns:
300
- ARIMA model: The best fitted ARIMA model based on AIC.
301
- """
302
- return fit_best_arima(window_data)
303
-
304
- def fit_garch(self, window_data):
305
- """
306
- Fits the GARCH model to the residuals of the best ARIMA model.
307
-
308
- Args:
309
- window_data (np.array): The dataset for a specific window period.
310
-
311
- Returns:
312
- tuple: Contains the ARIMA result and GARCH result.
313
- """
314
- return fit_garch(window_data)
315
-
316
- def show_arima_garch_results(self, window_data, acf=True, test_resid=True):
317
- """
318
- Displays the ARIMA and GARCH model results, including plotting
319
- ACF of residuals and conducting Box-Pierce and Ljung-Box tests.
320
-
321
- Args:
322
- window_data (np.array): The dataset for a specific window period.
323
- acf (bool, optional): If True, plot the ACF of residuals. Defaults to True.
324
-
325
- test_resid (bool, optional):
326
- If True, conduct Box-Pierce and Ljung-Box tests on residuals. Defaults to True.
327
- """
328
- arima_result = self.fit_best_arima(window_data)
329
- resid = np.asarray(arima_result.resid)
330
- resid = resid[~(np.isnan(resid) | np.isinf(resid))]
331
- garch_model = arch_model(resid, p=1, q=1, rescale=False)
332
- garch_result = garch_model.fit(disp="off")
333
- residuals = garch_result.resid
334
-
335
- # TODO : Plot the ACF of the residuals
336
- if acf:
337
- fig = plt.figure(figsize=(12, 8))
338
- # Plot the ACF of ARIMA residuals
339
- ax1 = fig.add_subplot(211, ylabel="ACF")
340
- plot_acf(resid, alpha=0.05, ax=ax1, title="ACF of ARIMA Residuals")
341
- ax1.set_xlabel("Lags")
342
- ax1.grid(True)
343
-
344
- # Plot the ACF of GARCH residuals on the same axes
345
- ax2 = fig.add_subplot(212, ylabel="ACF")
346
- plot_acf(residuals, alpha=0.05, ax=ax2, title="ACF of GARCH Residuals")
347
- ax2.set_xlabel("Lags")
348
- ax2.grid(True)
349
-
350
- # Plot the figure
351
- plt.tight_layout()
352
- plt.show()
353
-
354
- # TODO : Conduct Box-Pierce and Ljung-Box Tests of the residuals
355
- if test_resid:
356
- print(arima_result.summary())
357
- print(garch_result.summary())
358
- bp_test = acorr_ljungbox(resid, return_df=True)
359
- print("Box-Pierce and Ljung-Box Tests Results for ARIMA:\n", bp_test)
360
-
361
- # Step 3: Prediction
362
- def predict_next_return(self, arima_result, garch_result):
363
- """
364
- Predicts the next return using the ARIMA model
365
- and the next volatility using the GARCH model.
366
-
367
- Args:
368
- arima_result (ARIMA model): The ARIMA model result.
369
- garch_result (GARCH model): The GARCH model result.
370
-
371
- Returns:
372
- float: The predicted next return.
373
- """
374
- return predict_next_return(arima_result, garch_result)
375
-
376
- def get_prediction(self, window_data):
377
- """
378
- Generates a prediction for the next return based on a window of data.
379
-
380
- Args:
381
- window_data (np.array): The dataset for a specific window period.
382
-
383
- Returns:
384
- float: The predicted next return.
385
- """
386
- return get_prediction(window_data)
387
-
388
- def calculate_signals(self, window_data):
389
- """
390
- Calculates the trading signal based on the prediction.
391
-
392
- Args:
393
- window_data (np.array): The dataset for a specific window period.
394
-
395
- Returns:
396
- str: The trading signal ('LONG', 'SHORT', or None).
397
- """
398
- prediction = self.get_prediction(window_data)
399
- if prediction > 0:
400
- signal = "LONG"
401
- elif prediction < 0:
402
- signal = "SHORT"
403
- else:
404
- signal = None
405
- return signal
406
-
407
- # Step 4: Trading Strategy
408
-
409
- def execute_trading_strategy(self, predictions):
410
- """
411
- Executes the trading strategy based on a list
412
- of predictions, determining positions to take.
413
-
414
- Args:
415
- predictions (list): A list of predicted returns.
416
-
417
- Returns:
418
- list: A list of positions (1 for 'LONG', -1 for 'SHORT', 0 for 'HOLD').
419
- """
420
- positions = [] # Long if 1, Short if -1
421
- previous_position = 0 # Initial position
422
- for prediction in predictions:
423
- if prediction > 0:
424
- current_position = 1 # Long
425
- elif prediction < 0:
426
- current_position = -1 # Short
427
- else:
428
- current_position = previous_position # Hold previous position
429
- positions.append(current_position)
430
- previous_position = current_position
431
-
432
- return positions
433
-
434
- # Step 5: Vectorized Backtesting
435
- def generate_predictions(self):
436
- """
437
- Generator that yields predictions one by one.
438
- """
439
- data = self.data
440
- window_size = self.k
441
- for i in range(window_size, len(data)):
442
- print(
443
- f"Processing window {i - window_size + 1}/{len(data) - window_size}..."
444
- )
445
- window_data = data["diff_log_return"].iloc[i - window_size : i]
446
- next_return = self.get_prediction(window_data)
447
- yield next_return
448
-
449
- def backtest_strategy(self):
450
- """
451
- Performs a backtest of the strategy over
452
- the entire dataset, plotting cumulative returns.
453
- """
454
- data = self.data
455
- window_size = self.k
456
- print(
457
- f"Starting backtesting for {self.symbol}\n"
458
- f"Window size {window_size}.\n"
459
- f"Total iterations: {len(data) - window_size}.\n"
460
- )
461
- predictions_generator = self.generate_predictions()
462
-
463
- positions = self.execute_trading_strategy(predictions_generator)
464
-
465
- strategy_returns = (
466
- np.array(positions[:-1]) * data["log_return"].iloc[window_size + 1 :].values
467
- )
468
- buy_and_hold = data["log_return"].iloc[window_size + 1 :].values
469
- buy_and_hold_returns = np.cumsum(buy_and_hold)
470
- cumulative_returns = np.cumsum(strategy_returns)
471
- dates = data.index[window_size + 1 :]
472
- self.plot_cumulative_returns(cumulative_returns, buy_and_hold_returns, dates)
473
-
474
- print("\nBacktesting completed !!")
475
-
476
- # Function to plot the cumulative returns
477
- def plot_cumulative_returns(self, strategy_returns, buy_and_hold_returns, dates):
478
- """
479
- Plots the cumulative returns of the ARIMA+GARCH strategy against
480
- a buy-and-hold strategy.
481
-
482
- Args:
483
- strategy_returns (np.array): Cumulative returns from the strategy.
484
- buy_and_hold_returns (np.array): Cumulative returns from a buy-and-hold strategy.
485
- dates (pd.Index): The dates corresponding to the returns.
486
- """
487
- plt.figure(figsize=(14, 7))
488
- plt.plot(dates, strategy_returns, label="ARIMA+GARCH ", color="blue")
489
- plt.plot(dates, buy_and_hold_returns, label="Buy & Hold", color="red")
490
- plt.xlabel("Time")
491
- plt.ylabel("Cumulative Returns")
492
- plt.title(f"ARIMA+GARCH Strategy vs. Buy & Hold on ({self.symbol})")
493
- plt.legend()
494
- plt.grid(True)
495
- plt.show()
496
-
497
-
498
- # *********************************************
499
- # STATS TEST (Cointegration , Mean Reverting)*
500
- # *********************************************
501
- def get_corr(tickers: Union[List[str], Tuple[str, ...]], start: str, end: str) -> None:
502
- """
503
- Calculates and prints the correlation matrix of the adjusted closing prices
504
- for a given list of stock tickers within a specified date range.
505
-
506
- Args:
507
- tickers (Union[List[str] , Tuple[str, ...]]):
508
- A list or tuple of valid stock tickers (e.g., ['AAPL', 'MSFT', 'GOOG']).
509
- start (str): The start date for the historical data in 'YYYY-MM-DD' format.
510
- end (str): The end date for the historical data in 'YYYY-MM-DD' format.
511
-
512
- Example:
513
- >>> from bbstrader.tseries import get_corr
514
- >>> get_corr(['AAPL', 'MSFT', 'GOOG'], '2023-01-01', '2023-12-31')
515
- """
516
- # Download historical data
517
- data = yf.download(tickers, start=start, end=end, multi_level_index=False)[
518
- "Adj Close"
519
- ]
520
-
521
- # Calculate correlation matrix
522
- correlation_matrix = data.corr()
523
-
524
- # Display the matrix
525
- print(correlation_matrix)
526
-
527
-
528
- def plot_price_series(df: pd.DataFrame, ts1: str, ts2: str):
529
- """
530
- Plot both time series on the same line graph for
531
- the specified date range.
532
-
533
- Args:
534
- df (pd.DataFrame):
535
- The DataFrame containing prices for each series
536
- ts1 (str): The first time series column name
537
- ts2 (str): The second time series column name
538
- """
539
- fig, ax = plt.subplots()
540
- ax.plot(df.index, df[ts1], label=ts1)
541
- ax.plot(df.index, df[ts2], label=ts2)
542
-
543
- fig.autofmt_xdate()
544
- plt.xlabel("Month/Year")
545
- plt.ylabel("Price ($)")
546
- plt.title(f"{ts1} and {ts2} Daily Prices ")
547
- plt.legend()
548
- plt.show()
549
-
550
-
551
- def plot_scatter_series(df: pd.DataFrame, ts1: str, ts2: str):
552
- """
553
- Plot a scatter plot of both time series
554
- via the provided DataFrame.
555
-
556
- Args:
557
- df (pd.DataFrame):
558
- The DataFrame containing prices for each series
559
- ts1 (str): The first time series column name
560
- ts2 (str): The second time series column name
561
- """
562
- plt.xlabel(f"{ts1} Price ($)")
563
- plt.ylabel(f"{ts2} Price ($)")
564
- plt.title(f"{ts1} and {ts2} Price Scatterplot")
565
- plt.scatter(df[ts1], df[ts2])
566
-
567
- # Plot the regression line
568
- plt.plot(
569
- df[ts1],
570
- results.fittedvalues,
571
- linestyle="--",
572
- color="red",
573
- linewidth=2,
574
- label="Regression Line",
575
- )
576
- plt.legend()
577
- plt.show()
578
-
579
-
580
- def plot_residuals(df: pd.DataFrame):
581
- """
582
- Plot the residuals of OLS procedure for both
583
- time series.
584
-
585
- Args:
586
- df (pd.DataFrame):
587
- The DataFrame containing prices for each series
588
- """
589
- fig, ax = plt.subplots()
590
- ax.plot(df.index, df["res"], label="Residuals")
591
-
592
- fig.autofmt_xdate()
593
- plt.xlabel("Month/Year")
594
- plt.ylabel("Price ($)")
595
- plt.title("Residual Plot")
596
- plt.legend()
597
- plt.show()
598
-
599
-
600
- def run_cadf_test(
601
- pair: Union[List[str], Tuple[str, ...]],
602
- start: str,
603
- end: str,
604
- ) -> None:
605
- """
606
- Performs the Cointegration Augmented Dickey-Fuller (CADF) test on a pair of stock tickers
607
- over a specified date range to check for cointegration.
608
-
609
- The function downloads historical adjusted closing prices for the specified pair of stock tickers,
610
- calculates the optimal hedge ratio (beta) using Ordinary Least Squares (OLS) regression, plots the
611
- time series and their residuals, and finally performs the CADF test on the residuals.
612
-
613
- Args:
614
- pair (List[str] or Tuple[str, ...]):
615
- A list or tuple containing two valid stock tickers (e.g., ['AAPL', 'MSFT']).
616
- start (str): The start date for the historical data in 'YYYY-MM-DD' format.
617
- end (str): The end date for the historical data in 'YYYY-MM-DD' format.
618
-
619
- Example:
620
- >>> from bbstrader.tseries import run_cadf_test
621
- >>> run_cadf_test(['AAPL', 'MSFT'], '2023-01-01', '2023-12-31')
622
- >>> Regression Metrics:
623
- >>> Optimal Hedge Ratio (Beta): 2.2485845594120333
624
- >>> Result Params:
625
-
626
- >>> const -74.418034
627
- >>> AAPL 2.248585
628
- >>> dtype: float64
629
-
630
- >>> Regression Summary:
631
- >>> OLS Regression Results
632
- >>> ==============================================================================
633
- >>> Dep. Variable: MSFT R-squared: 0.900
634
- >>> Model: OLS Adj. R-squared: 0.900
635
- >>> Method: Least Squares F-statistic: 2244.
636
- >>> Date: Sat, 20 Jul 2024 Prob (F-statistic): 2.95e-126
637
- >>> Time: 13:36:58 Log-Likelihood: -996.45
638
- >>> No. Observations: 250 AIC: 1997.
639
- >>> Df Residuals: 248 BIC: 2004.
640
- >>> Df Model: 1
641
- >>> Covariance Type: nonrobust
642
- >>> ==============================================================================
643
- >>> coef std err t P>|t| [0.025 0.975]
644
- >>> ------------------------------------------------------------------------------
645
- >>> const -74.4180 8.191 -9.085 0.000 -90.551 -58.286
646
- >>> AAPL 2.2486 0.047 47.369 0.000 2.155 2.342
647
- >>> ==============================================================================
648
- >>> Omnibus: 4.923 Durbin-Watson: 0.121
649
- >>> Prob(Omnibus): 0.085 Jarque-Bera (JB): 4.862
650
- >>> Skew: 0.342 Prob(JB): 0.0879
651
- >>> Kurtosis: 2.993 Cond. No. 1.71e+03
652
- >>> ==============================================================================
653
-
654
- >>> Notes:
655
- >>> [1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
656
- >>> [2] The condition number is large, 1.71e+03. This might indicate that there are
657
- >>> strong multicollinearity or other numerical problems.
658
-
659
- >>> Cointegration TEST Results:
660
- >>> (np.float64(-3.204126144947765),
661
- >>> np.float64(0.019747080611767602),
662
- >>> 0,
663
- >>> 249,
664
- >>> {'1%': np.float64(-3.4568881317725864),
665
- >>> '10%': np.float64(-2.5729936189738876),
666
- >>> '5%': np.float64(-2.8732185133016057)},
667
- >>> np.float64(1364.3866758546171))
668
- """
669
- # Download historical data for required stocks
670
- p0, p1 = pair[0], pair[1]
671
- _p0 = yf.download(
672
- p0,
673
- start=start,
674
- end=end,
675
- progress=False,
676
- multi_level_index=False,
677
- auto_adjust=True,
678
- )
679
- _p1 = yf.download(
680
- p1,
681
- start=start,
682
- end=end,
683
- progress=False,
684
- multi_level_index=False,
685
- auto_adjust=True,
686
- )
687
- df = pd.DataFrame(index=_p0.index)
688
- df[p0] = _p0["Adj Close"]
689
- df[p1] = _p1["Adj Close"]
690
- df = df.dropna()
691
-
692
- # Calculate optimal hedge ratio "beta"
693
- # using statsmodels OLS
694
- X = sm.add_constant(df[p0])
695
- y = df[p1]
696
- model = sm.OLS(y, X)
697
- global results
698
- results = model.fit()
699
- beta_hr = results.params[p0]
700
-
701
- # Plot the two time series with regression line
702
- plot_price_series(df, p0, p1)
703
-
704
- # Display a scatter plot of the two time series
705
- # with regression line
706
- plot_scatter_series(df, p0, p1)
707
-
708
- # Calculate the residuals of the linear combination
709
- df["res"] = results.resid
710
- plot_residuals(df)
711
-
712
- # Display regression metrics
713
- print("\nRegression Metrics:")
714
- print(f"Optimal Hedge Ratio (Beta): {beta_hr}")
715
- print("Result Parmas: \n")
716
- print(results.params)
717
- print("\nRegression Summary:")
718
- print(results.summary())
719
-
720
- # Calculate and output the CADF test on the residuals
721
- print("\nCointegration TEST Results:")
722
- cadf = ts.adfuller(df["res"], autolag="AIC")
723
- pprint.pprint(cadf)
724
-
725
-
726
- def _hurst(ts):
727
- """
728
- Returns the Hurst Exponent of the time series vector ts.
729
- """
730
- # Create the range of lag values
731
- lags = range(2, 100)
732
-
733
- # Calculate the array of the variances of the lagged differences
734
- tau = [np.sqrt(np.std(np.subtract(ts[lag:], ts[:-lag]))) for lag in lags]
735
-
736
- # Use a linear fit to estimate the Hurst Exponent
737
- poly = np.polyfit(np.log(lags), np.log(tau), 1)
738
-
739
- # Return the Hurst exponent from the polyfit output
740
- return poly[0] * 2.0
741
-
742
-
743
- # Function to calculate Hurst Exponent
744
-
745
-
746
- def hurst(time_series):
747
- H, c, data_range = compute_Hc(time_series, kind="price", simplified=True)
748
- return H
749
-
750
-
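Reading aid (editorial note, not part of the diff): the Hurst exponent is usually read as H < 0.5 mean-reverting, H close to 0.5 random walk, H > 0.5 trending. A minimal sketch using the same hurst.compute_Hc helper the module already imports; the synthetic series is an assumption for illustration.

import numpy as np
from hurst import compute_Hc

rng = np.random.default_rng(0)
# Random-walk "price" series, so H should come out near 0.5
prices = 1000 + np.cumsum(rng.normal(0, 1, 100_000))
H, c, _ = compute_Hc(prices, kind="price", simplified=True)
print(f"Estimated Hurst exponent: {H:.3f} (about 0.5 for a random walk)")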
751
- def run_hurst_test(symbol: str, start: str, end: str):
752
- """
753
- Calculates and prints the Hurst Exponent for a given stock's adjusted closing prices
754
- within a specified date range, and for three generated series (Geometric Brownian Motion,
755
- Mean-Reverting, and Trending).
756
-
757
- The Hurst Exponent is used to determine the long-term memory of a time series.
758
-
759
- Args:
760
- symbol (str): A valid stock ticker symbol (e.g., 'AAPL').
761
- start (str): The start date for the historical data in 'YYYY-MM-DD' format.
762
- end (str): The end date for the historical data in 'YYYY-MM-DD' format.
763
-
764
- Example:
765
- >>> from bbstrader.tseries import run_hurst_test
766
-
767
- >>> run_hurst_test('AAPL', '2023-01-01', '2023-12-31')
768
- """
769
- data = yf.download(
770
- symbol,
771
- start=start,
772
- end=end,
773
- progress=False,
774
- multi_level_index=False,
775
- auto_adjust=True,
776
- )
777
-
778
- # Create a Geometric Brownian Motion, Mean-Reverting, and Trending Series
779
- gbm = np.log(np.cumsum(np.random.randn(100000)) + 1000)
780
- mr = np.log(np.random.randn(100000) + 1000)
781
- tr = np.log(np.cumsum(np.random.randn(100000) + 1) + 1000)
782
-
783
- # Output the Hurst Exponent for each of the series
784
- print(f"\nHurst(GBM): {_hurst(gbm)}")
785
- print(f"Hurst(MR): {_hurst(mr)}")
786
- print(f"Hurst(TR): {_hurst(tr)}")
787
- print(f"Hurst({symbol}): {hurst(data['Adj Close'])}\n")
788
-
789
-
790
- def test_cointegration(ticker1, ticker2, start, end):
791
- # Download historical data
792
- stock_data_pair = yf.download(
793
- [ticker1, ticker2],
794
- start=start,
795
- end=end,
796
- progress=False,
797
- multi_level_index=False,
798
- auto_adjust=True,
799
- )["Adj Close"].dropna()
800
-
801
- # Perform Johansen cointegration test
802
- result = coint_johansen(stock_data_pair, det_order=0, k_ar_diff=1)
803
-
804
- # Get the cointegration rank
805
- traces_stats = result.lr1
806
- print(f"\nTraces Stats: \n{traces_stats}")
807
-
808
- # Get the critical values for 95% confidence level
809
- critical_values = result.cvt
810
- print(f"\nCritical Values: \n{critical_values}")
811
-
812
- # Compare the cointegration rank with critical values
813
- if traces_stats[0] > critical_values[0, 1]:
814
- print(f"\n{ticker1} and {ticker2} are cointegrated.\n")
815
- else:
816
- print(f"\nNo cointegration found for {ticker1} and {ticker2}.\n")
817
-
818
-
819
- def run_coint_test(tickers: List[str], start: str, end: str) -> None:
820
- """
821
- Performs pairwise cointegration tests on a list of stock tickers over a specified date range.
822
-
823
- For each unique pair of tickers, the function downloads historical adjusted closing prices and
824
- tests for cointegration.
825
-
826
- Args:
827
- tickers (List[str]): A list of valid stock ticker symbols (e.g., ['AAPL', 'MSFT', 'GOOG']).
828
- start (str): The start date for the historical data in 'YYYY-MM-DD' format.
829
- end (str): The end date for the historical data in 'YYYY-MM-DD' format.
830
-
831
- Example:
832
- >>> from bbstrader.tseries import run_coint_test
833
-
834
- >>> run_coint_test(['AAPL', 'MSFT', 'GOOG'], '2023-01-01', '2023-12-31')
835
- """
836
- # Loop through ticker combinations
837
- for ticker1, ticker2 in combinations(tickers, 2):
838
- test_cointegration(ticker1, ticker2, start, end)
839
-
840
-
841
- # *********************************
842
- # KALMAN FILTER *
843
- # *********************************
844
- def draw_date_coloured_scatterplot(etfs, prices):
845
- """
846
- Create a scatterplot of the two ETF prices, which is
847
- coloured by the date of the price to indicate the
848
- changing relationship between the sets of prices
849
- """
850
- plen = len(prices)
851
- colour_map = plt.cm.get_cmap("YlOrRd")
852
- colours = np.linspace(0.1, 1, plen)
853
-
854
- scatterplot = plt.scatter(
855
- prices[etfs[0]],
856
- prices[etfs[1]],
857
- s=30,
858
- c=colours,
859
- cmap=colour_map,
860
- edgecolor="k",
861
- alpha=0.8,
862
- )
863
-
864
- colourbar = plt.colorbar(scatterplot)
865
- colourbar.ax.set_yticklabels([str(p.date()) for p in prices[:: plen // 9].index])
866
-
867
- plt.xlabel(prices.columns[0])
868
- plt.ylabel(prices.columns[1])
869
- plt.show()
870
-
871
-
872
- def calc_slope_intercept_kalman(etfs, prices):
873
- """
874
- Utilize the Kalman Filter from the filterpy library
875
- to calculate the slope and intercept of the regressed
876
- ETF prices.
877
- """
878
- delta = 1e-5
879
- trans_cov = delta / (1 - delta) * np.eye(2)
880
-
881
- kf = KalmanFilter(dim_x=2, dim_z=1)
882
- kf.x = np.zeros((2, 1)) # Initial state
883
- kf.P = np.ones((2, 2)) * 1000.0 # Initial covariance,
884
- # large to represent high uncertainty
885
- kf.F = np.eye(2) # State transition matrix
886
- kf.Q = trans_cov # Process noise covariance
887
- kf.R = 1.0 # Scalar measurement noise covariance
888
-
889
- state_means, state_covs = [], []
890
- for time, z in enumerate(prices[etfs[1]].values):
891
- # Dynamically update the observation matrix H
892
- # to include the current independent variable
893
- kf.H = np.array([[prices[etfs[0]][time], 1.0]])
894
- kf.predict()
895
- kf.update(z)
896
- state_means.append(kf.x.copy())
897
- state_covs.append(kf.P.copy())
898
-
899
- return np.array(state_means), np.array(state_covs)
900
-
901
-
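Editorial sketch of how the filtered states returned above could be consumed. The helper below is hypothetical (it is not in the package) and assumes state_means comes from calc_slope_intercept_kalman, with state_means[:, 0] holding the time-varying slope (hedge ratio) and state_means[:, 1] the intercept.

import numpy as np
import pandas as pd

def dynamic_spread(prices: pd.DataFrame, etfs, state_means: np.ndarray) -> pd.Series:
    """Spread of the dependent ETF against its Kalman-regressed estimate."""
    slope = state_means[:, 0].flatten()      # time-varying hedge ratio
    intercept = state_means[:, 1].flatten()  # time-varying intercept
    return prices[etfs[1]] - (slope * prices[etfs[0]] + intercept)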
902
- def draw_slope_intercept_changes(prices, state_means):
903
- """
904
- Plot the slope and intercept of the regressed ETF prices
905
- between the two ETFs, with the changing values of the
906
- Kalman Filter over time.
907
- """
908
- print(f"First Slops : {state_means[0, 0]}")
909
- print(f"First intercept : {state_means[0, 1]}")
910
- pd.DataFrame(
911
- {
912
- "slope": state_means[:, 0].flatten(),
913
- "intercept": state_means[:, 1].flatten(),
914
- },
915
- index=prices.index,
916
- ).plot(subplots=True)
917
- plt.show()
918
-
919
-
920
- def run_kalman_filter(
921
- etfs: Union[List[str], Tuple[str, ...]], start: str, end: str
922
- ) -> None:
923
- """
924
- Applies a Kalman filter to a pair of assets adjusted closing prices within a specified date range
925
- to estimate the slope and intercept over time.
926
-
927
- The function downloads historical adjusted closing prices for the specified pair of assets,
928
- visualizes their price relationship, calculates the Kalman filter estimates for the slope and
929
- intercept, and visualizes the changes in these estimates over time.
930
-
931
- Args:
932
- etfs (Union[List[str] , Tuple[str, ...]]):
933
- A list or tuple containing two valid assets tickers (e.g., ['SPY', 'QQQ']).
934
- start (str): The start date for the historical data in 'YYYY-MM-DD' format.
935
- end (str): The end date for the historical data in 'YYYY-MM-DD' format.
936
-
937
- Example:
938
- >>> from bbstrader.tseries import run_kalman_filter
939
-
940
- >>> run_kalman_filter(['SPY', 'QQQ'], '2023-01-01', '2023-12-31')
941
- """
942
- etf_df1 = yf.download(
943
- etfs[0], start, end, progress=False, multi_level_index=False, auto_adjust=True
944
- )
945
- etf_df2 = yf.download(
946
- etfs[1], start, end, progress=False, multi_level_index=False, auto_adjust=True
947
- )
948
-
949
- prices = pd.DataFrame(index=etf_df1.index)
950
- prices[etfs[0]] = etf_df1["Adj Close"]
951
- prices[etfs[1]] = etf_df2["Adj Close"]
952
-
953
- draw_date_coloured_scatterplot(etfs, prices)
954
- state_means, state_covs = calc_slope_intercept_kalman(etfs, prices)
955
- draw_slope_intercept_changes(prices, state_means)
956
-
957
-
958
- class KalmanFilterModel:
959
- """
960
- Implements a Kalman Filter model, a recursive algorithm used for estimating
961
- the state of a linear dynamic system from a series of noisy measurements.
962
- It's designed to process market data, estimate dynamic parameters such as
963
- the slope and intercept of price relationships, and the
964
- forecast error and standard deviation of the predictions.
965
-
966
- You can learn more here https://en.wikipedia.org/wiki/Kalman_filter
967
- """
968
-
969
- def __init__(self, tickers: List | Tuple, **kwargs):
970
- """
971
- Initializes the Kalman Filter strategy.
972
-
973
- Args:
974
- tickers :
975
- A list or tuple of ticker symbols representing financial instruments.
976
-
977
- kwargs : Keyword arguments for additional parameters,
978
- specifically `delta` and `vt`
979
- """
980
- self.tickers = tickers
981
- assert self.tickers is not None
982
-
983
- self.R = None
984
- self.theta = np.zeros(2)
985
- self.P = np.zeros((2, 2))
986
- self.delta = kwargs.get("delta", 1e-4)
987
- self.vt = kwargs.get("vt", 1e-3)
988
- self.wt = self.delta / (1 - self.delta) * np.eye(2)
989
- self.latest_prices = np.array([-1.0, -1.0])
990
- self.kf = self._init_kalman()
991
-
992
- def _init_kalman(self):
993
- """
994
- Initializes and returns a Kalman Filter configured
995
- for the trading strategy. The filter is set up with initial
996
- state and covariance, state transition matrix, process noise
997
- and measurement noise covariances.
998
- """
999
- kf = KalmanFilter(dim_x=2, dim_z=1)
1000
- kf.x = np.zeros((2, 1)) # Initial state
1001
- kf.P = self.P # Initial covariance
1002
- kf.F = np.eye(2) # State transition matrix
1003
- kf.Q = self.wt # Process noise covariance
1004
- kf.R = 1.0 # Scalar measurement noise covariance
1005
-
1006
- return kf
1007
-
1008
- Array = np.ndarray
1009
-
1010
- def calc_slope_intercep(self, prices: Array) -> Tuple:
1011
- """
1012
- Calculates and returns the slope and intercept
1013
- of the relationship between the provided prices using the Kalman Filter.
1014
- This method updates the filter with the latest price and returns
1015
- the estimated slope and intercept.
1016
-
1017
- Args:
1018
- prices : A numpy array of prices for two financial instruments.
1019
-
1020
- Returns:
1021
- A tuple containing the slope and intercept of the relationship
1022
- """
1023
- self.kf.H = np.array([[prices[0], 1.0]])
1024
- self.kf.predict()
1025
- self.kf.update(prices[1])
1026
- slope = self.kf.x.copy().flatten()[0]
1027
- intercept = self.kf.x.copy().flatten()[1]
1028
-
1029
- return slope, intercept
1030
-
1031
- def calculate_etqt(self, prices: Array) -> Tuple:
1032
- """
1033
- Calculates the ``forecast error`` and ``standard deviation`` of the predictions
1034
- using the Kalman Filter.
1035
-
1036
- Args:
1037
- prices : A numpy array of prices for two financial instruments.
1038
-
1039
- Returns:
1040
- A tuple containing the ``forecast error`` and ``standard deviation`` of the predictions.
1041
- """
1042
-
1043
- self.latest_prices[0] = prices[0]
1044
- self.latest_prices[1] = prices[1]
1045
-
1046
- if all(self.latest_prices > -1.0):
1047
- slope, intercept = self.calc_slope_intercep(self.latest_prices)
1048
-
1049
- self.theta[0] = slope
1050
- self.theta[1] = intercept
1051
-
1052
- # Create the observation matrix of the latest prices
1053
- # of Y and the intercept value (1.0) as well as the
1054
- # scalar value of the latest price from X
1055
- F = np.asarray([self.latest_prices[0], 1.0]).reshape((1, 2))
1056
- y = self.latest_prices[1]
1057
-
1058
- # The prior value of the states {\theta_t} is
1059
- # distributed as a multivariate Gaussian with
1060
- # mean a_t and variance-covariance {R_t}
1061
- if self.R is not None:
1062
- self.R = self.C + self.wt
1063
- else:
1064
- self.R = np.zeros((2, 2))
1065
-
1066
- # Calculate the Kalman Filter update
1067
- # ---------------------------------
1068
- # Calculate prediction of new observation
1069
- # as well as forecast error of that prediction
1070
- yhat = F.dot(self.theta)
1071
- et = y - yhat
1072
-
1073
- # {Q_t} is the variance of the prediction of
1074
- # observations and hence sqrt_Qt is the
1075
- # standard deviation of the predictions
1076
- Qt = F.dot(self.R).dot(F.T) + self.vt
1077
- sqrt_Qt = np.sqrt(Qt)
1078
-
1079
- # The posterior value of the states {\theta_t} is
1080
- # distributed as a multivariate Gaussian with mean
1081
- # {m_t} and variance-covariance {C_t}
1082
- At = self.R.dot(F.T) / Qt
1083
- self.theta = self.theta + At.flatten() * et
1084
- self.C = self.R - At * F.dot(self.R)
1085
- return (et[0], sqrt_Qt.flatten()[0])
1086
- else:
1087
- return None
1088
-
1089
-
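Hedged sketch of how the (forecast error, prediction standard deviation) pair returned by calculate_etqt is commonly turned into a mean-reversion signal. The one-standard-deviation entry band and the helper name are assumptions for illustration, not something the package prescribes.

from typing import Optional

import numpy as np

from bbstrader.tseries import KalmanFilterModel

def spread_signal(model: KalmanFilterModel, prices: np.ndarray) -> Optional[str]:
    """Map the Kalman forecast error to a simple mean-reversion signal."""
    result = model.calculate_etqt(prices)
    if result is None:           # not enough price history yet
        return None
    et, sqrt_qt = result
    if et >= sqrt_qt:            # spread unusually high: expect reversion down
        return "SHORT"
    if et <= -sqrt_qt:           # spread unusually low: expect reversion up
        return "LONG"
    return None                  # inside the band: no trade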
1090
- # ******************************************
1091
- # ORNSTEIN UHLENBECK PROCESS *
1092
- # ******************************************
1093
-
1094
-
1095
- class OrnsteinUhlenbeck:
1096
- """
1097
- The Ornstein-Uhlenbeck process is a mathematical model
1098
- used to describe the behavior of a mean-reverting stochastic process.
1099
- We use it to model the price dynamics of an asset that tends
1100
- to revert to a long-term mean.
1101
-
1102
- We estimate the drift (θ), volatility (σ), and long-term mean (μ)
1103
- based on historical price data; then we simulate the OU process
1104
- using the estimated parameters.
1105
-
1106
- https://en.wikipedia.org/wiki/Ornstein%E2%80%93Uhlenbeck_process
1107
- """
1108
-
1109
- def __init__(self, prices: np.ndarray, returns: bool = True, timeframe: str = "D1"):
1110
- """
1111
- Initializes the OrnsteinUhlenbeck instance.
1112
-
1113
- Args:
1114
- prices (np.ndarray) : Historical close prices.
1115
-
1116
- returns (bool) : Use it to indicate whether
1117
- you want to simulate the returns or the raw data
1118
-
1119
- timeframe (str) : The time frame for the Historical prices
1120
- (1m, 5m, 15m, 30m, 1h, 4h, D1)
1121
- """
1122
- self.prices = prices
1123
- if returns:
1124
- series = pd.Series(self.prices)
1125
- self.returns = series.pct_change().dropna().values
1126
- else:
1127
- self.returns = self.prices
1128
-
1129
- time_frame_mapping = {
1130
- "1m": 1 / (24 * 60), # 1 minute intervals
1131
- "5m": 5 / (24 * 60), # 5 minute intervals
1132
- "15m": 15 / (24 * 60), # 15 minute intervals
1133
- "30m": 30 / (24 * 60), # 30 minute intervals
1134
- "1h": 1 / 24, # 1 hour intervals
1135
- "4h": 4 / 24, # 4 hour intervals
1136
- "D1": 1, # Daily intervals
1137
- }
1138
- if timeframe not in time_frame_mapping:
1139
- raise ValueError("Unsupported time frame")
1140
- self.tf = time_frame_mapping[timeframe]
1141
-
1142
- params = self.estimate_parameters()
1143
- self.mu_hat = params[0] # Mean (μ)
1144
- self.theta_hat = params[1] # Drift (θ)
1145
- self.sigma_hat = params[2] # Volatility (σ)
1146
- print(f"Estimated μ: {self.mu_hat}")
1147
- print(f"Estimated θ: {self.theta_hat}")
1148
- print(f"Estimated σ: {self.sigma_hat}")
1149
-
1150
- def ornstein_uhlenbeck(self, mu, theta, sigma, dt, X0, n):
1151
- """
1152
- Simulates the Ornstein-Uhlenbeck process.
1153
-
1154
- Args:
1155
- mu (float): Estimated long-term mean.
1156
- theta (float): Estimated drift.
1157
- sigma (float): Estimated volatility.
1158
- dt (float): Time step.
1159
- X0 (float): Initial value.
1160
- n (int): Number of time steps.
1161
-
1162
- Returns:
1163
- np.ndarray : Simulated process.
1164
- """
1165
- x = np.zeros(n)
1166
- x[0] = X0
1167
- for t in range(1, n):
1168
- dW = np.random.normal(loc=0, scale=np.sqrt(dt))
1169
- # O-U process differential equation
1170
- x[t] = x[t - 1] + (theta * (mu - x[t - 1]) * dt) + (sigma * dW)
1171
- # dW is a Wiener process
1172
- # (theta * (mu - x[t-1]) * dt) represents the mean-reverting tendency
1173
- # (sigma * dW) represents the random volatility
1174
- return x
1175
-
1176
- def estimate_parameters(self):
1177
- """
1178
- Estimates the mean-reverting parameters (μ, θ, σ)
1179
- using the negative log-likelihood.
1180
-
1181
- Returns:
1182
- Tuple: Estimated μ, θ, and σ.
1183
- """
1184
- initial_guess = [0, 0.1, np.std(self.returns)]
1185
- result = minimize(self._neg_log_likelihood, initial_guess, args=(self.returns,))
1186
- mu, theta, sigma = result.x
1187
- return mu, theta, sigma
1188
-
1189
- def _neg_log_likelihood(self, params, returns):
1190
- """
1191
- Calculates the negative
1192
- log-likelihood for parameter estimation.
1193
-
1194
- Args:
1195
- params (list): List of parameters [mu, theta, sigma].
1196
- returns (np.ndarray): Historical returns.
1197
-
1198
- Returns:
1199
- float: Negative log-likelihood.
1200
- """
1201
- mu, theta, sigma = params
1202
- dt = self.tf
1203
- n = len(returns)
1204
- ou_simulated = self.ornstein_uhlenbeck(mu, theta, sigma, dt, 0, n + 1)
1205
- residuals = ou_simulated[1 : n + 1] - returns
1206
- neg_ll = 0.5 * np.sum(residuals**2) / sigma**2 + 0.5 * n * np.log(
1207
- 2 * np.pi * sigma**2
1208
- )
1209
- return neg_ll
1210
-
1211
- def simulate_process(self, returns=None, n=100, p=None):
1212
- """
1213
- Simulates the OU process multiple times.
1214
-
1215
- Args:
1216
- returns (np.ndarray): Historical returns.
1217
- n (int): Number of simulations to perform.
1218
- p (int): Number of time steps.
1219
-
1220
- Returns:
1221
- np.ndarray: 2D array representing simulated processes.
1222
- """
1223
- if returns is None:
1224
- returns = self.returns
1225
- if p is not None:
1226
- T = p
1227
- else:
1228
- T = len(returns)
1229
- dt = self.tf
1230
-
1231
- dW_matrix = np.random.normal(loc=0, scale=np.sqrt(dt), size=(n, T))
1232
- simulations_matrix = np.zeros((n, T))
1233
- simulations_matrix[:, 0] = returns[-1]
1234
-
1235
- for t in range(1, T):
1236
- simulations_matrix[:, t] = (
1237
- simulations_matrix[:, t - 1]
1238
- + self.theta_hat * (self.mu_hat - simulations_matrix[:, t - 1]) * dt
1239
- + self.sigma_hat * dW_matrix[:, t]
1240
- )
1241
- return simulations_matrix
1242
-
1243
-
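Editorial usage sketch for the class above on synthetic data; the series, seed and simulation sizes are illustrative assumptions.

import numpy as np
from bbstrader.tseries import OrnsteinUhlenbeck

rng = np.random.default_rng(1)
closes = 100 + np.cumsum(rng.normal(0, 0.5, 500))   # synthetic daily closes

ou = OrnsteinUhlenbeck(closes, returns=True, timeframe="D1")  # prints estimated mu, theta, sigma
paths = ou.simulate_process(n=50, p=30)                       # 50 simulated paths, 30 steps each
print(paths.shape)                                            # (50, 30)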
1244
- def remove_correlated_assets(df: pd.DataFrame, cutoff=0.99):
1245
- """
1246
- Removes highly correlated assets from a DataFrame based on a specified correlation cutoff threshold.
1247
- This is useful in financial data analysis to reduce redundancy and multicollinearity in portfolios or datasets.
1248
-
1249
- Args:
1250
- df (pd.DataFrame): A DataFrame where each column represents an asset
1251
- and rows represent observations (e.g., time-series data).
1252
- cutoff (float, optional, default=0.99): The correlation threshold.
1253
- Columns with absolute correlation greater than this value will be considered for removal.
1254
-
1255
- Returns:
1256
- pd.DataFrame: A DataFrame with less correlated assets.
1257
- The columns that are highly correlated (above the cutoff) are removed.
1258
-
1259
- References
1260
- ----------
1261
- Stefan Jansen (2020). Machine Learning for Algorithmic Trading - Second Edition.
1262
- chapter 9, Time-Series Models for Volatility Forecasts and Statistical Arbitrage.
1263
-
1264
- Example:
1265
- >>> df = pd.DataFrame({
1266
- ... 'AAPL': [100, 101, 102, 103, 104],
1267
- ... 'MSFT': [200, 201, 202, 203, 204],
1268
- ... 'GOOG': [300, 301, 302, 303, 304]
1269
- ... })
1270
- >>> df = remove_correlated_assets(df)
1271
- """
1272
- corr = df.corr().stack()
1273
- corr = corr[corr < 1]
1274
- to_check = corr[corr.abs() > cutoff].index
1275
- keep, drop = set(), set()
1276
- for s1, s2 in to_check:
1277
- if s1 not in keep:
1278
- if s2 not in keep:
1279
- keep.add(s1)
1280
- drop.add(s2)
1281
- else:
1282
- drop.add(s1)
1283
- else:
1284
- keep.discard(s2)
1285
- drop.add(s2)
1286
- return df.drop(drop, axis=1)
1287
-
1288
-
1289
- def check_stationarity(df: pd.DataFrame):
1290
- """
1291
- Tests the stationarity of time-series data for each asset in the DataFrame
1292
- using the Augmented Dickey-Fuller (ADF) test. Stationarity is a key property
1293
- in time-series analysis, and non-stationary data can affect model performance.
1294
-
1295
- Args:
1296
- df (pd.DataFrame): A DataFrame where each column represents a time series of an asset.
1297
-
1298
- Returns:
1299
- pd.DataFrame: A DataFrame containing the ADF p-values for each asset,
1300
- - ticker Asset name (column name from df).
1301
- - adf p-value from the ADF test, indicating the probability of the null hypothesis (data is non-stationary).
1302
-
1303
- References
1304
- ----------
1305
- Stefan Jansen (2020). Machine Learning for Algorithmic Trading - Second Edition.
1306
- chapter 9, Time-Series Models for Volatility Forecasts and Statistical Arbitrage.
1307
-
1308
- Example:
1309
- >>> df = pd.DataFrame({
1310
- ... 'AAPL': [100, 101, 102, 103, 104],
1311
- ... 'MSFT': [200, 201, 202, 203, 204],
1312
- ... 'GOOG': [300, 301, 302, 303, 304]
1313
- ... })
1314
- >>> df = check_stationarity(df)
1315
- """
1316
- results = []
1317
- for ticker, prices in df.items():
1318
- results.append([ticker, adfuller(prices, regression="ct")[1]])
1319
- return pd.DataFrame(results, columns=["ticker", "adf"]).sort_values("adf")
1320
-
1321
-
1322
- def remove_stationary_assets(df: pd.DataFrame, pval=0.05):
1323
- """
1324
- Filters out stationary assets from the DataFrame based on the p-value obtained
1325
- from the Augmented Dickey-Fuller test.
1326
- Useful for focusing only on non-stationary time-series data.
1327
-
1328
- Args:
1329
- df (pd.DataFrame): A DataFrame where each column represents a time series of an asset.
1330
- pval (float, optional, default=0.05): The significance level to determine stationarity.
1331
- Columns with an ADF test p-value below this threshold are considered stationary and removed.
1332
-
1333
- Returns:
1334
- pd.DataFrame: A DataFrame containing only the non-stationary assets.
1335
-
1336
- References
1337
- ----------
1338
- Stefan Jansen (2020). Machine Learning for Algorithmic Trading - Second Edition.
1339
- chapter 9, Time-Series Models for Volatility Forecasts and Statistical Arbitrage.
1340
-
1341
- Example:
1342
- >>> df = pd.DataFrame({
1343
- ... 'AAPL': [100, 101, 102, 103, 104],
1344
- ... 'MSFT': [200, 201, 202, 203, 204],
1345
- ... 'GOOG': [300, 301, 302, 303, 304]
1346
- ... })
1347
- >>> df = remove_stationary_assets(df)
1348
- """
1349
- test_result = check_stationarity(df)
1350
- stationary = test_result.loc[test_result.adf <= pval, "ticker"].tolist()
1351
- return df.drop(stationary, axis=1).sort_index()
1352
-
1353
-
1354
- def select_assets(df: pd.DataFrame, n=100, start=None, end=None, rolling_window=None):
1355
- """
1356
- Selects the top N assets based on the average trading volume from the input DataFrame.
1357
- These assets are used as universe in which we can search cointegrated pairs for pairs trading strategies.
1358
-
1359
- Args:
1360
- df (pd.DataFrame): A multi-index DataFrame with levels ['ticker', 'date'] containing market data.
1361
- Must include columns 'close' (price) and 'volume'.
1362
- n (int, optional): Number of assets to select based on highest average trading volume. Default is 100.
1363
- start (str, optional): Start date for filtering the data. Default is the earliest date in the DataFrame.
1364
- end (str, optional): End date for filtering the data. Default is the latest date in the DataFrame.
1365
- rolling_window (int, optional): Rolling window for calculating the average trading volume. Default is None.
1366
-
1367
- Returns:
1368
- pd.DataFrame: A DataFrame of selected assets with filtered, cleaned data, indexed by date.
1369
-
1370
- References
1371
- ----------
1372
- Stefan Jansen (2020). Machine Learning for Algorithmic Trading - Second Edition.
1373
- chapter 9, Time-Series Models for Volatility Forecasts and Statistical Arbitrage.
1374
- """
1375
- required_columns = {"close", "volume"}
1376
- if not required_columns.issubset(df.columns):
1377
- raise ValueError(
1378
- f"Input DataFrame must contain {required_columns}, but got {df.columns.tolist()}."
1379
- )
1380
-
1381
- if (
1382
- not isinstance(df.index, pd.MultiIndex)
1383
- or "ticker" not in df.index.names
1384
- or "date" not in df.index.names
1385
- ):
1386
- raise ValueError("Index must be a MultiIndex with levels ['ticker', 'date'].")
1387
-
1388
- df = df.copy()
1389
- idx = pd.IndexSlice
1390
- start = start or df.index.get_level_values("date").min()
1391
- end = end or df.index.get_level_values("date").max()
1392
- df = (
1393
- df.loc[lambda df: ~df.index.duplicated()]
1394
- .sort_index()
1395
- .loc[idx[:, f"{start}" : f"{end}"], :]
1396
- .assign(dv=lambda df: df.close.mul(df.volume))
1397
- )
1398
-
1399
- if rolling_window is None:
1400
- most_traded = df.groupby(level="ticker").dv.mean().nlargest(n=n).index
1401
- else:
1402
- # Calculate the rolling average of dollar volume
1403
- df["dv_rolling_avg"] = (
1404
- df.groupby(level=0)
1405
- .dv.rolling(window=rolling_window, min_periods=1)
1406
- .mean()
1407
- .reset_index(level=0, drop=True)
1408
- )
1409
- most_traded = df.groupby(level=0)["dv_rolling_avg"].mean().nlargest(n=n).index
1410
- df = (
1411
- df.loc[idx[most_traded, :], "close"]
1412
- .unstack("ticker")
1413
- .ffill(limit=5)
1414
- .dropna(axis=1)
1415
- )
1416
- df = remove_correlated_assets(df)
1417
- df = remove_stationary_assets(df)
1418
- return df.sort_index()
1419
-
1420
-
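Editorial sketch of the input shape select_assets expects: a ['ticker', 'date'] MultiIndex with 'close' and 'volume' columns. The tickers and values below are made up for illustration.

import numpy as np
import pandas as pd
from bbstrader.tseries import select_assets

rng = np.random.default_rng(7)
dates = pd.date_range("2022-01-03", periods=250, freq="B")
frames = []
for ticker in ["AAA", "BBB", "CCC"]:
    frames.append(pd.DataFrame({
        "ticker": ticker,
        "date": dates,
        "close": 100 + np.cumsum(rng.normal(0, 1, len(dates))),
        "volume": rng.integers(1_000, 10_000, len(dates)),
    }))
panel = pd.concat(frames).set_index(["ticker", "date"])

universe = select_assets(panel, n=2)   # top 2 by average dollar volume, then cleaned
print(universe.columns.tolist())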
1421
- def compute_pair_metrics(security: pd.Series, candidates: pd.DataFrame):
1422
- """
1423
- Calculates statistical and econometric metrics for a target security and a set of candidate securities.
1424
- These metrics are useful in financial modeling and pairs trading strategies,
1425
- providing information about drift, volatility, correlation, and cointegration.
1426
-
1427
- Args:
1428
- security (pd.Series): A time-series of the target security's prices.
1429
- The name of the Series should correspond to the security's identifier (e.g., ticker symbol).
1430
- candidates (pd.DataFrame): A DataFrame where each column represents a time-series of prices
1431
- for candidate securities to be evaluated against the target security.
1432
-
1433
- Returns:
1434
- pd.DataFrame: A DataFrame combining:
1435
- Drift: Estimated drift of spreads between the target security and each candidate.
1436
- Volatility: Standard deviation of spreads.
1437
- Correlation:
1438
- ``corr``: Correlation of normalized prices between the target and each candidate.
1439
- ``corr_ret``: Correlation of returns (percentage change) between the target and each candidate.
1440
- Cointegration metrics:
1441
- Engle-Granger test statistics (``t1``, ``t2``) and p-values (``p1``, ``p2``).
1442
- Johansen test trace statistics (``trace0``, ``trace1``) and selected lag order (``k_ar_diff``).
1443
-
1444
- References
1445
- ----------
1446
- Stefan Jansen (2020). Machine Learning for Algorithmic Trading - Second Edition.
1447
- chapter 9, Time-Series Models for Volatility Forecasts and Statistical Arbitrage.
1448
- """
1449
- security = security.div(security.iloc[0])
1450
- ticker = security.name
1451
- candidates = candidates.div(candidates.iloc[0])
1452
- spreads = candidates.sub(security, axis=0)
1453
- n, m = spreads.shape
1454
- X = np.ones(shape=(n, 2))
1455
- X[:, 1] = np.arange(1, n + 1)
1456
-
1457
- # compute drift
1458
- drift = (np.linalg.inv(X.T @ X) @ X.T @ spreads).iloc[1].to_frame("drift")
1459
-
1460
- # compute volatility
1461
- vol = spreads.std().to_frame("vol")
1462
-
1463
- # returns correlation
1464
- corr_ret = (
1465
- candidates.pct_change().corrwith(security.pct_change()).to_frame("corr_ret")
1466
- )
1467
-
1468
- # normalized price series correlation
1469
- corr = candidates.corrwith(security).to_frame("corr")
1470
- metrics = drift.join(vol).join(corr).join(corr_ret).assign(n=n)
1471
-
1472
- tests = []
1473
- # run cointegration tests
1474
- for candidate, prices in tqdm(candidates.items()):
1475
- df = pd.DataFrame({"s1": security, "s2": prices})
1476
- var = VAR(df.values)
1477
- lags = var.select_order() # select VAR order
1478
- k_ar_diff = lags.selected_orders["aic"]
1479
- # Johansen Test with constant Term and estd. lag order
1480
- cj0 = coint_johansen(df, det_order=0, k_ar_diff=k_ar_diff)
1481
- # Engle-Granger Tests
1482
- t1, p1 = coint(security, prices, trend="c")[:2]
1483
- t2, p2 = coint(prices, security, trend="c")[:2]
1484
- tests.append([ticker, candidate, t1, p1, t2, p2, k_ar_diff, *cj0.lr1])
1485
- columns = ["s1", "s2", "t1", "p1", "t2", "p2", "k_ar_diff", "trace0", "trace1"]
1486
- tests = pd.DataFrame(tests, columns=columns).set_index("s2")
1487
- return metrics.join(tests)
1488
-
1489
-
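Editorial usage sketch for compute_pair_metrics; the synthetic target and candidate series below are assumptions made only to show the expected shapes and output columns.

import numpy as np
import pandas as pd
from bbstrader.tseries import compute_pair_metrics

rng = np.random.default_rng(3)
idx = pd.date_range("2022-01-03", periods=300, freq="B")
target = pd.Series(100 + np.cumsum(rng.normal(0, 1, len(idx))), index=idx, name="TGT")
candidates = pd.DataFrame({
    "CND1": target + rng.normal(0, 2, len(idx)),         # roughly cointegrated with TGT
    "CND2": 50 + np.cumsum(rng.normal(0, 1, len(idx))),  # independent random walk
}, index=idx)

metrics = compute_pair_metrics(target, candidates)
print(metrics[["drift", "vol", "corr", "p1", "trace0"]])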
1490
- __CRITICAL_VALUES = {
1491
- 0: {0.9: 13.4294, 0.95: 15.4943, 0.99: 19.9349},
1492
- 1: {0.9: 2.7055, 0.95: 3.8415, 0.99: 6.6349},
1493
- }
1494
-
1495
-
1496
- def find_cointegrated_pairs(
1497
- securities: pd.DataFrame,
1498
- candidates: pd.DataFrame,
1499
- n=None,
1500
- start=None,
1501
- stop=None,
1502
- coint=False,
1503
- ):
1504
- """
1505
- Identifies cointegrated pairs between a target set of securities and candidate securities
1506
- based on econometric tests. The function evaluates statistical relationships,
1507
- such as cointegration and Engle-Granger significance, to determine pairs suitable
1508
- for financial strategies like pairs trading.
1509
-
1510
- Args:
1511
- securities (`pd.DataFrame`): A DataFrame where each column represents the time-series
1512
- prices of target securities to evaluate.
1513
- candidates (`pd.DataFrame`): A DataFrame where each column represents the time-series
1514
- prices of candidate securities to compare against the target securities.
1515
- n (`int`, optional): The number of top pairs to return. If `None`, returns all pairs.
1516
- start (`str`, optional): Start date for slicing the data (e.g., 'YYYY-MM-DD').
1517
- stop (`str`, optional): End date for slicing the data (e.g., 'YYYY-MM-DD').
1518
- coint (`bool`, optional, default=False):
1519
- - If `True`, filters for pairs identified as cointegrated.
1520
- - If `False`, returns all evaluated pairs.
1521
-
1522
- Returns:
1523
- - ``pd.DataFrame``: A DataFrame containing:
1524
- - Johansen and Engle-Granger cointegration metrics:
1525
- - `t1`, `t2`: Engle-Granger test statistics for two directions.
1526
- - `p1`, `p2`: Engle-Granger p-values for two directions.
1527
- - `trace0`, `trace1`: Johansen test trace statistics for 0 and 1 cointegration relationships.
1528
- - Indicators and filters:
1529
- - `joh_sig`: Indicates Johansen cointegration significance.
1530
- - `eg_sig`: Indicates Engle-Granger significance (p-value < 0.05).
1531
- - `s1_dep`: Indicates whether the first series depends on the second (based on p-values).
1532
- - `coint`: Combined cointegration indicator (Johansen & Engle-Granger).
1533
- - Spread and ranking:
1534
- - `t`: Minimum of `t1` and `t2`.
1535
- - `p`: Minimum of `p1` and `p2`.
1536
- References
1537
- ----------
1538
- Stefan Jansen (2020). Machine Learning for Algorithmic Trading - Second Edition.
1539
- chapter 9, Time-Series Models for Volatility Forecasts and Statistical Arbitrage.
1540
-
1541
- Example:
1542
- >>> import pandas as pd
1543
-
1544
- >>> # Sample Data
1545
- >>> data_securities = {
1546
- ... 'Security1': [100, 102, 101, 103, 105],
1547
- ... 'Security2': [50, 52, 53, 51, 54]
1548
- ... }
1549
- >>> data_candidates = {
1550
- ... 'Candidate1': [100, 101, 99, 102, 104],
1551
- ... 'Candidate2': [200, 202, 201, 203, 205]
1552
- ... }
1553
-
1554
- >>> securities = pd.DataFrame(data_securities, index=pd.date_range('2023-01-01', periods=5))
1555
- >>> candidates = pd.DataFrame(data_candidates, index=pd.date_range('2023-01-01', periods=5))
1556
-
1557
- >>> # Find cointegrated pairs
1558
- >>> top_pairs = find_cointegrated_pairs(securities, candidates, n=2, coint=True)
1559
- >>> print(top_pairs)
1560
-
1561
- >>> | s1 | s2 | t | p | joh_sig | eg_sig | coint |
1562
- >>> |----------|-----------|------|-------|---------|--------|-------|
1563
- >>> | Security1| Candidate1| -3.5 | 0.01 | 1 | 1 | 1 |
1564
- >>> | Security2| Candidate2| -2.9 | 0.04 | 1 | 1 | 1 |
1565
- """
1566
- trace0_cv = __CRITICAL_VALUES[0][
1567
- 0.95
1568
- ] # critical value for 0 cointegration relationships
1569
- # critical value for 1 cointegration relationship
1570
- trace1_cv = __CRITICAL_VALUES[1][0.95]
1571
- spreads = []
1572
- if start is not None and stop is not None:
1573
- securities = securities.loc[str(start) : str(stop), :]
1574
- candidates = candidates.loc[str(start) : str(stop), :]
1575
- for i, (ticker, prices) in enumerate(securities.items(), 1):
1576
- try:
1577
- df = compute_pair_metrics(prices, candidates)
1578
- spreads.append(df.set_index("s1", append=True))
1579
- except np.linalg.LinAlgError:
1580
- continue
1581
- spreads = pd.concat(spreads)
1582
- spreads.index.names = ["s2", "s1"]
1583
- spreads = spreads.swaplevel()
1584
- spreads["t"] = spreads[["t1", "t2"]].min(axis=1)
1585
- spreads["p"] = spreads[["p1", "p2"]].min(axis=1)
1586
- spreads["joh_sig"] = (
1587
- (spreads.trace0 > trace0_cv) & (spreads.trace1 > trace1_cv)
1588
- ).astype(int)
1589
- spreads["eg_sig"] = (spreads.p < 0.05).astype(int)
1590
- spreads["s1_dep"] = spreads.p1 < spreads.p2
1591
- spreads["coint"] = (spreads.joh_sig & spreads.eg_sig).astype(int)
1592
- # select top n pairs
1593
- if coint:
1594
- if n is not None:
1595
- top_pairs = (
1596
- spreads.query("coint == 1").sort_values("t", ascending=False).head(n)
1597
- )
1598
- else:
1599
- top_pairs = spreads.query("coint == 1").sort_values("t", ascending=False)
1600
- else:
1601
- if n is not None:
1602
- top_pairs = spreads.sort_values("t", ascending=False).head(n)
1603
- else:
1604
- top_pairs = spreads.sort_values("t", ascending=False)
1605
- return top_pairs
1606
-
1607
-
1608
- def analyze_cointegrated_pairs(
1609
- spreads: pd.DataFrame,
1610
- plot_coint=True,
1611
- crosstab=False,
1612
- heuristics=False,
1613
- log_reg=False,
1614
- decis_tree=False,
1615
- ):
1616
- """
1617
- Analyzes cointegrated pairs by visualizing, summarizing, and applying predictive models.
1618
-
1619
- Args:
1620
- spreads (pd.DataFrame):
1621
- A DataFrame containing cointegration metrics and characteristics.
1622
- Required columns: 'coint', 't', 'trace0', 'trace1', 'drift', 'vol', 'corr', 'corr_ret', 'eg_sig', 'joh_sig'.
1623
- plot_coint (bool, optional):
1624
- If True, generates scatterplots and boxplots to visualize cointegration characteristics.
1625
- crosstab (bool, optional):
1626
- If True, displays crosstabulations of Engle-Granger and Johansen test significance.
1627
- heuristics (bool, optional):
1628
- If True, prints descriptive statistics for drift, volatility, and correlation grouped by cointegration status.
1629
- log_reg (bool, optional):
1630
- If True, fits a logistic regression model to predict cointegration and evaluates its performance.
1631
- decis_tree (bool, optional):
1632
- If True, fits a decision tree model to predict cointegration and evaluates its performance.
1633
-
1634
- References
1635
- ----------
1636
- Stefan Jansen (2020). Machine Learning for Algorithmic Trading - Second Edition.
1637
- chapter 9, Time-Series Models for Volatility Forecasts and Statistical Arbitrage.
1638
-
1639
- Example:
1640
- >>> import pandas as pd
1641
- >>> from bbstrader.tseries import find_cointegrated_pairs, analyze_cointegrated_pairs
1642
-
1643
- >>> # Sample Data
1644
- >>> securities = pd.DataFrame({
1645
- ... 'SPY': [100, 102, 101, 103, 105],
1646
- ... 'QQQ': [50, 52, 53, 51, 54]
1647
- ... })
1648
- >>> candidates = pd.DataFrame({
1649
- ... 'AAPL': [100, 101, 99, 102, 104],
1650
- ... 'MSFT': [200, 202, 201, 203, 205]
1651
- ... })
1652
-
1653
- >>> pairs = find_cointegrated_pairs(securities, candidates, n=2, coint=True)
1654
- >>> analyze_cointegrated_pairs(pairs, plot_coint=True, crosstab=True, heuristics=True, log_reg=True, decis_tree=True)
1655
- """
1656
- if plot_coint:
1657
- trace0_cv = __CRITICAL_VALUES[0][0.95]
1658
- spreads = spreads.reset_index()
1659
- sns.scatterplot(
1660
- x=np.log1p(spreads.t.abs()),
1661
- y=np.log1p(spreads.trace1),
1662
- hue="coint",
1663
- data=spreads[spreads.trace0 > trace0_cv],
1664
- )
1665
- fig, axes = plt.subplots(ncols=4, figsize=(20, 5))
1666
- for i, heuristic in enumerate(["drift", "vol", "corr", "corr_ret"]):
1667
- sns.boxplot(x="coint", y=heuristic, data=spreads, ax=axes[i])
1668
- fig.tight_layout()
1669
-
1670
- if heuristics:
1671
- spreads = spreads.reset_index()
1672
- h = (
1673
- spreads.groupby(spreads.coint)[["drift", "vol", "corr"]]
1674
- .describe()
1675
- .stack(level=0)
1676
- .swaplevel()
1677
- .sort_index()
1678
- )
1679
- print(h)
1680
-
1681
- if log_reg:
1682
- y = spreads.coint
1683
- X = spreads[["drift", "vol", "corr", "corr_ret"]]
1684
- log_reg = LogisticRegressionCV(
1685
- Cs=np.logspace(-10, 10, 21), class_weight="balanced", scoring="roc_auc"
1686
- )
1687
- log_reg.fit(X=X, y=y)
1688
- Cs = log_reg.Cs_
1689
- scores = pd.DataFrame(log_reg.scores_[True], columns=Cs).mean()
1690
- scores.plot(logx=True)
1691
- res = f"C:{np.log10(scores.idxmax()):.2f}, AUC: {scores.max():.2%}"
1692
- print(res)
1693
- print(log_reg.coef_)
1694
-
1695
- if decis_tree:
1696
- model = DecisionTreeClassifier(class_weight="balanced")
1697
- decision_tree = GridSearchCV(
1698
- model, param_grid={"max_depth": list(range(1, 10))}, cv=5, scoring="roc_auc"
1699
- )
1700
- y = spreads.coint
1701
- X = spreads[["drift", "vol", "corr", "corr_ret"]]
1702
- decision_tree.fit(X, y)
1703
- res = f"{decision_tree.best_score_:.2%}, Depth: {decision_tree.best_params_['max_depth']}"
1704
- print(res)
1705
-
1706
- if crosstab:
1707
- pd.set_option("display.float_format", lambda x: f"{x:.2%}")
1708
- print(pd.crosstab(spreads.eg_sig, spreads.joh_sig))
1709
- print(pd.crosstab(spreads.eg_sig, spreads.joh_sig, normalize=True))
1710
-
1711
-
1712
- def select_candidate_pairs(pairs: pd.DataFrame, period=False):
1713
- """
1714
- Select candidate pairs from a DataFrame based on cointegration status.
1715
-
1716
- This function filters the input DataFrame to select pairs where the 'coint' column equals 1,
1717
- indicating cointegration. It then determines the dependent and independent series for each pair
1718
- and returns the selected pairs in a dictionary format.
1719
-
1720
- Args:
1721
- pairs (pd.DataFrame): A DataFrame containing pairs of time series with columns 'coint', 's1', 's2', and 's1_dep'.
1722
- period (bool, optional): If True, includes the 'period' column in the output. Defaults to False.
1723
-
1724
- Returns:
1725
- list[dict]: A list of dictionaries, each containing the keys 'x' and 'y' (and optionally 'period') representing the selected pairs.
1726
-
1727
- References
1728
- ----------
1729
- Stefan Jansen (2020). Machine Learning for Algorithmic Trading - Second Edition.
1730
- chapter 9, Time-Series Models for Volatility Forecasts and Statistical Arbitrage.
1731
- """
1732
- candidates = pairs.query("coint == 1").copy()
1733
- candidates = candidates.reset_index()
1734
- candidates["y"] = candidates.apply(
1735
- lambda x: x["s1"] if x.s1_dep else x["s2"], axis=1
1736
- )
1737
- candidates["x"] = candidates.apply(
1738
- lambda x: x["s2"] if x.s1_dep else x["s1"], axis=1
1739
- )
1740
- if period:
1741
- return candidates[["x", "y", "period"]].to_dict(orient="records")
1742
- return candidates[["x", "y"]].to_dict(orient="records")
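A minimal sketch chaining find_cointegrated_pairs into select_candidate_pairs on synthetic data (tickers and seed are arbitrary):

import numpy as np
import pandas as pd
from bbstrader.tseries import find_cointegrated_pairs, select_candidate_pairs

rng = np.random.default_rng(0)
idx = pd.date_range("2022-01-03", periods=500, freq="B")
base = 100 + rng.normal(0, 1, len(idx)).cumsum()
securities = pd.DataFrame({"S1": base}, index=idx)
candidates = pd.DataFrame({"C1": base + rng.normal(0, 0.5, len(idx)),
                           "C2": 50 + rng.normal(0, 1, len(idx)).cumsum()}, index=idx)
pairs = find_cointegrated_pairs(securities, candidates, coint=True)
print(select_candidate_pairs(pairs))   # e.g. [{'x': 'S1', 'y': 'C1'}]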
1743
-
1744
-
1745
- def KFSmoother(prices: pd.Series | np.ndarray) -> pd.Series | np.ndarray:
1746
- """
1747
- Estimate rolling mean using Kalman Smoothing.
1748
-
1749
- Args:
1750
- prices : pd.Series or np.ndarray
1751
- The input time series data to be smoothed. It must be either a pandas Series or a numpy array.
1752
-
1753
- Returns:
1754
- pd.Series or np.ndarray
1755
- The smoothed time series data. If the input is a pandas Series, the output will also be a pandas Series with the same index.
1756
- If the input is a numpy array, the output will be a numpy array.
1757
-
1758
- References
1759
- ----------
1760
- Stefan Jansen (2020). Machine Learning for Algorithmic Trading - Second Edition.
1761
- chapter 9, Time-Series Models for Volatility Forecasts and Statistical Arbitrage.
1762
-
1763
- Examples
1764
- --------
1765
- >>> import yfinance as yf
1766
- >>> prices = yf.download('AAPL', start='2020-01-01', end='2021-01-01', multi_level_index=False)['Adj Close']
1767
- >>> prices = KFSmoother(prices)
1768
- >>> print(prices[:5])
1769
- Date
1770
- 2020-01-02 00:00:00+00:00 36.39801407
1771
- 2020-01-03 00:00:00+00:00 49.06231000
1772
- 2020-01-06 00:00:00+00:00 55.86334436
1773
- 2020-01-07 00:00:00+00:00 60.02240894
1774
- 2020-01-08 00:00:00+00:00 63.15057948
1775
- dtype: float64
1776
-
1777
- """
1778
- if not isinstance(prices, (np.ndarray, pd.Series)):
1779
- raise ValueError("Input must be either a numpy array or a pandas Series.")
1780
- kf = PyKalmanFilter(
1781
- transition_matrices=np.eye(1),
1782
- observation_matrices=np.eye(1),
1783
- initial_state_mean=0,
1784
- initial_state_covariance=1,
1785
- observation_covariance=1,
1786
- transition_covariance=0.05,
1787
- )
1788
- if isinstance(prices, pd.Series):
1789
- state_means, _ = kf.filter(prices.values)
1790
- return pd.Series(state_means.flatten(), index=prices.index)
1791
- elif isinstance(prices, np.ndarray):
1792
- state_means, _ = kf.filter(prices)
1793
- return state_means.flatten()
1794
-
1795
-
1796
- def KFHedgeRatio(x: pd.Series | np.ndarray, y: pd.Series | np.ndarray) -> np.ndarray:
1797
- """
1798
- Estimate Hedge Ratio using Kalman Filter.
1799
- Args:
1800
- x : pd.Series or np.ndarray
1801
- The independent variable, which can be either a pandas Series or a numpy array.
1802
- y : pd.Series or np.ndarray
1803
- The dependent variable, which can be either a pandas Series or a numpy array.
1804
-
1805
- Returns:
1806
- np.ndarray
1807
- The estimated hedge ratio as a numpy array.
1808
-
1809
- The function returns the negative of the first state variable of each Kalman Filter estimate,
1810
- which represents the estimated hedge ratio.
1811
-
1812
- References
1813
- ----------
1814
- Stefan Jansen (2020). Machine Learning for Algorithmic Trading - Second Edition.
1815
- chapter 9, Time-Series Models for Volatility Forecasts and Statistical Arbitrage.
1816
- """
1817
- if not isinstance(x, (np.ndarray, pd.Series)) or not isinstance(
1818
- y, (np.ndarray, pd.Series)
1819
- ):
1820
- raise ValueError(
1821
- "Both x and y must be either a numpy array or a pandas Series."
1822
- )
1823
-
1824
- delta = 1e-3
1825
- trans_cov = delta / (1 - delta) * np.eye(2)
1826
- obs_mat = np.expand_dims(np.vstack([[x], [np.ones(len(x))]]).T, axis=1)
1827
-
1828
- kf = PyKalmanFilter(
1829
- n_dim_obs=1,
1830
- n_dim_state=2,
1831
- initial_state_mean=[0, 0],
1832
- initial_state_covariance=np.ones((2, 2)),
1833
- transition_matrices=np.eye(2),
1834
- observation_matrices=obs_mat,
1835
- observation_covariance=2,
1836
- transition_covariance=trans_cov,
1837
- )
1838
- y = y.values if isinstance(y, pd.Series) else y
1839
- state_means, _ = kf.filter(y)
1840
- # Indexing with [:, 0] in state_means[:, 0] extracts only the first state variable of
1841
- # each Kalman Filter estimate, which is the estimated hedge ratio.
1842
- return -state_means[:, 0]
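A minimal sketch for KFHedgeRatio on synthetic, roughly cointegrated series (the true slope of 2 is arbitrary):

import numpy as np
import pandas as pd
from bbstrader.tseries import KFHedgeRatio

rng = np.random.default_rng(1)
x = pd.Series(100 + rng.normal(0, 1, 1000).cumsum())
y = 2.0 * x + rng.normal(0, 1, 1000)   # y tracks x with a slope of about 2
hedge_ratio = KFHedgeRatio(x, y)       # one estimate per observation
print(hedge_ratio[-5:])                # settles near -2, given the sign convention noted above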
1
+ """
2
+ The `tseries` module is designed for conducting
3
+ advanced time series analysis in financial markets.
4
+ It leverages statistical models and algorithms to perform
5
+ tasks such as cointegration testing, volatility modeling,
6
+ and filter-based estimation to assist in trading strategy development,
7
+ market analysis, and financial data exploration.
8
+ """
9
+
10
+ import pprint
11
+ import warnings
12
+ from itertools import combinations
13
+ from typing import List, Tuple, Union
14
+
15
+ import matplotlib.pyplot as plt
16
+ import numpy as np
17
+ import pandas as pd
18
+ import pmdarima as pm
19
+ import seaborn as sns
20
+ import statsmodels.api as sm
21
+ import statsmodels.tsa.stattools as ts
22
+ import yfinance as yf
23
+ from arch import arch_model
24
+ from filterpy.kalman import KalmanFilter
25
+ from hurst import compute_Hc
26
+ from pykalman import KalmanFilter as PyKalmanFilter
27
+ from scipy.optimize import minimize
28
+ from sklearn.linear_model import LogisticRegressionCV
29
+ from sklearn.model_selection import GridSearchCV
30
+ from sklearn.tree import DecisionTreeClassifier
31
+ from statsmodels.graphics.tsaplots import plot_acf
32
+ from statsmodels.stats.diagnostic import acorr_ljungbox
33
+ from statsmodels.tsa.arima.model import ARIMA
34
+ from statsmodels.tsa.stattools import adfuller, coint
35
+ from statsmodels.tsa.vector_ar.var_model import VAR
36
+ from statsmodels.tsa.vector_ar.vecm import coint_johansen
37
+ from tqdm import tqdm
38
+
39
+ warnings.filterwarnings("ignore")
40
+
41
+
42
+ __all__ = [
43
+ "load_and_prepare_data",
44
+ "fit_best_arima",
45
+ "fit_garch",
46
+ "predict_next_return",
47
+ "get_prediction",
48
+ "get_corr",
49
+ "run_cadf_test",
50
+ "run_hurst_test",
51
+ "run_coint_test",
52
+ "run_kalman_filter",
53
+ "ArimaGarchModel",
54
+ "KalmanFilterModel",
55
+ "OrnsteinUhlenbeck",
56
+ "remove_correlated_assets",
57
+ "check_stationarity",
58
+ "remove_stationary_assets",
59
+ "select_assets",
60
+ "compute_pair_metrics",
61
+ "find_cointegrated_pairs",
62
+ "analyze_cointegrated_pairs",
63
+ "select_candidate_pairs",
64
+ "KFSmoother",
65
+ "KFHedgeRatio",
66
+ ]
67
+
68
+ # *******************************************
69
+ # ARIMA AND GARCH MODELS *
70
+ # *******************************************
71
+
72
+
73
+ def load_and_prepare_data(df: pd.DataFrame):
74
+ """
75
+ Prepares financial time series data for analysis.
76
+
77
+ This function takes a pandas DataFrame containing financial data,
78
+ calculates logarithmic returns, and the first difference
79
+ of these logarithmic returns. It handles missing values
80
+ by filling them with zeros.
81
+
82
+ Args:
83
+ df (pd.DataFrame): DataFrame containing at least
84
+ a `Close` column with closing prices of a financial asset.
85
+
86
+ Returns:
87
+ pd.DataFrame: DataFrame with additional
88
+ columns for logarithmic returns (`log_return`)
89
+ and the first difference of logarithmic returns (`diff_log_return`),
90
+ with `NaN` values filled with `0`.
91
+ """
92
+ # Load data
93
+ data = df.copy()
94
+ # Calculate logarithmic returns
95
+ data["log_return"] = np.log(data["Close"] / data["Close"].shift(1))
96
+ # Differencing if necessary
97
+ data["diff_log_return"] = data["log_return"].diff()
98
+ # Fill NaN values with 0
99
+ data.fillna(0, inplace=True)
100
+ return data
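A minimal sketch (arbitrary ticker; the yfinance download is assumed to succeed):

import yfinance as yf
from bbstrader.tseries import load_and_prepare_data

raw = yf.download("SPY", start="2022-01-01", end="2022-12-31",
                  multi_level_index=False, auto_adjust=True)
prepared = load_and_prepare_data(raw)
print(prepared[["Close", "log_return", "diff_log_return"]].head())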
101
+
102
+
103
+ def fit_best_arima(window_data: Union[pd.Series, np.ndarray]):
104
+ """
105
+ Identifies and fits the best `ARIMA` model
106
+ based on the Akaike Information Criterion `(AIC)`.
107
+
108
+ Iterates through different combinations of `p` and `q`
109
+ parameters (within specified ranges) for the ARIMA model,
110
+ fits them to the provided data, and selects the combination
111
+ with the lowest `AIC` value.
112
+
113
+ Args:
114
+ window_data (pd.Series or np.ndarray):
115
+ Time series data to fit the `ARIMA` model on.
116
+
117
+ Returns:
118
+ ARIMA result object: The fitted `ARIMA` model with the lowest `AIC`.
119
+ """
120
+ if isinstance(window_data, pd.Series):
121
+ window_data = window_data.values
122
+
123
+ window_data = window_data[~(np.isnan(window_data) | np.isinf(window_data))]
124
+ # Fit ARIMA model with best parameters
125
+ model = pm.auto_arima(
126
+ window_data,
127
+ start_p=1,
128
+ start_q=1,
129
+ max_p=6,
130
+ max_q=6,
131
+ seasonal=False,
132
+ stepwise=True,
133
+ )
134
+ final_order = model.order
135
+ from arch.utility.exceptions import ConvergenceWarning as ArchWarning
136
+ from statsmodels.tools.sm_exceptions import ConvergenceWarning as StatsWarning
137
+
138
+ with warnings.catch_warnings():
139
+ warnings.filterwarnings("ignore", category=StatsWarning, module="statsmodels")
140
+ warnings.filterwarnings("ignore", category=ArchWarning, module="arch")
141
+ try:
142
+ best_arima_model = ARIMA(
143
+ window_data + 1e-5, order=final_order, missing="drop"
144
+ ).fit()
145
+ return best_arima_model
146
+ except np.linalg.LinAlgError:
147
+ # Catch specific linear algebra errors
148
+ print("LinAlgError occurred, skipping this data point.")
149
+ return None
150
+ except Exception as e:
151
+ # Catch any other unexpected errors and log them
152
+ print(f"An error occurred: {e}")
153
+ return None
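A minimal sketch on synthetic AR(1) returns (the parameters are made up):

import numpy as np
from bbstrader.tseries import fit_best_arima

rng = np.random.default_rng(42)
returns = np.zeros(300)
for t in range(1, 300):
    returns[t] = 0.5 * returns[t - 1] + rng.normal(0, 0.01)   # simple AR(1) process
model = fit_best_arima(returns)
if model is not None:
    print(model.summary())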
154
+
155
+
156
+ def fit_garch(window_data: Union[pd.Series, np.ndarray]):
157
+ """
158
+ Fits an `ARIMA` model to the data to get residuals,
159
+ then fits a `GARCH(1,1)` model on these residuals.
160
+
161
+ Utilizes the residuals from the best `ARIMA` model fit to
162
+ then model volatility using a `GARCH(1,1)` model.
163
+
164
+ Args:
165
+ window_data (pd.Series or np.ndarray):
166
+ Time series data for which to fit the `ARIMA` and `GARCH` models.
167
+
168
+ Returns:
169
+ tuple: A tuple containing the `ARIMA` result
170
+ object and the `GARCH` result object.
171
+ """
172
+ arima_result = fit_best_arima(window_data)
173
+ if arima_result is None:
174
+ return None, None
175
+ resid = np.asarray(arima_result.resid)
176
+ resid = resid[~(np.isnan(resid) | np.isinf(resid))]
177
+ garch_model = arch_model(resid, p=1, q=1, rescale=False)
178
+ garch_result = garch_model.fit(disp="off")
179
+ return arima_result, garch_result
180
+
181
+
182
+ def predict_next_return(arima_result, garch_result):
183
+ """
184
+ Predicts the next return value using fitted `ARIMA` and `GARCH` models.
185
+
186
+ Combines the next period forecast from the `ARIMA` model
187
+ with the next period volatility forecast from the `GARCH` model
188
+ to predict the next return value.
189
+
190
+ Args:
191
+ arima_result (ARIMA result object): The fitted `ARIMA` model result.
192
+ garch_result (ARCH result object): The fitted `GARCH` model result.
193
+
194
+ Returns:
195
+ float: The predicted next return, adjusted for predicted volatility.
196
+ """
197
+ if arima_result is None or garch_result is None:
198
+ return 0
199
+ # Predict next value with ARIMA
200
+ arima_pred = arima_result.forecast(steps=1)
201
+ # Predict next volatility with GARCH
202
+ garch_pred = garch_result.forecast(horizon=1)
203
+ next_volatility = garch_pred.variance.iloc[-1, 0]
204
+
205
+ # Combine predictions (return + volatility)
206
+ if not isinstance(arima_pred, np.ndarray):
207
+ pred = arima_pred.values[0]
208
+ else:
209
+ pred = arima_pred[0]
210
+ return pred + next_volatility
211
+
212
+
213
+ def get_prediction(window_data: Union[pd.Series, np.ndarray]):
214
+ """
215
+ Orchestrator function to get the next period's return prediction.
216
+
217
+ This function ties together the process of fitting
218
+ both `ARIMA` and `GARCH` models on the provided data
219
+ and then predicting the next period's return using these models.
220
+
221
+ Args:
222
+ window_data (Union[pd.Series , np.ndarray]):
223
+ Time series data to fit the models and predict the next return.
224
+
225
+ Returns:
226
+ float: Predicted next return value.
227
+ """
228
+ arima_result, garch_result = fit_garch(window_data)
229
+ prediction = predict_next_return(arima_result, garch_result)
230
+ return prediction
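A minimal end-to-end sketch of the three helpers above on made-up daily log returns:

import numpy as np
from bbstrader.tseries import fit_garch, predict_next_return, get_prediction

rng = np.random.default_rng(7)
window = rng.normal(0, 0.01, 252)            # one year of synthetic daily log returns
arima_res, garch_res = fit_garch(window)     # ARIMA on the series, GARCH(1,1) on its residuals
print(predict_next_return(arima_res, garch_res))
print(get_prediction(window))                # the same pipeline in a single call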
231
+
232
+
233
+ class ArimaGarchModel:
234
+ """
235
+ This class implements a time series model
236
+ that combines `ARIMA (AutoRegressive Integrated Moving Average)`
237
+ and `GARCH (Generalized Autoregressive Conditional Heteroskedasticity)` models
238
+ to predict future returns based on historical price data.
239
+
240
+ The model is implemented in the following steps:
241
+ 1. Data Preparation: Load and prepare the historical price data.
242
+ 2. Modeling: Fit the ARIMA model to the data and then fit the GARCH model to the residuals.
243
+ 3. Prediction: Predict the next return using the ARIMA model and the next volatility using the GARCH model.
244
+ 4. Trading Strategy: Execute the trading strategy based on the predictions.
245
+ 5. Vectorized Backtesting: Backtest the trading strategy using the historical data.
246
+
247
+ Example:
248
+ >>> import yfinance as yf
249
+ >>> from bbstrader.tseries import ArimaGarchModel
250
+ >>> from bbstrader.tseries import load_and_prepare_data
251
+
252
+ >>> if __name__ == '__main__':
253
+ >>> # ARCH SPY Vectorize Backtest
254
+ >>> k = 252
255
+ >>> data = yf.download("SPY", start="2010-01-02", end="2015-12-31")
256
+ >>> arch = ArimaGarchModel("SPY", data, k=k)
257
+ >>> df = load_and_prepare_data(data)
258
+ >>> arch.show_arima_garch_results(df['diff_log_return'].values[-k:])
259
+ >>> arch.backtest_strategy()
260
+ """
261
+
262
+ def __init__(self, symbol, data, k: int = 252):
263
+ """
264
+ Initializes the ArimaGarchStrategy class.
265
+
266
+ Args:
267
+ symbol (str): The ticker symbol for the financial instrument.
268
+ data (pd.DataFrame): `The raw dataset containing at least the 'Close' prices`.
269
+ k (int): The window size for rolling prediction in backtesting.
270
+ """
271
+ self.symbol = symbol
272
+ self.data = self.load_and_prepare_data(data)
273
+ self.k = k
274
+
275
+ # Step 1: Data Preparation
276
+ def load_and_prepare_data(self, df):
277
+ """
278
+ Prepares the dataset by calculating logarithmic returns
279
+ and differencing if necessary.
280
+
281
+ Args:
282
+ df (pd.DataFrame): `The raw dataset containing at least the 'Close' prices`.
283
+
284
+ Returns:
285
+ pd.DataFrame: The dataset with additional columns
286
+ for log returns and differenced log returns.
287
+ """
288
+ return load_and_prepare_data(df)
289
+
290
+ # Step 2: Modeling (ARIMA + GARCH)
291
+ def fit_best_arima(self, window_data):
292
+ """
293
+ Fits the ARIMA model to the provided window of data,
294
+ selecting the best model based on AIC.
295
+
296
+ Args:
297
+ window_data (np.array): The dataset for a specific window period.
298
+
299
+ Returns:
300
+ ARIMA model: The best fitted ARIMA model based on AIC.
301
+ """
302
+ return fit_best_arima(window_data)
303
+
304
+ def fit_garch(self, window_data):
305
+ """
306
+ Fits the GARCH model to the residuals of the best ARIMA model.
307
+
308
+ Args:
309
+ window_data (np.array): The dataset for a specific window period.
310
+
311
+ Returns:
312
+ tuple: Contains the ARIMA result and GARCH result.
313
+ """
314
+ return fit_garch(window_data)
315
+
316
+ def show_arima_garch_results(self, window_data, acf=True, test_resid=True):
317
+ """
318
+ Displays the ARIMA and GARCH model results, including plotting
319
+ ACF of residuals and conducting Box-Pierce and Ljung-Box tests.
320
+
321
+ Args:
322
+ window_data (np.array): The dataset for a specific window period.
323
+ acf (bool, optional): If True, plot the ACF of residuals. Defaults to True.
324
+
325
+ test_resid (bool, optional):
326
+ If True, conduct Box-Pierce and Ljung-Box tests on residuals. Defaults to True.
327
+ """
328
+ arima_result = self.fit_best_arima(window_data)
329
+ resid = np.asarray(arima_result.resid)
330
+ resid = resid[~(np.isnan(resid) | np.isinf(resid))]
331
+ garch_model = arch_model(resid, p=1, q=1, rescale=False)
332
+ garch_result = garch_model.fit(disp="off")
333
+ residuals = garch_result.resid
334
+
335
+ # Plot the ACF of the residuals
336
+ if acf:
337
+ fig = plt.figure(figsize=(12, 8))
338
+ # Plot the ACF of ARIMA residuals
339
+ ax1 = fig.add_subplot(211, ylabel="ACF")
340
+ plot_acf(resid, alpha=0.05, ax=ax1, title="ACF of ARIMA Residuals")
341
+ ax1.set_xlabel("Lags")
342
+ ax1.grid(True)
343
+
344
+ # Plot the ACF of GARCH residuals on the same axes
345
+ ax2 = fig.add_subplot(212, ylabel="ACF")
346
+ plot_acf(residuals, alpha=0.05, ax=ax2, title="ACF of GARCH Residuals")
347
+ ax2.set_xlabel("Lags")
348
+ ax2.grid(True)
349
+
350
+ # Plot the figure
351
+ plt.tight_layout()
352
+ plt.show()
353
+
354
+ # Conduct Box-Pierce and Ljung-Box tests on the residuals
355
+ if test_resid:
356
+ print(arima_result.summary())
357
+ print(garch_result.summary())
358
+ bp_test = acorr_ljungbox(resid, return_df=True, boxpierce=True)
359
+ print("Box-Pierce and Ljung-Box Tests Results for ARIMA:\n", bp_test)
360
+
361
+ # Step 3: Prediction
362
+ def predict_next_return(self, arima_result, garch_result):
363
+ """
364
+ Predicts the next return using the ARIMA model
365
+ and the next volatility using the GARCH model.
366
+
367
+ Args:
368
+ arima_result (ARIMA model): The ARIMA model result.
369
+ garch_result (GARCH model): The GARCH model result.
370
+
371
+ Returns:
372
+ float: The predicted next return.
373
+ """
374
+ return predict_next_return(arima_result, garch_result)
375
+
376
+ def get_prediction(self, window_data):
377
+ """
378
+ Generates a prediction for the next return based on a window of data.
379
+
380
+ Args:
381
+ window_data (np.array): The dataset for a specific window period.
382
+
383
+ Returns:
384
+ float: The predicted next return.
385
+ """
386
+ return get_prediction(window_data)
387
+
388
+ def calculate_signals(self, window_data):
389
+ """
390
+ Calculates the trading signal based on the prediction.
391
+
392
+ Args:
393
+ window_data (np.array): The dataset for a specific window period.
394
+
395
+ Returns:
396
+ str: The trading signal ('LONG', 'SHORT', or None).
397
+ """
398
+ prediction = self.get_prediction(window_data)
399
+ if prediction > 0:
400
+ signal = "LONG"
401
+ elif prediction < 0:
402
+ signal = "SHORT"
403
+ else:
404
+ signal = None
405
+ return signal
406
+
407
+ # Step 4: Trading Strategy
408
+
409
+ def execute_trading_strategy(self, predictions):
410
+ """
411
+ Executes the trading strategy based on a list
412
+ of predictions, determining positions to take.
413
+
414
+ Args:
415
+ predictions (list): A list of predicted returns.
416
+
417
+ Returns:
418
+ list: A list of positions (1 for 'LONG', -1 for 'SHORT', 0 for 'HOLD').
419
+ """
420
+ positions = [] # Long if 1, Short if -1
421
+ previous_position = 0 # Initial position
422
+ for prediction in predictions:
423
+ if prediction > 0:
424
+ current_position = 1 # Long
425
+ elif prediction < 0:
426
+ current_position = -1 # Short
427
+ else:
428
+ current_position = previous_position # Hold previous position
429
+ positions.append(current_position)
430
+ previous_position = current_position
431
+
432
+ return positions
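A tiny illustration of the position mapping above; the predictions are made-up numbers:

import pandas as pd
from bbstrader.tseries import ArimaGarchModel

data = pd.DataFrame({"Close": [100.0, 101.0, 100.5, 102.0, 103.0]})
model = ArimaGarchModel("SPY", data, k=252)
print(model.execute_trading_strategy([0.002, -0.001, 0.0, 0.004]))
# -> [1, -1, -1, 1]  (a zero prediction keeps the previous position)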
433
+
434
+ # Step 5: Vectorized Backtesting
435
+ def generate_predictions(self):
436
+ """
437
+ Generator that yields predictions one by one.
438
+ """
439
+ data = self.data
440
+ window_size = self.k
441
+ for i in range(window_size, len(data)):
442
+ print(
443
+ f"Processing window {i - window_size + 1}/{len(data) - window_size}..."
444
+ )
445
+ window_data = data["diff_log_return"].iloc[i - window_size : i]
446
+ next_return = self.get_prediction(window_data)
447
+ yield next_return
448
+
449
+ def backtest_strategy(self):
450
+ """
451
+ Performs a backtest of the strategy over
452
+ the entire dataset, plotting cumulative returns.
453
+ """
454
+ data = self.data
455
+ window_size = self.k
456
+ print(
457
+ f"Starting backtesting for {self.symbol}\n"
458
+ f"Window size {window_size}.\n"
459
+ f"Total iterations: {len(data) - window_size}.\n"
460
+ )
461
+ predictions_generator = self.generate_predictions()
462
+
463
+ positions = self.execute_trading_strategy(predictions_generator)
464
+
465
+ strategy_returns = (
466
+ np.array(positions[:-1]) * data["log_return"].iloc[window_size + 1 :].values
467
+ )
468
+ buy_and_hold = data["log_return"].iloc[window_size + 1 :].values
469
+ buy_and_hold_returns = np.cumsum(buy_and_hold)
470
+ cumulative_returns = np.cumsum(strategy_returns)
471
+ dates = data.index[window_size + 1 :]
472
+ self.plot_cumulative_returns(cumulative_returns, buy_and_hold_returns, dates)
473
+
474
+ print("\nBacktesting completed !!")
475
+
476
+ # Function to plot the cumulative returns
477
+ def plot_cumulative_returns(self, strategy_returns, buy_and_hold_returns, dates):
478
+ """
479
+ Plots the cumulative returns of the ARIMA+GARCH strategy against
480
+ a buy-and-hold strategy.
481
+
482
+ Args:
483
+ strategy_returns (np.array): Cumulative returns from the strategy.
484
+ buy_and_hold_returns (np.array): Cumulative returns from a buy-and-hold strategy.
485
+ dates (pd.Index): The dates corresponding to the returns.
486
+ """
487
+ plt.figure(figsize=(14, 7))
488
+ plt.plot(dates, strategy_returns, label="ARIMA+GARCH ", color="blue")
489
+ plt.plot(dates, buy_and_hold_returns, label="Buy & Hold", color="red")
490
+ plt.xlabel("Time")
491
+ plt.ylabel("Cumulative Returns")
492
+ plt.title(f"ARIMA+GARCH Strategy vs. Buy & Hold on ({self.symbol})")
493
+ plt.legend()
494
+ plt.grid(True)
495
+ plt.show()
496
+
497
+
498
+ # *********************************************
499
+ # STATS TEST (Cointegration , Mean Reverting)*
500
+ # *********************************************
501
+ def get_corr(tickers: Union[List[str], Tuple[str, ...]], start: str, end: str) -> None:
502
+ """
503
+ Calculates and prints the correlation matrix of the adjusted closing prices
504
+ for a given list of stock tickers within a specified date range.
505
+
506
+ Args:
507
+ tickers (Union[List[str] , Tuple[str, ...]]):
508
+ A list or tuple of valid stock tickers (e.g., ['AAPL', 'MSFT', 'GOOG']).
509
+ start (str): The start date for the historical data in 'YYYY-MM-DD' format.
510
+ end (str): The end date for the historical data in 'YYYY-MM-DD' format.
511
+
512
+ Example:
513
+ >>> from bbstrader.tseries import get_corr
514
+ >>> get_corr(['AAPL', 'MSFT', 'GOOG'], '2023-01-01', '2023-12-31')
515
+ """
516
+ # Download historical data
517
+ data = yf.download(tickers, start=start, end=end, multi_level_index=False)[
518
+ "Adj Close"
519
+ ]
520
+
521
+ # Calculate correlation matrix
522
+ correlation_matrix = data.corr()
523
+
524
+ # Display the matrix
525
+ print(correlation_matrix)
526
+
527
+
528
+ def plot_price_series(df: pd.DataFrame, ts1: str, ts2: str):
529
+ """
530
+ Plot both time series on the same line graph for
531
+ the specified date range.
532
+
533
+ Args:
534
+ df (pd.DataFrame):
535
+ The DataFrame containing prices for each series
536
+ ts1 (str): The first time series column name
537
+ ts2 (str): The second time series column name
538
+ """
539
+ fig, ax = plt.subplots()
540
+ ax.plot(df.index, df[ts1], label=ts1)
541
+ ax.plot(df.index, df[ts2], label=ts2)
542
+
543
+ fig.autofmt_xdate()
544
+ plt.xlabel("Month/Year")
545
+ plt.ylabel("Price ($)")
546
+ plt.title(f"{ts1} and {ts2} Daily Prices ")
547
+ plt.legend()
548
+ plt.show()
549
+
550
+
551
+ def plot_scatter_series(df: pd.DataFrame, ts1: str, ts2: str):
552
+ """
553
+ Plot a scatter plot of both time series for
554
+ via the provided DataFrame.
555
+
556
+ Args:
557
+ df (pd.DataFrame):
558
+ The DataFrame containing prices for each series
559
+ ts1 (str): The first time series column name
560
+ ts2 (str): The second time series column name
561
+ """
562
+ plt.xlabel(f"{ts1} Price ($)")
563
+ plt.ylabel(f"{ts2} Price ($)")
564
+ plt.title(f"{ts1} and {ts2} Price Scatterplot")
565
+ plt.scatter(df[ts1], df[ts2])
566
+
567
+ # Plot the regression line (uses the OLS results fitted in run_cadf_test)
568
+ plt.plot(
569
+ df[ts1],
570
+ results.fittedvalues,
571
+ linestyle="--",
572
+ color="red",
573
+ linewidth=2,
574
+ label="Regression Line",
575
+ )
576
+ plt.legend()
577
+ plt.show()
578
+
579
+
580
+ def plot_residuals(df: pd.DataFrame):
581
+ """
582
+ Plot the residuals of OLS procedure for both
583
+ time series.
584
+
585
+ Args:
586
+ df (pd.DataFrame):
587
+ The DataFrame containing prices for each series
588
+ """
589
+ fig, ax = plt.subplots()
590
+ ax.plot(df.index, df["res"], label="Residuals")
591
+
592
+ fig.autofmt_xdate()
593
+ plt.xlabel("Month/Year")
594
+ plt.ylabel("Price ($)")
595
+ plt.title("Residual Plot")
596
+ plt.legend()
597
+ plt.show()
598
+
599
+
600
+ def run_cadf_test(
601
+ pair: Union[List[str], Tuple[str, ...]],
602
+ start: str,
603
+ end: str,
604
+ ) -> None:
605
+ """
606
+ Performs the Cointegration Augmented Dickey-Fuller (CADF) test on a pair of stock tickers
607
+ over a specified date range to check for cointegration.
608
+
609
+ The function downloads historical adjusted closing prices for the specified pair of stock tickers,
610
+ calculates the optimal hedge ratio (beta) using Ordinary Least Squares (OLS) regression, plots the
611
+ time series and their residuals, and finally performs the CADF test on the residuals.
612
+
613
+ Args:
614
+ pair (List[str] or Tuple[str, ...]):
615
+ A list or tuple containing two valid stock tickers (e.g., ['AAPL', 'MSFT']).
616
+ start (str): The start date for the historical data in 'YYYY-MM-DD' format.
617
+ end (str): The end date for the historical data in 'YYYY-MM-DD' format.
618
+
619
+ Example:
620
+ >>> from bbstrader.tseries import run_cadf_test
621
+ >>> run_cadf_test(['AAPL', 'MSFT'], '2023-01-01', '2023-12-31')
622
+ >>> Regression Metrics:
623
+ >>> Optimal Hedge Ratio (Beta): 2.2485845594120333
624
+ >>> Result Params:
625
+
626
+ >>> const -74.418034
627
+ >>> AAPL 2.248585
628
+ >>> dtype: float64
629
+
630
+ >>> Regression Summary:
631
+ >>> OLS Regression Results
632
+ >>> ==============================================================================
633
+ >>> Dep. Variable: MSFT R-squared: 0.900
634
+ >>> Model: OLS Adj. R-squared: 0.900
635
+ >>> Method: Least Squares F-statistic: 2244.
636
+ >>> Date: Sat, 20 Jul 2024 Prob (F-statistic): 2.95e-126
637
+ >>> Time: 13:36:58 Log-Likelihood: -996.45
638
+ >>> No. Observations: 250 AIC: 1997.
639
+ >>> Df Residuals: 248 BIC: 2004.
640
+ >>> Df Model: 1
641
+ >>> Covariance Type: nonrobust
642
+ >>> ==============================================================================
643
+ >>> coef std err t P>|t| [0.025 0.975]
644
+ >>> ------------------------------------------------------------------------------
645
+ >>> const -74.4180 8.191 -9.085 0.000 -90.551 -58.286
646
+ >>> AAPL 2.2486 0.047 47.369 0.000 2.155 2.342
647
+ >>> ==============================================================================
648
+ >>> Omnibus: 4.923 Durbin-Watson: 0.121
649
+ >>> Prob(Omnibus): 0.085 Jarque-Bera (JB): 4.862
650
+ >>> Skew: 0.342 Prob(JB): 0.0879
651
+ >>> Kurtosis: 2.993 Cond. No. 1.71e+03
652
+ >>> ==============================================================================
653
+
654
+ >>> Notes:
655
+ >>> [1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
656
+ >>> [2] The condition number is large, 1.71e+03. This might indicate that there are
657
+ >>> strong multicollinearity or other numerical problems.
658
+
659
+ >>> Cointegration TEST Results:
660
+ >>> (np.float64(-3.204126144947765),
661
+ >>> np.float64(0.019747080611767602),
662
+ >>> 0,
663
+ >>> 249,
664
+ >>> {'1%': np.float64(-3.4568881317725864),
665
+ >>> '10%': np.float64(-2.5729936189738876),
666
+ >>> '5%': np.float64(-2.8732185133016057)},
667
+ >>> np.float64(1364.3866758546171))
668
+ """
669
+ # Download historical data for required stocks
670
+ p0, p1 = pair[0], pair[1]
671
+ _p0 = yf.download(
672
+ p0,
673
+ start=start,
674
+ end=end,
675
+ progress=False,
676
+ multi_level_index=False,
677
+ auto_adjust=True,
678
+ )
679
+ _p1 = yf.download(
680
+ p1,
681
+ start=start,
682
+ end=end,
683
+ progress=False,
684
+ multi_level_index=False,
685
+ auto_adjust=True,
686
+ )
687
+ df = pd.DataFrame(index=_p0.index)
688
+ df[p0] = _p0["Close"]  # with auto_adjust=True, adjusted prices are in "Close"
689
+ df[p1] = _p1["Close"]
690
+ df = df.dropna()
691
+
692
+ # Calculate optimal hedge ratio "beta"
693
+ # using statsmodels OLS
694
+ X = sm.add_constant(df[p0])
695
+ y = df[p1]
696
+ model = sm.OLS(y, X)
697
+ global results
698
+ results = model.fit()
699
+ beta_hr = results.params[p0]
700
+
701
+ # Plot the two time series with regression line
702
+ plot_price_series(df, p0, p1)
703
+
704
+ # Display a scatter plot of the two time series
705
+ # with regression line
706
+ plot_scatter_series(df, p0, p1)
707
+
708
+ # Calculate the residuals of the linear combination
709
+ df["res"] = results.resid
710
+ plot_residuals(df)
711
+
712
+ # Display regression metrics
713
+ print("\nRegression Metrics:")
714
+ print(f"Optimal Hedge Ratio (Beta): {beta_hr}")
715
+ print("Result Params: \n")
716
+ print(results.params)
717
+ print("\nRegression Summary:")
718
+ print(results.summary())
719
+
720
+ # Calculate and output the CADF test on the residuals
721
+ print("\nCointegration TEST Results:")
722
+ cadf = ts.adfuller(df["res"], autolag="AIC")
723
+ pprint.pprint(cadf)
724
+
725
+
726
+ def _hurst(ts):
727
+ """
728
+ Returns the Hurst Exponent of the time series vector ts,
729
+ """
730
+ # Create the range of lag values
731
+ lags = range(2, 100)
732
+
733
+ # Calculate the array of the variances of the lagged differences
734
+ tau = [np.sqrt(np.std(np.subtract(ts[lag:], ts[:-lag]))) for lag in lags]
735
+
736
+ # Use a linear fit to estimate the Hurst Exponent
737
+ poly = np.polyfit(np.log(lags), np.log(tau), 1)
738
+
739
+ # Return the Hurst exponent from the polyfit output
740
+ return poly[0] * 2.0
741
+
742
+
743
+ # Function to calculate Hurst Exponent
744
+
745
+
746
+ def hurst(time_series):
747
+ H, c, data_range = compute_Hc(time_series, kind="price", simplified=True)
748
+ return H
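A hedged rule of thumb for reading the exponent returned by the helper just above: H < 0.5 suggests mean reversion, H near 0.5 a random walk, H > 0.5 a trending series. For example, on a synthetic random walk:

import numpy as np
random_walk = np.cumsum(np.random.randn(10_000)) + 1000.0
print(round(hurst(random_walk), 2))   # typically prints a value close to 0.5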
749
+
750
+
751
+ def run_hurst_test(symbol: str, start: str, end: str):
752
+ """
753
+ Calculates and prints the Hurst Exponent for a given stock's adjusted closing prices
754
+ within a specified date range, and for three generated series (Geometric Brownian Motion,
755
+ Mean-Reverting, and Trending).
756
+
757
+ The Hurst Exponent is used to determine the long-term memory of a time series.
758
+
759
+ Args:
760
+ symbol (str): A valid stock ticker symbol (e.g., 'AAPL').
761
+ start (str): The start date for the historical data in 'YYYY-MM-DD' format.
762
+ end (str): The end date for the historical data in 'YYYY-MM-DD' format.
763
+
764
+ Example:
765
+ >>> from bbstrader.tseries import run_hurst_test
766
+
767
+ >>> run_hurst_test('AAPL', '2023-01-01', '2023-12-31')
768
+ """
769
+ data = yf.download(
770
+ symbol,
771
+ start=start,
772
+ end=end,
773
+ progress=False,
774
+ multi_level_index=False,
775
+ auto_adjust=True,
776
+ )
777
+
778
+ # Create a Geometric Brownian Motion, Mean-Reverting, and Trending Series
779
+ gbm = np.log(np.cumsum(np.random.randn(100000)) + 1000)
780
+ mr = np.log(np.random.randn(100000) + 1000)
781
+ tr = np.log(np.cumsum(np.random.randn(100000) + 1) + 1000)
782
+
783
+ # Output the Hurst Exponent for each of the series
784
+ print(f"\nHurst(GBM): {_hurst(gbm)}")
785
+ print(f"Hurst(MR): {_hurst(mr)}")
786
+ print(f"Hurst(TR): {_hurst(tr)}")
787
+ print(f"Hurst({symbol}): {hurst(data['Close'])}\n")
788
+
789
+
790
+ def test_cointegration(ticker1, ticker2, start, end):
791
+ # Download historical data
792
+ stock_data_pair = yf.download(
793
+ [ticker1, ticker2],
794
+ start=start,
795
+ end=end,
796
+ progress=False,
797
+ multi_level_index=False,
798
+ auto_adjust=True,
799
+ )["Close"].dropna()
800
+
801
+ # Perform Johansen cointegration test
802
+ result = coint_johansen(stock_data_pair, det_order=0, k_ar_diff=1)
803
+
804
+ # Get the cointegration rank
805
+ traces_stats = result.lr1
806
+ print(f"\nTraces Stats: \n{traces_stats}")
807
+
808
+ # Get the critical values for 95% confidence level
809
+ critical_values = result.cvt
810
+ print(f"\nCritical Values: \n{critical_values}")
811
+
812
+ # Compare the cointegration rank with critical values
813
+ if traces_stats[0] > critical_values[:, 1].all():
814
+ print(f"\n{ticker1} and {ticker2} are cointegrated.\n")
815
+ else:
816
+ print(f"\nNo cointegration found for {ticker1} and {ticker2}.\n")
817
+
818
+
819
+ def run_coint_test(tickers: List[str], start: str, end: str) -> None:
820
+ """
821
+ Performs pairwise cointegration tests on a list of stock tickers over a specified date range.
822
+
823
+ For each unique pair of tickers, the function downloads historical adjusted closing prices and
824
+ tests for cointegration.
825
+
826
+ Args:
827
+ tickers (List[str]): A list of valid stock ticker symbols (e.g., ['AAPL', 'MSFT', 'GOOG']).
828
+ start (str): The start date for the historical data in 'YYYY-MM-DD' format.
829
+ end (str): The end date for the historical data in 'YYYY-MM-DD' format.
830
+
831
+ Example:
832
+ >>> from bbstrader.tseries import run_coint_test
833
+
834
+ >>> run_coint_test(['AAPL', 'MSFT', 'GOOG'], '2023-01-01', '2023-12-31')
835
+ """
836
+ # Loop through ticker combinations
837
+ for ticker1, ticker2 in combinations(tickers, 2):
838
+ test_cointegration(ticker1, ticker2, start, end)
839
+
840
+
841
+ # *********************************
842
+ # KALMAN FILTER *
843
+ # *********************************
844
+ def draw_date_coloured_scatterplot(etfs, prices):
845
+ """
846
+ Create a scatterplot of the two ETF prices, which is
847
+ coloured by the date of the price to indicate the
848
+ changing relationship between the sets of prices
849
+ """
850
+ plen = len(prices)
851
+ colour_map = plt.cm.get_cmap("YlOrRd")
852
+ colours = np.linspace(0.1, 1, plen)
853
+
854
+ scatterplot = plt.scatter(
855
+ prices[etfs[0]],
856
+ prices[etfs[1]],
857
+ s=30,
858
+ c=colours,
859
+ cmap=colour_map,
860
+ edgecolor="k",
861
+ alpha=0.8,
862
+ )
863
+
864
+ colourbar = plt.colorbar(scatterplot)
865
+ colourbar.ax.set_yticklabels([str(p.date()) for p in prices[:: plen // 9].index])
866
+
867
+ plt.xlabel(prices.columns[0])
868
+ plt.ylabel(prices.columns[1])
869
+ plt.show()
870
+
871
+
872
+ def calc_slope_intercept_kalman(etfs, prices):
873
+ """
874
+ Utilize the Kalman Filter from the filterpy library
875
+ to calculate the slope and intercept of the regressed
876
+ ETF prices.
877
+ """
878
+ delta = 1e-5
879
+ trans_cov = delta / (1 - delta) * np.eye(2)
880
+
881
+ kf = KalmanFilter(dim_x=2, dim_z=1)
882
+ kf.x = np.zeros((2, 1)) # Initial state
883
+ kf.P = np.ones((2, 2)) * 1000.0 # Initial covariance,
884
+ # large to represent high uncertainty
885
+ kf.F = np.eye(2) # State transition matrix
886
+ kf.Q = trans_cov # Process noise covariance
887
+ kf.R = 1.0 # Scalar measurement noise covariance
888
+
889
+ state_means, state_covs = [], []
890
+ for time, z in enumerate(prices[etfs[1]].values):
891
+ # Dynamically update the observation matrix H
892
+ # to include the current independent variable
893
+ kf.H = np.array([[prices[etfs[0]][time], 1.0]])
894
+ kf.predict()
895
+ kf.update(z)
896
+ state_means.append(kf.x.copy())
897
+ state_covs.append(kf.P.copy())
898
+
899
+ return np.array(state_means), np.array(state_covs)
900
+
901
+
902
+ def draw_slope_intercept_changes(prices, state_means):
903
+ """
904
+ Plot the slope and intercept of the regressed ETF prices
905
+ between the two ETFs, with the changing values of the
906
+ Kalman Filter over time.
907
+ """
908
+ print(f"First Slope : {state_means[0, 0]}")
909
+ print(f"First intercept : {state_means[0, 1]}")
910
+ pd.DataFrame(
911
+ {
912
+ "slope": state_means[:, 0].flatten(),
913
+ "intercept": state_means[:, 1].flatten(),
914
+ },
915
+ index=prices.index,
916
+ ).plot(subplots=True)
917
+ plt.show()
918
+
919
+
920
+ def run_kalman_filter(
921
+ etfs: Union[List[str], Tuple[str, ...]], start: str, end: str
922
+ ) -> None:
923
+ """
924
+ Applies a Kalman filter to a pair of assets' adjusted closing prices within a specified date range
925
+ to estimate the slope and intercept over time.
926
+
927
+ The function downloads historical adjusted closing prices for the specified pair of assets,
928
+ visualizes their price relationship, calculates the Kalman filter estimates for the slope and
929
+ intercept, and visualizes the changes in these estimates over time.
930
+
931
+ Args:
932
+ etfs (Union[List[str] , Tuple[str, ...]]):
933
+ A list or tuple containing two valid asset tickers (e.g., ['SPY', 'QQQ']).
934
+ start (str): The start date for the historical data in 'YYYY-MM-DD' format.
935
+ end (str): The end date for the historical data in 'YYYY-MM-DD' format.
936
+
937
+ Example:
938
+ >>> from bbstrader.tseries import run_kalman_filter
939
+
940
+ >>> run_kalman_filter(['SPY', 'QQQ'], '2023-01-01', '2023-12-31')
941
+ """
942
+ etf_df1 = yf.download(
943
+ etfs[0], start, end, progress=False, multi_level_index=False, auto_adjust=True
944
+ )
945
+ etf_df2 = yf.download(
946
+ etfs[1], start, end, progress=False, multi_level_index=False, auto_adjust=True
947
+ )
948
+
949
+ prices = pd.DataFrame(index=etf_df1.index)
950
+ prices[etfs[0]] = etf_df1["Close"]
951
+ prices[etfs[1]] = etf_df2["Close"]
952
+
953
+ draw_date_coloured_scatterplot(etfs, prices)
954
+ state_means, state_covs = calc_slope_intercept_kalman(etfs, prices)
955
+ draw_slope_intercept_changes(prices, state_means)
956
+
957
+
958
+ class KalmanFilterModel:
959
+ """
960
+ Implements a Kalman Filter model, a recursive algorithm used for estimating
961
+ the state of a linear dynamic system from a series of noisy measurements.
962
+ It's designed to process market data, estimate dynamic parameters such as
963
+ the slope and intercept of price relationships,
964
+ as well as the forecast error and standard deviation of the predictions.
965
+
966
+ You can learn more here https://en.wikipedia.org/wiki/Kalman_filter
967
+ """
968
+
969
+ def __init__(self, tickers: List | Tuple, **kwargs):
970
+ """
971
+ Initializes the Kalman Filter strategy.
972
+
973
+ Args:
974
+ tickers :
975
+ A list or tuple of ticker symbols representing financial instruments.
976
+
977
+ kwargs : Keyword arguments for additional parameters,
978
+ specifically `delta` and `vt`
979
+ """
980
+ self.tickers = tickers
981
+ assert self.tickers is not None
982
+
983
+ self.R = None
984
+ self.theta = np.zeros(2)
985
+ self.P = np.zeros((2, 2))
986
+ self.delta = kwargs.get("delta", 1e-4)
987
+ self.vt = kwargs.get("vt", 1e-3)
988
+ self.wt = self.delta / (1 - self.delta) * np.eye(2)
989
+ self.latest_prices = np.array([-1.0, -1.0])
990
+ self.kf = self._init_kalman()
991
+
992
+ def _init_kalman(self):
993
+ """
994
+ Initializes and returns a Kalman Filter configured
995
+ for the trading strategy. The filter is set up with initial
996
+ state and covariance, state transition matrix, process noise
997
+ and measurement noise covariances.
998
+ """
999
+ kf = KalmanFilter(dim_x=2, dim_z=1)
1000
+ kf.x = np.zeros((2, 1)) # Initial state
1001
+ kf.P = self.P # Initial covariance
1002
+ kf.F = np.eye(2) # State transition matrix
1003
+ kf.Q = self.wt # Process noise covariance
1004
+ kf.R = 1.0 # Scalar measurement noise covariance
1005
+
1006
+ return kf
1007
+
1008
+ Array = np.ndarray
1009
+
1010
+ def calc_slope_intercep(self, prices: Array) -> Tuple:
1011
+ """
1012
+ Calculates and returns the slope and intercept
1013
+ of the relationship between the provided prices using the Kalman Filter.
1014
+ This method updates the filter with the latest price and returns
1015
+ the estimated slope and intercept.
1016
+
1017
+ Args:
1018
+ prices : A numpy array of prices for two financial instruments.
1019
+
1020
+ Returns:
1021
+ A tuple containing the slope and intercept of the relationship
1022
+ """
1023
+ self.kf.H = np.array([[prices[0], 1.0]])  # observation uses the first (x) price; the second (y) is the measurement
1024
+ self.kf.predict()
1025
+ self.kf.update(prices[1])
1026
+ slope = self.kf.x.copy().flatten()[0]
1027
+ intercept = self.kf.x.copy().flatten()[1]
1028
+
1029
+ return slope, intercept
1030
+
1031
+ def calculate_etqt(self, prices: Array) -> Tuple:
1032
+ """
1033
+ Calculates the ``forecast error`` and ``standard deviation`` of the predictions
1034
+ using the Kalman Filter.
1035
+
1036
+ Args:
1037
+ prices : A numpy array of prices for two financial instruments.
1038
+
1039
+ Returns:
1040
+ A tuple containing the ``forecast error`` and ``standard deviation`` of the predictions.
1041
+ """
1042
+
1043
+ self.latest_prices[0] = prices[0]
1044
+ self.latest_prices[1] = prices[1]
1045
+
1046
+ if all(self.latest_prices > -1.0):
1047
+ slope, intercept = self.calc_slope_intercep(self.latest_prices)
1048
+
1049
+ self.theta[0] = slope
1050
+ self.theta[1] = intercept
1051
+
1052
+ # Create the observation matrix of the latest prices
1053
+ # of Y and the intercept value (1.0) as well as the
1054
+ # scalar value of the latest price from X
1055
+ F = np.asarray([self.latest_prices[0], 1.0]).reshape((1, 2))
1056
+ y = self.latest_prices[1]
1057
+
1058
+ # The prior value of the states {\theta_t} is
1059
+ # distributed as a multivariate Gaussian with
1060
+ # mean a_t and variance-covariance {R_t}
1061
+ if self.R is not None:
1062
+ self.R = self.C + self.wt
1063
+ else:
1064
+ self.R = np.zeros((2, 2))
1065
+
1066
+ # Calculate the Kalman Filter update
1067
+ # ---------------------------------
1068
+ # Calculate prediction of new observation
1069
+ # as well as forecast error of that prediction
1070
+ yhat = F.dot(self.theta)
1071
+ et = y - yhat
1072
+
1073
+ # {Q_t} is the variance of the prediction of
1074
+ # observations and hence sqrt_Qt is the
1075
+ # standard deviation of the predictions
1076
+ Qt = F.dot(self.R).dot(F.T) + self.vt
1077
+ sqrt_Qt = np.sqrt(Qt)
1078
+
1079
+ # The posterior value of the states {\theta_t} is
1080
+ # distributed as a multivariate Gaussian with mean
1081
+ # {m_t} and variance-covariance {C_t}
1082
+ At = self.R.dot(F.T) / Qt
1083
+ self.theta = self.theta + At.flatten() * et
1084
+ self.C = self.R - At * F.dot(self.R)
1085
+ return (et[0], sqrt_Qt.flatten()[0])
1086
+ else:
1087
+ return None
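A minimal sketch, streaming synthetic price pairs through the filter (the tickers, seed, and true slope of 1.5 are arbitrary):

import numpy as np
from bbstrader.tseries import KalmanFilterModel

rng = np.random.default_rng(3)
x = 100 + rng.normal(0, 1, 500).cumsum()
y = 1.5 * x + rng.normal(0, 1, 500)
kf_model = KalmanFilterModel(["X", "Y"])
for px, py in zip(x, y):
    result = kf_model.calculate_etqt(np.array([px, py]))
if result is not None:
    et, sqrt_qt = result              # forecast error and its standard deviation
    print(round(et, 4), round(sqrt_qt, 4))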
1088
+
1089
+
1090
+ # ******************************************
1091
+ # ORNSTEIN UHLENBECK PROCESS *
1092
+ # ******************************************
1093
+
1094
+
1095
+ class OrnsteinUhlenbeck:
1096
+ """
1097
+ The Ornstein-Uhlenbeck process is a mathematical model
1098
+ used to describe the behavior of a mean-reverting stochastic process.
1099
+ We use it to model the price dynamics of an asset that tends
1100
+ to revert to a long-term mean.
1101
+
1102
+ We Estimate the drift (θ), volatility (σ), and long-term mean (μ)
1103
+ based on historical price data; then we Simulate the OU process
1104
+ using the estimated parameters.
1105
+
1106
+ https://en.wikipedia.org/wiki/Ornstein%E2%80%93Uhlenbeck_process
1107
+ """
1108
+
1109
+ def __init__(self, prices: np.ndarray, returns: bool = True, timeframe: str = "D1"):
1110
+ """
1111
+ Initializes the OrnsteinUhlenbeck instance.
1112
+
1113
+ Args:
1114
+ prices (np.ndarray) : Historical close prices.
1115
+
1116
+ returns (bool) : Use it to indicate whether
1117
+ you want to simulate the returns or your raw data
1118
+
1119
+ timeframe (str) : The time frame for the Historical prices
1120
+ (1m, 5m, 15m, 30m, 1h, 4h, D1)
1121
+ """
1122
+ self.prices = prices
1123
+ if returns:
1124
+ series = pd.Series(self.prices)
1125
+ self.returns = series.pct_change().dropna().values
1126
+ else:
1127
+ self.returns = self.prices
1128
+
1129
+ time_frame_mapping = {
1130
+ "1m": 1 / (24 * 60), # 1 minute intervals
1131
+ "5m": 5 / (24 * 60), # 5 minute intervals
1132
+ "15m": 15 / (24 * 60), # 15 minute intervals
1133
+ "30m": 30 / (24 * 60), # 30 minute intervals
1134
+ "1h": 1 / 24, # 1 hour intervals
1135
+ "4h": 4 / 24, # 4 hour intervals
1136
+ "D1": 1, # Daily intervals
1137
+ }
1138
+ if timeframe not in time_frame_mapping:
1139
+ raise ValueError("Unsupported time frame")
1140
+ self.tf = time_frame_mapping[timeframe]
1141
+
1142
+ params = self.estimate_parameters()
1143
+ self.mu_hat = params[0] # Mean (μ)
1144
+ self.theta_hat = params[1] # Drift (θ)
1145
+ self.sigma_hat = params[2] # Volatility (σ)
1146
+ print(f"Estimated μ: {self.mu_hat}")
1147
+ print(f"Estimated θ: {self.theta_hat}")
1148
+ print(f"Estimated σ: {self.sigma_hat}")
1149
+
1150
+ def ornstein_uhlenbeck(self, mu, theta, sigma, dt, X0, n):
1151
+ """
1152
+ Simulates the Ornstein-Uhlenbeck process.
1153
+
1154
+ Args:
1155
+ mu (float): Estimated long-term mean.
1156
+ theta (float): Estimated drift.
1157
+ sigma (float): Estimated volatility.
1158
+ dt (float): Time step.
1159
+ X0 (float): Initial value.
1160
+ n (int): Number of time steps.
1161
+
1162
+ Returns:
1163
+ np.ndarray : Simulated process.
1164
+ """
1165
+ x = np.zeros(n)
1166
+ x[0] = X0
1167
+ for t in range(1, n):
1168
+ dW = np.random.normal(loc=0, scale=np.sqrt(dt))
1169
+ # O-U process differential equation
1170
+ x[t] = x[t - 1] + (theta * (mu - x[t - 1]) * dt) + (sigma * dW)
1171
+ # dW is a Wiener process
1172
+ # (theta * (mu - x[t-1]) * dt) represents the mean-reverting tendency
1173
+ # (sigma * dW) represents the random volatility
1174
+ return x
1175
+
1176
+ def estimate_parameters(self):
1177
+ """
1178
+ Estimates the mean-reverting parameters (μ, θ, σ)
1179
+ using the negative log-likelihood.
1180
+
1181
+ Returns:
1182
+ Tuple: Estimated μ, θ, and σ.
1183
+ """
1184
+ initial_guess = [0, 0.1, np.std(self.returns)]
1185
+ result = minimize(self._neg_log_likelihood, initial_guess, args=(self.returns,))
1186
+ mu, theta, sigma = result.x
1187
+ return mu, theta, sigma
1188
+
1189
+ def _neg_log_likelihood(self, params, returns):
1190
+ """
1191
+ Calculates the negative
1192
+ log-likelihood for parameter estimation.
1193
+
1194
+ Args:
1195
+ params (list): List of parameters [mu, theta, sigma].
1196
+ returns (np.ndarray): Historical returns.
1197
+
1198
+ Returns:
1199
+ float: Negative log-likelihood.
1200
+ """
1201
+ mu, theta, sigma = params
1202
+ dt = self.tf
1203
+ n = len(returns)
1204
+ ou_simulated = self.ornstein_uhlenbeck(mu, theta, sigma, dt, 0, n + 1)
1205
+ residuals = ou_simulated[1 : n + 1] - returns
1206
+ neg_ll = 0.5 * np.sum(residuals**2) / sigma**2 + 0.5 * n * np.log(
1207
+ 2 * np.pi * sigma**2
1208
+ )
1209
+ return neg_ll
1210
+
1211
+ def simulate_process(self, returns=None, n=100, p=None):
1212
+ """
1213
+ Simulates the OU process multiple times.
1214
+
1215
+ Args:
1216
+ returns (np.ndarray): Historical returns.
1217
+ n (int): Number of simulations to perform.
1218
+ p (int): Number of time steps.
1219
+
1220
+ Returns:
1221
+ np.ndarray: 2D array representing simulated processes.
1222
+ """
1223
+ if returns is None:
1224
+ returns = self.returns
1225
+ if p is not None:
1226
+ T = p
1227
+ else:
1228
+ T = len(returns)
1229
+ dt = self.tf
1230
+
1231
+ dW_matrix = np.random.normal(loc=0, scale=np.sqrt(dt), size=(n, T))
1232
+ simulations_matrix = np.zeros((n, T))
1233
+ simulations_matrix[:, 0] = returns[-1]
1234
+
1235
+ for t in range(1, T):
1236
+ simulations_matrix[:, t] = (
1237
+ simulations_matrix[:, t - 1]
1238
+ + self.theta_hat * (self.mu_hat - simulations_matrix[:, t - 1]) * dt
1239
+ + self.sigma_hat * dW_matrix[:, t]
1240
+ )
1241
+ return simulations_matrix
1242
+
1243
+
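The methods above all build on the same discretized Ornstein-Uhlenbeck recursion, x[t] = x[t-1] + θ(μ - x[t-1])·dt + σ·dW. A minimal standalone sketch of that recursion, with illustrative parameter values that are not estimated from any data, looks like this:

import numpy as np

# Illustrative values only: long-term mean, mean-reversion speed, volatility
mu, theta, sigma = 0.0, 0.5, 0.02
dt, n = 1 / 24, 500          # hourly step size, 500 observations
rng = np.random.default_rng(0)

x = np.zeros(n)
x[0] = 0.01                  # arbitrary starting value
for t in range(1, n):
    dW = rng.normal(0.0, np.sqrt(dt))                        # Wiener increment
    x[t] = x[t - 1] + theta * (mu - x[t - 1]) * dt + sigma * dW

print(x[:5])                 # the path fluctuates around mu and reverts toward it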
1244
+ def remove_correlated_assets(df: pd.DataFrame, cutoff=0.99):
1245
+ """
1246
+ Removes highly correlated assets from a DataFrame based on a specified correlation cutoff threshold.
1247
+ This is useful in financial data analysis to reduce redundancy and multicollinearity in portfolios or datasets.
1248
+
1249
+ Args:
1250
+ df (pd.DataFrame): A DataFrame where each column represents an asset
1251
+ and rows represent observations (e.g., time-series data).
1252
+ cutoff (float, optional, default=0.99): The correlation threshold.
1253
+ Columns with absolute correlation greater than this value will be considered for removal.
1254
+
1255
+ Returns:
1256
+ pd.DataFrame: A DataFrame with less correlated assets.
1257
+ The columns that are highly correlated (above the cutoff) are removed.
1258
+
1259
+ References
1260
+ ----------
1261
+ Stefan Jansen (2020). Machine Learning for Algorithmic Trading - Second Edition.
1262
+ chapter 9, Time-Series Models for Volatility Forecasts and Statistical Arbitrage.
1263
+
1264
+ Example:
1265
+ >>> df = pd.DataFrame({
1266
+ ... 'AAPL': [100, 101, 102, 103, 104],
1267
+ ... 'MSFT': [200, 201, 202, 203, 204],
1268
+ ... 'GOOG': [300, 301, 302, 303, 304]
1269
+ ... })
1270
+ >>> df = remove_correlated_assets(df)
1271
+ """
1272
+ corr = df.corr().stack()
1273
+ corr = corr[corr < 1]
1274
+ to_check = corr[corr.abs() > cutoff].index
1275
+ keep, drop = set(), set()
1276
+ for s1, s2 in to_check:
1277
+ if s1 not in keep:
1278
+ if s2 not in keep:
1279
+ keep.add(s1)
1280
+ drop.add(s2)
1281
+ else:
1282
+ drop.add(s1)
1283
+ else:
1284
+ keep.discard(s2)
1285
+ drop.add(s2)
1286
+ return df.drop(drop, axis=1)
1287
+
1288
+
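As a rough sketch of the cutoff behaviour with synthetic data (column names are invented, and it assumes bbstrader 0.2.95 is installed): a column that is an almost exact copy of another falls above the default 0.99 cutoff and is dropped, while an unrelated column survives.

import numpy as np
import pandas as pd
from bbstrader.tseries import remove_correlated_assets

rng = np.random.default_rng(1)
a = pd.Series(100 + np.cumsum(rng.normal(size=500)))
df = pd.DataFrame({
    "A": a,
    "A_copy": a + rng.normal(scale=0.01, size=500),   # correlation > 0.99 with "A"
    "B": 50 + np.cumsum(rng.normal(size=500)),        # independent random walk
})
print(remove_correlated_assets(df).columns.tolist())  # ['A', 'B'] - "A_copy" is dropped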
1289
+ def check_stationarity(df: pd.DataFrame):
1290
+ """
1291
+ Tests the stationarity of time-series data for each asset in the DataFrame
1292
+ using the Augmented Dickey-Fuller (ADF) test. Stationarity is a key property
1293
+ in time-series analysis, and non-stationary data can affect model performance.
1294
+
1295
+ Args:
1296
+ df (pd.DataFrame): A DataFrame where each column represents a time series of an asset.
1297
+
1298
+ Returns:
1299
+ pd.DataFrame: A DataFrame containing the ADF p-values for each asset, with columns:
1300
+ - ``ticker``: Asset name (column name from df).
1301
+ - ``adf``: p-value from the ADF test, indicating the probability of the null hypothesis (that the data is non-stationary).
1302
+
1303
+ References
1304
+ ----------
1305
+ Stefan Jansen (2020). Machine Learning for Algorithmic Trading - Second Edition.
1306
+ chapter 9, Time-Series Models for Volatility Forecasts and Statistical Arbitrage.
1307
+
1308
+ Example:
1309
+ >>> df = pd.DataFrame({
1310
+ ... 'AAPL': [100, 101, 102, 103, 104],
1311
+ ... 'MSFT': [200, 201, 202, 203, 204],
1312
+ ... 'GOOG': [300, 301, 302, 303, 304]
1313
+ ... })
1314
+ >>> df = check_stationarity(df)
1315
+ """
1316
+ results = []
1317
+ for ticker, prices in df.items():
1318
+ results.append([ticker, adfuller(prices, regression="ct")[1]])
1319
+ return pd.DataFrame(results, columns=["ticker", "adf"]).sort_values("adf")
1320
+
1321
+
1322
+ def remove_stationary_assets(df: pd.DataFrame, pval=0.05):
1323
+ """
1324
+ Filters out stationary assets from the DataFrame based on the p-value obtained
1325
+ from the Augmented Dickey-Fuller test.
1326
+ Useful for focusing only on non-stationary time-series data.
1327
+
1328
+ Args:
1329
+ df (pd.DataFrame): A DataFrame where each column represents a time series of an asset.
1330
+ pval (float, optional, default=0.05): The significance level to determine stationarity.
1331
+ Columns with an ADF test p-value below this threshold are considered stationary and removed.
1332
+
1333
+ Returns:
1334
+ pd.DataFrame: A DataFrame containing only the non-stationary assets.
1335
+
1336
+ References
1337
+ ----------
1338
+ Stefan Jansen (2020). Machine Learning for Algorithmic Trading - Second Edition.
1339
+ chapter 9, Time-Series Models for Volatility Forecasts and Statistical Arbitrage.
1340
+
1341
+ Example:
1342
+ >>> df = pd.DataFrame({
1343
+ ... 'AAPL': [100, 101, 102, 103, 104],
1344
+ ... 'MSFT': [200, 201, 202, 203, 204],
1345
+ ... 'GOOG': [300, 301, 302, 303, 304]
1346
+ ... })
1347
+ >>> df = remove_stationary_assets(df)
1348
+ """
1349
+ test_result = check_stationarity(df)
1350
+ stationary = test_result.loc[test_result.adf <= pval, "ticker"].tolist()
1351
+ return df.drop(stationary, axis=1).sort_index()
1352
+
1353
+
1354
+ def select_assets(df: pd.DataFrame, n=100, start=None, end=None, rolling_window=None):
1355
+ """
1356
+ Selects the top N assets based on the average trading volume from the input DataFrame.
1357
+ These assets are used as the universe in which to search for cointegrated pairs for pairs trading strategies.
1358
+
1359
+ Args:
1360
+ df (pd.DataFrame): A multi-index DataFrame with levels ['ticker', 'date'] containing market data.
1361
+ Must include columns 'close' (price) and 'volume'.
1362
+ n (int, optional): Number of assets to select based on highest average trading volume. Default is 100.
1363
+ start (str, optional): Start date for filtering the data. Default is the earliest date in the DataFrame.
1364
+ end (str, optional): End date for filtering the data. Default is the latest date in the DataFrame.
1365
+ rolling_window (int, optional): Rolling window for calculating the average trading volume. Default is None.
1366
+
1367
+ Returns:
1368
+ pd.DataFrame: A DataFrame of selected assets with filtered, cleaned data, indexed by date.
1369
+
1370
+ References
1371
+ ----------
1372
+ Stefan Jansen (2020). Machine Learning for Algorithmic Trading - Second Edition.
1373
+ chapter 9, Time-Series Models for Volatility Forecasts and Statistical Arbitrage.
1374
+ """
1375
+ required_columns = {"close", "volume"}
1376
+ if not required_columns.issubset(df.columns):
1377
+ raise ValueError(
1378
+ f"Input DataFrame must contain {required_columns}, but got {df.columns.tolist()}."
1379
+ )
1380
+
1381
+ if (
1382
+ not isinstance(df.index, pd.MultiIndex)
1383
+ or "ticker" not in df.index.names
1384
+ or "date" not in df.index.names
1385
+ ):
1386
+ raise ValueError("Index must be a MultiIndex with levels ['ticker', 'date'].")
1387
+
1388
+ df = df.copy()
1389
+ idx = pd.IndexSlice
1390
+ start = start or df.index.get_level_values("date").min()
1391
+ end = end or df.index.get_level_values("date").max()
1392
+ df = (
1393
+ df.loc[lambda df: ~df.index.duplicated()]
1394
+ .sort_index()
1395
+ .loc[idx[:, f"{start}" : f"{end}"], :]
1396
+ .assign(dv=lambda df: df.close.mul(df.volume))
1397
+ )
1398
+
1399
+ if rolling_window is None:
1400
+ most_traded = df.groupby(level="ticker").dv.mean().nlargest(n=n).index
1401
+ else:
1402
+ # Calculate the rolling average of dollar volume
1403
+ df["dv_rolling_avg"] = (
1404
+ df.groupby(level=0)
1405
+ .dv.rolling(window=rolling_window, min_periods=1)
1406
+ .mean()
1407
+ .reset_index(level=0, drop=True)
1408
+ )
1409
+ most_traded = df.groupby(level=0)["dv_rolling_avg"].mean().nlargest(n=n).index
1410
+ df = (
1411
+ df.loc[idx[most_traded, :], "close"]
1412
+ .unstack("ticker")
1413
+ .ffill(limit=5)
1414
+ .dropna(axis=1)
1415
+ )
1416
+ df = remove_correlated_assets(df)
1417
+ df = remove_stationary_assets(df)
1418
+ return df.sort_index()
1419
+
1420
+
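The expected input is a MultiIndex panel keyed by ['ticker', 'date'] with 'close' and 'volume' columns. A rough sketch with synthetic tickers (names and figures are assumptions, not real data, and it assumes bbstrader 0.2.95 is installed):

import numpy as np
import pandas as pd
from bbstrader.tseries import select_assets

rng = np.random.default_rng(7)
dates = pd.date_range("2023-01-02", periods=250, freq="B")
frames = []
for i, tkr in enumerate(["AAA", "BBB", "CCC"]):
    close = 100 + np.cumsum(rng.normal(0, 1, len(dates)))
    volume = rng.integers(1_000, 10_000, len(dates)) * (i + 1)   # "CCC" trades the most
    idx = pd.MultiIndex.from_product([[tkr], dates], names=["ticker", "date"])
    frames.append(pd.DataFrame({"close": close, "volume": volume}, index=idx))
panel = pd.concat(frames)

# Typically returns the close prices of the 2 most traded, non-stationary,
# weakly correlated tickers, indexed by date.
universe = select_assets(panel, n=2)
print(universe.columns.tolist())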
1421
+ def compute_pair_metrics(security: pd.Series, candidates: pd.DataFrame):
1422
+ """
1423
+ Calculates statistical and econometric metrics for a target security and a set of candidate securities.
1424
+ These metrics are useful in financial modeling and pairs trading strategies,
1425
+ providing information about drift, volatility, correlation, and cointegration.
1426
+
1427
+ Args:
1428
+ security (pd.Series): A time-series of the target security's prices.
1429
+ The name of the Series should correspond to the security's identifier (e.g., ticker symbol).
1430
+ candidates (pd.DataFrame): A DataFrame where each column represents a time-series of prices
1431
+ for candidate securities to be evaluated against the target security.
1432
+
1433
+ Returns:
1434
+ pd.DataFrame: A DataFrame combining:
1435
+ Drift: Estimated drift of spreads between the target security and each candidate.
1436
+ Volatility: Standard deviation of spreads.
1437
+ Correlation:
1438
+ ``corr``: Correlation of normalized prices between the target and each candidate.
1439
+ ``corr_ret``: Correlation of returns (percentage change) between the target and each candidate.
1440
+ Cointegration metrics:
1441
+ Engle-Granger test statistics (``t1``, ``t2``) and p-values (``p1``, ``p2``).
1442
+ Johansen test trace statistics (``trace0``, ``trace1``) and selected lag order (``k_ar_diff``).
1443
+
1444
+ References
1445
+ ----------
1446
+ Stefan Jansen (2020). Machine Learning for Algorithmic Trading - Second Edition.
1447
+ chapter 9, Time-Series Models for Volatility Forecasts and Statistical Arbitrage.
1448
+ """
1449
+ security = security.div(security.iloc[0])
1450
+ ticker = security.name
1451
+ candidates = candidates.div(candidates.iloc[0])
1452
+ spreads = candidates.sub(security, axis=0)
1453
+ n, m = spreads.shape
1454
+ X = np.ones(shape=(n, 2))
1455
+ X[:, 1] = np.arange(1, n + 1)
1456
+
1457
+ # compute drift
1458
+ drift = (np.linalg.inv(X.T @ X) @ X.T @ spreads).iloc[1].to_frame("drift")
1459
+
1460
+ # compute volatility
1461
+ vol = spreads.std().to_frame("vol")
1462
+
1463
+ # returns correlation
1464
+ corr_ret = (
1465
+ candidates.pct_change().corrwith(security.pct_change()).to_frame("corr_ret")
1466
+ )
1467
+
1468
+ # normalized price series correlation
1469
+ corr = candidates.corrwith(security).to_frame("corr")
1470
+ metrics = drift.join(vol).join(corr).join(corr_ret).assign(n=n)
1471
+
1472
+ tests = []
1473
+ # run cointegration tests
1474
+ for candidate, prices in tqdm(candidates.items()):
1475
+ df = pd.DataFrame({"s1": security, "s2": prices})
1476
+ var = VAR(df.values)
1477
+ lags = var.select_order() # select VAR order
1478
+ k_ar_diff = lags.selected_orders["aic"]
1479
+ # Johansen test with constant term and estimated lag order
1480
+ cj0 = coint_johansen(df, det_order=0, k_ar_diff=k_ar_diff)
1481
+ # Engle-Granger Tests
1482
+ t1, p1 = coint(security, prices, trend="c")[:2]
1483
+ t2, p2 = coint(prices, security, trend="c")[:2]
1484
+ tests.append([ticker, candidate, t1, p1, t2, p2, k_ar_diff, *cj0.lr1])
1485
+ columns = ["s1", "s2", "t1", "p1", "t2", "p2", "k_ar_diff", "trace0", "trace1"]
1486
+ tests = pd.DataFrame(tests, columns=columns).set_index("s2")
1487
+ return metrics.join(tests)
1488
+
1489
+
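A sketch of how the function is called, scoring two synthetic candidates against one target series; roughly 300 observations are used so the VAR lag selection and Johansen test have enough data (ticker names are invented, and it assumes bbstrader 0.2.95 is installed):

import numpy as np
import pandas as pd
from bbstrader.tseries import compute_pair_metrics

rng = np.random.default_rng(3)
n_obs = 300
common = np.cumsum(rng.normal(0, 1, n_obs))                      # shared stochastic trend
security = pd.Series(100 + common + rng.normal(0, 0.5, n_obs), name="TARGET")
candidates = pd.DataFrame({
    "COINT": 50 + 0.5 * common + rng.normal(0, 0.5, n_obs),      # shares the trend
    "NOISE": 80 + np.cumsum(rng.normal(0, 1, n_obs)),            # independent random walk
})
metrics = compute_pair_metrics(security, candidates)
print(metrics[["drift", "vol", "corr", "p1", "p2", "trace0", "trace1"]])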
1490
+ __CRITICAL_VALUES = {
1491
+ 0: {0.9: 13.4294, 0.95: 15.4943, 0.99: 19.9349},
1492
+ 1: {0.9: 2.7055, 0.95: 3.8415, 0.99: 6.6349},
1493
+ }
1494
+
1495
+
1496
+ def find_cointegrated_pairs(
1497
+ securities: pd.DataFrame,
1498
+ candidates: pd.DataFrame,
1499
+ n=None,
1500
+ start=None,
1501
+ stop=None,
1502
+ coint=False,
1503
+ ):
1504
+ """
1505
+ Identifies cointegrated pairs between a target set of securities and candidate securities
1506
+ based on econometric tests. The function evaluates statistical relationships,
1507
+ such as cointegration and Engle-Granger significance, to determine pairs suitable
1508
+ for financial strategies like pairs trading.
1509
+
1510
+ Args:
1511
+ securities (`pd.DataFrame`): A DataFrame where each column represents the time-series
1512
+ prices of target securities to evaluate.
1513
+ candidates (`pd.DataFrame`): A DataFrame where each column represents the time-series
1514
+ prices of candidate securities to compare against the target securities.
1515
+ n (`int`, optional): The number of top pairs to return. If `None`, returns all pairs.
1516
+ start (`str`, optional): Start date for slicing the data (e.g., 'YYYY-MM-DD').
1517
+ stop (`str`, optional): End date for slicing the data (e.g., 'YYYY-MM-DD').
1518
+ coint (`bool`, optional, default=False):
1519
+ - If `True`, filters for pairs identified as cointegrated.
1520
+ - If `False`, returns all evaluated pairs.
1521
+
1522
+ Returns:
1523
+ - ``pd.DataFrame``: A DataFrame containing:
1524
+ - Johansen and Engle-Granger cointegration metrics:
1525
+ - `t1`, `t2`: Engle-Granger test statistics for two directions.
1526
+ - `p1`, `p2`: Engle-Granger p-values for two directions.
1527
+ - `trace0`, `trace1`: Johansen test trace statistics for 0 and 1 cointegration relationships.
1528
+ - Indicators and filters:
1529
+ - `joh_sig`: Indicates Johansen cointegration significance.
1530
+ - `eg_sig`: Indicates Engle-Granger significance (p-value < 0.05).
1531
+ - `s1_dep`: Indicates whether the first series depends on the second (based on p-values).
1532
+ - `coint`: Combined cointegration indicator (Johansen & Engle-Granger).
1533
+ - Spread and ranking:
1534
+ - `t`: Minimum of `t1` and `t2`.
1535
+ - `p`: Minimum of `p1` and `p2`.
1536
+ References
1537
+ ----------
1538
+ Stefan Jansen (2020). Machine Learning for Algorithmic Trading - Second Edition.
1539
+ chapter 9, Time-Series Models for Volatility Forecasts and Statistical Arbitrage.
1540
+
1541
+ Example:
1542
+ >>> import pandas as pd
1543
+
1544
+ >>> # Sample Data
1545
+ >>> data_securities = {
1546
+ ... 'Security1': [100, 102, 101, 103, 105],
1547
+ ... 'Security2': [50, 52, 53, 51, 54]
1548
+ ... }
1549
+ >>> data_candidates = {
1550
+ ... 'Candidate1': [100, 101, 99, 102, 104],
1551
+ ... 'Candidate2': [200, 202, 201, 203, 205]
1552
+ ... }
1553
+
1554
+ >>> securities = pd.DataFrame(data_securities, index=pd.date_range('2023-01-01', periods=5))
1555
+ >>> candidates = pd.DataFrame(data_candidates, index=pd.date_range('2023-01-01', periods=5))
1556
+
1557
+ >>> # Find cointegrated pairs
1558
+ >>> top_pairs = find_cointegrated_pairs(securities, candidates, n=2, coint=True)
1559
+ >>> print(top_pairs)
1560
+
1561
+ >>> | s1 | s2 | t | p | joh_sig | eg_sig | coint |
1562
+ >>> |----------|-----------|------|-------|---------|--------|-------|
1563
+ >>> | Security1| Candidate1| -3.5 | 0.01 | 1 | 1 | 1 |
1564
+ >>> | Security2| Candidate2| -2.9 | 0.04 | 1 | 1 | 1 |
1565
+ """
1566
+ trace0_cv = __CRITICAL_VALUES[0][
1567
+ 0.95
1568
+ ] # critical value for 0 cointegration relationships
1569
+ # critical value for 1 cointegration relationship
1570
+ trace1_cv = __CRITICAL_VALUES[1][0.95]
1571
+ spreads = []
1572
+ if start is not None and stop is not None:
1573
+ securities = securities.loc[str(start) : str(stop), :]
1574
+ candidates = candidates.loc[str(start) : str(stop), :]
1575
+ for i, (ticker, prices) in enumerate(securities.items(), 1):
1576
+ try:
1577
+ df = compute_pair_metrics(prices, candidates)
1578
+ spreads.append(df.set_index("s1", append=True))
1579
+ except np.linalg.LinAlgError:
1580
+ continue
1581
+ spreads = pd.concat(spreads)
1582
+ spreads.index.names = ["s2", "s1"]
1583
+ spreads = spreads.swaplevel()
1584
+ spreads["t"] = spreads[["t1", "t2"]].min(axis=1)
1585
+ spreads["p"] = spreads[["p1", "p2"]].min(axis=1)
1586
+ spreads["joh_sig"] = (
1587
+ (spreads.trace0 > trace0_cv) & (spreads.trace1 > trace1_cv)
1588
+ ).astype(int)
1589
+ spreads["eg_sig"] = (spreads.p < 0.05).astype(int)
1590
+ spreads["s1_dep"] = spreads.p1 < spreads.p2
1591
+ spreads["coint"] = (spreads.joh_sig & spreads.eg_sig).astype(int)
1592
+ # select top n pairs
1593
+ if coint:
1594
+ if n is not None:
1595
+ top_pairs = (
1596
+ spreads.query("coint == 1").sort_values("t", ascending=False).head(n)
1597
+ )
1598
+ else:
1599
+ top_pairs = spreads.query("coint == 1").sort_values("t", ascending=False)
1600
+ else:
1601
+ if n is not None:
1602
+ top_pairs = spreads.sort_values("t", ascending=False).head(n)
1603
+ else:
1604
+ top_pairs = spreads.sort_values("t", ascending=False)
1605
+ return top_pairs
1606
+
1607
+
1608
+ def analyze_cointegrated_pairs(
1609
+ spreads: pd.DataFrame,
1610
+ plot_coint=True,
1611
+ crosstab=False,
1612
+ heuristics=False,
1613
+ log_reg=False,
1614
+ decis_tree=False,
1615
+ ):
1616
+ """
1617
+ Analyzes cointegrated pairs by visualizing, summarizing, and applying predictive models.
1618
+
1619
+ Args:
1620
+ spreads (pd.DataFrame):
1621
+ A DataFrame containing cointegration metrics and characteristics.
1622
+ Required columns: 'coint', 't', 'trace0', 'trace1', 'drift', 'vol', 'corr', 'corr_ret', 'eg_sig', 'joh_sig'.
1623
+ plot_coint (bool, optional):
1624
+ If True, generates scatterplots and boxplots to visualize cointegration characteristics.
1625
+ crosstab (bool, optional):
1626
+ If True, displays crosstabulations of Engle-Granger and Johansen test significance.
1627
+ heuristics (bool, optional):
1628
+ If True, prints descriptive statistics for drift, volatility, and correlation grouped by cointegration status.
1629
+ log_reg (bool, optional):
1630
+ If True, fits a logistic regression model to predict cointegration and evaluates its performance.
1631
+ decis_tree (bool, optional):
1632
+ If True, fits a decision tree model to predict cointegration and evaluates its performance.
1633
+
1634
+ References
1635
+ ----------
1636
+ Stefan Jansen (2020). Machine Learning for Algorithmic Trading - Second Edition.
1637
+ chapter 9, Time-Series Models for Volatility Forecasts and Statistical Arbitrage.
1638
+
1639
+ Example:
1640
+ >>> import pandas as pd
1641
+ >>> from bbstrader.tseries import find_cointegrated_pairs, analyze_cointegrated_pairs
1642
+
1643
+ >>> # Sample Data
1644
+ >>> securities = pd.DataFrame({
1645
+ ... 'SPY': [100, 102, 101, 103, 105],
1646
+ ... 'QQQ': [50, 52, 53, 51, 54]
1647
+ ... })
1648
+ >>> candidates = pd.DataFrame({
1649
+ ... 'AAPL': [100, 101, 99, 102, 104],
1650
+ ... 'MSFT': [200, 202, 201, 203, 205]
1651
+ ... })
1652
+
1653
+ >>> pairs = find_cointegrated_pairs(securities, candidates, n=2, coint=True)
1654
+ >>> analyze_cointegrated_pairs(pairs, plot_coint=True, crosstab=True, heuristics=True, log_reg=True, decis_tree=True)
1655
+ """
1656
+ if plot_coint:
1657
+ trace0_cv = __CRITICAL_VALUES[0][0.95]
1658
+ spreads = spreads.reset_index()
1659
+ sns.scatterplot(
1660
+ x=np.log1p(spreads.t.abs()),
1661
+ y=np.log1p(spreads.trace1),
1662
+ hue="coint",
1663
+ data=spreads[spreads.trace0 > trace0_cv],
1664
+ )
1665
+ fig, axes = plt.subplots(ncols=4, figsize=(20, 5))
1666
+ for i, heuristic in enumerate(["drift", "vol", "corr", "corr_ret"]):
1667
+ sns.boxplot(x="coint", y=heuristic, data=spreads, ax=axes[i])
1668
+ fig.tight_layout()
1669
+
1670
+ if heuristics:
1671
+ spreads = spreads.reset_index()
1672
+ h = (
1673
+ spreads.groupby(spreads.coint)[["drift", "vol", "corr"]]
1674
+ .describe()
1675
+ .stack(level=0)
1676
+ .swaplevel()
1677
+ .sort_index()
1678
+ )
1679
+ print(h)
1680
+
1681
+ if log_reg:
1682
+ y = spreads.coint
1683
+ X = spreads[["drift", "vol", "corr", "corr_ret"]]
1684
+ log_reg = LogisticRegressionCV(
1685
+ Cs=np.logspace(-10, 10, 21), class_weight="balanced", scoring="roc_auc"
1686
+ )
1687
+ log_reg.fit(X=X, y=y)
1688
+ Cs = log_reg.Cs_
1689
+ scores = pd.DataFrame(log_reg.scores_[True], columns=Cs).mean()
1690
+ scores.plot(logx=True)
1691
+ res = f"C:{np.log10(scores.idxmax()):.2f}, AUC: {scores.max():.2%}"
1692
+ print(res)
1693
+ print(log_reg.coef_)
1694
+
1695
+ if decis_tree:
1696
+ model = DecisionTreeClassifier(class_weight="balanced")
1697
+ decision_tree = GridSearchCV(
1698
+ model, param_grid={"max_depth": list(range(1, 10))}, cv=5, scoring="roc_auc"
1699
+ )
1700
+ y = spreads.coint
1701
+ X = spreads[["drift", "vol", "corr", "corr_ret"]]
1702
+ decision_tree.fit(X, y)
1703
+ res = f"{decision_tree.best_score_:.2%}, Depth: {decision_tree.best_params_['max_depth']}"
1704
+ print(res)
1705
+
1706
+ if crosstab:
1707
+ pd.set_option("display.float_format", lambda x: f"{x:.2%}")
1708
+ print(pd.crosstab(spreads.eg_sig, spreads.joh_sig))
1709
+ print(pd.crosstab(spreads.eg_sig, spreads.joh_sig, normalize=True))
1710
+
1711
+
1712
+ def select_candidate_pairs(pairs: pd.DataFrame, period=False):
1713
+ """
1714
+ Select candidate pairs from a DataFrame based on cointegration status.
1715
+
1716
+ This function filters the input DataFrame to select pairs where the 'coint' column equals 1,
1717
+ indicating cointegration. It then determines the dependent and independent series for each pair
1718
+ and returns the selected pairs in a dictionary format.
1719
+
1720
+ Args:
1721
+ pairs (pd.DataFrame): A DataFrame containing pairs of time series with columns 'coint', 's1', 's2', and 's1_dep'.
1722
+ period (bool, optional): If True, includes the 'period' column in the output. Defaults to False.
1723
+
1724
+ Returns:
1725
+ list[dict]: A list of dictionaries, each containing the keys 'x' and 'y' (and optionally 'period') representing the selected pairs.
1726
+
1727
+ References
1728
+ ----------
1729
+ Stefan Jansen (2020). Machine Learning for Algorithmic Trading - Second Edition.
1730
+ chapter 9, Time-Series Models for Volatility Forecasts and Statistical Arbitrage.
1731
+ """
1732
+ candidates = pairs.query("coint == 1").copy()
1733
+ candidates = candidates.reset_index()
1734
+ candidates["y"] = candidates.apply(
1735
+ lambda x: x["s1"] if x.s1_dep else x["s2"], axis=1
1736
+ )
1737
+ candidates["x"] = candidates.apply(
1738
+ lambda x: x["s2"] if x.s1_dep else x["s1"], axis=1
1739
+ )
1740
+ if period:
1741
+ return candidates[["x", "y", "period"]].to_dict(orient="records")
1742
+ return candidates[["x", "y"]].to_dict(orient="records")
1743
+
1744
+
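The only columns the function relies on are 'coint', 's1', 's2' and 's1_dep' (plus 'period' when period=True). A minimal hand-built sketch with invented tickers, assuming bbstrader 0.2.95 is installed:

import pandas as pd
from bbstrader.tseries import select_candidate_pairs

pairs = pd.DataFrame({
    "s1": ["SPY", "QQQ", "IWM"],
    "s2": ["AAPL", "MSFT", "GOOG"],
    "s1_dep": [True, False, True],
    "coint": [1, 1, 0],
})
print(select_candidate_pairs(pairs))
# [{'x': 'AAPL', 'y': 'SPY'}, {'x': 'QQQ', 'y': 'MSFT'}]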
1745
+ def KFSmoother(prices: pd.Series | np.ndarray) -> pd.Series | np.ndarray:
1746
+ """
1747
+ Estimate rolling mean using Kalman Smoothing.
1748
+
1749
+ Args:
1750
+ prices : pd.Series or np.ndarray
1751
+ The input time series data to be smoothed. It must be either a pandas Series or a numpy array.
1752
+
1753
+ Returns:
1754
+ pd.Series or np.ndarray
1755
+ The smoothed time series data. If the input is a pandas Series, the output will also be a pandas Series with the same index.
1756
+ If the input is a numpy array, the output will be a numpy array.
1757
+
1758
+ References
1759
+ ----------
1760
+ Stefan Jansen (2020). Machine Learning for Algorithmic Trading - Second Edition.
1761
+ chapter 9, Time-Series Models for Volatility Forecasts and Statistical Arbitrage.
1762
+
1763
+ Examples
1764
+ --------
1765
+ >>> import yfinance as yf
1766
+ >>> prices = yf.download('AAPL', start='2020-01-01', end='2021-01-01', multi_level_index=False)['Adj Close']
1767
+ >>> prices = KFSmoother(prices)
1768
+ >>> print(prices[:5])
1769
+ Date
1770
+ 2020-01-02 00:00:00+00:00 36.39801407
1771
+ 2020-01-03 00:00:00+00:00 49.06231000
1772
+ 2020-01-06 00:00:00+00:00 55.86334436
1773
+ 2020-01-07 00:00:00+00:00 60.02240894
1774
+ 2020-01-08 00:00:00+00:00 63.15057948
1775
+ dtype: float64
1776
+
1777
+ """
1778
+ if not isinstance(prices, (np.ndarray, pd.Series)):
1779
+ raise ValueError("Input must be either a numpy array or a pandas Series.")
1780
+ kf = PyKalmanFilter(
1781
+ transition_matrices=np.eye(1),
1782
+ observation_matrices=np.eye(1),
1783
+ initial_state_mean=0,
1784
+ initial_state_covariance=1,
1785
+ observation_covariance=1,
1786
+ transition_covariance=0.05,
1787
+ )
1788
+ if isinstance(prices, pd.Series):
1789
+ state_means, _ = kf.filter(prices.values)
1790
+ return pd.Series(state_means.flatten(), index=prices.index)
1791
+ elif isinstance(prices, np.ndarray):
1792
+ state_means, _ = kf.filter(prices)
1793
+ return state_means.flatten()
1794
+
1795
+
1796
+ def KFHedgeRatio(x: pd.Series | np.ndarray, y: pd.Series | np.ndarray) -> np.ndarray:
1797
+ """
1798
+ Estimate Hedge Ratio using Kalman Filter.
1799
+ Args:
1800
+ x : pd.Series or np.ndarray
1801
+ The independent variable, which can be either a pandas Series or a numpy array.
1802
+ y : pd.Series or np.ndarray
1803
+ The dependent variable, which can be either a pandas Series or a numpy array.
1804
+
1805
+ Returns:
1806
+ np.ndarray
1807
+ The estimated hedge ratio as a numpy array.
1808
+
1809
+ The function returns the negative of the first state variable of each Kalman Filter estimate,
1810
+ which represents the estimated hedge ratio.
1811
+
1812
+ References
1813
+ ----------
1814
+ Stefan Jansen (2020). Machine Learning for Algorithmic Trading - Second Edition.
1815
+ chapter 9, Time-Series Models for Volatility Forecasts and Statistical Arbitrage.
1816
+ """
1817
+ if not isinstance(x, (np.ndarray, pd.Series)) or not isinstance(
1818
+ y, (np.ndarray, pd.Series)
1819
+ ):
1820
+ raise ValueError(
1821
+ "Both x and y must be either a numpy array or a pandas Series."
1822
+ )
1823
+
1824
+ delta = 1e-3
1825
+ trans_cov = delta / (1 - delta) * np.eye(2)
1826
+ obs_mat = np.expand_dims(np.vstack([[x], [np.ones(len(x))]]).T, axis=1)
1827
+
1828
+ kf = PyKalmanFilter(
1829
+ n_dim_obs=1,
1830
+ n_dim_state=2,
1831
+ initial_state_mean=[0, 0],
1832
+ initial_state_covariance=np.ones((2, 2)),
1833
+ transition_matrices=np.eye(2),
1834
+ observation_matrices=obs_mat,
1835
+ observation_covariance=2,
1836
+ transition_covariance=trans_cov,
1837
+ )
1838
+ y = y.values if isinstance(y, pd.Series) else y
1839
+ state_means, _ = kf.filter(y)
1840
+ # Indexing with [:, 0] in state_means[:, 0] extracts only the first state variable of
1841
+ # each Kalman Filter estimate, which is the estimated hedge ratio.
1842
+ return -state_means[:, 0]
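A sketch that recovers a known slope from a synthetic pair: y is built as roughly 1.5 * x plus noise, so the filter's estimate should settle near -1.5, since the function returns the negated slope (the data and seed are arbitrary; assumes bbstrader 0.2.95 is installed).

import numpy as np
from bbstrader.tseries import KFHedgeRatio

rng = np.random.default_rng(11)
x = 50 + np.cumsum(rng.normal(0, 1, 500))
y = 1.5 * x + rng.normal(0, 1, 500)
hedge_ratio = KFHedgeRatio(x, y)
print(hedge_ratio[-5:])   # values close to -1.5 once the filter has converged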