bbstrader 0.2.93__py3-none-any.whl → 0.2.94__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of bbstrader might be problematic.
- bbstrader/__init__.py +20 -20
- bbstrader/__main__.py +50 -50
- bbstrader/btengine/__init__.py +54 -54
- bbstrader/btengine/scripts.py +157 -157
- bbstrader/compat.py +19 -19
- bbstrader/config.py +137 -137
- bbstrader/core/data.py +22 -22
- bbstrader/core/utils.py +146 -146
- bbstrader/metatrader/__init__.py +6 -6
- bbstrader/metatrader/account.py +1516 -1516
- bbstrader/metatrader/copier.py +750 -745
- bbstrader/metatrader/rates.py +584 -584
- bbstrader/metatrader/risk.py +749 -748
- bbstrader/metatrader/scripts.py +81 -81
- bbstrader/metatrader/trade.py +1836 -1836
- bbstrader/metatrader/utils.py +645 -645
- bbstrader/models/__init__.py +10 -10
- bbstrader/models/factors.py +312 -312
- bbstrader/models/ml.py +1272 -1272
- bbstrader/models/optimization.py +182 -182
- bbstrader/models/portfolio.py +223 -223
- bbstrader/models/risk.py +398 -398
- bbstrader/trading/__init__.py +11 -11
- bbstrader/trading/execution.py +846 -846
- bbstrader/trading/script.py +155 -155
- bbstrader/trading/scripts.py +69 -69
- bbstrader/trading/strategies.py +860 -860
- bbstrader/tseries.py +1842 -1842
- {bbstrader-0.2.93.dist-info → bbstrader-0.2.94.dist-info}/LICENSE +21 -21
- {bbstrader-0.2.93.dist-info → bbstrader-0.2.94.dist-info}/METADATA +188 -187
- bbstrader-0.2.94.dist-info/RECORD +44 -0
- bbstrader-0.2.93.dist-info/RECORD +0 -44
- {bbstrader-0.2.93.dist-info → bbstrader-0.2.94.dist-info}/WHEEL +0 -0
- {bbstrader-0.2.93.dist-info → bbstrader-0.2.94.dist-info}/entry_points.txt +0 -0
- {bbstrader-0.2.93.dist-info → bbstrader-0.2.94.dist-info}/top_level.txt +0 -0
bbstrader/tseries.py
CHANGED
@@ -1,1842 +1,1842 @@
"""
The `tseries` module is designed for conducting
advanced time series analysis in financial markets.
It leverages statistical models and algorithms to perform
tasks such as cointegration testing, volatility modeling,
and filter-based estimation to assist in trading strategy development,
market analysis, and financial data exploration.
"""

import pprint
import warnings
from itertools import combinations
from typing import List, Tuple, Union

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pmdarima as pm
import seaborn as sns
import statsmodels.api as sm
import statsmodels.tsa.stattools as ts
import yfinance as yf
from arch import arch_model
from filterpy.kalman import KalmanFilter
from hurst import compute_Hc
from pykalman import KalmanFilter as PyKalmanFilter
from scipy.optimize import minimize
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from statsmodels.graphics.tsaplots import plot_acf
from statsmodels.stats.diagnostic import acorr_ljungbox
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.stattools import adfuller, coint
from statsmodels.tsa.vector_ar.var_model import VAR
from statsmodels.tsa.vector_ar.vecm import coint_johansen
from tqdm import tqdm

warnings.filterwarnings("ignore")


__all__ = [
    "load_and_prepare_data",
    "fit_best_arima",
    "fit_garch",
    "predict_next_return",
    "get_prediction",
    "get_corr",
    "run_cadf_test",
    "run_hurst_test",
    "run_coint_test",
    "run_kalman_filter",
    "ArimaGarchModel",
    "KalmanFilterModel",
    "OrnsteinUhlenbeck",
    "remove_correlated_assets",
    "check_stationarity",
    "remove_stationary_assets",
    "select_assets",
    "compute_pair_metrics",
    "find_cointegrated_pairs",
    "analyze_cointegrated_pairs",
    "select_candidate_pairs",
    "KFSmoother",
    "KFHedgeRatio",
]

# *******************************************
# ARIMA AND GARCH MODELS                    *
# *******************************************


def load_and_prepare_data(df: pd.DataFrame):
    """
    Prepares financial time series data for analysis.

    This function takes a pandas DataFrame containing financial data,
    calculates logarithmic returns, and the first difference
    of these logarithmic returns. It handles missing values
    by filling them with zeros.

    Args:
        df (pd.DataFrame): DataFrame containing at least
            a `Close` column with closing prices of a financial asset.

    Returns:
        pd.DataFrame: DataFrame with additional
            columns for logarithmic returns (`log_return`)
            and the first difference of logarithmic returns (`diff_log_return`),
            with `NaN` values filled with `0`.
    """
    # Load data
    data = df.copy()
    # Calculate logarithmic returns
    data["log_return"] = np.log(data["Close"] / data["Close"].shift(1))
    # Differencing if necessary
    data["diff_log_return"] = data["log_return"].diff()
    # Drop NaN values
    data.fillna(0, inplace=True)
    return data


def fit_best_arima(window_data: Union[pd.Series, np.ndarray]):
    """
    Identifies and fits the best `ARIMA` model
    based on the Akaike Information Criterion `(AIC)`.

    Iterates through different combinations of `p` and `q`
    parameters (within specified ranges) for the ARIMA model,
    fits them to the provided data, and selects the combination
    with the lowest `AIC` value.

    Args:
        window_data (pd.Series or np.ndarray):
            Time series data to fit the `ARIMA` model on.

    Returns:
        ARIMA result object: The fitted `ARIMA` model with the lowest `AIC`.
    """
    if isinstance(window_data, pd.Series):
        window_data = window_data.values

    window_data = window_data[~(np.isnan(window_data) | np.isinf(window_data))]
    # Fit ARIMA model with best parameters
    model = pm.auto_arima(
        window_data,
        start_p=1,
        start_q=1,
        max_p=6,
        max_q=6,
        seasonal=False,
        stepwise=True,
    )
    final_order = model.order
    from arch.utility.exceptions import ConvergenceWarning as ArchWarning
    from statsmodels.tools.sm_exceptions import ConvergenceWarning as StatsWarning

    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=StatsWarning, module="statsmodels")
        warnings.filterwarnings("ignore", category=ArchWarning, module="arch")
        try:
            best_arima_model = ARIMA(
                window_data + 1e-5, order=final_order, missing="drop"
            ).fit()
            return best_arima_model
        except np.linalg.LinAlgError:
            # Catch specific linear algebra errors
            print("LinAlgError occurred, skipping this data point.")
            return None
        except Exception as e:
            # Catch any other unexpected errors and log them
            print(f"An error occurred: {e}")
            return None


def fit_garch(window_data: Union[pd.Series, np.ndarray]):
    """
    Fits an `ARIMA` model to the data to get residuals,
    then fits a `GARCH(1,1)` model on these residuals.

    Utilizes the residuals from the best `ARIMA` model fit to
    then model volatility using a `GARCH(1,1)` model.

    Args:
        window_data (pd.Series or np.ndarray):
            Time series data for which to fit the `ARIMA` and `GARCH` models.

    Returns:
        tuple: A tuple containing the `ARIMA` result
            object and the `GARCH` result object.
    """
    arima_result = fit_best_arima(window_data)
    if arima_result is None:
        return None, None
    resid = np.asarray(arima_result.resid)
    resid = resid[~(np.isnan(resid) | np.isinf(resid))]
    garch_model = arch_model(resid, p=1, q=1, rescale=False)
    garch_result = garch_model.fit(disp="off")
    return arima_result, garch_result


def predict_next_return(arima_result, garch_result):
    """
    Predicts the next return value using fitted `ARIMA` and `GARCH` models.

    Combines the next period forecast from the `ARIMA` model
    with the next period volatility forecast from the `GARCH` model
    to predict the next return value.

    Args:
        arima_result (ARIMA result object): The fitted `ARIMA` model result.
        garch_result (ARCH result object): The fitted `GARCH` model result.

    Returns:
        float: The predicted next return, adjusted for predicted volatility.
    """
    if arima_result is None or garch_result is None:
        return 0
    # Predict next value with ARIMA
    arima_pred = arima_result.forecast(steps=1)
    # Predict next volatility with GARCH
    garch_pred = garch_result.forecast(horizon=1)
    next_volatility = garch_pred.variance.iloc[-1, 0]

    # Combine predictions (return + volatility)
    if not isinstance(arima_pred, np.ndarray):
        pred = arima_pred.values[0]
    else:
        pred = arima_pred[0]
    return pred + next_volatility


def get_prediction(window_data: Union[pd.Series, np.ndarray]):
    """
    Orchestrator function to get the next period's return prediction.

    This function ties together the process of fitting
    both `ARIMA` and `GARCH` models on the provided data
    and then predicting the next period's return using these models.

    Args:
        window_data (Union[pd.Series, np.ndarray]):
            Time series data to fit the models and predict the next return.

    Returns:
        float: Predicted next return value.
    """
    arima_result, garch_result = fit_garch(window_data)
    prediction = predict_next_return(arima_result, garch_result)
    return prediction


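# Illustrative sketch (not from the original module): the three helpers above
# chain together -- fit_garch() calls fit_best_arima(), and get_prediction()
# combines both forecasts. A minimal doctest-style example, assuming yfinance
# data access and the column names produced by load_and_prepare_data():
#
# >>> import yfinance as yf
# >>> raw = yf.download("SPY", start="2018-01-01", end="2019-01-01")
# >>> prepared = load_and_prepare_data(raw)
# >>> window = prepared["diff_log_return"].values[-252:]
# >>> get_prediction(window)  # equivalent to predict_next_return(*fit_garch(window))

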
class ArimaGarchModel:
    """
    This class implements a time series model
    that combines `ARIMA (AutoRegressive Integrated Moving Average)`
    and `GARCH (Generalized Autoregressive Conditional Heteroskedasticity)` models
    to predict future returns based on historical price data.

    The model is implemented in the following steps:
    1. Data Preparation: Load and prepare the historical price data.
    2. Modeling: Fit the ARIMA model to the data and then fit the GARCH model to the residuals.
    3. Prediction: Predict the next return using the ARIMA model and the next volatility using the GARCH model.
    4. Trading Strategy: Execute the trading strategy based on the predictions.
    5. Vectorized Backtesting: Backtest the trading strategy using the historical data.

    Example:
        >>> import yfinance as yf
        >>> from bbstrader.tseries import ArimaGarchModel
        >>> from bbstrader.tseries import load_and_prepare_data

        >>> if __name__ == '__main__':
        >>> # ARCH SPY Vectorize Backtest
        >>> k = 252
        >>> data = yf.download("SPY", start="2010-01-02", end="2015-12-31")
        >>> arch = ArimaGarchModel("SPY", data, k=k)
        >>> df = load_and_prepare_data(data)
        >>> arch.show_arima_garch_results(df['diff_log_return'].values[-k:])
        >>> arch.backtest_strategy()
    """

    def __init__(self, symbol, data, k: int = 252):
        """
        Initializes the ArimaGarchStrategy class.

        Args:
            symbol (str): The ticker symbol for the financial instrument.
            data (pd.DataFrame): The raw dataset containing at least the 'Close' prices.
            k (int): The window size for rolling prediction in backtesting.
        """
        self.symbol = symbol
        self.data = self.load_and_prepare_data(data)
        self.k = k

    # Step 1: Data Preparation
    def load_and_prepare_data(self, df):
        """
        Prepares the dataset by calculating logarithmic returns
        and differencing if necessary.

        Args:
            df (pd.DataFrame): The raw dataset containing at least the 'Close' prices.

        Returns:
            pd.DataFrame: The dataset with additional columns
                for log returns and differenced log returns.
        """
        return load_and_prepare_data(df)

    # Step 2: Modeling (ARIMA + GARCH)
    def fit_best_arima(self, window_data):
        """
        Fits the ARIMA model to the provided window of data,
        selecting the best model based on AIC.

        Args:
            window_data (np.array): The dataset for a specific window period.

        Returns:
            ARIMA model: The best fitted ARIMA model based on AIC.
        """
        return fit_best_arima(window_data)

    def fit_garch(self, window_data):
        """
        Fits the GARCH model to the residuals of the best ARIMA model.

        Args:
            window_data (np.array): The dataset for a specific window period.

        Returns:
            tuple: Contains the ARIMA result and GARCH result.
        """
        return fit_garch(window_data)

    def show_arima_garch_results(self, window_data, acf=True, test_resid=True):
        """
        Displays the ARIMA and GARCH model results, including plotting
        the ACF of residuals and conducting Box-Pierce and Ljung-Box tests.

        Args:
            window_data (np.array): The dataset for a specific window period.
            acf (bool, optional): If True, plot the ACF of residuals. Defaults to True.

            test_resid (bool, optional):
                If True, conduct Box-Pierce and Ljung-Box tests on residuals. Defaults to True.
        """
        arima_result = self.fit_best_arima(window_data)
        resid = np.asarray(arima_result.resid)
        resid = resid[~(np.isnan(resid) | np.isinf(resid))]
        garch_model = arch_model(resid, p=1, q=1, rescale=False)
        garch_result = garch_model.fit(disp="off")
        residuals = garch_result.resid

        # TODO : Plot the ACF of the residuals
        if acf:
            fig = plt.figure(figsize=(12, 8))
            # Plot the ACF of ARIMA residuals
            ax1 = fig.add_subplot(211, ylabel="ACF")
            plot_acf(resid, alpha=0.05, ax=ax1, title="ACF of ARIMA Residuals")
            ax1.set_xlabel("Lags")
            ax1.grid(True)

            # Plot the ACF of GARCH residuals on the same axes
            ax2 = fig.add_subplot(212, ylabel="ACF")
            plot_acf(residuals, alpha=0.05, ax=ax2, title="ACF of GARCH Residuals")
            ax2.set_xlabel("Lags")
            ax2.grid(True)

            # Plot the figure
            plt.tight_layout()
            plt.show()

        # TODO : Conduct Box-Pierce and Ljung-Box Tests of the residuals
        if test_resid:
            print(arima_result.summary())
            print(garch_result.summary())
            bp_test = acorr_ljungbox(resid, return_df=True)
            print("Box-Pierce and Ljung-Box Tests Results for ARIMA:\n", bp_test)

    # Step 3: Prediction
    def predict_next_return(self, arima_result, garch_result):
        """
        Predicts the next return using the ARIMA model
        and the next volatility using the GARCH model.

        Args:
            arima_result (ARIMA model): The ARIMA model result.
            garch_result (GARCH model): The GARCH model result.

        Returns:
            float: The predicted next return.
        """
        return predict_next_return(arima_result, garch_result)

    def get_prediction(self, window_data):
        """
        Generates a prediction for the next return based on a window of data.

        Args:
            window_data (np.array): The dataset for a specific window period.

        Returns:
            float: The predicted next return.
        """
        return get_prediction(window_data)

    def calculate_signals(self, window_data):
        """
        Calculates the trading signal based on the prediction.

        Args:
            window_data (np.array): The dataset for a specific window period.

        Returns:
            str: The trading signal ('LONG', 'SHORT', or None).
        """
        prediction = self.get_prediction(window_data)
        if prediction > 0:
            signal = "LONG"
        elif prediction < 0:
            signal = "SHORT"
        else:
            signal = None
        return signal

    # Step 4: Trading Strategy

    def execute_trading_strategy(self, predictions):
        """
        Executes the trading strategy based on a list
        of predictions, determining positions to take.

        Args:
            predictions (list): A list of predicted returns.

        Returns:
            list: A list of positions (1 for 'LONG', -1 for 'SHORT', 0 for 'HOLD').
        """
        positions = []  # Long if 1, Short if -1
        previous_position = 0  # Initial position
        for prediction in predictions:
            if prediction > 0:
                current_position = 1  # Long
            elif prediction < 0:
                current_position = -1  # Short
            else:
                current_position = previous_position  # Hold previous position
            positions.append(current_position)
            previous_position = current_position

        return positions

    # Step 5: Vectorized Backtesting
    def generate_predictions(self):
        """
        Generator that yields predictions one by one.
        """
        data = self.data
        window_size = self.k
        for i in range(window_size, len(data)):
            print(
                f"Processing window {i - window_size + 1}/{len(data) - window_size}..."
            )
            window_data = data["diff_log_return"].iloc[i - window_size : i]
            next_return = self.get_prediction(window_data)
            yield next_return

    def backtest_strategy(self):
        """
        Performs a backtest of the strategy over
        the entire dataset, plotting cumulative returns.
        """
        data = self.data
        window_size = self.k
        print(
            f"Starting backtesting for {self.symbol}\n"
            f"Window size {window_size}.\n"
            f"Total iterations: {len(data) - window_size}.\n"
        )
        predictions_generator = self.generate_predictions()

        positions = self.execute_trading_strategy(predictions_generator)

        strategy_returns = (
            np.array(positions[:-1]) * data["log_return"].iloc[window_size + 1 :].values
        )
        buy_and_hold = data["log_return"].iloc[window_size + 1 :].values
        buy_and_hold_returns = np.cumsum(buy_and_hold)
        cumulative_returns = np.cumsum(strategy_returns)
        dates = data.index[window_size + 1 :]
        self.plot_cumulative_returns(cumulative_returns, buy_and_hold_returns, dates)

        print("\nBacktesting completed !!")

    # Function to plot the cumulative returns
    def plot_cumulative_returns(self, strategy_returns, buy_and_hold_returns, dates):
        """
        Plots the cumulative returns of the ARIMA+GARCH strategy against
        a buy-and-hold strategy.

        Args:
            strategy_returns (np.array): Cumulative returns from the strategy.
            buy_and_hold_returns (np.array): Cumulative returns from a buy-and-hold strategy.
            dates (pd.Index): The dates corresponding to the returns.
        """
        plt.figure(figsize=(14, 7))
        plt.plot(dates, strategy_returns, label="ARIMA+GARCH ", color="blue")
        plt.plot(dates, buy_and_hold_returns, label="Buy & Hold", color="red")
        plt.xlabel("Time")
        plt.ylabel("Cumulative Returns")
        plt.title(f"ARIMA+GARCH Strategy vs. Buy & Hold on ({self.symbol})")
        plt.legend()
        plt.grid(True)
        plt.show()


# *********************************************
# STATS TEST (Cointegration, Mean Reverting)  *
# *********************************************
def get_corr(tickers: Union[List[str], Tuple[str, ...]], start: str, end: str) -> None:
    """
    Calculates and prints the correlation matrix of the adjusted closing prices
    for a given list of stock tickers within a specified date range.

    Args:
        tickers (Union[List[str], Tuple[str, ...]]):
            A list or tuple of valid stock tickers (e.g., ['AAPL', 'MSFT', 'GOOG']).
        start (str): The start date for the historical data in 'YYYY-MM-DD' format.
        end (str): The end date for the historical data in 'YYYY-MM-DD' format.

    Example:
        >>> from bbstrader.tseries import get_corr
        >>> get_corr(['AAPL', 'MSFT', 'GOOG'], '2023-01-01', '2023-12-31')
    """
    # Download historical data
    data = yf.download(tickers, start=start, end=end, multi_level_index=False)[
        "Adj Close"
    ]

    # Calculate correlation matrix
    correlation_matrix = data.corr()

    # Display the matrix
    print(correlation_matrix)


def plot_price_series(df: pd.DataFrame, ts1: str, ts2: str):
    """
    Plot both time series on the same line graph for
    the specified date range.

    Args:
        df (pd.DataFrame):
            The DataFrame containing prices for each series
        ts1 (str): The first time series column name
        ts2 (str): The second time series column name
    """
    fig, ax = plt.subplots()
    ax.plot(df.index, df[ts1], label=ts1)
    ax.plot(df.index, df[ts2], label=ts2)

    fig.autofmt_xdate()
    plt.xlabel("Month/Year")
    plt.ylabel("Price ($)")
    plt.title(f"{ts1} and {ts2} Daily Prices ")
    plt.legend()
    plt.show()


def plot_scatter_series(df: pd.DataFrame, ts1: str, ts2: str):
    """
    Plot a scatter plot of both time series
    via the provided DataFrame.

    Args:
        df (pd.DataFrame):
            The DataFrame containing prices for each series
        ts1 (str): The first time series column name
        ts2 (str): The second time series column name
    """
    plt.xlabel(f"{ts1} Price ($)")
    plt.ylabel(f"{ts2} Price ($)")
    plt.title(f"{ts1} and {ts2} Price Scatterplot")
    plt.scatter(df[ts1], df[ts2])

    # Plot the regression line
    plt.plot(
        df[ts1],
        results.fittedvalues,
        linestyle="--",
        color="red",
        linewidth=2,
        label="Regression Line",
    )
    plt.legend()
    plt.show()


def plot_residuals(df: pd.DataFrame):
    """
    Plot the residuals of the OLS procedure for both
    time series.

    Args:
        df (pd.DataFrame):
            The DataFrame containing prices for each series
    """
    fig, ax = plt.subplots()
    ax.plot(df.index, df["res"], label="Residuals")

    fig.autofmt_xdate()
    plt.xlabel("Month/Year")
    plt.ylabel("Price ($)")
    plt.title("Residual Plot")
    plt.legend()
    plt.show()


def run_cadf_test(
    pair: Union[List[str], Tuple[str, ...]],
    start: str,
    end: str,
) -> None:
    """
    Performs the Cointegration Augmented Dickey-Fuller (CADF) test on a pair of stock tickers
    over a specified date range to check for cointegration.

    The function downloads historical adjusted closing prices for the specified pair of stock tickers,
    calculates the optimal hedge ratio (beta) using Ordinary Least Squares (OLS) regression, plots the
    time series and their residuals, and finally performs the CADF test on the residuals.

    Args:
        pair (List[str] or Tuple[str, ...]):
            A list or tuple containing two valid stock tickers (e.g., ['AAPL', 'MSFT']).
        start (str): The start date for the historical data in 'YYYY-MM-DD' format.
        end (str): The end date for the historical data in 'YYYY-MM-DD' format.

    Example:
        >>> from bbstrader.tseries import run_cadf_test
        >>> run_cadf_test(['AAPL', 'MSFT'], '2023-01-01', '2023-12-31')
        >>> Regression Metrics:
        >>> Optimal Hedge Ratio (Beta): 2.2485845594120333
        >>> Result Params:

        >>> const   -74.418034
        >>> AAPL      2.248585
        >>> dtype: float64

        >>> Regression Summary:
        >>>                             OLS Regression Results
        >>> ==============================================================================
        >>> Dep. Variable:                   MSFT   R-squared:                       0.900
        >>> Model:                            OLS   Adj. R-squared:                  0.900
        >>> Method:                 Least Squares   F-statistic:                     2244.
        >>> Date:                Sat, 20 Jul 2024   Prob (F-statistic):          2.95e-126
        >>> Time:                        13:36:58   Log-Likelihood:                -996.45
        >>> No. Observations:                 250   AIC:                             1997.
        >>> Df Residuals:                     248   BIC:                             2004.
        >>> Df Model:                           1
        >>> Covariance Type:            nonrobust
        >>> ==============================================================================
        >>>                  coef    std err          t      P>|t|      [0.025      0.975]
        >>> ------------------------------------------------------------------------------
        >>> const        -74.4180      8.191     -9.085      0.000     -90.551     -58.286
        >>> AAPL           2.2486      0.047     47.369      0.000       2.155       2.342
        >>> ==============================================================================
        >>> Omnibus:                        4.923   Durbin-Watson:                   0.121
        >>> Prob(Omnibus):                  0.085   Jarque-Bera (JB):                4.862
        >>> Skew:                           0.342   Prob(JB):                       0.0879
        >>> Kurtosis:                       2.993   Cond. No.                     1.71e+03
        >>> ==============================================================================

        >>> Notes:
        >>> [1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
        >>> [2] The condition number is large, 1.71e+03. This might indicate that there are
        >>> strong multicollinearity or other numerical problems.

        >>> Cointegration TEST Results:
        >>> (np.float64(-3.204126144947765),
        >>>  np.float64(0.019747080611767602),
        >>>  0,
        >>>  249,
        >>>  {'1%': np.float64(-3.4568881317725864),
        >>>   '10%': np.float64(-2.5729936189738876),
        >>>   '5%': np.float64(-2.8732185133016057)},
        >>>  np.float64(1364.3866758546171))
    """
    # Download historical data for required stocks
    p0, p1 = pair[0], pair[1]
    _p0 = yf.download(
        p0,
        start=start,
        end=end,
        progress=False,
        multi_level_index=False,
        auto_adjust=True,
    )
    _p1 = yf.download(
        p1,
        start=start,
        end=end,
        progress=False,
        multi_level_index=False,
        auto_adjust=True,
    )
    df = pd.DataFrame(index=_p0.index)
    df[p0] = _p0["Adj Close"]
    df[p1] = _p1["Adj Close"]
    df = df.dropna()

    # Calculate optimal hedge ratio "beta"
    # using statsmodels OLS
    X = sm.add_constant(df[p0])
    y = df[p1]
    model = sm.OLS(y, X)
    global results
    results = model.fit()
    beta_hr = results.params[p0]

    # Plot the two time series with regression line
    plot_price_series(df, p0, p1)

    # Display a scatter plot of the two time series
    # with regression line
    plot_scatter_series(df, p0, p1)

    # Calculate the residuals of the linear combination
    df["res"] = results.resid
    plot_residuals(df)

    # Display regression metrics
    print("\nRegression Metrics:")
    print(f"Optimal Hedge Ratio (Beta): {beta_hr}")
    print("Result Params: \n")
    print(results.params)
    print("\nRegression Summary:")
    print(results.summary())

    # Calculate and output the CADF test on the residuals
    print("\nCointegration TEST Results:")
    cadf = ts.adfuller(df["res"], autolag="AIC")
    pprint.pprint(cadf)


def _hurst(ts):
    """
    Returns the Hurst Exponent of the time series vector ts.
    """
    # Create the range of lag values
    lags = range(2, 100)

    # Calculate the array of the variances of the lagged differences
    tau = [np.sqrt(np.std(np.subtract(ts[lag:], ts[:-lag]))) for lag in lags]

    # Use a linear fit to estimate the Hurst Exponent
    poly = np.polyfit(np.log(lags), np.log(tau), 1)

    # Return the Hurst exponent from the polyfit output
    return poly[0] * 2.0


# Function to calculate Hurst Exponent


def hurst(time_series):
    H, c, data_range = compute_Hc(time_series, kind="price", simplified=True)
    return H


def run_hurst_test(symbol: str, start: str, end: str):
    """
    Calculates and prints the Hurst Exponent for a given stock's adjusted closing prices
    within a specified date range, and for three generated series (Geometric Brownian Motion,
    Mean-Reverting, and Trending).

    The Hurst Exponent is used to determine the long-term memory of a time series.

    Args:
        symbol (str): A valid stock ticker symbol (e.g., 'AAPL').
        start (str): The start date for the historical data in 'YYYY-MM-DD' format.
        end (str): The end date for the historical data in 'YYYY-MM-DD' format.

    Example:
        >>> from bbstrader.tseries import run_hurst_test

        >>> run_hurst_test('AAPL', '2023-01-01', '2023-12-31')
    """
    data = yf.download(
        symbol,
        start=start,
        end=end,
        progress=False,
        multi_level_index=False,
        auto_adjust=True,
    )

    # Create a Geometric Brownian Motion, Mean-Reverting, and Trending Series
    gbm = np.log(np.cumsum(np.random.randn(100000)) + 1000)
    mr = np.log(np.random.randn(100000) + 1000)
    tr = np.log(np.cumsum(np.random.randn(100000) + 1) + 1000)

    # Output the Hurst Exponent for each of the series
    print(f"\nHurst(GBM): {_hurst(gbm)}")
    print(f"Hurst(MR): {_hurst(mr)}")
    print(f"Hurst(TR): {_hurst(tr)}")
    print(f"Hurst({symbol}): {hurst(data['Adj Close'])}\n")


def test_cointegration(ticker1, ticker2, start, end):
    # Download historical data
    stock_data_pair = yf.download(
        [ticker1, ticker2],
        start=start,
        end=end,
        progress=False,
        multi_level_index=False,
        auto_adjust=True,
    )["Adj Close"].dropna()

    # Perform Johansen cointegration test
    result = coint_johansen(stock_data_pair, det_order=0, k_ar_diff=1)

    # Get the cointegration rank
    traces_stats = result.lr1
    print(f"\nTraces Stats: \n{traces_stats}")

    # Get the critical values for 95% confidence level
    critical_values = result.cvt
    print(f"\nCritical Values: \n{critical_values}")

    # Compare the cointegration rank with critical values
    if traces_stats[0] > critical_values[:, 1].all():
        print(f"\n{ticker1} and {ticker2} are cointegrated.\n")
    else:
        print(f"\nNo cointegration found for {ticker1} and {ticker2}.\n")


def run_coint_test(tickers: List[str], start: str, end: str) -> None:
    """
    Performs pairwise cointegration tests on a list of stock tickers over a specified date range.

    For each unique pair of tickers, the function downloads historical adjusted closing prices and
    tests for cointegration.

    Args:
        tickers (List[str]): A list of valid stock ticker symbols (e.g., ['AAPL', 'MSFT', 'GOOG']).
        start (str): The start date for the historical data in 'YYYY-MM-DD' format.
        end (str): The end date for the historical data in 'YYYY-MM-DD' format.

    Example:
        >>> from bbstrader.tseries import run_coint_test

        >>> run_coint_test(['AAPL', 'MSFT', 'GOOG'], '2023-01-01', '2023-12-31')
    """
    # Loop through ticker combinations
    for ticker1, ticker2 in combinations(tickers, 2):
        test_cointegration(ticker1, ticker2, start, end)


# *********************************
# KALMAN FILTER                   *
# *********************************
def draw_date_coloured_scatterplot(etfs, prices):
    """
    Create a scatterplot of the two ETF prices, which is
    coloured by the date of the price to indicate the
    changing relationship between the sets of prices
    """
    plen = len(prices)
    colour_map = plt.cm.get_cmap("YlOrRd")
    colours = np.linspace(0.1, 1, plen)

    scatterplot = plt.scatter(
        prices[etfs[0]],
        prices[etfs[1]],
        s=30,
        c=colours,
        cmap=colour_map,
        edgecolor="k",
        alpha=0.8,
    )

    colourbar = plt.colorbar(scatterplot)
    colourbar.ax.set_yticklabels([str(p.date()) for p in prices[:: plen // 9].index])

    plt.xlabel(prices.columns[0])
    plt.ylabel(prices.columns[1])
    plt.show()


def calc_slope_intercept_kalman(etfs, prices):
    """
    Utilize the Kalman Filter from the filterpy library
    to calculate the slope and intercept of the regressed
    ETF prices.
    """
    delta = 1e-5
    trans_cov = delta / (1 - delta) * np.eye(2)

    kf = KalmanFilter(dim_x=2, dim_z=1)
    kf.x = np.zeros((2, 1))  # Initial state
    kf.P = np.ones((2, 2)) * 1000.0  # Initial covariance,
    # large to represent high uncertainty
    kf.F = np.eye(2)  # State transition matrix
    kf.Q = trans_cov  # Process noise covariance
    kf.R = 1.0  # Scalar measurement noise covariance

    state_means, state_covs = [], []
    for time, z in enumerate(prices[etfs[1]].values):
        # Dynamically update the observation matrix H
        # to include the current independent variable
        kf.H = np.array([[prices[etfs[0]][time], 1.0]])
        kf.predict()
        kf.update(z)
        state_means.append(kf.x.copy())
        state_covs.append(kf.P.copy())

    return np.array(state_means), np.array(state_covs)


def draw_slope_intercept_changes(prices, state_means):
    """
    Plot the slope and intercept of the regressed ETF prices
    between the two ETFs, with the changing values of the
    Kalman Filter over time.
    """
    print(f"First Slope : {state_means[0, 0]}")
    print(f"First intercept : {state_means[0, 1]}")
    pd.DataFrame(
        {
            "slope": state_means[:, 0].flatten(),
            "intercept": state_means[:, 1].flatten(),
        },
        index=prices.index,
    ).plot(subplots=True)
    plt.show()


def run_kalman_filter(
    etfs: Union[List[str], Tuple[str, ...]], start: str, end: str
) -> None:
    """
    Applies a Kalman filter to a pair of assets' adjusted closing prices within a specified date range
    to estimate the slope and intercept over time.

    The function downloads historical adjusted closing prices for the specified pair of assets,
    visualizes their price relationship, calculates the Kalman filter estimates for the slope and
    intercept, and visualizes the changes in these estimates over time.

    Args:
        etfs (Union[List[str], Tuple[str, ...]]):
            A list or tuple containing two valid asset tickers (e.g., ['SPY', 'QQQ']).
        start (str): The start date for the historical data in 'YYYY-MM-DD' format.
        end (str): The end date for the historical data in 'YYYY-MM-DD' format.

    Example:
        >>> from bbstrader.tseries import run_kalman_filter

        >>> run_kalman_filter(['SPY', 'QQQ'], '2023-01-01', '2023-12-31')
    """
    etf_df1 = yf.download(
        etfs[0], start, end, progress=False, multi_level_index=False, auto_adjust=True
    )
    etf_df2 = yf.download(
        etfs[1], start, end, progress=False, multi_level_index=False, auto_adjust=True
    )

    prices = pd.DataFrame(index=etf_df1.index)
    prices[etfs[0]] = etf_df1["Adj Close"]
    prices[etfs[1]] = etf_df2["Adj Close"]

    draw_date_coloured_scatterplot(etfs, prices)
    state_means, state_covs = calc_slope_intercept_kalman(etfs, prices)
    draw_slope_intercept_changes(prices, state_means)


class KalmanFilterModel:
    """
    Implements a Kalman Filter model, a recursive algorithm used for estimating
    the state of a linear dynamic system from a series of noisy measurements.
    It's designed to process market data, estimate dynamic parameters such as
    the slope and intercept of price relationships, and the
    forecast error and standard deviation of the predictions.

    You can learn more here https://en.wikipedia.org/wiki/Kalman_filter
    """

    def __init__(self, tickers: List | Tuple, **kwargs):
        """
        Initializes the Kalman Filter strategy.

        Args:
            tickers :
                A list or tuple of ticker symbols representing financial instruments.

            kwargs : Keyword arguments for additional parameters,
                specifically `delta` and `vt`
        """
        self.tickers = tickers
        assert self.tickers is not None

        self.R = None
        self.theta = np.zeros(2)
        self.P = np.zeros((2, 2))
        self.delta = kwargs.get("delta", 1e-4)
        self.vt = kwargs.get("vt", 1e-3)
        self.wt = self.delta / (1 - self.delta) * np.eye(2)
        self.latest_prices = np.array([-1.0, -1.0])
        self.kf = self._init_kalman()

    def _init_kalman(self):
        """
        Initializes and returns a Kalman Filter configured
        for the trading strategy. The filter is set up with initial
        state and covariance, state transition matrix, process noise
        and measurement noise covariances.
        """
        kf = KalmanFilter(dim_x=2, dim_z=1)
        kf.x = np.zeros((2, 1))  # Initial state
        kf.P = self.P  # Initial covariance
        kf.F = np.eye(2)  # State transition matrix
        kf.Q = self.wt  # Process noise covariance
        kf.R = 1.0  # Scalar measurement noise covariance

        return kf

    Array = np.ndarray

    def calc_slope_intercep(self, prices: Array) -> Tuple:
        """
        Calculates and returns the slope and intercept
        of the relationship between the provided prices using the Kalman Filter.
        This method updates the filter with the latest price and returns
        the estimated slope and intercept.

        Args:
            prices : A numpy array of prices for two financial instruments.

        Returns:
            A tuple containing the slope and intercept of the relationship
        """
        self.kf.H = np.array([[prices[1], 1.0]])
        self.kf.predict()
        self.kf.update(prices[1])
        slope = self.kf.x.copy().flatten()[0]
        intercept = self.kf.x.copy().flatten()[1]

        return slope, intercept

    def calculate_etqt(self, prices: Array) -> Tuple:
        """
        Calculates the ``forecast error`` and ``standard deviation`` of the predictions
        using the Kalman Filter.

        Args:
            prices : A numpy array of prices for two financial instruments.

        Returns:
            A tuple containing the ``forecast error`` and ``standard deviation`` of the predictions.
        """

        self.latest_prices[0] = prices[0]
        self.latest_prices[1] = prices[1]

        if all(self.latest_prices > -1.0):
            slope, intercept = self.calc_slope_intercep(self.latest_prices)

            self.theta[0] = slope
            self.theta[1] = intercept

            # Create the observation matrix of the latest prices
            # of Y and the intercept value (1.0) as well as the
            # scalar value of the latest price from X
            F = np.asarray([self.latest_prices[0], 1.0]).reshape((1, 2))
            y = self.latest_prices[1]

            # The prior value of the states {\theta_t} is
            # distributed as a multivariate Gaussian with
            # mean a_t and variance-covariance {R_t}
            if self.R is not None:
                self.R = self.C + self.wt
            else:
                self.R = np.zeros((2, 2))

            # Calculate the Kalman Filter update
            # ---------------------------------
            # Calculate prediction of new observation
            # as well as forecast error of that prediction
            yhat = F.dot(self.theta)
            et = y - yhat

            # {Q_t} is the variance of the prediction of
            # observations and hence sqrt_Qt is the
            # standard deviation of the predictions
            Qt = F.dot(self.R).dot(F.T) + self.vt
            sqrt_Qt = np.sqrt(Qt)

            # The posterior value of the states {\theta_t} is
            # distributed as a multivariate Gaussian with mean
            # {m_t} and variance-covariance {C_t}
            At = self.R.dot(F.T) / Qt
            self.theta = self.theta + At.flatten() * et
            self.C = self.R - At * F.dot(self.R)
            return (et[0], sqrt_Qt.flatten()[0])
        else:
            return None


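# Illustrative sketch (not from the original module): feeding a stream of price
# pairs to KalmanFilterModel and reading the forecast error / prediction standard
# deviation as a simple mean-reversion signal. The price arrays and the +/- one
# standard-deviation thresholds are assumptions, not part of the API.
#
# >>> prices_x, prices_y = ...  # two aligned price series for the pair
# >>> kfm = KalmanFilterModel(["X", "Y"], delta=1e-4, vt=1e-3)
# >>> for px, py in zip(prices_x, prices_y):
# ...     out = kfm.calculate_etqt(np.array([px, py]))
# ...     if out is None:
# ...         continue
# ...     et, sqrt_qt = out
# ...     if et > sqrt_qt:       # spread looks rich: short Y, long X
# ...         pass
# ...     elif et < -sqrt_qt:    # spread looks cheap: long Y, short X
# ...         pass

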
# ******************************************
# ORNSTEIN UHLENBECK PROCESS               *
# ******************************************


class OrnsteinUhlenbeck:
    """
    The Ornstein-Uhlenbeck process is a mathematical model
    used to describe the behavior of a mean-reverting stochastic process.
    We use it to model the price dynamics of an asset that tends
    to revert to a long-term mean.

    We estimate the drift (θ), volatility (σ), and long-term mean (μ)
    based on historical price data; then we simulate the OU process
    using the estimated parameters.

    https://en.wikipedia.org/wiki/Ornstein%E2%80%93Uhlenbeck_process
    """

    def __init__(self, prices: np.ndarray, returns: bool = True, timeframe: str = "D1"):
        """
        Initializes the OrnsteinUhlenbeck instance.

        Args:
            prices (np.ndarray) : Historical close prices.

            returns (bool) : Use it to indicate whether
                you want to simulate the returns or your raw data

            timeframe (str) : The time frame for the historical prices
                (1m, 5m, 15m, 30m, 1h, 4h, D1)
        """
        self.prices = prices
        if returns:
            series = pd.Series(self.prices)
            self.returns = series.pct_change().dropna().values
        else:
            self.returns = self.prices

        time_frame_mapping = {
            "1m": 1 / (24 * 60),  # 1 minute intervals
            "5m": 5 / (24 * 60),  # 5 minute intervals
            "15m": 15 / (24 * 60),  # 15 minute intervals
            "30m": 30 / (24 * 60),  # 30 minute intervals
            "1h": 1 / 24,  # 1 hour intervals
            "4h": 4 / 24,  # 4 hour intervals
            "D1": 1,  # Daily intervals
        }
        if timeframe not in time_frame_mapping:
            raise ValueError("Unsupported time frame")
        self.tf = time_frame_mapping[timeframe]

        params = self.estimate_parameters()
        self.mu_hat = params[0]  # Mean (μ)
        self.theta_hat = params[1]  # Drift (θ)
        self.sigma_hat = params[2]  # Volatility (σ)
        print(f"Estimated μ: {self.mu_hat}")
        print(f"Estimated θ: {self.theta_hat}")
        print(f"Estimated σ: {self.sigma_hat}")

    def ornstein_uhlenbeck(self, mu, theta, sigma, dt, X0, n):
        """
        Simulates the Ornstein-Uhlenbeck process.

        Args:
            mu (float): Estimated long-term mean.
            theta (float): Estimated drift.
            sigma (float): Estimated volatility.
            dt (float): Time step.
            X0 (float): Initial value.
            n (int): Number of time steps.

        Returns:
            np.ndarray : Simulated process.
        """
        x = np.zeros(n)
        x[0] = X0
        for t in range(1, n):
            dW = np.random.normal(loc=0, scale=np.sqrt(dt))
            # O-U process differential equation
            x[t] = x[t - 1] + (theta * (mu - x[t - 1]) * dt) + (sigma * dW)
            # dW is a Wiener process
            # (theta * (mu - x[t-1]) * dt) represents the mean-reverting tendency
            # (sigma * dW) represents the random volatility
        return x

    def estimate_parameters(self):
        """
        Estimates the mean-reverting parameters (μ, θ, σ)
        using the negative log-likelihood.

        Returns:
            Tuple: Estimated μ, θ, and σ.
        """
        initial_guess = [0, 0.1, np.std(self.returns)]
        result = minimize(self._neg_log_likelihood, initial_guess, args=(self.returns,))
        mu, theta, sigma = result.x
        return mu, theta, sigma

    def _neg_log_likelihood(self, params, returns):
        """
        Calculates the negative
        log-likelihood for parameter estimation.

        Args:
            params (list): List of parameters [mu, theta, sigma].
            returns (np.ndarray): Historical returns.

        Returns:
            float: Negative log-likelihood.
        """
        mu, theta, sigma = params
        dt = self.tf
        n = len(returns)
        ou_simulated = self.ornstein_uhlenbeck(mu, theta, sigma, dt, 0, n + 1)
        residuals = ou_simulated[1 : n + 1] - returns
        neg_ll = 0.5 * np.sum(residuals**2) / sigma**2 + 0.5 * n * np.log(
            2 * np.pi * sigma**2
        )
        return neg_ll

    def simulate_process(self, returns=None, n=100, p=None):
        """
        Simulates the OU process multiple times.

        Args:
            returns (np.ndarray): Historical returns.
            n (int): Number of simulations to perform.
            p (int): Number of time steps.

        Returns:
            np.ndarray: 2D array representing simulated processes.
        """
        if returns is None:
            returns = self.returns
        if p is not None:
            T = p
        else:
            T = len(returns)
        dt = self.tf

        dW_matrix = np.random.normal(loc=0, scale=np.sqrt(dt), size=(n, T))
        simulations_matrix = np.zeros((n, T))
        simulations_matrix[:, 0] = returns[-1]

        for t in range(1, T):
            simulations_matrix[:, t] = (
                simulations_matrix[:, t - 1]
                + self.theta_hat * (self.mu_hat - simulations_matrix[:, t - 1]) * dt
                + self.sigma_hat * dW_matrix[:, t]
            )
        return simulations_matrix


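# Illustrative sketch (not from the original module): estimating OU parameters
# from daily closes and simulating forward paths. The ticker, date range, and
# simulation horizon are assumptions.
#
# >>> import yfinance as yf
# >>> closes = yf.download("SPY", start="2020-01-01", end="2021-01-01")["Close"].values
# >>> ou = OrnsteinUhlenbeck(closes, returns=True, timeframe="D1")
# >>> paths = ou.simulate_process(n=1000, p=30)  # 1000 simulated 30-step return paths
# >>> paths.shape
# (1000, 30)

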
def remove_correlated_assets(df: pd.DataFrame, cutoff=0.99):
    """
    Removes highly correlated assets from a DataFrame based on a specified correlation cutoff threshold.
    This is useful in financial data analysis to reduce redundancy and multicollinearity in portfolios or datasets.

    Args:
        df (pd.DataFrame): A DataFrame where each column represents an asset
            and rows represent observations (e.g., time-series data).
        cutoff (float, optional, default=0.99): The correlation threshold.
            Columns with absolute correlation greater than this value will be considered for removal.

    Returns:
        pd.DataFrame: A DataFrame with less correlated assets.
            The columns that are highly correlated (above the cutoff) are removed.

    References
    ----------
    Stefan Jansen (2020). Machine Learning for Algorithmic Trading - Second Edition.
    chapter 9, Time-Series Models for Volatility Forecasts and Statistical Arbitrage.

    Example:
        >>> df = pd.DataFrame({
        ...     'AAPL': [100, 101, 102, 103, 104],
        ...     'MSFT': [200, 201, 202, 203, 204],
        ...     'GOOG': [300, 301, 302, 303, 304]
        ... })
        >>> df = remove_correlated_assets(df)
    """
    corr = df.corr().stack()
    corr = corr[corr < 1]
    to_check = corr[corr.abs() > cutoff].index
    keep, drop = set(), set()
    for s1, s2 in to_check:
        if s1 not in keep:
            if s2 not in keep:
                keep.add(s1)
                drop.add(s2)
            else:
                drop.add(s1)
        else:
            keep.discard(s2)
            drop.add(s2)
    return df.drop(drop, axis=1)


def check_stationarity(df: pd.DataFrame):
    """
    Tests the stationarity of time-series data for each asset in the DataFrame
    using the Augmented Dickey-Fuller (ADF) test. Stationarity is a key property
    in time-series analysis, and non-stationary data can affect model performance.

    Args:
        df (pd.DataFrame): A DataFrame where each column represents a time series of an asset.

    Returns:
        pd.DataFrame: A DataFrame containing the ADF p-values for each asset,
            - ticker: Asset name (column name from df).
            - adf: p-value from the ADF test, indicating the probability of the null hypothesis (data is non-stationary).

    References
    ----------
    Stefan Jansen (2020). Machine Learning for Algorithmic Trading - Second Edition.
    chapter 9, Time-Series Models for Volatility Forecasts and Statistical Arbitrage.

    Example:
        >>> df = pd.DataFrame({
        ...     'AAPL': [100, 101, 102, 103, 104],
        ...     'MSFT': [200, 201, 202, 203, 204],
        ...     'GOOG': [300, 301, 302, 303, 304]
        ... })
        >>> df = check_stationarity(df)
    """
    results = []
    for ticker, prices in df.items():
        results.append([ticker, adfuller(prices, regression="ct")[1]])
    return pd.DataFrame(results, columns=["ticker", "adf"]).sort_values("adf")


def remove_stationary_assets(df: pd.DataFrame, pval=0.05):
    """
    Filters out stationary assets from the DataFrame based on the p-value obtained
    from the Augmented Dickey-Fuller test.
    Useful for focusing only on non-stationary time-series data.

    Args:
        df (pd.DataFrame): A DataFrame where each column represents a time series of an asset.
        pval (float, optional, default=0.05): The significance level to determine stationarity.
            Columns with an ADF test p-value below this threshold are considered stationary and removed.

    Returns:
        pd.DataFrame: A DataFrame containing only the non-stationary assets.

    References
    ----------
    Stefan Jansen (2020). Machine Learning for Algorithmic Trading - Second Edition.
    chapter 9, Time-Series Models for Volatility Forecasts and Statistical Arbitrage.

    Example:
        >>> df = pd.DataFrame({
        ...     'AAPL': [100, 101, 102, 103, 104],
        ...     'MSFT': [200, 201, 202, 203, 204],
        ...     'GOOG': [300, 301, 302, 303, 304]
        ... })
        >>> df = remove_stationary_assets(df)
    """
    test_result = check_stationarity(df)
    stationary = test_result.loc[test_result.adf <= pval, "ticker"].tolist()
    return df.drop(stationary, axis=1).sort_index()


def select_assets(df: pd.DataFrame, n=100, start=None, end=None, rolling_window=None):
|
|
1355
|
-
"""
|
|
1356
|
-
Selects the top N assets based on the average trading volume from the input DataFrame.
|
|
1357
|
-
These assets are used as universe in which we can search cointegrated pairs for pairs trading strategies.
|
|
1358
|
-
|
|
1359
|
-
Args:
|
|
1360
|
-
df (pd.DataFrame): A multi-index DataFrame with levels ['ticker', 'date'] containing market data.
|
|
1361
|
-
Must include columns 'close' (price) and 'volume'.
|
|
1362
|
-
n (int, optional): Number of assets to select based on highest average trading volume. Default is 100.
|
|
1363
|
-
start (str, optional): Start date for filtering the data. Default is the earliest date in the DataFrame.
|
|
1364
|
-
end (str, optional): End date for filtering the data. Default is the latest date in the DataFrame.
|
|
1365
|
-
rolling_window (int, optional): Rolling window for calculating the average trading volume. Default is None.
|
|
1366
|
-
|
|
1367
|
-
Returns:
|
|
1368
|
-
pd.DataFrame: A DataFrame of selected assets with filtered, cleaned data, indexed by date.
|
|
1369
|
-
|
|
1370
|
-
References
|
|
1371
|
-
----------
|
|
1372
|
-
Stefan Jansen (2020). Machine Learning for Algorithmic Trading - Second Edition.
|
|
1373
|
-
chapter 9, Time-Series Models for Volatility Forecasts and Statistical Arbitrage.
|
|
1374
|
-
"""
|
|
1375
|
-
required_columns = {"close", "volume"}
|
|
1376
|
-
if not required_columns.issubset(df.columns):
|
|
1377
|
-
raise ValueError(
|
|
1378
|
-
f"Input DataFrame must contain {required_columns}, but got {df.columns.tolist()}."
|
|
1379
|
-
)
|
|
1380
|
-
|
|
1381
|
-
if (
|
|
1382
|
-
not isinstance(df.index, pd.MultiIndex)
|
|
1383
|
-
or "ticker" not in df.index.names
|
|
1384
|
-
or "date" not in df.index.names
|
|
1385
|
-
):
|
|
1386
|
-
raise ValueError("Index must be a MultiIndex with levels ['ticker', 'date'].")
|
|
1387
|
-
|
|
1388
|
-
df = df.copy()
|
|
1389
|
-
idx = pd.IndexSlice
|
|
1390
|
-
start = start or df.index.get_level_values("date").min()
|
|
1391
|
-
end = end or df.index.get_level_values("date").max()
|
|
1392
|
-
df = (
|
|
1393
|
-
df.loc[lambda df: ~df.index.duplicated()]
|
|
1394
|
-
.sort_index()
|
|
1395
|
-
.loc[idx[:, f"{start}" : f"{end}"], :]
|
|
1396
|
-
.assign(dv=lambda df: df.close.mul(df.volume))
|
|
1397
|
-
)
|
|
1398
|
-
|
|
1399
|
-
if rolling_window is None:
|
|
1400
|
-
most_traded = df.groupby(level="ticker").dv.mean().nlargest(n=n).index
|
|
1401
|
-
else:
|
|
1402
|
-
# Calculate the rolling average of dollar volume
|
|
1403
|
-
df["dv_rolling_avg"] = (
|
|
1404
|
-
df.groupby(level=0)
|
|
1405
|
-
.dv.rolling(window=rolling_window, min_periods=1)
|
|
1406
|
-
.mean()
|
|
1407
|
-
.reset_index(level=0, drop=True)
|
|
1408
|
-
)
|
|
1409
|
-
most_traded = df.groupby(level=0)["dv_rolling_avg"].mean().nlargest(n=n).index
|
|
1410
|
-
df = (
|
|
1411
|
-
df.loc[idx[most_traded, :], "close"]
|
|
1412
|
-
.unstack("ticker")
|
|
1413
|
-
.ffill(limit=5)
|
|
1414
|
-
.dropna(axis=1)
|
|
1415
|
-
)
|
|
1416
|
-
df = remove_correlated_assets(df)
|
|
1417
|
-
df = remove_stationary_assets(df)
|
|
1418
|
-
return df.sort_index()
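# A minimal usage sketch for select_assets, assuming synthetic data; the
# tickers, dates and values below are illustrative only and not from the package.
def _example_select_assets():
    import numpy as np
    import pandas as pd

    rng = np.random.default_rng(42)
    dates = pd.date_range("2023-01-02", periods=252, freq="B")
    tickers = ["AAA", "BBB", "CCC"]
    idx = pd.MultiIndex.from_product([tickers, dates], names=["ticker", "date"])
    raw = pd.DataFrame(
        {
            # independent random-walk prices so columns stay non-stationary
            # and are not perfectly correlated
            "close": 100 + rng.standard_normal(len(idx)).cumsum(),
            "volume": rng.integers(1_000, 10_000, size=len(idx)),
        },
        index=idx,
    )
    # keep the 2 most traded tickers, then drop correlated/stationary columns
    universe = select_assets(raw, n=2)
    return universe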
|
|
1419
|
-
|
|
1420
|
-
|
|
1421
|
-
def compute_pair_metrics(security: pd.Series, candidates: pd.DataFrame):
|
|
1422
|
-
"""
|
|
1423
|
-
Calculates statistical and econometric metrics for a target security and a set of candidate securities.
|
|
1424
|
-
These metrics are useful in financial modeling and pairs trading strategies,
|
|
1425
|
-
providing information about drift, volatility, correlation, and cointegration.
|
|
1426
|
-
|
|
1427
|
-
Args:
|
|
1428
|
-
security (pd.Series): A time-series of the target security's prices.
|
|
1429
|
-
The name of the Series should correspond to the security's identifier (e.g., ticker symbol).
|
|
1430
|
-
candidates (pd.DataFrame): A DataFrame where each column represents a time-series of prices
|
|
1431
|
-
for candidate securities to be evaluated against the target security.
|
|
1432
|
-
|
|
1433
|
-
Returns:
|
|
1434
|
-
pd.DataFrame: A DataFrame combining:
|
|
1435
|
-
Drift: Estimated drift of spreads between the target security and each candidate.
|
|
1436
|
-
Volatility: Standard deviation of spreads.
|
|
1437
|
-
Correlation:
|
|
1438
|
-
``corr``: Correlation of normalized prices between the target and each candidate.
|
|
1439
|
-
``corr_ret``: Correlation of returns (percentage change) between the target and each candidate.
|
|
1440
|
-
Cointegration metrics:
|
|
1441
|
-
Engle-Granger test statistics (``t1``, ``t2``) and p-values (``p1``, ``p2``).
|
|
1442
|
-
Johansen test trace statistics (``trace0``, ``trace1``) and selected lag order (``k_ar_diff``).
|
|
1443
|
-
|
|
1444
|
-
References
|
|
1445
|
-
----------
|
|
1446
|
-
Stefan Jansen (2020). Machine Learning for Algorithmic Trading - Second Edition.
|
|
1447
|
-
chapter 9, Time-Series Models for Volatility Forecasts and Statistical Arbitrage.
|
|
1448
|
-
"""
|
|
1449
|
-
security = security.div(security.iloc[0])
|
|
1450
|
-
ticker = security.name
|
|
1451
|
-
candidates = candidates.div(candidates.iloc[0])
|
|
1452
|
-
spreads = candidates.sub(security, axis=0)
|
|
1453
|
-
n, m = spreads.shape
|
|
1454
|
-
X = np.ones(shape=(n, 2))
|
|
1455
|
-
X[:, 1] = np.arange(1, n + 1)
|
|
1456
|
-
|
|
1457
|
-
# compute drift
|
|
1458
|
-
drift = (np.linalg.inv(X.T @ X) @ X.T @ spreads).iloc[1].to_frame("drift")
|
|
1459
|
-
|
|
1460
|
-
# compute volatility
|
|
1461
|
-
vol = spreads.std().to_frame("vol")
|
|
1462
|
-
|
|
1463
|
-
# returns correlation
|
|
1464
|
-
corr_ret = (
|
|
1465
|
-
candidates.pct_change().corrwith(security.pct_change()).to_frame("corr_ret")
|
|
1466
|
-
)
|
|
1467
|
-
|
|
1468
|
-
# normalized price series correlation
|
|
1469
|
-
corr = candidates.corrwith(security).to_frame("corr")
|
|
1470
|
-
metrics = drift.join(vol).join(corr).join(corr_ret).assign(n=n)
|
|
1471
|
-
|
|
1472
|
-
tests = []
|
|
1473
|
-
# run cointegration tests
|
|
1474
|
-
for candidate, prices in tqdm(candidates.items()):
|
|
1475
|
-
df = pd.DataFrame({"s1": security, "s2": prices})
|
|
1476
|
-
var = VAR(df.values)
|
|
1477
|
-
lags = var.select_order() # select VAR order
|
|
1478
|
-
k_ar_diff = lags.selected_orders["aic"]
|
|
1479
|
-
# Johansen Test with constant Term and estd. lag order
|
|
1480
|
-
cj0 = coint_johansen(df, det_order=0, k_ar_diff=k_ar_diff)
|
|
1481
|
-
# Engle-Granger Tests
|
|
1482
|
-
t1, p1 = coint(security, prices, trend="c")[:2]
|
|
1483
|
-
t2, p2 = coint(prices, security, trend="c")[:2]
|
|
1484
|
-
tests.append([ticker, candidate, t1, p1, t2, p2, k_ar_diff, *cj0.lr1])
|
|
1485
|
-
columns = ["s1", "s2", "t1", "p1", "t2", "p2", "k_ar_diff", "trace0", "trace1"]
|
|
1486
|
-
tests = pd.DataFrame(tests, columns=columns).set_index("s2")
|
|
1487
|
-
return metrics.join(tests)
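# A minimal usage sketch for compute_pair_metrics on synthetic price series;
# 'TGT', 'CND1' and 'CND2' are made-up tickers used only for illustration.
def _example_compute_pair_metrics():
    import numpy as np
    import pandas as pd

    rng = np.random.default_rng(0)
    dates = pd.date_range("2023-01-02", periods=250, freq="B")
    target = pd.Series(
        100 + rng.standard_normal(250).cumsum(), index=dates, name="TGT"
    )
    candidates = pd.DataFrame(
        {
            # CND1 tracks the target plus noise, CND2 is an unrelated walk
            "CND1": target.values + rng.normal(0, 1, 250),
            "CND2": 50 + rng.standard_normal(250).cumsum(),
        },
        index=dates,
    )
    # one row per candidate: drift, vol, correlations and cointegration stats
    return compute_pair_metrics(target, candidates)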
|
|
1488
|
-
|
|
1489
|
-
|
|
1490
|
-
__CRITICAL_VALUES = {
|
|
1491
|
-
0: {0.9: 13.4294, 0.95: 15.4943, 0.99: 19.9349},
|
|
1492
|
-
1: {0.9: 2.7055, 0.95: 3.8415, 0.99: 6.6349},
|
|
1493
|
-
}
|
|
1494
|
-
|
|
1495
|
-
|
|
1496
|
-
def find_cointegrated_pairs(
|
|
1497
|
-
securities: pd.DataFrame,
|
|
1498
|
-
candidates: pd.DataFrame,
|
|
1499
|
-
n=None,
|
|
1500
|
-
start=None,
|
|
1501
|
-
stop=None,
|
|
1502
|
-
coint=False,
|
|
1503
|
-
):
|
|
1504
|
-
"""
|
|
1505
|
-
Identifies cointegrated pairs between a target set of securities and candidate securities
|
|
1506
|
-
based on econometric tests. The function evaluates statistical relationships,
|
|
1507
|
-
such as cointegration and Engle-Granger significance, to determine pairs suitable
|
|
1508
|
-
for financial strategies like pairs trading.
|
|
1509
|
-
|
|
1510
|
-
Args:
|
|
1511
|
-
securities (`pd.DataFrame`): A DataFrame where each column represents the time-series
|
|
1512
|
-
prices of target securities to evaluate.
|
|
1513
|
-
candidates (`pd.DataFrame`): A DataFrame where each column represents the time-series
|
|
1514
|
-
prices of candidate securities to compare against the target securities.
|
|
1515
|
-
n (`int`, optional): The number of top pairs to return. If `None`, returns all pairs.
|
|
1516
|
-
start (`str`, optional): Start date for slicing the data (e.g., 'YYYY-MM-DD').
|
|
1517
|
-
stop (`str`, optional): End date for slicing the data (e.g., 'YYYY-MM-DD').
|
|
1518
|
-
coint (`bool`, optional, default=False):
|
|
1519
|
-
- If `True`, filters for pairs identified as cointegrated.
|
|
1520
|
-
- If `False`, returns all evaluated pairs.
|
|
1521
|
-
|
|
1522
|
-
Returns:
|
|
1523
|
-
- ``pd.DataFrame``: A DataFrame containing:
|
|
1524
|
-
- Johansen and Engle-Granger cointegration metrics:
|
|
1525
|
-
- `t1`, `t2`: Engle-Granger test statistics for two directions.
|
|
1526
|
-
- `p1`, `p2`: Engle-Granger p-values for two directions.
|
|
1527
|
-
- `trace0`, `trace1`: Johansen test trace statistics for 0 and 1 cointegration relationships.
|
|
1528
|
-
- Indicators and filters:
|
|
1529
|
-
- `joh_sig`: Indicates Johansen cointegration significance.
|
|
1530
|
-
- `eg_sig`: Indicates Engle-Granger significance (p-value < 0.05).
|
|
1531
|
-
- `s1_dep`: Indicates whether the first series depends on the second (based on p-values).
|
|
1532
|
-
- `coint`: Combined cointegration indicator (Johansen & Engle-Granger).
|
|
1533
|
-
- Spread and ranking:
|
|
1534
|
-
- `t`: Minimum of `t1` and `t2`.
|
|
1535
|
-
- `p`: Minimum of `p1` and `p2`.
|
|
1536
|
-
References
|
|
1537
|
-
----------
|
|
1538
|
-
Stefan Jansen (2020). Machine Learning for Algorithmic Trading - Second Edition.
|
|
1539
|
-
chapter 9, Time-Series Models for Volatility Forecasts and Statistical Arbitrage.
|
|
1540
|
-
|
|
1541
|
-
Example:
|
|
1542
|
-
>>> import pandas as pd
|
|
1543
|
-
|
|
1544
|
-
>>> # Sample Data
|
|
1545
|
-
>>> data_securities = {
|
|
1546
|
-
... 'Security1': [100, 102, 101, 103, 105],
|
|
1547
|
-
... 'Security2': [50, 52, 53, 51, 54]
|
|
1548
|
-
... }
|
|
1549
|
-
>>> data_candidates = {
|
|
1550
|
-
... 'Candidate1': [100, 101, 99, 102, 104],
|
|
1551
|
-
... 'Candidate2': [200, 202, 201, 203, 205]
|
|
1552
|
-
... }
|
|
1553
|
-
|
|
1554
|
-
>>> securities = pd.DataFrame(data_securities, index=pd.date_range('2023-01-01', periods=5))
|
|
1555
|
-
>>> candidates = pd.DataFrame(data_candidates, index=pd.date_range('2023-01-01', periods=5))
|
|
1556
|
-
|
|
1557
|
-
>>> # Find cointegrated pairs
|
|
1558
|
-
>>> top_pairs = find_cointegrated_pairs(securities, candidates, n=2, coint=True)
|
|
1559
|
-
>>> print(top_pairs)
|
|
1560
|
-
|
|
1561
|
-
>>> | s1 | s2 | t | p | joh_sig | eg_sig | coint |
|
|
1562
|
-
>>> |----------|-----------|------|-------|---------|--------|-------|
|
|
1563
|
-
>>> | Security1| Candidate1| -3.5 | 0.01 | 1 | 1 | 1 |
|
|
1564
|
-
>>> | Security2| Candidate2| -2.9 | 0.04 | 1 | 1 | 1 |
|
|
1565
|
-
"""
|
|
1566
|
-
trace0_cv = __CRITICAL_VALUES[0][
|
|
1567
|
-
0.95
|
|
1568
|
-
] # critical value for 0 cointegration relationships
|
|
1569
|
-
# critical value for 1 cointegration relationship
|
|
1570
|
-
trace1_cv = __CRITICAL_VALUES[1][0.95]
|
|
1571
|
-
spreads = []
|
|
1572
|
-
if start is not None and stop is not None:
|
|
1573
|
-
securities = securities.loc[str(start) : str(stop), :]
|
|
1574
|
-
candidates = candidates.loc[str(start) : str(stop), :]
|
|
1575
|
-
for i, (ticker, prices) in enumerate(securities.items(), 1):
|
|
1576
|
-
try:
|
|
1577
|
-
df = compute_pair_metrics(prices, candidates)
|
|
1578
|
-
spreads.append(df.set_index("s1", append=True))
|
|
1579
|
-
except np.linalg.LinAlgError:
|
|
1580
|
-
continue
|
|
1581
|
-
spreads = pd.concat(spreads)
|
|
1582
|
-
spreads.index.names = ["s2", "s1"]
|
|
1583
|
-
spreads = spreads.swaplevel()
|
|
1584
|
-
spreads["t"] = spreads[["t1", "t2"]].min(axis=1)
|
|
1585
|
-
spreads["p"] = spreads[["p1", "p2"]].min(axis=1)
|
|
1586
|
-
spreads["joh_sig"] = (
|
|
1587
|
-
(spreads.trace0 > trace0_cv) & (spreads.trace1 > trace1_cv)
|
|
1588
|
-
).astype(int)
|
|
1589
|
-
spreads["eg_sig"] = (spreads.p < 0.05).astype(int)
|
|
1590
|
-
spreads["s1_dep"] = spreads.p1 < spreads.p2
|
|
1591
|
-
spreads["coint"] = (spreads.joh_sig & spreads.eg_sig).astype(int)
|
|
1592
|
-
# select top n pairs
|
|
1593
|
-
if coint:
|
|
1594
|
-
if n is not None:
|
|
1595
|
-
top_pairs = (
|
|
1596
|
-
spreads.query("coint == 1").sort_values("t", ascending=False).head(n)
|
|
1597
|
-
)
|
|
1598
|
-
else:
|
|
1599
|
-
top_pairs = spreads.query("coint == 1").sort_values("t", ascending=False)
|
|
1600
|
-
else:
|
|
1601
|
-
if n is not None:
|
|
1602
|
-
top_pairs = spreads.sort_values("t", ascending=False).head(n)
|
|
1603
|
-
else:
|
|
1604
|
-
top_pairs = spreads.sort_values("t", ascending=False)
|
|
1605
|
-
return top_pairs
|
|
1606
|
-
|
|
1607
|
-
|
|
1608
|
-
def analyze_cointegrated_pairs(
|
|
1609
|
-
spreads: pd.DataFrame,
|
|
1610
|
-
plot_coint=True,
|
|
1611
|
-
crosstab=False,
|
|
1612
|
-
heuristics=False,
|
|
1613
|
-
log_reg=False,
|
|
1614
|
-
decis_tree=False,
|
|
1615
|
-
):
|
|
1616
|
-
"""
|
|
1617
|
-
Analyzes cointegrated pairs by visualizing, summarizing, and applying predictive models.
|
|
1618
|
-
|
|
1619
|
-
Args:
|
|
1620
|
-
spreads (pd.DataFrame):
|
|
1621
|
-
A DataFrame containing cointegration metrics and characteristics.
|
|
1622
|
-
Required columns: 'coint', 't', 'trace0', 'trace1', 'drift', 'vol', 'corr', 'corr_ret', 'eg_sig', 'joh_sig'.
|
|
1623
|
-
plot_coint (bool, optional):
|
|
1624
|
-
If True, generates scatterplots and boxplots to visualize cointegration characteristics.
|
|
1625
|
-
crosstab (bool, optional):
|
|
1626
|
-
If True, displays crosstabulations of Engle-Granger and Johansen test significance.
|
|
1627
|
-
heuristics (bool, optional):
|
|
1628
|
-
If True, prints descriptive statistics for drift, volatility, and correlation grouped by cointegration status.
|
|
1629
|
-
log_reg (bool, optional):
|
|
1630
|
-
If True, fits a logistic regression model to predict cointegration and evaluates its performance.
|
|
1631
|
-
decis_tree (bool, optional):
|
|
1632
|
-
If True, fits a decision tree model to predict cointegration and evaluates its performance.
|
|
1633
|
-
|
|
1634
|
-
References
|
|
1635
|
-
----------
|
|
1636
|
-
Stefan Jansen (2020). Machine Learning for Algorithmic Trading - Second Edition.
|
|
1637
|
-
chapter 9, Time-Series Models for Volatility Forecasts and Statistical Arbitrage.
|
|
1638
|
-
|
|
1639
|
-
Example:
|
|
1640
|
-
>>> import pandas as pd
|
|
1641
|
-
>>> from bbstrader.tseries import find_cointegrated_pairs, analyze_cointegrated_pairs
|
|
1642
|
-
|
|
1643
|
-
>>> # Sample Data
|
|
1644
|
-
>>> securities = pd.DataFrame({
|
|
1645
|
-
... 'SPY': [100, 102, 101, 103, 105],
|
|
1646
|
-
... 'QQQ': [50, 52, 53, 51, 54]
|
|
1647
|
-
... })
|
|
1648
|
-
>>> candidates = pd.DataFrame({
|
|
1649
|
-
... 'AAPL': [100, 101, 99, 102, 104],
|
|
1650
|
-
... 'MSFT': [200, 202, 201, 203, 205]
|
|
1651
|
-
... })
|
|
1652
|
-
|
|
1653
|
-
>>> pairs = find_cointegrated_pairs(securities, candidates, n=2, coint=True)
|
|
1654
|
-
>>> analyze_cointegrated_pairs(pairs, plot_coint=True, crosstab=True, heuristics=True, log_reg=True, decis_tree=True)
|
|
1655
|
-
"""
|
|
1656
|
-
if plot_coint:
|
|
1657
|
-
trace0_cv = __CRITICAL_VALUES[0][0.95]
|
|
1658
|
-
spreads = spreads.reset_index()
|
|
1659
|
-
sns.scatterplot(
|
|
1660
|
-
x=np.log1p(spreads.t.abs()),
|
|
1661
|
-
y=np.log1p(spreads.trace1),
|
|
1662
|
-
hue="coint",
|
|
1663
|
-
data=spreads[spreads.trace0 > trace0_cv],
|
|
1664
|
-
)
|
|
1665
|
-
fig, axes = plt.subplots(ncols=4, figsize=(20, 5))
|
|
1666
|
-
for i, heuristic in enumerate(["drift", "vol", "corr", "corr_ret"]):
|
|
1667
|
-
sns.boxplot(x="coint", y=heuristic, data=spreads, ax=axes[i])
|
|
1668
|
-
fig.tight_layout()
|
|
1669
|
-
|
|
1670
|
-
if heuristics:
|
|
1671
|
-
spreads = spreads.reset_index()
|
|
1672
|
-
h = (
|
|
1673
|
-
spreads.groupby(spreads.coint)[["drift", "vol", "corr"]]
|
|
1674
|
-
.describe()
|
|
1675
|
-
.stack(level=0)
|
|
1676
|
-
.swaplevel()
|
|
1677
|
-
.sort_index()
|
|
1678
|
-
)
|
|
1679
|
-
print(h)
|
|
1680
|
-
|
|
1681
|
-
if log_reg:
|
|
1682
|
-
y = spreads.coint
|
|
1683
|
-
X = spreads[["drift", "vol", "corr", "corr_ret"]]
|
|
1684
|
-
log_reg = LogisticRegressionCV(
|
|
1685
|
-
Cs=np.logspace(-10, 10, 21), class_weight="balanced", scoring="roc_auc"
|
|
1686
|
-
)
|
|
1687
|
-
log_reg.fit(X=X, y=y)
|
|
1688
|
-
Cs = log_reg.Cs_
|
|
1689
|
-
scores = pd.DataFrame(log_reg.scores_[True], columns=Cs).mean()
|
|
1690
|
-
scores.plot(logx=True)
|
|
1691
|
-
res = f"C:{np.log10(scores.idxmax()):.2f}, AUC: {scores.max():.2%}"
|
|
1692
|
-
print(res)
|
|
1693
|
-
print(log_reg.coef_)
|
|
1694
|
-
|
|
1695
|
-
if decis_tree:
|
|
1696
|
-
model = DecisionTreeClassifier(class_weight="balanced")
|
|
1697
|
-
decision_tree = GridSearchCV(
|
|
1698
|
-
model, param_grid={"max_depth": list(range(1, 10))}, cv=5, scoring="roc_auc"
|
|
1699
|
-
)
|
|
1700
|
-
y = spreads.coint
|
|
1701
|
-
X = spreads[["drift", "vol", "corr", "corr_ret"]]
|
|
1702
|
-
decision_tree.fit(X, y)
|
|
1703
|
-
res = f"{decision_tree.best_score_:.2%}, Depth: {decision_tree.best_params_['max_depth']}"
|
|
1704
|
-
print(res)
|
|
1705
|
-
|
|
1706
|
-
if crosstab:
|
|
1707
|
-
pd.set_option("display.float_format", lambda x: f"{x:.2%}")
|
|
1708
|
-
print(pd.crosstab(spreads.eg_sig, spreads.joh_sig))
|
|
1709
|
-
print(pd.crosstab(spreads.eg_sig, spreads.joh_sig, normalize=True))
|
|
1710
|
-
|
|
1711
|
-
|
|
1712
|
-
def select_candidate_pairs(pairs: pd.DataFrame, period=False):
|
|
1713
|
-
"""
|
|
1714
|
-
Select candidate pairs from a DataFrame based on cointegration status.
|
|
1715
|
-
|
|
1716
|
-
This function filters the input DataFrame to select pairs where the 'coint' column equals 1,
|
|
1717
|
-
indicating cointegration. It then determines the dependent and independent series for each pair
|
|
1718
|
-
and returns the selected pairs in a dictionary format.
|
|
1719
|
-
|
|
1720
|
-
Args:
|
|
1721
|
-
pairs (pd.DataFrame): A DataFrame containing pairs of time series with columns 'coint', 's1', 's2', and 's1_dep'.
|
|
1722
|
-
period (bool, optional): If True, includes the 'period' column in the output. Defaults to False.
|
|
1723
|
-
|
|
1724
|
-
Returns:
|
|
1725
|
-
list[dict]: A list of dictionaries, each containing the keys 'x' and 'y' (and optionally 'period') representing the selected pairs.
|
|
1726
|
-
|
|
1727
|
-
References
|
|
1728
|
-
----------
|
|
1729
|
-
Stefan Jansen (2020). Machine Learning for Algorithmic Trading - Second Edition.
|
|
1730
|
-
chapter 9, Time-Series Models for Volatility Forecasts and Statistical Arbitrage.
|
|
1731
|
-
"""
|
|
1732
|
-
candidates = pairs.query("coint == 1").copy()
|
|
1733
|
-
candidates = candidates.reset_index()
|
|
1734
|
-
candidates["y"] = candidates.apply(
|
|
1735
|
-
lambda x: x["s1"] if x.s1_dep else x["s2"], axis=1
|
|
1736
|
-
)
|
|
1737
|
-
candidates["x"] = candidates.apply(
|
|
1738
|
-
lambda x: x["s2"] if x.s1_dep else x["s1"], axis=1
|
|
1739
|
-
)
|
|
1740
|
-
if period:
|
|
1741
|
-
return candidates[["x", "y", "period"]].to_dict(orient="records")
|
|
1742
|
-
return candidates[["x", "y"]].to_dict(orient="records")
|
|
1743
|
-
|
|
1744
|
-
|
|
1745
|
-
def KFSmoother(prices: pd.Series | np.ndarray) -> pd.Series | np.ndarray:
|
|
1746
|
-
"""
|
|
1747
|
-
Estimate rolling mean using Kalman Smoothing.
|
|
1748
|
-
|
|
1749
|
-
Args:
|
|
1750
|
-
prices : pd.Series or np.ndarray
|
|
1751
|
-
The input time series data to be smoothed. It must be either a pandas Series or a numpy array.
|
|
1752
|
-
|
|
1753
|
-
Returns:
|
|
1754
|
-
pd.Series or np.ndarray
|
|
1755
|
-
The smoothed time series data. If the input is a pandas Series, the output will also be a pandas Series with the same index.
|
|
1756
|
-
If the input is a numpy array, the output will be a numpy array.
|
|
1757
|
-
|
|
1758
|
-
References
|
|
1759
|
-
----------
|
|
1760
|
-
Stefan Jansen (2020). Machine Learning for Algorithmic Trading - Second Edition.
|
|
1761
|
-
chapter 9, Time-Series Models for Volatility Forecasts and Statistical Arbitrage.
|
|
1762
|
-
|
|
1763
|
-
Examples
|
|
1764
|
-
--------
|
|
1765
|
-
>>> import yfinance as yf
|
|
1766
|
-
>>> prices = yf.download('AAPL', start='2020-01-01', end='2021-01-01', multi_level_index=False)['Adj Close']
|
|
1767
|
-
>>> prices = KFSmoother(prices)
|
|
1768
|
-
>>> print(prices[:5])
|
|
1769
|
-
Date
|
|
1770
|
-
2020-01-02 00:00:00+00:00 36.39801407
|
|
1771
|
-
2020-01-03 00:00:00+00:00 49.06231000
|
|
1772
|
-
2020-01-06 00:00:00+00:00 55.86334436
|
|
1773
|
-
2020-01-07 00:00:00+00:00 60.02240894
|
|
1774
|
-
2020-01-08 00:00:00+00:00 63.15057948
|
|
1775
|
-
dtype: float64
|
|
1776
|
-
|
|
1777
|
-
"""
|
|
1778
|
-
if not isinstance(prices, (np.ndarray, pd.Series)):
|
|
1779
|
-
raise ValueError("Input must be either a numpy array or a pandas Series.")
|
|
1780
|
-
kf = PyKalmanFilter(
|
|
1781
|
-
transition_matrices=np.eye(1),
|
|
1782
|
-
observation_matrices=np.eye(1),
|
|
1783
|
-
initial_state_mean=0,
|
|
1784
|
-
initial_state_covariance=1,
|
|
1785
|
-
observation_covariance=1,
|
|
1786
|
-
transition_covariance=0.05,
|
|
1787
|
-
)
|
|
1788
|
-
if isinstance(prices, pd.Series):
|
|
1789
|
-
state_means, _ = kf.filter(prices.values)
|
|
1790
|
-
return pd.Series(state_means.flatten(), index=prices.index)
|
|
1791
|
-
elif isinstance(prices, np.ndarray):
|
|
1792
|
-
state_means, _ = kf.filter(prices)
|
|
1793
|
-
return state_means.flatten()
|
|
1794
|
-
|
|
1795
|
-
|
|
1796
|
-
def KFHedgeRatio(x: pd.Series | np.ndarray, y: pd.Series | np.ndarray) -> np.ndarray:
|
|
1797
|
-
"""
|
|
1798
|
-
Estimate Hedge Ratio using Kalman Filter.
|
|
1799
|
-
Args:
|
|
1800
|
-
x : pd.Series or np.ndarray
|
|
1801
|
-
The independent variable, which can be either a pandas Series or a numpy array.
|
|
1802
|
-
y : pd.Series or np.ndarray
|
|
1803
|
-
The dependent variable, which can be either a pandas Series or a numpy array.
|
|
1804
|
-
|
|
1805
|
-
Returns:
|
|
1806
|
-
np.ndarray
|
|
1807
|
-
The estimated hedge ratio as a numpy array.
|
|
1808
|
-
|
|
1809
|
-
The function returns the negative of the first state variable of each Kalman Filter estimate,
|
|
1810
|
-
which represents the estimated hedge ratio.
|
|
1811
|
-
|
|
1812
|
-
References
|
|
1813
|
-
----------
|
|
1814
|
-
Stefan Jansen (2020). Machine Learning for Algorithmic Trading - Second Edition.
|
|
1815
|
-
chapter 9, Time-Series Models for Volatility Forecasts and Statistical Arbitrage.
|
|
1816
|
-
"""
|
|
1817
|
-
if not isinstance(x, (np.ndarray, pd.Series)) or not isinstance(
|
|
1818
|
-
y, (np.ndarray, pd.Series)
|
|
1819
|
-
):
|
|
1820
|
-
raise ValueError(
|
|
1821
|
-
"Both x and y must be either a numpy array or a pandas Series."
|
|
1822
|
-
)
|
|
1823
|
-
|
|
1824
|
-
delta = 1e-3
|
|
1825
|
-
trans_cov = delta / (1 - delta) * np.eye(2)
|
|
1826
|
-
obs_mat = np.expand_dims(np.vstack([[x], [np.ones(len(x))]]).T, axis=1)
|
|
1827
|
-
|
|
1828
|
-
kf = PyKalmanFilter(
|
|
1829
|
-
n_dim_obs=1,
|
|
1830
|
-
n_dim_state=2,
|
|
1831
|
-
initial_state_mean=[0, 0],
|
|
1832
|
-
initial_state_covariance=np.ones((2, 2)),
|
|
1833
|
-
transition_matrices=np.eye(2),
|
|
1834
|
-
observation_matrices=obs_mat,
|
|
1835
|
-
observation_covariance=2,
|
|
1836
|
-
transition_covariance=trans_cov,
|
|
1837
|
-
)
|
|
1838
|
-
y = y.values if isinstance(y, pd.Series) else y
|
|
1839
|
-
state_means, _ = kf.filter(y)
|
|
1840
|
-
# Indexing with [:, 0] in state_means[:, 0] extracts only the first state variable of
|
|
1841
|
-
# each Kalman Filter estimate, which is the estimated hedge ratio.
|
|
1842
|
-
return -state_means[:, 0]
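# A minimal usage sketch for KFHedgeRatio on two synthetic, linearly related
# series; the slope of 1.5 and all values below are illustrative assumptions.
def _example_kf_hedge_ratio():
    import numpy as np

    rng = np.random.default_rng(7)
    x = 100 + rng.standard_normal(500).cumsum()
    # y moves roughly 1.5 units per unit of x, plus noise
    y = 1.5 * x + rng.normal(0, 0.5, 500)
    ratio = KFHedgeRatio(x, y)
    # the returned (negated) slope estimate should settle near -1.5
    return ratio[-1]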
|
|
1
|
+
"""
|
|
2
|
+
The `tseries` module is a designed for conducting
|
|
3
|
+
advanced time series analysis in financial markets.
|
|
4
|
+
It leverages statistical models and algorithms to perform
|
|
5
|
+
tasks such as cointegration testing, volatility modeling,
|
|
6
|
+
and filter-based estimation to assist in trading strategy development,
|
|
7
|
+
market analysis, and financial data exploration.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
import pprint
|
|
11
|
+
import warnings
|
|
12
|
+
from itertools import combinations
|
|
13
|
+
from typing import List, Tuple, Union
|
|
14
|
+
|
|
15
|
+
import matplotlib.pyplot as plt
|
|
16
|
+
import numpy as np
|
|
17
|
+
import pandas as pd
|
|
18
|
+
import pmdarima as pm
|
|
19
|
+
import seaborn as sns
|
|
20
|
+
import statsmodels.api as sm
|
|
21
|
+
import statsmodels.tsa.stattools as ts
|
|
22
|
+
import yfinance as yf
|
|
23
|
+
from arch import arch_model
|
|
24
|
+
from filterpy.kalman import KalmanFilter
|
|
25
|
+
from hurst import compute_Hc
|
|
26
|
+
from pykalman import KalmanFilter as PyKalmanFilter
|
|
27
|
+
from scipy.optimize import minimize
|
|
28
|
+
from sklearn.linear_model import LogisticRegressionCV
|
|
29
|
+
from sklearn.model_selection import GridSearchCV
|
|
30
|
+
from sklearn.tree import DecisionTreeClassifier
|
|
31
|
+
from statsmodels.graphics.tsaplots import plot_acf
|
|
32
|
+
from statsmodels.stats.diagnostic import acorr_ljungbox
|
|
33
|
+
from statsmodels.tsa.arima.model import ARIMA
|
|
34
|
+
from statsmodels.tsa.stattools import adfuller, coint
|
|
35
|
+
from statsmodels.tsa.vector_ar.var_model import VAR
|
|
36
|
+
from statsmodels.tsa.vector_ar.vecm import coint_johansen
|
|
37
|
+
from tqdm import tqdm
|
|
38
|
+
|
|
39
|
+
warnings.filterwarnings("ignore")
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
__all__ = [
|
|
43
|
+
"load_and_prepare_data",
|
|
44
|
+
"fit_best_arima",
|
|
45
|
+
"fit_garch",
|
|
46
|
+
"predict_next_return",
|
|
47
|
+
"get_prediction",
|
|
48
|
+
"get_corr",
|
|
49
|
+
"run_cadf_test",
|
|
50
|
+
"run_hurst_test",
|
|
51
|
+
"run_coint_test",
|
|
52
|
+
"run_kalman_filter",
|
|
53
|
+
"ArimaGarchModel",
|
|
54
|
+
"KalmanFilterModel",
|
|
55
|
+
"OrnsteinUhlenbeck",
|
|
56
|
+
"remove_correlated_assets",
|
|
57
|
+
"check_stationarity",
|
|
58
|
+
"remove_stationary_assets",
|
|
59
|
+
"select_assets",
|
|
60
|
+
"compute_pair_metrics",
|
|
61
|
+
"find_cointegrated_pairs",
|
|
62
|
+
"analyze_cointegrated_pairs",
|
|
63
|
+
"select_candidate_pairs",
|
|
64
|
+
"KFSmoother",
|
|
65
|
+
"KFHedgeRatio",
|
|
66
|
+
]
|
|
67
|
+
|
|
68
|
+
# *******************************************
|
|
69
|
+
# ARIMA AND GARCH MODELS *
|
|
70
|
+
# *******************************************
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def load_and_prepare_data(df: pd.DataFrame):
|
|
74
|
+
"""
|
|
75
|
+
Prepares financial time series data for analysis.
|
|
76
|
+
|
|
77
|
+
This function takes a pandas DataFrame containing financial data,
|
|
78
|
+
calculates logarithmic returns, and the first difference
|
|
79
|
+
of these logarithmic returns. It handles missing values
|
|
80
|
+
by filling them with zeros.
|
|
81
|
+
|
|
82
|
+
Args:
|
|
83
|
+
df (pd.DataFrame): DataFrame containing at least
|
|
84
|
+
a `Close` column with closing prices of a financial asset.
|
|
85
|
+
|
|
86
|
+
Returns:
|
|
87
|
+
pd.DataFrame: DataFrame with additional
|
|
88
|
+
columns for logarithmic returns (`log_return`)
|
|
89
|
+
and the first difference of logarithmic returns (`diff_log_return`),
|
|
90
|
+
with `NaN` values filled with `0`.
|
|
91
|
+
"""
|
|
92
|
+
# Load data
|
|
93
|
+
data = df.copy()
|
|
94
|
+
# Calculate logarithmic returns
|
|
95
|
+
data["log_return"] = np.log(data["Close"] / data["Close"].shift(1))
|
|
96
|
+
# Differencing if necessary
|
|
97
|
+
data["diff_log_return"] = data["log_return"].diff()
|
|
98
|
+
# Drop NaN values
|
|
99
|
+
data.fillna(0, inplace=True)
|
|
100
|
+
return data
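# A minimal usage sketch for load_and_prepare_data on a synthetic frame with
# only the required 'Close' column; the prices are illustrative assumptions.
def _example_load_and_prepare_data():
    import numpy as np
    import pandas as pd

    rng = np.random.default_rng(1)
    close = pd.Series(
        100 * np.exp(rng.normal(0, 0.01, 252).cumsum()),
        index=pd.date_range("2023-01-02", periods=252, freq="B"),
    )
    prepared = load_and_prepare_data(pd.DataFrame({"Close": close}))
    # 'log_return' and 'diff_log_return' columns are added, NaNs filled with 0
    return prepared.head()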
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def fit_best_arima(window_data: Union[pd.Series, np.ndarray]):
|
|
104
|
+
"""
|
|
105
|
+
Identifies and fits the best `ARIMA` model
|
|
106
|
+
based on the Akaike Information Criterion `(AIC)`.
|
|
107
|
+
|
|
108
|
+
Iterates through different combinations of `p` and `q`
|
|
109
|
+
parameters (within specified ranges) for the ARIMA model,
|
|
110
|
+
fits them to the provided data, and selects the combination
|
|
111
|
+
with the lowest `AIC` value.
|
|
112
|
+
|
|
113
|
+
Args:
|
|
114
|
+
window_data (pd.Series or np.ndarray):
|
|
115
|
+
Time series data to fit the `ARIMA` model on.
|
|
116
|
+
|
|
117
|
+
Returns:
|
|
118
|
+
ARIMA result object: The fitted `ARIMA` model with the lowest `AIC`.
|
|
119
|
+
"""
|
|
120
|
+
if isinstance(window_data, pd.Series):
|
|
121
|
+
window_data = window_data.values
|
|
122
|
+
|
|
123
|
+
window_data = window_data[~(np.isnan(window_data) | np.isinf(window_data))]
|
|
124
|
+
# Fit ARIMA model with best parameters
|
|
125
|
+
model = pm.auto_arima(
|
|
126
|
+
window_data,
|
|
127
|
+
start_p=1,
|
|
128
|
+
start_q=1,
|
|
129
|
+
max_p=6,
|
|
130
|
+
max_q=6,
|
|
131
|
+
seasonal=False,
|
|
132
|
+
stepwise=True,
|
|
133
|
+
)
|
|
134
|
+
final_order = model.order
|
|
135
|
+
from arch.utility.exceptions import ConvergenceWarning as ArchWarning
|
|
136
|
+
from statsmodels.tools.sm_exceptions import ConvergenceWarning as StatsWarning
|
|
137
|
+
|
|
138
|
+
with warnings.catch_warnings():
|
|
139
|
+
warnings.filterwarnings("ignore", category=StatsWarning, module="statsmodels")
|
|
140
|
+
warnings.filterwarnings("ignore", category=ArchWarning, module="arch")
|
|
141
|
+
try:
|
|
142
|
+
best_arima_model = ARIMA(
|
|
143
|
+
window_data + 1e-5, order=final_order, missing="drop"
|
|
144
|
+
).fit()
|
|
145
|
+
return best_arima_model
|
|
146
|
+
except np.linalg.LinAlgError:
|
|
147
|
+
# Catch specific linear algebra errors
|
|
148
|
+
print("LinAlgError occurred, skipping this data point.")
|
|
149
|
+
return None
|
|
150
|
+
except Exception as e:
|
|
151
|
+
# Catch any other unexpected errors and log them
|
|
152
|
+
print(f"An error occurred: {e}")
|
|
153
|
+
return None
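# A minimal usage sketch for fit_best_arima on a synthetic AR(1) return series;
# the AR coefficient and noise scale are illustrative assumptions only.
def _example_fit_best_arima():
    import numpy as np

    rng = np.random.default_rng(3)
    returns = np.zeros(300)
    for t in range(1, 300):
        # simple AR(1) process with mild positive autocorrelation
        returns[t] = 0.4 * returns[t - 1] + rng.normal(0, 0.01)
    result = fit_best_arima(returns)
    if result is not None:
        # AIC of the model selected by auto_arima and refit with statsmodels
        return result.aic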
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
def fit_garch(window_data: Union[pd.Series, np.ndarray]):
|
|
157
|
+
"""
|
|
158
|
+
Fits an `ARIMA` model to the data to get residuals,
|
|
159
|
+
then fits a `GARCH(1,1)` model on these residuals.
|
|
160
|
+
|
|
161
|
+
Utilizes the residuals from the best `ARIMA` model fit to
|
|
162
|
+
then model volatility using a `GARCH(1,1)` model.
|
|
163
|
+
|
|
164
|
+
Args:
|
|
165
|
+
window_data (pd.Series or np.ndarray):
|
|
166
|
+
Time series data for which to fit the `ARIMA` and `GARCH` models.
|
|
167
|
+
|
|
168
|
+
Returns:
|
|
169
|
+
tuple: A tuple containing the `ARIMA` result
|
|
170
|
+
object and the `GARCH` result object.
|
|
171
|
+
"""
|
|
172
|
+
arima_result = fit_best_arima(window_data)
|
|
173
|
+
if arima_result is None:
|
|
174
|
+
return None, None
|
|
175
|
+
resid = np.asarray(arima_result.resid)
|
|
176
|
+
resid = resid[~(np.isnan(resid) | np.isinf(resid))]
|
|
177
|
+
garch_model = arch_model(resid, p=1, q=1, rescale=False)
|
|
178
|
+
garch_result = garch_model.fit(disp="off")
|
|
179
|
+
return arima_result, garch_result
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
def predict_next_return(arima_result, garch_result):
|
|
183
|
+
"""
|
|
184
|
+
Predicts the next return value using fitted `ARIMA` and `GARCH` models.
|
|
185
|
+
|
|
186
|
+
Combines the next period forecast from the `ARIMA` model
|
|
187
|
+
with the next period volatility forecast from the `GARCH` model
|
|
188
|
+
to predict the next return value.
|
|
189
|
+
|
|
190
|
+
Args:
|
|
191
|
+
arima_result (ARIMA result object): The fitted `ARIMA` model result.
|
|
192
|
+
garch_result (ARCH result object): The fitted `GARCH` model result.
|
|
193
|
+
|
|
194
|
+
Returns:
|
|
195
|
+
float: The predicted next return, adjusted for predicted volatility.
|
|
196
|
+
"""
|
|
197
|
+
if arima_result is None or garch_result is None:
|
|
198
|
+
return 0
|
|
199
|
+
# Predict next value with ARIMA
|
|
200
|
+
arima_pred = arima_result.forecast(steps=1)
|
|
201
|
+
# Predict next volatility with GARCH
|
|
202
|
+
garch_pred = garch_result.forecast(horizon=1)
|
|
203
|
+
next_volatility = garch_pred.variance.iloc[-1, 0]
|
|
204
|
+
|
|
205
|
+
# Combine predictions (return + volatility)
|
|
206
|
+
if not isinstance(arima_pred, np.ndarray):
|
|
207
|
+
pred = arima_pred.values[0]
|
|
208
|
+
else:
|
|
209
|
+
pred = arima_pred[0]
|
|
210
|
+
return pred + next_volatility
|
|
211
|
+
|
|
212
|
+
|
|
213
|
+
def get_prediction(window_data: Union[pd.Series, np.ndarray]):
|
|
214
|
+
"""
|
|
215
|
+
Orchestrator function to get the next period's return prediction.
|
|
216
|
+
|
|
217
|
+
This function ties together the process of fitting
|
|
218
|
+
both `ARIMA` and `GARCH` models on the provided data
|
|
219
|
+
and then predicting the next period's return using these models.
|
|
220
|
+
|
|
221
|
+
Args:
|
|
222
|
+
window_data (Union[pd.Series , np.ndarray]):
|
|
223
|
+
Time series data to fit the models and predict the next return.
|
|
224
|
+
|
|
225
|
+
Returns:
|
|
226
|
+
float: Predicted next return value.
|
|
227
|
+
"""
|
|
228
|
+
arima_result, garch_result = fit_garch(window_data)
|
|
229
|
+
prediction = predict_next_return(arima_result, garch_result)
|
|
230
|
+
return prediction
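# A minimal usage sketch of the combined ARIMA+GARCH one-step prediction on
# synthetic daily returns; the window size and noise scale are assumptions.
def _example_get_prediction():
    import numpy as np

    rng = np.random.default_rng(5)
    # roughly a year of small, normally distributed daily log returns
    window_data = rng.normal(0, 0.01, 252)
    next_return = get_prediction(window_data)
    # positive values map to 'LONG' and negative to 'SHORT' in
    # ArimaGarchModel.calculate_signals below
    return next_return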
|
|
231
|
+
|
|
232
|
+
|
|
233
|
+
class ArimaGarchModel:
|
|
234
|
+
"""
|
|
235
|
+
This class implements a time series model
|
|
236
|
+
that combines `ARIMA (AutoRegressive Integrated Moving Average)`
|
|
237
|
+
and `GARCH (Generalized Autoregressive Conditional Heteroskedasticity)` models
|
|
238
|
+
to predict future returns based on historical price data.
|
|
239
|
+
|
|
240
|
+
The model is implemented in the following steps:
|
|
241
|
+
1. Data Preparation: Load and prepare the historical price data.
|
|
242
|
+
2. Modeling: Fit the ARIMA model to the data and then fit the GARCH model to the residuals.
|
|
243
|
+
3. Prediction: Predict the next return using the ARIMA model and the next volatility using the GARCH model.
|
|
244
|
+
4. Trading Strategy: Execute the trading strategy based on the predictions.
|
|
245
|
+
5. Vectorized Backtesting: Backtest the trading strategy using the historical data.
|
|
246
|
+
|
|
247
|
+
Example:
|
|
248
|
+
>>> import yfinance as yf
|
|
249
|
+
>>> from bbstrader.tseries import ArimaGarchModel
|
|
250
|
+
>>> from bbstrader.tseries import load_and_prepare_data
|
|
251
|
+
|
|
252
|
+
>>> if __name__ == '__main__':
|
|
253
|
+
>>> # ARIMA+GARCH SPY vectorized backtest
|
|
254
|
+
>>> k = 252
|
|
255
|
+
>>> data = yf.download("SPY", start="2010-01-02", end="2015-12-31")
|
|
256
|
+
>>> arch = ArimaGarchModel("SPY", data, k=k)
|
|
257
|
+
>>> df = load_and_prepare_data(data)
|
|
258
|
+
>>> arch.show_arima_garch_results(df['diff_log_return'].values[-k:])
|
|
259
|
+
>>> arch.backtest_strategy()
|
|
260
|
+
"""
|
|
261
|
+
|
|
262
|
+
def __init__(self, symbol, data, k: int = 252):
|
|
263
|
+
"""
|
|
264
|
+
Initializes the ArimaGarchModel class.
|
|
265
|
+
|
|
266
|
+
Args:
|
|
267
|
+
symbol (str): The ticker symbol for the financial instrument.
|
|
268
|
+
data (pd.DataFrame): `The raw dataset containing at least the 'Close' prices`.
|
|
269
|
+
k (int): The window size for rolling prediction in backtesting.
|
|
270
|
+
"""
|
|
271
|
+
self.symbol = symbol
|
|
272
|
+
self.data = self.load_and_prepare_data(data)
|
|
273
|
+
self.k = k
|
|
274
|
+
|
|
275
|
+
# Step 1: Data Preparation
|
|
276
|
+
def load_and_prepare_data(self, df):
|
|
277
|
+
"""
|
|
278
|
+
Prepares the dataset by calculating logarithmic returns
|
|
279
|
+
and differencing if necessary.
|
|
280
|
+
|
|
281
|
+
Args:
|
|
282
|
+
df (pd.DataFrame): `The raw dataset containing at least the 'Close' prices`.
|
|
283
|
+
|
|
284
|
+
Returns:
|
|
285
|
+
pd.DataFrame: The dataset with additional columns
|
|
286
|
+
for log returns and differenced log returns.
|
|
287
|
+
"""
|
|
288
|
+
return load_and_prepare_data(df)
|
|
289
|
+
|
|
290
|
+
# Step 2: Modeling (ARIMA + GARCH)
|
|
291
|
+
def fit_best_arima(self, window_data):
|
|
292
|
+
"""
|
|
293
|
+
Fits the ARIMA model to the provided window of data,
|
|
294
|
+
selecting the best model based on AIC.
|
|
295
|
+
|
|
296
|
+
Args:
|
|
297
|
+
window_data (np.array): The dataset for a specific window period.
|
|
298
|
+
|
|
299
|
+
Returns:
|
|
300
|
+
ARIMA model: The best fitted ARIMA model based on AIC.
|
|
301
|
+
"""
|
|
302
|
+
return fit_best_arima(window_data)
|
|
303
|
+
|
|
304
|
+
def fit_garch(self, window_data):
|
|
305
|
+
"""
|
|
306
|
+
Fits the GARCH model to the residuals of the best ARIMA model.
|
|
307
|
+
|
|
308
|
+
Args:
|
|
309
|
+
window_data (np.array): The dataset for a specific window period.
|
|
310
|
+
|
|
311
|
+
Returns:
|
|
312
|
+
tuple: Contains the ARIMA result and GARCH result.
|
|
313
|
+
"""
|
|
314
|
+
return fit_garch(window_data)
|
|
315
|
+
|
|
316
|
+
def show_arima_garch_results(self, window_data, acf=True, test_resid=True):
|
|
317
|
+
"""
|
|
318
|
+
Displays the ARIMA and GARCH model results, including plotting
|
|
319
|
+
ACF of residuals and conducting Box-Pierce and Ljung-Box tests.
|
|
320
|
+
|
|
321
|
+
Args:
|
|
322
|
+
window_data (np.array): The dataset for a specific window period.
|
|
323
|
+
acf (bool, optional): If True, plot the ACF of residuals. Defaults to True.
|
|
324
|
+
|
|
325
|
+
test_resid (bool, optional):
|
|
326
|
+
If True, conduct Box-Pierce and Ljung-Box tests on residuals. Defaults to True.
|
|
327
|
+
"""
|
|
328
|
+
arima_result = self.fit_best_arima(window_data)
|
|
329
|
+
resid = np.asarray(arima_result.resid)
|
|
330
|
+
resid = resid[~(np.isnan(resid) | np.isinf(resid))]
|
|
331
|
+
garch_model = arch_model(resid, p=1, q=1, rescale=False)
|
|
332
|
+
garch_result = garch_model.fit(disp="off")
|
|
333
|
+
residuals = garch_result.resid
|
|
334
|
+
|
|
335
|
+
# Plot the ACF of the ARIMA and GARCH residuals
|
|
336
|
+
if acf:
|
|
337
|
+
fig = plt.figure(figsize=(12, 8))
|
|
338
|
+
# Plot the ACF of ARIMA residuals
|
|
339
|
+
ax1 = fig.add_subplot(211, ylabel="ACF")
|
|
340
|
+
plot_acf(resid, alpha=0.05, ax=ax1, title="ACF of ARIMA Residuals")
|
|
341
|
+
ax1.set_xlabel("Lags")
|
|
342
|
+
ax1.grid(True)
|
|
343
|
+
|
|
344
|
+
# Plot the ACF of GARCH residuals on the same axes
|
|
345
|
+
ax2 = fig.add_subplot(212, ylabel="ACF")
|
|
346
|
+
plot_acf(residuals, alpha=0.05, ax=ax2, title="ACF of GARCH Residuals")
|
|
347
|
+
ax2.set_xlabel("Lags")
|
|
348
|
+
ax2.grid(True)
|
|
349
|
+
|
|
350
|
+
# Plot the figure
|
|
351
|
+
plt.tight_layout()
|
|
352
|
+
plt.show()
|
|
353
|
+
|
|
354
|
+
# Conduct Box-Pierce and Ljung-Box tests on the residuals
|
|
355
|
+
if test_resid:
|
|
356
|
+
print(arima_result.summary())
|
|
357
|
+
print(garch_result.summary())
|
|
358
|
+
bp_test = acorr_ljungbox(resid, return_df=True)
|
|
359
|
+
print("Box-Pierce and Ljung-Box Tests Results for ARIMA:\n", bp_test)
|
|
360
|
+
|
|
361
|
+
# Step 3: Prediction
|
|
362
|
+
def predict_next_return(self, arima_result, garch_result):
|
|
363
|
+
"""
|
|
364
|
+
Predicts the next return using the ARIMA model
|
|
365
|
+
and the next volatility using the GARCH model.
|
|
366
|
+
|
|
367
|
+
Args:
|
|
368
|
+
arima_result (ARIMA model): The ARIMA model result.
|
|
369
|
+
garch_result (GARCH model): The GARCH model result.
|
|
370
|
+
|
|
371
|
+
Returns:
|
|
372
|
+
float: The predicted next return.
|
|
373
|
+
"""
|
|
374
|
+
return predict_next_return(arima_result, garch_result)
|
|
375
|
+
|
|
376
|
+
def get_prediction(self, window_data):
|
|
377
|
+
"""
|
|
378
|
+
Generates a prediction for the next return based on a window of data.
|
|
379
|
+
|
|
380
|
+
Args:
|
|
381
|
+
window_data (np.array): The dataset for a specific window period.
|
|
382
|
+
|
|
383
|
+
Returns:
|
|
384
|
+
float: The predicted next return.
|
|
385
|
+
"""
|
|
386
|
+
return get_prediction(window_data)
|
|
387
|
+
|
|
388
|
+
def calculate_signals(self, window_data):
|
|
389
|
+
"""
|
|
390
|
+
Calculates the trading signal based on the prediction.
|
|
391
|
+
|
|
392
|
+
Args:
|
|
393
|
+
window_data (np.array): The dataset for a specific window period.
|
|
394
|
+
|
|
395
|
+
Returns:
|
|
396
|
+
str: The trading signal ('LONG', 'SHORT', or None).
|
|
397
|
+
"""
|
|
398
|
+
prediction = self.get_prediction(window_data)
|
|
399
|
+
if prediction > 0:
|
|
400
|
+
signal = "LONG"
|
|
401
|
+
elif prediction < 0:
|
|
402
|
+
signal = "SHORT"
|
|
403
|
+
else:
|
|
404
|
+
signal = None
|
|
405
|
+
return signal
|
|
406
|
+
|
|
407
|
+
# Step 4: Trading Strategy
|
|
408
|
+
|
|
409
|
+
def execute_trading_strategy(self, predictions):
|
|
410
|
+
"""
|
|
411
|
+
Executes the trading strategy based on a list
|
|
412
|
+
of predictions, determining positions to take.
|
|
413
|
+
|
|
414
|
+
Args:
|
|
415
|
+
predictions (list): A list of predicted returns.
|
|
416
|
+
|
|
417
|
+
Returns:
|
|
418
|
+
list: A list of positions (1 for 'LONG', -1 for 'SHORT', 0 for 'HOLD').
|
|
419
|
+
"""
|
|
420
|
+
positions = [] # Long if 1, Short if -1
|
|
421
|
+
previous_position = 0 # Initial position
|
|
422
|
+
for prediction in predictions:
|
|
423
|
+
if prediction > 0:
|
|
424
|
+
current_position = 1 # Long
|
|
425
|
+
elif prediction < 0:
|
|
426
|
+
current_position = -1 # Short
|
|
427
|
+
else:
|
|
428
|
+
current_position = previous_position # Hold previous position
|
|
429
|
+
positions.append(current_position)
|
|
430
|
+
previous_position = current_position
|
|
431
|
+
|
|
432
|
+
return positions
|
|
433
|
+
|
|
434
|
+
# Step 5: Vectorized Backtesting
|
|
435
|
+
def generate_predictions(self):
|
|
436
|
+
"""
|
|
437
|
+
Generator that yields predictions one by one.
|
|
438
|
+
"""
|
|
439
|
+
data = self.data
|
|
440
|
+
window_size = self.k
|
|
441
|
+
for i in range(window_size, len(data)):
|
|
442
|
+
print(
|
|
443
|
+
f"Processing window {i - window_size + 1}/{len(data) - window_size}..."
|
|
444
|
+
)
|
|
445
|
+
window_data = data["diff_log_return"].iloc[i - window_size : i]
|
|
446
|
+
next_return = self.get_prediction(window_data)
|
|
447
|
+
yield next_return
|
|
448
|
+
|
|
449
|
+
def backtest_strategy(self):
|
|
450
|
+
"""
|
|
451
|
+
Performs a backtest of the strategy over
|
|
452
|
+
the entire dataset, plotting cumulative returns.
|
|
453
|
+
"""
|
|
454
|
+
data = self.data
|
|
455
|
+
window_size = self.k
|
|
456
|
+
print(
|
|
457
|
+
f"Starting backtesting for {self.symbol}\n"
|
|
458
|
+
f"Window size {window_size}.\n"
|
|
459
|
+
f"Total iterations: {len(data) - window_size}.\n"
|
|
460
|
+
)
|
|
461
|
+
predictions_generator = self.generate_predictions()
|
|
462
|
+
|
|
463
|
+
positions = self.execute_trading_strategy(predictions_generator)
|
|
464
|
+
|
|
465
|
+
strategy_returns = (
|
|
466
|
+
np.array(positions[:-1]) * data["log_return"].iloc[window_size + 1 :].values
|
|
467
|
+
)
|
|
468
|
+
buy_and_hold = data["log_return"].iloc[window_size + 1 :].values
|
|
469
|
+
buy_and_hold_returns = np.cumsum(buy_and_hold)
|
|
470
|
+
cumulative_returns = np.cumsum(strategy_returns)
|
|
471
|
+
dates = data.index[window_size + 1 :]
|
|
472
|
+
self.plot_cumulative_returns(cumulative_returns, buy_and_hold_returns, dates)
|
|
473
|
+
|
|
474
|
+
print("\nBacktesting completed !!")
|
|
475
|
+
|
|
476
|
+
# Function to plot the cumulative returns
|
|
477
|
+
def plot_cumulative_returns(self, strategy_returns, buy_and_hold_returns, dates):
|
|
478
|
+
"""
|
|
479
|
+
Plots the cumulative returns of the ARIMA+GARCH strategy against
|
|
480
|
+
a buy-and-hold strategy.
|
|
481
|
+
|
|
482
|
+
Args:
|
|
483
|
+
strategy_returns (np.array): Cumulative returns from the strategy.
|
|
484
|
+
buy_and_hold_returns (np.array): Cumulative returns from a buy-and-hold strategy.
|
|
485
|
+
dates (pd.Index): The dates corresponding to the returns.
|
|
486
|
+
"""
|
|
487
|
+
plt.figure(figsize=(14, 7))
|
|
488
|
+
plt.plot(dates, strategy_returns, label="ARIMA+GARCH ", color="blue")
|
|
489
|
+
plt.plot(dates, buy_and_hold_returns, label="Buy & Hold", color="red")
|
|
490
|
+
plt.xlabel("Time")
|
|
491
|
+
plt.ylabel("Cumulative Returns")
|
|
492
|
+
plt.title(f"ARIMA+GARCH Strategy vs. Buy & Hold on ({self.symbol})")
|
|
493
|
+
plt.legend()
|
|
494
|
+
plt.grid(True)
|
|
495
|
+
plt.show()
|
|
496
|
+
|
|
497
|
+
|
|
498
|
+
# *********************************************
|
|
499
|
+
# STATS TEST (Cointegration , Mean Reverting)*
|
|
500
|
+
# *********************************************
|
|
501
|
+
def get_corr(tickers: Union[List[str], Tuple[str, ...]], start: str, end: str) -> None:
|
|
502
|
+
"""
|
|
503
|
+
Calculates and prints the correlation matrix of the adjusted closing prices
|
|
504
|
+
for a given list of stock tickers within a specified date range.
|
|
505
|
+
|
|
506
|
+
Args:
|
|
507
|
+
tickers (Union[List[str] , Tuple[str, ...]]):
|
|
508
|
+
A list or tuple of valid stock tickers (e.g., ['AAPL', 'MSFT', 'GOOG']).
|
|
509
|
+
start (str): The start date for the historical data in 'YYYY-MM-DD' format.
|
|
510
|
+
end (str): The end date for the historical data in 'YYYY-MM-DD' format.
|
|
511
|
+
|
|
512
|
+
Example:
|
|
513
|
+
>>> from bbstrader.tseries import get_corr
|
|
514
|
+
>>> get_corr(['AAPL', 'MSFT', 'GOOG'], '2023-01-01', '2023-12-31')
|
|
515
|
+
"""
|
|
516
|
+
# Download historical data
|
|
517
|
+
data = yf.download(tickers, start=start, end=end, multi_level_index=False)[
|
|
518
|
+
"Adj Close"
|
|
519
|
+
]
|
|
520
|
+
|
|
521
|
+
# Calculate correlation matrix
|
|
522
|
+
correlation_matrix = data.corr()
|
|
523
|
+
|
|
524
|
+
# Display the matrix
|
|
525
|
+
print(correlation_matrix)
|
|
526
|
+
|
|
527
|
+
|
|
528
|
+
def plot_price_series(df: pd.DataFrame, ts1: str, ts2: str):
|
|
529
|
+
"""
|
|
530
|
+
Plot both time series on the same line graph for
|
|
531
|
+
the specified date range.
|
|
532
|
+
|
|
533
|
+
Args:
|
|
534
|
+
df (pd.DataFrame):
|
|
535
|
+
The DataFrame containing prices for each series
|
|
536
|
+
ts1 (str): The first time series column name
|
|
537
|
+
ts2 (str): The second time series column name
|
|
538
|
+
"""
|
|
539
|
+
fig, ax = plt.subplots()
|
|
540
|
+
ax.plot(df.index, df[ts1], label=ts1)
|
|
541
|
+
ax.plot(df.index, df[ts2], label=ts2)
|
|
542
|
+
|
|
543
|
+
fig.autofmt_xdate()
|
|
544
|
+
plt.xlabel("Month/Year")
|
|
545
|
+
plt.ylabel("Price ($)")
|
|
546
|
+
plt.title(f"{ts1} and {ts2} Daily Prices ")
|
|
547
|
+
plt.legend()
|
|
548
|
+
plt.show()
|
|
549
|
+
|
|
550
|
+
|
|
551
|
+
def plot_scatter_series(df: pd.DataFrame, ts1: str, ts2: str):
|
|
552
|
+
"""
|
|
553
|
+
Plot a scatter plot of both time series
|
|
554
|
+
via the provided DataFrame.
|
|
555
|
+
|
|
556
|
+
Args:
|
|
557
|
+
df (pd.DataFrame):
|
|
558
|
+
The DataFrame containing prices for each series
|
|
559
|
+
ts1 (str): The first time series column name
|
|
560
|
+
ts2 (str): The second time series column name
|
|
561
|
+
"""
|
|
562
|
+
plt.xlabel(f"{ts1} Price ($)")
|
|
563
|
+
plt.ylabel(f"{ts2} Price ($)")
|
|
564
|
+
plt.title(f"{ts1} and {ts2} Price Scatterplot")
|
|
565
|
+
plt.scatter(df[ts1], df[ts2])
|
|
566
|
+
|
|
567
|
+
# Plot the regression line using the module-level OLS `results` set by run_cadf_test
|
|
568
|
+
plt.plot(
|
|
569
|
+
df[ts1],
|
|
570
|
+
results.fittedvalues,
|
|
571
|
+
linestyle="--",
|
|
572
|
+
color="red",
|
|
573
|
+
linewidth=2,
|
|
574
|
+
label="Regression Line",
|
|
575
|
+
)
|
|
576
|
+
plt.legend()
|
|
577
|
+
plt.show()
|
|
578
|
+
|
|
579
|
+
|
|
580
|
+
def plot_residuals(df: pd.DataFrame):
|
|
581
|
+
"""
|
|
582
|
+
Plot the residuals of the OLS procedure for both
|
|
583
|
+
time series.
|
|
584
|
+
|
|
585
|
+
Args:
|
|
586
|
+
df (pd.DataFrame):
|
|
587
|
+
The DataFrame containing prices for each series
|
|
588
|
+
"""
|
|
589
|
+
fig, ax = plt.subplots()
|
|
590
|
+
ax.plot(df.index, df["res"], label="Residuals")
|
|
591
|
+
|
|
592
|
+
fig.autofmt_xdate()
|
|
593
|
+
plt.xlabel("Month/Year")
|
|
594
|
+
plt.ylabel("Price ($)")
|
|
595
|
+
plt.title("Residual Plot")
|
|
596
|
+
plt.legend()
|
|
597
|
+
plt.show()
|
|
598
|
+
|
|
599
|
+
|
|
600
|
+
def run_cadf_test(
|
|
601
|
+
pair: Union[List[str], Tuple[str, ...]],
|
|
602
|
+
start: str,
|
|
603
|
+
end: str,
|
|
604
|
+
) -> None:
|
|
605
|
+
"""
|
|
606
|
+
Performs the Cointegration Augmented Dickey-Fuller (CADF) test on a pair of stock tickers
|
|
607
|
+
over a specified date range to check for cointegration.
|
|
608
|
+
|
|
609
|
+
The function downloads historical adjusted closing prices for the specified pair of stock tickers,
|
|
610
|
+
calculates the optimal hedge ratio (beta) using Ordinary Least Squares (OLS) regression, plots the
|
|
611
|
+
time series and their residuals, and finally performs the CADF test on the residuals.
|
|
612
|
+
|
|
613
|
+
Args:
|
|
614
|
+
pair (List[str] or Tuple[str, ...]):
|
|
615
|
+
A list or tuple containing two valid stock tickers (e.g., ['AAPL', 'MSFT']).
|
|
616
|
+
start (str): The start date for the historical data in 'YYYY-MM-DD' format.
|
|
617
|
+
end (str): The end date for the historical data in 'YYYY-MM-DD' format.
|
|
618
|
+
|
|
619
|
+
Example:
|
|
620
|
+
>>> from bbstrader.tseries import run_cadf_test
|
|
621
|
+
>>> run_cadf_test(['AAPL', 'MSFT'], '2023-01-01', '2023-12-31')
|
|
622
|
+
>>> Regression Metrics:
|
|
623
|
+
>>> Optimal Hedge Ratio (Beta): 2.2485845594120333
|
|
624
|
+
>>> Result Params:
|
|
625
|
+
|
|
626
|
+
>>> const -74.418034
|
|
627
|
+
>>> AAPL 2.248585
|
|
628
|
+
>>> dtype: float64
|
|
629
|
+
|
|
630
|
+
>>> Regression Summary:
|
|
631
|
+
>>> OLS Regression Results
|
|
632
|
+
>>> ==============================================================================
|
|
633
|
+
>>> Dep. Variable: MSFT R-squared: 0.900
|
|
634
|
+
>>> Model: OLS Adj. R-squared: 0.900
|
|
635
|
+
>>> Method: Least Squares F-statistic: 2244.
|
|
636
|
+
>>> Date: Sat, 20 Jul 2024 Prob (F-statistic): 2.95e-126
|
|
637
|
+
>>> Time: 13:36:58 Log-Likelihood: -996.45
|
|
638
|
+
>>> No. Observations: 250 AIC: 1997.
|
|
639
|
+
>>> Df Residuals: 248 BIC: 2004.
|
|
640
|
+
>>> Df Model: 1
|
|
641
|
+
>>> Covariance Type: nonrobust
|
|
642
|
+
>>> ==============================================================================
|
|
643
|
+
>>> coef std err t P>|t| [0.025 0.975]
|
|
644
|
+
>>> ------------------------------------------------------------------------------
|
|
645
|
+
>>> const -74.4180 8.191 -9.085 0.000 -90.551 -58.286
|
|
646
|
+
>>> AAPL 2.2486 0.047 47.369 0.000 2.155 2.342
|
|
647
|
+
>>> ==============================================================================
|
|
648
|
+
>>> Omnibus: 4.923 Durbin-Watson: 0.121
|
|
649
|
+
>>> Prob(Omnibus): 0.085 Jarque-Bera (JB): 4.862
|
|
650
|
+
>>> Skew: 0.342 Prob(JB): 0.0879
|
|
651
|
+
>>> Kurtosis: 2.993 Cond. No. 1.71e+03
|
|
652
|
+
>>> ==============================================================================
|
|
653
|
+
|
|
654
|
+
>>> Notes:
|
|
655
|
+
>>> [1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
|
|
656
|
+
>>> [2] The condition number is large, 1.71e+03. This might indicate that there are
|
|
657
|
+
>>> strong multicollinearity or other numerical problems.
|
|
658
|
+
|
|
659
|
+
>>> Cointegration TEST Results:
|
|
660
|
+
>>> (np.float64(-3.204126144947765),
|
|
661
|
+
>>> np.float64(0.019747080611767602),
|
|
662
|
+
>>> 0,
|
|
663
|
+
>>> 249,
|
|
664
|
+
>>> {'1%': np.float64(-3.4568881317725864),
|
|
665
|
+
>>> '10%': np.float64(-2.5729936189738876),
|
|
666
|
+
>>> '5%': np.float64(-2.8732185133016057)},
|
|
667
|
+
>>> np.float64(1364.3866758546171))
|
|
668
|
+
"""
|
|
669
|
+
# Download historical data for required stocks
|
|
670
|
+
p0, p1 = pair[0], pair[1]
|
|
671
|
+
_p0 = yf.download(
|
|
672
|
+
p0,
|
|
673
|
+
start=start,
|
|
674
|
+
end=end,
|
|
675
|
+
progress=False,
|
|
676
|
+
multi_level_index=False,
|
|
677
|
+
auto_adjust=True,
|
|
678
|
+
)
|
|
679
|
+
_p1 = yf.download(
|
|
680
|
+
p1,
|
|
681
|
+
start=start,
|
|
682
|
+
end=end,
|
|
683
|
+
progress=False,
|
|
684
|
+
multi_level_index=False,
|
|
685
|
+
auto_adjust=True,
|
|
686
|
+
)
|
|
687
|
+
df = pd.DataFrame(index=_p0.index)
|
|
688
|
+
df[p0] = _p0["Adj Close"]
|
|
689
|
+
df[p1] = _p1["Adj Close"]
|
|
690
|
+
df = df.dropna()
|
|
691
|
+
|
|
692
|
+
# Calculate optimal hedge ratio "beta"
|
|
693
|
+
# using statsmodels OLS
|
|
694
|
+
X = sm.add_constant(df[p0])
|
|
695
|
+
y = df[p1]
|
|
696
|
+
model = sm.OLS(y, X)
|
|
697
|
+
global results
|
|
698
|
+
results = model.fit()
|
|
699
|
+
beta_hr = results.params[p0]
|
|
700
|
+
|
|
701
|
+
# Plot the two time series with regression line
|
|
702
|
+
plot_price_series(df, p0, p1)
|
|
703
|
+
|
|
704
|
+
# Display a scatter plot of the two time series
|
|
705
|
+
# with regression line
|
|
706
|
+
plot_scatter_series(df, p0, p1)
|
|
707
|
+
|
|
708
|
+
# Calculate the residuals of the linear combination
|
|
709
|
+
df["res"] = results.resid
|
|
710
|
+
plot_residuals(df)
|
|
711
|
+
|
|
712
|
+
# Display regression metrics
|
|
713
|
+
print("\nRegression Metrics:")
|
|
714
|
+
print(f"Optimal Hedge Ratio (Beta): {beta_hr}")
|
|
715
|
+
print("Result Parmas: \n")
|
|
716
|
+
print(results.params)
|
|
717
|
+
print("\nRegression Summary:")
|
|
718
|
+
print(results.summary())
|
|
719
|
+
|
|
720
|
+
# Calculate and output the CADF test on the residuals
|
|
721
|
+
print("\nCointegration TEST Results:")
|
|
722
|
+
cadf = ts.adfuller(df["res"], autolag="AIC")
|
|
723
|
+
pprint.pprint(cadf)
|
|
724
|
+
|
|
725
|
+
|
|
726
|
+
def _hurst(ts):
|
|
727
|
+
"""
|
|
728
|
+
Returns the Hurst Exponent of the time series vector ts.
|
|
729
|
+
"""
|
|
730
|
+
# Create the range of lag values
|
|
731
|
+
lags = range(2, 100)
|
|
732
|
+
|
|
733
|
+
# Calculate the array of the variances of the lagged differences
|
|
734
|
+
tau = [np.sqrt(np.std(np.subtract(ts[lag:], ts[:-lag]))) for lag in lags]
|
|
735
|
+
|
|
736
|
+
# Use a linear fit to estimate the Hurst Exponent
|
|
737
|
+
poly = np.polyfit(np.log(lags), np.log(tau), 1)
|
|
738
|
+
|
|
739
|
+
# Return the Hurst exponent from the polyfit output
|
|
740
|
+
return poly[0] * 2.0
|
|
741
|
+
|
|
742
|
+
|
|
743
|
+
# Function to calculate Hurst Exponent
|
|
744
|
+
|
|
745
|
+
|
|
746
|
+
def hurst(time_series):
|
|
747
|
+
H, c, data_range = compute_Hc(time_series, kind="price", simplified=True)
|
|
748
|
+
return H
|
|
749
|
+
|
|
750
|
+
|
|
751
|
+
def run_hurst_test(symbol: str, start: str, end: str):
    """
    Calculates and prints the Hurst Exponent for a given stock's adjusted closing prices
    within a specified date range, and for three generated series (Geometric Brownian Motion,
    Mean-Reverting, and Trending).

    The Hurst Exponent is used to determine the long-term memory of a time series.

    Args:
        symbol (str): A valid stock ticker symbol (e.g., 'AAPL').
        start (str): The start date for the historical data in 'YYYY-MM-DD' format.
        end (str): The end date for the historical data in 'YYYY-MM-DD' format.

    Example:
        >>> from bbstrader.tseries import run_hurst_test

        >>> run_hurst_test('AAPL', '2023-01-01', '2023-12-31')
    """
    data = yf.download(
        symbol,
        start=start,
        end=end,
        progress=False,
        multi_level_index=False,
        auto_adjust=True,
    )

    # Create a Geometric Brownian Motion, Mean-Reverting, and Trending Series
    gbm = np.log(np.cumsum(np.random.randn(100000)) + 1000)
    mr = np.log(np.random.randn(100000) + 1000)
    tr = np.log(np.cumsum(np.random.randn(100000) + 1) + 1000)

    # Output the Hurst Exponent for each of the series
    print(f"\nHurst(GBM): {_hurst(gbm)}")
    print(f"Hurst(MR): {_hurst(mr)}")
    print(f"Hurst(TR): {_hurst(tr)}")
    # With auto_adjust=True, the "Close" column already holds adjusted prices
    print(f"Hurst({symbol}): {hurst(data['Close'])}\n")


def test_cointegration(ticker1, ticker2, start, end):
    # Download historical data
    stock_data_pair = yf.download(
        [ticker1, ticker2],
        start=start,
        end=end,
        progress=False,
        multi_level_index=False,
        auto_adjust=True,
    )["Close"].dropna()  # "Close" is already adjusted when auto_adjust=True

    # Perform Johansen cointegration test
    result = coint_johansen(stock_data_pair, det_order=0, k_ar_diff=1)

    # Get the cointegration rank
    traces_stats = result.lr1
    print(f"\nTraces Stats: \n{traces_stats}")

    # Get the critical values for 95% confidence level
    critical_values = result.cvt
    print(f"\nCritical Values: \n{critical_values}")

    # Compare the first trace statistic with its 95% critical value
    if traces_stats[0] > critical_values[0, 1]:
        print(f"\n{ticker1} and {ticker2} are cointegrated.\n")
    else:
        print(f"\nNo cointegration found for {ticker1} and {ticker2}.\n")


def run_coint_test(tickers: List[str], start: str, end: str) -> None:
    """
    Performs pairwise cointegration tests on a list of stock tickers over a specified date range.

    For each unique pair of tickers, the function downloads historical adjusted closing prices and
    tests for cointegration.

    Args:
        tickers (List[str]): A list of valid stock ticker symbols (e.g., ['AAPL', 'MSFT', 'GOOG']).
        start (str): The start date for the historical data in 'YYYY-MM-DD' format.
        end (str): The end date for the historical data in 'YYYY-MM-DD' format.

    Example:
        >>> from bbstrader.tseries import run_coint_test

        >>> run_coint_test(['AAPL', 'MSFT', 'GOOG'], '2023-01-01', '2023-12-31')
    """
    # Loop through ticker combinations
    for ticker1, ticker2 in combinations(tickers, 2):
        test_cointegration(ticker1, ticker2, start, end)


# *********************************
# KALMAN FILTER *
# *********************************
def draw_date_coloured_scatterplot(etfs, prices):
    """
    Create a scatterplot of the two ETF prices, which is
    coloured by the date of the price to indicate the
    changing relationship between the sets of prices
    """
    plen = len(prices)
    colour_map = plt.cm.get_cmap("YlOrRd")
    colours = np.linspace(0.1, 1, plen)

    scatterplot = plt.scatter(
        prices[etfs[0]],
        prices[etfs[1]],
        s=30,
        c=colours,
        cmap=colour_map,
        edgecolor="k",
        alpha=0.8,
    )

    colourbar = plt.colorbar(scatterplot)
    colourbar.ax.set_yticklabels([str(p.date()) for p in prices[:: plen // 9].index])

    plt.xlabel(prices.columns[0])
    plt.ylabel(prices.columns[1])
    plt.show()


def calc_slope_intercept_kalman(etfs, prices):
    """
    Utilize the Kalman Filter from the filterpy library
    to calculate the slope and intercept of the regressed
    ETF prices.
    """
    delta = 1e-5
    trans_cov = delta / (1 - delta) * np.eye(2)

    kf = KalmanFilter(dim_x=2, dim_z=1)
    kf.x = np.zeros((2, 1))  # Initial state
    kf.P = np.ones((2, 2)) * 1000.0  # Initial covariance,
    # large to represent high uncertainty
    kf.F = np.eye(2)  # State transition matrix
    kf.Q = trans_cov  # Process noise covariance
    kf.R = 1.0  # Scalar measurement noise covariance

    state_means, state_covs = [], []
    for time, z in enumerate(prices[etfs[1]].values):
        # Dynamically update the observation matrix H
        # to include the current independent variable
        kf.H = np.array([[prices[etfs[0]].iloc[time], 1.0]])
        kf.predict()
        kf.update(z)
        state_means.append(kf.x.copy())
        state_covs.append(kf.P.copy())

    return np.array(state_means), np.array(state_covs)


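# --- Hedged usage sketch (illustrative only, not part of the original module) ---
# Turning the filtered states into a dynamic hedge ratio. `prices` is assumed to
# be the two-column DataFrame built in run_kalman_filter below; the first state
# is the slope (hedge ratio), the second the intercept.
def _demo_dynamic_hedge_ratio(etfs, prices):
    state_means, _ = calc_slope_intercept_kalman(etfs, prices)
    hedge_ratio = pd.Series(state_means[:, 0].flatten(), index=prices.index)
    # Spread of the dependent ETF against the dynamically hedged independent ETF
    spread = prices[etfs[1]] - hedge_ratio * prices[etfs[0]]
    return hedge_ratio, spread

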
def draw_slope_intercept_changes(prices, state_means):
    """
    Plot the slope and intercept of the regressed ETF prices
    between the two ETFs, with the changing values of the
    Kalman Filter over time.
    """
    print(f"First Slope : {state_means[0, 0]}")
    print(f"First intercept : {state_means[0, 1]}")
    pd.DataFrame(
        {
            "slope": state_means[:, 0].flatten(),
            "intercept": state_means[:, 1].flatten(),
        },
        index=prices.index,
    ).plot(subplots=True)
    plt.show()


def run_kalman_filter(
    etfs: Union[List[str], Tuple[str, ...]], start: str, end: str
) -> None:
    """
    Applies a Kalman filter to a pair of assets' adjusted closing prices within a specified date range
    to estimate the slope and intercept over time.

    The function downloads historical adjusted closing prices for the specified pair of assets,
    visualizes their price relationship, calculates the Kalman filter estimates for the slope and
    intercept, and visualizes the changes in these estimates over time.

    Args:
        etfs (Union[List[str], Tuple[str, ...]]):
            A list or tuple containing two valid asset tickers (e.g., ['SPY', 'QQQ']).
        start (str): The start date for the historical data in 'YYYY-MM-DD' format.
        end (str): The end date for the historical data in 'YYYY-MM-DD' format.

    Example:
        >>> from bbstrader.tseries import run_kalman_filter

        >>> run_kalman_filter(['SPY', 'QQQ'], '2023-01-01', '2023-12-31')
    """
    etf_df1 = yf.download(
        etfs[0], start, end, progress=False, multi_level_index=False, auto_adjust=True
    )
    etf_df2 = yf.download(
        etfs[1], start, end, progress=False, multi_level_index=False, auto_adjust=True
    )

    prices = pd.DataFrame(index=etf_df1.index)
    # With auto_adjust=True the "Close" column already holds adjusted prices
    prices[etfs[0]] = etf_df1["Close"]
    prices[etfs[1]] = etf_df2["Close"]

    draw_date_coloured_scatterplot(etfs, prices)
    state_means, state_covs = calc_slope_intercept_kalman(etfs, prices)
    draw_slope_intercept_changes(prices, state_means)


class KalmanFilterModel:
    """
    Implements a Kalman Filter model, a recursive algorithm used for estimating
    the state of a linear dynamic system from a series of noisy measurements.
    It's designed to process market data, estimate dynamic parameters such as
    the slope and intercept of price relationships, and compute the
    forecast error and standard deviation of the predictions.

    You can learn more here https://en.wikipedia.org/wiki/Kalman_filter
    """

    def __init__(self, tickers: List | Tuple, **kwargs):
        """
        Initializes the Kalman Filter strategy.

        Args:
            tickers :
                A list or tuple of ticker symbols representing financial instruments.

            kwargs : Keyword arguments for additional parameters,
                specifically `delta` and `vt`
        """
        self.tickers = tickers
        assert self.tickers is not None

        self.R = None
        self.theta = np.zeros(2)
        self.P = np.zeros((2, 2))
        self.delta = kwargs.get("delta", 1e-4)
        self.vt = kwargs.get("vt", 1e-3)
        self.wt = self.delta / (1 - self.delta) * np.eye(2)
        self.latest_prices = np.array([-1.0, -1.0])
        self.kf = self._init_kalman()

    def _init_kalman(self):
        """
        Initializes and returns a Kalman Filter configured
        for the trading strategy. The filter is set up with initial
        state and covariance, state transition matrix, process noise
        and measurement noise covariances.
        """
        kf = KalmanFilter(dim_x=2, dim_z=1)
        kf.x = np.zeros((2, 1))  # Initial state
        kf.P = self.P  # Initial covariance
        kf.F = np.eye(2)  # State transition matrix
        kf.Q = self.wt  # Process noise covariance
        kf.R = 1.0  # Scalar measurement noise covariance

        return kf

    Array = np.ndarray

    def calc_slope_intercep(self, prices: Array) -> Tuple:
        """
        Calculates and returns the slope and intercept
        of the relationship between the provided prices using the Kalman Filter.
        This method updates the filter with the latest price and returns
        the estimated slope and intercept.

        Args:
            prices : A numpy array of prices for two financial instruments.

        Returns:
            A tuple containing the slope and intercept of the relationship
        """
        # The observation matrix uses the independent price (x) and a constant term;
        # the filter is then updated with the dependent price (y).
        self.kf.H = np.array([[prices[0], 1.0]])
        self.kf.predict()
        self.kf.update(prices[1])
        slope = self.kf.x.copy().flatten()[0]
        intercept = self.kf.x.copy().flatten()[1]

        return slope, intercept

    def calculate_etqt(self, prices: Array) -> Tuple:
        """
        Calculates the ``forecast error`` and ``standard deviation`` of the predictions
        using the Kalman Filter.

        Args:
            prices : A numpy array of prices for two financial instruments.

        Returns:
            A tuple containing the ``forecast error`` and ``standard deviation`` of the predictions.
        """

        self.latest_prices[0] = prices[0]
        self.latest_prices[1] = prices[1]

        if all(self.latest_prices > -1.0):
            slope, intercept = self.calc_slope_intercep(self.latest_prices)

            self.theta[0] = slope
            self.theta[1] = intercept

            # Create the observation matrix of the latest prices
            # of Y and the intercept value (1.0) as well as the
            # scalar value of the latest price from X
            F = np.asarray([self.latest_prices[0], 1.0]).reshape((1, 2))
            y = self.latest_prices[1]

            # The prior value of the states {\theta_t} is
            # distributed as a multivariate Gaussian with
            # mean a_t and variance-covariance {R_t}
            if self.R is not None:
                self.R = self.C + self.wt
            else:
                self.R = np.zeros((2, 2))

            # Calculate the Kalman Filter update
            # ---------------------------------
            # Calculate prediction of new observation
            # as well as forecast error of that prediction
            yhat = F.dot(self.theta)
            et = y - yhat

            # {Q_t} is the variance of the prediction of
            # observations and hence sqrt_Qt is the
            # standard deviation of the predictions
            Qt = F.dot(self.R).dot(F.T) + self.vt
            sqrt_Qt = np.sqrt(Qt)

            # The posterior value of the states {\theta_t} is
            # distributed as a multivariate Gaussian with mean
            # {m_t} and variance-covariance {C_t}
            At = self.R.dot(F.T) / Qt
            self.theta = self.theta + At.flatten() * et
            self.C = self.R - At * F.dot(self.R)
            return (et[0], sqrt_Qt.flatten()[0])
        else:
            return None


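# --- Hedged usage sketch (illustrative only, not part of the original module) ---
# Feeding a pair of price series through KalmanFilterModel and reading the
# forecast error against its standard deviation, the usual entry/exit signal of
# a Kalman pairs strategy. The ticker names and the one-sigma threshold are
# assumptions for the sketch, not a recommendation from the module.
def _demo_kalman_signals(prices_x, prices_y):
    model = KalmanFilterModel(["X", "Y"], delta=1e-4, vt=1e-3)
    signals = []
    for px, py in zip(prices_x, prices_y):
        out = model.calculate_etqt(np.array([px, py]))
        if out is None:
            continue
        et, sqrt_qt = out
        if et < -sqrt_qt:
            signals.append("long spread")
        elif et > sqrt_qt:
            signals.append("short spread")
        else:
            signals.append("flat")
    return signals

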
# ******************************************
# ORNSTEIN UHLENBECK PROCESS *
# ******************************************


class OrnsteinUhlenbeck:
    """
    The Ornstein-Uhlenbeck process is a mathematical model
    used to describe the behavior of a mean-reverting stochastic process.
    We use it to model the price dynamics of an asset that tends
    to revert to a long-term mean.

    We estimate the drift (θ), volatility (σ), and long-term mean (μ)
    based on historical price data; then we simulate the OU process
    using the estimated parameters.

    https://en.wikipedia.org/wiki/Ornstein%E2%80%93Uhlenbeck_process
    """

    def __init__(self, prices: np.ndarray, returns: bool = True, timeframe: str = "D1"):
        """
        Initializes the OrnsteinUhlenbeck instance.

        Args:
            prices (np.ndarray) : Historical close prices.

            returns (bool) : Use it to indicate whether
                you want to simulate the returns or your raw data

            timeframe (str) : The time frame for the historical prices
                (1m, 5m, 15m, 30m, 1h, 4h, D1)
        """
        self.prices = prices
        if returns:
            series = pd.Series(self.prices)
            self.returns = series.pct_change().dropna().values
        else:
            self.returns = self.prices

        time_frame_mapping = {
            "1m": 1 / (24 * 60),  # 1 minute intervals
            "5m": 5 / (24 * 60),  # 5 minute intervals
            "15m": 15 / (24 * 60),  # 15 minute intervals
            "30m": 30 / (24 * 60),  # 30 minute intervals
            "1h": 1 / 24,  # 1 hour intervals
            "4h": 4 / 24,  # 4 hour intervals
            "D1": 1,  # Daily intervals
        }
        if timeframe not in time_frame_mapping:
            raise ValueError("Unsupported time frame")
        self.tf = time_frame_mapping[timeframe]

        params = self.estimate_parameters()
        self.mu_hat = params[0]  # Mean (μ)
        self.theta_hat = params[1]  # Drift (θ)
        self.sigma_hat = params[2]  # Volatility (σ)
        print(f"Estimated μ: {self.mu_hat}")
        print(f"Estimated θ: {self.theta_hat}")
        print(f"Estimated σ: {self.sigma_hat}")

    def ornstein_uhlenbeck(self, mu, theta, sigma, dt, X0, n):
        """
        Simulates the Ornstein-Uhlenbeck process.

        Args:
            mu (float): Estimated long-term mean.
            theta (float): Estimated drift.
            sigma (float): Estimated volatility.
            dt (float): Time step.
            X0 (float): Initial value.
            n (int): Number of time steps.

        Returns:
            np.ndarray : Simulated process.
        """
        x = np.zeros(n)
        x[0] = X0
        for t in range(1, n):
            dW = np.random.normal(loc=0, scale=np.sqrt(dt))
            # O-U process differential equation
            x[t] = x[t - 1] + (theta * (mu - x[t - 1]) * dt) + (sigma * dW)
            # dW is a Wiener process
            # (theta * (mu - x[t-1]) * dt) represents the mean-reverting tendency
            # (sigma * dW) represents the random volatility
        return x

    def estimate_parameters(self):
        """
        Estimates the mean-reverting parameters (μ, θ, σ)
        using the negative log-likelihood.

        Returns:
            Tuple: Estimated μ, θ, and σ.
        """
        initial_guess = [0, 0.1, np.std(self.returns)]
        result = minimize(self._neg_log_likelihood, initial_guess, args=(self.returns,))
        mu, theta, sigma = result.x
        return mu, theta, sigma

    def _neg_log_likelihood(self, params, returns):
        """
        Calculates the negative
        log-likelihood for parameter estimation.

        Args:
            params (list): List of parameters [mu, theta, sigma].
            returns (np.ndarray): Historical returns.

        Returns:
            float: Negative log-likelihood.
        """
        mu, theta, sigma = params
        dt = self.tf
        n = len(returns)
        ou_simulated = self.ornstein_uhlenbeck(mu, theta, sigma, dt, 0, n + 1)
        residuals = ou_simulated[1 : n + 1] - returns
        neg_ll = 0.5 * np.sum(residuals**2) / sigma**2 + 0.5 * n * np.log(
            2 * np.pi * sigma**2
        )
        return neg_ll

    def simulate_process(self, returns=None, n=100, p=None):
        """
        Simulates the OU process multiple times.

        Args:
            returns (np.ndarray): Historical returns.
            n (int): Number of simulations to perform.
            p (int): Number of time steps.

        Returns:
            np.ndarray: 2D array representing simulated processes.
        """
        if returns is None:
            returns = self.returns
        if p is not None:
            T = p
        else:
            T = len(returns)
        dt = self.tf

        dW_matrix = np.random.normal(loc=0, scale=np.sqrt(dt), size=(n, T))
        simulations_matrix = np.zeros((n, T))
        simulations_matrix[:, 0] = returns[-1]

        for t in range(1, T):
            simulations_matrix[:, t] = (
                simulations_matrix[:, t - 1]
                + self.theta_hat * (self.mu_hat - simulations_matrix[:, t - 1]) * dt
                + self.sigma_hat * dW_matrix[:, t]
            )
        return simulations_matrix


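# --- Hedged usage sketch (illustrative only, not part of the original module) ---
# Fitting the OU model to a price series and simulating forward paths. The
# synthetic prices, the random seed and the path/step counts are assumptions
# used purely to make the example self-contained.
def _demo_ornstein_uhlenbeck():
    rng = np.random.default_rng(42)
    prices = 100 + np.cumsum(rng.normal(0, 0.5, size=500))  # synthetic daily closes
    ou = OrnsteinUhlenbeck(prices, returns=True, timeframe="D1")
    paths = ou.simulate_process(n=50, p=30)  # 50 simulated paths, 30 steps ahead
    return paths.mean(axis=0)  # average simulated path

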
def remove_correlated_assets(df: pd.DataFrame, cutoff=0.99):
    """
    Removes highly correlated assets from a DataFrame based on a specified correlation cutoff threshold.
    This is useful in financial data analysis to reduce redundancy and multicollinearity in portfolios or datasets.

    Args:
        df (pd.DataFrame): A DataFrame where each column represents an asset
            and rows represent observations (e.g., time-series data).
        cutoff (float, optional, default=0.99): The correlation threshold.
            Columns with absolute correlation greater than this value will be considered for removal.

    Returns:
        pd.DataFrame: A DataFrame with less correlated assets.
            The columns that are highly correlated (above the cutoff) are removed.

    References
    ----------
    Stefan Jansen (2020). Machine Learning for Algorithmic Trading - Second Edition.
    chapter 9, Time-Series Models for Volatility Forecasts and Statistical Arbitrage.

    Example:
        >>> df = pd.DataFrame({
        ...     'AAPL': [100, 101, 102, 103, 104],
        ...     'MSFT': [200, 201, 202, 203, 204],
        ...     'GOOG': [300, 301, 302, 303, 304]
        ... })
        >>> df = remove_correlated_assets(df)
    """
    corr = df.corr().stack()
    corr = corr[corr < 1]
    to_check = corr[corr.abs() > cutoff].index
    keep, drop = set(), set()
    for s1, s2 in to_check:
        if s1 not in keep:
            if s2 not in keep:
                keep.add(s1)
                drop.add(s2)
            else:
                drop.add(s1)
        else:
            keep.discard(s2)
            drop.add(s2)
    return df.drop(drop, axis=1)


def check_stationarity(df: pd.DataFrame):
    """
    Tests the stationarity of time-series data for each asset in the DataFrame
    using the Augmented Dickey-Fuller (ADF) test. Stationarity is a key property
    in time-series analysis, and non-stationary data can affect model performance.

    Args:
        df (pd.DataFrame): A DataFrame where each column represents a time series of an asset.

    Returns:
        pd.DataFrame: A DataFrame containing the ADF p-values for each asset,
            - ticker: Asset name (column name from df).
            - adf: p-value from the ADF test, indicating the probability of the null hypothesis (data is non-stationary).

    References
    ----------
    Stefan Jansen (2020). Machine Learning for Algorithmic Trading - Second Edition.
    chapter 9, Time-Series Models for Volatility Forecasts and Statistical Arbitrage.

    Example:
        >>> df = pd.DataFrame({
        ...     'AAPL': [100, 101, 102, 103, 104],
        ...     'MSFT': [200, 201, 202, 203, 204],
        ...     'GOOG': [300, 301, 302, 303, 304]
        ... })
        >>> df = check_stationarity(df)
    """
    results = []
    for ticker, prices in df.items():
        results.append([ticker, adfuller(prices, regression="ct")[1]])
    return pd.DataFrame(results, columns=["ticker", "adf"]).sort_values("adf")


def remove_stationary_assets(df: pd.DataFrame, pval=0.05):
    """
    Filters out stationary assets from the DataFrame based on the p-value obtained
    from the Augmented Dickey-Fuller test.
    Useful for focusing only on non-stationary time-series data.

    Args:
        df (pd.DataFrame): A DataFrame where each column represents a time series of an asset.
        pval (float, optional, default=0.05): The significance level to determine stationarity.
            Columns with an ADF test p-value below this threshold are considered stationary and removed.

    Returns:
        pd.DataFrame: A DataFrame containing only the non-stationary assets.

    References
    ----------
    Stefan Jansen (2020). Machine Learning for Algorithmic Trading - Second Edition.
    chapter 9, Time-Series Models for Volatility Forecasts and Statistical Arbitrage.

    Example:
        >>> df = pd.DataFrame({
        ...     'AAPL': [100, 101, 102, 103, 104],
        ...     'MSFT': [200, 201, 202, 203, 204],
        ...     'GOOG': [300, 301, 302, 303, 304]
        ... })
        >>> df = remove_stationary_assets(df)
    """
    test_result = check_stationarity(df)
    stationary = test_result.loc[test_result.adf <= pval, "ticker"].tolist()
    return df.drop(stationary, axis=1).sort_index()


def select_assets(df: pd.DataFrame, n=100, start=None, end=None, rolling_window=None):
    """
    Selects the top N assets based on the average trading volume from the input DataFrame.
    These assets are used as a universe in which we can search for cointegrated pairs for pairs trading strategies.

    Args:
        df (pd.DataFrame): A multi-index DataFrame with levels ['ticker', 'date'] containing market data.
            Must include columns 'close' (price) and 'volume'.
        n (int, optional): Number of assets to select based on highest average trading volume. Default is 100.
        start (str, optional): Start date for filtering the data. Default is the earliest date in the DataFrame.
        end (str, optional): End date for filtering the data. Default is the latest date in the DataFrame.
        rolling_window (int, optional): Rolling window for calculating the average trading volume. Default is None.

    Returns:
        pd.DataFrame: A DataFrame of selected assets with filtered, cleaned data, indexed by date.

    References
    ----------
    Stefan Jansen (2020). Machine Learning for Algorithmic Trading - Second Edition.
    chapter 9, Time-Series Models for Volatility Forecasts and Statistical Arbitrage.
    """
    required_columns = {"close", "volume"}
    if not required_columns.issubset(df.columns):
        raise ValueError(
            f"Input DataFrame must contain {required_columns}, but got {df.columns.tolist()}."
        )

    if (
        not isinstance(df.index, pd.MultiIndex)
        or "ticker" not in df.index.names
        or "date" not in df.index.names
    ):
        raise ValueError("Index must be a MultiIndex with levels ['ticker', 'date'].")

    df = df.copy()
    idx = pd.IndexSlice
    start = start or df.index.get_level_values("date").min()
    end = end or df.index.get_level_values("date").max()
    df = (
        df.loc[lambda df: ~df.index.duplicated()]
        .sort_index()
        .loc[idx[:, f"{start}" : f"{end}"], :]
        .assign(dv=lambda df: df.close.mul(df.volume))
    )

    if rolling_window is None:
        most_traded = df.groupby(level="ticker").dv.mean().nlargest(n=n).index
    else:
        # Calculate the rolling average of dollar volume
        df["dv_rolling_avg"] = (
            df.groupby(level=0)
            .dv.rolling(window=rolling_window, min_periods=1)
            .mean()
            .reset_index(level=0, drop=True)
        )
        most_traded = df.groupby(level=0)["dv_rolling_avg"].mean().nlargest(n=n).index
    df = (
        df.loc[idx[most_traded, :], "close"]
        .unstack("ticker")
        .ffill(limit=5)
        .dropna(axis=1)
    )
    df = remove_correlated_assets(df)
    df = remove_stationary_assets(df)
    return df.sort_index()


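# --- Hedged usage sketch (illustrative only, not part of the original module) ---
# Building the ['ticker', 'date'] MultiIndex frame that select_assets expects
# from plain yfinance downloads. The tickers, the date range and the column
# renaming are assumptions for the sketch; remove_correlated_assets and
# remove_stationary_assets may prune a universe this small even further.
def _demo_build_universe(tickers=("AAPL", "MSFT", "GOOG"), start="2022-01-01", end="2023-01-01"):
    frames = []
    for ticker in tickers:
        data = yf.download(ticker, start=start, end=end, progress=False,
                           multi_level_index=False, auto_adjust=True)
        data = data.rename(columns=str.lower)[["close", "volume"]]
        data["ticker"] = ticker
        frames.append(data)
    panel = pd.concat(frames)
    panel.index.name = "date"
    panel = panel.set_index("ticker", append=True).swaplevel()  # ['ticker', 'date']
    return select_assets(panel, n=len(tickers))

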
def compute_pair_metrics(security: pd.Series, candidates: pd.DataFrame):
    """
    Calculates statistical and econometric metrics for a target security and a set of candidate securities.
    These metrics are useful in financial modeling and pairs trading strategies,
    providing information about drift, volatility, correlation, and cointegration.

    Args:
        security (pd.Series): A time-series of the target security's prices.
            The name of the Series should correspond to the security's identifier (e.g., ticker symbol).
        candidates (pd.DataFrame): A DataFrame where each column represents a time-series of prices
            for candidate securities to be evaluated against the target security.

    Returns:
        pd.DataFrame: A DataFrame combining:
            Drift: Estimated drift of spreads between the target security and each candidate.
            Volatility: Standard deviation of spreads.
            Correlation:
                ``corr``: Correlation of normalized prices between the target and each candidate.
                ``corr_ret``: Correlation of returns (percentage change) between the target and each candidate.
            Cointegration metrics:
                Engle-Granger test statistics (``t1``, ``t2``) and p-values (``p1``, ``p2``).
                Johansen test trace statistics (``trace0``, ``trace1``) and selected lag order (``k_ar_diff``).

    References
    ----------
    Stefan Jansen (2020). Machine Learning for Algorithmic Trading - Second Edition.
    chapter 9, Time-Series Models for Volatility Forecasts and Statistical Arbitrage.
    """
    security = security.div(security.iloc[0])
    ticker = security.name
    candidates = candidates.div(candidates.iloc[0])
    spreads = candidates.sub(security, axis=0)
    n, m = spreads.shape
    X = np.ones(shape=(n, 2))
    X[:, 1] = np.arange(1, n + 1)

    # compute drift
    drift = (np.linalg.inv(X.T @ X) @ X.T @ spreads).iloc[1].to_frame("drift")

    # compute volatility
    vol = spreads.std().to_frame("vol")

    # returns correlation
    corr_ret = (
        candidates.pct_change().corrwith(security.pct_change()).to_frame("corr_ret")
    )

    # normalized price series correlation
    corr = candidates.corrwith(security).to_frame("corr")
    metrics = drift.join(vol).join(corr).join(corr_ret).assign(n=n)

    tests = []
    # run cointegration tests
    for candidate, prices in tqdm(candidates.items()):
        df = pd.DataFrame({"s1": security, "s2": prices})
        var = VAR(df.values)
        lags = var.select_order()  # select VAR order
        k_ar_diff = lags.selected_orders["aic"]
        # Johansen Test with constant term and estd. lag order
        cj0 = coint_johansen(df, det_order=0, k_ar_diff=k_ar_diff)
        # Engle-Granger Tests
        t1, p1 = coint(security, prices, trend="c")[:2]
        t2, p2 = coint(prices, security, trend="c")[:2]
        tests.append([ticker, candidate, t1, p1, t2, p2, k_ar_diff, *cj0.lr1])
    columns = ["s1", "s2", "t1", "p1", "t2", "p2", "k_ar_diff", "trace0", "trace1"]
    tests = pd.DataFrame(tests, columns=columns).set_index("s2")
    return metrics.join(tests)


__CRITICAL_VALUES = {
    0: {0.9: 13.4294, 0.95: 15.4943, 0.99: 19.9349},
    1: {0.9: 2.7055, 0.95: 3.8415, 0.99: 6.6349},
}


def find_cointegrated_pairs(
    securities: pd.DataFrame,
    candidates: pd.DataFrame,
    n=None,
    start=None,
    stop=None,
    coint=False,
):
    """
    Identifies cointegrated pairs between a target set of securities and candidate securities
    based on econometric tests. The function evaluates statistical relationships,
    such as cointegration and Engle-Granger significance, to determine pairs suitable
    for financial strategies like pairs trading.

    Args:
        securities (`pd.DataFrame`): A DataFrame where each column represents the time-series
            prices of target securities to evaluate.
        candidates (`pd.DataFrame`): A DataFrame where each column represents the time-series
            prices of candidate securities to compare against the target securities.
        n (`int`, optional): The number of top pairs to return. If `None`, returns all pairs.
        start (`str`, optional): Start date for slicing the data (e.g., 'YYYY-MM-DD').
        stop (`str`, optional): End date for slicing the data (e.g., 'YYYY-MM-DD').
        coint (`bool`, optional, default=False):
            - If `True`, filters for pairs identified as cointegrated.
            - If `False`, returns all evaluated pairs.

    Returns:
        - ``pd.DataFrame``: A DataFrame containing:
            - Johansen and Engle-Granger cointegration metrics:
                - `t1`, `t2`: Engle-Granger test statistics for two directions.
                - `p1`, `p2`: Engle-Granger p-values for two directions.
                - `trace0`, `trace1`: Johansen test trace statistics for 0 and 1 cointegration relationships.
            - Indicators and filters:
                - `joh_sig`: Indicates Johansen cointegration significance.
                - `eg_sig`: Indicates Engle-Granger significance (p-value < 0.05).
                - `s1_dep`: Indicates whether the first series depends on the second (based on p-values).
                - `coint`: Combined cointegration indicator (Johansen & Engle-Granger).
            - Spread and ranking:
                - `t`: Minimum of `t1` and `t2`.
                - `p`: Minimum of `p1` and `p2`.

    References
    ----------
    Stefan Jansen (2020). Machine Learning for Algorithmic Trading - Second Edition.
    chapter 9, Time-Series Models for Volatility Forecasts and Statistical Arbitrage.

    Example:
        >>> import pandas as pd

        >>> # Sample Data
        >>> data_securities = {
        ...     'Security1': [100, 102, 101, 103, 105],
        ...     'Security2': [50, 52, 53, 51, 54]
        ... }
        >>> data_candidates = {
        ...     'Candidate1': [100, 101, 99, 102, 104],
        ...     'Candidate2': [200, 202, 201, 203, 205]
        ... }

        >>> securities = pd.DataFrame(data_securities, index=pd.date_range('2023-01-01', periods=5))
        >>> candidates = pd.DataFrame(data_candidates, index=pd.date_range('2023-01-01', periods=5))

        >>> # Find cointegrated pairs
        >>> top_pairs = find_cointegrated_pairs(securities, candidates, n=2, coint=True)
        >>> print(top_pairs)

        | s1       | s2        | t    | p    | joh_sig | eg_sig | coint |
        |----------|-----------|------|------|---------|--------|-------|
        | Security1| Candidate1| -3.5 | 0.01 | 1       | 1      | 1     |
        | Security2| Candidate2| -2.9 | 0.04 | 1       | 1      | 1     |
    """
    trace0_cv = __CRITICAL_VALUES[0][
        0.95
    ]  # critical value for 0 cointegration relationships
    # critical value for 1 cointegration relationship
    trace1_cv = __CRITICAL_VALUES[1][0.95]
    spreads = []
    if start is not None and stop is not None:
        securities = securities.loc[str(start) : str(stop), :]
        candidates = candidates.loc[str(start) : str(stop), :]
    for i, (ticker, prices) in enumerate(securities.items(), 1):
        try:
            df = compute_pair_metrics(prices, candidates)
            spreads.append(df.set_index("s1", append=True))
        except np.linalg.LinAlgError:
            continue
    spreads = pd.concat(spreads)
    spreads.index.names = ["s2", "s1"]
    spreads = spreads.swaplevel()
    spreads["t"] = spreads[["t1", "t2"]].min(axis=1)
    spreads["p"] = spreads[["p1", "p2"]].min(axis=1)
    spreads["joh_sig"] = (
        (spreads.trace0 > trace0_cv) & (spreads.trace1 > trace1_cv)
    ).astype(int)
    spreads["eg_sig"] = (spreads.p < 0.05).astype(int)
    spreads["s1_dep"] = spreads.p1 < spreads.p2
    spreads["coint"] = (spreads.joh_sig & spreads.eg_sig).astype(int)
    # select top n pairs
    if coint:
        if n is not None:
            top_pairs = (
                spreads.query("coint == 1").sort_values("t", ascending=False).head(n)
            )
        else:
            top_pairs = spreads.query("coint == 1").sort_values("t", ascending=False)
    else:
        if n is not None:
            top_pairs = spreads.sort_values("t", ascending=False).head(n)
        else:
            top_pairs = spreads.sort_values("t", ascending=False)
    return top_pairs


def analyze_cointegrated_pairs(
    spreads: pd.DataFrame,
    plot_coint=True,
    crosstab=False,
    heuristics=False,
    log_reg=False,
    decis_tree=False,
):
    """
    Analyzes cointegrated pairs by visualizing, summarizing, and applying predictive models.

    Args:
        spreads (pd.DataFrame):
            A DataFrame containing cointegration metrics and characteristics.
            Required columns: 'coint', 't', 'trace0', 'trace1', 'drift', 'vol', 'corr', 'corr_ret', 'eg_sig', 'joh_sig'.
        plot_coint (bool, optional):
            If True, generates scatterplots and boxplots to visualize cointegration characteristics.
        crosstab (bool, optional):
            If True, displays crosstabulations of Engle-Granger and Johansen test significance.
        heuristics (bool, optional):
            If True, prints descriptive statistics for drift, volatility, and correlation grouped by cointegration status.
        log_reg (bool, optional):
            If True, fits a logistic regression model to predict cointegration and evaluates its performance.
        decis_tree (bool, optional):
            If True, fits a decision tree model to predict cointegration and evaluates its performance.

    References
    ----------
    Stefan Jansen (2020). Machine Learning for Algorithmic Trading - Second Edition.
    chapter 9, Time-Series Models for Volatility Forecasts and Statistical Arbitrage.

    Example:
        >>> import pandas as pd
        >>> from bbstrader.tseries import find_cointegrated_pairs, analyze_cointegrated_pairs

        >>> # Sample Data
        >>> securities = pd.DataFrame({
        ...     'SPY': [100, 102, 101, 103, 105],
        ...     'QQQ': [50, 52, 53, 51, 54]
        ... })
        >>> candidates = pd.DataFrame({
        ...     'AAPL': [100, 101, 99, 102, 104],
        ...     'MSFT': [200, 202, 201, 203, 205]
        ... })

        >>> pairs = find_cointegrated_pairs(securities, candidates, n=2, coint=True)
        >>> analyze_cointegrated_pairs(pairs, plot_coint=True, crosstab=True, heuristics=True, log_reg=True, decis_tree=True)
    """
    if plot_coint:
        trace0_cv = __CRITICAL_VALUES[0][0.95]
        spreads = spreads.reset_index()
        sns.scatterplot(
            x=np.log1p(spreads.t.abs()),
            y=np.log1p(spreads.trace1),
            hue="coint",
            data=spreads[spreads.trace0 > trace0_cv],
        )
        fig, axes = plt.subplots(ncols=4, figsize=(20, 5))
        for i, heuristic in enumerate(["drift", "vol", "corr", "corr_ret"]):
            sns.boxplot(x="coint", y=heuristic, data=spreads, ax=axes[i])
        fig.tight_layout()

    if heuristics:
        spreads = spreads.reset_index()
        h = (
            spreads.groupby(spreads.coint)[["drift", "vol", "corr"]]
            .describe()
            .stack(level=0)
            .swaplevel()
            .sort_index()
        )
        print(h)

    if log_reg:
        y = spreads.coint
        X = spreads[["drift", "vol", "corr", "corr_ret"]]
        log_reg = LogisticRegressionCV(
            Cs=np.logspace(-10, 10, 21), class_weight="balanced", scoring="roc_auc"
        )
        log_reg.fit(X=X, y=y)
        Cs = log_reg.Cs_
        scores = pd.DataFrame(log_reg.scores_[True], columns=Cs).mean()
        scores.plot(logx=True)
        res = f"C:{np.log10(scores.idxmax()):.2f}, AUC: {scores.max():.2%}"
        print(res)
        print(log_reg.coef_)

    if decis_tree:
        model = DecisionTreeClassifier(class_weight="balanced")
        decision_tree = GridSearchCV(
            model, param_grid={"max_depth": list(range(1, 10))}, cv=5, scoring="roc_auc"
        )
        y = spreads.coint
        X = spreads[["drift", "vol", "corr", "corr_ret"]]
        decision_tree.fit(X, y)
        res = f"{decision_tree.best_score_:.2%}, Depth: {decision_tree.best_params_['max_depth']}"
        print(res)

    if crosstab:
        pd.set_option("display.float_format", lambda x: f"{x:.2%}")
        print(pd.crosstab(spreads.eg_sig, spreads.joh_sig))
        print(pd.crosstab(spreads.eg_sig, spreads.joh_sig, normalize=True))


def select_candidate_pairs(pairs: pd.DataFrame, period=False):
    """
    Select candidate pairs from a DataFrame based on cointegration status.

    This function filters the input DataFrame to select pairs where the 'coint' column equals 1,
    indicating cointegration. It then determines the dependent and independent series for each pair
    and returns the selected pairs in a dictionary format.

    Args:
        pairs (pd.DataFrame): A DataFrame containing pairs of time series with columns 'coint', 's1', 's2', and 's1_dep'.
        period (bool, optional): If True, includes the 'period' column in the output. Defaults to False.

    Returns:
        list[dict]: A list of dictionaries, each containing the keys 'x' and 'y' (and optionally 'period') representing the selected pairs.

    References
    ----------
    Stefan Jansen (2020). Machine Learning for Algorithmic Trading - Second Edition.
    chapter 9, Time-Series Models for Volatility Forecasts and Statistical Arbitrage.
    """
    candidates = pairs.query("coint == 1").copy()
    candidates = candidates.reset_index()
    candidates["y"] = candidates.apply(
        lambda x: x["s1"] if x.s1_dep else x["s2"], axis=1
    )
    candidates["x"] = candidates.apply(
        lambda x: x["s2"] if x.s1_dep else x["s1"], axis=1
    )
    if period:
        return candidates[["x", "y", "period"]].to_dict(orient="records")
    return candidates[["x", "y"]].to_dict(orient="records")


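# --- Hedged usage sketch (illustrative only, not part of the original module) ---
# Wiring the pair-selection helpers together. `universe` is assumed to be a wide
# close-price DataFrame such as the one returned by select_assets; the 50/50
# split of targets vs candidates and the pair count are arbitrary choices.
def _demo_pairs_pipeline(universe: pd.DataFrame, n_pairs: int = 10):
    half = universe.shape[1] // 2
    securities, candidates = universe.iloc[:, :half], universe.iloc[:, half:]
    pairs = find_cointegrated_pairs(securities, candidates, n=n_pairs, coint=True)
    analyze_cointegrated_pairs(pairs, plot_coint=False, heuristics=True)
    return select_candidate_pairs(pairs)

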
def KFSmoother(prices: pd.Series | np.ndarray) -> pd.Series | np.ndarray:
    """
    Estimate rolling mean using Kalman Smoothing.

    Args:
        prices : pd.Series or np.ndarray
            The input time series data to be smoothed. It must be either a pandas Series or a numpy array.

    Returns:
        pd.Series or np.ndarray
            The smoothed time series data. If the input is a pandas Series, the output will also be a pandas Series with the same index.
            If the input is a numpy array, the output will be a numpy array.

    References
    ----------
    Stefan Jansen (2020). Machine Learning for Algorithmic Trading - Second Edition.
    chapter 9, Time-Series Models for Volatility Forecasts and Statistical Arbitrage.

    Examples
    --------
    >>> import yfinance as yf
    >>> prices = yf.download('AAPL', start='2020-01-01', end='2021-01-01', multi_level_index=False)['Adj Close']
    >>> prices = KFSmoother(prices)
    >>> print(prices[:5])
    Date
    2020-01-02 00:00:00+00:00    36.39801407
    2020-01-03 00:00:00+00:00    49.06231000
    2020-01-06 00:00:00+00:00    55.86334436
    2020-01-07 00:00:00+00:00    60.02240894
    2020-01-08 00:00:00+00:00    63.15057948
    dtype: float64

    """
    if not isinstance(prices, (np.ndarray, pd.Series)):
        raise ValueError("Input must be either a numpy array or a pandas Series.")
    kf = PyKalmanFilter(
        transition_matrices=np.eye(1),
        observation_matrices=np.eye(1),
        initial_state_mean=0,
        initial_state_covariance=1,
        observation_covariance=1,
        transition_covariance=0.05,
    )
    if isinstance(prices, pd.Series):
        state_means, _ = kf.filter(prices.values)
        return pd.Series(state_means.flatten(), index=prices.index)
    elif isinstance(prices, np.ndarray):
        state_means, _ = kf.filter(prices)
        return state_means.flatten()


def KFHedgeRatio(x: pd.Series | np.ndarray, y: pd.Series | np.ndarray) -> np.ndarray:
    """
    Estimate Hedge Ratio using Kalman Filter.

    Args:
        x : pd.Series or np.ndarray
            The independent variable, which can be either a pandas Series or a numpy array.
        y : pd.Series or np.ndarray
            The dependent variable, which can be either a pandas Series or a numpy array.

    Returns:
        np.ndarray
            The estimated hedge ratio as a numpy array.

    The function returns the negative of the first state variable of each Kalman Filter estimate,
    which represents the estimated hedge ratio.

    References
    ----------
    Stefan Jansen (2020). Machine Learning for Algorithmic Trading - Second Edition.
    chapter 9, Time-Series Models for Volatility Forecasts and Statistical Arbitrage.
    """
    if not isinstance(x, (np.ndarray, pd.Series)) or not isinstance(
        y, (np.ndarray, pd.Series)
    ):
        raise ValueError(
            "Both x and y must be either a numpy array or a pandas Series."
        )

    delta = 1e-3
    trans_cov = delta / (1 - delta) * np.eye(2)
    obs_mat = np.expand_dims(np.vstack([[x], [np.ones(len(x))]]).T, axis=1)

    kf = PyKalmanFilter(
        n_dim_obs=1,
        n_dim_state=2,
        initial_state_mean=[0, 0],
        initial_state_covariance=np.ones((2, 2)),
        transition_matrices=np.eye(2),
        observation_matrices=obs_mat,
        observation_covariance=2,
        transition_covariance=trans_cov,
    )
    y = y.values if isinstance(y, pd.Series) else y
    state_means, _ = kf.filter(y)
    # Indexing with [:, 0] in state_means[:, 0] extracts only the first state variable of
    # each Kalman Filter estimate, which is the estimated hedge ratio.
    return -state_means[:, 0]
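

# --- Hedged usage sketch (illustrative only, not part of the original module) ---
# Combining KFSmoother and KFHedgeRatio to build a smoothed, dynamically hedged
# spread and its z-score. The 21-day rolling window is an assumption; any pair
# of aligned price Series can be passed in.
def _demo_kalman_spread(x: pd.Series, y: pd.Series, window: int = 21) -> pd.Series:
    x_s = KFSmoother(x)
    y_s = KFSmoother(y)
    hedge = KFHedgeRatio(x_s, y_s)  # negative of the filtered slope
    spread = y_s + hedge * x_s      # y - slope * x, since hedge = -slope
    zscore = (spread - spread.rolling(window).mean()) / spread.rolling(window).std()
    return zscore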