bbstrader 0.1.94__py3-none-any.whl → 0.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of bbstrader might be problematic.
- bbstrader/__ini__.py +9 -9
- bbstrader/btengine/__init__.py +7 -7
- bbstrader/btengine/backtest.py +30 -26
- bbstrader/btengine/data.py +100 -79
- bbstrader/btengine/event.py +2 -1
- bbstrader/btengine/execution.py +18 -16
- bbstrader/btengine/performance.py +11 -7
- bbstrader/btengine/portfolio.py +35 -36
- bbstrader/btengine/strategy.py +119 -94
- bbstrader/config.py +14 -8
- bbstrader/core/__init__.py +0 -0
- bbstrader/core/data.py +22 -0
- bbstrader/core/utils.py +57 -0
- bbstrader/ibkr/__init__.py +0 -0
- bbstrader/ibkr/utils.py +0 -0
- bbstrader/metatrader/__init__.py +5 -5
- bbstrader/metatrader/account.py +117 -121
- bbstrader/metatrader/rates.py +83 -80
- bbstrader/metatrader/risk.py +23 -37
- bbstrader/metatrader/trade.py +169 -140
- bbstrader/metatrader/utils.py +3 -3
- bbstrader/models/__init__.py +5 -5
- bbstrader/models/factors.py +280 -0
- bbstrader/models/ml.py +1092 -0
- bbstrader/models/optimization.py +31 -28
- bbstrader/models/{portfolios.py → portfolio.py} +64 -46
- bbstrader/models/risk.py +15 -9
- bbstrader/trading/__init__.py +2 -2
- bbstrader/trading/execution.py +252 -164
- bbstrader/trading/scripts.py +8 -4
- bbstrader/trading/strategies.py +79 -66
- bbstrader/tseries.py +482 -107
- {bbstrader-0.1.94.dist-info → bbstrader-0.2.1.dist-info}/LICENSE +1 -1
- {bbstrader-0.1.94.dist-info → bbstrader-0.2.1.dist-info}/METADATA +6 -1
- bbstrader-0.2.1.dist-info/RECORD +37 -0
- bbstrader-0.1.94.dist-info/RECORD +0 -32
- {bbstrader-0.1.94.dist-info → bbstrader-0.2.1.dist-info}/WHEEL +0 -0
- {bbstrader-0.1.94.dist-info → bbstrader-0.2.1.dist-info}/top_level.txt +0 -0
bbstrader/tseries.py
CHANGED
@@ -8,33 +8,34 @@ market analysis, and financial data exploration.
 """
 import pprint
 import warnings
+from itertools import combinations
+from typing import List, Tuple, Union
+
+import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
-from tqdm import tqdm
-import yfinance as yf
 import pmdarima as pm
 import seaborn as sns
 import statsmodels.api as sm
-import matplotlib.pyplot as plt
 import statsmodels.tsa.stattools as ts
-
+import yfinance as yf
 from arch import arch_model
-from scipy.optimize import minimize
 from filterpy.kalman import KalmanFilter
+from hurst import compute_Hc
 from pykalman import KalmanFilter as PyKalmanFilter
-from
+from scipy.optimize import minimize
+from sklearn.linear_model import LogisticRegressionCV
+from sklearn.model_selection import GridSearchCV
+from sklearn.tree import DecisionTreeClassifier
 from statsmodels.graphics.tsaplots import plot_acf
-from statsmodels.
+from statsmodels.stats.diagnostic import acorr_ljungbox
 from statsmodels.tsa.arima.model import ARIMA
+from statsmodels.tsa.stattools import adfuller, coint
 from statsmodels.tsa.vector_ar.var_model import VAR
-from
-from
-from sklearn.linear_model import LogisticRegressionCV
-from statsmodels.stats.diagnostic import acorr_ljungbox
-from itertools import combinations
-from typing import Union, List, Tuple
-warnings.filterwarnings("ignore")
+from statsmodels.tsa.vector_ar.vecm import coint_johansen
+from tqdm import tqdm

+warnings.filterwarnings("ignore")


 __all__ = [
@@ -50,7 +51,17 @@ __all__ = [
     "run_kalman_filter",
     "ArimaGarchModel",
     "KalmanFilterModel",
-    "
+    "OrnsteinUhlenbeck",
+    "remove_correlated_assets",
+    "check_stationarity",
+    "remove_stationary_assets",
+    "select_assets",
+    "compute_pair_metrics",
+    "find_cointegrated_pairs",
+    "analyze_cointegrated_pairs",
+    "select_candidate_pairs",
+    "KFSmoother",
+    "KFHedgeRatio",
 ]

 # *******************************************
@@ -123,7 +134,8 @@ def fit_best_arima(window_data: Union[pd.Series, np.ndarray]):
     from arch.utility.exceptions import ConvergenceWarning as ArchWarning
     from statsmodels.tools.sm_exceptions import ConvergenceWarning as StatsWarning
     with warnings.catch_warnings():
-        warnings.filterwarnings(
+        warnings.filterwarnings(
+            "ignore", category=StatsWarning, module='statsmodels')
         warnings.filterwarnings("ignore", category=ArchWarning, module='arch')
         try:
             best_arima_model = ARIMA(
@@ -499,7 +511,8 @@ def get_corr(tickers: Union[List[str], Tuple[str, ...]], start: str, end: str) -
     >>> get_corr(['AAPL', 'MSFT', 'GOOG'], '2023-01-01', '2023-12-31')
     """
     # Download historical data
-    data = yf.download(tickers, start=start, end=end
+    data = yf.download(tickers, start=start, end=end,
+                       multi_level_index=False)['Adj Close']

     # Calculate correlation matrix
     correlation_matrix = data.corr()
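The recurring change in this and the following download hunks is the new `multi_level_index=False` argument: recent yfinance releases return a `(field, ticker)` MultiIndex on columns by default, and the flag restores flat columns so lookups like `data['Adj Close']` keep working. A minimal standalone sketch (assuming a yfinance version that has this flag and still defaults `auto_adjust=False`, so the `'Adj Close'` column is present):

    import yfinance as yf

    # Flat columns instead of a (field, ticker) MultiIndex, so
    # data['Adj Close'] is a plain column lookup.
    data = yf.download('AAPL', start='2023-01-01', end='2023-12-31',
                       progress=False, multi_level_index=False)
    print(data['Adj Close'].head())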
@@ -643,8 +656,10 @@ def run_cadf_test(pair: Union[List[str], Tuple[str, ...]], start: str, end: str)
     """
     # Download historical data for required stocks
     p0, p1 = pair[0], pair[1]
-    _p0 = yf.download(p0, start=start, end=end
-
+    _p0 = yf.download(p0, start=start, end=end,
+                      progress=False, multi_level_index=False)
+    _p1 = yf.download(p1, start=start, end=end,
+                      progress=False, multi_level_index=False)
     df = pd.DataFrame(index=_p0.index)
     df[p0] = _p0["Adj Close"]
     df[p1] = _p1["Adj Close"]
@@ -673,7 +688,7 @@ def run_cadf_test(pair: Union[List[str], Tuple[str, ...]], start: str, end: str)
     # Display regression metrics
     print("\nRegression Metrics:")
     print(f"Optimal Hedge Ratio (Beta): {beta_hr}")
-    print(
+    print('Result Parmas: \n')
     print(results.params)
     print("\nRegression Summary:")
     print(results.summary())
@@ -726,7 +741,8 @@ def run_hurst_test(symbol: str, start: str, end: str):

     >>> run_hurst_test('AAPL', '2023-01-01', '2023-12-31')
     """
-    data = yf.download(symbol, start=start, end=end
+    data = yf.download(symbol, start=start, end=end,
+                       progress=False, multi_level_index=False)

     # Create a Geometric Brownian Motion, Mean-Reverting, and Trending Series
     gbm = np.log(np.cumsum(np.random.randn(100000))+1000)
@@ -743,7 +759,7 @@ def run_hurst_test(symbol: str, start: str, end: str):
 def test_cointegration(ticker1, ticker2, start, end):
     # Download historical data
     stock_data_pair = yf.download(
-        [ticker1, ticker2], start=start, end=end
+        [ticker1, ticker2], start=start, end=end, progress=False, multi_level_index=False
     )['Adj Close'].dropna()

     # Perform Johansen cointegration test
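For orientation, the Johansen test that `test_cointegration` goes on to run comes from `statsmodels`; a minimal sketch of the call on a two-column price frame like the one downloaded above, and the attributes it exposes:

    from statsmodels.tsa.vector_ar.vecm import coint_johansen

    # det_order=0: constant term; k_ar_diff: lagged differences in the VECM.
    result = coint_johansen(stock_data_pair, det_order=0, k_ar_diff=1)
    print(result.lr1)   # trace statistics for H0: r = 0 and H0: r <= 1
    print(result.cvt)   # 90/95/99% critical values, one row per hypothesis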
@@ -865,16 +881,16 @@ def run_kalman_filter(
         etfs: Union[List[str], Tuple[str, ...]],
         start: str, end: str) -> None:
     """
-    Applies a Kalman filter to a pair of
+    Applies a Kalman filter to a pair of assets adjusted closing prices within a specified date range
     to estimate the slope and intercept over time.

-    The function downloads historical adjusted closing prices for the specified pair of
+    The function downloads historical adjusted closing prices for the specified pair of assets,
     visualizes their price relationship, calculates the Kalman filter estimates for the slope and
     intercept, and visualizes the changes in these estimates over time.

     Args:
         etfs (Union[List[str] , Tuple[str, ...]]):
-            A list or tuple containing two valid
+            A list or tuple containing two valid assets tickers (e.g., ['SPY', 'QQQ']).
         start (str): The start date for the historical data in 'YYYY-MM-DD' format.
         end (str): The end date for the historical data in 'YYYY-MM-DD' format.

@@ -883,8 +899,10 @@ def run_kalman_filter(

     >>> run_kalman_filter(['SPY', 'QQQ'], '2023-01-01', '2023-12-31')
     """
-    etf_df1 = yf.download(etfs[0], start, end
-
+    etf_df1 = yf.download(etfs[0], start, end,
+                          progress=False, multi_level_index=False)
+    etf_df2 = yf.download(etfs[1], start, end,
+                          progress=False, multi_level_index=False)

     prices = pd.DataFrame(index=etf_df1.index)
     prices[etfs[0]] = etf_df1["Adj Close"]
@@ -919,13 +937,14 @@ class KalmanFilterModel():
         """
         self.tickers = tickers
         assert self.tickers is not None
-
-        self.
-        self.wt = self.delta/(1-self.delta) * np.eye(2)
-        self.vt = kwargs.get("vt", 1e-3)
+
+        self.R = None
         self.theta = np.zeros(2)
         self.P = np.zeros((2, 2))
-        self.
+        self.delta = kwargs.get("delta", 1e-4)
+        self.vt = kwargs.get("vt", 1e-3)
+        self.wt = self.delta/(1-self.delta) * np.eye(2)
+        self.latest_prices = np.array([-1.0, -1.0])
         self.kf = self._init_kalman()

     def _init_kalman(self):
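Note the dependency in the reworked `__init__`: `self.wt` is derived from `self.delta`, so `delta` has to be bound before `wt` is computed. The three parameters in isolation (a sketch, not the class itself):

    import numpy as np

    delta = 1e-4                           # must be assigned first
    vt = 1e-3                              # observation noise variance
    wt = delta / (1 - delta) * np.eye(2)   # state noise for [slope, intercept]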
@@ -945,6 +964,7 @@ class KalmanFilterModel():
         return kf

     Array = np.ndarray
+
     def calc_slope_intercep(self, prices: Array) -> Tuple:
         """
         Calculates and returns the slope and intercept
@@ -958,25 +978,24 @@ class KalmanFilterModel():
         Returns:
             A tuple containing the slope and intercept of the relationship
         """
-        kf =
-        kf.
-        kf.
-        kf.
-
-        intercept = kf.x.copy().flatten()[1]
+        self.kf.H = np.array([[prices[1], 1.0]])
+        self.kf.predict()
+        self.kf.update(prices[1])
+        slope = self.kf.x.copy().flatten()[0]
+        intercept = self.kf.x.copy().flatten()[1]

         return slope, intercept
-
+
     def calculate_etqt(self, prices: Array) -> Tuple:
         """
-        Calculates the forecast error and standard deviation of the predictions
+        Calculates the ``forecast error`` and ``standard deviation`` of the predictions
         using the Kalman Filter.

         Args:
            prices : A numpy array of prices for two financial instruments.

         Returns:
-            A tuple containing the forecast error and standard deviation of the predictions.
+            A tuple containing the ``forecast error`` and ``standard deviation`` of the predictions.
         """

         self.latest_prices[0] = prices[0]
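The rewritten `calc_slope_intercep` drives the shared `filterpy` filter (`self.kf`) rather than a local one: the state is `[slope, intercept]`, and the observation matrix `H` is rebuilt from the latest price on every call, which turns the filter into a recursive linear regression. A self-contained sketch of that pattern with made-up prices:

    import numpy as np
    from filterpy.kalman import KalmanFilter

    # Recursive regression y_t = slope * x_t + intercept with a
    # random-walk state [slope, intercept].
    kf = KalmanFilter(dim_x=2, dim_z=1)
    kf.F = np.eye(2)                       # state carries over unchanged
    kf.Q = 1e-4 / (1 - 1e-4) * np.eye(2)   # slow drift in slope/intercept
    kf.R = np.array([[1e-3]])              # observation noise
    for x_t, y_t in [(100.0, 201.0), (101.0, 203.5), (102.0, 205.9)]:
        kf.H = np.array([[x_t, 1.0]])      # map state to the observed price
        kf.predict()
        kf.update(y_t)
    slope, intercept = kf.x.flatten()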
@@ -1021,7 +1040,7 @@ class KalmanFilterModel():
             At = self.R.dot(F.T) / Qt
             self.theta = self.theta + At.flatten() * et
             self.C = self.R - At * F.dot(self.R)
-            return (et, sqrt_Qt)
+            return (et[0], sqrt_Qt.flatten()[0])
         else:
             return None

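The new return line unwraps one-element arrays into scalars: with a one-dimensional observation, `et` has shape `(1,)` and `sqrt_Qt` shape `(1, 1)`, so `et[0]` and `sqrt_Qt.flatten()[0]` hand callers plain floats instead of arrays. The shapes involved (values illustrative):

    import numpy as np

    et = np.array([0.42])        # forecast error, shape (1,)
    sqrt_Qt = np.array([[1.3]])  # predictive std deviation, shape (1, 1)
    error, std = et[0], sqrt_Qt.flatten()[0]   # both plain floats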
@@ -1165,7 +1184,7 @@ class OrnsteinUhlenbeck():
         Returns:
             np.ndarray: 2D array representing simulated processes.
         """
-        if returns is
+        if returns is None:
            returns = self.returns
        if p is not None:
            T = p
@@ -1190,6 +1209,33 @@ class OrnsteinUhlenbeck():


 def remove_correlated_assets(df: pd.DataFrame, cutoff=.99):
+    """
+    Removes highly correlated assets from a DataFrame based on a specified correlation cutoff threshold.
+    This is useful in financial data analysis to reduce redundancy and multicollinearity in portfolios or datasets.
+
+    Args:
+        df (pd.DataFrame): A DataFrame where each column represents an asset
+            and rows represent observations (e.g., time-series data).
+        cutoff (float, optional, default=0.99): The correlation threshold.
+            Columns with absolute correlation greater than this value will be considered for removal.
+
+    Returns:
+        pd.DataFrame: A DataFrame with less correlated assets.
+            The columns that are highly correlated (above the cutoff) are removed.
+
+    References
+    ----------
+    Stefan Jansen (2020). Machine Learning for Algorithmic Trading - Second Edition.
+    chapter 9, Time-Series Models for Volatility Forecasts and Statistical Arbitrage.
+
+    Example:
+        >>> df = pd.DataFrame({
+        ...     'AAPL': [100, 101, 102, 103, 104],
+        ...     'MSFT': [200, 201, 202, 203, 204],
+        ...     'GOOG': [300, 301, 302, 303, 304]
+        ... })
+        >>> df = remove_correlated_assets(df)
+    """
     corr = df.corr().stack()
     corr = corr[corr < 1]
     to_check = corr[corr.abs() > cutoff].index
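The visible body of `remove_correlated_assets` stops at computing the over-threshold pairs (`to_check`); the rest of the function falls outside this hunk. For orientation only, a plausible completion of the same idea (a sketch, not the package's actual body): drop one member of each offending pair, here the one more correlated with the rest of the universe.

    import pandas as pd

    def drop_one_of_each_pair(df: pd.DataFrame, cutoff: float = 0.99) -> pd.DataFrame:
        corr = df.corr()
        pairs = corr.stack()
        pairs = pairs[pairs < 1]                     # drop self-correlations
        to_check = pairs[pairs.abs() > cutoff].index
        drop = set()
        for s1, s2 in to_check:
            if s1 in drop or s2 in drop:
                continue
            # keep the asset less entangled with everything else
            drop.add(s1 if corr[s1].abs().mean() > corr[s2].abs().mean() else s2)
        return df.drop(columns=list(drop))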
@@ -1208,6 +1254,32 @@ def remove_correlated_assets(df: pd.DataFrame, cutoff=.99):


 def check_stationarity(df: pd.DataFrame):
+    """
+    Tests the stationarity of time-series data for each asset in the DataFrame
+    using the Augmented Dickey-Fuller (ADF) test. Stationarity is a key property
+    in time-series analysis, and non-stationary data can affect model performance.
+
+    Args:
+        df (pd.DataFrame): A DataFrame where each column represents a time series of an asset.
+
+    Returns:
+        pd.DataFrame: A DataFrame containing the ADF p-values for each asset,
+            - ticker Asset name (column name from df).
+            - adf p-value from the ADF test, indicating the probability of the null hypothesis (data is non-stationary).
+
+    References
+    ----------
+    Stefan Jansen (2020). Machine Learning for Algorithmic Trading - Second Edition.
+    chapter 9, Time-Series Models for Volatility Forecasts and Statistical Arbitrage.
+
+    Example:
+        >>> df = pd.DataFrame({
+        ...     'AAPL': [100, 101, 102, 103, 104],
+        ...     'MSFT': [200, 201, 202, 203, 204],
+        ...     'GOOG': [300, 301, 302, 303, 304]
+        ... })
+        >>> df = check_stationarity(df)
+    """
     results = []
     for ticker, prices in df.items():
         results.append([ticker, adfuller(prices, regression='ct')[1]])
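`check_stationarity` leans entirely on `statsmodels`' `adfuller`: element `[1]` of its return tuple is the p-value, and `regression='ct'` includes a constant and a linear trend in the test regression. A quick standalone illustration on a simulated unit-root series:

    import numpy as np
    from statsmodels.tsa.stattools import adfuller

    rng = np.random.default_rng(0)
    random_walk = np.cumsum(rng.normal(size=500))   # non-stationary by construction
    pvalue = adfuller(random_walk, regression='ct')[1]
    print(f'ADF p-value: {pvalue:.3f}')  # large => cannot reject the unit root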
@@ -1215,37 +1287,133 @@ def check_stationarity(df: pd.DataFrame):


 def remove_stationary_assets(df: pd.DataFrame, pval=.05):
+    """
+    Filters out stationary assets from the DataFrame based on the p-value obtained
+    from the Augmented Dickey-Fuller test.
+    Useful for focusing only on non-stationary time-series data.
+
+    Args:
+        df (pd.DataFrame): A DataFrame where each column represents a time series of an asset.
+        pval (float, optional, default=0.05): The significance level to determine stationarity.
+            Columns with an ADF test p-value below this threshold are considered stationary and removed.
+
+    Returns:
+        pd.DataFrame: A DataFrame containing only the non-stationary assets.
+
+    References
+    ----------
+    Stefan Jansen (2020). Machine Learning for Algorithmic Trading - Second Edition.
+    chapter 9, Time-Series Models for Volatility Forecasts and Statistical Arbitrage.
+
+    Example:
+        >>> df = pd.DataFrame({
+        ...     'AAPL': [100, 101, 102, 103, 104],
+        ...     'MSFT': [200, 201, 202, 203, 204],
+        ...     'GOOG': [300, 301, 302, 303, 304]
+        ... })
+        >>> df = remove_stationary_assets(df)
+    """
     test_result = check_stationarity(df)
     stationary = test_result.loc[test_result.adf <= pval, 'ticker'].tolist()
     return df.drop(stationary, axis=1).sort_index()


-def select_assets(df: pd.DataFrame, n=100, start=None, end=None):
+def select_assets(df: pd.DataFrame, n=100, start=None, end=None, rolling_window=None):
+    """
+    Selects the top N assets based on the average trading volume from the input DataFrame.
+    These assets are used as universe in which we can search cointegrated pairs for pairs trading strategies.
+
+    Args:
+        df (pd.DataFrame): A multi-index DataFrame with levels ['ticker', 'date'] containing market data.
+            Must include columns 'close' (price) and 'volume'.
+        n (int, optional): Number of assets to select based on highest average trading volume. Default is 100.
+        start (str, optional): Start date for filtering the data. Default is the earliest date in the DataFrame.
+        end (str, optional): End date for filtering the data. Default is the latest date in the DataFrame.
+        rolling_window (int, optional): Rolling window for calculating the average trading volume. Default is None.
+
+    Returns:
+        pd.DataFrame: A DataFrame of selected assets with filtered, cleaned data, indexed by date.
+
+    References
+    ----------
+    Stefan Jansen (2020). Machine Learning for Algorithmic Trading - Second Edition.
+    chapter 9, Time-Series Models for Volatility Forecasts and Statistical Arbitrage.
+    """
+    required_columns = {'close', 'volume'}
+    if not required_columns.issubset(df.columns):
+        raise ValueError(
+            f"Input DataFrame must contain {required_columns}, but got {df.columns.tolist()}.")
+
+    if not isinstance(df.index, pd.MultiIndex) or 'ticker' not in df.index.names or 'date' not in df.index.names:
+        raise ValueError(
+            "Index must be a MultiIndex with levels ['ticker', 'date'].")
+
+    df = df.copy()
     idx = pd.IndexSlice
     start = start or df.index.get_level_values('date').min()
     end = end or df.index.get_level_values('date').max()
     df = (df
-
-
-
-
-
-
-
-
-
-
-
-
+          .loc[lambda df: ~df.index.duplicated()]
+          .sort_index()
+          .loc[idx[:, f'{start}':f'{end}'], :]
+          .assign(dv=lambda df: df.close.mul(df.volume)))
+
+    if rolling_window is None:
+        most_traded = (df.groupby(level='ticker')
+                       .dv.mean()
+                       .nlargest(n=n).index)
+    else:
+        # Calculate the rolling average of dollar volume
+        df['dv_rolling_avg'] = (
+            df.groupby(level=0)
+            .dv
+            .rolling(window=rolling_window, min_periods=1)
+            .mean()
+            .reset_index(level=0, drop=True)
+        )
+        most_traded = (
+            df.groupby(level=0)['dv_rolling_avg']
+            .mean()
+            .nlargest(n=n)
+            .index
+        )
     df = (df.loc[idx[most_traded, :], 'close']
-
-
-
-
+          .unstack('ticker')
+          .ffill(limit=5)
+          .dropna(axis=1))
     df = remove_correlated_assets(df)
-
+    df = remove_stationary_assets(df)
+    return df.sort_index()
+

 def compute_pair_metrics(security: pd.Series, candidates: pd.DataFrame):
+    """
+    Calculates statistical and econometric metrics for a target security and a set of candidate securities.
+    These metrics are useful in financial modeling and pairs trading strategies,
+    providing information about drift, volatility, correlation, and cointegration.
+
+    Args:
+        security (pd.Series): A time-series of the target security's prices.
+            The name of the Series should correspond to the security's identifier (e.g., ticker symbol).
+        candidates (pd.DataFrame): A DataFrame where each column represents a time-series of prices
+            for candidate securities to be evaluated against the target security.
+
+    Returns:
+        pd.DataFrame: A DataFrame combining:
+            Drift: Estimated drift of spreads between the target security and each candidate.
+            Volatility: Standard deviation of spreads.
+            Correlation:
+                ``corr``: Correlation of normalized prices between the target and each candidate.
+                ``corr_ret``: Correlation of returns (percentage change) between the target and each candidate.
+            Cointegration metrics:
+                Engle-Granger test statistics (``t1``, ``t2``) and p-values (``p1``, ``p2``).
+                Johansen test trace statistics (``trace0``, ``trace1``) and selected lag order (``k_ar_diff``).
+
+    References
+    ----------
+    Stefan Jansen (2020). Machine Learning for Algorithmic Trading - Second Edition.
+    chapter 9, Time-Series Models for Volatility Forecasts and Statistical Arbitrage.
+    """
     security = security.div(security.iloc[0])
     ticker = security.name
     candidates = candidates.div(candidates.iloc[0])
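The reworked `select_assets` ranks the universe by dollar volume (`close * volume`), either as a plain per-ticker mean or, with the new `rolling_window` argument, as the mean of a per-ticker rolling average. The core ranking step reduced to its essentials (a sketch over the same `['ticker', 'date']` MultiIndex layout the function requires):

    import pandas as pd

    def top_by_dollar_volume(df: pd.DataFrame, n: int = 100) -> pd.Index:
        # df: MultiIndex ['ticker', 'date'] with 'close' and 'volume' columns.
        dv = df['close'] * df['volume']            # dollar volume per row
        return dv.groupby(level='ticker').mean().nlargest(n).index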
@@ -1253,133 +1421,315 @@ def compute_pair_metrics(security: pd.Series, candidates: pd.DataFrame):
     n, m = spreads.shape
     X = np.ones(shape=(n, 2))
     X[:, 1] = np.arange(1, n + 1)
-
+
     # compute drift
     drift = ((np.linalg.inv(X.T @ X) @ X.T @ spreads).iloc[1]
              .to_frame('drift'))
-
+
     # compute volatility
     vol = spreads.std().to_frame('vol')
-
-    #
+
+    # returns correlation
     corr_ret = (candidates.pct_change()
                 .corrwith(security.pct_change())
                 .to_frame('corr_ret'))
-
+
     # normalized price series correlation
     corr = candidates.corrwith(security).to_frame('corr')
     metrics = drift.join(vol).join(corr).join(corr_ret).assign(n=n)
-
+
     tests = []
     # run cointegration tests
     for candidate, prices in tqdm(candidates.items()):
         df = pd.DataFrame({'s1': security, 's2': prices})
         var = VAR(df.values)
-        lags = var.select_order()
+        lags = var.select_order()  # select VAR order
         k_ar_diff = lags.selected_orders['aic']
         # Johansen Test with constant Term and estd. lag order
         cj0 = coint_johansen(df, det_order=0, k_ar_diff=k_ar_diff)
         # Engle-Granger Tests
         t1, p1 = coint(security, prices, trend='c')[:2]
         t2, p2 = coint(prices, security, trend='c')[:2]
-        tests.append([ticker, candidate, t1, p1, t2, p2,
+        tests.append([ticker, candidate, t1, p1, t2, p2,
                       k_ar_diff, *cj0.lr1])
-    columns = ['s1', 's2', 't1', 'p1', 't2',
+    columns = ['s1', 's2', 't1', 'p1', 't2',
+               'p2', 'k_ar_diff', 'trace0', 'trace1']
     tests = pd.DataFrame(tests, columns=columns).set_index('s2')
     return metrics.join(tests)

-
+
+__CRITICAL_VALUES = {
     0: {.9: 13.4294, .95: 15.4943, .99: 19.9349},
     1: {.9: 2.7055, .95: 3.8415, .99: 6.6349}
 }

-
-
-
-
+
+def find_cointegrated_pairs(securities: pd.DataFrame, candidates: pd.DataFrame,
+                            n=None, start=None, stop=None, coint=False):
+    """
+    Identifies cointegrated pairs between a target set of securities and candidate securities
+    based on econometric tests. The function evaluates statistical relationships,
+    such as cointegration and Engle-Granger significance, to determine pairs suitable
+    for financial strategies like pairs trading.
+
+    Args:
+        securities (`pd.DataFrame`): A DataFrame where each column represents the time-series
+            prices of target securities to evaluate.
+        candidates (`pd.DataFrame`): A DataFrame where each column represents the time-series
+            prices of candidate securities to compare against the target securities.
+        n (`int`, optional): The number of top pairs to return. If `None`, returns all pairs.
+        start (`str`, optional): Start date for slicing the data (e.g., 'YYYY-MM-DD').
+        stop (`str`, optional): End date for slicing the data (e.g., 'YYYY-MM-DD').
+        coint (`bool`, optional, default=False):
+            - If `True`, filters for pairs identified as cointegrated.
+            - If `False`, returns all evaluated pairs.
+
+    Returns:
+        - ``pd.DataFrame``: A DataFrame containing:
+            - Johansen and Engle-Granger cointegration metrics:
+                - `t1`, `t2`: Engle-Granger test statistics for two directions.
+                - `p1`, `p2`: Engle-Granger p-values for two directions.
+                - `trace0`, `trace1`: Johansen test trace statistics for 0 and 1 cointegration relationships.
+            - Indicators and filters:
+                - `joh_sig`: Indicates Johansen cointegration significance.
+                - `eg_sig`: Indicates Engle-Granger significance (p-value < 0.05).
+                - `s1_dep`: Indicates whether the first series depends on the second (based on p-values).
+                - `coint`: Combined cointegration indicator (Johansen & Engle-Granger).
+            - Spread and ranking:
+                - `t`: Minimum of `t1` and `t2`.
+                - `p`: Minimum of `p1` and `p2`.
+    References
+    ----------
+    Stefan Jansen (2020). Machine Learning for Algorithmic Trading - Second Edition.
+    chapter 9, Time-Series Models for Volatility Forecasts and Statistical Arbitrage.
+
+    Example:
+        >>> import pandas as pd
+
+        >>> # Sample Data
+        >>> data_securities = {
+        ...     'Security1': [100, 102, 101, 103, 105],
+        ...     'Security2': [50, 52, 53, 51, 54]
+        ... }
+        >>> data_candidates = {
+        ...     'Candidate1': [100, 101, 99, 102, 104],
+        ...     'Candidate2': [200, 202, 201, 203, 205]
+        ... }
+
+        >>> securities = pd.DataFrame(data_securities, index=pd.date_range('2023-01-01', periods=5))
+        >>> candidates = pd.DataFrame(data_candidates, index=pd.date_range('2023-01-01', periods=5))
+
+        >>> # Find cointegrated pairs
+        >>> top_pairs = find_cointegrated_pairs(securities, candidates, n=2, coint=True)
+        >>> print(top_pairs)
+
+        >>> | s1       | s2        | t    | p     | joh_sig | eg_sig | coint |
+        >>> |----------|-----------|------|-------|---------|--------|-------|
+        >>> | Security1| Candidate1| -3.5 | 0.01  | 1       | 1      | 1     |
+        >>> | Security2| Candidate2| -2.9 | 0.04  | 1       | 1      | 1     |
+    """
+    trace0_cv = __CRITICAL_VALUES[0][.95]  # critical value for 0 cointegration relationships
+    # critical value for 1 cointegration relationship
+    trace1_cv = __CRITICAL_VALUES[1][.95]
     spreads = []
     if start is not None and stop is not None:
         securities = securities.loc[str(start): str(stop), :]
         candidates = candidates.loc[str(start): str(stop), :]
     for i, (ticker, prices) in enumerate(securities.items(), 1):
-
-
+        try:
+            df = compute_pair_metrics(prices, candidates)
+            spreads.append(df.set_index('s1', append=True))
+        except np.linalg.LinAlgError:
+            continue
     spreads = pd.concat(spreads)
     spreads.index.names = ['s2', 's1']
     spreads = spreads.swaplevel()
     spreads['t'] = spreads[['t1', 't2']].min(axis=1)
     spreads['p'] = spreads[['p1', 'p2']].min(axis=1)
     spreads['joh_sig'] = ((spreads.trace0 > trace0_cv) &
-
+                          (spreads.trace1 > trace1_cv)).astype(int)
     spreads['eg_sig'] = (spreads.p < .05).astype(int)
     spreads['s1_dep'] = spreads.p1 < spreads.p2
     spreads['coint'] = (spreads.joh_sig & spreads.eg_sig).astype(int)
     # select top n pairs
-    if
-
-
-
+    if coint:
+        if n is not None:
+            top_pairs = (spreads.query('coint == 1')
+                         .sort_values('t', ascending=False)
+                         .head(n))
+        else:
+            top_pairs = (spreads.query('coint == 1')
+                         .sort_values('t', ascending=False))
     else:
-
+        if n is not None:
+            top_pairs = (spreads
+                         .sort_values('t', ascending=False)
+                         .head(n))
+        else:
+            top_pairs = (spreads
+                         .sort_values('t', ascending=False))
     return top_pairs

-
+
+def analyze_cointegrated_pairs(spreads: pd.DataFrame, plot_coint=True, crosstab=False,
                                heuristics=False, log_reg=False, decis_tree=False):
+    """
+    Analyzes cointegrated pairs by visualizing, summarizing, and applying predictive models.
+
+    Args:
+        spreads (pd.DataFrame):
+            A DataFrame containing cointegration metrics and characteristics.
+            Required columns: 'coint', 't', 'trace0', 'trace1', 'drift', 'vol', 'corr', 'corr_ret', 'eg_sig', 'joh_sig'.
+        plot_coint (bool, optional):
+            If True, generates scatterplots and boxplots to visualize cointegration characteristics.
+        cosstab (bool, optional):
+            If True, displays crosstabulations of Engle-Granger and Johansen test significance.
+        heuristics (bool, optional):
+            If True, prints descriptive statistics for drift, volatility, and correlation grouped by cointegration status.
+        log_reg (bool, optional):
+            If True, fits a logistic regression model to predict cointegration and evaluates its performance.
+        decis_tree (bool, optional):
+            If True, fits a decision tree model to predict cointegration and evaluates its performance.
+
+    References
+    ----------
+    Stefan Jansen (2020). Machine Learning for Algorithmic Trading - Second Edition.
+    chapter 9, Time-Series Models for Volatility Forecasts and Statistical Arbitrage.
+
+    Example:
+        >>> import pandas as pd
+        >>> from bbstrader.tseries import find_cointegrated_pairs, analyze_cointegrated_pairs
+
+        >>> # Sample Data
+        >>> securities = pd.DataFrame({
+        ...     'SPY': [100, 102, 101, 103, 105],
+        ...     'QQQ': [50, 52, 53, 51, 54]
+        ... })
+        >>> candidates = pd.DataFrame({
+        ...     'AAPL': [100, 101, 99, 102, 104],
+        ...     'MSFT': [200, 202, 201, 203, 205]
+        ... })
+
+        >>> pairs = find_cointegrated_pairs(securities, candidates, n=2, coint=True)
+        >>> analyze_cointegrated_pairs(pairs, plot_coint=True, cosstab=True, heuristics=True, log_reg=True, decis_tree=True
+    """
     if plot_coint:
-        trace0_cv =
+        trace0_cv = __CRITICAL_VALUES[0][.95]
         spreads = spreads.reset_index()
-        sns.scatterplot(x=np.log1p(spreads.t.abs()),
-                        y=np.log1p(spreads.trace1),
-                        hue='coint', data=spreads[spreads.trace0>trace0_cv])
+        sns.scatterplot(x=np.log1p(spreads.t.abs()),
+                        y=np.log1p(spreads.trace1),
+                        hue='coint', data=spreads[spreads.trace0 > trace0_cv])
         fig, axes = plt.subplots(ncols=4, figsize=(20, 5))
         for i, heuristic in enumerate(['drift', 'vol', 'corr', 'corr_ret']):
             sns.boxplot(x='coint', y=heuristic, data=spreads, ax=axes[i])
-        fig.tight_layout()
+        fig.tight_layout()
+
     if heuristics:
         spreads = spreads.reset_index()
         h = spreads.groupby(spreads.coint)[
             ['drift', 'vol', 'corr']].describe().stack(level=0).swaplevel().sort_index()
         print(h)
+
     if log_reg:
         y = spreads.coint
         X = spreads[['drift', 'vol', 'corr', 'corr_ret']]
-        log_reg = LogisticRegressionCV(Cs=np.logspace(-10, 10, 21),
-
-
+        log_reg = LogisticRegressionCV(Cs=np.logspace(-10, 10, 21),
+                                       class_weight='balanced',
+                                       scoring='roc_auc')
         log_reg.fit(X=X, y=y)
         Cs = log_reg.Cs_
         scores = pd.DataFrame(log_reg.scores_[True], columns=Cs).mean()
-        scores.plot(logx=True)
+        scores.plot(logx=True)
         res = f'C:{np.log10(scores.idxmax()):.2f}, AUC: {scores.max():.2%}'
         print(res)
         print(log_reg.coef_)
+
     if decis_tree:
         model = DecisionTreeClassifier(class_weight='balanced')
         decision_tree = GridSearchCV(model,
-
-
-
+                                     param_grid={
+                                         'max_depth': list(range(1, 10))},
+                                     cv=5,
+                                     scoring='roc_auc')
         y = spreads.coint
         X = spreads[['drift', 'vol', 'corr', 'corr_ret']]
         decision_tree.fit(X, y)
         res = f'{decision_tree.best_score_:.2%}, Depth: {decision_tree.best_params_["max_depth"]}'
         print(res)
-
+
+    if crosstab:
         pd.set_option('display.float_format', lambda x: f'{x:.2%}')
         print(pd.crosstab(spreads.eg_sig, spreads.joh_sig))
         print(pd.crosstab(spreads.eg_sig, spreads.joh_sig, normalize=True))


-def select_candidate_pairs(pairs: pd.DataFrame):
+def select_candidate_pairs(pairs: pd.DataFrame, period=False):
+    """
+    Select candidate pairs from a DataFrame based on cointegration status.
+
+    This function filters the input DataFrame to select pairs where the 'coint' column equals 1,
+    indicating cointegration. It then determines the dependent and independent series for each pair
+    and returns the selected pairs in a dictionary format.
+
+    Args:
+        pairs (pd.DataFrame): A DataFrame containing pairs of time series with columns 'coint', 's1', 's2', and 's1_dep'.
+        period (bool, optional): If True, includes the 'period' column in the output. Defaults to False.
+
+    Returns:
+        list[dict]: A list of dictionaries, each containing the keys 'x' and 'y' (and optionally 'period') representing the selected pairs.
+
+    References
+    ----------
+    Stefan Jansen (2020). Machine Learning for Algorithmic Trading - Second Edition.
+    chapter 9, Time-Series Models for Volatility Forecasts and Statistical Arbitrage.
+    """
     candidates = pairs.query('coint == 1').copy()
-    candidates
-    candidates['
-
+    candidates = candidates.reset_index()
+    candidates['y'] = candidates.apply(
+        lambda x: x['s1'] if x.s1_dep else x['s2'], axis=1)
+    candidates['x'] = candidates.apply(
+        lambda x: x['s2'] if x.s1_dep else x['s1'], axis=1)
+    if period:
+        return candidates[['x', 'y', 'period']].to_dict(orient='records')
     return candidates[['x', 'y']].to_dict(orient='records')


-def KFSmoother(
-    """
+def KFSmoother(prices: pd.Series | np.ndarray) -> pd.Series | np.ndarray:
+    """
+    Estimate rolling mean using Kalman Smoothing.
+
+    Args:
+        prices : pd.Series or np.ndarray
+            The input time series data to be smoothed. It must be either a pandas Series or a numpy array.
+
+    Returns:
+        pd.Series or np.ndarray
+            The smoothed time series data. If the input is a pandas Series, the output will also be a pandas Series with the same index.
+            If the input is a numpy array, the output will be a numpy array.
+
+    References
+    ----------
+    Stefan Jansen (2020). Machine Learning for Algorithmic Trading - Second Edition.
+    chapter 9, Time-Series Models for Volatility Forecasts and Statistical Arbitrage.
+
+    Examples
+    --------
+    >>> import yfinance as yf
+    >>> prices = yf.download('AAPL', start='2020-01-01', end='2021-01-01', multi_level_index=False)['Adj Close']
+    >>> prices = KFSmoother(prices)
+    >>> print(prices[:5])
+    Date
+    2020-01-02 00:00:00+00:00    36.39801407
+    2020-01-03 00:00:00+00:00    49.06231000
+    2020-01-06 00:00:00+00:00    55.86334436
+    2020-01-07 00:00:00+00:00    60.02240894
+    2020-01-08 00:00:00+00:00    63.15057948
+    dtype: float64
+
+    """
+    if not isinstance(prices, (np.ndarray, pd.Series)):
+        raise ValueError(
+            "Input must be either a numpy array or a pandas Series.")
     kf = PyKalmanFilter(
         transition_matrices=np.eye(1),
         observation_matrices=np.eye(1),
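`__CRITICAL_VALUES` holds Johansen trace-statistic critical values keyed by hypothesis (0 for `r = 0`, 1 for `r <= 1`) and confidence level; `find_cointegrated_pairs` sets `joh_sig` when both trace statistics clear their 95% values. The decision rule in isolation, with made-up statistics:

    CRITICAL_VALUES = {
        0: {.9: 13.4294, .95: 15.4943, .99: 19.9349},  # H0: r = 0
        1: {.9: 2.7055, .95: 3.8415, .99: 6.6349},     # H0: r <= 1
    }
    trace0, trace1 = 21.7, 4.2   # hypothetical Johansen trace statistics
    joh_sig = int(trace0 > CRITICAL_VALUES[0][.95] and
                  trace1 > CRITICAL_VALUES[1][.95])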
@@ -1396,8 +1746,32 @@ def KFSmoother(self, prices: pd.Series | np.ndarray) -> pd.Series | np.ndarray:
     return state_means.flatten()


-def KFHedgeRatio(
-    """
+def KFHedgeRatio(x: pd.Series | np.ndarray, y: pd.Series | np.ndarray) -> np.ndarray:
+    """
+    Estimate Hedge Ratio using Kalman Filter.
+    Args:
+        x : pd.Series or np.ndarray
+            The independent variable, which can be either a pandas Series or a numpy array.
+        y : pd.Series or np.ndarray
+            The dependent variable, which can be either a pandas Series or a numpy array.
+
+    Returns:
+        np.ndarray
+            The estimated hedge ratio as a numpy array.
+
+    The function returns the negative of the first state variable of each Kalman Filter estimate,
+    which represents the estimated hedge ratio.
+
+    References
+    ----------
+    Stefan Jansen (2020). Machine Learning for Algorithmic Trading - Second Edition.
+    chapter 9, Time-Series Models for Volatility Forecasts and Statistical Arbitrage.
+    """
+    if (not isinstance(x, (np.ndarray, pd.Series))
+            or not isinstance(y, (np.ndarray, pd.Series))):
+        raise ValueError(
+            "Both x and y must be either a numpy array or a pandas Series.")
+
     delta = 1e-3
     trans_cov = delta / (1 - delta) * np.eye(2)
     obs_mat = np.expand_dims(np.vstack([[x], [np.ones(len(x))]]).T, axis=1)
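`KFHedgeRatio` feeds `pykalman` a time-varying observation matrix: stacking `[x_t, 1]` into a `(T, 1, 2)` array (the `obs_mat` line above) makes the filter regress `y_t` on `x_t` with a drifting `[slope, intercept]` state. A runnable sketch of the same construction on synthetic data:

    import numpy as np
    from pykalman import KalmanFilter

    x = np.cumsum(np.random.normal(size=300)) + 50
    y = 1.5 * x + np.random.normal(scale=0.5, size=300)
    delta = 1e-3
    obs_mat = np.expand_dims(np.vstack([[x], [np.ones(len(x))]]).T, axis=1)
    kf = KalmanFilter(n_dim_obs=1, n_dim_state=2,
                      initial_state_mean=np.zeros(2),
                      initial_state_covariance=np.ones((2, 2)),
                      transition_matrices=np.eye(2),
                      observation_matrices=obs_mat,
                      observation_covariance=2,
                      transition_covariance=delta / (1 - delta) * np.eye(2))
    state_means, _ = kf.filter(y)
    hedge_ratio = -state_means[:, 0]   # negated slope, as the function returns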
@@ -1411,7 +1785,8 @@ def KFHedgeRatio(self, x: pd.Series, y: pd.Series) -> np.ndarray:
         observation_covariance=2,
         transition_covariance=trans_cov
     )
-
-
+    y = y.values if isinstance(y, pd.Series) else y
+    state_means, _ = kf.filter(y)
+    # Indexing with [:, 0] in state_means[:, 0] extracts only the first state variable of
     # each Kalman Filter estimate, which is the estimated hedge ratio.
     return -state_means[:, 0]
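Since `KFHedgeRatio` returns the negated slope, a typical downstream use (variable names hypothetical) builds the dynamic spread by addition:

    import numpy as np
    from bbstrader.tseries import KFHedgeRatio

    x = np.cumsum(np.random.normal(size=300)) + 50   # synthetic prices
    y = 1.5 * x + np.random.normal(scale=0.5, size=300)
    hedge = KFHedgeRatio(x, y)   # -slope_t at each step
    spread = y + hedge * x       # i.e. y_t - slope_t * x_t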