bbstrader-0.1.93-py3-none-any.whl → bbstrader-0.2.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of bbstrader might be problematic.
- bbstrader/__init__.py +2 -2
- bbstrader/btengine/data.py +241 -40
- bbstrader/btengine/strategy.py +12 -8
- bbstrader/config.py +4 -0
- bbstrader/core/__init__.py +0 -0
- bbstrader/core/data.py +23 -0
- bbstrader/core/utils.py +0 -0
- bbstrader/ibkr/__init__.py +0 -0
- bbstrader/metatrader/account.py +66 -12
- bbstrader/metatrader/rates.py +24 -20
- bbstrader/metatrader/risk.py +6 -3
- bbstrader/metatrader/trade.py +31 -13
- bbstrader/models/__init__.py +1 -1
- bbstrader/models/factors.py +275 -0
- bbstrader/models/ml.py +1026 -0
- bbstrader/models/optimization.py +17 -16
- bbstrader/models/{portfolios.py → portfolio.py} +20 -11
- bbstrader/models/risk.py +10 -2
- bbstrader/trading/execution.py +67 -35
- bbstrader/trading/strategies.py +5 -5
- bbstrader/tseries.py +412 -63
- {bbstrader-0.1.93.dist-info → bbstrader-0.2.0.dist-info}/METADATA +9 -3
- bbstrader-0.2.0.dist-info/RECORD +36 -0
- {bbstrader-0.1.93.dist-info → bbstrader-0.2.0.dist-info}/WHEEL +1 -1
- bbstrader-0.1.93.dist-info/RECORD +0 -32
- {bbstrader-0.1.93.dist-info → bbstrader-0.2.0.dist-info}/LICENSE +0 -0
- {bbstrader-0.1.93.dist-info → bbstrader-0.2.0.dist-info}/top_level.txt +0 -0
bbstrader/tseries.py
CHANGED
@@ -50,7 +50,8 @@ __all__ = [
     "run_kalman_filter",
     "ArimaGarchModel",
     "KalmanFilterModel",
-    "OrnsteinUhlenbeckModel"
+    "OrnsteinUhlenbeckModel",
+
 ]
 
 # *******************************************
@@ -499,7 +500,7 @@ def get_corr(tickers: Union[List[str], Tuple[str, ...]], start: str, end: str) -
     >>> get_corr(['AAPL', 'MSFT', 'GOOG'], '2023-01-01', '2023-12-31')
     """
     # Download historical data
-    data = yf.download(tickers, start=start, end=end)['Adj Close']
+    data = yf.download(tickers, start=start, end=end, multi_level_index=False)['Adj Close']
 
     # Calculate correlation matrix
     correlation_matrix = data.corr()
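Editor's note on the recurring `multi_level_index=False` change in this release: recent yfinance versions return columns as a `(field, ticker)` MultiIndex even for a single ticker, which breaks flat lookups like `['Adj Close']`. A minimal sketch of the equivalent manual flattening (ticker and dates are placeholders; `auto_adjust=False` is assumed so that `'Adj Close'` is present):

```python
import pandas as pd
import yfinance as yf

# What multi_level_index=False avoids: flattening the (field, ticker)
# column MultiIndex by hand. Ticker and dates are illustrative only.
data = yf.download("AAPL", start="2023-01-01", end="2023-12-31",
                   auto_adjust=False)
if isinstance(data.columns, pd.MultiIndex):
    data.columns = data.columns.droplevel(1)  # drop the ticker level
print(data["Adj Close"].head())
```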
@@ -643,8 +644,8 @@ def run_cadf_test(pair: Union[List[str], Tuple[str, ...]], start: str, end: str)
     """
     # Download historical data for required stocks
     p0, p1 = pair[0], pair[1]
-    _p0 = yf.download(p0, start=start, end=end)
-    _p1 = yf.download(p1, start=start, end=end)
+    _p0 = yf.download(p0, start=start, end=end, progress=False, multi_level_index=False)
+    _p1 = yf.download(p1, start=start, end=end, progress=False, multi_level_index=False)
     df = pd.DataFrame(index=_p0.index)
     df[p0] = _p0["Adj Close"]
     df[p1] = _p1["Adj Close"]
@@ -726,7 +727,7 @@ def run_hurst_test(symbol: str, start: str, end: str):
 
     >>> run_hurst_test('AAPL', '2023-01-01', '2023-12-31')
     """
-    data = yf.download(symbol, start=start, end=end)
+    data = yf.download(symbol, start=start, end=end, progress=False, multi_level_index=False)
 
     # Create a Geometric Brownian Motion, Mean-Reverting, and Trending Series
     gbm = np.log(np.cumsum(np.random.randn(100000))+1000)
@@ -743,7 +744,7 @@ def run_hurst_test(symbol: str, start: str, end: str):
 def test_cointegration(ticker1, ticker2, start, end):
     # Download historical data
     stock_data_pair = yf.download(
-        [ticker1, ticker2], start=start, end=end
+        [ticker1, ticker2], start=start, end=end, progress=False, multi_level_index=False
    )['Adj Close'].dropna()
 
     # Perform Johansen cointegration test
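For readers following the pairs-trading logic: the Johansen step referenced in the comment above tests for a cointegrating relationship in the joint price series. A hedged standalone sketch of that step with statsmodels (tickers and dates are illustrative, not from the package):

```python
import yfinance as yf
from statsmodels.tsa.vector_ar.vecm import coint_johansen

# Illustrative Johansen test on a classic pair; trace statistics above
# the 95% critical values reject "no cointegration".
prices = yf.download(["EWA", "EWC"], start="2022-01-01", end="2023-12-31",
                     auto_adjust=False)["Adj Close"].dropna()
result = coint_johansen(prices, det_order=0, k_ar_diff=1)
print(result.lr1)  # trace statistics for r=0 and r<=1
print(result.cvt)  # 90/95/99% critical values
```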
@@ -865,16 +866,16 @@ def run_kalman_filter(
         etfs: Union[List[str], Tuple[str, ...]],
         start: str, end: str) -> None:
     """
-    Applies a Kalman filter to a pair of
+    Applies a Kalman filter to a pair of assets adjusted closing prices within a specified date range
     to estimate the slope and intercept over time.
 
-    The function downloads historical adjusted closing prices for the specified pair of
+    The function downloads historical adjusted closing prices for the specified pair of assets,
     visualizes their price relationship, calculates the Kalman filter estimates for the slope and
     intercept, and visualizes the changes in these estimates over time.
 
     Args:
         etfs (Union[List[str] , Tuple[str, ...]]):
-            A list or tuple containing two valid
+            A list or tuple containing two valid assets tickers (e.g., ['SPY', 'QQQ']).
         start (str): The start date for the historical data in 'YYYY-MM-DD' format.
         end (str): The end date for the historical data in 'YYYY-MM-DD' format.
 
@@ -883,8 +884,8 @@ def run_kalman_filter(
 
     >>> run_kalman_filter(['SPY', 'QQQ'], '2023-01-01', '2023-12-31')
     """
-    etf_df1 = yf.download(etfs[0], start, end)
-    etf_df2 = yf.download(etfs[1], start, end)
+    etf_df1 = yf.download(etfs[0], start, end, progress=False, multi_level_index=False)
+    etf_df2 = yf.download(etfs[1], start, end, progress=False, multi_level_index=False)
 
     prices = pd.DataFrame(index=etf_df1.index)
     prices[etfs[0]] = etf_df1["Adj Close"]
@@ -919,13 +920,14 @@ class KalmanFilterModel():
         """
         self.tickers = tickers
         assert self.tickers is not None
+
+        self.R = None
+        self.theta = np.zeros(2)
+        self.P = np.zeros((2, 2))
+        self.delta = kwargs.get("delta", 1e-4)
+        self.vt = kwargs.get("vt", 1e-3)
+        self.wt = self.delta/(1-self.delta) * np.eye(2)
         self.latest_prices = np.array([-1.0, -1.0])
-        self.delta = kwargs.get("delta", 1e-4)
-        self.wt = self.delta/(1-self.delta) * np.eye(2)
-        self.vt = kwargs.get("vt", 1e-3)
-        self.theta = np.zeros(2)
-        self.P = np.zeros((2, 2))
-        self.R = None
         self.kf = self._init_kalman()
 
     def _init_kalman(self):
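The reorder groups the filter state together: `theta` is the time-varying `[slope, intercept]` state, `wt = delta/(1-delta) * I` its transition covariance, `vt` the observation noise, and `P`/`R` the posterior/prior covariances. A self-contained sketch of the recursion these attributes drive, in the Chan/Jansen dynamic-regression style; attribute names mirror the diff above, but this is not the package's code:

```python
import numpy as np

class DynamicRegressionKF:
    """Sketch of the recursion that delta, vt, wt, theta, P and R
    parameterize (after Chan/Jansen); not the package's implementation."""

    def __init__(self, delta=1e-4, vt=1e-3):
        self.wt = delta / (1 - delta) * np.eye(2)  # state-transition noise
        self.vt = vt                               # observation noise
        self.theta = np.zeros(2)                   # state: [slope, intercept]
        self.C = np.zeros((2, 2))                  # posterior covariance

    def step(self, x: float, y: float):
        F = np.array([[x, 1.0]])              # observe y ~ slope*x + intercept
        R = self.C + self.wt                  # prior covariance of theta
        et = y - float(F @ self.theta)        # forecast error
        Qt = float(F @ R @ F.T) + self.vt     # forecast variance
        At = (R @ F.T) / Qt                   # Kalman gain
        self.theta = self.theta + At.flatten() * et
        self.C = R - At @ F @ R               # posterior covariance
        return et, Qt ** 0.5                  # error and its std deviation
```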
@@ -958,25 +960,24 @@ class KalmanFilterModel():
         Returns:
             A tuple containing the slope and intercept of the relationship
         """
-        kf =
-        kf.
-        kf.
-        kf.
-
-        intercept = kf.x.copy().flatten()[1]
+        self.kf.H = np.array([[prices[1], 1.0]])
+        self.kf.predict()
+        self.kf.update(prices[1])
+        slope = self.kf.x.copy().flatten()[0]
+        intercept = self.kf.x.copy().flatten()[1]
 
         return slope, intercept
 
     def calculate_etqt(self, prices: Array) -> Tuple:
         """
-        Calculates the forecast error and standard deviation of the predictions
+        Calculates the ``forecast error`` and ``standard deviation`` of the predictions
         using the Kalman Filter.
 
         Args:
             prices : A numpy array of prices for two financial instruments.
 
         Returns:
-            A tuple containing the forecast error and standard deviation of the predictions.
+            A tuple containing the ``forecast error`` and ``standard deviation`` of the predictions.
         """
 
         self.latest_prices[0] = prices[0]
@@ -1021,7 +1022,7 @@ class KalmanFilterModel():
             At = self.R.dot(F.T) / Qt
             self.theta = self.theta + At.flatten() * et
             self.C = self.R - At * F.dot(self.R)
-            return (et, sqrt_Qt)
+            return (et[0], sqrt_Qt.flatten()[0])
         else:
             return None
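Returning `(et[0], sqrt_Qt.flatten()[0])` instead of `(et, sqrt_Qt)` hands callers plain floats rather than one-element arrays, so downstream comparisons are unambiguous. A hypothetical consumer using the standard mean-reversion band rule (a sketch, not code from the package):

```python
# Hypothetical consumer of (et, sqrt_Qt): the usual mean-reversion rule.
# et is the spread forecast error, sqrt_Qt its predicted standard deviation.
def spread_signal(et: float, sqrt_Qt: float) -> str:
    if et < -sqrt_Qt:
        return "LONG"    # spread unusually low: buy y, sell the hedge leg
    if et > sqrt_Qt:
        return "SHORT"   # spread unusually high: sell y, buy the hedge leg
    return "EXIT"        # inside the band: no edge
```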
@@ -1190,6 +1191,33 @@ class OrnsteinUhlenbeck():
 
 
 def remove_correlated_assets(df: pd.DataFrame, cutoff=.99):
+    """
+    Removes highly correlated assets from a DataFrame based on a specified correlation cutoff threshold.
+    This is useful in financial data analysis to reduce redundancy and multicollinearity in portfolios or datasets.
+
+    Args:
+        df (pd.DataFrame): A DataFrame where each column represents an asset
+            and rows represent observations (e.g., time-series data).
+        cutoff (float, optional, default=0.99): The correlation threshold.
+            Columns with absolute correlation greater than this value will be considered for removal.
+
+    Returns:
+        pd.DataFrame: A DataFrame with less correlated assets.
+        The columns that are highly correlated (above the cutoff) are removed.
+
+    References
+    ----------
+    Stefan Jansen (2020). Machine Learning for Algorithmic Trading - Second Edition.
+    chapter 9, Time-Series Models for Volatility Forecasts and Statistical Arbitrage.
+
+    Example:
+        >>> df = pd.DataFrame({
+        ...     'AAPL': [100, 101, 102, 103, 104],
+        ...     'MSFT': [200, 201, 202, 203, 204],
+        ...     'GOOG': [300, 301, 302, 303, 304]
+        ... })
+        >>> df = remove_correlated_assets(df)
+    """
     corr = df.corr().stack()
     corr = corr[corr < 1]
     to_check = corr[corr.abs() > cutoff].index
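The hunk shows only the start of the routine. As a reference for the technique itself, a generic cutoff-based removal might look like the sketch below; this is an assumption-labeled reimplementation of the idea, not the package's exact loop:

```python
import pandas as pd

def drop_highly_correlated(df: pd.DataFrame, cutoff: float = 0.99) -> pd.DataFrame:
    """Generic sketch: for each pair above the cutoff, drop the column
    that is on average more correlated with everything else."""
    corr_matrix = df.corr()
    corr = corr_matrix.stack()
    corr = corr[corr < 1]                        # ignore self-correlations
    to_check = corr[corr.abs() > cutoff].index   # offending (col, col) pairs
    drop = set()
    for s1, s2 in to_check:
        if s1 in drop or s2 in drop:
            continue
        mean1 = corr_matrix[s1].abs().mean()
        mean2 = corr_matrix[s2].abs().mean()
        drop.add(s1 if mean1 > mean2 else s2)
    return df.drop(columns=sorted(drop))
```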
@@ -1208,6 +1236,32 @@ def remove_correlated_assets(df: pd.DataFrame, cutoff=.99):
 
 
 def check_stationarity(df: pd.DataFrame):
+    """
+    Tests the stationarity of time-series data for each asset in the DataFrame
+    using the Augmented Dickey-Fuller (ADF) test. Stationarity is a key property
+    in time-series analysis, and non-stationary data can affect model performance.
+
+    Args:
+        df (pd.DataFrame): A DataFrame where each column represents a time series of an asset.
+
+    Returns:
+        pd.DataFrame: A DataFrame containing the ADF p-values for each asset,
+        - ticker Asset name (column name from df).
+        - adf p-value from the ADF test, indicating the probability of the null hypothesis (data is non-stationary).
+
+    References
+    ----------
+    Stefan Jansen (2020). Machine Learning for Algorithmic Trading - Second Edition.
+    chapter 9, Time-Series Models for Volatility Forecasts and Statistical Arbitrage.
+
+    Example:
+        >>> df = pd.DataFrame({
+        ...     'AAPL': [100, 101, 102, 103, 104],
+        ...     'MSFT': [200, 201, 202, 203, 204],
+        ...     'GOOG': [300, 301, 302, 303, 304]
+        ... })
+        >>> df = check_stationarity(df)
+    """
     results = []
     for ticker, prices in df.items():
         results.append([ticker, adfuller(prices, regression='ct')[1]])
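A quick illustration of what each loop iteration computes: `adfuller(prices, regression='ct')` runs the ADF test with a constant and linear trend in the test regression, and index `[1]` picks the p-value (a small p rejects the unit-root null, i.e., the series looks stationary):

```python
import numpy as np
from statsmodels.tsa.stattools import adfuller

# Sketch of the per-column test above on synthetic data.
rng = np.random.default_rng(0)
random_walk = rng.standard_normal(500).cumsum()   # non-stationary
noise = rng.standard_normal(500)                  # stationary
print(adfuller(random_walk, regression='ct')[1])  # large p-value
print(adfuller(noise, regression='ct')[1])        # tiny p-value
```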
@@ -1215,12 +1269,66 @@ def check_stationarity(df: pd.DataFrame):
 
 
 def remove_stationary_assets(df: pd.DataFrame, pval=.05):
+    """
+    Filters out stationary assets from the DataFrame based on the p-value obtained
+    from the Augmented Dickey-Fuller test.
+    Useful for focusing only on non-stationary time-series data.
+
+    Args:
+        df (pd.DataFrame): A DataFrame where each column represents a time series of an asset.
+        pval (float, optional, default=0.05): The significance level to determine stationarity.
+            Columns with an ADF test p-value below this threshold are considered stationary and removed.
+
+    Returns:
+        pd.DataFrame: A DataFrame containing only the non-stationary assets.
+
+    References
+    ----------
+    Stefan Jansen (2020). Machine Learning for Algorithmic Trading - Second Edition.
+    chapter 9, Time-Series Models for Volatility Forecasts and Statistical Arbitrage.
+
+    Example:
+        >>> df = pd.DataFrame({
+        ...     'AAPL': [100, 101, 102, 103, 104],
+        ...     'MSFT': [200, 201, 202, 203, 204],
+        ...     'GOOG': [300, 301, 302, 303, 304]
+        ... })
+        >>> df = remove_stationary_assets(df)
+    """
     test_result = check_stationarity(df)
     stationary = test_result.loc[test_result.adf <= pval, 'ticker'].tolist()
     return df.drop(stationary, axis=1).sort_index()
 
 
-def select_assets(df: pd.DataFrame, n=100, start=None, end=None):
+def select_assets(df: pd.DataFrame, n=100, start=None, end=None, rolling_window=None):
+    """
+    Selects the top N assets based on the average trading volume from the input DataFrame.
+    These assets are used as universe in which we can search cointegrated pairs for pairs trading strategies.
+
+    Args:
+        df (pd.DataFrame): A multi-index DataFrame with levels ['ticker', 'date'] containing market data.
+            Must include columns 'close' (price) and 'volume'.
+        n (int, optional): Number of assets to select based on highest average trading volume. Default is 100.
+        start (str, optional): Start date for filtering the data. Default is the earliest date in the DataFrame.
+        end (str, optional): End date for filtering the data. Default is the latest date in the DataFrame.
+        rolling_window (int, optional): Rolling window for calculating the average trading volume. Default is None.
+
+    Returns:
+        pd.DataFrame: A DataFrame of selected assets with filtered, cleaned data, indexed by date.
+
+    References
+    ----------
+    Stefan Jansen (2020). Machine Learning for Algorithmic Trading - Second Edition.
+    chapter 9, Time-Series Models for Volatility Forecasts and Statistical Arbitrage.
+    """
+    required_columns = {'close', 'volume'}
+    if not required_columns.issubset(df.columns):
+        raise ValueError(f"Input DataFrame must contain {required_columns}, but got {df.columns.tolist()}.")
+
+    if not isinstance(df.index, pd.MultiIndex) or 'ticker' not in df.index.names or 'date' not in df.index.names:
+        raise ValueError("Index must be a MultiIndex with levels ['ticker', 'date'].")
+
+    df = df.copy()
     idx = pd.IndexSlice
     start = start or df.index.get_level_values('date').min()
     end = end or df.index.get_level_values('date').max()
@@ -1229,23 +1337,63 @@ def select_assets(df: pd.DataFrame, n=100, start=None, end=None):
           .sort_index()
           .loc[idx[:, f'{start}':f'{end}'], :]
           .assign(dv=lambda df: df.close.mul(df.volume)))
-
-
-
-
-
-
-
-
+
+    if rolling_window is None:
+        most_traded = (df.groupby(level='ticker')
+                       .dv.mean()
+                       .nlargest(n=n).index)
+    else:
+        # Calculate the rolling average of dollar volume
+        df['dv_rolling_avg'] = (
+            df.groupby(level=0)
+            .dv
+            .rolling(window=rolling_window, min_periods=1)
+            .mean()
+            .reset_index(level=0, drop=True)
+        )
+        most_traded = (
+            df.groupby(level=0)['dv_rolling_avg']
+            .mean()
+            .nlargest(n=n)
+            .index
+        )
     df = (df.loc[idx[most_traded, :], 'close']
           .unstack('ticker')
-          .ffill(limit=5)
-          .dropna(axis=1))
-
+          .ffill(limit=5)
+          .dropna(axis=1))
     df = remove_correlated_assets(df)
-
+    df = remove_stationary_assets(df)
+    return df.sort_index()
+
 
 def compute_pair_metrics(security: pd.Series, candidates: pd.DataFrame):
+    """
+    Calculates statistical and econometric metrics for a target security and a set of candidate securities.
+    These metrics are useful in financial modeling and pairs trading strategies,
+    providing information about drift, volatility, correlation, and cointegration.
+
+    Args:
+        security (pd.Series): A time-series of the target security's prices.
+            The name of the Series should correspond to the security's identifier (e.g., ticker symbol).
+        candidates (pd.DataFrame): A DataFrame where each column represents a time-series of prices
+            for candidate securities to be evaluated against the target security.
+
+    Returns:
+        pd.DataFrame: A DataFrame combining:
+            Drift: Estimated drift of spreads between the target security and each candidate.
+            Volatility: Standard deviation of spreads.
+            Correlation:
+                ``corr``: Correlation of normalized prices between the target and each candidate.
+                ``corr_ret``: Correlation of returns (percentage change) between the target and each candidate.
+            Cointegration metrics:
+                Engle-Granger test statistics (``t1``, ``t2``) and p-values (``p1``, ``p2``).
+                Johansen test trace statistics (``trace0``, ``trace1``) and selected lag order (``k_ar_diff``).
+
+    References
+    ----------
+    Stefan Jansen (2020). Machine Learning for Algorithmic Trading - Second Edition.
+    chapter 9, Time-Series Models for Volatility Forecasts and Statistical Arbitrage.
+    """
     security = security.div(security.iloc[0])
     ticker = security.name
     candidates = candidates.div(candidates.iloc[0])
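A usage sketch for the new `rolling_window` path, assuming `select_assets` is importable as this diff shows; the universe construction is purely illustrative:

```python
import numpy as np
import pandas as pd
from bbstrader.tseries import select_assets  # export assumed per this diff

# Illustrative universe: MultiIndex ['ticker', 'date'] with the 'close'
# and 'volume' columns that select_assets now validates.
rng = np.random.default_rng(1)
dates = pd.date_range("2023-01-02", periods=60, freq="B")
frames = [
    pd.DataFrame({
        "ticker": t,
        "date": dates,
        "close": 100 + rng.standard_normal(len(dates)).cumsum(),
        "volume": rng.integers(1_000, 10_000, len(dates)),
    })
    for t in ["AAA", "BBB", "CCC"]
]
universe = pd.concat(frames).set_index(["ticker", "date"])

# Top-2 tickers by 20-day rolling average dollar volume.
selected = select_assets(universe, n=2, rolling_window=20)
print(selected.columns.tolist())
```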
@@ -1261,7 +1409,7 @@ def compute_pair_metrics(security: pd.Series, candidates: pd.DataFrame):
     # compute volatility
     vol = spreads.std().to_frame('vol')
 
-    #
+    # returns correlation
     corr_ret = (candidates.pct_change()
                 .corrwith(security.pct_change())
                 .to_frame('corr_ret'))
@@ -1288,22 +1436,90 @@ def compute_pair_metrics(security: pd.Series, candidates: pd.DataFrame):
     tests = pd.DataFrame(tests, columns=columns).set_index('s2')
     return metrics.join(tests)
 
-
+
+__CRITICAL_VALUES = {
     0: {.9: 13.4294, .95: 15.4943, .99: 19.9349},
     1: {.9: 2.7055, .95: 3.8415, .99: 6.6349}
 }
 
+
 def find_cointegrated_pairs(securities: pd.DataFrame, candidates: pd.DataFrame,
-                            n=None, start=None, stop=None):
-
-
+                            n=None, start=None, stop=None, coint=False):
+
+    """
+    Identifies cointegrated pairs between a target set of securities and candidate securities
+    based on econometric tests. The function evaluates statistical relationships,
+    such as cointegration and Engle-Granger significance, to determine pairs suitable
+    for financial strategies like pairs trading.
+
+    Args:
+        securities (`pd.DataFrame`): A DataFrame where each column represents the time-series
+            prices of target securities to evaluate.
+        candidates (`pd.DataFrame`): A DataFrame where each column represents the time-series
+            prices of candidate securities to compare against the target securities.
+        n (`int`, optional): The number of top pairs to return. If `None`, returns all pairs.
+        start (`str`, optional): Start date for slicing the data (e.g., 'YYYY-MM-DD').
+        stop (`str`, optional): End date for slicing the data (e.g., 'YYYY-MM-DD').
+        coint (`bool`, optional, default=False):
+            - If `True`, filters for pairs identified as cointegrated.
+            - If `False`, returns all evaluated pairs.
+
+    Returns:
+        - ``pd.DataFrame``: A DataFrame containing:
+            - Johansen and Engle-Granger cointegration metrics:
+                - `t1`, `t2`: Engle-Granger test statistics for two directions.
+                - `p1`, `p2`: Engle-Granger p-values for two directions.
+                - `trace0`, `trace1`: Johansen test trace statistics for 0 and 1 cointegration relationships.
+            - Indicators and filters:
+                - `joh_sig`: Indicates Johansen cointegration significance.
+                - `eg_sig`: Indicates Engle-Granger significance (p-value < 0.05).
+                - `s1_dep`: Indicates whether the first series depends on the second (based on p-values).
+                - `coint`: Combined cointegration indicator (Johansen & Engle-Granger).
+            - Spread and ranking:
+                - `t`: Minimum of `t1` and `t2`.
+                - `p`: Minimum of `p1` and `p2`.
+    References
+    ----------
+    Stefan Jansen (2020). Machine Learning for Algorithmic Trading - Second Edition.
+    chapter 9, Time-Series Models for Volatility Forecasts and Statistical Arbitrage.
+
+    Example:
+        >>> import pandas as pd
+
+        >>> # Sample Data
+        >>> data_securities = {
+        ...     'Security1': [100, 102, 101, 103, 105],
+        ...     'Security2': [50, 52, 53, 51, 54]
+        ... }
+        >>> data_candidates = {
+        ...     'Candidate1': [100, 101, 99, 102, 104],
+        ...     'Candidate2': [200, 202, 201, 203, 205]
+        ... }
+
+        >>> securities = pd.DataFrame(data_securities, index=pd.date_range('2023-01-01', periods=5))
+        >>> candidates = pd.DataFrame(data_candidates, index=pd.date_range('2023-01-01', periods=5))
+
+        >>> # Find cointegrated pairs
+        >>> top_pairs = find_cointegrated_pairs(securities, candidates, n=2, coint=True)
+        >>> print(top_pairs)
+
+        >>> | s1       | s2        | t    | p    | joh_sig | eg_sig | coint |
+        >>> |----------|-----------|------|------|---------|--------|-------|
+        >>> | Security1| Candidate1| -3.5 | 0.01 | 1       | 1      | 1     |
+        >>> | Security2| Candidate2| -2.9 | 0.04 | 1       | 1      | 1     |
+    """
+    trace0_cv = __CRITICAL_VALUES[0][.95]  # critical value for 0 cointegration relationships
+    trace1_cv = __CRITICAL_VALUES[1][.95]  # critical value for 1 cointegration relationship
     spreads = []
     if start is not None and stop is not None:
         securities = securities.loc[str(start): str(stop), :]
         candidates = candidates.loc[str(start): str(stop), :]
     for i, (ticker, prices) in enumerate(securities.items(), 1):
-
-
+        try:
+            df = compute_pair_metrics(prices, candidates)
+            spreads.append(df.set_index('s1', append=True))
+        except np.linalg.LinAlgError:
+            continue
     spreads = pd.concat(spreads)
     spreads.index.names = ['s2', 's1']
     spreads = spreads.swaplevel()
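Context for the `t1`/`t2` and `p1`/`p2` columns: Engle-Granger is run in both directions because the test is not symmetric in which series is treated as dependent, and the summary columns take the minimum of the two. A hedged standalone sketch of that two-direction test (synthetic data, not package code):

```python
import numpy as np
import pandas as pd
from statsmodels.tsa.stattools import coint

# Two cointegrated series sharing a common random-walk component.
rng = np.random.default_rng(42)
common = rng.standard_normal(500).cumsum()
s1 = pd.Series(common + 0.5 * rng.standard_normal(500), name="s1")
s2 = pd.Series(common + 0.5 * rng.standard_normal(500), name="s2")

t1, p1, _ = coint(s1, s2)   # s1 regressed on s2
t2, p2, _ = coint(s2, s1)   # s2 regressed on s1
print(f"t={min(t1, t2):.2f}, p={min(p1, p2):.4f}")
```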
@@ -1315,18 +1531,69 @@ def find_cointegrated_pairs(securities: pd.DataFrame, candidates: pd.DataFrame,
     spreads['s1_dep'] = spreads.p1 < spreads.p2
     spreads['coint'] = (spreads.joh_sig & spreads.eg_sig).astype(int)
     # select top n pairs
-    if
-
-
-
+    if coint:
+        if n is not None:
+            top_pairs = (spreads.query('coint == 1')
+                         .sort_values('t', ascending=False)
+                         .head(n))
+        else:
+            top_pairs = (spreads.query('coint == 1')
+                         .sort_values('t', ascending=False))
     else:
-
+        if n is not None:
+            top_pairs = (spreads
+                         .sort_values('t', ascending=False)
+                         .head(n))
+        else:
+            top_pairs = (spreads
+                         .sort_values('t', ascending=False))
     return top_pairs
 
-
+
+def analyze_cointegrated_pairs(spreads: pd.DataFrame, plot_coint=True, crosstab=False,
                                heuristics=False, log_reg=False, decis_tree=False):
+    """
+    Analyzes cointegrated pairs by visualizing, summarizing, and applying predictive models.
+
+    Args:
+        spreads (pd.DataFrame):
+            A DataFrame containing cointegration metrics and characteristics.
+            Required columns: 'coint', 't', 'trace0', 'trace1', 'drift', 'vol', 'corr', 'corr_ret', 'eg_sig', 'joh_sig'.
+        plot_coint (bool, optional):
+            If True, generates scatterplots and boxplots to visualize cointegration characteristics.
+        cosstab (bool, optional):
+            If True, displays crosstabulations of Engle-Granger and Johansen test significance.
+        heuristics (bool, optional):
+            If True, prints descriptive statistics for drift, volatility, and correlation grouped by cointegration status.
+        log_reg (bool, optional):
+            If True, fits a logistic regression model to predict cointegration and evaluates its performance.
+        decis_tree (bool, optional):
+            If True, fits a decision tree model to predict cointegration and evaluates its performance.
+
+    References
+    ----------
+    Stefan Jansen (2020). Machine Learning for Algorithmic Trading - Second Edition.
+    chapter 9, Time-Series Models for Volatility Forecasts and Statistical Arbitrage.
+
+    Example:
+        >>> import pandas as pd
+        >>> from bbstrader.tseries import find_cointegrated_pairs, analyze_cointegrated_pairs
+
+        >>> # Sample Data
+        >>> securities = pd.DataFrame({
+        ...     'SPY': [100, 102, 101, 103, 105],
+        ...     'QQQ': [50, 52, 53, 51, 54]
+        ... })
+        >>> candidates = pd.DataFrame({
+        ...     'AAPL': [100, 101, 99, 102, 104],
+        ...     'MSFT': [200, 202, 201, 203, 205]
+        ... })
+
+        >>> pairs = find_cointegrated_pairs(securities, candidates, n=2, coint=True)
+        >>> analyze_cointegrated_pairs(pairs, plot_coint=True, cosstab=True, heuristics=True, log_reg=True, decis_tree=True
+    """
     if plot_coint:
-        trace0_cv =
+        trace0_cv = __CRITICAL_VALUES[0][.95]
         spreads = spreads.reset_index()
         sns.scatterplot(x=np.log1p(spreads.t.abs()),
                         y=np.log1p(spreads.trace1),
@@ -1335,11 +1602,13 @@ def analyze_cointegrated_pairs(spreads: pd.DataFrame, plot_coint=False, cosstab=
     for i, heuristic in enumerate(['drift', 'vol', 'corr', 'corr_ret']):
         sns.boxplot(x='coint', y=heuristic, data=spreads, ax=axes[i])
     fig.tight_layout();
+
     if heuristics:
         spreads = spreads.reset_index()
         h = spreads.groupby(spreads.coint)[
             ['drift', 'vol', 'corr']].describe().stack(level=0).swaplevel().sort_index()
         print(h)
+
     if log_reg:
         y = spreads.coint
         X = spreads[['drift', 'vol', 'corr', 'corr_ret']]
@@ -1353,6 +1622,7 @@ def analyze_cointegrated_pairs(spreads: pd.DataFrame, plot_coint=False, cosstab=
         res = f'C:{np.log10(scores.idxmax()):.2f}, AUC: {scores.max():.2%}'
         print(res)
         print(log_reg.coef_)
+
     if decis_tree:
         model = DecisionTreeClassifier(class_weight='balanced')
         decision_tree = GridSearchCV(model,
@@ -1364,22 +1634,77 @@ def analyze_cointegrated_pairs(spreads: pd.DataFrame, plot_coint=False, cosstab=
         decision_tree.fit(X, y)
         res = f'{decision_tree.best_score_:.2%}, Depth: {decision_tree.best_params_["max_depth"]}'
         print(res)
-
+
+    if crosstab:
         pd.set_option('display.float_format', lambda x: f'{x:.2%}')
         print(pd.crosstab(spreads.eg_sig, spreads.joh_sig))
         print(pd.crosstab(spreads.eg_sig, spreads.joh_sig, normalize=True))
 
 
-def select_candidate_pairs(pairs: pd.DataFrame):
+def select_candidate_pairs(pairs: pd.DataFrame, period=False):
+    """
+    Select candidate pairs from a DataFrame based on cointegration status.
+
+    This function filters the input DataFrame to select pairs where the 'coint' column equals 1,
+    indicating cointegration. It then determines the dependent and independent series for each pair
+    and returns the selected pairs in a dictionary format.
+
+    Args:
+        pairs (pd.DataFrame): A DataFrame containing pairs of time series with columns 'coint', 's1', 's2', and 's1_dep'.
+        period (bool, optional): If True, includes the 'period' column in the output. Defaults to False.
+
+    Returns:
+        list[dict]: A list of dictionaries, each containing the keys 'x' and 'y' (and optionally 'period') representing the selected pairs.
+
+    References
+    ----------
+    Stefan Jansen (2020). Machine Learning for Algorithmic Trading - Second Edition.
+    chapter 9, Time-Series Models for Volatility Forecasts and Statistical Arbitrage.
+    """
     candidates = pairs.query('coint == 1').copy()
-    candidates
-    candidates['
-    candidates.
+    candidates = candidates.reset_index()
+    candidates['y'] = candidates.apply(lambda x: x['s1'] if x.s1_dep else x['s2'], axis=1)
+    candidates['x'] = candidates.apply(lambda x: x['s2'] if x.s1_dep else x['s1'], axis=1)
+    if period:
+        return candidates[['x', 'y', 'period']].to_dict(orient='records')
     return candidates[['x', 'y']].to_dict(orient='records')
 
 
-def KFSmoother(
-    """
+def KFSmoother(prices: pd.Series | np.ndarray) -> pd.Series | np.ndarray:
+    """
+    Estimate rolling mean using Kalman Smoothing.
+
+    Args:
+        prices : pd.Series or np.ndarray
+            The input time series data to be smoothed. It must be either a pandas Series or a numpy array.
+
+    Returns:
+        pd.Series or np.ndarray
+            The smoothed time series data. If the input is a pandas Series, the output will also be a pandas Series with the same index.
+            If the input is a numpy array, the output will be a numpy array.
+
+    References
+    ----------
+    Stefan Jansen (2020). Machine Learning for Algorithmic Trading - Second Edition.
+    chapter 9, Time-Series Models for Volatility Forecasts and Statistical Arbitrage.
+
+    Examples
+    --------
+    >>> import yfinance as yf
+    >>> prices = yf.download('AAPL', start='2020-01-01', end='2021-01-01', multi_level_index=False)['Adj Close']
+    >>> prices = KFSmoother(prices)
+    >>> print(prices[:5])
+    Date
+    2020-01-02 00:00:00+00:00    36.39801407
+    2020-01-03 00:00:00+00:00    49.06231000
+    2020-01-06 00:00:00+00:00    55.86334436
+    2020-01-07 00:00:00+00:00    60.02240894
+    2020-01-08 00:00:00+00:00    63.15057948
+    dtype: float64
+
+    """
+    if not isinstance(prices, (np.ndarray, pd.Series)):
+        raise ValueError("Input must be either a numpy array or a pandas Series.")
    kf = PyKalmanFilter(
         transition_matrices=np.eye(1),
         observation_matrices=np.eye(1),
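For context, `KFSmoother` wraps pykalman's `KalmanFilter`; the identity transition and observation matrices shown above model prices as a random walk observed with noise. A standalone sketch of the same idea (the covariance and initial-state values here are assumptions in the spirit of Jansen's chapter 9, not read from this diff):

```python
import numpy as np
import pandas as pd
from pykalman import KalmanFilter

def kf_smooth(prices: pd.Series) -> pd.Series:
    """Sketch: smooth a price series with a 1-D random-walk state model.
    Only the first two arguments mirror the diff; the rest are assumed."""
    kf = KalmanFilter(
        transition_matrices=np.eye(1),    # state follows a random walk
        observation_matrices=np.eye(1),   # we observe the state directly
        initial_state_mean=0,
        initial_state_covariance=1,
        observation_covariance=1,
        transition_covariance=0.05,
    )
    state_means, _ = kf.smooth(prices.values)
    return pd.Series(state_means.flatten(), index=prices.index)
```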
@@ -1396,8 +1721,31 @@ def KFSmoother(self, prices: pd.Series | np.ndarray) -> pd.Series | np.ndarray:
     return state_means.flatten()
 
 
-def KFHedgeRatio(
-    """
+def KFHedgeRatio(x: pd.Series | np.ndarray, y: pd.Series | np.ndarray) -> np.ndarray:
+    """
+    Estimate Hedge Ratio using Kalman Filter.
+
+    Args:
+        x : pd.Series or np.ndarray
+            The independent variable, which can be either a pandas Series or a numpy array.
+        y : pd.Series or np.ndarray
+            The dependent variable, which can be either a pandas Series or a numpy array.
+
+    Returns:
+        np.ndarray
+            The estimated hedge ratio as a numpy array.
+
+        The function returns the negative of the first state variable of each Kalman Filter estimate,
+        which represents the estimated hedge ratio.
+
+    References
+    ----------
+    Stefan Jansen (2020). Machine Learning for Algorithmic Trading - Second Edition.
+    chapter 9, Time-Series Models for Volatility Forecasts and Statistical Arbitrage.
+    """
+    if (not isinstance(x, (np.ndarray, pd.Series))
+            or not isinstance(y, (np.ndarray, pd.Series))):
+        raise ValueError("Both x and y must be either a numpy array or a pandas Series.")
+
     delta = 1e-3
     trans_cov = delta / (1 - delta) * np.eye(2)
     obs_mat = np.expand_dims(np.vstack([[x], [np.ones(len(x))]]).T, axis=1)
@@ -1411,7 +1759,8 @@ def KFHedgeRatio(self, x: pd.Series, y: pd.Series) -> np.ndarray:
         observation_covariance=2,
         transition_covariance=trans_cov
     )
-
+    y = y.values if isinstance(y, pd.Series) else y
+    state_means, _ = kf.filter(y)
     # Indexing with [:, 0] in state_means[:, 0] extracts only the first state variable of
     # each Kalman Filter estimate, which is the estimated hedge ratio.
     return -state_means[:, 0]
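A usage sketch tying this helper to real data, assuming the module exports shown in this diff (tickers and dates are illustrative). Note that `KFHedgeRatio` returns the negated slope, so the spread is `y + hedge * x`:

```python
import yfinance as yf
from bbstrader.tseries import KFHedgeRatio  # export assumed per this diff

# Illustrative pair; auto_adjust=False keeps the 'Adj Close' field.
data = yf.download(["EWA", "EWC"], start="2022-01-01", end="2023-12-31",
                   auto_adjust=False)["Adj Close"].dropna()
hedge = KFHedgeRatio(x=data["EWA"], y=data["EWC"])  # one value per bar
spread = data["EWC"] + hedge * data["EWA"]          # y - slope*x, since hedge = -slope
print(spread.tail())
```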