bbstrader 0.1.94__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of bbstrader might be problematic.

Files changed (38)
  1. bbstrader/__init__.py +9 -9
  2. bbstrader/btengine/__init__.py +7 -7
  3. bbstrader/btengine/backtest.py +30 -26
  4. bbstrader/btengine/data.py +100 -79
  5. bbstrader/btengine/event.py +2 -1
  6. bbstrader/btengine/execution.py +18 -16
  7. bbstrader/btengine/performance.py +11 -7
  8. bbstrader/btengine/portfolio.py +35 -36
  9. bbstrader/btengine/strategy.py +119 -94
  10. bbstrader/config.py +14 -8
  11. bbstrader/core/__init__.py +0 -0
  12. bbstrader/core/data.py +22 -0
  13. bbstrader/core/utils.py +57 -0
  14. bbstrader/ibkr/__init__.py +0 -0
  15. bbstrader/ibkr/utils.py +0 -0
  16. bbstrader/metatrader/__init__.py +5 -5
  17. bbstrader/metatrader/account.py +117 -121
  18. bbstrader/metatrader/rates.py +83 -80
  19. bbstrader/metatrader/risk.py +23 -37
  20. bbstrader/metatrader/trade.py +169 -140
  21. bbstrader/metatrader/utils.py +3 -3
  22. bbstrader/models/__init__.py +5 -5
  23. bbstrader/models/factors.py +280 -0
  24. bbstrader/models/ml.py +1092 -0
  25. bbstrader/models/optimization.py +31 -28
  26. bbstrader/models/{portfolios.py → portfolio.py} +64 -46
  27. bbstrader/models/risk.py +15 -9
  28. bbstrader/trading/__init__.py +2 -2
  29. bbstrader/trading/execution.py +252 -164
  30. bbstrader/trading/scripts.py +8 -4
  31. bbstrader/trading/strategies.py +79 -66
  32. bbstrader/tseries.py +482 -107
  33. {bbstrader-0.1.94.dist-info → bbstrader-0.2.1.dist-info}/LICENSE +1 -1
  34. {bbstrader-0.1.94.dist-info → bbstrader-0.2.1.dist-info}/METADATA +6 -1
  35. bbstrader-0.2.1.dist-info/RECORD +37 -0
  36. bbstrader-0.1.94.dist-info/RECORD +0 -32
  37. {bbstrader-0.1.94.dist-info → bbstrader-0.2.1.dist-info}/WHEEL +0 -0
  38. {bbstrader-0.1.94.dist-info → bbstrader-0.2.1.dist-info}/top_level.txt +0 -0
bbstrader/tseries.py CHANGED
@@ -8,33 +8,34 @@ market analysis, and financial data exploration.
  """
  import pprint
  import warnings
+ from itertools import combinations
+ from typing import List, Tuple, Union
+
+ import matplotlib.pyplot as plt
  import numpy as np
  import pandas as pd
- from tqdm import tqdm
- import yfinance as yf
  import pmdarima as pm
  import seaborn as sns
  import statsmodels.api as sm
- import matplotlib.pyplot as plt
  import statsmodels.tsa.stattools as ts
- from hurst import compute_Hc
+ import yfinance as yf
  from arch import arch_model
- from scipy.optimize import minimize
  from filterpy.kalman import KalmanFilter
+ from hurst import compute_Hc
  from pykalman import KalmanFilter as PyKalmanFilter
- from statsmodels.tsa.vector_ar.vecm import coint_johansen
+ from scipy.optimize import minimize
+ from sklearn.linear_model import LogisticRegressionCV
+ from sklearn.model_selection import GridSearchCV
+ from sklearn.tree import DecisionTreeClassifier
  from statsmodels.graphics.tsaplots import plot_acf
- from statsmodels.tsa.stattools import adfuller, coint
+ from statsmodels.stats.diagnostic import acorr_ljungbox
  from statsmodels.tsa.arima.model import ARIMA
+ from statsmodels.tsa.stattools import adfuller, coint
  from statsmodels.tsa.vector_ar.var_model import VAR
- from sklearn.model_selection import GridSearchCV
- from sklearn.tree import DecisionTreeClassifier
- from sklearn.linear_model import LogisticRegressionCV
- from statsmodels.stats.diagnostic import acorr_ljungbox
- from itertools import combinations
- from typing import Union, List, Tuple
- warnings.filterwarnings("ignore")
+ from statsmodels.tsa.vector_ar.vecm import coint_johansen
+ from tqdm import tqdm

+ warnings.filterwarnings("ignore")


  __all__ = [
@@ -50,7 +51,17 @@ __all__ = [
  "run_kalman_filter",
  "ArimaGarchModel",
  "KalmanFilterModel",
- "OrnsteinUhlenbeckModel"
+ "OrnsteinUhlenbeck",
+ "remove_correlated_assets",
+ "check_stationarity",
+ "remove_stationary_assets",
+ "select_assets",
+ "compute_pair_metrics",
+ "find_cointegrated_pairs",
+ "analyze_cointegrated_pairs",
+ "select_candidate_pairs",
+ "KFSmoother",
+ "KFHedgeRatio",
  ]

  # *******************************************
@@ -123,7 +134,8 @@ def fit_best_arima(window_data: Union[pd.Series, np.ndarray]):
  from arch.utility.exceptions import ConvergenceWarning as ArchWarning
  from statsmodels.tools.sm_exceptions import ConvergenceWarning as StatsWarning
  with warnings.catch_warnings():
- warnings.filterwarnings("ignore", category=StatsWarning, module='statsmodels')
+ warnings.filterwarnings(
+ "ignore", category=StatsWarning, module='statsmodels')
  warnings.filterwarnings("ignore", category=ArchWarning, module='arch')
  try:
  best_arima_model = ARIMA(
@@ -499,7 +511,8 @@ def get_corr(tickers: Union[List[str], Tuple[str, ...]], start: str, end: str) -
  >>> get_corr(['AAPL', 'MSFT', 'GOOG'], '2023-01-01', '2023-12-31')
  """
  # Download historical data
- data = yf.download(tickers, start=start, end=end)['Adj Close']
+ data = yf.download(tickers, start=start, end=end,
+ multi_level_index=False)['Adj Close']

  # Calculate correlation matrix
  correlation_matrix = data.corr()
@@ -643,8 +656,10 @@ def run_cadf_test(pair: Union[List[str], Tuple[str, ...]], start: str, end: str)
  """
  # Download historical data for required stocks
  p0, p1 = pair[0], pair[1]
- _p0 = yf.download(p0, start=start, end=end)
- _p1 = yf.download(p1, start=start, end=end)
+ _p0 = yf.download(p0, start=start, end=end,
+ progress=False, multi_level_index=False)
+ _p1 = yf.download(p1, start=start, end=end,
+ progress=False, multi_level_index=False)
  df = pd.DataFrame(index=_p0.index)
  df[p0] = _p0["Adj Close"]
  df[p1] = _p1["Adj Close"]
@@ -673,7 +688,7 @@ def run_cadf_test(pair: Union[List[str], Tuple[str, ...]], start: str, end: str)
  # Display regression metrics
  print("\nRegression Metrics:")
  print(f"Optimal Hedge Ratio (Beta): {beta_hr}")
- print(f'Result Parmas: \n')
+ print('Result Parmas: \n')
  print(results.params)
  print("\nRegression Summary:")
  print(results.summary())
@@ -726,7 +741,8 @@ def run_hurst_test(symbol: str, start: str, end: str):

  >>> run_hurst_test('AAPL', '2023-01-01', '2023-12-31')
  """
- data = yf.download(symbol, start=start, end=end)
+ data = yf.download(symbol, start=start, end=end,
+ progress=False, multi_level_index=False)

  # Create a Geometric Brownian Motion, Mean-Reverting, and Trending Series
  gbm = np.log(np.cumsum(np.random.randn(100000))+1000)
@@ -743,7 +759,7 @@
  def test_cointegration(ticker1, ticker2, start, end):
  # Download historical data
  stock_data_pair = yf.download(
- [ticker1, ticker2], start=start, end=end
+ [ticker1, ticker2], start=start, end=end, progress=False, multi_level_index=False
  )['Adj Close'].dropna()

  # Perform Johansen cointegration test
@@ -865,16 +881,16 @@ def run_kalman_filter(
  etfs: Union[List[str], Tuple[str, ...]],
  start: str, end: str) -> None:
  """
- Applies a Kalman filter to a pair of ETF adjusted closing prices within a specified date range
+ Applies a Kalman filter to a pair of assets adjusted closing prices within a specified date range
  to estimate the slope and intercept over time.

- The function downloads historical adjusted closing prices for the specified pair of ETFs,
+ The function downloads historical adjusted closing prices for the specified pair of assets,
  visualizes their price relationship, calculates the Kalman filter estimates for the slope and
  intercept, and visualizes the changes in these estimates over time.

  Args:
  etfs (Union[List[str] , Tuple[str, ...]]):
- A list or tuple containing two valid ETF tickers (e.g., ['SPY', 'QQQ']).
+ A list or tuple containing two valid assets tickers (e.g., ['SPY', 'QQQ']).
  start (str): The start date for the historical data in 'YYYY-MM-DD' format.
  end (str): The end date for the historical data in 'YYYY-MM-DD' format.

@@ -883,8 +899,10 @@ def run_kalman_filter(

  >>> run_kalman_filter(['SPY', 'QQQ'], '2023-01-01', '2023-12-31')
  """
- etf_df1 = yf.download(etfs[0], start, end)
- etf_df2 = yf.download(etfs[1], start, end)
+ etf_df1 = yf.download(etfs[0], start, end,
+ progress=False, multi_level_index=False)
+ etf_df2 = yf.download(etfs[1], start, end,
+ progress=False, multi_level_index=False)

  prices = pd.DataFrame(index=etf_df1.index)
  prices[etfs[0]] = etf_df1["Adj Close"]
@@ -919,13 +937,14 @@ class KalmanFilterModel():
  """
  self.tickers = tickers
  assert self.tickers is not None
- self.latest_prices = np.array([-1.0, -1.0])
- self.delta = kwargs.get("delta", 1e-4)
- self.wt = self.delta/(1-self.delta) * np.eye(2)
- self.vt = kwargs.get("vt", 1e-3)
+
+ self.R = None
  self.theta = np.zeros(2)
  self.P = np.zeros((2, 2))
- self.R = None
+ self.delta = kwargs.get("delta", 1e-4)
+ self.vt = kwargs.get("vt", 1e-3)
+ self.wt = self.delta/(1-self.delta) * np.eye(2)
+ self.latest_prices = np.array([-1.0, -1.0])
  self.kf = self._init_kalman()

  def _init_kalman(self):
@@ -945,6 +964,7 @@
  return kf

  Array = np.ndarray
+
  def calc_slope_intercep(self, prices: Array) -> Tuple:
  """
  Calculates and returns the slope and intercept
@@ -958,25 +978,24 @@
  Returns:
  A tuple containing the slope and intercept of the relationship
  """
- kf = self.kf
- kf.H = np.array([[prices[1], 1.0]])
- kf.predict()
- kf.update(prices[0])
- slope = kf.x.copy().flatten()[0]
- intercept = kf.x.copy().flatten()[1]
+ self.kf.H = np.array([[prices[1], 1.0]])
+ self.kf.predict()
+ self.kf.update(prices[1])
+ slope = self.kf.x.copy().flatten()[0]
+ intercept = self.kf.x.copy().flatten()[1]

  return slope, intercept
-
+
  def calculate_etqt(self, prices: Array) -> Tuple:
  """
- Calculates the forecast error and standard deviation of the predictions
+ Calculates the ``forecast error`` and ``standard deviation`` of the predictions
  using the Kalman Filter.

  Args:
  prices : A numpy array of prices for two financial instruments.

  Returns:
- A tuple containing the forecast error and standard deviation of the predictions.
+ A tuple containing the ``forecast error`` and ``standard deviation`` of the predictions.
  """

  self.latest_prices[0] = prices[0]
@@ -1021,7 +1040,7 @@
  At = self.R.dot(F.T) / Qt
  self.theta = self.theta + At.flatten() * et
  self.C = self.R - At * F.dot(self.R)
- return (et, sqrt_Qt)
+ return (et[0], sqrt_Qt.flatten()[0])
  else:
  return None

@@ -1165,7 +1184,7 @@ class OrnsteinUhlenbeck():
  Returns:
  np.ndarray: 2D array representing simulated processes.
  """
- if returns is None:
+ if returns is None:
  returns = self.returns
  if p is not None:
  T = p
@@ -1190,6 +1209,33 @@


  def remove_correlated_assets(df: pd.DataFrame, cutoff=.99):
+ """
+ Removes highly correlated assets from a DataFrame based on a specified correlation cutoff threshold.
+ This is useful in financial data analysis to reduce redundancy and multicollinearity in portfolios or datasets.
+
+ Args:
+ df (pd.DataFrame): A DataFrame where each column represents an asset
+ and rows represent observations (e.g., time-series data).
+ cutoff (float, optional, default=0.99): The correlation threshold.
+ Columns with absolute correlation greater than this value will be considered for removal.
+
+ Returns:
+ pd.DataFrame: A DataFrame with less correlated assets.
+ The columns that are highly correlated (above the cutoff) are removed.
+
+ References
+ ----------
+ Stefan Jansen (2020). Machine Learning for Algorithmic Trading - Second Edition.
+ chapter 9, Time-Series Models for Volatility Forecasts and Statistical Arbitrage.
+
+ Example:
+ >>> df = pd.DataFrame({
+ ... 'AAPL': [100, 101, 102, 103, 104],
+ ... 'MSFT': [200, 201, 202, 203, 204],
+ ... 'GOOG': [300, 301, 302, 303, 304]
+ ... })
+ >>> df = remove_correlated_assets(df)
+ """
  corr = df.corr().stack()
  corr = corr[corr < 1]
  to_check = corr[corr.abs() > cutoff].index
@@ -1208,6 +1254,32 @@ def remove_correlated_assets(df: pd.DataFrame, cutoff=.99):


  def check_stationarity(df: pd.DataFrame):
+ """
+ Tests the stationarity of time-series data for each asset in the DataFrame
+ using the Augmented Dickey-Fuller (ADF) test. Stationarity is a key property
+ in time-series analysis, and non-stationary data can affect model performance.
+
+ Args:
+ df (pd.DataFrame): A DataFrame where each column represents a time series of an asset.
+
+ Returns:
+ pd.DataFrame: A DataFrame containing the ADF p-values for each asset,
+ - ticker Asset name (column name from df).
+ - adf p-value from the ADF test, indicating the probability of the null hypothesis (data is non-stationary).
+
+ References
+ ----------
+ Stefan Jansen (2020). Machine Learning for Algorithmic Trading - Second Edition.
+ chapter 9, Time-Series Models for Volatility Forecasts and Statistical Arbitrage.
+
+ Example:
+ >>> df = pd.DataFrame({
+ ... 'AAPL': [100, 101, 102, 103, 104],
+ ... 'MSFT': [200, 201, 202, 203, 204],
+ ... 'GOOG': [300, 301, 302, 303, 304]
+ ... })
+ >>> df = check_stationarity(df)
+ """
  results = []
  for ticker, prices in df.items():
  results.append([ticker, adfuller(prices, regression='ct')[1]])
@@ -1215,37 +1287,133 @@ def check_stationarity(df: pd.DataFrame):


  def remove_stationary_assets(df: pd.DataFrame, pval=.05):
+ """
+ Filters out stationary assets from the DataFrame based on the p-value obtained
+ from the Augmented Dickey-Fuller test.
+ Useful for focusing only on non-stationary time-series data.
+
+ Args:
+ df (pd.DataFrame): A DataFrame where each column represents a time series of an asset.
+ pval (float, optional, default=0.05): The significance level to determine stationarity.
+ Columns with an ADF test p-value below this threshold are considered stationary and removed.
+
+ Returns:
+ pd.DataFrame: A DataFrame containing only the non-stationary assets.
+
+ References
+ ----------
+ Stefan Jansen (2020). Machine Learning for Algorithmic Trading - Second Edition.
+ chapter 9, Time-Series Models for Volatility Forecasts and Statistical Arbitrage.
+
+ Example:
+ >>> df = pd.DataFrame({
+ ... 'AAPL': [100, 101, 102, 103, 104],
+ ... 'MSFT': [200, 201, 202, 203, 204],
+ ... 'GOOG': [300, 301, 302, 303, 304]
+ ... })
+ >>> df = remove_stationary_assets(df)
+ """
  test_result = check_stationarity(df)
  stationary = test_result.loc[test_result.adf <= pval, 'ticker'].tolist()
  return df.drop(stationary, axis=1).sort_index()


- def select_assets(df: pd.DataFrame, n=100, start=None, end=None):
+ def select_assets(df: pd.DataFrame, n=100, start=None, end=None, rolling_window=None):
+ """
+ Selects the top N assets based on the average trading volume from the input DataFrame.
+ These assets are used as universe in which we can search cointegrated pairs for pairs trading strategies.
+
+ Args:
+ df (pd.DataFrame): A multi-index DataFrame with levels ['ticker', 'date'] containing market data.
+ Must include columns 'close' (price) and 'volume'.
+ n (int, optional): Number of assets to select based on highest average trading volume. Default is 100.
+ start (str, optional): Start date for filtering the data. Default is the earliest date in the DataFrame.
+ end (str, optional): End date for filtering the data. Default is the latest date in the DataFrame.
+ rolling_window (int, optional): Rolling window for calculating the average trading volume. Default is None.
+
+ Returns:
+ pd.DataFrame: A DataFrame of selected assets with filtered, cleaned data, indexed by date.
+
+ References
+ ----------
+ Stefan Jansen (2020). Machine Learning for Algorithmic Trading - Second Edition.
+ chapter 9, Time-Series Models for Volatility Forecasts and Statistical Arbitrage.
+ """
+ required_columns = {'close', 'volume'}
+ if not required_columns.issubset(df.columns):
+ raise ValueError(
+ f"Input DataFrame must contain {required_columns}, but got {df.columns.tolist()}.")
+
+ if not isinstance(df.index, pd.MultiIndex) or 'ticker' not in df.index.names or 'date' not in df.index.names:
+ raise ValueError(
+ "Index must be a MultiIndex with levels ['ticker', 'date'].")
+
+ df = df.copy()
  idx = pd.IndexSlice
  start = start or df.index.get_level_values('date').min()
  end = end or df.index.get_level_values('date').max()
  df = (df
- .loc[lambda df: ~df.index.duplicated()]
- .sort_index()
- .loc[idx[:, f'{start}':f'{end}'], :]
- .assign(dv=lambda df: df.close.mul(df.volume)))
-
- # select n assets with the highest average trading volume
- # we are taking a shortcut to simplify; should select
- # based on historical only, e.g. yearly rolling avg
- most_traded = (df.groupby(level='ticker')
- .dv.mean()
- .nlargest(n=n).index)
-
+ .loc[lambda df: ~df.index.duplicated()]
+ .sort_index()
+ .loc[idx[:, f'{start}':f'{end}'], :]
+ .assign(dv=lambda df: df.close.mul(df.volume)))
+
+ if rolling_window is None:
+ most_traded = (df.groupby(level='ticker')
+ .dv.mean()
+ .nlargest(n=n).index)
+ else:
+ # Calculate the rolling average of dollar volume
+ df['dv_rolling_avg'] = (
+ df.groupby(level=0)
+ .dv
+ .rolling(window=rolling_window, min_periods=1)
+ .mean()
+ .reset_index(level=0, drop=True)
+ )
+ most_traded = (
+ df.groupby(level=0)['dv_rolling_avg']
+ .mean()
+ .nlargest(n=n)
+ .index
+ )
  df = (df.loc[idx[most_traded, :], 'close']
- .unstack('ticker')
- .ffill(limit=5) # fill up to five values
- .dropna(axis=1)) # remove assets with any missing values
-
+ .unstack('ticker')
+ .ffill(limit=5)
+ .dropna(axis=1))
  df = remove_correlated_assets(df)
- return remove_stationary_assets(df).sort_index()
+ df = remove_stationary_assets(df)
+ return df.sort_index()
+

  def compute_pair_metrics(security: pd.Series, candidates: pd.DataFrame):
+ """
+ Calculates statistical and econometric metrics for a target security and a set of candidate securities.
+ These metrics are useful in financial modeling and pairs trading strategies,
+ providing information about drift, volatility, correlation, and cointegration.
+
+ Args:
+ security (pd.Series): A time-series of the target security's prices.
+ The name of the Series should correspond to the security's identifier (e.g., ticker symbol).
+ candidates (pd.DataFrame): A DataFrame where each column represents a time-series of prices
+ for candidate securities to be evaluated against the target security.
+
+ Returns:
+ pd.DataFrame: A DataFrame combining:
+ Drift: Estimated drift of spreads between the target security and each candidate.
+ Volatility: Standard deviation of spreads.
+ Correlation:
+ ``corr``: Correlation of normalized prices between the target and each candidate.
+ ``corr_ret``: Correlation of returns (percentage change) between the target and each candidate.
+ Cointegration metrics:
+ Engle-Granger test statistics (``t1``, ``t2``) and p-values (``p1``, ``p2``).
+ Johansen test trace statistics (``trace0``, ``trace1``) and selected lag order (``k_ar_diff``).
+
+ References
+ ----------
+ Stefan Jansen (2020). Machine Learning for Algorithmic Trading - Second Edition.
+ chapter 9, Time-Series Models for Volatility Forecasts and Statistical Arbitrage.
+ """
  security = security.div(security.iloc[0])
  ticker = security.name
  candidates = candidates.div(candidates.iloc[0])
@@ -1253,133 +1421,315 @@ def compute_pair_metrics(security: pd.Series, candidates: pd.DataFrame):
  n, m = spreads.shape
  X = np.ones(shape=(n, 2))
  X[:, 1] = np.arange(1, n + 1)
-
+
  # compute drift
  drift = ((np.linalg.inv(X.T @ X) @ X.T @ spreads).iloc[1]
  .to_frame('drift'))
-
+
  # compute volatility
  vol = spreads.std().to_frame('vol')
-
- # return correlation
+
+ # returns correlation
  corr_ret = (candidates.pct_change()
  .corrwith(security.pct_change())
  .to_frame('corr_ret'))
-
+
  # normalized price series correlation
  corr = candidates.corrwith(security).to_frame('corr')
  metrics = drift.join(vol).join(corr).join(corr_ret).assign(n=n)
-
+
  tests = []
  # run cointegration tests
  for candidate, prices in tqdm(candidates.items()):
  df = pd.DataFrame({'s1': security, 's2': prices})
  var = VAR(df.values)
- lags = var.select_order() # select VAR order
+ lags = var.select_order() # select VAR order
  k_ar_diff = lags.selected_orders['aic']
  # Johansen Test with constant Term and estd. lag order
  cj0 = coint_johansen(df, det_order=0, k_ar_diff=k_ar_diff)
  # Engle-Granger Tests
  t1, p1 = coint(security, prices, trend='c')[:2]
  t2, p2 = coint(prices, security, trend='c')[:2]
- tests.append([ticker, candidate, t1, p1, t2, p2,
+ tests.append([ticker, candidate, t1, p1, t2, p2,
  k_ar_diff, *cj0.lr1])
- columns = ['s1', 's2', 't1', 'p1', 't2', 'p2', 'k_ar_diff', 'trace0', 'trace1']
+ columns = ['s1', 's2', 't1', 'p1', 't2',
+ 'p2', 'k_ar_diff', 'trace0', 'trace1']
  tests = pd.DataFrame(tests, columns=columns).set_index('s2')
  return metrics.join(tests)

- CRITICAL_VALUES = {
+
+ __CRITICAL_VALUES = {
  0: {.9: 13.4294, .95: 15.4943, .99: 19.9349},
  1: {.9: 2.7055, .95: 3.8415, .99: 6.6349}
  }

- def find_cointegrated_pairs(securities: pd.DataFrame, candidates: pd.DataFrame,
- n=None, start=None, stop=None):
- trace0_cv = CRITICAL_VALUES[0][.95] # critical value for 0 cointegration relationships
- trace1_cv = CRITICAL_VALUES[1][.95] # critical value for 1 cointegration relationship
+
+ def find_cointegrated_pairs(securities: pd.DataFrame, candidates: pd.DataFrame,
+ n=None, start=None, stop=None, coint=False):
+ """
+ Identifies cointegrated pairs between a target set of securities and candidate securities
+ based on econometric tests. The function evaluates statistical relationships,
+ such as cointegration and Engle-Granger significance, to determine pairs suitable
+ for financial strategies like pairs trading.
+
+ Args:
+ securities (`pd.DataFrame`): A DataFrame where each column represents the time-series
+ prices of target securities to evaluate.
+ candidates (`pd.DataFrame`): A DataFrame where each column represents the time-series
+ prices of candidate securities to compare against the target securities.
+ n (`int`, optional): The number of top pairs to return. If `None`, returns all pairs.
+ start (`str`, optional): Start date for slicing the data (e.g., 'YYYY-MM-DD').
+ stop (`str`, optional): End date for slicing the data (e.g., 'YYYY-MM-DD').
+ coint (`bool`, optional, default=False):
+ - If `True`, filters for pairs identified as cointegrated.
+ - If `False`, returns all evaluated pairs.
+
+ Returns:
+ - ``pd.DataFrame``: A DataFrame containing:
+ - Johansen and Engle-Granger cointegration metrics:
+ - `t1`, `t2`: Engle-Granger test statistics for two directions.
+ - `p1`, `p2`: Engle-Granger p-values for two directions.
+ - `trace0`, `trace1`: Johansen test trace statistics for 0 and 1 cointegration relationships.
+ - Indicators and filters:
+ - `joh_sig`: Indicates Johansen cointegration significance.
+ - `eg_sig`: Indicates Engle-Granger significance (p-value < 0.05).
+ - `s1_dep`: Indicates whether the first series depends on the second (based on p-values).
+ - `coint`: Combined cointegration indicator (Johansen & Engle-Granger).
+ - Spread and ranking:
+ - `t`: Minimum of `t1` and `t2`.
+ - `p`: Minimum of `p1` and `p2`.
+ References
+ ----------
+ Stefan Jansen (2020). Machine Learning for Algorithmic Trading - Second Edition.
+ chapter 9, Time-Series Models for Volatility Forecasts and Statistical Arbitrage.
+
+ Example:
+ >>> import pandas as pd
+
+ >>> # Sample Data
+ >>> data_securities = {
+ ... 'Security1': [100, 102, 101, 103, 105],
+ ... 'Security2': [50, 52, 53, 51, 54]
+ ... }
+ >>> data_candidates = {
+ ... 'Candidate1': [100, 101, 99, 102, 104],
+ ... 'Candidate2': [200, 202, 201, 203, 205]
+ ... }
+
+ >>> securities = pd.DataFrame(data_securities, index=pd.date_range('2023-01-01', periods=5))
+ >>> candidates = pd.DataFrame(data_candidates, index=pd.date_range('2023-01-01', periods=5))
+
+ >>> # Find cointegrated pairs
+ >>> top_pairs = find_cointegrated_pairs(securities, candidates, n=2, coint=True)
+ >>> print(top_pairs)
+
+ >>> | s1 | s2 | t | p | joh_sig | eg_sig | coint |
+ >>> |----------|-----------|------|-------|---------|--------|-------|
+ >>> | Security1| Candidate1| -3.5 | 0.01 | 1 | 1 | 1 |
+ >>> | Security2| Candidate2| -2.9 | 0.04 | 1 | 1 | 1 |
+ """
+ trace0_cv = __CRITICAL_VALUES[0][.95] # critical value for 0 cointegration relationships
+ # critical value for 1 cointegration relationship
+ trace1_cv = __CRITICAL_VALUES[1][.95]
  spreads = []
  if start is not None and stop is not None:
  securities = securities.loc[str(start): str(stop), :]
  candidates = candidates.loc[str(start): str(stop), :]
  for i, (ticker, prices) in enumerate(securities.items(), 1):
- df = compute_pair_metrics(prices, candidates)
- spreads.append(df.set_index('s1', append=True))
+ try:
+ df = compute_pair_metrics(prices, candidates)
+ spreads.append(df.set_index('s1', append=True))
+ except np.linalg.LinAlgError:
+ continue
  spreads = pd.concat(spreads)
  spreads.index.names = ['s2', 's1']
  spreads = spreads.swaplevel()
  spreads['t'] = spreads[['t1', 't2']].min(axis=1)
  spreads['p'] = spreads[['p1', 'p2']].min(axis=1)
  spreads['joh_sig'] = ((spreads.trace0 > trace0_cv) &
- (spreads.trace1 > trace1_cv)).astype(int)
+ (spreads.trace1 > trace1_cv)).astype(int)
  spreads['eg_sig'] = (spreads.p < .05).astype(int)
  spreads['s1_dep'] = spreads.p1 < spreads.p2
  spreads['coint'] = (spreads.joh_sig & spreads.eg_sig).astype(int)
  # select top n pairs
- if n is not None:
- top_pairs = (spreads.query('coint == 1')
- .sort_values('t', ascending=False)
- .head(n))
+ if coint:
+ if n is not None:
+ top_pairs = (spreads.query('coint == 1')
+ .sort_values('t', ascending=False)
+ .head(n))
+ else:
+ top_pairs = (spreads.query('coint == 1')
+ .sort_values('t', ascending=False))
  else:
- top_pairs = spreads.query('coint == 1')
+ if n is not None:
+ top_pairs = (spreads
+ .sort_values('t', ascending=False)
+ .head(n))
+ else:
+ top_pairs = (spreads
+ .sort_values('t', ascending=False))
  return top_pairs

- def analyze_cointegrated_pairs(spreads: pd.DataFrame, plot_coint=False, cosstab=False,
+
+ def analyze_cointegrated_pairs(spreads: pd.DataFrame, plot_coint=True, crosstab=False,
  heuristics=False, log_reg=False, decis_tree=False):
+ """
+ Analyzes cointegrated pairs by visualizing, summarizing, and applying predictive models.
+
+ Args:
+ spreads (pd.DataFrame):
+ A DataFrame containing cointegration metrics and characteristics.
+ Required columns: 'coint', 't', 'trace0', 'trace1', 'drift', 'vol', 'corr', 'corr_ret', 'eg_sig', 'joh_sig'.
+ plot_coint (bool, optional):
+ If True, generates scatterplots and boxplots to visualize cointegration characteristics.
+ cosstab (bool, optional):
+ If True, displays crosstabulations of Engle-Granger and Johansen test significance.
+ heuristics (bool, optional):
+ If True, prints descriptive statistics for drift, volatility, and correlation grouped by cointegration status.
+ log_reg (bool, optional):
+ If True, fits a logistic regression model to predict cointegration and evaluates its performance.
+ decis_tree (bool, optional):
+ If True, fits a decision tree model to predict cointegration and evaluates its performance.
+
+ References
+ ----------
+ Stefan Jansen (2020). Machine Learning for Algorithmic Trading - Second Edition.
+ chapter 9, Time-Series Models for Volatility Forecasts and Statistical Arbitrage.
+
+ Example:
+ >>> import pandas as pd
+ >>> from bbstrader.tseries import find_cointegrated_pairs, analyze_cointegrated_pairs
+
+ >>> # Sample Data
+ >>> securities = pd.DataFrame({
+ ... 'SPY': [100, 102, 101, 103, 105],
+ ... 'QQQ': [50, 52, 53, 51, 54]
+ ... })
+ >>> candidates = pd.DataFrame({
+ ... 'AAPL': [100, 101, 99, 102, 104],
+ ... 'MSFT': [200, 202, 201, 203, 205]
+ ... })
+
+ >>> pairs = find_cointegrated_pairs(securities, candidates, n=2, coint=True)
+ >>> analyze_cointegrated_pairs(pairs, plot_coint=True, cosstab=True, heuristics=True, log_reg=True, decis_tree=True
+ """
  if plot_coint:
- trace0_cv = CRITICAL_VALUES[0][.95]
+ trace0_cv = __CRITICAL_VALUES[0][.95]
  spreads = spreads.reset_index()
- sns.scatterplot(x=np.log1p(spreads.t.abs()),
- y=np.log1p(spreads.trace1),
- hue='coint', data=spreads[spreads.trace0>trace0_cv]);
+ sns.scatterplot(x=np.log1p(spreads.t.abs()),
+ y=np.log1p(spreads.trace1),
+ hue='coint', data=spreads[spreads.trace0 > trace0_cv])
  fig, axes = plt.subplots(ncols=4, figsize=(20, 5))
  for i, heuristic in enumerate(['drift', 'vol', 'corr', 'corr_ret']):
  sns.boxplot(x='coint', y=heuristic, data=spreads, ax=axes[i])
- fig.tight_layout();
+ fig.tight_layout()
+
  if heuristics:
  spreads = spreads.reset_index()
  h = spreads.groupby(spreads.coint)[
  ['drift', 'vol', 'corr']].describe().stack(level=0).swaplevel().sort_index()
  print(h)
+
  if log_reg:
  y = spreads.coint
  X = spreads[['drift', 'vol', 'corr', 'corr_ret']]
- log_reg = LogisticRegressionCV(Cs=np.logspace(-10, 10, 21),
- class_weight='balanced',
- scoring='roc_auc')
+ log_reg = LogisticRegressionCV(Cs=np.logspace(-10, 10, 21),
+ class_weight='balanced',
+ scoring='roc_auc')
  log_reg.fit(X=X, y=y)
  Cs = log_reg.Cs_
  scores = pd.DataFrame(log_reg.scores_[True], columns=Cs).mean()
- scores.plot(logx=True);
+ scores.plot(logx=True)
  res = f'C:{np.log10(scores.idxmax()):.2f}, AUC: {scores.max():.2%}'
  print(res)
  print(log_reg.coef_)
+
  if decis_tree:
  model = DecisionTreeClassifier(class_weight='balanced')
  decision_tree = GridSearchCV(model,
- param_grid={'max_depth': list(range(1, 10))},
- cv=5,
- scoring='roc_auc')
+ param_grid={
+ 'max_depth': list(range(1, 10))},
+ cv=5,
+ scoring='roc_auc')
  y = spreads.coint
  X = spreads[['drift', 'vol', 'corr', 'corr_ret']]
  decision_tree.fit(X, y)
  res = f'{decision_tree.best_score_:.2%}, Depth: {decision_tree.best_params_["max_depth"]}'
  print(res)
- if cosstab:
+
+ if crosstab:
  pd.set_option('display.float_format', lambda x: f'{x:.2%}')
  print(pd.crosstab(spreads.eg_sig, spreads.joh_sig))
  print(pd.crosstab(spreads.eg_sig, spreads.joh_sig, normalize=True))


- def select_candidate_pairs(pairs: pd.DataFrame):
+ def select_candidate_pairs(pairs: pd.DataFrame, period=False):
+ """
+ Select candidate pairs from a DataFrame based on cointegration status.
+
+ This function filters the input DataFrame to select pairs where the 'coint' column equals 1,
+ indicating cointegration. It then determines the dependent and independent series for each pair
+ and returns the selected pairs in a dictionary format.
+
+ Args:
+ pairs (pd.DataFrame): A DataFrame containing pairs of time series with columns 'coint', 's1', 's2', and 's1_dep'.
+ period (bool, optional): If True, includes the 'period' column in the output. Defaults to False.
+
+ Returns:
+ list[dict]: A list of dictionaries, each containing the keys 'x' and 'y' (and optionally 'period') representing the selected pairs.
+
+ References
+ ----------
+ Stefan Jansen (2020). Machine Learning for Algorithmic Trading - Second Edition.
+ chapter 9, Time-Series Models for Volatility Forecasts and Statistical Arbitrage.
+ """
  candidates = pairs.query('coint == 1').copy()
- candidates['y'] = candidates.apply(lambda x: x.s1 if x.s1_dep else x.s2, axis=1)
- candidates['x'] = candidates.apply(lambda x: x.s2 if x.s1_dep else x.s1, axis=1)
- candidates.drop(['s1_dep', 's1', 's2'], axis=1)
+ candidates = candidates.reset_index()
+ candidates['y'] = candidates.apply(
+ lambda x: x['s1'] if x.s1_dep else x['s2'], axis=1)
+ candidates['x'] = candidates.apply(
+ lambda x: x['s2'] if x.s1_dep else x['s1'], axis=1)
+ if period:
+ return candidates[['x', 'y', 'period']].to_dict(orient='records')
  return candidates[['x', 'y']].to_dict(orient='records')


- def KFSmoother(self, prices: pd.Series | np.ndarray) -> pd.Series | np.ndarray:
- """Estimate rolling mean using Kalman Smoothing."""
+ def KFSmoother(prices: pd.Series | np.ndarray) -> pd.Series | np.ndarray:
+ """
+ Estimate rolling mean using Kalman Smoothing.
+
+ Args:
+ prices : pd.Series or np.ndarray
+ The input time series data to be smoothed. It must be either a pandas Series or a numpy array.
+
+ Returns:
+ pd.Series or np.ndarray
+ The smoothed time series data. If the input is a pandas Series, the output will also be a pandas Series with the same index.
+ If the input is a numpy array, the output will be a numpy array.
+
+ References
+ ----------
+ Stefan Jansen (2020). Machine Learning for Algorithmic Trading - Second Edition.
+ chapter 9, Time-Series Models for Volatility Forecasts and Statistical Arbitrage.
+
+ Examples
+ --------
+ >>> import yfinance as yf
+ >>> prices = yf.download('AAPL', start='2020-01-01', end='2021-01-01', multi_level_index=False)['Adj Close']
+ >>> prices = KFSmoother(prices)
+ >>> print(prices[:5])
+ Date
+ 2020-01-02 00:00:00+00:00 36.39801407
+ 2020-01-03 00:00:00+00:00 49.06231000
+ 2020-01-06 00:00:00+00:00 55.86334436
+ 2020-01-07 00:00:00+00:00 60.02240894
+ 2020-01-08 00:00:00+00:00 63.15057948
+ dtype: float64
+
+ """
+ if not isinstance(prices, (np.ndarray, pd.Series)):
+ raise ValueError(
+ "Input must be either a numpy array or a pandas Series.")
  kf = PyKalmanFilter(
  transition_matrices=np.eye(1),
  observation_matrices=np.eye(1),
@@ -1396,8 +1746,32 @@ def KFSmoother(self, prices: pd.Series | np.ndarray) -> pd.Series | np.ndarray:
  return state_means.flatten()


- def KFHedgeRatio(self, x: pd.Series, y: pd.Series) -> np.ndarray:
- """Estimate Hedge Ratio using Kalman Filter."""
+ def KFHedgeRatio(x: pd.Series | np.ndarray, y: pd.Series | np.ndarray) -> np.ndarray:
+ """
+ Estimate Hedge Ratio using Kalman Filter.
+ Args:
+ x : pd.Series or np.ndarray
+ The independent variable, which can be either a pandas Series or a numpy array.
+ y : pd.Series or np.ndarray
+ The dependent variable, which can be either a pandas Series or a numpy array.
+
+ Returns:
+ np.ndarray
+ The estimated hedge ratio as a numpy array.
+
+ The function returns the negative of the first state variable of each Kalman Filter estimate,
+ which represents the estimated hedge ratio.
+
+ References
+ ----------
+ Stefan Jansen (2020). Machine Learning for Algorithmic Trading - Second Edition.
+ chapter 9, Time-Series Models for Volatility Forecasts and Statistical Arbitrage.
+ """
+ if (not isinstance(x, (np.ndarray, pd.Series))
+ or not isinstance(y, (np.ndarray, pd.Series))):
+ raise ValueError(
+ "Both x and y must be either a numpy array or a pandas Series.")
+
  delta = 1e-3
  trans_cov = delta / (1 - delta) * np.eye(2)
  obs_mat = np.expand_dims(np.vstack([[x], [np.ones(len(x))]]).T, axis=1)
@@ -1411,7 +1785,8 @@ def KFHedgeRatio(self, x: pd.Series, y: pd.Series) -> np.ndarray:
  observation_covariance=2,
  transition_covariance=trans_cov
  )
- state_means, _ = kf.filter(y.values)
- # Indexing with [:, 0] in state_means[:, 0] extracts only the first state variable of
+ y = y.values if isinstance(y, pd.Series) else y
+ state_means, _ = kf.filter(y)
+ # Indexing with [:, 0] in state_means[:, 0] extracts only the first state variable of
  # each Kalman Filter estimate, which is the estimated hedge ratio.
  return -state_means[:, 0]
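
The functions newly exported from bbstrader.tseries in this release (select_assets, compute_pair_metrics, find_cointegrated_pairs, analyze_cointegrated_pairs, select_candidate_pairs, KFSmoother, KFHedgeRatio) chain into a small pairs-selection workflow. The following is a minimal sketch based only on the signatures and docstrings added in the diff above; the tickers, date range, and the split between target and candidate universes are illustrative assumptions, not part of the package.

import yfinance as yf
from bbstrader.tseries import (
    KFHedgeRatio,
    KFSmoother,
    find_cointegrated_pairs,
    select_candidate_pairs,
)

# Illustrative universe and dates (assumptions, not from the package).
tickers = ["SPY", "QQQ", "AAPL", "MSFT", "GOOG"]
prices = yf.download(tickers, start="2022-01-01", end="2023-12-31",
                     progress=False, multi_level_index=False)["Adj Close"].dropna()

# Evaluate two target series against three candidates and keep pairs flagged as cointegrated.
pairs = find_cointegrated_pairs(prices[["SPY", "QQQ"]],
                                prices[["AAPL", "MSFT", "GOOG"]], coint=True)
selected = select_candidate_pairs(pairs)  # [{'x': ..., 'y': ...}, ...]

for pair in selected:
    x, y = pair["x"], pair["y"]
    smoothed = KFSmoother(prices[y])                  # Kalman-smoothed price series
    hedge_ratio = KFHedgeRatio(prices[x], prices[y])  # dynamic hedge ratio estimate
    print(y, "vs", x, "latest hedge ratio:", hedge_ratio[-1])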