bbstrader 0.1.94__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


bbstrader/tseries.py CHANGED
@@ -50,7 +51,8 @@ __all__ = [
     "run_kalman_filter",
     "ArimaGarchModel",
     "KalmanFilterModel",
-    "OrnsteinUhlenbeckModel"
+    "OrnsteinUhlenbeckModel",
+
 ]
 
 # *******************************************
@@ -499,7 +500,7 @@ def get_corr(tickers: Union[List[str], Tuple[str, ...]], start: str, end: str) -
     >>> get_corr(['AAPL', 'MSFT', 'GOOG'], '2023-01-01', '2023-12-31')
     """
     # Download historical data
-    data = yf.download(tickers, start=start, end=end)['Adj Close']
+    data = yf.download(tickers, start=start, end=end, multi_level_index=False)['Adj Close']
 
     # Calculate correlation matrix
     correlation_matrix = data.corr()
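
The new multi_level_index=False argument addresses a yfinance behavior change: recent yfinance releases return MultiIndex (field, ticker) columns by default, which breaks flat indexing such as df['Adj Close']. A minimal sketch of the fix, assuming an installed yfinance that supports the flag and still returns an 'Adj Close' column:

import yfinance as yf

# With MultiIndex columns (the newer default), df['Adj Close'] yields a
# DataFrame keyed by ticker; multi_level_index=False restores flat columns
# so the single-label indexing below keeps working.
df = yf.download('AAPL', start='2023-01-01', end='2023-12-31',
                 progress=False, multi_level_index=False)
adj_close = df['Adj Close']
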
@@ -643,8 +644,8 @@ def run_cadf_test(pair: Union[List[str], Tuple[str, ...]], start: str, end: str)
     """
     # Download historical data for required stocks
     p0, p1 = pair[0], pair[1]
-    _p0 = yf.download(p0, start=start, end=end)
-    _p1 = yf.download(p1, start=start, end=end)
+    _p0 = yf.download(p0, start=start, end=end, progress=False, multi_level_index=False)
+    _p1 = yf.download(p1, start=start, end=end, progress=False, multi_level_index=False)
     df = pd.DataFrame(index=_p0.index)
     df[p0] = _p0["Adj Close"]
     df[p1] = _p1["Adj Close"]
@@ -726,7 +727,7 @@ def run_hurst_test(symbol: str, start: str, end: str):
 
     >>> run_hurst_test('AAPL', '2023-01-01', '2023-12-31')
     """
-    data = yf.download(symbol, start=start, end=end)
+    data = yf.download(symbol, start=start, end=end, progress=False, multi_level_index=False)
 
     # Create a Geometric Brownian Motion, Mean-Reverting, and Trending Series
     gbm = np.log(np.cumsum(np.random.randn(100000))+1000)
@@ -743,7 +744,7 @@ def run_hurst_test(symbol: str, start: str, end: str):
 def test_cointegration(ticker1, ticker2, start, end):
     # Download historical data
     stock_data_pair = yf.download(
-        [ticker1, ticker2], start=start, end=end
+        [ticker1, ticker2], start=start, end=end, progress=False, multi_level_index=False
     )['Adj Close'].dropna()
 
     # Perform Johansen cointegration test
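
The Johansen step itself is elided from this hunk; a hedged sketch of the statsmodels call that test_cointegration presumably wraps (coint_johansen from statsmodels.tsa.vector_ar.vecm), run on synthetic cointegrated data:

import numpy as np
import pandas as pd
from statsmodels.tsa.vector_ar.vecm import coint_johansen

rng = np.random.default_rng(0)
common = np.cumsum(rng.standard_normal(500))          # shared stochastic trend
pair = pd.DataFrame({'A': common + 0.1 * rng.standard_normal(500),
                     'B': common + 0.1 * rng.standard_normal(500)})

result = coint_johansen(pair, det_order=0, k_ar_diff=1)
print(result.lr1)   # trace statistics for r = 0 and r <= 1
print(result.cvt)   # 90% / 95% / 99% critical values
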
@@ -865,16 +866,16 @@ def run_kalman_filter(
         etfs: Union[List[str], Tuple[str, ...]],
         start: str, end: str) -> None:
     """
-    Applies a Kalman filter to a pair of ETF adjusted closing prices within a specified date range
+    Applies a Kalman filter to a pair of assets adjusted closing prices within a specified date range
     to estimate the slope and intercept over time.
 
-    The function downloads historical adjusted closing prices for the specified pair of ETFs,
+    The function downloads historical adjusted closing prices for the specified pair of assets,
     visualizes their price relationship, calculates the Kalman filter estimates for the slope and
     intercept, and visualizes the changes in these estimates over time.
 
     Args:
         etfs (Union[List[str] , Tuple[str, ...]]):
-            A list or tuple containing two valid ETF tickers (e.g., ['SPY', 'QQQ']).
+            A list or tuple containing two valid assets tickers (e.g., ['SPY', 'QQQ']).
         start (str): The start date for the historical data in 'YYYY-MM-DD' format.
         end (str): The end date for the historical data in 'YYYY-MM-DD' format.
 
@@ -883,8 +884,8 @@ def run_kalman_filter(
 
     >>> run_kalman_filter(['SPY', 'QQQ'], '2023-01-01', '2023-12-31')
     """
-    etf_df1 = yf.download(etfs[0], start, end)
-    etf_df2 = yf.download(etfs[1], start, end)
+    etf_df1 = yf.download(etfs[0], start, end, progress=False, multi_level_index=False)
+    etf_df2 = yf.download(etfs[1], start, end, progress=False, multi_level_index=False)
 
     prices = pd.DataFrame(index=etf_df1.index)
     prices[etfs[0]] = etf_df1["Adj Close"]
@@ -919,13 +920,14 @@ class KalmanFilterModel():
         """
         self.tickers = tickers
         assert self.tickers is not None
+
+        self.R = None
+        self.theta = np.zeros(2)
+        self.P = np.zeros((2, 2))
+        self.delta = kwargs.get("delta", 1e-4)
+        self.vt = kwargs.get("vt", 1e-3)
+        self.wt = self.delta/(1-self.delta) * np.eye(2)
         self.latest_prices = np.array([-1.0, -1.0])
-        self.delta = kwargs.get("delta", 1e-4)
-        self.wt = self.delta/(1-self.delta) * np.eye(2)
-        self.vt = kwargs.get("vt", 1e-3)
-        self.theta = np.zeros(2)
-        self.P = np.zeros((2, 2))
-        self.R = None
         self.kf = self._init_kalman()
 
     def _init_kalman(self):
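
The reordered initializer now reads delta and vt before deriving the transition covariance wt = delta / (1 - delta) * I. A hedged construction sketch (the constructor's exact signature beyond tickers and these kwargs is an assumption):

from bbstrader.tseries import KalmanFilterModel

# delta controls how quickly the filtered slope/intercept may drift
# (larger delta => larger wt => faster-adapting estimates);
# vt is the observation noise variance.
model = KalmanFilterModel(('SPY', 'QQQ'), delta=1e-4, vt=1e-3)
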
@@ -958,25 +960,24 @@ class KalmanFilterModel():
         Returns:
             A tuple containing the slope and intercept of the relationship
         """
-        kf = self.kf
-        kf.H = np.array([[prices[1], 1.0]])
-        kf.predict()
-        kf.update(prices[0])
-        slope = kf.x.copy().flatten()[0]
-        intercept = kf.x.copy().flatten()[1]
+        self.kf.H = np.array([[prices[1], 1.0]])
+        self.kf.predict()
+        self.kf.update(prices[1])
+        slope = self.kf.x.copy().flatten()[0]
+        intercept = self.kf.x.copy().flatten()[1]
 
         return slope, intercept
 
     def calculate_etqt(self, prices: Array) -> Tuple:
         """
-        Calculates the forecast error and standard deviation of the predictions
+        Calculates the ``forecast error`` and ``standard deviation`` of the predictions
        using the Kalman Filter.
 
        Args:
            prices : A numpy array of prices for two financial instruments.
 
        Returns:
-            A tuple containing the forecast error and standard deviation of the predictions.
+            A tuple containing the ``forecast error`` and ``standard deviation`` of the predictions.
        """
 
        self.latest_prices[0] = prices[0]
@@ -1021,7 +1022,7 @@ class KalmanFilterModel():
             At = self.R.dot(F.T) / Qt
             self.theta = self.theta + At.flatten() * et
             self.C = self.R - At * F.dot(self.R)
-            return (et, sqrt_Qt)
+            return (et[0], sqrt_Qt.flatten()[0])
         else:
             return None
 
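
Returning (et[0], sqrt_Qt.flatten()[0]) means callers now receive plain scalars instead of one-element arrays. A hedged sketch of a downstream consumer (a hypothetical helper, not the library's own trading logic) that thresholds the forecast error against the prediction standard deviation:

import numpy as np

def kalman_signal(model, prices: np.ndarray) -> str:
    """Hypothetical helper: map (forecast error, sqrt(Q_t)) to a signal."""
    result = model.calculate_etqt(prices)
    if result is None:
        return 'HOLD'
    et, sqrt_qt = result            # plain floats as of this release
    if et >= sqrt_qt:               # spread above its predicted band
        return 'SHORT_SPREAD'
    if et <= -sqrt_qt:              # spread below its predicted band
        return 'LONG_SPREAD'
    return 'HOLD'
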
@@ -1190,6 +1191,33 @@ class OrnsteinUhlenbeck():
 
 
 def remove_correlated_assets(df: pd.DataFrame, cutoff=.99):
+    """
+    Removes highly correlated assets from a DataFrame based on a specified correlation cutoff threshold.
+    This is useful in financial data analysis to reduce redundancy and multicollinearity in portfolios or datasets.
+
+    Args:
+        df (pd.DataFrame): A DataFrame where each column represents an asset
+            and rows represent observations (e.g., time-series data).
+        cutoff (float, optional, default=0.99): The correlation threshold.
+            Columns with absolute correlation greater than this value will be considered for removal.
+
+    Returns:
+        pd.DataFrame: A DataFrame with less correlated assets.
+            The columns that are highly correlated (above the cutoff) are removed.
+
+    References
+    ----------
+    Stefan Jansen (2020). Machine Learning for Algorithmic Trading - Second Edition.
+    chapter 9, Time-Series Models for Volatility Forecasts and Statistical Arbitrage.
+
+    Example:
+        >>> df = pd.DataFrame({
+        ...     'AAPL': [100, 101, 102, 103, 104],
+        ...     'MSFT': [200, 201, 202, 203, 204],
+        ...     'GOOG': [300, 301, 302, 303, 304]
+        ... })
+        >>> df = remove_correlated_assets(df)
+    """
     corr = df.corr().stack()
     corr = corr[corr < 1]
     to_check = corr[corr.abs() > cutoff].index
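
The visible body works on the stacked pairwise correlation matrix; a small self-contained sketch of that idiom on synthetic data:

import numpy as np
import pandas as pd

rng = np.random.default_rng(1)
base = rng.standard_normal(250).cumsum()
df = pd.DataFrame({'A': base,
                   'B': base + 0.01 * rng.standard_normal(250),   # near-duplicate of A
                   'C': rng.standard_normal(250).cumsum()})

corr = df.corr().stack()                 # pairwise correlations as a Series
corr = corr[corr < 1]                    # drop the diagonal self-correlations
to_check = corr[corr.abs() > .99].index  # e.g. [('A', 'B'), ('B', 'A')]
print(to_check.tolist())
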
@@ -1208,6 +1236,32 @@ def remove_correlated_assets(df: pd.DataFrame, cutoff=.99):
 
 
 def check_stationarity(df: pd.DataFrame):
+    """
+    Tests the stationarity of time-series data for each asset in the DataFrame
+    using the Augmented Dickey-Fuller (ADF) test. Stationarity is a key property
+    in time-series analysis, and non-stationary data can affect model performance.
+
+    Args:
+        df (pd.DataFrame): A DataFrame where each column represents a time series of an asset.
+
+    Returns:
+        pd.DataFrame: A DataFrame containing the ADF p-values for each asset,
+            - ticker Asset name (column name from df).
+            - adf p-value from the ADF test, indicating the probability of the null hypothesis (data is non-stationary).
+
+    References
+    ----------
+    Stefan Jansen (2020). Machine Learning for Algorithmic Trading - Second Edition.
+    chapter 9, Time-Series Models for Volatility Forecasts and Statistical Arbitrage.
+
+    Example:
+        >>> df = pd.DataFrame({
+        ...     'AAPL': [100, 101, 102, 103, 104],
+        ...     'MSFT': [200, 201, 202, 203, 204],
+        ...     'GOOG': [300, 301, 302, 303, 304]
+        ... })
+        >>> df = check_stationarity(df)
+    """
     results = []
     for ticker, prices in df.items():
         results.append([ticker, adfuller(prices, regression='ct')[1]])
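
For reference, the loop above reduces to a comprehension around statsmodels' adfuller with regression='ct' (constant plus trend); a runnable sketch on synthetic series:

import numpy as np
import pandas as pd
from statsmodels.tsa.stattools import adfuller

rng = np.random.default_rng(2)
df = pd.DataFrame({'random_walk': np.cumsum(rng.standard_normal(300)),
                   'white_noise': rng.standard_normal(300)})

results = pd.DataFrame(
    [[ticker, adfuller(prices, regression='ct')[1]] for ticker, prices in df.items()],
    columns=['ticker', 'adf'])
print(results)   # low p-value => evidence against the unit-root null
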
@@ -1215,12 +1269,66 @@ def check_stationarity(df: pd.DataFrame):
 
 
 def remove_stationary_assets(df: pd.DataFrame, pval=.05):
+    """
+    Filters out stationary assets from the DataFrame based on the p-value obtained
+    from the Augmented Dickey-Fuller test.
+    Useful for focusing only on non-stationary time-series data.
+
+    Args:
+        df (pd.DataFrame): A DataFrame where each column represents a time series of an asset.
+        pval (float, optional, default=0.05): The significance level to determine stationarity.
+            Columns with an ADF test p-value below this threshold are considered stationary and removed.
+
+    Returns:
+        pd.DataFrame: A DataFrame containing only the non-stationary assets.
+
+    References
+    ----------
+    Stefan Jansen (2020). Machine Learning for Algorithmic Trading - Second Edition.
+    chapter 9, Time-Series Models for Volatility Forecasts and Statistical Arbitrage.
+
+    Example:
+        >>> df = pd.DataFrame({
+        ...     'AAPL': [100, 101, 102, 103, 104],
+        ...     'MSFT': [200, 201, 202, 203, 204],
+        ...     'GOOG': [300, 301, 302, 303, 304]
+        ... })
+        >>> df = remove_stationary_assets(df)
+    """
     test_result = check_stationarity(df)
     stationary = test_result.loc[test_result.adf <= pval, 'ticker'].tolist()
     return df.drop(stationary, axis=1).sort_index()
 
 
-def select_assets(df: pd.DataFrame, n=100, start=None, end=None):
+def select_assets(df: pd.DataFrame, n=100, start=None, end=None, rolling_window=None):
+    """
+    Selects the top N assets based on the average trading volume from the input DataFrame.
+    These assets are used as universe in which we can search cointegrated pairs for pairs trading strategies.
+
+    Args:
+        df (pd.DataFrame): A multi-index DataFrame with levels ['ticker', 'date'] containing market data.
+            Must include columns 'close' (price) and 'volume'.
+        n (int, optional): Number of assets to select based on highest average trading volume. Default is 100.
+        start (str, optional): Start date for filtering the data. Default is the earliest date in the DataFrame.
+        end (str, optional): End date for filtering the data. Default is the latest date in the DataFrame.
+        rolling_window (int, optional): Rolling window for calculating the average trading volume. Default is None.
+
+    Returns:
+        pd.DataFrame: A DataFrame of selected assets with filtered, cleaned data, indexed by date.
+
+    References
+    ----------
+    Stefan Jansen (2020). Machine Learning for Algorithmic Trading - Second Edition.
+    chapter 9, Time-Series Models for Volatility Forecasts and Statistical Arbitrage.
+    """
+    required_columns = {'close', 'volume'}
+    if not required_columns.issubset(df.columns):
+        raise ValueError(f"Input DataFrame must contain {required_columns}, but got {df.columns.tolist()}.")
+
+    if not isinstance(df.index, pd.MultiIndex) or 'ticker' not in df.index.names or 'date' not in df.index.names:
+        raise ValueError("Index must be a MultiIndex with levels ['ticker', 'date'].")
+
+    df = df.copy()
     idx = pd.IndexSlice
     start = start or df.index.get_level_values('date').min()
     end = end or df.index.get_level_values('date').max()
@@ -1229,23 +1337,63 @@ def select_assets(df: pd.DataFrame, n=100, start=None, end=None):
           .sort_index()
           .loc[idx[:, f'{start}':f'{end}'], :]
           .assign(dv=lambda df: df.close.mul(df.volume)))
-
-    # select n assets with the highest average trading volume
-    # we are taking a shortcut to simplify; should select
-    # based on historical only, e.g. yearly rolling avg
-    most_traded = (df.groupby(level='ticker')
-                   .dv.mean()
-                   .nlargest(n=n).index)
-
+
+    if rolling_window is None:
+        most_traded = (df.groupby(level='ticker')
+                       .dv.mean()
+                       .nlargest(n=n).index)
+    else:
+        # Calculate the rolling average of dollar volume
+        df['dv_rolling_avg'] = (
+            df.groupby(level=0)
+            .dv
+            .rolling(window=rolling_window, min_periods=1)
+            .mean()
+            .reset_index(level=0, drop=True)
+        )
+        most_traded = (
+            df.groupby(level=0)['dv_rolling_avg']
+            .mean()
+            .nlargest(n=n)
+            .index
+        )
     df = (df.loc[idx[most_traded, :], 'close']
           .unstack('ticker')
-          .ffill(limit=5)  # fill up to five values
-          .dropna(axis=1))  # remove assets with any missing values
-
+          .ffill(limit=5)
+          .dropna(axis=1))
     df = remove_correlated_assets(df)
-    return remove_stationary_assets(df).sort_index()
+    df = remove_stationary_assets(df)
+    return df.sort_index()
+
 
 def compute_pair_metrics(security: pd.Series, candidates: pd.DataFrame):
+    """
+    Calculates statistical and econometric metrics for a target security and a set of candidate securities.
+    These metrics are useful in financial modeling and pairs trading strategies,
+    providing information about drift, volatility, correlation, and cointegration.
+
+    Args:
+        security (pd.Series): A time-series of the target security's prices.
+            The name of the Series should correspond to the security's identifier (e.g., ticker symbol).
+        candidates (pd.DataFrame): A DataFrame where each column represents a time-series of prices
+            for candidate securities to be evaluated against the target security.
+
+    Returns:
+        pd.DataFrame: A DataFrame combining:
+            Drift: Estimated drift of spreads between the target security and each candidate.
+            Volatility: Standard deviation of spreads.
+            Correlation:
+                ``corr``: Correlation of normalized prices between the target and each candidate.
+                ``corr_ret``: Correlation of returns (percentage change) between the target and each candidate.
+            Cointegration metrics:
+                Engle-Granger test statistics (``t1``, ``t2``) and p-values (``p1``, ``p2``).
+                Johansen test trace statistics (``trace0``, ``trace1``) and selected lag order (``k_ar_diff``).
+
+    References
+    ----------
+    Stefan Jansen (2020). Machine Learning for Algorithmic Trading - Second Edition.
+    chapter 9, Time-Series Models for Volatility Forecasts and Statistical Arbitrage.
+    """
     security = security.div(security.iloc[0])
     ticker = security.name
     candidates = candidates.div(candidates.iloc[0])
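
A hedged usage sketch for the new rolling_window option, building the MultiIndex ['ticker', 'date'] layout the function now validates (synthetic data; on real prices the correlation and stationarity filters may drop further columns, and the import assumes select_assets is exposed at module level):

import numpy as np
import pandas as pd
from bbstrader.tseries import select_assets   # assumed module-level export

dates = pd.date_range('2022-01-03', periods=250, freq='B')
tickers = ['AAA', 'BBB', 'CCC', 'DDD']
idx = pd.MultiIndex.from_product([tickers, dates], names=['ticker', 'date'])

rng = np.random.default_rng(3)
data = pd.DataFrame({'close': rng.uniform(90, 110, len(idx)),
                     'volume': rng.uniform(1e5, 1e6, len(idx))}, index=idx)

# Rank by a 60-day rolling average of dollar volume instead of the full-sample mean.
universe = select_assets(data, n=2, rolling_window=60)
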
@@ -1261,7 +1409,7 @@ def compute_pair_metrics(security: pd.Series, candidates: pd.DataFrame):
     # compute volatility
     vol = spreads.std().to_frame('vol')
 
-    # return correlation
+    # returns correlation
     corr_ret = (candidates.pct_change()
                 .corrwith(security.pct_change())
                 .to_frame('corr_ret'))
@@ -1288,22 +1436,90 @@ def compute_pair_metrics(security: pd.Series, candidates: pd.DataFrame):
     tests = pd.DataFrame(tests, columns=columns).set_index('s2')
     return metrics.join(tests)
 
-CRITICAL_VALUES = {
+
+__CRITICAL_VALUES = {
     0: {.9: 13.4294, .95: 15.4943, .99: 19.9349},
     1: {.9: 2.7055, .95: 3.8415, .99: 6.6349}
 }
 
+
 def find_cointegrated_pairs(securities: pd.DataFrame, candidates: pd.DataFrame,
-                            n=None, start=None, stop=None):
-    trace0_cv = CRITICAL_VALUES[0][.95]  # critical value for 0 cointegration relationships
-    trace1_cv = CRITICAL_VALUES[1][.95]  # critical value for 1 cointegration relationship
+                            n=None, start=None, stop=None, coint=False):
+
+    """
+    Identifies cointegrated pairs between a target set of securities and candidate securities
+    based on econometric tests. The function evaluates statistical relationships,
+    such as cointegration and Engle-Granger significance, to determine pairs suitable
+    for financial strategies like pairs trading.
+
+    Args:
+        securities (`pd.DataFrame`): A DataFrame where each column represents the time-series
+            prices of target securities to evaluate.
+        candidates (`pd.DataFrame`): A DataFrame where each column represents the time-series
+            prices of candidate securities to compare against the target securities.
+        n (`int`, optional): The number of top pairs to return. If `None`, returns all pairs.
+        start (`str`, optional): Start date for slicing the data (e.g., 'YYYY-MM-DD').
+        stop (`str`, optional): End date for slicing the data (e.g., 'YYYY-MM-DD').
+        coint (`bool`, optional, default=False):
+            - If `True`, filters for pairs identified as cointegrated.
+            - If `False`, returns all evaluated pairs.
+
+    Returns:
+        - ``pd.DataFrame``: A DataFrame containing:
+            - Johansen and Engle-Granger cointegration metrics:
+                - `t1`, `t2`: Engle-Granger test statistics for two directions.
+                - `p1`, `p2`: Engle-Granger p-values for two directions.
+                - `trace0`, `trace1`: Johansen test trace statistics for 0 and 1 cointegration relationships.
+            - Indicators and filters:
+                - `joh_sig`: Indicates Johansen cointegration significance.
+                - `eg_sig`: Indicates Engle-Granger significance (p-value < 0.05).
+                - `s1_dep`: Indicates whether the first series depends on the second (based on p-values).
+                - `coint`: Combined cointegration indicator (Johansen & Engle-Granger).
+            - Spread and ranking:
+                - `t`: Minimum of `t1` and `t2`.
+                - `p`: Minimum of `p1` and `p2`.
+    References
+    ----------
+    Stefan Jansen (2020). Machine Learning for Algorithmic Trading - Second Edition.
+    chapter 9, Time-Series Models for Volatility Forecasts and Statistical Arbitrage.
+
+    Example:
+        >>> import pandas as pd
+
+        >>> # Sample Data
+        >>> data_securities = {
+        ...     'Security1': [100, 102, 101, 103, 105],
+        ...     'Security2': [50, 52, 53, 51, 54]
+        ... }
+        >>> data_candidates = {
+        ...     'Candidate1': [100, 101, 99, 102, 104],
+        ...     'Candidate2': [200, 202, 201, 203, 205]
+        ... }

+        >>> securities = pd.DataFrame(data_securities, index=pd.date_range('2023-01-01', periods=5))
+        >>> candidates = pd.DataFrame(data_candidates, index=pd.date_range('2023-01-01', periods=5))
+
+        >>> # Find cointegrated pairs
+        >>> top_pairs = find_cointegrated_pairs(securities, candidates, n=2, coint=True)
+        >>> print(top_pairs)
+
+        >>> | s1       | s2        | t    | p     | joh_sig | eg_sig | coint |
+        >>> |----------|-----------|------|-------|---------|--------|-------|
+        >>> | Security1| Candidate1| -3.5 | 0.01  | 1       | 1      | 1     |
+        >>> | Security2| Candidate2| -2.9 | 0.04  | 1       | 1      | 1     |
+    """
+    trace0_cv = __CRITICAL_VALUES[0][.95]  # critical value for 0 cointegration relationships
+    trace1_cv = __CRITICAL_VALUES[1][.95]  # critical value for 1 cointegration relationship
     spreads = []
     if start is not None and stop is not None:
         securities = securities.loc[str(start): str(stop), :]
         candidates = candidates.loc[str(start): str(stop), :]
     for i, (ticker, prices) in enumerate(securities.items(), 1):
-        df = compute_pair_metrics(prices, candidates)
-        spreads.append(df.set_index('s1', append=True))
+        try:
+            df = compute_pair_metrics(prices, candidates)
+            spreads.append(df.set_index('s1', append=True))
+        except np.linalg.LinAlgError:
+            continue
     spreads = pd.concat(spreads)
     spreads.index.names = ['s2', 's1']
     spreads = spreads.swaplevel()
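
The t1/t2 and p1/p2 columns summarized in the docstring come from running the Engle-Granger test in both directions; a hedged sketch with statsmodels (which compute_pair_metrics appears to rely on), on synthetic cointegrated series:

import numpy as np
from statsmodels.tsa.stattools import coint

rng = np.random.default_rng(4)
x = np.cumsum(rng.standard_normal(500))
y = 0.8 * x + 0.1 * rng.standard_normal(500)

t1, p1, _ = coint(y, x)   # y regressed on x
t2, p2, _ = coint(x, y)   # x regressed on y
print(f"t={min(t1, t2):.2f}, p={min(p1, p2):.4f}")   # ranking stats used above
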
@@ -1315,18 +1531,69 @@ def find_cointegrated_pairs(securities: pd.DataFrame, candidates: pd.DataFrame,
     spreads['s1_dep'] = spreads.p1 < spreads.p2
     spreads['coint'] = (spreads.joh_sig & spreads.eg_sig).astype(int)
     # select top n pairs
-    if n is not None:
-        top_pairs = (spreads.query('coint == 1')
-                     .sort_values('t', ascending=False)
-                     .head(n))
+    if coint:
+        if n is not None:
+            top_pairs = (spreads.query('coint == 1')
+                         .sort_values('t', ascending=False)
+                         .head(n))
+        else:
+            top_pairs = (spreads.query('coint == 1')
+                         .sort_values('t', ascending=False))
     else:
-        top_pairs = spreads.query('coint == 1')
+        if n is not None:
+            top_pairs = (spreads
+                         .sort_values('t', ascending=False)
+                         .head(n))
+        else:
+            top_pairs = (spreads
+                         .sort_values('t', ascending=False))
     return top_pairs
 
-def analyze_cointegrated_pairs(spreads: pd.DataFrame, plot_coint=False, cosstab=False,
+
+def analyze_cointegrated_pairs(spreads: pd.DataFrame, plot_coint=True, crosstab=False,
                                heuristics=False, log_reg=False, decis_tree=False):
+    """
+    Analyzes cointegrated pairs by visualizing, summarizing, and applying predictive models.
+
+    Args:
+        spreads (pd.DataFrame):
+            A DataFrame containing cointegration metrics and characteristics.
+            Required columns: 'coint', 't', 'trace0', 'trace1', 'drift', 'vol', 'corr', 'corr_ret', 'eg_sig', 'joh_sig'.
+        plot_coint (bool, optional):
+            If True, generates scatterplots and boxplots to visualize cointegration characteristics.
+        cosstab (bool, optional):
+            If True, displays crosstabulations of Engle-Granger and Johansen test significance.
+        heuristics (bool, optional):
+            If True, prints descriptive statistics for drift, volatility, and correlation grouped by cointegration status.
+        log_reg (bool, optional):
+            If True, fits a logistic regression model to predict cointegration and evaluates its performance.
+        decis_tree (bool, optional):
+            If True, fits a decision tree model to predict cointegration and evaluates its performance.
+
+    References
+    ----------
+    Stefan Jansen (2020). Machine Learning for Algorithmic Trading - Second Edition.
+    chapter 9, Time-Series Models for Volatility Forecasts and Statistical Arbitrage.
+
+    Example:
+        >>> import pandas as pd
+        >>> from bbstrader.tseries import find_cointegrated_pairs, analyze_cointegrated_pairs
+
+        >>> # Sample Data
+        >>> securities = pd.DataFrame({
+        ...     'SPY': [100, 102, 101, 103, 105],
+        ...     'QQQ': [50, 52, 53, 51, 54]
+        ... })
+        >>> candidates = pd.DataFrame({
+        ...     'AAPL': [100, 101, 99, 102, 104],
+        ...     'MSFT': [200, 202, 201, 203, 205]
+        ... })
+
+        >>> pairs = find_cointegrated_pairs(securities, candidates, n=2, coint=True)
+        >>> analyze_cointegrated_pairs(pairs, plot_coint=True, cosstab=True, heuristics=True, log_reg=True, decis_tree=True
+    """
     if plot_coint:
-        trace0_cv = CRITICAL_VALUES[0][.95]
+        trace0_cv = __CRITICAL_VALUES[0][.95]
         spreads = spreads.reset_index()
         sns.scatterplot(x=np.log1p(spreads.t.abs()),
                         y=np.log1p(spreads.trace1),
@@ -1335,11 +1602,13 @@ def analyze_cointegrated_pairs(spreads: pd.DataFrame, plot_coint=False, cosstab=
     for i, heuristic in enumerate(['drift', 'vol', 'corr', 'corr_ret']):
         sns.boxplot(x='coint', y=heuristic, data=spreads, ax=axes[i])
     fig.tight_layout();
+
     if heuristics:
         spreads = spreads.reset_index()
         h = spreads.groupby(spreads.coint)[
             ['drift', 'vol', 'corr']].describe().stack(level=0).swaplevel().sort_index()
         print(h)
+
     if log_reg:
         y = spreads.coint
         X = spreads[['drift', 'vol', 'corr', 'corr_ret']]
@@ -1353,6 +1622,7 @@ def analyze_cointegrated_pairs(spreads: pd.DataFrame, plot_coint=False, cosstab=
         res = f'C:{np.log10(scores.idxmax()):.2f}, AUC: {scores.max():.2%}'
         print(res)
         print(log_reg.coef_)
+
     if decis_tree:
         model = DecisionTreeClassifier(class_weight='balanced')
         decision_tree = GridSearchCV(model,
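
The log_reg branch searches a regularization path and reports the best C (as log10) with its AUC; a hedged reconstruction of that idea (the exact estimator settings and grid are assumptions, not the released implementation):

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

def score_coint_classifier(spreads: pd.DataFrame) -> pd.Series:
    """Hypothetical helper: AUC of simple heuristics as predictors of cointegration."""
    y = spreads.coint
    X = spreads[['drift', 'vol', 'corr', 'corr_ret']]
    scores = {}
    for C in np.logspace(-5, 5, 11):
        model = LogisticRegression(C=C, class_weight='balanced', max_iter=1000)
        scores[C] = cross_val_score(model, X, y, scoring='roc_auc', cv=5).mean()
    return pd.Series(scores)   # scores.idxmax() is the best regularization C
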
@@ -1364,22 +1634,77 @@ def analyze_cointegrated_pairs(spreads: pd.DataFrame, plot_coint=False, cosstab=
         decision_tree.fit(X, y)
         res = f'{decision_tree.best_score_:.2%}, Depth: {decision_tree.best_params_["max_depth"]}'
         print(res)
-    if cosstab:
+
+    if crosstab:
         pd.set_option('display.float_format', lambda x: f'{x:.2%}')
         print(pd.crosstab(spreads.eg_sig, spreads.joh_sig))
         print(pd.crosstab(spreads.eg_sig, spreads.joh_sig, normalize=True))
 
 
-def select_candidate_pairs(pairs: pd.DataFrame):
+def select_candidate_pairs(pairs: pd.DataFrame, period=False):
+    """
+    Select candidate pairs from a DataFrame based on cointegration status.
+
+    This function filters the input DataFrame to select pairs where the 'coint' column equals 1,
+    indicating cointegration. It then determines the dependent and independent series for each pair
+    and returns the selected pairs in a dictionary format.
+
+    Args:
+        pairs (pd.DataFrame): A DataFrame containing pairs of time series with columns 'coint', 's1', 's2', and 's1_dep'.
+        period (bool, optional): If True, includes the 'period' column in the output. Defaults to False.
+
+    Returns:
+        list[dict]: A list of dictionaries, each containing the keys 'x' and 'y' (and optionally 'period') representing the selected pairs.
+
+    References
+    ----------
+    Stefan Jansen (2020). Machine Learning for Algorithmic Trading - Second Edition.
+    chapter 9, Time-Series Models for Volatility Forecasts and Statistical Arbitrage.
+    """
     candidates = pairs.query('coint == 1').copy()
-    candidates['y'] = candidates.apply(lambda x: x.s1 if x.s1_dep else x.s2, axis=1)
-    candidates['x'] = candidates.apply(lambda x: x.s2 if x.s1_dep else x.s1, axis=1)
-    candidates.drop(['s1_dep', 's1', 's2'], axis=1)
+    candidates = candidates.reset_index()
+    candidates['y'] = candidates.apply(lambda x: x['s1'] if x.s1_dep else x['s2'], axis=1)
+    candidates['x'] = candidates.apply(lambda x: x['s2'] if x.s1_dep else x['s1'], axis=1)
+    if period:
+        return candidates[['x', 'y', 'period']].to_dict(orient='records')
     return candidates[['x', 'y']].to_dict(orient='records')
 
 
-def KFSmoother(self, prices: pd.Series | np.ndarray) -> pd.Series | np.ndarray:
-    """Estimate rolling mean using Kalman Smoothing."""
+def KFSmoother(prices: pd.Series | np.ndarray) -> pd.Series | np.ndarray:
+    """
+    Estimate rolling mean using Kalman Smoothing.
+
+    Args:
+        prices : pd.Series or np.ndarray
+            The input time series data to be smoothed. It must be either a pandas Series or a numpy array.
+
+    Returns:
+        pd.Series or np.ndarray
+            The smoothed time series data. If the input is a pandas Series, the output will also be a pandas Series with the same index.
+            If the input is a numpy array, the output will be a numpy array.
+
+    References
+    ----------
+    Stefan Jansen (2020). Machine Learning for Algorithmic Trading - Second Edition.
+    chapter 9, Time-Series Models for Volatility Forecasts and Statistical Arbitrage.
+
+    Examples
+    --------
+    >>> import yfinance as yf
+    >>> prices = yf.download('AAPL', start='2020-01-01', end='2021-01-01', multi_level_index=False)['Adj Close']
+    >>> prices = KFSmoother(prices)
+    >>> print(prices[:5])
+    Date
+    2020-01-02 00:00:00+00:00    36.39801407
+    2020-01-03 00:00:00+00:00    49.06231000
+    2020-01-06 00:00:00+00:00    55.86334436
+    2020-01-07 00:00:00+00:00    60.02240894
+    2020-01-08 00:00:00+00:00    63.15057948
+    dtype: float64
+
+    """
+    if not isinstance(prices, (np.ndarray, pd.Series)):
+        raise ValueError("Input must be either a numpy array or a pandas Series.")
     kf = PyKalmanFilter(
         transition_matrices=np.eye(1),
         observation_matrices=np.eye(1),
@@ -1396,8 +1721,31 @@ def KFSmoother(self, prices: pd.Series | np.ndarray) -> pd.Series | np.ndarray:
     return state_means.flatten()
 
 
-def KFHedgeRatio(self, x: pd.Series, y: pd.Series) -> np.ndarray:
-    """Estimate Hedge Ratio using Kalman Filter."""
+def KFHedgeRatio(x: pd.Series | np.ndarray, y: pd.Series | np.ndarray) -> np.ndarray:
+    """
+    Estimate Hedge Ratio using Kalman Filter.
+    Args:
+        x : pd.Series or np.ndarray
+            The independent variable, which can be either a pandas Series or a numpy array.
+        y : pd.Series or np.ndarray
+            The dependent variable, which can be either a pandas Series or a numpy array.
+
+    Returns:
+        np.ndarray
+            The estimated hedge ratio as a numpy array.
+
+    The function returns the negative of the first state variable of each Kalman Filter estimate,
+    which represents the estimated hedge ratio.
+
+    References
+    ----------
+    Stefan Jansen (2020). Machine Learning for Algorithmic Trading - Second Edition.
+    chapter 9, Time-Series Models for Volatility Forecasts and Statistical Arbitrage.
+    """
+    if (not isinstance(x, (np.ndarray, pd.Series))
+            or not isinstance(y, (np.ndarray, pd.Series))):
+        raise ValueError("Both x and y must be either a numpy array or a pandas Series.")
+
     delta = 1e-3
     trans_cov = delta / (1 - delta) * np.eye(2)
     obs_mat = np.expand_dims(np.vstack([[x], [np.ones(len(x))]]).T, axis=1)
@@ -1411,7 +1759,8 @@ def KFHedgeRatio(self, x: pd.Series, y: pd.Series) -> np.ndarray:
         observation_covariance=2,
         transition_covariance=trans_cov
     )
-    state_means, _ = kf.filter(y.values)
+    y = y.values if isinstance(y, pd.Series) else y
+    state_means, _ = kf.filter(y)
     # Indexing with [:, 0] in state_means[:, 0] extracts only the first state variable of
     # each Kalman Filter estimate, which is the estimated hedge ratio.
     return -state_means[:, 0]
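
A hedged end-to-end sketch combining the two module-level helpers on synthetic data: estimate a dynamic hedge ratio, form the spread, and smooth it. Since KFHedgeRatio returns the negated slope state, adding it to y subtracts beta_t * x; the import assumes both helpers are exposed at module level.

import numpy as np
import pandas as pd
from bbstrader.tseries import KFHedgeRatio, KFSmoother   # assumed module-level exports

rng = np.random.default_rng(5)
x = pd.Series(100 + np.cumsum(rng.standard_normal(500)))
y = 1.5 * x + rng.standard_normal(500)

hedge_ratio = KFHedgeRatio(x, y)   # array of -beta_t estimates, one per observation
spread = y + hedge_ratio * x       # y - beta_t * x
smoothed = KFSmoother(spread)      # Kalman-smoothed rolling mean of the spread
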