bbstrader 0.1.94__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of bbstrader might be problematic.

Files changed (38)
  1. bbstrader/__init__.py +9 -9
  2. bbstrader/btengine/__init__.py +7 -7
  3. bbstrader/btengine/backtest.py +30 -26
  4. bbstrader/btengine/data.py +100 -79
  5. bbstrader/btengine/event.py +2 -1
  6. bbstrader/btengine/execution.py +18 -16
  7. bbstrader/btengine/performance.py +11 -7
  8. bbstrader/btengine/portfolio.py +35 -36
  9. bbstrader/btengine/strategy.py +119 -94
  10. bbstrader/config.py +14 -8
  11. bbstrader/core/__init__.py +0 -0
  12. bbstrader/core/data.py +22 -0
  13. bbstrader/core/utils.py +57 -0
  14. bbstrader/ibkr/__init__.py +0 -0
  15. bbstrader/ibkr/utils.py +0 -0
  16. bbstrader/metatrader/__init__.py +5 -5
  17. bbstrader/metatrader/account.py +117 -121
  18. bbstrader/metatrader/rates.py +83 -80
  19. bbstrader/metatrader/risk.py +23 -37
  20. bbstrader/metatrader/trade.py +169 -140
  21. bbstrader/metatrader/utils.py +3 -3
  22. bbstrader/models/__init__.py +5 -5
  23. bbstrader/models/factors.py +280 -0
  24. bbstrader/models/ml.py +1092 -0
  25. bbstrader/models/optimization.py +31 -28
  26. bbstrader/models/{portfolios.py → portfolio.py} +64 -46
  27. bbstrader/models/risk.py +15 -9
  28. bbstrader/trading/__init__.py +2 -2
  29. bbstrader/trading/execution.py +252 -164
  30. bbstrader/trading/scripts.py +8 -4
  31. bbstrader/trading/strategies.py +79 -66
  32. bbstrader/tseries.py +482 -107
  33. {bbstrader-0.1.94.dist-info → bbstrader-0.2.1.dist-info}/LICENSE +1 -1
  34. {bbstrader-0.1.94.dist-info → bbstrader-0.2.1.dist-info}/METADATA +6 -1
  35. bbstrader-0.2.1.dist-info/RECORD +37 -0
  36. bbstrader-0.1.94.dist-info/RECORD +0 -32
  37. {bbstrader-0.1.94.dist-info → bbstrader-0.2.1.dist-info}/WHEEL +0 -0
  38. {bbstrader-0.1.94.dist-info → bbstrader-0.2.1.dist-info}/top_level.txt +0 -0
bbstrader/tseries.py CHANGED
@@ -8,33 +8,34 @@ market analysis, and financial data exploration.
  """
  import pprint
  import warnings
+ from itertools import combinations
+ from typing import List, Tuple, Union
+
+ import matplotlib.pyplot as plt
  import numpy as np
  import pandas as pd
- from tqdm import tqdm
- import yfinance as yf
  import pmdarima as pm
  import seaborn as sns
  import statsmodels.api as sm
- import matplotlib.pyplot as plt
  import statsmodels.tsa.stattools as ts
- from hurst import compute_Hc
+ import yfinance as yf
  from arch import arch_model
- from scipy.optimize import minimize
  from filterpy.kalman import KalmanFilter
+ from hurst import compute_Hc
  from pykalman import KalmanFilter as PyKalmanFilter
- from statsmodels.tsa.vector_ar.vecm import coint_johansen
+ from scipy.optimize import minimize
+ from sklearn.linear_model import LogisticRegressionCV
+ from sklearn.model_selection import GridSearchCV
+ from sklearn.tree import DecisionTreeClassifier
  from statsmodels.graphics.tsaplots import plot_acf
- from statsmodels.tsa.stattools import adfuller, coint
+ from statsmodels.stats.diagnostic import acorr_ljungbox
  from statsmodels.tsa.arima.model import ARIMA
+ from statsmodels.tsa.stattools import adfuller, coint
  from statsmodels.tsa.vector_ar.var_model import VAR
- from sklearn.model_selection import GridSearchCV
- from sklearn.tree import DecisionTreeClassifier
- from sklearn.linear_model import LogisticRegressionCV
- from statsmodels.stats.diagnostic import acorr_ljungbox
- from itertools import combinations
- from typing import Union, List, Tuple
- warnings.filterwarnings("ignore")
+ from statsmodels.tsa.vector_ar.vecm import coint_johansen
+ from tqdm import tqdm

+ warnings.filterwarnings("ignore")


  __all__ = [
@@ -50,7 +51,17 @@ __all__ = [
  "run_kalman_filter",
  "ArimaGarchModel",
  "KalmanFilterModel",
- "OrnsteinUhlenbeckModel"
+ "OrnsteinUhlenbeck",
+ "remove_correlated_assets",
+ "check_stationarity",
+ "remove_stationary_assets",
+ "select_assets",
+ "compute_pair_metrics",
+ "find_cointegrated_pairs",
+ "analyze_cointegrated_pairs",
+ "select_candidate_pairs",
+ "KFSmoother",
+ "KFHedgeRatio",
  ]

  # *******************************************
@@ -123,7 +134,8 @@ def fit_best_arima(window_data: Union[pd.Series, np.ndarray]):
  from arch.utility.exceptions import ConvergenceWarning as ArchWarning
  from statsmodels.tools.sm_exceptions import ConvergenceWarning as StatsWarning
  with warnings.catch_warnings():
- warnings.filterwarnings("ignore", category=StatsWarning, module='statsmodels')
+ warnings.filterwarnings(
+ "ignore", category=StatsWarning, module='statsmodels')
  warnings.filterwarnings("ignore", category=ArchWarning, module='arch')
  try:
  best_arima_model = ARIMA(
@@ -499,7 +511,8 @@ def get_corr(tickers: Union[List[str], Tuple[str, ...]], start: str, end: str) -
  >>> get_corr(['AAPL', 'MSFT', 'GOOG'], '2023-01-01', '2023-12-31')
  """
  # Download historical data
- data = yf.download(tickers, start=start, end=end)['Adj Close']
+ data = yf.download(tickers, start=start, end=end,
+ multi_level_index=False)['Adj Close']

  # Calculate correlation matrix
  correlation_matrix = data.corr()
@@ -643,8 +656,10 @@ def run_cadf_test(pair: Union[List[str], Tuple[str, ...]], start: str, end: str)
  """
  # Download historical data for required stocks
  p0, p1 = pair[0], pair[1]
- _p0 = yf.download(p0, start=start, end=end)
- _p1 = yf.download(p1, start=start, end=end)
+ _p0 = yf.download(p0, start=start, end=end,
+ progress=False, multi_level_index=False)
+ _p1 = yf.download(p1, start=start, end=end,
+ progress=False, multi_level_index=False)
  df = pd.DataFrame(index=_p0.index)
  df[p0] = _p0["Adj Close"]
  df[p1] = _p1["Adj Close"]
@@ -673,7 +688,7 @@ def run_cadf_test(pair: Union[List[str], Tuple[str, ...]], start: str, end: str)
  # Display regression metrics
  print("\nRegression Metrics:")
  print(f"Optimal Hedge Ratio (Beta): {beta_hr}")
- print(f'Result Parmas: \n')
+ print('Result Parmas: \n')
  print(results.params)
  print("\nRegression Summary:")
  print(results.summary())
@@ -726,7 +741,8 @@ def run_hurst_test(symbol: str, start: str, end: str):

  >>> run_hurst_test('AAPL', '2023-01-01', '2023-12-31')
  """
- data = yf.download(symbol, start=start, end=end)
+ data = yf.download(symbol, start=start, end=end,
+ progress=False, multi_level_index=False)

  # Create a Geometric Brownian Motion, Mean-Reverting, and Trending Series
  gbm = np.log(np.cumsum(np.random.randn(100000))+1000)
@@ -743,7 +759,7 @@
  def test_cointegration(ticker1, ticker2, start, end):
  # Download historical data
  stock_data_pair = yf.download(
- [ticker1, ticker2], start=start, end=end
+ [ticker1, ticker2], start=start, end=end, progress=False, multi_level_index=False
  )['Adj Close'].dropna()

  # Perform Johansen cointegration test
@@ -865,16 +881,16 @@ def run_kalman_filter(
  etfs: Union[List[str], Tuple[str, ...]],
  start: str, end: str) -> None:
  """
- Applies a Kalman filter to a pair of ETF adjusted closing prices within a specified date range
+ Applies a Kalman filter to a pair of assets adjusted closing prices within a specified date range
  to estimate the slope and intercept over time.

- The function downloads historical adjusted closing prices for the specified pair of ETFs,
+ The function downloads historical adjusted closing prices for the specified pair of assets,
  visualizes their price relationship, calculates the Kalman filter estimates for the slope and
  intercept, and visualizes the changes in these estimates over time.

  Args:
  etfs (Union[List[str] , Tuple[str, ...]]):
- A list or tuple containing two valid ETF tickers (e.g., ['SPY', 'QQQ']).
+ A list or tuple containing two valid assets tickers (e.g., ['SPY', 'QQQ']).
  start (str): The start date for the historical data in 'YYYY-MM-DD' format.
  end (str): The end date for the historical data in 'YYYY-MM-DD' format.

@@ -883,8 +899,10 @@ def run_kalman_filter(

  >>> run_kalman_filter(['SPY', 'QQQ'], '2023-01-01', '2023-12-31')
  """
- etf_df1 = yf.download(etfs[0], start, end)
- etf_df2 = yf.download(etfs[1], start, end)
+ etf_df1 = yf.download(etfs[0], start, end,
+ progress=False, multi_level_index=False)
+ etf_df2 = yf.download(etfs[1], start, end,
+ progress=False, multi_level_index=False)

  prices = pd.DataFrame(index=etf_df1.index)
  prices[etfs[0]] = etf_df1["Adj Close"]
@@ -919,13 +937,14 @@ class KalmanFilterModel():
  """
  self.tickers = tickers
  assert self.tickers is not None
- self.latest_prices = np.array([-1.0, -1.0])
- self.delta = kwargs.get("delta", 1e-4)
- self.wt = self.delta/(1-self.delta) * np.eye(2)
- self.vt = kwargs.get("vt", 1e-3)
+
+ self.R = None
  self.theta = np.zeros(2)
  self.P = np.zeros((2, 2))
- self.R = None
+ self.delta = kwargs.get("delta", 1e-4)
+ self.vt = kwargs.get("vt", 1e-3)
+ self.wt = self.delta/(1-self.delta) * np.eye(2)
+ self.latest_prices = np.array([-1.0, -1.0])
  self.kf = self._init_kalman()

  def _init_kalman(self):
@@ -945,6 +964,7 @@
  return kf

  Array = np.ndarray
+
  def calc_slope_intercep(self, prices: Array) -> Tuple:
  """
  Calculates and returns the slope and intercept
@@ -958,25 +978,24 @@
  Returns:
  A tuple containing the slope and intercept of the relationship
  """
- kf = self.kf
- kf.H = np.array([[prices[1], 1.0]])
- kf.predict()
- kf.update(prices[0])
- slope = kf.x.copy().flatten()[0]
- intercept = kf.x.copy().flatten()[1]
+ self.kf.H = np.array([[prices[1], 1.0]])
+ self.kf.predict()
+ self.kf.update(prices[1])
+ slope = self.kf.x.copy().flatten()[0]
+ intercept = self.kf.x.copy().flatten()[1]

  return slope, intercept
-
+
  def calculate_etqt(self, prices: Array) -> Tuple:
  """
- Calculates the forecast error and standard deviation of the predictions
+ Calculates the ``forecast error`` and ``standard deviation`` of the predictions
  using the Kalman Filter.

  Args:
  prices : A numpy array of prices for two financial instruments.

  Returns:
- A tuple containing the forecast error and standard deviation of the predictions.
+ A tuple containing the ``forecast error`` and ``standard deviation`` of the predictions.
  """

  self.latest_prices[0] = prices[0]
@@ -1021,7 +1040,7 @@
  At = self.R.dot(F.T) / Qt
  self.theta = self.theta + At.flatten() * et
  self.C = self.R - At * F.dot(self.R)
- return (et, sqrt_Qt)
+ return (et[0], sqrt_Qt.flatten()[0])
  else:
  return None

@@ -1165,7 +1184,7 @@ class OrnsteinUhlenbeck():
  Returns:
  np.ndarray: 2D array representing simulated processes.
  """
- if returns is None:
+ if returns is None:
  returns = self.returns
  if p is not None:
  T = p
@@ -1190,6 +1209,33 @@


  def remove_correlated_assets(df: pd.DataFrame, cutoff=.99):
+ """
+ Removes highly correlated assets from a DataFrame based on a specified correlation cutoff threshold.
+ This is useful in financial data analysis to reduce redundancy and multicollinearity in portfolios or datasets.
+
+ Args:
+ df (pd.DataFrame): A DataFrame where each column represents an asset
+ and rows represent observations (e.g., time-series data).
+ cutoff (float, optional, default=0.99): The correlation threshold.
+ Columns with absolute correlation greater than this value will be considered for removal.
+
+ Returns:
+ pd.DataFrame: A DataFrame with less correlated assets.
+ The columns that are highly correlated (above the cutoff) are removed.
+
+ References
+ ----------
+ Stefan Jansen (2020). Machine Learning for Algorithmic Trading - Second Edition.
+ chapter 9, Time-Series Models for Volatility Forecasts and Statistical Arbitrage.
+
+ Example:
+ >>> df = pd.DataFrame({
+ ... 'AAPL': [100, 101, 102, 103, 104],
+ ... 'MSFT': [200, 201, 202, 203, 204],
+ ... 'GOOG': [300, 301, 302, 303, 304]
+ ... })
+ >>> df = remove_correlated_assets(df)
+ """
  corr = df.corr().stack()
  corr = corr[corr < 1]
  to_check = corr[corr.abs() > cutoff].index
@@ -1208,6 +1254,32 @@ def remove_correlated_assets(df: pd.DataFrame, cutoff=.99):


  def check_stationarity(df: pd.DataFrame):
+ """
+ Tests the stationarity of time-series data for each asset in the DataFrame
+ using the Augmented Dickey-Fuller (ADF) test. Stationarity is a key property
+ in time-series analysis, and non-stationary data can affect model performance.
+
+ Args:
+ df (pd.DataFrame): A DataFrame where each column represents a time series of an asset.
+
+ Returns:
+ pd.DataFrame: A DataFrame containing the ADF p-values for each asset,
+ - ticker Asset name (column name from df).
+ - adf p-value from the ADF test, indicating the probability of the null hypothesis (data is non-stationary).
+
+ References
+ ----------
+ Stefan Jansen (2020). Machine Learning for Algorithmic Trading - Second Edition.
+ chapter 9, Time-Series Models for Volatility Forecasts and Statistical Arbitrage.
+
+ Example:
+ >>> df = pd.DataFrame({
+ ... 'AAPL': [100, 101, 102, 103, 104],
+ ... 'MSFT': [200, 201, 202, 203, 204],
+ ... 'GOOG': [300, 301, 302, 303, 304]
+ ... })
+ >>> df = check_stationarity(df)
+ """
  results = []
  for ticker, prices in df.items():
  results.append([ticker, adfuller(prices, regression='ct')[1]])
@@ -1215,37 +1287,133 @@ def check_stationarity(df: pd.DataFrame):


  def remove_stationary_assets(df: pd.DataFrame, pval=.05):
+ """
+ Filters out stationary assets from the DataFrame based on the p-value obtained
+ from the Augmented Dickey-Fuller test.
+ Useful for focusing only on non-stationary time-series data.
+
+ Args:
+ df (pd.DataFrame): A DataFrame where each column represents a time series of an asset.
+ pval (float, optional, default=0.05): The significance level to determine stationarity.
+ Columns with an ADF test p-value below this threshold are considered stationary and removed.
+
+ Returns:
+ pd.DataFrame: A DataFrame containing only the non-stationary assets.
+
+ References
+ ----------
+ Stefan Jansen (2020). Machine Learning for Algorithmic Trading - Second Edition.
+ chapter 9, Time-Series Models for Volatility Forecasts and Statistical Arbitrage.
+
+ Example:
+ >>> df = pd.DataFrame({
+ ... 'AAPL': [100, 101, 102, 103, 104],
+ ... 'MSFT': [200, 201, 202, 203, 204],
+ ... 'GOOG': [300, 301, 302, 303, 304]
+ ... })
+ >>> df = remove_stationary_assets(df)
+ """
  test_result = check_stationarity(df)
  stationary = test_result.loc[test_result.adf <= pval, 'ticker'].tolist()
  return df.drop(stationary, axis=1).sort_index()


- def select_assets(df: pd.DataFrame, n=100, start=None, end=None):
+ def select_assets(df: pd.DataFrame, n=100, start=None, end=None, rolling_window=None):
+ """
+ Selects the top N assets based on the average trading volume from the input DataFrame.
+ These assets are used as universe in which we can search cointegrated pairs for pairs trading strategies.
+
+ Args:
+ df (pd.DataFrame): A multi-index DataFrame with levels ['ticker', 'date'] containing market data.
+ Must include columns 'close' (price) and 'volume'.
+ n (int, optional): Number of assets to select based on highest average trading volume. Default is 100.
+ start (str, optional): Start date for filtering the data. Default is the earliest date in the DataFrame.
+ end (str, optional): End date for filtering the data. Default is the latest date in the DataFrame.
+ rolling_window (int, optional): Rolling window for calculating the average trading volume. Default is None.
+
+ Returns:
+ pd.DataFrame: A DataFrame of selected assets with filtered, cleaned data, indexed by date.
+
+ References
+ ----------
+ Stefan Jansen (2020). Machine Learning for Algorithmic Trading - Second Edition.
+ chapter 9, Time-Series Models for Volatility Forecasts and Statistical Arbitrage.
+ """
+ required_columns = {'close', 'volume'}
+ if not required_columns.issubset(df.columns):
+ raise ValueError(
+ f"Input DataFrame must contain {required_columns}, but got {df.columns.tolist()}.")
+
+ if not isinstance(df.index, pd.MultiIndex) or 'ticker' not in df.index.names or 'date' not in df.index.names:
+ raise ValueError(
+ "Index must be a MultiIndex with levels ['ticker', 'date'].")
+
+ df = df.copy()
  idx = pd.IndexSlice
  start = start or df.index.get_level_values('date').min()
  end = end or df.index.get_level_values('date').max()
  df = (df
- .loc[lambda df: ~df.index.duplicated()]
- .sort_index()
- .loc[idx[:, f'{start}':f'{end}'], :]
- .assign(dv=lambda df: df.close.mul(df.volume)))
-
- # select n assets with the highest average trading volume
- # we are taking a shortcut to simplify; should select
- # based on historical only, e.g. yearly rolling avg
- most_traded = (df.groupby(level='ticker')
- .dv.mean()
- .nlargest(n=n).index)
-
+ .loc[lambda df: ~df.index.duplicated()]
+ .sort_index()
+ .loc[idx[:, f'{start}':f'{end}'], :]
+ .assign(dv=lambda df: df.close.mul(df.volume)))
+
+ if rolling_window is None:
+ most_traded = (df.groupby(level='ticker')
+ .dv.mean()
+ .nlargest(n=n).index)
+ else:
+ # Calculate the rolling average of dollar volume
+ df['dv_rolling_avg'] = (
+ df.groupby(level=0)
+ .dv
+ .rolling(window=rolling_window, min_periods=1)
+ .mean()
+ .reset_index(level=0, drop=True)
+ )
+ most_traded = (
+ df.groupby(level=0)['dv_rolling_avg']
+ .mean()
+ .nlargest(n=n)
+ .index
+ )
  df = (df.loc[idx[most_traded, :], 'close']
- .unstack('ticker')
- .ffill(limit=5) # fill up to five values
- .dropna(axis=1)) # remove assets with any missing values
-
+ .unstack('ticker')
+ .ffill(limit=5)
+ .dropna(axis=1))
  df = remove_correlated_assets(df)
- return remove_stationary_assets(df).sort_index()
+ df = remove_stationary_assets(df)
+ return df.sort_index()
+

  def compute_pair_metrics(security: pd.Series, candidates: pd.DataFrame):
+ """
+ Calculates statistical and econometric metrics for a target security and a set of candidate securities.
+ These metrics are useful in financial modeling and pairs trading strategies,
+ providing information about drift, volatility, correlation, and cointegration.
+
+ Args:
+ security (pd.Series): A time-series of the target security's prices.
+ The name of the Series should correspond to the security's identifier (e.g., ticker symbol).
+ candidates (pd.DataFrame): A DataFrame where each column represents a time-series of prices
+ for candidate securities to be evaluated against the target security.
+
+ Returns:
+ pd.DataFrame: A DataFrame combining:
+ Drift: Estimated drift of spreads between the target security and each candidate.
+ Volatility: Standard deviation of spreads.
+ Correlation:
+ ``corr``: Correlation of normalized prices between the target and each candidate.
+ ``corr_ret``: Correlation of returns (percentage change) between the target and each candidate.
+ Cointegration metrics:
+ Engle-Granger test statistics (``t1``, ``t2``) and p-values (``p1``, ``p2``).
+ Johansen test trace statistics (``trace0``, ``trace1``) and selected lag order (``k_ar_diff``).
+
+ References
+ ----------
+ Stefan Jansen (2020). Machine Learning for Algorithmic Trading - Second Edition.
+ chapter 9, Time-Series Models for Volatility Forecasts and Statistical Arbitrage.
+ """
  security = security.div(security.iloc[0])
  ticker = security.name
  candidates = candidates.div(candidates.iloc[0])
@@ -1253,133 +1421,315 @@ def compute_pair_metrics(security: pd.Series, candidates: pd.DataFrame):
  n, m = spreads.shape
  X = np.ones(shape=(n, 2))
  X[:, 1] = np.arange(1, n + 1)
-
+
  # compute drift
  drift = ((np.linalg.inv(X.T @ X) @ X.T @ spreads).iloc[1]
  .to_frame('drift'))
-
+
  # compute volatility
  vol = spreads.std().to_frame('vol')
-
- # return correlation
+
+ # returns correlation
  corr_ret = (candidates.pct_change()
  .corrwith(security.pct_change())
  .to_frame('corr_ret'))
-
+
  # normalized price series correlation
  corr = candidates.corrwith(security).to_frame('corr')
  metrics = drift.join(vol).join(corr).join(corr_ret).assign(n=n)
-
+
  tests = []
  # run cointegration tests
  for candidate, prices in tqdm(candidates.items()):
  df = pd.DataFrame({'s1': security, 's2': prices})
  var = VAR(df.values)
- lags = var.select_order() # select VAR order
+ lags = var.select_order() # select VAR order
  k_ar_diff = lags.selected_orders['aic']
  # Johansen Test with constant Term and estd. lag order
  cj0 = coint_johansen(df, det_order=0, k_ar_diff=k_ar_diff)
  # Engle-Granger Tests
  t1, p1 = coint(security, prices, trend='c')[:2]
  t2, p2 = coint(prices, security, trend='c')[:2]
- tests.append([ticker, candidate, t1, p1, t2, p2,
+ tests.append([ticker, candidate, t1, p1, t2, p2,
  k_ar_diff, *cj0.lr1])
- columns = ['s1', 's2', 't1', 'p1', 't2', 'p2', 'k_ar_diff', 'trace0', 'trace1']
+ columns = ['s1', 's2', 't1', 'p1', 't2',
+ 'p2', 'k_ar_diff', 'trace0', 'trace1']
  tests = pd.DataFrame(tests, columns=columns).set_index('s2')
  return metrics.join(tests)

- CRITICAL_VALUES = {
+
+ __CRITICAL_VALUES = {
  0: {.9: 13.4294, .95: 15.4943, .99: 19.9349},
  1: {.9: 2.7055, .95: 3.8415, .99: 6.6349}
  }

- def find_cointegrated_pairs(securities: pd.DataFrame, candidates: pd.DataFrame,
- n=None, start=None, stop=None):
- trace0_cv = CRITICAL_VALUES[0][.95] # critical value for 0 cointegration relationships
- trace1_cv = CRITICAL_VALUES[1][.95] # critical value for 1 cointegration relationship
+
+ def find_cointegrated_pairs(securities: pd.DataFrame, candidates: pd.DataFrame,
+ n=None, start=None, stop=None, coint=False):
+ """
+ Identifies cointegrated pairs between a target set of securities and candidate securities
+ based on econometric tests. The function evaluates statistical relationships,
+ such as cointegration and Engle-Granger significance, to determine pairs suitable
+ for financial strategies like pairs trading.
+
+ Args:
+ securities (`pd.DataFrame`): A DataFrame where each column represents the time-series
+ prices of target securities to evaluate.
+ candidates (`pd.DataFrame`): A DataFrame where each column represents the time-series
+ prices of candidate securities to compare against the target securities.
+ n (`int`, optional): The number of top pairs to return. If `None`, returns all pairs.
+ start (`str`, optional): Start date for slicing the data (e.g., 'YYYY-MM-DD').
+ stop (`str`, optional): End date for slicing the data (e.g., 'YYYY-MM-DD').
+ coint (`bool`, optional, default=False):
+ - If `True`, filters for pairs identified as cointegrated.
+ - If `False`, returns all evaluated pairs.
+
+ Returns:
+ - ``pd.DataFrame``: A DataFrame containing:
+ - Johansen and Engle-Granger cointegration metrics:
+ - `t1`, `t2`: Engle-Granger test statistics for two directions.
+ - `p1`, `p2`: Engle-Granger p-values for two directions.
+ - `trace0`, `trace1`: Johansen test trace statistics for 0 and 1 cointegration relationships.
+ - Indicators and filters:
+ - `joh_sig`: Indicates Johansen cointegration significance.
+ - `eg_sig`: Indicates Engle-Granger significance (p-value < 0.05).
+ - `s1_dep`: Indicates whether the first series depends on the second (based on p-values).
+ - `coint`: Combined cointegration indicator (Johansen & Engle-Granger).
+ - Spread and ranking:
+ - `t`: Minimum of `t1` and `t2`.
+ - `p`: Minimum of `p1` and `p2`.
+ References
+ ----------
+ Stefan Jansen (2020). Machine Learning for Algorithmic Trading - Second Edition.
+ chapter 9, Time-Series Models for Volatility Forecasts and Statistical Arbitrage.
+
+ Example:
+ >>> import pandas as pd
+
+ >>> # Sample Data
+ >>> data_securities = {
+ ... 'Security1': [100, 102, 101, 103, 105],
+ ... 'Security2': [50, 52, 53, 51, 54]
+ ... }
+ >>> data_candidates = {
+ ... 'Candidate1': [100, 101, 99, 102, 104],
+ ... 'Candidate2': [200, 202, 201, 203, 205]
+ ... }
+
+ >>> securities = pd.DataFrame(data_securities, index=pd.date_range('2023-01-01', periods=5))
+ >>> candidates = pd.DataFrame(data_candidates, index=pd.date_range('2023-01-01', periods=5))
+
+ >>> # Find cointegrated pairs
+ >>> top_pairs = find_cointegrated_pairs(securities, candidates, n=2, coint=True)
+ >>> print(top_pairs)
+
+ >>> | s1 | s2 | t | p | joh_sig | eg_sig | coint |
+ >>> |----------|-----------|------|-------|---------|--------|-------|
+ >>> | Security1| Candidate1| -3.5 | 0.01 | 1 | 1 | 1 |
+ >>> | Security2| Candidate2| -2.9 | 0.04 | 1 | 1 | 1 |
+ """
+ trace0_cv = __CRITICAL_VALUES[0][.95] # critical value for 0 cointegration relationships
+ # critical value for 1 cointegration relationship
+ trace1_cv = __CRITICAL_VALUES[1][.95]
  spreads = []
  if start is not None and stop is not None:
  securities = securities.loc[str(start): str(stop), :]
  candidates = candidates.loc[str(start): str(stop), :]
  for i, (ticker, prices) in enumerate(securities.items(), 1):
- df = compute_pair_metrics(prices, candidates)
- spreads.append(df.set_index('s1', append=True))
+ try:
+ df = compute_pair_metrics(prices, candidates)
+ spreads.append(df.set_index('s1', append=True))
+ except np.linalg.LinAlgError:
+ continue
  spreads = pd.concat(spreads)
  spreads.index.names = ['s2', 's1']
  spreads = spreads.swaplevel()
  spreads['t'] = spreads[['t1', 't2']].min(axis=1)
  spreads['p'] = spreads[['p1', 'p2']].min(axis=1)
  spreads['joh_sig'] = ((spreads.trace0 > trace0_cv) &
- (spreads.trace1 > trace1_cv)).astype(int)
+ (spreads.trace1 > trace1_cv)).astype(int)
  spreads['eg_sig'] = (spreads.p < .05).astype(int)
  spreads['s1_dep'] = spreads.p1 < spreads.p2
  spreads['coint'] = (spreads.joh_sig & spreads.eg_sig).astype(int)
  # select top n pairs
- if n is not None:
- top_pairs = (spreads.query('coint == 1')
- .sort_values('t', ascending=False)
- .head(n))
+ if coint:
+ if n is not None:
+ top_pairs = (spreads.query('coint == 1')
+ .sort_values('t', ascending=False)
+ .head(n))
+ else:
+ top_pairs = (spreads.query('coint == 1')
+ .sort_values('t', ascending=False))
  else:
- top_pairs = spreads.query('coint == 1')
+ if n is not None:
+ top_pairs = (spreads
+ .sort_values('t', ascending=False)
+ .head(n))
+ else:
+ top_pairs = (spreads
+ .sort_values('t', ascending=False))
  return top_pairs

- def analyze_cointegrated_pairs(spreads: pd.DataFrame, plot_coint=False, cosstab=False,
+
+ def analyze_cointegrated_pairs(spreads: pd.DataFrame, plot_coint=True, crosstab=False,
  heuristics=False, log_reg=False, decis_tree=False):
+ """
+ Analyzes cointegrated pairs by visualizing, summarizing, and applying predictive models.
+
+ Args:
+ spreads (pd.DataFrame):
+ A DataFrame containing cointegration metrics and characteristics.
+ Required columns: 'coint', 't', 'trace0', 'trace1', 'drift', 'vol', 'corr', 'corr_ret', 'eg_sig', 'joh_sig'.
+ plot_coint (bool, optional):
+ If True, generates scatterplots and boxplots to visualize cointegration characteristics.
+ cosstab (bool, optional):
+ If True, displays crosstabulations of Engle-Granger and Johansen test significance.
+ heuristics (bool, optional):
+ If True, prints descriptive statistics for drift, volatility, and correlation grouped by cointegration status.
+ log_reg (bool, optional):
+ If True, fits a logistic regression model to predict cointegration and evaluates its performance.
+ decis_tree (bool, optional):
+ If True, fits a decision tree model to predict cointegration and evaluates its performance.
+
+ References
+ ----------
+ Stefan Jansen (2020). Machine Learning for Algorithmic Trading - Second Edition.
+ chapter 9, Time-Series Models for Volatility Forecasts and Statistical Arbitrage.
+
+ Example:
+ >>> import pandas as pd
+ >>> from bbstrader.tseries import find_cointegrated_pairs, analyze_cointegrated_pairs
+
+ >>> # Sample Data
+ >>> securities = pd.DataFrame({
+ ... 'SPY': [100, 102, 101, 103, 105],
+ ... 'QQQ': [50, 52, 53, 51, 54]
+ ... })
+ >>> candidates = pd.DataFrame({
+ ... 'AAPL': [100, 101, 99, 102, 104],
+ ... 'MSFT': [200, 202, 201, 203, 205]
+ ... })
+
+ >>> pairs = find_cointegrated_pairs(securities, candidates, n=2, coint=True)
+ >>> analyze_cointegrated_pairs(pairs, plot_coint=True, cosstab=True, heuristics=True, log_reg=True, decis_tree=True
+ """
  if plot_coint:
- trace0_cv = CRITICAL_VALUES[0][.95]
+ trace0_cv = __CRITICAL_VALUES[0][.95]
  spreads = spreads.reset_index()
- sns.scatterplot(x=np.log1p(spreads.t.abs()),
- y=np.log1p(spreads.trace1),
- hue='coint', data=spreads[spreads.trace0>trace0_cv]);
+ sns.scatterplot(x=np.log1p(spreads.t.abs()),
+ y=np.log1p(spreads.trace1),
+ hue='coint', data=spreads[spreads.trace0 > trace0_cv])
  fig, axes = plt.subplots(ncols=4, figsize=(20, 5))
  for i, heuristic in enumerate(['drift', 'vol', 'corr', 'corr_ret']):
  sns.boxplot(x='coint', y=heuristic, data=spreads, ax=axes[i])
- fig.tight_layout();
+ fig.tight_layout()
+
  if heuristics:
  spreads = spreads.reset_index()
  h = spreads.groupby(spreads.coint)[
  ['drift', 'vol', 'corr']].describe().stack(level=0).swaplevel().sort_index()
  print(h)
+
  if log_reg:
  y = spreads.coint
  X = spreads[['drift', 'vol', 'corr', 'corr_ret']]
- log_reg = LogisticRegressionCV(Cs=np.logspace(-10, 10, 21),
- class_weight='balanced',
- scoring='roc_auc')
+ log_reg = LogisticRegressionCV(Cs=np.logspace(-10, 10, 21),
+ class_weight='balanced',
+ scoring='roc_auc')
  log_reg.fit(X=X, y=y)
  Cs = log_reg.Cs_
  scores = pd.DataFrame(log_reg.scores_[True], columns=Cs).mean()
- scores.plot(logx=True);
+ scores.plot(logx=True)
  res = f'C:{np.log10(scores.idxmax()):.2f}, AUC: {scores.max():.2%}'
  print(res)
  print(log_reg.coef_)
+
  if decis_tree:
  model = DecisionTreeClassifier(class_weight='balanced')
  decision_tree = GridSearchCV(model,
- param_grid={'max_depth': list(range(1, 10))},
- cv=5,
- scoring='roc_auc')
+ param_grid={
+ 'max_depth': list(range(1, 10))},
+ cv=5,
+ scoring='roc_auc')
  y = spreads.coint
  X = spreads[['drift', 'vol', 'corr', 'corr_ret']]
  decision_tree.fit(X, y)
  res = f'{decision_tree.best_score_:.2%}, Depth: {decision_tree.best_params_["max_depth"]}'
  print(res)
- if cosstab:
+
+ if crosstab:
  pd.set_option('display.float_format', lambda x: f'{x:.2%}')
  print(pd.crosstab(spreads.eg_sig, spreads.joh_sig))
  print(pd.crosstab(spreads.eg_sig, spreads.joh_sig, normalize=True))


- def select_candidate_pairs(pairs: pd.DataFrame):
+ def select_candidate_pairs(pairs: pd.DataFrame, period=False):
+ """
+ Select candidate pairs from a DataFrame based on cointegration status.
+
+ This function filters the input DataFrame to select pairs where the 'coint' column equals 1,
+ indicating cointegration. It then determines the dependent and independent series for each pair
+ and returns the selected pairs in a dictionary format.
+
+ Args:
+ pairs (pd.DataFrame): A DataFrame containing pairs of time series with columns 'coint', 's1', 's2', and 's1_dep'.
+ period (bool, optional): If True, includes the 'period' column in the output. Defaults to False.
+
+ Returns:
+ list[dict]: A list of dictionaries, each containing the keys 'x' and 'y' (and optionally 'period') representing the selected pairs.
+
+ References
+ ----------
+ Stefan Jansen (2020). Machine Learning for Algorithmic Trading - Second Edition.
+ chapter 9, Time-Series Models for Volatility Forecasts and Statistical Arbitrage.
+ """
  candidates = pairs.query('coint == 1').copy()
- candidates['y'] = candidates.apply(lambda x: x.s1 if x.s1_dep else x.s2, axis=1)
- candidates['x'] = candidates.apply(lambda x: x.s2 if x.s1_dep else x.s1, axis=1)
- candidates.drop(['s1_dep', 's1', 's2'], axis=1)
+ candidates = candidates.reset_index()
+ candidates['y'] = candidates.apply(
+ lambda x: x['s1'] if x.s1_dep else x['s2'], axis=1)
+ candidates['x'] = candidates.apply(
+ lambda x: x['s2'] if x.s1_dep else x['s1'], axis=1)
+ if period:
+ return candidates[['x', 'y', 'period']].to_dict(orient='records')
  return candidates[['x', 'y']].to_dict(orient='records')


- def KFSmoother(self, prices: pd.Series | np.ndarray) -> pd.Series | np.ndarray:
- """Estimate rolling mean using Kalman Smoothing."""
+ def KFSmoother(prices: pd.Series | np.ndarray) -> pd.Series | np.ndarray:
+ """
+ Estimate rolling mean using Kalman Smoothing.
+
+ Args:
+ prices : pd.Series or np.ndarray
+ The input time series data to be smoothed. It must be either a pandas Series or a numpy array.
+
+ Returns:
+ pd.Series or np.ndarray
+ The smoothed time series data. If the input is a pandas Series, the output will also be a pandas Series with the same index.
+ If the input is a numpy array, the output will be a numpy array.
+
+ References
+ ----------
+ Stefan Jansen (2020). Machine Learning for Algorithmic Trading - Second Edition.
+ chapter 9, Time-Series Models for Volatility Forecasts and Statistical Arbitrage.
+
+ Examples
+ --------
+ >>> import yfinance as yf
+ >>> prices = yf.download('AAPL', start='2020-01-01', end='2021-01-01', multi_level_index=False)['Adj Close']
+ >>> prices = KFSmoother(prices)
+ >>> print(prices[:5])
+ Date
+ 2020-01-02 00:00:00+00:00 36.39801407
+ 2020-01-03 00:00:00+00:00 49.06231000
+ 2020-01-06 00:00:00+00:00 55.86334436
+ 2020-01-07 00:00:00+00:00 60.02240894
+ 2020-01-08 00:00:00+00:00 63.15057948
+ dtype: float64
+
+ """
+ if not isinstance(prices, (np.ndarray, pd.Series)):
+ raise ValueError(
+ "Input must be either a numpy array or a pandas Series.")
  kf = PyKalmanFilter(
  transition_matrices=np.eye(1),
  observation_matrices=np.eye(1),
@@ -1396,8 +1746,32 @@ def KFSmoother(self, prices: pd.Series | np.ndarray) -> pd.Series | np.ndarray:
  return state_means.flatten()


- def KFHedgeRatio(self, x: pd.Series, y: pd.Series) -> np.ndarray:
- """Estimate Hedge Ratio using Kalman Filter."""
+ def KFHedgeRatio(x: pd.Series | np.ndarray, y: pd.Series | np.ndarray) -> np.ndarray:
+ """
+ Estimate Hedge Ratio using Kalman Filter.
+ Args:
+ x : pd.Series or np.ndarray
+ The independent variable, which can be either a pandas Series or a numpy array.
+ y : pd.Series or np.ndarray
+ The dependent variable, which can be either a pandas Series or a numpy array.
+
+ Returns:
+ np.ndarray
+ The estimated hedge ratio as a numpy array.
+
+ The function returns the negative of the first state variable of each Kalman Filter estimate,
+ which represents the estimated hedge ratio.
+
+ References
+ ----------
+ Stefan Jansen (2020). Machine Learning for Algorithmic Trading - Second Edition.
+ chapter 9, Time-Series Models for Volatility Forecasts and Statistical Arbitrage.
+ """
+ if (not isinstance(x, (np.ndarray, pd.Series))
+ or not isinstance(y, (np.ndarray, pd.Series))):
+ raise ValueError(
+ "Both x and y must be either a numpy array or a pandas Series.")
+
  delta = 1e-3
  trans_cov = delta / (1 - delta) * np.eye(2)
  obs_mat = np.expand_dims(np.vstack([[x], [np.ones(len(x))]]).T, axis=1)
@@ -1411,7 +1785,8 @@ def KFHedgeRatio(self, x: pd.Series, y: pd.Series) -> np.ndarray:
  observation_covariance=2,
  transition_covariance=trans_cov
  )
- state_means, _ = kf.filter(y.values)
- # Indexing with [:, 0] in state_means[:, 0] extracts only the first state variable of
+ y = y.values if isinstance(y, pd.Series) else y
+ state_means, _ = kf.filter(y)
+ # Indexing with [:, 0] in state_means[:, 0] extracts only the first state variable of
  # each Kalman Filter estimate, which is the estimated hedge ratio.
  return -state_means[:, 0]
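
The functions newly exported from bbstrader.tseries in this release (select_assets, compute_pair_metrics, find_cointegrated_pairs, analyze_cointegrated_pairs, select_candidate_pairs, KFSmoother, KFHedgeRatio) chain into a small pairs-selection workflow. The following is a minimal sketch based only on the signatures and docstrings added in the diff above; the tickers, date range, and the split between target and candidate universes are illustrative assumptions, not part of the package.

import yfinance as yf
from bbstrader.tseries import (
    KFHedgeRatio,
    KFSmoother,
    find_cointegrated_pairs,
    select_candidate_pairs,
)

# Illustrative universe and dates (assumptions, not from the package).
tickers = ["SPY", "QQQ", "AAPL", "MSFT", "GOOG"]
prices = yf.download(tickers, start="2022-01-01", end="2023-12-31",
                     progress=False, multi_level_index=False)["Adj Close"].dropna()

# Evaluate two target series against three candidates and keep pairs flagged as cointegrated.
pairs = find_cointegrated_pairs(prices[["SPY", "QQQ"]],
                                prices[["AAPL", "MSFT", "GOOG"]], coint=True)
selected = select_candidate_pairs(pairs)  # [{'x': ..., 'y': ...}, ...]

for pair in selected:
    x, y = pair["x"], pair["y"]
    smoothed = KFSmoother(prices[y])                  # Kalman-smoothed price series
    hedge_ratio = KFHedgeRatio(prices[x], prices[y])  # dynamic hedge ratio estimate
    print(y, "vs", x, "latest hedge ratio:", hedge_ratio[-1])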