quantjourney-bidask 0.9.3__py3-none-any.whl → 1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
quantjourney_bidask/__init__.py
@@ -1,8 +1,34 @@
+"""
+QuantJourney Bid-Ask Spread Estimator - Core Library.
+
+Efficient estimation of bid-ask spreads from OHLC prices using the methodology
+from Ardia, Guidotti, & Kroencke (2024).
+
+Author: Jakub Polec
+Date: 2025-06-28
+
+Part of the QuantJourney framework - The framework with advanced quantitative
+finance tools and insights.
+"""
+
 from .edge import edge
-from .edge_rolling import edge_rolling
 from .edge_expanding import edge_expanding
-from .data_fetcher import fetch_binance_data, fetch_yfinance_data
-from .websocket_fetcher import LiveSpreadMonitor
-from ._version import __version__, __author__, __email__, __license__
+from .edge_rolling import edge_rolling
+
+# Import version from package metadata
+try:
+    from importlib.metadata import metadata, version
+
+    __version__ = version("quantjourney-bidask")
+    _meta = metadata("quantjourney-bidask")
+    __author__ = "Jakub Polec"
+    __email__ = "jakub@quantjourney.pro"
+    __license__ = "MIT"
+except ImportError:
+    # Fallback for development mode
+    __version__ = "X.Y"
+    __author__ = "Jakub Polec"
+    __email__ = "jakub@quantjourney.pro"
+    __license__ = "MIT"
 
-__all__ = ['edge', 'edge_rolling', 'edge_expanding', 'fetch_binance_data', 'fetch_yfinance_data', 'LiveSpreadMonitor']
+__all__ = ["edge", "edge_rolling", "edge_expanding"]
compare_edge_v2.py (new file)
@@ -0,0 +1,152 @@
+# compare_edge_v2.py
+"""
+Comprehensive comparison script for EDGE estimator implementations.
+
+This script benchmarks three versions of the EDGE bid-ask spread estimator:
+1. `edge_original`: The baseline, pure NumPy implementation.
+2. `edge_improved_v1`: Optimized with a modular structure and a Numba kernel
+   for the core calculation.
+3. `edge_improved_v2`: Hyper-optimized with a single, monolithic Numba kernel
+   to minimize Python overhead and maximize compiler optimizations.
+
+The script validates that the optimized versions are numerically identical to the
+original (within floating-point tolerances) and quantifies the performance gains
+across a variety of test datasets.
+
+To Run:
+1. Ensure `edge.py` is in the same directory, and that the `quantjourney_bidask`
+   package (providing `edge_improved_v1` and `edge_improved_v2`) is importable.
+2. Execute from the terminal: `python compare_edge_v2.py`
+"""
+import time
+import numpy as np
+
+# Import the three versions of the edge function for comparison
+from edge import edge as edge_original
+from quantjourney_bidask.edge_improved_v1 import edge as edge_improved_v1
+from quantjourney_bidask.edge_improved_v2 import edge as edge_improved_v2
+
+
+def generate_complex_ohlc_data(num_points, initial_price=100.0, annual_vol=0.20, annual_drift=0.05, daily_spread_pct=0.005, overnight_vol=0.001, seed=42):
+    """Generates synthetic OHLC data with overnight gaps for robust testing."""
+    np.random.seed(seed)
+    dt = 1 / 252.0
+    daily_vol = annual_vol * np.sqrt(dt)
+    daily_drift = annual_drift * dt
+    log_returns = daily_drift + daily_vol * np.random.normal(size=num_points)
+    mid_prices_series = initial_price * np.exp(np.cumsum(log_returns))
+
+    mid_prices_series = np.maximum(mid_prices_series, 1e-6)
+    overnight_returns = np.random.normal(loc=0, scale=overnight_vol, size=num_points)
+    open_prices = mid_prices_series * np.exp(overnight_returns)
+    open_prices = np.roll(open_prices, 1)
+    open_prices[0] = initial_price
+    close_prices = mid_prices_series
+
+    intraday_range_factor = np.random.uniform(daily_vol, daily_vol * 2.5, size=num_points)
+    intraday_range = intraday_range_factor * mid_prices_series
+    high_prices = mid_prices_series + intraday_range / 2.0
+    low_prices = mid_prices_series - intraday_range / 2.0
+
+    spread_component = daily_spread_pct * mid_prices_series
+    high_prices += spread_component / 2.0
+    low_prices -= spread_component / 2.0
+
+    high_prices = np.maximum.reduce([high_prices, open_prices, close_prices])
+    low_prices = np.minimum.reduce([low_prices, open_prices, close_prices])
+
+    high_prices = np.maximum(high_prices, 1e-6)
+    low_prices = np.maximum(low_prices, 1e-6)
+    open_prices = np.maximum(open_prices, 1e-6)
+    close_prices = np.maximum(close_prices, 1e-6)
+
+    return open_prices, high_prices, low_prices, close_prices
+
+# --- Test Case Definitions ---
+NUM_POINTS_10_YEARS = 10 * 252
+open_10y, high_10y, low_10y, close_10y = generate_complex_ohlc_data(NUM_POINTS_10_YEARS, initial_price=500.0, daily_spread_pct=0.005)
+open_small, high_small, low_small, close_small = [100.0, 101.5, 99.8, 102.1, 100.9], [102.3, 103.0, 101.2, 103.5, 102.0], [99.5, 100.8, 98.9, 101.0, 100.1], [101.2, 102.5, 100.3, 102.8, 101.5]
+open_invalid, high_invalid, low_invalid, close_invalid = [100.0, 101.5, 99.8], [99.0, 103.0, 101.2], [99.5, 100.8, 98.9], [101.2, 102.5, 100.3]
+open_nan, high_nan, low_nan, close_nan = [np.nan] * 5, [np.nan] * 5, [np.nan] * 5, [np.nan] * 5
+open_non_positive, high_non_positive, low_non_positive, close_non_positive = [100.0, 0.0, 99.8], [102.0, 103.0, 101.2], [99.5, 100.8, 98.9], [101.2, 102.5, 100.3]
+open_near_zero_diff, high_near_zero_diff, low_near_zero_diff, close_near_zero_diff = [100.0, 100.00000001, 100.00000002, 100.0, 100.00000001], [100.00000002, 100.00000003, 100.00000004, 100.00000002, 100.00000003], [99.99999998, 99.99999997, 99.99999996, 99.99999998, 99.99999997], [100.00000001, 100.00000002, 100.00000001, 100.00000001, 100.00000002]
+open_partial_nan, high_partial_nan, low_partial_nan, close_partial_nan = [100.0, np.nan, 99.8, 102.1, np.nan], [102.3, 103.0, 101.2, 103.5, 102.0], [99.5, 100.8, 98.9, 101.0, 100.1], [101.2, 102.5, 100.3, 102.8, 101.5]
+open_low_variability, high_low_variability, low_low_variability, close_low_variability = [100.0, 100.01, 100.02, 100.01, 100.0], [100.03, 100.04, 100.05, 100.04, 100.03], [99.97, 99.96, 99.95, 99.96, 99.97], [100.01, 100.02, 100.01, 100.02, 100.01]
+
+test_cases = [
+    {"name": f"Large Dataset ({NUM_POINTS_10_YEARS} points)", "open": open_10y, "high": high_10y, "low": low_10y, "close": close_10y},
+    {"name": "Small Dataset (5 points)", "open": open_small, "high": high_small, "low": low_small, "close": close_small},
+    {"name": "Invalid OHLC (high < low)", "open": open_invalid, "high": high_invalid, "low": low_invalid, "close": close_invalid},
+    {"name": "All NaN", "open": open_nan, "high": high_nan, "low": low_nan, "close": close_nan},
+    {"name": "Non-positive Prices", "open": open_non_positive, "high": high_non_positive, "low": low_non_positive, "close": close_non_positive},
+    {"name": "Near-zero Differences", "open": open_near_zero_diff, "high": high_near_zero_diff, "low": low_near_zero_diff, "close": close_near_zero_diff},
+    {"name": "Partial NaN", "open": open_partial_nan, "high": high_partial_nan, "low": low_partial_nan, "close": close_partial_nan},
+    {"name": "Low Variability", "open": open_low_variability, "high": high_low_variability, "low": low_low_variability, "close": close_low_variability},
+]
+
+# --- Numba Warm-up ---
+# The first call to a Numba function includes compilation time.
+# We run each once on a small dataset so subsequent timings measure execution only.
+print("Warming up Numba JIT compilers (this may take a moment)...")
+try:
+    edge_improved_v1(open_small, high_small, low_small, close_small)
+    edge_improved_v2(open_small, high_small, low_small, close_small)
+except Exception as e:
+    print(f"An error occurred during warm-up: {e}")
+print("Warm-up complete.\n")
+
+
+# --- Main Comparison ---
+print("="*80)
+print("Comparing edge_original vs. edge_improved_v1 vs. edge_improved_v2")
+print("="*80 + "\n")
+
+for test in test_cases:
+    name = test["name"]
+    open_p, high_p, low_p, close_p = test["open"], test["high"], test["low"], test["close"]
+
+    # --- Run original function and time it ---
+    try:
+        start_time = time.perf_counter()
+        result_original = edge_original(open_p, high_p, low_p, close_p)
+        time_original = time.perf_counter() - start_time
+    except Exception as e:
+        result_original, time_original = f"Error: {type(e).__name__}", -1
+
+    # --- Run improved_v1 function and time it ---
+    try:
+        start_time = time.perf_counter()
+        result_v1 = edge_improved_v1(open_p, high_p, low_p, close_p)
+        time_v1 = time.perf_counter() - start_time
+    except Exception as e:
+        result_v1, time_v1 = f"Error: {type(e).__name__}", -1
+
+    # --- Run improved_v2 (hyper-optimized) function and time it ---
+    try:
+        start_time = time.perf_counter()
+        result_v2 = edge_improved_v2(open_p, high_p, low_p, close_p)
+        time_v2 = time.perf_counter() - start_time
+    except Exception as e:
+        result_v2, time_v2 = f"Error: {type(e).__name__}", -1
+
+    # --- Reporting ---
+    print(f"--- Test Case: {name} ---")
+    print(f"  Original:    {result_original:<25} (Time: {time_original*1000:.4f} ms)")
+    print(f"  Improved v1: {result_v1:<25} (Time: {time_v1*1000:.4f} ms)")
+    print(f"  Improved v2: {result_v2:<25} (Time: {time_v2*1000:.4f} ms)")
+
+    # Check numerical equivalence against the original baseline
+    is_v1_ok = np.isclose(result_original, result_v1, rtol=1e-9, atol=1e-12, equal_nan=True) if isinstance(result_original, float) and isinstance(result_v1, float) else str(result_original) == str(result_v1)
+    is_v2_ok = np.isclose(result_original, result_v2, rtol=1e-9, atol=1e-12, equal_nan=True) if isinstance(result_original, float) and isinstance(result_v2, float) else str(result_original) == str(result_v2)
+
+    status_v1 = "\033[92mPASS\033[0m" if is_v1_ok else "\033[91mFAIL\033[0m"
+    status_v2 = "\033[92mPASS\033[0m" if is_v2_ok else "\033[91mFAIL\033[0m"
+    print(f"  Equivalence (v1/v2 vs Original): {status_v1} / {status_v2}")
+
+    # Performance reporting
+    perf_string_v1, perf_string_v2 = "N/A", "N/A"
+    if time_original > 0 and time_v1 > 0:
+        perf_string_v1 = f"{time_original / time_v1:.2f}x"
+    if time_original > 0 and time_v2 > 0:
+        perf_string_v2 = f"{time_original / time_v2:.2f}x"
+
+    print(f"  Speedup (v1/v2 vs Original): {perf_string_v1} / {perf_string_v2}\n")
quantjourney_bidask/edge.py
@@ -1,152 +1,174 @@
-import numpy as np
+"""
+Optimized and robust EDGE estimator for bid-ask spread calculation.
+
+Implements the efficient estimator from Ardia, Guidotti, & Kroencke (2024) for
+single-period bid-ask spread estimation from OHLC prices. This version is
+optimized for speed using Numba and careful memory handling, while ensuring
+numerical identity with the reference implementation.
+
+Author: Jakub Polec
+Date: 2025-06-28
+"""
 import warnings
-from typing import Union, List, Tuple, Any
+from typing import Union, List, Any
+import numpy as np
+from numba import jit
+
+
+@jit(nopython=True, cache=True)
+def _compute_spread_numba(r1, r2, r3, r4, r5, tau, po, pc, pt):
+    """
+    Core spread calculation using Numba for maximum performance.
+    This is the computational bottleneck and benefits most from JIT compilation.
+    """
+    r1_mean = np.nanmean(r1)
+    r3_mean = np.nanmean(r3)
+    r5_mean = np.nanmean(r5)
+
+    # De-mean the returns, scaling by the probability of a valid period (pt).
+    # This aligns with the GMM framework, where moments are conditioned on tau = 1.
+    d1 = r1 - r1_mean / pt * tau
+    d3 = r3 - r3_mean / pt * tau
+    d5 = r5 - r5_mean / pt * tau
+
+    # GMM moment conditions
+    x1 = -4.0 / po * d1 * r2 + -4.0 / pc * d3 * r4  # First moment condition
+    x2 = -4.0 / po * d1 * r5 + -4.0 / pc * d5 * r4  # Second moment condition
+
+    # Expectations of the moment conditions
+    e1 = np.nanmean(x1)
+    e2 = np.nanmean(x2)
+
+    # Variances for optimal weighting
+    v1 = np.nanmean(x1**2) - e1**2
+    v2 = np.nanmean(x2**2) - e2**2
+
+    # Optimal GMM weighting; if total variance is zero or negative
+    # (a rare small-sample issue), fall back to the simple average.
+    vt = v1 + v2
+    s2 = (v2 * e1 + v1 * e2) / vt if vt > 0.0 else (e1 + e2) / 2.0
+
+    return s2
 
 def edge(
-    open: Union[List[float], Any],
+    open_prices: Union[List[float], Any],
     high: Union[List[float], Any],
     low: Union[List[float], Any],
     close: Union[List[float], Any],
-    sign: bool = False
+    sign: bool = False,
+    min_pt: float = 1e-6,  # Minimum-probability robustness check
+    debug: bool = False,
 ) -> float:
     """
-    Estimate the effective bid-ask spread from open, high, low, and close (OHLC) prices.
-
-    Implements the efficient estimator described in Ardia, Guidotti, & Kroencke (2024):
-    https://doi.org/10.1016/j.jfineco.2024.103916. The estimator computes the root mean square
-    effective spread within the sample period using log-returns and indicator variables.
-
-    Parameters
-    ----------
-    open : array-like
-        Vector of open prices, sorted in ascending order of timestamp.
-    high : array-like
-        Vector of high prices, sorted in ascending order of timestamp.
-    low : array-like
-        Vector of low prices, sorted in ascending order of timestamp.
-    close : array-like
-        Vector of close prices, sorted in ascending order of timestamp.
-    sign : bool, default False
-        If True, returns signed estimates (negative values possible). If False, returns
-        absolute values to reduce small-sample bias in averaging or regression studies.
-
-    Returns
-    -------
-    float
-        Estimated bid-ask spread as a fraction of price (e.g., 0.01 = 1% spread).
-        Returns np.nan if the estimate cannot be computed (e.g., insufficient data).
-
-    Notes
-    -----
-    - Requires at least 3 observations for a valid estimate.
-    - Handles missing values (NaNs) automatically by excluding them from calculations.
-    - The estimator assumes prices are positive and non-zero to compute log-prices.
-    - For optimal results, use high-frequency data (e.g., minute or hourly) for frequently
-      traded assets, or lower frequency (e.g., daily) for less liquid assets.
-
-    Examples
-    --------
-    >>> import pandas as pd
-    >>> # Example OHLC data
-    >>> open_prices = [100.0, 101.5, 99.8, 102.1, 100.9]
-    >>> high_prices = [102.3, 103.0, 101.2, 103.5, 102.0]
-    >>> low_prices = [99.5, 100.8, 98.9, 101.0, 100.1]
-    >>> close_prices = [101.2, 102.5, 100.3, 102.8, 101.5]
-    >>> spread = edge(open_prices, high_prices, low_prices, close_prices)
-    >>> print(f"Estimated spread: {spread:.6f}")
-    Estimated spread: 0.007109
+    Estimate the effective bid-ask spread from OHLC prices.
+
+    Implements the efficient estimator described in Ardia, Guidotti, & Kroencke
+    (2024): https://doi.org/10.1016/j.jfineco.2024.103916.
+
+    Args:
+        open_prices : array-like
+            Vector of open prices.
+        high : array-like
+            Vector of high prices.
+        low : array-like
+            Vector of low prices.
+        close : array-like
+            Vector of close prices.
+        sign : bool, default False
+            If True, returns signed estimates. If False, returns absolute values.
+        min_pt : float, default 1e-6
+            Minimum probability threshold for tau to ensure reliable estimates.
+        debug : bool, default False
+            If True, prints intermediate values.
+
+    Returns:
+        float
+            Estimated bid-ask spread. Returns np.nan if invalid.
+
+    Examples:
+        >>> import numpy as np
+        >>> from edge import edge
+        >>> open_prices = np.array([100.0, 101.5, 99.8, 102.1, 100.9])
+        >>> high = np.array([102.3, 103.0, 101.2, 103.5, 102.0])
+        >>> low = np.array([99.5, 100.8, 98.9, 101.0, 100.1])
+        >>> close = np.array([101.2, 102.5, 100.3, 102.8, 101.5])
+        >>> spread = edge(open_prices, high, low, close)
+        >>> print(f"Estimated spread: {spread:.6f}")
+        Estimated spread: 0.007109
     """
-    # Convert inputs to numpy arrays
-    open = np.asarray(open, dtype=float)
-    high = np.asarray(high, dtype=float)
-    low = np.asarray(low, dtype=float)
-    close = np.asarray(close, dtype=float)
-
-    # Validate input lengths
-    nobs = len(open)
-    if len(high) != nobs or len(low) != nobs or len(close) != nobs:
-        raise ValueError("Open, high, low, and close must have the same length")
-
-    # Return NaN if insufficient observations
-    if nobs < 3:
+    # --- 1. Input Validation and Conversion ---
+    o_arr = np.asarray(open_prices, dtype=float)
+    h_arr = np.asarray(high, dtype=float)
+    l_arr = np.asarray(low, dtype=float)
+    c_arr = np.asarray(close, dtype=float)
+
+    nobs = len(o_arr)
+    if not (len(h_arr) == nobs and len(l_arr) == nobs and len(c_arr) == nobs):
+        raise ValueError("Input arrays must have the same length.")
+
+    if nobs < 3:  # Fewer than 3 observations cannot yield an estimate
+        if debug: print("NaN reason: nobs < 3")
         return np.nan
 
-    # Compute log-prices, handling non-positive prices
+    # --- 2. Log-Price Calculation ---
     with warnings.catch_warnings():
         warnings.simplefilter("ignore", RuntimeWarning)
-        o = np.log(np.where(open > 0, open, np.nan))
-        h = np.log(np.where(high > 0, high, np.nan))
-        l = np.log(np.where(low > 0, low, np.nan))
-        c = np.log(np.where(close > 0, close, np.nan))
-        m = (h + l) / 2.0  # Mid-price log
-
-    # Shift log-prices by one period
-    h1, l1, c1, m1 = h[:-1], l[:-1], c[:-1], m[:-1]
-    o, h, l, c, m = o[1:], h[1:], l[1:], c[1:], m[1:]
-
-    # Compute log-returns
-    r1 = m - o  # Mid - Open
-    r2 = o - m1  # Open - Previous Mid
-    r3 = m - c1  # Mid - Previous Close
-    r4 = c1 - m1  # Previous Close - Previous Mid
-    r5 = o - c1  # Open - Previous Close
-
-    # Compute indicator variables
-    # tau: Indicator for valid price variation (1 if high != low or low != previous close)
-    tau = np.where(np.isnan(h) | np.isnan(l) | np.isnan(c1), np.nan,
-                   ((h != l) | (l != c1)).astype(float))
-
-    # po1: Indicator for open price not equal to high, scaled by tau
-    po1 = tau * np.where(np.isnan(o) | np.isnan(h), np.nan, (o != h).astype(float))
-
-    # po2: Indicator for open price not equal to low, scaled by tau
-    po2 = tau * np.where(np.isnan(o) | np.isnan(l), np.nan, (o != l).astype(float))
-
-    # pc1: Indicator for previous close not equal to previous high, scaled by tau
-    pc1 = tau * np.where(np.isnan(c1) | np.isnan(h1), np.nan, (c1 != h1).astype(float))
-
-    # pc2: Indicator for previous close not equal to previous low, scaled by tau
-    pc2 = tau * np.where(np.isnan(c1) | np.isnan(l1), np.nan, (c1 != l1).astype(float))
-
-    # Compute probabilities with NaN handling
+        # Replace non-positive prices with NaN to avoid log(0) or log of a negative
+        o = np.log(np.where(o_arr > 0, o_arr, np.nan))
+        h = np.log(np.where(h_arr > 0, h_arr, np.nan))
+        l = np.log(np.where(l_arr > 0, l_arr, np.nan))
+        c = np.log(np.where(c_arr > 0, c_arr, np.nan))
+        m = (h + l) / 2.0  # Log mid-price
+
+    # --- 3. Shift Arrays for Lagged Calculations (the critical fix) ---
+    # All calculations from here on use N-1 observations.
+    o_t = o[1:]     # Log open at time t
+    h_t = h[1:]     # Log high at time t
+    l_t = l[1:]     # Log low at time t
+    m_t = m[1:]     # Log mid at time t
+
+    h_tm1 = h[:-1]  # Log high at time t-1
+    l_tm1 = l[:-1]  # Log low at time t-1
+    c_tm1 = c[:-1]  # Log close at time t-1
+    m_tm1 = m[:-1]  # Log mid at time t-1
+
+    # --- 4. Compute Log-Returns ---
+    r1 = m_t - o_t      # Mid - open
+    r2 = o_t - m_tm1    # Open - previous mid
+    r3 = m_t - c_tm1    # Mid - previous close
+    r4 = c_tm1 - m_tm1  # Previous close - previous mid
+    r5 = o_t - c_tm1    # Open - previous close
+
+    # --- 5. Compute Indicator Variables ---
+    tau = np.where(np.isnan(h_t) | np.isnan(l_t) | np.isnan(c_tm1), np.nan, ((h_t != l_t) | (l_t != c_tm1)).astype(float))
+    po1 = tau * np.where(np.isnan(o_t) | np.isnan(h_t), np.nan, (o_t != h_t).astype(float))
+    po2 = tau * np.where(np.isnan(o_t) | np.isnan(l_t), np.nan, (o_t != l_t).astype(float))
+    pc1 = tau * np.where(np.isnan(c_tm1) | np.isnan(h_tm1), np.nan, (c_tm1 != h_tm1).astype(float))
+    pc2 = tau * np.where(np.isnan(c_tm1) | np.isnan(l_tm1), np.nan, (c_tm1 != l_tm1).astype(float))
+
+    # --- 6. Compute Probabilities ---
     with warnings.catch_warnings():
         warnings.simplefilter("ignore", RuntimeWarning)
-        pt = np.nanmean(tau)
-        po = np.nanmean(po1) + np.nanmean(po2)
-        pc = np.nanmean(pc1) + np.nanmean(pc2)
-
-    # Return NaN if insufficient valid periods or probabilities are zero
-    if np.nansum(tau) < 2 or po == 0 or pc == 0:
-        return np.nan
+        pt = np.nanmean(tau)                    # Probability of a valid period
+        po = np.nanmean(po1) + np.nanmean(po2)  # Combined probability that the open sits away from the high/low
+        pc = np.nanmean(pc1) + np.nanmean(pc2)  # Combined probability that the previous close sits away from the previous high/low
 
-    # Compute de-meaned log-returns
-    d1 = r1 - np.nanmean(r1) / pt * tau
-    d3 = r3 - np.nanmean(r3) / pt * tau
-    d5 = r5 - np.nanmean(r5) / pt * tau
+    if debug:
+        print(f"Debug: tau_sum={np.nansum(tau):.2f}, po={po:.4f}, pc={pc:.4f}, pt={pt:.4f}")
 
-    # Compute input vectors for GMM estimation
-    # x1: First moment condition combining open-high-low and close-high-low effects
-    x1 = -4.0 / po * d1 * r2 + -4.0 / pc * d3 * r4  # Scaled by probability of open/close extremes
-    # x2: Second moment condition combining open-high-low-close and close-high-low-open effects
-    x2 = -4.0 / po * d1 * r5 + -4.0 / pc * d5 * r4
-
-    # Compute expectations (means) of the moment conditions
-    e1 = np.nanmean(x1)  # First moment expectation
-    e2 = np.nanmean(x2)  # Second moment expectation
-
-    # Compute variances of the moment conditions for optimal weighting
-    v1 = np.nanmean(x1**2) - e1**2  # Variance of first moment
-    v2 = np.nanmean(x2**2) - e2**2  # Variance of second moment
+    # --- 7. Check for Data Quality ---
+    if np.nansum(tau) < 2 or po == 0.0 or pc == 0.0 or pt < min_pt:
+        if debug: print(f"NaN reason: Insufficient valid data (tau_sum={np.nansum(tau)}, po={po}, pc={pc}, pt={pt})")
+        return np.nan
 
-    # Compute squared spread estimate using optimal GMM weights
-    vt = v1 + v2  # Total variance for weighting
-    # If total variance is positive, use optimal weighted average
-    # Otherwise fall back to simple average of the two estimates
-    s2 = (v2 * e1 + v1 * e2) / vt if vt > 0 else (e1 + e2) / 2.0
+    # --- 8. Compute Spread (using the Numba-optimized kernel) ---
+    s2 = _compute_spread_numba(r1, r2, r3, r4, r5, tau, po, pc, pt)
+
+    if np.isnan(s2):
+        if debug: print("NaN reason: s2 calculation resulted in NaN")
+        return np.nan
 
-    # Compute signed root
     s = np.sqrt(np.abs(s2))
     if sign:
-        s *= np.sign(s2)
+        s *= np.sign(s2)  # Signed spread estimate
+
+    if debug:
+        print(f"Debug: s2={s2:.6e}, s={s:.6e}")
 
     return float(s)
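
For reference, `_compute_spread_numba` combines the two moment conditions by
inverse-variance weighting. With $e_j$ the sample mean and $v_j$ the sample
variance of $x_j$, the kernel and the final root in `edge` compute:

    s^2 = \frac{v_2 e_1 + v_1 e_2}{v_1 + v_2}, \qquad
    s = \operatorname{sign}(s^2)\,\sqrt{\lvert s^2 \rvert}
    \quad \text{(the sign factor applied only when sign=True)}

The inverse-variance weights minimize the variance of the combined estimate,
which is why the code falls back to the simple average $(e_1 + e_2)/2$ when
$v_1 + v_2$ is not positive.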
quantjourney_bidask/edge_expanding.py
@@ -1,65 +1,51 @@
+"""
+Expanding window EDGE estimator implementation.
+
+This module provides an expanding window implementation of the EDGE estimator,
+ensuring compatibility with all pandas windowing features like 'step'.
+
+Author: Jakub Polec
+Date: 2025-06-28
+
+Part of the QuantJourney framework - The framework with advanced quantitative
+finance tools and insights.
+"""
+import warnings
+import numpy as np
 import pandas as pd
-from typing import Union
-from .edge import edge
-from .edge_rolling import edge_rolling
+from .edge import edge as edge_single  # Import the core, fast estimator
 
 def edge_expanding(
     df: pd.DataFrame,
-    min_periods: int = 1,
-    sign: bool = False
+    min_periods: int = 3,
+    sign: bool = False,
 ) -> pd.Series:
-    """
-    Compute expanding window estimates of the bid-ask spread from OHLC prices.
-
-    Uses the efficient estimator from Ardia, Guidotti, & Kroencke (2024):
-    https://doi.org/10.1016/j.jfineco.2024.103916. Calculates spreads over
-    expanding windows starting from the first observation.
-
-    Parameters
-    ----------
-    df : pd.DataFrame
-        DataFrame with columns 'open', 'high', 'low', 'close' (case-insensitive).
-    min_periods : int, default 1
-        Minimum number of observations required for an estimate. Note that
-        at least 3 observations are needed for a non-NaN result.
-    sign : bool, default False
-        If True, returns signed estimates. If False, returns absolute values.
-
-    Returns
-    -------
-    pd.Series
-        Series of expanding spread estimates, indexed by the DataFrame's index.
-        A value of 0.01 corresponds to a 1% spread. NaN for periods with
-        insufficient data.
-
-    Notes
-    -----
-    - The function leverages `edge_rolling` with a window equal to the DataFrame length.
-    - Missing values are handled automatically.
-    - The estimator is most reliable with sufficient data (e.g., 20+ observations).
-
-    Examples
-    --------
-    >>> import pandas as pd
-    >>> # Example OHLC DataFrame
-    >>> df = pd.DataFrame({
-    ...     'open': [100.0, 101.5, 99.8, 102.1, 100.9, 103.2],
-    ...     'high': [102.3, 103.0, 101.2, 103.5, 102.0, 104.8],
-    ...     'low': [99.5, 100.8, 98.9, 101.0, 100.1, 102.5],
-    ...     'close': [101.2, 102.5, 100.3, 102.8, 101.5, 104.1]
-    ... })
-    >>> spreads = edge_expanding(df, min_periods=3)
-    >>> print(spreads.dropna())
-    """
-    # Standardize column names
-    df = df.rename(columns=str.lower).copy()
-    required_cols = ['open', 'high', 'low', 'close']
-    if not all(col in df.columns for col in required_cols):
-        raise ValueError("DataFrame must contain 'open', 'high', 'low', 'close' columns")
+    """Computes expanding EDGE estimates by calling the core estimator on a growing window."""
+    if min_periods < 3:
+        warnings.warn("min_periods < 3 is not recommended, setting to 3.", UserWarning)
+        min_periods = 3
+
+    # Prepare data
+    df_proc = df.rename(columns=str.lower).copy()
+    open_p = df_proc["open"].values
+    high_p = df_proc["high"].values
+    low_p = df_proc["low"].values
+    close_p = df_proc["close"].values
 
-    return edge_rolling(
-        df=df,
-        window=len(df),
-        min_periods=max(min_periods, 3),
-        sign=sign
-    )
+    n = len(df_proc)
+    estimates = np.full(n, np.nan)
+
+    # Expanding window: estimate on all observations up to and including index i
+    for i in range(n):
+        t1 = i + 1
+        if t1 >= min_periods:
+            # Call the fast, single-shot edge estimator on the expanding slice
+            estimates[i] = edge_single(
+                open_p[:t1],
+                high_p[:t1],
+                low_p[:t1],
+                close_p[:t1],
+                sign=sign,
+            )
+
+    return pd.Series(estimates, index=df_proc.index, name="EDGE_expanding")
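
Note that the new `edge_expanding` re-runs the single-shot estimator on every
prefix of the data, so total work grows quadratically with series length; the
0.9.3 version instead delegated to `edge_rolling` with a window equal to the
full DataFrame. A usage sketch, reusing the OHLC sample from the removed 0.9.3
docstring and assuming the 1.0 wheel is installed:

    import pandas as pd
    from quantjourney_bidask import edge_expanding

    df = pd.DataFrame({
        "open":  [100.0, 101.5, 99.8, 102.1, 100.9, 103.2],
        "high":  [102.3, 103.0, 101.2, 103.5, 102.0, 104.8],
        "low":   [99.5, 100.8, 98.9, 101.0, 100.1, 102.5],
        "close": [101.2, 102.5, 100.3, 102.8, 101.5, 104.1],
    })
    print(edge_expanding(df, min_periods=3).dropna())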