quantjourney-bidask 0.9.3__py3-none-any.whl → 1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
quantjourney_bidask/__init__.py
@@ -1,8 +1,34 @@
+"""
+QuantJourney Bid-Ask Spread Estimator - Core Library.
+
+Efficient estimation of bid-ask spreads from OHLC prices using the methodology
+from Ardia, Guidotti, & Kroencke (2024).
+
+Author: Jakub Polec
+Date: 2025-06-28
+
+Part of the QuantJourney framework - The framework with advanced quantitative
+finance tools and insights.
+"""
+
 from .edge import edge
-from .edge_rolling import edge_rolling
 from .edge_expanding import edge_expanding
-from .data_fetcher import fetch_binance_data, fetch_yfinance_data
-from .websocket_fetcher import LiveSpreadMonitor
-from ._version import __version__, __author__, __email__, __license__
+from .edge_rolling import edge_rolling
+
+# Import version from package metadata
+try:
+    from importlib.metadata import metadata, version
+
+    __version__ = version("quantjourney-bidask")
+    _meta = metadata("quantjourney-bidask")
+    __author__ = "Jakub Polec"
+    __email__ = "jakub@quantjourney.pro"
+    __license__ = "MIT"
+except ImportError:
+    # Fallback for development mode
+    __version__ = "X.Y"
+    __author__ = "Jakub Polec"
+    __email__ = "jakub@quantjourney.pro"
+    __license__ = "MIT"
 
-__all__ = ['edge', 'edge_rolling', 'edge_expanding', 'fetch_binance_data', 'fetch_yfinance_data', 'LiveSpreadMonitor']
+__all__ = ["edge", "edge_rolling", "edge_expanding"]
compare_edge_v2.py (new file)
@@ -0,0 +1,152 @@
+# compare_edge_v2.py
+"""
+Comprehensive comparison script for EDGE estimator implementations.
+
+This script benchmarks three versions of the EDGE bid-ask spread estimator:
+1. `edge_original`: The baseline, pure NumPy implementation.
+2. `edge_improved_v1`: Optimized with a modular structure and a Numba kernel
+   for the core calculation.
+3. `edge_improved_v2`: Hyper-optimized with a single, monolithic Numba kernel
+   to minimize Python overhead and maximize compiler optimizations.
+
+The script validates that the optimized versions are numerically identical to the
+original (within floating-point tolerances) and quantifies the performance gains
+across a variety of test datasets.
+
+To Run:
+1. Ensure `edge.py` is in the same directory, and that the `quantjourney_bidask`
+   package (providing `edge_improved_v1` and `edge_improved_v2`) is importable.
+2. Execute from the terminal: `python compare_edge_v2.py`
+"""
+import time
+import numpy as np
+
+# Import the three versions of the edge function for comparison
+from edge import edge as edge_original
+from quantjourney_bidask.edge_improved_v1 import edge as edge_improved_v1
+from quantjourney_bidask.edge_improved_v2 import edge as edge_improved_v2
+
+
+def generate_complex_ohlc_data(num_points, initial_price=100.0, annual_vol=0.20, annual_drift=0.05, daily_spread_pct=0.005, overnight_vol=0.001, seed=42):
+    """Generates synthetic OHLC data with overnight gaps for robust testing."""
+    np.random.seed(seed)
+    dt = 1 / 252.0
+    daily_vol = annual_vol * np.sqrt(dt)
+    daily_drift = annual_drift * dt
+    log_returns = daily_drift + daily_vol * np.random.normal(size=num_points)
+    mid_prices_series = initial_price * np.exp(np.cumsum(log_returns))
+
+    mid_prices_series = np.maximum(mid_prices_series, 1e-6)
+    overnight_returns = np.random.normal(loc=0, scale=overnight_vol, size=num_points)
+    open_prices = mid_prices_series * np.exp(overnight_returns)
+    open_prices = np.roll(open_prices, 1)
+    open_prices[0] = initial_price
+    close_prices = mid_prices_series
+
+    intraday_range_factor = np.random.uniform(daily_vol, daily_vol * 2.5, size=num_points)
+    intraday_range = intraday_range_factor * mid_prices_series
+    high_prices = mid_prices_series + intraday_range / 2.0
+    low_prices = mid_prices_series - intraday_range / 2.0
+
+    spread_component = daily_spread_pct * mid_prices_series
+    high_prices += spread_component / 2.0
+    low_prices -= spread_component / 2.0
+
+    high_prices = np.maximum.reduce([high_prices, open_prices, close_prices])
+    low_prices = np.minimum.reduce([low_prices, open_prices, close_prices])
+
+    high_prices = np.maximum(high_prices, 1e-6)
+    low_prices = np.maximum(low_prices, 1e-6)
+    open_prices = np.maximum(open_prices, 1e-6)
+    close_prices = np.maximum(close_prices, 1e-6)
+
+    return open_prices, high_prices, low_prices, close_prices
+
+# --- Test Case Definitions ---
+NUM_POINTS_10_YEARS = 10 * 252
+open_10y, high_10y, low_10y, close_10y = generate_complex_ohlc_data(NUM_POINTS_10_YEARS, initial_price=500.0, daily_spread_pct=0.005)
+open_small, high_small, low_small, close_small = [100.0, 101.5, 99.8, 102.1, 100.9], [102.3, 103.0, 101.2, 103.5, 102.0], [99.5, 100.8, 98.9, 101.0, 100.1], [101.2, 102.5, 100.3, 102.8, 101.5]
+open_invalid, high_invalid, low_invalid, close_invalid = [100.0, 101.5, 99.8], [99.0, 103.0, 101.2], [99.5, 100.8, 98.9], [101.2, 102.5, 100.3]
+open_nan, high_nan, low_nan, close_nan = [np.nan] * 5, [np.nan] * 5, [np.nan] * 5, [np.nan] * 5
+open_non_positive, high_non_positive, low_non_positive, close_non_positive = [100.0, 0.0, 99.8], [102.0, 103.0, 101.2], [99.5, 100.8, 98.9], [101.2, 102.5, 100.3]
+open_near_zero_diff, high_near_zero_diff, low_near_zero_diff, close_near_zero_diff = [100.0, 100.00000001, 100.00000002, 100.0, 100.00000001], [100.00000002, 100.00000003, 100.00000004, 100.00000002, 100.00000003], [99.99999998, 99.99999997, 99.99999996, 99.99999998, 99.99999997], [100.00000001, 100.00000002, 100.00000001, 100.00000001, 100.00000002]
+open_partial_nan, high_partial_nan, low_partial_nan, close_partial_nan = [100.0, np.nan, 99.8, 102.1, np.nan], [102.3, 103.0, 101.2, 103.5, 102.0], [99.5, 100.8, 98.9, 101.0, 100.1], [101.2, 102.5, 100.3, 102.8, 101.5]
+open_low_variability, high_low_variability, low_low_variability, close_low_variability = [100.0, 100.01, 100.02, 100.01, 100.0], [100.03, 100.04, 100.05, 100.04, 100.03], [99.97, 99.96, 99.95, 99.96, 99.97], [100.01, 100.02, 100.01, 100.02, 100.01]
+
+test_cases = [
+    {"name": f"Large Dataset ({NUM_POINTS_10_YEARS} points)", "open": open_10y, "high": high_10y, "low": low_10y, "close": close_10y},
+    {"name": "Small Dataset (5 points)", "open": open_small, "high": high_small, "low": low_small, "close": close_small},
+    {"name": "Invalid OHLC (high < low)", "open": open_invalid, "high": high_invalid, "low": low_invalid, "close": close_invalid},
+    {"name": "All NaN", "open": open_nan, "high": high_nan, "low": low_nan, "close": close_nan},
+    {"name": "Non-positive Prices", "open": open_non_positive, "high": high_non_positive, "low": low_non_positive, "close": close_non_positive},
+    {"name": "Near-zero Differences", "open": open_near_zero_diff, "high": high_near_zero_diff, "low": low_near_zero_diff, "close": close_near_zero_diff},
+    {"name": "Partial NaN", "open": open_partial_nan, "high": high_partial_nan, "low": low_partial_nan, "close": close_partial_nan},
+    {"name": "Low Variability", "open": open_low_variability, "high": high_low_variability, "low": low_low_variability, "close": close_low_variability},
+]
+
+# --- Numba Warm-up ---
+# The first call to a Numba function includes compilation time.
+# We run each once on a small dataset so subsequent timings measure execution only.
+print("Warming up Numba JIT compilers (this may take a moment)...")
+try:
+    edge_improved_v1(open_small, high_small, low_small, close_small)
+    edge_improved_v2(open_small, high_small, low_small, close_small)
+except Exception as e:
+    print(f"An error occurred during warm-up: {e}")
+print("Warm-up complete.\n")
+
+
+# --- Main Comparison ---
+print("="*80)
+print("Comparing edge_original vs. edge_improved_v1 vs. edge_improved_v2")
+print("="*80 + "\n")
+
+for test in test_cases:
+    name = test["name"]
+    open_p, high_p, low_p, close_p = test["open"], test["high"], test["low"], test["close"]
+
+    # --- Run original function and time it ---
+    try:
+        start_time = time.perf_counter()
+        result_original = edge_original(open_p, high_p, low_p, close_p)
+        time_original = time.perf_counter() - start_time
+    except Exception as e:
+        result_original, time_original = f"Error: {type(e).__name__}", -1
+
+    # --- Run improved_v1 function and time it ---
+    try:
+        start_time = time.perf_counter()
+        result_v1 = edge_improved_v1(open_p, high_p, low_p, close_p)
+        time_v1 = time.perf_counter() - start_time
+    except Exception as e:
+        result_v1, time_v1 = f"Error: {type(e).__name__}", -1
+
+    # --- Run improved_v2 (hyper-optimized) function and time it ---
+    try:
+        start_time = time.perf_counter()
+        result_v2 = edge_improved_v2(open_p, high_p, low_p, close_p)
+        time_v2 = time.perf_counter() - start_time
+    except Exception as e:
+        result_v2, time_v2 = f"Error: {type(e).__name__}", -1
+
+    # --- Reporting ---
+    print(f"--- Test Case: {name} ---")
+    print(f"  Original:    {result_original:<25} (Time: {time_original*1000:.4f} ms)")
+    print(f"  Improved v1: {result_v1:<25} (Time: {time_v1*1000:.4f} ms)")
+    print(f"  Improved v2: {result_v2:<25} (Time: {time_v2*1000:.4f} ms)")
+
+    # Check numerical equivalence against the original baseline
+    is_v1_ok = np.isclose(result_original, result_v1, rtol=1e-9, atol=1e-12, equal_nan=True) if isinstance(result_original, float) and isinstance(result_v1, float) else str(result_original) == str(result_v1)
+    is_v2_ok = np.isclose(result_original, result_v2, rtol=1e-9, atol=1e-12, equal_nan=True) if isinstance(result_original, float) and isinstance(result_v2, float) else str(result_original) == str(result_v2)
+
+    status_v1 = "\033[92mPASS\033[0m" if is_v1_ok else "\033[91mFAIL\033[0m"
+    status_v2 = "\033[92mPASS\033[0m" if is_v2_ok else "\033[91mFAIL\033[0m"
+    print(f"  Equivalence (v1/v2 vs Original): {status_v1} / {status_v2}")
+
+    # Performance reporting
+    perf_string_v1, perf_string_v2 = "N/A", "N/A"
+    if time_original > 0 and time_v1 > 0:
+        perf_string_v1 = f"{time_original / time_v1:.2f}x"
+    if time_original > 0 and time_v2 > 0:
+        perf_string_v2 = f"{time_original / time_v2:.2f}x"
+
+    print(f"  Speedup (v1/v2 vs Original): {perf_string_v1} / {perf_string_v2}\n")
quantjourney_bidask/edge.py
@@ -1,152 +1,174 @@
-import numpy as np
+"""
+Optimized and robust EDGE estimator for bid-ask spread calculation.
+
+Implements the efficient estimator from Ardia, Guidotti, & Kroencke (2024) for
+single-period bid-ask spread estimation from OHLC prices. This version is
+optimized for speed using Numba and careful memory handling, while ensuring
+numerical identity with the reference implementation.
+
+Author: Jakub Polec
+Date: 2025-06-28
+"""
 import warnings
-from typing import Union, List, Tuple, Any
+from typing import Union, List, Any
+import numpy as np
+from numba import jit
+
+
+@jit(nopython=True, cache=True)
+def _compute_spread_numba(r1, r2, r3, r4, r5, tau, po, pc, pt):
+    """
+    Core spread calculation using Numba for maximum performance.
+    This is the computational bottleneck and benefits most from JIT compilation.
+    """
+    r1_mean = np.nanmean(r1)
+    r3_mean = np.nanmean(r3)
+    r5_mean = np.nanmean(r5)
+
+    # De-mean the returns, scaling by the probability of a valid period (pt).
+    # This aligns with the GMM framework, where moments are conditioned on tau = 1.
+    d1 = r1 - r1_mean / pt * tau
+    d3 = r3 - r3_mean / pt * tau
+    d5 = r5 - r5_mean / pt * tau
+
+    # GMM moment conditions
+    x1 = -4.0 / po * d1 * r2 + -4.0 / pc * d3 * r4  # First moment condition
+    x2 = -4.0 / po * d1 * r5 + -4.0 / pc * d5 * r4  # Second moment condition
+
+    # Expectations of the moment conditions
+    e1 = np.nanmean(x1)
+    e2 = np.nanmean(x2)
+
+    # Variances for optimal weighting
+    v1 = np.nanmean(x1**2) - e1**2
+    v2 = np.nanmean(x2**2) - e2**2
+
+    # Optimal GMM weighting; if total variance is zero or negative
+    # (a rare small-sample issue), fall back to the simple average.
+    vt = v1 + v2
+    s2 = (v2 * e1 + v1 * e2) / vt if vt > 0.0 else (e1 + e2) / 2.0
+
+    return s2
 
 def edge(
-    open: Union[List[float], Any],
+    open_prices: Union[List[float], Any],
     high: Union[List[float], Any],
     low: Union[List[float], Any],
     close: Union[List[float], Any],
-    sign: bool = False
+    sign: bool = False,
+    min_pt: float = 1e-6,  # Minimum-probability robustness check
+    debug: bool = False,
 ) -> float:
     """
-    Estimate the effective bid-ask spread from open, high, low, and close (OHLC) prices.
-
-    Implements the efficient estimator described in Ardia, Guidotti, & Kroencke (2024):
-    https://doi.org/10.1016/j.jfineco.2024.103916. The estimator computes the root mean square
-    effective spread within the sample period using log-returns and indicator variables.
-
-    Parameters
-    ----------
-    open : array-like
-        Vector of open prices, sorted in ascending order of timestamp.
-    high : array-like
-        Vector of high prices, sorted in ascending order of timestamp.
-    low : array-like
-        Vector of low prices, sorted in ascending order of timestamp.
-    close : array-like
-        Vector of close prices, sorted in ascending order of timestamp.
-    sign : bool, default False
-        If True, returns signed estimates (negative values possible). If False, returns
-        absolute values to reduce small-sample bias in averaging or regression studies.
-
-    Returns
-    -------
-    float
-        Estimated bid-ask spread as a fraction of price (e.g., 0.01 = 1% spread).
-        Returns np.nan if the estimate cannot be computed (e.g., insufficient data).
-
-    Notes
-    -----
-    - Requires at least 3 observations for a valid estimate.
-    - Handles missing values (NaNs) automatically by excluding them from calculations.
-    - The estimator assumes prices are positive and non-zero to compute log-prices.
-    - For optimal results, use high-frequency data (e.g., minute or hourly) for frequently
-      traded assets, or lower frequency (e.g., daily) for less liquid assets.
-
-    Examples
-    --------
-    >>> import pandas as pd
-    >>> # Example OHLC data
-    >>> open_prices = [100.0, 101.5, 99.8, 102.1, 100.9]
-    >>> high_prices = [102.3, 103.0, 101.2, 103.5, 102.0]
-    >>> low_prices = [99.5, 100.8, 98.9, 101.0, 100.1]
-    >>> close_prices = [101.2, 102.5, 100.3, 102.8, 101.5]
-    >>> spread = edge(open_prices, high_prices, low_prices, close_prices)
-    >>> print(f"Estimated spread: {spread:.6f}")
-    Estimated spread: 0.007109
+    Estimate the effective bid-ask spread from OHLC prices.
+
+    Implements the efficient estimator described in Ardia, Guidotti, & Kroencke
+    (2024): https://doi.org/10.1016/j.jfineco.2024.103916.
+
+    Args:
+        open_prices : array-like
+            Vector of open prices.
+        high : array-like
+            Vector of high prices.
+        low : array-like
+            Vector of low prices.
+        close : array-like
+            Vector of close prices.
+        sign : bool, default False
+            If True, returns signed estimates. If False, returns absolute values.
+        min_pt : float, default 1e-6
+            Minimum probability threshold for tau to ensure reliable estimates.
+        debug : bool, default False
+            If True, prints intermediate values.
+
+    Returns:
+        float
+            Estimated bid-ask spread. Returns np.nan if invalid.
+
+    Examples:
+        >>> import numpy as np
+        >>> from edge import edge
+        >>> open_prices = np.array([100.0, 101.5, 99.8, 102.1, 100.9])
+        >>> high = np.array([102.3, 103.0, 101.2, 103.5, 102.0])
+        >>> low = np.array([99.5, 100.8, 98.9, 101.0, 100.1])
+        >>> close = np.array([101.2, 102.5, 100.3, 102.8, 101.5])
+        >>> spread = edge(open_prices, high, low, close)
+        >>> print(f"Estimated spread: {spread:.6f}")
+        Estimated spread: 0.007109
     """
-    # Convert inputs to numpy arrays
-    open = np.asarray(open, dtype=float)
-    high = np.asarray(high, dtype=float)
-    low = np.asarray(low, dtype=float)
-    close = np.asarray(close, dtype=float)
-
-    # Validate input lengths
-    nobs = len(open)
-    if len(high) != nobs or len(low) != nobs or len(close) != nobs:
-        raise ValueError("Open, high, low, and close must have the same length")
-
-    # Return NaN if insufficient observations
-    if nobs < 3:
+    # --- 1. Input Validation and Conversion ---
+    o_arr = np.asarray(open_prices, dtype=float)
+    h_arr = np.asarray(high, dtype=float)
+    l_arr = np.asarray(low, dtype=float)
+    c_arr = np.asarray(close, dtype=float)
+
+    nobs = len(o_arr)
+    if not (len(h_arr) == nobs and len(l_arr) == nobs and len(c_arr) == nobs):
+        raise ValueError("Input arrays must have the same length.")
+
+    if nobs < 3:  # Fewer than 3 observations cannot yield an estimate
+        if debug: print("NaN reason: nobs < 3")
         return np.nan
 
-    # Compute log-prices, handling non-positive prices
+    # --- 2. Log-Price Calculation ---
     with warnings.catch_warnings():
         warnings.simplefilter("ignore", RuntimeWarning)
-        o = np.log(np.where(open > 0, open, np.nan))
-        h = np.log(np.where(high > 0, high, np.nan))
-        l = np.log(np.where(low > 0, low, np.nan))
-        c = np.log(np.where(close > 0, close, np.nan))
-        m = (h + l) / 2.0  # Mid-price log
-
-    # Shift log-prices by one period
-    h1, l1, c1, m1 = h[:-1], l[:-1], c[:-1], m[:-1]
-    o, h, l, c, m = o[1:], h[1:], l[1:], c[1:], m[1:]
-
-    # Compute log-returns
-    r1 = m - o  # Mid - Open
-    r2 = o - m1  # Open - Previous Mid
-    r3 = m - c1  # Mid - Previous Close
-    r4 = c1 - m1  # Previous Close - Previous Mid
-    r5 = o - c1  # Open - Previous Close
-
-    # Compute indicator variables
-    # tau: Indicator for valid price variation (1 if high != low or low != previous close)
-    tau = np.where(np.isnan(h) | np.isnan(l) | np.isnan(c1), np.nan,
-                   ((h != l) | (l != c1)).astype(float))
-
-    # po1: Indicator for open price not equal to high, scaled by tau
-    po1 = tau * np.where(np.isnan(o) | np.isnan(h), np.nan, (o != h).astype(float))
-
-    # po2: Indicator for open price not equal to low, scaled by tau
-    po2 = tau * np.where(np.isnan(o) | np.isnan(l), np.nan, (o != l).astype(float))
-
-    # pc1: Indicator for previous close not equal to previous high, scaled by tau
-    pc1 = tau * np.where(np.isnan(c1) | np.isnan(h1), np.nan, (c1 != h1).astype(float))
-
-    # pc2: Indicator for previous close not equal to previous low, scaled by tau
-    pc2 = tau * np.where(np.isnan(c1) | np.isnan(l1), np.nan, (c1 != l1).astype(float))
-
-    # Compute probabilities with NaN handling
+        # Replace non-positive prices with NaN to avoid log(0) or log of a negative
+        o = np.log(np.where(o_arr > 0, o_arr, np.nan))
+        h = np.log(np.where(h_arr > 0, h_arr, np.nan))
+        l = np.log(np.where(l_arr > 0, l_arr, np.nan))
+        c = np.log(np.where(c_arr > 0, c_arr, np.nan))
+        m = (h + l) / 2.0  # Log mid-price
+
+    # --- 3. Shift Arrays for Lagged Calculations (the critical fix) ---
+    # All calculations from here on use N-1 observations.
+    o_t = o[1:]     # Log open at time t
+    h_t = h[1:]     # Log high at time t
+    l_t = l[1:]     # Log low at time t
+    m_t = m[1:]     # Log mid at time t
+
+    h_tm1 = h[:-1]  # Log high at time t-1
+    l_tm1 = l[:-1]  # Log low at time t-1
+    c_tm1 = c[:-1]  # Log close at time t-1
+    m_tm1 = m[:-1]  # Log mid at time t-1
+
+    # --- 4. Compute Log-Returns ---
+    r1 = m_t - o_t      # Mid - open
+    r2 = o_t - m_tm1    # Open - previous mid
+    r3 = m_t - c_tm1    # Mid - previous close
+    r4 = c_tm1 - m_tm1  # Previous close - previous mid
+    r5 = o_t - c_tm1    # Open - previous close
+
+    # --- 5. Compute Indicator Variables ---
+    tau = np.where(np.isnan(h_t) | np.isnan(l_t) | np.isnan(c_tm1), np.nan, ((h_t != l_t) | (l_t != c_tm1)).astype(float))
+    po1 = tau * np.where(np.isnan(o_t) | np.isnan(h_t), np.nan, (o_t != h_t).astype(float))
+    po2 = tau * np.where(np.isnan(o_t) | np.isnan(l_t), np.nan, (o_t != l_t).astype(float))
+    pc1 = tau * np.where(np.isnan(c_tm1) | np.isnan(h_tm1), np.nan, (c_tm1 != h_tm1).astype(float))
+    pc2 = tau * np.where(np.isnan(c_tm1) | np.isnan(l_tm1), np.nan, (c_tm1 != l_tm1).astype(float))
+
+    # --- 6. Compute Probabilities ---
     with warnings.catch_warnings():
         warnings.simplefilter("ignore", RuntimeWarning)
-        pt = np.nanmean(tau)
-        po = np.nanmean(po1) + np.nanmean(po2)
-        pc = np.nanmean(pc1) + np.nanmean(pc2)
-
-    # Return NaN if insufficient valid periods or probabilities are zero
-    if np.nansum(tau) < 2 or po == 0 or pc == 0:
-        return np.nan
+        pt = np.nanmean(tau)                    # Probability of a valid period
+        po = np.nanmean(po1) + np.nanmean(po2)  # Combined probability that the open sits away from the high/low
+        pc = np.nanmean(pc1) + np.nanmean(pc2)  # Combined probability that the previous close sits away from the previous high/low
 
-    # Compute de-meaned log-returns
-    d1 = r1 - np.nanmean(r1) / pt * tau
-    d3 = r3 - np.nanmean(r3) / pt * tau
-    d5 = r5 - np.nanmean(r5) / pt * tau
+    if debug:
+        print(f"Debug: tau_sum={np.nansum(tau):.2f}, po={po:.4f}, pc={pc:.4f}, pt={pt:.4f}")
 
-    # Compute input vectors for GMM estimation
-    # x1: First moment condition combining open-high-low and close-high-low effects
-    x1 = -4.0 / po * d1 * r2 + -4.0 / pc * d3 * r4  # Scaled by probability of open/close extremes
-    # x2: Second moment condition combining open-high-low-close and close-high-low-open effects
-    x2 = -4.0 / po * d1 * r5 + -4.0 / pc * d5 * r4
-
-    # Compute expectations (means) of the moment conditions
-    e1 = np.nanmean(x1)  # First moment expectation
-    e2 = np.nanmean(x2)  # Second moment expectation
-
-    # Compute variances of the moment conditions for optimal weighting
-    v1 = np.nanmean(x1**2) - e1**2  # Variance of first moment
-    v2 = np.nanmean(x2**2) - e2**2  # Variance of second moment
+    # --- 7. Check for Data Quality ---
+    if np.nansum(tau) < 2 or po == 0.0 or pc == 0.0 or pt < min_pt:
+        if debug: print(f"NaN reason: Insufficient valid data (tau_sum={np.nansum(tau)}, po={po}, pc={pc}, pt={pt})")
+        return np.nan
 
-    # Compute squared spread estimate using optimal GMM weights
-    vt = v1 + v2  # Total variance for weighting
-    # If total variance is positive, use optimal weighted average
-    # Otherwise fall back to simple average of the two estimates
-    s2 = (v2 * e1 + v1 * e2) / vt if vt > 0 else (e1 + e2) / 2.0
+    # --- 8. Compute Spread (using the Numba-optimized kernel) ---
+    s2 = _compute_spread_numba(r1, r2, r3, r4, r5, tau, po, pc, pt)
+
+    if np.isnan(s2):
+        if debug: print("NaN reason: s2 calculation resulted in NaN")
+        return np.nan
 
-    # Compute signed root
     s = np.sqrt(np.abs(s2))
     if sign:
-        s *= np.sign(s2)
+        s *= np.sign(s2)  # Signed spread estimate
+
+    if debug:
+        print(f"Debug: s2={s2:.6e}, s={s:.6e}")
 
     return float(s)
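
For reference, `_compute_spread_numba` combines the two moment conditions by
inverse-variance weighting. With $e_j$ the sample mean and $v_j$ the sample
variance of $x_j$, the kernel and the final root in `edge` compute:

    s^2 = \frac{v_2 e_1 + v_1 e_2}{v_1 + v_2}, \qquad
    s = \operatorname{sign}(s^2)\,\sqrt{\lvert s^2 \rvert}
    \quad \text{(the sign factor applied only when sign=True)}

The inverse-variance weights minimize the variance of the combined estimate,
which is why the code falls back to the simple average $(e_1 + e_2)/2$ when
$v_1 + v_2$ is not positive.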
quantjourney_bidask/edge_expanding.py
@@ -1,65 +1,51 @@
+"""
+Expanding window EDGE estimator implementation.
+
+This module provides an expanding window implementation of the EDGE estimator,
+ensuring compatibility with all pandas windowing features like 'step'.
+
+Author: Jakub Polec
+Date: 2025-06-28
+
+Part of the QuantJourney framework - The framework with advanced quantitative
+finance tools and insights.
+"""
+import warnings
+import numpy as np
 import pandas as pd
-from typing import Union
-from .edge import edge
-from .edge_rolling import edge_rolling
+from .edge import edge as edge_single  # Import the core, fast estimator
 
 def edge_expanding(
     df: pd.DataFrame,
-    min_periods: int = 1,
-    sign: bool = False
+    min_periods: int = 3,
+    sign: bool = False,
 ) -> pd.Series:
-    """
-    Compute expanding window estimates of the bid-ask spread from OHLC prices.
-
-    Uses the efficient estimator from Ardia, Guidotti, & Kroencke (2024):
-    https://doi.org/10.1016/j.jfineco.2024.103916. Calculates spreads over
-    expanding windows starting from the first observation.
-
-    Parameters
-    ----------
-    df : pd.DataFrame
-        DataFrame with columns 'open', 'high', 'low', 'close' (case-insensitive).
-    min_periods : int, default 1
-        Minimum number of observations required for an estimate. Note that
-        at least 3 observations are needed for a non-NaN result.
-    sign : bool, default False
-        If True, returns signed estimates. If False, returns absolute values.
-
-    Returns
-    -------
-    pd.Series
-        Series of expanding spread estimates, indexed by the DataFrame's index.
-        A value of 0.01 corresponds to a 1% spread. NaN for periods with
-        insufficient data.
-
-    Notes
-    -----
-    - The function leverages `edge_rolling` with a window equal to the DataFrame length.
-    - Missing values are handled automatically.
-    - The estimator is most reliable with sufficient data (e.g., 20+ observations).
-
-    Examples
-    --------
-    >>> import pandas as pd
-    >>> # Example OHLC DataFrame
-    >>> df = pd.DataFrame({
-    ...     'open': [100.0, 101.5, 99.8, 102.1, 100.9, 103.2],
-    ...     'high': [102.3, 103.0, 101.2, 103.5, 102.0, 104.8],
-    ...     'low': [99.5, 100.8, 98.9, 101.0, 100.1, 102.5],
-    ...     'close': [101.2, 102.5, 100.3, 102.8, 101.5, 104.1]
-    ... })
-    >>> spreads = edge_expanding(df, min_periods=3)
-    >>> print(spreads.dropna())
-    """
-    # Standardize column names
-    df = df.rename(columns=str.lower).copy()
-    required_cols = ['open', 'high', 'low', 'close']
-    if not all(col in df.columns for col in required_cols):
-        raise ValueError("DataFrame must contain 'open', 'high', 'low', 'close' columns")
+    """Computes expanding EDGE estimates by calling the core estimator on a growing window."""
+    if min_periods < 3:
+        warnings.warn("min_periods < 3 is not recommended, setting to 3.", UserWarning)
+        min_periods = 3
+
+    # Prepare data
+    df_proc = df.rename(columns=str.lower).copy()
+    open_p = df_proc["open"].values
+    high_p = df_proc["high"].values
+    low_p = df_proc["low"].values
+    close_p = df_proc["close"].values
 
-    return edge_rolling(
-        df=df,
-        window=len(df),
-        min_periods=max(min_periods, 3),
-        sign=sign
-    )
+    n = len(df_proc)
+    estimates = np.full(n, np.nan)
+
+    # Expanding window: estimate on all observations up to and including index i
+    for i in range(n):
+        t1 = i + 1
+        if t1 >= min_periods:
+            # Call the fast, single-shot edge estimator on the expanding slice
+            estimates[i] = edge_single(
+                open_p[:t1],
+                high_p[:t1],
+                low_p[:t1],
+                close_p[:t1],
+                sign=sign,
+            )
+
+    return pd.Series(estimates, index=df_proc.index, name="EDGE_expanding")
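
Note that the new `edge_expanding` re-runs the single-shot estimator on every
prefix of the data, so total work grows quadratically with series length; the
0.9.3 version instead delegated to `edge_rolling` with a window equal to the
full DataFrame. A usage sketch, reusing the OHLC sample from the removed 0.9.3
docstring and assuming the 1.0 wheel is installed:

    import pandas as pd
    from quantjourney_bidask import edge_expanding

    df = pd.DataFrame({
        "open":  [100.0, 101.5, 99.8, 102.1, 100.9, 103.2],
        "high":  [102.3, 103.0, 101.2, 103.5, 102.0, 104.8],
        "low":   [99.5, 100.8, 98.9, 101.0, 100.1, 102.5],
        "close": [101.2, 102.5, 100.3, 102.8, 101.5, 104.1],
    })
    print(edge_expanding(df, min_periods=3).dropna())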