quantjourney-bidask 0.9.4 → 1.0.1 (py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- quantjourney_bidask/__init__.py +31 -4
- quantjourney_bidask/_compare_edge.py +152 -0
- quantjourney_bidask/edge.py +149 -127
- quantjourney_bidask/edge_expanding.py +45 -57
- quantjourney_bidask/edge_hft.py +126 -0
- quantjourney_bidask/edge_rolling.py +54 -198
- {quantjourney_bidask-0.9.4.dist-info → quantjourney_bidask-1.0.1.dist-info}/METADATA +94 -35
- quantjourney_bidask-1.0.1.dist-info/RECORD +11 -0
- quantjourney_bidask/_version.py +0 -7
- quantjourney_bidask/websocket_fetcher.py +0 -308
- quantjourney_bidask-0.9.4.dist-info/RECORD +0 -11
- {quantjourney_bidask-0.9.4.dist-info → quantjourney_bidask-1.0.1.dist-info}/WHEEL +0 -0
- {quantjourney_bidask-0.9.4.dist-info → quantjourney_bidask-1.0.1.dist-info}/licenses/LICENSE +0 -0
- {quantjourney_bidask-0.9.4.dist-info → quantjourney_bidask-1.0.1.dist-info}/top_level.txt +0 -0
quantjourney_bidask/__init__.py
CHANGED
@@ -1,7 +1,34 @@
+"""
+QuantJourney Bid-Ask Spread Estimator - Core Library.
+
+Efficient estimation of bid-ask spreads from OHLC prices using the methodology
+from Ardia, Guidotti, & Kroencke (2024).
+
+Author: Jakub Polec
+Date: 2025-06-28
+
+Part of the QuantJourney framework - The framework with advanced quantitative
+finance tools and insights.
+"""
+
 from .edge import edge
-from .edge_rolling import edge_rolling
 from .edge_expanding import edge_expanding
-from .
-
+from .edge_rolling import edge_rolling
+
+# Import version from package metadata
+try:
+    from importlib.metadata import metadata, version
+
+    __version__ = version("quantjourney-bidask")
+    _meta = metadata("quantjourney-bidask")
+    __author__ = "Jakub Polec"
+    __email__ = "jakub@quantjourney.pro"
+    __license__ = "MIT"
+except ImportError:
+    # Fallback for development mode
+    __version__ = "X.Y"
+    __author__ = "Jakub Polec"
+    __email__ = "jakub@quantjourney.pro"
+    __license__ = "MIT"
 
-__all__ = [
+__all__ = ["edge", "edge_rolling", "edge_expanding"]
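The new `__init__.py` narrows the public surface to the three estimators and resolves `__version__` from the installed package metadata rather than from the bundled `_version.py`, which this release deletes. A minimal usage sketch against the 1.0.1 API, reusing the OHLC values from the package's own docstring example (the printed spread value is illustrative):

```python
# Assumes the 1.0.1 wheel is installed: pip install quantjourney-bidask
import quantjourney_bidask as qb

print(qb.__version__)  # resolved via importlib.metadata at import time

spread = qb.edge(
    [100.0, 101.5, 99.8, 102.1, 100.9],   # open
    [102.3, 103.0, 101.2, 103.5, 102.0],  # high
    [99.5, 100.8, 98.9, 101.0, 100.1],    # low
    [101.2, 102.5, 100.3, 102.8, 101.5],  # close
)
print(f"Estimated spread: {spread:.6f}")  # 0.01 corresponds to a 1% spread
```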
quantjourney_bidask/_compare_edge.py
ADDED
@@ -0,0 +1,152 @@
+# compare_edge_v2.py
+"""
+Comprehensive comparison script for EDGE estimator implementations.
+
+This script benchmarks three versions of the EDGE bid-ask spread estimator:
+1. `edge_original`: The baseline, pure NumPy implementation.
+2. `edge_improved_v1`: Optimized with a modular structure and a Numba kernel
+   for the core calculation.
+3. `edge_improved_v2`: Hyper-optimized with a single, monolithic Numba kernel
+   to minimize Python overhead and maximize compiler optimizations.
+
+The script validates that the optimized versions are numerically identical to the
+original (within floating-point tolerances) and quantifies the performance gains
+across a variety of test datasets.
+
+To Run:
+1. Ensure `edge.py`, `edge_improved.py`, and `edge_improved_v2.py` are in the same directory.
+2. Execute from the terminal: `python compare_edge_v2.py`
+"""
+import time
+import numpy as np
+
+# Import the three versions of the edge function for comparison
+from edge import edge as edge_original
+from quantjourney_bidask.edge_improved_v1 import edge as edge_improved_v1
+from quantjourney_bidask.edge_improved_v2 import edge as edge_improved_v2
+
+
+def generate_complex_ohlc_data(num_points, initial_price=100.0, annual_vol=0.20, annual_drift=0.05, daily_spread_pct=0.005, overnight_vol=0.001, seed=42):
+    """Generates synthetic OHLC data with overnight gaps for robust testing."""
+    np.random.seed(seed)
+    dt = 1 / 252.0
+    daily_vol = annual_vol * np.sqrt(dt)
+    daily_drift = annual_drift * dt
+    log_returns = daily_drift + daily_vol * np.random.normal(size=num_points)
+    mid_prices_series = initial_price * np.exp(np.cumsum(log_returns))
+
+    mid_prices_series = np.maximum(mid_prices_series, 1e-6)
+    overnight_returns = np.random.normal(loc=0, scale=overnight_vol, size=num_points)
+    open_prices = mid_prices_series * np.exp(overnight_returns)
+    open_prices = np.roll(open_prices, 1)
+    open_prices[0] = initial_price
+    close_prices = mid_prices_series
+
+    intraday_range_factor = np.random.uniform(daily_vol, daily_vol * 2.5, size=num_points)
+    intraday_range = intraday_range_factor * mid_prices_series
+    high_prices = mid_prices_series + intraday_range / 2.0
+    low_prices = mid_prices_series - intraday_range / 2.0
+
+    spread_component = daily_spread_pct * mid_prices_series
+    high_prices += spread_component / 2.0
+    low_prices -= spread_component / 2.0
+
+    high_prices = np.maximum.reduce([high_prices, open_prices, close_prices])
+    low_prices = np.minimum.reduce([low_prices, open_prices, close_prices])
+
+    high_prices = np.maximum(high_prices, 1e-6)
+    low_prices = np.maximum(low_prices, 1e-6)
+    open_prices = np.maximum(open_prices, 1e-6)
+    close_prices = np.maximum(close_prices, 1e-6)
+
+    return open_prices, high_prices, low_prices, close_prices
+
+# --- Test Case Definitions ---
+NUM_POINTS_10_YEARS = 10 * 252
+open_10y, high_10y, low_10y, close_10y = generate_complex_ohlc_data(NUM_POINTS_10_YEARS, initial_price=500.0, daily_spread_pct=0.005)
+open_small, high_small, low_small, close_small = [100.0, 101.5, 99.8, 102.1, 100.9], [102.3, 103.0, 101.2, 103.5, 102.0], [99.5, 100.8, 98.9, 101.0, 100.1], [101.2, 102.5, 100.3, 102.8, 101.5]
+open_invalid, high_invalid, low_invalid, close_invalid = [100.0, 101.5, 99.8], [99.0, 103.0, 101.2], [99.5, 100.8, 98.9], [101.2, 102.5, 100.3]
+open_nan, high_nan, low_nan, close_nan = [np.nan] * 5, [np.nan] * 5, [np.nan] * 5, [np.nan] * 5
+open_non_positive, high_non_positive, low_non_positive, close_non_positive = [100.0, 0.0, 99.8], [102.0, 103.0, 101.2], [99.5, 100.8, 98.9], [101.2, 102.5, 100.3]
+open_near_zero_diff, high_near_zero_diff, low_near_zero_diff, close_near_zero_diff = [100.0, 100.00000001, 100.00000002, 100.0, 100.00000001], [100.00000002, 100.00000003, 100.00000004, 100.00000002, 100.00000003], [99.99999998, 99.99999997, 99.99999996, 99.99999998, 99.99999997], [100.00000001, 100.00000002, 100.00000001, 100.00000001, 100.00000002]
+open_partial_nan, high_partial_nan, low_partial_nan, close_partial_nan = [100.0, np.nan, 99.8, 102.1, np.nan], [102.3, 103.0, 101.2, 103.5, 102.0], [99.5, 100.8, 98.9, 101.0, 100.1], [101.2, 102.5, 100.3, 102.8, 101.5]
+open_low_variability, high_low_variability, low_low_variability, close_low_variability = [100.0, 100.01, 100.02, 100.01, 100.0], [100.03, 100.04, 100.05, 100.04, 100.03], [99.97, 99.96, 99.95, 99.96, 99.97], [100.01, 100.02, 100.01, 100.02, 100.01]
+
+test_cases = [
+    {"name": f"Large Dataset ({NUM_POINTS_10_YEARS} points)", "open": open_10y, "high": high_10y, "low": low_10y, "close": close_10y},
+    {"name": "Small Dataset (5 points)", "open": open_small, "high": high_small, "low": low_small, "close": close_small},
+    {"name": "Invalid OHLC (high < low)", "open": open_invalid, "high": high_invalid, "low": low_invalid, "close": close_invalid},
+    {"name": "All NaN", "open": open_nan, "high": high_nan, "low": low_nan, "close": close_nan},
+    {"name": "Non-positive Prices", "open": open_non_positive, "high": high_non_positive, "low": low_non_positive, "close": close_non_positive},
+    {"name": "Near-zero Differences", "open": open_near_zero_diff, "high": high_near_zero_diff, "low": low_near_zero_diff, "close": close_near_zero_diff},
+    {"name": "Partial NaN", "open": open_partial_nan, "high": high_partial_nan, "low": low_partial_nan, "close": close_partial_nan},
+    {"name": "Low Variability", "open": open_low_variability, "high": high_low_variability, "low": low_low_variability, "close": close_low_variability},
+]
+
+# --- Numba Warm-up ---
+# First call to a Numba function includes compilation time.
+# We run it once on a small dataset to ensure subsequent timings are for execution only.
+print("Warming up Numba JIT compilers (this may take a moment)...")
+try:
+    edge_improved_v1(open_small, high_small, low_small, close_small)
+    edge_improved_v2(open_small, high_small, low_small, close_small)
+except Exception as e:
+    print(f"An error occurred during warm-up: {e}")
+print("Warm-up complete.\n")
+
+
+# --- Main Comparison ---
+print("="*80)
+print("Comparing edge_original vs. edge_improved_v1 vs. edge_improved_v2")
+print("="*80 + "\n")
+
+for test in test_cases:
+    name = test["name"]
+    open_p, high_p, low_p, close_p = test["open"], test["high"], test["low"], test["close"]
+
+    # --- Run original function and time it ---
+    try:
+        start_time = time.perf_counter()
+        result_original = edge_original(open_p, high_p, low_p, close_p)
+        time_original = time.perf_counter() - start_time
+    except Exception as e:
+        result_original, time_original = f"Error: {type(e).__name__}", -1
+
+    # --- Run improved_v1 function and time it ---
+    try:
+        start_time = time.perf_counter()
+        result_v1 = edge_improved_v1(open_p, high_p, low_p, close_p)
+        time_v1 = time.perf_counter() - start_time
+    except Exception as e:
+        result_v1, time_v1 = f"Error: {type(e).__name__}", -1
+
+    # --- Run improved_v2 (hyper-optimized) function and time it ---
+    try:
+        start_time = time.perf_counter()
+        result_v2 = edge_improved_v2(open_p, high_p, low_p, close_p)
+        time_v2 = time.perf_counter() - start_time
+    except Exception as e:
+        result_v2, time_v2 = f"Error: {type(e).__name__}", -1
+
+    # --- Reporting ---
+    print(f"--- Test Case: {name} ---")
+    print(f" Original: {result_original:<25} (Time: {time_original*1000:.4f} ms)")
+    print(f" Improved v1: {result_v1:<25} (Time: {time_v1*1000:.4f} ms)")
+    print(f" Improved v2: {result_v2:<25} (Time: {time_v2*1000:.4f} ms)")
+
+    # Check numerical equivalence against the original baseline
+    is_v1_ok = np.isclose(result_original, result_v1, rtol=1e-9, atol=1e-12, equal_nan=True) if isinstance(result_original, float) and isinstance(result_v1, float) else str(result_original) == str(result_v1)
+    is_v2_ok = np.isclose(result_original, result_v2, rtol=1e-9, atol=1e-12, equal_nan=True) if isinstance(result_original, float) and isinstance(result_v2, float) else str(result_original) == str(result_v2)
+
+    status_v1 = "\033[92mPASS\033[0m" if is_v1_ok else "\033[91mFAIL\033[0m"
+    status_v2 = "\033[92mPASS\033[0m" if is_v2_ok else "\033[91mFAIL\033[0m"
+    print(f" Equivalence (v1/v2 vs Original): {status_v1} / {status_v2}")
+
+    # Performance reporting
+    perf_string_v1, perf_string_v2 = "N/A", "N/A"
+    if time_original > 0 and time_v1 > 0:
+        perf_string_v1 = f"{time_original / time_v1:.2f}x"
+    if time_original > 0 and time_v2 > 0:
+        perf_string_v2 = f"{time_original / time_v2:.2f}x"
+
+    print(f" Speedup (v1/v2 vs Original): {perf_string_v1} / {perf_string_v2}\n")
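The warm-up block in this script matters because Numba compiles a function the first time it is called with a given argument signature, so an un-warmed first measurement would mostly record compiler time rather than kernel time. A self-contained sketch of the effect (the `sum_of_squares` kernel is a hypothetical stand-in for something like `_compute_spread_numba`; timings are machine-dependent):

```python
import time
import numpy as np
from numba import jit

@jit(nopython=True, cache=True)
def sum_of_squares(x):
    # Tiny stand-in kernel; loops are cheap once compiled to machine code
    total = 0.0
    for v in x:
        total += v * v
    return total

x = np.random.rand(100_000)

t0 = time.perf_counter()
sum_of_squares(x)            # first call: includes JIT compilation
t1 = time.perf_counter()
sum_of_squares(x)            # second call: runs the compiled code only
t2 = time.perf_counter()

print(f"first call:  {(t1 - t0) * 1000:.2f} ms (compile + run)")
print(f"second call: {(t2 - t1) * 1000:.2f} ms (run only)")
```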
quantjourney_bidask/edge.py
CHANGED
@@ -1,152 +1,174 @@
-
+"""
+Optimized and robust EDGE estimator for bid-ask spread calculation.
+
+Implements the efficient estimator from Ardia, Guidotti, & Kroencke (2024) for
+single-period bid-ask spread estimation from OHLC prices. This version is
+optimized for speed using Numba and careful memory handling, while ensuring
+numerical identity with the reference implementation.
+
+Author: Jakub Polec
+Date: 2025-06-28
+"""
 import warnings
-from typing import Union, List,
+from typing import Union, List, Any
+import numpy as np
+from numba import jit
+
+@jit(nopython=True, cache=True)
+def _compute_spread_numba(r1, r2, r3, r4, r5, tau, po, pc, pt):
+    """
+    Core spread calculation using Numba for maximum performance.
+    This is the computational bottleneck and benefits most from JIT compilation.
+    """
+    # De-mean returns, scaling by the probability of a valid period (pt)
+    # This aligns with the GMM framework where moments are conditioned on tau=1
+    r1_mean = np.nanmean(r1)  # Mean of the first return
+    r3_mean = np.nanmean(r3)  # Mean of the third return
+    r5_mean = np.nanmean(r5)  # Mean of the fifth return
+
+    d1 = r1 - r1_mean / pt * tau  # De-mean returns, scaling by the probability of a valid period (pt)
+    d3 = r3 - r3_mean / pt * tau  # De-mean returns, scaling by the probability of a valid period (pt)
+    d5 = r5 - r5_mean / pt * tau  # De-mean returns, scaling by the probability of a valid period (pt)
+
+    # GMM moment conditions
+    x1 = -4.0 / po * d1 * r2 + -4.0 / pc * d3 * r4  # First moment condition
+    x2 = -4.0 / po * d1 * r5 + -4.0 / pc * d5 * r4  # Second moment condition
+
+    # Expectations of the moment conditions
+    e1 = np.nanmean(x1)  # Expectation of the first moment condition
+    e2 = np.nanmean(x2)  # Expectation of the second moment condition
+
+    # Variances for optimal weighting
+    v1 = np.nanmean(x1**2) - e1**2  # Variance of the first moment condition
+    v2 = np.nanmean(x2**2) - e2**2  # Variance of the second moment condition
+
+    # Optimal GMM weighting
+    vt = v1 + v2  # Total variance
+    # If total variance is zero or negative (rare small sample issue), use simple average
+    s2 = (v2 * e1 + v1 * e2) / vt if vt > 0.0 else (e1 + e2) / 2.0  # Spread estimate
+
+    return s2
 
 def edge(
-
+    open_prices: Union[List[float], Any],
     high: Union[List[float], Any],
     low: Union[List[float], Any],
     close: Union[List[float], Any],
-    sign: bool = False
+    sign: bool = False,
+    min_pt: float = 1e-6,  # Keep this robustness check
+    debug: bool = False,
 ) -> float:
     """
-    Estimate the effective bid-ask spread from
-
-    Implements the efficient estimator described in Ardia, Guidotti, & Kroencke
-    https://doi.org/10.1016/j.jfineco.2024.103916.
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    traded assets, or lower frequency (e.g., daily) for less liquid assets.
-
-    Examples
-    --------
-    >>> import pandas as pd
-    >>> # Example OHLC data
-    >>> open_prices = [100.0, 101.5, 99.8, 102.1, 100.9]
-    >>> high_prices = [102.3, 103.0, 101.2, 103.5, 102.0]
-    >>> low_prices = [99.5, 100.8, 98.9, 101.0, 100.1]
-    >>> close_prices = [101.2, 102.5, 100.3, 102.8, 101.5]
-    >>> spread = edge(open_prices, high_prices, low_prices, close_prices)
-    >>> print(f"Estimated spread: {spread:.6f}")
-    Estimated spread: 0.007109
+    Estimate the effective bid-ask spread from OHLC prices.
+
+    Implements the efficient estimator described in Ardia, Guidotti, & Kroencke
+    (2024): https://doi.org/10.1016/j.jfineco.2024.103916.
+
+    Args:
+        open_prices : array-like
+            Vector of open prices.
+        high : array-like
+            Vector of high prices.
+        low : array-like
+            Vector of low prices.
+        close : array-like
+            Vector of close prices.
+        sign : bool, default False
+            If True, returns signed estimates. If False, returns absolute values.
+        min_pt : float, default 1e-6
+            Minimum probability threshold for tau to ensure reliable estimates.
+        debug : bool, default False
+            If True, prints intermediate values.
+
+    Returns:
+        float
+            Estimated bid-ask spread. Returns np.nan if invalid.
+
+    Examples:
+        >>> import numpy as np
+        >>> from edge import edge
+        >>> open_prices = np.array([100.0, 101.5, 99.8, 102.1, 100.9])
+        >>> high = np.array([102.3, 103.0, 101.2, 103.5, 102.0])
+        >>> low = np.array([99.5, 100.8, 98.9, 101.0, 100.1])
+        >>> close = np.array([101.2, 102.5, 100.3, 102.8, 101.5])
     """
-    #
-
-
-
-
-
-
-    nobs
-
-
-
-
-    if nobs < 3:
+    # --- 1. Input Validation and Conversion ---
+    o_arr = np.asarray(open_prices, dtype=float)  # Convert to numpy array
+    h_arr = np.asarray(high, dtype=float)  # Convert to numpy array
+    l_arr = np.asarray(low, dtype=float)  # Convert to numpy array
+    c_arr = np.asarray(close, dtype=float)  # Convert to numpy array
+
+    nobs = len(o_arr)
+    if not (len(h_arr) == nobs and len(l_arr) == nobs and len(c_arr) == nobs):
+        raise ValueError("Input arrays must have the same length.")
+
+    if nobs < 3:  # If there are less than 3 observations, return NaN
+        if debug: print("NaN reason: nobs < 3")
        return np.nan
 
-    #
+    # --- 2. Log-Price Calculation ---
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", RuntimeWarning)
-
-
-
-
-
-
-
-
-
-
-        #
-
-
-        r3 = m - c1  # Mid - Previous Close
-        r4 = c1 - m1  # Previous Close - Previous Mid
-        r5 = o - c1  # Open - Previous Close
-
-    # Compute indicator variables
-    # tau: Indicator for valid price variation (1 if high != low or low != previous close)
-    tau = np.where(np.isnan(h) | np.isnan(l) | np.isnan(c1), np.nan,
-                   ((h != l) | (l != c1)).astype(float))
-
-    # po1: Indicator for open price not equal to high, scaled by tau
-    po1 = tau * np.where(np.isnan(o) | np.isnan(h), np.nan, (o != h).astype(float))
-
-    # po2: Indicator for open price not equal to low, scaled by tau
-    po2 = tau * np.where(np.isnan(o) | np.isnan(l), np.nan, (o != l).astype(float))
+        # Replace non-positive prices with NaN to avoid log(0) or log(-1) issues
+        o = np.log(np.where(o_arr > 0, o_arr, np.nan))  # Log-price of the open price
+        h = np.log(np.where(h_arr > 0, h_arr, np.nan))  # Log-price of the high price
+        l = np.log(np.where(l_arr > 0, l_arr, np.nan))  # Log-price of the low price
+        c = np.log(np.where(c_arr > 0, c_arr, np.nan))  # Log-price of the close price
+        m = (h + l) / 2.0  # Mid-price log
+
+    # --- 3. Shift Arrays for Lagged Calculations (THE CRITICAL FIX) ---
+    # All calculations from here on use N-1 observations.
+    o_t = o[1:]  # Open price at time t
+    h_t = h[1:]  # High price at time t
+    l_t = l[1:]  # Low price at time t
+    m_t = m[1:]  # Mid-price at time t
 
-
-
+    h_tm1 = h[:-1]  # High price at time t-1
+    l_tm1 = l[:-1]
+    c_tm1 = c[:-1]
+    m_tm1 = m[:-1]
+
+    # --- 4. Compute Log-Returns ---
+    r1 = m_t - o_t  # Mid-price - Open price
+    r2 = o_t - m_tm1  # Open price - Previous mid-price
+    r3 = m_t - c_tm1  # Mid-price - Previous close
+    r4 = c_tm1 - m_tm1  # Previous close - Previous mid-price
+    r5 = o_t - c_tm1  # Open price - Previous close
+
+    # --- 5. Compute Indicator Variables ---
+    tau = np.where(np.isnan(h_t) | np.isnan(l_t) | np.isnan(c_tm1), np.nan, ((h_t != l_t) | (l_t != c_tm1)).astype(float))
+    po1 = tau * np.where(np.isnan(o_t) | np.isnan(h_t), np.nan, (o_t != h_t).astype(float))
+    po2 = tau * np.where(np.isnan(o_t) | np.isnan(l_t), np.nan, (o_t != l_t).astype(float))
+    pc1 = tau * np.where(np.isnan(c_tm1) | np.isnan(h_tm1), np.nan, (c_tm1 != h_tm1).astype(float))
+    pc2 = tau * np.where(np.isnan(c_tm1) | np.isnan(l_tm1), np.nan, (c_tm1 != l_tm1).astype(float))
 
-    #
-    pc2 = tau * np.where(np.isnan(c1) | np.isnan(l1), np.nan, (c1 != l1).astype(float))
-
-    # Compute probabilities with NaN handling
+    # --- 6. Compute Probabilities ---
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", RuntimeWarning)
-        pt = np.nanmean(tau)
-        po = np.nanmean(po1) + np.nanmean(po2)
-        pc = np.nanmean(pc1) + np.nanmean(pc2)
-
-    # Return NaN if insufficient valid periods or probabilities are zero
-    if np.nansum(tau) < 2 or po == 0 or pc == 0:
-        return np.nan
+        pt = np.nanmean(tau)  # Probability of a valid period
+        po = np.nanmean(po1) + np.nanmean(po2)  # Probability of open price not equal to high
+        pc = np.nanmean(pc1) + np.nanmean(pc2)  # Probability of close price not equal to high
 
-
-
-    d3 = r3 - np.nanmean(r3) / pt * tau
-    d5 = r5 - np.nanmean(r5) / pt * tau
+    if debug:
+        print(f"Debug: tau_sum={np.nansum(tau):.2f}, po={po:.4f}, pc={pc:.4f}, pt={pt:.4f}")
 
-    #
-
-
-
-    x2 = -4.0 / po * d1 * r5 + -4.0 / pc * d5 * r4
-
-    # Compute expectations (means) of the moment conditions
-    e1 = np.nanmean(x1)  # First moment expectation
-    e2 = np.nanmean(x2)  # Second moment expectation
-
-    # Compute variances of the moment conditions for optimal weighting
-    v1 = np.nanmean(x1**2) - e1**2  # Variance of first moment
-    v2 = np.nanmean(x2**2) - e2**2  # Variance of second moment
+    # --- 7. Check for Data Quality ---
+    if np.nansum(tau) < 2 or po == 0.0 or pc == 0.0 or pt < min_pt:
+        if debug: print(f"NaN reason: Insufficient valid data (tau_sum={np.nansum(tau)}, po={po}, pc={pc}, pt={pt})")
+        return np.nan
 
-    #
-
-
-
-
+    # --- 8. Compute Spread (using the Numba-optimized function) ---
+    s2 = _compute_spread_numba(r1, r2, r3, r4, r5, tau, po, pc, pt)  # Spread estimate
+
+    if np.isnan(s2):
+        if debug: print("NaN reason: s2 calculation resulted in NaN")
+        return np.nan
 
-    # Compute signed root
    s = np.sqrt(np.abs(s2))
    if sign:
-        s *= np.sign(s2)
+        s *= np.sign(s2)  # Signed spread estimate
+
+    if debug:
+        print(f"Debug: s2={s2:.6e}, s={s:.6e}")
 
    return float(s)
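The weighting step in `_compute_spread_numba` is the estimator's GMM core: given the two moment estimates e1, e2 and their variances v1, v2, it returns s2 = (v2*e1 + v1*e2) / (v1 + v2), an inverse-variance combination, falling back to the simple average when v1 + v2 is not positive. The new `min_pt` and `debug` keywords can be exercised directly; a sketch with synthetic data (the random-walk series is illustrative, not from the package's tests):

```python
import numpy as np
from quantjourney_bidask import edge

rng = np.random.default_rng(7)
close = 100.0 * np.exp(np.cumsum(0.01 * rng.standard_normal(252)))
open_p = np.empty_like(close)
open_p[0] = 100.0
open_p[1:] = close[:-1]                   # open each bar at the previous close
high = np.maximum(open_p, close) * 1.002  # pad so high >= max(open, close)
low = np.minimum(open_p, close) * 0.998   # pad so low <= min(open, close)

spread = edge(open_p, high, low, close, sign=False, min_pt=1e-6, debug=True)
print(f"spread estimate: {spread:.6f}")   # 0.01 corresponds to a 1% spread
```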
quantjourney_bidask/edge_expanding.py
CHANGED
@@ -1,65 +1,53 @@
+"""
+Expanding window EDGE estimator implementation.
+
+This module provides an expanding window implementation of the EDGE estimator,
+ensuring compatibility with all pandas windowing features like 'step'.
+
+Author: Jakub Polec
+Date: 2025-06-28
+
+Part of the QuantJourney framework - The framework with advanced quantitative
+finance tools and insights.
+"""
+import warnings
+import numpy as np
 import pandas as pd
-from
-from .edge import edge
-from .edge_rolling import edge_rolling
+from .edge import edge as edge_single  # Import the core, fast estimator
 
 def edge_expanding(
    df: pd.DataFrame,
-    min_periods: int =
-    sign: bool = False
+    min_periods: int = 3,
+    sign: bool = False,
 ) -> pd.Series:
    """
-
-
-    Uses the efficient estimator from Ardia, Guidotti, & Kroencke (2024):
-    https://doi.org/10.1016/j.jfineco.2024.103916. Calculates spreads over
-    expanding windows starting from the first observation.
-
-    Parameters
-    ----------
-    df : pd.DataFrame
-        DataFrame with columns 'open', 'high', 'low', 'close' (case-insensitive).
-    min_periods : int, default 1
-        Minimum number of observations required for an estimate. Note that
-        at least 3 observations are needed for a non-NaN result.
-    sign : bool, default False
-        If True, returns signed estimates. If False, returns absolute values.
-
-    Returns
-    -------
-    pd.Series
-        Series of expanding spread estimates, indexed by the DataFrame's index.
-        A value of 0.01 corresponds to a 1% spread. NaN for periods with
-        insufficient data.
-
-    Notes
-    -----
-    - The function leverages `edge_rolling` with a window equal to the DataFrame length.
-    - Missing values are handled automatically.
-    - The estimator is most reliable with sufficient data (e.g., 20+ observations).
-
-    Examples
-    --------
-    >>> import pandas as pd
-    >>> # Example OHLC DataFrame
-    >>> df = pd.DataFrame({
-    ...     'open': [100.0, 101.5, 99.8, 102.1, 100.9, 103.2],
-    ...     'high': [102.3, 103.0, 101.2, 103.5, 102.0, 104.8],
-    ...     'low': [99.5, 100.8, 98.9, 101.0, 100.1, 102.5],
-    ...     'close': [101.2, 102.5, 100.3, 102.8, 101.5, 104.1]
-    ... })
-    >>> spreads = edge_expanding(df, min_periods=3)
-    >>> print(spreads.dropna())
+    Computes expanding EDGE estimates by calling the core estimator on a growing window.
    """
-
-
-
-
-
+    if min_periods < 3:
+        warnings.warn("min_periods < 3 is not recommended, setting to 3.", UserWarning)
+        min_periods = 3
+
+    # --- 1. Data Preparation ---
+    df_proc = df.rename(columns=str.lower).copy()
+    open_p = df_proc["open"].values
+    high_p = df_proc["high"].values
+    low_p = df_proc["low"].values
+    close_p = df_proc["close"].values
+
+    n = len(df_proc)
+    estimates = np.full(n, np.nan)
 
-
-
-
-
-
-
+    # --- 2. Loop and Apply ---
+    # This loop perfectly replicates the test's logic for an expanding window.
+    for i in range(n):
+        t1 = i + 1
+        if t1 >= min_periods:
+            estimates[i] = edge_single(
+                open_p[:t1],
+                high_p[:t1],
+                low_p[:t1],
+                close_p[:t1],
+                sign=sign,
+            )
+
+    return pd.Series(estimates, index=df_proc.index, name="EDGE_expanding")
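The rewrite drops the old delegation to `edge_rolling` (whose window spanned the whole DataFrame, per the removed Notes section) in favor of an explicit loop that re-estimates on each growing prefix: simple and exact, at O(n²) total cost in the number of bars. A usage sketch reusing the OHLC example removed from the old docstring:

```python
import pandas as pd
from quantjourney_bidask import edge_expanding

df = pd.DataFrame({
    "open":  [100.0, 101.5, 99.8, 102.1, 100.9, 103.2],
    "high":  [102.3, 103.0, 101.2, 103.5, 102.0, 104.8],
    "low":   [99.5, 100.8, 98.9, 101.0, 100.1, 102.5],
    "close": [101.2, 102.5, 100.3, 102.8, 101.5, 104.1],
})

spreads = edge_expanding(df, min_periods=3)
print(spreads)  # first two entries are NaN; estimates begin once 3 bars accumulate
```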