rangebar-11.6.1-cp313-cp313-macosx_11_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rangebar/CLAUDE.md +327 -0
- rangebar/__init__.py +227 -0
- rangebar/__init__.pyi +1089 -0
- rangebar/_core.cpython-313-darwin.so +0 -0
- rangebar/checkpoint.py +472 -0
- rangebar/cli.py +298 -0
- rangebar/clickhouse/CLAUDE.md +139 -0
- rangebar/clickhouse/__init__.py +100 -0
- rangebar/clickhouse/bulk_operations.py +309 -0
- rangebar/clickhouse/cache.py +734 -0
- rangebar/clickhouse/client.py +121 -0
- rangebar/clickhouse/config.py +141 -0
- rangebar/clickhouse/mixin.py +120 -0
- rangebar/clickhouse/preflight.py +504 -0
- rangebar/clickhouse/query_operations.py +345 -0
- rangebar/clickhouse/schema.sql +187 -0
- rangebar/clickhouse/tunnel.py +222 -0
- rangebar/constants.py +288 -0
- rangebar/conversion.py +177 -0
- rangebar/exceptions.py +207 -0
- rangebar/exness.py +364 -0
- rangebar/hooks.py +311 -0
- rangebar/logging.py +171 -0
- rangebar/notify/__init__.py +15 -0
- rangebar/notify/pushover.py +155 -0
- rangebar/notify/telegram.py +271 -0
- rangebar/orchestration/__init__.py +20 -0
- rangebar/orchestration/count_bounded.py +797 -0
- rangebar/orchestration/helpers.py +412 -0
- rangebar/orchestration/models.py +76 -0
- rangebar/orchestration/precompute.py +498 -0
- rangebar/orchestration/range_bars.py +736 -0
- rangebar/orchestration/tick_fetcher.py +226 -0
- rangebar/ouroboros.py +454 -0
- rangebar/processors/__init__.py +22 -0
- rangebar/processors/api.py +383 -0
- rangebar/processors/core.py +522 -0
- rangebar/resource_guard.py +567 -0
- rangebar/storage/__init__.py +22 -0
- rangebar/storage/checksum_registry.py +218 -0
- rangebar/storage/parquet.py +728 -0
- rangebar/streaming.py +300 -0
- rangebar/validation/__init__.py +69 -0
- rangebar/validation/cache_staleness.py +277 -0
- rangebar/validation/continuity.py +664 -0
- rangebar/validation/gap_classification.py +294 -0
- rangebar/validation/post_storage.py +317 -0
- rangebar/validation/tier1.py +175 -0
- rangebar/validation/tier2.py +261 -0
- rangebar-11.6.1.dist-info/METADATA +308 -0
- rangebar-11.6.1.dist-info/RECORD +54 -0
- rangebar-11.6.1.dist-info/WHEEL +4 -0
- rangebar-11.6.1.dist-info/entry_points.txt +2 -0
- rangebar-11.6.1.dist-info/licenses/LICENSE +21 -0
rangebar/streaming.py
ADDED
@@ -0,0 +1,300 @@
# ADR: docs/adr/2026-01-31-realtime-streaming-api.md
"""Real-time streaming API for range bar construction.

This module provides async Python APIs for constructing range bars from live
data sources (Binance WebSocket, Exness tick feeds).

Architecture:
- Low-level: Callback-based Rust bindings (PyBinanceLiveStream)
- High-level: Python async generators built on top

Examples
--------
Async generator (recommended for most use cases):

>>> import asyncio
>>> from rangebar.streaming import stream_binance_live
>>>
>>> async def main():
...     async for bar in stream_binance_live("BTCUSDT", threshold_bps=250):
...         print(f"New bar: {bar['close']}")
...
>>> asyncio.run(main())

Low-level callback interface:

>>> from rangebar.streaming import BinanceLiveStream
>>>
>>> stream = BinanceLiveStream("BTCUSDT", threshold_decimal_bps=250)
>>> stream.connect()
>>> while stream.is_connected:
...     bar = stream.next_bar(timeout_ms=5000)
...     if bar:
...         print(f"New bar: {bar['close']}")

Custom data source with StreamingRangeBarProcessor:

>>> from rangebar.streaming import StreamingRangeBarProcessor
>>>
>>> processor = StreamingRangeBarProcessor(250)
>>> for trade in my_trade_source():
...     bars = processor.process_trade(trade)
...     for bar in bars:
...         print(f"Completed bar: {bar['close']}")
"""

from __future__ import annotations

import asyncio
import random
import time
from collections.abc import AsyncIterator
from dataclasses import dataclass
from typing import TYPE_CHECKING

from ._core import (
    BinanceLiveStream,
    StreamingConfig,
    StreamingMetrics,
    StreamingRangeBarProcessor,
)

if TYPE_CHECKING:
    from typing import Any

__all__ = [
    "BinanceLiveStream",
    "ReconnectionConfig",
    "StreamingConfig",
    "StreamingError",
    "StreamingMetrics",
    "StreamingRangeBarProcessor",
    "stream_binance_live",
]


class StreamingError(Exception):
    """Error during streaming operation."""


@dataclass
class ReconnectionConfig:
    """Configuration for automatic reconnection with jitter.

    Attributes:
        max_retries: Maximum reconnection attempts (0 = infinite)
        initial_delay_s: Initial delay before first retry
        max_delay_s: Maximum delay between retries
        backoff_factor: Multiplier for exponential backoff
        jitter_factor: Random jitter range (0.5 = ±50% of delay)
        max_total_duration_s: Maximum total time spent reconnecting (0 = infinite)

    Notes:
        Jitter is applied to prevent thundering herd when multiple clients
        reconnect simultaneously. The actual delay is:
        `delay * (1 - jitter_factor + random() * 2 * jitter_factor)`

        For jitter_factor=0.5, delay varies from 50% to 150% of base delay.
    """

    max_retries: int = 0  # 0 = infinite
    initial_delay_s: float = 1.0
    max_delay_s: float = 60.0
    backoff_factor: float = 2.0
    jitter_factor: float = 0.5  # ±50% jitter to prevent thundering herd
    max_total_duration_s: float = 0.0  # 0 = infinite


async def stream_binance_live(
    symbol: str,
    threshold_bps: int = 250,
    *,
    reconnect: bool = True,
    reconnect_config: ReconnectionConfig | None = None,
) -> AsyncIterator[dict[str, Any]]:
    """Stream range bars from Binance WebSocket in real-time.

    This is an async generator that yields completed range bars as they
    are constructed from live trade data.

    Args:
        symbol: Trading pair (e.g., "BTCUSDT")
        threshold_bps: Range bar threshold in decimal basis points (250 = 0.25%)
        reconnect: Whether to automatically reconnect on disconnect
        reconnect_config: Custom reconnection settings

    Yields:
        Range bar dicts with OHLCV + microstructure features

    Raises:
        StreamingError: If connection fails and reconnection is disabled

    Example:
        >>> async for bar in stream_binance_live("BTCUSDT", threshold_bps=250):
        ...     print(f"New bar: {bar['close']}, OFI: {bar['ofi']}")
    """
    if reconnect_config is None:
        reconnect_config = ReconnectionConfig()

    retry_count = 0
    current_delay = reconnect_config.initial_delay_s
    reconnect_start_time: float | None = None

    while True:
        try:
            # Create stream and connect
            stream = BinanceLiveStream(symbol, threshold_bps)

            # Run connect in thread pool to avoid blocking event loop
            loop = asyncio.get_event_loop()
            await loop.run_in_executor(None, stream.connect)

            # Reset retry state on successful connection
            retry_count = 0
            current_delay = reconnect_config.initial_delay_s
            reconnect_start_time = None

            # Yield bars as they arrive
            while stream.is_connected:
                # Poll for bar with timeout (non-blocking via thread pool)
                # Bind stream as default arg to avoid B023 closure issue
                bar = await loop.run_in_executor(
                    None, lambda s=stream: s.next_bar(timeout_ms=1000)
                )

                if bar is not None:
                    yield bar

                # Allow other coroutines to run
                await asyncio.sleep(0)

            # Stream disconnected
            if not reconnect:
                break

        except Exception as e:
            if not reconnect:
                msg = f"Stream connection failed: {e}"
                raise StreamingError(msg) from e

            # Track reconnection start time
            if reconnect_start_time is None:
                reconnect_start_time = time.monotonic()

            retry_count += 1

            # Check max retries
            if (
                reconnect_config.max_retries > 0
                and retry_count > reconnect_config.max_retries
            ):
                msg = f"Max retries ({reconnect_config.max_retries}) exceeded"
                raise StreamingError(msg) from e

            # Check max total duration
            if reconnect_config.max_total_duration_s > 0:
                elapsed = time.monotonic() - reconnect_start_time
                if elapsed > reconnect_config.max_total_duration_s:
                    msg = (
                        f"Max reconnection duration "
                        f"({reconnect_config.max_total_duration_s:.0f}s) exceeded"
                    )
                    raise StreamingError(msg) from e

            # Apply jitter to prevent thundering herd
            # Jitter range: [1 - jitter_factor, 1 + jitter_factor]
            jitter_multiplier = 1.0 - reconnect_config.jitter_factor + (
                random.random() * 2 * reconnect_config.jitter_factor
            )
            jittered_delay = current_delay * jitter_multiplier

            # Log and wait before retry
            print(
                f"Stream disconnected, retrying in {jittered_delay:.1f}s "
                f"(attempt {retry_count})"
            )
            await asyncio.sleep(jittered_delay)

            # Exponential backoff (applied to base delay, not jittered)
            current_delay = min(
                current_delay * reconnect_config.backoff_factor,
                reconnect_config.max_delay_s,
            )


class AsyncStreamingProcessor:
    """Async wrapper for StreamingRangeBarProcessor.

    This class provides an async interface for processing trades from
    any data source into range bars.

    Example:
        >>> processor = AsyncStreamingProcessor(250)
        >>> async for trade in my_async_trade_source():
        ...     bars = await processor.process_trade(trade)
        ...     for bar in bars:
        ...         await handle_new_bar(bar)
    """

    def __init__(self, threshold_decimal_bps: int) -> None:
        """Create async streaming processor.

        Args:
            threshold_decimal_bps: Range bar threshold (250 = 0.25%)
        """
        self._processor = StreamingRangeBarProcessor(threshold_decimal_bps)
        self._lock = asyncio.Lock()

    async def process_trade(self, trade: dict[str, Any]) -> list[dict[str, Any]]:
        """Process a single trade asynchronously.

        Args:
            trade: Trade dict with timestamp, price, quantity/volume

        Returns:
            List of completed bar dicts (usually 0 or 1)
        """
        async with self._lock:
            loop = asyncio.get_event_loop()
            return await loop.run_in_executor(
                None, self._processor.process_trade, trade
            )

    async def process_trades(
        self, trades: list[dict[str, Any]]
    ) -> list[dict[str, Any]]:
        """Process multiple trades asynchronously.

        Args:
            trades: List of trade dicts

        Returns:
            List of completed bar dicts
        """
        async with self._lock:
            loop = asyncio.get_event_loop()
            return await loop.run_in_executor(
                None, self._processor.process_trades, trades
            )

    async def get_incomplete_bar(self) -> dict[str, Any] | None:
        """Get current incomplete bar asynchronously."""
        async with self._lock:
            loop = asyncio.get_event_loop()
            return await loop.run_in_executor(
                None, self._processor.get_incomplete_bar
            )

    @property
    def trades_processed(self) -> int:
        """Number of trades processed."""
        return self._processor.trades_processed

    @property
    def bars_generated(self) -> int:
        """Number of bars generated."""
        return self._processor.bars_generated

    def get_metrics(self) -> StreamingMetrics:
        """Get streaming metrics."""
        return self._processor.get_metrics()
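The reconnection loop above applies exponential backoff to a base delay and randomizes each wait with jitter before retrying. The following standalone sketch (not part of the package) reproduces that delay schedule using the default ReconnectionConfig values; the loop bounds and print statements are illustrative only.

# Illustrative sketch of the backoff-with-jitter schedule used in stream_binance_live.
import random

initial_delay_s, max_delay_s = 1.0, 60.0
backoff_factor, jitter_factor = 2.0, 0.5

delay = initial_delay_s
for attempt in range(1, 6):
    # Jitter multiplier in [1 - jitter_factor, 1 + jitter_factor], i.e. 0.5x to 1.5x here
    jitter = 1.0 - jitter_factor + random.random() * 2 * jitter_factor
    print(f"attempt {attempt}: sleep ~{delay * jitter:.1f}s (base {delay:.0f}s)")
    # Backoff is applied to the base delay (not the jittered one), capped at max_delay_s
    delay = min(delay * backoff_factor, max_delay_s)

With the defaults, the base delays grow 1s, 2s, 4s, 8s, 16s, ... up to the 60s cap, and each actual sleep lands anywhere between 50% and 150% of its base value.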
rangebar/validation/__init__.py
ADDED
@@ -0,0 +1,69 @@
"""Validation framework for microstructure features (Issue #25) and cache integrity (Issue #39).

Provides tiered validation for market microstructure features and cache operations:
- Tier 0: Cache staleness detection (<5ms) - schema evolution support
- Tier 0: Post-storage validation after cache operations (<1 sec) - Issue #39
- Tier 1: Auto-validation on every precompute (<30 sec)
- Tier 2: Statistical validation before production ML (~10 min)
- Tier 3: Feature importance and drift analysis (30+ min, on-demand)
- Continuity: Tiered gap classification for range bar data (Issue #19)
"""

from .cache_staleness import (
    StalenessResult,
    detect_staleness,
    validate_schema_version,
)
from .continuity import (
    ASSET_CLASS_MULTIPLIERS,
    VALIDATION_PRESETS,
    AssetClass,
    ContinuityError,
    ContinuityWarning,
    GapInfo,
    GapTier,
    TieredValidationResult,
    TierSummary,
    TierThresholds,
    ValidationPreset,
    detect_asset_class,
    validate_continuity,
    validate_continuity_tiered,
    validate_junction_continuity,
)
from .post_storage import (
    ValidationResult,
    compute_dataframe_checksum,
    validate_ohlc_invariants,
    validate_post_storage,
)
from .tier1 import FEATURE_COLS, validate_tier1
from .tier2 import validate_tier2

__all__ = [
    "ASSET_CLASS_MULTIPLIERS",
    "FEATURE_COLS",
    "VALIDATION_PRESETS",
    "AssetClass",
    "ContinuityError",
    "ContinuityWarning",
    "GapInfo",
    "GapTier",
    "StalenessResult",
    "TierSummary",
    "TierThresholds",
    "TieredValidationResult",
    "ValidationPreset",
    "ValidationResult",
    "compute_dataframe_checksum",
    "detect_asset_class",
    "detect_staleness",
    "validate_continuity",
    "validate_continuity_tiered",
    "validate_junction_continuity",
    "validate_ohlc_invariants",
    "validate_post_storage",
    "validate_schema_version",
    "validate_tier1",
    "validate_tier2",
]
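The package re-exports its validation helpers at rangebar.validation. A minimal usage sketch of validate_schema_version (assuming the package is installed; its semantics follow the implementation shown in cache_staleness.py below, which compares versions as zero-padded (major, minor, patch) tuples):

# Usage sketch only; requires an installed rangebar wheel.
from rangebar.validation import validate_schema_version

assert validate_schema_version("7.1.0", "7.0.0") is True
assert validate_schema_version("6.9.9", "7.0.0") is False
assert validate_schema_version(None, "7.0.0") is False  # missing version metadata fails the check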
rangebar/validation/cache_staleness.py
ADDED
@@ -0,0 +1,277 @@
# polars-exception: ClickHouse cache returns Pandas for backtesting.py
"""Cache staleness detection for schema evolution.

This module provides content-based validation to detect stale cached data
that was computed with older versions lacking microstructure features.

Tier 0 validation: Fast staleness detection (<5ms for 100K bars).
Run on every cache read when microstructure features are requested.
"""

from __future__ import annotations

import logging
from dataclasses import dataclass, field
from typing import TYPE_CHECKING, Literal

if TYPE_CHECKING:
    import pandas as pd

from rangebar.constants import MICROSTRUCTURE_COLUMNS

logger = logging.getLogger(__name__)

# Semantic version has 3 parts
_VERSION_PARTS = 3


@dataclass
class StalenessResult:
    """Result of cache staleness detection.

    Attributes
    ----------
    is_stale : bool
        True if cached data is detected as stale and should be invalidated.
    reason : str | None
        Human-readable description of why data is stale (None if not stale).
    confidence : Literal["high", "medium", "low"]
        Confidence level of staleness detection.
    checks_passed : dict[str, bool]
        Individual validation checks and their results.
    recommendations : list[str]
        Suggested actions to resolve staleness.
    """

    is_stale: bool
    reason: str | None = None
    confidence: Literal["high", "medium", "low"] = "high"
    checks_passed: dict[str, bool] = field(default_factory=dict)
    recommendations: list[str] = field(default_factory=list)


def _check_vwap(
    df: pd.DataFrame,
    checks: dict[str, bool],
    reasons: list[str],
) -> None:
    """Validate VWAP is within [Low, High] and not all zeros."""
    if "vwap" not in df.columns:
        return

    vwap_all_zero = (df["vwap"] == 0).all()
    checks["vwap_not_all_zero"] = not vwap_all_zero

    if vwap_all_zero:
        reasons.append("All VWAP values are zero (pre-v7.0 cache data)")
    else:
        vwap_valid = (df["vwap"] >= df["Low"]) & (df["vwap"] <= df["High"])
        checks["vwap_bounded"] = vwap_valid.all()
        if not vwap_valid.all():
            invalid_count = (~vwap_valid).sum()
            reasons.append(f"VWAP outside [Low, High] for {invalid_count} bars")


def _check_bounded_columns(
    df: pd.DataFrame,
    checks: dict[str, bool],
    reasons: list[str],
) -> None:
    """Validate bounded microstructure columns are within expected ranges."""
    # OFI in [-1, 1]
    if "ofi" in df.columns:
        ofi_bounded = df["ofi"].between(-1, 1).all()
        checks["ofi_bounded"] = ofi_bounded
        if not ofi_bounded:
            reasons.append("OFI values outside [-1, 1] range")

    # Turnover imbalance in [-1, 1]
    if "turnover_imbalance" in df.columns:
        ti_bounded = df["turnover_imbalance"].between(-1, 1).all()
        checks["turnover_imbalance_bounded"] = ti_bounded
        if not ti_bounded:
            reasons.append("Turnover imbalance outside [-1, 1] range")

    # Duration non-negative
    if "duration_us" in df.columns:
        duration_valid = (df["duration_us"] >= 0).all()
        checks["duration_non_negative"] = duration_valid
        if not duration_valid:
            reasons.append("Negative duration values detected")

    # Aggregation density >= 1
    if "aggregation_density" in df.columns:
        agg_valid = (df["aggregation_density"] >= 1).all()
        checks["aggregation_density_valid"] = agg_valid
        if not agg_valid:
            reasons.append("Aggregation density < 1 detected")


def _check_volume_consistency(
    df: pd.DataFrame,
    checks: dict[str, bool],
    reasons: list[str],
) -> None:
    """Validate buy_volume + sell_volume == Volume."""
    required_cols = {"buy_volume", "sell_volume", "Volume"}
    if not required_cols.issubset(df.columns):
        return

    vol_sum = df["buy_volume"] + df["sell_volume"]
    vol_diff = (vol_sum - df["Volume"]).abs()
    vol_match = (vol_diff < 1e-6 * df["Volume"].abs().clip(lower=1e-10)).all()
    checks["volume_consistency"] = vol_match
    if not vol_match:
        reasons.append("buy_volume + sell_volume != Volume")


def _check_trade_counts(
    df: pd.DataFrame,
    checks: dict[str, bool],
    reasons: list[str],
) -> None:
    """Validate trade count columns have valid values."""
    if "individual_trade_count" in df.columns:
        counts_valid = (df["individual_trade_count"] >= 1).all()
        checks["trade_counts_valid"] = counts_valid
        if not counts_valid:
            reasons.append("Invalid trade count values (< 1)")


def _check_all_microstructure_zero(
    df: pd.DataFrame,
    checks: dict[str, bool],
    reasons: list[str],
) -> None:
    """Check if all microstructure columns are zero (indicates stale data)."""
    micro_cols_present = [c for c in MICROSTRUCTURE_COLUMNS if c in df.columns]
    if not micro_cols_present:
        return

    all_micro_zero = all((df[col] == 0).all() for col in micro_cols_present)
    checks["microstructure_not_all_zero"] = not all_micro_zero
    if all_micro_zero:
        reasons.append("All microstructure columns are zero (pre-v7.0 cache data)")


def _determine_staleness(
    checks: dict[str, bool],
    reasons: list[str],
) -> StalenessResult:
    """Determine final staleness result from individual checks."""
    high_confidence_checks = [
        "vwap_bounded",
        "vwap_not_all_zero",
        "ofi_bounded",
        "turnover_imbalance_bounded",
        "duration_non_negative",
        "aggregation_density_valid",
        "trade_counts_valid",
        "microstructure_not_all_zero",
    ]

    high_conf_failures = [
        k for k in high_confidence_checks if k in checks and not checks[k]
    ]

    is_stale = len(high_conf_failures) > 0

    # Determine confidence level
    confidence: Literal["high", "medium", "low"] = "high"
    if is_stale and not high_conf_failures:
        confidence = "medium"

    # Build recommendations
    recommendations: list[str] = []
    if is_stale:
        recommendations.append(
            "Invalidate cache entry and recompute with current version"
        )
        if "vwap_not_all_zero" in high_conf_failures:
            recommendations.append("Data appears to be from pre-v7.0")
        if "microstructure_not_all_zero" in high_conf_failures:
            recommendations.append("Data appears to be from pre-v7.0")
        recommendations.append("Run: get_range_bars(..., use_cache=False)")

    return StalenessResult(
        is_stale=is_stale,
        reason="; ".join(reasons) if reasons else None,
        confidence=confidence,
        checks_passed=checks,
        recommendations=recommendations,
    )


def detect_staleness(
    df: pd.DataFrame,
    require_microstructure: bool = True,
) -> StalenessResult:
    """Detect stale cached data using content-based validation.

    This is Tier 0 validation: fast staleness detection (<5ms for 100K bars).
    Run on every cache read before returning data to caller.

    Parameters
    ----------
    df : pd.DataFrame
        Cached range bar DataFrame, possibly with microstructure columns.
    require_microstructure : bool, default=True
        If True, check for valid microstructure columns.

    Returns
    -------
    StalenessResult
        Detection result with confidence level and specific failures.
    """
    checks: dict[str, bool] = {}
    reasons: list[str] = []

    if require_microstructure:
        _check_vwap(df, checks, reasons)
        _check_bounded_columns(df, checks, reasons)
        _check_volume_consistency(df, checks, reasons)
        _check_trade_counts(df, checks, reasons)
        _check_all_microstructure_zero(df, checks, reasons)

    return _determine_staleness(checks, reasons)


def validate_schema_version(
    cached_version: str | None,
    min_version: str,
) -> bool:
    """Check if cached data meets minimum schema version requirement.

    Parameters
    ----------
    cached_version : str | None
        Version string from cached data (e.g., "7.0.0").
    min_version : str
        Minimum required version (e.g., "7.0.0").

    Returns
    -------
    bool
        True if cached_version >= min_version.
    """
    if not cached_version:
        return False

    try:
        cached_parts = [int(x) for x in cached_version.split(".")[:_VERSION_PARTS]]
        min_parts = [int(x) for x in min_version.split(".")[:_VERSION_PARTS]]

        # Pad to 3 parts
        while len(cached_parts) < _VERSION_PARTS:
            cached_parts.append(0)
        while len(min_parts) < _VERSION_PARTS:
            min_parts.append(0)

        return tuple(cached_parts) >= tuple(min_parts)
    except (ValueError, AttributeError):
        logger.warning(
            "Invalid version format: cached=%r, min=%r",
            cached_version,
            min_version,
        )
        return False
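Given the checks above, a cached frame whose vwap column is all zeros is always flagged as stale via the vwap_not_all_zero check. The sketch below assumes an installed rangebar wheel plus pandas; the exact reason string also depends on which columns appear in rangebar.constants.MICROSTRUCTURE_COLUMNS, so treat the printed text as indicative rather than exact.

# Illustrative Tier 0 staleness check on a tiny synthetic cache frame.
import pandas as pd
from rangebar.validation import detect_staleness

df = pd.DataFrame(
    {
        "Open": [100.0, 101.0],
        "High": [101.0, 102.0],
        "Low": [99.0, 100.0],
        "Close": [101.0, 102.0],
        "Volume": [10.0, 12.0],
        "vwap": [0.0, 0.0],  # stale: microstructure was never computed for this cache entry
    }
)

result = detect_staleness(df)
print(result.is_stale)         # True
print(result.reason)           # mentions pre-v7.0 cache data (all-zero VWAP)
print(result.recommendations)  # includes "Run: get_range_bars(..., use_cache=False)"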