rangebar 11.6.1__cp313-cp313-macosx_11_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rangebar/CLAUDE.md +327 -0
- rangebar/__init__.py +227 -0
- rangebar/__init__.pyi +1089 -0
- rangebar/_core.cpython-313-darwin.so +0 -0
- rangebar/checkpoint.py +472 -0
- rangebar/cli.py +298 -0
- rangebar/clickhouse/CLAUDE.md +139 -0
- rangebar/clickhouse/__init__.py +100 -0
- rangebar/clickhouse/bulk_operations.py +309 -0
- rangebar/clickhouse/cache.py +734 -0
- rangebar/clickhouse/client.py +121 -0
- rangebar/clickhouse/config.py +141 -0
- rangebar/clickhouse/mixin.py +120 -0
- rangebar/clickhouse/preflight.py +504 -0
- rangebar/clickhouse/query_operations.py +345 -0
- rangebar/clickhouse/schema.sql +187 -0
- rangebar/clickhouse/tunnel.py +222 -0
- rangebar/constants.py +288 -0
- rangebar/conversion.py +177 -0
- rangebar/exceptions.py +207 -0
- rangebar/exness.py +364 -0
- rangebar/hooks.py +311 -0
- rangebar/logging.py +171 -0
- rangebar/notify/__init__.py +15 -0
- rangebar/notify/pushover.py +155 -0
- rangebar/notify/telegram.py +271 -0
- rangebar/orchestration/__init__.py +20 -0
- rangebar/orchestration/count_bounded.py +797 -0
- rangebar/orchestration/helpers.py +412 -0
- rangebar/orchestration/models.py +76 -0
- rangebar/orchestration/precompute.py +498 -0
- rangebar/orchestration/range_bars.py +736 -0
- rangebar/orchestration/tick_fetcher.py +226 -0
- rangebar/ouroboros.py +454 -0
- rangebar/processors/__init__.py +22 -0
- rangebar/processors/api.py +383 -0
- rangebar/processors/core.py +522 -0
- rangebar/resource_guard.py +567 -0
- rangebar/storage/__init__.py +22 -0
- rangebar/storage/checksum_registry.py +218 -0
- rangebar/storage/parquet.py +728 -0
- rangebar/streaming.py +300 -0
- rangebar/validation/__init__.py +69 -0
- rangebar/validation/cache_staleness.py +277 -0
- rangebar/validation/continuity.py +664 -0
- rangebar/validation/gap_classification.py +294 -0
- rangebar/validation/post_storage.py +317 -0
- rangebar/validation/tier1.py +175 -0
- rangebar/validation/tier2.py +261 -0
- rangebar-11.6.1.dist-info/METADATA +308 -0
- rangebar-11.6.1.dist-info/RECORD +54 -0
- rangebar-11.6.1.dist-info/WHEEL +4 -0
- rangebar-11.6.1.dist-info/entry_points.txt +2 -0
- rangebar-11.6.1.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,294 @@
|
|
|
1
|
+
# Issue #19: Gap classification extracted from continuity.py for modularization
|
|
2
|
+
"""Gap classification types and presets for range bar validation.
|
|
3
|
+
|
|
4
|
+
This module provides the tiered gap classification system based on empirical
|
|
5
|
+
analysis of 30-month BTC data (Issue #19). It identifies 49 legitimate market
|
|
6
|
+
microstructure events and classifies gaps into severity tiers.
|
|
7
|
+
|
|
8
|
+
Gap Tiers:
|
|
9
|
+
- PRECISION: < 0.001% - Floating-point artifacts (always ignored)
|
|
10
|
+
- NOISE: 0.001% - 0.01% - Tick-level noise (logged, not flagged)
|
|
11
|
+
- MARKET_MOVE: 0.01% - 0.1% - Normal market movement (configurable)
|
|
12
|
+
- MICROSTRUCTURE: > 0.1% - Flash crashes, liquidations (warning/error)
|
|
13
|
+
- SESSION_BOUNDARY: > threshold*2 - Definite session break (always error)
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
from dataclasses import dataclass, field
|
|
19
|
+
from enum import Enum, IntEnum
|
|
20
|
+
from typing import Literal
|
|
21
|
+
|
|
22
|
+
from rangebar.constants import _CRYPTO_BASES, _FOREX_CURRENCIES
|
|
23
|
+
|
|
24
|
+
# Public API of this module (ASCII-sorted: constants, classes, functions).
__all__ = [
    "ASSET_CLASS_MULTIPLIERS",
    "VALIDATION_PRESETS",
    "AssetClass",
    "GapTier",
    "TierThresholds",
    "ValidationPreset",
    "detect_asset_class",
]
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
# ============================================================================
|
|
36
|
+
# Gap Tier Enum
|
|
37
|
+
# ============================================================================
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
class GapTier(IntEnum):
    """Severity tiers for gaps found during range bar continuity validation.

    Tier boundaries come from an empirical study of 30 months of BTC data
    (Issue #19), which surfaced 49 legitimate market microstructure events.
    ``IntEnum`` so tiers can be compared and sorted by severity.

    Examples
    --------
    >>> gap_pct = 0.05  # 0.05% gap
    >>> if gap_pct < 0.00001:
    ...     tier = GapTier.PRECISION
    >>> elif gap_pct < 0.0001:
    ...     tier = GapTier.NOISE
    >>> elif gap_pct < 0.001:
    ...     tier = GapTier.MARKET_MOVE
    >>> elif gap_pct < threshold * 2:
    ...     tier = GapTier.MICROSTRUCTURE
    >>> else:
    ...     tier = GapTier.SESSION_BOUNDARY
    """

    # Ordered from most benign (1) to most severe (5).
    PRECISION = 1  # < 0.001%: floating-point artifact, always ignored
    NOISE = 2  # 0.001%-0.01%: tick-level noise, logged but not flagged
    MARKET_MOVE = 3  # 0.01%-0.1%: normal market movement, configurable
    MICROSTRUCTURE = 4  # > 0.1%: flash crash / liquidation, warning or error
    SESSION_BOUNDARY = 5  # > threshold*2: definite session break, always error
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
# ============================================================================
|
|
69
|
+
# Asset Class Enum
|
|
70
|
+
# ============================================================================
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
class AssetClass(Enum):
    """Asset class used to calibrate gap tolerances.

    Typical gap magnitudes differ by market structure:
    - CRYPTO: 24/7 trading, flash crashes possible (baseline tolerance)
    - FOREX: session-based with weekend gaps (tighter tolerance)
    - EQUITIES: overnight gaps and circuit breakers (looser tolerance)

    Examples
    --------
    >>> from rangebar import detect_asset_class, AssetClass
    >>> detect_asset_class("BTCUSDT")
    <AssetClass.CRYPTO: 'crypto'>
    >>> detect_asset_class("EURUSD")
    <AssetClass.FOREX: 'forex'>
    """

    CRYPTO = "crypto"  # 24/7 markets, flash crashes possible
    FOREX = "forex"  # Session-based, weekend gaps
    EQUITIES = "equities"  # Overnight gaps, circuit breakers
    UNKNOWN = "unknown"  # Fallback to crypto defaults
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
# Tolerance multipliers per asset class, relative to the crypto baseline (1.0).
ASSET_CLASS_MULTIPLIERS: dict[AssetClass, float] = {
    asset_cls: multiplier
    for asset_cls, multiplier in (
        (AssetClass.CRYPTO, 1.0),  # baseline
        (AssetClass.FOREX, 0.5),  # tighter: more stable markets
        (AssetClass.EQUITIES, 1.5),  # looser: overnight gaps expected
        (AssetClass.UNKNOWN, 1.0),  # unrecognized symbols use crypto defaults
    )
}
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def detect_asset_class(symbol: str) -> AssetClass:
    """Infer the asset class of a trading symbol from its name.

    Detection rules, checked in order:
    1. Crypto: the symbol contains a known crypto base (BTC, ETH, BNB, ...)
       or ends with a stablecoin suffix (USDT/BUSD/USDC/TUSD/FDUSD).
    2. Forex: a standard 6-character pair whose halves are both recognized
       currencies (e.g. EURUSD), or a commodity code (XAU, XAG, BRENT, WTI).
    3. Unknown: anything else.

    Parameters
    ----------
    symbol : str
        Trading symbol (case-insensitive)

    Returns
    -------
    AssetClass
        Detected asset class

    Examples
    --------
    >>> detect_asset_class("BTCUSDT")
    <AssetClass.CRYPTO: 'crypto'>
    >>> detect_asset_class("EURUSD")
    <AssetClass.FOREX: 'forex'>
    >>> detect_asset_class("AAPL")
    <AssetClass.UNKNOWN: 'unknown'>
    """
    sym = symbol.upper()

    # Crypto first: known base anywhere in the symbol, or a stablecoin quote.
    has_crypto_base = any(base in sym for base in _CRYPTO_BASES)
    if has_crypto_base or sym.endswith(("USDT", "BUSD", "USDC", "TUSD", "FDUSD")):
        return AssetClass.CRYPTO

    # Standard 6-char forex pair: both halves must be recognized currencies.
    if len(sym) == 6 and sym[:3] in _FOREX_CURRENCIES and sym[3:] in _FOREX_CURRENCIES:
        return AssetClass.FOREX

    # Commodities quoted through forex brokers.
    if any(code in sym for code in ("XAU", "XAG", "BRENT", "WTI")):
        return AssetClass.FOREX

    return AssetClass.UNKNOWN
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
# ============================================================================
|
|
157
|
+
# Configuration Dataclasses
|
|
158
|
+
# ============================================================================
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
@dataclass(frozen=True)
class TierThresholds:
    """Boundaries between gap tiers, expressed as fractional percentages.

    A value of 0.00001 means 0.001%. Instances are immutable.

    Attributes
    ----------
    precision : float
        Tier 1/2 boundary; gaps below this are floating-point artifacts.
    noise : float
        Tier 2/3 boundary; gaps below this are tick-level noise.
    market_move : float
        Tier 3/4 boundary; gaps below this are normal market movement.
    session_factor : float
        Tier 5 multiplier; gaps above (threshold * factor) are session breaks.

    Examples
    --------
    >>> thresholds = TierThresholds()
    >>> thresholds.precision
    1e-05
    >>> thresholds.noise
    0.0001
    """

    precision: float = 1e-05  # 0.001% - Tier 1/2 boundary
    noise: float = 1e-04  # 0.01% - Tier 2/3 boundary
    market_move: float = 1e-03  # 0.1% - Tier 3/4 boundary
    session_factor: float = 2.0  # Tier 5 cutoff = tolerance * this factor
|
|
192
|
+
|
|
193
|
+
|
|
194
|
+
@dataclass(frozen=True)
class ValidationPreset:
    """Immutable bundle of validation settings.

    A preset couples a gap tolerance, a failure-handling mode, and the
    tier-classification thresholds into one reusable named configuration.

    Attributes
    ----------
    tolerance_pct : float
        Maximum gap percentage before flagging (e.g., 0.01 = 1%)
    mode : Literal["error", "warn", "skip"]
        What to do when validation fails
    tier_thresholds : TierThresholds
        Boundaries used for gap tier classification
    asset_class : AssetClass | None
        When set, overrides asset-class auto-detection
    description : str
        Human-readable summary of the preset's intent

    Examples
    --------
    >>> preset = VALIDATION_PRESETS["research"]
    >>> preset.tolerance_pct
    0.02
    >>> preset.mode
    'warn'
    """

    tolerance_pct: float  # max gap (fractional percent) before flagging
    mode: Literal["error", "warn", "skip"]  # behavior on failure
    tier_thresholds: TierThresholds = field(default_factory=TierThresholds)
    asset_class: AssetClass | None = None  # None -> auto-detect from symbol
    description: str = ""
|
|
228
|
+
|
|
229
|
+
|
|
230
|
+
# Named validation presets covering the common configuration scenarios.
VALIDATION_PRESETS: dict[str, ValidationPreset] = {
    # -- General-purpose presets, ordered loosest to strictest ---------------
    "permissive": ValidationPreset(
        tolerance_pct=0.05,  # 5% tolerance
        mode="warn",
        description="Accept most microstructure events, warn on extreme gaps",
    ),
    "research": ValidationPreset(
        tolerance_pct=0.02,  # 2% tolerance
        mode="warn",
        description="Standard exploratory analysis with monitoring",
    ),
    "standard": ValidationPreset(
        tolerance_pct=0.01,  # 1% tolerance
        mode="warn",
        description="Balanced tolerance for production backtesting",
    ),
    "strict": ValidationPreset(
        tolerance_pct=0.005,  # 0.5% tolerance
        mode="error",
        description="Strict validation for ML training data",
    ),
    "paranoid": ValidationPreset(
        tolerance_pct=0.001,  # 0.1% tolerance
        mode="error",
        description="Maximum strictness (original v6.1.0 behavior)",
    ),
    # -- Asset-class specific presets (pin asset_class, skip auto-detect) ----
    "crypto": ValidationPreset(
        tolerance_pct=0.02,  # 2% tolerance
        mode="warn",
        asset_class=AssetClass.CRYPTO,
        description="Crypto: Tuned for 24/7 markets with flash crashes",
    ),
    "forex": ValidationPreset(
        tolerance_pct=0.01,  # 1% tolerance
        mode="warn",
        asset_class=AssetClass.FOREX,
        description="Forex: Accounts for session boundaries",
    ),
    "equities": ValidationPreset(
        tolerance_pct=0.03,  # 3% tolerance
        mode="warn",
        asset_class=AssetClass.EQUITIES,
        description="Equities: Accounts for overnight gaps",
    ),
    # -- Special presets ------------------------------------------------------
    "skip": ValidationPreset(
        tolerance_pct=0.0,
        mode="skip",
        description="Disable validation entirely",
    ),
    "audit": ValidationPreset(
        tolerance_pct=0.002,  # 0.2% tolerance
        mode="error",
        description="Data quality audit mode",
    ),
}
|
|
@@ -0,0 +1,317 @@
|
|
|
1
|
+
"""Tier 0: Post-storage validation for cache integrity (<1 sec).
|
|
2
|
+
|
|
3
|
+
Run after every cache write to verify data was stored correctly.
|
|
4
|
+
This is the fastest validation tier - critical for detecting cache corruption.
|
|
5
|
+
|
|
6
|
+
Issue #39: Post-storage validation to verify cached data matches computed data.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import hashlib
|
|
12
|
+
import logging
|
|
13
|
+
from dataclasses import dataclass, field
|
|
14
|
+
from datetime import UTC, datetime
|
|
15
|
+
from typing import TYPE_CHECKING, Any
|
|
16
|
+
|
|
17
|
+
if TYPE_CHECKING:
|
|
18
|
+
import pandas as pd
|
|
19
|
+
|
|
20
|
+
logger = logging.getLogger(__name__)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
@dataclass
|
|
24
|
+
class ValidationResult:
|
|
25
|
+
"""Result of post-storage validation.
|
|
26
|
+
|
|
27
|
+
Attributes
|
|
28
|
+
----------
|
|
29
|
+
passed : bool
|
|
30
|
+
True if all validation checks passed.
|
|
31
|
+
checks : dict[str, bool]
|
|
32
|
+
Individual check results.
|
|
33
|
+
details : dict[str, Any]
|
|
34
|
+
Additional details about validation (counts, timestamps, etc.).
|
|
35
|
+
timestamp : datetime
|
|
36
|
+
When validation was performed.
|
|
37
|
+
duration_ms : float
|
|
38
|
+
How long validation took in milliseconds.
|
|
39
|
+
"""
|
|
40
|
+
|
|
41
|
+
passed: bool
|
|
42
|
+
checks: dict[str, bool] = field(default_factory=dict)
|
|
43
|
+
details: dict[str, Any] = field(default_factory=dict)
|
|
44
|
+
timestamp: datetime = field(default_factory=lambda: datetime.now(UTC))
|
|
45
|
+
duration_ms: float = 0.0
|
|
46
|
+
|
|
47
|
+
def to_dict(self) -> dict[str, Any]:
|
|
48
|
+
"""Convert to dictionary for logging/serialization."""
|
|
49
|
+
return {
|
|
50
|
+
"passed": self.passed,
|
|
51
|
+
"checks": self.checks,
|
|
52
|
+
"details": self.details,
|
|
53
|
+
"timestamp": self.timestamp.isoformat(),
|
|
54
|
+
"duration_ms": self.duration_ms,
|
|
55
|
+
}
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def compute_dataframe_checksum(df: pd.DataFrame) -> str:
    """Compute a checksum for a DataFrame's key columns.

    Uses an MD5 hash over the index (timestamps) plus the OHLCV columns for
    fast equality comparison. xxHash64 would be faster but MD5 is available
    without extra dependencies.

    Parameters
    ----------
    df : pd.DataFrame
        Range bar DataFrame with DatetimeIndex and OHLCV columns.

    Returns
    -------
    str
        Hex digest of the checksum, or the sentinel string "empty" when the
        frame has no rows or none of the key columns.
    """
    import pandas as pd

    # Only the core OHLCV columns participate in the checksum.
    key_cols = ["Open", "High", "Low", "Close", "Volume"]
    present_cols = [c for c in key_cols if c in df.columns]

    if not present_cols or df.empty:
        return "empty"

    # Create string representation of key data:
    # index (timestamps) first, then each OHLCV column in fixed order.
    hasher = hashlib.md5()

    # Cast DatetimeIndex explicitly to int64: a bare astype(int) resolves to
    # a 32-bit integer dtype on some platforms (e.g. Windows), where casting
    # datetime64 values fails. "int64" is portable and yields the same digest.
    if isinstance(df.index, pd.DatetimeIndex):
        index_str = df.index.astype("int64").astype(str).str.cat(sep=",")
    else:
        index_str = ",".join(str(x) for x in df.index)
    hasher.update(index_str.encode())

    # Hash each present key column in the declared order for determinism.
    for col in present_cols:
        col_str = df[col].astype(str).str.cat(sep=",")
        hasher.update(col_str.encode())

    return hasher.hexdigest()
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def validate_post_storage(
    expected: pd.DataFrame,
    retrieved: pd.DataFrame | None,
    *,
    symbol: str = "",
    threshold_bps: int = 0,
) -> ValidationResult:
    """Validate that retrieved data matches expected data after cache operation.

    This is a fast (<1 sec) validation that should run after every cache write
    to verify data integrity. It checks:
    1. Row count matches
    2. First timestamp matches
    3. Last timestamp matches
    4. Checksum matches (OHLCV data)

    Parameters
    ----------
    expected : pd.DataFrame
        The DataFrame that was written to cache.
    retrieved : pd.DataFrame | None
        The DataFrame read back from cache (None if read failed).
    symbol : str, optional
        Symbol for logging context.
    threshold_bps : int, optional
        Threshold for logging context (recorded in details; not used in checks).

    Returns
    -------
    ValidationResult
        Validation result with pass/fail and details. The ``checks`` dict
        contains the keys "data_retrieved", "row_count_match",
        "first_timestamp_match", "last_timestamp_match", "checksum_match"
        (all but the first are skipped when the read returned None).

    Examples
    --------
    >>> from rangebar.validation.post_storage import validate_post_storage
    >>> result = validate_post_storage(computed_df, cached_df, symbol="BTCUSDT")
    >>> if not result.passed:
    ...     logger.error("Post-storage validation FAILED: %s", result.checks)
    """
    # Local import keeps module import light; time is only needed here.
    import time

    start_time = time.perf_counter()
    checks: dict[str, bool] = {}
    details: dict[str, Any] = {
        "symbol": symbol,
        "threshold_bps": threshold_bps,
    }

    # Check 1: Retrieved data exists.
    # Early exit: if the cache read failed there is nothing to compare, so
    # only "data_retrieved" appears in checks and an error is recorded.
    if retrieved is None:
        checks["data_retrieved"] = False
        duration_ms = (time.perf_counter() - start_time) * 1000
        return ValidationResult(
            passed=False,
            checks=checks,
            details={**details, "error": "No data retrieved from cache"},
            duration_ms=duration_ms,
        )
    checks["data_retrieved"] = True

    # Check 2: Row count matches
    expected_count = len(expected)
    retrieved_count = len(retrieved)
    checks["row_count_match"] = expected_count == retrieved_count
    details["expected_count"] = expected_count
    details["retrieved_count"] = retrieved_count

    if not checks["row_count_match"]:
        logger.warning(
            "Row count mismatch for %s: expected %d, got %d",
            symbol,
            expected_count,
            retrieved_count,
        )

    # Check 3: First timestamp matches
    # Endpoint timestamps are only comparable when both frames have rows.
    if not expected.empty and not retrieved.empty:
        expected_first = expected.index[0]
        retrieved_first = retrieved.index[0]
        checks["first_timestamp_match"] = expected_first == retrieved_first
        details["expected_first_ts"] = str(expected_first)
        details["retrieved_first_ts"] = str(retrieved_first)

        # Check 4: Last timestamp matches
        expected_last = expected.index[-1]
        retrieved_last = retrieved.index[-1]
        checks["last_timestamp_match"] = expected_last == retrieved_last
        details["expected_last_ts"] = str(expected_last)
        details["retrieved_last_ts"] = str(retrieved_last)
    else:
        # When at least one frame is empty, the timestamps "match" only if
        # BOTH are empty (empty-vs-empty passes, empty-vs-nonempty fails).
        checks["first_timestamp_match"] = expected.empty == retrieved.empty
        checks["last_timestamp_match"] = expected.empty == retrieved.empty

    # Check 5: Checksum matches (OHLCV data integrity)
    expected_checksum = compute_dataframe_checksum(expected)
    retrieved_checksum = compute_dataframe_checksum(retrieved)
    checks["checksum_match"] = expected_checksum == retrieved_checksum
    details["expected_checksum"] = expected_checksum[:16]  # Truncate for logging
    details["retrieved_checksum"] = retrieved_checksum[:16]

    if not checks["checksum_match"]:
        logger.warning(
            "Checksum mismatch for %s: data corruption detected",
            symbol,
        )

    # Overall pass/fail: every recorded check must have passed.
    passed = all(checks.values())
    duration_ms = (time.perf_counter() - start_time) * 1000

    result = ValidationResult(
        passed=passed,
        checks=checks,
        details=details,
        duration_ms=duration_ms,
    )

    if passed:
        logger.debug(
            "Post-storage validation PASSED for %s (%d bars, %.1fms)",
            symbol,
            expected_count,
            duration_ms,
        )
    else:
        # Log only the checks that failed to keep the warning focused.
        logger.warning(
            "Post-storage validation FAILED for %s: %s",
            symbol,
            {k: v for k, v in checks.items() if not v},
        )

    return result
|
|
234
|
+
|
|
235
|
+
|
|
236
|
+
def validate_ohlc_invariants(df: pd.DataFrame) -> ValidationResult:
    """Validate OHLC price invariants.

    For every bar the following must hold:
    - High >= max(Open, Close)
    - Low <= min(Open, Close)

    Any well-formed OHLC series satisfies both invariants.

    Parameters
    ----------
    df : pd.DataFrame
        Range bar DataFrame with Open, High, Low, Close columns.

    Returns
    -------
    ValidationResult
        Validation result.
    """
    import time

    started = time.perf_counter()
    checks: dict[str, bool] = {}
    details: dict[str, Any] = {"bar_count": len(df)}

    # An empty frame is trivially valid.
    if df.empty:
        return ValidationResult(
            passed=True,
            checks={"empty_dataframe": True},
            details=details,
            duration_ms=(time.perf_counter() - started) * 1000,
        )

    # All four price columns must exist before the invariants can be tested.
    missing = [c for c in ("Open", "High", "Low", "Close") if c not in df.columns]
    if missing:
        return ValidationResult(
            passed=False,
            checks={"columns_present": False},
            details={**details, "missing_columns": missing},
            duration_ms=(time.perf_counter() - started) * 1000,
        )
    checks["columns_present"] = True

    # Per-row bar-body extremes, computed once and reused by both invariants.
    body_top = df[["Open", "Close"]].max(axis=1)
    body_bottom = df[["Open", "Close"]].min(axis=1)

    # Invariant 1: High must cap the bar body.
    high_ok = bool((df["High"] >= body_top).all())
    checks["high_ge_open_close"] = high_ok
    if not high_ok:
        bad_high = df[df["High"] < body_top]
        details["high_invalid_count"] = len(bad_high)
        details["high_invalid_first_ts"] = str(bad_high.index[0])

    # Invariant 2: Low must floor the bar body.
    low_ok = bool((df["Low"] <= body_bottom).all())
    checks["low_le_open_close"] = low_ok
    if not low_ok:
        bad_low = df[df["Low"] > body_bottom]
        details["low_invalid_count"] = len(bad_low)
        details["low_invalid_first_ts"] = str(bad_low.index[0])

    return ValidationResult(
        passed=all(checks.values()),
        checks=checks,
        details=details,
        duration_ms=(time.perf_counter() - started) * 1000,
    )
|
|
310
|
+
|
|
311
|
+
|
|
312
|
+
# Public API surface of the post-storage validation module.
__all__ = [
    "ValidationResult",
    "compute_dataframe_checksum",
    "validate_ohlc_invariants",
    "validate_post_storage",
]
|