rangebar 11.6.1__cp313-cp313-macosx_11_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rangebar/CLAUDE.md +327 -0
- rangebar/__init__.py +227 -0
- rangebar/__init__.pyi +1089 -0
- rangebar/_core.cpython-313-darwin.so +0 -0
- rangebar/checkpoint.py +472 -0
- rangebar/cli.py +298 -0
- rangebar/clickhouse/CLAUDE.md +139 -0
- rangebar/clickhouse/__init__.py +100 -0
- rangebar/clickhouse/bulk_operations.py +309 -0
- rangebar/clickhouse/cache.py +734 -0
- rangebar/clickhouse/client.py +121 -0
- rangebar/clickhouse/config.py +141 -0
- rangebar/clickhouse/mixin.py +120 -0
- rangebar/clickhouse/preflight.py +504 -0
- rangebar/clickhouse/query_operations.py +345 -0
- rangebar/clickhouse/schema.sql +187 -0
- rangebar/clickhouse/tunnel.py +222 -0
- rangebar/constants.py +288 -0
- rangebar/conversion.py +177 -0
- rangebar/exceptions.py +207 -0
- rangebar/exness.py +364 -0
- rangebar/hooks.py +311 -0
- rangebar/logging.py +171 -0
- rangebar/notify/__init__.py +15 -0
- rangebar/notify/pushover.py +155 -0
- rangebar/notify/telegram.py +271 -0
- rangebar/orchestration/__init__.py +20 -0
- rangebar/orchestration/count_bounded.py +797 -0
- rangebar/orchestration/helpers.py +412 -0
- rangebar/orchestration/models.py +76 -0
- rangebar/orchestration/precompute.py +498 -0
- rangebar/orchestration/range_bars.py +736 -0
- rangebar/orchestration/tick_fetcher.py +226 -0
- rangebar/ouroboros.py +454 -0
- rangebar/processors/__init__.py +22 -0
- rangebar/processors/api.py +383 -0
- rangebar/processors/core.py +522 -0
- rangebar/resource_guard.py +567 -0
- rangebar/storage/__init__.py +22 -0
- rangebar/storage/checksum_registry.py +218 -0
- rangebar/storage/parquet.py +728 -0
- rangebar/streaming.py +300 -0
- rangebar/validation/__init__.py +69 -0
- rangebar/validation/cache_staleness.py +277 -0
- rangebar/validation/continuity.py +664 -0
- rangebar/validation/gap_classification.py +294 -0
- rangebar/validation/post_storage.py +317 -0
- rangebar/validation/tier1.py +175 -0
- rangebar/validation/tier2.py +261 -0
- rangebar-11.6.1.dist-info/METADATA +308 -0
- rangebar-11.6.1.dist-info/RECORD +54 -0
- rangebar-11.6.1.dist-info/WHEEL +4 -0
- rangebar-11.6.1.dist-info/entry_points.txt +2 -0
- rangebar-11.6.1.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,736 @@
|
|
|
1
|
+
# polars-exception: backtesting.py requires Pandas DataFrames with DatetimeIndex
|
|
2
|
+
# Issue #46: Modularization M4 - Extract get_range_bars from __init__.py
|
|
3
|
+
"""Date-bounded range bar generation.
|
|
4
|
+
|
|
5
|
+
Provides get_range_bars() - the single entry point for all range bar generation
|
|
6
|
+
with automatic data fetching, caching, and ouroboros boundary handling.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
from collections.abc import Iterator
|
|
12
|
+
from typing import TYPE_CHECKING, Any, Literal
|
|
13
|
+
|
|
14
|
+
import pandas as pd
|
|
15
|
+
|
|
16
|
+
from rangebar.constants import (
|
|
17
|
+
THRESHOLD_DECIMAL_MAX,
|
|
18
|
+
THRESHOLD_DECIMAL_MIN,
|
|
19
|
+
THRESHOLD_PRESETS,
|
|
20
|
+
)
|
|
21
|
+
from rangebar.processors.core import RangeBarProcessor
|
|
22
|
+
from rangebar.validation.cache_staleness import detect_staleness
|
|
23
|
+
|
|
24
|
+
from .helpers import (
|
|
25
|
+
_fetch_binance,
|
|
26
|
+
_fetch_exness,
|
|
27
|
+
_process_binance_trades,
|
|
28
|
+
_process_exness_ticks,
|
|
29
|
+
_stream_range_bars_binance,
|
|
30
|
+
)
|
|
31
|
+
|
|
32
|
+
if TYPE_CHECKING:
|
|
33
|
+
import polars as pl
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def get_range_bars(
    symbol: str,
    start_date: str,
    end_date: str,
    threshold_decimal_bps: int | str = 250,
    *,
    # Ouroboros: Cyclical reset boundaries (v11.0+)
    ouroboros: Literal["year", "month", "week"] = "year",
    include_orphaned_bars: bool = False,
    # Streaming options (v8.0+)
    materialize: bool = True,
    batch_size: int = 10_000,
    # Data source configuration
    source: str = "binance",
    market: str = "spot",
    # Exness-specific options
    validation: str = "strict",
    # Processing options
    include_incomplete: bool = False,
    include_microstructure: bool = False,
    include_exchange_sessions: bool = False,  # Issue #8: Exchange session flags
    # Timestamp gating (Issue #36)
    prevent_same_timestamp_close: bool = True,
    # Data integrity (Issue #43)
    verify_checksum: bool = True,
    # Caching options
    use_cache: bool = True,
    fetch_if_missing: bool = True,
    cache_dir: str | None = None,
    # Memory guards (Issue #49)
    max_memory_mb: int | None = None,
    # Inter-bar features (Issue #59)
    inter_bar_lookback_count: int | None = None,
) -> pd.DataFrame | Iterator[pl.DataFrame]:
    """Get range bars for a symbol with automatic data fetching and caching.

    This is the single entry point for all range bar generation. It supports
    multiple data sources (Binance crypto, Exness forex), all market types,
    and exposes the full configurability of the underlying Rust engine.

    Pipeline (in order, as implemented below):
    1. Resolve/validate threshold, ouroboros mode, source, market, dates.
    2. Streaming mode short-circuit (materialize=False, Binance only).
    3. ClickHouse bar-cache fast path (best-effort; falls through on
       ImportError/ConnectionError or stale content).
    4. Local Parquet tick storage lookup / network fetch.
    5. Per-ouroboros-segment processing with processor resets at boundaries.
    6. Optional exchange-session flag columns.
    7. Best-effort write-back of computed bars to ClickHouse.

    Parameters
    ----------
    symbol : str
        Trading symbol (uppercase).
        - Binance: "BTCUSDT", "ETHUSDT", etc.
        - Exness: "EURUSD", "GBPUSD", "XAUUSD", etc.
    start_date : str
        Start date in YYYY-MM-DD format.
    end_date : str
        End date in YYYY-MM-DD format.
    threshold_decimal_bps : int or str, default=250
        Threshold in decimal basis points. Can be:
        - Integer: Direct value (250 dbps = 0.25%)
        - String preset: "micro" (10 dbps), "tight" (50 dbps), "standard" (100 dbps),
          "medium" (250 dbps), "wide" (500 dbps), "macro" (1000 dbps)
        Valid range: 1-100,000 dbps (0.001% to 100%)
    ouroboros : {"year", "month", "week"}, default="year"
        Cyclical reset boundary for reproducible bar construction (v11.0+).
        Processor state resets at each boundary for deterministic results.
        - "year" (default): Reset at January 1st 00:00:00 UTC (cryptocurrency)
        - "month": Reset at 1st of each month 00:00:00 UTC
        - "week": Reset at Sunday 00:00:00 UTC (required for Forex)
        Named after the Greek serpent eating its tail (οὐροβόρος).
    include_orphaned_bars : bool, default=False
        Include incomplete bars from ouroboros boundaries.
        If True, orphaned bars are included with ``is_orphan=True`` column.
        Useful for analysis; filter with ``df[~df.get('is_orphan', False)]``.
    materialize : bool, default=True
        If True, return a single pd.DataFrame (legacy behavior).
        If False, return an Iterator[pl.DataFrame] that yields batches
        of bars for memory-efficient streaming (v8.0+).
    batch_size : int, default=10_000
        Number of bars per batch when materialize=False.
        Each batch is ~500 KB. Only used in streaming mode.
    source : str, default="binance"
        Data source: "binance" or "exness"
    market : str, default="spot"
        Market type (Binance only):
        - "spot": Spot market
        - "futures-um" or "um": USD-M perpetual futures
        - "futures-cm" or "cm": COIN-M perpetual futures
    validation : str, default="strict"
        Validation strictness (Exness only):
        - "permissive": Basic checks (bid > 0, ask > 0, bid < ask)
        - "strict": + Spread < 10% (catches obvious errors)
        - "paranoid": + Spread < 1% (flags suspicious data)
    include_incomplete : bool, default=False
        Include the final incomplete bar (useful for analysis).
        If False (default), only completed bars are returned.
    include_microstructure : bool, default=False
        Include market microstructure columns:
        - buy_volume, sell_volume: Volume by aggressor side
        - vwap: Volume-weighted average price
        - trade_count: Number of trades in bar
        - (Exness) spread_min, spread_max, spread_avg: Spread statistics
    include_exchange_sessions : bool, default=False
        Include traditional exchange market session flags (Issue #8).
        When True, adds boolean columns indicating active sessions at bar close:
        - exchange_session_sydney: ASX (10:00-16:00 Sydney time)
        - exchange_session_tokyo: TSE (09:00-15:00 Tokyo time)
        - exchange_session_london: LSE (08:00-17:00 London time)
        - exchange_session_newyork: NYSE (10:00-16:00 New York time)
        Useful for analyzing crypto/forex behavior during traditional market hours.
    prevent_same_timestamp_close : bool, default=True
        Timestamp gating for flash crash prevention (Issue #36).
        If True (default): A bar cannot close on the same timestamp it opened.
        This prevents flash crash scenarios from creating thousands of bars
        at identical timestamps. If False: Legacy v8 behavior where bars can
        close immediately on breach regardless of timestamp. Use False for
        comparative analysis between old and new behavior.
    verify_checksum : bool, default=True
        Verify SHA-256 checksum of downloaded data (Issue #43).
        If True (default): Verify downloaded ZIP files against Binance-provided
        checksums to detect data corruption early. If verification fails,
        raises RuntimeError. If False: Skip checksum verification for faster
        downloads (use when data integrity is verified elsewhere).
    use_cache : bool, default=True
        Cache tick data locally in Parquet format.
    fetch_if_missing : bool, default=True
        If True (default), fetch tick data from source when not available
        in cache. If False, return only cached data (may return empty
        DataFrame if no cached data exists for the date range).
    cache_dir : str or None, default=None
        Custom cache directory. If None, uses platform default:
        - macOS: ~/Library/Caches/rangebar/
        - Linux: ~/.cache/rangebar/
        - Windows: %LOCALAPPDATA%/terrylica/rangebar/Cache/
    max_memory_mb : int or None, default=None
        Memory budget in MB for tick data loading. If the estimated
        in-memory size exceeds this limit, raises MemoryError. If None,
        uses automatic detection (80% of available RAM). Set to 0 to
        disable all memory guards.
    inter_bar_lookback_count : int or None, default=None
        Number of trades to keep in lookback buffer for inter-bar feature
        computation (Issue #59). If set, enables 16 inter-bar features
        computed from trades BEFORE each bar opens. Recommended: 100-500.
        If None (default), inter-bar features are disabled.

    Returns
    -------
    pd.DataFrame or Iterator[pl.DataFrame]
        If materialize=True (default): Single pd.DataFrame ready for
        backtesting.py, with DatetimeIndex and OHLCV columns.

        If materialize=False: Iterator yielding pl.DataFrame batches
        (batch_size bars each) for memory-efficient streaming. Convert
        to pandas with: ``pl.concat(list(iterator)).to_pandas()``

        Columns: Open, High, Low, Close, Volume
        (if include_microstructure) Additional columns

    Raises
    ------
    ValueError
        - Invalid threshold (outside 1-100,000 range)
        - Invalid dates or date format
        - Unknown source, market, or validation level
        - Unknown threshold preset name
    RuntimeError
        - Data fetching failed
        - No data available for date range
        - Feature not enabled (e.g., Exness without exness feature)
    MemoryError
        - Estimated tick memory exceeds the budget (Issue #49 guard).

    Examples
    --------
    Basic usage - Binance spot:

    >>> from rangebar import get_range_bars
    >>> df = get_range_bars("BTCUSDT", "2024-01-01", "2024-06-30")

    Using threshold presets:

    >>> df = get_range_bars("BTCUSDT", "2024-01-01", "2024-03-31", "tight")

    Binance USD-M Futures:

    >>> df = get_range_bars("BTCUSDT", "2024-01-01", "2024-03-31", market="futures-um")

    Exness forex with spread monitoring:

    >>> df = get_range_bars(
    ...     "EURUSD", "2024-01-01", "2024-01-31",
    ...     source="exness",
    ...     threshold_decimal_bps="standard",
    ...     include_microstructure=True,  # includes spread stats
    ... )

    Include incomplete bar for analysis:

    >>> df = get_range_bars(
    ...     "ETHUSDT", "2024-01-01", "2024-01-07",
    ...     include_incomplete=True,
    ... )

    Use with backtesting.py:

    >>> from backtesting import Backtest, Strategy
    >>> df = get_range_bars("BTCUSDT", "2024-01-01", "2024-12-31")
    >>> bt = Backtest(df, MyStrategy, cash=10000, commission=0.0002)
    >>> stats = bt.run()

    Streaming mode for large datasets (v8.0+):

    >>> import polars as pl
    >>> # Memory-efficient: yields ~500 KB batches
    >>> for batch in get_range_bars(
    ...     "BTCUSDT", "2024-01-01", "2024-06-30",
    ...     materialize=False,
    ...     batch_size=10_000,
    ... ):
    ...     process_batch(batch)  # batch is pl.DataFrame
    ...
    >>> # Or collect to single DataFrame:
    >>> batches = list(get_range_bars(
    ...     "BTCUSDT", "2024-01-01", "2024-03-31",
    ...     materialize=False,
    ... ))
    >>> df = pl.concat(batches).to_pandas()

    Notes
    -----
    Threshold units (decimal basis points):
    The threshold is specified in decimal basis points (0.1bps) for precision.
    Common conversions:
    - 10 = 1bps = 0.01%
    - 100 = 10bps = 0.1%
    - 250 = 25bps = 0.25%
    - 1000 = 100bps = 1%

    Tier-1 symbols:
    18 high-liquidity symbols available on ALL Binance markets:
    AAVE, ADA, AVAX, BCH, BNB, BTC, DOGE, ETH, FIL,
    LINK, LTC, NEAR, SOL, SUI, UNI, WIF, WLD, XRP

    Non-lookahead guarantee:
    - Threshold computed from bar OPEN price only
    - Breaching trade included in closing bar
    - No future information used in bar construction

    See Also
    --------
    TIER1_SYMBOLS : Tuple of high-liquidity symbols
    THRESHOLD_PRESETS : Dictionary of named threshold values
    """
    # Deferred imports keep module import light; heavy deps load on first call.
    from datetime import datetime
    from pathlib import Path

    from rangebar.storage.parquet import TickStorage

    # -------------------------------------------------------------------------
    # Resolve threshold (support presets)
    # -------------------------------------------------------------------------
    if isinstance(threshold_decimal_bps, str):
        if threshold_decimal_bps not in THRESHOLD_PRESETS:
            msg = (
                f"Unknown threshold preset: {threshold_decimal_bps!r}. "
                f"Valid presets: {list(THRESHOLD_PRESETS.keys())}"
            )
            raise ValueError(msg)
        # Rebind to the numeric preset value; all later code sees an int.
        threshold_decimal_bps = THRESHOLD_PRESETS[threshold_decimal_bps]

    if not THRESHOLD_DECIMAL_MIN <= threshold_decimal_bps <= THRESHOLD_DECIMAL_MAX:
        msg = (
            f"threshold_decimal_bps must be between {THRESHOLD_DECIMAL_MIN} and {THRESHOLD_DECIMAL_MAX}, "
            f"got {threshold_decimal_bps}"
        )
        raise ValueError(msg)

    # -------------------------------------------------------------------------
    # Validate ouroboros mode (v11.0+)
    # -------------------------------------------------------------------------
    from rangebar.ouroboros import validate_ouroboros_mode

    ouroboros = validate_ouroboros_mode(ouroboros)

    # -------------------------------------------------------------------------
    # Validate source and market
    # -------------------------------------------------------------------------
    source = source.lower()
    if source not in ("binance", "exness"):
        msg = f"Unknown source: {source!r}. Must be 'binance' or 'exness'"
        raise ValueError(msg)

    # Normalize market type
    market_map = {
        "spot": "spot",
        "futures-um": "um",
        "futures-cm": "cm",
        "um": "um",
        "cm": "cm",
    }
    market = market.lower()
    # Market is only validated for Binance; Exness ignores this parameter.
    if source == "binance" and market not in market_map:
        msg = (
            f"Unknown market: {market!r}. "
            "Must be 'spot', 'futures-um'/'um', or 'futures-cm'/'cm'"
        )
        raise ValueError(msg)
    market_normalized = market_map.get(market, market)

    # Validate Exness validation strictness
    validation = validation.lower()
    if source == "exness" and validation not in ("permissive", "strict", "paranoid"):
        msg = (
            f"Unknown validation: {validation!r}. "
            "Must be 'permissive', 'strict', or 'paranoid'"
        )
        raise ValueError(msg)

    # -------------------------------------------------------------------------
    # Parse and validate dates
    # -------------------------------------------------------------------------
    try:
        # NOTE(review): strptime yields naive datetimes, so .timestamp() below
        # converts using the LOCAL timezone, while the docstring describes UTC
        # boundaries — confirm this is intended (cache keys elsewhere may rely
        # on the same convention).
        start_dt = datetime.strptime(start_date, "%Y-%m-%d")
        end_dt = datetime.strptime(end_date, "%Y-%m-%d")
    except ValueError as e:
        msg = f"Invalid date format. Use YYYY-MM-DD: {e}"
        raise ValueError(msg) from e

    if start_dt > end_dt:
        msg = "start_date must be <= end_date"
        raise ValueError(msg)

    # Convert to milliseconds for cache lookup
    start_ts = int(start_dt.timestamp() * 1000)
    end_ts = int((end_dt.timestamp() + 86399) * 1000)  # End of day

    # -------------------------------------------------------------------------
    # Streaming mode (v8.0+): Return generator instead of materializing
    # -------------------------------------------------------------------------
    if not materialize:
        if source == "exness":
            msg = (
                "Streaming mode (materialize=False) is not yet supported for Exness. "
                "Use materialize=True or use Binance source."
            )
            raise ValueError(msg)

        # Binance streaming: yields batches directly from network
        return _stream_range_bars_binance(
            symbol=symbol,
            start_date=start_date,
            end_date=end_date,
            threshold_decimal_bps=threshold_decimal_bps,
            market=market_normalized,
            batch_size=batch_size,
            include_microstructure=include_microstructure,
            include_incomplete=include_incomplete,
            prevent_same_timestamp_close=prevent_same_timestamp_close,
            verify_checksum=verify_checksum,
            inter_bar_lookback_count=inter_bar_lookback_count,
        )

    # -------------------------------------------------------------------------
    # Check ClickHouse bar cache first (Issue #21: fast path for precomputed bars)
    # -------------------------------------------------------------------------
    # Best-effort: any ImportError/ConnectionError falls through to the slow
    # tick-processing path below instead of failing the call.
    if use_cache:
        try:
            from rangebar.clickhouse import RangeBarCache

            with RangeBarCache() as cache:
                # Ouroboros mode filter ensures cache isolation (Plan: sparkling-coalescing-dijkstra.md)
                cached_bars = cache.get_bars_by_timestamp_range(
                    symbol=symbol,
                    threshold_decimal_bps=threshold_decimal_bps,
                    start_ts=start_ts,
                    end_ts=end_ts,
                    include_microstructure=include_microstructure,
                    ouroboros_mode=ouroboros,
                )
                if cached_bars is not None and len(cached_bars) > 0:
                    # Tier 0 validation: Content-based staleness detection (Issue #39)
                    # This catches stale cached data from pre-v7.0 (e.g., VWAP=0)
                    if include_microstructure:
                        staleness = detect_staleness(
                            cached_bars, require_microstructure=True
                        )
                        if staleness.is_stale:
                            import logging

                            logger = logging.getLogger(__name__)
                            logger.warning(
                                "Stale cache data detected for %s: %s. "
                                "Falling through to recompute.",
                                symbol,
                                staleness.reason,
                            )
                            # Fall through to tick processing path
                        else:
                            # Fast path: return validated bars from ClickHouse (~50ms)
                            return cached_bars
                    else:
                        # Fast path: return precomputed bars from ClickHouse (~50ms)
                        return cached_bars
        except ImportError:
            # ClickHouse not available, fall through to tick processing
            pass
        except ConnectionError:
            # ClickHouse connection failed, fall through to tick processing
            pass

    # -------------------------------------------------------------------------
    # Initialize storage (Tier 1: local Parquet ticks)
    # -------------------------------------------------------------------------
    storage = TickStorage(cache_dir=Path(cache_dir) if cache_dir else None)

    # Cache key includes source and market to avoid collisions
    cache_symbol = f"{source}_{market_normalized}_{symbol}".upper()

    # -------------------------------------------------------------------------
    # Determine tick data source (cache or network)
    # -------------------------------------------------------------------------
    has_cached_ticks = use_cache and storage.has_ticks(cache_symbol, start_ts, end_ts)

    # Cache-only mode: caller explicitly opted out of network fetches.
    if not has_cached_ticks and not fetch_if_missing:
        return pd.DataFrame(columns=["Open", "High", "Low", "Close", "Volume"])

    # For Exness, load all ticks upfront (smaller datasets)
    if source == "exness":
        if has_cached_ticks:
            tick_data = storage.read_ticks(cache_symbol, start_ts, end_ts)
        else:
            tick_data = _fetch_exness(symbol, start_date, end_date, validation)
            if use_cache and not tick_data.is_empty():
                storage.write_ticks(cache_symbol, tick_data)
        if tick_data.is_empty():
            msg = f"No data available for {symbol} from {start_date} to {end_date}"
            raise RuntimeError(msg)
        return _process_exness_ticks(
            tick_data,
            symbol,
            threshold_decimal_bps,
            validation,
            include_incomplete,
            include_microstructure,
            inter_bar_lookback_count=inter_bar_lookback_count,
        )

    # -------------------------------------------------------------------------
    # MEM-010: Pre-flight memory estimation (Issue #49)
    # Check if cached tick data would fit in memory before loading.
    # -------------------------------------------------------------------------
    # max_memory_mb == 0 disables the guard entirely (documented sentinel).
    if has_cached_ticks and max_memory_mb != 0:
        import warnings

        from rangebar.resource_guard import estimate_tick_memory

        estimate = estimate_tick_memory(
            storage, cache_symbol, start_ts, end_ts
        )
        if estimate.recommendation == "will_oom":
            # msg is only raised on the auto-detection path below;
            # check_or_raise builds its own message for explicit budgets.
            msg = (
                f"Loading {symbol} ({start_date} -> {end_date}) would "
                f"require ~{estimate.estimated_memory_mb} MB "
                f"(available: {estimate.system_available_mb} MB). "
                f"Use precompute_range_bars() for streaming processing."
            )
            if max_memory_mb is not None:
                estimate.check_or_raise(max_mb=max_memory_mb)
            else:
                raise MemoryError(msg)
        elif estimate.recommendation == "streaming_recommended":
            warnings.warn(
                f"Large tick dataset for {symbol} "
                f"(~{estimate.estimated_memory_mb} MB). "
                f"Consider precompute_range_bars() for memory-safe "
                f"processing.",
                ResourceWarning,
                stacklevel=2,
            )

    # -------------------------------------------------------------------------
    # Binance: Process with ouroboros segment iteration (Issue #51)
    # Load ticks per-segment to avoid OOM on large date ranges.
    # Each segment loads only the ticks within its boundaries (~1 year max).
    # -------------------------------------------------------------------------
    from rangebar.ouroboros import iter_ouroboros_segments

    all_bars: list[pd.DataFrame] = []
    processor: RangeBarProcessor | None = None
    any_data_found = False

    for segment_start, segment_end, boundary in iter_ouroboros_segments(
        start_dt.date(), end_dt.date(), ouroboros
    ):
        # Reset processor at ouroboros boundary
        if boundary is not None and processor is not None:
            orphaned_bar = processor.reset_at_ouroboros()
            if include_orphaned_bars and orphaned_bar is not None:
                # Add orphan metadata
                orphaned_bar["is_orphan"] = True
                orphaned_bar["ouroboros_boundary"] = boundary.timestamp
                orphaned_bar["ouroboros_reason"] = boundary.reason
                orphan_df = pd.DataFrame([orphaned_bar])
                # Convert timestamp to datetime index
                # (unit="us": bar timestamps are microseconds here)
                if "timestamp" in orphan_df.columns:
                    orphan_df["timestamp"] = pd.to_datetime(
                        orphan_df["timestamp"], unit="us", utc=True
                    )
                    orphan_df = orphan_df.set_index("timestamp")
                all_bars.append(orphan_df)

        # Load tick data scoped to this segment (not the full range)
        segment_start_ms = int(segment_start.timestamp() * 1_000)
        segment_end_ms = int(segment_end.timestamp() * 1_000)

        if has_cached_ticks:
            segment_ticks = storage.read_ticks(
                cache_symbol, segment_start_ms, segment_end_ms
            )
        else:
            # Fetch from network for this segment only
            seg_start_str = segment_start.strftime("%Y-%m-%d")
            seg_end_str = segment_end.strftime("%Y-%m-%d")
            segment_ticks = _fetch_binance(
                symbol, seg_start_str, seg_end_str, market_normalized
            )
            # Cache segment ticks
            if use_cache and not segment_ticks.is_empty():
                storage.write_ticks(cache_symbol, segment_ticks)

        if segment_ticks.is_empty():
            continue

        any_data_found = True

        # Process segment (reuse processor for state continuity within segment)
        segment_bars, processor = _process_binance_trades(
            segment_ticks,
            threshold_decimal_bps,
            include_incomplete,
            include_microstructure,
            processor=processor,
            symbol=symbol,
            prevent_same_timestamp_close=prevent_same_timestamp_close,
            inter_bar_lookback_count=inter_bar_lookback_count,
        )

        if segment_bars is not None and not segment_bars.empty:
            all_bars.append(segment_bars)

    if not any_data_found:
        msg = f"No data available for {symbol} from {start_date} to {end_date}"
        raise RuntimeError(msg)

    # Concatenate all segments
    if not all_bars:
        bars_df = pd.DataFrame(columns=["Open", "High", "Low", "Close", "Volume"])
    elif len(all_bars) == 1:
        bars_df = all_bars[0]
    else:
        bars_df = pd.concat(all_bars, axis=0)
        # Orphan rows are appended before their segment's bars; restore
        # chronological order across segment joins.
        bars_df = bars_df.sort_index()

    # -------------------------------------------------------------------------
    # Add exchange session flags (Issue #8)
    # -------------------------------------------------------------------------
    # Session flags indicate which traditional market sessions were active
    # at bar close time. Useful for analyzing crypto/forex behavior.
    if include_exchange_sessions and not bars_df.empty:
        import warnings

        from rangebar.ouroboros import get_active_exchange_sessions

        # Compute session flags for each bar based on close timestamp (index)
        session_data = {
            "exchange_session_sydney": [],
            "exchange_session_tokyo": [],
            "exchange_session_london": [],
            "exchange_session_newyork": [],
        }
        for ts in bars_df.index:
            # Ensure timezone-aware UTC timestamp
            if ts.tzinfo is None:
                ts_utc = ts.tz_localize("UTC")
            else:
                ts_utc = ts.tz_convert("UTC")
            # Suppress nanosecond warning - session detection is hour-granularity
            with warnings.catch_warnings():
                warnings.filterwarnings("ignore", "Discarding nonzero nanoseconds")
                flags = get_active_exchange_sessions(ts_utc.to_pydatetime())
            session_data["exchange_session_sydney"].append(flags.sydney)
            session_data["exchange_session_tokyo"].append(flags.tokyo)
            session_data["exchange_session_london"].append(flags.london)
            session_data["exchange_session_newyork"].append(flags.newyork)

        # Add columns to DataFrame
        for col, values in session_data.items():
            bars_df[col] = values

    # -------------------------------------------------------------------------
    # Write computed bars to ClickHouse cache (Issue #37)
    # -------------------------------------------------------------------------
    # Cache write is non-blocking: failures don't affect the return value.
    # The computation succeeded, so we return bars even if caching fails.
    if use_cache and bars_df is not None and not bars_df.empty:
        try:
            from rangebar.clickhouse import RangeBarCache
            from rangebar.exceptions import CacheError

            with RangeBarCache() as cache:
                # Use store_bars_bulk for bars computed without exact CacheKey
                # Ouroboros mode determines cache key (Plan: sparkling-coalescing-dijkstra.md)
                written = cache.store_bars_bulk(
                    symbol=symbol,
                    threshold_decimal_bps=threshold_decimal_bps,
                    bars=bars_df,
                    version="",  # Version tracked elsewhere
                    ouroboros_mode=ouroboros,
                )
            import logging

            logger = logging.getLogger(__name__)
            logger.info(
                "Cached %d bars for %s @ %d dbps",
                written,
                symbol,
                threshold_decimal_bps,
            )
        except ImportError:
            # ClickHouse not available - skip caching
            pass
        except ConnectionError:
            # ClickHouse connection failed - skip caching
            pass
        except (CacheError, OSError, RuntimeError) as e:
            # Log but don't fail - cache is optimization layer
            # CacheError: All cache-specific errors
            # OSError: Network/disk errors
            # RuntimeError: ClickHouse driver errors
            import logging

            logger = logging.getLogger(__name__)
            logger.warning("Cache write failed (non-fatal): %s", e)

    return bars_df
|
|
673
|
+
|
|
674
|
+
|
|
675
|
+
def get_range_bars_pandas(
    symbol: str,
    start_date: str,
    end_date: str,
    threshold_decimal_bps: int | str = 250,
    **kwargs: Any,
) -> pd.DataFrame:
    """Deprecated compatibility shim returning range bars as pandas.

    .. deprecated:: 8.0
        Call ``get_range_bars(materialize=True)`` directly instead.
        This shim will be removed in v9.0.

    Kept for code written before the streaming API existed. It emits a
    ``DeprecationWarning`` and then delegates to ``get_range_bars()``
    with ``materialize=True``, returning the result unchanged.

    Parameters
    ----------
    symbol : str
        Trading symbol (e.g., "BTCUSDT")
    start_date : str
        Start date in YYYY-MM-DD format
    end_date : str
        End date in YYYY-MM-DD format
    threshold_decimal_bps : int or str, default=250
        Threshold in decimal basis points
    **kwargs
        Additional arguments forwarded to ``get_range_bars()``

    Returns
    -------
    pd.DataFrame
        OHLCV DataFrame ready for backtesting.py

    Examples
    --------
    Instead of:

    >>> df = get_range_bars_pandas("BTCUSDT", "2024-01-01", "2024-06-30")

    Use:

    >>> df = get_range_bars("BTCUSDT", "2024-01-01", "2024-06-30", materialize=True)
    """
    import warnings

    deprecation_msg = (
        "get_range_bars_pandas() is deprecated. "
        "Use get_range_bars(materialize=True) instead. "
        "This function will be removed in v9.0."
    )
    # stacklevel=2 points the warning at the caller, not this shim.
    warnings.warn(deprecation_msg, DeprecationWarning, stacklevel=2)

    return get_range_bars(
        symbol,
        start_date,
        end_date,
        threshold_decimal_bps,
        materialize=True,
        **kwargs,
    )
|