rangebar-11.6.1-cp313-cp313-macosx_11_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rangebar/CLAUDE.md +327 -0
- rangebar/__init__.py +227 -0
- rangebar/__init__.pyi +1089 -0
- rangebar/_core.cpython-313-darwin.so +0 -0
- rangebar/checkpoint.py +472 -0
- rangebar/cli.py +298 -0
- rangebar/clickhouse/CLAUDE.md +139 -0
- rangebar/clickhouse/__init__.py +100 -0
- rangebar/clickhouse/bulk_operations.py +309 -0
- rangebar/clickhouse/cache.py +734 -0
- rangebar/clickhouse/client.py +121 -0
- rangebar/clickhouse/config.py +141 -0
- rangebar/clickhouse/mixin.py +120 -0
- rangebar/clickhouse/preflight.py +504 -0
- rangebar/clickhouse/query_operations.py +345 -0
- rangebar/clickhouse/schema.sql +187 -0
- rangebar/clickhouse/tunnel.py +222 -0
- rangebar/constants.py +288 -0
- rangebar/conversion.py +177 -0
- rangebar/exceptions.py +207 -0
- rangebar/exness.py +364 -0
- rangebar/hooks.py +311 -0
- rangebar/logging.py +171 -0
- rangebar/notify/__init__.py +15 -0
- rangebar/notify/pushover.py +155 -0
- rangebar/notify/telegram.py +271 -0
- rangebar/orchestration/__init__.py +20 -0
- rangebar/orchestration/count_bounded.py +797 -0
- rangebar/orchestration/helpers.py +412 -0
- rangebar/orchestration/models.py +76 -0
- rangebar/orchestration/precompute.py +498 -0
- rangebar/orchestration/range_bars.py +736 -0
- rangebar/orchestration/tick_fetcher.py +226 -0
- rangebar/ouroboros.py +454 -0
- rangebar/processors/__init__.py +22 -0
- rangebar/processors/api.py +383 -0
- rangebar/processors/core.py +522 -0
- rangebar/resource_guard.py +567 -0
- rangebar/storage/__init__.py +22 -0
- rangebar/storage/checksum_registry.py +218 -0
- rangebar/storage/parquet.py +728 -0
- rangebar/streaming.py +300 -0
- rangebar/validation/__init__.py +69 -0
- rangebar/validation/cache_staleness.py +277 -0
- rangebar/validation/continuity.py +664 -0
- rangebar/validation/gap_classification.py +294 -0
- rangebar/validation/post_storage.py +317 -0
- rangebar/validation/tier1.py +175 -0
- rangebar/validation/tier2.py +261 -0
- rangebar-11.6.1.dist-info/METADATA +308 -0
- rangebar-11.6.1.dist-info/RECORD +54 -0
- rangebar-11.6.1.dist-info/WHEEL +4 -0
- rangebar-11.6.1.dist-info/entry_points.txt +2 -0
- rangebar-11.6.1.dist-info/licenses/LICENSE +21 -0
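
The largest addition is rangebar/orchestration/count_bounded.py, whose public entry point get_n_range_bars is shown in full below. A minimal usage sketch, assembled from that function's own docstring examples (the symbol, bar count, and date are illustrative):

from rangebar import get_n_range_bars

# Exactly 5,000 bars ending on 2024-06-01, validating the
# Close[i] == Open[i+1] invariant before returning
df = get_n_range_bars(
    "BTCUSDT",
    n_bars=5000,
    end_date="2024-06-01",
    validate_on_return=True,
    continuity_action="warn",
)
assert len(df) <= 5000  # may return fewer if not enough data is available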
rangebar/orchestration/count_bounded.py
@@ -0,0 +1,797 @@
# polars-exception: backtesting.py requires Pandas DataFrames with DatetimeIndex
# Issue #46: Modularization M4 - Extract count-bounded orchestration from __init__.py
"""Count-bounded range bar retrieval (get_n_range_bars).

This module provides the count-bounded API for retrieving exactly N range bars,
useful for ML training and walk-forward optimization. Includes adaptive
gap-filling with exponential backoff for cache misses.
"""

from __future__ import annotations

from datetime import UTC
from pathlib import Path
from typing import TYPE_CHECKING

import pandas as pd

from rangebar.constants import (
    THRESHOLD_DECIMAL_MAX,
    THRESHOLD_DECIMAL_MIN,
    THRESHOLD_PRESETS,
)
from rangebar.conversion import _concat_pandas_via_polars
from rangebar.validation.cache_staleness import detect_staleness
from rangebar.validation.continuity import (
    ContinuityError,
    ContinuityWarning,
    validate_junction_continuity,
)

from .helpers import (
    _fetch_exness,
    _process_binance_trades,
    _process_exness_ticks,
)
from .tick_fetcher import estimate_ticks_per_bar, fetch_ticks_with_backoff

if TYPE_CHECKING:
    from datetime import datetime

    from rangebar.clickhouse import RangeBarCache
    from rangebar.storage.parquet import TickStorage

# Module-level logger (matches __init__.py pattern)
import logging

logger = logging.getLogger("rangebar")


def get_n_range_bars(
    symbol: str,
    n_bars: int,
    threshold_decimal_bps: int | str = 250,
    *,
    end_date: str | None = None,
    source: str = "binance",
    market: str = "spot",
    include_microstructure: bool = False,
    prevent_same_timestamp_close: bool = True,
    use_cache: bool = True,
    fetch_if_missing: bool = True,
    max_lookback_days: int = 90,
    warn_if_fewer: bool = True,
    validate_on_return: bool = False,
    continuity_action: str = "warn",
    chunk_size: int = 100_000,
    cache_dir: str | None = None,
) -> pd.DataFrame:
    """Get exactly N range bars ending at or before a given date.

    Unlike `get_range_bars()`, which uses date bounds (producing variable bar
    counts), this function returns a deterministic number of bars. This is
    useful for:
    - ML training (exactly 10,000 samples)
    - Walk-forward optimization (fixed window sizes)
    - Consistent backtest comparisons

    Parameters
    ----------
    symbol : str
        Trading symbol (e.g., "BTCUSDT")
    n_bars : int
        Number of bars to retrieve. Must be > 0.
    threshold_decimal_bps : int or str, default=250
        Threshold in decimal basis points. Can be:
        - Integer: Direct value (250 = 25bps = 0.25%)
        - String preset: "micro", "tight", "standard", "medium", "wide", "macro"
    end_date : str or None, default=None
        End date in YYYY-MM-DD format. If None, uses most recent available data.
    source : str, default="binance"
        Data source: "binance" or "exness"
    market : str, default="spot"
        Market type (Binance only): "spot", "futures-um", or "futures-cm"
    include_microstructure : bool, default=False
        Include microstructure columns (vwap, buy_volume, sell_volume)
    prevent_same_timestamp_close : bool, default=True
        Timestamp gating for flash crash prevention (Issue #36).
        If True (default): A bar cannot close on the same timestamp it opened.
        If False: Legacy v8 behavior for comparative analysis.
    use_cache : bool, default=True
        Use ClickHouse cache for bar retrieval/storage
    fetch_if_missing : bool, default=True
        Fetch and process new data if cache doesn't have enough bars
    max_lookback_days : int, default=90
        Safety limit: maximum days to look back when fetching missing data.
        Prevents runaway fetches on empty caches.
    warn_if_fewer : bool, default=True
        Emit UserWarning if returning fewer bars than requested.
    validate_on_return : bool, default=False
        If True, validate bar continuity before returning.
        Uses continuity_action to determine behavior on failure.
    continuity_action : str, default="warn"
        Action when a discontinuity is found during validation:
        - "warn": Log warning but return data
        - "raise": Raise ContinuityError
        - "log": Silent logging only
    chunk_size : int, default=100_000
        Number of ticks per processing chunk for memory efficiency.
        Larger values = faster processing, more memory.
        Default 100K = ~15MB memory overhead.
    cache_dir : str or None, default=None
        Custom cache directory for tick data (Tier 1).

    Returns
    -------
    pd.DataFrame
        OHLCV DataFrame with exactly n_bars rows (or fewer if not enough data),
        sorted chronologically (oldest first). Columns:
        - Open, High, Low, Close, Volume
        - (if include_microstructure) vwap, buy_volume, sell_volume

    Raises
    ------
    ValueError
        - n_bars <= 0
        - Invalid threshold
        - Invalid date format
    RuntimeError
        - ClickHouse not available when use_cache=True
        - Data fetching failed

    Examples
    --------
    Get last 10,000 bars for ML training:

    >>> from rangebar import get_n_range_bars
    >>> df = get_n_range_bars("BTCUSDT", n_bars=10000)
    >>> assert len(df) == 10000

    Get 5,000 bars ending at a specific date for walk-forward:

    >>> df = get_n_range_bars("BTCUSDT", n_bars=5000, end_date="2024-06-01")

    With safety limit (won't fetch more than 30 days of data):

    >>> df = get_n_range_bars("BTCUSDT", n_bars=1000, max_lookback_days=30)

    Notes
    -----
    Cache behavior:
    - Fast path: If cache has >= n_bars, returns immediately (~50ms)
    - Slow path: If cache has < n_bars and fetch_if_missing=True,
      fetches additional data, computes bars, stores in cache, returns

    Gap-filling algorithm:
    Uses adaptive exponential backoff to estimate how many ticks to fetch.
    Learns the compression ratio (ticks/bar) for each (symbol, threshold) pair.

    See Also
    --------
    get_range_bars : Date-bounded bar retrieval (variable bar count)
    THRESHOLD_PRESETS : Named threshold values
    """
    import warnings
    from datetime import datetime

    import numpy as np

    # -------------------------------------------------------------------------
    # Validation helper (closure over validate_on_return, continuity_action)
    # -------------------------------------------------------------------------
    def _apply_validation(df: pd.DataFrame) -> pd.DataFrame:
        """Apply continuity validation if enabled, then return DataFrame."""
        if not validate_on_return or df.empty or len(df) <= 1:
            return df

        # Check continuity: Close[i] should equal Open[i+1]
        close_prices = df["Close"].to_numpy()[:-1]
        open_prices = df["Open"].to_numpy()[1:]

        # Calculate relative differences
        with np.errstate(divide="ignore", invalid="ignore"):
            rel_diff = np.abs(open_prices - close_prices) / np.abs(close_prices)

        # 0.01% tolerance for floating-point errors
        tolerance = 0.0001
        discontinuities_mask = rel_diff > tolerance
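        # Worked example of the check above (hypothetical prices): a prev
        # Close of 100.00 followed by an Open of 100.02 gives
        # rel_diff = 0.02 / 100.00 = 2e-4 > tolerance, so the pair is
        # flagged; an Open of 100.005 gives 5e-5 and passes.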

        if not np.any(discontinuities_mask):
            return df

        # Found discontinuities
        discontinuity_count = int(np.sum(discontinuities_mask))
        msg = f"Found {discontinuity_count} discontinuities in {len(df)} bars"

        if continuity_action == "raise":
            # Build details for ContinuityError
            indices = np.where(discontinuities_mask)[0]
            details = []
            for idx in indices[:10]:  # Limit to first 10
                details.append(
                    {
                        "bar_index": int(idx),
                        "prev_close": float(close_prices[idx]),
                        "next_open": float(open_prices[idx]),
                        "gap_pct": float(rel_diff[idx] * 100),
                    }
                )
            raise ContinuityError(msg, details)
        if continuity_action == "warn":
            warnings.warn(msg, ContinuityWarning, stacklevel=3)
        else:  # "log"
            logging.getLogger("rangebar").warning(msg)

        return df

    # -------------------------------------------------------------------------
    # Validate parameters
    # -------------------------------------------------------------------------
    if n_bars <= 0:
        msg = f"n_bars must be > 0, got {n_bars}"
        raise ValueError(msg)

    # Resolve threshold (support presets)
    threshold: int
    if isinstance(threshold_decimal_bps, str):
        if threshold_decimal_bps not in THRESHOLD_PRESETS:
            msg = (
                f"Unknown threshold preset: {threshold_decimal_bps!r}. "
                f"Valid presets: {list(THRESHOLD_PRESETS.keys())}"
            )
            raise ValueError(msg)
        threshold = THRESHOLD_PRESETS[threshold_decimal_bps]
    else:
        threshold = threshold_decimal_bps

    if not THRESHOLD_DECIMAL_MIN <= threshold <= THRESHOLD_DECIMAL_MAX:
        msg = (
            f"threshold_decimal_bps must be between {THRESHOLD_DECIMAL_MIN} and "
            f"{THRESHOLD_DECIMAL_MAX}, got {threshold}"
        )
        raise ValueError(msg)
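    # Unit note (from the docstring above): decimal basis points are tenths
    # of a basis point, so threshold = 250 dbps is 25 bps = 0.25%.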

    # Normalize source and market
    source = source.lower()
    if source not in ("binance", "exness"):
        msg = f"Unknown source: {source!r}. Must be 'binance' or 'exness'"
        raise ValueError(msg)

    market_map = {
        "spot": "spot",
        "futures-um": "um",
        "futures-cm": "cm",
        "um": "um",
        "cm": "cm",
    }
    market = market.lower()
    if source == "binance" and market not in market_map:
        msg = (
            f"Unknown market: {market!r}. "
            "Must be 'spot', 'futures-um'/'um', or 'futures-cm'/'cm'"
        )
        raise ValueError(msg)
    market_normalized = market_map.get(market, market)

    # Parse end_date if provided
    end_ts: int | None = None
    if end_date is not None:
        try:
            end_dt = datetime.strptime(end_date, "%Y-%m-%d")
            # End of day in milliseconds
            end_ts = int((end_dt.timestamp() + 86399) * 1000)
        except ValueError as e:
            msg = f"Invalid date format. Use YYYY-MM-DD: {e}"
            raise ValueError(msg) from e
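    # The 86_399 above is 24 * 3600 - 1 seconds, i.e. end_ts points at
    # 23:59:59 of end_date, so bars closing anywhere on that day qualify.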

    # -------------------------------------------------------------------------
    # Try cache first (if enabled)
    # -------------------------------------------------------------------------
    if use_cache:
        try:
            from rangebar.clickhouse import RangeBarCache

            with RangeBarCache() as cache:
                # Fast path: check if cache has enough bars
                bars_df, available_count = cache.get_n_bars(
                    symbol=symbol,
                    threshold_decimal_bps=threshold,
                    n_bars=n_bars,
                    before_ts=end_ts,
                    include_microstructure=include_microstructure,
                )

                if bars_df is not None and len(bars_df) >= n_bars:
                    # Tier 0 validation: Content-based staleness detection (Issue #39)
                    if include_microstructure:
                        staleness = detect_staleness(
                            bars_df, require_microstructure=True
                        )
                        if staleness.is_stale:
                            logger.warning(
                                "Stale cache data detected for %s: %s. "
                                "Falling through to recompute.",
                                symbol,
                                staleness.reason,
                            )
                            # Fall through to fetch_if_missing path
                        else:
                            # Cache hit - return exactly n_bars
                            return _apply_validation(bars_df.tail(n_bars))
                    else:
                        # Cache hit - return exactly n_bars
                        return _apply_validation(bars_df.tail(n_bars))
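                # Everything above is the docstring's ~50ms fast path; from
                # here on we are on the slow path, which fetches ticks and
                # recomputes bars before returning.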

                # Slow path: need to fetch more data
                if fetch_if_missing:
                    bars_df = _fill_gap_and_cache(
                        symbol=symbol,
                        threshold=threshold,
                        n_bars=n_bars,
                        end_ts=end_ts,
                        source=source,
                        market=market_normalized,
                        include_microstructure=include_microstructure,
                        max_lookback_days=max_lookback_days,
                        cache=cache,
                        cache_dir=Path(cache_dir) if cache_dir else None,
                        current_bars=bars_df,
                        current_count=available_count,
                        chunk_size=chunk_size,
                        prevent_same_timestamp_close=prevent_same_timestamp_close,
                    )

                    if bars_df is not None and len(bars_df) >= n_bars:
                        return _apply_validation(bars_df.tail(n_bars))

                # Return what we have (or None)
                if bars_df is not None and len(bars_df) > 0:
                    if warn_if_fewer and len(bars_df) < n_bars:
                        warnings.warn(
                            f"Returning {len(bars_df)} bars instead of requested {n_bars}. "
                            f"Insufficient data available within max_lookback_days={max_lookback_days}.",
                            UserWarning,
                            stacklevel=2,
                        )
                    return _apply_validation(bars_df)

                # Empty result
                if warn_if_fewer:
                    warnings.warn(
                        f"Returning 0 bars instead of requested {n_bars}. "
                        "No data available in cache or from source.",
                        UserWarning,
                        stacklevel=2,
                    )
                return pd.DataFrame(
                    columns=["Open", "High", "Low", "Close", "Volume"]
                ).set_index(pd.DatetimeIndex([]))

        except Exception as e:
            # ClickHouse not available - fall through to compute-only mode
            if "ClickHouseNotConfigured" in type(e).__name__:
                pass  # Fall through to compute-only mode
            else:
                raise

    # -------------------------------------------------------------------------
    # Compute-only mode (no cache)
    # -------------------------------------------------------------------------
    if not fetch_if_missing:
        if warn_if_fewer:
            warnings.warn(
                f"Returning 0 bars instead of requested {n_bars}. "
                "Cache disabled and fetch_if_missing=False.",
                UserWarning,
                stacklevel=2,
            )
        return pd.DataFrame(
            columns=["Open", "High", "Low", "Close", "Volume"]
        ).set_index(pd.DatetimeIndex([]))

    # Fetch and compute without caching
    bars_df = _fetch_and_compute_bars(
        symbol=symbol,
        threshold=threshold,
        n_bars=n_bars,
        end_ts=end_ts,
        source=source,
        market=market_normalized,
        include_microstructure=include_microstructure,
        max_lookback_days=max_lookback_days,
        cache_dir=Path(cache_dir) if cache_dir else None,
    )

    if bars_df is not None and len(bars_df) >= n_bars:
        return _apply_validation(bars_df.tail(n_bars))

    if bars_df is not None and len(bars_df) > 0:
        if warn_if_fewer:
            warnings.warn(
                f"Returning {len(bars_df)} bars instead of requested {n_bars}. "
                f"Insufficient data available within max_lookback_days={max_lookback_days}.",
                UserWarning,
                stacklevel=2,
            )
        return _apply_validation(bars_df)

    if warn_if_fewer:
        warnings.warn(
            f"Returning 0 bars instead of requested {n_bars}. "
            "No data available from source.",
            UserWarning,
            stacklevel=2,
        )
    return pd.DataFrame(columns=["Open", "High", "Low", "Close", "Volume"]).set_index(
        pd.DatetimeIndex([])
    )


def _fill_gap_and_cache(
    symbol: str,
    threshold: int,
    n_bars: int,
    end_ts: int | None,
    source: str,
    market: str,
    include_microstructure: bool,
    max_lookback_days: int,
    cache: RangeBarCache,
    cache_dir: Path | None,
    current_bars: pd.DataFrame | None,
    current_count: int,
    chunk_size: int = 100_000,
    prevent_same_timestamp_close: bool = True,
) -> pd.DataFrame | None:
    """Fill gap in cache by fetching and processing additional data.

    Uses checkpoint-based cross-file continuity for Binance (24/7 crypto markets).
    The key insight: ALL ticks must be processed with a SINGLE processor to
    maintain the bar[i+1].open == bar[i].close invariant.

    For Binance (24/7):
    1. Collect ALL tick data first (no intermediate processing)
    2. Merge all ticks chronologically
    3. Process with SINGLE processor (guarantees continuity)
    4. Store with unified cache key

    For Exness (forex):
    Session-bounded processing is acceptable since weekend gaps are natural.

    Parameters
    ----------
    chunk_size : int, default=100_000
        Number of ticks per processing chunk for memory efficiency when using
        chunked processing with checkpoint continuation.
    """
    from datetime import datetime

    from rangebar.storage.parquet import TickStorage

    # Determine how many more bars we need
    bars_needed = n_bars - (len(current_bars) if current_bars is not None else 0)

    if bars_needed <= 0:
        return current_bars

    # Determine end date for fetching
    if end_ts is not None:
        end_dt = datetime.fromtimestamp(end_ts / 1000, tz=UTC)
    else:
        end_dt = datetime.now(tz=UTC)

    # Get oldest bar timestamp to know where to start fetching
    oldest_ts = cache.get_oldest_bar_timestamp(symbol, threshold)

    # Estimate ticks needed using extracted helper
    estimated_ticks_per_bar = estimate_ticks_per_bar(threshold)
    target_ticks = bars_needed * estimated_ticks_per_bar * 2  # 2x buffer
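    # Illustrative sizing (hypothetical numbers): if 1,000 more bars are
    # needed and the estimator returns 500 ticks/bar, target_ticks is
    # 1,000 * 500 * 2 = 1,000,000, the 2x buffer absorbing estimate error.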

    storage = TickStorage(cache_dir=cache_dir)

    # =========================================================================
    # BINANCE (24/7 CRYPTO): Single-pass processing with checkpoint continuity
    # =========================================================================
    if source == "binance":
        # Phase 1: Fetch ALL tick data using extracted tick fetcher
        fetch_result = fetch_ticks_with_backoff(
            symbol=symbol,
            source=source,
            market=market,
            target_ticks=target_ticks,
            end_dt=end_dt,
            oldest_ts=oldest_ts,
            max_lookback_days=max_lookback_days,
            storage=storage,
        )

        if fetch_result.ticks is None:
            return current_bars

        # Phase 2: Process with SINGLE processor (guarantees continuity)
        new_bars, _ = _process_binance_trades(
            fetch_result.ticks,
            threshold,
            False,
            include_microstructure,
            symbol=symbol,
            prevent_same_timestamp_close=prevent_same_timestamp_close,
        )

        # Phase 3: Store with unified cache key
        if not new_bars.empty:
            cache.store_bars_bulk(symbol, threshold, new_bars)

        # Combine with existing bars
        if current_bars is not None and len(current_bars) > 0:
            # Validate continuity at junction (new_bars older, current_bars newer)
            is_continuous, gap_pct = validate_junction_continuity(
                new_bars, current_bars
            )
            if not is_continuous:
                import warnings

                warnings.warn(
                    f"Discontinuity detected at junction: {symbol} @ {threshold} dbps. "
                    f"Gap: {gap_pct:.4%}. This occurs because range bars from different "
                    f"processing sessions cannot guarantee bar[n].close == bar[n+1].open. "
                    f"Consider invalidating cache and re-fetching all data for continuous "
                    f"bars. See: https://github.com/terrylica/rangebar-py/issues/5",
                    stacklevel=3,
                )

            # MEM-006: Use Polars for memory-efficient concatenation
            combined = _concat_pandas_via_polars([new_bars, current_bars])
            return combined[~combined.index.duplicated(keep="last")]

        return new_bars

    # =========================================================================
    # EXNESS (FOREX): Session-bounded processing (weekend gaps are natural)
    # =========================================================================
    return _fill_gap_exness(
        symbol=symbol,
        threshold=threshold,
        n_bars=n_bars,
        end_dt=end_dt,
        oldest_ts=oldest_ts,
        include_microstructure=include_microstructure,
        max_lookback_days=max_lookback_days,
        cache=cache,
        storage=storage,
        current_bars=current_bars,
        estimated_ticks_per_bar=estimated_ticks_per_bar,
    )


def _fill_gap_exness(
    symbol: str,
    threshold: int,
    n_bars: int,
    end_dt: datetime,
    oldest_ts: int | None,
    include_microstructure: bool,
    max_lookback_days: int,
    cache: RangeBarCache,
    storage: TickStorage,
    current_bars: pd.DataFrame | None,
    estimated_ticks_per_bar: int,
) -> pd.DataFrame | None:
    """Fill gap for Exness forex data with session-bounded processing.

    Forex markets have natural weekend gaps, so session-bounded processing
    is acceptable (unlike 24/7 crypto markets).
    """
    from datetime import datetime, timedelta

    multiplier = 2.0
    max_attempts = 5
    bars_needed = n_bars - (len(current_bars) if current_bars is not None else 0)

    all_bars: list[pd.DataFrame] = []
    if current_bars is not None and len(current_bars) > 0:
        all_bars.append(current_bars)

    cache_symbol = f"exness_spot_{symbol}".upper()

    for _attempt in range(max_attempts):
        # Estimate days to fetch
        ticks_to_fetch = int(bars_needed * estimated_ticks_per_bar * multiplier)
        days_to_fetch = max(1, ticks_to_fetch // 1_000_000)
        days_to_fetch = min(days_to_fetch, max_lookback_days)

        # Calculate fetch range
        if oldest_ts is not None:
            fetch_end_dt = datetime.fromtimestamp(oldest_ts / 1000, tz=UTC)
        else:
            fetch_end_dt = end_dt

        fetch_start_dt = fetch_end_dt - timedelta(days=days_to_fetch)

        if (end_dt - fetch_start_dt).days > max_lookback_days:
            break

        start_date = fetch_start_dt.strftime("%Y-%m-%d")
        end_date_str = fetch_end_dt.strftime("%Y-%m-%d")
        start_ts_fetch = int(fetch_start_dt.timestamp() * 1000)
        end_ts_fetch = int(fetch_end_dt.timestamp() * 1000)

        if storage.has_ticks(cache_symbol, start_ts_fetch, end_ts_fetch):
            tick_data = storage.read_ticks(cache_symbol, start_ts_fetch, end_ts_fetch)
        else:
            tick_data = _fetch_exness(symbol, start_date, end_date_str, "strict")
            if not tick_data.is_empty():
                storage.write_ticks(cache_symbol, tick_data)

        if tick_data.is_empty():
            break

        # Process to bars (forex: session-bounded is OK)
        new_bars = _process_exness_ticks(
            tick_data, symbol, threshold, "strict", False, include_microstructure
        )

        if not new_bars.empty:
            cache.store_bars_bulk(symbol, threshold, new_bars)
            all_bars.insert(0, new_bars)
            oldest_ts = int(new_bars.index.min().timestamp() * 1000)

        total_bars = sum(len(df) for df in all_bars)
        if total_bars >= n_bars:
            break

        multiplier *= 2
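        # Backoff progression: multiplier starts at 2.0 and doubles per
        # attempt (2, 4, 8, 16, 32 across max_attempts=5), widening the
        # fetch window until enough bars accumulate or the lookback cap hits.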

    if not all_bars:
        return None

    # MEM-006: Use Polars for memory-efficient concatenation
    combined = _concat_pandas_via_polars(all_bars)
    return combined[~combined.index.duplicated(keep="last")]


def _fetch_and_compute_bars(
    symbol: str,
    threshold: int,
    n_bars: int,
    end_ts: int | None,
    source: str,
    market: str,
    include_microstructure: bool,
    max_lookback_days: int,
    cache_dir: Path | None,
) -> pd.DataFrame | None:
    """Fetch and compute bars without caching (compute-only mode).

    Uses single-pass processing for Binance (24/7 crypto) to guarantee continuity.
    """
    from datetime import datetime

    from rangebar.storage.parquet import TickStorage

    # Determine end date
    if end_ts is not None:
        end_dt = datetime.fromtimestamp(end_ts / 1000, tz=UTC)
    else:
        end_dt = datetime.now(tz=UTC)

    # Estimate ticks needed using extracted helper
    estimated_ticks_per_bar = estimate_ticks_per_bar(threshold)
    target_ticks = n_bars * estimated_ticks_per_bar * 2

    storage = TickStorage(cache_dir=cache_dir)

    # =========================================================================
    # BINANCE (24/7 CRYPTO): Single-pass processing for continuity
    # =========================================================================
    if source == "binance":
        # Use extracted tick fetcher
        fetch_result = fetch_ticks_with_backoff(
            symbol=symbol,
            source=source,
            market=market,
            target_ticks=target_ticks,
            end_dt=end_dt,
            oldest_ts=None,
            max_lookback_days=max_lookback_days,
            storage=storage,
        )

        if fetch_result.ticks is None:
            return None

        bars_df, _ = _process_binance_trades(
            fetch_result.ticks, threshold, False, include_microstructure, symbol=symbol
        )
        return bars_df if not bars_df.empty else None

    # =========================================================================
    # EXNESS (FOREX): Session-bounded processing
    # =========================================================================
    return _compute_exness_bars(
        symbol=symbol,
        threshold=threshold,
        n_bars=n_bars,
        end_dt=end_dt,
        include_microstructure=include_microstructure,
        max_lookback_days=max_lookback_days,
        storage=storage,
        estimated_ticks_per_bar=estimated_ticks_per_bar,
    )


def _compute_exness_bars(
    symbol: str,
    threshold: int,
    n_bars: int,
    end_dt: datetime,
    include_microstructure: bool,
    max_lookback_days: int,
    storage: TickStorage,
    estimated_ticks_per_bar: int,
) -> pd.DataFrame | None:
    """Compute Exness forex bars without caching (compute-only mode).

    Forex markets have natural weekend gaps, so session-bounded processing
    is acceptable.
    """
    from datetime import datetime, timedelta

    multiplier = 2.0
    max_attempts = 5
    oldest_ts: int | None = None
    cache_symbol = f"exness_spot_{symbol}".upper()

    all_bars: list[pd.DataFrame] = []

    for _attempt in range(max_attempts):
        bars_still_needed = n_bars - sum(len(df) for df in all_bars)
        ticks_to_fetch = int(bars_still_needed * estimated_ticks_per_bar * multiplier)
        days_to_fetch = max(1, ticks_to_fetch // 1_000_000)
        days_to_fetch = min(days_to_fetch, max_lookback_days)

        if oldest_ts is not None:
            fetch_end_dt = datetime.fromtimestamp(oldest_ts / 1000, tz=UTC)
        else:
            fetch_end_dt = end_dt

        fetch_start_dt = fetch_end_dt - timedelta(days=days_to_fetch)

        if (end_dt - fetch_start_dt).days > max_lookback_days:
            break

        start_date = fetch_start_dt.strftime("%Y-%m-%d")
        end_date_str = fetch_end_dt.strftime("%Y-%m-%d")
        start_ts_fetch = int(fetch_start_dt.timestamp() * 1000)
        end_ts_fetch = int(fetch_end_dt.timestamp() * 1000)

        if storage.has_ticks(cache_symbol, start_ts_fetch, end_ts_fetch):
            tick_data = storage.read_ticks(cache_symbol, start_ts_fetch, end_ts_fetch)
        else:
            tick_data = _fetch_exness(symbol, start_date, end_date_str, "strict")
            if not tick_data.is_empty():
                storage.write_ticks(cache_symbol, tick_data)

        if tick_data.is_empty():
            break

        new_bars = _process_exness_ticks(
            tick_data, symbol, threshold, "strict", False, include_microstructure
        )

        if not new_bars.empty:
            all_bars.insert(0, new_bars)
            oldest_ts = int(new_bars.index.min().timestamp() * 1000)

        total_bars = sum(len(df) for df in all_bars)
        if total_bars >= n_bars:
            break

        multiplier *= 2

    if not all_bars:
        return None

    # MEM-006: Use Polars for memory-efficient concatenation
    combined = _concat_pandas_via_polars(all_bars)
    # Remove duplicates (by index) and return
    return combined[~combined.index.duplicated(keep="last")]