rangebar-11.6.1-cp313-cp313-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. rangebar/CLAUDE.md +327 -0
  2. rangebar/__init__.py +227 -0
  3. rangebar/__init__.pyi +1089 -0
  4. rangebar/_core.cpython-313-darwin.so +0 -0
  5. rangebar/checkpoint.py +472 -0
  6. rangebar/cli.py +298 -0
  7. rangebar/clickhouse/CLAUDE.md +139 -0
  8. rangebar/clickhouse/__init__.py +100 -0
  9. rangebar/clickhouse/bulk_operations.py +309 -0
  10. rangebar/clickhouse/cache.py +734 -0
  11. rangebar/clickhouse/client.py +121 -0
  12. rangebar/clickhouse/config.py +141 -0
  13. rangebar/clickhouse/mixin.py +120 -0
  14. rangebar/clickhouse/preflight.py +504 -0
  15. rangebar/clickhouse/query_operations.py +345 -0
  16. rangebar/clickhouse/schema.sql +187 -0
  17. rangebar/clickhouse/tunnel.py +222 -0
  18. rangebar/constants.py +288 -0
  19. rangebar/conversion.py +177 -0
  20. rangebar/exceptions.py +207 -0
  21. rangebar/exness.py +364 -0
  22. rangebar/hooks.py +311 -0
  23. rangebar/logging.py +171 -0
  24. rangebar/notify/__init__.py +15 -0
  25. rangebar/notify/pushover.py +155 -0
  26. rangebar/notify/telegram.py +271 -0
  27. rangebar/orchestration/__init__.py +20 -0
  28. rangebar/orchestration/count_bounded.py +797 -0
  29. rangebar/orchestration/helpers.py +412 -0
  30. rangebar/orchestration/models.py +76 -0
  31. rangebar/orchestration/precompute.py +498 -0
  32. rangebar/orchestration/range_bars.py +736 -0
  33. rangebar/orchestration/tick_fetcher.py +226 -0
  34. rangebar/ouroboros.py +454 -0
  35. rangebar/processors/__init__.py +22 -0
  36. rangebar/processors/api.py +383 -0
  37. rangebar/processors/core.py +522 -0
  38. rangebar/resource_guard.py +567 -0
  39. rangebar/storage/__init__.py +22 -0
  40. rangebar/storage/checksum_registry.py +218 -0
  41. rangebar/storage/parquet.py +728 -0
  42. rangebar/streaming.py +300 -0
  43. rangebar/validation/__init__.py +69 -0
  44. rangebar/validation/cache_staleness.py +277 -0
  45. rangebar/validation/continuity.py +664 -0
  46. rangebar/validation/gap_classification.py +294 -0
  47. rangebar/validation/post_storage.py +317 -0
  48. rangebar/validation/tier1.py +175 -0
  49. rangebar/validation/tier2.py +261 -0
  50. rangebar-11.6.1.dist-info/METADATA +308 -0
  51. rangebar-11.6.1.dist-info/RECORD +54 -0
  52. rangebar-11.6.1.dist-info/WHEEL +4 -0
  53. rangebar-11.6.1.dist-info/entry_points.txt +2 -0
  54. rangebar-11.6.1.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,797 @@
+ # polars-exception: backtesting.py requires Pandas DataFrames with DatetimeIndex
+ # Issue #46: Modularization M4 - Extract count-bounded orchestration from __init__.py
+ """Count-bounded range bar retrieval (get_n_range_bars).
+
+ This module provides the count-bounded API for retrieving exactly N range bars,
+ useful for ML training and walk-forward optimization. Includes adaptive
+ gap-filling with exponential backoff for cache misses.
+ """
+
+ from __future__ import annotations
+
+ from datetime import UTC
+ from pathlib import Path
+ from typing import TYPE_CHECKING
+
+ import pandas as pd
+
+ from rangebar.constants import (
+     THRESHOLD_DECIMAL_MAX,
+     THRESHOLD_DECIMAL_MIN,
+     THRESHOLD_PRESETS,
+ )
+ from rangebar.conversion import _concat_pandas_via_polars
+ from rangebar.validation.cache_staleness import detect_staleness
+ from rangebar.validation.continuity import (
+     ContinuityError,
+     ContinuityWarning,
+     validate_junction_continuity,
+ )
+
+ from .helpers import (
+     _fetch_exness,
+     _process_binance_trades,
+     _process_exness_ticks,
+ )
+ from .tick_fetcher import estimate_ticks_per_bar, fetch_ticks_with_backoff
+
+ if TYPE_CHECKING:
+     from datetime import datetime
+
+     from rangebar.clickhouse import RangeBarCache
+     from rangebar.storage.parquet import TickStorage
+
+ # Module-level logger (matches __init__.py pattern)
+ import logging
+
+ logger = logging.getLogger("rangebar")
+
+
+ def get_n_range_bars(
+     symbol: str,
+     n_bars: int,
+     threshold_decimal_bps: int | str = 250,
+     *,
+     end_date: str | None = None,
+     source: str = "binance",
+     market: str = "spot",
+     include_microstructure: bool = False,
+     prevent_same_timestamp_close: bool = True,
+     use_cache: bool = True,
+     fetch_if_missing: bool = True,
+     max_lookback_days: int = 90,
+     warn_if_fewer: bool = True,
+     validate_on_return: bool = False,
+     continuity_action: str = "warn",
+     chunk_size: int = 100_000,
+     cache_dir: str | None = None,
+ ) -> pd.DataFrame:
+     """Get exactly N range bars ending at or before a given date.
+
+     Unlike `get_range_bars()` which uses date bounds (producing variable bar counts),
+     this function returns a deterministic number of bars. This is useful for:
+     - ML training (exactly 10,000 samples)
+     - Walk-forward optimization (fixed window sizes)
+     - Consistent backtest comparisons
+
+     Parameters
+     ----------
+     symbol : str
+         Trading symbol (e.g., "BTCUSDT")
+     n_bars : int
+         Number of bars to retrieve. Must be > 0.
+     threshold_decimal_bps : int or str, default=250
+         Threshold in decimal basis points. Can be:
+         - Integer: Direct value (250 = 25bps = 0.25%)
+         - String preset: "micro", "tight", "standard", "medium", "wide", "macro"
+     end_date : str or None, default=None
+         End date in YYYY-MM-DD format. If None, uses most recent available data.
+     source : str, default="binance"
+         Data source: "binance" or "exness"
+     market : str, default="spot"
+         Market type (Binance only): "spot", "futures-um", or "futures-cm"
+     include_microstructure : bool, default=False
+         Include microstructure columns (vwap, buy_volume, sell_volume)
+     prevent_same_timestamp_close : bool, default=True
+         Timestamp gating for flash crash prevention (Issue #36).
+         If True (default): A bar cannot close on the same timestamp it opened.
+         If False: Legacy v8 behavior for comparative analysis.
+     use_cache : bool, default=True
+         Use ClickHouse cache for bar retrieval/storage
+     fetch_if_missing : bool, default=True
+         Fetch and process new data if cache doesn't have enough bars
+     max_lookback_days : int, default=90
+         Safety limit: maximum days to look back when fetching missing data.
+         Prevents runaway fetches on empty caches.
+     warn_if_fewer : bool, default=True
+         Emit UserWarning if returning fewer bars than requested.
+     validate_on_return : bool, default=False
+         If True, validate bar continuity before returning.
+         Uses continuity_action to determine behavior on failure.
+     continuity_action : str, default="warn"
+         Action when discontinuity found during validation:
+         - "warn": Log warning but return data
+         - "raise": Raise ContinuityError
+         - "log": Silent logging only
+     chunk_size : int, default=100_000
+         Number of ticks per processing chunk for memory efficiency.
+         Larger values = faster processing, more memory.
+         Default 100K = ~15MB memory overhead.
+     cache_dir : str or None, default=None
+         Custom cache directory for tick data (Tier 1).
+
+     Returns
+     -------
+     pd.DataFrame
+         OHLCV DataFrame with exactly n_bars rows (or fewer if not enough data),
+         sorted chronologically (oldest first). Columns:
+         - Open, High, Low, Close, Volume
+         - (if include_microstructure) vwap, buy_volume, sell_volume
+
+     Raises
+     ------
+     ValueError
+         - n_bars <= 0
+         - Invalid threshold
+         - Invalid date format
+     RuntimeError
+         - ClickHouse not available when use_cache=True
+         - Data fetching failed
+
+     Examples
+     --------
+     Get last 10,000 bars for ML training:
+
+     >>> from rangebar import get_n_range_bars
+     >>> df = get_n_range_bars("BTCUSDT", n_bars=10000)
+     >>> assert len(df) == 10000
+
+     Get 5,000 bars ending at specific date for walk-forward:
+
+     >>> df = get_n_range_bars("BTCUSDT", n_bars=5000, end_date="2024-06-01")
+
+     With safety limit (won't fetch more than 30 days of data):
+
+     >>> df = get_n_range_bars("BTCUSDT", n_bars=1000, max_lookback_days=30)
+
+     Notes
+     -----
+     Cache behavior:
+     - Fast path: If cache has >= n_bars, returns immediately (~50ms)
+     - Slow path: If cache has < n_bars and fetch_if_missing=True,
+       fetches additional data, computes bars, stores in cache, returns
+
+     Gap-filling algorithm:
+     Uses adaptive exponential backoff to estimate how many ticks to fetch.
+     Learns compression ratio (ticks/bar) for each (symbol, threshold) pair.
+
+     See Also
+     --------
+     get_range_bars : Date-bounded bar retrieval (variable bar count)
+     THRESHOLD_PRESETS : Named threshold values
+     """
+     import warnings
+     from datetime import datetime
+
+     import numpy as np
+
+     # -------------------------------------------------------------------------
+     # Validation helper (closure over validate_on_return, continuity_action)
+     # -------------------------------------------------------------------------
+     def _apply_validation(df: pd.DataFrame) -> pd.DataFrame:
+         """Apply continuity validation if enabled, then return DataFrame."""
+         if not validate_on_return or df.empty or len(df) <= 1:
+             return df
+
+         # Check continuity: Close[i] should equal Open[i+1]
+         close_prices = df["Close"].to_numpy()[:-1]
+         open_prices = df["Open"].to_numpy()[1:]
+
+         # Calculate relative differences
+         with np.errstate(divide="ignore", invalid="ignore"):
+             rel_diff = np.abs(open_prices - close_prices) / np.abs(close_prices)
+
+         # 0.01% tolerance for floating-point errors
+         tolerance = 0.0001
+         discontinuities_mask = rel_diff > tolerance
+
+         if not np.any(discontinuities_mask):
+             return df
+
+         # Found discontinuities
+         discontinuity_count = int(np.sum(discontinuities_mask))
+         msg = f"Found {discontinuity_count} discontinuities in {len(df)} bars"
+
+         if continuity_action == "raise":
+             # Build details for ContinuityError
+             indices = np.where(discontinuities_mask)[0]
+             details = []
+             for idx in indices[:10]:  # Limit to first 10
+                 details.append(
+                     {
+                         "bar_index": int(idx),
+                         "prev_close": float(close_prices[idx]),
+                         "next_open": float(open_prices[idx]),
+                         "gap_pct": float(rel_diff[idx] * 100),
+                     }
+                 )
+             raise ContinuityError(msg, details)
+         if continuity_action == "warn":
+             warnings.warn(msg, ContinuityWarning, stacklevel=3)
+         else:  # "log"
+             logger.warning(msg)
+
+         return df
+
+     # -------------------------------------------------------------------------
+     # Validate parameters
+     # -------------------------------------------------------------------------
+     if n_bars <= 0:
+         msg = f"n_bars must be > 0, got {n_bars}"
+         raise ValueError(msg)
+
+     # Resolve threshold (support presets)
+     threshold: int
+     if isinstance(threshold_decimal_bps, str):
+         if threshold_decimal_bps not in THRESHOLD_PRESETS:
+             msg = (
+                 f"Unknown threshold preset: {threshold_decimal_bps!r}. "
+                 f"Valid presets: {list(THRESHOLD_PRESETS.keys())}"
+             )
+             raise ValueError(msg)
+         threshold = THRESHOLD_PRESETS[threshold_decimal_bps]
+     else:
+         threshold = threshold_decimal_bps
+
+     if not THRESHOLD_DECIMAL_MIN <= threshold <= THRESHOLD_DECIMAL_MAX:
+         msg = (
+             f"threshold_decimal_bps must be between {THRESHOLD_DECIMAL_MIN} and "
+             f"{THRESHOLD_DECIMAL_MAX}, got {threshold}"
+         )
+         raise ValueError(msg)
+
+     # Normalize source and market
+     source = source.lower()
+     if source not in ("binance", "exness"):
+         msg = f"Unknown source: {source!r}. Must be 'binance' or 'exness'"
+         raise ValueError(msg)
+
+     market_map = {
+         "spot": "spot",
+         "futures-um": "um",
+         "futures-cm": "cm",
+         "um": "um",
+         "cm": "cm",
+     }
+     market = market.lower()
+     if source == "binance" and market not in market_map:
+         msg = (
+             f"Unknown market: {market!r}. "
+             "Must be 'spot', 'futures-um'/'um', or 'futures-cm'/'cm'"
+         )
+         raise ValueError(msg)
+     market_normalized = market_map.get(market, market)
+
+     # Parse end_date if provided
+     end_ts: int | None = None
+     if end_date is not None:
+         try:
+             # Interpret the date as UTC so the result doesn't depend on the
+             # host timezone (the rest of this module works in UTC)
+             end_dt = datetime.strptime(end_date, "%Y-%m-%d").replace(tzinfo=UTC)
+             # End of day in milliseconds
+             end_ts = int((end_dt.timestamp() + 86399) * 1000)
+         except ValueError as e:
+             msg = f"Invalid date format. Use YYYY-MM-DD: {e}"
+             raise ValueError(msg) from e
+
+     # -------------------------------------------------------------------------
+     # Try cache first (if enabled)
+     # -------------------------------------------------------------------------
+     if use_cache:
+         try:
+             from rangebar.clickhouse import RangeBarCache
+
+             with RangeBarCache() as cache:
+                 # Fast path: check if cache has enough bars
+                 bars_df, available_count = cache.get_n_bars(
+                     symbol=symbol,
+                     threshold_decimal_bps=threshold,
+                     n_bars=n_bars,
+                     before_ts=end_ts,
+                     include_microstructure=include_microstructure,
+                 )
+
+                 if bars_df is not None and len(bars_df) >= n_bars:
+                     # Tier 0 validation: Content-based staleness detection (Issue #39)
+                     if include_microstructure:
+                         staleness = detect_staleness(
+                             bars_df, require_microstructure=True
+                         )
+                         if staleness.is_stale:
+                             logger.warning(
+                                 "Stale cache data detected for %s: %s. "
+                                 "Falling through to recompute.",
+                                 symbol,
+                                 staleness.reason,
+                             )
+                             # Fall through to fetch_if_missing path
+                         else:
+                             # Cache hit - return exactly n_bars
+                             return _apply_validation(bars_df.tail(n_bars))
+                     else:
+                         # Cache hit - return exactly n_bars
+                         return _apply_validation(bars_df.tail(n_bars))
+
+                 # Slow path: need to fetch more data
+                 if fetch_if_missing:
+                     bars_df = _fill_gap_and_cache(
+                         symbol=symbol,
+                         threshold=threshold,
+                         n_bars=n_bars,
+                         end_ts=end_ts,
+                         source=source,
+                         market=market_normalized,
+                         include_microstructure=include_microstructure,
+                         max_lookback_days=max_lookback_days,
+                         cache=cache,
+                         cache_dir=Path(cache_dir) if cache_dir else None,
+                         current_bars=bars_df,
+                         current_count=available_count,
+                         chunk_size=chunk_size,
+                         prevent_same_timestamp_close=prevent_same_timestamp_close,
+                     )
+
+                     if bars_df is not None and len(bars_df) >= n_bars:
+                         return _apply_validation(bars_df.tail(n_bars))
+
+                 # Return what we have (or None)
+                 if bars_df is not None and len(bars_df) > 0:
+                     if warn_if_fewer and len(bars_df) < n_bars:
+                         warnings.warn(
+                             f"Returning {len(bars_df)} bars instead of requested {n_bars}. "
+                             f"Insufficient data available within max_lookback_days={max_lookback_days}.",
+                             UserWarning,
+                             stacklevel=2,
+                         )
+                     return _apply_validation(bars_df)
+
+                 # Empty result
+                 if warn_if_fewer:
+                     warnings.warn(
+                         f"Returning 0 bars instead of requested {n_bars}. "
+                         "No data available in cache or from source.",
+                         UserWarning,
+                         stacklevel=2,
+                     )
+                 return pd.DataFrame(
+                     columns=["Open", "High", "Low", "Close", "Volume"]
+                 ).set_index(pd.DatetimeIndex([]))
+
+         except Exception as e:
+             # ClickHouse not available - fall through to compute-only mode
+             if "ClickHouseNotConfigured" in type(e).__name__:
+                 pass  # Fall through to compute-only mode
+             else:
+                 raise
+
+     # -------------------------------------------------------------------------
+     # Compute-only mode (no cache)
+     # -------------------------------------------------------------------------
+     if not fetch_if_missing:
+         if warn_if_fewer:
+             warnings.warn(
+                 f"Returning 0 bars instead of requested {n_bars}. "
+                 "Cache disabled and fetch_if_missing=False.",
+                 UserWarning,
+                 stacklevel=2,
+             )
+         return pd.DataFrame(
+             columns=["Open", "High", "Low", "Close", "Volume"]
+         ).set_index(pd.DatetimeIndex([]))
+
+     # Fetch and compute without caching
+     bars_df = _fetch_and_compute_bars(
+         symbol=symbol,
+         threshold=threshold,
+         n_bars=n_bars,
+         end_ts=end_ts,
+         source=source,
+         market=market_normalized,
+         include_microstructure=include_microstructure,
+         max_lookback_days=max_lookback_days,
+         cache_dir=Path(cache_dir) if cache_dir else None,
+     )
+
+     if bars_df is not None and len(bars_df) >= n_bars:
+         return _apply_validation(bars_df.tail(n_bars))
+
+     if bars_df is not None and len(bars_df) > 0:
+         if warn_if_fewer:
+             warnings.warn(
+                 f"Returning {len(bars_df)} bars instead of requested {n_bars}. "
+                 f"Insufficient data available within max_lookback_days={max_lookback_days}.",
+                 UserWarning,
+                 stacklevel=2,
+             )
+         return _apply_validation(bars_df)
+
+     if warn_if_fewer:
+         warnings.warn(
+             f"Returning 0 bars instead of requested {n_bars}. "
+             "No data available from source.",
+             UserWarning,
+             stacklevel=2,
+         )
+     return pd.DataFrame(columns=["Open", "High", "Low", "Close", "Volume"]).set_index(
+         pd.DatetimeIndex([])
+     )
+
+
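A usage sketch for the validation knobs documented above. The call shape follows the docstring's own examples, the symbol and bar count are placeholder inputs, and ContinuityError is imported from the same path this module imports it from:

    from rangebar import get_n_range_bars
    from rangebar.validation.continuity import ContinuityError

    try:
        df = get_n_range_bars(
            "BTCUSDT",
            n_bars=5_000,
            validate_on_return=True,
            continuity_action="raise",  # raise instead of warn on Close[i] != Open[i+1]
        )
    except ContinuityError as exc:
        # Per _apply_validation above, the error carries up to 10 junction
        # details: bar_index, prev_close, next_open, gap_pct.
        print(f"Discontinuous history: {exc}")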
+ def _fill_gap_and_cache(
+     symbol: str,
+     threshold: int,
+     n_bars: int,
+     end_ts: int | None,
+     source: str,
+     market: str,
+     include_microstructure: bool,
+     max_lookback_days: int,
+     cache: RangeBarCache,
+     cache_dir: Path | None,
+     current_bars: pd.DataFrame | None,
+     current_count: int,
+     chunk_size: int = 100_000,
+     prevent_same_timestamp_close: bool = True,
+ ) -> pd.DataFrame | None:
+     """Fill gap in cache by fetching and processing additional data.
+
+     Uses checkpoint-based cross-file continuity for Binance (24/7 crypto markets).
+     The key insight: ALL ticks must be processed with a SINGLE processor to
+     maintain the bar[i+1].open == bar[i].close invariant.
+
+     For Binance (24/7):
+     1. Collect ALL tick data first (no intermediate processing)
+     2. Merge all ticks chronologically
+     3. Process with SINGLE processor (guarantees continuity)
+     4. Store with unified cache key
+
+     For Exness (forex):
+     Session-bounded processing is acceptable since weekend gaps are natural.
+
+     Parameters
+     ----------
+     chunk_size : int, default=100_000
+         Number of ticks per processing chunk for memory efficiency when using
+         chunked processing with checkpoint continuation.
+     """
+     from datetime import datetime
+
+     from rangebar.storage.parquet import TickStorage
+
+     # Determine how many more bars we need
+     bars_needed = n_bars - (len(current_bars) if current_bars is not None else 0)
+
+     if bars_needed <= 0:
+         return current_bars
+
+     # Determine end date for fetching
+     if end_ts is not None:
+         end_dt = datetime.fromtimestamp(end_ts / 1000, tz=UTC)
+     else:
+         end_dt = datetime.now(tz=UTC)
+
+     # Get oldest bar timestamp to know where to start fetching
+     oldest_ts = cache.get_oldest_bar_timestamp(symbol, threshold)
+
+     # Estimate ticks needed using extracted helper
+     estimated_ticks_per_bar = estimate_ticks_per_bar(threshold)
+     target_ticks = bars_needed * estimated_ticks_per_bar * 2  # 2x buffer
+
+     storage = TickStorage(cache_dir=cache_dir)
+
+     # =========================================================================
+     # BINANCE (24/7 CRYPTO): Single-pass processing with checkpoint continuity
+     # =========================================================================
+     if source == "binance":
+         # Phase 1: Fetch ALL tick data using extracted tick fetcher
+         fetch_result = fetch_ticks_with_backoff(
+             symbol=symbol,
+             source=source,
+             market=market,
+             target_ticks=target_ticks,
+             end_dt=end_dt,
+             oldest_ts=oldest_ts,
+             max_lookback_days=max_lookback_days,
+             storage=storage,
+         )
+
+         if fetch_result.ticks is None:
+             return current_bars
+
+         # Phase 2: Process with SINGLE processor (guarantees continuity)
+         new_bars, _ = _process_binance_trades(
+             fetch_result.ticks,
+             threshold,
+             False,
+             include_microstructure,
+             symbol=symbol,
+             prevent_same_timestamp_close=prevent_same_timestamp_close,
+         )
+
+         # Phase 3: Store with unified cache key
+         if not new_bars.empty:
+             cache.store_bars_bulk(symbol, threshold, new_bars)
+
+         # Combine with existing bars
+         if current_bars is not None and len(current_bars) > 0:
+             # Validate continuity at junction (new_bars older, current_bars newer)
+             is_continuous, gap_pct = validate_junction_continuity(
+                 new_bars, current_bars
+             )
+             if not is_continuous:
+                 import warnings
+
+                 warnings.warn(
+                     f"Discontinuity detected at junction: {symbol} @ {threshold} dbps. "
+                     f"Gap: {gap_pct:.4%}. This occurs because range bars from different "
+                     f"processing sessions cannot guarantee bar[n].close == bar[n+1].open. "
+                     f"Consider invalidating cache and re-fetching all data for continuous "
+                     f"bars. See: https://github.com/terrylica/rangebar-py/issues/5",
+                     stacklevel=3,
+                 )
+
+             # MEM-006: Use Polars for memory-efficient concatenation
+             combined = _concat_pandas_via_polars([new_bars, current_bars])
+             return combined[~combined.index.duplicated(keep="last")]
+
+         return new_bars
+
+     # =========================================================================
+     # EXNESS (FOREX): Session-bounded processing (weekend gaps are natural)
+     # =========================================================================
+     return _fill_gap_exness(
+         symbol=symbol,
+         threshold=threshold,
+         n_bars=n_bars,
+         end_dt=end_dt,
+         oldest_ts=oldest_ts,
+         include_microstructure=include_microstructure,
+         max_lookback_days=max_lookback_days,
+         cache=cache,
+         storage=storage,
+         current_bars=current_bars,
+         estimated_ticks_per_bar=estimated_ticks_per_bar,
+     )
+
+
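The junction check in `_fill_gap_and_cache` compares the last close of the older frame against the first open of the newer frame. A minimal self-contained sketch of that idea follows; the package's `validate_junction_continuity` may differ in signature and details, and the 0.01% cutoff here mirrors `_apply_validation` rather than a value confirmed for the junction check:

    import pandas as pd

    def junction_gap(older: pd.DataFrame, newer: pd.DataFrame) -> tuple[bool, float]:
        """Return (is_continuous, gap_pct) for two chronologically ordered frames."""
        prev_close = float(older["Close"].iloc[-1])  # final bar of the older frame
        next_open = float(newer["Open"].iloc[0])     # first bar of the newer frame
        gap_pct = abs(next_open - prev_close) / abs(prev_close)
        return gap_pct <= 0.0001, gap_pct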
+ def _fill_gap_exness(
+     symbol: str,
+     threshold: int,
+     n_bars: int,
+     end_dt: datetime,
+     oldest_ts: int | None,
+     include_microstructure: bool,
+     max_lookback_days: int,
+     cache: RangeBarCache,
+     storage: TickStorage,
+     current_bars: pd.DataFrame | None,
+     estimated_ticks_per_bar: int,
+ ) -> pd.DataFrame | None:
+     """Fill gap for Exness forex data with session-bounded processing.
+
+     Forex markets have natural weekend gaps, so session-bounded processing
+     is acceptable (unlike 24/7 crypto markets).
+     """
+     from datetime import datetime, timedelta
+
+     multiplier = 2.0
+     max_attempts = 5
+     bars_needed = n_bars - (len(current_bars) if current_bars is not None else 0)
+
+     all_bars: list[pd.DataFrame] = []
+     if current_bars is not None and len(current_bars) > 0:
+         all_bars.append(current_bars)
+
+     cache_symbol = f"exness_spot_{symbol}".upper()
+
+     for _attempt in range(max_attempts):
+         # Estimate days to fetch (heuristic: roughly 1M ticks per day)
+         ticks_to_fetch = int(bars_needed * estimated_ticks_per_bar * multiplier)
+         days_to_fetch = max(1, ticks_to_fetch // 1_000_000)
+         days_to_fetch = min(days_to_fetch, max_lookback_days)
+
+         # Calculate fetch range
+         if oldest_ts is not None:
+             fetch_end_dt = datetime.fromtimestamp(oldest_ts / 1000, tz=UTC)
+         else:
+             fetch_end_dt = end_dt
+
+         fetch_start_dt = fetch_end_dt - timedelta(days=days_to_fetch)
+
+         if (end_dt - fetch_start_dt).days > max_lookback_days:
+             break
+
+         start_date = fetch_start_dt.strftime("%Y-%m-%d")
+         end_date_str = fetch_end_dt.strftime("%Y-%m-%d")
+         start_ts_fetch = int(fetch_start_dt.timestamp() * 1000)
+         end_ts_fetch = int(fetch_end_dt.timestamp() * 1000)
+
+         if storage.has_ticks(cache_symbol, start_ts_fetch, end_ts_fetch):
+             tick_data = storage.read_ticks(cache_symbol, start_ts_fetch, end_ts_fetch)
+         else:
+             tick_data = _fetch_exness(symbol, start_date, end_date_str, "strict")
+             if not tick_data.is_empty():
+                 storage.write_ticks(cache_symbol, tick_data)
+
+         if tick_data.is_empty():
+             break
+
+         # Process to bars (forex: session-bounded is OK)
+         new_bars = _process_exness_ticks(
+             tick_data, symbol, threshold, "strict", False, include_microstructure
+         )
+
+         if not new_bars.empty:
+             cache.store_bars_bulk(symbol, threshold, new_bars)
+             all_bars.insert(0, new_bars)
+             oldest_ts = int(new_bars.index.min().timestamp() * 1000)
+
+         total_bars = sum(len(df) for df in all_bars)
+         if total_bars >= n_bars:
+             break
+
+         multiplier *= 2
+
+     if not all_bars:
+         return None
+
+     # MEM-006: Use Polars for memory-efficient concatenation
+     combined = _concat_pandas_via_polars(all_bars)
+     return combined[~combined.index.duplicated(keep="last")]
+
+
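To make the window-sizing arithmetic in the loop above concrete, here is a worked example; the per-bar tick count is an illustrative assumption (the real value comes from estimate_ticks_per_bar), while the ~1M-ticks-per-day divisor and the doubling multiplier come straight from the code:

    bars_needed = 5_000
    estimated_ticks_per_bar = 200        # assumed compression ratio
    multiplier = 2.0                     # initial buffer; doubles on each retry

    ticks_to_fetch = int(bars_needed * estimated_ticks_per_bar * multiplier)  # 2_000_000
    days_to_fetch = max(1, ticks_to_fetch // 1_000_000)  # ~1M ticks/day -> 2 days
    # After each miss, multiplier *= 2 widens the window (2 -> 4 -> 8 days, ...),
    # capped by max_lookback_days and at most max_attempts (5) rounds.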
+ def _fetch_and_compute_bars(
+     symbol: str,
+     threshold: int,
+     n_bars: int,
+     end_ts: int | None,
+     source: str,
+     market: str,
+     include_microstructure: bool,
+     max_lookback_days: int,
+     cache_dir: Path | None,
+ ) -> pd.DataFrame | None:
+     """Fetch and compute bars without caching (compute-only mode).
+
+     Uses single-pass processing for Binance (24/7 crypto) to guarantee continuity.
+     """
+     from datetime import datetime
+
+     from rangebar.storage.parquet import TickStorage
+
+     # Determine end date
+     if end_ts is not None:
+         end_dt = datetime.fromtimestamp(end_ts / 1000, tz=UTC)
+     else:
+         end_dt = datetime.now(tz=UTC)
+
+     # Estimate ticks needed using extracted helper
+     estimated_ticks_per_bar = estimate_ticks_per_bar(threshold)
+     target_ticks = n_bars * estimated_ticks_per_bar * 2
+
+     storage = TickStorage(cache_dir=cache_dir)
+
+     # =========================================================================
+     # BINANCE (24/7 CRYPTO): Single-pass processing for continuity
+     # =========================================================================
+     if source == "binance":
+         # Use extracted tick fetcher
+         fetch_result = fetch_ticks_with_backoff(
+             symbol=symbol,
+             source=source,
+             market=market,
+             target_ticks=target_ticks,
+             end_dt=end_dt,
+             oldest_ts=None,
+             max_lookback_days=max_lookback_days,
+             storage=storage,
+         )
+
+         if fetch_result.ticks is None:
+             return None
+
+         bars_df, _ = _process_binance_trades(
+             fetch_result.ticks, threshold, False, include_microstructure, symbol=symbol
+         )
+         return bars_df if not bars_df.empty else None
+
+     # =========================================================================
+     # EXNESS (FOREX): Session-bounded processing
+     # =========================================================================
+     return _compute_exness_bars(
+         symbol=symbol,
+         threshold=threshold,
+         n_bars=n_bars,
+         end_dt=end_dt,
+         include_microstructure=include_microstructure,
+         max_lookback_days=max_lookback_days,
+         storage=storage,
+         estimated_ticks_per_bar=estimated_ticks_per_bar,
+     )
+
+
+ def _compute_exness_bars(
+     symbol: str,
+     threshold: int,
+     n_bars: int,
+     end_dt: datetime,
+     include_microstructure: bool,
+     max_lookback_days: int,
+     storage: TickStorage,
+     estimated_ticks_per_bar: int,
+ ) -> pd.DataFrame | None:
+     """Compute Exness forex bars without caching (compute-only mode).
+
+     Forex markets have natural weekend gaps, so session-bounded processing
+     is acceptable.
+     """
+     from datetime import datetime, timedelta
+
+     multiplier = 2.0
+     max_attempts = 5
+     oldest_ts: int | None = None
+     cache_symbol = f"exness_spot_{symbol}".upper()
+
+     all_bars: list[pd.DataFrame] = []
+
+     for _attempt in range(max_attempts):
+         bars_still_needed = n_bars - sum(len(df) for df in all_bars)
+         ticks_to_fetch = int(bars_still_needed * estimated_ticks_per_bar * multiplier)
+         days_to_fetch = max(1, ticks_to_fetch // 1_000_000)  # heuristic: ~1M ticks/day
+         days_to_fetch = min(days_to_fetch, max_lookback_days)
+
+         if oldest_ts is not None:
+             fetch_end_dt = datetime.fromtimestamp(oldest_ts / 1000, tz=UTC)
+         else:
+             fetch_end_dt = end_dt
+
+         fetch_start_dt = fetch_end_dt - timedelta(days=days_to_fetch)
+
+         if (end_dt - fetch_start_dt).days > max_lookback_days:
+             break
+
+         start_date = fetch_start_dt.strftime("%Y-%m-%d")
+         end_date_str = fetch_end_dt.strftime("%Y-%m-%d")
+         start_ts_fetch = int(fetch_start_dt.timestamp() * 1000)
+         end_ts_fetch = int(fetch_end_dt.timestamp() * 1000)
+
+         if storage.has_ticks(cache_symbol, start_ts_fetch, end_ts_fetch):
+             tick_data = storage.read_ticks(cache_symbol, start_ts_fetch, end_ts_fetch)
+         else:
+             tick_data = _fetch_exness(symbol, start_date, end_date_str, "strict")
+             if not tick_data.is_empty():
+                 storage.write_ticks(cache_symbol, tick_data)
+
+         if tick_data.is_empty():
+             break
+
+         new_bars = _process_exness_ticks(
+             tick_data, symbol, threshold, "strict", False, include_microstructure
+         )
+
+         if not new_bars.empty:
+             all_bars.insert(0, new_bars)
+             oldest_ts = int(new_bars.index.min().timestamp() * 1000)
+
+         total_bars = sum(len(df) for df in all_bars)
+         if total_bars >= n_bars:
+             break
+
+         multiplier *= 2
+
+     if not all_bars:
+         return None
+
+     # MEM-006: Use Polars for memory-efficient concatenation
+     combined = _concat_pandas_via_polars(all_bars)
+     # Remove duplicates (by index) and return
+     return combined[~combined.index.duplicated(keep="last")]
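Finally, a sketch of the compute-only path these last two helpers serve: disabling the ClickHouse cache forces get_n_range_bars to fetch and process directly. The symbol is a placeholder, and its availability from the Exness source is an assumption:

    from rangebar import get_n_range_bars

    # No ClickHouse round-trip: ticks are fetched, bars computed, and returned directly.
    df = get_n_range_bars(
        "EURUSD",
        n_bars=2_000,
        source="exness",
        use_cache=False,          # skip the cache entirely
        max_lookback_days=30,     # bound the backoff search window
    )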