rangebar-11.6.1-cp313-cp313-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. rangebar/CLAUDE.md +327 -0
  2. rangebar/__init__.py +227 -0
  3. rangebar/__init__.pyi +1089 -0
  4. rangebar/_core.cpython-313-darwin.so +0 -0
  5. rangebar/checkpoint.py +472 -0
  6. rangebar/cli.py +298 -0
  7. rangebar/clickhouse/CLAUDE.md +139 -0
  8. rangebar/clickhouse/__init__.py +100 -0
  9. rangebar/clickhouse/bulk_operations.py +309 -0
  10. rangebar/clickhouse/cache.py +734 -0
  11. rangebar/clickhouse/client.py +121 -0
  12. rangebar/clickhouse/config.py +141 -0
  13. rangebar/clickhouse/mixin.py +120 -0
  14. rangebar/clickhouse/preflight.py +504 -0
  15. rangebar/clickhouse/query_operations.py +345 -0
  16. rangebar/clickhouse/schema.sql +187 -0
  17. rangebar/clickhouse/tunnel.py +222 -0
  18. rangebar/constants.py +288 -0
  19. rangebar/conversion.py +177 -0
  20. rangebar/exceptions.py +207 -0
  21. rangebar/exness.py +364 -0
  22. rangebar/hooks.py +311 -0
  23. rangebar/logging.py +171 -0
  24. rangebar/notify/__init__.py +15 -0
  25. rangebar/notify/pushover.py +155 -0
  26. rangebar/notify/telegram.py +271 -0
  27. rangebar/orchestration/__init__.py +20 -0
  28. rangebar/orchestration/count_bounded.py +797 -0
  29. rangebar/orchestration/helpers.py +412 -0
  30. rangebar/orchestration/models.py +76 -0
  31. rangebar/orchestration/precompute.py +498 -0
  32. rangebar/orchestration/range_bars.py +736 -0
  33. rangebar/orchestration/tick_fetcher.py +226 -0
  34. rangebar/ouroboros.py +454 -0
  35. rangebar/processors/__init__.py +22 -0
  36. rangebar/processors/api.py +383 -0
  37. rangebar/processors/core.py +522 -0
  38. rangebar/resource_guard.py +567 -0
  39. rangebar/storage/__init__.py +22 -0
  40. rangebar/storage/checksum_registry.py +218 -0
  41. rangebar/storage/parquet.py +728 -0
  42. rangebar/streaming.py +300 -0
  43. rangebar/validation/__init__.py +69 -0
  44. rangebar/validation/cache_staleness.py +277 -0
  45. rangebar/validation/continuity.py +664 -0
  46. rangebar/validation/gap_classification.py +294 -0
  47. rangebar/validation/post_storage.py +317 -0
  48. rangebar/validation/tier1.py +175 -0
  49. rangebar/validation/tier2.py +261 -0
  50. rangebar-11.6.1.dist-info/METADATA +308 -0
  51. rangebar-11.6.1.dist-info/RECORD +54 -0
  52. rangebar-11.6.1.dist-info/WHEEL +4 -0
  53. rangebar-11.6.1.dist-info/entry_points.txt +2 -0
  54. rangebar-11.6.1.dist-info/licenses/LICENSE +21 -0
rangebar/orchestration/precompute.py
@@ -0,0 +1,498 @@
+ # polars-exception: backtesting.py requires Pandas DataFrames with DatetimeIndex
+ # Issue #46: Modularization M4 - Extract precompute_range_bars from __init__.py
+ """Batch precomputation pipeline for range bars.
+
+ Provides precompute_range_bars() for ML workflows requiring continuous
+ bar sequences with cache invalidation and continuity validation.
+ """
+
+ from __future__ import annotations
+
+ from collections.abc import Callable
+
+ import pandas as pd
+
+ from rangebar.constants import (
+     THRESHOLD_DECIMAL_MAX,
+     THRESHOLD_DECIMAL_MIN,
+ )
+ from rangebar.conversion import _concat_pandas_via_polars
+ from rangebar.processors.core import RangeBarProcessor
+ from rangebar.validation.continuity import ContinuityError, validate_continuity
+
+ from .helpers import _fetch_binance, _fetch_exness
+ from .models import PrecomputeProgress, PrecomputeResult
+
+
+ def precompute_range_bars(
+     symbol: str,
+     start_date: str,
+     end_date: str,
+     threshold_decimal_bps: int = 250,
+     *,
+     source: str = "binance",
+     market: str = "spot",
+     chunk_size: int = 100_000,
+     invalidate_existing: str = "smart",
+     progress_callback: Callable[[PrecomputeProgress], None] | None = None,
+     include_microstructure: bool = False,
+     validate_on_complete: str = "error",
+     continuity_tolerance_pct: float = 0.001,
+     cache_dir: str | None = None,
+     max_memory_gb: float | None = None,
+ ) -> PrecomputeResult:
+     """Precompute continuous range bars for a date range (single-pass, guaranteed continuity).
+
+     Designed for ML workflows requiring continuous bar sequences for training/validation.
+     Uses Checkpoint API for memory-efficient chunked processing with state preservation.
+
+     Parameters
+     ----------
+     symbol : str
+         Trading symbol (e.g., "BTCUSDT")
+     start_date : str
+         Start date (inclusive) "YYYY-MM-DD"
+     end_date : str
+         End date (inclusive) "YYYY-MM-DD"
+     threshold_decimal_bps : int, default=250
+         Range bar threshold (250 = 0.25%)
+     source : str, default="binance"
+         Data source ("binance" or "exness")
+     market : str, default="spot"
+         Market type for Binance ("spot", "futures-um", "futures-cm")
+     chunk_size : int, default=100_000
+         Ticks per processing chunk (~15MB memory per 100K ticks)
+     invalidate_existing : str, default="smart"
+         Cache invalidation strategy:
+         - "overlap": Invalidate only bars in date range
+         - "full": Invalidate ALL bars for symbol/threshold
+         - "none": Skip if any cached bars exist in range
+         - "smart": Invalidate overlapping + validate junction continuity
+     progress_callback : Callable, optional
+         Optional callback for progress updates
+     include_microstructure : bool, default=False
+         Include order flow metrics (buy_volume, sell_volume, vwap)
+     validate_on_complete : str, default="error"
+         Continuity validation mode after precomputation:
+         - "error": Raise ContinuityError if discontinuities found
+         - "warn": Log warning but continue (sets continuity_valid=False)
+         - "skip": Skip validation entirely (continuity_valid=None)
+     continuity_tolerance_pct : float, default=0.001
+         Maximum allowed price gap percentage for continuity validation.
+         Default 0.1% (0.001) accommodates market microstructure events.
+         The total allowed gap is threshold_pct + continuity_tolerance_pct.
+     cache_dir : str or None, default=None
+         Custom cache directory for tick data
+     max_memory_gb : float or None, default=None
+         Process-level memory cap in GB. Sets RLIMIT_AS so that
+         exceeding the limit raises MemoryError instead of OOM kill.
+         None disables the cap.
+
+     Returns
+     -------
+     PrecomputeResult
+         Result with statistics and cache key
+
+     Raises
+     ------
+     ValueError
+         Invalid parameters
+     RuntimeError
+         Fetch or processing failure
+     ContinuityError
+         If validate_on_complete="error" and discontinuities found
+
+     Examples
+     --------
+     Basic precomputation:
+
+     >>> result = precompute_range_bars("BTCUSDT", "2024-01-01", "2024-06-30")
+     >>> print(f"Generated {result.total_bars} bars")
+
+     With progress callback:
+
+     >>> def on_progress(p):
+     ...     print(f"[{p.current_month}] {p.bars_generated} bars")
+     >>> result = precompute_range_bars(
+     ...     "BTCUSDT", "2024-01-01", "2024-03-31",
+     ...     progress_callback=on_progress
+     ... )
+     """
+     import gc
+     import time
+     from datetime import datetime
+     from pathlib import Path
+
+     from rangebar.clickhouse import CacheKey, RangeBarCache
+     from rangebar.storage.parquet import TickStorage
+
+     # MEM-009: Set process-level memory cap if requested (Issue #49)
+     if max_memory_gb is not None:
+         from rangebar.resource_guard import set_memory_limit
+
+         set_memory_limit(max_gb=max_memory_gb)
+
+     start_time = time.time()
+
+     # Validate parameters
+     if invalidate_existing not in ("overlap", "full", "none", "smart"):
+         msg = f"Invalid invalidate_existing: {invalidate_existing!r}. Must be 'overlap', 'full', 'none', or 'smart'"
+         raise ValueError(msg)
+
+     if validate_on_complete not in ("error", "warn", "skip"):
+         msg = f"Invalid validate_on_complete: {validate_on_complete!r}. Must be 'error', 'warn', or 'skip'"
+         raise ValueError(msg)
+
+     if not THRESHOLD_DECIMAL_MIN <= threshold_decimal_bps <= THRESHOLD_DECIMAL_MAX:
+         msg = f"threshold_decimal_bps must be between {THRESHOLD_DECIMAL_MIN} and {THRESHOLD_DECIMAL_MAX}"
+         raise ValueError(msg)
+
+     # Parse dates
+     try:
+         start_dt = datetime.strptime(start_date, "%Y-%m-%d")
+         end_dt = datetime.strptime(end_date, "%Y-%m-%d")
+     except ValueError as e:
+         msg = f"Invalid date format. Use YYYY-MM-DD: {e}"
+         raise ValueError(msg) from e
+
+     if start_dt > end_dt:
+         msg = "start_date must be <= end_date"
+         raise ValueError(msg)
+
+     # Normalize market type
+     market_map = {
+         "spot": "spot",
+         "futures-um": "um",
+         "futures-cm": "cm",
+         "um": "um",
+         "cm": "cm",
+     }
+     market_normalized = market_map.get(market.lower(), market.lower())
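+     # e.g. "futures-um" -> "um", "spot" -> "spot"; unrecognized values are
+     # passed through lowercased unchanged.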
+
+     # Initialize storage and cache
+     storage = TickStorage(cache_dir=Path(cache_dir) if cache_dir else None)
+     cache = RangeBarCache()
+
+     # Generate list of months to process
+     months: list[tuple[int, int]] = []
+     current = start_dt.replace(day=1)
+     _december = 12
+     while current <= end_dt:
+         months.append((current.year, current.month))
+         # Move to next month
+         if current.month == _december:
+             current = current.replace(year=current.year + 1, month=1)
+         else:
+             current = current.replace(month=current.month + 1)
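+     # Example: start_date="2024-01-15", end_date="2024-03-10" yields
+     # months == [(2024, 1), (2024, 2), (2024, 3)]; partial months are clipped
+     # to the requested range later via actual_start / actual_end.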
+
+     # Handle cache invalidation
+     start_ts = int(start_dt.timestamp() * 1000)
+     end_ts = int((end_dt.timestamp() + 86399) * 1000)  # End of day
+     cache_key = CacheKey(
+         symbol=symbol,
+         threshold_decimal_bps=threshold_decimal_bps,
+         start_ts=start_ts,
+         end_ts=end_ts,
+     )
+
+     if invalidate_existing == "full":
+         cache.invalidate_range_bars(cache_key)
+     elif invalidate_existing in ("overlap", "smart"):
+         # Check for overlapping bars - will be handled after processing
+         # For now, just invalidate the date range to ensure clean slate
+         cache.invalidate_range_bars(cache_key)
+     elif invalidate_existing == "none":
+         # Check if any bars exist in range by counting
+         bar_count = cache.count_bars(symbol, threshold_decimal_bps)
+         if bar_count > 0:
+             # Return early - some cached data exists
+             # Note: This is approximate; full implementation would check time range
+             return PrecomputeResult(
+                 symbol=symbol,
+                 threshold_decimal_bps=threshold_decimal_bps,
+                 start_date=start_date,
+                 end_date=end_date,
+                 total_bars=bar_count,
+                 total_ticks=0,
+                 elapsed_seconds=time.time() - start_time,
+                 continuity_valid=True,  # Assume valid for cached data
+                 cache_key=f"{symbol}_{threshold_decimal_bps}",
+             )
+
+     # Initialize processor (single instance for continuity)
+     processor = RangeBarProcessor(threshold_decimal_bps, symbol=symbol)
+
+     all_bars: list[pd.DataFrame] = []
+     month_bars: list[pd.DataFrame] = (
+         []
+     )  # Issue #27: Track bars per month for incremental caching
+     total_ticks = 0
+     cache_symbol = f"{source}_{market_normalized}_{symbol}".upper()
+
+     for i, (year, month) in enumerate(months):
+         month_str = f"{year}-{month:02d}"
+
+         # Report progress - fetching
+         if progress_callback:
+             progress_callback(
+                 PrecomputeProgress(
+                     phase="fetching",
+                     current_month=month_str,
+                     months_completed=i,
+                     months_total=len(months),
+                     bars_generated=sum(len(b) for b in all_bars),
+                     ticks_processed=total_ticks,
+                     elapsed_seconds=time.time() - start_time,
+                 )
+             )
+
+         # Calculate month boundaries
+         month_start = datetime(year, month, 1)
+         _december = 12
+         if month == _december:
+             month_end = datetime(year + 1, 1, 1)
+         else:
+             month_end = datetime(year, month + 1, 1)
+
+         # Adjust to fit within requested date range
+         actual_start = max(month_start, start_dt)
+         actual_end = min(month_end, end_dt + pd.Timedelta(days=1))
+
+         # Fetch tick data for this period
+         start_ts_month = int(actual_start.timestamp() * 1000)
+         end_ts_month = int(actual_end.timestamp() * 1000)
+
+         # Check if data is already cached
+         data_cached = storage.has_ticks(cache_symbol, start_ts_month, end_ts_month)
+
+         if data_cached:
+             # STREAMING READ: Use row-group based streaming to avoid OOM (Issue #12)
+             # The Rust processor maintains state between process_trades() calls,
+             # so we stream chunks directly without loading entire month into memory
+             month_has_data = False
+             for raw_tick_chunk in storage.read_ticks_streaming(
+                 cache_symbol, start_ts_month, end_ts_month, chunk_size=chunk_size
+             ):
+                 month_has_data = True
+
+                 # Deduplicate trades by trade_id within chunk
+                 tick_chunk = raw_tick_chunk
+                 if "agg_trade_id" in tick_chunk.columns:
+                     tick_chunk = tick_chunk.unique(
+                         subset=["agg_trade_id"], maintain_order=True
+                     )
+                 elif "trade_id" in tick_chunk.columns:
+                     tick_chunk = tick_chunk.unique(
+                         subset=["trade_id"], maintain_order=True
+                     )
+
+                 # Sort by (timestamp, trade_id) - Rust crate requires this order
+                 if "timestamp" in tick_chunk.columns:
+                     if "agg_trade_id" in tick_chunk.columns:
+                         tick_chunk = tick_chunk.sort(["timestamp", "agg_trade_id"])
+                     elif "trade_id" in tick_chunk.columns:
+                         tick_chunk = tick_chunk.sort(["timestamp", "trade_id"])
+                     else:
+                         tick_chunk = tick_chunk.sort("timestamp")
+
+                 total_ticks += len(tick_chunk)
+
+                 # Report progress - processing
+                 if progress_callback:
+                     progress_callback(
+                         PrecomputeProgress(
+                             phase="processing",
+                             current_month=month_str,
+                             months_completed=i,
+                             months_total=len(months),
+                             bars_generated=sum(len(b) for b in all_bars),
+                             ticks_processed=total_ticks,
+                             elapsed_seconds=time.time() - start_time,
+                         )
+                     )
+
+                 # Stream directly to Rust processor (Issue #16: use streaming mode)
+                 chunk = tick_chunk.to_dicts()
+                 bars = processor.process_trades_streaming(chunk)
+                 if bars:
+                     # Issue #30: Always include microstructure for ClickHouse cache
+                     bars_df = processor.to_dataframe(bars, include_microstructure=True)
+                     month_bars.append(bars_df)  # Issue #27: Track per-month bars
+
+                 del chunk, tick_chunk
+
+             gc.collect()
+
+             if not month_has_data:
+                 continue
+         else:
+             # DATA NOT CACHED: Fetch from source day-by-day to prevent OOM
+             # Issue #14: Fetching entire month at once causes OOM for high-volume
+             # months like March 2024. Fetch day-by-day instead.
+             current_day = actual_start
+             while current_day < actual_end:
+                 next_day = current_day + pd.Timedelta(days=1)
+                 next_day = min(next_day, actual_end)
+
+                 day_start_str = current_day.strftime("%Y-%m-%d")
+                 day_end_str = (next_day - pd.Timedelta(seconds=1)).strftime("%Y-%m-%d")
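+                 # next_day - 1 second falls on the same calendar day, so
+                 # day_start_str == day_end_str: each fetch covers exactly one day.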
+
+                 if source == "binance":
+                     tick_data = _fetch_binance(
+                         symbol, day_start_str, day_end_str, market_normalized
+                     )
+                 else:
+                     tick_data = _fetch_exness(
+                         symbol, day_start_str, day_end_str, "strict"
+                     )
+
+                 if not tick_data.is_empty():
+                     storage.write_ticks(cache_symbol, tick_data)
+
+                     # Deduplicate trades by trade_id
+                     if "agg_trade_id" in tick_data.columns:
+                         tick_data = tick_data.unique(
+                             subset=["agg_trade_id"], maintain_order=True
+                         )
+                     elif "trade_id" in tick_data.columns:
+                         tick_data = tick_data.unique(
+                             subset=["trade_id"], maintain_order=True
+                         )
+
+                     # Sort by (timestamp, trade_id) - Rust crate requires order
+                     if "timestamp" in tick_data.columns:
+                         if "agg_trade_id" in tick_data.columns:
+                             tick_data = tick_data.sort(["timestamp", "agg_trade_id"])
+                         elif "trade_id" in tick_data.columns:
+                             tick_data = tick_data.sort(["timestamp", "trade_id"])
+                         else:
+                             tick_data = tick_data.sort("timestamp")
+
+                     total_ticks += len(tick_data)
+
+                     # Report progress - processing
+                     if progress_callback:
+                         progress_callback(
+                             PrecomputeProgress(
+                                 phase="processing",
+                                 current_month=month_str,
+                                 months_completed=i,
+                                 months_total=len(months),
+                                 bars_generated=sum(len(b) for b in all_bars),
+                                 ticks_processed=total_ticks,
+                                 elapsed_seconds=time.time() - start_time,
+                             )
+                         )
+
+                     # Process with chunking for memory efficiency
+                     tick_count = len(tick_data)
+                     for chunk_start in range(0, tick_count, chunk_size):
+                         chunk_end = min(chunk_start + chunk_size, tick_count)
+                         chunk_df = tick_data.slice(chunk_start, chunk_end - chunk_start)
+                         chunk = chunk_df.to_dicts()
+
+                         # Stream to Rust processor (Issue #16: use streaming mode)
+                         bars = processor.process_trades_streaming(chunk)
+                         if bars:
+                             # Issue #30: Always include microstructure for ClickHouse cache
+                             bars_df = processor.to_dataframe(
+                                 bars, include_microstructure=True
+                             )
+                             month_bars.append(
+                                 bars_df
+                             )  # Issue #27: Track per-month bars
+
+                         del chunk, chunk_df
+
+                 del tick_data
+                 gc.collect()
+
+                 current_day = next_day
+
+         # Issue #27: Incremental caching - store bars to ClickHouse after each month
+         # This provides crash resilience and bounded memory for DB writes
+         if month_bars:
+             # MEM-006: Use Polars for memory-efficient concatenation
+             month_df = _concat_pandas_via_polars(month_bars)
+
+             # Report progress - caching this month
+             if progress_callback:
+                 progress_callback(
+                     PrecomputeProgress(
+                         phase="caching",
+                         current_month=month_str,
+                         months_completed=i + 1,
+                         months_total=len(months),
+                         bars_generated=len(month_df) + sum(len(b) for b in all_bars),
+                         ticks_processed=total_ticks,
+                         elapsed_seconds=time.time() - start_time,
+                     )
+                 )
+
+             # Cache immediately (idempotent via ReplacingMergeTree)
+             rows_sent = len(month_df)
+             rows_inserted = cache.store_bars_bulk(
+                 symbol, threshold_decimal_bps, month_df
+             )
+
+             # Post-cache validation: FAIL LOUDLY if ClickHouse didn't receive all bars
+             if rows_inserted != rows_sent:
+                 msg = (
+                     f"ClickHouse cache validation FAILED for {month_str}: "
+                     f"sent {rows_sent} bars but only {rows_inserted} inserted. "
+                     f"Data integrity compromised - aborting."
+                 )
+                 raise RuntimeError(msg)
+
+             # Preserve for final validation and return
+             all_bars.append(month_df)
+             month_bars = []  # Clear to reclaim memory
+             gc.collect()
+
+     # Combine all bars (MEM-006: use Polars for memory efficiency)
+     if all_bars:
+         final_bars = _concat_pandas_via_polars(all_bars)
+     else:
+         final_bars = pd.DataFrame(columns=["Open", "High", "Low", "Close", "Volume"])
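+     # Capitalized OHLCV column names follow the backtesting.py column
+     # convention referenced in the module-level comment at the top of this file.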
+
+     # Note: Caching now happens incrementally after each month (Issue #27)
+     # No final bulk store needed - all bars already cached per-month
+
+     # Validate continuity (Issue #19: configurable tolerance and validation mode)
+     continuity_valid: bool | None = None
+
+     if validate_on_complete == "skip":
+         # Skip validation entirely
+         continuity_valid = None
+     else:
+         continuity_result = validate_continuity(
+             final_bars,
+             tolerance_pct=continuity_tolerance_pct,
+             threshold_decimal_bps=threshold_decimal_bps,
+         )
+         continuity_valid = continuity_result["is_valid"]
+
+         if not continuity_valid:
+             if validate_on_complete == "error":
+                 msg = f"Found {continuity_result['discontinuity_count']} discontinuities in precomputed bars"
+                 raise ContinuityError(msg, continuity_result["discontinuities"])
+             if validate_on_complete == "warn":
+                 import warnings
+
+                 msg = (
+                     f"Found {continuity_result['discontinuity_count']} discontinuities "
+                     f"in precomputed bars (tolerance: {continuity_tolerance_pct:.4%})"
+                 )
+                 warnings.warn(msg, stacklevel=2)
+
+     return PrecomputeResult(
+         symbol=symbol,
+         threshold_decimal_bps=threshold_decimal_bps,
+         start_date=start_date,
+         end_date=end_date,
+         total_bars=len(final_bars),
+         total_ticks=total_ticks,
+         elapsed_seconds=time.time() - start_time,
+         continuity_valid=continuity_valid,
+         cache_key=f"{symbol}_{threshold_decimal_bps}",
+     )
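
For orientation, a minimal usage sketch assembled only from the precompute_range_bars signature and docstring shown in the diff above; the symbol, date range, threshold, and memory cap below are illustrative values, not defaults taken from the package:

from rangebar.orchestration.precompute import precompute_range_bars
from rangebar.validation.continuity import ContinuityError


def on_progress(p):
    # PrecomputeProgress exposes phase ("fetching"/"processing"/"caching"),
    # current_month, and bars_generated, per the calls in the diff above.
    print(f"{p.phase} [{p.current_month}] {p.bars_generated} bars")


try:
    result = precompute_range_bars(
        "BTCUSDT",
        "2024-01-01",
        "2024-03-31",
        threshold_decimal_bps=250,    # 0.25% range bars
        invalidate_existing="smart",  # clear overlapping cached bars first
        max_memory_gb=8.0,            # RLIMIT_AS cap: MemoryError instead of an OOM kill
        progress_callback=on_progress,
    )
except ContinuityError as exc:
    # Raised in the default validate_on_complete="error" mode
    print(f"Precomputed bars are discontinuous: {exc}")
else:
    print(
        f"{result.total_bars} bars from {result.total_ticks:,} ticks "
        f"in {result.elapsed_seconds:.1f}s (continuity_valid={result.continuity_valid})"
    )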