rangebar-11.6.1-cp313-cp313-macosx_11_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rangebar/CLAUDE.md +327 -0
- rangebar/__init__.py +227 -0
- rangebar/__init__.pyi +1089 -0
- rangebar/_core.cpython-313-darwin.so +0 -0
- rangebar/checkpoint.py +472 -0
- rangebar/cli.py +298 -0
- rangebar/clickhouse/CLAUDE.md +139 -0
- rangebar/clickhouse/__init__.py +100 -0
- rangebar/clickhouse/bulk_operations.py +309 -0
- rangebar/clickhouse/cache.py +734 -0
- rangebar/clickhouse/client.py +121 -0
- rangebar/clickhouse/config.py +141 -0
- rangebar/clickhouse/mixin.py +120 -0
- rangebar/clickhouse/preflight.py +504 -0
- rangebar/clickhouse/query_operations.py +345 -0
- rangebar/clickhouse/schema.sql +187 -0
- rangebar/clickhouse/tunnel.py +222 -0
- rangebar/constants.py +288 -0
- rangebar/conversion.py +177 -0
- rangebar/exceptions.py +207 -0
- rangebar/exness.py +364 -0
- rangebar/hooks.py +311 -0
- rangebar/logging.py +171 -0
- rangebar/notify/__init__.py +15 -0
- rangebar/notify/pushover.py +155 -0
- rangebar/notify/telegram.py +271 -0
- rangebar/orchestration/__init__.py +20 -0
- rangebar/orchestration/count_bounded.py +797 -0
- rangebar/orchestration/helpers.py +412 -0
- rangebar/orchestration/models.py +76 -0
- rangebar/orchestration/precompute.py +498 -0
- rangebar/orchestration/range_bars.py +736 -0
- rangebar/orchestration/tick_fetcher.py +226 -0
- rangebar/ouroboros.py +454 -0
- rangebar/processors/__init__.py +22 -0
- rangebar/processors/api.py +383 -0
- rangebar/processors/core.py +522 -0
- rangebar/resource_guard.py +567 -0
- rangebar/storage/__init__.py +22 -0
- rangebar/storage/checksum_registry.py +218 -0
- rangebar/storage/parquet.py +728 -0
- rangebar/streaming.py +300 -0
- rangebar/validation/__init__.py +69 -0
- rangebar/validation/cache_staleness.py +277 -0
- rangebar/validation/continuity.py +664 -0
- rangebar/validation/gap_classification.py +294 -0
- rangebar/validation/post_storage.py +317 -0
- rangebar/validation/tier1.py +175 -0
- rangebar/validation/tier2.py +261 -0
- rangebar-11.6.1.dist-info/METADATA +308 -0
- rangebar-11.6.1.dist-info/RECORD +54 -0
- rangebar-11.6.1.dist-info/WHEEL +4 -0
- rangebar-11.6.1.dist-info/entry_points.txt +2 -0
- rangebar-11.6.1.dist-info/licenses/LICENSE +21 -0

rangebar/orchestration/precompute.py
@@ -0,0 +1,498 @@
# polars-exception: backtesting.py requires Pandas DataFrames with DatetimeIndex
# Issue #46: Modularization M4 - Extract precompute_range_bars from __init__.py
"""Batch precomputation pipeline for range bars.

Provides precompute_range_bars() for ML workflows requiring continuous
bar sequences with cache invalidation and continuity validation.
"""

from __future__ import annotations

from collections.abc import Callable

import pandas as pd

from rangebar.constants import (
    THRESHOLD_DECIMAL_MAX,
    THRESHOLD_DECIMAL_MIN,
)
from rangebar.conversion import _concat_pandas_via_polars
from rangebar.processors.core import RangeBarProcessor
from rangebar.validation.continuity import ContinuityError, validate_continuity

from .helpers import _fetch_binance, _fetch_exness
from .models import PrecomputeProgress, PrecomputeResult


def precompute_range_bars(
    symbol: str,
    start_date: str,
    end_date: str,
    threshold_decimal_bps: int = 250,
    *,
    source: str = "binance",
    market: str = "spot",
    chunk_size: int = 100_000,
    invalidate_existing: str = "smart",
    progress_callback: Callable[[PrecomputeProgress], None] | None = None,
    include_microstructure: bool = False,
    validate_on_complete: str = "error",
    continuity_tolerance_pct: float = 0.001,
    cache_dir: str | None = None,
    max_memory_gb: float | None = None,
) -> PrecomputeResult:
"""Precompute continuous range bars for a date range (single-pass, guaranteed continuity).
|
|
45
|
+
|
|
46
|
+
Designed for ML workflows requiring continuous bar sequences for training/validation.
|
|
47
|
+
Uses Checkpoint API for memory-efficient chunked processing with state preservation.
|
|
48
|
+
|
|
49
|
+
Parameters
|
|
50
|
+
----------
|
|
51
|
+
symbol : str
|
|
52
|
+
Trading symbol (e.g., "BTCUSDT")
|
|
53
|
+
start_date : str
|
|
54
|
+
Start date (inclusive) "YYYY-MM-DD"
|
|
55
|
+
end_date : str
|
|
56
|
+
End date (inclusive) "YYYY-MM-DD"
|
|
57
|
+
threshold_decimal_bps : int, default=250
|
|
58
|
+
Range bar threshold (250 = 0.25%)
|
|
59
|
+
source : str, default="binance"
|
|
60
|
+
Data source ("binance" or "exness")
|
|
61
|
+
market : str, default="spot"
|
|
62
|
+
Market type for Binance ("spot", "futures-um", "futures-cm")
|
|
63
|
+
chunk_size : int, default=100_000
|
|
64
|
+
Ticks per processing chunk (~15MB memory per 100K ticks)
|
|
65
|
+
invalidate_existing : str, default="smart"
|
|
66
|
+
Cache invalidation strategy:
|
|
67
|
+
- "overlap": Invalidate only bars in date range
|
|
68
|
+
- "full": Invalidate ALL bars for symbol/threshold
|
|
69
|
+
- "none": Skip if any cached bars exist in range
|
|
70
|
+
- "smart": Invalidate overlapping + validate junction continuity
|
|
71
|
+
progress_callback : Callable, optional
|
|
72
|
+
Optional callback for progress updates
|
|
73
|
+
include_microstructure : bool, default=False
|
|
74
|
+
Include order flow metrics (buy_volume, sell_volume, vwap)
|
|
75
|
+
validate_on_complete : str, default="error"
|
|
76
|
+
Continuity validation mode after precomputation:
|
|
77
|
+
- "error": Raise ContinuityError if discontinuities found
|
|
78
|
+
- "warn": Log warning but continue (sets continuity_valid=False)
|
|
79
|
+
- "skip": Skip validation entirely (continuity_valid=None)
|
|
80
|
+
continuity_tolerance_pct : float, default=0.001
|
|
81
|
+
Maximum allowed price gap percentage for continuity validation.
|
|
82
|
+
Default 0.1% (0.001) accommodates market microstructure events.
|
|
83
|
+
The total allowed gap is threshold_pct + continuity_tolerance_pct.
|
|
84
|
+
cache_dir : str or None, default=None
|
|
85
|
+
Custom cache directory for tick data
|
|
86
|
+
max_memory_gb : float or None, default=None
|
|
87
|
+
Process-level memory cap in GB. Sets RLIMIT_AS so that
|
|
88
|
+
exceeding the limit raises MemoryError instead of OOM kill.
|
|
89
|
+
None disables the cap.
|
|
90
|
+
|
|
91
|
+
Returns
|
|
92
|
+
-------
|
|
93
|
+
PrecomputeResult
|
|
94
|
+
Result with statistics and cache key
|
|
95
|
+
|
|
96
|
+
Raises
|
|
97
|
+
------
|
|
98
|
+
ValueError
|
|
99
|
+
Invalid parameters
|
|
100
|
+
RuntimeError
|
|
101
|
+
Fetch or processing failure
|
|
102
|
+
ContinuityError
|
|
103
|
+
If validate_on_complete=True and discontinuities found
|
|
104
|
+
|
|
105
|
+
Examples
|
|
106
|
+
--------
|
|
107
|
+
Basic precomputation:
|
|
108
|
+
|
|
109
|
+
>>> result = precompute_range_bars("BTCUSDT", "2024-01-01", "2024-06-30")
|
|
110
|
+
>>> print(f"Generated {result.total_bars} bars")
|
|
111
|
+
|
|
112
|
+
With progress callback:
|
|
113
|
+
|
|
114
|
+
>>> def on_progress(p):
|
|
115
|
+
... print(f"[{p.current_month}] {p.bars_generated} bars")
|
|
116
|
+
>>> result = precompute_range_bars(
|
|
117
|
+
... "BTCUSDT", "2024-01-01", "2024-03-31",
|
|
118
|
+
... progress_callback=on_progress
|
|
119
|
+
... )
|
|
120
|
+
"""
|
|
121
|
+
    import gc
    import time
    from datetime import datetime
    from pathlib import Path

    from rangebar.clickhouse import CacheKey, RangeBarCache
    from rangebar.storage.parquet import TickStorage

    # MEM-009: Set process-level memory cap if requested (Issue #49)
    if max_memory_gb is not None:
        from rangebar.resource_guard import set_memory_limit

        set_memory_limit(max_gb=max_memory_gb)

    start_time = time.time()

    # Validate parameters
    if invalidate_existing not in ("overlap", "full", "none", "smart"):
        msg = f"Invalid invalidate_existing: {invalidate_existing!r}. Must be 'overlap', 'full', 'none', or 'smart'"
        raise ValueError(msg)

    if validate_on_complete not in ("error", "warn", "skip"):
        msg = f"Invalid validate_on_complete: {validate_on_complete!r}. Must be 'error', 'warn', or 'skip'"
        raise ValueError(msg)

    if not THRESHOLD_DECIMAL_MIN <= threshold_decimal_bps <= THRESHOLD_DECIMAL_MAX:
        msg = f"threshold_decimal_bps must be between {THRESHOLD_DECIMAL_MIN} and {THRESHOLD_DECIMAL_MAX}"
        raise ValueError(msg)

    # Parse dates
    try:
        start_dt = datetime.strptime(start_date, "%Y-%m-%d")
        end_dt = datetime.strptime(end_date, "%Y-%m-%d")
    except ValueError as e:
        msg = f"Invalid date format. Use YYYY-MM-DD: {e}"
        raise ValueError(msg) from e

    if start_dt > end_dt:
        msg = "start_date must be <= end_date"
        raise ValueError(msg)

    # Normalize market type
    market_map = {
        "spot": "spot",
        "futures-um": "um",
        "futures-cm": "cm",
        "um": "um",
        "cm": "cm",
    }
    market_normalized = market_map.get(market.lower(), market.lower())

    # Initialize storage and cache
    storage = TickStorage(cache_dir=Path(cache_dir) if cache_dir else None)
    cache = RangeBarCache()

    # Generate list of months to process
    months: list[tuple[int, int]] = []
    current = start_dt.replace(day=1)
    _december = 12
    while current <= end_dt:
        months.append((current.year, current.month))
        # Move to next month
        if current.month == _december:
            current = current.replace(year=current.year + 1, month=1)
        else:
            current = current.replace(month=current.month + 1)

    # Handle cache invalidation
    start_ts = int(start_dt.timestamp() * 1000)
    end_ts = int((end_dt.timestamp() + 86399) * 1000)  # End of day
    cache_key = CacheKey(
        symbol=symbol,
        threshold_decimal_bps=threshold_decimal_bps,
        start_ts=start_ts,
        end_ts=end_ts,
    )

    if invalidate_existing == "full":
        cache.invalidate_range_bars(cache_key)
    elif invalidate_existing in ("overlap", "smart"):
        # Check for overlapping bars - will be handled after processing
        # For now, just invalidate the date range to ensure clean slate
        cache.invalidate_range_bars(cache_key)
    elif invalidate_existing == "none":
        # Check if any bars exist in range by counting
        bar_count = cache.count_bars(symbol, threshold_decimal_bps)
        if bar_count > 0:
            # Return early - some cached data exists
            # Note: This is approximate; full implementation would check time range
            return PrecomputeResult(
                symbol=symbol,
                threshold_decimal_bps=threshold_decimal_bps,
                start_date=start_date,
                end_date=end_date,
                total_bars=bar_count,
                total_ticks=0,
                elapsed_seconds=time.time() - start_time,
                continuity_valid=True,  # Assume valid for cached data
                cache_key=f"{symbol}_{threshold_decimal_bps}",
            )

    # Initialize processor (single instance for continuity)
    processor = RangeBarProcessor(threshold_decimal_bps, symbol=symbol)

    all_bars: list[pd.DataFrame] = []
    month_bars: list[pd.DataFrame] = (
        []
    )  # Issue #27: Track bars per month for incremental caching
    total_ticks = 0
    cache_symbol = f"{source}_{market_normalized}_{symbol}".upper()

    for i, (year, month) in enumerate(months):
        month_str = f"{year}-{month:02d}"

        # Report progress - fetching
        if progress_callback:
            progress_callback(
                PrecomputeProgress(
                    phase="fetching",
                    current_month=month_str,
                    months_completed=i,
                    months_total=len(months),
                    bars_generated=sum(len(b) for b in all_bars),
                    ticks_processed=total_ticks,
                    elapsed_seconds=time.time() - start_time,
                )
            )

        # Calculate month boundaries
        month_start = datetime(year, month, 1)
        _december = 12
        if month == _december:
            month_end = datetime(year + 1, 1, 1)
        else:
            month_end = datetime(year, month + 1, 1)

        # Adjust to fit within requested date range
        actual_start = max(month_start, start_dt)
        actual_end = min(month_end, end_dt + pd.Timedelta(days=1))

        # Fetch tick data for this period
        start_ts_month = int(actual_start.timestamp() * 1000)
        end_ts_month = int(actual_end.timestamp() * 1000)

        # Check if data is already cached
        data_cached = storage.has_ticks(cache_symbol, start_ts_month, end_ts_month)

        if data_cached:
            # STREAMING READ: Use row-group based streaming to avoid OOM (Issue #12)
            # The Rust processor maintains state between process_trades() calls,
            # so we stream chunks directly without loading entire month into memory
            month_has_data = False
            for raw_tick_chunk in storage.read_ticks_streaming(
                cache_symbol, start_ts_month, end_ts_month, chunk_size=chunk_size
            ):
                month_has_data = True

                # Deduplicate trades by trade_id within chunk
                tick_chunk = raw_tick_chunk
                if "agg_trade_id" in tick_chunk.columns:
                    tick_chunk = tick_chunk.unique(
                        subset=["agg_trade_id"], maintain_order=True
                    )
                elif "trade_id" in tick_chunk.columns:
                    tick_chunk = tick_chunk.unique(
                        subset=["trade_id"], maintain_order=True
                    )

                # Sort by (timestamp, trade_id) - Rust crate requires this order
                if "timestamp" in tick_chunk.columns:
                    if "agg_trade_id" in tick_chunk.columns:
                        tick_chunk = tick_chunk.sort(["timestamp", "agg_trade_id"])
                    elif "trade_id" in tick_chunk.columns:
                        tick_chunk = tick_chunk.sort(["timestamp", "trade_id"])
                    else:
                        tick_chunk = tick_chunk.sort("timestamp")

                total_ticks += len(tick_chunk)

                # Report progress - processing
                if progress_callback:
                    progress_callback(
                        PrecomputeProgress(
                            phase="processing",
                            current_month=month_str,
                            months_completed=i,
                            months_total=len(months),
                            bars_generated=sum(len(b) for b in all_bars),
                            ticks_processed=total_ticks,
                            elapsed_seconds=time.time() - start_time,
                        )
                    )

                # Stream directly to Rust processor (Issue #16: use streaming mode)
                chunk = tick_chunk.to_dicts()
                bars = processor.process_trades_streaming(chunk)
                if bars:
                    # Issue #30: Always include microstructure for ClickHouse cache
                    bars_df = processor.to_dataframe(bars, include_microstructure=True)
                    month_bars.append(bars_df)  # Issue #27: Track per-month bars

                del chunk, tick_chunk

            gc.collect()

            if not month_has_data:
                continue
        else:
            # DATA NOT CACHED: Fetch from source day-by-day to prevent OOM
            # Issue #14: Fetching entire month at once causes OOM for high-volume
            # months like March 2024. Fetch day-by-day instead.
            current_day = actual_start
            while current_day < actual_end:
                next_day = current_day + pd.Timedelta(days=1)
                next_day = min(next_day, actual_end)

                day_start_str = current_day.strftime("%Y-%m-%d")
                day_end_str = (next_day - pd.Timedelta(seconds=1)).strftime("%Y-%m-%d")

                if source == "binance":
                    tick_data = _fetch_binance(
                        symbol, day_start_str, day_end_str, market_normalized
                    )
                else:
                    tick_data = _fetch_exness(
                        symbol, day_start_str, day_end_str, "strict"
                    )

                if not tick_data.is_empty():
                    storage.write_ticks(cache_symbol, tick_data)

                    # Deduplicate trades by trade_id
                    if "agg_trade_id" in tick_data.columns:
                        tick_data = tick_data.unique(
                            subset=["agg_trade_id"], maintain_order=True
                        )
                    elif "trade_id" in tick_data.columns:
                        tick_data = tick_data.unique(
                            subset=["trade_id"], maintain_order=True
                        )

                    # Sort by (timestamp, trade_id) - Rust crate requires order
                    if "timestamp" in tick_data.columns:
                        if "agg_trade_id" in tick_data.columns:
                            tick_data = tick_data.sort(["timestamp", "agg_trade_id"])
                        elif "trade_id" in tick_data.columns:
                            tick_data = tick_data.sort(["timestamp", "trade_id"])
                        else:
                            tick_data = tick_data.sort("timestamp")

                    total_ticks += len(tick_data)

                    # Report progress - processing
                    if progress_callback:
                        progress_callback(
                            PrecomputeProgress(
                                phase="processing",
                                current_month=month_str,
                                months_completed=i,
                                months_total=len(months),
                                bars_generated=sum(len(b) for b in all_bars),
                                ticks_processed=total_ticks,
                                elapsed_seconds=time.time() - start_time,
                            )
                        )

                    # Process with chunking for memory efficiency
                    tick_count = len(tick_data)
                    for chunk_start in range(0, tick_count, chunk_size):
                        chunk_end = min(chunk_start + chunk_size, tick_count)
                        chunk_df = tick_data.slice(chunk_start, chunk_end - chunk_start)
                        chunk = chunk_df.to_dicts()

                        # Stream to Rust processor (Issue #16: use streaming mode)
                        bars = processor.process_trades_streaming(chunk)
                        if bars:
                            # Issue #30: Always include microstructure for ClickHouse cache
                            bars_df = processor.to_dataframe(
                                bars, include_microstructure=True
                            )
                            month_bars.append(
                                bars_df
                            )  # Issue #27: Track per-month bars

                        del chunk, chunk_df

                    del tick_data
                    gc.collect()

                current_day = next_day

        # Issue #27: Incremental caching - store bars to ClickHouse after each month
        # This provides crash resilience and bounded memory for DB writes
        if month_bars:
            # MEM-006: Use Polars for memory-efficient concatenation
            month_df = _concat_pandas_via_polars(month_bars)

            # Report progress - caching this month
            if progress_callback:
                progress_callback(
                    PrecomputeProgress(
                        phase="caching",
                        current_month=month_str,
                        months_completed=i + 1,
                        months_total=len(months),
                        bars_generated=len(month_df) + sum(len(b) for b in all_bars),
                        ticks_processed=total_ticks,
                        elapsed_seconds=time.time() - start_time,
                    )
                )

            # Cache immediately (idempotent via ReplacingMergeTree)
            rows_sent = len(month_df)
            rows_inserted = cache.store_bars_bulk(
                symbol, threshold_decimal_bps, month_df
            )

            # Post-cache validation: FAIL LOUDLY if ClickHouse didn't receive all bars
            if rows_inserted != rows_sent:
                msg = (
                    f"ClickHouse cache validation FAILED for {month_str}: "
                    f"sent {rows_sent} bars but only {rows_inserted} inserted. "
                    f"Data integrity compromised - aborting."
                )
                raise RuntimeError(msg)

            # Preserve for final validation and return
            all_bars.append(month_df)
            month_bars = []  # Clear to reclaim memory
            gc.collect()

    # Combine all bars (MEM-006: use Polars for memory efficiency)
    if all_bars:
        final_bars = _concat_pandas_via_polars(all_bars)
    else:
        final_bars = pd.DataFrame(columns=["Open", "High", "Low", "Close", "Volume"])

    # Note: Caching now happens incrementally after each month (Issue #27)
    # No final bulk store needed - all bars already cached per-month

    # Validate continuity (Issue #19: configurable tolerance and validation mode)
    continuity_valid: bool | None = None

    if validate_on_complete == "skip":
        # Skip validation entirely
        continuity_valid = None
    else:
        continuity_result = validate_continuity(
            final_bars,
            tolerance_pct=continuity_tolerance_pct,
            threshold_decimal_bps=threshold_decimal_bps,
        )
        continuity_valid = continuity_result["is_valid"]

        if not continuity_valid:
            if validate_on_complete == "error":
                msg = f"Found {continuity_result['discontinuity_count']} discontinuities in precomputed bars"
                raise ContinuityError(msg, continuity_result["discontinuities"])
            if validate_on_complete == "warn":
                import warnings

                msg = (
                    f"Found {continuity_result['discontinuity_count']} discontinuities "
                    f"in precomputed bars (tolerance: {continuity_tolerance_pct:.4%})"
                )
                warnings.warn(msg, stacklevel=2)

    return PrecomputeResult(
        symbol=symbol,
        threshold_decimal_bps=threshold_decimal_bps,
        start_date=start_date,
        end_date=end_date,
        total_bars=len(final_bars),
        total_ticks=total_ticks,
        elapsed_seconds=time.time() - start_time,
        continuity_valid=continuity_valid,
        cache_key=f"{symbol}_{threshold_decimal_bps}",
    )
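
For orientation, a minimal usage sketch of the function added in this diff. Parameter names, progress fields, and result fields are taken from the signature and the PrecomputeProgress/PrecomputeResult constructions shown above; the callback body and the argument values are illustrative only, not part of the packaged file, and the call assumes the package's ClickHouse cache and tick storage backends are configured.

# Sketch only - illustrative values, based on the signature shown in the diff above
from rangebar.orchestration.models import PrecomputeProgress
from rangebar.orchestration.precompute import precompute_range_bars


def on_progress(p: PrecomputeProgress) -> None:
    # Phases emitted by the function above: "fetching", "processing", "caching"
    print(f"{p.phase} {p.current_month}: {p.bars_generated} bars, {p.ticks_processed} ticks")


result = precompute_range_bars(
    "BTCUSDT",
    "2024-01-01",
    "2024-03-31",
    threshold_decimal_bps=250,        # 250 decimal bps = 0.25% range bars
    source="binance",
    market="futures-um",
    invalidate_existing="smart",      # invalidate overlapping cached bars, then recompute
    validate_on_complete="warn",      # warn on discontinuities instead of raising
    max_memory_gb=8.0,                # RLIMIT_AS cap: MemoryError instead of OOM kill
    progress_callback=on_progress,
)

print(result.total_bars, result.total_ticks, result.continuity_valid, result.cache_key)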