rangebar-11.6.1-cp313-cp313-macosx_11_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rangebar/CLAUDE.md +327 -0
- rangebar/__init__.py +227 -0
- rangebar/__init__.pyi +1089 -0
- rangebar/_core.cpython-313-darwin.so +0 -0
- rangebar/checkpoint.py +472 -0
- rangebar/cli.py +298 -0
- rangebar/clickhouse/CLAUDE.md +139 -0
- rangebar/clickhouse/__init__.py +100 -0
- rangebar/clickhouse/bulk_operations.py +309 -0
- rangebar/clickhouse/cache.py +734 -0
- rangebar/clickhouse/client.py +121 -0
- rangebar/clickhouse/config.py +141 -0
- rangebar/clickhouse/mixin.py +120 -0
- rangebar/clickhouse/preflight.py +504 -0
- rangebar/clickhouse/query_operations.py +345 -0
- rangebar/clickhouse/schema.sql +187 -0
- rangebar/clickhouse/tunnel.py +222 -0
- rangebar/constants.py +288 -0
- rangebar/conversion.py +177 -0
- rangebar/exceptions.py +207 -0
- rangebar/exness.py +364 -0
- rangebar/hooks.py +311 -0
- rangebar/logging.py +171 -0
- rangebar/notify/__init__.py +15 -0
- rangebar/notify/pushover.py +155 -0
- rangebar/notify/telegram.py +271 -0
- rangebar/orchestration/__init__.py +20 -0
- rangebar/orchestration/count_bounded.py +797 -0
- rangebar/orchestration/helpers.py +412 -0
- rangebar/orchestration/models.py +76 -0
- rangebar/orchestration/precompute.py +498 -0
- rangebar/orchestration/range_bars.py +736 -0
- rangebar/orchestration/tick_fetcher.py +226 -0
- rangebar/ouroboros.py +454 -0
- rangebar/processors/__init__.py +22 -0
- rangebar/processors/api.py +383 -0
- rangebar/processors/core.py +522 -0
- rangebar/resource_guard.py +567 -0
- rangebar/storage/__init__.py +22 -0
- rangebar/storage/checksum_registry.py +218 -0
- rangebar/storage/parquet.py +728 -0
- rangebar/streaming.py +300 -0
- rangebar/validation/__init__.py +69 -0
- rangebar/validation/cache_staleness.py +277 -0
- rangebar/validation/continuity.py +664 -0
- rangebar/validation/gap_classification.py +294 -0
- rangebar/validation/post_storage.py +317 -0
- rangebar/validation/tier1.py +175 -0
- rangebar/validation/tier2.py +261 -0
- rangebar-11.6.1.dist-info/METADATA +308 -0
- rangebar-11.6.1.dist-info/RECORD +54 -0
- rangebar-11.6.1.dist-info/WHEEL +4 -0
- rangebar-11.6.1.dist-info/entry_points.txt +2 -0
- rangebar-11.6.1.dist-info/licenses/LICENSE +21 -0
rangebar/storage/parquet.py

@@ -0,0 +1,728 @@
"""Parquet-based tick storage with ZSTD-3 compression.

This module replaces ClickHouse Tier 1 (raw trades cache) with local
Parquet files for better portability and no server requirement.

Compression choice (based on empirical benchmark 2026-01-07):
- ZSTD-3: 6.50 MB for 761K trades (5.37x compression)
- Write: 0.019s, Read: 0.006s
- Beats Brotli on BOTH size AND speed for tick data
"""

from __future__ import annotations

import os
import shutil
from collections.abc import Iterator
from datetime import UTC, datetime
from pathlib import Path
from typing import TYPE_CHECKING

import polars as pl
from platformdirs import user_cache_dir

if TYPE_CHECKING:
    import pandas as pd

# Constants
COMPRESSION = "zstd"
COMPRESSION_LEVEL = 3
APP_NAME = "rangebar"
APP_AUTHOR = "terrylica"
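The two compression constants above pin the codec the whole module writes with. A minimal sketch of the kind of comparison behind the docstring's benchmark claim, using a tiny stand-in frame and hypothetical output file names (not the 761K-trade dataset):

import polars as pl

sample = pl.DataFrame({"timestamp": [1704067200000], "price": [42000.0], "qty": [0.5]})
sample.write_parquet("ticks_zstd3.parquet", compression="zstd", compression_level=3)
sample.write_parquet("ticks_brotli.parquet", compression="brotli")  # per the docstring, loses on both size and speed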
def get_cache_dir() -> Path:
    """Get the cross-platform cache directory for rangebar.

    Returns
    -------
    Path
        Platform-specific cache directory:
        - macOS: ~/Library/Caches/rangebar/
        - Linux: ~/.cache/rangebar/ (respects XDG_CACHE_HOME)
        - Windows: %USERPROFILE%\\AppData\\Local\\terrylica\\rangebar\\Cache\\

    Examples
    --------
    >>> from rangebar.storage import get_cache_dir
    >>> cache_dir = get_cache_dir()
    >>> print(cache_dir)
    /Users/username/Library/Caches/rangebar
    """
    # Allow override via environment variable
    env_override = os.getenv("RANGEBAR_CACHE_DIR")
    if env_override:
        return Path(env_override)

    return Path(user_cache_dir(APP_NAME, APP_AUTHOR))
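A minimal sketch of the RANGEBAR_CACHE_DIR override path, reusing the import shown in the docstring example:

import os
from rangebar.storage import get_cache_dir

os.environ["RANGEBAR_CACHE_DIR"] = "/tmp/rangebar-cache"  # hypothetical path
print(get_cache_dir())  # /tmp/rangebar-cache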
class TickStorage:
    """Parquet-based tick data storage with ZSTD-3 compression.

    Stores raw tick data in Parquet files partitioned by symbol and month.
    Uses polars for fast I/O with ZSTD-3 compression.

    Parameters
    ----------
    cache_dir : Path | str | None
        Custom cache directory. If None, uses platformdirs default.

    Examples
    --------
    >>> storage = TickStorage()
    >>> storage.write_ticks("BTCUSDT", trades_df)
    >>> df = storage.read_ticks("BTCUSDT", start_ts, end_ts)

    Directory Structure
    -------------------
    ~/.cache/rangebar/ticks/
    ├── BTCUSDT/
    │   ├── 2024-01.parquet
    │   ├── 2024-02.parquet
    │   └── ...
    └── EURUSD/
        └── ...
    """

    def __init__(self, cache_dir: Path | str | None = None) -> None:
        """Initialize tick storage.

        Parameters
        ----------
        cache_dir : Path | str | None
            Custom cache directory. If None, uses platformdirs default.
        """
        if cache_dir is None:
            self._cache_dir = get_cache_dir()
        else:
            self._cache_dir = Path(cache_dir)

        self._ticks_dir = self._cache_dir / "ticks"

    @property
    def cache_dir(self) -> Path:
        """Get the cache directory path."""
        return self._cache_dir

    @property
    def ticks_dir(self) -> Path:
        """Get the ticks storage directory path."""
        return self._ticks_dir

    def _get_symbol_dir(self, symbol: str) -> Path:
        """Get directory for a symbol's tick files."""
        return self._ticks_dir / symbol

    def _get_parquet_path(self, symbol: str, year_month: str) -> Path:
        """Get path for a specific month's parquet file.

        Parameters
        ----------
        symbol : str
            Trading symbol (e.g., "BTCUSDT")
        year_month : str
            Year-month string (e.g., "2024-01")

        Returns
        -------
        Path
            Path to the parquet file
        """
        return self._get_symbol_dir(symbol) / f"{year_month}.parquet"

    def _timestamp_to_year_month(self, timestamp_ms: int) -> str:
        """Convert millisecond timestamp to year-month string."""
        dt = datetime.fromtimestamp(timestamp_ms / 1000, tz=UTC)
        return dt.strftime("%Y-%m")
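The helper above is what maps a millisecond timestamp to its monthly partition key. A small check, using an assumed timestamp of 1706745600000 ms (2024-02-01T00:00:00Z):

from datetime import UTC, datetime

dt = datetime.fromtimestamp(1706745600000 / 1000, tz=UTC)
assert dt.strftime("%Y-%m") == "2024-02"  # lands in ticks/<symbol>/2024-02.parquet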
    def write_ticks(
        self,
        symbol: str,
        ticks: pl.DataFrame | pd.DataFrame,
        *,
        timestamp_col: str = "timestamp",
    ) -> int:
        """Write tick data to Parquet files, partitioned by month.

        Parameters
        ----------
        symbol : str
            Trading symbol (e.g., "BTCUSDT")
        ticks : pl.DataFrame | pd.DataFrame
            Tick data with timestamp column
        timestamp_col : str
            Name of the timestamp column (milliseconds since epoch or datetime)

        Returns
        -------
        int
            Number of rows written

        Notes
        -----
        Tick data is partitioned by month and appended to existing files.
        Duplicates are not automatically removed - use ClickHouse for deduplication.
        """
        # Convert pandas to polars if needed
        if not isinstance(ticks, pl.DataFrame):
            ticks = pl.from_pandas(ticks)

        if ticks.is_empty():
            return 0

        # Ensure symbol directory exists
        symbol_dir = self._get_symbol_dir(symbol)
        symbol_dir.mkdir(parents=True, exist_ok=True)

        # Convert timestamp to milliseconds if datetime
        if ticks.schema[timestamp_col] in (pl.Datetime, pl.Date):
            ticks = ticks.with_columns(
                pl.col(timestamp_col).dt.epoch(time_unit="ms").alias(timestamp_col)
            )

        # Add year_month column for partitioning (vectorized, no Python per-row calls)
        # MEM-001: Replaced map_elements() with native Polars dt operations
        # Impact: 13.4 GB → ~100 MB (99% reduction)
        ticks = ticks.with_columns(
            pl.col(timestamp_col)
            .cast(pl.Datetime(time_unit="ms"))
            .dt.strftime("%Y-%m")
            .alias("_year_month")
        )

        # Group by month and write
        total_rows = 0
        for (year_month,), group_df in ticks.group_by("_year_month"):
            parquet_path = self._get_parquet_path(symbol, year_month)

            # Drop the partition column before writing
            write_df = group_df.drop("_year_month")

            if parquet_path.exists():
                # Append to existing file
                existing_df = pl.read_parquet(parquet_path)
                combined_df = pl.concat([existing_df, write_df])
                combined_df.write_parquet(
                    parquet_path,
                    compression=COMPRESSION,
                    compression_level=COMPRESSION_LEVEL,
                )
            else:
                # Write new file
                write_df.write_parquet(
                    parquet_path,
                    compression=COMPRESSION,
                    compression_level=COMPRESSION_LEVEL,
                )

            total_rows += len(write_df)

        return total_rows
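A usage sketch for the write path, assuming a tick frame with a millisecond timestamp column and a hypothetical cache directory (both timestamps fall in the same month, so a single file is created):

import polars as pl
from rangebar.storage.parquet import TickStorage  # import path per this file's location in the wheel

storage = TickStorage(cache_dir="/tmp/rangebar-demo")  # hypothetical directory
ticks = pl.DataFrame({
    "timestamp": [1706745600000, 1706832000000],  # 2024-02-01 and 2024-02-02 (UTC, ms)
    "price": [43000.0, 43500.0],
})
assert storage.write_ticks("BTCUSDT", ticks) == 2  # writes ticks/BTCUSDT/2024-02.parquet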
    # Default Parquet compression ratio (compressed → in-memory expansion)
    # Empirically measured: Binance aggTrades Parquet files expand ~4x
    _COMPRESSION_RATIO: float = 4.0

    def read_ticks(
        self,
        symbol: str,
        start_ts: int | None = None,
        end_ts: int | None = None,
        *,
        timestamp_col: str = "timestamp",
        max_memory_mb: int | None = None,
    ) -> pl.DataFrame:
        """Read tick data from Parquet files.

        Parameters
        ----------
        symbol : str
            Trading symbol (e.g., "BTCUSDT")
        start_ts : int | None
            Start timestamp in milliseconds (inclusive)
        end_ts : int | None
            End timestamp in milliseconds (inclusive)
        timestamp_col : str
            Name of the timestamp column
        max_memory_mb : int | None
            Memory budget in MB. If the estimated in-memory size exceeds
            this limit, raises MemoryError with a suggestion to use
            read_ticks_streaming(). None disables the guard.

        Returns
        -------
        pl.DataFrame
            Tick data filtered by time range

        Raises
        ------
        MemoryError
            If estimated memory exceeds max_memory_mb budget.

        Notes
        -----
        Reads all relevant monthly files and concatenates them.
        Uses lazy evaluation for efficient memory usage.
        """
        symbol_dir = self._get_symbol_dir(symbol)

        if not symbol_dir.exists():
            return pl.DataFrame()

        # Find relevant parquet files
        parquet_files = sorted(symbol_dir.glob("*.parquet"))

        if not parquet_files:
            return pl.DataFrame()

        # Filter files by month if time range specified
        if start_ts is not None and end_ts is not None:
            start_month = self._timestamp_to_year_month(start_ts)
            end_month = self._timestamp_to_year_month(end_ts)

            parquet_files = [
                f for f in parquet_files if start_month <= f.stem <= end_month
            ]

            if not parquet_files:
                return pl.DataFrame()

        # MEM-004: Estimate size before materializing (Issue #49)
        if max_memory_mb is not None:
            total_bytes = sum(f.stat().st_size for f in parquet_files)
            estimated_mb = int(
                total_bytes * self._COMPRESSION_RATIO / (1024 * 1024)
            )
            if estimated_mb > max_memory_mb:
                msg = (
                    f"Estimated {estimated_mb} MB for {symbol} "
                    f"({len(parquet_files)} files), exceeds budget "
                    f"{max_memory_mb} MB. Use read_ticks_streaming() "
                    f"for chunked loading."
                )
                raise MemoryError(msg)

        # LAZY LOADING with predicate pushdown
        # Uses pl.scan_parquet() instead of pl.read_parquet() to enable:
        # 1. Predicate pushdown: filters applied at Parquet row-group level
        # 2. Lazy evaluation: only filtered rows loaded into memory
        # 3. 2x I/O speedup, 50% memory reduction for filtered queries
        lazy_dfs = [pl.scan_parquet(f) for f in parquet_files]
        result = pl.concat(lazy_dfs)

        # Apply time range filter (pushed down to Parquet)
        if start_ts is not None:
            result = result.filter(pl.col(timestamp_col) >= start_ts)
        if end_ts is not None:
            result = result.filter(pl.col(timestamp_col) <= end_ts)

        # Sort and materialize
        return result.sort(timestamp_col).collect()
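A sketch of the MEM-004 budget guard in use, reusing the storage instance from the write sketch above; the budget value is an assumption:

try:
    df = storage.read_ticks("BTCUSDT", max_memory_mb=256)
except MemoryError:
    # Estimate (compressed bytes x ~4 expansion) exceeded the budget; fall back to chunks.
    for chunk in storage.read_ticks_streaming("BTCUSDT", chunk_size=100_000):
        ...  # process each ~100K-row chunk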
    def read_ticks_streaming(  # noqa: PLR0912
        self,
        symbol: str,
        start_ts: int | None = None,
        end_ts: int | None = None,
        *,
        chunk_size: int = 100_000,
        timestamp_col: str = "timestamp",
    ) -> Iterator[pl.DataFrame]:
        """Read tick data in streaming chunks to avoid OOM on large months.

        This method yields chunks of tick data instead of loading everything
        into memory at once. Essential for high-volume months like March 2024.

        Parameters
        ----------
        symbol : str
            Trading symbol (e.g., "BTCUSDT")
        start_ts : int | None
            Start timestamp in milliseconds (inclusive)
        end_ts : int | None
            End timestamp in milliseconds (inclusive)
        chunk_size : int
            Number of rows per chunk (default: 100,000)
        timestamp_col : str
            Name of the timestamp column

        Yields
        ------
        pl.DataFrame
            Chunks of tick data, sorted by timestamp within each chunk

        Notes
        -----
        Memory usage is O(chunk_size) instead of O(total_ticks).
        Each chunk is sorted independently; overall order is maintained
        because parquet files are read in month order.

        Examples
        --------
        >>> storage = TickStorage()
        >>> for chunk in storage.read_ticks_streaming("BTCUSDT", start_ts, end_ts):
        ...     process_chunk(chunk)
        """
        symbol_dir = self._get_symbol_dir(symbol)

        if not symbol_dir.exists():
            return

        # Find relevant parquet files
        parquet_files = sorted(symbol_dir.glob("*.parquet"))

        if not parquet_files:
            return

        # Filter files by month if time range specified
        if start_ts is not None and end_ts is not None:
            start_month = self._timestamp_to_year_month(start_ts)
            end_month = self._timestamp_to_year_month(end_ts)

            parquet_files = [
                f for f in parquet_files if start_month <= f.stem <= end_month
            ]

            if not parquet_files:
                return

        # Process each parquet file using PyArrow's row group-based reading
        # Row groups are Parquet's native chunking mechanism (typically 64K-1M rows)
        # This is the key to avoiding OOM - we never load the entire file into memory
        import pyarrow.parquet as pq

        for parquet_file in parquet_files:
            # Read parquet file in row groups
            parquet_reader = pq.ParquetFile(parquet_file)
            num_row_groups = parquet_reader.metadata.num_row_groups

            accumulated_rows: list[pl.DataFrame] = []
            accumulated_count = 0

            for rg_idx in range(num_row_groups):
                # Read single row group into PyArrow table (memory efficient)
                row_group = parquet_reader.read_row_group(rg_idx)
                chunk_df = pl.from_arrow(row_group)

                # Apply time range filter using Polars expressions
                if start_ts is not None:
                    chunk_df = chunk_df.filter(
                        pl.col(timestamp_col) >= pl.lit(start_ts)
                    )
                if end_ts is not None:
                    chunk_df = chunk_df.filter(pl.col(timestamp_col) <= pl.lit(end_ts))

                if chunk_df.is_empty():
                    continue

                accumulated_rows.append(chunk_df)
                accumulated_count += len(chunk_df)

                # Yield when accumulated enough rows
                while accumulated_count >= chunk_size:
                    # Concatenate and slice
                    combined = pl.concat(accumulated_rows)
                    combined = combined.sort(timestamp_col)

                    # Yield chunk_size rows
                    yield combined.slice(0, chunk_size)

                    # Keep remainder for next iteration
                    remainder_count = accumulated_count - chunk_size
                    if remainder_count > 0:
                        remainder = combined.slice(chunk_size, remainder_count)
                        accumulated_rows = [remainder]
                        accumulated_count = len(remainder)
                    else:
                        accumulated_rows = []
                        accumulated_count = 0

                    del combined

            # Yield any remaining rows
            if accumulated_rows:
                combined = pl.concat(accumulated_rows)
                combined = combined.sort(timestamp_col)
                if not combined.is_empty():
                    yield combined
                del combined
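A worked sketch of the accumulator arithmetic above, with assumed row-group sizes (three groups of 60,000 rows against the default chunk_size of 100,000):

row_groups = [60_000, 60_000, 60_000]  # assumed sizes, for illustration only
chunk_size = 100_000
buffered = 0
for rg in row_groups:
    buffered += rg
    while buffered >= chunk_size:
        print(f"yield {chunk_size}")  # one full chunk
        buffered -= chunk_size
print(f"yield {buffered}")  # final partial chunk: 80_000 rows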
    def has_ticks(
        self,
        symbol: str,
        start_ts: int,
        end_ts: int,
        *,
        min_coverage: float = 0.95,
        timestamp_col: str = "timestamp",
    ) -> bool:
        """Check if tick data exists for the specified time range.

        Parameters
        ----------
        symbol : str
            Trading symbol
        start_ts : int
            Start timestamp in milliseconds
        end_ts : int
            End timestamp in milliseconds
        min_coverage : float
            Minimum coverage ratio (0.0 to 1.0)
        timestamp_col : str
            Name of the timestamp column

        Returns
        -------
        bool
            True if sufficient data exists
        """
        tick_data = self.read_ticks(
            symbol, start_ts, end_ts, timestamp_col=timestamp_col
        )

        if tick_data.is_empty():
            return False

        actual_start = tick_data[timestamp_col].min()
        actual_end = tick_data[timestamp_col].max()

        if actual_start is None or actual_end is None:
            return False

        actual_range = actual_end - actual_start
        requested_range = end_ts - start_ts

        if requested_range == 0:
            return len(tick_data) > 0

        coverage = actual_range / requested_range
        return coverage >= min_coverage
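A worked example of the coverage ratio, with assumed timestamps:

actual_start, actual_end = 40_000, 990_000  # observed tick span (assumed values)
start_ts, end_ts = 0, 1_000_000             # requested range
coverage = (actual_end - actual_start) / (end_ts - start_ts)
assert coverage == 0.95  # exactly meets the default min_coverage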
    def list_symbols(self) -> list[str]:
        """List all symbols with stored tick data.

        Returns
        -------
        list[str]
            List of symbol names
        """
        if not self._ticks_dir.exists():
            return []

        return sorted(
            d.name for d in self._ticks_dir.iterdir() if d.is_dir() and d.name != ""
        )

    def list_months(self, symbol: str) -> list[str]:
        """List all months with stored tick data for a symbol.

        Parameters
        ----------
        symbol : str
            Trading symbol

        Returns
        -------
        list[str]
            List of year-month strings (e.g., ["2024-01", "2024-02"])
        """
        symbol_dir = self._get_symbol_dir(symbol)

        if not symbol_dir.exists():
            return []

        return sorted(f.stem for f in symbol_dir.glob("*.parquet"))

    def delete_ticks(self, symbol: str, year_month: str | None = None) -> bool:
        """Delete tick data for a symbol or specific month.

        Parameters
        ----------
        symbol : str
            Trading symbol
        year_month : str | None
            Specific month to delete (e.g., "2024-01"), or None for all

        Returns
        -------
        bool
            True if files were deleted
        """
        if year_month is not None:
            # Delete specific month
            parquet_path = self._get_parquet_path(symbol, year_month)
            if parquet_path.exists():
                parquet_path.unlink()
                return True
            return False

        # Delete all data for symbol
        symbol_dir = self._get_symbol_dir(symbol)
        if symbol_dir.exists():
            shutil.rmtree(symbol_dir)
            return True
        return False
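A short sketch of targeted versus full deletion, reusing the storage instance from the earlier sketches:

assert storage.delete_ticks("BTCUSDT", "2024-02")  # drop a single monthly file
storage.delete_ticks("BTCUSDT")                    # remove the whole symbol directory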
    def get_stats(self, symbol: str) -> dict:
        """Get storage statistics for a symbol.

        Parameters
        ----------
        symbol : str
            Trading symbol

        Returns
        -------
        dict
            Statistics including file count, total size, row count, date range
        """
        symbol_dir = self._get_symbol_dir(symbol)

        if not symbol_dir.exists():
            return {
                "symbol": symbol,
                "exists": False,
                "file_count": 0,
                "total_size_bytes": 0,
                "total_rows": 0,
            }

        parquet_files = list(symbol_dir.glob("*.parquet"))
        total_size = sum(f.stat().st_size for f in parquet_files)
        total_rows = 0
        months = []

        for f in parquet_files:
            file_data = pl.read_parquet(f)
            total_rows += len(file_data)
            months.append(f.stem)

        return {
            "symbol": symbol,
            "exists": True,
            "file_count": len(parquet_files),
            "total_size_bytes": total_size,
            "total_size_mb": total_size / 1024 / 1024,
            "total_rows": total_rows,
            "months": sorted(months),
            "compression": f"{COMPRESSION}-{COMPRESSION_LEVEL}",
        }

    def fetch_month(
        self,
        symbol: str,
        year: int,
        month: int,
        *,
        timestamp_col: str = "timestamp",  # noqa: ARG002 - reserved for filtering
        force_refresh: bool = False,
    ) -> pl.LazyFrame:
        """Fetch tick data for a specific month (lazy loading).

        Returns a LazyFrame for memory efficiency. If data is not cached,
        this method does NOT automatically download from source - use
        `get_range_bars()` or manual fetching first.

        Parameters
        ----------
        symbol : str
            Trading symbol (e.g., "BTCUSDT" or "BINANCE_SPOT_BTCUSDT")
        year : int
            Year (e.g., 2024)
        month : int
            Month (1-12)
        timestamp_col : str
            Name of the timestamp column
        force_refresh : bool
            If True, skip cache and return empty LazyFrame (caller must fetch)

        Returns
        -------
        pl.LazyFrame
            Lazy frame for the month's tick data, or empty LazyFrame if not cached

        Examples
        --------
        >>> storage = TickStorage()
        >>> lf = storage.fetch_month("BTCUSDT", 2024, 1)
        >>> df = lf.collect()  # Materialize when needed
        """
        year_month = f"{year}-{month:02d}"

        if force_refresh:
            return pl.LazyFrame()

        parquet_path = self._get_parquet_path(symbol, year_month)

        if not parquet_path.exists():
            return pl.LazyFrame()

        return pl.scan_parquet(parquet_path)

    def fetch_date_range(
        self,
        symbol: str,
        start_date: str,
        end_date: str,
        *,
        timestamp_col: str = "timestamp",
    ) -> Iterator[pl.LazyFrame]:
        """Iterate over tick data for date range, one month at a time.

        Yields LazyFrames for each month in range. This is the recommended
        approach for processing large date ranges without loading all data
        into memory at once.

        Parameters
        ----------
        symbol : str
            Trading symbol (e.g., "BTCUSDT")
        start_date : str
            Start date "YYYY-MM-DD"
        end_date : str
            End date "YYYY-MM-DD"
        timestamp_col : str
            Name of the timestamp column

        Yields
        ------
        pl.LazyFrame
            Lazy frame for each month with available data

        Examples
        --------
        >>> storage = TickStorage()
        >>> for lf in storage.fetch_date_range("BTCUSDT", "2024-01-01", "2024-03-31"):
        ...     df = lf.collect()
        ...     print(f"Processing {len(df)} ticks")

        Notes
        -----
        - Only yields LazyFrames for months with cached data
        - Data is NOT automatically downloaded - use get_range_bars() first
        - Each LazyFrame can be collected independently for O(month) memory
        """
        start_dt = datetime.strptime(start_date, "%Y-%m-%d").replace(tzinfo=UTC)
        end_dt = datetime.strptime(end_date, "%Y-%m-%d").replace(tzinfo=UTC)

        current = start_dt.replace(day=1)
        while current <= end_dt:
            year = current.year
            month = current.month

            lf = self.fetch_month(symbol, year, month, timestamp_col=timestamp_col)

            # Only yield non-empty LazyFrames
            # Note: We can't easily check if LazyFrame is empty without collecting
            # So we yield all and let caller handle empty results
            parquet_path = self._get_parquet_path(symbol, f"{year}-{month:02d}")
            if parquet_path.exists():
                yield lf

            # Move to next month
            _december = 12
            if current.month == _december:
                current = current.replace(year=current.year + 1, month=1)
            else:
                current = current.replace(month=current.month + 1)
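A closing sketch of the lazy month iterator, reusing the storage instance and the single cached month from the write sketch above (filter values assumed):

import polars as pl

total = 0
for lf in storage.fetch_date_range("BTCUSDT", "2024-02-01", "2024-03-31"):
    # Each month stays lazy; the filter is pushed down before materializing.
    total += lf.filter(pl.col("price") > 43000.0).collect().height
print(total)  # 1, given the two-row frame written earlier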