rangebar-11.6.1-cp313-cp313-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. rangebar/CLAUDE.md +327 -0
  2. rangebar/__init__.py +227 -0
  3. rangebar/__init__.pyi +1089 -0
  4. rangebar/_core.cpython-313-darwin.so +0 -0
  5. rangebar/checkpoint.py +472 -0
  6. rangebar/cli.py +298 -0
  7. rangebar/clickhouse/CLAUDE.md +139 -0
  8. rangebar/clickhouse/__init__.py +100 -0
  9. rangebar/clickhouse/bulk_operations.py +309 -0
  10. rangebar/clickhouse/cache.py +734 -0
  11. rangebar/clickhouse/client.py +121 -0
  12. rangebar/clickhouse/config.py +141 -0
  13. rangebar/clickhouse/mixin.py +120 -0
  14. rangebar/clickhouse/preflight.py +504 -0
  15. rangebar/clickhouse/query_operations.py +345 -0
  16. rangebar/clickhouse/schema.sql +187 -0
  17. rangebar/clickhouse/tunnel.py +222 -0
  18. rangebar/constants.py +288 -0
  19. rangebar/conversion.py +177 -0
  20. rangebar/exceptions.py +207 -0
  21. rangebar/exness.py +364 -0
  22. rangebar/hooks.py +311 -0
  23. rangebar/logging.py +171 -0
  24. rangebar/notify/__init__.py +15 -0
  25. rangebar/notify/pushover.py +155 -0
  26. rangebar/notify/telegram.py +271 -0
  27. rangebar/orchestration/__init__.py +20 -0
  28. rangebar/orchestration/count_bounded.py +797 -0
  29. rangebar/orchestration/helpers.py +412 -0
  30. rangebar/orchestration/models.py +76 -0
  31. rangebar/orchestration/precompute.py +498 -0
  32. rangebar/orchestration/range_bars.py +736 -0
  33. rangebar/orchestration/tick_fetcher.py +226 -0
  34. rangebar/ouroboros.py +454 -0
  35. rangebar/processors/__init__.py +22 -0
  36. rangebar/processors/api.py +383 -0
  37. rangebar/processors/core.py +522 -0
  38. rangebar/resource_guard.py +567 -0
  39. rangebar/storage/__init__.py +22 -0
  40. rangebar/storage/checksum_registry.py +218 -0
  41. rangebar/storage/parquet.py +728 -0
  42. rangebar/streaming.py +300 -0
  43. rangebar/validation/__init__.py +69 -0
  44. rangebar/validation/cache_staleness.py +277 -0
  45. rangebar/validation/continuity.py +664 -0
  46. rangebar/validation/gap_classification.py +294 -0
  47. rangebar/validation/post_storage.py +317 -0
  48. rangebar/validation/tier1.py +175 -0
  49. rangebar/validation/tier2.py +261 -0
  50. rangebar-11.6.1.dist-info/METADATA +308 -0
  51. rangebar-11.6.1.dist-info/RECORD +54 -0
  52. rangebar-11.6.1.dist-info/WHEEL +4 -0
  53. rangebar-11.6.1.dist-info/entry_points.txt +2 -0
  54. rangebar-11.6.1.dist-info/licenses/LICENSE +21 -0
rangebar/storage/parquet.py
@@ -0,0 +1,728 @@
+"""Parquet-based tick storage with ZSTD-3 compression.
+
+This module replaces ClickHouse Tier 1 (raw trades cache) with local
+Parquet files for better portability and no server requirement.
+
+Compression choice (based on empirical benchmark 2026-01-07):
+- ZSTD-3: 6.50 MB for 761K trades (5.37x compression)
+- Write: 0.019s, Read: 0.006s
+- Beats Brotli on BOTH size AND speed for tick data
+"""
+
+from __future__ import annotations
+
+import os
+import shutil
+from collections.abc import Iterator
+from datetime import UTC, datetime
+from pathlib import Path
+from typing import TYPE_CHECKING
+
+import polars as pl
+from platformdirs import user_cache_dir
+
+if TYPE_CHECKING:
+    import pandas as pd
+
+# Constants
+COMPRESSION = "zstd"
+COMPRESSION_LEVEL = 3
+APP_NAME = "rangebar"
+APP_AUTHOR = "terrylica"
+
+
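
The compression numbers in the module docstring can be sanity-checked with a short polars benchmark. The following is an editorial sketch on synthetic tick-shaped data, not the original harness; the column names, row count, and scratch path are illustrative:

import os
import time

import numpy as np
import polars as pl

rng = np.random.default_rng(0)
n = 761_000  # same order of magnitude as the 761K-trade benchmark cited above
df = pl.DataFrame({
    "timestamp": np.arange(n, dtype=np.int64) * 50 + 1_704_067_200_000,
    "price": 42_000 + rng.standard_normal(n).cumsum(),
    "quantity": rng.exponential(0.1, n),
})

for codec, level in [("zstd", 3), ("brotli", 5)]:
    path = f"/tmp/ticks_{codec}.parquet"  # hypothetical scratch location
    t0 = time.perf_counter()
    df.write_parquet(path, compression=codec, compression_level=level)
    elapsed = time.perf_counter() - t0
    print(f"{codec}-{level}: {os.path.getsize(path) / 1e6:.2f} MB in {elapsed:.3f}s")
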
+def get_cache_dir() -> Path:
+    """Get the cross-platform cache directory for rangebar.
+
+    Returns
+    -------
+    Path
+        Platform-specific cache directory:
+        - macOS: ~/Library/Caches/rangebar/
+        - Linux: ~/.cache/rangebar/ (respects XDG_CACHE_HOME)
+        - Windows: %USERPROFILE%\\AppData\\Local\\terrylica\\rangebar\\Cache\\
+
+    Examples
+    --------
+    >>> from rangebar.storage import get_cache_dir
+    >>> cache_dir = get_cache_dir()
+    >>> print(cache_dir)
+    /Users/username/Library/Caches/rangebar
+    """
+    # Allow override via environment variable
+    env_override = os.getenv("RANGEBAR_CACHE_DIR")
+    if env_override:
+        return Path(env_override)
+
+    return Path(user_cache_dir(APP_NAME, APP_AUTHOR))
+
+
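
The RANGEBAR_CACHE_DIR override above is checked before platformdirs. A minimal sketch of using it (the path is hypothetical; the import mirrors the function's own docstring):

import os
from pathlib import Path

os.environ["RANGEBAR_CACHE_DIR"] = "/tmp/rangebar-cache"  # hypothetical override

from rangebar.storage import get_cache_dir

assert get_cache_dir() == Path("/tmp/rangebar-cache")
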
+class TickStorage:
+    """Parquet-based tick data storage with ZSTD-3 compression.
+
+    Stores raw tick data in Parquet files partitioned by symbol and month.
+    Uses polars for fast I/O with ZSTD-3 compression.
+
+    Parameters
+    ----------
+    cache_dir : Path | str | None
+        Custom cache directory. If None, uses platformdirs default.
+
+    Examples
+    --------
+    >>> storage = TickStorage()
+    >>> storage.write_ticks("BTCUSDT", trades_df)
+    >>> df = storage.read_ticks("BTCUSDT", start_ts, end_ts)
+
+    Directory Structure
+    -------------------
+    ~/.cache/rangebar/ticks/
+    ├── BTCUSDT/
+    │   ├── 2024-01.parquet
+    │   ├── 2024-02.parquet
+    │   └── ...
+    └── EURUSD/
+        └── ...
+    """
+
+    def __init__(self, cache_dir: Path | str | None = None) -> None:
+        """Initialize tick storage.
+
+        Parameters
+        ----------
+        cache_dir : Path | str | None
+            Custom cache directory. If None, uses platformdirs default.
+        """
+        if cache_dir is None:
+            self._cache_dir = get_cache_dir()
+        else:
+            self._cache_dir = Path(cache_dir)
+
+        self._ticks_dir = self._cache_dir / "ticks"
+
+    @property
+    def cache_dir(self) -> Path:
+        """Get the cache directory path."""
+        return self._cache_dir
+
+    @property
+    def ticks_dir(self) -> Path:
+        """Get the ticks storage directory path."""
+        return self._ticks_dir
+
+    def _get_symbol_dir(self, symbol: str) -> Path:
+        """Get directory for a symbol's tick files."""
+        return self._ticks_dir / symbol
+
+    def _get_parquet_path(self, symbol: str, year_month: str) -> Path:
+        """Get path for a specific month's parquet file.
+
+        Parameters
+        ----------
+        symbol : str
+            Trading symbol (e.g., "BTCUSDT")
+        year_month : str
+            Year-month string (e.g., "2024-01")
+
+        Returns
+        -------
+        Path
+            Path to the parquet file
+        """
+        return self._get_symbol_dir(symbol) / f"{year_month}.parquet"
+
+    def _timestamp_to_year_month(self, timestamp_ms: int) -> str:
+        """Convert millisecond timestamp to year-month string."""
+        dt = datetime.fromtimestamp(timestamp_ms / 1000, tz=UTC)
+        return dt.strftime("%Y-%m")
+
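
A worked instance of the month mapping these helpers implement: 2024-01-01 00:00:00 UTC is 1,704,067,200,000 ms since the epoch, so a tick at that instant is routed to <ticks_dir>/BTCUSDT/2024-01.parquet. Verifiable in isolation:

from datetime import UTC, datetime

ts_ms = 1_704_067_200_000  # 2024-01-01 00:00:00 UTC in milliseconds
assert datetime.fromtimestamp(ts_ms / 1000, tz=UTC).strftime("%Y-%m") == "2024-01"
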
+    def write_ticks(
+        self,
+        symbol: str,
+        ticks: pl.DataFrame | pd.DataFrame,
+        *,
+        timestamp_col: str = "timestamp",
+    ) -> int:
+        """Write tick data to Parquet files, partitioned by month.
+
+        Parameters
+        ----------
+        symbol : str
+            Trading symbol (e.g., "BTCUSDT")
+        ticks : pl.DataFrame | pd.DataFrame
+            Tick data with timestamp column
+        timestamp_col : str
+            Name of the timestamp column (milliseconds since epoch or datetime)
+
+        Returns
+        -------
+        int
+            Number of rows written
+
+        Notes
+        -----
+        Tick data is partitioned by month and appended to existing files.
+        Duplicates are not automatically removed - use ClickHouse for deduplication.
+        """
+        # Convert pandas to polars if needed
+        if not isinstance(ticks, pl.DataFrame):
+            ticks = pl.from_pandas(ticks)
+
+        if ticks.is_empty():
+            return 0
+
+        # Ensure symbol directory exists
+        symbol_dir = self._get_symbol_dir(symbol)
+        symbol_dir.mkdir(parents=True, exist_ok=True)
+
+        # Convert timestamp to milliseconds if datetime
+        if ticks.schema[timestamp_col] in (pl.Datetime, pl.Date):
+            ticks = ticks.with_columns(
+                pl.col(timestamp_col).dt.epoch(time_unit="ms").alias(timestamp_col)
+            )
+
+        # Add year_month column for partitioning (vectorized, no Python per-row calls)
+        # MEM-001: Replaced map_elements() with native Polars dt operations
+        # Impact: 13.4 GB → ~100 MB (99% reduction)
+        ticks = ticks.with_columns(
+            pl.col(timestamp_col)
+            .cast(pl.Datetime(time_unit="ms"))
+            .dt.strftime("%Y-%m")
+            .alias("_year_month")
+        )
+
+        # Group by month and write
+        total_rows = 0
+        for (year_month,), group_df in ticks.group_by("_year_month"):
+            parquet_path = self._get_parquet_path(symbol, year_month)
+
+            # Drop the partition column before writing
+            write_df = group_df.drop("_year_month")
+
+            if parquet_path.exists():
+                # Append to existing file
+                existing_df = pl.read_parquet(parquet_path)
+                combined_df = pl.concat([existing_df, write_df])
+                combined_df.write_parquet(
+                    parquet_path,
+                    compression=COMPRESSION,
+                    compression_level=COMPRESSION_LEVEL,
+                )
+            else:
+                # Write new file
+                write_df.write_parquet(
+                    parquet_path,
+                    compression=COMPRESSION,
+                    compression_level=COMPRESSION_LEVEL,
+                )
+
+            total_rows += len(write_df)
+
+        return total_rows
+
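
The MEM-001 comment inside write_ticks refers to replacing a per-row Python callback with native Polars expressions. A standalone sketch of the two forms (the map_elements variant is the anti-pattern said to have been removed; both yield identical keys):

from datetime import UTC, datetime

import polars as pl

df = pl.DataFrame({"timestamp": [1_704_067_200_000, 1_709_251_200_000]})  # Jan, Mar 2024

# Anti-pattern: one Python call per row, plus a boxed string per element.
slow = df.with_columns(
    pl.col("timestamp")
    .map_elements(
        lambda ms: datetime.fromtimestamp(ms / 1000, tz=UTC).strftime("%Y-%m"),
        return_dtype=pl.String,
    )
    .alias("_year_month")
)

# Vectorized form used above: the cast and strftime run in native code.
fast = df.with_columns(
    pl.col("timestamp")
    .cast(pl.Datetime(time_unit="ms"))
    .dt.strftime("%Y-%m")
    .alias("_year_month")
)

assert slow.equals(fast)
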
+    # Default Parquet compression ratio (compressed → in-memory expansion)
+    # Empirically measured: Binance aggTrades Parquet files expand ~4x
+    _COMPRESSION_RATIO: float = 4.0
+
+    def read_ticks(
+        self,
+        symbol: str,
+        start_ts: int | None = None,
+        end_ts: int | None = None,
+        *,
+        timestamp_col: str = "timestamp",
+        max_memory_mb: int | None = None,
+    ) -> pl.DataFrame:
+        """Read tick data from Parquet files.
+
+        Parameters
+        ----------
+        symbol : str
+            Trading symbol (e.g., "BTCUSDT")
+        start_ts : int | None
+            Start timestamp in milliseconds (inclusive)
+        end_ts : int | None
+            End timestamp in milliseconds (inclusive)
+        timestamp_col : str
+            Name of the timestamp column
+        max_memory_mb : int | None
+            Memory budget in MB. If the estimated in-memory size exceeds
+            this limit, raises MemoryError with a suggestion to use
+            read_ticks_streaming(). None disables the guard.
+
+        Returns
+        -------
+        pl.DataFrame
+            Tick data filtered by time range
+
+        Raises
+        ------
+        MemoryError
+            If estimated memory exceeds max_memory_mb budget.
+
+        Notes
+        -----
+        Reads all relevant monthly files and concatenates them.
+        Uses lazy evaluation for efficient memory usage.
+        """
+        symbol_dir = self._get_symbol_dir(symbol)
+
+        if not symbol_dir.exists():
+            return pl.DataFrame()
+
+        # Find relevant parquet files
+        parquet_files = sorted(symbol_dir.glob("*.parquet"))
+
+        if not parquet_files:
+            return pl.DataFrame()
+
+        # Filter files by month if time range specified
+        if start_ts is not None and end_ts is not None:
+            start_month = self._timestamp_to_year_month(start_ts)
+            end_month = self._timestamp_to_year_month(end_ts)
+
+            parquet_files = [
+                f for f in parquet_files if start_month <= f.stem <= end_month
+            ]
+
+            if not parquet_files:
+                return pl.DataFrame()
+
+        # MEM-004: Estimate size before materializing (Issue #49)
+        if max_memory_mb is not None:
+            total_bytes = sum(f.stat().st_size for f in parquet_files)
+            estimated_mb = int(
+                total_bytes * self._COMPRESSION_RATIO / (1024 * 1024)
+            )
+            if estimated_mb > max_memory_mb:
+                msg = (
+                    f"Estimated {estimated_mb} MB for {symbol} "
+                    f"({len(parquet_files)} files), exceeds budget "
+                    f"{max_memory_mb} MB. Use read_ticks_streaming() "
+                    f"for chunked loading."
+                )
+                raise MemoryError(msg)
+
+        # LAZY LOADING with predicate pushdown
+        # Uses pl.scan_parquet() instead of pl.read_parquet() to enable:
+        # 1. Predicate pushdown: filters applied at Parquet row-group level
+        # 2. Lazy evaluation: only filtered rows loaded into memory
+        # 3. 2x I/O speedup, 50% memory reduction for filtered queries
+        lazy_dfs = [pl.scan_parquet(f) for f in parquet_files]
+        result = pl.concat(lazy_dfs)
+
+        # Apply time range filter (pushed down to Parquet)
+        if start_ts is not None:
+            result = result.filter(pl.col(timestamp_col) >= start_ts)
+        if end_ts is not None:
+            result = result.filter(pl.col(timestamp_col) <= end_ts)
+
+        # Sort and materialize
+        return result.sort(timestamp_col).collect()
+
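
One plausible way to drive the max_memory_mb guard: attempt the eager read under a budget and fall back to the streaming reader when the estimate (on-disk bytes times the ~4x _COMPRESSION_RATIO above) trips it. Sketch only; the symbol, window, and handler are illustrative:

storage = TickStorage()
start_ts, end_ts = 1_704_067_200_000, 1_706_745_599_999  # January 2024, ms

try:
    df = storage.read_ticks("BTCUSDT", start_ts, end_ts, max_memory_mb=2048)
    handle(df)  # hypothetical per-result handler
except MemoryError:
    for chunk in storage.read_ticks_streaming("BTCUSDT", start_ts, end_ts):
        handle(chunk)
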
+    def read_ticks_streaming(  # noqa: PLR0912
+        self,
+        symbol: str,
+        start_ts: int | None = None,
+        end_ts: int | None = None,
+        *,
+        chunk_size: int = 100_000,
+        timestamp_col: str = "timestamp",
+    ) -> Iterator[pl.DataFrame]:
+        """Read tick data in streaming chunks to avoid OOM on large months.
+
+        This method yields chunks of tick data instead of loading everything
+        into memory at once. Essential for high-volume months like March 2024.
+
+        Parameters
+        ----------
+        symbol : str
+            Trading symbol (e.g., "BTCUSDT")
+        start_ts : int | None
+            Start timestamp in milliseconds (inclusive)
+        end_ts : int | None
+            End timestamp in milliseconds (inclusive)
+        chunk_size : int
+            Number of rows per chunk (default: 100,000)
+        timestamp_col : str
+            Name of the timestamp column
+
+        Yields
+        ------
+        pl.DataFrame
+            Chunks of tick data, sorted by timestamp within each chunk
+
+        Notes
+        -----
+        Memory usage is O(chunk_size) instead of O(total_ticks).
+        Each chunk is sorted independently; overall order is maintained
+        because parquet files are read in month order.
+
+        Examples
+        --------
+        >>> storage = TickStorage()
+        >>> for chunk in storage.read_ticks_streaming("BTCUSDT", start_ts, end_ts):
+        ...     process_chunk(chunk)
+        """
+        symbol_dir = self._get_symbol_dir(symbol)
+
+        if not symbol_dir.exists():
+            return
+
+        # Find relevant parquet files
+        parquet_files = sorted(symbol_dir.glob("*.parquet"))
+
+        if not parquet_files:
+            return
+
+        # Filter files by month if time range specified
+        if start_ts is not None and end_ts is not None:
+            start_month = self._timestamp_to_year_month(start_ts)
+            end_month = self._timestamp_to_year_month(end_ts)
+
+            parquet_files = [
+                f for f in parquet_files if start_month <= f.stem <= end_month
+            ]
+
+            if not parquet_files:
+                return
+
+        # Process each parquet file using PyArrow's row group-based reading
+        # Row groups are Parquet's native chunking mechanism (typically 64K-1M rows)
+        # This is the key to avoiding OOM - we never load the entire file into memory
+        import pyarrow.parquet as pq
+
+        for parquet_file in parquet_files:
+            # Read parquet file in row groups
+            parquet_reader = pq.ParquetFile(parquet_file)
+            num_row_groups = parquet_reader.metadata.num_row_groups
+
+            accumulated_rows: list[pl.DataFrame] = []
+            accumulated_count = 0
+
+            for rg_idx in range(num_row_groups):
+                # Read single row group into PyArrow table (memory efficient)
+                row_group = parquet_reader.read_row_group(rg_idx)
+                chunk_df = pl.from_arrow(row_group)
+
+                # Apply time range filter using Polars expressions
+                if start_ts is not None:
+                    chunk_df = chunk_df.filter(
+                        pl.col(timestamp_col) >= pl.lit(start_ts)
+                    )
+                if end_ts is not None:
+                    chunk_df = chunk_df.filter(pl.col(timestamp_col) <= pl.lit(end_ts))
+
+                if chunk_df.is_empty():
+                    continue
+
+                accumulated_rows.append(chunk_df)
+                accumulated_count += len(chunk_df)
+
+                # Yield when accumulated enough rows
+                while accumulated_count >= chunk_size:
+                    # Concatenate and slice
+                    combined = pl.concat(accumulated_rows)
+                    combined = combined.sort(timestamp_col)
+
+                    # Yield chunk_size rows
+                    yield combined.slice(0, chunk_size)
+
+                    # Keep remainder for next iteration
+                    remainder_count = accumulated_count - chunk_size
+                    if remainder_count > 0:
+                        remainder = combined.slice(chunk_size, remainder_count)
+                        accumulated_rows = [remainder]
+                        accumulated_count = len(remainder)
+                    else:
+                        accumulated_rows = []
+                        accumulated_count = 0
+
+                    del combined
+
+            # Yield any remaining rows
+            if accumulated_rows:
+                combined = pl.concat(accumulated_rows)
+                combined = combined.sort(timestamp_col)
+                if not combined.is_empty():
+                    yield combined
+                del combined
+
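
The O(chunk_size) guarantee above supports incremental aggregation. A sketch computing a volume-weighted average price across chunks (the "price" and "quantity" column names are assumptions; this module does not fix a schema beyond the timestamp column):

storage = TickStorage()
pv = qty = 0.0
for chunk in storage.read_ticks_streaming("BTCUSDT", chunk_size=250_000):
    pv += (chunk["price"] * chunk["quantity"]).sum()
    qty += chunk["quantity"].sum()
if qty:
    print(f"VWAP: {pv / qty:.2f}")
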
+    def has_ticks(
+        self,
+        symbol: str,
+        start_ts: int,
+        end_ts: int,
+        *,
+        min_coverage: float = 0.95,
+        timestamp_col: str = "timestamp",
+    ) -> bool:
+        """Check if tick data exists for the specified time range.
+
+        Parameters
+        ----------
+        symbol : str
+            Trading symbol
+        start_ts : int
+            Start timestamp in milliseconds
+        end_ts : int
+            End timestamp in milliseconds
+        min_coverage : float
+            Minimum coverage ratio (0.0 to 1.0)
+        timestamp_col : str
+            Name of the timestamp column
+
+        Returns
+        -------
+        bool
+            True if sufficient data exists
+        """
+        tick_data = self.read_ticks(
+            symbol, start_ts, end_ts, timestamp_col=timestamp_col
+        )
+
+        if tick_data.is_empty():
+            return False
+
+        actual_start = tick_data[timestamp_col].min()
+        actual_end = tick_data[timestamp_col].max()
+
+        if actual_start is None or actual_end is None:
+            return False
+
+        actual_range = actual_end - actual_start
+        requested_range = end_ts - start_ts
+
+        if requested_range == 0:
+            return len(tick_data) > 0
+
+        coverage = actual_range / requested_range
+        return coverage >= min_coverage
+
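
A worked instance of the coverage check: requesting all of January 2024 when the cached data starts two hours late still passes the default 0.95 threshold.

requested = 1_706_745_599_999 - 1_704_067_200_000  # full January 2024, ms
actual = 1_706_745_599_999 - 1_704_074_400_000     # first cached tick 2h late
assert actual / requested >= 0.95                  # ≈ 0.9973, so has_ticks() is True
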
+    def list_symbols(self) -> list[str]:
+        """List all symbols with stored tick data.
+
+        Returns
+        -------
+        list[str]
+            List of symbol names
+        """
+        if not self._ticks_dir.exists():
+            return []
+
+        return sorted(
+            d.name for d in self._ticks_dir.iterdir() if d.is_dir() and d.name != ""
+        )
+
+    def list_months(self, symbol: str) -> list[str]:
+        """List all months with stored tick data for a symbol.
+
+        Parameters
+        ----------
+        symbol : str
+            Trading symbol
+
+        Returns
+        -------
+        list[str]
+            List of year-month strings (e.g., ["2024-01", "2024-02"])
+        """
+        symbol_dir = self._get_symbol_dir(symbol)
+
+        if not symbol_dir.exists():
+            return []
+
+        return sorted(f.stem for f in symbol_dir.glob("*.parquet"))
+
+    def delete_ticks(self, symbol: str, year_month: str | None = None) -> bool:
+        """Delete tick data for a symbol or specific month.
+
+        Parameters
+        ----------
+        symbol : str
+            Trading symbol
+        year_month : str | None
+            Specific month to delete (e.g., "2024-01"), or None for all
+
+        Returns
+        -------
+        bool
+            True if files were deleted
+        """
+        if year_month is not None:
+            # Delete specific month
+            parquet_path = self._get_parquet_path(symbol, year_month)
+            if parquet_path.exists():
+                parquet_path.unlink()
+                return True
+            return False
+
+        # Delete all data for symbol
+        symbol_dir = self._get_symbol_dir(symbol)
+        if symbol_dir.exists():
+            shutil.rmtree(symbol_dir)
+            return True
+        return False
+
+    def get_stats(self, symbol: str) -> dict:
+        """Get storage statistics for a symbol.
+
+        Parameters
+        ----------
+        symbol : str
+            Trading symbol
+
+        Returns
+        -------
+        dict
+            Statistics including file count, total size, row count, date range
+        """
+        symbol_dir = self._get_symbol_dir(symbol)
+
+        if not symbol_dir.exists():
+            return {
+                "symbol": symbol,
+                "exists": False,
+                "file_count": 0,
+                "total_size_bytes": 0,
+                "total_rows": 0,
+            }
+
+        parquet_files = list(symbol_dir.glob("*.parquet"))
+        total_size = sum(f.stat().st_size for f in parquet_files)
+        total_rows = 0
+        months = []
+
+        for f in parquet_files:
+            file_data = pl.read_parquet(f)
+            total_rows += len(file_data)
+            months.append(f.stem)
+
+        return {
+            "symbol": symbol,
+            "exists": True,
+            "file_count": len(parquet_files),
+            "total_size_bytes": total_size,
+            "total_size_mb": total_size / 1024 / 1024,
+            "total_rows": total_rows,
+            "months": sorted(months),
+            "compression": f"{COMPRESSION}-{COMPRESSION_LEVEL}",
+        }
+
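
get_stats materializes every file just to count rows. Where that is too costly, the Parquet footer already stores the count; a sketch of a metadata-only alternative (an editorial suggestion, not what the package ships):

import pyarrow.parquet as pq

storage = TickStorage()
symbol_dir = storage.ticks_dir / "BTCUSDT"
# Row counts come from file metadata, so no data pages are decoded.
total_rows = sum(
    pq.ParquetFile(f).metadata.num_rows for f in symbol_dir.glob("*.parquet")
)
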
+    def fetch_month(
+        self,
+        symbol: str,
+        year: int,
+        month: int,
+        *,
+        timestamp_col: str = "timestamp",  # noqa: ARG002 - reserved for filtering
+        force_refresh: bool = False,
+    ) -> pl.LazyFrame:
+        """Fetch tick data for a specific month (lazy loading).
+
+        Returns a LazyFrame for memory efficiency. If data is not cached,
+        this method does NOT automatically download from source - use
+        `get_range_bars()` or manual fetching first.
+
+        Parameters
+        ----------
+        symbol : str
+            Trading symbol (e.g., "BTCUSDT" or "BINANCE_SPOT_BTCUSDT")
+        year : int
+            Year (e.g., 2024)
+        month : int
+            Month (1-12)
+        timestamp_col : str
+            Name of the timestamp column
+        force_refresh : bool
+            If True, skip cache and return empty LazyFrame (caller must fetch)
+
+        Returns
+        -------
+        pl.LazyFrame
+            Lazy frame for the month's tick data, or empty LazyFrame if not cached
+
+        Examples
+        --------
+        >>> storage = TickStorage()
+        >>> lf = storage.fetch_month("BTCUSDT", 2024, 1)
+        >>> df = lf.collect()  # Materialize when needed
+        """
+        year_month = f"{year}-{month:02d}"
+
+        if force_refresh:
+            return pl.LazyFrame()
+
+        parquet_path = self._get_parquet_path(symbol, year_month)
+
+        if not parquet_path.exists():
+            return pl.LazyFrame()
+
+        return pl.scan_parquet(parquet_path)
+
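
Because fetch_month returns a LazyFrame backed by pl.scan_parquet, callers can push filters down before materializing. A brief sketch restricting to the first day of the month (timestamps in ms are illustrative):

storage = TickStorage()
lf = storage.fetch_month("BTCUSDT", 2024, 1)
day_one = lf.filter(
    pl.col("timestamp") < 1_704_067_200_000 + 86_400_000  # Jan 1 + 24h, ms
).collect()
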
+    def fetch_date_range(
+        self,
+        symbol: str,
+        start_date: str,
+        end_date: str,
+        *,
+        timestamp_col: str = "timestamp",
+    ) -> Iterator[pl.LazyFrame]:
+        """Iterate over tick data for date range, one month at a time.
+
+        Yields LazyFrames for each month in range. This is the recommended
+        approach for processing large date ranges without loading all data
+        into memory at once.
+
+        Parameters
+        ----------
+        symbol : str
+            Trading symbol (e.g., "BTCUSDT")
+        start_date : str
+            Start date "YYYY-MM-DD"
+        end_date : str
+            End date "YYYY-MM-DD"
+        timestamp_col : str
+            Name of the timestamp column
+
+        Yields
+        ------
+        pl.LazyFrame
+            Lazy frame for each month with available data
+
+        Examples
+        --------
+        >>> storage = TickStorage()
+        >>> for lf in storage.fetch_date_range("BTCUSDT", "2024-01-01", "2024-03-31"):
+        ...     df = lf.collect()
+        ...     print(f"Processing {len(df)} ticks")
+
+        Notes
+        -----
+        - Only yields LazyFrames for months with cached data
+        - Data is NOT automatically downloaded - use get_range_bars() first
+        - Each LazyFrame can be collected independently for O(month) memory
+        """
+        start_dt = datetime.strptime(start_date, "%Y-%m-%d").replace(tzinfo=UTC)
+        end_dt = datetime.strptime(end_date, "%Y-%m-%d").replace(tzinfo=UTC)
+
+        current = start_dt.replace(day=1)
+        while current <= end_dt:
+            year = current.year
+            month = current.month
+
+            lf = self.fetch_month(symbol, year, month, timestamp_col=timestamp_col)
+
+            # Only yield non-empty LazyFrames
+            # Note: We can't easily check if LazyFrame is empty without collecting
+            # So we yield all and let caller handle empty results
+            parquet_path = self._get_parquet_path(symbol, f"{year}-{month:02d}")
+            if parquet_path.exists():
+                yield lf
+
+            # Move to next month
+            _december = 12
+            if current.month == _december:
+                current = current.replace(year=current.year + 1, month=1)
+            else:
+                current = current.replace(month=current.month + 1)
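
Closing sketch of the O(month) pattern the docstring describes: counting rows per month without ever holding more than one month's scan result in memory.

import polars as pl

storage = TickStorage()
for lf in storage.fetch_date_range("BTCUSDT", "2024-01-01", "2024-03-31"):
    # pl.len() resolves inside each month's scan, so only counts are collected.
    n = lf.select(pl.len()).collect().item()
    print(f"{n} ticks in this month's file")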