rangebar-11.6.1-cp313-cp313-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. rangebar/CLAUDE.md +327 -0
  2. rangebar/__init__.py +227 -0
  3. rangebar/__init__.pyi +1089 -0
  4. rangebar/_core.cpython-313-darwin.so +0 -0
  5. rangebar/checkpoint.py +472 -0
  6. rangebar/cli.py +298 -0
  7. rangebar/clickhouse/CLAUDE.md +139 -0
  8. rangebar/clickhouse/__init__.py +100 -0
  9. rangebar/clickhouse/bulk_operations.py +309 -0
  10. rangebar/clickhouse/cache.py +734 -0
  11. rangebar/clickhouse/client.py +121 -0
  12. rangebar/clickhouse/config.py +141 -0
  13. rangebar/clickhouse/mixin.py +120 -0
  14. rangebar/clickhouse/preflight.py +504 -0
  15. rangebar/clickhouse/query_operations.py +345 -0
  16. rangebar/clickhouse/schema.sql +187 -0
  17. rangebar/clickhouse/tunnel.py +222 -0
  18. rangebar/constants.py +288 -0
  19. rangebar/conversion.py +177 -0
  20. rangebar/exceptions.py +207 -0
  21. rangebar/exness.py +364 -0
  22. rangebar/hooks.py +311 -0
  23. rangebar/logging.py +171 -0
  24. rangebar/notify/__init__.py +15 -0
  25. rangebar/notify/pushover.py +155 -0
  26. rangebar/notify/telegram.py +271 -0
  27. rangebar/orchestration/__init__.py +20 -0
  28. rangebar/orchestration/count_bounded.py +797 -0
  29. rangebar/orchestration/helpers.py +412 -0
  30. rangebar/orchestration/models.py +76 -0
  31. rangebar/orchestration/precompute.py +498 -0
  32. rangebar/orchestration/range_bars.py +736 -0
  33. rangebar/orchestration/tick_fetcher.py +226 -0
  34. rangebar/ouroboros.py +454 -0
  35. rangebar/processors/__init__.py +22 -0
  36. rangebar/processors/api.py +383 -0
  37. rangebar/processors/core.py +522 -0
  38. rangebar/resource_guard.py +567 -0
  39. rangebar/storage/__init__.py +22 -0
  40. rangebar/storage/checksum_registry.py +218 -0
  41. rangebar/storage/parquet.py +728 -0
  42. rangebar/streaming.py +300 -0
  43. rangebar/validation/__init__.py +69 -0
  44. rangebar/validation/cache_staleness.py +277 -0
  45. rangebar/validation/continuity.py +664 -0
  46. rangebar/validation/gap_classification.py +294 -0
  47. rangebar/validation/post_storage.py +317 -0
  48. rangebar/validation/tier1.py +175 -0
  49. rangebar/validation/tier2.py +261 -0
  50. rangebar-11.6.1.dist-info/METADATA +308 -0
  51. rangebar-11.6.1.dist-info/RECORD +54 -0
  52. rangebar-11.6.1.dist-info/WHEEL +4 -0
  53. rangebar-11.6.1.dist-info/entry_points.txt +2 -0
  54. rangebar-11.6.1.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,383 @@
+ # polars-exception: backtesting.py requires Pandas DataFrames with DatetimeIndex
+ # Issue #46: Modularization M3 - Extract process_trades_* functions from __init__.py
+ """Convenience functions for processing trades into range bars.
+
+ Provides multiple entry points for different input formats (pandas, Polars,
+ iterators) with automatic DataFrame conversion.
+ """
+
+ from __future__ import annotations
+
+ from collections.abc import Iterator
+ from typing import TYPE_CHECKING
+
+ import pandas as pd
+
+ from .core import RangeBarProcessor
+
+ if TYPE_CHECKING:
+     import polars as pl
+
+     from rangebar.clickhouse import RangeBarCache
+
+
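+ # Annotation (editor's overview, not package code): the module's entry
+ # points below, keyed by input type:
+ #   list[dict] or pd.DataFrame       -> process_trades_to_dataframe
+ #   same, with ClickHouse caching    -> process_trades_to_dataframe_cached
+ #   Iterator[dict] (streaming)       -> process_trades_chunked
+ #   pl.DataFrame or pl.LazyFrame     -> process_trades_polars
+
+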
+ def process_trades_to_dataframe(
+     trades: list[dict[str, int | float]] | pd.DataFrame,
+     threshold_decimal_bps: int = 250,
+ ) -> pd.DataFrame:
+     """Convenience function to process trades directly to DataFrame.
+
+     This is the recommended high-level API for most users. Handles both
+     list-of-dicts and pandas DataFrame inputs.
+
+     Parameters
+     ----------
+     trades : List[Dict] or pd.DataFrame
+         Trade data with columns/keys:
+         - timestamp: int (milliseconds) or datetime
+         - price: float
+         - quantity: float (or 'volume')
+     threshold_decimal_bps : int, default=250
+         Threshold in decimal basis points (250 = 25bps = 0.25%)
+
+     Returns
+     -------
+     pd.DataFrame
+         OHLCV DataFrame ready for backtesting.py, with:
+         - DatetimeIndex (timestamp)
+         - Capitalized columns: Open, High, Low, Close, Volume
+
+     Raises
+     ------
+     ValueError
+         If required columns are missing or threshold is invalid
+     RuntimeError
+         If trades are not sorted chronologically
+
+     Examples
+     --------
+     With list of dicts:
+
+     >>> from rangebar import process_trades_to_dataframe
+     >>> trades = [
+     ...     {"timestamp": 1704067200000, "price": 42000.0, "quantity": 1.5},
+     ...     {"timestamp": 1704067210000, "price": 42105.0, "quantity": 2.3},
+     ... ]
+     >>> df = process_trades_to_dataframe(trades, threshold_decimal_bps=250)
+
+     With pandas DataFrame:
+
+     >>> import pandas as pd
+     >>> trades_df = pd.DataFrame({
+     ...     "timestamp": pd.date_range("2024-01-01", periods=100, freq="min"),
+     ...     "price": [42000.0 + i for i in range(100)],
+     ...     "quantity": [1.5] * 100,
+     ... })
+     >>> df = process_trades_to_dataframe(trades_df, threshold_decimal_bps=250)
+
+     With Binance CSV:
+
+     >>> trades_csv = pd.read_csv("BTCUSDT-aggTrades-2024-01.csv")
+     >>> df = process_trades_to_dataframe(trades_csv, threshold_decimal_bps=250)
+     >>> # Use with backtesting.py
+     >>> from backtesting import Backtest
+     >>> bt = Backtest(df, MyStrategy, cash=10000)
+     >>> stats = bt.run()
+     """
+     processor = RangeBarProcessor(threshold_decimal_bps)
+
+     # Convert DataFrame to list of dicts if needed
+     if isinstance(trades, pd.DataFrame):
+         # Support both 'quantity' and 'volume' column names
+         volume_col = "quantity" if "quantity" in trades.columns else "volume"
+
+         required = {"timestamp", "price", volume_col}
+         missing = required - set(trades.columns)
+         if missing:
+             msg = (
+                 f"DataFrame missing required columns: {missing}. "
+                 "Required: timestamp, price, quantity (or volume)"
+             )
+             raise ValueError(msg)
+
+         # Convert timestamp to milliseconds if it's datetime
+         trades_copy = trades.copy()
+         if pd.api.types.is_datetime64_any_dtype(trades_copy["timestamp"]):
+             # Convert datetime to milliseconds since epoch
+             trades_copy["timestamp"] = trades_copy["timestamp"].astype("int64") // 10**6
+
+         # Normalize column name to 'quantity'
+         if volume_col == "volume":
+             trades_copy = trades_copy.rename(columns={"volume": "quantity"})
+
+         # Convert to list of dicts
+         trades_list = trades_copy[["timestamp", "price", "quantity"]].to_dict("records")
+     else:
+         trades_list = trades
+
+     # Process through Rust layer
+     bars = processor.process_trades(trades_list)
+
+     # Convert to DataFrame
+     return processor.to_dataframe(bars)
+
+
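+ # Annotation (editor's sketch, not package code): the arithmetic implied by
+ # threshold_decimal_bps, assuming a bar closes once price moves the full
+ # threshold away from its open (consistent with the 42000.0 -> 42105.0
+ # doctest above); the actual breach check lives in the Rust core (_core).
+ #
+ #     threshold_decimal_bps = 250                     # 25 bps = 0.25%
+ #     threshold_frac = threshold_decimal_bps / 100_000
+ #     open_price = 42_000.0
+ #     upper = open_price * (1 + threshold_frac)       # 42_105.0
+ #     lower = open_price * (1 - threshold_frac)       # 41_895.0
+ #     # a trade at or beyond upper/lower closes the current bar
+
+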
+ def process_trades_to_dataframe_cached(
+     trades: list[dict[str, int | float]] | pd.DataFrame,
+     symbol: str,
+     threshold_decimal_bps: int = 250,
+     cache: RangeBarCache | None = None,
+ ) -> pd.DataFrame:
+     """Process trades to DataFrame with two-tier ClickHouse caching.
+
+     This function provides cached processing of trades into range bars.
+     It uses a two-tier cache:
+     - Tier 1: Raw trades (avoid re-downloading)
+     - Tier 2: Computed range bars (avoid re-computing)
+
+     Parameters
+     ----------
+     trades : List[Dict] or pd.DataFrame
+         Trade data with columns/keys:
+         - timestamp: int (milliseconds) or datetime
+         - price: float
+         - quantity: float (or 'volume')
+     symbol : str
+         Trading symbol (e.g., "BTCUSDT"). Used as cache key.
+     threshold_decimal_bps : int, default=250
+         Threshold in decimal basis points (250 = 25bps = 0.25%)
+     cache : RangeBarCache | None
+         External cache instance. If None, creates one (preflight runs).
+
+     Returns
+     -------
+     pd.DataFrame
+         OHLCV DataFrame ready for backtesting.py
+
+     Raises
+     ------
+     ClickHouseNotConfiguredError
+         If no ClickHouse hosts available (with setup guidance)
+     ValueError
+         If required columns are missing or threshold is invalid
+     RuntimeError
+         If trades are not sorted chronologically
+
+     Examples
+     --------
+     >>> from rangebar import process_trades_to_dataframe_cached
+     >>> import pandas as pd
+     >>>
+     >>> trades = pd.read_csv("BTCUSDT-aggTrades-2024-01.csv")
+     >>> df = process_trades_to_dataframe_cached(trades, symbol="BTCUSDT")
+     >>>
+     >>> # Second call uses cache (fast)
+     >>> df2 = process_trades_to_dataframe_cached(trades, symbol="BTCUSDT")
+     """
+     # Import cache components (lazy import)
+     from rangebar.clickhouse import CacheKey
+     from rangebar.clickhouse import RangeBarCache as _RangeBarCache
+
+     # Convert trades to DataFrame if needed for timestamp extraction
+     trades_df = pd.DataFrame(trades) if isinstance(trades, list) else trades
+
+     # Get timestamp range
+     if "timestamp" in trades_df.columns:
+         ts_col = trades_df["timestamp"]
+         if pd.api.types.is_datetime64_any_dtype(ts_col):
+             start_ts = int(ts_col.min().timestamp() * 1000)
+             end_ts = int(ts_col.max().timestamp() * 1000)
+         else:
+             start_ts = int(ts_col.min())
+             end_ts = int(ts_col.max())
+     else:
+         msg = "DataFrame missing 'timestamp' column"
+         raise ValueError(msg)
+
+     # Create cache key
+     key = CacheKey(
+         symbol=symbol,
+         threshold_decimal_bps=threshold_decimal_bps,
+         start_ts=start_ts,
+         end_ts=end_ts,
+     )
+
+     # Use provided cache or create new one
+     _cache = cache if cache is not None else _RangeBarCache()
+     owns_cache = cache is None
+
+     try:
+         # Check Tier 2 cache (computed range bars)
+         if _cache.has_range_bars(key):
+             cached_bars = _cache.get_range_bars(key)
+             if cached_bars is not None:
+                 return cached_bars
+
+         # Compute using core API
+         result = process_trades_to_dataframe(trades, threshold_decimal_bps)
+
+         # Store in Tier 2 cache
+         if not result.empty:
+             _cache.store_range_bars(key, result)
+
+         return result
+
+     finally:
+         if owns_cache:
+             _cache.close()
+
+
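+ # Annotation (editor's sketch, not package code): reusing one cache across
+ # calls avoids the per-call preflight that runs when cache=None. Only the
+ # API documented above is used; connection settings are assumed to come
+ # from the environment, and btc_trades/eth_trades are placeholder inputs.
+ #
+ #     from rangebar.clickhouse import RangeBarCache
+ #
+ #     cache = RangeBarCache()
+ #     try:
+ #         btc = process_trades_to_dataframe_cached(btc_trades, symbol="BTCUSDT", cache=cache)
+ #         eth = process_trades_to_dataframe_cached(eth_trades, symbol="ETHUSDT", cache=cache)
+ #     finally:
+ #         cache.close()
+
+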
+ def process_trades_chunked(
+     trades_iterator: Iterator[dict[str, int | float]],
+     threshold_decimal_bps: int = 250,
+     chunk_size: int = 100_000,
+ ) -> Iterator[pd.DataFrame]:
+     """Process trades in chunks to avoid memory spikes.
+
+     This function enables streaming processing of large datasets without
+     loading all trades into memory at once.
+
+     Parameters
+     ----------
+     trades_iterator : Iterator[Dict]
+         Iterator yielding trade dictionaries with keys:
+         timestamp, price, quantity (or volume)
+     threshold_decimal_bps : int, default=250
+         Threshold in decimal basis points (250 = 25bps = 0.25%)
+     chunk_size : int, default=100_000
+         Number of trades per chunk
+
+     Yields
+     ------
+     pd.DataFrame
+         OHLCV bars for each chunk. Note: partial bars may occur at
+         chunk boundaries.
+
+     Examples
+     --------
+     Process large Parquet file without OOM:
+
+     >>> import polars as pl
+     >>> from rangebar import process_trades_chunked
+     >>> lazy_df = pl.scan_parquet("large_trades.parquet")
+     >>> for chunk_df in lazy_df.collect().iter_slices(100_000):
+     ...     trades = chunk_df.to_dicts()
+     ...     for bars_df in process_trades_chunked(iter(trades)):
+     ...         print(f"Got {len(bars_df)} bars")
+
+     Notes
+     -----
+     Memory usage: O(chunk_size) instead of O(total_trades)
+     For datasets >10M trades, use chunk_size=50_000 for safety.
+     """
+     from itertools import islice
+
+     processor = RangeBarProcessor(threshold_decimal_bps)
+
+     while True:
+         chunk = list(islice(trades_iterator, chunk_size))
+         if not chunk:
+             break
+
+         bars = processor.process_trades(chunk)
+         if bars:
+             yield processor.to_dataframe(bars)
+
+
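+ # Annotation (editor's sketch, not package code): feeding the chunked API
+ # from a CSV without loading it fully. Assumes a hypothetical file with a
+ # header row of timestamp,price,quantity and integer millisecond timestamps.
+ #
+ #     import csv
+ #
+ #     def iter_trades(path):
+ #         with open(path, newline="") as f:
+ #             for row in csv.DictReader(f):
+ #                 yield {
+ #                     "timestamp": int(row["timestamp"]),
+ #                     "price": float(row["price"]),
+ #                     "quantity": float(row["quantity"]),
+ #                 }
+ #
+ #     for bars_df in process_trades_chunked(iter_trades("trades.csv"), chunk_size=50_000):
+ #         print(f"Got {len(bars_df)} bars")
+
+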
+ def process_trades_polars(
+     trades: pl.DataFrame | pl.LazyFrame,
+     threshold_decimal_bps: int = 250,
+ ) -> pd.DataFrame:
+     """Process trades from Polars DataFrame (optimized pipeline).
+
+     This is the recommended API for Polars users. Uses lazy evaluation
+     and minimal dict conversion for best performance.
+
+     Parameters
+     ----------
+     trades : polars.DataFrame or polars.LazyFrame
+         Trade data with columns:
+         - timestamp: int64 (milliseconds since epoch)
+         - price: float
+         - quantity (or volume): float
+     threshold_decimal_bps : int, default=250
+         Threshold in decimal basis points (250 = 25bps = 0.25%)
+
+     Returns
+     -------
+     pd.DataFrame
+         OHLCV DataFrame ready for backtesting.py, with:
+         - DatetimeIndex (timestamp)
+         - Capitalized columns: Open, High, Low, Close, Volume
+
+     Examples
+     --------
+     With LazyFrame (predicate pushdown):
+
+     >>> import polars as pl
+     >>> from rangebar import process_trades_polars
+     >>> lazy_df = pl.scan_parquet("trades.parquet")
+     >>> lazy_filtered = lazy_df.filter(
+     ...     pl.col("timestamp") >= 1704067200000
+     ... )
+     >>> df = process_trades_polars(
+     ...     lazy_filtered, threshold_decimal_bps=250
+     ... )
+
+     With DataFrame:
+
+     >>> df = pl.read_parquet("trades.parquet")
+     >>> bars = process_trades_polars(df)
+
+     Notes
+     -----
+     Performance optimization:
+     - Only required columns are extracted (timestamp, price, quantity)
+     - Lazy evaluation: predicates pushed to I/O layer
+     - 2-3x faster than process_trades_to_dataframe() for Polars inputs
+     """
+     import polars as pl
+
+     # MEM-003: Apply column selection BEFORE collecting LazyFrame
+     # This enables predicate pushdown and avoids materializing unused columns
+     # Memory impact: 10-100x reduction depending on filter selectivity
+
+     # Determine volume column name (works for both DataFrame and LazyFrame)
+     if isinstance(trades, pl.LazyFrame):
+         available_cols = trades.collect_schema().names()
+     else:
+         available_cols = trades.columns
+
+     volume_col = "quantity" if "quantity" in available_cols else "volume"
+
+     # Build column list - include is_buyer_maker for microstructure features (Issue #30)
+     columns = [
+         pl.col("timestamp"),
+         pl.col("price"),
+         pl.col(volume_col).alias("quantity"),
+     ]
+     if "is_buyer_maker" in available_cols:
+         columns.append(pl.col("is_buyer_maker"))
+
+     # Apply selection (predicates pushed down for LazyFrame)
+     trades_selected = trades.select(columns)
+
+     # Collect AFTER selection (for LazyFrame)
+     if isinstance(trades_selected, pl.LazyFrame):
+         trades_minimal = trades_selected.collect()
+     else:
+         trades_minimal = trades_selected
+
+     # MEM-002: Process in chunks to bound memory (2.5 GB → ~50 MB per chunk)
+     # Chunked .to_dicts() avoids materializing 1M+ trade dicts at once
+     chunk_size = 100_000
+     processor = RangeBarProcessor(threshold_decimal_bps)
+     all_bars: list[dict] = []
+
+     n_rows = len(trades_minimal)
+     for start in range(0, n_rows, chunk_size):
+         chunk = trades_minimal.slice(start, chunk_size).to_dicts()
+         bars = processor.process_trades_streaming(chunk)
+         all_bars.extend(bars)
+
+     return processor.to_dataframe(all_bars)
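+
+
+ # Annotation (editor's sketch, not package code): end-to-end lazy pipeline
+ # combining the docstring examples above. Assumes backtesting.py is
+ # installed and MyStrategy is a user-defined Strategy subclass.
+ #
+ #     import polars as pl
+ #     from backtesting import Backtest
+ #
+ #     bars = process_trades_polars(
+ #         pl.scan_parquet("trades.parquet").filter(
+ #             pl.col("timestamp") >= 1704067200000
+ #         ),
+ #         threshold_decimal_bps=250,
+ #     )
+ #     stats = Backtest(bars, MyStrategy, cash=10_000).run()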