rangebar-11.6.1-cp313-cp313-macosx_11_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rangebar/CLAUDE.md +327 -0
- rangebar/__init__.py +227 -0
- rangebar/__init__.pyi +1089 -0
- rangebar/_core.cpython-313-darwin.so +0 -0
- rangebar/checkpoint.py +472 -0
- rangebar/cli.py +298 -0
- rangebar/clickhouse/CLAUDE.md +139 -0
- rangebar/clickhouse/__init__.py +100 -0
- rangebar/clickhouse/bulk_operations.py +309 -0
- rangebar/clickhouse/cache.py +734 -0
- rangebar/clickhouse/client.py +121 -0
- rangebar/clickhouse/config.py +141 -0
- rangebar/clickhouse/mixin.py +120 -0
- rangebar/clickhouse/preflight.py +504 -0
- rangebar/clickhouse/query_operations.py +345 -0
- rangebar/clickhouse/schema.sql +187 -0
- rangebar/clickhouse/tunnel.py +222 -0
- rangebar/constants.py +288 -0
- rangebar/conversion.py +177 -0
- rangebar/exceptions.py +207 -0
- rangebar/exness.py +364 -0
- rangebar/hooks.py +311 -0
- rangebar/logging.py +171 -0
- rangebar/notify/__init__.py +15 -0
- rangebar/notify/pushover.py +155 -0
- rangebar/notify/telegram.py +271 -0
- rangebar/orchestration/__init__.py +20 -0
- rangebar/orchestration/count_bounded.py +797 -0
- rangebar/orchestration/helpers.py +412 -0
- rangebar/orchestration/models.py +76 -0
- rangebar/orchestration/precompute.py +498 -0
- rangebar/orchestration/range_bars.py +736 -0
- rangebar/orchestration/tick_fetcher.py +226 -0
- rangebar/ouroboros.py +454 -0
- rangebar/processors/__init__.py +22 -0
- rangebar/processors/api.py +383 -0
- rangebar/processors/core.py +522 -0
- rangebar/resource_guard.py +567 -0
- rangebar/storage/__init__.py +22 -0
- rangebar/storage/checksum_registry.py +218 -0
- rangebar/storage/parquet.py +728 -0
- rangebar/streaming.py +300 -0
- rangebar/validation/__init__.py +69 -0
- rangebar/validation/cache_staleness.py +277 -0
- rangebar/validation/continuity.py +664 -0
- rangebar/validation/gap_classification.py +294 -0
- rangebar/validation/post_storage.py +317 -0
- rangebar/validation/tier1.py +175 -0
- rangebar/validation/tier2.py +261 -0
- rangebar-11.6.1.dist-info/METADATA +308 -0
- rangebar-11.6.1.dist-info/RECORD +54 -0
- rangebar-11.6.1.dist-info/WHEEL +4 -0
- rangebar-11.6.1.dist-info/entry_points.txt +2 -0
- rangebar-11.6.1.dist-info/licenses/LICENSE +21 -0
rangebar/processors/api.py
@@ -0,0 +1,383 @@
+# polars-exception: backtesting.py requires Pandas DataFrames with DatetimeIndex
+# Issue #46: Modularization M3 - Extract process_trades_* functions from __init__.py
+"""Convenience functions for processing trades into range bars.
+
+Provides multiple entry points for different input formats (pandas, Polars,
+iterators) with automatic DataFrame conversion.
+"""
+
+from __future__ import annotations
+
+from collections.abc import Iterator
+from typing import TYPE_CHECKING
+
+import pandas as pd
+
+from .core import RangeBarProcessor
+
+if TYPE_CHECKING:
+    import polars as pl
+
+    from rangebar.clickhouse import RangeBarCache
+
+
+def process_trades_to_dataframe(
+    trades: list[dict[str, int | float]] | pd.DataFrame,
+    threshold_decimal_bps: int = 250,
+) -> pd.DataFrame:
+    """Convenience function to process trades directly to DataFrame.
+
+    This is the recommended high-level API for most users. Handles both
+    list-of-dicts and pandas DataFrame inputs.
+
+    Parameters
+    ----------
+    trades : List[Dict] or pd.DataFrame
+        Trade data with columns/keys:
+        - timestamp: int (milliseconds) or datetime
+        - price: float
+        - quantity: float (or 'volume')
+    threshold_decimal_bps : int, default=250
+        Threshold in decimal basis points (250 = 25bps = 0.25%)
+
+    Returns
+    -------
+    pd.DataFrame
+        OHLCV DataFrame ready for backtesting.py, with:
+        - DatetimeIndex (timestamp)
+        - Capitalized columns: Open, High, Low, Close, Volume
+
+    Raises
+    ------
+    ValueError
+        If required columns are missing or threshold is invalid
+    RuntimeError
+        If trades are not sorted chronologically
+
+    Examples
+    --------
+    With list of dicts:
+
+    >>> from rangebar import process_trades_to_dataframe
+    >>> trades = [
+    ...     {"timestamp": 1704067200000, "price": 42000.0, "quantity": 1.5},
+    ...     {"timestamp": 1704067210000, "price": 42105.0, "quantity": 2.3},
+    ... ]
+    >>> df = process_trades_to_dataframe(trades, threshold_decimal_bps=250)
+
+    With pandas DataFrame:
+
+    >>> import pandas as pd
+    >>> trades_df = pd.DataFrame({
+    ...     "timestamp": pd.date_range("2024-01-01", periods=100, freq="min"),
+    ...     "price": [42000.0 + i for i in range(100)],
+    ...     "quantity": [1.5] * 100,
+    ... })
+    >>> df = process_trades_to_dataframe(trades_df, threshold_decimal_bps=250)
+
+    With Binance CSV:
+
+    >>> trades_csv = pd.read_csv("BTCUSDT-aggTrades-2024-01.csv")
+    >>> df = process_trades_to_dataframe(trades_csv, threshold_decimal_bps=250)
+    >>> # Use with backtesting.py
+    >>> from backtesting import Backtest
+    >>> bt = Backtest(df, MyStrategy, cash=10000)
+    >>> stats = bt.run()
+    """
+    processor = RangeBarProcessor(threshold_decimal_bps)
+
+    # Convert DataFrame to list of dicts if needed
+    if isinstance(trades, pd.DataFrame):
+        # Support both 'quantity' and 'volume' column names
+        volume_col = "quantity" if "quantity" in trades.columns else "volume"
+
+        required = {"timestamp", "price", volume_col}
+        missing = required - set(trades.columns)
+        if missing:
+            msg = (
+                f"DataFrame missing required columns: {missing}. "
+                "Required: timestamp, price, quantity (or volume)"
+            )
+            raise ValueError(msg)
+
+        # Convert timestamp to milliseconds if it's datetime
+        trades_copy = trades.copy()
+        if pd.api.types.is_datetime64_any_dtype(trades_copy["timestamp"]):
+            # Convert datetime to milliseconds since epoch
+            trades_copy["timestamp"] = trades_copy["timestamp"].astype("int64") // 10**6
+
+        # Normalize column name to 'quantity'
+        if volume_col == "volume":
+            trades_copy = trades_copy.rename(columns={"volume": "quantity"})
+
+        # Convert to list of dicts
+        trades_list = trades_copy[["timestamp", "price", "quantity"]].to_dict("records")
+    else:
+        trades_list = trades
+
+    # Process through Rust layer
+    bars = processor.process_trades(trades_list)
+
+    # Convert to DataFrame
+    return processor.to_dataframe(bars)
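
The datetime branch above works because pandas datetime64[ns] values are nanoseconds since the Unix epoch, so viewing them as int64 and floor-dividing by 10**6 yields the milliseconds the Rust layer expects. A minimal sketch of just that conversion, using only pandas (the sample timestamps match the docstring example):

    import pandas as pd

    ts = pd.Series(pd.to_datetime(["2024-01-01 00:00:00", "2024-01-01 00:00:10"]))
    ms = ts.astype("int64") // 10**6  # datetime64[ns] -> int64 nanoseconds -> milliseconds
    print(ms.tolist())  # [1704067200000, 1704067210000]
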
+
+
+def process_trades_to_dataframe_cached(
+    trades: list[dict[str, int | float]] | pd.DataFrame,
+    symbol: str,
+    threshold_decimal_bps: int = 250,
+    cache: RangeBarCache | None = None,
+) -> pd.DataFrame:
+    """Process trades to DataFrame with two-tier ClickHouse caching.
+
+    This function provides cached processing of trades into range bars.
+    It uses a two-tier cache:
+    - Tier 1: Raw trades (avoid re-downloading)
+    - Tier 2: Computed range bars (avoid re-computing)
+
+    Parameters
+    ----------
+    trades : List[Dict] or pd.DataFrame
+        Trade data with columns/keys:
+        - timestamp: int (milliseconds) or datetime
+        - price: float
+        - quantity: float (or 'volume')
+    symbol : str
+        Trading symbol (e.g., "BTCUSDT"). Used as cache key.
+    threshold_decimal_bps : int, default=250
+        Threshold in decimal basis points (250 = 25bps = 0.25%)
+    cache : RangeBarCache | None
+        External cache instance. If None, creates one (preflight runs).
+
+    Returns
+    -------
+    pd.DataFrame
+        OHLCV DataFrame ready for backtesting.py
+
+    Raises
+    ------
+    ClickHouseNotConfiguredError
+        If no ClickHouse hosts available (with setup guidance)
+    ValueError
+        If required columns are missing or threshold is invalid
+    RuntimeError
+        If trades are not sorted chronologically
+
+    Examples
+    --------
+    >>> from rangebar import process_trades_to_dataframe_cached
+    >>> import pandas as pd
+    >>>
+    >>> trades = pd.read_csv("BTCUSDT-aggTrades-2024-01.csv")
+    >>> df = process_trades_to_dataframe_cached(trades, symbol="BTCUSDT")
+    >>>
+    >>> # Second call uses cache (fast)
+    >>> df2 = process_trades_to_dataframe_cached(trades, symbol="BTCUSDT")
+    """
+    # Import cache components (lazy import)
+    from rangebar.clickhouse import CacheKey
+    from rangebar.clickhouse import RangeBarCache as _RangeBarCache
+
+    # Convert trades to DataFrame if needed for timestamp extraction
+    trades_df = pd.DataFrame(trades) if isinstance(trades, list) else trades
+
+    # Get timestamp range
+    if "timestamp" in trades_df.columns:
+        ts_col = trades_df["timestamp"]
+        if pd.api.types.is_datetime64_any_dtype(ts_col):
+            start_ts = int(ts_col.min().timestamp() * 1000)
+            end_ts = int(ts_col.max().timestamp() * 1000)
+        else:
+            start_ts = int(ts_col.min())
+            end_ts = int(ts_col.max())
+    else:
+        msg = "DataFrame missing 'timestamp' column"
+        raise ValueError(msg)
+
+    # Create cache key
+    key = CacheKey(
+        symbol=symbol,
+        threshold_decimal_bps=threshold_decimal_bps,
+        start_ts=start_ts,
+        end_ts=end_ts,
+    )
+
+    # Use provided cache or create new one
+    _cache = cache if cache is not None else _RangeBarCache()
+    owns_cache = cache is None
+
+    try:
+        # Check Tier 2 cache (computed range bars)
+        if _cache.has_range_bars(key):
+            cached_bars = _cache.get_range_bars(key)
+            if cached_bars is not None:
+                return cached_bars
+
+        # Compute using core API
+        result = process_trades_to_dataframe(trades, threshold_decimal_bps)
+
+        # Store in Tier 2 cache
+        if not result.empty:
+            _cache.store_range_bars(key, result)
+
+        return result
+
+    finally:
+        if owns_cache:
+            _cache.close()
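
Because the function only closes a cache it created itself (owns_cache above), a caller can hold one connection open across several calls and pay the preflight cost once. A sketch assuming only what the signature documents, that RangeBarCache is constructible with no arguments and closed by its owner:

    from rangebar import process_trades_to_dataframe_cached
    from rangebar.clickhouse import RangeBarCache

    trades = [
        {"timestamp": 1704067200000, "price": 42000.0, "quantity": 1.5},
        {"timestamp": 1704067210000, "price": 42105.0, "quantity": 2.3},
    ]

    cache = RangeBarCache()  # ClickHouse preflight runs once, here
    try:
        first = process_trades_to_dataframe_cached(trades, symbol="BTCUSDT", cache=cache)
        again = process_trades_to_dataframe_cached(trades, symbol="BTCUSDT", cache=cache)  # Tier 2 hit
    finally:
        cache.close()  # the caller owns the cache, so the caller closes it

Note that the cache key is built from the min/max trade timestamps, so only an identical time range (plus symbol and threshold) produces a Tier 2 hit.
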
+
+
+def process_trades_chunked(
+    trades_iterator: Iterator[dict[str, int | float]],
+    threshold_decimal_bps: int = 250,
+    chunk_size: int = 100_000,
+) -> Iterator[pd.DataFrame]:
+    """Process trades in chunks to avoid memory spikes.
+
+    This function enables streaming processing of large datasets without
+    loading all trades into memory at once.
+
+    Parameters
+    ----------
+    trades_iterator : Iterator[Dict]
+        Iterator yielding trade dictionaries with keys:
+        timestamp, price, quantity (or volume)
+    threshold_decimal_bps : int, default=250
+        Threshold in decimal basis points (250 = 25bps = 0.25%)
+    chunk_size : int, default=100_000
+        Number of trades per chunk
+
+    Yields
+    ------
+    pd.DataFrame
+        OHLCV bars for each chunk. Note: partial bars may occur at
+        chunk boundaries.
+
+    Examples
+    --------
+    Process large Parquet file without OOM:
+
+    >>> import polars as pl
+    >>> from rangebar import process_trades_chunked
+    >>> lazy_df = pl.scan_parquet("large_trades.parquet")
+    >>> for chunk_df in lazy_df.collect().iter_slices(100_000):
+    ...     trades = chunk_df.to_dicts()
+    ...     for bars_df in process_trades_chunked(iter(trades)):
+    ...         print(f"Got {len(bars_df)} bars")
+
+    Notes
+    -----
+    Memory usage: O(chunk_size) instead of O(total_trades)
+    For datasets >10M trades, use chunk_size=50_000 for safety.
+    """
+    from itertools import islice
+
+    processor = RangeBarProcessor(threshold_decimal_bps)
+
+    while True:
+        chunk = list(islice(trades_iterator, chunk_size))
+        if not chunk:
+            break
+
+        bars = processor.process_trades(chunk)
+        if bars:
+            yield processor.to_dataframe(bars)
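
Since only chunk_size trades are resident at any time, the function pairs naturally with any lazy source. A sketch that streams a Binance-style CSV through pandas' chunked reader; the file name and column layout are assumptions carried over from the docstring examples above:

    import pandas as pd

    from rangebar import process_trades_chunked

    def iter_trades(path: str):
        # Stream the CSV in 100k-row slices and yield plain trade dicts one at a time
        for chunk in pd.read_csv(path, chunksize=100_000):
            yield from chunk[["timestamp", "price", "quantity"]].to_dict("records")

    total = 0
    for bars_df in process_trades_chunked(iter_trades("BTCUSDT-aggTrades-2024-01.csv")):
        total += len(bars_df)
    print(f"{total} bars total; partial bars are possible at chunk boundaries")
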
+
+
+def process_trades_polars(
+    trades: pl.DataFrame | pl.LazyFrame,
+    threshold_decimal_bps: int = 250,
+) -> pd.DataFrame:
+    """Process trades from Polars DataFrame (optimized pipeline).
+
+    This is the recommended API for Polars users. Uses lazy evaluation
+    and minimal dict conversion for best performance.
+
+    Parameters
+    ----------
+    trades : polars.DataFrame or polars.LazyFrame
+        Trade data with columns:
+        - timestamp: int64 (milliseconds since epoch)
+        - price: float
+        - quantity (or volume): float
+    threshold_decimal_bps : int, default=250
+        Threshold in decimal basis points (250 = 25bps = 0.25%)
+
+    Returns
+    -------
+    pd.DataFrame
+        OHLCV DataFrame ready for backtesting.py, with:
+        - DatetimeIndex (timestamp)
+        - Capitalized columns: Open, High, Low, Close, Volume
+
+    Examples
+    --------
+    With LazyFrame (predicate pushdown):
+
+    >>> import polars as pl
+    >>> from rangebar import process_trades_polars
+    >>> lazy_df = pl.scan_parquet("trades.parquet")
+    >>> lazy_filtered = lazy_df.filter(
+    ...     pl.col("timestamp") >= 1704067200000
+    ... )
+    >>> df = process_trades_polars(
+    ...     lazy_filtered, threshold_decimal_bps=250
+    ... )
+
+    With DataFrame:
+
+    >>> df = pl.read_parquet("trades.parquet")
+    >>> bars = process_trades_polars(df)
+
+    Notes
+    -----
+    Performance optimization:
+    - Only required columns are extracted (timestamp, price, quantity)
+    - Lazy evaluation: predicates pushed to I/O layer
+    - 2-3x faster than process_trades_to_dataframe() for Polars inputs
+    """
+    import polars as pl
+
+    # MEM-003: Apply column selection BEFORE collecting LazyFrame
+    # This enables predicate pushdown and avoids materializing unused columns
+    # Memory impact: 10-100x reduction depending on filter selectivity
+
+    # Determine volume column name (works for both DataFrame and LazyFrame)
+    if isinstance(trades, pl.LazyFrame):
+        available_cols = trades.collect_schema().names()
+    else:
+        available_cols = trades.columns
+
+    volume_col = "quantity" if "quantity" in available_cols else "volume"
+
+    # Build column list - include is_buyer_maker for microstructure features (Issue #30)
+    columns = [
+        pl.col("timestamp"),
+        pl.col("price"),
+        pl.col(volume_col).alias("quantity"),
+    ]
+    if "is_buyer_maker" in available_cols:
+        columns.append(pl.col("is_buyer_maker"))
+
+    # Apply selection (predicates pushed down for LazyFrame)
+    trades_selected = trades.select(columns)
+
+    # Collect AFTER selection (for LazyFrame)
+    if isinstance(trades_selected, pl.LazyFrame):
+        trades_minimal = trades_selected.collect()
+    else:
+        trades_minimal = trades_selected
+
+    # MEM-002: Process in chunks to bound memory (2.5 GB → ~50 MB per chunk)
+    # Chunked .to_dicts() avoids materializing 1M+ trade dicts at once
+    chunk_size = 100_000
+    processor = RangeBarProcessor(threshold_decimal_bps)
+    all_bars: list[dict] = []
+
+    n_rows = len(trades_minimal)
+    for start in range(0, n_rows, chunk_size):
+        chunk = trades_minimal.slice(start, chunk_size).to_dicts()
+        bars = processor.process_trades_streaming(chunk)
+        all_bars.extend(bars)
+
+    return processor.to_dataframe(all_bars)
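
The function expects timestamp (int64 milliseconds), price, and quantity (or volume) columns; raw exchange dumps often use other names, and renaming inside the lazy plan keeps the MEM-003 column pruning and predicate pushdown intact. A sketch with assumed raw column names (transact_time and qty are hypothetical, not taken from this package); threshold_decimal_bps=250 means 25 bps, i.e. 0.25%, per the docstrings above:

    import polars as pl

    from rangebar import process_trades_polars

    lazy = (
        pl.scan_parquet("trades.parquet")
        .rename({"transact_time": "timestamp", "qty": "quantity"})  # assumed raw names
        .filter(pl.col("timestamp") >= 1704067200000)  # pushed down to the Parquet scan
    )
    bars = process_trades_polars(lazy, threshold_decimal_bps=250)
    print(bars[["Open", "High", "Low", "Close", "Volume"]].tail())
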