rangebar-11.6.1-cp313-cp313-macosx_11_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rangebar/CLAUDE.md +327 -0
- rangebar/__init__.py +227 -0
- rangebar/__init__.pyi +1089 -0
- rangebar/_core.cpython-313-darwin.so +0 -0
- rangebar/checkpoint.py +472 -0
- rangebar/cli.py +298 -0
- rangebar/clickhouse/CLAUDE.md +139 -0
- rangebar/clickhouse/__init__.py +100 -0
- rangebar/clickhouse/bulk_operations.py +309 -0
- rangebar/clickhouse/cache.py +734 -0
- rangebar/clickhouse/client.py +121 -0
- rangebar/clickhouse/config.py +141 -0
- rangebar/clickhouse/mixin.py +120 -0
- rangebar/clickhouse/preflight.py +504 -0
- rangebar/clickhouse/query_operations.py +345 -0
- rangebar/clickhouse/schema.sql +187 -0
- rangebar/clickhouse/tunnel.py +222 -0
- rangebar/constants.py +288 -0
- rangebar/conversion.py +177 -0
- rangebar/exceptions.py +207 -0
- rangebar/exness.py +364 -0
- rangebar/hooks.py +311 -0
- rangebar/logging.py +171 -0
- rangebar/notify/__init__.py +15 -0
- rangebar/notify/pushover.py +155 -0
- rangebar/notify/telegram.py +271 -0
- rangebar/orchestration/__init__.py +20 -0
- rangebar/orchestration/count_bounded.py +797 -0
- rangebar/orchestration/helpers.py +412 -0
- rangebar/orchestration/models.py +76 -0
- rangebar/orchestration/precompute.py +498 -0
- rangebar/orchestration/range_bars.py +736 -0
- rangebar/orchestration/tick_fetcher.py +226 -0
- rangebar/ouroboros.py +454 -0
- rangebar/processors/__init__.py +22 -0
- rangebar/processors/api.py +383 -0
- rangebar/processors/core.py +522 -0
- rangebar/resource_guard.py +567 -0
- rangebar/storage/__init__.py +22 -0
- rangebar/storage/checksum_registry.py +218 -0
- rangebar/storage/parquet.py +728 -0
- rangebar/streaming.py +300 -0
- rangebar/validation/__init__.py +69 -0
- rangebar/validation/cache_staleness.py +277 -0
- rangebar/validation/continuity.py +664 -0
- rangebar/validation/gap_classification.py +294 -0
- rangebar/validation/post_storage.py +317 -0
- rangebar/validation/tier1.py +175 -0
- rangebar/validation/tier2.py +261 -0
- rangebar-11.6.1.dist-info/METADATA +308 -0
- rangebar-11.6.1.dist-info/RECORD +54 -0
- rangebar-11.6.1.dist-info/WHEEL +4 -0
- rangebar-11.6.1.dist-info/entry_points.txt +2 -0
- rangebar-11.6.1.dist-info/licenses/LICENSE +21 -0
rangebar/clickhouse/tunnel.py
ADDED

@@ -0,0 +1,222 @@
"""SSH tunnel manager for remote ClickHouse hosts.

This module provides SSH tunnel management for connecting to ClickHouse
on remote GPU workstations when direct network access is not available.
"""

from __future__ import annotations

import socket
import subprocess
import time
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    from types import TracebackType


def _find_free_port() -> int:
    """Find a free local port.

    Returns
    -------
    int
        Available port number
    """
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        s.bind(("localhost", 0))
        return s.getsockname()[1]


def _is_port_open(host: str, port: int, timeout: float = 0.5) -> bool:
    """Check if a port is open.

    Parameters
    ----------
    host : str
        Host to check
    port : int
        Port to check
    timeout : float
        Connection timeout

    Returns
    -------
    bool
        True if port is open
    """
    try:
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
            s.settimeout(timeout)
            return s.connect_ex((host, port)) == 0
    except OSError:
        return False


class SSHTunnel:
    """Manages SSH tunnel to remote ClickHouse host.

    Creates an SSH tunnel from a local port to the remote host's
    ClickHouse port (default 8123). Use as a context manager.

    Parameters
    ----------
    ssh_alias : str
        SSH alias from ~/.ssh/config
    remote_port : int
        Remote ClickHouse port (default: 8123)
    local_port : int | None
        Local port to use (default: auto-assign)

    Examples
    --------
    >>> with SSHTunnel("my-gpu-host") as local_port:
    ...     client = get_client("localhost", local_port)
    ...     # Use client...
    """

    def __init__(
        self,
        ssh_alias: str,
        remote_port: int = 8123,
        local_port: int | None = None,
    ) -> None:
        """Initialize tunnel configuration."""
        self.ssh_alias = ssh_alias
        self.remote_port = remote_port
        self._local_port = local_port
        self._process: subprocess.Popen[bytes] | None = None

    @property
    def local_port(self) -> int | None:
        """Get the local port (assigned when tunnel starts)."""
        return self._local_port

    @property
    def is_active(self) -> bool:
        """Check if tunnel is active."""
        return (
            self._process is not None
            and self._process.poll() is None
            and self._local_port is not None
            and _is_port_open("localhost", self._local_port)
        )

    def start(self, timeout: float = 5.0) -> int:
        """Start the SSH tunnel.

        Parameters
        ----------
        timeout : float
            Maximum time to wait for tunnel to be ready

        Returns
        -------
        int
            Local port the tunnel is listening on

        Raises
        ------
        RuntimeError
            If tunnel fails to start
        """
        if self._process is not None:
            msg = "Tunnel already started"
            raise RuntimeError(msg)

        # Assign local port if not specified
        if self._local_port is None:
            self._local_port = _find_free_port()

        # Start SSH tunnel process
        self._process = subprocess.Popen(
            [
                "ssh",
                "-N",  # Don't execute remote command
                "-o",
                "ExitOnForwardFailure=yes",
                "-o",
                "ServerAliveInterval=30",
                "-o",
                "ServerAliveCountMax=3",
                "-L",
                f"{self._local_port}:localhost:{self.remote_port}",
                self.ssh_alias,
            ],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.PIPE,
        )

        # Wait for tunnel to be ready
        deadline = time.monotonic() + timeout
        while time.monotonic() < deadline:
            # Check if port is open first - handles SSH ControlMaster case
            # where the ssh process exits immediately after handing off to master
            if _is_port_open("localhost", self._local_port):
                return self._local_port

            # Check if process died with error (non-zero exit)
            exit_code = self._process.poll()
            if exit_code is not None:
                # Exit code 0 with ControlMaster means forwarding was handed off
                # to master connection - give it a moment to activate
                if exit_code == 0:
                    time.sleep(0.2)
                    if _is_port_open("localhost", self._local_port):
                        # Forwarding via ControlMaster succeeded, but we don't
                        # own the process anymore - set to None so stop() is a no-op
                        self._process = None
                        return self._local_port
                # Non-zero exit or port still not open after ControlMaster handoff
                stderr = ""
                if self._process.stderr:
                    stderr = self._process.stderr.read().decode()
                msg = (
                    f"SSH tunnel to {self.ssh_alias} failed "
                    f"(exit={exit_code}): {stderr}"
                )
                self._process = None
                raise RuntimeError(msg)

            time.sleep(0.1)

        # Timeout - kill process
        self._cleanup()
        msg = f"SSH tunnel to {self.ssh_alias} timed out after {timeout}s"
        raise RuntimeError(msg)

    def stop(self) -> None:
        """Stop the SSH tunnel."""
        self._cleanup()

    def _cleanup(self) -> None:
        """Clean up tunnel resources."""
        if self._process is not None:
            try:
                self._process.terminate()
                self._process.wait(timeout=5)
            except (subprocess.TimeoutExpired, OSError):
                try:
                    self._process.kill()
                    self._process.wait(timeout=1)
                except (subprocess.TimeoutExpired, OSError):
                    pass
            finally:
                self._process = None

    def __enter__(self) -> int:
        """Start tunnel and return local port."""
        return self.start()

    def __exit__(
        self,
        exc_type: type[BaseException] | None,
        exc_val: BaseException | None,
        exc_tb: TracebackType | None,
    ) -> None:
        """Stop tunnel on exit."""
        self.stop()

    def __del__(self) -> None:
        """Clean up on garbage collection."""
        self._cleanup()
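
Usage note: a minimal sketch of driving the tunnel from client code, opening it as a context manager and pointing a ClickHouse client at the forwarded port. This assumes the clickhouse-connect package for the client side, and "gpu-box" is a hypothetical alias from ~/.ssh/config; neither is pinned by tunnel.py itself.

import clickhouse_connect

from rangebar.clickhouse.tunnel import SSHTunnel

# The tunnel forwards localhost:<local_port> to port 8123 on the remote host.
with SSHTunnel("gpu-box") as local_port:
    # clickhouse-connect is an assumed client library, not a tunnel.py dependency
    client = clickhouse_connect.get_client(host="localhost", port=local_port)
    print(client.command("SELECT version()"))
# On exit, stop() terminates the ssh child process; if a ControlMaster
# connection took over the forwarding, stop() is a no-op by design.
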
rangebar/constants.py
ADDED
@@ -0,0 +1,288 @@
"""Constants and presets for rangebar-py.

This module centralizes all constants to eliminate duplication across the codebase.
Import from here instead of defining locally.

SSoT (Single Source of Truth) for:
- MICROSTRUCTURE_COLUMNS: Optional microstructure feature columns
- TIER1_SYMBOLS: High-liquidity crypto symbols
- THRESHOLD_PRESETS: Named threshold values in decimal basis points
- THRESHOLD_DECIMAL_MIN/MAX: Valid threshold range
- _CRYPTO_BASES: Known crypto base symbols for asset class detection
- _FOREX_CURRENCIES: Known forex currencies for asset class detection
- MEM_GUARDS: Memory guard registry (Issue #49)
"""

from __future__ import annotations

# =============================================================================
# Schema Version Constants (Cache Evolution)
# =============================================================================
# Used for cache validation and schema evolution tracking.
# Increment when schema changes require cache invalidation.
#
# Version history:
# - 6.0.0: OHLCV only (legacy, pre-microstructure)
# - 7.0.0: Added 15 microstructure columns (Issue #25)
# - 10.0.0: Added ouroboros_mode column
# - 11.0.0: Current version with modular architecture

SCHEMA_VERSION_OHLCV_ONLY: str = "6.0.0"  # Pre-microstructure (legacy)
SCHEMA_VERSION_MICROSTRUCTURE: str = "7.0.0"  # Added 15 microstructure columns
SCHEMA_VERSION_OUROBOROS: str = "10.0.0"  # Added ouroboros_mode column

# Minimum versions required for features
MIN_VERSION_FOR_MICROSTRUCTURE: str = SCHEMA_VERSION_MICROSTRUCTURE
MIN_VERSION_FOR_OUROBOROS: str = SCHEMA_VERSION_OUROBOROS

# =============================================================================
# Microstructure Columns (Issue #25, v7.0+)
# =============================================================================
# These columns are optional and only present when include_microstructure=True
# or when bars are generated with microstructure features enabled.
#
# IMPORTANT: Keep this list in sync with:
# - crates/rangebar-core/src/bar.rs (Rust struct fields)
# - python/rangebar/clickhouse/schema.sql (ClickHouse columns)

MICROSTRUCTURE_COLUMNS: tuple[str, ...] = (
    # Basic extended columns
    "vwap",
    "buy_volume",
    "sell_volume",
    "individual_trade_count",
    "agg_record_count",
    # Microstructure features (Issue #25)
    "duration_us",
    "ofi",
    "vwap_close_deviation",
    "price_impact",
    "kyle_lambda_proxy",
    "trade_intensity",
    "volume_per_trade",
    "aggression_ratio",
    "aggregation_density",
    "turnover_imbalance",
)

# =============================================================================
# Inter-Bar Feature Columns (Issue #59, v12.0+)
# =============================================================================
# Computed from a lookback window of trades BEFORE each bar opens.
# All features are Optional - None when no lookback data available.
#
# IMPORTANT: Keep this list in sync with:
# - crates/rangebar-core/src/interbar.rs (Rust feature computation)
# - crates/rangebar-core/src/types.rs (RangeBar struct fields)
# - python/rangebar/clickhouse/schema.sql (ClickHouse columns)

INTER_BAR_FEATURE_COLUMNS: tuple[str, ...] = (
    # Tier 1: Core features (7 features, min 1 trade)
    "lookback_trade_count",
    "lookback_ofi",
    "lookback_duration_us",
    "lookback_intensity",
    "lookback_vwap_raw",
    "lookback_vwap_position",
    "lookback_count_imbalance",
    # Tier 2: Statistical features (5 features)
    "lookback_kyle_lambda",
    "lookback_burstiness",
    "lookback_volume_skew",
    "lookback_volume_kurt",
    "lookback_price_range",
    # Tier 3: Advanced features (4 features, min 60+ trades)
    "lookback_kaufman_er",
    "lookback_garman_klass_vol",
    "lookback_hurst",
    "lookback_permutation_entropy",
)

# =============================================================================
# Tier-1 Symbols (high-liquidity, available on all Binance markets)
# =============================================================================

TIER1_SYMBOLS: tuple[str, ...] = (
    "AAVE",
    "ADA",
    "AVAX",
    "BCH",
    "BNB",
    "BTC",
    "DOGE",
    "ETH",
    "FIL",
    "LINK",
    "LTC",
    "NEAR",
    "SOL",
    "SUI",
    "UNI",
    "WIF",
    "WLD",
    "XRP",
)

# =============================================================================
# Threshold Range (from rangebar-core)
# =============================================================================

THRESHOLD_DECIMAL_MIN: int = 1  # 1 dbps = 0.001%
THRESHOLD_DECIMAL_MAX: int = 100_000  # 100,000 dbps = 100%

# =============================================================================
# Threshold Presets (in decimal basis points)
# =============================================================================
# 1 dbps = 0.001% = 0.00001 (one-tenth of a basis point)
# Example: 250 dbps = 0.25%

THRESHOLD_PRESETS: dict[str, int] = {
    "micro": 10,  # 10 dbps = 0.01% (scalping)
    "tight": 50,  # 50 dbps = 0.05% (day trading)
    "standard": 100,  # 100 dbps = 0.1% (swing trading)
    "medium": 250,  # 250 dbps = 0.25% (default)
    "wide": 500,  # 500 dbps = 0.5% (position trading)
    "macro": 1000,  # 1000 dbps = 1% (long-term)
}

# =============================================================================
# Asset Class Detection Helpers
# =============================================================================

# Common crypto base symbols for detection
_CRYPTO_BASES: frozenset[str] = frozenset(
    {
        "BTC",
        "ETH",
        "BNB",
        "SOL",
        "XRP",
        "ADA",
        "DOGE",
        "DOT",
        "MATIC",
        "AVAX",
        "LINK",
        "UNI",
        "ATOM",
        "LTC",
        "ETC",
        "XLM",
        "ALGO",
        "NEAR",
        "FIL",
        "APT",
    }
)

# Common forex base/quote currencies
_FOREX_CURRENCIES: frozenset[str] = frozenset(
    {
        "EUR",
        "USD",
        "GBP",
        "JPY",
        "CHF",
        "AUD",
        "NZD",
        "CAD",
        "SEK",
        "NOK",
    }
)

# =============================================================================
# Continuity Validation Constants
# =============================================================================

# Default tolerance for junction continuity validation (0.01% = 0.0001)
CONTINUITY_TOLERANCE_PCT: float = 0.0001

# =============================================================================
# Exchange Session Column Names (Ouroboros feature)
# =============================================================================

EXCHANGE_SESSION_COLUMNS: tuple[str, ...] = (
    "exchange_session_sydney",
    "exchange_session_tokyo",
    "exchange_session_london",
    "exchange_session_newyork",
)

# =============================================================================
# All Optional Columns (for cache operations)
# =============================================================================
# Union of microstructure + exchange session columns

ALL_OPTIONAL_COLUMNS: tuple[str, ...] = (
    *MICROSTRUCTURE_COLUMNS,
    *EXCHANGE_SESSION_COLUMNS,
)

# =============================================================================
# Memory Guard Registry (Issue #49)
# =============================================================================
# Each guard prevents a specific memory exhaustion pattern.
# Code references use "# MEM-XXX:" comments for traceability.
#
# Guards are organized by pipeline stage:
#   Loading → MEM-001, MEM-004, MEM-007, MEM-010
#   Processing → MEM-002, MEM-003
#   Concat → MEM-006, MEM-008
#   Test → MEM-005
#   Process-level → MEM-009
#
# When adding a new guard, assign the next number and add an entry here.

MEM_GUARDS: dict[str, dict[str, str]] = {
    "MEM-001": {
        "description": "Avoid map_elements() in Parquet parsing (native Polars ops)",
        "location": "storage/parquet.py:185",
        "stage": "loading",
    },
    "MEM-002": {
        "description": "Process trades in 100K chunks (~15 MB each)",
        "location": "orchestration/helpers.py:274, processors/api.py:371",
        "stage": "processing",
    },
    "MEM-003": {
        "description": "Select columns BEFORE .collect() on LazyFrame",
        "location": "orchestration/helpers.py:236, processors/api.py:341",
        "stage": "processing",
    },
    "MEM-004": {
        "description": "Guard read_ticks() with size estimation before .collect()",
        "location": "storage/parquet.py",
        "stage": "loading",
    },
    "MEM-005": {
        "description": "gc.collect() after each test to prevent accumulation",
        "location": "tests/conftest.py:26",
        "stage": "testing",
    },
    "MEM-006": {
        "description": "Use Polars concat instead of pandas for memory efficiency",
        "location": "conversion.py:107, orchestration/precompute.py:404",
        "stage": "concatenation",
    },
    "MEM-007": {
        "description": "Guard deprecated _fetch_binance() with date range limit",
        "location": "orchestration/helpers.py:136",
        "stage": "loading",
    },
    "MEM-008": {
        "description": "Streaming bar accumulation (avoid holding all in memory)",
        "location": "orchestration/range_bars.py",
        "stage": "concatenation",
    },
    "MEM-009": {
        "description": "Process-level RLIMIT_AS cap (MemoryError instead of OOM kill)",
        "location": "resource_guard.py",
        "stage": "process",
    },
    "MEM-010": {
        "description": "Pre-flight memory estimation before tick loading",
        "location": "resource_guard.py",
        "stage": "loading",
    },
}
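
Usage note: to make the dbps arithmetic concrete, here is a small self-contained sketch built only on the constants above. preset_to_fraction is a hypothetical helper for illustration, not part of the wheel.

from rangebar.constants import (
    THRESHOLD_DECIMAL_MAX,
    THRESHOLD_DECIMAL_MIN,
    THRESHOLD_PRESETS,
)

def preset_to_fraction(name: str) -> float:
    """Hypothetical helper: named preset (dbps) -> price fraction (1 dbps = 0.001%)."""
    dbps = THRESHOLD_PRESETS[name]
    if not (THRESHOLD_DECIMAL_MIN <= dbps <= THRESHOLD_DECIMAL_MAX):
        raise ValueError(f"{dbps} dbps is outside the valid threshold range")
    return dbps / 100_000

assert preset_to_fraction("medium") == 0.0025  # 250 dbps = 0.25%
# A "medium" range bar on an instrument trading at 50_000 closes once
# price moves 50_000 * 0.0025 = 125 from its open.
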
rangebar/conversion.py
ADDED
@@ -0,0 +1,177 @@
"""Conversion utilities for rangebar-py.

This module provides dtype conversion, normalization, and DataFrame manipulation
utilities used throughout the codebase. These functions handle:
- Converting bar dictionaries to Polars/pandas DataFrames
- Concatenating DataFrames with consistent dtypes
- Normalizing datetime precision (Issue #44 fix)
- Converting PyArrow dtypes to numpy for compatibility

SSoT (Single Source of Truth) for:
- _bars_list_to_polars: Convert bar dicts to Polars DataFrame
- _concat_pandas_via_polars: Memory-efficient DataFrame concatenation
- normalize_temporal_precision: Fix mixed datetime precision
- normalize_arrow_dtypes: Convert PyArrow to numpy dtypes
"""

from __future__ import annotations

from typing import TYPE_CHECKING

import pandas as pd

if TYPE_CHECKING:
    import polars as pl


def _bars_list_to_polars(
    bars: list[dict],
    include_microstructure: bool = False,
) -> pl.DataFrame:
    """Convert list of bar dicts to Polars DataFrame in backtesting.py format.

    Parameters
    ----------
    bars : list[dict]
        List of bar dictionaries from processor
    include_microstructure : bool
        Include microstructure columns

    Returns
    -------
    pl.DataFrame
        DataFrame with a timestamp column and capitalized OHLCV columns
    """
    import polars as pl

    if not bars:
        return pl.DataFrame()

    bars_df = pl.DataFrame(bars)

    # Convert timestamp to datetime
    if "timestamp" in bars_df.columns:
        bars_df = bars_df.with_columns(
            pl.col("timestamp")
            .str.to_datetime(format="%Y-%m-%dT%H:%M:%S%.f%:z")
            .alias("timestamp")
        )

    # Rename to backtesting.py format
    rename_map = {
        "open": "Open",
        "high": "High",
        "low": "Low",
        "close": "Close",
        "volume": "Volume",
    }
    bars_df = bars_df.rename(
        {k: v for k, v in rename_map.items() if k in bars_df.columns}
    )

    # Select columns
    base_cols = ["timestamp", "Open", "High", "Low", "Close", "Volume"]
    if include_microstructure:
        # Include all columns
        return bars_df
    # Only OHLCV columns
    available = [c for c in base_cols if c in bars_df.columns]
    return bars_df.select(available)


def normalize_temporal_precision(pldf: pl.DataFrame) -> pl.DataFrame:
    """Normalize datetime columns to microsecond precision.

    This prevents SchemaError when concatenating DataFrames with mixed
    datetime precision (e.g., μs vs ns). See Issue #44.

    Parameters
    ----------
    pldf : pl.DataFrame
        Polars DataFrame to normalize

    Returns
    -------
    pl.DataFrame
        DataFrame with all datetime columns cast to microsecond precision
    """
    import polars as pl

    for col in pldf.columns:
        if pldf[col].dtype.is_temporal():
            pldf = pldf.with_columns(pl.col(col).dt.cast_time_unit("us"))
    return pldf


def _concat_pandas_via_polars(dfs: list[pd.DataFrame]) -> pd.DataFrame:
    """Concatenate pandas DataFrames using Polars for memory efficiency (MEM-006).

    This function uses Polars' more efficient concatenation instead of pd.concat,
    reducing memory fragmentation and improving performance for large datasets.

    Parameters
    ----------
    dfs : list[pd.DataFrame]
        List of pandas DataFrames to concatenate

    Returns
    -------
    pd.DataFrame
        Concatenated DataFrame with sorted DatetimeIndex
    """
    import polars as pl

    if not dfs:
        return pd.DataFrame()

    if len(dfs) == 1:
        return dfs[0]

    # Convert to Polars
    pl_dfs = [pl.from_pandas(df.reset_index()) for df in dfs]

    # Normalize datetime columns to consistent precision (μs) before concat
    # This prevents SchemaError when monthly frames have mixed precision (Issue #44)
    normalized = [normalize_temporal_precision(pldf) for pldf in pl_dfs]
    combined = pl.concat(normalized)

    # Sort by timestamp/index column
    index_col = "timestamp" if "timestamp" in combined.columns else combined.columns[0]
    combined = combined.sort(index_col)

    # Convert back to pandas with proper index
    result = combined.to_pandas()
    if index_col in result.columns:
        result = result.set_index(index_col)

    return result


def normalize_arrow_dtypes(
    df: pd.DataFrame, columns: list[str] | None = None
) -> pd.DataFrame:
    """Convert PyArrow dtypes to numpy for compatibility.

    ClickHouse query_df_arrow returns double[pyarrow], but process_trades
    returns float64. This function normalizes the dtypes.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame potentially containing PyArrow dtypes
    columns : list[str] | None
        Columns to normalize. If None, normalizes OHLCV columns.

    Returns
    -------
    pd.DataFrame
        DataFrame with numpy dtypes
    """
    if columns is None:
        columns = ["Open", "High", "Low", "Close", "Volume"]

    for col in columns:
        if col in df.columns:
            df[col] = df[col].astype("float64")

    return df
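
Usage note: a short sketch of the Issue #44 failure mode that normalize_temporal_precision guards against. Vertically concatenating Polars frames whose datetime columns carry different time units fails on a schema mismatch, so both sides are cast to microseconds first. The two toy frames below are illustrative, not taken from the package.

from datetime import datetime

import polars as pl

from rangebar.conversion import normalize_temporal_precision

# Two frames whose timestamp columns differ only in time unit (ns vs us).
ns_frame = pl.DataFrame({"timestamp": [datetime(2024, 1, 1)]}).with_columns(
    pl.col("timestamp").dt.cast_time_unit("ns")
)
us_frame = pl.DataFrame({"timestamp": [datetime(2024, 1, 2)]}).with_columns(
    pl.col("timestamp").dt.cast_time_unit("us")
)

# pl.concat([ns_frame, us_frame]) would raise a SchemaError here;
# normalizing both to "us" first makes the concat succeed.
combined = pl.concat(
    [normalize_temporal_precision(f) for f in (ns_frame, us_frame)]
)
assert combined["timestamp"].dtype == pl.Datetime("us")
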