rangebar-11.6.1-cp313-cp313-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. rangebar/CLAUDE.md +327 -0
  2. rangebar/__init__.py +227 -0
  3. rangebar/__init__.pyi +1089 -0
  4. rangebar/_core.cpython-313-darwin.so +0 -0
  5. rangebar/checkpoint.py +472 -0
  6. rangebar/cli.py +298 -0
  7. rangebar/clickhouse/CLAUDE.md +139 -0
  8. rangebar/clickhouse/__init__.py +100 -0
  9. rangebar/clickhouse/bulk_operations.py +309 -0
  10. rangebar/clickhouse/cache.py +734 -0
  11. rangebar/clickhouse/client.py +121 -0
  12. rangebar/clickhouse/config.py +141 -0
  13. rangebar/clickhouse/mixin.py +120 -0
  14. rangebar/clickhouse/preflight.py +504 -0
  15. rangebar/clickhouse/query_operations.py +345 -0
  16. rangebar/clickhouse/schema.sql +187 -0
  17. rangebar/clickhouse/tunnel.py +222 -0
  18. rangebar/constants.py +288 -0
  19. rangebar/conversion.py +177 -0
  20. rangebar/exceptions.py +207 -0
  21. rangebar/exness.py +364 -0
  22. rangebar/hooks.py +311 -0
  23. rangebar/logging.py +171 -0
  24. rangebar/notify/__init__.py +15 -0
  25. rangebar/notify/pushover.py +155 -0
  26. rangebar/notify/telegram.py +271 -0
  27. rangebar/orchestration/__init__.py +20 -0
  28. rangebar/orchestration/count_bounded.py +797 -0
  29. rangebar/orchestration/helpers.py +412 -0
  30. rangebar/orchestration/models.py +76 -0
  31. rangebar/orchestration/precompute.py +498 -0
  32. rangebar/orchestration/range_bars.py +736 -0
  33. rangebar/orchestration/tick_fetcher.py +226 -0
  34. rangebar/ouroboros.py +454 -0
  35. rangebar/processors/__init__.py +22 -0
  36. rangebar/processors/api.py +383 -0
  37. rangebar/processors/core.py +522 -0
  38. rangebar/resource_guard.py +567 -0
  39. rangebar/storage/__init__.py +22 -0
  40. rangebar/storage/checksum_registry.py +218 -0
  41. rangebar/storage/parquet.py +728 -0
  42. rangebar/streaming.py +300 -0
  43. rangebar/validation/__init__.py +69 -0
  44. rangebar/validation/cache_staleness.py +277 -0
  45. rangebar/validation/continuity.py +664 -0
  46. rangebar/validation/gap_classification.py +294 -0
  47. rangebar/validation/post_storage.py +317 -0
  48. rangebar/validation/tier1.py +175 -0
  49. rangebar/validation/tier2.py +261 -0
  50. rangebar-11.6.1.dist-info/METADATA +308 -0
  51. rangebar-11.6.1.dist-info/RECORD +54 -0
  52. rangebar-11.6.1.dist-info/WHEEL +4 -0
  53. rangebar-11.6.1.dist-info/entry_points.txt +2 -0
  54. rangebar-11.6.1.dist-info/licenses/LICENSE +21 -0
rangebar/clickhouse/tunnel.py ADDED
@@ -0,0 +1,222 @@
+"""SSH tunnel manager for remote ClickHouse hosts.
+
+This module provides SSH tunnel management for connecting to ClickHouse
+on remote GPU workstations when direct network access is not available.
+"""
+
+from __future__ import annotations
+
+import socket
+import subprocess
+import time
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from types import TracebackType
+
+
+def _find_free_port() -> int:
+    """Find a free local port.
+
+    Returns
+    -------
+    int
+        Available port number
+    """
+    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+        s.bind(("localhost", 0))
+        return s.getsockname()[1]
+
+
+def _is_port_open(host: str, port: int, timeout: float = 0.5) -> bool:
+    """Check if a port is open.
+
+    Parameters
+    ----------
+    host : str
+        Host to check
+    port : int
+        Port to check
+    timeout : float
+        Connection timeout
+
+    Returns
+    -------
+    bool
+        True if port is open
+    """
+    try:
+        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+            s.settimeout(timeout)
+            return s.connect_ex((host, port)) == 0
+    except OSError:
+        return False
+
+
+class SSHTunnel:
+    """Manages SSH tunnel to remote ClickHouse host.
+
+    Creates an SSH tunnel from a local port to the remote host's
+    ClickHouse port (default 8123). Use as a context manager.
+
+    Parameters
+    ----------
+    ssh_alias : str
+        SSH alias from ~/.ssh/config
+    remote_port : int
+        Remote ClickHouse port (default: 8123)
+    local_port : int | None
+        Local port to use (default: auto-assign)
+
+    Examples
+    --------
+    >>> with SSHTunnel("my-gpu-host") as local_port:
+    ...     client = get_client("localhost", local_port)
+    ...     # Use client...
+    """
+
+    def __init__(
+        self,
+        ssh_alias: str,
+        remote_port: int = 8123,
+        local_port: int | None = None,
+    ) -> None:
+        """Initialize tunnel configuration."""
+        self.ssh_alias = ssh_alias
+        self.remote_port = remote_port
+        self._local_port = local_port
+        self._process: subprocess.Popen[bytes] | None = None
+
+    @property
+    def local_port(self) -> int | None:
+        """Get the local port (assigned when tunnel starts)."""
+        return self._local_port
+
+    @property
+    def is_active(self) -> bool:
+        """Check if tunnel is active."""
+        return (
+            self._process is not None
+            and self._process.poll() is None
+            and self._local_port is not None
+            and _is_port_open("localhost", self._local_port)
+        )
+
+    def start(self, timeout: float = 5.0) -> int:
+        """Start the SSH tunnel.
+
+        Parameters
+        ----------
+        timeout : float
+            Maximum time to wait for tunnel to be ready
+
+        Returns
+        -------
+        int
+            Local port the tunnel is listening on
+
+        Raises
+        ------
+        RuntimeError
+            If tunnel fails to start
+        """
+        if self._process is not None:
+            msg = "Tunnel already started"
+            raise RuntimeError(msg)
+
+        # Assign local port if not specified
+        if self._local_port is None:
+            self._local_port = _find_free_port()
+
+        # Start SSH tunnel process
+        self._process = subprocess.Popen(
+            [
+                "ssh",
+                "-N",  # Don't execute remote command
+                "-o",
+                "ExitOnForwardFailure=yes",
+                "-o",
+                "ServerAliveInterval=30",
+                "-o",
+                "ServerAliveCountMax=3",
+                "-L",
+                f"{self._local_port}:localhost:{self.remote_port}",
+                self.ssh_alias,
+            ],
+            stdout=subprocess.DEVNULL,
+            stderr=subprocess.PIPE,
+        )
+
+        # Wait for tunnel to be ready
+        deadline = time.monotonic() + timeout
+        while time.monotonic() < deadline:
+            # Check if port is open first - handles SSH ControlMaster case
+            # where the ssh process exits immediately after handing off to master
+            if _is_port_open("localhost", self._local_port):
+                return self._local_port
+
+            # Check if process died with error (non-zero exit)
+            exit_code = self._process.poll()
+            if exit_code is not None:
+                # Exit code 0 with ControlMaster means forwarding was handed off
+                # to master connection - give it a moment to activate
+                if exit_code == 0:
+                    time.sleep(0.2)
+                    if _is_port_open("localhost", self._local_port):
+                        # Forwarding via ControlMaster succeeded, but we don't
+                        # own the process anymore - set to None so stop() is a no-op
+                        self._process = None
+                        return self._local_port
+                # Non-zero exit or port still not open after ControlMaster handoff
+                stderr = ""
+                if self._process.stderr:
+                    stderr = self._process.stderr.read().decode()
+                msg = (
+                    f"SSH tunnel to {self.ssh_alias} failed "
+                    f"(exit={exit_code}): {stderr}"
+                )
+                self._process = None
+                raise RuntimeError(msg)
+
+            time.sleep(0.1)
+
+        # Timeout - kill process
+        self._cleanup()
+        msg = f"SSH tunnel to {self.ssh_alias} timed out after {timeout}s"
+        raise RuntimeError(msg)
+
+    def stop(self) -> None:
+        """Stop the SSH tunnel."""
+        self._cleanup()
+
+    def _cleanup(self) -> None:
+        """Clean up tunnel resources."""
+        if self._process is not None:
+            try:
+                self._process.terminate()
+                self._process.wait(timeout=5)
+            except (subprocess.TimeoutExpired, OSError):
+                try:
+                    self._process.kill()
+                    self._process.wait(timeout=1)
+                except (subprocess.TimeoutExpired, OSError):
+                    pass
+            finally:
+                self._process = None
+
+    def __enter__(self) -> int:
+        """Start tunnel and return local port."""
+        return self.start()
+
+    def __exit__(
+        self,
+        exc_type: type[BaseException] | None,
+        exc_val: BaseException | None,
+        exc_tb: TracebackType | None,
+    ) -> None:
+        """Stop tunnel on exit."""
+        self.stop()
+
+    def __del__(self) -> None:
+        """Clean up on garbage collection."""
+        self._cleanup()
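
For reference, a minimal usage sketch of the tunnel module above. It assumes the clickhouse-connect package for the client (the docstring's bare `get_client` is not imported in this module; `clickhouse_connect.get_client` is one plausible binding) and a hypothetical `my-gpu-host` alias in ~/.ssh/config:

import clickhouse_connect

from rangebar.clickhouse.tunnel import SSHTunnel

# "my-gpu-host" is a hypothetical ~/.ssh/config alias for the remote workstation.
with SSHTunnel("my-gpu-host") as local_port:
    # The tunnel forwards localhost:<local_port> -> remote localhost:8123.
    client = clickhouse_connect.get_client(host="localhost", port=local_port)
    print(client.query("SELECT 1").result_rows)
# On exit, stop() terminates the ssh process; if forwarding was handed off to
# an SSH ControlMaster, the class already cleared _process and stop() is a no-op.
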
rangebar/constants.py ADDED
@@ -0,0 +1,288 @@
+"""Constants and presets for rangebar-py.
+
+This module centralizes all constants to eliminate duplication across the codebase.
+Import from here instead of defining locally.
+
+SSoT (Single Source of Truth) for:
+- MICROSTRUCTURE_COLUMNS: Optional microstructure feature columns
+- TIER1_SYMBOLS: High-liquidity crypto symbols
+- THRESHOLD_PRESETS: Named threshold values in decimal basis points
+- THRESHOLD_DECIMAL_MIN/MAX: Valid threshold range
+- _CRYPTO_BASES: Known crypto base symbols for asset class detection
+- _FOREX_CURRENCIES: Known forex currencies for asset class detection
+- MEM_GUARDS: Memory guard registry (Issue #49)
+"""
+
+from __future__ import annotations
+
+# =============================================================================
+# Schema Version Constants (Cache Evolution)
+# =============================================================================
+# Used for cache validation and schema evolution tracking.
+# Increment when schema changes require cache invalidation.
+#
+# Version history:
+# - 6.0.0: OHLCV only (legacy, pre-microstructure)
+# - 7.0.0: Added 15 microstructure columns (Issue #25)
+# - 10.0.0: Added ouroboros_mode column
+# - 11.0.0: Current version with modular architecture
+
+SCHEMA_VERSION_OHLCV_ONLY: str = "6.0.0"  # Pre-microstructure (legacy)
+SCHEMA_VERSION_MICROSTRUCTURE: str = "7.0.0"  # Added 15 microstructure columns
+SCHEMA_VERSION_OUROBOROS: str = "10.0.0"  # Added ouroboros_mode column
+
+# Minimum versions required for features
+MIN_VERSION_FOR_MICROSTRUCTURE: str = SCHEMA_VERSION_MICROSTRUCTURE
+MIN_VERSION_FOR_OUROBOROS: str = SCHEMA_VERSION_OUROBOROS
+
+# =============================================================================
+# Microstructure Columns (Issue #25, v7.0+)
+# =============================================================================
+# These columns are optional and only present when include_microstructure=True
+# or when bars are generated with microstructure features enabled.
+#
+# IMPORTANT: Keep this list in sync with:
+# - crates/rangebar-core/src/bar.rs (Rust struct fields)
+# - python/rangebar/clickhouse/schema.sql (ClickHouse columns)
+
+MICROSTRUCTURE_COLUMNS: tuple[str, ...] = (
+    # Basic extended columns
+    "vwap",
+    "buy_volume",
+    "sell_volume",
+    "individual_trade_count",
+    "agg_record_count",
+    # Microstructure features (Issue #25)
+    "duration_us",
+    "ofi",
+    "vwap_close_deviation",
+    "price_impact",
+    "kyle_lambda_proxy",
+    "trade_intensity",
+    "volume_per_trade",
+    "aggression_ratio",
+    "aggregation_density",
+    "turnover_imbalance",
+)
+
+# =============================================================================
+# Inter-Bar Feature Columns (Issue #59, v12.0+)
+# =============================================================================
+# Computed from a lookback window of trades BEFORE each bar opens.
+# All features are Optional - None when no lookback data available.
+#
+# IMPORTANT: Keep this list in sync with:
+# - crates/rangebar-core/src/interbar.rs (Rust feature computation)
+# - crates/rangebar-core/src/types.rs (RangeBar struct fields)
+# - python/rangebar/clickhouse/schema.sql (ClickHouse columns)
+
+INTER_BAR_FEATURE_COLUMNS: tuple[str, ...] = (
+    # Tier 1: Core features (7 features, min 1 trade)
+    "lookback_trade_count",
+    "lookback_ofi",
+    "lookback_duration_us",
+    "lookback_intensity",
+    "lookback_vwap_raw",
+    "lookback_vwap_position",
+    "lookback_count_imbalance",
+    # Tier 2: Statistical features (5 features)
+    "lookback_kyle_lambda",
+    "lookback_burstiness",
+    "lookback_volume_skew",
+    "lookback_volume_kurt",
+    "lookback_price_range",
+    # Tier 3: Advanced features (4 features, min 60+ trades)
+    "lookback_kaufman_er",
+    "lookback_garman_klass_vol",
+    "lookback_hurst",
+    "lookback_permutation_entropy",
+)
+
+# =============================================================================
+# Tier-1 Symbols (high-liquidity, available on all Binance markets)
+# =============================================================================
+
+TIER1_SYMBOLS: tuple[str, ...] = (
+    "AAVE",
+    "ADA",
+    "AVAX",
+    "BCH",
+    "BNB",
+    "BTC",
+    "DOGE",
+    "ETH",
+    "FIL",
+    "LINK",
+    "LTC",
+    "NEAR",
+    "SOL",
+    "SUI",
+    "UNI",
+    "WIF",
+    "WLD",
+    "XRP",
+)
+
+# =============================================================================
+# Threshold Range (from rangebar-core)
+# =============================================================================
+
+THRESHOLD_DECIMAL_MIN: int = 1  # 1 dbps = 0.001%
+THRESHOLD_DECIMAL_MAX: int = 100_000  # 100,000 dbps = 100%
+
+# =============================================================================
+# Threshold Presets (in decimal basis points)
+# =============================================================================
+# 1 dbps = 0.001% = 0.00001 (one-tenth of a basis point)
+# Example: 250 dbps = 0.25%
+
+THRESHOLD_PRESETS: dict[str, int] = {
+    "micro": 10,  # 10 dbps = 0.01% (scalping)
+    "tight": 50,  # 50 dbps = 0.05% (day trading)
+    "standard": 100,  # 100 dbps = 0.1% (swing trading)
+    "medium": 250,  # 250 dbps = 0.25% (default)
+    "wide": 500,  # 500 dbps = 0.5% (position trading)
+    "macro": 1000,  # 1000 dbps = 1% (long-term)
+}
+
+# =============================================================================
+# Asset Class Detection Helpers
+# =============================================================================
+
+# Common crypto base symbols for detection
+_CRYPTO_BASES: frozenset[str] = frozenset(
+    {
+        "BTC",
+        "ETH",
+        "BNB",
+        "SOL",
+        "XRP",
+        "ADA",
+        "DOGE",
+        "DOT",
+        "MATIC",
+        "AVAX",
+        "LINK",
+        "UNI",
+        "ATOM",
+        "LTC",
+        "ETC",
+        "XLM",
+        "ALGO",
+        "NEAR",
+        "FIL",
+        "APT",
+    }
+)
+
+# Common forex base/quote currencies
+_FOREX_CURRENCIES: frozenset[str] = frozenset(
+    {
+        "EUR",
+        "USD",
+        "GBP",
+        "JPY",
+        "CHF",
+        "AUD",
+        "NZD",
+        "CAD",
+        "SEK",
+        "NOK",
+    }
+)
+
+# =============================================================================
+# Continuity Validation Constants
+# =============================================================================
+
+# Default tolerance for junction continuity validation (0.01% = 0.0001)
+CONTINUITY_TOLERANCE_PCT: float = 0.0001
+
+# =============================================================================
+# Exchange Session Column Names (Ouroboros feature)
+# =============================================================================
+
+EXCHANGE_SESSION_COLUMNS: tuple[str, ...] = (
+    "exchange_session_sydney",
+    "exchange_session_tokyo",
+    "exchange_session_london",
+    "exchange_session_newyork",
+)
+
+# =============================================================================
+# All Optional Columns (for cache operations)
+# =============================================================================
+# Union of microstructure + exchange session columns
+
+ALL_OPTIONAL_COLUMNS: tuple[str, ...] = (
+    *MICROSTRUCTURE_COLUMNS,
+    *EXCHANGE_SESSION_COLUMNS,
+)
+
+# =============================================================================
+# Memory Guard Registry (Issue #49)
+# =============================================================================
+# Each guard prevents a specific memory exhaustion pattern.
+# Code references use "# MEM-XXX:" comments for traceability.
+#
+# Guards are organized by pipeline stage:
+#   Loading       -> MEM-001, MEM-004, MEM-007, MEM-010
+#   Processing    -> MEM-002, MEM-003
+#   Concatenation -> MEM-006, MEM-008
+#   Testing       -> MEM-005
+#   Process-level -> MEM-009
+#
+# When adding a new guard, assign the next number and add an entry here.
+
+MEM_GUARDS: dict[str, dict[str, str]] = {
+    "MEM-001": {
+        "description": "Avoid map_elements() in Parquet parsing (native Polars ops)",
+        "location": "storage/parquet.py:185",
+        "stage": "loading",
+    },
+    "MEM-002": {
+        "description": "Process trades in 100K chunks (~15 MB each)",
+        "location": "orchestration/helpers.py:274, processors/api.py:371",
+        "stage": "processing",
+    },
+    "MEM-003": {
+        "description": "Select columns BEFORE .collect() on LazyFrame",
+        "location": "orchestration/helpers.py:236, processors/api.py:341",
+        "stage": "processing",
+    },
+    "MEM-004": {
+        "description": "Guard read_ticks() with size estimation before .collect()",
+        "location": "storage/parquet.py",
+        "stage": "loading",
+    },
+    "MEM-005": {
+        "description": "gc.collect() after each test to prevent accumulation",
+        "location": "tests/conftest.py:26",
+        "stage": "testing",
+    },
+    "MEM-006": {
+        "description": "Use Polars concat instead of pandas for memory efficiency",
+        "location": "conversion.py:107, orchestration/precompute.py:404",
+        "stage": "concatenation",
+    },
+    "MEM-007": {
+        "description": "Guard deprecated _fetch_binance() with date range limit",
+        "location": "orchestration/helpers.py:136",
+        "stage": "loading",
+    },
+    "MEM-008": {
+        "description": "Streaming bar accumulation (avoid holding all in memory)",
+        "location": "orchestration/range_bars.py",
+        "stage": "concatenation",
+    },
+    "MEM-009": {
+        "description": "Process-level RLIMIT_AS cap (MemoryError instead of OOM kill)",
+        "location": "resource_guard.py",
+        "stage": "process",
+    },
+    "MEM-010": {
+        "description": "Pre-flight memory estimation before tick loading",
+        "location": "resource_guard.py",
+        "stage": "loading",
+    },
+}
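
To make the dbps arithmetic concrete: a preset value divided by 100,000 gives the price fraction a range bar uses. A short sketch; `dbps_to_fraction` is a hypothetical helper for illustration, not part of the module:

from rangebar.constants import (
    THRESHOLD_DECIMAL_MAX,
    THRESHOLD_DECIMAL_MIN,
    THRESHOLD_PRESETS,
)


def dbps_to_fraction(dbps: int) -> float:
    # Hypothetical helper: 1 dbps = 0.001% = 0.00001 of price.
    if not THRESHOLD_DECIMAL_MIN <= dbps <= THRESHOLD_DECIMAL_MAX:
        msg = f"{dbps} dbps outside [{THRESHOLD_DECIMAL_MIN}, {THRESHOLD_DECIMAL_MAX}]"
        raise ValueError(msg)
    return dbps / 100_000


assert dbps_to_fraction(THRESHOLD_PRESETS["medium"]) == 0.0025  # 250 dbps = 0.25%
assert dbps_to_fraction(THRESHOLD_PRESETS["macro"]) == 0.01  # 1000 dbps = 1%
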
rangebar/conversion.py ADDED
@@ -0,0 +1,177 @@
+"""Conversion utilities for rangebar-py.
+
+This module provides dtype conversion, normalization, and DataFrame manipulation
+utilities used throughout the codebase. These functions handle:
+- Converting bar dictionaries to Polars/pandas DataFrames
+- Concatenating DataFrames with consistent dtypes
+- Normalizing datetime precision (Issue #44 fix)
+- Converting PyArrow dtypes to numpy for compatibility
+
+SSoT (Single Source of Truth) for:
+- _bars_list_to_polars: Convert bar dicts to Polars DataFrame
+- _concat_pandas_via_polars: Memory-efficient DataFrame concatenation
+- normalize_temporal_precision: Fix mixed datetime precision
+- normalize_arrow_dtypes: Convert PyArrow to numpy dtypes
+"""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+import pandas as pd
+
+if TYPE_CHECKING:
+    import polars as pl
+
+
+def _bars_list_to_polars(
+    bars: list[dict],
+    include_microstructure: bool = False,
+) -> pl.DataFrame:
+    """Convert list of bar dicts to Polars DataFrame in backtesting.py format.
+
+    Parameters
+    ----------
+    bars : list[dict]
+        List of bar dictionaries from processor
+    include_microstructure : bool
+        Include microstructure columns
+
+    Returns
+    -------
+    pl.DataFrame
+        DataFrame with a timestamp column and capitalized OHLCV columns
+    """
+    import polars as pl
+
+    if not bars:
+        return pl.DataFrame()
+
+    bars_df = pl.DataFrame(bars)
+
+    # Convert timestamp to datetime
+    if "timestamp" in bars_df.columns:
+        bars_df = bars_df.with_columns(
+            pl.col("timestamp")
+            .str.to_datetime(format="%Y-%m-%dT%H:%M:%S%.f%:z")
+            .alias("timestamp")
+        )
+
+    # Rename to backtesting.py format
+    rename_map = {
+        "open": "Open",
+        "high": "High",
+        "low": "Low",
+        "close": "Close",
+        "volume": "Volume",
+    }
+    bars_df = bars_df.rename(
+        {k: v for k, v in rename_map.items() if k in bars_df.columns}
+    )
+
+    # Select columns
+    base_cols = ["timestamp", "Open", "High", "Low", "Close", "Volume"]
+    if include_microstructure:
+        # Include all columns
+        return bars_df
+    # Only OHLCV columns
+    available = [c for c in base_cols if c in bars_df.columns]
+    return bars_df.select(available)
+
+
+def normalize_temporal_precision(pldf: pl.DataFrame) -> pl.DataFrame:
+    """Normalize datetime columns to microsecond precision.
+
+    This prevents SchemaError when concatenating DataFrames with mixed
+    datetime precision (e.g., μs vs ns). See Issue #44.
+
+    Parameters
+    ----------
+    pldf : pl.DataFrame
+        Polars DataFrame to normalize
+
+    Returns
+    -------
+    pl.DataFrame
+        DataFrame with all datetime columns cast to microsecond precision
+    """
+    import polars as pl
+
+    for col in pldf.columns:
+        if pldf[col].dtype.is_temporal():
+            pldf = pldf.with_columns(pl.col(col).dt.cast_time_unit("us"))
+    return pldf
+
+
+def _concat_pandas_via_polars(dfs: list[pd.DataFrame]) -> pd.DataFrame:
+    """Concatenate pandas DataFrames using Polars for memory efficiency (MEM-006).
+
+    This function uses Polars' more efficient concatenation instead of pd.concat,
+    reducing memory fragmentation and improving performance for large datasets.
+
+    Parameters
+    ----------
+    dfs : list[pd.DataFrame]
+        List of pandas DataFrames to concatenate
+
+    Returns
+    -------
+    pd.DataFrame
+        Concatenated DataFrame with sorted DatetimeIndex
+    """
+    import polars as pl
+
+    if not dfs:
+        return pd.DataFrame()
+
+    if len(dfs) == 1:
+        return dfs[0]
+
+    # Convert to Polars
+    pl_dfs = [pl.from_pandas(df.reset_index()) for df in dfs]
+
+    # Normalize datetime columns to consistent precision (μs) before concat
+    # This prevents SchemaError when months have mixed precision (Issue #44)
+    normalized = [normalize_temporal_precision(pldf) for pldf in pl_dfs]
+    combined = pl.concat(normalized)
+
+    # Sort by timestamp/index column
+    index_col = "timestamp" if "timestamp" in combined.columns else combined.columns[0]
+    combined = combined.sort(index_col)
+
+    # Convert back to pandas with proper index
+    result = combined.to_pandas()
+    if index_col in result.columns:
+        result = result.set_index(index_col)
+
+    return result
+
+
+def normalize_arrow_dtypes(
+    df: pd.DataFrame, columns: list[str] | None = None
+) -> pd.DataFrame:
+    """Convert PyArrow dtypes to numpy for compatibility.
+
+    ClickHouse query_df_arrow returns double[pyarrow], but process_trades
+    returns float64. This function normalizes the dtypes.
+
+    Parameters
+    ----------
+    df : pd.DataFrame
+        DataFrame potentially containing PyArrow dtypes
+    columns : list[str] | None
+        Columns to normalize. If None, normalizes OHLCV columns.
+
+    Returns
+    -------
+    pd.DataFrame
+        DataFrame with numpy dtypes
+    """
+    if columns is None:
+        columns = ["Open", "High", "Low", "Close", "Volume"]
+
+    for col in columns:
+        if col in df.columns:
+            df[col] = df[col].astype("float64")
+
+    return df
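
A short sketch of the Issue #44 scenario these helpers address, assuming pandas 2.x for the non-nanosecond dtype (`_concat_pandas_via_polars` is an internal helper, so this is illustration rather than public API):

import pandas as pd

from rangebar.conversion import _concat_pandas_via_polars

# Two monthly frames whose DatetimeIndex precision differs (ns vs us) --
# concatenating these in Polars without normalization raises SchemaError.
idx_ns = pd.DatetimeIndex(["2024-01-01"], name="timestamp")  # datetime64[ns]
idx_us = pd.DatetimeIndex(["2024-02-01"], name="timestamp").astype("datetime64[us]")
jan = pd.DataFrame({"Close": [100.0]}, index=idx_ns)
feb = pd.DataFrame({"Close": [101.0]}, index=idx_us)

combined = _concat_pandas_via_polars([jan, feb])
print(combined.index.dtype)  # datetime64[us] -- normalized to microseconds and sorted
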