gapless-crypto-clickhouse 7.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gapless_crypto_clickhouse/__init__.py +147 -0
- gapless_crypto_clickhouse/__probe__.py +349 -0
- gapless_crypto_clickhouse/api.py +1032 -0
- gapless_crypto_clickhouse/clickhouse/__init__.py +17 -0
- gapless_crypto_clickhouse/clickhouse/config.py +119 -0
- gapless_crypto_clickhouse/clickhouse/connection.py +269 -0
- gapless_crypto_clickhouse/clickhouse/schema.sql +98 -0
- gapless_crypto_clickhouse/clickhouse/schema_validator.py +312 -0
- gapless_crypto_clickhouse/clickhouse_query.py +642 -0
- gapless_crypto_clickhouse/collectors/__init__.py +21 -0
- gapless_crypto_clickhouse/collectors/binance_public_data_collector.py +1994 -0
- gapless_crypto_clickhouse/collectors/clickhouse_bulk_loader.py +446 -0
- gapless_crypto_clickhouse/collectors/concurrent_collection_orchestrator.py +407 -0
- gapless_crypto_clickhouse/collectors/csv_format_detector.py +123 -0
- gapless_crypto_clickhouse/collectors/httpx_downloader.py +395 -0
- gapless_crypto_clickhouse/collectors/hybrid_url_generator.py +316 -0
- gapless_crypto_clickhouse/exceptions.py +145 -0
- gapless_crypto_clickhouse/gap_filling/__init__.py +1 -0
- gapless_crypto_clickhouse/gap_filling/safe_file_operations.py +439 -0
- gapless_crypto_clickhouse/gap_filling/universal_gap_filler.py +757 -0
- gapless_crypto_clickhouse/llms.txt +268 -0
- gapless_crypto_clickhouse/probe.py +235 -0
- gapless_crypto_clickhouse/py.typed +0 -0
- gapless_crypto_clickhouse/query_api.py +374 -0
- gapless_crypto_clickhouse/resume/__init__.py +12 -0
- gapless_crypto_clickhouse/resume/intelligent_checkpointing.py +383 -0
- gapless_crypto_clickhouse/utils/__init__.py +29 -0
- gapless_crypto_clickhouse/utils/error_handling.py +202 -0
- gapless_crypto_clickhouse/utils/etag_cache.py +194 -0
- gapless_crypto_clickhouse/utils/timeframe_constants.py +90 -0
- gapless_crypto_clickhouse/utils/timestamp_format_analyzer.py +256 -0
- gapless_crypto_clickhouse/utils/timestamp_utils.py +130 -0
- gapless_crypto_clickhouse/validation/__init__.py +36 -0
- gapless_crypto_clickhouse/validation/csv_validator.py +677 -0
- gapless_crypto_clickhouse/validation/models.py +220 -0
- gapless_crypto_clickhouse/validation/storage.py +502 -0
- gapless_crypto_clickhouse-7.1.0.dist-info/METADATA +1277 -0
- gapless_crypto_clickhouse-7.1.0.dist-info/RECORD +40 -0
- gapless_crypto_clickhouse-7.1.0.dist-info/WHEEL +4 -0
- gapless_crypto_clickhouse-7.1.0.dist-info/licenses/LICENSE +21 -0
|
"""
ClickHouse connection and schema management for gapless-crypto-data v4.0.0.

Provides ClickHouseConnection class for database operations using clickhouse-driver.
Replaces QuestDB implementation (ADR-0003) for future-proofing and ecosystem maturity.

NOTE(review): connection.py states it uses clickhouse-connect over HTTP (ADR-0023),
so the "clickhouse-driver" reference above looks stale -- confirm and update.

Usage:
    from gapless_crypto_clickhouse.clickhouse import ClickHouseConnection

    with ClickHouseConnection() as conn:
        conn.execute("SELECT 1")
"""

from .config import ClickHouseConfig
from .connection import ClickHouseConnection

__all__ = ["ClickHouseConnection", "ClickHouseConfig"]
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
"""
|
|
2
|
+
ClickHouse configuration for gapless-crypto-data v4.0.0.
|
|
3
|
+
|
|
4
|
+
Environment variable support for connection parameters.
|
|
5
|
+
Follows same pattern as QuestDBConfig (ADR-0003).
|
|
6
|
+
|
|
7
|
+
Error Handling: Raise and propagate (no fallback, no defaults for required params)
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
import os
|
|
11
|
+
from dataclasses import dataclass
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
@dataclass
class ClickHouseConfig:
    """
    Connection settings for a ClickHouse server.

    Fields and their defaults:
        host: server hostname ("localhost")
        port: native-protocol port (9000)
        http_port: HTTP interface port (8123)
        database: target database ("default")
        user: login name ("default")
        password: login password ("" by default)
        secure: use TLS/SSL (False; ClickHouse Cloud requires True)

    Recognized environment variables (see from_env()):
        CLICKHOUSE_HOST, CLICKHOUSE_PORT, CLICKHOUSE_HTTP_PORT,
        CLICKHOUSE_DATABASE, CLICKHOUSE_USER, CLICKHOUSE_PASSWORD,
        CLICKHOUSE_SECURE ('true' / '1' / 'yes' enables TLS)

    Example:
        config = ClickHouseConfig.from_env()
        config = ClickHouseConfig(host="clickhouse.example.com", port=9000)
    """

    host: str = "localhost"
    port: int = 9000
    http_port: int = 8123
    database: str = "default"
    user: str = "default"
    password: str = ""
    secure: bool = False

    @classmethod
    def from_env(cls) -> "ClickHouseConfig":
        """
        Build a configuration from environment variables, falling back to defaults.

        Returns:
            ClickHouseConfig populated from the CLICKHOUSE_* variables.

        Raises:
            ValueError: when CLICKHOUSE_PORT or CLICKHOUSE_HTTP_PORT is not an integer.
        """
        try:
            native_port = int(os.getenv("CLICKHOUSE_PORT", "9000"))
            web_port = int(os.getenv("CLICKHOUSE_HTTP_PORT", "8123"))
        except ValueError as exc:
            raise ValueError(
                f"Invalid CLICKHOUSE_PORT or CLICKHOUSE_HTTP_PORT (must be integer): {exc}"
            ) from exc

        tls_enabled = os.getenv("CLICKHOUSE_SECURE", "false").lower() in ("true", "1", "yes")
        return cls(
            host=os.getenv("CLICKHOUSE_HOST", "localhost"),
            port=native_port,
            http_port=web_port,
            database=os.getenv("CLICKHOUSE_DATABASE", "default"),
            user=os.getenv("CLICKHOUSE_USER", "default"),
            password=os.getenv("CLICKHOUSE_PASSWORD", ""),
            secure=tls_enabled,
        )

    def validate(self) -> None:
        """
        Check that all parameters are usable.

        Raises:
            ValueError: if host/database/user is empty or a port is out of range.
        """
        if not self.host:
            raise ValueError("host cannot be empty")

        # Both ports must be valid TCP port numbers.
        for label, value in (("port", self.port), ("http_port", self.http_port)):
            if not 1 <= value <= 65535:
                raise ValueError(f"{label} must be between 1 and 65535, got {value}")

        if not self.database:
            raise ValueError("database cannot be empty")

        if not self.user:
            raise ValueError("user cannot be empty")

    def __repr__(self) -> str:
        """Debug representation with the password masked."""
        masked = "***"
        return (
            "ClickHouseConfig("
            f"host='{self.host}', port={self.port}, http_port={self.http_port}, "
            f"database='{self.database}', user='{self.user}', "
            f"password='{masked}', secure={self.secure})"
        )
|
@@ -0,0 +1,269 @@
|
|
|
1
|
+
"""
|
|
2
|
+
ClickHouse connection management for gapless-crypto-clickhouse v6.0.0.
|
|
3
|
+
|
|
4
|
+
Provides context-managed connection to ClickHouse using clickhouse-connect with Apache Arrow support.
|
|
5
|
+
Replaces clickhouse-driver (ADR-0023) for 3x faster queries and 4x less memory.
|
|
6
|
+
|
|
7
|
+
Error Handling: Raise and propagate (no fallback, no retry, no silent failures)
|
|
8
|
+
SLOs: Availability (connection health checks), Correctness (query validation),
|
|
9
|
+
Observability (connection logging), Maintainability (standard HTTP client)
|
|
10
|
+
|
|
11
|
+
Usage:
|
|
12
|
+
from gapless_crypto_clickhouse.clickhouse import ClickHouseConnection
|
|
13
|
+
|
|
14
|
+
with ClickHouseConnection() as conn:
|
|
15
|
+
df = conn.query_dataframe("SELECT * FROM ohlcv FINAL LIMIT 10")
|
|
16
|
+
print(df) # pandas DataFrame with Arrow-optimized internals
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
import logging
|
|
20
|
+
from typing import Any, Dict, List, Optional, Tuple
|
|
21
|
+
|
|
22
|
+
import clickhouse_connect
|
|
23
|
+
import pandas as pd
|
|
24
|
+
|
|
25
|
+
from .config import ClickHouseConfig
|
|
26
|
+
|
|
27
|
+
logger = logging.getLogger(__name__)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class ClickHouseConnection:
    """
    Context-managed ClickHouse connection with Apache Arrow support.

    Provides execute() for row-tuple queries, query_dataframe() for
    Arrow-optimized pandas results, and insert_dataframe() for bulk inserts.
    Uses the HTTP protocol (port 8123 local, 8443 ClickHouse Cloud).

    Attributes:
        config: ClickHouse configuration (ClickHouseConfig)
        client: clickhouse-connect Client instance

    Error Handling:
        - Connection failures raise Exception
        - Query failures raise Exception
        - No retries, no fallbacks (raise and propagate policy)

    Example:
        with ClickHouseConnection() as conn:
            result = conn.execute("SELECT COUNT(*) FROM ohlcv")
            df = conn.query_dataframe("SELECT * FROM ohlcv FINAL LIMIT 10")
            conn.insert_dataframe(pd.DataFrame({"col": [1, 2, 3]}), "test_table")
    """

    def __init__(self, config: Optional[ClickHouseConfig] = None) -> None:
        """
        Initialize ClickHouse connection.

        Args:
            config: ClickHouse configuration (default: ClickHouseConfig.from_env())

        Raises:
            ValueError: If configuration fails validation
            Exception: If the client cannot be created
        """
        self.config = config or ClickHouseConfig.from_env()
        self.config.validate()

        logger.info(
            f"Initializing ClickHouse connection: {self.config.host}:{self.config.http_port} "
            f"(HTTP protocol with Arrow support)"
        )

        try:
            # clickhouse-connect uses HTTP protocol (port 8123 local, 8443 Cloud)
            self.client = clickhouse_connect.get_client(
                host=self.config.host,
                port=self.config.http_port,
                database=self.config.database,
                username=self.config.user,
                password=self.config.password,
                secure=self.config.secure,  # Enable TLS/SSL for ClickHouse Cloud
                # Performance settings
                settings={
                    "max_block_size": 100000,  # Batch size for queries
                },
            )
        except Exception as e:
            raise Exception(
                f"Failed to connect to ClickHouse at {self.config.host}:{self.config.http_port}: {e}"
            ) from e

    def __enter__(self) -> "ClickHouseConnection":
        """Context manager entry: health check followed by schema validation (ADR-0024)."""
        if not self.health_check():
            raise Exception("ClickHouse health check failed during context manager entry")

        # Imported lazily; presumably avoids a circular import at module load -- TODO confirm
        from .schema_validator import SchemaValidationError, SchemaValidator

        try:
            validator = SchemaValidator(self)
            validator.validate_schema()
            logger.info("Schema validation passed")
        except SchemaValidationError as e:
            logger.error(f"Schema validation failed: {e}")
            raise

        logger.debug("ClickHouse connection opened")
        return self

    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
        """Context manager exit: close the underlying client."""
        if self.client:
            self.client.close()
            logger.debug("ClickHouse connection closed")

    def health_check(self) -> bool:
        """
        Verify the ClickHouse connection is alive via `SELECT 1`.

        Returns:
            True if the server answered 1, False otherwise. This method
            deliberately swallows exceptions and reports False instead
            (callers decide whether to raise).
        """
        try:
            result = self.client.command("SELECT 1")
            if result != 1:
                logger.error(f"Health check failed: unexpected result {result}")
                return False
            logger.debug("ClickHouse health check passed")
            return True
        except Exception as e:
            logger.error(f"ClickHouse health check failed: {e}")
            return False

    def execute(self, query: str, params: Optional[Dict[str, Any]] = None) -> List[Tuple[Any, ...]]:
        """
        Execute a SQL query and return the raw result rows.

        Args:
            query: SQL string; use {name:Type} placeholders (clickhouse-connect format)
            params: Mapping of placeholder names to values

        Returns:
            List of result tuples

        Raises:
            Exception: If query execution fails

        Example:
            result = conn.execute(
                "SELECT * FROM ohlcv WHERE symbol = {symbol:String}",
                params={'symbol': 'BTCUSDT'}
            )
        """
        try:
            logger.debug(f"Executing query: {query[:100]}...")
            result = self.client.query(query, parameters=params or {})
            rows = result.result_rows
            logger.debug(f"Query returned {len(rows)} rows")
            return rows
        except Exception as e:
            raise Exception(f"Query execution failed: {query[:100]}...\nError: {e}") from e

    def query_dataframe(self, query: str, params: Optional[Dict[str, Any]] = None) -> pd.DataFrame:
        """
        Execute a SQL query and return results as a pandas DataFrame.

        Args:
            query: SQL string; use {name:Type} placeholders (clickhouse-connect format)
            params: Mapping of placeholder names to values

        Returns:
            pandas DataFrame with the query results (built via Apache Arrow)

        Raises:
            Exception: If query execution fails
        """
        try:
            logger.debug(f"Executing query (DataFrame, Arrow-optimized): {query[:100]}...")
            # query_df_arrow builds the DataFrame through Apache Arrow internals
            df = self.client.query_df_arrow(query, parameters=params or {})
            logger.debug(f"Query returned {len(df)} rows (Arrow-optimized)")
            return df
        except Exception as e:
            raise Exception(f"Query execution failed: {query[:100]}...\nError: {e}") from e

    def insert_dataframe(self, df: pd.DataFrame, table: str) -> int:
        """
        Bulk insert a DataFrame into a ClickHouse table.

        Args:
            df: DataFrame with rows to insert; an empty frame is a no-op
            table: Target table name

        Returns:
            Number of rows inserted (0 for an empty DataFrame)

        Raises:
            ValueError: If the DataFrame schema is rejected by the client
            Exception: If the insert fails for any other reason
        """
        if df.empty:
            logger.warning(f"Empty DataFrame, skipping insert to {table}")
            return 0

        try:
            logger.info(f"Inserting {len(df)} rows to {table}")

            # Use standard insert (Arrow benefits are mainly on query side)
            self.client.insert_df(table, df)

            logger.info(f"Successfully inserted {len(df)} rows to {table}")
            return len(df)

        except ValueError as e:
            # BUG FIX: ValueError must be caught BEFORE the broad Exception handler.
            # ValueError subclasses Exception, so the original ordering (Exception
            # first, ValueError second) made this branch unreachable dead code.
            raise ValueError(f"Invalid DataFrame schema for table {table}: {e}") from e
        except Exception as e:
            raise Exception(f"Bulk insert failed for table {table} ({len(df)} rows): {e}") from e
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
-- ClickHouse Schema for gapless-crypto-data v4.0.0
-- ADR-0005: ClickHouse Migration for Future-Proofing
--
-- ReplacingMergeTree engine with deterministic versioning for zero-gap guarantee.
-- Preserves ADR-0004 futures support (instrument_type column for spot/futures).
--
-- Error Handling: Raise and propagate (no silent failures)
-- SLOs: Availability, Correctness (zero-gap via _version), Observability, Maintainability

CREATE TABLE IF NOT EXISTS ohlcv (
    -- Primary timestamp (microsecond precision - ADR-0021)
    -- Upgraded from DateTime64(3) to support Binance's 2025-01-01 format transition:
    --   Spot data: microseconds (16 digits) after 2025-01-01
    --   Futures data: milliseconds (13 digits), converted to microseconds during ingestion
    timestamp DateTime64(6) CODEC(DoubleDelta, LZ4),

    -- Metadata columns (low-cardinality, optimized for indexing)
    symbol LowCardinality(String) CODEC(ZSTD(3)),          -- Trading pair (e.g., "BTCUSDT")
    timeframe LowCardinality(String) CODEC(ZSTD(3)),       -- Timeframe (e.g., "1h", "1mo")
    instrument_type LowCardinality(String) CODEC(ZSTD(3)), -- 'spot' or 'futures-um' (ADR-0004, ADR-0021)
    data_source LowCardinality(String) CODEC(ZSTD(3)),     -- 'cloudfront'

    -- OHLCV data (core price/volume metrics)
    open Float64 CODEC(Gorilla, LZ4),
    high Float64 CODEC(Gorilla, LZ4),
    low Float64 CODEC(Gorilla, LZ4),
    close Float64 CODEC(Gorilla, LZ4),
    volume Float64 CODEC(Gorilla, LZ4),

    -- Additional microstructure metrics (Binance 11-column format)
    close_time DateTime64(6) CODEC(DoubleDelta, LZ4), -- Upgraded to microsecond precision
    quote_asset_volume Float64 CODEC(Gorilla, LZ4),
    number_of_trades Int64 CODEC(Delta, LZ4),
    taker_buy_base_asset_volume Float64 CODEC(Gorilla, LZ4),
    taker_buy_quote_asset_volume Float64 CODEC(Gorilla, LZ4),

    -- Futures-specific data (ADR-0021, v3.2.0+)
    funding_rate Nullable(Float64) CODEC(Gorilla, LZ4), -- NULL for spot, initially NULL for futures

    -- Deduplication support (application-level, preserves zero-gap guarantee)
    _version UInt64 CODEC(Delta, LZ4), -- Deterministic hash of row content
    -- NOTE(review): ReplacingMergeTree takes no sign column (a sign column belongs to
    -- CollapsingMergeTree); _sign appears unused by this engine -- confirm it is still needed.
    _sign Int8 DEFAULT 1 -- ReplacingMergeTree sign (1 for active rows)

) ENGINE = ReplacingMergeTree(_version)
ORDER BY (timestamp, symbol, timeframe, instrument_type)
PARTITION BY toYYYYMMDD(timestamp)
SETTINGS
    index_granularity = 8192,       -- Default granularity (good for time-series)
    allow_nullable_key = 0,         -- Disallow NULL in ORDER BY keys (data quality)
    merge_with_ttl_timeout = 86400; -- NOTE(review): this setting schedules TTL merges; it does
                                    -- not bound general dedup merges -- confirm the "merge within
                                    -- 24 hours (background deduplication)" intent.

-- Rationale:
-- 1. ReplacingMergeTree(_version): Handles duplicates via background merges
--    - _version is deterministic hash of (timestamp, OHLCV, symbol, timeframe, instrument_type)
--    - Identical writes -> identical _version -> consistent merge outcome
--    - Preserves zero-gap guarantee via deterministic deduplication
--
-- 2. ORDER BY composite key: (timestamp, symbol, timeframe, instrument_type)
--    - Optimizes queries filtering by these columns
--    - ClickHouse uses ORDER BY as primary key (unlike PostgreSQL)
--
-- 3. PARTITION BY toYYYYMMDD(timestamp): Daily partitions
--    - Matches ADR-0003 QuestDB partition strategy (PARTITION BY DAY)
--    - Enables efficient partition pruning for date-range queries
--
-- 4. LowCardinality(String): ClickHouse equivalent to QuestDB SYMBOL
--    - Optimizes storage for low-cardinality columns (symbol, timeframe, etc.)
--    - Automatic dictionary encoding (similar to SYMBOL capacity)
--
-- 5. CODEC compression:
--    - DoubleDelta: Optimized for timestamps (sequential values)
--    - Gorilla: Optimized for float values (OHLCV data)
--    - Delta: Optimized for integer sequences (number_of_trades)
--    - ZSTD: General-purpose compression for string columns
--
-- 6. DateTime64(6): Microsecond precision (ADR-0021)
--    - Upgraded from DateTime64(3) to support Binance's 2025-01-01 format transition
--    - Spot data: microseconds (16 digits) after 2025-01-01
--    - Futures data: milliseconds (13 digits), converted to microseconds during ingestion
--    - Universal microsecond precision prevents timestamp errors
--    - ClickHouse equivalent to QuestDB TIMESTAMP type

-- Zero-Gap Guarantee:
-- Unlike QuestDB DEDUP ENABLE UPSERT KEYS (immediate consistency),
-- ClickHouse uses eventual consistency (duplicates visible until merge).
-- Application-level deterministic versioning ensures consistent merge outcomes.
--
-- Query pattern for deduplicated results:
--   SELECT * FROM ohlcv FINAL WHERE symbol = 'BTCUSDT' AND timeframe = '1h';
--
-- FINAL keyword forces deduplication at query time (10-30% performance overhead).
-- This is an acceptable trade-off for zero-gap guarantee preservation.

-- Migration from QuestDB (ADR-0003):
--   QuestDB SYMBOL -> ClickHouse LowCardinality(String)
--   QuestDB DEDUP ENABLE UPSERT KEYS -> ClickHouse ReplacingMergeTree(_version)
--   QuestDB PARTITION BY DAY -> ClickHouse PARTITION BY toYYYYMMDD(timestamp)
--   QuestDB PostgreSQL wire protocol -> ClickHouse native protocol (clickhouse-driver)
--   NOTE(review): connection.py states clickhouse-connect over HTTP is now used (ADR-0023);
--   the clickhouse-driver/native-protocol line above looks stale -- confirm and update.