gapless-crypto-clickhouse 7.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gapless_crypto_clickhouse/__init__.py +147 -0
- gapless_crypto_clickhouse/__probe__.py +349 -0
- gapless_crypto_clickhouse/api.py +1032 -0
- gapless_crypto_clickhouse/clickhouse/__init__.py +17 -0
- gapless_crypto_clickhouse/clickhouse/config.py +119 -0
- gapless_crypto_clickhouse/clickhouse/connection.py +269 -0
- gapless_crypto_clickhouse/clickhouse/schema.sql +98 -0
- gapless_crypto_clickhouse/clickhouse/schema_validator.py +312 -0
- gapless_crypto_clickhouse/clickhouse_query.py +642 -0
- gapless_crypto_clickhouse/collectors/__init__.py +21 -0
- gapless_crypto_clickhouse/collectors/binance_public_data_collector.py +1994 -0
- gapless_crypto_clickhouse/collectors/clickhouse_bulk_loader.py +446 -0
- gapless_crypto_clickhouse/collectors/concurrent_collection_orchestrator.py +407 -0
- gapless_crypto_clickhouse/collectors/csv_format_detector.py +123 -0
- gapless_crypto_clickhouse/collectors/httpx_downloader.py +395 -0
- gapless_crypto_clickhouse/collectors/hybrid_url_generator.py +316 -0
- gapless_crypto_clickhouse/exceptions.py +145 -0
- gapless_crypto_clickhouse/gap_filling/__init__.py +1 -0
- gapless_crypto_clickhouse/gap_filling/safe_file_operations.py +439 -0
- gapless_crypto_clickhouse/gap_filling/universal_gap_filler.py +757 -0
- gapless_crypto_clickhouse/llms.txt +268 -0
- gapless_crypto_clickhouse/probe.py +235 -0
- gapless_crypto_clickhouse/py.typed +0 -0
- gapless_crypto_clickhouse/query_api.py +374 -0
- gapless_crypto_clickhouse/resume/__init__.py +12 -0
- gapless_crypto_clickhouse/resume/intelligent_checkpointing.py +383 -0
- gapless_crypto_clickhouse/utils/__init__.py +29 -0
- gapless_crypto_clickhouse/utils/error_handling.py +202 -0
- gapless_crypto_clickhouse/utils/etag_cache.py +194 -0
- gapless_crypto_clickhouse/utils/timeframe_constants.py +90 -0
- gapless_crypto_clickhouse/utils/timestamp_format_analyzer.py +256 -0
- gapless_crypto_clickhouse/utils/timestamp_utils.py +130 -0
- gapless_crypto_clickhouse/validation/__init__.py +36 -0
- gapless_crypto_clickhouse/validation/csv_validator.py +677 -0
- gapless_crypto_clickhouse/validation/models.py +220 -0
- gapless_crypto_clickhouse/validation/storage.py +502 -0
- gapless_crypto_clickhouse-7.1.0.dist-info/METADATA +1277 -0
- gapless_crypto_clickhouse-7.1.0.dist-info/RECORD +40 -0
- gapless_crypto_clickhouse-7.1.0.dist-info/WHEEL +4 -0
- gapless_crypto_clickhouse-7.1.0.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,312 @@
|
|
|
1
|
+
"""ClickHouse Schema Validator for v6.0.0.
|
|
2
|
+
|
|
3
|
+
Validates runtime schema matches expected schema.sql definition.
|
|
4
|
+
Raises SchemaValidationError on mismatch (no fallback, no retry).
|
|
5
|
+
|
|
6
|
+
**SLO Focus**: Correctness (prevents 1000x data loss from DateTime64(3) vs DateTime64(6) mismatch)
|
|
7
|
+
|
|
8
|
+
**ADR**: ADR-0024 (Comprehensive Validation Canonicity)
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
import logging
|
|
12
|
+
from dataclasses import dataclass, field
|
|
13
|
+
from typing import Dict, List, Tuple
|
|
14
|
+
|
|
15
|
+
logger = logging.getLogger(__name__)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
@dataclass
class ExpectedSchema:
    """Canonical ohlcv table schema (v4.0.0+ with DateTime64(6) microsecond precision).

    Reference: src/gapless_crypto_clickhouse/clickhouse/schema.sql
    """

    # Column name -> ClickHouse type, in schema.sql declaration order.
    columns: Dict[str, str] = field(
        default_factory=lambda: dict(
            # -- Metadata columns --
            symbol="LowCardinality(String)",
            timeframe="LowCardinality(String)",
            instrument_type="LowCardinality(String)",
            data_source="LowCardinality(String)",
            timestamp="DateTime64(6)",  # Microsecond precision (ADR-0021)
            # -- OHLCV columns --
            open="Float64",
            high="Float64",
            low="Float64",
            close="Float64",
            volume="Float64",
            # -- Microstructure columns --
            close_time="DateTime64(6)",  # Microsecond precision (ADR-0021)
            quote_asset_volume="Float64",
            number_of_trades="Int64",
            taker_buy_base_asset_volume="Float64",
            taker_buy_quote_asset_volume="Float64",
            # -- Futures-specific column --
            funding_rate="Nullable(Float64)",
            # -- Internal deduplication columns --
            _version="UInt64",
            _sign="Int8",
        )
    )

    engine: str = "ReplacingMergeTree"
    partition_key: str = "toYYYYMMDD(timestamp)"
    sorting_key: Tuple[str, ...] = ("timestamp", "symbol", "timeframe", "instrument_type")

    # Column name -> expected compression codec (optional validation only).
    expected_codecs: Dict[str, str] = field(
        default_factory=lambda: dict(
            timestamp="DoubleDelta",
            close_time="DoubleDelta",
            open="Gorilla",
            high="Gorilla",
            low="Gorilla",
            close="Gorilla",
            volume="Gorilla",
            quote_asset_volume="Gorilla",
            taker_buy_base_asset_volume="Gorilla",
            taker_buy_quote_asset_volume="Gorilla",
            funding_rate="Gorilla",
        )
    )
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
class SchemaValidationError(Exception):
    """Signals a mismatch between the runtime ClickHouse schema and schema.sql.

    **Behavior**: STRICT — no fallback, no retry, no silent failures;
    the error propagates to the caller (SLO requirement).
    """
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
class SchemaValidator:
    """Validates ClickHouse schema at runtime against expected schema.sql definition.

    **Usage**:
    ```python
    with ClickHouseConnection() as conn:
        validator = SchemaValidator(conn)
        validator.validate_schema()  # Raises SchemaValidationError on mismatch
    ```

    **Validation Scope**:
    - Column types (DateTime64(6) vs DateTime64(3) detection)
    - Engine configuration (ReplacingMergeTree with _version)
    - Partition key (daily partitions for pruning)
    - Sorting key (query optimization)
    - Compression codecs (storage efficiency, optional)
    """

    def __init__(self, connection, database: str = "default", table: str = "ohlcv"):
        """Initialize validator with ClickHouse connection.

        Args:
            connection: ClickHouseConnection instance (must be opened)
            database: Database holding the OHLCV table (default: "default")
            table: Table name to validate (default: "ohlcv")
        """
        self.connection = connection
        self.database = database
        self.table = table
        self.expected = ExpectedSchema()

    def validate_schema(self) -> Dict[str, object]:
        """Validate ohlcv table schema matches expectations.

        Returns:
            Validation report: {"status": "valid", "errors": []}

        Raises:
            SchemaValidationError: If critical mismatch detected (STRICT mode)
        """
        errors: List[str] = []

        # Run every check even if an earlier one raises, so the final error
        # report covers all problems at once instead of one per run.
        checks = [
            ("Column type", self._validate_column_types),
            ("Engine", self._validate_engine),
            ("Partition", self._validate_partitioning),
            ("Sorting key", self._validate_sorting_key),
        ]
        for label, check in checks:
            try:
                errors.extend(check())
            except Exception as e:  # collected here, surfaced loudly below
                errors.append(f"{label} validation failed: {e}")

        if errors:
            raise SchemaValidationError(
                f"Schema validation failed ({len(errors)} errors):\n"
                + "\n".join(f"  - {e}" for e in errors)
            )

        logger.info("Schema validation passed: ohlcv table matches expected schema")
        return {"status": "valid", "errors": []}

    def _validate_column_types(self) -> List[str]:
        """Validate column types match expected schema.

        **Critical Check**: DateTime64(6) vs DateTime64(3) mismatch
        (prevents 1000x data loss)
        """
        query = f"""
            SELECT name, type
            FROM system.columns
            WHERE database = '{self.database}' AND table = '{self.table}'
            ORDER BY name
        """
        result = self.connection.execute(query)
        actual_columns = {row[0]: row[1] for row in result}

        errors = []

        # Single pass: missing column, then type comparison for present ones.
        for col, expected_type in self.expected.columns.items():
            if col not in actual_columns:
                errors.append(f"Missing column: {col} (expected type: {expected_type})")
                continue

            actual_type = actual_columns[col]
            if actual_type != expected_type:
                errors.append(
                    f"Type mismatch: {col} (expected {expected_type}, got {actual_type})"
                )

                # Special warning for DateTime64 precision mismatch: a (3)
                # column silently truncates microsecond timestamps.
                if "DateTime64(6)" in expected_type and "DateTime64(3)" in actual_type:
                    errors.append(
                        f"  ⚠️ CRITICAL: {col} has millisecond precision (3) "
                        f"but microsecond precision (6) required. "
                        f"This causes 1000x data loss! See ADR-0021."
                    )

        # Unexpected columns are informational only, not a validation error.
        extra_columns = set(actual_columns) - set(self.expected.columns)
        if extra_columns:
            logger.warning(f"Unexpected columns in {self.table} table: {extra_columns}")

        return errors

    def _validate_engine(self) -> List[str]:
        """Validate table engine is ReplacingMergeTree with _version column."""
        query = f"""
            SELECT engine, engine_full
            FROM system.tables
            WHERE database = '{self.database}' AND name = '{self.table}'
        """
        result = self.connection.execute(query)

        if not result:
            return [f"Table '{self.table}' not found in database '{self.database}'"]

        engine, engine_full = result[0]
        errors = []

        if engine != self.expected.engine:
            errors.append(
                f"Wrong engine: expected {self.expected.engine}, got {engine}. "
                "ReplacingMergeTree required for zero-gap guarantee."
            )

        # The full engine definition must reference _version, otherwise
        # ReplacingMergeTree cannot deduplicate on it.
        if "_version" not in engine_full:
            errors.append(
                "ReplacingMergeTree version column missing. "
                f"Expected '_version' in engine definition, got: {engine_full}"
            )

        return errors

    def _validate_partitioning(self) -> List[str]:
        """Validate partition key for daily partitions (performance optimization)."""
        query = f"""
            SELECT partition_key
            FROM system.tables
            WHERE database = '{self.database}' AND name = '{self.table}'
        """
        result = self.connection.execute(query)

        if not result:
            return [f"Cannot retrieve partition_key for {self.table} table"]

        actual_partition_key = result[0][0]

        if actual_partition_key != self.expected.partition_key:
            return [
                f"Partition key mismatch: "
                f"expected '{self.expected.partition_key}', got '{actual_partition_key}'. "
                f"Daily partitioning required for query performance."
            ]

        return []

    def _validate_sorting_key(self) -> List[str]:
        """Validate sorting key (ORDER BY) for query optimization."""
        query = f"""
            SELECT sorting_key
            FROM system.tables
            WHERE database = '{self.database}' AND name = '{self.table}'
        """
        result = self.connection.execute(query)

        if not result:
            return [f"Cannot retrieve sorting_key for {self.table} table"]

        actual_sorting_key = result[0][0]
        # system.tables reports the sorting key as a comma-joined string.
        expected_sorting_key = ", ".join(self.expected.sorting_key)

        if actual_sorting_key != expected_sorting_key:
            return [
                f"Sorting key mismatch: "
                f"expected ({expected_sorting_key}), got ({actual_sorting_key}). "
                f"Correct sorting key required for query performance."
            ]

        return []

    def _validate_compression(self) -> List[str]:
        """Validate compression codecs (optional, informational warnings only).

        Note: Missing compression doesn't prevent correctness, only increases storage.
        """
        query = f"""
            SELECT name, compression_codec
            FROM system.columns
            WHERE database = '{self.database}' AND table = '{self.table}'
        """
        result = self.connection.execute(query)
        actual_codecs = {row[0]: row[1] for row in result}

        warnings = []

        for col, expected_codec in self.expected.expected_codecs.items():
            if col in actual_codecs:
                actual_codec = actual_codecs[col]
                # Substring match: ClickHouse reports e.g. "CODEC(Gorilla, ZSTD(1))".
                if expected_codec not in actual_codec:
                    warnings.append(
                        f"Suboptimal compression for {col}: "
                        f"expected {expected_codec}, got {actual_codec}"
                    )

        if warnings:
            logger.warning(f"Compression codec warnings: {warnings}")

        # Return empty list (warnings don't fail validation)
        return []
|