gapless-crypto-clickhouse 7.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. gapless_crypto_clickhouse/__init__.py +147 -0
  2. gapless_crypto_clickhouse/__probe__.py +349 -0
  3. gapless_crypto_clickhouse/api.py +1032 -0
  4. gapless_crypto_clickhouse/clickhouse/__init__.py +17 -0
  5. gapless_crypto_clickhouse/clickhouse/config.py +119 -0
  6. gapless_crypto_clickhouse/clickhouse/connection.py +269 -0
  7. gapless_crypto_clickhouse/clickhouse/schema.sql +98 -0
  8. gapless_crypto_clickhouse/clickhouse/schema_validator.py +312 -0
  9. gapless_crypto_clickhouse/clickhouse_query.py +642 -0
  10. gapless_crypto_clickhouse/collectors/__init__.py +21 -0
  11. gapless_crypto_clickhouse/collectors/binance_public_data_collector.py +1994 -0
  12. gapless_crypto_clickhouse/collectors/clickhouse_bulk_loader.py +446 -0
  13. gapless_crypto_clickhouse/collectors/concurrent_collection_orchestrator.py +407 -0
  14. gapless_crypto_clickhouse/collectors/csv_format_detector.py +123 -0
  15. gapless_crypto_clickhouse/collectors/httpx_downloader.py +395 -0
  16. gapless_crypto_clickhouse/collectors/hybrid_url_generator.py +316 -0
  17. gapless_crypto_clickhouse/exceptions.py +145 -0
  18. gapless_crypto_clickhouse/gap_filling/__init__.py +1 -0
  19. gapless_crypto_clickhouse/gap_filling/safe_file_operations.py +439 -0
  20. gapless_crypto_clickhouse/gap_filling/universal_gap_filler.py +757 -0
  21. gapless_crypto_clickhouse/llms.txt +268 -0
  22. gapless_crypto_clickhouse/probe.py +235 -0
  23. gapless_crypto_clickhouse/py.typed +0 -0
  24. gapless_crypto_clickhouse/query_api.py +374 -0
  25. gapless_crypto_clickhouse/resume/__init__.py +12 -0
  26. gapless_crypto_clickhouse/resume/intelligent_checkpointing.py +383 -0
  27. gapless_crypto_clickhouse/utils/__init__.py +29 -0
  28. gapless_crypto_clickhouse/utils/error_handling.py +202 -0
  29. gapless_crypto_clickhouse/utils/etag_cache.py +194 -0
  30. gapless_crypto_clickhouse/utils/timeframe_constants.py +90 -0
  31. gapless_crypto_clickhouse/utils/timestamp_format_analyzer.py +256 -0
  32. gapless_crypto_clickhouse/utils/timestamp_utils.py +130 -0
  33. gapless_crypto_clickhouse/validation/__init__.py +36 -0
  34. gapless_crypto_clickhouse/validation/csv_validator.py +677 -0
  35. gapless_crypto_clickhouse/validation/models.py +220 -0
  36. gapless_crypto_clickhouse/validation/storage.py +502 -0
  37. gapless_crypto_clickhouse-7.1.0.dist-info/METADATA +1277 -0
  38. gapless_crypto_clickhouse-7.1.0.dist-info/RECORD +40 -0
  39. gapless_crypto_clickhouse-7.1.0.dist-info/WHEEL +4 -0
  40. gapless_crypto_clickhouse-7.1.0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,312 @@
1
+ """ClickHouse Schema Validator for v6.0.0.
2
+
3
+ Validates runtime schema matches expected schema.sql definition.
4
+ Raises SchemaValidationError on mismatch (no fallback, no retry).
5
+
6
+ **SLO Focus**: Correctness (prevents 1000x data loss from DateTime64(3) vs DateTime64(6) mismatch)
7
+
8
+ **ADR**: ADR-0024 (Comprehensive Validation Canonicity)
9
+ """
10
+
11
import logging
from dataclasses import dataclass, field
from typing import Any, Dict, List, Tuple
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+
18
@dataclass
class ExpectedSchema:
    """Canonical ohlcv table schema (v4.0.0+, DateTime64(6) microsecond precision).

    Mirrors src/gapless_crypto_clickhouse/clickhouse/schema.sql; used as the
    reference the live ClickHouse schema is compared against.
    """

    # Column name -> ClickHouse type string, in schema.sql declaration order.
    # (Order matters: validation errors are reported in this order.)
    columns: Dict[str, str] = field(
        default_factory=lambda: dict(
            # Metadata
            symbol="LowCardinality(String)",
            timeframe="LowCardinality(String)",
            instrument_type="LowCardinality(String)",
            data_source="LowCardinality(String)",
            timestamp="DateTime64(6)",  # microsecond precision (ADR-0021)
            # OHLCV
            open="Float64",
            high="Float64",
            low="Float64",
            close="Float64",
            volume="Float64",
            # Microstructure
            close_time="DateTime64(6)",  # microsecond precision (ADR-0021)
            quote_asset_volume="Float64",
            number_of_trades="Int64",
            taker_buy_base_asset_volume="Float64",
            taker_buy_quote_asset_volume="Float64",
            # Futures-specific
            funding_rate="Nullable(Float64)",
            # Internal deduplication bookkeeping
            _version="UInt64",
            _sign="Int8",
        )
    )

    engine: str = "ReplacingMergeTree"
    partition_key: str = "toYYYYMMDD(timestamp)"
    sorting_key: Tuple[str, ...] = ("timestamp", "symbol", "timeframe", "instrument_type")

    # Column name -> expected compression codec (informational check only;
    # a different codec affects storage size, not correctness).
    expected_codecs: Dict[str, str] = field(
        default_factory=lambda: dict(
            timestamp="DoubleDelta",
            close_time="DoubleDelta",
            open="Gorilla",
            high="Gorilla",
            low="Gorilla",
            close="Gorilla",
            volume="Gorilla",
            quote_asset_volume="Gorilla",
            taker_buy_base_asset_volume="Gorilla",
            taker_buy_quote_asset_volume="Gorilla",
            funding_rate="Gorilla",
        )
    )
73
+
74
+
75
class SchemaValidationError(Exception):
    """Signals a mismatch between the live ClickHouse schema and schema.sql.

    **Behavior**: STRICT - No fallback, no retry, no silent failures.
    Propagate to caller (SLO requirement).
    """
83
+
84
+
85
class SchemaValidator:
    """Validates ClickHouse schema at runtime against expected schema.sql definition.

    **Usage**:
    ```python
    with ClickHouseConnection() as conn:
        validator = SchemaValidator(conn)
        validator.validate_schema()  # Raises SchemaValidationError on mismatch
    ```

    **Validation Scope**:
    - Column types (DateTime64(6) vs DateTime64(3) detection)
    - Engine configuration (ReplacingMergeTree with _version)
    - Partition key (daily partitions for pruning)
    - Sorting key (query optimization)
    - Compression codecs (storage efficiency; warnings only, never errors)
    """

    def __init__(self, connection):
        """Initialize validator with ClickHouse connection.

        Args:
            connection: ClickHouseConnection instance (must be opened)
        """
        self.connection = connection
        self.expected = ExpectedSchema()

    def validate_schema(self) -> Dict[str, Any]:
        """Validate ohlcv table schema matches expectations.

        Each check runs independently so an exception in one check does not
        mask findings from the others; all errors are aggregated and raised
        together.

        Returns:
            Validation report: {"status": "valid", "errors": []}

        Raises:
            SchemaValidationError: If critical mismatch detected (STRICT mode)
        """
        errors: List[str] = []

        # Each check returns a list of error strings. An unexpected exception
        # inside a check (e.g. connection failure) is converted into an error
        # entry rather than aborting the remaining checks.
        # NOTE: _validate_compression is included here (the class docstring
        # advertises it); it only logs warnings and always returns [].
        checks = [
            ("Column type", self._validate_column_types),
            ("Engine", self._validate_engine),
            ("Partition", self._validate_partitioning),
            ("Sorting key", self._validate_sorting_key),
            ("Compression", self._validate_compression),
        ]
        for label, check in checks:
            try:
                errors.extend(check())
            except Exception as e:
                errors.append(f"{label} validation failed: {e}")

        if errors:
            raise SchemaValidationError(
                f"Schema validation failed ({len(errors)} errors):\n"
                + "\n".join(f" - {e}" for e in errors)
            )

        logger.info("Schema validation passed: ohlcv table matches expected schema")
        return {"status": "valid", "errors": []}

    def _validate_column_types(self) -> List[str]:
        """Validate column types match expected schema.

        **Critical Check**: DateTime64(6) vs DateTime64(3) mismatch
        (prevents 1000x data loss)

        Returns:
            List of error strings (empty when all columns match).
        """
        query = """
            SELECT name, type
            FROM system.columns
            WHERE database = 'default' AND table = 'ohlcv'
            ORDER BY name
        """
        result = self.connection.execute(query)
        actual_columns = {row[0]: row[1] for row in result}

        errors = []

        for col, expected_type in self.expected.columns.items():
            if col not in actual_columns:
                errors.append(f"Missing column: {col} (expected type: {expected_type})")
                continue

            actual_type = actual_columns[col]
            if actual_type != expected_type:
                errors.append(
                    f"Type mismatch: {col} (expected {expected_type}, got {actual_type})"
                )
                # Escalate the specific millisecond-vs-microsecond downgrade:
                # DateTime64(3) truncates microsecond timestamps (ADR-0021).
                if "DateTime64(6)" in expected_type and "DateTime64(3)" in actual_type:
                    errors.append(
                        f" ⚠️ CRITICAL: {col} has millisecond precision (3) "
                        f"but microsecond precision (6) required. "
                        f"This causes 1000x data loss! See ADR-0021."
                    )

        # Unexpected extra columns are informational only, not an error.
        extra_columns = set(actual_columns.keys()) - set(self.expected.columns.keys())
        if extra_columns:
            logger.warning(f"Unexpected columns in ohlcv table: {extra_columns}")

        return errors

    def _validate_engine(self) -> List[str]:
        """Validate table engine is ReplacingMergeTree with _version column.

        Returns:
            List of error strings (empty when the engine is correct).
        """
        query = """
            SELECT engine, engine_full
            FROM system.tables
            WHERE database = 'default' AND name = 'ohlcv'
        """
        result = self.connection.execute(query)

        if not result:
            return ["Table 'ohlcv' not found in database 'default'"]

        engine, engine_full = result[0]
        errors = []

        if engine != self.expected.engine:
            errors.append(
                f"Wrong engine: expected {self.expected.engine}, got {engine}. "
                f"ReplacingMergeTree required for zero-gap guarantee."
            )

        # ReplacingMergeTree must deduplicate on _version; its presence is
        # only visible in the full engine definition string.
        if "_version" not in engine_full:
            errors.append(
                f"ReplacingMergeTree version column missing. "
                f"Expected '_version' in engine definition, got: {engine_full}"
            )

        return errors

    def _validate_partitioning(self) -> List[str]:
        """Validate partition key for daily partitions (performance optimization).

        Returns:
            List of error strings (empty when the partition key matches).
        """
        query = """
            SELECT partition_key
            FROM system.tables
            WHERE database = 'default' AND name = 'ohlcv'
        """
        result = self.connection.execute(query)

        if not result:
            return ["Cannot retrieve partition_key for ohlcv table"]

        actual_partition_key = result[0][0]

        if actual_partition_key != self.expected.partition_key:
            return [
                f"Partition key mismatch: "
                f"expected '{self.expected.partition_key}', got '{actual_partition_key}'. "
                f"Daily partitioning required for query performance."
            ]

        return []

    def _validate_sorting_key(self) -> List[str]:
        """Validate sorting key (ORDER BY) for query optimization.

        Returns:
            List of error strings (empty when the sorting key matches).
        """
        query = """
            SELECT sorting_key
            FROM system.tables
            WHERE database = 'default' AND name = 'ohlcv'
        """
        result = self.connection.execute(query)

        if not result:
            return ["Cannot retrieve sorting_key for ohlcv table"]

        actual_sorting_key = result[0][0]
        # system.tables reports the sorting key as a comma-joined string.
        expected_sorting_key = ", ".join(self.expected.sorting_key)

        if actual_sorting_key != expected_sorting_key:
            return [
                f"Sorting key mismatch: "
                f"expected ({expected_sorting_key}), got ({actual_sorting_key}). "
                f"Correct sorting key required for query performance."
            ]

        return []

    def _validate_compression(self) -> List[str]:
        """Validate compression codecs (optional, informational warnings only).

        Note: Missing compression doesn't prevent correctness, only increases storage.

        Returns:
            Always an empty list — codec mismatches are logged, never raised.
        """
        query = """
            SELECT name, compression_codec
            FROM system.columns
            WHERE database = 'default' AND table = 'ohlcv'
        """
        result = self.connection.execute(query)
        actual_codecs = {row[0]: row[1] for row in result}

        warnings = []

        for col, expected_codec in self.expected.expected_codecs.items():
            if col in actual_codecs:
                actual_codec = actual_codecs[col]
                # Substring check: compression_codec is reported as e.g.
                # "CODEC(DoubleDelta, LZ4)", so exact equality would be wrong.
                if expected_codec not in actual_codec:
                    warnings.append(
                        f"Suboptimal compression for {col}: "
                        f"expected {expected_codec}, got {actual_codec}"
                    )

        if warnings:
            logger.warning(f"Compression codec warnings: {warnings}")

        # Warnings never fail validation.
        return []