gapless-crypto-clickhouse 7.1.0 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. gapless_crypto_clickhouse/__init__.py +147 -0
  2. gapless_crypto_clickhouse/__probe__.py +349 -0
  3. gapless_crypto_clickhouse/api.py +1032 -0
  4. gapless_crypto_clickhouse/clickhouse/__init__.py +17 -0
  5. gapless_crypto_clickhouse/clickhouse/config.py +119 -0
  6. gapless_crypto_clickhouse/clickhouse/connection.py +269 -0
  7. gapless_crypto_clickhouse/clickhouse/schema.sql +98 -0
  8. gapless_crypto_clickhouse/clickhouse/schema_validator.py +312 -0
  9. gapless_crypto_clickhouse/clickhouse_query.py +642 -0
  10. gapless_crypto_clickhouse/collectors/__init__.py +21 -0
  11. gapless_crypto_clickhouse/collectors/binance_public_data_collector.py +1994 -0
  12. gapless_crypto_clickhouse/collectors/clickhouse_bulk_loader.py +446 -0
  13. gapless_crypto_clickhouse/collectors/concurrent_collection_orchestrator.py +407 -0
  14. gapless_crypto_clickhouse/collectors/csv_format_detector.py +123 -0
  15. gapless_crypto_clickhouse/collectors/httpx_downloader.py +395 -0
  16. gapless_crypto_clickhouse/collectors/hybrid_url_generator.py +316 -0
  17. gapless_crypto_clickhouse/exceptions.py +145 -0
  18. gapless_crypto_clickhouse/gap_filling/__init__.py +1 -0
  19. gapless_crypto_clickhouse/gap_filling/safe_file_operations.py +439 -0
  20. gapless_crypto_clickhouse/gap_filling/universal_gap_filler.py +757 -0
  21. gapless_crypto_clickhouse/llms.txt +268 -0
  22. gapless_crypto_clickhouse/probe.py +235 -0
  23. gapless_crypto_clickhouse/py.typed +0 -0
  24. gapless_crypto_clickhouse/query_api.py +374 -0
  25. gapless_crypto_clickhouse/resume/__init__.py +12 -0
  26. gapless_crypto_clickhouse/resume/intelligent_checkpointing.py +383 -0
  27. gapless_crypto_clickhouse/utils/__init__.py +29 -0
  28. gapless_crypto_clickhouse/utils/error_handling.py +202 -0
  29. gapless_crypto_clickhouse/utils/etag_cache.py +194 -0
  30. gapless_crypto_clickhouse/utils/timeframe_constants.py +90 -0
  31. gapless_crypto_clickhouse/utils/timestamp_format_analyzer.py +256 -0
  32. gapless_crypto_clickhouse/utils/timestamp_utils.py +130 -0
  33. gapless_crypto_clickhouse/validation/__init__.py +36 -0
  34. gapless_crypto_clickhouse/validation/csv_validator.py +677 -0
  35. gapless_crypto_clickhouse/validation/models.py +220 -0
  36. gapless_crypto_clickhouse/validation/storage.py +502 -0
  37. gapless_crypto_clickhouse-7.1.0.dist-info/METADATA +1277 -0
  38. gapless_crypto_clickhouse-7.1.0.dist-info/RECORD +40 -0
  39. gapless_crypto_clickhouse-7.1.0.dist-info/WHEEL +4 -0
  40. gapless_crypto_clickhouse-7.1.0.dist-info/licenses/LICENSE +21 -0
gapless_crypto_clickhouse/collectors/clickhouse_bulk_loader.py
@@ -0,0 +1,446 @@
+ """
+ ClickHouse Bulk Loader for gapless-crypto-clickhouse v6.0.0.
+
+ Ultra-fast historical data ingestion from Binance Public Data Repository to ClickHouse.
+ Preserves 22x speedup advantage of CloudFront CDN + zero-gap guarantee via deterministic versioning.
+
+ Architecture:
+     CloudFront ZIP → Extract (temp) → Parse (pandas) → Add _version → ClickHouse (Arrow) → Delete temp
+
+ Performance:
+     - Download: 22x faster than REST API (CloudFront CDN, unchanged from QuestDB)
+     - Ingestion: >100K rows/sec via clickhouse-connect Arrow bulk insert (ADR-0023)
+     - Storage: No persistent intermediate files (transient extraction only)
+
+ Zero-Gap Guarantee:
+     - Deterministic _version hash ensures identical writes → identical versions
+     - ReplacingMergeTree merges duplicate rows with same _version
+     - Query with FINAL keyword returns deduplicated results
+
+ Error Handling:
+     - Raise and propagate download failures (no retry)
+     - Raise and propagate extraction failures (no fallback)
+     - Raise and propagate ingestion failures (no silent drops)
+     - Temporary files cleaned up even on errors
+
+ SLOs:
+     - Availability: CloudFront 99.99% SLA, connection failures propagate
+     - Correctness: Zero-gap guarantee via deterministic versioning
+     - Observability: Ingestion metrics logged at INFO level
+     - Maintainability: Standard clickhouse-connect HTTP client, Arrow-optimized (ADR-0023)
+
+ Usage:
+     from gapless_crypto_clickhouse.collectors.clickhouse_bulk_loader import ClickHouseBulkLoader
+     from gapless_crypto_clickhouse.clickhouse import ClickHouseConnection
+
+     with ClickHouseConnection() as conn:
+         loader = ClickHouseBulkLoader(conn, instrument_type="spot")
+         loader.ingest_month(symbol="BTCUSDT", timeframe="1h", year=2024, month=1)
+ """
+
+ import hashlib
+ import logging
+ import tempfile
+ import urllib.request
+ import zipfile
+ from pathlib import Path
+
+ import pandas as pd
+
+ from ..clickhouse.connection import ClickHouseConnection
+
+ logger = logging.getLogger(__name__)
+
+
+ class ClickHouseBulkLoader:
+     """
+     Bulk data loader from Binance Public Data Repository to ClickHouse.
+
+     Downloads monthly ZIP archives from CloudFront CDN, extracts to temporary location,
+     parses CSV, adds deterministic _version for deduplication, and ingests to ClickHouse.
+
+     Attributes:
+         connection: ClickHouse connection for bulk inserts
+         instrument_type: 'spot' or 'futures' (ADR-0004)
+         base_url: Binance Public Data Repository base URL
+
+     Error Handling:
+         - Download failures raise urllib.error.HTTPError
+         - Extraction failures raise zipfile.BadZipFile
+         - Ingestion failures raise Exception
+         - Temporary files cleaned up in all cases
+
+     Performance:
+         - CloudFront CDN: 22x faster than REST API
+         - ClickHouse bulk insert: 100-500K rows/sec
+         - Memory efficient: Streaming CSV→DataFrame→ClickHouse
+
+     Examples:
+         # Single month ingestion (spot)
+         with ClickHouseConnection() as conn:
+             loader = ClickHouseBulkLoader(conn, instrument_type="spot")
+             loader.ingest_month(symbol="BTCUSDT", timeframe="1h", year=2024, month=1)
+
+         # Single month ingestion (futures)
+         with ClickHouseConnection() as conn:
+             loader = ClickHouseBulkLoader(conn, instrument_type="futures")
+             loader.ingest_month(symbol="BTCUSDT", timeframe="1h", year=2024, month=1)
+     """
+
+     # Binance Public Data Repository base URLs
+     SPOT_BASE_URL = "https://data.binance.vision/data/spot"
+     FUTURES_BASE_URL = "https://data.binance.vision/data/futures/um"  # USDT-margined
+
+     # Supported timeframes (all 16 Binance timeframes, empirically validated ADR-0003)
+     SUPPORTED_TIMEFRAMES = [
+         "1s",
+         "1m",
+         "3m",
+         "5m",
+         "15m",
+         "30m",
+         "1h",
+         "2h",
+         "4h",
+         "6h",
+         "8h",
+         "12h",
+         "1d",
+         "3d",  # Three-day (exotic timeframe)
+         "1w",  # Weekly (exotic timeframe)
+         "1mo",  # Monthly (exotic timeframe) - Binance uses "1mo" not "1M"
+     ]
+
+     def __init__(self, connection: ClickHouseConnection, instrument_type: str = "spot") -> None:
+         """
+         Initialize ClickHouse bulk loader.
+
+         Args:
+             connection: Active ClickHouse connection
+             instrument_type: Instrument type ("spot" or "futures"), defaults to "spot"
+
+         Raises:
+             ValueError: If connection or instrument_type is invalid
+         """
+         if not isinstance(connection, ClickHouseConnection):
+             raise ValueError(f"Expected ClickHouseConnection, got {type(connection).__name__}")
+
+         if instrument_type not in ("spot", "futures"):
+             raise ValueError(
+                 f"Invalid instrument_type: '{instrument_type}'. Must be 'spot' or 'futures'"
+             )
+
+         self.connection = connection
+         self.instrument_type = instrument_type
+
+         # Set base_url based on instrument_type
+         self.base_url = self.SPOT_BASE_URL if instrument_type == "spot" else self.FUTURES_BASE_URL
+
+         logger.info(
+             f"ClickHouse bulk loader initialized (instrument_type={instrument_type}, base_url={self.base_url})"
+         )
+
+     def ingest_month(self, symbol: str, timeframe: str, year: int, month: int) -> int:
+         """
+         Ingest one month of OHLCV data from Binance Public Data Repository.
+
+         Args:
+             symbol: Trading pair symbol (e.g., "BTCUSDT")
+             timeframe: Timeframe string (e.g., "1h")
+             year: Year (e.g., 2024)
+             month: Month (1-12)
+
+         Returns:
+             Number of rows ingested
+
+         Raises:
+             ValueError: If parameters are invalid
+             urllib.error.HTTPError: If download fails (404, 403, etc.)
+             zipfile.BadZipFile: If ZIP extraction fails
+             Exception: If ingestion fails
+
+         Example:
+             rows = loader.ingest_month("BTCUSDT", "1h", 2024, 1)
+             print(f"Ingested {rows} rows")
+         """
+         # Validate inputs
+         symbol = symbol.upper()
+         if timeframe not in self.SUPPORTED_TIMEFRAMES:
+             raise ValueError(
+                 f"Unsupported timeframe: {timeframe}. Must be one of {self.SUPPORTED_TIMEFRAMES}"
+             )
+         if not (1 <= month <= 12):
+             raise ValueError(f"Month must be 1-12, got {month}")
+
+         # Construct URL
+         url = (
+             f"{self.base_url}/monthly/klines/"
+             f"{symbol}/{timeframe}/{symbol}-{timeframe}-{year}-{month:02d}.zip"
+         )
+
+         # Use temporary directory for transient files
+         with tempfile.TemporaryDirectory() as temp_dir:
+             temp_path = Path(temp_dir)
+
+             # Download ZIP
+             zip_path = temp_path / f"{symbol}-{timeframe}-{year}-{month:02d}.zip"
+             self._download_cloudfront(url, zip_path)
+
+             # Extract CSV
+             csv_path = self._extract_zip(zip_path, temp_path)
+
+             # Parse CSV
+             df = self._parse_csv(csv_path, symbol, timeframe)
+
+             # Ingest to ClickHouse
+             rows_ingested = self._ingest_dataframe(df)
+
+             logger.info(
+                 f"Completed ingestion: {symbol} {timeframe} {year}-{month:02d} ({rows_ingested} rows)"
+             )
+             return rows_ingested
+
+     def _download_cloudfront(self, url: str, dest_path: Path) -> None:
+         """
+         Download file from CloudFront CDN.
+
+         Reused from QuestDBBulkLoader (CloudFront logic unchanged).
+         """
+         logger.info(f"Downloading from CloudFront: {url}")
+
+         try:
+             urllib.request.urlretrieve(url, dest_path)
+             logger.info(f"Download complete: {dest_path.stat().st_size} bytes")
+         except urllib.error.HTTPError as e:
+             raise urllib.error.HTTPError(
+                 url=url,
+                 code=e.code,
+                 msg=f"CloudFront download failed: {e.reason}",
+                 hdrs=e.headers,
+                 fp=None,
+             ) from e
+         except urllib.error.URLError as e:
+             raise urllib.error.URLError(
+                 f"Network error downloading from CloudFront: {e.reason}"
+             ) from e
+
+     def _extract_zip(self, zip_path: Path, extract_dir: Path) -> Path:
+         """
+         Extract ZIP archive to temporary directory.
+
+         Reused from QuestDBBulkLoader (extraction logic unchanged).
+         """
+         logger.info(f"Extracting ZIP: {zip_path}")
+
+         try:
+             with zipfile.ZipFile(zip_path, "r") as zip_ref:
+                 zip_ref.extractall(extract_dir)
+
+             # Find CSV file
+             csv_files = list(extract_dir.glob("*.csv"))
+             if not csv_files:
+                 raise FileNotFoundError(f"No CSV file found in ZIP archive: {zip_path}")
+
+             if len(csv_files) > 1:
+                 logger.warning(f"Multiple CSV files found in ZIP, using first: {csv_files[0]}")
+
+             csv_path = csv_files[0]
+             logger.info(f"Extracted CSV: {csv_path} ({csv_path.stat().st_size} bytes)")
+             return csv_path
+
+         except zipfile.BadZipFile as e:
+             raise zipfile.BadZipFile(f"Corrupted ZIP file: {zip_path}") from e
+
+     def _parse_csv(self, csv_path: Path, symbol: str, timeframe: str) -> pd.DataFrame:
+         """
+         Parse Binance CSV file to DataFrame.
+
+         Reused from QuestDBBulkLoader with ADR-0004 futures support.
+         Handles both spot (11-column, no header) and futures (12-column, with header).
+         """
+         logger.info(f"Parsing CSV: {csv_path} (instrument_type={self.instrument_type})")
+
+         try:
+             # Auto-detect CSV format by checking first line
+             with open(csv_path, "r", encoding="utf-8") as f:
+                 first_line = f.readline().strip()
+
+             has_header = first_line.startswith("open_time")  # Futures CSV has header
+
+             if has_header:
+                 # Futures format: 12 columns with header
+                 df = pd.read_csv(csv_path, header=0, index_col=False)
+
+                 # Validate column count (expect 12 columns)
+                 if len(df.columns) != 12:
+                     raise ValueError(
+                         f"Expected 12 columns for futures format, got {len(df.columns)}. "
+                         f"Columns: {df.columns.tolist()}"
+                     )
+
+                 # Drop the "ignore" column (always empty in futures CSV)
+                 df = df.drop(columns=["ignore"])
+
+                 # Rename futures column names to match spot format for consistency
+                 df = df.rename(
+                     columns={
+                         "count": "number_of_trades",
+                         "quote_volume": "quote_asset_volume",
+                         "taker_buy_volume": "taker_buy_base_asset_volume",
+                         "taker_buy_quote_volume": "taker_buy_quote_asset_volume",
+                     }
+                 )
+
+             else:
+                 # Spot format: 11 columns, no header
+                 df = pd.read_csv(
+                     csv_path,
+                     header=None,
+                     index_col=False,
+                     names=[
+                         "open_time",
+                         "open",
+                         "high",
+                         "low",
+                         "close",
+                         "volume",
+                         "close_time",
+                         "quote_asset_volume",
+                         "number_of_trades",
+                         "taker_buy_base_asset_volume",
+                         "taker_buy_quote_asset_volume",
+                     ],
+                 )
+
+                 # Validate column count
+                 if len(df.columns) != 11:
+                     raise ValueError(
+                         f"Expected 11 columns for spot format, got {len(df.columns)}. "
+                         f"Columns: {df.columns.tolist()}"
+                     )
+
+             # Convert timestamps (ms → datetime)
+             df["timestamp"] = pd.to_datetime(df["open_time"], unit="ms", utc=True)
+             df["close_time"] = pd.to_datetime(df["close_time"], unit="ms", utc=True)
+
+             # Drop open_time (redundant with timestamp)
+             df = df.drop(columns=["open_time"])
+
+             # Add metadata columns
+             df["symbol"] = symbol
+             df["timeframe"] = timeframe
+             df["data_source"] = "cloudfront"
+             df["instrument_type"] = self.instrument_type
+
+             logger.info(
+                 f"Parsed {len(df)} rows from CSV (format={'futures' if has_header else 'spot'})"
+             )
+             return df
+
+         except pd.errors.ParserError as e:
+             raise pd.errors.ParserError(f"Failed to parse CSV {csv_path}: {e}") from e
+
+     def _compute_version_hash(self, row: pd.Series) -> int:
+         """
+         Compute deterministic _version hash for ReplacingMergeTree deduplication.
+
+         Creates hash from (timestamp, OHLCV, symbol, timeframe, instrument_type).
+         Ensures identical writes → identical _version → consistent merge outcome.
+
+         Args:
+             row: pandas Series with OHLCV data
+
+         Returns:
+             UInt64 hash value (0 to 2^64-1)
+
+         Example:
+             row = pd.Series({'timestamp': ..., 'open': 50000, ...})
+             version = self._compute_version_hash(row)  # e.g., 1234567890
+         """
+         # Create deterministic string from row content
+         version_input = (
+             f"{row['timestamp']}"
+             f"{row['open']}{row['high']}{row['low']}{row['close']}{row['volume']}"
+             f"{row['symbol']}{row['timeframe']}{row['instrument_type']}"
+         )
+
+         # Use SHA256 for cryptographic hash (deterministic, collision-resistant)
+         hash_bytes = hashlib.sha256(version_input.encode("utf-8")).digest()
+
+         # Convert first 8 bytes to UInt64
+         version = int.from_bytes(hash_bytes[:8], byteorder="big", signed=False)
+
+         return version
+
+     def _ingest_dataframe(self, df: pd.DataFrame) -> int:
+         """
+         Ingest DataFrame to ClickHouse with deterministic versioning.
+
+         Adds _version (deterministic hash) and _sign (1 for active rows) columns
+         for ReplacingMergeTree deduplication.
+
+         Args:
+             df: DataFrame with OHLCV data
+
+         Returns:
+             Number of rows ingested
+
+         Raises:
+             ClickHouseError: If ingestion fails
+         """
+         if df.empty:
+             logger.warning("Empty DataFrame, skipping ingestion")
+             return 0
+
+         logger.info(f"Ingesting {len(df)} rows to ClickHouse")
+
+         try:
+             # Prepare DataFrame for bulk ingestion
+             df_ingest = df.copy()
+
+             # Add deterministic _version hash (row-by-row)
+             logger.debug("Computing deterministic _version hashes...")
+             df_ingest["_version"] = df_ingest.apply(self._compute_version_hash, axis=1)
+
+             # Add _sign column (1 for all active rows)
+             df_ingest["_sign"] = 1
+
+             # Convert number_of_trades to integer (schema requires Int64)
+             df_ingest["number_of_trades"] = df_ingest["number_of_trades"].astype("int64")
+
+             # Add funding_rate column (NULL for spot, initially NULL for futures)
+             # Schema added in v3.2.0 (ADR-0021) - Nullable(Float64)
+             if "funding_rate" not in df_ingest.columns:
+                 df_ingest["funding_rate"] = None
+
+             # Reorder columns to match ClickHouse schema
+             column_order = [
+                 "timestamp",
+                 "symbol",
+                 "timeframe",
+                 "instrument_type",
+                 "data_source",
+                 "open",
+                 "high",
+                 "low",
+                 "close",
+                 "volume",
+                 "close_time",
+                 "quote_asset_volume",
+                 "number_of_trades",
+                 "taker_buy_base_asset_volume",
+                 "taker_buy_quote_asset_volume",
+                 "funding_rate",
+                 "_version",
+                 "_sign",
+             ]
+             df_ingest = df_ingest[column_order]
+
+             # Bulk insert to ClickHouse
+             rows_inserted = self.connection.insert_dataframe(df_ingest, table="ohlcv")
+
+             logger.info(f"Successfully ingested {rows_inserted} rows")
+             return rows_inserted
+
+         except Exception as e:
+             raise RuntimeError(f"Ingestion failed for {len(df)} rows: {e}") from e