gapless-crypto-clickhouse 7.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. gapless_crypto_clickhouse/__init__.py +147 -0
  2. gapless_crypto_clickhouse/__probe__.py +349 -0
  3. gapless_crypto_clickhouse/api.py +1032 -0
  4. gapless_crypto_clickhouse/clickhouse/__init__.py +17 -0
  5. gapless_crypto_clickhouse/clickhouse/config.py +119 -0
  6. gapless_crypto_clickhouse/clickhouse/connection.py +269 -0
  7. gapless_crypto_clickhouse/clickhouse/schema.sql +98 -0
  8. gapless_crypto_clickhouse/clickhouse/schema_validator.py +312 -0
  9. gapless_crypto_clickhouse/clickhouse_query.py +642 -0
  10. gapless_crypto_clickhouse/collectors/__init__.py +21 -0
  11. gapless_crypto_clickhouse/collectors/binance_public_data_collector.py +1994 -0
  12. gapless_crypto_clickhouse/collectors/clickhouse_bulk_loader.py +446 -0
  13. gapless_crypto_clickhouse/collectors/concurrent_collection_orchestrator.py +407 -0
  14. gapless_crypto_clickhouse/collectors/csv_format_detector.py +123 -0
  15. gapless_crypto_clickhouse/collectors/httpx_downloader.py +395 -0
  16. gapless_crypto_clickhouse/collectors/hybrid_url_generator.py +316 -0
  17. gapless_crypto_clickhouse/exceptions.py +145 -0
  18. gapless_crypto_clickhouse/gap_filling/__init__.py +1 -0
  19. gapless_crypto_clickhouse/gap_filling/safe_file_operations.py +439 -0
  20. gapless_crypto_clickhouse/gap_filling/universal_gap_filler.py +757 -0
  21. gapless_crypto_clickhouse/llms.txt +268 -0
  22. gapless_crypto_clickhouse/probe.py +235 -0
  23. gapless_crypto_clickhouse/py.typed +0 -0
  24. gapless_crypto_clickhouse/query_api.py +374 -0
  25. gapless_crypto_clickhouse/resume/__init__.py +12 -0
  26. gapless_crypto_clickhouse/resume/intelligent_checkpointing.py +383 -0
  27. gapless_crypto_clickhouse/utils/__init__.py +29 -0
  28. gapless_crypto_clickhouse/utils/error_handling.py +202 -0
  29. gapless_crypto_clickhouse/utils/etag_cache.py +194 -0
  30. gapless_crypto_clickhouse/utils/timeframe_constants.py +90 -0
  31. gapless_crypto_clickhouse/utils/timestamp_format_analyzer.py +256 -0
  32. gapless_crypto_clickhouse/utils/timestamp_utils.py +130 -0
  33. gapless_crypto_clickhouse/validation/__init__.py +36 -0
  34. gapless_crypto_clickhouse/validation/csv_validator.py +677 -0
  35. gapless_crypto_clickhouse/validation/models.py +220 -0
  36. gapless_crypto_clickhouse/validation/storage.py +502 -0
  37. gapless_crypto_clickhouse-7.1.0.dist-info/METADATA +1277 -0
  38. gapless_crypto_clickhouse-7.1.0.dist-info/RECORD +40 -0
  39. gapless_crypto_clickhouse-7.1.0.dist-info/WHEEL +4 -0
  40. gapless_crypto_clickhouse-7.1.0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,642 @@
1
+ """
2
+ ClickHouse query interface for gapless-crypto-data v4.0.0.
3
+
4
+ SQL query abstraction returning pandas DataFrames for backward compatibility.
5
+ Provides high-level methods for common OHLCV queries with automatic connection management.
6
+
7
+ Architecture:
8
+ - ClickHouse native protocol (port 9000) for queries
9
+ - pandas DataFrame return type for compatibility with v3.x API
10
+ - SQL-based filtering and aggregation with FINAL keyword for deduplication
11
+
12
+ Error Handling:
13
+ - Raise and propagate query failures (no fallbacks)
14
+ - Raise and propagate connection failures (no retries)
15
+ - Invalid parameters raise ValueError
16
+
17
+ SLOs:
18
+ - Availability: Query failures propagate to caller
19
+ - Correctness: Zero-gap guarantee via deterministic versioning + FINAL keyword
20
+ - Observability: Query execution logged at DEBUG level
21
+ - Maintainability: Standard SQL queries, pandas DataFrame output
22
+
23
+ Usage:
24
+ from gapless_crypto_clickhouse.clickhouse_query import OHLCVQuery
25
+ from gapless_crypto_clickhouse.clickhouse import ClickHouseConnection
26
+
27
+ with ClickHouseConnection() as conn:
28
+ query = OHLCVQuery(conn)
29
+
30
+ # Get latest 100 bars
31
+ df = query.get_latest("BTCUSDT", "1h", limit=100)
32
+
33
+ # Get date range
34
+ df = query.get_range(
35
+ "ETHUSDT", "1h",
36
+ start="2024-01-01",
37
+ end="2024-12-31"
38
+ )
39
+
40
+ # Multi-symbol comparison
41
+ df = query.get_multi_symbol(
42
+ ["BTCUSDT", "ETHUSDT", "SOLUSDT"],
43
+ "1h",
44
+ start="2024-01-01",
45
+ end="2024-01-31"
46
+ )
47
+ """
48
+
49
+ import logging
50
+ from typing import Any, Dict, List, Optional
51
+
52
+ import pandas as pd
53
+
54
+ from .clickhouse.connection import ClickHouseConnection
55
+
56
+ logger = logging.getLogger(__name__)
57
+
58
+
59
+ class OHLCVQuery:
60
+ """
61
+ High-level query interface for OHLCV data in ClickHouse.
62
+
63
+ Provides pandas DataFrame-based API for querying time-series OHLCV data
64
+ with automatic connection management and SQL query construction.
65
+
66
+ Attributes:
67
+ connection: ClickHouse connection for native protocol queries
68
+
69
+ Error Handling:
70
+ - Connection failures raise ConnectionError
71
+ - Query failures raise Exception
72
+ - Invalid parameters raise ValueError
73
+ - No retries, no fallbacks
74
+
75
+ Performance:
76
+ - Query latency: <1s for typical OHLCV ranges (1M rows)
77
+ - FINAL keyword overhead: 10-30% (required for deduplication)
78
+ - Result set: Materialized to pandas DataFrame
79
+ - Memory: Entire result loaded into memory
80
+
81
+ Examples:
82
+ # Get latest data
83
+ with ClickHouseConnection() as conn:
84
+ query = OHLCVQuery(conn)
85
+ df = query.get_latest("BTCUSDT", "1h", limit=1000)
86
+ print(f"Latest close: {df.iloc[-1]['close']}")
87
+
88
+ # Date range query
89
+ with ClickHouseConnection() as conn:
90
+ query = OHLCVQuery(conn)
91
+ df = query.get_range(
92
+ "ETHUSDT", "1h",
93
+ start="2024-01-01",
94
+ end="2024-12-31"
95
+ )
96
+ print(f"Total bars: {len(df)}")
97
+
98
+ # Multi-symbol query
99
+ with ClickHouseConnection() as conn:
100
+ query = OHLCVQuery(conn)
101
+ df = query.get_multi_symbol(
102
+ ["BTCUSDT", "ETHUSDT"],
103
+ "1h",
104
+ start="2024-01-01",
105
+ end="2024-01-31"
106
+ )
107
+ print(df.groupby("symbol")["close"].mean())
108
+ """
109
+
110
+ def __init__(self, connection: ClickHouseConnection) -> None:
111
+ """
112
+ Initialize OHLCV query interface.
113
+
114
+ Args:
115
+ connection: Active ClickHouse connection
116
+
117
+ Raises:
118
+ ValueError: If connection is invalid
119
+ """
120
+ if not isinstance(connection, ClickHouseConnection):
121
+ raise ValueError(f"Expected ClickHouseConnection, got {type(connection).__name__}")
122
+
123
+ self.connection = connection
124
+ logger.debug("OHLCVQuery interface initialized")
125
+
126
+ def get_latest(
127
+ self, symbol: str, timeframe: str, limit: int = 1000, instrument_type: str = "spot"
128
+ ) -> pd.DataFrame:
129
+ """
130
+ Get latest N bars for a symbol and timeframe.
131
+
132
+ Args:
133
+ symbol: Trading pair symbol (e.g., "BTCUSDT")
134
+ timeframe: Timeframe string (e.g., "1h")
135
+ limit: Number of bars to retrieve (default: 1000)
136
+ instrument_type: Instrument type ("spot" or "futures"), defaults to "spot"
137
+
138
+ Returns:
139
+ pandas DataFrame with OHLCV data, sorted by timestamp (oldest first)
140
+
141
+ Raises:
142
+ ValueError: If parameters are invalid
143
+ Exception: If query fails
144
+ ConnectionError: If database connection fails
145
+
146
+ Example:
147
+ # Spot data (default)
148
+ df = query.get_latest("BTCUSDT", "1h", limit=100)
149
+
150
+ # Futures data
151
+ df = query.get_latest("BTCUSDT", "1h", limit=100, instrument_type="futures")
152
+
153
+ print(df.columns)
154
+ # ['timestamp', 'symbol', 'timeframe', 'instrument_type', 'open', 'high', 'low',
155
+ # 'close', 'volume', 'close_time', 'quote_asset_volume',
156
+ # 'number_of_trades', 'taker_buy_base_asset_volume',
157
+ # 'taker_buy_quote_asset_volume', 'data_source']
158
+ """
159
+ # Validate inputs
160
+ if not symbol:
161
+ raise ValueError("Symbol cannot be empty")
162
+ if not timeframe:
163
+ raise ValueError("Timeframe cannot be empty")
164
+ if limit <= 0:
165
+ raise ValueError(f"Limit must be positive, got {limit}")
166
+ if instrument_type not in ("spot", "futures"):
167
+ raise ValueError(
168
+ f"Invalid instrument_type: '{instrument_type}'. Must be 'spot' or 'futures'"
169
+ )
170
+
171
+ symbol = symbol.upper()
172
+
173
+ sql = """
174
+ SELECT
175
+ timestamp,
176
+ symbol,
177
+ timeframe,
178
+ instrument_type,
179
+ open,
180
+ high,
181
+ low,
182
+ close,
183
+ volume,
184
+ close_time,
185
+ quote_asset_volume,
186
+ number_of_trades,
187
+ taker_buy_base_asset_volume,
188
+ taker_buy_quote_asset_volume,
189
+ data_source
190
+ FROM ohlcv FINAL
191
+ WHERE symbol = %(symbol)s
192
+ AND timeframe = %(timeframe)s
193
+ AND instrument_type = %(instrument_type)s
194
+ ORDER BY timestamp DESC
195
+ LIMIT %(limit)s
196
+ """
197
+
198
+ logger.debug(f"Querying latest {limit} bars for {symbol} {timeframe} ({instrument_type})")
199
+
200
+ try:
201
+ # Execute query
202
+ df = self.connection.query_dataframe(
203
+ sql,
204
+ params={
205
+ "symbol": symbol,
206
+ "timeframe": timeframe,
207
+ "instrument_type": instrument_type,
208
+ "limit": limit,
209
+ },
210
+ )
211
+
212
+ # Reverse to chronological order (oldest first)
213
+ df = df.iloc[::-1].reset_index(drop=True)
214
+
215
+ logger.info(f"Retrieved {len(df)} bars for {symbol} {timeframe} ({instrument_type})")
216
+ return df
217
+
218
+ except Exception as e:
219
+ raise Exception(
220
+ f"Query failed for {symbol} {timeframe} ({instrument_type}): {e}"
221
+ ) from e
222
+
223
+ def get_range(
224
+ self,
225
+ symbol: str,
226
+ timeframe: str,
227
+ start: str,
228
+ end: str,
229
+ instrument_type: str = "spot",
230
+ ) -> pd.DataFrame:
231
+ """
232
+ Get OHLCV data for a specific date range.
233
+
234
+ Args:
235
+ symbol: Trading pair symbol (e.g., "BTCUSDT")
236
+ timeframe: Timeframe string (e.g., "1h")
237
+ start: Start date in "YYYY-MM-DD" or "YYYY-MM-DD HH:MM:SS" format
238
+ end: End date in "YYYY-MM-DD" or "YYYY-MM-DD HH:MM:SS" format
239
+ instrument_type: Instrument type ("spot" or "futures"), defaults to "spot"
240
+
241
+ Returns:
242
+ pandas DataFrame with OHLCV data, sorted by timestamp
243
+
244
+ Raises:
245
+ ValueError: If parameters are invalid
246
+ Exception: If query fails
247
+ ConnectionError: If database connection fails
248
+
249
+ Example:
250
+ # Spot data (default)
251
+ df = query.get_range(
252
+ "ETHUSDT", "1h",
253
+ start="2024-01-01",
254
+ end="2024-01-31"
255
+ )
256
+
257
+ # Futures data
258
+ df = query.get_range(
259
+ "BTCUSDT", "1h",
260
+ start="2024-01-01",
261
+ end="2024-01-31",
262
+ instrument_type="futures"
263
+ )
264
+
265
+ print(f"Total bars: {len(df)}")
266
+ print(f"First: {df.iloc[0]['timestamp']}")
267
+ print(f"Last: {df.iloc[-1]['timestamp']}")
268
+ """
269
+ # Validate inputs
270
+ if not symbol:
271
+ raise ValueError("Symbol cannot be empty")
272
+ if not timeframe:
273
+ raise ValueError("Timeframe cannot be empty")
274
+ if instrument_type not in ("spot", "futures"):
275
+ raise ValueError(
276
+ f"Invalid instrument_type: '{instrument_type}'. Must be 'spot' or 'futures'"
277
+ )
278
+
279
+ symbol = symbol.upper()
280
+
281
+ # Parse dates (validate format)
282
+ try:
283
+ start_dt = pd.to_datetime(start)
284
+ end_dt = pd.to_datetime(end)
285
+ except Exception as e:
286
+ raise ValueError(
287
+ f"Invalid date format. Expected 'YYYY-MM-DD' or 'YYYY-MM-DD HH:MM:SS', got start='{start}', end='{end}'"
288
+ ) from e
289
+
290
+ if start_dt >= end_dt:
291
+ raise ValueError(f"Start date must be before end date, got start={start}, end={end}")
292
+
293
+ sql = """
294
+ SELECT
295
+ timestamp,
296
+ symbol,
297
+ timeframe,
298
+ instrument_type,
299
+ open,
300
+ high,
301
+ low,
302
+ close,
303
+ volume,
304
+ close_time,
305
+ quote_asset_volume,
306
+ number_of_trades,
307
+ taker_buy_base_asset_volume,
308
+ taker_buy_quote_asset_volume,
309
+ data_source
310
+ FROM ohlcv FINAL
311
+ WHERE symbol = %(symbol)s
312
+ AND timeframe = %(timeframe)s
313
+ AND instrument_type = %(instrument_type)s
314
+ AND timestamp >= toDateTime(%(start)s)
315
+ AND timestamp <= toDateTime(%(end)s)
316
+ ORDER BY timestamp ASC
317
+ """
318
+
319
+ logger.debug(f"Querying {symbol} {timeframe} ({instrument_type}) from {start} to {end}")
320
+
321
+ try:
322
+ df = self.connection.query_dataframe(
323
+ sql,
324
+ params={
325
+ "symbol": symbol,
326
+ "timeframe": timeframe,
327
+ "instrument_type": instrument_type,
328
+ "start": start,
329
+ "end": end,
330
+ },
331
+ )
332
+
333
+ logger.info(
334
+ f"Retrieved {len(df)} bars for {symbol} {timeframe} ({instrument_type}) ({start} to {end})"
335
+ )
336
+ return df
337
+
338
+ except Exception as e:
339
+ raise Exception(
340
+ f"Query failed for {symbol} {timeframe} ({instrument_type}) {start} to {end}: {e}"
341
+ ) from e
342
+
343
+ def get_multi_symbol(
344
+ self,
345
+ symbols: List[str],
346
+ timeframe: str,
347
+ start: str,
348
+ end: str,
349
+ instrument_type: str = "spot",
350
+ ) -> pd.DataFrame:
351
+ """
352
+ Get OHLCV data for multiple symbols in a date range.
353
+
354
+ Useful for multi-symbol analysis and comparison.
355
+
356
+ Args:
357
+ symbols: List of trading pair symbols (e.g., ["BTCUSDT", "ETHUSDT"])
358
+ timeframe: Timeframe string (e.g., "1h")
359
+ start: Start date in "YYYY-MM-DD" or "YYYY-MM-DD HH:MM:SS" format
360
+ end: End date in "YYYY-MM-DD" or "YYYY-MM-DD HH:MM:SS" format
361
+ instrument_type: Instrument type ("spot" or "futures"), defaults to "spot"
362
+
363
+ Returns:
364
+ pandas DataFrame with OHLCV data for all symbols, sorted by symbol then timestamp
365
+
366
+ Raises:
367
+ ValueError: If parameters are invalid
368
+ Exception: If query fails
369
+ ConnectionError: If database connection fails
370
+
371
+ Example:
372
+ # Spot data (default)
373
+ df = query.get_multi_symbol(
374
+ ["BTCUSDT", "ETHUSDT", "SOLUSDT"],
375
+ "1h",
376
+ start="2024-01-01",
377
+ end="2024-01-31"
378
+ )
379
+
380
+ # Futures data
381
+ df = query.get_multi_symbol(
382
+ ["BTCUSDT", "ETHUSDT"],
383
+ "1h",
384
+ start="2024-01-01",
385
+ end="2024-01-31",
386
+ instrument_type="futures"
387
+ )
388
+
389
+ # Group by symbol for analysis
390
+ summary = df.groupby("symbol").agg({
391
+ "close": ["mean", "min", "max"],
392
+ "volume": "sum"
393
+ })
394
+ print(summary)
395
+ """
396
+ # Validate inputs
397
+ if not symbols:
398
+ raise ValueError("Symbols list cannot be empty")
399
+ if not timeframe:
400
+ raise ValueError("Timeframe cannot be empty")
401
+ if instrument_type not in ("spot", "futures"):
402
+ raise ValueError(
403
+ f"Invalid instrument_type: '{instrument_type}'. Must be 'spot' or 'futures'"
404
+ )
405
+
406
+ symbols = [s.upper() for s in symbols]
407
+
408
+ # Parse dates
409
+ try:
410
+ start_dt = pd.to_datetime(start)
411
+ end_dt = pd.to_datetime(end)
412
+ except Exception as e:
413
+ raise ValueError(
414
+ f"Invalid date format. Expected 'YYYY-MM-DD', got start='{start}', end='{end}'"
415
+ ) from e
416
+
417
+ if start_dt >= end_dt:
418
+ raise ValueError(f"Start date must be before end date, got start={start}, end={end}")
419
+
420
+ # ClickHouse IN clause with array parameter
421
+ sql = """
422
+ SELECT
423
+ timestamp,
424
+ symbol,
425
+ timeframe,
426
+ instrument_type,
427
+ open,
428
+ high,
429
+ low,
430
+ close,
431
+ volume,
432
+ close_time,
433
+ quote_asset_volume,
434
+ number_of_trades,
435
+ taker_buy_base_asset_volume,
436
+ taker_buy_quote_asset_volume,
437
+ data_source
438
+ FROM ohlcv FINAL
439
+ WHERE symbol IN %(symbols)s
440
+ AND timeframe = %(timeframe)s
441
+ AND instrument_type = %(instrument_type)s
442
+ AND timestamp >= toDateTime(%(start)s)
443
+ AND timestamp <= toDateTime(%(end)s)
444
+ ORDER BY symbol ASC, timestamp ASC
445
+ """
446
+
447
+ logger.debug(
448
+ f"Querying {len(symbols)} symbols ({', '.join(symbols)}) {timeframe} ({instrument_type}) from {start} to {end}"
449
+ )
450
+
451
+ try:
452
+ df = self.connection.query_dataframe(
453
+ sql,
454
+ params={
455
+ "symbols": symbols,
456
+ "timeframe": timeframe,
457
+ "instrument_type": instrument_type,
458
+ "start": start,
459
+ "end": end,
460
+ },
461
+ )
462
+
463
+ logger.info(
464
+ f"Retrieved {len(df)} bars for {len(symbols)} symbols ({instrument_type}) ({start} to {end})"
465
+ )
466
+ return df
467
+
468
+ except Exception as e:
469
+ raise Exception(
470
+ f"Multi-symbol query failed for {timeframe} ({instrument_type}) {start} to {end}: {e}"
471
+ ) from e
472
+
473
+ def execute_sql(self, sql: str, params: Optional[Dict[str, Any]] = None) -> pd.DataFrame:
474
+ """
475
+ Execute raw SQL query and return results as DataFrame.
476
+
477
+ For advanced queries not covered by high-level methods.
478
+
479
+ Args:
480
+ sql: SQL query string (use %(name)s placeholders for parameters)
481
+ params: Query parameters dict (optional)
482
+
483
+ Returns:
484
+ pandas DataFrame with query results
485
+
486
+ Raises:
487
+ ValueError: If SQL is empty
488
+ Exception: If query fails
489
+ ConnectionError: If database connection fails
490
+
491
+ Security:
492
+ Always use parameterized queries (%(name)s placeholders) to prevent SQL injection.
493
+ NEVER concatenate user input directly into SQL strings.
494
+
495
+ Example:
496
+ # Parameterized query (SAFE)
497
+ df = query.execute_sql(
498
+ "SELECT * FROM ohlcv FINAL WHERE symbol = %(symbol)s AND close > %(price)s LIMIT 10",
499
+ {"symbol": "BTCUSDT", "price": 50000.0}
500
+ )
501
+
502
+ # Direct string concatenation (UNSAFE - don't do this)
503
+ # df = query.execute_sql(f"SELECT * FROM ohlcv WHERE symbol = '{user_input}'")
504
+ """
505
+ if not sql or not sql.strip():
506
+ raise ValueError("SQL query cannot be empty")
507
+
508
+ logger.debug(f"Executing raw SQL query: {sql[:100]}...")
509
+
510
+ try:
511
+ df = self.connection.query_dataframe(sql, params)
512
+
513
+ logger.info(f"Raw SQL query returned {len(df)} rows")
514
+ return df
515
+
516
+ except Exception as e:
517
+ raise Exception(f"Raw SQL query failed: {e}") from e
518
+
519
+ def detect_gaps(
520
+ self, symbol: str, timeframe: str, start: str, end: str, instrument_type: str = "spot"
521
+ ) -> pd.DataFrame:
522
+ """
523
+ Detect timestamp gaps in OHLCV data using SQL sequence analysis.
524
+
525
+ Uses ClickHouse window functions to find missing bars based on
526
+ expected timeframe intervals.
527
+
528
+ Args:
529
+ symbol: Trading pair symbol (e.g., "BTCUSDT")
530
+ timeframe: Timeframe string (e.g., "1h")
531
+ start: Start date in "YYYY-MM-DD" format
532
+ end: End date in "YYYY-MM-DD" format
533
+ instrument_type: Instrument type ("spot" or "futures"), defaults to "spot"
534
+
535
+ Returns:
536
+ pandas DataFrame with gap information:
537
+ - gap_start: Timestamp where gap starts
538
+ - gap_end: Timestamp where gap ends
539
+ - expected_bars: Number of missing bars in gap
540
+
541
+ Raises:
542
+ ValueError: If parameters are invalid
543
+ Exception: If query fails
544
+
545
+ Example:
546
+ gaps = query.detect_gaps("BTCUSDT", "1h", "2024-01-01", "2024-12-31")
547
+ if gaps.empty:
548
+ print("No gaps found!")
549
+ else:
550
+ print(f"Found {len(gaps)} gaps:")
551
+ print(gaps)
552
+ """
553
+ # Map timeframe to seconds for gap detection
554
+ timeframe_to_seconds = {
555
+ "1s": 1,
556
+ "1m": 60,
557
+ "3m": 180,
558
+ "5m": 300,
559
+ "15m": 900,
560
+ "30m": 1800,
561
+ "1h": 3600,
562
+ "2h": 7200,
563
+ "4h": 14400,
564
+ "6h": 21600,
565
+ "8h": 28800,
566
+ "12h": 43200,
567
+ "1d": 86400,
568
+ "3d": 259200,
569
+ "1w": 604800,
570
+ "1mo": 2592000, # Approximate: 30 days
571
+ }
572
+
573
+ if timeframe not in timeframe_to_seconds:
574
+ raise ValueError(f"Unsupported timeframe for gap detection: {timeframe}")
575
+ if instrument_type not in ("spot", "futures"):
576
+ raise ValueError(
577
+ f"Invalid instrument_type: '{instrument_type}'. Must be 'spot' or 'futures'"
578
+ )
579
+
580
+ interval_seconds = timeframe_to_seconds[timeframe]
581
+ symbol = symbol.upper()
582
+
583
+ # SQL to detect gaps using lagInFrame window function
584
+ sql = """
585
+ WITH lagged AS (
586
+ SELECT
587
+ timestamp,
588
+ lagInFrame(timestamp) OVER (ORDER BY timestamp) AS prev_timestamp
589
+ FROM ohlcv FINAL
590
+ WHERE symbol = %(symbol)s
591
+ AND timeframe = %(timeframe)s
592
+ AND instrument_type = %(instrument_type)s
593
+ AND timestamp >= toDateTime(%(start)s)
594
+ AND timestamp <= toDateTime(%(end)s)
595
+ ),
596
+ gaps AS (
597
+ SELECT
598
+ prev_timestamp AS gap_start,
599
+ timestamp AS gap_end,
600
+ toFloat64(dateDiff('second', prev_timestamp, timestamp)) / %(interval_seconds)s AS bars_diff
601
+ FROM lagged
602
+ WHERE prev_timestamp != toDateTime(0)
603
+ )
604
+ SELECT
605
+ gap_start,
606
+ gap_end,
607
+ bars_diff - 1 AS expected_bars
608
+ FROM gaps
609
+ WHERE bars_diff > 1
610
+ """
611
+
612
+ logger.debug(
613
+ f"Detecting gaps for {symbol} {timeframe} ({instrument_type}) from {start} to {end}"
614
+ )
615
+
616
+ try:
617
+ df = self.connection.query_dataframe(
618
+ sql,
619
+ params={
620
+ "symbol": symbol,
621
+ "timeframe": timeframe,
622
+ "instrument_type": instrument_type,
623
+ "start": start,
624
+ "end": end,
625
+ "interval_seconds": interval_seconds,
626
+ },
627
+ )
628
+
629
+ if df.empty:
630
+ logger.info(f"No gaps found for {symbol} {timeframe} ({instrument_type})")
631
+ else:
632
+ logger.warning(
633
+ f"Found {len(df)} gaps for {symbol} {timeframe} ({instrument_type}) "
634
+ f"(total missing bars: {df['expected_bars'].sum()})"
635
+ )
636
+
637
+ return df
638
+
639
+ except Exception as e:
640
+ raise Exception(
641
+ f"Gap detection query failed for {symbol} {timeframe} ({instrument_type}): {e}"
642
+ ) from e
@@ -0,0 +1,21 @@
1
+ """
2
+ Collectors module.
3
+
4
+ High-performance data collection components with hybrid concurrent architecture.
5
+ """
6
+
7
+ from .binance_public_data_collector import BinancePublicDataCollector
8
+ from .concurrent_collection_orchestrator import CollectionResult, ConcurrentCollectionOrchestrator
9
+ from .httpx_downloader import ConcurrentDownloadManager, DownloadResult
10
+ from .hybrid_url_generator import DataSource, DownloadTask, HybridUrlGenerator
11
+
12
+ __all__ = [
13
+ "BinancePublicDataCollector",
14
+ "HybridUrlGenerator",
15
+ "DownloadTask",
16
+ "DataSource",
17
+ "ConcurrentDownloadManager",
18
+ "DownloadResult",
19
+ "ConcurrentCollectionOrchestrator",
20
+ "CollectionResult",
21
+ ]