gapless-crypto-clickhouse 7.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gapless_crypto_clickhouse/__init__.py +147 -0
- gapless_crypto_clickhouse/__probe__.py +349 -0
- gapless_crypto_clickhouse/api.py +1032 -0
- gapless_crypto_clickhouse/clickhouse/__init__.py +17 -0
- gapless_crypto_clickhouse/clickhouse/config.py +119 -0
- gapless_crypto_clickhouse/clickhouse/connection.py +269 -0
- gapless_crypto_clickhouse/clickhouse/schema.sql +98 -0
- gapless_crypto_clickhouse/clickhouse/schema_validator.py +312 -0
- gapless_crypto_clickhouse/clickhouse_query.py +642 -0
- gapless_crypto_clickhouse/collectors/__init__.py +21 -0
- gapless_crypto_clickhouse/collectors/binance_public_data_collector.py +1994 -0
- gapless_crypto_clickhouse/collectors/clickhouse_bulk_loader.py +446 -0
- gapless_crypto_clickhouse/collectors/concurrent_collection_orchestrator.py +407 -0
- gapless_crypto_clickhouse/collectors/csv_format_detector.py +123 -0
- gapless_crypto_clickhouse/collectors/httpx_downloader.py +395 -0
- gapless_crypto_clickhouse/collectors/hybrid_url_generator.py +316 -0
- gapless_crypto_clickhouse/exceptions.py +145 -0
- gapless_crypto_clickhouse/gap_filling/__init__.py +1 -0
- gapless_crypto_clickhouse/gap_filling/safe_file_operations.py +439 -0
- gapless_crypto_clickhouse/gap_filling/universal_gap_filler.py +757 -0
- gapless_crypto_clickhouse/llms.txt +268 -0
- gapless_crypto_clickhouse/probe.py +235 -0
- gapless_crypto_clickhouse/py.typed +0 -0
- gapless_crypto_clickhouse/query_api.py +374 -0
- gapless_crypto_clickhouse/resume/__init__.py +12 -0
- gapless_crypto_clickhouse/resume/intelligent_checkpointing.py +383 -0
- gapless_crypto_clickhouse/utils/__init__.py +29 -0
- gapless_crypto_clickhouse/utils/error_handling.py +202 -0
- gapless_crypto_clickhouse/utils/etag_cache.py +194 -0
- gapless_crypto_clickhouse/utils/timeframe_constants.py +90 -0
- gapless_crypto_clickhouse/utils/timestamp_format_analyzer.py +256 -0
- gapless_crypto_clickhouse/utils/timestamp_utils.py +130 -0
- gapless_crypto_clickhouse/validation/__init__.py +36 -0
- gapless_crypto_clickhouse/validation/csv_validator.py +677 -0
- gapless_crypto_clickhouse/validation/models.py +220 -0
- gapless_crypto_clickhouse/validation/storage.py +502 -0
- gapless_crypto_clickhouse-7.1.0.dist-info/METADATA +1277 -0
- gapless_crypto_clickhouse-7.1.0.dist-info/RECORD +40 -0
- gapless_crypto_clickhouse-7.1.0.dist-info/WHEEL +4 -0
- gapless_crypto_clickhouse-7.1.0.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,642 @@
|
|
|
1
|
+
"""
|
|
2
|
+
ClickHouse query interface for gapless-crypto-data v4.0.0.
|
|
3
|
+
|
|
4
|
+
SQL query abstraction returning pandas DataFrames for backward compatibility.
|
|
5
|
+
Provides high-level methods for common OHLCV queries with automatic connection management.
|
|
6
|
+
|
|
7
|
+
Architecture:
|
|
8
|
+
- ClickHouse native protocol (port 9000) for queries
|
|
9
|
+
- pandas DataFrame return type for compatibility with v3.x API
|
|
10
|
+
- SQL-based filtering and aggregation with FINAL keyword for deduplication
|
|
11
|
+
|
|
12
|
+
Error Handling:
|
|
13
|
+
- Raise and propagate query failures (no fallbacks)
|
|
14
|
+
- Raise and propagate connection failures (no retries)
|
|
15
|
+
- Invalid parameters raise ValueError
|
|
16
|
+
|
|
17
|
+
SLOs:
|
|
18
|
+
- Availability: Query failures propagate to caller
|
|
19
|
+
- Correctness: Zero-gap guarantee via deterministic versioning + FINAL keyword
|
|
20
|
+
- Observability: Query execution logged at DEBUG level
|
|
21
|
+
- Maintainability: Standard SQL queries, pandas DataFrame output
|
|
22
|
+
|
|
23
|
+
Usage:
|
|
24
|
+
from gapless_crypto_clickhouse.clickhouse_query import OHLCVQuery
|
|
25
|
+
from gapless_crypto_clickhouse.clickhouse import ClickHouseConnection
|
|
26
|
+
|
|
27
|
+
with ClickHouseConnection() as conn:
|
|
28
|
+
query = OHLCVQuery(conn)
|
|
29
|
+
|
|
30
|
+
# Get latest 100 bars
|
|
31
|
+
df = query.get_latest("BTCUSDT", "1h", limit=100)
|
|
32
|
+
|
|
33
|
+
# Get date range
|
|
34
|
+
df = query.get_range(
|
|
35
|
+
"ETHUSDT", "1h",
|
|
36
|
+
start="2024-01-01",
|
|
37
|
+
end="2024-12-31"
|
|
38
|
+
)
|
|
39
|
+
|
|
40
|
+
# Multi-symbol comparison
|
|
41
|
+
df = query.get_multi_symbol(
|
|
42
|
+
["BTCUSDT", "ETHUSDT", "SOLUSDT"],
|
|
43
|
+
"1h",
|
|
44
|
+
start="2024-01-01",
|
|
45
|
+
end="2024-01-31"
|
|
46
|
+
)
|
|
47
|
+
"""
|
|
48
|
+
|
|
49
|
+
import logging
|
|
50
|
+
from typing import Any, Dict, List, Optional
|
|
51
|
+
|
|
52
|
+
import pandas as pd
|
|
53
|
+
|
|
54
|
+
from .clickhouse.connection import ClickHouseConnection
|
|
55
|
+
|
|
56
|
+
logger = logging.getLogger(__name__)
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
class OHLCVQuery:
    """
    High-level query interface for OHLCV data in ClickHouse.

    Provides a pandas DataFrame-based API for querying time-series OHLCV data
    with automatic SQL construction, parameter validation, and FINAL-based
    deduplication.

    Attributes:
        connection: ClickHouse connection for native protocol queries.

    Error Handling:
        - Connection failures raise ConnectionError (propagated from the driver)
        - Query failures raise RuntimeError (a subclass of Exception, so
          existing ``except Exception`` handlers continue to work)
        - Invalid parameters raise ValueError
        - No retries, no fallbacks

    Performance:
        - Query latency: <1s for typical OHLCV ranges (1M rows)
        - FINAL keyword overhead: 10-30% (required for deduplication)
        - Result set: Materialized to pandas DataFrame (entire result in memory)

    Examples:
        with ClickHouseConnection() as conn:
            query = OHLCVQuery(conn)

            # Latest 100 bars
            df = query.get_latest("BTCUSDT", "1h", limit=100)

            # Date range
            df = query.get_range("ETHUSDT", "1h", start="2024-01-01", end="2024-12-31")

            # Multi-symbol comparison
            df = query.get_multi_symbol(
                ["BTCUSDT", "ETHUSDT"], "1h", start="2024-01-01", end="2024-01-31"
            )
    """

    # Column list shared by every row-returning query. The order is part of
    # the DataFrame contract documented in get_latest()/get_range().
    _OHLCV_COLUMNS = (
        "timestamp",
        "symbol",
        "timeframe",
        "instrument_type",
        "open",
        "high",
        "low",
        "close",
        "volume",
        "close_time",
        "quote_asset_volume",
        "number_of_trades",
        "taker_buy_base_asset_volume",
        "taker_buy_quote_asset_volume",
        "data_source",
    )

    # Timeframe -> bar interval in seconds, used by detect_gaps().
    _TIMEFRAME_SECONDS = {
        "1s": 1,
        "1m": 60,
        "3m": 180,
        "5m": 300,
        "15m": 900,
        "30m": 1800,
        "1h": 3600,
        "2h": 7200,
        "4h": 14400,
        "6h": 21600,
        "8h": 28800,
        "12h": 43200,
        "1d": 86400,
        "3d": 259200,
        "1w": 604800,
        "1mo": 2592000,  # Approximate: 30 days
    }

    def __init__(self, connection: "ClickHouseConnection") -> None:
        """
        Initialize OHLCV query interface.

        Args:
            connection: Active ClickHouse connection

        Raises:
            ValueError: If connection is not a ClickHouseConnection
        """
        if not isinstance(connection, ClickHouseConnection):
            raise ValueError(f"Expected ClickHouseConnection, got {type(connection).__name__}")

        self.connection = connection
        logger.debug("OHLCVQuery interface initialized")

    # ------------------------------------------------------------------
    # Shared validation helpers (previously duplicated per method)
    # ------------------------------------------------------------------

    @staticmethod
    def _validate_symbol_timeframe(symbol: str, timeframe: str) -> None:
        """Reject empty symbol or timeframe strings."""
        if not symbol:
            raise ValueError("Symbol cannot be empty")
        if not timeframe:
            raise ValueError("Timeframe cannot be empty")

    @staticmethod
    def _validate_instrument_type(instrument_type: str) -> None:
        """Reject anything other than the two supported instrument types."""
        if instrument_type not in ("spot", "futures"):
            raise ValueError(
                f"Invalid instrument_type: '{instrument_type}'. Must be 'spot' or 'futures'"
            )

    @staticmethod
    def _parse_date_range(start: str, end: str) -> "tuple[str, str]":
        """
        Validate a date range and normalize both endpoints.

        Returns normalized 'YYYY-MM-DD HH:MM:SS' strings so that any format
        pandas accepts is also accepted by ClickHouse's toDateTime().
        (Previously validation used pandas but the raw string was sent to
        ClickHouse, so inputs like '2024/01/01' validated yet failed at
        query time.)

        Raises:
            ValueError: If either date fails to parse, or start >= end.
        """
        try:
            start_dt = pd.to_datetime(start)
            end_dt = pd.to_datetime(end)
        except Exception as e:
            raise ValueError(
                f"Invalid date format. Expected 'YYYY-MM-DD' or 'YYYY-MM-DD HH:MM:SS', got start='{start}', end='{end}'"
            ) from e

        if start_dt >= end_dt:
            raise ValueError(f"Start date must be before end date, got start={start}, end={end}")

        return (
            start_dt.strftime("%Y-%m-%d %H:%M:%S"),
            end_dt.strftime("%Y-%m-%d %H:%M:%S"),
        )

    @classmethod
    def _select_clause(cls) -> str:
        """Comma-separated OHLCV column list for SELECT statements."""
        return ",\n                ".join(cls._OHLCV_COLUMNS)

    # ------------------------------------------------------------------
    # Public query methods
    # ------------------------------------------------------------------

    def get_latest(
        self, symbol: str, timeframe: str, limit: int = 1000, instrument_type: str = "spot"
    ) -> pd.DataFrame:
        """
        Get latest N bars for a symbol and timeframe.

        Args:
            symbol: Trading pair symbol (e.g., "BTCUSDT")
            timeframe: Timeframe string (e.g., "1h")
            limit: Number of bars to retrieve (default: 1000)
            instrument_type: Instrument type ("spot" or "futures"), defaults to "spot"

        Returns:
            pandas DataFrame with OHLCV data, sorted by timestamp (oldest first).
            Columns: timestamp, symbol, timeframe, instrument_type, open, high,
            low, close, volume, close_time, quote_asset_volume,
            number_of_trades, taker_buy_base_asset_volume,
            taker_buy_quote_asset_volume, data_source.

        Raises:
            ValueError: If parameters are invalid
            RuntimeError: If query fails
            ConnectionError: If database connection fails

        Example:
            df = query.get_latest("BTCUSDT", "1h", limit=100)
            df = query.get_latest("BTCUSDT", "1h", limit=100, instrument_type="futures")
        """
        self._validate_symbol_timeframe(symbol, timeframe)
        if limit <= 0:
            raise ValueError(f"Limit must be positive, got {limit}")
        self._validate_instrument_type(instrument_type)

        symbol = symbol.upper()

        sql = f"""
            SELECT
                {self._select_clause()}
            FROM ohlcv FINAL
            WHERE symbol = %(symbol)s
              AND timeframe = %(timeframe)s
              AND instrument_type = %(instrument_type)s
            ORDER BY timestamp DESC
            LIMIT %(limit)s
        """

        logger.debug(
            "Querying latest %s bars for %s %s (%s)", limit, symbol, timeframe, instrument_type
        )

        try:
            df = self.connection.query_dataframe(
                sql,
                params={
                    "symbol": symbol,
                    "timeframe": timeframe,
                    "instrument_type": instrument_type,
                    "limit": limit,
                },
            )

            # Query returns newest-first; flip to chronological order.
            df = df.iloc[::-1].reset_index(drop=True)

            logger.info(
                "Retrieved %s bars for %s %s (%s)", len(df), symbol, timeframe, instrument_type
            )
            return df

        except Exception as e:
            raise RuntimeError(
                f"Query failed for {symbol} {timeframe} ({instrument_type}): {e}"
            ) from e

    def get_range(
        self,
        symbol: str,
        timeframe: str,
        start: str,
        end: str,
        instrument_type: str = "spot",
    ) -> pd.DataFrame:
        """
        Get OHLCV data for a specific date range.

        Args:
            symbol: Trading pair symbol (e.g., "BTCUSDT")
            timeframe: Timeframe string (e.g., "1h")
            start: Start date in "YYYY-MM-DD" or "YYYY-MM-DD HH:MM:SS" format
            end: End date in "YYYY-MM-DD" or "YYYY-MM-DD HH:MM:SS" format
            instrument_type: Instrument type ("spot" or "futures"), defaults to "spot"

        Returns:
            pandas DataFrame with OHLCV data, sorted by timestamp ascending

        Raises:
            ValueError: If parameters are invalid
            RuntimeError: If query fails
            ConnectionError: If database connection fails

        Example:
            df = query.get_range("ETHUSDT", "1h", start="2024-01-01", end="2024-01-31")
        """
        self._validate_symbol_timeframe(symbol, timeframe)
        self._validate_instrument_type(instrument_type)

        symbol = symbol.upper()
        start_norm, end_norm = self._parse_date_range(start, end)

        sql = f"""
            SELECT
                {self._select_clause()}
            FROM ohlcv FINAL
            WHERE symbol = %(symbol)s
              AND timeframe = %(timeframe)s
              AND instrument_type = %(instrument_type)s
              AND timestamp >= toDateTime(%(start)s)
              AND timestamp <= toDateTime(%(end)s)
            ORDER BY timestamp ASC
        """

        logger.debug(
            "Querying %s %s (%s) from %s to %s", symbol, timeframe, instrument_type, start, end
        )

        try:
            df = self.connection.query_dataframe(
                sql,
                params={
                    "symbol": symbol,
                    "timeframe": timeframe,
                    "instrument_type": instrument_type,
                    "start": start_norm,
                    "end": end_norm,
                },
            )

            logger.info(
                "Retrieved %s bars for %s %s (%s) (%s to %s)",
                len(df),
                symbol,
                timeframe,
                instrument_type,
                start,
                end,
            )
            return df

        except Exception as e:
            raise RuntimeError(
                f"Query failed for {symbol} {timeframe} ({instrument_type}) {start} to {end}: {e}"
            ) from e

    def get_multi_symbol(
        self,
        symbols: List[str],
        timeframe: str,
        start: str,
        end: str,
        instrument_type: str = "spot",
    ) -> pd.DataFrame:
        """
        Get OHLCV data for multiple symbols in a date range.

        Useful for multi-symbol analysis and comparison.

        Args:
            symbols: List of trading pair symbols (e.g., ["BTCUSDT", "ETHUSDT"])
            timeframe: Timeframe string (e.g., "1h")
            start: Start date in "YYYY-MM-DD" or "YYYY-MM-DD HH:MM:SS" format
            end: End date in "YYYY-MM-DD" or "YYYY-MM-DD HH:MM:SS" format
            instrument_type: Instrument type ("spot" or "futures"), defaults to "spot"

        Returns:
            pandas DataFrame with OHLCV data for all symbols, sorted by
            symbol then timestamp

        Raises:
            ValueError: If parameters are invalid
            RuntimeError: If query fails
            ConnectionError: If database connection fails

        Example:
            df = query.get_multi_symbol(
                ["BTCUSDT", "ETHUSDT", "SOLUSDT"], "1h",
                start="2024-01-01", end="2024-01-31"
            )
            print(df.groupby("symbol")["close"].mean())
        """
        if not symbols:
            raise ValueError("Symbols list cannot be empty")
        if not timeframe:
            raise ValueError("Timeframe cannot be empty")
        self._validate_instrument_type(instrument_type)

        symbols = [s.upper() for s in symbols]
        start_norm, end_norm = self._parse_date_range(start, end)

        # ClickHouse IN clause with array parameter
        sql = f"""
            SELECT
                {self._select_clause()}
            FROM ohlcv FINAL
            WHERE symbol IN %(symbols)s
              AND timeframe = %(timeframe)s
              AND instrument_type = %(instrument_type)s
              AND timestamp >= toDateTime(%(start)s)
              AND timestamp <= toDateTime(%(end)s)
            ORDER BY symbol ASC, timestamp ASC
        """

        logger.debug(
            "Querying %s symbols (%s) %s (%s) from %s to %s",
            len(symbols),
            ", ".join(symbols),
            timeframe,
            instrument_type,
            start,
            end,
        )

        try:
            df = self.connection.query_dataframe(
                sql,
                params={
                    "symbols": symbols,
                    "timeframe": timeframe,
                    "instrument_type": instrument_type,
                    "start": start_norm,
                    "end": end_norm,
                },
            )

            logger.info(
                "Retrieved %s bars for %s symbols (%s) (%s to %s)",
                len(df),
                len(symbols),
                instrument_type,
                start,
                end,
            )
            return df

        except Exception as e:
            raise RuntimeError(
                f"Multi-symbol query failed for {timeframe} ({instrument_type}) {start} to {end}: {e}"
            ) from e

    def execute_sql(self, sql: str, params: Optional[Dict[str, Any]] = None) -> pd.DataFrame:
        """
        Execute raw SQL query and return results as DataFrame.

        For advanced queries not covered by high-level methods.

        Args:
            sql: SQL query string (use %(name)s placeholders for parameters)
            params: Query parameters dict (optional)

        Returns:
            pandas DataFrame with query results

        Raises:
            ValueError: If SQL is empty
            RuntimeError: If query fails
            ConnectionError: If database connection fails

        Security:
            Always use parameterized queries (%(name)s placeholders) to
            prevent SQL injection. NEVER concatenate user input directly
            into SQL strings.

        Example:
            df = query.execute_sql(
                "SELECT * FROM ohlcv FINAL WHERE symbol = %(symbol)s AND close > %(price)s LIMIT 10",
                {"symbol": "BTCUSDT", "price": 50000.0}
            )
        """
        if not sql or not sql.strip():
            raise ValueError("SQL query cannot be empty")

        logger.debug("Executing raw SQL query: %s...", sql[:100])

        try:
            df = self.connection.query_dataframe(sql, params)
            logger.info("Raw SQL query returned %s rows", len(df))
            return df
        except Exception as e:
            raise RuntimeError(f"Raw SQL query failed: {e}") from e

    def detect_gaps(
        self, symbol: str, timeframe: str, start: str, end: str, instrument_type: str = "spot"
    ) -> pd.DataFrame:
        """
        Detect timestamp gaps in OHLCV data using SQL sequence analysis.

        Uses a ClickHouse lagInFrame window function to find missing bars
        based on the expected timeframe interval.

        Args:
            symbol: Trading pair symbol (e.g., "BTCUSDT")
            timeframe: Timeframe string (e.g., "1h")
            start: Start date in "YYYY-MM-DD" format
            end: End date in "YYYY-MM-DD" format
            instrument_type: Instrument type ("spot" or "futures"), defaults to "spot"

        Returns:
            pandas DataFrame with gap information:
                - gap_start: Timestamp where gap starts
                - gap_end: Timestamp where gap ends
                - expected_bars: Number of missing bars in gap

        Raises:
            ValueError: If parameters are invalid
            RuntimeError: If query fails

        Example:
            gaps = query.detect_gaps("BTCUSDT", "1h", "2024-01-01", "2024-12-31")
            if gaps.empty:
                print("No gaps found!")
        """
        if timeframe not in self._TIMEFRAME_SECONDS:
            raise ValueError(f"Unsupported timeframe for gap detection: {timeframe}")
        self._validate_symbol_timeframe(symbol, timeframe)
        self._validate_instrument_type(instrument_type)

        interval_seconds = self._TIMEFRAME_SECONDS[timeframe]
        symbol = symbol.upper()
        start_norm, end_norm = self._parse_date_range(start, end)

        # Detect gaps by comparing each bar to its predecessor; a spacing of
        # more than one interval means bars are missing in between.
        sql = """
            WITH lagged AS (
                SELECT
                    timestamp,
                    lagInFrame(timestamp) OVER (ORDER BY timestamp) AS prev_timestamp
                FROM ohlcv FINAL
                WHERE symbol = %(symbol)s
                  AND timeframe = %(timeframe)s
                  AND instrument_type = %(instrument_type)s
                  AND timestamp >= toDateTime(%(start)s)
                  AND timestamp <= toDateTime(%(end)s)
            ),
            gaps AS (
                SELECT
                    prev_timestamp AS gap_start,
                    timestamp AS gap_end,
                    toFloat64(dateDiff('second', prev_timestamp, timestamp)) / %(interval_seconds)s AS bars_diff
                FROM lagged
                WHERE prev_timestamp != toDateTime(0)
            )
            SELECT
                gap_start,
                gap_end,
                bars_diff - 1 AS expected_bars
            FROM gaps
            WHERE bars_diff > 1
        """

        logger.debug(
            "Detecting gaps for %s %s (%s) from %s to %s",
            symbol,
            timeframe,
            instrument_type,
            start,
            end,
        )

        try:
            df = self.connection.query_dataframe(
                sql,
                params={
                    "symbol": symbol,
                    "timeframe": timeframe,
                    "instrument_type": instrument_type,
                    "start": start_norm,
                    "end": end_norm,
                    "interval_seconds": interval_seconds,
                },
            )

            if df.empty:
                logger.info("No gaps found for %s %s (%s)", symbol, timeframe, instrument_type)
            else:
                logger.warning(
                    "Found %s gaps for %s %s (%s) (total missing bars: %s)",
                    len(df),
                    symbol,
                    timeframe,
                    instrument_type,
                    df["expected_bars"].sum(),
                )

            return df

        except Exception as e:
            raise RuntimeError(
                f"Gap detection query failed for {symbol} {timeframe} ({instrument_type}): {e}"
            ) from e
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Collectors module.
|
|
3
|
+
|
|
4
|
+
High-performance data collection components with hybrid concurrent architecture.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from .binance_public_data_collector import BinancePublicDataCollector
|
|
8
|
+
from .concurrent_collection_orchestrator import CollectionResult, ConcurrentCollectionOrchestrator
|
|
9
|
+
from .httpx_downloader import ConcurrentDownloadManager, DownloadResult
|
|
10
|
+
from .hybrid_url_generator import DataSource, DownloadTask, HybridUrlGenerator
|
|
11
|
+
|
|
12
|
+
__all__ = [
|
|
13
|
+
"BinancePublicDataCollector",
|
|
14
|
+
"HybridUrlGenerator",
|
|
15
|
+
"DownloadTask",
|
|
16
|
+
"DataSource",
|
|
17
|
+
"ConcurrentDownloadManager",
|
|
18
|
+
"DownloadResult",
|
|
19
|
+
"ConcurrentCollectionOrchestrator",
|
|
20
|
+
"CollectionResult",
|
|
21
|
+
]
|