gapless-crypto-clickhouse 7.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gapless_crypto_clickhouse/__init__.py +147 -0
- gapless_crypto_clickhouse/__probe__.py +349 -0
- gapless_crypto_clickhouse/api.py +1032 -0
- gapless_crypto_clickhouse/clickhouse/__init__.py +17 -0
- gapless_crypto_clickhouse/clickhouse/config.py +119 -0
- gapless_crypto_clickhouse/clickhouse/connection.py +269 -0
- gapless_crypto_clickhouse/clickhouse/schema.sql +98 -0
- gapless_crypto_clickhouse/clickhouse/schema_validator.py +312 -0
- gapless_crypto_clickhouse/clickhouse_query.py +642 -0
- gapless_crypto_clickhouse/collectors/__init__.py +21 -0
- gapless_crypto_clickhouse/collectors/binance_public_data_collector.py +1994 -0
- gapless_crypto_clickhouse/collectors/clickhouse_bulk_loader.py +446 -0
- gapless_crypto_clickhouse/collectors/concurrent_collection_orchestrator.py +407 -0
- gapless_crypto_clickhouse/collectors/csv_format_detector.py +123 -0
- gapless_crypto_clickhouse/collectors/httpx_downloader.py +395 -0
- gapless_crypto_clickhouse/collectors/hybrid_url_generator.py +316 -0
- gapless_crypto_clickhouse/exceptions.py +145 -0
- gapless_crypto_clickhouse/gap_filling/__init__.py +1 -0
- gapless_crypto_clickhouse/gap_filling/safe_file_operations.py +439 -0
- gapless_crypto_clickhouse/gap_filling/universal_gap_filler.py +757 -0
- gapless_crypto_clickhouse/llms.txt +268 -0
- gapless_crypto_clickhouse/probe.py +235 -0
- gapless_crypto_clickhouse/py.typed +0 -0
- gapless_crypto_clickhouse/query_api.py +374 -0
- gapless_crypto_clickhouse/resume/__init__.py +12 -0
- gapless_crypto_clickhouse/resume/intelligent_checkpointing.py +383 -0
- gapless_crypto_clickhouse/utils/__init__.py +29 -0
- gapless_crypto_clickhouse/utils/error_handling.py +202 -0
- gapless_crypto_clickhouse/utils/etag_cache.py +194 -0
- gapless_crypto_clickhouse/utils/timeframe_constants.py +90 -0
- gapless_crypto_clickhouse/utils/timestamp_format_analyzer.py +256 -0
- gapless_crypto_clickhouse/utils/timestamp_utils.py +130 -0
- gapless_crypto_clickhouse/validation/__init__.py +36 -0
- gapless_crypto_clickhouse/validation/csv_validator.py +677 -0
- gapless_crypto_clickhouse/validation/models.py +220 -0
- gapless_crypto_clickhouse/validation/storage.py +502 -0
- gapless_crypto_clickhouse-7.1.0.dist-info/METADATA +1277 -0
- gapless_crypto_clickhouse-7.1.0.dist-info/RECORD +40 -0
- gapless_crypto_clickhouse-7.1.0.dist-info/WHEEL +4 -0
- gapless_crypto_clickhouse-7.1.0.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,374 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Query API for gapless-crypto-clickhouse v7.1.0 (unified query interface introduced in v6.0.0).
|
|
3
|
+
|
|
4
|
+
Provides unified query_ohlcv() function with lazy auto-ingestion addressing Alpha Forge's request.
|
|
5
|
+
|
|
6
|
+
Workflow:
|
|
7
|
+
1. Check if data exists in ClickHouse
|
|
8
|
+
2. If missing and auto_ingest enabled: download from Binance and ingest
|
|
9
|
+
3. Query ClickHouse with FINAL keyword for deduplication
|
|
10
|
+
4. If fill_gaps enabled: detect and fill gaps via REST API
|
|
11
|
+
5. Return pandas DataFrame (Arrow-optimized internally)
|
|
12
|
+
|
|
13
|
+
Error Handling: Raise and propagate (no fallback, no retry, no silent failures)
|
|
14
|
+
SLOs: Availability, Correctness (zero-gap guarantee), Observability, Maintainability
|
|
15
|
+
|
|
16
|
+
Usage:
|
|
17
|
+
from gapless_crypto_clickhouse import query_ohlcv
|
|
18
|
+
|
|
19
|
+
# Basic query with auto-ingestion
|
|
20
|
+
df = query_ohlcv("BTCUSDT", "1h", "2024-01-01", "2024-01-31")
|
|
21
|
+
|
|
22
|
+
# Multi-symbol query
|
|
23
|
+
df = query_ohlcv(["BTCUSDT", "ETHUSDT"], "1h", "2024-01-01", "2024-01-31")
|
|
24
|
+
|
|
25
|
+
# Futures data
|
|
26
|
+
df = query_ohlcv("BTCUSDT", "1h", "2024-01-01", "2024-01-31", instrument_type="futures-um")
|
|
27
|
+
"""
|
|
28
|
+
|
|
29
|
+
import logging
|
|
30
|
+
from datetime import datetime
|
|
31
|
+
from typing import List, Optional, Union
|
|
32
|
+
|
|
33
|
+
import pandas as pd
|
|
34
|
+
|
|
35
|
+
from .api import InstrumentType
|
|
36
|
+
from .clickhouse import ClickHouseConnection
|
|
37
|
+
from .clickhouse.config import ClickHouseConfig
|
|
38
|
+
from .clickhouse_query import OHLCVQuery
|
|
39
|
+
from .collectors.clickhouse_bulk_loader import ClickHouseBulkLoader
|
|
40
|
+
|
|
41
|
+
logger = logging.getLogger(__name__)
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def query_ohlcv(
    symbol: Union[str, List[str]],
    timeframe: str,
    start_date: str,
    end_date: str,
    instrument_type: InstrumentType = "spot",
    auto_ingest: bool = True,
    fill_gaps: bool = True,
    clickhouse_config: Optional[ClickHouseConfig] = None,
) -> pd.DataFrame:
    """
    Query OHLCV data from ClickHouse with lazy auto-ingestion.

    Addresses Alpha Forge's feature request: unified query API with automatic data download
    when missing from database.

    Workflow:
        1. Connect to ClickHouse
        2. Check if requested data exists (COUNT query)
        3. If missing rows and auto_ingest=True: download from Binance + ingest to ClickHouse
        4. Query data with FINAL keyword (deduplication)
        5. If fill_gaps=True: detect gaps and fill via REST API
        6. Return DataFrame (Arrow-optimized, 3x faster)

    Args:
        symbol: Trading pair symbol (e.g., "BTCUSDT") or list of symbols
        timeframe: Timeframe string (e.g., "1h", "4h", "1d")
        start_date: Start date in "YYYY-MM-DD" or "YYYY-MM-DD HH:MM:SS" format
        end_date: End date in "YYYY-MM-DD" or "YYYY-MM-DD HH:MM:SS" format
        instrument_type: "spot" or "futures-um" (default: "spot")
        auto_ingest: If True, automatically download and ingest missing data (default: True)
        fill_gaps: If True, detect and fill gaps using REST API (default: True)
        clickhouse_config: Optional ClickHouse configuration (default: from environment)

    Returns:
        pandas DataFrame with OHLCV data (Arrow-optimized internally)

    Raises:
        ValueError: If parameters are invalid
        Exception: If query or ingestion fails

    Performance:
        - First query (auto-ingest): 30-60s (download + ingest + query)
        - Cached query: 0.1-2s (query only, 3x faster with Arrow)
        - Memory: 75% less vs clickhouse-driver (Arrow zero-copy)

    Examples:
        # Basic query (auto-downloads if missing)
        df = query_ohlcv("BTCUSDT", "1h", "2024-01-01", "2024-01-31")
        print(f"Rows: {len(df)}")  # 744 rows (31 days * 24 hours)

        # Multi-symbol query
        df = query_ohlcv(
            ["BTCUSDT", "ETHUSDT", "SOLUSDT"],
            "1h",
            "2024-01-01",
            "2024-01-31"
        )
        print(df.groupby("symbol")["close"].mean())

        # Futures data
        df = query_ohlcv(
            "BTCUSDT",
            "1h",
            "2024-01-01",
            "2024-01-31",
            instrument_type="futures-um"
        )

        # Query without auto-ingestion (faster, raises if data missing)
        df = query_ohlcv(
            "BTCUSDT",
            "1h",
            "2024-01-01",
            "2024-01-31",
            auto_ingest=False
        )

        # Query without gap filling (faster, may have gaps)
        df = query_ohlcv(
            "BTCUSDT",
            "1h",
            "2024-01-01",
            "2024-01-31",
            fill_gaps=False
        )
    """
    # Normalize symbol to list so single- and multi-symbol calls share one code path.
    symbols = [symbol] if isinstance(symbol, str) else symbol

    # Validate parameters up front (fail fast before opening a connection).
    if not symbols:
        raise ValueError("symbol cannot be empty")

    for sym in symbols:
        if not sym:
            raise ValueError(f"Invalid symbol: {sym}")

    if not timeframe:
        raise ValueError("timeframe cannot be empty")

    if not start_date or not end_date:
        raise ValueError("start_date and end_date are required")

    # Connect to ClickHouse; the context manager closes the connection on exit.
    config = clickhouse_config or ClickHouseConfig.from_env()
    with ClickHouseConnection(config) as conn:
        query = OHLCVQuery(conn)
        loader = ClickHouseBulkLoader(conn, instrument_type=instrument_type)

        # Process each symbol independently; results are combined at the end.
        dataframes = []
        for sym in symbols:
            logger.info(
                f"Processing {sym} {timeframe} {instrument_type} "
                f"({start_date} to {end_date}, auto_ingest={auto_ingest}, fill_gaps={fill_gaps})"
            )

            # Step 1: Check if data exists (COUNT over the requested range).
            existing_count = _count_existing_rows(
                query, sym, timeframe, start_date, end_date, instrument_type
            )

            # Calculate expected row count (approximate, based on timeframe).
            expected_count = _estimate_expected_rows(start_date, end_date, timeframe)

            logger.info(
                f"{sym}: Found {existing_count} rows in ClickHouse "
                f"(expected ~{expected_count} rows)"
            )

            # Step 2: Auto-ingest if missing data.
            # The 0.5 factor tolerates exchange downtime / partial months: only
            # ingest when less than half of the estimated bars are present.
            if auto_ingest and existing_count < expected_count * 0.5:
                logger.info(
                    f"{sym}: Auto-ingesting missing data "
                    f"(found {existing_count}/{expected_count} rows)"
                )
                _auto_ingest_date_range(
                    loader, sym, timeframe, start_date, end_date, instrument_type
                )
            elif existing_count == 0:
                # Reached when auto_ingest is False (or expected_count is 0):
                # nothing will be downloaded, so surface the empty result early.
                logger.warning(f"{sym}: No data found in ClickHouse and auto_ingest={auto_ingest}")

            # Step 3: Query data with FINAL (deduplication handled by OHLCVQuery).
            df = query.get_range(
                symbol=sym,
                timeframe=timeframe,
                start=start_date,
                end=end_date,
                instrument_type=instrument_type,
            )

            logger.info(f"{sym}: Retrieved {len(df)} rows from ClickHouse (Arrow-optimized)")

            # Step 4: Fill gaps if enabled (detection only — see TODO below).
            if fill_gaps and len(df) > 0:
                gaps = query.detect_gaps(
                    symbol=sym,
                    timeframe=timeframe,
                    start=start_date,
                    end=end_date,
                    instrument_type=instrument_type,
                )

                if len(gaps) > 0:
                    logger.info(f"{sym}: Detected {len(gaps)} gaps, filling via REST API")
                    # TODO: Implement gap filling via REST API
                    # This requires integrating the gap filler from gapless-crypto-data
                    # NOTE(review): despite fill_gaps=True, gaps are currently only
                    # detected and logged, never filled — callers should not rely
                    # on the zero-gap guarantee yet.
                    logger.warning(
                        f"{sym}: Gap filling not yet implemented in v6.0.0 "
                        f"(detected {len(gaps)} gaps)"
                    )

            dataframes.append(df)

    # Combine results: single symbol returns its frame as-is; multi-symbol
    # queries are stacked with a fresh index.
    if len(dataframes) == 1:
        return dataframes[0]
    else:
        return pd.concat(dataframes, ignore_index=True)
|
|
224
|
+
|
|
225
|
+
|
|
226
|
+
def _count_existing_rows(
    query: OHLCVQuery,
    symbol: str,
    timeframe: str,
    start_date: str,
    end_date: str,
    instrument_type: InstrumentType,
) -> int:
    """
    Return how many rows ClickHouse already holds for the requested slice.

    Runs a parameterized COUNT(*) over the deduplicated (FINAL) ``ohlcv``
    table, bounded inclusively by the parsed start/end timestamps.

    Args:
        query: OHLCVQuery instance (its connection is used directly)
        symbol: Trading pair symbol
        timeframe: Timeframe string
        start_date: Start date string
        end_date: End date string
        instrument_type: "spot" or "futures-um"

    Returns:
        Number of existing rows (0 when the result set is empty)

    Raises:
        Exception: If query fails (original error attached as __cause__)
    """
    count_sql = """
        SELECT COUNT(*) as count
        FROM ohlcv FINAL
        WHERE symbol = {symbol:String}
          AND timeframe = {timeframe:String}
          AND instrument_type = {instrument_type:String}
          AND timestamp >= parseDateTime64BestEffort({start_date:String})
          AND timestamp <= parseDateTime64BestEffort({end_date:String})
        """
    # Server-side parameter binding ({name:Type}) — no string interpolation.
    bound_params = {
        "symbol": symbol,
        "timeframe": timeframe,
        "instrument_type": instrument_type,
        "start_date": start_date,
        "end_date": end_date,
    }

    try:
        rows = query.connection.execute(count_sql, params=bound_params)
        # COUNT(*) always yields one row; guard anyway for an empty result set.
        if not rows:
            return 0
        return rows[0][0]
    except Exception as err:
        # Raise-and-propagate policy: log, then re-raise with context chained.
        logger.error(f"Failed to count existing rows: {err}")
        raise Exception(f"Count query failed: {err}") from err
|
|
275
|
+
|
|
276
|
+
|
|
277
|
+
def _estimate_expected_rows(start_date: str, end_date: str, timeframe: str) -> int:
|
|
278
|
+
"""
|
|
279
|
+
Estimate expected number of rows based on date range and timeframe.
|
|
280
|
+
|
|
281
|
+
Args:
|
|
282
|
+
start_date: Start date string
|
|
283
|
+
end_date: End date string
|
|
284
|
+
timeframe: Timeframe string (e.g., "1h", "4h", "1d")
|
|
285
|
+
|
|
286
|
+
Returns:
|
|
287
|
+
Estimated number of rows
|
|
288
|
+
|
|
289
|
+
Raises:
|
|
290
|
+
ValueError: If timeframe format is invalid
|
|
291
|
+
"""
|
|
292
|
+
# Parse dates
|
|
293
|
+
start = pd.to_datetime(start_date)
|
|
294
|
+
end = pd.to_datetime(end_date)
|
|
295
|
+
duration_hours = (end - start).total_seconds() / 3600
|
|
296
|
+
|
|
297
|
+
# Map timeframe to hours
|
|
298
|
+
timeframe_hours = {
|
|
299
|
+
"1s": 1 / 3600,
|
|
300
|
+
"1m": 1 / 60,
|
|
301
|
+
"3m": 3 / 60,
|
|
302
|
+
"5m": 5 / 60,
|
|
303
|
+
"15m": 15 / 60,
|
|
304
|
+
"30m": 30 / 60,
|
|
305
|
+
"1h": 1,
|
|
306
|
+
"2h": 2,
|
|
307
|
+
"4h": 4,
|
|
308
|
+
"6h": 6,
|
|
309
|
+
"8h": 8,
|
|
310
|
+
"12h": 12,
|
|
311
|
+
"1d": 24,
|
|
312
|
+
}
|
|
313
|
+
|
|
314
|
+
if timeframe not in timeframe_hours:
|
|
315
|
+
raise ValueError(f"Unsupported timeframe: {timeframe}")
|
|
316
|
+
|
|
317
|
+
return int(duration_hours / timeframe_hours[timeframe])
|
|
318
|
+
|
|
319
|
+
|
|
320
|
+
def _auto_ingest_date_range(
    loader: ClickHouseBulkLoader,
    symbol: str,
    timeframe: str,
    start_date: str,
    end_date: str,
    instrument_type: InstrumentType,
) -> int:
    """
    Auto-ingest data for date range, one calendar month at a time.

    Walks month-by-month from start to end, delegating each month to
    ``loader.ingest_month``. A month that fails to ingest (e.g. not yet
    published on Binance) is logged and skipped; the walk continues.

    Args:
        loader: ClickHouseBulkLoader instance
        symbol: Trading pair symbol
        timeframe: Timeframe string
        start_date: Start date string
        end_date: End date string
        instrument_type: "spot" or "futures-um" (loader was built with it)

    Returns:
        Total rows ingested across all months

    Raises:
        Exception: If ingestion fails
    """
    cursor = pd.to_datetime(start_date)
    last = pd.to_datetime(end_date)

    total_rows = 0

    # Walk first-of-month to first-of-month until past the end date.
    while cursor <= last:
        year, month = cursor.year, cursor.month

        logger.info(f"Auto-ingesting {symbol} {timeframe} {year}-{month:02d}")

        try:
            ingested = loader.ingest_month(symbol, timeframe, year, month)
            total_rows += ingested
            logger.info(f"Ingested {ingested} rows for {symbol} {year}-{month:02d}")
        except Exception as exc:
            # Best-effort by design: a missing month must not abort the rest.
            logger.warning(
                f"Failed to ingest {symbol} {year}-{month:02d}: {exc} (month may not exist yet)"
            )

        # Advance to the first day of the following month.
        cursor = datetime(year + 1, 1, 1) if month == 12 else datetime(year, month + 1, 1)

    logger.info(f"Auto-ingestion complete: {total_rows} total rows for {symbol}")
    return total_rows
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Resume system for gapless-crypto-data.
|
|
3
|
+
|
|
4
|
+
Provides intelligent checkpointing and resume capabilities for large-scale data collection.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from .intelligent_checkpointing import CheckpointError, IntelligentCheckpointManager
|
|
8
|
+
|
|
9
|
+
__all__ = [
|
|
10
|
+
"IntelligentCheckpointManager",
|
|
11
|
+
"CheckpointError",
|
|
12
|
+
]
|