gapless_crypto_clickhouse-7.1.0-py3-none-any.whl
- gapless_crypto_clickhouse/__init__.py +147 -0
- gapless_crypto_clickhouse/__probe__.py +349 -0
- gapless_crypto_clickhouse/api.py +1032 -0
- gapless_crypto_clickhouse/clickhouse/__init__.py +17 -0
- gapless_crypto_clickhouse/clickhouse/config.py +119 -0
- gapless_crypto_clickhouse/clickhouse/connection.py +269 -0
- gapless_crypto_clickhouse/clickhouse/schema.sql +98 -0
- gapless_crypto_clickhouse/clickhouse/schema_validator.py +312 -0
- gapless_crypto_clickhouse/clickhouse_query.py +642 -0
- gapless_crypto_clickhouse/collectors/__init__.py +21 -0
- gapless_crypto_clickhouse/collectors/binance_public_data_collector.py +1994 -0
- gapless_crypto_clickhouse/collectors/clickhouse_bulk_loader.py +446 -0
- gapless_crypto_clickhouse/collectors/concurrent_collection_orchestrator.py +407 -0
- gapless_crypto_clickhouse/collectors/csv_format_detector.py +123 -0
- gapless_crypto_clickhouse/collectors/httpx_downloader.py +395 -0
- gapless_crypto_clickhouse/collectors/hybrid_url_generator.py +316 -0
- gapless_crypto_clickhouse/exceptions.py +145 -0
- gapless_crypto_clickhouse/gap_filling/__init__.py +1 -0
- gapless_crypto_clickhouse/gap_filling/safe_file_operations.py +439 -0
- gapless_crypto_clickhouse/gap_filling/universal_gap_filler.py +757 -0
- gapless_crypto_clickhouse/llms.txt +268 -0
- gapless_crypto_clickhouse/probe.py +235 -0
- gapless_crypto_clickhouse/py.typed +0 -0
- gapless_crypto_clickhouse/query_api.py +374 -0
- gapless_crypto_clickhouse/resume/__init__.py +12 -0
- gapless_crypto_clickhouse/resume/intelligent_checkpointing.py +383 -0
- gapless_crypto_clickhouse/utils/__init__.py +29 -0
- gapless_crypto_clickhouse/utils/error_handling.py +202 -0
- gapless_crypto_clickhouse/utils/etag_cache.py +194 -0
- gapless_crypto_clickhouse/utils/timeframe_constants.py +90 -0
- gapless_crypto_clickhouse/utils/timestamp_format_analyzer.py +256 -0
- gapless_crypto_clickhouse/utils/timestamp_utils.py +130 -0
- gapless_crypto_clickhouse/validation/__init__.py +36 -0
- gapless_crypto_clickhouse/validation/csv_validator.py +677 -0
- gapless_crypto_clickhouse/validation/models.py +220 -0
- gapless_crypto_clickhouse/validation/storage.py +502 -0
- gapless_crypto_clickhouse-7.1.0.dist-info/METADATA +1277 -0
- gapless_crypto_clickhouse-7.1.0.dist-info/RECORD +40 -0
- gapless_crypto_clickhouse-7.1.0.dist-info/WHEEL +4 -0
- gapless_crypto_clickhouse-7.1.0.dist-info/licenses/LICENSE +21 -0
gapless_crypto_clickhouse/collectors/httpx_downloader.py
@@ -0,0 +1,395 @@
#!/usr/bin/env python3
"""
HTTPX Async Download Manager for Binance ZIP Files

High-performance concurrent downloader optimized for Binance Vision ZIP files.
Supports 13+ simultaneous downloads with connection pooling and retry logic.

Since Binance Vision serves static ZIP files from a CDN (not API endpoints),
there are no rate limits, so concurrent downloads can be maximized for optimal
performance.

Key optimizations:
- High concurrency (13+ simultaneous downloads)
- Connection pooling with HTTP keep-alive
- Retry logic with exponential backoff
- In-memory ZIP processing (no temporary files on disk)
- Progress tracking for concurrent operations
"""

import asyncio
import csv
import io
import logging
import zipfile
from dataclasses import dataclass
from datetime import datetime
from typing import Any, Callable, Dict, List, Optional

import httpx

from .hybrid_url_generator import DownloadTask

@dataclass
class DownloadResult:
    """Result of a download operation."""

    task: DownloadTask
    success: bool
    data: Optional[List[List[str]]] = None  # Parsed CSV rows
    error: Optional[str] = None
    download_time: float = 0.0
    file_size_bytes: int = 0
    status_code: Optional[int] = None


class ConcurrentDownloadManager:
    """
    High-performance concurrent downloader for Binance Vision ZIP files.

    Optimized for static ZIP file downloads with maximum concurrency since
    the Binance Vision CDN has no rate limiting (unlike API endpoints).

    Features:
    - 13+ simultaneous downloads (configurable)
    - Persistent connection pooling for efficiency
    - Automatic retry with exponential backoff
    - In-memory ZIP processing
    - Real-time progress tracking
    - Comprehensive error handling

    Performance optimizations:
    - Single persistent HTTP client with connection reuse
    - ZIP extraction directly from the in-memory response (no temporary files)
    - Concurrent processing of multiple files
    - Intelligent timeout and retry strategies

    Examples:
        Basic concurrent downloading:

        >>> manager = ConcurrentDownloadManager(max_concurrent=13)
        >>> async with manager:
        ...     results = await manager.download_tasks(download_tasks)
        >>> successful = [r for r in results if r.success]
        >>> print(f"Downloaded {len(successful)}/{len(results)} files")

        Custom configuration:

        >>> manager = ConcurrentDownloadManager(
        ...     max_concurrent=15,        # Higher concurrency
        ...     connection_pool_size=25,  # More connections
        ...     timeout=120,              # Longer timeout for large files
        ...     max_retries=5,            # More retry attempts
        ... )

        With progress callback:

        >>> def progress_callback(completed, total, current_task):
        ...     print(f"Progress: {completed}/{total} - {current_task.filename}")
        >>>
        >>> async with manager:
        ...     results = await manager.download_tasks(tasks, progress_callback)
    """

    def __init__(
        self,
        max_concurrent: int = 13,
        connection_pool_size: int = 20,
        timeout: float = 60.0,
        max_retries: int = 3,
        retry_delay: float = 1.0,
        retry_multiplier: float = 2.0,
    ):
        """
        Initialize the concurrent download manager.

        Args:
            max_concurrent: Maximum simultaneous downloads (13+ recommended for ZIP files)
            connection_pool_size: HTTP connection pool size
            timeout: Per-download timeout in seconds
            max_retries: Maximum retry attempts for failed downloads
            retry_delay: Initial retry delay in seconds
            retry_multiplier: Exponential backoff multiplier
        """
        self.max_concurrent = max_concurrent
        self.connection_pool_size = connection_pool_size
        self.timeout = timeout
        self.max_retries = max_retries
        self.retry_delay = retry_delay
        self.retry_multiplier = retry_multiplier

        # HTTP client will be initialized in __aenter__
        self.client: Optional[httpx.AsyncClient] = None
        self.semaphore: Optional[asyncio.Semaphore] = None

        self.logger = logging.getLogger(__name__)

    async def __aenter__(self):
        """Initialize the async HTTP client and semaphore."""
        # Configure the HTTP client for ZIP file downloading
        limits = httpx.Limits(
            max_keepalive_connections=self.connection_pool_size,
            max_connections=self.connection_pool_size + 10,  # Extra headroom
            keepalive_expiry=30.0,  # Keep connections alive
        )

        timeout = httpx.Timeout(
            connect=10.0,       # Connection timeout
            read=self.timeout,  # Read timeout for large ZIP files
            write=10.0,         # Write timeout
            pool=5.0,           # Pool timeout
        )

        self.client = httpx.AsyncClient(
            limits=limits,
            timeout=timeout,
            http2=False,  # Disable HTTP/2 to avoid the optional h2 dependency
            follow_redirects=True,
        )

        # Semaphore to control concurrent downloads
        self.semaphore = asyncio.Semaphore(self.max_concurrent)

        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Clean up the HTTP client."""
        if self.client:
            await self.client.aclose()
            self.client = None

    async def download_tasks(
        self,
        tasks: List[DownloadTask],
        progress_callback: Optional[Callable[[int, int, DownloadTask], None]] = None,
    ) -> List[DownloadResult]:
        """
        Download multiple tasks concurrently with progress tracking.

        Args:
            tasks: List of download tasks to execute
            progress_callback: Optional callback for progress updates

        Returns:
            List of download results in the same order as the input tasks
        """
        if not self.client or not self.semaphore:
            raise RuntimeError("DownloadManager must be used as async context manager")

        self.logger.info(f"Starting concurrent download of {len(tasks)} files")
        self.logger.info(f"Max concurrent downloads: {self.max_concurrent}")

        # Track completed downloads for progress reporting
        completed_count = 0
        total_count = len(tasks)

        async def download_with_progress(task: DownloadTask) -> DownloadResult:
            nonlocal completed_count

            result = await self._download_single_task(task)

            completed_count += 1
            if progress_callback:
                progress_callback(completed_count, total_count, task)

            return result

        # Execute all downloads concurrently
        download_coroutines = [download_with_progress(task) for task in tasks]
        results = await asyncio.gather(*download_coroutines, return_exceptions=True)

        # Handle any exceptions that occurred
        final_results = []
        for i, result in enumerate(results):
            if isinstance(result, Exception):
                error_result = DownloadResult(task=tasks[i], success=False, error=str(result))
                final_results.append(error_result)
                self.logger.error(f"Download failed for {tasks[i].filename}: {result}")
            else:
                final_results.append(result)

        success_count = sum(1 for r in final_results if r.success)
        self.logger.info(f"Download completed: {success_count}/{total_count} successful")

        return final_results

    async def _download_single_task(self, task: DownloadTask) -> DownloadResult:
        """
        Download and process a single ZIP file task with retry logic.

        Args:
            task: Download task to execute

        Returns:
            Download result with parsed CSV data or error information
        """
        async with self.semaphore:  # Limit concurrent downloads
            start_time = datetime.now()

            for attempt in range(self.max_retries + 1):
                try:
                    result = await self._attempt_download(task)

                    # Calculate download time
                    download_time = (datetime.now() - start_time).total_seconds()
                    result.download_time = download_time

                    if result.success:
                        self.logger.debug(
                            f"✅ Downloaded {task.filename} in {download_time:.1f}s "
                            f"({result.file_size_bytes / 1024 / 1024:.1f} MB)"
                        )
                        return result

                    # If not the last attempt, wait before retrying
                    if attempt < self.max_retries:
                        delay = self.retry_delay * (self.retry_multiplier**attempt)
                        self.logger.warning(
                            f"⚠️ Download failed for {task.filename} (attempt {attempt + 1}), "
                            f"retrying in {delay:.1f}s: {result.error}"
                        )
                        await asyncio.sleep(delay)
                    else:
                        self.logger.error(
                            f"❌ Download failed after {self.max_retries + 1} attempts: {task.filename}"
                        )
                        return result

                except Exception as e:
                    error_msg = f"Unexpected error: {str(e)}"
                    self.logger.error(f"❌ Download exception for {task.filename}: {error_msg}")

                    if attempt == self.max_retries:
                        return DownloadResult(
                            task=task,
                            success=False,
                            error=error_msg,
                            download_time=(datetime.now() - start_time).total_seconds(),
                        )

                    # Wait before retrying
                    delay = self.retry_delay * (self.retry_multiplier**attempt)
                    await asyncio.sleep(delay)

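    # Retry timing, for reference: with the defaults above (retry_delay=1.0,
    # retry_multiplier=2.0, max_retries=3), a persistently failing download
    # waits 1.0s, 2.0s, and 4.0s between its four total attempts before giving up.
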
    async def _attempt_download(self, task: DownloadTask) -> DownloadResult:
        """
        Single download attempt for a ZIP file.

        Args:
            task: Download task to execute

        Returns:
            Download result with success/failure status and data
        """
        try:
            # Download the ZIP file
            response = await self.client.get(task.url)

            if response.status_code != 200:
                return DownloadResult(
                    task=task,
                    success=False,
                    error=f"HTTP {response.status_code}",
                    status_code=response.status_code,
                    file_size_bytes=len(response.content) if response.content else 0,
                )

            # Process the ZIP file in memory
            zip_content = response.content
            file_size = len(zip_content)

            # Extract and parse CSV from the ZIP archive
            csv_data = self._extract_csv_from_zip(zip_content, task.filename)

            return DownloadResult(
                task=task,
                success=True,
                data=csv_data,
                status_code=response.status_code,
                file_size_bytes=file_size,
            )

        except httpx.TimeoutException:
            return DownloadResult(task=task, success=False, error=f"Timeout after {self.timeout}s")
        except httpx.ConnectError as e:
            return DownloadResult(task=task, success=False, error=f"Connection error: {str(e)}")
        except Exception as e:
            return DownloadResult(task=task, success=False, error=f"Processing error: {str(e)}")

    def _extract_csv_from_zip(self, zip_content: bytes, zip_filename: str) -> List[List[str]]:
        """
        Extract and parse CSV data from ZIP file content.

        Args:
            zip_content: Raw ZIP file bytes
            zip_filename: Name of the ZIP file (for CSV filename inference)

        Returns:
            List of CSV rows as string lists

        Raises:
            ValueError: If ZIP extraction or CSV parsing fails
        """
        try:
            # Expected CSV filename (remove .zip extension, add .csv)
            expected_csv_name = zip_filename.replace(".zip", ".csv")

            # Extract the CSV from the ZIP archive
            with zipfile.ZipFile(io.BytesIO(zip_content), "r") as zip_file:
                if expected_csv_name not in zip_file.namelist():
                    raise ValueError(f"CSV file {expected_csv_name} not found in ZIP")

                with zip_file.open(expected_csv_name) as csv_file:
                    csv_content = csv_file.read().decode("utf-8")

            # Parse the CSV content
            csv_rows = list(csv.reader(csv_content.strip().split("\n")))

            if not csv_rows:
                raise ValueError("Empty CSV file")

            return csv_rows

        except zipfile.BadZipFile:
            raise ValueError("Invalid ZIP file format")
        except UnicodeDecodeError:
            raise ValueError("CSV file encoding error")
        except ValueError:
            raise  # Re-raise the specific errors above without re-wrapping them
        except Exception as e:
            raise ValueError(f"ZIP processing failed: {str(e)}")

    async def test_connection(self, test_url: str) -> Dict[str, Any]:
        """
        Test connection and performance to Binance Vision.

        Args:
            test_url: URL to test (should be a small ZIP file)

        Returns:
            Connection test results with timing and status information
        """
        if not self.client:
            raise RuntimeError("DownloadManager must be used as async context manager")

        start_time = datetime.now()

        try:
            response = await self.client.head(test_url)
            end_time = datetime.now()

            return {
                "success": True,
                "status_code": response.status_code,
                "response_time_ms": (end_time - start_time).total_seconds() * 1000,
                "headers": dict(response.headers),
                "url": test_url,
            }

        except Exception as e:
            end_time = datetime.now()

            return {
                "success": False,
                "error": str(e),
                "response_time_ms": (end_time - start_time).total_seconds() * 1000,
                "url": test_url,
            }
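
The docstring examples above show the manager in isolation. As an end-to-end illustration, a minimal driver script is sketched below. It assumes DownloadTask can be constructed with url and filename keyword arguments (the two attributes this module reads from it); the actual constructor lives in hybrid_url_generator.py and may require additional fields, and the example URL merely follows the public data.binance.vision path layout.

#!/usr/bin/env python3
# Usage sketch (illustration only, not part of the package).
import asyncio

from gapless_crypto_clickhouse.collectors.httpx_downloader import ConcurrentDownloadManager
from gapless_crypto_clickhouse.collectors.hybrid_url_generator import DownloadTask


async def main() -> None:
    # Hypothetical task; real tasks come from the hybrid URL generator.
    tasks = [
        DownloadTask(
            url=(
                "https://data.binance.vision/data/spot/monthly/klines/"
                "BTCUSDT/1h/BTCUSDT-1h-2024-01.zip"
            ),
            filename="BTCUSDT-1h-2024-01.zip",
        ),
    ]

    def on_progress(completed: int, total: int, task: DownloadTask) -> None:
        print(f"{completed}/{total} done (last: {task.filename})")

    async with ConcurrentDownloadManager(max_concurrent=13) as manager:
        # Optional reachability probe before the real downloads.
        probe = await manager.test_connection(tasks[0].url)
        print(f"CDN reachable: {probe['success']} ({probe['response_time_ms']:.0f} ms)")

        results = await manager.download_tasks(tasks, on_progress)

    for result in results:
        if result.success:
            print(f"{result.task.filename}: {len(result.data)} rows in {result.download_time:.1f}s")
        else:
            print(f"{result.task.filename}: failed ({result.error})")


if __name__ == "__main__":
    asyncio.run(main())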