gapless-crypto-clickhouse 7.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. gapless_crypto_clickhouse/__init__.py +147 -0
  2. gapless_crypto_clickhouse/__probe__.py +349 -0
  3. gapless_crypto_clickhouse/api.py +1032 -0
  4. gapless_crypto_clickhouse/clickhouse/__init__.py +17 -0
  5. gapless_crypto_clickhouse/clickhouse/config.py +119 -0
  6. gapless_crypto_clickhouse/clickhouse/connection.py +269 -0
  7. gapless_crypto_clickhouse/clickhouse/schema.sql +98 -0
  8. gapless_crypto_clickhouse/clickhouse/schema_validator.py +312 -0
  9. gapless_crypto_clickhouse/clickhouse_query.py +642 -0
  10. gapless_crypto_clickhouse/collectors/__init__.py +21 -0
  11. gapless_crypto_clickhouse/collectors/binance_public_data_collector.py +1994 -0
  12. gapless_crypto_clickhouse/collectors/clickhouse_bulk_loader.py +446 -0
  13. gapless_crypto_clickhouse/collectors/concurrent_collection_orchestrator.py +407 -0
  14. gapless_crypto_clickhouse/collectors/csv_format_detector.py +123 -0
  15. gapless_crypto_clickhouse/collectors/httpx_downloader.py +395 -0
  16. gapless_crypto_clickhouse/collectors/hybrid_url_generator.py +316 -0
  17. gapless_crypto_clickhouse/exceptions.py +145 -0
  18. gapless_crypto_clickhouse/gap_filling/__init__.py +1 -0
  19. gapless_crypto_clickhouse/gap_filling/safe_file_operations.py +439 -0
  20. gapless_crypto_clickhouse/gap_filling/universal_gap_filler.py +757 -0
  21. gapless_crypto_clickhouse/llms.txt +268 -0
  22. gapless_crypto_clickhouse/probe.py +235 -0
  23. gapless_crypto_clickhouse/py.typed +0 -0
  24. gapless_crypto_clickhouse/query_api.py +374 -0
  25. gapless_crypto_clickhouse/resume/__init__.py +12 -0
  26. gapless_crypto_clickhouse/resume/intelligent_checkpointing.py +383 -0
  27. gapless_crypto_clickhouse/utils/__init__.py +29 -0
  28. gapless_crypto_clickhouse/utils/error_handling.py +202 -0
  29. gapless_crypto_clickhouse/utils/etag_cache.py +194 -0
  30. gapless_crypto_clickhouse/utils/timeframe_constants.py +90 -0
  31. gapless_crypto_clickhouse/utils/timestamp_format_analyzer.py +256 -0
  32. gapless_crypto_clickhouse/utils/timestamp_utils.py +130 -0
  33. gapless_crypto_clickhouse/validation/__init__.py +36 -0
  34. gapless_crypto_clickhouse/validation/csv_validator.py +677 -0
  35. gapless_crypto_clickhouse/validation/models.py +220 -0
  36. gapless_crypto_clickhouse/validation/storage.py +502 -0
  37. gapless_crypto_clickhouse-7.1.0.dist-info/METADATA +1277 -0
  38. gapless_crypto_clickhouse-7.1.0.dist-info/RECORD +40 -0
  39. gapless_crypto_clickhouse-7.1.0.dist-info/WHEEL +4 -0
  40. gapless_crypto_clickhouse-7.1.0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,395 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ HTTPX Async Download Manager for Binance ZIP Files
4
+
5
+ High-performance concurrent downloader optimized for Binance Vision ZIP files.
6
+ Supports up to 13+ simultaneous downloads with connection pooling and retry logic.
7
+
8
+ Since Binance Vision serves static ZIP files from CDN (not API endpoints),
9
+ there are no rate limits - we can maximize concurrent downloads for optimal performance.
10
+
11
+ Key optimizations:
12
+ - High concurrency (13+ simultaneous downloads)
13
+ - Connection pooling with persistent HTTP/1.1 connections
14
+ - Retry logic with exponential backoff
15
+ - In-memory ZIP processing (each archive is downloaded fully, then extracted)
16
+ - Progress tracking for concurrent operations
17
+ """
18
+
19
+ import asyncio
20
+ import csv
21
+ import io
22
+ import logging
23
+ import zipfile
24
+ from dataclasses import dataclass
25
+ from datetime import datetime
26
+ from typing import Any, Callable, Dict, List, Optional
27
+
28
+ import httpx
29
+
30
+ from .hybrid_url_generator import DownloadTask
31
+
32
+
33
@dataclass
class DownloadResult:
    """Result of a download operation."""

    task: DownloadTask  # The originating download task (URL, filename, metadata)
    success: bool  # True when the ZIP was fetched and its CSV parsed
    data: Optional[List[List[str]]] = None  # Parsed CSV rows
    error: Optional[str] = None  # Human-readable failure description when success is False
    download_time: float = 0.0  # Wall-clock seconds spent across download attempt(s)
    file_size_bytes: int = 0  # Size of the downloaded ZIP payload in bytes
    status_code: Optional[int] = None  # HTTP status code of the last response, if any
44
+
45
+
46
+ class ConcurrentDownloadManager:
47
+ """
48
+ High-performance concurrent downloader for Binance Vision ZIP files.
49
+
50
+ Optimized for static ZIP file downloads with maximum concurrency since
51
+ Binance Vision CDN has no rate limiting (unlike API endpoints).
52
+
53
+ Features:
54
+ - Up to 13+ simultaneous downloads (configurable)
55
+ - Persistent HTTP/1.1 connection pooling for efficiency
56
+ - Automatic retry with exponential backoff
57
+ - Memory-efficient ZIP processing
58
+ - Real-time progress tracking
59
+ - Comprehensive error handling
60
+
61
+ Performance optimizations:
62
+ - Single persistent HTTP client with connection reuse
63
+ - In-memory ZIP extraction of downloaded archives
64
+ - Concurrent processing of multiple files
65
+ - Intelligent timeout and retry strategies
66
+
67
+ Examples:
68
+ Basic concurrent downloading:
69
+
70
+ >>> manager = ConcurrentDownloadManager(max_concurrent=13)
71
+ >>> async with manager:
72
+ ... results = await manager.download_tasks(download_tasks)
73
+ >>> successful = [r for r in results if r.success]
74
+ >>> print(f"Downloaded {len(successful)}/{len(results)} files")
75
+
76
+ Custom configuration:
77
+
78
+ >>> manager = ConcurrentDownloadManager(
79
+ ... max_concurrent=15, # Higher concurrency
80
+ ... connection_pool_size=25, # More connections
81
+ ... timeout=120, # Longer timeout for large files
82
+ ... max_retries=5 # More retry attempts
83
+ ... )
84
+
85
+ With progress callback:
86
+
87
+ >>> def progress_callback(completed, total, current_task):
88
+ ... print(f"Progress: {completed}/{total} - {current_task.filename}")
89
+ >>>
90
+ >>> async with manager:
91
+ ... results = await manager.download_tasks(tasks, progress_callback)
92
+ """
93
+
94
+ def __init__(
95
+ self,
96
+ max_concurrent: int = 13,
97
+ connection_pool_size: int = 20,
98
+ timeout: float = 60.0,
99
+ max_retries: int = 3,
100
+ retry_delay: float = 1.0,
101
+ retry_multiplier: float = 2.0,
102
+ ):
103
+ """
104
+ Initialize concurrent download manager.
105
+
106
+ Args:
107
+ max_concurrent: Maximum simultaneous downloads (13+ recommended for ZIP files)
108
+ connection_pool_size: HTTP connection pool size
109
+ timeout: Per-download timeout in seconds
110
+ max_retries: Maximum retry attempts for failed downloads
111
+ retry_delay: Initial retry delay in seconds
112
+ retry_multiplier: Exponential backoff multiplier
113
+ """
114
+ self.max_concurrent = max_concurrent
115
+ self.connection_pool_size = connection_pool_size
116
+ self.timeout = timeout
117
+ self.max_retries = max_retries
118
+ self.retry_delay = retry_delay
119
+ self.retry_multiplier = retry_multiplier
120
+
121
+ # HTTP client will be initialized in __aenter__
122
+ self.client: Optional[httpx.AsyncClient] = None
123
+ self.semaphore: Optional[asyncio.Semaphore] = None
124
+
125
+ self.logger = logging.getLogger(__name__)
126
+
127
+ async def __aenter__(self):
128
+ """Initialize async HTTP client and semaphore."""
129
+ # Configure HTTP client for optimal ZIP file downloading
130
+ limits = httpx.Limits(
131
+ max_keepalive_connections=self.connection_pool_size,
132
+ max_connections=self.connection_pool_size + 10, # Extra headroom
133
+ keepalive_expiry=30.0, # Keep connections alive
134
+ )
135
+
136
+ timeout = httpx.Timeout(
137
+ connect=10.0, # Connection timeout
138
+ read=self.timeout, # Read timeout for large ZIP files
139
+ write=10.0, # Write timeout
140
+ pool=5.0, # Pool timeout
141
+ )
142
+
143
+ self.client = httpx.AsyncClient(
144
+ limits=limits,
145
+ timeout=timeout,
146
+ http2=False, # Disable HTTP/2 to avoid h2 dependency
147
+ follow_redirects=True,
148
+ )
149
+
150
+ # Semaphore to control concurrent downloads
151
+ self.semaphore = asyncio.Semaphore(self.max_concurrent)
152
+
153
+ return self
154
+
155
+ async def __aexit__(self, exc_type, exc_val, exc_tb):
156
+ """Clean up HTTP client."""
157
+ if self.client:
158
+ await self.client.aclose()
159
+ self.client = None
160
+
161
+ async def download_tasks(
162
+ self,
163
+ tasks: List[DownloadTask],
164
+ progress_callback: Optional[Callable[[int, int, DownloadTask], None]] = None,
165
+ ) -> List[DownloadResult]:
166
+ """
167
+ Download multiple tasks concurrently with progress tracking.
168
+
169
+ Args:
170
+ tasks: List of download tasks to execute
171
+ progress_callback: Optional callback for progress updates
172
+
173
+ Returns:
174
+ List of download results in same order as input tasks
175
+ """
176
+ if not self.client or not self.semaphore:
177
+ raise RuntimeError("DownloadManager must be used as async context manager")
178
+
179
+ self.logger.info(f"Starting concurrent download of {len(tasks)} files")
180
+ self.logger.info(f"Max concurrent downloads: {self.max_concurrent}")
181
+
182
+ # Track completed downloads for progress reporting
183
+ completed_count = 0
184
+ total_count = len(tasks)
185
+
186
+ async def download_with_progress(task: DownloadTask) -> DownloadResult:
187
+ nonlocal completed_count
188
+
189
+ result = await self._download_single_task(task)
190
+
191
+ completed_count += 1
192
+ if progress_callback:
193
+ progress_callback(completed_count, total_count, task)
194
+
195
+ return result
196
+
197
+ # Execute all downloads concurrently
198
+ download_coroutines = [download_with_progress(task) for task in tasks]
199
+ results = await asyncio.gather(*download_coroutines, return_exceptions=True)
200
+
201
+ # Handle any exceptions that occurred
202
+ final_results = []
203
+ for i, result in enumerate(results):
204
+ if isinstance(result, Exception):
205
+ error_result = DownloadResult(task=tasks[i], success=False, error=str(result))
206
+ final_results.append(error_result)
207
+ self.logger.error(f"Download failed for {tasks[i].filename}: {result}")
208
+ else:
209
+ final_results.append(result)
210
+
211
+ success_count = sum(1 for r in final_results if r.success)
212
+ self.logger.info(f"Download completed: {success_count}/{total_count} successful")
213
+
214
+ return final_results
215
+
216
    async def _download_single_task(self, task: DownloadTask) -> DownloadResult:
        """
        Download and process a single ZIP file task with retry logic.

        Args:
            task: Download task to execute

        Returns:
            Download result with parsed CSV data or error information
        """
        async with self.semaphore:  # Limit concurrent downloads
            start_time = datetime.now()

            # max_retries + 1 total attempts: the initial try plus the retries.
            for attempt in range(self.max_retries + 1):
                try:
                    result = await self._attempt_download(task)

                    # Calculate download time
                    # NOTE: download_time accumulates across all attempts so
                    # far, not just the most recent one.
                    download_time = (datetime.now() - start_time).total_seconds()
                    result.download_time = download_time

                    if result.success:
                        self.logger.debug(
                            f"✅ Downloaded {task.filename} in {download_time:.1f}s "
                            f"({result.file_size_bytes / 1024 / 1024:.1f} MB)"
                        )
                        return result

                    # If not the last attempt, wait before retrying
                    if attempt < self.max_retries:
                        # Exponential backoff: retry_delay * multiplier ** attempt.
                        delay = self.retry_delay * (self.retry_multiplier**attempt)
                        self.logger.warning(
                            f"⚠️ Download failed for {task.filename} (attempt {attempt + 1}), "
                            f"retrying in {delay:.1f}s: {result.error}"
                        )
                        await asyncio.sleep(delay)
                    else:
                        # Final attempt still failed: give up with the last result.
                        self.logger.error(
                            f"❌ Download failed after {self.max_retries + 1} attempts: {task.filename}"
                        )
                        return result

                except Exception as e:
                    # Unexpected failure outside _attempt_download's own error
                    # handling (that method converts expected httpx errors to
                    # failed results rather than raising).
                    error_msg = f"Unexpected error: {str(e)}"
                    self.logger.error(f"❌ Download exception for {task.filename}: {error_msg}")

                    if attempt == self.max_retries:
                        # Out of retries: surface the failure as a result object.
                        return DownloadResult(
                            task=task,
                            success=False,
                            error=error_msg,
                            download_time=(datetime.now() - start_time).total_seconds(),
                        )

                    # Wait before retrying
                    delay = self.retry_delay * (self.retry_multiplier**attempt)
                    await asyncio.sleep(delay)
273
+
274
+ async def _attempt_download(self, task: DownloadTask) -> DownloadResult:
275
+ """
276
+ Single download attempt for a ZIP file.
277
+
278
+ Args:
279
+ task: Download task to execute
280
+
281
+ Returns:
282
+ Download result with success/failure status and data
283
+ """
284
+ try:
285
+ # Download ZIP file
286
+ response = await self.client.get(task.url)
287
+
288
+ if response.status_code != 200:
289
+ return DownloadResult(
290
+ task=task,
291
+ success=False,
292
+ error=f"HTTP {response.status_code}",
293
+ status_code=response.status_code,
294
+ file_size_bytes=len(response.content) if response.content else 0,
295
+ )
296
+
297
+ # Process ZIP file in memory
298
+ zip_content = response.content
299
+ file_size = len(zip_content)
300
+
301
+ # Extract and parse CSV from ZIP
302
+ csv_data = self._extract_csv_from_zip(zip_content, task.filename)
303
+
304
+ return DownloadResult(
305
+ task=task,
306
+ success=True,
307
+ data=csv_data,
308
+ status_code=response.status_code,
309
+ file_size_bytes=file_size,
310
+ )
311
+
312
+ except httpx.TimeoutException:
313
+ return DownloadResult(task=task, success=False, error=f"Timeout after {self.timeout}s")
314
+ except httpx.ConnectError as e:
315
+ return DownloadResult(task=task, success=False, error=f"Connection error: {str(e)}")
316
+ except Exception as e:
317
+ return DownloadResult(task=task, success=False, error=f"Processing error: {str(e)}")
318
+
319
+ def _extract_csv_from_zip(self, zip_content: bytes, zip_filename: str) -> List[List[str]]:
320
+ """
321
+ Extract and parse CSV data from ZIP file content.
322
+
323
+ Args:
324
+ zip_content: Raw ZIP file bytes
325
+ zip_filename: Name of ZIP file (for CSV filename inference)
326
+
327
+ Returns:
328
+ List of CSV rows as string lists
329
+
330
+ Raises:
331
+ Exception: If ZIP extraction or CSV parsing fails
332
+ """
333
+ try:
334
+ # Expected CSV filename (remove .zip extension, add .csv)
335
+ expected_csv_name = zip_filename.replace(".zip", ".csv")
336
+
337
+ # Extract CSV from ZIP
338
+ with zipfile.ZipFile(io.BytesIO(zip_content), "r") as zip_file:
339
+ if expected_csv_name not in zip_file.namelist():
340
+ raise ValueError(f"CSV file {expected_csv_name} not found in ZIP")
341
+
342
+ with zip_file.open(expected_csv_name) as csv_file:
343
+ csv_content = csv_file.read().decode("utf-8")
344
+
345
+ # Parse CSV content
346
+ csv_rows = list(csv.reader(csv_content.strip().split("\n")))
347
+
348
+ if not csv_rows:
349
+ raise ValueError("Empty CSV file")
350
+
351
+ return csv_rows
352
+
353
+ except zipfile.BadZipFile:
354
+ raise ValueError("Invalid ZIP file format")
355
+ except UnicodeDecodeError:
356
+ raise ValueError("CSV file encoding error")
357
+ except Exception as e:
358
+ raise ValueError(f"ZIP processing failed: {str(e)}")
359
+
360
+ async def test_connection(self, test_url: str) -> Dict[str, Any]:
361
+ """
362
+ Test connection and performance to Binance Vision.
363
+
364
+ Args:
365
+ test_url: URL to test (should be a small ZIP file)
366
+
367
+ Returns:
368
+ Connection test results with timing and status information
369
+ """
370
+ if not self.client:
371
+ raise RuntimeError("DownloadManager must be used as async context manager")
372
+
373
+ start_time = datetime.now()
374
+
375
+ try:
376
+ response = await self.client.head(test_url)
377
+ end_time = datetime.now()
378
+
379
+ return {
380
+ "success": True,
381
+ "status_code": response.status_code,
382
+ "response_time_ms": (end_time - start_time).total_seconds() * 1000,
383
+ "headers": dict(response.headers),
384
+ "url": test_url,
385
+ }
386
+
387
+ except Exception as e:
388
+ end_time = datetime.now()
389
+
390
+ return {
391
+ "success": False,
392
+ "error": str(e),
393
+ "response_time_ms": (end_time - start_time).total_seconds() * 1000,
394
+ "url": test_url,
395
+ }