gapless_crypto_clickhouse-7.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. gapless_crypto_clickhouse/__init__.py +147 -0
  2. gapless_crypto_clickhouse/__probe__.py +349 -0
  3. gapless_crypto_clickhouse/api.py +1032 -0
  4. gapless_crypto_clickhouse/clickhouse/__init__.py +17 -0
  5. gapless_crypto_clickhouse/clickhouse/config.py +119 -0
  6. gapless_crypto_clickhouse/clickhouse/connection.py +269 -0
  7. gapless_crypto_clickhouse/clickhouse/schema.sql +98 -0
  8. gapless_crypto_clickhouse/clickhouse/schema_validator.py +312 -0
  9. gapless_crypto_clickhouse/clickhouse_query.py +642 -0
  10. gapless_crypto_clickhouse/collectors/__init__.py +21 -0
  11. gapless_crypto_clickhouse/collectors/binance_public_data_collector.py +1994 -0
  12. gapless_crypto_clickhouse/collectors/clickhouse_bulk_loader.py +446 -0
  13. gapless_crypto_clickhouse/collectors/concurrent_collection_orchestrator.py +407 -0
  14. gapless_crypto_clickhouse/collectors/csv_format_detector.py +123 -0
  15. gapless_crypto_clickhouse/collectors/httpx_downloader.py +395 -0
  16. gapless_crypto_clickhouse/collectors/hybrid_url_generator.py +316 -0
  17. gapless_crypto_clickhouse/exceptions.py +145 -0
  18. gapless_crypto_clickhouse/gap_filling/__init__.py +1 -0
  19. gapless_crypto_clickhouse/gap_filling/safe_file_operations.py +439 -0
  20. gapless_crypto_clickhouse/gap_filling/universal_gap_filler.py +757 -0
  21. gapless_crypto_clickhouse/llms.txt +268 -0
  22. gapless_crypto_clickhouse/probe.py +235 -0
  23. gapless_crypto_clickhouse/py.typed +0 -0
  24. gapless_crypto_clickhouse/query_api.py +374 -0
  25. gapless_crypto_clickhouse/resume/__init__.py +12 -0
  26. gapless_crypto_clickhouse/resume/intelligent_checkpointing.py +383 -0
  27. gapless_crypto_clickhouse/utils/__init__.py +29 -0
  28. gapless_crypto_clickhouse/utils/error_handling.py +202 -0
  29. gapless_crypto_clickhouse/utils/etag_cache.py +194 -0
  30. gapless_crypto_clickhouse/utils/timeframe_constants.py +90 -0
  31. gapless_crypto_clickhouse/utils/timestamp_format_analyzer.py +256 -0
  32. gapless_crypto_clickhouse/utils/timestamp_utils.py +130 -0
  33. gapless_crypto_clickhouse/validation/__init__.py +36 -0
  34. gapless_crypto_clickhouse/validation/csv_validator.py +677 -0
  35. gapless_crypto_clickhouse/validation/models.py +220 -0
  36. gapless_crypto_clickhouse/validation/storage.py +502 -0
  37. gapless_crypto_clickhouse-7.1.0.dist-info/METADATA +1277 -0
  38. gapless_crypto_clickhouse-7.1.0.dist-info/RECORD +40 -0
  39. gapless_crypto_clickhouse-7.1.0.dist-info/WHEEL +4 -0
  40. gapless_crypto_clickhouse-7.1.0.dist-info/licenses/LICENSE +21 -0
gapless_crypto_clickhouse/collectors/concurrent_collection_orchestrator.py
@@ -0,0 +1,407 @@
+ #!/usr/bin/env python3
+ """
+ Concurrent Collection Orchestrator
+
+ High-performance orchestrator that combines hybrid URL generation with concurrent
+ downloading for maximum throughput while maintaining data integrity.
+
+ Integrates:
+ - HybridUrlGenerator: Smart monthly+daily strategy
+ - ConcurrentDownloadManager: HTTPX async downloads with 13 concurrent connections
+ - BinancePublicDataCollector: Data processing and validation
+ """
+
+ import asyncio
+ import logging
+ from dataclasses import dataclass
+ from datetime import datetime
+ from pathlib import Path
+ from typing import Any, Callable, Dict, List, Optional, Union
+
+ from .httpx_downloader import ConcurrentDownloadManager
+ from .hybrid_url_generator import DataSource, HybridUrlGenerator
+
+
+ @dataclass
+ class CollectionResult:
+     """Result of concurrent collection operation."""
+
+     success: bool
+     timeframe: str
+     total_tasks: int
+     successful_downloads: int
+     failed_downloads: int
+     total_bars: int
+     collection_time: float
+     data_source_breakdown: Dict[str, int]  # monthly vs daily counts
+     processed_data: Optional[List[List[str]]] = None
+     errors: Optional[List[str]] = None
+
+
+ class ConcurrentCollectionOrchestrator:
+     """
+     High-performance concurrent collection orchestrator for hybrid Binance data collection.
+
+     Combines the hybrid URL generation strategy with concurrent HTTPX downloading
+     to achieve maximum throughput while maintaining authentic data integrity.
+
+     Features:
+     - Hybrid monthly+daily strategy for optimal performance
+     - 13 concurrent downloads with connection pooling
+     - Intelligent data source selection based on age
+     - Real-time progress tracking and error handling
+     - Seamless integration with existing BinancePublicDataCollector
+
+     Performance Benefits:
+     - 10-15x faster than sequential downloads
+     - Optimal memory usage with streaming processing
+     - Automatic retry logic with exponential backoff
+     - Connection reuse and HTTP/2 support (when available)
+
+     Examples:
+         Basic concurrent collection:
+
+         >>> orchestrator = ConcurrentCollectionOrchestrator(
+         ...     symbol="BTCUSDT",
+         ...     start_date=datetime(2024, 1, 1),
+         ...     end_date=datetime(2024, 12, 31)
+         ... )
+         >>> async with orchestrator:
+         ...     result = await orchestrator.collect_timeframe_concurrent("1h")
+         ...     print(f"Collected {result.total_bars} bars in {result.collection_time:.1f}s")
+         Collected 8760 bars in 12.3s
+
+         Multiple timeframes with progress tracking:
+
+         >>> def progress_callback(completed, total, current_task):
+         ...     print(f"Progress: {completed}/{total} - {current_task.filename}")
+         >>>
+         >>> orchestrator = ConcurrentCollectionOrchestrator(symbol="ETHUSDT")
+         >>> async with orchestrator:
+         ...     results = await orchestrator.collect_multiple_timeframes_concurrent(
+         ...         ["1h", "4h"], progress_callback=progress_callback
+         ...     )
+     """
+
+     def __init__(
+         self,
+         symbol: str = "SOLUSDT",
+         start_date: Optional[datetime] = None,
+         end_date: Optional[datetime] = None,
+         output_dir: Optional[Union[str, Path]] = None,
+         max_concurrent: int = 13,
+         daily_lookback_days: int = 30,
+         timeout: float = 60.0,
+         max_retries: int = 3,
+     ):
+         """
+         Initialize concurrent collection orchestrator.
+
+         Args:
+             symbol: Trading pair symbol (e.g., "BTCUSDT")
+             start_date: Collection start date
+             end_date: Collection end date
+             output_dir: Directory to save collected data
+             max_concurrent: Maximum simultaneous downloads (13 optimized for ZIP files)
+             daily_lookback_days: Days to use daily files for recent data
+             timeout: Download timeout per file in seconds
+             max_retries: Maximum retry attempts for failed downloads
+         """
+         self.symbol = symbol
+         self.start_date = start_date or datetime(2020, 8, 15)
+         self.end_date = end_date or datetime(2025, 3, 20)
+         self.max_concurrent = max_concurrent
+         self.timeout = timeout
+         self.max_retries = max_retries
+
+         # Configure output directory
+         if output_dir:
+             self.output_dir = Path(output_dir)
+         else:
+             self.output_dir = Path(__file__).parent.parent / "sample_data"
+         self.output_dir.mkdir(parents=True, exist_ok=True)
+
+         # Initialize components
+         self.url_generator = HybridUrlGenerator(
+             daily_lookback_days=daily_lookback_days, max_concurrent_per_batch=max_concurrent
+         )
+
+         self.download_manager: Optional[ConcurrentDownloadManager] = None
+         self.logger = logging.getLogger(__name__)
+
+         self.logger.info(f"Initialized ConcurrentCollectionOrchestrator for {symbol}")
+         self.logger.info(f"Date range: {self.start_date} to {self.end_date}")
+         self.logger.info(f"Max concurrent downloads: {max_concurrent}")
+
+     async def __aenter__(self):
+         """Initialize async components."""
+         self.download_manager = ConcurrentDownloadManager(
+             max_concurrent=self.max_concurrent, timeout=self.timeout, max_retries=self.max_retries
+         )
+         await self.download_manager.__aenter__()
+         return self
+
+     async def __aexit__(self, exc_type, exc_val, exc_tb):
+         """Clean up async components."""
+         if self.download_manager:
+             await self.download_manager.__aexit__(exc_type, exc_val, exc_tb)
+
+     async def collect_timeframe_concurrent(
+         self, timeframe: str, progress_callback: Optional[Callable] = None
+     ) -> CollectionResult:
+         """
+         Collect data for a single timeframe using concurrent hybrid strategy.
+
+         Args:
+             timeframe: Timeframe to collect (e.g., "1h", "4h")
+             progress_callback: Optional callback for progress updates
+
+         Returns:
+             CollectionResult with comprehensive collection statistics
+         """
+         start_time = datetime.now()
+         self.logger.info(f"Starting concurrent collection for {self.symbol} {timeframe}")
+
+         try:
+             # Generate hybrid download tasks
+             download_tasks = self.url_generator.generate_download_tasks(
+                 symbol=self.symbol,
+                 timeframe=timeframe,
+                 start_date=self.start_date,
+                 end_date=self.end_date,
+             )
+
+             if not download_tasks:
+                 return CollectionResult(
+                     success=False,
+                     timeframe=timeframe,
+                     total_tasks=0,
+                     successful_downloads=0,
+                     failed_downloads=0,
+                     total_bars=0,
+                     collection_time=0.0,
+                     data_source_breakdown={},
+                     errors=["No download tasks generated"],
+                 )
+
+             # Log strategy breakdown
+             monthly_tasks, daily_tasks = self.url_generator.separate_tasks_by_source(download_tasks)
+             self.logger.info(
+                 f"Download strategy: {len(monthly_tasks)} monthly + {len(daily_tasks)} daily = {len(download_tasks)} total"
+             )
+
+             # Execute concurrent downloads
+             if not self.download_manager:
+                 raise RuntimeError("Download manager not initialized - use async context manager")
+
+             download_results = await self.download_manager.download_tasks(
+                 download_tasks, progress_callback
+             )
+
+             # Process results
+             processed_data = []
+             successful_downloads = 0
+             failed_downloads = 0
+             errors = []
+
+             for result in download_results:
+                 if result.success and result.data:
+                     processed_data.extend(result.data)
+                     successful_downloads += 1
+                 else:
+                     failed_downloads += 1
+                     if result.error:
+                         errors.append(f"{result.task.filename}: {result.error}")
+
+             # Sort chronologically
+             if processed_data:
+                 processed_data.sort(key=lambda row: row[0])  # Sort by timestamp
+
+             # Calculate data source breakdown
+             monthly_successful = sum(
+                 1
+                 for r in download_results
+                 if r.success and r.task.source_type == DataSource.MONTHLY
+             )
+             daily_successful = sum(
+                 1 for r in download_results if r.success and r.task.source_type == DataSource.DAILY
+             )
+
+             collection_time = (datetime.now() - start_time).total_seconds()
+
+             result = CollectionResult(
+                 success=successful_downloads > 0,
+                 timeframe=timeframe,
+                 total_tasks=len(download_tasks),
+                 successful_downloads=successful_downloads,
+                 failed_downloads=failed_downloads,
+                 total_bars=len(processed_data),
+                 collection_time=collection_time,
+                 data_source_breakdown={"monthly": monthly_successful, "daily": daily_successful},
+                 processed_data=processed_data,
+                 errors=errors if errors else None,
+             )
+
+             # Log results
+             self.logger.info(f"Collection completed for {timeframe}:")
+             self.logger.info(f" Tasks: {successful_downloads}/{len(download_tasks)} successful")
+             self.logger.info(f" Data: {len(processed_data)} bars in {collection_time:.1f}s")
+             self.logger.info(f" Sources: {monthly_successful} monthly + {daily_successful} daily")
+
+             return result
+
+         except Exception as e:
+             collection_time = (datetime.now() - start_time).total_seconds()
+             self.logger.error(f"Collection failed for {timeframe}: {e}")
+
+             return CollectionResult(
+                 success=False,
+                 timeframe=timeframe,
+                 total_tasks=0,
+                 successful_downloads=0,
+                 failed_downloads=0,
+                 total_bars=0,
+                 collection_time=collection_time,
+                 data_source_breakdown={},
+                 errors=[str(e)],
+             )
+
+     async def collect_multiple_timeframes_concurrent(
+         self, timeframes: List[str], progress_callback: Optional[Callable] = None
+     ) -> Dict[str, CollectionResult]:
+         """
+         Collect data for multiple timeframes concurrently.
+
+         Args:
+             timeframes: List of timeframes to collect
+             progress_callback: Optional callback for progress updates
+
+         Returns:
+             Dictionary mapping timeframes to CollectionResult objects
+         """
+         self.logger.info(f"Starting concurrent collection for {len(timeframes)} timeframes")
+         start_time = datetime.now()
+
+         # Create collection tasks for each timeframe
+         collection_tasks = []
+         for timeframe in timeframes:
+             task = self.collect_timeframe_concurrent(timeframe, progress_callback)
+             collection_tasks.append((timeframe, task))
+
+         # Execute all timeframe collections concurrently
+         results = {}
+         completed_tasks = await asyncio.gather(
+             *[task for _, task in collection_tasks], return_exceptions=True
+         )
+
+         # Process results
+         for i, result in enumerate(completed_tasks):
+             timeframe = timeframes[i]
+             if isinstance(result, Exception):
+                 results[timeframe] = CollectionResult(
+                     success=False,
+                     timeframe=timeframe,
+                     total_tasks=0,
+                     successful_downloads=0,
+                     failed_downloads=0,
+                     total_bars=0,
+                     collection_time=0.0,
+                     data_source_breakdown={},
+                     errors=[str(result)],
+                 )
+                 self.logger.error(f"Collection failed for {timeframe}: {result}")
+             else:
+                 results[timeframe] = result
+
+         total_time = (datetime.now() - start_time).total_seconds()
+         successful_timeframes = sum(1 for r in results.values() if r.success)
+         total_bars = sum(r.total_bars for r in results.values())
+
+         self.logger.info("Multi-timeframe collection completed:")
+         self.logger.info(f" Timeframes: {successful_timeframes}/{len(timeframes)} successful")
+         self.logger.info(f" Total bars: {total_bars:,}")
+         self.logger.info(f" Total time: {total_time:.1f}s")
+
+         return results
+
+     def get_collection_strategy_summary(self, timeframe: str) -> Dict[str, Any]:
+         """
+         Get summary of collection strategy for the given timeframe.
+
+         Args:
+             timeframe: Timeframe to analyze
+
+         Returns:
+             Dictionary with strategy summary and performance estimates
+         """
+         return self.url_generator.get_collection_strategy_summary(
+             symbol=self.symbol,
+             timeframe=timeframe,
+             start_date=self.start_date,
+             end_date=self.end_date,
+         )
+
+     async def test_connection_performance(self) -> Dict[str, Any]:
+         """
+         Test connection performance to Binance Vision servers.
+
+         Returns:
+             Dictionary with connection test results and performance metrics
+         """
+         if not self.download_manager:
+             raise RuntimeError("Download manager not initialized - use async context manager")
+
+         # Test with a small monthly file
+         test_url = f"https://data.binance.vision/data/spot/monthly/klines/{self.symbol}/1h/{self.symbol}-1h-2024-01.zip"
+
+         return await self.download_manager.test_connection(test_url)
+
+     def estimate_collection_time(self, timeframes: List[str]) -> Dict[str, Any]:
+         """
+         Estimate collection time and resource requirements.
+
+         Args:
+             timeframes: List of timeframes to estimate
+
+         Returns:
+             Dictionary with time estimates and resource requirements
+         """
+         total_tasks = 0
+         monthly_tasks = 0
+         daily_tasks = 0
+
+         for timeframe in timeframes:
+             tasks = self.url_generator.generate_download_tasks(
+                 symbol=self.symbol,
+                 timeframe=timeframe,
+                 start_date=self.start_date,
+                 end_date=self.end_date,
+             )
+             total_tasks += len(tasks)
+
+             monthly, daily = self.url_generator.separate_tasks_by_source(tasks)
+             monthly_tasks += len(monthly)
+             daily_tasks += len(daily)
+
+         # Estimate based on concurrent batches
+         batches_needed = (total_tasks + self.max_concurrent - 1) // self.max_concurrent
+
+         # Conservative estimates based on file sizes and network speed
+         avg_monthly_time = 3.0  # seconds per monthly file
+         avg_daily_time = 1.0  # seconds per daily file
+
+         estimated_time = (
+             monthly_tasks * avg_monthly_time + daily_tasks * avg_daily_time
+         ) / self.max_concurrent
+
+         return {
+             "total_tasks": total_tasks,
+             "monthly_tasks": monthly_tasks,
+             "daily_tasks": daily_tasks,
+             "concurrent_batches": batches_needed,
+             "max_concurrent": self.max_concurrent,
+             "estimated_time_seconds": estimated_time,
+             "estimated_time_minutes": estimated_time / 60,
+             "timeframes": timeframes,
+             "strategy": "hybrid_monthly_daily_concurrent",
+         }
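
The time-estimation formula in estimate_collection_time is simple enough to check by hand. A minimal standalone sketch of the same arithmetic, using hypothetical task counts (in the package, the real counts come from HybridUrlGenerator.generate_download_tasks):

# Standalone sketch of the estimate_collection_time arithmetic above.
# The task counts below are hypothetical; the real ones come from
# HybridUrlGenerator.generate_download_tasks().

MAX_CONCURRENT = 13     # default pool size used by the orchestrator
AVG_MONTHLY_TIME = 3.0  # seconds per monthly ZIP (module's conservative estimate)
AVG_DAILY_TIME = 1.0    # seconds per daily ZIP (module's conservative estimate)

def estimate_seconds(monthly_tasks: int, daily_tasks: int) -> float:
    """Aggregate sequential download time spread across the concurrent pool."""
    return (monthly_tasks * AVG_MONTHLY_TIME + daily_tasks * AVG_DAILY_TIME) / MAX_CONCURRENT

# One year of 1h klines with a 30-day daily lookback yields roughly
# 12 monthly + 30 daily tasks: (12 * 3.0 + 30 * 1.0) / 13 ≈ 5.1 seconds.
print(f"{estimate_seconds(12, 30):.1f}s")  # -> 5.1s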
gapless_crypto_clickhouse/collectors/csv_format_detector.py
@@ -0,0 +1,123 @@
+ """
+ CSV Format Detector for Binance kline data.
+
+ Detects whether a CSV file is in spot format (11 columns, no header) or
+ futures format (12 columns, with header row).
+
+ Usage:
+     from pathlib import Path
+     from gapless_crypto_clickhouse.collectors.csv_format_detector import detect_csv_format
+
+     format_type = detect_csv_format(Path("BTCUSDT-1h-2024-01.csv"))  # "spot" or "futures"
+ """
+
+ import logging
+ from pathlib import Path
+ from typing import Literal
+
+ logger = logging.getLogger(__name__)
+
+
+ def detect_csv_format(csv_path: Path) -> Literal["spot", "futures"]:
+     """
+     Detect CSV format by reading first line.
+
+     Binance CSV formats:
+     - Spot: 11 columns, no header (starts with timestamp)
+     - Futures: 12 columns, with header row (starts with "open_time")
+
+     Detection strategy:
+     1. Read first line
+     2. Check if first field is "open_time" (futures header)
+     3. If not, assume spot format (no header, starts with timestamp)
+
+     Args:
+         csv_path: Path to CSV file
+
+     Returns:
+         "spot" or "futures"
+
+     Raises:
+         FileNotFoundError: If CSV file doesn't exist
+         ValueError: If the file is empty
+         RuntimeError: If the file is unreadable or format detection fails
+
+     Examples:
+         >>> detect_csv_format(Path("BTCUSDT-1h-2024-01.csv"))
+         'spot'
+
+         >>> detect_csv_format(Path("BTCUSDT-1h-2024-01-futures.csv"))
+         'futures'
+     """
+     if not csv_path.exists():
+         raise FileNotFoundError(f"CSV file not found: {csv_path}")
+
+     try:
+         with open(csv_path, "r", encoding="utf-8") as f:
+             first_line = f.readline().strip()
+
+             if not first_line:
+                 raise ValueError(f"CSV file is empty: {csv_path}")
+
+             # Futures CSV has header row starting with "open_time"
+             # Spot CSV has no header, first line starts with timestamp (numeric)
+             if first_line.startswith("open_time"):
+                 logger.debug(f"Detected futures format (header present): {csv_path}")
+                 return "futures"
+             else:
+                 # Verify it's a valid spot format (first field should be numeric timestamp)
+                 first_field = first_line.split(",")[0]
+                 try:
+                     int(first_field)  # Timestamp should be parseable as integer
+                     logger.debug(f"Detected spot format (no header, timestamp first): {csv_path}")
+                     return "spot"
+                 except ValueError as e:
+                     raise RuntimeError(
+                         f"Failed to detect format for {csv_path}. "
+                         f"First field '{first_field}' is neither 'open_time' (futures header) "
+                         f"nor a valid timestamp (spot format). Error: {e}"
+                     ) from e
+
+     except (OSError, UnicodeDecodeError) as e:
+         raise RuntimeError(f"Failed to read CSV file {csv_path}: {e}") from e
+
+
+ def count_csv_columns(csv_path: Path, has_header: bool = False) -> int:
+     """
+     Count columns in CSV file by reading first data line.
+
+     Args:
+         csv_path: Path to CSV file
+         has_header: If True, skip first line (header)
+
+     Returns:
+         Number of columns
+
+     Raises:
+         FileNotFoundError: If CSV file doesn't exist
+         ValueError: If CSV has no data lines
+
+     Examples:
+         >>> count_csv_columns(Path("spot.csv"), has_header=False)
+         11
+
+         >>> count_csv_columns(Path("futures.csv"), has_header=True)
+         12
+     """
+     if not csv_path.exists():
+         raise FileNotFoundError(f"CSV file not found: {csv_path}")
+
+     try:
+         with open(csv_path, "r", encoding="utf-8") as f:
+             if has_header:
+                 f.readline()  # Skip header
+             first_data_line = f.readline().strip()
+
+             if not first_data_line:
+                 raise ValueError(f"CSV file has no data lines: {csv_path}")
+
+             columns = first_data_line.split(",")
+             return len(columns)
+
+     except (OSError, UnicodeDecodeError) as e:
+         raise RuntimeError(f"Failed to read CSV file {csv_path}: {e}") from e