gapless-crypto-clickhouse 7.1.0 (gapless_crypto_clickhouse-7.1.0-py3-none-any.whl)

This diff shows the contents of publicly released package versions as they appear in their public registries. It is provided for informational purposes only.
Files changed (40)
  1. gapless_crypto_clickhouse/__init__.py +147 -0
  2. gapless_crypto_clickhouse/__probe__.py +349 -0
  3. gapless_crypto_clickhouse/api.py +1032 -0
  4. gapless_crypto_clickhouse/clickhouse/__init__.py +17 -0
  5. gapless_crypto_clickhouse/clickhouse/config.py +119 -0
  6. gapless_crypto_clickhouse/clickhouse/connection.py +269 -0
  7. gapless_crypto_clickhouse/clickhouse/schema.sql +98 -0
  8. gapless_crypto_clickhouse/clickhouse/schema_validator.py +312 -0
  9. gapless_crypto_clickhouse/clickhouse_query.py +642 -0
  10. gapless_crypto_clickhouse/collectors/__init__.py +21 -0
  11. gapless_crypto_clickhouse/collectors/binance_public_data_collector.py +1994 -0
  12. gapless_crypto_clickhouse/collectors/clickhouse_bulk_loader.py +446 -0
  13. gapless_crypto_clickhouse/collectors/concurrent_collection_orchestrator.py +407 -0
  14. gapless_crypto_clickhouse/collectors/csv_format_detector.py +123 -0
  15. gapless_crypto_clickhouse/collectors/httpx_downloader.py +395 -0
  16. gapless_crypto_clickhouse/collectors/hybrid_url_generator.py +316 -0
  17. gapless_crypto_clickhouse/exceptions.py +145 -0
  18. gapless_crypto_clickhouse/gap_filling/__init__.py +1 -0
  19. gapless_crypto_clickhouse/gap_filling/safe_file_operations.py +439 -0
  20. gapless_crypto_clickhouse/gap_filling/universal_gap_filler.py +757 -0
  21. gapless_crypto_clickhouse/llms.txt +268 -0
  22. gapless_crypto_clickhouse/probe.py +235 -0
  23. gapless_crypto_clickhouse/py.typed +0 -0
  24. gapless_crypto_clickhouse/query_api.py +374 -0
  25. gapless_crypto_clickhouse/resume/__init__.py +12 -0
  26. gapless_crypto_clickhouse/resume/intelligent_checkpointing.py +383 -0
  27. gapless_crypto_clickhouse/utils/__init__.py +29 -0
  28. gapless_crypto_clickhouse/utils/error_handling.py +202 -0
  29. gapless_crypto_clickhouse/utils/etag_cache.py +194 -0
  30. gapless_crypto_clickhouse/utils/timeframe_constants.py +90 -0
  31. gapless_crypto_clickhouse/utils/timestamp_format_analyzer.py +256 -0
  32. gapless_crypto_clickhouse/utils/timestamp_utils.py +130 -0
  33. gapless_crypto_clickhouse/validation/__init__.py +36 -0
  34. gapless_crypto_clickhouse/validation/csv_validator.py +677 -0
  35. gapless_crypto_clickhouse/validation/models.py +220 -0
  36. gapless_crypto_clickhouse/validation/storage.py +502 -0
  37. gapless_crypto_clickhouse-7.1.0.dist-info/METADATA +1277 -0
  38. gapless_crypto_clickhouse-7.1.0.dist-info/RECORD +40 -0
  39. gapless_crypto_clickhouse-7.1.0.dist-info/WHEEL +4 -0
  40. gapless_crypto_clickhouse-7.1.0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,1994 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Binance Public Data Collector
4
+
5
+ Ultra-fast historical data collection using Binance's official public data repository.
6
+ 10-100x faster than API calls, with complete historical coverage.
7
+
8
+ Data source: https://data.binance.vision/data/spot/monthly/klines/
9
+ """
10
+
11
+ import argparse
12
+ import csv
13
+ import hashlib
14
+ import json
15
+ import shutil
16
+ import tempfile
17
+ import urllib.error
+ import urllib.request
18
+ import warnings
19
+ import zipfile
20
+ from datetime import datetime, timedelta, timezone
21
+ from pathlib import Path
22
+ from typing import Any, Dict, List, Optional, Tuple, Union
23
+
24
+ import pandas as pd
25
+
26
+ from ..gap_filling.universal_gap_filler import UniversalGapFiller
27
+ from ..utils.etag_cache import ETagCache
28
+ from ..utils.timeframe_constants import TIMEFRAME_TO_MINUTES
29
+ from ..utils.timestamp_format_analyzer import TimestampFormatAnalyzer
30
+ from ..validation.csv_validator import CSVValidator
31
+
32
+
33
+ class BinancePublicDataCollector:
34
+ """Ultra-fast cryptocurrency spot data collection from Binance's public data repository.
35
+
36
+ This collector provides 10-100x faster data collection compared to API calls by
37
+ downloading pre-generated monthly ZIP files from Binance's official public data repository.
38
+ Supports complete historical coverage with full 11-column microstructure format including
39
+ order flow metrics.
40
+
41
+ Features:
42
+ - Ultra-fast bulk data collection from monthly ZIP archives
43
+ - Complete historical coverage from 2017 onwards
44
+ - Full 11-column microstructure format with order flow data
45
+ - Automatic gap detection and filling capabilities
46
+ - Built-in data validation and integrity checks
47
+ - Support for all major timeframes (1m, 3m, 5m, 15m, 30m, 1h, 2h, 4h)
48
+ - DataFrame-first Python API with seamless pandas integration
49
+
50
+ Data Format:
51
+ The collector outputs CSV files with 11 columns providing complete market microstructure:
52
+ - OHLCV: Open, High, Low, Close, Volume
53
+ - Timestamps: Open Time, Close Time
54
+ - Order Flow: Quote Asset Volume, Number of Trades
55
+ - Taker Metrics: Taker Buy Base Volume, Taker Buy Quote Volume
56
+
57
+ Examples:
58
+ For simple data collection, consider using the function-based API:
59
+
60
+ >>> import gapless_crypto_clickhouse as gcd
61
+ >>> df = gcd.fetch_data("BTCUSDT", "1h", start="2024-01-01", end="2024-12-31")
62
+
63
+ Advanced usage with this class for complex workflows:
64
+
65
+ >>> collector = BinancePublicDataCollector()
66
+ >>> result = collector.collect_timeframe_data("1h")
67
+ >>> df = result["dataframe"]
68
+ >>> print(f"Collected {len(df)} bars of {collector.symbol} data")
69
+ Collected 26280 bars of SOLUSDT data
70
+
71
+ Custom configuration and multiple timeframes:
72
+
73
+ >>> collector = BinancePublicDataCollector(
74
+ ... symbol="BTCUSDT",
75
+ ... start_date="2023-01-01",
76
+ ... end_date="2023-12-31",
77
+ ... output_dir="./crypto_data"
78
+ ... )
79
+ >>> results = collector.collect_multiple_timeframes(["1h", "4h"])
80
+ >>> for timeframe, result in results.items():
81
+ ... print(f"{timeframe}: {len(result['dataframe'])} bars")
82
+ 1h: 8760 bars
83
+ 4h: 2190 bars
84
+
85
+ Note:
86
+ This collector supports 713 USDT perpetual symbols for both spot and futures-um markets.
87
+ Symbol validation is handled in the API layer via get_supported_symbols().
88
+ See ADR-0022 for spot/futures alignment rationale.
89
+ """
90
+
91
+ def _validate_symbol(self, symbol: str) -> str:
92
+ """
93
+ Validate and sanitize symbol input for security.
94
+
95
+ This method prevents path traversal attacks and ensures symbol format integrity
96
+ by rejecting invalid characters and malformed inputs.
97
+
98
+ Args:
99
+ symbol: Trading pair symbol to validate (e.g., "BTCUSDT", "SOLUSDT")
100
+
101
+ Returns:
102
+ Validated and normalized symbol string (uppercase, stripped)
103
+
104
+ Raises:
105
+ ValueError: If symbol is None, empty, or contains invalid characters
106
+
107
+ Security:
108
+ - Prevents path traversal attacks (CWE-22)
109
+ - Blocks directory navigation characters (/, \\, ., ..)
110
+ - Enforces alphanumeric-only input
111
+ - Protects file operations using symbol in paths
112
+
113
+ Examples:
114
+ >>> collector._validate_symbol("btcusdt")
115
+ 'BTCUSDT'
116
+
117
+ >>> collector._validate_symbol("BTC/../etc/passwd")
118
+ ValueError: Symbol contains invalid characters...
119
+
120
+ >>> collector._validate_symbol("")
121
+ ValueError: Symbol cannot be empty
122
+
123
+ >>> collector._validate_symbol(None)
124
+ ValueError: Symbol cannot be None
125
+ """
126
+ # SEC-03: None value validation
127
+ if symbol is None:
128
+ raise ValueError("Symbol cannot be None")
129
+
130
+ # SEC-02: Empty string validation
131
+ if not symbol or not symbol.strip():
132
+ raise ValueError("Symbol cannot be empty")
133
+
134
+ # SEC-01: Path traversal prevention
135
+ import re
136
+
137
+ if re.search(r"[./\\]", symbol):
138
+ raise ValueError(
139
+ f"Symbol contains invalid characters: {symbol}\n"
140
+ f"Symbol must be alphanumeric (e.g., BTCUSDT, SOLUSDT)"
141
+ )
142
+
143
+ # Normalize to uppercase and strip whitespace
144
+ symbol = symbol.upper().strip()
145
+
146
+ # Whitelist validation - only alphanumeric characters
147
+ if not re.match(r"^[A-Z0-9]+$", symbol):
148
+ raise ValueError(
149
+ f"Symbol must be alphanumeric: {symbol}\nValid examples: BTCUSDT, ETHUSDT, SOLUSDT"
150
+ )
151
+
152
+ return symbol
153
+
154
+ # ADR-0021: URL constants for spot and futures
155
+ SPOT_BASE_URL = "https://data.binance.vision/data/spot"
156
+ FUTURES_BASE_URL = "https://data.binance.vision/data/futures/um"
157
+
158
+ def __init__(
159
+ self,
160
+ symbol: str = "SOLUSDT",
161
+ start_date: str = "2020-08-15",
162
+ end_date: str = "2025-03-20",
163
+ output_dir: Optional[Union[str, Path]] = None,
164
+ output_format: str = "csv",
165
+ instrument_type: str = "spot", # ADR-0021: UM futures support
166
+ ) -> None:
167
+ """Initialize the Binance Public Data Collector.
168
+
169
+ Args:
170
+ symbol (str, optional): Trading pair symbol in USDT format.
171
+ Must be alphanumeric (A-Z, 0-9) only. Path characters (/, \\, .)
172
+ and special characters are rejected for security.
173
+ Symbol is normalized to uppercase.
174
+ Defaults to "SOLUSDT".
175
+ start_date (str, optional): Start date in YYYY-MM-DD format.
176
+ Data collection begins from this date (inclusive).
177
+ Must be on or before end_date.
178
+ Defaults to "2020-08-15".
179
+ end_date (str, optional): End date in YYYY-MM-DD format.
180
+ Data collection ends on this date (inclusive, 23:59:59).
181
+ Must be on or after start_date.
182
+ Defaults to "2025-03-20".
183
+ output_dir (str or Path, optional): Directory to save files.
184
+ If None, saves to package's sample_data directory.
185
+ Defaults to None.
186
+ output_format (str, optional): Output format ("csv" or "parquet").
187
+ CSV provides universal compatibility, Parquet offers 5-10x compression.
188
+ Defaults to "csv".
189
+ instrument_type (str, optional): Instrument type - "spot" or "futures-um".
190
+ Both types support 713 perpetual symbols. Defaults to "spot".
191
+
192
+ Raises:
193
+ ValueError: If instrument_type is not "spot" or "futures-um"
194
+ ValueError: If symbol is None, empty, or contains invalid characters
195
+ (path traversal, special characters, non-alphanumeric).
196
+ ValueError: If date format is incorrect (not YYYY-MM-DD).
197
+ ValueError: If end_date is before start_date.
198
+ ValueError: If output_format is not 'csv' or 'parquet'.
199
+ FileNotFoundError: If output_dir path is invalid.
200
+
201
+ Security:
202
+ Input validation prevents path traversal attacks (CWE-22) by:
203
+ - Rejecting symbols with path characters (/, \\, ., ..)
204
+ - Enforcing alphanumeric-only symbols
205
+ - Validating date range logic
206
+ - Normalizing inputs to uppercase
207
+
208
+ Examples:
209
+ >>> # Default configuration (SOLUSDT spot, 4+ years of data)
210
+ >>> collector = BinancePublicDataCollector()
211
+
212
+ >>> # Futures data collection (NEW in v3.2.0)
213
+ >>> collector = BinancePublicDataCollector(
214
+ ... symbol="BTCUSDT",
215
+ ... start_date="2024-01-01",
216
+ ... end_date="2024-12-31",
217
+ ... instrument_type="futures-um"
218
+ ... )
219
+
220
+ >>> # Custom output directory with Parquet format
221
+ >>> collector = BinancePublicDataCollector(
222
+ ... symbol="ETHUSDT",
223
+ ... output_dir="/path/to/crypto/data",
224
+ ... output_format="parquet"
225
+ ... )
226
+ """
227
+ # ADR-0021: Validate instrument type first (fail fast)
228
+ if instrument_type not in ("spot", "futures-um"):
229
+ raise ValueError(
230
+ f"Invalid instrument_type '{instrument_type}'. Must be 'spot' or 'futures-um'"
231
+ )
232
+ self.instrument_type = instrument_type
233
+
234
+ # Validate and assign symbol (SEC-01, SEC-02, SEC-03)
235
+ self.symbol = self._validate_symbol(symbol)
236
+
237
+ # Parse and assign dates with validation
238
+ try:
239
+ self.start_date = datetime.strptime(start_date, "%Y-%m-%d")
240
+ # Make end_date inclusive of the full day (23:59:59)
241
+ self.end_date = datetime.strptime(end_date, "%Y-%m-%d").replace(
242
+ hour=23, minute=59, second=59
243
+ )
244
+ except ValueError as e:
245
+ raise ValueError(f"Invalid date format. Use YYYY-MM-DD format. Error: {e}") from e
246
+
247
+ # SEC-04: Validate date range logic
248
+ if self.end_date < self.start_date:
249
+ raise ValueError(
250
+ f"Invalid date range: end_date ({self.end_date.strftime('%Y-%m-%d')}) "
251
+ f"is before start_date ({self.start_date.strftime('%Y-%m-%d')})"
252
+ )
253
+
254
+ # ADR-0021: URL routing based on instrument type
255
+ if instrument_type == "spot":
256
+ self.base_url = f"{self.SPOT_BASE_URL}/monthly/klines"
257
+ else: # futures-um
258
+ self.base_url = f"{self.FUTURES_BASE_URL}/monthly/klines"
259
+
260
+ # Initialize ETag cache for bandwidth optimization (90% reduction on re-runs)
261
+ self.etag_cache = ETagCache()
262
+
263
+ # Validate and store output format
264
+ if output_format not in ["csv", "parquet"]:
265
+ raise ValueError(f"output_format must be 'csv' or 'parquet', got '{output_format}'")
266
+ self.output_format = output_format
267
+
268
+ # Configure output directory - use provided path or default to sample_data
269
+ if output_dir:
270
+ self.output_dir = Path(output_dir)
271
+ else:
272
+ self.output_dir = Path(__file__).parent.parent / "sample_data"
273
+
274
+ # Ensure output directory exists
275
+ self.output_dir.mkdir(parents=True, exist_ok=True)
276
+
277
+ # Progress reporting uses simple print-based logging
278
+ # (a Rich console is intentionally not initialized here)
279
+
280
+ # Available timeframes on Binance public data
281
+ self.available_timeframes = [
282
+ "1s",
283
+ "1m",
284
+ "3m",
285
+ "5m",
286
+ "15m",
287
+ "30m",
288
+ "1h",
289
+ "2h",
290
+ "4h",
291
+ "6h",
292
+ "8h",
293
+ "12h",
294
+ "1d",
295
+ "3d",
296
+ "1w",
297
+ "1mo",
298
+ ]
299
+
300
+ # Validate date range and symbol
301
+ self._validate_parameters()
302
+
303
+ print("Binance Public Data Collector")
304
+ print(f"Symbol: {self.symbol}")
305
+ print(
306
+ f"Date Range: {self.start_date.strftime('%Y-%m-%d')} to {self.end_date.strftime('%Y-%m-%d')}"
307
+ )
308
+ print(f"Data Source: {self.base_url}")
309
+
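The two base-URL constants above combine with the instrument type as follows. A brief illustrative sketch (not part of the packaged file), using a hypothetical BTCUSDT collector:

    # Illustrative only: how instrument_type selects the repository root.
    spot = BinancePublicDataCollector(symbol="BTCUSDT", instrument_type="spot")
    futures = BinancePublicDataCollector(symbol="BTCUSDT", instrument_type="futures-um")
    print(spot.base_url)     # https://data.binance.vision/data/spot/monthly/klines
    print(futures.base_url)  # https://data.binance.vision/data/futures/um/monthly/klines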
310
+ def _validate_parameters(self):
311
+ """Validate date range parameters.
312
+
313
+ Note: Symbol validation is handled in the API layer via get_supported_symbols().
314
+ See ADR-0022 for symbol alignment rationale (spot and futures both use
315
+ binance-futures-availability package for 713 validated symbols).
316
+ """
317
+ today = datetime.now().date()
318
+ yesterday = today - timedelta(days=1)
319
+
320
+ # Check for future dates
321
+ if self.end_date.date() > yesterday:
322
+ warnings.warn(
323
+ f"⚠️ Requested end date {self.end_date.strftime('%Y-%m-%d')} is in the future. "
324
+ f"Binance public data is typically available up to {yesterday}. "
325
+ f"Recent data may not be available and requests may fail with 404 errors.",
326
+ UserWarning,
327
+ stacklevel=2,
328
+ )
329
+
330
+ def generate_monthly_urls(self, trading_timeframe: str) -> List[Tuple[str, str, str]]:
331
+ """Generate list of monthly ZIP file URLs to download."""
332
+ monthly_zip_urls = []
333
+ current_month_date = self.start_date.replace(day=1) # Start of month
334
+
335
+ while current_month_date <= self.end_date:
336
+ year_month_string = current_month_date.strftime("%Y-%m")
337
+ zip_filename = f"{self.symbol}-{trading_timeframe}-{year_month_string}.zip"
338
+ binance_zip_url = f"{self.base_url}/{self.symbol}/{trading_timeframe}/{zip_filename}"
339
+ monthly_zip_urls.append((binance_zip_url, year_month_string, zip_filename))
340
+
341
+ # Move to next month
342
+ if current_month_date.month == 12:
343
+ current_month_date = current_month_date.replace(
344
+ year=current_month_date.year + 1, month=1
345
+ )
346
+ else:
347
+ current_month_date = current_month_date.replace(month=current_month_date.month + 1)
348
+
349
+ return monthly_zip_urls
350
+
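For orientation, a minimal sketch (illustrative, not part of the packaged file) of what the method above yields for a hypothetical three-month BTCUSDT range:

    # Hypothetical example of generate_monthly_urls() output for Jan-Mar 2024.
    collector = BinancePublicDataCollector(
        symbol="BTCUSDT", start_date="2024-01-01", end_date="2024-03-31"
    )
    for url, year_month, filename in collector.generate_monthly_urls("1h"):
        print(year_month, filename)
    # 2024-01 BTCUSDT-1h-2024-01.zip
    # 2024-02 BTCUSDT-1h-2024-02.zip
    # 2024-03 BTCUSDT-1h-2024-03.zip
    # Each URL has the form:
    # https://data.binance.vision/data/spot/monthly/klines/BTCUSDT/1h/BTCUSDT-1h-2024-01.zip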
351
+ def download_and_extract_month(self, binance_zip_url, zip_filename):
352
+ """Download and extract a single monthly ZIP file with ETag caching.
353
+
354
+ ETag caching reduces bandwidth by storing ZIP files locally and using
355
+ HTTP conditional requests (If-None-Match) to check if the file has changed.
356
+ Since Binance historical data is immutable, this achieves 90%+ bandwidth
357
+ reduction on re-runs.
358
+ """
359
+ print(f" Downloading {zip_filename}...")
360
+
361
+ try:
362
+ # Local cache path for ZIP files (XDG-compliant)
363
+ cache_zip_path = self.etag_cache.cache_dir / "zips" / zip_filename
364
+ cache_zip_path.parent.mkdir(parents=True, exist_ok=True)
365
+
366
+ # Check cache for ETag
367
+ cached_etag = self.etag_cache.get_etag(binance_zip_url)
368
+
369
+ # If we have both ETag and local file, check if remote changed
370
+ if cached_etag and cache_zip_path.exists():
371
+ request = urllib.request.Request(binance_zip_url)
372
+ request.add_header("If-None-Match", cached_etag)
373
+ print(f" 💾 Cache check: ETag {cached_etag[:8]}...")
374
+
375
+ try:
376
+ with urllib.request.urlopen(request, timeout=60) as http_response:
377
+ if http_response.status == 304:
378
+ # 304 Not Modified - use cached ZIP file
379
+ print(
380
+ f" ✅ Cache HIT: {zip_filename} not modified (0 bytes downloaded)"
381
+ )
382
+ # Load data from cached ZIP file
383
+ with zipfile.ZipFile(cache_zip_path, "r") as zip_file_handle:
384
+ expected_csv_filename = zip_filename.replace(".zip", ".csv")
385
+ if expected_csv_filename in zip_file_handle.namelist():
386
+ with zip_file_handle.open(
387
+ expected_csv_filename
388
+ ) as extracted_csv_file:
389
+ csv_file_content = extracted_csv_file.read().decode("utf-8")
390
+ return list(
391
+ csv.reader(csv_file_content.strip().split("\n"))
392
+ )
393
+ else:
394
+ print(f" ⚠️ CSV file not found in cached {zip_filename}")
395
+ # Cache corrupted, delete and re-download
396
+ cache_zip_path.unlink()
397
+ self.etag_cache.invalidate(binance_zip_url)
398
+ elif http_response.status == 200:
399
+ # ETag changed - download new version
400
+ response_etag = http_response.headers.get("ETag")
401
+ content_length = http_response.headers.get("Content-Length", 0)
402
+
403
+ # Download to cache
404
+ with open(cache_zip_path, "wb") as cache_file:
405
+ shutil.copyfileobj(http_response, cache_file)
406
+
407
+ # Update ETag cache
408
+ if response_etag:
409
+ self.etag_cache.update_etag(
410
+ binance_zip_url, response_etag, int(content_length)
411
+ )
412
+ print(f" 📦 Cache UPDATE: Downloaded {zip_filename}")
413
+
414
+ # Extract CSV data from cached file
415
+ with zipfile.ZipFile(cache_zip_path, "r") as zip_file_handle:
416
+ expected_csv_filename = zip_filename.replace(".zip", ".csv")
417
+ if expected_csv_filename in zip_file_handle.namelist():
418
+ with zip_file_handle.open(
419
+ expected_csv_filename
420
+ ) as extracted_csv_file:
421
+ csv_file_content = extracted_csv_file.read().decode("utf-8")
422
+ return list(
423
+ csv.reader(csv_file_content.strip().split("\n"))
424
+ )
425
+ else:
426
+ print(
427
+ f" ⚠️ HTTP {http_response.status} - {zip_filename} not available"
428
+ )
429
+ return []
430
+ except urllib.error.HTTPError as e:
431
+ if e.code == 304:
432
+ # Handle 304 explicitly - load from cache
433
+ print(f" ✅ Cache HIT: {zip_filename} not modified (0 bytes downloaded)")
434
+ with zipfile.ZipFile(cache_zip_path, "r") as zip_file_handle:
435
+ expected_csv_filename = zip_filename.replace(".zip", ".csv")
436
+ if expected_csv_filename in zip_file_handle.namelist():
437
+ with zip_file_handle.open(
438
+ expected_csv_filename
439
+ ) as extracted_csv_file:
440
+ csv_file_content = extracted_csv_file.read().decode("utf-8")
441
+ return list(csv.reader(csv_file_content.strip().split("\n")))
442
+ else:
443
+ raise
444
+ else:
445
+ # No cache - download fresh
446
+ request = urllib.request.Request(binance_zip_url)
447
+ with urllib.request.urlopen(request, timeout=60) as http_response:
448
+ response_etag = http_response.headers.get("ETag")
449
+ content_length = http_response.headers.get("Content-Length", 0)
450
+
451
+ # Download to cache
452
+ with open(cache_zip_path, "wb") as cache_file:
453
+ shutil.copyfileobj(http_response, cache_file)
454
+
455
+ # Update ETag cache
456
+ if response_etag:
457
+ self.etag_cache.update_etag(
458
+ binance_zip_url, response_etag, int(content_length)
459
+ )
460
+ print(f" 📦 Cache MISS: Downloaded {zip_filename}")
461
+
462
+ # Extract CSV data from cached file
463
+ with zipfile.ZipFile(cache_zip_path, "r") as zip_file_handle:
464
+ expected_csv_filename = zip_filename.replace(".zip", ".csv")
465
+ if expected_csv_filename in zip_file_handle.namelist():
466
+ with zip_file_handle.open(expected_csv_filename) as extracted_csv_file:
467
+ csv_file_content = extracted_csv_file.read().decode("utf-8")
468
+ return list(csv.reader(csv_file_content.strip().split("\n")))
469
+ else:
470
+ print(f" ⚠️ CSV file not found in {zip_filename}")
471
+ return []
472
+
473
+ except Exception as download_exception:
474
+ print(f" ❌ Error downloading {zip_filename}: {download_exception}")
475
+
476
+ # Implement automatic fallback to daily files when monthly fails
477
+ print(f" 🔄 Attempting daily file fallback for {zip_filename}")
478
+ return self._fallback_to_daily_files(zip_filename)
479
+
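The caching strategy above boils down to an HTTP conditional GET. A minimal standalone sketch of that idea (illustrative; is_cached_copy_current is a hypothetical helper, not part of the package):

    import urllib.error
    import urllib.request

    def is_cached_copy_current(url: str, cached_etag: str) -> bool:
        """Return True if the server reports 304 Not Modified for the cached ETag."""
        request = urllib.request.Request(url)
        request.add_header("If-None-Match", cached_etag)
        try:
            with urllib.request.urlopen(request, timeout=60) as response:
                return response.status == 304  # some handlers pass 304 through
        except urllib.error.HTTPError as exc:
            return exc.code == 304  # urllib typically surfaces 304 as an HTTPError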
480
+ def _fallback_to_daily_files(self, failed_monthly_filename):
481
+ """
482
+ Fallback to daily file downloads when monthly file is not available.
483
+
484
+ Automatically downloads individual daily files for the failed month
485
+ and combines them into a single dataset for seamless operation.
486
+
487
+ Args:
488
+ failed_monthly_filename: The monthly filename that failed (e.g., "BTCUSDT-1d-2025-09.zip")
489
+
490
+ Returns:
491
+ List of combined daily data, or empty list if all daily files also fail
492
+ """
493
+ # Extract symbol, timeframe, and year-month from failed filename
494
+ # Format: "BTCUSDT-1d-2025-09.zip"
495
+ parts = failed_monthly_filename.replace(".zip", "").split("-")
496
+ if len(parts) < 4:
497
+ print(f" ❌ Cannot parse monthly filename: {failed_monthly_filename}")
498
+ return []
499
+
500
+ symbol = parts[0]
501
+ timeframe = parts[1]
502
+ year = parts[2]
503
+ month = parts[3]
504
+
505
+ print(f" 📅 Fallback: Downloading daily files for {symbol} {timeframe} {year}-{month}")
506
+
507
+ # Generate daily URLs for the entire month
508
+ daily_urls = self._generate_daily_urls_for_month(symbol, timeframe, year, month)
509
+
510
+ # Download all daily files for this month
511
+ combined_daily_data = []
512
+ successful_daily_downloads = 0
513
+
514
+ for daily_url, daily_filename in daily_urls:
515
+ daily_data = self._download_and_extract_daily_file(daily_url, daily_filename)
516
+ if daily_data:
517
+ combined_daily_data.extend(daily_data)
518
+ successful_daily_downloads += 1
519
+
520
+ if successful_daily_downloads > 0:
521
+ print(
522
+ f" ✅ Daily fallback successful: {successful_daily_downloads}/{len(daily_urls)} daily files retrieved"
523
+ )
524
+ return combined_daily_data
525
+ else:
526
+ print(f" ❌ Daily fallback failed: No daily files available for {year}-{month}")
527
+ return []
528
+
529
+ def _generate_daily_urls_for_month(self, symbol, timeframe, year, month):
530
+ """Generate daily URLs for all days in a specific month."""
531
+ from calendar import monthrange
532
+
533
+ # Get number of days in the month
534
+ year_int = int(year)
535
+ month_int = int(month)
536
+ _, days_in_month = monthrange(year_int, month_int)
537
+
538
+ daily_urls = []
539
+
540
+ # Use daily data URL pattern: https://data.binance.vision/data/spot/daily/klines/
541
+ daily_base_url = self.base_url.replace("/monthly/", "/daily/")
542
+
543
+ for day in range(1, days_in_month + 1):
544
+ date_str = f"{year}-{month_int:02d}-{day:02d}"
545
+ daily_filename = f"{symbol}-{timeframe}-{date_str}.zip"
546
+ daily_url = f"{daily_base_url}/{symbol}/{timeframe}/{daily_filename}"
547
+ daily_urls.append((daily_url, daily_filename))
548
+
549
+ return daily_urls
550
+
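Because the monthly base URL is rewritten to its daily counterpart, a single missing day resolves to a URL of the following shape (symbol and date are hypothetical):

    # Illustrative daily-file URL produced by the fallback path above:
    # https://data.binance.vision/data/spot/daily/klines/BTCUSDT/1h/BTCUSDT-1h-2024-09-05.zip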
551
+ def _download_and_extract_daily_file(self, daily_url, daily_filename):
552
+ """Download and extract a single daily ZIP file."""
553
+ try:
554
+ with tempfile.NamedTemporaryFile() as temporary_zip_file:
555
+ # Download daily ZIP file
556
+ with urllib.request.urlopen(daily_url, timeout=30) as http_response:
557
+ if http_response.status == 200:
558
+ shutil.copyfileobj(http_response, temporary_zip_file)
559
+ temporary_zip_file.flush()
560
+ else:
561
+ # Daily file not available (normal for future dates or weekends)
562
+ return []
563
+
564
+ # Extract CSV data from daily file
565
+ with zipfile.ZipFile(temporary_zip_file.name, "r") as zip_file_handle:
566
+ expected_csv_filename = daily_filename.replace(".zip", ".csv")
567
+ if expected_csv_filename in zip_file_handle.namelist():
568
+ with zip_file_handle.open(expected_csv_filename) as extracted_csv_file:
569
+ csv_file_content = extracted_csv_file.read().decode("utf-8")
570
+ return list(csv.reader(csv_file_content.strip().split("\n")))
571
+ else:
572
+ return []
573
+
574
+ except Exception:
575
+ # Silent failure for daily files - many days may not have data
576
+ return []
577
+
578
+ def _detect_header_intelligent(self, raw_csv_data):
579
+ """Intelligent header detection - determine if first row is data or header."""
580
+ if not raw_csv_data:
581
+ return False
582
+
583
+ first_csv_row = raw_csv_data[0]
584
+ if len(first_csv_row) < 6:
585
+ return False
586
+
587
+ # Header detection heuristics
588
+ try:
589
+ # Test if first field is numeric timestamp
590
+ first_field_value = int(first_csv_row[0])
591
+
592
+ # ✅ BOUNDARY FIX: Support both milliseconds (13-digit) AND microseconds (16-digit) formats
593
+ # Valid timestamp ranges:
594
+ # Milliseconds: 1000000000000 (2001) to 9999999999999 (2286)
595
+ # Microseconds: 1000000000000000 (2001) to 9999999999999999 (2286)
596
+ is_valid_millisecond_timestamp = 1000000000000 <= first_field_value <= 9999999999999
597
+ is_valid_microsecond_timestamp = (
598
+ 1000000000000000 <= first_field_value <= 9999999999999999
599
+ )
600
+
601
+ if is_valid_millisecond_timestamp or is_valid_microsecond_timestamp:
602
+ # Test if other fields are numeric (prices/volumes)
603
+ for ohlcv_field_index in [1, 2, 3, 4, 5]: # OHLCV fields
604
+ float(first_csv_row[ohlcv_field_index])
605
+ return False # All numeric = data row
606
+ else:
607
+ return True # Invalid timestamp = likely header
608
+
609
+ except (ValueError, IndexError):
610
+ # Non-numeric first field = header
611
+ return True
612
+
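A quick worked check of the digit-count heuristic above (illustrative values):

    from datetime import datetime, timezone

    ms_value = 1700000000000            # 13 digits -> interpreted as milliseconds
    us_value = 1700000000000000         # 16 digits -> interpreted as microseconds
    print(datetime.fromtimestamp(ms_value / 1000, timezone.utc))       # 2023-11-14 22:13:20+00:00
    print(datetime.fromtimestamp(us_value / 1_000_000, timezone.utc))  # same instant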
613
+ def process_raw_data(self, raw_csv_data):
614
+ """Convert raw Binance CSV data with comprehensive timestamp format tracking and transition detection."""
615
+ processed_candle_data = []
616
+ self.corruption_log = getattr(self, "corruption_log", [])
617
+
618
+ # Initialize timestamp format analyzer
619
+ format_analyzer = TimestampFormatAnalyzer()
620
+ format_analyzer.initialize_tracking()
621
+
622
+ # Intelligent header detection
623
+ csv_has_header = self._detect_header_intelligent(raw_csv_data)
624
+ data_start_row_index = 1 if csv_has_header else 0
625
+
626
+ # Store header detection results for metadata
627
+ self._header_detected = csv_has_header
628
+ self._header_content = raw_csv_data[0][:6] if csv_has_header else None
629
+ self._data_start_row = data_start_row_index
630
+
631
+ if csv_has_header:
632
+ print(f" 📋 Header detected: {raw_csv_data[0][:6]}")
633
+ else:
634
+ print(" 📊 Pure data format detected (no header)")
635
+
636
+ format_transition_logged = False
637
+
638
+ for csv_row_index, csv_row_data in enumerate(
639
+ raw_csv_data[data_start_row_index:], start=data_start_row_index
640
+ ):
641
+ if len(csv_row_data) >= 11: # Binance klines have 12 columns; the first 11 are used below
642
+ try:
643
+ # Binance format: [timestamp, open, high, low, close, volume, close_time, quote_volume, count, taker_buy_volume, taker_buy_quote_volume, ignore]
644
+ raw_timestamp_value = int(csv_row_data[0])
645
+
646
+ # Comprehensive format detection with transition tracking
647
+ (
648
+ detected_timestamp_format,
649
+ converted_timestamp_seconds,
650
+ format_validation_result,
651
+ ) = format_analyzer.analyze_timestamp_format(raw_timestamp_value, csv_row_index)
652
+
653
+ # Track format transitions and update statistics
654
+ if format_analyzer.current_format is None:
655
+ print(f" 🎯 Initial timestamp format: {detected_timestamp_format}")
656
+
657
+ transition_detected = format_analyzer.update_format_stats(
658
+ detected_timestamp_format, raw_timestamp_value, csv_row_index
659
+ )
660
+
661
+ if transition_detected and not format_transition_logged:
662
+ last_transition = format_analyzer.format_transitions[-1]
663
+ print(
664
+ f" 🔄 Format transition detected: {last_transition['from_format']} → {detected_timestamp_format}"
665
+ )
666
+ format_transition_logged = True
667
+
668
+ # Skip if validation failed
669
+ if not format_validation_result["valid"]:
670
+ self.corruption_log.append(format_validation_result["error_details"])
671
+ continue
672
+
673
+ # ✅ CRITICAL FIX: Use UTC to match Binance's native timezone
674
+ # Eliminates artificial DST gaps caused by local timezone conversion
675
+ utc_datetime = datetime.fromtimestamp(converted_timestamp_seconds, timezone.utc)
676
+
677
+ # ✅ BOUNDARY FIX: Don't filter per-monthly-file to preserve month boundaries
678
+ # Enhanced processing: capture all 11 essential Binance columns for complete microstructure analysis
679
+ processed_candle_row = [
680
+ utc_datetime.strftime("%Y-%m-%d %H:%M:%S"), # date (from open_time)
681
+ float(csv_row_data[1]), # open
682
+ float(csv_row_data[2]), # high
683
+ float(csv_row_data[3]), # low
684
+ float(csv_row_data[4]), # close
685
+ float(csv_row_data[5]), # volume (base asset volume)
686
+ # Additional microstructure columns for professional analysis
687
+ datetime.fromtimestamp(
688
+ int(csv_row_data[6])
689
+ / (1000000 if len(str(int(csv_row_data[6]))) >= 16 else 1000),
690
+ timezone.utc,
691
+ ).strftime("%Y-%m-%d %H:%M:%S"), # close_time
692
+ float(csv_row_data[7]), # quote_asset_volume
693
+ int(csv_row_data[8]), # number_of_trades
694
+ float(csv_row_data[9]), # taker_buy_base_asset_volume
695
+ float(csv_row_data[10]), # taker_buy_quote_asset_volume
696
+ ]
697
+ processed_candle_data.append(processed_candle_row)
698
+
699
+ except (ValueError, OSError, OverflowError) as parsing_exception:
700
+ format_analyzer.format_stats["unknown"]["count"] += 1
701
+ error_record = {
702
+ "row_index": csv_row_index,
703
+ "error_type": "timestamp_parse_error",
704
+ "error_message": str(parsing_exception),
705
+ "raw_row": csv_row_data[:10] if len(csv_row_data) > 10 else csv_row_data,
706
+ }
707
+ self.corruption_log.append(error_record)
708
+ format_analyzer.format_stats["unknown"]["errors"].append(error_record)
709
+ continue
710
+ else:
711
+ # Record insufficient columns
712
+ self.corruption_log.append(
713
+ {
714
+ "row_index": csv_row_index,
715
+ "error_type": "insufficient_columns",
716
+ "column_count": len(csv_row_data),
717
+ "raw_row": csv_row_data,
718
+ }
719
+ )
720
+
721
+ # Report comprehensive format analysis
722
+ format_analyzer.report_format_analysis()
723
+
724
+ # Store format analysis summary for metadata
725
+ self._format_analysis_summary = format_analyzer.get_format_analysis_summary()
726
+
727
+ return processed_candle_data
728
+
729
+ def collect_timeframe_data(self, trading_timeframe: str) -> Dict[str, Any]:
730
+ """Collect complete historical data for a single timeframe with full 11-column microstructure format.
731
+
732
+ Downloads and processes monthly ZIP files from Binance's public data repository
733
+ for the specified timeframe. Automatically handles data processing, validation,
734
+ and saves to CSV while returning a DataFrame for immediate use.
735
+
736
+ Args:
737
+ trading_timeframe (str): Timeframe for data collection.
738
+ Must be one of: "1m", "3m", "5m", "15m", "30m", "1h", "2h", "4h".
739
+
740
+ Returns:
741
+ dict: Collection results containing:
742
+ - dataframe (pd.DataFrame): Complete OHLCV data with 11 columns:
743
+ * date: Timestamp (open time)
744
+ * open, high, low, close: Price data
745
+ * volume: Base asset volume
746
+ * close_time: Timestamp (close time)
747
+ * quote_asset_volume: Quote asset volume
748
+ * number_of_trades: Trade count
749
+ * taker_buy_base_asset_volume: Taker buy base volume
750
+ * taker_buy_quote_asset_volume: Taker buy quote volume
751
+ - filepath (Path): Path to saved CSV file
752
+ - stats (dict): Collection statistics including duration and bar count
753
+
754
+ Raises:
755
+ ValueError: If trading_timeframe is not supported.
756
+ ConnectionError: If download from Binance repository fails.
757
+ FileNotFoundError: If output directory is invalid.
758
+
759
+ Examples:
760
+ >>> collector = BinancePublicDataCollector(symbol="BTCUSDT")
761
+ >>> result = collector.collect_timeframe_data("1h")
762
+ >>> df = result["dataframe"]
763
+ >>> print(f"Collected {len(df)} hourly bars")
764
+ >>> print(f"Date range: {df['date'].min()} to {df['date'].max()}")
765
+ Collected 26280 hourly bars
766
+ Date range: 2020-08-15 01:00:00 to 2025-03-20 23:00:00
767
+
768
+ >>> # Access microstructure data
769
+ >>> print(f"Total trades: {df['number_of_trades'].sum():,}")
770
+ >>> print(f"Average taker buy ratio: {df['taker_buy_base_asset_volume'].sum() / df['volume'].sum():.2%}")
771
+ Total trades: 15,234,567
772
+ Average taker buy ratio: 51.23%
773
+
774
+ Note:
775
+ This method processes data chronologically and may take several minutes
776
+ for large date ranges due to monthly ZIP file downloads. Progress is
777
+ displayed during collection.
778
+ """
779
+ print(f"\n{'=' * 60}")
780
+ print(f"COLLECTING {trading_timeframe.upper()} DATA FROM BINANCE PUBLIC REPOSITORY")
781
+ print(f"{'=' * 60}")
782
+
783
+ if trading_timeframe not in self.available_timeframes:
784
+ print(f"❌ Timeframe '{trading_timeframe}' not available")
785
+ print(f"📊 Available timeframes: {', '.join(self.available_timeframes)}")
786
+ print("💡 Use 'gapless-crypto-data --list-timeframes' for detailed descriptions")
787
+ return None
788
+
789
+ # Generate monthly URLs
790
+ monthly_zip_urls = self.generate_monthly_urls(trading_timeframe)
791
+ print(f"Monthly files to download: {len(monthly_zip_urls)}")
792
+
793
+ # Collect data from all months
794
+ combined_candle_data = []
795
+ successful_download_count = 0
796
+
797
+ for binance_zip_url, year_month_string, zip_filename in monthly_zip_urls:
798
+ raw_monthly_csv_data = self.download_and_extract_month(binance_zip_url, zip_filename)
799
+ if raw_monthly_csv_data:
800
+ processed_monthly_data = self.process_raw_data(raw_monthly_csv_data)
801
+ combined_candle_data.extend(processed_monthly_data)
802
+ successful_download_count += 1
803
+ print(f" ✅ {len(processed_monthly_data):,} bars from {year_month_string}")
804
+ else:
805
+ print(f" ⚠️ No data from {year_month_string}")
806
+
807
+ print("\nCollection Summary:")
808
+ print(f" Successful downloads: {successful_download_count}/{len(monthly_zip_urls)}")
809
+ print(f" Total bars collected: {len(combined_candle_data):,}")
810
+
811
+ # ETag cache statistics for observability
812
+ cache_stats = self.etag_cache.get_cache_stats()
813
+ if cache_stats["total_entries"] > 0:
814
+ total_cached_size_mb = cache_stats["total_cached_size"] / (1024 * 1024)
815
+ print(
816
+ f" ETag cache: {cache_stats['total_entries']} entries, {total_cached_size_mb:.1f} MB tracked"
817
+ )
818
+
819
+ if combined_candle_data:
820
+ # Sort by timestamp to ensure chronological order
821
+ combined_candle_data.sort(key=lambda candle_row: candle_row[0])
822
+ print(
823
+ f" Pre-filtering range: {combined_candle_data[0][0]} to {combined_candle_data[-1][0]}"
824
+ )
825
+
826
+ # ✅ BOUNDARY FIX: Apply final date range filtering after combining all monthly data
827
+ # This preserves month boundaries while respecting the requested date range
828
+ date_filtered_data = []
829
+ for candle_row in combined_candle_data:
830
+ candle_datetime = datetime.strptime(candle_row[0], "%Y-%m-%d %H:%M:%S")
831
+ if self.start_date <= candle_datetime <= self.end_date:
832
+ date_filtered_data.append(candle_row)
833
+
834
+ print(f" Post-filtering: {len(date_filtered_data):,} bars in requested range")
835
+ if date_filtered_data:
836
+ print(f" Final range: {date_filtered_data[0][0]} to {date_filtered_data[-1][0]}")
837
+
838
+ # Save to CSV and return DataFrame for seamless Python integration
839
+ if date_filtered_data:
840
+ # Calculate collection stats for metadata
841
+ collection_stats = {
842
+ "method": "direct_download",
843
+ "duration": 0.0, # Minimal for single timeframe
844
+ "bars_per_second": 0,
845
+ "total_bars": len(date_filtered_data),
846
+ }
847
+
848
+ # Save to CSV file (addresses the output_dir bug)
849
+ filepath = self.save_data(trading_timeframe, date_filtered_data, collection_stats)
850
+
851
+ # Convert to DataFrame for Python API users
852
+ columns = [
853
+ "date",
854
+ "open",
855
+ "high",
856
+ "low",
857
+ "close",
858
+ "volume",
859
+ "close_time",
860
+ "quote_asset_volume",
861
+ "number_of_trades",
862
+ "taker_buy_base_asset_volume",
863
+ "taker_buy_quote_asset_volume",
864
+ ]
865
+ df = pd.DataFrame(date_filtered_data, columns=columns)
866
+
867
+ # Convert numeric columns
868
+ numeric_cols = [
869
+ "open",
870
+ "high",
871
+ "low",
872
+ "close",
873
+ "volume",
874
+ "quote_asset_volume",
875
+ "number_of_trades",
876
+ "taker_buy_base_asset_volume",
877
+ "taker_buy_quote_asset_volume",
878
+ ]
879
+ for col in numeric_cols:
880
+ df[col] = pd.to_numeric(df[col], errors="coerce")
881
+
882
+ # Convert date columns to datetime
883
+ df["date"] = pd.to_datetime(df["date"])
884
+ df["close_time"] = pd.to_datetime(df["close_time"])
885
+
886
+ return {"dataframe": df, "filepath": filepath, "stats": collection_stats}
887
+
888
+ return {"dataframe": pd.DataFrame(), "filepath": None, "stats": {}}
889
+
890
+ # Save to CSV and return DataFrame for unfiltered data
891
+ if combined_candle_data:
892
+ # Calculate collection stats for metadata
893
+ collection_stats = {
894
+ "method": "direct_download",
895
+ "duration": 0.0, # Minimal for single timeframe
896
+ "bars_per_second": 0,
897
+ "total_bars": len(combined_candle_data),
898
+ }
899
+
900
+ # Save to CSV file (addresses the output_dir bug)
901
+ filepath = self.save_data(trading_timeframe, combined_candle_data, collection_stats)
902
+
903
+ # Convert to DataFrame for Python API users
904
+ columns = [
905
+ "date",
906
+ "open",
907
+ "high",
908
+ "low",
909
+ "close",
910
+ "volume",
911
+ "close_time",
912
+ "quote_asset_volume",
913
+ "number_of_trades",
914
+ "taker_buy_base_asset_volume",
915
+ "taker_buy_quote_asset_volume",
916
+ ]
917
+ df = pd.DataFrame(combined_candle_data, columns=columns)
918
+
919
+ # Convert numeric columns
920
+ numeric_cols = [
921
+ "open",
922
+ "high",
923
+ "low",
924
+ "close",
925
+ "volume",
926
+ "quote_asset_volume",
927
+ "number_of_trades",
928
+ "taker_buy_base_asset_volume",
929
+ "taker_buy_quote_asset_volume",
930
+ ]
931
+ for col in numeric_cols:
932
+ df[col] = pd.to_numeric(df[col], errors="coerce")
933
+
934
+ # Convert date columns to datetime
935
+ df["date"] = pd.to_datetime(df["date"])
936
+ df["close_time"] = pd.to_datetime(df["close_time"])
937
+
938
+ return {"dataframe": df, "filepath": filepath, "stats": collection_stats}
939
+
940
+ return {"dataframe": pd.DataFrame(), "filepath": None, "stats": {}}
941
+
942
+ def generate_metadata(
943
+ self, trading_timeframe, candle_data, collection_performance_stats, gap_analysis_result=None
944
+ ):
945
+ """Generate comprehensive metadata for 11-column microstructure format."""
946
+ if not candle_data:
947
+ return {}
948
+
949
+ # Calculate statistics
950
+ price_values = []
951
+ volume_values = []
952
+ for candle_row in candle_data:
953
+ price_values.extend([candle_row[2], candle_row[3]]) # high, low
954
+ volume_values.append(candle_row[5])
955
+
956
+ return {
957
+ "version": "v2.10.0",
958
+ "generator": "BinancePublicDataCollector",
959
+ "generation_timestamp": datetime.now(timezone.utc).isoformat() + "Z",
960
+ "data_source": "Binance Public Data Repository",
961
+ "data_source_url": self.base_url,
962
+ "market_type": "spot",
963
+ "symbol": self.symbol,
964
+ "timeframe": trading_timeframe,
965
+ "collection_method": "direct_download",
966
+ "target_period": {
967
+ "start": self.start_date.isoformat(),
968
+ "end": self.end_date.isoformat(),
969
+ "total_days": (self.end_date - self.start_date).days,
970
+ },
971
+ "actual_bars": len(candle_data),
972
+ "date_range": {
973
+ "start": candle_data[0][0] if candle_data else None,
974
+ "end": candle_data[-1][0] if candle_data else None,
975
+ },
976
+ "statistics": {
977
+ "price_min": min(price_values) if price_values else 0,
978
+ "price_max": max(price_values) if price_values else 0,
979
+ "volume_total": sum(volume_values) if volume_values else 0,
980
+ "volume_mean": sum(volume_values) / len(volume_values) if volume_values else 0,
981
+ },
982
+ "collection_performance": collection_performance_stats,
983
+ "data_integrity": {
984
+ "chronological_order": True,
985
+ "data_hash": self._calculate_data_hash(candle_data),
986
+ "corruption_detected": len(getattr(self, "corruption_log", [])) > 0,
987
+ "corrupted_rows_count": len(getattr(self, "corruption_log", [])),
988
+ "corruption_details": getattr(self, "corruption_log", []),
989
+ "header_detection": {
990
+ "header_found": getattr(self, "_header_detected", False),
991
+ "header_content": getattr(self, "_header_content", None),
992
+ "data_start_row": getattr(self, "_data_start_row", 0),
993
+ },
994
+ },
995
+ "timestamp_format_analysis": getattr(
996
+ self,
997
+ "_format_analysis_summary",
998
+ {
999
+ "total_rows_analyzed": 0,
1000
+ "formats_detected": {},
1001
+ "transitions_detected": 0,
1002
+ "transition_details": [],
1003
+ "primary_format": "unknown",
1004
+ "format_consistency": True,
1005
+ "analysis_note": "Format analysis not available - may be legacy collection",
1006
+ },
1007
+ ),
1008
+ "enhanced_microstructure_format": {
1009
+ "format_version": "v2.10.0",
1010
+ "total_columns": len(candle_data[0]) if candle_data else 11,
1011
+ "enhanced_features": [
1012
+ "quote_asset_volume",
1013
+ "number_of_trades",
1014
+ "taker_buy_base_asset_volume",
1015
+ "taker_buy_quote_asset_volume",
1016
+ "close_time",
1017
+ ],
1018
+ "analysis_capabilities": [
1019
+ "order_flow_analysis",
1020
+ "liquidity_metrics",
1021
+ "market_microstructure",
1022
+ "trade_weighted_prices",
1023
+ "institutional_data_patterns",
1024
+ ],
1025
+ "professional_features": True,
1026
+ "api_format_compatibility": True,
1027
+ },
1028
+ "gap_analysis": gap_analysis_result
1029
+ or {
1030
+ "analysis_performed": False,
1031
+ "total_gaps_detected": 0,
1032
+ "gaps_filled": 0,
1033
+ "gaps_remaining": 0,
1034
+ "gap_details": [],
1035
+ "gap_filling_method": "authentic_binance_api",
1036
+ "data_completeness_score": 1.0,
1037
+ "note": "Gap analysis can be performed using UniversalGapFiller.detect_all_gaps()",
1038
+ },
1039
+ "compliance": {
1040
+ "zero_magic_numbers": True,
1041
+ "temporal_integrity": True,
1042
+ "authentic_spot_data_only": True,
1043
+ "official_binance_source": True,
1044
+ "binance_format_transition_aware": True,
1045
+ "supports_milliseconds_microseconds": True,
1046
+ "full_binance_microstructure_format": True,
1047
+ "professional_trading_ready": True,
1048
+ },
1049
+ }
1050
+
1051
+ def _perform_gap_analysis(self, data, timeframe):
1052
+ """Perform gap analysis on collected data and return detailed results."""
1053
+ if not data or len(data) < 2:
1054
+ return {
1055
+ "analysis_performed": True,
1056
+ "total_gaps_detected": 0,
1057
+ "gaps_filled": 0,
1058
+ "gaps_remaining": 0,
1059
+ "gap_details": [],
1060
+ "gap_filling_method": "authentic_binance_api",
1061
+ "data_completeness_score": 1.0,
1062
+ "note": "Insufficient data for gap analysis (< 2 rows)",
1063
+ }
1064
+
1065
+ # Calculate expected interval in minutes using centralized constants
1066
+ interval_minutes = TIMEFRAME_TO_MINUTES.get(timeframe, 60)
1067
+ expected_gap_minutes = interval_minutes
1068
+
1069
+ # Analyze timestamp gaps
1070
+ gaps_detected = []
1071
+ total_bars_expected = 0
1072
+
1073
+ for i in range(1, len(data)):
1074
+ current_time = datetime.strptime(data[i][0], "%Y-%m-%d %H:%M:%S")
1075
+ previous_time = datetime.strptime(data[i - 1][0], "%Y-%m-%d %H:%M:%S")
1076
+
1077
+ actual_gap_minutes = (current_time - previous_time).total_seconds() / 60
1078
+
1079
+ if actual_gap_minutes > expected_gap_minutes * 1.5: # Allow 50% tolerance
1080
+ missing_bars = int(actual_gap_minutes / expected_gap_minutes) - 1
1081
+ if missing_bars > 0:
1082
+ gaps_detected.append(
1083
+ {
1084
+ "gap_start": data[i - 1][0],
1085
+ "gap_end": data[i][0],
1086
+ "missing_bars": missing_bars,
1087
+ "duration_minutes": actual_gap_minutes - expected_gap_minutes,
1088
+ }
1089
+ )
1090
+ total_bars_expected += missing_bars
1091
+
1092
+ # Calculate completeness score
1093
+ total_bars_collected = len(data)
1094
+ total_bars_should_exist = total_bars_collected + total_bars_expected
1095
+ completeness_score = (
1096
+ total_bars_collected / total_bars_should_exist if total_bars_should_exist > 0 else 1.0
1097
+ )
1098
+
1099
+ return {
1100
+ "analysis_performed": True,
1101
+ "total_gaps_detected": len(gaps_detected),
1102
+ "gaps_filled": 0, # Will be updated during gap filling process
1103
+ "gaps_remaining": len(gaps_detected),
1104
+ "gap_details": gaps_detected[:10], # Limit to first 10 gaps for metadata size
1105
+ "total_missing_bars": total_bars_expected,
1106
+ "gap_filling_method": "authentic_binance_api",
1107
+ "data_completeness_score": round(completeness_score, 4),
1108
+ "analysis_timestamp": datetime.now(timezone.utc).isoformat() + "Z",
1109
+ "analysis_parameters": {
1110
+ "timeframe": timeframe,
1111
+ "expected_interval_minutes": expected_gap_minutes,
1112
+ "tolerance_factor": 1.5,
1113
+ },
1114
+ }
1115
+
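A worked example of the gap arithmetic above for a 1h series (numbers are illustrative):

    expected_gap_minutes = 60
    actual_gap_minutes = 180                  # e.g. 10:00 -> 13:00, so 11:00 and 12:00 are missing
    exceeds_tolerance = actual_gap_minutes > expected_gap_minutes * 1.5   # True (50% tolerance)
    missing_bars = int(actual_gap_minutes / expected_gap_minutes) - 1     # 2
    completeness = 8758 / (8758 + missing_bars)                           # ~0.9998 for ~1 year of 1h bars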
1116
+ def _calculate_data_hash(self, data):
1117
+ """Calculate hash of data for integrity verification."""
1118
+ data_string = "\n".join(",".join(map(str, row)) for row in data)
1119
+ return hashlib.sha256(data_string.encode()).hexdigest()
1120
+
1121
+ def save_data(self, timeframe: str, data: List[List], collection_stats: Dict[str, Any]) -> Path:
1122
+ """Save data to file with format determined by output_format (CSV or Parquet)."""
1123
+ if not data:
1124
+ print(f"❌ No data to save for {timeframe}")
1125
+ return None
1126
+
1127
+ # Generate filename with appropriate extension
1128
+ start_date_str = self.start_date.strftime("%Y%m%d")
1129
+ end_date_str = datetime.strptime(data[-1][0], "%Y-%m-%d %H:%M:%S").strftime("%Y%m%d")
1130
+ version = "v2.10.0" # Updated version for Parquet support
1131
+ file_extension = self.output_format
1132
+ filename = f"binance_spot_{self.symbol}-{timeframe}_{start_date_str}-{end_date_str}_{version}.{file_extension}"
1133
+ filepath = self.output_dir / filename
1134
+
1135
+ # Ensure output directory exists
1136
+ self.output_dir.mkdir(parents=True, exist_ok=True)
1137
+
1138
+ # Perform gap analysis on collected data
1139
+ gap_analysis = self._perform_gap_analysis(data, timeframe)
1140
+
1141
+ # Generate metadata with gap analysis results
1142
+ metadata = self.generate_metadata(timeframe, data, collection_stats, gap_analysis)
1143
+
1144
+ # Convert data to DataFrame for both formats
1145
+ df = pd.DataFrame(
1146
+ data,
1147
+ columns=[
1148
+ "date",
1149
+ "open",
1150
+ "high",
1151
+ "low",
1152
+ "close",
1153
+ "volume",
1154
+ "close_time",
1155
+ "quote_asset_volume",
1156
+ "number_of_trades",
1157
+ "taker_buy_base_asset_volume",
1158
+ "taker_buy_quote_asset_volume",
1159
+ ],
1160
+ )
1161
+
1162
+ # Convert date column to datetime
1163
+ df["date"] = pd.to_datetime(df["date"])
1164
+
1165
+ if self.output_format == "parquet":
1166
+ # Save as Parquet with metadata
1167
+ df.to_parquet(filepath, engine="pyarrow", compression="snappy", index=False)
1168
+ print(f"📊 Saved {len(df):,} bars to {filepath.name} (Parquet format)")
1169
+ else:
1170
+ # Save as CSV with metadata headers (existing logic)
1171
+ with open(filepath, "w", newline="") as f:
1172
+ # Write metadata headers
1173
+ f.write(f"# Binance Spot Market Data {metadata['version']}\n")
1174
+ f.write(f"# Generated: {metadata['generation_timestamp']}\n")
1175
+ f.write(f"# Source: {metadata['data_source']}\n")
1176
+ f.write(
1177
+ f"# Market: {metadata['market_type'].upper()} | Symbol: {metadata['symbol']} | Timeframe: {metadata['timeframe']}\n"
1178
+ )
1179
+ f.write(f"# Coverage: {metadata['actual_bars']:,} bars\n")
1180
+ f.write(
1181
+ f"# Period: {metadata['date_range']['start']} to {metadata['date_range']['end']}\n"
1182
+ )
1183
+ f.write(
1184
+ f"# Collection: {collection_stats['method']} in {collection_stats['duration']:.1f}s\n"
1185
+ )
1186
+ f.write(f"# Data Hash: {metadata['data_integrity']['data_hash'][:16]}...\n")
1187
+ f.write(
1188
+ "# Compliance: Zero-Magic-Numbers, Temporal-Integrity, Official-Binance-Source\n"
1189
+ )
1190
+ f.write("#\n")
1191
+
1192
+ # Write CSV data
1193
+ df.to_csv(f, index=False)
1194
+ print(f"📊 Saved {len(df):,} bars to {filepath.name} (CSV format)")
1195
+
1196
+ # Save metadata as JSON
1197
+ metadata_filepath = filepath.with_suffix(".metadata.json")
1198
+ with open(metadata_filepath, "w") as f:
1199
+ json.dump(metadata, f, indent=2)
1200
+
1201
+ file_size_mb = filepath.stat().st_size / (1024 * 1024)
1202
+ print(f"\n✅ Created: {filepath.name} ({file_size_mb:.1f} MB)")
1203
+ print(f"✅ Metadata: {metadata_filepath.name}")
1204
+
1205
+ return filepath
1206
+
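For reference, the naming convention produced above looks like the following (symbol and dates are hypothetical; the end date comes from the last collected bar, not necessarily the requested end_date):

    # binance_spot_BTCUSDT-1h_20240101-20241231_v2.10.0.csv            <- data file
    # binance_spot_BTCUSDT-1h_20240101-20241231_v2.10.0.metadata.json  <- sibling metadata (via with_suffix)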
1207
+ def collect_multiple_timeframes(
1208
+ self, timeframes: Optional[List[str]] = None
1209
+ ) -> Dict[str, Dict[str, Any]]:
1210
+ """Collect data for multiple timeframes with comprehensive progress tracking.
1211
+
1212
+ Efficiently collects historical data across multiple timeframes in sequence,
1213
+ providing a complete dataset for multi-timeframe analysis. Each timeframe
1214
+ is processed independently with full validation and progress reporting.
1215
+
1216
+ Args:
1217
+ timeframes (list, optional): List of timeframes to collect.
1218
+ Each must be one of: "1m", "3m", "5m", "15m", "30m", "1h", "2h", "4h".
1219
+ If None, defaults to ["1m", "3m", "5m", "15m", "30m", "1h", "2h"].
1220
+
1221
+ Returns:
1222
+ dict: Collection results by timeframe, where each key is a timeframe string
1223
+ and each value is a dict containing:
1224
+ - dataframe (pd.DataFrame): Complete OHLCV data with 11 columns
1225
+ - filepath (Path): Path to saved CSV file
1226
+ - stats (dict): Collection statistics
1227
+
1228
+ Raises:
1229
+ ValueError: If any timeframe in the list is not supported.
1230
+ ConnectionError: If download from Binance repository fails.
1231
+
1232
+ Examples:
1233
+ Default comprehensive collection:
1234
+
1235
+ >>> collector = BinancePublicDataCollector(symbol="ETHUSDT")
1236
+ >>> results = collector.collect_multiple_timeframes()
1237
+ >>> for timeframe, result in results.items():
1238
+ ... df = result["dataframe"]
1239
+ ... print(f"{timeframe}: {len(df):,} bars saved to {result['filepath'].name}")
1240
+ 1m: 1,574,400 bars saved to ETHUSDT_1m_2020-08-15_to_2025-03-20.csv
1241
+ 3m: 524,800 bars saved to ETHUSDT_3m_2020-08-15_to_2025-03-20.csv
1242
+
1243
+ Custom timeframes for specific analysis:
1244
+
1245
+ >>> collector = BinancePublicDataCollector(symbol="BTCUSDT")
1246
+ >>> results = collector.collect_multiple_timeframes(["1h", "4h"])
1247
+ >>> hourly_df = results["1h"]["dataframe"]
1248
+ >>> four_hour_df = results["4h"]["dataframe"]
1249
+ >>> print(f"Hourly data: {len(hourly_df)} bars")
1250
+ >>> print(f"4-hour data: {len(four_hour_df)} bars")
1251
+ Hourly data: 26,280 bars
1252
+ 4-hour data: 6,570 bars
1253
+
1254
+ Access collection statistics:
1255
+
1256
+ >>> results = collector.collect_multiple_timeframes(["1h"])
1257
+ >>> stats = results["1h"]["stats"]
1258
+ >>> print(f"Collection took {stats['duration']:.1f} seconds")
1259
+ >>> print(f"Processing rate: {stats['bars_per_second']:,.0f} bars/sec")
1260
+ Collection took 45.2 seconds
1261
+ Processing rate: 582 bars/sec
1262
+
1263
+ Note:
1264
+ Processing time scales with the number of timeframes and date range.
1265
+ Progress is reported in real time via simple console output.
1266
+ All timeframes are collected sequentially to avoid overwhelming
1267
+ Binance's public data servers.
1268
+ """
1269
+ if timeframes is None:
1270
+ timeframes = ["1m", "3m", "5m", "15m", "30m", "1h", "2h"]
1271
+
1272
+ print("\n🚀 BINANCE PUBLIC DATA ULTRA-FAST COLLECTION")
1273
+ print(f"Timeframes: {timeframes}")
1274
+ print("=" * 80)
1275
+
1276
+ results = {}
1277
+ overall_start = datetime.now()
1278
+
1279
+ for i, timeframe in enumerate(timeframes):
1280
+ print(f"Processing {timeframe} ({i + 1}/{len(timeframes)})...")
1281
+
1282
+ result = self.collect_timeframe_data(timeframe)
1283
+
1284
+ if result and result.get("filepath"):
1285
+ filepath = result["filepath"]
1286
+ results[timeframe] = result
1287
+ file_size_mb = filepath.stat().st_size / (1024 * 1024)
1288
+ print(f"✅ {timeframe}: {filepath.name} ({file_size_mb:.1f} MB)")
1289
+ else:
1290
+ print(f"❌ Failed to collect {timeframe} data")
1291
+
1292
+ overall_duration = (datetime.now() - overall_start).total_seconds()
1293
+
1294
+ print("\n" + "=" * 80)
1295
+ print("🎉 ULTRA-FAST COLLECTION COMPLETE")
1296
+ print(
1297
+ f"⏱️ Total time: {overall_duration:.1f} seconds ({overall_duration / 60:.1f} minutes)"
1298
+ )
1299
+ print(f"📊 Generated {len(results)} files")
1300
+
1301
+ return results
1302
+
1303
+ async def collect_timeframe_data_concurrent(self, trading_timeframe: str) -> Dict[str, Any]:
1304
+ """
1305
+ Collect data using high-performance concurrent hybrid strategy.
1306
+
1307
+ This method uses the ConcurrentCollectionOrchestrator to achieve 10-15x faster
1308
+ data collection through parallel downloads of monthly and daily ZIP files.
1309
+
1310
+ Args:
1311
+ trading_timeframe (str): Timeframe for data collection.
1312
+ Must be one of: "1m", "3m", "5m", "15m", "30m", "1h", "2h", "4h".
1313
+
1314
+ Returns:
1315
+ dict: Collection results containing:
1316
+ - dataframe (pd.DataFrame): Complete OHLCV data with 11 columns
1317
+ - filepath (Path): Path to saved CSV file
1318
+ - stats (dict): Collection statistics including performance metrics
1319
+ - collection_method (str): "concurrent_hybrid"
1320
+
1321
+ Examples:
1322
+ >>> collector = BinancePublicDataCollector(symbol="BTCUSDT")
1323
+ >>> result = await collector.collect_timeframe_data_concurrent("1h")
1324
+ >>> df = result["dataframe"]
1325
+ >>> print(f"Collected {len(df)} bars in {result['stats']['collection_time']:.1f}s")
1326
+ >>> print(f"Performance: {result['stats']['bars_per_second']:.0f} bars/sec")
1327
+ Collected 8760 bars in 12.3s
1328
+ Performance: 712 bars/sec
1329
+
1330
+ Note:
1331
+ This is the recommended high-performance method for new applications.
1332
+ Falls back to synchronous method if async context is not available.
1333
+ """
1334
+ from .concurrent_collection_orchestrator import ConcurrentCollectionOrchestrator
1335
+
1336
+ print(f"\n{'=' * 60}")
1337
+ print(f"CONCURRENT COLLECTION: {trading_timeframe.upper()} DATA")
1338
+ print(f"Strategy: Hybrid Monthly+Daily with {13} Concurrent Downloads")
1339
+ print(f"{'=' * 60}")
1340
+
1341
+ if trading_timeframe not in self.available_timeframes:
1342
+ print(f"❌ Timeframe '{trading_timeframe}' not available")
1343
+ print(f"📊 Available timeframes: {', '.join(self.available_timeframes)}")
1344
+ return {"dataframe": pd.DataFrame(), "filepath": None, "stats": {}}
1345
+
1346
+ try:
1347
+ # Initialize concurrent orchestrator
1348
+ orchestrator = ConcurrentCollectionOrchestrator(
1349
+ symbol=self.symbol,
1350
+ start_date=self.start_date,
1351
+ end_date=self.end_date,
1352
+ output_dir=self.output_dir,
1353
+ max_concurrent=13,
1354
+ )
1355
+
1356
+ async with orchestrator:
1357
+ # Execute concurrent collection
1358
+ collection_result = await orchestrator.collect_timeframe_concurrent(
1359
+ trading_timeframe, progress_callback=self._progress_callback
1360
+ )
1361
+
1362
+ if not collection_result.success or not collection_result.processed_data:
1363
+ print(f"❌ Concurrent collection failed for {trading_timeframe}")
1364
+ if collection_result.errors:
1365
+ for error in collection_result.errors:
1366
+ print(f" Error: {error}")
1367
+ return {"dataframe": pd.DataFrame(), "filepath": None, "stats": {}}
1368
+
1369
+ # Process data using existing methods
1370
+ processed_data = collection_result.processed_data
1371
+
1372
+ # Calculate performance stats
1373
+ bars_per_second = (
1374
+ collection_result.total_bars / collection_result.collection_time
1375
+ if collection_result.collection_time > 0
1376
+ else 0
1377
+ )
1378
+
1379
+ collection_stats = {
1380
+ "method": "concurrent_hybrid",
1381
+ "duration": collection_result.collection_time,
1382
+ "bars_per_second": bars_per_second,
1383
+ "total_bars": collection_result.total_bars,
1384
+ "successful_downloads": collection_result.successful_downloads,
1385
+ "failed_downloads": collection_result.failed_downloads,
1386
+ "data_source_breakdown": collection_result.data_source_breakdown,
1387
+ "concurrent_downloads": 13,
1388
+ "strategy": "monthly_historical_daily_recent",
1389
+ }
1390
+
1391
+ # Save to CSV using existing method
1392
+ filepath = self.save_data(trading_timeframe, processed_data, collection_stats)
1393
+
1394
+ # Convert to DataFrame
1395
+ columns = [
1396
+ "date",
1397
+ "open",
1398
+ "high",
1399
+ "low",
1400
+ "close",
1401
+ "volume",
1402
+ "close_time",
1403
+ "quote_asset_volume",
1404
+ "number_of_trades",
1405
+ "taker_buy_base_asset_volume",
1406
+ "taker_buy_quote_asset_volume",
1407
+ ]
1408
+ df = pd.DataFrame(processed_data, columns=columns)
1409
+
1410
+ # Convert numeric columns
1411
+ numeric_cols = [
1412
+ "open",
1413
+ "high",
1414
+ "low",
1415
+ "close",
1416
+ "volume",
1417
+ "quote_asset_volume",
1418
+ "number_of_trades",
1419
+ "taker_buy_base_asset_volume",
1420
+ "taker_buy_quote_asset_volume",
1421
+ ]
1422
+ for col in numeric_cols:
1423
+ df[col] = pd.to_numeric(df[col], errors="coerce")
1424
+
1425
+ # Convert date columns
1426
+ df["date"] = pd.to_datetime(df["date"])
1427
+ df["close_time"] = pd.to_datetime(df["close_time"])
1428
+
1429
+ print("\n✅ CONCURRENT COLLECTION SUCCESS")
1430
+ print(f"📊 Collected: {len(df):,} bars")
1431
+ print(f"⚡ Performance: {bars_per_second:.0f} bars/sec")
1432
+ print(
1433
+ f"🚀 Speed: {collection_result.collection_time:.1f}s vs ~{collection_result.collection_time * 10:.0f}s sequential"
1434
+ )
1435
+ print(
1436
+ f"📁 Sources: {collection_result.data_source_breakdown['monthly']} monthly + {collection_result.data_source_breakdown['daily']} daily"
1437
+ )
1438
+
1439
+ return {
1440
+ "dataframe": df,
1441
+ "filepath": filepath,
1442
+ "stats": collection_stats,
1443
+ "collection_method": "concurrent_hybrid",
1444
+ }
1445
+
1446
+ except Exception as e:
1447
+ print(f"❌ Concurrent collection failed: {e}")
1448
+ print("⏮️ Falling back to synchronous method...")
1449
+ # Fallback to synchronous method
1450
+ return self.collect_timeframe_data(trading_timeframe)
1451
+
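Because collect_timeframe_data_concurrent is a coroutine, callers outside an event loop need asyncio.run; a hedged sketch follows. The 'total_bars', 'duration', and 'bars_per_second' keys are the ones assembled in collection_stats above (.get guards the empty-stats error path).

import asyncio

from gapless_crypto_clickhouse.collectors.binance_public_data_collector import (
    BinancePublicDataCollector,
)


async def collect_one_hour() -> None:
    collector = BinancePublicDataCollector(symbol="BTCUSDT")
    result = await collector.collect_timeframe_data_concurrent("1h")
    stats = result["stats"]
    # Keys come from the collection_stats dict built in the method above.
    print(f"{stats.get('total_bars', 0)} bars in {stats.get('duration', 0.0):.1f}s "
          f"({stats.get('bars_per_second', 0.0):.0f} bars/sec)")


asyncio.run(collect_one_hour())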
1452
+ async def collect_multiple_timeframes_concurrent(
1453
+ self, timeframes: Optional[List[str]] = None
1454
+ ) -> Dict[str, Path]:
1455
+ """
1456
+ Collect multiple timeframes using concurrent hybrid strategy.
1457
+
1458
+ High-performance collection across multiple timeframes with optimal
1459
+ resource utilization and parallel processing.
1460
+
1461
+ Args:
1462
+ timeframes (list, optional): List of timeframes to collect.
1463
+ If None, defaults to ["1m", "3m", "5m", "15m", "30m", "1h", "2h"].
1464
+
1465
+ Returns:
1466
+ dict: Mapping of each successfully collected timeframe to the Path of its saved CSV file.
1467
+
1468
+ Examples:
1469
+ >>> collector = BinancePublicDataCollector(symbol="ETHUSDT")
1470
+ >>> results = await collector.collect_multiple_timeframes_concurrent(["1h", "4h"])
1471
+ >>> for timeframe, filepath in results.items():
1472
+ ... stats = result["stats"]
1473
+ ... print(f"{timeframe}: {stats['total_bars']} bars in {stats['duration']:.1f}s")
1474
+ 1h: ETHUSDT-1h-data.csv (1.2 MB)
1475
+ 4h: ETHUSDT-4h-data.csv (0.3 MB)
1476
+
1477
+ Note:
1478
+ This method processes timeframes sequentially to avoid overwhelming
1479
+ servers, but each timeframe uses full concurrent downloading.
1480
+ """
1481
+ from .concurrent_collection_orchestrator import ConcurrentCollectionOrchestrator
1482
+
1483
+ if timeframes is None:
1484
+ timeframes = ["1m", "3m", "5m", "15m", "30m", "1h", "2h"]
1485
+
1486
+ print("\n🚀 CONCURRENT MULTI-TIMEFRAME COLLECTION")
1487
+ print(f"Strategy: Hybrid Monthly+Daily with {13} Concurrent Downloads")
1488
+ print(f"Timeframes: {timeframes}")
1489
+ print("=" * 80)
1490
+
1491
+ results = {}
1492
+ overall_start = datetime.now()
1493
+
1494
+ try:
1495
+ # Initialize a shared orchestrator context (each collect_timeframe_data_concurrent call below also constructs its own orchestrator internally)
1496
+ orchestrator = ConcurrentCollectionOrchestrator(
1497
+ symbol=self.symbol,
1498
+ start_date=self.start_date,
1499
+ end_date=self.end_date,
1500
+ output_dir=self.output_dir,
1501
+ max_concurrent=13,
1502
+ )
1503
+
1504
+ async with orchestrator:
1505
+ # Process each timeframe with concurrent downloads
1506
+ for i, timeframe in enumerate(timeframes):
1507
+ print(f"\n📊 Processing {timeframe} ({i + 1}/{len(timeframes)})...")
1508
+
1509
+ result = await self.collect_timeframe_data_concurrent(timeframe)
1510
+
1511
+ if result and result.get("filepath"):
1512
+ filepath = result["filepath"]
1513
+ results[timeframe] = filepath
1514
+ file_size_mb = filepath.stat().st_size / (1024 * 1024)
1515
+ bars_per_sec = result["stats"]["bars_per_second"]
1516
+ print(
1517
+ f"✅ {timeframe}: {filepath.name} ({file_size_mb:.1f} MB, {bars_per_sec:.0f} bars/sec)"
1518
+ )
1519
+ else:
1520
+ print(f"❌ Failed to collect {timeframe} data")
1521
+
1522
+ except Exception as e:
1523
+ print(f"❌ Concurrent collection failed: {e}")
1524
+ print("⏮️ Falling back to synchronous method...")
1525
+ # Fallback to synchronous method
1526
+ return self.collect_multiple_timeframes(timeframes)
1527
+
1528
+ overall_duration = (datetime.now() - overall_start).total_seconds()
1529
+
1530
+ print("\n" + "=" * 80)
1531
+ print("🎉 CONCURRENT MULTI-TIMEFRAME COLLECTION COMPLETE")
1532
+ print(
1533
+ f"⏱️ Total time: {overall_duration:.1f} seconds ({overall_duration / 60:.1f} minutes)"
1534
+ )
1535
+ print(f"📊 Generated {len(results)} datasets")
1536
+ print("🚀 Average speedup: ~10-15x faster than sequential downloads")
1537
+
1538
+ return results
1539
+
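The multi-timeframe variant follows the same async pattern; a sketch assuming the return value is the timeframe -> CSV path mapping built in the loop above (the fallback path returns the same shape from the synchronous method).

import asyncio

from gapless_crypto_clickhouse.collectors.binance_public_data_collector import (
    BinancePublicDataCollector,
)


async def collect_batch() -> dict:
    collector = BinancePublicDataCollector(symbol="ETHUSDT")
    # Sequential over timeframes, concurrent within each one (see the Note above).
    return await collector.collect_multiple_timeframes_concurrent(["1h", "2h"])


files = asyncio.run(collect_batch())
print(f"Collected {len(files)} timeframes")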
1540
+ def _progress_callback(self, completed: int, total: int, current_task):
1541
+ """Progress callback for concurrent downloads."""
1542
+ if completed % 5 == 0 or completed == total: # Report every 5 downloads or at completion
1543
+ percentage = (completed / total) * 100
1544
+ source_type = current_task.source_type.value
1545
+ print(
1546
+ f" 📥 Progress: {completed}/{total} ({percentage:.1f}%) - {source_type}: {current_task.filename}"
1547
+ )
1548
+
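The callback shape is fixed by how the orchestrator invokes it above: positional (completed, total, current_task), where current_task is assumed to expose .source_type.value and .filename. A standalone callback with the same shape, purely illustrative since the public methods above hardwire self._progress_callback:

def quiet_progress(completed: int, total: int, current_task) -> None:
    # Same signature as _progress_callback above; only report the final download.
    if completed == total:
        print(f"done: {total} files, last was "
              f"{current_task.source_type.value}:{current_task.filename}")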
1549
+ def validate_csv_file(
1550
+ self, csv_filepath: Union[str, Path], expected_timeframe: Optional[str] = None
1551
+ ) -> Dict[str, Any]:
1552
+ """
1553
+ Comprehensive validation of CSV file data integrity, completeness, and quality.
1554
+
1555
+ Delegates to CSVValidator for multi-layer validation including structure checking,
1556
+ datetime sequence validation, OHLCV quality analysis, coverage calculation, and
1557
+ statistical anomaly detection.
1558
+
1559
+ Args:
1560
+ csv_filepath: Path to CSV file to validate
1561
+ expected_timeframe: Expected timeframe (e.g., '30m') for interval validation
1562
+
1563
+ Returns:
1564
+ dict: Validation results with detailed analysis
1565
+
1566
+ Note:
1567
+ This method delegates to the validation.csv_validator.CSVValidator class
1568
+ for complete validation logic. See CSVValidator for implementation details.
1569
+ """
1570
+ validator = CSVValidator()
1571
+ return validator.validate_csv_file(csv_filepath, expected_timeframe)
1572
+
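A hedged sketch of standalone validation. The filename follows the '<SYMBOL>-<timeframe>-data.csv' pattern used as an example by _extract_timeframe_from_filename below, and the result keys shown are the ones consumed by update_metadata_with_validation and the CLI summary helpers.

from pathlib import Path

from gapless_crypto_clickhouse.collectors.binance_public_data_collector import (
    BinancePublicDataCollector,
)

collector = BinancePublicDataCollector(symbol="SOLUSDT")
report = collector.validate_csv_file(Path("SOLUSDT-15m-data.csv"), expected_timeframe="15m")

# Keys used elsewhere in this module: validation_summary, total_errors, total_warnings.
print(report["validation_summary"])
print(f"{report['total_errors']} errors, {report['total_warnings']} warnings")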
1573
+ def update_metadata_with_validation(self, csv_filepath, validation_results):
1574
+ """Update metadata JSON file with validation results."""
1575
+ metadata_filepath = csv_filepath.with_suffix(".metadata.json")
1576
+
1577
+ if metadata_filepath.exists():
1578
+ with open(metadata_filepath, "r") as f:
1579
+ metadata = json.load(f)
1580
+ else:
1581
+ metadata = {}
1582
+
1583
+ # Add validation results to metadata
1584
+ metadata["validation"] = validation_results
1585
+
1586
+ # Update compliance status based on validation
1587
+ compliance = metadata.get("compliance", {})
1588
+ if validation_results["total_errors"] == 0:
1589
+ compliance["data_validation_passed"] = True
1590
+ compliance["validation_summary"] = validation_results["validation_summary"]
1591
+ else:
1592
+ compliance["data_validation_passed"] = False
1593
+ compliance["validation_summary"] = validation_results["validation_summary"]
1594
+ compliance["validation_errors"] = validation_results["total_errors"]
1595
+ compliance["validation_warnings"] = validation_results["total_warnings"]
1596
+
1597
+ metadata["compliance"] = compliance
1598
+
1599
+ # Save updated metadata with JSON serialization fix
1600
+ def convert_numpy_types(obj):
1601
+ """Convert numpy types to Python native types for JSON serialization."""
1602
+ if hasattr(obj, "item"):
1603
+ return obj.item()
1604
+ elif isinstance(obj, dict):
1605
+ return {key: convert_numpy_types(value) for key, value in obj.items()}
1606
+ elif isinstance(obj, list):
1607
+ return [convert_numpy_types(item) for item in obj]
1608
+ else:
1609
+ return obj
1610
+
1611
+ with open(metadata_filepath, "w") as f:
1612
+ json.dump(convert_numpy_types(metadata), f, indent=2)
1613
+
1614
+ print(f"✅ Updated metadata: {metadata_filepath.name}")
1615
+
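The nested convert_numpy_types helper above exists because numpy scalars are not JSON-serializable; a standalone sketch of the same idea (to_native and the sample payload are illustrative names, not part of this module):

import json

import numpy as np


def to_native(obj):
    # Mirrors convert_numpy_types above: numpy scalars expose .item(),
    # dicts and lists are walked recursively, everything else passes through.
    if hasattr(obj, "item"):
        return obj.item()
    if isinstance(obj, dict):
        return {key: to_native(value) for key, value in obj.items()}
    if isinstance(obj, list):
        return [to_native(item) for item in obj]
    return obj


payload = {"rows": np.int64(8760), "coverage": np.float64(99.97), "flags": [np.bool_(True)]}
print(json.dumps(to_native(payload)))  # serializes cleanly, no TypeError from numpy scalars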
1616
+ def apply_gap_filling_to_validated_files(self):
1617
+ """Apply comprehensive gap filling to validated data files using authentic Binance API data"""
1618
+
1619
+ try:
1620
+ print("\n🔧 INTEGRATED GAP FILLING SYSTEM")
1621
+ print("Primary Source: Binance REST API (Authentic Data Only)")
1622
+ print("=" * 60)
1623
+
1624
+ # Initialize gap filling components
1625
+ gap_filler = UniversalGapFiller()
1626
+
1627
+ # Find CSV files to check for gaps
1628
+ csv_files = list(Path(self.output_dir).glob("*.csv"))
1629
+
1630
+ if not csv_files:
1631
+ print("❌ No CSV files found for gap filling")
1632
+ return
1633
+
1634
+ # Filter to only files for this symbol
1635
+ symbol_files = [f for f in csv_files if self.symbol in f.name]
1636
+
1637
+ if not symbol_files:
1638
+ print(f"❌ No CSV files found for symbol {self.symbol}")
1639
+ return
1640
+
1641
+ print(f"🔍 Analyzing {len(symbol_files)} files for gaps...")
1642
+
1643
+ total_gaps_detected = 0
1644
+ total_gaps_filled = 0
1645
+ total_gaps_failed = 0
1646
+ files_processed = 0
1647
+ results = []
1648
+
1649
+ for csv_file in symbol_files:
1650
+ print(f"\n📁 Processing: {csv_file.name}")
1651
+
1652
+ # Extract timeframe from filename
1653
+ file_timeframe = self._extract_timeframe_from_filename(csv_file.name)
1654
+ print(f" 📊 Detected timeframe: {file_timeframe}")
1655
+
1656
+ # Use the proper UniversalGapFiller process_file method
1657
+ result = gap_filler.process_file(csv_file, file_timeframe)
1658
+ results.append(result)
1659
+ files_processed += 1
1660
+
1661
+ # Update totals
1662
+ total_gaps_detected += result["gaps_detected"]
1663
+ total_gaps_filled += result["gaps_filled"]
1664
+ total_gaps_failed += result["gaps_failed"]
1665
+
1666
+ # Report per-file results
1667
+ if result["gaps_detected"] == 0:
1668
+ print(f" ✅ No gaps found in {file_timeframe}")
1669
+ else:
1670
+ success_rate = result["success_rate"]
1671
+ status = "✅" if success_rate == 100.0 else "⚠️" if success_rate > 0 else "❌"
1672
+ print(
1673
+ f" {status} {result['gaps_filled']}/{result['gaps_detected']} gaps filled ({success_rate:.1f}%)"
1674
+ )
1675
+
1676
+ # Comprehensive summary
1677
+ print("\n" + "=" * 60)
1678
+ print("📊 GAP FILLING SUMMARY")
1679
+ print("=" * 60)
1680
+
1681
+ for result in results:
1682
+ if result["gaps_detected"] > 0:
1683
+ status = (
1684
+ "✅"
1685
+ if result["success_rate"] == 100.0
1686
+ else "⚠️"
1687
+ if result["success_rate"] > 0
1688
+ else "❌"
1689
+ )
1690
+ print(
1691
+ f"{status} {result['timeframe']:>3}: {result['gaps_filled']:>2}/{result['gaps_detected']:>2} gaps filled ({result['success_rate']:>5.1f}%)"
1692
+ )
1693
+
1694
+ print("-" * 60)
1695
+ overall_success = (
1696
+ (total_gaps_filled / total_gaps_detected * 100)
1697
+ if total_gaps_detected > 0
1698
+ else 100.0
1699
+ )
1700
+ print(
1701
+ f"🎯 OVERALL: {total_gaps_filled}/{total_gaps_detected} gaps filled ({overall_success:.1f}%)"
1702
+ )
1703
+
1704
+ if overall_success == 100.0:
1705
+ print("🎉 ALL GAPS FILLED SUCCESSFULLY!")
1706
+ print("✅ Datasets are now 100% gapless and ready for production use")
1707
+ else:
1708
+ print(
1709
+ f"⚠️ {total_gaps_failed} gaps failed to fill (may be legitimate exchange outages)"
1710
+ )
1711
+ print("📋 Review failed gaps to confirm they are legitimate market closures")
1712
+
1713
+ print(f"\nFiles processed: {files_processed}")
1714
+ print("Data source: Authentic Binance REST API")
1715
+ print("Gap filling protocol: API-first validation (no synthetic data)")
1716
+
1717
+ except Exception as e:
1718
+ print(f"❌ Gap filling error: {e}")
1719
+ print("⚠️ Continuing without gap filling...")
1720
+ import traceback
1721
+
1722
+ traceback.print_exc()
1723
+
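A per-file sketch of the same gap-filling flow. It assumes UniversalGapFiller is importable from the gap_filling subpackage (this module uses it directly above) and that process_file returns the keys consumed in the loop above: gaps_detected, gaps_filled, gaps_failed, success_rate.

from pathlib import Path

from gapless_crypto_clickhouse.gap_filling.universal_gap_filler import UniversalGapFiller

gap_filler = UniversalGapFiller()
csv_file = Path("SOLUSDT-15m-data.csv")  # filename pattern per the helper below

result = gap_filler.process_file(csv_file, "15m")
print(f"{result['gaps_filled']}/{result['gaps_detected']} gaps filled "
      f"({result['success_rate']:.1f}%), {result['gaps_failed']} failed")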
1724
+ def _extract_timeframe_from_filename(self, filename):
1725
+ """Extract timeframe from filename (e.g., 'SOLUSDT-15m-data.csv' -> '15m')"""
1726
+ for tf in [
1727
+ "1s",
1728
+ "1m",
1729
+ "3m",
1730
+ "5m",
1731
+ "15m",
1732
+ "30m",
1733
+ "1h",
1734
+ "2h",
1735
+ "4h",
1736
+ "6h",
1737
+ "8h",
1738
+ "12h",
1739
+ "1d",
1740
+ "3d",
1741
+ "1w",
1742
+ "1mo",
1743
+ ]:
1744
+ if f"-{tf}_" in filename or f"-{tf}-" in filename:
1745
+ return tf
1746
+ return "15m" # Default
1747
+
1748
+
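A quick illustration of the filename parsing above (a private helper, shown here only for clarity); the second call falls through to the "15m" default.

from gapless_crypto_clickhouse.collectors.binance_public_data_collector import (
    BinancePublicDataCollector,
)

collector = BinancePublicDataCollector(symbol="SOLUSDT")
print(collector._extract_timeframe_from_filename("SOLUSDT-1h-data.csv"))  # -> "1h"
print(collector._extract_timeframe_from_filename("SOLUSDT-export.csv"))   # -> "15m" (default)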
1749
+ def _setup_argument_parser() -> argparse.ArgumentParser:
1750
+ """Create and configure CLI argument parser.
1751
+
1752
+ Returns:
1753
+ Configured ArgumentParser with all CLI options
1754
+ """
1755
+ parser = argparse.ArgumentParser(
1756
+ description="Ultra-fast Binance spot data collector with validation"
1757
+ )
1758
+ parser.add_argument(
1759
+ "--symbol", default="SOLUSDT", help="Trading pair symbol (default: SOLUSDT)"
1760
+ )
1761
+ parser.add_argument(
1762
+ "--timeframes",
1763
+ default="1m,3m,5m,15m,30m,1h,2h",
1764
+ help="Comma-separated timeframes (default: 1m,3m,5m,15m,30m,1h,2h)",
1765
+ )
1766
+ parser.add_argument(
1767
+ "--start", default="2020-08-15", help="Start date YYYY-MM-DD (default: 2020-08-15)"
1768
+ )
1769
+ parser.add_argument(
1770
+ "--end", default="2025-03-20", help="End date YYYY-MM-DD (default: 2025-03-20)"
1771
+ )
1772
+ parser.add_argument(
1773
+ "--validate-only",
1774
+ action="store_true",
1775
+ help="Only validate existing CSV files, do not collect new data",
1776
+ )
1777
+ parser.add_argument(
1778
+ "--validate-files", nargs="+", help="Specific CSV files to validate (with --validate-only)"
1779
+ )
1780
+ parser.add_argument(
1781
+ "--no-validation",
1782
+ action="store_true",
1783
+ help="Skip validation after collection (not recommended)",
1784
+ )
1785
+ return parser
1786
+
1787
+
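A sketch of exercising the parser directly (handy for tests); the flags are exactly the ones registered above, and parse_args accepts an explicit argv list instead of reading sys.argv. Run inside this module, or import _setup_argument_parser from the collectors package.

parser = _setup_argument_parser()
args = parser.parse_args(["--symbol", "ETHUSDT", "--timeframes", "1h,4h", "--validate-only"])

print(args.symbol)                             # ETHUSDT
print(args.timeframes.split(","))              # ['1h', '4h']
print(args.validate_only, args.no_validation)  # True False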
1788
+ def _discover_files_to_validate(args, collector) -> List[Path]:
1789
+ """Discover CSV files to validate based on arguments.
1790
+
1791
+ Args:
1792
+ args: Parsed command line arguments
1793
+ collector: BinancePublicDataCollector instance
1794
+
1795
+ Returns:
1796
+ List of Path objects for files to validate
1797
+ """
1798
+ if args.validate_files:
1799
+ return [Path(f) for f in args.validate_files]
1800
+ else:
1801
+ pattern = f"*{args.symbol}*.csv"
1802
+ return list(collector.output_dir.glob(pattern))
1803
+
1804
+
1805
+ def _validate_files(collector, files_to_validate: List[Path]) -> List[Dict]:
1806
+ """Validate list of CSV files.
1807
+
1808
+ Args:
1809
+ collector: BinancePublicDataCollector instance
1810
+ files_to_validate: List of file paths to validate
1811
+
1812
+ Returns:
1813
+ List of validation summary dictionaries
1814
+ """
1815
+ validation_summary = []
1816
+ for csv_file in files_to_validate:
1817
+ # Extract timeframe from filename
1818
+ timeframe = None
1819
+ for tf in ["1m", "3m", "5m", "15m", "30m", "1h", "2h", "4h", "1d"]:
1820
+ if f"-{tf}_" in csv_file.name:
1821
+ timeframe = tf
1822
+ break
1823
+
1824
+ # Validate file
1825
+ validation_result = collector.validate_csv_file(csv_file, timeframe)
1826
+ collector.update_metadata_with_validation(csv_file, validation_result)
1827
+
1828
+ validation_summary.append(
1829
+ {
1830
+ "file": csv_file.name,
1831
+ "status": validation_result["validation_summary"],
1832
+ "errors": validation_result["total_errors"],
1833
+ "warnings": validation_result["total_warnings"],
1834
+ }
1835
+ )
1836
+
1837
+ return validation_summary
1838
+
1839
+
1840
+ def _print_validation_summary(validation_summary: List[Dict]) -> int:
1841
+ """Print validation summary and return exit code.
1842
+
1843
+ Args:
1844
+ validation_summary: List of validation result dictionaries
1845
+
1846
+ Returns:
1847
+ Exit code (0 for success, 1 for failures)
1848
+ """
1849
+ print("\n" + "=" * 80)
1850
+ print("VALIDATION SUMMARY")
1851
+ print("=" * 80)
1852
+
1853
+ perfect_files = 0
1854
+ good_files = 0
1855
+ failed_files = 0
1856
+
1857
+ for summary in validation_summary:
1858
+ if summary["errors"] == 0:
1859
+ if summary["warnings"] == 0:
1860
+ status_icon = "✅"
1861
+ perfect_files += 1
1862
+ else:
1863
+ status_icon = "⚠️ "
1864
+ good_files += 1
1865
+ else:
1866
+ status_icon = "❌"
1867
+ failed_files += 1
1868
+
1869
+ print(f"{status_icon} {summary['file']}: {summary['status']}")
1870
+ if summary["errors"] > 0 or summary["warnings"] > 0:
1871
+ print(f" └─ {summary['errors']} errors, {summary['warnings']} warnings")
1872
+
1873
+ print("\nOVERALL RESULTS:")
1874
+ print(f" ✅ Perfect: {perfect_files} files")
1875
+ print(f" ⚠️ Good: {good_files} files")
1876
+ print(f" ❌ Failed: {failed_files} files")
1877
+
1878
+ if failed_files == 0:
1879
+ print("\n🎉 ALL VALIDATIONS PASSED!")
1880
+ return 0
1881
+ else:
1882
+ print(f"\n⚠️ {failed_files} files failed validation")
1883
+ return 1
1884
+
1885
+
1886
+ def _run_validation_only_mode(args, collector) -> int:
1887
+ """Execute validation-only mode workflow.
1888
+
1889
+ Args:
1890
+ args: Parsed command line arguments
1891
+ collector: BinancePublicDataCollector instance
1892
+
1893
+ Returns:
1894
+ Exit code (0 for success, 1 for failures)
1895
+ """
1896
+ print("🔍 VALIDATION-ONLY MODE")
1897
+
1898
+ files_to_validate = _discover_files_to_validate(args, collector)
1899
+
1900
+ if not files_to_validate:
1901
+ print("❌ No CSV files found to validate")
1902
+ return 1
1903
+
1904
+ print(f"Found {len(files_to_validate)} files to validate:")
1905
+ for file_path in files_to_validate:
1906
+ print(f" 📄 {file_path.name}")
1907
+
1908
+ validation_summary = _validate_files(collector, files_to_validate)
1909
+ return _print_validation_summary(validation_summary)
1910
+
1911
+
1912
+ def _auto_validate_collected_files(collector, results: Dict) -> bool:
1913
+ """Perform auto-validation on collected files.
1914
+
1915
+ Args:
1916
+ collector: BinancePublicDataCollector instance
1917
+ results: Collection results dictionary
1918
+
1919
+ Returns:
1920
+ True if all validations passed, False otherwise
1921
+ """
1922
+ print("\n🔍 AUTO-VALIDATION AFTER COLLECTION")
1923
+ validation_passed = 0
1924
+ validation_failed = 0
1925
+
1926
+ for timeframe, csv_file in results.items():
1927
+ validation_result = collector.validate_csv_file(csv_file, timeframe)
1928
+ collector.update_metadata_with_validation(csv_file, validation_result)
1929
+
1930
+ if validation_result["total_errors"] == 0:
1931
+ validation_passed += 1
1932
+ else:
1933
+ validation_failed += 1
1934
+
1935
+ print(f"\nVALIDATION RESULTS: {validation_passed} passed, {validation_failed} failed")
1936
+
1937
+ if validation_failed == 0:
1938
+ print("🎉 ALL FILES VALIDATED SUCCESSFULLY!")
1939
+ print("Ready for ML training, backtesting, and production use")
1940
+ collector.apply_gap_filling_to_validated_files()
1941
+ return True
1942
+ else:
1943
+ print("⚠️ Some files failed validation - check errors above")
1944
+ return False
1945
+
1946
+
1947
+ def _run_collection_mode(args, collector) -> int:
1948
+ """Execute data collection mode workflow.
1949
+
1950
+ Args:
1951
+ args: Parsed command line arguments
1952
+ collector: BinancePublicDataCollector instance
1953
+
1954
+ Returns:
1955
+ Exit code (0 for success, 1 for failure)
1956
+ """
1957
+ timeframes = [tf.strip() for tf in args.timeframes.split(",")]
1958
+ print(f"Collecting timeframes: {timeframes}")
1959
+
1960
+ results = collector.collect_multiple_timeframes(timeframes)
1961
+
1962
+ if not results:
1963
+ print("❌ Collection failed")
1964
+ return 1
1965
+
1966
+ print(f"\n🚀 ULTRA-FAST COLLECTION SUCCESS: Generated {len(results)} datasets")
1967
+
1968
+ if not args.no_validation:
1969
+ _auto_validate_collected_files(collector, results)
1970
+
1971
+ return 0
1972
+
1973
+
1974
+ def main():
1975
+ """Main execution function with CLI argument support."""
1976
+ parser = _setup_argument_parser()
1977
+ args = parser.parse_args()
1978
+
1979
+ print("Binance Public Data Ultra-Fast Collector with Validation")
1980
+ print("Official Binance data repository - 10-100x faster than API")
1981
+ print("=" * 80)
1982
+
1983
+ collector = BinancePublicDataCollector(
1984
+ symbol=args.symbol, start_date=args.start, end_date=args.end
1985
+ )
1986
+
1987
+ if args.validate_only:
1988
+ return _run_validation_only_mode(args, collector)
1989
+ else:
1990
+ return _run_collection_mode(args, collector)
1991
+
1992
+
1993
+ if __name__ == "__main__":
1994
+ raise SystemExit(main())
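A hedged sketch of driving the CLI entry point programmatically (the module path is assumed from the package layout); patching sys.argv before calling main() exercises the argparse flags registered above without spawning a subprocess.

import sys

from gapless_crypto_clickhouse.collectors import binance_public_data_collector as collector_cli

# Validate existing SOLUSDT CSVs in the default output directory without collecting new data.
sys.argv = ["binance_public_data_collector", "--symbol", "SOLUSDT", "--validate-only"]
exit_code = collector_cli.main()
print("exit code:", exit_code)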