gapless-crypto-clickhouse 7.1.0 (py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gapless_crypto_clickhouse/__init__.py +147 -0
- gapless_crypto_clickhouse/__probe__.py +349 -0
- gapless_crypto_clickhouse/api.py +1032 -0
- gapless_crypto_clickhouse/clickhouse/__init__.py +17 -0
- gapless_crypto_clickhouse/clickhouse/config.py +119 -0
- gapless_crypto_clickhouse/clickhouse/connection.py +269 -0
- gapless_crypto_clickhouse/clickhouse/schema.sql +98 -0
- gapless_crypto_clickhouse/clickhouse/schema_validator.py +312 -0
- gapless_crypto_clickhouse/clickhouse_query.py +642 -0
- gapless_crypto_clickhouse/collectors/__init__.py +21 -0
- gapless_crypto_clickhouse/collectors/binance_public_data_collector.py +1994 -0
- gapless_crypto_clickhouse/collectors/clickhouse_bulk_loader.py +446 -0
- gapless_crypto_clickhouse/collectors/concurrent_collection_orchestrator.py +407 -0
- gapless_crypto_clickhouse/collectors/csv_format_detector.py +123 -0
- gapless_crypto_clickhouse/collectors/httpx_downloader.py +395 -0
- gapless_crypto_clickhouse/collectors/hybrid_url_generator.py +316 -0
- gapless_crypto_clickhouse/exceptions.py +145 -0
- gapless_crypto_clickhouse/gap_filling/__init__.py +1 -0
- gapless_crypto_clickhouse/gap_filling/safe_file_operations.py +439 -0
- gapless_crypto_clickhouse/gap_filling/universal_gap_filler.py +757 -0
- gapless_crypto_clickhouse/llms.txt +268 -0
- gapless_crypto_clickhouse/probe.py +235 -0
- gapless_crypto_clickhouse/py.typed +0 -0
- gapless_crypto_clickhouse/query_api.py +374 -0
- gapless_crypto_clickhouse/resume/__init__.py +12 -0
- gapless_crypto_clickhouse/resume/intelligent_checkpointing.py +383 -0
- gapless_crypto_clickhouse/utils/__init__.py +29 -0
- gapless_crypto_clickhouse/utils/error_handling.py +202 -0
- gapless_crypto_clickhouse/utils/etag_cache.py +194 -0
- gapless_crypto_clickhouse/utils/timeframe_constants.py +90 -0
- gapless_crypto_clickhouse/utils/timestamp_format_analyzer.py +256 -0
- gapless_crypto_clickhouse/utils/timestamp_utils.py +130 -0
- gapless_crypto_clickhouse/validation/__init__.py +36 -0
- gapless_crypto_clickhouse/validation/csv_validator.py +677 -0
- gapless_crypto_clickhouse/validation/models.py +220 -0
- gapless_crypto_clickhouse/validation/storage.py +502 -0
- gapless_crypto_clickhouse-7.1.0.dist-info/METADATA +1277 -0
- gapless_crypto_clickhouse-7.1.0.dist-info/RECORD +40 -0
- gapless_crypto_clickhouse-7.1.0.dist-info/WHEEL +4 -0
- gapless_crypto_clickhouse-7.1.0.dist-info/licenses/LICENSE +21 -0
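
The module below exposes both a function-based and a class-based collection workflow. The following is a minimal usage sketch assembled from the usage examples embedded in the docstrings of binance_public_data_collector.py; it assumes only the calls shown there (fetch_data, BinancePublicDataCollector, collect_timeframe_data) and is illustrative rather than a verified reference for the 7.1.0 API surface.

import gapless_crypto_clickhouse as gcd
from gapless_crypto_clickhouse.collectors.binance_public_data_collector import (
    BinancePublicDataCollector,
)

# Function-based API: one-shot fetch into a DataFrame (per the class docstring)
df = gcd.fetch_data("BTCUSDT", "1h", start="2024-01-01", end="2024-12-31")

# Class-based API: custom date range and output directory, single timeframe
collector = BinancePublicDataCollector(
    symbol="BTCUSDT",
    start_date="2023-01-01",
    end_date="2023-12-31",
    output_dir="./crypto_data",
)
result = collector.collect_timeframe_data("1h")
print(f"Collected {len(result['dataframe'])} bars, saved to {result['filepath']}")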
|
@@ -0,0 +1,1994 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Binance Public Data Collector
|
|
4
|
+
|
|
5
|
+
Ultra-fast historical data collection using Binance's official public data repository.
|
|
6
|
+
10-100x faster than API calls, with complete historical coverage.
|
|
7
|
+
|
|
8
|
+
Data source: https://data.binance.vision/data/spot/monthly/klines/
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
import argparse
|
|
12
|
+
import csv
|
|
13
|
+
import hashlib
|
|
14
|
+
import json
|
|
15
|
+
import shutil
|
|
16
|
+
import tempfile
|
|
17
|
+
import urllib.request
|
|
18
|
+
import warnings
|
|
19
|
+
import zipfile
|
|
20
|
+
from datetime import datetime, timedelta, timezone
|
|
21
|
+
from pathlib import Path
|
|
22
|
+
from typing import Any, Dict, List, Optional, Tuple, Union
|
|
23
|
+
|
|
24
|
+
import pandas as pd
|
|
25
|
+
|
|
26
|
+
from ..gap_filling.universal_gap_filler import UniversalGapFiller
|
|
27
|
+
from ..utils.etag_cache import ETagCache
|
|
28
|
+
from ..utils.timeframe_constants import TIMEFRAME_TO_MINUTES
|
|
29
|
+
from ..utils.timestamp_format_analyzer import TimestampFormatAnalyzer
|
|
30
|
+
from ..validation.csv_validator import CSVValidator
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class BinancePublicDataCollector:
|
|
34
|
+
"""Ultra-fast cryptocurrency spot data collection from Binance's public data repository.
|
|
35
|
+
|
|
36
|
+
This collector provides 10-100x faster data collection compared to API calls by
|
|
37
|
+
downloading pre-generated monthly ZIP files from Binance's official public data repository.
|
|
38
|
+
Supports complete historical coverage with full 11-column microstructure format including
|
|
39
|
+
order flow metrics.
|
|
40
|
+
|
|
41
|
+
Features:
|
|
42
|
+
- Ultra-fast bulk data collection from monthly ZIP archives
|
|
43
|
+
- Complete historical coverage from 2017 onwards
|
|
44
|
+
- Full 11-column microstructure format with order flow data
|
|
45
|
+
- Automatic gap detection and filling capabilities
|
|
46
|
+
- Built-in data validation and integrity checks
|
|
47
|
+
- Support for all major timeframes (1m, 3m, 5m, 15m, 30m, 1h, 2h, 4h)
|
|
48
|
+
- DataFrame-first Python API with seamless pandas integration
|
|
49
|
+
|
|
50
|
+
Data Format:
|
|
51
|
+
The collector outputs CSV files with 11 columns providing complete market microstructure:
|
|
52
|
+
- OHLCV: Open, High, Low, Close, Volume
|
|
53
|
+
- Timestamps: Open Time, Close Time
|
|
54
|
+
- Order Flow: Quote Asset Volume, Number of Trades
|
|
55
|
+
- Taker Metrics: Taker Buy Base Volume, Taker Buy Quote Volume
|
|
56
|
+
|
|
57
|
+
Examples:
|
|
58
|
+
For simple data collection, consider using the function-based API:
|
|
59
|
+
|
|
60
|
+
>>> import gapless_crypto_clickhouse as gcd
|
|
61
|
+
>>> df = gcd.fetch_data("BTCUSDT", "1h", start="2024-01-01", end="2024-12-31")
|
|
62
|
+
|
|
63
|
+
Advanced usage with this class for complex workflows:
|
|
64
|
+
|
|
65
|
+
>>> collector = BinancePublicDataCollector()
|
|
66
|
+
>>> result = collector.collect_timeframe_data("1h")
|
|
67
|
+
>>> df = result["dataframe"]
|
|
68
|
+
>>> print(f"Collected {len(df)} bars of {collector.symbol} data")
|
|
69
|
+
Collected 26280 bars of SOLUSDT data
|
|
70
|
+
|
|
71
|
+
Custom configuration and multiple timeframes:
|
|
72
|
+
|
|
73
|
+
>>> collector = BinancePublicDataCollector(
|
|
74
|
+
... symbol="BTCUSDT",
|
|
75
|
+
... start_date="2023-01-01",
|
|
76
|
+
... end_date="2023-12-31",
|
|
77
|
+
... output_dir="./crypto_data"
|
|
78
|
+
... )
|
|
79
|
+
>>> results = collector.collect_multiple_timeframes(["1h", "4h"])
|
|
80
|
+
>>> for timeframe, result in results.items():
|
|
81
|
+
... print(f"{timeframe}: {len(result['dataframe'])} bars")
|
|
82
|
+
1h: 8760 bars
|
|
83
|
+
4h: 2190 bars
|
|
84
|
+
|
|
85
|
+
Note:
|
|
86
|
+
This collector supports 713 USDT perpetual symbols for both spot and futures-um markets.
|
|
87
|
+
Symbol validation is handled in the API layer via get_supported_symbols().
|
|
88
|
+
See ADR-0022 for spot/futures alignment rationale.
|
|
89
|
+
"""
|
|
90
|
+
|
|
91
|
+
def _validate_symbol(self, symbol: str) -> str:
|
|
92
|
+
"""
|
|
93
|
+
Validate and sanitize symbol input for security.
|
|
94
|
+
|
|
95
|
+
This method prevents path traversal attacks and ensures symbol format integrity
|
|
96
|
+
by rejecting invalid characters and malformed inputs.
|
|
97
|
+
|
|
98
|
+
Args:
|
|
99
|
+
symbol: Trading pair symbol to validate (e.g., "BTCUSDT", "SOLUSDT")
|
|
100
|
+
|
|
101
|
+
Returns:
|
|
102
|
+
Validated and normalized symbol string (uppercase, stripped)
|
|
103
|
+
|
|
104
|
+
Raises:
|
|
105
|
+
ValueError: If symbol is None, empty, or contains invalid characters
|
|
106
|
+
|
|
107
|
+
Security:
|
|
108
|
+
- Prevents path traversal attacks (CWE-22)
|
|
109
|
+
- Blocks directory navigation characters (/, \\, ., ..)
|
|
110
|
+
- Enforces alphanumeric-only input
|
|
111
|
+
- Protects file operations using symbol in paths
|
|
112
|
+
|
|
113
|
+
Examples:
|
|
114
|
+
>>> collector._validate_symbol("btcusdt")
|
|
115
|
+
'BTCUSDT'
|
|
116
|
+
|
|
117
|
+
>>> collector._validate_symbol("BTC/../etc/passwd")
|
|
118
|
+
ValueError: Symbol contains invalid characters...
|
|
119
|
+
|
|
120
|
+
>>> collector._validate_symbol("")
|
|
121
|
+
ValueError: Symbol cannot be empty
|
|
122
|
+
|
|
123
|
+
>>> collector._validate_symbol(None)
|
|
124
|
+
ValueError: Symbol cannot be None
|
|
125
|
+
"""
|
|
126
|
+
# SEC-03: None value validation
|
|
127
|
+
if symbol is None:
|
|
128
|
+
raise ValueError("Symbol cannot be None")
|
|
129
|
+
|
|
130
|
+
# SEC-02: Empty string validation
|
|
131
|
+
if not symbol or not symbol.strip():
|
|
132
|
+
raise ValueError("Symbol cannot be empty")
|
|
133
|
+
|
|
134
|
+
# SEC-01: Path traversal prevention
|
|
135
|
+
import re
|
|
136
|
+
|
|
137
|
+
if re.search(r"[./\\]", symbol):
|
|
138
|
+
raise ValueError(
|
|
139
|
+
f"Symbol contains invalid characters: {symbol}\n"
|
|
140
|
+
f"Symbol must be alphanumeric (e.g., BTCUSDT, SOLUSDT)"
|
|
141
|
+
)
|
|
142
|
+
|
|
143
|
+
# Normalize to uppercase and strip whitespace
|
|
144
|
+
symbol = symbol.upper().strip()
|
|
145
|
+
|
|
146
|
+
# Whitelist validation - only alphanumeric characters
|
|
147
|
+
if not re.match(r"^[A-Z0-9]+$", symbol):
|
|
148
|
+
raise ValueError(
|
|
149
|
+
f"Symbol must be alphanumeric: {symbol}\nValid examples: BTCUSDT, ETHUSDT, SOLUSDT"
|
|
150
|
+
)
|
|
151
|
+
|
|
152
|
+
return symbol
|
|
153
|
+
|
|
154
|
+
# ADR-0021: URL constants for spot and futures
|
|
155
|
+
SPOT_BASE_URL = "https://data.binance.vision/data/spot"
|
|
156
|
+
FUTURES_BASE_URL = "https://data.binance.vision/data/futures/um"
|
|
157
|
+
|
|
158
|
+
def __init__(
|
|
159
|
+
self,
|
|
160
|
+
symbol: str = "SOLUSDT",
|
|
161
|
+
start_date: str = "2020-08-15",
|
|
162
|
+
end_date: str = "2025-03-20",
|
|
163
|
+
output_dir: Optional[Union[str, Path]] = None,
|
|
164
|
+
output_format: str = "csv",
|
|
165
|
+
instrument_type: str = "spot", # ADR-0021: UM futures support
|
|
166
|
+
) -> None:
|
|
167
|
+
"""Initialize the Binance Public Data Collector.
|
|
168
|
+
|
|
169
|
+
Args:
|
|
170
|
+
symbol (str, optional): Trading pair symbol in USDT format.
|
|
171
|
+
Must be alphanumeric (A-Z, 0-9) only. Path characters (/, \\, .)
|
|
172
|
+
and special characters are rejected for security.
|
|
173
|
+
Symbol is normalized to uppercase.
|
|
174
|
+
Defaults to "SOLUSDT".
|
|
175
|
+
start_date (str, optional): Start date in YYYY-MM-DD format.
|
|
176
|
+
Data collection begins from this date (inclusive).
|
|
177
|
+
Must be on or before end_date.
|
|
178
|
+
Defaults to "2020-08-15".
|
|
179
|
+
end_date (str, optional): End date in YYYY-MM-DD format.
|
|
180
|
+
Data collection ends on this date (inclusive, 23:59:59).
|
|
181
|
+
Must be on or after start_date.
|
|
182
|
+
Defaults to "2025-03-20".
|
|
183
|
+
output_dir (str or Path, optional): Directory to save files.
|
|
184
|
+
If None, saves to package's sample_data directory.
|
|
185
|
+
Defaults to None.
|
|
186
|
+
output_format (str, optional): Output format ("csv" or "parquet").
|
|
187
|
+
CSV provides universal compatibility, Parquet offers 5-10x compression.
|
|
188
|
+
Defaults to "csv".
|
|
189
|
+
instrument_type (str, optional): Instrument type - "spot" or "futures-um".
|
|
190
|
+
Both types support 713 perpetual symbols. Defaults to "spot".
|
|
191
|
+
|
|
192
|
+
Raises:
|
|
193
|
+
ValueError: If instrument_type is not "spot" or "futures-um"
|
|
194
|
+
ValueError: If symbol is None, empty, or contains invalid characters
|
|
195
|
+
(path traversal, special characters, non-alphanumeric).
|
|
196
|
+
ValueError: If date format is incorrect (not YYYY-MM-DD).
|
|
197
|
+
ValueError: If end_date is before start_date.
|
|
198
|
+
ValueError: If output_format is not 'csv' or 'parquet'.
|
|
199
|
+
FileNotFoundError: If output_dir path is invalid.
|
|
200
|
+
|
|
201
|
+
Security:
|
|
202
|
+
Input validation prevents path traversal attacks (CWE-22) by:
|
|
203
|
+
- Rejecting symbols with path characters (/, \\, ., ..)
|
|
204
|
+
- Enforcing alphanumeric-only symbols
|
|
205
|
+
- Validating date range logic
|
|
206
|
+
- Normalizing inputs to uppercase
|
|
207
|
+
|
|
208
|
+
Examples:
|
|
209
|
+
>>> # Default configuration (SOLUSDT spot, 4+ years of data)
|
|
210
|
+
>>> collector = BinancePublicDataCollector()
|
|
211
|
+
|
|
212
|
+
>>> # Futures data collection (NEW in v3.2.0)
|
|
213
|
+
>>> collector = BinancePublicDataCollector(
|
|
214
|
+
... symbol="BTCUSDT",
|
|
215
|
+
... start_date="2024-01-01",
|
|
216
|
+
... end_date="2024-12-31",
|
|
217
|
+
... instrument_type="futures-um"
|
|
218
|
+
... )
|
|
219
|
+
|
|
220
|
+
>>> # Custom output directory with Parquet format
|
|
221
|
+
>>> collector = BinancePublicDataCollector(
|
|
222
|
+
... symbol="ETHUSDT",
|
|
223
|
+
... output_dir="/path/to/crypto/data",
|
|
224
|
+
... output_format="parquet"
|
|
225
|
+
... )
|
|
226
|
+
"""
|
|
227
|
+
# ADR-0021: Validate instrument type first (fail fast)
|
|
228
|
+
if instrument_type not in ("spot", "futures-um"):
|
|
229
|
+
raise ValueError(
|
|
230
|
+
f"Invalid instrument_type '{instrument_type}'. Must be 'spot' or 'futures-um'"
|
|
231
|
+
)
|
|
232
|
+
self.instrument_type = instrument_type
|
|
233
|
+
|
|
234
|
+
# Validate and assign symbol (SEC-01, SEC-02, SEC-03)
|
|
235
|
+
self.symbol = self._validate_symbol(symbol)
|
|
236
|
+
|
|
237
|
+
# Parse and assign dates with validation
|
|
238
|
+
try:
|
|
239
|
+
self.start_date = datetime.strptime(start_date, "%Y-%m-%d")
|
|
240
|
+
# Make end_date inclusive of the full day (23:59:59)
|
|
241
|
+
self.end_date = datetime.strptime(end_date, "%Y-%m-%d").replace(
|
|
242
|
+
hour=23, minute=59, second=59
|
|
243
|
+
)
|
|
244
|
+
except ValueError as e:
|
|
245
|
+
raise ValueError(f"Invalid date format. Use YYYY-MM-DD format. Error: {e}") from e
|
|
246
|
+
|
|
247
|
+
# SEC-04: Validate date range logic
|
|
248
|
+
if self.end_date < self.start_date:
|
|
249
|
+
raise ValueError(
|
|
250
|
+
f"Invalid date range: end_date ({self.end_date.strftime('%Y-%m-%d')}) "
|
|
251
|
+
f"is before start_date ({self.start_date.strftime('%Y-%m-%d')})"
|
|
252
|
+
)
|
|
253
|
+
|
|
254
|
+
# ADR-0021: URL routing based on instrument type
|
|
255
|
+
if instrument_type == "spot":
|
|
256
|
+
self.base_url = f"{self.SPOT_BASE_URL}/monthly/klines"
|
|
257
|
+
else: # futures-um
|
|
258
|
+
self.base_url = f"{self.FUTURES_BASE_URL}/monthly/klines"
|
|
259
|
+
|
|
260
|
+
# Initialize ETag cache for bandwidth optimization (90% reduction on re-runs)
|
|
261
|
+
self.etag_cache = ETagCache()
|
|
262
|
+
|
|
263
|
+
# Validate and store output format
|
|
264
|
+
if output_format not in ["csv", "parquet"]:
|
|
265
|
+
raise ValueError(f"output_format must be 'csv' or 'parquet', got '{output_format}'")
|
|
266
|
+
self.output_format = output_format
|
|
267
|
+
|
|
268
|
+
# Configure output directory - use provided path or default to sample_data
|
|
269
|
+
if output_dir:
|
|
270
|
+
self.output_dir = Path(output_dir)
|
|
271
|
+
else:
|
|
272
|
+
self.output_dir = Path(__file__).parent.parent / "sample_data"
|
|
273
|
+
|
|
274
|
+
# Ensure output directory exists
|
|
275
|
+
self.output_dir.mkdir(parents=True, exist_ok=True)
|
|
276
|
+
|
|
277
|
+
# Initialize Rich console for progress indicators
|
|
278
|
+
# Simple logging instead of Rich console
|
|
279
|
+
|
|
280
|
+
# Available timeframes on Binance public data
|
|
281
|
+
self.available_timeframes = [
|
|
282
|
+
"1s",
|
|
283
|
+
"1m",
|
|
284
|
+
"3m",
|
|
285
|
+
"5m",
|
|
286
|
+
"15m",
|
|
287
|
+
"30m",
|
|
288
|
+
"1h",
|
|
289
|
+
"2h",
|
|
290
|
+
"4h",
|
|
291
|
+
"6h",
|
|
292
|
+
"8h",
|
|
293
|
+
"12h",
|
|
294
|
+
"1d",
|
|
295
|
+
"3d",
|
|
296
|
+
"1w",
|
|
297
|
+
"1mo",
|
|
298
|
+
]
|
|
299
|
+
|
|
300
|
+
# Validate date range and symbol
|
|
301
|
+
self._validate_parameters()
|
|
302
|
+
|
|
303
|
+
print("Binance Public Data Collector")
|
|
304
|
+
print(f"Symbol: {self.symbol}")
|
|
305
|
+
print(
|
|
306
|
+
f"Date Range: {self.start_date.strftime('%Y-%m-%d')} to {self.end_date.strftime('%Y-%m-%d')}"
|
|
307
|
+
)
|
|
308
|
+
print(f"Data Source: {self.base_url}")
|
|
309
|
+
|
|
310
|
+
def _validate_parameters(self):
|
|
311
|
+
"""Validate date range parameters.
|
|
312
|
+
|
|
313
|
+
Note: Symbol validation is handled in the API layer via get_supported_symbols().
|
|
314
|
+
See ADR-0022 for symbol alignment rationale (spot and futures both use
|
|
315
|
+
binance-futures-availability package for 713 validated symbols).
|
|
316
|
+
"""
|
|
317
|
+
today = datetime.now().date()
|
|
318
|
+
yesterday = today - timedelta(days=1)
|
|
319
|
+
|
|
320
|
+
# Check for future dates
|
|
321
|
+
if self.end_date.date() > yesterday:
|
|
322
|
+
warnings.warn(
|
|
323
|
+
f"⚠️ Requested end date {self.end_date.strftime('%Y-%m-%d')} is in the future. "
|
|
324
|
+
f"Binance public data is typically available up to {yesterday}. "
|
|
325
|
+
f"Recent data may not be available and requests may fail with 404 errors.",
|
|
326
|
+
UserWarning,
|
|
327
|
+
stacklevel=2,
|
|
328
|
+
)
|
|
329
|
+
|
|
330
|
+
def generate_monthly_urls(self, trading_timeframe: str) -> List[Tuple[str, str, str]]:
|
|
331
|
+
"""Generate list of monthly ZIP file URLs to download."""
|
|
332
|
+
monthly_zip_urls = []
|
|
333
|
+
current_month_date = self.start_date.replace(day=1) # Start of month
|
|
334
|
+
|
|
335
|
+
while current_month_date <= self.end_date:
|
|
336
|
+
year_month_string = current_month_date.strftime("%Y-%m")
|
|
337
|
+
zip_filename = f"{self.symbol}-{trading_timeframe}-{year_month_string}.zip"
|
|
338
|
+
binance_zip_url = f"{self.base_url}/{self.symbol}/{trading_timeframe}/{zip_filename}"
|
|
339
|
+
monthly_zip_urls.append((binance_zip_url, year_month_string, zip_filename))
|
|
340
|
+
|
|
341
|
+
# Move to next month
|
|
342
|
+
if current_month_date.month == 12:
|
|
343
|
+
current_month_date = current_month_date.replace(
|
|
344
|
+
year=current_month_date.year + 1, month=1
|
|
345
|
+
)
|
|
346
|
+
else:
|
|
347
|
+
current_month_date = current_month_date.replace(month=current_month_date.month + 1)
|
|
348
|
+
|
|
349
|
+
return monthly_zip_urls
|
|
350
|
+
|
|
351
|
+
def download_and_extract_month(self, binance_zip_url, zip_filename):
|
|
352
|
+
"""Download and extract a single monthly ZIP file with ETag caching.
|
|
353
|
+
|
|
354
|
+
ETag caching reduces bandwidth by storing ZIP files locally and using
|
|
355
|
+
HTTP conditional requests (If-None-Match) to check if the file has changed.
|
|
356
|
+
Since Binance historical data is immutable, this achieves 90%+ bandwidth
|
|
357
|
+
reduction on re-runs.
|
|
358
|
+
"""
|
|
359
|
+
print(f" Downloading {zip_filename}...")
|
|
360
|
+
|
|
361
|
+
try:
|
|
362
|
+
# Local cache path for ZIP files (XDG-compliant)
|
|
363
|
+
cache_zip_path = self.etag_cache.cache_dir / "zips" / zip_filename
|
|
364
|
+
cache_zip_path.parent.mkdir(parents=True, exist_ok=True)
|
|
365
|
+
|
|
366
|
+
# Check cache for ETag
|
|
367
|
+
cached_etag = self.etag_cache.get_etag(binance_zip_url)
|
|
368
|
+
|
|
369
|
+
# If we have both ETag and local file, check if remote changed
|
|
370
|
+
if cached_etag and cache_zip_path.exists():
|
|
371
|
+
request = urllib.request.Request(binance_zip_url)
|
|
372
|
+
request.add_header("If-None-Match", cached_etag)
|
|
373
|
+
print(f" 💾 Cache check: ETag {cached_etag[:8]}...")
|
|
374
|
+
|
|
375
|
+
try:
|
|
376
|
+
with urllib.request.urlopen(request, timeout=60) as http_response:
|
|
377
|
+
if http_response.status == 304:
|
|
378
|
+
# 304 Not Modified - use cached ZIP file
|
|
379
|
+
print(
|
|
380
|
+
f" ✅ Cache HIT: {zip_filename} not modified (0 bytes downloaded)"
|
|
381
|
+
)
|
|
382
|
+
# Load data from cached ZIP file
|
|
383
|
+
with zipfile.ZipFile(cache_zip_path, "r") as zip_file_handle:
|
|
384
|
+
expected_csv_filename = zip_filename.replace(".zip", ".csv")
|
|
385
|
+
if expected_csv_filename in zip_file_handle.namelist():
|
|
386
|
+
with zip_file_handle.open(
|
|
387
|
+
expected_csv_filename
|
|
388
|
+
) as extracted_csv_file:
|
|
389
|
+
csv_file_content = extracted_csv_file.read().decode("utf-8")
|
|
390
|
+
return list(
|
|
391
|
+
csv.reader(csv_file_content.strip().split("\n"))
|
|
392
|
+
)
|
|
393
|
+
else:
|
|
394
|
+
print(f" ⚠️ CSV file not found in cached {zip_filename}")
|
|
395
|
+
# Cache corrupted, delete and re-download
|
|
396
|
+
cache_zip_path.unlink()
|
|
397
|
+
self.etag_cache.invalidate(binance_zip_url)
|
|
398
|
+
elif http_response.status == 200:
|
|
399
|
+
# ETag changed - download new version
|
|
400
|
+
response_etag = http_response.headers.get("ETag")
|
|
401
|
+
content_length = http_response.headers.get("Content-Length", 0)
|
|
402
|
+
|
|
403
|
+
# Download to cache
|
|
404
|
+
with open(cache_zip_path, "wb") as cache_file:
|
|
405
|
+
shutil.copyfileobj(http_response, cache_file)
|
|
406
|
+
|
|
407
|
+
# Update ETag cache
|
|
408
|
+
if response_etag:
|
|
409
|
+
self.etag_cache.update_etag(
|
|
410
|
+
binance_zip_url, response_etag, int(content_length)
|
|
411
|
+
)
|
|
412
|
+
print(f" 📦 Cache UPDATE: Downloaded {zip_filename}")
|
|
413
|
+
|
|
414
|
+
# Extract CSV data from cached file
|
|
415
|
+
with zipfile.ZipFile(cache_zip_path, "r") as zip_file_handle:
|
|
416
|
+
expected_csv_filename = zip_filename.replace(".zip", ".csv")
|
|
417
|
+
if expected_csv_filename in zip_file_handle.namelist():
|
|
418
|
+
with zip_file_handle.open(
|
|
419
|
+
expected_csv_filename
|
|
420
|
+
) as extracted_csv_file:
|
|
421
|
+
csv_file_content = extracted_csv_file.read().decode("utf-8")
|
|
422
|
+
return list(
|
|
423
|
+
csv.reader(csv_file_content.strip().split("\n"))
|
|
424
|
+
)
|
|
425
|
+
else:
|
|
426
|
+
print(
|
|
427
|
+
f" ⚠️ HTTP {http_response.status} - {zip_filename} not available"
|
|
428
|
+
)
|
|
429
|
+
return []
|
|
430
|
+
except urllib.error.HTTPError as e:
|
|
431
|
+
if e.code == 304:
|
|
432
|
+
# Handle 304 explicitly - load from cache
|
|
433
|
+
print(f" ✅ Cache HIT: {zip_filename} not modified (0 bytes downloaded)")
|
|
434
|
+
with zipfile.ZipFile(cache_zip_path, "r") as zip_file_handle:
|
|
435
|
+
expected_csv_filename = zip_filename.replace(".zip", ".csv")
|
|
436
|
+
if expected_csv_filename in zip_file_handle.namelist():
|
|
437
|
+
with zip_file_handle.open(
|
|
438
|
+
expected_csv_filename
|
|
439
|
+
) as extracted_csv_file:
|
|
440
|
+
csv_file_content = extracted_csv_file.read().decode("utf-8")
|
|
441
|
+
return list(csv.reader(csv_file_content.strip().split("\n")))
|
|
442
|
+
else:
|
|
443
|
+
raise
|
|
444
|
+
else:
|
|
445
|
+
# No cache - download fresh
|
|
446
|
+
request = urllib.request.Request(binance_zip_url)
|
|
447
|
+
with urllib.request.urlopen(request, timeout=60) as http_response:
|
|
448
|
+
response_etag = http_response.headers.get("ETag")
|
|
449
|
+
content_length = http_response.headers.get("Content-Length", 0)
|
|
450
|
+
|
|
451
|
+
# Download to cache
|
|
452
|
+
with open(cache_zip_path, "wb") as cache_file:
|
|
453
|
+
shutil.copyfileobj(http_response, cache_file)
|
|
454
|
+
|
|
455
|
+
# Update ETag cache
|
|
456
|
+
if response_etag:
|
|
457
|
+
self.etag_cache.update_etag(
|
|
458
|
+
binance_zip_url, response_etag, int(content_length)
|
|
459
|
+
)
|
|
460
|
+
print(f" 📦 Cache MISS: Downloaded {zip_filename}")
|
|
461
|
+
|
|
462
|
+
# Extract CSV data from cached file
|
|
463
|
+
with zipfile.ZipFile(cache_zip_path, "r") as zip_file_handle:
|
|
464
|
+
expected_csv_filename = zip_filename.replace(".zip", ".csv")
|
|
465
|
+
if expected_csv_filename in zip_file_handle.namelist():
|
|
466
|
+
with zip_file_handle.open(expected_csv_filename) as extracted_csv_file:
|
|
467
|
+
csv_file_content = extracted_csv_file.read().decode("utf-8")
|
|
468
|
+
return list(csv.reader(csv_file_content.strip().split("\n")))
|
|
469
|
+
else:
|
|
470
|
+
print(f" ⚠️ CSV file not found in {zip_filename}")
|
|
471
|
+
return []
|
|
472
|
+
|
|
473
|
+
except Exception as download_exception:
|
|
474
|
+
print(f" ❌ Error downloading {zip_filename}: {download_exception}")
|
|
475
|
+
|
|
476
|
+
# Implement automatic fallback to daily files when monthly fails
|
|
477
|
+
print(f" 🔄 Attempting daily file fallback for {zip_filename}")
|
|
478
|
+
return self._fallback_to_daily_files(zip_filename)
|
|
479
|
+
|
|
480
|
+
def _fallback_to_daily_files(self, failed_monthly_filename):
|
|
481
|
+
"""
|
|
482
|
+
Fallback to daily file downloads when monthly file is not available.
|
|
483
|
+
|
|
484
|
+
Automatically downloads individual daily files for the failed month
|
|
485
|
+
and combines them into a single dataset for seamless operation.
|
|
486
|
+
|
|
487
|
+
Args:
|
|
488
|
+
failed_monthly_filename: The monthly filename that failed (e.g., "BTCUSDT-1d-2025-09.zip")
|
|
489
|
+
|
|
490
|
+
Returns:
|
|
491
|
+
List of combined daily data, or empty list if all daily files also fail
|
|
492
|
+
"""
|
|
493
|
+
# Extract symbol, timeframe, and year-month from failed filename
|
|
494
|
+
# Format: "BTCUSDT-1d-2025-09.zip"
|
|
495
|
+
parts = failed_monthly_filename.replace(".zip", "").split("-")
|
|
496
|
+
if len(parts) < 4:
|
|
497
|
+
print(f" ❌ Cannot parse monthly filename: {failed_monthly_filename}")
|
|
498
|
+
return []
|
|
499
|
+
|
|
500
|
+
symbol = parts[0]
|
|
501
|
+
timeframe = parts[1]
|
|
502
|
+
year = parts[2]
|
|
503
|
+
month = parts[3]
|
|
504
|
+
|
|
505
|
+
print(f" 📅 Fallback: Downloading daily files for {symbol} {timeframe} {year}-{month}")
|
|
506
|
+
|
|
507
|
+
# Generate daily URLs for the entire month
|
|
508
|
+
daily_urls = self._generate_daily_urls_for_month(symbol, timeframe, year, month)
|
|
509
|
+
|
|
510
|
+
# Download all daily files for this month
|
|
511
|
+
combined_daily_data = []
|
|
512
|
+
successful_daily_downloads = 0
|
|
513
|
+
|
|
514
|
+
for daily_url, daily_filename in daily_urls:
|
|
515
|
+
daily_data = self._download_and_extract_daily_file(daily_url, daily_filename)
|
|
516
|
+
if daily_data:
|
|
517
|
+
combined_daily_data.extend(daily_data)
|
|
518
|
+
successful_daily_downloads += 1
|
|
519
|
+
|
|
520
|
+
if successful_daily_downloads > 0:
|
|
521
|
+
print(
|
|
522
|
+
f" ✅ Daily fallback successful: {successful_daily_downloads}/{len(daily_urls)} daily files retrieved"
|
|
523
|
+
)
|
|
524
|
+
return combined_daily_data
|
|
525
|
+
else:
|
|
526
|
+
print(f" ❌ Daily fallback failed: No daily files available for {year}-{month}")
|
|
527
|
+
return []
|
|
528
|
+
|
|
529
|
+
def _generate_daily_urls_for_month(self, symbol, timeframe, year, month):
|
|
530
|
+
"""Generate daily URLs for all days in a specific month."""
|
|
531
|
+
from calendar import monthrange
|
|
532
|
+
|
|
533
|
+
# Get number of days in the month
|
|
534
|
+
year_int = int(year)
|
|
535
|
+
month_int = int(month)
|
|
536
|
+
_, days_in_month = monthrange(year_int, month_int)
|
|
537
|
+
|
|
538
|
+
daily_urls = []
|
|
539
|
+
|
|
540
|
+
# Use daily data URL pattern: https://data.binance.vision/data/spot/daily/klines/
|
|
541
|
+
daily_base_url = self.base_url.replace("/monthly/", "/daily/")
|
|
542
|
+
|
|
543
|
+
for day in range(1, days_in_month + 1):
|
|
544
|
+
date_str = f"{year}-{month_int:02d}-{day:02d}"
|
|
545
|
+
daily_filename = f"{symbol}-{timeframe}-{date_str}.zip"
|
|
546
|
+
daily_url = f"{daily_base_url}/{symbol}/{timeframe}/{daily_filename}"
|
|
547
|
+
daily_urls.append((daily_url, daily_filename))
|
|
548
|
+
|
|
549
|
+
return daily_urls
|
|
550
|
+
|
|
551
|
+
def _download_and_extract_daily_file(self, daily_url, daily_filename):
|
|
552
|
+
"""Download and extract a single daily ZIP file."""
|
|
553
|
+
try:
|
|
554
|
+
with tempfile.NamedTemporaryFile() as temporary_zip_file:
|
|
555
|
+
# Download daily ZIP file
|
|
556
|
+
with urllib.request.urlopen(daily_url, timeout=30) as http_response:
|
|
557
|
+
if http_response.status == 200:
|
|
558
|
+
shutil.copyfileobj(http_response, temporary_zip_file)
|
|
559
|
+
temporary_zip_file.flush()
|
|
560
|
+
else:
|
|
561
|
+
# Daily file not available (normal for future dates or weekends)
|
|
562
|
+
return []
|
|
563
|
+
|
|
564
|
+
# Extract CSV data from daily file
|
|
565
|
+
with zipfile.ZipFile(temporary_zip_file.name, "r") as zip_file_handle:
|
|
566
|
+
expected_csv_filename = daily_filename.replace(".zip", ".csv")
|
|
567
|
+
if expected_csv_filename in zip_file_handle.namelist():
|
|
568
|
+
with zip_file_handle.open(expected_csv_filename) as extracted_csv_file:
|
|
569
|
+
csv_file_content = extracted_csv_file.read().decode("utf-8")
|
|
570
|
+
return list(csv.reader(csv_file_content.strip().split("\n")))
|
|
571
|
+
else:
|
|
572
|
+
return []
|
|
573
|
+
|
|
574
|
+
except Exception:
|
|
575
|
+
# Silent failure for daily files - many days may not have data
|
|
576
|
+
return []
|
|
577
|
+
|
|
578
|
+
def _detect_header_intelligent(self, raw_csv_data):
|
|
579
|
+
"""Intelligent header detection - determine if first row is data or header."""
|
|
580
|
+
if not raw_csv_data:
|
|
581
|
+
return False
|
|
582
|
+
|
|
583
|
+
first_csv_row = raw_csv_data[0]
|
|
584
|
+
if len(first_csv_row) < 6:
|
|
585
|
+
return False
|
|
586
|
+
|
|
587
|
+
# Header detection heuristics
|
|
588
|
+
try:
|
|
589
|
+
# Test if first field is numeric timestamp
|
|
590
|
+
first_field_value = int(first_csv_row[0])
|
|
591
|
+
|
|
592
|
+
# ✅ BOUNDARY FIX: Support both milliseconds (13-digit) AND microseconds (16-digit) formats
|
|
593
|
+
# Valid timestamp ranges:
|
|
594
|
+
# Milliseconds: 1000000000000 (2001) to 9999999999999 (2286)
|
|
595
|
+
# Microseconds: 1000000000000000 (2001) to 9999999999999999 (2286)
|
|
596
|
+
is_valid_millisecond_timestamp = 1000000000000 <= first_field_value <= 9999999999999
|
|
597
|
+
is_valid_microsecond_timestamp = (
|
|
598
|
+
1000000000000000 <= first_field_value <= 9999999999999999
|
|
599
|
+
)
|
|
600
|
+
|
|
601
|
+
if is_valid_millisecond_timestamp or is_valid_microsecond_timestamp:
|
|
602
|
+
# Test if other fields are numeric (prices/volumes)
|
|
603
|
+
for ohlcv_field_index in [1, 2, 3, 4, 5]: # OHLCV fields
|
|
604
|
+
float(first_csv_row[ohlcv_field_index])
|
|
605
|
+
return False # All numeric = data row
|
|
606
|
+
else:
|
|
607
|
+
return True # Invalid timestamp = likely header
|
|
608
|
+
|
|
609
|
+
except (ValueError, IndexError):
|
|
610
|
+
# Non-numeric first field = header
|
|
611
|
+
return True
|
|
612
|
+
|
|
613
|
+
def process_raw_data(self, raw_csv_data):
|
|
614
|
+
"""Convert raw Binance CSV data with comprehensive timestamp format tracking and transition detection."""
|
|
615
|
+
processed_candle_data = []
|
|
616
|
+
self.corruption_log = getattr(self, "corruption_log", [])
|
|
617
|
+
|
|
618
|
+
# Initialize timestamp format analyzer
|
|
619
|
+
format_analyzer = TimestampFormatAnalyzer()
|
|
620
|
+
format_analyzer.initialize_tracking()
|
|
621
|
+
|
|
622
|
+
# Intelligent header detection
|
|
623
|
+
csv_has_header = self._detect_header_intelligent(raw_csv_data)
|
|
624
|
+
data_start_row_index = 1 if csv_has_header else 0
|
|
625
|
+
|
|
626
|
+
# Store header detection results for metadata
|
|
627
|
+
self._header_detected = csv_has_header
|
|
628
|
+
self._header_content = raw_csv_data[0][:6] if csv_has_header else None
|
|
629
|
+
self._data_start_row = data_start_row_index
|
|
630
|
+
|
|
631
|
+
if csv_has_header:
|
|
632
|
+
print(f" 📋 Header detected: {raw_csv_data[0][:6]}")
|
|
633
|
+
else:
|
|
634
|
+
print(" 📊 Pure data format detected (no header)")
|
|
635
|
+
|
|
636
|
+
format_transition_logged = False
|
|
637
|
+
|
|
638
|
+
for csv_row_index, csv_row_data in enumerate(
|
|
639
|
+
raw_csv_data[data_start_row_index:], start=data_start_row_index
|
|
640
|
+
):
|
|
641
|
+
if len(csv_row_data) >= 6: # Binance format has 12 columns but we need first 6
|
|
642
|
+
try:
|
|
643
|
+
# Binance format: [timestamp, open, high, low, close, volume, close_time, quote_volume, count, taker_buy_volume, taker_buy_quote_volume, ignore]
|
|
644
|
+
raw_timestamp_value = int(csv_row_data[0])
|
|
645
|
+
|
|
646
|
+
# Comprehensive format detection with transition tracking
|
|
647
|
+
(
|
|
648
|
+
detected_timestamp_format,
|
|
649
|
+
converted_timestamp_seconds,
|
|
650
|
+
format_validation_result,
|
|
651
|
+
) = format_analyzer.analyze_timestamp_format(raw_timestamp_value, csv_row_index)
|
|
652
|
+
|
|
653
|
+
# Track format transitions and update statistics
|
|
654
|
+
if format_analyzer.current_format is None:
|
|
655
|
+
print(f" 🎯 Initial timestamp format: {detected_timestamp_format}")
|
|
656
|
+
|
|
657
|
+
transition_detected = format_analyzer.update_format_stats(
|
|
658
|
+
detected_timestamp_format, raw_timestamp_value, csv_row_index
|
|
659
|
+
)
|
|
660
|
+
|
|
661
|
+
if transition_detected and not format_transition_logged:
|
|
662
|
+
last_transition = format_analyzer.format_transitions[-1]
|
|
663
|
+
print(
|
|
664
|
+
f" 🔄 Format transition detected: {last_transition['from_format']} → {detected_timestamp_format}"
|
|
665
|
+
)
|
|
666
|
+
format_transition_logged = True
|
|
667
|
+
|
|
668
|
+
# Skip if validation failed
|
|
669
|
+
if not format_validation_result["valid"]:
|
|
670
|
+
self.corruption_log.append(format_validation_result["error_details"])
|
|
671
|
+
continue
|
|
672
|
+
|
|
673
|
+
# ✅ CRITICAL FIX: Use UTC to match Binance's native timezone
|
|
674
|
+
# Eliminates artificial DST gaps caused by local timezone conversion
|
|
675
|
+
utc_datetime = datetime.fromtimestamp(converted_timestamp_seconds, timezone.utc)
|
|
676
|
+
|
|
677
|
+
# ✅ BOUNDARY FIX: Don't filter per-monthly-file to preserve month boundaries
|
|
678
|
+
# Enhanced processing: capture all 11 essential Binance columns for complete microstructure analysis
|
|
679
|
+
processed_candle_row = [
|
|
680
|
+
utc_datetime.strftime("%Y-%m-%d %H:%M:%S"), # date (from open_time)
|
|
681
|
+
float(csv_row_data[1]), # open
|
|
682
|
+
float(csv_row_data[2]), # high
|
|
683
|
+
float(csv_row_data[3]), # low
|
|
684
|
+
float(csv_row_data[4]), # close
|
|
685
|
+
float(csv_row_data[5]), # volume (base asset volume)
|
|
686
|
+
# Additional microstructure columns for professional analysis
|
|
687
|
+
datetime.fromtimestamp(
|
|
688
|
+
int(csv_row_data[6])
|
|
689
|
+
/ (1000000 if len(str(int(csv_row_data[6]))) >= 16 else 1000),
|
|
690
|
+
timezone.utc,
|
|
691
|
+
).strftime("%Y-%m-%d %H:%M:%S"), # close_time
|
|
692
|
+
float(csv_row_data[7]), # quote_asset_volume
|
|
693
|
+
int(csv_row_data[8]), # number_of_trades
|
|
694
|
+
float(csv_row_data[9]), # taker_buy_base_asset_volume
|
|
695
|
+
float(csv_row_data[10]), # taker_buy_quote_asset_volume
|
|
696
|
+
]
|
|
697
|
+
processed_candle_data.append(processed_candle_row)
|
|
698
|
+
|
|
699
|
+
except (ValueError, OSError, OverflowError) as parsing_exception:
|
|
700
|
+
format_analyzer.format_stats["unknown"]["count"] += 1
|
|
701
|
+
error_record = {
|
|
702
|
+
"row_index": csv_row_index,
|
|
703
|
+
"error_type": "timestamp_parse_error",
|
|
704
|
+
"error_message": str(parsing_exception),
|
|
705
|
+
"raw_row": csv_row_data[:10] if len(csv_row_data) > 10 else csv_row_data,
|
|
706
|
+
}
|
|
707
|
+
self.corruption_log.append(error_record)
|
|
708
|
+
format_analyzer.format_stats["unknown"]["errors"].append(error_record)
|
|
709
|
+
continue
|
|
710
|
+
else:
|
|
711
|
+
# Record insufficient columns
|
|
712
|
+
self.corruption_log.append(
|
|
713
|
+
{
|
|
714
|
+
"row_index": csv_row_index,
|
|
715
|
+
"error_type": "insufficient_columns",
|
|
716
|
+
"column_count": len(csv_row_data),
|
|
717
|
+
"raw_row": csv_row_data,
|
|
718
|
+
}
|
|
719
|
+
)
|
|
720
|
+
|
|
721
|
+
# Report comprehensive format analysis
|
|
722
|
+
format_analyzer.report_format_analysis()
|
|
723
|
+
|
|
724
|
+
# Store format analysis summary for metadata
|
|
725
|
+
self._format_analysis_summary = format_analyzer.get_format_analysis_summary()
|
|
726
|
+
|
|
727
|
+
return processed_candle_data
|
|
728
|
+
|
|
729
|
+
def collect_timeframe_data(self, trading_timeframe: str) -> Dict[str, Any]:
|
|
730
|
+
"""Collect complete historical data for a single timeframe with full 11-column microstructure format.
|
|
731
|
+
|
|
732
|
+
Downloads and processes monthly ZIP files from Binance's public data repository
|
|
733
|
+
for the specified timeframe. Automatically handles data processing, validation,
|
|
734
|
+
and saves to CSV while returning a DataFrame for immediate use.
|
|
735
|
+
|
|
736
|
+
Args:
|
|
737
|
+
trading_timeframe (str): Timeframe for data collection.
|
|
738
|
+
Must be one of: "1m", "3m", "5m", "15m", "30m", "1h", "2h", "4h".
|
|
739
|
+
|
|
740
|
+
Returns:
|
|
741
|
+
dict: Collection results containing:
|
|
742
|
+
- dataframe (pd.DataFrame): Complete OHLCV data with 11 columns:
|
|
743
|
+
* date: Timestamp (open time)
|
|
744
|
+
* open, high, low, close: Price data
|
|
745
|
+
* volume: Base asset volume
|
|
746
|
+
* close_time: Timestamp (close time)
|
|
747
|
+
* quote_asset_volume: Quote asset volume
|
|
748
|
+
* number_of_trades: Trade count
|
|
749
|
+
* taker_buy_base_asset_volume: Taker buy base volume
|
|
750
|
+
* taker_buy_quote_asset_volume: Taker buy quote volume
|
|
751
|
+
- filepath (Path): Path to saved CSV file
|
|
752
|
+
- stats (dict): Collection statistics including duration and bar count
|
|
753
|
+
|
|
754
|
+
Raises:
|
|
755
|
+
ValueError: If trading_timeframe is not supported.
|
|
756
|
+
ConnectionError: If download from Binance repository fails.
|
|
757
|
+
FileNotFoundError: If output directory is invalid.
|
|
758
|
+
|
|
759
|
+
Examples:
|
|
760
|
+
>>> collector = BinancePublicDataCollector(symbol="BTCUSDT")
|
|
761
|
+
>>> result = collector.collect_timeframe_data("1h")
|
|
762
|
+
>>> df = result["dataframe"]
|
|
763
|
+
>>> print(f"Collected {len(df)} hourly bars")
|
|
764
|
+
>>> print(f"Date range: {df['date'].min()} to {df['date'].max()}")
|
|
765
|
+
Collected 26280 hourly bars
|
|
766
|
+
Date range: 2020-08-15 01:00:00 to 2025-03-20 23:00:00
|
|
767
|
+
|
|
768
|
+
>>> # Access microstructure data
|
|
769
|
+
>>> print(f"Total trades: {df['number_of_trades'].sum():,}")
|
|
770
|
+
>>> print(f"Average taker buy ratio: {df['taker_buy_base_asset_volume'].sum() / df['volume'].sum():.2%}")
|
|
771
|
+
Total trades: 15,234,567
|
|
772
|
+
Average taker buy ratio: 51.23%
|
|
773
|
+
|
|
774
|
+
Note:
|
|
775
|
+
This method processes data chronologically and may take several minutes
|
|
776
|
+
for large date ranges due to monthly ZIP file downloads. Progress is
|
|
777
|
+
displayed during collection.
|
|
778
|
+
"""
|
|
779
|
+
print(f"\n{'=' * 60}")
|
|
780
|
+
print(f"COLLECTING {trading_timeframe.upper()} DATA FROM BINANCE PUBLIC REPOSITORY")
|
|
781
|
+
print(f"{'=' * 60}")
|
|
782
|
+
|
|
783
|
+
if trading_timeframe not in self.available_timeframes:
|
|
784
|
+
print(f"❌ Timeframe '{trading_timeframe}' not available")
|
|
785
|
+
print(f"📊 Available timeframes: {', '.join(self.available_timeframes)}")
|
|
786
|
+
print("💡 Use 'gapless-crypto-data --list-timeframes' for detailed descriptions")
|
|
787
|
+
return None
|
|
788
|
+
|
|
789
|
+
# Generate monthly URLs
|
|
790
|
+
monthly_zip_urls = self.generate_monthly_urls(trading_timeframe)
|
|
791
|
+
print(f"Monthly files to download: {len(monthly_zip_urls)}")
|
|
792
|
+
|
|
793
|
+
# Collect data from all months
|
|
794
|
+
combined_candle_data = []
|
|
795
|
+
successful_download_count = 0
|
|
796
|
+
|
|
797
|
+
for binance_zip_url, year_month_string, zip_filename in monthly_zip_urls:
|
|
798
|
+
raw_monthly_csv_data = self.download_and_extract_month(binance_zip_url, zip_filename)
|
|
799
|
+
if raw_monthly_csv_data:
|
|
800
|
+
processed_monthly_data = self.process_raw_data(raw_monthly_csv_data)
|
|
801
|
+
combined_candle_data.extend(processed_monthly_data)
|
|
802
|
+
successful_download_count += 1
|
|
803
|
+
print(f" ✅ {len(processed_monthly_data):,} bars from {year_month_string}")
|
|
804
|
+
else:
|
|
805
|
+
print(f" ⚠️ No data from {year_month_string}")
|
|
806
|
+
|
|
807
|
+
print("\nCollection Summary:")
|
|
808
|
+
print(f" Successful downloads: {successful_download_count}/{len(monthly_zip_urls)}")
|
|
809
|
+
print(f" Total bars collected: {len(combined_candle_data):,}")
|
|
810
|
+
|
|
811
|
+
# ETag cache statistics for observability
|
|
812
|
+
cache_stats = self.etag_cache.get_cache_stats()
|
|
813
|
+
if cache_stats["total_entries"] > 0:
|
|
814
|
+
total_cached_size_mb = cache_stats["total_cached_size"] / (1024 * 1024)
|
|
815
|
+
print(
|
|
816
|
+
f" ETag cache: {cache_stats['total_entries']} entries, {total_cached_size_mb:.1f} MB tracked"
|
|
817
|
+
)
|
|
818
|
+
|
|
819
|
+
if combined_candle_data:
|
|
820
|
+
# Sort by timestamp to ensure chronological order
|
|
821
|
+
combined_candle_data.sort(key=lambda candle_row: candle_row[0])
|
|
822
|
+
print(
|
|
823
|
+
f" Pre-filtering range: {combined_candle_data[0][0]} to {combined_candle_data[-1][0]}"
|
|
824
|
+
)
|
|
825
|
+
|
|
826
|
+
# ✅ BOUNDARY FIX: Apply final date range filtering after combining all monthly data
|
|
827
|
+
# This preserves month boundaries while respecting the requested date range
|
|
828
|
+
date_filtered_data = []
|
|
829
|
+
for candle_row in combined_candle_data:
|
|
830
|
+
candle_datetime = datetime.strptime(candle_row[0], "%Y-%m-%d %H:%M:%S")
|
|
831
|
+
if self.start_date <= candle_datetime <= self.end_date:
|
|
832
|
+
date_filtered_data.append(candle_row)
|
|
833
|
+
|
|
834
|
+
print(f" Post-filtering: {len(date_filtered_data):,} bars in requested range")
|
|
835
|
+
if date_filtered_data:
|
|
836
|
+
print(f" Final range: {date_filtered_data[0][0]} to {date_filtered_data[-1][0]}")
|
|
837
|
+
|
|
838
|
+
# Save to CSV and return DataFrame for seamless Python integration
|
|
839
|
+
if date_filtered_data:
|
|
840
|
+
# Calculate collection stats for metadata
|
|
841
|
+
collection_stats = {
|
|
842
|
+
"method": "direct_download",
|
|
843
|
+
"duration": 0.0, # Minimal for single timeframe
|
|
844
|
+
"bars_per_second": 0,
|
|
845
|
+
"total_bars": len(date_filtered_data),
|
|
846
|
+
}
|
|
847
|
+
|
|
848
|
+
# Save to CSV file (addresses the output_dir bug)
|
|
849
|
+
filepath = self.save_data(trading_timeframe, date_filtered_data, collection_stats)
|
|
850
|
+
|
|
851
|
+
# Convert to DataFrame for Python API users
|
|
852
|
+
columns = [
|
|
853
|
+
"date",
|
|
854
|
+
"open",
|
|
855
|
+
"high",
|
|
856
|
+
"low",
|
|
857
|
+
"close",
|
|
858
|
+
"volume",
|
|
859
|
+
"close_time",
|
|
860
|
+
"quote_asset_volume",
|
|
861
|
+
"number_of_trades",
|
|
862
|
+
"taker_buy_base_asset_volume",
|
|
863
|
+
"taker_buy_quote_asset_volume",
|
|
864
|
+
]
|
|
865
|
+
df = pd.DataFrame(date_filtered_data, columns=columns)
|
|
866
|
+
|
|
867
|
+
# Convert numeric columns
|
|
868
|
+
numeric_cols = [
|
|
869
|
+
"open",
|
|
870
|
+
"high",
|
|
871
|
+
"low",
|
|
872
|
+
"close",
|
|
873
|
+
"volume",
|
|
874
|
+
"quote_asset_volume",
|
|
875
|
+
"number_of_trades",
|
|
876
|
+
"taker_buy_base_asset_volume",
|
|
877
|
+
"taker_buy_quote_asset_volume",
|
|
878
|
+
]
|
|
879
|
+
for col in numeric_cols:
|
|
880
|
+
df[col] = pd.to_numeric(df[col], errors="coerce")
|
|
881
|
+
|
|
882
|
+
# Convert date columns to datetime
|
|
883
|
+
df["date"] = pd.to_datetime(df["date"])
|
|
884
|
+
df["close_time"] = pd.to_datetime(df["close_time"])
|
|
885
|
+
|
|
886
|
+
return {"dataframe": df, "filepath": filepath, "stats": collection_stats}
|
|
887
|
+
|
|
888
|
+
return {"dataframe": pd.DataFrame(), "filepath": None, "stats": {}}
|
|
889
|
+
|
|
890
|
+
# Save to CSV and return DataFrame for unfiltered data
|
|
891
|
+
if combined_candle_data:
|
|
892
|
+
# Calculate collection stats for metadata
|
|
893
|
+
collection_stats = {
|
|
894
|
+
"method": "direct_download",
|
|
895
|
+
"duration": 0.0, # Minimal for single timeframe
|
|
896
|
+
"bars_per_second": 0,
|
|
897
|
+
"total_bars": len(combined_candle_data),
|
|
898
|
+
}
|
|
899
|
+
|
|
900
|
+
# Save to CSV file (addresses the output_dir bug)
|
|
901
|
+
filepath = self.save_data(trading_timeframe, combined_candle_data, collection_stats)
|
|
902
|
+
|
|
903
|
+
# Convert to DataFrame for Python API users
|
|
904
|
+
columns = [
|
|
905
|
+
"date",
|
|
906
|
+
"open",
|
|
907
|
+
"high",
|
|
908
|
+
"low",
|
|
909
|
+
"close",
|
|
910
|
+
"volume",
|
|
911
|
+
"close_time",
|
|
912
|
+
"quote_asset_volume",
|
|
913
|
+
"number_of_trades",
|
|
914
|
+
"taker_buy_base_asset_volume",
|
|
915
|
+
"taker_buy_quote_asset_volume",
|
|
916
|
+
]
|
|
917
|
+
df = pd.DataFrame(combined_candle_data, columns=columns)
|
|
918
|
+
|
|
919
|
+
# Convert numeric columns
|
|
920
|
+
numeric_cols = [
|
|
921
|
+
"open",
|
|
922
|
+
"high",
|
|
923
|
+
"low",
|
|
924
|
+
"close",
|
|
925
|
+
"volume",
|
|
926
|
+
"quote_asset_volume",
|
|
927
|
+
"number_of_trades",
|
|
928
|
+
"taker_buy_base_asset_volume",
|
|
929
|
+
"taker_buy_quote_asset_volume",
|
|
930
|
+
]
|
|
931
|
+
for col in numeric_cols:
|
|
932
|
+
df[col] = pd.to_numeric(df[col], errors="coerce")
|
|
933
|
+
|
|
934
|
+
# Convert date columns to datetime
|
|
935
|
+
df["date"] = pd.to_datetime(df["date"])
|
|
936
|
+
df["close_time"] = pd.to_datetime(df["close_time"])
|
|
937
|
+
|
|
938
|
+
return {"dataframe": df, "filepath": filepath, "stats": collection_stats}
|
|
939
|
+
|
|
940
|
+
return {"dataframe": pd.DataFrame(), "filepath": None, "stats": {}}
|
|
941
|
+
|
|
942
|
+
def generate_metadata(
|
|
943
|
+
self, trading_timeframe, candle_data, collection_performance_stats, gap_analysis_result=None
|
|
944
|
+
):
|
|
945
|
+
"""Generate comprehensive metadata for 11-column microstructure format."""
|
|
946
|
+
if not candle_data:
|
|
947
|
+
return {}
|
|
948
|
+
|
|
949
|
+
# Calculate statistics
|
|
950
|
+
price_values = []
|
|
951
|
+
volume_values = []
|
|
952
|
+
for candle_row in candle_data:
|
|
953
|
+
price_values.extend([candle_row[2], candle_row[3]]) # high, low
|
|
954
|
+
volume_values.append(candle_row[5])
|
|
955
|
+
|
|
956
|
+
return {
|
|
957
|
+
"version": "v2.10.0",
|
|
958
|
+
"generator": "BinancePublicDataCollector",
|
|
959
|
+
"generation_timestamp": datetime.now(timezone.utc).isoformat() + "Z",
|
|
960
|
+
"data_source": "Binance Public Data Repository",
|
|
961
|
+
"data_source_url": self.base_url,
|
|
962
|
+
"market_type": "spot",
|
|
963
|
+
"symbol": self.symbol,
|
|
964
|
+
"timeframe": trading_timeframe,
|
|
965
|
+
"collection_method": "direct_download",
|
|
966
|
+
"target_period": {
|
|
967
|
+
"start": self.start_date.isoformat(),
|
|
968
|
+
"end": self.end_date.isoformat(),
|
|
969
|
+
"total_days": (self.end_date - self.start_date).days,
|
|
970
|
+
},
|
|
971
|
+
"actual_bars": len(candle_data),
|
|
972
|
+
"date_range": {
|
|
973
|
+
"start": candle_data[0][0] if candle_data else None,
|
|
974
|
+
"end": candle_data[-1][0] if candle_data else None,
|
|
975
|
+
},
|
|
976
|
+
"statistics": {
|
|
977
|
+
"price_min": min(price_values) if price_values else 0,
|
|
978
|
+
"price_max": max(price_values) if price_values else 0,
|
|
979
|
+
"volume_total": sum(volume_values) if volume_values else 0,
|
|
980
|
+
"volume_mean": sum(volume_values) / len(volume_values) if volume_values else 0,
|
|
981
|
+
},
|
|
982
|
+
"collection_performance": collection_performance_stats,
|
|
983
|
+
"data_integrity": {
|
|
984
|
+
"chronological_order": True,
|
|
985
|
+
"data_hash": self._calculate_data_hash(candle_data),
|
|
986
|
+
"corruption_detected": len(getattr(self, "corruption_log", [])) > 0,
|
|
987
|
+
"corrupted_rows_count": len(getattr(self, "corruption_log", [])),
|
|
988
|
+
"corruption_details": getattr(self, "corruption_log", []),
|
|
989
|
+
"header_detection": {
|
|
990
|
+
"header_found": getattr(self, "_header_detected", False),
|
|
991
|
+
"header_content": getattr(self, "_header_content", None),
|
|
992
|
+
"data_start_row": getattr(self, "_data_start_row", 0),
|
|
993
|
+
},
|
|
994
|
+
},
|
|
995
|
+
"timestamp_format_analysis": getattr(
|
|
996
|
+
self,
|
|
997
|
+
"_format_analysis_summary",
|
|
998
|
+
{
|
|
999
|
+
"total_rows_analyzed": 0,
|
|
1000
|
+
"formats_detected": {},
|
|
1001
|
+
"transitions_detected": 0,
|
|
1002
|
+
"transition_details": [],
|
|
1003
|
+
"primary_format": "unknown",
|
|
1004
|
+
"format_consistency": True,
|
|
1005
|
+
"analysis_note": "Format analysis not available - may be legacy collection",
|
|
1006
|
+
},
|
|
1007
|
+
),
|
|
1008
|
+
"enhanced_microstructure_format": {
|
|
1009
|
+
"format_version": "v2.10.0",
|
|
1010
|
+
"total_columns": len(candle_data[0]) if candle_data else 11,
|
|
1011
|
+
"enhanced_features": [
|
|
1012
|
+
"quote_asset_volume",
|
|
1013
|
+
"number_of_trades",
|
|
1014
|
+
"taker_buy_base_asset_volume",
|
|
1015
|
+
"taker_buy_quote_asset_volume",
|
|
1016
|
+
"close_time",
|
|
1017
|
+
],
|
|
1018
|
+
"analysis_capabilities": [
|
|
1019
|
+
"order_flow_analysis",
|
|
1020
|
+
"liquidity_metrics",
|
|
1021
|
+
"market_microstructure",
|
|
1022
|
+
"trade_weighted_prices",
|
|
1023
|
+
"institutional_data_patterns",
|
|
1024
|
+
],
|
|
1025
|
+
"professional_features": True,
|
|
1026
|
+
"api_format_compatibility": True,
|
|
1027
|
+
},
|
|
1028
|
+
"gap_analysis": gap_analysis_result
|
|
1029
|
+
or {
|
|
1030
|
+
"analysis_performed": False,
|
|
1031
|
+
"total_gaps_detected": 0,
|
|
1032
|
+
"gaps_filled": 0,
|
|
1033
|
+
"gaps_remaining": 0,
|
|
1034
|
+
"gap_details": [],
|
|
1035
|
+
"gap_filling_method": "authentic_binance_api",
|
|
1036
|
+
"data_completeness_score": 1.0,
|
|
1037
|
+
"note": "Gap analysis can be performed using UniversalGapFiller.detect_all_gaps()",
|
|
1038
|
+
},
|
|
1039
|
+
"compliance": {
|
|
1040
|
+
"zero_magic_numbers": True,
|
|
1041
|
+
"temporal_integrity": True,
|
|
1042
|
+
"authentic_spot_data_only": True,
|
|
1043
|
+
"official_binance_source": True,
|
|
1044
|
+
"binance_format_transition_aware": True,
|
|
1045
|
+
"supports_milliseconds_microseconds": True,
|
|
1046
|
+
"full_binance_microstructure_format": True,
|
|
1047
|
+
"professional_trading_ready": True,
|
|
1048
|
+
},
|
|
1049
|
+
}
|
|
1050
|
+
|
|
1051
|
+
def _perform_gap_analysis(self, data, timeframe):
|
|
1052
|
+
"""Perform gap analysis on collected data and return detailed results."""
|
|
1053
|
+
if not data or len(data) < 2:
|
|
1054
|
+
return {
|
|
1055
|
+
"analysis_performed": True,
|
|
1056
|
+
"total_gaps_detected": 0,
|
|
1057
|
+
"gaps_filled": 0,
|
|
1058
|
+
"gaps_remaining": 0,
|
|
1059
|
+
"gap_details": [],
|
|
1060
|
+
"gap_filling_method": "authentic_binance_api",
|
|
1061
|
+
"data_completeness_score": 1.0,
|
|
1062
|
+
"note": "Insufficient data for gap analysis (< 2 rows)",
|
|
1063
|
+
}
|
|
1064
|
+
|
|
1065
|
+
# Calculate expected interval in minutes using centralized constants
|
|
1066
|
+
interval_minutes = TIMEFRAME_TO_MINUTES.get(timeframe, 60)
|
|
1067
|
+
expected_gap_minutes = interval_minutes
|
|
1068
|
+
|
|
1069
|
+
# Analyze timestamp gaps
|
|
1070
|
+
gaps_detected = []
|
|
1071
|
+
total_bars_expected = 0
|
|
1072
|
+
|
|
1073
|
+
for i in range(1, len(data)):
|
|
1074
|
+
current_time = datetime.strptime(data[i][0], "%Y-%m-%d %H:%M:%S")
|
|
1075
|
+
previous_time = datetime.strptime(data[i - 1][0], "%Y-%m-%d %H:%M:%S")
|
|
1076
|
+
|
|
1077
|
+
actual_gap_minutes = (current_time - previous_time).total_seconds() / 60
|
|
1078
|
+
|
|
1079
|
+
if actual_gap_minutes > expected_gap_minutes * 1.5: # Allow 50% tolerance
|
|
1080
|
+
missing_bars = int(actual_gap_minutes / expected_gap_minutes) - 1
|
|
1081
|
+
if missing_bars > 0:
|
|
1082
|
+
gaps_detected.append(
|
|
1083
|
+
{
|
|
1084
|
+
"gap_start": data[i - 1][0],
|
|
1085
|
+
"gap_end": data[i][0],
|
|
1086
|
+
"missing_bars": missing_bars,
|
|
1087
|
+
"duration_minutes": actual_gap_minutes - expected_gap_minutes,
|
|
1088
|
+
}
|
|
1089
|
+
)
|
|
1090
|
+
total_bars_expected += missing_bars
|
|
1091
|
+
|
|
1092
|
+
# Calculate completeness score
|
|
1093
|
+
total_bars_collected = len(data)
|
|
1094
|
+
total_bars_should_exist = total_bars_collected + total_bars_expected
|
|
1095
|
+
completeness_score = (
|
|
1096
|
+
total_bars_collected / total_bars_should_exist if total_bars_should_exist > 0 else 1.0
|
|
1097
|
+
)
|
|
1098
|
+
|
|
1099
|
+
return {
|
|
1100
|
+
"analysis_performed": True,
|
|
1101
|
+
"total_gaps_detected": len(gaps_detected),
|
|
1102
|
+
"gaps_filled": 0, # Will be updated during gap filling process
|
|
1103
|
+
"gaps_remaining": len(gaps_detected),
|
|
1104
|
+
"gap_details": gaps_detected[:10], # Limit to first 10 gaps for metadata size
|
|
1105
|
+
"total_missing_bars": total_bars_expected,
|
|
1106
|
+
"gap_filling_method": "authentic_binance_api",
|
|
1107
|
+
"data_completeness_score": round(completeness_score, 4),
|
|
1108
|
+
"analysis_timestamp": datetime.now(timezone.utc).isoformat() + "Z",
|
|
1109
|
+
"analysis_parameters": {
|
|
1110
|
+
"timeframe": timeframe,
|
|
1111
|
+
"expected_interval_minutes": expected_gap_minutes,
|
|
1112
|
+
"tolerance_factor": 1.5,
|
|
1113
|
+
},
|
|
1114
|
+
}
|
|
1115
|
+
|
|
1116
|
+
def _calculate_data_hash(self, data):
|
|
1117
|
+
"""Calculate hash of data for integrity verification."""
|
|
1118
|
+
data_string = "\n".join(",".join(map(str, row)) for row in data)
|
|
1119
|
+
return hashlib.sha256(data_string.encode()).hexdigest()
|
|
1120
|
+
|
|
1121
|
+
def save_data(self, timeframe: str, data: List[List], collection_stats: Dict[str, Any]) -> Path:
|
|
1122
|
+
"""Save data to file with format determined by output_format (CSV or Parquet)."""
|
|
1123
|
+
if not data:
|
|
1124
|
+
print(f"❌ No data to save for {timeframe}")
|
|
1125
|
+
return None
|
|
1126
|
+
|
|
1127
|
+
# Generate filename with appropriate extension
|
|
1128
|
+
start_date_str = self.start_date.strftime("%Y%m%d")
|
|
1129
|
+
end_date_str = datetime.strptime(data[-1][0], "%Y-%m-%d %H:%M:%S").strftime("%Y%m%d")
|
|
1130
|
+
version = "v2.10.0" # Updated version for Parquet support
|
|
1131
|
+
file_extension = self.output_format
|
|
1132
|
+
filename = f"binance_spot_{self.symbol}-{timeframe}_{start_date_str}-{end_date_str}_{version}.{file_extension}"
|
|
1133
|
+
filepath = self.output_dir / filename
|
|
1134
|
+
|
|
1135
|
+
# Ensure output directory exists
|
|
1136
|
+
self.output_dir.mkdir(parents=True, exist_ok=True)
|
|
1137
|
+
|
|
1138
|
+
# Perform gap analysis on collected data
|
|
1139
|
+
gap_analysis = self._perform_gap_analysis(data, timeframe)
|
|
1140
|
+
|
|
1141
|
+
# Generate metadata with gap analysis results
|
|
1142
|
+
metadata = self.generate_metadata(timeframe, data, collection_stats, gap_analysis)
|
|
1143
|
+
|
|
1144
|
+
# Convert data to DataFrame for both formats
|
|
1145
|
+
df = pd.DataFrame(
|
|
1146
|
+
data,
|
|
1147
|
+
columns=[
|
|
1148
|
+
"date",
|
|
1149
|
+
"open",
|
|
1150
|
+
"high",
|
|
1151
|
+
"low",
|
|
1152
|
+
"close",
|
|
1153
|
+
"volume",
|
|
1154
|
+
"close_time",
|
|
1155
|
+
"quote_asset_volume",
|
|
1156
|
+
"number_of_trades",
|
|
1157
|
+
"taker_buy_base_asset_volume",
|
|
1158
|
+
"taker_buy_quote_asset_volume",
|
|
1159
|
+
],
|
|
1160
|
+
)
|
|
1161
|
+
|
|
1162
|
+
# Convert date column to datetime
|
|
1163
|
+
df["date"] = pd.to_datetime(df["date"])
|
|
1164
|
+
|
|
1165
|
+
if self.output_format == "parquet":
|
|
1166
|
+
# Save as Parquet with metadata
|
|
1167
|
+
df.to_parquet(filepath, engine="pyarrow", compression="snappy", index=False)
|
|
1168
|
+
print(f"📊 Saved {len(df):,} bars to {filepath.name} (Parquet format)")
|
|
1169
|
+
else:
|
|
1170
|
+
# Save as CSV with metadata headers (existing logic)
|
|
1171
|
+
with open(filepath, "w", newline="") as f:
|
|
1172
|
+
# Write metadata headers
|
|
1173
|
+
f.write(f"# Binance Spot Market Data {metadata['version']}\n")
|
|
1174
|
+
f.write(f"# Generated: {metadata['generation_timestamp']}\n")
|
|
1175
|
+
f.write(f"# Source: {metadata['data_source']}\n")
|
|
1176
|
+
f.write(
|
|
1177
|
+
f"# Market: {metadata['market_type'].upper()} | Symbol: {metadata['symbol']} | Timeframe: {metadata['timeframe']}\n"
|
|
1178
|
+
)
|
|
1179
|
+
f.write(f"# Coverage: {metadata['actual_bars']:,} bars\n")
|
|
1180
|
+
f.write(
|
|
1181
|
+
f"# Period: {metadata['date_range']['start']} to {metadata['date_range']['end']}\n"
|
|
1182
|
+
)
|
|
1183
|
+
f.write(
|
|
1184
|
+
f"# Collection: {collection_stats['method']} in {collection_stats['duration']:.1f}s\n"
|
|
1185
|
+
)
|
|
1186
|
+
f.write(f"# Data Hash: {metadata['data_integrity']['data_hash'][:16]}...\n")
|
|
1187
|
+
f.write(
|
|
1188
|
+
"# Compliance: Zero-Magic-Numbers, Temporal-Integrity, Official-Binance-Source\n"
|
|
1189
|
+
)
|
|
1190
|
+
f.write("#\n")
|
|
1191
|
+
|
|
1192
|
+
# Write CSV data
|
|
1193
|
+
df.to_csv(f, index=False)
|
|
1194
|
+
print(f"📊 Saved {len(df):,} bars to {filepath.name} (CSV format)")
|
|
1195
|
+
|
|
1196
|
+
# Save metadata as JSON
|
|
1197
|
+
metadata_filepath = filepath.with_suffix(".metadata.json")
|
|
1198
|
+
with open(metadata_filepath, "w") as f:
|
|
1199
|
+
json.dump(metadata, f, indent=2)
|
|
1200
|
+
|
|
1201
|
+
file_size_mb = filepath.stat().st_size / (1024 * 1024)
|
|
1202
|
+
print(f"\n✅ Created: {filepath.name} ({file_size_mb:.1f} MB)")
|
|
1203
|
+
print(f"✅ Metadata: {metadata_filepath.name}")
|
|
1204
|
+
|
|
1205
|
+
return filepath
|
|
1206
|
+
|
|
1207
|
+
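    # ------------------------------------------------------------------
    # Editor's note - illustrative sketch, not part of the packaged module.
    # save_data() prefixes CSV output with "# ..." metadata header lines and
    # writes a sibling .metadata.json file, so a consumer has to skip the
    # comment lines when loading. The filename below is hypothetical but
    # follows the pattern generated above; Parquet output would use
    # pd.read_parquet() instead.
    #
    #     import json
    #     from pathlib import Path
    #     import pandas as pd
    #
    #     csv_path = Path("binance_spot_SOLUSDT-1h_20200815-20250320_v2.10.0.csv")
    #     df = pd.read_csv(csv_path, comment="#", parse_dates=["date", "close_time"])
    #     metadata = json.loads(csv_path.with_suffix(".metadata.json").read_text())
    #     print(len(df), metadata["data_integrity"]["data_hash"][:16])
    # ------------------------------------------------------------------
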
    def collect_multiple_timeframes(
        self, timeframes: Optional[List[str]] = None
    ) -> Dict[str, Path]:
        """Collect data for multiple timeframes with comprehensive progress tracking.

        Efficiently collects historical data across multiple timeframes in sequence,
        providing a complete dataset for multi-timeframe analysis. Each timeframe
        is processed independently with full validation and progress reporting.

        Args:
            timeframes (list, optional): List of timeframes to collect.
                Each must be one of: "1m", "3m", "5m", "15m", "30m", "1h", "2h", "4h".
                If None, defaults to ["1m", "3m", "5m", "15m", "30m", "1h", "2h"].

        Returns:
            dict: Mapping of each successfully collected timeframe string to the
                Path of the saved data file (CSV or Parquet, depending on
                output_format). Timeframes that fail to collect are omitted.

        Raises:
            ValueError: If any timeframe in the list is not supported.
            ConnectionError: If download from Binance repository fails.

        Examples:
            Default comprehensive collection:

            >>> collector = BinancePublicDataCollector(symbol="ETHUSDT")
            >>> results = collector.collect_multiple_timeframes()
            >>> for timeframe, filepath in results.items():
            ...     print(f"{timeframe}: saved to {filepath.name}")
            1m: saved to binance_spot_ETHUSDT-1m_20200815-20250320_v2.10.0.csv
            3m: saved to binance_spot_ETHUSDT-3m_20200815-20250320_v2.10.0.csv

            Custom timeframes for specific analysis:

            >>> collector = BinancePublicDataCollector(symbol="BTCUSDT")
            >>> results = collector.collect_multiple_timeframes(["1h", "4h"])
            >>> hourly_path = results["1h"]
            >>> four_hour_path = results["4h"]
            >>> print(f"Hourly file: {hourly_path.name}")
            >>> print(f"4-hour file: {four_hour_path.name}")

            Per-timeframe collection statistics come from collect_timeframe_data():

            >>> result = collector.collect_timeframe_data("1h")
            >>> print(f"Collection took {result['stats']['duration']:.1f} seconds")
            Collection took 45.2 seconds

        Note:
            Processing time scales with the number of timeframes and date range.
            Progress is displayed in real-time with Rich progress bars.
            All timeframes are collected sequentially to avoid overwhelming
            Binance's public data servers.
        """
        if timeframes is None:
            timeframes = ["1m", "3m", "5m", "15m", "30m", "1h", "2h"]

        print("\n🚀 BINANCE PUBLIC DATA ULTRA-FAST COLLECTION")
        print(f"Timeframes: {timeframes}")
        print("=" * 80)

        results = {}
        overall_start = datetime.now()

        for i, timeframe in enumerate(timeframes):
            print(f"Processing {timeframe} ({i + 1}/{len(timeframes)})...")

            result = self.collect_timeframe_data(timeframe)

            if result and result.get("filepath"):
                filepath = result["filepath"]
                results[timeframe] = filepath
                file_size_mb = filepath.stat().st_size / (1024 * 1024)
                print(f"✅ {timeframe}: {filepath.name} ({file_size_mb:.1f} MB)")
            else:
                print(f"❌ Failed to collect {timeframe} data")

        overall_duration = (datetime.now() - overall_start).total_seconds()

        print("\n" + "=" * 80)
        print("🎉 ULTRA-FAST COLLECTION COMPLETE")
        print(
            f"⏱️ Total time: {overall_duration:.1f} seconds ({overall_duration / 60:.1f} minutes)"
        )
        print(f"📊 Generated {len(results)} files")

        return results

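    # ------------------------------------------------------------------
    # Editor's note - illustrative sketch, not part of the packaged module.
    # collect_multiple_timeframes() returns a {timeframe: Path} mapping, so a
    # caller who wants DataFrames reloads each saved file (assuming the CSV
    # output format; comment="#" skips the metadata header lines written by
    # save_data()):
    #
    #     import pandas as pd
    #
    #     collector = BinancePublicDataCollector(symbol="ETHUSDT")
    #     results = collector.collect_multiple_timeframes(["1h", "4h"])
    #     frames = {tf: pd.read_csv(path, comment="#") for tf, path in results.items()}
    #     print({tf: len(df) for tf, df in frames.items()})
    # ------------------------------------------------------------------
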
    async def collect_timeframe_data_concurrent(self, trading_timeframe: str) -> Dict[str, Any]:
        """
        Collect data using high-performance concurrent hybrid strategy.

        This method uses the ConcurrentCollectionOrchestrator to achieve 10-15x faster
        data collection through parallel downloads of monthly and daily ZIP files.

        Args:
            trading_timeframe (str): Timeframe for data collection.
                Must be one of: "1m", "3m", "5m", "15m", "30m", "1h", "2h", "4h".

        Returns:
            dict: Collection results containing:
                - dataframe (pd.DataFrame): Complete OHLCV data with 11 columns
                - filepath (Path): Path to saved CSV file
                - stats (dict): Collection statistics including performance metrics
                - collection_method (str): "concurrent_hybrid"

        Examples:
            >>> collector = BinancePublicDataCollector(symbol="BTCUSDT")
            >>> result = await collector.collect_timeframe_data_concurrent("1h")
            >>> df = result["dataframe"]
            >>> print(f"Collected {len(df)} bars in {result['stats']['duration']:.1f}s")
            >>> print(f"Performance: {result['stats']['bars_per_second']:.0f} bars/sec")
            Collected 8760 bars in 12.3s
            Performance: 712 bars/sec

        Note:
            This is the recommended high-performance method for new applications.
            Falls back to synchronous method if async context is not available.
        """
        from .concurrent_collection_orchestrator import ConcurrentCollectionOrchestrator

        print(f"\n{'=' * 60}")
        print(f"CONCURRENT COLLECTION: {trading_timeframe.upper()} DATA")
        print(f"Strategy: Hybrid Monthly+Daily with {13} Concurrent Downloads")
        print(f"{'=' * 60}")

        if trading_timeframe not in self.available_timeframes:
            print(f"❌ Timeframe '{trading_timeframe}' not available")
            print(f"📊 Available timeframes: {', '.join(self.available_timeframes)}")
            return {"dataframe": pd.DataFrame(), "filepath": None, "stats": {}}

        try:
            # Initialize concurrent orchestrator
            orchestrator = ConcurrentCollectionOrchestrator(
                symbol=self.symbol,
                start_date=self.start_date,
                end_date=self.end_date,
                output_dir=self.output_dir,
                max_concurrent=13,
            )

            async with orchestrator:
                # Execute concurrent collection
                collection_result = await orchestrator.collect_timeframe_concurrent(
                    trading_timeframe, progress_callback=self._progress_callback
                )

                if not collection_result.success or not collection_result.processed_data:
                    print(f"❌ Concurrent collection failed for {trading_timeframe}")
                    if collection_result.errors:
                        for error in collection_result.errors:
                            print(f" Error: {error}")
                    return {"dataframe": pd.DataFrame(), "filepath": None, "stats": {}}

                # Process data using existing methods
                processed_data = collection_result.processed_data

                # Calculate performance stats
                bars_per_second = (
                    collection_result.total_bars / collection_result.collection_time
                    if collection_result.collection_time > 0
                    else 0
                )

                collection_stats = {
                    "method": "concurrent_hybrid",
                    "duration": collection_result.collection_time,
                    "bars_per_second": bars_per_second,
                    "total_bars": collection_result.total_bars,
                    "successful_downloads": collection_result.successful_downloads,
                    "failed_downloads": collection_result.failed_downloads,
                    "data_source_breakdown": collection_result.data_source_breakdown,
                    "concurrent_downloads": 13,
                    "strategy": "monthly_historical_daily_recent",
                }

                # Save to CSV using existing method
                filepath = self.save_data(trading_timeframe, processed_data, collection_stats)

                # Convert to DataFrame
                columns = [
                    "date",
                    "open",
                    "high",
                    "low",
                    "close",
                    "volume",
                    "close_time",
                    "quote_asset_volume",
                    "number_of_trades",
                    "taker_buy_base_asset_volume",
                    "taker_buy_quote_asset_volume",
                ]
                df = pd.DataFrame(processed_data, columns=columns)

                # Convert numeric columns
                numeric_cols = [
                    "open",
                    "high",
                    "low",
                    "close",
                    "volume",
                    "quote_asset_volume",
                    "number_of_trades",
                    "taker_buy_base_asset_volume",
                    "taker_buy_quote_asset_volume",
                ]
                for col in numeric_cols:
                    df[col] = pd.to_numeric(df[col], errors="coerce")

                # Convert date columns
                df["date"] = pd.to_datetime(df["date"])
                df["close_time"] = pd.to_datetime(df["close_time"])

                print("\n✅ CONCURRENT COLLECTION SUCCESS")
                print(f"📊 Collected: {len(df):,} bars")
                print(f"⚡ Performance: {bars_per_second:.0f} bars/sec")
                print(
                    f"🚀 Speed: {collection_result.collection_time:.1f}s vs ~{collection_result.collection_time * 10:.0f}s sequential"
                )
                print(
                    f"📁 Sources: {collection_result.data_source_breakdown['monthly']} monthly + {collection_result.data_source_breakdown['daily']} daily"
                )

                return {
                    "dataframe": df,
                    "filepath": filepath,
                    "stats": collection_stats,
                    "collection_method": "concurrent_hybrid",
                }

        except Exception as e:
            print(f"❌ Concurrent collection failed: {e}")
            print("⏮️ Falling back to synchronous method...")
            # Fallback to synchronous method
            return self.collect_timeframe_data(trading_timeframe)

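    # ------------------------------------------------------------------
    # Editor's note - illustrative sketch, not part of the packaged module.
    # collect_timeframe_data_concurrent() is a coroutine, so a synchronous
    # caller would typically drive it with asyncio.run(), roughly like this:
    #
    #     import asyncio
    #
    #     collector = BinancePublicDataCollector(symbol="BTCUSDT")
    #     result = asyncio.run(collector.collect_timeframe_data_concurrent("1h"))
    #     print(f"{len(result['dataframe'])} bars in {result['stats']['duration']:.1f}s")
    # ------------------------------------------------------------------
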
    async def collect_multiple_timeframes_concurrent(
        self, timeframes: Optional[List[str]] = None
    ) -> Dict[str, Path]:
        """
        Collect multiple timeframes using concurrent hybrid strategy.

        High-performance collection across multiple timeframes with optimal
        resource utilization and parallel processing.

        Args:
            timeframes (list, optional): List of timeframes to collect.
                If None, defaults to ["1m", "3m", "5m", "15m", "30m", "1h", "2h"].

        Returns:
            dict: Mapping of each successfully collected timeframe string to the
                Path of the saved data file. Per-timeframe performance metrics
                are printed during collection.

        Examples:
            >>> collector = BinancePublicDataCollector(symbol="ETHUSDT")
            >>> results = await collector.collect_multiple_timeframes_concurrent(["1h", "4h"])
            >>> for timeframe, filepath in results.items():
            ...     print(f"{timeframe}: saved to {filepath.name}")
            1h: saved to binance_spot_ETHUSDT-1h_20200815-20250320_v2.10.0.csv
            4h: saved to binance_spot_ETHUSDT-4h_20200815-20250320_v2.10.0.csv

        Note:
            This method processes timeframes sequentially to avoid overwhelming
            servers, but each timeframe uses full concurrent downloading.
        """
        from .concurrent_collection_orchestrator import ConcurrentCollectionOrchestrator

        if timeframes is None:
            timeframes = ["1m", "3m", "5m", "15m", "30m", "1h", "2h"]

        print("\n🚀 CONCURRENT MULTI-TIMEFRAME COLLECTION")
        print(f"Strategy: Hybrid Monthly+Daily with {13} Concurrent Downloads")
        print(f"Timeframes: {timeframes}")
        print("=" * 80)

        results = {}
        overall_start = datetime.now()

        try:
            # Initialize concurrent orchestrator
            orchestrator = ConcurrentCollectionOrchestrator(
                symbol=self.symbol,
                start_date=self.start_date,
                end_date=self.end_date,
                output_dir=self.output_dir,
                max_concurrent=13,
            )

            async with orchestrator:
                # Process each timeframe with concurrent downloads
                for i, timeframe in enumerate(timeframes):
                    print(f"\n📊 Processing {timeframe} ({i + 1}/{len(timeframes)})...")

                    result = await self.collect_timeframe_data_concurrent(timeframe)

                    if result and result.get("filepath"):
                        filepath = result["filepath"]
                        results[timeframe] = filepath
                        file_size_mb = filepath.stat().st_size / (1024 * 1024)
                        bars_per_sec = result["stats"]["bars_per_second"]
                        print(
                            f"✅ {timeframe}: {filepath.name} ({file_size_mb:.1f} MB, {bars_per_sec:.0f} bars/sec)"
                        )
                    else:
                        print(f"❌ Failed to collect {timeframe} data")

        except Exception as e:
            print(f"❌ Concurrent collection failed: {e}")
            print("⏮️ Falling back to synchronous method...")
            # Fallback to synchronous method
            return self.collect_multiple_timeframes(timeframes)

        overall_duration = (datetime.now() - overall_start).total_seconds()

        print("\n" + "=" * 80)
        print("🎉 CONCURRENT MULTI-TIMEFRAME COLLECTION COMPLETE")
        print(
            f"⏱️ Total time: {overall_duration:.1f} seconds ({overall_duration / 60:.1f} minutes)"
        )
        print(f"📊 Generated {len(results)} datasets")
        print("🚀 Average speedup: ~10-15x faster than sequential downloads")

        return results

    def _progress_callback(self, completed: int, total: int, current_task):
        """Progress callback for concurrent downloads."""
        if completed % 5 == 0 or completed == total:  # Report every 5 downloads or at completion
            percentage = (completed / total) * 100
            source_type = current_task.source_type.value
            print(
                f" 📥 Progress: {completed}/{total} ({percentage:.1f}%) - {source_type}: {current_task.filename}"
            )

    def validate_csv_file(
        self, csv_filepath: Union[str, Path], expected_timeframe: Optional[str] = None
    ) -> Dict[str, Any]:
        """
        Comprehensive validation of CSV file data integrity, completeness, and quality.

        Delegates to CSVValidator for multi-layer validation including structure checking,
        datetime sequence validation, OHLCV quality analysis, coverage calculation, and
        statistical anomaly detection.

        Args:
            csv_filepath: Path to CSV file to validate
            expected_timeframe: Expected timeframe (e.g., '30m') for interval validation

        Returns:
            dict: Validation results with detailed analysis

        Note:
            This method delegates to the validation.csv_validator.CSVValidator class
            for complete validation logic. See CSVValidator for implementation details.
        """
        validator = CSVValidator()
        return validator.validate_csv_file(csv_filepath, expected_timeframe)

    def update_metadata_with_validation(self, csv_filepath, validation_results):
        """Update metadata JSON file with validation results."""
        metadata_filepath = csv_filepath.with_suffix(".metadata.json")

        if metadata_filepath.exists():
            with open(metadata_filepath, "r") as f:
                metadata = json.load(f)
        else:
            metadata = {}

        # Add validation results to metadata
        metadata["validation"] = validation_results

        # Update compliance status based on validation
        compliance = metadata.get("compliance", {})
        if validation_results["total_errors"] == 0:
            compliance["data_validation_passed"] = True
            compliance["validation_summary"] = validation_results["validation_summary"]
        else:
            compliance["data_validation_passed"] = False
            compliance["validation_summary"] = validation_results["validation_summary"]
            compliance["validation_errors"] = validation_results["total_errors"]
            compliance["validation_warnings"] = validation_results["total_warnings"]

        metadata["compliance"] = compliance

        # Save updated metadata with JSON serialization fix
        def convert_numpy_types(obj):
            """Convert numpy types to Python native types for JSON serialization."""
            if hasattr(obj, "item"):
                return obj.item()
            elif isinstance(obj, dict):
                return {key: convert_numpy_types(value) for key, value in obj.items()}
            elif isinstance(obj, list):
                return [convert_numpy_types(item) for item in obj]
            else:
                return obj

        with open(metadata_filepath, "w") as f:
            json.dump(convert_numpy_types(metadata), f, indent=2)

        print(f"✅ Updated metadata: {metadata_filepath.name}")

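    # ------------------------------------------------------------------
    # Editor's note - illustrative sketch, not part of the packaged module.
    # json.dump() rejects numpy scalars (e.g. numpy.int64 values produced by
    # pandas aggregations), which is why convert_numpy_types() above calls
    # .item() before serializing. A minimal standalone demonstration:
    #
    #     import json
    #     import numpy as np
    #
    #     stats = {"total_errors": np.int64(0), "coverage_pct": np.float64(99.97)}
    #     # json.dumps(stats) raises TypeError: Object of type int64 is not JSON serializable
    #     clean = {k: (v.item() if hasattr(v, "item") else v) for k, v in stats.items()}
    #     print(json.dumps(clean))  # {"total_errors": 0, "coverage_pct": 99.97}
    # ------------------------------------------------------------------
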
    def apply_gap_filling_to_validated_files(self):
        """Apply comprehensive gap filling to validated data files using authentic Binance API data"""

        try:
            print("\n🔧 INTEGRATED GAP FILLING SYSTEM")
            print("Primary Source: Binance REST API (Authentic Data Only)")
            print("=" * 60)

            # Initialize gap filling components
            gap_filler = UniversalGapFiller()

            # Find CSV files to check for gaps
            csv_files = list(Path(self.output_dir).glob("*.csv"))

            if not csv_files:
                print("❌ No CSV files found for gap filling")
                return

            # Filter to only files for this symbol
            symbol_files = [f for f in csv_files if self.symbol in f.name]

            if not symbol_files:
                print(f"❌ No CSV files found for symbol {self.symbol}")
                return

            print(f"🔍 Analyzing {len(symbol_files)} files for gaps...")

            total_gaps_detected = 0
            total_gaps_filled = 0
            total_gaps_failed = 0
            files_processed = 0
            results = []

            for csv_file in symbol_files:
                print(f"\n📁 Processing: {csv_file.name}")

                # Extract timeframe from filename
                file_timeframe = self._extract_timeframe_from_filename(csv_file.name)
                print(f" 📊 Detected timeframe: {file_timeframe}")

                # Use the proper UniversalGapFiller process_file method
                result = gap_filler.process_file(csv_file, file_timeframe)
                results.append(result)
                files_processed += 1

                # Update totals
                total_gaps_detected += result["gaps_detected"]
                total_gaps_filled += result["gaps_filled"]
                total_gaps_failed += result["gaps_failed"]

                # Report per-file results
                if result["gaps_detected"] == 0:
                    print(f" ✅ No gaps found in {file_timeframe}")
                else:
                    success_rate = result["success_rate"]
                    status = "✅" if success_rate == 100.0 else "⚠️" if success_rate > 0 else "❌"
                    print(
                        f" {status} {result['gaps_filled']}/{result['gaps_detected']} gaps filled ({success_rate:.1f}%)"
                    )

            # Comprehensive summary
            print("\n" + "=" * 60)
            print("📊 GAP FILLING SUMMARY")
            print("=" * 60)

            for result in results:
                if result["gaps_detected"] > 0:
                    status = (
                        "✅"
                        if result["success_rate"] == 100.0
                        else "⚠️"
                        if result["success_rate"] > 0
                        else "❌"
                    )
                    print(
                        f"{status} {result['timeframe']:>3}: {result['gaps_filled']:>2}/{result['gaps_detected']:>2} gaps filled ({result['success_rate']:>5.1f}%)"
                    )

            print("-" * 60)
            overall_success = (
                (total_gaps_filled / total_gaps_detected * 100)
                if total_gaps_detected > 0
                else 100.0
            )
            print(
                f"🎯 OVERALL: {total_gaps_filled}/{total_gaps_detected} gaps filled ({overall_success:.1f}%)"
            )

            if overall_success == 100.0:
                print("🎉 ALL GAPS FILLED SUCCESSFULLY!")
                print("✅ Datasets are now 100% gapless and ready for production use")
            else:
                print(
                    f"⚠️ {total_gaps_failed} gaps failed to fill (may be legitimate exchange outages)"
                )
                print("📋 Review failed gaps to confirm they are legitimate market closures")

            print(f"\nFiles processed: {files_processed}")
            print("Data source: Authentic Binance REST API")
            print("Gap filling protocol: API-first validation (no synthetic data)")

        except Exception as e:
            print(f"❌ Gap filling error: {e}")
            print("⚠️ Continuing without gap filling...")
            import traceback

            traceback.print_exc()

    def _extract_timeframe_from_filename(self, filename):
        """Extract timeframe from filename (e.g., 'SOLUSDT-15m-data.csv' -> '15m')"""
        for tf in [
            "1s",
            "1m",
            "3m",
            "5m",
            "15m",
            "30m",
            "1h",
            "2h",
            "4h",
            "6h",
            "8h",
            "12h",
            "1d",
            "3d",
            "1w",
            "1mo",
        ]:
            if f"-{tf}_" in filename or f"-{tf}-" in filename:
                return tf
        return "15m"  # Default


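# Editor's note - illustrative, not part of the packaged module. The helper above
# matches "-{tf}_" or "-{tf}-" against filenames like the ones save_data() emits,
# e.g. (hypothetical filename):
#
#     collector = BinancePublicDataCollector(symbol="SOLUSDT")
#     tf = collector._extract_timeframe_from_filename(
#         "binance_spot_SOLUSDT-15m_20200815-20250320_v2.10.0.csv"
#     )
#     print(tf)  # "15m"
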
def _setup_argument_parser() -> argparse.ArgumentParser:
    """Create and configure CLI argument parser.

    Returns:
        Configured ArgumentParser with all CLI options
    """
    parser = argparse.ArgumentParser(
        description="Ultra-fast Binance spot data collector with validation"
    )
    parser.add_argument(
        "--symbol", default="SOLUSDT", help="Trading pair symbol (default: SOLUSDT)"
    )
    parser.add_argument(
        "--timeframes",
        default="1m,3m,5m,15m,30m,1h,2h",
        help="Comma-separated timeframes (default: 1m,3m,5m,15m,30m,1h,2h)",
    )
    parser.add_argument(
        "--start", default="2020-08-15", help="Start date YYYY-MM-DD (default: 2020-08-15)"
    )
    parser.add_argument(
        "--end", default="2025-03-20", help="End date YYYY-MM-DD (default: 2025-03-20)"
    )
    parser.add_argument(
        "--validate-only",
        action="store_true",
        help="Only validate existing CSV files, do not collect new data",
    )
    parser.add_argument(
        "--validate-files", nargs="+", help="Specific CSV files to validate (with --validate-only)"
    )
    parser.add_argument(
        "--no-validation",
        action="store_true",
        help="Skip validation after collection (not recommended)",
    )
    return parser


def _discover_files_to_validate(args, collector) -> List[Path]:
    """Discover CSV files to validate based on arguments.

    Args:
        args: Parsed command line arguments
        collector: BinancePublicDataCollector instance

    Returns:
        List of Path objects for files to validate
    """
    if args.validate_files:
        return [Path(f) for f in args.validate_files]
    else:
        pattern = f"*{args.symbol}*.csv"
        return list(collector.output_dir.glob(pattern))


def _validate_files(collector, files_to_validate: List[Path]) -> List[Dict]:
    """Validate list of CSV files.

    Args:
        collector: BinancePublicDataCollector instance
        files_to_validate: List of file paths to validate

    Returns:
        List of validation summary dictionaries
    """
    validation_summary = []
    for csv_file in files_to_validate:
        # Extract timeframe from filename
        timeframe = None
        for tf in ["1m", "3m", "5m", "15m", "30m", "1h", "2h", "4h", "1d"]:
            if f"-{tf}_" in csv_file.name:
                timeframe = tf
                break

        # Validate file
        validation_result = collector.validate_csv_file(csv_file, timeframe)
        collector.update_metadata_with_validation(csv_file, validation_result)

        validation_summary.append(
            {
                "file": csv_file.name,
                "status": validation_result["validation_summary"],
                "errors": validation_result["total_errors"],
                "warnings": validation_result["total_warnings"],
            }
        )

    return validation_summary


def _print_validation_summary(validation_summary: List[Dict]) -> int:
    """Print validation summary and return exit code.

    Args:
        validation_summary: List of validation result dictionaries

    Returns:
        Exit code (0 for success, 1 for failures)
    """
    print("\n" + "=" * 80)
    print("VALIDATION SUMMARY")
    print("=" * 80)

    perfect_files = 0
    good_files = 0
    failed_files = 0

    for summary in validation_summary:
        if summary["errors"] == 0:
            if summary["warnings"] == 0:
                status_icon = "✅"
                perfect_files += 1
            else:
                status_icon = "⚠️ "
                good_files += 1
        else:
            status_icon = "❌"
            failed_files += 1

        print(f"{status_icon} {summary['file']}: {summary['status']}")
        if summary["errors"] > 0 or summary["warnings"] > 0:
            print(f" └─ {summary['errors']} errors, {summary['warnings']} warnings")

    print("\nOVERALL RESULTS:")
    print(f" ✅ Perfect: {perfect_files} files")
    print(f" ⚠️ Good: {good_files} files")
    print(f" ❌ Failed: {failed_files} files")

    if failed_files == 0:
        print("\n🎉 ALL VALIDATIONS PASSED!")
        return 0
    else:
        print(f"\n⚠️ {failed_files} files failed validation")
        return 1


def _run_validation_only_mode(args, collector) -> int:
    """Execute validation-only mode workflow.

    Args:
        args: Parsed command line arguments
        collector: BinancePublicDataCollector instance

    Returns:
        Exit code (0 for success, 1 for failures)
    """
    print("🔍 VALIDATION-ONLY MODE")

    files_to_validate = _discover_files_to_validate(args, collector)

    if not files_to_validate:
        print("❌ No CSV files found to validate")
        return 1

    print(f"Found {len(files_to_validate)} files to validate:")
    for file_path in files_to_validate:
        print(f" 📄 {file_path.name}")

    validation_summary = _validate_files(collector, files_to_validate)
    return _print_validation_summary(validation_summary)


def _auto_validate_collected_files(collector, results: Dict) -> bool:
    """Perform auto-validation on collected files.

    Args:
        collector: BinancePublicDataCollector instance
        results: Collection results dictionary

    Returns:
        True if all validations passed, False otherwise
    """
    print("\n🔍 AUTO-VALIDATION AFTER COLLECTION")
    validation_passed = 0
    validation_failed = 0

    for timeframe, csv_file in results.items():
        validation_result = collector.validate_csv_file(csv_file, timeframe)
        collector.update_metadata_with_validation(csv_file, validation_result)

        if validation_result["total_errors"] == 0:
            validation_passed += 1
        else:
            validation_failed += 1

    print(f"\nVALIDATION RESULTS: {validation_passed} passed, {validation_failed} failed")

    if validation_failed == 0:
        print("🎉 ALL FILES VALIDATED SUCCESSFULLY!")
        print("Ready for ML training, backtesting, and production use")
        collector.apply_gap_filling_to_validated_files()
        return True
    else:
        print("⚠️ Some files failed validation - check errors above")
        return False


def _run_collection_mode(args, collector) -> int:
    """Execute data collection mode workflow.

    Args:
        args: Parsed command line arguments
        collector: BinancePublicDataCollector instance

    Returns:
        Exit code (0 for success, 1 for failure)
    """
    timeframes = [tf.strip() for tf in args.timeframes.split(",")]
    print(f"Collecting timeframes: {timeframes}")

    results = collector.collect_multiple_timeframes(timeframes)

    if not results:
        print("❌ Collection failed")
        return 1

    print(f"\n🚀 ULTRA-FAST COLLECTION SUCCESS: Generated {len(results)} datasets")

    if not args.no_validation:
        _auto_validate_collected_files(collector, results)

    return 0


def main():
    """Main execution function with CLI argument support."""
    parser = _setup_argument_parser()
    args = parser.parse_args()

    print("Binance Public Data Ultra-Fast Collector with Validation")
    print("Official Binance data repository - 10-100x faster than API")
    print("=" * 80)

    collector = BinancePublicDataCollector(
        symbol=args.symbol, start_date=args.start, end_date=args.end
    )

    if args.validate_only:
        return _run_validation_only_mode(args, collector)
    else:
        return _run_collection_mode(args, collector)


if __name__ == "__main__":
    exit(main())