gapless-crypto-clickhouse 7.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gapless_crypto_clickhouse/__init__.py +147 -0
- gapless_crypto_clickhouse/__probe__.py +349 -0
- gapless_crypto_clickhouse/api.py +1032 -0
- gapless_crypto_clickhouse/clickhouse/__init__.py +17 -0
- gapless_crypto_clickhouse/clickhouse/config.py +119 -0
- gapless_crypto_clickhouse/clickhouse/connection.py +269 -0
- gapless_crypto_clickhouse/clickhouse/schema.sql +98 -0
- gapless_crypto_clickhouse/clickhouse/schema_validator.py +312 -0
- gapless_crypto_clickhouse/clickhouse_query.py +642 -0
- gapless_crypto_clickhouse/collectors/__init__.py +21 -0
- gapless_crypto_clickhouse/collectors/binance_public_data_collector.py +1994 -0
- gapless_crypto_clickhouse/collectors/clickhouse_bulk_loader.py +446 -0
- gapless_crypto_clickhouse/collectors/concurrent_collection_orchestrator.py +407 -0
- gapless_crypto_clickhouse/collectors/csv_format_detector.py +123 -0
- gapless_crypto_clickhouse/collectors/httpx_downloader.py +395 -0
- gapless_crypto_clickhouse/collectors/hybrid_url_generator.py +316 -0
- gapless_crypto_clickhouse/exceptions.py +145 -0
- gapless_crypto_clickhouse/gap_filling/__init__.py +1 -0
- gapless_crypto_clickhouse/gap_filling/safe_file_operations.py +439 -0
- gapless_crypto_clickhouse/gap_filling/universal_gap_filler.py +757 -0
- gapless_crypto_clickhouse/llms.txt +268 -0
- gapless_crypto_clickhouse/probe.py +235 -0
- gapless_crypto_clickhouse/py.typed +0 -0
- gapless_crypto_clickhouse/query_api.py +374 -0
- gapless_crypto_clickhouse/resume/__init__.py +12 -0
- gapless_crypto_clickhouse/resume/intelligent_checkpointing.py +383 -0
- gapless_crypto_clickhouse/utils/__init__.py +29 -0
- gapless_crypto_clickhouse/utils/error_handling.py +202 -0
- gapless_crypto_clickhouse/utils/etag_cache.py +194 -0
- gapless_crypto_clickhouse/utils/timeframe_constants.py +90 -0
- gapless_crypto_clickhouse/utils/timestamp_format_analyzer.py +256 -0
- gapless_crypto_clickhouse/utils/timestamp_utils.py +130 -0
- gapless_crypto_clickhouse/validation/__init__.py +36 -0
- gapless_crypto_clickhouse/validation/csv_validator.py +677 -0
- gapless_crypto_clickhouse/validation/models.py +220 -0
- gapless_crypto_clickhouse/validation/storage.py +502 -0
- gapless_crypto_clickhouse-7.1.0.dist-info/METADATA +1277 -0
- gapless_crypto_clickhouse-7.1.0.dist-info/RECORD +40 -0
- gapless_crypto_clickhouse-7.1.0.dist-info/WHEEL +4 -0
- gapless_crypto_clickhouse-7.1.0.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,1032 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Convenience API functions for gapless-crypto-data
|
|
4
|
+
|
|
5
|
+
Provides function-based API following financial data library conventions.
|
|
6
|
+
Simple and intuitive data collection returning standard pandas DataFrames.
|
|
7
|
+
|
|
8
|
+
Exception-only failure principles - all errors raise exceptions.
|
|
9
|
+
|
|
10
|
+
Examples:
|
|
11
|
+
import gapless_crypto_clickhouse as gcd
|
|
12
|
+
|
|
13
|
+
# Simple data fetching
|
|
14
|
+
df = gcd.fetch_data("BTCUSDT", "1h", limit=1000)
|
|
15
|
+
|
|
16
|
+
# Get available symbols and timeframes
|
|
17
|
+
symbols = gcd.get_supported_symbols()
|
|
18
|
+
intervals = gcd.get_supported_timeframes()
|
|
19
|
+
|
|
20
|
+
# Download with date range
|
|
21
|
+
df = gcd.download("ETHUSDT", "4h", start="2024-01-01", end="2024-06-30")
|
|
22
|
+
"""
|
|
23
|
+
|
|
24
|
+
from datetime import datetime, timedelta
|
|
25
|
+
from pathlib import Path
|
|
26
|
+
from typing import List, Literal, Optional, Union
|
|
27
|
+
|
|
28
|
+
import pandas as pd
|
|
29
|
+
|
|
30
|
+
from .collectors.binance_public_data_collector import BinancePublicDataCollector
|
|
31
|
+
from .gap_filling.universal_gap_filler import UniversalGapFiller
|
|
32
|
+
|
|
33
|
+
# Instrument type support (ADR-0021)
|
|
34
|
+
InstrumentType = Literal["spot", "futures-um"]
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def get_supported_symbols(instrument_type: InstrumentType = "spot") -> List[str]:
    """Return the validated trading-pair symbols for *instrument_type*.

    Both "spot" and "futures-um" yield the identical 713-symbol perpetual
    list (Binance markets are aligned; see ADR-0022 for the rationale).
    The symbol universe comes from the binance-futures-availability
    package, which re-validates it daily via S3 Vision probes (95%+ SLA).

    Args:
        instrument_type: Either "spot" or "futures-um". Defaults to "spot";
            the parameter is retained for API compatibility only.

    Returns:
        List of 713 supported perpetual symbols (same for both types).

    Raises:
        ValueError: If instrument_type is neither "spot" nor "futures-um".

    Examples:
        >>> symbols = get_supported_symbols()
        >>> print(f"Found {len(symbols)} spot symbols")
        Found 713 spot symbols

        >>> get_supported_symbols("spot") == get_supported_symbols("futures-um")
        True

        >>> print(f"Bitcoin supported: {'BTCUSDT' in symbols}")
        Bitcoin supported: True
    """
    from binance_futures_availability.config.symbol_loader import load_symbols

    # Reject unknown instrument types before touching the loader.
    _validate_instrument_type(instrument_type)

    # ADR-0022: spot and futures share one 713-symbol perpetual universe.
    return load_symbols("perpetual")
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def get_supported_timeframes() -> List[str]:
    """Return every timeframe interval the collector can download.

    Returns:
        Timeframe strings from finest to coarsest
        (e.g. ['1s', '1m', ..., '1d', '3d', '1w', '1mo']).

    Examples:
        >>> timeframes = get_supported_timeframes()
        >>> '1h' in timeframes
        True
    """
    # The collector owns the canonical interval list; simply expose it.
    return BinancePublicDataCollector().available_timeframes
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
# DEPRECATED in v4.1.0: SupportedSymbol type alias removed (ADR-0022)
|
|
104
|
+
# Reason: 713-symbol Literal exceeds practical type checker limits
|
|
105
|
+
# Migration: Use `str` for symbol parameters, validate via get_supported_symbols()
|
|
106
|
+
#
|
|
107
|
+
# Before (v4.0.0):
|
|
108
|
+
# def my_function(symbol: SupportedSymbol) -> None: ...
|
|
109
|
+
#
|
|
110
|
+
# After (v4.1.0):
|
|
111
|
+
# def my_function(symbol: str) -> None:
|
|
112
|
+
# if symbol not in get_supported_symbols():
|
|
113
|
+
# raise ValueError(f"Unsupported symbol: {symbol}")
|
|
114
|
+
#
|
|
115
|
+
# Note: Spot and futures now both support 713 symbols (up from 20 spot symbols)
|
|
116
|
+
|
|
117
|
+
# CCXT-style timeframe literals accepted by the function-based API.
# NOTE(review): get_supported_timeframes()'s doctest additionally lists
# "3d", "1w" and "1mo", but this Literal stops at "1d" — confirm whether
# the omission is intentional before widening the alias.
SupportedTimeframe = Literal[
    "1s", "1m", "3m", "5m", "15m", "30m", "1h", "2h", "4h", "6h", "8h", "12h", "1d"
]
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def _validate_instrument_type(instrument_type: str) -> None:
|
|
123
|
+
"""Validate instrument_type parameter.
|
|
124
|
+
|
|
125
|
+
Args:
|
|
126
|
+
instrument_type: Instrument type to validate
|
|
127
|
+
|
|
128
|
+
Raises:
|
|
129
|
+
ValueError: If instrument_type is not supported
|
|
130
|
+
|
|
131
|
+
Examples:
|
|
132
|
+
>>> _validate_instrument_type("spot") # Valid
|
|
133
|
+
>>> _validate_instrument_type("futures-um") # Valid
|
|
134
|
+
>>> _validate_instrument_type("futures") # Invalid
|
|
135
|
+
Traceback (most recent call last):
|
|
136
|
+
...
|
|
137
|
+
ValueError: Invalid instrument_type 'futures'. Must be 'spot' or 'futures-um'
|
|
138
|
+
"""
|
|
139
|
+
valid_types = {"spot", "futures-um"}
|
|
140
|
+
if instrument_type not in valid_types:
|
|
141
|
+
raise ValueError(
|
|
142
|
+
f"Invalid instrument_type '{instrument_type}'. "
|
|
143
|
+
f"Must be one of: {', '.join(sorted(valid_types))}. "
|
|
144
|
+
f"Use 'futures-um' for USDT-margined perpetual futures (713 symbols). "
|
|
145
|
+
f"See get_supported_symbols(instrument_type='{instrument_type}') for available symbols."
|
|
146
|
+
)
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
def _validate_timeframe_parameters(
    timeframe: Optional[Union[str, SupportedTimeframe]],
    interval: Optional[Union[str, SupportedTimeframe]],
) -> str:
    """Resolve the dual timeframe/interval parameters into a single value.

    Exactly one of the two must be provided: ``timeframe`` is the
    CCXT-compatible spelling, ``interval`` the legacy alias.

    Args:
        timeframe: Preferred timeframe parameter.
        interval: Legacy interval parameter.

    Returns:
        The resolved timeframe string.

    Raises:
        ValueError: If neither or both parameters were supplied.
    """
    provided = [value for value in (timeframe, interval) if value is not None]

    if not provided:
        raise ValueError(
            "Must specify 'timeframe' parameter. "
            "CCXT-compatible 'timeframe' is preferred over legacy 'interval'."
        )

    if len(provided) == 2:
        raise ValueError(
            "Cannot specify both 'timeframe' and 'interval' parameters. "
            "Use 'timeframe' (CCXT-compatible) or 'interval' (legacy), not both."
        )

    # Exactly one value remains — timeframe when given, otherwise interval.
    return provided[0]
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
def _validate_index_type_parameter(index_type: Optional[str]) -> None:
|
|
183
|
+
"""Validate deprecated index_type parameter.
|
|
184
|
+
|
|
185
|
+
Args:
|
|
186
|
+
index_type: Deprecated index_type parameter
|
|
187
|
+
|
|
188
|
+
Raises:
|
|
189
|
+
ValueError: If index_type is invalid
|
|
190
|
+
"""
|
|
191
|
+
if index_type is None:
|
|
192
|
+
return
|
|
193
|
+
|
|
194
|
+
import warnings
|
|
195
|
+
|
|
196
|
+
warnings.warn(
|
|
197
|
+
"The 'index_type' parameter is deprecated and will be removed in v3.0.0. "
|
|
198
|
+
"Use standard pandas operations on the returned DataFrame instead.",
|
|
199
|
+
DeprecationWarning,
|
|
200
|
+
stacklevel=3,
|
|
201
|
+
)
|
|
202
|
+
|
|
203
|
+
# Validate deprecated parameter for backward compatibility
|
|
204
|
+
valid_index_types = {"datetime", "range", "auto"}
|
|
205
|
+
if index_type not in valid_index_types:
|
|
206
|
+
raise ValueError(
|
|
207
|
+
f"Invalid index_type '{index_type}'. "
|
|
208
|
+
f"Must be one of: {', '.join(sorted(valid_index_types))}"
|
|
209
|
+
)
|
|
210
|
+
|
|
211
|
+
|
|
212
|
+
def _validate_symbol(symbol: str, instrument_type: str = "spot") -> None:
    """Check that *symbol* is in the supported list for *instrument_type*.

    Args:
        symbol: Trading pair symbol to validate.
        instrument_type: Instrument type context for validation.

    Raises:
        ValueError: If the symbol is unknown; the message suggests a close
            match when one exists.
    """
    from gapless_crypto_clickhouse import get_supported_symbols

    supported = get_supported_symbols(instrument_type=instrument_type)

    if symbol in supported:
        return

    # Cheap "did you mean" heuristic: any supported symbol sharing the
    # first three characters is offered as a candidate.
    prefix = symbol.upper()[:3]
    candidates = [s for s in supported if s.startswith(prefix)]

    if candidates:
        raise ValueError(
            f"Invalid symbol '{symbol}' for instrument_type='{instrument_type}'. "
            f"Did you mean '{candidates[0]}'? "
            f"Supported {instrument_type} symbols: {', '.join(supported[:5])}, ... "
            f"(see get_supported_symbols(instrument_type='{instrument_type}') for full list)"
        )

    raise ValueError(
        f"Invalid symbol '{symbol}' for instrument_type='{instrument_type}'. "
        f"Supported {instrument_type} symbols: {', '.join(supported[:10])}, ... "
        f"(see get_supported_symbols(instrument_type='{instrument_type}') for full list of {len(supported)} symbols)"
    )
|
|
244
|
+
|
|
245
|
+
|
|
246
|
+
def _validate_timeframe_value(timeframe: str) -> None:
    """Check *timeframe* against the collector's supported intervals.

    Args:
        timeframe: Timeframe interval to validate.

    Raises:
        ValueError: If the timeframe is not supported.
    """
    from gapless_crypto_clickhouse import get_supported_timeframes

    known = get_supported_timeframes()

    if timeframe in known:
        return

    raise ValueError(
        f"Invalid timeframe '{timeframe}'. "
        f"Supported timeframes: {', '.join(known)} "
        f"(see get_supported_timeframes() for details)"
    )
|
|
265
|
+
|
|
266
|
+
|
|
267
|
+
def _validate_date_format(date_str: Optional[str], param_name: str) -> None:
|
|
268
|
+
"""Validate date string format (YYYY-MM-DD).
|
|
269
|
+
|
|
270
|
+
Args:
|
|
271
|
+
date_str: Date string to validate
|
|
272
|
+
param_name: Parameter name for error context
|
|
273
|
+
|
|
274
|
+
Raises:
|
|
275
|
+
ValueError: If date format is invalid
|
|
276
|
+
"""
|
|
277
|
+
if date_str is None:
|
|
278
|
+
return
|
|
279
|
+
|
|
280
|
+
import re
|
|
281
|
+
|
|
282
|
+
# Check YYYY-MM-DD format
|
|
283
|
+
if not re.match(r"^\d{4}-\d{2}-\d{2}$", date_str):
|
|
284
|
+
raise ValueError(
|
|
285
|
+
f"Invalid {param_name} format '{date_str}'. "
|
|
286
|
+
f"Expected format: YYYY-MM-DD (e.g., '2024-01-01')"
|
|
287
|
+
)
|
|
288
|
+
|
|
289
|
+
# Validate date is parseable
|
|
290
|
+
try:
|
|
291
|
+
datetime.strptime(date_str, "%Y-%m-%d")
|
|
292
|
+
except ValueError as e:
|
|
293
|
+
raise ValueError(f"Invalid {param_name} date '{date_str}': {str(e)}") from e
|
|
294
|
+
|
|
295
|
+
|
|
296
|
+
def _calculate_date_range_from_limit(
|
|
297
|
+
limit: Optional[int],
|
|
298
|
+
period: str,
|
|
299
|
+
start: Optional[str],
|
|
300
|
+
end: Optional[str],
|
|
301
|
+
) -> tuple[str, str]:
|
|
302
|
+
"""Calculate date range from limit parameter.
|
|
303
|
+
|
|
304
|
+
Args:
|
|
305
|
+
limit: Maximum number of bars to return
|
|
306
|
+
period: Timeframe interval
|
|
307
|
+
start: Existing start date
|
|
308
|
+
end: Existing end date
|
|
309
|
+
|
|
310
|
+
Returns:
|
|
311
|
+
Tuple of (start_date, end_date) strings
|
|
312
|
+
"""
|
|
313
|
+
# If start/end already specified, use them
|
|
314
|
+
if start or end:
|
|
315
|
+
return start, end
|
|
316
|
+
|
|
317
|
+
# If no limit, return as-is
|
|
318
|
+
if not limit:
|
|
319
|
+
return start, end
|
|
320
|
+
|
|
321
|
+
# Calculate start date based on limit and interval
|
|
322
|
+
interval_minutes = {
|
|
323
|
+
"1s": 1 / 60, # 1 second = 1/60 minute
|
|
324
|
+
"1m": 1,
|
|
325
|
+
"3m": 3,
|
|
326
|
+
"5m": 5,
|
|
327
|
+
"15m": 15,
|
|
328
|
+
"30m": 30,
|
|
329
|
+
"1h": 60,
|
|
330
|
+
"2h": 120,
|
|
331
|
+
"4h": 240,
|
|
332
|
+
"6h": 360,
|
|
333
|
+
"8h": 480,
|
|
334
|
+
"12h": 720,
|
|
335
|
+
"1d": 1440,
|
|
336
|
+
}
|
|
337
|
+
|
|
338
|
+
if period in interval_minutes:
|
|
339
|
+
minutes_total = limit * interval_minutes[period]
|
|
340
|
+
start_date = datetime.now() - timedelta(minutes=minutes_total)
|
|
341
|
+
calculated_start = start_date.strftime("%Y-%m-%d")
|
|
342
|
+
calculated_end = datetime.now().strftime("%Y-%m-%d")
|
|
343
|
+
else:
|
|
344
|
+
# Default fallback for unknown periods
|
|
345
|
+
calculated_start = "2024-01-01"
|
|
346
|
+
calculated_end = datetime.now().strftime("%Y-%m-%d")
|
|
347
|
+
|
|
348
|
+
return calculated_start, calculated_end
|
|
349
|
+
|
|
350
|
+
|
|
351
|
+
def _apply_default_date_range(start: Optional[str], end: Optional[str]) -> tuple[str, str]:
|
|
352
|
+
"""Apply default date range if not specified.
|
|
353
|
+
|
|
354
|
+
Args:
|
|
355
|
+
start: Start date
|
|
356
|
+
end: End date
|
|
357
|
+
|
|
358
|
+
Returns:
|
|
359
|
+
Tuple of (start_date, end_date) with defaults applied
|
|
360
|
+
"""
|
|
361
|
+
if not start:
|
|
362
|
+
start = "2021-01-01"
|
|
363
|
+
if not end:
|
|
364
|
+
end = datetime.now().strftime("%Y-%m-%d")
|
|
365
|
+
return start, end
|
|
366
|
+
|
|
367
|
+
|
|
368
|
+
def _perform_gap_filling(
|
|
369
|
+
result: dict,
|
|
370
|
+
auto_fill_gaps: bool,
|
|
371
|
+
period: str,
|
|
372
|
+
df: pd.DataFrame,
|
|
373
|
+
instrument_type: str = "spot", # ADR-0021
|
|
374
|
+
) -> pd.DataFrame:
|
|
375
|
+
"""Perform automatic gap filling on collected data.
|
|
376
|
+
|
|
377
|
+
Args:
|
|
378
|
+
result: Collection result dictionary
|
|
379
|
+
auto_fill_gaps: Whether to auto-fill gaps
|
|
380
|
+
period: Timeframe interval
|
|
381
|
+
df: DataFrame with collected data
|
|
382
|
+
instrument_type: Instrument type for API endpoint selection
|
|
383
|
+
|
|
384
|
+
Returns:
|
|
385
|
+
DataFrame with gaps filled (if applicable)
|
|
386
|
+
"""
|
|
387
|
+
if not auto_fill_gaps or not result.get("filepath"):
|
|
388
|
+
return df
|
|
389
|
+
|
|
390
|
+
import logging
|
|
391
|
+
|
|
392
|
+
logger = logging.getLogger(__name__)
|
|
393
|
+
|
|
394
|
+
csv_file = Path(result["filepath"])
|
|
395
|
+
gap_filler = UniversalGapFiller(
|
|
396
|
+
instrument_type=instrument_type
|
|
397
|
+
) # ADR-0021: Pass instrument type for API endpoint
|
|
398
|
+
|
|
399
|
+
# Detect and fill gaps
|
|
400
|
+
gap_result = gap_filler.process_file(csv_file, period)
|
|
401
|
+
|
|
402
|
+
if gap_result["gaps_detected"] > 0:
|
|
403
|
+
if gap_result["gaps_filled"] > 0:
|
|
404
|
+
logger.info(
|
|
405
|
+
f"✅ Auto-filled {gap_result['gaps_filled']}/{gap_result['gaps_detected']} "
|
|
406
|
+
f"gap(s) with authentic Binance API data"
|
|
407
|
+
)
|
|
408
|
+
# Reload DataFrame with filled gaps
|
|
409
|
+
df = pd.read_csv(csv_file, comment="#")
|
|
410
|
+
else:
|
|
411
|
+
logger.warning(
|
|
412
|
+
f"⚠️ Detected {gap_result['gaps_detected']} gap(s) but could not fill them. "
|
|
413
|
+
f"Data may not be complete."
|
|
414
|
+
)
|
|
415
|
+
|
|
416
|
+
return df
|
|
417
|
+
|
|
418
|
+
|
|
419
|
+
def _apply_limit_and_index(
|
|
420
|
+
df: pd.DataFrame,
|
|
421
|
+
limit: Optional[int],
|
|
422
|
+
index_type: Optional[str],
|
|
423
|
+
) -> pd.DataFrame:
|
|
424
|
+
"""Apply limit and index_type to DataFrame.
|
|
425
|
+
|
|
426
|
+
Args:
|
|
427
|
+
df: DataFrame to process
|
|
428
|
+
limit: Maximum number of rows to return
|
|
429
|
+
index_type: Deprecated index_type parameter
|
|
430
|
+
|
|
431
|
+
Returns:
|
|
432
|
+
Processed DataFrame
|
|
433
|
+
"""
|
|
434
|
+
# Apply limit if specified
|
|
435
|
+
if limit and len(df) > limit:
|
|
436
|
+
df = df.tail(limit).reset_index(drop=True)
|
|
437
|
+
|
|
438
|
+
# Handle deprecated index_type parameter for backward compatibility
|
|
439
|
+
if index_type in ("datetime", "auto"):
|
|
440
|
+
if "date" in df.columns:
|
|
441
|
+
# For deprecated datetime mode, return DataFrame with DatetimeIndex
|
|
442
|
+
return df.set_index("date", drop=False)
|
|
443
|
+
else:
|
|
444
|
+
# Handle edge case where date column is missing
|
|
445
|
+
return df
|
|
446
|
+
elif index_type == "range":
|
|
447
|
+
# For deprecated range mode, return DataFrame with RangeIndex (default)
|
|
448
|
+
return df
|
|
449
|
+
else:
|
|
450
|
+
# Default behavior: return standard pandas DataFrame with RangeIndex
|
|
451
|
+
# Users can use df.set_index('date') for DatetimeIndex operations
|
|
452
|
+
return df
|
|
453
|
+
|
|
454
|
+
|
|
455
|
+
def _create_empty_dataframe() -> pd.DataFrame:
|
|
456
|
+
"""Create empty DataFrame with expected OHLCV columns.
|
|
457
|
+
|
|
458
|
+
Returns:
|
|
459
|
+
Empty DataFrame with standard columns
|
|
460
|
+
"""
|
|
461
|
+
columns = [
|
|
462
|
+
"date",
|
|
463
|
+
"open",
|
|
464
|
+
"high",
|
|
465
|
+
"low",
|
|
466
|
+
"close",
|
|
467
|
+
"volume",
|
|
468
|
+
"close_time",
|
|
469
|
+
"quote_asset_volume",
|
|
470
|
+
"number_of_trades",
|
|
471
|
+
"taker_buy_base_asset_volume",
|
|
472
|
+
"taker_buy_quote_asset_volume",
|
|
473
|
+
]
|
|
474
|
+
return pd.DataFrame(columns=columns)
|
|
475
|
+
|
|
476
|
+
|
|
477
|
+
def fetch_data(
    symbol: str,
    timeframe: Optional[Union[str, SupportedTimeframe]] = None,
    limit: Optional[int] = None,
    start: Optional[str] = None,
    end: Optional[str] = None,
    start_date: Optional[str] = None,  # Alias for start
    end_date: Optional[str] = None,  # Alias for end
    output_dir: Optional[Union[str, Path]] = None,
    index_type: Optional[Literal["datetime", "range", "auto"]] = None,  # Deprecated parameter
    auto_fill_gaps: bool = True,
    instrument_type: InstrumentType = "spot",  # ADR-0021: UM futures support
    *,
    interval: Optional[Union[str, SupportedTimeframe]] = None,
) -> pd.DataFrame:
    """Fetch gap-free cryptocurrency OHLCV data as a standard pandas DataFrame.

    Collects complete OHLCV plus microstructure data; all analysis can be
    done with ordinary pandas operations on the result. When
    ``auto_fill_gaps`` is enabled (the default), detected gaps are filled
    with authentic Binance API data to honor the zero-gaps guarantee.

    **⚠️ IMPORTANT — funding_rate column (v3.2.0+)**: the ``funding_rate``
    column exists for futures data but is **NULL** in v3.2.0 (population
    via the ``/fapi/v1/fundingRate`` endpoint arrives in v3.3.0). Do not
    use it for calculations yet.

    Args:
        symbol: Trading pair symbol (e.g., "BTCUSDT", "ETHUSDT").
        timeframe: Timeframe interval (e.g., "1m", "5m", "1h", "4h", "1d").
        limit: Maximum number of recent bars to return (optional).
        start: Start date, YYYY-MM-DD (optional). Alias: start_date.
        end: End date, YYYY-MM-DD (optional). Alias: end_date.
        start_date: Alias for start (recommended for clarity).
        end_date: Alias for end (recommended for clarity).
        output_dir: Directory to save CSV files (optional).
        index_type: DEPRECATED — use pandas operations directly.
        auto_fill_gaps: Auto-fill detected gaps with authentic Binance API
            data (default: True).
        instrument_type: "spot" or "futures-um"; both support 713 symbols
            (default: "spot").
        interval: Legacy name for timeframe (deprecated; use timeframe).

    Returns:
        pd.DataFrame with columns: date, open, high, low, close, volume,
        close_time, quote_asset_volume, number_of_trades,
        taker_buy_base_asset_volume, taker_buy_quote_asset_volume, and
        funding_rate (NULL in v3.2.0).

    Raises:
        ValueError: If both an alias pair ('start'/'start_date' or
            'end'/'end_date') is given, the symbol or timeframe is
            unsupported, a date is malformed, or instrument_type is invalid.

    Examples:
        df = fetch_data("BTCUSDT", "1h", limit=1000)
        df = fetch_data("BTCUSDT", "1h", limit=1000, instrument_type="futures-um")
        df = fetch_data("ETHUSDT", "4h", start_date="2024-01-01", end_date="2024-06-30")
    """
    # Resolve timeframe vs. legacy interval into a single period string.
    period = _validate_timeframe_parameters(timeframe, interval)

    # Warn about / validate the deprecated index_type parameter.
    _validate_index_type_parameter(index_type)

    # The start/start_date and end/end_date aliases are mutually exclusive.
    if start is not None and start_date is not None:
        raise ValueError(
            "Cannot specify both 'start' and 'start_date'. "
            "Use either 'start' OR 'start_date', not both."
        )
    if end is not None and end_date is not None:
        raise ValueError(
            "Cannot specify both 'end' and 'end_date'. Use either 'end' OR 'end_date', not both."
        )

    # The explicit *_date spellings take precedence when supplied.
    if start_date is not None:
        start = start_date
    if end_date is not None:
        end = end_date

    # symbol is mandatory; reject None with an actionable message.
    if symbol is None:
        raise ValueError(
            "symbol parameter is required (cannot be None). "
            "Specify a trading pair (e.g., symbol='BTCUSDT')"
        )

    # Accept lowercase symbols for user convenience.
    symbol = symbol.upper()

    # Derive a date window from `limit`, then fall back to defaults.
    start, end = _calculate_date_range_from_limit(limit, period, start, end)
    start, end = _apply_default_date_range(start, end)

    # Up-front validation so failures are cheap. ADR-0021: the instrument
    # type is validated first and then scopes the symbol check.
    _validate_instrument_type(instrument_type)
    _validate_symbol(symbol, instrument_type=instrument_type)
    _validate_timeframe_value(period)
    _validate_date_format(start, "start/start_date")
    _validate_date_format(end, "end/end_date")

    # Collect the data (instrument_type drives URL routing, ADR-0021).
    collector = BinancePublicDataCollector(
        symbol=symbol,
        start_date=start,
        end_date=end,
        output_dir=output_dir,
        instrument_type=instrument_type,
    )
    result = collector.collect_timeframe_data(period)

    if not result or "dataframe" not in result:
        # No data collected: hand back an empty, correctly-shaped frame.
        return _create_empty_dataframe()

    frame = result["dataframe"]

    # Zero-gaps guarantee: detect and repair gaps with authentic API data.
    frame = _perform_gap_filling(result, auto_fill_gaps, period, frame, instrument_type)

    # Finally trim to `limit` and honor the deprecated index_type.
    return _apply_limit_and_index(frame, limit, index_type)
|
|
629
|
+
|
|
630
|
+
|
|
631
|
+
def download(
    symbol: str,
    timeframe: Optional[Union[str, SupportedTimeframe]] = None,
    start: Optional[str] = None,
    end: Optional[str] = None,
    start_date: Optional[str] = None,  # Alias for start
    end_date: Optional[str] = None,  # Alias for end
    output_dir: Optional[Union[str, Path]] = None,
    index_type: Optional[Literal["datetime", "range", "auto"]] = None,  # Deprecated parameter
    auto_fill_gaps: bool = True,
    instrument_type: InstrumentType = "spot",  # ADR-0021: UM futures support
    *,
    interval: Optional[Union[str, SupportedTimeframe]] = None,
    limit: Optional[int] = None,  # Max bars; forwarded to fetch_data()
) -> pd.DataFrame:
    """Download cryptocurrency data with zero gaps guarantee.

    Provides familiar API patterns for intuitive data collection.
    By default, automatically detects and fills gaps using authentic Binance API data
    to deliver on the package's core promise of zero gaps.

    **NEW in v3.2.0**: USDT-margined perpetual futures support (713 symbols).

    Args:
        symbol: Trading pair symbol (e.g., "BTCUSDT")
        timeframe: Timeframe interval (default: "1h" if neither specified)
        start: Start date in YYYY-MM-DD format. Alias: start_date
        end: End date in YYYY-MM-DD format. Alias: end_date
        start_date: Alias for start (recommended for clarity)
        end_date: Alias for end (recommended for clarity)
        output_dir: Directory to save CSV files
        index_type: DEPRECATED - Use standard pandas operations instead
        auto_fill_gaps: Automatically fill detected gaps with authentic Binance API data (default: True)
        instrument_type: Instrument type - "spot" or "futures-um" (both support 713 symbols, default: "spot")
        interval: Legacy parameter name for timeframe (deprecated)
        limit: Maximum number of bars to return (keyword-only); forwarded to fetch_data()

    Returns:
        pd.DataFrame with complete OHLCV and microstructure data (gapless by default).
        Includes funding_rate column (⚠️ NULL in v3.2.0, populated in future release).

    Raises:
        ValueError: If instrument_type is invalid (must be "spot" or "futures-um")
        ValueError: If both 'start' and 'start_date' specified, or both 'end' and 'end_date' specified
        ValueError: If symbol is not supported (with suggestions for correction)
        ValueError: If timeframe is not supported (with list of supported timeframes)
        ValueError: If date format is invalid (expected YYYY-MM-DD)

    Warning:
        The funding_rate column exists but is NULL for all rows in v3.2.0.
        Funding rate data requires separate API endpoint (/fapi/v1/fundingRate)
        and will be implemented in v3.3.0. Do not rely on funding_rate values.

    Examples:
        # Simple spot data download (default)
        df = download("BTCUSDT", "1h", start="2024-01-01", end="2024-06-30")

        # Futures data download (NEW in v3.2.0)
        df = download("BTCUSDT", "1h", start="2024-01-01", end="2024-06-30",
                      instrument_type="futures-um")

        # Explicit form (recommended)
        df = download("BTCUSDT", "1h", start_date="2024-01-01", end_date="2024-06-30")

        # Disable auto-fill if you want raw Vision archive data
        df = download("ETHUSDT", "4h", auto_fill_gaps=False)

        # Legacy interval parameter
        df = download("BTCUSDT", interval="1h")
    """
    # Apply default if neither parameter specified
    if timeframe is None and interval is None:
        timeframe = "1h"

    # Validate and normalize date range parameters (aliases are mutually exclusive)
    if start is not None and start_date is not None:
        raise ValueError(
            "Cannot specify both 'start' and 'start_date'. "
            "Use either 'start' OR 'start_date', not both."
        )
    if end is not None and end_date is not None:
        raise ValueError(
            "Cannot specify both 'end' and 'end_date'. Use either 'end' OR 'end_date', not both."
        )

    # Normalize: prefer explicit _date parameters
    start = start_date if start_date is not None else start
    end = end_date if end_date is not None else end

    # Validate symbol parameter (None check) — fast failure before delegating
    if symbol is None:
        raise ValueError(
            "symbol parameter is required (cannot be None). "
            "Specify a trading pair (e.g., symbol='BTCUSDT')"
        )

    # Normalize symbol case (auto-uppercase for user convenience)
    symbol = symbol.upper()

    # BUGFIX: download_multiple() forwards limit=..., but download() previously
    # had no such parameter (and no **kwargs), so every download_multiple() call
    # raised TypeError. Accept limit (keyword-only, default None for backward
    # compatibility) and forward it to fetch_data(), which already supports it.
    return fetch_data(
        symbol=symbol,
        timeframe=timeframe,
        start=start,
        end=end,
        limit=limit,
        output_dir=output_dir,
        index_type=index_type,
        auto_fill_gaps=auto_fill_gaps,
        instrument_type=instrument_type,  # ADR-0021
        interval=interval,
    )
|
|
739
|
+
|
|
740
|
+
|
|
741
|
+
def download_multiple(
    symbols: List[str],
    timeframe: str,
    start_date: Optional[str] = None,
    end_date: Optional[str] = None,
    limit: Optional[int] = None,
    max_workers: int = 5,
    raise_on_partial_failure: bool = False,
    instrument_type: InstrumentType = "spot",  # ADR-0021
    **kwargs,
) -> dict[str, pd.DataFrame]:
    """Download historical data for multiple symbols concurrently.

    Fans out one download() call per symbol on a ThreadPoolExecutor
    (network-bound work) and gathers the results into a symbol → DataFrame map.

    **NEW in v3.2.0**: Supports USDT-margined perpetual futures (713 symbols).

    Args:
        symbols: List of trading pair symbols (e.g., ["BTCUSDT", "ETHUSDT"])
        timeframe: Candle interval (e.g., "1h", "4h", "1d")
        start_date: Start date (YYYY-MM-DD)
        end_date: End date (YYYY-MM-DD)
        limit: Maximum bars per symbol
        max_workers: Maximum concurrent downloads (default: 5)
        raise_on_partial_failure: Raise error if any symbol fails (default: False)
        instrument_type: Instrument type - "spot" or "futures-um" (both support 713 symbols, default: "spot")
        **kwargs: Additional parameters passed to download()

    Returns:
        dict[str, pd.DataFrame]: Mapping of symbol → DataFrame.
        Only includes successful downloads (failed symbols omitted unless
        raise_on_partial_failure=True). Each DataFrame includes a funding_rate
        column (⚠️ NULL in v3.2.0, populated in future release).

    Raises:
        ValueError: If instrument_type is invalid
        ValueError: If symbols list is empty
        ValueError: If max_workers < 1
        ValueError: If all symbols fail
        ValueError: If raise_on_partial_failure=True and any symbol fails

    Warning:
        The funding_rate column exists but is NULL for all rows in v3.2.0.

    Examples:
        >>> results = download_multiple(
        ...     symbols=["BTCUSDT", "ETHUSDT", "SOLUSDT"],
        ...     timeframe="1h",
        ...     start_date="2024-01-01",
        ...     end_date="2024-06-30"
        ... )
        >>> len(results)
        3

        >>> futures = download_multiple(
        ...     symbols=["BTCUSDT", "ETHUSDT", "SOLUSDT"],
        ...     timeframe="1h",
        ...     start_date="2024-01-01",
        ...     instrument_type="futures-um"
        ... )
        >>> len(futures)
        3

        >>> # Partial failure: failed symbols are simply omitted
        >>> results = download_multiple(
        ...     symbols=["BTCUSDT", "INVALID", "ETHUSDT"],
        ...     timeframe="1h",
        ...     start_date="2024-01-01"
        ... )
        >>> len(results)
        2 # Only BTCUSDT and ETHUSDT succeeded

        >>> # Strict mode raises ValueError on the first failure
        >>> results = download_multiple(
        ...     symbols=["BTCUSDT", "INVALID"],
        ...     timeframe="1h",
        ...     start_date="2024-01-01",
        ...     raise_on_partial_failure=True
        ... )
    """
    import warnings
    from concurrent.futures import ThreadPoolExecutor, as_completed

    # Guard clauses: fail fast on obviously bad arguments.
    if not symbols:
        raise ValueError("symbols list cannot be empty")
    if max_workers < 1:
        raise ValueError("max_workers must be >= 1")

    ok: dict[str, pd.DataFrame] = {}
    failed: dict[str, str] = {}

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        # One task per symbol; remember which future belongs to which symbol.
        pending = {}
        for sym in symbols:
            fut = pool.submit(
                download,
                symbol=sym,
                timeframe=timeframe,
                start_date=start_date,
                end_date=end_date,
                limit=limit,
                instrument_type=instrument_type,  # ADR-0021
                **kwargs,
            )
            pending[fut] = sym

        # Harvest in completion order.
        for fut in as_completed(pending):
            sym = pending[fut]
            try:
                ok[sym] = fut.result()
            except Exception as exc:
                failed[sym] = str(exc)

                # Strict mode: cancel whatever has not started and re-raise.
                if raise_on_partial_failure:
                    pool.shutdown(wait=False, cancel_futures=True)
                    raise ValueError(f"Download failed for {sym}: {exc}") from exc

    # Nothing succeeded at all → hard error with the collected diagnostics.
    if not ok and failed:
        raise ValueError(f"All {len(symbols)} symbols failed. Errors: {failed}")

    # Some succeeded, some failed → surface a warning, return the survivors.
    if failed:
        warnings.warn(
            f"Failed to download {len(failed)} symbols: {list(failed.keys())}. Errors: {failed}",
            UserWarning,
            stacklevel=2,
        )

    return ok
|
|
880
|
+
|
|
881
|
+
|
|
882
|
+
def fill_gaps(directory: Union[str, Path], symbols: Optional[List[str]] = None) -> dict:
    """Fill gaps in existing CSV data files.

    Scans *directory* for ``*.csv`` files (optionally filtered by symbol name),
    runs the universal gap filler on each, and aggregates the per-file results.

    Args:
        directory: Directory containing CSV files to process
        symbols: Optional list of symbols to process (default: all found)

    Returns:
        dict: Gap filling results with statistics

    Examples:
        # Fill all gaps in directory
        results = fill_gaps("./data")

        # Fill gaps for specific symbols
        results = fill_gaps("./data", symbols=["BTCUSDT", "ETHUSDT"])
    """
    filler = UniversalGapFiller()
    base = Path(directory)

    # Candidate files: every CSV, optionally narrowed by symbol substring match.
    candidates = list(base.glob("*.csv"))
    if symbols:
        candidates = [p for p in candidates if any(s in p.name for s in symbols)]

    summary = {
        "files_processed": 0,
        "gaps_detected": 0,
        "gaps_filled": 0,
        "success_rate": 0.0,
        "file_results": {},
    }

    # Timeframes recognized in filenames, most-specific tokens checked first.
    known_timeframes = (
        "1s", "1m", "3m", "5m", "15m", "30m",
        "1h", "2h", "4h", "6h", "8h", "12h", "1d",
    )

    for path in candidates:
        # Infer the timeframe from the filename token (e.g. "-1h_" or "-1h-");
        # fall back to "1h" when no token matches.
        timeframe = next(
            (
                tf
                for tf in known_timeframes
                if f"-{tf}_" in path.name or f"-{tf}-" in path.name
            ),
            "1h",
        )

        outcome = filler.process_file(path, timeframe)
        summary["file_results"][path.name] = outcome
        summary["files_processed"] += 1
        summary["gaps_detected"] += outcome["gaps_detected"]
        summary["gaps_filled"] += outcome["gaps_filled"]

    # Overall fill rate; a run with no detected gaps counts as 100% success.
    detected = summary["gaps_detected"]
    summary["success_rate"] = (
        (summary["gaps_filled"] / detected) * 100 if detected > 0 else 100.0
    )

    return summary
|
|
938
|
+
|
|
939
|
+
|
|
940
|
+
def get_info() -> dict:
    """Get library information and capabilities.

    Returns:
        dict: Library metadata and capabilities (version, supported symbols and
        timeframes, data source, and feature highlights).

    Examples:
        >>> info = get_info()
        >>> print(f"Version: {info['version']}")
        >>> print(f"Supported symbols: {len(info['supported_symbols'])}")
    """
    # Local import avoids a circular import at module load time.
    from . import __version__

    return {
        "version": __version__,
        # NOTE(review): the package directory is gapless_crypto_clickhouse, but
        # this reports "gapless-crypto-data" — confirm whether this is the
        # intended distribution name or a stale copy from an upstream package.
        "name": "gapless-crypto-data",
        "description": "Ultra-fast cryptocurrency data collection with zero gaps guarantee",
        "supported_symbols": get_supported_symbols(),
        "supported_timeframes": get_supported_timeframes(),
        # NOTE(review): other docstrings in this module advertise
        # instrument_type="futures-um" support (v3.2.0+); "spot only" here may
        # be stale — verify before relying on this metadata.
        "market_type": "USDT spot pairs only",
        "data_source": "Binance public data repository + API",
        "features": [
            "22x faster than API calls",
            "Full 11-column microstructure format",
            "Automatic gap detection and filling",
            "Production-grade data quality",
        ],
    }
|
|
968
|
+
|
|
969
|
+
|
|
970
|
+
def get_supported_intervals() -> List[str]:
    """Get list of supported timeframe intervals (legacy alias).

    Deprecated: Use get_supported_timeframes() instead.
    Maintained for backward compatibility with existing code.

    Returns:
        List of timeframe strings (e.g., ["1m", "5m", "1h", "4h", ...])

    Examples:
        >>> intervals = get_supported_intervals() # deprecated
        >>> timeframes = get_supported_timeframes() # preferred
    """
    import warnings

    # Emit a DeprecationWarning pointing at the caller, then delegate.
    message = "get_supported_intervals() is deprecated. Use get_supported_timeframes() instead."
    warnings.warn(message, DeprecationWarning, stacklevel=2)
    return get_supported_timeframes()
|
|
991
|
+
|
|
992
|
+
|
|
993
|
+
def save_parquet(df: pd.DataFrame, path: str) -> None:
    """Save DataFrame to Parquet format with optimized compression.

    Writes via the pyarrow engine with snappy compression and without the
    pandas index.

    Args:
        df: DataFrame to save
        path: Output file path (should end with .parquet)

    Raises:
        FileNotFoundError: If output directory doesn't exist
        PermissionError: If cannot write to path
        ValueError: If DataFrame is invalid

    Examples:
        >>> df = fetch_data("BTCUSDT", "1h", limit=1000)
        >>> save_parquet(df, "btc_data.parquet")
    """
    # Refuse to write a file with no data in it.
    if df is None or df.empty:
        raise ValueError("Cannot save empty DataFrame to Parquet")

    df.to_parquet(path, index=False, engine="pyarrow", compression="snappy")
|
|
1013
|
+
|
|
1014
|
+
|
|
1015
|
+
def load_parquet(path: str) -> pd.DataFrame:
    """Load DataFrame from Parquet file.

    Reads via the pyarrow engine, restoring the frame written by
    save_parquet().

    Args:
        path: Parquet file path

    Returns:
        DataFrame with original structure and data types

    Raises:
        FileNotFoundError: If file doesn't exist
        ParquetError: If file is corrupted or invalid

    Examples:
        >>> df = load_parquet("btc_data.parquet")
        >>> print(f"Loaded {len(df)} bars")
    """
    frame = pd.read_parquet(path, engine="pyarrow")
    return frame
|