gapless-crypto-clickhouse 7.1.0 (gapless_crypto_clickhouse-7.1.0-py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. gapless_crypto_clickhouse/__init__.py +147 -0
  2. gapless_crypto_clickhouse/__probe__.py +349 -0
  3. gapless_crypto_clickhouse/api.py +1032 -0
  4. gapless_crypto_clickhouse/clickhouse/__init__.py +17 -0
  5. gapless_crypto_clickhouse/clickhouse/config.py +119 -0
  6. gapless_crypto_clickhouse/clickhouse/connection.py +269 -0
  7. gapless_crypto_clickhouse/clickhouse/schema.sql +98 -0
  8. gapless_crypto_clickhouse/clickhouse/schema_validator.py +312 -0
  9. gapless_crypto_clickhouse/clickhouse_query.py +642 -0
  10. gapless_crypto_clickhouse/collectors/__init__.py +21 -0
  11. gapless_crypto_clickhouse/collectors/binance_public_data_collector.py +1994 -0
  12. gapless_crypto_clickhouse/collectors/clickhouse_bulk_loader.py +446 -0
  13. gapless_crypto_clickhouse/collectors/concurrent_collection_orchestrator.py +407 -0
  14. gapless_crypto_clickhouse/collectors/csv_format_detector.py +123 -0
  15. gapless_crypto_clickhouse/collectors/httpx_downloader.py +395 -0
  16. gapless_crypto_clickhouse/collectors/hybrid_url_generator.py +316 -0
  17. gapless_crypto_clickhouse/exceptions.py +145 -0
  18. gapless_crypto_clickhouse/gap_filling/__init__.py +1 -0
  19. gapless_crypto_clickhouse/gap_filling/safe_file_operations.py +439 -0
  20. gapless_crypto_clickhouse/gap_filling/universal_gap_filler.py +757 -0
  21. gapless_crypto_clickhouse/llms.txt +268 -0
  22. gapless_crypto_clickhouse/probe.py +235 -0
  23. gapless_crypto_clickhouse/py.typed +0 -0
  24. gapless_crypto_clickhouse/query_api.py +374 -0
  25. gapless_crypto_clickhouse/resume/__init__.py +12 -0
  26. gapless_crypto_clickhouse/resume/intelligent_checkpointing.py +383 -0
  27. gapless_crypto_clickhouse/utils/__init__.py +29 -0
  28. gapless_crypto_clickhouse/utils/error_handling.py +202 -0
  29. gapless_crypto_clickhouse/utils/etag_cache.py +194 -0
  30. gapless_crypto_clickhouse/utils/timeframe_constants.py +90 -0
  31. gapless_crypto_clickhouse/utils/timestamp_format_analyzer.py +256 -0
  32. gapless_crypto_clickhouse/utils/timestamp_utils.py +130 -0
  33. gapless_crypto_clickhouse/validation/__init__.py +36 -0
  34. gapless_crypto_clickhouse/validation/csv_validator.py +677 -0
  35. gapless_crypto_clickhouse/validation/models.py +220 -0
  36. gapless_crypto_clickhouse/validation/storage.py +502 -0
  37. gapless_crypto_clickhouse-7.1.0.dist-info/METADATA +1277 -0
  38. gapless_crypto_clickhouse-7.1.0.dist-info/RECORD +40 -0
  39. gapless_crypto_clickhouse-7.1.0.dist-info/WHEEL +4 -0
  40. gapless_crypto_clickhouse-7.1.0.dist-info/licenses/LICENSE +21 -0
gapless_crypto_clickhouse/api.py
@@ -0,0 +1,1032 @@
#!/usr/bin/env python3
"""
Convenience API functions for gapless-crypto-clickhouse

Provides a function-based API following financial data library conventions.
Simple and intuitive data collection returning standard pandas DataFrames.

Exception-only failure principle - all errors raise exceptions.

Examples:
    import gapless_crypto_clickhouse as gcd

    # Simple data fetching
    df = gcd.fetch_data("BTCUSDT", "1h", limit=1000)

    # Get available symbols and timeframes
    symbols = gcd.get_supported_symbols()
    intervals = gcd.get_supported_timeframes()

    # Download with date range
    df = gcd.download("ETHUSDT", "4h", start="2024-01-01", end="2024-06-30")
"""

from datetime import datetime, timedelta
from pathlib import Path
from typing import List, Literal, Optional, Union

import pandas as pd

from .collectors.binance_public_data_collector import BinancePublicDataCollector
from .gap_filling.universal_gap_filler import UniversalGapFiller

# Instrument type support (ADR-0021)
InstrumentType = Literal["spot", "futures-um"]


def get_supported_symbols(instrument_type: InstrumentType = "spot") -> List[str]:
    """Get list of supported trading pairs for the specified instrument type.

    Returns 713 validated perpetual symbols for both spot and futures.
    Symbol list sourced from binance-futures-availability package
    (validated daily via S3 Vision probes, 95%+ SLA).

    Note: The instrument_type parameter is retained for API compatibility,
    but both "spot" and "futures-um" return the same 713-symbol list.
    Rationale: Binance markets are aligned - perpetual futures symbols
    correspond to spot pairs. See ADR-0022 for complete alignment rationale.

    Args:
        instrument_type: Type of instrument ("spot" or "futures-um"). Default: "spot"

    Returns:
        List of 713 supported perpetual symbols (same for both spot and futures)

    Raises:
        ValueError: If instrument_type is invalid

    Examples:
        >>> # Get spot symbols (default) - returns 713 symbols
        >>> symbols = get_supported_symbols()
        >>> print(f"Found {len(symbols)} spot symbols")
        Found 713 spot symbols

        >>> # Get futures symbols - returns same 713 symbols
        >>> futures = get_supported_symbols(instrument_type="futures-um")
        >>> print(f"Found {len(futures)} futures symbols")
        Found 713 futures symbols

        >>> # Verify alignment
        >>> get_supported_symbols("spot") == get_supported_symbols("futures-um")
        True

        >>> # Check symbol availability
        >>> print(f"Bitcoin supported: {'BTCUSDT' in symbols}")
        Bitcoin supported: True
    """
    from binance_futures_availability.config.symbol_loader import load_symbols

    # Validate parameter (fail fast on invalid types)
    _validate_instrument_type(instrument_type)

    # Return same 713 symbols for both types (ADR-0022)
    return load_symbols("perpetual")


def get_supported_timeframes() -> List[str]:
    """Get list of supported timeframe intervals.

    Returns:
        List of timeframe strings (e.g., ["1m", "5m", "1h", "4h", ...])

    Examples:
        >>> timeframes = get_supported_timeframes()
        >>> print(f"Available timeframes: {timeframes}")
        Available timeframes: ['1s', '1m', '3m', '5m', '15m', '30m', '1h', '2h', '4h', '6h', '8h', '12h', '1d', '3d', '1w', '1mo']
        >>> print(f"1-hour supported: {'1h' in timeframes}")
        1-hour supported: True
    """
    collector = BinancePublicDataCollector()
    return collector.available_timeframes


# DEPRECATED in v4.1.0: SupportedSymbol type alias removed (ADR-0022)
# Reason: 713-symbol Literal exceeds practical type checker limits
# Migration: Use `str` for symbol parameters, validate via get_supported_symbols()
#
# Before (v4.0.0):
#     def my_function(symbol: SupportedSymbol) -> None: ...
#
# After (v4.1.0):
#     def my_function(symbol: str) -> None:
#         if symbol not in get_supported_symbols():
#             raise ValueError(f"Unsupported symbol: {symbol}")
#
# Note: Spot and futures now both support 713 symbols (up from 20 spot symbols)

SupportedTimeframe = Literal[
    "1s", "1m", "3m", "5m", "15m", "30m", "1h", "2h", "4h", "6h", "8h", "12h", "1d"
]


def _validate_instrument_type(instrument_type: str) -> None:
    """Validate instrument_type parameter.

    Args:
        instrument_type: Instrument type to validate

    Raises:
        ValueError: If instrument_type is not supported

    Examples:
        >>> _validate_instrument_type("spot")  # Valid
        >>> _validate_instrument_type("futures-um")  # Valid
        >>> _validate_instrument_type("futures")  # Invalid
        Traceback (most recent call last):
        ...
        ValueError: Invalid instrument_type 'futures'. Must be 'spot' or 'futures-um'
    """
    valid_types = {"spot", "futures-um"}
    if instrument_type not in valid_types:
        raise ValueError(
            f"Invalid instrument_type '{instrument_type}'. "
            f"Must be one of: {', '.join(sorted(valid_types))}. "
            f"Use 'futures-um' for USDT-margined perpetual futures (713 symbols). "
            "See get_supported_symbols() for available symbols."
        )


def _validate_timeframe_parameters(
    timeframe: Optional[Union[str, SupportedTimeframe]],
    interval: Optional[Union[str, SupportedTimeframe]],
) -> str:
    """Validate and resolve timeframe/interval parameters.

    Args:
        timeframe: Timeframe parameter (preferred)
        interval: Legacy interval parameter

    Returns:
        Resolved timeframe string

    Raises:
        ValueError: If parameters are invalid or conflicting
    """
    # Dual parameter validation with exception-only failures
    if timeframe is None and interval is None:
        raise ValueError(
            "Must specify 'timeframe' parameter. "
            "CCXT-compatible 'timeframe' is preferred over legacy 'interval'."
        )

    if timeframe is not None and interval is not None:
        raise ValueError(
            "Cannot specify both 'timeframe' and 'interval' parameters. "
            "Use 'timeframe' (CCXT-compatible) or 'interval' (legacy), not both."
        )

    # Use timeframe if provided, otherwise use interval (legacy)
    return timeframe if timeframe is not None else interval


def _validate_index_type_parameter(index_type: Optional[str]) -> None:
    """Validate deprecated index_type parameter.

    Args:
        index_type: Deprecated index_type parameter

    Raises:
        ValueError: If index_type is invalid
    """
    if index_type is None:
        return

    import warnings

    warnings.warn(
        "The 'index_type' parameter is deprecated and will be removed in v3.0.0. "
        "Use standard pandas operations on the returned DataFrame instead.",
        DeprecationWarning,
        stacklevel=3,
    )

    # Validate deprecated parameter for backward compatibility
    valid_index_types = {"datetime", "range", "auto"}
    if index_type not in valid_index_types:
        raise ValueError(
            f"Invalid index_type '{index_type}'. "
            f"Must be one of: {', '.join(sorted(valid_index_types))}"
        )


def _validate_symbol(symbol: str, instrument_type: str = "spot") -> None:
    """Validate symbol against known supported symbols for instrument type.

    Args:
        symbol: Trading pair symbol to validate
        instrument_type: Instrument type context for validation

    Raises:
        ValueError: If symbol is not supported, with suggestions
    """
    from gapless_crypto_clickhouse import get_supported_symbols

    supported = get_supported_symbols(instrument_type=instrument_type)

    if symbol not in supported:
        # Find close matches (simple prefix matching)
        symbol_upper = symbol.upper()
        close_matches = [s for s in supported if s.startswith(symbol_upper[:3])]

        if close_matches:
            raise ValueError(
                f"Invalid symbol '{symbol}' for instrument_type='{instrument_type}'. "
                f"Did you mean '{close_matches[0]}'? "
                f"Supported {instrument_type} symbols: {', '.join(supported[:5])}, ... "
                f"(see get_supported_symbols(instrument_type='{instrument_type}') for full list)"
            )
        else:
            raise ValueError(
                f"Invalid symbol '{symbol}' for instrument_type='{instrument_type}'. "
                f"Supported {instrument_type} symbols: {', '.join(supported[:10])}, ... "
                f"(see get_supported_symbols(instrument_type='{instrument_type}') for full list of {len(supported)} symbols)"
            )


def _validate_timeframe_value(timeframe: str) -> None:
    """Validate timeframe against supported timeframes.

    Args:
        timeframe: Timeframe interval to validate

    Raises:
        ValueError: If timeframe is not supported
    """
    from gapless_crypto_clickhouse import get_supported_timeframes

    supported = get_supported_timeframes()

    if timeframe not in supported:
        raise ValueError(
            f"Invalid timeframe '{timeframe}'. "
            f"Supported timeframes: {', '.join(supported)} "
            f"(see get_supported_timeframes() for details)"
        )


def _validate_date_format(date_str: Optional[str], param_name: str) -> None:
    """Validate date string format (YYYY-MM-DD).

    Args:
        date_str: Date string to validate
        param_name: Parameter name for error context

    Raises:
        ValueError: If date format is invalid
    """
    if date_str is None:
        return

    import re

    # Check YYYY-MM-DD format
    if not re.match(r"^\d{4}-\d{2}-\d{2}$", date_str):
        raise ValueError(
            f"Invalid {param_name} format '{date_str}'. "
            f"Expected format: YYYY-MM-DD (e.g., '2024-01-01')"
        )

    # Validate date is parseable
    try:
        datetime.strptime(date_str, "%Y-%m-%d")
    except ValueError as e:
        raise ValueError(f"Invalid {param_name} date '{date_str}': {str(e)}") from e


def _calculate_date_range_from_limit(
    limit: Optional[int],
    period: str,
    start: Optional[str],
    end: Optional[str],
) -> tuple[Optional[str], Optional[str]]:
    """Calculate date range from limit parameter.

    Args:
        limit: Maximum number of bars to return
        period: Timeframe interval
        start: Existing start date
        end: Existing end date

    Returns:
        Tuple of (start_date, end_date) strings (either may be None if neither
        dates nor limit were provided)
    """
    # If start/end already specified, use them
    if start or end:
        return start, end

    # If no limit, return as-is
    if not limit:
        return start, end

    # Calculate start date based on limit and interval
    interval_minutes = {
        "1s": 1 / 60,  # 1 second = 1/60 minute
        "1m": 1,
        "3m": 3,
        "5m": 5,
        "15m": 15,
        "30m": 30,
        "1h": 60,
        "2h": 120,
        "4h": 240,
        "6h": 360,
        "8h": 480,
        "12h": 720,
        "1d": 1440,
    }

    if period in interval_minutes:
        minutes_total = limit * interval_minutes[period]
        start_date = datetime.now() - timedelta(minutes=minutes_total)
        calculated_start = start_date.strftime("%Y-%m-%d")
        calculated_end = datetime.now().strftime("%Y-%m-%d")
    else:
        # Default fallback for unknown periods
        calculated_start = "2024-01-01"
        calculated_end = datetime.now().strftime("%Y-%m-%d")

    return calculated_start, calculated_end


def _apply_default_date_range(start: Optional[str], end: Optional[str]) -> tuple[str, str]:
    """Apply default date range if not specified.

    Args:
        start: Start date
        end: End date

    Returns:
        Tuple of (start_date, end_date) with defaults applied
    """
    if not start:
        start = "2021-01-01"
    if not end:
        end = datetime.now().strftime("%Y-%m-%d")
    return start, end


def _perform_gap_filling(
    result: dict,
    auto_fill_gaps: bool,
    period: str,
    df: pd.DataFrame,
    instrument_type: str = "spot",  # ADR-0021
) -> pd.DataFrame:
    """Perform automatic gap filling on collected data.

    Args:
        result: Collection result dictionary
        auto_fill_gaps: Whether to auto-fill gaps
        period: Timeframe interval
        df: DataFrame with collected data
        instrument_type: Instrument type for API endpoint selection

    Returns:
        DataFrame with gaps filled (if applicable)
    """
    if not auto_fill_gaps or not result.get("filepath"):
        return df

    import logging

    logger = logging.getLogger(__name__)

    csv_file = Path(result["filepath"])
    gap_filler = UniversalGapFiller(
        instrument_type=instrument_type
    )  # ADR-0021: Pass instrument type for API endpoint

    # Detect and fill gaps
    gap_result = gap_filler.process_file(csv_file, period)

    if gap_result["gaps_detected"] > 0:
        if gap_result["gaps_filled"] > 0:
            logger.info(
                f"✅ Auto-filled {gap_result['gaps_filled']}/{gap_result['gaps_detected']} "
                f"gap(s) with authentic Binance API data"
            )
            # Reload DataFrame with filled gaps
            df = pd.read_csv(csv_file, comment="#")
        else:
            logger.warning(
                f"⚠️ Detected {gap_result['gaps_detected']} gap(s) but could not fill them. "
                f"Data may not be complete."
            )

    return df


def _apply_limit_and_index(
    df: pd.DataFrame,
    limit: Optional[int],
    index_type: Optional[str],
) -> pd.DataFrame:
    """Apply limit and index_type to DataFrame.

    Args:
        df: DataFrame to process
        limit: Maximum number of rows to return
        index_type: Deprecated index_type parameter

    Returns:
        Processed DataFrame
    """
    # Apply limit if specified
    if limit and len(df) > limit:
        df = df.tail(limit).reset_index(drop=True)

    # Handle deprecated index_type parameter for backward compatibility
    if index_type in ("datetime", "auto"):
        if "date" in df.columns:
            # For deprecated datetime mode, return DataFrame with DatetimeIndex
            return df.set_index("date", drop=False)
        else:
            # Handle edge case where date column is missing
            return df
    elif index_type == "range":
        # For deprecated range mode, return DataFrame with RangeIndex (default)
        return df
    else:
        # Default behavior: return standard pandas DataFrame with RangeIndex
        # Users can use df.set_index('date') for DatetimeIndex operations
        return df


def _create_empty_dataframe() -> pd.DataFrame:
    """Create empty DataFrame with expected OHLCV columns.

    Returns:
        Empty DataFrame with standard columns
    """
    columns = [
        "date",
        "open",
        "high",
        "low",
        "close",
        "volume",
        "close_time",
        "quote_asset_volume",
        "number_of_trades",
        "taker_buy_base_asset_volume",
        "taker_buy_quote_asset_volume",
    ]
    return pd.DataFrame(columns=columns)


def fetch_data(
    symbol: str,
    timeframe: Optional[Union[str, SupportedTimeframe]] = None,
    limit: Optional[int] = None,
    start: Optional[str] = None,
    end: Optional[str] = None,
    start_date: Optional[str] = None,  # Alias for start
    end_date: Optional[str] = None,  # Alias for end
    output_dir: Optional[Union[str, Path]] = None,
    index_type: Optional[Literal["datetime", "range", "auto"]] = None,  # Deprecated parameter
    auto_fill_gaps: bool = True,
    instrument_type: InstrumentType = "spot",  # ADR-0021: UM futures support
    *,
    interval: Optional[Union[str, SupportedTimeframe]] = None,
) -> pd.DataFrame:
    """Fetch cryptocurrency data as standard pandas DataFrame with zero gaps guarantee.

    Returns pandas DataFrame with complete OHLCV and microstructure data.
    All analysis and calculations can be performed using standard pandas operations.

    By default, automatically detects and fills gaps using authentic Binance API data
    to deliver on the "zero gaps guarantee" promise.

    **⚠️ IMPORTANT - funding_rate Column (v3.2.0+)**:
    The DataFrame includes a `funding_rate` column for futures data, but it is **NULL**
    in v3.2.0 (not yet populated). Funding rate collection will be implemented in v3.3.0
    via a separate `/fapi/v1/fundingRate` API endpoint. Do not use this column for
    calculations until it is populated.

    Args:
        symbol: Trading pair symbol (e.g., "BTCUSDT", "ETHUSDT")
        timeframe: Timeframe interval (e.g., "1m", "5m", "1h", "4h", "1d")
        limit: Maximum number of recent bars to return (optional)
        start: Start date in YYYY-MM-DD format (optional). Alias: start_date
        end: End date in YYYY-MM-DD format (optional). Alias: end_date
        start_date: Alias for start (recommended for clarity)
        end_date: Alias for end (recommended for clarity)
        output_dir: Directory to save CSV files (optional)
        index_type: DEPRECATED - Use pandas operations directly
        auto_fill_gaps: Automatically fill detected gaps with authentic Binance API data (default: True)
        instrument_type: Instrument type - "spot" or "futures-um" (both support 713 symbols, default: "spot")
        interval: Legacy parameter name for timeframe (deprecated, use timeframe)

    Returns:
        pd.DataFrame with OHLCV data and microstructure columns:
            - date: Timestamp (open time)
            - open, high, low, close: Price data
            - volume: Base asset volume
            - close_time: Close timestamp
            - quote_asset_volume: Quote asset volume
            - number_of_trades: Trade count
            - taker_buy_base_asset_volume: Taker buy base volume
            - taker_buy_quote_asset_volume: Taker buy quote volume
            - funding_rate: Funding rate (⚠️ NULL in v3.2.0, will be populated in v3.3.0)

    Raises:
        ValueError: If both 'start' and 'start_date' specified, or both 'end' and 'end_date' specified
        ValueError: If symbol is not supported (with suggestions for correction)
        ValueError: If timeframe is not supported (with list of supported timeframes)
        ValueError: If date format is invalid (expected YYYY-MM-DD)
        ValueError: If instrument_type is invalid (must be "spot" or "futures-um")

    Examples:
        # Simple spot data fetching (default)
        df = fetch_data("BTCUSDT", "1h", limit=1000)

        # Fetch futures data (713 symbols available)
        df = fetch_data("BTCUSDT", "1h", limit=1000, instrument_type="futures-um")

        # Standard pandas operations for analysis
        returns = df['close'].pct_change()  # Returns calculation
        rolling_vol = df['close'].rolling(20).std()  # Rolling volatility
        df_resampled = df.set_index('date').resample('4H').agg({
            'open': 'first', 'high': 'max', 'low': 'min',
            'close': 'last', 'volume': 'sum'
        })  # OHLCV resampling

        # Fetch specific date range (explicit form - recommended)
        df = fetch_data("ETHUSDT", "4h", start_date="2024-01-01", end_date="2024-06-30")

        # Fetch futures with date range
        df = fetch_data("SOLUSDT", "1h", start_date="2024-01-01", end_date="2024-06-30",
                        instrument_type="futures-um")
    """
    # Validate and resolve timeframe parameters
    period = _validate_timeframe_parameters(timeframe, interval)

    # Validate deprecated index_type parameter
    _validate_index_type_parameter(index_type)

    # Validate and normalize date range parameters
    if start is not None and start_date is not None:
        raise ValueError(
            "Cannot specify both 'start' and 'start_date'. "
            "Use either 'start' OR 'start_date', not both."
        )
    if end is not None and end_date is not None:
        raise ValueError(
            "Cannot specify both 'end' and 'end_date'. Use either 'end' OR 'end_date', not both."
        )

    # Normalize: prefer explicit _date parameters
    start = start_date if start_date is not None else start
    end = end_date if end_date is not None else end

    # Validate symbol parameter (None check)
    if symbol is None:
        raise ValueError(
            "symbol parameter is required (cannot be None). "
            "Specify a trading pair (e.g., symbol='BTCUSDT')"
        )

    # Normalize symbol case (auto-uppercase for user convenience)
    symbol = symbol.upper()

    # Calculate date range from limit if needed
    start, end = _calculate_date_range_from_limit(limit, period, start, end)

    # Apply default date range if not specified
    start, end = _apply_default_date_range(start, end)

    # Upfront input validation (fast failure before expensive operations)
    _validate_instrument_type(instrument_type)  # ADR-0021: Validate instrument type first
    _validate_symbol(
        symbol, instrument_type=instrument_type
    )  # ADR-0021: Pass instrument type for context-aware validation
    _validate_timeframe_value(period)
    _validate_date_format(start, "start/start_date")
    _validate_date_format(end, "end/end_date")

    # Initialize collector and collect data
    collector = BinancePublicDataCollector(
        symbol=symbol,
        start_date=start,
        end_date=end,
        output_dir=output_dir,
        instrument_type=instrument_type,  # ADR-0021: Pass instrument type for URL routing
    )
    result = collector.collect_timeframe_data(period)

    # Process result or return empty DataFrame
    if result and "dataframe" in result:
        df = result["dataframe"]

        # Auto-fill gaps if enabled (delivers "zero gaps guarantee")
        df = _perform_gap_filling(result, auto_fill_gaps, period, df, instrument_type)

        # Apply limit and index_type
        return _apply_limit_and_index(df, limit, index_type)
    else:
        # Return empty DataFrame with expected columns
        return _create_empty_dataframe()


def download(
    symbol: str,
    timeframe: Optional[Union[str, SupportedTimeframe]] = None,
    start: Optional[str] = None,
    end: Optional[str] = None,
    start_date: Optional[str] = None,  # Alias for start
    end_date: Optional[str] = None,  # Alias for end
    output_dir: Optional[Union[str, Path]] = None,
    index_type: Optional[Literal["datetime", "range", "auto"]] = None,  # Deprecated parameter
    auto_fill_gaps: bool = True,
    instrument_type: InstrumentType = "spot",  # ADR-0021: UM futures support
    limit: Optional[int] = None,  # Maximum bars to return (forwarded to fetch_data)
    *,
    interval: Optional[Union[str, SupportedTimeframe]] = None,
) -> pd.DataFrame:
    """Download cryptocurrency data with zero gaps guarantee.

    Provides familiar API patterns for intuitive data collection.
    By default, automatically detects and fills gaps using authentic Binance API data
    to deliver on the package's core promise of zero gaps.

    **NEW in v3.2.0**: USDT-margined perpetual futures support (713 symbols).

    Args:
        symbol: Trading pair symbol (e.g., "BTCUSDT")
        timeframe: Timeframe interval (default: "1h" if neither specified)
        start: Start date in YYYY-MM-DD format. Alias: start_date
        end: End date in YYYY-MM-DD format. Alias: end_date
        start_date: Alias for start (recommended for clarity)
        end_date: Alias for end (recommended for clarity)
        output_dir: Directory to save CSV files
        index_type: DEPRECATED - Use standard pandas operations instead
        auto_fill_gaps: Automatically fill detected gaps with authentic Binance API data (default: True)
        instrument_type: Instrument type - "spot" or "futures-um" (both support 713 symbols, default: "spot")
        limit: Maximum number of recent bars to return (optional, forwarded to fetch_data())
        interval: Legacy parameter name for timeframe (deprecated)

    Returns:
        pd.DataFrame with complete OHLCV and microstructure data (gapless by default).
        Includes funding_rate column (⚠️ NULL in v3.2.0, populated in future release).

    Raises:
        ValueError: If instrument_type is invalid (must be "spot" or "futures-um")
        ValueError: If both 'start' and 'start_date' specified, or both 'end' and 'end_date' specified
        ValueError: If symbol is not supported (with suggestions for correction)
        ValueError: If timeframe is not supported (with list of supported timeframes)
        ValueError: If date format is invalid (expected YYYY-MM-DD)

    Warning:
        The funding_rate column exists but is NULL for all rows in v3.2.0.
        Funding rate data requires separate API endpoint (/fapi/v1/fundingRate)
        and will be implemented in v3.3.0. Do not rely on funding_rate values.

    Examples:
        # Simple spot data download (default)
        df = download("BTCUSDT", "1h", start="2024-01-01", end="2024-06-30")

        # Futures data download (NEW in v3.2.0)
        df = download("BTCUSDT", "1h", start="2024-01-01", end="2024-06-30",
                      instrument_type="futures-um")

        # Explicit form (recommended)
        df = download("BTCUSDT", "1h", start_date="2024-01-01", end_date="2024-06-30")

        # Disable auto-fill if you want raw Vision archive data
        df = download("ETHUSDT", "4h", auto_fill_gaps=False)

        # Legacy interval parameter
        df = download("BTCUSDT", interval="1h")
    """
    # Apply default if neither parameter specified
    if timeframe is None and interval is None:
        timeframe = "1h"

    # Validate and normalize date range parameters
    if start is not None and start_date is not None:
        raise ValueError(
            "Cannot specify both 'start' and 'start_date'. "
            "Use either 'start' OR 'start_date', not both."
        )
    if end is not None and end_date is not None:
        raise ValueError(
            "Cannot specify both 'end' and 'end_date'. Use either 'end' OR 'end_date', not both."
        )

    # Normalize: prefer explicit _date parameters
    start = start_date if start_date is not None else start
    end = end_date if end_date is not None else end

    # Validate symbol parameter (None check)
    if symbol is None:
        raise ValueError(
            "symbol parameter is required (cannot be None). "
            "Specify a trading pair (e.g., symbol='BTCUSDT')"
        )

    # Normalize symbol case (auto-uppercase for user convenience)
    symbol = symbol.upper()

    return fetch_data(
        symbol=symbol,
        timeframe=timeframe,
        limit=limit,
        start=start,
        end=end,
        output_dir=output_dir,
        index_type=index_type,
        auto_fill_gaps=auto_fill_gaps,
        instrument_type=instrument_type,  # ADR-0021
        interval=interval,
    )


def download_multiple(
    symbols: List[str],
    timeframe: str,
    start_date: Optional[str] = None,
    end_date: Optional[str] = None,
    limit: Optional[int] = None,
    max_workers: int = 5,
    raise_on_partial_failure: bool = False,
    instrument_type: InstrumentType = "spot",  # ADR-0021
    **kwargs,
) -> dict[str, pd.DataFrame]:
    """Download historical data for multiple symbols concurrently.

    Executes concurrent downloads using ThreadPoolExecutor for network-bound
    operations. Returns dict mapping symbol → DataFrame.

    **NEW in v3.2.0**: Supports USDT-margined perpetual futures (713 symbols).

    Args:
        symbols: List of trading pair symbols (e.g., ["BTCUSDT", "ETHUSDT"])
        timeframe: Candle interval (e.g., "1h", "4h", "1d")
        start_date: Start date (YYYY-MM-DD)
        end_date: End date (YYYY-MM-DD)
        limit: Maximum bars per symbol
        max_workers: Maximum concurrent downloads (default: 5)
        raise_on_partial_failure: Raise error if any symbol fails (default: False)
        instrument_type: Instrument type - "spot" or "futures-um" (both support 713 symbols, default: "spot")
        **kwargs: Additional parameters passed to download()

    Returns:
        dict[str, pd.DataFrame]: Mapping of symbol → DataFrame.
            Only includes successful downloads (failed symbols omitted unless raise_on_partial_failure=True).
            Each DataFrame includes funding_rate column (⚠️ NULL in v3.2.0, populated in future release).

    Raises:
        ValueError: If instrument_type is invalid
        ValueError: If symbols list is empty
        ValueError: If max_workers < 1
        ValueError: If all symbols fail
        ValueError: If raise_on_partial_failure=True and any symbol fails

    Warning:
        The funding_rate column exists but is NULL for all rows in v3.2.0.

    Examples:
        >>> # Download multiple spot symbols concurrently (default)
        >>> results = download_multiple(
        ...     symbols=["BTCUSDT", "ETHUSDT", "SOLUSDT"],
        ...     timeframe="1h",
        ...     start_date="2024-01-01",
        ...     end_date="2024-06-30"
        ... )
        >>> len(results)
        3

        >>> # Download multiple futures symbols concurrently (NEW in v3.2.0)
        >>> futures = download_multiple(
        ...     symbols=["BTCUSDT", "ETHUSDT", "SOLUSDT"],
        ...     timeframe="1h",
        ...     start_date="2024-01-01",
        ...     instrument_type="futures-um"
        ... )
        >>> len(futures)
        3

        >>> # With error handling (partial failure - some succeed)
        >>> results = download_multiple(
        ...     symbols=["BTCUSDT", "INVALID", "ETHUSDT"],
        ...     timeframe="1h",
        ...     start_date="2024-01-01"
        ... )
        >>> len(results)
        2  # Only BTCUSDT and ETHUSDT succeeded

        >>> # Strict mode (fail fast on any error)
        >>> results = download_multiple(
        ...     symbols=["BTCUSDT", "INVALID"],
        ...     timeframe="1h",
        ...     start_date="2024-01-01",
        ...     raise_on_partial_failure=True
        ... )
        # → Raises ValueError immediately on first failure
    """
    import warnings
    from concurrent.futures import ThreadPoolExecutor, as_completed

    # Input validation
    if not symbols:
        raise ValueError("symbols list cannot be empty")

    if max_workers < 1:
        raise ValueError("max_workers must be >= 1")

    results: dict[str, pd.DataFrame] = {}
    errors: dict[str, str] = {}

    # Concurrent execution with ThreadPoolExecutor
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Submit all download tasks
        future_to_symbol = {
            executor.submit(
                download,
                symbol=symbol,
                timeframe=timeframe,
                start_date=start_date,
                end_date=end_date,
                limit=limit,
                instrument_type=instrument_type,  # ADR-0021
                **kwargs,
            ): symbol
            for symbol in symbols
        }

        # Collect results as they complete
        for future in as_completed(future_to_symbol):
            symbol = future_to_symbol[future]
            try:
                results[symbol] = future.result()
            except Exception as e:
                errors[symbol] = str(e)

                # Fail fast mode
                if raise_on_partial_failure:
                    executor.shutdown(wait=False, cancel_futures=True)
                    raise ValueError(f"Download failed for {symbol}: {e}") from e

    # Handle complete failure
    if not results and errors:
        raise ValueError(f"All {len(symbols)} symbols failed. Errors: {errors}")

    # Log warnings for partial failures
    if errors:
        warnings.warn(
            f"Failed to download {len(errors)} symbols: {list(errors.keys())}. Errors: {errors}",
            UserWarning,
            stacklevel=2,
        )

    return results


def fill_gaps(directory: Union[str, Path], symbols: Optional[List[str]] = None) -> dict:
    """Fill gaps in existing CSV data files.

    Args:
        directory: Directory containing CSV files to process
        symbols: Optional list of symbols to process (default: all found)

    Returns:
        dict: Gap filling results with statistics

    Examples:
        # Fill all gaps in directory
        results = fill_gaps("./data")

        # Fill gaps for specific symbols
        results = fill_gaps("./data", symbols=["BTCUSDT", "ETHUSDT"])
    """
    gap_filler = UniversalGapFiller()
    target_dir = Path(directory)

    # Find CSV files
    csv_files = list(target_dir.glob("*.csv"))
    if symbols:
        # Filter by specified symbols
        csv_files = [f for f in csv_files if any(symbol in f.name for symbol in symbols)]

    results = {
        "files_processed": 0,
        "gaps_detected": 0,
        "gaps_filled": 0,
        "success_rate": 0.0,
        "file_results": {},
    }

    for csv_file in csv_files:
        # Extract timeframe from filename
        timeframe = "1h"  # Default
        for tf in ["1s", "1m", "3m", "5m", "15m", "30m", "1h", "2h", "4h", "6h", "8h", "12h", "1d"]:
            if f"-{tf}_" in csv_file.name or f"-{tf}-" in csv_file.name:
                timeframe = tf
                break

        # Process file
        file_result = gap_filler.process_file(csv_file, timeframe)
        results["file_results"][csv_file.name] = file_result
        results["files_processed"] += 1
        results["gaps_detected"] += file_result["gaps_detected"]
        results["gaps_filled"] += file_result["gaps_filled"]

    # Calculate overall success rate
    if results["gaps_detected"] > 0:
        results["success_rate"] = (results["gaps_filled"] / results["gaps_detected"]) * 100
    else:
        results["success_rate"] = 100.0

    return results


def get_info() -> dict:
    """Get library information and capabilities.

    Returns:
        dict: Library metadata and capabilities

    Examples:
        >>> info = get_info()
        >>> print(f"Version: {info['version']}")
        >>> print(f"Supported symbols: {len(info['supported_symbols'])}")
    """
    from . import __version__

    return {
        "version": __version__,
        "name": "gapless-crypto-clickhouse",
        "description": "Ultra-fast cryptocurrency data collection with zero gaps guarantee",
        "supported_symbols": get_supported_symbols(),
        "supported_timeframes": get_supported_timeframes(),
        "market_type": "USDT-margined spot and perpetual futures (futures-um)",
        "data_source": "Binance public data repository + API",
        "features": [
            "22x faster than API calls",
            "Full 11-column microstructure format",
            "Automatic gap detection and filling",
            "Production-grade data quality",
        ],
    }


def get_supported_intervals() -> List[str]:
    """Get list of supported timeframe intervals (legacy alias).

    Deprecated: Use get_supported_timeframes() instead.
    Maintained for backward compatibility with existing code.

    Returns:
        List of timeframe strings (e.g., ["1m", "5m", "1h", "4h", ...])

    Examples:
        >>> intervals = get_supported_intervals()  # deprecated
        >>> timeframes = get_supported_timeframes()  # preferred
    """
    import warnings

    warnings.warn(
        "get_supported_intervals() is deprecated. Use get_supported_timeframes() instead.",
        DeprecationWarning,
        stacklevel=2,
    )
    return get_supported_timeframes()


def save_parquet(df: pd.DataFrame, path: str) -> None:
    """Save DataFrame to Parquet format with optimized compression.

    Args:
        df: DataFrame to save
        path: Output file path (should end with .parquet)

    Raises:
        FileNotFoundError: If output directory doesn't exist
        PermissionError: If cannot write to path
        ValueError: If DataFrame is invalid

    Examples:
        >>> df = fetch_data("BTCUSDT", "1h", limit=1000)
        >>> save_parquet(df, "btc_data.parquet")
    """
    if df is None or df.empty:
        raise ValueError("Cannot save empty DataFrame to Parquet")

    df.to_parquet(path, engine="pyarrow", compression="snappy", index=False)


def load_parquet(path: str) -> pd.DataFrame:
    """Load DataFrame from Parquet file.

    Args:
        path: Parquet file path

    Returns:
        DataFrame with original structure and data types

    Raises:
        FileNotFoundError: If file doesn't exist
        ParquetError: If file is corrupted or invalid

    Examples:
        >>> df = load_parquet("btc_data.parquet")
        >>> print(f"Loaded {len(df)} bars")
    """
    return pd.read_parquet(path, engine="pyarrow")
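
For orientation, here is a minimal end-to-end sketch built only on the functions defined in api.py above. It assumes the package is installed and Binance's public data is reachable; fetch_data and the discovery helpers are called through the top-level package as the module docstring shows, while the Parquet helpers are imported from the api module directly (whether they are re-exported at the top level is not confirmed here). The symbol, date range, and file name are illustrative.

import gapless_crypto_clickhouse as gcc
from gapless_crypto_clickhouse.api import load_parquet, save_parquet

# Discover what the library supports before requesting data
symbols = gcc.get_supported_symbols()        # 713 perpetual symbols (spot and futures-um)
timeframes = gcc.get_supported_timeframes()  # e.g. "1m", "1h", "4h", "1d"
assert "BTCUSDT" in symbols and "1h" in timeframes

# Fetch a bounded window of hourly candles; gaps are auto-filled by default
df = gcc.fetch_data("BTCUSDT", "1h", start_date="2024-01-01", end_date="2024-03-31")
print(df[["date", "open", "close", "volume"]].tail())

# Round-trip through the Parquet helpers defined above (illustrative path)
save_parquet(df, "btcusdt_1h_2024q1.parquet")
df_again = load_parquet("btcusdt_1h_2024q1.parquet")
assert len(df_again) == len(df)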
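
Similarly, a short sketch of concurrent collection plus a follow-up gap pass, using download_multiple() and fill_gaps() as defined above. The output directory and symbol list are illustrative, and passing output_dir relies on download_multiple forwarding extra keyword arguments to download(); per the docstrings, failed symbols surface as a UserWarning rather than an exception unless raise_on_partial_failure=True.

from pathlib import Path

from gapless_crypto_clickhouse.api import download_multiple, fill_gaps

data_dir = Path("./data")  # illustrative output location
data_dir.mkdir(exist_ok=True)

# Fetch several symbols concurrently (ThreadPoolExecutor under the hood)
frames = download_multiple(
    symbols=["BTCUSDT", "ETHUSDT", "SOLUSDT"],
    timeframe="4h",
    start_date="2024-01-01",
    end_date="2024-06-30",
    max_workers=3,
    output_dir=data_dir,  # passed through **kwargs to download()
)
for symbol, df in frames.items():
    print(f"{symbol}: {len(df)} bars collected")

# Re-run gap detection/filling over the CSV files written during collection
report = fill_gaps(data_dir, symbols=list(frames))
print(
    f"{report['files_processed']} files processed, "
    f"{report['gaps_filled']}/{report['gaps_detected']} gaps filled "
    f"({report['success_rate']:.1f}% success)"
)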