gapless-crypto-clickhouse 7.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. gapless_crypto_clickhouse/__init__.py +147 -0
  2. gapless_crypto_clickhouse/__probe__.py +349 -0
  3. gapless_crypto_clickhouse/api.py +1032 -0
  4. gapless_crypto_clickhouse/clickhouse/__init__.py +17 -0
  5. gapless_crypto_clickhouse/clickhouse/config.py +119 -0
  6. gapless_crypto_clickhouse/clickhouse/connection.py +269 -0
  7. gapless_crypto_clickhouse/clickhouse/schema.sql +98 -0
  8. gapless_crypto_clickhouse/clickhouse/schema_validator.py +312 -0
  9. gapless_crypto_clickhouse/clickhouse_query.py +642 -0
  10. gapless_crypto_clickhouse/collectors/__init__.py +21 -0
  11. gapless_crypto_clickhouse/collectors/binance_public_data_collector.py +1994 -0
  12. gapless_crypto_clickhouse/collectors/clickhouse_bulk_loader.py +446 -0
  13. gapless_crypto_clickhouse/collectors/concurrent_collection_orchestrator.py +407 -0
  14. gapless_crypto_clickhouse/collectors/csv_format_detector.py +123 -0
  15. gapless_crypto_clickhouse/collectors/httpx_downloader.py +395 -0
  16. gapless_crypto_clickhouse/collectors/hybrid_url_generator.py +316 -0
  17. gapless_crypto_clickhouse/exceptions.py +145 -0
  18. gapless_crypto_clickhouse/gap_filling/__init__.py +1 -0
  19. gapless_crypto_clickhouse/gap_filling/safe_file_operations.py +439 -0
  20. gapless_crypto_clickhouse/gap_filling/universal_gap_filler.py +757 -0
  21. gapless_crypto_clickhouse/llms.txt +268 -0
  22. gapless_crypto_clickhouse/probe.py +235 -0
  23. gapless_crypto_clickhouse/py.typed +0 -0
  24. gapless_crypto_clickhouse/query_api.py +374 -0
  25. gapless_crypto_clickhouse/resume/__init__.py +12 -0
  26. gapless_crypto_clickhouse/resume/intelligent_checkpointing.py +383 -0
  27. gapless_crypto_clickhouse/utils/__init__.py +29 -0
  28. gapless_crypto_clickhouse/utils/error_handling.py +202 -0
  29. gapless_crypto_clickhouse/utils/etag_cache.py +194 -0
  30. gapless_crypto_clickhouse/utils/timeframe_constants.py +90 -0
  31. gapless_crypto_clickhouse/utils/timestamp_format_analyzer.py +256 -0
  32. gapless_crypto_clickhouse/utils/timestamp_utils.py +130 -0
  33. gapless_crypto_clickhouse/validation/__init__.py +36 -0
  34. gapless_crypto_clickhouse/validation/csv_validator.py +677 -0
  35. gapless_crypto_clickhouse/validation/models.py +220 -0
  36. gapless_crypto_clickhouse/validation/storage.py +502 -0
  37. gapless_crypto_clickhouse-7.1.0.dist-info/METADATA +1277 -0
  38. gapless_crypto_clickhouse-7.1.0.dist-info/RECORD +40 -0
  39. gapless_crypto_clickhouse-7.1.0.dist-info/WHEEL +4 -0
  40. gapless_crypto_clickhouse-7.1.0.dist-info/licenses/LICENSE +21 -0
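The diff below is the new module gapless_crypto_clickhouse/gap_filling/universal_gap_filler.py (757 added lines). For orientation, the examples embedded in its class docstring point to a function-based entry point; the snippet below is a sketch lifted from those docstring examples, not a verified reference for the installed package:

    import gapless_crypto_clickhouse as gcd

    results = gcd.fill_gaps("./data")  # detect and fill gaps in CSVs under ./data
    print(f"Filled {results['gaps_filled']}/{results['gaps_detected']} gaps")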
@@ -0,0 +1,757 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Universal Gap Filler - Detects and fills ALL gaps in OHLCV CSV files
4
+
5
+ This script automatically detects ALL gaps in any timeframe's CSV file and fills them
6
+ using authentic Binance API data with full 11-column microstructure format.
7
+
8
+ Unlike synthetic data approaches, this filler uses authentic Binance data
9
+ providing complete microstructure columns for professional analysis.
10
+
11
+ Key Features:
12
+ - Auto-detects gaps by analyzing timestamp sequences
13
+ - Uses authentic Binance API with full 11-column microstructure format
14
+ - Handles all timeframes (1s, 1m, 3m, 5m, 15m, 30m, 1h, 2h, 4h, 6h, 8h, 12h, 1d)
15
+ - Provides authentic order flow metrics including trade counts and taker volumes
16
+ - Processes gaps chronologically to maintain data integrity
17
+ - NO synthetic or estimated data - only authentic exchange data
18
+ - API-first validation protocol using authentic Binance data exclusively
19
+ """
20
+
21
+ import logging
22
+ import time
23
+ from datetime import datetime, timezone
24
+ from pathlib import Path
25
+ from typing import Dict, List, Optional
26
+
27
+ import httpx
28
+ import pandas as pd
29
+
30
+ from ..utils.timeframe_constants import (
31
+ TIMEFRAME_TO_BINANCE_INTERVAL,
32
+ TIMEFRAME_TO_PYTHON_TIMEDELTA,
33
+ TIMEFRAME_TO_TIMEDELTA,
34
+ )
35
+
36
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
37
+ logger = logging.getLogger(__name__)
38
+
39
+
40
+ class UniversalGapFiller:
41
+ """Universal gap detection and filling for all timeframes with authentic 11-column microstructure format.
42
+
43
+ Automatically detects and fills timestamp gaps in OHLCV CSV files using authentic
44
+ Binance API data. Provides complete gap detection across all timeframes with
45
+ professional-grade microstructure data including order flow metrics.
46
+
47
+ Unlike synthetic data generators, this gap filler exclusively uses authentic
48
+ Binance market data, ensuring all filled gaps contain real order flow metrics,
49
+ trade counts, and taker volume statistics essential for quantitative analysis.
50
+
51
+ Features:
52
+ - Universal gap detection for any timeframe (1s to 1d)
53
+ - Authentic Binance API data for gap filling (never synthetic)
54
+ - Complete 11-column microstructure format preservation
55
+ - Chronological processing for data integrity
56
+ - Automatic symbol extraction from filenames
57
+ - Batch processing for multiple files
58
+ - Safe atomic operations with backup/rollback
59
+
60
+ Supported Timeframes:
61
+ - 1s: Second-based intervals
62
+ - 1m, 3m, 5m, 15m, 30m: Minute-based intervals
63
+ - 1h, 2h, 4h, 6h, 8h, 12h: Hour-based intervals
64
+ - 1d: Daily intervals
65
+
66
+ Data Quality:
67
+ All gap-filled data maintains the same structure as original Binance data:
68
+ - OHLCV: Open, High, Low, Close, Volume (base asset)
69
+ - Timestamps: Open time, Close time
70
+ - Order Flow: Quote asset volume, Number of trades
71
+ - Taker Metrics: Taker buy base volume, Taker buy quote volume
72
+
73
+ Examples:
74
+ For simple gap filling, consider using the function-based API:
75
+
76
+ >>> import gapless_crypto_clickhouse as gcd
77
+ >>> results = gcd.fill_gaps("./data")
78
+ >>> print(f"Filled {results['gaps_filled']}/{results['gaps_detected']} gaps")
79
+
80
+ Advanced usage with this class for detailed control:
81
+
82
+ >>> gap_filler = UniversalGapFiller()
83
+ >>> gaps = gap_filler.detect_all_gaps("BTCUSDT_1h_2024-01-01_to_2024-12-31.csv", "1h")
84
+ >>> print(f"Found {len(gaps)} gaps")
85
+ >>> success = gap_filler.fill_gap(gaps[0], "BTCUSDT_1h_data.csv", "1h")
86
+ >>> print(f"Gap filled: {success}")
87
+ Found 3 gaps
88
+ Gap filled: True
89
+
90
+ Batch processing for directory:
91
+
92
+ >>> gap_filler = UniversalGapFiller()
93
+ >>> result = gap_filler.process_file("BTCUSDT_1h.csv", "1h")
94
+ >>> print(f"Filled {result['gaps_filled']}/{result['gaps_detected']} gaps")
95
+ Filled 2/3 gaps
96
+
97
+ Custom symbol processing:
98
+
99
+ >>> symbol = gap_filler.extract_symbol_from_filename("SOLUSDT_15m_data.csv")
100
+ >>> print(f"Extracted symbol: {symbol}")
101
+ Extracted symbol: SOLUSDT
102
+
103
+ Note:
104
+ This gap filler requires internet connectivity to fetch authentic data
105
+ from Binance's public API. Rate limiting is automatically handled to
106
+ respect API limits during gap filling operations.
107
+ """
108
+
109
+ # ADR-0021: API endpoint constants for spot and futures
110
+ SPOT_API_URL = "https://api.binance.com/api/v3/klines"
111
+ FUTURES_API_URL = "https://fapi.binance.com/fapi/v1/klines"
112
+
113
+ def __init__(self, instrument_type: str = "spot"):
114
+ """Initialize UniversalGapFiller with instrument type support.
115
+
116
+ Args:
117
+ instrument_type: Instrument type - "spot" or "futures-um" (default: "spot")
118
+
119
+ Raises:
120
+ ValueError: If instrument_type is invalid
121
+ """
122
+ # ADR-0021: Validate instrument type and set API endpoint
123
+ if instrument_type not in ("spot", "futures-um"):
124
+ raise ValueError(
125
+ f"Invalid instrument_type '{instrument_type}'. Must be 'spot' or 'futures-um'"
126
+ )
127
+ self.instrument_type = instrument_type
128
+
129
+ # ADR-0021: API endpoint selection based on instrument type
130
+ if instrument_type == "spot":
131
+ self.binance_base_url = self.SPOT_API_URL
132
+ else: # futures-um
133
+ self.binance_base_url = self.FUTURES_API_URL
134
+
135
+ def extract_symbol_from_filename(self, csv_path) -> str:
136
+ """Extract symbol from CSV filename
137
+
138
+ Supports formats like:
139
+ - binance_spot_BTCUSDT-1h_20240101-20240101_v2.5.0.csv
140
+ - BTCUSDT_1h_data.csv
141
+ - ETHUSDT-4h.csv
142
+ """
143
+ # Handle both string and Path inputs
144
+ if isinstance(csv_path, (str, Path)):
145
+ path_obj = Path(csv_path)
146
+ filename = path_obj.name
147
+ else:
148
+ filename = str(csv_path)
149
+
150
+ # Handle gapless-crypto-data format: binance_spot_SYMBOL-timeframe_dates.csv
151
+ if "binance_spot_" in filename:
152
+ parts = filename.split("_")
153
+ if len(parts) >= 3:
154
+ symbol_part = parts[2] # BTCUSDT-1h
155
+ symbol = symbol_part.split("-")[0] # BTCUSDT
156
+ return symbol
157
+
158
+ # Handle simple formats: SYMBOL_timeframe or SYMBOL-timeframe
159
+ for separator in ["-", "_"]:
160
+ if separator in filename:
161
+ parts = filename.split(separator)
162
+ potential_symbol = parts[0]
163
+ # Check if it looks like a trading pair (ends with USDT, BTC, ETH, etc.)
164
+ if potential_symbol.endswith(("USDT", "BTC", "ETH", "BNB")):
165
+ return potential_symbol
166
+
167
+ # Fallback: look for common trading pairs (top 20 by market cap)
168
+ common_symbols = [
169
+ "BTCUSDT",
170
+ "ETHUSDT",
171
+ "BNBUSDT",
172
+ "SOLUSDT",
173
+ "XRPUSDT",
174
+ "DOGEUSDT",
175
+ "ADAUSDT",
176
+ "AVAXUSDT",
177
+ "DOTUSDT",
178
+ "LINKUSDT",
179
+ "MATICUSDT",
180
+ "LTCUSDT",
181
+ "UNIUSDT",
182
+ "ATOMUSDT",
183
+ "FTMUSDT",
184
+ "NEARUSDT",
185
+ "ALGOUSDT",
186
+ "SANDUSDT",
187
+ "MANAUSDT",
188
+ "APEUSDT",
189
+ ]
190
+ filename_upper = filename.upper()
191
+ for symbol in common_symbols:
192
+ if symbol in filename_upper:
193
+ return symbol
194
+
195
+ # Default fallback (should not happen in practice)
196
+ logger.warning(
197
+ f"⚠️ Could not extract symbol from filename {filename}, defaulting to BTCUSDT"
198
+ )
199
+ return "BTCUSDT"
200
+
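# Illustrative sketch (not part of the packaged module): expected results of the
# symbol extraction above for the filename formats listed in its docstring.
# Assumes: from gapless_crypto_clickhouse.gap_filling.universal_gap_filler import UniversalGapFiller
filler = UniversalGapFiller()
assert filler.extract_symbol_from_filename("binance_spot_BTCUSDT-1h_20240101-20240101_v2.5.0.csv") == "BTCUSDT"
assert filler.extract_symbol_from_filename("ETHUSDT-4h.csv") == "ETHUSDT"
assert filler.extract_symbol_from_filename("SOLUSDT_15m_data.csv") == "SOLUSDT"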
201
+ def detect_all_gaps(self, csv_path: Path, timeframe: str) -> List[Dict]:
202
+ """Detect ALL gaps in CSV file by analyzing timestamp sequence for 11-column format"""
203
+ logger.info(f"🔍 Analyzing {csv_path} for gaps...")
204
+
205
+ # Load CSV data
206
+ ohlcv_dataframe = pd.read_csv(csv_path, comment="#")
207
+ ohlcv_dataframe["date"] = pd.to_datetime(ohlcv_dataframe["date"])
208
+ ohlcv_dataframe = ohlcv_dataframe.sort_values("date")
209
+
210
+ # Calculate expected interval using centralized constants
211
+ expected_interval = TIMEFRAME_TO_PYTHON_TIMEDELTA[timeframe]
212
+
213
+ detected_gaps = []
214
+ for row_index in range(1, len(ohlcv_dataframe)):
215
+ current_time = ohlcv_dataframe.iloc[row_index]["date"]
216
+ previous_time = ohlcv_dataframe.iloc[row_index - 1]["date"]
217
+ actual_gap_duration = current_time - previous_time
218
+
219
+ if actual_gap_duration > expected_interval:
220
+ timestamp_gap_info = {
221
+ "position": row_index,
222
+ "start_time": previous_time + expected_interval,
223
+ "end_time": current_time,
224
+ "duration": actual_gap_duration,
225
+ "expected_interval": expected_interval,
226
+ }
227
+ detected_gaps.append(timestamp_gap_info)
228
+ logger.info(
229
+ f" 📊 Gap {len(detected_gaps)}: {timestamp_gap_info['start_time']} → {timestamp_gap_info['end_time']} ({timestamp_gap_info['duration']})"
230
+ )
231
+
232
+ logger.info(f"✅ Found {len(detected_gaps)} gaps in {timeframe} timeframe")
233
+ return detected_gaps
234
+
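# Illustrative sketch (not part of the packaged module): the detection rule above
# on a toy 1h series. 03:00 is missing, so the 02:00 -> 04:00 step (2h) exceeds
# the expected 1h interval and a gap [03:00, 04:00) is recorded.
import pandas as pd
times = pd.to_datetime(["2024-01-01 01:00", "2024-01-01 02:00", "2024-01-01 04:00"])
step = pd.Timedelta(hours=1)
gaps = [
    {"start_time": prev + step, "end_time": cur}
    for prev, cur in zip(times[:-1], times[1:])
    if cur - prev > step
]
# gaps == [{"start_time": Timestamp("2024-01-01 03:00:00"), "end_time": Timestamp("2024-01-01 04:00:00")}]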
235
+ def fetch_binance_data(
236
+ self,
237
+ start_time: datetime,
238
+ end_time: datetime,
239
+ timeframe: str,
240
+ symbol: str,
241
+ enhanced_format: bool = False,
242
+ ) -> Optional[List[Dict]]:
243
+ """Fetch authentic microstructure data from Binance API - NO synthetic data"""
244
+ binance_interval = TIMEFRAME_TO_BINANCE_INTERVAL[timeframe]
245
+
246
+ # Convert to millisecond timestamps for Binance API
247
+ # ✅ UTC ONLY: all timestamps are UTC; they are converted to millisecond epoch values below
248
+
249
+ # Convert pandas Timestamp to datetime if needed
250
+ if hasattr(start_time, "to_pydatetime"):
251
+ start_time = start_time.to_pydatetime()
252
+ if hasattr(end_time, "to_pydatetime"):
253
+ end_time = end_time.to_pydatetime()
254
+
255
+ # CSV timestamps are naive UTC; attach UTC explicitly before converting,
256
+ # because datetime.timestamp() would otherwise interpret naive values in the
257
+ # local timezone. Binance expects millisecond UTC epoch timestamps.
258
+ start_timestamp_ms = int(start_time.replace(tzinfo=timezone.utc).timestamp() * 1000)
259
+ end_timestamp_ms = int(end_time.replace(tzinfo=timezone.utc).timestamp() * 1000)
260
+
261
+ api_request_params = {
262
+ "symbol": symbol,
263
+ "interval": binance_interval,
264
+ "startTime": start_timestamp_ms,
265
+ "endTime": end_timestamp_ms,
266
+ "limit": 1000,
267
+ }
268
+
269
+ logger.info(f" 📡 Binance API call: {api_request_params}")
270
+
271
+ try:
272
+ http_response = httpx.get(self.binance_base_url, params=api_request_params, timeout=30)
273
+ http_response.raise_for_status()
274
+ binance_klines_data = http_response.json()
275
+
276
+ if not binance_klines_data:
277
+ logger.warning(" ❌ Binance returned no data")
278
+ return None
279
+
280
+ # Convert Binance data to required format with authentic microstructure data
281
+ processed_candles = []
282
+ for raw_candle_data in binance_klines_data:
283
+ # Binance returns: [open_time, open, high, low, close, volume, close_time,
284
+ # quote_asset_volume, number_of_trades, taker_buy_base_asset_volume,
285
+ # taker_buy_quote_asset_volume, ignore]
286
+
287
+ open_time = datetime.fromtimestamp(int(raw_candle_data[0]) / 1000, tz=timezone.utc)
288
+ close_time = datetime.fromtimestamp(int(raw_candle_data[6]) / 1000, tz=timezone.utc)
289
+
290
+ # Only include candles within the gap period (all UTC)
291
+ if start_time <= open_time.replace(tzinfo=None) < end_time:
292
+ # Basic OHLCV data (always included)
293
+ candle_bar_data = {
294
+ "timestamp": open_time.strftime("%Y-%m-%d %H:%M:%S"),
295
+ "open": float(raw_candle_data[1]),
296
+ "high": float(raw_candle_data[2]),
297
+ "low": float(raw_candle_data[3]),
298
+ "close": float(raw_candle_data[4]),
299
+ "volume": float(raw_candle_data[5]),
300
+ }
301
+
302
+ # Add authentic microstructure data for enhanced format
303
+ if enhanced_format:
304
+ candle_bar_data.update(
305
+ {
306
+ "close_time": close_time.strftime("%Y-%m-%d %H:%M:%S"),
307
+ "quote_asset_volume": float(raw_candle_data[7]),
308
+ "number_of_trades": int(raw_candle_data[8]),
309
+ "taker_buy_base_asset_volume": float(raw_candle_data[9]),
310
+ "taker_buy_quote_asset_volume": float(raw_candle_data[10]),
311
+ }
312
+ )
313
+
314
+ processed_candles.append(candle_bar_data)
315
+ logger.info(f" ✅ Retrieved authentic candle: {open_time}")
316
+
317
+ logger.info(f" 📈 Retrieved {len(processed_candles)} authentic candles from Binance")
318
+ return processed_candles
319
+
320
+ except Exception as api_exception:
321
+ logger.error(f" ❌ Binance API error: {api_exception}")
322
+ return None
323
+
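# Illustrative sketch (not part of the packaged module): fetching one hour of
# authentic candles for a gap. Requires network access; the symbol, times, and
# enhanced_format flag are placeholders.
from datetime import datetime
filler = UniversalGapFiller(instrument_type="spot")
candles = filler.fetch_binance_data(
    start_time=datetime(2024, 1, 1, 3, 0),
    end_time=datetime(2024, 1, 1, 4, 0),
    timeframe="1h",
    symbol="BTCUSDT",
    enhanced_format=True,  # include the microstructure fields listed above
)
# Each dict carries: timestamp, open, high, low, close, volume and, with
# enhanced_format=True: close_time, quote_asset_volume, number_of_trades,
# taker_buy_base_asset_volume, taker_buy_quote_asset_volume.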
324
+ def _detect_csv_format(self, existing_data: pd.DataFrame) -> tuple[bool, bool]:
325
+ """Detect CSV format: enhanced (11 columns) vs legacy (6 columns).
326
+
327
+ Args:
328
+ existing_data: Existing CSV data as DataFrame
329
+
330
+ Returns:
331
+ tuple: (is_enhanced_format, is_legacy_format)
332
+ """
333
+ enhanced_columns = [
334
+ "date",
335
+ "open",
336
+ "high",
337
+ "low",
338
+ "close",
339
+ "volume",
340
+ "close_time",
341
+ "quote_asset_volume",
342
+ "number_of_trades",
343
+ "taker_buy_base_asset_volume",
344
+ "taker_buy_quote_asset_volume",
345
+ ]
346
+ legacy_columns = ["date", "open", "high", "low", "close", "volume"]
347
+
348
+ is_enhanced = all(col in existing_data.columns for col in enhanced_columns)
349
+ is_legacy = all(col in existing_data.columns for col in legacy_columns)
350
+
351
+ if is_enhanced:
352
+ logger.info(" 🚀 Enhanced 11-column format detected")
353
+ elif is_legacy:
354
+ logger.info(" 📊 Legacy 6-column format detected")
355
+ else:
356
+ logger.error(f" ❌ Unknown CSV format. Columns: {list(existing_data.columns)}")
357
+
358
+ return is_enhanced, is_legacy
359
+
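# Illustrative sketch (not part of the packaged module): format detection on
# minimal headers. Note the legacy columns are a subset of the enhanced set, so
# an 11-column file sets both flags; callers only branch on is_enhanced.
import pandas as pd
legacy = pd.DataFrame(columns=["date", "open", "high", "low", "close", "volume"])
enhanced = pd.DataFrame(columns=legacy.columns.tolist() + [
    "close_time", "quote_asset_volume", "number_of_trades",
    "taker_buy_base_asset_volume", "taker_buy_quote_asset_volume",
])
UniversalGapFiller()._detect_csv_format(legacy)    # -> (False, True)
UniversalGapFiller()._detect_csv_format(enhanced)  # -> (True, True)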
360
+ def _retrieve_api_data_with_metadata(
361
+ self,
362
+ timestamp_gap_info: Dict,
363
+ trading_timeframe: str,
364
+ extracted_symbol: str,
365
+ is_enhanced_format: bool,
366
+ ) -> tuple[Optional[List[Dict]], Dict]:
367
+ """Retrieve API data and create metadata tracking.
368
+
369
+ Args:
370
+ timestamp_gap_info: Gap information with start/end times
371
+ trading_timeframe: Trading timeframe (e.g., "1h")
372
+ extracted_symbol: Symbol extracted from filename
373
+ is_enhanced_format: Whether enhanced format is used
374
+
375
+ Returns:
376
+ tuple: (authentic_api_data, gap_fill_metadata)
377
+ """
378
+ logger.info(" 🔍 Step 1: Attempting authentic Binance REST API data retrieval")
379
+ authentic_api_data = self.fetch_binance_data(
380
+ timestamp_gap_info["start_time"],
381
+ timestamp_gap_info["end_time"],
382
+ trading_timeframe,
383
+ extracted_symbol,
384
+ enhanced_format=is_enhanced_format,
385
+ )
386
+
387
+ # Initialize metadata
388
+ gap_fill_metadata = {
389
+ "timestamp": timestamp_gap_info["start_time"].strftime("%Y-%m-%d %H:%M:%S"),
390
+ "duration_hours": (
391
+ timestamp_gap_info["end_time"] - timestamp_gap_info["start_time"]
392
+ ).total_seconds()
393
+ / 3600,
394
+ "fill_method": None,
395
+ "data_source": None,
396
+ "authentic_data": False,
397
+ "synthetic_data": False,
398
+ "reason": None,
399
+ "ohlcv": None,
400
+ "microstructure_data": None,
401
+ }
402
+
403
+ if not authentic_api_data:
404
+ logger.warning(" ⚠️ Step 1 Failed: No authentic API data available")
405
+ logger.info(" 🔍 Step 2: Checking if gap is legitimate exchange outage")
406
+ logger.error(" ❌ Gap filling failed: No authentic data available via API")
407
+ logger.info(" 📋 Preserving authentic data integrity - no synthetic fill applied")
408
+ return None, gap_fill_metadata
409
+
410
+ logger.info(
411
+ f" ✅ Step 1 Success: Retrieved {len(authentic_api_data)} authentic candles from API"
412
+ )
413
+
414
+ # Update metadata for successful retrieval
415
+ gap_fill_metadata.update(
416
+ {
417
+ "fill_method": "binance_rest_api",
418
+ "data_source": "https://api.binance.com/api/v3/klines",
419
+ "authentic_data": True,
420
+ "synthetic_data": False,
421
+ "reason": "missing_from_monthly_file_but_available_via_api",
422
+ }
423
+ )
424
+
425
+ # Add OHLCV data to metadata
426
+ if authentic_api_data:
427
+ first_candle = authentic_api_data[0]
428
+ gap_fill_metadata["ohlcv"] = {
429
+ "open": first_candle["open"],
430
+ "high": first_candle["high"],
431
+ "low": first_candle["low"],
432
+ "close": first_candle["close"],
433
+ "volume": first_candle["volume"],
434
+ }
435
+
436
+ if is_enhanced_format and "quote_asset_volume" in first_candle:
437
+ gap_fill_metadata["microstructure_data"] = {
438
+ "quote_asset_volume": first_candle["quote_asset_volume"],
439
+ "number_of_trades": first_candle["number_of_trades"],
440
+ "taker_buy_base_asset_volume": first_candle["taker_buy_base_asset_volume"],
441
+ "taker_buy_quote_asset_volume": first_candle["taker_buy_quote_asset_volume"],
442
+ }
443
+
444
+ return authentic_api_data, gap_fill_metadata
445
+
446
+ def _prepare_api_dataframe(
447
+ self,
448
+ authentic_api_data: List[Dict],
449
+ is_enhanced_format: bool,
450
+ ) -> pd.DataFrame:
451
+ """Convert API data to DataFrame and select appropriate columns.
452
+
453
+ Args:
454
+ authentic_api_data: Raw API data
455
+ is_enhanced_format: Whether to include microstructure columns
456
+
457
+ Returns:
458
+ DataFrame with selected columns
459
+ """
460
+ df = pd.DataFrame(authentic_api_data)
461
+ df["date"] = pd.to_datetime(df["timestamp"])
462
+
463
+ if is_enhanced_format:
464
+ # Enhanced format: include all microstructure columns
465
+ columns = ["date", "open", "high", "low", "close", "volume"]
466
+ if "close_time" in df.columns:
467
+ columns.extend(
468
+ [
469
+ "close_time",
470
+ "quote_asset_volume",
471
+ "number_of_trades",
472
+ "taker_buy_base_asset_volume",
473
+ "taker_buy_quote_asset_volume",
474
+ ]
475
+ )
476
+ return df[columns]
477
+ else:
478
+ # Legacy format: only basic OHLCV columns
479
+ return df[["date", "open", "high", "low", "close", "volume"]]
480
+
481
+ def _filter_to_gap_period(
482
+ self,
483
+ api_dataframe: pd.DataFrame,
484
+ timestamp_gap_info: Dict,
485
+ ) -> Optional[pd.DataFrame]:
486
+ """Filter API data to only include timestamps within gap period.
487
+
488
+ Args:
489
+ api_dataframe: API data as DataFrame
490
+ timestamp_gap_info: Gap information with start/end times
491
+
492
+ Returns:
493
+ Filtered DataFrame or None if no data in gap period
494
+ """
495
+ gap_start = pd.to_datetime(timestamp_gap_info["start_time"])
496
+ gap_end = pd.to_datetime(timestamp_gap_info["end_time"])
497
+
498
+ time_filter = (api_dataframe["date"] >= gap_start) & (api_dataframe["date"] < gap_end)
499
+ filtered = api_dataframe[time_filter].copy()
500
+
501
+ if len(filtered) == 0:
502
+ logger.warning(" ⚠️ No authentic Binance data falls within gap period after filtering")
503
+ return None
504
+
505
+ logger.info(f" 📊 Filtered to {len(filtered)} authentic candles within gap period")
506
+ return filtered
507
+
508
+ def _merge_and_deduplicate(
509
+ self,
510
+ existing_data: pd.DataFrame,
511
+ filtered_api_data: pd.DataFrame,
512
+ ) -> pd.DataFrame:
513
+ """Merge existing and new data, removing duplicates.
514
+
515
+ Args:
516
+ existing_data: Existing CSV data
517
+ filtered_api_data: Filtered API data for gap period
518
+
519
+ Returns:
520
+ Combined DataFrame with duplicates removed
521
+ """
522
+ combined = pd.concat([existing_data, filtered_api_data], ignore_index=True)
523
+
524
+ pre_dedup = len(combined)
525
+ combined = combined.sort_values("date").drop_duplicates(subset=["date"], keep="first")
526
+ duplicates = pre_dedup - len(combined)
527
+
528
+ if duplicates > 0:
529
+ logger.warning(f" ⚠️ Removed {duplicates} duplicate timestamp(s) during gap filling")
530
+
531
+ return combined
532
+
533
+ def _validate_gap_filled(
534
+ self,
535
+ combined_data: pd.DataFrame,
536
+ timestamp_gap_info: Dict,
537
+ trading_timeframe: str,
538
+ ) -> None:
539
+ """Validate that gap was actually filled.
540
+
541
+ Args:
542
+ combined_data: Combined DataFrame after gap filling
543
+ timestamp_gap_info: Gap information with start/end times
544
+ trading_timeframe: Trading timeframe for interval calculation
545
+ """
546
+ gap_start = pd.to_datetime(timestamp_gap_info["start_time"])
547
+ gap_end = pd.to_datetime(timestamp_gap_info["end_time"])
548
+
549
+ sorted_data = combined_data.sort_values("date").reset_index(drop=True)
550
+ remaining_gaps = []
551
+
552
+ for i in range(1, len(sorted_data)):
553
+ current = sorted_data.iloc[i]["date"]
554
+ previous = sorted_data.iloc[i - 1]["date"]
555
+ expected_interval = TIMEFRAME_TO_TIMEDELTA[trading_timeframe]
556
+ actual_diff = current - previous
557
+
558
+ if actual_diff > expected_interval:
559
+ # Check if overlaps with target gap
560
+ if (previous < gap_end) and (current > gap_start):
561
+ remaining_gaps.append(f"{previous} → {current}")
562
+
563
+ if remaining_gaps:
564
+ logger.warning(f" ⚠️ Gap partially filled - remaining gaps: {remaining_gaps}")
565
+
566
+ def _save_with_headers(
567
+ self,
568
+ csv_path: Path,
569
+ dataframe: pd.DataFrame,
570
+ ) -> None:
571
+ """Save DataFrame to CSV with header comments preserved.
572
+
573
+ Args:
574
+ csv_path: Path to CSV file
575
+ dataframe: DataFrame to save
576
+ """
577
+ # Read header comments
578
+ headers = []
579
+ with open(csv_path, "r") as f:
580
+ for line in f:
581
+ if line.startswith("#"):
582
+ headers.append(line.rstrip())
583
+ else:
584
+ break
585
+
586
+ # Write headers + data
587
+ with open(csv_path, "w") as f:
588
+ for header in headers:
589
+ f.write(header + "\n")
590
+ dataframe.to_csv(f, index=False)
591
+
592
+ def fill_gap(
593
+ self,
594
+ timestamp_gap_info: Dict,
595
+ csv_path: Path,
596
+ trading_timeframe: str,
597
+ ) -> bool:
598
+ """Fill a single gap with authentic Binance data using API-first validation protocol."""
599
+ logger.info(
600
+ f"🔧 Filling gap: {timestamp_gap_info['start_time']} → {timestamp_gap_info['end_time']}"
601
+ )
602
+ logger.info(" 📋 Applying API-first validation protocol")
603
+
604
+ # Load and detect format
605
+ existing_data = pd.read_csv(csv_path, comment="#")
606
+ existing_data["date"] = pd.to_datetime(existing_data["date"])
607
+
608
+ is_enhanced, is_legacy = self._detect_csv_format(existing_data)
609
+ if not is_enhanced and not is_legacy:
610
+ return False
611
+
612
+ # Extract symbol and retrieve API data
613
+ extracted_symbol = self.extract_symbol_from_filename(csv_path)
614
+ filename = Path(csv_path).name if isinstance(csv_path, str) else csv_path.name
615
+ logger.info(f" 🎯 Extracted symbol: {extracted_symbol} from file: {filename}")
616
+
617
+ api_data, metadata = self._retrieve_api_data_with_metadata(
618
+ timestamp_gap_info, trading_timeframe, extracted_symbol, is_enhanced
619
+ )
620
+
621
+ if not api_data:
622
+ return False
623
+
624
+ # Prepare and filter API data
625
+ api_df = self._prepare_api_dataframe(api_data, is_enhanced)
626
+ filtered_df = self._filter_to_gap_period(api_df, timestamp_gap_info)
627
+
628
+ if filtered_df is None:
629
+ return False
630
+
631
+ # Merge and validate
632
+ combined = self._merge_and_deduplicate(existing_data, filtered_df)
633
+ self._validate_gap_filled(combined, timestamp_gap_info, trading_timeframe)
634
+
635
+ # Save results
636
+ self._save_with_headers(csv_path, combined)
637
+
638
+ logger.info(f" ✅ Gap filled with {len(filtered_df)} authentic candles")
639
+ return True
640
+
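# Illustrative sketch (not part of the packaged module): detect-then-fill flow
# for a single file, mirroring the class docstring examples; the path and
# timeframe are placeholders.
from pathlib import Path
filler = UniversalGapFiller()
csv_file = Path("BTCUSDT_1h_data.csv")
for gap in filler.detect_all_gaps(csv_file, "1h"):
    ok = filler.fill_gap(gap, csv_file, "1h")
    print(f"{gap['start_time']} -> {gap['end_time']}: {'filled' if ok else 'not filled'}")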
641
+ def process_file(self, csv_path: Path, trading_timeframe: str) -> Dict:
642
+ """Process a single CSV file - detect and fill ALL gaps"""
643
+ logger.info(f"🎯 Processing {csv_path} ({trading_timeframe})")
644
+
645
+ # Detect all gaps
646
+ detected_gaps = self.detect_all_gaps(csv_path, trading_timeframe)
647
+
648
+ if not detected_gaps:
649
+ logger.info(f" ✅ No gaps found in {trading_timeframe}")
650
+ return {
651
+ "timeframe": trading_timeframe,
652
+ "gaps_detected": 0,
653
+ "gaps_filled": 0,
654
+ "gaps_failed": 0,
655
+ "success_rate": 100.0,
656
+ }
657
+
658
+ # Fill each gap
659
+ gaps_filled_count = 0
660
+ gaps_failed_count = 0
661
+
662
+ for gap_index, timestamp_gap in enumerate(detected_gaps, 1):
663
+ logger.info(f" 🔧 Processing gap {gap_index}/{len(detected_gaps)}")
664
+ if self.fill_gap(timestamp_gap, csv_path, trading_timeframe):
665
+ gaps_filled_count += 1
666
+ else:
667
+ gaps_failed_count += 1
668
+
669
+ # Brief pause between API calls
670
+ if gap_index < len(detected_gaps):
671
+ time.sleep(1)
672
+
673
+ gap_fill_success_rate = (
674
+ (gaps_filled_count / len(detected_gaps)) * 100 if detected_gaps else 100.0
675
+ )
676
+
677
+ processing_result = {
678
+ "timeframe": trading_timeframe,
679
+ "gaps_detected": len(detected_gaps),
680
+ "gaps_filled": gaps_filled_count,
681
+ "gaps_failed": gaps_failed_count,
682
+ "success_rate": gap_fill_success_rate,
683
+ }
684
+
685
+ logger.info(
686
+ f" 📊 Result: {gaps_filled_count}/{len(detected_gaps)} gaps filled ({gap_fill_success_rate:.1f}%)"
687
+ )
688
+ return processing_result
689
+
690
+
691
+ def main():
692
+ """Main execution function"""
693
+ logger.info("🚀 UNIVERSAL GAP FILLER - Fill ALL Gaps in ALL Timeframes")
694
+ logger.info("=" * 60)
695
+
696
+ gap_filler_instance = UniversalGapFiller()
697
+ sample_data_directory = Path("../sample_data")
698
+
699
+ # Define timeframes that need gap filling (exclude 4h which is perfect)
700
+ target_trading_timeframes = ["1m", "3m", "5m", "15m", "30m", "1h", "2h"]
701
+
702
+ processing_results = []
703
+
704
+ for trading_timeframe in target_trading_timeframes:
705
+ csv_file_pattern = f"binance_spot_SOLUSDT-{trading_timeframe}_*.csv"
706
+ matching_csv_files = list(sample_data_directory.glob(csv_file_pattern))
707
+
708
+ if not matching_csv_files:
709
+ logger.warning(f"❌ No CSV file found for {trading_timeframe}")
710
+ continue
711
+
712
+ selected_csv_file = matching_csv_files[0] # Use first match
713
+ timeframe_result = gap_filler_instance.process_file(selected_csv_file, trading_timeframe)
714
+ processing_results.append(timeframe_result)
715
+
716
+ # Summary report
717
+ logger.info("\n" + "=" * 60)
718
+ logger.info("📊 UNIVERSAL GAP FILLING SUMMARY")
719
+ logger.info("=" * 60)
720
+
721
+ total_gaps_detected_count = sum(
722
+ result_data["gaps_detected"] for result_data in processing_results
723
+ )
724
+ total_gaps_filled_count = sum(result_data["gaps_filled"] for result_data in processing_results)
725
+ total_gaps_failed_count = sum(result_data["gaps_failed"] for result_data in processing_results)
726
+
727
+ for timeframe_result in processing_results:
728
+ status_icon = (
729
+ "✅"
730
+ if timeframe_result["success_rate"] == 100.0
731
+ else "⚠️"
732
+ if timeframe_result["success_rate"] > 0
733
+ else "❌"
734
+ )
735
+ logger.info(
736
+ f"{status_icon} {timeframe_result['timeframe']:>3}: {timeframe_result['gaps_filled']:>2}/{timeframe_result['gaps_detected']:>2} gaps filled ({timeframe_result['success_rate']:>5.1f}%)"
737
+ )
738
+
739
+ logger.info("-" * 60)
740
+ overall_success_rate = (
741
+ (total_gaps_filled_count / total_gaps_detected_count * 100)
742
+ if total_gaps_detected_count > 0
743
+ else 100.0
744
+ )
745
+ logger.info(
746
+ f"🎯 OVERALL: {total_gaps_filled_count}/{total_gaps_detected_count} gaps filled ({overall_success_rate:.1f}%)"
747
+ )
748
+ logger.info("=" * 60)
749
+
750
+ if overall_success_rate == 100.0:
751
+ logger.info("🎉 ALL GAPS FILLED SUCCESSFULLY! Ready for validation.")
752
+ else:
753
+ logger.warning(f"⚠️ {total_gaps_failed_count} gaps failed to fill. Manual review needed.")
754
+
755
+
756
+ if __name__ == "__main__":
757
+ main()
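Because the module uses relative imports (from ..utils.timeframe_constants import ...), the __main__ block above presumably has to run with package context rather than as a standalone script. A minimal sketch of programmatic use, assuming only the module path shown in the file listing:

    from pathlib import Path

    from gapless_crypto_clickhouse.gap_filling.universal_gap_filler import (
        UniversalGapFiller,
        main,
    )

    # Batch mode: scans ../sample_data for SOLUSDT files, as coded in main() above.
    main()

    # Targeted mode: process a single file for one timeframe.
    result = UniversalGapFiller().process_file(Path("BTCUSDT_1h.csv"), "1h")
    print(result["gaps_filled"], "of", result["gaps_detected"], "gaps filled")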