gapless-crypto-clickhouse 7.1.0 (py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gapless_crypto_clickhouse/__init__.py +147 -0
- gapless_crypto_clickhouse/__probe__.py +349 -0
- gapless_crypto_clickhouse/api.py +1032 -0
- gapless_crypto_clickhouse/clickhouse/__init__.py +17 -0
- gapless_crypto_clickhouse/clickhouse/config.py +119 -0
- gapless_crypto_clickhouse/clickhouse/connection.py +269 -0
- gapless_crypto_clickhouse/clickhouse/schema.sql +98 -0
- gapless_crypto_clickhouse/clickhouse/schema_validator.py +312 -0
- gapless_crypto_clickhouse/clickhouse_query.py +642 -0
- gapless_crypto_clickhouse/collectors/__init__.py +21 -0
- gapless_crypto_clickhouse/collectors/binance_public_data_collector.py +1994 -0
- gapless_crypto_clickhouse/collectors/clickhouse_bulk_loader.py +446 -0
- gapless_crypto_clickhouse/collectors/concurrent_collection_orchestrator.py +407 -0
- gapless_crypto_clickhouse/collectors/csv_format_detector.py +123 -0
- gapless_crypto_clickhouse/collectors/httpx_downloader.py +395 -0
- gapless_crypto_clickhouse/collectors/hybrid_url_generator.py +316 -0
- gapless_crypto_clickhouse/exceptions.py +145 -0
- gapless_crypto_clickhouse/gap_filling/__init__.py +1 -0
- gapless_crypto_clickhouse/gap_filling/safe_file_operations.py +439 -0
- gapless_crypto_clickhouse/gap_filling/universal_gap_filler.py +757 -0
- gapless_crypto_clickhouse/llms.txt +268 -0
- gapless_crypto_clickhouse/probe.py +235 -0
- gapless_crypto_clickhouse/py.typed +0 -0
- gapless_crypto_clickhouse/query_api.py +374 -0
- gapless_crypto_clickhouse/resume/__init__.py +12 -0
- gapless_crypto_clickhouse/resume/intelligent_checkpointing.py +383 -0
- gapless_crypto_clickhouse/utils/__init__.py +29 -0
- gapless_crypto_clickhouse/utils/error_handling.py +202 -0
- gapless_crypto_clickhouse/utils/etag_cache.py +194 -0
- gapless_crypto_clickhouse/utils/timeframe_constants.py +90 -0
- gapless_crypto_clickhouse/utils/timestamp_format_analyzer.py +256 -0
- gapless_crypto_clickhouse/utils/timestamp_utils.py +130 -0
- gapless_crypto_clickhouse/validation/__init__.py +36 -0
- gapless_crypto_clickhouse/validation/csv_validator.py +677 -0
- gapless_crypto_clickhouse/validation/models.py +220 -0
- gapless_crypto_clickhouse/validation/storage.py +502 -0
- gapless_crypto_clickhouse-7.1.0.dist-info/METADATA +1277 -0
- gapless_crypto_clickhouse-7.1.0.dist-info/RECORD +40 -0
- gapless_crypto_clickhouse-7.1.0.dist-info/WHEEL +4 -0
- gapless_crypto_clickhouse-7.1.0.dist-info/licenses/LICENSE +21 -0
gapless_crypto_clickhouse/gap_filling/universal_gap_filler.py
@@ -0,0 +1,757 @@
#!/usr/bin/env python3
"""
Universal Gap Filler - Detects and fills ALL gaps in OHLCV CSV files

This script automatically detects ALL gaps in any timeframe's CSV file and fills them
using authentic Binance API data with full 11-column microstructure format.

Unlike synthetic data approaches, this filler uses authentic Binance data
providing complete microstructure columns for professional analysis.

Key Features:
- Auto-detects gaps by analyzing timestamp sequences
- Uses authentic Binance API with full 11-column microstructure format
- Handles all timeframes (1s, 1m, 3m, 5m, 15m, 30m, 1h, 2h, 4h, 6h, 8h, 12h, 1d)
- Provides authentic order flow metrics including trade counts and taker volumes
- Processes gaps chronologically to maintain data integrity
- NO synthetic or estimated data - only authentic exchange data
- API-first validation protocol using authentic Binance data exclusively
"""

import logging
import time
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional

import httpx
import pandas as pd

from ..utils.timeframe_constants import (
    TIMEFRAME_TO_BINANCE_INTERVAL,
    TIMEFRAME_TO_PYTHON_TIMEDELTA,
    TIMEFRAME_TO_TIMEDELTA,
)

logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)
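The gap detector and the API fetcher below both key off the mapping tables imported from ..utils.timeframe_constants, a module listed in the manifest above (+90) but not expanded in this section. The sketch below illustrates how such mappings are plausibly shaped and consumed; the concrete entries, and any difference between TIMEFRAME_TO_PYTHON_TIMEDELTA and TIMEFRAME_TO_TIMEDELTA, are assumptions for illustration rather than the package's actual definitions.

# Illustrative sketch only; not part of universal_gap_filler.py.
# Assumed shape of the tables defined in utils/timeframe_constants.py.
from datetime import timedelta

TIMEFRAME_TO_BINANCE_INTERVAL = {"1m": "1m", "15m": "15m", "1h": "1h", "1d": "1d"}
TIMEFRAME_TO_PYTHON_TIMEDELTA = {
    "1m": timedelta(minutes=1),
    "15m": timedelta(minutes=15),
    "1h": timedelta(hours=1),
    "1d": timedelta(days=1),
}

# detect_all_gaps() compares consecutive row timestamps against this interval,
# and fetch_binance_data() passes the interval string straight to the API.
assert TIMEFRAME_TO_PYTHON_TIMEDELTA["1h"] == timedelta(hours=1)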

class UniversalGapFiller:
    """Universal gap detection and filling for all timeframes with authentic 11-column microstructure format.

    Automatically detects and fills timestamp gaps in OHLCV CSV files using authentic
    Binance API data. Provides complete gap detection across all timeframes with
    professional-grade microstructure data including order flow metrics.

    Unlike synthetic data generators, this gap filler exclusively uses authentic
    Binance market data, ensuring all filled gaps contain real order flow metrics,
    trade counts, and taker volume statistics essential for quantitative analysis.

    Features:
    - Universal gap detection for any timeframe (1s to 1d)
    - Authentic Binance API data for gap filling (never synthetic)
    - Complete 11-column microstructure format preservation
    - Chronological processing for data integrity
    - Automatic symbol extraction from filenames
    - Batch processing for multiple files
    - Safe atomic operations with backup/rollback

    Supported Timeframes:
    - 1s: Second-based intervals
    - 1m, 3m, 5m, 15m, 30m: Minute-based intervals
    - 1h, 2h, 4h, 6h, 8h, 12h: Hour-based intervals
    - 1d: Daily intervals

    Data Quality:
    All gap-filled data maintains the same structure as original Binance data:
    - OHLCV: Open, High, Low, Close, Volume (base asset)
    - Timestamps: Open time, Close time
    - Order Flow: Quote asset volume, Number of trades
    - Taker Metrics: Taker buy base volume, Taker buy quote volume

    Examples:
        For simple gap filling, consider using the function-based API:

        >>> import gapless_crypto_clickhouse as gcd
        >>> results = gcd.fill_gaps("./data")
        >>> print(f"Filled {results['gaps_filled']}/{results['gaps_detected']} gaps")

        Advanced usage with this class for detailed control:

        >>> gap_filler = UniversalGapFiller()
        >>> gaps = gap_filler.detect_all_gaps("BTCUSDT_1h_2024-01-01_to_2024-12-31.csv", "1h")
        >>> print(f"Found {len(gaps)} gaps")
        Found 3 gaps
        >>> success = gap_filler.fill_gap(gaps[0], "BTCUSDT_1h_data.csv", "1h")
        >>> print(f"Gap filled: {success}")
        Gap filled: True

        Batch processing for directory:

        >>> gap_filler = UniversalGapFiller()
        >>> result = gap_filler.process_file("BTCUSDT_1h.csv", "1h")
        >>> print(f"Filled {result['gaps_filled']}/{result['gaps_detected']} gaps")
        Filled 2/3 gaps

        Custom symbol processing:

        >>> symbol = gap_filler.extract_symbol_from_filename("SOLUSDT_15m_data.csv")
        >>> print(f"Extracted symbol: {symbol}")
        Extracted symbol: SOLUSDT

    Note:
        This gap filler requires internet connectivity to fetch authentic data
        from Binance's public API. Rate limiting is automatically handled to
        respect API limits during gap filling operations.
    """

    # ADR-0021: API endpoint constants for spot and futures
    SPOT_API_URL = "https://api.binance.com/api/v3/klines"
    FUTURES_API_URL = "https://fapi.binance.com/fapi/v1/klines"

    def __init__(self, instrument_type: str = "spot"):
        """Initialize UniversalGapFiller with instrument type support.

        Args:
            instrument_type: Instrument type - "spot" or "futures-um" (default: "spot")

        Raises:
            ValueError: If instrument_type is invalid
        """
        # ADR-0021: Validate instrument type and set API endpoint
        if instrument_type not in ("spot", "futures-um"):
            raise ValueError(
                f"Invalid instrument_type '{instrument_type}'. Must be 'spot' or 'futures-um'"
            )
        self.instrument_type = instrument_type

        # ADR-0021: API endpoint selection based on instrument type
        if instrument_type == "spot":
            self.binance_base_url = self.SPOT_API_URL
        else:  # futures-um
            self.binance_base_url = self.FUTURES_API_URL

    def extract_symbol_from_filename(self, csv_path) -> str:
        """Extract symbol from CSV filename

        Supports formats like:
        - binance_spot_BTCUSDT-1h_20240101-20240101_v2.5.0.csv
        - BTCUSDT_1h_data.csv
        - ETHUSDT-4h.csv
        """
        # Handle both string and Path inputs
        if isinstance(csv_path, (str, Path)):
            path_obj = Path(csv_path)
            filename = path_obj.name
        else:
            filename = str(csv_path)

        # Handle gapless-crypto-data format: binance_spot_SYMBOL-timeframe_dates.csv
        if "binance_spot_" in filename:
            parts = filename.split("_")
            if len(parts) >= 3:
                symbol_part = parts[2]  # BTCUSDT-1h
                symbol = symbol_part.split("-")[0]  # BTCUSDT
                return symbol

        # Handle simple formats: SYMBOL_timeframe or SYMBOL-timeframe
        for separator in ["-", "_"]:
            if separator in filename:
                parts = filename.split(separator)
                potential_symbol = parts[0]
                # Check if it looks like a trading pair (ends with USDT, BTC, ETH, etc.)
                if potential_symbol.endswith(("USDT", "BTC", "ETH", "BNB")):
                    return potential_symbol

        # Fallback: look for common trading pairs (top 20 by market cap)
        common_symbols = [
            "BTCUSDT",
            "ETHUSDT",
            "BNBUSDT",
            "SOLUSDT",
            "XRPUSDT",
            "DOGEUSDT",
            "ADAUSDT",
            "AVAXUSDT",
            "DOTUSDT",
            "LINKUSDT",
            "MATICUSDT",
            "LTCUSDT",
            "UNIUSDT",
            "ATOMUSDT",
            "FTMUSDT",
            "NEARUSDT",
            "ALGOUSDT",
            "SANDUSDT",
            "MANAUSDT",
            "APEUSDT",
        ]
        filename_upper = filename.upper()
        for symbol in common_symbols:
            if symbol in filename_upper:
                return symbol

        # Default fallback (should not happen in practice)
        logger.warning(
            f"⚠️ Could not extract symbol from filename {filename}, defaulting to BTCUSDT"
        )
        return "BTCUSDT"

    def detect_all_gaps(self, csv_path: Path, timeframe: str) -> List[Dict]:
        """Detect ALL gaps in CSV file by analyzing timestamp sequence for 11-column format"""
        logger.info(f"🔍 Analyzing {csv_path} for gaps...")

        # Load CSV data
        ohlcv_dataframe = pd.read_csv(csv_path, comment="#")
        ohlcv_dataframe["date"] = pd.to_datetime(ohlcv_dataframe["date"])
        ohlcv_dataframe = ohlcv_dataframe.sort_values("date")

        # Calculate expected interval using centralized constants
        expected_interval = TIMEFRAME_TO_PYTHON_TIMEDELTA[timeframe]

        detected_gaps = []
        for row_index in range(1, len(ohlcv_dataframe)):
            current_time = ohlcv_dataframe.iloc[row_index]["date"]
            previous_time = ohlcv_dataframe.iloc[row_index - 1]["date"]
            actual_gap_duration = current_time - previous_time

            if actual_gap_duration > expected_interval:
                timestamp_gap_info = {
                    "position": row_index,
                    "start_time": previous_time + expected_interval,
                    "end_time": current_time,
                    "duration": actual_gap_duration,
                    "expected_interval": expected_interval,
                }
                detected_gaps.append(timestamp_gap_info)
                logger.info(
                    f" 📊 Gap {len(detected_gaps)}: {timestamp_gap_info['start_time']} → {timestamp_gap_info['end_time']} ({timestamp_gap_info['duration']})"
                )

        logger.info(f"✅ Found {len(detected_gaps)} gaps in {timeframe} timeframe")
        return detected_gaps
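The detection loop above reduces to one rule: flag a gap whenever two consecutive rows are separated by more than one expected interval. A standalone sketch of that rule on a toy hourly timestamp series (the timestamps are made up for illustration):

# Illustrative sketch of the detection rule; not part of the package file.
import pandas as pd
from datetime import timedelta

timestamps = pd.to_datetime([
    "2024-01-01 00:00", "2024-01-01 01:00",
    "2024-01-01 04:00",            # the 02:00 and 03:00 candles are missing
    "2024-01-01 05:00",
])
expected = timedelta(hours=1)

for prev, curr in zip(timestamps, timestamps[1:]):
    if curr - prev > expected:
        print(f"gap: {prev + expected} -> {curr}")
# gap: 2024-01-01 02:00:00 -> 2024-01-01 04:00:00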

    def fetch_binance_data(
        self,
        start_time: datetime,
        end_time: datetime,
        timeframe: str,
        symbol: str,
        enhanced_format: bool = False,
    ) -> Optional[List[Dict]]:
        """Fetch authentic microstructure data from Binance API - NO synthetic data"""
        binance_interval = TIMEFRAME_TO_BINANCE_INTERVAL[timeframe]

        # Convert to millisecond timestamps for the Binance API
        # All CSV timestamps are naive UTC

        # Convert pandas Timestamp to datetime if needed
        if hasattr(start_time, "to_pydatetime"):
            start_time = start_time.to_pydatetime()
        if hasattr(end_time, "to_pydatetime"):
            end_time = end_time.to_pydatetime()

        # Naive datetimes are converted with datetime.timestamp(), which resolves
        # them against the machine's local timezone when producing the epoch
        # milliseconds sent to the Binance API
        start_timestamp_ms = int(start_time.timestamp() * 1000)
        end_timestamp_ms = int(end_time.timestamp() * 1000)

        api_request_params = {
            "symbol": symbol,
            "interval": binance_interval,
            "startTime": start_timestamp_ms,
            "endTime": end_timestamp_ms,
            "limit": 1000,
        }

        logger.info(f" 📡 Binance API call: {api_request_params}")

        try:
            http_response = httpx.get(self.binance_base_url, params=api_request_params, timeout=30)
            http_response.raise_for_status()
            binance_klines_data = http_response.json()

            if not binance_klines_data:
                logger.warning(" ❌ Binance returned no data")
                return None

            # Convert Binance data to required format with authentic microstructure data
            processed_candles = []
            for raw_candle_data in binance_klines_data:
                # Binance returns: [open_time, open, high, low, close, volume, close_time,
                # quote_asset_volume, number_of_trades, taker_buy_base_asset_volume,
                # taker_buy_quote_asset_volume, ignore]

                open_time = datetime.fromtimestamp(int(raw_candle_data[0]) / 1000)
                close_time = datetime.fromtimestamp(int(raw_candle_data[6]) / 1000)

                # Only include candles within the gap period (all UTC)
                if start_time <= open_time.replace(tzinfo=None) < end_time:
                    # Basic OHLCV data (always included)
                    candle_bar_data = {
                        "timestamp": open_time.strftime("%Y-%m-%d %H:%M:%S"),
                        "open": float(raw_candle_data[1]),
                        "high": float(raw_candle_data[2]),
                        "low": float(raw_candle_data[3]),
                        "close": float(raw_candle_data[4]),
                        "volume": float(raw_candle_data[5]),
                    }

                    # Add authentic microstructure data for enhanced format
                    if enhanced_format:
                        candle_bar_data.update(
                            {
                                "close_time": close_time.strftime("%Y-%m-%d %H:%M:%S"),
                                "quote_asset_volume": float(raw_candle_data[7]),
                                "number_of_trades": int(raw_candle_data[8]),
                                "taker_buy_base_asset_volume": float(raw_candle_data[9]),
                                "taker_buy_quote_asset_volume": float(raw_candle_data[10]),
                            }
                        )

                    processed_candles.append(candle_bar_data)
                    logger.info(f" ✅ Retrieved authentic candle: {open_time}")

            logger.info(f" 📈 Retrieved {len(processed_candles)} authentic candles from Binance")
            return processed_candles

        except Exception as api_exception:
            logger.error(f" ❌ Binance API error: {api_exception}")
            return None
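For reference, the REST call issued above can be reproduced in isolation. The sketch below queries the spot klines endpoint directly with httpx and relies on the 12-field row layout documented in the loop above; the symbol, interval, and time window are arbitrary examples, and a live network connection is required.

# Illustrative sketch of the raw klines request; not part of the package file.
import httpx

params = {
    "symbol": "BTCUSDT",
    "interval": "1h",
    "startTime": 1704067200000,  # 2024-01-01 00:00:00 UTC in epoch ms
    "endTime": 1704074400000,    # 2024-01-01 02:00:00 UTC
    "limit": 1000,
}
response = httpx.get("https://api.binance.com/api/v3/klines", params=params, timeout=30)
response.raise_for_status()
klines = response.json()

# Each row: [open_time, open, high, low, close, volume, close_time,
#            quote_asset_volume, number_of_trades, taker_buy_base_volume,
#            taker_buy_quote_volume, ignore]
print(len(klines), klines[0][0], klines[0][4])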

    def _detect_csv_format(self, existing_data: pd.DataFrame) -> tuple[bool, bool]:
        """Detect CSV format: enhanced (11 columns) vs legacy (6 columns).

        Args:
            existing_data: Existing CSV data as DataFrame

        Returns:
            tuple: (is_enhanced_format, is_legacy_format)
        """
        enhanced_columns = [
            "date",
            "open",
            "high",
            "low",
            "close",
            "volume",
            "close_time",
            "quote_asset_volume",
            "number_of_trades",
            "taker_buy_base_asset_volume",
            "taker_buy_quote_asset_volume",
        ]
        legacy_columns = ["date", "open", "high", "low", "close", "volume"]

        is_enhanced = all(col in existing_data.columns for col in enhanced_columns)
        is_legacy = all(col in existing_data.columns for col in legacy_columns)

        if is_enhanced:
            logger.info(" 🚀 Enhanced 11-column format detected")
        elif is_legacy:
            logger.info(" 📊 Legacy 6-column format detected")
        else:
            logger.error(f" ❌ Unknown CSV format. Columns: {list(existing_data.columns)}")

        return is_enhanced, is_legacy

    def _retrieve_api_data_with_metadata(
        self,
        timestamp_gap_info: Dict,
        trading_timeframe: str,
        extracted_symbol: str,
        is_enhanced_format: bool,
    ) -> tuple[Optional[List[Dict]], Dict]:
        """Retrieve API data and create metadata tracking.

        Args:
            timestamp_gap_info: Gap information with start/end times
            trading_timeframe: Trading timeframe (e.g., "1h")
            extracted_symbol: Symbol extracted from filename
            is_enhanced_format: Whether enhanced format is used

        Returns:
            tuple: (authentic_api_data, gap_fill_metadata)
        """
        logger.info(" 🔍 Step 1: Attempting authentic Binance REST API data retrieval")
        authentic_api_data = self.fetch_binance_data(
            timestamp_gap_info["start_time"],
            timestamp_gap_info["end_time"],
            trading_timeframe,
            extracted_symbol,
            enhanced_format=is_enhanced_format,
        )

        # Initialize metadata
        gap_fill_metadata = {
            "timestamp": timestamp_gap_info["start_time"].strftime("%Y-%m-%d %H:%M:%S"),
            "duration_hours": (
                timestamp_gap_info["end_time"] - timestamp_gap_info["start_time"]
            ).total_seconds()
            / 3600,
            "fill_method": None,
            "data_source": None,
            "authentic_data": False,
            "synthetic_data": False,
            "reason": None,
            "ohlcv": None,
            "microstructure_data": None,
        }

        if not authentic_api_data:
            logger.warning(" ⚠️ Step 1 Failed: No authentic API data available")
            logger.info(" 🔍 Step 2: Checking if gap is legitimate exchange outage")
            logger.error(" ❌ Gap filling failed: No authentic data available via API")
            logger.info(" 📋 Preserving authentic data integrity - no synthetic fill applied")
            return None, gap_fill_metadata

        logger.info(
            f" ✅ Step 1 Success: Retrieved {len(authentic_api_data)} authentic candles from API"
        )

        # Update metadata for successful retrieval
        gap_fill_metadata.update(
            {
                "fill_method": "binance_rest_api",
                "data_source": self.binance_base_url,  # spot or futures klines endpoint
                "authentic_data": True,
                "synthetic_data": False,
                "reason": "missing_from_monthly_file_but_available_via_api",
            }
        )

        # Add OHLCV data to metadata
        if authentic_api_data:
            first_candle = authentic_api_data[0]
            gap_fill_metadata["ohlcv"] = {
                "open": first_candle["open"],
                "high": first_candle["high"],
                "low": first_candle["low"],
                "close": first_candle["close"],
                "volume": first_candle["volume"],
            }

            if is_enhanced_format and "quote_asset_volume" in first_candle:
                gap_fill_metadata["microstructure_data"] = {
                    "quote_asset_volume": first_candle["quote_asset_volume"],
                    "number_of_trades": first_candle["number_of_trades"],
                    "taker_buy_base_asset_volume": first_candle["taker_buy_base_asset_volume"],
                    "taker_buy_quote_asset_volume": first_candle["taker_buy_quote_asset_volume"],
                }

        return authentic_api_data, gap_fill_metadata

    def _prepare_api_dataframe(
        self,
        authentic_api_data: List[Dict],
        is_enhanced_format: bool,
    ) -> pd.DataFrame:
        """Convert API data to DataFrame and select appropriate columns.

        Args:
            authentic_api_data: Raw API data
            is_enhanced_format: Whether to include microstructure columns

        Returns:
            DataFrame with selected columns
        """
        df = pd.DataFrame(authentic_api_data)
        df["date"] = pd.to_datetime(df["timestamp"])

        if is_enhanced_format:
            # Enhanced format: include all microstructure columns
            columns = ["date", "open", "high", "low", "close", "volume"]
            if "close_time" in df.columns:
                columns.extend(
                    [
                        "close_time",
                        "quote_asset_volume",
                        "number_of_trades",
                        "taker_buy_base_asset_volume",
                        "taker_buy_quote_asset_volume",
                    ]
                )
            return df[columns]
        else:
            # Legacy format: only basic OHLCV columns
            return df[["date", "open", "high", "low", "close", "volume"]]

    def _filter_to_gap_period(
        self,
        api_dataframe: pd.DataFrame,
        timestamp_gap_info: Dict,
    ) -> Optional[pd.DataFrame]:
        """Filter API data to only include timestamps within gap period.

        Args:
            api_dataframe: API data as DataFrame
            timestamp_gap_info: Gap information with start/end times

        Returns:
            Filtered DataFrame or None if no data in gap period
        """
        gap_start = pd.to_datetime(timestamp_gap_info["start_time"])
        gap_end = pd.to_datetime(timestamp_gap_info["end_time"])

        time_filter = (api_dataframe["date"] >= gap_start) & (api_dataframe["date"] < gap_end)
        filtered = api_dataframe[time_filter].copy()

        if len(filtered) == 0:
            logger.warning(" ⚠️ No authentic Binance data falls within gap period after filtering")
            return None

        logger.info(f" 📊 Filtered to {len(filtered)} authentic candles within gap period")
        return filtered

    def _merge_and_deduplicate(
        self,
        existing_data: pd.DataFrame,
        filtered_api_data: pd.DataFrame,
    ) -> pd.DataFrame:
        """Merge existing and new data, removing duplicates.

        Args:
            existing_data: Existing CSV data
            filtered_api_data: Filtered API data for gap period

        Returns:
            Combined DataFrame with duplicates removed
        """
        combined = pd.concat([existing_data, filtered_api_data], ignore_index=True)

        pre_dedup = len(combined)
        combined = combined.sort_values("date").drop_duplicates(subset=["date"], keep="first")
        duplicates = pre_dedup - len(combined)

        if duplicates > 0:
            logger.warning(f" ⚠️ Removed {duplicates} duplicate timestamp(s) during gap filling")

        return combined

    def _validate_gap_filled(
        self,
        combined_data: pd.DataFrame,
        timestamp_gap_info: Dict,
        trading_timeframe: str,
    ) -> None:
        """Validate that gap was actually filled.

        Args:
            combined_data: Combined DataFrame after gap filling
            timestamp_gap_info: Gap information with start/end times
            trading_timeframe: Trading timeframe for interval calculation
        """
        gap_start = pd.to_datetime(timestamp_gap_info["start_time"])
        gap_end = pd.to_datetime(timestamp_gap_info["end_time"])

        sorted_data = combined_data.sort_values("date").reset_index(drop=True)
        remaining_gaps = []

        for i in range(1, len(sorted_data)):
            current = sorted_data.iloc[i]["date"]
            previous = sorted_data.iloc[i - 1]["date"]
            expected_interval = TIMEFRAME_TO_TIMEDELTA[trading_timeframe]
            actual_diff = current - previous

            if actual_diff > expected_interval:
                # Check if overlaps with target gap
                if (previous < gap_end) and (current > gap_start):
                    remaining_gaps.append(f"{previous} → {current}")

        if remaining_gaps:
            logger.warning(f" ⚠️ Gap partially filled - remaining gaps: {remaining_gaps}")

    def _save_with_headers(
        self,
        csv_path: Path,
        dataframe: pd.DataFrame,
    ) -> None:
        """Save DataFrame to CSV with header comments preserved.

        Args:
            csv_path: Path to CSV file
            dataframe: DataFrame to save
        """
        # Read header comments
        headers = []
        with open(csv_path, "r") as f:
            for line in f:
                if line.startswith("#"):
                    headers.append(line.rstrip())
                else:
                    break

        # Write headers + data
        with open(csv_path, "w") as f:
            for header in headers:
                f.write(header + "\n")
            dataframe.to_csv(f, index=False)

    def fill_gap(
        self,
        timestamp_gap_info: Dict,
        csv_path: Path,
        trading_timeframe: str,
    ) -> bool:
        """Fill a single gap with authentic Binance data using API-first validation protocol."""
        logger.info(
            f"🔧 Filling gap: {timestamp_gap_info['start_time']} → {timestamp_gap_info['end_time']}"
        )
        logger.info(" 📋 Applying API-first validation protocol")

        # Load and detect format
        existing_data = pd.read_csv(csv_path, comment="#")
        existing_data["date"] = pd.to_datetime(existing_data["date"])

        is_enhanced, is_legacy = self._detect_csv_format(existing_data)
        if not is_enhanced and not is_legacy:
            return False

        # Extract symbol and retrieve API data
        extracted_symbol = self.extract_symbol_from_filename(csv_path)
        filename = Path(csv_path).name if isinstance(csv_path, str) else csv_path.name
        logger.info(f" 🎯 Extracted symbol: {extracted_symbol} from file: {filename}")

        api_data, metadata = self._retrieve_api_data_with_metadata(
            timestamp_gap_info, trading_timeframe, extracted_symbol, is_enhanced
        )

        if not api_data:
            return False

        # Prepare and filter API data
        api_df = self._prepare_api_dataframe(api_data, is_enhanced)
        filtered_df = self._filter_to_gap_period(api_df, timestamp_gap_info)

        if filtered_df is None:
            return False

        # Merge and validate
        combined = self._merge_and_deduplicate(existing_data, filtered_df)
        self._validate_gap_filled(combined, timestamp_gap_info, trading_timeframe)

        # Save results
        self._save_with_headers(csv_path, combined)

        logger.info(f" ✅ Gap filled with {len(filtered_df)} authentic candles")
        return True

    def process_file(self, csv_path: Path, trading_timeframe: str) -> Dict:
        """Process a single CSV file - detect and fill ALL gaps"""
        logger.info(f"🎯 Processing {csv_path} ({trading_timeframe})")

        # Detect all gaps
        detected_gaps = self.detect_all_gaps(csv_path, trading_timeframe)

        if not detected_gaps:
            logger.info(f" ✅ No gaps found in {trading_timeframe}")
            return {
                "timeframe": trading_timeframe,
                "gaps_detected": 0,
                "gaps_filled": 0,
                "gaps_failed": 0,
                "success_rate": 100.0,
            }

        # Fill each gap
        gaps_filled_count = 0
        gaps_failed_count = 0

        for gap_index, timestamp_gap in enumerate(detected_gaps, 1):
            logger.info(f" 🔧 Processing gap {gap_index}/{len(detected_gaps)}")
            if self.fill_gap(timestamp_gap, csv_path, trading_timeframe):
                gaps_filled_count += 1
            else:
                gaps_failed_count += 1

            # Brief pause between API calls
            if gap_index < len(detected_gaps):
                time.sleep(1)

        gap_fill_success_rate = (
            (gaps_filled_count / len(detected_gaps)) * 100 if detected_gaps else 100.0
        )

        processing_result = {
            "timeframe": trading_timeframe,
            "gaps_detected": len(detected_gaps),
            "gaps_filled": gaps_filled_count,
            "gaps_failed": gaps_failed_count,
            "success_rate": gap_fill_success_rate,
        }

        logger.info(
            f" 📊 Result: {gaps_filled_count}/{len(detected_gaps)} gaps filled ({gap_fill_success_rate:.1f}%)"
        )
        return processing_result


def main():
    """Main execution function"""
    logger.info("🚀 UNIVERSAL GAP FILLER - Fill ALL Gaps in ALL Timeframes")
    logger.info("=" * 60)

    gap_filler_instance = UniversalGapFiller()
    sample_data_directory = Path("../sample_data")

    # Define timeframes that need gap filling (exclude 4h which is perfect)
    target_trading_timeframes = ["1m", "3m", "5m", "15m", "30m", "1h", "2h"]

    processing_results = []

    for trading_timeframe in target_trading_timeframes:
        csv_file_pattern = f"binance_spot_SOLUSDT-{trading_timeframe}_*.csv"
        matching_csv_files = list(sample_data_directory.glob(csv_file_pattern))

        if not matching_csv_files:
            logger.warning(f"❌ No CSV file found for {trading_timeframe}")
            continue

        selected_csv_file = matching_csv_files[0]  # Use first match
        timeframe_result = gap_filler_instance.process_file(selected_csv_file, trading_timeframe)
        processing_results.append(timeframe_result)

    # Summary report
    logger.info("\n" + "=" * 60)
    logger.info("📊 UNIVERSAL GAP FILLING SUMMARY")
    logger.info("=" * 60)

    total_gaps_detected_count = sum(
        result_data["gaps_detected"] for result_data in processing_results
    )
    total_gaps_filled_count = sum(result_data["gaps_filled"] for result_data in processing_results)
    total_gaps_failed_count = sum(result_data["gaps_failed"] for result_data in processing_results)

    for timeframe_result in processing_results:
        status_icon = (
            "✅"
            if timeframe_result["success_rate"] == 100.0
            else "⚠️"
            if timeframe_result["success_rate"] > 0
            else "❌"
        )
        logger.info(
            f"{status_icon} {timeframe_result['timeframe']:>3}: {timeframe_result['gaps_filled']:>2}/{timeframe_result['gaps_detected']:>2} gaps filled ({timeframe_result['success_rate']:>5.1f}%)"
        )

    logger.info("-" * 60)
    overall_success_rate = (
        (total_gaps_filled_count / total_gaps_detected_count * 100)
        if total_gaps_detected_count > 0
        else 100.0
    )
    logger.info(
        f"🎯 OVERALL: {total_gaps_filled_count}/{total_gaps_detected_count} gaps filled ({overall_success_rate:.1f}%)"
    )
    logger.info("=" * 60)

    if overall_success_rate == 100.0:
        logger.info("🎉 ALL GAPS FILLED SUCCESSFULLY! Ready for validation.")
    else:
        logger.warning(f"⚠️ {total_gaps_failed_count} gaps failed to fill. Manual review needed.")


if __name__ == "__main__":
    main()