gapless_crypto_clickhouse-7.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gapless_crypto_clickhouse/__init__.py +147 -0
- gapless_crypto_clickhouse/__probe__.py +349 -0
- gapless_crypto_clickhouse/api.py +1032 -0
- gapless_crypto_clickhouse/clickhouse/__init__.py +17 -0
- gapless_crypto_clickhouse/clickhouse/config.py +119 -0
- gapless_crypto_clickhouse/clickhouse/connection.py +269 -0
- gapless_crypto_clickhouse/clickhouse/schema.sql +98 -0
- gapless_crypto_clickhouse/clickhouse/schema_validator.py +312 -0
- gapless_crypto_clickhouse/clickhouse_query.py +642 -0
- gapless_crypto_clickhouse/collectors/__init__.py +21 -0
- gapless_crypto_clickhouse/collectors/binance_public_data_collector.py +1994 -0
- gapless_crypto_clickhouse/collectors/clickhouse_bulk_loader.py +446 -0
- gapless_crypto_clickhouse/collectors/concurrent_collection_orchestrator.py +407 -0
- gapless_crypto_clickhouse/collectors/csv_format_detector.py +123 -0
- gapless_crypto_clickhouse/collectors/httpx_downloader.py +395 -0
- gapless_crypto_clickhouse/collectors/hybrid_url_generator.py +316 -0
- gapless_crypto_clickhouse/exceptions.py +145 -0
- gapless_crypto_clickhouse/gap_filling/__init__.py +1 -0
- gapless_crypto_clickhouse/gap_filling/safe_file_operations.py +439 -0
- gapless_crypto_clickhouse/gap_filling/universal_gap_filler.py +757 -0
- gapless_crypto_clickhouse/llms.txt +268 -0
- gapless_crypto_clickhouse/probe.py +235 -0
- gapless_crypto_clickhouse/py.typed +0 -0
- gapless_crypto_clickhouse/query_api.py +374 -0
- gapless_crypto_clickhouse/resume/__init__.py +12 -0
- gapless_crypto_clickhouse/resume/intelligent_checkpointing.py +383 -0
- gapless_crypto_clickhouse/utils/__init__.py +29 -0
- gapless_crypto_clickhouse/utils/error_handling.py +202 -0
- gapless_crypto_clickhouse/utils/etag_cache.py +194 -0
- gapless_crypto_clickhouse/utils/timeframe_constants.py +90 -0
- gapless_crypto_clickhouse/utils/timestamp_format_analyzer.py +256 -0
- gapless_crypto_clickhouse/utils/timestamp_utils.py +130 -0
- gapless_crypto_clickhouse/validation/__init__.py +36 -0
- gapless_crypto_clickhouse/validation/csv_validator.py +677 -0
- gapless_crypto_clickhouse/validation/models.py +220 -0
- gapless_crypto_clickhouse/validation/storage.py +502 -0
- gapless_crypto_clickhouse-7.1.0.dist-info/METADATA +1277 -0
- gapless_crypto_clickhouse-7.1.0.dist-info/RECORD +40 -0
- gapless_crypto_clickhouse-7.1.0.dist-info/WHEEL +4 -0
- gapless_crypto_clickhouse-7.1.0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,194 @@
"""ETag-based HTTP caching for immutable Binance Vision data.

CloudFront CDN provides ETags for all monthly ZIP files. Since historical data
is immutable, ETags enable bandwidth-efficient re-runs through 304 Not Modified
responses (90%+ bandwidth reduction).

SLO Targets:
    Availability: 100% - handles cache corruption gracefully
    Correctness: 100% - cache mismatches trigger full download
    Observability: All cache hits/misses logged
    Maintainability: Follows XDG Base Directory Specification

Architecture:
    - Cache location: $HOME/.cache/gapless-crypto-data/etags.json
    - Standard library only (pathlib + json)
    - Exception-only failure (no silent fallbacks)
"""

import json
import logging
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, Optional

logger = logging.getLogger(__name__)


class ETagCache:
    """HTTP ETag cache manager for Binance Vision immutable data.

    Manages ETag-based caching to avoid re-downloading immutable historical data.
    Uses the XDG Base Directory Specification for the cache location.

    Cache Structure:
        {
            "https://data.binance.vision/.../BTCUSDT-1h-2024-01.zip": {
                "etag": "efcd0b4716abb9d950262a26fcb6ba43",
                "last_checked": "2025-10-16T16:30:00Z",
                "file_size": 12845632
            }
        }

    Examples:
        >>> cache = ETagCache()
        >>> cache.update_etag(url, "abc123", 1024000)
        >>> etag = cache.get_etag(url)
        >>> print(f"Cache hit: {etag}")
        Cache hit: abc123

    Note:
        Cache is persistent across runs. Corrupted cache files are automatically
        deleted and recreated. All errors propagate (exception-only failure).
    """

    def __init__(self, cache_dir: Optional[Path] = None):
        """Initialize ETag cache manager.

        Args:
            cache_dir: Override default cache directory location.
                Default: $HOME/.cache/gapless-crypto-data/

        Raises:
            OSError: If cache directory creation fails
        """
        if cache_dir is None:
            # Follow XDG Base Directory Specification
            home = Path.home()
            self.cache_dir = home / ".cache" / "gapless-crypto-data"
        else:
            self.cache_dir = cache_dir

        self.cache_file = self.cache_dir / "etags.json"

        # Create cache directory if it does not exist
        self.cache_dir.mkdir(parents=True, exist_ok=True)

        # Load cache (or create empty)
        self._cache: Dict[str, Dict] = self._load_cache()

    def _load_cache(self) -> Dict[str, Dict]:
        """Load cache from disk.

        Returns:
            Cache dictionary mapping URLs to ETag metadata

        Raises:
            ValueError: If the cache file is corrupted (the corrupted file is
                deleted before the error is raised)
            OSError: If file read fails (propagated)
        """
        if not self.cache_file.exists():
            logger.debug(f"Cache file not found, creating new cache: {self.cache_file}")
            return {}

        try:
            with open(self.cache_file, "r") as f:
                cache_data = json.load(f)
            logger.debug(f"Loaded ETag cache with {len(cache_data)} entries")
            return cache_data
        except json.JSONDecodeError as e:
            logger.error(f"Corrupted ETag cache file, deleting: {e}")
            self.cache_file.unlink()  # Delete corrupted cache
            raise ValueError(
                f"ETag cache corrupted at {self.cache_file}. "
                f"Deleted corrupted file. Original error: {e}"
            ) from e

    def _save_cache(self) -> None:
        """Save cache to disk.

        Raises:
            OSError: If file write fails (propagated)
            TypeError: If cache data is not JSON-serializable (propagated from json.dump)
        """
        with open(self.cache_file, "w") as f:
            json.dump(self._cache, f, indent=2)
        logger.debug(f"Saved ETag cache with {len(self._cache)} entries")

    def get_etag(self, url: str) -> Optional[str]:
        """Get ETag for URL from cache.

        Args:
            url: Full URL to Binance Vision file

        Returns:
            ETag string if cached, None if not found

        Examples:
            >>> cache = ETagCache()
            >>> etag = cache.get_etag("https://data.binance.vision/.../BTCUSDT-1h-2024-01.zip")
            >>> if etag:
            ...     print("Cache hit")
            ... else:
            ...     print("Cache miss")
        """
        entry = self._cache.get(url)
        if entry:
            logger.debug(f"Cache hit for {url}: {entry['etag']}")
            return entry["etag"]
        else:
            logger.debug(f"Cache miss for {url}")
            return None

    def update_etag(self, url: str, etag: str, file_size: int) -> None:
        """Update cache with new ETag metadata.

        Args:
            url: Full URL to Binance Vision file
            etag: ETag from HTTP response header
            file_size: Content-Length from HTTP response

        Raises:
            OSError: If cache save fails (propagated)
        """
        self._cache[url] = {
            "etag": etag,
            "last_checked": datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"),
            "file_size": file_size,
        }
        self._save_cache()
        logger.debug(f"Updated cache for {url}: {etag}")

    def invalidate(self, url: str) -> None:
        """Remove URL from cache (ETag mismatch scenario).

        Args:
            url: Full URL to invalidate

        Raises:
            OSError: If cache save fails (propagated)
        """
        if url in self._cache:
            del self._cache[url]
            self._save_cache()
            logger.warning(f"Invalidated cache entry for {url}")

    def get_cache_stats(self) -> Dict[str, int]:
        """Get cache statistics for observability.

        Returns:
            Dictionary with cache entry count and total cached file size
        """
        total_size = sum(entry.get("file_size", 0) for entry in self._cache.values())
        return {"total_entries": len(self._cache), "total_cached_size": total_size}

    def clear_cache(self) -> None:
        """Clear all cache entries.

        Raises:
            OSError: If cache file deletion fails (propagated)
        """
        self._cache = {}
        if self.cache_file.exists():
            self.cache_file.unlink()
        logger.info("Cleared ETag cache")
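
Aside: the 304 Not Modified flow the module docstring describes is driven by the caller, not by ETagCache itself. A minimal sketch, assuming httpx as the HTTP client (the wheel ships an httpx-based downloader, but this exact integration is illustrative, and download_if_changed is a hypothetical helper):

from typing import Optional

import httpx

from gapless_crypto_clickhouse.utils.etag_cache import ETagCache


def download_if_changed(url: str, cache: ETagCache) -> Optional[bytes]:
    """Hypothetical helper: fetch url, skipping the body when the ETag still matches."""
    headers = {}
    cached_etag = cache.get_etag(url)
    if cached_etag:
        # Ask the CDN to answer 304 Not Modified if the object is unchanged
        headers["If-None-Match"] = f'"{cached_etag}"'

    response = httpx.get(url, headers=headers)
    if response.status_code == 304:
        return None  # Cached copy still valid; no bandwidth spent on the body

    response.raise_for_status()
    new_etag = response.headers.get("ETag", "").strip('"')
    if new_etag:
        cache.update_etag(url, new_etag, len(response.content))
    return response.content
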
@@ -0,0 +1,90 @@
"""Centralized timeframe constants for data collection and gap detection.

This module provides a single source of truth for timeframe-to-interval mappings
used across collectors and gap fillers, eliminating code duplication and
preventing calculation bugs.

SLO Targets:
    Maintainability: Single source of truth eliminates 3+ code duplications
    Correctness: All 13 timeframes map to accurate minute values
    Availability: Supports the full spectrum from 1s to 1d timeframes
"""

from datetime import timedelta
from typing import Dict

import pandas as pd

# Timeframe to minutes mapping (single source of truth)
TIMEFRAME_TO_MINUTES: Dict[str, float] = {
    "1s": 1 / 60,  # 1 second = 1/60 minute
    "1m": 1,
    "3m": 3,
    "5m": 5,
    "15m": 15,
    "30m": 30,
    "1h": 60,
    "2h": 120,
    "4h": 240,
    "6h": 360,
    "8h": 480,
    "12h": 720,
    "1d": 1440,  # 24 hours = 1440 minutes
}

# Pandas Timedelta mapping (derived from minutes)
TIMEFRAME_TO_TIMEDELTA: Dict[str, pd.Timedelta] = {
    timeframe: pd.Timedelta(minutes=minutes) for timeframe, minutes in TIMEFRAME_TO_MINUTES.items()
}

# Python timedelta mapping (for non-pandas contexts)
TIMEFRAME_TO_PYTHON_TIMEDELTA: Dict[str, timedelta] = {
    timeframe: timedelta(minutes=minutes) for timeframe, minutes in TIMEFRAME_TO_MINUTES.items()
}

# Binance API interval mapping (for API parameter compatibility)
TIMEFRAME_TO_BINANCE_INTERVAL: Dict[str, str] = {
    "1s": "1s",
    "1m": "1m",
    "3m": "3m",
    "5m": "5m",
    "15m": "15m",
    "30m": "30m",
    "1h": "1h",
    "2h": "2h",
    "4h": "4h",
    "6h": "6h",
    "8h": "8h",
    "12h": "12h",
    "1d": "1d",
}

# Validation: All timeframes must be present in all mappings
_EXPECTED_TIMEFRAMES = {
    "1s",
    "1m",
    "3m",
    "5m",
    "15m",
    "30m",
    "1h",
    "2h",
    "4h",
    "6h",
    "8h",
    "12h",
    "1d",
}

assert set(TIMEFRAME_TO_MINUTES.keys()) == _EXPECTED_TIMEFRAMES, (
    f"TIMEFRAME_TO_MINUTES missing timeframes: "
    f"{_EXPECTED_TIMEFRAMES - set(TIMEFRAME_TO_MINUTES.keys())}"
)
assert set(TIMEFRAME_TO_TIMEDELTA.keys()) == _EXPECTED_TIMEFRAMES, (
    f"TIMEFRAME_TO_TIMEDELTA missing timeframes: "
    f"{_EXPECTED_TIMEFRAMES - set(TIMEFRAME_TO_TIMEDELTA.keys())}"
)
assert set(TIMEFRAME_TO_BINANCE_INTERVAL.keys()) == _EXPECTED_TIMEFRAMES, (
    f"TIMEFRAME_TO_BINANCE_INTERVAL missing timeframes: "
    f"{_EXPECTED_TIMEFRAMES - set(TIMEFRAME_TO_BINANCE_INTERVAL.keys())}"
)
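
Aside: the derived pandas mapping makes the gap arithmetic these constants exist for a one-liner. A usage sketch (expected_bar_count is a hypothetical helper, not part of the package):

import pandas as pd

from gapless_crypto_clickhouse.utils.timeframe_constants import TIMEFRAME_TO_TIMEDELTA


def expected_bar_count(start: pd.Timestamp, end: pd.Timestamp, timeframe: str) -> int:
    """Hypothetical helper: bars expected in [start, end) for a gapless series."""
    return int((end - start) / TIMEFRAME_TO_TIMEDELTA[timeframe])


# 24 hours of 1h candles should yield exactly 24 bars
assert expected_bar_count(pd.Timestamp("2024-01-01"), pd.Timestamp("2024-01-02"), "1h") == 24
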
@@ -0,0 +1,256 @@
"""Timestamp format detection and analysis for Binance data.

This module provides timestamp format detection supporting both millisecond
and microsecond precision timestamps, with comprehensive validation and
transition tracking capabilities.

Classes:
    TimestampFormatAnalyzer: Analyzes and validates timestamp formats

SLO Targets:
    Correctness: 100% - accurate format detection for all valid timestamps
    Observability: Complete reporting of format transitions and statistics
    Maintainability: Single source of truth for timestamp format logic
"""


class TimestampFormatAnalyzer:
    """Analyzes timestamp formats in cryptocurrency data with transition tracking.

    Supports both legacy millisecond precision (13-digit) and modern microsecond
    precision (16-digit) timestamps from Binance data. Tracks format transitions
    and provides comprehensive statistics.

    Attributes:
        format_stats: Statistics for each detected format type
        format_transitions: List of detected format transition points
        current_format: Current timestamp format being processed

    Examples:
        >>> analyzer = TimestampFormatAnalyzer()
        >>> analyzer.initialize_tracking()
        >>> fmt, secs, valid = analyzer.analyze_timestamp_format(1609459200000, 0)
        >>> fmt
        'milliseconds'
        >>> analyzer.update_format_stats(fmt, 1609459200000, 0)
        False
        >>> analyzer.report_format_analysis()
        📈 COMPREHENSIVE FORMAT ANALYSIS:
        MILLISECONDS: 1 rows (100.0%)
        ...
    """

    def __init__(self):
        """Initialize timestamp format analyzer."""
        self.format_stats = {}
        self.format_transitions = []
        self.current_format = None
        self._format_analysis_summary = {}

    def initialize_tracking(self):
        """Initialize format tracking state for new data processing."""
        self.format_stats = {
            "milliseconds": {
                "count": 0,
                "first_seen": None,
                "last_seen": None,
                "sample_values": [],
            },
            "microseconds": {
                "count": 0,
                "first_seen": None,
                "last_seen": None,
                "sample_values": [],
            },
            # "unknown" carries the same tracking keys as the valid formats so
            # update_format_stats() can record it, plus an error list.
            "unknown": {
                "count": 0,
                "first_seen": None,
                "last_seen": None,
                "sample_values": [],
                "errors": [],
            },
        }
        self.format_transitions = []
        self.current_format = None
        self._format_analysis_summary = {}

    def analyze_timestamp_format(self, raw_timestamp_value, csv_row_index):
        """Comprehensive timestamp format analysis with validation.

        Detects whether a timestamp is in milliseconds (13-digit) or microseconds
        (16+ digit) format and validates that the timestamp is within the expected
        range (2010-2030).

        Args:
            raw_timestamp_value: Integer timestamp value to analyze
            csv_row_index: Row index for error reporting

        Returns:
            tuple: (detected_format_type, converted_seconds, validation_result)
                - detected_format_type: "milliseconds", "microseconds", or "unknown"
                - converted_seconds: Timestamp converted to seconds (float or None)
                - validation_result: Dict with "valid" bool and optional "error_details"

        Examples:
            >>> analyzer = TimestampFormatAnalyzer()
            >>> fmt, secs, valid = analyzer.analyze_timestamp_format(1609459200000, 0)
            >>> fmt
            'milliseconds'
            >>> secs
            1609459200.0
            >>> valid
            {'valid': True}

            >>> fmt, secs, valid = analyzer.analyze_timestamp_format(1609459200000000, 1)
            >>> fmt
            'microseconds'
            >>> secs
            1609459200.0
        """
        timestamp_digit_count = len(str(raw_timestamp_value))

        # Enhanced format detection logic
        if timestamp_digit_count >= 16:  # Microseconds (16+ digits) - 2025+ format
            detected_format_type = "microseconds"
            converted_seconds = raw_timestamp_value / 1000000
            timestamp_min_bound = 1262304000000000  # 2010-01-01 00:00:00 (microseconds)
            timestamp_max_bound = 1893456000000000  # 2030-01-01 00:00:00 (microseconds)

        elif timestamp_digit_count >= 10:  # Milliseconds (10-15 digits) - Legacy format
            detected_format_type = "milliseconds"
            converted_seconds = raw_timestamp_value / 1000
            timestamp_min_bound = 1262304000000  # 2010-01-01 00:00:00 (milliseconds)
            timestamp_max_bound = 1893456000000  # 2030-01-01 00:00:00 (milliseconds)

        else:  # Unknown format (less than 10 digits)
            detected_format_type = "unknown"
            converted_seconds = None
            timestamp_min_bound = timestamp_max_bound = None

        # Enhanced validation with detailed error reporting
        if detected_format_type == "unknown":
            timestamp_validation_result = {
                "valid": False,
                "error_details": {
                    "row_index": csv_row_index,
                    "error_type": "unknown_timestamp_format",
                    "timestamp_value": raw_timestamp_value,
                    "digit_count": timestamp_digit_count,
                    "expected_formats": "milliseconds (10-15 digits) or microseconds (16+ digits)",
                    "raw_row": f"Timestamp too short: {timestamp_digit_count} digits",
                },
            }
        elif raw_timestamp_value < timestamp_min_bound or raw_timestamp_value > timestamp_max_bound:
            timestamp_validation_result = {
                "valid": False,
                "error_details": {
                    "row_index": csv_row_index,
                    "error_type": "invalid_timestamp_range",
                    "timestamp_value": raw_timestamp_value,
                    "timestamp_format": detected_format_type,
                    "digit_count": timestamp_digit_count,
                    "valid_range": f"{timestamp_min_bound} to {timestamp_max_bound}",
                    "parsed_date": "out_of_range",
                    "raw_row": f"Out of valid {detected_format_type} range (2010-2030)",
                },
            }
        else:
            timestamp_validation_result = {"valid": True}

        return detected_format_type, converted_seconds, timestamp_validation_result

    def update_format_stats(self, detected_timestamp_format, raw_timestamp_value, csv_row_index):
        """Update format statistics and detect transitions.

        Args:
            detected_timestamp_format: Format type ("milliseconds", "microseconds", "unknown")
            raw_timestamp_value: Original timestamp value
            csv_row_index: Row index for tracking

        Returns:
            bool: True if format transition detected, False otherwise
        """
        transition_detected = False

        # Track format transitions
        if self.current_format is None:
            self.current_format = detected_timestamp_format
        elif (
            self.current_format != detected_timestamp_format
            and detected_timestamp_format != "unknown"
        ):
            self.format_transitions.append(
                {
                    "row_index": csv_row_index,
                    "from_format": self.current_format,
                    "to_format": detected_timestamp_format,
                    "timestamp_value": raw_timestamp_value,
                }
            )
            self.current_format = detected_timestamp_format
            transition_detected = True

        # Update format statistics
        self.format_stats[detected_timestamp_format]["count"] += 1
        if self.format_stats[detected_timestamp_format]["first_seen"] is None:
            self.format_stats[detected_timestamp_format]["first_seen"] = csv_row_index
        self.format_stats[detected_timestamp_format]["last_seen"] = csv_row_index

        # Store sample values (first 3 per format)
        if len(self.format_stats[detected_timestamp_format]["sample_values"]) < 3:
            self.format_stats[detected_timestamp_format]["sample_values"].append(
                raw_timestamp_value
            )

        return transition_detected

    def report_format_analysis(self):
        """Report comprehensive format analysis with transition detection.

        Prints format statistics and transitions to the console, and stores the
        analysis summary in self._format_analysis_summary for metadata.
        """
        total_rows = sum(stats["count"] for stats in self.format_stats.values())

        print(" 📈 COMPREHENSIVE FORMAT ANALYSIS:")

        for format_type, stats in self.format_stats.items():
            if stats["count"] > 0:
                percentage = (stats["count"] / total_rows) * 100 if total_rows > 0 else 0
                print(f" {format_type.upper()}: {stats['count']:,} rows ({percentage:.1f}%)")

                if format_type != "unknown" and stats["sample_values"]:
                    first_sample = stats["sample_values"][0]
                    print(
                        f" Sample: {first_sample} (rows {stats['first_seen']}-{stats['last_seen']})"
                    )

        # Report format transitions
        if len(self.format_transitions) > 0:
            print(f" 🔄 FORMAT TRANSITIONS DETECTED: {len(self.format_transitions)}")
            for i, transition in enumerate(self.format_transitions[:3]):  # Show first 3
                print(
                    f" #{i + 1}: Row {transition['row_index']} - {transition['from_format']} → {transition['to_format']}"
                )
                print(f" Timestamp: {transition['timestamp_value']}")
            if len(self.format_transitions) > 3:
                print(f" ... and {len(self.format_transitions) - 3} more transitions")
        else:
            print(
                f" ✅ SINGLE FORMAT: No transitions detected - consistent {self.current_format}"
            )

        # Store format analysis results for metadata
        self._format_analysis_summary = {
            "total_rows_analyzed": total_rows,
            "formats_detected": {
                fmt: stats["count"]
                for fmt, stats in self.format_stats.items()
                if stats["count"] > 0
            },
            "transitions_detected": len(self.format_transitions),
            "transition_details": self.format_transitions,
            "primary_format": self.current_format,
            "format_consistency": len(self.format_transitions) == 0,
        }

    def get_format_analysis_summary(self):
        """Get format analysis summary for metadata.

        Returns:
            dict: Format analysis summary with statistics and transitions
        """
        return self._format_analysis_summary
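
Aside: a minimal end-to-end sketch of the tracking workflow, with values chosen to straddle the 2025-01-01 spot transition (the driving loop and sample values are illustrative, not the package's collector):

from gapless_crypto_clickhouse.utils.timestamp_format_analyzer import TimestampFormatAnalyzer

# Last millisecond bar of 2024, first microsecond bar of 2025 (illustrative values)
raw_open_times = [1735689540000, 1735689600000000]

analyzer = TimestampFormatAnalyzer()
analyzer.initialize_tracking()
for row_index, raw_value in enumerate(raw_open_times):
    fmt, _seconds, validation = analyzer.analyze_timestamp_format(raw_value, row_index)
    if validation["valid"]:
        analyzer.update_format_stats(fmt, raw_value, row_index)

analyzer.report_format_analysis()
assert analyzer.get_format_analysis_summary()["transitions_detected"] == 1
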
@@ -0,0 +1,130 @@
"""Simple timestamp utility functions for Binance data format conversions.

This module provides lightweight utility functions for timestamp format detection
and normalization, complementing the comprehensive TimestampFormatAnalyzer class.

Binance Vision API Format Transition (2025-01-01):
    - Spot data: Transitioned to microseconds (16 digits)
    - Futures data: Remains milliseconds (13 digits)
    - Target: Universal microsecond precision (DateTime64(6))

Functions:
    detect_timestamp_precision: Quick format detection (milliseconds vs microseconds)
    normalize_timestamp_to_microseconds: Convert timestamps to microsecond precision

SLO Targets:
    Correctness: 100% - accurate conversion with no data loss
    Maintainability: Simple functions for inline use throughout codebase
"""


def detect_timestamp_precision(timestamp: int) -> str:
    """Detect timestamp precision from magnitude.

    Args:
        timestamp: Raw timestamp from Binance CSV (integer)

    Returns:
        str: "microseconds" (16+ digits) or "milliseconds" (10-15 digits)

    Raises:
        ValueError: If timestamp has unexpected digit count (<10 digits)

    Examples:
        >>> detect_timestamp_precision(1704067200000000)  # 16 digits
        'microseconds'

        >>> detect_timestamp_precision(1704067200000)  # 13 digits
        'milliseconds'

        >>> detect_timestamp_precision(123)  # Too short
        Traceback (most recent call last):
            ...
        ValueError: Invalid timestamp 123: expected 10+ digits, got 3

    SLO: Correctness - accurate detection for all valid Binance timestamps
    """
    digit_count = len(str(timestamp))

    if digit_count >= 16:  # Microseconds (2025+ spot data)
        return "microseconds"
    elif digit_count >= 10:  # Milliseconds (legacy spot, all futures)
        return "milliseconds"
    else:
        raise ValueError(
            f"Invalid timestamp {timestamp}: expected 10+ digits, got {digit_count}. "
            f"Valid formats: milliseconds (10-15 digits) or microseconds (16+ digits)."
        )


def normalize_timestamp_to_microseconds(timestamp: int, source_precision: str) -> int:
    """Normalize timestamp to microsecond precision.

    Converts millisecond timestamps to microseconds for uniform DateTime64(6) storage.
    Microsecond timestamps are passed through unchanged.

    Args:
        timestamp: Raw timestamp from Binance CSV
        source_precision: Detected precision ("milliseconds" or "microseconds")

    Returns:
        int: Timestamp in microseconds (DateTime64(6) compatible)

    Raises:
        ValueError: If source_precision is not "milliseconds" or "microseconds"

    Examples:
        >>> # Milliseconds → Microseconds (multiply by 1000)
        >>> normalize_timestamp_to_microseconds(1704067200000, "milliseconds")
        1704067200000000

        >>> # Microseconds → Microseconds (no change)
        >>> normalize_timestamp_to_microseconds(1704067200000000, "microseconds")
        1704067200000000

        >>> # Invalid precision
        >>> normalize_timestamp_to_microseconds(1704067200, "seconds")
        Traceback (most recent call last):
            ...
        ValueError: Unknown precision: seconds. Must be 'milliseconds' or 'microseconds'.

    SLO: Correctness - lossless conversion with validation
    """
    if source_precision == "microseconds":
        return timestamp  # Already correct precision
    elif source_precision == "milliseconds":
        return timestamp * 1000  # Convert ms → μs
    else:
        raise ValueError(
            f"Unknown precision: {source_precision}. Must be 'milliseconds' or 'microseconds'."
        )


def normalize_timestamp_auto(timestamp: int) -> int:
    """Auto-detect and normalize timestamp to microseconds.

    Convenience function combining detection and normalization in one call.
    Useful for inline conversions without explicit precision tracking.

    Args:
        timestamp: Raw timestamp from Binance CSV

    Returns:
        int: Timestamp normalized to microseconds

    Raises:
        ValueError: If timestamp is invalid (<10 digits)

    Examples:
        >>> # Auto-detect milliseconds and convert
        >>> normalize_timestamp_auto(1704067200000)
        1704067200000000

        >>> # Auto-detect microseconds and pass through
        >>> normalize_timestamp_auto(1704067200000000)
        1704067200000000

    SLO: Correctness - accurate auto-detection and conversion
    """
    precision = detect_timestamp_precision(timestamp)
    return normalize_timestamp_to_microseconds(timestamp, precision)
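
Aside: putting the two helpers together on a mixed batch, a quick sketch (the datetime round-trip is only there to show that millisecond and microsecond inputs land on the same instant):

from datetime import datetime, timezone

from gapless_crypto_clickhouse.utils.timestamp_utils import normalize_timestamp_auto

# Same instant expressed in milliseconds and microseconds (2024-01-01 00:00:00 UTC)
mixed_batch = [1704067200000, 1704067200000000]
normalized = [normalize_timestamp_auto(ts) for ts in mixed_batch]
assert normalized[0] == normalized[1] == 1704067200000000

# DateTime64(6)-style check: microseconds back to an aware datetime
print(datetime.fromtimestamp(normalized[0] / 1_000_000, tz=timezone.utc))
# 2024-01-01 00:00:00+00:00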