gapless-crypto-clickhouse 7.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. gapless_crypto_clickhouse/__init__.py +147 -0
  2. gapless_crypto_clickhouse/__probe__.py +349 -0
  3. gapless_crypto_clickhouse/api.py +1032 -0
  4. gapless_crypto_clickhouse/clickhouse/__init__.py +17 -0
  5. gapless_crypto_clickhouse/clickhouse/config.py +119 -0
  6. gapless_crypto_clickhouse/clickhouse/connection.py +269 -0
  7. gapless_crypto_clickhouse/clickhouse/schema.sql +98 -0
  8. gapless_crypto_clickhouse/clickhouse/schema_validator.py +312 -0
  9. gapless_crypto_clickhouse/clickhouse_query.py +642 -0
  10. gapless_crypto_clickhouse/collectors/__init__.py +21 -0
  11. gapless_crypto_clickhouse/collectors/binance_public_data_collector.py +1994 -0
  12. gapless_crypto_clickhouse/collectors/clickhouse_bulk_loader.py +446 -0
  13. gapless_crypto_clickhouse/collectors/concurrent_collection_orchestrator.py +407 -0
  14. gapless_crypto_clickhouse/collectors/csv_format_detector.py +123 -0
  15. gapless_crypto_clickhouse/collectors/httpx_downloader.py +395 -0
  16. gapless_crypto_clickhouse/collectors/hybrid_url_generator.py +316 -0
  17. gapless_crypto_clickhouse/exceptions.py +145 -0
  18. gapless_crypto_clickhouse/gap_filling/__init__.py +1 -0
  19. gapless_crypto_clickhouse/gap_filling/safe_file_operations.py +439 -0
  20. gapless_crypto_clickhouse/gap_filling/universal_gap_filler.py +757 -0
  21. gapless_crypto_clickhouse/llms.txt +268 -0
  22. gapless_crypto_clickhouse/probe.py +235 -0
  23. gapless_crypto_clickhouse/py.typed +0 -0
  24. gapless_crypto_clickhouse/query_api.py +374 -0
  25. gapless_crypto_clickhouse/resume/__init__.py +12 -0
  26. gapless_crypto_clickhouse/resume/intelligent_checkpointing.py +383 -0
  27. gapless_crypto_clickhouse/utils/__init__.py +29 -0
  28. gapless_crypto_clickhouse/utils/error_handling.py +202 -0
  29. gapless_crypto_clickhouse/utils/etag_cache.py +194 -0
  30. gapless_crypto_clickhouse/utils/timeframe_constants.py +90 -0
  31. gapless_crypto_clickhouse/utils/timestamp_format_analyzer.py +256 -0
  32. gapless_crypto_clickhouse/utils/timestamp_utils.py +130 -0
  33. gapless_crypto_clickhouse/validation/__init__.py +36 -0
  34. gapless_crypto_clickhouse/validation/csv_validator.py +677 -0
  35. gapless_crypto_clickhouse/validation/models.py +220 -0
  36. gapless_crypto_clickhouse/validation/storage.py +502 -0
  37. gapless_crypto_clickhouse-7.1.0.dist-info/METADATA +1277 -0
  38. gapless_crypto_clickhouse-7.1.0.dist-info/RECORD +40 -0
  39. gapless_crypto_clickhouse-7.1.0.dist-info/WHEEL +4 -0
  40. gapless_crypto_clickhouse-7.1.0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,194 @@
+ """ETag-based HTTP caching for immutable Binance Vision data.
+
+ CloudFront CDN provides ETags for all monthly ZIP files. Since historical data
+ is immutable, ETags enable bandwidth-efficient re-runs through 304 Not Modified
+ responses (90%+ bandwidth reduction).
+
+ SLO Targets:
+     Availability: 100% - handles cache corruption gracefully
+     Correctness: 100% - cache mismatches trigger full download
+     Observability: All cache hits/misses logged
+     Maintainability: Follows XDG Base Directory Specification
+
+ Architecture:
+     - Cache location: $HOME/.cache/gapless-crypto-data/etags.json
+     - Standard library only (pathlib + json)
+     - Exception-only failure (no silent fallbacks)
+ """
+
+ import json
+ import logging
+ from datetime import datetime, timezone
+ from pathlib import Path
+ from typing import Dict, Optional
+
+ logger = logging.getLogger(__name__)
+
+
+ class ETagCache:
+     """HTTP ETag cache manager for Binance Vision immutable data.
+
+     Manages ETag-based caching to avoid re-downloading immutable historical data.
+     Uses XDG Base Directory Specification for cache location.
+
+     Cache Structure:
+         {
+             "https://data.binance.vision/.../BTCUSDT-1h-2024-01.zip": {
+                 "etag": "efcd0b4716abb9d950262a26fcb6ba43",
+                 "last_checked": "2025-10-16T16:30:00Z",
+                 "file_size": 12845632
+             }
+         }
+
+     Examples:
+         >>> cache = ETagCache()
+         >>> cache.update_etag(url, "abc123", 1024000)
+         >>> etag = cache.get_etag(url)
+         >>> print(f"Cache hit: {etag}")
+         Cache hit: abc123
+
+     Note:
+         Cache is persistent across runs. Corrupted cache files are automatically
+         deleted and recreated. All errors propagate (exception-only failure).
+     """
+
+     def __init__(self, cache_dir: Optional[Path] = None):
+         """Initialize ETag cache manager.
+
+         Args:
+             cache_dir: Override default cache directory location.
+                 Default: $HOME/.cache/gapless-crypto-data/
+
+         Raises:
+             OSError: If cache directory creation fails
+         """
+         if cache_dir is None:
+             # Follow XDG Base Directory Specification
+             home = Path.home()
+             self.cache_dir = home / ".cache" / "gapless-crypto-data"
+         else:
+             self.cache_dir = cache_dir
+
+         self.cache_file = self.cache_dir / "etags.json"
+
+         # Create cache directory if not exists
+         self.cache_dir.mkdir(parents=True, exist_ok=True)
+
+         # Load cache (or create empty)
+         self._cache: Dict[str, Dict] = self._load_cache()
+
+     def _load_cache(self) -> Dict[str, Dict]:
+         """Load cache from disk.
+
+         Returns:
+             Cache dictionary mapping URLs to ETag metadata
+
+         Raises:
+             ValueError: If cache file is corrupted (corrupted file is deleted first)
+             OSError: If file read fails (propagated)
+         """
+         if not self.cache_file.exists():
+             logger.debug(f"Cache file not found, creating new cache: {self.cache_file}")
+             return {}
+
+         try:
+             with open(self.cache_file, "r") as f:
+                 cache_data = json.load(f)
+             logger.debug(f"Loaded ETag cache with {len(cache_data)} entries")
+             return cache_data
+         except json.JSONDecodeError as e:
+             logger.error(f"Corrupted ETag cache file, deleting: {e}")
+             self.cache_file.unlink()  # Delete corrupted cache
+             raise ValueError(
+                 f"ETag cache corrupted at {self.cache_file}. "
+                 f"Deleted corrupted file. Original error: {e}"
+             ) from e
+
+     def _save_cache(self) -> None:
+         """Save cache to disk.
+
+         Raises:
+             OSError: If file write fails (propagated)
+             TypeError: If cache data is not JSON-serializable (propagated)
+         """
+         with open(self.cache_file, "w") as f:
+             json.dump(self._cache, f, indent=2)
+         logger.debug(f"Saved ETag cache with {len(self._cache)} entries")
+
+     def get_etag(self, url: str) -> Optional[str]:
+         """Get ETag for URL from cache.
+
+         Args:
+             url: Full URL to Binance Vision file
+
+         Returns:
+             ETag string if cached, None if not found
+
+         Examples:
+             >>> cache = ETagCache()
+             >>> etag = cache.get_etag("https://data.binance.vision/.../BTCUSDT-1h-2024-01.zip")
+             >>> if etag:
+             ...     print("Cache hit")
+             ... else:
+             ...     print("Cache miss")
+         """
+         entry = self._cache.get(url)
+         if entry:
+             logger.debug(f"Cache hit for {url}: {entry['etag']}")
+             return entry["etag"]
+         else:
+             logger.debug(f"Cache miss for {url}")
+             return None
+
+     def update_etag(self, url: str, etag: str, file_size: int) -> None:
+         """Update cache with new ETag metadata.
+
+         Args:
+             url: Full URL to Binance Vision file
+             etag: ETag from HTTP response header
+             file_size: Content-Length from HTTP response
+
+         Raises:
+             OSError: If cache save fails (propagated)
+         """
+         self._cache[url] = {
+             "etag": etag,
+             "last_checked": datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"),
+             "file_size": file_size,
+         }
+         self._save_cache()
+         logger.debug(f"Updated cache for {url}: {etag}")
+
+     def invalidate(self, url: str) -> None:
+         """Remove URL from cache (ETag mismatch scenario).
+
+         Args:
+             url: Full URL to invalidate
+
+         Raises:
+             OSError: If cache save fails (propagated)
+         """
+         if url in self._cache:
+             del self._cache[url]
+             self._save_cache()
+             logger.warning(f"Invalidated cache entry for {url}")
+
+     def get_cache_stats(self) -> Dict[str, int]:
+         """Get cache statistics for observability.
+
+         Returns:
+             Dictionary with cache entry count and total cached file size
+         """
+         total_size = sum(entry.get("file_size", 0) for entry in self._cache.values())
+         return {"total_entries": len(self._cache), "total_cached_size": total_size}
+
+     def clear_cache(self) -> None:
+         """Clear all cache entries.
+
+         Raises:
+             OSError: If cache file deletion fails (propagated)
+         """
+         self._cache = {}
+         if self.cache_file.exists():
+             self.cache_file.unlink()
+         logger.info("Cleared ETag cache")
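
Editor's note: the cache above is designed to drive conditional GETs. The sketch below is illustrative, not code from the package; it assumes the httpx client the package already ships with, the URL is only an example of the Binance Vision path pattern, and the write-to-disk step is elided.

    # Sketch: bandwidth-efficient re-download via ETagCache + If-None-Match.
    import httpx

    from gapless_crypto_clickhouse.utils.etag_cache import ETagCache

    url = (
        "https://data.binance.vision/data/spot/monthly/klines/"
        "BTCUSDT/1h/BTCUSDT-1h-2024-01.zip"  # example path pattern
    )
    cache = ETagCache()

    headers = {}
    cached_etag = cache.get_etag(url)
    if cached_etag:
        headers["If-None-Match"] = f'"{cached_etag}"'  # entity-tags are quoted

    response = httpx.get(url, headers=headers)
    if response.status_code == 304:
        pass  # Not Modified: reuse the local file; the body was never sent
    elif response.status_code == 200:
        etag = response.headers["ETag"].strip('"')
        cache.update_etag(url, etag, len(response.content))
        # ... write response.content to disk here ...
    else:
        cache.invalidate(url)  # force a full re-download on the next run
        response.raise_for_status()
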
@@ -0,0 +1,90 @@
+ """Centralized timeframe constants for data collection and gap detection.
+
+ This module provides a single source of truth for timeframe-to-interval mappings
+ used across collectors and gap fillers, eliminating code duplication and
+ preventing calculation bugs.
+
+ SLO Targets:
+     Maintainability: Single source of truth replaces 3+ duplicated mappings
+     Correctness: All 13 timeframes map to accurate minute values
+     Availability: Supports the full spectrum from 1s to 1d timeframes
+ """
+
+ from datetime import timedelta
+ from typing import Dict
+
+ import pandas as pd
+
+ # Timeframe to minutes mapping (single source of truth)
+ TIMEFRAME_TO_MINUTES: Dict[str, float] = {
+     "1s": 1 / 60,  # 1 second = 1/60 minute
+     "1m": 1,
+     "3m": 3,
+     "5m": 5,
+     "15m": 15,
+     "30m": 30,
+     "1h": 60,
+     "2h": 120,
+     "4h": 240,
+     "6h": 360,
+     "8h": 480,
+     "12h": 720,
+     "1d": 1440,  # 24 hours = 1440 minutes
+ }
+
+ # Pandas Timedelta mapping (derived from minutes)
+ TIMEFRAME_TO_TIMEDELTA: Dict[str, pd.Timedelta] = {
+     timeframe: pd.Timedelta(minutes=minutes) for timeframe, minutes in TIMEFRAME_TO_MINUTES.items()
+ }
+
+ # Python timedelta mapping (for non-pandas contexts)
+ TIMEFRAME_TO_PYTHON_TIMEDELTA: Dict[str, timedelta] = {
+     timeframe: timedelta(minutes=minutes) for timeframe, minutes in TIMEFRAME_TO_MINUTES.items()
+ }
+
+ # Binance API interval mapping (for API parameter compatibility)
+ TIMEFRAME_TO_BINANCE_INTERVAL: Dict[str, str] = {
+     "1s": "1s",
+     "1m": "1m",
+     "3m": "3m",
+     "5m": "5m",
+     "15m": "15m",
+     "30m": "30m",
+     "1h": "1h",
+     "2h": "2h",
+     "4h": "4h",
+     "6h": "6h",
+     "8h": "8h",
+     "12h": "12h",
+     "1d": "1d",
+ }
+
+ # Validation: All timeframes must be present in all mappings
+ _EXPECTED_TIMEFRAMES = {
+     "1s",
+     "1m",
+     "3m",
+     "5m",
+     "15m",
+     "30m",
+     "1h",
+     "2h",
+     "4h",
+     "6h",
+     "8h",
+     "12h",
+     "1d",
+ }
+
+ assert set(TIMEFRAME_TO_MINUTES.keys()) == _EXPECTED_TIMEFRAMES, (
+     f"TIMEFRAME_TO_MINUTES missing timeframes: "
+     f"{_EXPECTED_TIMEFRAMES - set(TIMEFRAME_TO_MINUTES.keys())}"
+ )
+ assert set(TIMEFRAME_TO_TIMEDELTA.keys()) == _EXPECTED_TIMEFRAMES, (
+     f"TIMEFRAME_TO_TIMEDELTA missing timeframes: "
+     f"{_EXPECTED_TIMEFRAMES - set(TIMEFRAME_TO_TIMEDELTA.keys())}"
+ )
+ assert set(TIMEFRAME_TO_BINANCE_INTERVAL.keys()) == _EXPECTED_TIMEFRAMES, (
+     f"TIMEFRAME_TO_BINANCE_INTERVAL missing timeframes: "
+     f"{_EXPECTED_TIMEFRAMES - set(TIMEFRAME_TO_BINANCE_INTERVAL.keys())}"
+ )
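
Editor's note: gap detection follows directly from these constants. A minimal sketch (illustrative, not code from the package; the variable names are ours):

    # Sketch: expected bar count for a window, derived from the shared constants.
    from datetime import datetime, timezone

    from gapless_crypto_clickhouse.utils.timeframe_constants import (
        TIMEFRAME_TO_PYTHON_TIMEDELTA,
    )

    start = datetime(2024, 1, 1, tzinfo=timezone.utc)
    end = datetime(2024, 2, 1, tzinfo=timezone.utc)
    interval = TIMEFRAME_TO_PYTHON_TIMEDELTA["1h"]

    expected_bars = int((end - start) / interval)  # 31 days * 24 = 744 candles
    # Any shortfall versus the row count actually on disk indicates a gap.
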
@@ -0,0 +1,256 @@
+ """Timestamp format detection and analysis for Binance data.
+
+ This module provides timestamp format detection supporting both millisecond
+ and microsecond precision timestamps, with comprehensive validation and
+ transition tracking capabilities.
+
+ Classes:
+     TimestampFormatAnalyzer: Analyzes and validates timestamp formats
+
+ SLO Targets:
+     Correctness: 100% - accurate format detection for all valid timestamps
+     Observability: Complete reporting of format transitions and statistics
+     Maintainability: Single source of truth for timestamp format logic
+ """
+
+
+ class TimestampFormatAnalyzer:
+     """Analyzes timestamp formats in cryptocurrency data with transition tracking.
+
+     Supports both legacy millisecond precision (13-digit) and modern microsecond
+     precision (16-digit) timestamps from Binance data. Tracks format transitions
+     and provides comprehensive statistics.
+
+     Attributes:
+         format_stats: Statistics for each detected format type
+         format_transitions: List of detected format transition points
+         current_format: Current timestamp format being processed
+
+     Examples:
+         >>> analyzer = TimestampFormatAnalyzer()
+         >>> analyzer.initialize_tracking()
+         >>> fmt, secs, valid = analyzer.analyze_timestamp_format(1609459200000, 0)
+         >>> fmt
+         'milliseconds'
+         >>> analyzer.update_format_stats(fmt, 1609459200000, 0)
+         >>> analyzer.report_format_analysis()
+         📈 COMPREHENSIVE FORMAT ANALYSIS:
+            MILLISECONDS: 1 rows (100.0%)
+         ...
+     """
+
+     def __init__(self):
+         """Initialize timestamp format analyzer."""
+         self.format_stats = {}
+         self.format_transitions = []
+         self.current_format = None
+         self._format_analysis_summary = {}
+
+     def initialize_tracking(self):
+         """Initialize format tracking state for new data processing."""
+         self.format_stats = {
+             "milliseconds": {
+                 "count": 0,
+                 "first_seen": None,
+                 "last_seen": None,
+                 "sample_values": [],
+             },
+             "microseconds": {
+                 "count": 0,
+                 "first_seen": None,
+                 "last_seen": None,
+                 "sample_values": [],
+             },
+             "unknown": {"count": 0, "errors": []},
+         }
+         self.format_transitions = []
+         self.current_format = None
+         self._format_analysis_summary = {}
+
+     def analyze_timestamp_format(self, raw_timestamp_value, csv_row_index):
+         """Comprehensive timestamp format analysis with validation.
+
+         Detects whether a timestamp is in milliseconds (10-15 digits) or microseconds
+         (16+ digits) format and validates that the timestamp is within the expected range
+         (2010-2030).
+
+         Args:
+             raw_timestamp_value: Integer timestamp value to analyze
+             csv_row_index: Row index for error reporting
+
+         Returns:
+             tuple: (detected_format_type, converted_seconds, validation_result)
+                 - detected_format_type: "milliseconds", "microseconds", or "unknown"
+                 - converted_seconds: Timestamp converted to seconds (float or None)
+                 - validation_result: Dict with "valid" bool and optional "error_details"
+
+         Examples:
+             >>> analyzer = TimestampFormatAnalyzer()
+             >>> fmt, secs, valid = analyzer.analyze_timestamp_format(1609459200000, 0)
+             >>> fmt
+             'milliseconds'
+             >>> secs
+             1609459200.0
+             >>> valid
+             {'valid': True}
+
+             >>> fmt, secs, valid = analyzer.analyze_timestamp_format(1609459200000000, 1)
+             >>> fmt
+             'microseconds'
+             >>> secs
+             1609459200.0
+         """
+         timestamp_digit_count = len(str(raw_timestamp_value))
+
+         # Enhanced format detection logic
+         if timestamp_digit_count >= 16:  # Microseconds (16+ digits) - 2025+ format
+             detected_format_type = "microseconds"
+             converted_seconds = raw_timestamp_value / 1000000
+             timestamp_min_bound = 1262304000000000  # 2010-01-01 00:00:00 (microseconds)
+             timestamp_max_bound = 1893456000000000  # 2030-01-01 00:00:00 (microseconds)
+
+         elif timestamp_digit_count >= 10:  # Milliseconds (10-15 digits) - Legacy format
+             detected_format_type = "milliseconds"
+             converted_seconds = raw_timestamp_value / 1000
+             timestamp_min_bound = 1262304000000  # 2010-01-01 00:00:00 (milliseconds)
+             timestamp_max_bound = 1893456000000  # 2030-01-01 00:00:00 (milliseconds)
+
+         else:  # Unknown format (less than 10 digits)
+             detected_format_type = "unknown"
+             converted_seconds = None
+             timestamp_min_bound = timestamp_max_bound = None
+
+         # Enhanced validation with detailed error reporting
+         if detected_format_type == "unknown":
+             timestamp_validation_result = {
+                 "valid": False,
+                 "error_details": {
+                     "row_index": csv_row_index,
+                     "error_type": "unknown_timestamp_format",
+                     "timestamp_value": raw_timestamp_value,
+                     "digit_count": timestamp_digit_count,
+                     "expected_formats": "milliseconds (10-15 digits) or microseconds (16+ digits)",
+                     "raw_row": f"Timestamp too short: {timestamp_digit_count} digits",
+                 },
+             }
+         elif raw_timestamp_value < timestamp_min_bound or raw_timestamp_value > timestamp_max_bound:
+             timestamp_validation_result = {
+                 "valid": False,
+                 "error_details": {
+                     "row_index": csv_row_index,
+                     "error_type": "invalid_timestamp_range",
+                     "timestamp_value": raw_timestamp_value,
+                     "timestamp_format": detected_format_type,
+                     "digit_count": timestamp_digit_count,
+                     "valid_range": f"{timestamp_min_bound} to {timestamp_max_bound}",
+                     "parsed_date": "out_of_range",
+                     "raw_row": f"Out of valid {detected_format_type} range (2010-2030)",
+                 },
+             }
+         else:
+             timestamp_validation_result = {"valid": True}
+
+         return detected_format_type, converted_seconds, timestamp_validation_result
+
+     def update_format_stats(self, detected_timestamp_format, raw_timestamp_value, csv_row_index):
+         """Update format statistics and detect transitions.
+
+         Args:
+             detected_timestamp_format: Format type ("milliseconds", "microseconds", "unknown")
+             raw_timestamp_value: Original timestamp value
+             csv_row_index: Row index for tracking
+
+         Returns:
+             bool: True if format transition detected, False otherwise
+         """
+         transition_detected = False
+
+         # Track format transitions
+         if self.current_format is None:
+             self.current_format = detected_timestamp_format
+         elif (
+             self.current_format != detected_timestamp_format
+             and detected_timestamp_format != "unknown"
+         ):
+             self.format_transitions.append(
+                 {
+                     "row_index": csv_row_index,
+                     "from_format": self.current_format,
+                     "to_format": detected_timestamp_format,
+                     "timestamp_value": raw_timestamp_value,
+                 }
+             )
+             self.current_format = detected_timestamp_format
+             transition_detected = True
+
+         # Update format statistics. The "unknown" bucket only tracks a count
+         # (plus errors), so the positional and sample fields are skipped for
+         # it to avoid a KeyError.
+         self.format_stats[detected_timestamp_format]["count"] += 1
+         if detected_timestamp_format != "unknown":
+             if self.format_stats[detected_timestamp_format]["first_seen"] is None:
+                 self.format_stats[detected_timestamp_format]["first_seen"] = csv_row_index
+             self.format_stats[detected_timestamp_format]["last_seen"] = csv_row_index
+
+             # Store sample values (first 3 per format)
+             if len(self.format_stats[detected_timestamp_format]["sample_values"]) < 3:
+                 self.format_stats[detected_timestamp_format]["sample_values"].append(
+                     raw_timestamp_value
+                 )
+
+         return transition_detected
+
+     def report_format_analysis(self):
+         """Report comprehensive format analysis with transition detection.
+
+         Prints format statistics and transitions to console, and stores
+         analysis summary in self._format_analysis_summary for metadata.
+         """
+         total_rows = sum(stats["count"] for stats in self.format_stats.values())
+
+         print("   📈 COMPREHENSIVE FORMAT ANALYSIS:")
+
+         for format_type, stats in self.format_stats.items():
+             if stats["count"] > 0:
+                 percentage = (stats["count"] / total_rows) * 100 if total_rows > 0 else 0
+                 print(f"      {format_type.upper()}: {stats['count']:,} rows ({percentage:.1f}%)")
+
+                 if format_type != "unknown" and stats["sample_values"]:
+                     first_sample = stats["sample_values"][0]
+                     print(
+                         f"         Sample: {first_sample} (rows {stats['first_seen']}-{stats['last_seen']})"
+                     )
+
+         # Report format transitions
+         if len(self.format_transitions) > 0:
+             print(f"      🔄 FORMAT TRANSITIONS DETECTED: {len(self.format_transitions)}")
+             for i, transition in enumerate(self.format_transitions[:3]):  # Show first 3
+                 print(
+                     f"         #{i + 1}: Row {transition['row_index']} - {transition['from_format']} → {transition['to_format']}"
+                 )
+                 print(f"            Timestamp: {transition['timestamp_value']}")
+             if len(self.format_transitions) > 3:
+                 print(f"         ... and {len(self.format_transitions) - 3} more transitions")
+         else:
+             print(
+                 f"      ✅ SINGLE FORMAT: No transitions detected - consistent {self.current_format}"
+             )
+
+         # Store format analysis results for metadata
+         self._format_analysis_summary = {
+             "total_rows_analyzed": total_rows,
+             "formats_detected": {
+                 fmt: stats["count"]
+                 for fmt, stats in self.format_stats.items()
+                 if stats["count"] > 0
+             },
+             "transitions_detected": len(self.format_transitions),
+             "transition_details": self.format_transitions,
+             "primary_format": self.current_format,
+             "format_consistency": len(self.format_transitions) == 0,
+         }
+
+     def get_format_analysis_summary(self):
+         """Get format analysis summary for metadata.
+
+         Returns:
+             dict: Format analysis summary with statistics and transitions
+         """
+         return self._format_analysis_summary
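
Editor's note: end to end, the analyzer is driven row by row. An illustrative sketch (our example; the timestamp values are arbitrary but valid):

    # Sketch: detect a milliseconds -> microseconds transition across three rows.
    from gapless_crypto_clickhouse.utils.timestamp_format_analyzer import (
        TimestampFormatAnalyzer,
    )

    rows = [
        1703980800000,     # 13 digits: milliseconds
        1704067200000,     # 13 digits: milliseconds
        1704067200000000,  # 16 digits: microseconds
    ]

    analyzer = TimestampFormatAnalyzer()
    analyzer.initialize_tracking()
    for i, ts in enumerate(rows):
        fmt, seconds, result = analyzer.analyze_timestamp_format(ts, i)
        if result["valid"]:
            analyzer.update_format_stats(fmt, ts, i)

    analyzer.report_format_analysis()  # prints stats and one transition at row 2
    summary = analyzer.get_format_analysis_summary()
    assert summary["transitions_detected"] == 1
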
@@ -0,0 +1,130 @@
+ """Simple timestamp utility functions for Binance data format conversions.
+
+ This module provides lightweight utility functions for timestamp format detection
+ and normalization, complementing the comprehensive TimestampFormatAnalyzer class.
+
+ Binance Vision API Format Transition (2025-01-01):
+     - Spot data: Transitioned to microseconds (16 digits)
+     - Futures data: Remains milliseconds (13 digits)
+     - Target: Universal microsecond precision (DateTime64(6))
+
+ Functions:
+     detect_timestamp_precision: Quick format detection (milliseconds vs microseconds)
+     normalize_timestamp_to_microseconds: Convert timestamps to microsecond precision
+     normalize_timestamp_auto: Auto-detect precision and normalize in one call
+
+ SLO Targets:
+     Correctness: 100% - accurate conversion with no data loss
+     Maintainability: Simple functions for inline use throughout codebase
+ """
+
+
+ def detect_timestamp_precision(timestamp: int) -> str:
+     """Detect timestamp precision from magnitude.
+
+     Args:
+         timestamp: Raw timestamp from Binance CSV (integer)
+
+     Returns:
+         str: "microseconds" (16+ digits) or "milliseconds" (10-15 digits)
+
+     Raises:
+         ValueError: If timestamp has unexpected digit count (<10 digits)
+
+     Examples:
+         >>> detect_timestamp_precision(1704067200000000)  # 16 digits
+         'microseconds'
+
+         >>> detect_timestamp_precision(1704067200000)  # 13 digits
+         'milliseconds'
+
+         >>> detect_timestamp_precision(123)  # Too short
+         Traceback (most recent call last):
+             ...
+         ValueError: Invalid timestamp 123: expected 10+ digits, got 3
+
+     SLO: Correctness - accurate detection for all valid Binance timestamps
+     """
+     digit_count = len(str(timestamp))
+
+     if digit_count >= 16:  # Microseconds (2025+ spot data)
+         return "microseconds"
+     elif digit_count >= 10:  # Milliseconds (legacy spot, all futures)
+         return "milliseconds"
+     else:
+         raise ValueError(
+             f"Invalid timestamp {timestamp}: expected 10+ digits, got {digit_count}. "
+             f"Valid formats: milliseconds (10-15 digits) or microseconds (16+ digits)."
+         )
+
+
+ def normalize_timestamp_to_microseconds(timestamp: int, source_precision: str) -> int:
+     """Normalize timestamp to microsecond precision.
+
+     Converts millisecond timestamps to microseconds for uniform DateTime64(6) storage.
+     Microsecond timestamps are passed through unchanged.
+
+     Args:
+         timestamp: Raw timestamp from Binance CSV
+         source_precision: Detected precision ("milliseconds" or "microseconds")
+
+     Returns:
+         int: Timestamp in microseconds (DateTime64(6) compatible)
+
+     Raises:
+         ValueError: If source_precision is not "milliseconds" or "microseconds"
+
+     Examples:
+         >>> # Milliseconds → Microseconds (multiply by 1000)
+         >>> normalize_timestamp_to_microseconds(1704067200000, "milliseconds")
+         1704067200000000
+
+         >>> # Microseconds → Microseconds (no change)
+         >>> normalize_timestamp_to_microseconds(1704067200000000, "microseconds")
+         1704067200000000
+
+         >>> # Invalid precision
+         >>> normalize_timestamp_to_microseconds(1704067200, "seconds")
+         Traceback (most recent call last):
+             ...
+         ValueError: Unknown precision: seconds. Must be 'milliseconds' or 'microseconds'.
+
+     SLO: Correctness - lossless conversion with validation
+     """
+     if source_precision == "microseconds":
+         return timestamp  # Already correct precision
+     elif source_precision == "milliseconds":
+         return timestamp * 1000  # Convert ms → μs
+     else:
+         raise ValueError(
+             f"Unknown precision: {source_precision}. Must be 'milliseconds' or 'microseconds'."
+         )
+
+
+ def normalize_timestamp_auto(timestamp: int) -> int:
+     """Auto-detect and normalize timestamp to microseconds.
+
+     Convenience function combining detection and normalization in one call.
+     Useful for inline conversions without explicit precision tracking.
+
+     Args:
+         timestamp: Raw timestamp from Binance CSV
+
+     Returns:
+         int: Timestamp normalized to microseconds
+
+     Raises:
+         ValueError: If timestamp is invalid (<10 digits)
+
+     Examples:
+         >>> # Auto-detect milliseconds and convert
+         >>> normalize_timestamp_auto(1704067200000)
+         1704067200000000
+
+         >>> # Auto-detect microseconds and pass through
+         >>> normalize_timestamp_auto(1704067200000000)
+         1704067200000000
+
+     SLO: Correctness - accurate auto-detection and conversion
+     """
+     precision = detect_timestamp_precision(timestamp)
+     return normalize_timestamp_to_microseconds(timestamp, precision)
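
Editor's note: a short illustration of the intended call pattern (our sketch, not package code): normalize mixed-precision inputs to one microsecond integer before DateTime64(6) insertion.

    # Sketch: both precisions normalize to the same microsecond value.
    from datetime import datetime, timezone

    from gapless_crypto_clickhouse.utils.timestamp_utils import normalize_timestamp_auto

    raw_values = [1704067200000, 1704067200000000]  # ms and us, same instant
    normalized = [normalize_timestamp_auto(ts) for ts in raw_values]
    assert normalized[0] == normalized[1] == 1704067200000000

    # DateTime64(6) stores microseconds; divide by 1e6 for a Python datetime.
    print(datetime.fromtimestamp(normalized[0] / 1_000_000, tz=timezone.utc))
    # 2024-01-01 00:00:00+00:00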