gapless-crypto-clickhouse 7.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. gapless_crypto_clickhouse/__init__.py +147 -0
  2. gapless_crypto_clickhouse/__probe__.py +349 -0
  3. gapless_crypto_clickhouse/api.py +1032 -0
  4. gapless_crypto_clickhouse/clickhouse/__init__.py +17 -0
  5. gapless_crypto_clickhouse/clickhouse/config.py +119 -0
  6. gapless_crypto_clickhouse/clickhouse/connection.py +269 -0
  7. gapless_crypto_clickhouse/clickhouse/schema.sql +98 -0
  8. gapless_crypto_clickhouse/clickhouse/schema_validator.py +312 -0
  9. gapless_crypto_clickhouse/clickhouse_query.py +642 -0
  10. gapless_crypto_clickhouse/collectors/__init__.py +21 -0
  11. gapless_crypto_clickhouse/collectors/binance_public_data_collector.py +1994 -0
  12. gapless_crypto_clickhouse/collectors/clickhouse_bulk_loader.py +446 -0
  13. gapless_crypto_clickhouse/collectors/concurrent_collection_orchestrator.py +407 -0
  14. gapless_crypto_clickhouse/collectors/csv_format_detector.py +123 -0
  15. gapless_crypto_clickhouse/collectors/httpx_downloader.py +395 -0
  16. gapless_crypto_clickhouse/collectors/hybrid_url_generator.py +316 -0
  17. gapless_crypto_clickhouse/exceptions.py +145 -0
  18. gapless_crypto_clickhouse/gap_filling/__init__.py +1 -0
  19. gapless_crypto_clickhouse/gap_filling/safe_file_operations.py +439 -0
  20. gapless_crypto_clickhouse/gap_filling/universal_gap_filler.py +757 -0
  21. gapless_crypto_clickhouse/llms.txt +268 -0
  22. gapless_crypto_clickhouse/probe.py +235 -0
  23. gapless_crypto_clickhouse/py.typed +0 -0
  24. gapless_crypto_clickhouse/query_api.py +374 -0
  25. gapless_crypto_clickhouse/resume/__init__.py +12 -0
  26. gapless_crypto_clickhouse/resume/intelligent_checkpointing.py +383 -0
  27. gapless_crypto_clickhouse/utils/__init__.py +29 -0
  28. gapless_crypto_clickhouse/utils/error_handling.py +202 -0
  29. gapless_crypto_clickhouse/utils/etag_cache.py +194 -0
  30. gapless_crypto_clickhouse/utils/timeframe_constants.py +90 -0
  31. gapless_crypto_clickhouse/utils/timestamp_format_analyzer.py +256 -0
  32. gapless_crypto_clickhouse/utils/timestamp_utils.py +130 -0
  33. gapless_crypto_clickhouse/validation/__init__.py +36 -0
  34. gapless_crypto_clickhouse/validation/csv_validator.py +677 -0
  35. gapless_crypto_clickhouse/validation/models.py +220 -0
  36. gapless_crypto_clickhouse/validation/storage.py +502 -0
  37. gapless_crypto_clickhouse-7.1.0.dist-info/METADATA +1277 -0
  38. gapless_crypto_clickhouse-7.1.0.dist-info/RECORD +40 -0
  39. gapless_crypto_clickhouse-7.1.0.dist-info/WHEEL +4 -0
  40. gapless_crypto_clickhouse-7.1.0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,316 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Hybrid URL Generator for Binance Data Sources
4
+
5
+ Generates URLs for both monthly and daily data sources with intelligent
6
+ strategy determination for optimal data collection performance.
7
+
8
+ This module implements the hybrid approach:
9
+ - Monthly ZIP files for historical data (>30 days old)
10
+ - Daily ZIP files for recent data (≤30 days old)
11
+ - Concurrent collection support for both sources
12
+ """
13
+
14
+ from datetime import datetime, timedelta
15
+ from enum import Enum
16
+ from typing import List, NamedTuple, Optional, Tuple
17
+
18
+
19
class DataSource(Enum):
    """Which Binance public-data repository a download comes from."""

    MONTHLY = "monthly"  # one ZIP per calendar month (bulk historical data)
    DAILY = "daily"  # one ZIP per calendar day (recent data)
24
+
25
+
26
class DownloadTask(NamedTuple):
    """A single download unit: one remote ZIP file plus its scheduling metadata."""

    # Fully-qualified URL of the ZIP file to fetch.
    url: str
    # Archive filename, e.g. "BTCUSDT-1h-2024-01.zip".
    filename: str
    # Repository the file comes from (monthly vs daily).
    source_type: DataSource
    # "2024-01" for monthly files, "2024-01-15" for daily files.
    period_identifier: str
    # (start, end) datetimes covered by this file, clipped to the request.
    date_range: Tuple[datetime, datetime]
34
+
35
+
36
class HybridUrlGenerator:
    """Intelligent URL generator for hybrid monthly+daily Binance data collection.

    Splits a requested date range into two sourcing strategies:

    - Monthly ZIP files for bulk historical data (before the cutoff date)
    - Daily ZIP files for recent data (on/after the cutoff date)

    The cutoff is computed at construction time as ``now - daily_lookback_days``.
    Tasks are returned chronologically sorted and can be grouped into
    rate-limit-aware concurrent batches.

    Examples:
        >>> generator = HybridUrlGenerator()
        >>> tasks = generator.generate_download_tasks(
        ...     symbol="BTCUSDT",
        ...     timeframe="1h",
        ...     start_date=datetime(2024, 1, 1),
        ...     end_date=datetime(2024, 12, 31),
        ... )
        >>> monthly_tasks, daily_tasks = generator.separate_tasks_by_source(tasks)
        >>> batches = generator.create_concurrent_batches(tasks, max_concurrent=13)
    """

    def __init__(
        self,
        daily_lookback_days: int = 30,
        base_url: str = "https://data.binance.vision/data/spot",
        max_concurrent_per_batch: int = 13,
    ):
        """Initialize hybrid URL generator with configuration.

        Args:
            daily_lookback_days: Number of recent days served from daily files.
            base_url: Base URL for the Binance public data repository.
            max_concurrent_per_batch: Maximum concurrent downloads per batch.
        """
        self.daily_lookback_days = daily_lookback_days
        self.base_url = base_url.rstrip("/")
        self.max_concurrent_per_batch = max_concurrent_per_batch

        # Monthly files cover dates before this boundary; daily files cover the rest.
        # NOTE(review): naive local time — presumably acceptable since the boundary
        # only needs day-level precision; confirm if UTC alignment is required.
        self.cutoff_date = datetime.now() - timedelta(days=daily_lookback_days)

    def generate_download_tasks(
        self,
        symbol: str,
        timeframe: str,
        start_date: datetime,
        end_date: datetime,
    ) -> List[DownloadTask]:
        """Generate optimal download tasks using the hybrid monthly+daily strategy.

        Args:
            symbol: Trading pair symbol (e.g., "BTCUSDT")
            timeframe: Data timeframe (e.g., "1h", "1d")
            start_date: Collection start date
            end_date: Collection end date

        Returns:
            List of DownloadTask objects, sorted chronologically by range start.
        """
        tasks = []

        # Split the requested range at the cutoff: historical portion -> monthly,
        # recent portion -> daily. The two portions may share the cutoff month.
        monthly_end = min(end_date, self.cutoff_date)
        daily_start = max(start_date, self.cutoff_date)

        if start_date <= monthly_end:
            monthly_tasks = self._generate_monthly_tasks(symbol, timeframe, start_date, monthly_end)
            tasks.extend(monthly_tasks)

        if daily_start <= end_date:
            daily_tasks = self._generate_daily_tasks(symbol, timeframe, daily_start, end_date)
            tasks.extend(daily_tasks)

        # Chronological order simplifies downstream merge/validation.
        tasks.sort(key=lambda task: task.date_range[0])

        return tasks

    def _generate_monthly_tasks(
        self,
        symbol: str,
        timeframe: str,
        start_date: datetime,
        end_date: datetime,
    ) -> List[DownloadTask]:
        """Generate download tasks for monthly ZIP files, one per calendar month.

        Each task's date_range is the month span clipped to [start_date, end_date].
        """
        tasks = []
        current_month = start_date.replace(day=1)

        while current_month <= end_date:
            year_month = current_month.strftime("%Y-%m")
            filename = f"{symbol}-{timeframe}-{year_month}.zip"
            # Binance Vision layout: /monthly/klines/<SYMBOL>/<TF>/<SYMBOL>-<TF>-<YYYY-MM>.zip
            # Fix: the URL previously ended in a literal placeholder instead of
            # interpolating the generated filename.
            url = f"{self.base_url}/monthly/klines/{symbol}/{timeframe}/{filename}"

            # Last day of the month = first day of next month minus one day.
            month_start = current_month
            if current_month.month == 12:
                month_end = current_month.replace(year=current_month.year + 1, month=1) - timedelta(
                    days=1
                )
            else:
                month_end = current_month.replace(month=current_month.month + 1) - timedelta(days=1)

            # Clip the file's nominal span to the requested range.
            file_start = max(month_start, start_date)
            file_end = min(month_end, end_date)

            task = DownloadTask(
                url=url,
                filename=filename,
                source_type=DataSource.MONTHLY,
                period_identifier=year_month,
                date_range=(file_start, file_end),
            )
            tasks.append(task)

            # Advance one calendar month (handle December -> January rollover).
            if current_month.month == 12:
                current_month = current_month.replace(year=current_month.year + 1, month=1)
            else:
                current_month = current_month.replace(month=current_month.month + 1)

        return tasks

    def _generate_daily_tasks(
        self,
        symbol: str,
        timeframe: str,
        start_date: datetime,
        end_date: datetime,
    ) -> List[DownloadTask]:
        """Generate download tasks for daily ZIP files, one per calendar day.

        Each task's date_range is the day span clipped to [start_date, end_date].
        """
        tasks = []
        current_date = start_date.date()
        end_date_only = end_date.date()

        while current_date <= end_date_only:
            date_str = current_date.strftime("%Y-%m-%d")
            filename = f"{symbol}-{timeframe}-{date_str}.zip"
            # Binance Vision layout: /daily/klines/<SYMBOL>/<TF>/<SYMBOL>-<TF>-<YYYY-MM-DD>.zip
            # Fix: the URL previously ended in a literal placeholder instead of
            # interpolating the generated filename.
            url = f"{self.base_url}/daily/klines/{symbol}/{timeframe}/{filename}"

            # A daily file nominally spans [00:00:00, 23:59:59] of its day.
            day_start = datetime.combine(current_date, datetime.min.time())
            day_end = day_start + timedelta(days=1) - timedelta(seconds=1)

            # Clip the file's nominal span to the requested range.
            file_start = max(day_start, start_date)
            file_end = min(day_end, end_date)

            task = DownloadTask(
                url=url,
                filename=filename,
                source_type=DataSource.DAILY,
                period_identifier=date_str,
                date_range=(file_start, file_end),
            )
            tasks.append(task)

            current_date += timedelta(days=1)

        return tasks

    def separate_tasks_by_source(
        self, tasks: List[DownloadTask]
    ) -> Tuple[List[DownloadTask], List[DownloadTask]]:
        """Separate tasks into monthly and daily groups.

        Args:
            tasks: List of download tasks

        Returns:
            Tuple of (monthly_tasks, daily_tasks); relative order is preserved.
        """
        monthly_tasks = [task for task in tasks if task.source_type == DataSource.MONTHLY]
        daily_tasks = [task for task in tasks if task.source_type == DataSource.DAILY]

        return monthly_tasks, daily_tasks

    def create_concurrent_batches(
        self, tasks: List[DownloadTask], max_concurrent: Optional[int] = None
    ) -> List[List[DownloadTask]]:
        """Create batches of tasks for concurrent execution with rate limiting.

        Args:
            tasks: List of download tasks
            max_concurrent: Maximum concurrent downloads per batch; defaults to
                the instance's max_concurrent_per_batch.

        Returns:
            List of task batches; all full-sized except possibly the last.
        """
        if max_concurrent is None:
            max_concurrent = self.max_concurrent_per_batch

        batches = []
        for i in range(0, len(tasks), max_concurrent):
            batch = tasks[i : i + max_concurrent]
            batches.append(batch)

        return batches

    def get_collection_strategy_summary(
        self,
        symbol: str,
        timeframe: str,
        start_date: datetime,
        end_date: datetime,
    ) -> dict:
        """Get summary of collection strategy for the given parameters.

        Args:
            symbol: Trading pair symbol
            timeframe: Data timeframe
            start_date: Collection start date
            end_date: Collection end date

        Returns:
            Strategy summary with source breakdown, task counts, cutoff date,
            estimated batch count, and per-source ISO date ranges (None when a
            source is unused).
        """
        tasks = self.generate_download_tasks(symbol, timeframe, start_date, end_date)
        monthly_tasks, daily_tasks = self.separate_tasks_by_source(tasks)

        return {
            "total_tasks": len(tasks),
            "monthly_tasks": len(monthly_tasks),
            "daily_tasks": len(daily_tasks),
            "cutoff_date": self.cutoff_date.isoformat(),
            "daily_lookback_days": self.daily_lookback_days,
            "estimated_batches": len(self.create_concurrent_batches(tasks)),
            "sources_used": {
                "monthly": len(monthly_tasks) > 0,
                "daily": len(daily_tasks) > 0,
            },
            "date_ranges": {
                "monthly_range": (
                    monthly_tasks[0].date_range[0].isoformat(),
                    monthly_tasks[-1].date_range[1].isoformat(),
                )
                if monthly_tasks
                else None,
                "daily_range": (
                    daily_tasks[0].date_range[0].isoformat(),
                    daily_tasks[-1].date_range[1].isoformat(),
                )
                if daily_tasks
                else None,
            },
        }
@@ -0,0 +1,145 @@
1
+ """Structured exception hierarchy for gapless-crypto-data.
2
+
3
+ Provides machine-parseable error details via .details dict attribute,
4
+ enabling AI agents and downstream packages to programmatically handle errors.
5
+
6
+ Exception Hierarchy:
7
+ GaplessCryptoDataError (base)
8
+ ├── DataCollectionError - Binance data collection failures
9
+ ├── ValidationError - Input validation failures
10
+ ├── NetworkError - Network operation failures
11
+ └── GapFillingError - Gap detection/filling failures
12
+
13
+ Version: 3.2.0
14
+ """
15
+
16
+ from typing import Any
17
+
18
+
19
+ class GaplessCryptoDataError(Exception):
20
+ """Base exception for all gapless-crypto-data errors.
21
+
22
+ Provides structured error details via .details dict for machine-parseable
23
+ error handling by AI agents and downstream packages.
24
+
25
+ Attributes:
26
+ message: Human-readable error message
27
+ details: Machine-parseable error context (dict)
28
+
29
+ Examples:
30
+ >>> try:
31
+ ... raise DataCollectionError(
32
+ ... "Failed to collect BTCUSDT data",
33
+ ... details={"symbol": "BTCUSDT", "timeframe": "1h", "status_code": 404}
34
+ ... )
35
+ ... except GaplessCryptoDataError as e:
36
+ ... print(e.details)
37
+ {'symbol': 'BTCUSDT', 'timeframe': '1h', 'status_code': 404}
38
+ """
39
+
40
+ def __init__(self, message: str, details: dict[str, Any] | None = None):
41
+ """Initialize exception with message and optional structured details.
42
+
43
+ Args:
44
+ message: Human-readable error description
45
+ details: Machine-parseable context (symbol, timeframe, status_code, etc.)
46
+ """
47
+ super().__init__(message)
48
+ self.details = details or {}
49
+
50
+
51
class DataCollectionError(GaplessCryptoDataError):
    """Raised when Binance data collection fails.

    Typical causes:
    - Monthly ZIP files missing from the Binance public repository
    - ZIP extraction failures
    - CSV parsing errors
    - Timeframe unavailable for the requested symbol/date range

    Example:
        >>> raise DataCollectionError(
        ...     "Monthly ZIP file not found",
        ...     details={
        ...         "symbol": "BTCUSDT",
        ...         "timeframe": "1s",
        ...         "year_month": "2020-01",
        ...         "url": "https://data.binance.vision/...",
        ...         "status_code": 404
        ...     }
        ... )
    """
74
+
75
+
76
class ValidationError(GaplessCryptoDataError):
    """Raised when input validation fails.

    Typical causes:
    - Invalid symbol format
    - Invalid timeframe (not in the supported set)
    - Invalid date range
    - Malformed CSV data
    - OHLCV constraint violations (high < low, etc.)

    Example:
        >>> raise ValidationError(
        ...     "Invalid timeframe",
        ...     details={
        ...         "provided_timeframe": "2h30m",
        ...         "supported_timeframes": ["1s", "1m", "3m", "5m", "15m", "30m", "1h", "2h", "4h", "6h", "8h", "12h", "1d"]
        ...     }
        ... )
    """
97
+
98
+
99
class NetworkError(GaplessCryptoDataError):
    """Raised when network operations fail.

    Typical causes:
    - HTTP request timeouts
    - Connection failures
    - Rate limiting (429 status)
    - Server errors (5xx status)

    Example:
        >>> raise NetworkError(
        ...     "Binance API rate limit exceeded",
        ...     details={
        ...         "endpoint": "https://api.binance.com/api/v3/klines",
        ...         "status_code": 429,
        ...         "retry_after": 60,
        ...         "request_count": 1200
        ...     }
        ... )
    """
121
+
122
+
123
class GapFillingError(GaplessCryptoDataError):
    """Raised when gap detection or filling fails.

    Typical causes:
    - Gap detection algorithm failures
    - API data unavailable for the gap period
    - Merge conflicts during gap filling
    - Atomic operation failures

    Example:
        >>> raise GapFillingError(
        ...     "Cannot fill gap: API data unavailable",
        ...     details={
        ...         "gap_start": "2024-01-01T00:00:00",
        ...         "gap_end": "2024-01-01T06:00:00",
        ...         "gap_size_hours": 6,
        ...         "api_response": "No data returned",
        ...         "csv_file": "/path/to/BTCUSDT_1h.csv"
        ...     }
        ... )
    """
@@ -0,0 +1 @@
1
+ """Gap filling module."""