gapless_crypto_clickhouse-7.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. gapless_crypto_clickhouse/__init__.py +147 -0
  2. gapless_crypto_clickhouse/__probe__.py +349 -0
  3. gapless_crypto_clickhouse/api.py +1032 -0
  4. gapless_crypto_clickhouse/clickhouse/__init__.py +17 -0
  5. gapless_crypto_clickhouse/clickhouse/config.py +119 -0
  6. gapless_crypto_clickhouse/clickhouse/connection.py +269 -0
  7. gapless_crypto_clickhouse/clickhouse/schema.sql +98 -0
  8. gapless_crypto_clickhouse/clickhouse/schema_validator.py +312 -0
  9. gapless_crypto_clickhouse/clickhouse_query.py +642 -0
  10. gapless_crypto_clickhouse/collectors/__init__.py +21 -0
  11. gapless_crypto_clickhouse/collectors/binance_public_data_collector.py +1994 -0
  12. gapless_crypto_clickhouse/collectors/clickhouse_bulk_loader.py +446 -0
  13. gapless_crypto_clickhouse/collectors/concurrent_collection_orchestrator.py +407 -0
  14. gapless_crypto_clickhouse/collectors/csv_format_detector.py +123 -0
  15. gapless_crypto_clickhouse/collectors/httpx_downloader.py +395 -0
  16. gapless_crypto_clickhouse/collectors/hybrid_url_generator.py +316 -0
  17. gapless_crypto_clickhouse/exceptions.py +145 -0
  18. gapless_crypto_clickhouse/gap_filling/__init__.py +1 -0
  19. gapless_crypto_clickhouse/gap_filling/safe_file_operations.py +439 -0
  20. gapless_crypto_clickhouse/gap_filling/universal_gap_filler.py +757 -0
  21. gapless_crypto_clickhouse/llms.txt +268 -0
  22. gapless_crypto_clickhouse/probe.py +235 -0
  23. gapless_crypto_clickhouse/py.typed +0 -0
  24. gapless_crypto_clickhouse/query_api.py +374 -0
  25. gapless_crypto_clickhouse/resume/__init__.py +12 -0
  26. gapless_crypto_clickhouse/resume/intelligent_checkpointing.py +383 -0
  27. gapless_crypto_clickhouse/utils/__init__.py +29 -0
  28. gapless_crypto_clickhouse/utils/error_handling.py +202 -0
  29. gapless_crypto_clickhouse/utils/etag_cache.py +194 -0
  30. gapless_crypto_clickhouse/utils/timeframe_constants.py +90 -0
  31. gapless_crypto_clickhouse/utils/timestamp_format_analyzer.py +256 -0
  32. gapless_crypto_clickhouse/utils/timestamp_utils.py +130 -0
  33. gapless_crypto_clickhouse/validation/__init__.py +36 -0
  34. gapless_crypto_clickhouse/validation/csv_validator.py +677 -0
  35. gapless_crypto_clickhouse/validation/models.py +220 -0
  36. gapless_crypto_clickhouse/validation/storage.py +502 -0
  37. gapless_crypto_clickhouse-7.1.0.dist-info/METADATA +1277 -0
  38. gapless_crypto_clickhouse-7.1.0.dist-info/RECORD +40 -0
  39. gapless_crypto_clickhouse-7.1.0.dist-info/WHEEL +4 -0
  40. gapless_crypto_clickhouse-7.1.0.dist-info/licenses/LICENSE +21 -0
gapless_crypto_clickhouse/validation/storage.py
@@ -0,0 +1,502 @@
+"""DuckDB-based persistent storage for validation reports.
+
+This module provides efficient storage and querying of CSV validation reports
+using DuckDB's single-file OLAP database. Designed for AI coding agents to
+analyze validation history and trends.
+
+Examples:
+    >>> from gapless_crypto_clickhouse.validation.storage import ValidationStorage
+    >>> from gapless_crypto_clickhouse.validation.models import ValidationReport
+    >>> from datetime import datetime, timezone
+    >>>
+    >>> # Initialize storage (creates DB at ~/.cache/gapless-crypto-data/validation.duckdb)
+    >>> storage = ValidationStorage()
+    >>>
+    >>> # Create a validation report
+    >>> report = ValidationReport(
+    ...     validation_timestamp=datetime.now(timezone.utc),
+    ...     file_path="/data/BTCUSDT-1h.csv",
+    ...     file_size_mb=15.3,
+    ...     symbol="BTCUSDT",
+    ...     timeframe="1h",
+    ...     total_bars=8760,
+    ...     total_errors=0,
+    ...     total_warnings=2,
+    ...     validation_summary="GOOD - 2 warnings",
+    ...     validation_duration_ms=123.45,
+    ...     structure_validation={},
+    ...     datetime_validation={},
+    ...     ohlcv_validation={},
+    ...     coverage_validation={},
+    ...     anomaly_validation={}
+    ... )
+    >>>
+    >>> # Store report
+    >>> storage.insert_report(report)
+    >>>
+    >>> # Query recent validations
+    >>> recent = storage.query_recent(limit=10, symbol="BTCUSDT")
+    >>>
+    >>> # Export to pandas for analysis
+    >>> df = storage.export_to_dataframe()
+"""
+
+import json
+import re
+from datetime import datetime
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Tuple
+
+import duckdb
+import pandas as pd
+
+from .models import ValidationReport
+
+
+def get_validation_db_path() -> Path:
+    """Get XDG-compliant path for validation database.
+
+    Returns:
+        Path to validation.duckdb in XDG cache directory
+
+    Examples:
+        >>> path = get_validation_db_path()
+        >>> str(path)
+        '/Users/username/.cache/gapless-crypto-data/validation.duckdb'
+    """
+    cache_dir = Path.home() / ".cache" / "gapless-crypto-data"
+    cache_dir.mkdir(parents=True, exist_ok=True)
+    return cache_dir / "validation.duckdb"
+
+
+def extract_symbol_timeframe_from_path(filepath: str) -> Tuple[Optional[str], Optional[str]]:
+    """Extract trading pair symbol and timeframe from CSV file path.
+
+    Supports multiple filename patterns:
+    - binance_spot_BTCUSDT-1h_20240101-20240102_v2.10.0.csv
+    - BTCUSDT-1h.csv
+    - BTCUSDT_1h_data.csv
+    - /path/to/BTCUSDT-1h.csv
+
+    Args:
+        filepath: Path to CSV file (absolute or relative)
+
+    Returns:
+        Tuple of (symbol, timeframe) where each may be None if not found
+
+    Examples:
+        >>> extract_symbol_timeframe_from_path("binance_spot_BTCUSDT-1h_20240101-20240102_v2.10.0.csv")
+        ('BTCUSDT', '1h')
+
+        >>> extract_symbol_timeframe_from_path("/data/ETHUSDT-5m.csv")
+        ('ETHUSDT', '5m')
+
+        >>> extract_symbol_timeframe_from_path("random_file.csv")
+        (None, None)
+    """
+    # Get filename without directory
+    filename = Path(filepath).name
+
+    # Pattern 1: binance_spot_SYMBOL-TIMEFRAME_DATES_VERSION.csv
+    # Example: binance_spot_BTCUSDT-1h_20240101-20240102_v2.10.0.csv
+    match = re.search(r"binance_spot_([A-Z]+USDT?)-(\d+[smhd])", filename)
+    if match:
+        return match.group(1), match.group(2)
+
+    # Pattern 2: SYMBOL-TIMEFRAME (with optional extensions)
+    # Example: BTCUSDT-1h.csv or BTCUSDT-1h_data.csv
+    match = re.search(r"([A-Z]+USDT?)-(\d+[smhd])", filename)
+    if match:
+        return match.group(1), match.group(2)
+
+    # Pattern 3: SYMBOL_TIMEFRAME (underscore separator)
+    # Example: BTCUSDT_1h.csv
+    match = re.search(r"([A-Z]+USDT?)_(\d+[smhd])", filename)
+    if match:
+        return match.group(1), match.group(2)
+
+    # Could not extract symbol/timeframe
+    return None, None
+
+
+class ValidationStorage:
+    """DuckDB-based storage for validation reports with SQL query interface.
+
+    Provides persistent storage of CSV validation reports with efficient querying
+    capabilities for AI coding agents. Uses DuckDB's columnar storage for fast
+    analytical queries over large validation histories.
+
+    Attributes:
+        db_path: Path to DuckDB database file
+
+    Examples:
+        >>> storage = ValidationStorage()
+        >>> storage.insert_report(report)
+        >>> recent_btc = storage.query_recent(limit=5, symbol="BTCUSDT")
+        >>> failed = storage.query_by_status("FAILED")
+    """
+
+    def __init__(self, db_path: Optional[Path] = None):
+        """Initialize ValidationStorage with DuckDB connection.
+
+        Args:
+            db_path: Optional custom database path. Defaults to XDG cache location.
+
+        Examples:
+            >>> # Use default XDG location
+            >>> storage = ValidationStorage()
+            >>>
+            >>> # Use custom location
+            >>> storage = ValidationStorage(db_path=Path("/tmp/validation.duckdb"))
+        """
+        self.db_path = db_path or get_validation_db_path()
+        self.db_path.parent.mkdir(parents=True, exist_ok=True)
+
+        # Create table on first use
+        self._create_table()
+
+    def _create_table(self) -> None:
+        """Create validation_reports table if it doesn't exist.
+
+        Schema matches ValidationReport Pydantic model with 30+ columns for
+        efficient SQL queries. JSON columns store nested layer results.
+        """
+        with duckdb.connect(str(self.db_path)) as conn:
+            conn.execute("""
+                CREATE TABLE IF NOT EXISTS validation_reports (
+                    -- Metadata
+                    validation_timestamp TIMESTAMP NOT NULL,
+                    file_path VARCHAR NOT NULL,
+                    file_size_mb DOUBLE NOT NULL,
+                    validator_version VARCHAR DEFAULT '3.3.0',
+                    symbol VARCHAR,
+                    timeframe VARCHAR,
+
+                    -- Core Results
+                    total_bars INTEGER NOT NULL,
+                    total_errors INTEGER NOT NULL,
+                    total_warnings INTEGER NOT NULL,
+                    validation_summary VARCHAR NOT NULL,
+                    validation_duration_ms DOUBLE NOT NULL,
+
+                    -- Layer Results (JSON columns for nested data)
+                    structure_validation JSON NOT NULL,
+                    datetime_validation JSON NOT NULL,
+                    ohlcv_validation JSON NOT NULL,
+                    coverage_validation JSON NOT NULL,
+                    anomaly_validation JSON NOT NULL,
+
+                    -- Flattened metrics for efficient querying
+                    date_range_start TIMESTAMP,
+                    date_range_end TIMESTAMP,
+                    duration_days DOUBLE,
+                    gaps_found INTEGER,
+                    chronological_order BOOLEAN,
+
+                    price_min DOUBLE,
+                    price_max DOUBLE,
+                    volume_min DOUBLE,
+                    volume_max DOUBLE,
+                    volume_mean DOUBLE,
+                    ohlc_errors INTEGER,
+                    negative_zero_values INTEGER,
+
+                    expected_bars INTEGER,
+                    actual_bars INTEGER,
+                    coverage_percentage DOUBLE,
+
+                    price_outliers INTEGER,
+                    volume_outliers INTEGER,
+                    suspicious_patterns INTEGER,
+
+                    -- Indexing for fast queries
+                    PRIMARY KEY (validation_timestamp, file_path)
+                );
+
+                -- Indexes for common query patterns
+                CREATE INDEX IF NOT EXISTS idx_symbol_timeframe
+                    ON validation_reports(symbol, timeframe);
+                CREATE INDEX IF NOT EXISTS idx_validation_timestamp
+                    ON validation_reports(validation_timestamp DESC);
+                CREATE INDEX IF NOT EXISTS idx_validation_summary
+                    ON validation_reports(validation_summary);
+            """)
+
+    def _convert_to_json_safe(self, obj: Any) -> Any:
+        """Convert numpy/pandas types to JSON-serializable Python types.
+
+        Args:
+            obj: Object to convert (can be dict, list, numpy/pandas type, or any)
+
+        Returns:
+            JSON-serializable version of the object
+        """
+        import numpy as np
+
+        if isinstance(obj, dict):
+            return {key: self._convert_to_json_safe(value) for key, value in obj.items()}
+        elif isinstance(obj, list):
+            return [self._convert_to_json_safe(item) for item in obj]
+        elif isinstance(obj, (np.integer, np.int64)):
+            return int(obj)
+        elif isinstance(obj, (np.floating, np.float64)):
+            return float(obj)
+        elif isinstance(obj, (np.bool_)):
+            return bool(obj)
+        elif isinstance(obj, (np.ndarray,)):
+            return obj.tolist()
+        else:
+            return obj
+
+    def insert_report(self, report: ValidationReport) -> None:
+        """Insert validation report into DuckDB.
+
+        Args:
+            report: ValidationReport instance to store
+
+        Examples:
+            >>> storage = ValidationStorage()
+            >>> storage.insert_report(report)
+        """
+        with duckdb.connect(str(self.db_path)) as conn:
+            # Convert Pydantic model to dict for insertion
+            data = report.model_dump()
+
+            # Convert datetime objects to ISO strings for DuckDB
+            if isinstance(data["validation_timestamp"], datetime):
+                data["validation_timestamp"] = data["validation_timestamp"].isoformat()
+            if data.get("date_range_start") and isinstance(data["date_range_start"], datetime):
+                data["date_range_start"] = data["date_range_start"].isoformat()
+            if data.get("date_range_end") and isinstance(data["date_range_end"], datetime):
+                data["date_range_end"] = data["date_range_end"].isoformat()
+
+            # Convert dicts to JSON strings (with numpy/pandas type conversion)
+            data["structure_validation"] = json.dumps(
+                self._convert_to_json_safe(data["structure_validation"])
+            )
+            data["datetime_validation"] = json.dumps(
+                self._convert_to_json_safe(data["datetime_validation"])
+            )
+            data["ohlcv_validation"] = json.dumps(
+                self._convert_to_json_safe(data["ohlcv_validation"])
+            )
+            data["coverage_validation"] = json.dumps(
+                self._convert_to_json_safe(data["coverage_validation"])
+            )
+            data["anomaly_validation"] = json.dumps(
+                self._convert_to_json_safe(data["anomaly_validation"])
+            )
+
+            # Build INSERT statement
+            columns = ", ".join(data.keys())
+            placeholders = ", ".join(["?" for _ in data])
+            values = list(data.values())
+
+            conn.execute(
+                f"INSERT INTO validation_reports ({columns}) VALUES ({placeholders})", values
+            )
+
+    def query_recent(
+        self, limit: int = 10, symbol: Optional[str] = None, timeframe: Optional[str] = None
+    ) -> List[Dict[str, Any]]:
+        """Query most recent validation reports.
+
+        Args:
+            limit: Maximum number of reports to return
+            symbol: Optional filter by trading pair symbol
+            timeframe: Optional filter by timeframe
+
+        Returns:
+            List of validation report dicts ordered by timestamp (newest first)
+
+        Examples:
+            >>> # Get 10 most recent validations
+            >>> storage.query_recent(limit=10)
+
+            >>> # Get recent BTCUSDT validations
+            >>> storage.query_recent(limit=5, symbol="BTCUSDT")
+
+            >>> # Get recent 1h validations
+            >>> storage.query_recent(limit=5, timeframe="1h")
+        """
+        with duckdb.connect(str(self.db_path)) as conn:
+            query = "SELECT * FROM validation_reports WHERE 1=1"
+            params = []
+
+            if symbol:
+                query += " AND symbol = ?"
+                params.append(symbol)
+
+            if timeframe:
+                query += " AND timeframe = ?"
+                params.append(timeframe)
+
+            query += " ORDER BY validation_timestamp DESC LIMIT ?"
+            params.append(limit)
+
+            result = conn.execute(query, params).fetchall()
+            columns = [desc[0] for desc in conn.description]
+
+            return [dict(zip(columns, row, strict=False)) for row in result]
+
+    def query_by_date_range(
+        self, start: datetime, end: datetime, symbol: Optional[str] = None
+    ) -> List[Dict[str, Any]]:
+        """Query validations within a date range.
+
+        Args:
+            start: Start datetime (inclusive)
+            end: End datetime (inclusive)
+            symbol: Optional filter by trading pair symbol
+
+        Returns:
+            List of validation report dicts within date range
+
+        Examples:
+            >>> from datetime import datetime
+            >>> start = datetime(2025, 1, 1)
+            >>> end = datetime(2025, 1, 31)
+            >>> storage.query_by_date_range(start, end, symbol="BTCUSDT")
+        """
+        with duckdb.connect(str(self.db_path)) as conn:
+            query = """
+                SELECT * FROM validation_reports
+                WHERE validation_timestamp >= ? AND validation_timestamp <= ?
+            """
+            params = [start.isoformat(), end.isoformat()]
+
+            if symbol:
+                query += " AND symbol = ?"
+                params.append(symbol)
+
+            query += " ORDER BY validation_timestamp DESC"
+
+            result = conn.execute(query, params).fetchall()
+            columns = [desc[0] for desc in conn.description]
+
+            return [dict(zip(columns, row, strict=False)) for row in result]
+
+    def query_by_status(self, status: str) -> List[Dict[str, Any]]:
+        """Query validations by summary status.
+
+        Args:
+            status: Validation status to filter by (PERFECT, GOOD, FAILED)
+
+        Returns:
+            List of validation report dicts matching status
+
+        Examples:
+            >>> # Find all failed validations
+            >>> storage.query_by_status("FAILED")
+
+            >>> # Find all perfect validations
+            >>> storage.query_by_status("PERFECT")
+        """
+        with duckdb.connect(str(self.db_path)) as conn:
+            result = conn.execute(
+                "SELECT * FROM validation_reports WHERE validation_summary LIKE ? ORDER BY validation_timestamp DESC",
+                [f"{status}%"],
+            ).fetchall()
+            columns = [desc[0] for desc in conn.description]
+
+            return [dict(zip(columns, row, strict=False)) for row in result]
+
+    def export_to_dataframe(
+        self, symbol: Optional[str] = None, timeframe: Optional[str] = None
+    ) -> pd.DataFrame:
+        """Export validation reports to pandas DataFrame for analysis.
+
+        Args:
+            symbol: Optional filter by trading pair symbol
+            timeframe: Optional filter by timeframe
+
+        Returns:
+            Pandas DataFrame with all validation reports
+
+        Examples:
+            >>> # Export all validations
+            >>> df = storage.export_to_dataframe()
+            >>>
+            >>> # Export BTCUSDT validations
+            >>> df = storage.export_to_dataframe(symbol="BTCUSDT")
+            >>>
+            >>> # Analyze validation trends
+            >>> df.groupby("symbol")["total_errors"].mean()
+        """
+        with duckdb.connect(str(self.db_path)) as conn:
+            query = "SELECT * FROM validation_reports WHERE 1=1"
+            params = []
+
+            if symbol:
+                query += " AND symbol = ?"
+                params.append(symbol)
+
+            if timeframe:
+                query += " AND timeframe = ?"
+                params.append(timeframe)
+
+            query += " ORDER BY validation_timestamp DESC"
+
+            return conn.execute(query, params).df()
+
+    def get_summary_stats(self) -> Dict[str, Any]:
+        """Get summary statistics about validation history.
+
+        Returns:
+            Dictionary with aggregate statistics:
+            - total_validations: Total number of validations stored
+            - symbols: List of unique symbols validated
+            - timeframes: List of unique timeframes validated
+            - avg_errors: Average errors per validation
+            - avg_warnings: Average warnings per validation
+            - status_distribution: Count by validation_summary
+
+        Examples:
+            >>> stats = storage.get_summary_stats()
+            >>> stats["total_validations"]
+            1247
+            >>> stats["symbols"]
+            ['BTCUSDT', 'ETHUSDT', 'SOLUSDT']
+        """
+        with duckdb.connect(str(self.db_path)) as conn:
+            # Get total count
+            total = conn.execute("SELECT COUNT(*) FROM validation_reports").fetchone()[0]
+
+            # Get unique symbols and timeframes
+            symbols = conn.execute(
+                "SELECT DISTINCT symbol FROM validation_reports WHERE symbol IS NOT NULL ORDER BY symbol"
+            ).fetchall()
+            symbols = [s[0] for s in symbols]
+
+            timeframes = conn.execute(
+                "SELECT DISTINCT timeframe FROM validation_reports WHERE timeframe IS NOT NULL ORDER BY timeframe"
+            ).fetchall()
+            timeframes = [t[0] for t in timeframes]
+
+            # Get averages
+            avg_stats = conn.execute("""
+                SELECT
+                    AVG(total_errors) as avg_errors,
+                    AVG(total_warnings) as avg_warnings,
+                    AVG(validation_duration_ms) as avg_duration_ms
+                FROM validation_reports
+            """).fetchone()
+
+            # Get status distribution
+            status_dist = conn.execute("""
+                SELECT validation_summary, COUNT(*) as count
+                FROM validation_reports
+                GROUP BY validation_summary
+                ORDER BY count DESC
+            """).fetchall()
+            status_distribution = dict(status_dist)
+
+            return {
+                "total_validations": total,
+                "symbols": symbols,
+                "timeframes": timeframes,
+                "avg_errors": avg_stats[0] if avg_stats[0] is not None else 0.0,
+                "avg_warnings": avg_stats[1] if avg_stats[1] is not None else 0.0,
+                "avg_duration_ms": avg_stats[2] if avg_stats[2] is not None else 0.0,
+                "status_distribution": status_distribution,
+            }
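
For readers evaluating this module, here is a minimal end-to-end sketch of the API this file adds, assembled from the docstring examples above. It assumes the 7.1.0 wheel is installed and that ValidationReport accepts exactly the fields shown in the module docstring; treat it as an illustration under those assumptions, not as the package's documented quickstart.

from datetime import datetime, timezone

from gapless_crypto_clickhouse.validation.models import ValidationReport
from gapless_crypto_clickhouse.validation.storage import (
    ValidationStorage,
    extract_symbol_timeframe_from_path,
)

# Recover symbol/timeframe from a filename via the module's regex helper
symbol, timeframe = extract_symbol_timeframe_from_path("/data/BTCUSDT-1h.csv")
assert (symbol, timeframe) == ("BTCUSDT", "1h")

# Default DB path: ~/.cache/gapless-crypto-data/validation.duckdb
storage = ValidationStorage()

# Field values mirror the module docstring's example report
report = ValidationReport(
    validation_timestamp=datetime.now(timezone.utc),
    file_path="/data/BTCUSDT-1h.csv",
    file_size_mb=15.3,
    symbol=symbol,
    timeframe=timeframe,
    total_bars=8760,
    total_errors=0,
    total_warnings=2,
    validation_summary="GOOD - 2 warnings",
    validation_duration_ms=123.45,
    structure_validation={},
    datetime_validation={},
    ohlcv_validation={},
    coverage_validation={},
    anomaly_validation={},
)
storage.insert_report(report)

# Read the history back in the shapes the class supports
recent = storage.query_recent(limit=5, symbol="BTCUSDT")  # list of dicts, newest first
failed = storage.query_by_status("FAILED")                # LIKE 'FAILED%' on validation_summary
df = storage.export_to_dataframe(timeframe="1h")          # pandas DataFrame
stats = storage.get_summary_stats()
print(stats["total_validations"], stats["status_distribution"])

Since every ValidationStorage method opens a short-lived duckdb.connect() against db_path and closes it on exit, the sketch needs no explicit connection cleanup.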