gapless_crypto_clickhouse-7.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. gapless_crypto_clickhouse/__init__.py +147 -0
  2. gapless_crypto_clickhouse/__probe__.py +349 -0
  3. gapless_crypto_clickhouse/api.py +1032 -0
  4. gapless_crypto_clickhouse/clickhouse/__init__.py +17 -0
  5. gapless_crypto_clickhouse/clickhouse/config.py +119 -0
  6. gapless_crypto_clickhouse/clickhouse/connection.py +269 -0
  7. gapless_crypto_clickhouse/clickhouse/schema.sql +98 -0
  8. gapless_crypto_clickhouse/clickhouse/schema_validator.py +312 -0
  9. gapless_crypto_clickhouse/clickhouse_query.py +642 -0
  10. gapless_crypto_clickhouse/collectors/__init__.py +21 -0
  11. gapless_crypto_clickhouse/collectors/binance_public_data_collector.py +1994 -0
  12. gapless_crypto_clickhouse/collectors/clickhouse_bulk_loader.py +446 -0
  13. gapless_crypto_clickhouse/collectors/concurrent_collection_orchestrator.py +407 -0
  14. gapless_crypto_clickhouse/collectors/csv_format_detector.py +123 -0
  15. gapless_crypto_clickhouse/collectors/httpx_downloader.py +395 -0
  16. gapless_crypto_clickhouse/collectors/hybrid_url_generator.py +316 -0
  17. gapless_crypto_clickhouse/exceptions.py +145 -0
  18. gapless_crypto_clickhouse/gap_filling/__init__.py +1 -0
  19. gapless_crypto_clickhouse/gap_filling/safe_file_operations.py +439 -0
  20. gapless_crypto_clickhouse/gap_filling/universal_gap_filler.py +757 -0
  21. gapless_crypto_clickhouse/llms.txt +268 -0
  22. gapless_crypto_clickhouse/probe.py +235 -0
  23. gapless_crypto_clickhouse/py.typed +0 -0
  24. gapless_crypto_clickhouse/query_api.py +374 -0
  25. gapless_crypto_clickhouse/resume/__init__.py +12 -0
  26. gapless_crypto_clickhouse/resume/intelligent_checkpointing.py +383 -0
  27. gapless_crypto_clickhouse/utils/__init__.py +29 -0
  28. gapless_crypto_clickhouse/utils/error_handling.py +202 -0
  29. gapless_crypto_clickhouse/utils/etag_cache.py +194 -0
  30. gapless_crypto_clickhouse/utils/timeframe_constants.py +90 -0
  31. gapless_crypto_clickhouse/utils/timestamp_format_analyzer.py +256 -0
  32. gapless_crypto_clickhouse/utils/timestamp_utils.py +130 -0
  33. gapless_crypto_clickhouse/validation/__init__.py +36 -0
  34. gapless_crypto_clickhouse/validation/csv_validator.py +677 -0
  35. gapless_crypto_clickhouse/validation/models.py +220 -0
  36. gapless_crypto_clickhouse/validation/storage.py +502 -0
  37. gapless_crypto_clickhouse-7.1.0.dist-info/METADATA +1277 -0
  38. gapless_crypto_clickhouse-7.1.0.dist-info/RECORD +40 -0
  39. gapless_crypto_clickhouse-7.1.0.dist-info/WHEEL +4 -0
  40. gapless_crypto_clickhouse-7.1.0.dist-info/licenses/LICENSE +21 -0
gapless_crypto_clickhouse/gap_filling/safe_file_operations.py
@@ -0,0 +1,439 @@
+#!/usr/bin/env python3
+"""
+Atomic File Operations Module
+Prevents data corruption during CSV file modifications by using atomic operations.
+
+Key Features:
+- Atomic file writes (temp file + rename)
+- Header preservation for commented CSV files
+- Validation checkpoints
+- Automatic rollback on failure
+- Progress tracking and validation
+"""
+
+import logging
+import shutil
+import tempfile
+from datetime import datetime
+from pathlib import Path
+from typing import List, Optional, Tuple
+
+import pandas as pd
+
+logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
+logger = logging.getLogger(__name__)
+
+
+class AtomicCSVOperations:
+    """Safe atomic operations for CSV files with header preservation and corruption prevention.
+
+    Provides atomic file operations to prevent data corruption during CSV modifications.
+    Uses temporary files and atomic rename operations to ensure data integrity,
+    even if the process is interrupted during file operations.
+
+    Features:
+    - Atomic write operations (temp file + rename)
+    - Header comment preservation for metadata
+    - Automatic backup creation with timestamps
+    - DataFrame validation before writing
+    - Rollback capability on failure
+    - Progress tracking and validation
+
+    The atomic operation sequence:
+    1. Create timestamped backup of original file
+    2. Write new data to temporary file
+    3. Validate temporary file integrity
+    4. Atomically rename temp file to replace original
+    5. Clean up temporary files on success
+
+    Examples:
+        Basic atomic CSV write:
+
+        >>> from pathlib import Path
+        >>> csv_path = Path("data.csv")
+        >>> atomic_ops = AtomicCSVOperations(csv_path)
+        >>> df = pd.DataFrame({
+        ...     "date": ["2024-01-01", "2024-01-02", "2024-01-03"],
+        ...     "open": [100.0, 101.0, 102.0],
+        ...     "high": [101.0, 102.0, 103.0],
+        ...     "low": [99.0, 100.0, 101.0],
+        ...     "close": [101.0, 102.0, 102.5],
+        ...     "volume": [1000, 1100, 900],
+        ... })
+        >>> backup_path = atomic_ops.create_backup()
+        >>> success = atomic_ops.write_dataframe_atomic(df)
+        >>> if success:
+        ...     print("Data written safely")
+        ... else:
+        ...     atomic_ops.rollback_from_backup()
+        Data written safely
+
+        With header preservation:
+
+        >>> # Original file has metadata comments; headers are
+        >>> # automatically preserved during atomic writes.
+        >>> atomic_ops = AtomicCSVOperations(Path("btc_data.csv"))
+        >>> headers = atomic_ops.read_header_comments()
+        >>> print(f"Found {len(headers)} header lines")
+        Found 8 header lines
+
+    Note:
+        Always call create_backup() before performing write operations
+        to enable rollback capability in case of errors.
+    """
+
+    def __init__(self, csv_path: Path):
+        self.csv_path = Path(csv_path)
+        self.backup_path: Optional[Path] = None
+        self.temp_path: Optional[Path] = None
+
+    def create_backup(self) -> Path:
+        """Create timestamped backup of original file"""
+        if not self.csv_path.exists():
+            raise FileNotFoundError(f"Source file not found: {self.csv_path}")
+
+        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+        backup_name = f"{self.csv_path.stem}.backup_{timestamp}{self.csv_path.suffix}"
+        backup_path = self.csv_path.parent / backup_name
+
+        logger.info(f"📦 Creating backup: {backup_path}")
+        shutil.copy2(self.csv_path, backup_path)
+
+        self.backup_path = backup_path
+        return backup_path
+
+    def read_header_comments(self) -> List[str]:
+        """Extract header comments from CSV file"""
+        header_comments = []
+
+        if not self.csv_path.exists():
+            return header_comments
+
+        with open(self.csv_path, "r") as f:
+            for line in f:
+                if line.startswith("#"):
+                    header_comments.append(line.rstrip())
+                else:
+                    break
+
+        logger.info(f"📄 Found {len(header_comments)} header comment lines")
+        return header_comments
+
+    def validate_dataframe(self, df: pd.DataFrame) -> Tuple[bool, str]:
+        """Validate DataFrame integrity before writing"""
+        if df is None or df.empty:
+            return False, "DataFrame is None or empty"
+
+        # Check required columns for OHLCV data
+        required_cols = ["date", "open", "high", "low", "close", "volume"]
+        missing_cols = [col for col in required_cols if col not in df.columns]
+        if missing_cols:
+            return False, f"Missing required columns: {missing_cols}"
+
+        # Check for duplicate timestamps
+        if "date" in df.columns:
+            duplicates = df["date"].duplicated().sum()
+            if duplicates > 0:
+                return False, f"Found {duplicates} duplicate timestamps"
+
+        # Check data types
+        numeric_cols = ["open", "high", "low", "close", "volume"]
+        for col in numeric_cols:
+            if col in df.columns and not pd.api.types.is_numeric_dtype(df[col]):
+                return False, f"Column {col} is not numeric"
+
+        logger.info(f"✅ DataFrame validation passed: {len(df)} rows, {len(df.columns)} columns")
+        return True, "Validation passed"
+
+    def write_dataframe_atomic(
+        self, df: pd.DataFrame, header_comments: Optional[List[str]] = None
+    ) -> bool:
+        """Write DataFrame to CSV using atomic operations"""
+
+        # Validate DataFrame
+        is_valid, validation_msg = self.validate_dataframe(df)
+        if not is_valid:
+            logger.error(f"❌ DataFrame validation failed: {validation_msg}")
+            return False
+
+        # Use existing headers if none provided
+        if header_comments is None:
+            header_comments = self.read_header_comments()
+
+        try:
+            # Create temporary file in same directory so the final rename stays atomic
+            temp_fd, temp_path = tempfile.mkstemp(suffix=".csv.tmp", dir=self.csv_path.parent)
+            self.temp_path = Path(temp_path)
+
+            logger.info(f"🔧 Writing to temporary file: {self.temp_path}")
+
+            # Write to temporary file (open() adopts and closes the mkstemp descriptor)
+            with open(temp_fd, "w") as f:
+                # Write header comments
+                for comment in header_comments:
+                    f.write(comment + "\n")
+
+                # Write DataFrame
+                df.to_csv(f, index=False)
+
+            # Validate temporary file
+            logger.info("🔍 Validating temporary file...")
+            test_df = pd.read_csv(self.temp_path, comment="#")
+
+            if len(test_df) != len(df):
+                raise ValueError(f"Row count mismatch: expected {len(df)}, got {len(test_df)}")
+
+            # Atomic rename (atomic only within the same filesystem)
+            logger.info(f"🎯 Performing atomic rename: {self.temp_path} → {self.csv_path}")
+            shutil.move(str(self.temp_path), str(self.csv_path))
+
+            logger.info("✅ Atomic write completed successfully")
+            return True
+
+        except Exception as e:
+            logger.error(f"❌ Atomic write failed: {e}")
+
+            # Cleanup temporary file
+            if self.temp_path and self.temp_path.exists():
+                self.temp_path.unlink()
+                logger.info("🧹 Cleaned up temporary file")
+
+            return False
+
+    def rollback_from_backup(self) -> bool:
+        """Restore file from backup in case of failure"""
+        if not self.backup_path or not self.backup_path.exists():
+            logger.error("❌ No backup available for rollback")
+            return False
+
+        try:
+            logger.info(f"🔄 Rolling back from backup: {self.backup_path}")
+            shutil.copy2(self.backup_path, self.csv_path)
+            logger.info("✅ Rollback completed successfully")
+            return True
+
+        except Exception as e:
+            logger.error(f"❌ Rollback failed: {e}")
+            return False
+
+    def cleanup_backup(self) -> bool:
+        """Remove backup file after successful operation"""
+        if not self.backup_path or not self.backup_path.exists():
+            return True
+
+        try:
+            self.backup_path.unlink()
+            logger.info(f"🧹 Backup cleaned up: {self.backup_path}")
+            return True
+
+        except Exception as e:
+            logger.warning(f"⚠️ Could not cleanup backup: {e}")
+            return False
+
+
+class SafeCSVMerger:
+    """Safe CSV data merging with gap filling capabilities and data integrity validation.
+
+    Provides safe merging of gap-filling data into existing CSV files using atomic operations.
+    Handles temporal data insertion, duplicate detection, and maintains chronological order
+    while preserving data integrity through comprehensive validation.
+
+    Features:
+    - Atomic merge operations with backup/rollback
+    - Chronological data insertion and sorting
+    - Duplicate detection and handling
+    - Data validation before and after merge
+    - Gap boundary validation
+    - Maintains CSV header comments and metadata
+
+    The merge process:
+    1. Create backup of original CSV file
+    2. Load existing data and gap data
+    3. Validate gap boundaries and data format
+    4. Remove any overlapping data in gap range
+    5. Insert new gap data chronologically
+    6. Validate merged dataset integrity
+    7. Atomically write merged data
+
+    Examples:
+        Basic gap filling:
+
+        >>> from datetime import datetime
+        >>> import pandas as pd
+        >>> from pathlib import Path
+        >>>
+        >>> # Create gap data to fill missing period
+        >>> gap_data = pd.DataFrame({
+        ...     "date": ["2024-01-01 12:00:00", "2024-01-01 13:00:00"],
+        ...     "open": [100.0, 101.0],
+        ...     "high": [102.0, 103.0],
+        ...     "low": [99.0, 100.0],
+        ...     "close": [101.0, 102.0],
+        ...     "volume": [1000, 1100]
+        ... })
+        >>>
+        >>> merger = SafeCSVMerger(Path("btc_1h.csv"))
+        >>> success = merger.merge_gap_data_safe(
+        ...     gap_data,
+        ...     datetime(2024, 1, 1, 12),
+        ...     datetime(2024, 1, 1, 13)
+        ... )
+        >>> if success:
+        ...     print("Gap filled successfully")
+        Gap filled successfully
+
+    Note:
+        The merge operation is atomic - either all data is merged successfully
+        or the original file remains unchanged. Always validate gap boundaries
+        to ensure data consistency.
+    """
+
+    def __init__(self, csv_path: Path):
+        """Initialize SafeCSVMerger for the specified CSV file.
+
+        Args:
+            csv_path (Path): Path to the CSV file for gap filling operations.
+        """
+        self.csv_path = Path(csv_path)
+        self.atomic_ops = AtomicCSVOperations(csv_path)
+
+    def merge_gap_data_safe(
+        self, gap_data: pd.DataFrame, gap_start: datetime, gap_end: datetime
+    ) -> bool:
+        """Safely merge gap data into existing CSV using atomic operations.
+
+        Inserts gap-filling data into the existing CSV file while maintaining
+        chronological order and data integrity. Uses atomic operations to
+        ensure the merge is completed safely or not at all.
+
+        Args:
+            gap_data (pd.DataFrame): DataFrame containing gap data to merge.
+                Must have columns matching the existing CSV structure.
+                Timestamp column must be named 'date'.
+            gap_start (datetime): Start timestamp of the gap being filled.
+                Used for validation and boundary checking.
+            gap_end (datetime): End timestamp of the gap being filled.
+                Used for validation and boundary checking.
+
+        Returns:
+            bool: True if merge completed successfully, False if merge failed.
+                All errors (including malformed gap data or a missing target
+                file) are caught internally, logged, and answered with a
+                rollback attempt, so failures surface as a False return
+                rather than an exception.
+
+        Examples:
+            >>> merger = SafeCSVMerger(Path("eth_data.csv"))
+            >>> gap_df = pd.DataFrame({...})  # Gap data
+            >>> success = merger.merge_gap_data_safe(
+            ...     gap_df,
+            ...     datetime(2024, 1, 1, 12),
+            ...     datetime(2024, 1, 1, 15)
+            ... )
+            >>> print(f"Merge success: {success}")
+            Merge success: True
+
+        Note:
+            This method automatically handles:
+            - Backup creation before modification
+            - Data validation and format checking
+            - Chronological sorting after merge
+            - Rollback on any failure
+        """
+
+        logger.info(f"🎯 SAFE GAP MERGE: {gap_start} → {gap_end}")
+        logger.info(f"📊 Gap data: {len(gap_data)} rows")
+
+        try:
+            # Step 1: Create backup
+            self.atomic_ops.create_backup()
+
+            # Step 2: Load existing data
+            logger.info("📄 Loading existing CSV data...")
+            existing_df = pd.read_csv(self.csv_path, comment="#")
+            existing_df["date"] = pd.to_datetime(existing_df["date"])
+
+            original_count = len(existing_df)
+            logger.info(f"📊 Original data: {original_count} rows")
+
+            # Step 3: Prepare gap data
+            gap_data = gap_data.copy()
+            gap_data["date"] = pd.to_datetime(gap_data["date"])
+
+            # Step 4: Remove existing data in gap range
+            gap_mask = (existing_df["date"] >= gap_start) & (existing_df["date"] <= gap_end)
+            removed_count = gap_mask.sum()
+
+            logger.info(f"🗑️ Removing {removed_count} existing rows in gap range")
+            df_cleaned = existing_df[~gap_mask].copy()
+
+            # Step 5: Merge with gap data
+            logger.info("🔧 Merging gap data...")
+            merged_df = pd.concat([df_cleaned, gap_data], ignore_index=True)
+
+            # Step 6: Sort by date
+            merged_df = merged_df.sort_values("date").reset_index(drop=True)
+            final_count = len(merged_df)
+
+            logger.info(f"📊 Merged result: {final_count} rows")
+            logger.info(f"📈 Net change: {final_count - original_count:+d} rows")
+
+            # Step 7: Validate merge
+            gap_check = ((merged_df["date"] >= gap_start) & (merged_df["date"] <= gap_end)).sum()
+            expected_gap_rows = len(gap_data)
+
+            if gap_check != expected_gap_rows:
+                raise ValueError(
+                    f"Gap merge validation failed: expected {expected_gap_rows}, got {gap_check}"
+                )
+
+            # Step 8: Atomic write
+            success = self.atomic_ops.write_dataframe_atomic(merged_df)
+
+            if success:
+                logger.info("✅ Safe gap merge completed successfully")
+                # Keep backup for now, don't auto-cleanup
+                return True
+            else:
+                logger.error("❌ Atomic write failed, rolling back...")
+                self.atomic_ops.rollback_from_backup()
+                return False
+
+        except Exception as e:
+            logger.error(f"❌ Safe gap merge failed: {e}")
+
+            # Attempt rollback (backup_path is only set once create_backup() has run)
+            if self.atomic_ops.backup_path:
+                logger.info("🔄 Attempting rollback...")
+                self.atomic_ops.rollback_from_backup()
+
+            return False
+
+
+def main():
+    """Test atomic operations functionality"""
+    logger.info("🧪 TESTING ATOMIC FILE OPERATIONS")
+
+    # Test with sample data
+    test_csv = Path("../sample_data/binance_spot_SOLUSDT-1h_20210806-20250831_v2.5.0.csv")
+
+    if not test_csv.exists():
+        logger.error(f"Test file not found: {test_csv}")
+        return 1
+
+    # Test backup and restore
+    atomic_ops = AtomicCSVOperations(test_csv)
+
+    # Create backup
+    backup_path = atomic_ops.create_backup()
+    logger.info(f"✅ Backup created: {backup_path}")
+
+    # Read headers
+    headers = atomic_ops.read_header_comments()
+    logger.info(f"✅ Headers read: {len(headers)} lines")
+
+    # Load and validate data
+    df = pd.read_csv(test_csv, comment="#")
+    is_valid, msg = atomic_ops.validate_dataframe(df)
+    logger.info(f"✅ Validation: {is_valid} - {msg}")
+
+    logger.info("✅ All atomic operations tests passed")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
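
Beyond the docstring examples, the backup → write → rollback contract of AtomicCSVOperations is worth seeing end to end. The sketch below is illustrative only: the file name and OHLCV rows are invented, and it simply exercises the methods shown in the diff above against an existing CSV.

from pathlib import Path

import pandas as pd

from gapless_crypto_clickhouse.gap_filling.safe_file_operations import AtomicCSVOperations

# Hypothetical target file; any existing CSV with the module's required
# OHLCV columns (date, open, high, low, close, volume) works.
csv_path = Path("btc_1h.csv")

df = pd.DataFrame({
    "date": ["2024-01-01 00:00:00", "2024-01-01 01:00:00"],
    "open": [100.0, 101.0],
    "high": [101.5, 102.0],
    "low": [99.5, 100.5],
    "close": [101.0, 101.8],
    "volume": [1200, 950],
})

atomic_ops = AtomicCSVOperations(csv_path)
atomic_ops.create_backup()                 # enables rollback before any write
if atomic_ops.write_dataframe_atomic(df):  # temp file + validation + atomic rename
    atomic_ops.cleanup_backup()            # optional; the merge code keeps its backup
else:
    atomic_ops.rollback_from_backup()      # restore the pre-write state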
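
The same pattern applies one level up: SafeCSVMerger wraps the whole multi-step merge, so a caller only chooses the gap window and supplies replacement rows. Again a minimal sketch with invented data; on any failure the method logs, attempts a rollback, and returns False rather than raising.

from datetime import datetime
from pathlib import Path

import pandas as pd

from gapless_crypto_clickhouse.gap_filling.safe_file_operations import SafeCSVMerger

# Replacement rows for a missing 12:00-13:00 window (illustrative values).
gap_data = pd.DataFrame({
    "date": ["2024-01-01 12:00:00", "2024-01-01 13:00:00"],
    "open": [100.0, 101.0],
    "high": [102.0, 103.0],
    "low": [99.0, 100.0],
    "close": [101.0, 102.0],
    "volume": [1000, 1100],
})

merger = SafeCSVMerger(Path("btc_1h.csv"))  # hypothetical file
ok = merger.merge_gap_data_safe(gap_data, datetime(2024, 1, 1, 12), datetime(2024, 1, 1, 13))
print("gap filled" if ok else "merge failed; original file preserved")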