gapless-crypto-clickhouse 7.1.0 (gapless_crypto_clickhouse-7.1.0-py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. gapless_crypto_clickhouse/__init__.py +147 -0
  2. gapless_crypto_clickhouse/__probe__.py +349 -0
  3. gapless_crypto_clickhouse/api.py +1032 -0
  4. gapless_crypto_clickhouse/clickhouse/__init__.py +17 -0
  5. gapless_crypto_clickhouse/clickhouse/config.py +119 -0
  6. gapless_crypto_clickhouse/clickhouse/connection.py +269 -0
  7. gapless_crypto_clickhouse/clickhouse/schema.sql +98 -0
  8. gapless_crypto_clickhouse/clickhouse/schema_validator.py +312 -0
  9. gapless_crypto_clickhouse/clickhouse_query.py +642 -0
  10. gapless_crypto_clickhouse/collectors/__init__.py +21 -0
  11. gapless_crypto_clickhouse/collectors/binance_public_data_collector.py +1994 -0
  12. gapless_crypto_clickhouse/collectors/clickhouse_bulk_loader.py +446 -0
  13. gapless_crypto_clickhouse/collectors/concurrent_collection_orchestrator.py +407 -0
  14. gapless_crypto_clickhouse/collectors/csv_format_detector.py +123 -0
  15. gapless_crypto_clickhouse/collectors/httpx_downloader.py +395 -0
  16. gapless_crypto_clickhouse/collectors/hybrid_url_generator.py +316 -0
  17. gapless_crypto_clickhouse/exceptions.py +145 -0
  18. gapless_crypto_clickhouse/gap_filling/__init__.py +1 -0
  19. gapless_crypto_clickhouse/gap_filling/safe_file_operations.py +439 -0
  20. gapless_crypto_clickhouse/gap_filling/universal_gap_filler.py +757 -0
  21. gapless_crypto_clickhouse/llms.txt +268 -0
  22. gapless_crypto_clickhouse/probe.py +235 -0
  23. gapless_crypto_clickhouse/py.typed +0 -0
  24. gapless_crypto_clickhouse/query_api.py +374 -0
  25. gapless_crypto_clickhouse/resume/__init__.py +12 -0
  26. gapless_crypto_clickhouse/resume/intelligent_checkpointing.py +383 -0
  27. gapless_crypto_clickhouse/utils/__init__.py +29 -0
  28. gapless_crypto_clickhouse/utils/error_handling.py +202 -0
  29. gapless_crypto_clickhouse/utils/etag_cache.py +194 -0
  30. gapless_crypto_clickhouse/utils/timeframe_constants.py +90 -0
  31. gapless_crypto_clickhouse/utils/timestamp_format_analyzer.py +256 -0
  32. gapless_crypto_clickhouse/utils/timestamp_utils.py +130 -0
  33. gapless_crypto_clickhouse/validation/__init__.py +36 -0
  34. gapless_crypto_clickhouse/validation/csv_validator.py +677 -0
  35. gapless_crypto_clickhouse/validation/models.py +220 -0
  36. gapless_crypto_clickhouse/validation/storage.py +502 -0
  37. gapless_crypto_clickhouse-7.1.0.dist-info/METADATA +1277 -0
  38. gapless_crypto_clickhouse-7.1.0.dist-info/RECORD +40 -0
  39. gapless_crypto_clickhouse-7.1.0.dist-info/WHEEL +4 -0
  40. gapless_crypto_clickhouse-7.1.0.dist-info/licenses/LICENSE +21 -0
gapless_crypto_clickhouse/validation/csv_validator.py
@@ -0,0 +1,677 @@
+ """CSV validation module for cryptocurrency market data quality assurance.
+
+ This module provides comprehensive validation for CSV files containing OHLCV data,
+ ensuring data integrity, completeness, and quality before use in trading systems.
+
+ Validation Layers:
+     1. Structure Validation: Column presence and format detection
+     2. DateTime Validation: Chronological order and gap detection
+     3. OHLCV Quality: Logical consistency and value ranges
+     4. Coverage Validation: Expected vs actual bar counts
+     5. Statistical Anomaly Detection: Outlier and pattern analysis
+
+ SLO Targets:
+     Correctness: 100% - all validation rules must be accurate
+     Observability: Complete reporting with errors, warnings, and metrics
+     Maintainability: Single source of truth for CSV validation
+ """
+
+ import time
+ from datetime import datetime, timezone
+ from pathlib import Path
+ from typing import Any, Dict, Optional, Union
+
+ import pandas as pd
+
+ from .models import ValidationReport
+ from .storage import ValidationStorage, extract_symbol_timeframe_from_path
+
+
+ class CSVValidator:
+     """Validator for cryptocurrency market data CSV files.
+
+     Provides multi-layer validation including structure checking, datetime
+     sequence validation, OHLCV quality analysis, coverage calculation, and
+     statistical anomaly detection.
+
+     Examples:
+         >>> validator = CSVValidator()
+         >>> results = validator.validate_csv_file("BTCUSDT-1h.csv", expected_timeframe="1h")
+         >>> if results["total_errors"] == 0:
+         ...     print("Validation passed!")
+         ... else:
+         ...     print(f"Validation failed: {results['validation_summary']}")
+
+     Note:
+         All validation errors and warnings are logged to console and returned
+         in the validation results dictionary.
+     """
+
+     def _run_structure_validation_layer(self, df: pd.DataFrame, validation_results: dict) -> None:
+         """Run structure validation layer and update results.
+
+         Args:
+             df: DataFrame to validate
+             validation_results: Validation results dict to update
+         """
+         print("\n1. BASIC STRUCTURE VALIDATION")
+         structure_validation = self._validate_csv_structure(df)
+         validation_results["structure_validation"] = structure_validation
+         print(f" Columns: {structure_validation['status']}")
+
+         if structure_validation["errors"]:
+             for error in structure_validation["errors"]:
+                 print(f" ❌ {error}")
+                 validation_results["total_errors"] += 1
+
+     def _run_datetime_validation_layer(
+         self, df: pd.DataFrame, expected_timeframe: Optional[str], validation_results: dict
+     ) -> None:
+         """Run datetime validation layer and update results.
+
+         Args:
+             df: DataFrame to validate
+             expected_timeframe: Expected timeframe for gap detection
+             validation_results: Validation results dict to update
+         """
+         print("\n2. DATE/TIME VALIDATION")
+         datetime_validation = self._validate_datetime_sequence(df, expected_timeframe)
+         validation_results["datetime_validation"] = datetime_validation
+         print(
+             f" Date Range: {datetime_validation['date_range']['start']} to {datetime_validation['date_range']['end']}"
+         )
+         print(f" Duration: {datetime_validation['duration_days']:.1f} days")
+         print(f" Gaps Found: {datetime_validation['gaps_found']}")
+         print(f" Sequence: {datetime_validation['chronological_order']}")
+
+         if datetime_validation["errors"]:
+             for error in datetime_validation["errors"]:
+                 print(f" ❌ {error}")
+                 validation_results["total_errors"] += 1
+         if datetime_validation["warnings"]:
+             for warning in datetime_validation["warnings"]:
+                 print(f" ⚠️ {warning}")
+                 validation_results["total_warnings"] += 1
+
+     def _run_ohlcv_validation_layer(self, df: pd.DataFrame, validation_results: dict) -> None:
+         """Run OHLCV quality validation layer and update results.
+
+         Args:
+             df: DataFrame to validate
+             validation_results: Validation results dict to update
+         """
+         print("\n3. OHLCV DATA QUALITY VALIDATION")
+         ohlcv_validation = self._validate_ohlcv_quality(df)
+         validation_results["ohlcv_validation"] = ohlcv_validation
+         print(
+             f" Price Range: ${ohlcv_validation['price_range']['min']:.4f} - ${ohlcv_validation['price_range']['max']:.4f}"
+         )
+         print(
+             f" Volume Range: {ohlcv_validation['volume_stats']['min']:.2f} - {ohlcv_validation['volume_stats']['max']:,.0f}"
+         )
+         print(f" OHLC Logic Errors: {ohlcv_validation['ohlc_errors']}")
+         print(f" Negative/Zero Values: {ohlcv_validation['negative_zero_values']}")
+
+         if ohlcv_validation["errors"]:
+             for error in ohlcv_validation["errors"]:
+                 print(f" ❌ {error}")
+                 validation_results["total_errors"] += 1
+         if ohlcv_validation["warnings"]:
+             for warning in ohlcv_validation["warnings"]:
+                 print(f" ⚠️ {warning}")
+                 validation_results["total_warnings"] += 1
+
+     def _run_coverage_and_anomaly_layers(
+         self, df: pd.DataFrame, expected_timeframe: Optional[str], validation_results: dict
+     ) -> None:
+         """Run coverage and anomaly validation layers.
+
+         Args:
+             df: DataFrame to validate
+             expected_timeframe: Expected timeframe for coverage calculation
+             validation_results: Validation results dict to update
+         """
+         print("\n4. EXPECTED COVERAGE VALIDATION")
+         coverage_validation = self._validate_expected_coverage(df, expected_timeframe)
+         validation_results["coverage_validation"] = coverage_validation
+         print(f" Expected Bars: {coverage_validation['expected_bars']:,}")
+         print(f" Actual Bars: {coverage_validation['actual_bars']:,}")
+         print(f" Coverage: {coverage_validation['coverage_percentage']:.1f}%")
+
+         print("\n5. STATISTICAL ANOMALY DETECTION")
+         anomaly_validation = self._validate_statistical_anomalies(df)
+         validation_results["anomaly_validation"] = anomaly_validation
+         print(f" Price Outliers: {anomaly_validation['price_outliers']}")
+         print(f" Volume Outliers: {anomaly_validation['volume_outliers']}")
+         print(f" Suspicious Patterns: {anomaly_validation['suspicious_patterns']}")
+
+     def _generate_final_validation_summary(self, validation_results: dict) -> None:
+         """Generate and print final validation summary.
+
+         Args:
+             validation_results: Validation results dict to update with summary
+         """
+         if validation_results["total_errors"] == 0:
+             if validation_results["total_warnings"] == 0:
+                 validation_results["validation_summary"] = "PERFECT - No errors or warnings"
+                 print("\n✅ VALIDATION RESULT: PERFECT")
+                 print(" No errors or warnings found. Data quality is excellent.")
+             else:
+                 validation_results["validation_summary"] = (
+                     f"GOOD - {validation_results['total_warnings']} warnings"
+                 )
+                 print("\n✅ VALIDATION RESULT: GOOD")
+                 print(f" No errors, but {validation_results['total_warnings']} warnings found.")
+         else:
+             validation_results["validation_summary"] = (
+                 f"FAILED - {validation_results['total_errors']} errors, {validation_results['total_warnings']} warnings"
+             )
+             print("\n❌ VALIDATION RESULT: FAILED")
+             print(
+                 f" {validation_results['total_errors']} errors and {validation_results['total_warnings']} warnings found."
+             )
+
+     def validate_csv_file(
+         self,
+         csv_filepath: Union[str, Path],
+         expected_timeframe: Optional[str] = None,
+         store_report: bool = False,
+     ) -> Dict[str, Any]:
+         """
+         Comprehensive validation of CSV file data integrity, completeness, and quality.
+
+         Args:
+             csv_filepath: Path to CSV file to validate
+             expected_timeframe: Expected timeframe (e.g., '30m') for interval validation
+             store_report: If True, persist validation report to DuckDB for analysis (default: False)
+
+         Returns:
+             dict: Validation results with detailed analysis
+
+         Examples:
+             >>> validator = CSVValidator()
+             >>> results = validator.validate_csv_file("data.csv", "1h")
+             >>> print(f"Errors: {results['total_errors']}, Warnings: {results['total_warnings']}")
+
+             >>> # Store validation report for AI agent analysis
+             >>> results = validator.validate_csv_file("data.csv", "1h", store_report=True)
+         """
+         # Start timing for performance metrics
+         start_time = time.perf_counter()
+
+         csv_filepath = Path(csv_filepath)
+
+         print(f"\n{'=' * 60}")
+         print(f"VALIDATING: {csv_filepath.name}")
+         print(f"{'=' * 60}")
+
+         validation_results = {
+             # isoformat() on an aware UTC datetime already carries the "+00:00" offset
+             "validation_timestamp": datetime.now(timezone.utc).isoformat(),
+             "file_path": str(csv_filepath),
+             "file_exists": csv_filepath.exists(),
+             "file_size_mb": 0,
+             "total_errors": 0,
+             "total_warnings": 0,
+             "validation_summary": "UNKNOWN",
+         }
+
+         if not csv_filepath.exists():
+             validation_results["validation_summary"] = "FAILED - File not found"
+             validation_results["total_errors"] = 1
+             return validation_results
+
+         validation_results["file_size_mb"] = csv_filepath.stat().st_size / (1024 * 1024)
+
+         try:
+             # Load CSV data efficiently
+             print("Loading and parsing CSV data...")
+             df = pd.read_csv(csv_filepath, comment="#")
+             validation_results["total_bars"] = len(df)
+             print(f" ✅ Loaded {len(df):,} data bars")
+
+             # Run all validation layers
+             self._run_structure_validation_layer(df, validation_results)
+             self._run_datetime_validation_layer(df, expected_timeframe, validation_results)
+             self._run_ohlcv_validation_layer(df, validation_results)
+             self._run_coverage_and_anomaly_layers(df, expected_timeframe, validation_results)
+
+             # Generate final summary
+             self._generate_final_validation_summary(validation_results)
+
+         except Exception as e:
+             validation_results["validation_summary"] = f"ERROR - {str(e)}"
+             validation_results["total_errors"] += 1
+             print(f"❌ Validation failed with exception: {e}")
+
+         # Calculate validation duration
+         end_time = time.perf_counter()
+         duration_ms = (end_time - start_time) * 1000
+
+         # Store report to DuckDB if requested
+         if store_report:
+             try:
+                 # Extract symbol and timeframe from filepath
+                 symbol, timeframe = extract_symbol_timeframe_from_path(str(csv_filepath))
+
+                 # Convert legacy dict to typed ValidationReport
+                 report = ValidationReport.from_legacy_dict(
+                     validation_results, duration_ms=duration_ms, symbol=symbol, timeframe=timeframe
+                 )
+
+                 # Persist to DuckDB
+                 storage = ValidationStorage()
+                 storage.insert_report(report)
+                 print(f"\n📊 Validation report stored to database ({duration_ms:.2f}ms)")
+
+             except Exception as e:
+                 print(f"⚠️ Failed to store validation report: {e}")
+
+         return validation_results
+
+     def _detect_csv_format_type(
+         self, df: pd.DataFrame, expected_columns: list, legacy_columns: list
+     ) -> tuple[str, bool, bool]:
+         """Detect CSV format type (enhanced, legacy, or incomplete).
+
+         Args:
+             df: DataFrame to analyze
+             expected_columns: List of enhanced format columns
+             legacy_columns: List of legacy format columns
+
+         Returns:
+             tuple: (format_type, has_enhanced_format, has_legacy_format)
+         """
+         has_enhanced_format = all(col in df.columns for col in expected_columns)
+         has_legacy_format = all(col in df.columns for col in legacy_columns)
+
+         if has_enhanced_format:
+             format_type = "enhanced"
+         elif has_legacy_format:
+             format_type = "legacy"
+         else:
+             format_type = "incomplete"
+
+         return format_type, has_enhanced_format, has_legacy_format
+
+     def _validate_column_completeness(
+         self,
+         df: pd.DataFrame,
+         format_type: str,
+         expected_columns: list,
+         legacy_columns: list,
+         errors: list,
+         warnings: list,
+     ) -> None:
+         """Validate column completeness based on format type.
+
+         Args:
+             df: DataFrame to validate
+             format_type: Detected format type
+             expected_columns: List of enhanced format columns
+             legacy_columns: List of legacy format columns
+             errors: List to append errors to
+             warnings: List to append warnings to
+         """
+         if format_type == "enhanced":
+             missing_columns = [col for col in expected_columns if col not in df.columns]
+             if missing_columns:
+                 errors.append(f"Missing enhanced columns: {missing_columns}")
+         elif format_type == "legacy":
+             warnings.append(
+                 "Legacy format detected - missing microstructure columns for advanced analysis"
+             )
+             missing_enhanced = [col for col in expected_columns if col not in df.columns]
+             warnings.append(f"Enhanced features unavailable: {missing_enhanced}")
+         else:  # incomplete format
+             missing_basic = [col for col in legacy_columns if col not in df.columns]
+             errors.append(f"Missing basic required columns: {missing_basic}")
+
+     def _check_extra_columns(
+         self, df: pd.DataFrame, expected_columns: list, warnings: list
+     ) -> None:
+         """Check for unexpected extra columns.
+
+         Args:
+             df: DataFrame to check
+             expected_columns: List of expected columns
+             warnings: List to append warnings to
+         """
+         extra_columns = [col for col in df.columns if col not in expected_columns]
+         if extra_columns:
+             warnings.append(f"Unexpected extra columns: {extra_columns}")
+
+     def _validate_csv_structure(self, df: pd.DataFrame) -> Dict[str, Any]:
+         """Validate CSV has correct structure and columns.
+
+         Args:
+             df: DataFrame to validate
+
+         Returns:
+             dict: Structure validation results with status, format type, errors, and warnings
+
+         Note:
+             Supports both enhanced (11-column) and legacy (6-column) formats for
+             backward compatibility.
+         """
+         # Enhanced expected columns for complete microstructure data
+         expected_columns = [
+             "date",
+             "open",
+             "high",
+             "low",
+             "close",
+             "volume",
+             "close_time",
+             "quote_asset_volume",
+             "number_of_trades",
+             "taker_buy_base_asset_volume",
+             "taker_buy_quote_asset_volume",
+         ]
+
+         # Legacy format for backward compatibility
+         legacy_columns = ["date", "open", "high", "low", "close", "volume"]
+
+         errors = []
+         warnings = []
+
+         # Detect format type
+         format_type, has_enhanced_format, has_legacy_format = self._detect_csv_format_type(
+             df, expected_columns, legacy_columns
+         )
+
+         # Validate column completeness based on format
+         self._validate_column_completeness(
+             df, format_type, expected_columns, legacy_columns, errors, warnings
+         )
+
+         # Check for extra columns
+         self._check_extra_columns(df, expected_columns, warnings)
+
+         # Check for empty data
+         if len(df) == 0:
+             errors.append("CSV file is empty (no data rows)")
+
+         return {
+             "status": "VALID" if not errors else "INVALID",
+             "format_type": format_type,
+             "errors": errors,
+             "warnings": warnings,
+             "columns_found": list(df.columns),
+             "expected_columns": expected_columns,
+             "legacy_columns": legacy_columns,
+         }
+
+     def _validate_datetime_sequence(
+         self, df: pd.DataFrame, expected_timeframe: Optional[str]
+     ) -> Dict[str, Any]:
+         """Validate datetime sequence is complete and chronological.
+
+         Args:
+             df: DataFrame with 'date' column to validate
+             expected_timeframe: Expected timeframe for gap detection (e.g., '1h', '30m')
+
+         Returns:
+             dict: DateTime validation results with status, date range, gaps, and errors
+
+         Note:
+             Detects all gaps > expected interval and reports chronological ordering issues.
+         """
+         errors = []
+         warnings = []
+         gaps_found = 0
+
+         # Convert date column to datetime
+         try:
+             df["datetime"] = pd.to_datetime(df["date"])
+         except Exception as e:
+             errors.append(f"Failed to parse dates: {e}")
+             return {"status": "INVALID", "errors": errors, "warnings": warnings}
+
+         # Check chronological order
+         is_sorted = df["datetime"].is_monotonic_increasing
+
+         # Find gaps if we have expected timeframe
+         gap_details = []
+         if expected_timeframe and len(df) > 1:
+             # Calculate expected interval in minutes
+             interval_map = {"1m": 1, "3m": 3, "5m": 5, "15m": 15, "30m": 30, "1h": 60, "2h": 120}
+             expected_interval = interval_map.get(expected_timeframe, 0)
+
+             if expected_interval > 0:
+                 expected_delta = pd.Timedelta(minutes=expected_interval)
+
+                 # Check for gaps
+                 for i in range(1, len(df)):
+                     actual_delta = df["datetime"].iloc[i] - df["datetime"].iloc[i - 1]
+                     if actual_delta > expected_delta:
+                         gaps_found += 1
+                         gap_details.append(
+                             {
+                                 "position": i,
+                                 "expected_time": (
+                                     df["datetime"].iloc[i - 1] + expected_delta
+                                 ).isoformat(),
+                                 "actual_time": df["datetime"].iloc[i].isoformat(),
+                                 "gap_duration": str(actual_delta - expected_delta),
+                             }
+                         )
+
+                         # Record every single gap for complete validation tracking
+                         warnings.append(
+                             f"Gap at position {i}: expected {expected_delta}, got {actual_delta}"
+                         )
+
+         if not is_sorted:
+             errors.append("Timestamps are not in chronological order")
+
+         if gaps_found > 10:
+             errors.append(f"Too many gaps found: {gaps_found} (data may be incomplete)")
+         elif gaps_found > 0:
+             warnings.append(f"{gaps_found} timestamp gaps found (market closures or data issues)")
+
+         return {
+             "status": "VALID" if not errors else "INVALID",
+             "errors": errors,
+             "warnings": warnings,
+             "date_range": {
+                 "start": df["datetime"].min().isoformat(),
+                 "end": df["datetime"].max().isoformat(),
+             },
+             "duration_days": (df["datetime"].max() - df["datetime"].min()).days,
+             "chronological_order": is_sorted,
+             "gaps_found": gaps_found,
+             "gap_details": gap_details,  # Complete gap details for thorough analysis
+         }
+
+     def _validate_ohlcv_quality(self, df: pd.DataFrame) -> Dict[str, Any]:
+         """Validate OHLCV data quality and logical consistency.
+
+         Args:
+             df: DataFrame with OHLCV columns to validate
+
+         Returns:
+             dict: OHLCV quality validation results with price ranges, volume stats, and errors
+
+         Note:
+             Checks OHLC logic (High >= Low, Open/Close within range), detects negative/zero
+             values, and flags volume anomalies.
+         """
+         errors = []
+         warnings = []
+
+         # Check for negative or zero values
+         negative_zero_count = 0
+         for col in ["open", "high", "low", "close"]:
+             negative_zero = (df[col] <= 0).sum()
+             if negative_zero > 0:
+                 errors.append(f"Found {negative_zero} negative/zero values in {col}")
+                 negative_zero_count += negative_zero
+
+         # Check volume (can be zero but not negative)
+         negative_volume = (df["volume"] < 0).sum()
+         if negative_volume > 0:
+             errors.append(f"Found {negative_volume} negative volume values")
+
+         zero_volume = (df["volume"] == 0).sum()
+         if zero_volume > 0:
+             warnings.append(f"Found {zero_volume} zero volume bars")
+
+         # Check OHLC logic: High >= Low, Open/Close within High/Low range
+         ohlc_errors = 0
+
+         # High should be >= Low
+         high_low_errors = (df["high"] < df["low"]).sum()
+         if high_low_errors > 0:
+             errors.append(f"Found {high_low_errors} bars where High < Low")
+             ohlc_errors += high_low_errors
+
+         # Open should be within High/Low range
+         open_range_errors = ((df["open"] > df["high"]) | (df["open"] < df["low"])).sum()
+         if open_range_errors > 0:
+             errors.append(f"Found {open_range_errors} bars where Open is outside High/Low range")
+             ohlc_errors += open_range_errors
+
+         # Close should be within High/Low range
+         close_range_errors = ((df["close"] > df["high"]) | (df["close"] < df["low"])).sum()
+         if close_range_errors > 0:
+             errors.append(f"Found {close_range_errors} bars where Close is outside High/Low range")
+             ohlc_errors += close_range_errors
+
+         return {
+             "status": "VALID" if not errors else "INVALID",
+             "errors": errors,
+             "warnings": warnings,
+             "price_range": {
+                 "min": min(df["low"].min(), df["high"].min(), df["open"].min(), df["close"].min()),
+                 "max": max(df["low"].max(), df["high"].max(), df["open"].max(), df["close"].max()),
+             },
+             "volume_stats": {
+                 "min": df["volume"].min(),
+                 "max": df["volume"].max(),
+                 "mean": df["volume"].mean(),
+             },
+             "ohlc_errors": ohlc_errors,
+             "negative_zero_values": negative_zero_count,
+         }
+
+     def _validate_expected_coverage(
+         self, df: pd.DataFrame, expected_timeframe: Optional[str]
+     ) -> Dict[str, Any]:
+         """Validate data coverage matches expected timeframe and duration.
+
+         Args:
+             df: DataFrame with 'date' column
+             expected_timeframe: Expected timeframe for coverage calculation (e.g., '1h')
+
+         Returns:
+             dict: Coverage validation results with expected/actual bar counts and percentage
+
+         Note:
+             Warns if coverage < 95% (missing data) or > 105% (duplicate data).
+         """
+         warnings = []
+
+         if not expected_timeframe or len(df) == 0:
+             # Keep the keys the reporting layer prints even when coverage is skipped
+             return {
+                 "status": "SKIPPED",
+                 "warnings": ["Cannot validate coverage without timeframe"],
+                 "expected_bars": 0,
+                 "actual_bars": len(df),
+                 "coverage_percentage": 0.0,
+                 "duration_days": 0,
+             }
+
+         # Calculate expected bars based on timeframe and actual date range
+         df["datetime"] = pd.to_datetime(df["date"])
+         start_time = df["datetime"].min()
+         end_time = df["datetime"].max()
+         duration = end_time - start_time
+
+         # Calculate expected number of bars
+         interval_map = {"1m": 1, "3m": 3, "5m": 5, "15m": 15, "30m": 30, "1h": 60, "2h": 120}
+         interval_minutes = interval_map.get(expected_timeframe, 0)
+
+         if interval_minutes > 0:
+             expected_bars = int(duration.total_seconds() / (interval_minutes * 60)) + 1
+             actual_bars = len(df)
+             coverage_percentage = (actual_bars / expected_bars) * 100
+
+             if coverage_percentage < 95:
+                 warnings.append(
+                     f"Low coverage: {coverage_percentage:.1f}% (may indicate missing data)"
+                 )
+             elif coverage_percentage > 105:
+                 warnings.append(
+                     f"High coverage: {coverage_percentage:.1f}% (may indicate duplicate data)"
+                 )
+         else:
+             expected_bars = 0
+             coverage_percentage = 0
+             warnings.append(f"Unknown timeframe '{expected_timeframe}' for coverage calculation")
+
+         return {
+             "status": "VALID" if not warnings else "WARNING",
+             "warnings": warnings,
+             "expected_bars": expected_bars,
+             "actual_bars": len(df),
+             "coverage_percentage": coverage_percentage,
+             "duration_days": duration.days,
+         }
+
+     def _validate_statistical_anomalies(self, df: pd.DataFrame) -> Dict[str, Any]:
+         """Detect statistical anomalies in price and volume data.
+
+         Args:
+             df: DataFrame with OHLCV columns
+
+         Returns:
+             dict: Anomaly validation results with outlier counts and suspicious pattern detection
+
+         Note:
+             Uses IQR (Interquartile Range) method for outlier detection. Flags if > 5% price
+             outliers, > 2% volume outliers, or > 10% repeated values in any price column.
+         """
+         warnings = []
+
+         # Calculate basic statistics
+         price_cols = ["open", "high", "low", "close"]
+
+         # Price outliers (using IQR method)
+         price_outliers = 0
+         for col in price_cols:
+             Q1 = df[col].quantile(0.25)
+             Q3 = df[col].quantile(0.75)
+             IQR = Q3 - Q1
+             lower_bound = Q1 - 1.5 * IQR
+             upper_bound = Q3 + 1.5 * IQR
+             outliers = ((df[col] < lower_bound) | (df[col] > upper_bound)).sum()
+             price_outliers += outliers
+
+         # Volume outliers
+         vol_Q1 = df["volume"].quantile(0.25)
+         vol_Q3 = df["volume"].quantile(0.75)
+         vol_IQR = vol_Q3 - vol_Q1
+         vol_upper_bound = vol_Q3 + 1.5 * vol_IQR
+         volume_outliers = (df["volume"] > vol_upper_bound).sum()
+
+         # Suspicious patterns
+         suspicious_patterns = 0
+
+         # Check for repeated identical prices (suspicious)
+         for col in price_cols:
+             repeated = df[col].value_counts()
+             max_repeats = repeated.max()
+             if max_repeats > len(df) * 0.1:  # More than 10% identical values
+                 warnings.append(f"Suspicious: {col} has {max_repeats} repeated values")
+                 suspicious_patterns += 1
+
+         if price_outliers > len(df) * 0.05:  # More than 5% outliers
+             warnings.append(
+                 f"High number of price outliers: {price_outliers} ({100 * price_outliers / len(df):.1f}%)"
+             )
+
+         if volume_outliers > len(df) * 0.02:  # More than 2% volume outliers
+             warnings.append(
+                 f"High number of volume outliers: {volume_outliers} ({100 * volume_outliers / len(df):.1f}%)"
+             )
+
+         return {
+             "status": "VALID" if not warnings else "WARNING",
+             "warnings": warnings,
+             "price_outliers": price_outliers,
+             "volume_outliers": volume_outliers,
+             "suspicious_patterns": suspicious_patterns,
+         }
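For reference, a minimal sketch of the IQR rule applied above (illustrative only, not part of the packaged file): with Q1 = 100 and Q3 = 110 the IQR is 10, so values below 85 or above 125 would be counted as outliers.

    import pandas as pd

    prices = pd.Series([100, 102, 104, 105, 107, 110, 300])  # 300 is an obvious spike
    q1, q3 = prices.quantile(0.25), prices.quantile(0.75)
    iqr = q3 - q1
    outliers = prices[(prices < q1 - 1.5 * iqr) | (prices > q3 + 1.5 * iqr)]
    print(list(outliers))  # [300]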