gapless-crypto-clickhouse 7.1.0 (py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gapless_crypto_clickhouse/__init__.py +147 -0
- gapless_crypto_clickhouse/__probe__.py +349 -0
- gapless_crypto_clickhouse/api.py +1032 -0
- gapless_crypto_clickhouse/clickhouse/__init__.py +17 -0
- gapless_crypto_clickhouse/clickhouse/config.py +119 -0
- gapless_crypto_clickhouse/clickhouse/connection.py +269 -0
- gapless_crypto_clickhouse/clickhouse/schema.sql +98 -0
- gapless_crypto_clickhouse/clickhouse/schema_validator.py +312 -0
- gapless_crypto_clickhouse/clickhouse_query.py +642 -0
- gapless_crypto_clickhouse/collectors/__init__.py +21 -0
- gapless_crypto_clickhouse/collectors/binance_public_data_collector.py +1994 -0
- gapless_crypto_clickhouse/collectors/clickhouse_bulk_loader.py +446 -0
- gapless_crypto_clickhouse/collectors/concurrent_collection_orchestrator.py +407 -0
- gapless_crypto_clickhouse/collectors/csv_format_detector.py +123 -0
- gapless_crypto_clickhouse/collectors/httpx_downloader.py +395 -0
- gapless_crypto_clickhouse/collectors/hybrid_url_generator.py +316 -0
- gapless_crypto_clickhouse/exceptions.py +145 -0
- gapless_crypto_clickhouse/gap_filling/__init__.py +1 -0
- gapless_crypto_clickhouse/gap_filling/safe_file_operations.py +439 -0
- gapless_crypto_clickhouse/gap_filling/universal_gap_filler.py +757 -0
- gapless_crypto_clickhouse/llms.txt +268 -0
- gapless_crypto_clickhouse/probe.py +235 -0
- gapless_crypto_clickhouse/py.typed +0 -0
- gapless_crypto_clickhouse/query_api.py +374 -0
- gapless_crypto_clickhouse/resume/__init__.py +12 -0
- gapless_crypto_clickhouse/resume/intelligent_checkpointing.py +383 -0
- gapless_crypto_clickhouse/utils/__init__.py +29 -0
- gapless_crypto_clickhouse/utils/error_handling.py +202 -0
- gapless_crypto_clickhouse/utils/etag_cache.py +194 -0
- gapless_crypto_clickhouse/utils/timeframe_constants.py +90 -0
- gapless_crypto_clickhouse/utils/timestamp_format_analyzer.py +256 -0
- gapless_crypto_clickhouse/utils/timestamp_utils.py +130 -0
- gapless_crypto_clickhouse/validation/__init__.py +36 -0
- gapless_crypto_clickhouse/validation/csv_validator.py +677 -0
- gapless_crypto_clickhouse/validation/models.py +220 -0
- gapless_crypto_clickhouse/validation/storage.py +502 -0
- gapless_crypto_clickhouse-7.1.0.dist-info/METADATA +1277 -0
- gapless_crypto_clickhouse-7.1.0.dist-info/RECORD +40 -0
- gapless_crypto_clickhouse-7.1.0.dist-info/WHEEL +4 -0
- gapless_crypto_clickhouse-7.1.0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,677 @@
"""CSV validation module for cryptocurrency market data quality assurance.

This module provides comprehensive validation for CSV files containing OHLCV data,
ensuring data integrity, completeness, and quality before use in trading systems.

Validation Layers:
    1. Structure Validation: Column presence and format detection
    2. DateTime Validation: Chronological order and gap detection
    3. OHLCV Quality: Logical consistency and value ranges
    4. Coverage Validation: Expected vs actual bar counts
    5. Statistical Anomaly Detection: Outlier and pattern analysis

SLO Targets:
    Correctness: 100% - all validation rules must be accurate
    Observability: Complete reporting with errors, warnings, and metrics
    Maintainability: Single source of truth for CSV validation
"""

import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, Optional, Union

import pandas as pd

from .models import ValidationReport
from .storage import ValidationStorage, extract_symbol_timeframe_from_path


class CSVValidator:
    """Validator for cryptocurrency market data CSV files.

    Provides multi-layer validation including structure checking, datetime
    sequence validation, OHLCV quality analysis, coverage calculation, and
    statistical anomaly detection.

    Examples:
        >>> validator = CSVValidator()
        >>> results = validator.validate_csv_file("BTCUSDT-1h.csv", expected_timeframe="1h")
        >>> if results["total_errors"] == 0:
        ...     print("Validation passed!")
        ... else:
        ...     print(f"Validation failed: {results['validation_summary']}")

    Note:
        All validation errors and warnings are logged to console and returned
        in the validation results dictionary.
    """

    def _run_structure_validation_layer(self, df: pd.DataFrame, validation_results: dict) -> None:
        """Run structure validation layer and update results.

        Args:
            df: DataFrame to validate
            validation_results: Validation results dict to update
        """
        print("\n1. BASIC STRUCTURE VALIDATION")
        structure_validation = self._validate_csv_structure(df)
        validation_results["structure_validation"] = structure_validation
        print(f"   Columns: {structure_validation['status']}")

        if structure_validation["errors"]:
            for error in structure_validation["errors"]:
                print(f"   ❌ {error}")
                validation_results["total_errors"] += 1

    def _run_datetime_validation_layer(
        self, df: pd.DataFrame, expected_timeframe: Optional[str], validation_results: dict
    ) -> None:
        """Run datetime validation layer and update results.

        Args:
            df: DataFrame to validate
            expected_timeframe: Expected timeframe for gap detection
            validation_results: Validation results dict to update
        """
        print("\n2. DATE/TIME VALIDATION")
        datetime_validation = self._validate_datetime_sequence(df, expected_timeframe)
        validation_results["datetime_validation"] = datetime_validation
        print(
            f"   Date Range: {datetime_validation['date_range']['start']} to {datetime_validation['date_range']['end']}"
        )
        print(f"   Duration: {datetime_validation['duration_days']:.1f} days")
        print(f"   Gaps Found: {datetime_validation['gaps_found']}")
        print(f"   Sequence: {datetime_validation['chronological_order']}")

        if datetime_validation["errors"]:
            for error in datetime_validation["errors"]:
                print(f"   ❌ {error}")
                validation_results["total_errors"] += 1
        if datetime_validation["warnings"]:
            for warning in datetime_validation["warnings"]:
                print(f"   ⚠️ {warning}")
                validation_results["total_warnings"] += 1

    def _run_ohlcv_validation_layer(self, df: pd.DataFrame, validation_results: dict) -> None:
        """Run OHLCV quality validation layer and update results.

        Args:
            df: DataFrame to validate
            validation_results: Validation results dict to update
        """
        print("\n3. OHLCV DATA QUALITY VALIDATION")
        ohlcv_validation = self._validate_ohlcv_quality(df)
        validation_results["ohlcv_validation"] = ohlcv_validation
        print(
            f"   Price Range: ${ohlcv_validation['price_range']['min']:.4f} - ${ohlcv_validation['price_range']['max']:.4f}"
        )
        print(
            f"   Volume Range: {ohlcv_validation['volume_stats']['min']:.2f} - {ohlcv_validation['volume_stats']['max']:,.0f}"
        )
        print(f"   OHLC Logic Errors: {ohlcv_validation['ohlc_errors']}")
        print(f"   Negative/Zero Values: {ohlcv_validation['negative_zero_values']}")

        if ohlcv_validation["errors"]:
            for error in ohlcv_validation["errors"]:
                print(f"   ❌ {error}")
                validation_results["total_errors"] += 1
        if ohlcv_validation["warnings"]:
            for warning in ohlcv_validation["warnings"]:
                print(f"   ⚠️ {warning}")
                validation_results["total_warnings"] += 1

    def _run_coverage_and_anomaly_layers(
        self, df: pd.DataFrame, expected_timeframe: Optional[str], validation_results: dict
    ) -> None:
        """Run coverage and anomaly validation layers.

        Args:
            df: DataFrame to validate
            expected_timeframe: Expected timeframe for coverage calculation
            validation_results: Validation results dict to update
        """
        print("\n4. EXPECTED COVERAGE VALIDATION")
        coverage_validation = self._validate_expected_coverage(df, expected_timeframe)
        validation_results["coverage_validation"] = coverage_validation
        print(f"   Expected Bars: {coverage_validation['expected_bars']:,}")
        print(f"   Actual Bars: {coverage_validation['actual_bars']:,}")
        print(f"   Coverage: {coverage_validation['coverage_percentage']:.1f}%")

        print("\n5. STATISTICAL ANOMALY DETECTION")
        anomaly_validation = self._validate_statistical_anomalies(df)
        validation_results["anomaly_validation"] = anomaly_validation
        print(f"   Price Outliers: {anomaly_validation['price_outliers']}")
        print(f"   Volume Outliers: {anomaly_validation['volume_outliers']}")
        print(f"   Suspicious Patterns: {anomaly_validation['suspicious_patterns']}")

    def _generate_final_validation_summary(self, validation_results: dict) -> None:
        """Generate and print final validation summary.

        Args:
            validation_results: Validation results dict to update with summary
        """
        if validation_results["total_errors"] == 0:
            if validation_results["total_warnings"] == 0:
                validation_results["validation_summary"] = "PERFECT - No errors or warnings"
                print("\n✅ VALIDATION RESULT: PERFECT")
                print("   No errors or warnings found. Data quality is excellent.")
            else:
                validation_results["validation_summary"] = (
                    f"GOOD - {validation_results['total_warnings']} warnings"
                )
                print("\n✅ VALIDATION RESULT: GOOD")
                print(f"   No errors, but {validation_results['total_warnings']} warnings found.")
        else:
            validation_results["validation_summary"] = (
                f"FAILED - {validation_results['total_errors']} errors, {validation_results['total_warnings']} warnings"
            )
            print("\n❌ VALIDATION RESULT: FAILED")
            print(
                f"   {validation_results['total_errors']} errors and {validation_results['total_warnings']} warnings found."
            )

    def validate_csv_file(
        self,
        csv_filepath: Union[str, Path],
        expected_timeframe: Optional[str] = None,
        store_report: bool = False,
    ) -> Dict[str, Any]:
        """
        Comprehensive validation of CSV file data integrity, completeness, and quality.

        Args:
            csv_filepath: Path to CSV file to validate
            expected_timeframe: Expected timeframe (e.g., '30m') for interval validation
            store_report: If True, persist validation report to DuckDB for analysis (default: False)

        Returns:
            dict: Validation results with detailed analysis

        Examples:
            >>> validator = CSVValidator()
            >>> results = validator.validate_csv_file("data.csv", "1h")
            >>> print(f"Errors: {results['total_errors']}, Warnings: {results['total_warnings']}")

            >>> # Store validation report for AI agent analysis
            >>> results = validator.validate_csv_file("data.csv", "1h", store_report=True)
        """
        # Start timing for performance metrics
        start_time = time.perf_counter()

        csv_filepath = Path(csv_filepath)

        print(f"\n{'=' * 60}")
        print(f"VALIDATING: {csv_filepath.name}")
        print(f"{'=' * 60}")

        validation_results = {
"validation_timestamp": datetime.now(timezone.utc).isoformat() + "Z",
            "file_path": str(csv_filepath),
            "file_exists": csv_filepath.exists(),
            "file_size_mb": 0,
            "total_errors": 0,
            "total_warnings": 0,
            "validation_summary": "UNKNOWN",
        }

        if not csv_filepath.exists():
            validation_results["validation_summary"] = "FAILED - File not found"
            validation_results["total_errors"] = 1
            return validation_results

        validation_results["file_size_mb"] = csv_filepath.stat().st_size / (1024 * 1024)

        try:
            # Load CSV data efficiently
            print("Loading and parsing CSV data...")
            df = pd.read_csv(csv_filepath, comment="#")
            validation_results["total_bars"] = len(df)
            print(f"   ✅ Loaded {len(df):,} data bars")

            # Run all validation layers
            self._run_structure_validation_layer(df, validation_results)
            self._run_datetime_validation_layer(df, expected_timeframe, validation_results)
            self._run_ohlcv_validation_layer(df, validation_results)
            self._run_coverage_and_anomaly_layers(df, expected_timeframe, validation_results)

            # Generate final summary
            self._generate_final_validation_summary(validation_results)

        except Exception as e:
            validation_results["validation_summary"] = f"ERROR - {str(e)}"
            validation_results["total_errors"] += 1
            print(f"❌ Validation failed with exception: {e}")

        # Calculate validation duration
        end_time = time.perf_counter()
        duration_ms = (end_time - start_time) * 1000

        # Store report to DuckDB if requested
        if store_report:
            try:
                # Extract symbol and timeframe from filepath
                symbol, timeframe = extract_symbol_timeframe_from_path(str(csv_filepath))

                # Convert legacy dict to typed ValidationReport
                report = ValidationReport.from_legacy_dict(
                    validation_results, duration_ms=duration_ms, symbol=symbol, timeframe=timeframe
                )

                # Persist to DuckDB
                storage = ValidationStorage()
                storage.insert_report(report)
                print(f"\n📊 Validation report stored to database ({duration_ms:.2f}ms)")

            except Exception as e:
                print(f"⚠️ Failed to store validation report: {e}")

        return validation_results

    def _detect_csv_format_type(
        self, df: pd.DataFrame, expected_columns: list, legacy_columns: list
    ) -> tuple[str, bool, bool]:
        """Detect CSV format type (enhanced, legacy, or incomplete).

        Args:
            df: DataFrame to analyze
            expected_columns: List of enhanced format columns
            legacy_columns: List of legacy format columns

        Returns:
            tuple: (format_type, has_enhanced_format, has_legacy_format)
        """
        has_enhanced_format = all(col in df.columns for col in expected_columns)
        has_legacy_format = all(col in df.columns for col in legacy_columns)

        if has_enhanced_format:
            format_type = "enhanced"
        elif has_legacy_format:
            format_type = "legacy"
        else:
            format_type = "incomplete"

        return format_type, has_enhanced_format, has_legacy_format

    def _validate_column_completeness(
        self,
        df: pd.DataFrame,
        format_type: str,
        expected_columns: list,
        legacy_columns: list,
        errors: list,
        warnings: list,
    ) -> None:
        """Validate column completeness based on format type.

        Args:
            df: DataFrame to validate
            format_type: Detected format type
            expected_columns: List of enhanced format columns
            legacy_columns: List of legacy format columns
            errors: List to append errors to
            warnings: List to append warnings to
        """
        if format_type == "enhanced":
            missing_columns = [col for col in expected_columns if col not in df.columns]
            if missing_columns:
                errors.append(f"Missing enhanced columns: {missing_columns}")
        elif format_type == "legacy":
            warnings.append(
                "Legacy format detected - missing microstructure columns for advanced analysis"
            )
            missing_enhanced = [col for col in expected_columns if col not in df.columns]
            warnings.append(f"Enhanced features unavailable: {missing_enhanced}")
        else:  # incomplete format
            missing_basic = [col for col in legacy_columns if col not in df.columns]
            errors.append(f"Missing basic required columns: {missing_basic}")

    def _check_extra_columns(
        self, df: pd.DataFrame, expected_columns: list, warnings: list
    ) -> None:
        """Check for unexpected extra columns.

        Args:
            df: DataFrame to check
            expected_columns: List of expected columns
            warnings: List to append warnings to
        """
        extra_columns = [col for col in df.columns if col not in expected_columns]
        if extra_columns:
            warnings.append(f"Unexpected extra columns: {extra_columns}")

    def _validate_csv_structure(self, df: pd.DataFrame) -> Dict[str, Any]:
        """Validate CSV has correct structure and columns.

        Args:
            df: DataFrame to validate

        Returns:
            dict: Structure validation results with status, format type, errors, and warnings

        Note:
            Supports both enhanced (11-column) and legacy (6-column) formats for
            backward compatibility.
        """
        # Enhanced expected columns for complete microstructure data
        expected_columns = [
            "date",
            "open",
            "high",
            "low",
            "close",
            "volume",
            "close_time",
            "quote_asset_volume",
            "number_of_trades",
            "taker_buy_base_asset_volume",
            "taker_buy_quote_asset_volume",
        ]

        # Legacy format for backward compatibility
        legacy_columns = ["date", "open", "high", "low", "close", "volume"]

        errors = []
        warnings = []

        # Detect format type
        format_type, has_enhanced_format, has_legacy_format = self._detect_csv_format_type(
            df, expected_columns, legacy_columns
        )

        # Validate column completeness based on format
        self._validate_column_completeness(
            df, format_type, expected_columns, legacy_columns, errors, warnings
        )

        # Check for extra columns
        self._check_extra_columns(df, expected_columns, warnings)

        # Check for empty data
        if len(df) == 0:
            errors.append("CSV file is empty (no data rows)")

        return {
            "status": "VALID" if not errors else "INVALID",
            "format_type": format_type,
            "errors": errors,
            "warnings": warnings,
            "columns_found": list(df.columns),
            "expected_columns": expected_columns,
            "legacy_columns": legacy_columns,
        }

    def _validate_datetime_sequence(
        self, df: pd.DataFrame, expected_timeframe: Optional[str]
    ) -> Dict[str, Any]:
        """Validate datetime sequence is complete and chronological.

        Args:
            df: DataFrame with 'date' column to validate
            expected_timeframe: Expected timeframe for gap detection (e.g., '1h', '30m')

        Returns:
            dict: DateTime validation results with status, date range, gaps, and errors

        Note:
            Detects all gaps > expected interval and reports chronological ordering issues.
        """
        errors = []
        warnings = []
        gaps_found = 0

        # Convert date column to datetime
        try:
            df["datetime"] = pd.to_datetime(df["date"])
        except Exception as e:
            errors.append(f"Failed to parse dates: {e}")
            return {"status": "INVALID", "errors": errors, "warnings": warnings}

        # Check chronological order
        is_sorted = df["datetime"].is_monotonic_increasing

        # Find gaps if we have expected timeframe
        gap_details = []
        if expected_timeframe and len(df) > 1:
            # Calculate expected interval in minutes
            interval_map = {"1m": 1, "3m": 3, "5m": 5, "15m": 15, "30m": 30, "1h": 60, "2h": 120}
            expected_interval = interval_map.get(expected_timeframe, 0)

            if expected_interval > 0:
                expected_delta = pd.Timedelta(minutes=expected_interval)

                # Check for gaps
                for i in range(1, len(df)):
                    actual_delta = df["datetime"].iloc[i] - df["datetime"].iloc[i - 1]
                    if actual_delta > expected_delta:
                        gaps_found += 1
                        gap_details.append(
                            {
                                "position": i,
                                "expected_time": (
                                    df["datetime"].iloc[i - 1] + expected_delta
                                ).isoformat(),
                                "actual_time": df["datetime"].iloc[i].isoformat(),
                                "gap_duration": str(actual_delta - expected_delta),
                            }
                        )

                        # Record every single gap for complete validation tracking
                        warnings.append(
                            f"Gap at position {i}: expected {expected_delta}, got {actual_delta}"
                        )

        if not is_sorted:
            errors.append("Timestamps are not in chronological order")

        if gaps_found > 10:
            errors.append(f"Too many gaps found: {gaps_found} (data may be incomplete)")
        elif gaps_found > 0:
            warnings.append(f"{gaps_found} timestamp gaps found (market closures or data issues)")

        return {
            "status": "VALID" if not errors else "INVALID",
            "errors": errors,
            "warnings": warnings,
            "date_range": {
                "start": df["datetime"].min().isoformat(),
                "end": df["datetime"].max().isoformat(),
            },
            "duration_days": (df["datetime"].max() - df["datetime"].min()).days,
            "chronological_order": is_sorted,
            "gaps_found": gaps_found,
            "gap_details": gap_details,  # Complete gap details for thorough analysis
        }

    def _validate_ohlcv_quality(self, df: pd.DataFrame) -> Dict[str, Any]:
        """Validate OHLCV data quality and logical consistency.

        Args:
            df: DataFrame with OHLCV columns to validate

        Returns:
            dict: OHLCV quality validation results with price ranges, volume stats, and errors

        Note:
            Checks OHLC logic (High >= Low, Open/Close within range), detects negative/zero
            values, and flags volume anomalies.
        """
        errors = []
        warnings = []

        # Check for negative or zero values
        negative_zero_count = 0
        for col in ["open", "high", "low", "close"]:
            negative_zero = (df[col] <= 0).sum()
            if negative_zero > 0:
                errors.append(f"Found {negative_zero} negative/zero values in {col}")
                negative_zero_count += negative_zero

        # Check volume (can be zero but not negative)
        negative_volume = (df["volume"] < 0).sum()
        if negative_volume > 0:
            errors.append(f"Found {negative_volume} negative volume values")

        zero_volume = (df["volume"] == 0).sum()
        if zero_volume > 0:
            warnings.append(f"Found {zero_volume} zero volume bars")

        # Check OHLC logic: High >= Low, Open/Close within High/Low range
        ohlc_errors = 0

        # High should be >= Low
        high_low_errors = (df["high"] < df["low"]).sum()
        if high_low_errors > 0:
            errors.append(f"Found {high_low_errors} bars where High < Low")
            ohlc_errors += high_low_errors

        # Open should be within High/Low range
        open_range_errors = ((df["open"] > df["high"]) | (df["open"] < df["low"])).sum()
        if open_range_errors > 0:
            errors.append(f"Found {open_range_errors} bars where Open is outside High/Low range")
            ohlc_errors += open_range_errors

        # Close should be within High/Low range
        close_range_errors = ((df["close"] > df["high"]) | (df["close"] < df["low"])).sum()
        if close_range_errors > 0:
            errors.append(f"Found {close_range_errors} bars where Close is outside High/Low range")
            ohlc_errors += close_range_errors

        return {
            "status": "VALID" if not errors else "INVALID",
            "errors": errors,
            "warnings": warnings,
            "price_range": {
                "min": min(df["low"].min(), df["high"].min(), df["open"].min(), df["close"].min()),
                "max": max(df["low"].max(), df["high"].max(), df["open"].max(), df["close"].max()),
            },
            "volume_stats": {
                "min": df["volume"].min(),
                "max": df["volume"].max(),
                "mean": df["volume"].mean(),
            },
            "ohlc_errors": ohlc_errors,
            "negative_zero_values": negative_zero_count,
        }

    def _validate_expected_coverage(
        self, df: pd.DataFrame, expected_timeframe: Optional[str]
    ) -> Dict[str, Any]:
        """Validate data coverage matches expected timeframe and duration.

        Args:
            df: DataFrame with 'date' column
            expected_timeframe: Expected timeframe for coverage calculation (e.g., '1h')

        Returns:
            dict: Coverage validation results with expected/actual bar counts and percentage

        Note:
            Warns if coverage < 95% (missing data) or > 105% (duplicate data).
        """
        warnings = []

        if not expected_timeframe or len(df) == 0:
return {"status": "SKIPPED", "warnings": ["Cannot validate coverage without timeframe"]}

        # Calculate expected bars based on timeframe and actual date range
        df["datetime"] = pd.to_datetime(df["date"])
        start_time = df["datetime"].min()
        end_time = df["datetime"].max()
        duration = end_time - start_time

        # Calculate expected number of bars
        interval_map = {"1m": 1, "3m": 3, "5m": 5, "15m": 15, "30m": 30, "1h": 60, "2h": 120}
        interval_minutes = interval_map.get(expected_timeframe, 0)

        if interval_minutes > 0:
            expected_bars = int(duration.total_seconds() / (interval_minutes * 60)) + 1
            actual_bars = len(df)
            coverage_percentage = (actual_bars / expected_bars) * 100

            if coverage_percentage < 95:
                warnings.append(
                    f"Low coverage: {coverage_percentage:.1f}% (may indicate missing data)"
                )
            elif coverage_percentage > 105:
                warnings.append(
                    f"High coverage: {coverage_percentage:.1f}% (may indicate duplicate data)"
                )
        else:
            expected_bars = 0
            coverage_percentage = 0
            warnings.append(f"Unknown timeframe '{expected_timeframe}' for coverage calculation")

        return {
            "status": "VALID" if not warnings else "WARNING",
            "warnings": warnings,
            "expected_bars": expected_bars,
            "actual_bars": len(df),
            "coverage_percentage": coverage_percentage,
            "duration_days": duration.days,
        }

    def _validate_statistical_anomalies(self, df: pd.DataFrame) -> Dict[str, Any]:
        """Detect statistical anomalies in price and volume data.

        Args:
            df: DataFrame with OHLCV columns

        Returns:
            dict: Anomaly validation results with outlier counts and suspicious pattern detection

        Note:
            Uses IQR (Interquartile Range) method for outlier detection. Flags if > 5% price
            outliers, > 2% volume outliers, or > 10% repeated values in any price column.
        """
        warnings = []

        # Calculate basic statistics
        price_cols = ["open", "high", "low", "close"]

        # Price outliers (using IQR method)
        price_outliers = 0
        for col in price_cols:
            Q1 = df[col].quantile(0.25)
            Q3 = df[col].quantile(0.75)
            IQR = Q3 - Q1
            lower_bound = Q1 - 1.5 * IQR
            upper_bound = Q3 + 1.5 * IQR
            outliers = ((df[col] < lower_bound) | (df[col] > upper_bound)).sum()
            price_outliers += outliers

        # Volume outliers
        vol_Q1 = df["volume"].quantile(0.25)
        vol_Q3 = df["volume"].quantile(0.75)
        vol_IQR = vol_Q3 - vol_Q1
        vol_upper_bound = vol_Q3 + 1.5 * vol_IQR
        volume_outliers = (df["volume"] > vol_upper_bound).sum()

        # Suspicious patterns
        suspicious_patterns = 0

        # Check for repeated identical prices (suspicious)
        for col in price_cols:
            repeated = df[col].value_counts()
            max_repeats = repeated.max()
            if max_repeats > len(df) * 0.1:  # More than 10% identical values
                warnings.append(f"Suspicious: {col} has {max_repeats} repeated values")
                suspicious_patterns += 1

        if price_outliers > len(df) * 0.05:  # More than 5% outliers
            warnings.append(
                f"High number of price outliers: {price_outliers} ({100 * price_outliers / len(df):.1f}%)"
            )

        if volume_outliers > len(df) * 0.02:  # More than 2% volume outliers
            warnings.append(
                f"High number of volume outliers: {volume_outliers} ({100 * volume_outliers / len(df):.1f}%)"
            )

        return {
            "status": "VALID" if not warnings else "WARNING",
            "warnings": warnings,
            "price_outliers": price_outliers,
            "volume_outliers": volume_outliers,
            "suspicious_patterns": suspicious_patterns,
        }
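
A minimal usage sketch, assuming the wheel above is installed and the module is importable under the path shown in the RECORD (gapless_crypto_clickhouse/validation/csv_validator.py); the sample filename and bar values below are illustrative only:

# Sketch: validate a small legacy 6-column CSV with CSVValidator.
# Assumption: "BTCUSDT-1h.csv" is a hypothetical sample file created here for the demo.
from pathlib import Path

from gapless_crypto_clickhouse.validation.csv_validator import CSVValidator

sample = Path("BTCUSDT-1h.csv")
sample.write_text(
    "date,open,high,low,close,volume\n"
    "2024-01-01 00:00:00,42000,42100,41900,42050,120.5\n"
    "2024-01-01 01:00:00,42050,42200,42000,42150,98.2\n"
)

validator = CSVValidator()
# Legacy format is accepted; expect warnings about missing microstructure columns.
results = validator.validate_csv_file(sample, expected_timeframe="1h")
print(results["validation_summary"], results["total_errors"], results["total_warnings"])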