quantmllibrary 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- quantml/__init__.py +74 -0
- quantml/autograd.py +154 -0
- quantml/cli/__init__.py +10 -0
- quantml/cli/run_experiment.py +385 -0
- quantml/config/__init__.py +28 -0
- quantml/config/config.py +259 -0
- quantml/data/__init__.py +33 -0
- quantml/data/cache.py +149 -0
- quantml/data/feature_store.py +234 -0
- quantml/data/futures.py +254 -0
- quantml/data/loaders.py +236 -0
- quantml/data/memory_optimizer.py +234 -0
- quantml/data/validators.py +390 -0
- quantml/experiments/__init__.py +23 -0
- quantml/experiments/logger.py +208 -0
- quantml/experiments/results.py +158 -0
- quantml/experiments/tracker.py +223 -0
- quantml/features/__init__.py +25 -0
- quantml/features/base.py +104 -0
- quantml/features/gap_features.py +124 -0
- quantml/features/registry.py +138 -0
- quantml/features/volatility_features.py +140 -0
- quantml/features/volume_features.py +142 -0
- quantml/functional.py +37 -0
- quantml/models/__init__.py +27 -0
- quantml/models/attention.py +258 -0
- quantml/models/dropout.py +130 -0
- quantml/models/gru.py +319 -0
- quantml/models/linear.py +112 -0
- quantml/models/lstm.py +353 -0
- quantml/models/mlp.py +286 -0
- quantml/models/normalization.py +289 -0
- quantml/models/rnn.py +154 -0
- quantml/models/tcn.py +238 -0
- quantml/online.py +209 -0
- quantml/ops.py +1707 -0
- quantml/optim/__init__.py +42 -0
- quantml/optim/adafactor.py +206 -0
- quantml/optim/adagrad.py +157 -0
- quantml/optim/adam.py +267 -0
- quantml/optim/lookahead.py +97 -0
- quantml/optim/quant_optimizer.py +228 -0
- quantml/optim/radam.py +192 -0
- quantml/optim/rmsprop.py +203 -0
- quantml/optim/schedulers.py +286 -0
- quantml/optim/sgd.py +181 -0
- quantml/py.typed +0 -0
- quantml/streaming.py +175 -0
- quantml/tensor.py +462 -0
- quantml/time_series.py +447 -0
- quantml/training/__init__.py +135 -0
- quantml/training/alpha_eval.py +203 -0
- quantml/training/backtest.py +280 -0
- quantml/training/backtest_analysis.py +168 -0
- quantml/training/cv.py +106 -0
- quantml/training/data_loader.py +177 -0
- quantml/training/ensemble.py +84 -0
- quantml/training/feature_importance.py +135 -0
- quantml/training/features.py +364 -0
- quantml/training/futures_backtest.py +266 -0
- quantml/training/gradient_clipping.py +206 -0
- quantml/training/losses.py +248 -0
- quantml/training/lr_finder.py +127 -0
- quantml/training/metrics.py +376 -0
- quantml/training/regularization.py +89 -0
- quantml/training/trainer.py +239 -0
- quantml/training/walk_forward.py +190 -0
- quantml/utils/__init__.py +51 -0
- quantml/utils/gradient_check.py +274 -0
- quantml/utils/logging.py +181 -0
- quantml/utils/ops_cpu.py +231 -0
- quantml/utils/profiling.py +364 -0
- quantml/utils/reproducibility.py +220 -0
- quantml/utils/serialization.py +335 -0
- quantmllibrary-0.1.0.dist-info/METADATA +536 -0
- quantmllibrary-0.1.0.dist-info/RECORD +79 -0
- quantmllibrary-0.1.0.dist-info/WHEEL +5 -0
- quantmllibrary-0.1.0.dist-info/licenses/LICENSE +22 -0
- quantmllibrary-0.1.0.dist-info/top_level.txt +1 -0
quantml/data/memory_optimizer.py
@@ -0,0 +1,234 @@
"""
Memory optimization utilities for large datasets.
"""

from typing import List, Dict, Any, Optional
import sys

# Try to import pandas
try:
    import pandas as pd
    HAS_PANDAS = True
except ImportError:
    HAS_PANDAS = False
    pd = None

# Try to import NumPy
try:
    import numpy as np
    HAS_NUMPY = True
except ImportError:
    HAS_NUMPY = False
    np = None


def optimize_dtypes(df: Any) -> Any:
    """
    Optimize pandas DataFrame dtypes to reduce memory usage.

    Args:
        df: Pandas DataFrame

    Returns:
        DataFrame with optimized dtypes
    """
    if not HAS_PANDAS:
        raise ImportError("pandas required. Install with: pip install pandas")

    if not isinstance(df, pd.DataFrame):
        return df

    original_memory = df.memory_usage(deep=True).sum()

    # Optimize numeric columns
    for col in df.select_dtypes(include=['int64']).columns:
        df[col] = pd.to_numeric(df[col], downcast='integer')

    for col in df.select_dtypes(include=['float64']).columns:
        df[col] = pd.to_numeric(df[col], downcast='float')

    # Optimize object columns (strings)
    for col in df.select_dtypes(include=['object']).columns:
        num_unique = df[col].nunique()
        num_total = len(df)
        if num_unique / num_total < 0.5:  # Low cardinality
            df[col] = df[col].astype('category')

    new_memory = df.memory_usage(deep=True).sum()
    reduction = (1 - new_memory / original_memory) * 100

    return df


def chunked_process(
    data: List,
    chunk_size: int,
    process_fn: callable,
    *args,
    **kwargs
) -> List:
    """
    Process data in chunks to reduce memory usage.

    Args:
        data: List of data to process
        chunk_size: Size of each chunk
        process_fn: Function to process each chunk
        *args: Arguments for process function
        **kwargs: Keyword arguments for process function

    Returns:
        List of processed results
    """
    results = []

    for i in range(0, len(data), chunk_size):
        chunk = data[i:i + chunk_size]
        chunk_result = process_fn(chunk, *args, **kwargs)
        results.extend(chunk_result)

    return results


def estimate_memory_usage(data: Any) -> Dict[str, Any]:
    """
    Estimate memory usage of data structure.

    Args:
        data: Data structure (list, dict, DataFrame, etc.)

    Returns:
        Dictionary with memory usage information
    """
    if HAS_PANDAS and isinstance(data, pd.DataFrame):
        memory_info = {
            'total_mb': data.memory_usage(deep=True).sum() / 1024 / 1024,
            'per_column': {
                col: size / 1024 / 1024
                for col, size in data.memory_usage(deep=True).items()
            },
            'num_rows': len(data),
            'num_columns': len(data.columns)
        }
        return memory_info

    elif isinstance(data, list):
        # Rough estimate
        if len(data) == 0:
            return {'total_mb': 0, 'num_items': 0}

        # Estimate size of first item
        sample_size = sys.getsizeof(data[0])
        if isinstance(data[0], list):
            sample_size = sum(sys.getsizeof(item) for item in data[0])

        total_size = len(data) * sample_size
        memory_info = {
            'total_mb': total_size / 1024 / 1024,
            'num_items': len(data),
            'estimated_item_size_bytes': sample_size
        }
        return memory_info

    else:
        size_bytes = sys.getsizeof(data)
        return {
            'total_mb': size_bytes / 1024 / 1024,
            'size_bytes': size_bytes
        }


class StreamingDataProcessor:
    """Process large datasets in streaming fashion."""

    def __init__(self, chunk_size: int = 10000):
        """
        Initialize streaming processor.

        Args:
            chunk_size: Size of each processing chunk
        """
        self.chunk_size = chunk_size

    def process_file(
        self,
        filepath: str,
        process_fn: callable,
        *args,
        **kwargs
    ) -> List:
        """
        Process file in chunks.

        Args:
            filepath: Path to data file
            process_fn: Function to process each chunk
            *args: Arguments for process function
            **kwargs: Keyword arguments for process function

        Returns:
            List of processed results
        """
        if not HAS_PANDAS:
            raise ImportError("pandas required for file processing")

        results = []

        # Read file in chunks
        chunk_iterator = pd.read_csv(
            filepath,
            chunksize=self.chunk_size
        )

        for chunk in chunk_iterator:
            chunk_result = process_fn(chunk, *args, **kwargs)
            if isinstance(chunk_result, list):
                results.extend(chunk_result)
            else:
                results.append(chunk_result)

        return results

    def process_iterable(
        self,
        data_iterable,
        process_fn: callable,
        *args,
        **kwargs
    ) -> List:
        """
        Process iterable in chunks.

        Args:
            data_iterable: Iterable data source
            process_fn: Function to process each chunk
            *args: Arguments for process function
            **kwargs: Keyword arguments for process function

        Returns:
            List of processed results
        """
        results = []
        chunk = []

        for item in data_iterable:
            chunk.append(item)

            if len(chunk) >= self.chunk_size:
                chunk_result = process_fn(chunk, *args, **kwargs)
                if isinstance(chunk_result, list):
                    results.extend(chunk_result)
                else:
                    results.append(chunk_result)
                chunk = []

        # Process remaining items
        if chunk:
            chunk_result = process_fn(chunk, *args, **kwargs)
            if isinstance(chunk_result, list):
                results.extend(chunk_result)
            else:
                results.append(chunk_result)

        return results
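A minimal usage sketch of the module above, for orientation only. The import path, function names, and class name come from the package listing; the sample DataFrame and the lambda callback are made-up illustrations, not part of the package:

import pandas as pd

from quantml.data.memory_optimizer import (
    StreamingDataProcessor,
    estimate_memory_usage,
    optimize_dtypes,
)

# Hypothetical frame with int64, float64 and low-cardinality string columns
df = pd.DataFrame({
    "price": [101.25, 101.50, 101.75] * 1000,
    "size": [10, 20, 30] * 1000,
    "side": ["buy", "sell", "buy"] * 1000,
})

df = optimize_dtypes(df)  # downcasts the numeric columns, converts "side" to category
print(estimate_memory_usage(df)["total_mb"])

# Process an iterable in chunks of 500 items; the callback returns one count per chunk
processor = StreamingDataProcessor(chunk_size=500)
counts = processor.process_iterable(range(2000), lambda chunk: len(chunk))
print(counts)  # [500, 500, 500, 500]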
quantml/data/validators.py
@@ -0,0 +1,390 @@
"""
Data validation and integrity checks for market data.
"""

from typing import List, Dict, Any, Optional, Tuple
from dataclasses import dataclass
from datetime import datetime, timedelta
import math

# Try to import pandas
try:
    import pandas as pd
    HAS_PANDAS = True
except ImportError:
    HAS_PANDAS = False
    pd = None


@dataclass
class DataQualityReport:
    """Data quality report."""
    total_rows: int
    missing_timestamps: int
    duplicate_timestamps: int
    zero_volume_count: int
    nan_price_count: int
    nan_volume_count: int
    negative_price_count: int
    negative_volume_count: int
    gaps_detected: List[Tuple[datetime, datetime]]
    quality_score: float  # 0.0 to 1.0

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary."""
        return {
            'total_rows': self.total_rows,
            'missing_timestamps': self.missing_timestamps,
            'duplicate_timestamps': self.duplicate_timestamps,
            'zero_volume_count': self.zero_volume_count,
            'nan_price_count': self.nan_price_count,
            'nan_volume_count': self.nan_volume_count,
            'negative_price_count': self.negative_price_count,
            'negative_volume_count': self.negative_volume_count,
            'gaps_detected': len(self.gaps_detected),
            'quality_score': self.quality_score
        }


def validate_timestamps(
    timestamps: List,
    expected_frequency: Optional[str] = None,
    allow_gaps: bool = True
) -> Tuple[bool, List[Tuple[Any, Any]]]:
    """
    Validate timestamp sequence.

    Args:
        timestamps: List of timestamps
        expected_frequency: Expected frequency (e.g., '1min', '5min', '1H')
        allow_gaps: Whether gaps are allowed (e.g., weekends, holidays)

    Returns:
        Tuple of (is_valid, list of gaps)
    """
    if len(timestamps) < 2:
        return True, []

    gaps = []

    # Convert to datetime if needed
    if HAS_PANDAS:
        try:
            timestamps = pd.to_datetime(timestamps)
        except Exception:
            pass

    # Check for duplicates
    seen = set()
    for ts in timestamps:
        if ts in seen:
            return False, [(ts, ts)]  # Duplicate
        seen.add(ts)

    # Sort timestamps
    sorted_ts = sorted(timestamps)

    # Detect gaps if frequency specified
    if expected_frequency and not allow_gaps:
        # Parse frequency
        if expected_frequency.endswith('min'):
            minutes = int(expected_frequency[:-3])
            delta = timedelta(minutes=minutes)
        elif expected_frequency.endswith('H'):
            hours = int(expected_frequency[:-1])
            delta = timedelta(hours=hours)
        elif expected_frequency == '1D':
            delta = timedelta(days=1)
        else:
            delta = None

        if delta:
            for i in range(len(sorted_ts) - 1):
                expected_next = sorted_ts[i] + delta
                if sorted_ts[i + 1] != expected_next:
                    gaps.append((sorted_ts[i], sorted_ts[i + 1]))

    return len(gaps) == 0, gaps


def check_duplicates(data: List, key: Optional[str] = None) -> List[int]:
    """
    Check for duplicate values.

    Args:
        data: List of values or list of dictionaries
        key: Key to check if data is list of dicts

    Returns:
        List of indices with duplicates
    """
    if key:
        values = [item[key] for item in data if isinstance(item, dict)]
    else:
        values = data

    seen = {}
    duplicates = []

    for i, val in enumerate(values):
        if val in seen:
            duplicates.append(i)
            if seen[val] not in duplicates:
                duplicates.append(seen[val])
        else:
            seen[val] = i

    return duplicates


def check_missing_values(
    prices: List[float],
    volumes: Optional[List[float]] = None
) -> Dict[str, int]:
    """
    Check for missing values (NaN, None, zero).

    Args:
        prices: List of prices
        volumes: Optional list of volumes

    Returns:
        Dictionary with counts of missing values
    """
    results = {
        'nan_prices': 0,
        'zero_prices': 0,
        'negative_prices': 0,
        'nan_volumes': 0,
        'zero_volumes': 0,
        'negative_volumes': 0
    }

    for price in prices:
        if price is None or (isinstance(price, float) and math.isnan(price)):
            results['nan_prices'] += 1
        elif price == 0:
            results['zero_prices'] += 1
        elif price < 0:
            results['negative_prices'] += 1

    if volumes:
        for volume in volumes:
            if volume is None or (isinstance(volume, float) and math.isnan(volume)):
                results['nan_volumes'] += 1
            elif volume == 0:
                results['zero_volumes'] += 1
            elif volume < 0:
                results['negative_volumes'] += 1

    return results


def validate_price_data(
    prices: List[float],
    volumes: Optional[List[float]] = None,
    timestamps: Optional[List] = None,
    min_price: float = 0.01,
    max_price: float = 1e6
) -> Tuple[bool, List[str]]:
    """
    Validate price data for common issues.

    Args:
        prices: List of prices
        volumes: Optional list of volumes
        timestamps: Optional list of timestamps
        min_price: Minimum valid price
        max_price: Maximum valid price

    Returns:
        Tuple of (is_valid, list of error messages)
    """
    errors = []

    if not prices:
        errors.append("Empty price data")
        return False, errors

    # Check prices
    for i, price in enumerate(prices):
        if price is None or (isinstance(price, float) and math.isnan(price)):
            errors.append(f"NaN price at index {i}")
        elif price <= 0:
            errors.append(f"Non-positive price at index {i}: {price}")
        elif price < min_price:
            errors.append(f"Price too low at index {i}: {price}")
        elif price > max_price:
            errors.append(f"Price too high at index {i}: {price}")

    # Check volumes if provided
    if volumes:
        if len(volumes) != len(prices):
            errors.append(f"Volume length ({len(volumes)}) != price length ({len(prices)})")
        else:
            for i, volume in enumerate(volumes):
                if volume is None or (isinstance(volume, float) and math.isnan(volume)):
                    errors.append(f"NaN volume at index {i}")
                elif volume < 0:
                    errors.append(f"Negative volume at index {i}: {volume}")

    # Check timestamps if provided
    if timestamps:
        if len(timestamps) != len(prices):
            errors.append(f"Timestamp length ({len(timestamps)}) != price length ({len(prices)})")
        else:
            is_valid, gaps = validate_timestamps(timestamps)
            if not is_valid:
                errors.append(f"Invalid timestamps: {len(gaps)} gaps detected")

    return len(errors) == 0, errors


def generate_data_quality_report(
    prices: List[float],
    volumes: Optional[List[float]] = None,
    timestamps: Optional[List] = None,
    expected_frequency: Optional[str] = None
) -> DataQualityReport:
    """
    Generate comprehensive data quality report.

    Args:
        prices: List of prices
        volumes: Optional list of volumes
        timestamps: Optional list of timestamps
        expected_frequency: Expected data frequency

    Returns:
        DataQualityReport object
    """
    total_rows = len(prices)

    # Check missing values
    missing = check_missing_values(prices, volumes)

    # Check timestamps
    missing_ts = 0
    duplicate_ts = 0
    gaps = []

    if timestamps:
        is_valid, gaps = validate_timestamps(timestamps, expected_frequency)
        if not is_valid:
            # Count issues
            seen = set()
            for ts in timestamps:
                if ts is None:
                    missing_ts += 1
                elif ts in seen:
                    duplicate_ts += 1
                else:
                    seen.add(ts)

    # Calculate quality score
    issues = (
        missing['nan_prices'] +
        missing['zero_prices'] +
        missing['negative_prices'] +
        missing['nan_volumes'] +
        missing['zero_volumes'] +
        missing['negative_volumes'] +
        missing_ts +
        duplicate_ts +
        len(gaps)
    )

    quality_score = max(0.0, 1.0 - (issues / max(total_rows, 1)))

    return DataQualityReport(
        total_rows=total_rows,
        missing_timestamps=missing_ts,
        duplicate_timestamps=duplicate_ts,
        zero_volume_count=missing['zero_volumes'],
        nan_price_count=missing['nan_prices'],
        nan_volume_count=missing['nan_volumes'],
        negative_price_count=missing['negative_prices'],
        negative_volume_count=missing['negative_volumes'],
        gaps_detected=gaps,
        quality_score=quality_score
    )


def validate_futures_data(
    prices: List[float],
    opens: Optional[List[float]] = None,
    closes: Optional[List[float]] = None,
    volumes: Optional[List[float]] = None,
    timestamps: Optional[List] = None,
    instrument: str = "ES"
) -> Tuple[bool, List[str]]:
    """
    Validate futures-specific data issues.

    Args:
        prices: Price data
        opens: Opening prices (for gap detection)
        closes: Closing prices (for gap detection)
        volumes: Volume data
        timestamps: Timestamp data
        instrument: Instrument symbol

    Returns:
        Tuple of (is_valid, list of error messages)
    """
    errors = []

    # Basic price validation
    is_valid, price_errors = validate_price_data(prices, volumes, timestamps)
    if not is_valid:
        errors.extend(price_errors)

    # Check for holiday gaps
    if opens and closes and len(opens) > 1 and len(closes) > 1:
        for i in range(1, min(len(opens), len(closes))):
            if closes[i-1] > 0:
                gap = abs((opens[i] - closes[i-1]) / closes[i-1])
                # Large gaps might indicate holiday or data issue
                if gap > 0.1:  # 10% gap
                    if timestamps and i < len(timestamps) and i-1 < len(timestamps):
                        days_diff = (timestamps[i] - timestamps[i-1]).days
                        if days_diff <= 1:
                            errors.append(f"Large gap at index {i} ({gap*100:.2f}%) without holiday")

    # Check for roll dates (volume drops)
    if volumes and len(volumes) > 1:
        for i in range(1, len(volumes)):
            if volumes[i] < volumes[i-1] * 0.2:  # 80% drop
                errors.append(f"Potential contract roll at index {i} (volume drop)")

    return len(errors) == 0, errors


def validate_roll_dates(
    roll_indices: List[int],
    data_length: int
) -> Tuple[bool, List[str]]:
    """
    Validate contract roll dates.

    Args:
        roll_indices: List of roll indices
        data_length: Total data length

    Returns:
        Tuple of (is_valid, list of error messages)
    """
    errors = []

    for idx in roll_indices:
        if idx < 0 or idx >= data_length:
            errors.append(f"Invalid roll index: {idx} (data length: {data_length})")

    # Check for too frequent rolls
    if len(roll_indices) > 1:
        for i in range(1, len(roll_indices)):
            if roll_indices[i] - roll_indices[i-1] < 10:
                errors.append(f"Rolls too frequent: {roll_indices[i-1]} -> {roll_indices[i]}")

    return len(errors) == 0, errors
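Again for orientation, a short sketch of how the validators above compose. The function names and return shapes come from the file shown; the toy price and volume series is invented, with one NaN price and one zero volume included deliberately to trigger findings:

from quantml.data.validators import generate_data_quality_report, validate_price_data

prices = [4500.25, 4501.00, float("nan"), 4499.75]
volumes = [1200.0, 0.0, 950.0, 1100.0]

ok, errors = validate_price_data(prices, volumes)
print(ok)      # False
print(errors)  # ["NaN price at index 2"]

report = generate_data_quality_report(prices, volumes)
print(report.quality_score)  # 0.5 here: 2 issues (1 NaN price, 1 zero volume) over 4 rows
print(report.to_dict())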
quantml/experiments/__init__.py
@@ -0,0 +1,23 @@
"""
Experiment tracking module.

Provides tools for tracking experiments, comparing results, and ensuring reproducibility.
"""

from quantml.experiments.tracker import ExperimentTracker, compare_experiments
from quantml.experiments.logger import CSVExperimentLogger, JSONExperimentLogger
from quantml.experiments.results import (
    compare_experiments as compare_results,
    generate_summary_table,
    export_for_paper
)

__all__ = [
    'ExperimentTracker',
    'compare_experiments',
    'CSVExperimentLogger',
    'JSONExperimentLogger',
    'compare_results',
    'generate_summary_table',
    'export_for_paper'
]
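One detail worth noting in this __init__: both quantml.experiments.tracker and quantml.experiments.results define a compare_experiments function, so the results version is re-exported under the alias compare_results rather than shadowing the tracker version. From a caller's perspective (the signatures live in modules not shown in this diff):

from quantml.experiments import compare_experiments, compare_results

# compare_experiments -> quantml.experiments.tracker.compare_experiments
# compare_results     -> quantml.experiments.results.compare_experiments (aliased on import)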