quantmllibrary-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- quantml/__init__.py +74 -0
- quantml/autograd.py +154 -0
- quantml/cli/__init__.py +10 -0
- quantml/cli/run_experiment.py +385 -0
- quantml/config/__init__.py +28 -0
- quantml/config/config.py +259 -0
- quantml/data/__init__.py +33 -0
- quantml/data/cache.py +149 -0
- quantml/data/feature_store.py +234 -0
- quantml/data/futures.py +254 -0
- quantml/data/loaders.py +236 -0
- quantml/data/memory_optimizer.py +234 -0
- quantml/data/validators.py +390 -0
- quantml/experiments/__init__.py +23 -0
- quantml/experiments/logger.py +208 -0
- quantml/experiments/results.py +158 -0
- quantml/experiments/tracker.py +223 -0
- quantml/features/__init__.py +25 -0
- quantml/features/base.py +104 -0
- quantml/features/gap_features.py +124 -0
- quantml/features/registry.py +138 -0
- quantml/features/volatility_features.py +140 -0
- quantml/features/volume_features.py +142 -0
- quantml/functional.py +37 -0
- quantml/models/__init__.py +27 -0
- quantml/models/attention.py +258 -0
- quantml/models/dropout.py +130 -0
- quantml/models/gru.py +319 -0
- quantml/models/linear.py +112 -0
- quantml/models/lstm.py +353 -0
- quantml/models/mlp.py +286 -0
- quantml/models/normalization.py +289 -0
- quantml/models/rnn.py +154 -0
- quantml/models/tcn.py +238 -0
- quantml/online.py +209 -0
- quantml/ops.py +1707 -0
- quantml/optim/__init__.py +42 -0
- quantml/optim/adafactor.py +206 -0
- quantml/optim/adagrad.py +157 -0
- quantml/optim/adam.py +267 -0
- quantml/optim/lookahead.py +97 -0
- quantml/optim/quant_optimizer.py +228 -0
- quantml/optim/radam.py +192 -0
- quantml/optim/rmsprop.py +203 -0
- quantml/optim/schedulers.py +286 -0
- quantml/optim/sgd.py +181 -0
- quantml/py.typed +0 -0
- quantml/streaming.py +175 -0
- quantml/tensor.py +462 -0
- quantml/time_series.py +447 -0
- quantml/training/__init__.py +135 -0
- quantml/training/alpha_eval.py +203 -0
- quantml/training/backtest.py +280 -0
- quantml/training/backtest_analysis.py +168 -0
- quantml/training/cv.py +106 -0
- quantml/training/data_loader.py +177 -0
- quantml/training/ensemble.py +84 -0
- quantml/training/feature_importance.py +135 -0
- quantml/training/features.py +364 -0
- quantml/training/futures_backtest.py +266 -0
- quantml/training/gradient_clipping.py +206 -0
- quantml/training/losses.py +248 -0
- quantml/training/lr_finder.py +127 -0
- quantml/training/metrics.py +376 -0
- quantml/training/regularization.py +89 -0
- quantml/training/trainer.py +239 -0
- quantml/training/walk_forward.py +190 -0
- quantml/utils/__init__.py +51 -0
- quantml/utils/gradient_check.py +274 -0
- quantml/utils/logging.py +181 -0
- quantml/utils/ops_cpu.py +231 -0
- quantml/utils/profiling.py +364 -0
- quantml/utils/reproducibility.py +220 -0
- quantml/utils/serialization.py +335 -0
- quantmllibrary-0.1.0.dist-info/METADATA +536 -0
- quantmllibrary-0.1.0.dist-info/RECORD +79 -0
- quantmllibrary-0.1.0.dist-info/WHEEL +5 -0
- quantmllibrary-0.1.0.dist-info/licenses/LICENSE +22 -0
- quantmllibrary-0.1.0.dist-info/top_level.txt +1 -0
quantml/config/config.py
ADDED
@@ -0,0 +1,259 @@
"""
Configuration management for QuantML experiments.

Supports YAML/JSON config files and command-line argument integration.
"""

import json
import os
from typing import Dict, Any, Optional, List
from dataclasses import dataclass, field, asdict
import argparse

# Try to import YAML
try:
    import yaml
    HAS_YAML = True
except ImportError:
    HAS_YAML = False


@dataclass
class DataConfig:
    """Data loading and preprocessing configuration."""
    instrument: str = "ES"  # MES, ES, MNQ, NQ, etc.
    start_date: str = "2015-01-01"
    end_date: str = "2024-12-31"
    data_source: str = "csv"  # csv, database, api
    data_path: Optional[str] = None
    frequency: str = "1min"  # 1min, 5min, daily
    validate_data: bool = True
    handle_missing: str = "forward_fill"  # forward_fill, drop, interpolate
    cache_features: bool = True
    feature_cache_path: str = "./cache/features"


@dataclass
class FeatureConfig:
    """Feature engineering configuration."""
    enabled_features: List[str] = field(default_factory=lambda: [
        "lagged_price",
        "rolling_mean",
        "rolling_std",
        "returns",
        "volatility"
    ])
    lag_periods: List[int] = field(default_factory=lambda: [1, 5, 10, 20])
    rolling_windows: List[int] = field(default_factory=lambda: [20, 50])
    normalize: bool = True
    normalization_method: str = "zscore"  # zscore, minmax, robust
    alpha_factors: Dict[str, Any] = field(default_factory=lambda: {
        "momentum": {"enabled": True, "lookback": 20},
        "mean_reversion": {"enabled": True, "window": 20},
        "volatility": {"enabled": True, "window": 20}
    })


@dataclass
class ModelConfig:
    """Model architecture and hyperparameters."""
    model_type: str = "Linear"  # Linear, SimpleRNN, TCN
    in_features: int = 10
    out_features: int = 1
    hidden_size: Optional[int] = None
    bias: bool = True
    dropout: float = 0.0
    activation: Optional[str] = None


@dataclass
class TrainingConfig:
    """Training configuration."""
    optimizer: str = "Adam"  # SGD, Adam, RMSProp, etc.
    learning_rate: float = 0.001
    batch_size: int = 32
    epochs: int = 100
    loss_function: str = "mse_loss"  # mse_loss, sharpe_loss, etc.
    early_stopping: Dict[str, Any] = field(default_factory=lambda: {
        "enabled": True,
        "patience": 10,
        "min_delta": 0.0001
    })
    gradient_clipping: Dict[str, Any] = field(default_factory=lambda: {
        "enabled": False,
        "max_norm": 1.0
    })
    scheduler: Optional[Dict[str, Any]] = None
    walk_forward: Dict[str, Any] = field(default_factory=lambda: {
        "enabled": True,
        "train_size": 500,
        "test_size": 100,
        "window_type": "expanding"  # expanding, rolling
    })


@dataclass
class ExperimentConfig:
    """Complete experiment configuration."""
    name: str = "default_experiment"
    description: str = ""
    random_seed: int = 42
    data: DataConfig = field(default_factory=DataConfig)
    features: FeatureConfig = field(default_factory=FeatureConfig)
    model: ModelConfig = field(default_factory=ModelConfig)
    training: TrainingConfig = field(default_factory=TrainingConfig)
    output_dir: str = "./experiments"
    log_level: str = "INFO"

    def to_dict(self) -> Dict[str, Any]:
        """Convert config to dictionary."""
        return asdict(self)

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> 'ExperimentConfig':
        """Create config from dictionary."""
        # Handle nested configs
        if 'data' in data and isinstance(data['data'], dict):
            data['data'] = DataConfig(**data['data'])
        if 'features' in data and isinstance(data['features'], dict):
            data['features'] = FeatureConfig(**data['features'])
        if 'model' in data and isinstance(data['model'], dict):
            data['model'] = ModelConfig(**data['model'])
        if 'training' in data and isinstance(data['training'], dict):
            data['training'] = TrainingConfig(**data['training'])
        return cls(**data)

    def validate(self) -> List[str]:
        """Validate configuration and return list of errors."""
        errors = []

        # Validate dates
        try:
            from datetime import datetime
            datetime.strptime(self.data.start_date, "%Y-%m-%d")
            datetime.strptime(self.data.end_date, "%Y-%m-%d")
        except ValueError:
            errors.append("Invalid date format. Use YYYY-MM-DD")

        # Validate learning rate
        if self.training.learning_rate <= 0:
            errors.append("Learning rate must be positive")

        # Validate batch size
        if self.training.batch_size <= 0:
            errors.append("Batch size must be positive")

        # Validate model features
        if self.model.in_features <= 0:
            errors.append("Model input features must be positive")

        return errors


class Config:
    """Main configuration class."""

    @staticmethod
    def load_yaml(filepath: str) -> ExperimentConfig:
        """Load configuration from YAML file."""
        if not HAS_YAML:
            raise ImportError("PyYAML not installed. Install with: pip install pyyaml")

        with open(filepath, 'r') as f:
            data = yaml.safe_load(f)

        return ExperimentConfig.from_dict(data)

    @staticmethod
    def load_json(filepath: str) -> ExperimentConfig:
        """Load configuration from JSON file."""
        with open(filepath, 'r') as f:
            data = json.load(f)

        return ExperimentConfig.from_dict(data)

    @staticmethod
    def save_yaml(config: ExperimentConfig, filepath: str):
        """Save configuration to YAML file."""
        if not HAS_YAML:
            raise ImportError("PyYAML not installed. Install with: pip install pyyaml")

        os.makedirs(os.path.dirname(filepath) or '.', exist_ok=True)

        with open(filepath, 'w') as f:
            yaml.dump(config.to_dict(), f, default_flow_style=False, sort_keys=False)

    @staticmethod
    def save_json(config: ExperimentConfig, filepath: str):
        """Save configuration to JSON file."""
        os.makedirs(os.path.dirname(filepath) or '.', exist_ok=True)

        with open(filepath, 'w') as f:
            json.dump(config.to_dict(), f, indent=2)


def load_config(filepath: str) -> ExperimentConfig:
    """Load configuration from file (auto-detect YAML/JSON)."""
    ext = os.path.splitext(filepath)[1].lower()

    if ext in ['.yaml', '.yml']:
        return Config.load_yaml(filepath)
    elif ext == '.json':
        return Config.load_json(filepath)
    else:
        raise ValueError(f"Unsupported config file format: {ext}. Use .yaml, .yml, or .json")


def save_config(config: ExperimentConfig, filepath: str):
    """Save configuration to file (auto-detect YAML/JSON)."""
    ext = os.path.splitext(filepath)[1].lower()

    if ext in ['.yaml', '.yml']:
        Config.save_yaml(config, filepath)
    elif ext == '.json':
        Config.save_json(config, filepath)
    else:
        raise ValueError(f"Unsupported config file format: {ext}. Use .yaml, .yml, or .json")


def create_argparser() -> argparse.ArgumentParser:
    """Create argument parser with common config options."""
    parser = argparse.ArgumentParser(description='QuantML Experiment Runner')

    parser.add_argument('--config', type=str, help='Path to config file (YAML/JSON)')
    parser.add_argument('--instrument', type=str, help='Trading instrument (ES, MES, NQ, MNQ)')
    parser.add_argument('--start-date', type=str, help='Start date (YYYY-MM-DD)')
    parser.add_argument('--end-date', type=str, help='End date (YYYY-MM-DD)')
    parser.add_argument('--model-type', type=str, help='Model type (Linear, SimpleRNN, TCN)')
    parser.add_argument('--learning-rate', type=float, help='Learning rate')
    parser.add_argument('--epochs', type=int, help='Number of epochs')
    parser.add_argument('--batch-size', type=int, help='Batch size')
    parser.add_argument('--random-seed', type=int, help='Random seed')
    parser.add_argument('--output-dir', type=str, help='Output directory')

    return parser


def merge_config_with_args(config: ExperimentConfig, args: argparse.Namespace) -> ExperimentConfig:
    """Merge command-line arguments into config."""
    if args.instrument:
        config.data.instrument = args.instrument
    if args.start_date:
        config.data.start_date = args.start_date
    if args.end_date:
        config.data.end_date = args.end_date
    if args.model_type:
        config.model.model_type = args.model_type
    # Numeric flags are compared against None so that legitimate
    # zero values (e.g. --random-seed 0) are not silently dropped
    if args.learning_rate is not None:
        config.training.learning_rate = args.learning_rate
    if args.epochs is not None:
        config.training.epochs = args.epochs
    if args.batch_size is not None:
        config.training.batch_size = args.batch_size
    if args.random_seed is not None:
        config.random_seed = args.random_seed
    if args.output_dir:
        config.output_dir = args.output_dir

    return config
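The pieces above compose into a simple flow: start from defaults or a file, overlay CLI flags, validate, then persist the resolved config. A minimal usage sketch (not shipped in the wheel; the experiment name and output path are invented for illustration):

from quantml.config.config import (
    ExperimentConfig,
    create_argparser,
    merge_config_with_args,
    save_config,
)

parser = create_argparser()
args = parser.parse_args([
    "--instrument", "MES",
    "--learning-rate", "0.0005",
    "--epochs", "50",
])

# Defaults first, CLI overrides on top
config = ExperimentConfig(name="mes_baseline")  # hypothetical experiment name
config = merge_config_with_args(config, args)

errors = config.validate()
if errors:
    raise ValueError(f"Invalid config: {errors}")

# The extension picks the serializer; .json avoids the optional PyYAML dependency
save_config(config, "./experiments/mes_baseline.json")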
quantml/data/__init__.py
ADDED
@@ -0,0 +1,33 @@
"""
QuantML Data Management

This module provides data loading, validation, and caching utilities.
"""

from quantml.data.validators import (
    validate_price_data,
    validate_timestamps,
    check_duplicates,
    check_missing_values,
    generate_data_quality_report,
    DataQualityReport
)

from quantml.data.loaders import (
    load_csv_data,
    load_dataframe,
    DataLoader
)

__all__ = [
    'validate_price_data',
    'validate_timestamps',
    'check_duplicates',
    'check_missing_values',
    'generate_data_quality_report',
    'DataQualityReport',
    'load_csv_data',
    'load_dataframe',
    'DataLoader'
]
quantml/data/cache.py
ADDED
@@ -0,0 +1,149 @@
"""
Cache management utilities.
"""

from typing import Optional, Callable, Any
from functools import wraps
import os

from quantml.data.feature_store import FeatureStore


def cached_features(
    cache_dir: str = "./cache/features",
    use_cache: bool = True
):
    """
    Decorator to cache feature computation results.

    Args:
        cache_dir: Cache directory
        use_cache: Whether to use cache

    Example:
        @cached_features(cache_dir="./cache")
        def compute_features(data):
            # Expensive computation
            return features
    """
    def decorator(func: Callable) -> Callable:
        store = FeatureStore(cache_dir=cache_dir)

        @wraps(func)
        def wrapper(*args, **kwargs):
            if not use_cache:
                return func(*args, **kwargs)

            # Extract metadata from kwargs if available
            instrument = kwargs.get('instrument', 'unknown')
            start_date = kwargs.get('start_date', 'unknown')
            end_date = kwargs.get('end_date', 'unknown')
            feature_config = kwargs.get('feature_config', {})

            # Generate the cache key the same way save_features does,
            # so lookups hit the entries that saves actually create
            cache_key = store._generate_cache_key(
                instrument, start_date, end_date, feature_config
            )

            # Check cache
            if store.cache_exists(cache_key):
                features, _ = store.load_features(cache_key)
                return features

            # Compute features
            features = func(*args, **kwargs)

            # Save to cache (save_features re-derives the same key
            # from the metadata above)
            store.save_features(
                features,
                instrument,
                start_date,
                end_date,
                feature_config
            )

            return features

        return wrapper
    return decorator


class CacheManager:
    """Cache manager for feature computation."""

    def __init__(self, cache_dir: str = "./cache/features"):
        """
        Initialize cache manager.

        Args:
            cache_dir: Cache directory
        """
        self.store = FeatureStore(cache_dir=cache_dir)
        self.cache_dir = cache_dir

    def get_or_compute(
        self,
        cache_key: str,
        compute_fn: Callable,
        *args,
        **kwargs
    ) -> Any:
        """
        Get from cache or compute.

        Note: save_features derives its storage key from the instrument,
        date, and feature_config metadata, so for later calls to hit the
        cache, cache_key should be produced by
        FeatureStore._generate_cache_key with the same metadata that is
        passed here in kwargs.

        Args:
            cache_key: Cache key
            compute_fn: Function to compute if not cached
            *args: Arguments for compute function
            **kwargs: Keyword arguments for compute function

        Returns:
            Cached or computed result
        """
        if self.store.cache_exists(cache_key):
            features, _ = self.store.load_features(cache_key)
            return features

        # Compute
        result = compute_fn(*args, **kwargs)

        # Save to cache if result is features
        if isinstance(result, list) and len(result) > 0:
            if isinstance(result[0], list):  # List of feature vectors
                instrument = kwargs.get('instrument', 'unknown')
                start_date = kwargs.get('start_date', 'unknown')
                end_date = kwargs.get('end_date', 'unknown')
                feature_config = kwargs.get('feature_config', {})

                self.store.save_features(
                    result,
                    instrument,
                    start_date,
                    end_date,
                    feature_config
                )

        return result

    def clear_cache(self, pattern: Optional[str] = None):
        """
        Clear cache.

        Args:
            pattern: Optional pattern to match (e.g., instrument name)
        """
        if pattern:
            self.store.invalidate_cache(instrument=pattern)
        else:
            # Clear all
            for filename in os.listdir(self.cache_dir):
                filepath = os.path.join(self.cache_dir, filename)
                if os.path.isfile(filepath):
                    os.remove(filepath)
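A usage sketch for the decorator and manager (not shipped in the wheel; the feature function, cache path, and metadata values are invented for illustration, and pandas is assumed installed). The decorator reads instrument, start_date, end_date, and feature_config out of kwargs, so passing them as keywords is what makes the cache key meaningful:

from quantml.data.cache import CacheManager, cached_features

@cached_features(cache_dir="./cache/demo")
def compute_features(prices, instrument="unknown", start_date="unknown",
                     end_date="unknown", feature_config=None):
    # Stand-in for an expensive computation: one feature vector per bar
    return [[p, p * p] for p in prices]

meta = dict(instrument="ES", start_date="2024-01-01",
            end_date="2024-01-31", feature_config={"squared": True})

feats = compute_features([1.0, 2.0, 3.0], **meta)  # computed, then cached
feats = compute_features([1.0, 2.0, 3.0], **meta)  # served from cache

# The manager wraps the same store for ad-hoc use and cleanup
manager = CacheManager(cache_dir="./cache/demo")
manager.clear_cache(pattern="ES")  # drop every cached entry for ES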
quantml/data/feature_store.py
ADDED
@@ -0,0 +1,234 @@
"""
Feature store for caching computed features.

Uses Parquet format for efficient storage and loading of large feature datasets.
"""

import os
import hashlib
import json
from typing import List, Dict, Any, Optional
from datetime import datetime

# Try to import pandas and pyarrow
try:
    import pandas as pd
    HAS_PANDAS = True
except ImportError:
    HAS_PANDAS = False
    pd = None

try:
    import pyarrow.parquet as pq
    HAS_PARQUET = True
except ImportError:
    HAS_PARQUET = False
    pq = None


class FeatureStore:
    """Feature caching system using Parquet format."""

    def __init__(
        self,
        cache_dir: str = "./cache/features",
        use_parquet: bool = True
    ):
        """
        Initialize feature store.

        Args:
            cache_dir: Directory for cached features
            use_parquet: Whether to use Parquet format (requires pyarrow)
        """
        self.cache_dir = cache_dir
        self.use_parquet = use_parquet and HAS_PARQUET

        if not self.use_parquet and not HAS_PANDAS:
            raise ImportError(
                "Either pandas or pyarrow required. "
                "Install with: pip install pandas pyarrow"
            )

        os.makedirs(cache_dir, exist_ok=True)

    def _generate_cache_key(
        self,
        instrument: str,
        start_date: str,
        end_date: str,
        feature_config: Dict[str, Any]
    ) -> str:
        """Generate cache key from configuration."""
        key_data = {
            'instrument': instrument,
            'start_date': start_date,
            'end_date': end_date,
            'features': feature_config
        }
        key_str = json.dumps(key_data, sort_keys=True)
        return hashlib.md5(key_str.encode()).hexdigest()

    def _get_cache_path(self, cache_key: str) -> str:
        """Get cache file path."""
        if self.use_parquet:
            return os.path.join(self.cache_dir, f"{cache_key}.parquet")
        else:
            return os.path.join(self.cache_dir, f"{cache_key}.csv")

    def _get_metadata_path(self, cache_key: str) -> str:
        """Get metadata file path."""
        return os.path.join(self.cache_dir, f"{cache_key}_metadata.json")

    def save_features(
        self,
        features: List[List[float]],
        instrument: str,
        start_date: str,
        end_date: str,
        feature_config: Dict[str, Any],
        metadata: Optional[Dict[str, Any]] = None
    ) -> str:
        """
        Save features to cache.

        Args:
            features: List of feature vectors
            instrument: Trading instrument
            start_date: Start date
            end_date: End date
            feature_config: Feature configuration
            metadata: Optional metadata

        Returns:
            Cache key
        """
        cache_key = self._generate_cache_key(
            instrument, start_date, end_date, feature_config
        )

        cache_path = self._get_cache_path(cache_key)
        metadata_path = self._get_metadata_path(cache_key)

        # Convert to DataFrame
        if not HAS_PANDAS:
            raise ImportError("pandas required for feature storage")

        df = pd.DataFrame(features)

        # Save features
        if self.use_parquet:
            df.to_parquet(cache_path, compression='snappy', index=False)
        else:
            df.to_csv(cache_path, index=False)

        # Save metadata
        meta = {
            'cache_key': cache_key,
            'instrument': instrument,
            'start_date': start_date,
            'end_date': end_date,
            'feature_config': feature_config,
            'num_features': len(features[0]) if features else 0,
            'num_samples': len(features),
            'created_at': datetime.now().isoformat(),
            'metadata': metadata or {}
        }

        with open(metadata_path, 'w') as f:
            json.dump(meta, f, indent=2)

        return cache_key

    def load_features(self, cache_key: str) -> tuple:
        """
        Load features from cache.

        Args:
            cache_key: Cache key

        Returns:
            Tuple of (features, metadata)
        """
        cache_path = self._get_cache_path(cache_key)
        metadata_path = self._get_metadata_path(cache_key)

        if not os.path.exists(cache_path):
            raise FileNotFoundError(f"Cache not found: {cache_key}")

        # Load features
        if not HAS_PANDAS:
            raise ImportError("pandas required for feature loading")

        if self.use_parquet:
            df = pd.read_parquet(cache_path)
        else:
            df = pd.read_csv(cache_path)

        features = df.values.tolist()

        # Load metadata
        metadata = {}
        if os.path.exists(metadata_path):
            with open(metadata_path, 'r') as f:
                metadata = json.load(f)

        return features, metadata

    def cache_exists(self, cache_key: str) -> bool:
        """Check if cache exists."""
        cache_path = self._get_cache_path(cache_key)
        return os.path.exists(cache_path)

    def invalidate_cache(
        self,
        instrument: Optional[str] = None,
        cache_key: Optional[str] = None
    ):
        """
        Invalidate cache entries.

        Args:
            instrument: Invalidate all caches for this instrument
            cache_key: Invalidate specific cache key
        """
        if cache_key:
            cache_path = self._get_cache_path(cache_key)
            metadata_path = self._get_metadata_path(cache_key)

            if os.path.exists(cache_path):
                os.remove(cache_path)
            if os.path.exists(metadata_path):
                os.remove(metadata_path)

        elif instrument:
            # Remove all caches for instrument
            for filename in os.listdir(self.cache_dir):
                if filename.endswith('_metadata.json'):
                    metadata_path = os.path.join(self.cache_dir, filename)
                    try:
                        with open(metadata_path, 'r') as f:
                            meta = json.load(f)
                        if meta.get('instrument') == instrument:
                            key = meta.get('cache_key')
                            if key:
                                self.invalidate_cache(cache_key=key)
                    except Exception:
                        pass

    def list_caches(self) -> List[Dict[str, Any]]:
        """List all cached features."""
        caches = []

        for filename in os.listdir(self.cache_dir):
            if filename.endswith('_metadata.json'):
                metadata_path = os.path.join(self.cache_dir, filename)
                try:
                    with open(metadata_path, 'r') as f:
                        meta = json.load(f)
                    caches.append(meta)
                except Exception:
                    pass

        return caches
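A round-trip sketch for the store itself (not shipped in the wheel; values, dates, and the cache path are invented for illustration; requires pandas, plus pyarrow for the Parquet path):

from quantml.data.feature_store import FeatureStore

store = FeatureStore(cache_dir="./cache/demo")

features = [[0.1, 0.2], [0.3, 0.4]]  # two samples, two features each
key = store.save_features(
    features,
    instrument="ES",
    start_date="2024-01-01",
    end_date="2024-01-31",
    feature_config={"lags": [1, 5]},
)

assert store.cache_exists(key)
loaded, meta = store.load_features(key)
print(meta["num_samples"], meta["num_features"])  # -> 2 2

for entry in store.list_caches():
    print(entry["instrument"], entry["cache_key"])

store.invalidate_cache(instrument="ES")  # removes data and metadata files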