aponyx-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aponyx/__init__.py +12 -0
- aponyx/backtest/__init__.py +29 -0
- aponyx/backtest/adapters.py +134 -0
- aponyx/backtest/config.py +59 -0
- aponyx/backtest/engine.py +256 -0
- aponyx/backtest/metrics.py +216 -0
- aponyx/backtest/protocols.py +101 -0
- aponyx/config/__init__.py +77 -0
- aponyx/data/__init__.py +31 -0
- aponyx/data/cache.py +242 -0
- aponyx/data/fetch.py +410 -0
- aponyx/data/providers/__init__.py +13 -0
- aponyx/data/providers/bloomberg.py +269 -0
- aponyx/data/providers/file.py +86 -0
- aponyx/data/sample_data.py +359 -0
- aponyx/data/schemas.py +65 -0
- aponyx/data/sources.py +135 -0
- aponyx/data/validation.py +231 -0
- aponyx/main.py +7 -0
- aponyx/models/__init__.py +24 -0
- aponyx/models/catalog.py +167 -0
- aponyx/models/config.py +33 -0
- aponyx/models/registry.py +200 -0
- aponyx/models/signal_catalog.json +34 -0
- aponyx/models/signals.py +221 -0
- aponyx/persistence/__init__.py +20 -0
- aponyx/persistence/json_io.py +130 -0
- aponyx/persistence/parquet_io.py +174 -0
- aponyx/persistence/registry.py +375 -0
- aponyx/py.typed +0 -0
- aponyx/visualization/__init__.py +20 -0
- aponyx/visualization/app.py +37 -0
- aponyx/visualization/plots.py +309 -0
- aponyx/visualization/visualizer.py +242 -0
- aponyx-0.1.0.dist-info/METADATA +271 -0
- aponyx-0.1.0.dist-info/RECORD +37 -0
- aponyx-0.1.0.dist-info/WHEEL +4 -0
aponyx/backtest/metrics.py
ADDED

@@ -0,0 +1,216 @@
+"""
+Performance and risk metrics for backtest analysis.
+
+Provides standard quantitative metrics for strategy evaluation.
+Metrics follow industry conventions and are compatible with common
+performance attribution frameworks.
+"""
+
+import logging
+from dataclasses import dataclass
+
+import numpy as np
+import pandas as pd
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class PerformanceMetrics:
+    """
+    Container for strategy performance statistics.
+
+    Attributes
+    ----------
+    sharpe_ratio : float
+        Annualized Sharpe ratio (assumes 252 trading days).
+    sortino_ratio : float
+        Annualized Sortino ratio (downside deviation).
+    max_drawdown : float
+        Maximum peak-to-trough decline in cumulative P&L.
+    calmar_ratio : float
+        Annualized return divided by max drawdown.
+    total_return : float
+        Total P&L over backtest period.
+    annualized_return : float
+        Total return annualized to yearly basis.
+    annualized_volatility : float
+        Annualized standard deviation of daily returns.
+    hit_rate : float
+        Proportion of profitable trades (0.0 to 1.0).
+    avg_win : float
+        Average P&L of winning trades.
+    avg_loss : float
+        Average P&L of losing trades (negative).
+    win_loss_ratio : float
+        Absolute value of avg_win / avg_loss.
+    n_trades : int
+        Total number of round-trip trades.
+    avg_holding_days : float
+        Average days per trade.
+
+    Notes
+    -----
+    All ratios use risk-free rate = 0 for simplicity.
+    Metrics are based on daily P&L, not mark-to-market equity curve.
+    """
+
+    sharpe_ratio: float
+    sortino_ratio: float
+    max_drawdown: float
+    calmar_ratio: float
+    total_return: float
+    annualized_return: float
+    annualized_volatility: float
+    hit_rate: float
+    avg_win: float
+    avg_loss: float
+    win_loss_ratio: float
+    n_trades: int
+    avg_holding_days: float
+
+
+def compute_performance_metrics(
+    pnl_df: pd.DataFrame,
+    positions_df: pd.DataFrame,
+) -> PerformanceMetrics:
+    """
+    Compute comprehensive performance metrics from backtest results.
+
+    Parameters
+    ----------
+    pnl_df : pd.DataFrame
+        Daily P&L data with 'net_pnl' and 'cumulative_pnl' columns.
+    positions_df : pd.DataFrame
+        Daily position data with 'position' and 'days_held' columns.
+
+    Returns
+    -------
+    PerformanceMetrics
+        Complete set of performance statistics.
+
+    Notes
+    -----
+    Calculations assume:
+    - 252 trading days per year for annualization
+    - No risk-free rate (excess returns = total returns)
+    - Daily P&L represents actual trading results
+
+    Sharpe and Sortino use daily P&L volatility, not equity curve volatility.
+    This better captures strategy risk for overlay strategies.
+
+    Examples
+    --------
+    >>> metrics = compute_performance_metrics(result.pnl, result.positions)
+    >>> print(f"Sharpe: {metrics.sharpe_ratio:.2f}, Max DD: ${metrics.max_drawdown:,.0f}")
+    """
+    logger.info("Computing performance metrics")
+
+    # Basic statistics
+    daily_pnl = pnl_df["net_pnl"]
+    cum_pnl = pnl_df["cumulative_pnl"]
+    n_days = len(pnl_df)
+    n_years = n_days / 252.0
+
+    # Return metrics
+    total_return = cum_pnl.iloc[-1]
+    annualized_return = total_return / n_years if n_years > 0 else 0.0
+
+    # Risk metrics
+    daily_std = daily_pnl.std()
+    annualized_vol = daily_std * np.sqrt(252)
+
+    # Sharpe ratio (using daily P&L, not equity curve)
+    if daily_std > 0:
+        sharpe_ratio = (daily_pnl.mean() / daily_std) * np.sqrt(252)
+    else:
+        sharpe_ratio = 0.0
+
+    # Sortino ratio (downside deviation)
+    downside_returns = daily_pnl[daily_pnl < 0]
+    if len(downside_returns) > 0:
+        downside_std = downside_returns.std()
+        if downside_std > 0:
+            sortino_ratio = (daily_pnl.mean() / downside_std) * np.sqrt(252)
+        else:
+            sortino_ratio = 0.0
+    else:
+        sortino_ratio = sharpe_ratio  # No downside = same as Sharpe
+
+    # Drawdown analysis
+    running_max = cum_pnl.expanding().max()
+    drawdown = cum_pnl - running_max
+    max_drawdown = drawdown.min()
+
+    # Calmar ratio
+    if max_drawdown < 0:
+        calmar_ratio = annualized_return / abs(max_drawdown)
+    else:
+        calmar_ratio = 0.0
+
+    # Trade-level statistics
+    # Identify trade entries (transitions from flat to positioned)
+    prev_position = positions_df["position"].shift(1).fillna(0)
+    position_entries = (prev_position == 0) & (positions_df["position"] != 0)
+    n_trades = position_entries.sum()
+
+    # Compute P&L per trade by grouping consecutive positions
+    # Assign a trade_id to each position period
+    position_changes = (positions_df["position"] != prev_position).astype(int)
+    trade_id = position_changes.cumsum()
+
+    # Only include periods where we have a position
+    active_trades = positions_df[positions_df["position"] != 0].copy()
+
+    if len(active_trades) > 0:
+        active_trades["trade_id"] = trade_id[positions_df["position"] != 0]
+
+        # Sum P&L per trade_id
+        trade_pnls = pnl_df.loc[active_trades.index].groupby(
+            active_trades["trade_id"]
+        )["net_pnl"].sum()
+
+        trade_pnls_array = trade_pnls.values
+        winning_trades = trade_pnls_array[trade_pnls_array > 0]
+        losing_trades = trade_pnls_array[trade_pnls_array < 0]
+
+        hit_rate = len(winning_trades) / len(trade_pnls_array) if len(trade_pnls_array) > 0 else 0.0
+        avg_win = winning_trades.mean() if len(winning_trades) > 0 else 0.0
+        avg_loss = losing_trades.mean() if len(losing_trades) > 0 else 0.0

+        if avg_loss < 0:
+            win_loss_ratio = abs(avg_win / avg_loss)
+        else:
+            win_loss_ratio = 0.0
+    else:
+        hit_rate = 0.0
+        avg_win = 0.0
+        avg_loss = 0.0
+        win_loss_ratio = 0.0
+
+    # Holding period statistics
+    holding_periods = positions_df[positions_df["position"] != 0]["days_held"]
+    avg_holding_days = holding_periods.mean() if len(holding_periods) > 0 else 0.0
+
+    logger.info(
+        "Metrics computed: sharpe=%.2f, max_dd=$%.0f, hit_rate=%.1f%%",
+        sharpe_ratio,
+        max_drawdown,
+        hit_rate * 100,
+    )
+
+    return PerformanceMetrics(
+        sharpe_ratio=sharpe_ratio,
+        sortino_ratio=sortino_ratio,
+        max_drawdown=max_drawdown,
+        calmar_ratio=calmar_ratio,
+        total_return=total_return,
+        annualized_return=annualized_return,
+        annualized_volatility=annualized_vol,
+        hit_rate=hit_rate,
+        avg_win=avg_win,
+        avg_loss=avg_loss,
+        win_loss_ratio=win_loss_ratio,
+        n_trades=int(n_trades),
+        avg_holding_days=avg_holding_days,
+    )
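For readers who want to sanity-check metrics.py in isolation, here is a minimal sketch (not part of the package) that builds synthetic frames with the documented 'net_pnl'/'cumulative_pnl' and 'position'/'days_held' columns and runs them through compute_performance_metrics. The column names come from the docstrings above; everything else is illustrative.

import numpy as np
import pandas as pd

from aponyx.backtest.metrics import compute_performance_metrics

rng = np.random.default_rng(0)
dates = pd.bdate_range("2023-01-02", periods=252)
net_pnl = pd.Series(rng.normal(100.0, 1_000.0, len(dates)), index=dates)
pnl_df = pd.DataFrame({"net_pnl": net_pnl, "cumulative_pnl": net_pnl.cumsum()})

# Alternate ten long days with ten flat days; days_held counts up within each stretch.
position = pd.Series([(i // 10) % 2 == 0 for i in range(len(dates))], index=dates).astype(int)
stretch = (position != position.shift()).cumsum()
days_held = position.groupby(stretch).cumcount() + 1
positions_df = pd.DataFrame({"position": position, "days_held": days_held.where(position != 0, 0)})

metrics = compute_performance_metrics(pnl_df, positions_df)
print(f"Sharpe {metrics.sharpe_ratio:.2f}, trades {metrics.n_trades}, hit rate {metrics.hit_rate:.0%}")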
aponyx/backtest/protocols.py
ADDED

@@ -0,0 +1,101 @@
+"""
+Protocol definitions for backtest engine extensibility.
+
+These protocols define the interface for swappable backtest components,
+allowing easy integration of external libraries (vectorbt, backtrader, etc.)
+while maintaining our domain-specific API.
+"""
+
+from typing import Protocol
+
+import pandas as pd
+
+from .config import BacktestConfig
+from .engine import BacktestResult
+
+
+class BacktestEngine(Protocol):
+    """
+    Protocol for backtest engine implementations.
+
+    This allows swapping between our simple implementation and
+    more sophisticated libraries while maintaining the same API.
+
+    Examples
+    --------
+    >>> # Our implementation
+    >>> from aponyx.backtest import run_backtest
+    >>> result = run_backtest(signal, spread, config)
+    >>>
+    >>> # Future: vectorbt wrapper
+    >>> from aponyx.backtest.adapters import VectorBTEngine
+    >>> engine = VectorBTEngine()
+    >>> result = engine.run(signal, spread, config)
+    """
+
+    def run(
+        self,
+        composite_signal: pd.Series,
+        spread: pd.Series,
+        config: BacktestConfig | None = None,
+    ) -> BacktestResult:
+        """
+        Execute backtest on signal and price data.
+
+        Parameters
+        ----------
+        composite_signal : pd.Series
+            Daily positioning scores from signal computation.
+        spread : pd.Series
+            CDX spread levels aligned to signal dates.
+        config : BacktestConfig | None
+            Backtest parameters. Uses defaults if None.
+
+        Returns
+        -------
+        BacktestResult
+            Complete backtest results including positions and P&L.
+        """
+        ...
+
+
+class PerformanceCalculator(Protocol):
+    """
+    Protocol for performance metrics calculation.
+
+    Allows swapping between our simple implementation and
+    libraries like quantstats, empyrical, pyfolio, etc.
+
+    Examples
+    --------
+    >>> # Our implementation
+    >>> from aponyx.backtest import compute_performance_metrics
+    >>> metrics = compute_performance_metrics(result.pnl, result.positions)
+    >>>
+    >>> # Future: quantstats wrapper
+    >>> from aponyx.backtest.adapters import QuantStatsCalculator
+    >>> calc = QuantStatsCalculator()
+    >>> metrics = calc.compute(result.pnl, result.positions)
+    """
+
+    def compute(
+        self,
+        pnl_df: pd.DataFrame,
+        positions_df: pd.DataFrame,
+    ) -> pd.DataFrame | dict:
+        """
+        Compute performance metrics from backtest results.
+
+        Parameters
+        ----------
+        pnl_df : pd.DataFrame
+            Daily P&L data with 'net_pnl' and 'cumulative_pnl' columns.
+        positions_df : pd.DataFrame
+            Daily position data with 'position' and 'days_held' columns.
+
+        Returns
+        -------
+        pd.DataFrame | dict
+            Performance statistics. Format may vary by implementation.
+        """
+        ...
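Because Protocol classes use structural typing, a conforming engine never needs to inherit from BacktestEngine; matching the run() signature is enough. The sketch below is hypothetical (PassthroughEngine is not part of the package) and simply delegates to the built-in run_backtest shown in the docstring example above.

import pandas as pd

from aponyx.backtest import run_backtest
from aponyx.backtest.config import BacktestConfig
from aponyx.backtest.engine import BacktestResult
from aponyx.backtest.protocols import BacktestEngine


class PassthroughEngine:
    """Hypothetical adapter: delegates to the built-in run_backtest."""

    def run(
        self,
        composite_signal: pd.Series,
        spread: pd.Series,
        config: BacktestConfig | None = None,
    ) -> BacktestResult:
        # Same signature as BacktestEngine.run, so the class conforms structurally.
        return run_backtest(composite_signal, spread, config)


engine: BacktestEngine = PassthroughEngine()  # accepted by static type checkers

Note that isinstance() checks against these protocols would additionally require the typing.runtime_checkable decorator; as written they are for static checking only.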
aponyx/config/__init__.py
ADDED

@@ -0,0 +1,77 @@
+"""
+Configuration module for paths, constants, and environment settings.
+
+Defines project-wide constants including instrument universe, data paths,
+and default parameters for the CDX overlay strategy.
+"""
+
+from pathlib import Path
+from typing import Final, Any
+
+# Project root and data directories
+# From src/aponyx/config/__init__.py -> src/aponyx -> src -> project_root
+PROJECT_ROOT: Final[Path] = Path(__file__).parent.parent.parent.parent
+DATA_DIR: Final[Path] = PROJECT_ROOT / "data"
+REGISTRY_PATH: Final[Path] = DATA_DIR / "registry.json"
+LOGS_DIR: Final[Path] = PROJECT_ROOT / "logs"
+
+# Instrument universe for CDX overlay strategy
+CDX_INSTRUMENTS: Final[dict[str, list[str]]] = {
+    "IG": ["5Y", "10Y"],  # Investment Grade
+    "HY": ["5Y"],  # High Yield
+    "XO": ["5Y"],  # Crossover
+}
+
+# ETF proxies for signal generation (not direct trading)
+ETF_TICKERS: Final[list[str]] = ["HYG", "LQD"]
+
+# Market data identifiers
+MARKET_DATA_TICKERS: Final[dict[str, str]] = {
+    "VIX": "^VIX",
+    "SPX": "^GSPC",
+}
+
+# Default signal parameters
+DEFAULT_SIGNAL_PARAMS: Final[dict[str, int | float]] = {
+    "momentum_window": 5,
+    "volatility_window": 20,
+    "z_score_window": 60,
+    "basis_threshold": 0.5,
+}
+
+# Data versioning
+DATA_VERSION: Final[str] = "0.1.0"
+
+# Cache configuration
+CACHE_ENABLED: Final[bool] = True
+CACHE_TTL_DAYS: Final[dict[str, int | None]] = {
+    "cdx": 1,  # Daily refresh for market data
+    "vix": 1,
+    "etf": 1,
+}
+
+# Default data sources (can be overridden per fetch call)
+# Set to None to require explicit source in fetch calls
+DEFAULT_DATA_SOURCES: Final[dict[str, Any]] = {
+    "cdx": None,
+    "vix": None,
+    "etf": None,
+}
+
+
+def ensure_directories() -> None:
+    """
+    Create required directories if they don't exist.
+
+    Creates data, logs, and other necessary directories for the project.
+    Safe to call multiple times.
+    """
+    DATA_DIR.mkdir(parents=True, exist_ok=True)
+    LOGS_DIR.mkdir(parents=True, exist_ok=True)
+    (DATA_DIR / "raw").mkdir(exist_ok=True)
+    (DATA_DIR / "processed").mkdir(exist_ok=True)
+    (DATA_DIR / "cache").mkdir(exist_ok=True)
+
+
+# Initialize directories on module import
+ensure_directories()
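A short usage sketch, grounded in the constants above: consumers import module attributes directly, and because ensure_directories() runs at import time, importing aponyx.config creates data/, logs/, and the cache subdirectories as a side effect.

from aponyx.config import CACHE_TTL_DAYS, CDX_INSTRUMENTS, DATA_DIR

# Enumerate the tradable universe defined above.
for index_name, tenors in CDX_INSTRUMENTS.items():
    print(f"CDX {index_name}: {', '.join(tenors)}")

# These directories already exist here because ensure_directories() ran on import.
print((DATA_DIR / "cache").exists())
print(CACHE_TTL_DAYS["cdx"])  # 1 -> refresh market data daily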
aponyx/data/__init__.py
ADDED

@@ -0,0 +1,31 @@
+"""
+Data layer for systematic macro credit strategy.
+
+This module handles data fetching, cleaning, and transformation for:
+- CDX indices (IG, HY, XO) across tenors
+- VIX equity volatility index
+- Credit ETFs (HYG, LQD) used for signal generation
+
+All fetch functions produce standardized DataFrames with DatetimeIndex and validated schemas.
+Supports multiple data providers: local files, Bloomberg Terminal, APIs.
+"""
+
+from .fetch import fetch_cdx, fetch_vix, fetch_etf
+from .sources import FileSource, BloombergSource, APISource, DataSource
+from .validation import validate_cdx_schema, validate_vix_schema, validate_etf_schema
+
+__all__ = [
+    # Fetch functions
+    "fetch_cdx",
+    "fetch_vix",
+    "fetch_etf",
+    # Data sources
+    "FileSource",
+    "BloombergSource",
+    "APISource",
+    "DataSource",
+    # Validation
+    "validate_cdx_schema",
+    "validate_vix_schema",
+    "validate_etf_schema",
+]
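The re-exports give the data layer a flat public surface. The sketch below shows only the imports, grounded in the __all__ list above; fetch.py, sources.py, and validation.py are outside this excerpt, so their call signatures are not assumed here.

# Import the public data-layer API from a single namespace.
from aponyx.data import (
    APISource,
    BloombergSource,
    DataSource,
    FileSource,
    fetch_cdx,
    fetch_etf,
    fetch_vix,
    validate_cdx_schema,
    validate_etf_schema,
    validate_vix_schema,
)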
aponyx/data/cache.py
ADDED

@@ -0,0 +1,242 @@
+"""
+Transparent caching layer for fetched data.
+
+Caches API/provider responses to local Parquet files with staleness tracking.
+"""
+
+import hashlib
+import logging
+from datetime import datetime, timedelta
+from pathlib import Path
+from typing import Any
+
+import pandas as pd
+
+from ..persistence.parquet_io import save_parquet, load_parquet
+from ..persistence.registry import DataRegistry
+from .sources import DataSource, resolve_provider
+
+logger = logging.getLogger(__name__)
+
+
+def _generate_cache_key(
+    source: DataSource,
+    instrument: str,
+    start_date: str | None,
+    end_date: str | None,
+    **params: Any,
+) -> str:
+    """
+    Generate unique cache key from fetch parameters.
+
+    Parameters
+    ----------
+    source : DataSource
+        Data source configuration.
+    instrument : str
+        Instrument identifier.
+    start_date : str or None
+        Start date.
+    end_date : str or None
+        End date.
+    **params : Any
+        Additional parameters.
+
+    Returns
+    -------
+    str
+        Hash-based cache key.
+    """
+    # Create stable string representation
+    key_parts = [
+        resolve_provider(source),
+        instrument,
+        start_date or "none",
+        end_date or "none",
+        str(sorted(params.items())),
+    ]
+    key_string = "|".join(key_parts)
+
+    # Generate hash
+    hash_obj = hashlib.sha256(key_string.encode())
+    return hash_obj.hexdigest()[:16]
+
+
+def get_cache_path(
+    cache_dir: Path,
+    provider: str,
+    instrument: str,
+    cache_key: str,
+) -> Path:
+    """
+    Generate file path for cached data.
+
+    Parameters
+    ----------
+    cache_dir : Path
+        Base cache directory.
+    provider : str
+        Provider type (file, bloomberg, api).
+    instrument : str
+        Instrument identifier.
+    cache_key : str
+        Unique cache key.
+
+    Returns
+    -------
+    Path
+        Path to cache file.
+    """
+    provider_dir = cache_dir / provider
+    provider_dir.mkdir(parents=True, exist_ok=True)
+
+    # Sanitize instrument name for filename
+    safe_instrument = instrument.replace(".", "_").replace("/", "_")
+    filename = f"{safe_instrument}_{cache_key}.parquet"
+
+    return provider_dir / filename
+
+
+def is_cache_stale(
+    cache_path: Path,
+    ttl_days: int | None = None,
+) -> bool:
+    """
+    Check if cached data is stale based on TTL.
+
+    Parameters
+    ----------
+    cache_path : Path
+        Path to cached file.
+    ttl_days : int or None
+        Time-to-live in days. None means cache never expires.
+
+    Returns
+    -------
+    bool
+        True if cache is stale or doesn't exist.
+    """
+    if not cache_path.exists():
+        return True
+
+    if ttl_days is None:
+        return False
+
+    # Check file modification time
+    mtime = datetime.fromtimestamp(cache_path.stat().st_mtime)
+    age = datetime.now() - mtime
+
+    is_stale = age > timedelta(days=ttl_days)
+
+    if is_stale:
+        logger.debug("Cache stale: age=%s, ttl=%d days", age, ttl_days)
+
+    return is_stale
+
+
+def get_cached_data(
+    source: DataSource,
+    instrument: str,
+    cache_dir: Path,
+    start_date: str | None = None,
+    end_date: str | None = None,
+    ttl_days: int | None = None,
+    **params: Any,
+) -> pd.DataFrame | None:
+    """
+    Retrieve data from cache if available and fresh.
+
+    Parameters
+    ----------
+    source : DataSource
+        Data source configuration.
+    instrument : str
+        Instrument identifier.
+    cache_dir : Path
+        Cache directory.
+    start_date : str or None
+        Start date filter.
+    end_date : str or None
+        End date filter.
+    ttl_days : int or None
+        Cache TTL in days.
+    **params : Any
+        Additional fetch parameters.
+
+    Returns
+    -------
+    pd.DataFrame or None
+        Cached data if available and fresh, None otherwise.
+    """
+    provider = resolve_provider(source)
+    cache_key = _generate_cache_key(source, instrument, start_date, end_date, **params)
+    cache_path = get_cache_path(cache_dir, provider, instrument, cache_key)
+
+    if is_cache_stale(cache_path, ttl_days):
+        logger.debug("Cache miss or stale: %s", cache_path.name)
+        return None
+
+    logger.info("Cache hit: %s", cache_path.name)
+    return load_parquet(cache_path)
+
+
+def save_to_cache(
+    df: pd.DataFrame,
+    source: DataSource,
+    instrument: str,
+    cache_dir: Path,
+    registry: DataRegistry | None = None,
+    start_date: str | None = None,
+    end_date: str | None = None,
+    **params: Any,
+) -> Path:
+    """
+    Save fetched data to cache.
+
+    Parameters
+    ----------
+    df : pd.DataFrame
+        Data to cache.
+    source : DataSource
+        Data source configuration.
+    instrument : str
+        Instrument identifier.
+    cache_dir : Path
+        Cache directory.
+    registry : DataRegistry or None
+        Optional registry to register cached dataset.
+    start_date : str or None
+        Start date (for cache key).
+    end_date : str or None
+        End date (for cache key).
+    **params : Any
+        Additional parameters (for cache key).
+
+    Returns
+    -------
+    Path
+        Path to cached file.
+    """
+    provider = resolve_provider(source)
+    cache_key = _generate_cache_key(source, instrument, start_date, end_date, **params)
+    cache_path = get_cache_path(cache_dir, provider, instrument, cache_key)
+
+    # Save to Parquet
+    save_parquet(df, cache_path)
+    logger.info("Cached data: path=%s, rows=%d", cache_path, len(df))
+
+    # Register in catalog if provided
+    if registry is not None:
+        registry.register_dataset(
+            name=f"cache_{instrument}_{cache_key}",
+            file_path=cache_path,
+            instrument=instrument,
+            metadata={
+                "provider": provider,
+                "cached_at": datetime.now().isoformat(),
+                "cache_key": cache_key,
+                "params": params,
+            },
+        )
+
+    return cache_path
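To close, a hedged round-trip sketch for cache.py. The get_cached_data and save_to_cache calls match the signatures above, but FileSource's constructor lives in sources.py (not shown in this diff), so the FileSource(path=...) call is an assumption.

from pathlib import Path

import pandas as pd

from aponyx.config import CACHE_TTL_DAYS, DATA_DIR
from aponyx.data.cache import get_cached_data, save_to_cache
from aponyx.data.sources import FileSource

cache_dir = DATA_DIR / "cache"
source = FileSource(path=Path("data/raw/cdx_ig_5y.parquet"))  # assumed constructor, see sources.py

# Try the cache first; a stale or missing entry returns None.
df = get_cached_data(source, "CDX_IG_5Y", cache_dir, ttl_days=CACHE_TTL_DAYS["cdx"])
if df is None:
    # Stand-in for a real provider fetch.
    df = pd.DataFrame({"spread": [72.5]}, index=pd.to_datetime(["2024-01-02"]))
    save_to_cache(df, source, "CDX_IG_5Y", cache_dir)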