aponyx 0.1.18__py3-none-any.whl
This diff shows the content of a publicly available package version as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in the public registry.
- aponyx/__init__.py +14 -0
- aponyx/backtest/__init__.py +31 -0
- aponyx/backtest/adapters.py +77 -0
- aponyx/backtest/config.py +84 -0
- aponyx/backtest/engine.py +560 -0
- aponyx/backtest/protocols.py +101 -0
- aponyx/backtest/registry.py +334 -0
- aponyx/backtest/strategy_catalog.json +50 -0
- aponyx/cli/__init__.py +5 -0
- aponyx/cli/commands/__init__.py +8 -0
- aponyx/cli/commands/clean.py +349 -0
- aponyx/cli/commands/list.py +302 -0
- aponyx/cli/commands/report.py +167 -0
- aponyx/cli/commands/run.py +377 -0
- aponyx/cli/main.py +125 -0
- aponyx/config/__init__.py +82 -0
- aponyx/data/__init__.py +99 -0
- aponyx/data/bloomberg_config.py +306 -0
- aponyx/data/bloomberg_instruments.json +26 -0
- aponyx/data/bloomberg_securities.json +42 -0
- aponyx/data/cache.py +294 -0
- aponyx/data/fetch.py +659 -0
- aponyx/data/fetch_registry.py +135 -0
- aponyx/data/loaders.py +205 -0
- aponyx/data/providers/__init__.py +13 -0
- aponyx/data/providers/bloomberg.py +383 -0
- aponyx/data/providers/file.py +111 -0
- aponyx/data/registry.py +500 -0
- aponyx/data/requirements.py +96 -0
- aponyx/data/sample_data.py +415 -0
- aponyx/data/schemas.py +60 -0
- aponyx/data/sources.py +171 -0
- aponyx/data/synthetic_params.json +46 -0
- aponyx/data/transforms.py +336 -0
- aponyx/data/validation.py +308 -0
- aponyx/docs/__init__.py +24 -0
- aponyx/docs/adding_data_providers.md +682 -0
- aponyx/docs/cdx_knowledge_base.md +455 -0
- aponyx/docs/cdx_overlay_strategy.md +135 -0
- aponyx/docs/cli_guide.md +607 -0
- aponyx/docs/governance_design.md +551 -0
- aponyx/docs/logging_design.md +251 -0
- aponyx/docs/performance_evaluation_design.md +265 -0
- aponyx/docs/python_guidelines.md +786 -0
- aponyx/docs/signal_registry_usage.md +369 -0
- aponyx/docs/signal_suitability_design.md +558 -0
- aponyx/docs/visualization_design.md +277 -0
- aponyx/evaluation/__init__.py +11 -0
- aponyx/evaluation/performance/__init__.py +24 -0
- aponyx/evaluation/performance/adapters.py +109 -0
- aponyx/evaluation/performance/analyzer.py +384 -0
- aponyx/evaluation/performance/config.py +320 -0
- aponyx/evaluation/performance/decomposition.py +304 -0
- aponyx/evaluation/performance/metrics.py +761 -0
- aponyx/evaluation/performance/registry.py +327 -0
- aponyx/evaluation/performance/report.py +541 -0
- aponyx/evaluation/suitability/__init__.py +67 -0
- aponyx/evaluation/suitability/config.py +143 -0
- aponyx/evaluation/suitability/evaluator.py +389 -0
- aponyx/evaluation/suitability/registry.py +328 -0
- aponyx/evaluation/suitability/report.py +398 -0
- aponyx/evaluation/suitability/scoring.py +367 -0
- aponyx/evaluation/suitability/tests.py +303 -0
- aponyx/examples/01_generate_synthetic_data.py +53 -0
- aponyx/examples/02_fetch_data_file.py +82 -0
- aponyx/examples/03_fetch_data_bloomberg.py +104 -0
- aponyx/examples/04_compute_signal.py +164 -0
- aponyx/examples/05_evaluate_suitability.py +224 -0
- aponyx/examples/06_run_backtest.py +242 -0
- aponyx/examples/07_analyze_performance.py +214 -0
- aponyx/examples/08_visualize_results.py +272 -0
- aponyx/main.py +7 -0
- aponyx/models/__init__.py +45 -0
- aponyx/models/config.py +83 -0
- aponyx/models/indicator_transformation.json +52 -0
- aponyx/models/indicators.py +292 -0
- aponyx/models/metadata.py +447 -0
- aponyx/models/orchestrator.py +213 -0
- aponyx/models/registry.py +860 -0
- aponyx/models/score_transformation.json +42 -0
- aponyx/models/signal_catalog.json +29 -0
- aponyx/models/signal_composer.py +513 -0
- aponyx/models/signal_transformation.json +29 -0
- aponyx/persistence/__init__.py +16 -0
- aponyx/persistence/json_io.py +132 -0
- aponyx/persistence/parquet_io.py +378 -0
- aponyx/py.typed +0 -0
- aponyx/reporting/__init__.py +10 -0
- aponyx/reporting/generator.py +517 -0
- aponyx/visualization/__init__.py +20 -0
- aponyx/visualization/app.py +37 -0
- aponyx/visualization/plots.py +309 -0
- aponyx/visualization/visualizer.py +242 -0
- aponyx/workflows/__init__.py +18 -0
- aponyx/workflows/concrete_steps.py +720 -0
- aponyx/workflows/config.py +122 -0
- aponyx/workflows/engine.py +279 -0
- aponyx/workflows/registry.py +116 -0
- aponyx/workflows/steps.py +180 -0
- aponyx-0.1.18.dist-info/METADATA +552 -0
- aponyx-0.1.18.dist-info/RECORD +104 -0
- aponyx-0.1.18.dist-info/WHEEL +4 -0
- aponyx-0.1.18.dist-info/entry_points.txt +2 -0
- aponyx-0.1.18.dist-info/licenses/LICENSE +21 -0
aponyx/data/sample_data.py
ADDED
@@ -0,0 +1,415 @@
"""Synthetic data generation for testing and demonstrations.

Generates realistic market data for CDX, VIX, and ETF instruments
with configurable volatility, correlation, and trend parameters.
"""

import hashlib
import logging
from pathlib import Path

import numpy as np
import pandas as pd

from ..persistence.parquet_io import save_parquet

logger = logging.getLogger(__name__)


def generate_cdx_sample(
    start_date: str = "2024-01-01",
    periods: int = 252,
    index_name: str = "CDX_IG",
    tenor: str = "5Y",
    base_spread: float = 100.0,
    volatility: float = 5.0,
    seed: int = 42,
) -> pd.DataFrame:
    """
    Generate synthetic CDX spread data.

    Parameters
    ----------
    start_date : str, default "2024-01-01"
        Start date for time series.
    periods : int, default 252
        Number of daily observations (trading days).
    index_name : str, default "CDX_IG"
        Index identifier (CDX_IG, CDX_HY, CDX_XO).
    tenor : str, default "5Y"
        Tenor string (5Y, 10Y).
    base_spread : float, default 100.0
        Starting spread level in basis points.
    volatility : float, default 5.0
        Daily spread volatility in basis points.
    seed : int, default 42
        Random seed for reproducibility.

    Returns
    -------
    pd.DataFrame
        CDX data with columns: date, spread, index, tenor, series

    Notes
    -----
    - Uses geometric Brownian motion with mean reversion
    - Spreads constrained to positive values
    - Realistic credit market dynamics
    """
    logger.info(
        "Generating CDX sample: index=%s, tenor=%s, periods=%d",
        index_name,
        tenor,
        periods,
    )

    rng = np.random.default_rng(seed)
    dates = pd.bdate_range(start=start_date, periods=periods)

    # Mean-reverting spread dynamics
    spread = [base_spread]
    mean_reversion_speed = 0.1
    mean_level = base_spread

    for _ in range(periods - 1):
        drift = mean_reversion_speed * (mean_level - spread[-1])
        shock = rng.normal(0, volatility)
        new_spread = max(1.0, spread[-1] + drift + shock)
        spread.append(new_spread)

    df = pd.DataFrame(
        {
            "date": dates,
            "spread": spread,
            "index": [f"{index_name}_{tenor}"] * periods,
            "tenor": [tenor] * periods,
            "series": [42] * periods,
        }
    )

    logger.debug("Generated CDX sample: mean_spread=%.2f", df["spread"].mean())
    return df


def generate_vix_sample(
    start_date: str = "2024-01-01",
    periods: int = 252,
    base_vix: float = 15.0,
    volatility: float = 2.0,
    seed: int = 42,
) -> pd.DataFrame:
    """
    Generate synthetic VIX volatility data.

    Parameters
    ----------
    start_date : str, default "2024-01-01"
        Start date for time series.
    periods : int, default 252
        Number of daily observations.
    base_vix : float, default 15.0
        Starting VIX level.
    volatility : float, default 2.0
        Volatility of volatility (vol of vol).
    seed : int, default 42
        Random seed for reproducibility.

    Returns
    -------
    pd.DataFrame
        VIX data with columns: date, level

    Notes
    -----
    - Uses mean-reverting process with occasional spikes
    - VIX constrained to positive values
    """
    logger.info("Generating VIX sample: periods=%d", periods)

    rng = np.random.default_rng(seed)
    dates = pd.bdate_range(start=start_date, periods=periods)

    # Mean-reverting VIX with spike potential
    vix_close = [base_vix]
    mean_reversion_speed = 0.15
    mean_level = base_vix

    for i in range(periods - 1):
        # Occasional spike (5% probability)
        if rng.random() < 0.05:
            spike = rng.uniform(5, 15)
        else:
            spike = 0

        drift = mean_reversion_speed * (mean_level - vix_close[-1])
        shock = rng.normal(0, volatility)
        new_vix = max(8.0, vix_close[-1] + drift + shock + spike)
        vix_close.append(new_vix)

    df = pd.DataFrame(
        {
            "date": dates,
            "level": vix_close,
        }
    )

    logger.debug("Generated VIX sample: mean=%.2f", df["level"].mean())
    return df


def generate_etf_sample(
    start_date: str = "2024-01-01",
    periods: int = 252,
    ticker: str = "HYG",
    base_price: float = 80.0,
    volatility: float = 0.5,
    seed: int = 42,
) -> pd.DataFrame:
    """
    Generate synthetic credit ETF price data.

    Parameters
    ----------
    start_date : str, default "2024-01-01"
        Start date for time series.
    periods : int, default 252
        Number of daily observations.
    ticker : str, default "HYG"
        ETF ticker symbol (HYG, LQD).
    base_price : float, default 80.0
        Starting price.
    volatility : float, default 0.5
        Daily price volatility.
    seed : int, default 42
        Random seed for reproducibility.

    Returns
    -------
    pd.DataFrame
        ETF data with columns: date, spread, ticker

    Notes
    -----
    - Uses geometric Brownian motion
    - Prices constrained to positive values
    """
    logger.info("Generating ETF sample: ticker=%s, periods=%d", ticker, periods)

    rng = np.random.default_rng(seed)
    dates = pd.bdate_range(start=start_date, periods=periods)

    # Geometric Brownian motion for prices
    returns = rng.normal(0.0001, volatility / base_price, periods)
    price = base_price * np.exp(np.cumsum(returns))

    df = pd.DataFrame(
        {
            "date": dates,
            "spread": price,
            "ticker": [ticker] * periods,
        }
    )

    logger.debug("Generated ETF sample: mean_price=%.2f", df["spread"].mean())
    return df


def generate_for_fetch_interface(
    output_dir: str | Path,
    start_date: str = "2020-01-01",
    end_date: str = "2025-01-01",
    seed: int = 42,
) -> dict[str, Path]:
    """
    Generate synthetic data for all securities in bloomberg_securities.json.

    Creates individual files per security that work with fetch_cdx, fetch_vix,
    and fetch_etf functions. Uses bloomberg_instruments.json for schema mapping.

    Parameters
    ----------
    output_dir : str or Path
        Base directory for raw files (e.g., "data/raw/file").
    start_date : str, default "2020-01-01"
        Start date for time series.
    end_date : str, default "2025-01-01"
        End date for time series.
    seed : int, default 42
        Random seed for reproducibility.

    Returns
    -------
    dict[str, Path]
        Mapping of security identifier to file path.

    Notes
    -----
    Automatically generates data for all securities defined in bloomberg_securities.json:
    - CDX indices: spread column with realistic credit dynamics
    - VIX: level column with volatility spikes
    - ETFs: spread column representing option-adjusted spreads
    """
    import json

    logger.info(
        "Generating synthetic data for fetch interface: %s to %s",
        start_date,
        end_date,
    )

    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    # Load security and instrument configurations
    config_dir = Path(__file__).parent
    with open(config_dir / "bloomberg_securities.json") as f:
        securities = json.load(f)

    # Calculate periods from date range
    start = pd.Timestamp(start_date)
    end = pd.Timestamp(end_date)
    dates = pd.bdate_range(start=start, end=end)
    periods = len(dates)

    file_paths = {}
    seed_offset = 0

    # Load parameters from config file
    config_path = Path(__file__).parent / "synthetic_params.json"
    with open(config_path, encoding="utf-8") as f:
        default_params = json.load(f)

    for security_id, security_config in securities.items():
        instrument_type = security_config["instrument_type"]

        logger.info("Generating %s data: %s", instrument_type, security_id)

        if instrument_type == "cdx":
            # Parse tenor from security_id or description
            tenor = "5Y" if "5y" in security_id.lower() else "10Y"
            index_name = security_id.upper().replace("_", " ")

            params = default_params["cdx"].get(
                security_id, default_params["cdx"]["default"]
            )

            df = generate_cdx_sample(
                start_date=start_date,
                periods=periods,
                index_name=index_name,
                tenor=tenor,
                base_spread=params["base_spread"],
                volatility=params["volatility"],
                seed=seed + seed_offset,
            )

            # Transform to CDX schema
            df = df.set_index("date")
            df = df[["spread"]].copy()
            df["security"] = security_id

            # Generate hash for raw storage naming (consistent with save_to_raw)
            safe_instrument = security_id.replace(".", "_").replace("/", "_")
            hash_input = (
                f"synthetic|{security_id}|{df.index.min()}|{df.index.max()}|{len(df)}"
            )
            file_hash = hashlib.sha256(hash_input.encode()).hexdigest()[:12]
            file_path = output_path / f"{safe_instrument}_{file_hash}.parquet"
            metadata_path = output_path / f"{safe_instrument}_{file_hash}.json"

        elif instrument_type == "vix":
            params = default_params["vix"]

            df = generate_vix_sample(
                start_date=start_date,
                periods=periods,
                base_vix=params["base_vix"],
                volatility=params["volatility"],
                seed=seed + seed_offset,
            )

            # Transform to VIX schema
            df = df.set_index("date")
            df = df[["level"]].copy()

            # Generate hash for raw storage naming (consistent with save_to_raw)
            safe_instrument = security_id.replace(".", "_").replace("/", "_")
            hash_input = (
                f"synthetic|{security_id}|{df.index.min()}|{df.index.max()}|{len(df)}"
            )
            file_hash = hashlib.sha256(hash_input.encode()).hexdigest()[:12]
            file_path = output_path / f"{safe_instrument}_{file_hash}.parquet"
            metadata_path = output_path / f"{safe_instrument}_{file_hash}.json"

        elif instrument_type == "etf":
            params = default_params["etf"].get(
                security_id, default_params["etf"]["default"]
            )

            df = generate_etf_sample(
                start_date=start_date,
                periods=periods,
                ticker=security_id.upper(),
                base_price=params["base_price"],
                volatility=params["volatility"],
                seed=seed + seed_offset,
            )

            # Transform to ETF schema
            df = df.set_index("date")
            df = df[["spread"]].copy()
            df["security"] = security_id

            # Generate hash for raw storage naming (consistent with save_to_raw)
            safe_instrument = security_id.replace(".", "_").replace("/", "_")
            hash_input = (
                f"synthetic|{security_id}|{df.index.min()}|{df.index.max()}|{len(df)}"
            )
            file_hash = hashlib.sha256(hash_input.encode()).hexdigest()[:12]
            file_path = output_path / f"{safe_instrument}_{file_hash}.parquet"
            metadata_path = output_path / f"{safe_instrument}_{file_hash}.json"

        else:
            logger.warning("Unknown instrument type: %s", instrument_type)
            seed_offset += 1
            continue

        # Save data and metadata
        save_parquet(df, file_path)

        metadata = {
            "provider": "synthetic",
            "instrument": instrument_type,
            "security": security_id,
            "stored_at": pd.Timestamp.now().isoformat(),
            "date_range": {
                "start": str(df.index.min()),
                "end": str(df.index.max()),
            },
            "row_count": len(df),
            "columns": list(df.columns),
            "hash": file_hash,
            "generation_params": params,
        }
        from ..persistence.json_io import save_json

        save_json(metadata, metadata_path)

        file_paths[security_id] = file_path
        logger.info("Saved %s to %s (%d rows)", security_id, file_path, len(df))

        seed_offset += 1

    # Generate registry.json mapping security_id to filename
    registry = {
        security_id: Path(file_path).name
        for security_id, file_path in file_paths.items()
    }
    registry_path = output_path / "registry.json"
    save_json(registry, registry_path)
    logger.info(
        "Saved security registry: %s (%d securities)", registry_path, len(registry)
    )

    logger.info("Synthetic data generation complete: %d files", len(file_paths))
    return file_paths
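
Both generators are deterministic for a fixed seed, so a short driver is enough to exercise them end to end. A minimal usage sketch follows; the output directory and the preview printing are illustrative choices rather than package defaults, and the packaged example aponyx/examples/01_generate_synthetic_data.py presumably covers the same flow.

from aponyx.data.sample_data import generate_cdx_sample, generate_for_fetch_interface

# Preview a single mean-reverting spread series: 252 business days from 2024-01-01.
cdx = generate_cdx_sample(index_name="CDX_IG", tenor="5Y", base_spread=60.0, seed=7)
print(cdx.head())

# Materialise the full fetch-interface layout: one parquet file plus a metadata
# JSON per configured security, and a registry.json mapping security_id -> filename.
paths = generate_for_fetch_interface(
    output_dir="data/raw/file",  # illustrative path, not a package requirement
    start_date="2022-01-01",
    end_date="2023-01-01",
    seed=7,
)
print(sorted(paths))
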
aponyx/data/schemas.py
ADDED
@@ -0,0 +1,60 @@
"""
Data schemas and validation rules for market data.

Defines expected column names, types, and constraints for each data source.
"""

from dataclasses import dataclass
from typing import Any


@dataclass(frozen=True)
class CDXSchema:
    """Schema for CDX index data."""

    date_col: str = "date"
    spread_col: str = "spread"
    security_col: str = "security"  # e.g., "cdx_ig_5y", "cdx_hy_5y"

    required_cols: tuple[str, ...] = ("date", "spread")

    # Validation constraints
    min_spread: float = 0.0  # Spreads in basis points
    max_spread: float = 10000.0  # 100% spread cap


@dataclass(frozen=True)
class VIXSchema:
    """Schema for VIX volatility index data."""

    date_col: str = "date"
    level_col: str = "level"

    required_cols: tuple[str, ...] = ("date", "level")

    # Validation constraints
    min_vix: float = 0.0
    max_vix: float = 200.0  # Extreme stress cap


@dataclass(frozen=True)
class ETFSchema:
    """Schema for credit ETF data (HYG, LQD)."""

    date_col: str = "date"
    spread_col: str = "spread"
    security_col: str = "security"  # e.g., "hyg", "lqd"

    required_cols: tuple[str, ...] = ("date", "spread")

    # Validation constraints
    min_price: float = 0.0
    max_price: float = 10000.0  # Sanity check


# Schema registry for runtime lookup
SCHEMAS: dict[str, Any] = {
    "cdx": CDXSchema(),
    "vix": VIXSchema(),
    "etf": ETFSchema(),
}
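
The schema objects are frozen attribute bags, so downstream code can look one up in SCHEMAS by instrument type and check a frame against its declared columns and bounds. A minimal sketch of that lookup follows; the inline checks are illustrative only, and the package's own validation presumably lives in aponyx/data/validation.py, whose internals are not shown here.

import pandas as pd

from aponyx.data.schemas import SCHEMAS

df = pd.DataFrame(
    {
        "date": pd.bdate_range("2024-01-01", periods=3),
        "spread": [60.0, 61.2, 59.8],
    }
)

schema = SCHEMAS["cdx"]

# Column-presence check against the declared required columns.
missing = [col for col in schema.required_cols if col not in df.columns]
if missing:
    raise ValueError(f"Missing required columns: {missing}")

# Range check using the declared spread bounds (basis points).
outside = df[(df["spread"] < schema.min_spread) | (df["spread"] > schema.max_spread)]
print(f"{len(outside)} rows outside [{schema.min_spread}, {schema.max_spread}] bps")
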
aponyx/data/sources.py
ADDED
@@ -0,0 +1,171 @@
"""
Data source configuration for pluggable data providers.

Defines source types (file, Bloomberg, API) and factory for provider resolution.
"""

import logging
from dataclasses import dataclass
from pathlib import Path
from typing import Protocol, Any

import pandas as pd

logger = logging.getLogger(__name__)


@dataclass(frozen=True)
class FileSource:
    """
    File-based data source with security-to-file mapping.

    Attributes
    ----------
    base_dir : Path
        Base directory containing data files.
    registry_path : Path or None
        Path to registry JSON file. If None, defaults to {base_dir}/registry.json.
    security_mapping : dict[str, str]
        Mapping from security ID to filename (auto-loaded from registry).
    """

    base_dir: Path
    registry_path: Path | None = None
    security_mapping: dict[str, str] | None = None

    def __post_init__(self) -> None:
        """Load security mapping from registry file."""
        import json

        # Convert base_dir to Path if string
        if isinstance(self.base_dir, str):
            object.__setattr__(self, "base_dir", Path(self.base_dir))

        # Determine registry path
        if self.registry_path is None:
            registry_path = self.base_dir / "registry.json"
        else:
            registry_path = (
                self.registry_path
                if isinstance(self.registry_path, Path)
                else Path(self.registry_path)
            )

        # Load security mapping from registry if not provided
        if self.security_mapping is None:
            if registry_path.exists():
                with open(registry_path, encoding="utf-8") as f:
                    mapping = json.load(f)
                object.__setattr__(self, "security_mapping", mapping)
                logger.debug(
                    "Loaded security mapping from %s: %d securities",
                    registry_path,
                    len(mapping),
                )
            else:
                raise FileNotFoundError(
                    f"Registry file not found: {registry_path}. "
                    "Generate synthetic data or provide explicit security_mapping."
                )


@dataclass(frozen=True)
class BloombergSource:
    """
    Bloomberg Terminal data source.

    Notes
    -----
    Requires active Bloomberg Terminal session.
    Connection is handled automatically by xbbg wrapper.
    """

    pass


@dataclass(frozen=True)
class APISource:
    """
    Generic REST API data source.

    Attributes
    ----------
    endpoint : str
        API endpoint URL.
    params : dict[str, Any]
        Additional request parameters.
    """

    endpoint: str
    params: dict[str, Any] | None = None


# Union type for all data sources
DataSource = FileSource | BloombergSource | APISource


class DataProvider(Protocol):
    """
    Protocol for data provider implementations.

    All providers must implement fetch method with standardized signature.
    """

    def fetch(
        self,
        instrument: str,
        start_date: str | None = None,
        end_date: str | None = None,
        **params: Any,
    ) -> pd.DataFrame:
        """
        Fetch data for specified instrument and date range.

        Parameters
        ----------
        instrument : str
            Instrument identifier (e.g., 'CDX.NA.IG.5Y', 'VIX', 'HYG').
        start_date : str or None
            Start date in ISO format (YYYY-MM-DD).
        end_date : str or None
            End date in ISO format (YYYY-MM-DD).
        **params : Any
            Provider-specific parameters.

        Returns
        -------
        pd.DataFrame
            Data with DatetimeIndex.
        """
        ...


def resolve_provider(source: DataSource) -> str:
    """
    Resolve data source to provider type identifier.

    Parameters
    ----------
    source : DataSource
        Data source configuration.

    Returns
    -------
    str
        Provider type: 'file', 'bloomberg', or 'api'.

    Examples
    --------
    >>> resolve_provider(FileSource("data.parquet"))
    'file'
    >>> resolve_provider(BloombergSource())
    'bloomberg'
    """
    if isinstance(source, FileSource):
        return "file"
    elif isinstance(source, BloombergSource):
        return "bloomberg"
    elif isinstance(source, APISource):
        return "api"
    else:
        raise ValueError(f"Unknown source type: {type(source)}")
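
FileSource resolves its registry eagerly in __post_init__, so constructing one against a directory produced by generate_for_fetch_interface is enough to obtain the security-to-file mapping, and resolve_provider maps any source object to its provider identifier. A minimal sketch, assuming an illustrative directory path:

from pathlib import Path

from aponyx.data.sources import BloombergSource, FileSource, resolve_provider

# Raises FileNotFoundError if {base_dir}/registry.json does not exist yet.
source = FileSource(base_dir=Path("data/raw/file"))  # illustrative path

print(resolve_provider(source))             # 'file'
print(len(source.security_mapping))         # number of registered securities
print(resolve_provider(BloombergSource()))  # 'bloomberg'
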
aponyx/data/synthetic_params.json
ADDED
@@ -0,0 +1,46 @@
{
  "cdx": {
    "cdx_ig_5y": {
      "base_spread": 60.0,
      "volatility": 5.0
    },
    "cdx_ig_10y": {
      "base_spread": 70.0,
      "volatility": 6.0
    },
    "cdx_hy_5y": {
      "base_spread": 350.0,
      "volatility": 20.0
    },
    "itrx_xover_5y": {
      "base_spread": 280.0,
      "volatility": 18.0
    },
    "itrx_eur_5y": {
      "base_spread": 55.0,
      "volatility": 4.5
    },
    "default": {
      "base_spread": 100.0,
      "volatility": 10.0
    }
  },
  "vix": {
    "base_vix": 18.0,
    "volatility": 2.5
  },
  "etf": {
    "hyg": {
      "base_price": 350.0,
      "volatility": 15.0
    },
    "lqd": {
      "base_price": 100.0,
      "volatility": 8.0
    },
    "default": {
      "base_price": 200.0,
      "volatility": 12.0
    }
  }
}
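
generate_for_fetch_interface reads this file and falls back to the per-section "default" block for any security without an explicit entry, via a plain dict.get. A sketch of the same fallback pattern follows; it reads the packaged file through importlib.resources rather than the module-relative path the function itself uses, and the "some_new_index" identifier is hypothetical.

import json
from importlib import resources

params = json.loads(
    resources.files("aponyx.data").joinpath("synthetic_params.json").read_text()
)

# Explicit entry: cdx_hy_5y starts at a 350 bps base spread.
hy = params["cdx"].get("cdx_hy_5y", params["cdx"]["default"])

# Unknown identifier (hypothetical) falls back to the 100 bps default block.
other = params["cdx"].get("some_new_index", params["cdx"]["default"])

print(hy["base_spread"], other["base_spread"])
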