aponyx 0.1.18__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aponyx/__init__.py +14 -0
- aponyx/backtest/__init__.py +31 -0
- aponyx/backtest/adapters.py +77 -0
- aponyx/backtest/config.py +84 -0
- aponyx/backtest/engine.py +560 -0
- aponyx/backtest/protocols.py +101 -0
- aponyx/backtest/registry.py +334 -0
- aponyx/backtest/strategy_catalog.json +50 -0
- aponyx/cli/__init__.py +5 -0
- aponyx/cli/commands/__init__.py +8 -0
- aponyx/cli/commands/clean.py +349 -0
- aponyx/cli/commands/list.py +302 -0
- aponyx/cli/commands/report.py +167 -0
- aponyx/cli/commands/run.py +377 -0
- aponyx/cli/main.py +125 -0
- aponyx/config/__init__.py +82 -0
- aponyx/data/__init__.py +99 -0
- aponyx/data/bloomberg_config.py +306 -0
- aponyx/data/bloomberg_instruments.json +26 -0
- aponyx/data/bloomberg_securities.json +42 -0
- aponyx/data/cache.py +294 -0
- aponyx/data/fetch.py +659 -0
- aponyx/data/fetch_registry.py +135 -0
- aponyx/data/loaders.py +205 -0
- aponyx/data/providers/__init__.py +13 -0
- aponyx/data/providers/bloomberg.py +383 -0
- aponyx/data/providers/file.py +111 -0
- aponyx/data/registry.py +500 -0
- aponyx/data/requirements.py +96 -0
- aponyx/data/sample_data.py +415 -0
- aponyx/data/schemas.py +60 -0
- aponyx/data/sources.py +171 -0
- aponyx/data/synthetic_params.json +46 -0
- aponyx/data/transforms.py +336 -0
- aponyx/data/validation.py +308 -0
- aponyx/docs/__init__.py +24 -0
- aponyx/docs/adding_data_providers.md +682 -0
- aponyx/docs/cdx_knowledge_base.md +455 -0
- aponyx/docs/cdx_overlay_strategy.md +135 -0
- aponyx/docs/cli_guide.md +607 -0
- aponyx/docs/governance_design.md +551 -0
- aponyx/docs/logging_design.md +251 -0
- aponyx/docs/performance_evaluation_design.md +265 -0
- aponyx/docs/python_guidelines.md +786 -0
- aponyx/docs/signal_registry_usage.md +369 -0
- aponyx/docs/signal_suitability_design.md +558 -0
- aponyx/docs/visualization_design.md +277 -0
- aponyx/evaluation/__init__.py +11 -0
- aponyx/evaluation/performance/__init__.py +24 -0
- aponyx/evaluation/performance/adapters.py +109 -0
- aponyx/evaluation/performance/analyzer.py +384 -0
- aponyx/evaluation/performance/config.py +320 -0
- aponyx/evaluation/performance/decomposition.py +304 -0
- aponyx/evaluation/performance/metrics.py +761 -0
- aponyx/evaluation/performance/registry.py +327 -0
- aponyx/evaluation/performance/report.py +541 -0
- aponyx/evaluation/suitability/__init__.py +67 -0
- aponyx/evaluation/suitability/config.py +143 -0
- aponyx/evaluation/suitability/evaluator.py +389 -0
- aponyx/evaluation/suitability/registry.py +328 -0
- aponyx/evaluation/suitability/report.py +398 -0
- aponyx/evaluation/suitability/scoring.py +367 -0
- aponyx/evaluation/suitability/tests.py +303 -0
- aponyx/examples/01_generate_synthetic_data.py +53 -0
- aponyx/examples/02_fetch_data_file.py +82 -0
- aponyx/examples/03_fetch_data_bloomberg.py +104 -0
- aponyx/examples/04_compute_signal.py +164 -0
- aponyx/examples/05_evaluate_suitability.py +224 -0
- aponyx/examples/06_run_backtest.py +242 -0
- aponyx/examples/07_analyze_performance.py +214 -0
- aponyx/examples/08_visualize_results.py +272 -0
- aponyx/main.py +7 -0
- aponyx/models/__init__.py +45 -0
- aponyx/models/config.py +83 -0
- aponyx/models/indicator_transformation.json +52 -0
- aponyx/models/indicators.py +292 -0
- aponyx/models/metadata.py +447 -0
- aponyx/models/orchestrator.py +213 -0
- aponyx/models/registry.py +860 -0
- aponyx/models/score_transformation.json +42 -0
- aponyx/models/signal_catalog.json +29 -0
- aponyx/models/signal_composer.py +513 -0
- aponyx/models/signal_transformation.json +29 -0
- aponyx/persistence/__init__.py +16 -0
- aponyx/persistence/json_io.py +132 -0
- aponyx/persistence/parquet_io.py +378 -0
- aponyx/py.typed +0 -0
- aponyx/reporting/__init__.py +10 -0
- aponyx/reporting/generator.py +517 -0
- aponyx/visualization/__init__.py +20 -0
- aponyx/visualization/app.py +37 -0
- aponyx/visualization/plots.py +309 -0
- aponyx/visualization/visualizer.py +242 -0
- aponyx/workflows/__init__.py +18 -0
- aponyx/workflows/concrete_steps.py +720 -0
- aponyx/workflows/config.py +122 -0
- aponyx/workflows/engine.py +279 -0
- aponyx/workflows/registry.py +116 -0
- aponyx/workflows/steps.py +180 -0
- aponyx-0.1.18.dist-info/METADATA +552 -0
- aponyx-0.1.18.dist-info/RECORD +104 -0
- aponyx-0.1.18.dist-info/WHEEL +4 -0
- aponyx-0.1.18.dist-info/entry_points.txt +2 -0
- aponyx-0.1.18.dist-info/licenses/LICENSE +21 -0
aponyx/evaluation/suitability/scoring.py
@@ -0,0 +1,367 @@
"""
Component scoring logic for suitability evaluation.

Provides scoring functions for the four evaluation components and
composite score computation.
"""

import logging

from aponyx.evaluation.suitability.config import SuitabilityConfig

logger = logging.getLogger(__name__)

def score_data_health(
    valid_obs: int,
    missing_pct: float,
    min_obs: int,
) -> float:
    """
    Score data quality and sufficiency.

    Parameters
    ----------
    valid_obs : int
        Number of valid observations after alignment.
    missing_pct : float
        Percentage of missing data (0-100).
    min_obs : int
        Minimum required observations threshold.

    Returns
    -------
    float
        Data health score on 0-1 scale.
        Returns 0.0 if insufficient observations.
        Otherwise, penalizes missing data with tolerance up to 20%.

    Notes
    -----
    Scoring logic:
    - If valid_obs < min_obs: score = 0.0 (insufficient data)
    - Else: score = max(0, 1 - missing_pct / 20)
      Capped at 0% (score=1.0) and 20% (score=0.0) missing.

    Examples
    --------
    >>> score_data_health(600, 5.0, 500)  # 600 obs, 5% missing
    0.75
    >>> score_data_health(400, 0.0, 500)  # Below threshold
    0.0
    """
    if valid_obs < min_obs:
        logger.debug(
            "Insufficient observations: %d < %d, score=0.0",
            valid_obs,
            min_obs,
        )
        return 0.0

    # Penalize missing data, tolerating up to 20%
    score = max(0.0, 1.0 - missing_pct / 20.0)

    logger.debug(
        "Data health: valid_obs=%d, missing_pct=%.2f%%, score=%.3f",
        valid_obs,
        missing_pct,
        score,
    )

    return score


def score_predictive(mean_abs_tstat: float) -> float:
    """
    Score predictive association strength.

    Parameters
    ----------
    mean_abs_tstat : float
        Mean absolute t-statistic across lags.

    Returns
    -------
    float
        Predictive score on 0-1 scale.
        Normalized by dividing by 3.0 (capped at 1.0).

    Notes
    -----
    Scoring logic:
    - score = min(1.0, mean_abs_tstat / 3.0)
    - t-stat > 2.0: Statistically significant at conventional levels
    - t-stat > 3.0: Strong evidence (score capped at 1.0)

    Examples
    --------
    >>> score_predictive(2.5)  # Significant
    0.833
    >>> score_predictive(4.0)  # Strong (capped)
    1.0
    """
    score = min(1.0, mean_abs_tstat / 3.0)

    logger.debug(
        "Predictive association: mean |t-stat|=%.3f, score=%.3f",
        mean_abs_tstat,
        score,
    )

    return score


def score_economic(effect_size_bps: float) -> float:
    """
    Score economic relevance based on effect size.

    Parameters
    ----------
    effect_size_bps : float
        Estimated impact in basis points per 1σ signal change.

    Returns
    -------
    float
        Economic score on 0-1 scale.

    Notes
    -----
    Scoring thresholds:
    - effect_size < 0.5 bps: Negligible → score = 0.2
    - 0.5 ≤ effect_size < 2.0 bps: Moderate → score = 0.6
    - effect_size ≥ 2.0 bps: Meaningful → score = 1.0

    For CDX spreads, 2 bps is ~0.4% spread change (economically relevant).

    Examples
    --------
    >>> score_economic(0.3)  # Negligible
    0.2
    >>> score_economic(1.5)  # Moderate
    0.6
    >>> score_economic(3.0)  # Meaningful
    1.0
    """
    if effect_size_bps < 0.5:
        score = 0.2
        category = "negligible"
    elif effect_size_bps < 2.0:
        score = 0.6
        category = "moderate"
    else:
        score = 1.0
        category = "meaningful"

    logger.debug(
        "Economic relevance: effect_size=%.3f bps (%s), score=%.3f",
        effect_size_bps,
        category,
        score,
    )

    return score


def score_stability(
    sign_consistency_ratio: float,
    beta_cv: float,
) -> float:
    """
    Score temporal stability based on rolling window statistics.

    Parameters
    ----------
    sign_consistency_ratio : float
        Proportion of rolling windows with same sign as aggregate beta (0-1).
    beta_cv : float
        Coefficient of variation of rolling betas (std / |mean|).

    Returns
    -------
    float
        Stability score on 0-1 scale.
        Weighted average of sign consistency and magnitude stability scores.

    Notes
    -----
    Scoring logic:

    Sign Consistency Component (50% weight):
    - ratio ≥ 0.8: score = 1.0 (highly consistent)
    - 0.6 ≤ ratio < 0.8: score = 0.5 (moderately consistent)
    - ratio < 0.6: score = 0.0 (inconsistent)

    Magnitude Stability Component (50% weight):
    - CV < 0.5: score = 1.0 (stable magnitude)
    - 0.5 ≤ CV < 1.0: score = 0.5 (moderate variation)
    - CV ≥ 1.0: score = 0.0 (high variation)

    Final score = 0.5 × sign_score + 0.5 × magnitude_score

    Examples
    --------
    >>> score_stability(0.85, 0.3)  # High consistency, low CV
    1.0
    >>> score_stability(0.75, 0.6)  # Moderate both
    0.5
    >>> score_stability(0.5, 1.2)  # Low consistency, high CV
    0.0
    """
    # Score sign consistency
    if sign_consistency_ratio >= 0.8:
        sign_score = 1.0
        sign_category = "highly consistent"
    elif sign_consistency_ratio >= 0.6:
        sign_score = 0.5
        sign_category = "moderately consistent"
    else:
        sign_score = 0.0
        sign_category = "inconsistent"

    # Score magnitude stability (lower CV = more stable)
    if beta_cv < 0.5:
        magnitude_score = 1.0
        magnitude_category = "stable"
    elif beta_cv < 1.0:
        magnitude_score = 0.5
        magnitude_category = "moderate variation"
    else:
        magnitude_score = 0.0
        magnitude_category = "high variation"

    # Weighted average (equal weights)
    score = 0.5 * sign_score + 0.5 * magnitude_score

    logger.debug(
        "Temporal stability: sign_ratio=%.3f (%s, score=%.1f), "
        "CV=%.3f (%s, score=%.1f), final_score=%.3f",
        sign_consistency_ratio,
        sign_category,
        sign_score,
        beta_cv,
        magnitude_category,
        magnitude_score,
        score,
    )

    return score


def compute_composite_score(
    data_health_score: float,
    predictive_score: float,
    economic_score: float,
    stability_score: float,
    config: SuitabilityConfig,
) -> float:
    """
    Compute weighted composite score from component scores.

    Parameters
    ----------
    data_health_score : float
        Data quality score (0-1).
    predictive_score : float
        Predictive association score (0-1).
    economic_score : float
        Economic relevance score (0-1).
    stability_score : float
        Temporal stability score (0-1).
    config : SuitabilityConfig
        Configuration with component weights.

    Returns
    -------
    float
        Composite score on 0-1 scale (weighted average).

    Notes
    -----
    Default weights:
    - Data health: 20%
    - Predictive: 40%
    - Economic: 20%
    - Stability: 20%

    Examples
    --------
    >>> config = SuitabilityConfig()
    >>> compute_composite_score(0.8, 0.9, 0.6, 1.0, config)
    0.84
    """
    composite = (
        config.data_health_weight * data_health_score
        + config.predictive_weight * predictive_score
        + config.economic_weight * economic_score
        + config.stability_weight * stability_score
    )

    logger.debug(
        "Composite score: %.3f = %.2f×%.3f + %.2f×%.3f + %.2f×%.3f + %.2f×%.3f",
        composite,
        config.data_health_weight,
        data_health_score,
        config.predictive_weight,
        predictive_score,
        config.economic_weight,
        economic_score,
        config.stability_weight,
        stability_score,
    )

    return composite


def assign_decision(
    composite_score: float,
    config: SuitabilityConfig,
) -> str:
    """
    Assign decision based on composite score and thresholds.

    Parameters
    ----------
    composite_score : float
        Composite score (0-1).
    config : SuitabilityConfig
        Configuration with decision thresholds.

    Returns
    -------
    str
        Decision: "PASS", "HOLD", or "FAIL".

    Notes
    -----
    Decision logic:
    - score ≥ pass_threshold (0.7): PASS → proceed to backtest
    - pass_threshold > score ≥ hold_threshold (0.4): HOLD → marginal, requires judgment
    - score < hold_threshold: FAIL → do not backtest

    Examples
    --------
    >>> config = SuitabilityConfig()
    >>> assign_decision(0.75, config)
    'PASS'
    >>> assign_decision(0.55, config)
    'HOLD'
    >>> assign_decision(0.35, config)
    'FAIL'
    """
    if composite_score >= config.pass_threshold:
        decision = "PASS"
    elif composite_score >= config.hold_threshold:
        decision = "HOLD"
    else:
        decision = "FAIL"

    logger.debug(
        "Decision assignment: score=%.3f, pass_threshold=%.2f, hold_threshold=%.2f → %s",
        composite_score,
        config.pass_threshold,
        config.hold_threshold,
        decision,
    )

    return decision
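
Taken together, these functions form a short pipeline: score each of the four components, blend them with the configured weights, then map the composite onto a PASS/HOLD/FAIL decision. A minimal sketch of that flow, assuming SuitabilityConfig's defaults match the docstrings above (weights 20/40/20/20, pass_threshold=0.7, hold_threshold=0.4):

    # Illustrative only; assumes SuitabilityConfig defaults match the
    # docstrings above (weights 20/40/20/20, thresholds 0.7 / 0.4).
    from aponyx.evaluation.suitability.config import SuitabilityConfig
    from aponyx.evaluation.suitability.scoring import (
        assign_decision,
        compute_composite_score,
        score_data_health,
        score_economic,
        score_predictive,
        score_stability,
    )

    config = SuitabilityConfig()

    health = score_data_health(valid_obs=600, missing_pct=5.0, min_obs=500)  # 0.75
    predictive = score_predictive(mean_abs_tstat=2.5)                        # ~0.833
    economic = score_economic(effect_size_bps=1.5)                           # 0.6
    stability = score_stability(sign_consistency_ratio=0.85, beta_cv=0.3)    # 1.0

    composite = compute_composite_score(
        health, predictive, economic, stability, config
    )
    # 0.2*0.75 + 0.4*0.833 + 0.2*0.6 + 0.2*1.0 ≈ 0.80
    print(assign_decision(composite, config))  # 'PASS' at the default 0.7 threshold
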
aponyx/evaluation/suitability/tests.py
@@ -0,0 +1,303 @@
"""
Statistical tests for signal-target relationships.

Provides correlation, regression, and temporal stability tests for
evaluating predictive associations.
"""

import logging

import numpy as np
import pandas as pd
import statsmodels.api as sm

logger = logging.getLogger(__name__)

def compute_correlation(
    signal: pd.Series,
    target: pd.Series,
) -> float:
    """
    Compute Pearson correlation between signal and target.

    Parameters
    ----------
    signal : pd.Series
        Signal time series.
    target : pd.Series
        Target time series (must be aligned with signal).

    Returns
    -------
    float
        Pearson correlation coefficient (-1 to 1).

    Notes
    -----
    Returns 0.0 if either series has zero variance or contains NaN values.

    Examples
    --------
    >>> signal = pd.Series([1, 2, 3, 4, 5])
    >>> target = pd.Series([2, 4, 6, 8, 10])
    >>> compute_correlation(signal, target)
    1.0
    """
    if len(signal) == 0 or len(target) == 0:
        logger.warning("Empty series provided, returning correlation=0.0")
        return 0.0

    if signal.std() == 0 or target.std() == 0:
        logger.warning("Zero variance in series, returning correlation=0.0")
        return 0.0

    corr = signal.corr(target)
    if pd.isna(corr):
        logger.warning("NaN correlation (insufficient data), returning 0.0")
        return 0.0

    logger.debug("Computed correlation: %.4f", corr)
    return float(corr)


def compute_regression_stats(
    signal: pd.Series,
    target: pd.Series,
) -> dict[str, float]:
    """
    Compute OLS regression statistics for signal predicting target.

    Runs ordinary least squares regression: target ~ signal

    Parameters
    ----------
    signal : pd.Series
        Independent variable (predictor).
    target : pd.Series
        Dependent variable (response).

    Returns
    -------
    dict[str, float]
        Dictionary with keys:
        - 'beta': regression coefficient
        - 't_stat': t-statistic for beta
        - 'p_value': p-value for beta
        - 'r_squared': coefficient of determination

    Notes
    -----
    Uses statsmodels OLS with constant term (intercept).
    Returns zeros if regression fails due to insufficient data or numerical issues.

    Examples
    --------
    >>> signal = pd.Series([1, 2, 3, 4, 5])
    >>> target = pd.Series([2, 4, 6, 8, 10])
    >>> stats = compute_regression_stats(signal, target)
    >>> stats['beta']
    2.0
    """
    if len(signal) < 3 or len(target) < 3:
        logger.warning(
            "Insufficient observations for regression (n=%d), returning zeros",
            len(signal),
        )
        return {"beta": 0.0, "t_stat": 0.0, "p_value": 1.0, "r_squared": 0.0}

    try:
        # Add constant for intercept
        X = sm.add_constant(signal.values)
        y = target.values

        # Fit OLS model
        model = sm.OLS(y, X).fit()

        # Extract statistics for signal coefficient (index 1, after constant)
        beta = float(model.params[1])
        t_stat = float(model.tvalues[1])
        p_value = float(model.pvalues[1])
        r_squared = float(model.rsquared)

        logger.debug(
            "Regression: beta=%.4f, t=%.4f, p=%.4f, R²=%.4f",
            beta,
            t_stat,
            p_value,
            r_squared,
        )

        return {
            "beta": beta,
            "t_stat": t_stat,
            "p_value": p_value,
            "r_squared": r_squared,
        }

    except Exception as e:
        logger.warning("Regression failed: %s, returning zeros", e)
        return {"beta": 0.0, "t_stat": 0.0, "p_value": 1.0, "r_squared": 0.0}


def compute_rolling_betas(
    signal: pd.Series,
    target: pd.Series,
    window: int,
) -> pd.Series:
    """
    Compute rolling regression betas using sliding window.

    Parameters
    ----------
    signal : pd.Series
        Signal time series with DatetimeIndex.
    target : pd.Series
        Target time series with DatetimeIndex (aligned with signal).
    window : int
        Rolling window size in observations (e.g., 252 for ~1 year daily data).

    Returns
    -------
    pd.Series
        Time series of rolling beta coefficients.
        Index matches input series; first (window-1) values are NaN.

    Notes
    -----
    Uses OLS regression in each window: target ~ signal + constant.
    Minimum window size is 50 observations for reliable estimation.

    The implementation loops over sliding windows, fitting an OLS
    regression in each and recording the beta coefficient.

    Examples
    --------
    >>> signal = pd.Series([...], index=date_range)
    >>> target = pd.Series([...], index=date_range)
    >>> rolling_betas = compute_rolling_betas(signal, target, window=252)
    >>> rolling_betas.mean()  # Average beta across all windows
    """
    if len(signal) < window:
        logger.warning(
            "Insufficient data for rolling window (n=%d < window=%d), returning empty series",
            len(signal),
            window,
        )
        return pd.Series([], dtype=float, index=signal.index[:0])

    # Preallocate result array
    betas = np.full(len(signal), np.nan)

    # Compute beta for each window
    for i in range(window - 1, len(signal)):
        window_signal = signal.iloc[i - window + 1 : i + 1]
        window_target = target.iloc[i - window + 1 : i + 1]

        if len(window_signal) < 3:
            continue

        try:
            X = sm.add_constant(window_signal.values)
            y = window_target.values
            model = sm.OLS(y, X).fit()
            betas[i] = float(model.params[1])
        except Exception:
            pass  # Keep as NaN

    rolling_betas = pd.Series(betas, index=signal.index, name=signal.name)

    logger.debug(
        "Computed %d rolling betas (window=%d, valid=%d)",
        len(rolling_betas),
        window,
        rolling_betas.notna().sum(),
    )

    return rolling_betas


def compute_stability_metrics(
    rolling_betas: pd.Series,
    aggregate_beta: float,
) -> dict[str, float]:
    """
    Compute stability metrics from rolling beta coefficients.

    Parameters
    ----------
    rolling_betas : pd.Series
        Time series of rolling beta coefficients from compute_rolling_betas().
    aggregate_beta : float
        Overall beta coefficient from full-sample regression.
        Used as reference for sign consistency check.

    Returns
    -------
    dict[str, float]
        Dictionary with keys:
        - 'sign_consistency_ratio': Proportion of windows matching aggregate sign
        - 'beta_cv': Coefficient of variation (std / |mean|)
        - 'n_windows': Number of valid rolling windows

    Notes
    -----
    Sign consistency ratio ≥ 0.8 indicates stable directional relationship.
    Beta CV < 0.5 indicates low magnitude variation (stable effect size).

    Windows with beta ≈ 0 (|beta| < 0.01) are excluded from sign consistency
    to avoid spurious sign flips in noise.

    Examples
    --------
    >>> rolling_betas = pd.Series([1.5, 1.8, 1.6, 1.7, 1.9])
    >>> aggregate_beta = 1.7
    >>> metrics = compute_stability_metrics(rolling_betas, aggregate_beta)
    >>> metrics['sign_consistency_ratio']
    1.0  # All same sign
    >>> metrics['beta_cv']
    0.08  # Low variation
    """
    # Remove NaN values
    valid_betas = rolling_betas.dropna()

    if len(valid_betas) == 0:
        logger.warning("No valid rolling betas, returning zero metrics")
        return {
            "sign_consistency_ratio": 0.0,
            "beta_cv": 0.0,
            "n_windows": 0,
        }

    # Sign consistency: proportion of windows with same sign as aggregate
    aggregate_sign = np.sign(aggregate_beta)

    # Filter out near-zero betas (|beta| < 0.01) to avoid noise
    non_zero_mask = np.abs(valid_betas) >= 0.01
    if non_zero_mask.sum() == 0:
        sign_consistency_ratio = 0.0
    else:
        same_sign = (np.sign(valid_betas[non_zero_mask]) == aggregate_sign).sum()
        sign_consistency_ratio = float(same_sign / non_zero_mask.sum())

    # Coefficient of variation: std / |mean|
    beta_mean = valid_betas.mean()
    beta_std = valid_betas.std()

    if abs(beta_mean) < 1e-10:
        beta_cv = 0.0
    else:
        beta_cv = float(beta_std / abs(beta_mean))

    logger.debug(
        "Stability metrics: sign_ratio=%.3f, CV=%.3f, n_windows=%d",
        sign_consistency_ratio,
        beta_cv,
        len(valid_betas),
    )

    return {
        "sign_consistency_ratio": sign_consistency_ratio,
        "beta_cv": beta_cv,
        "n_windows": len(valid_betas),
    }
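
These tests feed the stability scoring in scoring.py: the full-sample regression gives the aggregate beta, the rolling betas give the consistency metrics, and those map onto score_stability. A hedged sketch of that wiring on synthetic data (the package's own orchestration lives in evaluator.py; this is illustrative only):

    # Illustrative wiring; the actual orchestration lives in
    # aponyx/evaluation/suitability/evaluator.py.
    import numpy as np
    import pandas as pd

    from aponyx.evaluation.suitability.scoring import score_stability
    from aponyx.evaluation.suitability.tests import (
        compute_regression_stats,
        compute_rolling_betas,
        compute_stability_metrics,
    )

    rng = np.random.default_rng(0)
    idx = pd.date_range("2020-01-01", periods=750, freq="B")
    signal = pd.Series(rng.standard_normal(750), index=idx)
    target = 1.5 * signal + pd.Series(rng.standard_normal(750), index=idx)

    full_sample = compute_regression_stats(signal, target)       # aggregate beta
    rolling = compute_rolling_betas(signal, target, window=252)  # per-window betas
    metrics = compute_stability_metrics(rolling, full_sample["beta"])

    stability = score_stability(
        metrics["sign_consistency_ratio"],
        metrics["beta_cv"],
    )
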
aponyx/examples/01_generate_synthetic_data.py
@@ -0,0 +1,53 @@
"""
Generate synthetic market data for all instruments.

Prerequisites
-------------
None — this is the first step in the workflow.

Outputs
-------
Raw synthetic data files in data/raw/synthetic/:
- cdx_ig_5y_{hash}.parquet (CDX spread data)
- cdx_ig_10y_{hash}.parquet (CDX spread data)
- cdx_hy_5y_{hash}.parquet (CDX spread data)
- itrx_xover_5y_{hash}.parquet (CDX spread data)
- itrx_eur_5y_{hash}.parquet (CDX spread data)
- vix_{hash}.parquet (VIX volatility index)
- hyg_{hash}.parquet (High yield ETF spreads)
- lqd_{hash}.parquet (Investment grade ETF spreads)

Each file includes metadata JSON with generation parameters.

Examples
--------
Run from project root:

    python -m aponyx.examples.01_generate_synthetic_data

Expected output: 8 parquet files with 5 years of daily data (~1260 rows each).
"""

from aponyx.config import RAW_DIR
from aponyx.data.sample_data import generate_for_fetch_interface


def main() -> None:
    """
    Generate synthetic market data for testing and demonstrations.

    Creates realistic time series data for all instruments defined in
    bloomberg_securities.json, using hash-based naming compatible with
    the data fetch interface.
    """
    output_dir = RAW_DIR / "synthetic"

    generate_for_fetch_interface(
        output_dir=output_dir,
        start_date="2020-01-01",
        end_date="2025-01-01",
        seed=42,
    )


if __name__ == "__main__":
    main()
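
After running the script, the output can be spot-checked with pandas. A quick verification sketch, assuming only what the docstring states (parquet files under data/raw/synthetic/ with hash-suffixed names, so glob rather than hard-coding them):

    # Spot-check generated files; names include a content hash, so glob.
    import pandas as pd

    from aponyx.config import RAW_DIR

    for path in sorted((RAW_DIR / "synthetic").glob("*.parquet")):
        df = pd.read_parquet(path)
        print(f"{path.name}: {len(df)} rows, columns={list(df.columns)}")
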