aponyx-0.1.18-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aponyx/__init__.py +14 -0
- aponyx/backtest/__init__.py +31 -0
- aponyx/backtest/adapters.py +77 -0
- aponyx/backtest/config.py +84 -0
- aponyx/backtest/engine.py +560 -0
- aponyx/backtest/protocols.py +101 -0
- aponyx/backtest/registry.py +334 -0
- aponyx/backtest/strategy_catalog.json +50 -0
- aponyx/cli/__init__.py +5 -0
- aponyx/cli/commands/__init__.py +8 -0
- aponyx/cli/commands/clean.py +349 -0
- aponyx/cli/commands/list.py +302 -0
- aponyx/cli/commands/report.py +167 -0
- aponyx/cli/commands/run.py +377 -0
- aponyx/cli/main.py +125 -0
- aponyx/config/__init__.py +82 -0
- aponyx/data/__init__.py +99 -0
- aponyx/data/bloomberg_config.py +306 -0
- aponyx/data/bloomberg_instruments.json +26 -0
- aponyx/data/bloomberg_securities.json +42 -0
- aponyx/data/cache.py +294 -0
- aponyx/data/fetch.py +659 -0
- aponyx/data/fetch_registry.py +135 -0
- aponyx/data/loaders.py +205 -0
- aponyx/data/providers/__init__.py +13 -0
- aponyx/data/providers/bloomberg.py +383 -0
- aponyx/data/providers/file.py +111 -0
- aponyx/data/registry.py +500 -0
- aponyx/data/requirements.py +96 -0
- aponyx/data/sample_data.py +415 -0
- aponyx/data/schemas.py +60 -0
- aponyx/data/sources.py +171 -0
- aponyx/data/synthetic_params.json +46 -0
- aponyx/data/transforms.py +336 -0
- aponyx/data/validation.py +308 -0
- aponyx/docs/__init__.py +24 -0
- aponyx/docs/adding_data_providers.md +682 -0
- aponyx/docs/cdx_knowledge_base.md +455 -0
- aponyx/docs/cdx_overlay_strategy.md +135 -0
- aponyx/docs/cli_guide.md +607 -0
- aponyx/docs/governance_design.md +551 -0
- aponyx/docs/logging_design.md +251 -0
- aponyx/docs/performance_evaluation_design.md +265 -0
- aponyx/docs/python_guidelines.md +786 -0
- aponyx/docs/signal_registry_usage.md +369 -0
- aponyx/docs/signal_suitability_design.md +558 -0
- aponyx/docs/visualization_design.md +277 -0
- aponyx/evaluation/__init__.py +11 -0
- aponyx/evaluation/performance/__init__.py +24 -0
- aponyx/evaluation/performance/adapters.py +109 -0
- aponyx/evaluation/performance/analyzer.py +384 -0
- aponyx/evaluation/performance/config.py +320 -0
- aponyx/evaluation/performance/decomposition.py +304 -0
- aponyx/evaluation/performance/metrics.py +761 -0
- aponyx/evaluation/performance/registry.py +327 -0
- aponyx/evaluation/performance/report.py +541 -0
- aponyx/evaluation/suitability/__init__.py +67 -0
- aponyx/evaluation/suitability/config.py +143 -0
- aponyx/evaluation/suitability/evaluator.py +389 -0
- aponyx/evaluation/suitability/registry.py +328 -0
- aponyx/evaluation/suitability/report.py +398 -0
- aponyx/evaluation/suitability/scoring.py +367 -0
- aponyx/evaluation/suitability/tests.py +303 -0
- aponyx/examples/01_generate_synthetic_data.py +53 -0
- aponyx/examples/02_fetch_data_file.py +82 -0
- aponyx/examples/03_fetch_data_bloomberg.py +104 -0
- aponyx/examples/04_compute_signal.py +164 -0
- aponyx/examples/05_evaluate_suitability.py +224 -0
- aponyx/examples/06_run_backtest.py +242 -0
- aponyx/examples/07_analyze_performance.py +214 -0
- aponyx/examples/08_visualize_results.py +272 -0
- aponyx/main.py +7 -0
- aponyx/models/__init__.py +45 -0
- aponyx/models/config.py +83 -0
- aponyx/models/indicator_transformation.json +52 -0
- aponyx/models/indicators.py +292 -0
- aponyx/models/metadata.py +447 -0
- aponyx/models/orchestrator.py +213 -0
- aponyx/models/registry.py +860 -0
- aponyx/models/score_transformation.json +42 -0
- aponyx/models/signal_catalog.json +29 -0
- aponyx/models/signal_composer.py +513 -0
- aponyx/models/signal_transformation.json +29 -0
- aponyx/persistence/__init__.py +16 -0
- aponyx/persistence/json_io.py +132 -0
- aponyx/persistence/parquet_io.py +378 -0
- aponyx/py.typed +0 -0
- aponyx/reporting/__init__.py +10 -0
- aponyx/reporting/generator.py +517 -0
- aponyx/visualization/__init__.py +20 -0
- aponyx/visualization/app.py +37 -0
- aponyx/visualization/plots.py +309 -0
- aponyx/visualization/visualizer.py +242 -0
- aponyx/workflows/__init__.py +18 -0
- aponyx/workflows/concrete_steps.py +720 -0
- aponyx/workflows/config.py +122 -0
- aponyx/workflows/engine.py +279 -0
- aponyx/workflows/registry.py +116 -0
- aponyx/workflows/steps.py +180 -0
- aponyx-0.1.18.dist-info/METADATA +552 -0
- aponyx-0.1.18.dist-info/RECORD +104 -0
- aponyx-0.1.18.dist-info/WHEEL +4 -0
- aponyx-0.1.18.dist-info/entry_points.txt +2 -0
- aponyx-0.1.18.dist-info/licenses/LICENSE +21 -0
aponyx/data/transforms.py
ADDED
@@ -0,0 +1,336 @@
"""
Time series transformation functions for financial data.

Provides standardized transformations with consistent edge case handling:
- First difference (absolute change)
- Percent change (relative change)
- Log returns (continuous compounding)
- Z-score normalization (standardization)
- Normalized change (volatility-adjusted)
"""

import logging
from typing import Literal

import numpy as np
import pandas as pd

logger = logging.getLogger(__name__)

TransformType = Literal[
    "diff",
    "pct_change",
    "log_return",
    "z_score",
    "normalized_change",
]


def apply_transform(
    series: pd.Series,
    transform: TransformType,
    *,
    window: int | None = None,
    min_periods: int | None = None,
    periods: int = 1,
) -> pd.Series:
    """
    Apply time series transformation with edge case handling.

    Parameters
    ----------
    series : pd.Series
        Input time series with DatetimeIndex.
    transform : TransformType
        Type of transformation to apply:
        - 'diff': First difference (x[t] - x[t-periods])
        - 'pct_change': Percent change ((x[t] - x[t-periods]) / x[t-periods])
        - 'log_return': Log return (log(x[t] / x[t-periods]))
        - 'z_score': Rolling z-score normalization
        - 'normalized_change': Change normalized by rolling volatility
    window : int or None
        Rolling window size for z_score and normalized_change.
        Required for these transforms, ignored for others.
    min_periods : int or None
        Minimum observations for rolling calculations.
        Defaults to window if not specified.
    periods : int, default 1
        Number of periods for differencing operations.

    Returns
    -------
    pd.Series
        Transformed series with same DatetimeIndex.
        NaN values propagate from input or calculation.

    Raises
    ------
    ValueError
        If required parameters are missing for transform type.
        If log_return used on series with non-positive values.

    Examples
    --------
    >>> spreads = pd.Series([100, 105, 103, 108], index=pd.date_range('2024-01-01', periods=4))
    >>> apply_transform(spreads, 'diff')
    >>> apply_transform(spreads, 'pct_change')
    >>> apply_transform(spreads, 'z_score', window=20, min_periods=10)

    Notes
    -----
    - All transforms preserve DatetimeIndex alignment
    - NaN handling follows pandas conventions (NaN in = NaN out)
    - Division by zero in pct_change produces inf (pandas default)
    - log_return validates input is positive before calculation
    """
    if transform == "diff":
        return _diff(series, periods)
    elif transform == "pct_change":
        return _pct_change(series, periods)
    elif transform == "log_return":
        return _log_return(series, periods)
    elif transform == "z_score":
        if window is None:
            raise ValueError("window parameter required for z_score transform")
        return _z_score(series, window, min_periods)
    elif transform == "normalized_change":
        if window is None:
            raise ValueError(
                "window parameter required for normalized_change transform"
            )
        return _normalized_change(series, window, min_periods, periods)
    else:
        raise ValueError(f"Unknown transform type: {transform}")


def _diff(series: pd.Series, periods: int = 1) -> pd.Series:
    """
    Compute first difference: x[t] - x[t-periods].

    Parameters
    ----------
    series : pd.Series
        Input time series.
    periods : int, default 1
        Number of periods to difference.

    Returns
    -------
    pd.Series
        First differences.

    Notes
    -----
    First `periods` observations will be NaN.
    """
    return series.diff(periods)


def _pct_change(series: pd.Series, periods: int = 1) -> pd.Series:
    """
    Compute percent change: (x[t] - x[t-periods]) / x[t-periods].

    Parameters
    ----------
    series : pd.Series
        Input time series.
    periods : int, default 1
        Number of periods for change calculation.

    Returns
    -------
    pd.Series
        Percent changes.

    Notes
    -----
    - Division by zero produces inf (pandas default behavior)
    - First `periods` observations will be NaN
    - Use for cross-asset comparison where scales differ
    """
    return series.pct_change(periods)


def _log_return(series: pd.Series, periods: int = 1) -> pd.Series:
    """
    Compute log returns: log(x[t] / x[t-periods]).

    Parameters
    ----------
    series : pd.Series
        Input time series. Must contain only positive values.
    periods : int, default 1
        Number of periods for return calculation.

    Returns
    -------
    pd.Series
        Log returns.

    Raises
    ------
    ValueError
        If series contains non-positive values (zero or negative).

    Notes
    -----
    - Validates all non-NaN values are positive before calculation
    - First `periods` observations will be NaN
    - Preferred for risk calculations (continuous compounding)
    - Approximates pct_change for small changes
    """
    # Check for non-positive values (excluding NaN)
    non_positive = (series <= 0) & series.notna()
    if non_positive.any():
        n_invalid = non_positive.sum()
        raise ValueError(
            f"log_return requires positive values, found {n_invalid} non-positive entries"
        )

    return np.log(series / series.shift(periods))


def _z_score(
    series: pd.Series, window: int, min_periods: int | None = None
) -> pd.Series:
    """
    Compute rolling z-score: (x - rolling_mean) / rolling_std.

    Parameters
    ----------
    series : pd.Series
        Input time series.
    window : int
        Rolling window size for mean and std calculation.
    min_periods : int or None
        Minimum observations required. Defaults to window.

    Returns
    -------
    pd.Series
        Rolling z-scores (zero mean, unit variance within window).

    Notes
    -----
    - Useful for regime-independent signals
    - First `window - 1` (or `min_periods - 1`) observations will be NaN
    - Division by zero std produces inf (pandas default)
    - More robust than raw differences when volatility varies over time
    """
    if min_periods is None:
        min_periods = window

    rolling_mean = series.rolling(window=window, min_periods=min_periods).mean()
    rolling_std = series.rolling(window=window, min_periods=min_periods).std()

    return (series - rolling_mean) / rolling_std


def _normalized_change(
    series: pd.Series,
    window: int,
    min_periods: int | None = None,
    periods: int = 1,
) -> pd.Series:
    """
    Compute change normalized by rolling volatility: (x[t] - x[t-periods]) / rolling_std.

    Parameters
    ----------
    series : pd.Series
        Input time series.
    window : int
        Rolling window for volatility calculation.
    min_periods : int or None
        Minimum observations required. Defaults to window.
    periods : int, default 1
        Number of periods for change calculation.

    Returns
    -------
    pd.Series
        Volatility-normalized changes.

    Notes
    -----
    - Combines absolute change with volatility scaling
    - Useful when comparing signals across different regimes
    - Similar to z_score but uses absolute change instead of deviation from mean
    - First `max(window, periods)` observations will be NaN
    """
    if min_periods is None:
        min_periods = window

    change = series.diff(periods)
    rolling_std = series.rolling(window=window, min_periods=min_periods).std()

    return change / rolling_std


def apply_signal_transformation(
    series: pd.Series,
    *,
    scaling: float = 1.0,
    floor: float | None = None,
    cap: float | None = None,
    neutral_range: tuple[float, float] | None = None,
) -> pd.Series:
    """
    Apply trading rule transformation to score series.

    Applies transformations in order: scale → floor/cap → neutral_range.
    This converts normalized scores into bounded trading signals.

    Parameters
    ----------
    series : pd.Series
        Input score series (typically z-scores or normalized values).
    scaling : float, default 1.0
        Multiplier applied first to scale the signal.
        Must be non-zero.
    floor : float or None, optional
        Lower bound after scaling. None = no lower bound (-inf).
    cap : float or None, optional
        Upper bound after scaling. None = no upper bound (+inf).
    neutral_range : tuple[float, float] or None, optional
        Values within [low, high] set to zero (no position signal).
        None = no neutral zone.

    Returns
    -------
    pd.Series
        Transformed signal with same DatetimeIndex.
        NaN values are preserved from input.

    Examples
    --------
    >>> scores = pd.Series([2.5, -1.8, 0.3, -0.1], index=pd.date_range('2024-01-01', periods=4))
    >>> apply_signal_transformation(scores, floor=-1.5, cap=1.5)
    >>> apply_signal_transformation(scores, scaling=2.0, floor=-2.0, cap=2.0)
    >>> apply_signal_transformation(scores, floor=-1.5, cap=1.5, neutral_range=(-0.25, 0.25))

    Notes
    -----
    - Transformation order is fixed: scale → floor/cap → neutral_range
    - NaN values propagate through all operations
    - Neutral range is applied AFTER floor/cap bounds
    - Use scaling=1.0 for no scaling (passthrough scaling factor)
    """
    # Apply scaling first
    result = series * scaling

    # Apply floor/cap bounds
    if floor is not None:
        result = result.clip(lower=floor)
    if cap is not None:
        result = result.clip(upper=cap)

    # Apply neutral range (zero out values within range)
    if neutral_range is not None:
        low, high = neutral_range
        # Set values within neutral range to 0.0, keeping values outside the range
        # Preserve NaN values by checking explicitly
        mask = (result >= low) & (result <= high)  # Values inside neutral range
        result = result.mask(mask, 0.0)  # Set those to 0.0, NaN values preserved

    return result
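Taken together, apply_transform and apply_signal_transformation form a small normalize-then-bound pipeline: a raw series is turned into scores, and the scores are clipped into a bounded signal. A minimal usage sketch of the two public functions above; the synthetic series, window, and bounds are illustrative assumptions, not values prescribed by the package:

# Illustrative example (not part of the package): normalize a spread series,
# then turn the scores into a bounded trading signal with a neutral zone.
import numpy as np
import pandas as pd

from aponyx.data.transforms import apply_signal_transformation, apply_transform

dates = pd.date_range("2024-01-01", periods=60, freq="B")
spreads = pd.Series(
    100 + np.cumsum(np.random.default_rng(0).normal(0.0, 1.0, 60)), index=dates
)

# Rolling z-score; with min_periods equal to window, the first 19 values are NaN.
scores = apply_transform(spreads, "z_score", window=20, min_periods=20)

# Clip to [-2, 2] and zero out scores within [-0.25, 0.25].
signal = apply_signal_transformation(
    scores, scaling=1.0, floor=-2.0, cap=2.0, neutral_range=(-0.25, 0.25)
)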
aponyx/data/validation.py
ADDED
@@ -0,0 +1,308 @@
"""
Data validation utilities for market data quality checks.

Validates schema compliance, data types, and business logic constraints.
"""

import logging

import pandas as pd

from .schemas import CDXSchema, VIXSchema, ETFSchema

logger = logging.getLogger(__name__)


def _ensure_datetime_index(df: pd.DataFrame, date_col: str) -> pd.DataFrame:
    """
    Convert DataFrame to use DatetimeIndex if not already.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame to process.
    date_col : str
        Name of date column to use as index.

    Returns
    -------
    pd.DataFrame
        DataFrame with DatetimeIndex named 'date', sorted by date.
    """
    if not isinstance(df.index, pd.DatetimeIndex):
        df = df.copy()
        df[date_col] = pd.to_datetime(df[date_col])
        df = df.set_index(date_col)

    # Ensure index is named 'date'
    df.index.name = "date"

    return df.sort_index()


def _check_duplicate_dates(df: pd.DataFrame, context: str = "") -> None:
    """
    Check for and log duplicate dates in DataFrame index.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame with DatetimeIndex to check.
    context : str, optional
        Additional context for log message (e.g., ticker name).
    """
    if df.index.duplicated().any():
        n_dups = df.index.duplicated().sum()
        if context:
            logger.warning("Found %d duplicate dates for %s", n_dups, context)
        else:
            logger.warning("Found %d duplicate dates", n_dups)


def handle_duplicate_index(
    df: pd.DataFrame,
    strategy: str = "last",
    context: str = "",
) -> pd.DataFrame:
    """
    Remove duplicate index entries with logging.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame with potential duplicate indices.
    strategy : str, default "last"
        Deduplication strategy:
        - "first": Keep first occurrence
        - "last": Keep last occurrence
        - "raise": Raise ValueError if duplicates found
    context : str, optional
        Context for logging (e.g., "CDX IG 5Y").

    Returns
    -------
    pd.DataFrame
        DataFrame with duplicates removed.

    Raises
    ------
    ValueError
        If strategy="raise" and duplicates are found, or if strategy is invalid.

    Examples
    --------
    >>> df = pd.DataFrame({"value": [1, 2, 3]}, index=pd.DatetimeIndex(["2024-01-01", "2024-01-01", "2024-01-02"]))
    >>> clean_df = handle_duplicate_index(df, strategy="last")
    >>> clean_df = handle_duplicate_index(df, strategy="raise")  # Raises ValueError
    """
    if strategy not in ("first", "last", "raise"):
        raise ValueError(
            f"Invalid strategy '{strategy}'. Must be 'first', 'last', or 'raise'"
        )

    if not df.index.duplicated().any():
        return df

    # Log duplicates
    _check_duplicate_dates(df, context)

    # Handle based on strategy
    if strategy == "raise":
        n_dups = df.index.duplicated().sum()
        raise ValueError(
            f"Found {n_dups} duplicate index entries"
            + (f" for {context}" if context else "")
        )

    # Remove duplicates
    df_clean = df[~df.index.duplicated(keep=strategy)]
    logger.debug("Removed duplicates using strategy='%s'", strategy)
    return df_clean


def validate_cdx_schema(
    df: pd.DataFrame, schema: CDXSchema = CDXSchema()
) -> pd.DataFrame:
    """
    Validate CDX index data against expected schema.

    Parameters
    ----------
    df : pd.DataFrame
        Raw CDX data to validate.
    schema : CDXSchema, default CDXSchema()
        Schema definition with column names and constraints.

    Returns
    -------
    pd.DataFrame
        Validated DataFrame with DatetimeIndex.

    Raises
    ------
    ValueError
        If required columns are missing or data violates constraints.

    Notes
    -----
    - Converts date column to DatetimeIndex
    - Validates spread values are within bounds
    - Checks for duplicate dates per index
    """
    logger.info("Validating CDX schema: %d rows", len(df))

    # Check required columns (except date if already indexed)
    required_cols = list(schema.required_cols)
    if isinstance(df.index, pd.DatetimeIndex):
        # Already has DatetimeIndex, don't require date column
        required_cols = [col for col in required_cols if col != schema.date_col]

    missing_cols = [col for col in required_cols if col not in df.columns]
    if missing_cols:
        raise ValueError(f"Missing required columns: {missing_cols}")

    # Validate spread bounds
    if not df[schema.spread_col].between(schema.min_spread, schema.max_spread).all():
        invalid = df[
            ~df[schema.spread_col].between(schema.min_spread, schema.max_spread)
        ]
        logger.warning(
            "Found %d invalid spread values outside [%.1f, %.1f]",
            len(invalid),
            schema.min_spread,
            schema.max_spread,
        )
        raise ValueError(f"Spread values outside valid range: {invalid.head()}")

    # Convert to DatetimeIndex and sort
    df = _ensure_datetime_index(df, schema.date_col)

    # Remove duplicates if present (without logging warning)
    if df.index.duplicated().any():
        n_dups = df.index.duplicated().sum()
        logger.debug("Removing %d duplicate dates for CDX", n_dups)
        df = df[~df.index.duplicated(keep="last")]

    logger.debug(
        "CDX validation passed: date_range=%s to %s", df.index.min(), df.index.max()
    )
    return df


def validate_vix_schema(
    df: pd.DataFrame, schema: VIXSchema = VIXSchema()
) -> pd.DataFrame:
    """
    Validate VIX volatility data against expected schema.

    Parameters
    ----------
    df : pd.DataFrame
        Raw VIX data to validate.
    schema : VIXSchema, default VIXSchema()
        Schema definition with column names and constraints.

    Returns
    -------
    pd.DataFrame
        Validated DataFrame with DatetimeIndex.

    Raises
    ------
    ValueError
        If required columns are missing or data violates constraints.
    """
    logger.info("Validating VIX schema: %d rows", len(df))

    # Check required columns (except date if already indexed)
    required_cols = list(schema.required_cols)
    if isinstance(df.index, pd.DatetimeIndex):
        # Already has DatetimeIndex, don't require date column
        required_cols = [col for col in required_cols if col != schema.date_col]

    missing_cols = [col for col in required_cols if col not in df.columns]
    if missing_cols:
        raise ValueError(f"Missing required columns: {missing_cols}")

    # Validate VIX bounds
    if not df[schema.level_col].between(schema.min_vix, schema.max_vix).all():
        invalid = df[~df[schema.level_col].between(schema.min_vix, schema.max_vix)]
        logger.warning(
            "Found %d invalid VIX values outside [%.1f, %.1f]",
            len(invalid),
            schema.min_vix,
            schema.max_vix,
        )
        raise ValueError(f"VIX values outside valid range: {invalid.head()}")

    # Convert to DatetimeIndex and sort
    df = _ensure_datetime_index(df, schema.date_col)

    # Check for duplicates (remove duplicates for VIX)
    df = handle_duplicate_index(df, strategy="first", context="VIX")

    logger.debug(
        "VIX validation passed: date_range=%s to %s", df.index.min(), df.index.max()
    )
    return df


def validate_etf_schema(
    df: pd.DataFrame, schema: ETFSchema = ETFSchema()
) -> pd.DataFrame:
    """
    Validate credit ETF data against expected schema.

    Parameters
    ----------
    df : pd.DataFrame
        Raw ETF data to validate.
    schema : ETFSchema, default ETFSchema()
        Schema definition with column names and constraints.

    Returns
    -------
    pd.DataFrame
        Validated DataFrame with DatetimeIndex.

    Raises
    ------
    ValueError
        If required columns are missing or data violates constraints.
    """
    logger.info("Validating ETF schema: %d rows", len(df))

    # Check required columns (except date if already indexed)
    required_cols = list(schema.required_cols)
    if isinstance(df.index, pd.DatetimeIndex):
        # Already has DatetimeIndex, don't require date column
        required_cols = [col for col in required_cols if col != schema.date_col]

    missing_cols = [col for col in required_cols if col not in df.columns]
    if missing_cols:
        raise ValueError(f"Missing required columns: {missing_cols}")

    # Validate price bounds
    if not df[schema.spread_col].between(schema.min_price, schema.max_price).all():
        invalid = df[~df[schema.spread_col].between(schema.min_price, schema.max_price)]
        logger.warning(
            "Found %d invalid price values outside [%.1f, %.1f]",
            len(invalid),
            schema.min_price,
            schema.max_price,
        )
        raise ValueError(f"Price values outside valid range: {invalid.head()}")

    # Convert to DatetimeIndex and sort
    df = _ensure_datetime_index(df, schema.date_col)

    # Remove duplicates if present (without logging warning)
    if df.index.duplicated().any():
        n_dups = df.index.duplicated().sum()
        logger.debug("Removing %d duplicate dates for ETF", n_dups)
        df = df[~df.index.duplicated(keep="last")]

    logger.debug(
        "ETF validation passed: date_range=%s to %s", df.index.min(), df.index.max()
    )
    return df
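Of the helpers above, handle_duplicate_index is the one that can be exercised without knowing the schema dataclasses defined in aponyx.data.schemas. A minimal sketch, assuming only the signature shown in the diff; the column name, dates, and context string are made up for illustration:

# Illustrative example (not part of the package): drop a duplicated print
# before validation, keeping the most recent value for each date.
import pandas as pd

from aponyx.data.validation import handle_duplicate_index

idx = pd.DatetimeIndex(["2024-01-02", "2024-01-02", "2024-01-03"], name="date")
df = pd.DataFrame({"spread": [98.0, 99.5, 101.0]}, index=idx)

clean = handle_duplicate_index(df, strategy="last", context="CDX IG 5Y")
assert len(clean) == 2                             # one row per date
assert clean.loc["2024-01-02", "spread"] == 99.5   # last occurrence kept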
aponyx/docs/__init__.py
ADDED
@@ -0,0 +1,24 @@
"""Documentation files for aponyx framework."""

from pathlib import Path

__all__ = ["get_docs_dir"]


def get_docs_dir() -> Path:
    """
    Return the path to installed documentation directory.

    Returns
    -------
    Path
        Absolute path to docs directory.

    Examples
    --------
    >>> from aponyx.docs import get_docs_dir
    >>> docs = get_docs_dir()
    >>> list(docs.glob("*.md"))
    [PosixPath('.../python_guidelines.md'), ...]
    """
    return Path(__file__).parent
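A short usage sketch extending the docstring example above; cli_guide.md is one of the bundled documents in the file list at the top of this diff:

# Illustrative example: read a bundled document shipped inside the wheel.
from aponyx.docs import get_docs_dir

cli_guide = get_docs_dir() / "cli_guide.md"
print(cli_guide.read_text(encoding="utf-8")[:200])  # first 200 characters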