aponyx-0.1.18-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (104)
  1. aponyx/__init__.py +14 -0
  2. aponyx/backtest/__init__.py +31 -0
  3. aponyx/backtest/adapters.py +77 -0
  4. aponyx/backtest/config.py +84 -0
  5. aponyx/backtest/engine.py +560 -0
  6. aponyx/backtest/protocols.py +101 -0
  7. aponyx/backtest/registry.py +334 -0
  8. aponyx/backtest/strategy_catalog.json +50 -0
  9. aponyx/cli/__init__.py +5 -0
  10. aponyx/cli/commands/__init__.py +8 -0
  11. aponyx/cli/commands/clean.py +349 -0
  12. aponyx/cli/commands/list.py +302 -0
  13. aponyx/cli/commands/report.py +167 -0
  14. aponyx/cli/commands/run.py +377 -0
  15. aponyx/cli/main.py +125 -0
  16. aponyx/config/__init__.py +82 -0
  17. aponyx/data/__init__.py +99 -0
  18. aponyx/data/bloomberg_config.py +306 -0
  19. aponyx/data/bloomberg_instruments.json +26 -0
  20. aponyx/data/bloomberg_securities.json +42 -0
  21. aponyx/data/cache.py +294 -0
  22. aponyx/data/fetch.py +659 -0
  23. aponyx/data/fetch_registry.py +135 -0
  24. aponyx/data/loaders.py +205 -0
  25. aponyx/data/providers/__init__.py +13 -0
  26. aponyx/data/providers/bloomberg.py +383 -0
  27. aponyx/data/providers/file.py +111 -0
  28. aponyx/data/registry.py +500 -0
  29. aponyx/data/requirements.py +96 -0
  30. aponyx/data/sample_data.py +415 -0
  31. aponyx/data/schemas.py +60 -0
  32. aponyx/data/sources.py +171 -0
  33. aponyx/data/synthetic_params.json +46 -0
  34. aponyx/data/transforms.py +336 -0
  35. aponyx/data/validation.py +308 -0
  36. aponyx/docs/__init__.py +24 -0
  37. aponyx/docs/adding_data_providers.md +682 -0
  38. aponyx/docs/cdx_knowledge_base.md +455 -0
  39. aponyx/docs/cdx_overlay_strategy.md +135 -0
  40. aponyx/docs/cli_guide.md +607 -0
  41. aponyx/docs/governance_design.md +551 -0
  42. aponyx/docs/logging_design.md +251 -0
  43. aponyx/docs/performance_evaluation_design.md +265 -0
  44. aponyx/docs/python_guidelines.md +786 -0
  45. aponyx/docs/signal_registry_usage.md +369 -0
  46. aponyx/docs/signal_suitability_design.md +558 -0
  47. aponyx/docs/visualization_design.md +277 -0
  48. aponyx/evaluation/__init__.py +11 -0
  49. aponyx/evaluation/performance/__init__.py +24 -0
  50. aponyx/evaluation/performance/adapters.py +109 -0
  51. aponyx/evaluation/performance/analyzer.py +384 -0
  52. aponyx/evaluation/performance/config.py +320 -0
  53. aponyx/evaluation/performance/decomposition.py +304 -0
  54. aponyx/evaluation/performance/metrics.py +761 -0
  55. aponyx/evaluation/performance/registry.py +327 -0
  56. aponyx/evaluation/performance/report.py +541 -0
  57. aponyx/evaluation/suitability/__init__.py +67 -0
  58. aponyx/evaluation/suitability/config.py +143 -0
  59. aponyx/evaluation/suitability/evaluator.py +389 -0
  60. aponyx/evaluation/suitability/registry.py +328 -0
  61. aponyx/evaluation/suitability/report.py +398 -0
  62. aponyx/evaluation/suitability/scoring.py +367 -0
  63. aponyx/evaluation/suitability/tests.py +303 -0
  64. aponyx/examples/01_generate_synthetic_data.py +53 -0
  65. aponyx/examples/02_fetch_data_file.py +82 -0
  66. aponyx/examples/03_fetch_data_bloomberg.py +104 -0
  67. aponyx/examples/04_compute_signal.py +164 -0
  68. aponyx/examples/05_evaluate_suitability.py +224 -0
  69. aponyx/examples/06_run_backtest.py +242 -0
  70. aponyx/examples/07_analyze_performance.py +214 -0
  71. aponyx/examples/08_visualize_results.py +272 -0
  72. aponyx/main.py +7 -0
  73. aponyx/models/__init__.py +45 -0
  74. aponyx/models/config.py +83 -0
  75. aponyx/models/indicator_transformation.json +52 -0
  76. aponyx/models/indicators.py +292 -0
  77. aponyx/models/metadata.py +447 -0
  78. aponyx/models/orchestrator.py +213 -0
  79. aponyx/models/registry.py +860 -0
  80. aponyx/models/score_transformation.json +42 -0
  81. aponyx/models/signal_catalog.json +29 -0
  82. aponyx/models/signal_composer.py +513 -0
  83. aponyx/models/signal_transformation.json +29 -0
  84. aponyx/persistence/__init__.py +16 -0
  85. aponyx/persistence/json_io.py +132 -0
  86. aponyx/persistence/parquet_io.py +378 -0
  87. aponyx/py.typed +0 -0
  88. aponyx/reporting/__init__.py +10 -0
  89. aponyx/reporting/generator.py +517 -0
  90. aponyx/visualization/__init__.py +20 -0
  91. aponyx/visualization/app.py +37 -0
  92. aponyx/visualization/plots.py +309 -0
  93. aponyx/visualization/visualizer.py +242 -0
  94. aponyx/workflows/__init__.py +18 -0
  95. aponyx/workflows/concrete_steps.py +720 -0
  96. aponyx/workflows/config.py +122 -0
  97. aponyx/workflows/engine.py +279 -0
  98. aponyx/workflows/registry.py +116 -0
  99. aponyx/workflows/steps.py +180 -0
  100. aponyx-0.1.18.dist-info/METADATA +552 -0
  101. aponyx-0.1.18.dist-info/RECORD +104 -0
  102. aponyx-0.1.18.dist-info/WHEEL +4 -0
  103. aponyx-0.1.18.dist-info/entry_points.txt +2 -0
  104. aponyx-0.1.18.dist-info/licenses/LICENSE +21 -0
aponyx/data/transforms.py
@@ -0,0 +1,336 @@
+ """
+ Time series transformation functions for financial data.
+
+ Provides standardized transformations with consistent edge case handling:
+ - First difference (absolute change)
+ - Percent change (relative change)
+ - Log returns (continuous compounding)
+ - Z-score normalization (standardization)
+ - Normalized change (volatility-adjusted)
+ """
+
+ import logging
+ from typing import Literal
+
+ import numpy as np
+ import pandas as pd
+
+ logger = logging.getLogger(__name__)
+
+ TransformType = Literal[
+     "diff",
+     "pct_change",
+     "log_return",
+     "z_score",
+     "normalized_change",
+ ]
+
+
+ def apply_transform(
+     series: pd.Series,
+     transform: TransformType,
+     *,
+     window: int | None = None,
+     min_periods: int | None = None,
+     periods: int = 1,
+ ) -> pd.Series:
+     """
+     Apply time series transformation with edge case handling.
+
+     Parameters
+     ----------
+     series : pd.Series
+         Input time series with DatetimeIndex.
+     transform : TransformType
+         Type of transformation to apply:
+         - 'diff': First difference (x[t] - x[t-periods])
+         - 'pct_change': Percent change ((x[t] - x[t-periods]) / x[t-periods])
+         - 'log_return': Log return (log(x[t] / x[t-periods]))
+         - 'z_score': Rolling z-score normalization
+         - 'normalized_change': Change normalized by rolling volatility
+     window : int or None
+         Rolling window size for z_score and normalized_change.
+         Required for these transforms, ignored for others.
+     min_periods : int or None
+         Minimum observations for rolling calculations.
+         Defaults to window if not specified.
+     periods : int, default 1
+         Number of periods for differencing operations.
+
+     Returns
+     -------
+     pd.Series
+         Transformed series with same DatetimeIndex.
+         NaN values propagate from input or calculation.
+
+     Raises
+     ------
+     ValueError
+         If required parameters are missing for transform type.
+         If log_return used on series with non-positive values.
+
+     Examples
+     --------
+     >>> spreads = pd.Series([100, 105, 103, 108], index=pd.date_range('2024-01-01', periods=4))
+     >>> apply_transform(spreads, 'diff')
+     >>> apply_transform(spreads, 'pct_change')
+     >>> apply_transform(spreads, 'z_score', window=20, min_periods=10)
+
+     Notes
+     -----
+     - All transforms preserve DatetimeIndex alignment
+     - NaN handling follows pandas conventions (NaN in = NaN out)
+     - Division by zero in pct_change produces inf (pandas default)
+     - log_return validates input is positive before calculation
+     """
+     if transform == "diff":
+         return _diff(series, periods)
+     elif transform == "pct_change":
+         return _pct_change(series, periods)
+     elif transform == "log_return":
+         return _log_return(series, periods)
+     elif transform == "z_score":
+         if window is None:
+             raise ValueError("window parameter required for z_score transform")
+         return _z_score(series, window, min_periods)
+     elif transform == "normalized_change":
+         if window is None:
+             raise ValueError(
+                 "window parameter required for normalized_change transform"
+             )
+         return _normalized_change(series, window, min_periods, periods)
+     else:
+         raise ValueError(f"Unknown transform type: {transform}")
+
+
+ def _diff(series: pd.Series, periods: int = 1) -> pd.Series:
+     """
+     Compute first difference: x[t] - x[t-periods].
+
+     Parameters
+     ----------
+     series : pd.Series
+         Input time series.
+     periods : int, default 1
+         Number of periods to difference.
+
+     Returns
+     -------
+     pd.Series
+         First differences.
+
+     Notes
+     -----
+     First `periods` observations will be NaN.
+     """
+     return series.diff(periods)
+
+
+ def _pct_change(series: pd.Series, periods: int = 1) -> pd.Series:
+     """
+     Compute percent change: (x[t] - x[t-periods]) / x[t-periods].
+
+     Parameters
+     ----------
+     series : pd.Series
+         Input time series.
+     periods : int, default 1
+         Number of periods for change calculation.
+
+     Returns
+     -------
+     pd.Series
+         Percent changes.
+
+     Notes
+     -----
+     - Division by zero produces inf (pandas default behavior)
+     - First `periods` observations will be NaN
+     - Use for cross-asset comparison where scales differ
+     """
+     return series.pct_change(periods)
+
+
+ def _log_return(series: pd.Series, periods: int = 1) -> pd.Series:
+     """
+     Compute log returns: log(x[t] / x[t-periods]).
+
+     Parameters
+     ----------
+     series : pd.Series
+         Input time series. Must contain only positive values.
+     periods : int, default 1
+         Number of periods for return calculation.
+
+     Returns
+     -------
+     pd.Series
+         Log returns.
+
+     Raises
+     ------
+     ValueError
+         If series contains non-positive values (zero or negative).
+
+     Notes
+     -----
+     - Validates all non-NaN values are positive before calculation
+     - First `periods` observations will be NaN
+     - Preferred for risk calculations (continuous compounding)
+     - Approximates pct_change for small changes
+     """
+     # Check for non-positive values (excluding NaN)
+     non_positive = (series <= 0) & series.notna()
+     if non_positive.any():
+         n_invalid = non_positive.sum()
+         raise ValueError(
+             f"log_return requires positive values, found {n_invalid} non-positive entries"
+         )
+
+     return np.log(series / series.shift(periods))
+
+
+ def _z_score(
+     series: pd.Series, window: int, min_periods: int | None = None
+ ) -> pd.Series:
+     """
+     Compute rolling z-score: (x - rolling_mean) / rolling_std.
+
+     Parameters
+     ----------
+     series : pd.Series
+         Input time series.
+     window : int
+         Rolling window size for mean and std calculation.
+     min_periods : int or None
+         Minimum observations required. Defaults to window.
+
+     Returns
+     -------
+     pd.Series
+         Rolling z-scores (zero mean, unit variance within window).
+
+     Notes
+     -----
+     - Useful for regime-independent signals
+     - First `window - 1` (or `min_periods - 1`) observations will be NaN
+     - Division by zero std produces inf (pandas default)
+     - More robust than raw differences when volatility varies over time
+     """
+     if min_periods is None:
+         min_periods = window
+
+     rolling_mean = series.rolling(window=window, min_periods=min_periods).mean()
+     rolling_std = series.rolling(window=window, min_periods=min_periods).std()
+
+     return (series - rolling_mean) / rolling_std
+
+
+ def _normalized_change(
+     series: pd.Series,
+     window: int,
+     min_periods: int | None = None,
+     periods: int = 1,
+ ) -> pd.Series:
+     """
+     Compute change normalized by rolling volatility: (x[t] - x[t-periods]) / rolling_std.
+
+     Parameters
+     ----------
+     series : pd.Series
+         Input time series.
+     window : int
+         Rolling window for volatility calculation.
+     min_periods : int or None
+         Minimum observations required. Defaults to window.
+     periods : int, default 1
+         Number of periods for change calculation.
+
+     Returns
+     -------
+     pd.Series
+         Volatility-normalized changes.
+
+     Notes
+     -----
+     - Combines absolute change with volatility scaling
+     - Useful when comparing signals across different regimes
+     - Similar to z_score but uses absolute change instead of deviation from mean
+     - First `max(window, periods)` observations will be NaN
+     """
+     if min_periods is None:
+         min_periods = window
+
+     change = series.diff(periods)
+     rolling_std = series.rolling(window=window, min_periods=min_periods).std()
+
+     return change / rolling_std
+
+
+ def apply_signal_transformation(
+     series: pd.Series,
+     *,
+     scaling: float = 1.0,
+     floor: float | None = None,
+     cap: float | None = None,
+     neutral_range: tuple[float, float] | None = None,
+ ) -> pd.Series:
+     """
+     Apply trading rule transformation to score series.
+
+     Applies transformations in order: scale → floor/cap → neutral_range.
+     This converts normalized scores into bounded trading signals.
+
+     Parameters
+     ----------
+     series : pd.Series
+         Input score series (typically z-scores or normalized values).
+     scaling : float, default 1.0
+         Multiplier applied first to scale the signal.
+         Must be non-zero.
+     floor : float or None, optional
+         Lower bound after scaling. None = no lower bound (-inf).
+     cap : float or None, optional
+         Upper bound after scaling. None = no upper bound (+inf).
+     neutral_range : tuple[float, float] or None, optional
+         Values within [low, high] set to zero (no position signal).
+         None = no neutral zone.
+
+     Returns
+     -------
+     pd.Series
+         Transformed signal with same DatetimeIndex.
+         NaN values are preserved from input.
+
+     Examples
+     --------
+     >>> scores = pd.Series([2.5, -1.8, 0.3, -0.1], index=pd.date_range('2024-01-01', periods=4))
+     >>> apply_signal_transformation(scores, floor=-1.5, cap=1.5)
+     >>> apply_signal_transformation(scores, scaling=2.0, floor=-2.0, cap=2.0)
+     >>> apply_signal_transformation(scores, floor=-1.5, cap=1.5, neutral_range=(-0.25, 0.25))
+
+     Notes
+     -----
+     - Transformation order is fixed: scale → floor/cap → neutral_range
+     - NaN values propagate through all operations
+     - Neutral range is applied AFTER floor/cap bounds
+     - Use scaling=1.0 for no scaling (passthrough scaling factor)
+     """
+     # Apply scaling first
+     result = series * scaling
+
+     # Apply floor/cap bounds
+     if floor is not None:
+         result = result.clip(lower=floor)
+     if cap is not None:
+         result = result.clip(upper=cap)
+
+     # Apply neutral range (zero out values within range)
+     if neutral_range is not None:
+         low, high = neutral_range
+         # Set values within neutral range to 0.0, keeping values outside the range
+         # Preserve NaN values by checking explicitly
+         mask = (result >= low) & (result <= high)  # Values inside neutral range
+         result = result.mask(mask, 0.0)  # Set those to 0.0, NaN values preserved
+
+     return result
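As a usage sketch only: the snippet below composes apply_transform and apply_signal_transformation from this module into a score-to-signal pipeline. The synthetic spread series, window sizes, and neutral band are illustrative assumptions, not values taken from the package.

import numpy as np
import pandas as pd

from aponyx.data.transforms import apply_signal_transformation, apply_transform

# Hypothetical daily spread series (synthetic random walk, for illustration)
dates = pd.date_range("2024-01-01", periods=250, freq="B")
rng = np.random.default_rng(0)
spreads = pd.Series(60 + rng.normal(0, 1, len(dates)).cumsum(), index=dates)

# Normalize the level into a rolling z-score
score = apply_transform(spreads, "z_score", window=60, min_periods=20)

# Convert the score into a bounded signal: scale, clip to [-2, 2],
# then zero out values inside the neutral band
signal = apply_signal_transformation(
    score, scaling=1.0, floor=-2.0, cap=2.0, neutral_range=(-0.25, 0.25)
)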
aponyx/data/validation.py
@@ -0,0 +1,308 @@
+ """
+ Data validation utilities for market data quality checks.
+
+ Validates schema compliance, data types, and business logic constraints.
+ """
+
+ import logging
+
+ import pandas as pd
+
+ from .schemas import CDXSchema, VIXSchema, ETFSchema
+
+ logger = logging.getLogger(__name__)
+
+
+ def _ensure_datetime_index(df: pd.DataFrame, date_col: str) -> pd.DataFrame:
+     """
+     Convert DataFrame to use DatetimeIndex if not already.
+
+     Parameters
+     ----------
+     df : pd.DataFrame
+         DataFrame to process.
+     date_col : str
+         Name of date column to use as index.
+
+     Returns
+     -------
+     pd.DataFrame
+         DataFrame with DatetimeIndex named 'date', sorted by date.
+     """
+     if not isinstance(df.index, pd.DatetimeIndex):
+         df = df.copy()
+         df[date_col] = pd.to_datetime(df[date_col])
+         df = df.set_index(date_col)
+
+     # Ensure index is named 'date'
+     df.index.name = "date"
+
+     return df.sort_index()
+
+
+ def _check_duplicate_dates(df: pd.DataFrame, context: str = "") -> None:
+     """
+     Check for and log duplicate dates in DataFrame index.
+
+     Parameters
+     ----------
+     df : pd.DataFrame
+         DataFrame with DatetimeIndex to check.
+     context : str, optional
+         Additional context for log message (e.g., ticker name).
+     """
+     if df.index.duplicated().any():
+         n_dups = df.index.duplicated().sum()
+         if context:
+             logger.warning("Found %d duplicate dates for %s", n_dups, context)
+         else:
+             logger.warning("Found %d duplicate dates", n_dups)
+
+
+ def handle_duplicate_index(
+     df: pd.DataFrame,
+     strategy: str = "last",
+     context: str = "",
+ ) -> pd.DataFrame:
+     """
+     Remove duplicate index entries with logging.
+
+     Parameters
+     ----------
+     df : pd.DataFrame
+         DataFrame with potential duplicate indices.
+     strategy : str, default "last"
+         Deduplication strategy:
+         - "first": Keep first occurrence
+         - "last": Keep last occurrence
+         - "raise": Raise ValueError if duplicates found
+     context : str, optional
+         Context for logging (e.g., "CDX IG 5Y").
+
+     Returns
+     -------
+     pd.DataFrame
+         DataFrame with duplicates removed.
+
+     Raises
+     ------
+     ValueError
+         If strategy="raise" and duplicates are found, or if strategy is invalid.
+
+     Examples
+     --------
+     >>> df = pd.DataFrame({"value": [1, 2, 3]}, index=pd.DatetimeIndex(["2024-01-01", "2024-01-01", "2024-01-02"]))
+     >>> clean_df = handle_duplicate_index(df, strategy="last")
+     >>> clean_df = handle_duplicate_index(df, strategy="raise")  # Raises ValueError
+     """
+     if strategy not in ("first", "last", "raise"):
+         raise ValueError(
+             f"Invalid strategy '{strategy}'. Must be 'first', 'last', or 'raise'"
+         )
+
+     if not df.index.duplicated().any():
+         return df
+
+     # Log duplicates
+     _check_duplicate_dates(df, context)
+
+     # Handle based on strategy
+     if strategy == "raise":
+         n_dups = df.index.duplicated().sum()
+         raise ValueError(
+             f"Found {n_dups} duplicate index entries"
+             + (f" for {context}" if context else "")
+         )
+
+     # Remove duplicates
+     df_clean = df[~df.index.duplicated(keep=strategy)]
+     logger.debug("Removed duplicates using strategy='%s'", strategy)
+     return df_clean
+
+
+ def validate_cdx_schema(
+     df: pd.DataFrame, schema: CDXSchema = CDXSchema()
+ ) -> pd.DataFrame:
+     """
+     Validate CDX index data against expected schema.
+
+     Parameters
+     ----------
+     df : pd.DataFrame
+         Raw CDX data to validate.
+     schema : CDXSchema, default CDXSchema()
+         Schema definition with column names and constraints.
+
+     Returns
+     -------
+     pd.DataFrame
+         Validated DataFrame with DatetimeIndex.
+
+     Raises
+     ------
+     ValueError
+         If required columns are missing or data violates constraints.
+
+     Notes
+     -----
+     - Converts date column to DatetimeIndex
+     - Validates spread values are within bounds
+     - Checks for duplicate dates per index
+     """
+     logger.info("Validating CDX schema: %d rows", len(df))
+
+     # Check required columns (except date if already indexed)
+     required_cols = list(schema.required_cols)
+     if isinstance(df.index, pd.DatetimeIndex):
+         # Already has DatetimeIndex, don't require date column
+         required_cols = [col for col in required_cols if col != schema.date_col]
+
+     missing_cols = [col for col in required_cols if col not in df.columns]
+     if missing_cols:
+         raise ValueError(f"Missing required columns: {missing_cols}")
+
+     # Validate spread bounds
+     if not df[schema.spread_col].between(schema.min_spread, schema.max_spread).all():
+         invalid = df[
+             ~df[schema.spread_col].between(schema.min_spread, schema.max_spread)
+         ]
+         logger.warning(
+             "Found %d invalid spread values outside [%.1f, %.1f]",
+             len(invalid),
+             schema.min_spread,
+             schema.max_spread,
+         )
+         raise ValueError(f"Spread values outside valid range: {invalid.head()}")
+
+     # Convert to DatetimeIndex and sort
+     df = _ensure_datetime_index(df, schema.date_col)
+
+     # Remove duplicates if present (without logging warning)
+     if df.index.duplicated().any():
+         n_dups = df.index.duplicated().sum()
+         logger.debug("Removing %d duplicate dates for CDX", n_dups)
+         df = df[~df.index.duplicated(keep="last")]
+
+     logger.debug(
+         "CDX validation passed: date_range=%s to %s", df.index.min(), df.index.max()
+     )
+     return df
+
+
+ def validate_vix_schema(
+     df: pd.DataFrame, schema: VIXSchema = VIXSchema()
+ ) -> pd.DataFrame:
+     """
+     Validate VIX volatility data against expected schema.
+
+     Parameters
+     ----------
+     df : pd.DataFrame
+         Raw VIX data to validate.
+     schema : VIXSchema, default VIXSchema()
+         Schema definition with column names and constraints.
+
+     Returns
+     -------
+     pd.DataFrame
+         Validated DataFrame with DatetimeIndex.
+
+     Raises
+     ------
+     ValueError
+         If required columns are missing or data violates constraints.
+     """
+     logger.info("Validating VIX schema: %d rows", len(df))
+
+     # Check required columns (except date if already indexed)
+     required_cols = list(schema.required_cols)
+     if isinstance(df.index, pd.DatetimeIndex):
+         # Already has DatetimeIndex, don't require date column
+         required_cols = [col for col in required_cols if col != schema.date_col]
+
+     missing_cols = [col for col in required_cols if col not in df.columns]
+     if missing_cols:
+         raise ValueError(f"Missing required columns: {missing_cols}")
+
+     # Validate VIX bounds
+     if not df[schema.level_col].between(schema.min_vix, schema.max_vix).all():
+         invalid = df[~df[schema.level_col].between(schema.min_vix, schema.max_vix)]
+         logger.warning(
+             "Found %d invalid VIX values outside [%.1f, %.1f]",
+             len(invalid),
+             schema.min_vix,
+             schema.max_vix,
+         )
+         raise ValueError(f"VIX values outside valid range: {invalid.head()}")
+
+     # Convert to DatetimeIndex and sort
+     df = _ensure_datetime_index(df, schema.date_col)
+
+     # Check for duplicates (remove duplicates for VIX)
+     df = handle_duplicate_index(df, strategy="first", context="VIX")
+
+     logger.debug(
+         "VIX validation passed: date_range=%s to %s", df.index.min(), df.index.max()
+     )
+     return df
+
+
+ def validate_etf_schema(
+     df: pd.DataFrame, schema: ETFSchema = ETFSchema()
+ ) -> pd.DataFrame:
+     """
+     Validate credit ETF data against expected schema.
+
+     Parameters
+     ----------
+     df : pd.DataFrame
+         Raw ETF data to validate.
+     schema : ETFSchema, default ETFSchema()
+         Schema definition with column names and constraints.
+
+     Returns
+     -------
+     pd.DataFrame
+         Validated DataFrame with DatetimeIndex.
+
+     Raises
+     ------
+     ValueError
+         If required columns are missing or data violates constraints.
+     """
+     logger.info("Validating ETF schema: %d rows", len(df))
+
+     # Check required columns (except date if already indexed)
+     required_cols = list(schema.required_cols)
+     if isinstance(df.index, pd.DatetimeIndex):
+         # Already has DatetimeIndex, don't require date column
+         required_cols = [col for col in required_cols if col != schema.date_col]
+
+     missing_cols = [col for col in required_cols if col not in df.columns]
+     if missing_cols:
+         raise ValueError(f"Missing required columns: {missing_cols}")
+
+     # Validate price bounds
+     if not df[schema.spread_col].between(schema.min_price, schema.max_price).all():
+         invalid = df[~df[schema.spread_col].between(schema.min_price, schema.max_price)]
+         logger.warning(
+             "Found %d invalid price values outside [%.1f, %.1f]",
+             len(invalid),
+             schema.min_price,
+             schema.max_price,
+         )
+         raise ValueError(f"Price values outside valid range: {invalid.head()}")
+
+     # Convert to DatetimeIndex and sort
+     df = _ensure_datetime_index(df, schema.date_col)
+
+     # Remove duplicates if present (without logging warning)
+     if df.index.duplicated().any():
+         n_dups = df.index.duplicated().sum()
+         logger.debug("Removing %d duplicate dates for ETF", n_dups)
+         df = df[~df.index.duplicated(keep="last")]
+
+     logger.debug(
+         "ETF validation passed: date_range=%s to %s", df.index.min(), df.index.max()
+     )
+     return df
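A minimal sketch of the validation flow: handle_duplicate_index is self-contained, while the validate_cdx_schema call assumes the default CDXSchema uses a "date" column and a "spread" column, which this diff does not show; treat those names and the sample values as illustrative.

import pandas as pd

from aponyx.data.validation import handle_duplicate_index, validate_cdx_schema

# Duplicate 2024-01-02 entries; strategy="last" keeps the 62.7 row
dup = pd.DataFrame(
    {"spread": [62.5, 62.7, 63.1]},
    index=pd.DatetimeIndex(["2024-01-02", "2024-01-02", "2024-01-03"], name="date"),
)
deduped = handle_duplicate_index(dup, strategy="last", context="CDX IG 5Y")

# Assumed column names for the default CDXSchema; returns a sorted,
# DatetimeIndex-ed frame after bounds and duplicate checks
raw = dup.reset_index()
validated = validate_cdx_schema(raw)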
aponyx/docs/__init__.py
@@ -0,0 +1,24 @@
+ """Documentation files for aponyx framework."""
+
+ from pathlib import Path
+
+ __all__ = ["get_docs_dir"]
+
+
+ def get_docs_dir() -> Path:
+     """
+     Return the path to installed documentation directory.
+
+     Returns
+     -------
+     Path
+         Absolute path to docs directory.
+
+     Examples
+     --------
+     >>> from aponyx.docs import get_docs_dir
+     >>> docs = get_docs_dir()
+     >>> list(docs.glob("*.md"))
+     [PosixPath('.../python_guidelines.md'), ...]
+     """
+     return Path(__file__).parent
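As a usage sketch, the bundled guides listed in the file changes above can be read straight from the installed package; cli_guide.md is one of the files shipped in aponyx/docs.

from aponyx.docs import get_docs_dir

# Read one of the bundled markdown guides (cli_guide.md appears in the file list above)
guide_text = (get_docs_dir() / "cli_guide.md").read_text(encoding="utf-8")
print(guide_text.splitlines()[0])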