aponyx-0.1.0-py3-none-any.whl

@@ -0,0 +1,86 @@
+ """
+ File-based data provider for Parquet and CSV files.
+
+ Handles local file loading with automatic format detection.
+ """
+
+ import logging
+ from pathlib import Path
+ from typing import Any
+
+ import pandas as pd
+
+ from ...persistence.parquet_io import load_parquet
+
+ logger = logging.getLogger(__name__)
+
+
+ def fetch_from_file(
+     file_path: str | Path,
+     instrument: str,
+     start_date: str | None = None,
+     end_date: str | None = None,
+     **params: Any,
+ ) -> pd.DataFrame:
+     """
+     Fetch data from local Parquet or CSV file.
+
+     Parameters
+     ----------
+     file_path : str or Path
+         Path to data file.
+     instrument : str
+         Instrument identifier (for logging).
+     start_date : str or None
+         Optional start date filter (ISO format).
+     end_date : str or None
+         Optional end date filter (ISO format).
+     **params : Any
+         Additional parameters (unused for file provider).
+
+     Returns
+     -------
+     pd.DataFrame
+         Raw data loaded from file (validation happens in fetch layer).
+
+     Raises
+     ------
+     ValueError
+         If file format is not supported.
+     FileNotFoundError
+         If file does not exist.
+
+     Notes
+     -----
+     - Automatically detects Parquet vs CSV from file extension
+     - Date filtering applied after loading (files assumed small enough)
+     - Does not perform schema validation (handled by fetch layer)
+     """
+     file_path = Path(file_path)
+     logger.info("Fetching %s from file: %s", instrument, file_path)
+
+     if not file_path.exists():
+         raise FileNotFoundError(f"Data file not found: {file_path}")
+
+     # Load based on file type
+     if file_path.suffix == ".parquet":
+         df = load_parquet(file_path)
+     elif file_path.suffix == ".csv":
+         df = pd.read_csv(file_path)
+     else:
+         raise ValueError(f"Unsupported file format: {file_path.suffix}")
+
+     # Apply date filtering if requested
+     if isinstance(df.index, pd.DatetimeIndex):
+         if start_date is not None:
+             start = pd.Timestamp(start_date)
+             df = df[df.index >= start]
+             logger.debug("Filtered to start_date >= %s: %d rows", start_date, len(df))
+
+         if end_date is not None:
+             end = pd.Timestamp(end_date)
+             df = df[df.index <= end]
+             logger.debug("Filtered to end_date <= %s: %d rows", end_date, len(df))
+
+     logger.info("Loaded %d rows from file", len(df))
+     return df
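
A minimal usage sketch of fetch_from_file, assuming the function is importable from the package (this hunk does not show its module path) and that the file below exists. Note that the date filter only applies when the loaded frame already carries a DatetimeIndex, which pd.read_csv does not produce by default:

    # Illustrative path and instrument label; not shipped with the package.
    df = fetch_from_file(
        "data/raw/vix.parquet",
        instrument="VIX",
        start_date="2024-01-01",
        end_date="2024-06-30",
    )
    print(len(df), df.columns.tolist())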
@@ -0,0 +1,359 @@
+ """
+ Synthetic data generation for testing and demonstrations.
+
+ Generates realistic market data for CDX, VIX, and ETF instruments
+ with configurable volatility, correlation, and trend parameters.
+ """
+
+ import logging
+ from pathlib import Path
+
+ import numpy as np
+ import pandas as pd
+
+ from ..persistence.parquet_io import save_parquet
+ from .sources import FileSource
+
+ logger = logging.getLogger(__name__)
+
+
+ def generate_cdx_sample(
+     start_date: str = "2024-01-01",
+     periods: int = 252,
+     index_name: str = "CDX_IG",
+     tenor: str = "5Y",
+     base_spread: float = 100.0,
+     volatility: float = 5.0,
+     seed: int = 42,
+ ) -> pd.DataFrame:
+     """
+     Generate synthetic CDX spread data.
+
+     Parameters
+     ----------
+     start_date : str, default "2024-01-01"
+         Start date for time series.
+     periods : int, default 252
+         Number of daily observations (trading days).
+     index_name : str, default "CDX_IG"
+         Index identifier (CDX_IG, CDX_HY, CDX_XO).
+     tenor : str, default "5Y"
+         Tenor string (5Y, 10Y).
+     base_spread : float, default 100.0
+         Starting spread level in basis points.
+     volatility : float, default 5.0
+         Daily spread volatility in basis points.
+     seed : int, default 42
+         Random seed for reproducibility.
+
+     Returns
+     -------
+     pd.DataFrame
+         CDX data with columns: date, spread, index, tenor, series
+
+     Notes
+     -----
+     - Uses a mean-reverting random walk with additive Gaussian shocks
+     - Spreads constrained to positive values
+     - Realistic credit market dynamics
+     """
+     logger.info(
+         "Generating CDX sample: index=%s, tenor=%s, periods=%d",
+         index_name,
+         tenor,
+         periods,
+     )
+
+     rng = np.random.default_rng(seed)
+     dates = pd.date_range(start_date, periods=periods, freq="D")
+
+     # Mean-reverting spread dynamics
+     spread = [base_spread]
+     mean_reversion_speed = 0.1
+     mean_level = base_spread
+
+     for _ in range(periods - 1):
+         drift = mean_reversion_speed * (mean_level - spread[-1])
+         shock = rng.normal(0, volatility)
+         new_spread = max(1.0, spread[-1] + drift + shock)
+         spread.append(new_spread)
+
+     df = pd.DataFrame(
+         {
+             "date": dates,
+             "spread": spread,
+             "index": [f"{index_name}_{tenor}"] * periods,
+             "tenor": [tenor] * periods,
+             "series": [42] * periods,
+         }
+     )
+
+     logger.debug("Generated CDX sample: mean_spread=%.2f", df["spread"].mean())
+     return df
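
A quick sanity-check sketch for the CDX generator; the column list and the 1 bp floor follow directly from the code above:

    cdx = generate_cdx_sample(periods=252, base_spread=100.0, seed=42)
    assert list(cdx.columns) == ["date", "spread", "index", "tenor", "series"]
    assert (cdx["spread"] >= 1.0).all()  # floored at 1 bp by max(1.0, ...)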
+
+
+ def generate_vix_sample(
+     start_date: str = "2024-01-01",
+     periods: int = 252,
+     base_vix: float = 15.0,
+     volatility: float = 2.0,
+     seed: int = 42,
+ ) -> pd.DataFrame:
+     """
+     Generate synthetic VIX volatility data.
+
+     Parameters
+     ----------
+     start_date : str, default "2024-01-01"
+         Start date for time series.
+     periods : int, default 252
+         Number of daily observations.
+     base_vix : float, default 15.0
+         Starting VIX level.
+     volatility : float, default 2.0
+         Volatility of volatility (vol of vol).
+     seed : int, default 42
+         Random seed for reproducibility.
+
+     Returns
+     -------
+     pd.DataFrame
+         VIX data with columns: date, close
+
+     Notes
+     -----
+     - Uses mean-reverting process with occasional spikes
+     - VIX constrained to positive values
+     """
+     logger.info("Generating VIX sample: periods=%d", periods)
+
+     rng = np.random.default_rng(seed)
+     dates = pd.date_range(start_date, periods=periods, freq="D")
+
+     # Mean-reverting VIX with spike potential
+     vix_close = [base_vix]
+     mean_reversion_speed = 0.15
+     mean_level = base_vix
+
+     for _ in range(periods - 1):
+         # Occasional spike (5% probability)
+         if rng.random() < 0.05:
+             spike = rng.uniform(5, 15)
+         else:
+             spike = 0
+
+         drift = mean_reversion_speed * (mean_level - vix_close[-1])
+         shock = rng.normal(0, volatility)
+         new_vix = max(8.0, vix_close[-1] + drift + shock + spike)
+         vix_close.append(new_vix)
+
+     df = pd.DataFrame(
+         {
+             "date": dates,
+             "close": vix_close,
+         }
+     )
+
+     logger.debug("Generated VIX sample: mean=%.2f", df["close"].mean())
+     return df
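
An analogous check for the VIX generator, reflecting the hard floor and spike logic coded above:

    vix = generate_vix_sample(periods=252, base_vix=15.0, seed=42)
    assert list(vix.columns) == ["date", "close"]
    assert vix["close"].min() >= 8.0  # hard floor applied in the loop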
+
+
+ def generate_etf_sample(
+     start_date: str = "2024-01-01",
+     periods: int = 252,
+     ticker: str = "HYG",
+     base_price: float = 80.0,
+     volatility: float = 0.5,
+     seed: int = 42,
+ ) -> pd.DataFrame:
+     """
+     Generate synthetic credit ETF price data.
+
+     Parameters
+     ----------
+     start_date : str, default "2024-01-01"
+         Start date for time series.
+     periods : int, default 252
+         Number of daily observations.
+     ticker : str, default "HYG"
+         ETF ticker symbol (HYG, LQD).
+     base_price : float, default 80.0
+         Starting price.
+     volatility : float, default 0.5
+         Daily price volatility.
+     seed : int, default 42
+         Random seed for reproducibility.
+
+     Returns
+     -------
+     pd.DataFrame
+         ETF data with columns: date, close, ticker
+
+     Notes
+     -----
+     - Uses geometric Brownian motion
+     - Prices constrained to positive values
+     """
+     logger.info("Generating ETF sample: ticker=%s, periods=%d", ticker, periods)
+
+     rng = np.random.default_rng(seed)
+     dates = pd.date_range(start_date, periods=periods, freq="D")
+
+     # Geometric Brownian motion for prices
+     returns = rng.normal(0.0001, volatility / base_price, periods)
+     price = base_price * np.exp(np.cumsum(returns))
+
+     df = pd.DataFrame(
+         {
+             "date": dates,
+             "close": price,
+             "ticker": [ticker] * periods,
+         }
+     )
+
+     logger.debug("Generated ETF sample: mean_price=%.2f", df["close"].mean())
+     return df
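
And for the ETF generator, whose prices are the exponential of cumulative returns and therefore strictly positive:

    hyg = generate_etf_sample(ticker="HYG", base_price=80.0, seed=42)
    assert list(hyg.columns) == ["date", "close", "ticker"]
    assert (hyg["close"] > 0).all()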
+
+
+ def generate_full_sample_dataset(
+     output_dir: str = "data/raw",
+     start_date: str = "2023-01-01",
+     periods: int = 252,
+     seed: int = 42,
+ ) -> dict[str, str]:
+     """
+     Generate complete sample dataset for testing.
+
+     Parameters
+     ----------
+     output_dir : str, default "data/raw"
+         Directory to save Parquet files.
+     start_date : str, default "2023-01-01"
+         Start date for all time series.
+     periods : int, default 252
+         Number of daily observations.
+     seed : int, default 42
+         Random seed for reproducibility.
+
+     Returns
+     -------
+     dict[str, str]
+         Dictionary mapping data type to file path.
+
+     Notes
+     -----
+     Generates and saves:
+     - CDX IG 5Y spreads
+     - CDX HY 5Y spreads
+     - VIX volatility
+     - HYG and LQD ETF prices
+     """
+     from pathlib import Path
+
+     logger.info("Generating full sample dataset: output_dir=%s", output_dir)
+
+     output_path = Path(output_dir)
+     output_path.mkdir(parents=True, exist_ok=True)
+
+     # Generate CDX data (multiple indices)
+     cdx_ig = generate_cdx_sample(
+         start_date=start_date,
+         periods=periods,
+         index_name="CDX_IG",
+         tenor="5Y",
+         base_spread=70.0,
+         volatility=3.0,
+         seed=seed,
+     )
+
+     cdx_hy = generate_cdx_sample(
+         start_date=start_date,
+         periods=periods,
+         index_name="CDX_HY",
+         tenor="5Y",
+         base_spread=350.0,
+         volatility=15.0,
+         seed=seed + 1,
+     )
+
+     cdx_all = pd.concat([cdx_ig, cdx_hy], ignore_index=True)
+     cdx_path = output_path / "cdx_spreads.parquet"
+     save_parquet(cdx_all, cdx_path)
+
+     # Generate VIX data
+     vix = generate_vix_sample(
+         start_date=start_date, periods=periods, base_vix=16.0, seed=seed + 2
+     )
+     vix_path = output_path / "vix.parquet"
+     save_parquet(vix, vix_path)
+
+     # Generate ETF data (multiple tickers)
+     hyg = generate_etf_sample(
+         start_date=start_date,
+         periods=periods,
+         ticker="HYG",
+         base_price=75.0,
+         volatility=0.6,
+         seed=seed + 3,
+     )
+
+     lqd = generate_etf_sample(
+         start_date=start_date,
+         periods=periods,
+         ticker="LQD",
+         base_price=110.0,
+         volatility=0.4,
+         seed=seed + 4,
+     )
+
+     etf_all = pd.concat([hyg, lqd], ignore_index=True)
+     etf_path = output_path / "etf_prices.parquet"
+     save_parquet(etf_all, etf_path)
+
+     file_paths = {
+         "cdx": str(cdx_path),
+         "vix": str(vix_path),
+         "etf": str(etf_path),
+     }
+
+     logger.info("Sample dataset generated: %s", file_paths)
+     return file_paths
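
Driving the full generator writes three Parquet files and returns their paths as strings; the output directory in this sketch is illustrative:

    paths = generate_full_sample_dataset(output_dir="/tmp/aponyx_sample", seed=42)
    # Keys and file names come from the function body above:
    # {"cdx": ".../cdx_spreads.parquet", "vix": ".../vix.parquet", "etf": ".../etf_prices.parquet"}
    print(paths["cdx"])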
+
+
+ def generate_full_sample_sources(
+     output_dir: str = "data/raw",
+     start_date: str = "2023-01-01",
+     periods: int = 252,
+     seed: int = 42,
+ ) -> dict[str, FileSource]:
+     """
+     Generate complete sample dataset and return FileSource configs.
+
+     Parameters
+     ----------
+     output_dir : str, default "data/raw"
+         Directory to save Parquet files.
+     start_date : str, default "2023-01-01"
+         Start date for all time series.
+     periods : int, default 252
+         Number of daily observations.
+     seed : int, default 42
+         Random seed for reproducibility.
+
+     Returns
+     -------
+     dict[str, FileSource]
+         Dictionary mapping data type to FileSource configuration.
+
+     Notes
+     -----
+     Convenience wrapper around generate_full_sample_dataset that returns
+     FileSource objects ready to use with fetch functions.
+     """
+     file_paths = generate_full_sample_dataset(output_dir, start_date, periods, seed)
+
+     return {
+         "cdx": FileSource(Path(file_paths["cdx"])),
+         "vix": FileSource(Path(file_paths["vix"])),
+         "etf": FileSource(Path(file_paths["etf"])),
+     }
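
The wrapper returns ready-to-use FileSource objects, which pair naturally with the file provider from the first hunk; the combined sketch below assumes both functions are importable from the package:

    sources = generate_full_sample_sources(output_dir="/tmp/aponyx_sample", seed=42)
    cdx_df = fetch_from_file(sources["cdx"].path, instrument="CDX_IG_5Y")
    print(len(cdx_df))  # 504 rows: CDX_IG and CDX_HY, 252 days each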
aponyx/data/schemas.py ADDED
@@ -0,0 +1,65 @@
+ """
+ Data schemas and validation rules for market data.
+
+ Defines expected column names, types, and constraints for each data source.
+ """
+
+ from dataclasses import dataclass
+ from typing import Any
+
+
+ @dataclass(frozen=True)
+ class CDXSchema:
+     """Schema for CDX index data."""
+
+     date_col: str = "date"
+     spread_col: str = "spread"
+     index_col: str = "index"  # e.g., "CDX_IG_5Y"
+     tenor_col: str = "tenor"  # e.g., "5Y", "10Y"
+     series_col: str = "series"  # CDX series number
+
+     required_cols: tuple[str, ...] = ("date", "spread", "index")
+     optional_cols: tuple[str, ...] = ("tenor", "series")
+
+     # Validation constraints
+     min_spread: float = 0.0  # Spreads in basis points
+     max_spread: float = 10000.0  # 100% spread cap
+
+
+ @dataclass(frozen=True)
+ class VIXSchema:
+     """Schema for VIX volatility index data."""
+
+     date_col: str = "date"
+     close_col: str = "close"
+
+     required_cols: tuple[str, ...] = ("date", "close")
+     optional_cols: tuple[str, ...] = ()
+
+     # Validation constraints
+     min_vix: float = 0.0
+     max_vix: float = 200.0  # Extreme stress cap
+
+
+ @dataclass(frozen=True)
+ class ETFSchema:
+     """Schema for credit ETF data (HYG, LQD)."""
+
+     date_col: str = "date"
+     close_col: str = "close"
+     ticker_col: str = "ticker"
+
+     required_cols: tuple[str, ...] = ("date", "close", "ticker")
+     optional_cols: tuple[str, ...] = ()
+
+     # Validation constraints
+     min_price: float = 0.0
+     max_price: float = 10000.0  # Sanity check
+
+
+ # Schema registry for runtime lookup
+ SCHEMAS: dict[str, Any] = {
+     "cdx": CDXSchema(),
+     "vix": VIXSchema(),
+     "etf": ETFSchema(),
+ }
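
These schema objects only carry column names and bounds; the validation code itself is not part of this diff. The following is a sketch of how a fetch layer might consume the registry (validate_columns and vix_df are hypothetical):

    import pandas as pd
    from aponyx.data.schemas import SCHEMAS

    def validate_columns(df: pd.DataFrame, data_type: str) -> None:
        # Hypothetical helper: check required columns against the registry.
        schema = SCHEMAS[data_type]
        missing = [c for c in schema.required_cols if c not in df.columns]
        if missing:
            raise ValueError(f"{data_type} data missing columns: {missing}")

    vix_df = pd.DataFrame({"date": pd.to_datetime(["2024-01-02"]), "close": [14.3]})
    validate_columns(vix_df, "vix")  # passes; dropping "close" would raise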
aponyx/data/sources.py ADDED
@@ -0,0 +1,135 @@
+ """
+ Data source configuration for pluggable data providers.
+
+ Defines source types (file, Bloomberg, API) and factory for provider resolution.
+ """
+
+ import logging
+ from dataclasses import dataclass
+ from pathlib import Path
+ from typing import Protocol, Any
+
+ import pandas as pd
+
+ logger = logging.getLogger(__name__)
+
+
+ @dataclass(frozen=True)
+ class FileSource:
+     """
+     File-based data source (Parquet or CSV).
+
+     Attributes
+     ----------
+     path : Path
+         Path to the data file.
+     """
+
+     path: Path
+
+     def __post_init__(self) -> None:
+         """Convert path to Path object if string provided."""
+         if isinstance(self.path, str):
+             object.__setattr__(self, "path", Path(self.path))
+
+
+ @dataclass(frozen=True)
+ class BloombergSource:
+     """
+     Bloomberg Terminal data source.
+
+     Notes
+     -----
+     Requires active Bloomberg Terminal session.
+     Connection is handled automatically by xbbg wrapper.
+     """
+
+     pass
+
+
+ @dataclass(frozen=True)
+ class APISource:
+     """
+     Generic REST API data source.
+
+     Attributes
+     ----------
+     endpoint : str
+         API endpoint URL.
+     params : dict[str, Any]
+         Additional request parameters.
+     """
+
+     endpoint: str
+     params: dict[str, Any] | None = None
+
+
+ # Union type for all data sources
+ DataSource = FileSource | BloombergSource | APISource
+
+
+ class DataProvider(Protocol):
+     """
+     Protocol for data provider implementations.
+
+     All providers must implement fetch method with standardized signature.
+     """
+
+     def fetch(
+         self,
+         instrument: str,
+         start_date: str | None = None,
+         end_date: str | None = None,
+         **params: Any,
+     ) -> pd.DataFrame:
+         """
+         Fetch data for specified instrument and date range.
+
+         Parameters
+         ----------
+         instrument : str
+             Instrument identifier (e.g., 'CDX.NA.IG.5Y', 'VIX', 'HYG').
+         start_date : str or None
+             Start date in ISO format (YYYY-MM-DD).
+         end_date : str or None
+             End date in ISO format (YYYY-MM-DD).
+         **params : Any
+             Provider-specific parameters.
+
+         Returns
+         -------
+         pd.DataFrame
+             Data with DatetimeIndex.
+         """
+         ...
+
+
+ def resolve_provider(source: DataSource) -> str:
+     """
+     Resolve data source to provider type identifier.
+
+     Parameters
+     ----------
+     source : DataSource
+         Data source configuration.
+
+     Returns
+     -------
+     str
+         Provider type: 'file', 'bloomberg', or 'api'.
+
+     Examples
+     --------
+     >>> resolve_provider(FileSource("data.parquet"))
+     'file'
+     >>> resolve_provider(BloombergSource())
+     'bloomberg'
+     """
+     if isinstance(source, FileSource):
+         return "file"
+     elif isinstance(source, BloombergSource):
+         return "bloomberg"
+     elif isinstance(source, APISource):
+         return "api"
+     else:
+         raise ValueError(f"Unknown source type: {type(source)}")
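
The DataProvider protocol plus resolve_provider suggest a simple dispatch pattern. The sketch below is an assumption about how a caller might wire a FileSource to the file provider; fetch_data is hypothetical, and the import of fetch_from_file is assumed since the fetch layer is not part of this diff:

    import pandas as pd
    from aponyx.data.sources import DataSource, FileSource, resolve_provider

    def fetch_data(source: DataSource, instrument: str, **kwargs) -> pd.DataFrame:
        # Hypothetical dispatcher built on resolve_provider; only the file
        # branch is wired here, to the fetch_from_file provider shown earlier.
        provider = resolve_provider(source)
        if provider == "file":
            return fetch_from_file(source.path, instrument, **kwargs)
        raise NotImplementedError(f"No provider wired for '{provider}'")

    df = fetch_data(FileSource("data/raw/vix.parquet"), "VIX", start_date="2023-06-01")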