aponyx-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of aponyx might be problematic.
- aponyx/__init__.py +12 -0
- aponyx/backtest/__init__.py +29 -0
- aponyx/backtest/adapters.py +134 -0
- aponyx/backtest/config.py +59 -0
- aponyx/backtest/engine.py +256 -0
- aponyx/backtest/metrics.py +216 -0
- aponyx/backtest/protocols.py +101 -0
- aponyx/config/__init__.py +77 -0
- aponyx/data/__init__.py +31 -0
- aponyx/data/cache.py +242 -0
- aponyx/data/fetch.py +410 -0
- aponyx/data/providers/__init__.py +13 -0
- aponyx/data/providers/bloomberg.py +269 -0
- aponyx/data/providers/file.py +86 -0
- aponyx/data/sample_data.py +359 -0
- aponyx/data/schemas.py +65 -0
- aponyx/data/sources.py +135 -0
- aponyx/data/validation.py +231 -0
- aponyx/main.py +7 -0
- aponyx/models/__init__.py +24 -0
- aponyx/models/catalog.py +167 -0
- aponyx/models/config.py +33 -0
- aponyx/models/registry.py +200 -0
- aponyx/models/signal_catalog.json +34 -0
- aponyx/models/signals.py +221 -0
- aponyx/persistence/__init__.py +20 -0
- aponyx/persistence/json_io.py +130 -0
- aponyx/persistence/parquet_io.py +174 -0
- aponyx/persistence/registry.py +375 -0
- aponyx/py.typed +0 -0
- aponyx/visualization/__init__.py +20 -0
- aponyx/visualization/app.py +37 -0
- aponyx/visualization/plots.py +309 -0
- aponyx/visualization/visualizer.py +242 -0
- aponyx-0.1.0.dist-info/METADATA +271 -0
- aponyx-0.1.0.dist-info/RECORD +37 -0
- aponyx-0.1.0.dist-info/WHEEL +4 -0
aponyx/models/signals.py
ADDED
@@ -0,0 +1,221 @@
"""
Core signal generation functions for CDX overlay strategy.

Implements the three pilot signals:
1. CDX-ETF basis (flow-driven mispricing)
2. CDX-VIX gap (cross-asset risk sentiment)
3. Spread momentum (short-term continuation)
"""

import logging
import pandas as pd

from .config import SignalConfig

logger = logging.getLogger(__name__)


def compute_cdx_etf_basis(
    cdx_df: pd.DataFrame,
    etf_df: pd.DataFrame,
    config: SignalConfig | None = None,
) -> pd.Series:
    """
    Compute normalized basis between CDX index spreads and ETF-implied spreads.

    The signal captures temporary mispricing driven by ETF flows and liquidity
    constraints. Positive values indicate CDX is cheap relative to ETF (long CDX
    vs short ETF). Negative values indicate CDX is expensive (short CDX vs long ETF).

    Parameters
    ----------
    cdx_df : pd.DataFrame
        CDX spread data with DatetimeIndex and 'spread' column.
    etf_df : pd.DataFrame
        ETF price data with DatetimeIndex and 'close' column.
    config : SignalConfig | None
        Configuration parameters. Uses defaults if None.

    Returns
    -------
    pd.Series
        Z-score normalized basis signal aligned to common dates.

    Notes
    -----
    - Uses z-score normalization over rolling window for regime independence.
    - Assumes ETF prices have been converted to spread-equivalent units externally.
    - Missing values are forward-filled before alignment to avoid spurious gaps.
    """
    if config is None:
        config = SignalConfig()

    logger.info(
        "Computing CDX-ETF basis: cdx_rows=%d, etf_rows=%d, lookback=%d",
        len(cdx_df),
        len(etf_df),
        config.lookback,
    )

    # Align data to common dates
    cdx_spread = cdx_df["spread"]
    etf_spread = etf_df["close"].reindex(cdx_df.index, method="ffill")

    # Compute raw basis
    raw_basis = cdx_spread - etf_spread

    # Normalize using rolling z-score
    rolling_mean = raw_basis.rolling(
        window=config.lookback,
        min_periods=config.min_periods,
    ).mean()
    rolling_std = raw_basis.rolling(
        window=config.lookback,
        min_periods=config.min_periods,
    ).std()

    signal = (raw_basis - rolling_mean) / rolling_std

    valid_count = signal.notna().sum()
    logger.debug("Generated %d valid basis signals", valid_count)

    return signal


def compute_cdx_vix_gap(
    cdx_df: pd.DataFrame,
    vix_df: pd.DataFrame,
    config: SignalConfig | None = None,
) -> pd.Series:
    """
    Compute cross-asset risk sentiment gap between credit spreads and equity vol.

    Identifies divergence between CDX and VIX movements. Positive values indicate
    credit stress outpacing equity stress (long credit risk). Negative values indicate
    equity stress outpacing credit stress (short credit risk).

    Parameters
    ----------
    cdx_df : pd.DataFrame
        CDX spreads with DatetimeIndex and 'spread' column.
    vix_df : pd.DataFrame
        VIX levels with DatetimeIndex and 'close' column.
    config : SignalConfig | None
        Configuration parameters. Uses defaults if None.

    Returns
    -------
    pd.Series
        Z-score normalized CDX-VIX gap signal.

    Notes
    -----
    - Both CDX and VIX deviations are computed from their own rolling means.
    - Gap computed as CDX stress minus VIX stress for consistent sign convention.
    - Normalized to account for varying volatility regimes.
    - Filters out transient spikes by using mean deviation over the lookback period.
    """
    if config is None:
        config = SignalConfig()

    logger.info(
        "Computing CDX-VIX gap: cdx_rows=%d, vix_rows=%d, lookback=%d",
        len(cdx_df),
        len(vix_df),
        config.lookback,
    )

    # Align data to common dates
    cdx = cdx_df["spread"]
    vix = vix_df["close"].reindex(cdx_df.index, method="ffill")

    # Compute deviations from rolling means
    cdx_deviation = (
        cdx
        - cdx.rolling(
            window=config.lookback,
            min_periods=config.min_periods,
        ).mean()
    )
    vix_deviation = (
        vix
        - vix.rolling(
            window=config.lookback,
            min_periods=config.min_periods,
        ).mean()
    )

    # Raw gap: CDX stress minus VIX stress
    # Positive when credit stress outpaces equity stress (buy CDX)
    # Negative when equity stress outpaces credit stress (sell CDX)
    raw_gap = cdx_deviation - vix_deviation

    # Normalize the gap
    rolling_std = raw_gap.rolling(
        window=config.lookback,
        min_periods=config.min_periods,
    ).std()
    signal = raw_gap / rolling_std

    valid_count = signal.notna().sum()
    logger.debug("Generated %d valid CDX-VIX gap signals", valid_count)

    return signal


def compute_spread_momentum(
    cdx_df: pd.DataFrame,
    config: SignalConfig | None = None,
) -> pd.Series:
    """
    Compute short-term volatility-adjusted momentum in CDX spreads.

    Captures continuation or mean-reversion tendencies over 3-10 day horizons.
    Positive signal suggests long credit risk (spreads tightening, momentum favorable).
    Negative signal suggests short credit risk (spreads widening, momentum unfavorable).

    Parameters
    ----------
    cdx_df : pd.DataFrame
        CDX spread data with DatetimeIndex and 'spread' column.
    config : SignalConfig | None
        Configuration parameters. Uses defaults if None.

    Returns
    -------
    pd.Series
        Z-score normalized momentum signal.

    Notes
    -----
    - Uses negative of spread change: tightening spreads give positive signal.
    - Short lookback (5-10 days) suitable for tactical overlay strategy.
    - Positive signal indicates tightening momentum (bullish credit).
    """
    if config is None:
        config = SignalConfig()

    logger.info(
        "Computing spread momentum: cdx_rows=%d, lookback=%d",
        len(cdx_df),
        config.lookback,
    )

    spread = cdx_df["spread"]

    # Compute spread change over lookback period (negative for tightening)
    spread_change = spread - spread.shift(config.lookback)

    # Normalize by rolling volatility and negate
    # Positive when spreads tightening (buy CDX)
    # Negative when spreads widening (sell CDX)
    rolling_std = spread.rolling(
        window=config.lookback,
        min_periods=config.min_periods,
    ).std()
    signal = -spread_change / rolling_std

    valid_count = signal.notna().sum()
    logger.debug("Generated %d valid momentum signals", valid_count)

    return signal
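For orientation, here is a minimal usage sketch of the three signal functions above. It assumes aponyx 0.1.0 is installed, that SignalConfig() constructs with usable defaults (as the functions themselves do when config is None; only the lookback and min_periods fields are taken from the code above), and that the synthetic inputs follow the documented schemas ('spread' and 'close' columns on a DatetimeIndex). The ETF series stands in for prices already converted to spread-equivalent units; all values are illustrative.

import numpy as np
import pandas as pd

from aponyx.models.config import SignalConfig
from aponyx.models.signals import (
    compute_cdx_etf_basis,
    compute_cdx_vix_gap,
    compute_spread_momentum,
)

# Synthetic inputs shaped like the documented schemas (hypothetical values).
dates = pd.date_range("2024-01-02", periods=252, freq="B")
rng = np.random.default_rng(0)
cdx_df = pd.DataFrame({"spread": 60 + rng.normal(0, 1, len(dates)).cumsum()}, index=dates)
etf_df = pd.DataFrame({"close": 58 + rng.normal(0, 1, len(dates)).cumsum()}, index=dates)  # spread-equivalent units assumed
vix_df = pd.DataFrame({"close": 16 + rng.normal(0, 0.5, len(dates)).cumsum()}, index=dates)

config = SignalConfig()  # package defaults for lookback / min_periods

basis = compute_cdx_etf_basis(cdx_df, etf_df, config)   # positive => CDX cheap vs ETF
gap = compute_cdx_vix_gap(cdx_df, vix_df, config)       # positive => credit stress outpacing equity stress
momentum = compute_spread_momentum(cdx_df, config)      # positive => spreads tightening

print(basis.tail(3), gap.tail(3), momentum.tail(3), sep="\n")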
aponyx/persistence/__init__.py
ADDED
@@ -0,0 +1,20 @@
"""
Persistence layer for time series data and metadata management.

Provides clean abstractions for Parquet and JSON I/O, with a registry
system to track available datasets.
"""

from .parquet_io import save_parquet, load_parquet, list_parquet_files
from .json_io import save_json, load_json
from .registry import DataRegistry, DatasetEntry

__all__ = [
    "save_parquet",
    "load_parquet",
    "list_parquet_files",
    "save_json",
    "load_json",
    "DataRegistry",
    "DatasetEntry",
]
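Because of these re-exports, callers can import the persistence API from the package root rather than the individual submodules; a one-line illustration covering only names listed in __all__:

from aponyx.persistence import save_parquet, load_parquet, list_parquet_files, save_json, load_json, DataRegistry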
aponyx/persistence/json_io.py
ADDED
@@ -0,0 +1,130 @@
"""
JSON I/O utilities for metadata, parameters, and run logs.

Handles serialization of dictionaries with support for common data types
including datetime, Path, and numpy arrays.
"""

import json
import logging
from pathlib import Path
from typing import Any
from datetime import datetime, date
import numpy as np

logger = logging.getLogger(__name__)


class EnhancedJSONEncoder(json.JSONEncoder):
    """
    JSON encoder with support for datetime, Path, and numpy types.

    Extends standard JSONEncoder to handle common scientific computing types
    that appear in metadata and parameter dictionaries.
    """

    def default(self, obj: Any) -> Any:
        """Convert non-serializable objects to JSON-compatible types."""
        if isinstance(obj, (datetime, date)):
            return obj.isoformat()
        elif isinstance(obj, Path):
            return str(obj)
        elif isinstance(obj, np.integer):
            return int(obj)
        elif isinstance(obj, np.floating):
            return float(obj)
        elif isinstance(obj, np.ndarray):
            return obj.tolist()
        return super().default(obj)


def save_json(
    data: dict[str, Any],
    path: str | Path,
    indent: int = 2,
    sort_keys: bool = True,
) -> Path:
    """
    Save dictionary to JSON file with enhanced type support.

    Parameters
    ----------
    data : dict
        Dictionary to serialize. Supports datetime, Path, and numpy types.
    path : str or Path
        Target file path. Parent directories created if needed.
    indent : int, default 2
        Number of spaces for indentation (for readability).
    sort_keys : bool, default True
        Whether to sort dictionary keys alphabetically.

    Returns
    -------
    Path
        Absolute path to the saved file.

    Examples
    --------
    >>> metadata = {
    ...     'timestamp': datetime.now(),
    ...     'params': {'window': 5, 'threshold': 0.5},
    ...     'version': '0.1.0'
    ... }
    >>> save_json(metadata, 'logs/run_20241025.json')
    """
    path = Path(path)
    path.parent.mkdir(parents=True, exist_ok=True)

    logger.info("Saving JSON to %s (%d top-level keys)", path, len(data))

    with path.open("w", encoding="utf-8") as f:
        json.dump(
            data,
            f,
            cls=EnhancedJSONEncoder,
            indent=indent,
            sort_keys=sort_keys,
            ensure_ascii=False,
        )

    logger.debug("Successfully saved %d bytes to %s", path.stat().st_size, path)
    return path.absolute()


def load_json(path: str | Path) -> dict[str, Any]:
    """
    Load dictionary from JSON file.

    Parameters
    ----------
    path : str or Path
        Source file path.

    Returns
    -------
    dict
        Deserialized dictionary.

    Raises
    ------
    FileNotFoundError
        If the specified file does not exist.
    json.JSONDecodeError
        If the file contains invalid JSON.

    Examples
    --------
    >>> metadata = load_json('logs/run_20241025.json')
    >>> print(metadata['timestamp'])
    """
    path = Path(path)
    if not path.exists():
        raise FileNotFoundError(f"JSON file not found: {path}")

    logger.info("Loading JSON from %s", path)

    with path.open("r", encoding="utf-8") as f:
        data = json.load(f)

    logger.debug("Loaded JSON with %d top-level keys", len(data) if isinstance(data, dict) else 0)
    return data
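A hedged round-trip sketch for save_json / load_json follows. The path and values are illustrative; it relies only on the behaviour shown above (EnhancedJSONEncoder converting datetime, Path, and numpy values on save), and those values come back as plain strings, numbers, and lists on load.

from datetime import datetime
from pathlib import Path

import numpy as np

from aponyx.persistence.json_io import save_json, load_json

run_meta = {
    "timestamp": datetime.now(),                   # serialized via isoformat()
    "data_path": Path("data/cdx_ig_5y.parquet"),   # serialized as str
    "lookback": np.int64(20),                      # serialized as int
    "weights": np.array([0.5, 0.3, 0.2]),          # serialized as list
}

out = save_json(run_meta, "logs/example_run.json")  # hypothetical output path
loaded = load_json(out)
print(loaded["timestamp"], loaded["weights"])        # ISO string, plain list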
aponyx/persistence/parquet_io.py
ADDED
@@ -0,0 +1,174 @@
"""
Parquet I/O utilities for time series data persistence.

Handles efficient storage and retrieval of market data (CDX spreads, VIX, ETF prices)
with metadata preservation and validation.
"""

import logging
from pathlib import Path
import pandas as pd

logger = logging.getLogger(__name__)


def save_parquet(
    df: pd.DataFrame,
    path: str | Path,
    compression: str = "snappy",
    index: bool = True,
) -> Path:
    """
    Save DataFrame to Parquet with optimized settings for time series data.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame to persist. For time series, index should be DatetimeIndex.
    path : str or Path
        Target file path. Parent directories created if needed.
    compression : str, default "snappy"
        Compression algorithm. Options: "snappy", "gzip", "brotli", "zstd".
    index : bool, default True
        Whether to write DataFrame index to file.

    Returns
    -------
    Path
        Absolute path to the saved file.

    Raises
    ------
    ValueError
        If DataFrame is empty or path is invalid.

    Examples
    --------
    >>> df = pd.DataFrame({'spread': [100, 105, 98]},
    ...                   index=pd.date_range('2024-01-01', periods=3))
    >>> save_parquet(df, 'data/cdx_ig_5y.parquet')
    """
    if df.empty:
        raise ValueError("Cannot save empty DataFrame")

    path = Path(path)
    path.parent.mkdir(parents=True, exist_ok=True)

    logger.info(
        "Saving DataFrame to Parquet: path=%s, rows=%d, columns=%d, compression=%s",
        path,
        len(df),
        len(df.columns),
        compression,
    )

    df.to_parquet(
        path,
        engine="pyarrow",
        compression=compression,
        index=index,
    )

    logger.debug("Successfully saved %d bytes to %s", path.stat().st_size, path)
    return path.absolute()


def load_parquet(
    path: str | Path,
    columns: list[str] | None = None,
    start_date: pd.Timestamp | None = None,
    end_date: pd.Timestamp | None = None,
) -> pd.DataFrame:
    """
    Load DataFrame from Parquet with optional filtering.

    Parameters
    ----------
    path : str or Path
        Source file path.
    columns : list of str, optional
        Subset of columns to load. If None, loads all columns.
    start_date : pd.Timestamp, optional
        Filter data from this date (inclusive). Requires DatetimeIndex.
    end_date : pd.Timestamp, optional
        Filter data to this date (inclusive). Requires DatetimeIndex.

    Returns
    -------
    pd.DataFrame
        Loaded and optionally filtered DataFrame.

    Raises
    ------
    FileNotFoundError
        If the specified file does not exist.
    ValueError
        If date filtering is requested but index is not DatetimeIndex.

    Examples
    --------
    >>> df = load_parquet('data/cdx_ig_5y.parquet',
    ...                   start_date=pd.Timestamp('2024-01-01'))
    >>> df = load_parquet('data/vix.parquet', columns=['close'])
    """
    path = Path(path)
    if not path.exists():
        raise FileNotFoundError(f"Parquet file not found: {path}")

    logger.info("Loading Parquet file: path=%s, columns=%s", path, columns or "all")

    df = pd.read_parquet(path, engine="pyarrow", columns=columns)

    # Apply date filtering if requested
    if start_date is not None or end_date is not None:
        if not isinstance(df.index, pd.DatetimeIndex):
            raise ValueError(
                "Date filtering requires DatetimeIndex. "
                f"Got {type(df.index).__name__}"
            )

        if start_date is not None:
            df = df[df.index >= start_date]
        if end_date is not None:
            df = df[df.index <= end_date]

        logger.debug(
            "Applied date filter: start=%s, end=%s, resulting_rows=%d",
            start_date,
            end_date,
            len(df),
        )

    logger.info("Loaded %d rows, %d columns from %s", len(df), len(df.columns), path)
    return df


def list_parquet_files(directory: str | Path, pattern: str = "*.parquet") -> list[Path]:
    """
    List all Parquet files in a directory matching a pattern.

    Parameters
    ----------
    directory : str or Path
        Directory to search.
    pattern : str, default "*.parquet"
        Glob pattern for file matching.

    Returns
    -------
    list of Path
        Sorted list of matching file paths.

    Examples
    --------
    >>> files = list_parquet_files('data/', pattern='cdx_*.parquet')
    >>> files = list_parquet_files('data/raw/')
    """
    directory = Path(directory)
    if not directory.exists():
        logger.debug("Directory does not exist: %s", directory)
        return []

    files = sorted(directory.glob(pattern))
    logger.info("Found %d Parquet files in %s (pattern=%s)", len(files), directory, pattern)
    return files
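Finally, a sketch of the Parquet round trip under the same caveats (pyarrow must be installed; paths and values are illustrative): save a small spread series, reload a date-filtered slice, then enumerate the files that were written.

import pandas as pd

from aponyx.persistence.parquet_io import save_parquet, load_parquet, list_parquet_files

df = pd.DataFrame(
    {"spread": [100.0, 105.0, 98.0, 102.0]},
    index=pd.date_range("2024-01-01", periods=4, freq="D"),
)

path = save_parquet(df, "data/example_cdx.parquet")                 # hypothetical path
recent = load_parquet(path, start_date=pd.Timestamp("2024-01-03"))  # inclusive filter
print(recent)                                                        # last two rows

for f in list_parquet_files("data/", pattern="example_*.parquet"):
    print(f)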