aponyx-0.1.18-py3-none-any.whl
- aponyx/__init__.py +14 -0
- aponyx/backtest/__init__.py +31 -0
- aponyx/backtest/adapters.py +77 -0
- aponyx/backtest/config.py +84 -0
- aponyx/backtest/engine.py +560 -0
- aponyx/backtest/protocols.py +101 -0
- aponyx/backtest/registry.py +334 -0
- aponyx/backtest/strategy_catalog.json +50 -0
- aponyx/cli/__init__.py +5 -0
- aponyx/cli/commands/__init__.py +8 -0
- aponyx/cli/commands/clean.py +349 -0
- aponyx/cli/commands/list.py +302 -0
- aponyx/cli/commands/report.py +167 -0
- aponyx/cli/commands/run.py +377 -0
- aponyx/cli/main.py +125 -0
- aponyx/config/__init__.py +82 -0
- aponyx/data/__init__.py +99 -0
- aponyx/data/bloomberg_config.py +306 -0
- aponyx/data/bloomberg_instruments.json +26 -0
- aponyx/data/bloomberg_securities.json +42 -0
- aponyx/data/cache.py +294 -0
- aponyx/data/fetch.py +659 -0
- aponyx/data/fetch_registry.py +135 -0
- aponyx/data/loaders.py +205 -0
- aponyx/data/providers/__init__.py +13 -0
- aponyx/data/providers/bloomberg.py +383 -0
- aponyx/data/providers/file.py +111 -0
- aponyx/data/registry.py +500 -0
- aponyx/data/requirements.py +96 -0
- aponyx/data/sample_data.py +415 -0
- aponyx/data/schemas.py +60 -0
- aponyx/data/sources.py +171 -0
- aponyx/data/synthetic_params.json +46 -0
- aponyx/data/transforms.py +336 -0
- aponyx/data/validation.py +308 -0
- aponyx/docs/__init__.py +24 -0
- aponyx/docs/adding_data_providers.md +682 -0
- aponyx/docs/cdx_knowledge_base.md +455 -0
- aponyx/docs/cdx_overlay_strategy.md +135 -0
- aponyx/docs/cli_guide.md +607 -0
- aponyx/docs/governance_design.md +551 -0
- aponyx/docs/logging_design.md +251 -0
- aponyx/docs/performance_evaluation_design.md +265 -0
- aponyx/docs/python_guidelines.md +786 -0
- aponyx/docs/signal_registry_usage.md +369 -0
- aponyx/docs/signal_suitability_design.md +558 -0
- aponyx/docs/visualization_design.md +277 -0
- aponyx/evaluation/__init__.py +11 -0
- aponyx/evaluation/performance/__init__.py +24 -0
- aponyx/evaluation/performance/adapters.py +109 -0
- aponyx/evaluation/performance/analyzer.py +384 -0
- aponyx/evaluation/performance/config.py +320 -0
- aponyx/evaluation/performance/decomposition.py +304 -0
- aponyx/evaluation/performance/metrics.py +761 -0
- aponyx/evaluation/performance/registry.py +327 -0
- aponyx/evaluation/performance/report.py +541 -0
- aponyx/evaluation/suitability/__init__.py +67 -0
- aponyx/evaluation/suitability/config.py +143 -0
- aponyx/evaluation/suitability/evaluator.py +389 -0
- aponyx/evaluation/suitability/registry.py +328 -0
- aponyx/evaluation/suitability/report.py +398 -0
- aponyx/evaluation/suitability/scoring.py +367 -0
- aponyx/evaluation/suitability/tests.py +303 -0
- aponyx/examples/01_generate_synthetic_data.py +53 -0
- aponyx/examples/02_fetch_data_file.py +82 -0
- aponyx/examples/03_fetch_data_bloomberg.py +104 -0
- aponyx/examples/04_compute_signal.py +164 -0
- aponyx/examples/05_evaluate_suitability.py +224 -0
- aponyx/examples/06_run_backtest.py +242 -0
- aponyx/examples/07_analyze_performance.py +214 -0
- aponyx/examples/08_visualize_results.py +272 -0
- aponyx/main.py +7 -0
- aponyx/models/__init__.py +45 -0
- aponyx/models/config.py +83 -0
- aponyx/models/indicator_transformation.json +52 -0
- aponyx/models/indicators.py +292 -0
- aponyx/models/metadata.py +447 -0
- aponyx/models/orchestrator.py +213 -0
- aponyx/models/registry.py +860 -0
- aponyx/models/score_transformation.json +42 -0
- aponyx/models/signal_catalog.json +29 -0
- aponyx/models/signal_composer.py +513 -0
- aponyx/models/signal_transformation.json +29 -0
- aponyx/persistence/__init__.py +16 -0
- aponyx/persistence/json_io.py +132 -0
- aponyx/persistence/parquet_io.py +378 -0
- aponyx/py.typed +0 -0
- aponyx/reporting/__init__.py +10 -0
- aponyx/reporting/generator.py +517 -0
- aponyx/visualization/__init__.py +20 -0
- aponyx/visualization/app.py +37 -0
- aponyx/visualization/plots.py +309 -0
- aponyx/visualization/visualizer.py +242 -0
- aponyx/workflows/__init__.py +18 -0
- aponyx/workflows/concrete_steps.py +720 -0
- aponyx/workflows/config.py +122 -0
- aponyx/workflows/engine.py +279 -0
- aponyx/workflows/registry.py +116 -0
- aponyx/workflows/steps.py +180 -0
- aponyx-0.1.18.dist-info/METADATA +552 -0
- aponyx-0.1.18.dist-info/RECORD +104 -0
- aponyx-0.1.18.dist-info/WHEEL +4 -0
- aponyx-0.1.18.dist-info/entry_points.txt +2 -0
- aponyx-0.1.18.dist-info/licenses/LICENSE +21 -0

aponyx/docs/adding_data_providers.md
@@ -0,0 +1,682 @@

# Adding Data Providers

## Overview

The data layer uses a **provider pattern** to support multiple data sources (files, Bloomberg, APIs) through a common interface. This guide shows how to add a new data provider to the framework.

**Goal:** Extend data sources without modifying existing code—add new providers as separate modules.

## Provider Architecture

### Current Providers

| Provider | Module | Status | Use Case |
|----------|--------|--------|----------|
| `FileSource` | `providers/file.py` | ✅ Implemented | Local Parquet/CSV files |
| `BloombergSource` | `providers/bloomberg.py` | ✅ Implemented | Bloomberg Terminal data (requires `xbbg` and manual `blpapi` install) |
| `APISource` | `sources.py` (dataclass only) | ⚠️ Defined but no fetch implementation | REST API endpoints |

### Bloomberg Provider Setup

The Bloomberg provider requires manual installation of the `blpapi` library:

1. Download `blpapi` from Bloomberg's developer portal
2. Install manually: `pip install path/to/blpapi-*.whl`
3. Install aponyx with Bloomberg support: `uv pip install aponyx[bloomberg]`

The `xbbg` wrapper is included in the `bloomberg` optional dependency, but `blpapi` itself must be installed separately due to Bloomberg's proprietary distribution.
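
To confirm the stack is wired up before fetching, a minimal import check (a sketch; a logged-in Bloomberg Terminal session is only needed for actual data access, not for the imports):

```python
# If either import fails, the manual blpapi install or the
# aponyx[bloomberg] extra is missing.
import blpapi  # proprietary SDK, installed manually from Bloomberg's portal
import xbbg    # wrapper pulled in by `aponyx[bloomberg]`

print("Bloomberg stack importable")
```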

**Intraday Updates:** Bloomberg provider supports efficient current-day updates via BDP (Bloomberg's current-value data-point request) instead of refetching full history.

---

## Data Storage Architecture

The project uses a three-tier storage structure:

| Directory | Purpose | Lifecycle | Regenerable |
|-----------|---------|-----------|-------------|
| `data/raw/` | Original source data (Bloomberg downloads, synthetic generation) | **Permanent** — Never auto-deleted | ❌ No |
| `data/cache/` | Performance optimization for repeated reads | **Temporary** — TTL-based expiration | ✅ Yes |
| `data/workflows/` | Timestamped workflow outputs (signals, backtests, reports, visualizations) | **Temporary** — Recomputable from raw | ✅ Yes |

**Data Flow:**
```
Raw Storage (Bloomberg/Synthetic)
        ↓
Cache Layer (TTL-based, automatic)
        ↓
Models/Signals
        ↓
Processed Storage (Results)
```

**Key Principle:** Raw data is the source of truth. Cache and processed data can always be regenerated from raw.

### Raw Data Storage

**File Naming:** `{instrument}_{security}_{hash}.parquet`

**Examples:**
```
cdx_cdx_ig_5y_b1f849bfe3a1.parquet
vix_vix_00252a34df0f.parquet
etf_hyg_108d48a6a616.parquet
```

**Hash Generation:**
- 12-character SHA256 hash prefix
- Computed from: provider, instrument, date range, row count, metadata
- Ensures uniqueness across different data pulls
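
A minimal sketch of the hashing described above (field names and serialization are assumptions; the aponyx implementation may order or encode these differently):

```python
import hashlib
import json


def raw_file_hash(
    provider: str,
    instrument: str,
    start: str,
    end: str,
    row_count: int,
    metadata: dict,
) -> str:
    """12-char SHA256 prefix over the identifying fields listed above."""
    payload = json.dumps(
        {
            "provider": provider,
            "instrument": instrument,
            "date_range": {"start": start, "end": end},
            "row_count": row_count,
            "metadata": metadata,
        },
        sort_keys=True,  # stable serialization so equal inputs hash equally
    )
    return hashlib.sha256(payload.encode()).hexdigest()[:12]


# e.g. f"cdx_cdx_ig_5y_{raw_file_hash(...)}.parquet"
```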

**Metadata Sidecar:** Each `.parquet` file has a corresponding `.json` metadata file:

```json
{
  "provider": "synthetic",
  "instrument": "cdx",
  "security": "cdx_ig_5y",
  "stored_at": "2025-11-16T20:32:53.953000",
  "date_range": {
    "start": "2020-11-17",
    "end": "2025-11-16"
  },
  "row_count": 1304,
  "columns": ["spread", "security"],
  "hash": "b1f849bfe3a1",
  "generation_params": {
    "base_spread": 100.0,
    "volatility": 5.0
  }
}
```
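
Reading a sidecar back is a one-liner; a hedged sketch (aponyx's own helpers presumably live in `persistence/json_io.py` and may differ):

```python
import json
from pathlib import Path


def read_sidecar(parquet_path: Path) -> dict:
    """Load the .json metadata sitting next to a raw .parquet file."""
    return json.loads(parquet_path.with_suffix(".json").read_text())


# meta = read_sidecar(Path("data/raw/cdx_cdx_ig_5y_b1f849bfe3a1.parquet"))
```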

### Cache Layer

**Purpose:** Transparent time-to-live (TTL) based caching for data fetching operations.

**Cache Location:**
```
data/
  cache/
    file/                       # Temporary cache from FileSource loads
      cdx_ig_5y_abc123.parquet
```

**Configuration:**
```python
from aponyx.config import CACHE_ENABLED, CACHE_TTL_DAYS

# Default: enabled, 1 day TTL
# Control per fetch call:
df = fetch_cdx(source, security="cdx_ig_5y", use_cache=True)   # Use cache
df = fetch_cdx(source, security="cdx_ig_5y", use_cache=False)  # Skip cache
```

**Automatic Invalidation:**
1. **TTL expiration:** Entry older than `CACHE_TTL_DAYS`
2. **Source modification:** Source file modified after cache creation
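
Both rules combine into a simple validity predicate; a sketch with assumed names (the real check presumably lives in `data/cache.py`):

```python
import time
from pathlib import Path


def cache_entry_valid(cache_file: Path, source_file: Path, ttl_days: float) -> bool:
    """Return True only if the cache entry is fresh and newer than its source."""
    if not cache_file.exists():
        return False
    age_days = (time.time() - cache_file.stat().st_mtime) / 86_400
    if age_days > ttl_days:
        return False  # rule 1: TTL expired
    if source_file.exists() and source_file.stat().st_mtime > cache_file.stat().st_mtime:
        return False  # rule 2: source modified after the cache was written
    return True
```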

**Intraday Updates (Bloomberg only):**
```python
# Morning: Full history
cdx_df = fetch_cdx(BloombergSource(), security="cdx_ig_5y")

# Afternoon: Update only today (~10x faster)
cdx_df = fetch_cdx(BloombergSource(), security="cdx_ig_5y", update_current_day=True)
```

**Benefits:**
- ~10x faster than full refetch
- 500x less data transfer (1 point vs 1800 days)
- Preserves historical data in cache

### Provider Interface

Providers are defined as dataclasses and used by fetch functions:

```python
from dataclasses import dataclass
from pathlib import Path

import pandas as pd


@dataclass(frozen=True)
class FileSource:
    """File-based data source (Parquet or CSV)."""

    path: Path


@dataclass(frozen=True)
class BloombergSource:
    """Bloomberg Terminal data source."""


# Fetch functions handle provider-specific logic
def fetch_from_file(
    file_path: str | Path,
    instrument: str,
    start_date: str | None = None,
    end_date: str | None = None,
    **params,
) -> pd.DataFrame:
    """Fetch data from local file."""
    ...
```

## Adding a New Provider

### Step 1: Define Data Source

Add to `src/aponyx/data/sources.py`:

```python
"""Data source configuration for pluggable data providers."""

from dataclasses import dataclass
from typing import Any


@dataclass(frozen=True)
class MyCustomSource:
    """
    Custom data source for [your provider].

    Attributes
    ----------
    endpoint : str
        API endpoint or connection string.
    params : dict[str, Any] | None
        Additional connection parameters.
    """

    endpoint: str
    params: dict[str, Any] | None = None


# Update DataSource union type
DataSource = FileSource | BloombergSource | MyCustomSource
```

### Step 2: Create Provider Fetch Function

Create `src/aponyx/data/providers/my_provider.py`:

```python
"""
Custom data provider fetch implementation.

Fetches data from [describe your source].
"""

import logging
from typing import Any

import pandas as pd

logger = logging.getLogger(__name__)


def fetch_from_mycustom(
    endpoint: str,
    instrument: str,
    start_date: str | None = None,
    end_date: str | None = None,
    **params: Any,
) -> pd.DataFrame:
    """
    Fetch data from custom source.

    Parameters
    ----------
    endpoint : str
        API endpoint or data source URL.
    instrument : str
        Instrument identifier.
    start_date : str | None
        Optional start date filter (ISO format).
    end_date : str | None
        Optional end date filter (ISO format).
    **params : Any
        Additional provider-specific parameters.

    Returns
    -------
    pd.DataFrame
        Raw data with datetime index.

    Notes
    -----
    Caching is handled by the fetch layer, not provider implementation.
    """
    logger.info("Fetching %s from endpoint: %s", instrument, endpoint)

    # Implement provider-specific fetching here (API call, database query, etc.)

    # Build query parameters
    query_params = {"instrument": instrument}
    if start_date:
        query_params["start_date"] = start_date
    if end_date:
        query_params["end_date"] = end_date
    query_params.update(params)

    # Fetch data via the request helper below
    df = _make_request(endpoint, query_params)

    logger.info("Loaded %d rows from custom source", len(df))
    return df


def _make_request(endpoint: str, params: dict[str, Any]) -> pd.DataFrame:
    """Make actual request to data source."""
    raise NotImplementedError("Implement provider-specific request logic")
```

### Step 3: Update Provider Init

Add to `src/aponyx/data/providers/__init__.py`:

```python
"""Data provider implementations."""

from .file import fetch_from_file
from .bloomberg import fetch_from_bloomberg
from .my_provider import fetch_from_mycustom  # Add new provider

__all__ = [
    "fetch_from_file",
    "fetch_from_bloomberg",
    "fetch_from_mycustom",  # Export new fetch function
]
```

### Step 4: Integrate with Fetch Layer

Update `src/aponyx/data/fetch.py` to support the new provider:

```python
from .sources import MyCustomSource, resolve_provider
from .providers.my_provider import fetch_from_mycustom
# (existing imports of fetch_from_file, fetch_from_bloomberg, CACHE_ENABLED remain)


def _get_provider_fetch_function(source: DataSource):
    """Get fetch function for data source."""
    provider_type = resolve_provider(source)

    if provider_type == "file":
        return fetch_from_file
    elif provider_type == "bloomberg":
        return fetch_from_bloomberg
    elif provider_type == "mycustom":  # Add new provider
        return fetch_from_mycustom
    else:
        raise ValueError(f"Unsupported provider: {provider_type}")


# Then use in instrument fetch functions:
def fetch_cdx(
    source: DataSource | None = None,
    security: str | None = None,
    start_date: str | None = None,
    end_date: str | None = None,
    use_cache: bool = CACHE_ENABLED,
) -> pd.DataFrame:
    """Fetch CDX data from any provider."""
    # ... caching logic ...

    fetch_fn = _get_provider_fetch_function(source)

    if isinstance(source, MyCustomSource):
        df = fetch_fn(
            endpoint=source.endpoint,
            instrument="cdx",
            start_date=start_date,
            end_date=end_date,
            **(source.params or {}),
        )
    # ... other providers ...
```

### Step 5: Update Provider Resolution

Add to `src/aponyx/data/sources.py`:

```python
def resolve_provider(source: DataSource) -> str:
    """Resolve data source to provider type identifier."""
    if isinstance(source, FileSource):
        return "file"
    elif isinstance(source, BloombergSource):
        return "bloomberg"
    elif isinstance(source, MyCustomSource):  # Add new provider
        return "mycustom"
    else:
        raise ValueError(f"Unknown source type: {type(source)}")
```
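
A quick sanity check ties Steps 4 and 5 together (illustrative; assumes the definitions above are in scope):

```python
# The resolver should map each source dataclass to the identifier
# that _get_provider_fetch_function dispatches on.
assert resolve_provider(MyCustomSource(endpoint="https://api.example.com")) == "mycustom"
```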

### Step 6: Add Schema Validation (Optional)

If your data has a specific structure, add a schema in `src/aponyx/data/schemas.py`:

```python
from dataclasses import dataclass, field


@dataclass
class MyCustomSchema:
    """Schema for custom data provider."""

    required_columns: list[str] = field(
        default_factory=lambda: ["date", "value", "volume"]
    )
    date_column: str = "date"
    numeric_columns: list[str] = field(
        default_factory=lambda: ["value", "volume"]
    )
```
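
How the schema might be applied in the fetch layer (a sketch; aponyx's actual checks presumably live in `data/validation.py` and may differ):

```python
import pandas as pd


def validate_against_schema(df: pd.DataFrame, schema: MyCustomSchema) -> None:
    """Raise if the fetched frame does not satisfy the declared schema."""
    # reset_index() so a date column promoted to the index still counts
    present = set(df.reset_index().columns)
    missing = set(schema.required_columns) - present
    if missing:
        raise ValueError(f"Missing required columns: {sorted(missing)}")
    for col in schema.numeric_columns:
        if not pd.api.types.is_numeric_dtype(df[col]):
            raise TypeError(f"Column {col!r} must be numeric")
```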

### Step 7: Write Tests

Create `tests/data/test_my_provider.py`:

```python
"""Tests for custom data provider."""

import pandas as pd
import pytest

from aponyx.data import fetch_cdx
from aponyx.data.sources import MyCustomSource


def test_fetch_basic(monkeypatch):
    """Test basic data fetching with custom provider."""
    # Create source
    source = MyCustomSource(
        endpoint="https://api.example.com",
        params={"api_key": "test"},
    )

    # Mock the provider fetch function
    def mock_fetch(*args, **kwargs):
        return pd.DataFrame({
            "date": pd.date_range("2024-01-01", periods=10),
            "spread": range(100, 110),
            "security": ["cdx_ig_5y"] * 10,
        }).set_index("date")

    from aponyx.data import providers
    monkeypatch.setattr(providers, "fetch_from_mycustom", mock_fetch)

    # Fetch data
    df = fetch_cdx(source, security="cdx_ig_5y")

    # Validate
    assert len(df) == 10
    assert "spread" in df.columns
```

## Example: REST API Provider

### Source Definition

Add to `src/aponyx/data/sources.py`:

```python
@dataclass(frozen=True)
class APISource:
    """
    Generic REST API data source.

    Attributes
    ----------
    endpoint : str
        API endpoint URL.
    api_key : str | None
        API authentication key.
    params : dict[str, Any] | None
        Additional request parameters.
    """

    endpoint: str
    api_key: str | None = None
    params: dict[str, Any] | None = None


# Update DataSource union
DataSource = FileSource | BloombergSource | APISource
```

### Provider Implementation

Create `src/aponyx/data/providers/api.py`:

```python
"""REST API data provider."""

import logging
from typing import Any

import pandas as pd
import requests

logger = logging.getLogger(__name__)


def fetch_from_api(
    endpoint: str,
    instrument: str,
    api_key: str | None = None,
    start_date: str | None = None,
    end_date: str | None = None,
    **params: Any,
) -> pd.DataFrame:
    """
    Fetch data from REST API endpoint.

    Parameters
    ----------
    endpoint : str
        API endpoint URL.
    instrument : str
        Instrument identifier.
    api_key : str | None
        Optional API key for authentication.
    start_date : str | None
        Start date filter (ISO format).
    end_date : str | None
        End date filter (ISO format).
    **params : Any
        Additional query parameters.

    Returns
    -------
    pd.DataFrame
        JSON response converted to DataFrame with DatetimeIndex.
    """
    # Build request parameters
    query_params = {"instrument": instrument}
    if start_date:
        query_params["start_date"] = start_date
    if end_date:
        query_params["end_date"] = end_date
    query_params.update(params)

    # Add authentication if available
    headers = {}
    if api_key:
        headers["Authorization"] = f"Bearer {api_key}"

    logger.info("GET %s with params=%s", endpoint, query_params)

    # Make request (always set a timeout so a dead endpoint cannot hang the fetch)
    response = requests.get(endpoint, params=query_params, headers=headers, timeout=30)
    response.raise_for_status()

    # Parse JSON to DataFrame
    data = response.json()
    df = pd.DataFrame(data)

    # Convert date column if present
    if "date" in df.columns:
        df["date"] = pd.to_datetime(df["date"])
        df = df.set_index("date")

    logger.info("Fetched %d rows from API", len(df))
    return df
```

### Usage

```python
from aponyx.data import fetch_cdx
from aponyx.data.sources import APISource

# Set up the API source
source = APISource(
    endpoint="https://api.example.com/market-data",
    api_key="your-key-here",
)

# Fetch data (caching handled automatically)
df = fetch_cdx(
    source=source,
    security="cdx_ig_5y",
    start_date="2024-01-01",
    end_date="2024-12-31",
)
```

## Provider Design Patterns

### Pattern 1: Stateful Connection

```python
# For providers requiring persistent connections,
# manage state in module-level variables

import logging

import pandas as pd

logger = logging.getLogger(__name__)

_connection = None


def _get_connection():
    """Get or create connection instance."""
    global _connection
    if _connection is None:
        _connection = initialize_connection()  # provider-specific setup
        logger.info("Connection established")
    return _connection


def fetch_from_database(
    query: str,
    instrument: str,
    **params,
) -> pd.DataFrame:
    """Fetch using persistent connection."""
    conn = _get_connection()
    return pd.read_sql(query, conn, params=params)
```

### Pattern 2: Retry Logic

```python
import pandas as pd
import requests
from tenacity import retry, stop_after_attempt, wait_exponential


@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(min=1, max=10),
)
def fetch_from_api(
    endpoint: str,
    instrument: str,
    **params,
) -> pd.DataFrame:
    """Fetch with automatic retry on network errors."""
    response = requests.get(endpoint, params=params, timeout=30)
    response.raise_for_status()
    return pd.DataFrame(response.json())
```

### Pattern 3: Batch Fetching

```python
def fetch_from_batch_api(
    endpoint: str,
    instrument: str,
    start_date: str | None = None,
    end_date: str | None = None,
    batch_size: int = 1000,
    **params,
) -> pd.DataFrame:
    """Fetch data in batches for large date ranges."""
    all_data = []

    # Split date range into batches (helper is provider-specific)
    batches = _create_date_batches(start_date, end_date, batch_size)

    for batch_start, batch_end in batches:
        logger.debug("Fetching batch: %s to %s", batch_start, batch_end)
        batch_df = _fetch_single_batch(
            endpoint,
            instrument,
            batch_start,
            batch_end,
            **params,
        )
        all_data.append(batch_df)

    # Combine all batches into one chronologically ordered frame
    return pd.concat(all_data).sort_index()
```

## Best Practices

1. **Define data sources as frozen dataclasses** for immutability
2. **Implement fetch functions** instead of class methods for simplicity
3. **Let the fetch layer handle caching:** providers should focus on data retrieval
4. **Log all operations** (connections, queries, errors) using %-formatting (see the sketch after this list)
5. **Validate output schema** in the fetch layer, not the provider
6. **Handle errors gracefully** with informative messages
7. **Use type hints** for all parameters and return values
8. **Test with mocked data** to avoid external dependencies
9. **Document connection requirements** (credentials, network access)
10. **Follow naming convention:** `fetch_from_*` for provider functions
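
A short illustration of item 4's %-formatting convention (lazy interpolation: the message is only built if the record is actually emitted):

```python
import logging

logger = logging.getLogger(__name__)
security, endpoint = "cdx_ig_5y", "https://api.example.com"

# Preferred: interpolation deferred to the logging framework
logger.info("Fetching %s from %s", security, endpoint)

# Avoided: the f-string is formatted even when INFO is disabled
logger.info(f"Fetching {security} from {endpoint}")
```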

## Troubleshooting

### Provider Not Found

```python
# Check import
from aponyx.data.providers import fetch_from_mycustom  # Should work

# Verify __init__.py exports
from aponyx.data import providers
print(dir(providers))  # Should list fetch_from_mycustom
```

### Cache Not Working

```python
# Enable debug logging to see cache operations
import logging
logging.basicConfig(level=logging.DEBUG)

# Check if caching is enabled
from aponyx.config import CACHE_ENABLED, CACHE_TTL_DAYS
print(f"Cache enabled: {CACHE_ENABLED}, TTL: {CACHE_TTL_DAYS} days")

# Explicitly control cache usage
df = fetch_cdx(source, security="cdx_ig_5y", use_cache=True)
```

### Authentication Failures

```python
# Don't hardcode credentials in source definitions
import os

api_key = os.environ.get("MY_API_KEY")
if not api_key:
    raise ValueError("MY_API_KEY environment variable not set")

source = APISource(
    endpoint="https://api.example.com",
    api_key=api_key,
)
```

---

**Maintained by:** stabilefrisur

**Last Updated:** December 13, 2025