aponyx 0.1.18__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aponyx/__init__.py +14 -0
- aponyx/backtest/__init__.py +31 -0
- aponyx/backtest/adapters.py +77 -0
- aponyx/backtest/config.py +84 -0
- aponyx/backtest/engine.py +560 -0
- aponyx/backtest/protocols.py +101 -0
- aponyx/backtest/registry.py +334 -0
- aponyx/backtest/strategy_catalog.json +50 -0
- aponyx/cli/__init__.py +5 -0
- aponyx/cli/commands/__init__.py +8 -0
- aponyx/cli/commands/clean.py +349 -0
- aponyx/cli/commands/list.py +302 -0
- aponyx/cli/commands/report.py +167 -0
- aponyx/cli/commands/run.py +377 -0
- aponyx/cli/main.py +125 -0
- aponyx/config/__init__.py +82 -0
- aponyx/data/__init__.py +99 -0
- aponyx/data/bloomberg_config.py +306 -0
- aponyx/data/bloomberg_instruments.json +26 -0
- aponyx/data/bloomberg_securities.json +42 -0
- aponyx/data/cache.py +294 -0
- aponyx/data/fetch.py +659 -0
- aponyx/data/fetch_registry.py +135 -0
- aponyx/data/loaders.py +205 -0
- aponyx/data/providers/__init__.py +13 -0
- aponyx/data/providers/bloomberg.py +383 -0
- aponyx/data/providers/file.py +111 -0
- aponyx/data/registry.py +500 -0
- aponyx/data/requirements.py +96 -0
- aponyx/data/sample_data.py +415 -0
- aponyx/data/schemas.py +60 -0
- aponyx/data/sources.py +171 -0
- aponyx/data/synthetic_params.json +46 -0
- aponyx/data/transforms.py +336 -0
- aponyx/data/validation.py +308 -0
- aponyx/docs/__init__.py +24 -0
- aponyx/docs/adding_data_providers.md +682 -0
- aponyx/docs/cdx_knowledge_base.md +455 -0
- aponyx/docs/cdx_overlay_strategy.md +135 -0
- aponyx/docs/cli_guide.md +607 -0
- aponyx/docs/governance_design.md +551 -0
- aponyx/docs/logging_design.md +251 -0
- aponyx/docs/performance_evaluation_design.md +265 -0
- aponyx/docs/python_guidelines.md +786 -0
- aponyx/docs/signal_registry_usage.md +369 -0
- aponyx/docs/signal_suitability_design.md +558 -0
- aponyx/docs/visualization_design.md +277 -0
- aponyx/evaluation/__init__.py +11 -0
- aponyx/evaluation/performance/__init__.py +24 -0
- aponyx/evaluation/performance/adapters.py +109 -0
- aponyx/evaluation/performance/analyzer.py +384 -0
- aponyx/evaluation/performance/config.py +320 -0
- aponyx/evaluation/performance/decomposition.py +304 -0
- aponyx/evaluation/performance/metrics.py +761 -0
- aponyx/evaluation/performance/registry.py +327 -0
- aponyx/evaluation/performance/report.py +541 -0
- aponyx/evaluation/suitability/__init__.py +67 -0
- aponyx/evaluation/suitability/config.py +143 -0
- aponyx/evaluation/suitability/evaluator.py +389 -0
- aponyx/evaluation/suitability/registry.py +328 -0
- aponyx/evaluation/suitability/report.py +398 -0
- aponyx/evaluation/suitability/scoring.py +367 -0
- aponyx/evaluation/suitability/tests.py +303 -0
- aponyx/examples/01_generate_synthetic_data.py +53 -0
- aponyx/examples/02_fetch_data_file.py +82 -0
- aponyx/examples/03_fetch_data_bloomberg.py +104 -0
- aponyx/examples/04_compute_signal.py +164 -0
- aponyx/examples/05_evaluate_suitability.py +224 -0
- aponyx/examples/06_run_backtest.py +242 -0
- aponyx/examples/07_analyze_performance.py +214 -0
- aponyx/examples/08_visualize_results.py +272 -0
- aponyx/main.py +7 -0
- aponyx/models/__init__.py +45 -0
- aponyx/models/config.py +83 -0
- aponyx/models/indicator_transformation.json +52 -0
- aponyx/models/indicators.py +292 -0
- aponyx/models/metadata.py +447 -0
- aponyx/models/orchestrator.py +213 -0
- aponyx/models/registry.py +860 -0
- aponyx/models/score_transformation.json +42 -0
- aponyx/models/signal_catalog.json +29 -0
- aponyx/models/signal_composer.py +513 -0
- aponyx/models/signal_transformation.json +29 -0
- aponyx/persistence/__init__.py +16 -0
- aponyx/persistence/json_io.py +132 -0
- aponyx/persistence/parquet_io.py +378 -0
- aponyx/py.typed +0 -0
- aponyx/reporting/__init__.py +10 -0
- aponyx/reporting/generator.py +517 -0
- aponyx/visualization/__init__.py +20 -0
- aponyx/visualization/app.py +37 -0
- aponyx/visualization/plots.py +309 -0
- aponyx/visualization/visualizer.py +242 -0
- aponyx/workflows/__init__.py +18 -0
- aponyx/workflows/concrete_steps.py +720 -0
- aponyx/workflows/config.py +122 -0
- aponyx/workflows/engine.py +279 -0
- aponyx/workflows/registry.py +116 -0
- aponyx/workflows/steps.py +180 -0
- aponyx-0.1.18.dist-info/METADATA +552 -0
- aponyx-0.1.18.dist-info/RECORD +104 -0
- aponyx-0.1.18.dist-info/WHEEL +4 -0
- aponyx-0.1.18.dist-info/entry_points.txt +2 -0
- aponyx-0.1.18.dist-info/licenses/LICENSE +21 -0
aponyx/data/registry.py
ADDED
@@ -0,0 +1,500 @@
"""
Data registry for tracking available datasets and their metadata.

Provides a centralized catalog of market data files with versioning,
validation status, and update timestamps.
"""

import logging
from pathlib import Path
from datetime import datetime
from dataclasses import dataclass, field, asdict
from typing import Any
import pandas as pd

from ..persistence.json_io import save_json, load_json
from ..persistence.parquet_io import load_parquet

logger = logging.getLogger(__name__)


@dataclass
class DatasetEntry:
    """
    Metadata for a registered dataset.

    Attributes
    ----------
    instrument : str
        Instrument identifier (e.g., 'CDX.NA.IG', 'VIX', 'HYG').
    file_path : str
        Path to the Parquet file.
    registered_at : str
        ISO format timestamp of registration.
    start_date : str or None
        ISO format start date of data coverage.
    end_date : str or None
        ISO format end date of data coverage.
    row_count : int or None
        Number of rows in the dataset.
    last_updated : str or None
        ISO format timestamp of last statistics update.
    metadata : dict[str, Any]
        Additional user-defined metadata.
    """

    instrument: str
    file_path: str
    registered_at: str
    start_date: str | None = None
    end_date: str | None = None
    row_count: int | None = None
    last_updated: str | None = None
    metadata: dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> dict[str, Any]:
        """Convert entry to dictionary for JSON serialization."""
        return asdict(self)

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> "DatasetEntry":
        """Create entry from dictionary loaded from JSON."""
        return cls(**data)

class DataRegistry:
    """
    Registry for tracking and managing available market data files.

    Maintains a catalog of Parquet datasets with metadata including:
    - Data source and instrument
    - Date range coverage
    - Last update timestamp
    - Validation status

    Parameters
    ----------
    registry_path : str or Path
        Path to the registry JSON file.
    data_directory : str or Path
        Root directory containing data files.

    Examples
    --------
    >>> registry = DataRegistry('data/registry.json', 'data/')
    >>> registry.register_dataset(
    ...     name='cdx_ig_5y',
    ...     file_path='data/cdx_ig_5y.parquet',
    ...     instrument='CDX.NA.IG'
    ... )
    >>> info = registry.get_dataset_info('cdx_ig_5y')
    """

    def __init__(
        self,
        registry_path: str | Path,
        data_directory: str | Path,
    ):
        """Initialize registry with paths to catalog and data storage."""
        self.registry_path = Path(registry_path)
        self.data_directory = Path(data_directory).resolve()
        self.data_directory.mkdir(parents=True, exist_ok=True)

        # Load existing registry or create new
        if self.registry_path.exists():
            self._catalog = load_json(self.registry_path)
            logger.info(
                "Loaded existing registry: path=%s, datasets=%d",
                self.registry_path,
                len(self._catalog),
            )
        else:
            self._catalog = {}
            self._save()
            logger.info("Created new registry: path=%s", self.registry_path)

    def _resolve_path(self, path: str | Path) -> Path:
        """
        Resolve path relative to data directory.

        Converts relative paths stored in registry to absolute paths
        for file operations.

        Parameters
        ----------
        path : str or Path
            Path from registry (may be relative or absolute).

        Returns
        -------
        Path
            Absolute path for file access.
        """
        p = Path(path)
        if p.is_absolute():
            return p
        return self.data_directory / p

    def _normalize_path(self, path: str | Path) -> str:
        """
        Normalize path to relative format for storage in registry.

        Converts absolute paths to relative paths from data_directory.
        Relative paths are stored as-is.

        Parameters
        ----------
        path : str or Path
            Path to normalize (absolute or relative).

        Returns
        -------
        str
            Relative path string for registry storage.
        """
        p = Path(path).resolve()
        try:
            # Try to make path relative to data_directory
            relative = p.relative_to(self.data_directory)
            return str(relative).replace("\\", "/")  # Use forward slashes
        except ValueError:
            # Path is outside data_directory, store as-is
            logger.warning("Path outside data directory, storing absolute: %s", p)
            return str(p)

    def register_dataset(
        self,
        name: str,
        file_path: str | Path,
        instrument: str,
        metadata: dict[str, Any] | None = None,
    ) -> None:
        """
        Register a dataset in the catalog with metadata.

        Parameters
        ----------
        name : str
            Unique identifier for the dataset (e.g., 'cdx_ig_5y').
        file_path : str or Path
            Path to the Parquet file (relative to data_directory or absolute).
        instrument : str
            Instrument identifier (e.g., 'CDX.NA.IG', 'VIX', 'HYG').
        metadata : dict, optional
            Additional metadata to store with the dataset.

        Examples
        --------
        >>> registry.register_dataset(
        ...     name='vix_index',
        ...     file_path='data/vix.parquet',
        ...     instrument='VIX',
        ...     metadata={'source': 'CBOE', 'frequency': 'daily'}
        ... )
        """
        file_path = Path(file_path)
        # Normalize to relative path for storage
        normalized_path = self._normalize_path(file_path)
        # Resolve to absolute path for file operations
        resolved_path = self._resolve_path(normalized_path)

        # Get dataset statistics if file exists
        if resolved_path.exists():
            try:
                df = load_parquet(resolved_path)
                start_date = (
                    df.index.min() if isinstance(df.index, pd.DatetimeIndex) else None
                )
                end_date = (
                    df.index.max() if isinstance(df.index, pd.DatetimeIndex) else None
                )
                row_count = len(df)
            except Exception as e:
                logger.warning(
                    "Failed to extract stats from %s: %s",
                    file_path,
                    str(e),
                )
                start_date = end_date = row_count = None
        else:
            logger.debug("Registering non-existent file: %s", resolved_path)
            start_date = end_date = row_count = None

        # Build registry entry using dataclass
        entry = DatasetEntry(
            instrument=instrument,
            file_path=normalized_path,
            registered_at=datetime.now().isoformat(),
            start_date=start_date.isoformat() if start_date else None,
            end_date=end_date.isoformat() if end_date else None,
            row_count=row_count,
            metadata=metadata or {},
        )

        self._catalog[name] = entry.to_dict()
        self._save()

        logger.info(
            "Registered dataset: name=%s, instrument=%s, rows=%s",
            name,
            instrument,
            row_count,
        )

    def get_dataset_info(self, name: str) -> dict[str, Any]:
        """
        Retrieve metadata for a registered dataset.

        Parameters
        ----------
        name : str
            Dataset identifier.

        Returns
        -------
        dict[str, Any]
            Dataset metadata including file path, date range, etc.
            The file_path is returned as an absolute path.

        Raises
        ------
        KeyError
            If dataset name not found in registry.

        Notes
        -----
        Returns a copy to prevent external modification of catalog.
        For type-safe access, use `get_dataset_entry()` instead.
        """
        if name not in self._catalog:
            raise KeyError(f"Dataset '{name}' not found in registry")

        info = self._catalog[name].copy()
        # Resolve relative path to absolute for consumers
        info["file_path"] = str(self._resolve_path(info["file_path"]))
        return info

    def get_dataset_entry(self, name: str) -> DatasetEntry:
        """
        Retrieve metadata as a typed DatasetEntry object.

        Parameters
        ----------
        name : str
            Dataset identifier.

        Returns
        -------
        DatasetEntry
            Typed dataset metadata with attribute access.

        Raises
        ------
        KeyError
            If dataset name not found in registry.

        Examples
        --------
        >>> entry = registry.get_dataset_entry('cdx_ig_5y')
        >>> print(entry.instrument)  # IDE autocomplete works
        'CDX.NA.IG'
        >>> print(entry.row_count)
        215
        """
        if name not in self._catalog:
            raise KeyError(f"Dataset '{name}' not found in registry")
        return DatasetEntry.from_dict(self._catalog[name])

    def list_datasets(
        self,
        instrument: str | None = None,
    ) -> list[str]:
        """
        List registered datasets, optionally filtered by instrument.

        Parameters
        ----------
        instrument : str, optional
            Filter by instrument (e.g., 'CDX.NA.IG', 'VIX').

        Returns
        -------
        list of str
            Sorted list of dataset names matching filters.

        Examples
        --------
        >>> registry.list_datasets(instrument='CDX.NA.IG')
        ['cdx_ig_10y', 'cdx_ig_5y']
        """
        datasets = []
        for name, info in self._catalog.items():
            if instrument and info.get("instrument") != instrument:
                continue
            datasets.append(name)
        return sorted(datasets)

    def find_dataset_by_security(self, security_id: str) -> str | None:
        """
        Find the most recent dataset for a specific security ID.

        Searches for datasets where metadata.params.security matches the
        provided security_id. Returns the most recently registered dataset
        if multiple matches exist.

        Parameters
        ----------
        security_id : str
            Security identifier (e.g., 'cdx_ig_5y', 'lqd', 'vix').

        Returns
        -------
        str or None
            Dataset name if found, None otherwise.

        Examples
        --------
        >>> registry.find_dataset_by_security('cdx_ig_5y')
        'cache_cdx_c3bedc49b771b0f2'
        >>> registry.find_dataset_by_security('vix')
        'cache_vix_d09015690dfa93d9'
        """
        matching_datasets = []

        for name, info in self._catalog.items():
            metadata = info.get("metadata", {})
            params = metadata.get("params", {})

            # Match by security ID in params
            if params.get("security") == security_id:
                matching_datasets.append(name)
            # For instruments without a security param (VIX), match by
            # security_id == instrument
            elif security_id == "vix" and info.get("instrument") == "vix":
                matching_datasets.append(name)

        if not matching_datasets:
            return None

        # Return most recent by registration timestamp
        # (ISO-8601 strings compare chronologically)
        return max(
            matching_datasets,
            key=lambda n: self._catalog[n].get("registered_at", ""),
        )

    def load_dataset_by_security(self, security_id: str) -> pd.DataFrame:
        """
        Find and load the most recent dataset for a specific security.

        Convenience method that combines find_dataset_by_security() with
        data loading from the registry.

        Parameters
        ----------
        security_id : str
            Security identifier (e.g., 'cdx_ig_5y', 'lqd', 'vix').

        Returns
        -------
        pd.DataFrame
            Loaded dataset with DatetimeIndex.

        Raises
        ------
        ValueError
            If no dataset found for the security ID.

        Examples
        --------
        >>> registry = DataRegistry(REGISTRY_PATH, DATA_DIR)
        >>> cdx_df = registry.load_dataset_by_security('cdx_ig_5y')
        >>> vix_df = registry.load_dataset_by_security('vix')
        """
        dataset_name = self.find_dataset_by_security(security_id)

        if dataset_name is None:
            raise ValueError(
                f"No dataset found for security '{security_id}'. "
                f"Available datasets: {', '.join(sorted(self._catalog.keys()))}"
            )

        info = self.get_dataset_info(dataset_name)
        return load_parquet(info["file_path"])

    def update_dataset_stats(self, name: str) -> None:
        """
        Refresh date range and row count statistics for a dataset.

        Parameters
        ----------
        name : str
            Dataset identifier.

        Raises
        ------
        KeyError
            If dataset not found in registry.
        FileNotFoundError
            If dataset file does not exist.
        """
        if name not in self._catalog:
            raise KeyError(f"Dataset '{name}' not found in registry")

        entry = self._catalog[name]
        file_path = self._resolve_path(entry["file_path"])

        if not file_path.exists():
            raise FileNotFoundError(f"Dataset file not found: {file_path}")

        df = load_parquet(file_path)

        if isinstance(df.index, pd.DatetimeIndex):
            entry["start_date"] = df.index.min().isoformat()
            entry["end_date"] = df.index.max().isoformat()
        entry["row_count"] = len(df)
        entry["last_updated"] = datetime.now().isoformat()

        self._save()

        logger.info(
            "Updated dataset stats: name=%s, rows=%d, date_range=%s to %s",
            name,
            len(df),
            entry["start_date"],
            entry["end_date"],
        )

    def remove_dataset(self, name: str, delete_file: bool = False) -> None:
        """
        Remove a dataset from the registry.

        Parameters
        ----------
        name : str
            Dataset identifier.
        delete_file : bool, default False
            If True, also delete the underlying Parquet file.

        Raises
        ------
        KeyError
            If dataset not found in registry.
        """
        if name not in self._catalog:
            raise KeyError(f"Dataset '{name}' not found in registry")

        if delete_file:
            file_path = self._resolve_path(self._catalog[name]["file_path"])
            if file_path.exists():
                file_path.unlink()
                logger.info(
                    "Deleted file for dataset: name=%s, path=%s", name, file_path
                )

        del self._catalog[name]
        self._save()
        logger.info("Removed dataset from registry: name=%s", name)

    def _save(self) -> None:
        """Persist registry catalog to JSON file."""
        save_json(self._catalog, self.registry_path)

    def __repr__(self) -> str:
        """String representation showing registry statistics."""
        return f"DataRegistry(path={self.registry_path}, datasets={len(self._catalog)})"
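
Taken together, the registry supports a simple register-then-query lifecycle. The following is a minimal usage sketch of the API above, not code shipped in the wheel; the paths, dataset name, and metadata values are illustrative.

from aponyx.data.registry import DataRegistry

# Illustrative locations: any writable catalog path and data root work.
registry = DataRegistry("data/registry.json", "data/")

# Register a Parquet file; date range and row count are extracted
# automatically when the file exists on disk.
registry.register_dataset(
    name="cdx_ig_5y",
    file_path="data/cdx_ig_5y.parquet",
    instrument="CDX.NA.IG",
    metadata={"source": "file", "frequency": "daily"},
)

# Query the catalog by instrument, then read typed metadata.
names = registry.list_datasets(instrument="CDX.NA.IG")
entry = registry.get_dataset_entry(names[0])
print(entry.row_count, entry.start_date, entry.end_date)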

aponyx/data/requirements.py
ADDED

@@ -0,0 +1,96 @@
"""
Signal data requirements resolution.

Determines what market data to load based on signal catalog configuration.
Bridges signal metadata (models layer) with data loading (data layer).
"""

import json
import logging
from pathlib import Path

logger = logging.getLogger(__name__)


def get_required_data_keys(signal_catalog_path: Path) -> set[str]:
    """
    Get union of all data keys required by enabled signals.

    Use this to determine what market data to load before computing signals.
    Reads signal catalog JSON directly without importing models layer.

    The correct workflow is:
    1. Get required data keys from catalog
    2. Load all required data into market_data dict
    3. Compute all enabled signals at once

    Parameters
    ----------
    signal_catalog_path : Path
        Path to signal catalog JSON file.

    Returns
    -------
    set[str]
        Set of data keys (e.g., {"cdx", "etf", "vix"}) required
        by all enabled signals.

    Raises
    ------
    FileNotFoundError
        If signal catalog file does not exist.
    ValueError
        If catalog JSON is invalid or missing required fields.

    Examples
    --------
    >>> from aponyx.config import SIGNAL_CATALOG_PATH
    >>> from aponyx.data.requirements import get_required_data_keys
    >>> data_keys = get_required_data_keys(SIGNAL_CATALOG_PATH)
    >>> # Load all required data
    >>> market_data = {}
    >>> for key in data_keys:
    ...     market_data[key] = load_data_for(key)
    >>> # Compute all signals
    >>> from aponyx.models import compute_registered_signals, SignalConfig
    >>> from aponyx.models.registry import SignalRegistry
    >>> registry = SignalRegistry(SIGNAL_CATALOG_PATH)
    >>> config = SignalConfig(lookback=20)
    >>> signals = compute_registered_signals(registry, market_data, config)
    """
    if not signal_catalog_path.exists():
        raise FileNotFoundError(f"Signal catalog not found: {signal_catalog_path}")

    # Load catalog JSON
    with open(signal_catalog_path, encoding="utf-8") as f:
        catalog_data = json.load(f)

    if not isinstance(catalog_data, list):
        raise ValueError("Signal catalog must be a JSON array")

    # Aggregate data requirements from enabled signals
    all_data_keys = set()

    for entry in catalog_data:
        # Skip disabled signals
        if not entry.get("enabled", True):
            continue

        # Get data requirements
        data_requirements = entry.get("data_requirements", {})
        if not isinstance(data_requirements, dict):
            raise ValueError(
                f"Signal '{entry.get('name', 'unknown')}' has invalid data_requirements. "
                f"Expected dict, got {type(data_requirements)}"
            )

        # Add all data keys
        all_data_keys.update(data_requirements.keys())

    logger.debug(
        "Required data keys from %d enabled signals: %s",
        sum(1 for e in catalog_data if e.get("enabled", True)),
        sorted(all_data_keys),
    )

    return all_data_keys
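
As a sketch of steps 1 and 2 of the workflow described in the docstring, the returned data keys can be bridged to registered datasets via DataRegistry.load_dataset_by_security. The key-to-security mapping below is hypothetical; the actual association depends on how datasets were registered or cached.

from aponyx.config import SIGNAL_CATALOG_PATH
from aponyx.data.registry import DataRegistry
from aponyx.data.requirements import get_required_data_keys

registry = DataRegistry("data/registry.json", "data/")  # illustrative paths
data_keys = get_required_data_keys(SIGNAL_CATALOG_PATH)  # e.g., {"cdx", "etf", "vix"}

# Hypothetical mapping from catalog data keys to registered security IDs.
key_to_security = {"cdx": "cdx_ig_5y", "etf": "lqd", "vix": "vix"}

# Step 2: load each required dataset into the market_data dict.
market_data = {
    key: registry.load_dataset_by_security(key_to_security[key])
    for key in data_keys
}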