alpha-engine-lib 0.32.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- alpha_engine_lib/__init__.py +3 -0
- alpha_engine_lib/agent_schemas.py +663 -0
- alpha_engine_lib/alerts.py +576 -0
- alpha_engine_lib/arcticdb.py +340 -0
- alpha_engine_lib/collector_results.py +69 -0
- alpha_engine_lib/cost.py +665 -0
- alpha_engine_lib/dates.py +273 -0
- alpha_engine_lib/decision_capture.py +462 -0
- alpha_engine_lib/ec2_spot.py +363 -0
- alpha_engine_lib/email_sender.py +206 -0
- alpha_engine_lib/eval_artifacts.py +361 -0
- alpha_engine_lib/logging.py +303 -0
- alpha_engine_lib/model_pricing.yaml +73 -0
- alpha_engine_lib/pillars.py +756 -0
- alpha_engine_lib/pipeline_status/__init__.py +70 -0
- alpha_engine_lib/pipeline_status/read.py +541 -0
- alpha_engine_lib/pipeline_status/registry.py +368 -0
- alpha_engine_lib/pipeline_status/templates.py +120 -0
- alpha_engine_lib/preflight.py +444 -0
- alpha_engine_lib/rag/__init__.py +39 -0
- alpha_engine_lib/rag/db.py +96 -0
- alpha_engine_lib/rag/embeddings.py +63 -0
- alpha_engine_lib/rag/migrations/0001_content_tsv.sql +39 -0
- alpha_engine_lib/rag/rerank.py +377 -0
- alpha_engine_lib/rag/retrieval.py +465 -0
- alpha_engine_lib/rag/schema.sql +65 -0
- alpha_engine_lib/reconcile.py +203 -0
- alpha_engine_lib/secrets.py +186 -0
- alpha_engine_lib/sources/__init__.py +35 -0
- alpha_engine_lib/sources/protocols.py +227 -0
- alpha_engine_lib/ssm_log_capture.py +274 -0
- alpha_engine_lib/telegram.py +165 -0
- alpha_engine_lib/trading_calendar.py +236 -0
- alpha_engine_lib/transparency.py +746 -0
- alpha_engine_lib/transparency_inventory.yaml +260 -0
- alpha_engine_lib/universe.py +83 -0
- alpha_engine_lib-0.32.0.dist-info/METADATA +217 -0
- alpha_engine_lib-0.32.0.dist-info/RECORD +40 -0
- alpha_engine_lib-0.32.0.dist-info/WHEEL +5 -0
- alpha_engine_lib-0.32.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,203 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Quantitative parity reconciliation for ``dict[str, DataFrame]`` price stores.
|
|
3
|
+
|
|
4
|
+
The SOTA observation substrate for contract-safe data-tier migrations: when a
|
|
5
|
+
consumer is being moved from one price source to another (e.g. the
|
|
6
|
+
``predictor/price_cache_slim/`` parquet tier -> the ArcticDB universe lib,
|
|
7
|
+
Wave 4 of the predictor/ S3 namespace rationalization), the cutover decision
|
|
8
|
+
must be **data-driven**, not eyeballed. This module turns "do the two sources
|
|
9
|
+
agree?" into a single auditable :class:`ParityReport` with:
|
|
10
|
+
|
|
11
|
+
- ticker-set symmetric difference (coverage),
|
|
12
|
+
- per-ticker row-count delta (shape),
|
|
13
|
+
- max absolute value delta over the *overlapping* dates/columns (fidelity),
|
|
14
|
+
- a binary ``passed`` keyed on an explicit epsilon.
|
|
15
|
+
|
|
16
|
+
:meth:`ParityReport.as_metrics` is JSON-able so the same object that decides a
|
|
17
|
+
gate can be emitted to the metrics surface and observed over a window before
|
|
18
|
+
the producer is retired. One implementation, reused by every consumer
|
|
19
|
+
migration PR (data macro-breadth, backtester exit-timing) and the final
|
|
20
|
+
deletion gate — no per-repo re-implementation of "are these prices equal".
|
|
21
|
+
|
|
22
|
+
Pure pandas; importable without the ``[arcticdb]`` extra (pandas is imported
|
|
23
|
+
lazily inside the function, mirroring the arcticdb module's contract).
|
|
24
|
+
"""
|
|
25
|
+
|
|
26
|
+
from __future__ import annotations
|
|
27
|
+
|
|
28
|
+
from dataclasses import dataclass
|
|
29
|
+
from typing import Mapping, Optional, Sequence, Tuple
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
@dataclass(frozen=True)
class ParityReport:
    """Immutable result of reconciling two ticker -> DataFrame price stores.

    Three views of the same outcome are exposed: ``passed`` (the boolean gate
    signal), ``as_metrics()`` (a JSON-able dict to log over the observation
    window) and ``summary()`` (a one-line string for operators).
    """

    # Coverage: tickers present in only one store, and those present in both.
    only_in_a: frozenset
    only_in_b: frozenset
    common: frozenset
    # Shape: ticker -> len(a) - len(b); only nonzero deltas are recorded.
    rowcount_deltas: Mapping[str, int]
    # Fidelity: largest absolute difference and the cell it was found in.
    max_abs_value_delta: float
    worst_cell: Optional[Tuple[str, str, str]]  # (ticker, column, iso-date)
    n_cells_over_epsilon: int
    n_cells_compared: int
    # Settings the comparison was run with.
    epsilon: float
    value_cols: Tuple[str, ...]
    require_ticker_match: bool
    require_rowcount_match: bool

    @property
    def ticker_sets_match(self) -> bool:
        """True when neither store holds a ticker the other lacks."""
        return not (self.only_in_a or self.only_in_b)

    @property
    def rowcounts_match(self) -> bool:
        """True when no common ticker recorded a row-count delta."""
        return len(self.rowcount_deltas) == 0

    @property
    def passed(self) -> bool:
        """Gate verdict: no cell over epsilon, plus any opted-in checks."""
        if self.n_cells_over_epsilon != 0:
            return False
        if self.require_ticker_match and not self.ticker_sets_match:
            return False
        if self.require_rowcount_match and not self.rowcounts_match:
            return False
        return True

    def summary(self) -> str:
        """Return the one-line operator string describing this report."""
        template = (
            "parity {verdict}: common={c} only_a={oa} only_b={ob} "
            "rowcount_mismatch_tickers={rc} max_abs_delta={mad:.3e} "
            "cells_over_eps={oe}/{tot} (eps={eps:.1e})"
        )
        fields = {
            "verdict": "PASS" if self.passed else "FAIL",
            "c": len(self.common),
            "oa": len(self.only_in_a),
            "ob": len(self.only_in_b),
            "rc": len(self.rowcount_deltas),
            "mad": self.max_abs_value_delta,
            "oe": self.n_cells_over_epsilon,
            "tot": self.n_cells_compared,
            "eps": self.epsilon,
        }
        return template.format(**fields)

    def as_metrics(self) -> dict:
        """Return a JSON-able metric dict for the observation gate / metrics surface.

        Set-valued and mapping fields are emitted sorted; tuples become lists.
        """
        deltas = self.rowcount_deltas
        worst = None if not self.worst_cell else list(self.worst_cell)

        # Verdicts first, then coverage, shape, fidelity and settings; the
        # insertion order below is the key order of the returned dict.
        metrics: dict = {
            "passed": self.passed,
            "ticker_sets_match": self.ticker_sets_match,
            "rowcounts_match": self.rowcounts_match,
        }
        metrics.update(
            n_common=len(self.common),
            n_only_in_a=len(self.only_in_a),
            n_only_in_b=len(self.only_in_b),
            only_in_a=sorted(self.only_in_a),
            only_in_b=sorted(self.only_in_b),
        )
        metrics.update(
            n_rowcount_mismatch_tickers=len(deltas),
            rowcount_deltas=dict(sorted(deltas.items())),
        )
        metrics.update(
            max_abs_value_delta=self.max_abs_value_delta,
            worst_cell=worst,
            n_cells_over_epsilon=self.n_cells_over_epsilon,
            n_cells_compared=self.n_cells_compared,
            epsilon=self.epsilon,
            value_cols=list(self.value_cols),
        )
        return metrics
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def reconcile_frame_dicts(
    a: Mapping[str, "object"],
    b: Mapping[str, "object"],
    *,
    value_cols: Sequence[str] = ("Close",),
    epsilon: float = 1e-6,
    require_ticker_match: bool = True,
    require_rowcount_match: bool = False,
) -> ParityReport:
    """Compare two ticker -> DataFrame price stores into a :class:`ParityReport`.

    Value fidelity is measured on the **intersection of dates** per common
    ticker (an inner join on the index), over the ``value_cols`` present in
    *both* frames; a column missing from either frame is skipped for that
    ticker. Row-count deltas are reported separately and, by default, do
    **not** fail the gate: the question answered is "do the sources agree
    where they overlap", not "are the artifacts shaped identically". Set
    ``require_rowcount_match=True`` for stricter contexts.

    Cells where both sides are NaN (after numeric coercion) are excluded from
    the comparison; a NaN on exactly one side counts as a mismatch.

    Args:
        a, b: ticker -> pandas DataFrame (DatetimeIndex). Conventionally
            ``a`` = incumbent source, ``b`` = candidate source.
        value_cols: columns compared for numeric equality. A single column
            may be given as a bare string.
        epsilon: absolute tolerance; a cell counts as a mismatch when
            ``abs(a - b) > epsilon``. Must be a non-negative number.
        require_ticker_match: include ticker-set symmetry in ``passed``.
        require_rowcount_match: include row-count equality in ``passed``.

    Raises:
        ValueError: if ``epsilon`` is negative or NaN.
    """
    import pandas as pd  # lazy: keeps module importable without the extra

    # A NaN epsilon makes every ``diff > epsilon`` False, so value mismatches
    # would never be counted; a negative one flags even identical cells.
    # Both are caller bugs, so fail loudly.
    if not epsilon >= 0:
        raise ValueError(f"epsilon must be a non-negative number, got {epsilon!r}")

    # A bare string is itself a sequence of characters: tuple("Close") would
    # yield ('C', 'l', 'o', 's', 'e'), no column would match, and the report
    # would pass with zero cells compared. Treat it as one column name.
    cols = (value_cols,) if isinstance(value_cols, str) else tuple(value_cols)

    keys_a = set(a)
    keys_b = set(b)
    common = keys_a & keys_b

    rowcount_deltas: dict = {}
    max_abs = 0.0
    worst_cell: Optional[Tuple[str, str, str]] = None
    n_over = 0
    n_compared = 0

    # Sorted iteration keeps worst_cell deterministic when several cells tie.
    for ticker in sorted(common):
        fa = a[ticker]
        fb = b[ticker]

        delta = len(fa) - len(fb)
        if delta != 0:
            rowcount_deltas[ticker] = delta

        idx = fa.index.intersection(fb.index)
        if len(idx) == 0:
            continue
        for col in cols:
            if col not in fa.columns or col not in fb.columns:
                continue
            # Non-numeric entries become NaN and follow the NaN rules below.
            sa = pd.to_numeric(fa.loc[idx, col], errors="coerce")
            sb = pd.to_numeric(fb.loc[idx, col], errors="coerce")
            diff = (sa - sb).abs()
            # NaN on either side -> treat as comparable only where both
            # present; an asymmetric NaN is a real mismatch.
            both_nan = sa.isna() & sb.isna()
            one_nan = sa.isna() ^ sb.isna()
            diff = diff.where(~both_nan)
            n_compared += int((~both_nan).sum())

            over = (diff > epsilon) | one_nan
            n_over += int(over.sum())

            # Only cells with a finite-or-inf numeric difference can set the
            # maximum; asymmetric-NaN cells have no magnitude to report.
            valid = diff.dropna()
            if not valid.empty:
                cell_max = float(valid.max())
                if cell_max > max_abs:
                    max_abs = cell_max
                    worst_dt = valid.idxmax()
                    worst_cell = (
                        ticker,
                        col,
                        # Timestamps expose isoformat(); fall back to str()
                        # for any other index label type.
                        getattr(worst_dt, "isoformat", lambda: str(worst_dt))(),
                    )

    return ParityReport(
        only_in_a=frozenset(keys_a - keys_b),
        only_in_b=frozenset(keys_b - keys_a),
        common=frozenset(common),
        rowcount_deltas=rowcount_deltas,
        max_abs_value_delta=max_abs,
        worst_cell=worst_cell,
        n_cells_over_epsilon=n_over,
        n_cells_compared=n_compared,
        epsilon=epsilon,
        value_cols=cols,
        require_ticker_match=require_ticker_match,
        require_rowcount_match=require_rowcount_match,
    )
|
|
@@ -0,0 +1,186 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Per-key SSM-backed secret fetcher for Alpha Engine modules.
|
|
3
|
+
|
|
4
|
+
This module is the consolidation point for secret resolution across all six
|
|
5
|
+
alpha-engine repos. Before this, each repo duplicated some variant of the
|
|
6
|
+
``ssm_secrets.py`` bulk-load-into-os.environ pattern (alpha-engine-data has
|
|
7
|
+
the canonical one). Going forward, callers do::
|
|
8
|
+
|
|
9
|
+
from alpha_engine_lib.secrets import get_secret
|
|
10
|
+
|
|
11
|
+
api_key = get_secret("ANTHROPIC_API_KEY") # required → raises if absent
|
|
12
|
+
opt_key = get_secret("DEBUG_TRACE_KEY", required=False) # optional → returns None
|
|
13
|
+
|
|
14
|
+
**Resolution order** (first hit wins):
|
|
15
|
+
|
|
16
|
+
1. Per-process cache (populated on first read; thread-safe).
|
|
17
|
+
2. ``ALPHA_ENGINE_SECRETS_SOURCE`` env-var toggle:
|
|
18
|
+
|
|
19
|
+
- ``env`` → ``os.environ[name]`` only (local-dev escape hatch — never hit SSM)
|
|
20
|
+
- ``ssm`` → SSM only (production strictness — no silent env fallback)
|
|
21
|
+
- unset / ``auto`` / anything else → SSM first, ``os.environ`` fallback
|
|
22
|
+
|
|
23
|
+
3. SSM at ``/alpha-engine/{name}`` (under :data:`SSM_PREFIX`).
|
|
24
|
+
4. ``os.environ[name]`` (the fallback in ``auto`` mode).
|
|
25
|
+
5. ``default`` arg if provided.
|
|
26
|
+
6. :exc:`SecretNotFoundError` if ``required=True``; else ``None``.
|
|
27
|
+
|
|
28
|
+
**Caching design.** Per-process dict keyed on secret name. The SSM round-trip
|
|
29
|
+
happens at most once per name per process; subsequent calls hit the dict.
|
|
30
|
+
Lambda cold-starts pay the round-trip once; warm invocations reuse the cache.
|
|
31
|
+
:func:`clear_cache` is exposed for tests.
|
|
32
|
+
|
|
33
|
+
**SSM-unavailable latch.** If the first SSM call fails (no boto3, no creds,
|
|
34
|
+
network error), latch ``_ssm_unavailable = True`` and skip SSM for the rest of
|
|
35
|
+
the process. Avoids repeated multi-second timeouts in local dev. Reset via
|
|
36
|
+
:func:`clear_cache` if a test needs to re-probe SSM.
|
|
37
|
+
|
|
38
|
+
**Migration arc**: ``alpha-engine-config/private-docs/ROADMAP.md`` line ~2780
|
|
39
|
+
(Deprecate ``.env`` entirely). Plan doc:
|
|
40
|
+
``alpha-engine-docs/private/env-to-ssm-260512.md``.
|
|
41
|
+
"""
|
|
42
|
+
|
|
43
|
+
from __future__ import annotations
|
|
44
|
+
|
|
45
|
+
import logging
|
|
46
|
+
import os
|
|
47
|
+
import threading
|
|
48
|
+
from typing import Final
|
|
49
|
+
|
|
50
|
+
logger = logging.getLogger(__name__)
|
|
51
|
+
|
|
52
|
+
SSM_PREFIX: Final[str] = "/alpha-engine/"
|
|
53
|
+
SOURCE_TOGGLE_ENV: Final[str] = "ALPHA_ENGINE_SECRETS_SOURCE"
|
|
54
|
+
|
|
55
|
+
_cache: dict[str, str] = {}
|
|
56
|
+
_cache_lock = threading.Lock()
|
|
57
|
+
_ssm_unavailable = False
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
class SecretNotFoundError(LookupError):
    """Signal that a required secret could not be resolved.

    Raised when the secret is absent from both SSM and the environment.
    Subclasses :class:`LookupError`.
    """
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def get_secret(
    name: str,
    *,
    required: bool = True,
    default: str | None = None,
) -> str | None:
    """Fetch a secret by ``name`` from SSM with environment fallback.

    Resolution order (first hit wins): the per-process cache; SSM at
    ``SSM_PREFIX + name`` (modes ``auto`` and ``ssm``); ``os.environ[name]``
    (modes ``auto`` and ``env``); ``default`` if it is not ``None``; finally
    :exc:`SecretNotFoundError` when ``required`` is true, else ``None``.

    The mode is read from the environment variable named by
    ``SOURCE_TOGGLE_ENV`` on every call. Surrounding whitespace and case are
    ignored; an unset or empty value means ``auto``; any other unrecognised
    value logs a warning and is treated as ``auto``.

    Only values actually resolved from SSM or the environment are cached;
    a returned ``default`` is not.

    :param name: Secret name (no prefix). E.g. ``"POLYGON_API_KEY"``.
    :param required: If ``True`` (default), raise :exc:`SecretNotFoundError`
        when the secret is absent and no ``default`` was supplied. If
        ``False``, return ``default`` (or ``None`` if ``default`` is unset).
    :param default: Value to return if the secret is absent. A non-``None``
        default takes precedence over ``required`` — it is returned instead
        of raising.
    :raises SecretNotFoundError: When ``required=True``, ``default`` is
        ``None``, and the secret is absent from cache, SSM, and
        ``os.environ`` (as permitted by the active mode).
    :raises ValueError: When ``name`` is empty or contains a forward slash.
    """
    if not name:
        raise ValueError("secret name must be non-empty")
    if "/" in name:
        raise ValueError(
            f"secret name must not contain '/': got {name!r} "
            f"(the SSM_PREFIX is added automatically)"
        )

    with _cache_lock:
        cached = _cache.get(name)
    if cached is not None:
        return cached

    # Strip before matching: a stray space (e.g. "ssm " from a shell export
    # or .env file) would otherwise fall through to "auto" and silently
    # re-enable the env fallback that "ssm" exists to forbid. An empty
    # assignment is treated like unset rather than warned about on each call.
    source = os.environ.get(SOURCE_TOGGLE_ENV, "").strip().lower() or "auto"
    if source not in ("auto", "env", "ssm"):
        logger.warning(
            "unknown %s=%r — falling back to 'auto'", SOURCE_TOGGLE_ENV, source
        )
        source = "auto"

    value: str | None = None

    if source in ("auto", "ssm"):
        value = _fetch_from_ssm(name)

    if value is None and source in ("auto", "env"):
        value = os.environ.get(name)

    if value is None:
        # A supplied default wins over `required`; see the docstring.
        if default is not None:
            return default
        if required:
            raise SecretNotFoundError(
                f"secret {name!r} not found in cache, SSM ({SSM_PREFIX}{name}), "
                f"or environment (source={source!r})"
            )
        return None

    with _cache_lock:
        _cache[name] = value
    return value
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def clear_cache() -> None:
    """Forget every cached secret and allow SSM to be probed again.

    Intended mainly for tests; production code should not need it. Empties
    the per-process secret cache and resets the ``_ssm_unavailable`` latch.
    """
    global _ssm_unavailable
    with _cache_lock:
        _ssm_unavailable = False
        _cache.clear()
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def _fetch_from_ssm(name: str) -> str | None:
    """Read one parameter from SSM; return ``None`` on miss or unavailability.

    Any failure other than ``ParameterNotFound`` (boto3 missing, a client
    error, a botocore error) sets the module-level ``_ssm_unavailable`` latch
    so later calls return ``None`` without contacting SSM.
    """
    global _ssm_unavailable
    if _ssm_unavailable:
        return None

    try:
        import boto3
        from botocore.exceptions import BotoCoreError, ClientError
    except ImportError:
        logger.debug("boto3 not installed — skipping SSM for %s", name)
        _ssm_unavailable = True
        return None

    # AWS_REGION wins when set and non-empty; otherwise AWS_DEFAULT_REGION,
    # defaulting to us-east-1 when that is unset too.
    region = os.environ.get("AWS_REGION")
    if not region:
        region = os.environ.get("AWS_DEFAULT_REGION", "us-east-1")

    try:
        ssm = boto3.client("ssm", region_name=region)
        response = ssm.get_parameter(
            Name=f"{SSM_PREFIX}{name}",
            WithDecryption=True,
        )
    except ClientError as exc:
        error_code = exc.response.get("Error", {}).get("Code", "")
        if error_code == "ParameterNotFound":
            # A genuine miss says nothing about SSM availability, so do not
            # latch: other secrets may still resolve.
            logger.debug("SSM miss for %s (ParameterNotFound)", name)
            return None
        reason = error_code or "unknown"
    except BotoCoreError as exc:
        reason = type(exc).__name__
    else:
        return response["Parameter"]["Value"]

    # Reached only for failures that indicate SSM itself is unusable.
    logger.warning(
        "SSM read for %s failed (%s) — latching unavailable for this process",
        name,
        reason,
    )
    _ssm_unavailable = True
    return None
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
"""Shared data-source contracts — Pydantic shapes + Protocols.
|
|
2
|
+
|
|
3
|
+
This package defines the canonical normalized shapes (``NewsArticle``,
|
|
4
|
+
``AnalystSnapshot``, ``FilingDocument``) and the adapter Protocols
|
|
5
|
+
(``NewsSource``, ``AnalystSource``, ``FilingSource``) that both
|
|
6
|
+
producers (alpha-engine-data) and consumers (alpha-engine-research,
|
|
7
|
+
alpha-engine-backtester) consume.
|
|
8
|
+
|
|
9
|
+
**Architectural pattern:** Lib defines the contract; producers
|
|
10
|
+
(alpha-engine-data) implement concrete adapters; consumers
|
|
11
|
+
(alpha-engine-research) read records produced by them via S3 / RAG
|
|
12
|
+
retrieval and never import adapter classes directly.
|
|
13
|
+
|
|
14
|
+
See the institutional data-revamp plan doc at
|
|
15
|
+
``~/Development/alpha-engine-docs/private/data-revamp-260513.md`` for
|
|
16
|
+
full context. PR α — lib (this); PR β — data (adapter implementations).
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
from alpha_engine_lib.sources.protocols import (
|
|
20
|
+
AnalystSnapshot,
|
|
21
|
+
AnalystSource,
|
|
22
|
+
FilingDocument,
|
|
23
|
+
FilingSource,
|
|
24
|
+
NewsArticle,
|
|
25
|
+
NewsSource,
|
|
26
|
+
)
|
|
27
|
+
|
|
28
|
+
__all__ = [
|
|
29
|
+
"NewsArticle",
|
|
30
|
+
"AnalystSnapshot",
|
|
31
|
+
"FilingDocument",
|
|
32
|
+
"NewsSource",
|
|
33
|
+
"AnalystSource",
|
|
34
|
+
"FilingSource",
|
|
35
|
+
]
|
|
@@ -0,0 +1,227 @@
|
|
|
1
|
+
"""Source-substrate Protocols + normalized Pydantic shapes.
|
|
2
|
+
|
|
3
|
+
Wave 1 PR α of the institutional data revamp (see
|
|
4
|
+
``~/Development/alpha-engine-docs/private/data-revamp-260513.md``). Each
|
|
5
|
+
data slot (news, filings, analyst, alt data) becomes a Protocol with
|
|
6
|
+
multiple adapters implementing it; concrete adapters live in
|
|
7
|
+
alpha-engine-data (producer-side) and consumers (alpha-engine-research)
|
|
8
|
+
never import them directly — they read producer outputs via S3 + the
|
|
9
|
+
shared RAG retrieval API.
|
|
10
|
+
|
|
11
|
+
Why Protocols over ABCs:
|
|
12
|
+
|
|
13
|
+
- Structural subtyping — third-party SDK wrappers can satisfy without
|
|
14
|
+
inheriting from our base class.
|
|
15
|
+
- Runtime ``isinstance`` checks via ``runtime_checkable`` for explicit gating
|
|
16
|
+
in the aggregator.
|
|
17
|
+
- No vtable overhead in hot loops.
|
|
18
|
+
|
|
19
|
+
Why Pydantic shapes (not raw dicts):
|
|
20
|
+
|
|
21
|
+
- Cross-vendor schema normalization is brittle. Pydantic gives us a
|
|
22
|
+
single canonical shape with validation at the adapter boundary —
|
|
23
|
+
adapter bugs surface as ``ValidationError``, not as silent
|
|
24
|
+
``KeyError`` in downstream NLP three layers deeper.
|
|
25
|
+
- ``extra='forbid'`` ensures vendor schema drift surfaces immediately
|
|
26
|
+
instead of leaking unmapped fields through the pipeline.
|
|
27
|
+
- ``frozen=True`` so records are hashable + safe to share across
|
|
28
|
+
threads / fan-out workers without defensive copies.
|
|
29
|
+
"""
|
|
30
|
+
|
|
31
|
+
from __future__ import annotations
|
|
32
|
+
|
|
33
|
+
from datetime import datetime
|
|
34
|
+
from typing import Protocol, runtime_checkable
|
|
35
|
+
|
|
36
|
+
from pydantic import BaseModel, ConfigDict, Field
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
# ── Normalized shapes ──────────────────────────────────────────────────
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
class NewsArticle(BaseModel):
    """One news article, normalized across all vendors.

    The canonical key for cross-vendor dedup is the composite
    (normalized title, URL host+path hash). Different vendors syndicate
    the same wire story; the aggregator's dedup catches both the URL-
    collapse case (different querystrings) and the
    title-paraphrase-on-same-URL case.

    Required fields (no default): ``tickers``, ``title``, ``body_excerpt``,
    ``url``, ``published_at``, ``source``, ``fetched_at``. Optional fields:
    ``vendor_article_id``, ``headline_authors``, ``tags``.
    """

    # Instances are immutable and fields not declared below are rejected.
    model_config = ConfigDict(frozen=True, extra="forbid")

    tickers: tuple[str, ...] = Field(
        description="Tickers this article concerns. Multi-ticker articles "
        "(e.g. sector pieces) are kept as a single record with "
        "a multi-element tuple; the aggregator's ticker-union "
        "logic merges variants. Adapter's choice: emit once per "
        "(article, ticker) or once with the full set."
    )
    title: str
    body_excerpt: str = Field(
        description="Lead paragraph or summary. Full-text body lives in the "
        "RAG corpus chunk store, not in this struct — chunked at "
        "embedding time by the ingest pipeline."
    )
    url: str
    # NOTE(review): described as UTC, but the annotation is a plain
    # ``datetime``; nothing in this class enforces timezone awareness —
    # confirm adapters normalize before constructing the record.
    published_at: datetime = Field(
        description="UTC publish time. Vendor wall-clock; ingest-time is "
        "in `fetched_at`."
    )
    # Free-form string: the slugs listed in the description are not
    # validated here.
    source: str = Field(
        description="Vendor slug: 'polygon', 'gdelt', 'yahoo_rss', "
        "'edgar_press', 'benzinga' (paid), 'bloomberg' (paid), "
        "'ravenpack' (paid). Joins onto the trust-weight config "
        "downstream."
    )
    vendor_article_id: str | None = Field(
        default=None,
        description="Vendor-native unique ID for cross-reference back to "
        "the source system (Polygon `id`, GDELT `GKGRECORDID`, "
        "etc.). Used by ingest-side idempotency checks.",
    )
    fetched_at: datetime = Field(
        description="When this adapter pulled the article (UTC). For "
        "freshness audit + cache-age computation."
    )
    headline_authors: tuple[str, ...] | None = Field(
        default=None,
        description="Bylines if available. None if the source doesn't "
        "expose authors (e.g. wire feeds).",
    )
    # Defaults to an empty tuple when the adapter supplies no tags.
    tags: tuple[str, ...] = Field(
        default_factory=tuple,
        description="Vendor-supplied topic / event tags. GDELT emits "
        "structured event codes; Polygon emits keywords; "
        "Benzinga emits Channels. Used as a soft signal for "
        "downstream event-flag extraction.",
    )
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
class AnalystSnapshot(BaseModel):
    """One vendor's analyst consensus snapshot for one ticker at one
    point in time.

    Time-series of these in S3 drives self-derived revisions tracking
    (see ``alpha-engine-data/data/derived/revisions.py``, PR C).
    Adapter is responsible for vendor-string normalization at the
    boundary — downstream consumers see the canonical 5-class
    consensus_rating ladder.

    Only ``ticker``, ``source`` and ``fetched_at`` are required; every
    consensus field defaults to ``None`` (or an empty tuple) so a vendor
    coverage gap can be represented explicitly.
    """

    # Instances are immutable and fields not declared below are rejected.
    model_config = ConfigDict(frozen=True, extra="forbid")

    ticker: str
    source: str
    fetched_at: datetime
    # Typed as a plain ``str``: the five listed values are a convention the
    # adapter must uphold, not something validated by this model.
    consensus_rating: str | None = Field(
        default=None,
        description="Categorical: 'strongBuy' | 'buy' | 'hold' | 'sell' | "
        "'strongSell'. Vendor strings normalized at adapter "
        "boundary.",
    )
    mean_target: float | None = Field(
        default=None, description="Mean price target (USD)."
    )
    median_target: float | None = Field(
        default=None, description="Median price target if vendor exposes it."
    )
    num_analysts: int | None = Field(
        default=None, description="Number of contributing analysts."
    )
    # The entry keys named in the description are not enforced (plain dict).
    # NOTE(review): dict entries are unhashable and mutable, so a snapshot
    # with a non-empty tuple here presumably cannot be hashed and is not
    # deeply immutable despite ``frozen=True`` — confirm whether any caller
    # relies on hashing these records.
    rating_changes_30d: tuple[dict, ...] = Field(
        default_factory=tuple,
        description="Recent upgrades/downgrades. Each entry: "
        "{analyst, firm, action, prior_rating, new_rating, "
        "date}. Used by downstream NLP/event-flag extraction.",
    )
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
class FilingDocument(BaseModel):
    """One filing document. Filings substrate (PR B). Pinned here so
    PR α can reference the shape from Protocols without forward refs.

    ``title`` is optional and ``source`` defaults to ``"edgar"``; every
    other field is required.
    """

    # Instances are immutable and fields not declared below are rejected.
    model_config = ConfigDict(frozen=True, extra="forbid")

    ticker: str
    # Free-form string: the form types listed in the description are
    # examples, not a validated set.
    form_type: str = Field(
        description="'10-K' | '10-Q' | '8-K' | '14A' | 'DEF' | 'S-1' | "
        "'S-4' | '13D' | '13G' | '13F' | 'Form 4' | etc."
    )
    # NOTE(review): annotated as ``datetime`` although named a date —
    # confirm what time-of-day / timezone adapters are expected to supply.
    filed_date: datetime
    accession_number: str = Field(
        description="EDGAR accession (e.g. '0000320193-25-000001'). "
        "Canonical key for filing dedup + RAG idempotency check."
    )
    title: str | None = None
    url: str
    source: str = "edgar"
    fetched_at: datetime
    body_excerpt: str = Field(
        description="Lead snippet. Full body goes to the RAG corpus, "
        "chunked at ingest time."
    )
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
# ── Protocols ──────────────────────────────────────────────────────────
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
@runtime_checkable
class NewsSource(Protocol):
    """Contract for a news adapter.

    An adapter is a vendor-specific transport whose job is to hand back
    normalized :class:`NewsArticle` records.

    Requirements on every adapter:

    - Safe to call from concurrent contexts: it owns its HTTP client and
      rate-limiter and shares no mutable state.
    - Transient vendor failures yield an empty list, never an exception.
      Only auth failures, contract-breaking schema drift, and
      configuration errors are re-raised — those fail loud.
    - Wall-clock timestamps are normalized to UTC.
    - ``fetched_at`` is stamped on every article returned.
    """

    # Vendor slug — joins onto trust-weight config.
    name: str

    def fetch(self, tickers: list[str], *, hours: int = 48) -> list[NewsArticle]:
        ...
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
@runtime_checkable
class AnalystSource(Protocol):
    """Contract for an analyst-data adapter. PR C.

    ``fetch`` yields ``None`` when the vendor does not cover the ticker
    (e.g. micro-caps absent from FMP). When a snapshot is returned, empty
    fields inside it represent vendor-side coverage gaps, which the
    snapshot shape types explicitly.
    """

    name: str

    def fetch(self, ticker: str) -> AnalystSnapshot | None:
        ...
|
|
208
|
+
|
|
209
|
+
|
|
210
|
+
@runtime_checkable
class FilingSource(Protocol):
    """Contract for a filings adapter. PR B.

    The ``form_types`` filter is advisory: an adapter that cannot
    pre-filter by form type at the vendor layer returns every form and
    leaves the filtering to downstream code.
    """

    name: str

    def fetch(
        self, tickers: list[str], *, form_types: list[str] | None = None, days: int = 7
    ) -> list[FilingDocument]:
        ...
|