alpha-engine-lib 0.32.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. alpha_engine_lib/__init__.py +3 -0
  2. alpha_engine_lib/agent_schemas.py +663 -0
  3. alpha_engine_lib/alerts.py +576 -0
  4. alpha_engine_lib/arcticdb.py +340 -0
  5. alpha_engine_lib/collector_results.py +69 -0
  6. alpha_engine_lib/cost.py +665 -0
  7. alpha_engine_lib/dates.py +273 -0
  8. alpha_engine_lib/decision_capture.py +462 -0
  9. alpha_engine_lib/ec2_spot.py +363 -0
  10. alpha_engine_lib/email_sender.py +206 -0
  11. alpha_engine_lib/eval_artifacts.py +361 -0
  12. alpha_engine_lib/logging.py +303 -0
  13. alpha_engine_lib/model_pricing.yaml +73 -0
  14. alpha_engine_lib/pillars.py +756 -0
  15. alpha_engine_lib/pipeline_status/__init__.py +70 -0
  16. alpha_engine_lib/pipeline_status/read.py +541 -0
  17. alpha_engine_lib/pipeline_status/registry.py +368 -0
  18. alpha_engine_lib/pipeline_status/templates.py +120 -0
  19. alpha_engine_lib/preflight.py +444 -0
  20. alpha_engine_lib/rag/__init__.py +39 -0
  21. alpha_engine_lib/rag/db.py +96 -0
  22. alpha_engine_lib/rag/embeddings.py +63 -0
  23. alpha_engine_lib/rag/migrations/0001_content_tsv.sql +39 -0
  24. alpha_engine_lib/rag/rerank.py +377 -0
  25. alpha_engine_lib/rag/retrieval.py +465 -0
  26. alpha_engine_lib/rag/schema.sql +65 -0
  27. alpha_engine_lib/reconcile.py +203 -0
  28. alpha_engine_lib/secrets.py +186 -0
  29. alpha_engine_lib/sources/__init__.py +35 -0
  30. alpha_engine_lib/sources/protocols.py +227 -0
  31. alpha_engine_lib/ssm_log_capture.py +274 -0
  32. alpha_engine_lib/telegram.py +165 -0
  33. alpha_engine_lib/trading_calendar.py +236 -0
  34. alpha_engine_lib/transparency.py +746 -0
  35. alpha_engine_lib/transparency_inventory.yaml +260 -0
  36. alpha_engine_lib/universe.py +83 -0
  37. alpha_engine_lib-0.32.0.dist-info/METADATA +217 -0
  38. alpha_engine_lib-0.32.0.dist-info/RECORD +40 -0
  39. alpha_engine_lib-0.32.0.dist-info/WHEEL +5 -0
  40. alpha_engine_lib-0.32.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,203 @@
1
+ """
2
+ Quantitative parity reconciliation for ``dict[str, DataFrame]`` price stores.
3
+
4
+ The SOTA observation substrate for contract-safe data-tier migrations: when a
5
+ consumer is being moved from one price source to another (e.g. the
6
+ ``predictor/price_cache_slim/`` parquet tier -> the ArcticDB universe lib,
7
+ Wave 4 of the predictor/ S3 namespace rationalization), the cutover decision
8
+ must be **data-driven**, not eyeballed. This module turns "do the two sources
9
+ agree?" into a single auditable :class:`ParityReport` with:
10
+
11
+ - ticker-set symmetric difference (coverage),
12
+ - per-ticker row-count delta (shape),
13
+ - max absolute value delta over the *overlapping* dates/columns (fidelity),
14
+ - a binary ``passed`` keyed on an explicit epsilon.
15
+
16
+ :meth:`ParityReport.as_metrics` is JSON-able so the same object that decides a
17
+ gate can be emitted to the metrics surface and observed over a window before
18
+ the producer is retired. One implementation, reused by every consumer
19
+ migration PR (data macro-breadth, backtester exit-timing) and the final
20
+ deletion gate — no per-repo re-implementation of "are these prices equal".
21
+
22
+ Pure pandas; importable without the ``[arcticdb]`` extra (pandas is imported
23
+ lazily inside the function, mirroring the arcticdb module's contract).
24
+ """
25
+
26
+ from __future__ import annotations
27
+
28
+ from dataclasses import dataclass
29
+ from typing import Mapping, Optional, Sequence, Tuple
30
+
31
+
32
@dataclass(frozen=True)
class ParityReport:
    """Immutable result of reconciling two ticker -> DataFrame price stores.

    Read ``passed`` for the gate decision, ``as_metrics()`` for the
    JSON-able payload to log across an observation window, and
    ``summary()`` for a single-line, operator-facing description.
    """

    only_in_a: frozenset
    only_in_b: frozenset
    common: frozenset
    rowcount_deltas: Mapping[str, int]  # ticker -> len(a)-len(b), nonzero only
    max_abs_value_delta: float
    worst_cell: Optional[Tuple[str, str, str]]  # (ticker, column, iso-date)
    n_cells_over_epsilon: int
    n_cells_compared: int
    epsilon: float
    value_cols: Tuple[str, ...]
    require_ticker_match: bool
    require_rowcount_match: bool

    @property
    def ticker_sets_match(self) -> bool:
        """True when neither store holds a ticker the other lacks."""
        return not (self.only_in_a or self.only_in_b)

    @property
    def rowcounts_match(self) -> bool:
        """True when no common ticker recorded a row-count delta."""
        return not self.rowcount_deltas

    @property
    def passed(self) -> bool:
        """Gate verdict: no cell over epsilon, plus any opted-in checks."""
        verdict = self.n_cells_over_epsilon == 0
        if self.require_ticker_match:
            verdict = verdict and self.ticker_sets_match
        if self.require_rowcount_match:
            verdict = verdict and self.rowcounts_match
        return verdict

    def summary(self) -> str:
        """Render the report as a one-line operator string."""
        template = (
            "parity {verdict}: common={c} only_a={oa} only_b={ob} "
            "rowcount_mismatch_tickers={rc} max_abs_delta={mad:.3e} "
            "cells_over_eps={oe}/{tot} (eps={eps:.1e})"
        )
        fields = {
            "verdict": "PASS" if self.passed else "FAIL",
            "c": len(self.common),
            "oa": len(self.only_in_a),
            "ob": len(self.only_in_b),
            "rc": len(self.rowcount_deltas),
            "mad": self.max_abs_value_delta,
            "oe": self.n_cells_over_epsilon,
            "tot": self.n_cells_compared,
            "eps": self.epsilon,
        }
        return template.format(**fields)

    def as_metrics(self) -> dict:
        """JSON-able metric dict for the observation gate / metrics surface."""
        # Sets and mappings are emitted in sorted order so repeated runs over
        # the same inputs serialize identically.
        metrics: dict = {}
        metrics["passed"] = self.passed
        metrics["ticker_sets_match"] = self.ticker_sets_match
        metrics["rowcounts_match"] = self.rowcounts_match
        metrics["n_common"] = len(self.common)
        metrics["n_only_in_a"] = len(self.only_in_a)
        metrics["n_only_in_b"] = len(self.only_in_b)
        metrics["only_in_a"] = sorted(self.only_in_a)
        metrics["only_in_b"] = sorted(self.only_in_b)
        metrics["n_rowcount_mismatch_tickers"] = len(self.rowcount_deltas)
        metrics["rowcount_deltas"] = dict(sorted(self.rowcount_deltas.items()))
        metrics["max_abs_value_delta"] = self.max_abs_value_delta
        metrics["worst_cell"] = list(self.worst_cell) if self.worst_cell else None
        metrics["n_cells_over_epsilon"] = self.n_cells_over_epsilon
        metrics["n_cells_compared"] = self.n_cells_compared
        metrics["epsilon"] = self.epsilon
        metrics["value_cols"] = list(self.value_cols)
        return metrics
107
+
108
+
109
def reconcile_frame_dicts(
    a: Mapping[str, "object"],
    b: Mapping[str, "object"],
    *,
    value_cols: Sequence[str] = ("Close",),
    epsilon: float = 1e-6,
    require_ticker_match: bool = True,
    require_rowcount_match: bool = False,
) -> ParityReport:
    """Compare two ticker -> DataFrame price stores into a :class:`ParityReport`.

    Value fidelity is measured on the **intersection of dates** per common
    ticker (an inner join on the DatetimeIndex), over the ``value_cols``
    present in *both* frames. Row-count deltas are reported separately and,
    by default, do **not** fail the gate: a slim-cache tail slice and an
    ArcticDB ``date_range`` read legitimately differ by a few boundary rows
    while being bit-identical on the overlap — the migration question is
    "do they agree where they overlap", not "are the artifacts shaped
    identically". Set ``require_rowcount_match=True`` for stricter contexts.

    A common ticker whose indexes do not overlap, or a value column absent
    from either frame, contributes **zero** compared cells and therefore
    cannot fail the value check. Callers that need a non-vacuous gate should
    also inspect ``n_cells_compared`` on the returned report.

    Args:
        a, b: ticker -> pandas DataFrame (DatetimeIndex). Conventionally
            ``a`` = incumbent source, ``b`` = candidate source.
        value_cols: columns compared for numeric equality. A bare string is
            treated as a single column name rather than iterated
            character by character.
        epsilon: absolute tolerance; a cell counts as a mismatch when
            ``abs(a - b) > epsilon``.
        require_ticker_match: include ticker-set symmetry in ``passed``.
        require_rowcount_match: include row-count equality in ``passed``.

    Returns:
        A :class:`ParityReport` describing coverage, shape and value deltas.
    """
    import pandas as pd  # lazy: keeps module importable without the extra

    # ``str`` is itself a Sequence[str]; ``tuple("Close")`` would yield
    # ("C", "l", "o", "s", "e"), every "column" would be skipped below, and
    # the report would pass with nothing compared.
    cols = (value_cols,) if isinstance(value_cols, str) else tuple(value_cols)

    keys_a = set(a)
    keys_b = set(b)
    common = keys_a & keys_b

    rowcount_deltas: dict = {}
    max_abs = 0.0
    worst_cell: Optional[Tuple[str, str, str]] = None
    n_over = 0
    n_compared = 0

    # Sorted iteration keeps tie-breaking for ``worst_cell`` deterministic.
    for ticker in sorted(common):
        fa = a[ticker]
        fb = b[ticker]

        delta = len(fa) - len(fb)
        if delta != 0:
            rowcount_deltas[ticker] = delta

        idx = fa.index.intersection(fb.index)
        if len(idx) == 0:
            continue
        for col in cols:
            if col not in fa.columns or col not in fb.columns:
                continue
            # Non-numeric values are coerced to NaN before comparison.
            sa = pd.to_numeric(fa.loc[idx, col], errors="coerce")
            sb = pd.to_numeric(fb.loc[idx, col], errors="coerce")
            diff = (sa - sb).abs()
            # NaN on either side -> treat as comparable only where both
            # present; an asymmetric NaN is a real mismatch.
            both_nan = sa.isna() & sb.isna()
            one_nan = sa.isna() ^ sb.isna()
            diff = diff.where(~both_nan)
            # Cells that are NaN on both sides are excluded from the count.
            n_compared += int((~both_nan).sum())

            over = (diff > epsilon) | one_nan
            n_over += int(over.sum())

            # Asymmetric-NaN cells have no numeric delta, so they count in
            # ``n_over`` but never become the ``worst_cell``.
            valid = diff.dropna()
            if not valid.empty:
                cell_max = float(valid.max())
                if cell_max > max_abs:
                    max_abs = cell_max
                    worst_dt = valid.idxmax()
                    worst_cell = (
                        ticker,
                        col,
                        # Fall back to str() for index labels without isoformat.
                        getattr(worst_dt, "isoformat", lambda: str(worst_dt))(),
                    )

    return ParityReport(
        only_in_a=frozenset(keys_a - keys_b),
        only_in_b=frozenset(keys_b - keys_a),
        common=frozenset(common),
        rowcount_deltas=rowcount_deltas,
        max_abs_value_delta=max_abs,
        worst_cell=worst_cell,
        n_cells_over_epsilon=n_over,
        n_cells_compared=n_compared,
        epsilon=epsilon,
        value_cols=cols,
        require_ticker_match=require_ticker_match,
        require_rowcount_match=require_rowcount_match,
    )
@@ -0,0 +1,186 @@
1
+ """
2
+ Per-key SSM-backed secret fetcher for Alpha Engine modules.
3
+
4
+ This module is the consolidation point for secret resolution across all six
5
+ alpha-engine repos. Before this, each repo duplicated some variant of the
6
+ ``ssm_secrets.py`` bulk-load-into-os.environ pattern (alpha-engine-data has
7
+ the canonical one). Going forward, callers do::
8
+
9
+ from alpha_engine_lib.secrets import get_secret
10
+
11
+ api_key = get_secret("ANTHROPIC_API_KEY") # required → raises if absent
12
+ opt_key = get_secret("DEBUG_TRACE_KEY", required=False) # optional → returns None
13
+
14
+ **Resolution order** (first hit wins):
15
+
16
+ 1. Per-process cache (populated on first read; thread-safe).
17
+ 2. ``ALPHA_ENGINE_SECRETS_SOURCE`` env-var toggle:
18
+
19
+ - ``env`` → ``os.environ[name]`` only (local-dev escape hatch — never hit SSM)
20
+ - ``ssm`` → SSM only (production strictness — no silent env fallback)
21
+ - unset / ``auto`` / anything else → SSM first, ``os.environ`` fallback
22
+
23
+ 3. SSM at ``/alpha-engine/{name}`` (under :data:`SSM_PREFIX`).
24
+ 4. ``os.environ[name]`` (the fallback in ``auto`` mode).
25
+ 5. ``default`` arg if provided.
26
+ 6. :exc:`SecretNotFoundError` if ``required=True``; else ``None``.
27
+
28
+ **Caching design.** Per-process dict keyed on secret name. The SSM round-trip
29
+ happens at most once per *resolved* name per process (misses are not cached); subsequent calls hit the dict.
30
+ Lambda cold-starts pay the round-trip once; warm invocations reuse the cache.
31
+ :func:`clear_cache` is exposed for tests.
32
+
33
+ **SSM-unavailable latch.** If any SSM call fails for a reason other than a missing parameter (no boto3, no creds,
34
+ network error), latch ``_ssm_unavailable = True`` and skip SSM for the rest of
35
+ the process. Avoids repeated multi-second timeouts in local dev. Reset via
36
+ :func:`clear_cache` if a test needs to re-probe SSM.
37
+
38
+ **Migration arc**: ``alpha-engine-config/private-docs/ROADMAP.md`` line ~2780
39
+ (Deprecate ``.env`` entirely). Plan doc:
40
+ ``alpha-engine-docs/private/env-to-ssm-260512.md``.
41
+ """
42
+
43
+ from __future__ import annotations
44
+
45
+ import logging
46
+ import os
47
+ import threading
48
+ from typing import Final
49
+
50
logger = logging.getLogger(__name__)

# Prefix prepended to a secret name to form its SSM parameter name.
SSM_PREFIX: Final[str] = "/alpha-engine/"
# Name of the environment variable that selects the resolution mode
# ("env", "ssm", or "auto"; see get_secret).
SOURCE_TOGGLE_ENV: Final[str] = "ALPHA_ENGINE_SECRETS_SOURCE"

# Per-process cache of resolved secrets, keyed by secret name.
_cache: dict[str, str] = {}
# Guards reads, writes and clearing of ``_cache``.
_cache_lock = threading.Lock()
# Latch set when an SSM read fails for a reason other than a missing
# parameter; while set, SSM is skipped. Reset by clear_cache().
_ssm_unavailable = False
58
+
59
+
60
class SecretNotFoundError(LookupError):
    """A required secret could not be resolved from SSM or the environment."""
62
+
63
+
64
def get_secret(
    name: str,
    *,
    required: bool = True,
    default: str | None = None,
) -> str | None:
    """Resolve the secret ``name`` from the cache, SSM, or the environment.

    Lookup order: the per-process cache, then SSM and/or ``os.environ``
    depending on the ``ALPHA_ENGINE_SECRETS_SOURCE`` toggle (``env`` = env
    only, ``ssm`` = SSM only, anything else = SSM first with env fallback).
    Only values that were actually resolved are cached; a miss is looked up
    again on the next call.

    :param name: Secret name (no prefix). E.g. ``"POLYGON_API_KEY"``.
    :param required: If ``True`` (default), raise :exc:`SecretNotFoundError`
        when the secret is absent and no ``default`` was given.
    :param default: Value returned when the secret is absent. A non-``None``
        default is returned regardless of ``required`` and is not cached.
    :returns: The secret value, ``default``, or ``None`` when the secret is
        absent, no default was given and ``required=False``.
    :raises SecretNotFoundError: When the secret is absent from cache, SSM
        and ``os.environ``, ``default`` is ``None`` and ``required=True``.
    :raises ValueError: When ``name`` is empty or contains a forward slash.
    """
    if not name:
        raise ValueError("secret name must be non-empty")
    if "/" in name:
        raise ValueError(
            f"secret name must not contain '/': got {name!r} "
            f"(the SSM_PREFIX is added automatically)"
        )

    with _cache_lock:
        hit = _cache.get(name)
    if hit is not None:
        return hit

    # Unrecognised toggle values degrade to "auto" instead of failing.
    mode = os.environ.get(SOURCE_TOGGLE_ENV, "auto").lower()
    if mode not in ("auto", "env", "ssm"):
        logger.warning(
            "unknown %s=%r — falling back to 'auto'", SOURCE_TOGGLE_ENV, mode
        )
        mode = "auto"

    resolved: str | None = None
    if mode != "env":
        resolved = _fetch_from_ssm(name)
    if resolved is None and mode != "ssm":
        resolved = os.environ.get(name)

    if resolved is not None:
        with _cache_lock:
            _cache[name] = resolved
        return resolved

    # Nothing found: the default wins over the required-ness check.
    if default is not None:
        return default
    if not required:
        return None
    raise SecretNotFoundError(
        f"secret {name!r} not found in cache, SSM ({SSM_PREFIX}{name}), "
        f"or environment (source={mode!r})"
    )
127
+
128
+
129
def clear_cache() -> None:
    """Forget every cached secret and allow SSM to be probed again.

    Intended mainly for tests; production code should not need it. Both the
    secret cache and the ``_ssm_unavailable`` latch are reset while holding
    the cache lock.
    """
    global _ssm_unavailable
    with _cache_lock:
        _ssm_unavailable = False
        _cache.clear()
139
+
140
+
141
def _fetch_from_ssm(name: str) -> str | None:
    """Read one parameter from SSM; ``None`` on a miss or when SSM is unusable.

    A ``ParameterNotFound`` response is an ordinary miss. A missing boto3
    install, any other ``ClientError`` and any ``BotoCoreError`` set the
    ``_ssm_unavailable`` latch so later calls return ``None`` immediately.
    """
    global _ssm_unavailable
    if _ssm_unavailable:
        return None

    try:
        import boto3
        from botocore.exceptions import BotoCoreError, ClientError
    except ImportError:
        logger.debug("boto3 not installed — skipping SSM for %s", name)
        _ssm_unavailable = True
        return None

    # AWS_REGION wins; otherwise AWS_DEFAULT_REGION, defaulting to us-east-1.
    region = os.environ.get("AWS_REGION")
    if not region:
        region = os.environ.get("AWS_DEFAULT_REGION", "us-east-1")
    parameter_name = f"{SSM_PREFIX}{name}"

    try:
        ssm = boto3.client("ssm", region_name=region)
        response = ssm.get_parameter(Name=parameter_name, WithDecryption=True)
        return response["Parameter"]["Value"]
    except ClientError as exc:
        error_code = exc.response.get("Error", {}).get("Code", "")
        if error_code == "ParameterNotFound":
            # Genuine miss — not an SSM-availability problem. Return without
            # latching, since other secrets may resolve fine.
            logger.debug("SSM miss for %s (ParameterNotFound)", name)
            return None
        failure = error_code or "unknown"
    except BotoCoreError as exc:
        failure = type(exc).__name__

    # Shared tail for every availability failure: warn once, then latch.
    logger.warning(
        "SSM read for %s failed (%s) — latching unavailable for this process",
        name,
        failure,
    )
    _ssm_unavailable = True
    return None
@@ -0,0 +1,35 @@
1
+ """Shared data-source contracts — Pydantic shapes + Protocols.
2
+
3
+ This package defines the canonical normalized shapes (``NewsArticle``,
4
+ ``AnalystSnapshot``, ``FilingDocument``) and the adapter Protocols
5
+ (``NewsSource``, ``AnalystSource``, ``FilingSource``) that both
6
+ producers (alpha-engine-data) and consumers (alpha-engine-research,
7
+ alpha-engine-backtester) consume.
8
+
9
+ **Architectural pattern:** Lib defines the contract; producers
10
+ (alpha-engine-data) implement concrete adapters; consumers
11
+ (alpha-engine-research) read records produced by them via S3 / RAG
12
+ retrieval and never import adapter classes directly.
13
+
14
+ See the institutional data-revamp plan doc at
15
+ ``~/Development/alpha-engine-docs/private/data-revamp-260513.md`` for
16
+ full context. PR α — lib (this); PR β — data (adapter implementations).
17
+ """
18
+
19
+ from alpha_engine_lib.sources.protocols import (
20
+ AnalystSnapshot,
21
+ AnalystSource,
22
+ FilingDocument,
23
+ FilingSource,
24
+ NewsArticle,
25
+ NewsSource,
26
+ )
27
+
28
# Public API of this package: the three normalized record shapes followed by
# the three adapter Protocols, all re-exported from ``protocols``.
__all__ = [
    "NewsArticle",
    "AnalystSnapshot",
    "FilingDocument",
    "NewsSource",
    "AnalystSource",
    "FilingSource",
]
@@ -0,0 +1,227 @@
1
+ """Source-substrate Protocols + normalized Pydantic shapes.
2
+
3
+ Wave 1 PR α of the institutional data revamp (see
4
+ ``~/Development/alpha-engine-docs/private/data-revamp-260513.md``). Each
5
+ data slot (news, filings, analyst, alt data) becomes a Protocol with
6
+ multiple adapters implementing it; concrete adapters live in
7
+ alpha-engine-data (producer-side) and consumers (alpha-engine-research)
8
+ never import them directly — they read producer outputs via S3 + the
9
+ shared RAG retrieval API.
10
+
11
+ Why Protocols over ABCs:
12
+
13
+ - Structural subtyping — third-party SDK wrappers can satisfy without
14
+ inheriting from our base class.
15
+ - Runtime ``isinstance`` checks via ``runtime_checkable`` for explicit gating
16
+ in the aggregator.
17
+ - No vtable overhead in hot loops.
18
+
19
+ Why Pydantic shapes (not raw dicts):
20
+
21
+ - Cross-vendor schema normalization is brittle. Pydantic gives us a
22
+ single canonical shape with validation at the adapter boundary —
23
+ adapter bugs surface as ``ValidationError``, not as silent
24
+ ``KeyError`` in downstream NLP three layers deeper.
25
+ - ``extra='forbid'`` ensures vendor schema drift surfaces immediately
26
+ instead of leaking unmapped fields through the pipeline.
27
+ - ``frozen=True`` so records are hashable + safe to share across
28
+ threads / fan-out workers without defensive copies.
29
+ """
30
+
31
+ from __future__ import annotations
32
+
33
+ from datetime import datetime
34
+ from typing import Protocol, runtime_checkable
35
+
36
+ from pydantic import BaseModel, ConfigDict, Field
37
+
38
+
39
+ # ── Normalized shapes ──────────────────────────────────────────────────
40
+
41
+
42
class NewsArticle(BaseModel):
    """One news article, normalized across all vendors.

    The canonical key for cross-vendor dedup is the composite
    (normalized title, URL host+path hash). Different vendors syndicate
    the same wire story; the aggregator's dedup catches both the URL-
    collapse case (different querystrings) and the
    title-paraphrase-on-same-URL case.

    Fields declared without a default (``tickers``, ``title``,
    ``body_excerpt``, ``url``, ``published_at``, ``source``,
    ``fetched_at``) must be supplied by the adapter; the remaining fields
    are optional.
    """

    # Instances are immutable and unknown fields are rejected (see the
    # module docstring for the rationale behind both settings).
    model_config = ConfigDict(frozen=True, extra="forbid")

    tickers: tuple[str, ...] = Field(
        description="Tickers this article concerns. Multi-ticker articles "
        "(e.g. sector pieces) are kept as a single record with "
        "a multi-element tuple; the aggregator's ticker-union "
        "logic merges variants. Adapter's choice: emit once per "
        "(article, ticker) or once with the full set."
    )
    title: str
    body_excerpt: str = Field(
        description="Lead paragraph or summary. Full-text body lives in the "
        "RAG corpus chunk store, not in this struct — chunked at "
        "embedding time by the ingest pipeline."
    )
    url: str
    published_at: datetime = Field(
        description="UTC publish time. Vendor wall-clock; ingest-time is "
        "in `fetched_at`."
    )
    source: str = Field(
        description="Vendor slug: 'polygon', 'gdelt', 'yahoo_rss', "
        "'edgar_press', 'benzinga' (paid), 'bloomberg' (paid), "
        "'ravenpack' (paid). Joins onto the trust-weight config "
        "downstream."
    )
    vendor_article_id: str | None = Field(
        default=None,
        description="Vendor-native unique ID for cross-reference back to "
        "the source system (Polygon `id`, GDELT `GKGRECORDID`, "
        "etc.). Used by ingest-side idempotency checks.",
    )
    fetched_at: datetime = Field(
        description="When this adapter pulled the article (UTC). For "
        "freshness audit + cache-age computation."
    )
    headline_authors: tuple[str, ...] | None = Field(
        default=None,
        description="Bylines if available. None if the source doesn't "
        "expose authors (e.g. wire feeds).",
    )
    # Defaults to an empty tuple when the vendor supplies no tags.
    tags: tuple[str, ...] = Field(
        default_factory=tuple,
        description="Vendor-supplied topic / event tags. GDELT emits "
        "structured event codes; Polygon emits keywords; "
        "Benzinga emits Channels. Used as a soft signal for "
        "downstream event-flag extraction.",
    )
100
+
101
+
102
class AnalystSnapshot(BaseModel):
    """One vendor's analyst consensus snapshot for one ticker at one
    point in time.

    Time-series of these in S3 drives self-derived revisions tracking
    (see ``alpha-engine-data/data/derived/revisions.py``, PR C).
    Adapter is responsible for vendor-string normalization at the
    boundary — downstream consumers see the canonical 5-class
    consensus_rating ladder.

    Only ``ticker``, ``source`` and ``fetched_at`` are required; every
    consensus field defaults to ``None`` (or an empty tuple) so partial
    vendor coverage can still be represented.
    """

    # Instances are immutable and unknown fields are rejected.
    model_config = ConfigDict(frozen=True, extra="forbid")

    ticker: str
    source: str
    fetched_at: datetime
    # Typed as a plain ``str``: the ladder listed in the description is not
    # enforced by this annotation.
    consensus_rating: str | None = Field(
        default=None,
        description="Categorical: 'strongBuy' | 'buy' | 'hold' | 'sell' | "
        "'strongSell'. Vendor strings normalized at adapter "
        "boundary.",
    )
    mean_target: float | None = Field(
        default=None, description="Mean price target (USD)."
    )
    median_target: float | None = Field(
        default=None, description="Median price target if vendor exposes it."
    )
    num_analysts: int | None = Field(
        default=None, description="Number of contributing analysts."
    )
    # NOTE(review): entries are plain dicts, which are unhashable, so hashing
    # a snapshot with a non-empty tuple here would presumably raise despite
    # ``frozen=True`` — confirm whether any consumer relies on hashability.
    rating_changes_30d: tuple[dict, ...] = Field(
        default_factory=tuple,
        description="Recent upgrades/downgrades. Each entry: "
        "{analyst, firm, action, prior_rating, new_rating, "
        "date}. Used by downstream NLP/event-flag extraction.",
    )
139
+
140
+
141
class FilingDocument(BaseModel):
    """One filing document. Filings substrate (PR B). Pinned here so
    PR α can reference the shape from Protocols without forward refs.

    ``title`` is optional and ``source`` defaults to ``"edgar"``; all other
    fields must be supplied by the adapter.
    """

    # Instances are immutable and unknown fields are rejected.
    model_config = ConfigDict(frozen=True, extra="forbid")

    ticker: str
    # Free-form string: the form types in the description are examples and
    # are not enforced by this annotation.
    form_type: str = Field(
        description="'10-K' | '10-Q' | '8-K' | '14A' | 'DEF' | 'S-1' | "
        "'S-4' | '13D' | '13G' | '13F' | 'Form 4' | etc."
    )
    filed_date: datetime
    accession_number: str = Field(
        description="EDGAR accession (e.g. '0000320193-25-000001'). "
        "Canonical key for filing dedup + RAG idempotency check."
    )
    title: str | None = None
    url: str
    source: str = "edgar"
    fetched_at: datetime
    body_excerpt: str = Field(
        description="Lead snippet. Full body goes to the RAG corpus, "
        "chunked at ingest time."
    )
165
+
166
+
167
+ # ── Protocols ──────────────────────────────────────────────────────────
168
+
169
+
170
@runtime_checkable
class NewsSource(Protocol):
    """News adapter contract. Adapters are vendor-specific transports
    that produce normalized :class:`NewsArticle` records.

    Adapters MUST:

    - Be safely callable from concurrent contexts (own their HTTP client +
      rate-limiter; no shared mutable state).
    - Return an empty list (never raise) on transient vendor failures.
      Re-raise only on auth failures, contract-breaking schema drift, or
      configuration errors — those fail loud.
    - Normalize wall-clock timestamps to UTC.
    - Stamp ``fetched_at`` on every returned article.

    Because the Protocol is ``runtime_checkable``, ``isinstance`` checks
    verify only that ``name`` and ``fetch`` exist, not their types or
    signatures.
    """

    name: str  # vendor slug — joins onto trust-weight config

    # ``hours`` is keyword-only and defaults to 48; presumably the lookback
    # window in hours — confirm against the concrete adapters.
    def fetch(
        self,
        tickers: list[str],
        *,
        hours: int = 48,
    ) -> list[NewsArticle]: ...
194
+
195
+
196
@runtime_checkable
class AnalystSource(Protocol):
    """Analyst data adapter contract. PR C.

    Returns ``None`` for tickers the vendor doesn't cover (e.g. micro-
    caps absent from FMP); empty fields within a returned snapshot
    indicate vendor-side coverage gaps that are typed in the shape.

    Unlike the news and filings contracts, ``fetch`` takes a single ticker
    and returns at most one :class:`AnalystSnapshot`.
    """

    name: str  # adapter identifier; presumably the vendor slug — confirm

    def fetch(self, ticker: str) -> AnalystSnapshot | None: ...
208
+
209
+
210
@runtime_checkable
class FilingSource(Protocol):
    """Filings adapter contract. PR B.

    ``form_types`` filter is advisory — adapters that don't support
    form-type pre-filtering at the vendor layer return all forms and
    let downstream filter.

    ``form_types`` and ``days`` are keyword-only; ``form_types=None`` is
    the default and ``days`` defaults to 7.
    """

    name: str  # adapter identifier; presumably the vendor slug — confirm

    # ``days`` is presumably the lookback window in days — confirm against
    # the concrete adapters.
    def fetch(
        self,
        tickers: list[str],
        *,
        form_types: list[str] | None = None,
        days: int = 7,
    ) -> list[FilingDocument]: ...