alpha-engine-lib 0.32.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- alpha_engine_lib/__init__.py +3 -0
- alpha_engine_lib/agent_schemas.py +663 -0
- alpha_engine_lib/alerts.py +576 -0
- alpha_engine_lib/arcticdb.py +340 -0
- alpha_engine_lib/collector_results.py +69 -0
- alpha_engine_lib/cost.py +665 -0
- alpha_engine_lib/dates.py +273 -0
- alpha_engine_lib/decision_capture.py +462 -0
- alpha_engine_lib/ec2_spot.py +363 -0
- alpha_engine_lib/email_sender.py +206 -0
- alpha_engine_lib/eval_artifacts.py +361 -0
- alpha_engine_lib/logging.py +303 -0
- alpha_engine_lib/model_pricing.yaml +73 -0
- alpha_engine_lib/pillars.py +756 -0
- alpha_engine_lib/pipeline_status/__init__.py +70 -0
- alpha_engine_lib/pipeline_status/read.py +541 -0
- alpha_engine_lib/pipeline_status/registry.py +368 -0
- alpha_engine_lib/pipeline_status/templates.py +120 -0
- alpha_engine_lib/preflight.py +444 -0
- alpha_engine_lib/rag/__init__.py +39 -0
- alpha_engine_lib/rag/db.py +96 -0
- alpha_engine_lib/rag/embeddings.py +63 -0
- alpha_engine_lib/rag/migrations/0001_content_tsv.sql +39 -0
- alpha_engine_lib/rag/rerank.py +377 -0
- alpha_engine_lib/rag/retrieval.py +465 -0
- alpha_engine_lib/rag/schema.sql +65 -0
- alpha_engine_lib/reconcile.py +203 -0
- alpha_engine_lib/secrets.py +186 -0
- alpha_engine_lib/sources/__init__.py +35 -0
- alpha_engine_lib/sources/protocols.py +227 -0
- alpha_engine_lib/ssm_log_capture.py +274 -0
- alpha_engine_lib/telegram.py +165 -0
- alpha_engine_lib/trading_calendar.py +236 -0
- alpha_engine_lib/transparency.py +746 -0
- alpha_engine_lib/transparency_inventory.yaml +260 -0
- alpha_engine_lib/universe.py +83 -0
- alpha_engine_lib-0.32.0.dist-info/METADATA +217 -0
- alpha_engine_lib-0.32.0.dist-info/RECORD +40 -0
- alpha_engine_lib-0.32.0.dist-info/WHEEL +5 -0
- alpha_engine_lib-0.32.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,444 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Preflight: fast fail-fast connectivity + freshness checks.
|
|
3
|
+
|
|
4
|
+
``BasePreflight`` provides the shared primitives; consumer modules
|
|
5
|
+
subclass it and override ``run()`` to compose a module-specific check
|
|
6
|
+
sequence. The base raises ``RuntimeError`` on any failure — consumers
|
|
7
|
+
catch nothing, so the raise propagates up through ``main()`` → non-zero
|
|
8
|
+
exit → the orchestration layer's failure handler.
|
|
9
|
+
|
|
10
|
+
Design context (2026-04-14): the alpha-engine-data DailyData step
|
|
11
|
+
silently ran against a stale ArcticDB universe library for two
|
|
12
|
+
weekdays because an ``ImportError`` on ``arcticdb`` was caught at debug
|
|
13
|
+
level. A freshness check on SPY would have flagged the outage in ~1s.
|
|
14
|
+
Preflight exists to catch that class of failure *before* spending 30
|
|
15
|
+
minutes on real work.
|
|
16
|
+
|
|
17
|
+
Scope is deliberately narrow: **external-world handshakes only** (env
|
|
18
|
+
vars, S3 reachability, ArcticDB symbol freshness). Data-correctness
|
|
19
|
+
hard-fails still live in the hardened collectors themselves.
|
|
20
|
+
"""
|
|
21
|
+
|
|
22
|
+
from __future__ import annotations
|
|
23
|
+
|
|
24
|
+
import json
|
|
25
|
+
import logging
|
|
26
|
+
import os
|
|
27
|
+
import urllib.request
|
|
28
|
+
import warnings
|
|
29
|
+
from datetime import datetime, timezone
|
|
30
|
+
from pathlib import Path
|
|
31
|
+
|
|
32
|
+
log = logging.getLogger(__name__)
|
|
33
|
+
|
|
34
|
+
# Default location for the deploy-time GIT_SHA stamp inside a Lambda
|
|
35
|
+
# image. Stamped by deploy.sh via ``--build-arg GIT_SHA=…`` then COPYed
|
|
36
|
+
# to /var/task/GIT_SHA.txt; consumers running outside Lambda can pass an
|
|
37
|
+
# alternate path.
|
|
38
|
+
_DEFAULT_GIT_SHA_FILE = Path("/var/task/GIT_SHA.txt")
|
|
39
|
+
|
|
40
|
+
# Public-repo branch-HEAD API. No auth required; 60 req/hr unauth rate
|
|
41
|
+
# limit is fine for Lambda cold-starts and CI runs.
|
|
42
|
+
_GITHUB_BRANCH_URL = "https://api.github.com/repos/{repo}/branches/{branch}"
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
class BasePreflight:
|
|
46
|
+
"""Shared preflight primitives.
|
|
47
|
+
|
|
48
|
+
Subclass and override :meth:`run` to compose a module-specific
|
|
49
|
+
check sequence. Each primitive raises :class:`RuntimeError` on
|
|
50
|
+
failure with an explanatory message that includes what was checked
|
|
51
|
+
and what went wrong.
|
|
52
|
+
"""
|
|
53
|
+
|
|
54
|
+
def __init__(self, bucket: str, region: str | None = None):
|
|
55
|
+
if not bucket:
|
|
56
|
+
raise ValueError("BasePreflight: bucket is required")
|
|
57
|
+
self.bucket = bucket
|
|
58
|
+
self.region = region or os.environ.get("AWS_REGION", "us-east-1")
|
|
59
|
+
|
|
60
|
+
# ── Composition entry point ──────────────────────────────────────────
|
|
61
|
+
|
|
62
|
+
def run(self) -> None:
|
|
63
|
+
"""Execute the preflight check sequence.
|
|
64
|
+
|
|
65
|
+
Subclasses override this to compose primitives. The default
|
|
66
|
+
raises to prevent a misuse where a subclass forgets to override
|
|
67
|
+
and silently passes.
|
|
68
|
+
"""
|
|
69
|
+
raise NotImplementedError(
|
|
70
|
+
f"{type(self).__name__} must override run() to compose preflight checks"
|
|
71
|
+
)
|
|
72
|
+
|
|
73
|
+
# ── Primitives ───────────────────────────────────────────────────────
|
|
74
|
+
|
|
75
|
+
def check_env_vars(self, *names: str) -> None:
|
|
76
|
+
"""Raise if any of the given env vars are unset or empty."""
|
|
77
|
+
missing = [n for n in names if not os.environ.get(n)]
|
|
78
|
+
if missing:
|
|
79
|
+
raise RuntimeError(f"Pre-flight: required env vars missing: {missing}")
|
|
80
|
+
|
|
81
|
+
def check_s3_bucket(self) -> None:
|
|
82
|
+
"""Raise if the configured bucket is not reachable (auth, network, or missing)."""
|
|
83
|
+
import boto3
|
|
84
|
+
try:
|
|
85
|
+
boto3.client("s3").head_bucket(Bucket=self.bucket)
|
|
86
|
+
except Exception as exc:
|
|
87
|
+
raise RuntimeError(
|
|
88
|
+
f"Pre-flight: S3 bucket {self.bucket!r} unreachable: {exc}"
|
|
89
|
+
) from exc
|
|
90
|
+
|
|
91
|
+
def check_s3_key(self, key: str, max_age_days: int | None = None) -> None:
|
|
92
|
+
"""Raise if ``s3://{bucket}/{key}`` is missing or older than ``max_age_days``.
|
|
93
|
+
|
|
94
|
+
``max_age_days=None`` disables the freshness check — existence only.
|
|
95
|
+
"""
|
|
96
|
+
import boto3
|
|
97
|
+
from botocore.exceptions import ClientError
|
|
98
|
+
try:
|
|
99
|
+
head = boto3.client("s3").head_object(Bucket=self.bucket, Key=key)
|
|
100
|
+
except ClientError as exc:
|
|
101
|
+
err_code = exc.response.get("Error", {}).get("Code")
|
|
102
|
+
if err_code in ("404", "NoSuchKey"):
|
|
103
|
+
raise RuntimeError(
|
|
104
|
+
f"Pre-flight: S3 key s3://{self.bucket}/{key} does not exist"
|
|
105
|
+
) from exc
|
|
106
|
+
raise RuntimeError(
|
|
107
|
+
f"Pre-flight: S3 key s3://{self.bucket}/{key} unreachable: {exc}"
|
|
108
|
+
) from exc
|
|
109
|
+
if max_age_days is not None:
|
|
110
|
+
last_modified = head["LastModified"]
|
|
111
|
+
age_days = (datetime.now(timezone.utc) - last_modified).days
|
|
112
|
+
if age_days > max_age_days:
|
|
113
|
+
raise RuntimeError(
|
|
114
|
+
f"Pre-flight: S3 key s3://{self.bucket}/{key} is "
|
|
115
|
+
f"{age_days} days stale (threshold {max_age_days})"
|
|
116
|
+
)
|
|
117
|
+
|
|
118
|
+
def check_arcticdb_fresh(
|
|
119
|
+
self,
|
|
120
|
+
library: str,
|
|
121
|
+
symbol: str,
|
|
122
|
+
max_stale_days: int,
|
|
123
|
+
) -> None:
|
|
124
|
+
"""Raise if ``arcticdb`` is unavailable, the library/symbol is
|
|
125
|
+
unreadable, or the last date in ``symbol`` is older than
|
|
126
|
+
``max_stale_days`` calendar days from today (UTC).
|
|
127
|
+
|
|
128
|
+
Requires the ``arcticdb`` optional extra
|
|
129
|
+
(``alpha-engine-lib[arcticdb]``).
|
|
130
|
+
"""
|
|
131
|
+
try:
|
|
132
|
+
import arcticdb as adb
|
|
133
|
+
import pandas as pd
|
|
134
|
+
except ImportError as exc:
|
|
135
|
+
raise RuntimeError(
|
|
136
|
+
"Pre-flight: arcticdb not importable — install "
|
|
137
|
+
"alpha-engine-lib[arcticdb] or add arcticdb to the deploy image: "
|
|
138
|
+
f"{exc}"
|
|
139
|
+
) from exc
|
|
140
|
+
|
|
141
|
+
uri = (
|
|
142
|
+
f"s3s://s3.{self.region}.amazonaws.com:{self.bucket}"
|
|
143
|
+
"?path_prefix=arcticdb&aws_auth=true"
|
|
144
|
+
)
|
|
145
|
+
try:
|
|
146
|
+
lib = adb.Arctic(uri).get_library(library)
|
|
147
|
+
except Exception as exc:
|
|
148
|
+
raise RuntimeError(
|
|
149
|
+
f"Pre-flight: ArcticDB library {library!r} unreachable "
|
|
150
|
+
f"at {uri}: {exc}"
|
|
151
|
+
) from exc
|
|
152
|
+
|
|
153
|
+
try:
|
|
154
|
+
df = lib.read(symbol).data
|
|
155
|
+
except Exception as exc:
|
|
156
|
+
raise RuntimeError(
|
|
157
|
+
f"Pre-flight: ArcticDB {library}/{symbol} read failed: {exc}"
|
|
158
|
+
) from exc
|
|
159
|
+
|
|
160
|
+
if df.empty:
|
|
161
|
+
raise RuntimeError(
|
|
162
|
+
f"Pre-flight: ArcticDB {library}/{symbol} is empty"
|
|
163
|
+
)
|
|
164
|
+
|
|
165
|
+
last_ts = pd.Timestamp(df.index[-1])
|
|
166
|
+
# Normalize to tz-naive date for comparison against today's UTC date.
|
|
167
|
+
if last_ts.tzinfo is not None:
|
|
168
|
+
last_ts = last_ts.tz_convert("UTC").tz_localize(None)
|
|
169
|
+
today = pd.Timestamp(datetime.now(timezone.utc).date())
|
|
170
|
+
age_days = (today - last_ts.normalize()).days
|
|
171
|
+
if age_days > max_stale_days:
|
|
172
|
+
raise RuntimeError(
|
|
173
|
+
f"Pre-flight: ArcticDB {library}/{symbol} last date "
|
|
174
|
+
f"{last_ts.date()} is {age_days} days stale "
|
|
175
|
+
f"(threshold {max_stale_days})"
|
|
176
|
+
)
|
|
177
|
+
|
|
178
|
+
def check_arcticdb_universe_fresh(
|
|
179
|
+
self,
|
|
180
|
+
library: str,
|
|
181
|
+
max_stale_days: int,
|
|
182
|
+
*,
|
|
183
|
+
max_workers: int = 20,
|
|
184
|
+
) -> None:
|
|
185
|
+
"""[DEPRECATED 2026-05-05] Per-symbol freshness scan over an
|
|
186
|
+
ArcticDB library.
|
|
187
|
+
|
|
188
|
+
Deprecated because data-freshness now lives upstream in
|
|
189
|
+
``alpha-engine-data``'s preflight, which runs before any
|
|
190
|
+
consumer in every Step Function. Consumers (executor,
|
|
191
|
+
backtester, predictor) dropped their calls in 2026-05-05's
|
|
192
|
+
consolidation arc. Scheduled for removal after 6-month soak;
|
|
193
|
+
current callers should migrate to trusting SF ordering.
|
|
194
|
+
|
|
195
|
+
Original docstring follows.
|
|
196
|
+
|
|
197
|
+
Scan every symbol in ``library`` and raise if any symbol's
|
|
198
|
+
last_date is older than ``max_stale_days`` calendar days from
|
|
199
|
+
today (UTC).
|
|
200
|
+
|
|
201
|
+
Where :meth:`check_arcticdb_fresh` covers a single canonical
|
|
202
|
+
liveness probe (e.g. macro/SPY), this primitive catches the
|
|
203
|
+
partial-write class — individual tickers stop receiving writes
|
|
204
|
+
while the canonical SPY symbol stays fresh, so the single-symbol
|
|
205
|
+
check reports healthy but downstream consumers fail two hours
|
|
206
|
+
deep on stale per-ticker reads.
|
|
207
|
+
|
|
208
|
+
Motivation (2026-04-21 backtester incident): macro.SPY was fresh,
|
|
209
|
+
ASGN + MOH had stalled at 2026-04-01 because daily_append silently
|
|
210
|
+
skipped them, executor's load_atr_14_pct guard aborted the
|
|
211
|
+
backtester ~2 hours into its predictor-backtest mode. This scan
|
|
212
|
+
catches the same class at preflight in ~5-10 seconds (20 threads
|
|
213
|
+
× ~900 tickers × tail(1) read each).
|
|
214
|
+
|
|
215
|
+
Implementation notes:
|
|
216
|
+
- Reads ``tail(1)`` rather than the full series — ~20ms/symbol.
|
|
217
|
+
- Read errors on any symbol are themselves fatal: a silent read
|
|
218
|
+
error here would mask exactly the kind of write-skip this
|
|
219
|
+
primitive exists to catch.
|
|
220
|
+
- Stale list is sorted by stalest-first so the operator sees
|
|
221
|
+
the worst offenders without scrolling.
|
|
222
|
+
|
|
223
|
+
Requires the ``arcticdb`` optional extra
|
|
224
|
+
(``alpha-engine-lib[arcticdb]``).
|
|
225
|
+
|
|
226
|
+
Args:
|
|
227
|
+
library: ArcticDB library name to scan (e.g. ``"universe"``).
|
|
228
|
+
max_stale_days: Symbols with ``last_date`` older than today
|
|
229
|
+
minus this many calendar days are flagged as stale.
|
|
230
|
+
max_workers: Thread pool size for the per-symbol scan.
|
|
231
|
+
Default 20 matches backtester precedent. Tune lower for
|
|
232
|
+
rate-limited backends; higher for fan-out-bound cases.
|
|
233
|
+
|
|
234
|
+
Raises:
|
|
235
|
+
RuntimeError: If arcticdb is unimportable, the library is
|
|
236
|
+
unreachable, the library is empty, any symbol's
|
|
237
|
+
``tail(1)`` read raises, or ANY symbol is stale beyond
|
|
238
|
+
the threshold.
|
|
239
|
+
"""
|
|
240
|
+
warnings.warn(
|
|
241
|
+
"BasePreflight.check_arcticdb_universe_fresh is deprecated; "
|
|
242
|
+
"data-freshness now lives upstream in alpha-engine-data's "
|
|
243
|
+
"preflight (runs before consumers in every Step Function). "
|
|
244
|
+
"Scheduled for removal after 6-month soak.",
|
|
245
|
+
DeprecationWarning,
|
|
246
|
+
stacklevel=2,
|
|
247
|
+
)
|
|
248
|
+
|
|
249
|
+
from concurrent.futures import ThreadPoolExecutor
|
|
250
|
+
from datetime import date, timedelta
|
|
251
|
+
|
|
252
|
+
try:
|
|
253
|
+
import arcticdb as adb
|
|
254
|
+
import pandas as pd
|
|
255
|
+
except ImportError as exc:
|
|
256
|
+
raise RuntimeError(
|
|
257
|
+
"Pre-flight: arcticdb not importable — install "
|
|
258
|
+
"alpha-engine-lib[arcticdb] or add arcticdb to the deploy image: "
|
|
259
|
+
f"{exc}"
|
|
260
|
+
) from exc
|
|
261
|
+
|
|
262
|
+
uri = (
|
|
263
|
+
f"s3s://s3.{self.region}.amazonaws.com:{self.bucket}"
|
|
264
|
+
"?path_prefix=arcticdb&aws_auth=true"
|
|
265
|
+
)
|
|
266
|
+
try:
|
|
267
|
+
lib = adb.Arctic(uri).get_library(library)
|
|
268
|
+
except Exception as exc:
|
|
269
|
+
raise RuntimeError(
|
|
270
|
+
f"Pre-flight: ArcticDB library {library!r} unreachable "
|
|
271
|
+
f"at {uri}: {exc}"
|
|
272
|
+
) from exc
|
|
273
|
+
|
|
274
|
+
symbols = list(lib.list_symbols())
|
|
275
|
+
if not symbols:
|
|
276
|
+
raise RuntimeError(
|
|
277
|
+
f"Pre-flight: ArcticDB library {library!r} on bucket "
|
|
278
|
+
f"{self.bucket!r} has zero symbols — upstream pipeline "
|
|
279
|
+
"has not written anything."
|
|
280
|
+
)
|
|
281
|
+
|
|
282
|
+
today = date.today()
|
|
283
|
+
cutoff = today - timedelta(days=max_stale_days)
|
|
284
|
+
|
|
285
|
+
def _last_date_for(sym: str) -> tuple[str, "date | None", "str | None"]:
|
|
286
|
+
try:
|
|
287
|
+
df = lib.tail(sym, n=1).data
|
|
288
|
+
if df.empty:
|
|
289
|
+
return sym, None, "empty frame"
|
|
290
|
+
last_ts = pd.Timestamp(df.index[-1])
|
|
291
|
+
if last_ts.tzinfo is not None:
|
|
292
|
+
last_ts = last_ts.tz_convert("UTC").tz_localize(None)
|
|
293
|
+
return sym, last_ts.date(), None
|
|
294
|
+
except Exception as exc: # pragma: no cover — covered via mock
|
|
295
|
+
return sym, None, str(exc)
|
|
296
|
+
|
|
297
|
+
stale: list[tuple[str, date]] = []
|
|
298
|
+
errored: list[tuple[str, str]] = []
|
|
299
|
+
with ThreadPoolExecutor(max_workers=max_workers) as pool:
|
|
300
|
+
for sym, last_date, err in pool.map(_last_date_for, symbols):
|
|
301
|
+
if err is not None:
|
|
302
|
+
errored.append((sym, err))
|
|
303
|
+
elif last_date is None:
|
|
304
|
+
errored.append((sym, "no last_date"))
|
|
305
|
+
elif last_date < cutoff:
|
|
306
|
+
stale.append((sym, last_date))
|
|
307
|
+
|
|
308
|
+
if errored:
|
|
309
|
+
sample = [f"{s}({e[:40]})" for s, e in errored[:5]]
|
|
310
|
+
raise RuntimeError(
|
|
311
|
+
f"Pre-flight: {len(errored)} symbol(s) in ArcticDB "
|
|
312
|
+
f"library {library!r} could not be read for freshness check. "
|
|
313
|
+
f"Sample: {sample}. Treated as fatal because a silent read "
|
|
314
|
+
"error here would mask exactly the kind of per-symbol write "
|
|
315
|
+
"skip this scan exists to catch."
|
|
316
|
+
)
|
|
317
|
+
|
|
318
|
+
if stale:
|
|
319
|
+
stale.sort(key=lambda x: x[1])
|
|
320
|
+
summary = [f"{sym} (last={d.isoformat()})" for sym, d in stale[:10]]
|
|
321
|
+
more = f" (+{len(stale) - 10} more)" if len(stale) > 10 else ""
|
|
322
|
+
raise RuntimeError(
|
|
323
|
+
f"Pre-flight: {len(stale)}/{len(symbols)} symbol(s) in "
|
|
324
|
+
f"ArcticDB library {library!r} have stale data (older "
|
|
325
|
+
f"than {max_stale_days} calendar days, "
|
|
326
|
+
f"cutoff={cutoff.isoformat()}). Top offenders: "
|
|
327
|
+
f"{summary}{more}. Backfill upstream or investigate "
|
|
328
|
+
"the per-symbol write path before re-running."
|
|
329
|
+
)
|
|
330
|
+
|
|
331
|
+
def check_ib_paper_account(self, account_id: str) -> None:
|
|
332
|
+
"""Raise if ``account_id`` doesn't start with 'D' (IBKR paper prefix).
|
|
333
|
+
|
|
334
|
+
Defensive check for the executor — prevents live credentials
|
|
335
|
+
leaking into a paper-trading run (or vice versa).
|
|
336
|
+
"""
|
|
337
|
+
if not account_id:
|
|
338
|
+
raise RuntimeError("Pre-flight: IB account_id is empty")
|
|
339
|
+
if not account_id.startswith("D"):
|
|
340
|
+
raise RuntimeError(
|
|
341
|
+
f"Pre-flight: IB account_id {account_id!r} is not a paper "
|
|
342
|
+
"account (paper accounts start with 'D')"
|
|
343
|
+
)
|
|
344
|
+
|
|
345
|
+
def check_deploy_drift(
|
|
346
|
+
self,
|
|
347
|
+
repo: str,
|
|
348
|
+
branch: str = "main",
|
|
349
|
+
*,
|
|
350
|
+
sha_file: Path | None = None,
|
|
351
|
+
timeout: float = 5.0,
|
|
352
|
+
) -> None:
|
|
353
|
+
"""Hard-fail if the deploy-baked SHA lags ``repo@branch`` HEAD.
|
|
354
|
+
|
|
355
|
+
The deployed image is stamped with ``GIT_SHA`` at build time
|
|
356
|
+
(via Docker ``--build-arg GIT_SHA=…``); this check compares
|
|
357
|
+
that stamp against the current ``branch`` HEAD SHA on GitHub.
|
|
358
|
+
A mismatch means a merge landed on main but the CI deploy
|
|
359
|
+
workflow either failed, was skipped by a paths filter, or
|
|
360
|
+
hasn't run yet — i.e. the deployed code is a prior commit,
|
|
361
|
+
which is exactly the deploy-drift mode that motivated this
|
|
362
|
+
check (2026-04-20 coverage-gap session).
|
|
363
|
+
|
|
364
|
+
Degraded modes (warn, don't fail) — chosen so a GitHub outage
|
|
365
|
+
or an unstamped legacy image doesn't block a trading-hours
|
|
366
|
+
Lambda:
|
|
367
|
+
- Stamp file missing or "unknown" → image predates drift
|
|
368
|
+
checking; log warn and continue.
|
|
369
|
+
- GitHub API unreachable → log warn and continue.
|
|
370
|
+
|
|
371
|
+
Hard-fail mode — when both stamps are present and differ.
|
|
372
|
+
|
|
373
|
+
Args:
|
|
374
|
+
repo: ``"owner/name"`` — e.g. ``"cipher813/alpha-engine-predictor"``.
|
|
375
|
+
branch: Branch HEAD to compare against. Default ``"main"``.
|
|
376
|
+
sha_file: Path to the GIT_SHA stamp. Defaults to
|
|
377
|
+
``/var/task/GIT_SHA.txt`` (Lambda image convention).
|
|
378
|
+
timeout: GitHub API timeout in seconds.
|
|
379
|
+
"""
|
|
380
|
+
baked = _read_baked_git_sha(sha_file or _DEFAULT_GIT_SHA_FILE)
|
|
381
|
+
if baked is None:
|
|
382
|
+
log.warning(
|
|
383
|
+
"Deploy-drift: no baked GIT_SHA in image at %s (legacy build "
|
|
384
|
+
"or build-arg omitted). Rebuild via deploy.sh to enable this check.",
|
|
385
|
+
sha_file or _DEFAULT_GIT_SHA_FILE,
|
|
386
|
+
)
|
|
387
|
+
return
|
|
388
|
+
|
|
389
|
+
upstream = _fetch_origin_main_sha(repo, branch=branch, timeout=timeout)
|
|
390
|
+
if upstream is None:
|
|
391
|
+
# _fetch_origin_main_sha already logged the reason
|
|
392
|
+
return
|
|
393
|
+
|
|
394
|
+
if baked != upstream:
|
|
395
|
+
raise RuntimeError(
|
|
396
|
+
f"Deploy drift: image was built from {baked[:12]} but "
|
|
397
|
+
f"{repo}@{branch} is now at {upstream[:12]}. The CI deploy "
|
|
398
|
+
f"workflow did not promote the latest commit. Re-run "
|
|
399
|
+
f"`.github/workflows/deploy.yml` on main (or the local "
|
|
400
|
+
f"deploy.sh) before resuming. Refusing to proceed — "
|
|
401
|
+
f"running stale code on new signals is how 2026-04-20 happened."
|
|
402
|
+
)
|
|
403
|
+
|
|
404
|
+
log.info("Deploy-drift: image at %s matches %s@%s ✓", baked[:12], repo, branch)
|
|
405
|
+
|
|
406
|
+
|
|
407
|
+
def _read_baked_git_sha(sha_file: Path) -> str | None:
|
|
408
|
+
"""Return the SHA baked into the image by ``deploy.sh --build-arg GIT_SHA=…``.
|
|
409
|
+
|
|
410
|
+
Returns ``None`` if the stamp file is missing (legacy image) or holds
|
|
411
|
+
``"unknown"`` (build-arg omitted). Callers decide whether ``None`` is
|
|
412
|
+
warn-and-continue or hard-fail.
|
|
413
|
+
"""
|
|
414
|
+
try:
|
|
415
|
+
sha = sha_file.read_text().strip()
|
|
416
|
+
except FileNotFoundError:
|
|
417
|
+
return None
|
|
418
|
+
if not sha or sha == "unknown":
|
|
419
|
+
return None
|
|
420
|
+
return sha
|
|
421
|
+
|
|
422
|
+
|
|
423
|
+
def _fetch_origin_main_sha(repo: str, branch: str = "main", timeout: float = 5.0) -> str | None:
    """Fetch HEAD SHA of ``branch`` for ``repo`` via GitHub REST API.

    Returns ``None`` on any network/parse error, or when the response does
    not carry a commit SHA — the drift check treats a GitHub outage as
    "unknown, proceed with warning" rather than blocking the consumer.
    Every ``None`` return is preceded by a warning log stating the reason.
    ``repo`` is ``"owner/name"`` (e.g.
    ``"cipher813/alpha-engine-predictor"``).

    Args:
        repo: GitHub repository as ``"owner/name"``.
        branch: Branch whose HEAD SHA is fetched.
        timeout: Timeout in seconds passed to ``urlopen``.
    """
    import http.client

    url = _GITHUB_BRANCH_URL.format(repo=repo, branch=branch)
    req = urllib.request.Request(url, headers={"Accept": "application/vnd.github+json"})
    try:
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            payload = json.loads(resp.read())
    except (OSError, http.client.HTTPException, ValueError) as exc:
        # OSError covers urllib.error.URLError/HTTPError plus the bare
        # TimeoutError that urlopen raises on a read-phase timeout (the
        # 2026-05-07 weekday SF DeployDriftCheck failure: read timed out
        # inside http.client.getresponse, which is past urllib's
        # OSError → URLError wrap point in do_open).
        # HTTPException covers truncated or malformed responses such as
        # IncompleteRead; ValueError covers both json.JSONDecodeError and
        # the UnicodeDecodeError json.loads raises on undecodable bytes.
        log.warning("Deploy-drift: GitHub API unreachable (%s) — cannot compare", exc)
        return None

    # Valid JSON is not necessarily the expected shape; never let an
    # unexpected payload surface as an AttributeError.
    commit = payload.get("commit") if isinstance(payload, dict) else None
    sha = commit.get("sha") if isinstance(commit, dict) else None
    if not isinstance(sha, str) or not sha:
        log.warning(
            "Deploy-drift: GitHub API response for %s@%s carries no commit SHA "
            "— cannot compare",
            repo,
            branch,
        )
        return None
    return sha
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
"""RAG submodule — semantic retrieval over SEC filings, earnings transcripts, and theses.
|
|
2
|
+
|
|
3
|
+
Shared library code used by both alpha-engine-research (retrieval consumer:
|
|
4
|
+
qual analyst tools) and alpha-engine-data (ingestion producer: weekly Saturday
|
|
5
|
+
RAGIngestion step). Previously duplicated across both repos with drift; moved
|
|
6
|
+
here in alpha-engine-lib v0.3.0 as the single source of truth.
|
|
7
|
+
|
|
8
|
+
Top-level imports re-export the most common surface so consumers can write
|
|
9
|
+
``from alpha_engine_lib.rag import retrieve`` without reaching into submodules.
|
|
10
|
+
|
|
11
|
+
Pgvector + psycopg2 are heavy dependencies; install via the ``[rag]`` extra:
|
|
12
|
+
|
|
13
|
+
pip install "alpha-engine-lib[rag] @ git+https://github.com/cipher813/alpha-engine-lib@v0.3.0"
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
# Auto-load .env so RAG_DATABASE_URL and VOYAGE_API_KEY are available
|
|
17
|
+
# whether run from CLI, Lambda (already in env), or imported in tests.
|
|
18
|
+
try:
|
|
19
|
+
from dotenv import load_dotenv
|
|
20
|
+
load_dotenv()
|
|
21
|
+
except ImportError:
|
|
22
|
+
pass # python-dotenv not installed (e.g. Lambda) — env vars set externally
|
|
23
|
+
|
|
24
|
+
from .db import get_connection, is_available
|
|
25
|
+
from .embeddings import embed_texts
|
|
26
|
+
from .retrieval import (
|
|
27
|
+
retrieve,
|
|
28
|
+
ingest_document,
|
|
29
|
+
document_exists,
|
|
30
|
+
)
|
|
31
|
+
|
|
32
|
+
__all__ = [
|
|
33
|
+
"get_connection",
|
|
34
|
+
"is_available",
|
|
35
|
+
"embed_texts",
|
|
36
|
+
"retrieve",
|
|
37
|
+
"ingest_document",
|
|
38
|
+
"document_exists",
|
|
39
|
+
]
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
"""Neon PostgreSQL connection management for RAG.
|
|
2
|
+
|
|
3
|
+
Uses psycopg2 with connection pooling suitable for Lambda (short-lived
|
|
4
|
+
connections via Neon's built-in pgbouncer pooler).
|
|
5
|
+
|
|
6
|
+
Requires: RAG_DATABASE_URL environment variable (Neon pooled connection string).
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import logging
|
|
12
|
+
import os
|
|
13
|
+
from contextlib import contextmanager
|
|
14
|
+
|
|
15
|
+
import psycopg2
|
|
16
|
+
import psycopg2.extras
|
|
17
|
+
from pgvector.psycopg2 import register_vector
|
|
18
|
+
|
|
19
|
+
logger = logging.getLogger(__name__)
|
|
20
|
+
|
|
21
|
+
_DATABASE_URL: str | None = None
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def _get_url() -> str:
    """Return the RAG database URL, reading ``RAG_DATABASE_URL`` once.

    The value is cached in the module-level ``_DATABASE_URL`` only when it
    is non-empty, so a missing or blank variable raises on *every* call
    instead of raising once and then returning ``""``.

    Raises:
        RuntimeError: If ``RAG_DATABASE_URL`` is unset or empty.
    """
    global _DATABASE_URL
    if not _DATABASE_URL:
        url = os.environ.get("RAG_DATABASE_URL")
        if not url:
            raise RuntimeError("RAG_DATABASE_URL not set — cannot connect to vector DB")
        _DATABASE_URL = url
    return _DATABASE_URL
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
@contextmanager
def get_connection():
    """Context manager for a database connection.

    Opens a new connection per call (Neon pooler handles connection reuse
    server-side). Commits on success, rolls back on exception, and always
    closes the connection — including when pgvector type registration
    fails before the connection is handed to the caller.

    Yields:
        An open psycopg2 connection with pgvector codecs registered.

    Raises:
        RuntimeError: If ``RAG_DATABASE_URL`` is not configured.
    """
    conn = psycopg2.connect(_get_url())
    try:
        # Register pgvector type codecs so SELECTs on `vector` columns return
        # numpy arrays instead of stringified lists. Without this, reads like
        # rag/pipelines/filing_change_detection.py crash with
        # "could not convert string to float" on np.array(embedding). Must run
        # per-connection because psycopg2 scopes type adapters to the connection.
        # Kept inside the try so a registration failure still closes the
        # connection instead of leaking it.
        register_vector(conn)
        yield conn
        conn.commit()
    except Exception:
        conn.rollback()
        raise
    finally:
        conn.close()
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def execute_query(sql: str, params: tuple | list = ()) -> list[dict]:
    """Run a SELECT and return every row as a plain ``dict``.

    Args:
        sql: Query text with psycopg2-style placeholders.
        params: Values bound to the placeholders.
    """
    dict_cursor = psycopg2.extras.RealDictCursor
    with get_connection() as connection, connection.cursor(cursor_factory=dict_cursor) as cursor:
        cursor.execute(sql, params)
        records = cursor.fetchall()
        return list(map(dict, records))
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def execute_insert(sql: str, params: tuple | list = ()) -> None:
    """Run a single INSERT/UPDATE statement on a fresh connection.

    Args:
        sql: Statement text with psycopg2-style placeholders.
        params: Values bound to the placeholders.
    """
    with get_connection() as connection, connection.cursor() as cursor:
        cursor.execute(sql, params)
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def execute_batch(sql: str, params_list: list[tuple]) -> None:
    """Run ``sql`` once per parameter tuple, sent in pages of 100.

    Args:
        sql: Statement text with psycopg2-style placeholders.
        params_list: One parameter tuple per statement execution.
    """
    with get_connection() as connection, connection.cursor() as cursor:
        psycopg2.extras.execute_batch(cursor, sql, params_list, page_size=100)
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def is_available() -> bool:
    """Check if the RAG database is reachable. Never raises.

    Opens a connection and runs ``SELECT 1``; any exception is logged at
    warning level and reported as ``False``.

    NOTE (2026-04-14): currently has zero callers inside alpha-engine-data.
    The ingestion pipelines call ``get_connection()`` directly, which
    hard-fails on connect errors (correct behavior while the system is
    unstable). Kept in the module in case retrieval-side consumers want
    a non-raising probe; flag for deletion if still unused after the
    cross-repo audit completes.
    """
    try:
        with get_connection() as connection, connection.cursor() as cursor:
            cursor.execute("SELECT 1")
    except Exception as exc:
        logger.warning("RAG database unavailable: %s", exc)
        return False
    return True
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
"""Voyage embedding wrapper for RAG document and query embeddings.
|
|
2
|
+
|
|
3
|
+
Uses Voyage voyage-3-lite (512 dimensions), optimized for retrieval on
|
|
4
|
+
financial text. Batch support up to 128 texts per call.
|
|
5
|
+
|
|
6
|
+
The 512d output matches the ``embedding vector(512)`` column declared
|
|
7
|
+
in ``rag/schema.sql`` — pgvector enforces dimension on INSERT, so any
|
|
8
|
+
drift between the model and the schema would be a hard failure on
|
|
9
|
+
ingestion.
|
|
10
|
+
|
|
11
|
+
Requires: VOYAGE_API_KEY environment variable.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
import logging
|
|
17
|
+
import os
|
|
18
|
+
from typing import Optional
|
|
19
|
+
|
|
20
|
+
logger = logging.getLogger(__name__)
|
|
21
|
+
|
|
22
|
+
_client = None
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def _get_client():
    """Return the shared Voyage client, creating it on first use.

    ``voyageai`` is imported lazily so importing this module does not
    require the package; the API key is read from ``VOYAGE_API_KEY``.
    """
    global _client
    if _client is not None:
        return _client

    import voyageai

    api_key = os.environ.get("VOYAGE_API_KEY")
    _client = voyageai.Client(api_key=api_key)
    return _client
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def embed_texts(
    texts: list[str],
    input_type: str = "document",
    model: str = "voyage-3-lite",
    batch_size: int = 128,
) -> list[list[float]]:
    """Embed a batch of text chunks.

    Args:
        texts: List of text strings to embed.
        input_type: 'document' for storage, 'query' for retrieval.
        model: Voyage model name.
        batch_size: Max texts per API call (Voyage limit is 128).

    Returns:
        List of embedding vectors (each 512 floats for voyage-3-lite),
        in the same order as ``texts``. Empty input returns ``[]`` without
        creating the API client.

    Raises:
        ValueError: If ``batch_size`` is less than 1.
    """
    # A negative step makes range() empty, which would silently return []
    # and drop every text; zero only fails with a cryptic range() error.
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    total = len(texts)
    if total == 0:
        return []

    client = _get_client()
    all_embeddings: list[list[float]] = []
    for start in range(0, total, batch_size):
        batch = texts[start : start + batch_size]
        result = client.embed(batch, model=model, input_type=input_type)
        all_embeddings.extend(result.embeddings)

    return all_embeddings
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def embed_query(query: str, model: str = "voyage-3-lite") -> list[float]:
    """Return the embedding of one query string, using ``input_type="query"``."""
    vectors = embed_texts([query], input_type="query", model=model)
    return vectors[0]
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
-- Migration 0001: add content_tsv (tsvector) + GIN index to rag.chunks
|
|
2
|
+
-- Companion to alpha_engine_lib v0.5.7 schema update for hybrid retrieval.
|
|
3
|
+
--
|
|
4
|
+
-- Idempotent: ``ADD COLUMN IF NOT EXISTS`` + ``CREATE INDEX IF NOT EXISTS``.
|
|
5
|
+
-- Safe to run repeatedly. Re-running is a no-op once both objects exist.
|
|
6
|
+
--
|
|
7
|
+
-- Usage:
|
|
8
|
+
-- psql "$RAG_DATABASE_URL" -f migrations/0001_content_tsv.sql
|
|
9
|
+
--
|
|
10
|
+
-- What this does:
|
|
11
|
+
-- - Adds ``content_tsv`` STORED generated column to ``rag.chunks``.
|
|
12
|
+
-- The generated expression is ``to_tsvector('english', content)``;
|
|
13
|
+
-- PostgreSQL rewrites the table to populate the new column for every
|
|
14
|
+
-- existing row. On the current corpus scale (Neon free tier, low
|
|
15
|
+
-- thousands of chunks) this is seconds.
|
|
16
|
+
-- - Creates a GIN index on ``content_tsv`` for fast Full-Text Search
|
|
17
|
+
-- (FTS) lookups. This is the keyword-side companion to the existing
|
|
18
|
+
-- HNSW index on ``embedding``.
|
|
19
|
+
--
|
|
20
|
+
-- Why STORED rather than VIRTUAL:
|
|
21
|
+
-- - PostgreSQL releases before 18 support only STORED generated
|
|
22
|
+
-- columns; VIRTUAL generated columns arrived in PostgreSQL 18.
|
|
23
|
+
-- - A VIRTUAL column cannot be indexed (as of PostgreSQL 18), so the
|
|
24
|
+
-- GIN index created below requires a STORED column.
|
|
25
|
+
--
|
|
26
|
+
-- Locking surface:
|
|
27
|
+
-- - ``ALTER TABLE … ADD COLUMN GENERATED … STORED`` rewrites the table
|
|
28
|
+
-- under an ACCESS EXCLUSIVE lock for the duration of the rewrite. On
|
|
29
|
+
-- Neon at the current corpus size this is brief (low single-digit
|
|
30
|
+
-- seconds). At ≥1M chunks consider partitioned rollout instead.
|
|
31
|
+
-- - ``CREATE INDEX`` (without CONCURRENTLY) holds a SHARE lock that
|
|
32
|
+
-- blocks writes but not reads. Same scale caveat applies.
|
|
33
|
+
|
|
34
|
+
ALTER TABLE rag.chunks
|
|
35
|
+
ADD COLUMN IF NOT EXISTS content_tsv tsvector
|
|
36
|
+
GENERATED ALWAYS AS (to_tsvector('english', content)) STORED;
|
|
37
|
+
|
|
38
|
+
CREATE INDEX IF NOT EXISTS chunks_content_tsv_gin
|
|
39
|
+
ON rag.chunks USING gin (content_tsv);
|