alpha-engine-lib 0.32.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. alpha_engine_lib/__init__.py +3 -0
  2. alpha_engine_lib/agent_schemas.py +663 -0
  3. alpha_engine_lib/alerts.py +576 -0
  4. alpha_engine_lib/arcticdb.py +340 -0
  5. alpha_engine_lib/collector_results.py +69 -0
  6. alpha_engine_lib/cost.py +665 -0
  7. alpha_engine_lib/dates.py +273 -0
  8. alpha_engine_lib/decision_capture.py +462 -0
  9. alpha_engine_lib/ec2_spot.py +363 -0
  10. alpha_engine_lib/email_sender.py +206 -0
  11. alpha_engine_lib/eval_artifacts.py +361 -0
  12. alpha_engine_lib/logging.py +303 -0
  13. alpha_engine_lib/model_pricing.yaml +73 -0
  14. alpha_engine_lib/pillars.py +756 -0
  15. alpha_engine_lib/pipeline_status/__init__.py +70 -0
  16. alpha_engine_lib/pipeline_status/read.py +541 -0
  17. alpha_engine_lib/pipeline_status/registry.py +368 -0
  18. alpha_engine_lib/pipeline_status/templates.py +120 -0
  19. alpha_engine_lib/preflight.py +444 -0
  20. alpha_engine_lib/rag/__init__.py +39 -0
  21. alpha_engine_lib/rag/db.py +96 -0
  22. alpha_engine_lib/rag/embeddings.py +63 -0
  23. alpha_engine_lib/rag/migrations/0001_content_tsv.sql +39 -0
  24. alpha_engine_lib/rag/rerank.py +377 -0
  25. alpha_engine_lib/rag/retrieval.py +465 -0
  26. alpha_engine_lib/rag/schema.sql +65 -0
  27. alpha_engine_lib/reconcile.py +203 -0
  28. alpha_engine_lib/secrets.py +186 -0
  29. alpha_engine_lib/sources/__init__.py +35 -0
  30. alpha_engine_lib/sources/protocols.py +227 -0
  31. alpha_engine_lib/ssm_log_capture.py +274 -0
  32. alpha_engine_lib/telegram.py +165 -0
  33. alpha_engine_lib/trading_calendar.py +236 -0
  34. alpha_engine_lib/transparency.py +746 -0
  35. alpha_engine_lib/transparency_inventory.yaml +260 -0
  36. alpha_engine_lib/universe.py +83 -0
  37. alpha_engine_lib-0.32.0.dist-info/METADATA +217 -0
  38. alpha_engine_lib-0.32.0.dist-info/RECORD +40 -0
  39. alpha_engine_lib-0.32.0.dist-info/WHEEL +5 -0
  40. alpha_engine_lib-0.32.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,444 @@
1
+ """
2
+ Preflight: fast fail-fast connectivity + freshness checks.
3
+
4
+ ``BasePreflight`` provides the shared primitives; consumer modules
5
+ subclass it and override ``run()`` to compose a module-specific check
6
+ sequence. The base raises ``RuntimeError`` on any failure — consumers
7
+ catch nothing, so the raise propagates up through ``main()`` → non-zero
8
+ exit → the orchestration layer's failure handler.
9
+
10
+ Design context (2026-04-14): the alpha-engine-data DailyData step
11
+ silently ran against a stale ArcticDB universe library for two
12
+ weekdays because an ``ImportError`` on ``arcticdb`` was caught at debug
13
+ level. A freshness check on SPY would have flagged the outage in ~1s.
14
+ Preflight exists to catch that class of failure *before* spending 30
15
+ minutes on real work.
16
+
17
+ Scope is deliberately narrow: **external-world handshakes only** (env
18
+ vars, S3 reachability, ArcticDB symbol freshness). Data-correctness
19
+ hard-fails still live in the hardened collectors themselves.
20
+ """
21
+
22
+ from __future__ import annotations
23
+
24
+ import json
25
+ import logging
26
+ import os
27
+ import urllib.request
28
+ import warnings
29
+ from datetime import datetime, timezone
30
+ from pathlib import Path
31
+
32
+ log = logging.getLogger(__name__)
33
+
34
+ # Default location for the deploy-time GIT_SHA stamp inside a Lambda
35
+ # image. Stamped by deploy.sh via ``--build-arg GIT_SHA=…`` then COPYed
36
+ # to /var/task/GIT_SHA.txt; consumers running outside Lambda can pass an
37
+ # alternate path.
38
+ _DEFAULT_GIT_SHA_FILE = Path("/var/task/GIT_SHA.txt")
39
+
40
+ # Public-repo branch-HEAD API. No auth required; 60 req/hr unauth rate
41
+ # limit is fine for Lambda cold-starts and CI runs.
42
+ _GITHUB_BRANCH_URL = "https://api.github.com/repos/{repo}/branches/{branch}"
43
+
44
+
45
class BasePreflight:
    """Shared preflight primitives.

    Subclass and override :meth:`run` to compose a module-specific
    check sequence. Each primitive raises :class:`RuntimeError` on
    failure with an explanatory message that includes what was checked
    and what went wrong.
    """

    def __init__(self, bucket: str, region: str | None = None):
        """Store the bucket and region used by the S3 / ArcticDB checks.

        Args:
            bucket: S3 bucket name; must be non-empty.
            region: AWS region. Falls back to the ``AWS_REGION`` env var,
                then to ``"us-east-1"``.

        Raises:
            ValueError: If ``bucket`` is empty.
        """
        if not bucket:
            raise ValueError("BasePreflight: bucket is required")
        self.bucket = bucket
        self.region = region or os.environ.get("AWS_REGION", "us-east-1")

    # ── Composition entry point ──────────────────────────────────────────

    def run(self) -> None:
        """Execute the preflight check sequence.

        Subclasses override this to compose primitives. The default
        raises to prevent a misuse where a subclass forgets to override
        and silently passes.
        """
        raise NotImplementedError(
            f"{type(self).__name__} must override run() to compose preflight checks"
        )

    # ── Primitives ───────────────────────────────────────────────────────

    def check_env_vars(self, *names: str) -> None:
        """Raise if any of the given env vars are unset or empty."""
        missing = [n for n in names if not os.environ.get(n)]
        if missing:
            raise RuntimeError(f"Pre-flight: required env vars missing: {missing}")

    def check_s3_bucket(self) -> None:
        """Raise if the configured bucket is not reachable (auth, network, or missing)."""
        import boto3
        try:
            boto3.client("s3").head_bucket(Bucket=self.bucket)
        except Exception as exc:
            raise RuntimeError(
                f"Pre-flight: S3 bucket {self.bucket!r} unreachable: {exc}"
            ) from exc

    def check_s3_key(self, key: str, max_age_days: int | None = None) -> None:
        """Raise if ``s3://{bucket}/{key}`` is missing or older than ``max_age_days``.

        ``max_age_days=None`` disables the freshness check — existence only.
        Age is measured in whole days between now (UTC) and the object's
        ``LastModified`` timestamp.
        """
        import boto3
        from botocore.exceptions import ClientError
        try:
            head = boto3.client("s3").head_object(Bucket=self.bucket, Key=key)
        except ClientError as exc:
            err_code = exc.response.get("Error", {}).get("Code")
            if err_code in ("404", "NoSuchKey"):
                raise RuntimeError(
                    f"Pre-flight: S3 key s3://{self.bucket}/{key} does not exist"
                ) from exc
            raise RuntimeError(
                f"Pre-flight: S3 key s3://{self.bucket}/{key} unreachable: {exc}"
            ) from exc
        if max_age_days is not None:
            last_modified = head["LastModified"]
            age_days = (datetime.now(timezone.utc) - last_modified).days
            if age_days > max_age_days:
                raise RuntimeError(
                    f"Pre-flight: S3 key s3://{self.bucket}/{key} is "
                    f"{age_days} days stale (threshold {max_age_days})"
                )

    def _open_arcticdb_library(self, library: str):
        """Import arcticdb/pandas and open ``library`` on the configured bucket.

        Shared by both ArcticDB freshness checks so the URI format and the
        error messages cannot drift apart.

        Raises:
            RuntimeError: If ``arcticdb`` or ``pandas`` is not importable, or
                if the library cannot be opened.
        """
        try:
            import arcticdb as adb
            import pandas  # noqa: F401 — importability check only
        except ImportError as exc:
            raise RuntimeError(
                "Pre-flight: arcticdb not importable — install "
                "alpha-engine-lib[arcticdb] or add arcticdb to the deploy image: "
                f"{exc}"
            ) from exc

        uri = (
            f"s3s://s3.{self.region}.amazonaws.com:{self.bucket}"
            "?path_prefix=arcticdb&aws_auth=true"
        )
        try:
            return adb.Arctic(uri).get_library(library)
        except Exception as exc:
            raise RuntimeError(
                f"Pre-flight: ArcticDB library {library!r} unreachable "
                f"at {uri}: {exc}"
            ) from exc

    def check_arcticdb_fresh(
        self,
        library: str,
        symbol: str,
        max_stale_days: int,
    ) -> None:
        """Raise if ``arcticdb`` is unavailable, the library/symbol is
        unreadable, or the last date in ``symbol`` is older than
        ``max_stale_days`` calendar days from today (UTC).

        Requires the ``arcticdb`` optional extra
        (``alpha-engine-lib[arcticdb]``).
        """
        lib = self._open_arcticdb_library(library)
        import pandas as pd  # importability already verified by the helper

        try:
            df = lib.read(symbol).data
        except Exception as exc:
            raise RuntimeError(
                f"Pre-flight: ArcticDB {library}/{symbol} read failed: {exc}"
            ) from exc

        if df.empty:
            raise RuntimeError(
                f"Pre-flight: ArcticDB {library}/{symbol} is empty"
            )

        last_ts = pd.Timestamp(df.index[-1])
        # Normalize to tz-naive date for comparison against today's UTC date.
        if last_ts.tzinfo is not None:
            last_ts = last_ts.tz_convert("UTC").tz_localize(None)
        today = pd.Timestamp(datetime.now(timezone.utc).date())
        age_days = (today - last_ts.normalize()).days
        if age_days > max_stale_days:
            raise RuntimeError(
                f"Pre-flight: ArcticDB {library}/{symbol} last date "
                f"{last_ts.date()} is {age_days} days stale "
                f"(threshold {max_stale_days})"
            )

    def check_arcticdb_universe_fresh(
        self,
        library: str,
        max_stale_days: int,
        *,
        max_workers: int = 20,
    ) -> None:
        """[DEPRECATED 2026-05-05] Per-symbol freshness scan over an
        ArcticDB library.

        Deprecated because data-freshness now lives upstream in
        ``alpha-engine-data``'s preflight, which runs before any
        consumer in every Step Function. Consumers (executor,
        backtester, predictor) dropped their calls in 2026-05-05's
        consolidation arc. Scheduled for removal after 6-month soak;
        current callers should migrate to trusting SF ordering.

        Original docstring follows.

        Scan every symbol in ``library`` and raise if any symbol's
        last_date is older than ``max_stale_days`` calendar days from
        today (UTC).

        Where :meth:`check_arcticdb_fresh` covers a single canonical
        liveness probe (e.g. macro/SPY), this primitive catches the
        partial-write class — individual tickers stop receiving writes
        while the canonical SPY symbol stays fresh, so the single-symbol
        check reports healthy but downstream consumers fail two hours
        deep on stale per-ticker reads.

        Motivation (2026-04-21 backtester incident): macro.SPY was fresh,
        ASGN + MOH had stalled at 2026-04-01 because daily_append silently
        skipped them, executor's load_atr_14_pct guard aborted the
        backtester ~2 hours into its predictor-backtest mode. This scan
        catches the same class at preflight in ~5-10 seconds (20 threads
        × ~900 tickers × tail(1) read each).

        Implementation notes:
          - Reads ``tail(1)`` rather than the full series — ~20ms/symbol.
          - Read errors on any symbol are themselves fatal: a silent read
            error here would mask exactly the kind of write-skip this
            primitive exists to catch.
          - Stale list is sorted by stalest-first so the operator sees
            the worst offenders without scrolling.

        Requires the ``arcticdb`` optional extra
        (``alpha-engine-lib[arcticdb]``).

        Args:
            library: ArcticDB library name to scan (e.g. ``"universe"``).
            max_stale_days: Symbols with ``last_date`` older than today
                minus this many calendar days are flagged as stale.
            max_workers: Thread pool size for the per-symbol scan.
                Default 20 matches backtester precedent. Tune lower for
                rate-limited backends; higher for fan-out-bound cases.

        Raises:
            RuntimeError: If arcticdb is unimportable, the library is
                unreachable, the library is empty, any symbol's
                ``tail(1)`` read raises, or ANY symbol is stale beyond
                the threshold.
        """
        warnings.warn(
            "BasePreflight.check_arcticdb_universe_fresh is deprecated; "
            "data-freshness now lives upstream in alpha-engine-data's "
            "preflight (runs before consumers in every Step Function). "
            "Scheduled for removal after 6-month soak.",
            DeprecationWarning,
            stacklevel=2,
        )

        from concurrent.futures import ThreadPoolExecutor
        from datetime import date, timedelta

        lib = self._open_arcticdb_library(library)
        import pandas as pd  # importability already verified by the helper

        symbols = list(lib.list_symbols())
        if not symbols:
            raise RuntimeError(
                f"Pre-flight: ArcticDB library {library!r} on bucket "
                f"{self.bucket!r} has zero symbols — upstream pipeline "
                "has not written anything."
            )

        # Use the UTC calendar date, matching check_arcticdb_fresh and the
        # UTC-normalized per-symbol timestamps below. A local-time
        # ``date.today()`` shifts the cutoff by a day on non-UTC hosts.
        today = datetime.now(timezone.utc).date()
        cutoff = today - timedelta(days=max_stale_days)

        def _last_date_for(sym: str) -> tuple[str, "date | None", "str | None"]:
            # Returns (symbol, last_date, error); exactly one of the last
            # two is None on a well-formed result.
            try:
                df = lib.tail(sym, n=1).data
                if df.empty:
                    return sym, None, "empty frame"
                last_ts = pd.Timestamp(df.index[-1])
                if last_ts.tzinfo is not None:
                    last_ts = last_ts.tz_convert("UTC").tz_localize(None)
                return sym, last_ts.date(), None
            except Exception as exc:  # pragma: no cover — covered via mock
                return sym, None, str(exc)

        stale: list[tuple[str, date]] = []
        errored: list[tuple[str, str]] = []
        with ThreadPoolExecutor(max_workers=max_workers) as pool:
            for sym, last_date, err in pool.map(_last_date_for, symbols):
                if err is not None:
                    errored.append((sym, err))
                elif last_date is None:
                    errored.append((sym, "no last_date"))
                elif last_date < cutoff:
                    stale.append((sym, last_date))

        if errored:
            sample = [f"{s}({e[:40]})" for s, e in errored[:5]]
            raise RuntimeError(
                f"Pre-flight: {len(errored)} symbol(s) in ArcticDB "
                f"library {library!r} could not be read for freshness check. "
                f"Sample: {sample}. Treated as fatal because a silent read "
                "error here would mask exactly the kind of per-symbol write "
                "skip this scan exists to catch."
            )

        if stale:
            stale.sort(key=lambda x: x[1])
            summary = [f"{sym} (last={d.isoformat()})" for sym, d in stale[:10]]
            more = f" (+{len(stale) - 10} more)" if len(stale) > 10 else ""
            raise RuntimeError(
                f"Pre-flight: {len(stale)}/{len(symbols)} symbol(s) in "
                f"ArcticDB library {library!r} have stale data (older "
                f"than {max_stale_days} calendar days, "
                f"cutoff={cutoff.isoformat()}). Top offenders: "
                f"{summary}{more}. Backfill upstream or investigate "
                "the per-symbol write path before re-running."
            )

    def check_ib_paper_account(self, account_id: str) -> None:
        """Raise if ``account_id`` doesn't start with 'D' (IBKR paper prefix).

        Defensive check for the executor — prevents live credentials
        leaking into a paper-trading run (or vice versa).
        """
        if not account_id:
            raise RuntimeError("Pre-flight: IB account_id is empty")
        if not account_id.startswith("D"):
            raise RuntimeError(
                f"Pre-flight: IB account_id {account_id!r} is not a paper "
                "account (paper accounts start with 'D')"
            )

    def check_deploy_drift(
        self,
        repo: str,
        branch: str = "main",
        *,
        sha_file: Path | None = None,
        timeout: float = 5.0,
    ) -> None:
        """Hard-fail if the deploy-baked SHA lags ``repo@branch`` HEAD.

        The deployed image is stamped with ``GIT_SHA`` at build time
        (via Docker ``--build-arg GIT_SHA=…``); this check compares
        that stamp against the current ``branch`` HEAD SHA on GitHub.
        A mismatch means a merge landed on main but the CI deploy
        workflow either failed, was skipped by a paths filter, or
        hasn't run yet — i.e. the deployed code is a prior commit,
        which is exactly the deploy-drift mode that motivated this
        check (2026-04-20 coverage-gap session).

        Degraded modes (warn, don't fail) — chosen so a GitHub outage
        or an unstamped legacy image doesn't block a trading-hours
        Lambda:
          - Stamp file missing or "unknown" → image predates drift
            checking; log warn and continue.
          - GitHub API unreachable → log warn and continue.

        Hard-fail mode — when both stamps are present and differ.

        Args:
            repo: ``"owner/name"`` — e.g. ``"cipher813/alpha-engine-predictor"``.
            branch: Branch HEAD to compare against. Default ``"main"``.
            sha_file: Path to the GIT_SHA stamp. Defaults to
                ``/var/task/GIT_SHA.txt`` (Lambda image convention).
            timeout: GitHub API timeout in seconds.

        Raises:
            RuntimeError: If both SHAs are known and they differ.
        """
        stamp_path = sha_file or _DEFAULT_GIT_SHA_FILE
        baked = _read_baked_git_sha(stamp_path)
        if baked is None:
            log.warning(
                "Deploy-drift: no baked GIT_SHA in image at %s (legacy build "
                "or build-arg omitted). Rebuild via deploy.sh to enable this check.",
                stamp_path,
            )
            return

        upstream = _fetch_origin_main_sha(repo, branch=branch, timeout=timeout)
        if upstream is None:
            # _fetch_origin_main_sha already logged the reason
            return

        if baked != upstream:
            raise RuntimeError(
                f"Deploy drift: image was built from {baked[:12]} but "
                f"{repo}@{branch} is now at {upstream[:12]}. The CI deploy "
                f"workflow did not promote the latest commit. Re-run "
                f"`.github/workflows/deploy.yml` on main (or the local "
                f"deploy.sh) before resuming. Refusing to proceed — "
                f"running stale code on new signals is how 2026-04-20 happened."
            )

        log.info("Deploy-drift: image at %s matches %s@%s ✓", baked[:12], repo, branch)
405
+
406
+
407
+ def _read_baked_git_sha(sha_file: Path) -> str | None:
408
+ """Return the SHA baked into the image by ``deploy.sh --build-arg GIT_SHA=…``.
409
+
410
+ Returns ``None`` if the stamp file is missing (legacy image) or holds
411
+ ``"unknown"`` (build-arg omitted). Callers decide whether ``None`` is
412
+ warn-and-continue or hard-fail.
413
+ """
414
+ try:
415
+ sha = sha_file.read_text().strip()
416
+ except FileNotFoundError:
417
+ return None
418
+ if not sha or sha == "unknown":
419
+ return None
420
+ return sha
421
+
422
+
423
def _fetch_origin_main_sha(repo: str, branch: str = "main", timeout: float = 5.0) -> str | None:
    """Fetch HEAD SHA of ``branch`` for ``repo`` via GitHub REST API.

    Returns ``None`` on any network/parse error — the drift check treats a
    GitHub outage as "unknown, proceed with warning" rather than blocking
    the consumer. Every ``None`` return is preceded by a warning log, so
    callers do not need to log the reason themselves.

    Args:
        repo: ``"owner/name"`` (e.g. ``"cipher813/alpha-engine-predictor"``).
        branch: Branch whose HEAD commit is looked up.
        timeout: Timeout in seconds passed to ``urlopen``.
    """
    import http.client

    url = _GITHUB_BRANCH_URL.format(repo=repo, branch=branch)
    req = urllib.request.Request(url, headers={"Accept": "application/vnd.github+json"})
    try:
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            payload = json.loads(resp.read())
    except (OSError, ValueError, http.client.HTTPException) as exc:
        # OSError covers urllib.error.URLError/HTTPError plus the bare
        # TimeoutError that urlopen raises on a read-phase timeout (the
        # 2026-05-07 weekday SF DeployDriftCheck failure: read timed out
        # inside http.client.getresponse, which is past urllib's
        # OSError → URLError wrap point in do_open).
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError;
        # HTTPException covers truncated bodies (IncompleteRead), which
        # are not OSError subclasses.
        log.warning("Deploy-drift: GitHub API unreachable (%s) — cannot compare", exc)
        return None

    # Validate the payload shape instead of chaining .get() calls: a JSON
    # list, or ``"commit": null``, would otherwise raise AttributeError.
    commit = payload.get("commit") if isinstance(payload, dict) else None
    sha = commit.get("sha") if isinstance(commit, dict) else None
    if not isinstance(sha, str) or not sha:
        log.warning(
            "Deploy-drift: GitHub API response for %s@%s has no commit SHA — cannot compare",
            repo,
            branch,
        )
        return None
    return sha
@@ -0,0 +1,39 @@
1
+ """RAG submodule — semantic retrieval over SEC filings, earnings transcripts, and theses.
2
+
3
+ Shared library code used by both alpha-engine-research (retrieval consumer:
4
+ qual analyst tools) and alpha-engine-data (ingestion producer: weekly Saturday
5
+ RAGIngestion step). Previously duplicated across both repos with drift; moved
6
+ here in alpha-engine-lib v0.3.0 as the single source of truth.
7
+
8
+ Top-level imports re-export the most common surface so consumers can write
9
+ ``from alpha_engine_lib.rag import retrieve`` without reaching into submodules.
10
+
11
+ Pgvector + psycopg2 are heavy dependencies; install via the ``[rag]`` extra:
12
+
13
+ pip install "alpha-engine-lib[rag] @ git+https://github.com/cipher813/alpha-engine-lib@v0.3.0"
14
+ """
15
+
16
+ # Auto-load .env so RAG_DATABASE_URL and VOYAGE_API_KEY are available
17
+ # whether run from CLI, Lambda (already in env), or imported in tests.
18
+ try:
19
+ from dotenv import load_dotenv
20
+ load_dotenv()
21
+ except ImportError:
22
+ pass # python-dotenv not installed (e.g. Lambda) — env vars set externally
23
+
24
+ from .db import get_connection, is_available
25
+ from .embeddings import embed_texts
26
+ from .retrieval import (
27
+ retrieve,
28
+ ingest_document,
29
+ document_exists,
30
+ )
31
+
32
+ __all__ = [
33
+ "get_connection",
34
+ "is_available",
35
+ "embed_texts",
36
+ "retrieve",
37
+ "ingest_document",
38
+ "document_exists",
39
+ ]
@@ -0,0 +1,96 @@
1
+ """Neon PostgreSQL connection management for RAG.
2
+
3
+ Uses psycopg2 with connection pooling suitable for Lambda (short-lived
4
+ connections via Neon's built-in pgbouncer pooler).
5
+
6
+ Requires: RAG_DATABASE_URL environment variable (Neon pooled connection string).
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import logging
12
+ import os
13
+ from contextlib import contextmanager
14
+
15
+ import psycopg2
16
+ import psycopg2.extras
17
+ from pgvector.psycopg2 import register_vector
18
+
19
+ logger = logging.getLogger(__name__)
20
+
21
+ _DATABASE_URL: str | None = None
22
+
23
+
24
def _get_url() -> str:
    """Return the RAG database URL, caching it after the first successful read.

    The value comes from the ``RAG_DATABASE_URL`` environment variable.
    Only a non-empty value is cached, so an unset or empty variable is
    re-read on the next call rather than pinned for the life of the process.

    Raises:
        RuntimeError: If ``RAG_DATABASE_URL`` is unset or empty.
    """
    global _DATABASE_URL
    if not _DATABASE_URL:
        # ``or None`` keeps an empty string from being stored as the cache.
        _DATABASE_URL = os.environ.get("RAG_DATABASE_URL") or None
    if not _DATABASE_URL:
        raise RuntimeError("RAG_DATABASE_URL not set — cannot connect to vector DB")
    return _DATABASE_URL
31
+
32
+
33
@contextmanager
def get_connection():
    """Context manager for a database connection.

    Opens a new connection per call (Neon pooler handles connection reuse
    server-side). Commits on success, rolls back on exception, and always
    closes the connection — including when codec registration fails.

    Yields:
        An open psycopg2 connection with pgvector codecs registered.
    """
    conn = psycopg2.connect(_get_url())
    try:
        # Register pgvector type codecs so SELECTs on `vector` columns return
        # numpy arrays instead of stringified lists. Without this, reads like
        # rag/pipelines/filing_change_detection.py crash with
        # "could not convert string to float" on np.array(embedding). Must run
        # per-connection because psycopg2 scopes type adapters to the connection.
        # Kept inside the try so a registration failure still closes ``conn``.
        register_vector(conn)
        yield conn
        conn.commit()
    except Exception:
        conn.rollback()
        raise
    finally:
        conn.close()
55
+
56
+
57
def execute_query(sql: str, params: tuple | list = ()) -> list[dict]:
    """Run a SELECT and return every row as a plain ``dict``.

    Args:
        sql: Query text, with placeholders for ``params``.
        params: Values bound to the placeholders.

    Returns:
        One dict per fetched row, keyed by column name.
    """
    cursor_kwargs = {"cursor_factory": psycopg2.extras.RealDictCursor}
    with get_connection() as conn, conn.cursor(**cursor_kwargs) as cursor:
        cursor.execute(sql, params)
        fetched = cursor.fetchall()
        # Copy into plain dicts so callers do not hold cursor row objects.
        return list(map(dict, fetched))
63
+
64
+
65
def execute_insert(sql: str, params: tuple | list = ()) -> None:
    """Run a single INSERT/UPDATE statement on a fresh connection.

    Args:
        sql: Statement text, with placeholders for ``params``.
        params: Values bound to the placeholders.
    """
    with get_connection() as conn, conn.cursor() as cursor:
        cursor.execute(sql, params)
70
+
71
+
72
def execute_batch(sql: str, params_list: list[tuple]) -> None:
    """Run ``sql`` once per parameter tuple, sent in pages of 100.

    Args:
        sql: Statement text, with placeholders.
        params_list: One parameter tuple per execution.
    """
    with get_connection() as conn, conn.cursor() as cursor:
        psycopg2.extras.execute_batch(cursor, sql, params_list, page_size=100)
77
+
78
+
79
def is_available() -> bool:
    """Probe the RAG database with ``SELECT 1``. Never raises.

    NOTE (2026-04-14): currently has zero callers inside alpha-engine-data.
    The ingestion pipelines call ``get_connection()`` directly, which
    hard-fails on connect errors (correct behavior while the system is
    unstable). Kept in the module in case retrieval-side consumers want
    a non-raising probe; flag for deletion if still unused after the
    cross-repo audit completes.

    Returns:
        ``True`` if the probe query ran and the connection closed cleanly;
        ``False`` (after logging a warning) on any exception.
    """
    try:
        with get_connection() as conn, conn.cursor() as cursor:
            cursor.execute("SELECT 1")
    except Exception as e:
        logger.warning("RAG database unavailable: %s", e)
        return False
    return True
@@ -0,0 +1,63 @@
1
+ """Voyage embedding wrapper for RAG document and query embeddings.
2
+
3
+ Uses Voyage voyage-3-lite (512 dimensions), optimized for retrieval on
4
+ financial text. Batch support up to 128 texts per call.
5
+
6
+ The 512d output matches the ``embedding vector(512)`` column declared
7
+ in ``rag/schema.sql`` — pgvector enforces dimension on INSERT, so any
8
+ drift between the model and the schema would be a hard failure on
9
+ ingestion.
10
+
11
+ Requires: VOYAGE_API_KEY environment variable.
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ import logging
17
+ import os
18
+ from typing import Optional
19
+
20
+ logger = logging.getLogger(__name__)
21
+
22
+ _client = None
23
+
24
+
25
def _get_client():
    """Return the module-wide Voyage client, creating it on first use.

    ``voyageai`` is imported lazily so importing this module does not
    require the package. The API key is read from ``VOYAGE_API_KEY``.
    """
    global _client
    if _client is not None:
        return _client

    import voyageai

    api_key = os.environ.get("VOYAGE_API_KEY")
    _client = voyageai.Client(api_key=api_key)
    return _client
31
+
32
+
33
def embed_texts(
    texts: list[str],
    input_type: str = "document",
    model: str = "voyage-3-lite",
    batch_size: int = 128,
) -> list[list[float]]:
    """Embed a batch of text chunks.

    Args:
        texts: List of text strings to embed.
        input_type: 'document' for storage, 'query' for retrieval.
        model: Voyage model name.
        batch_size: Max texts per API call (Voyage limit is 128). Must be
            a positive integer.

    Returns:
        List of embedding vectors (each 512 floats for voyage-3-lite), in
        the same order as ``texts``. Empty input returns ``[]`` without
        creating a client or calling the API.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    # A non-positive step would either raise an opaque range() error (0)
    # or silently embed nothing (negative), dropping every input text.
    if batch_size <= 0:
        raise ValueError(f"embed_texts: batch_size must be positive, got {batch_size}")
    if not texts:
        return []

    client = _get_client()
    all_embeddings: list[list[float]] = []

    for start in range(0, len(texts), batch_size):
        batch = texts[start : start + batch_size]
        result = client.embed(batch, model=model, input_type=input_type)
        all_embeddings.extend(result.embeddings)

    return all_embeddings
59
+
60
+
61
def embed_query(query: str, model: str = "voyage-3-lite") -> list[float]:
    """Embed one query string and return its single vector.

    Delegates to ``embed_texts`` with ``input_type="query"``.
    """
    vectors = embed_texts([query], input_type="query", model=model)
    return vectors[0]
@@ -0,0 +1,39 @@
1
+ -- Migration 0001: add content_tsv (tsvector) + GIN index to rag.chunks
2
+ -- Companion to alpha_engine_lib v0.5.7 schema update for hybrid retrieval.
3
+ --
4
+ -- Idempotent: ``ADD COLUMN IF NOT EXISTS`` + ``CREATE INDEX IF NOT EXISTS``.
5
+ -- Safe to run repeatedly. Re-running is a no-op once both objects exist.
6
+ --
7
+ -- Usage:
8
+ -- psql "$RAG_DATABASE_URL" -f migrations/0001_content_tsv.sql
9
+ --
10
+ -- What this does:
11
+ -- - Adds ``content_tsv`` STORED generated column to ``rag.chunks``.
12
+ -- The generated expression is ``to_tsvector('english', content)``;
13
+ -- PostgreSQL rewrites the table to populate the new column for every
14
+ -- existing row. On the current corpus scale (Neon free tier, low
15
+ -- thousands of chunks) this is seconds.
16
+ -- - Creates a GIN index on ``content_tsv`` for fast Full-Text Search
17
+ -- (FTS) lookups. This is the keyword-side companion to the existing
18
+ -- HNSW index on ``embedding``.
19
+ --
20
+ -- Why STORED rather than VIRTUAL:
21
+ -- - PostgreSQL 17 and earlier support only STORED generated columns;
22
+ -- VIRTUAL columns arrived in PostgreSQL 18 (check the server version in use).
23
+ -- - A VIRTUAL column is computed at read time and cannot back the GIN
24
+ -- index created below, so STORED is required here either way.
25
+ --
26
+ -- Locking surface:
27
+ -- - ``ALTER TABLE … ADD COLUMN GENERATED … STORED`` rewrites the table
28
+ -- under an ACCESS EXCLUSIVE lock for the duration of the rewrite. On
29
+ -- Neon at the current corpus size this is brief (low single-digit
30
+ -- seconds). At ≥1M chunks consider partitioned rollout instead.
31
+ -- - ``CREATE INDEX`` (without CONCURRENTLY) holds a SHARE lock that
32
+ -- blocks writes but not reads. Same scale caveat applies.
33
+
34
-- Keyword-search column: a tsvector derived from ``content``, stored on write.
ALTER TABLE rag.chunks
    ADD COLUMN IF NOT EXISTS content_tsv tsvector
        GENERATED ALWAYS AS (to_tsvector('english', content)) STORED;

-- GIN index backing full-text lookups on the generated column.
CREATE INDEX IF NOT EXISTS chunks_content_tsv_gin
    ON rag.chunks
    USING gin (content_tsv);