alpha-engine-lib 0.32.0__tar.gz → 0.34.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73) hide show
  1. {alpha_engine_lib-0.32.0 → alpha_engine_lib-0.34.0}/PKG-INFO +1 -1
  2. {alpha_engine_lib-0.32.0 → alpha_engine_lib-0.34.0}/pyproject.toml +1 -1
  3. {alpha_engine_lib-0.32.0 → alpha_engine_lib-0.34.0}/src/alpha_engine_lib/__init__.py +1 -1
  4. {alpha_engine_lib-0.32.0 → alpha_engine_lib-0.34.0}/src/alpha_engine_lib/cost.py +93 -0
  5. {alpha_engine_lib-0.32.0 → alpha_engine_lib-0.34.0}/src/alpha_engine_lib/rag/rerank.py +36 -147
  6. {alpha_engine_lib-0.32.0 → alpha_engine_lib-0.34.0}/src/alpha_engine_lib/rag/retrieval.py +7 -8
  7. {alpha_engine_lib-0.32.0 → alpha_engine_lib-0.34.0}/src/alpha_engine_lib.egg-info/PKG-INFO +1 -1
  8. {alpha_engine_lib-0.32.0 → alpha_engine_lib-0.34.0}/tests/test_cost.py +116 -0
  9. {alpha_engine_lib-0.32.0 → alpha_engine_lib-0.34.0}/tests/test_rag_rerank.py +7 -81
  10. {alpha_engine_lib-0.32.0 → alpha_engine_lib-0.34.0}/README.md +0 -0
  11. {alpha_engine_lib-0.32.0 → alpha_engine_lib-0.34.0}/setup.cfg +0 -0
  12. {alpha_engine_lib-0.32.0 → alpha_engine_lib-0.34.0}/src/alpha_engine_lib/agent_schemas.py +0 -0
  13. {alpha_engine_lib-0.32.0 → alpha_engine_lib-0.34.0}/src/alpha_engine_lib/alerts.py +0 -0
  14. {alpha_engine_lib-0.32.0 → alpha_engine_lib-0.34.0}/src/alpha_engine_lib/arcticdb.py +0 -0
  15. {alpha_engine_lib-0.32.0 → alpha_engine_lib-0.34.0}/src/alpha_engine_lib/collector_results.py +0 -0
  16. {alpha_engine_lib-0.32.0 → alpha_engine_lib-0.34.0}/src/alpha_engine_lib/dates.py +0 -0
  17. {alpha_engine_lib-0.32.0 → alpha_engine_lib-0.34.0}/src/alpha_engine_lib/decision_capture.py +0 -0
  18. {alpha_engine_lib-0.32.0 → alpha_engine_lib-0.34.0}/src/alpha_engine_lib/ec2_spot.py +0 -0
  19. {alpha_engine_lib-0.32.0 → alpha_engine_lib-0.34.0}/src/alpha_engine_lib/email_sender.py +0 -0
  20. {alpha_engine_lib-0.32.0 → alpha_engine_lib-0.34.0}/src/alpha_engine_lib/eval_artifacts.py +0 -0
  21. {alpha_engine_lib-0.32.0 → alpha_engine_lib-0.34.0}/src/alpha_engine_lib/logging.py +0 -0
  22. {alpha_engine_lib-0.32.0 → alpha_engine_lib-0.34.0}/src/alpha_engine_lib/model_pricing.yaml +0 -0
  23. {alpha_engine_lib-0.32.0 → alpha_engine_lib-0.34.0}/src/alpha_engine_lib/pillars.py +0 -0
  24. {alpha_engine_lib-0.32.0 → alpha_engine_lib-0.34.0}/src/alpha_engine_lib/pipeline_status/__init__.py +0 -0
  25. {alpha_engine_lib-0.32.0 → alpha_engine_lib-0.34.0}/src/alpha_engine_lib/pipeline_status/read.py +0 -0
  26. {alpha_engine_lib-0.32.0 → alpha_engine_lib-0.34.0}/src/alpha_engine_lib/pipeline_status/registry.py +0 -0
  27. {alpha_engine_lib-0.32.0 → alpha_engine_lib-0.34.0}/src/alpha_engine_lib/pipeline_status/templates.py +0 -0
  28. {alpha_engine_lib-0.32.0 → alpha_engine_lib-0.34.0}/src/alpha_engine_lib/preflight.py +0 -0
  29. {alpha_engine_lib-0.32.0 → alpha_engine_lib-0.34.0}/src/alpha_engine_lib/rag/__init__.py +0 -0
  30. {alpha_engine_lib-0.32.0 → alpha_engine_lib-0.34.0}/src/alpha_engine_lib/rag/db.py +0 -0
  31. {alpha_engine_lib-0.32.0 → alpha_engine_lib-0.34.0}/src/alpha_engine_lib/rag/embeddings.py +0 -0
  32. {alpha_engine_lib-0.32.0 → alpha_engine_lib-0.34.0}/src/alpha_engine_lib/rag/migrations/0001_content_tsv.sql +0 -0
  33. {alpha_engine_lib-0.32.0 → alpha_engine_lib-0.34.0}/src/alpha_engine_lib/rag/schema.sql +0 -0
  34. {alpha_engine_lib-0.32.0 → alpha_engine_lib-0.34.0}/src/alpha_engine_lib/reconcile.py +0 -0
  35. {alpha_engine_lib-0.32.0 → alpha_engine_lib-0.34.0}/src/alpha_engine_lib/secrets.py +0 -0
  36. {alpha_engine_lib-0.32.0 → alpha_engine_lib-0.34.0}/src/alpha_engine_lib/sources/__init__.py +0 -0
  37. {alpha_engine_lib-0.32.0 → alpha_engine_lib-0.34.0}/src/alpha_engine_lib/sources/protocols.py +0 -0
  38. {alpha_engine_lib-0.32.0 → alpha_engine_lib-0.34.0}/src/alpha_engine_lib/ssm_log_capture.py +0 -0
  39. {alpha_engine_lib-0.32.0 → alpha_engine_lib-0.34.0}/src/alpha_engine_lib/telegram.py +0 -0
  40. {alpha_engine_lib-0.32.0 → alpha_engine_lib-0.34.0}/src/alpha_engine_lib/trading_calendar.py +0 -0
  41. {alpha_engine_lib-0.32.0 → alpha_engine_lib-0.34.0}/src/alpha_engine_lib/transparency.py +0 -0
  42. {alpha_engine_lib-0.32.0 → alpha_engine_lib-0.34.0}/src/alpha_engine_lib/transparency_inventory.yaml +0 -0
  43. {alpha_engine_lib-0.32.0 → alpha_engine_lib-0.34.0}/src/alpha_engine_lib/universe.py +0 -0
  44. {alpha_engine_lib-0.32.0 → alpha_engine_lib-0.34.0}/src/alpha_engine_lib.egg-info/SOURCES.txt +0 -0
  45. {alpha_engine_lib-0.32.0 → alpha_engine_lib-0.34.0}/src/alpha_engine_lib.egg-info/dependency_links.txt +0 -0
  46. {alpha_engine_lib-0.32.0 → alpha_engine_lib-0.34.0}/src/alpha_engine_lib.egg-info/requires.txt +0 -0
  47. {alpha_engine_lib-0.32.0 → alpha_engine_lib-0.34.0}/src/alpha_engine_lib.egg-info/top_level.txt +0 -0
  48. {alpha_engine_lib-0.32.0 → alpha_engine_lib-0.34.0}/tests/test_agent_schemas.py +0 -0
  49. {alpha_engine_lib-0.32.0 → alpha_engine_lib-0.34.0}/tests/test_alerts.py +0 -0
  50. {alpha_engine_lib-0.32.0 → alpha_engine_lib-0.34.0}/tests/test_arcticdb.py +0 -0
  51. {alpha_engine_lib-0.32.0 → alpha_engine_lib-0.34.0}/tests/test_collector_results.py +0 -0
  52. {alpha_engine_lib-0.32.0 → alpha_engine_lib-0.34.0}/tests/test_dates.py +0 -0
  53. {alpha_engine_lib-0.32.0 → alpha_engine_lib-0.34.0}/tests/test_decision_capture.py +0 -0
  54. {alpha_engine_lib-0.32.0 → alpha_engine_lib-0.34.0}/tests/test_ec2_spot.py +0 -0
  55. {alpha_engine_lib-0.32.0 → alpha_engine_lib-0.34.0}/tests/test_email_sender.py +0 -0
  56. {alpha_engine_lib-0.32.0 → alpha_engine_lib-0.34.0}/tests/test_eval_artifacts.py +0 -0
  57. {alpha_engine_lib-0.32.0 → alpha_engine_lib-0.34.0}/tests/test_logging.py +0 -0
  58. {alpha_engine_lib-0.32.0 → alpha_engine_lib-0.34.0}/tests/test_pillars.py +0 -0
  59. {alpha_engine_lib-0.32.0 → alpha_engine_lib-0.34.0}/tests/test_pipeline_status_read.py +0 -0
  60. {alpha_engine_lib-0.32.0 → alpha_engine_lib-0.34.0}/tests/test_pipeline_status_registry.py +0 -0
  61. {alpha_engine_lib-0.32.0 → alpha_engine_lib-0.34.0}/tests/test_pipeline_status_templates.py +0 -0
  62. {alpha_engine_lib-0.32.0 → alpha_engine_lib-0.34.0}/tests/test_preflight.py +0 -0
  63. {alpha_engine_lib-0.32.0 → alpha_engine_lib-0.34.0}/tests/test_rag.py +0 -0
  64. {alpha_engine_lib-0.32.0 → alpha_engine_lib-0.34.0}/tests/test_rag_retrieval_hybrid.py +0 -0
  65. {alpha_engine_lib-0.32.0 → alpha_engine_lib-0.34.0}/tests/test_reconcile.py +0 -0
  66. {alpha_engine_lib-0.32.0 → alpha_engine_lib-0.34.0}/tests/test_secrets.py +0 -0
  67. {alpha_engine_lib-0.32.0 → alpha_engine_lib-0.34.0}/tests/test_sources_protocols.py +0 -0
  68. {alpha_engine_lib-0.32.0 → alpha_engine_lib-0.34.0}/tests/test_ssm_log_capture.py +0 -0
  69. {alpha_engine_lib-0.32.0 → alpha_engine_lib-0.34.0}/tests/test_telegram.py +0 -0
  70. {alpha_engine_lib-0.32.0 → alpha_engine_lib-0.34.0}/tests/test_trading_calendar.py +0 -0
  71. {alpha_engine_lib-0.32.0 → alpha_engine_lib-0.34.0}/tests/test_transparency.py +0 -0
  72. {alpha_engine_lib-0.32.0 → alpha_engine_lib-0.34.0}/tests/test_universe.py +0 -0
  73. {alpha_engine_lib-0.32.0 → alpha_engine_lib-0.34.0}/tests/test_version_pin.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: alpha-engine-lib
3
- Version: 0.32.0
3
+ Version: 0.34.0
4
4
  Summary: Shared utilities for the Alpha Engine modules: preflight, structured logging with secret-redaction, ArcticDB universe access, NYSE-calendar dates + freshness predicates, decision capture, cost telemetry, RAG, agent output schemas, SSM-backed secrets, Telegram alerts + SNS fan-out, EC2 spot-launch resilience, SSM log-capture chokepoint, and Step-Functions execution-state projection. Full surface documented in README.
5
5
  Author: Brian McMahon
6
6
  License: Proprietary
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "alpha-engine-lib"
7
- version = "0.32.0"
7
+ version = "0.34.0"
8
8
  description = "Shared utilities for the Alpha Engine modules: preflight, structured logging with secret-redaction, ArcticDB universe access, NYSE-calendar dates + freshness predicates, decision capture, cost telemetry, RAG, agent output schemas, SSM-backed secrets, Telegram alerts + SNS fan-out, EC2 spot-launch resilience, SSM log-capture chokepoint, and Step-Functions execution-state projection. Full surface documented in README."
9
9
  readme = "README.md"
10
10
  # EC2 still runs Python 3.9 on the always-on micro instance (boto3 drops
@@ -1,3 +1,3 @@
1
1
  """alpha-engine-lib — shared utilities for Alpha Engine modules."""
2
2
 
3
- __version__ = "0.32.0"
3
+ __version__ = "0.34.0"
@@ -663,3 +663,96 @@ def metadata_from_anthropic_message(
663
663
  web_fetch_requests=(getattr(stu, "web_fetch_requests", 0) or 0)
664
664
  if stu is not None else 0,
665
665
  )
666
+
667
+
668
+ # ── Capture chokepoint (v0.33.0) ──────────────────────────────────────────
669
+
670
+
671
+ def record_anthropic_call(
672
+ msg: _AnthropicMessageLike,
673
+ *,
674
+ model_name: str | None = None,
675
+ pricing: PriceTable | None = None,
676
+ tool_fees: ToolFeeTable | None = None,
677
+ at: datetime | date | None = None,
678
+ extra_fields: dict[str, Any] | None = None,
679
+ ) -> dict[str, Any]:
680
+ """Map an Anthropic SDK ``Message`` → priced JSONL-ready cost record.
681
+
682
+ Single chokepoint for raw-SDK consumers (morning-signal, alpha-engine
683
+ /executor, alpha-engine-data, et al.). Returns a flat dict ready for
684
+ ``json.dumps``; the caller chooses the sink (local file / S3 /
685
+ CloudWatch). No I/O performed here — pure mapper.
686
+
687
+ Per ``[[feedback_lift_invariants_to_chokepoint_after_second_recurrence]]``
688
+ — extracted from morning-signal v0.32.0's ``cost_telemetry.record_call_cost``
689
+ after data + executor became the 2nd + 3rd consumers needing the same
690
+ shape. Composes with :func:`metadata_from_anthropic_message` (token-count
691
+ extraction) + :func:`recompute_cost` (USD pricing) into the single call
692
+ a typical consumer wants.
693
+
694
+ Parameters
695
+ ----------
696
+ msg
697
+ Anthropic SDK ``Message`` (or anything matching
698
+ :class:`_AnthropicMessageLike`). Forwarded to
699
+ :func:`metadata_from_anthropic_message`.
700
+ model_name
701
+ Override for ``ModelMetadata.model_name``. Defaults to ``msg.model``.
702
+ pricing
703
+ :class:`PriceTable` for USD recompute. Defaults to
704
+ :func:`load_default_pricing` when ``None`` (packaged Anthropic rate
705
+ card). Pass an explicit table for operator-managed pricing.
706
+ tool_fees
707
+ :class:`ToolFeeTable` for server-tool fee recompute. Defaults to
708
+ :func:`load_default_tool_fees`. Pass an explicit table for
709
+ operator-managed fees.
710
+ at
711
+ Wall-clock date for price-card / tool-fee lookup. Defaults to
712
+ ``datetime.now(timezone.utc)``. Pass the original capture
713
+ timestamp for historical recompute.
714
+ extra_fields
715
+ Optional dict merged into the returned record AFTER the standard
716
+ fields. Consumers attach run-context (``run_id``, ``agent_id``,
717
+ ``sector_team_id``, ``edition``, ``date``, ...) here so the
718
+ JSONL row is self-describing without out-of-band metadata.
719
+
720
+ Returns
721
+ -------
722
+ dict
723
+ Flat dict with: ``ts`` (ISO-8601 UTC capture time), ``model``,
724
+ ``input_tokens``, ``output_tokens``, ``cache_read_tokens``,
725
+ ``cache_create_tokens``, ``web_search_requests``,
726
+ ``web_fetch_requests``, ``cost_usd`` (priced via
727
+ ``recompute_cost``), plus any ``extra_fields`` merged in.
728
+ Caller-owned field names take precedence over the standard set
729
+ when keys collide.
730
+
731
+ Raises
732
+ ------
733
+ PriceCardLookupError
734
+ Propagated from :func:`recompute_cost` if no price card matches
735
+ ``model_name`` at ``at``, or if the message records non-zero
736
+ server-tool requests with no matching :class:`ToolFee` in the
737
+ active table. Per ``[[feedback_no_silent_fails]]`` — a missing
738
+ card on a real call is a load-bearing error worth surfacing.
739
+ """
740
+ metadata = metadata_from_anthropic_message(msg, model_name=model_name)
741
+ table = pricing if pricing is not None else load_default_pricing()
742
+ fees = tool_fees if tool_fees is not None else load_default_tool_fees()
743
+ recompute_cost(metadata, table, tool_fee_table=fees, at=at)
744
+
745
+ record: dict[str, Any] = {
746
+ "ts": datetime.now(timezone.utc).isoformat(),
747
+ "model": metadata.model_name,
748
+ "input_tokens": metadata.input_tokens,
749
+ "output_tokens": metadata.output_tokens,
750
+ "cache_read_tokens": metadata.cache_read_tokens,
751
+ "cache_create_tokens": metadata.cache_create_tokens,
752
+ "web_search_requests": metadata.web_search_requests,
753
+ "web_fetch_requests": metadata.web_fetch_requests,
754
+ "cost_usd": metadata.cost_usd,
755
+ }
756
+ if extra_fields:
757
+ record.update(extra_fields)
758
+ return record
@@ -4,36 +4,41 @@ Reranking sits between candidate generation (`retrieve(method="hybrid", ...)`)
4
4
  and LLM consumption. Hybrid retrieval over a wide candidate pool (e.g. top-30)
5
5
  gives high recall; rerank then provides precision by scoring each
6
6
  ``(query, document)`` pair jointly under a model that's purpose-built for
7
- relevance ranking. This decouples the two trade-offs that bi-encoders /
8
- keyword retrieval can't resolve simultaneously.
9
-
10
- Two implementations are shipped:
11
-
12
- - :class:`CrossEncoderReranker` local BAAI ``bge-reranker-v2-m3`` (or any
13
- cross-encoder loadable via ``sentence-transformers``). Zero external API
14
- surface, deterministic, ~100-300ms latency on CPU at top-50. Default for
15
- Alpha Engine consumers per the no-new-vendor posture.
16
- - :class:`LLMJudgeReranker` Anthropic Haiku with a 1-5 relevance rubric.
17
- Higher latency + cost than cross-encoder; configurable opt-in for
18
- scenarios that need rerank criteria beyond pure semantic similarity
19
- ("rerank by recency-weighted relevance", "rerank by financial
20
- materiality").
21
-
22
- Both implementations share the :class:`Reranker` protocol and the in-process
23
- :class:`RerankCache` (LRU, keyed by ``sha256(query) + chunk_id``). Cache
24
- lifetime is the process / Lambda container — no cross-run persistence,
25
- because query embeddings drift with corpus updates and rerank scores are
26
- cheap-to-recompute relative to the LLM call they enable.
7
+ relevance ranking.
8
+
9
+ **One implementation shipped:** :class:`CrossEncoderReranker` — local
10
+ BAAI ``bge-reranker-v2-m3`` (or any cross-encoder loadable via
11
+ ``sentence-transformers``). Zero external API surface, deterministic,
12
+ ~100-300ms latency on CPU at top-50. The institutional/SOTA rerank
13
+ pattern for production RAG is domain-finetuned cross-encoders;
14
+ general-purpose CE models (like our bundled BAAI default) are tier-2
15
+ SOTA, dominant for general-domain RAG but expected to regress on
16
+ specialized corpora until finetuned on domain-labeled (query, doc,
17
+ relevance) pairs.
18
+
19
+ **``LLMJudgeReranker`` removed v0.34.0** (2026-05-25). The class
20
+ fired one Haiku call per (query, doc) pair — a tier-5 SOTA approach
21
+ useful for novel rubrics that lack training labels, not for general
22
+ relevance reranking. Empirical eval on the SEC-filings RAG corpus
23
+ (2026-05-12, EXPERIMENTS.md) measured -14.2% recall@10 vs the hybrid
24
+ w=0.7 baseline. Removed per ``[[preference_llm_calls_confined_to_research_module]]``
25
+ + the no-lift finding. Re-attempting LLM-judge rerank in the future
26
+ goes inside alpha-engine-research (where LLM calls belong); the
27
+ institutional rerank-revisit path is domain-finetune the CE model
28
+ on operator-labeled retrieval triples.
29
+
30
+ The :class:`RerankCache` (LRU, keyed by ``sha256(query) + chunk_id``)
31
+ is process-local — no cross-run persistence, because query embeddings
32
+ drift with corpus updates and rerank scores are cheap to recompute.
27
33
  """
28
34
 
29
35
  from __future__ import annotations
30
36
 
31
37
  import hashlib
32
38
  import logging
33
- import os
34
39
  from collections import OrderedDict
35
40
  from dataclasses import dataclass, field
36
- from typing import Callable, Protocol, runtime_checkable
41
+ from typing import Protocol, runtime_checkable
37
42
 
38
43
  from .retrieval import RetrievalResult
39
44
 
@@ -201,100 +206,6 @@ class CrossEncoderReranker:
201
206
  return _attach_and_sort(candidates, scores, self.name, top_k)
202
207
 
203
208
 
204
- # ── LLM-as-judge ────────────────────────────────────────────────────────────
205
-
206
-
207
- # Default rubric — kept terse to fit a Haiku context window comfortably
208
- # at top-50 candidates and to leave room for the candidate text itself.
209
- # Scores follow a 1-5 integer Likert that the model returns as plain
210
- # JSON for deterministic parsing.
211
- _DEFAULT_LLM_RUBRIC = (
212
- "Rate the relevance of the following document to the query on a "
213
- "1-5 scale where 1=irrelevant, 3=tangentially related, 5=directly "
214
- "answers the query. Respond with ONLY a single integer between 1 "
215
- "and 5."
216
- )
217
-
218
-
219
- @dataclass
220
- class LLMJudgeReranker:
221
- """LLM-as-judge reranker — one Haiku call per (query, doc) pair.
222
-
223
- More expensive + slower than the cross-encoder (one LLM round-trip
224
- per candidate vs. one batched local-model inference for the whole
225
- set) but more flexible: the rubric can encode criteria beyond
226
- semantic similarity ("rerank by recency-weighted financial
227
- materiality"). Configure via :attr:`rubric` at construction.
228
-
229
- Default ``rubric`` is a strict 1-5 Likert; output is parsed as
230
- ``int(response.strip()[0])`` to tolerate the occasional Haiku
231
- leading whitespace or trailing punctuation. Parses that fail
232
- produce a neutral score of 3 + a warning log; the caller's batch
233
- still completes.
234
-
235
- The Anthropic client is injected so consumers can plug in a
236
- pre-configured ``ChatAnthropic`` (langchain) or
237
- ``anthropic.Anthropic`` instance. The protocol surface is just
238
- ``client.messages.create(...)`` for the raw SDK shape.
239
- """
240
-
241
- client: object
242
- model: str = "claude-haiku-4-5-20251001"
243
- rubric: str = _DEFAULT_LLM_RUBRIC
244
- cache: RerankCache = field(default_factory=RerankCache)
245
- name: str = "llm_judge"
246
-
247
- def rerank(
248
- self,
249
- query: str,
250
- candidates: list[RetrievalResult],
251
- top_k: int,
252
- ) -> list[RetrievalResult]:
253
- if not candidates:
254
- return []
255
-
256
- scores: list[float | None] = [None] * len(candidates)
257
- for idx, cand in enumerate(candidates):
258
- key = self.cache.make_key(query, cand.chunk_id)
259
- cached = self.cache.get(key)
260
- if cached is not None:
261
- scores[idx] = cached
262
- continue
263
- score = self._score_one(query, cand.content)
264
- scores[idx] = score
265
- self.cache.put(key, score)
266
-
267
- return _attach_and_sort(candidates, scores, self.name, top_k)
268
-
269
- def _score_one(self, query: str, content: str) -> float:
270
- # Truncate the candidate text so a top-50 sweep at ~3K tokens per
271
- # candidate doesn't push the prompt past Haiku's window.
272
- snippet = content[:4000]
273
- prompt = (
274
- f"{self.rubric}\n\n"
275
- f"Query: {query}\n\n"
276
- f"Document:\n{snippet}\n\n"
277
- f"Score (1-5):"
278
- )
279
- try:
280
- response = self.client.messages.create( # type: ignore[attr-defined]
281
- model=self.model,
282
- max_tokens=8,
283
- messages=[{"role": "user", "content": prompt}],
284
- )
285
- # Anthropic SDK response shape: response.content is a list of
286
- # content blocks; the first text block holds the integer.
287
- text_block = response.content[0]
288
- raw = getattr(text_block, "text", str(text_block)).strip()
289
- return float(int(raw[0]))
290
- except (ValueError, IndexError, AttributeError) as exc:
291
- logger.warning(
292
- "LLMJudgeReranker parse-fail (returning neutral 3): %s — raw=%r",
293
- exc, locals().get("raw", "<no response>"),
294
- )
295
- return 3.0
296
-
297
-
298
209
  # ── Helpers ─────────────────────────────────────────────────────────────────
299
210
 
300
211
 
@@ -331,47 +242,25 @@ def _attach_and_sort(
331
242
  _RERANKER_REGISTRY: dict[str, Reranker] = {}
332
243
 
333
244
 
334
- # Factory hook used by :func:`get_reranker` for the ``"llm_judge"``
335
- # case — exposed at module scope so tests can patch it without
336
- # importing the anthropic SDK. Default constructs an Anthropic client
337
- # from the environment, matching the pattern used elsewhere in
338
- # alpha-engine-research.
339
- def _default_llm_judge_factory() -> Reranker:
340
- try:
341
- from anthropic import Anthropic # type: ignore[import-not-found]
342
- except ImportError as exc:
343
- raise ImportError(
344
- "LLMJudgeReranker requires the anthropic SDK. "
345
- "Install via: pip install anthropic"
346
- ) from exc
347
- api_key = os.environ.get("ANTHROPIC_API_KEY")
348
- if not api_key:
349
- raise RuntimeError(
350
- "LLMJudgeReranker needs ANTHROPIC_API_KEY in the environment."
351
- )
352
- return LLMJudgeReranker(client=Anthropic(api_key=api_key))
353
-
354
-
355
- _LLM_JUDGE_FACTORY: Callable[[], Reranker] = _default_llm_judge_factory
356
-
357
-
358
245
  def get_reranker(name: str) -> Reranker:
359
246
  """Resolve a named reranker, constructing + caching on first use.
360
247
 
361
- Supported names: ``"cross_encoder"`` (default — local BAAI),
362
- ``"llm_judge"`` (Anthropic Haiku via the ``anthropic`` SDK).
363
- Tests register fakes by writing directly to
364
- :data:`_RERANKER_REGISTRY` before the ``retrieve(rerank=...)`` call.
248
+ Supported names: ``"cross_encoder"`` (local BAAI bge-reranker-v2-m3
249
+ via sentence-transformers). Tests register fakes by writing
250
+ directly to :data:`_RERANKER_REGISTRY` before the
251
+ ``retrieve(rerank=...)`` call.
252
+
253
+ ``"llm_judge"`` was removed v0.34.0 — see module docstring for the
254
+ no-lift finding + the institutional rerank-revisit path
255
+ (domain-finetune the CE model, not LLM-judge).
365
256
  """
366
257
  if name in _RERANKER_REGISTRY:
367
258
  return _RERANKER_REGISTRY[name]
368
259
  if name == "cross_encoder":
369
260
  instance: Reranker = CrossEncoderReranker()
370
- elif name == "llm_judge":
371
- instance = _LLM_JUDGE_FACTORY()
372
261
  else:
373
262
  raise ValueError(
374
- f"Unknown reranker {name!r}; supported: 'cross_encoder', 'llm_judge'"
263
+ f"Unknown reranker {name!r}; supported: 'cross_encoder'"
375
264
  )
376
265
  _RERANKER_REGISTRY[name] = instance
377
266
  return instance
@@ -46,8 +46,8 @@ class RetrievalResult:
46
46
  vector_score: float | None = None # cosine similarity, [-1, 1]; None if not retrieved via vector
47
47
  keyword_score: float | None = None # ts_rank_cd, [0, ∞); None if not retrieved via keyword
48
48
  combined_score: float | None = None # blended score in hybrid mode; None for non-hybrid
49
- rerank_score: float | None = None # cross-encoder / LLM-judge score; None if rerank wasn't run
50
- rerank_method: str | None = None # "cross_encoder" / "llm_judge" / None — disambiguates which reranker stamped this
49
+ rerank_score: float | None = None # cross-encoder score; None if rerank wasn't run
50
+ rerank_method: str | None = None # "cross_encoder" / None — disambiguates which reranker stamped this
51
51
 
52
52
 
53
53
  def retrieve(
@@ -78,12 +78,11 @@ def retrieve(
78
78
  Ignored for non-hybrid methods.
79
79
  rerank: When set, run a reranker over the retrieved candidates
80
80
  before truncating to ``top_k``. Supported values:
81
- ``"cross_encoder"`` (local BAAI bge-reranker-v2-m3 — default
82
- choice when reranking, no API cost) or ``"llm_judge"``
83
- (Anthropic Haiku with a 1-5 relevance rubric opt-in,
84
- higher latency + cost). ``None`` (default) preserves the
85
- pre-rerank behavior back-compat path for callers not yet
86
- wired to reranking.
81
+ ``"cross_encoder"`` (local BAAI bge-reranker-v2-m3 — no
82
+ API cost). ``None`` (default) preserves the pre-rerank
83
+ behavior back-compat path for callers not yet wired to
84
+ reranking. ``"llm_judge"`` was removed v0.34.0 (see
85
+ ``rerank`` module docstring for the no-lift finding).
87
86
  rerank_input_n: When ``rerank`` is set, retrieve this many
88
87
  candidates from the underlying method before passing the
89
88
  pool to the reranker. Larger pools give the reranker more
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: alpha-engine-lib
3
- Version: 0.32.0
3
+ Version: 0.34.0
4
4
  Summary: Shared utilities for the Alpha Engine modules: preflight, structured logging with secret-redaction, ArcticDB universe access, NYSE-calendar dates + freshness predicates, decision capture, cost telemetry, RAG, agent output schemas, SSM-backed secrets, Telegram alerts + SNS fan-out, EC2 spot-launch resilience, SSM log-capture chokepoint, and Step-Functions execution-state projection. Full surface documented in README.
5
5
  Author: Brian McMahon
6
6
  License: Proprietary
@@ -25,6 +25,7 @@ from alpha_engine_lib.cost import (
25
25
  load_pricing,
26
26
  load_tool_fees,
27
27
  metadata_from_anthropic_message,
28
+ record_anthropic_call,
28
29
  recompute_cost,
29
30
  )
30
31
  from alpha_engine_lib.decision_capture import ModelMetadata
@@ -741,3 +742,118 @@ class TestRecomputeCostWithToolFees:
741
742
  # 1M Sonnet input @ $3/M + 10 web_search @ $10/1k = $3.10.
742
743
  assert cost == pytest.approx(3.10)
743
744
 
745
+
746
+ # ── record_anthropic_call (capture chokepoint, v0.33.0) ───────────────────
747
+
748
+
749
+ class TestRecordAnthropicCall:
750
+ """Lock down the lifted capture primitive that morning-signal,
751
+ alpha-engine-data, and alpha-engine (executor) all consume in their
752
+ raw-SDK call sites."""
753
+
754
+ def test_returns_priced_jsonl_ready_record(self):
755
+ msg = _FakeMessage(
756
+ model="claude-haiku-4-5",
757
+ usage=_FakeUsage(input_tokens=1000, output_tokens=200),
758
+ )
759
+ record = record_anthropic_call(msg)
760
+ # Token cost: (1000 * 1.0 + 200 * 5.0) / 1M = 0.002
761
+ assert record["cost_usd"] == pytest.approx(0.002, abs=1e-6)
762
+ assert record["model"] == "claude-haiku-4-5"
763
+ assert record["input_tokens"] == 1000
764
+ assert record["output_tokens"] == 200
765
+ assert record["cache_read_tokens"] == 0
766
+ assert record["cache_create_tokens"] == 0
767
+ assert record["web_search_requests"] == 0
768
+ assert record["web_fetch_requests"] == 0
769
+ # Timestamp is ISO-8601 round-trippable.
770
+ from datetime import datetime
771
+ datetime.fromisoformat(record["ts"])
772
+
773
+ def test_includes_tool_fee_pricing(self):
774
+ msg = _FakeMessage(
775
+ model="claude-haiku-4-5",
776
+ usage=_FakeUsage(
777
+ input_tokens=1000, output_tokens=200,
778
+ server_tool_use=_FakeServerToolUsage(web_search_requests=50),
779
+ ),
780
+ )
781
+ record = record_anthropic_call(msg)
782
+ # Tokens 0.002 + 50 × $10/1k = 0.5 → 0.502
783
+ assert record["cost_usd"] == pytest.approx(0.502, abs=1e-6)
784
+ assert record["web_search_requests"] == 50
785
+
786
+ def test_extra_fields_merged(self):
787
+ msg = _FakeMessage(
788
+ model="claude-haiku-4-5",
789
+ usage=_FakeUsage(input_tokens=10, output_tokens=5),
790
+ )
791
+ record = record_anthropic_call(msg, extra_fields={
792
+ "run_id": "2026-05-25",
793
+ "agent_id": "data:news_event_extraction",
794
+ "fingerprint": "abc123",
795
+ })
796
+ assert record["run_id"] == "2026-05-25"
797
+ assert record["agent_id"] == "data:news_event_extraction"
798
+ assert record["fingerprint"] == "abc123"
799
+ # Standard fields preserved alongside extras.
800
+ assert record["model"] == "claude-haiku-4-5"
801
+
802
+ def test_extra_fields_can_override_standard_fields(self):
803
+ """Caller-owned keys take precedence — the consumer is the
804
+ authority on what a record should look like in its sink."""
805
+ msg = _FakeMessage(
806
+ model="claude-haiku-4-5",
807
+ usage=_FakeUsage(input_tokens=10, output_tokens=5),
808
+ )
809
+ custom_ts = "2026-05-25T17:30:00+00:00"
810
+ record = record_anthropic_call(msg, extra_fields={"ts": custom_ts})
811
+ assert record["ts"] == custom_ts
812
+
813
+ def test_model_name_override_propagates(self):
814
+ msg = _FakeMessage(
815
+ model="claude-haiku-4-5-20251001",
816
+ usage=_FakeUsage(input_tokens=10, output_tokens=5),
817
+ )
818
+ record = record_anthropic_call(msg, model_name="claude-haiku-4-5")
819
+ assert record["model"] == "claude-haiku-4-5"
820
+
821
+ def test_uses_default_pricing_when_none_passed(self):
822
+ """Caller without operator-managed pricing gets packaged defaults."""
823
+ msg = _FakeMessage(
824
+ model="claude-sonnet-4-6",
825
+ usage=_FakeUsage(input_tokens=1_000_000, output_tokens=0),
826
+ )
827
+ record = record_anthropic_call(msg)
828
+ # 1M Sonnet input @ $3/M = $3.00 against packaged default rate card.
829
+ assert record["cost_usd"] == pytest.approx(3.0)
830
+
831
+ def test_explicit_pricing_table_used(self):
832
+ """Operator-managed pricing wins over defaults when passed."""
833
+ custom_table = PriceTable(cards=[PriceCard(
834
+ model_name="claude-sonnet-4-6",
835
+ effective_from=date(2026, 1, 1),
836
+ input_per_1m=99.0,
837
+ output_per_1m=99.0,
838
+ cache_read_per_1m=99.0,
839
+ cache_create_per_1m=99.0,
840
+ )])
841
+ msg = _FakeMessage(
842
+ model="claude-sonnet-4-6",
843
+ usage=_FakeUsage(input_tokens=1_000_000, output_tokens=0),
844
+ )
845
+ record = record_anthropic_call(msg, pricing=custom_table)
846
+ assert record["cost_usd"] == pytest.approx(99.0)
847
+
848
+ def test_at_kwarg_threads_to_recompute(self):
849
+ """Historical recompute path: caller passes capture timestamp."""
850
+ msg = _FakeMessage(
851
+ model="claude-haiku-4-5",
852
+ usage=_FakeUsage(input_tokens=1000, output_tokens=0),
853
+ )
854
+ record = record_anthropic_call(msg, at=date(2026, 5, 25))
855
+ # Whatever the at= date evaluates to, no PriceCardLookupError raised
856
+ # is the load-bearing assertion — we have a packaged-default card
857
+ # effective 2026-01-01.
858
+ assert record["cost_usd"] > 0
859
+
@@ -1,4 +1,4 @@
1
- """Tests for the RAG rerank primitive (alpha-engine-lib v0.11.0).
1
+ """Tests for the RAG rerank primitive (alpha-engine-lib v0.11.0+).
2
2
 
3
3
  Covers:
4
4
 
@@ -7,12 +7,15 @@ Covers:
7
7
  circuits repeat scoring; passthrough when candidates empty. Real
8
8
  BAAI model load is mocked via the ``_model`` slot so tests don't
9
9
  download 600MB of weights.
10
- 3. ``LLMJudgeReranker`` — parses Haiku output; falls back to neutral
11
- score on parse failure; cache short-circuits repeats.
12
- 4. ``retrieve(rerank=...)`` — fetches ``rerank_input_n`` from the
10
+ 3. ``retrieve(rerank=...)`` — fetches ``rerank_input_n`` from the
13
11
  underlying method, passes through to the reranker, truncates to
14
12
  ``top_k``; rerank=None preserves legacy behavior; invalid
15
13
  ``rerank_input_n < top_k`` raises.
14
+
15
+ ``LLMJudgeReranker`` (formerly tested here) was removed v0.34.0. See
16
+ the ``rerank`` module docstring for the no-lift finding +
17
+ institutional rerank-revisit path (domain-finetune CE on retrieval
18
+ triples, not LLM-judge).
16
19
  """
17
20
 
18
21
  from __future__ import annotations
@@ -24,7 +27,6 @@ import pytest
24
27
 
25
28
  from alpha_engine_lib.rag.rerank import (
26
29
  CrossEncoderReranker,
27
- LLMJudgeReranker,
28
30
  RerankCache,
29
31
  _RERANKER_REGISTRY,
30
32
  get_reranker,
@@ -178,82 +180,6 @@ class TestCrossEncoderReranker:
178
180
  reranker._ensure_model()
179
181
 
180
182
 
181
- # ── LLMJudgeReranker ────────────────────────────────────────────────────────
182
-
183
-
184
- def _mock_anthropic_client(score_by_content: dict[str, int]) -> MagicMock:
185
- """Return a MagicMock anthropic client that scores by content lookup."""
186
- client = MagicMock()
187
-
188
- def _create(*, model: str, max_tokens: int, messages: list[dict]) -> object:
189
- prompt = messages[0]["content"]
190
- # The prompt embeds the document content after "Document:\n".
191
- doc_start = prompt.index("Document:\n") + len("Document:\n")
192
- doc_end = prompt.index("\n\nScore")
193
- content = prompt[doc_start:doc_end]
194
- score = score_by_content.get(content, 3)
195
- block = MagicMock()
196
- block.text = str(score)
197
- response = MagicMock()
198
- response.content = [block]
199
- return response
200
-
201
- client.messages.create.side_effect = _create
202
- return client
203
-
204
-
205
- class TestLLMJudgeReranker:
206
- def test_parses_haiku_integer_response(self) -> None:
207
- client = _mock_anthropic_client({"low": 1, "mid": 3, "high": 5})
208
- reranker = LLMJudgeReranker(client=client)
209
- candidates = [
210
- _make_result("low", "c1"),
211
- _make_result("mid", "c2"),
212
- _make_result("high", "c3"),
213
- ]
214
- out = reranker.rerank("query", candidates, top_k=3)
215
- assert [r.content for r in out] == ["high", "mid", "low"]
216
- assert out[0].rerank_score == pytest.approx(5.0)
217
- assert out[0].rerank_method == "llm_judge"
218
-
219
- def test_cache_hit_skips_llm_call(self) -> None:
220
- client = _mock_anthropic_client({"x": 4, "y": 2})
221
- reranker = LLMJudgeReranker(client=client)
222
- candidates = [_make_result("x", "cx"), _make_result("y", "cy")]
223
- reranker.rerank("query", candidates, top_k=2)
224
- first = client.messages.create.call_count
225
- reranker.rerank("query", candidates, top_k=2)
226
- assert client.messages.create.call_count == first
227
-
228
- def test_parse_failure_returns_neutral_three(self) -> None:
229
- # Mock client returns malformed output for "bad", normal for "good".
230
- client = MagicMock()
231
-
232
- def _create(*, model, max_tokens, messages):
233
- prompt = messages[0]["content"]
234
- block = MagicMock()
235
- if "bad" in prompt:
236
- block.text = "garbage" # int(garbage[0]) → ValueError
237
- else:
238
- block.text = "5"
239
- response = MagicMock()
240
- response.content = [block]
241
- return response
242
-
243
- client.messages.create.side_effect = _create
244
- reranker = LLMJudgeReranker(client=client)
245
- out = reranker.rerank(
246
- "query",
247
- [_make_result("bad", "c1"), _make_result("good", "c2")],
248
- top_k=2,
249
- )
250
- # "good" wins with 5.0; "bad" falls back to neutral 3.0.
251
- assert out[0].content == "good"
252
- assert out[0].rerank_score == pytest.approx(5.0)
253
- assert out[1].content == "bad"
254
- assert out[1].rerank_score == pytest.approx(3.0)
255
-
256
-
257
183
  # ── retrieve(rerank=...) integration ────────────────────────────────────────
258
184
 
259
185