PyPI - alpha-engine-lib - Versions diffs - 0.32.0__tar.gz → 0.34.0__tar.gz - Mend

alpha-engine-lib 0.32.0 (tar.gz) → 0.34.0 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (73)

{alpha_engine_lib-0.32.0 → alpha_engine_lib-0.34.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: alpha-engine-lib
-Version: 0.32.0
+Version: 0.34.0
 Summary: Shared utilities for the Alpha Engine modules: preflight, structured logging with secret-redaction, ArcticDB universe access, NYSE-calendar dates + freshness predicates, decision capture, cost telemetry, RAG, agent output schemas, SSM-backed secrets, Telegram alerts + SNS fan-out, EC2 spot-launch resilience, SSM log-capture chokepoint, and Step-Functions execution-state projection. Full surface documented in README.
 Author: Brian McMahon
 License: Proprietary

{alpha_engine_lib-0.32.0 → alpha_engine_lib-0.34.0}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "alpha-engine-lib"
-version = "0.32.0"
+version = "0.34.0"
 description = "Shared utilities for the Alpha Engine modules: preflight, structured logging with secret-redaction, ArcticDB universe access, NYSE-calendar dates + freshness predicates, decision capture, cost telemetry, RAG, agent output schemas, SSM-backed secrets, Telegram alerts + SNS fan-out, EC2 spot-launch resilience, SSM log-capture chokepoint, and Step-Functions execution-state projection. Full surface documented in README."
 readme = "README.md"
 # EC2 still runs Python 3.9 on the always-on micro instance (boto3 drops

{alpha_engine_lib-0.32.0 → alpha_engine_lib-0.34.0}/src/alpha_engine_lib/__init__.py RENAMED Viewed

@@ -1,3 +1,3 @@
 """alpha-engine-lib — shared utilities for Alpha Engine modules."""
-__version__ = "0.32.0"
+__version__ = "0.34.0"

{alpha_engine_lib-0.32.0 → alpha_engine_lib-0.34.0}/src/alpha_engine_lib/cost.py RENAMED Viewed

@@ -663,3 +663,96 @@ def metadata_from_anthropic_message(
         web_fetch_requests=(getattr(stu, "web_fetch_requests", 0) or 0)
             if stu is not None else 0,
     )
+# ── Capture chokepoint (v0.33.0) ──────────────────────────────────────────
+def record_anthropic_call(
+    msg: _AnthropicMessageLike,
+    *,
+    model_name: str | None = None,
+    pricing: PriceTable | None = None,
+    tool_fees: ToolFeeTable | None = None,
+    at: datetime | date | None = None,
+    extra_fields: dict[str, Any] | None = None,
+) -> dict[str, Any]:
+    """Map an Anthropic SDK ``Message`` → priced JSONL-ready cost record.
+    Single chokepoint for raw-SDK consumers (morning-signal, alpha-engine
+    /executor, alpha-engine-data, et al.). Returns a flat dict ready for
+    ``json.dumps``; the caller chooses the sink (local file / S3 /
+    CloudWatch). No I/O performed here — pure mapper.
+    Per ``[[feedback_lift_invariants_to_chokepoint_after_second_recurrence]]``
+    — extracted from morning-signal v0.32.0's ``cost_telemetry.record_call_cost``
+    after data + executor became the 2nd + 3rd consumers needing the same
+    shape. Composes with :func:`metadata_from_anthropic_message` (token-count
+    extraction) + :func:`recompute_cost` (USD pricing) into the single call
+    a typical consumer wants.
+    Parameters
+    ----------
+    msg
+        Anthropic SDK ``Message`` (or anything matching
+        :class:`_AnthropicMessageLike`). Forwarded to
+        :func:`metadata_from_anthropic_message`.
+    model_name
+        Override for ``ModelMetadata.model_name``. Defaults to ``msg.model``.
+    pricing
+        :class:`PriceTable` for USD recompute. Defaults to
+        :func:`load_default_pricing` when ``None`` (packaged Anthropic rate
+        card). Pass an explicit table for operator-managed pricing.
+    tool_fees
+        :class:`ToolFeeTable` for server-tool fee recompute. Defaults to
+        :func:`load_default_tool_fees`. Pass an explicit table for
+        operator-managed fees.
+    at
+        Wall-clock date for price-card / tool-fee lookup. Defaults to
+        ``datetime.now(timezone.utc)``. Pass the original capture
+        timestamp for historical recompute.
+    extra_fields
+        Optional dict merged into the returned record AFTER the standard
+        fields. Consumers attach run-context (``run_id``, ``agent_id``,
+        ``sector_team_id``, ``edition``, ``date``, ...) here so the
+        JSONL row is self-describing without out-of-band metadata.
+    Returns
+    -------
+    dict
+        Flat dict with: ``ts`` (ISO-8601 UTC capture time), ``model``,
+        ``input_tokens``, ``output_tokens``, ``cache_read_tokens``,
+        ``cache_create_tokens``, ``web_search_requests``,
+        ``web_fetch_requests``, ``cost_usd`` (priced via
+        ``recompute_cost``), plus any ``extra_fields`` merged in.
+        Caller-owned field names take precedence over the standard set
+        when keys collide.
+    Raises
+    ------
+    PriceCardLookupError
+        Propagated from :func:`recompute_cost` if no price card matches
+        ``model_name`` at ``at``, or if the message records non-zero
+        server-tool requests with no matching :class:`ToolFee` in the
+        active table. Per ``[[feedback_no_silent_fails]]`` — a missing
+        card on a real call is a load-bearing error worth surfacing.
+    """
+    metadata = metadata_from_anthropic_message(msg, model_name=model_name)
+    table = pricing if pricing is not None else load_default_pricing()
+    fees = tool_fees if tool_fees is not None else load_default_tool_fees()
+    recompute_cost(metadata, table, tool_fee_table=fees, at=at)
+    record: dict[str, Any] = {
+        "ts": datetime.now(timezone.utc).isoformat(),
+        "model": metadata.model_name,
+        "input_tokens": metadata.input_tokens,
+        "output_tokens": metadata.output_tokens,
+        "cache_read_tokens": metadata.cache_read_tokens,
+        "cache_create_tokens": metadata.cache_create_tokens,
+        "web_search_requests": metadata.web_search_requests,
+        "web_fetch_requests": metadata.web_fetch_requests,
+        "cost_usd": metadata.cost_usd,
+    }
+    if extra_fields:
+        record.update(extra_fields)
+    return record

{alpha_engine_lib-0.32.0 → alpha_engine_lib-0.34.0}/src/alpha_engine_lib/rag/rerank.py RENAMED Viewed

@@ -4,36 +4,41 @@ Reranking sits between candidate generation (`retrieve(method="hybrid", ...)`)
 and LLM consumption. Hybrid retrieval over a wide candidate pool (e.g. top-30)
 gives high recall; rerank then provides precision by scoring each
 ``(query, document)`` pair jointly under a model that's purpose-built for
-relevance ranking. This decouples the two trade-offs that bi-encoders /
-keyword retrieval can't resolve simultaneously.
-Two implementations are shipped:
-- :class:`CrossEncoderReranker` — local BAAI ``bge-reranker-v2-m3`` (or any
-  cross-encoder loadable via ``sentence-transformers``). Zero external API
-  surface, deterministic, ~100-300ms latency on CPU at top-50. Default for
-  Alpha Engine consumers per the no-new-vendor posture.
-- :class:`LLMJudgeReranker` — Anthropic Haiku with a 1-5 relevance rubric.
-  Higher latency + cost than cross-encoder; configurable opt-in for
-  scenarios that need rerank criteria beyond pure semantic similarity
-  ("rerank by recency-weighted relevance", "rerank by financial
-  materiality").
-Both implementations share the :class:`Reranker` protocol and the in-process
-:class:`RerankCache` (LRU, keyed by ``sha256(query) + chunk_id``). Cache
-lifetime is the process / Lambda container — no cross-run persistence,
-because query embeddings drift with corpus updates and rerank scores are
-cheap-to-recompute relative to the LLM call they enable.
+relevance ranking.
+**One implementation shipped:** :class:`CrossEncoderReranker` — local
+BAAI ``bge-reranker-v2-m3`` (or any cross-encoder loadable via
+``sentence-transformers``). Zero external API surface, deterministic,
+~100-300ms latency on CPU at top-50. The institutional/SOTA rerank
+pattern for production RAG is domain-finetuned cross-encoders;
+general-purpose CE models (like our bundled BAAI default) are tier-2
+SOTA, dominant for general-domain RAG but expected to regress on
+specialized corpora until finetuned on domain-labeled (query, doc,
+relevance) pairs.
+**``LLMJudgeReranker`` removed v0.34.0** (2026-05-25). The class
+fired one Haiku call per (query, doc) pair — a tier-5 SOTA approach
+useful for novel rubrics that lack training labels, not for general
+relevance reranking. Empirical eval on the SEC-filings RAG corpus
+(2026-05-12, EXPERIMENTS.md) measured -14.2% recall@10 vs the hybrid
+w=0.7 baseline. Removed per ``[[preference_llm_calls_confined_to_research_module]]``
++ the no-lift finding. Re-attempting LLM-judge rerank in the future
+goes inside alpha-engine-research (where LLM calls belong); the
+institutional rerank-revisit path is domain-finetune the CE model
+on operator-labeled retrieval triples.
+The :class:`RerankCache` (LRU, keyed by ``sha256(query) + chunk_id``)
+is process-local — no cross-run persistence, because query embeddings
+drift with corpus updates and rerank scores are cheap to recompute.
 """
 from __future__ import annotations
 import hashlib
 import logging
-import os
 from collections import OrderedDict
 from dataclasses import dataclass, field
-from typing import Callable, Protocol, runtime_checkable
+from typing import Protocol, runtime_checkable
 from .retrieval import RetrievalResult
@@ -201,100 +206,6 @@ class CrossEncoderReranker:
         return _attach_and_sort(candidates, scores, self.name, top_k)
-# ── LLM-as-judge ────────────────────────────────────────────────────────────
-# Default rubric — kept terse to fit a Haiku context window comfortably
-# at top-50 candidates and to leave room for the candidate text itself.
-# Scores follow a 1-5 integer Likert that the model returns as plain
-# JSON for deterministic parsing.
-_DEFAULT_LLM_RUBRIC = (
-    "Rate the relevance of the following document to the query on a "
-    "1-5 scale where 1=irrelevant, 3=tangentially related, 5=directly "
-    "answers the query. Respond with ONLY a single integer between 1 "
-    "and 5."
-)
-@dataclass
-class LLMJudgeReranker:
-    """LLM-as-judge reranker — one Haiku call per (query, doc) pair.
-    More expensive + slower than the cross-encoder (one LLM round-trip
-    per candidate vs. one batched local-model inference for the whole
-    set) but more flexible: the rubric can encode criteria beyond
-    semantic similarity ("rerank by recency-weighted financial
-    materiality"). Configure via :attr:`rubric` at construction.
-    Default ``rubric`` is a strict 1-5 Likert; output is parsed as
-    ``int(response.strip()[0])`` to tolerate the occasional Haiku
-    leading whitespace or trailing punctuation. Parses that fail
-    produce a neutral score of 3 + a warning log; the caller's batch
-    still completes.
-    The Anthropic client is injected so consumers can plug in a
-    pre-configured ``ChatAnthropic`` (langchain) or
-    ``anthropic.Anthropic`` instance. The protocol surface is just
-    ``client.messages.create(...)`` for the raw SDK shape.
-    """
-    client: object
-    model: str = "claude-haiku-4-5-20251001"
-    rubric: str = _DEFAULT_LLM_RUBRIC
-    cache: RerankCache = field(default_factory=RerankCache)
-    name: str = "llm_judge"
-    def rerank(
-        self,
-        query: str,
-        candidates: list[RetrievalResult],
-        top_k: int,
-    ) -> list[RetrievalResult]:
-        if not candidates:
-            return []
-        scores: list[float | None] = [None] * len(candidates)
-        for idx, cand in enumerate(candidates):
-            key = self.cache.make_key(query, cand.chunk_id)
-            cached = self.cache.get(key)
-            if cached is not None:
-                scores[idx] = cached
-                continue
-            score = self._score_one(query, cand.content)
-            scores[idx] = score
-            self.cache.put(key, score)
-        return _attach_and_sort(candidates, scores, self.name, top_k)
-    def _score_one(self, query: str, content: str) -> float:
-        # Truncate the candidate text so a top-50 sweep at ~3K tokens per
-        # candidate doesn't push the prompt past Haiku's window.
-        snippet = content[:4000]
-        prompt = (
-            f"{self.rubric}\n\n"
-            f"Query: {query}\n\n"
-            f"Document:\n{snippet}\n\n"
-            f"Score (1-5):"
-        )
-        try:
-            response = self.client.messages.create(  # type: ignore[attr-defined]
-                model=self.model,
-                max_tokens=8,
-                messages=[{"role": "user", "content": prompt}],
-            )
-            # Anthropic SDK response shape: response.content is a list of
-            # content blocks; the first text block holds the integer.
-            text_block = response.content[0]
-            raw = getattr(text_block, "text", str(text_block)).strip()
-            return float(int(raw[0]))
-        except (ValueError, IndexError, AttributeError) as exc:
-            logger.warning(
-                "LLMJudgeReranker parse-fail (returning neutral 3): %s — raw=%r",
-                exc, locals().get("raw", "<no response>"),
-            )
-            return 3.0
 # ── Helpers ─────────────────────────────────────────────────────────────────
@@ -331,47 +242,25 @@ def _attach_and_sort(
 _RERANKER_REGISTRY: dict[str, Reranker] = {}
-# Factory hook used by :func:`get_reranker` for the ``"llm_judge"``
-# case — exposed at module scope so tests can patch it without
-# importing the anthropic SDK. Default constructs an Anthropic client
-# from the environment, matching the pattern used elsewhere in
-# alpha-engine-research.
-def _default_llm_judge_factory() -> Reranker:
-    try:
-        from anthropic import Anthropic  # type: ignore[import-not-found]
-    except ImportError as exc:
-        raise ImportError(
-            "LLMJudgeReranker requires the anthropic SDK. "
-            "Install via: pip install anthropic"
-        ) from exc
-    api_key = os.environ.get("ANTHROPIC_API_KEY")
-    if not api_key:
-        raise RuntimeError(
-            "LLMJudgeReranker needs ANTHROPIC_API_KEY in the environment."
-        )
-    return LLMJudgeReranker(client=Anthropic(api_key=api_key))
-_LLM_JUDGE_FACTORY: Callable[[], Reranker] = _default_llm_judge_factory
 def get_reranker(name: str) -> Reranker:
     """Resolve a named reranker, constructing + caching on first use.
-    Supported names: ``"cross_encoder"`` (default — local BAAI),
-    ``"llm_judge"`` (Anthropic Haiku via the ``anthropic`` SDK).
-    Tests register fakes by writing directly to
-    :data:`_RERANKER_REGISTRY` before the ``retrieve(rerank=...)`` call.
+    Supported names: ``"cross_encoder"`` (local BAAI bge-reranker-v2-m3
+    via sentence-transformers). Tests register fakes by writing
+    directly to :data:`_RERANKER_REGISTRY` before the
+    ``retrieve(rerank=...)`` call.
+    ``"llm_judge"`` was removed v0.34.0 — see module docstring for the
+    no-lift finding + the institutional rerank-revisit path
+    (domain-finetune the CE model, not LLM-judge).
     """
     if name in _RERANKER_REGISTRY:
         return _RERANKER_REGISTRY[name]
     if name == "cross_encoder":
         instance: Reranker = CrossEncoderReranker()
-    elif name == "llm_judge":
-        instance = _LLM_JUDGE_FACTORY()
     else:
         raise ValueError(
-            f"Unknown reranker {name!r}; supported: 'cross_encoder', 'llm_judge'"
+            f"Unknown reranker {name!r}; supported: 'cross_encoder'"
         )
     _RERANKER_REGISTRY[name] = instance
     return instance

{alpha_engine_lib-0.32.0 → alpha_engine_lib-0.34.0}/src/alpha_engine_lib/rag/retrieval.py RENAMED Viewed

@@ -46,8 +46,8 @@ class RetrievalResult:
     vector_score: float | None = None    # cosine similarity, [-1, 1]; None if not retrieved via vector
     keyword_score: float | None = None   # ts_rank_cd, [0, ∞); None if not retrieved via keyword
     combined_score: float | None = None  # blended score in hybrid mode; None for non-hybrid
-    rerank_score: float | None = None    # cross-encoder / LLM-judge score; None if rerank wasn't run
-    rerank_method: str | None = None     # "cross_encoder" / "llm_judge" / None — disambiguates which reranker stamped this
+    rerank_score: float | None = None    # cross-encoder score; None if rerank wasn't run
+    rerank_method: str | None = None     # "cross_encoder" / None — disambiguates which reranker stamped this
 def retrieve(
@@ -78,12 +78,11 @@ def retrieve(
             Ignored for non-hybrid methods.
         rerank: When set, run a reranker over the retrieved candidates
             before truncating to ``top_k``. Supported values:
-            ``"cross_encoder"`` (local BAAI bge-reranker-v2-m3 — default
-            choice when reranking, no API cost) or ``"llm_judge"``
-            (Anthropic Haiku with a 1-5 relevance rubric — opt-in,
-            higher latency + cost). ``None`` (default) preserves the
-            pre-rerank behavior — back-compat path for callers not yet
-            wired to reranking.
+            ``"cross_encoder"`` (local BAAI bge-reranker-v2-m3 — no
+            API cost). ``None`` (default) preserves the pre-rerank
+            behavior — back-compat path for callers not yet wired to
+            reranking. ``"llm_judge"`` was removed v0.34.0 (see
+            ``rerank`` module docstring for the no-lift finding).
         rerank_input_n: When ``rerank`` is set, retrieve this many
             candidates from the underlying method before passing the
             pool to the reranker. Larger pools give the reranker more

{alpha_engine_lib-0.32.0 → alpha_engine_lib-0.34.0}/src/alpha_engine_lib.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: alpha-engine-lib
-Version: 0.32.0
+Version: 0.34.0
 Summary: Shared utilities for the Alpha Engine modules: preflight, structured logging with secret-redaction, ArcticDB universe access, NYSE-calendar dates + freshness predicates, decision capture, cost telemetry, RAG, agent output schemas, SSM-backed secrets, Telegram alerts + SNS fan-out, EC2 spot-launch resilience, SSM log-capture chokepoint, and Step-Functions execution-state projection. Full surface documented in README.
 Author: Brian McMahon
 License: Proprietary

{alpha_engine_lib-0.32.0 → alpha_engine_lib-0.34.0}/tests/test_cost.py RENAMED Viewed

@@ -25,6 +25,7 @@ from alpha_engine_lib.cost import (
     load_pricing,
     load_tool_fees,
     metadata_from_anthropic_message,
+    record_anthropic_call,
     recompute_cost,
 )
 from alpha_engine_lib.decision_capture import ModelMetadata
@@ -741,3 +742,118 @@ class TestRecomputeCostWithToolFees:
         # 1M Sonnet input @ $3/M + 10 web_search @ $10/1k = $3.10.
         assert cost == pytest.approx(3.10)
+# ── record_anthropic_call (capture chokepoint, v0.33.0) ───────────────────
+class TestRecordAnthropicCall:
+    """Lock down the lifted capture primitive that morning-signal,
+    alpha-engine-data, and alpha-engine (executor) all consume in their
+    raw-SDK call sites."""
+    def test_returns_priced_jsonl_ready_record(self):
+        msg = _FakeMessage(
+            model="claude-haiku-4-5",
+            usage=_FakeUsage(input_tokens=1000, output_tokens=200),
+        )
+        record = record_anthropic_call(msg)
+        # Token cost: (1000 * 1.0 + 200 * 5.0) / 1M = 0.002
+        assert record["cost_usd"] == pytest.approx(0.002, abs=1e-6)
+        assert record["model"] == "claude-haiku-4-5"
+        assert record["input_tokens"] == 1000
+        assert record["output_tokens"] == 200
+        assert record["cache_read_tokens"] == 0
+        assert record["cache_create_tokens"] == 0
+        assert record["web_search_requests"] == 0
+        assert record["web_fetch_requests"] == 0
+        # Timestamp is ISO-8601 round-trippable.
+        from datetime import datetime
+        datetime.fromisoformat(record["ts"])
+    def test_includes_tool_fee_pricing(self):
+        msg = _FakeMessage(
+            model="claude-haiku-4-5",
+            usage=_FakeUsage(
+                input_tokens=1000, output_tokens=200,
+                server_tool_use=_FakeServerToolUsage(web_search_requests=50),
+            ),
+        )
+        record = record_anthropic_call(msg)
+        # Tokens 0.002 + 50 × $10/1k = 0.5 → 0.502
+        assert record["cost_usd"] == pytest.approx(0.502, abs=1e-6)
+        assert record["web_search_requests"] == 50
+    def test_extra_fields_merged(self):
+        msg = _FakeMessage(
+            model="claude-haiku-4-5",
+            usage=_FakeUsage(input_tokens=10, output_tokens=5),
+        )
+        record = record_anthropic_call(msg, extra_fields={
+            "run_id": "2026-05-25",
+            "agent_id": "data:news_event_extraction",
+            "fingerprint": "abc123",
+        })
+        assert record["run_id"] == "2026-05-25"
+        assert record["agent_id"] == "data:news_event_extraction"
+        assert record["fingerprint"] == "abc123"
+        # Standard fields preserved alongside extras.
+        assert record["model"] == "claude-haiku-4-5"
+    def test_extra_fields_can_override_standard_fields(self):
+        """Caller-owned keys take precedence — the consumer is the
+        authority on what a record should look like in its sink."""
+        msg = _FakeMessage(
+            model="claude-haiku-4-5",
+            usage=_FakeUsage(input_tokens=10, output_tokens=5),
+        )
+        custom_ts = "2026-05-25T17:30:00+00:00"
+        record = record_anthropic_call(msg, extra_fields={"ts": custom_ts})
+        assert record["ts"] == custom_ts
+    def test_model_name_override_propagates(self):
+        msg = _FakeMessage(
+            model="claude-haiku-4-5-20251001",
+            usage=_FakeUsage(input_tokens=10, output_tokens=5),
+        )
+        record = record_anthropic_call(msg, model_name="claude-haiku-4-5")
+        assert record["model"] == "claude-haiku-4-5"
+    def test_uses_default_pricing_when_none_passed(self):
+        """Caller without operator-managed pricing gets packaged defaults."""
+        msg = _FakeMessage(
+            model="claude-sonnet-4-6",
+            usage=_FakeUsage(input_tokens=1_000_000, output_tokens=0),
+        )
+        record = record_anthropic_call(msg)
+        # 1M Sonnet input @ $3/M = $3.00 against packaged default rate card.
+        assert record["cost_usd"] == pytest.approx(3.0)
+    def test_explicit_pricing_table_used(self):
+        """Operator-managed pricing wins over defaults when passed."""
+        custom_table = PriceTable(cards=[PriceCard(
+            model_name="claude-sonnet-4-6",
+            effective_from=date(2026, 1, 1),
+            input_per_1m=99.0,
+            output_per_1m=99.0,
+            cache_read_per_1m=99.0,
+            cache_create_per_1m=99.0,
+        )])
+        msg = _FakeMessage(
+            model="claude-sonnet-4-6",
+            usage=_FakeUsage(input_tokens=1_000_000, output_tokens=0),
+        )
+        record = record_anthropic_call(msg, pricing=custom_table)
+        assert record["cost_usd"] == pytest.approx(99.0)
+    def test_at_kwarg_threads_to_recompute(self):
+        """Historical recompute path: caller passes capture timestamp."""
+        msg = _FakeMessage(
+            model="claude-haiku-4-5",
+            usage=_FakeUsage(input_tokens=1000, output_tokens=0),
+        )
+        record = record_anthropic_call(msg, at=date(2026, 5, 25))
+        # Whatever the at= date evaluates to, no PriceCardLookupError raised
+        # is the load-bearing assertion — we have a packaged-default card
+        # effective 2026-01-01.
+        assert record["cost_usd"] > 0

{alpha_engine_lib-0.32.0 → alpha_engine_lib-0.34.0}/tests/test_rag_rerank.py RENAMED Viewed

@@ -1,4 +1,4 @@
-"""Tests for the RAG rerank primitive (alpha-engine-lib v0.11.0).
+"""Tests for the RAG rerank primitive (alpha-engine-lib v0.11.0+).
 Covers:
@@ -7,12 +7,15 @@ Covers:
    circuits repeat scoring; passthrough when candidates empty. Real
    BAAI model load is mocked via the ``_model`` slot so tests don't
    download 600MB of weights.
-3. ``LLMJudgeReranker`` — parses Haiku output; falls back to neutral
-   score on parse failure; cache short-circuits repeats.
-4. ``retrieve(rerank=...)`` — fetches ``rerank_input_n`` from the
+3. ``retrieve(rerank=...)`` — fetches ``rerank_input_n`` from the
    underlying method, passes through to the reranker, truncates to
    ``top_k``; rerank=None preserves legacy behavior; invalid
    ``rerank_input_n < top_k`` raises.
+``LLMJudgeReranker`` (formerly tested here) was removed v0.34.0. See
+the ``rerank`` module docstring for the no-lift finding +
+institutional rerank-revisit path (domain-finetune CE on retrieval
+triples, not LLM-judge).
 """
 from __future__ import annotations
@@ -24,7 +27,6 @@ import pytest
 from alpha_engine_lib.rag.rerank import (
     CrossEncoderReranker,
-    LLMJudgeReranker,
     RerankCache,
     _RERANKER_REGISTRY,
     get_reranker,
@@ -178,82 +180,6 @@ class TestCrossEncoderReranker:
                 reranker._ensure_model()
-# ── LLMJudgeReranker ────────────────────────────────────────────────────────
-def _mock_anthropic_client(score_by_content: dict[str, int]) -> MagicMock:
-    """Return a MagicMock anthropic client that scores by content lookup."""
-    client = MagicMock()
-    def _create(*, model: str, max_tokens: int, messages: list[dict]) -> object:
-        prompt = messages[0]["content"]
-        # The prompt embeds the document content after "Document:\n".
-        doc_start = prompt.index("Document:\n") + len("Document:\n")
-        doc_end = prompt.index("\n\nScore")
-        content = prompt[doc_start:doc_end]
-        score = score_by_content.get(content, 3)
-        block = MagicMock()
-        block.text = str(score)
-        response = MagicMock()
-        response.content = [block]
-        return response
-    client.messages.create.side_effect = _create
-    return client
-class TestLLMJudgeReranker:
-    def test_parses_haiku_integer_response(self) -> None:
-        client = _mock_anthropic_client({"low": 1, "mid": 3, "high": 5})
-        reranker = LLMJudgeReranker(client=client)
-        candidates = [
-            _make_result("low", "c1"),
-            _make_result("mid", "c2"),
-            _make_result("high", "c3"),
-        ]
-        out = reranker.rerank("query", candidates, top_k=3)
-        assert [r.content for r in out] == ["high", "mid", "low"]
-        assert out[0].rerank_score == pytest.approx(5.0)
-        assert out[0].rerank_method == "llm_judge"
-    def test_cache_hit_skips_llm_call(self) -> None:
-        client = _mock_anthropic_client({"x": 4, "y": 2})
-        reranker = LLMJudgeReranker(client=client)
-        candidates = [_make_result("x", "cx"), _make_result("y", "cy")]
-        reranker.rerank("query", candidates, top_k=2)
-        first = client.messages.create.call_count
-        reranker.rerank("query", candidates, top_k=2)
-        assert client.messages.create.call_count == first
-    def test_parse_failure_returns_neutral_three(self) -> None:
-        # Mock client returns malformed output for "bad", normal for "good".
-        client = MagicMock()
-        def _create(*, model, max_tokens, messages):
-            prompt = messages[0]["content"]
-            block = MagicMock()
-            if "bad" in prompt:
-                block.text = "garbage"  # int(garbage[0]) → ValueError
-            else:
-                block.text = "5"
-            response = MagicMock()
-            response.content = [block]
-            return response
-        client.messages.create.side_effect = _create
-        reranker = LLMJudgeReranker(client=client)
-        out = reranker.rerank(
-            "query",
-            [_make_result("bad", "c1"), _make_result("good", "c2")],
-            top_k=2,
-        )
-        # "good" wins with 5.0; "bad" falls back to neutral 3.0.
-        assert out[0].content == "good"
-        assert out[0].rerank_score == pytest.approx(5.0)
-        assert out[1].content == "bad"
-        assert out[1].rerank_score == pytest.approx(3.0)
 # ── retrieve(rerank=...) integration ────────────────────────────────────────