npm - @pentatonic-ai/ai-agent-sdk - Versions diffs - 0.10.5 → 0.10.6 - Mend

@pentatonic-ai/ai-agent-sdk 0.10.5 → 0.10.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18) hide show

package/dist/index.cjs CHANGED Viewed

@@ -878,7 +878,7 @@ function fireAndForgetEmit(clientConfig, sessionOpts, messages, result, model) {
 }
 // src/telemetry.js
-var VERSION = "0.10.5";
+var VERSION = "0.10.6";
 var TELEMETRY_URL = "https://sdk-telemetry.philip-134.workers.dev";
 function machineId() {
   const raw = typeof process !== "undefined" ? `${process.env?.USER || process.env?.USERNAME || "u"}:${process.platform || "x"}:${process.arch || "x"}` : "browser";

package/dist/index.js CHANGED Viewed

@@ -847,7 +847,7 @@ function fireAndForgetEmit(clientConfig, sessionOpts, messages, result, model) {
 }
 // src/telemetry.js
-var VERSION = "0.10.5";
+var VERSION = "0.10.6";
 var TELEMETRY_URL = "https://sdk-telemetry.philip-134.workers.dev";
 function machineId() {
   const raw = typeof process !== "undefined" ? `${process.env?.USER || process.env?.USERNAME || "u"}:${process.platform || "x"}:${process.arch || "x"}` : "browser";

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@pentatonic-ai/ai-agent-sdk",
-  "version": "0.10.5",
+  "version": "0.10.6",
   "description": "TES SDK — LLM observability and lifecycle tracking via Pentatonic Thing Event System. Track token usage, tool calls, and conversations. Manage things through event-sourced lifecycle stages with AI enrichment and vector search.",
   "type": "module",
   "main": "./dist/index.cjs",

package/packages/memory-engine-v2/compat/requirements.txt CHANGED Viewed

@@ -4,3 +4,9 @@ psycopg[binary,pool]==3.2.3
 httpx==0.27.2
 qdrant-client==1.12.1
 pydantic==2.9.2
+# BET 3 (hybrid retrieval): CPU-only BM25 sparse encoder for the named
+# 'lex' vector. 0.3.6 = the exact pin qdrant-client 1.12.1's own
+# [fastembed] extra uses (and it requires python <3.13 — the compat
+# image is python:3.12-slim). Only imported lazily when
+# SEARCH_HYBRID_ENABLED is on; flag-off behavior is unchanged.
+fastembed==0.3.6

package/packages/memory-engine-v2/compat/server.py CHANGED Viewed

@@ -102,6 +102,24 @@ SEARCH_INTENT_BOOST = os.environ.get("SEARCH_INTENT_BOOST", "1") not in ("0", "f
 # without a parseable timestamp sink to the bottom but aren't dropped.
 SEARCH_TEMPORAL_RERANK = os.environ.get("SEARCH_TEMPORAL_RERANK", "1") not in ("0", "false", "")
+# ── Hybrid lexical+dense retrieval (roadmap BET 3) ───────────────────
+# SEARCH_HYBRID_ENABLED gates EVERY hybrid behavior in one switch:
+#   - /store and /store-batch additionally write a NAMED sparse vector
+#     ("lex", BM25 term weights via fastembed, CPU-only) alongside the
+#     existing unnamed dense vector. The dense embedder + its vectors
+#     are never touched — additive only, zero dense re-embed.
+#   - startup runs an idempotent update_collection to add the sparse
+#     vector config ("lex": IDF modifier, on-disk index) when missing.
+#   - /search swaps the single dense search() for a server-side
+#     RRF-fused query_points(prefetch=[dense, lex]) — everything
+#     downstream (dedup → intent boost → MMR/temporal → quota →
+#     hydration) is untouched; the RRF score lands in r.score.
+# Default OFF (env unset/0/false): the request path is byte-identical
+# to pre-hybrid behavior and fastembed is never imported at all.
+SEARCH_HYBRID_ENABLED = os.environ.get("SEARCH_HYBRID_ENABLED", "") not in ("", "0", "false")
+SPARSE_VECTOR_NAME = "lex"
+SPARSE_MODEL_NAME = os.environ.get("SEARCH_SPARSE_MODEL", "Qdrant/bm25")
 TEMPORAL_INTENT_RE = re.compile(
     r"\b(when did|when was|last (?:time|met|saw|spoke|called)|"
     r"how long ago|first time (?:i|we) (?:met|saw|spoke)|recent(?:ly)?|"
@@ -116,6 +134,18 @@ FACTUAL_INTENT_RE = re.compile(
 )
 INTENT_BOOSTS: dict[str, dict[str, float]] = {
     # source_kind -> additive boost on cosine score
+    #
+    # ⚠️ HYBRID-RRF RECALIBRATION NEEDED (BET 3): these magnitudes were
+    # tuned against COSINE similarity scores (typical 0.7–0.85 range,
+    # where +0.06 flips a near-tie). When SEARCH_HYBRID_ENABLED is on,
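+    # /search returns RRF fusion scores instead — 1/(k+rank) per
+    # prefetch leg, summed across the legs. Assuming k=60 that is
+    # ~0.016 at rank 1 decaying to ~0.006 at rank 100 for a single
+    # leg (at most ~0.033 when a point tops both legs).
+    # NOTE(review): k=60 is the textbook RRF constant — confirm the
+    # value the deployed Qdrant version actually applies before
+    # recalibrating; the arithmetic here depends on it.
+    # On that scale a +0.06 additive boost is no longer a nudge: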
+    # it catapults any matching source_kind above EVERY un-boosted
+    # result regardless of rank. Do not flip the hybrid flag to
+    # default-on until these are recalibrated against eval-harness
+    # numbers (see eval/recall_at_k.py); flag-off default protects
+    # prod until then.
     "temporal": {"event": 0.08, "doc": 0.04, "note": 0.02},
     "factual": {"doc": 0.06, "note": 0.03, "event": 0.03},
 }
@@ -184,6 +214,64 @@ def _apply_temporal_sort(
     return sorted(results, key=neg_ts)
+# ── Sparse (BM25) encoding — hybrid retrieval, BET 3 ─────────────────
+# fastembed's Qdrant/bm25 sparse encoder. CPU-only — no GPU contention
+# with the dense embed gateway. Lazily initialised so that (a) flag-off
+# deployments never import fastembed (it isn't even a hard dependency
+# of the request path) and (b) the model artifact download happens on
+# first use, not at process start.
+_sparse_encoder: Any | None = None
+def _get_sparse_encoder() -> Any:
+    """Return the shared sparse encoder, constructing it on first call.
+    The instance is cached in the module-level ``_sparse_encoder`` so
+    every later call returns the same object."""
+    global _sparse_encoder
+    if _sparse_encoder is not None:
+        return _sparse_encoder
+    # Imported here rather than at module top: module load must stay
+    # fastembed-free when SEARCH_HYBRID_ENABLED is off.
+    from fastembed import SparseTextEmbedding
+    _sparse_encoder = SparseTextEmbedding(model_name=SPARSE_MODEL_NAME)
+    log.info(f"sparse encoder initialised: {SPARSE_MODEL_NAME}")
+    return _sparse_encoder
+def _to_sparse_vector(emb: Any) -> qmodels.SparseVector:
+    """Convert a fastembed sparse embedding into a Qdrant SparseVector,
+    coercing every index to ``int`` and every weight to ``float``."""
+    plain_indices = list(map(int, emb.indices))
+    plain_values = list(map(float, emb.values))
+    return qmodels.SparseVector(indices=plain_indices, values=plain_values)
+async def _sparse_encode_documents(texts: list[str]) -> list[qmodels.SparseVector]:
+    """BM25-encode full document content for the named 'lex' vector.
+    Both the encoder lookup and the encoding run in a worker thread:
+    fastembed is synchronous CPU work, and the first lookup also
+    constructs the encoder, so neither may run on the event loop
+    under concurrent /store-batch load.
+    Returns the encoded vectors in encoder output order; an empty
+    input returns [] without touching the encoder."""
+    if not texts:
+        return []
+    def _encode() -> list[Any]:
+        # Resolve the encoder inside the thread so a cold start does
+        # not stall the loop. Concurrent cold calls may each build an
+        # encoder; the last assignment to the module global wins.
+        return list(_get_sparse_encoder().embed(texts))
+    embs = await asyncio.to_thread(_encode)
+    return [_to_sparse_vector(e) for e in embs]
+async def _sparse_encode_query(text: str) -> qmodels.SparseVector:
+    """BM25-encode a query. `query_embed` (not `embed`) — BM25 weights
+    documents by term frequency/length but queries as bare term sets;
+    the IDF half lives server-side via Modifier.IDF on the collection.
+    The encoder lookup runs in the worker thread together with the
+    encoding, so a cold start (first-use encoder construction) does
+    not block the event loop on the /search path.
+    Raises IndexError if the encoder yields no embedding."""
+    def _encode() -> list[Any]:
+        # Concurrent cold calls may each build an encoder; the last
+        # assignment to the module global wins.
+        return list(_get_sparse_encoder().query_embed(text))
+    embs = await asyncio.to_thread(_encode)
+    return _to_sparse_vector(embs[0])
+def _dense_vector_of(candidate: Any) -> Any:
+    """Return the dense vector carried by a scored point.
+    A dict-valued ``vector`` is treated as a named-vector bag and the
+    entry under the default '' name is returned (None when absent).
+    Any other value — a bare list, or None when the attribute is
+    missing — is returned as-is."""
+    vec = getattr(candidate, "vector", None)
+    return vec.get("") if isinstance(vec, dict) else vec
 def _mmr_select(
     candidates: list[Any], target: int, lambda_: float
 ) -> list[Any]:
@@ -199,10 +287,12 @@ def _mmr_select(
     if not candidates or target <= 0:
         return []
     # Bail to pure-relevance ordering if vectors weren't returned.
-    if any(getattr(c, "vector", None) is None for c in candidates):
+    # (_dense_vector_of unwraps the hybrid named-vector bag; flag-off
+    # bare-list vectors pass through unchanged.)
+    if any(_dense_vector_of(c) is None for c in candidates):
         return sorted(candidates, key=lambda r: r.score, reverse=True)[:target]
-    vecs = np.asarray([c.vector for c in candidates], dtype=np.float32)
+    vecs = np.asarray([_dense_vector_of(c) for c in candidates], dtype=np.float32)
     scores = np.asarray([c.score for c in candidates], dtype=np.float32)
     # Precompute pairwise similarity matrix; cheaper than per-step
     # dot products at our scale and lets us slice into it by index.
@@ -239,6 +329,47 @@ _qdrant: AsyncQdrantClient | None = None
 _http: httpx.AsyncClient | None = None
+def _sparse_vectors_config() -> dict[str, Any]:
+    """Build the schema for the 'lex' named sparse vector (BET 3).
+    modifier=IDF — Qdrant computes/applies IDF server-side, so the
+    client-side BM25 encoding only has to supply term frequency ×
+    length normalisation (what fastembed's Qdrant/bm25 produces).
+    on_disk=True — the sparse index sits on disk next to the dense
+    vectors instead of competing for RAM; the 06-05 outage was disk
+    pressure, not RAM, and mmap/page-cache governs the hot set the
+    same way the dense side is configured."""
+    lex_params = qmodels.SparseVectorParams(
+        modifier=qmodels.Modifier.IDF,
+        index=qmodels.SparseIndexParams(on_disk=True),
+    )
+    return {SPARSE_VECTOR_NAME: lex_params}
+async def _ensure_sparse_vector_config() -> bool:
+    """Add the 'lex' sparse vector config to the collection if absent.
+    Reads the current collection config and returns False when the
+    named sparse vector is already declared; otherwise issues one
+    update_collection carrying the sparse config, logs it, and
+    returns True. Intended to be called from lifespan only when
+    SEARCH_HYBRID_ENABLED, so flag-off startups never touch the
+    collection config. The update is additive metadata: existing
+    points and the unnamed dense vector are untouched (no re-embed,
+    no rebuild)."""
+    collection = await _qdrant.get_collection(COLLECTION_NAME)
+    declared = getattr(collection.config.params, "sparse_vectors", None)
+    if declared and SPARSE_VECTOR_NAME in declared:
+        return False
+    await _qdrant.update_collection(
+        collection_name=COLLECTION_NAME,
+        sparse_vectors_config=_sparse_vectors_config(),
+    )
+    message = (
+        f"added sparse vector config '{SPARSE_VECTOR_NAME}' "
+        f"(modifier=idf, on_disk=true) to collection {COLLECTION_NAME}"
+    )
+    log.info(message)
+    return True
 @asynccontextmanager
 async def lifespan(app: FastAPI):
     global _pool, _qdrant, _http
@@ -260,6 +391,13 @@ async def lifespan(app: FastAPI):
         collections = await _qdrant.get_collections()
         names = {c.name for c in collections.collections}
         if COLLECTION_NAME not in names:
+            create_kwargs: dict[str, Any] = {}
+            if SEARCH_HYBRID_ENABLED:
+                # Fresh collection with the flag on gets the 'lex'
+                # sparse config at creation time (BET 3); existing
+                # collections are migrated by
+                # _ensure_sparse_vector_config below.
+                create_kwargs["sparse_vectors_config"] = _sparse_vectors_config()
             await _qdrant.create_collection(
                 collection_name=COLLECTION_NAME,
                 vectors_config=qmodels.VectorParams(
@@ -275,6 +413,7 @@ async def lifespan(app: FastAPI):
                         always_ram=False,
                     )
                 ),
+                **create_kwargs,
             )
             log.info(f"created qdrant collection: {COLLECTION_NAME} dim={EMBED_DIM}")
             # Payload indexes for fast filtered search (this is the
@@ -286,6 +425,12 @@ async def lifespan(app: FastAPI):
                     field_schema=qmodels.PayloadSchemaType.KEYWORD,
                 )
             log.info("created qdrant payload indexes: arena, source_kind, clientId, userId")
+        if SEARCH_HYBRID_ENABLED:
+            # BET 3 migration — idempotent, additive-only; no-op when
+            # the 'lex' config is already present. Flag-off startups
+            # never reach this line, so the collection config is
+            # byte-identical to today until the flag is flipped.
+            await _ensure_sparse_vector_config()
     except Exception as e:
         log.error(f"qdrant init error: {e}")
         # Don't crash compat on Qdrant init failure — let liveness
@@ -553,6 +698,16 @@ async def store(req: StoreRequest):
     event_id = await _extract(arena, clientId, userId, source_kind, req.content, meta)
     embeddings = await _embed_batch([req.content])
+    # BET 3: BM25-encode the FULL content into the named 'lex' sparse
+    # vector. Encode failure degrades to dense-only (ingest must not
+    # fail on the lexical leg; the backfill script repairs gaps).
+    sparse_vec: Any | None = None
+    if SEARCH_HYBRID_ENABLED:
+        try:
+            sparse_vec = (await _sparse_encode_documents([req.content]))[0]
+        except Exception as e:
+            log.warning(f"sparse encode failed; storing dense-only (backfill repairs): {e}")
     vector_id = str(uuid.uuid4())
     # Write vector_provenance + Qdrant point in the same logical
     # operation. If Qdrant fails, the provenance row gets rolled back —
@@ -569,7 +724,15 @@ async def store(req: StoreRequest):
             points=[
                 qmodels.PointStruct(
                     id=vector_id,
-                    vector=embeddings[0],
+                    # Flag-off: bare dense list — byte-identical to
+                    # today. Flag-on: named-vector bag; the dense
+                    # vector keeps its unnamed ('') slot, 'lex' is
+                    # purely additive.
+                    vector=(
+                        embeddings[0]
+                        if sparse_vec is None
+                        else {"": embeddings[0], SPARSE_VECTOR_NAME: sparse_vec}
+                    ),
                     # Issue #345 (caps #342/#343/#344): Pip emits a rich
                     # metadata bag — timestamp, contact_email, channel,
                     # kind, direction, source, etc. Pre-fix the payload
@@ -621,6 +784,22 @@ async def store_batch(req: StoreBatchRequest):
     if len(embeddings) != len(texts):
         raise HTTPException(500, f"embed count mismatch: {len(embeddings)} vs {len(texts)}")
+    # BET 3: sparse-encode the FULL content batch for the named 'lex'
+    # vector. Best-effort — a sparse failure degrades the whole batch
+    # to dense-only rather than failing ingest (backfill repairs).
+    sparse_vecs: list[Any] | None = None
+    if SEARCH_HYBRID_ENABLED:
+        try:
+            sparse_vecs = await _sparse_encode_documents(texts)
+            if len(sparse_vecs) != len(texts):
+                log.warning(
+                    f"sparse encode count mismatch ({len(sparse_vecs)} vs {len(texts)}); storing dense-only"
+                )
+                sparse_vecs = None
+        except Exception as e:
+            log.warning(f"sparse encode failed; storing dense-only (backfill repairs): {e}")
+            sparse_vecs = None
     # Resolve per-record routing fields first so we can fan out the
     # extractor-sync calls in parallel. Each _extract is a network
     # round-trip; serialising them was the dominant cost in /store-batch
@@ -644,9 +823,9 @@ async def store_batch(req: StoreBatchRequest):
     ids: list[str] = []
     points: list[qmodels.PointStruct] = []
     provenance_rows: list[tuple] = []
-    for (arena, clientId, userId, source_kind, content, meta), vec, event_id in zip(
+    for idx, ((arena, clientId, userId, source_kind, content, meta), vec, event_id) in enumerate(zip(
         resolved, embeddings, event_ids
-    ):
+    )):
         vector_id = str(uuid.uuid4())
         provenance_rows.append((vector_id, event_id, "nv-embed-v2", EMBED_DIM))
         # See /store above — issue #345. Spread the caller's metadata
@@ -655,7 +834,13 @@ async def store_batch(req: StoreBatchRequest):
         # work with. Structural keys override on collision.
         points.append(qmodels.PointStruct(
             id=vector_id,
-            vector=vec,
+            # BET 3: flag-off keeps the bare dense list (byte-identical
+            # to today); flag-on adds the named 'lex' sparse vector.
+            vector=(
+                vec
+                if sparse_vecs is None
+                else {"": vec, SPARSE_VECTOR_NAME: sparse_vecs[idx]}
+            ),
             payload={
                 **(meta or {}),
                 "event_id": event_id,
@@ -896,18 +1081,73 @@ async def search(req: SearchRequest):
     # vector-payload bandwidth (4096 × float32 × overfetch) when
     # vectors won't be used.
     temporal_active = (intent == "temporal") and SEARCH_TEMPORAL_RERANK
-    raw_results = await _qdrant.search(
-        collection_name=COLLECTION_NAME,
-        query_vector=qvec,
-        query_filter=filter_,
-        limit=max(overfetch, target_limit),
-        score_threshold=req.min_score,
-        with_payload=True,
-        # Phase 3 (#343): MMR needs the actual vectors to score pairwise
-        # similarity. Only pull them when MMR is enabled AND we aren't
-        # about to skip MMR for a temporal re-rank.
-        with_vectors=SEARCH_MMR_ENABLED and not temporal_active,
-    )
+    fetch_limit = max(overfetch, target_limit)
+    # Phase 3 (#343): MMR needs the actual vectors to score pairwise
+    # similarity. Only pull them when MMR is enabled AND we aren't
+    # about to skip MMR for a temporal re-rank.
+    fetch_vectors = SEARCH_MMR_ENABLED and not temporal_active
+    # ── BET 3: hybrid lexical+dense retrieval ────────────────────────
+    # Flag on → encode the query with BM25 and replace the single dense
+    # search() with a server-side RRF fusion over two prefetch legs
+    # (dense on the unnamed '' vector, lexical on the named 'lex'
+    # sparse vector). Qdrant runs both legs inside one request, fuses
+    # by reciprocal rank (1/(k+rank), k=60), and the fused score lands
+    # in r.score — everything downstream (dedup → intent boost →
+    # MMR/temporal → quota → hydration) is untouched.
+    #
+    # ⚠️ SCORE-SCALE CAVEAT (recalibration required before default-on):
+    # RRF scores live on a ~0.006–0.033 scale, NOT the cosine 0.7–0.85
+    # scale the intent-boost magnitudes (+0.02…+0.08, see INTENT_BOOSTS)
+    # were tuned against. With hybrid on, those additive boosts dominate
+    # the fused ranking instead of nudging it. The flag-off default
+    # protects prod until eval-harness numbers (eval/recall_at_k.py)
+    # exist to recalibrate them. `min_score` is likewise a cosine-scale
+    # knob, so it is NOT applied to the fused path.
+    #
+    # A sparse-encode failure (e.g. fastembed missing/model fetch
+    # failed) logs and falls back to the legacy dense-only path —
+    # /search availability never depends on the lexical leg.
+    sparse_qvec: Any | None = None
+    if SEARCH_HYBRID_ENABLED:
+        try:
+            sparse_qvec = await _sparse_encode_query(req.query)
+        except Exception as e:
+            log.warning(f"sparse query encode failed; dense-only fallback: {e}")
+    if sparse_qvec is not None:
+        fused = await _qdrant.query_points(
+            collection_name=COLLECTION_NAME,
+            prefetch=[
+                qmodels.Prefetch(
+                    query=qvec,
+                    using="",  # the unnamed dense vector's internal name
+                    filter=filter_,
+                    limit=fetch_limit,
+                ),
+                qmodels.Prefetch(
+                    query=sparse_qvec,
+                    using=SPARSE_VECTOR_NAME,
+                    filter=filter_,
+                    limit=fetch_limit,
+                ),
+            ],
+            query=qmodels.FusionQuery(fusion=qmodels.Fusion.RRF),
+            limit=fetch_limit,
+            with_payload=True,
+            with_vectors=fetch_vectors,
+        )
+        raw_results = fused.points
+    else:
+        raw_results = await _qdrant.search(
+            collection_name=COLLECTION_NAME,
+            query_vector=qvec,
+            query_filter=filter_,
+            limit=fetch_limit,
+            score_threshold=req.min_score,
+            with_payload=True,
+            with_vectors=fetch_vectors,
+        )
     # (a) dedup by event_id — first occurrence wins (highest score).
     seen_eids: set[str] = set()

package/packages/memory-engine-v2/eval/recall_at_k.py ADDED Viewed

@@ -0,0 +1,242 @@
+#!/usr/bin/env python3
+"""Retrieval eval: recall@k / nDCG@k for hybrid (SEARCH_HYBRID_ENABLED=1)
+vs baseline dense-only /search (roadmap BET 3).
+The hybrid flag is a SERVER-side env var, not a request parameter, so a
+flag-on/flag-off comparison needs either (a) two compat instances — one
+with the flag on, one off — passed as --base-url-on/--base-url-off, or
+(b) two separate runs against one instance while the operator flips the
+flag, each labelled with --label and saved with --out, then compared
+offline with --compare run_a.json run_b.json.
+Stdlib-only (urllib) — runnable on the engine box or anywhere with HTTP
+access to compat. This script makes NO calls until you point it at an
+engine (--base-url*); CI never runs it. Usage:
+  # two instances side by side
+  python3 recall_at_k.py --golden retrieval_golden.seed.json \
+      --base-url-off http://127.0.0.1:8099 \
+      --base-url-on  http://127.0.0.1:8098 \
+      --k 5 10 20
+  # one instance, two passes (operator flips SEARCH_HYBRID_ENABLED between)
+  python3 recall_at_k.py --golden ... --base-url http://127.0.0.1:8099 \
+      --label flag-off --out runs/off.json
+  python3 recall_at_k.py --golden ... --base-url http://127.0.0.1:8099 \
+      --label flag-on  --out runs/on.json
+  python3 recall_at_k.py --compare runs/off.json runs/on.json
+Metrics per question (and mean over questions):
+  recall@k  — |relevant ∩ top-k| / |relevant|
+  nDCG@k    — graded (relevance 2/1), log2 discount, normalised by the
+              ideal ordering of that question's judged set.
+Questions whose `relevant` list still contains placeholders (or is
+empty) are skipped and reported as unjudged.
+"""
+from __future__ import annotations
+import argparse
+import json
+import math
+import sys
+import urllib.error
+import urllib.request
+DEFAULT_KS = [5, 10, 20]
+# ----------------------------------------------------------------------
+# Metrics — pure functions, unit-testable without any engine.
+# ----------------------------------------------------------------------
+def recall_at_k(ranked_ids: list[str], relevant_ids: set[str], k: int) -> float:
+    """Return |relevant ∩ top-k| / |relevant| for one ranking.
+    ranked_ids is the engine's result order, relevant_ids the judged
+    set, k the cutoff. Returns 0.0 when there are no relevant ids or
+    when k is not positive (a negative k would otherwise slice the
+    ranking from the tail instead of truncating it)."""
+    if not relevant_ids or k <= 0:
+        return 0.0
+    # Intersect as sets so an id repeated in the ranking is credited
+    # once; counting occurrences could push recall above 1.0.
+    hits = len(set(ranked_ids[:k]).intersection(relevant_ids))
+    return hits / len(relevant_ids)
+def dcg_at_k(ranked_ids: list[str], gains: dict[str, float], k: int) -> float:
+    """Return the discounted cumulative gain of the top-k ranking.
+    gains maps a judged id to its graded relevance; ids not in gains
+    contribute 0. The discount is log2(position + 2), so the first
+    position has discount 1. Returns 0.0 when k is not positive."""
+    if k <= 0:
+        return 0.0
+    seen: set[str] = set()
+    total = 0.0
+    for position, rid in enumerate(ranked_ids[:k]):
+        if rid in seen:
+            # A repeated id was already credited at its first (best)
+            # position; crediting it again could push nDCG above 1.0.
+            continue
+        seen.add(rid)
+        total += gains.get(rid, 0.0) / math.log2(position + 2)
+    return total
+def ndcg_at_k(ranked_ids: list[str], gains: dict[str, float], k: int) -> float:
+    """Return DCG@k normalised by the ideal DCG@k of the judged set.
+    The ideal ordering is the gains sorted descending and cut at k.
+    Returns 0.0 when k is not positive (a negative k would otherwise
+    cut the ideal list from the tail) or when the ideal DCG is not
+    positive (nothing judged, or all gains zero)."""
+    if k <= 0:
+        return 0.0
+    ideal = sorted(gains.values(), reverse=True)[:k]
+    idcg = sum(g / math.log2(i + 2) for i, g in enumerate(ideal))
+    if idcg <= 0:
+        return 0.0
+    return dcg_at_k(ranked_ids, gains, k) / idcg
+def is_judged(question: dict) -> bool:
+    """Return True when a golden question can be scored.
+    A question is judged only if its `relevant` list is non-empty and
+    every entry carries a non-empty `event_id` that does not contain
+    "PLACEHOLDER". Entries with a missing or empty event_id make the
+    question unjudged, so scoring never indexes a key that is absent."""
+    rel = question.get("relevant") or []
+    if not rel:
+        return False
+    for entry in rel:
+        event_id = entry.get("event_id") or ""
+        if not event_id or "PLACEHOLDER" in event_id:
+            return False
+    return True
+def evaluate_ranking(ranked_ids: list[str], question: dict, ks: list[int]) -> dict:
+    """Score one ranking against one question's judged set.
+    Returns {"recall": {k: value}, "ndcg": {k: value}} for every k in
+    ks. An entry without an explicit `relevance` is graded 1."""
+    judged = question.get("relevant") or []
+    relevant_ids: set[str] = set()
+    gains: dict[str, float] = {}
+    for entry in judged:
+        event_id = entry["event_id"]
+        relevant_ids.add(event_id)
+        gains[event_id] = float(entry.get("relevance", 1))
+    recall_by_k = {k: recall_at_k(ranked_ids, relevant_ids, k) for k in ks}
+    ndcg_by_k = {k: ndcg_at_k(ranked_ids, gains, k) for k in ks}
+    return {"recall": recall_by_k, "ndcg": ndcg_by_k}
+def summarize(per_question: list[dict], ks: list[int]) -> dict:
+    """Average the per-question metrics into one summary.
+    Returns {"n", "recall", "ndcg"} where each metric maps k to the
+    mean over per_question; with no questions every mean is 0.0 and
+    n is 0."""
+    count = len(per_question)
+    if count == 0:
+        return {"recall": dict.fromkeys(ks, 0.0), "ndcg": dict.fromkeys(ks, 0.0), "n": 0}
+    def mean_of(metric: str) -> dict:
+        return {
+            k: sum(q["metrics"][metric][k] for q in per_question) / count
+            for k in ks
+        }
+    return {"n": count, "recall": mean_of("recall"), "ndcg": mean_of("ndcg")}
+# ----------------------------------------------------------------------
+# Engine I/O
+# ----------------------------------------------------------------------
+def search(base_url: str, query: str, arena: str, limit: int, timeout: float = 30.0) -> list[str]:
+    """POST one query to `<base_url>/search` and return the result ids.
+    The JSON body carries query, arena and limit; the ids are read
+    from the `id` field of each entry under `results`, in response
+    order (an absent `results` key yields [])."""
+    endpoint = base_url.rstrip("/") + "/search"
+    payload = json.dumps({"query": query, "arena": arena, "limit": limit}).encode()
+    request = urllib.request.Request(
+        endpoint,
+        data=payload,
+        headers={"Content-Type": "application/json"},
+        method="POST",
+    )
+    with urllib.request.urlopen(request, timeout=timeout) as response:
+        parsed = json.loads(response.read())
+    hits = parsed.get("results", [])
+    return [hit["id"] for hit in hits]
+def run_pass(base_url: str, golden: dict, ks: list[int], label: str) -> dict:
+    """Run every judged golden question against one engine and score it.
+    Unjudged questions are skipped and listed under "unjudged".
+    A question whose search fails — transport error, non-JSON body, or
+    a result entry without an `id` — is reported on stderr, listed
+    under "failed", and left out of the summary; one bad response no
+    longer aborts the pass and discards the results gathered so far.
+    Returns a dict with label, base_url, ks, unjudged, failed,
+    per_question and summary."""
+    max_k = max(ks)
+    per_question = []
+    unjudged = []
+    failed = []
+    for q in golden.get("questions", []):
+        if not is_judged(q):
+            unjudged.append(q.get("id"))
+            continue
+        arena = q.get("arena") or golden.get("default_arena")
+        # Read the query outside the try so a malformed golden entry
+        # is not misreported as a search failure.
+        query = q["query"]
+        try:
+            ranked = search(base_url, query, arena, limit=max_k)
+        except (urllib.error.URLError, OSError, ValueError, KeyError) as e:
+            # ValueError covers json.JSONDecodeError; KeyError covers
+            # a result entry lacking "id".
+            print(f"  [{label}] {q['id']}: SEARCH FAILED: {e}", file=sys.stderr)
+            failed.append(q["id"])
+            continue
+        m = evaluate_ranking(ranked, q, ks)
+        per_question.append({"id": q["id"], "class": q.get("class"),
+                             "ranked": ranked, "metrics": m})
+    return {
+        "label": label,
+        "base_url": base_url,
+        "ks": ks,
+        "unjudged": unjudged,
+        "failed": failed,
+        "per_question": per_question,
+        "summary": summarize(per_question, ks),
+    }
+def print_run(run: dict) -> None:
+    """Print one run's summary table and per-question recall to stdout.
+    Expects the dict shape produced by run_pass: label, base_url, ks,
+    summary, per_question and (optionally) unjudged."""
+    ks = run["ks"]
+    s = run["summary"]
+    print(f"\n== {run['label']} ({run['base_url']}) — {s.get('n', 0)} judged questions ==")
+    if run.get("unjudged"):
+        # Ids are copied from the golden file with .get("id"), so they
+        # may be None or non-string; str() keeps join() from raising.
+        skipped = ", ".join(str(qid) for qid in run["unjudged"])
+        print(f"   skipped (placeholders/empty): {skipped}")
+    header = "metric   " + "".join(f"  @{k:<5}" for k in ks)
+    print(header)
+    print("recall   " + "".join(f"  {s['recall'][k]:.3f} " for k in ks))
+    print("nDCG     " + "".join(f"  {s['ndcg'][k]:.3f} " for k in ks))
+    for q in run["per_question"]:
+        r = q["metrics"]
+        print(f"  {str(q['id']):<20} ({q.get('class') or '-':<8}) "
+              + " ".join(f"R@{k}={r['recall'][k]:.2f}" for k in ks))
+def print_comparison(off: dict, on: dict) -> None:
+    """Print the per-k metric deltas (on − off) between two runs.
+    Only cutoffs present in both runs are compared, in the order of
+    off["ks"], so two saved runs made with different --k sets no
+    longer raise KeyError. With identical k sets the output is the
+    same as before."""
+    on_ks = set(on["ks"])
+    ks = [k for k in off["ks"] if k in on_ks]
+    print(f"\n== Δ ({on['label']} − {off['label']}) ==")
+    for name in ("recall", "ndcg"):
+        deltas = "".join(
+            f"  {on['summary'][name][k] - off['summary'][name][k]:+.3f}" for k in ks
+        )
+        print(f"{name:<8}{deltas}   (k = {ks})")
+def _coerce_keys(run: dict) -> dict:
+    """Restore int keys after a JSON round-trip turned them into strings.
+    Rewrites run["ks"] and the "recall"/"ndcg" tables of the summary
+    and of every per-question metrics dict in place, then returns the
+    same run object."""
+    run["ks"] = list(map(int, run["ks"]))
+    scopes = [run["summary"]]
+    scopes.extend(q["metrics"] for q in run["per_question"])
+    for scope in scopes:
+        for name in ("recall", "ndcg"):
+            if name not in scope:
+                continue
+            scope[name] = {int(key): value for key, value in scope[name].items()}
+    return run
+def main() -> int:
+    """CLI entry point: run one or two eval passes, or compare saved runs.
+    --compare OFF_JSON ON_JSON prints both saved runs and their delta
+    without calling any engine. Otherwise --golden is required, plus
+    either --base-url (single pass) or both --base-url-off and
+    --base-url-on (two passes and a delta). --out writes the single
+    run, or {"runs": [...]} for two passes, as JSON.
+    Returns 0 on success; argument errors exit via argparse."""
+    # Local import: only the --out path needs it.
+    import os
+    p = argparse.ArgumentParser(description=__doc__,
+                                formatter_class=argparse.RawDescriptionHelpFormatter)
+    p.add_argument("--golden", default=None, help="golden questions JSON")
+    p.add_argument("--base-url", default=None, help="single engine base url")
+    p.add_argument("--base-url-off", default=None, help="flag-OFF engine base url")
+    p.add_argument("--base-url-on", default=None, help="flag-ON engine base url")
+    p.add_argument("--k", type=int, nargs="+", default=DEFAULT_KS)
+    p.add_argument("--label", default="run", help="label for single-pass mode")
+    p.add_argument("--out", default=None, help="write run JSON here")
+    p.add_argument("--compare", nargs=2, metavar=("OFF_JSON", "ON_JSON"),
+                   help="compare two previously saved runs; no engine calls")
+    args = p.parse_args()
+    if args.compare:
+        with open(args.compare[0], encoding="utf-8") as f:
+            off = _coerce_keys(json.load(f))
+        with open(args.compare[1], encoding="utf-8") as f:
+            on = _coerce_keys(json.load(f))
+        print_run(off)
+        print_run(on)
+        print_comparison(off, on)
+        return 0
+    if not args.golden:
+        p.error("--golden required unless --compare")
+    with open(args.golden, encoding="utf-8") as f:
+        golden = json.load(f)
+    ks = sorted(set(args.k))
+    if ks[0] <= 0:
+        # A non-positive cutoff has no meaning for recall@k / nDCG@k
+        # and a negative one would slice rankings from the tail.
+        p.error("--k values must be positive integers")
+    runs = []
+    if args.base_url_off and args.base_url_on:
+        runs.append(run_pass(args.base_url_off, golden, ks, "flag-off"))
+        runs.append(run_pass(args.base_url_on, golden, ks, "flag-on"))
+    elif args.base_url:
+        runs.append(run_pass(args.base_url, golden, ks, args.label))
+    else:
+        p.error("provide --base-url, or both --base-url-off and --base-url-on")
+    for r in runs:
+        print_run(r)
+    if len(runs) == 2:
+        print_comparison(runs[0], runs[1])
+    if args.out:
+        # The documented usage writes to runs/*.json; create the
+        # directory so a finished pass is not lost to a missing path.
+        out_dir = os.path.dirname(args.out)
+        if out_dir:
+            os.makedirs(out_dir, exist_ok=True)
+        with open(args.out, "w", encoding="utf-8") as f:
+            json.dump(runs[0] if len(runs) == 1 else {"runs": runs}, f, indent=2)
+        print(f"\nwrote {args.out}")
+    return 0
+if __name__ == "__main__":
+    sys.exit(main())