npm - @pentatonic-ai/ai-agent-sdk - Versions diffs - 0.10.4 → 0.10.6 - Mend

@pentatonic-ai/ai-agent-sdk 0.10.4 → 0.10.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18) hide show

package/packages/memory-engine-v2/tests/test_hybrid_retrieval.py ADDED Viewed

@@ -0,0 +1,810 @@
+"""Unit tests for flag-gated hybrid BM25+RRF retrieval (roadmap BET 3).
+Covers, without any live engine / network / docker:
+  - flag OFF  → /search uses the legacy `search()` call (query_points
+    never touched) and /store writes the bare unnamed dense vector —
+    i.e. the request path is unchanged.
+  - flag ON   → /search issues `query_points` with the exact two-leg
+    prefetch (dense on the unnamed '' vector, sparse on 'lex') fused by
+    FusionQuery(RRF); /store and /store-batch write the named-vector
+    bag {'': dense, 'lex': sparse} from FULL content.
+  - sparse-encode failure with flag ON → graceful dense-only fallback.
+  - backfill script dry-run math + state round-trip.
+  - eval harness metric math (recall@k / nDCG@k).
+  - the real fastembed encoder wrapper (skipped when fastembed is not
+    installed — pytest.importorskip / stdlib-runner skip).
+Dependency strategy: compat/server.py imports fastapi/pydantic/qdrant/
+httpx/numpy/psycopg at module load. These tests use the REAL packages
+when importable and install minimal in-memory stubs into sys.modules
+otherwise, so the suite runs both in a full dev env (pytest) and in a
+bare stdlib environment:
+    python3 packages/memory-engine-v2/tests/test_hybrid_retrieval.py
+"""
+from __future__ import annotations
+import asyncio
+import importlib.util
+import json
+import os
+import sys
+import tempfile
+import types
+from pathlib import Path
+_PKG = Path(__file__).resolve().parent.parent
+_SERVER = _PKG / "compat" / "server.py"
+_BACKFILL = _PKG / "scripts" / "backfill_sparse_vectors.py"
+_EVAL = _PKG / "eval" / "recall_at_k.py"
+try:
+    import pytest
+except ImportError:  # bare stdlib runner
+    pytest = None
+class _Skip(Exception):
+    """Stdlib-runner skip marker (pytest path uses pytest.skip)."""
+def _skip(msg: str):
+    if pytest is not None:
+        pytest.skip(msg)
+    raise _Skip(msg)
+# ----------------------------------------------------------------------
+# Minimal stubs for server.py's import surface (used only when the real
+# package is not importable in this environment).
+# ----------------------------------------------------------------------
+class _Rec:
+    """Generic kwargs-record stand-in for qdrant model classes."""
+    def __init__(self, **kw):
+        self._kw = kw
+        for k, v in kw.items():
+            setattr(self, k, v)
+    def __eq__(self, other):
+        return type(other) is type(self) and self._kw == other._kw
+    def __repr__(self):
+        return f"{type(self).__name__}({self._kw})"
+def _stub_qdrant_client() -> types.ModuleType:
+    mod = types.ModuleType("qdrant_client")
+    models = types.ModuleType("qdrant_client.models")
+    for name in (
+        "VectorParams", "ScalarQuantization", "ScalarQuantizationConfig",
+        "FieldCondition", "MatchAny", "MatchValue", "Filter", "PointStruct",
+        "FilterSelector", "SparseVector", "SparseVectorParams",
+        "SparseIndexParams", "Prefetch", "FusionQuery", "PointVectors",
+    ):
+        setattr(models, name, type(name, (_Rec,), {}))
+    class Distance:
+        COSINE = "Cosine"
+    class ScalarType:
+        INT8 = "int8"
+    class PayloadSchemaType:
+        KEYWORD = "keyword"
+    class Modifier:
+        IDF = "idf"
+    class Fusion:
+        RRF = "rrf"
+        DBSF = "dbsf"
+    models.Distance = Distance
+    models.ScalarType = ScalarType
+    models.PayloadSchemaType = PayloadSchemaType
+    models.Modifier = Modifier
+    models.Fusion = Fusion
+    class AsyncQdrantClient:  # never instantiated in tests
+        def __init__(self, *a, **kw):
+            pass
+    mod.AsyncQdrantClient = AsyncQdrantClient
+    mod.models = models
+    sys.modules["qdrant_client.models"] = models
+    return mod
+def _stub_pydantic() -> types.ModuleType:
+    mod = types.ModuleType("pydantic")
+    class _FieldInfo:
+        def __init__(self, default=None, default_factory=None):
+            self.default = default
+            self.default_factory = default_factory
+    def Field(default=None, **kw):
+        return _FieldInfo(default, kw.get("default_factory"))
+    class BaseModel:
+        def __init__(self, **kwargs):
+            ann: dict = {}
+            for klass in reversed(type(self).__mro__):
+                ann.update(getattr(klass, "__annotations__", {}))
+            for name in ann:
+                if name in kwargs:
+                    value = kwargs[name]
+                else:
+                    default = getattr(type(self), name, None)
+                    if isinstance(default, _FieldInfo):
+                        value = (default.default_factory()
+                                 if default.default_factory else default.default)
+                    else:
+                        value = default
+                setattr(self, name, value)
+    mod.BaseModel = BaseModel
+    mod.Field = Field
+    return mod
+def _stub_fastapi() -> types.ModuleType:
+    mod = types.ModuleType("fastapi")
+    class FastAPI:
+        def __init__(self, **kw):
+            pass
+        def get(self, path):
+            return lambda fn: fn
+        def post(self, path):
+            return lambda fn: fn
+    class HTTPException(Exception):
+        def __init__(self, status_code, detail=None):
+            super().__init__(detail)
+            self.status_code = status_code
+            self.detail = detail
+    mod.FastAPI = FastAPI
+    mod.HTTPException = HTTPException
+    return mod
+def _stub_httpx() -> types.ModuleType:
+    mod = types.ModuleType("httpx")
+    class AsyncClient:
+        def __init__(self, *a, **kw):
+            pass
+    class Timeout:
+        def __init__(self, *a, **kw):
+            pass
+    class HTTPStatusError(Exception):
+        def __init__(self, *a, **kw):
+            super().__init__(*a)
+    class TimeoutException(Exception):
+        pass
+    class NetworkError(Exception):
+        pass
+    mod.AsyncClient = AsyncClient
+    mod.Timeout = Timeout
+    mod.HTTPStatusError = HTTPStatusError
+    mod.TimeoutException = TimeoutException
+    mod.NetworkError = NetworkError
+    return mod
+def _stub_numpy() -> types.ModuleType:
+    mod = types.ModuleType("numpy")
+    def _unused(*a, **kw):  # MMR is vector-gated; tests never reach numpy
+        raise AssertionError("numpy stub should not be exercised by these tests")
+    mod.asarray = _unused
+    mod.max = _unused
+    mod.float32 = "float32"
+    return mod
+def _stub_psycopg() -> tuple[types.ModuleType, types.ModuleType, types.ModuleType]:
+    psycopg = types.ModuleType("psycopg")
+    rows = types.ModuleType("psycopg.rows")
+    rows.dict_row = object()
+    tjson = types.ModuleType("psycopg.types.json")
+    class Json:
+        def __init__(self, obj):
+            self.obj = obj
+    tjson.Json = Json
+    tmod = types.ModuleType("psycopg.types")
+    tmod.json = tjson
+    psycopg.rows = rows
+    psycopg.types = tmod
+    pool = types.ModuleType("psycopg_pool")
+    class AsyncConnectionPool:
+        def __init__(self, *a, **kw):
+            pass
+    pool.AsyncConnectionPool = AsyncConnectionPool
+    return psycopg, rows, tmod, tjson, pool
+def _ensure_modules():
+    """Install stubs for any of server.py's deps that aren't importable."""
+    def missing(name: str) -> bool:
+        if name in sys.modules:
+            return False
+        try:
+            return importlib.util.find_spec(name) is None
+        except (ImportError, ValueError):
+            return True
+    if missing("qdrant_client"):
+        sys.modules["qdrant_client"] = _stub_qdrant_client()
+    if missing("pydantic"):
+        sys.modules["pydantic"] = _stub_pydantic()
+    if missing("fastapi"):
+        sys.modules["fastapi"] = _stub_fastapi()
+    if missing("httpx"):
+        sys.modules["httpx"] = _stub_httpx()
+    if missing("numpy"):
+        sys.modules["numpy"] = _stub_numpy()
+    if missing("psycopg"):
+        psycopg, rows, tmod, tjson, pool = _stub_psycopg()
+        sys.modules["psycopg"] = psycopg
+        sys.modules["psycopg.rows"] = rows
+        sys.modules["psycopg.types"] = tmod
+        sys.modules["psycopg.types.json"] = tjson
+        sys.modules["psycopg_pool"] = pool
+    elif missing("psycopg_pool"):
+        _, _, _, _, pool = _stub_psycopg()
+        sys.modules["psycopg_pool"] = pool
+_LOAD_SEQ = 0
+def _load_module(path: Path, name: str):
+    spec = importlib.util.spec_from_file_location(name, path)
+    assert spec and spec.loader
+    mod = importlib.util.module_from_spec(spec)
+    sys.modules[name] = mod
+    spec.loader.exec_module(mod)
+    return mod
+def load_server(hybrid: bool):
+    """Fresh server module instance with SEARCH_HYBRID_ENABLED set
+    before import (the flag is read at module load)."""
+    global _LOAD_SEQ
+    _LOAD_SEQ += 1
+    _ensure_modules()
+    os.environ["SEARCH_HYBRID_ENABLED"] = "1" if hybrid else "0"
+    return _load_module(_SERVER, f"_compat_server_under_test_{_LOAD_SEQ}")
+# ----------------------------------------------------------------------
+# Async fakes (qdrant client / pg pool)
+# ----------------------------------------------------------------------
+class FakeScored:
+    def __init__(self, event_id: str, score: float, source_kind: str = "note",
+                 payload_extra: dict | None = None):
+        self.payload = {"event_id": event_id, "arena": "arena-a",
+                        "source_kind": source_kind, **(payload_extra or {})}
+        self.score = score
+        self.vector = None  # vector-less → MMR falls back to score order
+class FakeQdrant:
+    def __init__(self, search_results=None, query_points_results=None):
+        self.search_results = search_results or []
+        self.query_points_results = query_points_results or []
+        self.search_calls: list[dict] = []
+        self.query_points_calls: list[dict] = []
+        self.upsert_calls: list[dict] = []
+    async def search(self, **kw):
+        self.search_calls.append(kw)
+        return list(self.search_results)
+    async def query_points(self, **kw):
+        self.query_points_calls.append(kw)
+        return types.SimpleNamespace(points=list(self.query_points_results))
+    async def upsert(self, **kw):
+        self.upsert_calls.append(kw)
+class _AsyncCM:
+    def __init__(self, value):
+        self.value = value
+    async def __aenter__(self):
+        return self.value
+    async def __aexit__(self, *a):
+        return False
+class FakeCursor:
+    def __init__(self, rows=None):
+        self.rows = rows or []
+        self.executed: list[tuple] = []
+    async def execute(self, sql, params=None):
+        self.executed.append((sql, params))
+    async def executemany(self, sql, rows):
+        self.executed.append((sql, rows))
+    async def fetchall(self):
+        return list(self.rows)
+class FakeConn:
+    def __init__(self, cursor):
+        self._cursor = cursor
+    def cursor(self):
+        return _AsyncCM(self._cursor)
+class FakePool:
+    def __init__(self, cursor=None):
+        self.cursor = cursor or FakeCursor()
+    def connection(self):
+        return _AsyncCM(FakeConn(self.cursor))
+def _wire_search_fakes(server, qdrant, db_rows):
+    async def fake_embed(texts, lane="bulk"):
+        return [[0.1, 0.2, 0.3, 0.4] for _ in texts]
+    server._embed_batch = fake_embed
+    server._qdrant = qdrant
+    server._pool = FakePool(FakeCursor(rows=db_rows))
+def _db_row(event_id: str, content: str = "full content", ts: str | None = None):
+    attrs = {"timestamp": ts} if ts else {}
+    return {"id": event_id, "content": content, "attributes": attrs}
+# ----------------------------------------------------------------------
+# /search — flag OFF: legacy path, byte-identical behavior
+# ----------------------------------------------------------------------
+def test_flag_off_search_uses_legacy_search_not_query_points():
+    server = load_server(hybrid=False)
+    qdrant = FakeQdrant(search_results=[FakeScored("e1", 0.91), FakeScored("e2", 0.84)])
+    _wire_search_fakes(server, qdrant, [_db_row("e1"), _db_row("e2")])
+    out = asyncio.run(server.search(server.SearchRequest(query="who is pact", arena="arena-a")))
+    assert len(qdrant.search_calls) == 1, "flag-off must use the legacy search()"
+    assert qdrant.query_points_calls == [], "flag-off must NEVER call query_points"
+    call = qdrant.search_calls[0]
+    assert call["collection_name"] == "evidence"
+    assert call["query_vector"] == [0.1, 0.2, 0.3, 0.4]
+    assert call["limit"] == 30  # limit 10 × SEARCH_OVERFETCH_MULT 3
+    assert call["score_threshold"] == 0.001
+    assert call["with_payload"] is True
+    ids = [r["id"] for r in out["results"]]
+    assert ids == ["e1", "e2"]
+    assert out["results"][0]["content"] == "full content"
+def test_flag_off_never_calls_sparse_encoder():
+    server = load_server(hybrid=False)
+    def boom(*a, **kw):
+        raise AssertionError("sparse encoder must not be touched when flag is off")
+    server._get_sparse_encoder = boom
+    qdrant = FakeQdrant(search_results=[])
+    _wire_search_fakes(server, qdrant, [])
+    out = asyncio.run(server.search(server.SearchRequest(query="x", arena="arena-a")))
+    assert out == {"results": []}
+    assert len(qdrant.search_calls) == 1
+# ----------------------------------------------------------------------
+# /search — flag ON: RRF-fused query_points with two prefetch legs
+# ----------------------------------------------------------------------
+def _sentinel_sparse(server):
+    return server.qmodels.SparseVector(indices=[3, 17], values=[1.0, 1.0])
+def test_flag_on_search_uses_query_points_with_rrf_prefetch():
+    server = load_server(hybrid=True)
+    qdrant = FakeQdrant(query_points_results=[FakeScored("e1", 0.0163), FakeScored("e2", 0.0161)])
+    _wire_search_fakes(server, qdrant, [_db_row("e1"), _db_row("e2")])
+    sentinel = _sentinel_sparse(server)
+    async def fake_sparse_query(text):
+        return sentinel
+    server._sparse_encode_query = fake_sparse_query
+    out = asyncio.run(server.search(server.SearchRequest(query="acme invoice 4711", arena="arena-a")))
+    assert qdrant.search_calls == [], "flag-on must not use the legacy search()"
+    assert len(qdrant.query_points_calls) == 1
+    call = qdrant.query_points_calls[0]
+    assert call["collection_name"] == "evidence"
+    assert call["with_payload"] is True
+    assert call["limit"] == 30
+    prefetch = call["prefetch"]
+    assert len(prefetch) == 2
+    dense, sparse = prefetch
+    assert dense.using == ""  # unnamed dense vector
+    assert dense.query == [0.1, 0.2, 0.3, 0.4]
+    assert dense.limit == 30
+    assert dense.filter is not None
+    assert sparse.using == "lex"
+    assert sparse.query == sentinel
+    assert sparse.limit == 30
+    assert sparse.filter is not None
+    fusion_query = call["query"]
+    assert fusion_query.fusion == server.qmodels.Fusion.RRF
+    # downstream pipeline (dedup → hydration) untouched: RRF score
+    # surfaces as `similarity`, content hydrated from postgres.
+    assert [r["id"] for r in out["results"]] == ["e1", "e2"]
+    assert out["results"][0]["similarity"] == 0.0163
+    assert out["results"][0]["content"] == "full content"
+def test_flag_on_sparse_query_failure_falls_back_to_dense():
+    server = load_server(hybrid=True)
+    qdrant = FakeQdrant(search_results=[FakeScored("e1", 0.9)])
+    _wire_search_fakes(server, qdrant, [_db_row("e1")])
+    async def broken_sparse_query(text):
+        raise RuntimeError("fastembed unavailable")
+    server._sparse_encode_query = broken_sparse_query
+    out = asyncio.run(server.search(server.SearchRequest(query="x", arena="arena-a")))
+    assert len(qdrant.search_calls) == 1, "sparse failure must fall back to dense search()"
+    assert qdrant.query_points_calls == []
+    assert [r["id"] for r in out["results"]] == ["e1"]
+# ----------------------------------------------------------------------
+# /store + /store-batch — named sparse vector writes
+# ----------------------------------------------------------------------
+def _wire_store_fakes(server, qdrant):
+    async def fake_embed(texts, lane="bulk"):
+        return [[0.5, 0.6] for _ in texts]
+    async def fake_extract(arena, clientId, userId, source_kind, content, attributes):
+        return "evt-" + str(abs(hash(content)) % 10_000)
+    server._embed_batch = fake_embed
+    server._extract = fake_extract
+    server._qdrant = qdrant
+    server._pool = FakePool()
+def test_flag_off_store_writes_bare_dense_vector():
+    server = load_server(hybrid=False)
+    qdrant = FakeQdrant()
+    _wire_store_fakes(server, qdrant)
+    asyncio.run(server.store(server.StoreRequest(content="hello world", metadata={"arena": "arena-a"})))
+    assert len(qdrant.upsert_calls) == 1
+    point = qdrant.upsert_calls[0]["points"][0]
+    assert point.vector == [0.5, 0.6], "flag-off must keep the bare unnamed dense vector"
+    assert not isinstance(point.vector, dict)
+def test_flag_on_store_writes_named_dense_plus_lex_sparse_from_full_content():
+    server = load_server(hybrid=True)
+    qdrant = FakeQdrant()
+    _wire_store_fakes(server, qdrant)
+    sentinel = _sentinel_sparse(server)
+    seen_texts: list[list[str]] = []
+    async def fake_sparse_docs(texts):
+        seen_texts.append(list(texts))
+        return [sentinel for _ in texts]
+    server._sparse_encode_documents = fake_sparse_docs
+    long_content = "x" * 800  # > the 300-char content_preview truncation
+    asyncio.run(server.store(server.StoreRequest(content=long_content, metadata={"arena": "arena-a"})))
+    point = qdrant.upsert_calls[0]["points"][0]
+    assert isinstance(point.vector, dict)
+    assert point.vector[""] == [0.5, 0.6]
+    assert point.vector["lex"] == sentinel
+    # sparse encoding must see FULL content, not the 300-char preview
+    assert seen_texts == [[long_content]]
+    assert point.payload["content_preview"] == "x" * 300
+def test_flag_on_store_batch_writes_named_vectors_per_record():
+    server = load_server(hybrid=True)
+    qdrant = FakeQdrant()
+    _wire_store_fakes(server, qdrant)
+    s1 = server.qmodels.SparseVector(indices=[1], values=[1.0])
+    s2 = server.qmodels.SparseVector(indices=[2], values=[1.0])
+    async def fake_sparse_docs(texts):
+        assert texts == ["first record", "second record"]
+        return [s1, s2]
+    server._sparse_encode_documents = fake_sparse_docs
+    out = asyncio.run(server.store_batch(server.StoreBatchRequest(
+        records=[
+            {"content": "first record", "metadata": {"arena": "arena-a"}},
+            {"content": "second record", "metadata": {"arena": "arena-a"}},
+        ],
+        arena="arena-a",
+    )))
+    assert out["inserted"] == 2
+    points = qdrant.upsert_calls[0]["points"]
+    assert points[0].vector["lex"] == s1
+    assert points[1].vector["lex"] == s2
+    assert points[0].vector[""] == [0.5, 0.6]
+def test_flag_on_store_sparse_failure_degrades_to_dense_only():
+    server = load_server(hybrid=True)
+    qdrant = FakeQdrant()
+    _wire_store_fakes(server, qdrant)
+    async def broken(texts):
+        raise RuntimeError("model fetch failed")
+    server._sparse_encode_documents = broken
+    asyncio.run(server.store(server.StoreRequest(content="hello", metadata={"arena": "arena-a"})))
+    point = qdrant.upsert_calls[0]["points"][0]
+    assert point.vector == [0.5, 0.6], "sparse failure must not fail ingest"
+def test_flag_off_store_batch_keeps_bare_dense_vectors():
+    server = load_server(hybrid=False)
+    qdrant = FakeQdrant()
+    _wire_store_fakes(server, qdrant)
+    asyncio.run(server.store_batch(server.StoreBatchRequest(
+        records=[{"content": "rec", "metadata": {"arena": "arena-a"}}],
+        arena="arena-a",
+    )))
+    point = qdrant.upsert_calls[0]["points"][0]
+    assert point.vector == [0.5, 0.6]
+    assert not isinstance(point.vector, dict)
+# ----------------------------------------------------------------------
+# Collection migration helper
+# ----------------------------------------------------------------------
+class _FakeCollectionInfo:
+    def __init__(self, sparse: dict | None):
+        self.config = types.SimpleNamespace(
+            params=types.SimpleNamespace(sparse_vectors=sparse)
+        )
+def test_ensure_sparse_config_adds_when_missing():
+    server = load_server(hybrid=True)
+    calls = {}
+    class Q:
+        async def get_collection(self, name):
+            return _FakeCollectionInfo(sparse=None)
+        async def update_collection(self, collection_name, sparse_vectors_config):
+            calls["collection"] = collection_name
+            calls["config"] = sparse_vectors_config
+    server._qdrant = Q()
+    added = asyncio.run(server._ensure_sparse_vector_config())
+    assert added is True
+    assert calls["collection"] == "evidence"
+    cfg = calls["config"]["lex"]
+    assert cfg.modifier == server.qmodels.Modifier.IDF
+    assert cfg.index.on_disk is True
+def test_ensure_sparse_config_noop_when_present():
+    server = load_server(hybrid=True)
+    class Q:
+        async def get_collection(self, name):
+            return _FakeCollectionInfo(sparse={"lex": object()})
+        async def update_collection(self, **kw):
+            raise AssertionError("must not update when 'lex' already configured")
+    server._qdrant = Q()
+    assert asyncio.run(server._ensure_sparse_vector_config()) is False
+# ----------------------------------------------------------------------
+# Sparse encoder wrapper (real fastembed — skipped if not installed)
+# ----------------------------------------------------------------------
+def test_sparse_encoder_wrapper_roundtrip():
+    try:
+        import fastembed  # noqa: F401
+    except ImportError:
+        _skip("fastembed not installed in this test environment")
+    server = load_server(hybrid=True)
+    docs = asyncio.run(server._sparse_encode_documents(
+        ["the quick brown fox", "pays the invoice 4711"]
+    ))
+    assert len(docs) == 2
+    for d in docs:
+        assert len(d.indices) == len(d.values) > 0
+        assert all(isinstance(i, int) for i in d.indices)
+    q = asyncio.run(server._sparse_encode_query("invoice 4711"))
+    assert len(q.indices) == len(q.values) > 0
+def test_to_sparse_vector_coerces_numpy_like_arrays():
+    server = load_server(hybrid=True)
+    class FakeEmb:
+        indices = [7, 11, 13]
+        values = [0.5, 1.5, 2.0]
+    sv = server._to_sparse_vector(FakeEmb())
+    assert sv.indices == [7, 11, 13]
+    assert sv.values == [0.5, 1.5, 2.0]
+# ----------------------------------------------------------------------
+# Backfill script — dry-run math + state handling (stdlib only)
+# ----------------------------------------------------------------------
+def _load_backfill():
+    return _load_module(_BACKFILL, "_backfill_sparse_under_test")
+def test_backfill_batch_count_math():
+    bf = _load_backfill()
+    assert bf.batch_count(0, 256) == 0
+    assert bf.batch_count(1, 256) == 1
+    assert bf.batch_count(256, 256) == 1
+    assert bf.batch_count(257, 256) == 2
+    assert bf.batch_count(745_000, 256) == 2911
+    assert bf.batch_count(100, 0) == 0
+def test_backfill_eta_math():
+    bf = _load_backfill()
+    assert bf.eta_seconds(620_000, 400.0) == 1550.0
+    assert bf.eta_seconds(0, 400.0) == 0.0
+    assert bf.eta_seconds(100, 0) == 0.0
+    assert bf.format_eta(1550) == "25m50s"
+    assert bf.format_eta(7325) == "2h02m"
+    assert bf.format_eta(42) == "42s"
+def test_backfill_state_roundtrip_and_corruption_tolerance():
+    bf = _load_backfill()
+    with tempfile.TemporaryDirectory() as d:
+        path = os.path.join(d, "state.json")
+        assert bf.load_state(path) == {}
+        bf.save_state(path, {"next_offset": "abc-123", "scanned": 512})
+        assert bf.load_state(path) == {"next_offset": "abc-123", "scanned": 512}
+        with open(path, "w") as f:
+            f.write("{corrupt")
+        assert bf.load_state(path) == {}
+def test_backfill_defaults_are_safe():
+    bf = _load_backfill()
+    args = bf.parse_args([])
+    assert args.apply is False, "backfill must be dry-run by default"
+    assert args.collection == "evidence"
+    assert args.batch_size == 256
+    assert args.force is False
+# ----------------------------------------------------------------------
+# Eval harness metric math (stdlib only)
+# ----------------------------------------------------------------------
+def _load_eval():
+    return _load_module(_EVAL, "_recall_at_k_under_test")
+def test_eval_recall_at_k():
+    ev = _load_eval()
+    assert ev.recall_at_k(["a", "b", "c"], {"a", "c"}, 2) == 0.5
+    assert ev.recall_at_k(["a", "b", "c"], {"a", "c"}, 3) == 1.0
+    assert ev.recall_at_k([], {"a"}, 5) == 0.0
+    assert ev.recall_at_k(["a"], set(), 5) == 0.0
+def test_eval_ndcg_at_k():
+    ev = _load_eval()
+    gains = {"a": 2.0, "b": 1.0}
+    assert abs(ev.ndcg_at_k(["a", "b"], gains, 2) - 1.0) < 1e-9  # ideal order
+    worse = ev.ndcg_at_k(["b", "a"], gains, 2)
+    assert 0.0 < worse < 1.0
+    assert ev.ndcg_at_k(["x", "y"], gains, 2) == 0.0
+def test_eval_skips_placeholder_questions():
+    ev = _load_eval()
+    assert ev.is_judged({"relevant": [{"event_id": "EVENT_ID_PLACEHOLDER_1A"}]}) is False
+    assert ev.is_judged({"relevant": []}) is False
+    assert ev.is_judged({}) is False
+    assert ev.is_judged({"relevant": [{"event_id": "ev-real-1"}]}) is True
+def test_eval_seed_file_parses_and_is_all_placeholders():
+    ev = _load_eval()
+    with open(_PKG / "eval" / "retrieval_golden.seed.json") as f:
+        golden = json.load(f)
+    assert golden["questions"], "seed must ship with example questions"
+    assert all(not ev.is_judged(q) for q in golden["questions"]), (
+        "the committed seed must contain only placeholders — no live ids"
+    )
+# ----------------------------------------------------------------------
+# Stdlib runner (pytest collects the same functions when available)
+# ----------------------------------------------------------------------
+if __name__ == "__main__":
+    passed, skipped, failed = 0, 0, []
+    for name, fn in sorted(globals().items()):
+        if not (name.startswith("test_") and callable(fn)):
+            continue
+        try:
+            fn()
+            passed += 1
+            print(f"PASS  {name}")
+        except _Skip as e:
+            skipped += 1
+            print(f"SKIP  {name} ({e})")
+        except BaseException as e:  # pytest.skip raises BaseException subclass
+            if pytest is not None and isinstance(e, pytest.skip.Exception):
+                skipped += 1
+                print(f"SKIP  {name} ({e})")
+            else:
+                failed.append((name, e))
+                print(f"FAIL  {name}: {type(e).__name__}: {e}")
+    print(f"\n{passed} passed, {skipped} skipped, {len(failed)} failed")
+    sys.exit(1 if failed else 0)