npm - @pentatonic-ai/ai-agent-sdk - Versions diffs - 0.9.4 → 0.9.5 - Mend

@pentatonic-ai/ai-agent-sdk 0.9.4 → 0.9.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

package/dist/index.cjs +1 -1
package/dist/index.js +1 -1
package/package.json +1 -1
package/packages/memory-engine/docker-compose.yml +8 -1
package/packages/memory-engine/engine/services/l2/Dockerfile +7 -0
package/packages/memory-engine/engine/services/l2/l2-hybridrag-proxy.py +233 -60
package/packages/memory-engine/tests/test_l2_qmd_vec_search.py +280 -0

package/dist/index.cjs CHANGED Viewed

@@ -906,7 +906,7 @@ function fireAndForgetEmit(clientConfig, sessionOpts, messages, result, model) {
 }
 // src/telemetry.js
-var VERSION = "0.9.4";
+var VERSION = "0.9.5";
 var TELEMETRY_URL = "https://sdk-telemetry.philip-134.workers.dev";
 function machineId() {
   const raw = typeof process !== "undefined" ? `${process.env?.USER || process.env?.USERNAME || "u"}:${process.platform || "x"}:${process.arch || "x"}` : "browser";

package/dist/index.js CHANGED Viewed

@@ -875,7 +875,7 @@ function fireAndForgetEmit(clientConfig, sessionOpts, messages, result, model) {
 }
 // src/telemetry.js
-var VERSION = "0.9.4";
+var VERSION = "0.9.5";
 var TELEMETRY_URL = "https://sdk-telemetry.philip-134.workers.dev";
 function machineId() {
   const raw = typeof process !== "undefined" ? `${process.env?.USER || process.env?.USERNAME || "u"}:${process.platform || "x"}:${process.arch || "x"}` : "browser";

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@pentatonic-ai/ai-agent-sdk",
-  "version": "0.9.4",
+  "version": "0.9.5",
   "description": "TES SDK — LLM observability and lifecycle tracking via Pentatonic Thing Event System. Track token usage, tool calls, and conversations. Manage things through event-sourced lifecycle stages with AI enrichment and vector search.",
   "type": "module",
   "main": "./dist/index.cjs",

package/packages/memory-engine/docker-compose.yml CHANGED Viewed

@@ -220,7 +220,14 @@ services:
       interval: 10s
       timeout: 5s
       retries: 30
-      start_period: 60s
+      # 180s gives L2 enough time to finish Neo4j schema + index creation
+      # on a cold start before compat's healthcheck starts counting failures.
+      # Observed concretely on the v0.9.4 deploy (2026-05-14): L2 took
+      # ~90s to warm up; with start_period: 60s, compat went unhealthy
+      # mid-startup, cloudflared's `depends_on: condition: service_healthy`
+      # failed, and `docker compose up` errored out before wait_for_health
+      # could observe the eventual recovery.
+      start_period: 180s
 networks:
   engine-net:

package/packages/memory-engine/engine/services/l2/Dockerfile CHANGED Viewed

@@ -9,9 +9,16 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
 # Reranker = sentence-transformers MiniLM cross-encoder.
 # Torch CPU wheels are fine — reranker is small enough to be CPU-bound.
+#
+# sqlite-vec 0.1.9: native KNN over packed-f32 vectors stored in a vec0
+# virtual table. Replaces the legacy hand-rolled Python cosine loop over
+# JSON-serialised embeddings in search_qmd_informed (~15s timeout at 450k
+# rows → ~50ms native MATCH). Pin to 0.1.9 — that's the version probed
+# against L4 QMD's wire format (struct.pack f32 + cosine distance_metric).
 RUN pip install --no-cache-dir \
         fastapi "uvicorn[standard]" httpx requests pydantic \
         neo4j \
+        sqlite-vec==0.1.9 \
         "sentence-transformers" \
         "torch" --extra-index-url https://download.pytorch.org/whl/cpu

package/packages/memory-engine/engine/services/l2/l2-hybridrag-proxy.py CHANGED Viewed

@@ -18,6 +18,7 @@ import json
 import logging
 import os
 import sqlite3
+import struct
 import sys
 import time
 from contextlib import asynccontextmanager
@@ -34,6 +35,11 @@ from neo4j.time import DateTime as Neo4jDateTime, Date as Neo4jDate
 from pydantic import BaseModel
 import uvicorn
+try:
+    import sqlite_vec  # 0.1.9 — native KNN MATCH over packed-f32 vec0 tables
+except ImportError:
+    sqlite_vec = None  # Caller logs loudly if helpers can't load the extension
 # Shared embed client lives at engine/services/_shared/.
 sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
 from _shared.embed_provider import EmbedClient  # noqa: E402
@@ -101,6 +107,59 @@ QMD_DB_PATH = _resolve_qmd_db()
 OLLAMA_URL = os.environ.get("PME_OLLAMA_URL", "http://localhost:11434/api/embeddings")
 EMBEDDING_MODEL = os.environ.get("PME_EMBED_MODEL", "nomic-embed-text")
+# Embedding dimension for the vec0 virtual table. Production gateway
+# (lambda-gateway.pentatonic.com/v1/embed via pentatonic-gateway provider)
+# returns NV-Embed-v2 4096-dim vectors. The vec0 schema requires the dim
+# at DDL time and writers must match — keep this in lockstep with the
+# gateway / EmbedClient config.
+EMBED_DIM = int(os.environ.get("PME_EMBED_DIM", "4096"))
+def _open_qmd_conn() -> sqlite3.Connection:
+    """Open qmd.sqlite with sqlite-vec loaded.
+    Falls back to a plain sqlite3 connection if the extension can't load —
+    MATCH-form queries will then fail loudly at execute time, which is the
+    right signal (loud error > silent degradation back to Python cosine).
+    Callers that only need scalar columns (chunks.path, chunks.text) work
+    fine without the extension.
+    ``check_same_thread=False`` is intentional: the async backfill yields
+    via ``asyncio.to_thread`` to keep /search responsive, which means the
+    connection is handed off between event-loop / thread-pool workers.
+    sqlite's default thread-safety check would otherwise reject the
+    cross-thread reuse even though only one worker touches it at a time.
+    """
+    conn = sqlite3.connect(QMD_DB_PATH, timeout=10, check_same_thread=False)
+    if sqlite_vec is None:
+        log.error("sqlite_vec module not importable — qmd vec_index unavailable")
+        return conn
+    try:
+        conn.enable_load_extension(True)
+        sqlite_vec.load(conn)
+        conn.enable_load_extension(False)
+    except Exception as e:
+        log.error(f"sqlite-vec load failed: {e} — qmd search will be degraded")
+    return conn
+def _ensure_vec_index(conn: sqlite3.Connection) -> None:
+    """Create the vec0 KNN index if not already present. Idempotent.
+    `distance_metric=cosine` is non-default — sqlite-vec defaults to L2
+    (Euclidean). Probe confirmed cosine returns `1 - cos_sim` as the
+    distance. The id column is a regular INTEGER PRIMARY KEY so we can
+    JOIN back to `chunks` on the row's autoinc id.
+    """
+    conn.execute(
+        f"""
+        CREATE VIRTUAL TABLE IF NOT EXISTS vec_index USING vec0(
+            id INTEGER PRIMARY KEY,
+            embedding float[{EMBED_DIM}] distance_metric=cosine
+        )
+        """
+    )
 # NV-Embed-v2 service (primary, 4096-dim). URL/auth/path/body/response are
 # managed by the shared EmbedClient; PME_EMBED_PROVIDER (default openai)
 # selects auth scheme (Bearer vs X-API-Key) and request shape.
@@ -177,13 +236,25 @@ def get_http_client() -> httpx.AsyncClient:
 async def lifespan(_app: FastAPI) -> AsyncIterator[None]:
     """Open the neo4j driver + HTTP client at process startup, close on
     shutdown. Without this, the first request pays driver-open latency
-    and the driver is never properly closed on SIGTERM (leaking conns)."""
+    and the driver is never properly closed on SIGTERM (leaking conns).
+    Also schedules the vec_index backfill as a background task so the
+    proxy can start serving immediately while older chunks copy across
+    into the KNN index — first-time migration of ~450k rows takes
+    minutes and would otherwise block /health.
+    """
     global _neo4j_driver, _http_client
     _neo4j_driver = AsyncGraphDatabase.driver(NEO4J_URI, auth=NEO4J_AUTH)
     _http_client = httpx.AsyncClient(timeout=30.0)
+    backfill_task = asyncio.create_task(_backfill_vec_index())
     try:
         yield
     finally:
+        backfill_task.cancel()
+        try:
+            await backfill_task
+        except (asyncio.CancelledError, Exception):
+            pass
         if _neo4j_driver is not None:
             await _neo4j_driver.close()
             _neo4j_driver = None
@@ -192,6 +263,82 @@ async def lifespan(_app: FastAPI) -> AsyncIterator[None]:
             _http_client = None
+async def _backfill_vec_index() -> None:
+    """One-time migration: copy existing chunks.embedding (JSON) into
+    vec_index (f32 bytes).
+    Idempotent. Runs at proxy startup if vec_index has fewer rows than
+    chunks. Async so it doesn't block /health — the proxy serves
+    requests in parallel and search degrades to partial-corpus results
+    until the backfill finishes (any chunk already mirrored into
+    vec_index is findable; the rest are invisible to search but still
+    in L0/L3/L5/L6).
+    At 450k rows + ~460 rows/s insert rate this takes ~16 min on a
+    cold prod instance. Subsequent restarts no-op cleanly.
+    """
+    if sqlite_vec is None:
+        log.error("sqlite_vec module missing — backfill skipped, search will be degraded")
+        return
+    if not os.path.exists(QMD_DB_PATH):
+        log.info("vec_index backfill skipped — qmd.sqlite does not exist yet")
+        return
+    try:
+        conn = await asyncio.to_thread(_open_qmd_conn)
+        await asyncio.to_thread(_ensure_vec_index, conn)
+        chunks_n = conn.execute(
+            "SELECT count(*) FROM chunks WHERE embedding IS NOT NULL"
+        ).fetchone()[0]
+        vec_n = conn.execute("SELECT count(*) FROM vec_index").fetchone()[0]
+        if vec_n >= chunks_n:
+            log.info(f"vec_index backfill skipped — already in sync ({vec_n}/{chunks_n})")
+            conn.close()
+            return
+        missing = chunks_n - vec_n
+        log.info(f"vec_index backfill starting — {missing} rows to copy")
+        cursor = conn.execute(
+            """
+            SELECT c.id, c.embedding
+            FROM chunks c
+            LEFT JOIN vec_index v ON v.id = c.id
+            WHERE v.id IS NULL AND c.embedding IS NOT NULL
+            """
+        )
+        BATCH = 500
+        copied = 0
+        while True:
+            batch = await asyncio.to_thread(cursor.fetchmany, BATCH)
+            if not batch:
+                break
+            def _insert_batch() -> int:
+                inserted = 0
+                with conn:
+                    for cid, emb_json in batch:
+                        try:
+                            vec = json.loads(emb_json)
+                        except Exception:
+                            continue
+                        if len(vec) != EMBED_DIM:
+                            continue
+                        conn.execute(
+                            "INSERT INTO vec_index(id, embedding) VALUES (?, ?)",
+                            (cid, struct.pack(f"{len(vec)}f", *vec)),
+                        )
+                        inserted += 1
+                return inserted
+            copied += await asyncio.to_thread(_insert_batch)
+            log.info(f"vec_index backfill progress: {copied}/{missing}")
+            # Yield generously so /search + writers aren't starved.
+            await asyncio.sleep(0)
+        log.info(f"vec_index backfill done — {copied} rows copied")
+        conn.close()
+    except asyncio.CancelledError:
+        log.info("vec_index backfill cancelled during shutdown")
+        raise
+    except Exception as e:
+        log.error(f"vec_index backfill failed: {e}")
 app = FastAPI(title="Sequential HybridRAG Proxy", version="1.0.0", lifespan=lifespan)
 # ---------------------------------------------------------------------------
@@ -613,7 +760,15 @@ def cross_encoder_rerank(query: str, results: List[Dict], top_k: int = 16) -> Li
     return scored[:top_k] + remaining
 def search_qmd_informed(query: str, graph_context: Dict, limit: int = 12) -> List[Dict]:
-    """Phase 2: QMD vector search informed by graph results."""
+    """Phase 2: QMD vector search via sqlite-vec MATCH.
+    Replaces the legacy Python cosine loop over JSON-serialised embeddings
+    (which also had an `ORDER BY id LIMIT 2000` bug — only the OLDEST
+    2000 rows were ever considered, so 99%+ of the corpus was invisible to
+    search at production scale). Now: native KNN over the vec0 index,
+    full-corpus top-k. Wall time at 450k rows: ~50ms native MATCH vs
+    ~15s timeout previously.
+    """
     if not os.path.exists(QMD_DB_PATH):
         return []
@@ -621,69 +776,64 @@ def search_qmd_informed(query: str, graph_context: Dict, limit: int = 12) -> Lis
     if not query_embedding:
         return []
-    # Enhance query with graph entities for better vector search
     enhanced_query = query
     if graph_context["graph_entities"]:
         enhanced_query += " " + " ".join(graph_context["graph_entities"][:3])
-    enhanced_embedding = get_embedding(enhanced_query)
-    if not enhanced_embedding:
-        enhanced_embedding = query_embedding
+    enhanced_embedding = get_embedding(enhanced_query) or query_embedding
+    if len(enhanced_embedding) != EMBED_DIM:
+        # Dim mismatch vs vec0 DDL — the MATCH would error inside sqlite-vec.
+        # Bail with a loud log; an embedding-model mismatch in prod is the
+        # likely root cause and silent degradation would hide it.
+        log.error(
+            f"QMD search: query dim {len(enhanced_embedding)} != vec_index dim "
+            f"{EMBED_DIM} — embedding model mismatch?"
+        )
+        return []
+    qbytes = struct.pack(f"{len(enhanced_embedding)}f", *enhanced_embedding)
     try:
-        conn = sqlite3.connect(QMD_DB_PATH, timeout=5)
-        conn.row_factory = sqlite3.Row
-        # Get vectors and compute similarity
-        rows = conn.execute("""
-            SELECT id, path, text, embedding
-            FROM chunks
-            WHERE embedding IS NOT NULL
-            ORDER BY id
-            LIMIT 2000
-        """).fetchall()
+        conn = _open_qmd_conn()
+        # Pull a candidate pool larger than `limit` so entity-boost
+        # re-ranking has material to work with — 4× limit, floor 50.
+        k_pool = max(limit * 4, 50)
+        rows = conn.execute(
+            """
+            SELECT c.id, c.path, c.text, v.distance
+            FROM vec_index v
+            JOIN chunks c ON c.id = v.id
+            WHERE v.embedding MATCH ? AND k = ?
+            ORDER BY v.distance
+            """,
+            (qbytes, k_pool),
+        ).fetchall()
+        conn.close()
         results = []
-        for row in rows:
-            try:
-                # Deserialize embedding
-                embedding_data = row["embedding"]
-                if isinstance(embedding_data, str):
-                    embedding = json.loads(embedding_data)
-                else:
-                    embedding = list(embedding_data)
-                # Cosine similarity with enhanced query
-                dot = sum(a * b for a, b in zip(enhanced_embedding, embedding))
-                norm_q = sum(x * x for x in enhanced_embedding) ** 0.5
-                norm_e = sum(x * x for x in embedding) ** 0.5
-                if norm_q > 0 and norm_e > 0:
-                    similarity = dot / (norm_q * norm_e)
-                    # Boost score if path contains graph entities
-                    entity_boost = 0
-                    path_lower = row["path"].lower()
-                    for entity in graph_context["graph_entities"]:
-                        if entity.lower() in path_lower or entity.lower() in row["text"].lower():
-                            entity_boost = GRAPH_PRIORITY_BOOST
-                            break
-                    final_score = (similarity * VECTOR_BASE_WEIGHT) + entity_boost
-                    if similarity > 0.2:  # Threshold for inclusion
-                        results.append({
-                            "path": row["path"],
-                            "text": row["text"][:600],
-                            "score": final_score,
-                            "source": "vector",
-                            "base_similarity": similarity,
-                            "entity_boost": entity_boost
-                        })
-            except Exception as e:
-                logging.debug(f"Suppressed: {e}")
-        conn.close()
+        for row_id, path, text, distance in rows:
+            # vec0 distance_metric=cosine returns `1 - cos_sim` —
+            # invert to align with the rest of the codebase's `similarity`
+            # convention (1.0 = identical, 0.0 = orthogonal).
+            similarity = 1.0 - distance
+            if similarity <= 0.2:
+                continue
+            entity_boost = 0
+            path_lower = (path or "").lower()
+            text_lower = (text or "").lower()
+            for entity in graph_context["graph_entities"]:
+                el = entity.lower()
+                if el in path_lower or el in text_lower:
+                    entity_boost = GRAPH_PRIORITY_BOOST
+                    break
+            final_score = (similarity * VECTOR_BASE_WEIGHT) + entity_boost
+            results.append({
+                "path": path,
+                "text": (text or "")[:600],
+                "score": final_score,
+                "source": "vector",
+                "base_similarity": similarity,
+                "entity_boost": entity_boost,
+            })
         results.sort(key=lambda x: x["score"], reverse=True)
         return results[:limit]
@@ -1598,7 +1748,11 @@ async def index_internal_batch(req: IndexInternalBatchRequest) -> dict:
             log.warning(f"L4 embed count mismatch: {len(embeddings)} != {len(norm)}")
         qmd_db = Path(QMD_DB_PATH)
         qmd_db.parent.mkdir(parents=True, exist_ok=True)
-        conn = sqlite3.connect(str(qmd_db), timeout=10)
+        # Open with sqlite-vec loaded so we can dual-write to vec_index
+        # below. If extension load fails, vec_index inserts silently no-op
+        # via the try/except — chunks (JSON) still gets the write so the
+        # corpus stays whole; search just degrades to the old path.
+        conn = _open_qmd_conn()
         conn.execute("PRAGMA journal_mode=WAL")
         conn.execute("""
             CREATE TABLE IF NOT EXISTS chunks (
@@ -1612,14 +1766,33 @@ async def index_internal_batch(req: IndexInternalBatchRequest) -> dict:
                 created_at TEXT
             )
         """)
+        try:
+            _ensure_vec_index(conn)
+        except Exception as e:
+            log.error(f"vec_index DDL failed: {e} — falling back to chunks-only write")
         for n, vec in zip(norm, embeddings):
             if not vec:
                 continue
-            conn.execute(
+            cur = conn.execute(
                 "INSERT INTO chunks (path, text, embedding, embedding_model, embedding_dim, chunk_index, created_at) VALUES (?, ?, ?, ?, ?, ?, ?)",
                 (f"bench/{arena}/{n['path']}.md", n["content"],
                  json.dumps(vec), "nv-embed-v2", len(vec), 0, now_iso),
             )
+            # Mirror into the vec0 KNN index so search_qmd_informed can
+            # MATCH on the f32-packed vector. Dim must match the vec0 DDL
+            # (EMBED_DIM); skip rows where the embedding shape disagrees
+            # so a single bad row doesn't poison the batch insert.
+            if cur.lastrowid is not None and len(vec) == EMBED_DIM:
+                try:
+                    conn.execute(
+                        "INSERT INTO vec_index(id, embedding) VALUES (?, ?)",
+                        (cur.lastrowid, struct.pack(f"{len(vec)}f", *vec)),
+                    )
+                except Exception as e:
+                    # vec_index dual-write is defensive — the JSON column
+                    # in chunks is still the source of truth until the
+                    # backfill task confirms vec_index is in sync.
+                    log.debug(f"vec_index insert skipped for row {cur.lastrowid}: {e}")
             l4_inserted += 1
         conn.commit()
         conn.close()

package/packages/memory-engine/tests/test_l2_qmd_vec_search.py ADDED Viewed

@@ -0,0 +1,280 @@
+"""Tests for the sqlite-vec-backed QMD search path in l2-hybridrag-proxy.
+Validates the migration from the legacy Python-cosine-over-JSON path
+(which had a silent `ORDER BY id LIMIT 2000` correctness bug — only
+the OLDEST 2000 chunks were ever considered) to native sqlite-vec
+KNN MATCH over a vec0 virtual table.
+Pure-Python tests — no Neo4j, no Milvus. The proxy module is loaded
+via importlib so we can call helpers and handlers directly, and
+QMD_DB_PATH is overridden to a tmp_path file per test.
+Run:
+    cd packages/memory-engine
+    .venv/bin/python -m pytest tests/test_l2_qmd_vec_search.py -v
+The tests skip cleanly when ``sqlite_vec`` is not importable — useful
+for unit-only runs on machines that don't have the wheel installed.
+"""
+from __future__ import annotations
+import importlib.util
+import json
+import struct
+import sys
+from pathlib import Path
+import pytest
+try:
+    import sqlite_vec  # noqa: F401
+    _SQLITE_VEC_OK = True
+except ImportError:
+    _SQLITE_VEC_OK = False
+_skip_no_sqlite_vec = pytest.mark.skipif(
+    not _SQLITE_VEC_OK,
+    reason="sqlite_vec wheel not installed in this venv",
+)
+ENGINE_ROOT = Path(__file__).resolve().parent.parent / "engine" / "services" / "l2"
+sys.path.insert(0, str(ENGINE_ROOT))
+@pytest.fixture(scope="module")
+def proxy_module():
+    """Load l2-hybridrag-proxy as a module. Same pattern as
+    test_channel_stat_reader / test_people_list_reader so the
+    module-load failure mode (missing deps) skips cleanly rather than
+    erroring."""
+    spec = importlib.util.spec_from_file_location(
+        "l2_proxy_module_qmd_vec",
+        ENGINE_ROOT / "l2-hybridrag-proxy.py",
+    )
+    assert spec and spec.loader
+    try:
+        mod = importlib.util.module_from_spec(spec)
+        spec.loader.exec_module(mod)
+    except ImportError:
+        pytest.skip("l2 proxy deps unavailable in this venv (fine for unit-only runs)")
+    return mod
+@pytest.fixture
+def qmd_db(tmp_path, proxy_module, monkeypatch):
+    """Per-test qmd.sqlite at a tmp path, with the proxy module pointed
+    at it. Yields the path so tests can run their own asserting queries
+    against it."""
+    db_path = tmp_path / "qmd.sqlite"
+    monkeypatch.setattr(proxy_module, "QMD_DB_PATH", str(db_path))
+    return db_path
+def _make_vec(seed: int, dim: int) -> list[float]:
+    """Deterministic synthetic embedding — small enough to test fast,
+    structured enough that nearest-neighbour relationships are stable
+    across runs. The first slot dominates the cosine direction so we
+    can build orthogonal-ish clusters by varying its sign + magnitude."""
+    import random as _r
+    rng = _r.Random(seed)
+    return [rng.gauss(0.0, 1.0) for _ in range(dim)]
+# ---------------------------------------------------------------------------
+# 1. vec_index MATCH semantics — sanity check the SDK glue against sqlite-vec.
+# ---------------------------------------------------------------------------
+@_skip_no_sqlite_vec
+def test_vec_index_match_returns_top_k(qmd_db, proxy_module) -> None:
+    """Insert N known vectors with a planted ringer, query with the
+    ringer's vector, assert the ringer is the top hit. This is the
+    minimum signal that ``_ensure_vec_index`` + native MATCH actually
+    work end-to-end against the dim our proxy is configured for."""
+    conn = proxy_module._open_qmd_conn()
+    proxy_module._ensure_vec_index(conn)
+    dim = proxy_module.EMBED_DIM
+    # 20 rows of noise + 1 planted ringer at id=999. Planted vector is
+    # near-orthogonal to the noise (which uses positive-slot dominance)
+    # by flipping the first slot's sign — confirms the cosine MATCH
+    # actually orders by similarity, not by row id.
+    for i in range(20):
+        v = _make_vec(seed=i + 1, dim=dim)
+        v[0] = abs(v[0]) + 10.0  # bias positive
+        conn.execute(
+            "INSERT INTO vec_index(id, embedding) VALUES (?, ?)",
+            (i + 1, struct.pack(f"{dim}f", *v)),
+        )
+    ringer = _make_vec(seed=999, dim=dim)
+    ringer[0] = -abs(ringer[0]) - 10.0  # bias negative — opposite cluster
+    conn.execute(
+        "INSERT INTO vec_index(id, embedding) VALUES (?, ?)",
+        (999, struct.pack(f"{dim}f", *ringer)),
+    )
+    conn.commit()
+    qbytes = struct.pack(f"{dim}f", *ringer)
+    rows = conn.execute(
+        """
+        SELECT id, distance
+        FROM vec_index
+        WHERE embedding MATCH ? AND k = ?
+        ORDER BY distance
+        """,
+        (qbytes, 5),
+    ).fetchall()
+    conn.close()
+    assert len(rows) == 5
+    top_id, top_dist = rows[0]
+    assert top_id == 999, f"expected ringer id=999, got {top_id} ({rows!r})"
+    # Cosine distance = 1 - cos_sim, so identity vector → ~0 distance.
+    # Ringer-vs-itself is exact, so we expect ~0 here; allow float32
+    # round-trip slop.
+    assert top_dist < 1e-3, f"ringer-vs-itself should be ~0, got {top_dist}"
+# ---------------------------------------------------------------------------
+# 2. search_qmd_informed uses vec_index, not the legacy JSON-cosine path.
+# ---------------------------------------------------------------------------
+@_skip_no_sqlite_vec
+def test_search_qmd_informed_uses_vec_index(qmd_db, proxy_module, monkeypatch) -> None:
+    """Full search path test: seed chunks + vec_index, mock
+    ``get_embedding`` to return a vector that matches the ringer,
+    assert the returned results are sourced from the vec_index JOIN
+    (which preserves path/text from chunks) and ranked by similarity.
+    This is the test that would fail if someone reverted the search
+    body to the legacy ``ORDER BY id LIMIT 2000`` path — because the
+    ringer's id is 999 (well outside the 2000-row prefix), the legacy
+    path would never see it."""
+    import sqlite3
+    conn = proxy_module._open_qmd_conn()
+    proxy_module._ensure_vec_index(conn)
+    conn.execute(
+        """
+        CREATE TABLE IF NOT EXISTS chunks (
+            id INTEGER PRIMARY KEY,
+            path TEXT,
+            text TEXT,
+            embedding TEXT,
+            embedding_model TEXT,
+            embedding_dim INTEGER,
+            chunk_index INTEGER,
+            created_at TEXT
+        )
+        """
+    )
+    dim = proxy_module.EMBED_DIM
+    # Noise rows 1..20 + planted ringer id=999. Same orthogonal-cluster
+    # setup as test 1 — guarantees the ringer wins on cosine.
+    for i in range(20):
+        v = _make_vec(seed=i + 1, dim=dim)
+        v[0] = abs(v[0]) + 10.0
+        conn.execute(
+            "INSERT INTO chunks(id, path, text, embedding) VALUES (?, ?, ?, ?)",
+            (i + 1, f"noise/{i}.md", f"noise text {i}", json.dumps(v)),
+        )
+        conn.execute(
+            "INSERT INTO vec_index(id, embedding) VALUES (?, ?)",
+            (i + 1, struct.pack(f"{dim}f", *v)),
+        )
+    ringer = _make_vec(seed=999, dim=dim)
+    ringer[0] = -abs(ringer[0]) - 10.0
+    conn.execute(
+        "INSERT INTO chunks(id, path, text, embedding) VALUES (?, ?, ?, ?)",
+        (999, "ringer/needle.md", "needle in the haystack", json.dumps(ringer)),
+    )
+    conn.execute(
+        "INSERT INTO vec_index(id, embedding) VALUES (?, ?)",
+        (999, struct.pack(f"{dim}f", *ringer)),
+    )
+    conn.commit()
+    conn.close()
+    # Mock get_embedding to return the ringer's vector for any query.
+    monkeypatch.setattr(proxy_module, "get_embedding", lambda *_a, **_kw: ringer)
+    out = proxy_module.search_qmd_informed(
+        "any query — get_embedding is mocked",
+        {"graph_entities": []},
+        limit=3,
+    )
+    assert out, "search returned empty; vec_index path must surface ringer"
+    top = out[0]
+    assert top["path"] == "ringer/needle.md", (
+        f"top hit should be the ringer at row 999; got {top['path']}. "
+        f"If this fails, the search may have reverted to the LIMIT 2000 "
+        f"legacy path which never sees row 999."
+    )
+    assert top["text"] == "needle in the haystack"
+    assert top["base_similarity"] > 0.9
+    assert top["source"] == "vector"
+# ---------------------------------------------------------------------------
+# 3. Backfill is idempotent — second run on a populated vec_index is no-op.
+# ---------------------------------------------------------------------------
+@_skip_no_sqlite_vec
+def test_backfill_idempotent(qmd_db, proxy_module) -> None:
+    """Seed chunks with JSON embeddings only (no vec_index rows),
+    call ``_backfill_vec_index`` twice, assert:
+      1. First call copies all rows into vec_index.
+      2. Second call observes vec_n >= chunks_n and is a no-op (no
+         duplicate inserts, no errors).
+    Catches the failure mode where a missing idempotency check would
+    INSERT duplicate ids on the second invocation, blow up the UNIQUE
+    constraint, and corrupt the index."""
+    import asyncio
+    conn = proxy_module._open_qmd_conn()
+    proxy_module._ensure_vec_index(conn)
+    conn.execute(
+        """
+        CREATE TABLE IF NOT EXISTS chunks (
+            id INTEGER PRIMARY KEY,
+            path TEXT,
+            text TEXT,
+            embedding TEXT,
+            embedding_model TEXT,
+            embedding_dim INTEGER,
+            chunk_index INTEGER,
+            created_at TEXT
+        )
+        """
+    )
+    dim = proxy_module.EMBED_DIM
+    N = 7
+    for i in range(N):
+        v = _make_vec(seed=i + 100, dim=dim)
+        conn.execute(
+            "INSERT INTO chunks(id, path, text, embedding) VALUES (?, ?, ?, ?)",
+            (i + 1, f"p/{i}.md", f"t{i}", json.dumps(v)),
+        )
+    conn.commit()
+    pre_chunks = conn.execute("SELECT count(*) FROM chunks").fetchone()[0]
+    pre_vec = conn.execute("SELECT count(*) FROM vec_index").fetchone()[0]
+    conn.close()
+    assert pre_chunks == N and pre_vec == 0, (
+        f"setup mismatch: chunks={pre_chunks}, vec={pre_vec}"
+    )
+    # First run — should copy all N rows.
+    asyncio.run(proxy_module._backfill_vec_index())
+    conn = proxy_module._open_qmd_conn()
+    mid_vec = conn.execute("SELECT count(*) FROM vec_index").fetchone()[0]
+    conn.close()
+    assert mid_vec == N, f"first backfill should copy all {N} rows, got {mid_vec}"
+    # Second run — must no-op cleanly. No exception, no duplicate inserts.
+    asyncio.run(proxy_module._backfill_vec_index())
+    conn = proxy_module._open_qmd_conn()
+    final_vec = conn.execute("SELECT count(*) FROM vec_index").fetchone()[0]
+    conn.close()
+    assert final_vec == N, (
+        f"second backfill should be no-op; got {final_vec} rows instead of {N}"
+    )