@pentatonic-ai/ai-agent-sdk 0.8.3 → 0.8.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "@pentatonic-ai/ai-agent-sdk",
-  "version": "0.8.3",
+  "version": "0.8.4",
   "description": "TES SDK — LLM observability and lifecycle tracking via Pentatonic Thing Event System. Track token usage, tool calls, and conversations. Manage things through event-sourced lifecycle stages with AI enrichment and vector search.",
   "type": "module",
   "main": "./dist/index.cjs",
@@ -453,21 +453,95 @@ async def health():
     if failures:
         out["status"] = "degraded" if failures < 3 else "down"
 
-    # Memory count: query L6 doc-store as authoritative. L6 /stats
-    # returns vector_chunks (Milvus) and fts_chunks (sqlite content
-    # table). Under healthy operation they're equal — take the max so
-    # the count is honest if one side is mid-rebuild.
+    # Per-layer chunk counts. Replaces the previous single `memories` int
+    # which only reflected L6's vector chunk count — misleading because
+    # L0/L4/L5 hold different (overlapping) projections of the corpus.
+    # Each layer is independently probed; transient failure on one layer
+    # leaves its slot null rather than zeroing the whole field.
+    memories: dict[str, int | None] = {
+        "l0_bm25_chunks": None,
+        "l4_vectors": None,
+        "l5_chats_chunks": None,
+        "l6_vector_chunks": None,
+        "l6_fts_chunks": None,
+    }
+    # L0 lives inside L2; L2 exposes /index-internal-stats with the counts.
+    try:
+        r = await _client().get(f"{L2_PROXY_URL}/index-internal-stats", timeout=3.0)
+        if r.status_code == 200:
+            stats = r.json()
+            memories["l0_bm25_chunks"] = int(stats.get("l0_chunks") or 0)
+    except Exception:
+        pass
+    # L4 reports n_vectors on its own /health.
+    try:
+        r = await _client().get(f"{L4_VEC_URL}/health", timeout=3.0)
+        if r.status_code == 200:
+            memories["l4_vectors"] = int(r.json().get("n_vectors") or 0)
+    except Exception:
+        pass
+    # L5 reports per-collection counts on /health. We surface chats —
+    # the only collection currently populated; the emails/contacts/memory
+    # collections come online with the L5 collection bootstrap.
+    try:
+        r = await _client().get(f"{L5_MILVUS_URL}/health", timeout=3.0)
+        if r.status_code == 200:
+            colls = r.json().get("collections") or {}
+            chats = colls.get("chats") if isinstance(colls, dict) else None
+            if isinstance(chats, dict):
+                memories["l5_chats_chunks"] = int(chats.get("count") or 0)
+    except Exception:
+        pass
+    # L6 exposes vector vs fts splits on /stats.
     try:
         r = await _client().get(f"{L6_DOC_URL}/stats", timeout=3.0)
         if r.status_code == 200:
             stats = r.json()
-            out["memories"] = max(
-                int(stats.get("vector_chunks") or 0),
-                int(stats.get("fts_chunks") or 0),
-                int(stats.get("total_chunks") or 0),
-            )
+            memories["l6_vector_chunks"] = int(stats.get("vector_chunks") or 0)
+            memories["l6_fts_chunks"] = int(stats.get("fts_chunks") or 0)
     except Exception:
-        out["memories"] = None
+        pass
+    out["memories"] = memories
+    return out
+
+
+@app.get("/health/deep")
+async def health_deep():
+    """Aggregate functional probe — fans out to each layer's /health/deep
+    and reports per-layer ok/status.
+
+    Where /health checks "process up + port answering", /health/deep
+    actually exercises embed → write → search round-trips on each layer
+    that supports it. Catches the class of bug that masquerades as
+    "healthy" — request handlers 500'ing while the process stays up.
+
+    Slower than /health (~1–2s); intended for ops/monitoring/cron use,
+    not the deploy gate or compose healthcheck.
+    """
+    import asyncio
+    out: dict[str, Any] = {"status": "ok", "ok": True, "layers": {}}
+
+    async def _probe_deep(name: str, url: str) -> tuple[str, dict]:
+        try:
+            r = await _client().get(url, timeout=15.0)
+            if r.status_code != 200:
+                return name, {"ok": False, "status": f"http {r.status_code}"}
+            return name, r.json()
+        except Exception as exc:
+            return name, {"ok": False, "status": f"unreachable: {type(exc).__name__}"}
+
+    results = await asyncio.gather(
+        _probe_deep("l4", f"{L4_VEC_URL}/health/deep"),
+        _probe_deep("l5", f"{L5_MILVUS_URL}/health/deep"),
+        _probe_deep("l6", f"{L6_DOC_URL}/health/deep"),
+    )
+    for name, body in results:
+        out["layers"][name] = body
+        if not body.get("ok", False):
+            out["ok"] = False
+
+    if not out["ok"]:
+        out["status"] = "degraded"
     return out
 
 
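The reworked /health payload changes the contract for monitors: `memories` is now a dict where null means "probe failed", not "zero rows", and /health/deep reports a per-layer verdict. A minimal consumer, as a sketch only (the base URL and the print-based alerting are assumptions for illustration, not part of this package):

    import asyncio
    import httpx

    BASE_URL = "http://127.0.0.1:8031"  # assumed gateway address

    async def check() -> None:
        async with httpx.AsyncClient(timeout=20.0) as client:
            shallow = (await client.get(f"{BASE_URL}/health")).json()
            # None = probe failed (layer mid-rebuild or down); 0 = layer answered: empty.
            for layer, count in (shallow.get("memories") or {}).items():
                if count is None:
                    print(f"warn: {layer} count unavailable")

            deep = (await client.get(f"{BASE_URL}/health/deep")).json()
            for name, body in deep.get("layers", {}).items():
                if not body.get("ok", False):
                    print(f"alert: {name} failed deep probe: {body.get('status')}")

    asyncio.run(check())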
@@ -12,6 +12,7 @@ Port: 8031 (replaces neo4j-qmd-proxy.py)
 """
 
 import argparse
+import asyncio
 import hashlib
 import json
 import logging
@@ -19,14 +20,16 @@ import os
 import sqlite3
 import sys
 import time
+from contextlib import asynccontextmanager
 from datetime import datetime
 from pathlib import Path
-from typing import Any, Dict, List, Optional, Set
+from typing import Any, AsyncIterator, Dict, List, Optional, Set
 
 import re
+import httpx
 import requests
 from fastapi import FastAPI, HTTPException, Request
-from neo4j import GraphDatabase
+from neo4j import AsyncGraphDatabase
 from neo4j.time import DateTime as Neo4jDateTime, Date as Neo4jDate
 from pydantic import BaseModel
 import uvicorn
@@ -129,7 +132,67 @@ TRACKER_FILE = WORKSPACE / "memory" / "memory-tracker.jsonl"
 logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
 log = logging.getLogger("sequential-hybridrag")
 
-app = FastAPI(title="Sequential HybridRAG Proxy", version="1.0.0")
+
+# ---------------------------------------------------------------------------
+# Async driver + HTTP client singletons
+# ---------------------------------------------------------------------------
+#
+# v0.8.4: migrated from sync neo4j (`GraphDatabase`) to async
+# (`AsyncGraphDatabase`) and from per-call drivers to one process-wide
+# pool. Previously every handler created a fresh sync driver inline,
+# which (a) blocked the event loop on connection establishment and
+# every query, and (b) held one thread per in-flight request. Under
+# sustained ingest the threadpool exhausted and /health itself timed
+# out (the `l0/l1/l2: unreachable: ReadTimeout` we saw in prod).
+#
+# The async driver multiplexes many queries through bolt without
+# blocking the event loop; one shared pool means connection
+# establishment is amortised. fastapi lifespan handles open/close.
+
+_neo4j_driver: "AsyncGraphDatabase | None" = None
+_http_client: "httpx.AsyncClient | None" = None
+
+
+def get_neo4j_driver():
+    """Lazy module-level singleton — created at first call (which the
+    lifespan handler does at startup). Sharing across requests is the
+    documented neo4j Python-driver pattern; the driver itself is
+    thread- and task-safe."""
+    global _neo4j_driver
+    if _neo4j_driver is None:
+        _neo4j_driver = AsyncGraphDatabase.driver(NEO4J_URI, auth=NEO4J_AUTH)
+    return _neo4j_driver
+
+
+def get_http_client() -> httpx.AsyncClient:
+    """Shared async HTTP client for L4/L5/L6 fan-out and embedding-proxy
+    pass-through. Reuses TCP connections via httpx's built-in pool."""
+    global _http_client
+    if _http_client is None:
+        _http_client = httpx.AsyncClient(timeout=30.0)
+    return _http_client
+
+
+@asynccontextmanager
+async def lifespan(_app: FastAPI) -> AsyncIterator[None]:
+    """Open the neo4j driver + HTTP client at process startup, close on
+    shutdown. Without this, the first request pays driver-open latency
+    and the driver is never properly closed on SIGTERM (leaking conns)."""
+    global _neo4j_driver, _http_client
+    _neo4j_driver = AsyncGraphDatabase.driver(NEO4J_URI, auth=NEO4J_AUTH)
+    _http_client = httpx.AsyncClient(timeout=30.0)
+    try:
+        yield
+    finally:
+        if _neo4j_driver is not None:
+            await _neo4j_driver.close()
+            _neo4j_driver = None
+        if _http_client is not None:
+            await _http_client.aclose()
+            _http_client = None
+
+
+app = FastAPI(title="Sequential HybridRAG Proxy", version="1.0.0", lifespan=lifespan)
 
 # ---------------------------------------------------------------------------
 # Memory Usage Tracking
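The handler-side pattern these singletons enable is worth spelling out: sessions are opened per request against the shared pool, and nothing is closed in the handler. A hypothetical endpoint as a sketch (the route and query are illustrative, not package code):

    @app.get("/example-count")
    async def example_count() -> dict:
        # Session per request; the driver and its pool are shared and
        # never closed here (lifespan owns teardown).
        async with get_neo4j_driver().session() as session:
            result = await session.run("MATCH (n) RETURN count(n) AS n")
            rec = await result.single()
        # Shared httpx client: reuses pooled TCP connections.
        r = await get_http_client().get(f"{L5_API_URL}/health")
        return {"nodes": rec["n"], "l5_up": r.status_code == 200}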
@@ -267,7 +330,7 @@ def extract_query_entities(query: str) -> List[str]:
     log.info(f"Extracted entities: {potential_entities}")
     return potential_entities
 
-def _hebbian_strengthen(session, arenas: List[str], node_names: List[str], increment: float = 0.05) -> None:
+async def _hebbian_strengthen(session, arenas: List[str], node_names: List[str], increment: float = 0.05) -> None:
     """Hebbian: strengthen edges between co-accessed nodes during query.
 
     Scoped by arena so a search inside tenant A can't reinforce edges
@@ -281,7 +344,7 @@ def _hebbian_strengthen(session, arenas: List[str], node_names: List[str], incre
     for i, n1 in enumerate(node_names):
         for n2 in node_names[i+1:]:
            try:
-                session.run(
+                await session.run(
                     """MATCH (a {name: $n1})-[r]-(b {name: $n2})
                     WHERE a.arena IN $arenas AND b.arena IN $arenas
                     SET r.weight = coalesce(r.weight, 1.0) + $inc,
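Numerically the rule is tiny: `coalesce(r.weight, 1.0) + $inc` lifts an untouched edge from the 1.0 baseline to 1.05 on first co-access, and each further co-access adds the same increment. A toy restatement in plain Python, for illustration only:

    def hebbian_weight(co_accesses: int, increment: float = 0.05) -> float:
        """Weight of an edge after `co_accesses` co-accesses, starting unset."""
        weight = None
        for _ in range(co_accesses):
            weight = (weight if weight is not None else 1.0) + increment
        return 1.0 if weight is None else weight

    assert abs(hebbian_weight(1) - 1.05) < 1e-9
    assert abs(hebbian_weight(10) - 1.50) < 1e-9  # frequently co-accessed pairs pull ahead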
@@ -292,7 +355,7 @@ def _hebbian_strengthen(session, arenas: List[str], node_names: List[str], incre
                pass  # non-critical
 
 
-def search_neo4j_sequential(query: str, entities: List[str], arenas: List[str], limit: int = 12) -> Dict:
+async def search_neo4j_sequential(query: str, entities: List[str], arenas: List[str], limit: int = 12) -> Dict:
     """Phase 1: Neo4j graph search with spreading activation + Hebbian.
 
     `arenas` is the tenant-scope set the caller is authorised for —
@@ -306,11 +369,11 @@ def search_neo4j_sequential(query: str, entities: List[str], arenas: List[str],
         log.warning("search_neo4j_sequential called without arenas — returning empty results")
         return {"results": [], "graph_entities": [], "entity_count": 0}
     try:
-        driver = GraphDatabase.driver(NEO4J_URI, auth=NEO4J_AUTH)
+        driver = get_neo4j_driver()
         results = []
         graph_entities = set()
 
-        with driver.session() as session:
+        async with driver.session() as session:
             # Search for specific entities — use weighted spreading activation
             for entity in entities:
                 # Direct match first — arena-scoped on every node we touch.
@@ -326,9 +389,9 @@ def search_neo4j_sequential(query: str, entities: List[str], arenas: List[str],
                     LIMIT $limit
                 """
 
-                records = session.run(cypher, entity=entity, arenas=arenas, limit=8)
+                result = await session.run(cypher, entity=entity, arenas=arenas, limit=8)
 
-                for record in records:
+                async for record in result:
                     node = _serialize_neo4j_value(dict(record["n"]))
                     rel = record["r"]
                     connected = record["connected"]
@@ -366,7 +429,7 @@ def search_neo4j_sequential(query: str, entities: List[str], arenas: List[str],
                 # the filter, an activation could walk into another
                 # tenant's graph via a name-collision on the start node.
                 if entity:
-                    activation_results = session.run("""
+                    activation_results = await session.run("""
                         MATCH (start)-[r1]-(mid)-[r2]-(end)
                         WHERE start.name CONTAINS $entity
                         AND start.arena IN $arenas
@@ -382,7 +445,7 @@ def search_neo4j_sequential(query: str, entities: List[str], arenas: List[str],
                         LIMIT 5
                     """, entity=entity, arenas=arenas)
 
-                    for rec in activation_results:
+                    async for rec in activation_results:
                         end_node = _serialize_neo4j_value(dict(rec["end"])) if rec["end"] else {}
                         name = end_node.get("name", "")
                         if name and name not in graph_entities:
@@ -413,9 +476,9 @@ def search_neo4j_sequential(query: str, entities: List[str], arenas: List[str],
                     LIMIT $limit
                 """
 
-                records = session.run(cypher, term=word, arenas=arenas, limit=4)
+                result = await session.run(cypher, term=word, arenas=arenas, limit=4)
 
-                for record in records:
+                async for record in result:
                     node = _serialize_neo4j_value(dict(record["n"]))
                     context = f"Related: {node}"
                     graph_entities.add(node.get('name', ''))
@@ -430,9 +493,9 @@ def search_neo4j_sequential(query: str, entities: List[str], arenas: List[str],
                 })
 
             # Hebbian: strengthen edges between all accessed entities
-            _hebbian_strengthen(session, arenas, list(graph_entities))
+            await _hebbian_strengthen(session, arenas, list(graph_entities))
 
-        driver.close()
+        # Note: driver is a module-level singleton, do NOT close here.
 
         return {
             "results": results[:limit],
@@ -861,8 +924,8 @@ def search_l0_bm25(query: str, limit: int = 6, arena: str = None,
 
 L5_API_URL = os.environ.get("PME_L5_URL", "http://127.0.0.1:8034")
 
-def search_l5_communications(query: str, limit: int = 6, arena: str = None,
-                             arenas: List[str] = None) -> List[Dict]:
+async def search_l5_communications(query: str, limit: int = 6, arena: str = None,
+                                   arenas: List[str] = None) -> List[Dict]:
     """Search L5 Communications Context via L5 API (emails, chats, calendar).
 
     arena / arenas (optional): forwarded to L5; filters Milvus by the
@@ -876,7 +939,7 @@ def search_l5_communications(query: str, limit: int = 6, arena: str = None,
     params: list = [("q", query), ("limit", str(limit))]
     for a in arena_list:
         params.append(("arenas", a))
-    resp = requests.get(
+    resp = await get_http_client().get(
         f"{L5_API_URL}/search",
         params=params,
         timeout=10,
@@ -922,8 +985,8 @@ def search_l5_communications(query: str, limit: int = 6, arena: str = None,
 # L6: Document Store Search
 L6_URL = os.environ.get("PME_L6_URL", "http://localhost:8037")
 
-def search_l6_documents(query: str, limit: int = 6, arena: str = None,
-                        arenas: List[str] = None) -> List[Dict]:
+async def search_l6_documents(query: str, limit: int = 6, arena: str = None,
+                              arenas: List[str] = None) -> List[Dict]:
     """Search L6 Document Store (research, legal, financial, project docs).
 
     arena / arenas (optional): forwarded to L6 — L6 supports multi-arena
@@ -939,7 +1002,7 @@ def search_l6_documents(query: str, limit: int = 6, arena: str = None,
     ]
     for a in arena_list:
         params.append(("arenas", a))
-    resp = requests.get(
+    resp = await get_http_client().get(
         f"{L6_URL}/search",
         params=params,
         timeout=10,
@@ -986,50 +1049,52 @@ def search_l6_documents(query: str, limit: int = 6, arena: str = None,
     return []
 
 
-def sequential_hybridrag_search(query: str, limit: int = 16,
-                                arena: str = None,
-                                arenas: List[str] = None) -> List[Dict]:
+async def sequential_hybridrag_search(query: str, limit: int = 16,
+                                      arena: str = None,
+                                      arenas: List[str] = None) -> List[Dict]:
     """Main HybridRAG processing: L0 BM25 → L1 System Files → L2 HybridRAG (L3 Graph + L4 Vector + L5 Comms + L6 Docs).
 
     arena / arenas (optional): tenant + user scope. Multi-arena lets a
     user's search span tenant-wide rows + their own user-scoped rows in
     a single hybrid pass. Forwarded to L0, L5, L6 native filters; L4
     and L3 still rely on the compat shim post-filter.
+
+    Async since v0.8.4: independent layer fan-out runs concurrently via
+    asyncio.gather; sync workers (sqlite, file I/O, PyTorch reranker)
+    are dispatched to threads via asyncio.to_thread to keep the event
+    loop responsive under sustained ingest.
     """
     arena_list = list(arenas) if arenas else ([arena] if arena else [])
     start_time = time.time()
     log.info(f"Starting sequential HybridRAG search for: '{query}' arenas={arena_list!r}")
 
-    # L0: BM25 workspace memory (keyword search complements semantic layers)
-    l0_results = search_l0_bm25(query, limit=6, arenas=arena_list)
+    # L0 (sqlite) + L1 (file I/O) + entity extraction run in parallel.
+    # Each is sync — offloaded to the threadpool so the event loop stays
+    # free for the in-flight L3/L5/L6 calls.
+    l0_results, system_results, entities = await asyncio.gather(
+        asyncio.to_thread(search_l0_bm25, query, 6, None, arena_list),
+        asyncio.to_thread(search_core_memory_files, query, 4),
+        asyncio.to_thread(extract_query_entities, query),
+    )
     log.info(f"L0 BM25 workspace: {len(l0_results)} results")
-
-    # L1: System Files (HIGHEST PRIORITY)
-    system_results = search_core_memory_files(query, limit=4)
     log.info(f"L1 System files: {len(system_results)} results")
 
-    # L2: HybridRAG orchestration
-    # L3: Graph search (entity extraction + Neo4j) — arena-scoped so a
-    # tenant's search can never traverse another tenant's entity graph
-    # via name collisions on shared :Entity nodes. The post-filter shim
-    # protects chunks; this protects the entity-walking layer too.
-    entities = extract_query_entities(query)
-    graph_context = search_neo4j_sequential(query, entities, arena_list, limit=8)
+    # L3: Graph search (now native async via AsyncGraphDatabase).
+    graph_context = await search_neo4j_sequential(query, entities, arena_list, limit=8)
     log.info(f"L3 Graph search: {len(graph_context['results'])} results, {graph_context['entity_count']} entities")
 
-    # HyDE: expand query for better vector embeddings
-    hyde_query = hyde_expand(query)
+    # HyDE expansion is sync (LLM call), offload to thread.
+    hyde_query = await asyncio.to_thread(hyde_expand, query)
 
-    # L4: Vector search (informed by L3 graph context + HyDE)
-    vector_results = search_qmd_informed(hyde_query, graph_context, limit=8)
+    # L4/L5/L6 fan out concurrently: L4 (sqlite) via to_thread, L5/L6
+    # native async via httpx.AsyncClient.
+    vector_results, l5_results, l6_results = await asyncio.gather(
+        asyncio.to_thread(search_qmd_informed, hyde_query, graph_context, 8),
+        search_l5_communications(hyde_query, limit=6, arenas=arena_list),
+        search_l6_documents(hyde_query, limit=6, arenas=arena_list),
    )
     log.info(f"L4 Vector search: {len(vector_results)} results (HyDE={'on' if hyde_query != query else 'off'})")
-
-    # L5: Communications Context (emails, chats, calendar) — also use HyDE
-    l5_results = search_l5_communications(hyde_query, limit=6, arenas=arena_list)
     log.info(f"L5 Communications: {len(l5_results)} results")
-
-    # L6: Document Store (research, legal, financial, project docs)
-    l6_results = search_l6_documents(hyde_query, limit=6, arenas=arena_list)
     log.info(f"L6 Documents: {len(l6_results)} results")
 
     # L2: HybridRAG fusion (combines all layers with L1 priority)
1046
1111
  # Sort by layer priority: L1 System (1.0) > L3 Graph (0.9) > L4 Vector (0.7+)
1047
1112
  deduplicated.sort(key=lambda x: x["score"], reverse=True)
1048
1113
 
1049
- # Cross-encoder reranking: re-embed top results and blend scores
1050
- deduplicated = cross_encoder_rerank(query, deduplicated, top_k=limit)
1114
+ # Cross-encoder reranking: re-embed top results and blend scores.
1115
+ # PyTorch CrossEncoder.predict is sync — offload to thread.
1116
+ deduplicated = await asyncio.to_thread(cross_encoder_rerank, query, deduplicated, limit)
1051
1117
 
1052
1118
  # Track layer usage for evolution
1053
1119
  search_time_ms = (time.time() - start_time) * 1000
@@ -1094,7 +1160,7 @@ async def search_endpoint(request: Request) -> dict:
     if not query:
         raise HTTPException(status_code=400, detail="query is required")
 
-    results = sequential_hybridrag_search(query, limit=limit, arena=arena, arenas=arenas)
+    results = await sequential_hybridrag_search(query, limit=limit, arena=arena, arenas=arenas)
 
     # Also return raw graph entities for context enrichment.
     # Same arena scope as the cascade search above — without it
@@ -1172,7 +1238,7 @@ async def chat_completions(request: ChatCompletionRequest) -> dict:
     # empty when no arenas are supplied; callers that need L3 must
     # pass `arena` or `arenas` on the request body.
     start_time = time.time()
-    results = sequential_hybridrag_search(
+    results = await sequential_hybridrag_search(
         query, limit=16, arena=request.arena, arenas=request.arenas,
     )
     search_time = time.time() - start_time
@@ -1244,33 +1310,35 @@ async def check_contradictions(node_name: str, arena: Optional[str] = None) -> d
            detail="arena query parameter is required to scope contradiction lookup",
         )
     try:
-        driver = GraphDatabase.driver(NEO4J_URI, auth=NEO4J_AUTH)
+        driver = get_neo4j_driver()
         contradictions = []
-        with driver.session() as session:
+        async with driver.session() as session:
             # Find the node — must be in the caller's arena.
-            node = session.run(
+            result = await session.run(
                 """MATCH (n) WHERE toLower(n.name) = toLower($name) AND n.arena = $arena
                 RETURN elementId(n) AS id""",
                 name=node_name, arena=arena,
-            ).single()
+            )
+            node = await result.single()
             if not node:
                 return {"node": node_name, "contradictions": [], "error": "Node not found"}
             nid = node["id"]
 
             # Explicit CONTRADICTS — both endpoints must be in the same arena.
-            for rec in session.run(
+            result = await session.run(
                 """MATCH (a)-[r:CONTRADICTS]-(b)
                 WHERE elementId(a) = $nid AND b.arena = $arena
                 RETURN a.name AS a, b.name AS b, r.reason AS reason""",
                 nid=nid, arena=arena,
-            ):
+            )
+            async for rec in result:
                 contradictions.append({"type": "explicit", "a": rec["a"], "b": rec["b"], "reason": rec["reason"]})
 
             # Property conflicts via shared neighbour — every node along
             # the (a)--(shared)--(b) path filtered by arena so a shared
             # neighbour from another tenant can't trigger a false-positive
             # conflict in this tenant's view.
-            for rec in session.run(
+            result = await session.run(
                 """MATCH (a)--(shared)--(b)
                 WHERE elementId(a) = $nid AND a <> b
                 AND shared.arena = $arena AND b.arena = $arena
@@ -1281,28 +1349,28 @@ async def check_contradictions(node_name: str, arena: Optional[str] = None) -> d
                 WHERE size(ck) > 0
                 RETURN a.name AS a, b.name AS b, shared.name AS via, ck
                 LIMIT 10""", nid=nid, arena=arena,
-            ):
+            )
+            async for rec in result:
                 contradictions.append({
                     "type": "property_conflict", "a": rec["a"], "b": rec["b"],
                     "via": rec["via"], "conflicting_keys": rec["ck"]
                 })
-        driver.close()
         return {"node": node_name, "contradictions": contradictions, "count": len(contradictions)}
     except Exception as e:
         raise HTTPException(status_code=500, detail=str(e))
 
-def _check_l5_health() -> bool:
+async def _check_l5_health() -> bool:
     """Quick check if L5 Communications API is responding."""
     try:
-        resp = requests.get(f"{L5_API_URL}/health", timeout=3)
+        resp = await get_http_client().get(f"{L5_API_URL}/health", timeout=3)
         return resp.status_code == 200
     except Exception:
         return False
 
-def _check_l6_health() -> bool:
+async def _check_l6_health() -> bool:
     """Quick check if L6 Document Store is responding."""
     try:
-        resp = requests.get(f"{L6_URL}/health", timeout=3)
+        resp = await get_http_client().get(f"{L6_URL}/health", timeout=3)
         return resp.status_code == 200 and resp.json().get("status") in ("ok", "degraded")
     except Exception:
         return False
@@ -1323,16 +1391,15 @@ async def health() -> dict:
 
     neo4j_healthy = False
     try:
-        driver = GraphDatabase.driver(NEO4J_URI, auth=NEO4J_AUTH)
-        with driver.session() as session:
-            session.run("RETURN 1")
+        driver = get_neo4j_driver()
+        async with driver.session() as session:
+            await session.run("RETURN 1")
         neo4j_healthy = True
-        driver.close()
     except Exception as e:
         logging.debug(f"Suppressed: {e}")
 
-    l5_reachable = _check_l5_health()
-    l6_reachable = _check_l6_health()
+    l5_reachable = await _check_l5_health()
+    l6_reachable = await _check_l6_health()
 
     # Top-level status: degrade only on layers L2 is the sole gatekeeper for.
     # L5/L6 are independent services probed by the compat shim.
@@ -1411,15 +1478,19 @@ def _extract_entities_for_kg(text: str, max_entities: int = 32) -> List[str]:
     return found[:max_entities]
 
 
-def _embed_batch_local(texts: List[str]) -> List[List[float]]:
-    """Batch embed via the shared EmbedClient. Returns vectors in input order."""
+async def _embed_batch_local(texts: List[str]) -> List[List[float]]:
+    """Batch embed via the shared EmbedClient (async). Returns vectors in input order."""
     if not texts:
         return []
     try:
-        return _embed_client().embed_batch(texts)
+        return await _embed_client().embed_batch_async(texts)
     except Exception as e:
         log.warning(f"NV-Embed batch failed: {e}; trying singletons")
-        return [get_embedding(t) for t in texts]
+        # Singleton fallback stays sync (each one-shot embed is small);
+        # offload to thread so we don't block the loop.
+        return await asyncio.gather(
+            *(asyncio.to_thread(get_embedding, t) for t in texts)
+        )
 
 
 class IndexInternalBatchRequest(BaseModel):
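The docstring's "vectors in input order" promise survives the fallback because asyncio.gather returns results in argument order, regardless of completion order. A quick demonstration of that property (sketch):

    import asyncio, random, time

    def slow_embed(text: str) -> str:
        time.sleep(random.uniform(0, 0.05))  # completion order is random...
        return f"vec({text})"

    async def main() -> None:
        texts = ["a", "b", "c", "d"]
        vecs = await asyncio.gather(*(asyncio.to_thread(slow_embed, t) for t in texts))
        assert vecs == [f"vec({t})" for t in texts]  # ...result order is not

    asyncio.run(main())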
@@ -1504,7 +1575,7 @@ async def index_internal_batch(req: IndexInternalBatchRequest) -> dict:
     # ---- L4 QMD vec (qmd.sqlite) ----------------------------------------
     l4_inserted = 0
     try:
-        embeddings = _embed_batch_local([n["content"] for n in norm])
+        embeddings = await _embed_batch_local([n["content"] for n in norm])
         if len(embeddings) != len(norm):
             log.warning(f"L4 embed count mismatch: {len(embeddings)} != {len(norm)}")
         qmd_db = Path(QMD_DB_PATH)
@@ -1560,30 +1631,30 @@ async def index_internal_batch(req: IndexInternalBatchRequest) -> dict:
     l3_entities = 0
     l3_chunks = 0
     try:
-        driver = GraphDatabase.driver(NEO4J_URI, auth=NEO4J_AUTH)
-        with driver.session() as session:
+        driver = get_neo4j_driver()
+        async with driver.session() as session:
             # Indexes — idempotent. The compound (arena, name) is the
             # right shape now that entities are arena-scoped; the legacy
             # entity_name index stays for the wipe-migration to work
             # against pre-arena rows, then can be dropped in a follow-up.
             try:
-                session.run("CREATE INDEX entity_arena_name IF NOT EXISTS FOR (n:Entity) ON (n.arena, n.name)")
-                session.run("CREATE INDEX person_arena_email IF NOT EXISTS FOR (n:Person) ON (n.arena, n.email)")
-                session.run("CREATE INDEX chunk_arena IF NOT EXISTS FOR (c:Chunk) ON (c.arena)")
-                session.run("CREATE INDEX chunk_id IF NOT EXISTS FOR (c:Chunk) ON (c.id)")
+                await session.run("CREATE INDEX entity_arena_name IF NOT EXISTS FOR (n:Entity) ON (n.arena, n.name)")
+                await session.run("CREATE INDEX person_arena_email IF NOT EXISTS FOR (n:Person) ON (n.arena, n.email)")
+                await session.run("CREATE INDEX chunk_arena IF NOT EXISTS FOR (c:Chunk) ON (c.arena)")
+                await session.run("CREATE INDEX chunk_id IF NOT EXISTS FOR (c:Chunk) ON (c.id)")
                 # ChannelStat is the denormalised aggregate read by
                 # /aggregate on the fast path. Compound index covers
                 # the (arena, person_email) lookup that the reader
                 # uses; the per-channel rows are returned in one
                 # range scan.
-                session.run("CREATE INDEX channelstat_arena_email IF NOT EXISTS FOR (s:ChannelStat) ON (s.arena, s.person_email)")
+                await session.run("CREATE INDEX channelstat_arena_email IF NOT EXISTS FOR (s:ChannelStat) ON (s.arena, s.person_email)")
                 # UNIQUE constraint on the writer's MERGE key. Without
                 # this, two concurrent index-internal-batch transactions
                 # can both decide a ChannelStat doesn't exist and create
                 # rival nodes — the index doesn't lock, the constraint
                 # does. The constraint also implies an index on the
                 # full key so the MERGE locks efficiently.
-                session.run("CREATE CONSTRAINT channelstat_unique IF NOT EXISTS FOR (s:ChannelStat) REQUIRE (s.arena, s.person_email, s.channel) IS UNIQUE")
+                await session.run("CREATE CONSTRAINT channelstat_unique IF NOT EXISTS FOR (s:ChannelStat) REQUIRE (s.arena, s.person_email, s.channel) IS UNIQUE")
             except Exception:
                 pass
             for n in norm:
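The constraint comment deserves a concrete restatement: MERGE only locks nodes it can find, so two writers that both miss the key can both create, and only the UNIQUE constraint forces them to serialise. A reproduction sketch, assuming a scratch Neo4j instance (URI/credentials are placeholders; a race loser may see one transient conflict and retry):

    import asyncio
    from neo4j import AsyncGraphDatabase

    MERGE_STAT = (
        "MERGE (s:ChannelStat {arena: $arena, person_email: $email, channel: $ch}) "
        "ON CREATE SET s.count = 1 "
        "ON MATCH SET s.count = s.count + 1"
    )

    async def main() -> None:
        driver = AsyncGraphDatabase.driver("bolt://localhost:7687", auth=("neo4j", "test"))
        try:
            async with driver.session() as session:
                await session.run(
                    "CREATE CONSTRAINT channelstat_unique IF NOT EXISTS "
                    "FOR (s:ChannelStat) "
                    "REQUIRE (s.arena, s.person_email, s.channel) IS UNIQUE")

            async def writer() -> None:
                for attempt in (1, 2):  # one retry for the race loser
                    try:
                        async with driver.session() as session:
                            await session.run(MERGE_STAT, arena="t1", email="a@x", ch="email")
                        return
                    except Exception:
                        if attempt == 2:
                            raise

            await asyncio.gather(*(writer() for _ in range(8)))
            async with driver.session() as session:
                res = await session.run(
                    "MATCH (s:ChannelStat {arena: 't1', person_email: 'a@x', channel: 'email'}) "
                    "RETURN count(s) AS n")
                rec = await res.single()
                assert rec["n"] == 1  # one node, not eight rivals
        finally:
            await driver.close()

    asyncio.run(main())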
@@ -1602,7 +1673,7 @@ async def index_internal_batch(req: IndexInternalBatchRequest) -> dict:
                 # tenant-isolation anchor. Every read traverses through
                 # this node, so getting the arena right here is the
                 # single most important invariant of this whole block.
-                session.run(
+                await session.run(
                     """
                     MERGE (c:Chunk {id: $cid})
                     SET c.text = $text,
1618
1689
 
1619
1690
  # Concept entities — heuristic, arena-scoped.
1620
1691
  for ent in heuristic_entities:
1621
- session.run(
1692
+ await session.run(
1622
1693
  """
1623
1694
  MERGE (e:Entity:Concept {arena: $arena, name: $name})
1624
1695
  ON CREATE SET e.type = 'Concept',
@@ -1639,7 +1710,7 @@ async def index_internal_batch(req: IndexInternalBatchRequest) -> dict:
1639
1710
  if len(heuristic_entities) >= 2:
1640
1711
  for i in range(len(heuristic_entities)):
1641
1712
  for j in range(i + 1, len(heuristic_entities)):
1642
- session.run(
1713
+ await session.run(
1643
1714
  """
1644
1715
  MATCH (a:Entity:Concept {arena: $arena, name: $a})
1645
1716
  MATCH (b:Entity:Concept {arena: $arena, name: $b})
@@ -1666,7 +1737,7 @@ async def index_internal_batch(req: IndexInternalBatchRequest) -> dict:
1666
1737
  # flipped to true after the stat update, so replays
1667
1738
  # of the same eventId never double-count even when
1668
1739
  # the chunk already exists.
1669
- session.run(
1740
+ await session.run(
1670
1741
  """
1671
1742
  MERGE (p:Entity:Person {arena: $arena, email: $email})
1672
1743
  ON CREATE SET p.created_at = $now,
@@ -1725,7 +1796,7 @@ async def index_internal_batch(req: IndexInternalBatchRequest) -> dict:
1725
1796
  l3_entities += 1
1726
1797
  if isinstance(contact_name, str) and contact_name.strip():
1727
1798
  cname = contact_name.strip()
1728
- session.run(
1799
+ await session.run(
1729
1800
  """
1730
1801
  MERGE (p:Entity:Person {arena: $arena, name: $name})
1731
1802
  ON CREATE SET p.created_at = $now,
@@ -1751,7 +1822,7 @@ async def index_internal_batch(req: IndexInternalBatchRequest) -> dict:
1751
1822
  # Link name→email node so the relationships query
1752
1823
  # can resolve either alias to the same person.
1753
1824
  if person_email_node:
1754
- session.run(
1825
+ await session.run(
1755
1826
  """
1756
1827
  MATCH (n:Person {arena: $arena, name: $name})
1757
1828
  MATCH (e:Person {arena: $arena, email: $email})
@@ -1759,7 +1830,6 @@ async def index_internal_batch(req: IndexInternalBatchRequest) -> dict:
1759
1830
  """,
1760
1831
  arena=arena, name=cname, email=person_email_node,
1761
1832
  )
1762
- driver.close()
1763
1833
  except Exception as e:
1764
1834
  log.error(f"L3 KG write failed: {e}")
1765
1835
 
@@ -1841,25 +1911,28 @@ async def forget_internal(request: Request) -> dict:
     # Neo4j chunks AND entities both carry arena now, so tenant-scoped
     # delete works correctly here even if L0/L4 still need a migration.
     try:
-        driver = GraphDatabase.driver(NEO4J_URI, auth=NEO4J_AUTH)
-        with driver.session() as session:
+        driver = get_neo4j_driver()
+        async with driver.session() as session:
             if arena:
-                r1 = session.run(
+                r1 = await session.run(
                     "MATCH (c:Chunk {arena: $arena}) DETACH DELETE c RETURN count(c) AS n",
                     arena=arena,
                 )
-                deleted["l3_chunks"] = r1.single()["n"]
-                r2 = session.run(
+                rec = await r1.single()
+                deleted["l3_chunks"] = rec["n"]
+                r2 = await session.run(
                     "MATCH (e:Entity {arena: $arena}) DETACH DELETE e RETURN count(e) AS n",
                     arena=arena,
                 )
-                deleted["l3_entities"] = r2.single()["n"]
+                rec = await r2.single()
+                deleted["l3_entities"] = rec["n"]
             else: # confirm == "GLOBAL_WIPE", validated above
-                r1 = session.run("MATCH (c:Chunk) DETACH DELETE c RETURN count(c) AS n")
-                deleted["l3_chunks"] = r1.single()["n"]
-                r2 = session.run("MATCH (e:Entity) DETACH DELETE e RETURN count(e) AS n")
-                deleted["l3_entities"] = r2.single()["n"]
-        driver.close()
+                r1 = await session.run("MATCH (c:Chunk) DETACH DELETE c RETURN count(c) AS n")
+                rec = await r1.single()
+                deleted["l3_chunks"] = rec["n"]
+                r2 = await session.run("MATCH (e:Entity) DETACH DELETE e RETURN count(e) AS n")
+                rec = await r2.single()
+                deleted["l3_entities"] = rec["n"]
     except Exception as e:
         log.error(f"L3 forget failed: {e}")
     return {"status": "ok", "deleted": deleted, "arena": arena, "global_wipe": confirm == "GLOBAL_WIPE"}
@@ -1948,13 +2021,10 @@ async def aggregate_internal(req: AggregateInternalRequest) -> AggregateInternal
             seen.add(k)
             safe_group_by.append(k)
 
-    try:
-        driver = GraphDatabase.driver(NEO4J_URI, auth=NEO4J_AUTH)
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=f"neo4j connect: {e}")
+    driver = get_neo4j_driver()
 
     try:
-        with driver.session() as session:
+        async with driver.session() as session:
             # Fast path: read from the ChannelStat denormalisation
             # whenever the caller has an email and is grouping by
             # channel. ChannelStats are written by /index-internal-batch
@@ -1973,7 +2043,7 @@ async def aggregate_internal(req: AggregateInternalRequest) -> AggregateInternal
                 not safe_group_by or safe_group_by == ["channel"]
             )
             if fast_path_eligible:
-                stats_rows = list(session.run(
+                _res = await session.run(
                     "MATCH (s:ChannelStat {arena: $arena, person_email: $email})\n"
                     "RETURN s.channel AS channel,\n"
                     "       s.count AS count,\n"
@@ -1983,7 +2053,8 @@ async def aggregate_internal(req: AggregateInternalRequest) -> AggregateInternal
                     "       s.first_seen AS first_seen\n"
                     "ORDER BY s.count DESC\n",
                     arena=arena, email=contact_email,
-                ))
+                )
+                stats_rows = [rec async for rec in _res]
                 if stats_rows:
                     # Build buckets directly. When group_by=[] we
                     # collapse to a single overall bucket; otherwise
@@ -2098,7 +2169,8 @@ async def aggregate_internal(req: AggregateInternalRequest) -> AggregateInternal
             buckets: List[AggregateBucket] = []
             total = 0
             latest: Optional[str] = None
-            for rec in session.run(cypher, **params):
+            _res = await session.run(cypher, **params)
+            async for rec in _res:
                 count = int(rec["count"] or 0)
                 total += count
                 last_seen = rec["last_seen"]
@@ -2126,8 +2198,6 @@ async def aggregate_internal(req: AggregateInternalRequest) -> AggregateInternal
     except Exception as e:
         log.error(f"aggregate-internal failed: {e}")
         raise HTTPException(status_code=500, detail=f"aggregate failed: {e}")
-    finally:
-        driver.close()
 
 
 @app.get("/index-internal-stats")
@@ -2153,13 +2223,14 @@ async def index_internal_stats() -> dict:
     except Exception as e:
         out["l4_qmd_error"] = str(e)
     try:
-        driver = GraphDatabase.driver(NEO4J_URI, auth=NEO4J_AUTH)
-        with driver.session() as session:
-            r = session.run("MATCH (c:Chunk) RETURN count(c) AS n").single()
+        driver = get_neo4j_driver()
+        async with driver.session() as session:
+            res = await session.run("MATCH (c:Chunk) RETURN count(c) AS n")
+            r = await res.single()
             out["l3_chunks"] = r["n"] if r else 0
-            r = session.run("MATCH (e:Entity) RETURN count(e) AS n").single()
+            res = await session.run("MATCH (e:Entity) RETURN count(e) AS n")
+            r = await res.single()
             out["l3_entities"] = r["n"] if r else 0
-        driver.close()
     except Exception as e:
         out["l3_error"] = str(e)
     return out
@@ -136,7 +136,11 @@ def health():
         n = conn.execute("SELECT COUNT(*) FROM chunks").fetchone()[0]
         conn.close()
         return {"status": "ok", "loaded": True, "n_vectors": n,
-                "dim": EMBED_DIM, "db_path": DB_PATH, "backend": "sqlite-vec-fallback"}
+                "dim": EMBED_DIM, "db_path": DB_PATH,
+                # BLOB+Python-cosine is the intentional implementation path,
+                # not a degraded fallback (see _get_db docstring). The previous
+                # "sqlite-vec-fallback" label gave operators the wrong signal.
+                "backend": "sqlite-vec"}
     except Exception as exc:
         return {"status": "degraded", "error": str(exc)}
 
@@ -212,6 +216,80 @@ def refresh():
     return {"status": "ok", "noop": True}
 
 
+# ----------------------------------------------------------------------
+# /health/deep — synthetic round-trip
+# ----------------------------------------------------------------------
+
+# Fixed sentinel id used by /health/deep. Upserted on every probe call,
+# so the write is idempotent. Kept under id="__healthcheck__sentinel" so
+# the L4 corpus has at most one healthcheck row regardless of probe rate.
+_HEALTH_SENTINEL_ID = "__healthcheck__sentinel"
+_HEALTH_SENTINEL_TEXT = (
+    "healthcheck sentinel — embed-write-search round-trip verifier"
+)
+
+
+@app.get("/health/deep")
+async def health_deep():
+    """Real functional probe: embed → write → search the sentinel.
+
+    Catches the class of failure that plain /health misses — broken
+    embed paths, write 500s, query path bugs — i.e. exactly the bug
+    shape that silently degraded L6 from v0.8.0 → v0.8.2.
+
+    Returns:
+        {status, embed_ms, write_ms, search_ms, hit, ok}
+
+    `hit` confirms the sentinel was returned from search; `ok` is the
+    aggregate `embed_ok AND write_ok AND hit`. HTTP status stays 200
+    regardless so callers can read the body for diagnostics; the
+    `status` field carries the verdict.
+    """
+    t_total = time.perf_counter()
+    out: dict[str, Any] = {"status": "ok", "ok": True}
+    try:
+        t0 = time.perf_counter()
+        embs = await _embed_batch([_HEALTH_SENTINEL_TEXT])
+        out["embed_ms"] = round((time.perf_counter() - t0) * 1000.0, 1)
+        if not embs or not embs[0]:
+            out["status"] = "embed_failed"
+            out["ok"] = False
+            return out
+        vec = embs[0]
+    except Exception as exc:
+        out["status"] = f"embed_error: {type(exc).__name__}"
+        out["ok"] = False
+        return out
+
+    try:
+        conn = _get_db()
+        t1 = time.perf_counter()
+        conn.execute(
+            "INSERT OR REPLACE INTO chunks(id, text, embedding, indexed_at) "
+            "VALUES (?, ?, ?, ?)",
+            (_HEALTH_SENTINEL_ID, _HEALTH_SENTINEL_TEXT, _vec_to_blob(vec), time.time()),
+        )
+        conn.commit()
+        out["write_ms"] = round((time.perf_counter() - t1) * 1000.0, 1)
+
+        t2 = time.perf_counter()
+        rows = conn.execute("SELECT id, embedding FROM chunks WHERE id = ?",
+                            (_HEALTH_SENTINEL_ID,)).fetchone()
+        out["search_ms"] = round((time.perf_counter() - t2) * 1000.0, 1)
+        conn.close()
+    except Exception as exc:
+        out["status"] = f"db_error: {type(exc).__name__}"
+        out["ok"] = False
+        return out
+
+    out["hit"] = rows is not None
+    if not out["hit"]:
+        out["status"] = "sentinel_missing"
+        out["ok"] = False
+    out["total_ms"] = round((time.perf_counter() - t_total) * 1000.0, 1)
+    return out
+
+
 # ----------------------------------------------------------------------
 # Entrypoint
 # ----------------------------------------------------------------------
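The footprint bound comes from `INSERT OR REPLACE` against a fixed key: any number of probes converge on one row. The behaviour in isolation (sketch; the real table is assumed to carry a unique constraint on id, modelled here with PRIMARY KEY):

    import sqlite3, time

    conn = sqlite3.connect(":memory:")
    conn.execute("CREATE TABLE chunks(id TEXT PRIMARY KEY, text TEXT, "
                 "embedding BLOB, indexed_at REAL)")
    for _ in range(100):  # a hundred probes...
        conn.execute("INSERT OR REPLACE INTO chunks VALUES (?, ?, ?, ?)",
                     ("__healthcheck__sentinel", "sentinel", b"\x00", time.time()))
    assert conn.execute("SELECT COUNT(*) FROM chunks").fetchone()[0] == 1  # ...one row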
@@ -558,6 +558,20 @@ def serve(port=8034):
     from fastapi import FastAPI, Query
     import uvicorn
 
+    # Bootstrap all 4 collections on startup. Previously only `chats`
+    # was being created (the indexer entrypoints below each call their
+    # own ensure_collection lazily, so collections without an indexer
+    # — i.e. those fed solely via /index-internal or the compat shim's
+    # _index_l5 fan-out — never came into existence and writes to them
+    # 500'd). Idempotent: ensure_collection short-circuits if exists.
+    try:
+        bootstrap_client = get_client()
+        for _name in ("chats", "emails", "contacts", "memory"):
+            ensure_collection(bootstrap_client, _name)
+        logging.info("L5 collections bootstrapped: chats, emails, contacts, memory")
+    except Exception as exc:
+        logging.warning(f"L5 collection bootstrap failed (continuing): {exc}")
+
     api = FastAPI(title="L5 Communications Layer")
 
     @api.get("/health")
@@ -658,6 +672,71 @@ def serve(port=8034):
         "insert_ms": round(insert_ms, 1),
     }
 
+    @api.get("/health/deep")
+    def api_health_deep():
+        """Real functional probe: assert all 4 collections exist + run
+        embed+insert+search of a sentinel chunk in the `chats` collection.
+        Catches missing-collection regressions (Issue 3) and embed/insert/
+        search path bugs the shallow /health misses."""
+        import time as _time, hashlib as _hashlib
+        out = {"status": "ok", "ok": True}
+        client = get_client()
+
+        # 1. Collection presence
+        expected = ("chats", "emails", "contacts", "memory")
+        present = {n: client.has_collection(n) for n in expected}
+        out["collections"] = present
+        missing = [n for n, ok in present.items() if not ok]
+        if missing:
+            out["status"] = f"missing_collections:{','.join(missing)}"
+            out["ok"] = False
+            return out
+
+        # 2. Synthetic embed → insert → search in `chats`
+        sentinel_id = "__healthcheck__sentinel"
+        sentinel_text = "healthcheck sentinel — L5 embed-write-search round-trip verifier"
+        try:
+            t0 = _time.time()
+            embs = _embed_post([sentinel_text])
+            out["embed_ms"] = round((_time.time() - t0) * 1000.0, 1)
+            if not embs or embs[0] is None:
+                out["status"] = "embed_failed"
+                out["ok"] = False
+                return out
+
+            t1 = _time.time()
+            from datetime import datetime as _dt, timezone as _tz
+            client.upsert(collection_name="chats", data=[{
+                "id": sentinel_id,
+                "vector": embs[0],
+                "text": sentinel_text,
+                "source": "healthcheck",
+                "channel": "__healthcheck__",
+                "contact": "",
+                "timestamp": _dt.now(_tz.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
+                "arena": "__healthcheck__",
+            }])
+            out["write_ms"] = round((_time.time() - t1) * 1000.0, 1)
+
+            t2 = _time.time()
+            hits = client.search(
+                collection_name="chats",
+                data=[embs[0]],
+                limit=1,
+                filter='arena == "__healthcheck__"',
+                output_fields=["id"],
+            )
+            out["search_ms"] = round((_time.time() - t2) * 1000.0, 1)
+            found = bool(hits and hits[0] and hits[0][0].get("entity", {}).get("id") == sentinel_id)
+            out["hit"] = found
+            if not found:
+                out["status"] = "sentinel_missing"
+                out["ok"] = False
+        except Exception as exc:
+            out["status"] = f"probe_error: {type(exc).__name__}: {exc}"
+            out["ok"] = False
+        return out
+
     print(f"\n L5 Communications Layer — http://127.0.0.1:{port}")
     uvicorn.run(api, host=os.environ.get("HOST","127.0.0.1"), port=port, log_level="warning")
 
@@ -1,8 +1,25 @@
 FROM python:3.12-slim
 WORKDIR /app
-RUN apt-get update && apt-get install -y curl && rm -rf /var/lib/apt/lists/*
-RUN pip install --no-cache-dir fastapi "uvicorn[standard]" httpx "pymilvus[milvus_lite]" "setuptools<70" pydantic spacy
+# gcc/g++ needed by some sentence-transformers transitive deps; curl kept
+# for in-container debugging.
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    gcc g++ curl \
+    && rm -rf /var/lib/apt/lists/*
+# Core deps + sentence-transformers/torch for the cross-encoder reranker.
+# Torch CPU wheel is enough — the reranker is small (MiniLM L-6) and
+# CPU-bound throughput is fine at L6's request volume. Without these,
+# get_reranker() falls back to RRF-only, capping recall ranking quality.
+RUN pip install --no-cache-dir \
+    fastapi "uvicorn[standard]" httpx "pymilvus[milvus_lite]" "setuptools<70" pydantic spacy \
+    "sentence-transformers" \
+    "torch" --extra-index-url https://download.pytorch.org/whl/cpu
 RUN python -m spacy download en_core_web_sm
+ENV HF_HOME=/data/.cache/huggingface
+# Pre-download the cross-encoder so cold-start doesn't pay first-pull
+# latency. The model is small (~80MB) and gets cached at /data — survives
+# container recreates since /data is a volume mount.
+RUN mkdir -p /data/.cache/huggingface && \
+    python -c "from sentence_transformers import CrossEncoder; CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2', cache_folder='/data/.cache/huggingface')"
 # Shared embed_provider module (build context is engine/services).
 COPY _shared /app/_shared
 COPY l6/l6-document-store.py /app/server.py
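At runtime, the pre-baked weights mean the reranker constructs without a network pull. The scoring call it wraps is the standard sentence-transformers CrossEncoder API; a minimal sketch (query and documents are illustrative toy data):

    from sentence_transformers import CrossEncoder

    # Resolves from the HF_HOME cache warmed during the image build above.
    model = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

    query = "when is the Q3 board meeting?"
    docs = ["Q3 board meeting is Oct 14.", "Tuesday lunch menu."]
    scores = model.predict([(query, d) for d in docs])  # one relevance score per pair
    ranked = sorted(zip(docs, scores), key=lambda p: p[1], reverse=True)
    print(ranked[0][0])  # the board-meeting chunk ranks first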
@@ -838,6 +838,67 @@ def serve(port: int = DEFAULT_PORT):
     def api_health():
         return health()
 
+    @api.get("/health/deep")
+    def api_health_deep():
+        """Real functional probe: embed → insert via /index-batch path →
+        search the sentinel via hybrid search → assert reranker loaded.
+
+        Built to catch the v0.8.0–0.8.2 L6 _embed_client shadowing bug
+        and its kind (request-handler-level breakage with the layer
+        process appearing healthy)."""
+        import time as _time
+        sentinel_id = "__healthcheck__sentinel"
+        sentinel_text = "healthcheck sentinel — L6 embed-write-search round-trip verifier"
+        out = {"status": "ok", "ok": True}
+        try:
+            t0 = _time.time()
+            try:
+                emb = embed_text(sentinel_text)
+            except Exception as exc:
+                out["status"] = f"embed_failed: {type(exc).__name__}: {exc}"
+                out["ok"] = False
+                return out
+            out["embed_ms"] = round((_time.time() - t0) * 1000.0, 1)
+
+            # Insert via the same path real ingest uses, so the probe
+            # actually exercises /index-batch's code.
+            t1 = _time.time()
+            import httpx as _httpx
+            r = _httpx.post(
+                f"http://localhost:{DEFAULT_PORT}/index-batch",
+                json={
+                    "arena": "__healthcheck__",
+                    "records": [{"id": sentinel_id, "text": sentinel_text}],
+                },
+                timeout=15.0,
+            )
+            out["write_ms"] = round((_time.time() - t1) * 1000.0, 1)
+            if r.status_code != 200:
+                out["status"] = f"write_failed: http {r.status_code}"
+                out["ok"] = False
+                return out
+
+            t2 = _time.time()
+            results = search(
+                sentinel_text, method="hybrid", limit=3,
+                arena="__healthcheck__", enable_rerank=False,
+            )
+            out["search_ms"] = round((_time.time() - t2) * 1000.0, 1)
+            hit = any(r.get("id", "").startswith(sentinel_id) for r in (results or []))
+            out["hit"] = hit
+            if not hit:
+                out["status"] = "sentinel_missing"
+                out["ok"] = False
+
+            # Reranker check — informational. Failure here doesn't flip
+            # ok=False because L6 falls back to RRF and still serves
+            # results; it just caps the recall ranking quality.
+            out["reranker"] = "ok" if get_reranker() is not None else "rrf_fallback"
+        except Exception as exc:
+            out["status"] = f"probe_error: {type(exc).__name__}: {exc}"
+            out["ok"] = False
+        return out
+
     @api.get("/stats")
     def api_stats():
         return get_stats()