@pentatonic-ai/ai-agent-sdk 0.7.3 → 0.7.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@pentatonic-ai/ai-agent-sdk",
3
- "version": "0.7.3",
3
+ "version": "0.7.5",
4
4
  "description": "TES SDK — LLM observability and lifecycle tracking via Pentatonic Thing Event System. Track token usage, tool calls, and conversations. Manage things through event-sourced lifecycle stages with AI enrichment and vector search.",
5
5
  "type": "module",
6
6
  "main": "./dist/index.cjs",
@@ -55,6 +55,13 @@ L5_MILVUS_URL = os.environ.get("L5_MILVUS_URL", "http://l5:8035")
55
55
  L6_DOC_URL = os.environ.get("L6_DOC_URL", "http://l6:8037")
56
56
  NV_EMBED_URL = os.environ.get("NV_EMBED_URL", "http://nv-embed:8041/v1/embeddings")
57
57
 
58
+ # Neo4j has no /health endpoint, so the shim probes the HTTP transactional
59
+ # API with a trivial RETURN 1 — that confirms Neo4j is actually answering
60
+ # Cypher, not just serving HTTP. Auth shape is the same as L2 / docker-compose:
61
+ # "user/pass" string. Default matches the local-dev compose default.
62
+ NEO4J_AUTH = os.environ.get("NEO4J_AUTH", "neo4j/local-dev-pw")
63
+ NEO4J_DB = os.environ.get("NEO4J_DB", "neo4j")
64
+
58
65
  PORT = int(os.environ.get("PORT", "8099"))
59
66
  CLIENT_ID = os.environ.get("CLIENT_ID", "default")
60
67
 
@@ -204,8 +211,12 @@ async def _index_l4(records: list[dict[str, Any]]) -> int:
204
211
  return 0
205
212
 
206
213
 
207
- async def _index_l5(records: list[dict[str, Any]]) -> int:
208
- """Index records into the L5 Milvus comms layer (chats collection)."""
214
+ async def _index_l5(records: list[dict[str, Any]], arena: str = "general") -> int:
215
+ """Index records into the L5 Milvus comms layer (chats collection).
216
+
217
+ arena is forwarded as a Milvus dynamic field so /search can filter
218
+ by arena natively (vs the shim's defence-in-depth post-filter).
219
+ """
209
220
  payload = {
210
221
  "collection": "chats",
211
222
  "records": [
@@ -215,6 +226,7 @@ async def _index_l5(records: list[dict[str, Any]]) -> int:
215
226
  "source": (r.get("metadata") or {}).get("source", "shim"),
216
227
  "channel": "pentatonic-memory",
217
228
  "contact": (r.get("metadata") or {}).get("user", ""),
229
+ "arena": (r.get("metadata") or {}).get("arena") or arena,
218
230
  }
219
231
  for r in records
220
232
  ],
@@ -294,9 +306,82 @@ app = FastAPI(
294
306
  )
295
307
 
296
308
 
309
+ def _interpret_body_status(body: Any) -> str | None:
310
+ """Pull a layer's self-reported status out of its /health body.
311
+
312
+ Layers don't all use the same vocabulary — L4 says "ok"/"degraded",
313
+ L2 says "healthy"/"unavailable", some return nothing. Normalize to
314
+ "ok" or a short failure reason; None means the body didn't carry
315
+ a status field, in which case the HTTP code is the source of truth.
316
+ """
317
+ if not isinstance(body, dict):
318
+ return None
319
+ raw = body.get("status")
320
+ if raw is None:
321
+ return None
322
+ s = str(raw).lower()
323
+ if s in ("ok", "healthy"):
324
+ return "ok"
325
+ err = body.get("error") or body.get("reason") or ""
326
+ return f"{s}: {str(err)[:80]}" if err else s
327
+
328
+
329
+ async def _probe(url: str) -> str:
330
+ """Probe a layer /health endpoint and return a single-string verdict
331
+ that surfaces both transport-level failure and self-reported status."""
332
+ try:
333
+ r = await _client().get(url, timeout=3.0)
334
+ except Exception as exc:
335
+ return f"unreachable: {type(exc).__name__}"
336
+ if r.status_code != 200:
337
+ return f"http {r.status_code}"
338
+ try:
339
+ body_status = _interpret_body_status(r.json())
340
+ except Exception:
341
+ body_status = None
342
+ return body_status or "ok"
343
+
344
+
345
+ async def _probe_l3() -> str:
346
+ """Real Neo4j probe — POST a trivial Cypher via the HTTP transactional
347
+ API and require a 200 response. Confirms Neo4j is actually answering
348
+ queries, not just serving the Browser HTML on :7474.
349
+ """
350
+ user, _, password = NEO4J_AUTH.partition("/")
351
+ url = f"{L3_KG_URL}/db/{NEO4J_DB}/tx/commit"
352
+ try:
353
+ r = await _client().post(
354
+ url,
355
+ json={"statements": [{"statement": "RETURN 1"}]},
356
+ auth=(user, password),
357
+ timeout=3.0,
358
+ )
359
+ except Exception as exc:
360
+ return f"unreachable: {type(exc).__name__}"
361
+ if r.status_code != 200:
362
+ return f"http {r.status_code}"
363
+ try:
364
+ body = r.json()
365
+ # Neo4j tx/commit returns {"results":[...], "errors":[...]}.
366
+ # Any errors here means the DB is up but rejecting queries.
367
+ errs = body.get("errors") or []
368
+ if errs:
369
+ return f"cypher error: {str(errs[0])[:80]}"
370
+ except Exception:
371
+ return "non-json response"
372
+ return "ok"
373
+
374
+
297
375
  @app.get("/health")
298
376
  async def health():
299
- """Aggregate health across all 7 layers."""
377
+ """Aggregate health across all 7 layers.
378
+
379
+ Each layer's verdict is honest: it reflects whether the layer can
380
+ actually do its job, not just whether its HTTP server answers. The
381
+ shim reads the layer's body.status (when present) and degrades when
382
+ the layer self-reports a problem. L3 uses a real Cypher probe since
383
+ Neo4j has no /health route.
384
+ """
300
385
  out = {
301
386
  "status": "ok",
302
387
  "client": CLIENT_ID,
@@ -304,49 +389,43 @@ async def health():
304
389
  "engine": "pentatonic-memory-engine",
305
390
  "layers": {},
306
391
  }
307
- # L0 BM25 is in-process inside the L2 proxy (SQLite FTS5 is a library,
308
- # not a service). Reporting it via L2's /health.
309
- layer_health_endpoints = {
310
- "l2": f"{L2_PROXY_URL}/health", # also reports L0 status
311
- "l3": f"{L3_KG_URL}/health",
312
- "l4": f"{L4_VEC_URL}/health",
313
- "l5": f"{L5_MILVUS_URL}/health",
314
- "l6": f"{L6_DOC_URL}/health",
315
- # NV-Embed exposes both /health and /v1/embeddings; /health is enough.
316
- "nv_embed": NV_EMBED_URL.replace("/v1/embeddings", "/health"),
317
- }
318
- failures = 0
319
- for name, url in layer_health_endpoints.items():
320
- try:
321
- r = await _client().get(url, timeout=3.0)
322
- out["layers"][name] = "ok" if r.status_code == 200 else f"http {r.status_code}"
323
- if r.status_code != 200:
324
- failures += 1
325
- except Exception:
326
- out["layers"][name] = "unreachable"
327
- failures += 1
392
+ # NV-Embed exposes /health alongside /v1/embeddings.
393
+ nv_embed_health = NV_EMBED_URL.replace("/v1/embeddings", "/health")
394
+
395
+ import asyncio
396
+ l2_v, l4_v, l5_v, l6_v, nv_v, l3_v = await asyncio.gather(
397
+ _probe(f"{L2_PROXY_URL}/health"),
398
+ _probe(f"{L4_VEC_URL}/health"),
399
+ _probe(f"{L5_MILVUS_URL}/health"),
400
+ _probe(f"{L6_DOC_URL}/health"),
401
+ _probe(nv_embed_health),
402
+ _probe_l3(),
403
+ )
404
+
328
405
  # L0 BM25 (FTS5) and L1 (always-loaded core files) are both in-process
329
- # inside the L2 proxy. They have no separate health endpoint; if L2 is
330
- # responding, both are usable. Report them as "ok" tied to L2.
331
- raw_layers = out["layers"]
332
- l2_ok = raw_layers.get("l2") == "ok"
406
+ # inside the L2 proxy. They have no separate runtime; if L2 is healthy,
407
+ # both layers are usable. Tie their status to L2.
408
+ l2_ok = l2_v == "ok"
333
409
  out["layers"] = {
334
- "l0": "ok" if l2_ok else "unknown",
335
- "l1": "ok" if l2_ok else "unknown",
336
- "l2": raw_layers.get("l2", "unknown"),
337
- "l3": raw_layers.get("l3", "unknown"),
338
- "l4": raw_layers.get("l4", "unknown"),
339
- "l5": raw_layers.get("l5", "unknown"),
340
- "l6": raw_layers.get("l6", "unknown"),
341
- "nv_embed": raw_layers.get("nv_embed", "unknown"),
410
+ "l0": "ok" if l2_ok else l2_v,
411
+ "l1": "ok" if l2_ok else l2_v,
412
+ "l2": l2_v,
413
+ "l3": l3_v,
414
+ "l4": l4_v,
415
+ "l5": l5_v,
416
+ "l6": l6_v,
417
+ "nv_embed": nv_v,
342
418
  }
419
+ failures = sum(1 for v in out["layers"].values() if v != "ok")
343
420
  if failures:
344
421
  out["status"] = "degraded" if failures < 3 else "down"
345
- # Memory count: query L6 doc-store as authoritative
422
+
423
+ # Memory count: query L6 doc-store as authoritative.
346
424
  try:
347
425
  r = await _client().get(f"{L6_DOC_URL}/stats", timeout=3.0)
348
426
  if r.status_code == 200:
349
- out["memories"] = r.json().get("total_chunks", 0)
427
+ stats = r.json()
428
+ out["memories"] = stats.get("total_chunks") or stats.get("fts_chunks") or 0
350
429
  except Exception:
351
430
  out["memories"] = None
352
431
  return out
@@ -369,7 +448,7 @@ async def store(req: StoreRequest):
369
448
  import asyncio
370
449
  l4_count, l5_count, l6_count, l2_internal = await asyncio.gather(
371
450
  _index_l4([record]),
372
- _index_l5([record]),
451
+ _index_l5([record], arena=arena),
373
452
  _index_l6([record], arena=arena),
374
453
  _index_l2_internal([record], arena=arena),
375
454
  )
@@ -414,7 +493,7 @@ async def store_batch(req: StoreBatchRequest):
414
493
  import asyncio
415
494
  l4_count, l5_count, l6_count, l2_internal = await asyncio.gather(
416
495
  _index_l4(normalised),
417
- _index_l5(normalised),
496
+ _index_l5(normalised, arena=req.arena or "general"),
418
497
  _index_l6(normalised, arena=req.arena or "general"),
419
498
  _index_l2_internal(normalised, arena=req.arena or "general"),
420
499
  )
@@ -633,9 +712,12 @@ async def search(req: SearchRequest):
633
712
  out_results = _apply_metadata_filters(out_results, req)
634
713
  return {"results": out_results[: req.limit or 10]}
635
714
  try:
715
+ get_params: dict[str, Any] = {"q": req.query, "limit": _search_overfetch(req)}
716
+ if req.arena:
717
+ get_params["arena"] = req.arena
636
718
  r = await _client().get(
637
719
  f"{L2_PROXY_URL}/search",
638
- params={"q": req.query, "limit": _search_overfetch(req)},
720
+ params=get_params,
639
721
  timeout=30.0,
640
722
  )
641
723
  r.raise_for_status()
@@ -643,10 +725,16 @@ async def search(req: SearchRequest):
643
725
  except Exception as exc:
644
726
  last_err = exc
645
727
  try:
728
+ post_body: dict[str, Any] = {
729
+ "query": req.query,
730
+ "limit": _search_overfetch(req),
731
+ "min_score": req.min_score or 0.001,
732
+ }
733
+ if req.arena:
734
+ post_body["arena"] = req.arena
646
735
  r = await _client().post(
647
736
  f"{L2_PROXY_URL}/v1/search",
648
- json={"query": req.query, "limit": _search_overfetch(req),
649
- "min_score": req.min_score or 0.001},
737
+ json=post_body,
650
738
  timeout=30.0,
651
739
  )
652
740
  r.raise_for_status()
@@ -0,0 +1,60 @@
1
+ # docker-compose.test.yml — overlay for hermetic CI runs.
2
+ #
3
+ # Replaces the nv-embed GPU service with a deterministic embedding
4
+ # stub that mimics both the OpenAI /v1/embeddings shape and the
5
+ # lambda-gateway /v1/embed shape. Lets CI exercise every layer's
6
+ # vector path without an actual model.
7
+ #
8
+ # Usage:
9
+ # docker compose -f docker-compose.yml -f docker-compose.test.yml \
10
+ # up -d --wait l3 l4 l5 l6 l2 compat embed-stub
11
+ #
12
+ # The base nv-embed service is intentionally NOT started in CI
13
+ # (requires a GPU). l4/l5/l6 are pointed at embed-stub via env.
14
+
15
+ services:
16
+ embed-stub:
17
+ build:
18
+ context: ./tests/embed_stub
19
+ dockerfile: Dockerfile
20
+ container_name: pme-embed-stub
21
+ networks:
22
+ - engine-net
23
+ environment:
24
+ EMBED_DIM: "4096"
25
+ healthcheck:
26
+ test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:8041/health',timeout=3)"]
27
+ interval: 5s
28
+ timeout: 3s
29
+ retries: 20
30
+ start_period: 5s
31
+
32
+ l4:
33
+ environment:
34
+ L4_NV_EMBED_URL: http://embed-stub:8041/v1/embeddings
35
+ L4_EMBED_API_KEY: ""
36
+
37
+ l5:
38
+ environment:
39
+ L5_NV_EMBED_URL: http://embed-stub:8041/v1/embeddings
40
+ L5_EMBED_API_KEY: ""
41
+
42
+ l6:
43
+ environment:
44
+ L6_NV_EMBED_URL: http://embed-stub:8041/v1/embeddings
45
+ L6_EMBED_API_KEY: ""
46
+
47
+ l2:
48
+ environment:
49
+ PME_NV_EMBED_URL: http://embed-stub:8041/v1/embeddings
50
+
51
+ compat:
52
+ environment:
53
+ NV_EMBED_URL: http://embed-stub:8041/v1/embeddings
54
+ depends_on:
55
+ embed-stub:
56
+ condition: service_healthy
57
+ l2: { condition: service_started }
58
+ l4: { condition: service_started }
59
+ l5: { condition: service_started }
60
+ l6: { condition: service_started }
@@ -719,12 +719,17 @@ L0_MEMORY_DB = Path(os.environ.get(
719
719
  str(Path.home() / ".pentatonic" / "memory" / "main.sqlite"),
720
720
  ))
721
721
 
722
- def search_l0_bm25(query: str, limit: int = 6) -> List[Dict]:
722
+ def search_l0_bm25(query: str, limit: int = 6, arena: str = None) -> List[Dict]:
723
723
  """Search native BM25 index over workspace memory files.
724
-
724
+
725
725
  Covers chunks from daily notes, memory files, people profiles,
726
726
  infrastructure docs, project files — corpus that L3-L6 don't index.
727
727
  Sub-millisecond local SQLite reads, zero network overhead.
728
+
729
+ arena (optional): when set, filter to paths under bench/<arena>/.
730
+ Records stored via the compat shim land under that prefix per
731
+ _stash_all_keys; this is the L0 path-based equivalent of the
732
+ arena dynamic-field filter on L5/L6.
728
733
  """
729
734
  if not L0_MEMORY_DB.exists():
730
735
  return []
@@ -741,16 +746,21 @@ def search_l0_bm25(query: str, limit: int = 6) -> List[Dict]:
741
746
 
742
747
  conn = sqlite3.connect(str(L0_MEMORY_DB), timeout=2)
743
748
  conn.execute("PRAGMA journal_mode=WAL")
744
- rows = conn.execute("""
749
+ sql = """
745
750
  SELECT path, text, bm25(chunks_fts) as rank
746
751
  FROM chunks_fts
747
752
  WHERE chunks_fts MATCH ?
748
753
  AND path NOT LIKE '%/snapshots/%'
749
754
  AND path NOT LIKE '%/archive/%'
750
755
  AND path NOT LIKE '%-backup-%'
751
- ORDER BY rank ASC
752
- LIMIT ?
753
- """, (fts_query, limit * 2)).fetchall()
756
+ """
757
+ params: list = [fts_query]
758
+ if arena:
759
+ sql += " AND path LIKE ?"
760
+ params.append(f"bench/{arena}/%")
761
+ sql += " ORDER BY rank ASC LIMIT ?"
762
+ params.append(limit * 2)
763
+ rows = conn.execute(sql, params).fetchall()
754
764
  conn.close()
755
765
 
756
766
  results = []
@@ -761,12 +771,20 @@ def search_l0_bm25(query: str, limit: int = 6) -> List[Dict]:
761
771
  seen_paths.add(path)
762
772
  relevance = -rank if rank < 0 else 0.001
763
773
  score = min(relevance / (1 + relevance) * 0.85, 0.75)
774
+ # Parse arena from path (bench/<arena>/...) so downstream
775
+ # consumers can read it directly without parsing again.
776
+ row_arena = ""
777
+ if path.startswith("bench/"):
778
+ parts = path.split("/", 2)
779
+ if len(parts) >= 3:
780
+ row_arena = parts[1]
764
781
  results.append({
765
782
  "path": f"L0/{path}",
766
783
  "snippet": text[:500],
767
784
  "score": round(score, 4),
768
785
  "layer": "L0_workspace_bm25",
769
786
  "source": path,
787
+ "arena": row_arena,
770
788
  })
771
789
  if len(results) >= limit:
772
790
  break
@@ -782,12 +800,20 @@ def search_l0_bm25(query: str, limit: int = 6) -> List[Dict]:
782
800
 
783
801
  L5_API_URL = os.environ.get("PME_L5_URL", "http://127.0.0.1:8034")
784
802
 
785
- def search_l5_communications(query: str, limit: int = 6) -> List[Dict]:
786
- """Search L5 Communications Context via L5 API (emails, chats, calendar)."""
803
+ def search_l5_communications(query: str, limit: int = 6, arena: str = None) -> List[Dict]:
804
+ """Search L5 Communications Context via L5 API (emails, chats, calendar).
805
+
806
+ arena (optional): forwarded to L5; filters Milvus by the arena
807
+ dynamic field. Records id is included in the result so callers
808
+ can attach metadata via the shim's _META_CACHE.
809
+ """
787
810
  try:
811
+ params: dict = {"q": query, "limit": limit}
812
+ if arena:
813
+ params["arena"] = arena
788
814
  resp = requests.get(
789
815
  f"{L5_API_URL}/search",
790
- params={"q": query, "limit": limit},
816
+ params=params,
791
817
  timeout=10,
792
818
  )
793
819
  if resp.status_code != 200:
@@ -804,10 +830,15 @@ def search_l5_communications(query: str, limit: int = 6) -> List[Dict]:
804
830
  continue # skip low relevance
805
831
  contact = hit.get("contact", "")
806
832
  channel = hit.get("channel", "")
807
- path_label = f"L5/{source}"
808
- if contact:
833
+ hit_id = hit.get("id", "")
834
+ # Use record id as path label so the shim can attach
835
+ # metadata via _META_CACHE; falls back to source label
836
+ # for legacy records that have no id.
837
+ path_label = hit_id or f"L5/{source}"
838
+ if not hit_id and contact:
809
839
  path_label = f"L5/{channel}/{contact}"
810
840
  results.append({
841
+ "id": hit_id,
811
842
  "path": path_label,
812
843
  "snippet": hit.get("text", "")[:500],
813
844
  "score": scaled_score,
@@ -815,6 +846,7 @@ def search_l5_communications(query: str, limit: int = 6) -> List[Dict]:
815
846
  "source": source,
816
847
  "collection": hit.get("collection", ""),
817
848
  "timestamp": hit.get("timestamp", ""),
849
+ "arena": hit.get("arena", ""),
818
850
  })
819
851
  return results
820
852
  except Exception as e:
@@ -825,12 +857,19 @@ def search_l5_communications(query: str, limit: int = 6) -> List[Dict]:
825
857
  # L6: Document Store Search
826
858
  L6_URL = os.environ.get("PME_L6_URL", "http://localhost:8037")
827
859
 
828
- def search_l6_documents(query: str, limit: int = 6) -> List[Dict]:
829
- """Search L6 Document Store (research, legal, financial, project docs)."""
860
+ def search_l6_documents(query: str, limit: int = 6, arena: str = None) -> List[Dict]:
861
+ """Search L6 Document Store (research, legal, financial, project docs).
862
+
863
+ arena (optional): forwarded to L6 — L6 already supports arena
864
+ natively (see l6-document-store.py search_vector / search_fts).
865
+ """
830
866
  try:
867
+ params: dict = {"q": query, "method": "hybrid", "limit": limit, "rerank": "true"}
868
+ if arena:
869
+ params["arena"] = arena
831
870
  resp = requests.get(
832
871
  f"{L6_URL}/search",
833
- params={"q": query, "method": "hybrid", "limit": limit, "rerank": "true"},
872
+ params=params,
834
873
  timeout=10,
835
874
  )
836
875
  if resp.status_code != 200:
@@ -875,13 +914,19 @@ def search_l6_documents(query: str, limit: int = 6) -> List[Dict]:
875
914
  return []
876
915
 
877
916
 
878
- def sequential_hybridrag_search(query: str, limit: int = 16) -> List[Dict]:
879
- """Main HybridRAG processing: L0 BM25 → L1 System Files → L2 HybridRAG (L3 Graph + L4 Vector + L5 Comms + L6 Docs)."""
917
+ def sequential_hybridrag_search(query: str, limit: int = 16, arena: str = None) -> List[Dict]:
918
+ """Main HybridRAG processing: L0 BM25 → L1 System Files → L2 HybridRAG (L3 Graph + L4 Vector + L5 Comms + L6 Docs).
919
+
920
+ arena (optional): tenant scope. Forwarded to L0 (path-prefix
921
+ filter), L5 (Milvus dynamic-field filter), L6 (native arena).
922
+ L4 vector and L3 graph don't yet support native arena filtering;
923
+ the compat shim post-filter catches those before they leak out.
924
+ """
880
925
  start_time = time.time()
881
- log.info(f"Starting sequential HybridRAG search for: '{query}'")
926
+ log.info(f"Starting sequential HybridRAG search for: '{query}' arena={arena!r}")
882
927
 
883
928
  # L0: BM25 workspace memory (keyword search — complements semantic layers)
884
- l0_results = search_l0_bm25(query, limit=6)
929
+ l0_results = search_l0_bm25(query, limit=6, arena=arena)
885
930
  log.info(f"L0 BM25 workspace: {len(l0_results)} results")
886
931
 
887
932
  # L1: System Files (HIGHEST PRIORITY)
@@ -902,11 +947,11 @@ def sequential_hybridrag_search(query: str, limit: int = 16) -> List[Dict]:
902
947
  log.info(f"L4 Vector search: {len(vector_results)} results (HyDE={'on' if hyde_query != query else 'off'})")
903
948
 
904
949
  # L5: Communications Context (emails, chats, calendar) — also use HyDE
905
- l5_results = search_l5_communications(hyde_query, limit=6)
950
+ l5_results = search_l5_communications(hyde_query, limit=6, arena=arena)
906
951
  log.info(f"L5 Communications: {len(l5_results)} results")
907
952
 
908
953
  # L6: Document Store (research, legal, financial, project docs)
909
- l6_results = search_l6_documents(hyde_query, limit=6)
954
+ l6_results = search_l6_documents(hyde_query, limit=6, arena=arena)
910
955
  log.info(f"L6 Documents: {len(l6_results)} results")
911
956
 
912
957
  # L2: HybridRAG fusion (combines all layers with L1 priority)
@@ -966,10 +1011,11 @@ async def search_endpoint(request: Request) -> dict:
966
1011
  body = await request.json()
967
1012
  query = body.get("query", "")
968
1013
  limit = body.get("limit", 16)
1014
+ arena = body.get("arena") or None
969
1015
  if not query:
970
1016
  raise HTTPException(status_code=400, detail="query is required")
971
1017
 
972
- results = sequential_hybridrag_search(query, limit=limit)
1018
+ results = sequential_hybridrag_search(query, limit=limit, arena=arena)
973
1019
 
974
1020
  # Also return raw graph entities for context enrichment
975
1021
  entities = extract_query_entities(query)
@@ -1150,8 +1196,17 @@ def _check_l6_health() -> bool:
1150
1196
 
1151
1197
  @app.get("/health")
1152
1198
  async def health() -> dict:
1153
- """System health check."""
1199
+ """System health check.
1200
+
1201
+ Reports "ok" iff every layer L2 directly owns is healthy: L0 BM25
1202
+ (SQLite FTS5 file), L4 QMD vector store (sqlite file), and the
1203
+ Neo4j connection. L5/L6 reachability is reported informationally
1204
+ only — the compat shim probes them directly. Ollama is no longer
1205
+ a hard dependency anywhere; the engine uses the configured
1206
+ NV_EMBED_URL via _embed_post helpers in each layer.
1207
+ """
1154
1208
  qmd_healthy = os.path.exists(QMD_DB_PATH)
1209
+ l0_healthy = L0_MEMORY_DB.exists()
1155
1210
 
1156
1211
  neo4j_healthy = False
1157
1212
  try:
@@ -1163,25 +1218,26 @@ async def health() -> dict:
1163
1218
  except Exception as e:
1164
1219
  logging.debug(f"Suppressed: {e}")
1165
1220
 
1166
- ollama_healthy = False
1167
- try:
1168
- r = requests.get("http://localhost:11434/api/tags", timeout=5)
1169
- ollama_healthy = r.status_code == 200
1170
- except Exception as e:
1171
- logging.debug(f"Suppressed: {e}")
1221
+ l5_reachable = _check_l5_health()
1222
+ l6_reachable = _check_l6_health()
1223
+
1224
+ # Top-level status: degrade only on layers L2 is the sole gatekeeper for.
1225
+ # L5/L6 are independent services probed by the compat shim.
1226
+ must_be_ok = [l0_healthy, qmd_healthy, neo4j_healthy]
1227
+ overall = "ok" if all(must_be_ok) else "degraded"
1172
1228
 
1173
1229
  return {
1230
+ "status": overall,
1174
1231
  "proxy": "healthy",
1175
1232
  "architecture": "sequential-hybridrag-proper-layers",
1176
1233
  "layers": {
1177
- "L0_workspace_bm25": {"status": "healthy" if L0_MEMORY_DB.exists() else "unavailable", "backend": "sqlite-fts5"},
1234
+ "L0_workspace_bm25": {"status": "healthy" if l0_healthy else "unavailable", "backend": "sqlite-fts5"},
1178
1235
  "L1_system_files": {"status": "healthy", "description": "MEMORY.md, plans.md, daily notes"},
1179
1236
  "L2_hybridrag": {"status": "healthy", "description": "Orchestrates L3+L4 fusion"},
1180
1237
  "L3_graph_search": {"status": "healthy" if neo4j_healthy else "unavailable", "backend": "neo4j"},
1181
- "L4_vector_search": {"status": "healthy" if qmd_healthy else "unavailable", "backend": "qmd+ollama"},
1182
- "L5_communications": {"status": "healthy" if _check_l5_health() else "unavailable", "backend": "sqlite+ollama"},
1183
- "L6_document_store": {"status": "healthy" if _check_l6_health() else "unavailable", "backend": "milvus+fts5+reranker", "port": 8037},
1184
- "ollama_embeddings": {"status": "healthy" if ollama_healthy else "unavailable"}
1238
+ "L4_vector_search": {"status": "healthy" if qmd_healthy else "unavailable", "backend": "qmd"},
1239
+ "L5_communications": {"status": "healthy" if l5_reachable else "unavailable", "backend": "milvus"},
1240
+ "L6_document_store": {"status": "healthy" if l6_reachable else "unavailable", "backend": "milvus+fts5+reranker", "port": 8037},
1185
1241
  }
1186
1242
  }
1187
1243
 
@@ -449,8 +449,13 @@ def index_memory(client):
449
449
 
450
450
  # --- Search ---
451
451
 
452
- def search(query: str, collection: str = None, limit: int = 10):
453
- """Search across collections."""
452
+ def search(query: str, collection: str = None, limit: int = 10, arena: str = None):
453
+ """Search across collections.
454
+
455
+ arena (optional): when set, filter to records whose arena dynamic
456
+ field matches. Records indexed before arena was added carry no
457
+ arena field — those are dropped under multi-tenant safety.
458
+ """
454
459
  client = get_client()
455
460
  vectors = embed_texts([query])
456
461
  if not vectors or all(v == 0.0 for v in vectors[0]):
@@ -460,6 +465,12 @@ def search(query: str, collection: str = None, limit: int = 10):
460
465
  collections = [collection] if collection else ["chats", "emails", "contacts", "memory"]
461
466
  all_results = []
462
467
 
468
+ filter_expr = ""
469
+ if arena:
470
+ # Escape double quotes; Milvus filter syntax for dynamic fields.
471
+ safe = str(arena).replace('"', '\\"')
472
+ filter_expr = f'arena == "{safe}"'
473
+
463
474
  for coll in collections:
464
475
  if not client.has_collection(coll):
465
476
  continue
@@ -468,12 +479,14 @@ def search(query: str, collection: str = None, limit: int = 10):
468
479
  collection_name=coll,
469
480
  data=[vectors[0]],
470
481
  limit=limit,
471
- output_fields=["text", "source", "channel", "contact", "timestamp"],
482
+ filter=filter_expr,
483
+ output_fields=["text", "source", "channel", "contact", "timestamp", "arena"],
472
484
  )
473
485
  for hits in results:
474
486
  for hit in hits:
475
487
  entity = hit.get("entity", {})
476
488
  all_results.append({
489
+ "id": hit.get("id", ""),
477
490
  "collection": coll,
478
491
  "score": round(hit.get("distance", 0), 4),
479
492
  "text": entity.get("text", ""),
@@ -481,6 +494,7 @@ def search(query: str, collection: str = None, limit: int = 10):
481
494
  "channel": entity.get("channel", ""),
482
495
  "contact": entity.get("contact", ""),
483
496
  "timestamp": entity.get("timestamp", ""),
497
+ "arena": entity.get("arena", ""),
484
498
  })
485
499
  except Exception as e:
486
500
  print(f" Search error in {coll}: {e}")
@@ -492,28 +506,28 @@ def search(query: str, collection: str = None, limit: int = 10):
492
506
  # --- Health / Stats ---
493
507
 
494
508
  def health():
495
- """Check L5 health."""
509
+ """Check L5 health.
510
+
511
+ Reports "ok" iff the Milvus client can list collections — that's
512
+ L5's actual data plane. Embeddings are intentionally NOT probed
513
+ here: that's a separate concern reported by the compat shim's
514
+ nv_embed entry. Probing an external embedding endpoint on every
515
+ /health adds latency and false negatives for layers that only
516
+ embed on demand.
517
+ """
496
518
  try:
497
519
  client = get_client()
498
520
  collections = ["chats", "emails", "contacts", "memory"]
499
- status = {"status": "ok", "db_path": DB_PATH, "collections": {}}
521
+ out = {"status": "ok", "db_path": DB_PATH, "collections": {}}
500
522
  for coll in collections:
501
523
  if client.has_collection(coll):
502
524
  stats = client.get_collection_stats(coll)
503
525
  count = stats.get("row_count", 0)
504
- status["collections"][coll] = {"exists": True, "count": count}
526
+ out["collections"][coll] = {"exists": True, "count": count}
505
527
  else:
506
- status["collections"][coll] = {"exists": False, "count": 0}
507
- total = sum(c["count"] for c in status["collections"].values())
508
- status["total_chunks"] = total
509
- # Check embeddings
510
- try:
511
- r = httpx.get("http://localhost:11434/api/tags", timeout=3)
512
- models = [m["name"] for m in r.json().get("models", [])]
513
- status["embeddings"] = EMBED_MODEL in str(models)
514
- except Exception:
515
- status["embeddings"] = False
516
- return status
528
+ out["collections"][coll] = {"exists": False, "count": 0}
529
+ out["total_chunks"] = sum(c["count"] for c in out["collections"].values())
530
+ return out
517
531
  except Exception as e:
518
532
  return {"status": "error", "error": str(e)}
519
533
 
@@ -547,8 +561,9 @@ def serve(port=8034):
547
561
  return health()
548
562
 
549
563
  @api.get("/search")
550
- def api_search(q: str = Query(...), collection: str = None, limit: int = 10):
551
- results = search(q, collection=collection, limit=limit)
564
+ def api_search(q: str = Query(...), collection: str = None, limit: int = 10,
565
+ arena: str = None):
566
+ results = search(q, collection=collection, limit=limit, arena=arena)
552
567
  return {"query": q, "results": results, "count": len(results)}
553
568
 
554
569
  @api.get("/stats")
@@ -618,6 +633,10 @@ def serve(port=8034):
618
633
  "channel": (r.get("channel") or "")[:64],
619
634
  "contact": (r.get("contact") or "")[:256],
620
635
  "timestamp": (r.get("timestamp") or _now)[:32],
636
+ # arena lands in the dynamic-field section of the
637
+ # collection (enable_dynamic_field=True). Filterable
638
+ # via `arena == "..."` in /search.
639
+ "arena": (r.get("arena") or "general")[:64],
621
640
  })
622
641
  t1 = _time.time()
623
642
  if rows:
@@ -94,35 +94,13 @@ log = logging.getLogger("l6-document-store")
94
94
  _embed_client = httpx.Client(timeout=60)
95
95
 
96
96
  def embed_text(text: str) -> List[float]:
97
- """Get embedding NV-Embed-v2 primary, Ollama fallback."""
98
- if NV_EMBED_ENABLED:
99
- try:
100
- resp = _embed_client.post(NV_EMBED_URL, json={"input": text[:4000]})
101
- resp.raise_for_status()
102
- return resp.json()["data"][0]["embedding"]
103
- except Exception as e:
104
- log.warning(f"NV-Embed-v2 failed, falling back to Ollama: {e}")
105
-
106
- # Ollama fallback
107
- resp = _embed_client.post(
108
- f"{OLLAMA_URL}/api/embeddings",
109
- json={"model": EMBED_MODEL, "prompt": text[:8000]},
110
- )
111
- resp.raise_for_status()
112
- return resp.json()["embedding"]
97
+ """Single-text embed via _embed_post (OpenAI-compat first, lambda-gateway fallback)."""
98
+ return _embed_post([text[:8000]])[0]
113
99
 
114
100
 
115
101
  def embed_batch(texts: List[str]) -> List[List[float]]:
116
- """Embed a batch of texts — NV-Embed-v2 supports native batching."""
117
- if NV_EMBED_ENABLED:
118
- try:
119
- resp = _embed_client.post(NV_EMBED_URL, json={"input": [t[:4000] for t in texts]})
120
- resp.raise_for_status()
121
- return [d["embedding"] for d in resp.json()["data"]]
122
- except Exception as e:
123
- log.warning(f"NV-Embed-v2 batch failed, falling back to sequential: {e}")
124
-
125
- return [embed_text(t) for t in texts]
102
+ """Batched embed via _embed_post."""
103
+ return _embed_post([t[:8000] for t in texts])
126
104
 
127
105
  # ---------------------------------------------------------------------------
128
106
  # Cross-Encoder Reranker
@@ -767,41 +745,40 @@ def get_stats() -> Dict:
767
745
 
768
746
 
769
747
def health() -> Dict:
    """Health check.

    Reports "ok" iff Milvus and the FTS sidecar both answer. Embeddings
    are NOT probed here — the compat shim's nv_embed entry covers that.
    Ollama was a legacy fallback that is not used in any deployment, so
    its previous probe was a false negative on prod.

    Returns:
        Dict with per-dependency status strings plus an overall "status"
        of "ok" or "degraded".
    """
    out = {"status": "ok", "milvus": "unknown", "fts": "unknown", "reranker": "unknown"}

    # Milvus — vector store
    try:
        client = get_milvus()
        colls = client.list_collections()
        out["milvus"] = f"ok ({len(colls)} collections)"
    except Exception as e:
        out["milvus"] = f"error: {e}"
        out["status"] = "degraded"

    # FTS — keyword fallback over the same chunk set. close() runs in a
    # finally block so the connection is not leaked when the query raises
    # (the previous version only closed on the success path).
    conn = None
    try:
        conn = get_fts_db()
        cnt = conn.execute("SELECT COUNT(*) FROM chunks").fetchone()[0]
        out["fts"] = f"ok ({cnt} chunks)"
    except Exception as e:
        out["fts"] = f"error: {e}"
        out["status"] = "degraded"
    finally:
        if conn is not None:
            try:
                conn.close()
            except Exception:
                pass  # best-effort cleanup; status already recorded above

    # Reranker — informational; CPU fallback to RRF is acceptable, so
    # don't degrade overall status when it's unavailable.
    reranker = get_reranker()
    out["reranker"] = "loaded" if reranker else "unavailable (CPU fallback to RRF)"

    return out
805
782
 
806
783
  # ---------------------------------------------------------------------------
807
784
  # FastAPI Server
@@ -0,0 +1,144 @@
1
#!/usr/bin/env bash
# e2e_arena.sh — multi-tenant store/retrieve smoke test against a live
# memory-engine stack. Exercises /store, arena-scoped /search, and
# /forget end-to-end across L0/L4/L5/L6 + the compat shim.
#
# Run after `docker compose -f docker-compose.yml -f docker-compose.test.yml \
# up -d --wait l3 l4 l5 l6 l2 compat embed-stub`.
set -eu
# NOTE(review): with `set -e`, a failing `curl -sf` inside a command
# substitution (e.g. R1=$(post ...)) aborts the script before the
# corresponding ok/fail check runs — such failures exit hard instead of
# being counted in FAIL. Possibly intentional fail-fast; confirm.

BASE="${BASE:-http://localhost:8099}"
WAIT_HEALTH_SECS="${WAIT_HEALTH_SECS:-180}"
PASS=0
FAIL=0

# Tally helpers: print a marker and bump the respective counter.
ok() { echo " ✅ $1"; PASS=$((PASS+1)); }
fail() { echo " ❌ $1"; FAIL=$((FAIL+1)); }

# ---------------------------------------------------------------------------
# Wait for the compat shim to come up. Its /health aggregates layer
# health; we accept "ok" or "degraded" (l3 cosmetic 404 is known and
# doesn't block functional paths).
# ---------------------------------------------------------------------------

echo "=== waiting for $BASE/health (up to ${WAIT_HEALTH_SECS}s) ==="
deadline=$(( $(date +%s) + WAIT_HEALTH_SECS ))
while :; do
  if H=$(curl -sf --max-time 5 "$BASE/health"); then
    s=$(echo "$H" | python3 -c 'import json,sys; print(json.load(sys.stdin).get("status",""))')
    if [ "$s" = "ok" ] || [ "$s" = "degraded" ]; then
      echo " health: $s"
      break
    fi
  fi
  if [ "$(date +%s)" -ge "$deadline" ]; then
    echo " ❌ engine never became healthy"
    exit 1
  fi
  sleep 3
done

# ---------------------------------------------------------------------------
# /store — two arenas, two distinct documents per arena.
# ---------------------------------------------------------------------------

echo ""
echo "=== /store ==="
# post BODY — JSON-POST a document to /store; prints the response body.
post() {
  curl -sf -X POST "$BASE/store" \
    -H "Content-Type: application/json" \
    -d "$1"
}

R1=$(post '{"content":"Alpha team owns project Atlas","metadata":{"arena":"e2e-tenant-a","kind":"note","probe":"e2e-arena"}}')
R2=$(post '{"content":"Alpha team owns project Borealis","metadata":{"arena":"e2e-tenant-b","kind":"note","probe":"e2e-arena"}}')
R3=$(post '{"content":"Bravo team owns project Cobalt","metadata":{"arena":"e2e-tenant-a","kind":"note","probe":"e2e-arena"}}')
R4=$(post '{"content":"Bravo team owns project Diamond","metadata":{"arena":"e2e-tenant-b","kind":"note","probe":"e2e-arena"}}')

[ -n "$R1" ] && [ -n "$R2" ] && [ -n "$R3" ] && [ -n "$R4" ] \
  && ok "stored 4 docs across 2 arenas" \
  || fail "store"

# Indexing is async on some layers — give the stack a brief settle.
sleep 4

# ---------------------------------------------------------------------------
# /search — arena scoping. tenant-a should never see Borealis/Diamond,
# tenant-b should never see Atlas/Cobalt.
# ---------------------------------------------------------------------------

echo ""
echo "=== /search arena=e2e-tenant-a ==="
SA=$(curl -sf -X POST "$BASE/search" -H "Content-Type: application/json" \
  -d '{"query":"team project","limit":20,"arena":"e2e-tenant-a"}')
echo " hits: $(echo "$SA" | python3 -c 'import json,sys; print(len(json.load(sys.stdin).get("results",[])))')"

# Count cross-tenant leakage: any tenant-b project name in tenant-a hits.
leak_a=$(echo "$SA" | python3 -c '
import json,sys
data=json.load(sys.stdin).get("results",[])
bad=[r for r in data if "Borealis" in r.get("content","") or "Diamond" in r.get("content","")]
print(len(bad))')
[ "$leak_a" = "0" ] && ok "tenant-a: no Borealis/Diamond leakage" \
  || fail "tenant-a leaked $leak_a tenant-b docs"

# Recall check: tenant-a's own doc must come back.
found_atlas=$(echo "$SA" | python3 -c '
import json,sys
data=json.load(sys.stdin).get("results",[])
print("yes" if any("Atlas" in r.get("content","") for r in data) else "no")')
[ "$found_atlas" = "yes" ] && ok "tenant-a: Atlas recovered" \
  || fail "tenant-a missing Atlas"

echo ""
echo "=== /search arena=e2e-tenant-b ==="
SB=$(curl -sf -X POST "$BASE/search" -H "Content-Type: application/json" \
  -d '{"query":"team project","limit":20,"arena":"e2e-tenant-b"}')
echo " hits: $(echo "$SB" | python3 -c 'import json,sys; print(len(json.load(sys.stdin).get("results",[])))')"

# Mirror of the leakage check in the other direction.
leak_b=$(echo "$SB" | python3 -c '
import json,sys
data=json.load(sys.stdin).get("results",[])
bad=[r for r in data if "Atlas" in r.get("content","") or "Cobalt" in r.get("content","")]
print(len(bad))')
[ "$leak_b" = "0" ] && ok "tenant-b: no Atlas/Cobalt leakage" \
  || fail "tenant-b leaked $leak_b tenant-a docs"

found_borealis=$(echo "$SB" | python3 -c '
import json,sys
data=json.load(sys.stdin).get("results",[])
print("yes" if any("Borealis" in r.get("content","") for r in data) else "no")')
[ "$found_borealis" = "yes" ] && ok "tenant-b: Borealis recovered" \
  || fail "tenant-b missing Borealis"

# ---------------------------------------------------------------------------
# /search with metadata_filter — arena+probe combo should still scope.
# ---------------------------------------------------------------------------

echo ""
echo "=== /search metadata_filter probe=e2e-arena ==="
SF=$(curl -sf -X POST "$BASE/search" -H "Content-Type: application/json" \
  -d '{"query":"team","limit":20,"arena":"e2e-tenant-a","metadata_filter":{"probe":"e2e-arena"}}')
# `ok and data` — an empty result set counts as a failure, not a pass.
all_match=$(echo "$SF" | python3 -c '
import json,sys
data=json.load(sys.stdin).get("results",[])
ok=all(r.get("metadata",{}).get("probe")=="e2e-arena" and r.get("metadata",{}).get("arena")=="e2e-tenant-a" for r in data)
print("yes" if ok and data else "no")')
[ "$all_match" = "yes" ] && ok "metadata_filter scopes to probe + arena" \
  || fail "metadata_filter let other rows through"

# ---------------------------------------------------------------------------
# /forget — by metadata_contains. Cleans up so reruns are idempotent.
# ---------------------------------------------------------------------------

echo ""
echo "=== /forget probe=e2e-arena ==="
F=$(curl -sf -X POST "$BASE/forget" -H "Content-Type: application/json" \
  -d '{"metadata_contains":{"probe":"e2e-arena"}}')
deleted=$(echo "$F" | python3 -c 'import json,sys; print(json.load(sys.stdin).get("deleted",0))')
echo " deleted: $deleted"
[ "$deleted" -ge "1" ] && ok "/forget removed at least 1 row" || fail "/forget"

echo ""
echo "=== Result ==="
echo " PASS: $PASS"
echo " FAIL: $FAIL"
# NOTE(review): exit status mirrors the fail count; shells truncate exit
# codes to 0–255, which is fine for this script's scale of checks.
exit $FAIL
@@ -0,0 +1,13 @@
1
# Minimal image for the deterministic embedding stub used in hermetic CI.
FROM python:3.12-slim

WORKDIR /app

# NOTE(review): fastapi/uvicorn/pydantic are unpinned — image rebuilds can
# drift across CI runs; consider pinning versions for reproducibility.
RUN pip install --no-cache-dir fastapi "uvicorn[standard]" pydantic

COPY server.py /app/server.py

# Default vector width; server.py reads EMBED_DIM from the environment at
# import time.
ENV EMBED_DIM=4096

EXPOSE 8041

CMD ["python", "-m", "uvicorn", "server:app", "--host", "0.0.0.0", "--port", "8041"]
@@ -0,0 +1,80 @@
1
+ """Deterministic embedding stub for hermetic CI runs.
2
+
3
+ Returns a fixed-dim vector per input string, derived from a hash so the
4
+ same text always maps to the same vector. Cosine similarity is 1.0 only
5
+ for identical input strings; any differing strings map to near-orthogonal
6
+ vectors (similarity ≈ 0) — enough to exercise the engine's vector search
7
+ paths in CI without an actual embedding model.
8
+
9
+ Speaks both shapes the engine uses:
10
+ POST /v1/embeddings { input, model } -> { data:[{embedding:[...] }] }
11
+ POST /v1/embed { input, model } -> { embeddings:[[...]] }
12
+
13
+ Run:
14
+ EMBED_DIM=4096 uvicorn server:app --host 0.0.0.0 --port 8041
15
+ """
16
+ from __future__ import annotations
17
+
18
+ import hashlib
19
+ import math
20
+ import os
21
+ from typing import Any
22
+
23
+ from fastapi import FastAPI
24
+ from pydantic import BaseModel
25
+
26
+ EMBED_DIM = int(os.environ.get("EMBED_DIM", "4096"))
27
+
28
+ app = FastAPI(title="embed-stub")
29
+
30
+
31
class EmbedRequest(BaseModel):
    """Request body shared by /v1/embeddings and /v1/embed.

    `input` may be a single string or a list (coerced by _normalise_inputs);
    `model` is accepted for OpenAI-API compatibility and only echoed back.
    """
    input: Any
    model: str | None = None
34
+
35
+
36
+ def _vector_for(text: str) -> list[float]:
37
+ """Deterministic vector: hash the text, expand to EMBED_DIM, L2-normalise."""
38
+ text = text or ""
39
+ seed = hashlib.sha256(text.encode("utf-8")).digest()
40
+ raw: list[int] = []
41
+ counter = 0
42
+ while len(raw) < EMBED_DIM:
43
+ chunk = hashlib.sha256(seed + counter.to_bytes(4, "big")).digest()
44
+ raw.extend(chunk)
45
+ counter += 1
46
+ floats = [(b - 127.5) / 127.5 for b in raw[:EMBED_DIM]]
47
+ norm = math.sqrt(sum(x * x for x in floats)) or 1.0
48
+ return [x / norm for x in floats]
49
+
50
+
51
+ def _normalise_inputs(inp: Any) -> list[str]:
52
+ if isinstance(inp, str):
53
+ return [inp]
54
+ if isinstance(inp, list):
55
+ return [str(x) for x in inp]
56
+ return [str(inp)]
57
+
58
+
59
+ @app.get("/health")
60
+ def health() -> dict:
61
+ return {"status": "ok", "dim": EMBED_DIM}
62
+
63
+
64
+ @app.post("/v1/embeddings")
65
+ def openai_embeddings(req: EmbedRequest) -> dict:
66
+ texts = _normalise_inputs(req.input)
67
+ return {
68
+ "object": "list",
69
+ "data": [
70
+ {"object": "embedding", "index": i, "embedding": _vector_for(t)}
71
+ for i, t in enumerate(texts)
72
+ ],
73
+ "model": req.model or "embed-stub",
74
+ }
75
+
76
+
77
+ @app.post("/v1/embed")
78
+ def lambda_gateway_embed(req: EmbedRequest) -> dict:
79
+ texts = _normalise_inputs(req.input)
80
+ return {"embeddings": [_vector_for(t) for t in texts]}