npm - @pentatonic-ai/ai-agent-sdk - Versions diffs - 0.9.2 → 0.9.4 - Mend

@pentatonic-ai/ai-agent-sdk 0.9.2 → 0.9.4

This diff shows the changes between publicly available versions of this package as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Files changed (13) hide show

package/dist/index.cjs +1 -1
package/dist/index.js +1 -1
package/package.json +1 -1
package/packages/memory/package-lock.json +3 -3
package/packages/memory-engine/compat/Dockerfile +12 -1
package/packages/memory-engine/compat/server.py +135 -55
package/packages/memory-engine/docker-compose.test.yml +0 -7
package/packages/memory-engine/docker-compose.yml +16 -35
package/packages/memory-engine/engine/services/l2/l2-hybridrag-proxy.py +19 -1
package/packages/memory-engine/engine/services/l5/l5-comms-layer.py +11 -5
package/packages/memory-engine/engine/services/l6/l6-document-store.py +11 -5
package/packages/memory-engine/engine/services/l4/Dockerfile +0 -19
package/packages/memory-engine/engine/services/l4/server.py +0 -305

package/dist/index.cjs CHANGED Viewed

@@ -906,7 +906,7 @@ function fireAndForgetEmit(clientConfig, sessionOpts, messages, result, model) {
 }
 // src/telemetry.js
-var VERSION = "0.9.2";
+var VERSION = "0.9.4";
 var TELEMETRY_URL = "https://sdk-telemetry.philip-134.workers.dev";
 function machineId() {
   const raw = typeof process !== "undefined" ? `${process.env?.USER || process.env?.USERNAME || "u"}:${process.platform || "x"}:${process.arch || "x"}` : "browser";

package/dist/index.js CHANGED Viewed

@@ -875,7 +875,7 @@ function fireAndForgetEmit(clientConfig, sessionOpts, messages, result, model) {
 }
 // src/telemetry.js
-var VERSION = "0.9.2";
+var VERSION = "0.9.4";
 var TELEMETRY_URL = "https://sdk-telemetry.philip-134.workers.dev";
 function machineId() {
   const raw = typeof process !== "undefined" ? `${process.env?.USER || process.env?.USERNAME || "u"}:${process.platform || "x"}:${process.arch || "x"}` : "browser";

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@pentatonic-ai/ai-agent-sdk",
-  "version": "0.9.2",
+  "version": "0.9.4",
   "description": "TES SDK — LLM observability and lifecycle tracking via Pentatonic Thing Event System. Track token usage, tool calls, and conversations. Manage things through event-sourced lifecycle stages with AI enrichment and vector search.",
   "type": "module",
   "main": "./dist/index.cjs",

package/packages/memory/package-lock.json CHANGED Viewed

@@ -568,9 +568,9 @@
       }
     },
     "node_modules/hono": {
-      "version": "4.12.12",
-      "resolved": "https://registry.npmjs.org/hono/-/hono-4.12.12.tgz",
-      "integrity": "sha512-p1JfQMKaceuCbpJKAPKVqyqviZdS0eUxH9v82oWo1kb9xjQ5wA6iP3FNVAPDFlz5/p7d45lO+BpSk1tuSZMF4Q==",
+      "version": "4.12.18",
+      "resolved": "https://registry.npmjs.org/hono/-/hono-4.12.18.tgz",
+      "integrity": "sha512-RWzP96k/yv0PQfyXnWjs6zot20TqfpfsNXhOnev8d1InAxubW93L11/oNUc3tQqn2G0bSdAOBpX+2uDFHV7kdQ==",
       "license": "MIT",
       "engines": {
         "node": ">=16.9.0"

package/packages/memory-engine/compat/Dockerfile CHANGED Viewed

@@ -4,7 +4,18 @@ WORKDIR /app
 RUN pip install --no-cache-dir fastapi uvicorn[standard] httpx pydantic
-COPY server.py /app/server.py
+# Build context is the memory-engine root (see docker-compose.yml). The
+# shim's server.py side-loads engine/services/_shared/embed_provider.py
+# for shared-embed mode on /store-batch (one embed call across all 4
+# layer indexers vs 4 redundant calls).
+COPY compat/server.py /app/server.py
+# server.py's sys.path.insert resolves "../engine/services" relative to
+# its own location (/app/server.py → /engine/services). Mirror that
+# layout so the import works without changing the runtime code.
+COPY engine/services/_shared /engine/services/_shared
+# Make `_shared` an importable package (mirror the layer services'
+# layout where __init__.py exists or python detects PEP 420 namespace).
+RUN touch /engine/services/__init__.py
 EXPOSE 8099

package/packages/memory-engine/compat/server.py CHANGED Viewed

@@ -25,7 +25,6 @@ Environment:
     L0_URL                   default http://l0:8030
     L2_PROXY_URL             default http://l2:8031
     L3_KG_URL                default http://l3:8047
-    L4_VEC_URL               default http://l4:8042
     L5_MILVUS_URL            default http://l5:8035
     L6_DOC_URL               default http://l6:8037
     NV_EMBED_URL             default http://nv-embed:8041/v1/embeddings
@@ -34,6 +33,7 @@ Environment:
 import hashlib
 import os
+import sys
 import time
 from datetime import datetime, timezone
 from typing import Any, Optional
@@ -42,6 +42,17 @@ import httpx
 from fastapi import FastAPI, HTTPException
 from pydantic import BaseModel, Field
+# Reach into the engine/services tree so we can reuse EmbedClient. The
+# tree isn't a real installed package; layer services and the compat
+# shim both side-load it the same way. Keeps the chunking + auto-detect
+# behaviour identical between the shim's pre-embed and the per-layer
+# embeds that previously did the same work N times.
+sys.path.insert(
+    0,
+    os.path.join(os.path.dirname(__file__), "..", "engine", "services"),
+)
+from _shared.embed_provider import EmbedClient  # noqa: E402
 # ----------------------------------------------------------------------
 # Config
 # ----------------------------------------------------------------------
@@ -49,7 +60,6 @@ from pydantic import BaseModel, Field
 L0_URL = os.environ.get("L0_URL", "http://l0:8030")
 L2_PROXY_URL = os.environ.get("L2_PROXY_URL", "http://l2:8031")
 L3_KG_URL = os.environ.get("L3_KG_URL", "http://l3:8047")
-L4_VEC_URL = os.environ.get("L4_VEC_URL", "http://l4:8042")
 L5_MILVUS_URL = os.environ.get("L5_MILVUS_URL", "http://l5:8035")
 L6_DOC_URL = os.environ.get("L6_DOC_URL", "http://l6:8037")
 NV_EMBED_URL = os.environ.get("NV_EMBED_URL", "http://nv-embed:8041/v1/embeddings")
@@ -63,6 +73,30 @@ NEO4J_DB = os.environ.get("NEO4J_DB", "neo4j")
 PORT = int(os.environ.get("PORT", "8099"))
+# Shared-embed mode. When on, /store-batch computes embeddings once at
+# the shim level and forwards them to each layer's /index-batch so the
+# layer skips its own embed call. Cuts gateway RPC count by ~4× (L4 +
+# L5 + L6 + L2-internal all did the same embed work independently).
+# Default ON because all layers in this engine use the same NV-Embed
+# model; disable if you ever wire up per-layer differentiated embedders
+# (e.g. cohere on L5, openai on L4).
+SHARE_EMBEDDINGS = os.environ.get("PME_SHARE_EMBEDDINGS", "true").lower() == "true"
+_embed_client: EmbedClient | None = None
+def _get_embed_client() -> EmbedClient:
+    """Lazy-init the shim's EmbedClient using PME_-prefixed env vars
+    (matches L2's pattern). Cached for the process lifetime so the
+    auto-detect handshake only happens once."""
+    global _embed_client
+    if _embed_client is None:
+        _embed_client = EmbedClient.from_env(
+            prefix="PME_",
+            default_url=NV_EMBED_URL,
+        )
+    return _embed_client
 # Layer types we surface as the SDK 4-layer projection. Engine stores
 # everything as chunks tagged with arena + layer_type metadata; this
@@ -252,28 +286,23 @@ async def _embed_batch(texts: list[str]) -> list[list[float]]:
     return [d["embedding"] for d in resp.json()["data"]]
-async def _index_l4(records: list[dict[str, Any]]) -> int:
-    """Index records into the L4 sqlite-vec layer."""
-    payload = {"records": [
-        {"id": r.get("id") or hashlib.sha1(r["content"].encode()).hexdigest()[:32],
-         "text": r["content"]} for r in records
-    ]}
-    try:
-        resp = await _client().post(f"{L4_VEC_URL}/index-batch", json=payload, timeout=120.0)
-        resp.raise_for_status()
-        return resp.json().get("inserted", 0)
-    except Exception as exc:
-        print(f"[shim] L4 index-batch failed: {exc}")
-        return 0
-async def _index_l5(records: list[dict[str, Any]], arena: str = "general") -> int:
+async def _index_l5(
+    records: list[dict[str, Any]],
+    arena: str = "general",
+    embeddings: list[list[float]] | None = None,
+) -> int:
     """Index records into the L5 Milvus comms layer (chats collection).
     arena is forwarded as a Milvus dynamic field so /search can filter
     by arena natively (vs the shim's defence-in-depth post-filter).
+    When `embeddings` is supplied (parallel to records), L5 skips its
+    own embed call — the shim pre-computes vectors once at /store-batch
+    level and threads them through each layer to avoid 3× redundant
+    embed RPCs (L5 + L6 + L2-internal otherwise each re-embed the same
+    texts in parallel).
     """
-    payload = {
+    payload: dict[str, Any] = {
         "collection": "chats",
         "records": [
             {
@@ -287,21 +316,32 @@ async def _index_l5(records: list[dict[str, Any]], arena: str = "general") -> in
             for r in records
         ],
     }
+    if embeddings is not None:
+        payload["embeddings"] = embeddings
     try:
         resp = await _client().post(f"{L5_MILVUS_URL}/index-batch", json=payload, timeout=60.0)
         resp.raise_for_status()
         return resp.json().get("inserted", 0)
     except Exception as exc:
-        # Best-effort: L5 is one of six redundant layers; failure here doesn't
-        # mean the record is unsearchable. L0 BM25 + L4 vec + L6 doc-store
-        # all carry it independently.
+        # Best-effort: L5 is one of five redundant layers; failure here
+        # doesn't mean the record is unsearchable. L0 BM25 + L4 QMD +
+        # L6 doc-store all carry it independently.
         print(f"[shim] L5 index-batch failed: {exc}")
         return 0
-async def _index_l6(records: list[dict[str, Any]], arena: str = "general") -> int:
-    """Index records into the L6 document store."""
-    payload = {
+async def _index_l6(
+    records: list[dict[str, Any]],
+    arena: str = "general",
+    embeddings: list[list[float]] | None = None,
+) -> int:
+    """Index records into the L6 document store.
+    When `embeddings` is supplied (parallel to records), L6 skips its
+    own embed call — the shim pre-computes vectors once at /store-batch
+    level and threads them through each layer.
+    """
+    payload: dict[str, Any] = {
         "arena": arena,
         "records": [
             {
@@ -314,6 +354,8 @@ async def _index_l6(records: list[dict[str, Any]], arena: str = "general") -> in
             for r in records
         ],
     }
+    if embeddings is not None:
+        payload["embeddings"] = embeddings
     try:
         resp = await _client().post(f"{L6_DOC_URL}/index-batch", json=payload, timeout=120.0)
         resp.raise_for_status()
@@ -323,14 +365,23 @@ async def _index_l6(records: list[dict[str, Any]], arena: str = "general") -> in
         return 0
-async def _index_l2_internal(records: list[dict[str, Any]], arena: str = "general") -> dict:
+async def _index_l2_internal(
+    records: list[dict[str, Any]],
+    arena: str = "general",
+    embeddings: list[list[float]] | None = None,
+) -> dict:
     """Populate L2's internal stores: L0 BM25 + L4 QMD vec + L3 Neo4j KG.
     Without this, L2's RRF fusion runs over empty L0/L4-qmd/L3 layers and
     those zero-result rank lists pollute the score. The L2 proxy exposes
     /index-internal-batch which writes to all three in one round-trip.
+    When `embeddings` is supplied (parallel to records), L2's internal
+    embed call (used for L4-QMD population) is skipped — the shim
+    pre-computes vectors once at /store-batch level and threads them
+    through to L4_QMD via this endpoint.
     """
-    payload = {
+    payload: dict[str, Any] = {
         "arena": arena,
         "records": [
             {
@@ -341,6 +392,8 @@ async def _index_l2_internal(records: list[dict[str, Any]], arena: str = "genera
             for r in records
         ],
     }
+    if embeddings is not None:
+        payload["embeddings"] = embeddings
     try:
         resp = await _client().post(f"{L2_PROXY_URL}/index-internal-batch",
                                     json=payload, timeout=180.0)
@@ -454,25 +507,25 @@ async def health():
     nv_embed_health = urlunparse((_u.scheme, _u.netloc, "/health", "", "", ""))
     import asyncio
-    l2_v, l4_v, l5_v, l6_v, nv_v, l3_v = await asyncio.gather(
+    l2_v, l5_v, l6_v, nv_v, l3_v = await asyncio.gather(
         _probe(f"{L2_PROXY_URL}/health"),
-        _probe(f"{L4_VEC_URL}/health"),
         _probe(f"{L5_MILVUS_URL}/health"),
         _probe(f"{L6_DOC_URL}/health"),
         _probe(nv_embed_health),
         _probe_l3(),
     )
-    # L0 BM25 (FTS5) and L1 (always-loaded core files) are both in-process
-    # inside the L2 proxy. They have no separate runtime; if L2 is healthy,
-    # both layers are usable. Tie their status to L2.
+    # L0 BM25 (FTS5), L1 (always-loaded core files) and L4 QMD vec are
+    # all in-process inside the L2 proxy — L0+L1 in workspace.db / core
+    # files; L4 in qmd.sqlite which L2 opens directly. No separate runtime;
+    # if L2 is healthy, all three layers are usable. Tie their status to L2.
     l2_ok = l2_v == "ok"
     out["layers"] = {
         "l0": "ok" if l2_ok else l2_v,
         "l1": "ok" if l2_ok else l2_v,
         "l2": l2_v,
         "l3": l3_v,
-        "l4": l4_v,
+        "l4": "ok" if l2_ok else l2_v,
         "l5": l5_v,
         "l6": l6_v,
         "nv_embed": nv_v,
@@ -493,19 +546,15 @@ async def health():
         "l6_vector_chunks": None,
         "l6_fts_chunks": None,
     }
-    # L0 lives inside L2; L2 exposes /index-internal-stats with the counts.
+    # L0 and L4 both live inside L2 (workspace.db + qmd.sqlite directly
+    # opened by the L2 proxy). L2 exposes /index-internal-stats with both
+    # counts in one round-trip.
     try:
         r = await _client().get(f"{L2_PROXY_URL}/index-internal-stats", timeout=3.0)
         if r.status_code == 200:
             stats = r.json()
             memories["l0_bm25_chunks"] = int(stats.get("l0_chunks") or 0)
-    except Exception:
-        pass
-    # L4 reports n_vectors on its own /health.
-    try:
-        r = await _client().get(f"{L4_VEC_URL}/health", timeout=3.0)
-        if r.status_code == 200:
-            memories["l4_vectors"] = int(r.json().get("n_vectors") or 0)
+            memories["l4_vectors"] = int(stats.get("l4_qmd_chunks") or 0)
     except Exception:
         pass
     # L5 reports per-collection counts on /health. We surface chats —
@@ -558,8 +607,9 @@ async def health_deep():
         except Exception as exc:
             return name, {"ok": False, "status": f"unreachable: {type(exc).__name__}"}
+    # L4 is in-process inside L2 (qmd.sqlite direct-read) — its deep
+    # round-trip is covered by L2's /health/deep, no separate probe needed.
     results = await asyncio.gather(
-        _probe_deep("l4", f"{L4_VEC_URL}/health/deep"),
         _probe_deep("l5", f"{L5_MILVUS_URL}/health/deep"),
         _probe_deep("l6", f"{L6_DOC_URL}/health/deep"),
     )
@@ -599,15 +649,15 @@ async def store(req: StoreRequest):
     # depending on which one was supplied).
     _stash_all_keys(rid, req.metadata or {}, arena)
-    # Fan out to L4 + L5 + L6 + L2-internal (L0+L4qmd+L3) in parallel.
+    # Fan out to L5 + L6 + L2-internal (L0+L4qmd+L3) in parallel.
     import asyncio
-    l4_count, l5_count, l6_count, l2_internal = await asyncio.gather(
-        _index_l4([record]),
+    l5_count, l6_count, l2_internal = await asyncio.gather(
         _index_l5([record], arena=arena),
         _index_l6([record], arena=arena),
         _index_l2_internal([record], arena=arena),
     )
+    l4_qmd_count = l2_internal.get("l4_qmd", 0)
     return {
         "id": rid,
         "content": req.content,
@@ -616,8 +666,11 @@ async def store(req: StoreRequest):
             "l0": l2_internal.get("l0", 0),
             "l3_chunks": l2_internal.get("l3_chunks", 0),
             "l3_entities": l2_internal.get("l3_entities", 0),
-            "l4_qmd": l2_internal.get("l4_qmd", 0),
-            "l4": l4_count,
+            "l4_qmd": l4_qmd_count,
+            # `l4` is aliased to L4_QMD now that the standalone L4 sqlite-vec
+            # sidecar has been dropped. Kept in the response for wire-format
+            # back-compat with callers that read engine.l4.
+            "l4": l4_qmd_count,
             "l5": l5_count,
             "l6": l6_count,
         },
@@ -646,24 +699,51 @@ async def store_batch(req: StoreBatchRequest):
     t0 = time.perf_counter()
     import asyncio
-    l4_count, l5_count, l6_count, l2_internal = await asyncio.gather(
-        _index_l4(normalised),
-        _index_l5(normalised, arena=req.arena or "general"),
-        _index_l6(normalised, arena=req.arena or "general"),
-        _index_l2_internal(normalised, arena=req.arena or "general"),
+    # Shared-embed mode: compute embeddings ONCE here, pass them down to
+    # every layer so they skip their own embed call. Previously L5 + L6
+    # + L2-internal each re-embedded the same texts in parallel, which
+    # fanned 3× the gateway RPCs. The gateway throttles at K≈10 concurrent
+    # requests, so 30-way fan-out serialised into ~3 rounds of ~850ms
+    # each = ~2.5s of pure embed time per /store-batch. With shared
+    # embeddings we issue one chunked embed pass (10 sub-calls for N=50
+    # records) and skip the per-layer redundant work entirely.
+    # Disabled via PME_SHARE_EMBEDDINGS=false for operators wiring up
+    # per-layer differentiated embedders.
+    shared_embeddings: list[list[float]] | None = None
+    embed_ms = 0.0
+    if SHARE_EMBEDDINGS and normalised:
+        texts = [r["content"] for r in normalised]
+        embed_t0 = time.perf_counter()
+        try:
+            shared_embeddings = await _get_embed_client().embed_batch_async(texts)
+        except Exception as exc:
+            # Fall back to per-layer embedding rather than failing the
+            # whole batch. The layers' /index-batch still works when
+            # `embeddings` is absent.
+            print(f"[shim] shared embed failed, falling back to per-layer: {exc}")
+            shared_embeddings = None
+        embed_ms = (time.perf_counter() - embed_t0) * 1000.0
+    l5_count, l6_count, l2_internal = await asyncio.gather(
+        _index_l5(normalised, arena=req.arena or "general", embeddings=shared_embeddings),
+        _index_l6(normalised, arena=req.arena or "general", embeddings=shared_embeddings),
+        _index_l2_internal(normalised, arena=req.arena or "general", embeddings=shared_embeddings),
     )
     dur_ms = (time.perf_counter() - t0) * 1000.0
+    l4_qmd_count = l2_internal.get("l4_qmd", 0)
     return {
         "status": "ok",
-        "inserted": max(l4_count, l5_count, l6_count),
+        "inserted": max(l4_qmd_count, l5_count, l6_count),
         "ids": [r["id"] for r in normalised],
         "engine": {
             "l0": l2_internal.get("l0", 0),
             "l3_chunks": l2_internal.get("l3_chunks", 0),
             "l3_entities": l2_internal.get("l3_entities", 0),
-            "l4_qmd": l2_internal.get("l4_qmd", 0),
-            "l4": l4_count,
+            "l4_qmd": l4_qmd_count,
+            # `l4` aliased to L4_QMD — sidecar dropped, see /store handler.
+            "l4": l4_qmd_count,
             "l5": l5_count,
             "l6": l6_count,
         },

package/packages/memory-engine/docker-compose.test.yml CHANGED Viewed

@@ -32,12 +32,6 @@ services:
   # Pin the embedding dim explicitly across layers, independent of any
   # developer-local .env (which may set EMBED_DIM=768 for Ollama-based
   # local dev). The stub returns 4096; layers must agree.
-  l4:
-    environment:
-      L4_NV_EMBED_URL: http://embed-stub:8041/v1/embeddings
-      L4_EMBED_API_KEY: ""
-      L4_EMBED_DIM: "4096"
   l5:
     environment:
       L5_NV_EMBED_URL: http://embed-stub:8041/v1/embeddings
@@ -61,6 +55,5 @@ services:
       embed-stub:
         condition: service_healthy
       l2: { condition: service_started }
-      l4: { condition: service_started }
       l5: { condition: service_started }
       l6: { condition: service_started }

package/packages/memory-engine/docker-compose.yml CHANGED Viewed

@@ -82,36 +82,6 @@ services:
       retries: 30
       start_period: 30s
-  # --------------------------------------------------------------------
-  # L4 — sqlite-vec sidecar
-  # --------------------------------------------------------------------
-  l4:
-    <<: *engine-base
-    build:
-      context: ./engine/services
-      dockerfile: l4/Dockerfile
-    container_name: pme-l4
-    # Default 18042 to avoid port collisions on 8042.
-    # Override via PME_L4_PORT for bench setups that intentionally replace it.
-    ports: ["127.0.0.1:${PME_L4_PORT:-18042}:8042"]
-    environment:
-      L4_NV_EMBED_URL: ${NV_EMBED_URL:-http://host.docker.internal:8041/v1/embeddings}
-      L4_EMBED_MODEL: ${EMBED_MODEL_NAME:-nv-embed-v2}
-      L4_EMBED_API_KEY: ${EMBED_API_KEY:-}
-      L4_EMBED_PROVIDER: ${EMBED_PROVIDER:-openai}
-      L4_EMBED_AUTODETECT: ${EMBED_AUTODETECT:-true}
-      L4_EMBED_DIM: ${EMBED_DIM:-4096}
-      L4_DB_PATH: /data/vec.db
-    extra_hosts:
-      - "host.docker.internal:host-gateway"
-    volumes:
-      - pme-l4-data:/data
-    healthcheck:
-      test: ["CMD", "python", "-c", "import urllib.request,sys; urllib.request.urlopen('http://localhost:8042/health',timeout=3)"]
-      interval: 10s
-      timeout: 5s
-      retries: 30
   # --------------------------------------------------------------------
   # L5 — Qdrant comms layer
   # --------------------------------------------------------------------
@@ -212,8 +182,11 @@ services:
   compat:
     <<: *engine-base
     build:
-      context: ./compat
-      dockerfile: Dockerfile
+      # Build context is the memory-engine root so the Dockerfile can
+      # COPY both compat/server.py and engine/services/_shared (shared
+      # EmbedClient for /store-batch dedup).
+      context: .
+      dockerfile: compat/Dockerfile
     container_name: pme-compat
     ports:
       - "127.0.0.1:${PME_PORT:-8099}:8099"
@@ -221,16 +194,25 @@ services:
       L0_URL: http://l2:8031
       L2_PROXY_URL: http://l2:8031
       L3_KG_URL: http://l3:7474
-      L4_VEC_URL: http://l4:8042
       L5_MILVUS_URL: http://l5:8034
       L6_DOC_URL: http://l6:8037
       NV_EMBED_URL: ${NV_EMBED_URL:-http://host.docker.internal:8041/v1/embeddings}
+      # PME_ prefix vars feed the shim's EmbedClient for shared-embed
+      # mode on /store-batch (one embed call across all 3 indexers vs
+      # 3 redundant calls). Match the L2 config block so both clients
+      # hit the same gateway with the same model. Set
+      # PME_SHARE_EMBEDDINGS=false to revert to per-layer embedding.
+      PME_NV_EMBED_URL: ${NV_EMBED_URL:-http://host.docker.internal:8041/v1/embeddings}
+      PME_EMBED_API_KEY: ${EMBED_API_KEY:-}
+      PME_EMBED_PROVIDER: ${EMBED_PROVIDER:-openai}
+      PME_EMBED_AUTODETECT: ${EMBED_AUTODETECT:-true}
+      PME_NV_EMBED_MODEL: ${EMBED_MODEL_NAME:-nv-embed-v2}
+      PME_SHARE_EMBEDDINGS: ${PME_SHARE_EMBEDDINGS:-true}
       BYPASS_L2_PROXY: ${BYPASS_L2_PROXY:-0}
     extra_hosts:
       - "host.docker.internal:host-gateway"
     depends_on:
       l2: { condition: service_started }
-      l4: { condition: service_started }
       l5: { condition: service_started }
       l6: { condition: service_started }
     healthcheck:
@@ -247,6 +229,5 @@ volumes:
   pme-nv-embed-cache:
   pme-l2-data:
   pme-l3-data:
-  pme-l4-data:
   pme-l5-data:
   pme-l6-data:

package/packages/memory-engine/engine/services/l2/l2-hybridrag-proxy.py CHANGED Viewed

@@ -1496,6 +1496,12 @@ async def _embed_batch_local(texts: List[str]) -> List[List[float]]:
 class IndexInternalBatchRequest(BaseModel):
     records: List[Dict[str, Any]]  # [{"id": str, "content": str, "metadata": dict}, ...]
     arena: Optional[str] = "general"
+    # When supplied (parallel to `records`), skip the L4-QMD embed call
+    # and use these vectors directly. Compat shim populates this when
+    # shared-embed mode is on so we don't duplicate embed work across
+    # layers. Length must match records — defensive bail-out below if
+    # it doesn't.
+    embeddings: Optional[List[List[float]]] = None
 @app.post("/index-internal-batch")
@@ -1575,7 +1581,19 @@ async def index_internal_batch(req: IndexInternalBatchRequest) -> dict:
     # ---- L4 QMD vec (qmd.sqlite) ----------------------------------------
     l4_inserted = 0
     try:
-        embeddings = await _embed_batch_local([n["content"] for n in norm])
+        # Shared-embed shortcut: if the compat shim handed us pre-computed
+        # vectors that line up with our normalised records, use them and
+        # skip our own embed RPC. Fall back to per-layer embedding when
+        # the vectors are absent or the lengths don't match (defensive).
+        shared_embs = req.embeddings
+        if (
+            shared_embs is not None
+            and len(shared_embs) == len(records)
+            and len(records) == len(norm)
+        ):
+            embeddings = shared_embs
+        else:
+            embeddings = await _embed_batch_local([n["content"] for n in norm])
         if len(embeddings) != len(norm):
             log.warning(f"L4 embed count mismatch: {len(embeddings)} != {len(norm)}")
         qmd_db = Path(QMD_DB_PATH)

package/packages/memory-engine/engine/services/l5/l5-comms-layer.py CHANGED Viewed

@@ -629,13 +629,19 @@ def serve(port=8034):
         client = get_client()
         ensure_collection(client, collection)
-        # Single batched embed call.
+        # Shared-embed shortcut: caller (compat shim) computed vectors
+        # once and forwards them so we skip the embed RPC. Length must
+        # match records — fall back to per-layer embed if it doesn't.
         texts = [(r.get("text") or "")[:8192] for r in records]
+        shared_embs = req.get("embeddings")
         t0 = _time.time()
-        try:
-            embs = _embed_post(texts)
-        except Exception as exc:
-            return {"status": "error", "error": f"embed failed: {exc}"}
+        if isinstance(shared_embs, list) and len(shared_embs) == len(records):
+            embs = shared_embs
+        else:
+            try:
+                embs = _embed_post(texts)
+            except Exception as exc:
+                return {"status": "error", "error": f"embed failed: {exc}"}
         embed_ms = (_time.time() - t0) * 1000.0
         # Single batched insert. Mirror every field the chats collection

package/packages/memory-engine/engine/services/l6/l6-document-store.py CHANGED Viewed

@@ -990,12 +990,18 @@ def serve(port: int = DEFAULT_PORT):
         texts = [(r.get("text") or "")[:16000] for r in records]
-        # Single batched embed call (OpenAI-compat first, lambda-gateway fallback).
+        # Shared-embed shortcut: caller (compat shim) computed vectors
+        # once and forwards them so we skip the embed RPC. Length must
+        # match records — fall back to per-layer embed if it doesn't.
+        shared_embs = req.get("embeddings")
         t0 = _time.time()
-        try:
-            embs = _embed_post(texts)
-        except Exception as exc:
-            raise HTTPException(status_code=500, detail=f"embed failed: {exc}")
+        if isinstance(shared_embs, list) and len(shared_embs) == len(records):
+            embs = shared_embs
+        else:
+            try:
+                embs = _embed_post(texts)
+            except Exception as exc:
+                raise HTTPException(status_code=500, detail=f"embed failed: {exc}")
         embed_ms = (_time.time() - t0) * 1000.0
         # Single milvus insert.

package/packages/memory-engine/engine/services/l4/Dockerfile DELETED Viewed

@@ -1,19 +0,0 @@
-FROM python:3.12-slim
-WORKDIR /app
-RUN pip install --no-cache-dir fastapi 'uvicorn[standard]' httpx pydantic
-# Build context is engine/services so the shared embed_provider module is
-# COPYable. server.py adds engine/services to sys.path at startup, then
-# imports from `_shared.embed_provider`.
-COPY _shared /app/_shared
-COPY l4/server.py /app/server.py
-RUN mkdir -p /data
-ENV L4_DB_PATH=/data/vec.db
-ENV PORT=8042
-EXPOSE 8042
-CMD ["python", "server.py", "--port", "8042"]

package/packages/memory-engine/engine/services/l4/server.py DELETED Viewed

@@ -1,305 +0,0 @@
-"""
-L4 sqlite-vec sidecar.
-Vector index sidecar for the Pentatonic Memory Engine stack.
-Exposes /health, /search, /index-batch, /refresh over HTTP.
-Endpoints:
-    GET  /health
-    POST /search       body: {"query":"...", "limit":10}
-    POST /index-batch  body: {"records":[{"id","text"}, ...]}
-    POST /refresh      no-op (sqlite-vec writes are immediate)
-Env:
-    L4_DB_PATH       default /data/vec.db
-    L4_NV_EMBED_URL  default http://nv-embed:8041/v1/embeddings
-    PORT             default 8042
-"""
-from __future__ import annotations
-import argparse
-import hashlib
-import os
-import sqlite3
-import struct
-import sys
-import time
-from pathlib import Path
-from typing import Any
-from fastapi import FastAPI, HTTPException
-from pydantic import BaseModel
-# Shared embedding client lives at engine/services/_shared/. Add the parent of
-# the service dir to sys.path so `from _shared.embed_provider import ...` works
-# regardless of how the service is launched (uvicorn, python server.py, etc.).
-sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
-from _shared.embed_provider import EmbedClient  # noqa: E402
-# ----------------------------------------------------------------------
-# Config
-# ----------------------------------------------------------------------
-DB_PATH = os.environ.get("L4_DB_PATH", "/data/vec.db")
-EMBED_DIM = int(os.environ.get("L4_EMBED_DIM", "4096"))
-# ----------------------------------------------------------------------
-# DB helpers
-# ----------------------------------------------------------------------
-def _vec_to_blob(vec: list[float]) -> bytes:
-    """Pack a list of floats as little-endian f32 bytes for sqlite-vec."""
-    return struct.pack(f"<{len(vec)}f", *vec)
-def _blob_to_vec(blob: bytes) -> list[float]:
-    n = len(blob) // 4
-    return list(struct.unpack(f"<{n}f", blob))
-def _cosine(a: list[float], b: list[float]) -> float:
-    import math
-    dot = sum(x * y for x, y in zip(a, b))
-    na = math.sqrt(sum(x * x for x in a))
-    nb = math.sqrt(sum(y * y for y in b))
-    if na == 0 or nb == 0:
-        return 0.0
-    return dot / (na * nb)
-def _get_db() -> sqlite3.Connection:
-    """Open DB and ensure schema. We use plain BLOB columns rather than
-    the sqlite-vec virtual table because sqlite-vec is an optional ext
-    that may not be loadable in every container — plain BLOB lets us
-    fall back to a Python-side cosine pass without losing correctness.
-    """
-    Path(DB_PATH).parent.mkdir(parents=True, exist_ok=True)
-    conn = sqlite3.connect(DB_PATH, timeout=10)
-    conn.execute("PRAGMA journal_mode=WAL")
-    conn.execute("""
-        CREATE TABLE IF NOT EXISTS chunks (
-            id TEXT PRIMARY KEY,
-            text TEXT,
-            embedding BLOB,
-            indexed_at REAL
-        )
-    """)
-    return conn
-# ----------------------------------------------------------------------
-# Embedding client
-# ----------------------------------------------------------------------
-_embed: EmbedClient | None = None
-def _embed_client() -> EmbedClient:
-    """Lazily build the embed client so env vars are read at first use."""
-    global _embed
-    if _embed is None:
-        _embed = EmbedClient.from_env(
-            prefix="L4_",
-            default_url="http://nv-embed:8041/v1/embeddings",
-        )
-    return _embed
-async def _embed_batch(texts: list[str]) -> list[list[float]]:
-    """Embed a batch of texts via the shared EmbedClient."""
-    return await _embed_client().embed_batch_async(texts)
-# ----------------------------------------------------------------------
-# FastAPI
-# ----------------------------------------------------------------------
-class SearchRequest(BaseModel):
-    query: str
-    limit: int = 10
-class IndexBatchRequest(BaseModel):
-    records: list[dict[str, Any]]
-app = FastAPI(title="L4 sqlite-vec sidecar (OSS)")
-@app.get("/health")
-def health():
-    try:
-        conn = _get_db()
-        n = conn.execute("SELECT COUNT(*) FROM chunks").fetchone()[0]
-        conn.close()
-        return {"status": "ok", "loaded": True, "n_vectors": n,
-                "dim": EMBED_DIM, "db_path": DB_PATH,
-                # BLOB+Python-cosine is the intentional implementation path,
-                # not a degraded fallback (see _get_db docstring). The previous
-                # "sqlite-vec-fallback" label gave operators the wrong signal.
-                "backend": "sqlite-vec"}
-    except Exception as exc:
-        return {"status": "degraded", "error": str(exc)}
-@app.post("/search")
-async def search(req: SearchRequest):
-    if not req.query:
-        return []
-    try:
-        embs = await _embed_batch([req.query])
-        if not embs or embs[0] is None:
-            raise HTTPException(status_code=502, detail="embed failed")
-        q_vec = embs[0]
-    except Exception as exc:
-        raise HTTPException(status_code=502, detail=f"embed: {exc}")
-    conn = _get_db()
-    rows = conn.execute("SELECT id, text, embedding FROM chunks").fetchall()
-    conn.close()
-    # Cosine similarity in Python — fine for OSS / small corpora. For
-    # large corpora: consider a dedicated vector DB.
-    scored: list[tuple[float, str, str]] = []
-    for rid, text, blob in rows:
-        if not blob:
-            continue
-        v = _blob_to_vec(blob)
-        if len(v) != len(q_vec):
-            continue
-        s = _cosine(q_vec, v)
-        scored.append((s, rid, text))
-    scored.sort(reverse=True)
-    out = [
-        {"path": rid, "text": text, "score": float(s),
-         "source": "L4-sqlite-vec", "layer": "L4"}
-        for s, rid, text in scored[: req.limit]
-    ]
-    return out
-@app.post("/index-batch")
-async def index_batch(req: IndexBatchRequest):
-    if not req.records:
-        return {"status": "ok", "inserted": 0}
-    texts = [(r.get("text") or r.get("content") or "")[:8192] for r in req.records]
-    t0 = time.perf_counter()
-    embs = await _embed_batch(texts)
-    embed_ms = (time.perf_counter() - t0) * 1000.0
-    conn = _get_db()
-    t1 = time.perf_counter()
-    rows = []
-    for r, emb, txt in zip(req.records, embs, texts):
-        if not emb:
-            continue
-        rid = r.get("id") or hashlib.sha1(txt.encode("utf-8")).hexdigest()[:32]
-        rows.append((rid, txt, _vec_to_blob(emb), time.time()))
-    if rows:
-        conn.executemany(
-            "INSERT OR REPLACE INTO chunks(id, text, embedding, indexed_at) "
-            "VALUES (?, ?, ?, ?)", rows,
-        )
-        conn.commit()
-    insert_ms = (time.perf_counter() - t1) * 1000.0
-    conn.close()
-    return {"status": "ok", "inserted": len(rows),
-            "embed_ms": round(embed_ms, 1), "insert_ms": round(insert_ms, 1)}
-@app.post("/refresh")
-def refresh():
-    """No-op for sqlite-vec — writes are immediate. Kept for API parity."""
-    return {"status": "ok", "noop": True}
-# ----------------------------------------------------------------------
-# /health/deep — synthetic round-trip
-# ----------------------------------------------------------------------
-# Fixed sentinel id used by /health/deep. Upserted on every probe call,
-# so the row is idempotent. Kept under id="__healthcheck__sentinel" so
-# the L4 corpus has at most one healthcheck row regardless of probe rate.
-_HEALTH_SENTINEL_ID = "__healthcheck__sentinel"
-_HEALTH_SENTINEL_TEXT = (
-    "healthcheck sentinel — embed-write-search round-trip verifier"
-)
-@app.get("/health/deep")
-async def health_deep():
-    """Real functional probe: embed → write → search the sentinel.
-    Catches the class of failure that plain /health misses — broken
-    embed paths, write 500s, query path bugs — i.e. exactly the bug
-    shape that silently degraded L6 from v0.8.0 → v0.8.2.
-    Returns:
-        {status, embed_ms, write_ms, search_ms, hit, ok}
-    `hit` confirms the sentinel was returned from search; `ok` is the
-    aggregate `embed_ok AND write_ok AND hit`. HTTP status stays 200
-    regardless so callers can read the body for diagnostics; status:
-    field carries the verdict.
-    """
-    t_total = time.perf_counter()
-    out: dict[str, Any] = {"status": "ok", "ok": True}
-    try:
-        t0 = time.perf_counter()
-        embs = await _embed_batch([_HEALTH_SENTINEL_TEXT])
-        out["embed_ms"] = round((time.perf_counter() - t0) * 1000.0, 1)
-        if not embs or not embs[0]:
-            out["status"] = "embed_failed"
-            out["ok"] = False
-            return out
-        vec = embs[0]
-    except Exception as exc:
-        out["status"] = f"embed_error: {type(exc).__name__}"
-        out["ok"] = False
-        return out
-    try:
-        conn = _get_db()
-        t1 = time.perf_counter()
-        conn.execute(
-            "INSERT OR REPLACE INTO chunks(id, text, embedding, indexed_at) "
-            "VALUES (?, ?, ?, ?)",
-            (_HEALTH_SENTINEL_ID, _HEALTH_SENTINEL_TEXT, _vec_to_blob(vec), time.time()),
-        )
-        conn.commit()
-        out["write_ms"] = round((time.perf_counter() - t1) * 1000.0, 1)
-        t2 = time.perf_counter()
-        rows = conn.execute("SELECT id, embedding FROM chunks WHERE id = ?",
-                            (_HEALTH_SENTINEL_ID,)).fetchone()
-        out["search_ms"] = round((time.perf_counter() - t2) * 1000.0, 1)
-        conn.close()
-    except Exception as exc:
-        out["status"] = f"db_error: {type(exc).__name__}"
-        out["ok"] = False
-        return out
-    out["hit"] = rows is not None
-    if not out["hit"]:
-        out["status"] = "sentinel_missing"
-        out["ok"] = False
-    out["total_ms"] = round((time.perf_counter() - t_total) * 1000.0, 1)
-    return out
-# ----------------------------------------------------------------------
-# Entrypoint
-# ----------------------------------------------------------------------
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--port", type=int, default=int(os.environ.get("PORT", "8042")))
-    parser.add_argument("--data-dir", default=None)
-    args = parser.parse_args()
-    if args.data_dir:
-        os.environ["L4_DB_PATH"] = str(Path(args.data_dir) / "vec.db")
-    import uvicorn
-    uvicorn.run("server:app", host="0.0.0.0", port=args.port, log_level="info")