@pentatonic-ai/ai-agent-sdk 0.9.3 → 0.9.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -906,7 +906,7 @@ function fireAndForgetEmit(clientConfig, sessionOpts, messages, result, model) {
906
906
  }
907
907
 
908
908
  // src/telemetry.js
909
- var VERSION = "0.9.3";
909
+ var VERSION = "0.9.5";
910
910
  var TELEMETRY_URL = "https://sdk-telemetry.philip-134.workers.dev";
911
911
  function machineId() {
912
912
  const raw = typeof process !== "undefined" ? `${process.env?.USER || process.env?.USERNAME || "u"}:${process.platform || "x"}:${process.arch || "x"}` : "browser";
package/dist/index.js CHANGED
@@ -875,7 +875,7 @@ function fireAndForgetEmit(clientConfig, sessionOpts, messages, result, model) {
875
875
  }
876
876
 
877
877
  // src/telemetry.js
878
- var VERSION = "0.9.3";
878
+ var VERSION = "0.9.5";
879
879
  var TELEMETRY_URL = "https://sdk-telemetry.philip-134.workers.dev";
880
880
  function machineId() {
881
881
  const raw = typeof process !== "undefined" ? `${process.env?.USER || process.env?.USERNAME || "u"}:${process.platform || "x"}:${process.arch || "x"}` : "browser";
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@pentatonic-ai/ai-agent-sdk",
3
- "version": "0.9.3",
3
+ "version": "0.9.5",
4
4
  "description": "TES SDK — LLM observability and lifecycle tracking via Pentatonic Thing Event System. Track token usage, tool calls, and conversations. Manage things through event-sourced lifecycle stages with AI enrichment and vector search.",
5
5
  "type": "module",
6
6
  "main": "./dist/index.cjs",
@@ -568,9 +568,9 @@
568
568
  }
569
569
  },
570
570
  "node_modules/hono": {
571
- "version": "4.12.12",
572
- "resolved": "https://registry.npmjs.org/hono/-/hono-4.12.12.tgz",
573
- "integrity": "sha512-p1JfQMKaceuCbpJKAPKVqyqviZdS0eUxH9v82oWo1kb9xjQ5wA6iP3FNVAPDFlz5/p7d45lO+BpSk1tuSZMF4Q==",
571
+ "version": "4.12.18",
572
+ "resolved": "https://registry.npmjs.org/hono/-/hono-4.12.18.tgz",
573
+ "integrity": "sha512-RWzP96k/yv0PQfyXnWjs6zot20TqfpfsNXhOnev8d1InAxubW93L11/oNUc3tQqn2G0bSdAOBpX+2uDFHV7kdQ==",
574
574
  "license": "MIT",
575
575
  "engines": {
576
576
  "node": ">=16.9.0"
@@ -25,7 +25,6 @@ Environment:
25
25
  L0_URL default http://l0:8030
26
26
  L2_PROXY_URL default http://l2:8031
27
27
  L3_KG_URL default http://l3:8047
28
- L4_VEC_URL default http://l4:8042
29
28
  L5_MILVUS_URL default http://l5:8035
30
29
  L6_DOC_URL default http://l6:8037
31
30
  NV_EMBED_URL default http://nv-embed:8041/v1/embeddings
@@ -61,7 +60,6 @@ from _shared.embed_provider import EmbedClient # noqa: E402
61
60
  L0_URL = os.environ.get("L0_URL", "http://l0:8030")
62
61
  L2_PROXY_URL = os.environ.get("L2_PROXY_URL", "http://l2:8031")
63
62
  L3_KG_URL = os.environ.get("L3_KG_URL", "http://l3:8047")
64
- L4_VEC_URL = os.environ.get("L4_VEC_URL", "http://l4:8042")
65
63
  L5_MILVUS_URL = os.environ.get("L5_MILVUS_URL", "http://l5:8035")
66
64
  L6_DOC_URL = os.environ.get("L6_DOC_URL", "http://l6:8037")
67
65
  NV_EMBED_URL = os.environ.get("NV_EMBED_URL", "http://nv-embed:8041/v1/embeddings")
@@ -288,32 +286,6 @@ async def _embed_batch(texts: list[str]) -> list[list[float]]:
288
286
  return [d["embedding"] for d in resp.json()["data"]]
289
287
 
290
288
 
291
- async def _index_l4(
292
- records: list[dict[str, Any]],
293
- embeddings: list[list[float]] | None = None,
294
- ) -> int:
295
- """Index records into the L4 sqlite-vec layer.
296
-
297
- When `embeddings` is supplied (parallel to records), L4's /index-batch
298
- skips its own embed call and uses ours — eliminates the redundant
299
- embed work that previously cost ~850ms per drain alarm. When None,
300
- L4 embeds itself (backwards-compatible path for older callers / tests
301
- that don't share embeddings)."""
302
- payload: dict[str, Any] = {"records": [
303
- {"id": r.get("id") or hashlib.sha1(r["content"].encode()).hexdigest()[:32],
304
- "text": r["content"]} for r in records
305
- ]}
306
- if embeddings is not None:
307
- payload["embeddings"] = embeddings
308
- try:
309
- resp = await _client().post(f"{L4_VEC_URL}/index-batch", json=payload, timeout=120.0)
310
- resp.raise_for_status()
311
- return resp.json().get("inserted", 0)
312
- except Exception as exc:
313
- print(f"[shim] L4 index-batch failed: {exc}")
314
- return 0
315
-
316
-
317
289
  async def _index_l5(
318
290
  records: list[dict[str, Any]],
319
291
  arena: str = "general",
@@ -325,7 +297,10 @@ async def _index_l5(
325
297
  by arena natively (vs the shim's defence-in-depth post-filter).
326
298
 
327
299
  When `embeddings` is supplied (parallel to records), L5 skips its
328
- own embed call — see _index_l4 docstring for the dedup story.
300
+ own embed call — the shim pre-computes vectors once at /store-batch
301
+ level and threads them through each layer to avoid 3× redundant
302
+ embed RPCs (L5 + L6 + L2-internal otherwise each re-embed the same
303
+ texts in parallel).
329
304
  """
330
305
  payload: dict[str, Any] = {
331
306
  "collection": "chats",
@@ -348,9 +323,9 @@ async def _index_l5(
348
323
  resp.raise_for_status()
349
324
  return resp.json().get("inserted", 0)
350
325
  except Exception as exc:
351
- # Best-effort: L5 is one of six redundant layers; failure here doesn't
352
- # mean the record is unsearchable. L0 BM25 + L4 vec + L6 doc-store
353
- # all carry it independently.
326
+ # Best-effort: L5 is one of five redundant layers; failure here
327
+ # doesn't mean the record is unsearchable. L0 BM25 + L4 QMD +
328
+ # L6 doc-store all carry it independently.
354
329
  print(f"[shim] L5 index-batch failed: {exc}")
355
330
  return 0
356
331
 
@@ -363,7 +338,8 @@ async def _index_l6(
363
338
  """Index records into the L6 document store.
364
339
 
365
340
  When `embeddings` is supplied (parallel to records), L6 skips its
366
- own embed call — see _index_l4 docstring for the dedup story.
341
+ own embed call — the shim pre-computes vectors once at /store-batch
342
+ level and threads them through each layer.
367
343
  """
368
344
  payload: dict[str, Any] = {
369
345
  "arena": arena,
@@ -401,8 +377,9 @@ async def _index_l2_internal(
401
377
  /index-internal-batch which writes to all three in one round-trip.
402
378
 
403
379
  When `embeddings` is supplied (parallel to records), L2's internal
404
- embed call (used for L4-QMD population) is skipped — see _index_l4
405
- docstring for the dedup story.
380
+ embed call (used for L4-QMD population) is skipped — the shim
381
+ pre-computes vectors once at /store-batch level and threads them
382
+ through to L4_QMD via this endpoint.
406
383
  """
407
384
  payload: dict[str, Any] = {
408
385
  "arena": arena,
@@ -530,25 +507,25 @@ async def health():
530
507
  nv_embed_health = urlunparse((_u.scheme, _u.netloc, "/health", "", "", ""))
531
508
 
532
509
  import asyncio
533
- l2_v, l4_v, l5_v, l6_v, nv_v, l3_v = await asyncio.gather(
510
+ l2_v, l5_v, l6_v, nv_v, l3_v = await asyncio.gather(
534
511
  _probe(f"{L2_PROXY_URL}/health"),
535
- _probe(f"{L4_VEC_URL}/health"),
536
512
  _probe(f"{L5_MILVUS_URL}/health"),
537
513
  _probe(f"{L6_DOC_URL}/health"),
538
514
  _probe(nv_embed_health),
539
515
  _probe_l3(),
540
516
  )
541
517
 
542
- # L0 BM25 (FTS5) and L1 (always-loaded core files) are both in-process
543
- # inside the L2 proxy. They have no separate runtime; if L2 is healthy,
544
- # both layers are usable. Tie their status to L2.
518
+ # L0 BM25 (FTS5), L1 (always-loaded core files) and L4 QMD vec are
519
+ # all in-process inside the L2 proxy — L0+L1 in workspace.db / core
520
+ # files; L4 in qmd.sqlite which L2 opens directly. No separate runtime;
521
+ # if L2 is healthy, all three layers are usable. Tie their status to L2.
545
522
  l2_ok = l2_v == "ok"
546
523
  out["layers"] = {
547
524
  "l0": "ok" if l2_ok else l2_v,
548
525
  "l1": "ok" if l2_ok else l2_v,
549
526
  "l2": l2_v,
550
527
  "l3": l3_v,
551
- "l4": l4_v,
528
+ "l4": "ok" if l2_ok else l2_v,
552
529
  "l5": l5_v,
553
530
  "l6": l6_v,
554
531
  "nv_embed": nv_v,
@@ -569,19 +546,15 @@ async def health():
569
546
  "l6_vector_chunks": None,
570
547
  "l6_fts_chunks": None,
571
548
  }
572
- # L0 lives inside L2; L2 exposes /index-internal-stats with the counts.
549
+ # L0 and L4 both live inside L2 (workspace.db + qmd.sqlite directly
550
+ # opened by the L2 proxy). L2 exposes /index-internal-stats with both
551
+ # counts in one round-trip.
573
552
  try:
574
553
  r = await _client().get(f"{L2_PROXY_URL}/index-internal-stats", timeout=3.0)
575
554
  if r.status_code == 200:
576
555
  stats = r.json()
577
556
  memories["l0_bm25_chunks"] = int(stats.get("l0_chunks") or 0)
578
- except Exception:
579
- pass
580
- # L4 reports n_vectors on its own /health.
581
- try:
582
- r = await _client().get(f"{L4_VEC_URL}/health", timeout=3.0)
583
- if r.status_code == 200:
584
- memories["l4_vectors"] = int(r.json().get("n_vectors") or 0)
557
+ memories["l4_vectors"] = int(stats.get("l4_qmd_chunks") or 0)
585
558
  except Exception:
586
559
  pass
587
560
  # L5 reports per-collection counts on /health. We surface chats —
@@ -634,8 +607,9 @@ async def health_deep():
634
607
  except Exception as exc:
635
608
  return name, {"ok": False, "status": f"unreachable: {type(exc).__name__}"}
636
609
 
610
+ # L4 is in-process inside L2 (qmd.sqlite direct-read) — its deep
611
+ # round-trip is covered by L2's /health/deep, no separate probe needed.
637
612
  results = await asyncio.gather(
638
- _probe_deep("l4", f"{L4_VEC_URL}/health/deep"),
639
613
  _probe_deep("l5", f"{L5_MILVUS_URL}/health/deep"),
640
614
  _probe_deep("l6", f"{L6_DOC_URL}/health/deep"),
641
615
  )
@@ -675,15 +649,15 @@ async def store(req: StoreRequest):
675
649
  # depending on which one was supplied).
676
650
  _stash_all_keys(rid, req.metadata or {}, arena)
677
651
 
678
- # Fan out to L4 + L5 + L6 + L2-internal (L0+L4qmd+L3) in parallel.
652
+ # Fan out to L5 + L6 + L2-internal (L0+L4qmd+L3) in parallel.
679
653
  import asyncio
680
- l4_count, l5_count, l6_count, l2_internal = await asyncio.gather(
681
- _index_l4([record]),
654
+ l5_count, l6_count, l2_internal = await asyncio.gather(
682
655
  _index_l5([record], arena=arena),
683
656
  _index_l6([record], arena=arena),
684
657
  _index_l2_internal([record], arena=arena),
685
658
  )
686
659
 
660
+ l4_qmd_count = l2_internal.get("l4_qmd", 0)
687
661
  return {
688
662
  "id": rid,
689
663
  "content": req.content,
@@ -692,8 +666,11 @@ async def store(req: StoreRequest):
692
666
  "l0": l2_internal.get("l0", 0),
693
667
  "l3_chunks": l2_internal.get("l3_chunks", 0),
694
668
  "l3_entities": l2_internal.get("l3_entities", 0),
695
- "l4_qmd": l2_internal.get("l4_qmd", 0),
696
- "l4": l4_count,
669
+ "l4_qmd": l4_qmd_count,
670
+ # `l4` is aliased to L4_QMD now that the standalone L4 sqlite-vec
671
+ # sidecar has been dropped. Kept in the response for wire-format
672
+ # back-compat with callers that read engine.l4.
673
+ "l4": l4_qmd_count,
697
674
  "l5": l5_count,
698
675
  "l6": l6_count,
699
676
  },
@@ -724,13 +701,13 @@ async def store_batch(req: StoreBatchRequest):
724
701
  import asyncio
725
702
 
726
703
  # Shared-embed mode: compute embeddings ONCE here, pass them down to
727
- # every layer so they skip their own embed call. Previously L4 + L5
728
- # + L6 + L2-internal each re-embedded the same texts in parallel,
729
- # which fanned 4× the gateway RPCs. The gateway throttles at K≈10
730
- # concurrent requests, so 40-way fan-out serialised into ~4 rounds
731
- # of ~850ms each = ~3.5s of pure embed time per /store-batch. With
732
- # shared embeddings we issue one chunked embed pass (10 sub-calls
733
- # for N=50 records) and skip the per-layer redundant work entirely.
704
+ # every layer so they skip their own embed call. Previously L5 + L6
705
+ # + L2-internal each re-embedded the same texts in parallel, which
706
+ # fanned 3× the gateway RPCs. The gateway throttles at K≈10 concurrent
707
+ # requests, so 30-way fan-out serialised into ~3 rounds of ~850ms
708
+ # each = ~2.5s of pure embed time per /store-batch. With shared
709
+ # embeddings we issue one chunked embed pass (10 sub-calls for N=50
710
+ # records) and skip the per-layer redundant work entirely.
734
711
  # Disabled via PME_SHARE_EMBEDDINGS=false for operators wiring up
735
712
  # per-layer differentiated embedders.
736
713
  shared_embeddings: list[list[float]] | None = None
@@ -748,24 +725,25 @@ async def store_batch(req: StoreBatchRequest):
748
725
  shared_embeddings = None
749
726
  embed_ms = (time.perf_counter() - embed_t0) * 1000.0
750
727
 
751
- l4_count, l5_count, l6_count, l2_internal = await asyncio.gather(
752
- _index_l4(normalised, embeddings=shared_embeddings),
728
+ l5_count, l6_count, l2_internal = await asyncio.gather(
753
729
  _index_l5(normalised, arena=req.arena or "general", embeddings=shared_embeddings),
754
730
  _index_l6(normalised, arena=req.arena or "general", embeddings=shared_embeddings),
755
731
  _index_l2_internal(normalised, arena=req.arena or "general", embeddings=shared_embeddings),
756
732
  )
757
733
  dur_ms = (time.perf_counter() - t0) * 1000.0
758
734
 
735
+ l4_qmd_count = l2_internal.get("l4_qmd", 0)
759
736
  return {
760
737
  "status": "ok",
761
- "inserted": max(l4_count, l5_count, l6_count),
738
+ "inserted": max(l4_qmd_count, l5_count, l6_count),
762
739
  "ids": [r["id"] for r in normalised],
763
740
  "engine": {
764
741
  "l0": l2_internal.get("l0", 0),
765
742
  "l3_chunks": l2_internal.get("l3_chunks", 0),
766
743
  "l3_entities": l2_internal.get("l3_entities", 0),
767
- "l4_qmd": l2_internal.get("l4_qmd", 0),
768
- "l4": l4_count,
744
+ "l4_qmd": l4_qmd_count,
745
+ # `l4` aliased to L4_QMD — sidecar dropped, see /store handler.
746
+ "l4": l4_qmd_count,
769
747
  "l5": l5_count,
770
748
  "l6": l6_count,
771
749
  },
@@ -32,12 +32,6 @@ services:
32
32
  # Pin the embedding dim explicitly across layers, independent of any
33
33
  # developer-local .env (which may set EMBED_DIM=768 for Ollama-based
34
34
  # local dev). The stub returns 4096; layers must agree.
35
- l4:
36
- environment:
37
- L4_NV_EMBED_URL: http://embed-stub:8041/v1/embeddings
38
- L4_EMBED_API_KEY: ""
39
- L4_EMBED_DIM: "4096"
40
-
41
35
  l5:
42
36
  environment:
43
37
  L5_NV_EMBED_URL: http://embed-stub:8041/v1/embeddings
@@ -61,6 +55,5 @@ services:
61
55
  embed-stub:
62
56
  condition: service_healthy
63
57
  l2: { condition: service_started }
64
- l4: { condition: service_started }
65
58
  l5: { condition: service_started }
66
59
  l6: { condition: service_started }
@@ -82,36 +82,6 @@ services:
82
82
  retries: 30
83
83
  start_period: 30s
84
84
 
85
- # --------------------------------------------------------------------
86
- # L4 — sqlite-vec sidecar
87
- # --------------------------------------------------------------------
88
- l4:
89
- <<: *engine-base
90
- build:
91
- context: ./engine/services
92
- dockerfile: l4/Dockerfile
93
- container_name: pme-l4
94
- # Default 18042 to avoid port collisions on 8042.
95
- # Override via PME_L4_PORT for bench setups that intentionally replace it.
96
- ports: ["127.0.0.1:${PME_L4_PORT:-18042}:8042"]
97
- environment:
98
- L4_NV_EMBED_URL: ${NV_EMBED_URL:-http://host.docker.internal:8041/v1/embeddings}
99
- L4_EMBED_MODEL: ${EMBED_MODEL_NAME:-nv-embed-v2}
100
- L4_EMBED_API_KEY: ${EMBED_API_KEY:-}
101
- L4_EMBED_PROVIDER: ${EMBED_PROVIDER:-openai}
102
- L4_EMBED_AUTODETECT: ${EMBED_AUTODETECT:-true}
103
- L4_EMBED_DIM: ${EMBED_DIM:-4096}
104
- L4_DB_PATH: /data/vec.db
105
- extra_hosts:
106
- - "host.docker.internal:host-gateway"
107
- volumes:
108
- - pme-l4-data:/data
109
- healthcheck:
110
- test: ["CMD", "python", "-c", "import urllib.request,sys; urllib.request.urlopen('http://localhost:8042/health',timeout=3)"]
111
- interval: 10s
112
- timeout: 5s
113
- retries: 30
114
-
115
85
  # --------------------------------------------------------------------
116
86
  # L5 — Qdrant comms layer
117
87
  # --------------------------------------------------------------------
@@ -224,13 +194,12 @@ services:
224
194
  L0_URL: http://l2:8031
225
195
  L2_PROXY_URL: http://l2:8031
226
196
  L3_KG_URL: http://l3:7474
227
- L4_VEC_URL: http://l4:8042
228
197
  L5_MILVUS_URL: http://l5:8034
229
198
  L6_DOC_URL: http://l6:8037
230
199
  NV_EMBED_URL: ${NV_EMBED_URL:-http://host.docker.internal:8041/v1/embeddings}
231
200
  # PME_ prefix vars feed the shim's EmbedClient for shared-embed
232
- # mode on /store-batch (one embed call across all 4 indexers vs
233
- # 4 redundant calls). Match the L2 config block so both clients
201
+ # mode on /store-batch (one embed call across all 3 indexers vs
202
+ # 3 redundant calls). Match the L2 config block so both clients
234
203
  # hit the same gateway with the same model. Set
235
204
  # PME_SHARE_EMBEDDINGS=false to revert to per-layer embedding.
236
205
  PME_NV_EMBED_URL: ${NV_EMBED_URL:-http://host.docker.internal:8041/v1/embeddings}
@@ -244,7 +213,6 @@ services:
244
213
  - "host.docker.internal:host-gateway"
245
214
  depends_on:
246
215
  l2: { condition: service_started }
247
- l4: { condition: service_started }
248
216
  l5: { condition: service_started }
249
217
  l6: { condition: service_started }
250
218
  healthcheck:
@@ -252,7 +220,14 @@ services:
252
220
  interval: 10s
253
221
  timeout: 5s
254
222
  retries: 30
255
- start_period: 60s
223
+ # 180s gives L2 enough time to finish Neo4j schema + index creation
224
+ # on a cold start before compat's healthcheck starts counting failures.
225
+ # Observed concretely on the v0.9.4 deploy (2026-05-14): L2 took
226
+ # ~90s to warm up; with start_period: 60s, compat went unhealthy
227
+ # mid-startup, cloudflared's `depends_on: condition: service_healthy`
228
+ # failed, and `docker compose up` errored out before wait_for_health
229
+ # could observe the eventual recovery.
230
+ start_period: 180s
256
231
 
257
232
  networks:
258
233
  engine-net:
@@ -261,6 +236,5 @@ volumes:
261
236
  pme-nv-embed-cache:
262
237
  pme-l2-data:
263
238
  pme-l3-data:
264
- pme-l4-data:
265
239
  pme-l5-data:
266
240
  pme-l6-data:
@@ -9,9 +9,16 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
9
9
 
10
10
  # Reranker = sentence-transformers MiniLM cross-encoder.
11
11
  # Torch CPU wheels are fine — reranker is small enough to be CPU-bound.
12
+ #
13
+ # sqlite-vec 0.1.9: native KNN over packed-f32 vectors stored in a vec0
14
+ # virtual table. Replaces the legacy hand-rolled Python cosine loop over
15
+ # JSON-serialised embeddings in search_qmd_informed (~15s timeout at 450k
16
+ # rows → ~50ms native MATCH). Pin to 0.1.9 — that's the version probed
17
+ # against L4 QMD's wire format (struct.pack f32 + cosine distance_metric).
12
18
  RUN pip install --no-cache-dir \
13
19
  fastapi "uvicorn[standard]" httpx requests pydantic \
14
20
  neo4j \
21
+ sqlite-vec==0.1.9 \
15
22
  "sentence-transformers" \
16
23
  "torch" --extra-index-url https://download.pytorch.org/whl/cpu
17
24