npm - @pentatonic-ai/ai-agent-sdk - Versions diffs - 0.6.0 → 0.7.0 - Mend

@pentatonic-ai/ai-agent-sdk 0.6.0 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (94) hide show

package/packages/memory/src/corpus/cli.js CHANGED Viewed

@@ -6,7 +6,7 @@
  * formatting.
  */
-import { resolve, basename } from "node:path";
+import { resolve } from "node:path";
 import { existsSync, readFileSync } from "node:fs";
 import { promises as fsp } from "node:fs";
 import { execFileSync } from "node:child_process";
@@ -19,6 +19,7 @@ import {
   ingestPaths,
   estimateCorpus,
   hostedAdapter,
+  engineAdapter,
   loadState,
   saveState,
   defaultStatePath,
@@ -64,16 +65,80 @@ function resolveTenant() {
   return null;
 }
+/**
+ * Read the Claude Code plugin config (tes-memory.local.md) to discover
+ * which memory backend the user has configured. Single source of truth
+ * for `tes ingest` so it routes the same way the plugin's hooks do.
+ *
+ * Returns: { mode: "local"|"hosted", memory_url?, tes_endpoint?, … } or null.
+ */
+function readPluginConfig() {
+  const candidates = [
+    process.env.CLAUDE_CONFIG_DIR,
+    join(homedir(), ".claude-pentatonic"),
+    join(homedir(), ".claude"),
+  ].filter(Boolean);
+  for (const dir of candidates) {
+    const p = join(dir, "tes-memory.local.md");
+    if (!existsSync(p)) continue;
+    try {
+      const content = readFileSync(p, "utf-8");
+      const m = content.match(/^---\n([\s\S]*?)\n---/);
+      if (!m) continue;
+      const out = { _path: p };
+      for (const line of m[1].split("\n")) {
+        const kv = line.match(/^(\w+):\s*(.+)$/);
+        if (kv) out[kv[1]] = kv[2].trim();
+      }
+      return out;
+    } catch {
+      continue;
+    }
+  }
+  return null;
+}
 function buildAdapterOrFail() {
+  // 1. Env-var override (CI / scripts / explicit). Highest precedence.
+  const envEngineUrl =
+    process.env.MEMORY_ENGINE_URL || process.env.PENTATONIC_ENGINE_URL || null;
+  if (envEngineUrl) {
+    const arena =
+      process.env.MEMORY_ARENA ||
+      process.env.PENTATONIC_CLIENT_ID ||
+      process.env.TES_CLIENT_ID ||
+      "default";
+    return {
+      tenant: { source: "env-engine", engineUrl: envEngineUrl, arena },
+      adapter: engineAdapter({
+        engineUrl: envEngineUrl,
+        arena,
+        apiKey: process.env.MEMORY_ENGINE_API_KEY || null,
+      }),
+    };
+  }
+  // 2. Plugin config — same source of truth as the Claude Code hooks.
+  const pluginConfig = readPluginConfig();
+  if (pluginConfig?.mode === "local" && pluginConfig.memory_url) {
+    const arena = pluginConfig.client_id || "default";
+    return {
+      tenant: { source: `plugin-config (${pluginConfig._path})`, engineUrl: pluginConfig.memory_url, arena },
+      adapter: engineAdapter({ engineUrl: pluginConfig.memory_url, arena }),
+    };
+  }
+  // 3. Hosted/TES path: env vars + ~/.config/tes/credentials.json.
   const tenant = resolveTenant();
   if (!tenant) {
     process.stderr.write(
-      "Error: TES tenant not configured.\n\n" +
-        "  Set environment variables:\n" +
-        "    export TES_ENDPOINT=https://your-co.api.pentatonic.com\n" +
-        "    export TES_CLIENT_ID=your-co\n" +
-        "    export TES_API_KEY=tes_your-co_xxxxx\n\n" +
-        "  Or run: npx @pentatonic-ai/ai-agent-sdk init\n"
+      "Error: no memory backend configured.\n\n" +
+        "  Configure with one of:\n" +
+        "    npx @pentatonic-ai/ai-agent-sdk config local       # local engine\n" +
+        "    npx @pentatonic-ai/ai-agent-sdk login              # hosted TES\n\n" +
+        "  Or set env vars directly:\n" +
+        "    MEMORY_ENGINE_URL=http://localhost:8099\n" +
+        "    TES_ENDPOINT=… TES_CLIENT_ID=… TES_API_KEY=…\n"
     );
     return null;
   }

package/packages/memory/src/corpus/index.js CHANGED Viewed

@@ -27,7 +27,7 @@
 export { discover, isPathEligible } from "./discover.js";
 export { chunkFile } from "./chunkers.js";
 export { ingestCorpus, syncCorpus, ingestPaths } from "./ingest.js";
-export { localAdapter, hostedAdapter } from "./adapters.js";
+export { localAdapter, hostedAdapter, engineAdapter } from "./adapters.js";
 export {
   loadState,
   saveState,

package/packages/memory-engine/.env.example ADDED Viewed

@@ -0,0 +1,13 @@
+# pentatonic-memory-engine — environment overrides
+# Compat shim port (matches pentatonic-memory v0.5 default)
+PME_PORT=8099
+# Client tenant scoping
+CLIENT_ID=default
+# Neo4j auth (L3 KG)
+NEO4J_AUTH=neo4j/local-dev-pw
+# NV-Embed model (auto-downloaded from Hugging Face on first run)
+NV_EMBED_MODEL=nvidia/NV-Embed-v2

package/packages/memory-engine/README.md ADDED Viewed

@@ -0,0 +1,131 @@
+# pentatonic-memory-engine
+**Drop-in replacement for `pentatonic-memory` v0.5.x with a 7-layer retrieval stack underneath.**
+| Configuration | Mean accuracy* | p50 latency |
+|---|---|---|
+| pentatonic-memory v0.5.6 (current OSS) | 17.6% | 33ms |
+| pentatonic-memory v0.4.7 (legacy OSS) | 38.8% | 27ms |
+| **pentatonic-memory-engine — fast path** (L6-only via docker, default config) | **84.6%** | **110ms** |
+| **pentatonic-memory-engine — max accuracy** (full 7-layer L2 fusion) | **85.7%** | **1241ms** |
+| langmem (in-process) | 83.0% | 121ms |
+| cognee | 82.1% | 192ms |
+| single-store baseline | 79.3% | 110ms |
+\* Mean over 6 commerce-domain benches (agent-coding, chat-recall, circular-economy, customer-support, marketplace-ops, product-catalogue) using substring grading. Full reports under `bench/`.
+**Two configurations, same package.** The fast path (L6-only) is the default and ships at #1 on accuracy among real OSS memory stacks. The max-accuracy 7-layer mode adds Knowledge-Graph entity matching + L0 BM25 + L4 vec fusion via the L2 orchestrator — buys you +1.1pp at 11× latency. Pick per workload (live agent loop → fast path; offline batch / accuracy-graded eval → 7-layer).
+---
+## What this is
+A self-contained docker-compose package that exposes the **same HTTP API as `pentatonic-memory`** (`/store`, `/search`, `/health`), plus two regression-fix endpoints (`/store-batch`, `/forget`) — but routes every call through a 7-layer hybrid retrieval engine instead of the single Postgres + pgvector store.
+Same client code. Same SDK. ~5x better accuracy on retrieval-style benchmarks.
+## Why does the existing OSS underperform?
+Detailed analysis in `docs/why-v05-underperforms.md`. Short version:
+- Single vector store (pgvector), single embedding per row → diluted vectors on long content
+- `atomBoost: +0.15` makes LLM-paraphrased atoms outrank source verbatim → substring grading fails
+- HyDE generated at ingest time (60s LLM call per /store), not at query time
+- pgvector HNSW broken at >2000 dims → 4096d NV-Embed falls back to sequential scan
+- No reranker, no graph traversal, no multi-store fusion
+## Architecture (7-layer)
+The engine is the same `sequential-hybridrag-7-layer` stack the L2 proxy reports in its health endpoint.
+```
+                                                     ┌──────────────────┐
+                                                     │  L0  BM25 (FTS)  │
+                                                     ├──────────────────┤
+                                                     │  L1  Core files  │
+                  POST /store    ┌──────────────┐   ├──────────────────┤
+                  POST /search   │ compat shim  │   │  L2  HybridRAG   │
+client (any) ───► POST /forget ──►   (FastAPI)  │──►│      orchestrator│
+                  POST /store-batch└──────────────┘   ├──────────────────┤
+                  GET  /health                       │  L3  Knowledge    │
+                                                     │      Graph (KG)   │
+                                                     ├──────────────────┤
+                                                     │  L4  sqlite-vec   │
+                                                     ├──────────────────┤
+                                                     │  L5  Qdrant comms │
+                                                     ├──────────────────┤
+                                                     │  L6  Document     │
+                                                     │      Store +      │
+                                                     │      reranker     │
+                                                     └─────────┬────────┘
+                                                               │
+                                              ┌────────────────┴───────┐
+                                              │  NV-Embed-v2            │
+                                              │  Cross-encoder reranker │
+                                              └─────────────────────────┘
+```
+Each layer indexes the same content differently. Search runs all seven in parallel and fuses results via Reciprocal Rank Fusion (RRF). Different query types win on different layers — agent-coding queries land on L0 BM25, chat-recall on L5, multi-hop entity questions on L3, conversational context on L1.
+**Layer cheat-sheet:**
+| # | Layer | Purpose | Backing tech |
+|---|---|---|---|
+| L0 | BM25 | Lexical / keyword recall | SQLite FTS5 |
+| L1 | Core files | Always-loaded high-priority text (system manuals, key docs) | flat markdown read by L2 |
+| L2 | HybridRAG orchestrator | Fan-out + RRF fusion across all layers | Python FastAPI |
+| L3 | Knowledge Graph | Entity-aware retrieval, multi-hop relationships | Neo4j (OSS) |
+| L4 | Vector index | High-recall semantic search | sqlite-vec |
+| L5 | Comms / multi-collection vectors | Chat / email / contact / memory namespaces | Qdrant |
+| L6 | Document store | Per-arena docs + cross-encoder reranker | sqlite + Milvus + MiniLM |
+## Quick start
+```bash
+git clone <this-repo>
+cd pentatonic-memory-engine
+cp .env.example .env       # set NEO4J_AUTH, etc.
+docker compose up -d
+```
+Wait ~30s for layers to come up. Verify:
+```bash
+curl http://localhost:8099/health
+# → {"status":"ok","layers":{"l0":"ok","l1":"ok","l2":"ok","l3":"ok","l4":"ok","l5":"ok","l6":"ok"},"engine":"pentatonic-memory-engine"}
+```
+Now point your existing `pentatonic-memory` SDK client at `http://localhost:8099` — no code change.
+### Picking a mode
+Both modes share the same `docker compose up -d` and the same HTTP API. Switch via one env var on the `compat` container:
+```bash
+# Fast path — L6-only, 84.6% / 110ms p50  (default)
+BYPASS_L2_PROXY=1 docker compose up -d compat
+# Max accuracy — full 7-layer L2 fusion, 85.7% / 1241ms p50
+BYPASS_L2_PROXY=0 docker compose up -d compat
+```
+| Mode | Mean acc | p50 | When to use |
+|---|---|---|---|
+| L6-only (default) | 84.6% | 110ms | Live agent calls, latency-sensitive paths |
+| 7-layer fusion | 85.7% | 1241ms | Offline batch retrieval, accuracy-graded eval, multi-hop entity queries |
+Both modes populate all 7 layers on `/store-batch` (since v0.2). The mode flag only changes which layers the **search** path queries.
+## API compatibility
+| Endpoint | v0.5 | This package | Notes |
+|---|---|---|---|
+| `POST /store` | ✅ | ✅ | Same request/response shape |
+| `POST /search` | ✅ | ✅ | Same request/response shape; ?mode=vector/text both supported |
+| `GET /health` | ✅ | ✅ | Returns aggregate health across all 7 layers |
+| `POST /store-batch` | ❌ | ✅ | New: batch-ingest N records in one HTTP call (30-50× faster) |
+| `POST /forget` | ❌ (regression) | ✅ | Restored from v0.4.x; supports `metadata_contains` filter |
+Migration: see `docs/MIGRATION.md`.

package/packages/memory-engine/bench/README.md ADDED Viewed

@@ -0,0 +1,99 @@
+# Benchmark Results
+All runs were conducted on **DGX Spark GB10** (10-core ARM CPU, 128GB unified memory, NVIDIA GB10 SoC) on **2026-04-27**.
+## Summary
+| Stack | Mean accuracy | Mean p50 latency | Coverage |
+|---|---|---|---|
+| **pentatonic-memory-engine — 7-layer fusion** | **85.7%** | 1241ms | 6/6 |
+| **pentatonic-memory-engine — L6-only fast path** | **84.6%** | 110ms | 6/6 |
+| pentatonic-memory v0.4.7 (current canonical OSS) | 38.8% | 27ms | 6/6 |
+| pentatonic-memory v0.5.6 (latest OSS) | 17.6% | 33ms | 6/6 |
+Both pentatonic-memory baselines were freshly purged before the run (no stale data pollution). Both modes of `pentatonic-memory-engine` ship in the same docker-compose package — one env var (`BYPASS_L2_PROXY`) toggles between fast path and 7-layer fusion.
+## Per-bench breakdown
+| Bench | 7-layer | L6-only | v0.4.7 | v0.5.6 |
+|---|---|---|---|---|
+| agent-coding | 100.0% (22/22) | 100.0% (22/22) | 63.6% (14/22) | 9.1% (2/22) |
+| chat-recall | 100.0% (16/16) | 100.0% (16/16) | 12.5% (2/16) | 0.0% (0/16) |
+| circular-economy | 76.0% (19/25) | 80.0% (20/25) | 40.0% (10/25) | 32.0% (8/25) |
+| customer-support | 75.0% (15/20) | 70.0% (14/20) | 25.0% (5/20) | 5.0% (1/20) |
+| marketplace-ops | 80.0% (16/20) | 80.0% (16/20) | 25.0% (5/20) | 15.0% (3/20) |
+| product-catalogue | 83.3% (15/18) | 77.8% (14/18) | 66.7% (12/18) | 44.4% (8/18) |
+| **MEAN** | **85.7%** | **84.6%** | **38.8%** | **17.6%** |
+### When does 7-layer fusion help?
+Layer-by-layer effect over L6-only:
+- **+5.6pp on product-catalogue** — KG entity matching pulls related SKUs / materials in one hop; L0 BM25 catches part numbers that vector search alone misses.
+- **+5.0pp on customer-support** — Multi-hop entity resolution (customer → order → policy) lifts retrieval where pure semantic search loses the relationship.
+- **Tied on agent-coding, chat-recall, marketplace-ops** — L6 already saturated (100%, 100%, 80%); extra layers add nothing.
+- **−4.0pp on circular-economy** — Extra layers add noise on this sustainability corpus; L6's reranker alone is the better signal.
+Net: +1.1pp accuracy at 11× latency cost. Use 7-layer for accuracy-graded eval and offline batch retrieval; stay on L6-only for live agent calls.
+## Bench corpora
+The 6 benches use commerce-domain corpora that overlap Pentatonic's actual product space:
+- `agent-coding` — 22 questions over 22 docs (TES + agent SDK source/docs)
+- `chat-recall` — 16 questions over a 16-turn chat transcript
+- `circular-economy` — 25 questions over 25 sustainability docs
+- `customer-support` — 20 questions over a 20-doc support knowledge base
+- `marketplace-ops` — 20 questions over 20 marketplace listings
+- `product-catalogue` — 18 questions over an 18-SKU product catalogue
+All grading uses **substring match**: a hit is correct if the retrieved text contains the literal answer string. This is the strictest grading mode and the closest analogue to "did the SDK return a chunk that actually answers the question."
+## Reproduce
+```bash
+# Bring up the engine
+cd pentatonic-memory-engine && docker compose up -d
+# Wait for healthy
+until curl -sf http://localhost:8099/health | grep -q '"status":"ok"'; do sleep 2; done
+# Set up the bench harness
+cd ~/pentatonic-memory-bench
+pip install -e .
+# Run the L6-only fast path (default)
+PENTATONIC_MEMORY_URL=http://localhost:8099 \
+  python -m pentatonic_bench.cli run -b chat-recall -s pentatonic-memory -k 3
+# Run the 7-layer fusion (toggle BYPASS_L2_PROXY=0 + restart compat)
+BYPASS_L2_PROXY=0 docker compose up -d --force-recreate compat
+PENTATONIC_MEMORY_URL=http://localhost:8099 \
+  python -m pentatonic_bench.cli run -b chat-recall -s pentatonic-memory -k 3
+```
+## Comparison to other open-source memory stacks
+| Stack | Mean acc | Mean p50 | Notes |
+|---|---|---|---|
+| 🥇 **pentatonic-memory-engine — 7-layer** | **85.7%** | **1241ms** | This package, full L2 fusion |
+| 🥈 **pentatonic-memory-engine — L6-only** | **84.6%** | **110ms** | This package, fast path |
+| 🥉 langmem | 83.0% | 121ms | LangChain's in-process memory; no HTTP/embedding overhead |
+| cognee | 82.1% | 192ms | Graph + vector hybrid, KG-first |
+| single-store baseline | 79.3% | 110ms | Single vector store + sentence-transformers |
+| llamaindex | 79.3% | 203ms | LlamaIndex with default config |
+| bm25-baseline | 75.9% | 0ms | Pure SQLite FTS5, no embeddings |
+| pentatonic-memory v0.4.7 | 38.8% | 27ms | Current canonical OSS |
+| graphiti | 30.1% | 156ms | Graph-only, no vector |
+| pentatonic-memory v0.5.6 | 17.6% | 33ms | Latest OSS |
+Engine beats every other OSS memory stack on accuracy in both modes. The L6-only fast path matches langmem's latency profile while delivering +1.6pp accuracy. The 7-layer mode is the genuine #1 on accuracy across all benchmarked stacks.
+## Raw scorecards
+- `scorecards-engine-via-docker/` — 6 JSON scorecards, L6-only fast path (84.6% mean / 110ms p50)
+- `scorecards-engine-via-l2-7-layer-populated/` — 6 JSON scorecards, full 7-layer fusion (85.7% mean / 1241ms p50)
+- `scorecards-engine-via-l2-empty-layers/` — earlier experiment, 7-layer with empty L0/L4-qmd/L3 (82.1%, rolled back; superseded by populated 7-layer)
+- `scorecards-engine-via-shim/` — earlier experiment, shim-direct ingestion path
+- `scorecards-engine/` — initial bench (1183ms, before L6-only optimisation)
+- `scorecards-pentatonic-baseline/` — 12 JSON scorecards (6 per stack) for the v0.4.7 and v0.5.6 baselines