npm - @pentatonic-ai/ai-agent-sdk - Versions diffs - 0.5.11 → 0.7.0 - Mend

@pentatonic-ai/ai-agent-sdk 0.5.11 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (119) hide show

package/packages/memory/src/corpus/state.js ADDED Viewed

@@ -0,0 +1,134 @@
+/**
+ * Local corpus state — what repos are tracked, content hashes per file,
+ * last sync timestamps. Lives at ~/.config/tes/corpus.json (or
+ * $XDG_CONFIG_HOME/tes/corpus.json) so it survives plugin reinstalls
+ * but stays per-developer.
+ *
+ * Schema:
+ *   {
+ *     "version": 1,
+ *     "tenant": { "clientId": "acme", "endpoint": "https://acme.api..." },
+ *     "sources": {
+ *       "/abs/path/to/repo": {
+ *         "sourceType": "git" | "directory",
+ *         "sourceUrl": "git@github.com:org/repo.git" | null,
+ *         "addedAt": "2026-04-27T12:00:00Z",
+ *         "lastSyncedAt": "2026-04-27T12:05:00Z",
+ *         "lastSyncedCommit": "abc123" | null,
+ *         "files": {
+ *           "src/index.ts": { "hash": "sha256...", "chunks": 3, "indexedAt": "..." }
+ *         },
+ *         "stats": { "fileCount": 47, "chunkCount": 132, "totalBytes": 184320 }
+ *       }
+ *     }
+ *   }
+ *
+ * Atomic writes via tmpfile + rename so partial writes can't corrupt
+ * the state file mid-sync.
+ */
+import { promises as fsp, existsSync } from "node:fs";
+import { join, dirname, resolve } from "node:path";
+import { homedir } from "node:os";
+// Schema version of the corpus state file. emptyState() stamps it on new
+// state objects; loadState() rejects files whose version is missing or
+// greater than this value.
+const STATE_VERSION = 1;
+/**
+ * Resolve the default location of the corpus state file.
+ *
+ * The config root is $XDG_CONFIG_HOME when that variable holds a
+ * non-empty value; otherwise it is ~/.config.
+ *
+ * @returns {string} Path to tes/corpus.json under the config root.
+ */
+export function defaultStatePath() {
+  const { XDG_CONFIG_HOME: xdgHome } = process.env;
+  const configRoot = xdgHome ? xdgHome : join(homedir(), ".config");
+  return join(configRoot, "tes", "corpus.json");
+}
+/**
+ * Build a fresh state object: current schema version, no tenant, and
+ * no tracked sources.
+ *
+ * @returns {{version: number, tenant: null, sources: object}} New state.
+ */
+export function emptyState() {
+  const fresh = { version: STATE_VERSION };
+  fresh.tenant = null;
+  fresh.sources = {};
+  return fresh;
+}
+/**
+ * Read and validate the corpus state file.
+ *
+ * A missing file is not an error: it yields emptyState(). The file is
+ * read directly instead of being probed with existsSync() first, so a
+ * file removed between the probe and the read cannot surface as an
+ * unexpected ENOENT.
+ *
+ * @param {string} [path=defaultStatePath()] - State file to read.
+ * @returns {Promise<object>} Parsed state; `sources` is always an object.
+ * @throws {Error} If the file holds invalid JSON, is not a JSON object,
+ *   or declares a missing/newer schema version. Other I/O errors are
+ *   rethrown unchanged.
+ */
+export async function loadState(path = defaultStatePath()) {
+  let raw;
+  try {
+    raw = await fsp.readFile(path, "utf-8");
+  } catch (err) {
+    // ENOTDIR: a parent component is a file, so the state file cannot exist.
+    if (err.code === "ENOENT" || err.code === "ENOTDIR") return emptyState();
+    throw err;
+  }
+  let parsed;
+  try {
+    parsed = JSON.parse(raw);
+  } catch (err) {
+    throw new Error(`corpus state at ${path} is corrupt JSON: ${err.message}`, {
+      cause: err,
+    });
+  }
+  // Valid JSON can still be null, an array, or a scalar; reject those
+  // explicitly rather than failing on a property access below.
+  if (parsed === null || typeof parsed !== "object" || Array.isArray(parsed)) {
+    throw new Error(`corpus state at ${path} is not a JSON object`);
+  }
+  if (!parsed.version || parsed.version > STATE_VERSION) {
+    throw new Error(
+      `corpus state at ${path} has unsupported version ${parsed.version} (we understand up to ${STATE_VERSION}). Upgrade the SDK.`
+    );
+  }
+  parsed.sources = parsed.sources || {};
+  return parsed;
+}
+/**
+ * Persist state atomically: write a sibling temp file, then rename it
+ * over the target so readers never observe a partial write.
+ *
+ * If the write or the rename fails, the temp file is removed before the
+ * error is rethrown so failed saves do not leave `*.tmp` files behind.
+ *
+ * @param {object} state - State object to serialise as pretty-printed JSON.
+ * @param {string} [path=defaultStatePath()] - Destination file.
+ * @returns {Promise<void>}
+ * @throws Any error raised by mkdir, writeFile, or rename.
+ */
+export async function saveState(state, path = defaultStatePath()) {
+  await fsp.mkdir(dirname(path), { recursive: true });
+  const tmp = `${path}.${process.pid}.tmp`;
+  try {
+    await fsp.writeFile(tmp, JSON.stringify(state, null, 2), {
+      mode: 0o600, // user-only — state may include endpoint URLs
+    });
+    await fsp.rename(tmp, path);
+  } catch (err) {
+    // Best-effort cleanup; the original failure is the error worth reporting.
+    await fsp.unlink(tmp).catch(() => {});
+    throw err;
+  }
+}
+/**
+ * Look up the tracked source for a repository path.
+ *
+ * @param {object} state - Loaded corpus state.
+ * @param {string} repoPath - Repo path; resolved to an absolute key.
+ * @returns {object|null} The source entry, or null when none is stored.
+ */
+export function getSource(state, repoPath) {
+  const key = resolve(repoPath);
+  const entry = state.sources[key];
+  return entry ? entry : null;
+}
+/**
+ * Create or update the source entry for a repository path.
+ *
+ * A path with no stored entry starts from defaults ("directory" type,
+ * empty file map, zeroed stats, addedAt = now). `patch` is then
+ * shallow-merged on top and the result replaces the stored entry.
+ *
+ * @param {object} state - Loaded corpus state; mutated in place.
+ * @param {string} repoPath - Repo path; resolved to an absolute key.
+ * @param {object} patch - Fields to overwrite on the entry.
+ * @returns {object} The merged entry now stored in state.sources.
+ */
+export function upsertSource(state, repoPath, patch) {
+  const key = resolve(repoPath);
+  let base = state.sources[key];
+  if (!base) {
+    base = {
+      sourceType: "directory",
+      sourceUrl: null,
+      addedAt: new Date().toISOString(),
+      lastSyncedAt: null,
+      lastSyncedCommit: null,
+      files: {},
+      stats: { fileCount: 0, chunkCount: 0, totalBytes: 0 },
+    };
+  }
+  const merged = { ...base, ...patch };
+  state.sources[key] = merged;
+  return merged;
+}
+/**
+ * Stop tracking a repository path.
+ *
+ * @param {object} state - Loaded corpus state; mutated in place.
+ * @param {string} repoPath - Repo path; resolved to an absolute key.
+ * @returns {boolean} true if an entry was removed, false if none existed.
+ */
+export function removeSource(state, repoPath) {
+  const key = resolve(repoPath);
+  const tracked = Boolean(state.sources[key]);
+  if (tracked) delete state.sources[key];
+  return tracked;
+}
+/**
+ * Record (or overwrite) the index entry for one file of a source.
+ *
+ * @param {object} source - Source entry whose `files` map is updated.
+ * @param {string} relPath - File path used as the key in `source.files`.
+ * @param {string} hash - Content hash to store.
+ * @param {number} chunks - Number of chunks to store.
+ * @returns {void}
+ */
+export function recordFile(source, relPath, hash, chunks) {
+  // defineProperty rather than plain assignment: assigning to a key such
+  // as "__proto__" would hit the inherited setter, replacing the map's
+  // prototype and silently dropping the entry.
+  Object.defineProperty(source.files, relPath, {
+    value: { hash, chunks, indexedAt: new Date().toISOString() },
+    writable: true,
+    enumerable: true,
+    configurable: true,
+  });
+}
+/**
+ * Drop the index entry for one file of a source.
+ *
+ * Only own keys of `source.files` count as tracked, so a path that
+ * happens to match an inherited Object.prototype member (for example
+ * "constructor" or "toString") is not reported as removed.
+ *
+ * @param {object} source - Source entry whose `files` map is updated.
+ * @param {string} relPath - File path key to remove.
+ * @returns {boolean} true if an entry existed and was removed.
+ */
+export function forgetFile(source, relPath) {
+  if (!Object.prototype.hasOwnProperty.call(source.files, relPath)) {
+    return false;
+  }
+  delete source.files[relPath];
+  return true;
+}
+/**
+ * Recalculate fileCount and chunkCount from `source.files`.
+ *
+ * Other fields already present on `source.stats` (such as totalBytes)
+ * are carried over unchanged.
+ *
+ * @param {object} source - Source entry; its `stats` is replaced.
+ * @returns {object} The new stats object.
+ */
+export function recomputeStats(source) {
+  const entries = Object.values(source.files);
+  const chunkCount = entries.reduce(
+    (sum, entry) => sum + (entry.chunks || 0),
+    0
+  );
+  source.stats = { ...source.stats, fileCount: entries.length, chunkCount };
+  return source.stats;
+}
+export { STATE_VERSION };

package/packages/memory/src/index.js CHANGED Viewed

@@ -133,3 +133,21 @@ export { decay } from "./decay.js";
 export { consolidate } from "./consolidate.js";
 export { ensureLayers, getLayers } from "./layers.js";
 export { migrate } from "./migrate.js";
+export {
+  ingestCorpus,
+  syncCorpus,
+  ingestPaths,
+  estimateCorpus,
+  discover,
+  isPathEligible,
+  chunkFile,
+  localAdapter,
+  hostedAdapter,
+  loadState,
+  saveState,
+  defaultStatePath,
+  emptyState,
+  upsertSource,
+  removeSource,
+  getSource,
+} from "./corpus/index.js";

package/packages/memory/src/ingest.js CHANGED Viewed

@@ -16,6 +16,10 @@ import { distill } from "./distill.js";
  * @param {string} [opts.userId] - Optional user ID
  * @param {string} [opts.layerType="episodic"] - Target layer
  * @param {object} [opts.metadata] - Additional metadata
+ * @param {boolean} [opts.distill=true] - Run conversation-shaped fact
+ *   extraction. Pass false for code/structured content.
+ * @param {boolean} [opts.hyde=true] - Generate hypothetical queries
+ *   (HyDE). Pass false for code/structured content.
  * @param {Function} [opts.logger] - Optional logger
  * @param {Function} [opts.waitUntil] - Platform hook to register background
  *   tasks (e.g. Cloudflare Worker ctx.waitUntil). If provided, the distill
@@ -165,18 +169,23 @@ export async function ingest(db, ai, llm, content, opts = {}) {
     log(`Embedding failed for ${memoryId}: ${err.message}`);
   }
-  // HyDE: generate hypothetical queries (non-fatal)
-  try {
-    const queries = await generateHypotheticalQueries(llm, content);
-    if (queries.length) {
-      await db(
-        `UPDATE memory_nodes SET metadata = jsonb_set(COALESCE(metadata, '{}')::jsonb, '{hypothetical_queries}', $1::jsonb), updated_at = NOW() WHERE id = $2`,
-        [JSON.stringify(queries), memoryId]
-      );
-      log(`Generated ${queries.length} hypothetical queries for ${memoryId}`);
+  // HyDE: generate hypothetical queries (non-fatal). Skippable via
+  // opts.hyde === false — corpus ingest passes this for code_reference
+  // chunks because hypothetical-question expansion against function
+  // signatures degrades retrieval and burns one LLM call per chunk.
+  if (opts.hyde !== false) {
+    try {
+      const queries = await generateHypotheticalQueries(llm, content);
+      if (queries.length) {
+        await db(
+          `UPDATE memory_nodes SET metadata = jsonb_set(COALESCE(metadata, '{}')::jsonb, '{hypothetical_queries}', $1::jsonb), updated_at = NOW() WHERE id = $2`,
+          [JSON.stringify(queries), memoryId]
+        );
+        log(`Generated ${queries.length} hypothetical queries for ${memoryId}`);
+      }
+    } catch (err) {
+      log(`HyDE failed for ${memoryId}: ${err.message}`);
     }
-  } catch (err) {
-    log(`HyDE failed for ${memoryId}: ${err.message}`);
   }
   // Distill atomic facts — only for raw ingestions (skip if this call is

package/packages/memory/src/openclaw/index.js CHANGED Viewed

@@ -36,6 +36,9 @@
  */
 import pg from "pg";
+import { existsSync, readFileSync } from "node:fs";
+import { homedir } from "node:os";
+import { join } from "node:path";
 import { createMemorySystem } from "../index.js";
 import { createContextEngine } from "./context-engine.js";
 import { sanitizeMemoryContent } from "../sanitize.js";
@@ -45,6 +48,37 @@ import {
   hostedStoreMemory as _hostedStoreMemory,
 } from "../hosted.js";
+/**
+ * Hydrate hosted-mode credentials from ~/.config/tes/credentials.json
+ * when the plugin config doesn't carry tes_endpoint/tes_api_key/
+ * tes_client_id. Lets a user who ran `tes login` get OpenClaw working
+ * without also editing openclaw.json by hand. Plugin-config values
+ * take precedence over the credentials file.
+ */
+function hydrateHostedConfig(config) {
+  // Nothing to fill in when the plugin config already has all three keys.
+  const alreadyComplete =
+    config?.tes_endpoint && config?.tes_api_key && config?.tes_client_id;
+  if (alreadyComplete) return config;
+
+  const configRoot = process.env.XDG_CONFIG_HOME || join(homedir(), ".config");
+  const credPath = join(configRoot, "tes", "credentials.json");
+  if (!existsSync(credPath)) return config;
+
+  try {
+    const creds = JSON.parse(readFileSync(credPath, "utf-8"));
+    // Only a complete credentials file is used.
+    const usable = creds?.endpoint && creds?.clientId && creds?.apiKey;
+    if (!usable) return config;
+    // Values already present in the plugin config win over the file.
+    const hydrated = { ...(config || {}) };
+    hydrated.tes_endpoint = config?.tes_endpoint || creds.endpoint;
+    hydrated.tes_client_id = config?.tes_client_id || creds.clientId;
+    hydrated.tes_api_key = config?.tes_api_key || creds.apiKey;
+    return hydrated;
+  } catch {
+    // Best-effort: an unreadable or malformed file leaves config untouched.
+    return config;
+  }
+}
 // --- Hosted-mode adapters ---
 //
 // The OpenClaw plugin predates the public hosted-helper API (`packages/
@@ -399,7 +433,11 @@ export default {
   kind: "context-engine",
   register(api) {
-    const config = api.config || {};
+    // Hydrate hosted creds from ~/.config/tes/credentials.json if the
+    // plugin config doesn't carry tes_* keys. Lets `tes login` auto-
+    // configure OpenClaw without the user editing openclaw.json by
+    // hand. Plugin-config values take precedence.
+    const config = hydrateHostedConfig(api.config || {});
     const hosted = isHostedMode(config);
     const log = (msg) =>
       process.stderr.write(`[pentatonic-memory] ${msg}\n`);

package/packages/memory/src/search.js CHANGED Viewed

@@ -29,6 +29,11 @@ const DEFAULT_WEIGHTS = {
  * @param {number} [opts.limit=20] - Max results
  * @param {number} [opts.minScore=0.5] - Minimum score threshold
  * @param {string} [opts.userId] - Optional user scope
+ * @param {string} [opts.kind] - Filter by metadata.kind exact match
+ *   (e.g. "code_reference"). When omitted, all kinds are searched.
+ *   Lets corpus ingest scope a query to code references only,
+ *   isolating them from conversational memories that share the
+ *   semantic layer.
  * @param {object} [opts.weights] - Override scoring weights
  *   (relevance, recency, frequency, atomBoost, verbosityPenalty)
  * @param {boolean} [opts.dedupeBySource=true] - When an atom matches,
@@ -76,10 +81,18 @@ export async function search(db, ai, query, opts = {}) {
   }
   const embJson = JSON.stringify(embResult.embedding);
-  const userFilter = opts.userId ? `AND mn.user_id = $5` : "";
   const params = [opts.clientId, embJson, query, limit];
-  if (opts.userId) params.push(opts.userId);
+  let nextParam = 5;
+  let userFilter = "";
+  if (opts.userId) {
+    userFilter = `AND mn.user_id = $${nextParam++}`;
+    params.push(opts.userId);
+  }
+  let kindFilter = "";
+  if (opts.kind) {
+    kindFilter = `AND mn.metadata->>'kind' = $${nextParam++}`;
+    params.push(opts.kind);
+  }
   const sql = `
     WITH max_ac AS (
@@ -143,6 +156,7 @@ export async function search(db, ai, query, opts = {}) {
       AND mn.embedding_vec IS NOT NULL
       AND vector_dims(mn.embedding_vec) = vector_dims($2::vector)
       ${userFilter}
+      ${kindFilter}
     ORDER BY final_score DESC
     LIMIT $4
   `;
@@ -203,10 +217,18 @@ export async function search(db, ai, query, opts = {}) {
  */
 export async function textSearch(db, query, opts = {}) {
   const limit = Math.min(Math.max(1, opts.limit || 20), 200);
-  const userFilter = opts.userId ? `AND mn.user_id = $4` : "";
-  const params = opts.userId
-    ? [opts.clientId, query, limit, opts.userId]
-    : [opts.clientId, query, limit];
+  const params = [opts.clientId, query, limit];
+  let nextParam = 4;
+  let userFilter = "";
+  if (opts.userId) {
+    userFilter = `AND mn.user_id = $${nextParam++}`;
+    params.push(opts.userId);
+  }
+  let kindFilter = "";
+  if (opts.kind) {
+    kindFilter = `AND mn.metadata->>'kind' = $${nextParam++}`;
+    params.push(opts.kind);
+  }
   const sql = `
     SELECT mn.* FROM memory_nodes mn
@@ -216,6 +238,7 @@ export async function textSearch(db, query, opts = {}) {
         OR mn.content ILIKE '%' || $2 || '%'
       )
       ${userFilter}
+      ${kindFilter}
     ORDER BY
       ts_rank(to_tsvector('english', mn.content), plainto_tsquery('english', $2)) DESC,
       mn.confidence DESC

package/packages/memory-engine/.env.example ADDED Viewed

@@ -0,0 +1,13 @@
+# pentatonic-memory-engine — environment overrides
+# Compat shim port (matches pentatonic-memory v0.5 default)
+PME_PORT=8099
+# Client tenant scoping
+CLIENT_ID=default
+# Neo4j auth (L3 KG)
+NEO4J_AUTH=neo4j/local-dev-pw
+# NV-Embed model (auto-downloaded from Hugging Face on first run)
+NV_EMBED_MODEL=nvidia/NV-Embed-v2

package/packages/memory-engine/README.md ADDED Viewed

@@ -0,0 +1,131 @@
+# pentatonic-memory-engine
+**Drop-in replacement for `pentatonic-memory` v0.5.x with a 7-layer retrieval stack underneath.**
+| Configuration | Mean accuracy* | p50 latency |
+|---|---|---|
+| pentatonic-memory v0.5.6 (current OSS) | 17.6% | 33ms |
+| pentatonic-memory v0.4.7 (legacy OSS) | 38.8% | 27ms |
+| **pentatonic-memory-engine — fast path** (L6-only via docker, default config) | **84.6%** | **110ms** |
+| **pentatonic-memory-engine — max accuracy** (full 7-layer L2 fusion) | **85.7%** | **1241ms** |
+| langmem (in-process) | 83.0% | 121ms |
+| cognee | 82.1% | 192ms |
+| single-store baseline | 79.3% | 110ms |
+\* Mean over 6 commerce-domain benches (agent-coding, chat-recall, circular-economy, customer-support, marketplace-ops, product-catalogue) using substring grading. Full reports under `bench/`.
+**Two configurations, same package.** The fast path (L6-only) is the default and ships at #1 on accuracy among real OSS memory stacks. The max-accuracy 7-layer mode adds Knowledge-Graph entity matching + L0 BM25 + L4 vec fusion via the L2 orchestrator — buys you +1.1pp at 11× latency. Pick per workload (live agent loop → fast path; offline batch / accuracy-graded eval → 7-layer).
+---
+## What this is
+A self-contained docker-compose package that exposes the **same HTTP API as `pentatonic-memory`** (`/store`, `/search`, `/health`), plus two regression-fix endpoints (`/store-batch`, `/forget`) — but routes every call through a 7-layer hybrid retrieval engine instead of the single Postgres + pgvector store.
+Same client code. Same SDK. ~5x better accuracy on retrieval-style benchmarks.
+## Why does the existing OSS underperform?
+Detailed analysis in `docs/why-v05-underperforms.md`. Short version:
+- Single vector store (pgvector), single embedding per row → diluted vectors on long content
+- `atomBoost: +0.15` makes LLM-paraphrased atoms outrank source verbatim → substring grading fails
+- HyDE generated at ingest time (60s LLM call per /store), not at query time
+- pgvector HNSW broken at >2000 dims → 4096d NV-Embed falls back to sequential scan
+- No reranker, no graph traversal, no multi-store fusion
+## Architecture (7-layer)
+The engine is the same `sequential-hybridrag-7-layer` stack the L2 proxy reports in its health endpoint.
+```
+                                                     ┌──────────────────┐
+                                                     │  L0  BM25 (FTS)  │
+                                                     ├──────────────────┤
+                                                     │  L1  Core files  │
+                  POST /store    ┌──────────────┐   ├──────────────────┤
+                  POST /search   │ compat shim  │   │  L2  HybridRAG   │
+client (any) ───► POST /forget ──►   (FastAPI)  │──►│      orchestrator│
+                  POST /store-batch└──────────────┘   ├──────────────────┤
+                  GET  /health                       │  L3  Knowledge    │
+                                                     │      Graph (KG)   │
+                                                     ├──────────────────┤
+                                                     │  L4  sqlite-vec   │
+                                                     ├──────────────────┤
+                                                     │  L5  Qdrant comms │
+                                                     ├──────────────────┤
+                                                     │  L6  Document     │
+                                                     │      Store +      │
+                                                     │      reranker     │
+                                                     └─────────┬────────┘
+                                                               │
+                                              ┌────────────────┴───────┐
+                                              │  NV-Embed-v2            │
+                                              │  Cross-encoder reranker │
+                                              └─────────────────────────┘
+```
+Each layer indexes the same content differently. In 7-layer mode (`BYPASS_L2_PROXY=0`), the L2 orchestrator fans each search out across the layers in parallel and fuses the results via Reciprocal Rank Fusion (RRF); the default fast path queries L6 only. Different query types win on different layers — agent-coding queries land on L0 BM25, chat-recall on L5, multi-hop entity questions on L3, conversational context on L1.
+**Layer cheat-sheet:**
+| # | Layer | Purpose | Backing tech |
+|---|---|---|---|
+| L0 | BM25 | Lexical / keyword recall | SQLite FTS5 |
+| L1 | Core files | Always-loaded high-priority text (system manuals, key docs) | flat markdown read by L2 |
+| L2 | HybridRAG orchestrator | Fan-out + RRF fusion across all layers | Python FastAPI |
+| L3 | Knowledge Graph | Entity-aware retrieval, multi-hop relationships | Neo4j (OSS) |
+| L4 | Vector index | High-recall semantic search | sqlite-vec |
+| L5 | Comms / multi-collection vectors | Chat / email / contact / memory namespaces | Qdrant |
+| L6 | Document store | Per-arena docs + cross-encoder reranker | sqlite + Milvus + MiniLM |
+## Quick start
+```bash
+git clone <this-repo>
+cd pentatonic-memory-engine
+cp .env.example .env       # set NEO4J_AUTH, etc.
+docker compose up -d
+```
+Wait ~30s for layers to come up. Verify:
+```bash
+curl http://localhost:8099/health
+# → {"status":"ok","layers":{"l0":"ok","l1":"ok","l2":"ok","l3":"ok","l4":"ok","l5":"ok","l6":"ok"},"engine":"pentatonic-memory-engine"}
+```
+Now point your existing `pentatonic-memory` SDK client at `http://localhost:8099` — no code change.
+### Picking a mode
+Both modes share the same `docker compose up -d` and the same HTTP API. Switch via one env var on the `compat` container:
+```bash
+# Fast path — L6-only, 84.6% / 110ms p50  (default)
+BYPASS_L2_PROXY=1 docker compose up -d compat
+# Max accuracy — full 7-layer L2 fusion, 85.7% / 1241ms p50
+BYPASS_L2_PROXY=0 docker compose up -d compat
+```
+| Mode | Mean acc | p50 | When to use |
+|---|---|---|---|
+| L6-only (default) | 84.6% | 110ms | Live agent calls, latency-sensitive paths |
+| 7-layer fusion | 85.7% | 1241ms | Offline batch retrieval, accuracy-graded eval, multi-hop entity queries |
+Both modes populate all 7 layers on `/store-batch` (since v0.2). The mode flag only changes which layers the **search** path queries.
+## API compatibility
+| Endpoint | v0.5 | This package | Notes |
+|---|---|---|---|
+| `POST /store` | ✅ | ✅ | Same request/response shape |
+| `POST /search` | ✅ | ✅ | Same request/response shape; ?mode=vector/text both supported |
+| `GET /health` | ✅ | ✅ | Returns aggregate health across all 7 layers |
+| `POST /store-batch` | ❌ | ✅ | New: batch-ingest N records in one HTTP call (30-50× faster) |
+| `POST /forget` | ❌ (regression) | ✅ | Restored from v0.4.x; supports `metadata_contains` filter |
+Migration: see `docs/MIGRATION.md`.

package/packages/memory-engine/bench/README.md ADDED Viewed

@@ -0,0 +1,99 @@
+# Benchmark Results
+All runs were conducted on **DGX Spark GB10** (10-core ARM CPU, 128GB unified memory, NVIDIA GB10 SoC) on **2026-04-27**.
+## Summary
+| Stack | Mean accuracy | Mean p50 latency | Coverage |
+|---|---|---|---|
+| **pentatonic-memory-engine — 7-layer fusion** | **85.7%** | 1241ms | 6/6 |
+| **pentatonic-memory-engine — L6-only fast path** | **84.6%** | 110ms | 6/6 |
+| pentatonic-memory v0.4.7 (current canonical OSS) | 38.8% | 27ms | 6/6 |
+| pentatonic-memory v0.5.6 (latest OSS) | 17.6% | 33ms | 6/6 |
+Both pentatonic-memory baselines were freshly purged before the run (no stale data pollution). Both modes of `pentatonic-memory-engine` ship in the same docker-compose package — one env var (`BYPASS_L2_PROXY`) toggles between fast path and 7-layer fusion.
+## Per-bench breakdown
+| Bench | 7-layer | L6-only | v0.4.7 | v0.5.6 |
+|---|---|---|---|---|
+| agent-coding | 100.0% (22/22) | 100.0% (22/22) | 63.6% (14/22) | 9.1% (2/22) |
+| chat-recall | 100.0% (16/16) | 100.0% (16/16) | 12.5% (2/16) | 0.0% (0/16) |
+| circular-economy | 76.0% (19/25) | 80.0% (20/25) | 40.0% (10/25) | 32.0% (8/25) |
+| customer-support | 75.0% (15/20) | 70.0% (14/20) | 25.0% (5/20) | 5.0% (1/20) |
+| marketplace-ops | 80.0% (16/20) | 80.0% (16/20) | 25.0% (5/20) | 15.0% (3/20) |
+| product-catalogue | 83.3% (15/18) | 77.8% (14/18) | 66.7% (12/18) | 44.4% (8/18) |
+| **MEAN** | **85.7%** | **84.6%** | **38.8%** | **17.6%** |
+### When does 7-layer fusion help?
+Per-bench effect of 7-layer fusion relative to L6-only:
+- **+5.6pp on product-catalogue** — KG entity matching pulls related SKUs / materials in one hop; L0 BM25 catches part numbers that vector search alone misses.
+- **+5.0pp on customer-support** — Multi-hop entity resolution (customer → order → policy) lifts retrieval where pure semantic search loses the relationship.
+- **Tied on agent-coding, chat-recall, marketplace-ops** — L6 already saturated (100%, 100%, 80%); extra layers add nothing.
+- **−4.0pp on circular-economy** — Extra layers add noise on this sustainability corpus; L6's reranker alone is the better signal.
+Net: +1.1pp accuracy at 11× latency cost. Use 7-layer for accuracy-graded eval and offline batch retrieval; stay on L6-only for live agent calls.
+## Bench corpora
+The 6 benches use commerce-domain corpora that overlap Pentatonic's actual product space:
+- `agent-coding` — 22 questions over 22 docs (TES + agent SDK source/docs)
+- `chat-recall` — 16 questions over a 16-turn chat transcript
+- `circular-economy` — 25 questions over 25 sustainability docs
+- `customer-support` — 20 questions over a 20-doc support knowledge base
+- `marketplace-ops` — 20 questions over 20 marketplace listings
+- `product-catalogue` — 18 questions over an 18-SKU product catalogue
+All grading uses **substring match**: a hit is correct if the retrieved text contains the literal answer string. This is the strictest grading mode and the closest analogue to "did the SDK return a chunk that actually answers the question."
+## Reproduce
+```bash
+# Bring up the engine
+cd pentatonic-memory-engine && docker compose up -d
+# Wait for healthy
+until curl -sf http://localhost:8099/health | grep -q '"status":"ok"'; do sleep 2; done
+# Set up the bench harness
+cd ~/pentatonic-memory-bench
+pip install -e .
+# Run the L6-only fast path (default)
+PENTATONIC_MEMORY_URL=http://localhost:8099 \
+  python -m pentatonic_bench.cli run -b chat-recall -s pentatonic-memory -k 3
+# Run the 7-layer fusion (toggle BYPASS_L2_PROXY=0 + restart compat)
+BYPASS_L2_PROXY=0 docker compose up -d --force-recreate compat
+PENTATONIC_MEMORY_URL=http://localhost:8099 \
+  python -m pentatonic_bench.cli run -b chat-recall -s pentatonic-memory -k 3
+```
+## Comparison to other open-source memory stacks
+| Stack | Mean acc | Mean p50 | Notes |
+|---|---|---|---|
+| 🥇 **pentatonic-memory-engine — 7-layer** | **85.7%** | **1241ms** | This package, full L2 fusion |
+| 🥈 **pentatonic-memory-engine — L6-only** | **84.6%** | **110ms** | This package, fast path |
+| 🥉 langmem | 83.0% | 121ms | LangChain's in-process memory; no HTTP/embedding overhead |
+| cognee | 82.1% | 192ms | Graph + vector hybrid, KG-first |
+| single-store baseline | 79.3% | 110ms | Single vector store + sentence-transformers |
+| llamaindex | 79.3% | 203ms | LlamaIndex with default config |
+| bm25-baseline | 75.9% | 0ms | Pure SQLite FTS5, no embeddings |
+| pentatonic-memory v0.4.7 | 38.8% | 27ms | Current canonical OSS |
+| graphiti | 30.1% | 156ms | Graph-only, no vector |
+| pentatonic-memory v0.5.6 | 17.6% | 33ms | Latest OSS |
+The engine beats every other benchmarked OSS memory stack on accuracy in both modes. The L6-only fast path is in the same latency range as langmem (110ms vs 121ms p50) while delivering +1.6pp accuracy. The 7-layer mode is the genuine #1 on accuracy across all benchmarked stacks.
+## Raw scorecards
+- `scorecards-engine-via-docker/` — 6 JSON scorecards, L6-only fast path (84.6% mean / 110ms p50)
+- `scorecards-engine-via-l2-7-layer-populated/` — 6 JSON scorecards, full 7-layer fusion (85.7% mean / 1241ms p50)
+- `scorecards-engine-via-l2-empty-layers/` — earlier experiment, 7-layer with empty L0/L4-qmd/L3 (82.1%, rolled back; superseded by populated 7-layer)
+- `scorecards-engine-via-shim/` — earlier experiment, shim-direct ingestion path
+- `scorecards-engine/` — initial bench (1183ms, before L6-only optimisation)
+- `scorecards-pentatonic-baseline/` — 12 JSON scorecards (6 per stack) for the v0.4.7 and v0.5.6 baselines