npm - @pentatonic-ai/ai-agent-sdk - Versions diffs - 0.9.5 → 0.9.6 - Mend

@pentatonic-ai/ai-agent-sdk 0.9.5 → 0.9.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10) hide show

package/dist/index.cjs +39 -72
package/dist/index.js +36 -69
package/package.json +9 -2
package/packages/memory/package-lock.json +49 -33
package/packages/memory/package.json +4 -1
package/packages/memory/src/__tests__/engine.test.js +40 -5
package/packages/memory/src/engine.js +38 -3
package/packages/memory-engine/docker-compose.yml +16 -1
package/packages/memory-engine/engine/services/_shared/embed_provider.py +125 -31
package/packages/memory-engine/tests/test_embed_provider.py +201 -0

package/dist/index.cjs CHANGED Viewed

@@ -17,8 +17,8 @@ var __copyProps = (to, from, except, desc) => {
 var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
 // src/index.js
-var src_exports = {};
-__export(src_exports, {
+var index_exports = {};
+__export(index_exports, {
   Session: () => Session,
   TESClient: () => TESClient,
   buildTrackUrl: () => buildTrackUrl,
@@ -27,7 +27,7 @@ __export(src_exports, {
   signPayload: () => signPayload,
   verifyPayload: () => verifyPayload
 });
-module.exports = __toCommonJS(src_exports);
+module.exports = __toCommonJS(index_exports);
 // src/normalizer.js
 function normalizeResponse(raw) {
@@ -189,8 +189,7 @@ var encoder = new TextEncoder();
 function toBase64Url(buffer) {
   const bytes = new Uint8Array(buffer);
   let binary = "";
-  for (let i = 0; i < bytes.length; i++)
-    binary += String.fromCharCode(bytes[i]);
+  for (let i = 0; i < bytes.length; i++) binary += String.fromCharCode(bytes[i]);
   return btoa(binary).replace(/\+/g, "-").replace(/\//g, "_").replace(/=+$/, "");
 }
 async function signPayload(secret, payload) {
@@ -211,27 +210,22 @@ async function verifyPayload(secret, payload, signature) {
 }
 async function buildTrackUrl(endpoint, apiKey, payload) {
   const p = { ...payload };
-  if (!p.e)
-    p.e = "LINK_CLICK";
+  if (!p.e) p.e = "LINK_CLICK";
   const encoded = toBase64Url(encoder.encode(JSON.stringify(p)));
   const sig = await signPayload(apiKey, p);
   return `${endpoint}/r/${encoded}?sig=${sig}`;
 }
 var URL_RE = /https?:\/\/[^\s"'<>)\]]+/g;
 async function rewriteUrls(text, config, sessionId, metadata) {
-  if (!text)
-    return text;
+  if (!text) return text;
   const redirectPrefix = `${config.endpoint}/r/`;
   const matches = [...text.matchAll(URL_RE)];
-  if (matches.length === 0)
-    return text;
+  if (matches.length === 0) return text;
   const replacements = /* @__PURE__ */ new Map();
   for (const m of matches) {
     const originalUrl = m[0];
-    if (originalUrl.startsWith(redirectPrefix))
-      continue;
-    if (replacements.has(originalUrl))
-      continue;
+    if (originalUrl.startsWith(redirectPrefix)) continue;
+    if (replacements.has(originalUrl)) continue;
     const payload = {
       u: originalUrl,
       s: sessionId,
@@ -254,10 +248,8 @@ async function rewriteUrls(text, config, sessionId, metadata) {
 // src/session.js
 function truncate(value, maxLen) {
-  if (!value || !maxLen || typeof value !== "string")
-    return value;
-  if (value.length <= maxLen)
-    return value;
+  if (!value || !maxLen || typeof value !== "string") return value;
+  if (value.length <= maxLen) return value;
   return value.slice(0, maxLen) + "...[truncated]";
 }
 var Session = class {
@@ -420,8 +412,7 @@ var Session = class {
 // packages/memory/src/inject.js
 var MAX_CHARS_PER_MEMORY = 1200;
 function injectMemories(body, memories, provider) {
-  if (!memories || memories.length === 0)
-    return body;
+  if (!memories || memories.length === 0) return body;
   const preamble = formatPreamble(memories);
   if (provider === "anthropic") {
     return injectAnthropic(body, preamble);
@@ -482,8 +473,7 @@ var DEFAULT_SEARCH_TIMEOUT_MS = 5e3;
 var DEFAULT_SEARCH_LIMIT = 6;
 var DEFAULT_SEARCH_MIN_SCORE = 0.55;
 function normalizeConfig(config) {
-  if (!config)
-    throw new Error("hosted: config is required");
+  if (!config) throw new Error("hosted: config is required");
   const endpoint = config.endpoint || config.tes_endpoint;
   const clientId = config.clientId || config.tes_client_id;
   const apiKey = config.apiKey || config.tes_api_key;
@@ -508,8 +498,7 @@ function buildHostedHeaders(config) {
   return headers;
 }
 async function hostedSearch(config, query, opts = {}) {
-  if (!query)
-    return { memories: [], skipped: "no_query" };
+  if (!query) return { memories: [], skipped: "no_query" };
   let cfg;
   try {
     cfg = normalizeConfig(config);
@@ -556,8 +545,7 @@ async function hostedSearch(config, query, opts = {}) {
   return { memories: payload.data?.semanticSearchMemories || [] };
 }
 function shortenReason(msg) {
-  if (typeof msg !== "string")
-    return "unknown";
+  if (typeof msg !== "string") return "unknown";
   return msg.toLowerCase().replace(/[^a-z0-9]+/g, "_").slice(0, 60);
 }
@@ -568,23 +556,19 @@ var MEMORY_DEFAULTS = {
   timeoutMs: 800
 };
 function detectClientType(client) {
-  if (client?.chat?.completions?.create)
-    return "openai";
-  if (client?.messages?.create)
-    return "anthropic";
-  if (typeof client?.run === "function")
-    return "workers-ai";
+  if (client?.chat?.completions?.create) return "openai";
+  if (client?.messages?.create) return "anthropic";
+  if (typeof client?.run === "function") return "workers-ai";
   return "unknown";
 }
 function extractLastUserMessage(params, provider) {
+  void provider;
   const msgs = Array.isArray(params?.messages) ? params.messages : null;
-  if (!msgs)
-    return null;
+  if (!msgs) return null;
   for (let i = msgs.length - 1; i >= 0; i--) {
     if (msgs[i].role === "user") {
       const c = msgs[i].content;
-      if (typeof c === "string")
-        return c;
+      if (typeof c === "string") return c;
       if (Array.isArray(c)) {
         return c.filter((p) => p.type === "text" && typeof p.text === "string").map((p) => p.text).join("\n");
       }
@@ -634,8 +618,7 @@ function wrapClient(clientConfig, client, sessionOpts = {}) {
     metadata: sessionOpts.metadata
   });
   const type = detectClientType(client);
-  if (type === "openai")
-    return wrapOpenAI(clientConfig, client, sessionOpts);
+  if (type === "openai") return wrapOpenAI(clientConfig, client, sessionOpts);
   if (type === "anthropic")
     return wrapAnthropic(clientConfig, client, sessionOpts);
   if (type === "workers-ai")
@@ -649,10 +632,8 @@ function wrapOpenAI(clientConfig, client, sessionOpts) {
     get(target, prop) {
       if (prop === "chat")
         return wrapOpenAIChat(clientConfig, target.chat, target, sessionOpts);
-      if (prop === "sessionId")
-        return sessionOpts._resolvedSessionId;
-      if (prop === "tesSession")
-        return sessionOpts._session;
+      if (prop === "sessionId") return sessionOpts._resolvedSessionId;
+      if (prop === "tesSession") return sessionOpts._session;
       if (prop === "session")
         return (opts) => new OpenAISession(clientConfig, target, opts);
       return target[prop];
@@ -729,10 +710,8 @@ function wrapAnthropic(clientConfig, client, sessionOpts) {
           target,
           sessionOpts
         );
-      if (prop === "sessionId")
-        return sessionOpts._resolvedSessionId;
-      if (prop === "tesSession")
-        return sessionOpts._session;
+      if (prop === "sessionId") return sessionOpts._resolvedSessionId;
+      if (prop === "tesSession") return sessionOpts._session;
       if (prop === "session")
         return (opts) => new AnthropicSession(clientConfig, target, opts);
       return target[prop];
@@ -819,10 +798,8 @@ function wrapWorkersAI(clientConfig, aiBinding, sessionOpts) {
           return result;
         };
       }
-      if (prop === "sessionId")
-        return sessionOpts._resolvedSessionId;
-      if (prop === "tesSession")
-        return sessionOpts._session;
+      if (prop === "sessionId") return sessionOpts._resolvedSessionId;
+      if (prop === "tesSession") return sessionOpts._session;
       if (prop === "session")
         return (opts) => new WorkersAISession(clientConfig, target, opts);
       return target[prop];
@@ -841,29 +818,24 @@ var WorkersAISession = class extends Session {
   }
 };
 function extractToolResults(session, messages) {
-  if (!messages?.length || !session._toolCalls.length)
-    return;
+  if (!messages?.length || !session._toolCalls.length) return;
   const idToName = /* @__PURE__ */ new Map();
   for (const msg of messages) {
     if (msg.role === "assistant" && msg.tool_calls) {
       for (const tc of msg.tool_calls) {
         const id = tc.id || tc.tool_call_id;
         const name = tc.function?.name || tc.name;
-        if (id && name)
-          idToName.set(id, name);
+        if (id && name) idToName.set(id, name);
       }
     }
   }
   for (const msg of messages) {
-    if (msg.role !== "tool" || !msg.content)
-      continue;
+    if (msg.role !== "tool" || !msg.content) continue;
     const callId = msg.tool_call_id;
     const toolName = callId ? idToName.get(callId) : null;
     for (const tc of session._toolCalls) {
-      if (tc.result)
-        continue;
-      if (toolName && tc.tool !== toolName)
-        continue;
+      if (tc.result) continue;
+      if (toolName && tc.tool !== toolName) continue;
       try {
         const parsed = JSON.parse(msg.content);
         if (Array.isArray(parsed)) {
@@ -906,7 +878,7 @@ function fireAndForgetEmit(clientConfig, sessionOpts, messages, result, model) {
 }
 // src/telemetry.js
-var VERSION = "0.9.5";
+var VERSION = "0.9.6";
 var TELEMETRY_URL = "https://sdk-telemetry.philip-134.workers.dev";
 function machineId() {
   const raw = typeof process !== "undefined" ? `${process.env?.USER || process.env?.USERNAME || "u"}:${process.platform || "x"}:${process.arch || "x"}` : "browser";
@@ -917,11 +889,9 @@ function machineId() {
   return (hash >>> 0).toString(16).padStart(8, "0");
 }
 function emitTelemetry(mode) {
-  if (typeof process !== "undefined" && process.env?.PENTATONIC_TELEMETRY === "0")
-    return;
+  if (typeof process !== "undefined" && process.env?.PENTATONIC_TELEMETRY === "0") return;
   const f = globalThis.fetch;
-  if (!f)
-    return;
+  if (!f) return;
   f(TELEMETRY_URL, {
     method: "POST",
     headers: { "Content-Type": "application/json" },
@@ -941,12 +911,9 @@ function emitTelemetry(mode) {
 // src/client.js
 var TESClient = class {
   constructor({ clientId, apiKey, endpoint, headers, userId, captureContent = true, maxContentLength = 4096 }) {
-    if (!clientId)
-      throw new Error("clientId is required");
-    if (!apiKey)
-      throw new Error("apiKey is required");
-    if (!endpoint)
-      throw new Error("endpoint is required");
+    if (!clientId) throw new Error("clientId is required");
+    if (!apiKey) throw new Error("apiKey is required");
+    if (!endpoint) throw new Error("endpoint is required");
     const cleanEndpoint = endpoint.replace(/\/$/, "");
     const isLocalDev = /^http:\/\/localhost(:\d+)?(\/|$)/.test(cleanEndpoint) || /^http:\/\/127\.0\.0\.1(:\d+)?(\/|$)/.test(cleanEndpoint);
     if (!cleanEndpoint.startsWith("https://") && !isLocalDev) {

package/dist/index.js CHANGED Viewed

@@ -158,8 +158,7 @@ var encoder = new TextEncoder();
 function toBase64Url(buffer) {
   const bytes = new Uint8Array(buffer);
   let binary = "";
-  for (let i = 0; i < bytes.length; i++)
-    binary += String.fromCharCode(bytes[i]);
+  for (let i = 0; i < bytes.length; i++) binary += String.fromCharCode(bytes[i]);
   return btoa(binary).replace(/\+/g, "-").replace(/\//g, "_").replace(/=+$/, "");
 }
 async function signPayload(secret, payload) {
@@ -180,27 +179,22 @@ async function verifyPayload(secret, payload, signature) {
 }
 async function buildTrackUrl(endpoint, apiKey, payload) {
   const p = { ...payload };
-  if (!p.e)
-    p.e = "LINK_CLICK";
+  if (!p.e) p.e = "LINK_CLICK";
   const encoded = toBase64Url(encoder.encode(JSON.stringify(p)));
   const sig = await signPayload(apiKey, p);
   return `${endpoint}/r/${encoded}?sig=${sig}`;
 }
 var URL_RE = /https?:\/\/[^\s"'<>)\]]+/g;
 async function rewriteUrls(text, config, sessionId, metadata) {
-  if (!text)
-    return text;
+  if (!text) return text;
   const redirectPrefix = `${config.endpoint}/r/`;
   const matches = [...text.matchAll(URL_RE)];
-  if (matches.length === 0)
-    return text;
+  if (matches.length === 0) return text;
   const replacements = /* @__PURE__ */ new Map();
   for (const m of matches) {
     const originalUrl = m[0];
-    if (originalUrl.startsWith(redirectPrefix))
-      continue;
-    if (replacements.has(originalUrl))
-      continue;
+    if (originalUrl.startsWith(redirectPrefix)) continue;
+    if (replacements.has(originalUrl)) continue;
     const payload = {
       u: originalUrl,
       s: sessionId,
@@ -223,10 +217,8 @@ async function rewriteUrls(text, config, sessionId, metadata) {
 // src/session.js
 function truncate(value, maxLen) {
-  if (!value || !maxLen || typeof value !== "string")
-    return value;
-  if (value.length <= maxLen)
-    return value;
+  if (!value || !maxLen || typeof value !== "string") return value;
+  if (value.length <= maxLen) return value;
   return value.slice(0, maxLen) + "...[truncated]";
 }
 var Session = class {
@@ -389,8 +381,7 @@ var Session = class {
 // packages/memory/src/inject.js
 var MAX_CHARS_PER_MEMORY = 1200;
 function injectMemories(body, memories, provider) {
-  if (!memories || memories.length === 0)
-    return body;
+  if (!memories || memories.length === 0) return body;
   const preamble = formatPreamble(memories);
   if (provider === "anthropic") {
     return injectAnthropic(body, preamble);
@@ -451,8 +442,7 @@ var DEFAULT_SEARCH_TIMEOUT_MS = 5e3;
 var DEFAULT_SEARCH_LIMIT = 6;
 var DEFAULT_SEARCH_MIN_SCORE = 0.55;
 function normalizeConfig(config) {
-  if (!config)
-    throw new Error("hosted: config is required");
+  if (!config) throw new Error("hosted: config is required");
   const endpoint = config.endpoint || config.tes_endpoint;
   const clientId = config.clientId || config.tes_client_id;
   const apiKey = config.apiKey || config.tes_api_key;
@@ -477,8 +467,7 @@ function buildHostedHeaders(config) {
   return headers;
 }
 async function hostedSearch(config, query, opts = {}) {
-  if (!query)
-    return { memories: [], skipped: "no_query" };
+  if (!query) return { memories: [], skipped: "no_query" };
   let cfg;
   try {
     cfg = normalizeConfig(config);
@@ -525,8 +514,7 @@ async function hostedSearch(config, query, opts = {}) {
   return { memories: payload.data?.semanticSearchMemories || [] };
 }
 function shortenReason(msg) {
-  if (typeof msg !== "string")
-    return "unknown";
+  if (typeof msg !== "string") return "unknown";
   return msg.toLowerCase().replace(/[^a-z0-9]+/g, "_").slice(0, 60);
 }
@@ -537,23 +525,19 @@ var MEMORY_DEFAULTS = {
   timeoutMs: 800
 };
 function detectClientType(client) {
-  if (client?.chat?.completions?.create)
-    return "openai";
-  if (client?.messages?.create)
-    return "anthropic";
-  if (typeof client?.run === "function")
-    return "workers-ai";
+  if (client?.chat?.completions?.create) return "openai";
+  if (client?.messages?.create) return "anthropic";
+  if (typeof client?.run === "function") return "workers-ai";
   return "unknown";
 }
 function extractLastUserMessage(params, provider) {
+  void provider;
   const msgs = Array.isArray(params?.messages) ? params.messages : null;
-  if (!msgs)
-    return null;
+  if (!msgs) return null;
   for (let i = msgs.length - 1; i >= 0; i--) {
     if (msgs[i].role === "user") {
       const c = msgs[i].content;
-      if (typeof c === "string")
-        return c;
+      if (typeof c === "string") return c;
       if (Array.isArray(c)) {
         return c.filter((p) => p.type === "text" && typeof p.text === "string").map((p) => p.text).join("\n");
       }
@@ -603,8 +587,7 @@ function wrapClient(clientConfig, client, sessionOpts = {}) {
     metadata: sessionOpts.metadata
   });
   const type = detectClientType(client);
-  if (type === "openai")
-    return wrapOpenAI(clientConfig, client, sessionOpts);
+  if (type === "openai") return wrapOpenAI(clientConfig, client, sessionOpts);
   if (type === "anthropic")
     return wrapAnthropic(clientConfig, client, sessionOpts);
   if (type === "workers-ai")
@@ -618,10 +601,8 @@ function wrapOpenAI(clientConfig, client, sessionOpts) {
     get(target, prop) {
       if (prop === "chat")
         return wrapOpenAIChat(clientConfig, target.chat, target, sessionOpts);
-      if (prop === "sessionId")
-        return sessionOpts._resolvedSessionId;
-      if (prop === "tesSession")
-        return sessionOpts._session;
+      if (prop === "sessionId") return sessionOpts._resolvedSessionId;
+      if (prop === "tesSession") return sessionOpts._session;
       if (prop === "session")
         return (opts) => new OpenAISession(clientConfig, target, opts);
       return target[prop];
@@ -698,10 +679,8 @@ function wrapAnthropic(clientConfig, client, sessionOpts) {
           target,
           sessionOpts
         );
-      if (prop === "sessionId")
-        return sessionOpts._resolvedSessionId;
-      if (prop === "tesSession")
-        return sessionOpts._session;
+      if (prop === "sessionId") return sessionOpts._resolvedSessionId;
+      if (prop === "tesSession") return sessionOpts._session;
       if (prop === "session")
         return (opts) => new AnthropicSession(clientConfig, target, opts);
       return target[prop];
@@ -788,10 +767,8 @@ function wrapWorkersAI(clientConfig, aiBinding, sessionOpts) {
           return result;
         };
       }
-      if (prop === "sessionId")
-        return sessionOpts._resolvedSessionId;
-      if (prop === "tesSession")
-        return sessionOpts._session;
+      if (prop === "sessionId") return sessionOpts._resolvedSessionId;
+      if (prop === "tesSession") return sessionOpts._session;
       if (prop === "session")
         return (opts) => new WorkersAISession(clientConfig, target, opts);
       return target[prop];
@@ -810,29 +787,24 @@ var WorkersAISession = class extends Session {
   }
 };
 function extractToolResults(session, messages) {
-  if (!messages?.length || !session._toolCalls.length)
-    return;
+  if (!messages?.length || !session._toolCalls.length) return;
   const idToName = /* @__PURE__ */ new Map();
   for (const msg of messages) {
     if (msg.role === "assistant" && msg.tool_calls) {
       for (const tc of msg.tool_calls) {
         const id = tc.id || tc.tool_call_id;
         const name = tc.function?.name || tc.name;
-        if (id && name)
-          idToName.set(id, name);
+        if (id && name) idToName.set(id, name);
       }
     }
   }
   for (const msg of messages) {
-    if (msg.role !== "tool" || !msg.content)
-      continue;
+    if (msg.role !== "tool" || !msg.content) continue;
     const callId = msg.tool_call_id;
     const toolName = callId ? idToName.get(callId) : null;
     for (const tc of session._toolCalls) {
-      if (tc.result)
-        continue;
-      if (toolName && tc.tool !== toolName)
-        continue;
+      if (tc.result) continue;
+      if (toolName && tc.tool !== toolName) continue;
       try {
         const parsed = JSON.parse(msg.content);
         if (Array.isArray(parsed)) {
@@ -875,7 +847,7 @@ function fireAndForgetEmit(clientConfig, sessionOpts, messages, result, model) {
 }
 // src/telemetry.js
-var VERSION = "0.9.5";
+var VERSION = "0.9.6";
 var TELEMETRY_URL = "https://sdk-telemetry.philip-134.workers.dev";
 function machineId() {
   const raw = typeof process !== "undefined" ? `${process.env?.USER || process.env?.USERNAME || "u"}:${process.platform || "x"}:${process.arch || "x"}` : "browser";
@@ -886,11 +858,9 @@ function machineId() {
   return (hash >>> 0).toString(16).padStart(8, "0");
 }
 function emitTelemetry(mode) {
-  if (typeof process !== "undefined" && process.env?.PENTATONIC_TELEMETRY === "0")
-    return;
+  if (typeof process !== "undefined" && process.env?.PENTATONIC_TELEMETRY === "0") return;
   const f = globalThis.fetch;
-  if (!f)
-    return;
+  if (!f) return;
   f(TELEMETRY_URL, {
     method: "POST",
     headers: { "Content-Type": "application/json" },
@@ -910,12 +880,9 @@ function emitTelemetry(mode) {
 // src/client.js
 var TESClient = class {
   constructor({ clientId, apiKey, endpoint, headers, userId, captureContent = true, maxContentLength = 4096 }) {
-    if (!clientId)
-      throw new Error("clientId is required");
-    if (!apiKey)
-      throw new Error("apiKey is required");
-    if (!endpoint)
-      throw new Error("endpoint is required");
+    if (!clientId) throw new Error("clientId is required");
+    if (!apiKey) throw new Error("apiKey is required");
+    if (!endpoint) throw new Error("endpoint is required");
     const cleanEndpoint = endpoint.replace(/\/$/, "");
     const isLocalDev = /^http:\/\/localhost(:\d+)?(\/|$)/.test(cleanEndpoint) || /^http:\/\/127\.0\.0\.1(:\d+)?(\/|$)/.test(cleanEndpoint);
     if (!cleanEndpoint.startsWith("https://") && !isLocalDev) {

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@pentatonic-ai/ai-agent-sdk",
-  "version": "0.9.5",
+  "version": "0.9.6",
   "description": "TES SDK — LLM observability and lifecycle tracking via Pentatonic Thing Event System. Track token usage, tool calls, and conversations. Manage things through event-sourced lifecycle stages with AI enrichment and vector search.",
   "type": "module",
   "main": "./dist/index.cjs",
@@ -73,11 +73,18 @@
   "dependencies": {
     "@modelcontextprotocol/sdk": "^1.27.1",
     "@pentatonic-ai/ai-agent-sdk": "^0.4.0",
-    "esbuild": "^0.20.0"
+    "esbuild": "^0.25.0"
   },
   "devDependencies": {
     "@jest/globals": "^29.7.0",
     "jest": "^29.7.0",
     "pg": "^8.20.0"
+  },
+  "overrides": {
+    "path-to-regexp": "^8.4.0",
+    "ip-address": "^10.1.1",
+    "@hono/node-server": "^1.19.13",
+    "picomatch": "^4.0.4",
+    "esbuild": "^0.25.0"
   }
 }

package/packages/memory/package-lock.json CHANGED Viewed

@@ -1,20 +1,19 @@
 {
-  "name": "@pentatonic/memory",
-  "version": "0.1.0",
+  "name": "memory",
   "lockfileVersion": 3,
   "requires": true,
   "packages": {
     "": {
-      "name": "@pentatonic/memory",
+      "name": "memory",
       "dependencies": {
         "@modelcontextprotocol/sdk": "^1.0.0",
         "pg": "^8.13.0"
       }
     },
     "node_modules/@hono/node-server": {
-      "version": "1.19.13",
-      "resolved": "https://registry.npmjs.org/@hono/node-server/-/node-server-1.19.13.tgz",
-      "integrity": "sha512-TsQLe4i2gvoTtrHje625ngThGBySOgSK3Xo2XRYOdqGN1teR8+I7vchQC46uLJi8OF62YTYA3AhSpumtkhsaKQ==",
+      "version": "1.19.14",
+      "resolved": "https://registry.npmjs.org/@hono/node-server/-/node-server-1.19.14.tgz",
+      "integrity": "sha512-GwtvgtXxnWsucXvbQXkRgqksiH2Qed37H9xHZocE5sA3N8O8O8/8FA3uclQXxXVzc9XBZuEOMK7+r02FmSpHtw==",
       "license": "MIT",
       "engines": {
         "node": ">=18.14.1"
@@ -77,9 +76,9 @@
       }
     },
     "node_modules/ajv": {
-      "version": "8.18.0",
-      "resolved": "https://registry.npmjs.org/ajv/-/ajv-8.18.0.tgz",
-      "integrity": "sha512-PlXPeEWMXMZ7sPYOHqmDyCJzcfNrUr3fGNKtezX14ykXOEIvyK81d+qydx89KY5O71FKMPaQ2vBfBFI5NHR63A==",
+      "version": "8.20.0",
+      "resolved": "https://registry.npmjs.org/ajv/-/ajv-8.20.0.tgz",
+      "integrity": "sha512-Thbli+OlOj+iMPYFBVBfJ3OmCAnaSyNn4M1vz9T6Gka5Jt9ba/HIR56joy65tY6kx/FCF5VXNB819Y7/GUrBGA==",
       "license": "MIT",
       "dependencies": {
         "fast-deep-equal": "^3.1.3",
@@ -355,9 +354,9 @@
       }
     },
     "node_modules/eventsource-parser": {
-      "version": "3.0.6",
-      "resolved": "https://registry.npmjs.org/eventsource-parser/-/eventsource-parser-3.0.6.tgz",
-      "integrity": "sha512-Vo1ab+QXPzZ4tCa8SwIHJFaSzy4R6SHf7BY79rFBDf0idraZWAkYrDjDj8uWaSm3S2TK+hJ7/t1CEmZ7jXw+pg==",
+      "version": "3.0.8",
+      "resolved": "https://registry.npmjs.org/eventsource-parser/-/eventsource-parser-3.0.8.tgz",
+      "integrity": "sha512-70QWGkr4snxr0OXLRWsFLeRBIRPuQOvt4s8QYjmUlmlkyTZkRqS7EDVRZtzU3TiyDbXSzaOeF0XUKy8PchzukQ==",
       "license": "MIT",
       "engines": {
         "node": ">=18.0.0"
@@ -407,12 +406,12 @@
       }
     },
     "node_modules/express-rate-limit": {
-      "version": "8.3.2",
-      "resolved": "https://registry.npmjs.org/express-rate-limit/-/express-rate-limit-8.3.2.tgz",
-      "integrity": "sha512-77VmFeJkO0/rvimEDuUC5H30oqUC4EyOhyGccfqoLebB0oiEYfM7nwPrsDsBL1gsTpwfzX8SFy2MT3TDyRq+bg==",
+      "version": "8.5.1",
+      "resolved": "https://registry.npmjs.org/express-rate-limit/-/express-rate-limit-8.5.1.tgz",
+      "integrity": "sha512-5O6KYmyJEpuPJV5hNTXKbAHWRqrzyu+OI3vUnSd2kXFubIVpG7ezpgxQy76Zo5GQZtrQBg86hF+CM/NX+cioiQ==",
       "license": "MIT",
       "dependencies": {
-        "ip-address": "10.1.0"
+        "ip-address": "^10.2.0"
       },
       "engines": {
         "node": ">= 16"
@@ -556,9 +555,9 @@
       }
     },
     "node_modules/hasown": {
-      "version": "2.0.2",
-      "resolved": "https://registry.npmjs.org/hasown/-/hasown-2.0.2.tgz",
-      "integrity": "sha512-0hJU9SCPvmMzIBdZFqNPXWa6dqh7WdH0cII9y+CyS8rG3nL48Bclra9HmKhVVUHyPWNH5Y7xDwAB7bfgSjkUMQ==",
+      "version": "2.0.3",
+      "resolved": "https://registry.npmjs.org/hasown/-/hasown-2.0.3.tgz",
+      "integrity": "sha512-ej4AhfhfL2Q2zpMmLo7U1Uv9+PyhIZpgQLGT1F9miIGmiCJIoCgSmczFdrc97mWT4kVY72KA+WnnhJ5pghSvSg==",
       "license": "MIT",
       "dependencies": {
         "function-bind": "^1.1.2"
@@ -619,9 +618,9 @@
       "license": "ISC"
     },
     "node_modules/ip-address": {
-      "version": "10.1.0",
-      "resolved": "https://registry.npmjs.org/ip-address/-/ip-address-10.1.0.tgz",
-      "integrity": "sha512-XXADHxXmvT9+CRxhXg56LJovE+bmWnEWB78LB83VZTprKTmaC5QfruXocxzTZ2Kl0DNwKuBdlIhjL8LeY8Sf8Q==",
+      "version": "10.2.0",
+      "resolved": "https://registry.npmjs.org/ip-address/-/ip-address-10.2.0.tgz",
+      "integrity": "sha512-/+S6j4E9AHvW9SWMSEY9Xfy66O5PWvVEJ08O0y5JGyEKQpojb0K0GKpz/v5HJ/G0vi3D2sjGK78119oXZeE0qA==",
       "license": "MIT",
       "engines": {
         "node": ">= 12"
@@ -649,9 +648,9 @@
       "license": "ISC"
     },
     "node_modules/jose": {
-      "version": "6.2.2",
-      "resolved": "https://registry.npmjs.org/jose/-/jose-6.2.2.tgz",
-      "integrity": "sha512-d7kPDd34KO/YnzaDOlikGpOurfF0ByC2sEV4cANCtdqLlTfBlw2p14O/5d/zv40gJPbIQxfES3nSx1/oYNyuZQ==",
+      "version": "6.2.3",
+      "resolved": "https://registry.npmjs.org/jose/-/jose-6.2.3.tgz",
+      "integrity": "sha512-YYVDInQKFJfR/xa3ojUTl8c2KoTwiL1R5Wg9YCydwH0x0B9grbzlg5HC7mMjCtUJjbQ/YnGEZIhI5tCgfTb4Hw==",
       "license": "MIT",
       "funding": {
         "url": "https://github.com/sponsors/panva"
@@ -1201,17 +1200,34 @@
       }
     },
     "node_modules/type-is": {
-      "version": "2.0.1",
-      "resolved": "https://registry.npmjs.org/type-is/-/type-is-2.0.1.tgz",
-      "integrity": "sha512-OZs6gsjF4vMp32qrCbiVSkrFmXtG/AZhY3t0iAMrMBiAZyV9oALtXO8hsrHbMXF9x6L3grlFuwW2oAz7cav+Gw==",
+      "version": "2.1.0",
+      "resolved": "https://registry.npmjs.org/type-is/-/type-is-2.1.0.tgz",
+      "integrity": "sha512-faYHw0anBbc/kWF3zFTEnxSFOAGUX9GFbOBthvDdLsIlEoWOFOtS0zgCiQYwIskL9iGXZL3kAXD8OoZ4GmMATA==",
       "license": "MIT",
       "dependencies": {
-        "content-type": "^1.0.5",
+        "content-type": "^2.0.0",
         "media-typer": "^1.1.0",
         "mime-types": "^3.0.0"
       },
       "engines": {
-        "node": ">= 0.6"
+        "node": ">= 18"
+      },
+      "funding": {
+        "type": "opencollective",
+        "url": "https://opencollective.com/express"
+      }
+    },
+    "node_modules/type-is/node_modules/content-type": {
+      "version": "2.0.0",
+      "resolved": "https://registry.npmjs.org/content-type/-/content-type-2.0.0.tgz",
+      "integrity": "sha512-j/O/d7GcZCyNl7/hwZAb606rzqkyvaDctLmckbxLzHvFBzTJHuGEdodATcP3yIRoDrLHkIATJuvzbFlp/ki2cQ==",
+      "license": "MIT",
+      "engines": {
+        "node": ">=18"
+      },
+      "funding": {
+        "type": "opencollective",
+        "url": "https://opencollective.com/express"
       }
     },
     "node_modules/unpipe": {
@@ -1263,9 +1279,9 @@
       }
     },
     "node_modules/zod": {
-      "version": "4.3.6",
-      "resolved": "https://registry.npmjs.org/zod/-/zod-4.3.6.tgz",
-      "integrity": "sha512-rftlrkhHZOcjDwkGlnUtZZkvaPHCsDATp4pGpuOOMDaTdDDXF91wuVDJoWoPsKX/3YPQ5fHuF3STjcYyKr+Qhg==",
+      "version": "4.4.3",
+      "resolved": "https://registry.npmjs.org/zod/-/zod-4.4.3.tgz",
+      "integrity": "sha512-ytENFjIJFl2UwYglde2jchW2Hwm4GJFLDiSXWdTrJQBIN9Fcyp7n4DhxJEiWNAJMV1/BqWfW/kkg71UDcHJyTQ==",
       "license": "MIT",
       "funding": {
         "url": "https://github.com/sponsors/colinhacks"

package/packages/memory/package.json CHANGED Viewed

@@ -1,10 +1,13 @@
 {
   "private": true,
   "name": "memory",
-  "description": "Memory subsystem — imported via @pentatonic-ai/ai-agent-sdk/memory",
+  "description": "Memory subsystem \u2014 imported via @pentatonic-ai/ai-agent-sdk/memory",
   "type": "module",
   "dependencies": {
     "@modelcontextprotocol/sdk": "^1.0.0",
     "pg": "^8.13.0"
+  },
+  "overrides": {
+    "ip-address": "^10.1.1"
   }
 }

package/packages/memory/src/__tests__/engine.test.js CHANGED Viewed

@@ -730,15 +730,22 @@ describe("engine HTTP client", () => {
   });
   describe("engineForget", () => {
-    it("forwards id when provided", async () => {
+    it("forwards id when provided (no arena composition for id-based deletes)", async () => {
       mockOk({ deleted: 1 });
       await engineForget("https://e", { clientId: "acme", id: "abc" });
       const body = JSON.parse(calls[0].init.body);
       expect(calls[0].url).toBe("https://e/forget");
-      expect(body).toEqual({ arena: "acme", id: "abc" });
+      // id-only deletes target the global record id; the engine's
+      // id path doesn't read arena scope, so we don't inject it.
+      expect(body).toEqual({ id: "abc" });
     });
-    it("forwards metadata_contains when provided", async () => {
+    it("forwards metadata_contains and injects arena INSIDE it (tenant default)", async () => {
+      // The engine reads `metadata_contains.arena` (not top-level
+      // arena) to scope a forget at L2. Pre-2026-05-14 this helper
+      // put arena at the top level, which the engine silently
+      // ignored — only L6 ever got wiped. Pinning the post-fix
+      // contract here so a regression can't sneak back in.
       mockOk({ deleted: 5 });
       await engineForget("https://e", {
         clientId: "acme",
@@ -746,11 +753,39 @@ describe("engine HTTP client", () => {
       });
       const body = JSON.parse(calls[0].init.body);
       expect(body).toEqual({
-        arena: "acme",
-        metadata_contains: { source_repo: "monorepo" },
+        metadata_contains: { arena: "acme", source_repo: "monorepo" },
+      });
+      // Top-level arena must NOT be sent — the engine ignores it and
+      // its presence would mislead anyone reading wire dumps.
+      expect(body.arena).toBeUndefined();
+    });
+    it("composes user-scoped arena when userId is supplied", async () => {
+      mockOk({ deleted: 12 });
+      await engineForget("https://e", {
+        clientId: "acme",
+        userId: "u-1",
+        metadataContains: { actor_user_id: "u-1" },
+      });
+      const body = JSON.parse(calls[0].init.body);
+      expect(body).toEqual({
+        metadata_contains: { arena: "acme:u-1", actor_user_id: "u-1" },
       });
     });
+    it("respects caller-supplied arena inside metadataContains (super-admin override)", async () => {
+      // Super-admin tooling that wipes "some other tenant's user arena"
+      // — pass the explicit arena and the SDK leaves it alone instead
+      // of recomposing from (clientId, userId).
+      mockOk({ deleted: 99 });
+      await engineForget("https://e", {
+        clientId: "tes-admin",
+        metadataContains: { arena: "victim-tenant:u-7", source: "x" },
+      });
+      const body = JSON.parse(calls[0].init.body);
+      expect(body.metadata_contains.arena).toBe("victim-tenant:u-7");
+    });
     it("requires id or metadataContains", async () => {
       await expect(
         engineForget("https://e", { clientId: "acme" })

package/packages/memory/src/engine.js CHANGED Viewed

@@ -328,9 +328,31 @@ export async function engineSearch(engineUrl, opts) {
  *
  * Caller must supply exactly one of `id` or `metadataContains`.
  *
+ * Arena scope: the engine extracts the arena from `metadata_contains.arena`
+ * (see memory-engine `compat/server.py:1048-1052`). Top-level `arena` is
+ * NOT read by the engine — previous versions of this helper put it there
+ * and the resulting calls only ever wiped L6, leaving L0/L2/L3/L4 records
+ * untouched. The 2026-05-14 Pip dedup cutover surfaced the bug: an
+ * actor_user_id wipe returned 0 against an arena that personFacets
+ * confirmed held thousands of records. This helper now injects `arena`
+ * into `metadata_contains` so the engine forwards to L2 /forget-internal
+ * and actually wipes the cross-layer arena.
+ *
+ * By default the wipe is **user-scoped** (`arena = clientId:userId`) when
+ * `userId` is supplied, otherwise **tenant-wide** (`arena = clientId`).
+ * Pass `scope: "tenant"` explicitly to bypass the user-arena scope from a
+ * user-context. Matches `engineStore`'s arena semantics for symmetry.
+ *
+ * If the caller passes `arena` inside `metadataContains` themselves, the
+ * SDK respects it as-is and skips composition — useful for super-admin
+ * tools that need to wipe an arena other than the one derived from
+ * (clientId, userId).
+ *
  * @param {string} engineUrl
  * @param {object} opts
  * @param {string} opts.clientId
+ * @param {string} [opts.userId]            user id within the tenant; controls default scope
+ * @param {"tenant"|"user"} [opts.scope]    override the default scope. "user" requires userId.
  * @param {string} [opts.id]                forget a single record by engine id
  * @param {object} [opts.metadataContains]  forget all records matching every key=value pair
  * @param {Record<string,string>} [opts.headers]  forwarded HTTP headers
@@ -338,15 +360,28 @@ export async function engineSearch(engineUrl, opts) {
  * @returns {Promise<{deleted: number}>}
  */
 export async function engineForget(engineUrl, opts) {
-  const { clientId, id, metadataContains, headers } = opts || {};
+  const { clientId, userId, scope, id, metadataContains, headers } = opts || {};
   if (!clientId) throw new Error("engineForget: clientId required");
   if (!id && !metadataContains) {
     throw new Error("engineForget: provide id or metadataContains");
   }
+  // Compose arena from (clientId, userId, scope) using the same shape
+  // engineStore uses. Caller-supplied `metadataContains.arena` wins —
+  // the SDK shouldn't second-guess a super-admin explicitly targeting
+  // a specific arena.
+  let mergedMetadata;
+  if (metadataContains) {
+    const hasExplicitArena =
+      typeof metadataContains.arena === "string" && metadataContains.arena;
+    mergedMetadata = hasExplicitArena
+      ? metadataContains
+      : { ...metadataContains, arena: composeArena(clientId, userId, scope) };
+  }
   const body = {
-    arena: clientId,
     ...(id ? { id } : {}),
-    ...(metadataContains ? { metadata_contains: metadataContains } : {}),
+    ...(mergedMetadata ? { metadata_contains: mergedMetadata } : {}),
   };
   return fetchEngine(engineUrl, "/forget", body, { headers });
 }

package/packages/memory-engine/docker-compose.yml CHANGED Viewed

@@ -72,7 +72,22 @@ services:
     environment:
       NEO4J_AUTH: ${NEO4J_AUTH:-neo4j/local-dev-pw}
       NEO4J_PLUGINS: '["apoc"]'
-      NEO4J_dbms_memory_heap_max__size: 512m
+      # Heap defaults were 512m hardcoded — fine for an empty dev
+      # graph, catastrophic at production scale. A 2026-05-14 prod
+      # incident on a ~10M-relationship KG saw L3 sit at >600% CPU
+      # locked in parallel GC, blocking the L2 write fan-out and
+      # triggering cascading 5xx through L6 and the embed gateway.
+      # The graph fit in RAM fine; the JVM just had nowhere to put
+      # short-lived allocations.
+      #
+      # Defaults now sized for a small-but-realistic local graph
+      # (~1M relationships): 1g max heap, 256m initial heap, 512m
+      # pagecache. Production deployments override via PME_L3_HEAP_MAX,
+      # PME_L3_HEAP_INITIAL and PME_L3_PAGECACHE (the AWS overlay sets
+      # 4g/1g/1g — see thing-event-system
+      # modules/pentatonic-memory/deploy/docker-compose.aws.yml).
+      NEO4J_dbms_memory_heap_max__size: ${PME_L3_HEAP_MAX:-1g}
+      NEO4J_dbms_memory_heap_initial__size: ${PME_L3_HEAP_INITIAL:-256m}
+      NEO4J_dbms_memory_pagecache_size: ${PME_L3_PAGECACHE:-512m}
     volumes:
       - pme-l3-data:/data
     healthcheck:

package/packages/memory-engine/engine/services/_shared/embed_provider.py CHANGED Viewed

@@ -212,6 +212,9 @@ class EmbedClient:
         timeout: float = 120.0,
         env_prefix: str = "",
         max_batch: int = 5,
+        max_retries: int = 3,
+        retry_base_delay: float = 0.1,
+        retry_max_delay: float = 1.0,
     ) -> None:
         self._configured_provider = provider
         self._provider = provider
@@ -229,6 +232,25 @@ class EmbedClient:
         # cap observed on Pentatonic AI Gateway — above which it 502s and the
         # caller silently loses vector writes (see test_chunking_* tests).
         self._max_batch = max(0, max_batch)
+        # Retry-with-jitter for transient gateway saturation. The
+        # Pentatonic AI Gateway has a K≈10 concurrent-request cap; when
+        # multiple chunks of a single batch (or multiple concurrent
+        # batches from different layers) saturate it, individual POSTs
+        # 502/503. The 2026-05-15 incident showed an L6 fallback path
+        # 502-rate of 96% under Pip backfill load — every shared-embed
+        # failed, every per-layer fallback also failed, the cascade
+        # cleared only when traffic dropped.
+        #
+        # Retries with full jitter absorb those transient saturations
+        # instead of letting them cascade: when many concurrent chunks all
+        # 502 at once, jittered backoff staggers their retries so the
+        # gateway recovers slot-by-slot rather than thundering-herding.
+        # Tuned via {prefix}EMBED_MAX_RETRIES (default 3); set to 0
+        # to restore pre-fix behaviour. Only 429/502/503/504 and network
+        # errors are retried — 401 and all other statuses fail fast.
+        self._max_retries = max(0, max_retries)
+        self._retry_base_delay = max(0.0, retry_base_delay)
+        self._retry_max_delay = max(self._retry_base_delay, retry_max_delay)
     # ------------------------------------------------------------------
     # Construction
@@ -268,6 +290,13 @@ class EmbedClient:
         autodetect = os.environ.get(f"{prefix}EMBED_AUTODETECT", "true").lower() == "true"
         timeout = float(os.environ.get(f"{prefix}EMBED_TIMEOUT", "120"))
         max_batch = int(os.environ.get(f"{prefix}EMBED_MAX_BATCH", "5"))
+        max_retries = int(os.environ.get(f"{prefix}EMBED_MAX_RETRIES", "3"))
+        retry_base_delay = float(
+            os.environ.get(f"{prefix}EMBED_RETRY_BASE_DELAY", "0.1")
+        )
+        retry_max_delay = float(
+            os.environ.get(f"{prefix}EMBED_RETRY_MAX_DELAY", "1.0")
+        )
         provider = resolve_provider(provider_name, env_prefix=prefix)
         return cls(
@@ -279,6 +308,9 @@ class EmbedClient:
             timeout=timeout,
             env_prefix=prefix,
             max_batch=max_batch,
+            max_retries=max_retries,
+            retry_base_delay=retry_base_delay,
+            retry_max_delay=retry_max_delay,
         )
     # ------------------------------------------------------------------
@@ -369,41 +401,103 @@ class EmbedClient:
     # Request paths
     # ------------------------------------------------------------------
+    # Status codes that indicate transient gateway capacity issues
+    # (rate-limit, upstream saturation, transient unavailability,
+    # upstream timeout). 401 + other 4xx + non-listed 5xx fail fast —
+    # they typically indicate caller or config problems where retrying
+    # won't help.
+    _RETRYABLE_STATUS = frozenset({429, 502, 503, 504})
+    def _backoff_delay(self, attempt: int) -> float:
+        """Exponential backoff with full jitter.
+        Full jitter (random.uniform(0, cap)) is preferred over equal
+        jitter for the embed gateway case: many concurrent chunks all
+        503 at the same instant, and full jitter maximally spreads
+        their retries so the gateway recovers slot-by-slot instead of
+        seeing periodic thundering herds.
+        """
+        import random
+        cap = min(self._retry_base_delay * (2 ** attempt), self._retry_max_delay)
+        return random.uniform(0, cap)
     def _post_with_autodetect(self, texts: list[str], *, async_mode: bool) -> list[list[float]]:
         del async_mode  # kept for symmetry; sync path is its own method
-        body = self._provider.body_builder(texts, self._model)
-        headers = self._headers(self._provider)
-        try:
-            r = httpx.post(self._url, json=body, headers=headers, timeout=self._timeout)
-        except httpx.HTTPError as exc:
-            raise EmbedHTTPError(0, str(exc)) from exc
-        if r.status_code == 401 and self._autodetect and not self._detected:
-            return self._autodetect_and_retry(texts, last_body=r.text)
-        if r.status_code == 401:
-            raise EmbedAuthError(r.text)
-        if not r.is_success:
-            raise EmbedHTTPError(r.status_code, r.text)
-        return self._provider.response_parser(r.json())
+        import time as _time
+        last_exc: EmbedHTTPError | None = None
+        for attempt in range(self._max_retries + 1):
+            body = self._provider.body_builder(texts, self._model)
+            headers = self._headers(self._provider)
+            try:
+                r = httpx.post(
+                    self._url, json=body, headers=headers, timeout=self._timeout
+                )
+            except httpx.HTTPError as exc:
+                # Network-level error (DNS, connect refused, timeout).
+                # Treat as retryable — transient network blips are
+                # exactly what jittered retry is designed to absorb.
+                last_exc = EmbedHTTPError(0, str(exc))
+                if attempt >= self._max_retries:
+                    raise last_exc from exc
+                _time.sleep(self._backoff_delay(attempt))
+                continue
+            if r.status_code == 401 and self._autodetect and not self._detected:
+                # Autodetect runs at most once (gated by self._detected)
+                # and tries other providers in sequence; no retry layer
+                # needed on top.
+                return self._autodetect_and_retry(texts, last_body=r.text)
+            if r.status_code == 401:
+                raise EmbedAuthError(r.text)
+            if not r.is_success:
+                if (
+                    r.status_code in self._RETRYABLE_STATUS
+                    and attempt < self._max_retries
+                ):
+                    last_exc = EmbedHTTPError(r.status_code, r.text)
+                    _time.sleep(self._backoff_delay(attempt))
+                    continue
+                raise EmbedHTTPError(r.status_code, r.text)
+            return self._provider.response_parser(r.json())
+        # Loop exited without success or raise — shouldn't happen, but
+        # keep the type checker happy.
+        assert last_exc is not None
+        raise last_exc
     async def _post_with_autodetect_async(self, texts: list[str]) -> list[list[float]]:
-        body = self._provider.body_builder(texts, self._model)
-        headers = self._headers(self._provider)
-        try:
-            async with httpx.AsyncClient(timeout=self._timeout) as client:
-                r = await client.post(self._url, json=body, headers=headers)
-        except httpx.HTTPError as exc:
-            raise EmbedHTTPError(0, str(exc)) from exc
-        if r.status_code == 401 and self._autodetect and not self._detected:
-            return await self._autodetect_and_retry_async(texts, last_body=r.text)
-        if r.status_code == 401:
-            raise EmbedAuthError(r.text)
-        if not r.is_success:
-            raise EmbedHTTPError(r.status_code, r.text)
-        return self._provider.response_parser(r.json())
+        import asyncio as _asyncio
+        last_exc: EmbedHTTPError | None = None
+        for attempt in range(self._max_retries + 1):
+            body = self._provider.body_builder(texts, self._model)
+            headers = self._headers(self._provider)
+            try:
+                async with httpx.AsyncClient(timeout=self._timeout) as client:
+                    r = await client.post(self._url, json=body, headers=headers)
+            except httpx.HTTPError as exc:
+                last_exc = EmbedHTTPError(0, str(exc))
+                if attempt >= self._max_retries:
+                    raise last_exc from exc
+                await _asyncio.sleep(self._backoff_delay(attempt))
+                continue
+            if r.status_code == 401 and self._autodetect and not self._detected:
+                return await self._autodetect_and_retry_async(texts, last_body=r.text)
+            if r.status_code == 401:
+                raise EmbedAuthError(r.text)
+            if not r.is_success:
+                if (
+                    r.status_code in self._RETRYABLE_STATUS
+                    and attempt < self._max_retries
+                ):
+                    last_exc = EmbedHTTPError(r.status_code, r.text)
+                    await _asyncio.sleep(self._backoff_delay(attempt))
+                    continue
+                raise EmbedHTTPError(r.status_code, r.text)
+            return self._provider.response_parser(r.json())
+        assert last_exc is not None
+        raise last_exc
     # ------------------------------------------------------------------
     # Auto-detect

package/packages/memory-engine/tests/test_embed_provider.py CHANGED Viewed

@@ -268,6 +268,9 @@ def test_autodetect_all_fail_raises(recorder):
 # ----------------------------------------------------------------------
 def test_non_401_http_error_does_not_trigger_autodetect(recorder):
+    # max_retries=0 isolates this test to autodetect behaviour. With
+    # retries enabled (default), 503 triggers the retry path which is
+    # exercised separately in the retry tests below.
     recorder.respond(
         "https://gw/v1/embeddings",
         _FakeResponse(503, "upstream down"),
@@ -277,6 +280,7 @@ def test_non_401_http_error_does_not_trigger_autodetect(recorder):
         api_key="k",
         model="m",
         provider=PROVIDERS["openai"],
+        max_retries=0,
     )
     with pytest.raises(EmbedHTTPError) as exc:
         client.embed_batch(["x"])
@@ -490,3 +494,200 @@ def test_from_env_default_max_batch_is_five(monkeypatch):
     client.embed_batch([f"t{i}" for i in range(10)])
     # 10 with default chunk=5 → [5, 5] → 2 calls
     assert len(stub.calls) == 2
+# ----------------------------------------------------------------------
+# Retry-with-jitter on transient gateway saturation (502/503/504/429)
+# ----------------------------------------------------------------------
+#
+# These tests exercise the retry path added 2026-05-15. Motivation:
+# the Pentatonic AI Gateway has a K≈10 concurrency cap and 502s under
+# saturation; without retry, a single 502 cascades through the engine's
+# per-layer fallback path and amplifies load instead of damping it.
+# See the prod incident note on EmbedClient.__init__ for context.
+class _SequencedRecorder:
+    """Returns a different response on each successive call.
+    The default `_Recorder` returns the same response every time, which
+    is wrong for retry tests — we need to verify "first call 502, then
+    succeed on retry". This recorder pops responses off a queue per
+    URL but never pops the last one, so the tail response repeats
+    (matching the "persistent failure" test case naturally).
+    """
+    def __init__(self):
+        self.calls: list[dict] = []
+        self.queues: dict[str, list[_FakeResponse]] = {}
+    def queue(self, url: str, responses: list[_FakeResponse]) -> None:
+        self.queues[url] = list(responses)
+    def __call__(self, url, *, json, headers, timeout):
+        self.calls.append({"url": url, "json": json})
+        q = self.queues.get(url, [])
+        if not q:
+            return _FakeResponse(401, "no responses queued")
+        # Pop unless this is the last one — keep returning the tail so
+        # "all attempts fail" tests don't need to queue N copies.
+        return q.pop(0) if len(q) > 1 else q[0]
+@pytest.fixture
+def sequenced(monkeypatch):
+    rec = _SequencedRecorder()
+    monkeypatch.setattr(httpx, "post", rec)
+    # Avoid the test taking real wall time on backoff sleeps — patch
+    # time.sleep to no-op. The jitter calculation still runs, just
+    # without the actual delay.
+    import time as _time
+    monkeypatch.setattr(_time, "sleep", lambda _s: None)
+    return rec
+def test_retries_on_502_and_succeeds(sequenced):
+    sequenced.queue(
+        "https://gw/v1/embeddings",
+        [
+            _FakeResponse(502, "bad gateway"),
+            _FakeResponse(200, {"data": [{"embedding": [0.1, 0.2]}]}),
+        ],
+    )
+    client = EmbedClient(
+        url="https://gw/v1/embeddings",
+        api_key="k",
+        model="m",
+        provider=PROVIDERS["openai"],
+        max_retries=3,
+    )
+    out = client.embed_batch(["hello"])
+    assert out == [[0.1, 0.2]]
+    # First call 502, second call 200 — exactly two attempts.
+    assert len(sequenced.calls) == 2
+def test_retries_on_503_504_429(sequenced):
+    """Each transient code triggers the retry path the same way."""
+    for code in (503, 504, 429):
+        sequenced.calls.clear()
+        sequenced.queue(
+            "https://gw/v1/embeddings",
+            [
+                _FakeResponse(code, "transient"),
+                _FakeResponse(200, {"data": [{"embedding": [0.0]}]}),
+            ],
+        )
+        client = EmbedClient(
+            url="https://gw/v1/embeddings",
+            api_key="k",
+            model="m",
+            provider=PROVIDERS["openai"],
+            max_retries=3,
+        )
+        out = client.embed_batch(["x"])
+        assert out == [[0.0]], f"retry failed for status {code}"
+        assert len(sequenced.calls) == 2, f"wrong call count for status {code}"
+def test_does_not_retry_on_500(sequenced):
+    """500 is server-side bug, not transient saturation — fail fast."""
+    sequenced.queue(
+        "https://gw/v1/embeddings",
+        [_FakeResponse(500, "internal server error")],
+    )
+    client = EmbedClient(
+        url="https://gw/v1/embeddings",
+        api_key="k",
+        model="m",
+        provider=PROVIDERS["openai"],
+        max_retries=3,
+    )
+    with pytest.raises(EmbedHTTPError) as exc:
+        client.embed_batch(["x"])
+    assert exc.value.status == 500
+    # Exactly one attempt — no retry on 500.
+    assert len(sequenced.calls) == 1
+def test_does_not_retry_on_400(sequenced):
+    """4xx (other than 401-autodetect / 429) indicates caller error."""
+    sequenced.queue(
+        "https://gw/v1/embeddings",
+        [_FakeResponse(400, "bad request")],
+    )
+    client = EmbedClient(
+        url="https://gw/v1/embeddings",
+        api_key="k",
+        model="m",
+        provider=PROVIDERS["openai"],
+        max_retries=3,
+    )
+    with pytest.raises(EmbedHTTPError) as exc:
+        client.embed_batch(["x"])
+    assert exc.value.status == 400
+    assert len(sequenced.calls) == 1
+def test_max_retries_exhausted_raises(sequenced):
+    """Persistent 502 raises after max_retries+1 attempts."""
+    sequenced.queue(
+        "https://gw/v1/embeddings",
+        [_FakeResponse(502, "still down")],
+    )
+    client = EmbedClient(
+        url="https://gw/v1/embeddings",
+        api_key="k",
+        model="m",
+        provider=PROVIDERS["openai"],
+        max_retries=3,
+    )
+    with pytest.raises(EmbedHTTPError) as exc:
+        client.embed_batch(["x"])
+    assert exc.value.status == 502
+    # max_retries=3 → 1 original + 3 retries = 4 calls total.
+    assert len(sequenced.calls) == 4
+def test_max_retries_zero_disables_retry(sequenced):
+    """Explicit opt-out preserves pre-fix behaviour for callers that
+    handle their own retry."""
+    sequenced.queue(
+        "https://gw/v1/embeddings",
+        [_FakeResponse(502, "down")],
+    )
+    client = EmbedClient(
+        url="https://gw/v1/embeddings",
+        api_key="k",
+        model="m",
+        provider=PROVIDERS["openai"],
+        max_retries=0,
+    )
+    with pytest.raises(EmbedHTTPError):
+        client.embed_batch(["x"])
+    assert len(sequenced.calls) == 1
+def test_from_env_reads_retry_config(monkeypatch):
+    """{prefix}EMBED_MAX_RETRIES + EMBED_RETRY_BASE_DELAY +
+    EMBED_RETRY_MAX_DELAY override the defaults."""
+    monkeypatch.setenv("L4_NV_EMBED_URL", "https://gw/v1/embeddings")
+    monkeypatch.setenv("L4_EMBED_API_KEY", "k")
+    monkeypatch.setenv("L4_EMBED_MAX_RETRIES", "5")
+    monkeypatch.setenv("L4_EMBED_RETRY_BASE_DELAY", "0.25")
+    monkeypatch.setenv("L4_EMBED_RETRY_MAX_DELAY", "2.5")
+    client = EmbedClient.from_env(prefix="L4_")
+    assert client._max_retries == 5
+    assert client._retry_base_delay == 0.25
+    assert client._retry_max_delay == 2.5
+def test_from_env_default_retry_config(monkeypatch):
+    """Defaults: 3 retries, 100ms base, 1s cap — tuned for K≈10
+    gateway under burst load."""
+    monkeypatch.setenv("L4_NV_EMBED_URL", "https://gw/v1/embeddings")
+    monkeypatch.setenv("L4_EMBED_API_KEY", "k")
+    client = EmbedClient.from_env(prefix="L4_")
+    assert client._max_retries == 3
+    assert client._retry_base_delay == 0.1
+    assert client._retry_max_delay == 1.0