@pentatonic-ai/ai-agent-sdk 0.9.5 → 0.9.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -17,8 +17,8 @@ var __copyProps = (to, from, except, desc) => {
17
17
  var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
18
18
 
19
19
  // src/index.js
20
- var src_exports = {};
21
- __export(src_exports, {
20
+ var index_exports = {};
21
+ __export(index_exports, {
22
22
  Session: () => Session,
23
23
  TESClient: () => TESClient,
24
24
  buildTrackUrl: () => buildTrackUrl,
@@ -27,7 +27,7 @@ __export(src_exports, {
27
27
  signPayload: () => signPayload,
28
28
  verifyPayload: () => verifyPayload
29
29
  });
30
- module.exports = __toCommonJS(src_exports);
30
+ module.exports = __toCommonJS(index_exports);
31
31
 
32
32
  // src/normalizer.js
33
33
  function normalizeResponse(raw) {
@@ -189,8 +189,7 @@ var encoder = new TextEncoder();
189
189
  function toBase64Url(buffer) {
190
190
  const bytes = new Uint8Array(buffer);
191
191
  let binary = "";
192
- for (let i = 0; i < bytes.length; i++)
193
- binary += String.fromCharCode(bytes[i]);
192
+ for (let i = 0; i < bytes.length; i++) binary += String.fromCharCode(bytes[i]);
194
193
  return btoa(binary).replace(/\+/g, "-").replace(/\//g, "_").replace(/=+$/, "");
195
194
  }
196
195
  async function signPayload(secret, payload) {
@@ -211,27 +210,22 @@ async function verifyPayload(secret, payload, signature) {
211
210
  }
212
211
  async function buildTrackUrl(endpoint, apiKey, payload) {
213
212
  const p = { ...payload };
214
- if (!p.e)
215
- p.e = "LINK_CLICK";
213
+ if (!p.e) p.e = "LINK_CLICK";
216
214
  const encoded = toBase64Url(encoder.encode(JSON.stringify(p)));
217
215
  const sig = await signPayload(apiKey, p);
218
216
  return `${endpoint}/r/${encoded}?sig=${sig}`;
219
217
  }
220
218
  var URL_RE = /https?:\/\/[^\s"'<>)\]]+/g;
221
219
  async function rewriteUrls(text, config, sessionId, metadata) {
222
- if (!text)
223
- return text;
220
+ if (!text) return text;
224
221
  const redirectPrefix = `${config.endpoint}/r/`;
225
222
  const matches = [...text.matchAll(URL_RE)];
226
- if (matches.length === 0)
227
- return text;
223
+ if (matches.length === 0) return text;
228
224
  const replacements = /* @__PURE__ */ new Map();
229
225
  for (const m of matches) {
230
226
  const originalUrl = m[0];
231
- if (originalUrl.startsWith(redirectPrefix))
232
- continue;
233
- if (replacements.has(originalUrl))
234
- continue;
227
+ if (originalUrl.startsWith(redirectPrefix)) continue;
228
+ if (replacements.has(originalUrl)) continue;
235
229
  const payload = {
236
230
  u: originalUrl,
237
231
  s: sessionId,
@@ -254,10 +248,8 @@ async function rewriteUrls(text, config, sessionId, metadata) {
254
248
 
255
249
  // src/session.js
256
250
  function truncate(value, maxLen) {
257
- if (!value || !maxLen || typeof value !== "string")
258
- return value;
259
- if (value.length <= maxLen)
260
- return value;
251
+ if (!value || !maxLen || typeof value !== "string") return value;
252
+ if (value.length <= maxLen) return value;
261
253
  return value.slice(0, maxLen) + "...[truncated]";
262
254
  }
263
255
  var Session = class {
@@ -420,8 +412,7 @@ var Session = class {
420
412
  // packages/memory/src/inject.js
421
413
  var MAX_CHARS_PER_MEMORY = 1200;
422
414
  function injectMemories(body, memories, provider) {
423
- if (!memories || memories.length === 0)
424
- return body;
415
+ if (!memories || memories.length === 0) return body;
425
416
  const preamble = formatPreamble(memories);
426
417
  if (provider === "anthropic") {
427
418
  return injectAnthropic(body, preamble);
@@ -482,8 +473,7 @@ var DEFAULT_SEARCH_TIMEOUT_MS = 5e3;
482
473
  var DEFAULT_SEARCH_LIMIT = 6;
483
474
  var DEFAULT_SEARCH_MIN_SCORE = 0.55;
484
475
  function normalizeConfig(config) {
485
- if (!config)
486
- throw new Error("hosted: config is required");
476
+ if (!config) throw new Error("hosted: config is required");
487
477
  const endpoint = config.endpoint || config.tes_endpoint;
488
478
  const clientId = config.clientId || config.tes_client_id;
489
479
  const apiKey = config.apiKey || config.tes_api_key;
@@ -508,8 +498,7 @@ function buildHostedHeaders(config) {
508
498
  return headers;
509
499
  }
510
500
  async function hostedSearch(config, query, opts = {}) {
511
- if (!query)
512
- return { memories: [], skipped: "no_query" };
501
+ if (!query) return { memories: [], skipped: "no_query" };
513
502
  let cfg;
514
503
  try {
515
504
  cfg = normalizeConfig(config);
@@ -556,8 +545,7 @@ async function hostedSearch(config, query, opts = {}) {
556
545
  return { memories: payload.data?.semanticSearchMemories || [] };
557
546
  }
558
547
  function shortenReason(msg) {
559
- if (typeof msg !== "string")
560
- return "unknown";
548
+ if (typeof msg !== "string") return "unknown";
561
549
  return msg.toLowerCase().replace(/[^a-z0-9]+/g, "_").slice(0, 60);
562
550
  }
563
551
 
@@ -568,23 +556,19 @@ var MEMORY_DEFAULTS = {
568
556
  timeoutMs: 800
569
557
  };
570
558
  function detectClientType(client) {
571
- if (client?.chat?.completions?.create)
572
- return "openai";
573
- if (client?.messages?.create)
574
- return "anthropic";
575
- if (typeof client?.run === "function")
576
- return "workers-ai";
559
+ if (client?.chat?.completions?.create) return "openai";
560
+ if (client?.messages?.create) return "anthropic";
561
+ if (typeof client?.run === "function") return "workers-ai";
577
562
  return "unknown";
578
563
  }
579
564
  function extractLastUserMessage(params, provider) {
565
+ void provider;
580
566
  const msgs = Array.isArray(params?.messages) ? params.messages : null;
581
- if (!msgs)
582
- return null;
567
+ if (!msgs) return null;
583
568
  for (let i = msgs.length - 1; i >= 0; i--) {
584
569
  if (msgs[i].role === "user") {
585
570
  const c = msgs[i].content;
586
- if (typeof c === "string")
587
- return c;
571
+ if (typeof c === "string") return c;
588
572
  if (Array.isArray(c)) {
589
573
  return c.filter((p) => p.type === "text" && typeof p.text === "string").map((p) => p.text).join("\n");
590
574
  }
@@ -634,8 +618,7 @@ function wrapClient(clientConfig, client, sessionOpts = {}) {
634
618
  metadata: sessionOpts.metadata
635
619
  });
636
620
  const type = detectClientType(client);
637
- if (type === "openai")
638
- return wrapOpenAI(clientConfig, client, sessionOpts);
621
+ if (type === "openai") return wrapOpenAI(clientConfig, client, sessionOpts);
639
622
  if (type === "anthropic")
640
623
  return wrapAnthropic(clientConfig, client, sessionOpts);
641
624
  if (type === "workers-ai")
@@ -649,10 +632,8 @@ function wrapOpenAI(clientConfig, client, sessionOpts) {
649
632
  get(target, prop) {
650
633
  if (prop === "chat")
651
634
  return wrapOpenAIChat(clientConfig, target.chat, target, sessionOpts);
652
- if (prop === "sessionId")
653
- return sessionOpts._resolvedSessionId;
654
- if (prop === "tesSession")
655
- return sessionOpts._session;
635
+ if (prop === "sessionId") return sessionOpts._resolvedSessionId;
636
+ if (prop === "tesSession") return sessionOpts._session;
656
637
  if (prop === "session")
657
638
  return (opts) => new OpenAISession(clientConfig, target, opts);
658
639
  return target[prop];
@@ -729,10 +710,8 @@ function wrapAnthropic(clientConfig, client, sessionOpts) {
729
710
  target,
730
711
  sessionOpts
731
712
  );
732
- if (prop === "sessionId")
733
- return sessionOpts._resolvedSessionId;
734
- if (prop === "tesSession")
735
- return sessionOpts._session;
713
+ if (prop === "sessionId") return sessionOpts._resolvedSessionId;
714
+ if (prop === "tesSession") return sessionOpts._session;
736
715
  if (prop === "session")
737
716
  return (opts) => new AnthropicSession(clientConfig, target, opts);
738
717
  return target[prop];
@@ -819,10 +798,8 @@ function wrapWorkersAI(clientConfig, aiBinding, sessionOpts) {
819
798
  return result;
820
799
  };
821
800
  }
822
- if (prop === "sessionId")
823
- return sessionOpts._resolvedSessionId;
824
- if (prop === "tesSession")
825
- return sessionOpts._session;
801
+ if (prop === "sessionId") return sessionOpts._resolvedSessionId;
802
+ if (prop === "tesSession") return sessionOpts._session;
826
803
  if (prop === "session")
827
804
  return (opts) => new WorkersAISession(clientConfig, target, opts);
828
805
  return target[prop];
@@ -841,29 +818,24 @@ var WorkersAISession = class extends Session {
841
818
  }
842
819
  };
843
820
  function extractToolResults(session, messages) {
844
- if (!messages?.length || !session._toolCalls.length)
845
- return;
821
+ if (!messages?.length || !session._toolCalls.length) return;
846
822
  const idToName = /* @__PURE__ */ new Map();
847
823
  for (const msg of messages) {
848
824
  if (msg.role === "assistant" && msg.tool_calls) {
849
825
  for (const tc of msg.tool_calls) {
850
826
  const id = tc.id || tc.tool_call_id;
851
827
  const name = tc.function?.name || tc.name;
852
- if (id && name)
853
- idToName.set(id, name);
828
+ if (id && name) idToName.set(id, name);
854
829
  }
855
830
  }
856
831
  }
857
832
  for (const msg of messages) {
858
- if (msg.role !== "tool" || !msg.content)
859
- continue;
833
+ if (msg.role !== "tool" || !msg.content) continue;
860
834
  const callId = msg.tool_call_id;
861
835
  const toolName = callId ? idToName.get(callId) : null;
862
836
  for (const tc of session._toolCalls) {
863
- if (tc.result)
864
- continue;
865
- if (toolName && tc.tool !== toolName)
866
- continue;
837
+ if (tc.result) continue;
838
+ if (toolName && tc.tool !== toolName) continue;
867
839
  try {
868
840
  const parsed = JSON.parse(msg.content);
869
841
  if (Array.isArray(parsed)) {
@@ -906,7 +878,7 @@ function fireAndForgetEmit(clientConfig, sessionOpts, messages, result, model) {
906
878
  }
907
879
 
908
880
  // src/telemetry.js
909
- var VERSION = "0.9.5";
881
+ var VERSION = "0.9.6";
910
882
  var TELEMETRY_URL = "https://sdk-telemetry.philip-134.workers.dev";
911
883
  function machineId() {
912
884
  const raw = typeof process !== "undefined" ? `${process.env?.USER || process.env?.USERNAME || "u"}:${process.platform || "x"}:${process.arch || "x"}` : "browser";
@@ -917,11 +889,9 @@ function machineId() {
917
889
  return (hash >>> 0).toString(16).padStart(8, "0");
918
890
  }
919
891
  function emitTelemetry(mode) {
920
- if (typeof process !== "undefined" && process.env?.PENTATONIC_TELEMETRY === "0")
921
- return;
892
+ if (typeof process !== "undefined" && process.env?.PENTATONIC_TELEMETRY === "0") return;
922
893
  const f = globalThis.fetch;
923
- if (!f)
924
- return;
894
+ if (!f) return;
925
895
  f(TELEMETRY_URL, {
926
896
  method: "POST",
927
897
  headers: { "Content-Type": "application/json" },
@@ -941,12 +911,9 @@ function emitTelemetry(mode) {
941
911
  // src/client.js
942
912
  var TESClient = class {
943
913
  constructor({ clientId, apiKey, endpoint, headers, userId, captureContent = true, maxContentLength = 4096 }) {
944
- if (!clientId)
945
- throw new Error("clientId is required");
946
- if (!apiKey)
947
- throw new Error("apiKey is required");
948
- if (!endpoint)
949
- throw new Error("endpoint is required");
914
+ if (!clientId) throw new Error("clientId is required");
915
+ if (!apiKey) throw new Error("apiKey is required");
916
+ if (!endpoint) throw new Error("endpoint is required");
950
917
  const cleanEndpoint = endpoint.replace(/\/$/, "");
951
918
  const isLocalDev = /^http:\/\/localhost(:\d+)?(\/|$)/.test(cleanEndpoint) || /^http:\/\/127\.0\.0\.1(:\d+)?(\/|$)/.test(cleanEndpoint);
952
919
  if (!cleanEndpoint.startsWith("https://") && !isLocalDev) {
package/dist/index.js CHANGED
@@ -158,8 +158,7 @@ var encoder = new TextEncoder();
158
158
  function toBase64Url(buffer) {
159
159
  const bytes = new Uint8Array(buffer);
160
160
  let binary = "";
161
- for (let i = 0; i < bytes.length; i++)
162
- binary += String.fromCharCode(bytes[i]);
161
+ for (let i = 0; i < bytes.length; i++) binary += String.fromCharCode(bytes[i]);
163
162
  return btoa(binary).replace(/\+/g, "-").replace(/\//g, "_").replace(/=+$/, "");
164
163
  }
165
164
  async function signPayload(secret, payload) {
@@ -180,27 +179,22 @@ async function verifyPayload(secret, payload, signature) {
180
179
  }
181
180
  async function buildTrackUrl(endpoint, apiKey, payload) {
182
181
  const p = { ...payload };
183
- if (!p.e)
184
- p.e = "LINK_CLICK";
182
+ if (!p.e) p.e = "LINK_CLICK";
185
183
  const encoded = toBase64Url(encoder.encode(JSON.stringify(p)));
186
184
  const sig = await signPayload(apiKey, p);
187
185
  return `${endpoint}/r/${encoded}?sig=${sig}`;
188
186
  }
189
187
  var URL_RE = /https?:\/\/[^\s"'<>)\]]+/g;
190
188
  async function rewriteUrls(text, config, sessionId, metadata) {
191
- if (!text)
192
- return text;
189
+ if (!text) return text;
193
190
  const redirectPrefix = `${config.endpoint}/r/`;
194
191
  const matches = [...text.matchAll(URL_RE)];
195
- if (matches.length === 0)
196
- return text;
192
+ if (matches.length === 0) return text;
197
193
  const replacements = /* @__PURE__ */ new Map();
198
194
  for (const m of matches) {
199
195
  const originalUrl = m[0];
200
- if (originalUrl.startsWith(redirectPrefix))
201
- continue;
202
- if (replacements.has(originalUrl))
203
- continue;
196
+ if (originalUrl.startsWith(redirectPrefix)) continue;
197
+ if (replacements.has(originalUrl)) continue;
204
198
  const payload = {
205
199
  u: originalUrl,
206
200
  s: sessionId,
@@ -223,10 +217,8 @@ async function rewriteUrls(text, config, sessionId, metadata) {
223
217
 
224
218
  // src/session.js
225
219
  function truncate(value, maxLen) {
226
- if (!value || !maxLen || typeof value !== "string")
227
- return value;
228
- if (value.length <= maxLen)
229
- return value;
220
+ if (!value || !maxLen || typeof value !== "string") return value;
221
+ if (value.length <= maxLen) return value;
230
222
  return value.slice(0, maxLen) + "...[truncated]";
231
223
  }
232
224
  var Session = class {
@@ -389,8 +381,7 @@ var Session = class {
389
381
  // packages/memory/src/inject.js
390
382
  var MAX_CHARS_PER_MEMORY = 1200;
391
383
  function injectMemories(body, memories, provider) {
392
- if (!memories || memories.length === 0)
393
- return body;
384
+ if (!memories || memories.length === 0) return body;
394
385
  const preamble = formatPreamble(memories);
395
386
  if (provider === "anthropic") {
396
387
  return injectAnthropic(body, preamble);
@@ -451,8 +442,7 @@ var DEFAULT_SEARCH_TIMEOUT_MS = 5e3;
451
442
  var DEFAULT_SEARCH_LIMIT = 6;
452
443
  var DEFAULT_SEARCH_MIN_SCORE = 0.55;
453
444
  function normalizeConfig(config) {
454
- if (!config)
455
- throw new Error("hosted: config is required");
445
+ if (!config) throw new Error("hosted: config is required");
456
446
  const endpoint = config.endpoint || config.tes_endpoint;
457
447
  const clientId = config.clientId || config.tes_client_id;
458
448
  const apiKey = config.apiKey || config.tes_api_key;
@@ -477,8 +467,7 @@ function buildHostedHeaders(config) {
477
467
  return headers;
478
468
  }
479
469
  async function hostedSearch(config, query, opts = {}) {
480
- if (!query)
481
- return { memories: [], skipped: "no_query" };
470
+ if (!query) return { memories: [], skipped: "no_query" };
482
471
  let cfg;
483
472
  try {
484
473
  cfg = normalizeConfig(config);
@@ -525,8 +514,7 @@ async function hostedSearch(config, query, opts = {}) {
525
514
  return { memories: payload.data?.semanticSearchMemories || [] };
526
515
  }
527
516
  function shortenReason(msg) {
528
- if (typeof msg !== "string")
529
- return "unknown";
517
+ if (typeof msg !== "string") return "unknown";
530
518
  return msg.toLowerCase().replace(/[^a-z0-9]+/g, "_").slice(0, 60);
531
519
  }
532
520
 
@@ -537,23 +525,19 @@ var MEMORY_DEFAULTS = {
537
525
  timeoutMs: 800
538
526
  };
539
527
  function detectClientType(client) {
540
- if (client?.chat?.completions?.create)
541
- return "openai";
542
- if (client?.messages?.create)
543
- return "anthropic";
544
- if (typeof client?.run === "function")
545
- return "workers-ai";
528
+ if (client?.chat?.completions?.create) return "openai";
529
+ if (client?.messages?.create) return "anthropic";
530
+ if (typeof client?.run === "function") return "workers-ai";
546
531
  return "unknown";
547
532
  }
548
533
  function extractLastUserMessage(params, provider) {
534
+ void provider;
549
535
  const msgs = Array.isArray(params?.messages) ? params.messages : null;
550
- if (!msgs)
551
- return null;
536
+ if (!msgs) return null;
552
537
  for (let i = msgs.length - 1; i >= 0; i--) {
553
538
  if (msgs[i].role === "user") {
554
539
  const c = msgs[i].content;
555
- if (typeof c === "string")
556
- return c;
540
+ if (typeof c === "string") return c;
557
541
  if (Array.isArray(c)) {
558
542
  return c.filter((p) => p.type === "text" && typeof p.text === "string").map((p) => p.text).join("\n");
559
543
  }
@@ -603,8 +587,7 @@ function wrapClient(clientConfig, client, sessionOpts = {}) {
603
587
  metadata: sessionOpts.metadata
604
588
  });
605
589
  const type = detectClientType(client);
606
- if (type === "openai")
607
- return wrapOpenAI(clientConfig, client, sessionOpts);
590
+ if (type === "openai") return wrapOpenAI(clientConfig, client, sessionOpts);
608
591
  if (type === "anthropic")
609
592
  return wrapAnthropic(clientConfig, client, sessionOpts);
610
593
  if (type === "workers-ai")
@@ -618,10 +601,8 @@ function wrapOpenAI(clientConfig, client, sessionOpts) {
618
601
  get(target, prop) {
619
602
  if (prop === "chat")
620
603
  return wrapOpenAIChat(clientConfig, target.chat, target, sessionOpts);
621
- if (prop === "sessionId")
622
- return sessionOpts._resolvedSessionId;
623
- if (prop === "tesSession")
624
- return sessionOpts._session;
604
+ if (prop === "sessionId") return sessionOpts._resolvedSessionId;
605
+ if (prop === "tesSession") return sessionOpts._session;
625
606
  if (prop === "session")
626
607
  return (opts) => new OpenAISession(clientConfig, target, opts);
627
608
  return target[prop];
@@ -698,10 +679,8 @@ function wrapAnthropic(clientConfig, client, sessionOpts) {
698
679
  target,
699
680
  sessionOpts
700
681
  );
701
- if (prop === "sessionId")
702
- return sessionOpts._resolvedSessionId;
703
- if (prop === "tesSession")
704
- return sessionOpts._session;
682
+ if (prop === "sessionId") return sessionOpts._resolvedSessionId;
683
+ if (prop === "tesSession") return sessionOpts._session;
705
684
  if (prop === "session")
706
685
  return (opts) => new AnthropicSession(clientConfig, target, opts);
707
686
  return target[prop];
@@ -788,10 +767,8 @@ function wrapWorkersAI(clientConfig, aiBinding, sessionOpts) {
788
767
  return result;
789
768
  };
790
769
  }
791
- if (prop === "sessionId")
792
- return sessionOpts._resolvedSessionId;
793
- if (prop === "tesSession")
794
- return sessionOpts._session;
770
+ if (prop === "sessionId") return sessionOpts._resolvedSessionId;
771
+ if (prop === "tesSession") return sessionOpts._session;
795
772
  if (prop === "session")
796
773
  return (opts) => new WorkersAISession(clientConfig, target, opts);
797
774
  return target[prop];
@@ -810,29 +787,24 @@ var WorkersAISession = class extends Session {
810
787
  }
811
788
  };
812
789
  function extractToolResults(session, messages) {
813
- if (!messages?.length || !session._toolCalls.length)
814
- return;
790
+ if (!messages?.length || !session._toolCalls.length) return;
815
791
  const idToName = /* @__PURE__ */ new Map();
816
792
  for (const msg of messages) {
817
793
  if (msg.role === "assistant" && msg.tool_calls) {
818
794
  for (const tc of msg.tool_calls) {
819
795
  const id = tc.id || tc.tool_call_id;
820
796
  const name = tc.function?.name || tc.name;
821
- if (id && name)
822
- idToName.set(id, name);
797
+ if (id && name) idToName.set(id, name);
823
798
  }
824
799
  }
825
800
  }
826
801
  for (const msg of messages) {
827
- if (msg.role !== "tool" || !msg.content)
828
- continue;
802
+ if (msg.role !== "tool" || !msg.content) continue;
829
803
  const callId = msg.tool_call_id;
830
804
  const toolName = callId ? idToName.get(callId) : null;
831
805
  for (const tc of session._toolCalls) {
832
- if (tc.result)
833
- continue;
834
- if (toolName && tc.tool !== toolName)
835
- continue;
806
+ if (tc.result) continue;
807
+ if (toolName && tc.tool !== toolName) continue;
836
808
  try {
837
809
  const parsed = JSON.parse(msg.content);
838
810
  if (Array.isArray(parsed)) {
@@ -875,7 +847,7 @@ function fireAndForgetEmit(clientConfig, sessionOpts, messages, result, model) {
875
847
  }
876
848
 
877
849
  // src/telemetry.js
878
- var VERSION = "0.9.5";
850
+ var VERSION = "0.9.6";
879
851
  var TELEMETRY_URL = "https://sdk-telemetry.philip-134.workers.dev";
880
852
  function machineId() {
881
853
  const raw = typeof process !== "undefined" ? `${process.env?.USER || process.env?.USERNAME || "u"}:${process.platform || "x"}:${process.arch || "x"}` : "browser";
@@ -886,11 +858,9 @@ function machineId() {
886
858
  return (hash >>> 0).toString(16).padStart(8, "0");
887
859
  }
888
860
  function emitTelemetry(mode) {
889
- if (typeof process !== "undefined" && process.env?.PENTATONIC_TELEMETRY === "0")
890
- return;
861
+ if (typeof process !== "undefined" && process.env?.PENTATONIC_TELEMETRY === "0") return;
891
862
  const f = globalThis.fetch;
892
- if (!f)
893
- return;
863
+ if (!f) return;
894
864
  f(TELEMETRY_URL, {
895
865
  method: "POST",
896
866
  headers: { "Content-Type": "application/json" },
@@ -910,12 +880,9 @@ function emitTelemetry(mode) {
910
880
  // src/client.js
911
881
  var TESClient = class {
912
882
  constructor({ clientId, apiKey, endpoint, headers, userId, captureContent = true, maxContentLength = 4096 }) {
913
- if (!clientId)
914
- throw new Error("clientId is required");
915
- if (!apiKey)
916
- throw new Error("apiKey is required");
917
- if (!endpoint)
918
- throw new Error("endpoint is required");
883
+ if (!clientId) throw new Error("clientId is required");
884
+ if (!apiKey) throw new Error("apiKey is required");
885
+ if (!endpoint) throw new Error("endpoint is required");
919
886
  const cleanEndpoint = endpoint.replace(/\/$/, "");
920
887
  const isLocalDev = /^http:\/\/localhost(:\d+)?(\/|$)/.test(cleanEndpoint) || /^http:\/\/127\.0\.0\.1(:\d+)?(\/|$)/.test(cleanEndpoint);
921
888
  if (!cleanEndpoint.startsWith("https://") && !isLocalDev) {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@pentatonic-ai/ai-agent-sdk",
3
- "version": "0.9.5",
3
+ "version": "0.9.6",
4
4
  "description": "TES SDK — LLM observability and lifecycle tracking via Pentatonic Thing Event System. Track token usage, tool calls, and conversations. Manage things through event-sourced lifecycle stages with AI enrichment and vector search.",
5
5
  "type": "module",
6
6
  "main": "./dist/index.cjs",
@@ -73,11 +73,18 @@
73
73
  "dependencies": {
74
74
  "@modelcontextprotocol/sdk": "^1.27.1",
75
75
  "@pentatonic-ai/ai-agent-sdk": "^0.4.0",
76
- "esbuild": "^0.20.0"
76
+ "esbuild": "^0.25.0"
77
77
  },
78
78
  "devDependencies": {
79
79
  "@jest/globals": "^29.7.0",
80
80
  "jest": "^29.7.0",
81
81
  "pg": "^8.20.0"
82
+ },
83
+ "overrides": {
84
+ "path-to-regexp": "^8.4.0",
85
+ "ip-address": "^10.1.1",
86
+ "@hono/node-server": "^1.19.13",
87
+ "picomatch": "^4.0.4",
88
+ "esbuild": "^0.25.0"
82
89
  }
83
90
  }
@@ -1,20 +1,19 @@
1
1
  {
2
- "name": "@pentatonic/memory",
3
- "version": "0.1.0",
2
+ "name": "memory",
4
3
  "lockfileVersion": 3,
5
4
  "requires": true,
6
5
  "packages": {
7
6
  "": {
8
- "name": "@pentatonic/memory",
7
+ "name": "memory",
9
8
  "dependencies": {
10
9
  "@modelcontextprotocol/sdk": "^1.0.0",
11
10
  "pg": "^8.13.0"
12
11
  }
13
12
  },
14
13
  "node_modules/@hono/node-server": {
15
- "version": "1.19.13",
16
- "resolved": "https://registry.npmjs.org/@hono/node-server/-/node-server-1.19.13.tgz",
17
- "integrity": "sha512-TsQLe4i2gvoTtrHje625ngThGBySOgSK3Xo2XRYOdqGN1teR8+I7vchQC46uLJi8OF62YTYA3AhSpumtkhsaKQ==",
14
+ "version": "1.19.14",
15
+ "resolved": "https://registry.npmjs.org/@hono/node-server/-/node-server-1.19.14.tgz",
16
+ "integrity": "sha512-GwtvgtXxnWsucXvbQXkRgqksiH2Qed37H9xHZocE5sA3N8O8O8/8FA3uclQXxXVzc9XBZuEOMK7+r02FmSpHtw==",
18
17
  "license": "MIT",
19
18
  "engines": {
20
19
  "node": ">=18.14.1"
@@ -77,9 +76,9 @@
77
76
  }
78
77
  },
79
78
  "node_modules/ajv": {
80
- "version": "8.18.0",
81
- "resolved": "https://registry.npmjs.org/ajv/-/ajv-8.18.0.tgz",
82
- "integrity": "sha512-PlXPeEWMXMZ7sPYOHqmDyCJzcfNrUr3fGNKtezX14ykXOEIvyK81d+qydx89KY5O71FKMPaQ2vBfBFI5NHR63A==",
79
+ "version": "8.20.0",
80
+ "resolved": "https://registry.npmjs.org/ajv/-/ajv-8.20.0.tgz",
81
+ "integrity": "sha512-Thbli+OlOj+iMPYFBVBfJ3OmCAnaSyNn4M1vz9T6Gka5Jt9ba/HIR56joy65tY6kx/FCF5VXNB819Y7/GUrBGA==",
83
82
  "license": "MIT",
84
83
  "dependencies": {
85
84
  "fast-deep-equal": "^3.1.3",
@@ -355,9 +354,9 @@
355
354
  }
356
355
  },
357
356
  "node_modules/eventsource-parser": {
358
- "version": "3.0.6",
359
- "resolved": "https://registry.npmjs.org/eventsource-parser/-/eventsource-parser-3.0.6.tgz",
360
- "integrity": "sha512-Vo1ab+QXPzZ4tCa8SwIHJFaSzy4R6SHf7BY79rFBDf0idraZWAkYrDjDj8uWaSm3S2TK+hJ7/t1CEmZ7jXw+pg==",
357
+ "version": "3.0.8",
358
+ "resolved": "https://registry.npmjs.org/eventsource-parser/-/eventsource-parser-3.0.8.tgz",
359
+ "integrity": "sha512-70QWGkr4snxr0OXLRWsFLeRBIRPuQOvt4s8QYjmUlmlkyTZkRqS7EDVRZtzU3TiyDbXSzaOeF0XUKy8PchzukQ==",
361
360
  "license": "MIT",
362
361
  "engines": {
363
362
  "node": ">=18.0.0"
@@ -407,12 +406,12 @@
407
406
  }
408
407
  },
409
408
  "node_modules/express-rate-limit": {
410
- "version": "8.3.2",
411
- "resolved": "https://registry.npmjs.org/express-rate-limit/-/express-rate-limit-8.3.2.tgz",
412
- "integrity": "sha512-77VmFeJkO0/rvimEDuUC5H30oqUC4EyOhyGccfqoLebB0oiEYfM7nwPrsDsBL1gsTpwfzX8SFy2MT3TDyRq+bg==",
409
+ "version": "8.5.1",
410
+ "resolved": "https://registry.npmjs.org/express-rate-limit/-/express-rate-limit-8.5.1.tgz",
411
+ "integrity": "sha512-5O6KYmyJEpuPJV5hNTXKbAHWRqrzyu+OI3vUnSd2kXFubIVpG7ezpgxQy76Zo5GQZtrQBg86hF+CM/NX+cioiQ==",
413
412
  "license": "MIT",
414
413
  "dependencies": {
415
- "ip-address": "10.1.0"
414
+ "ip-address": "^10.2.0"
416
415
  },
417
416
  "engines": {
418
417
  "node": ">= 16"
@@ -556,9 +555,9 @@
556
555
  }
557
556
  },
558
557
  "node_modules/hasown": {
559
- "version": "2.0.2",
560
- "resolved": "https://registry.npmjs.org/hasown/-/hasown-2.0.2.tgz",
561
- "integrity": "sha512-0hJU9SCPvmMzIBdZFqNPXWa6dqh7WdH0cII9y+CyS8rG3nL48Bclra9HmKhVVUHyPWNH5Y7xDwAB7bfgSjkUMQ==",
558
+ "version": "2.0.3",
559
+ "resolved": "https://registry.npmjs.org/hasown/-/hasown-2.0.3.tgz",
560
+ "integrity": "sha512-ej4AhfhfL2Q2zpMmLo7U1Uv9+PyhIZpgQLGT1F9miIGmiCJIoCgSmczFdrc97mWT4kVY72KA+WnnhJ5pghSvSg==",
562
561
  "license": "MIT",
563
562
  "dependencies": {
564
563
  "function-bind": "^1.1.2"
@@ -619,9 +618,9 @@
619
618
  "license": "ISC"
620
619
  },
621
620
  "node_modules/ip-address": {
622
- "version": "10.1.0",
623
- "resolved": "https://registry.npmjs.org/ip-address/-/ip-address-10.1.0.tgz",
624
- "integrity": "sha512-XXADHxXmvT9+CRxhXg56LJovE+bmWnEWB78LB83VZTprKTmaC5QfruXocxzTZ2Kl0DNwKuBdlIhjL8LeY8Sf8Q==",
621
+ "version": "10.2.0",
622
+ "resolved": "https://registry.npmjs.org/ip-address/-/ip-address-10.2.0.tgz",
623
+ "integrity": "sha512-/+S6j4E9AHvW9SWMSEY9Xfy66O5PWvVEJ08O0y5JGyEKQpojb0K0GKpz/v5HJ/G0vi3D2sjGK78119oXZeE0qA==",
625
624
  "license": "MIT",
626
625
  "engines": {
627
626
  "node": ">= 12"
@@ -649,9 +648,9 @@
649
648
  "license": "ISC"
650
649
  },
651
650
  "node_modules/jose": {
652
- "version": "6.2.2",
653
- "resolved": "https://registry.npmjs.org/jose/-/jose-6.2.2.tgz",
654
- "integrity": "sha512-d7kPDd34KO/YnzaDOlikGpOurfF0ByC2sEV4cANCtdqLlTfBlw2p14O/5d/zv40gJPbIQxfES3nSx1/oYNyuZQ==",
651
+ "version": "6.2.3",
652
+ "resolved": "https://registry.npmjs.org/jose/-/jose-6.2.3.tgz",
653
+ "integrity": "sha512-YYVDInQKFJfR/xa3ojUTl8c2KoTwiL1R5Wg9YCydwH0x0B9grbzlg5HC7mMjCtUJjbQ/YnGEZIhI5tCgfTb4Hw==",
655
654
  "license": "MIT",
656
655
  "funding": {
657
656
  "url": "https://github.com/sponsors/panva"
@@ -1201,17 +1200,34 @@
1201
1200
  }
1202
1201
  },
1203
1202
  "node_modules/type-is": {
1204
- "version": "2.0.1",
1205
- "resolved": "https://registry.npmjs.org/type-is/-/type-is-2.0.1.tgz",
1206
- "integrity": "sha512-OZs6gsjF4vMp32qrCbiVSkrFmXtG/AZhY3t0iAMrMBiAZyV9oALtXO8hsrHbMXF9x6L3grlFuwW2oAz7cav+Gw==",
1203
+ "version": "2.1.0",
1204
+ "resolved": "https://registry.npmjs.org/type-is/-/type-is-2.1.0.tgz",
1205
+ "integrity": "sha512-faYHw0anBbc/kWF3zFTEnxSFOAGUX9GFbOBthvDdLsIlEoWOFOtS0zgCiQYwIskL9iGXZL3kAXD8OoZ4GmMATA==",
1207
1206
  "license": "MIT",
1208
1207
  "dependencies": {
1209
- "content-type": "^1.0.5",
1208
+ "content-type": "^2.0.0",
1210
1209
  "media-typer": "^1.1.0",
1211
1210
  "mime-types": "^3.0.0"
1212
1211
  },
1213
1212
  "engines": {
1214
- "node": ">= 0.6"
1213
+ "node": ">= 18"
1214
+ },
1215
+ "funding": {
1216
+ "type": "opencollective",
1217
+ "url": "https://opencollective.com/express"
1218
+ }
1219
+ },
1220
+ "node_modules/type-is/node_modules/content-type": {
1221
+ "version": "2.0.0",
1222
+ "resolved": "https://registry.npmjs.org/content-type/-/content-type-2.0.0.tgz",
1223
+ "integrity": "sha512-j/O/d7GcZCyNl7/hwZAb606rzqkyvaDctLmckbxLzHvFBzTJHuGEdodATcP3yIRoDrLHkIATJuvzbFlp/ki2cQ==",
1224
+ "license": "MIT",
1225
+ "engines": {
1226
+ "node": ">=18"
1227
+ },
1228
+ "funding": {
1229
+ "type": "opencollective",
1230
+ "url": "https://opencollective.com/express"
1215
1231
  }
1216
1232
  },
1217
1233
  "node_modules/unpipe": {
@@ -1263,9 +1279,9 @@
1263
1279
  }
1264
1280
  },
1265
1281
  "node_modules/zod": {
1266
- "version": "4.3.6",
1267
- "resolved": "https://registry.npmjs.org/zod/-/zod-4.3.6.tgz",
1268
- "integrity": "sha512-rftlrkhHZOcjDwkGlnUtZZkvaPHCsDATp4pGpuOOMDaTdDDXF91wuVDJoWoPsKX/3YPQ5fHuF3STjcYyKr+Qhg==",
1282
+ "version": "4.4.3",
1283
+ "resolved": "https://registry.npmjs.org/zod/-/zod-4.4.3.tgz",
1284
+ "integrity": "sha512-ytENFjIJFl2UwYglde2jchW2Hwm4GJFLDiSXWdTrJQBIN9Fcyp7n4DhxJEiWNAJMV1/BqWfW/kkg71UDcHJyTQ==",
1269
1285
  "license": "MIT",
1270
1286
  "funding": {
1271
1287
  "url": "https://github.com/sponsors/colinhacks"
@@ -1,10 +1,13 @@
1
1
  {
2
2
  "private": true,
3
3
  "name": "memory",
4
- "description": "Memory subsystem imported via @pentatonic-ai/ai-agent-sdk/memory",
4
+ "description": "Memory subsystem \u2014 imported via @pentatonic-ai/ai-agent-sdk/memory",
5
5
  "type": "module",
6
6
  "dependencies": {
7
7
  "@modelcontextprotocol/sdk": "^1.0.0",
8
8
  "pg": "^8.13.0"
9
+ },
10
+ "overrides": {
11
+ "ip-address": "^10.1.1"
9
12
  }
10
13
  }
@@ -730,15 +730,22 @@ describe("engine HTTP client", () => {
730
730
  });
731
731
 
732
732
  describe("engineForget", () => {
733
- it("forwards id when provided", async () => {
733
+ it("forwards id when provided (no arena composition for id-based deletes)", async () => {
734
734
  mockOk({ deleted: 1 });
735
735
  await engineForget("https://e", { clientId: "acme", id: "abc" });
736
736
  const body = JSON.parse(calls[0].init.body);
737
737
  expect(calls[0].url).toBe("https://e/forget");
738
- expect(body).toEqual({ arena: "acme", id: "abc" });
738
+ // id-only deletes target the global record id; the engine's
739
+ // id path doesn't read arena scope, so we don't inject it.
740
+ expect(body).toEqual({ id: "abc" });
739
741
  });
740
742
 
741
- it("forwards metadata_contains when provided", async () => {
743
+ it("forwards metadata_contains and injects arena INSIDE it (tenant default)", async () => {
744
+ // The engine reads `metadata_contains.arena` (not top-level
745
+ // arena) to scope a forget at L2. Pre-2026-05-14 this helper
746
+ // put arena at the top level, which the engine silently
747
+ // ignored — only L6 ever got wiped. Pinning the post-fix
748
+ // contract here so a regression can't sneak back in.
742
749
  mockOk({ deleted: 5 });
743
750
  await engineForget("https://e", {
744
751
  clientId: "acme",
@@ -746,11 +753,39 @@ describe("engine HTTP client", () => {
746
753
  });
747
754
  const body = JSON.parse(calls[0].init.body);
748
755
  expect(body).toEqual({
749
- arena: "acme",
750
- metadata_contains: { source_repo: "monorepo" },
756
+ metadata_contains: { arena: "acme", source_repo: "monorepo" },
757
+ });
758
+ // Top-level arena must NOT be sent — the engine ignores it and
759
+ // its presence would mislead anyone reading wire dumps.
760
+ expect(body.arena).toBeUndefined();
761
+ });
762
+
763
+ it("composes user-scoped arena when userId is supplied", async () => {
764
+ mockOk({ deleted: 12 });
765
+ await engineForget("https://e", {
766
+ clientId: "acme",
767
+ userId: "u-1",
768
+ metadataContains: { actor_user_id: "u-1" },
769
+ });
770
+ const body = JSON.parse(calls[0].init.body);
771
+ expect(body).toEqual({
772
+ metadata_contains: { arena: "acme:u-1", actor_user_id: "u-1" },
751
773
  });
752
774
  });
753
775
 
776
+ it("respects caller-supplied arena inside metadataContains (super-admin override)", async () => {
777
+ // Super-admin tooling that wipes "some other tenant's user arena"
778
+ // — pass the explicit arena and the SDK leaves it alone instead
779
+ // of recomposing from (clientId, userId).
780
+ mockOk({ deleted: 99 });
781
+ await engineForget("https://e", {
782
+ clientId: "tes-admin",
783
+ metadataContains: { arena: "victim-tenant:u-7", source: "x" },
784
+ });
785
+ const body = JSON.parse(calls[0].init.body);
786
+ expect(body.metadata_contains.arena).toBe("victim-tenant:u-7");
787
+ });
788
+
754
789
  it("requires id or metadataContains", async () => {
755
790
  await expect(
756
791
  engineForget("https://e", { clientId: "acme" })
@@ -328,9 +328,31 @@ export async function engineSearch(engineUrl, opts) {
328
328
  *
329
329
  * Caller must supply exactly one of `id` or `metadataContains`.
330
330
  *
331
+ * Arena scope: the engine extracts the arena from `metadata_contains.arena`
332
+ * (see memory-engine `compat/server.py:1048-1052`). Top-level `arena` is
333
+ * NOT read by the engine — previous versions of this helper put it there
334
+ * and the resulting calls only ever wiped L6, leaving L0/L2/L3/L4 records
335
+ * untouched. The 2026-05-14 Pip dedup cutover surfaced the bug: an
336
+ * actor_user_id wipe returned 0 against an arena that personFacets
337
+ * confirmed held thousands of records. This helper now injects `arena`
338
+ * into `metadata_contains` so the engine forwards to L2 /forget-internal
339
+ * and actually wipes the cross-layer arena.
340
+ *
341
+ * By default the forget is **user-scoped** (`arena = clientId:userId`) when
342
+ * `userId` is supplied, otherwise **tenant-wide** (`arena = clientId`).
343
+ * Pass `scope: "tenant"` explicitly to bypass the user-arena scope from a
344
+ * user-context. Matches `engineStore`'s arena semantics for symmetry.
345
+ *
346
+ * If the caller passes `arena` inside `metadataContains` themselves, the
347
+ * SDK respects it as-is and skips composition — useful for super-admin
348
+ * tools that need to wipe an arena other than the one derived from
349
+ * (clientId, userId).
350
+ *
331
351
  * @param {string} engineUrl
332
352
  * @param {object} opts
333
353
  * @param {string} opts.clientId
354
+ * @param {string} [opts.userId] user id within the tenant; controls default scope
355
+ * @param {"tenant"|"user"} [opts.scope] override the default scope. "user" requires userId.
334
356
  * @param {string} [opts.id] forget a single record by engine id
335
357
  * @param {object} [opts.metadataContains] forget all records matching every key=value pair
336
358
  * @param {Record<string,string>} [opts.headers] forwarded HTTP headers
@@ -338,15 +360,28 @@ export async function engineSearch(engineUrl, opts) {
338
360
  * @returns {Promise<{deleted: number}>}
339
361
  */
340
362
  export async function engineForget(engineUrl, opts) {
341
- const { clientId, id, metadataContains, headers } = opts || {};
363
+ const { clientId, userId, scope, id, metadataContains, headers } = opts || {};
342
364
  if (!clientId) throw new Error("engineForget: clientId required");
343
365
  if (!id && !metadataContains) {
344
366
  throw new Error("engineForget: provide id or metadataContains");
345
367
  }
368
+
369
+ // Compose arena from (clientId, userId, scope) using the same shape
370
+ // engineStore uses. Caller-supplied `metadataContains.arena` wins —
371
+ // the SDK shouldn't second-guess a super-admin explicitly targeting
372
+ // a specific arena.
373
+ let mergedMetadata;
374
+ if (metadataContains) {
375
+ const hasExplicitArena =
376
+ typeof metadataContains.arena === "string" && metadataContains.arena;
377
+ mergedMetadata = hasExplicitArena
378
+ ? metadataContains
379
+ : { ...metadataContains, arena: composeArena(clientId, userId, scope) };
380
+ }
381
+
346
382
  const body = {
347
- arena: clientId,
348
383
  ...(id ? { id } : {}),
349
- ...(metadataContains ? { metadata_contains: metadataContains } : {}),
384
+ ...(mergedMetadata ? { metadata_contains: mergedMetadata } : {}),
350
385
  };
351
386
  return fetchEngine(engineUrl, "/forget", body, { headers });
352
387
  }
@@ -72,7 +72,22 @@ services:
72
72
  environment:
73
73
  NEO4J_AUTH: ${NEO4J_AUTH:-neo4j/local-dev-pw}
74
74
  NEO4J_PLUGINS: '["apoc"]'
75
- NEO4J_dbms_memory_heap_max__size: 512m
75
+ # Heap defaults were 512m hardcoded — fine for an empty dev
76
+ # graph, catastrophic at production scale. A 2026-05-14 prod
77
+ # incident on a ~10M-relationship KG saw L3 sit at >600% CPU
78
+ # locked in parallel GC, blocking the L2 write fan-out and
79
+ # triggering cascading 5xx through L6 and the embed gateway.
80
+ # The graph fit in RAM fine; the JVM just had nowhere to put
81
+ # short-lived allocations.
82
+ #
83
+ # Defaults now sized for a small-but-realistic local graph
84
+ # (~1M relationships): 1g heap + 256m initial + 512m pagecache.
85
+ # Production deployments override via PME_L3_HEAP_MAX etc.
86
+ # (the AWS overlay sets 4g/1g/1g — see thing-event-system
87
+ # modules/pentatonic-memory/deploy/docker-compose.aws.yml).
88
+ NEO4J_dbms_memory_heap_max__size: ${PME_L3_HEAP_MAX:-1g}
89
+ NEO4J_dbms_memory_heap_initial__size: ${PME_L3_HEAP_INITIAL:-256m}
90
+ NEO4J_dbms_memory_pagecache_size: ${PME_L3_PAGECACHE:-512m}
76
91
  volumes:
77
92
  - pme-l3-data:/data
78
93
  healthcheck:
@@ -212,6 +212,9 @@ class EmbedClient:
212
212
  timeout: float = 120.0,
213
213
  env_prefix: str = "",
214
214
  max_batch: int = 5,
215
+ max_retries: int = 3,
216
+ retry_base_delay: float = 0.1,
217
+ retry_max_delay: float = 1.0,
215
218
  ) -> None:
216
219
  self._configured_provider = provider
217
220
  self._provider = provider
@@ -229,6 +232,25 @@ class EmbedClient:
229
232
  # cap observed on Pentatonic AI Gateway — above which it 502s and the
230
233
  # caller silently loses vector writes (see test_chunking_* tests).
231
234
  self._max_batch = max(0, max_batch)
235
+ # Retry-with-jitter for transient gateway saturation. The
236
+ # Pentatonic AI Gateway has a K≈10 concurrent-request cap; when
237
+ # multiple chunks of a single batch (or multiple concurrent
238
+ # batches from different layers) saturate it, individual POSTs
239
+ # 502/503. The 2026-05-15 incident showed an L6 fallback path
240
+ # 502-rate of 96% under Pip backfill load — every shared-embed
241
+ # failed, every per-layer fallback also failed, the cascade
242
+ # cleared only when traffic dropped.
243
+ #
244
+ # Retries with full jitter let those transient saturations
245
+ # be absorbed instead of cascading: when many concurrent chunks all
246
+ # 502 at once, jittered backoff staggers their retries so the
247
+ # gateway recovers slot-by-slot rather than thundering-herding.
248
+ # Tuned via {prefix}EMBED_MAX_RETRIES (default 3); set to 0
249
+ # to restore pre-fix behaviour. Only 429/502/503/504 responses and
250
+ # network-level httpx errors are retried — auth + other 4xx errors fail fast.
251
+ self._max_retries = max(0, max_retries)
252
+ self._retry_base_delay = max(0.0, retry_base_delay)
253
+ self._retry_max_delay = max(self._retry_base_delay, retry_max_delay)
232
254
 
233
255
  # ------------------------------------------------------------------
234
256
  # Construction
@@ -268,6 +290,13 @@ class EmbedClient:
268
290
  autodetect = os.environ.get(f"{prefix}EMBED_AUTODETECT", "true").lower() == "true"
269
291
  timeout = float(os.environ.get(f"{prefix}EMBED_TIMEOUT", "120"))
270
292
  max_batch = int(os.environ.get(f"{prefix}EMBED_MAX_BATCH", "5"))
293
+ max_retries = int(os.environ.get(f"{prefix}EMBED_MAX_RETRIES", "3"))
294
+ retry_base_delay = float(
295
+ os.environ.get(f"{prefix}EMBED_RETRY_BASE_DELAY", "0.1")
296
+ )
297
+ retry_max_delay = float(
298
+ os.environ.get(f"{prefix}EMBED_RETRY_MAX_DELAY", "1.0")
299
+ )
271
300
 
272
301
  provider = resolve_provider(provider_name, env_prefix=prefix)
273
302
  return cls(
@@ -279,6 +308,9 @@ class EmbedClient:
279
308
  timeout=timeout,
280
309
  env_prefix=prefix,
281
310
  max_batch=max_batch,
311
+ max_retries=max_retries,
312
+ retry_base_delay=retry_base_delay,
313
+ retry_max_delay=retry_max_delay,
282
314
  )
283
315
 
284
316
  # ------------------------------------------------------------------
@@ -369,41 +401,103 @@ class EmbedClient:
369
401
  # Request paths
370
402
  # ------------------------------------------------------------------
371
403
 
404
+ # Status codes that indicate transient gateway capacity issues
405
+ # (rate-limit, upstream saturation, transient unavailability,
406
+ # upstream timeout). 401 + other 4xx + non-listed 5xx fail fast —
407
+ # they typically indicate caller or config problems where retrying
408
+ # won't help.
409
+ _RETRYABLE_STATUS = frozenset({429, 502, 503, 504})
410
+
411
+ def _backoff_delay(self, attempt: int) -> float:
412
+ """Exponential backoff with full jitter.
413
+
414
+ Full jitter (random.uniform(0, cap)) is preferred over equal
415
+ jitter for the embed gateway case: many concurrent chunks all
416
+ 503 at the same instant, and full jitter maximally spreads
417
+ their retries so the gateway recovers slot-by-slot instead of
418
+ seeing periodic thundering herds.
419
+ """
420
+ import random
421
+ cap = min(self._retry_base_delay * (2 ** attempt), self._retry_max_delay)
422
+ return random.uniform(0, cap)
423
+
372
424
  def _post_with_autodetect(self, texts: list[str], *, async_mode: bool) -> list[list[float]]:
373
425
  del async_mode # kept for symmetry; sync path is its own method
374
- body = self._provider.body_builder(texts, self._model)
375
- headers = self._headers(self._provider)
376
- try:
377
- r = httpx.post(self._url, json=body, headers=headers, timeout=self._timeout)
378
- except httpx.HTTPError as exc:
379
- raise EmbedHTTPError(0, str(exc)) from exc
380
-
381
- if r.status_code == 401 and self._autodetect and not self._detected:
382
- return self._autodetect_and_retry(texts, last_body=r.text)
383
-
384
- if r.status_code == 401:
385
- raise EmbedAuthError(r.text)
386
- if not r.is_success:
387
- raise EmbedHTTPError(r.status_code, r.text)
388
- return self._provider.response_parser(r.json())
426
+ import time as _time
427
+ last_exc: EmbedHTTPError | None = None
428
+ for attempt in range(self._max_retries + 1):
429
+ body = self._provider.body_builder(texts, self._model)
430
+ headers = self._headers(self._provider)
431
+ try:
432
+ r = httpx.post(
433
+ self._url, json=body, headers=headers, timeout=self._timeout
434
+ )
435
+ except httpx.HTTPError as exc:
436
+ # Network-level error (DNS, connect refused, timeout).
437
+ # Treat as retryable — transient network blips are
438
+ # exactly what jittered retry is designed to absorb.
439
+ last_exc = EmbedHTTPError(0, str(exc))
440
+ if attempt >= self._max_retries:
441
+ raise last_exc from exc
442
+ _time.sleep(self._backoff_delay(attempt))
443
+ continue
444
+
445
+ if r.status_code == 401 and self._autodetect and not self._detected:
446
+ # Autodetect runs at most once (gated by self._detected)
447
+ # and tries other providers in sequence; no retry layer
448
+ # needed on top.
449
+ return self._autodetect_and_retry(texts, last_body=r.text)
450
+ if r.status_code == 401:
451
+ raise EmbedAuthError(r.text)
452
+ if not r.is_success:
453
+ if (
454
+ r.status_code in self._RETRYABLE_STATUS
455
+ and attempt < self._max_retries
456
+ ):
457
+ last_exc = EmbedHTTPError(r.status_code, r.text)
458
+ _time.sleep(self._backoff_delay(attempt))
459
+ continue
460
+ raise EmbedHTTPError(r.status_code, r.text)
461
+ return self._provider.response_parser(r.json())
462
+
463
+ # Loop exited without success or raise — shouldn't happen, but
464
+ # keep the type checker happy.
465
+ assert last_exc is not None
466
+ raise last_exc
389
467
 
390
468
  async def _post_with_autodetect_async(self, texts: list[str]) -> list[list[float]]:
391
- body = self._provider.body_builder(texts, self._model)
392
- headers = self._headers(self._provider)
393
- try:
394
- async with httpx.AsyncClient(timeout=self._timeout) as client:
395
- r = await client.post(self._url, json=body, headers=headers)
396
- except httpx.HTTPError as exc:
397
- raise EmbedHTTPError(0, str(exc)) from exc
398
-
399
- if r.status_code == 401 and self._autodetect and not self._detected:
400
- return await self._autodetect_and_retry_async(texts, last_body=r.text)
401
-
402
- if r.status_code == 401:
403
- raise EmbedAuthError(r.text)
404
- if not r.is_success:
405
- raise EmbedHTTPError(r.status_code, r.text)
406
- return self._provider.response_parser(r.json())
469
+ import asyncio as _asyncio
470
+ last_exc: EmbedHTTPError | None = None
471
+ for attempt in range(self._max_retries + 1):
472
+ body = self._provider.body_builder(texts, self._model)
473
+ headers = self._headers(self._provider)
474
+ try:
475
+ async with httpx.AsyncClient(timeout=self._timeout) as client:
476
+ r = await client.post(self._url, json=body, headers=headers)
477
+ except httpx.HTTPError as exc:
478
+ last_exc = EmbedHTTPError(0, str(exc))
479
+ if attempt >= self._max_retries:
480
+ raise last_exc from exc
481
+ await _asyncio.sleep(self._backoff_delay(attempt))
482
+ continue
483
+
484
+ if r.status_code == 401 and self._autodetect and not self._detected:
485
+ return await self._autodetect_and_retry_async(texts, last_body=r.text)
486
+ if r.status_code == 401:
487
+ raise EmbedAuthError(r.text)
488
+ if not r.is_success:
489
+ if (
490
+ r.status_code in self._RETRYABLE_STATUS
491
+ and attempt < self._max_retries
492
+ ):
493
+ last_exc = EmbedHTTPError(r.status_code, r.text)
494
+ await _asyncio.sleep(self._backoff_delay(attempt))
495
+ continue
496
+ raise EmbedHTTPError(r.status_code, r.text)
497
+ return self._provider.response_parser(r.json())
498
+
499
+ assert last_exc is not None
500
+ raise last_exc
407
501
 
408
502
  # ------------------------------------------------------------------
409
503
  # Auto-detect
@@ -268,6 +268,9 @@ def test_autodetect_all_fail_raises(recorder):
268
268
  # ----------------------------------------------------------------------
269
269
 
270
270
  def test_non_401_http_error_does_not_trigger_autodetect(recorder):
271
+ # max_retries=0 isolates this test to autodetect behaviour. With
272
+ # retries enabled (default), 503 triggers the retry path which is
273
+ # exercised separately in the retry tests below.
271
274
  recorder.respond(
272
275
  "https://gw/v1/embeddings",
273
276
  _FakeResponse(503, "upstream down"),
@@ -277,6 +280,7 @@ def test_non_401_http_error_does_not_trigger_autodetect(recorder):
277
280
  api_key="k",
278
281
  model="m",
279
282
  provider=PROVIDERS["openai"],
283
+ max_retries=0,
280
284
  )
281
285
  with pytest.raises(EmbedHTTPError) as exc:
282
286
  client.embed_batch(["x"])
@@ -490,3 +494,200 @@ def test_from_env_default_max_batch_is_five(monkeypatch):
490
494
  client.embed_batch([f"t{i}" for i in range(10)])
491
495
  # 10 with default chunk=5 → [5, 5] → 2 calls
492
496
  assert len(stub.calls) == 2
497
+
498
+
499
+ # ----------------------------------------------------------------------
500
+ # Retry-with-jitter on transient gateway saturation (502/503/504/429)
501
+ # ----------------------------------------------------------------------
502
+ #
503
+ # These tests exercise the retry path added 2026-05-15. Motivation:
504
+ # the Pentatonic AI Gateway has a K≈10 concurrency cap and 502s under
505
+ # saturation; without retry, a single 502 cascades through the engine's
506
+ # per-layer fallback path and amplifies load instead of damping it.
507
+ # See the prod incident note on EmbedClient.__init__ for context.
508
+
509
+
510
+ class _SequencedRecorder:
511
+ """Returns a different response on each successive call.
512
+
513
+ The default `_Recorder` returns the same response every time, which
514
+ is wrong for retry tests — we need to verify "first call 502, then
515
+ succeed on retry". This recorder pops responses off a queue per
516
+ URL and keeps replaying the last queued response once only it remains
517
+ (matching the "persistent failure" test case naturally).
518
+ """
519
+
520
+ def __init__(self):
521
+ self.calls: list[dict] = []
522
+ self.queues: dict[str, list[_FakeResponse]] = {}
523
+
524
+ def queue(self, url: str, responses: list[_FakeResponse]) -> None:
525
+ self.queues[url] = list(responses)
526
+
527
+ def __call__(self, url, *, json, headers, timeout):
528
+ self.calls.append({"url": url, "json": json})
529
+ q = self.queues.get(url, [])
530
+ if not q:
531
+ return _FakeResponse(401, "no responses queued")
532
+ # Pop unless this is the last one — keep returning the tail so
533
+ # "all attempts fail" tests don't need to queue N copies.
534
+ return q.pop(0) if len(q) > 1 else q[0]
535
+
536
+
537
+ @pytest.fixture
538
+ def sequenced(monkeypatch):
539
+ rec = _SequencedRecorder()
540
+ monkeypatch.setattr(httpx, "post", rec)
541
+ # Avoid the test taking real wall time on backoff sleeps — patch
542
+ # time.sleep to no-op. The jitter calculation still runs, just
543
+ # without the actual delay.
544
+ import time as _time
545
+ monkeypatch.setattr(_time, "sleep", lambda _s: None)
546
+ return rec
547
+
548
+
549
+ def test_retries_on_502_and_succeeds(sequenced):
550
+ sequenced.queue(
551
+ "https://gw/v1/embeddings",
552
+ [
553
+ _FakeResponse(502, "bad gateway"),
554
+ _FakeResponse(200, {"data": [{"embedding": [0.1, 0.2]}]}),
555
+ ],
556
+ )
557
+ client = EmbedClient(
558
+ url="https://gw/v1/embeddings",
559
+ api_key="k",
560
+ model="m",
561
+ provider=PROVIDERS["openai"],
562
+ max_retries=3,
563
+ )
564
+ out = client.embed_batch(["hello"])
565
+ assert out == [[0.1, 0.2]]
566
+ # First call 502, second call 200 — exactly two attempts.
567
+ assert len(sequenced.calls) == 2
568
+
569
+
570
+ def test_retries_on_503_504_429(sequenced):
571
+ """Each transient code triggers the retry path the same way."""
572
+ for code in (503, 504, 429):
573
+ sequenced.calls.clear()
574
+ sequenced.queue(
575
+ "https://gw/v1/embeddings",
576
+ [
577
+ _FakeResponse(code, "transient"),
578
+ _FakeResponse(200, {"data": [{"embedding": [0.0]}]}),
579
+ ],
580
+ )
581
+ client = EmbedClient(
582
+ url="https://gw/v1/embeddings",
583
+ api_key="k",
584
+ model="m",
585
+ provider=PROVIDERS["openai"],
586
+ max_retries=3,
587
+ )
588
+ out = client.embed_batch(["x"])
589
+ assert out == [[0.0]], f"retry failed for status {code}"
590
+ assert len(sequenced.calls) == 2, f"wrong call count for status {code}"
591
+
592
+
593
+ def test_does_not_retry_on_500(sequenced):
594
+ """500 is server-side bug, not transient saturation — fail fast."""
595
+ sequenced.queue(
596
+ "https://gw/v1/embeddings",
597
+ [_FakeResponse(500, "internal server error")],
598
+ )
599
+ client = EmbedClient(
600
+ url="https://gw/v1/embeddings",
601
+ api_key="k",
602
+ model="m",
603
+ provider=PROVIDERS["openai"],
604
+ max_retries=3,
605
+ )
606
+ with pytest.raises(EmbedHTTPError) as exc:
607
+ client.embed_batch(["x"])
608
+ assert exc.value.status == 500
609
+ # Exactly one attempt — no retry on 500.
610
+ assert len(sequenced.calls) == 1
611
+
612
+
613
+ def test_does_not_retry_on_400(sequenced):
614
+ """4xx (other than 401-autodetect / 429) indicates caller error."""
615
+ sequenced.queue(
616
+ "https://gw/v1/embeddings",
617
+ [_FakeResponse(400, "bad request")],
618
+ )
619
+ client = EmbedClient(
620
+ url="https://gw/v1/embeddings",
621
+ api_key="k",
622
+ model="m",
623
+ provider=PROVIDERS["openai"],
624
+ max_retries=3,
625
+ )
626
+ with pytest.raises(EmbedHTTPError) as exc:
627
+ client.embed_batch(["x"])
628
+ assert exc.value.status == 400
629
+ assert len(sequenced.calls) == 1
630
+
631
+
632
+ def test_max_retries_exhausted_raises(sequenced):
633
+ """Persistent 502 raises after max_retries+1 attempts."""
634
+ sequenced.queue(
635
+ "https://gw/v1/embeddings",
636
+ [_FakeResponse(502, "still down")],
637
+ )
638
+ client = EmbedClient(
639
+ url="https://gw/v1/embeddings",
640
+ api_key="k",
641
+ model="m",
642
+ provider=PROVIDERS["openai"],
643
+ max_retries=3,
644
+ )
645
+ with pytest.raises(EmbedHTTPError) as exc:
646
+ client.embed_batch(["x"])
647
+ assert exc.value.status == 502
648
+ # max_retries=3 → 1 original + 3 retries = 4 calls total.
649
+ assert len(sequenced.calls) == 4
650
+
651
+
652
+ def test_max_retries_zero_disables_retry(sequenced):
653
+ """Explicit opt-out preserves pre-fix behaviour for callers that
654
+ handle their own retry."""
655
+ sequenced.queue(
656
+ "https://gw/v1/embeddings",
657
+ [_FakeResponse(502, "down")],
658
+ )
659
+ client = EmbedClient(
660
+ url="https://gw/v1/embeddings",
661
+ api_key="k",
662
+ model="m",
663
+ provider=PROVIDERS["openai"],
664
+ max_retries=0,
665
+ )
666
+ with pytest.raises(EmbedHTTPError):
667
+ client.embed_batch(["x"])
668
+ assert len(sequenced.calls) == 1
669
+
670
+
671
+ def test_from_env_reads_retry_config(monkeypatch):
672
+ """{prefix}EMBED_MAX_RETRIES + EMBED_RETRY_BASE_DELAY +
673
+ EMBED_RETRY_MAX_DELAY override the defaults."""
674
+ monkeypatch.setenv("L4_NV_EMBED_URL", "https://gw/v1/embeddings")
675
+ monkeypatch.setenv("L4_EMBED_API_KEY", "k")
676
+ monkeypatch.setenv("L4_EMBED_MAX_RETRIES", "5")
677
+ monkeypatch.setenv("L4_EMBED_RETRY_BASE_DELAY", "0.25")
678
+ monkeypatch.setenv("L4_EMBED_RETRY_MAX_DELAY", "2.5")
679
+ client = EmbedClient.from_env(prefix="L4_")
680
+ assert client._max_retries == 5
681
+ assert client._retry_base_delay == 0.25
682
+ assert client._retry_max_delay == 2.5
683
+
684
+
685
+ def test_from_env_default_retry_config(monkeypatch):
686
+ """Defaults: 3 retries, 100ms base, 1s cap — tuned for K≈10
687
+ gateway under burst load."""
688
+ monkeypatch.setenv("L4_NV_EMBED_URL", "https://gw/v1/embeddings")
689
+ monkeypatch.setenv("L4_EMBED_API_KEY", "k")
690
+ client = EmbedClient.from_env(prefix="L4_")
691
+ assert client._max_retries == 3
692
+ assert client._retry_base_delay == 0.1
693
+ assert client._retry_max_delay == 1.0