@pentatonic-ai/ai-agent-sdk 0.9.4 → 0.9.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +39 -72
- package/dist/index.js +36 -69
- package/package.json +9 -2
- package/packages/memory/package-lock.json +49 -33
- package/packages/memory/package.json +4 -1
- package/packages/memory/src/__tests__/engine.test.js +40 -5
- package/packages/memory/src/engine.js +38 -3
- package/packages/memory-engine/docker-compose.yml +24 -2
- package/packages/memory-engine/engine/services/_shared/embed_provider.py +125 -31
- package/packages/memory-engine/engine/services/l2/Dockerfile +7 -0
- package/packages/memory-engine/engine/services/l2/l2-hybridrag-proxy.py +233 -60
- package/packages/memory-engine/tests/test_embed_provider.py +201 -0
- package/packages/memory-engine/tests/test_l2_qmd_vec_search.py +280 -0
|
@@ -1,20 +1,19 @@
|
|
|
1
1
|
{
|
|
2
|
-
"name": "
|
|
3
|
-
"version": "0.1.0",
|
|
2
|
+
"name": "memory",
|
|
4
3
|
"lockfileVersion": 3,
|
|
5
4
|
"requires": true,
|
|
6
5
|
"packages": {
|
|
7
6
|
"": {
|
|
8
|
-
"name": "
|
|
7
|
+
"name": "memory",
|
|
9
8
|
"dependencies": {
|
|
10
9
|
"@modelcontextprotocol/sdk": "^1.0.0",
|
|
11
10
|
"pg": "^8.13.0"
|
|
12
11
|
}
|
|
13
12
|
},
|
|
14
13
|
"node_modules/@hono/node-server": {
|
|
15
|
-
"version": "1.19.
|
|
16
|
-
"resolved": "https://registry.npmjs.org/@hono/node-server/-/node-server-1.19.
|
|
17
|
-
"integrity": "sha512-
|
|
14
|
+
"version": "1.19.14",
|
|
15
|
+
"resolved": "https://registry.npmjs.org/@hono/node-server/-/node-server-1.19.14.tgz",
|
|
16
|
+
"integrity": "sha512-GwtvgtXxnWsucXvbQXkRgqksiH2Qed37H9xHZocE5sA3N8O8O8/8FA3uclQXxXVzc9XBZuEOMK7+r02FmSpHtw==",
|
|
18
17
|
"license": "MIT",
|
|
19
18
|
"engines": {
|
|
20
19
|
"node": ">=18.14.1"
|
|
@@ -77,9 +76,9 @@
|
|
|
77
76
|
}
|
|
78
77
|
},
|
|
79
78
|
"node_modules/ajv": {
|
|
80
|
-
"version": "8.
|
|
81
|
-
"resolved": "https://registry.npmjs.org/ajv/-/ajv-8.
|
|
82
|
-
"integrity": "sha512-
|
|
79
|
+
"version": "8.20.0",
|
|
80
|
+
"resolved": "https://registry.npmjs.org/ajv/-/ajv-8.20.0.tgz",
|
|
81
|
+
"integrity": "sha512-Thbli+OlOj+iMPYFBVBfJ3OmCAnaSyNn4M1vz9T6Gka5Jt9ba/HIR56joy65tY6kx/FCF5VXNB819Y7/GUrBGA==",
|
|
83
82
|
"license": "MIT",
|
|
84
83
|
"dependencies": {
|
|
85
84
|
"fast-deep-equal": "^3.1.3",
|
|
@@ -355,9 +354,9 @@
|
|
|
355
354
|
}
|
|
356
355
|
},
|
|
357
356
|
"node_modules/eventsource-parser": {
|
|
358
|
-
"version": "3.0.
|
|
359
|
-
"resolved": "https://registry.npmjs.org/eventsource-parser/-/eventsource-parser-3.0.
|
|
360
|
-
"integrity": "sha512-
|
|
357
|
+
"version": "3.0.8",
|
|
358
|
+
"resolved": "https://registry.npmjs.org/eventsource-parser/-/eventsource-parser-3.0.8.tgz",
|
|
359
|
+
"integrity": "sha512-70QWGkr4snxr0OXLRWsFLeRBIRPuQOvt4s8QYjmUlmlkyTZkRqS7EDVRZtzU3TiyDbXSzaOeF0XUKy8PchzukQ==",
|
|
361
360
|
"license": "MIT",
|
|
362
361
|
"engines": {
|
|
363
362
|
"node": ">=18.0.0"
|
|
@@ -407,12 +406,12 @@
|
|
|
407
406
|
}
|
|
408
407
|
},
|
|
409
408
|
"node_modules/express-rate-limit": {
|
|
410
|
-
"version": "8.
|
|
411
|
-
"resolved": "https://registry.npmjs.org/express-rate-limit/-/express-rate-limit-8.
|
|
412
|
-
"integrity": "sha512-
|
|
409
|
+
"version": "8.5.1",
|
|
410
|
+
"resolved": "https://registry.npmjs.org/express-rate-limit/-/express-rate-limit-8.5.1.tgz",
|
|
411
|
+
"integrity": "sha512-5O6KYmyJEpuPJV5hNTXKbAHWRqrzyu+OI3vUnSd2kXFubIVpG7ezpgxQy76Zo5GQZtrQBg86hF+CM/NX+cioiQ==",
|
|
413
412
|
"license": "MIT",
|
|
414
413
|
"dependencies": {
|
|
415
|
-
"ip-address": "10.
|
|
414
|
+
"ip-address": "^10.2.0"
|
|
416
415
|
},
|
|
417
416
|
"engines": {
|
|
418
417
|
"node": ">= 16"
|
|
@@ -556,9 +555,9 @@
|
|
|
556
555
|
}
|
|
557
556
|
},
|
|
558
557
|
"node_modules/hasown": {
|
|
559
|
-
"version": "2.0.
|
|
560
|
-
"resolved": "https://registry.npmjs.org/hasown/-/hasown-2.0.
|
|
561
|
-
"integrity": "sha512-
|
|
558
|
+
"version": "2.0.3",
|
|
559
|
+
"resolved": "https://registry.npmjs.org/hasown/-/hasown-2.0.3.tgz",
|
|
560
|
+
"integrity": "sha512-ej4AhfhfL2Q2zpMmLo7U1Uv9+PyhIZpgQLGT1F9miIGmiCJIoCgSmczFdrc97mWT4kVY72KA+WnnhJ5pghSvSg==",
|
|
562
561
|
"license": "MIT",
|
|
563
562
|
"dependencies": {
|
|
564
563
|
"function-bind": "^1.1.2"
|
|
@@ -619,9 +618,9 @@
|
|
|
619
618
|
"license": "ISC"
|
|
620
619
|
},
|
|
621
620
|
"node_modules/ip-address": {
|
|
622
|
-
"version": "10.
|
|
623
|
-
"resolved": "https://registry.npmjs.org/ip-address/-/ip-address-10.
|
|
624
|
-
"integrity": "sha512
|
|
621
|
+
"version": "10.2.0",
|
|
622
|
+
"resolved": "https://registry.npmjs.org/ip-address/-/ip-address-10.2.0.tgz",
|
|
623
|
+
"integrity": "sha512-/+S6j4E9AHvW9SWMSEY9Xfy66O5PWvVEJ08O0y5JGyEKQpojb0K0GKpz/v5HJ/G0vi3D2sjGK78119oXZeE0qA==",
|
|
625
624
|
"license": "MIT",
|
|
626
625
|
"engines": {
|
|
627
626
|
"node": ">= 12"
|
|
@@ -649,9 +648,9 @@
|
|
|
649
648
|
"license": "ISC"
|
|
650
649
|
},
|
|
651
650
|
"node_modules/jose": {
|
|
652
|
-
"version": "6.2.
|
|
653
|
-
"resolved": "https://registry.npmjs.org/jose/-/jose-6.2.
|
|
654
|
-
"integrity": "sha512-
|
|
651
|
+
"version": "6.2.3",
|
|
652
|
+
"resolved": "https://registry.npmjs.org/jose/-/jose-6.2.3.tgz",
|
|
653
|
+
"integrity": "sha512-YYVDInQKFJfR/xa3ojUTl8c2KoTwiL1R5Wg9YCydwH0x0B9grbzlg5HC7mMjCtUJjbQ/YnGEZIhI5tCgfTb4Hw==",
|
|
655
654
|
"license": "MIT",
|
|
656
655
|
"funding": {
|
|
657
656
|
"url": "https://github.com/sponsors/panva"
|
|
@@ -1201,17 +1200,34 @@
|
|
|
1201
1200
|
}
|
|
1202
1201
|
},
|
|
1203
1202
|
"node_modules/type-is": {
|
|
1204
|
-
"version": "2.0
|
|
1205
|
-
"resolved": "https://registry.npmjs.org/type-is/-/type-is-2.0.
|
|
1206
|
-
"integrity": "sha512-
|
|
1203
|
+
"version": "2.1.0",
|
|
1204
|
+
"resolved": "https://registry.npmjs.org/type-is/-/type-is-2.1.0.tgz",
|
|
1205
|
+
"integrity": "sha512-faYHw0anBbc/kWF3zFTEnxSFOAGUX9GFbOBthvDdLsIlEoWOFOtS0zgCiQYwIskL9iGXZL3kAXD8OoZ4GmMATA==",
|
|
1207
1206
|
"license": "MIT",
|
|
1208
1207
|
"dependencies": {
|
|
1209
|
-
"content-type": "^
|
|
1208
|
+
"content-type": "^2.0.0",
|
|
1210
1209
|
"media-typer": "^1.1.0",
|
|
1211
1210
|
"mime-types": "^3.0.0"
|
|
1212
1211
|
},
|
|
1213
1212
|
"engines": {
|
|
1214
|
-
"node": ">=
|
|
1213
|
+
"node": ">= 18"
|
|
1214
|
+
},
|
|
1215
|
+
"funding": {
|
|
1216
|
+
"type": "opencollective",
|
|
1217
|
+
"url": "https://opencollective.com/express"
|
|
1218
|
+
}
|
|
1219
|
+
},
|
|
1220
|
+
"node_modules/type-is/node_modules/content-type": {
|
|
1221
|
+
"version": "2.0.0",
|
|
1222
|
+
"resolved": "https://registry.npmjs.org/content-type/-/content-type-2.0.0.tgz",
|
|
1223
|
+
"integrity": "sha512-j/O/d7GcZCyNl7/hwZAb606rzqkyvaDctLmckbxLzHvFBzTJHuGEdodATcP3yIRoDrLHkIATJuvzbFlp/ki2cQ==",
|
|
1224
|
+
"license": "MIT",
|
|
1225
|
+
"engines": {
|
|
1226
|
+
"node": ">=18"
|
|
1227
|
+
},
|
|
1228
|
+
"funding": {
|
|
1229
|
+
"type": "opencollective",
|
|
1230
|
+
"url": "https://opencollective.com/express"
|
|
1215
1231
|
}
|
|
1216
1232
|
},
|
|
1217
1233
|
"node_modules/unpipe": {
|
|
@@ -1263,9 +1279,9 @@
|
|
|
1263
1279
|
}
|
|
1264
1280
|
},
|
|
1265
1281
|
"node_modules/zod": {
|
|
1266
|
-
"version": "4.3
|
|
1267
|
-
"resolved": "https://registry.npmjs.org/zod/-/zod-4.3.
|
|
1268
|
-
"integrity": "sha512-
|
|
1282
|
+
"version": "4.4.3",
|
|
1283
|
+
"resolved": "https://registry.npmjs.org/zod/-/zod-4.4.3.tgz",
|
|
1284
|
+
"integrity": "sha512-ytENFjIJFl2UwYglde2jchW2Hwm4GJFLDiSXWdTrJQBIN9Fcyp7n4DhxJEiWNAJMV1/BqWfW/kkg71UDcHJyTQ==",
|
|
1269
1285
|
"license": "MIT",
|
|
1270
1286
|
"funding": {
|
|
1271
1287
|
"url": "https://github.com/sponsors/colinhacks"
|
|
@@ -1,10 +1,13 @@
|
|
|
1
1
|
{
|
|
2
2
|
"private": true,
|
|
3
3
|
"name": "memory",
|
|
4
|
-
"description": "Memory subsystem
|
|
4
|
+
"description": "Memory subsystem \u2014 imported via @pentatonic-ai/ai-agent-sdk/memory",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"dependencies": {
|
|
7
7
|
"@modelcontextprotocol/sdk": "^1.0.0",
|
|
8
8
|
"pg": "^8.13.0"
|
|
9
|
+
},
|
|
10
|
+
"overrides": {
|
|
11
|
+
"ip-address": "^10.1.1"
|
|
9
12
|
}
|
|
10
13
|
}
|
|
@@ -730,15 +730,22 @@ describe("engine HTTP client", () => {
|
|
|
730
730
|
});
|
|
731
731
|
|
|
732
732
|
describe("engineForget", () => {
|
|
733
|
-
it("forwards id when provided", async () => {
|
|
733
|
+
it("forwards id when provided (no arena composition for id-based deletes)", async () => {
|
|
734
734
|
mockOk({ deleted: 1 });
|
|
735
735
|
await engineForget("https://e", { clientId: "acme", id: "abc" });
|
|
736
736
|
const body = JSON.parse(calls[0].init.body);
|
|
737
737
|
expect(calls[0].url).toBe("https://e/forget");
|
|
738
|
-
|
|
738
|
+
// id-only deletes target the global record id; the engine's
|
|
739
|
+
// id path doesn't read arena scope, so we don't inject it.
|
|
740
|
+
expect(body).toEqual({ id: "abc" });
|
|
739
741
|
});
|
|
740
742
|
|
|
741
|
-
it("forwards metadata_contains
|
|
743
|
+
it("forwards metadata_contains and injects arena INSIDE it (tenant default)", async () => {
|
|
744
|
+
// The engine reads `metadata_contains.arena` (not top-level
|
|
745
|
+
// arena) to scope a forget at L2. Pre-2026-05-14 this helper
|
|
746
|
+
// put arena at the top level, which the engine silently
|
|
747
|
+
// ignored — only L6 ever got wiped. Pinning the post-fix
|
|
748
|
+
// contract here so a regression can't sneak back in.
|
|
742
749
|
mockOk({ deleted: 5 });
|
|
743
750
|
await engineForget("https://e", {
|
|
744
751
|
clientId: "acme",
|
|
@@ -746,11 +753,39 @@ describe("engine HTTP client", () => {
|
|
|
746
753
|
});
|
|
747
754
|
const body = JSON.parse(calls[0].init.body);
|
|
748
755
|
expect(body).toEqual({
|
|
749
|
-
arena: "acme",
|
|
750
|
-
|
|
756
|
+
metadata_contains: { arena: "acme", source_repo: "monorepo" },
|
|
757
|
+
});
|
|
758
|
+
// Top-level arena must NOT be sent — the engine ignores it and
|
|
759
|
+
// its presence would mislead anyone reading wire dumps.
|
|
760
|
+
expect(body.arena).toBeUndefined();
|
|
761
|
+
});
|
|
762
|
+
|
|
763
|
+
it("composes user-scoped arena when userId is supplied", async () => {
|
|
764
|
+
mockOk({ deleted: 12 });
|
|
765
|
+
await engineForget("https://e", {
|
|
766
|
+
clientId: "acme",
|
|
767
|
+
userId: "u-1",
|
|
768
|
+
metadataContains: { actor_user_id: "u-1" },
|
|
769
|
+
});
|
|
770
|
+
const body = JSON.parse(calls[0].init.body);
|
|
771
|
+
expect(body).toEqual({
|
|
772
|
+
metadata_contains: { arena: "acme:u-1", actor_user_id: "u-1" },
|
|
751
773
|
});
|
|
752
774
|
});
|
|
753
775
|
|
|
776
|
+
it("respects caller-supplied arena inside metadataContains (super-admin override)", async () => {
|
|
777
|
+
// Super-admin tooling that wipes "some other tenant's user arena"
|
|
778
|
+
// — pass the explicit arena and the SDK leaves it alone instead
|
|
779
|
+
// of recomposing from (clientId, userId).
|
|
780
|
+
mockOk({ deleted: 99 });
|
|
781
|
+
await engineForget("https://e", {
|
|
782
|
+
clientId: "tes-admin",
|
|
783
|
+
metadataContains: { arena: "victim-tenant:u-7", source: "x" },
|
|
784
|
+
});
|
|
785
|
+
const body = JSON.parse(calls[0].init.body);
|
|
786
|
+
expect(body.metadata_contains.arena).toBe("victim-tenant:u-7");
|
|
787
|
+
});
|
|
788
|
+
|
|
754
789
|
it("requires id or metadataContains", async () => {
|
|
755
790
|
await expect(
|
|
756
791
|
engineForget("https://e", { clientId: "acme" })
|
|
@@ -328,9 +328,31 @@ export async function engineSearch(engineUrl, opts) {
|
|
|
328
328
|
*
|
|
329
329
|
* Caller must supply exactly one of `id` or `metadataContains`.
|
|
330
330
|
*
|
|
331
|
+
* Arena scope: the engine extracts the arena from `metadata_contains.arena`
|
|
332
|
+
* (see memory-engine `compat/server.py:1048-1052`). Top-level `arena` is
|
|
333
|
+
* NOT read by the engine — previous versions of this helper put it there
|
|
334
|
+
* and the resulting calls only ever wiped L6, leaving L0/L2/L3/L4 records
|
|
335
|
+
* untouched. The 2026-05-14 Pip dedup cutover surfaced the bug: an
|
|
336
|
+
* actor_user_id wipe returned 0 against an arena that personFacets
|
|
337
|
+
* confirmed held thousands of records. This helper now injects `arena`
|
|
338
|
+
* into `metadata_contains` so the engine forwards to L2 /forget-internal
|
|
339
|
+
* and actually wipes the cross-layer arena.
|
|
340
|
+
*
|
|
341
|
+
* By default the forget call is **user-scoped** (`arena = clientId:userId`) when
|
|
342
|
+
* `userId` is supplied, otherwise **tenant-wide** (`arena = clientId`).
|
|
343
|
+
* Pass `scope: "tenant"` explicitly to bypass the user-arena scope from a
|
|
344
|
+
* user-context. Matches `engineStore`'s arena semantics for symmetry.
|
|
345
|
+
*
|
|
346
|
+
* If the caller passes `arena` inside `metadataContains` themselves, the
|
|
347
|
+
* SDK respects it as-is and skips composition — useful for super-admin
|
|
348
|
+
* tools that need to wipe an arena other than the one derived from
|
|
349
|
+
* (clientId, userId).
|
|
350
|
+
*
|
|
331
351
|
* @param {string} engineUrl
|
|
332
352
|
* @param {object} opts
|
|
333
353
|
* @param {string} opts.clientId
|
|
354
|
+
* @param {string} [opts.userId] user id within the tenant; controls default scope
|
|
355
|
+
* @param {"tenant"|"user"} [opts.scope] override the default scope. "user" requires userId.
|
|
334
356
|
* @param {string} [opts.id] forget a single record by engine id
|
|
335
357
|
* @param {object} [opts.metadataContains] forget all records matching every key=value pair
|
|
336
358
|
* @param {Record<string,string>} [opts.headers] forwarded HTTP headers
|
|
@@ -338,15 +360,28 @@ export async function engineSearch(engineUrl, opts) {
|
|
|
338
360
|
* @returns {Promise<{deleted: number}>}
|
|
339
361
|
*/
|
|
340
362
|
export async function engineForget(engineUrl, opts) {
|
|
341
|
-
const { clientId, id, metadataContains, headers } = opts || {};
|
|
363
|
+
const { clientId, userId, scope, id, metadataContains, headers } = opts || {};
|
|
342
364
|
if (!clientId) throw new Error("engineForget: clientId required");
|
|
343
365
|
if (!id && !metadataContains) {
|
|
344
366
|
throw new Error("engineForget: provide id or metadataContains");
|
|
345
367
|
}
|
|
368
|
+
|
|
369
|
+
// Compose arena from (clientId, userId, scope) using the same shape
|
|
370
|
+
// engineStore uses. Caller-supplied `metadataContains.arena` wins —
|
|
371
|
+
// the SDK shouldn't second-guess a super-admin explicitly targeting
|
|
372
|
+
// a specific arena.
|
|
373
|
+
let mergedMetadata;
|
|
374
|
+
if (metadataContains) {
|
|
375
|
+
const hasExplicitArena =
|
|
376
|
+
typeof metadataContains.arena === "string" && metadataContains.arena;
|
|
377
|
+
mergedMetadata = hasExplicitArena
|
|
378
|
+
? metadataContains
|
|
379
|
+
: { ...metadataContains, arena: composeArena(clientId, userId, scope) };
|
|
380
|
+
}
|
|
381
|
+
|
|
346
382
|
const body = {
|
|
347
|
-
arena: clientId,
|
|
348
383
|
...(id ? { id } : {}),
|
|
349
|
-
...(
|
|
384
|
+
...(mergedMetadata ? { metadata_contains: mergedMetadata } : {}),
|
|
350
385
|
};
|
|
351
386
|
return fetchEngine(engineUrl, "/forget", body, { headers });
|
|
352
387
|
}
|
|
@@ -72,7 +72,22 @@ services:
|
|
|
72
72
|
environment:
|
|
73
73
|
NEO4J_AUTH: ${NEO4J_AUTH:-neo4j/local-dev-pw}
|
|
74
74
|
NEO4J_PLUGINS: '["apoc"]'
|
|
75
|
-
|
|
75
|
+
# Heap defaults were 512m hardcoded — fine for an empty dev
|
|
76
|
+
# graph, catastrophic at production scale. A 2026-05-14 prod
|
|
77
|
+
# incident on a ~10M-relationship KG saw L3 sit at >600% CPU
|
|
78
|
+
# locked in parallel GC, blocking the L2 write fan-out and
|
|
79
|
+
# triggering cascading 5xx through L6 and the embed gateway.
|
|
80
|
+
# The graph fit in RAM fine; the JVM just had nowhere to put
|
|
81
|
+
# short-lived allocations.
|
|
82
|
+
#
|
|
83
|
+
# Defaults now sized for a small-but-realistic local graph
|
|
84
|
+
# (~1M relationships): 1g max heap + 256m initial heap + 512m pagecache.
|
|
85
|
+
# Production deployments override via PME_L3_HEAP_MAX etc.
|
|
86
|
+
# (the AWS overlay sets 4g/1g/1g — see thing-event-system
|
|
87
|
+
# modules/pentatonic-memory/deploy/docker-compose.aws.yml).
|
|
88
|
+
NEO4J_dbms_memory_heap_max__size: ${PME_L3_HEAP_MAX:-1g}
|
|
89
|
+
NEO4J_dbms_memory_heap_initial__size: ${PME_L3_HEAP_INITIAL:-256m}
|
|
90
|
+
NEO4J_dbms_memory_pagecache_size: ${PME_L3_PAGECACHE:-512m}
|
|
76
91
|
volumes:
|
|
77
92
|
- pme-l3-data:/data
|
|
78
93
|
healthcheck:
|
|
@@ -220,7 +235,14 @@ services:
|
|
|
220
235
|
interval: 10s
|
|
221
236
|
timeout: 5s
|
|
222
237
|
retries: 30
|
|
223
|
-
|
|
238
|
+
# 180s gives L2 enough time to finish Neo4j schema + index creation
|
|
239
|
+
# on a cold start before compat's healthcheck starts counting failures.
|
|
240
|
+
# Observed concretely on the v0.9.4 deploy (2026-05-14): L2 took
|
|
241
|
+
# ~90s to warm up; with start_period: 60s, compat went unhealthy
|
|
242
|
+
# mid-startup, cloudflared's `depends_on: condition: service_healthy`
|
|
243
|
+
# failed, and `docker compose up` errored out before wait_for_health
|
|
244
|
+
# could observe the eventual recovery.
|
|
245
|
+
start_period: 180s
|
|
224
246
|
|
|
225
247
|
networks:
|
|
226
248
|
engine-net:
|
|
@@ -212,6 +212,9 @@ class EmbedClient:
|
|
|
212
212
|
timeout: float = 120.0,
|
|
213
213
|
env_prefix: str = "",
|
|
214
214
|
max_batch: int = 5,
|
|
215
|
+
max_retries: int = 3,
|
|
216
|
+
retry_base_delay: float = 0.1,
|
|
217
|
+
retry_max_delay: float = 1.0,
|
|
215
218
|
) -> None:
|
|
216
219
|
self._configured_provider = provider
|
|
217
220
|
self._provider = provider
|
|
@@ -229,6 +232,25 @@ class EmbedClient:
|
|
|
229
232
|
# cap observed on Pentatonic AI Gateway — above which it 502s and the
|
|
230
233
|
# caller silently loses vector writes (see test_chunking_* tests).
|
|
231
234
|
self._max_batch = max(0, max_batch)
|
|
235
|
+
# Retry-with-jitter for transient gateway saturation. The
|
|
236
|
+
# Pentatonic AI Gateway has a K≈10 concurrent-request cap; when
|
|
237
|
+
# multiple chunks of a single batch (or multiple concurrent
|
|
238
|
+
# batches from different layers) saturate it, individual POSTs
|
|
239
|
+
# 502/503. The 2026-05-15 incident showed an L6 fallback path
|
|
240
|
+
# 502-rate of 96% under Pip backfill load — every shared-embed
|
|
241
|
+
# failed, every per-layer fallback also failed, the cascade
|
|
242
|
+
# cleared only when traffic dropped.
|
|
243
|
+
#
|
|
244
|
+
# Retries with full jitter let those transient saturations
|
|
245
|
+
# be absorbed instead of cascading: when many concurrent chunks all
|
|
246
|
+
# 502 at once, jittered backoff staggers their retries so the
|
|
247
|
+
# gateway recovers slot-by-slot rather than thundering-herding.
|
|
248
|
+
# Tuned via {prefix}EMBED_MAX_RETRIES (default 3); set to 0
|
|
249
|
+
# to restore pre-fix behaviour. Only 429/502/503/504 are
|
|
250
|
+
# retried — auth + other 4xx errors fail fast.
|
|
251
|
+
self._max_retries = max(0, max_retries)
|
|
252
|
+
self._retry_base_delay = max(0.0, retry_base_delay)
|
|
253
|
+
self._retry_max_delay = max(self._retry_base_delay, retry_max_delay)
|
|
232
254
|
|
|
233
255
|
# ------------------------------------------------------------------
|
|
234
256
|
# Construction
|
|
@@ -268,6 +290,13 @@ class EmbedClient:
|
|
|
268
290
|
autodetect = os.environ.get(f"{prefix}EMBED_AUTODETECT", "true").lower() == "true"
|
|
269
291
|
timeout = float(os.environ.get(f"{prefix}EMBED_TIMEOUT", "120"))
|
|
270
292
|
max_batch = int(os.environ.get(f"{prefix}EMBED_MAX_BATCH", "5"))
|
|
293
|
+
max_retries = int(os.environ.get(f"{prefix}EMBED_MAX_RETRIES", "3"))
|
|
294
|
+
retry_base_delay = float(
|
|
295
|
+
os.environ.get(f"{prefix}EMBED_RETRY_BASE_DELAY", "0.1")
|
|
296
|
+
)
|
|
297
|
+
retry_max_delay = float(
|
|
298
|
+
os.environ.get(f"{prefix}EMBED_RETRY_MAX_DELAY", "1.0")
|
|
299
|
+
)
|
|
271
300
|
|
|
272
301
|
provider = resolve_provider(provider_name, env_prefix=prefix)
|
|
273
302
|
return cls(
|
|
@@ -279,6 +308,9 @@ class EmbedClient:
|
|
|
279
308
|
timeout=timeout,
|
|
280
309
|
env_prefix=prefix,
|
|
281
310
|
max_batch=max_batch,
|
|
311
|
+
max_retries=max_retries,
|
|
312
|
+
retry_base_delay=retry_base_delay,
|
|
313
|
+
retry_max_delay=retry_max_delay,
|
|
282
314
|
)
|
|
283
315
|
|
|
284
316
|
# ------------------------------------------------------------------
|
|
@@ -369,41 +401,103 @@ class EmbedClient:
|
|
|
369
401
|
# Request paths
|
|
370
402
|
# ------------------------------------------------------------------
|
|
371
403
|
|
|
404
|
+
# Status codes that indicate transient gateway capacity issues
|
|
405
|
+
# (rate-limit, upstream saturation, transient unavailability,
|
|
406
|
+
# upstream timeout). 401 + other 4xx + non-listed 5xx fail fast —
|
|
407
|
+
# they typically indicate caller or config problems where retrying
|
|
408
|
+
# won't help.
|
|
409
|
+
_RETRYABLE_STATUS = frozenset({429, 502, 503, 504})
|
|
410
|
+
|
|
411
|
+
def _backoff_delay(self, attempt: int) -> float:
|
|
412
|
+
"""Exponential backoff with full jitter.
|
|
413
|
+
|
|
414
|
+
Full jitter (random.uniform(0, cap)) is preferred over equal
|
|
415
|
+
jitter for the embed gateway case: many concurrent chunks all
|
|
416
|
+
503 at the same instant, and full jitter maximally spreads
|
|
417
|
+
their retries so the gateway recovers slot-by-slot instead of
|
|
418
|
+
seeing periodic thundering herds.
|
|
419
|
+
"""
|
|
420
|
+
import random
|
|
421
|
+
cap = min(self._retry_base_delay * (2 ** attempt), self._retry_max_delay)
|
|
422
|
+
return random.uniform(0, cap)
|
|
423
|
+
|
|
372
424
|
def _post_with_autodetect(self, texts: list[str], *, async_mode: bool) -> list[list[float]]:
|
|
373
425
|
del async_mode # kept for symmetry; sync path is its own method
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
|
|
426
|
+
import time as _time
|
|
427
|
+
last_exc: EmbedHTTPError | None = None
|
|
428
|
+
for attempt in range(self._max_retries + 1):
|
|
429
|
+
body = self._provider.body_builder(texts, self._model)
|
|
430
|
+
headers = self._headers(self._provider)
|
|
431
|
+
try:
|
|
432
|
+
r = httpx.post(
|
|
433
|
+
self._url, json=body, headers=headers, timeout=self._timeout
|
|
434
|
+
)
|
|
435
|
+
except httpx.HTTPError as exc:
|
|
436
|
+
# Network-level error (DNS, connect refused, timeout).
|
|
437
|
+
# Treat as retryable — transient network blips are
|
|
438
|
+
# exactly what jittered retry is designed to absorb.
|
|
439
|
+
last_exc = EmbedHTTPError(0, str(exc))
|
|
440
|
+
if attempt >= self._max_retries:
|
|
441
|
+
raise last_exc from exc
|
|
442
|
+
_time.sleep(self._backoff_delay(attempt))
|
|
443
|
+
continue
|
|
444
|
+
|
|
445
|
+
if r.status_code == 401 and self._autodetect and not self._detected:
|
|
446
|
+
# Autodetect runs at most once (gated by self._detected)
|
|
447
|
+
# and tries other providers in sequence; no retry layer
|
|
448
|
+
# needed on top.
|
|
449
|
+
return self._autodetect_and_retry(texts, last_body=r.text)
|
|
450
|
+
if r.status_code == 401:
|
|
451
|
+
raise EmbedAuthError(r.text)
|
|
452
|
+
if not r.is_success:
|
|
453
|
+
if (
|
|
454
|
+
r.status_code in self._RETRYABLE_STATUS
|
|
455
|
+
and attempt < self._max_retries
|
|
456
|
+
):
|
|
457
|
+
last_exc = EmbedHTTPError(r.status_code, r.text)
|
|
458
|
+
_time.sleep(self._backoff_delay(attempt))
|
|
459
|
+
continue
|
|
460
|
+
raise EmbedHTTPError(r.status_code, r.text)
|
|
461
|
+
return self._provider.response_parser(r.json())
|
|
462
|
+
|
|
463
|
+
# Loop exited without success or raise — shouldn't happen, but
|
|
464
|
+
# keep the type checker happy.
|
|
465
|
+
assert last_exc is not None
|
|
466
|
+
raise last_exc
|
|
389
467
|
|
|
390
468
|
async def _post_with_autodetect_async(self, texts: list[str]) -> list[list[float]]:
|
|
391
|
-
|
|
392
|
-
|
|
393
|
-
|
|
394
|
-
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
|
|
469
|
+
import asyncio as _asyncio
|
|
470
|
+
last_exc: EmbedHTTPError | None = None
|
|
471
|
+
for attempt in range(self._max_retries + 1):
|
|
472
|
+
body = self._provider.body_builder(texts, self._model)
|
|
473
|
+
headers = self._headers(self._provider)
|
|
474
|
+
try:
|
|
475
|
+
async with httpx.AsyncClient(timeout=self._timeout) as client:
|
|
476
|
+
r = await client.post(self._url, json=body, headers=headers)
|
|
477
|
+
except httpx.HTTPError as exc:
|
|
478
|
+
last_exc = EmbedHTTPError(0, str(exc))
|
|
479
|
+
if attempt >= self._max_retries:
|
|
480
|
+
raise last_exc from exc
|
|
481
|
+
await _asyncio.sleep(self._backoff_delay(attempt))
|
|
482
|
+
continue
|
|
483
|
+
|
|
484
|
+
if r.status_code == 401 and self._autodetect and not self._detected:
|
|
485
|
+
return await self._autodetect_and_retry_async(texts, last_body=r.text)
|
|
486
|
+
if r.status_code == 401:
|
|
487
|
+
raise EmbedAuthError(r.text)
|
|
488
|
+
if not r.is_success:
|
|
489
|
+
if (
|
|
490
|
+
r.status_code in self._RETRYABLE_STATUS
|
|
491
|
+
and attempt < self._max_retries
|
|
492
|
+
):
|
|
493
|
+
last_exc = EmbedHTTPError(r.status_code, r.text)
|
|
494
|
+
await _asyncio.sleep(self._backoff_delay(attempt))
|
|
495
|
+
continue
|
|
496
|
+
raise EmbedHTTPError(r.status_code, r.text)
|
|
497
|
+
return self._provider.response_parser(r.json())
|
|
498
|
+
|
|
499
|
+
assert last_exc is not None
|
|
500
|
+
raise last_exc
|
|
407
501
|
|
|
408
502
|
# ------------------------------------------------------------------
|
|
409
503
|
# Auto-detect
|
|
@@ -9,9 +9,16 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
|
|
|
9
9
|
|
|
10
10
|
# Reranker = sentence-transformers MiniLM cross-encoder.
|
|
11
11
|
# Torch CPU wheels are fine — reranker is small enough to be CPU-bound.
|
|
12
|
+
#
|
|
13
|
+
# sqlite-vec 0.1.9: native KNN over packed-f32 vectors stored in a vec0
|
|
14
|
+
# virtual table. Replaces the legacy hand-rolled Python cosine loop over
|
|
15
|
+
# JSON-serialised embeddings in search_qmd_informed (~15s timeout at 450k
|
|
16
|
+
# rows → ~50ms native MATCH). Pin to 0.1.9 — that's the version probed
|
|
17
|
+
# against L4 QMD's wire format (struct.pack f32 + cosine distance_metric).
|
|
12
18
|
RUN pip install --no-cache-dir \
|
|
13
19
|
fastapi "uvicorn[standard]" httpx requests pydantic \
|
|
14
20
|
neo4j \
|
|
21
|
+
sqlite-vec==0.1.9 \
|
|
15
22
|
"sentence-transformers" \
|
|
16
23
|
"torch" --extra-index-url https://download.pytorch.org/whl/cpu
|
|
17
24
|
|