@pentatonic-ai/ai-agent-sdk 0.9.1 → 0.9.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -906,7 +906,7 @@ function fireAndForgetEmit(clientConfig, sessionOpts, messages, result, model) {
  }

  // src/telemetry.js
- var VERSION = "0.9.1";
+ var VERSION = "0.9.3";
  var TELEMETRY_URL = "https://sdk-telemetry.philip-134.workers.dev";
  function machineId() {
  const raw = typeof process !== "undefined" ? `${process.env?.USER || process.env?.USERNAME || "u"}:${process.platform || "x"}:${process.arch || "x"}` : "browser";
package/dist/index.js CHANGED
@@ -875,7 +875,7 @@ function fireAndForgetEmit(clientConfig, sessionOpts, messages, result, model) {
  }

  // src/telemetry.js
- var VERSION = "0.9.1";
+ var VERSION = "0.9.3";
  var TELEMETRY_URL = "https://sdk-telemetry.philip-134.workers.dev";
  function machineId() {
  const raw = typeof process !== "undefined" ? `${process.env?.USER || process.env?.USERNAME || "u"}:${process.platform || "x"}:${process.arch || "x"}` : "browser";
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
  "name": "@pentatonic-ai/ai-agent-sdk",
- "version": "0.9.1",
+ "version": "0.9.3",
  "description": "TES SDK — LLM observability and lifecycle tracking via Pentatonic Thing Event System. Track token usage, tool calls, and conversations. Manage things through event-sourced lifecycle stages with AI enrichment and vector search.",
  "type": "module",
  "main": "./dist/index.cjs",
@@ -2,6 +2,7 @@ import { describe, it, expect, beforeEach, jest } from "@jest/globals";
  import {
  fetchEngine,
  engineStore,
+ engineStoreBatch,
  engineSearch,
  engineAggregate,
  enginePeopleList,
@@ -170,6 +171,129 @@ describe("engine HTTP client", () => {
  });
  });

+ describe("engineStoreBatch", () => {
+ it("posts to /store-batch with arena fixed per-record", async () => {
+ mockOk({ status: "ok", inserted: 2, ids: ["a1", "a2"] });
+ await engineStoreBatch("https://e", {
+ clientId: "acme",
+ records: [
+ { content: "first", metadata: { kind: "note" } },
+ { content: "second", metadata: { kind: "doc" } },
+ ],
+ layerType: "episodic",
+ actorUserId: "u-1",
+ });
+ const body = JSON.parse(calls[0].init.body);
+ expect(calls[0].url).toBe("https://e/store-batch");
+ expect(body).toEqual({
+ records: [
+ {
+ content: "first",
+ metadata: {
+ kind: "note",
+ arena: "acme",
+ layer_type: "episodic",
+ actor_user_id: "u-1",
+ },
+ },
+ {
+ content: "second",
+ metadata: {
+ kind: "doc",
+ arena: "acme",
+ layer_type: "episodic",
+ actor_user_id: "u-1",
+ },
+ },
+ ],
+ });
+ });
+
+ it("user-scoped arena applies to every record when userId provided", async () => {
+ mockOk({ status: "ok", inserted: 3, ids: ["a", "b", "c"] });
+ await engineStoreBatch("https://e", {
+ clientId: "acme",
+ userId: "user-42",
+ records: [
+ { content: "a" },
+ { content: "b" },
+ { content: "c" },
+ ],
+ });
+ const body = JSON.parse(calls[0].init.body);
+ for (const r of body.records) {
+ expect(r.metadata.arena).toBe("acme:user-42");
+ }
+ });
+
+ it("passes explicit per-record id through unchanged", async () => {
+ mockOk({ status: "ok", inserted: 1, ids: ["custom-id"] });
+ await engineStoreBatch("https://e", {
+ clientId: "acme",
+ records: [{ id: "custom-id", content: "x" }],
+ });
+ const body = JSON.parse(calls[0].init.body);
+ expect(body.records[0].id).toBe("custom-id");
+ expect(body.records[0].content).toBe("x");
+ });
+
+ it("omits id when not provided so engine hashes the content", async () => {
+ mockOk({ status: "ok", inserted: 1, ids: ["server-hashed"] });
+ await engineStoreBatch("https://e", {
+ clientId: "acme",
+ records: [{ content: "x" }],
+ });
+ const body = JSON.parse(calls[0].init.body);
+ expect(body.records[0]).not.toHaveProperty("id");
+ });
+
+ it("does NOT let caller override arena via per-record metadata", async () => {
+ mockOk({ status: "ok", inserted: 1, ids: ["x"] });
+ await engineStoreBatch("https://e", {
+ clientId: "acme",
+ records: [
+ { content: "x", metadata: { arena: "tenant-b" } },
+ ],
+ });
+ const body = JSON.parse(calls[0].init.body);
+ expect(body.records[0].metadata.arena).toBe("acme");
+ });
+
+ it("returns early without HTTP call when records is empty", async () => {
+ mockOk({ status: "ok", inserted: 0, ids: [] });
+ const out = await engineStoreBatch("https://e", {
+ clientId: "acme",
+ records: [],
+ });
+ expect(out).toEqual({ status: "ok", inserted: 0, ids: [] });
+ expect(calls.length).toBe(0);
+ });
+
+ it("rejects missing clientId", async () => {
+ await expect(
+ engineStoreBatch("https://e", { records: [{ content: "x" }] })
+ ).rejects.toThrow(/clientId/);
+ });
+
+ it("rejects non-array records", async () => {
+ await expect(
+ engineStoreBatch("https://e", { clientId: "a" })
+ ).rejects.toThrow(/records/);
+ await expect(
+ engineStoreBatch("https://e", { clientId: "a", records: "oops" })
+ ).rejects.toThrow(/records/);
+ });
+
+ it("rejects non-string content with the offending index", async () => {
+ await expect(
+ engineStoreBatch("https://e", {
+ clientId: "a",
+ records: [{ content: "ok" }, { content: 123 }],
+ })
+ ).rejects.toThrow(/records\[1\]\.content/);
+ });
+ });
+
  describe("engineSearch", () => {
  it("tenant-only arenas list when no userId", async () => {
  mockOk({ results: [] });
@@ -206,6 +206,75 @@ export async function engineStore(engineUrl, opts) {
  return fetchEngine(engineUrl, "/store", body, { headers });
  }

+ /**
+ * Batch-store many records in one engine call.
+ *
+ * One HTTP round-trip → engine issues one batched embed call covering
+ * every record (vs N round-trips + N single-text embeds for `engineStore`).
+ * Per `/store-batch` semantics, the response includes per-record ids
+ * plus the engine block with per-layer write counts.
+ *
+ * Arena composition matches `engineStore`: tenant-wide
+ * (`arena = clientId`) by default, user-scoped (`arena = clientId:userId`)
+ * when `userId` is supplied, overridable via `scope`. The shared arena
+ * is applied to **every** record; per-record `metadata` is preserved
+ * but cannot override the SDK-fixed arena (last-spread-wins ordering).
+ *
+ * Each record may carry an explicit `id` (stable dedup key) or omit
+ * it and let the engine hash the content. Per-record `metadata` is
+ * merged with the shared layerType / actorUserId before posting.
+ *
+ * @param {string} engineUrl
+ * @param {object} opts
+ * @param {string} opts.clientId
+ * @param {string} [opts.userId]
+ * @param {"tenant"|"user"} [opts.scope]
+ * @param {Array<{content: string, metadata?: object, id?: string}>} opts.records
+ * @param {string} [opts.layerType]
+ * @param {string} [opts.actorUserId]
+ * @param {Record<string,string>} [opts.headers]
+ * @returns {Promise<{status: string, inserted: number, ids: string[], engine?: object, duration_ms?: number}>}
+ */
+ export async function engineStoreBatch(engineUrl, opts) {
+ const {
+ clientId,
+ userId,
+ scope,
+ records,
+ layerType,
+ actorUserId,
+ headers,
+ } = opts || {};
+ if (!clientId) throw new Error("engineStoreBatch: clientId required");
+ if (!Array.isArray(records)) {
+ throw new Error("engineStoreBatch: records[] required");
+ }
+ // Short-circuit empty input — no network round-trip, no error.
+ // Matches engineStore's tolerance for trivial inputs at upstream callers.
+ if (records.length === 0) {
+ return { status: "ok", inserted: 0, ids: [] };
+ }
+ const arena = composeArena(clientId, userId, scope);
+ const body = {
+ records: records.map((r, i) => {
+ if (typeof r?.content !== "string") {
+ throw new Error(`engineStoreBatch: records[${i}].content must be a string`);
+ }
+ return {
+ ...(r.id ? { id: r.id } : {}),
+ content: r.content,
+ metadata: {
+ ...(r.metadata || {}),
+ arena,
+ ...(layerType ? { layer_type: layerType } : {}),
+ ...(actorUserId !== undefined ? { actor_user_id: actorUserId } : {}),
+ },
+ };
+ }),
+ };
+ return fetchEngine(engineUrl, "/store-batch", body, { headers });
+ }
+
  /**
  * Search the engine, scoped to a tenant.
  *
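For orientation, here is a minimal sketch of the wire format this new function produces, assuming a compat shim listening locally. The endpoint path and body shape come from the diff above; `ENGINE_URL`, the record values, and `compose_arena` (a Python stand-in for the SDK's `composeArena`, as described in the JSDoc) are invented for the example.

```python
# Hypothetical illustration of the /store-batch request engineStoreBatch composes.
import httpx

ENGINE_URL = "http://localhost:8099"  # assumption: a local compat shim

def compose_arena(client_id: str, user_id: str | None = None, scope: str | None = None) -> str:
    # Stand-in for the SDK's composeArena per the JSDoc: tenant-wide by
    # default, clientId:userId when a userId is supplied (unless scope
    # pins it back to "tenant").
    if user_id and scope != "tenant":
        return f"{client_id}:{user_id}"
    return client_id

arena = compose_arena("acme", "user-42")  # -> "acme:user-42"
body = {
    "records": [
        {
            # `id` omitted -> the engine hashes the content for a stable key
            "content": "first",
            # shared arena/layer fields are merged LAST, so a caller-supplied
            # metadata.arena can never override the SDK-fixed arena
            "metadata": {"kind": "note", "arena": arena, "layer_type": "episodic"},
        }
    ]
}
resp = httpx.post(f"{ENGINE_URL}/store-batch", json=body, timeout=30.0)
print(resp.json())  # e.g. {"status": "ok", "inserted": 1, "ids": [...]}
```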
@@ -4,7 +4,18 @@ WORKDIR /app

  RUN pip install --no-cache-dir fastapi uvicorn[standard] httpx pydantic

- COPY server.py /app/server.py
+ # Build context is the memory-engine root (see docker-compose.yml). The
+ # shim's server.py side-loads engine/services/_shared/embed_provider.py
+ # for shared-embed mode on /store-batch (one embed call across all 4
+ # layer indexers vs 4 redundant calls).
+ COPY compat/server.py /app/server.py
+ # server.py's sys.path.insert resolves "../engine/services" relative to
+ # its own location (/app/server.py → /engine/services). Mirror that
+ # layout so the import works without changing the runtime code.
+ COPY engine/services/_shared /engine/services/_shared
+ # Make `_shared` an importable package (mirror the layer services'
+ # layout where __init__.py exists or python detects PEP 420 namespace).
+ RUN touch /engine/services/__init__.py

  EXPOSE 8099

@@ -34,6 +34,7 @@ Environment:

  import hashlib
  import os
+ import sys
  import time
  from datetime import datetime, timezone
  from typing import Any, Optional
@@ -42,6 +43,17 @@ import httpx
  from fastapi import FastAPI, HTTPException
  from pydantic import BaseModel, Field

+ # Reach into the engine/services tree so we can reuse EmbedClient. The
+ # tree isn't a real installed package; layer services and the compat
+ # shim both side-load it the same way. Keeps the chunking + auto-detect
+ # behaviour identical between the shim's pre-embed and the per-layer
+ # embeds that previously did the same work N times.
+ sys.path.insert(
+ 0,
+ os.path.join(os.path.dirname(__file__), "..", "engine", "services"),
+ )
+ from _shared.embed_provider import EmbedClient # noqa: E402
+
  # ----------------------------------------------------------------------
  # Config
  # ----------------------------------------------------------------------
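The Dockerfile layout above exists to satisfy this relative-path computation. A quick illustrative check of the arithmetic, using the paths named in the diff:

```python
import os

# os.path.dirname("/app/server.py") == "/app", so the side-load target is:
shared = os.path.normpath(
    os.path.join(os.path.dirname("/app/server.py"), "..", "engine", "services")
)
print(shared)  # /engine/services, hence the Dockerfile's
               # COPY engine/services/_shared /engine/services/_shared
```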
@@ -63,6 +75,30 @@ NEO4J_DB = os.environ.get("NEO4J_DB", "neo4j")

  PORT = int(os.environ.get("PORT", "8099"))

+ # Shared-embed mode. When on, /store-batch computes embeddings once at
+ # the shim level and forwards them to each layer's /index-batch so the
+ # layer skips its own embed call. Cuts gateway RPC count by ~4× (L4 +
+ # L5 + L6 + L2-internal all did the same embed work independently).
+ # Default ON because all layers in this engine use the same NV-Embed
+ # model; disable if you ever wire up per-layer differentiated embedders
+ # (e.g. cohere on L5, openai on L4).
+ SHARE_EMBEDDINGS = os.environ.get("PME_SHARE_EMBEDDINGS", "true").lower() == "true"
+
+ _embed_client: EmbedClient | None = None
+
+
+ def _get_embed_client() -> EmbedClient:
+ """Lazy-init the shim's EmbedClient using PME_-prefixed env vars
+ (matches L2's pattern). Cached for the process lifetime so the
+ auto-detect handshake only happens once."""
+ global _embed_client
+ if _embed_client is None:
+ _embed_client = EmbedClient.from_env(
+ prefix="PME_",
+ default_url=NV_EMBED_URL,
+ )
+ return _embed_client
+

  # Layer types we surface as the SDK 4-layer projection. Engine stores
  # everything as chunks tagged with arena + layer_type metadata; this
@@ -252,12 +288,23 @@ async def _embed_batch(texts: list[str]) -> list[list[float]]:
  return [d["embedding"] for d in resp.json()["data"]]


- async def _index_l4(records: list[dict[str, Any]]) -> int:
- """Index records into the L4 sqlite-vec layer."""
- payload = {"records": [
+ async def _index_l4(
+ records: list[dict[str, Any]],
+ embeddings: list[list[float]] | None = None,
+ ) -> int:
+ """Index records into the L4 sqlite-vec layer.
+
+ When `embeddings` is supplied (parallel to records), L4's /index-batch
+ skips its own embed call and uses ours — eliminates the redundant
+ embed work that previously cost ~850ms per drain alarm. When None,
+ L4 embeds itself (backwards-compatible path for older callers / tests
+ that don't share embeddings)."""
+ payload: dict[str, Any] = {"records": [
  {"id": r.get("id") or hashlib.sha1(r["content"].encode()).hexdigest()[:32],
  "text": r["content"]} for r in records
  ]}
+ if embeddings is not None:
+ payload["embeddings"] = embeddings
  try:
  resp = await _client().post(f"{L4_VEC_URL}/index-batch", json=payload, timeout=120.0)
  resp.raise_for_status()
@@ -267,13 +314,20 @@ async def _index_l4(records: list[dict[str, Any]]) -> int:
  return 0


- async def _index_l5(records: list[dict[str, Any]], arena: str = "general") -> int:
+ async def _index_l5(
+ records: list[dict[str, Any]],
+ arena: str = "general",
+ embeddings: list[list[float]] | None = None,
+ ) -> int:
  """Index records into the L5 Milvus comms layer (chats collection).

  arena is forwarded as a Milvus dynamic field so /search can filter
  by arena natively (vs the shim's defence-in-depth post-filter).
+
+ When `embeddings` is supplied (parallel to records), L5 skips its
+ own embed call — see _index_l4 docstring for the dedup story.
  """
- payload = {
+ payload: dict[str, Any] = {
  "collection": "chats",
  "records": [
  {
@@ -287,6 +341,8 @@ async def _index_l5(records: list[dict[str, Any]], arena: str = "general") -> in
  for r in records
  ],
  }
+ if embeddings is not None:
+ payload["embeddings"] = embeddings
  try:
  resp = await _client().post(f"{L5_MILVUS_URL}/index-batch", json=payload, timeout=60.0)
  resp.raise_for_status()
@@ -299,9 +355,17 @@ async def _index_l5(records: list[dict[str, Any]], arena: str = "general") -> in
  return 0


- async def _index_l6(records: list[dict[str, Any]], arena: str = "general") -> int:
- """Index records into the L6 document store."""
- payload = {
+ async def _index_l6(
+ records: list[dict[str, Any]],
+ arena: str = "general",
+ embeddings: list[list[float]] | None = None,
+ ) -> int:
+ """Index records into the L6 document store.
+
+ When `embeddings` is supplied (parallel to records), L6 skips its
+ own embed call — see _index_l4 docstring for the dedup story.
+ """
+ payload: dict[str, Any] = {
  "arena": arena,
  "records": [
  {
@@ -314,6 +378,8 @@ async def _index_l6(records: list[dict[str, Any]], arena: str = "general") -> in
  for r in records
  ],
  }
+ if embeddings is not None:
+ payload["embeddings"] = embeddings
  try:
  resp = await _client().post(f"{L6_DOC_URL}/index-batch", json=payload, timeout=120.0)
  resp.raise_for_status()
@@ -323,14 +389,22 @@ async def _index_l6(records: list[dict[str, Any]], arena: str = "general") -> in
  return 0


- async def _index_l2_internal(records: list[dict[str, Any]], arena: str = "general") -> dict:
+ async def _index_l2_internal(
+ records: list[dict[str, Any]],
+ arena: str = "general",
+ embeddings: list[list[float]] | None = None,
+ ) -> dict:
  """Populate L2's internal stores: L0 BM25 + L4 QMD vec + L3 Neo4j KG.

  Without this, L2's RRF fusion runs over empty L0/L4-qmd/L3 layers and
  those zero-result rank lists pollute the score. The L2 proxy exposes
  /index-internal-batch which writes to all three in one round-trip.
+
+ When `embeddings` is supplied (parallel to records), L2's internal
+ embed call (used for L4-QMD population) is skipped — see _index_l4
+ docstring for the dedup story.
  """
- payload = {
+ payload: dict[str, Any] = {
  "arena": arena,
  "records": [
  {
@@ -341,6 +415,8 @@ async def _index_l2_internal(records: list[dict[str, Any]], arena: str = "genera
  for r in records
  ],
  }
+ if embeddings is not None:
+ payload["embeddings"] = embeddings
  try:
  resp = await _client().post(f"{L2_PROXY_URL}/index-internal-batch",
  json=payload, timeout=180.0)
@@ -646,11 +722,37 @@ async def store_batch(req: StoreBatchRequest):

  t0 = time.perf_counter()
  import asyncio
+
+ # Shared-embed mode: compute embeddings ONCE here, pass them down to
+ # every layer so they skip their own embed call. Previously L4 + L5
+ # + L6 + L2-internal each re-embedded the same texts in parallel,
+ # which fanned 4× the gateway RPCs. The gateway throttles at K≈10
+ # concurrent requests, so 40-way fan-out serialised into ~4 rounds
+ # of ~850ms each = ~3.5s of pure embed time per /store-batch. With
+ # shared embeddings we issue one chunked embed pass (10 sub-calls
+ # for N=50 records) and skip the per-layer redundant work entirely.
+ # Disabled via PME_SHARE_EMBEDDINGS=false for operators wiring up
+ # per-layer differentiated embedders.
+ shared_embeddings: list[list[float]] | None = None
+ embed_ms = 0.0
+ if SHARE_EMBEDDINGS and normalised:
+ texts = [r["content"] for r in normalised]
+ embed_t0 = time.perf_counter()
+ try:
+ shared_embeddings = await _get_embed_client().embed_batch_async(texts)
+ except Exception as exc:
+ # Fall back to per-layer embedding rather than failing the
+ # whole batch. The layers' /index-batch still works when
+ # `embeddings` is absent.
+ print(f"[shim] shared embed failed, falling back to per-layer: {exc}")
+ shared_embeddings = None
+ embed_ms = (time.perf_counter() - embed_t0) * 1000.0
+
  l4_count, l5_count, l6_count, l2_internal = await asyncio.gather(
- _index_l4(normalised),
- _index_l5(normalised, arena=req.arena or "general"),
- _index_l6(normalised, arena=req.arena or "general"),
- _index_l2_internal(normalised, arena=req.arena or "general"),
+ _index_l4(normalised, embeddings=shared_embeddings),
+ _index_l5(normalised, arena=req.arena or "general", embeddings=shared_embeddings),
+ _index_l6(normalised, arena=req.arena or "general", embeddings=shared_embeddings),
+ _index_l2_internal(normalised, arena=req.arena or "general", embeddings=shared_embeddings),
  )
  dur_ms = (time.perf_counter() - t0) * 1000.0
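A back-of-envelope check of the numbers in that comment (illustrative only; the 4 layers, K≈10 concurrency cap, ~850 ms round time, and 10 embed sub-calls per 50-record pass are all taken from the comment, not measured here):

```python
import math

layers = 4             # L4 + L5 + L6 + L2-internal, each embedding independently
calls_per_layer = 10   # "10 sub-calls for N=50 records", per the comment
K = 10                 # gateway concurrency cap, per the comment
round_ms = 850.0       # per embed round, per the comment

fan_out = layers * calls_per_layer              # 40 concurrent embed RPCs
rounds = math.ceil(fan_out / K)                 # serialised into 4 rounds
print(rounds * round_ms)                        # 3400.0 ms, i.e. ~3.5 s

shared_rounds = math.ceil(calls_per_layer / K)  # shared mode: 1 round
print(shared_rounds * round_ms)                 # 850.0 ms
```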
@@ -212,8 +212,11 @@ services:
  compat:
  <<: *engine-base
  build:
- context: ./compat
- dockerfile: Dockerfile
+ # Build context is the memory-engine root so the Dockerfile can
+ # COPY both compat/server.py and engine/services/_shared (shared
+ # EmbedClient for /store-batch dedup).
+ context: .
+ dockerfile: compat/Dockerfile
  container_name: pme-compat
  ports:
  - "127.0.0.1:${PME_PORT:-8099}:8099"
@@ -225,6 +228,17 @@ services:
  L5_MILVUS_URL: http://l5:8034
  L6_DOC_URL: http://l6:8037
  NV_EMBED_URL: ${NV_EMBED_URL:-http://host.docker.internal:8041/v1/embeddings}
+ # PME_ prefix vars feed the shim's EmbedClient for shared-embed
+ # mode on /store-batch (one embed call across all 4 indexers vs
+ # 4 redundant calls). Match the L2 config block so both clients
+ # hit the same gateway with the same model. Set
+ # PME_SHARE_EMBEDDINGS=false to revert to per-layer embedding.
+ PME_NV_EMBED_URL: ${NV_EMBED_URL:-http://host.docker.internal:8041/v1/embeddings}
+ PME_EMBED_API_KEY: ${EMBED_API_KEY:-}
+ PME_EMBED_PROVIDER: ${EMBED_PROVIDER:-openai}
+ PME_EMBED_AUTODETECT: ${EMBED_AUTODETECT:-true}
+ PME_NV_EMBED_MODEL: ${EMBED_MODEL_NAME:-nv-embed-v2}
+ PME_SHARE_EMBEDDINGS: ${PME_SHARE_EMBEDDINGS:-true}
  BYPASS_L2_PROXY: ${BYPASS_L2_PROXY:-0}
  extra_hosts:
  - "host.docker.internal:host-gateway"
@@ -1496,6 +1496,12 @@ async def _embed_batch_local(texts: List[str]) -> List[List[float]]:
  class IndexInternalBatchRequest(BaseModel):
  records: List[Dict[str, Any]] # [{"id": str, "content": str, "metadata": dict}, ...]
  arena: Optional[str] = "general"
+ # When supplied (parallel to `records`), skip the L4-QMD embed call
+ # and use these vectors directly. Compat shim populates this when
+ # shared-embed mode is on so we don't duplicate embed work across
+ # layers. Length must match records — defensive bail-out below if
+ # it doesn't.
+ embeddings: Optional[List[List[float]]] = None


  @app.post("/index-internal-batch")
@@ -1575,7 +1581,19 @@ async def index_internal_batch(req: IndexInternalBatchRequest) -> dict:
  # ---- L4 QMD vec (qmd.sqlite) ----------------------------------------
  l4_inserted = 0
  try:
- embeddings = await _embed_batch_local([n["content"] for n in norm])
+ # Shared-embed shortcut: if the compat shim handed us pre-computed
+ # vectors that line up with our normalised records, use them and
+ # skip our own embed RPC. Fall back to per-layer embedding when
+ # the vectors are absent or the lengths don't match (defensive).
+ shared_embs = req.embeddings
+ if (
+ shared_embs is not None
+ and len(shared_embs) == len(records)
+ and len(records) == len(norm)
+ ):
+ embeddings = shared_embs
+ else:
+ embeddings = await _embed_batch_local([n["content"] for n in norm])
  if len(embeddings) != len(norm):
  log.warning(f"L4 embed count mismatch: {len(embeddings)} != {len(norm)}")
  qmd_db = Path(QMD_DB_PATH)
@@ -2362,22 +2380,20 @@ async def people_list_internal(
  email_filter_clause = " AND s.person_email IN $emails"
  params["emails"] = emails_filter

- search_clause = ""
  if search_pattern:
- # Match against person_email; person_name is resolved
- # via the OPTIONAL MATCH on Person below, so we can't
- # apply it inside the initial WHERE without joining first.
- # Two-step: filter on email here, then re-filter after
- # joining the Person. (Works at our scale; revisit if
- # we ever need search to scale to 10k+ people.)
- search_clause = " AND toLower(s.person_email) CONTAINS $search"
+ # Defer the whole search filter until after the Person
+ # join: person_name only exists after the OPTIONAL
+ # MATCH below. Pre-filtering ChannelStats on email
+ # alone (the earlier two-step approach) silently dropped
+ # name-only matches, e.g. email=ag@x.io / name="Alex
+ # Tong" with search="alex": the early WHERE failed and
+ # the post-join filter never saw the row.
  params["search"] = search_pattern

  cypher = (
  "MATCH (s:ChannelStat)\n"
  "WHERE s.arena IN $arenas"
  + email_filter_clause
- + search_clause
  + "\n"
  "WITH s.person_email AS person_email,\n"
  " collect({channel: s.channel, count: s.count,\n"
@@ -2393,11 +2409,17 @@ async def people_list_internal(
  "WITH person_email,\n"
  " channels,\n"
  " head(collect(DISTINCT p.name)) AS person_name\n"
- # Apply the name-side of the search filter now that we
- # have the joined name.
+ # Apply the search filter now that we have both the
+ # joined name and the email available. `coalesce(name,
+ # '')` keeps people without a Person node in the result
+ # set when their email matches — they fall through the
+ # name probe cleanly instead of bypassing the filter
+ # via a `person_name IS NULL` short-circuit, which was
+ # the prior bug (anyone without a Person node passed
+ # search regardless of term).
  + (
- "WHERE ($search IS NULL OR person_name IS NULL OR toLower(person_name) CONTAINS $search\n"
- " OR toLower(person_email) CONTAINS $search)\n"
+ "WHERE toLower(coalesce(person_name, '')) CONTAINS $search\n"
+ " OR toLower(person_email) CONTAINS $search\n"
  if search_pattern
  else ""
  )
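The semantics of the fixed WHERE clause can be seen in a small Python stand-in (illustrative only; the sample names and emails mirror the regression tests further down):

```python
# Python stand-in for the fixed Cypher filter semantics.
def matches(search: str, person_name: str | None, person_email: str) -> bool:
    s = search.lower()
    # coalesce(person_name, '') in the Cypher: a missing name fails the
    # name probe cleanly instead of short-circuiting the filter to true
    name = (person_name or "").lower()
    return s in name or s in person_email.lower()

print(matches("alex", "Alex Tong", "ag@x.io"))  # True: name-only match survives
print(matches("alex", None, "orphan@x.io"))     # False: no Person node, no bypass
print(matches("alex", None, "alex@x.io"))       # True: email still matches
```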
@@ -124,6 +124,10 @@ class SearchRequest(BaseModel):

  class IndexBatchRequest(BaseModel):
  records: list[dict[str, Any]]
+ # When supplied (parallel to `records`), skip the embed call and use
+ # these vectors directly. Compat shim populates this when shared-embed
+ # mode is on so we don't duplicate the embed work across layers.
+ embeddings: list[list[float]] | None = None


  app = FastAPI(title="L4 sqlite-vec sidecar (OSS)")
@@ -187,7 +191,13 @@ async def index_batch(req: IndexBatchRequest):
  return {"status": "ok", "inserted": 0}
  texts = [(r.get("text") or r.get("content") or "")[:8192] for r in req.records]
  t0 = time.perf_counter()
- embs = await _embed_batch(texts)
+ # Shared-embed shortcut: caller (compat shim) computed vectors once
+ # and forwards them so we skip the embed RPC. Length must match
+ # records — defensive bail if it doesn't.
+ if req.embeddings is not None and len(req.embeddings) == len(req.records):
+ embs = req.embeddings
+ else:
+ embs = await _embed_batch(texts)
  embed_ms = (time.perf_counter() - t0) * 1000.0

  conn = _get_db()
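A hypothetical client-side sketch of the extended `/index-batch` contract (the sidecar URL and vector values are invented; the field names and the length-must-match fallback come from the diff):

```python
import httpx

L4_VEC_URL = "http://localhost:8040"  # assumption: a local L4 sidecar
records = [{"id": "a1", "text": "first"}, {"id": "a2", "text": "second"}]
embeddings = [[0.1] * 4, [0.2] * 4]   # parallel to records; dim is illustrative

# len(embeddings) == len(records) -> the sidecar uses these vectors and
# skips its own embed RPC; any mismatch falls back to self-embedding.
resp = httpx.post(
    f"{L4_VEC_URL}/index-batch",
    json={"records": records, "embeddings": embeddings},
    timeout=120.0,
)
print(resp.json())
```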
@@ -629,13 +629,19 @@ def serve(port=8034):
  client = get_client()
  ensure_collection(client, collection)

- # Single batched embed call.
+ # Shared-embed shortcut: caller (compat shim) computed vectors
+ # once and forwards them so we skip the embed RPC. Length must
+ # match records — fall back to per-layer embed if it doesn't.
  texts = [(r.get("text") or "")[:8192] for r in records]
+ shared_embs = req.get("embeddings")
  t0 = _time.time()
- try:
- embs = _embed_post(texts)
- except Exception as exc:
- return {"status": "error", "error": f"embed failed: {exc}"}
+ if isinstance(shared_embs, list) and len(shared_embs) == len(records):
+ embs = shared_embs
+ else:
+ try:
+ embs = _embed_post(texts)
+ except Exception as exc:
+ return {"status": "error", "error": f"embed failed: {exc}"}
  embed_ms = (_time.time() - t0) * 1000.0

  # Single batched insert. Mirror every field the chats collection
@@ -990,12 +990,18 @@ def serve(port: int = DEFAULT_PORT):

  texts = [(r.get("text") or "")[:16000] for r in records]

- # Single batched embed call (OpenAI-compat first, lambda-gateway fallback).
+ # Shared-embed shortcut: caller (compat shim) computed vectors
+ # once and forwards them so we skip the embed RPC. Length must
+ # match records — fall back to per-layer embed if it doesn't.
+ shared_embs = req.get("embeddings")
  t0 = _time.time()
- try:
- embs = _embed_post(texts)
- except Exception as exc:
- raise HTTPException(status_code=500, detail=f"embed failed: {exc}")
+ if isinstance(shared_embs, list) and len(shared_embs) == len(records):
+ embs = shared_embs
+ else:
+ try:
+ embs = _embed_post(texts)
+ except Exception as exc:
+ raise HTTPException(status_code=500, detail=f"embed failed: {exc}")
  embed_ms = (_time.time() - t0) * 1000.0

  # Single milvus insert.
@@ -301,6 +301,59 @@ def test_search_substring_matches_email_or_name(
  }


+ @_skip_no_neo4j
+ def test_search_matches_name_when_email_does_not(
+ neo4j_driver, proxy_module
+ ) -> None:
+ """Regression: an early-WHERE on ``ChannelStat`` filtered rows by
+ email-only before the Person join, so a person whose NAME matched
+ the search term but whose EMAIL didn't was silently dropped. Fixed
+ by deferring the whole search filter until after the OPTIONAL MATCH
+ on Person. Sentinel case: email ``ag@x.io`` / name ``Alex Tong`` /
+ search ``alex`` — must match on name even though email has no
+ substring overlap."""
+ driver, (arena, _, _) = neo4j_driver
+ with driver.session() as session:
+ _ensure_indexes(session)
+ _write_stat(session, arena, "ag@x.io", "email", name="Alex Tong")
+ _write_stat(session, arena, "other@x.io", "email", name="Bea Chen")
+
+ out = _call_people_list(proxy_module, arenas=[arena], search="alex")
+ assert {i.person_email for i in out.items} == {"ag@x.io"}
+
+
+ @_skip_no_neo4j
+ def test_search_does_not_bypass_filter_when_person_node_missing(
+ neo4j_driver, proxy_module
+ ) -> None:
+ """Regression: the previous WHERE clause had a ``person_name IS
+ NULL`` short-circuit that bypassed the search filter for anyone
+ without a Person node — they matched any search term. Fixed by
+ using ``coalesce(person_name, '')`` so the name probe just fails
+ cleanly when no Person record exists, falling through to the email
+ probe."""
+ driver, (arena, _, _) = neo4j_driver
+ with driver.session() as session:
+ _ensure_indexes(session)
+ # Insert a ChannelStat WITHOUT a Person node — simulates a
+ # contact who's been emailed but never had a Person record
+ # materialised. Use a raw write so _write_stat's MERGE doesn't
+ # auto-create a Person.
+ session.run(
+ "MERGE (s:ChannelStat {arena: $arena, person_email: $email, channel: 'email'}) "
+ "SET s.count = 1, s.inbound = 1, s.outbound = 0, "
+ " s.last_seen = '2026-05-10T00:00:00Z', "
+ " s.first_seen = '2026-05-10T00:00:00Z'",
+ arena=arena, email="orphan@x.io",
+ )
+ _write_stat(session, arena, "alex@x.io", "email", name="Alex Tong")
+
+ # Search "alex" must NOT match orphan@x.io — neither name (missing)
+ # nor email contains "alex".
+ out = _call_people_list(proxy_module, arenas=[arena], search="alex")
+ assert {i.person_email for i in out.items} == {"alex@x.io"}
+
+
  # ---------------------------------------------------------------------------
  # Pagination.
  # ---------------------------------------------------------------------------