@pentatonic-ai/ai-agent-sdk 0.9.2 → 0.9.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -906,7 +906,7 @@ function fireAndForgetEmit(clientConfig, sessionOpts, messages, result, model) {
906
906
  }
907
907
 
908
908
  // src/telemetry.js
909
- var VERSION = "0.9.2";
909
+ var VERSION = "0.9.3";
910
910
  var TELEMETRY_URL = "https://sdk-telemetry.philip-134.workers.dev";
911
911
  function machineId() {
912
912
  const raw = typeof process !== "undefined" ? `${process.env?.USER || process.env?.USERNAME || "u"}:${process.platform || "x"}:${process.arch || "x"}` : "browser";
package/dist/index.js CHANGED
@@ -875,7 +875,7 @@ function fireAndForgetEmit(clientConfig, sessionOpts, messages, result, model) {
875
875
  }
876
876
 
877
877
  // src/telemetry.js
878
- var VERSION = "0.9.2";
878
+ var VERSION = "0.9.3";
879
879
  var TELEMETRY_URL = "https://sdk-telemetry.philip-134.workers.dev";
880
880
  function machineId() {
881
881
  const raw = typeof process !== "undefined" ? `${process.env?.USER || process.env?.USERNAME || "u"}:${process.platform || "x"}:${process.arch || "x"}` : "browser";
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@pentatonic-ai/ai-agent-sdk",
3
- "version": "0.9.2",
3
+ "version": "0.9.3",
4
4
  "description": "TES SDK — LLM observability and lifecycle tracking via Pentatonic Thing Event System. Track token usage, tool calls, and conversations. Manage things through event-sourced lifecycle stages with AI enrichment and vector search.",
5
5
  "type": "module",
6
6
  "main": "./dist/index.cjs",
@@ -4,7 +4,18 @@ WORKDIR /app
4
4
 
5
5
  RUN pip install --no-cache-dir fastapi uvicorn[standard] httpx pydantic
6
6
 
7
- COPY server.py /app/server.py
7
+ # Build context is the memory-engine root (see docker-compose.yml). The
8
+ # shim's server.py side-loads engine/services/_shared/embed_provider.py
9
+ # for shared-embed mode on /store-batch (one embed call across all 4
10
+ # layer indexers vs 4 redundant calls).
11
+ COPY compat/server.py /app/server.py
12
+ # server.py's sys.path.insert resolves "../engine/services" relative to
13
+ # its own location (/app/server.py → /engine/services). Mirror that
14
+ # layout so the import works without changing the runtime code.
15
+ COPY engine/services/_shared /engine/services/_shared
16
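+ # Mark /engine/services as a regular package (mirror the layer services'
17
+ # layout). `_shared` itself is imported top-level through server.py's sys.path entry, so it is importable where its own __init__.py exists or Python detects a PEP 420 namespace.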
18
+ RUN touch /engine/services/__init__.py
8
19
 
9
20
  EXPOSE 8099
10
21
 
@@ -34,6 +34,7 @@ Environment:
34
34
 
35
35
  import hashlib
36
36
  import os
37
+ import sys
37
38
  import time
38
39
  from datetime import datetime, timezone
39
40
  from typing import Any, Optional
@@ -42,6 +43,17 @@ import httpx
42
43
  from fastapi import FastAPI, HTTPException
43
44
  from pydantic import BaseModel, Field
44
45
 
46
+ # Reach into the engine/services tree so we can reuse EmbedClient. The
47
+ # tree isn't a real installed package; layer services and the compat
48
+ # shim both side-load it the same way. Keeps the chunking + auto-detect
49
+ # behaviour identical between the shim's pre-embed and the per-layer
50
+ # embeds that previously did the same work N times.
51
+ sys.path.insert(
52
+ 0,
53
+ os.path.join(os.path.dirname(__file__), "..", "engine", "services"),
54
+ )
55
+ from _shared.embed_provider import EmbedClient # noqa: E402
56
+
45
57
  # ----------------------------------------------------------------------
46
58
  # Config
47
59
  # ----------------------------------------------------------------------
@@ -63,6 +75,30 @@ NEO4J_DB = os.environ.get("NEO4J_DB", "neo4j")
63
75
 
64
76
  PORT = int(os.environ.get("PORT", "8099"))
65
77
 
78
+ # Shared-embed mode. When on, /store-batch computes embeddings once at
79
+ # the shim level and forwards them to each layer's /index-batch so the
80
+ # layer skips its own embed call. Cuts gateway RPC count by ~4× (L4 +
81
+ # L5 + L6 + L2-internal all did the same embed work independently).
82
+ # Default ON because all layers in this engine use the same NV-Embed
83
+ # model; disable if you ever wire up per-layer differentiated embedders
84
+ # (e.g. cohere on L5, openai on L4).
85
+ SHARE_EMBEDDINGS = os.environ.get("PME_SHARE_EMBEDDINGS", "true").lower() == "true"
86
+
87
+ _embed_client: EmbedClient | None = None
88
+
89
+
90
+ def _get_embed_client() -> EmbedClient:
91
+ """Lazy-init the shim's EmbedClient using PME_-prefixed env vars
92
+ (matches L2's pattern). Cached for the process lifetime so the
93
+ auto-detect handshake only happens once."""
94
+ global _embed_client
95
+ if _embed_client is None:
96
+ _embed_client = EmbedClient.from_env(
97
+ prefix="PME_",
98
+ default_url=NV_EMBED_URL,
99
+ )
100
+ return _embed_client
101
+
66
102
 
67
103
  # Layer types we surface as the SDK 4-layer projection. Engine stores
68
104
  # everything as chunks tagged with arena + layer_type metadata; this
@@ -252,12 +288,23 @@ async def _embed_batch(texts: list[str]) -> list[list[float]]:
252
288
  return [d["embedding"] for d in resp.json()["data"]]
253
289
 
254
290
 
255
- async def _index_l4(records: list[dict[str, Any]]) -> int:
256
- """Index records into the L4 sqlite-vec layer."""
257
- payload = {"records": [
291
+ async def _index_l4(
292
+ records: list[dict[str, Any]],
293
+ embeddings: list[list[float]] | None = None,
294
+ ) -> int:
295
+ """Index records into the L4 sqlite-vec layer.
296
+
297
+ When `embeddings` is supplied (parallel to records), L4's /index-batch
298
+ skips its own embed call and uses ours — eliminates the redundant
299
+ embed work that previously cost ~850ms per drain alarm. When None,
300
+ L4 embeds itself (backwards-compatible path for older callers / tests
301
+ that don't share embeddings)."""
302
+ payload: dict[str, Any] = {"records": [
258
303
  {"id": r.get("id") or hashlib.sha1(r["content"].encode()).hexdigest()[:32],
259
304
  "text": r["content"]} for r in records
260
305
  ]}
306
+ if embeddings is not None:
307
+ payload["embeddings"] = embeddings
261
308
  try:
262
309
  resp = await _client().post(f"{L4_VEC_URL}/index-batch", json=payload, timeout=120.0)
263
310
  resp.raise_for_status()
@@ -267,13 +314,20 @@ async def _index_l4(records: list[dict[str, Any]]) -> int:
267
314
  return 0
268
315
 
269
316
 
270
- async def _index_l5(records: list[dict[str, Any]], arena: str = "general") -> int:
317
+ async def _index_l5(
318
+ records: list[dict[str, Any]],
319
+ arena: str = "general",
320
+ embeddings: list[list[float]] | None = None,
321
+ ) -> int:
271
322
  """Index records into the L5 Milvus comms layer (chats collection).
272
323
 
273
324
  arena is forwarded as a Milvus dynamic field so /search can filter
274
325
  by arena natively (vs the shim's defence-in-depth post-filter).
326
+
327
+ When `embeddings` is supplied (parallel to records), L5 skips its
328
+ own embed call — see _index_l4 docstring for the dedup story.
275
329
  """
276
- payload = {
330
+ payload: dict[str, Any] = {
277
331
  "collection": "chats",
278
332
  "records": [
279
333
  {
@@ -287,6 +341,8 @@ async def _index_l5(records: list[dict[str, Any]], arena: str = "general") -> in
287
341
  for r in records
288
342
  ],
289
343
  }
344
+ if embeddings is not None:
345
+ payload["embeddings"] = embeddings
290
346
  try:
291
347
  resp = await _client().post(f"{L5_MILVUS_URL}/index-batch", json=payload, timeout=60.0)
292
348
  resp.raise_for_status()
@@ -299,9 +355,17 @@ async def _index_l5(records: list[dict[str, Any]], arena: str = "general") -> in
299
355
  return 0
300
356
 
301
357
 
302
- async def _index_l6(records: list[dict[str, Any]], arena: str = "general") -> int:
303
- """Index records into the L6 document store."""
304
- payload = {
358
+ async def _index_l6(
359
+ records: list[dict[str, Any]],
360
+ arena: str = "general",
361
+ embeddings: list[list[float]] | None = None,
362
+ ) -> int:
363
+ """Index records into the L6 document store.
364
+
365
+ When `embeddings` is supplied (parallel to records), L6 skips its
366
+ own embed call — see _index_l4 docstring for the dedup story.
367
+ """
368
+ payload: dict[str, Any] = {
305
369
  "arena": arena,
306
370
  "records": [
307
371
  {
@@ -314,6 +378,8 @@ async def _index_l6(records: list[dict[str, Any]], arena: str = "general") -> in
314
378
  for r in records
315
379
  ],
316
380
  }
381
+ if embeddings is not None:
382
+ payload["embeddings"] = embeddings
317
383
  try:
318
384
  resp = await _client().post(f"{L6_DOC_URL}/index-batch", json=payload, timeout=120.0)
319
385
  resp.raise_for_status()
@@ -323,14 +389,22 @@ async def _index_l6(records: list[dict[str, Any]], arena: str = "general") -> in
323
389
  return 0
324
390
 
325
391
 
326
- async def _index_l2_internal(records: list[dict[str, Any]], arena: str = "general") -> dict:
392
+ async def _index_l2_internal(
393
+ records: list[dict[str, Any]],
394
+ arena: str = "general",
395
+ embeddings: list[list[float]] | None = None,
396
+ ) -> dict:
327
397
  """Populate L2's internal stores: L0 BM25 + L4 QMD vec + L3 Neo4j KG.
328
398
 
329
399
  Without this, L2's RRF fusion runs over empty L0/L4-qmd/L3 layers and
330
400
  those zero-result rank lists pollute the score. The L2 proxy exposes
331
401
  /index-internal-batch which writes to all three in one round-trip.
402
+
403
+ When `embeddings` is supplied (parallel to records), L2's internal
404
+ embed call (used for L4-QMD population) is skipped — see _index_l4
405
+ docstring for the dedup story.
332
406
  """
333
- payload = {
407
+ payload: dict[str, Any] = {
334
408
  "arena": arena,
335
409
  "records": [
336
410
  {
@@ -341,6 +415,8 @@ async def _index_l2_internal(records: list[dict[str, Any]], arena: str = "genera
341
415
  for r in records
342
416
  ],
343
417
  }
418
+ if embeddings is not None:
419
+ payload["embeddings"] = embeddings
344
420
  try:
345
421
  resp = await _client().post(f"{L2_PROXY_URL}/index-internal-batch",
346
422
  json=payload, timeout=180.0)
@@ -646,11 +722,37 @@ async def store_batch(req: StoreBatchRequest):
646
722
 
647
723
  t0 = time.perf_counter()
648
724
  import asyncio
725
+
726
+ # Shared-embed mode: compute embeddings ONCE here, pass them down to
727
+ # every layer so they skip their own embed call. Previously L4 + L5
728
+ # + L6 + L2-internal each re-embedded the same texts in parallel,
729
+ # which fanned 4× the gateway RPCs. The gateway throttles at K≈10
730
+ # concurrent requests, so 40-way fan-out serialised into ~4 rounds
731
+ # of ~850ms each = ~3.5s of pure embed time per /store-batch. With
732
+ # shared embeddings we issue one chunked embed pass (10 sub-calls
733
+ # for N=50 records) and skip the per-layer redundant work entirely.
734
+ # Disabled via PME_SHARE_EMBEDDINGS=false for operators wiring up
735
+ # per-layer differentiated embedders.
736
+ shared_embeddings: list[list[float]] | None = None
737
+ embed_ms = 0.0
738
+ if SHARE_EMBEDDINGS and normalised:
739
+ texts = [r["content"] for r in normalised]
740
+ embed_t0 = time.perf_counter()
741
+ try:
742
+ shared_embeddings = await _get_embed_client().embed_batch_async(texts)
743
+ except Exception as exc:
744
+ # Fall back to per-layer embedding rather than failing the
745
+ # whole batch. The layers' /index-batch still works when
746
+ # `embeddings` is absent.
747
+ print(f"[shim] shared embed failed, falling back to per-layer: {exc}")
748
+ shared_embeddings = None
749
+ embed_ms = (time.perf_counter() - embed_t0) * 1000.0
750
+
649
751
  l4_count, l5_count, l6_count, l2_internal = await asyncio.gather(
650
- _index_l4(normalised),
651
- _index_l5(normalised, arena=req.arena or "general"),
652
- _index_l6(normalised, arena=req.arena or "general"),
653
- _index_l2_internal(normalised, arena=req.arena or "general"),
752
+ _index_l4(normalised, embeddings=shared_embeddings),
753
+ _index_l5(normalised, arena=req.arena or "general", embeddings=shared_embeddings),
754
+ _index_l6(normalised, arena=req.arena or "general", embeddings=shared_embeddings),
755
+ _index_l2_internal(normalised, arena=req.arena or "general", embeddings=shared_embeddings),
654
756
  )
655
757
  dur_ms = (time.perf_counter() - t0) * 1000.0
656
758
 
@@ -212,8 +212,11 @@ services:
212
212
  compat:
213
213
  <<: *engine-base
214
214
  build:
215
- context: ./compat
216
- dockerfile: Dockerfile
215
+ # Build context is the memory-engine root so the Dockerfile can
216
+ # COPY both compat/server.py and engine/services/_shared (shared
217
+ # EmbedClient for /store-batch dedup).
218
+ context: .
219
+ dockerfile: compat/Dockerfile
217
220
  container_name: pme-compat
218
221
  ports:
219
222
  - "127.0.0.1:${PME_PORT:-8099}:8099"
@@ -225,6 +228,17 @@ services:
225
228
  L5_MILVUS_URL: http://l5:8034
226
229
  L6_DOC_URL: http://l6:8037
227
230
  NV_EMBED_URL: ${NV_EMBED_URL:-http://host.docker.internal:8041/v1/embeddings}
231
+ # PME_ prefix vars feed the shim's EmbedClient for shared-embed
232
+ # mode on /store-batch (one embed call across all 4 indexers vs
233
+ # 4 redundant calls). Match the L2 config block so both clients
234
+ # hit the same gateway with the same model. Set
235
+ # PME_SHARE_EMBEDDINGS=false to revert to per-layer embedding.
236
+ PME_NV_EMBED_URL: ${NV_EMBED_URL:-http://host.docker.internal:8041/v1/embeddings}
237
+ PME_EMBED_API_KEY: ${EMBED_API_KEY:-}
238
+ PME_EMBED_PROVIDER: ${EMBED_PROVIDER:-openai}
239
+ PME_EMBED_AUTODETECT: ${EMBED_AUTODETECT:-true}
240
+ PME_NV_EMBED_MODEL: ${EMBED_MODEL_NAME:-nv-embed-v2}
241
+ PME_SHARE_EMBEDDINGS: ${PME_SHARE_EMBEDDINGS:-true}
228
242
  BYPASS_L2_PROXY: ${BYPASS_L2_PROXY:-0}
229
243
  extra_hosts:
230
244
  - "host.docker.internal:host-gateway"
@@ -1496,6 +1496,12 @@ async def _embed_batch_local(texts: List[str]) -> List[List[float]]:
1496
1496
  class IndexInternalBatchRequest(BaseModel):
1497
1497
  records: List[Dict[str, Any]] # [{"id": str, "content": str, "metadata": dict}, ...]
1498
1498
  arena: Optional[str] = "general"
1499
+ # When supplied (parallel to `records`), skip the L4-QMD embed call
1500
+ # and use these vectors directly. Compat shim populates this when
1501
+ # shared-embed mode is on so we don't duplicate embed work across
1502
+ # layers. Length must match records — the handler falls back to its
1503
+ # own embed call if it doesn't.
1504
+ embeddings: Optional[List[List[float]]] = None
1499
1505
 
1500
1506
 
1501
1507
  @app.post("/index-internal-batch")
@@ -1575,7 +1581,19 @@ async def index_internal_batch(req: IndexInternalBatchRequest) -> dict:
1575
1581
  # ---- L4 QMD vec (qmd.sqlite) ----------------------------------------
1576
1582
  l4_inserted = 0
1577
1583
  try:
1578
- embeddings = await _embed_batch_local([n["content"] for n in norm])
1584
+ # Shared-embed shortcut: if the compat shim handed us pre-computed
1585
+ # vectors that line up with our normalised records, use them and
1586
+ # skip our own embed RPC. Fall back to per-layer embedding when
1587
+ # the vectors are absent or the lengths don't match (defensive).
1588
+ shared_embs = req.embeddings
1589
+ if (
1590
+ shared_embs is not None
1591
+ and len(shared_embs) == len(records)
1592
+ and len(records) == len(norm)
1593
+ ):
1594
+ embeddings = shared_embs
1595
+ else:
1596
+ embeddings = await _embed_batch_local([n["content"] for n in norm])
1579
1597
  if len(embeddings) != len(norm):
1580
1598
  log.warning(f"L4 embed count mismatch: {len(embeddings)} != {len(norm)}")
1581
1599
  qmd_db = Path(QMD_DB_PATH)
@@ -124,6 +124,10 @@ class SearchRequest(BaseModel):
124
124
 
125
125
  class IndexBatchRequest(BaseModel):
126
126
  records: list[dict[str, Any]]
127
+ # When supplied (parallel to `records`), skip the embed call and use
128
+ # these vectors directly. Compat shim populates this when shared-embed
129
+ # mode is on so we don't duplicate the embed work across layers.
130
+ embeddings: list[list[float]] | None = None
127
131
 
128
132
 
129
133
  app = FastAPI(title="L4 sqlite-vec sidecar (OSS)")
@@ -187,7 +191,13 @@ async def index_batch(req: IndexBatchRequest):
187
191
  return {"status": "ok", "inserted": 0}
188
192
  texts = [(r.get("text") or r.get("content") or "")[:8192] for r in req.records]
189
193
  t0 = time.perf_counter()
190
- embs = await _embed_batch(texts)
194
+ # Shared-embed shortcut: caller (compat shim) computed vectors once
195
+ # and forwards them so we skip the embed RPC. Length must match
196
+ # records — otherwise we fall back to our own embed call.
197
+ if req.embeddings is not None and len(req.embeddings) == len(req.records):
198
+ embs = req.embeddings
199
+ else:
200
+ embs = await _embed_batch(texts)
191
201
  embed_ms = (time.perf_counter() - t0) * 1000.0
192
202
 
193
203
  conn = _get_db()
@@ -629,13 +629,19 @@ def serve(port=8034):
629
629
  client = get_client()
630
630
  ensure_collection(client, collection)
631
631
 
632
- # Single batched embed call.
632
+ # Shared-embed shortcut: caller (compat shim) computed vectors
633
+ # once and forwards them so we skip the embed RPC. Length must
634
+ # match records — fall back to per-layer embed if it doesn't.
633
635
  texts = [(r.get("text") or "")[:8192] for r in records]
636
+ shared_embs = req.get("embeddings")
634
637
  t0 = _time.time()
635
- try:
636
- embs = _embed_post(texts)
637
- except Exception as exc:
638
- return {"status": "error", "error": f"embed failed: {exc}"}
638
+ if isinstance(shared_embs, list) and len(shared_embs) == len(records):
639
+ embs = shared_embs
640
+ else:
641
+ try:
642
+ embs = _embed_post(texts)
643
+ except Exception as exc:
644
+ return {"status": "error", "error": f"embed failed: {exc}"}
639
645
  embed_ms = (_time.time() - t0) * 1000.0
640
646
 
641
647
  # Single batched insert. Mirror every field the chats collection
@@ -990,12 +990,18 @@ def serve(port: int = DEFAULT_PORT):
990
990
 
991
991
  texts = [(r.get("text") or "")[:16000] for r in records]
992
992
 
993
- # Single batched embed call (OpenAI-compat first, lambda-gateway fallback).
993
+ # Shared-embed shortcut: caller (compat shim) computed vectors
994
+ # once and forwards them so we skip the embed RPC. Length must
995
+ # match records — fall back to per-layer embed if it doesn't.
996
+ shared_embs = req.get("embeddings")
994
997
  t0 = _time.time()
995
- try:
996
- embs = _embed_post(texts)
997
- except Exception as exc:
998
- raise HTTPException(status_code=500, detail=f"embed failed: {exc}")
998
+ if isinstance(shared_embs, list) and len(shared_embs) == len(records):
999
+ embs = shared_embs
1000
+ else:
1001
+ try:
1002
+ embs = _embed_post(texts)
1003
+ except Exception as exc:
1004
+ raise HTTPException(status_code=500, detail=f"embed failed: {exc}")
999
1005
  embed_ms = (_time.time() - t0) * 1000.0
1000
1006
 
1001
1007
  # Single milvus insert.