@pentatonic-ai/ai-agent-sdk 0.9.2 → 0.9.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -906,7 +906,7 @@ function fireAndForgetEmit(clientConfig, sessionOpts, messages, result, model) {
906
906
  }
907
907
 
908
908
  // src/telemetry.js
909
- var VERSION = "0.9.2";
909
+ var VERSION = "0.9.4";
910
910
  var TELEMETRY_URL = "https://sdk-telemetry.philip-134.workers.dev";
911
911
  function machineId() {
912
912
  const raw = typeof process !== "undefined" ? `${process.env?.USER || process.env?.USERNAME || "u"}:${process.platform || "x"}:${process.arch || "x"}` : "browser";
package/dist/index.js CHANGED
@@ -875,7 +875,7 @@ function fireAndForgetEmit(clientConfig, sessionOpts, messages, result, model) {
875
875
  }
876
876
 
877
877
  // src/telemetry.js
878
- var VERSION = "0.9.2";
878
+ var VERSION = "0.9.4";
879
879
  var TELEMETRY_URL = "https://sdk-telemetry.philip-134.workers.dev";
880
880
  function machineId() {
881
881
  const raw = typeof process !== "undefined" ? `${process.env?.USER || process.env?.USERNAME || "u"}:${process.platform || "x"}:${process.arch || "x"}` : "browser";
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@pentatonic-ai/ai-agent-sdk",
3
- "version": "0.9.2",
3
+ "version": "0.9.4",
4
4
  "description": "TES SDK — LLM observability and lifecycle tracking via Pentatonic Thing Event System. Track token usage, tool calls, and conversations. Manage things through event-sourced lifecycle stages with AI enrichment and vector search.",
5
5
  "type": "module",
6
6
  "main": "./dist/index.cjs",
@@ -568,9 +568,9 @@
568
568
  }
569
569
  },
570
570
  "node_modules/hono": {
571
- "version": "4.12.12",
572
- "resolved": "https://registry.npmjs.org/hono/-/hono-4.12.12.tgz",
573
- "integrity": "sha512-p1JfQMKaceuCbpJKAPKVqyqviZdS0eUxH9v82oWo1kb9xjQ5wA6iP3FNVAPDFlz5/p7d45lO+BpSk1tuSZMF4Q==",
571
+ "version": "4.12.18",
572
+ "resolved": "https://registry.npmjs.org/hono/-/hono-4.12.18.tgz",
573
+ "integrity": "sha512-RWzP96k/yv0PQfyXnWjs6zot20TqfpfsNXhOnev8d1InAxubW93L11/oNUc3tQqn2G0bSdAOBpX+2uDFHV7kdQ==",
574
574
  "license": "MIT",
575
575
  "engines": {
576
576
  "node": ">=16.9.0"
@@ -4,7 +4,18 @@ WORKDIR /app
4
4
 
5
5
  RUN pip install --no-cache-dir fastapi uvicorn[standard] httpx pydantic
6
6
 
7
- COPY server.py /app/server.py
7
+ # Build context is the memory-engine root (see docker-compose.yml). The
8
+ # shim's server.py side-loads engine/services/_shared/embed_provider.py
9
+ # for shared-embed mode on /store-batch (one embed call across all 3
10
+ # layer indexers vs 3 redundant calls).
11
+ COPY compat/server.py /app/server.py
12
+ # server.py's sys.path.insert resolves "../engine/services" relative to
13
+ # its own location (/app/server.py → /engine/services). Mirror that
14
+ # layout so the import works without changing the runtime code.
15
+ COPY engine/services/_shared /engine/services/_shared
16
+ # Make `_shared` an importable package (mirror the layer services'
17
+ # layout where __init__.py exists or python detects PEP 420 namespace).
18
+ RUN touch /engine/services/__init__.py
8
19
 
9
20
  EXPOSE 8099
10
21
 
@@ -25,7 +25,6 @@ Environment:
25
25
  L0_URL default http://l0:8030
26
26
  L2_PROXY_URL default http://l2:8031
27
27
  L3_KG_URL default http://l3:8047
28
- L4_VEC_URL default http://l4:8042
29
28
  L5_MILVUS_URL default http://l5:8035
30
29
  L6_DOC_URL default http://l6:8037
31
30
  NV_EMBED_URL default http://nv-embed:8041/v1/embeddings
@@ -34,6 +33,7 @@ Environment:
34
33
 
35
34
  import hashlib
36
35
  import os
36
+ import sys
37
37
  import time
38
38
  from datetime import datetime, timezone
39
39
  from typing import Any, Optional
@@ -42,6 +42,17 @@ import httpx
42
42
  from fastapi import FastAPI, HTTPException
43
43
  from pydantic import BaseModel, Field
44
44
 
45
+ # Reach into the engine/services tree so we can reuse EmbedClient. The
46
+ # tree isn't a real installed package; layer services and the compat
47
+ # shim both side-load it the same way. Keeps the chunking + auto-detect
48
+ # behaviour identical between the shim's pre-embed and the per-layer
49
+ # embeds that previously did the same work N times.
50
+ sys.path.insert(
51
+ 0,
52
+ os.path.join(os.path.dirname(__file__), "..", "engine", "services"),
53
+ )
54
+ from _shared.embed_provider import EmbedClient # noqa: E402
55
+
45
56
  # ----------------------------------------------------------------------
46
57
  # Config
47
58
  # ----------------------------------------------------------------------
@@ -49,7 +60,6 @@ from pydantic import BaseModel, Field
49
60
  L0_URL = os.environ.get("L0_URL", "http://l0:8030")
50
61
  L2_PROXY_URL = os.environ.get("L2_PROXY_URL", "http://l2:8031")
51
62
  L3_KG_URL = os.environ.get("L3_KG_URL", "http://l3:8047")
52
- L4_VEC_URL = os.environ.get("L4_VEC_URL", "http://l4:8042")
53
63
  L5_MILVUS_URL = os.environ.get("L5_MILVUS_URL", "http://l5:8035")
54
64
  L6_DOC_URL = os.environ.get("L6_DOC_URL", "http://l6:8037")
55
65
  NV_EMBED_URL = os.environ.get("NV_EMBED_URL", "http://nv-embed:8041/v1/embeddings")
@@ -63,6 +73,30 @@ NEO4J_DB = os.environ.get("NEO4J_DB", "neo4j")
63
73
 
64
74
  PORT = int(os.environ.get("PORT", "8099"))
65
75
 
76
+ # Shared-embed mode. When on, /store-batch computes embeddings once at
77
+ # the shim level and forwards them to each layer's /index-batch so the
78
+ # layer skips its own embed call. Cuts gateway RPC count by ~4× (L4 +
79
+ # L5 + L6 + L2-internal all did the same embed work independently).
80
+ # Default ON because all layers in this engine use the same NV-Embed
81
+ # model; disable if you ever wire up per-layer differentiated embedders
82
+ # (e.g. cohere on L5, openai on L4).
83
+ SHARE_EMBEDDINGS = os.environ.get("PME_SHARE_EMBEDDINGS", "true").lower() == "true"
84
+
85
+ _embed_client: EmbedClient | None = None
86
+
87
+
88
+ def _get_embed_client() -> EmbedClient:
89
+ """Lazy-init the shim's EmbedClient using PME_-prefixed env vars
90
+ (matches L2's pattern). Cached for the process lifetime so the
91
+ auto-detect handshake only happens once."""
92
+ global _embed_client
93
+ if _embed_client is None:
94
+ _embed_client = EmbedClient.from_env(
95
+ prefix="PME_",
96
+ default_url=NV_EMBED_URL,
97
+ )
98
+ return _embed_client
99
+
66
100
 
67
101
  # Layer types we surface as the SDK 4-layer projection. Engine stores
68
102
  # everything as chunks tagged with arena + layer_type metadata; this
@@ -252,28 +286,23 @@ async def _embed_batch(texts: list[str]) -> list[list[float]]:
252
286
  return [d["embedding"] for d in resp.json()["data"]]
253
287
 
254
288
 
255
- async def _index_l4(records: list[dict[str, Any]]) -> int:
256
- """Index records into the L4 sqlite-vec layer."""
257
- payload = {"records": [
258
- {"id": r.get("id") or hashlib.sha1(r["content"].encode()).hexdigest()[:32],
259
- "text": r["content"]} for r in records
260
- ]}
261
- try:
262
- resp = await _client().post(f"{L4_VEC_URL}/index-batch", json=payload, timeout=120.0)
263
- resp.raise_for_status()
264
- return resp.json().get("inserted", 0)
265
- except Exception as exc:
266
- print(f"[shim] L4 index-batch failed: {exc}")
267
- return 0
268
-
269
-
270
- async def _index_l5(records: list[dict[str, Any]], arena: str = "general") -> int:
289
+ async def _index_l5(
290
+ records: list[dict[str, Any]],
291
+ arena: str = "general",
292
+ embeddings: list[list[float]] | None = None,
293
+ ) -> int:
271
294
  """Index records into the L5 Milvus comms layer (chats collection).
272
295
 
273
296
  arena is forwarded as a Milvus dynamic field so /search can filter
274
297
  by arena natively (vs the shim's defence-in-depth post-filter).
298
+
299
+ When `embeddings` is supplied (parallel to records), L5 skips its
300
+ own embed call — the shim pre-computes vectors once at /store-batch
301
+ level and threads them through each layer to avoid 3× redundant
302
+ embed RPCs (L5 + L6 + L2-internal otherwise each re-embed the same
303
+ texts in parallel).
275
304
  """
276
- payload = {
305
+ payload: dict[str, Any] = {
277
306
  "collection": "chats",
278
307
  "records": [
279
308
  {
@@ -287,21 +316,32 @@ async def _index_l5(records: list[dict[str, Any]], arena: str = "general") -> in
287
316
  for r in records
288
317
  ],
289
318
  }
319
+ if embeddings is not None:
320
+ payload["embeddings"] = embeddings
290
321
  try:
291
322
  resp = await _client().post(f"{L5_MILVUS_URL}/index-batch", json=payload, timeout=60.0)
292
323
  resp.raise_for_status()
293
324
  return resp.json().get("inserted", 0)
294
325
  except Exception as exc:
295
- # Best-effort: L5 is one of six redundant layers; failure here doesn't
296
- # mean the record is unsearchable. L0 BM25 + L4 vec + L6 doc-store
297
- # all carry it independently.
326
+ # Best-effort: L5 is one of five redundant layers; failure here
327
+ # doesn't mean the record is unsearchable. L0 BM25 + L4 QMD +
328
+ # L6 doc-store all carry it independently.
298
329
  print(f"[shim] L5 index-batch failed: {exc}")
299
330
  return 0
300
331
 
301
332
 
302
- async def _index_l6(records: list[dict[str, Any]], arena: str = "general") -> int:
303
- """Index records into the L6 document store."""
304
- payload = {
333
+ async def _index_l6(
334
+ records: list[dict[str, Any]],
335
+ arena: str = "general",
336
+ embeddings: list[list[float]] | None = None,
337
+ ) -> int:
338
+ """Index records into the L6 document store.
339
+
340
+ When `embeddings` is supplied (parallel to records), L6 skips its
341
+ own embed call — the shim pre-computes vectors once at /store-batch
342
+ level and threads them through each layer.
343
+ """
344
+ payload: dict[str, Any] = {
305
345
  "arena": arena,
306
346
  "records": [
307
347
  {
@@ -314,6 +354,8 @@ async def _index_l6(records: list[dict[str, Any]], arena: str = "general") -> in
314
354
  for r in records
315
355
  ],
316
356
  }
357
+ if embeddings is not None:
358
+ payload["embeddings"] = embeddings
317
359
  try:
318
360
  resp = await _client().post(f"{L6_DOC_URL}/index-batch", json=payload, timeout=120.0)
319
361
  resp.raise_for_status()
@@ -323,14 +365,23 @@ async def _index_l6(records: list[dict[str, Any]], arena: str = "general") -> in
323
365
  return 0
324
366
 
325
367
 
326
- async def _index_l2_internal(records: list[dict[str, Any]], arena: str = "general") -> dict:
368
+ async def _index_l2_internal(
369
+ records: list[dict[str, Any]],
370
+ arena: str = "general",
371
+ embeddings: list[list[float]] | None = None,
372
+ ) -> dict:
327
373
  """Populate L2's internal stores: L0 BM25 + L4 QMD vec + L3 Neo4j KG.
328
374
 
329
375
  Without this, L2's RRF fusion runs over empty L0/L4-qmd/L3 layers and
330
376
  those zero-result rank lists pollute the score. The L2 proxy exposes
331
377
  /index-internal-batch which writes to all three in one round-trip.
378
+
379
+ When `embeddings` is supplied (parallel to records), L2's internal
380
+ embed call (used for L4-QMD population) is skipped — the shim
381
+ pre-computes vectors once at /store-batch level and threads them
382
+ through to L4_QMD via this endpoint.
332
383
  """
333
- payload = {
384
+ payload: dict[str, Any] = {
334
385
  "arena": arena,
335
386
  "records": [
336
387
  {
@@ -341,6 +392,8 @@ async def _index_l2_internal(records: list[dict[str, Any]], arena: str = "genera
341
392
  for r in records
342
393
  ],
343
394
  }
395
+ if embeddings is not None:
396
+ payload["embeddings"] = embeddings
344
397
  try:
345
398
  resp = await _client().post(f"{L2_PROXY_URL}/index-internal-batch",
346
399
  json=payload, timeout=180.0)
@@ -454,25 +507,25 @@ async def health():
454
507
  nv_embed_health = urlunparse((_u.scheme, _u.netloc, "/health", "", "", ""))
455
508
 
456
509
  import asyncio
457
- l2_v, l4_v, l5_v, l6_v, nv_v, l3_v = await asyncio.gather(
510
+ l2_v, l5_v, l6_v, nv_v, l3_v = await asyncio.gather(
458
511
  _probe(f"{L2_PROXY_URL}/health"),
459
- _probe(f"{L4_VEC_URL}/health"),
460
512
  _probe(f"{L5_MILVUS_URL}/health"),
461
513
  _probe(f"{L6_DOC_URL}/health"),
462
514
  _probe(nv_embed_health),
463
515
  _probe_l3(),
464
516
  )
465
517
 
466
- # L0 BM25 (FTS5) and L1 (always-loaded core files) are both in-process
467
- # inside the L2 proxy. They have no separate runtime; if L2 is healthy,
468
- # both layers are usable. Tie their status to L2.
518
+ # L0 BM25 (FTS5), L1 (always-loaded core files) and L4 QMD vec are
519
+ # all in-process inside the L2 proxy: L0+L1 in workspace.db / core
520
+ # files; L4 in qmd.sqlite which L2 opens directly. No separate runtime;
521
+ # if L2 is healthy, all three layers are usable. Tie their status to L2.
469
522
  l2_ok = l2_v == "ok"
470
523
  out["layers"] = {
471
524
  "l0": "ok" if l2_ok else l2_v,
472
525
  "l1": "ok" if l2_ok else l2_v,
473
526
  "l2": l2_v,
474
527
  "l3": l3_v,
475
- "l4": l4_v,
528
+ "l4": "ok" if l2_ok else l2_v,
476
529
  "l5": l5_v,
477
530
  "l6": l6_v,
478
531
  "nv_embed": nv_v,
@@ -493,19 +546,15 @@ async def health():
493
546
  "l6_vector_chunks": None,
494
547
  "l6_fts_chunks": None,
495
548
  }
496
- # L0 lives inside L2; L2 exposes /index-internal-stats with the counts.
549
+ # L0 and L4 both live inside L2 (workspace.db + qmd.sqlite directly
550
+ # opened by the L2 proxy). L2 exposes /index-internal-stats with both
551
+ # counts in one round-trip.
497
552
  try:
498
553
  r = await _client().get(f"{L2_PROXY_URL}/index-internal-stats", timeout=3.0)
499
554
  if r.status_code == 200:
500
555
  stats = r.json()
501
556
  memories["l0_bm25_chunks"] = int(stats.get("l0_chunks") or 0)
502
- except Exception:
503
- pass
504
- # L4 reports n_vectors on its own /health.
505
- try:
506
- r = await _client().get(f"{L4_VEC_URL}/health", timeout=3.0)
507
- if r.status_code == 200:
508
- memories["l4_vectors"] = int(r.json().get("n_vectors") or 0)
557
+ memories["l4_vectors"] = int(stats.get("l4_qmd_chunks") or 0)
509
558
  except Exception:
510
559
  pass
511
560
  # L5 reports per-collection counts on /health. We surface chats —
@@ -558,8 +607,9 @@ async def health_deep():
558
607
  except Exception as exc:
559
608
  return name, {"ok": False, "status": f"unreachable: {type(exc).__name__}"}
560
609
 
610
+ # L4 is in-process inside L2 (qmd.sqlite direct-read) — its deep
611
+ # round-trip is covered by L2's /health/deep, no separate probe needed.
561
612
  results = await asyncio.gather(
562
- _probe_deep("l4", f"{L4_VEC_URL}/health/deep"),
563
613
  _probe_deep("l5", f"{L5_MILVUS_URL}/health/deep"),
564
614
  _probe_deep("l6", f"{L6_DOC_URL}/health/deep"),
565
615
  )
@@ -599,15 +649,15 @@ async def store(req: StoreRequest):
599
649
  # depending on which one was supplied).
600
650
  _stash_all_keys(rid, req.metadata or {}, arena)
601
651
 
602
- # Fan out to L4 + L5 + L6 + L2-internal (L0+L4qmd+L3) in parallel.
652
+ # Fan out to L5 + L6 + L2-internal (L0+L4qmd+L3) in parallel.
603
653
  import asyncio
604
- l4_count, l5_count, l6_count, l2_internal = await asyncio.gather(
605
- _index_l4([record]),
654
+ l5_count, l6_count, l2_internal = await asyncio.gather(
606
655
  _index_l5([record], arena=arena),
607
656
  _index_l6([record], arena=arena),
608
657
  _index_l2_internal([record], arena=arena),
609
658
  )
610
659
 
660
+ l4_qmd_count = l2_internal.get("l4_qmd", 0)
611
661
  return {
612
662
  "id": rid,
613
663
  "content": req.content,
@@ -616,8 +666,11 @@ async def store(req: StoreRequest):
616
666
  "l0": l2_internal.get("l0", 0),
617
667
  "l3_chunks": l2_internal.get("l3_chunks", 0),
618
668
  "l3_entities": l2_internal.get("l3_entities", 0),
619
- "l4_qmd": l2_internal.get("l4_qmd", 0),
620
- "l4": l4_count,
669
+ "l4_qmd": l4_qmd_count,
670
+ # `l4` is aliased to L4_QMD now that the standalone L4 sqlite-vec
671
+ # sidecar has been dropped. Kept in the response for wire-format
672
+ # back-compat with callers that read engine.l4.
673
+ "l4": l4_qmd_count,
621
674
  "l5": l5_count,
622
675
  "l6": l6_count,
623
676
  },
@@ -646,24 +699,51 @@ async def store_batch(req: StoreBatchRequest):
646
699
 
647
700
  t0 = time.perf_counter()
648
701
  import asyncio
649
- l4_count, l5_count, l6_count, l2_internal = await asyncio.gather(
650
- _index_l4(normalised),
651
- _index_l5(normalised, arena=req.arena or "general"),
652
- _index_l6(normalised, arena=req.arena or "general"),
653
- _index_l2_internal(normalised, arena=req.arena or "general"),
702
+
703
+ # Shared-embed mode: compute embeddings ONCE here, pass them down to
704
+ # every layer so they skip their own embed call. Previously L5 + L6
705
+ # + L2-internal each re-embedded the same texts in parallel, which
706
+ # fanned 3× the gateway RPCs. The gateway throttles at K≈10 concurrent
707
+ # requests, so 30-way fan-out serialised into ~3 rounds of ~850ms
708
+ # each = ~2.5s of pure embed time per /store-batch. With shared
709
+ # embeddings we issue one chunked embed pass (10 sub-calls for N=50
710
+ # records) and skip the per-layer redundant work entirely.
711
+ # Disabled via PME_SHARE_EMBEDDINGS=false for operators wiring up
712
+ # per-layer differentiated embedders.
713
+ shared_embeddings: list[list[float]] | None = None
714
+ embed_ms = 0.0
715
+ if SHARE_EMBEDDINGS and normalised:
716
+ texts = [r["content"] for r in normalised]
717
+ embed_t0 = time.perf_counter()
718
+ try:
719
+ shared_embeddings = await _get_embed_client().embed_batch_async(texts)
720
+ except Exception as exc:
721
+ # Fall back to per-layer embedding rather than failing the
722
+ # whole batch. The layers' /index-batch still works when
723
+ # `embeddings` is absent.
724
+ print(f"[shim] shared embed failed, falling back to per-layer: {exc}")
725
+ shared_embeddings = None
726
+ embed_ms = (time.perf_counter() - embed_t0) * 1000.0
727
+
728
+ l5_count, l6_count, l2_internal = await asyncio.gather(
729
+ _index_l5(normalised, arena=req.arena or "general", embeddings=shared_embeddings),
730
+ _index_l6(normalised, arena=req.arena or "general", embeddings=shared_embeddings),
731
+ _index_l2_internal(normalised, arena=req.arena or "general", embeddings=shared_embeddings),
654
732
  )
655
733
  dur_ms = (time.perf_counter() - t0) * 1000.0
656
734
 
735
+ l4_qmd_count = l2_internal.get("l4_qmd", 0)
657
736
  return {
658
737
  "status": "ok",
659
- "inserted": max(l4_count, l5_count, l6_count),
738
+ "inserted": max(l4_qmd_count, l5_count, l6_count),
660
739
  "ids": [r["id"] for r in normalised],
661
740
  "engine": {
662
741
  "l0": l2_internal.get("l0", 0),
663
742
  "l3_chunks": l2_internal.get("l3_chunks", 0),
664
743
  "l3_entities": l2_internal.get("l3_entities", 0),
665
- "l4_qmd": l2_internal.get("l4_qmd", 0),
666
- "l4": l4_count,
744
+ "l4_qmd": l4_qmd_count,
745
+ # `l4` aliased to L4_QMD — sidecar dropped, see /store handler.
746
+ "l4": l4_qmd_count,
667
747
  "l5": l5_count,
668
748
  "l6": l6_count,
669
749
  },
@@ -32,12 +32,6 @@ services:
32
32
  # Pin the embedding dim explicitly across layers, independent of any
33
33
  # developer-local .env (which may set EMBED_DIM=768 for Ollama-based
34
34
  # local dev). The stub returns 4096; layers must agree.
35
- l4:
36
- environment:
37
- L4_NV_EMBED_URL: http://embed-stub:8041/v1/embeddings
38
- L4_EMBED_API_KEY: ""
39
- L4_EMBED_DIM: "4096"
40
-
41
35
  l5:
42
36
  environment:
43
37
  L5_NV_EMBED_URL: http://embed-stub:8041/v1/embeddings
@@ -61,6 +55,5 @@ services:
61
55
  embed-stub:
62
56
  condition: service_healthy
63
57
  l2: { condition: service_started }
64
- l4: { condition: service_started }
65
58
  l5: { condition: service_started }
66
59
  l6: { condition: service_started }
@@ -82,36 +82,6 @@ services:
82
82
  retries: 30
83
83
  start_period: 30s
84
84
 
85
- # --------------------------------------------------------------------
86
- # L4 — sqlite-vec sidecar
87
- # --------------------------------------------------------------------
88
- l4:
89
- <<: *engine-base
90
- build:
91
- context: ./engine/services
92
- dockerfile: l4/Dockerfile
93
- container_name: pme-l4
94
- # Default 18042 to avoid port collisions on 8042.
95
- # Override via PME_L4_PORT for bench setups that intentionally replace it.
96
- ports: ["127.0.0.1:${PME_L4_PORT:-18042}:8042"]
97
- environment:
98
- L4_NV_EMBED_URL: ${NV_EMBED_URL:-http://host.docker.internal:8041/v1/embeddings}
99
- L4_EMBED_MODEL: ${EMBED_MODEL_NAME:-nv-embed-v2}
100
- L4_EMBED_API_KEY: ${EMBED_API_KEY:-}
101
- L4_EMBED_PROVIDER: ${EMBED_PROVIDER:-openai}
102
- L4_EMBED_AUTODETECT: ${EMBED_AUTODETECT:-true}
103
- L4_EMBED_DIM: ${EMBED_DIM:-4096}
104
- L4_DB_PATH: /data/vec.db
105
- extra_hosts:
106
- - "host.docker.internal:host-gateway"
107
- volumes:
108
- - pme-l4-data:/data
109
- healthcheck:
110
- test: ["CMD", "python", "-c", "import urllib.request,sys; urllib.request.urlopen('http://localhost:8042/health',timeout=3)"]
111
- interval: 10s
112
- timeout: 5s
113
- retries: 30
114
-
115
85
  # --------------------------------------------------------------------
116
86
  # L5 — Qdrant comms layer
117
87
  # --------------------------------------------------------------------
@@ -212,8 +182,11 @@ services:
212
182
  compat:
213
183
  <<: *engine-base
214
184
  build:
215
- context: ./compat
216
- dockerfile: Dockerfile
185
+ # Build context is the memory-engine root so the Dockerfile can
186
+ # COPY both compat/server.py and engine/services/_shared (shared
187
+ # EmbedClient for /store-batch dedup).
188
+ context: .
189
+ dockerfile: compat/Dockerfile
217
190
  container_name: pme-compat
218
191
  ports:
219
192
  - "127.0.0.1:${PME_PORT:-8099}:8099"
@@ -221,16 +194,25 @@ services:
221
194
  L0_URL: http://l2:8031
222
195
  L2_PROXY_URL: http://l2:8031
223
196
  L3_KG_URL: http://l3:7474
224
- L4_VEC_URL: http://l4:8042
225
197
  L5_MILVUS_URL: http://l5:8034
226
198
  L6_DOC_URL: http://l6:8037
227
199
  NV_EMBED_URL: ${NV_EMBED_URL:-http://host.docker.internal:8041/v1/embeddings}
200
+ # PME_ prefix vars feed the shim's EmbedClient for shared-embed
201
+ # mode on /store-batch (one embed call across all 3 indexers vs
202
+ # 3 redundant calls). Match the L2 config block so both clients
203
+ # hit the same gateway with the same model. Set
204
+ # PME_SHARE_EMBEDDINGS=false to revert to per-layer embedding.
205
+ PME_NV_EMBED_URL: ${NV_EMBED_URL:-http://host.docker.internal:8041/v1/embeddings}
206
+ PME_EMBED_API_KEY: ${EMBED_API_KEY:-}
207
+ PME_EMBED_PROVIDER: ${EMBED_PROVIDER:-openai}
208
+ PME_EMBED_AUTODETECT: ${EMBED_AUTODETECT:-true}
209
+ PME_NV_EMBED_MODEL: ${EMBED_MODEL_NAME:-nv-embed-v2}
210
+ PME_SHARE_EMBEDDINGS: ${PME_SHARE_EMBEDDINGS:-true}
228
211
  BYPASS_L2_PROXY: ${BYPASS_L2_PROXY:-0}
229
212
  extra_hosts:
230
213
  - "host.docker.internal:host-gateway"
231
214
  depends_on:
232
215
  l2: { condition: service_started }
233
- l4: { condition: service_started }
234
216
  l5: { condition: service_started }
235
217
  l6: { condition: service_started }
236
218
  healthcheck:
@@ -247,6 +229,5 @@ volumes:
247
229
  pme-nv-embed-cache:
248
230
  pme-l2-data:
249
231
  pme-l3-data:
250
- pme-l4-data:
251
232
  pme-l5-data:
252
233
  pme-l6-data:
@@ -1496,6 +1496,12 @@ async def _embed_batch_local(texts: List[str]) -> List[List[float]]:
1496
1496
  class IndexInternalBatchRequest(BaseModel):
1497
1497
  records: List[Dict[str, Any]] # [{"id": str, "content": str, "metadata": dict}, ...]
1498
1498
  arena: Optional[str] = "general"
1499
+ # When supplied (parallel to `records`), skip the L4-QMD embed call
1500
+ # and use these vectors directly. Compat shim populates this when
1501
+ # shared-embed mode is on so we don't duplicate embed work across
1502
+ # layers. Length must match records — defensive bail-out below if
1503
+ # it doesn't.
1504
+ embeddings: Optional[List[List[float]]] = None
1499
1505
 
1500
1506
 
1501
1507
  @app.post("/index-internal-batch")
@@ -1575,7 +1581,19 @@ async def index_internal_batch(req: IndexInternalBatchRequest) -> dict:
1575
1581
  # ---- L4 QMD vec (qmd.sqlite) ----------------------------------------
1576
1582
  l4_inserted = 0
1577
1583
  try:
1578
- embeddings = await _embed_batch_local([n["content"] for n in norm])
1584
+ # Shared-embed shortcut: if the compat shim handed us pre-computed
1585
+ # vectors that line up with our normalised records, use them and
1586
+ # skip our own embed RPC. Fall back to per-layer embedding when
1587
+ # the vectors are absent or the lengths don't match (defensive).
1588
+ shared_embs = req.embeddings
1589
+ if (
1590
+ shared_embs is not None
1591
+ and len(shared_embs) == len(records)
1592
+ and len(records) == len(norm)
1593
+ ):
1594
+ embeddings = shared_embs
1595
+ else:
1596
+ embeddings = await _embed_batch_local([n["content"] for n in norm])
1579
1597
  if len(embeddings) != len(norm):
1580
1598
  log.warning(f"L4 embed count mismatch: {len(embeddings)} != {len(norm)}")
1581
1599
  qmd_db = Path(QMD_DB_PATH)
@@ -629,13 +629,19 @@ def serve(port=8034):
629
629
  client = get_client()
630
630
  ensure_collection(client, collection)
631
631
 
632
- # Single batched embed call.
632
+ # Shared-embed shortcut: caller (compat shim) computed vectors
633
+ # once and forwards them so we skip the embed RPC. Length must
634
+ # match records — fall back to per-layer embed if it doesn't.
633
635
  texts = [(r.get("text") or "")[:8192] for r in records]
636
+ shared_embs = req.get("embeddings")
634
637
  t0 = _time.time()
635
- try:
636
- embs = _embed_post(texts)
637
- except Exception as exc:
638
- return {"status": "error", "error": f"embed failed: {exc}"}
638
+ if isinstance(shared_embs, list) and len(shared_embs) == len(records):
639
+ embs = shared_embs
640
+ else:
641
+ try:
642
+ embs = _embed_post(texts)
643
+ except Exception as exc:
644
+ return {"status": "error", "error": f"embed failed: {exc}"}
639
645
  embed_ms = (_time.time() - t0) * 1000.0
640
646
 
641
647
  # Single batched insert. Mirror every field the chats collection
@@ -990,12 +990,18 @@ def serve(port: int = DEFAULT_PORT):
990
990
 
991
991
  texts = [(r.get("text") or "")[:16000] for r in records]
992
992
 
993
- # Single batched embed call (OpenAI-compat first, lambda-gateway fallback).
993
+ # Shared-embed shortcut: caller (compat shim) computed vectors
994
+ # once and forwards them so we skip the embed RPC. Length must
995
+ # match records — fall back to per-layer embed if it doesn't.
996
+ shared_embs = req.get("embeddings")
994
997
  t0 = _time.time()
995
- try:
996
- embs = _embed_post(texts)
997
- except Exception as exc:
998
- raise HTTPException(status_code=500, detail=f"embed failed: {exc}")
998
+ if isinstance(shared_embs, list) and len(shared_embs) == len(records):
999
+ embs = shared_embs
1000
+ else:
1001
+ try:
1002
+ embs = _embed_post(texts)
1003
+ except Exception as exc:
1004
+ raise HTTPException(status_code=500, detail=f"embed failed: {exc}")
999
1005
  embed_ms = (_time.time() - t0) * 1000.0
1000
1006
 
1001
1007
  # Single milvus insert.
@@ -1,19 +0,0 @@
1
- FROM python:3.12-slim
2
-
3
- WORKDIR /app
4
-
5
- RUN pip install --no-cache-dir fastapi 'uvicorn[standard]' httpx pydantic
6
-
7
- # Build context is engine/services so the shared embed_provider module is
8
- # COPYable. server.py adds engine/services to sys.path at startup, then
9
- # imports from `_shared.embed_provider`.
10
- COPY _shared /app/_shared
11
- COPY l4/server.py /app/server.py
12
-
13
- RUN mkdir -p /data
14
- ENV L4_DB_PATH=/data/vec.db
15
- ENV PORT=8042
16
-
17
- EXPOSE 8042
18
-
19
- CMD ["python", "server.py", "--port", "8042"]
@@ -1,305 +0,0 @@
1
- """
2
- L4 sqlite-vec sidecar.
3
-
4
- Vector index sidecar for the Pentatonic Memory Engine stack.
5
- Exposes /health, /search, /index-batch, /refresh over HTTP.
6
-
7
- Endpoints:
8
- GET /health
9
- POST /search body: {"query":"...", "limit":10}
10
- POST /index-batch body: {"records":[{"id","text"}, ...]}
11
- POST /refresh no-op (sqlite-vec writes are immediate)
12
-
13
- Env:
14
- L4_DB_PATH default /data/vec.db
15
- L4_NV_EMBED_URL default http://nv-embed:8041/v1/embeddings
16
- PORT default 8042
17
- """
18
-
19
- from __future__ import annotations
20
-
21
- import argparse
22
- import hashlib
23
- import os
24
- import sqlite3
25
- import struct
26
- import sys
27
- import time
28
- from pathlib import Path
29
- from typing import Any
30
-
31
- from fastapi import FastAPI, HTTPException
32
- from pydantic import BaseModel
33
-
34
- # Shared embedding client lives at engine/services/_shared/. Add the parent of
35
- # the service dir to sys.path so `from _shared.embed_provider import ...` works
36
- # regardless of how the service is launched (uvicorn, python server.py, etc.).
37
- sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
38
- from _shared.embed_provider import EmbedClient # noqa: E402
39
-
40
- # ----------------------------------------------------------------------
41
- # Config
42
- # ----------------------------------------------------------------------
43
-
44
- DB_PATH = os.environ.get("L4_DB_PATH", "/data/vec.db")
45
- EMBED_DIM = int(os.environ.get("L4_EMBED_DIM", "4096"))
46
-
47
-
48
-
49
- # ----------------------------------------------------------------------
50
- # DB helpers
51
- # ----------------------------------------------------------------------
52
-
53
- def _vec_to_blob(vec: list[float]) -> bytes:
54
- """Pack a list of floats as little-endian f32 bytes for sqlite-vec."""
55
- return struct.pack(f"<{len(vec)}f", *vec)
56
-
57
-
58
- def _blob_to_vec(blob: bytes) -> list[float]:
59
- n = len(blob) // 4
60
- return list(struct.unpack(f"<{n}f", blob))
61
-
62
-
63
- def _cosine(a: list[float], b: list[float]) -> float:
64
- import math
65
- dot = sum(x * y for x, y in zip(a, b))
66
- na = math.sqrt(sum(x * x for x in a))
67
- nb = math.sqrt(sum(y * y for y in b))
68
- if na == 0 or nb == 0:
69
- return 0.0
70
- return dot / (na * nb)
71
-
72
-
73
- def _get_db() -> sqlite3.Connection:
74
- """Open DB and ensure schema. We use plain BLOB columns rather than
75
- the sqlite-vec virtual table because sqlite-vec is an optional ext
76
- that may not be loadable in every container — plain BLOB lets us
77
- fall back to a Python-side cosine pass without losing correctness.
78
- """
79
- Path(DB_PATH).parent.mkdir(parents=True, exist_ok=True)
80
- conn = sqlite3.connect(DB_PATH, timeout=10)
81
- conn.execute("PRAGMA journal_mode=WAL")
82
- conn.execute("""
83
- CREATE TABLE IF NOT EXISTS chunks (
84
- id TEXT PRIMARY KEY,
85
- text TEXT,
86
- embedding BLOB,
87
- indexed_at REAL
88
- )
89
- """)
90
- return conn
91
-
92
-
93
- # ----------------------------------------------------------------------
94
- # Embedding client
95
- # ----------------------------------------------------------------------
96
-
97
- _embed: EmbedClient | None = None
98
-
99
-
100
- def _embed_client() -> EmbedClient:
101
- """Lazily build the embed client so env vars are read at first use."""
102
- global _embed
103
- if _embed is None:
104
- _embed = EmbedClient.from_env(
105
- prefix="L4_",
106
- default_url="http://nv-embed:8041/v1/embeddings",
107
- )
108
- return _embed
109
-
110
-
111
- async def _embed_batch(texts: list[str]) -> list[list[float]]:
112
- """Embed a batch of texts via the shared EmbedClient."""
113
- return await _embed_client().embed_batch_async(texts)
114
-
115
-
116
- # ----------------------------------------------------------------------
117
- # FastAPI
118
- # ----------------------------------------------------------------------
119
-
120
- class SearchRequest(BaseModel):
121
- query: str
122
- limit: int = 10
123
-
124
-
125
- class IndexBatchRequest(BaseModel):
126
- records: list[dict[str, Any]]
127
-
128
-
129
- app = FastAPI(title="L4 sqlite-vec sidecar (OSS)")
130
-
131
-
132
- @app.get("/health")
133
- def health():
134
- try:
135
- conn = _get_db()
136
- n = conn.execute("SELECT COUNT(*) FROM chunks").fetchone()[0]
137
- conn.close()
138
- return {"status": "ok", "loaded": True, "n_vectors": n,
139
- "dim": EMBED_DIM, "db_path": DB_PATH,
140
- # BLOB+Python-cosine is the intentional implementation path,
141
- # not a degraded fallback (see _get_db docstring). The previous
142
- # "sqlite-vec-fallback" label gave operators the wrong signal.
143
- "backend": "sqlite-vec"}
144
- except Exception as exc:
145
- return {"status": "degraded", "error": str(exc)}
146
-
147
-
148
- @app.post("/search")
149
- async def search(req: SearchRequest):
150
- if not req.query:
151
- return []
152
- try:
153
- embs = await _embed_batch([req.query])
154
- if not embs or embs[0] is None:
155
- raise HTTPException(status_code=502, detail="embed failed")
156
- q_vec = embs[0]
157
- except Exception as exc:
158
- raise HTTPException(status_code=502, detail=f"embed: {exc}")
159
-
160
- conn = _get_db()
161
- rows = conn.execute("SELECT id, text, embedding FROM chunks").fetchall()
162
- conn.close()
163
-
164
- # Cosine similarity in Python — fine for OSS / small corpora. For
165
- # large corpora: consider a dedicated vector DB.
166
- scored: list[tuple[float, str, str]] = []
167
- for rid, text, blob in rows:
168
- if not blob:
169
- continue
170
- v = _blob_to_vec(blob)
171
- if len(v) != len(q_vec):
172
- continue
173
- s = _cosine(q_vec, v)
174
- scored.append((s, rid, text))
175
- scored.sort(reverse=True)
176
- out = [
177
- {"path": rid, "text": text, "score": float(s),
178
- "source": "L4-sqlite-vec", "layer": "L4"}
179
- for s, rid, text in scored[: req.limit]
180
- ]
181
- return out
182
-
183
-
184
- @app.post("/index-batch")
185
- async def index_batch(req: IndexBatchRequest):
186
- if not req.records:
187
- return {"status": "ok", "inserted": 0}
188
- texts = [(r.get("text") or r.get("content") or "")[:8192] for r in req.records]
189
- t0 = time.perf_counter()
190
- embs = await _embed_batch(texts)
191
- embed_ms = (time.perf_counter() - t0) * 1000.0
192
-
193
- conn = _get_db()
194
- t1 = time.perf_counter()
195
- rows = []
196
- for r, emb, txt in zip(req.records, embs, texts):
197
- if not emb:
198
- continue
199
- rid = r.get("id") or hashlib.sha1(txt.encode("utf-8")).hexdigest()[:32]
200
- rows.append((rid, txt, _vec_to_blob(emb), time.time()))
201
- if rows:
202
- conn.executemany(
203
- "INSERT OR REPLACE INTO chunks(id, text, embedding, indexed_at) "
204
- "VALUES (?, ?, ?, ?)", rows,
205
- )
206
- conn.commit()
207
- insert_ms = (time.perf_counter() - t1) * 1000.0
208
- conn.close()
209
- return {"status": "ok", "inserted": len(rows),
210
- "embed_ms": round(embed_ms, 1), "insert_ms": round(insert_ms, 1)}
211
-
212
-
213
- @app.post("/refresh")
214
- def refresh():
215
- """No-op for sqlite-vec — writes are immediate. Kept for API parity."""
216
- return {"status": "ok", "noop": True}
217
-
218
-
219
- # ----------------------------------------------------------------------
220
- # /health/deep — synthetic round-trip
221
- # ----------------------------------------------------------------------
222
-
223
- # Fixed sentinel id used by /health/deep. Upserted on every probe call,
224
- # so the row is idempotent. Kept under id="__healthcheck__sentinel" so
225
- # the L4 corpus has at most one healthcheck row regardless of probe rate.
226
- _HEALTH_SENTINEL_ID = "__healthcheck__sentinel"
227
- _HEALTH_SENTINEL_TEXT = (
228
- "healthcheck sentinel — embed-write-search round-trip verifier"
229
- )
230
-
231
-
232
- @app.get("/health/deep")
233
- async def health_deep():
234
- """Real functional probe: embed → write → search the sentinel.
235
-
236
- Catches the class of failure that plain /health misses — broken
237
- embed paths, write 500s, query path bugs — i.e. exactly the bug
238
- shape that silently degraded L6 from v0.8.0 → v0.8.2.
239
-
240
- Returns:
241
- {status, embed_ms, write_ms, search_ms, hit, ok}
242
-
243
- `hit` confirms the sentinel was returned from search; `ok` is the
244
- aggregate `embed_ok AND write_ok AND hit`. HTTP status stays 200
245
- regardless so callers can read the body for diagnostics; status:
246
- field carries the verdict.
247
- """
248
- t_total = time.perf_counter()
249
- out: dict[str, Any] = {"status": "ok", "ok": True}
250
- try:
251
- t0 = time.perf_counter()
252
- embs = await _embed_batch([_HEALTH_SENTINEL_TEXT])
253
- out["embed_ms"] = round((time.perf_counter() - t0) * 1000.0, 1)
254
- if not embs or not embs[0]:
255
- out["status"] = "embed_failed"
256
- out["ok"] = False
257
- return out
258
- vec = embs[0]
259
- except Exception as exc:
260
- out["status"] = f"embed_error: {type(exc).__name__}"
261
- out["ok"] = False
262
- return out
263
-
264
- try:
265
- conn = _get_db()
266
- t1 = time.perf_counter()
267
- conn.execute(
268
- "INSERT OR REPLACE INTO chunks(id, text, embedding, indexed_at) "
269
- "VALUES (?, ?, ?, ?)",
270
- (_HEALTH_SENTINEL_ID, _HEALTH_SENTINEL_TEXT, _vec_to_blob(vec), time.time()),
271
- )
272
- conn.commit()
273
- out["write_ms"] = round((time.perf_counter() - t1) * 1000.0, 1)
274
-
275
- t2 = time.perf_counter()
276
- rows = conn.execute("SELECT id, embedding FROM chunks WHERE id = ?",
277
- (_HEALTH_SENTINEL_ID,)).fetchone()
278
- out["search_ms"] = round((time.perf_counter() - t2) * 1000.0, 1)
279
- conn.close()
280
- except Exception as exc:
281
- out["status"] = f"db_error: {type(exc).__name__}"
282
- out["ok"] = False
283
- return out
284
-
285
- out["hit"] = rows is not None
286
- if not out["hit"]:
287
- out["status"] = "sentinel_missing"
288
- out["ok"] = False
289
- out["total_ms"] = round((time.perf_counter() - t_total) * 1000.0, 1)
290
- return out
291
-
292
-
293
- # ----------------------------------------------------------------------
294
- # Entrypoint
295
- # ----------------------------------------------------------------------
296
-
297
- if __name__ == "__main__":
298
- parser = argparse.ArgumentParser()
299
- parser.add_argument("--port", type=int, default=int(os.environ.get("PORT", "8042")))
300
- parser.add_argument("--data-dir", default=None)
301
- args = parser.parse_args()
302
- if args.data_dir:
303
- os.environ["L4_DB_PATH"] = str(Path(args.data_dir) / "vec.db")
304
- import uvicorn
305
- uvicorn.run("server:app", host="0.0.0.0", port=args.port, log_level="info")