@pentatonic-ai/ai-agent-sdk 0.9.3 → 0.9.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +1 -1
- package/dist/index.js +1 -1
- package/package.json +1 -1
- package/packages/memory/package-lock.json +3 -3
- package/packages/memory-engine/compat/server.py +45 -67
- package/packages/memory-engine/docker-compose.test.yml +0 -7
- package/packages/memory-engine/docker-compose.yml +2 -35
- package/packages/memory-engine/engine/services/l4/Dockerfile +0 -19
- package/packages/memory-engine/engine/services/l4/server.py +0 -315
package/dist/index.cjs
CHANGED
|
@@ -906,7 +906,7 @@ function fireAndForgetEmit(clientConfig, sessionOpts, messages, result, model) {
|
|
|
906
906
|
}
|
|
907
907
|
|
|
908
908
|
// src/telemetry.js
|
|
909
|
-
var VERSION = "0.9.3";
|
|
909
|
+
var VERSION = "0.9.4";
|
|
910
910
|
var TELEMETRY_URL = "https://sdk-telemetry.philip-134.workers.dev";
|
|
911
911
|
function machineId() {
|
|
912
912
|
const raw = typeof process !== "undefined" ? `${process.env?.USER || process.env?.USERNAME || "u"}:${process.platform || "x"}:${process.arch || "x"}` : "browser";
|
package/dist/index.js
CHANGED
|
@@ -875,7 +875,7 @@ function fireAndForgetEmit(clientConfig, sessionOpts, messages, result, model) {
|
|
|
875
875
|
}
|
|
876
876
|
|
|
877
877
|
// src/telemetry.js
|
|
878
|
-
var VERSION = "0.9.3";
|
|
878
|
+
var VERSION = "0.9.4";
|
|
879
879
|
var TELEMETRY_URL = "https://sdk-telemetry.philip-134.workers.dev";
|
|
880
880
|
function machineId() {
|
|
881
881
|
const raw = typeof process !== "undefined" ? `${process.env?.USER || process.env?.USERNAME || "u"}:${process.platform || "x"}:${process.arch || "x"}` : "browser";
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@pentatonic-ai/ai-agent-sdk",
|
|
3
|
-
"version": "0.9.3",
|
|
3
|
+
"version": "0.9.4",
|
|
4
4
|
"description": "TES SDK — LLM observability and lifecycle tracking via Pentatonic Thing Event System. Track token usage, tool calls, and conversations. Manage things through event-sourced lifecycle stages with AI enrichment and vector search.",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "./dist/index.cjs",
|
|
@@ -568,9 +568,9 @@
|
|
|
568
568
|
}
|
|
569
569
|
},
|
|
570
570
|
"node_modules/hono": {
|
|
571
|
-
"version": "4.12.
|
|
572
|
-
"resolved": "https://registry.npmjs.org/hono/-/hono-4.12.
|
|
573
|
-
"integrity": "sha512-
|
|
571
|
+
"version": "4.12.18",
|
|
572
|
+
"resolved": "https://registry.npmjs.org/hono/-/hono-4.12.18.tgz",
|
|
573
|
+
"integrity": "sha512-RWzP96k/yv0PQfyXnWjs6zot20TqfpfsNXhOnev8d1InAxubW93L11/oNUc3tQqn2G0bSdAOBpX+2uDFHV7kdQ==",
|
|
574
574
|
"license": "MIT",
|
|
575
575
|
"engines": {
|
|
576
576
|
"node": ">=16.9.0"
|
|
@@ -25,7 +25,6 @@ Environment:
|
|
|
25
25
|
L0_URL default http://l0:8030
|
|
26
26
|
L2_PROXY_URL default http://l2:8031
|
|
27
27
|
L3_KG_URL default http://l3:8047
|
|
28
|
-
L4_VEC_URL default http://l4:8042
|
|
29
28
|
L5_MILVUS_URL default http://l5:8035
|
|
30
29
|
L6_DOC_URL default http://l6:8037
|
|
31
30
|
NV_EMBED_URL default http://nv-embed:8041/v1/embeddings
|
|
@@ -61,7 +60,6 @@ from _shared.embed_provider import EmbedClient # noqa: E402
|
|
|
61
60
|
L0_URL = os.environ.get("L0_URL", "http://l0:8030")
|
|
62
61
|
L2_PROXY_URL = os.environ.get("L2_PROXY_URL", "http://l2:8031")
|
|
63
62
|
L3_KG_URL = os.environ.get("L3_KG_URL", "http://l3:8047")
|
|
64
|
-
L4_VEC_URL = os.environ.get("L4_VEC_URL", "http://l4:8042")
|
|
65
63
|
L5_MILVUS_URL = os.environ.get("L5_MILVUS_URL", "http://l5:8035")
|
|
66
64
|
L6_DOC_URL = os.environ.get("L6_DOC_URL", "http://l6:8037")
|
|
67
65
|
NV_EMBED_URL = os.environ.get("NV_EMBED_URL", "http://nv-embed:8041/v1/embeddings")
|
|
@@ -288,32 +286,6 @@ async def _embed_batch(texts: list[str]) -> list[list[float]]:
|
|
|
288
286
|
return [d["embedding"] for d in resp.json()["data"]]
|
|
289
287
|
|
|
290
288
|
|
|
291
|
-
async def _index_l4(
|
|
292
|
-
records: list[dict[str, Any]],
|
|
293
|
-
embeddings: list[list[float]] | None = None,
|
|
294
|
-
) -> int:
|
|
295
|
-
"""Index records into the L4 sqlite-vec layer.
|
|
296
|
-
|
|
297
|
-
When `embeddings` is supplied (parallel to records), L4's /index-batch
|
|
298
|
-
skips its own embed call and uses ours — eliminates the redundant
|
|
299
|
-
embed work that previously cost ~850ms per drain alarm. When None,
|
|
300
|
-
L4 embeds itself (backwards-compatible path for older callers / tests
|
|
301
|
-
that don't share embeddings)."""
|
|
302
|
-
payload: dict[str, Any] = {"records": [
|
|
303
|
-
{"id": r.get("id") or hashlib.sha1(r["content"].encode()).hexdigest()[:32],
|
|
304
|
-
"text": r["content"]} for r in records
|
|
305
|
-
]}
|
|
306
|
-
if embeddings is not None:
|
|
307
|
-
payload["embeddings"] = embeddings
|
|
308
|
-
try:
|
|
309
|
-
resp = await _client().post(f"{L4_VEC_URL}/index-batch", json=payload, timeout=120.0)
|
|
310
|
-
resp.raise_for_status()
|
|
311
|
-
return resp.json().get("inserted", 0)
|
|
312
|
-
except Exception as exc:
|
|
313
|
-
print(f"[shim] L4 index-batch failed: {exc}")
|
|
314
|
-
return 0
|
|
315
|
-
|
|
316
|
-
|
|
317
289
|
async def _index_l5(
|
|
318
290
|
records: list[dict[str, Any]],
|
|
319
291
|
arena: str = "general",
|
|
@@ -325,7 +297,10 @@ async def _index_l5(
|
|
|
325
297
|
by arena natively (vs the shim's defence-in-depth post-filter).
|
|
326
298
|
|
|
327
299
|
When `embeddings` is supplied (parallel to records), L5 skips its
|
|
328
|
-
own embed call —
|
|
300
|
+
own embed call — the shim pre-computes vectors once at /store-batch
|
|
301
|
+
level and threads them through each layer to avoid 3× redundant
|
|
302
|
+
embed RPCs (L5 + L6 + L2-internal otherwise each re-embed the same
|
|
303
|
+
texts in parallel).
|
|
329
304
|
"""
|
|
330
305
|
payload: dict[str, Any] = {
|
|
331
306
|
"collection": "chats",
|
|
@@ -348,9 +323,9 @@ async def _index_l5(
|
|
|
348
323
|
resp.raise_for_status()
|
|
349
324
|
return resp.json().get("inserted", 0)
|
|
350
325
|
except Exception as exc:
|
|
351
|
-
# Best-effort: L5 is one of
|
|
352
|
-
# mean the record is unsearchable. L0 BM25 + L4
|
|
353
|
-
# all carry it independently.
|
|
326
|
+
# Best-effort: L5 is one of five redundant layers; failure here
|
|
327
|
+
# doesn't mean the record is unsearchable. L0 BM25 + L4 QMD +
|
|
328
|
+
# L6 doc-store all carry it independently.
|
|
354
329
|
print(f"[shim] L5 index-batch failed: {exc}")
|
|
355
330
|
return 0
|
|
356
331
|
|
|
@@ -363,7 +338,8 @@ async def _index_l6(
|
|
|
363
338
|
"""Index records into the L6 document store.
|
|
364
339
|
|
|
365
340
|
When `embeddings` is supplied (parallel to records), L6 skips its
|
|
366
|
-
own embed call —
|
|
341
|
+
own embed call — the shim pre-computes vectors once at /store-batch
|
|
342
|
+
level and threads them through each layer.
|
|
367
343
|
"""
|
|
368
344
|
payload: dict[str, Any] = {
|
|
369
345
|
"arena": arena,
|
|
@@ -401,8 +377,9 @@ async def _index_l2_internal(
|
|
|
401
377
|
/index-internal-batch which writes to all three in one round-trip.
|
|
402
378
|
|
|
403
379
|
When `embeddings` is supplied (parallel to records), L2's internal
|
|
404
|
-
embed call (used for L4-QMD population) is skipped —
|
|
405
|
-
|
|
380
|
+
embed call (used for L4-QMD population) is skipped — the shim
|
|
381
|
+
pre-computes vectors once at /store-batch level and threads them
|
|
382
|
+
through to L4_QMD via this endpoint.
|
|
406
383
|
"""
|
|
407
384
|
payload: dict[str, Any] = {
|
|
408
385
|
"arena": arena,
|
|
@@ -530,25 +507,25 @@ async def health():
|
|
|
530
507
|
nv_embed_health = urlunparse((_u.scheme, _u.netloc, "/health", "", "", ""))
|
|
531
508
|
|
|
532
509
|
import asyncio
|
|
533
|
-
l2_v,
|
|
510
|
+
l2_v, l5_v, l6_v, nv_v, l3_v = await asyncio.gather(
|
|
534
511
|
_probe(f"{L2_PROXY_URL}/health"),
|
|
535
|
-
_probe(f"{L4_VEC_URL}/health"),
|
|
536
512
|
_probe(f"{L5_MILVUS_URL}/health"),
|
|
537
513
|
_probe(f"{L6_DOC_URL}/health"),
|
|
538
514
|
_probe(nv_embed_health),
|
|
539
515
|
_probe_l3(),
|
|
540
516
|
)
|
|
541
517
|
|
|
542
|
-
# L0 BM25 (FTS5)
|
|
543
|
-
# inside the L2 proxy
|
|
544
|
-
#
|
|
518
|
+
# L0 BM25 (FTS5), L1 (always-loaded core files) and L4 QMD vec are
|
|
519
|
+
# all in-process inside the L2 proxy — L0+L1 in workspace.db / core
|
|
520
|
+
# files; L4 in qmd.sqlite which L2 opens directly. No separate runtime;
|
|
521
|
+
# if L2 is healthy, all three layers are usable. Tie their status to L2.
|
|
545
522
|
l2_ok = l2_v == "ok"
|
|
546
523
|
out["layers"] = {
|
|
547
524
|
"l0": "ok" if l2_ok else l2_v,
|
|
548
525
|
"l1": "ok" if l2_ok else l2_v,
|
|
549
526
|
"l2": l2_v,
|
|
550
527
|
"l3": l3_v,
|
|
551
|
-
"l4":
|
|
528
|
+
"l4": "ok" if l2_ok else l2_v,
|
|
552
529
|
"l5": l5_v,
|
|
553
530
|
"l6": l6_v,
|
|
554
531
|
"nv_embed": nv_v,
|
|
@@ -569,19 +546,15 @@ async def health():
|
|
|
569
546
|
"l6_vector_chunks": None,
|
|
570
547
|
"l6_fts_chunks": None,
|
|
571
548
|
}
|
|
572
|
-
# L0
|
|
549
|
+
# L0 and L4 both live inside L2 (workspace.db + qmd.sqlite directly
|
|
550
|
+
# opened by the L2 proxy). L2 exposes /index-internal-stats with both
|
|
551
|
+
# counts in one round-trip.
|
|
573
552
|
try:
|
|
574
553
|
r = await _client().get(f"{L2_PROXY_URL}/index-internal-stats", timeout=3.0)
|
|
575
554
|
if r.status_code == 200:
|
|
576
555
|
stats = r.json()
|
|
577
556
|
memories["l0_bm25_chunks"] = int(stats.get("l0_chunks") or 0)
|
|
578
|
-
|
|
579
|
-
pass
|
|
580
|
-
# L4 reports n_vectors on its own /health.
|
|
581
|
-
try:
|
|
582
|
-
r = await _client().get(f"{L4_VEC_URL}/health", timeout=3.0)
|
|
583
|
-
if r.status_code == 200:
|
|
584
|
-
memories["l4_vectors"] = int(r.json().get("n_vectors") or 0)
|
|
557
|
+
memories["l4_vectors"] = int(stats.get("l4_qmd_chunks") or 0)
|
|
585
558
|
except Exception:
|
|
586
559
|
pass
|
|
587
560
|
# L5 reports per-collection counts on /health. We surface chats —
|
|
@@ -634,8 +607,9 @@ async def health_deep():
|
|
|
634
607
|
except Exception as exc:
|
|
635
608
|
return name, {"ok": False, "status": f"unreachable: {type(exc).__name__}"}
|
|
636
609
|
|
|
610
|
+
# L4 is in-process inside L2 (qmd.sqlite direct-read) — its deep
|
|
611
|
+
# round-trip is covered by L2's /health/deep, no separate probe needed.
|
|
637
612
|
results = await asyncio.gather(
|
|
638
|
-
_probe_deep("l4", f"{L4_VEC_URL}/health/deep"),
|
|
639
613
|
_probe_deep("l5", f"{L5_MILVUS_URL}/health/deep"),
|
|
640
614
|
_probe_deep("l6", f"{L6_DOC_URL}/health/deep"),
|
|
641
615
|
)
|
|
@@ -675,15 +649,15 @@ async def store(req: StoreRequest):
|
|
|
675
649
|
# depending on which one was supplied).
|
|
676
650
|
_stash_all_keys(rid, req.metadata or {}, arena)
|
|
677
651
|
|
|
678
|
-
# Fan out to
|
|
652
|
+
# Fan out to L5 + L6 + L2-internal (L0+L4qmd+L3) in parallel.
|
|
679
653
|
import asyncio
|
|
680
|
-
|
|
681
|
-
_index_l4([record]),
|
|
654
|
+
l5_count, l6_count, l2_internal = await asyncio.gather(
|
|
682
655
|
_index_l5([record], arena=arena),
|
|
683
656
|
_index_l6([record], arena=arena),
|
|
684
657
|
_index_l2_internal([record], arena=arena),
|
|
685
658
|
)
|
|
686
659
|
|
|
660
|
+
l4_qmd_count = l2_internal.get("l4_qmd", 0)
|
|
687
661
|
return {
|
|
688
662
|
"id": rid,
|
|
689
663
|
"content": req.content,
|
|
@@ -692,8 +666,11 @@ async def store(req: StoreRequest):
|
|
|
692
666
|
"l0": l2_internal.get("l0", 0),
|
|
693
667
|
"l3_chunks": l2_internal.get("l3_chunks", 0),
|
|
694
668
|
"l3_entities": l2_internal.get("l3_entities", 0),
|
|
695
|
-
"l4_qmd":
|
|
696
|
-
|
|
669
|
+
"l4_qmd": l4_qmd_count,
|
|
670
|
+
# `l4` is aliased to L4_QMD now that the standalone L4 sqlite-vec
|
|
671
|
+
# sidecar has been dropped. Kept in the response for wire-format
|
|
672
|
+
# back-compat with callers that read engine.l4.
|
|
673
|
+
"l4": l4_qmd_count,
|
|
697
674
|
"l5": l5_count,
|
|
698
675
|
"l6": l6_count,
|
|
699
676
|
},
|
|
@@ -724,13 +701,13 @@ async def store_batch(req: StoreBatchRequest):
|
|
|
724
701
|
import asyncio
|
|
725
702
|
|
|
726
703
|
# Shared-embed mode: compute embeddings ONCE here, pass them down to
|
|
727
|
-
# every layer so they skip their own embed call. Previously
|
|
728
|
-
# +
|
|
729
|
-
#
|
|
730
|
-
#
|
|
731
|
-
#
|
|
732
|
-
#
|
|
733
|
-
#
|
|
704
|
+
# every layer so they skip their own embed call. Previously L5 + L6
|
|
705
|
+
# + L2-internal each re-embedded the same texts in parallel, which
|
|
706
|
+
# fanned 3× the gateway RPCs. The gateway throttles at K≈10 concurrent
|
|
707
|
+
# requests, so 30-way fan-out serialised into ~3 rounds of ~850ms
|
|
708
|
+
# each = ~2.5s of pure embed time per /store-batch. With shared
|
|
709
|
+
# embeddings we issue one chunked embed pass (10 sub-calls for N=50
|
|
710
|
+
# records) and skip the per-layer redundant work entirely.
|
|
734
711
|
# Disabled via PME_SHARE_EMBEDDINGS=false for operators wiring up
|
|
735
712
|
# per-layer differentiated embedders.
|
|
736
713
|
shared_embeddings: list[list[float]] | None = None
|
|
@@ -748,24 +725,25 @@ async def store_batch(req: StoreBatchRequest):
|
|
|
748
725
|
shared_embeddings = None
|
|
749
726
|
embed_ms = (time.perf_counter() - embed_t0) * 1000.0
|
|
750
727
|
|
|
751
|
-
|
|
752
|
-
_index_l4(normalised, embeddings=shared_embeddings),
|
|
728
|
+
l5_count, l6_count, l2_internal = await asyncio.gather(
|
|
753
729
|
_index_l5(normalised, arena=req.arena or "general", embeddings=shared_embeddings),
|
|
754
730
|
_index_l6(normalised, arena=req.arena or "general", embeddings=shared_embeddings),
|
|
755
731
|
_index_l2_internal(normalised, arena=req.arena or "general", embeddings=shared_embeddings),
|
|
756
732
|
)
|
|
757
733
|
dur_ms = (time.perf_counter() - t0) * 1000.0
|
|
758
734
|
|
|
735
|
+
l4_qmd_count = l2_internal.get("l4_qmd", 0)
|
|
759
736
|
return {
|
|
760
737
|
"status": "ok",
|
|
761
|
-
"inserted": max(
|
|
738
|
+
"inserted": max(l4_qmd_count, l5_count, l6_count),
|
|
762
739
|
"ids": [r["id"] for r in normalised],
|
|
763
740
|
"engine": {
|
|
764
741
|
"l0": l2_internal.get("l0", 0),
|
|
765
742
|
"l3_chunks": l2_internal.get("l3_chunks", 0),
|
|
766
743
|
"l3_entities": l2_internal.get("l3_entities", 0),
|
|
767
|
-
"l4_qmd":
|
|
768
|
-
|
|
744
|
+
"l4_qmd": l4_qmd_count,
|
|
745
|
+
# `l4` aliased to L4_QMD — sidecar dropped, see /store handler.
|
|
746
|
+
"l4": l4_qmd_count,
|
|
769
747
|
"l5": l5_count,
|
|
770
748
|
"l6": l6_count,
|
|
771
749
|
},
|
|
@@ -32,12 +32,6 @@ services:
|
|
|
32
32
|
# Pin the embedding dim explicitly across layers, independent of any
|
|
33
33
|
# developer-local .env (which may set EMBED_DIM=768 for Ollama-based
|
|
34
34
|
# local dev). The stub returns 4096; layers must agree.
|
|
35
|
-
l4:
|
|
36
|
-
environment:
|
|
37
|
-
L4_NV_EMBED_URL: http://embed-stub:8041/v1/embeddings
|
|
38
|
-
L4_EMBED_API_KEY: ""
|
|
39
|
-
L4_EMBED_DIM: "4096"
|
|
40
|
-
|
|
41
35
|
l5:
|
|
42
36
|
environment:
|
|
43
37
|
L5_NV_EMBED_URL: http://embed-stub:8041/v1/embeddings
|
|
@@ -61,6 +55,5 @@ services:
|
|
|
61
55
|
embed-stub:
|
|
62
56
|
condition: service_healthy
|
|
63
57
|
l2: { condition: service_started }
|
|
64
|
-
l4: { condition: service_started }
|
|
65
58
|
l5: { condition: service_started }
|
|
66
59
|
l6: { condition: service_started }
|
|
@@ -82,36 +82,6 @@ services:
|
|
|
82
82
|
retries: 30
|
|
83
83
|
start_period: 30s
|
|
84
84
|
|
|
85
|
-
# --------------------------------------------------------------------
|
|
86
|
-
# L4 — sqlite-vec sidecar
|
|
87
|
-
# --------------------------------------------------------------------
|
|
88
|
-
l4:
|
|
89
|
-
<<: *engine-base
|
|
90
|
-
build:
|
|
91
|
-
context: ./engine/services
|
|
92
|
-
dockerfile: l4/Dockerfile
|
|
93
|
-
container_name: pme-l4
|
|
94
|
-
# Default 18042 to avoid port collisions on 8042.
|
|
95
|
-
# Override via PME_L4_PORT for bench setups that intentionally replace it.
|
|
96
|
-
ports: ["127.0.0.1:${PME_L4_PORT:-18042}:8042"]
|
|
97
|
-
environment:
|
|
98
|
-
L4_NV_EMBED_URL: ${NV_EMBED_URL:-http://host.docker.internal:8041/v1/embeddings}
|
|
99
|
-
L4_EMBED_MODEL: ${EMBED_MODEL_NAME:-nv-embed-v2}
|
|
100
|
-
L4_EMBED_API_KEY: ${EMBED_API_KEY:-}
|
|
101
|
-
L4_EMBED_PROVIDER: ${EMBED_PROVIDER:-openai}
|
|
102
|
-
L4_EMBED_AUTODETECT: ${EMBED_AUTODETECT:-true}
|
|
103
|
-
L4_EMBED_DIM: ${EMBED_DIM:-4096}
|
|
104
|
-
L4_DB_PATH: /data/vec.db
|
|
105
|
-
extra_hosts:
|
|
106
|
-
- "host.docker.internal:host-gateway"
|
|
107
|
-
volumes:
|
|
108
|
-
- pme-l4-data:/data
|
|
109
|
-
healthcheck:
|
|
110
|
-
test: ["CMD", "python", "-c", "import urllib.request,sys; urllib.request.urlopen('http://localhost:8042/health',timeout=3)"]
|
|
111
|
-
interval: 10s
|
|
112
|
-
timeout: 5s
|
|
113
|
-
retries: 30
|
|
114
|
-
|
|
115
85
|
# --------------------------------------------------------------------
|
|
116
86
|
# L5 — Qdrant comms layer
|
|
117
87
|
# --------------------------------------------------------------------
|
|
@@ -224,13 +194,12 @@ services:
|
|
|
224
194
|
L0_URL: http://l2:8031
|
|
225
195
|
L2_PROXY_URL: http://l2:8031
|
|
226
196
|
L3_KG_URL: http://l3:7474
|
|
227
|
-
L4_VEC_URL: http://l4:8042
|
|
228
197
|
L5_MILVUS_URL: http://l5:8034
|
|
229
198
|
L6_DOC_URL: http://l6:8037
|
|
230
199
|
NV_EMBED_URL: ${NV_EMBED_URL:-http://host.docker.internal:8041/v1/embeddings}
|
|
231
200
|
# PME_ prefix vars feed the shim's EmbedClient for shared-embed
|
|
232
|
-
# mode on /store-batch (one embed call across all
|
|
233
|
-
#
|
|
201
|
+
# mode on /store-batch (one embed call across all 3 indexers vs
|
|
202
|
+
# 3 redundant calls). Match the L2 config block so both clients
|
|
234
203
|
# hit the same gateway with the same model. Set
|
|
235
204
|
# PME_SHARE_EMBEDDINGS=false to revert to per-layer embedding.
|
|
236
205
|
PME_NV_EMBED_URL: ${NV_EMBED_URL:-http://host.docker.internal:8041/v1/embeddings}
|
|
@@ -244,7 +213,6 @@ services:
|
|
|
244
213
|
- "host.docker.internal:host-gateway"
|
|
245
214
|
depends_on:
|
|
246
215
|
l2: { condition: service_started }
|
|
247
|
-
l4: { condition: service_started }
|
|
248
216
|
l5: { condition: service_started }
|
|
249
217
|
l6: { condition: service_started }
|
|
250
218
|
healthcheck:
|
|
@@ -261,6 +229,5 @@ volumes:
|
|
|
261
229
|
pme-nv-embed-cache:
|
|
262
230
|
pme-l2-data:
|
|
263
231
|
pme-l3-data:
|
|
264
|
-
pme-l4-data:
|
|
265
232
|
pme-l5-data:
|
|
266
233
|
pme-l6-data:
|
|
@@ -1,19 +0,0 @@
|
|
|
1
|
-
FROM python:3.12-slim
|
|
2
|
-
|
|
3
|
-
WORKDIR /app
|
|
4
|
-
|
|
5
|
-
RUN pip install --no-cache-dir fastapi 'uvicorn[standard]' httpx pydantic
|
|
6
|
-
|
|
7
|
-
# Build context is engine/services so the shared embed_provider module is
|
|
8
|
-
# COPYable. server.py adds engine/services to sys.path at startup, then
|
|
9
|
-
# imports from `_shared.embed_provider`.
|
|
10
|
-
COPY _shared /app/_shared
|
|
11
|
-
COPY l4/server.py /app/server.py
|
|
12
|
-
|
|
13
|
-
RUN mkdir -p /data
|
|
14
|
-
ENV L4_DB_PATH=/data/vec.db
|
|
15
|
-
ENV PORT=8042
|
|
16
|
-
|
|
17
|
-
EXPOSE 8042
|
|
18
|
-
|
|
19
|
-
CMD ["python", "server.py", "--port", "8042"]
|
|
@@ -1,315 +0,0 @@
|
|
|
1
|
-
"""
|
|
2
|
-
L4 sqlite-vec sidecar.
|
|
3
|
-
|
|
4
|
-
Vector index sidecar for the Pentatonic Memory Engine stack.
|
|
5
|
-
Exposes /health, /search, /index-batch, /refresh over HTTP.
|
|
6
|
-
|
|
7
|
-
Endpoints:
|
|
8
|
-
GET /health
|
|
9
|
-
POST /search body: {"query":"...", "limit":10}
|
|
10
|
-
POST /index-batch body: {"records":[{"id","text"}, ...]}
|
|
11
|
-
POST /refresh no-op (sqlite-vec writes are immediate)
|
|
12
|
-
|
|
13
|
-
Env:
|
|
14
|
-
L4_DB_PATH default /data/vec.db
|
|
15
|
-
L4_NV_EMBED_URL default http://nv-embed:8041/v1/embeddings
|
|
16
|
-
PORT default 8042
|
|
17
|
-
"""
|
|
18
|
-
|
|
19
|
-
from __future__ import annotations
|
|
20
|
-
|
|
21
|
-
import argparse
|
|
22
|
-
import hashlib
|
|
23
|
-
import os
|
|
24
|
-
import sqlite3
|
|
25
|
-
import struct
|
|
26
|
-
import sys
|
|
27
|
-
import time
|
|
28
|
-
from pathlib import Path
|
|
29
|
-
from typing import Any
|
|
30
|
-
|
|
31
|
-
from fastapi import FastAPI, HTTPException
|
|
32
|
-
from pydantic import BaseModel
|
|
33
|
-
|
|
34
|
-
# Shared embedding client lives at engine/services/_shared/. Add the parent of
|
|
35
|
-
# the service dir to sys.path so `from _shared.embed_provider import ...` works
|
|
36
|
-
# regardless of how the service is launched (uvicorn, python server.py, etc.).
|
|
37
|
-
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
|
|
38
|
-
from _shared.embed_provider import EmbedClient # noqa: E402
|
|
39
|
-
|
|
40
|
-
# ----------------------------------------------------------------------
|
|
41
|
-
# Config
|
|
42
|
-
# ----------------------------------------------------------------------
|
|
43
|
-
|
|
44
|
-
DB_PATH = os.environ.get("L4_DB_PATH", "/data/vec.db")
|
|
45
|
-
EMBED_DIM = int(os.environ.get("L4_EMBED_DIM", "4096"))
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
# ----------------------------------------------------------------------
|
|
50
|
-
# DB helpers
|
|
51
|
-
# ----------------------------------------------------------------------
|
|
52
|
-
|
|
53
|
-
def _vec_to_blob(vec: list[float]) -> bytes:
|
|
54
|
-
"""Pack a list of floats as little-endian f32 bytes for sqlite-vec."""
|
|
55
|
-
return struct.pack(f"<{len(vec)}f", *vec)
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
def _blob_to_vec(blob: bytes) -> list[float]:
|
|
59
|
-
n = len(blob) // 4
|
|
60
|
-
return list(struct.unpack(f"<{n}f", blob))
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
def _cosine(a: list[float], b: list[float]) -> float:
|
|
64
|
-
import math
|
|
65
|
-
dot = sum(x * y for x, y in zip(a, b))
|
|
66
|
-
na = math.sqrt(sum(x * x for x in a))
|
|
67
|
-
nb = math.sqrt(sum(y * y for y in b))
|
|
68
|
-
if na == 0 or nb == 0:
|
|
69
|
-
return 0.0
|
|
70
|
-
return dot / (na * nb)
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
def _get_db() -> sqlite3.Connection:
|
|
74
|
-
"""Open DB and ensure schema. We use plain BLOB columns rather than
|
|
75
|
-
the sqlite-vec virtual table because sqlite-vec is an optional ext
|
|
76
|
-
that may not be loadable in every container — plain BLOB lets us
|
|
77
|
-
fall back to a Python-side cosine pass without losing correctness.
|
|
78
|
-
"""
|
|
79
|
-
Path(DB_PATH).parent.mkdir(parents=True, exist_ok=True)
|
|
80
|
-
conn = sqlite3.connect(DB_PATH, timeout=10)
|
|
81
|
-
conn.execute("PRAGMA journal_mode=WAL")
|
|
82
|
-
conn.execute("""
|
|
83
|
-
CREATE TABLE IF NOT EXISTS chunks (
|
|
84
|
-
id TEXT PRIMARY KEY,
|
|
85
|
-
text TEXT,
|
|
86
|
-
embedding BLOB,
|
|
87
|
-
indexed_at REAL
|
|
88
|
-
)
|
|
89
|
-
""")
|
|
90
|
-
return conn
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
# ----------------------------------------------------------------------
|
|
94
|
-
# Embedding client
|
|
95
|
-
# ----------------------------------------------------------------------
|
|
96
|
-
|
|
97
|
-
_embed: EmbedClient | None = None
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
def _embed_client() -> EmbedClient:
|
|
101
|
-
"""Lazily build the embed client so env vars are read at first use."""
|
|
102
|
-
global _embed
|
|
103
|
-
if _embed is None:
|
|
104
|
-
_embed = EmbedClient.from_env(
|
|
105
|
-
prefix="L4_",
|
|
106
|
-
default_url="http://nv-embed:8041/v1/embeddings",
|
|
107
|
-
)
|
|
108
|
-
return _embed
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
async def _embed_batch(texts: list[str]) -> list[list[float]]:
|
|
112
|
-
"""Embed a batch of texts via the shared EmbedClient."""
|
|
113
|
-
return await _embed_client().embed_batch_async(texts)
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
# ----------------------------------------------------------------------
|
|
117
|
-
# FastAPI
|
|
118
|
-
# ----------------------------------------------------------------------
|
|
119
|
-
|
|
120
|
-
class SearchRequest(BaseModel):
|
|
121
|
-
query: str
|
|
122
|
-
limit: int = 10
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
class IndexBatchRequest(BaseModel):
|
|
126
|
-
records: list[dict[str, Any]]
|
|
127
|
-
# When supplied (parallel to `records`), skip the embed call and use
|
|
128
|
-
# these vectors directly. Compat shim populates this when shared-embed
|
|
129
|
-
# mode is on so we don't duplicate the embed work across layers.
|
|
130
|
-
embeddings: list[list[float]] | None = None
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
app = FastAPI(title="L4 sqlite-vec sidecar (OSS)")
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
@app.get("/health")
|
|
137
|
-
def health():
|
|
138
|
-
try:
|
|
139
|
-
conn = _get_db()
|
|
140
|
-
n = conn.execute("SELECT COUNT(*) FROM chunks").fetchone()[0]
|
|
141
|
-
conn.close()
|
|
142
|
-
return {"status": "ok", "loaded": True, "n_vectors": n,
|
|
143
|
-
"dim": EMBED_DIM, "db_path": DB_PATH,
|
|
144
|
-
# BLOB+Python-cosine is the intentional implementation path,
|
|
145
|
-
# not a degraded fallback (see _get_db docstring). The previous
|
|
146
|
-
# "sqlite-vec-fallback" label gave operators the wrong signal.
|
|
147
|
-
"backend": "sqlite-vec"}
|
|
148
|
-
except Exception as exc:
|
|
149
|
-
return {"status": "degraded", "error": str(exc)}
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
@app.post("/search")
|
|
153
|
-
async def search(req: SearchRequest):
|
|
154
|
-
if not req.query:
|
|
155
|
-
return []
|
|
156
|
-
try:
|
|
157
|
-
embs = await _embed_batch([req.query])
|
|
158
|
-
if not embs or embs[0] is None:
|
|
159
|
-
raise HTTPException(status_code=502, detail="embed failed")
|
|
160
|
-
q_vec = embs[0]
|
|
161
|
-
except Exception as exc:
|
|
162
|
-
raise HTTPException(status_code=502, detail=f"embed: {exc}")
|
|
163
|
-
|
|
164
|
-
conn = _get_db()
|
|
165
|
-
rows = conn.execute("SELECT id, text, embedding FROM chunks").fetchall()
|
|
166
|
-
conn.close()
|
|
167
|
-
|
|
168
|
-
# Cosine similarity in Python — fine for OSS / small corpora. For
|
|
169
|
-
# large corpora: consider a dedicated vector DB.
|
|
170
|
-
scored: list[tuple[float, str, str]] = []
|
|
171
|
-
for rid, text, blob in rows:
|
|
172
|
-
if not blob:
|
|
173
|
-
continue
|
|
174
|
-
v = _blob_to_vec(blob)
|
|
175
|
-
if len(v) != len(q_vec):
|
|
176
|
-
continue
|
|
177
|
-
s = _cosine(q_vec, v)
|
|
178
|
-
scored.append((s, rid, text))
|
|
179
|
-
scored.sort(reverse=True)
|
|
180
|
-
out = [
|
|
181
|
-
{"path": rid, "text": text, "score": float(s),
|
|
182
|
-
"source": "L4-sqlite-vec", "layer": "L4"}
|
|
183
|
-
for s, rid, text in scored[: req.limit]
|
|
184
|
-
]
|
|
185
|
-
return out
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
@app.post("/index-batch")
|
|
189
|
-
async def index_batch(req: IndexBatchRequest):
|
|
190
|
-
if not req.records:
|
|
191
|
-
return {"status": "ok", "inserted": 0}
|
|
192
|
-
texts = [(r.get("text") or r.get("content") or "")[:8192] for r in req.records]
|
|
193
|
-
t0 = time.perf_counter()
|
|
194
|
-
# Shared-embed shortcut: caller (compat shim) computed vectors once
|
|
195
|
-
# and forwards them so we skip the embed RPC. Length must match
|
|
196
|
-
# records — defensive bail if it doesn't.
|
|
197
|
-
if req.embeddings is not None and len(req.embeddings) == len(req.records):
|
|
198
|
-
embs = req.embeddings
|
|
199
|
-
else:
|
|
200
|
-
embs = await _embed_batch(texts)
|
|
201
|
-
embed_ms = (time.perf_counter() - t0) * 1000.0
|
|
202
|
-
|
|
203
|
-
conn = _get_db()
|
|
204
|
-
t1 = time.perf_counter()
|
|
205
|
-
rows = []
|
|
206
|
-
for r, emb, txt in zip(req.records, embs, texts):
|
|
207
|
-
if not emb:
|
|
208
|
-
continue
|
|
209
|
-
rid = r.get("id") or hashlib.sha1(txt.encode("utf-8")).hexdigest()[:32]
|
|
210
|
-
rows.append((rid, txt, _vec_to_blob(emb), time.time()))
|
|
211
|
-
if rows:
|
|
212
|
-
conn.executemany(
|
|
213
|
-
"INSERT OR REPLACE INTO chunks(id, text, embedding, indexed_at) "
|
|
214
|
-
"VALUES (?, ?, ?, ?)", rows,
|
|
215
|
-
)
|
|
216
|
-
conn.commit()
|
|
217
|
-
insert_ms = (time.perf_counter() - t1) * 1000.0
|
|
218
|
-
conn.close()
|
|
219
|
-
return {"status": "ok", "inserted": len(rows),
|
|
220
|
-
"embed_ms": round(embed_ms, 1), "insert_ms": round(insert_ms, 1)}
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
@app.post("/refresh")
|
|
224
|
-
def refresh():
|
|
225
|
-
"""No-op for sqlite-vec — writes are immediate. Kept for API parity."""
|
|
226
|
-
return {"status": "ok", "noop": True}
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
# ----------------------------------------------------------------------
|
|
230
|
-
# /health/deep — synthetic round-trip
|
|
231
|
-
# ----------------------------------------------------------------------
|
|
232
|
-
|
|
233
|
-
# Fixed sentinel id used by /health/deep. Upserted on every probe call,
|
|
234
|
-
# so the row is idempotent. Kept under id="__healthcheck__sentinel" so
|
|
235
|
-
# the L4 corpus has at most one healthcheck row regardless of probe rate.
|
|
236
|
-
_HEALTH_SENTINEL_ID = "__healthcheck__sentinel"
|
|
237
|
-
_HEALTH_SENTINEL_TEXT = (
|
|
238
|
-
"healthcheck sentinel — embed-write-search round-trip verifier"
|
|
239
|
-
)
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
@app.get("/health/deep")
|
|
243
|
-
async def health_deep():
|
|
244
|
-
"""Real functional probe: embed → write → search the sentinel.
|
|
245
|
-
|
|
246
|
-
Catches the class of failure that plain /health misses — broken
|
|
247
|
-
embed paths, write 500s, query path bugs — i.e. exactly the bug
|
|
248
|
-
shape that silently degraded L6 from v0.8.0 → v0.8.2.
|
|
249
|
-
|
|
250
|
-
Returns:
|
|
251
|
-
{status, embed_ms, write_ms, search_ms, hit, ok}
|
|
252
|
-
|
|
253
|
-
`hit` confirms the sentinel was returned from search; `ok` is the
|
|
254
|
-
aggregate `embed_ok AND write_ok AND hit`. HTTP status stays 200
|
|
255
|
-
regardless so callers can read the body for diagnostics; status:
|
|
256
|
-
field carries the verdict.
|
|
257
|
-
"""
|
|
258
|
-
t_total = time.perf_counter()
|
|
259
|
-
out: dict[str, Any] = {"status": "ok", "ok": True}
|
|
260
|
-
try:
|
|
261
|
-
t0 = time.perf_counter()
|
|
262
|
-
embs = await _embed_batch([_HEALTH_SENTINEL_TEXT])
|
|
263
|
-
out["embed_ms"] = round((time.perf_counter() - t0) * 1000.0, 1)
|
|
264
|
-
if not embs or not embs[0]:
|
|
265
|
-
out["status"] = "embed_failed"
|
|
266
|
-
out["ok"] = False
|
|
267
|
-
return out
|
|
268
|
-
vec = embs[0]
|
|
269
|
-
except Exception as exc:
|
|
270
|
-
out["status"] = f"embed_error: {type(exc).__name__}"
|
|
271
|
-
out["ok"] = False
|
|
272
|
-
return out
|
|
273
|
-
|
|
274
|
-
try:
|
|
275
|
-
conn = _get_db()
|
|
276
|
-
t1 = time.perf_counter()
|
|
277
|
-
conn.execute(
|
|
278
|
-
"INSERT OR REPLACE INTO chunks(id, text, embedding, indexed_at) "
|
|
279
|
-
"VALUES (?, ?, ?, ?)",
|
|
280
|
-
(_HEALTH_SENTINEL_ID, _HEALTH_SENTINEL_TEXT, _vec_to_blob(vec), time.time()),
|
|
281
|
-
)
|
|
282
|
-
conn.commit()
|
|
283
|
-
out["write_ms"] = round((time.perf_counter() - t1) * 1000.0, 1)
|
|
284
|
-
|
|
285
|
-
t2 = time.perf_counter()
|
|
286
|
-
rows = conn.execute("SELECT id, embedding FROM chunks WHERE id = ?",
|
|
287
|
-
(_HEALTH_SENTINEL_ID,)).fetchone()
|
|
288
|
-
out["search_ms"] = round((time.perf_counter() - t2) * 1000.0, 1)
|
|
289
|
-
conn.close()
|
|
290
|
-
except Exception as exc:
|
|
291
|
-
out["status"] = f"db_error: {type(exc).__name__}"
|
|
292
|
-
out["ok"] = False
|
|
293
|
-
return out
|
|
294
|
-
|
|
295
|
-
out["hit"] = rows is not None
|
|
296
|
-
if not out["hit"]:
|
|
297
|
-
out["status"] = "sentinel_missing"
|
|
298
|
-
out["ok"] = False
|
|
299
|
-
out["total_ms"] = round((time.perf_counter() - t_total) * 1000.0, 1)
|
|
300
|
-
return out
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
# ----------------------------------------------------------------------
|
|
304
|
-
# Entrypoint
|
|
305
|
-
# ----------------------------------------------------------------------
|
|
306
|
-
|
|
307
|
-
if __name__ == "__main__":
|
|
308
|
-
parser = argparse.ArgumentParser()
|
|
309
|
-
parser.add_argument("--port", type=int, default=int(os.environ.get("PORT", "8042")))
|
|
310
|
-
parser.add_argument("--data-dir", default=None)
|
|
311
|
-
args = parser.parse_args()
|
|
312
|
-
if args.data_dir:
|
|
313
|
-
os.environ["L4_DB_PATH"] = str(Path(args.data_dir) / "vec.db")
|
|
314
|
-
import uvicorn
|
|
315
|
-
uvicorn.run("server:app", host="0.0.0.0", port=args.port, log_level="info")
|