@pentatonic-ai/ai-agent-sdk 0.9.2 → 0.9.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +1 -1
- package/dist/index.js +1 -1
- package/package.json +1 -1
- package/packages/memory/package-lock.json +3 -3
- package/packages/memory-engine/compat/Dockerfile +12 -1
- package/packages/memory-engine/compat/server.py +135 -55
- package/packages/memory-engine/docker-compose.test.yml +0 -7
- package/packages/memory-engine/docker-compose.yml +16 -35
- package/packages/memory-engine/engine/services/l2/l2-hybridrag-proxy.py +19 -1
- package/packages/memory-engine/engine/services/l5/l5-comms-layer.py +11 -5
- package/packages/memory-engine/engine/services/l6/l6-document-store.py +11 -5
- package/packages/memory-engine/engine/services/l4/Dockerfile +0 -19
- package/packages/memory-engine/engine/services/l4/server.py +0 -305
package/dist/index.cjs
CHANGED
@@ -906,7 +906,7 @@ function fireAndForgetEmit(clientConfig, sessionOpts, messages, result, model) {
 }

 // src/telemetry.js
-var VERSION = "0.9.2";
+var VERSION = "0.9.4";
 var TELEMETRY_URL = "https://sdk-telemetry.philip-134.workers.dev";
 function machineId() {
   const raw = typeof process !== "undefined" ? `${process.env?.USER || process.env?.USERNAME || "u"}:${process.platform || "x"}:${process.arch || "x"}` : "browser";
package/dist/index.js
CHANGED
@@ -875,7 +875,7 @@ function fireAndForgetEmit(clientConfig, sessionOpts, messages, result, model) {
 }

 // src/telemetry.js
-var VERSION = "0.9.2";
+var VERSION = "0.9.4";
 var TELEMETRY_URL = "https://sdk-telemetry.philip-134.workers.dev";
 function machineId() {
   const raw = typeof process !== "undefined" ? `${process.env?.USER || process.env?.USERNAME || "u"}:${process.platform || "x"}:${process.arch || "x"}` : "browser";
package/package.json
CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "@pentatonic-ai/ai-agent-sdk",
-  "version": "0.9.2",
+  "version": "0.9.4",
   "description": "TES SDK — LLM observability and lifecycle tracking via Pentatonic Thing Event System. Track token usage, tool calls, and conversations. Manage things through event-sourced lifecycle stages with AI enrichment and vector search.",
   "type": "module",
   "main": "./dist/index.cjs",
package/packages/memory/package-lock.json
CHANGED
@@ -568,9 +568,9 @@
       }
     },
     "node_modules/hono": {
-      "version": "4.12.…",
-      "resolved": "https://registry.npmjs.org/hono/-/hono-4.12.….tgz",
-      "integrity": "sha512-…",
+      "version": "4.12.18",
+      "resolved": "https://registry.npmjs.org/hono/-/hono-4.12.18.tgz",
+      "integrity": "sha512-RWzP96k/yv0PQfyXnWjs6zot20TqfpfsNXhOnev8d1InAxubW93L11/oNUc3tQqn2G0bSdAOBpX+2uDFHV7kdQ==",
       "license": "MIT",
       "engines": {
         "node": ">=16.9.0"
package/packages/memory-engine/compat/Dockerfile
CHANGED
@@ -4,7 +4,18 @@ WORKDIR /app

 RUN pip install --no-cache-dir fastapi uvicorn[standard] httpx pydantic

-
+# Build context is the memory-engine root (see docker-compose.yml). The
+# shim's server.py side-loads engine/services/_shared/embed_provider.py
+# for shared-embed mode on /store-batch (one embed call across all 4
+# layer indexers vs 4 redundant calls).
+COPY compat/server.py /app/server.py
+# server.py's sys.path.insert resolves "../engine/services" relative to
+# its own location (/app/server.py → /engine/services). Mirror that
+# layout so the import works without changing the runtime code.
+COPY engine/services/_shared /engine/services/_shared
+# Make `_shared` an importable package (mirror the layer services'
+# layout where __init__.py exists or python detects PEP 420 namespace).
+RUN touch /engine/services/__init__.py

 EXPOSE 8099

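Reviewer note on the path gymnastics above: with server.py COPY'd to /app/server.py, the sys.path entry it constructs resolves to /engine/services, which is exactly where the second COPY plus the touch land the `_shared` package. A quick, purely illustrative check of that resolution in Python:

import os

# Where the Dockerfile above places the shim.
server_py = "/app/server.py"

# Mirrors the sys.path.insert expression in compat/server.py:
# dirname("/app/server.py") == "/app"; "/app/../engine/services"
# normalises to "/engine/services" — the COPY target above.
services_dir = os.path.normpath(
    os.path.join(os.path.dirname(server_py), "..", "engine", "services")
)
assert services_dir == "/engine/services"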
package/packages/memory-engine/compat/server.py
CHANGED
@@ -25,7 +25,6 @@ Environment:
     L0_URL        default http://l0:8030
     L2_PROXY_URL  default http://l2:8031
     L3_KG_URL     default http://l3:8047
-    L4_VEC_URL    default http://l4:8042
     L5_MILVUS_URL default http://l5:8035
     L6_DOC_URL    default http://l6:8037
     NV_EMBED_URL  default http://nv-embed:8041/v1/embeddings
@@ -34,6 +33,7 @@ Environment:

 import hashlib
 import os
+import sys
 import time
 from datetime import datetime, timezone
 from typing import Any, Optional
@@ -42,6 +42,17 @@ import httpx
 from fastapi import FastAPI, HTTPException
 from pydantic import BaseModel, Field

+# Reach into the engine/services tree so we can reuse EmbedClient. The
+# tree isn't a real installed package; layer services and the compat
+# shim both side-load it the same way. Keeps the chunking + auto-detect
+# behaviour identical between the shim's pre-embed and the per-layer
+# embeds that previously did the same work N times.
+sys.path.insert(
+    0,
+    os.path.join(os.path.dirname(__file__), "..", "engine", "services"),
+)
+from _shared.embed_provider import EmbedClient  # noqa: E402
+
 # ----------------------------------------------------------------------
 # Config
 # ----------------------------------------------------------------------
@@ -49,7 +60,6 @@ from pydantic import BaseModel, Field
 L0_URL = os.environ.get("L0_URL", "http://l0:8030")
 L2_PROXY_URL = os.environ.get("L2_PROXY_URL", "http://l2:8031")
 L3_KG_URL = os.environ.get("L3_KG_URL", "http://l3:8047")
-L4_VEC_URL = os.environ.get("L4_VEC_URL", "http://l4:8042")
 L5_MILVUS_URL = os.environ.get("L5_MILVUS_URL", "http://l5:8035")
 L6_DOC_URL = os.environ.get("L6_DOC_URL", "http://l6:8037")
 NV_EMBED_URL = os.environ.get("NV_EMBED_URL", "http://nv-embed:8041/v1/embeddings")
@@ -63,6 +73,30 @@ NEO4J_DB = os.environ.get("NEO4J_DB", "neo4j")

 PORT = int(os.environ.get("PORT", "8099"))

+# Shared-embed mode. When on, /store-batch computes embeddings once at
+# the shim level and forwards them to each layer's /index-batch so the
+# layer skips its own embed call. Cuts gateway RPC count by ~4× (L4 +
+# L5 + L6 + L2-internal all did the same embed work independently).
+# Default ON because all layers in this engine use the same NV-Embed
+# model; disable if you ever wire up per-layer differentiated embedders
+# (e.g. cohere on L5, openai on L4).
+SHARE_EMBEDDINGS = os.environ.get("PME_SHARE_EMBEDDINGS", "true").lower() == "true"
+
+_embed_client: EmbedClient | None = None
+
+
+def _get_embed_client() -> EmbedClient:
+    """Lazy-init the shim's EmbedClient using PME_-prefixed env vars
+    (matches L2's pattern). Cached for the process lifetime so the
+    auto-detect handshake only happens once."""
+    global _embed_client
+    if _embed_client is None:
+        _embed_client = EmbedClient.from_env(
+            prefix="PME_",
+            default_url=NV_EMBED_URL,
+        )
+    return _embed_client
+

 # Layer types we surface as the SDK 4-layer projection. Engine stores
 # everything as chunks tagged with arena + layer_type metadata; this
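One subtlety in the SHARE_EMBEDDINGS parse above worth flagging for operators: only the literal string "true" (any casing) enables the mode, so compose-style truthy values like "1" or "yes" silently leave it off. The exact comparison, isolated:

def parse_share_flag(value: str) -> bool:
    # Same expression the shim uses: .lower() == "true", nothing fuzzier.
    return value.lower() == "true"

assert parse_share_flag("true") is True
assert parse_share_flag("TRUE") is True
assert parse_share_flag("false") is False
# Truthy-looking values that do NOT enable shared-embed mode:
assert parse_share_flag("1") is False
assert parse_share_flag("yes") is False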
@@ -252,28 +286,23 @@ async def _embed_batch(texts: list[str]) -> list[list[float]]:
     return [d["embedding"] for d in resp.json()["data"]]


-async def 
-
-
-
-
-    ]}
-    try:
-        resp = await _client().post(f"{L4_VEC_URL}/index-batch", json=payload, timeout=120.0)
-        resp.raise_for_status()
-        return resp.json().get("inserted", 0)
-    except Exception as exc:
-        print(f"[shim] L4 index-batch failed: {exc}")
-        return 0
-
-
-async def _index_l5(records: list[dict[str, Any]], arena: str = "general") -> int:
+async def _index_l5(
+    records: list[dict[str, Any]],
+    arena: str = "general",
+    embeddings: list[list[float]] | None = None,
+) -> int:
     """Index records into the L5 Milvus comms layer (chats collection).

     arena is forwarded as a Milvus dynamic field so /search can filter
     by arena natively (vs the shim's defence-in-depth post-filter).
+
+    When `embeddings` is supplied (parallel to records), L5 skips its
+    own embed call — the shim pre-computes vectors once at /store-batch
+    level and threads them through each layer to avoid 3× redundant
+    embed RPCs (L5 + L6 + L2-internal otherwise each re-embed the same
+    texts in parallel).
     """
-    payload = {
+    payload: dict[str, Any] = {
         "collection": "chats",
         "records": [
             {
@@ -287,21 +316,32 @@ async def _index_l5(records: list[dict[str, Any]], arena: str = "general") -> in
             for r in records
         ],
     }
+    if embeddings is not None:
+        payload["embeddings"] = embeddings
     try:
         resp = await _client().post(f"{L5_MILVUS_URL}/index-batch", json=payload, timeout=60.0)
         resp.raise_for_status()
         return resp.json().get("inserted", 0)
     except Exception as exc:
-        # Best-effort: L5 is one of
-        # mean the record is unsearchable. L0 BM25 + L4
-        # all carry it independently.
+        # Best-effort: L5 is one of five redundant layers; failure here
+        # doesn't mean the record is unsearchable. L0 BM25 + L4 QMD +
+        # L6 doc-store all carry it independently.
         print(f"[shim] L5 index-batch failed: {exc}")
         return 0


-async def _index_l6(records: list[dict[str, Any]], arena: str = "general") -> int:
-
-
+async def _index_l6(
+    records: list[dict[str, Any]],
+    arena: str = "general",
+    embeddings: list[list[float]] | None = None,
+) -> int:
+    """Index records into the L6 document store.
+
+    When `embeddings` is supplied (parallel to records), L6 skips its
+    own embed call — the shim pre-computes vectors once at /store-batch
+    level and threads them through each layer.
+    """
+    payload: dict[str, Any] = {
         "arena": arena,
         "records": [
             {
@@ -314,6 +354,8 @@ async def _index_l6(records: list[dict[str, Any]], arena: str = "general") -> in
             for r in records
         ],
     }
+    if embeddings is not None:
+        payload["embeddings"] = embeddings
     try:
         resp = await _client().post(f"{L6_DOC_URL}/index-batch", json=payload, timeout=120.0)
         resp.raise_for_status()
@@ -323,14 +365,23 @@ async def _index_l6(records: list[dict[str, Any]], arena: str = "general") -> in
         return 0


-async def _index_l2_internal(records: list[dict[str, Any]], arena: str = "general") -> dict:
+async def _index_l2_internal(
+    records: list[dict[str, Any]],
+    arena: str = "general",
+    embeddings: list[list[float]] | None = None,
+) -> dict:
     """Populate L2's internal stores: L0 BM25 + L4 QMD vec + L3 Neo4j KG.

     Without this, L2's RRF fusion runs over empty L0/L4-qmd/L3 layers and
     those zero-result rank lists pollute the score. The L2 proxy exposes
     /index-internal-batch which writes to all three in one round-trip.
+
+    When `embeddings` is supplied (parallel to records), L2's internal
+    embed call (used for L4-QMD population) is skipped — the shim
+    pre-computes vectors once at /store-batch level and threads them
+    through to L4_QMD via this endpoint.
     """
-    payload = {
+    payload: dict[str, Any] = {
         "arena": arena,
         "records": [
             {
@@ -341,6 +392,8 @@ async def _index_l2_internal(records: list[dict[str, Any]], arena: str = "genera
             for r in records
         ],
     }
+    if embeddings is not None:
+        payload["embeddings"] = embeddings
     try:
         resp = await _client().post(f"{L2_PROXY_URL}/index-internal-batch",
                                     json=payload, timeout=180.0)
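For concreteness, the wire shape these three indexers now forward — `embeddings` rides alongside `records` and must stay index-aligned with it. The record keys and vector values below are hypothetical stand-ins; only the key layout is taken from the payload builders above:

# Sketch of an _index_l5-style payload in shared-embed mode.
payload = {
    "collection": "chats",
    "records": [
        {"id": "rec-1", "text": "first chunk"},   # hypothetical records
        {"id": "rec-2", "text": "second chunk"},
    ],
    # Parallel to `records`: embeddings[i] is the vector for records[i].
    # Omitting the key makes the layer fall back to its own embed call.
    "embeddings": [
        [0.012, -0.034, ...],  # 4096 floats each in the real payload
        [0.056, 0.001, ...],
    ],
}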
@@ -454,25 +507,25 @@ async def health():
     nv_embed_health = urlunparse((_u.scheme, _u.netloc, "/health", "", "", ""))

     import asyncio
-    l2_v, l4_v, l5_v, l6_v, nv_v, l3_v = await asyncio.gather(
+    l2_v, l5_v, l6_v, nv_v, l3_v = await asyncio.gather(
         _probe(f"{L2_PROXY_URL}/health"),
-        _probe(f"{L4_VEC_URL}/health"),
         _probe(f"{L5_MILVUS_URL}/health"),
         _probe(f"{L6_DOC_URL}/health"),
         _probe(nv_embed_health),
         _probe_l3(),
     )

-    # L0 BM25 (FTS5)
-    # inside the L2 proxy
-    #
+    # L0 BM25 (FTS5), L1 (always-loaded core files) and L4 QMD vec are
+    # all in-process inside the L2 proxy — L0+L1 in workspace.db / core
+    # files; L4 in qmd.sqlite which L2 opens directly. No separate runtime;
+    # if L2 is healthy, all three layers are usable. Tie their status to L2.
     l2_ok = l2_v == "ok"
     out["layers"] = {
         "l0": "ok" if l2_ok else l2_v,
         "l1": "ok" if l2_ok else l2_v,
         "l2": l2_v,
         "l3": l3_v,
-        "l4": l4_v,
+        "l4": "ok" if l2_ok else l2_v,
         "l5": l5_v,
         "l6": l6_v,
         "nv_embed": nv_v,
@@ -493,19 +546,15 @@
         "l6_vector_chunks": None,
         "l6_fts_chunks": None,
     }
-    # L0
+    # L0 and L4 both live inside L2 (workspace.db + qmd.sqlite directly
+    # opened by the L2 proxy). L2 exposes /index-internal-stats with both
+    # counts in one round-trip.
     try:
         r = await _client().get(f"{L2_PROXY_URL}/index-internal-stats", timeout=3.0)
         if r.status_code == 200:
             stats = r.json()
             memories["l0_bm25_chunks"] = int(stats.get("l0_chunks") or 0)
-    except Exception:
-        pass
-    # L4 reports n_vectors on its own /health.
-    try:
-        r = await _client().get(f"{L4_VEC_URL}/health", timeout=3.0)
-        if r.status_code == 200:
-            memories["l4_vectors"] = int(r.json().get("n_vectors") or 0)
+            memories["l4_vectors"] = int(stats.get("l4_qmd_chunks") or 0)
     except Exception:
         pass
     # L5 reports per-collection counts on /health. We surface chats —
@@ -558,8 +607,9 @@ async def health_deep():
     except Exception as exc:
         return name, {"ok": False, "status": f"unreachable: {type(exc).__name__}"}

+    # L4 is in-process inside L2 (qmd.sqlite direct-read) — its deep
+    # round-trip is covered by L2's /health/deep, no separate probe needed.
     results = await asyncio.gather(
-        _probe_deep("l4", f"{L4_VEC_URL}/health/deep"),
        _probe_deep("l5", f"{L5_MILVUS_URL}/health/deep"),
        _probe_deep("l6", f"{L6_DOC_URL}/health/deep"),
     )
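Worth spelling out what the health hunks above change: three of the reported layers (l0, l1, l4) are never probed over the network at all — their status is a pure projection of L2's. A condensed sketch of that mapping (probe values hypothetical):

def project_layers(l2_v: str, l3_v: str, l5_v: str, l6_v: str, nv_v: str) -> dict:
    # l0, l1 and l4 run in-process inside the L2 proxy, so they inherit
    # L2's verdict: "ok" when L2 is ok, otherwise L2's error string.
    derived = "ok" if l2_v == "ok" else l2_v
    return {
        "l0": derived, "l1": derived, "l2": l2_v, "l3": l3_v,
        "l4": derived, "l5": l5_v, "l6": l6_v, "nv_embed": nv_v,
    }

# When L2 is down, the in-process layers surface L2's error verbatim:
layers = project_layers("unreachable: ConnectError", "ok", "ok", "ok", "ok")
assert layers["l4"] == "unreachable: ConnectError"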
@@ -599,15 +649,15 @@ async def store(req: StoreRequest):
     # depending on which one was supplied).
     _stash_all_keys(rid, req.metadata or {}, arena)

-    # Fan out to
+    # Fan out to L5 + L6 + L2-internal (L0+L4qmd+L3) in parallel.
     import asyncio
-    l4_count, l5_count, l6_count, l2_internal = await asyncio.gather(
-        _index_l4([record]),
+    l5_count, l6_count, l2_internal = await asyncio.gather(
         _index_l5([record], arena=arena),
         _index_l6([record], arena=arena),
         _index_l2_internal([record], arena=arena),
     )

+    l4_qmd_count = l2_internal.get("l4_qmd", 0)
     return {
         "id": rid,
         "content": req.content,
@@ -616,8 +666,11 @@
             "l0": l2_internal.get("l0", 0),
             "l3_chunks": l2_internal.get("l3_chunks", 0),
             "l3_entities": l2_internal.get("l3_entities", 0),
-            "l4_qmd": l2_internal.get("l4_qmd", 0),
-
+            "l4_qmd": l4_qmd_count,
+            # `l4` is aliased to L4_QMD now that the standalone L4 sqlite-vec
+            # sidecar has been dropped. Kept in the response for wire-format
+            # back-compat with callers that read engine.l4.
+            "l4": l4_qmd_count,
             "l5": l5_count,
             "l6": l6_count,
         },
@@ -646,24 +699,51 @@ async def store_batch(req: StoreBatchRequest):

     t0 = time.perf_counter()
     import asyncio
-
-
-
-
-
+
+    # Shared-embed mode: compute embeddings ONCE here, pass them down to
+    # every layer so they skip their own embed call. Previously L5 + L6
+    # + L2-internal each re-embedded the same texts in parallel, which
+    # fanned 3× the gateway RPCs. The gateway throttles at K≈10 concurrent
+    # requests, so 30-way fan-out serialised into ~3 rounds of ~850ms
+    # each = ~2.5s of pure embed time per /store-batch. With shared
+    # embeddings we issue one chunked embed pass (10 sub-calls for N=50
+    # records) and skip the per-layer redundant work entirely.
+    # Disabled via PME_SHARE_EMBEDDINGS=false for operators wiring up
+    # per-layer differentiated embedders.
+    shared_embeddings: list[list[float]] | None = None
+    embed_ms = 0.0
+    if SHARE_EMBEDDINGS and normalised:
+        texts = [r["content"] for r in normalised]
+        embed_t0 = time.perf_counter()
+        try:
+            shared_embeddings = await _get_embed_client().embed_batch_async(texts)
+        except Exception as exc:
+            # Fall back to per-layer embedding rather than failing the
+            # whole batch. The layers' /index-batch still works when
+            # `embeddings` is absent.
+            print(f"[shim] shared embed failed, falling back to per-layer: {exc}")
+            shared_embeddings = None
+        embed_ms = (time.perf_counter() - embed_t0) * 1000.0
+
+    l5_count, l6_count, l2_internal = await asyncio.gather(
+        _index_l5(normalised, arena=req.arena or "general", embeddings=shared_embeddings),
+        _index_l6(normalised, arena=req.arena or "general", embeddings=shared_embeddings),
+        _index_l2_internal(normalised, arena=req.arena or "general", embeddings=shared_embeddings),
     )
     dur_ms = (time.perf_counter() - t0) * 1000.0

+    l4_qmd_count = l2_internal.get("l4_qmd", 0)
     return {
         "status": "ok",
-        "inserted": max(
+        "inserted": max(l4_qmd_count, l5_count, l6_count),
         "ids": [r["id"] for r in normalised],
         "engine": {
             "l0": l2_internal.get("l0", 0),
             "l3_chunks": l2_internal.get("l3_chunks", 0),
             "l3_entities": l2_internal.get("l3_entities", 0),
-            "l4_qmd": l2_internal.get("l4_qmd", 0),
-
+            "l4_qmd": l4_qmd_count,
+            # `l4` aliased to L4_QMD — sidecar dropped, see /store handler.
+            "l4": l4_qmd_count,
             "l5": l5_count,
             "l6": l6_count,
         },
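The ~2.5s claim in the /store-batch comment above is straightforward queueing arithmetic; checking it with the constants the comment itself supplies (K≈10 gateway slots, ~850ms per embed round, 10 chunked sub-calls per embed pass):

import math

GATEWAY_CONCURRENCY = 10  # K: concurrent requests the gateway admits
ROUND_MS = 850            # per the comment: ~850ms per round

# Before: L5 + L6 + L2-internal each ran their own 10-sub-call embed
# pass, so 30 requests contended for 10 slots -> 3 serialised rounds.
rounds_before = math.ceil(3 * 10 / GATEWAY_CONCURRENCY)
# After: one shared pass of 10 sub-calls fits in a single round.
rounds_after = math.ceil(10 / GATEWAY_CONCURRENCY)

assert rounds_before * ROUND_MS == 2550   # ~2.5s, as the comment says
assert rounds_after * ROUND_MS == 850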
package/packages/memory-engine/docker-compose.test.yml
CHANGED
@@ -32,12 +32,6 @@ services:
   # Pin the embedding dim explicitly across layers, independent of any
   # developer-local .env (which may set EMBED_DIM=768 for Ollama-based
   # local dev). The stub returns 4096; layers must agree.
-  l4:
-    environment:
-      L4_NV_EMBED_URL: http://embed-stub:8041/v1/embeddings
-      L4_EMBED_API_KEY: ""
-      L4_EMBED_DIM: "4096"
-
   l5:
     environment:
       L5_NV_EMBED_URL: http://embed-stub:8041/v1/embeddings
@@ -61,6 +55,5 @@ services:
       embed-stub:
         condition: service_healthy
       l2: { condition: service_started }
-      l4: { condition: service_started }
       l5: { condition: service_started }
       l6: { condition: service_started }
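The dim-pinning comment above is load-bearing: a layer that disagrees with the stub's 4096-dim output fails inserts or silently mis-scores. A hedged sketch of the kind of startup guard that enforces the agreement — `check_embed_dim` and the probe in the trailing comment are illustrative, not APIs from this package:

import os

EMBED_DIM = int(os.environ.get("EMBED_DIM", "4096"))

def check_embed_dim(probe_vector: list) -> None:
    # Compare one real embedding against the pinned dim; failing fast
    # at startup beats debugging zero-result searches later.
    if len(probe_vector) != EMBED_DIM:
        raise RuntimeError(
            f"embed dim mismatch: provider returned {len(probe_vector)}, "
            f"layer pinned EMBED_DIM={EMBED_DIM}"
        )

# e.g. check_embed_dim(first_embedding_from_the_stub)  # hypothetical probe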
package/packages/memory-engine/docker-compose.yml
CHANGED
@@ -82,36 +82,6 @@ services:
       retries: 30
       start_period: 30s

-  # --------------------------------------------------------------------
-  # L4 — sqlite-vec sidecar
-  # --------------------------------------------------------------------
-  l4:
-    <<: *engine-base
-    build:
-      context: ./engine/services
-      dockerfile: l4/Dockerfile
-    container_name: pme-l4
-    # Default 18042 to avoid port collisions on 8042.
-    # Override via PME_L4_PORT for bench setups that intentionally replace it.
-    ports: ["127.0.0.1:${PME_L4_PORT:-18042}:8042"]
-    environment:
-      L4_NV_EMBED_URL: ${NV_EMBED_URL:-http://host.docker.internal:8041/v1/embeddings}
-      L4_EMBED_MODEL: ${EMBED_MODEL_NAME:-nv-embed-v2}
-      L4_EMBED_API_KEY: ${EMBED_API_KEY:-}
-      L4_EMBED_PROVIDER: ${EMBED_PROVIDER:-openai}
-      L4_EMBED_AUTODETECT: ${EMBED_AUTODETECT:-true}
-      L4_EMBED_DIM: ${EMBED_DIM:-4096}
-      L4_DB_PATH: /data/vec.db
-    extra_hosts:
-      - "host.docker.internal:host-gateway"
-    volumes:
-      - pme-l4-data:/data
-    healthcheck:
-      test: ["CMD", "python", "-c", "import urllib.request,sys; urllib.request.urlopen('http://localhost:8042/health',timeout=3)"]
-      interval: 10s
-      timeout: 5s
-      retries: 30
-
   # --------------------------------------------------------------------
   # L5 — Qdrant comms layer
   # --------------------------------------------------------------------
@@ -212,8 +182,11 @@ services:
   compat:
     <<: *engine-base
     build:
-      context
-
+      # Build context is the memory-engine root so the Dockerfile can
+      # COPY both compat/server.py and engine/services/_shared (shared
+      # EmbedClient for /store-batch dedup).
+      context: .
+      dockerfile: compat/Dockerfile
     container_name: pme-compat
     ports:
       - "127.0.0.1:${PME_PORT:-8099}:8099"
@@ -221,16 +194,25 @@ services:
       L0_URL: http://l2:8031
       L2_PROXY_URL: http://l2:8031
       L3_KG_URL: http://l3:7474
-      L4_VEC_URL: http://l4:8042
       L5_MILVUS_URL: http://l5:8034
       L6_DOC_URL: http://l6:8037
       NV_EMBED_URL: ${NV_EMBED_URL:-http://host.docker.internal:8041/v1/embeddings}
+      # PME_ prefix vars feed the shim's EmbedClient for shared-embed
+      # mode on /store-batch (one embed call across all 3 indexers vs
+      # 3 redundant calls). Match the L2 config block so both clients
+      # hit the same gateway with the same model. Set
+      # PME_SHARE_EMBEDDINGS=false to revert to per-layer embedding.
+      PME_NV_EMBED_URL: ${NV_EMBED_URL:-http://host.docker.internal:8041/v1/embeddings}
+      PME_EMBED_API_KEY: ${EMBED_API_KEY:-}
+      PME_EMBED_PROVIDER: ${EMBED_PROVIDER:-openai}
+      PME_EMBED_AUTODETECT: ${EMBED_AUTODETECT:-true}
+      PME_NV_EMBED_MODEL: ${EMBED_MODEL_NAME:-nv-embed-v2}
+      PME_SHARE_EMBEDDINGS: ${PME_SHARE_EMBEDDINGS:-true}
       BYPASS_L2_PROXY: ${BYPASS_L2_PROXY:-0}
     extra_hosts:
       - "host.docker.internal:host-gateway"
     depends_on:
       l2: { condition: service_started }
-      l4: { condition: service_started }
       l5: { condition: service_started }
       l6: { condition: service_started }
     healthcheck:
@@ -247,6 +229,5 @@ volumes:
   pme-nv-embed-cache:
   pme-l2-data:
   pme-l3-data:
-  pme-l4-data:
   pme-l5-data:
   pme-l6-data:
package/packages/memory-engine/engine/services/l2/l2-hybridrag-proxy.py
CHANGED
@@ -1496,6 +1496,12 @@ async def _embed_batch_local(texts: List[str]) -> List[List[float]]:
 class IndexInternalBatchRequest(BaseModel):
     records: List[Dict[str, Any]]  # [{"id": str, "content": str, "metadata": dict}, ...]
     arena: Optional[str] = "general"
+    # When supplied (parallel to `records`), skip the L4-QMD embed call
+    # and use these vectors directly. Compat shim populates this when
+    # shared-embed mode is on so we don't duplicate embed work across
+    # layers. Length must match records — defensive bail-out below if
+    # it doesn't.
+    embeddings: Optional[List[List[float]]] = None


 @app.post("/index-internal-batch")
@@ -1575,7 +1581,19 @@ async def index_internal_batch(req: IndexInternalBatchRequest) -> dict:
     # ---- L4 QMD vec (qmd.sqlite) ----------------------------------------
     l4_inserted = 0
     try:
-        embeddings = await _embed_batch_local([n["content"] for n in norm])
+        # Shared-embed shortcut: if the compat shim handed us pre-computed
+        # vectors that line up with our normalised records, use them and
+        # skip our own embed RPC. Fall back to per-layer embedding when
+        # the vectors are absent or the lengths don't match (defensive).
+        shared_embs = req.embeddings
+        if (
+            shared_embs is not None
+            and len(shared_embs) == len(records)
+            and len(records) == len(norm)
+        ):
+            embeddings = shared_embs
+        else:
+            embeddings = await _embed_batch_local([n["content"] for n in norm])
         if len(embeddings) != len(norm):
             log.warning(f"L4 embed count mismatch: {len(embeddings)} != {len(norm)}")
         qmd_db = Path(QMD_DB_PATH)
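The accept-or-re-embed decision above recurs near-verbatim in the L5 and L6 hunks below; factored into a standalone helper it is just this (`resolve_embeddings` is an illustrative refactor, not code from the package):

from typing import Awaitable, Callable, Optional

async def resolve_embeddings(
    shared: Optional[list],
    records: list,
    embed_fn: Callable[[list], Awaitable[list]],
) -> list:
    # Accept the caller's pre-computed vectors only when they line up
    # 1:1 with the records; anything else falls back to a local embed
    # pass, so a malformed payload costs extra work, never bad data.
    if isinstance(shared, list) and len(shared) == len(records):
        return shared
    texts = [r.get("content") or r.get("text") or "" for r in records]
    return await embed_fn(texts)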
package/packages/memory-engine/engine/services/l5/l5-comms-layer.py
CHANGED
@@ -629,13 +629,19 @@ def serve(port=8034):
         client = get_client()
         ensure_collection(client, collection)

-        #
+        # Shared-embed shortcut: caller (compat shim) computed vectors
+        # once and forwards them so we skip the embed RPC. Length must
+        # match records — fall back to per-layer embed if it doesn't.
         texts = [(r.get("text") or "")[:8192] for r in records]
+        shared_embs = req.get("embeddings")
         t0 = _time.time()
-
-        embs =
-
-
+        if isinstance(shared_embs, list) and len(shared_embs) == len(records):
+            embs = shared_embs
+        else:
+            try:
+                embs = _embed_post(texts)
+            except Exception as exc:
+                return {"status": "error", "error": f"embed failed: {exc}"}
         embed_ms = (_time.time() - t0) * 1000.0

         # Single batched insert. Mirror every field the chats collection
package/packages/memory-engine/engine/services/l6/l6-document-store.py
CHANGED
@@ -990,12 +990,18 @@ def serve(port: int = DEFAULT_PORT):

         texts = [(r.get("text") or "")[:16000] for r in records]

-        #
+        # Shared-embed shortcut: caller (compat shim) computed vectors
+        # once and forwards them so we skip the embed RPC. Length must
+        # match records — fall back to per-layer embed if it doesn't.
+        shared_embs = req.get("embeddings")
         t0 = _time.time()
-
-        embs =
-
-
+        if isinstance(shared_embs, list) and len(shared_embs) == len(records):
+            embs = shared_embs
+        else:
+            try:
+                embs = _embed_post(texts)
+            except Exception as exc:
+                raise HTTPException(status_code=500, detail=f"embed failed: {exc}")
         embed_ms = (_time.time() - t0) * 1000.0

         # Single milvus insert.
package/packages/memory-engine/engine/services/l4/Dockerfile
DELETED
@@ -1,19 +0,0 @@
-FROM python:3.12-slim
-
-WORKDIR /app
-
-RUN pip install --no-cache-dir fastapi 'uvicorn[standard]' httpx pydantic
-
-# Build context is engine/services so the shared embed_provider module is
-# COPYable. server.py adds engine/services to sys.path at startup, then
-# imports from `_shared.embed_provider`.
-COPY _shared /app/_shared
-COPY l4/server.py /app/server.py
-
-RUN mkdir -p /data
-ENV L4_DB_PATH=/data/vec.db
-ENV PORT=8042
-
-EXPOSE 8042
-
-CMD ["python", "server.py", "--port", "8042"]
package/packages/memory-engine/engine/services/l4/server.py
DELETED
@@ -1,305 +0,0 @@
-"""
-L4 sqlite-vec sidecar.
-
-Vector index sidecar for the Pentatonic Memory Engine stack.
-Exposes /health, /search, /index-batch, /refresh over HTTP.
-
-Endpoints:
-    GET  /health
-    POST /search       body: {"query":"...", "limit":10}
-    POST /index-batch  body: {"records":[{"id","text"}, ...]}
-    POST /refresh      no-op (sqlite-vec writes are immediate)
-
-Env:
-    L4_DB_PATH       default /data/vec.db
-    L4_NV_EMBED_URL  default http://nv-embed:8041/v1/embeddings
-    PORT             default 8042
-"""
-
-from __future__ import annotations
-
-import argparse
-import hashlib
-import os
-import sqlite3
-import struct
-import sys
-import time
-from pathlib import Path
-from typing import Any
-
-from fastapi import FastAPI, HTTPException
-from pydantic import BaseModel
-
-# Shared embedding client lives at engine/services/_shared/. Add the parent of
-# the service dir to sys.path so `from _shared.embed_provider import ...` works
-# regardless of how the service is launched (uvicorn, python server.py, etc.).
-sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
-from _shared.embed_provider import EmbedClient  # noqa: E402
-
-# ----------------------------------------------------------------------
-# Config
-# ----------------------------------------------------------------------
-
-DB_PATH = os.environ.get("L4_DB_PATH", "/data/vec.db")
-EMBED_DIM = int(os.environ.get("L4_EMBED_DIM", "4096"))
-
-
-
-# ----------------------------------------------------------------------
-# DB helpers
-# ----------------------------------------------------------------------
-
-def _vec_to_blob(vec: list[float]) -> bytes:
-    """Pack a list of floats as little-endian f32 bytes for sqlite-vec."""
-    return struct.pack(f"<{len(vec)}f", *vec)
-
-
-def _blob_to_vec(blob: bytes) -> list[float]:
-    n = len(blob) // 4
-    return list(struct.unpack(f"<{n}f", blob))
-
-
-def _cosine(a: list[float], b: list[float]) -> float:
-    import math
-    dot = sum(x * y for x, y in zip(a, b))
-    na = math.sqrt(sum(x * x for x in a))
-    nb = math.sqrt(sum(y * y for y in b))
-    if na == 0 or nb == 0:
-        return 0.0
-    return dot / (na * nb)
-
-
-def _get_db() -> sqlite3.Connection:
-    """Open DB and ensure schema. We use plain BLOB columns rather than
-    the sqlite-vec virtual table because sqlite-vec is an optional ext
-    that may not be loadable in every container — plain BLOB lets us
-    fall back to a Python-side cosine pass without losing correctness.
-    """
-    Path(DB_PATH).parent.mkdir(parents=True, exist_ok=True)
-    conn = sqlite3.connect(DB_PATH, timeout=10)
-    conn.execute("PRAGMA journal_mode=WAL")
-    conn.execute("""
-        CREATE TABLE IF NOT EXISTS chunks (
-            id TEXT PRIMARY KEY,
-            text TEXT,
-            embedding BLOB,
-            indexed_at REAL
-        )
-    """)
-    return conn
-
-
-# ----------------------------------------------------------------------
-# Embedding client
-# ----------------------------------------------------------------------
-
-_embed: EmbedClient | None = None
-
-
-def _embed_client() -> EmbedClient:
-    """Lazily build the embed client so env vars are read at first use."""
-    global _embed
-    if _embed is None:
-        _embed = EmbedClient.from_env(
-            prefix="L4_",
-            default_url="http://nv-embed:8041/v1/embeddings",
-        )
-    return _embed
-
-
-async def _embed_batch(texts: list[str]) -> list[list[float]]:
-    """Embed a batch of texts via the shared EmbedClient."""
-    return await _embed_client().embed_batch_async(texts)
-
-
-# ----------------------------------------------------------------------
-# FastAPI
-# ----------------------------------------------------------------------
-
-class SearchRequest(BaseModel):
-    query: str
-    limit: int = 10
-
-
-class IndexBatchRequest(BaseModel):
-    records: list[dict[str, Any]]
-
-
-app = FastAPI(title="L4 sqlite-vec sidecar (OSS)")
-
-
-@app.get("/health")
-def health():
-    try:
-        conn = _get_db()
-        n = conn.execute("SELECT COUNT(*) FROM chunks").fetchone()[0]
-        conn.close()
-        return {"status": "ok", "loaded": True, "n_vectors": n,
-                "dim": EMBED_DIM, "db_path": DB_PATH,
-                # BLOB+Python-cosine is the intentional implementation path,
-                # not a degraded fallback (see _get_db docstring). The previous
-                # "sqlite-vec-fallback" label gave operators the wrong signal.
-                "backend": "sqlite-vec"}
-    except Exception as exc:
-        return {"status": "degraded", "error": str(exc)}
-
-
-@app.post("/search")
-async def search(req: SearchRequest):
-    if not req.query:
-        return []
-    try:
-        embs = await _embed_batch([req.query])
-        if not embs or embs[0] is None:
-            raise HTTPException(status_code=502, detail="embed failed")
-        q_vec = embs[0]
-    except Exception as exc:
-        raise HTTPException(status_code=502, detail=f"embed: {exc}")
-
-    conn = _get_db()
-    rows = conn.execute("SELECT id, text, embedding FROM chunks").fetchall()
-    conn.close()
-
-    # Cosine similarity in Python — fine for OSS / small corpora. For
-    # large corpora: consider a dedicated vector DB.
-    scored: list[tuple[float, str, str]] = []
-    for rid, text, blob in rows:
-        if not blob:
-            continue
-        v = _blob_to_vec(blob)
-        if len(v) != len(q_vec):
-            continue
-        s = _cosine(q_vec, v)
-        scored.append((s, rid, text))
-    scored.sort(reverse=True)
-    out = [
-        {"path": rid, "text": text, "score": float(s),
-         "source": "L4-sqlite-vec", "layer": "L4"}
-        for s, rid, text in scored[: req.limit]
-    ]
-    return out
-
-
-@app.post("/index-batch")
-async def index_batch(req: IndexBatchRequest):
-    if not req.records:
-        return {"status": "ok", "inserted": 0}
-    texts = [(r.get("text") or r.get("content") or "")[:8192] for r in req.records]
-    t0 = time.perf_counter()
-    embs = await _embed_batch(texts)
-    embed_ms = (time.perf_counter() - t0) * 1000.0
-
-    conn = _get_db()
-    t1 = time.perf_counter()
-    rows = []
-    for r, emb, txt in zip(req.records, embs, texts):
-        if not emb:
-            continue
-        rid = r.get("id") or hashlib.sha1(txt.encode("utf-8")).hexdigest()[:32]
-        rows.append((rid, txt, _vec_to_blob(emb), time.time()))
-    if rows:
-        conn.executemany(
-            "INSERT OR REPLACE INTO chunks(id, text, embedding, indexed_at) "
-            "VALUES (?, ?, ?, ?)", rows,
-        )
-        conn.commit()
-    insert_ms = (time.perf_counter() - t1) * 1000.0
-    conn.close()
-    return {"status": "ok", "inserted": len(rows),
-            "embed_ms": round(embed_ms, 1), "insert_ms": round(insert_ms, 1)}
-
-
-@app.post("/refresh")
-def refresh():
-    """No-op for sqlite-vec — writes are immediate. Kept for API parity."""
-    return {"status": "ok", "noop": True}
-
-
-# ----------------------------------------------------------------------
-# /health/deep — synthetic round-trip
-# ----------------------------------------------------------------------
-
-# Fixed sentinel id used by /health/deep. Upserted on every probe call,
-# so the row is idempotent. Kept under id="__healthcheck__sentinel" so
-# the L4 corpus has at most one healthcheck row regardless of probe rate.
-_HEALTH_SENTINEL_ID = "__healthcheck__sentinel"
-_HEALTH_SENTINEL_TEXT = (
-    "healthcheck sentinel — embed-write-search round-trip verifier"
-)
-
-
-@app.get("/health/deep")
-async def health_deep():
-    """Real functional probe: embed → write → search the sentinel.
-
-    Catches the class of failure that plain /health misses — broken
-    embed paths, write 500s, query path bugs — i.e. exactly the bug
-    shape that silently degraded L6 from v0.8.0 → v0.8.2.
-
-    Returns:
-        {status, embed_ms, write_ms, search_ms, hit, ok}
-
-    `hit` confirms the sentinel was returned from search; `ok` is the
-    aggregate `embed_ok AND write_ok AND hit`. HTTP status stays 200
-    regardless so callers can read the body for diagnostics; status:
-    field carries the verdict.
-    """
-    t_total = time.perf_counter()
-    out: dict[str, Any] = {"status": "ok", "ok": True}
-    try:
-        t0 = time.perf_counter()
-        embs = await _embed_batch([_HEALTH_SENTINEL_TEXT])
-        out["embed_ms"] = round((time.perf_counter() - t0) * 1000.0, 1)
-        if not embs or not embs[0]:
-            out["status"] = "embed_failed"
-            out["ok"] = False
-            return out
-        vec = embs[0]
-    except Exception as exc:
-        out["status"] = f"embed_error: {type(exc).__name__}"
-        out["ok"] = False
-        return out
-
-    try:
-        conn = _get_db()
-        t1 = time.perf_counter()
-        conn.execute(
-            "INSERT OR REPLACE INTO chunks(id, text, embedding, indexed_at) "
-            "VALUES (?, ?, ?, ?)",
-            (_HEALTH_SENTINEL_ID, _HEALTH_SENTINEL_TEXT, _vec_to_blob(vec), time.time()),
-        )
-        conn.commit()
-        out["write_ms"] = round((time.perf_counter() - t1) * 1000.0, 1)
-
-        t2 = time.perf_counter()
-        rows = conn.execute("SELECT id, embedding FROM chunks WHERE id = ?",
-                            (_HEALTH_SENTINEL_ID,)).fetchone()
-        out["search_ms"] = round((time.perf_counter() - t2) * 1000.0, 1)
-        conn.close()
-    except Exception as exc:
-        out["status"] = f"db_error: {type(exc).__name__}"
-        out["ok"] = False
-        return out
-
-    out["hit"] = rows is not None
-    if not out["hit"]:
-        out["status"] = "sentinel_missing"
-        out["ok"] = False
-    out["total_ms"] = round((time.perf_counter() - t_total) * 1000.0, 1)
-    return out
-
-
-# ----------------------------------------------------------------------
-# Entrypoint
-# ----------------------------------------------------------------------
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--port", type=int, default=int(os.environ.get("PORT", "8042")))
-    parser.add_argument("--data-dir", default=None)
-    args = parser.parse_args()
-    if args.data_dir:
-        os.environ["L4_DB_PATH"] = str(Path(args.data_dir) / "vec.db")
-    import uvicorn
-    uvicorn.run("server:app", host="0.0.0.0", port=args.port, log_level="info")