@pentatonic-ai/ai-agent-sdk 0.9.3 → 0.9.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -906,7 +906,7 @@ function fireAndForgetEmit(clientConfig, sessionOpts, messages, result, model) {
906
906
  }
907
907
 
908
908
  // src/telemetry.js
909
- var VERSION = "0.9.3";
909
+ var VERSION = "0.9.4";
910
910
  var TELEMETRY_URL = "https://sdk-telemetry.philip-134.workers.dev";
911
911
  function machineId() {
912
912
  const raw = typeof process !== "undefined" ? `${process.env?.USER || process.env?.USERNAME || "u"}:${process.platform || "x"}:${process.arch || "x"}` : "browser";
package/dist/index.js CHANGED
@@ -875,7 +875,7 @@ function fireAndForgetEmit(clientConfig, sessionOpts, messages, result, model) {
875
875
  }
876
876
 
877
877
  // src/telemetry.js
878
- var VERSION = "0.9.3";
878
+ var VERSION = "0.9.4";
879
879
  var TELEMETRY_URL = "https://sdk-telemetry.philip-134.workers.dev";
880
880
  function machineId() {
881
881
  const raw = typeof process !== "undefined" ? `${process.env?.USER || process.env?.USERNAME || "u"}:${process.platform || "x"}:${process.arch || "x"}` : "browser";
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@pentatonic-ai/ai-agent-sdk",
3
- "version": "0.9.3",
3
+ "version": "0.9.4",
4
4
  "description": "TES SDK — LLM observability and lifecycle tracking via Pentatonic Thing Event System. Track token usage, tool calls, and conversations. Manage things through event-sourced lifecycle stages with AI enrichment and vector search.",
5
5
  "type": "module",
6
6
  "main": "./dist/index.cjs",
@@ -568,9 +568,9 @@
568
568
  }
569
569
  },
570
570
  "node_modules/hono": {
571
- "version": "4.12.12",
572
- "resolved": "https://registry.npmjs.org/hono/-/hono-4.12.12.tgz",
573
- "integrity": "sha512-p1JfQMKaceuCbpJKAPKVqyqviZdS0eUxH9v82oWo1kb9xjQ5wA6iP3FNVAPDFlz5/p7d45lO+BpSk1tuSZMF4Q==",
571
+ "version": "4.12.18",
572
+ "resolved": "https://registry.npmjs.org/hono/-/hono-4.12.18.tgz",
573
+ "integrity": "sha512-RWzP96k/yv0PQfyXnWjs6zot20TqfpfsNXhOnev8d1InAxubW93L11/oNUc3tQqn2G0bSdAOBpX+2uDFHV7kdQ==",
574
574
  "license": "MIT",
575
575
  "engines": {
576
576
  "node": ">=16.9.0"
@@ -25,7 +25,6 @@ Environment:
25
25
  L0_URL default http://l0:8030
26
26
  L2_PROXY_URL default http://l2:8031
27
27
  L3_KG_URL default http://l3:8047
28
- L4_VEC_URL default http://l4:8042
29
28
  L5_MILVUS_URL default http://l5:8035
30
29
  L6_DOC_URL default http://l6:8037
31
30
  NV_EMBED_URL default http://nv-embed:8041/v1/embeddings
@@ -61,7 +60,6 @@ from _shared.embed_provider import EmbedClient # noqa: E402
61
60
  L0_URL = os.environ.get("L0_URL", "http://l0:8030")
62
61
  L2_PROXY_URL = os.environ.get("L2_PROXY_URL", "http://l2:8031")
63
62
  L3_KG_URL = os.environ.get("L3_KG_URL", "http://l3:8047")
64
- L4_VEC_URL = os.environ.get("L4_VEC_URL", "http://l4:8042")
65
63
  L5_MILVUS_URL = os.environ.get("L5_MILVUS_URL", "http://l5:8035")
66
64
  L6_DOC_URL = os.environ.get("L6_DOC_URL", "http://l6:8037")
67
65
  NV_EMBED_URL = os.environ.get("NV_EMBED_URL", "http://nv-embed:8041/v1/embeddings")
@@ -288,32 +286,6 @@ async def _embed_batch(texts: list[str]) -> list[list[float]]:
288
286
  return [d["embedding"] for d in resp.json()["data"]]
289
287
 
290
288
 
291
- async def _index_l4(
292
- records: list[dict[str, Any]],
293
- embeddings: list[list[float]] | None = None,
294
- ) -> int:
295
- """Index records into the L4 sqlite-vec layer.
296
-
297
- When `embeddings` is supplied (parallel to records), L4's /index-batch
298
- skips its own embed call and uses ours — eliminates the redundant
299
- embed work that previously cost ~850ms per drain alarm. When None,
300
- L4 embeds itself (backwards-compatible path for older callers / tests
301
- that don't share embeddings)."""
302
- payload: dict[str, Any] = {"records": [
303
- {"id": r.get("id") or hashlib.sha1(r["content"].encode()).hexdigest()[:32],
304
- "text": r["content"]} for r in records
305
- ]}
306
- if embeddings is not None:
307
- payload["embeddings"] = embeddings
308
- try:
309
- resp = await _client().post(f"{L4_VEC_URL}/index-batch", json=payload, timeout=120.0)
310
- resp.raise_for_status()
311
- return resp.json().get("inserted", 0)
312
- except Exception as exc:
313
- print(f"[shim] L4 index-batch failed: {exc}")
314
- return 0
315
-
316
-
317
289
  async def _index_l5(
318
290
  records: list[dict[str, Any]],
319
291
  arena: str = "general",
@@ -325,7 +297,10 @@ async def _index_l5(
325
297
  by arena natively (vs the shim's defence-in-depth post-filter).
326
298
 
327
299
  When `embeddings` is supplied (parallel to records), L5 skips its
328
- own embed call — see _index_l4 docstring for the dedup story.
300
+ own embed call — the shim pre-computes vectors once at /store-batch
301
+ level and threads them through each layer to avoid 3× redundant
302
+ embed RPCs (L5 + L6 + L2-internal otherwise each re-embed the same
303
+ texts in parallel).
329
304
  """
330
305
  payload: dict[str, Any] = {
331
306
  "collection": "chats",
@@ -348,9 +323,9 @@ async def _index_l5(
348
323
  resp.raise_for_status()
349
324
  return resp.json().get("inserted", 0)
350
325
  except Exception as exc:
351
- # Best-effort: L5 is one of six redundant layers; failure here doesn't
352
- # mean the record is unsearchable. L0 BM25 + L4 vec + L6 doc-store
353
- # all carry it independently.
326
+ # Best-effort: L5 is one of five redundant layers; failure here
327
+ # doesn't mean the record is unsearchable. L0 BM25 + L4 QMD +
328
+ # L6 doc-store all carry it independently.
354
329
  print(f"[shim] L5 index-batch failed: {exc}")
355
330
  return 0
356
331
 
@@ -363,7 +338,8 @@ async def _index_l6(
363
338
  """Index records into the L6 document store.
364
339
 
365
340
  When `embeddings` is supplied (parallel to records), L6 skips its
366
- own embed call — see _index_l4 docstring for the dedup story.
341
+ own embed call — the shim pre-computes vectors once at /store-batch
342
+ level and threads them through each layer.
367
343
  """
368
344
  payload: dict[str, Any] = {
369
345
  "arena": arena,
@@ -401,8 +377,9 @@ async def _index_l2_internal(
401
377
  /index-internal-batch which writes to all three in one round-trip.
402
378
 
403
379
  When `embeddings` is supplied (parallel to records), L2's internal
404
- embed call (used for L4-QMD population) is skipped — see _index_l4
405
- docstring for the dedup story.
380
+ embed call (used for L4-QMD population) is skipped — the shim
381
+ pre-computes vectors once at /store-batch level and threads them
382
+ through to L4_QMD via this endpoint.
406
383
  """
407
384
  payload: dict[str, Any] = {
408
385
  "arena": arena,
@@ -530,25 +507,25 @@ async def health():
530
507
  nv_embed_health = urlunparse((_u.scheme, _u.netloc, "/health", "", "", ""))
531
508
 
532
509
  import asyncio
533
- l2_v, l4_v, l5_v, l6_v, nv_v, l3_v = await asyncio.gather(
510
+ l2_v, l5_v, l6_v, nv_v, l3_v = await asyncio.gather(
534
511
  _probe(f"{L2_PROXY_URL}/health"),
535
- _probe(f"{L4_VEC_URL}/health"),
536
512
  _probe(f"{L5_MILVUS_URL}/health"),
537
513
  _probe(f"{L6_DOC_URL}/health"),
538
514
  _probe(nv_embed_health),
539
515
  _probe_l3(),
540
516
  )
541
517
 
542
- # L0 BM25 (FTS5) and L1 (always-loaded core files) are both in-process
543
- # inside the L2 proxy. They have no separate runtime; if L2 is healthy,
544
- # both layers are usable. Tie their status to L2.
518
+ # L0 BM25 (FTS5), L1 (always-loaded core files) and L4 QMD vec are
519
+ # all in-process inside the L2 proxy — L0+L1 in workspace.db / core
520
+ # files; L4 in qmd.sqlite which L2 opens directly. No separate runtime;
521
+ # if L2 is healthy, all three layers are usable. Tie their status to L2.
545
522
  l2_ok = l2_v == "ok"
546
523
  out["layers"] = {
547
524
  "l0": "ok" if l2_ok else l2_v,
548
525
  "l1": "ok" if l2_ok else l2_v,
549
526
  "l2": l2_v,
550
527
  "l3": l3_v,
551
- "l4": l4_v,
528
+ "l4": "ok" if l2_ok else l2_v,
552
529
  "l5": l5_v,
553
530
  "l6": l6_v,
554
531
  "nv_embed": nv_v,
@@ -569,19 +546,15 @@ async def health():
569
546
  "l6_vector_chunks": None,
570
547
  "l6_fts_chunks": None,
571
548
  }
572
- # L0 lives inside L2; L2 exposes /index-internal-stats with the counts.
549
+ # L0 and L4 both live inside L2 (workspace.db + qmd.sqlite directly
550
+ # opened by the L2 proxy). L2 exposes /index-internal-stats with both
551
+ # counts in one round-trip.
573
552
  try:
574
553
  r = await _client().get(f"{L2_PROXY_URL}/index-internal-stats", timeout=3.0)
575
554
  if r.status_code == 200:
576
555
  stats = r.json()
577
556
  memories["l0_bm25_chunks"] = int(stats.get("l0_chunks") or 0)
578
- except Exception:
579
- pass
580
- # L4 reports n_vectors on its own /health.
581
- try:
582
- r = await _client().get(f"{L4_VEC_URL}/health", timeout=3.0)
583
- if r.status_code == 200:
584
- memories["l4_vectors"] = int(r.json().get("n_vectors") or 0)
557
+ memories["l4_vectors"] = int(stats.get("l4_qmd_chunks") or 0)
585
558
  except Exception:
586
559
  pass
587
560
  # L5 reports per-collection counts on /health. We surface chats —
@@ -634,8 +607,9 @@ async def health_deep():
634
607
  except Exception as exc:
635
608
  return name, {"ok": False, "status": f"unreachable: {type(exc).__name__}"}
636
609
 
610
+ # L4 is in-process inside L2 (qmd.sqlite direct-read) — its deep
611
+ # round-trip is covered by L2's /health/deep, no separate probe needed.
637
612
  results = await asyncio.gather(
638
- _probe_deep("l4", f"{L4_VEC_URL}/health/deep"),
639
613
  _probe_deep("l5", f"{L5_MILVUS_URL}/health/deep"),
640
614
  _probe_deep("l6", f"{L6_DOC_URL}/health/deep"),
641
615
  )
@@ -675,15 +649,15 @@ async def store(req: StoreRequest):
675
649
  # depending on which one was supplied).
676
650
  _stash_all_keys(rid, req.metadata or {}, arena)
677
651
 
678
- # Fan out to L4 + L5 + L6 + L2-internal (L0+L4qmd+L3) in parallel.
652
+ # Fan out to L5 + L6 + L2-internal (L0+L4qmd+L3) in parallel.
679
653
  import asyncio
680
- l4_count, l5_count, l6_count, l2_internal = await asyncio.gather(
681
- _index_l4([record]),
654
+ l5_count, l6_count, l2_internal = await asyncio.gather(
682
655
  _index_l5([record], arena=arena),
683
656
  _index_l6([record], arena=arena),
684
657
  _index_l2_internal([record], arena=arena),
685
658
  )
686
659
 
660
+ l4_qmd_count = l2_internal.get("l4_qmd", 0)
687
661
  return {
688
662
  "id": rid,
689
663
  "content": req.content,
@@ -692,8 +666,11 @@ async def store(req: StoreRequest):
692
666
  "l0": l2_internal.get("l0", 0),
693
667
  "l3_chunks": l2_internal.get("l3_chunks", 0),
694
668
  "l3_entities": l2_internal.get("l3_entities", 0),
695
- "l4_qmd": l2_internal.get("l4_qmd", 0),
696
- "l4": l4_count,
669
+ "l4_qmd": l4_qmd_count,
670
+ # `l4` is aliased to L4_QMD now that the standalone L4 sqlite-vec
671
+ # sidecar has been dropped. Kept in the response for wire-format
672
+ # back-compat with callers that read engine.l4.
673
+ "l4": l4_qmd_count,
697
674
  "l5": l5_count,
698
675
  "l6": l6_count,
699
676
  },
@@ -724,13 +701,13 @@ async def store_batch(req: StoreBatchRequest):
724
701
  import asyncio
725
702
 
726
703
  # Shared-embed mode: compute embeddings ONCE here, pass them down to
727
- # every layer so they skip their own embed call. Previously L4 + L5
728
- # + L6 + L2-internal each re-embedded the same texts in parallel,
729
- # which fanned 4× the gateway RPCs. The gateway throttles at K≈10
730
- # concurrent requests, so 40-way fan-out serialised into ~4 rounds
731
- # of ~850ms each = ~3.5s of pure embed time per /store-batch. With
732
- # shared embeddings we issue one chunked embed pass (10 sub-calls
733
- # for N=50 records) and skip the per-layer redundant work entirely.
704
+ # every layer so they skip their own embed call. Previously L5 + L6
705
+ # + L2-internal each re-embedded the same texts in parallel, which
706
+ # fanned 3× the gateway RPCs. The gateway throttles at K≈10 concurrent
707
+ # requests, so 30-way fan-out serialised into ~3 rounds of ~850ms
708
+ # each = ~2.5s of pure embed time per /store-batch. With shared
709
+ # embeddings we issue one chunked embed pass (10 sub-calls for N=50
710
+ # records) and skip the per-layer redundant work entirely.
734
711
  # Disabled via PME_SHARE_EMBEDDINGS=false for operators wiring up
735
712
  # per-layer differentiated embedders.
736
713
  shared_embeddings: list[list[float]] | None = None
@@ -748,24 +725,25 @@ async def store_batch(req: StoreBatchRequest):
748
725
  shared_embeddings = None
749
726
  embed_ms = (time.perf_counter() - embed_t0) * 1000.0
750
727
 
751
- l4_count, l5_count, l6_count, l2_internal = await asyncio.gather(
752
- _index_l4(normalised, embeddings=shared_embeddings),
728
+ l5_count, l6_count, l2_internal = await asyncio.gather(
753
729
  _index_l5(normalised, arena=req.arena or "general", embeddings=shared_embeddings),
754
730
  _index_l6(normalised, arena=req.arena or "general", embeddings=shared_embeddings),
755
731
  _index_l2_internal(normalised, arena=req.arena or "general", embeddings=shared_embeddings),
756
732
  )
757
733
  dur_ms = (time.perf_counter() - t0) * 1000.0
758
734
 
735
+ l4_qmd_count = l2_internal.get("l4_qmd", 0)
759
736
  return {
760
737
  "status": "ok",
761
- "inserted": max(l4_count, l5_count, l6_count),
738
+ "inserted": max(l4_qmd_count, l5_count, l6_count),
762
739
  "ids": [r["id"] for r in normalised],
763
740
  "engine": {
764
741
  "l0": l2_internal.get("l0", 0),
765
742
  "l3_chunks": l2_internal.get("l3_chunks", 0),
766
743
  "l3_entities": l2_internal.get("l3_entities", 0),
767
- "l4_qmd": l2_internal.get("l4_qmd", 0),
768
- "l4": l4_count,
744
+ "l4_qmd": l4_qmd_count,
745
+ # `l4` aliased to L4_QMD — sidecar dropped, see /store handler.
746
+ "l4": l4_qmd_count,
769
747
  "l5": l5_count,
770
748
  "l6": l6_count,
771
749
  },
@@ -32,12 +32,6 @@ services:
32
32
  # Pin the embedding dim explicitly across layers, independent of any
33
33
  # developer-local .env (which may set EMBED_DIM=768 for Ollama-based
34
34
  # local dev). The stub returns 4096; layers must agree.
35
- l4:
36
- environment:
37
- L4_NV_EMBED_URL: http://embed-stub:8041/v1/embeddings
38
- L4_EMBED_API_KEY: ""
39
- L4_EMBED_DIM: "4096"
40
-
41
35
  l5:
42
36
  environment:
43
37
  L5_NV_EMBED_URL: http://embed-stub:8041/v1/embeddings
@@ -61,6 +55,5 @@ services:
61
55
  embed-stub:
62
56
  condition: service_healthy
63
57
  l2: { condition: service_started }
64
- l4: { condition: service_started }
65
58
  l5: { condition: service_started }
66
59
  l6: { condition: service_started }
@@ -82,36 +82,6 @@ services:
82
82
  retries: 30
83
83
  start_period: 30s
84
84
 
85
- # --------------------------------------------------------------------
86
- # L4 — sqlite-vec sidecar
87
- # --------------------------------------------------------------------
88
- l4:
89
- <<: *engine-base
90
- build:
91
- context: ./engine/services
92
- dockerfile: l4/Dockerfile
93
- container_name: pme-l4
94
- # Default 18042 to avoid port collisions on 8042.
95
- # Override via PME_L4_PORT for bench setups that intentionally replace it.
96
- ports: ["127.0.0.1:${PME_L4_PORT:-18042}:8042"]
97
- environment:
98
- L4_NV_EMBED_URL: ${NV_EMBED_URL:-http://host.docker.internal:8041/v1/embeddings}
99
- L4_EMBED_MODEL: ${EMBED_MODEL_NAME:-nv-embed-v2}
100
- L4_EMBED_API_KEY: ${EMBED_API_KEY:-}
101
- L4_EMBED_PROVIDER: ${EMBED_PROVIDER:-openai}
102
- L4_EMBED_AUTODETECT: ${EMBED_AUTODETECT:-true}
103
- L4_EMBED_DIM: ${EMBED_DIM:-4096}
104
- L4_DB_PATH: /data/vec.db
105
- extra_hosts:
106
- - "host.docker.internal:host-gateway"
107
- volumes:
108
- - pme-l4-data:/data
109
- healthcheck:
110
- test: ["CMD", "python", "-c", "import urllib.request,sys; urllib.request.urlopen('http://localhost:8042/health',timeout=3)"]
111
- interval: 10s
112
- timeout: 5s
113
- retries: 30
114
-
115
85
  # --------------------------------------------------------------------
116
86
  # L5 — Qdrant comms layer
117
87
  # --------------------------------------------------------------------
@@ -224,13 +194,12 @@ services:
224
194
  L0_URL: http://l2:8031
225
195
  L2_PROXY_URL: http://l2:8031
226
196
  L3_KG_URL: http://l3:7474
227
- L4_VEC_URL: http://l4:8042
228
197
  L5_MILVUS_URL: http://l5:8034
229
198
  L6_DOC_URL: http://l6:8037
230
199
  NV_EMBED_URL: ${NV_EMBED_URL:-http://host.docker.internal:8041/v1/embeddings}
231
200
  # PME_ prefix vars feed the shim's EmbedClient for shared-embed
232
- # mode on /store-batch (one embed call across all 4 indexers vs
233
- # 4 redundant calls). Match the L2 config block so both clients
201
+ # mode on /store-batch (one embed call across all 3 indexers vs
202
+ # 3 redundant calls). Match the L2 config block so both clients
234
203
  # hit the same gateway with the same model. Set
235
204
  # PME_SHARE_EMBEDDINGS=false to revert to per-layer embedding.
236
205
  PME_NV_EMBED_URL: ${NV_EMBED_URL:-http://host.docker.internal:8041/v1/embeddings}
@@ -244,7 +213,6 @@ services:
244
213
  - "host.docker.internal:host-gateway"
245
214
  depends_on:
246
215
  l2: { condition: service_started }
247
- l4: { condition: service_started }
248
216
  l5: { condition: service_started }
249
217
  l6: { condition: service_started }
250
218
  healthcheck:
@@ -261,6 +229,5 @@ volumes:
261
229
  pme-nv-embed-cache:
262
230
  pme-l2-data:
263
231
  pme-l3-data:
264
- pme-l4-data:
265
232
  pme-l5-data:
266
233
  pme-l6-data:
@@ -1,19 +0,0 @@
1
- FROM python:3.12-slim
2
-
3
- WORKDIR /app
4
-
5
- RUN pip install --no-cache-dir fastapi 'uvicorn[standard]' httpx pydantic
6
-
7
- # Build context is engine/services so the shared embed_provider module is
8
- # COPYable. server.py adds engine/services to sys.path at startup, then
9
- # imports from `_shared.embed_provider`.
10
- COPY _shared /app/_shared
11
- COPY l4/server.py /app/server.py
12
-
13
- RUN mkdir -p /data
14
- ENV L4_DB_PATH=/data/vec.db
15
- ENV PORT=8042
16
-
17
- EXPOSE 8042
18
-
19
- CMD ["python", "server.py", "--port", "8042"]
@@ -1,315 +0,0 @@
1
- """
2
- L4 sqlite-vec sidecar.
3
-
4
- Vector index sidecar for the Pentatonic Memory Engine stack.
5
- Exposes /health, /search, /index-batch, /refresh over HTTP.
6
-
7
- Endpoints:
8
- GET /health
9
- POST /search body: {"query":"...", "limit":10}
10
- POST /index-batch body: {"records":[{"id","text"}, ...]}
11
- POST /refresh no-op (sqlite-vec writes are immediate)
12
-
13
- Env:
14
- L4_DB_PATH default /data/vec.db
15
- L4_NV_EMBED_URL default http://nv-embed:8041/v1/embeddings
16
- PORT default 8042
17
- """
18
-
19
- from __future__ import annotations
20
-
21
- import argparse
22
- import hashlib
23
- import os
24
- import sqlite3
25
- import struct
26
- import sys
27
- import time
28
- from pathlib import Path
29
- from typing import Any
30
-
31
- from fastapi import FastAPI, HTTPException
32
- from pydantic import BaseModel
33
-
34
- # Shared embedding client lives at engine/services/_shared/. Add the parent of
35
- # the service dir to sys.path so `from _shared.embed_provider import ...` works
36
- # regardless of how the service is launched (uvicorn, python server.py, etc.).
37
- sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
38
- from _shared.embed_provider import EmbedClient # noqa: E402
39
-
40
- # ----------------------------------------------------------------------
41
- # Config
42
- # ----------------------------------------------------------------------
43
-
44
- DB_PATH = os.environ.get("L4_DB_PATH", "/data/vec.db")
45
- EMBED_DIM = int(os.environ.get("L4_EMBED_DIM", "4096"))
46
-
47
-
48
-
49
- # ----------------------------------------------------------------------
50
- # DB helpers
51
- # ----------------------------------------------------------------------
52
-
53
- def _vec_to_blob(vec: list[float]) -> bytes:
54
- """Pack a list of floats as little-endian f32 bytes for sqlite-vec."""
55
- return struct.pack(f"<{len(vec)}f", *vec)
56
-
57
-
58
- def _blob_to_vec(blob: bytes) -> list[float]:
59
- n = len(blob) // 4
60
- return list(struct.unpack(f"<{n}f", blob))
61
-
62
-
63
- def _cosine(a: list[float], b: list[float]) -> float:
64
- import math
65
- dot = sum(x * y for x, y in zip(a, b))
66
- na = math.sqrt(sum(x * x for x in a))
67
- nb = math.sqrt(sum(y * y for y in b))
68
- if na == 0 or nb == 0:
69
- return 0.0
70
- return dot / (na * nb)
71
-
72
-
73
- def _get_db() -> sqlite3.Connection:
74
- """Open DB and ensure schema. We use plain BLOB columns rather than
75
- the sqlite-vec virtual table because sqlite-vec is an optional ext
76
- that may not be loadable in every container — plain BLOB lets us
77
- fall back to a Python-side cosine pass without losing correctness.
78
- """
79
- Path(DB_PATH).parent.mkdir(parents=True, exist_ok=True)
80
- conn = sqlite3.connect(DB_PATH, timeout=10)
81
- conn.execute("PRAGMA journal_mode=WAL")
82
- conn.execute("""
83
- CREATE TABLE IF NOT EXISTS chunks (
84
- id TEXT PRIMARY KEY,
85
- text TEXT,
86
- embedding BLOB,
87
- indexed_at REAL
88
- )
89
- """)
90
- return conn
91
-
92
-
93
- # ----------------------------------------------------------------------
94
- # Embedding client
95
- # ----------------------------------------------------------------------
96
-
97
- _embed: EmbedClient | None = None
98
-
99
-
100
- def _embed_client() -> EmbedClient:
101
- """Lazily build the embed client so env vars are read at first use."""
102
- global _embed
103
- if _embed is None:
104
- _embed = EmbedClient.from_env(
105
- prefix="L4_",
106
- default_url="http://nv-embed:8041/v1/embeddings",
107
- )
108
- return _embed
109
-
110
-
111
- async def _embed_batch(texts: list[str]) -> list[list[float]]:
112
- """Embed a batch of texts via the shared EmbedClient."""
113
- return await _embed_client().embed_batch_async(texts)
114
-
115
-
116
- # ----------------------------------------------------------------------
117
- # FastAPI
118
- # ----------------------------------------------------------------------
119
-
120
- class SearchRequest(BaseModel):
121
- query: str
122
- limit: int = 10
123
-
124
-
125
- class IndexBatchRequest(BaseModel):
126
- records: list[dict[str, Any]]
127
- # When supplied (parallel to `records`), skip the embed call and use
128
- # these vectors directly. Compat shim populates this when shared-embed
129
- # mode is on so we don't duplicate the embed work across layers.
130
- embeddings: list[list[float]] | None = None
131
-
132
-
133
- app = FastAPI(title="L4 sqlite-vec sidecar (OSS)")
134
-
135
-
136
- @app.get("/health")
137
- def health():
138
- try:
139
- conn = _get_db()
140
- n = conn.execute("SELECT COUNT(*) FROM chunks").fetchone()[0]
141
- conn.close()
142
- return {"status": "ok", "loaded": True, "n_vectors": n,
143
- "dim": EMBED_DIM, "db_path": DB_PATH,
144
- # BLOB+Python-cosine is the intentional implementation path,
145
- # not a degraded fallback (see _get_db docstring). The previous
146
- # "sqlite-vec-fallback" label gave operators the wrong signal.
147
- "backend": "sqlite-vec"}
148
- except Exception as exc:
149
- return {"status": "degraded", "error": str(exc)}
150
-
151
-
152
- @app.post("/search")
153
- async def search(req: SearchRequest):
154
- if not req.query:
155
- return []
156
- try:
157
- embs = await _embed_batch([req.query])
158
- if not embs or embs[0] is None:
159
- raise HTTPException(status_code=502, detail="embed failed")
160
- q_vec = embs[0]
161
- except Exception as exc:
162
- raise HTTPException(status_code=502, detail=f"embed: {exc}")
163
-
164
- conn = _get_db()
165
- rows = conn.execute("SELECT id, text, embedding FROM chunks").fetchall()
166
- conn.close()
167
-
168
- # Cosine similarity in Python — fine for OSS / small corpora. For
169
- # large corpora: consider a dedicated vector DB.
170
- scored: list[tuple[float, str, str]] = []
171
- for rid, text, blob in rows:
172
- if not blob:
173
- continue
174
- v = _blob_to_vec(blob)
175
- if len(v) != len(q_vec):
176
- continue
177
- s = _cosine(q_vec, v)
178
- scored.append((s, rid, text))
179
- scored.sort(reverse=True)
180
- out = [
181
- {"path": rid, "text": text, "score": float(s),
182
- "source": "L4-sqlite-vec", "layer": "L4"}
183
- for s, rid, text in scored[: req.limit]
184
- ]
185
- return out
186
-
187
-
188
- @app.post("/index-batch")
189
- async def index_batch(req: IndexBatchRequest):
190
- if not req.records:
191
- return {"status": "ok", "inserted": 0}
192
- texts = [(r.get("text") or r.get("content") or "")[:8192] for r in req.records]
193
- t0 = time.perf_counter()
194
- # Shared-embed shortcut: caller (compat shim) computed vectors once
195
- # and forwards them so we skip the embed RPC. Length must match
196
- # records — defensive bail if it doesn't.
197
- if req.embeddings is not None and len(req.embeddings) == len(req.records):
198
- embs = req.embeddings
199
- else:
200
- embs = await _embed_batch(texts)
201
- embed_ms = (time.perf_counter() - t0) * 1000.0
202
-
203
- conn = _get_db()
204
- t1 = time.perf_counter()
205
- rows = []
206
- for r, emb, txt in zip(req.records, embs, texts):
207
- if not emb:
208
- continue
209
- rid = r.get("id") or hashlib.sha1(txt.encode("utf-8")).hexdigest()[:32]
210
- rows.append((rid, txt, _vec_to_blob(emb), time.time()))
211
- if rows:
212
- conn.executemany(
213
- "INSERT OR REPLACE INTO chunks(id, text, embedding, indexed_at) "
214
- "VALUES (?, ?, ?, ?)", rows,
215
- )
216
- conn.commit()
217
- insert_ms = (time.perf_counter() - t1) * 1000.0
218
- conn.close()
219
- return {"status": "ok", "inserted": len(rows),
220
- "embed_ms": round(embed_ms, 1), "insert_ms": round(insert_ms, 1)}
221
-
222
-
223
- @app.post("/refresh")
224
- def refresh():
225
- """No-op for sqlite-vec — writes are immediate. Kept for API parity."""
226
- return {"status": "ok", "noop": True}
227
-
228
-
229
- # ----------------------------------------------------------------------
230
- # /health/deep — synthetic round-trip
231
- # ----------------------------------------------------------------------
232
-
233
- # Fixed sentinel id used by /health/deep. Upserted on every probe call,
234
- # so the row is idempotent. Kept under id="__healthcheck__sentinel" so
235
- # the L4 corpus has at most one healthcheck row regardless of probe rate.
236
- _HEALTH_SENTINEL_ID = "__healthcheck__sentinel"
237
- _HEALTH_SENTINEL_TEXT = (
238
- "healthcheck sentinel — embed-write-search round-trip verifier"
239
- )
240
-
241
-
242
- @app.get("/health/deep")
243
- async def health_deep():
244
- """Real functional probe: embed → write → search the sentinel.
245
-
246
- Catches the class of failure that plain /health misses — broken
247
- embed paths, write 500s, query path bugs — i.e. exactly the bug
248
- shape that silently degraded L6 from v0.8.0 → v0.8.2.
249
-
250
- Returns:
251
- {status, embed_ms, write_ms, search_ms, hit, ok}
252
-
253
- `hit` confirms the sentinel was returned from search; `ok` is the
254
- aggregate `embed_ok AND write_ok AND hit`. HTTP status stays 200
255
- regardless so callers can read the body for diagnostics; status:
256
- field carries the verdict.
257
- """
258
- t_total = time.perf_counter()
259
- out: dict[str, Any] = {"status": "ok", "ok": True}
260
- try:
261
- t0 = time.perf_counter()
262
- embs = await _embed_batch([_HEALTH_SENTINEL_TEXT])
263
- out["embed_ms"] = round((time.perf_counter() - t0) * 1000.0, 1)
264
- if not embs or not embs[0]:
265
- out["status"] = "embed_failed"
266
- out["ok"] = False
267
- return out
268
- vec = embs[0]
269
- except Exception as exc:
270
- out["status"] = f"embed_error: {type(exc).__name__}"
271
- out["ok"] = False
272
- return out
273
-
274
- try:
275
- conn = _get_db()
276
- t1 = time.perf_counter()
277
- conn.execute(
278
- "INSERT OR REPLACE INTO chunks(id, text, embedding, indexed_at) "
279
- "VALUES (?, ?, ?, ?)",
280
- (_HEALTH_SENTINEL_ID, _HEALTH_SENTINEL_TEXT, _vec_to_blob(vec), time.time()),
281
- )
282
- conn.commit()
283
- out["write_ms"] = round((time.perf_counter() - t1) * 1000.0, 1)
284
-
285
- t2 = time.perf_counter()
286
- rows = conn.execute("SELECT id, embedding FROM chunks WHERE id = ?",
287
- (_HEALTH_SENTINEL_ID,)).fetchone()
288
- out["search_ms"] = round((time.perf_counter() - t2) * 1000.0, 1)
289
- conn.close()
290
- except Exception as exc:
291
- out["status"] = f"db_error: {type(exc).__name__}"
292
- out["ok"] = False
293
- return out
294
-
295
- out["hit"] = rows is not None
296
- if not out["hit"]:
297
- out["status"] = "sentinel_missing"
298
- out["ok"] = False
299
- out["total_ms"] = round((time.perf_counter() - t_total) * 1000.0, 1)
300
- return out
301
-
302
-
303
- # ----------------------------------------------------------------------
304
- # Entrypoint
305
- # ----------------------------------------------------------------------
306
-
307
- if __name__ == "__main__":
308
- parser = argparse.ArgumentParser()
309
- parser.add_argument("--port", type=int, default=int(os.environ.get("PORT", "8042")))
310
- parser.add_argument("--data-dir", default=None)
311
- args = parser.parse_args()
312
- if args.data_dir:
313
- os.environ["L4_DB_PATH"] = str(Path(args.data_dir) / "vec.db")
314
- import uvicorn
315
- uvicorn.run("server:app", host="0.0.0.0", port=args.port, log_level="info")