@pentatonic-ai/ai-agent-sdk 0.7.3 → 0.7.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@pentatonic-ai/ai-agent-sdk",
3
- "version": "0.7.3",
3
+ "version": "0.7.5",
4
4
  "description": "TES SDK — LLM observability and lifecycle tracking via Pentatonic Thing Event System. Track token usage, tool calls, and conversations. Manage things through event-sourced lifecycle stages with AI enrichment and vector search.",
5
5
  "type": "module",
6
6
  "main": "./dist/index.cjs",
@@ -55,6 +55,13 @@ L5_MILVUS_URL = os.environ.get("L5_MILVUS_URL", "http://l5:8035")
55
55
  L6_DOC_URL = os.environ.get("L6_DOC_URL", "http://l6:8037")
56
56
  NV_EMBED_URL = os.environ.get("NV_EMBED_URL", "http://nv-embed:8041/v1/embeddings")
57
57
 
58
+ # Neo4j has no /health endpoint, so the shim probes the HTTP transactional
59
+ # API with a trivial RETURN 1 — that confirms Neo4j is actually answering
60
+ # Cypher, not just serving HTTP. Auth shape is the same as L2 / docker-compose:
61
+ # "user/pass" string. Default matches the local-dev compose default.
62
+ NEO4J_AUTH = os.environ.get("NEO4J_AUTH", "neo4j/local-dev-pw")
63
+ NEO4J_DB = os.environ.get("NEO4J_DB", "neo4j")
64
+
58
65
  PORT = int(os.environ.get("PORT", "8099"))
59
66
  CLIENT_ID = os.environ.get("CLIENT_ID", "default")
60
67
 
@@ -204,8 +211,12 @@ async def _index_l4(records: list[dict[str, Any]]) -> int:
204
211
  return 0
205
212
 
206
213
 
207
- async def _index_l5(records: list[dict[str, Any]]) -> int:
208
- """Index records into the L5 Milvus comms layer (chats collection)."""
214
+ async def _index_l5(records: list[dict[str, Any]], arena: str = "general") -> int:
215
+ """Index records into the L5 Milvus comms layer (chats collection).
216
+
217
+ arena is forwarded as a Milvus dynamic field so /search can filter
218
+ by arena natively (vs the shim's defence-in-depth post-filter).
219
+ """
209
220
  payload = {
210
221
  "collection": "chats",
211
222
  "records": [
@@ -215,6 +226,7 @@ async def _index_l5(records: list[dict[str, Any]]) -> int:
215
226
  "source": (r.get("metadata") or {}).get("source", "shim"),
216
227
  "channel": "pentatonic-memory",
217
228
  "contact": (r.get("metadata") or {}).get("user", ""),
229
+ "arena": (r.get("metadata") or {}).get("arena") or arena,
218
230
  }
219
231
  for r in records
220
232
  ],
@@ -294,9 +306,82 @@ app = FastAPI(
294
306
  )
295
307
 
296
308
 
309
+ def _interpret_body_status(body: Any) -> str | None:
310
+ """Pull a layer's self-reported status out of its /health body.
311
+
312
+ Layers don't all use the same vocabulary — L4 says "ok"/"degraded",
313
+ L2 says "healthy"/"unavailable", some return nothing. Normalize to
314
+ "ok" or a short failure reason; None means the body didn't carry
315
+ a status field, in which case the HTTP code is the source of truth.
316
+ """
317
+ if not isinstance(body, dict):
318
+ return None
319
+ raw = body.get("status")
320
+ if raw is None:
321
+ return None
322
+ s = str(raw).lower()
323
+ if s in ("ok", "healthy"):
324
+ return "ok"
325
+ err = body.get("error") or body.get("reason") or ""
326
+ return f"{s}: {str(err)[:80]}" if err else s
327
+
328
+
329
+ async def _probe(url: str) -> str:
330
+ """Probe a layer /health endpoint and return a single-string verdict
331
+ that surfaces both transport-level failure and self-reported status."""
332
+ try:
333
+ r = await _client().get(url, timeout=3.0)
334
+ except Exception as exc:
335
+ return f"unreachable: {type(exc).__name__}"
336
+ if r.status_code != 200:
337
+ return f"http {r.status_code}"
338
+ try:
339
+ body_status = _interpret_body_status(r.json())
340
+ except Exception:
341
+ body_status = None
342
+ return body_status or "ok"
343
+
344
+
345
+ async def _probe_l3() -> str:
346
+ """Real Neo4j probe — POST a trivial Cypher via the HTTP transactional
347
+ API and require a 200 response. Confirms Neo4j is actually answering
348
+ queries, not just serving the Browser HTML on :7474.
349
+ """
350
+ user, _, password = NEO4J_AUTH.partition("/")
351
+ url = f"{L3_KG_URL}/db/{NEO4J_DB}/tx/commit"
352
+ try:
353
+ r = await _client().post(
354
+ url,
355
+ json={"statements": [{"statement": "RETURN 1"}]},
356
+ auth=(user, password),
357
+ timeout=3.0,
358
+ )
359
+ except Exception as exc:
360
+ return f"unreachable: {type(exc).__name__}"
361
+ if r.status_code != 200:
362
+ return f"http {r.status_code}"
363
+ try:
364
+ body = r.json()
365
+ # Neo4j tx/commit returns {"results":[...], "errors":[...]}.
366
+ # Any errors here means the DB is up but rejecting queries.
367
+ errs = body.get("errors") or []
368
+ if errs:
369
+ return f"cypher error: {str(errs[0])[:80]}"
370
+ except Exception:
371
+ return "non-json response"
372
+ return "ok"
373
+
374
+
297
375
  @app.get("/health")
298
376
  async def health():
299
- """Aggregate health across all 7 layers."""
377
+ """Aggregate health across all 7 layers.
378
+
379
+ Each layer's verdict is honest: it reflects whether the layer can
380
+ actually do its job, not just whether its HTTP server answers. The
381
+ shim reads the layer's body.status (when present) and degrades when
382
+ the layer self-reports a problem. L3 uses a real Cypher probe since
383
+ Neo4j has no /health route.
384
+ """
300
385
  out = {
301
386
  "status": "ok",
302
387
  "client": CLIENT_ID,
@@ -304,49 +389,43 @@ async def health():
304
389
  "engine": "pentatonic-memory-engine",
305
390
  "layers": {},
306
391
  }
307
- # L0 BM25 is in-process inside the L2 proxy (SQLite FTS5 is a library,
308
- # not a service). Reporting it via L2's /health.
309
- layer_health_endpoints = {
310
- "l2": f"{L2_PROXY_URL}/health", # also reports L0 status
311
- "l3": f"{L3_KG_URL}/health",
312
- "l4": f"{L4_VEC_URL}/health",
313
- "l5": f"{L5_MILVUS_URL}/health",
314
- "l6": f"{L6_DOC_URL}/health",
315
- # NV-Embed exposes both /health and /v1/embeddings; /health is enough.
316
- "nv_embed": NV_EMBED_URL.replace("/v1/embeddings", "/health"),
317
- }
318
- failures = 0
319
- for name, url in layer_health_endpoints.items():
320
- try:
321
- r = await _client().get(url, timeout=3.0)
322
- out["layers"][name] = "ok" if r.status_code == 200 else f"http {r.status_code}"
323
- if r.status_code != 200:
324
- failures += 1
325
- except Exception:
326
- out["layers"][name] = "unreachable"
327
- failures += 1
392
+ # NV-Embed exposes /health alongside /v1/embeddings.
393
+ nv_embed_health = NV_EMBED_URL.replace("/v1/embeddings", "/health")
394
+
395
+ import asyncio
396
+ l2_v, l4_v, l5_v, l6_v, nv_v, l3_v = await asyncio.gather(
397
+ _probe(f"{L2_PROXY_URL}/health"),
398
+ _probe(f"{L4_VEC_URL}/health"),
399
+ _probe(f"{L5_MILVUS_URL}/health"),
400
+ _probe(f"{L6_DOC_URL}/health"),
401
+ _probe(nv_embed_health),
402
+ _probe_l3(),
403
+ )
404
+
328
405
  # L0 BM25 (FTS5) and L1 (always-loaded core files) are both in-process
329
- # inside the L2 proxy. They have no separate health endpoint; if L2 is
330
- # responding, both are usable. Report them as "ok" tied to L2.
331
- raw_layers = out["layers"]
332
- l2_ok = raw_layers.get("l2") == "ok"
406
+ # inside the L2 proxy. They have no separate runtime; if L2 is healthy,
407
+ # both layers are usable. Tie their status to L2.
408
+ l2_ok = l2_v == "ok"
333
409
  out["layers"] = {
334
- "l0": "ok" if l2_ok else "unknown",
335
- "l1": "ok" if l2_ok else "unknown",
336
- "l2": raw_layers.get("l2", "unknown"),
337
- "l3": raw_layers.get("l3", "unknown"),
338
- "l4": raw_layers.get("l4", "unknown"),
339
- "l5": raw_layers.get("l5", "unknown"),
340
- "l6": raw_layers.get("l6", "unknown"),
341
- "nv_embed": raw_layers.get("nv_embed", "unknown"),
410
+ "l0": "ok" if l2_ok else l2_v,
411
+ "l1": "ok" if l2_ok else l2_v,
412
+ "l2": l2_v,
413
+ "l3": l3_v,
414
+ "l4": l4_v,
415
+ "l5": l5_v,
416
+ "l6": l6_v,
417
+ "nv_embed": nv_v,
342
418
  }
419
+ failures = sum(1 for v in out["layers"].values() if v != "ok")
343
420
  if failures:
344
421
  out["status"] = "degraded" if failures < 3 else "down"
345
- # Memory count: query L6 doc-store as authoritative
422
+
423
+ # Memory count: query L6 doc-store as authoritative.
346
424
  try:
347
425
  r = await _client().get(f"{L6_DOC_URL}/stats", timeout=3.0)
348
426
  if r.status_code == 200:
349
- out["memories"] = r.json().get("total_chunks", 0)
427
+ stats = r.json()
428
+ out["memories"] = stats.get("total_chunks") or stats.get("fts_chunks") or 0
350
429
  except Exception:
351
430
  out["memories"] = None
352
431
  return out
@@ -369,7 +448,7 @@ async def store(req: StoreRequest):
369
448
  import asyncio
370
449
  l4_count, l5_count, l6_count, l2_internal = await asyncio.gather(
371
450
  _index_l4([record]),
372
- _index_l5([record]),
451
+ _index_l5([record], arena=arena),
373
452
  _index_l6([record], arena=arena),
374
453
  _index_l2_internal([record], arena=arena),
375
454
  )
@@ -414,7 +493,7 @@ async def store_batch(req: StoreBatchRequest):
414
493
  import asyncio
415
494
  l4_count, l5_count, l6_count, l2_internal = await asyncio.gather(
416
495
  _index_l4(normalised),
417
- _index_l5(normalised),
496
+ _index_l5(normalised, arena=req.arena or "general"),
418
497
  _index_l6(normalised, arena=req.arena or "general"),
419
498
  _index_l2_internal(normalised, arena=req.arena or "general"),
420
499
  )
@@ -633,9 +712,12 @@ async def search(req: SearchRequest):
633
712
  out_results = _apply_metadata_filters(out_results, req)
634
713
  return {"results": out_results[: req.limit or 10]}
635
714
  try:
715
+ get_params: dict[str, Any] = {"q": req.query, "limit": _search_overfetch(req)}
716
+ if req.arena:
717
+ get_params["arena"] = req.arena
636
718
  r = await _client().get(
637
719
  f"{L2_PROXY_URL}/search",
638
- params={"q": req.query, "limit": _search_overfetch(req)},
720
+ params=get_params,
639
721
  timeout=30.0,
640
722
  )
641
723
  r.raise_for_status()
@@ -643,10 +725,16 @@ async def search(req: SearchRequest):
643
725
  except Exception as exc:
644
726
  last_err = exc
645
727
  try:
728
+ post_body: dict[str, Any] = {
729
+ "query": req.query,
730
+ "limit": _search_overfetch(req),
731
+ "min_score": req.min_score or 0.001,
732
+ }
733
+ if req.arena:
734
+ post_body["arena"] = req.arena
646
735
  r = await _client().post(
647
736
  f"{L2_PROXY_URL}/v1/search",
648
- json={"query": req.query, "limit": _search_overfetch(req),
649
- "min_score": req.min_score or 0.001},
737
+ json=post_body,
650
738
  timeout=30.0,
651
739
  )
652
740
  r.raise_for_status()
@@ -0,0 +1,60 @@
1
+ # docker-compose.test.yml — overlay for hermetic CI runs.
2
+ #
3
+ # Replaces the nv-embed GPU service with a deterministic embedding
4
+ # stub that mimics both the OpenAI /v1/embeddings shape and the
5
+ # lambda-gateway /v1/embed shape. Lets CI exercise every layer's
6
+ # vector path without an actual model.
7
+ #
8
+ # Usage:
9
+ # docker compose -f docker-compose.yml -f docker-compose.test.yml \
10
+ # up -d --wait l3 l4 l5 l6 l2 compat embed-stub
11
+ #
12
+ # The base nv-embed service is intentionally NOT started in CI
13
+ # (requires a GPU). l4/l5/l6 are pointed at embed-stub via env.
14
+
15
+ services:
16
+ embed-stub:
17
+ build:
18
+ context: ./tests/embed_stub
19
+ dockerfile: Dockerfile
20
+ container_name: pme-embed-stub
21
+ networks:
22
+ - engine-net
23
+ environment:
24
+ EMBED_DIM: "4096"
25
+ healthcheck:
26
+ test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:8041/health',timeout=3)"]
27
+ interval: 5s
28
+ timeout: 3s
29
+ retries: 20
30
+ start_period: 5s
31
+
32
+ l4:
33
+ environment:
34
+ L4_NV_EMBED_URL: http://embed-stub:8041/v1/embeddings
35
+ L4_EMBED_API_KEY: ""
36
+
37
+ l5:
38
+ environment:
39
+ L5_NV_EMBED_URL: http://embed-stub:8041/v1/embeddings
40
+ L5_EMBED_API_KEY: ""
41
+
42
+ l6:
43
+ environment:
44
+ L6_NV_EMBED_URL: http://embed-stub:8041/v1/embeddings
45
+ L6_EMBED_API_KEY: ""
46
+
47
+ l2:
48
+ environment:
49
+ PME_NV_EMBED_URL: http://embed-stub:8041/v1/embeddings
50
+
51
+ compat:
52
+ environment:
53
+ NV_EMBED_URL: http://embed-stub:8041/v1/embeddings
54
+ depends_on:
55
+ embed-stub:
56
+ condition: service_healthy
57
+ l2: { condition: service_started }
58
+ l4: { condition: service_started }
59
+ l5: { condition: service_started }
60
+ l6: { condition: service_started }
@@ -719,12 +719,17 @@ L0_MEMORY_DB = Path(os.environ.get(
719
719
  str(Path.home() / ".pentatonic" / "memory" / "main.sqlite"),
720
720
  ))
721
721
 
722
- def search_l0_bm25(query: str, limit: int = 6) -> List[Dict]:
722
+ def search_l0_bm25(query: str, limit: int = 6, arena: str = None) -> List[Dict]:
723
723
  """Search native BM25 index over workspace memory files.
724
-
724
+
725
725
  Covers chunks from daily notes, memory files, people profiles,
726
726
  infrastructure docs, project files — corpus that L3-L6 don't index.
727
727
  Sub-millisecond local SQLite reads, zero network overhead.
728
+
729
+ arena (optional): when set, filter to paths under bench/<arena>/.
730
+ Records stored via the compat shim land under that prefix per
731
+ _stash_all_keys; this is the L0 path-based equivalent of the
732
+ arena dynamic-field filter on L5/L6.
728
733
  """
729
734
  if not L0_MEMORY_DB.exists():
730
735
  return []
@@ -741,16 +746,21 @@ def search_l0_bm25(query: str, limit: int = 6) -> List[Dict]:
741
746
 
742
747
  conn = sqlite3.connect(str(L0_MEMORY_DB), timeout=2)
743
748
  conn.execute("PRAGMA journal_mode=WAL")
744
- rows = conn.execute("""
749
+ sql = """
745
750
  SELECT path, text, bm25(chunks_fts) as rank
746
751
  FROM chunks_fts
747
752
  WHERE chunks_fts MATCH ?
748
753
  AND path NOT LIKE '%/snapshots/%'
749
754
  AND path NOT LIKE '%/archive/%'
750
755
  AND path NOT LIKE '%-backup-%'
751
- ORDER BY rank ASC
752
- LIMIT ?
753
- """, (fts_query, limit * 2)).fetchall()
756
+ """
757
+ params: list = [fts_query]
758
+ if arena:
759
+ sql += " AND path LIKE ?"
760
+ params.append(f"bench/{arena}/%")
761
+ sql += " ORDER BY rank ASC LIMIT ?"
762
+ params.append(limit * 2)
763
+ rows = conn.execute(sql, params).fetchall()
754
764
  conn.close()
755
765
 
756
766
  results = []
@@ -761,12 +771,20 @@ def search_l0_bm25(query: str, limit: int = 6) -> List[Dict]:
761
771
  seen_paths.add(path)
762
772
  relevance = -rank if rank < 0 else 0.001
763
773
  score = min(relevance / (1 + relevance) * 0.85, 0.75)
774
+ # Parse arena from path (bench/<arena>/...) so downstream
775
+ # consumers can read it directly without parsing again.
776
+ row_arena = ""
777
+ if path.startswith("bench/"):
778
+ parts = path.split("/", 2)
779
+ if len(parts) >= 3:
780
+ row_arena = parts[1]
764
781
  results.append({
765
782
  "path": f"L0/{path}",
766
783
  "snippet": text[:500],
767
784
  "score": round(score, 4),
768
785
  "layer": "L0_workspace_bm25",
769
786
  "source": path,
787
+ "arena": row_arena,
770
788
  })
771
789
  if len(results) >= limit:
772
790
  break
@@ -782,12 +800,20 @@ def search_l0_bm25(query: str, limit: int = 6) -> List[Dict]:
782
800
 
783
801
  L5_API_URL = os.environ.get("PME_L5_URL", "http://127.0.0.1:8034")
784
802
 
785
- def search_l5_communications(query: str, limit: int = 6) -> List[Dict]:
786
- """Search L5 Communications Context via L5 API (emails, chats, calendar)."""
803
+ def search_l5_communications(query: str, limit: int = 6, arena: str = None) -> List[Dict]:
804
+ """Search L5 Communications Context via L5 API (emails, chats, calendar).
805
+
806
+ arena (optional): forwarded to L5; filters Milvus by the arena
807
+ dynamic field. Records id is included in the result so callers
808
+ can attach metadata via the shim's _META_CACHE.
809
+ """
787
810
  try:
811
+ params: dict = {"q": query, "limit": limit}
812
+ if arena:
813
+ params["arena"] = arena
788
814
  resp = requests.get(
789
815
  f"{L5_API_URL}/search",
790
- params={"q": query, "limit": limit},
816
+ params=params,
791
817
  timeout=10,
792
818
  )
793
819
  if resp.status_code != 200:
@@ -804,10 +830,15 @@ def search_l5_communications(query: str, limit: int = 6) -> List[Dict]:
804
830
  continue # skip low relevance
805
831
  contact = hit.get("contact", "")
806
832
  channel = hit.get("channel", "")
807
- path_label = f"L5/{source}"
808
- if contact:
833
+ hit_id = hit.get("id", "")
834
+ # Use record id as path label so the shim can attach
835
+ # metadata via _META_CACHE; falls back to source label
836
+ # for legacy records that have no id.
837
+ path_label = hit_id or f"L5/{source}"
838
+ if not hit_id and contact:
809
839
  path_label = f"L5/{channel}/{contact}"
810
840
  results.append({
841
+ "id": hit_id,
811
842
  "path": path_label,
812
843
  "snippet": hit.get("text", "")[:500],
813
844
  "score": scaled_score,
@@ -815,6 +846,7 @@ def search_l5_communications(query: str, limit: int = 6) -> List[Dict]:
815
846
  "source": source,
816
847
  "collection": hit.get("collection", ""),
817
848
  "timestamp": hit.get("timestamp", ""),
849
+ "arena": hit.get("arena", ""),
818
850
  })
819
851
  return results
820
852
  except Exception as e:
@@ -825,12 +857,19 @@ def search_l5_communications(query: str, limit: int = 6) -> List[Dict]:
825
857
  # L6: Document Store Search
826
858
  L6_URL = os.environ.get("PME_L6_URL", "http://localhost:8037")
827
859
 
828
- def search_l6_documents(query: str, limit: int = 6) -> List[Dict]:
829
- """Search L6 Document Store (research, legal, financial, project docs)."""
860
+ def search_l6_documents(query: str, limit: int = 6, arena: str = None) -> List[Dict]:
861
+ """Search L6 Document Store (research, legal, financial, project docs).
862
+
863
+ arena (optional): forwarded to L6 — L6 already supports arena
864
+ natively (see l6-document-store.py search_vector / search_fts).
865
+ """
830
866
  try:
867
+ params: dict = {"q": query, "method": "hybrid", "limit": limit, "rerank": "true"}
868
+ if arena:
869
+ params["arena"] = arena
831
870
  resp = requests.get(
832
871
  f"{L6_URL}/search",
833
- params={"q": query, "method": "hybrid", "limit": limit, "rerank": "true"},
872
+ params=params,
834
873
  timeout=10,
835
874
  )
836
875
  if resp.status_code != 200:
@@ -875,13 +914,19 @@ def search_l6_documents(query: str, limit: int = 6) -> List[Dict]:
875
914
  return []
876
915
 
877
916
 
878
- def sequential_hybridrag_search(query: str, limit: int = 16) -> List[Dict]:
879
- """Main HybridRAG processing: L0 BM25 → L1 System Files → L2 HybridRAG (L3 Graph + L4 Vector + L5 Comms + L6 Docs)."""
917
+ def sequential_hybridrag_search(query: str, limit: int = 16, arena: str = None) -> List[Dict]:
918
+ """Main HybridRAG processing: L0 BM25 → L1 System Files → L2 HybridRAG (L3 Graph + L4 Vector + L5 Comms + L6 Docs).
919
+
920
+ arena (optional): tenant scope. Forwarded to L0 (path-prefix
921
+ filter), L5 (Milvus dynamic-field filter), L6 (native arena).
922
+ L4 vector and L3 graph don't yet support native arena filtering;
923
+ the compat shim post-filter catches those before they leak out.
924
+ """
880
925
  start_time = time.time()
881
- log.info(f"Starting sequential HybridRAG search for: '{query}'")
926
+ log.info(f"Starting sequential HybridRAG search for: '{query}' arena={arena!r}")
882
927
 
883
928
  # L0: BM25 workspace memory (keyword search — complements semantic layers)
884
- l0_results = search_l0_bm25(query, limit=6)
929
+ l0_results = search_l0_bm25(query, limit=6, arena=arena)
885
930
  log.info(f"L0 BM25 workspace: {len(l0_results)} results")
886
931
 
887
932
  # L1: System Files (HIGHEST PRIORITY)
@@ -902,11 +947,11 @@ def sequential_hybridrag_search(query: str, limit: int = 16) -> List[Dict]:
902
947
  log.info(f"L4 Vector search: {len(vector_results)} results (HyDE={'on' if hyde_query != query else 'off'})")
903
948
 
904
949
  # L5: Communications Context (emails, chats, calendar) — also use HyDE
905
- l5_results = search_l5_communications(hyde_query, limit=6)
950
+ l5_results = search_l5_communications(hyde_query, limit=6, arena=arena)
906
951
  log.info(f"L5 Communications: {len(l5_results)} results")
907
952
 
908
953
  # L6: Document Store (research, legal, financial, project docs)
909
- l6_results = search_l6_documents(hyde_query, limit=6)
954
+ l6_results = search_l6_documents(hyde_query, limit=6, arena=arena)
910
955
  log.info(f"L6 Documents: {len(l6_results)} results")
911
956
 
912
957
  # L2: HybridRAG fusion (combines all layers with L1 priority)
@@ -966,10 +1011,11 @@ async def search_endpoint(request: Request) -> dict:
966
1011
  body = await request.json()
967
1012
  query = body.get("query", "")
968
1013
  limit = body.get("limit", 16)
1014
+ arena = body.get("arena") or None
969
1015
  if not query:
970
1016
  raise HTTPException(status_code=400, detail="query is required")
971
1017
 
972
- results = sequential_hybridrag_search(query, limit=limit)
1018
+ results = sequential_hybridrag_search(query, limit=limit, arena=arena)
973
1019
 
974
1020
  # Also return raw graph entities for context enrichment
975
1021
  entities = extract_query_entities(query)
@@ -1150,8 +1196,17 @@ def _check_l6_health() -> bool:
1150
1196
 
1151
1197
  @app.get("/health")
1152
1198
  async def health() -> dict:
1153
- """System health check."""
1199
+ """System health check.
1200
+
1201
+ Reports "ok" iff every layer L2 directly owns is healthy: L0 BM25
1202
+ (SQLite FTS5 file), L4 QMD vector store (sqlite file), and the
1203
+ Neo4j connection. L5/L6 reachability is reported informationally
1204
+ only — the compat shim probes them directly. Ollama is no longer
1205
+ a hard dependency anywhere; the engine uses the configured
1206
+ NV_EMBED_URL via _embed_post helpers in each layer.
1207
+ """
1154
1208
  qmd_healthy = os.path.exists(QMD_DB_PATH)
1209
+ l0_healthy = L0_MEMORY_DB.exists()
1155
1210
 
1156
1211
  neo4j_healthy = False
1157
1212
  try:
@@ -1163,25 +1218,26 @@ async def health() -> dict:
1163
1218
  except Exception as e:
1164
1219
  logging.debug(f"Suppressed: {e}")
1165
1220
 
1166
- ollama_healthy = False
1167
- try:
1168
- r = requests.get("http://localhost:11434/api/tags", timeout=5)
1169
- ollama_healthy = r.status_code == 200
1170
- except Exception as e:
1171
- logging.debug(f"Suppressed: {e}")
1221
+ l5_reachable = _check_l5_health()
1222
+ l6_reachable = _check_l6_health()
1223
+
1224
+ # Top-level status: degrade only on layers L2 is the sole gatekeeper for.
1225
+ # L5/L6 are independent services probed by the compat shim.
1226
+ must_be_ok = [l0_healthy, qmd_healthy, neo4j_healthy]
1227
+ overall = "ok" if all(must_be_ok) else "degraded"
1172
1228
 
1173
1229
  return {
1230
+ "status": overall,
1174
1231
  "proxy": "healthy",
1175
1232
  "architecture": "sequential-hybridrag-proper-layers",
1176
1233
  "layers": {
1177
- "L0_workspace_bm25": {"status": "healthy" if L0_MEMORY_DB.exists() else "unavailable", "backend": "sqlite-fts5"},
1234
+ "L0_workspace_bm25": {"status": "healthy" if l0_healthy else "unavailable", "backend": "sqlite-fts5"},
1178
1235
  "L1_system_files": {"status": "healthy", "description": "MEMORY.md, plans.md, daily notes"},
1179
1236
  "L2_hybridrag": {"status": "healthy", "description": "Orchestrates L3+L4 fusion"},
1180
1237
  "L3_graph_search": {"status": "healthy" if neo4j_healthy else "unavailable", "backend": "neo4j"},
1181
- "L4_vector_search": {"status": "healthy" if qmd_healthy else "unavailable", "backend": "qmd+ollama"},
1182
- "L5_communications": {"status": "healthy" if _check_l5_health() else "unavailable", "backend": "sqlite+ollama"},
1183
- "L6_document_store": {"status": "healthy" if _check_l6_health() else "unavailable", "backend": "milvus+fts5+reranker", "port": 8037},
1184
- "ollama_embeddings": {"status": "healthy" if ollama_healthy else "unavailable"}
1238
+ "L4_vector_search": {"status": "healthy" if qmd_healthy else "unavailable", "backend": "qmd"},
1239
+ "L5_communications": {"status": "healthy" if l5_reachable else "unavailable", "backend": "milvus"},
1240
+ "L6_document_store": {"status": "healthy" if l6_reachable else "unavailable", "backend": "milvus+fts5+reranker", "port": 8037},
1185
1241
  }
1186
1242
  }
1187
1243
 
@@ -449,8 +449,13 @@ def index_memory(client):
449
449
 
450
450
  # --- Search ---
451
451
 
452
- def search(query: str, collection: str = None, limit: int = 10):
453
- """Search across collections."""
452
+ def search(query: str, collection: str = None, limit: int = 10, arena: str = None):
453
+ """Search across collections.
454
+
455
+ arena (optional): when set, filter to records whose arena dynamic
456
+ field matches. Records indexed before arena was added carry no
457
+ arena field — those are dropped under multi-tenant safety.
458
+ """
454
459
  client = get_client()
455
460
  vectors = embed_texts([query])
456
461
  if not vectors or all(v == 0.0 for v in vectors[0]):
@@ -460,6 +465,12 @@ def search(query: str, collection: str = None, limit: int = 10):
460
465
  collections = [collection] if collection else ["chats", "emails", "contacts", "memory"]
461
466
  all_results = []
462
467
 
468
+ filter_expr = ""
469
+ if arena:
470
+ # Escape double quotes; Milvus filter syntax for dynamic fields.
471
+ safe = str(arena).replace('"', '\\"')
472
+ filter_expr = f'arena == "{safe}"'
473
+
463
474
  for coll in collections:
464
475
  if not client.has_collection(coll):
465
476
  continue
@@ -468,12 +479,14 @@ def search(query: str, collection: str = None, limit: int = 10):
468
479
  collection_name=coll,
469
480
  data=[vectors[0]],
470
481
  limit=limit,
471
- output_fields=["text", "source", "channel", "contact", "timestamp"],
482
+ filter=filter_expr,
483
+ output_fields=["text", "source", "channel", "contact", "timestamp", "arena"],
472
484
  )
473
485
  for hits in results:
474
486
  for hit in hits:
475
487
  entity = hit.get("entity", {})
476
488
  all_results.append({
489
+ "id": hit.get("id", ""),
477
490
  "collection": coll,
478
491
  "score": round(hit.get("distance", 0), 4),
479
492
  "text": entity.get("text", ""),
@@ -481,6 +494,7 @@ def search(query: str, collection: str = None, limit: int = 10):
481
494
  "channel": entity.get("channel", ""),
482
495
  "contact": entity.get("contact", ""),
483
496
  "timestamp": entity.get("timestamp", ""),
497
+ "arena": entity.get("arena", ""),
484
498
  })
485
499
  except Exception as e:
486
500
  print(f" Search error in {coll}: {e}")
@@ -492,28 +506,28 @@ def search(query: str, collection: str = None, limit: int = 10):
492
506
  # --- Health / Stats ---
493
507
 
494
508
  def health():
495
- """Check L5 health."""
509
+ """Check L5 health.
510
+
511
+ Reports "ok" iff the Milvus client can list collections — that's
512
+ L5's actual data plane. Embeddings are intentionally NOT probed
513
+ here: that's a separate concern reported by the compat shim's
514
+ nv_embed entry. Probing an external embedding endpoint on every
515
+ /health adds latency and false negatives for layers that only
516
+ embed on demand.
517
+ """
496
518
  try:
497
519
  client = get_client()
498
520
  collections = ["chats", "emails", "contacts", "memory"]
499
- status = {"status": "ok", "db_path": DB_PATH, "collections": {}}
521
+ out = {"status": "ok", "db_path": DB_PATH, "collections": {}}
500
522
  for coll in collections:
501
523
  if client.has_collection(coll):
502
524
  stats = client.get_collection_stats(coll)
503
525
  count = stats.get("row_count", 0)
504
- status["collections"][coll] = {"exists": True, "count": count}
526
+ out["collections"][coll] = {"exists": True, "count": count}
505
527
  else:
506
- status["collections"][coll] = {"exists": False, "count": 0}
507
- total = sum(c["count"] for c in status["collections"].values())
508
- status["total_chunks"] = total
509
- # Check embeddings
510
- try:
511
- r = httpx.get("http://localhost:11434/api/tags", timeout=3)
512
- models = [m["name"] for m in r.json().get("models", [])]
513
- status["embeddings"] = EMBED_MODEL in str(models)
514
- except Exception:
515
- status["embeddings"] = False
516
- return status
528
+ out["collections"][coll] = {"exists": False, "count": 0}
529
+ out["total_chunks"] = sum(c["count"] for c in out["collections"].values())
530
+ return out
517
531
  except Exception as e:
518
532
  return {"status": "error", "error": str(e)}
519
533
 
@@ -547,8 +561,9 @@ def serve(port=8034):
547
561
  return health()
548
562
 
549
563
  @api.get("/search")
550
- def api_search(q: str = Query(...), collection: str = None, limit: int = 10):
551
- results = search(q, collection=collection, limit=limit)
564
+ def api_search(q: str = Query(...), collection: str = None, limit: int = 10,
565
+ arena: str = None):
566
+ results = search(q, collection=collection, limit=limit, arena=arena)
552
567
  return {"query": q, "results": results, "count": len(results)}
553
568
 
554
569
  @api.get("/stats")
@@ -618,6 +633,10 @@ def serve(port=8034):
618
633
  "channel": (r.get("channel") or "")[:64],
619
634
  "contact": (r.get("contact") or "")[:256],
620
635
  "timestamp": (r.get("timestamp") or _now)[:32],
636
+ # arena lands in the dynamic-field section of the
637
+ # collection (enable_dynamic_field=True). Filterable
638
+ # via `arena == "..."` in /search.
639
+ "arena": (r.get("arena") or "general")[:64],
621
640
  })
622
641
  t1 = _time.time()
623
642
  if rows:
@@ -94,35 +94,13 @@ log = logging.getLogger("l6-document-store")
94
94
  _embed_client = httpx.Client(timeout=60)
95
95
 
96
96
  def embed_text(text: str) -> List[float]:
97
- """Get embedding NV-Embed-v2 primary, Ollama fallback."""
98
- if NV_EMBED_ENABLED:
99
- try:
100
- resp = _embed_client.post(NV_EMBED_URL, json={"input": text[:4000]})
101
- resp.raise_for_status()
102
- return resp.json()["data"][0]["embedding"]
103
- except Exception as e:
104
- log.warning(f"NV-Embed-v2 failed, falling back to Ollama: {e}")
105
-
106
- # Ollama fallback
107
- resp = _embed_client.post(
108
- f"{OLLAMA_URL}/api/embeddings",
109
- json={"model": EMBED_MODEL, "prompt": text[:8000]},
110
- )
111
- resp.raise_for_status()
112
- return resp.json()["embedding"]
97
+ """Single-text embed via _embed_post (OpenAI-compat first, lambda-gateway fallback)."""
98
+ return _embed_post([text[:8000]])[0]
113
99
 
114
100
 
115
101
  def embed_batch(texts: List[str]) -> List[List[float]]:
116
- """Embed a batch of texts — NV-Embed-v2 supports native batching."""
117
- if NV_EMBED_ENABLED:
118
- try:
119
- resp = _embed_client.post(NV_EMBED_URL, json={"input": [t[:4000] for t in texts]})
120
- resp.raise_for_status()
121
- return [d["embedding"] for d in resp.json()["data"]]
122
- except Exception as e:
123
- log.warning(f"NV-Embed-v2 batch failed, falling back to sequential: {e}")
124
-
125
- return [embed_text(t) for t in texts]
102
+ """Batched embed via _embed_post."""
103
+ return _embed_post([t[:8000] for t in texts])
126
104
 
127
105
  # ---------------------------------------------------------------------------
128
106
  # Cross-Encoder Reranker
@@ -767,41 +745,40 @@ def get_stats() -> Dict:
767
745
 
768
746
 
769
747
def health() -> Dict:
    """Health check.

    Reports "ok" iff Milvus and the FTS sidecar both answer. Embeddings
    are NOT probed here — the compat shim's nv_embed entry covers that.
    Ollama was a legacy fallback that is not used in any deployment, so
    its previous probe was a false negative on prod.

    Returns:
        Dict with per-dependency status strings plus an overall "status"
        of "ok" or "degraded".
    """
    out = {"status": "ok", "milvus": "unknown", "fts": "unknown", "reranker": "unknown"}

    # Milvus — vector store
    try:
        client = get_milvus()
        colls = client.list_collections()
        out["milvus"] = f"ok ({len(colls)} collections)"
    except Exception as e:
        out["milvus"] = f"error: {e}"
        out["status"] = "degraded"

    # FTS — keyword fallback over the same chunk set. close() runs in a
    # finally block so the connection is not leaked when the query raises
    # (the previous version only closed on the success path).
    conn = None
    try:
        conn = get_fts_db()
        cnt = conn.execute("SELECT COUNT(*) FROM chunks").fetchone()[0]
        out["fts"] = f"ok ({cnt} chunks)"
    except Exception as e:
        out["fts"] = f"error: {e}"
        out["status"] = "degraded"
    finally:
        if conn is not None:
            try:
                conn.close()
            except Exception:
                pass  # best-effort cleanup; status already recorded above

    # Reranker — informational; CPU fallback to RRF is acceptable, so
    # don't degrade overall status when it's unavailable.
    reranker = get_reranker()
    out["reranker"] = "loaded" if reranker else "unavailable (CPU fallback to RRF)"

    return out
805
782
 
806
783
  # ---------------------------------------------------------------------------
807
784
  # FastAPI Server
@@ -0,0 +1,144 @@
1
#!/usr/bin/env bash
# e2e_arena.sh — multi-tenant store/retrieve smoke test against a live
# memory-engine stack. Exercises /store, arena-scoped /search, and
# /forget end-to-end across L0/L4/L5/L6 + the compat shim.
#
# Run after `docker compose -f docker-compose.yml -f docker-compose.test.yml \
# up -d --wait l3 l4 l5 l6 l2 compat embed-stub`.
set -eu
# NOTE(review): with `set -e`, a failing `curl -sf` inside a command
# substitution (e.g. R1=$(post ...)) aborts the script before the
# corresponding ok/fail check runs — such failures exit hard instead of
# being counted in FAIL. Possibly intentional fail-fast; confirm.

BASE="${BASE:-http://localhost:8099}"
WAIT_HEALTH_SECS="${WAIT_HEALTH_SECS:-180}"
PASS=0
FAIL=0

# Tally helpers: print a marker and bump the respective counter.
ok() { echo " ✅ $1"; PASS=$((PASS+1)); }
fail() { echo " ❌ $1"; FAIL=$((FAIL+1)); }

# ---------------------------------------------------------------------------
# Wait for the compat shim to come up. Its /health aggregates layer
# health; we accept "ok" or "degraded" (l3 cosmetic 404 is known and
# doesn't block functional paths).
# ---------------------------------------------------------------------------

echo "=== waiting for $BASE/health (up to ${WAIT_HEALTH_SECS}s) ==="
deadline=$(( $(date +%s) + WAIT_HEALTH_SECS ))
while :; do
  if H=$(curl -sf --max-time 5 "$BASE/health"); then
    s=$(echo "$H" | python3 -c 'import json,sys; print(json.load(sys.stdin).get("status",""))')
    if [ "$s" = "ok" ] || [ "$s" = "degraded" ]; then
      echo " health: $s"
      break
    fi
  fi
  if [ "$(date +%s)" -ge "$deadline" ]; then
    echo " ❌ engine never became healthy"
    exit 1
  fi
  sleep 3
done

# ---------------------------------------------------------------------------
# /store — two arenas, two distinct documents per arena.
# ---------------------------------------------------------------------------

echo ""
echo "=== /store ==="
# post BODY — JSON-POST a document to /store; prints the response body.
post() {
  curl -sf -X POST "$BASE/store" \
    -H "Content-Type: application/json" \
    -d "$1"
}

R1=$(post '{"content":"Alpha team owns project Atlas","metadata":{"arena":"e2e-tenant-a","kind":"note","probe":"e2e-arena"}}')
R2=$(post '{"content":"Alpha team owns project Borealis","metadata":{"arena":"e2e-tenant-b","kind":"note","probe":"e2e-arena"}}')
R3=$(post '{"content":"Bravo team owns project Cobalt","metadata":{"arena":"e2e-tenant-a","kind":"note","probe":"e2e-arena"}}')
R4=$(post '{"content":"Bravo team owns project Diamond","metadata":{"arena":"e2e-tenant-b","kind":"note","probe":"e2e-arena"}}')

[ -n "$R1" ] && [ -n "$R2" ] && [ -n "$R3" ] && [ -n "$R4" ] \
  && ok "stored 4 docs across 2 arenas" \
  || fail "store"

# Indexing is async on some layers — give the stack a brief settle.
sleep 4

# ---------------------------------------------------------------------------
# /search — arena scoping. tenant-a should never see Borealis/Diamond,
# tenant-b should never see Atlas/Cobalt.
# ---------------------------------------------------------------------------

echo ""
echo "=== /search arena=e2e-tenant-a ==="
SA=$(curl -sf -X POST "$BASE/search" -H "Content-Type: application/json" \
  -d '{"query":"team project","limit":20,"arena":"e2e-tenant-a"}')
echo " hits: $(echo "$SA" | python3 -c 'import json,sys; print(len(json.load(sys.stdin).get("results",[])))')"

# Count cross-tenant leakage: any tenant-b project name in tenant-a hits.
leak_a=$(echo "$SA" | python3 -c '
import json,sys
data=json.load(sys.stdin).get("results",[])
bad=[r for r in data if "Borealis" in r.get("content","") or "Diamond" in r.get("content","")]
print(len(bad))')
[ "$leak_a" = "0" ] && ok "tenant-a: no Borealis/Diamond leakage" \
  || fail "tenant-a leaked $leak_a tenant-b docs"

# Recall check: tenant-a's own doc must come back.
found_atlas=$(echo "$SA" | python3 -c '
import json,sys
data=json.load(sys.stdin).get("results",[])
print("yes" if any("Atlas" in r.get("content","") for r in data) else "no")')
[ "$found_atlas" = "yes" ] && ok "tenant-a: Atlas recovered" \
  || fail "tenant-a missing Atlas"

echo ""
echo "=== /search arena=e2e-tenant-b ==="
SB=$(curl -sf -X POST "$BASE/search" -H "Content-Type: application/json" \
  -d '{"query":"team project","limit":20,"arena":"e2e-tenant-b"}')
echo " hits: $(echo "$SB" | python3 -c 'import json,sys; print(len(json.load(sys.stdin).get("results",[])))')"

# Mirror of the leakage check in the other direction.
leak_b=$(echo "$SB" | python3 -c '
import json,sys
data=json.load(sys.stdin).get("results",[])
bad=[r for r in data if "Atlas" in r.get("content","") or "Cobalt" in r.get("content","")]
print(len(bad))')
[ "$leak_b" = "0" ] && ok "tenant-b: no Atlas/Cobalt leakage" \
  || fail "tenant-b leaked $leak_b tenant-a docs"

found_borealis=$(echo "$SB" | python3 -c '
import json,sys
data=json.load(sys.stdin).get("results",[])
print("yes" if any("Borealis" in r.get("content","") for r in data) else "no")')
[ "$found_borealis" = "yes" ] && ok "tenant-b: Borealis recovered" \
  || fail "tenant-b missing Borealis"

# ---------------------------------------------------------------------------
# /search with metadata_filter — arena+probe combo should still scope.
# ---------------------------------------------------------------------------

echo ""
echo "=== /search metadata_filter probe=e2e-arena ==="
SF=$(curl -sf -X POST "$BASE/search" -H "Content-Type: application/json" \
  -d '{"query":"team","limit":20,"arena":"e2e-tenant-a","metadata_filter":{"probe":"e2e-arena"}}')
# `ok and data` — an empty result set counts as a failure, not a pass.
all_match=$(echo "$SF" | python3 -c '
import json,sys
data=json.load(sys.stdin).get("results",[])
ok=all(r.get("metadata",{}).get("probe")=="e2e-arena" and r.get("metadata",{}).get("arena")=="e2e-tenant-a" for r in data)
print("yes" if ok and data else "no")')
[ "$all_match" = "yes" ] && ok "metadata_filter scopes to probe + arena" \
  || fail "metadata_filter let other rows through"

# ---------------------------------------------------------------------------
# /forget — by metadata_contains. Cleans up so reruns are idempotent.
# ---------------------------------------------------------------------------

echo ""
echo "=== /forget probe=e2e-arena ==="
F=$(curl -sf -X POST "$BASE/forget" -H "Content-Type: application/json" \
  -d '{"metadata_contains":{"probe":"e2e-arena"}}')
deleted=$(echo "$F" | python3 -c 'import json,sys; print(json.load(sys.stdin).get("deleted",0))')
echo " deleted: $deleted"
[ "$deleted" -ge "1" ] && ok "/forget removed at least 1 row" || fail "/forget"

echo ""
echo "=== Result ==="
echo " PASS: $PASS"
echo " FAIL: $FAIL"
# NOTE(review): exit status mirrors the fail count; shells truncate exit
# codes to 0–255, which is fine for this script's scale of checks.
exit $FAIL
@@ -0,0 +1,13 @@
1
# Minimal image for the deterministic embedding stub used in hermetic CI.
FROM python:3.12-slim

WORKDIR /app

# NOTE(review): fastapi/uvicorn/pydantic are unpinned — image rebuilds can
# drift across CI runs; consider pinning versions for reproducibility.
RUN pip install --no-cache-dir fastapi "uvicorn[standard]" pydantic

COPY server.py /app/server.py

# Default vector width; server.py reads EMBED_DIM from the environment at
# import time.
ENV EMBED_DIM=4096

EXPOSE 8041

CMD ["python", "-m", "uvicorn", "server:app", "--host", "0.0.0.0", "--port", "8041"]
@@ -0,0 +1,80 @@
1
+ """Deterministic embedding stub for hermetic CI runs.
2
+
3
+ Returns a fixed-dim vector per input string, derived from a hash so the
4
+ same text always maps to the same vector. Cosine similarity is 1.0 only
5
+ for identical input strings; any differing strings map to near-orthogonal
6
+ vectors (similarity ≈ 0) — enough to exercise the engine's vector search
7
+ paths in CI without an actual embedding model.
8
+
9
+ Speaks both shapes the engine uses:
10
+ POST /v1/embeddings { input, model } -> { data:[{embedding:[...] }] }
11
+ POST /v1/embed { input, model } -> { embeddings:[[...]] }
12
+
13
+ Run:
14
+ EMBED_DIM=4096 uvicorn server:app --host 0.0.0.0 --port 8041
15
+ """
16
+ from __future__ import annotations
17
+
18
+ import hashlib
19
+ import math
20
+ import os
21
+ from typing import Any
22
+
23
+ from fastapi import FastAPI
24
+ from pydantic import BaseModel
25
+
26
+ EMBED_DIM = int(os.environ.get("EMBED_DIM", "4096"))
27
+
28
+ app = FastAPI(title="embed-stub")
29
+
30
+
31
class EmbedRequest(BaseModel):
    """Request body shared by /v1/embeddings and /v1/embed.

    `input` may be a single string or a list (coerced by _normalise_inputs);
    `model` is accepted for OpenAI-API compatibility and only echoed back.
    """
    input: Any
    model: str | None = None
34
+
35
+
36
+ def _vector_for(text: str) -> list[float]:
37
+ """Deterministic vector: hash the text, expand to EMBED_DIM, L2-normalise."""
38
+ text = text or ""
39
+ seed = hashlib.sha256(text.encode("utf-8")).digest()
40
+ raw: list[int] = []
41
+ counter = 0
42
+ while len(raw) < EMBED_DIM:
43
+ chunk = hashlib.sha256(seed + counter.to_bytes(4, "big")).digest()
44
+ raw.extend(chunk)
45
+ counter += 1
46
+ floats = [(b - 127.5) / 127.5 for b in raw[:EMBED_DIM]]
47
+ norm = math.sqrt(sum(x * x for x in floats)) or 1.0
48
+ return [x / norm for x in floats]
49
+
50
+
51
+ def _normalise_inputs(inp: Any) -> list[str]:
52
+ if isinstance(inp, str):
53
+ return [inp]
54
+ if isinstance(inp, list):
55
+ return [str(x) for x in inp]
56
+ return [str(inp)]
57
+
58
+
59
+ @app.get("/health")
60
+ def health() -> dict:
61
+ return {"status": "ok", "dim": EMBED_DIM}
62
+
63
+
64
+ @app.post("/v1/embeddings")
65
+ def openai_embeddings(req: EmbedRequest) -> dict:
66
+ texts = _normalise_inputs(req.input)
67
+ return {
68
+ "object": "list",
69
+ "data": [
70
+ {"object": "embedding", "index": i, "embedding": _vector_for(t)}
71
+ for i, t in enumerate(texts)
72
+ ],
73
+ "model": req.model or "embed-stub",
74
+ }
75
+
76
+
77
+ @app.post("/v1/embed")
78
+ def lambda_gateway_embed(req: EmbedRequest) -> dict:
79
+ texts = _normalise_inputs(req.input)
80
+ return {"embeddings": [_vector_for(t) for t in texts]}