@pentatonic-ai/ai-agent-sdk 0.8.3 → 0.8.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "@pentatonic-ai/ai-agent-sdk",
-  "version": "0.8.3",
+  "version": "0.8.4",
   "description": "TES SDK — LLM observability and lifecycle tracking via Pentatonic Thing Event System. Track token usage, tool calls, and conversations. Manage things through event-sourced lifecycle stages with AI enrichment and vector search.",
   "type": "module",
   "main": "./dist/index.cjs",
@@ -453,21 +453,95 @@ async def health():
     if failures:
         out["status"] = "degraded" if failures < 3 else "down"
 
-    # Memory count: query L6 doc-store as authoritative. L6 /stats
-    # returns vector_chunks (Milvus) and fts_chunks (sqlite content
-    # table). Under healthy operation they're equal — take the max so
-    # the count is honest if one side is mid-rebuild.
+    # Per-layer chunk counts. Replaces the previous single `memories` int
+    # which only reflected L6's vector chunk count — misleading because
+    # L0/L4/L5 hold different (overlapping) projections of the corpus.
+    # Each layer is independently probed; transient failure on one layer
+    # leaves its slot null rather than zeroing the whole field.
+    memories: dict[str, int | None] = {
+        "l0_bm25_chunks": None,
+        "l4_vectors": None,
+        "l5_chats_chunks": None,
+        "l6_vector_chunks": None,
+        "l6_fts_chunks": None,
+    }
+    # L0 lives inside L2; L2 exposes /index-internal-stats with the counts.
+    try:
+        r = await _client().get(f"{L2_PROXY_URL}/index-internal-stats", timeout=3.0)
+        if r.status_code == 200:
+            stats = r.json()
+            memories["l0_bm25_chunks"] = int(stats.get("l0_chunks") or 0)
+    except Exception:
+        pass
+    # L4 reports n_vectors on its own /health.
+    try:
+        r = await _client().get(f"{L4_VEC_URL}/health", timeout=3.0)
+        if r.status_code == 200:
+            memories["l4_vectors"] = int(r.json().get("n_vectors") or 0)
+    except Exception:
+        pass
+    # L5 reports per-collection counts on /health. We surface chats —
+    # the only collection currently populated; the emails/contacts/memory
+    # collections come online with the L5 collection bootstrap.
+    try:
+        r = await _client().get(f"{L5_MILVUS_URL}/health", timeout=3.0)
+        if r.status_code == 200:
+            colls = r.json().get("collections") or {}
+            chats = colls.get("chats") if isinstance(colls, dict) else None
+            if isinstance(chats, dict):
+                memories["l5_chats_chunks"] = int(chats.get("count") or 0)
+    except Exception:
+        pass
+    # L6 exposes vector vs fts splits on /stats.
     try:
         r = await _client().get(f"{L6_DOC_URL}/stats", timeout=3.0)
         if r.status_code == 200:
             stats = r.json()
-            out["memories"] = max(
-                int(stats.get("vector_chunks") or 0),
-                int(stats.get("fts_chunks") or 0),
-                int(stats.get("total_chunks") or 0),
-            )
+            memories["l6_vector_chunks"] = int(stats.get("vector_chunks") or 0)
+            memories["l6_fts_chunks"] = int(stats.get("fts_chunks") or 0)
     except Exception:
-        out["memories"] = None
+        pass
+    out["memories"] = memories
+    return out
+
+
+@app.get("/health/deep")
+async def health_deep():
+    """Aggregate functional probe — fans out to each layer's /health/deep
+    and reports per-layer ok/status.
+
+    Where /health checks "process up + port answering", /health/deep
+    actually exercises embed → write → search round-trips on each layer
+    that supports it. Catches the class of bug that masquerades as
+    "healthy" — request handlers 500'ing while the process stays up.
+
+    Slower than /health (~1–2s); intended for ops/monitoring/cron use,
+    not the deploy gate or compose healthcheck.
+    """
+    import asyncio
+    out: dict[str, Any] = {"status": "ok", "ok": True, "layers": {}}
+
+    async def _probe_deep(name: str, url: str) -> tuple[str, dict]:
+        try:
+            r = await _client().get(url, timeout=15.0)
+            if r.status_code != 200:
+                return name, {"ok": False, "status": f"http {r.status_code}"}
+            return name, r.json()
+        except Exception as exc:
+            return name, {"ok": False, "status": f"unreachable: {type(exc).__name__}"}
+
+    results = await asyncio.gather(
+        _probe_deep("l4", f"{L4_VEC_URL}/health/deep"),
+        _probe_deep("l5", f"{L5_MILVUS_URL}/health/deep"),
+        _probe_deep("l6", f"{L6_DOC_URL}/health/deep"),
+    )
+    for name, body in results:
+        out["layers"][name] = body
+        if not body.get("ok", False):
+            out["ok"] = False
+
+    if not out["ok"]:
+        out["status"] = "degraded"
     return out
 
 
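The reworked /health payload changes the contract for monitors: `memories` is now a dict where null means "probe failed", not "zero rows", and /health/deep reports a per-layer verdict. A minimal consumer, as a sketch only (the base URL and the print-based alerting are assumptions for illustration, not part of this package):

    import asyncio
    import httpx

    BASE_URL = "http://127.0.0.1:8031"  # assumed gateway address

    async def check() -> None:
        async with httpx.AsyncClient(timeout=20.0) as client:
            shallow = (await client.get(f"{BASE_URL}/health")).json()
            # None = probe failed (layer mid-rebuild or down); 0 = layer answered: empty.
            for layer, count in (shallow.get("memories") or {}).items():
                if count is None:
                    print(f"warn: {layer} count unavailable")

            deep = (await client.get(f"{BASE_URL}/health/deep")).json()
            for name, body in deep.get("layers", {}).items():
                if not body.get("ok", False):
                    print(f"alert: {name} failed deep probe: {body.get('status')}")

    asyncio.run(check())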
@@ -12,6 +12,7 @@ Port: 8031 (replaces neo4j-qmd-proxy.py)
 """
 
 import argparse
+import asyncio
 import hashlib
 import json
 import logging
@@ -19,14 +20,16 @@ import os
 import sqlite3
 import sys
 import time
+from contextlib import asynccontextmanager
 from datetime import datetime
 from pathlib import Path
-from typing import Any, Dict, List, Optional, Set
+from typing import Any, AsyncIterator, Dict, List, Optional, Set
 
 import re
+import httpx
 import requests
 from fastapi import FastAPI, HTTPException, Request
-from neo4j import GraphDatabase
+from neo4j import AsyncGraphDatabase
 from neo4j.time import DateTime as Neo4jDateTime, Date as Neo4jDate
 from pydantic import BaseModel
 import uvicorn
@@ -129,7 +132,67 @@ TRACKER_FILE = WORKSPACE / "memory" / "memory-tracker.jsonl"
 logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
 log = logging.getLogger("sequential-hybridrag")
 
-app = FastAPI(title="Sequential HybridRAG Proxy", version="1.0.0")
+
+# ---------------------------------------------------------------------------
+# Async driver + HTTP client singletons
+# ---------------------------------------------------------------------------
+#
+# v0.8.4: migrated from sync neo4j (`GraphDatabase`) to async
+# (`AsyncGraphDatabase`) and from per-call drivers to one process-wide
+# pool. Previously every handler created a fresh sync driver inline,
+# which (a) blocked the event loop on connection establishment and
+# every query, and (b) held one thread per in-flight request. Under
+# sustained ingest the threadpool exhausted and /health itself timed
+# out (the `l0/l1/l2: unreachable: ReadTimeout` we saw in prod).
+#
+# The async driver multiplexes many queries through bolt without
+# blocking the event loop; one shared pool means connection
+# establishment is amortised. fastapi lifespan handles open/close.
+
+_neo4j_driver: "AsyncGraphDatabase | None" = None
+_http_client: "httpx.AsyncClient | None" = None
+
+
+def get_neo4j_driver():
+    """Lazy module-level singleton — created at first call (which the
+    lifespan handler does at startup). Sharing across requests is the
+    documented neo4j Python-driver pattern; the driver itself is
+    thread- and task-safe."""
+    global _neo4j_driver
+    if _neo4j_driver is None:
+        _neo4j_driver = AsyncGraphDatabase.driver(NEO4J_URI, auth=NEO4J_AUTH)
+    return _neo4j_driver
+
+
+def get_http_client() -> httpx.AsyncClient:
+    """Shared async HTTP client for L4/L5/L6 fan-out and embedding-proxy
+    pass-through. Reuses TCP connections via httpx's built-in pool."""
+    global _http_client
+    if _http_client is None:
+        _http_client = httpx.AsyncClient(timeout=30.0)
+    return _http_client
+
+
+@asynccontextmanager
+async def lifespan(_app: FastAPI) -> AsyncIterator[None]:
+    """Open the neo4j driver + HTTP client at process startup, close on
+    shutdown. Without this, the first request pays driver-open latency
+    and the driver is never properly closed on SIGTERM (leaking conns)."""
+    global _neo4j_driver, _http_client
+    _neo4j_driver = AsyncGraphDatabase.driver(NEO4J_URI, auth=NEO4J_AUTH)
+    _http_client = httpx.AsyncClient(timeout=30.0)
+    try:
+        yield
+    finally:
+        if _neo4j_driver is not None:
+            await _neo4j_driver.close()
+            _neo4j_driver = None
+        if _http_client is not None:
+            await _http_client.aclose()
+            _http_client = None
+
+
+app = FastAPI(title="Sequential HybridRAG Proxy", version="1.0.0", lifespan=lifespan)
 
 # ---------------------------------------------------------------------------
 # Memory Usage Tracking
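The handler-side pattern these singletons enable is worth spelling out: sessions are opened per request against the shared pool, and nothing is closed in the handler. A hypothetical endpoint as a sketch (the route and query are illustrative, not package code):

    @app.get("/example-count")
    async def example_count() -> dict:
        # Session per request; the driver and its pool are shared and
        # never closed here (lifespan owns teardown).
        async with get_neo4j_driver().session() as session:
            result = await session.run("MATCH (n) RETURN count(n) AS n")
            rec = await result.single()
        # Shared httpx client: reuses pooled TCP connections.
        r = await get_http_client().get(f"{L5_API_URL}/health")
        return {"nodes": rec["n"], "l5_up": r.status_code == 200}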
@@ -267,7 +330,7 @@ def extract_query_entities(query: str) -> List[str]:
     log.info(f"Extracted entities: {potential_entities}")
     return potential_entities
 
-def _hebbian_strengthen(session, arenas: List[str], node_names: List[str], increment: float = 0.05) -> None:
+async def _hebbian_strengthen(session, arenas: List[str], node_names: List[str], increment: float = 0.05) -> None:
     """Hebbian: strengthen edges between co-accessed nodes during query.
 
     Scoped by arena so a search inside tenant A can't reinforce edges
@@ -281,7 +344,7 @@ def _hebbian_strengthen(session, arenas: List[str], node_names: List[str], incre
     for i, n1 in enumerate(node_names):
         for n2 in node_names[i+1:]:
            try:
-                session.run(
+                await session.run(
                     """MATCH (a {name: $n1})-[r]-(b {name: $n2})
                     WHERE a.arena IN $arenas AND b.arena IN $arenas
                     SET r.weight = coalesce(r.weight, 1.0) + $inc,
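Numerically the rule is tiny: `coalesce(r.weight, 1.0) + $inc` lifts an untouched edge from the 1.0 baseline to 1.05 on first co-access, and each further co-access adds the same increment. A toy restatement in plain Python, for illustration only:

    def hebbian_weight(co_accesses: int, increment: float = 0.05) -> float:
        """Weight of an edge after `co_accesses` co-accesses, starting unset."""
        weight = None
        for _ in range(co_accesses):
            weight = (weight if weight is not None else 1.0) + increment
        return 1.0 if weight is None else weight

    assert abs(hebbian_weight(1) - 1.05) < 1e-9
    assert abs(hebbian_weight(10) - 1.50) < 1e-9  # frequently co-accessed pairs pull ahead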
@@ -292,7 +355,7 @@ def _hebbian_strengthen(session, arenas: List[str], node_names: List[str], incre
                pass  # non-critical
 
 
-def search_neo4j_sequential(query: str, entities: List[str], arenas: List[str], limit: int = 12) -> Dict:
+async def search_neo4j_sequential(query: str, entities: List[str], arenas: List[str], limit: int = 12) -> Dict:
     """Phase 1: Neo4j graph search with spreading activation + Hebbian.
 
     `arenas` is the tenant-scope set the caller is authorised for —
@@ -306,11 +369,11 @@ def search_neo4j_sequential(query: str, entities: List[str], arenas: List[str],
         log.warning("search_neo4j_sequential called without arenas — returning empty results")
         return {"results": [], "graph_entities": [], "entity_count": 0}
     try:
-        driver = GraphDatabase.driver(NEO4J_URI, auth=NEO4J_AUTH)
+        driver = get_neo4j_driver()
         results = []
         graph_entities = set()
 
-        with driver.session() as session:
+        async with driver.session() as session:
             # Search for specific entities — use weighted spreading activation
             for entity in entities:
                 # Direct match first — arena-scoped on every node we touch.
@@ -326,9 +389,9 @@ def search_neo4j_sequential(query: str, entities: List[str], arenas: List[str],
                     LIMIT $limit
                 """
 
-                records = session.run(cypher, entity=entity, arenas=arenas, limit=8)
+                result = await session.run(cypher, entity=entity, arenas=arenas, limit=8)
 
-                for record in records:
+                async for record in result:
                     node = _serialize_neo4j_value(dict(record["n"]))
                     rel = record["r"]
                     connected = record["connected"]
@@ -366,7 +429,7 @@ def search_neo4j_sequential(query: str, entities: List[str], arenas: List[str],
                 # the filter, an activation could walk into another
                 # tenant's graph via a name-collision on the start node.
                 if entity:
-                    activation_results = session.run("""
+                    activation_results = await session.run("""
                         MATCH (start)-[r1]-(mid)-[r2]-(end)
                         WHERE start.name CONTAINS $entity
                         AND start.arena IN $arenas
@@ -382,7 +445,7 @@ def search_neo4j_sequential(query: str, entities: List[str], arenas: List[str],
                         LIMIT 5
                     """, entity=entity, arenas=arenas)
 
-                    for rec in activation_results:
+                    async for rec in activation_results:
                         end_node = _serialize_neo4j_value(dict(rec["end"])) if rec["end"] else {}
                         name = end_node.get("name", "")
                         if name and name not in graph_entities:
@@ -413,9 +476,9 @@ def search_neo4j_sequential(query: str, entities: List[str], arenas: List[str],
                     LIMIT $limit
                 """
 
-                records = session.run(cypher, term=word, arenas=arenas, limit=4)
+                result = await session.run(cypher, term=word, arenas=arenas, limit=4)
 
-                for record in records:
+                async for record in result:
                     node = _serialize_neo4j_value(dict(record["n"]))
                     context = f"Related: {node}"
                     graph_entities.add(node.get('name', ''))
@@ -430,9 +493,9 @@ def search_neo4j_sequential(query: str, entities: List[str], arenas: List[str],
                 })
 
             # Hebbian: strengthen edges between all accessed entities
-            _hebbian_strengthen(session, arenas, list(graph_entities))
+            await _hebbian_strengthen(session, arenas, list(graph_entities))
 
-        driver.close()
+        # Note: driver is a module-level singleton, do NOT close here.
 
         return {
             "results": results[:limit],
@@ -861,8 +924,8 @@ def search_l0_bm25(query: str, limit: int = 6, arena: str = None,
 
 L5_API_URL = os.environ.get("PME_L5_URL", "http://127.0.0.1:8034")
 
-def search_l5_communications(query: str, limit: int = 6, arena: str = None,
-                             arenas: List[str] = None) -> List[Dict]:
+async def search_l5_communications(query: str, limit: int = 6, arena: str = None,
+                                   arenas: List[str] = None) -> List[Dict]:
     """Search L5 Communications Context via L5 API (emails, chats, calendar).
 
     arena / arenas (optional): forwarded to L5; filters Milvus by the
@@ -876,7 +939,7 @@ def search_l5_communications(query: str, limit: int = 6, arena: str = None,
     params: list = [("q", query), ("limit", str(limit))]
     for a in arena_list:
         params.append(("arenas", a))
-    resp = requests.get(
+    resp = await get_http_client().get(
         f"{L5_API_URL}/search",
         params=params,
         timeout=10,
@@ -922,8 +985,8 @@ def search_l5_communications(query: str, limit: int = 6, arena: str = None,
 # L6: Document Store Search
 L6_URL = os.environ.get("PME_L6_URL", "http://localhost:8037")
 
-def search_l6_documents(query: str, limit: int = 6, arena: str = None,
-                        arenas: List[str] = None) -> List[Dict]:
+async def search_l6_documents(query: str, limit: int = 6, arena: str = None,
+                              arenas: List[str] = None) -> List[Dict]:
     """Search L6 Document Store (research, legal, financial, project docs).
 
     arena / arenas (optional): forwarded to L6 — L6 supports multi-arena
@@ -939,7 +1002,7 @@ def search_l6_documents(query: str, limit: int = 6, arena: str = None,
     ]
     for a in arena_list:
         params.append(("arenas", a))
-    resp = requests.get(
+    resp = await get_http_client().get(
         f"{L6_URL}/search",
         params=params,
         timeout=10,
@@ -986,50 +1049,52 @@ def search_l6_documents(query: str, limit: int = 6, arena: str = None,
     return []
 
 
-def sequential_hybridrag_search(query: str, limit: int = 16,
-                                arena: str = None,
-                                arenas: List[str] = None) -> List[Dict]:
+async def sequential_hybridrag_search(query: str, limit: int = 16,
+                                      arena: str = None,
+                                      arenas: List[str] = None) -> List[Dict]:
     """Main HybridRAG processing: L0 BM25 → L1 System Files → L2 HybridRAG (L3 Graph + L4 Vector + L5 Comms + L6 Docs).
 
     arena / arenas (optional): tenant + user scope. Multi-arena lets a
     user's search span tenant-wide rows + their own user-scoped rows in
     a single hybrid pass. Forwarded to L0, L5, L6 native filters; L4
     and L3 still rely on the compat shim post-filter.
+
+    Async since v0.8.4: independent layer fan-out runs concurrently via
+    asyncio.gather; sync workers (sqlite, file I/O, PyTorch reranker)
+    are dispatched to threads via asyncio.to_thread to keep the event
+    loop responsive under sustained ingest.
     """
     arena_list = list(arenas) if arenas else ([arena] if arena else [])
     start_time = time.time()
     log.info(f"Starting sequential HybridRAG search for: '{query}' arenas={arena_list!r}")
 
-    # L0: BM25 workspace memory (keyword search complements semantic layers)
-    l0_results = search_l0_bm25(query, limit=6, arenas=arena_list)
+    # L0 (sqlite) + L1 (file I/O) + entity extraction run in parallel.
+    # Each is sync — offloaded to the threadpool so the event loop stays
+    # free for the in-flight L3/L5/L6 calls.
+    l0_results, system_results, entities = await asyncio.gather(
+        asyncio.to_thread(search_l0_bm25, query, 6, None, arena_list),
+        asyncio.to_thread(search_core_memory_files, query, 4),
+        asyncio.to_thread(extract_query_entities, query),
+    )
     log.info(f"L0 BM25 workspace: {len(l0_results)} results")
-
-    # L1: System Files (HIGHEST PRIORITY)
-    system_results = search_core_memory_files(query, limit=4)
     log.info(f"L1 System files: {len(system_results)} results")
 
-    # L2: HybridRAG orchestration
-    # L3: Graph search (entity extraction + Neo4j) — arena-scoped so a
-    # tenant's search can never traverse another tenant's entity graph
-    # via name collisions on shared :Entity nodes. The post-filter shim
-    # protects chunks; this protects the entity-walking layer too.
-    entities = extract_query_entities(query)
-    graph_context = search_neo4j_sequential(query, entities, arena_list, limit=8)
+    # L3: Graph search (now native async via AsyncGraphDatabase).
+    graph_context = await search_neo4j_sequential(query, entities, arena_list, limit=8)
     log.info(f"L3 Graph search: {len(graph_context['results'])} results, {graph_context['entity_count']} entities")
 
-    # HyDE: expand query for better vector embeddings
-    hyde_query = hyde_expand(query)
+    # HyDE expansion is sync (LLM call), offload to thread.
+    hyde_query = await asyncio.to_thread(hyde_expand, query)
 
-    # L4: Vector search (informed by L3 graph context + HyDE)
-    vector_results = search_qmd_informed(hyde_query, graph_context, limit=8)
+    # L4/L5/L6 fan out concurrently: L4 (sqlite) via to_thread, L5/L6
+    # native async via httpx.AsyncClient.
+    vector_results, l5_results, l6_results = await asyncio.gather(
+        asyncio.to_thread(search_qmd_informed, hyde_query, graph_context, 8),
+        search_l5_communications(hyde_query, limit=6, arenas=arena_list),
+        search_l6_documents(hyde_query, limit=6, arenas=arena_list),
    )
     log.info(f"L4 Vector search: {len(vector_results)} results (HyDE={'on' if hyde_query != query else 'off'})")
-
-    # L5: Communications Context (emails, chats, calendar) — also use HyDE
-    l5_results = search_l5_communications(hyde_query, limit=6, arenas=arena_list)
     log.info(f"L5 Communications: {len(l5_results)} results")
-
-    # L6: Document Store (research, legal, financial, project docs)
-    l6_results = search_l6_documents(hyde_query, limit=6, arenas=arena_list)
     log.info(f"L6 Documents: {len(l6_results)} results")
 
     # L2: HybridRAG fusion (combines all layers with L1 priority)
1046
1111
  # Sort by layer priority: L1 System (1.0) > L3 Graph (0.9) > L4 Vector (0.7+)
1047
1112
  deduplicated.sort(key=lambda x: x["score"], reverse=True)
1048
1113
 
1049
- # Cross-encoder reranking: re-embed top results and blend scores
1050
- deduplicated = cross_encoder_rerank(query, deduplicated, top_k=limit)
1114
+ # Cross-encoder reranking: re-embed top results and blend scores.
1115
+ # PyTorch CrossEncoder.predict is sync — offload to thread.
1116
+ deduplicated = await asyncio.to_thread(cross_encoder_rerank, query, deduplicated, limit)
1051
1117
 
1052
1118
  # Track layer usage for evolution
1053
1119
  search_time_ms = (time.time() - start_time) * 1000
@@ -1094,7 +1160,7 @@ async def search_endpoint(request: Request) -> dict:
     if not query:
         raise HTTPException(status_code=400, detail="query is required")
 
-    results = sequential_hybridrag_search(query, limit=limit, arena=arena, arenas=arenas)
+    results = await sequential_hybridrag_search(query, limit=limit, arena=arena, arenas=arenas)
 
     # Also return raw graph entities for context enrichment.
     # Same arena scope as the cascade search above — without it
@@ -1172,7 +1238,7 @@ async def chat_completions(request: ChatCompletionRequest) -> dict:
     # empty when no arenas are supplied; callers that need L3 must
     # pass `arena` or `arenas` on the request body.
     start_time = time.time()
-    results = sequential_hybridrag_search(
+    results = await sequential_hybridrag_search(
         query, limit=16, arena=request.arena, arenas=request.arenas,
     )
     search_time = time.time() - start_time
@@ -1244,33 +1310,35 @@ async def check_contradictions(node_name: str, arena: Optional[str] = None) -> d
            detail="arena query parameter is required to scope contradiction lookup",
         )
     try:
-        driver = GraphDatabase.driver(NEO4J_URI, auth=NEO4J_AUTH)
+        driver = get_neo4j_driver()
         contradictions = []
-        with driver.session() as session:
+        async with driver.session() as session:
             # Find the node — must be in the caller's arena.
-            node = session.run(
+            result = await session.run(
                 """MATCH (n) WHERE toLower(n.name) = toLower($name) AND n.arena = $arena
                 RETURN elementId(n) AS id""",
                 name=node_name, arena=arena,
-            ).single()
+            )
+            node = await result.single()
             if not node:
                 return {"node": node_name, "contradictions": [], "error": "Node not found"}
             nid = node["id"]
 
             # Explicit CONTRADICTS — both endpoints must be in the same arena.
-            for rec in session.run(
+            result = await session.run(
                 """MATCH (a)-[r:CONTRADICTS]-(b)
                 WHERE elementId(a) = $nid AND b.arena = $arena
                 RETURN a.name AS a, b.name AS b, r.reason AS reason""",
                 nid=nid, arena=arena,
-            ):
+            )
+            async for rec in result:
                 contradictions.append({"type": "explicit", "a": rec["a"], "b": rec["b"], "reason": rec["reason"]})
 
             # Property conflicts via shared neighbour — every node along
             # the (a)--(shared)--(b) path filtered by arena so a shared
             # neighbour from another tenant can't trigger a false-positive
             # conflict in this tenant's view.
-            for rec in session.run(
+            result = await session.run(
                 """MATCH (a)--(shared)--(b)
                 WHERE elementId(a) = $nid AND a <> b
                 AND shared.arena = $arena AND b.arena = $arena
@@ -1281,28 +1349,28 @@ async def check_contradictions(node_name: str, arena: Optional[str] = None) -> d
                 WHERE size(ck) > 0
                 RETURN a.name AS a, b.name AS b, shared.name AS via, ck
                 LIMIT 10""", nid=nid, arena=arena,
-            ):
+            )
+            async for rec in result:
                 contradictions.append({
                     "type": "property_conflict", "a": rec["a"], "b": rec["b"],
                     "via": rec["via"], "conflicting_keys": rec["ck"]
                 })
-        driver.close()
         return {"node": node_name, "contradictions": contradictions, "count": len(contradictions)}
     except Exception as e:
         raise HTTPException(status_code=500, detail=str(e))
 
-def _check_l5_health() -> bool:
+async def _check_l5_health() -> bool:
     """Quick check if L5 Communications API is responding."""
     try:
-        resp = requests.get(f"{L5_API_URL}/health", timeout=3)
+        resp = await get_http_client().get(f"{L5_API_URL}/health", timeout=3)
         return resp.status_code == 200
     except Exception:
         return False
 
-def _check_l6_health() -> bool:
+async def _check_l6_health() -> bool:
     """Quick check if L6 Document Store is responding."""
     try:
-        resp = requests.get(f"{L6_URL}/health", timeout=3)
+        resp = await get_http_client().get(f"{L6_URL}/health", timeout=3)
         return resp.status_code == 200 and resp.json().get("status") in ("ok", "degraded")
     except Exception:
         return False
@@ -1323,16 +1391,15 @@ async def health() -> dict:
 
     neo4j_healthy = False
     try:
-        driver = GraphDatabase.driver(NEO4J_URI, auth=NEO4J_AUTH)
-        with driver.session() as session:
-            session.run("RETURN 1")
+        driver = get_neo4j_driver()
+        async with driver.session() as session:
+            await session.run("RETURN 1")
         neo4j_healthy = True
-        driver.close()
     except Exception as e:
         logging.debug(f"Suppressed: {e}")
 
-    l5_reachable = _check_l5_health()
-    l6_reachable = _check_l6_health()
+    l5_reachable = await _check_l5_health()
+    l6_reachable = await _check_l6_health()
 
     # Top-level status: degrade only on layers L2 is the sole gatekeeper for.
     # L5/L6 are independent services probed by the compat shim.
@@ -1411,15 +1478,19 @@ def _extract_entities_for_kg(text: str, max_entities: int = 32) -> List[str]:
     return found[:max_entities]
 
 
-def _embed_batch_local(texts: List[str]) -> List[List[float]]:
-    """Batch embed via the shared EmbedClient. Returns vectors in input order."""
+async def _embed_batch_local(texts: List[str]) -> List[List[float]]:
+    """Batch embed via the shared EmbedClient (async). Returns vectors in input order."""
     if not texts:
         return []
     try:
-        return _embed_client().embed_batch(texts)
+        return await _embed_client().embed_batch_async(texts)
     except Exception as e:
         log.warning(f"NV-Embed batch failed: {e}; trying singletons")
-        return [get_embedding(t) for t in texts]
+        # Singleton fallback stays sync (each one-shot embed is small);
+        # offload to thread so we don't block the loop.
+        return await asyncio.gather(
+            *(asyncio.to_thread(get_embedding, t) for t in texts)
+        )
 
 
 class IndexInternalBatchRequest(BaseModel):
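The docstring's "vectors in input order" promise survives the fallback because asyncio.gather returns results in argument order, regardless of completion order. A quick demonstration of that property (sketch):

    import asyncio, random, time

    def slow_embed(text: str) -> str:
        time.sleep(random.uniform(0, 0.05))  # completion order is random...
        return f"vec({text})"

    async def main() -> None:
        texts = ["a", "b", "c", "d"]
        vecs = await asyncio.gather(*(asyncio.to_thread(slow_embed, t) for t in texts))
        assert vecs == [f"vec({t})" for t in texts]  # ...result order is not

    asyncio.run(main())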
@@ -1504,7 +1575,7 @@ async def index_internal_batch(req: IndexInternalBatchRequest) -> dict:
     # ---- L4 QMD vec (qmd.sqlite) ----------------------------------------
     l4_inserted = 0
     try:
-        embeddings = _embed_batch_local([n["content"] for n in norm])
+        embeddings = await _embed_batch_local([n["content"] for n in norm])
         if len(embeddings) != len(norm):
             log.warning(f"L4 embed count mismatch: {len(embeddings)} != {len(norm)}")
         qmd_db = Path(QMD_DB_PATH)
@@ -1560,30 +1631,30 @@ async def index_internal_batch(req: IndexInternalBatchRequest) -> dict:
     l3_entities = 0
     l3_chunks = 0
     try:
-        driver = GraphDatabase.driver(NEO4J_URI, auth=NEO4J_AUTH)
-        with driver.session() as session:
+        driver = get_neo4j_driver()
+        async with driver.session() as session:
             # Indexes — idempotent. The compound (arena, name) is the
             # right shape now that entities are arena-scoped; the legacy
             # entity_name index stays for the wipe-migration to work
             # against pre-arena rows, then can be dropped in a follow-up.
             try:
-                session.run("CREATE INDEX entity_arena_name IF NOT EXISTS FOR (n:Entity) ON (n.arena, n.name)")
-                session.run("CREATE INDEX person_arena_email IF NOT EXISTS FOR (n:Person) ON (n.arena, n.email)")
-                session.run("CREATE INDEX chunk_arena IF NOT EXISTS FOR (c:Chunk) ON (c.arena)")
-                session.run("CREATE INDEX chunk_id IF NOT EXISTS FOR (c:Chunk) ON (c.id)")
+                await session.run("CREATE INDEX entity_arena_name IF NOT EXISTS FOR (n:Entity) ON (n.arena, n.name)")
+                await session.run("CREATE INDEX person_arena_email IF NOT EXISTS FOR (n:Person) ON (n.arena, n.email)")
+                await session.run("CREATE INDEX chunk_arena IF NOT EXISTS FOR (c:Chunk) ON (c.arena)")
+                await session.run("CREATE INDEX chunk_id IF NOT EXISTS FOR (c:Chunk) ON (c.id)")
                 # ChannelStat is the denormalised aggregate read by
                 # /aggregate on the fast path. Compound index covers
                 # the (arena, person_email) lookup that the reader
                 # uses; the per-channel rows are returned in one
                 # range scan.
-                session.run("CREATE INDEX channelstat_arena_email IF NOT EXISTS FOR (s:ChannelStat) ON (s.arena, s.person_email)")
+                await session.run("CREATE INDEX channelstat_arena_email IF NOT EXISTS FOR (s:ChannelStat) ON (s.arena, s.person_email)")
                 # UNIQUE constraint on the writer's MERGE key. Without
                 # this, two concurrent index-internal-batch transactions
                 # can both decide a ChannelStat doesn't exist and create
                 # rival nodes — the index doesn't lock, the constraint
                 # does. The constraint also implies an index on the
                 # full key so the MERGE locks efficiently.
-                session.run("CREATE CONSTRAINT channelstat_unique IF NOT EXISTS FOR (s:ChannelStat) REQUIRE (s.arena, s.person_email, s.channel) IS UNIQUE")
+                await session.run("CREATE CONSTRAINT channelstat_unique IF NOT EXISTS FOR (s:ChannelStat) REQUIRE (s.arena, s.person_email, s.channel) IS UNIQUE")
             except Exception:
                 pass
             for n in norm:
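The constraint comment deserves a concrete restatement: MERGE only locks nodes it can find, so two writers that both miss the key can both create, and only the UNIQUE constraint forces them to serialise. A reproduction sketch, assuming a scratch Neo4j instance (URI/credentials are placeholders; a race loser may see one transient conflict and retry):

    import asyncio
    from neo4j import AsyncGraphDatabase

    MERGE_STAT = (
        "MERGE (s:ChannelStat {arena: $arena, person_email: $email, channel: $ch}) "
        "ON CREATE SET s.count = 1 "
        "ON MATCH SET s.count = s.count + 1"
    )

    async def main() -> None:
        driver = AsyncGraphDatabase.driver("bolt://localhost:7687", auth=("neo4j", "test"))
        try:
            async with driver.session() as session:
                await session.run(
                    "CREATE CONSTRAINT channelstat_unique IF NOT EXISTS "
                    "FOR (s:ChannelStat) "
                    "REQUIRE (s.arena, s.person_email, s.channel) IS UNIQUE")

            async def writer() -> None:
                for attempt in (1, 2):  # one retry for the race loser
                    try:
                        async with driver.session() as session:
                            await session.run(MERGE_STAT, arena="t1", email="a@x", ch="email")
                        return
                    except Exception:
                        if attempt == 2:
                            raise

            await asyncio.gather(*(writer() for _ in range(8)))
            async with driver.session() as session:
                res = await session.run(
                    "MATCH (s:ChannelStat {arena: 't1', person_email: 'a@x', channel: 'email'}) "
                    "RETURN count(s) AS n")
                rec = await res.single()
                assert rec["n"] == 1  # one node, not eight rivals
        finally:
            await driver.close()

    asyncio.run(main())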
@@ -1602,7 +1673,7 @@ async def index_internal_batch(req: IndexInternalBatchRequest) -> dict:
                 # tenant-isolation anchor. Every read traverses through
                 # this node, so getting the arena right here is the
                 # single most important invariant of this whole block.
-                session.run(
+                await session.run(
                     """
                     MERGE (c:Chunk {id: $cid})
                     SET c.text = $text,
1618
1689
 
1619
1690
  # Concept entities — heuristic, arena-scoped.
1620
1691
  for ent in heuristic_entities:
1621
- session.run(
1692
+ await session.run(
1622
1693
  """
1623
1694
  MERGE (e:Entity:Concept {arena: $arena, name: $name})
1624
1695
  ON CREATE SET e.type = 'Concept',
@@ -1639,7 +1710,7 @@ async def index_internal_batch(req: IndexInternalBatchRequest) -> dict:
1639
1710
  if len(heuristic_entities) >= 2:
1640
1711
  for i in range(len(heuristic_entities)):
1641
1712
  for j in range(i + 1, len(heuristic_entities)):
1642
- session.run(
1713
+ await session.run(
1643
1714
  """
1644
1715
  MATCH (a:Entity:Concept {arena: $arena, name: $a})
1645
1716
  MATCH (b:Entity:Concept {arena: $arena, name: $b})
@@ -1666,7 +1737,7 @@ async def index_internal_batch(req: IndexInternalBatchRequest) -> dict:
1666
1737
  # flipped to true after the stat update, so replays
1667
1738
  # of the same eventId never double-count even when
1668
1739
  # the chunk already exists.
1669
- session.run(
1740
+ await session.run(
1670
1741
  """
1671
1742
  MERGE (p:Entity:Person {arena: $arena, email: $email})
1672
1743
  ON CREATE SET p.created_at = $now,
@@ -1725,7 +1796,7 @@ async def index_internal_batch(req: IndexInternalBatchRequest) -> dict:
1725
1796
  l3_entities += 1
1726
1797
  if isinstance(contact_name, str) and contact_name.strip():
1727
1798
  cname = contact_name.strip()
1728
- session.run(
1799
+ await session.run(
1729
1800
  """
1730
1801
  MERGE (p:Entity:Person {arena: $arena, name: $name})
1731
1802
  ON CREATE SET p.created_at = $now,
@@ -1751,7 +1822,7 @@ async def index_internal_batch(req: IndexInternalBatchRequest) -> dict:
1751
1822
  # Link name→email node so the relationships query
1752
1823
  # can resolve either alias to the same person.
1753
1824
  if person_email_node:
1754
- session.run(
1825
+ await session.run(
1755
1826
  """
1756
1827
  MATCH (n:Person {arena: $arena, name: $name})
1757
1828
  MATCH (e:Person {arena: $arena, email: $email})
@@ -1759,7 +1830,6 @@ async def index_internal_batch(req: IndexInternalBatchRequest) -> dict:
1759
1830
  """,
1760
1831
  arena=arena, name=cname, email=person_email_node,
1761
1832
  )
1762
- driver.close()
1763
1833
  except Exception as e:
1764
1834
  log.error(f"L3 KG write failed: {e}")
1765
1835
 
@@ -1841,25 +1911,28 @@ async def forget_internal(request: Request) -> dict:
     # Neo4j chunks AND entities both carry arena now, so tenant-scoped
     # delete works correctly here even if L0/L4 still need a migration.
     try:
-        driver = GraphDatabase.driver(NEO4J_URI, auth=NEO4J_AUTH)
-        with driver.session() as session:
+        driver = get_neo4j_driver()
+        async with driver.session() as session:
             if arena:
-                r1 = session.run(
+                r1 = await session.run(
                     "MATCH (c:Chunk {arena: $arena}) DETACH DELETE c RETURN count(c) AS n",
                     arena=arena,
                 )
-                deleted["l3_chunks"] = r1.single()["n"]
-                r2 = session.run(
+                rec = await r1.single()
+                deleted["l3_chunks"] = rec["n"]
+                r2 = await session.run(
                     "MATCH (e:Entity {arena: $arena}) DETACH DELETE e RETURN count(e) AS n",
                     arena=arena,
                 )
-                deleted["l3_entities"] = r2.single()["n"]
+                rec = await r2.single()
+                deleted["l3_entities"] = rec["n"]
             else: # confirm == "GLOBAL_WIPE", validated above
-                r1 = session.run("MATCH (c:Chunk) DETACH DELETE c RETURN count(c) AS n")
-                deleted["l3_chunks"] = r1.single()["n"]
-                r2 = session.run("MATCH (e:Entity) DETACH DELETE e RETURN count(e) AS n")
-                deleted["l3_entities"] = r2.single()["n"]
-        driver.close()
+                r1 = await session.run("MATCH (c:Chunk) DETACH DELETE c RETURN count(c) AS n")
+                rec = await r1.single()
+                deleted["l3_chunks"] = rec["n"]
+                r2 = await session.run("MATCH (e:Entity) DETACH DELETE e RETURN count(e) AS n")
+                rec = await r2.single()
+                deleted["l3_entities"] = rec["n"]
     except Exception as e:
         log.error(f"L3 forget failed: {e}")
     return {"status": "ok", "deleted": deleted, "arena": arena, "global_wipe": confirm == "GLOBAL_WIPE"}
@@ -1948,13 +2021,10 @@ async def aggregate_internal(req: AggregateInternalRequest) -> AggregateInternal
             seen.add(k)
             safe_group_by.append(k)
 
-    try:
-        driver = GraphDatabase.driver(NEO4J_URI, auth=NEO4J_AUTH)
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=f"neo4j connect: {e}")
+    driver = get_neo4j_driver()
 
     try:
-        with driver.session() as session:
+        async with driver.session() as session:
             # Fast path: read from the ChannelStat denormalisation
             # whenever the caller has an email and is grouping by
             # channel. ChannelStats are written by /index-internal-batch
@@ -1973,7 +2043,7 @@ async def aggregate_internal(req: AggregateInternalRequest) -> AggregateInternal
                 not safe_group_by or safe_group_by == ["channel"]
             )
             if fast_path_eligible:
-                stats_rows = list(session.run(
+                _res = await session.run(
                     "MATCH (s:ChannelStat {arena: $arena, person_email: $email})\n"
                     "RETURN s.channel AS channel,\n"
                     "       s.count AS count,\n"
@@ -1983,7 +2053,8 @@ async def aggregate_internal(req: AggregateInternalRequest) -> AggregateInternal
                     "       s.first_seen AS first_seen\n"
                     "ORDER BY s.count DESC\n",
                     arena=arena, email=contact_email,
-                ))
+                )
+                stats_rows = [rec async for rec in _res]
                 if stats_rows:
                     # Build buckets directly. When group_by=[] we
                     # collapse to a single overall bucket; otherwise
@@ -2098,7 +2169,8 @@ async def aggregate_internal(req: AggregateInternalRequest) -> AggregateInternal
             buckets: List[AggregateBucket] = []
             total = 0
             latest: Optional[str] = None
-            for rec in session.run(cypher, **params):
+            _res = await session.run(cypher, **params)
+            async for rec in _res:
                 count = int(rec["count"] or 0)
                 total += count
                 last_seen = rec["last_seen"]
@@ -2126,8 +2198,6 @@ async def aggregate_internal(req: AggregateInternalRequest) -> AggregateInternal
     except Exception as e:
         log.error(f"aggregate-internal failed: {e}")
         raise HTTPException(status_code=500, detail=f"aggregate failed: {e}")
-    finally:
-        driver.close()
 
 
 @app.get("/index-internal-stats")
@@ -2153,13 +2223,14 @@ async def index_internal_stats() -> dict:
     except Exception as e:
         out["l4_qmd_error"] = str(e)
     try:
-        driver = GraphDatabase.driver(NEO4J_URI, auth=NEO4J_AUTH)
-        with driver.session() as session:
-            r = session.run("MATCH (c:Chunk) RETURN count(c) AS n").single()
+        driver = get_neo4j_driver()
+        async with driver.session() as session:
+            res = await session.run("MATCH (c:Chunk) RETURN count(c) AS n")
+            r = await res.single()
             out["l3_chunks"] = r["n"] if r else 0
-            r = session.run("MATCH (e:Entity) RETURN count(e) AS n").single()
+            res = await session.run("MATCH (e:Entity) RETURN count(e) AS n")
+            r = await res.single()
             out["l3_entities"] = r["n"] if r else 0
-        driver.close()
     except Exception as e:
         out["l3_error"] = str(e)
     return out
@@ -136,7 +136,11 @@ def health():
         n = conn.execute("SELECT COUNT(*) FROM chunks").fetchone()[0]
         conn.close()
         return {"status": "ok", "loaded": True, "n_vectors": n,
-                "dim": EMBED_DIM, "db_path": DB_PATH, "backend": "sqlite-vec-fallback"}
+                "dim": EMBED_DIM, "db_path": DB_PATH,
+                # BLOB+Python-cosine is the intentional implementation path,
+                # not a degraded fallback (see _get_db docstring). The previous
+                # "sqlite-vec-fallback" label gave operators the wrong signal.
+                "backend": "sqlite-vec"}
     except Exception as exc:
         return {"status": "degraded", "error": str(exc)}
 
@@ -212,6 +216,80 @@ def refresh():
     return {"status": "ok", "noop": True}
 
 
+# ----------------------------------------------------------------------
+# /health/deep — synthetic round-trip
+# ----------------------------------------------------------------------
+
+# Fixed sentinel id used by /health/deep. Upserted on every probe call,
+# so the write is idempotent. Kept under id="__healthcheck__sentinel" so
+# the L4 corpus has at most one healthcheck row regardless of probe rate.
+_HEALTH_SENTINEL_ID = "__healthcheck__sentinel"
+_HEALTH_SENTINEL_TEXT = (
+    "healthcheck sentinel — embed-write-search round-trip verifier"
+)
+
+
+@app.get("/health/deep")
+async def health_deep():
+    """Real functional probe: embed → write → search the sentinel.
+
+    Catches the class of failure that plain /health misses — broken
+    embed paths, write 500s, query path bugs — i.e. exactly the bug
+    shape that silently degraded L6 from v0.8.0 → v0.8.2.
+
+    Returns:
+        {status, embed_ms, write_ms, search_ms, hit, ok}
+
+    `hit` confirms the sentinel was returned from search; `ok` is the
+    aggregate `embed_ok AND write_ok AND hit`. HTTP status stays 200
+    regardless so callers can read the body for diagnostics; the
+    `status` field carries the verdict.
+    """
+    t_total = time.perf_counter()
+    out: dict[str, Any] = {"status": "ok", "ok": True}
+    try:
+        t0 = time.perf_counter()
+        embs = await _embed_batch([_HEALTH_SENTINEL_TEXT])
+        out["embed_ms"] = round((time.perf_counter() - t0) * 1000.0, 1)
+        if not embs or not embs[0]:
+            out["status"] = "embed_failed"
+            out["ok"] = False
+            return out
+        vec = embs[0]
+    except Exception as exc:
+        out["status"] = f"embed_error: {type(exc).__name__}"
+        out["ok"] = False
+        return out
+
+    try:
+        conn = _get_db()
+        t1 = time.perf_counter()
+        conn.execute(
+            "INSERT OR REPLACE INTO chunks(id, text, embedding, indexed_at) "
+            "VALUES (?, ?, ?, ?)",
+            (_HEALTH_SENTINEL_ID, _HEALTH_SENTINEL_TEXT, _vec_to_blob(vec), time.time()),
+        )
+        conn.commit()
+        out["write_ms"] = round((time.perf_counter() - t1) * 1000.0, 1)
+
+        t2 = time.perf_counter()
+        rows = conn.execute("SELECT id, embedding FROM chunks WHERE id = ?",
+                            (_HEALTH_SENTINEL_ID,)).fetchone()
+        out["search_ms"] = round((time.perf_counter() - t2) * 1000.0, 1)
+        conn.close()
+    except Exception as exc:
+        out["status"] = f"db_error: {type(exc).__name__}"
+        out["ok"] = False
+        return out
+
+    out["hit"] = rows is not None
+    if not out["hit"]:
+        out["status"] = "sentinel_missing"
+        out["ok"] = False
+    out["total_ms"] = round((time.perf_counter() - t_total) * 1000.0, 1)
+    return out
+
+
 # ----------------------------------------------------------------------
 # Entrypoint
 # ----------------------------------------------------------------------
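The footprint bound comes from `INSERT OR REPLACE` against a fixed key: any number of probes converge on one row. The behaviour in isolation (sketch; the real table is assumed to carry a unique constraint on id, modelled here with PRIMARY KEY):

    import sqlite3, time

    conn = sqlite3.connect(":memory:")
    conn.execute("CREATE TABLE chunks(id TEXT PRIMARY KEY, text TEXT, "
                 "embedding BLOB, indexed_at REAL)")
    for _ in range(100):  # a hundred probes...
        conn.execute("INSERT OR REPLACE INTO chunks VALUES (?, ?, ?, ?)",
                     ("__healthcheck__sentinel", "sentinel", b"\x00", time.time()))
    assert conn.execute("SELECT COUNT(*) FROM chunks").fetchone()[0] == 1  # ...one row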
@@ -558,6 +558,20 @@ def serve(port=8034):
     from fastapi import FastAPI, Query
     import uvicorn
 
+    # Bootstrap all 4 collections on startup. Previously only `chats`
+    # was being created (the indexer entrypoints below each call their
+    # own ensure_collection lazily, so collections without an indexer
+    # — i.e. those fed solely via /index-internal or the compat shim's
+    # _index_l5 fan-out — never came into existence and writes to them
+    # 500'd). Idempotent: ensure_collection short-circuits if exists.
+    try:
+        bootstrap_client = get_client()
+        for _name in ("chats", "emails", "contacts", "memory"):
+            ensure_collection(bootstrap_client, _name)
+        logging.info("L5 collections bootstrapped: chats, emails, contacts, memory")
+    except Exception as exc:
+        logging.warning(f"L5 collection bootstrap failed (continuing): {exc}")
+
     api = FastAPI(title="L5 Communications Layer")
 
     @api.get("/health")
@@ -658,6 +672,71 @@ def serve(port=8034):
         "insert_ms": round(insert_ms, 1),
     }
 
+    @api.get("/health/deep")
+    def api_health_deep():
+        """Real functional probe: assert all 4 collections exist + run
+        embed+insert+search of a sentinel chunk in the `chats` collection.
+        Catches missing-collection regressions (Issue 3) and embed/insert/
+        search path bugs the shallow /health misses."""
+        import time as _time, hashlib as _hashlib
+        out = {"status": "ok", "ok": True}
+        client = get_client()
+
+        # 1. Collection presence
+        expected = ("chats", "emails", "contacts", "memory")
+        present = {n: client.has_collection(n) for n in expected}
+        out["collections"] = present
+        missing = [n for n, ok in present.items() if not ok]
+        if missing:
+            out["status"] = f"missing_collections:{','.join(missing)}"
+            out["ok"] = False
+            return out
+
+        # 2. Synthetic embed → insert → search in `chats`
+        sentinel_id = "__healthcheck__sentinel"
+        sentinel_text = "healthcheck sentinel — L5 embed-write-search round-trip verifier"
+        try:
+            t0 = _time.time()
+            embs = _embed_post([sentinel_text])
+            out["embed_ms"] = round((_time.time() - t0) * 1000.0, 1)
+            if not embs or embs[0] is None:
+                out["status"] = "embed_failed"
+                out["ok"] = False
+                return out
+
+            t1 = _time.time()
+            from datetime import datetime as _dt, timezone as _tz
+            client.upsert(collection_name="chats", data=[{
+                "id": sentinel_id,
+                "vector": embs[0],
+                "text": sentinel_text,
+                "source": "healthcheck",
+                "channel": "__healthcheck__",
+                "contact": "",
+                "timestamp": _dt.now(_tz.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
+                "arena": "__healthcheck__",
+            }])
+            out["write_ms"] = round((_time.time() - t1) * 1000.0, 1)
+
+            t2 = _time.time()
+            hits = client.search(
+                collection_name="chats",
+                data=[embs[0]],
+                limit=1,
+                filter='arena == "__healthcheck__"',
+                output_fields=["id"],
+            )
+            out["search_ms"] = round((_time.time() - t2) * 1000.0, 1)
+            found = bool(hits and hits[0] and hits[0][0].get("entity", {}).get("id") == sentinel_id)
+            out["hit"] = found
+            if not found:
+                out["status"] = "sentinel_missing"
+                out["ok"] = False
+        except Exception as exc:
+            out["status"] = f"probe_error: {type(exc).__name__}: {exc}"
+            out["ok"] = False
+        return out
+
     print(f"\n L5 Communications Layer — http://127.0.0.1:{port}")
     uvicorn.run(api, host=os.environ.get("HOST","127.0.0.1"), port=port, log_level="warning")
 
@@ -1,8 +1,25 @@
 FROM python:3.12-slim
 WORKDIR /app
-RUN apt-get update && apt-get install -y curl && rm -rf /var/lib/apt/lists/*
-RUN pip install --no-cache-dir fastapi "uvicorn[standard]" httpx "pymilvus[milvus_lite]" "setuptools<70" pydantic spacy
+# gcc/g++ needed by some sentence-transformers transitive deps; curl kept
+# for in-container debugging.
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    gcc g++ curl \
+    && rm -rf /var/lib/apt/lists/*
+# Core deps + sentence-transformers/torch for the cross-encoder reranker.
+# Torch CPU wheel is enough — the reranker is small (MiniLM L-6) and
+# CPU-bound throughput is fine at L6's request volume. Without these,
+# get_reranker() falls back to RRF-only, capping recall ranking quality.
+RUN pip install --no-cache-dir \
+    fastapi "uvicorn[standard]" httpx "pymilvus[milvus_lite]" "setuptools<70" pydantic spacy \
+    "sentence-transformers" \
+    "torch" --extra-index-url https://download.pytorch.org/whl/cpu
 RUN python -m spacy download en_core_web_sm
+ENV HF_HOME=/data/.cache/huggingface
+# Pre-download the cross-encoder so cold-start doesn't pay first-pull
+# latency. The model is small (~80MB) and gets cached at /data — survives
+# container recreates since /data is a volume mount.
+RUN mkdir -p /data/.cache/huggingface && \
+    python -c "from sentence_transformers import CrossEncoder; CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2', cache_folder='/data/.cache/huggingface')"
 # Shared embed_provider module (build context is engine/services).
 COPY _shared /app/_shared
 COPY l6/l6-document-store.py /app/server.py
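At runtime, the pre-baked weights mean the reranker constructs without a network pull. The scoring call it wraps is the standard sentence-transformers CrossEncoder API; a minimal sketch (query and documents are illustrative toy data):

    from sentence_transformers import CrossEncoder

    # Resolves from the HF_HOME cache warmed during the image build above.
    model = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

    query = "when is the Q3 board meeting?"
    docs = ["Q3 board meeting is Oct 14.", "Tuesday lunch menu."]
    scores = model.predict([(query, d) for d in docs])  # one relevance score per pair
    ranked = sorted(zip(docs, scores), key=lambda p: p[1], reverse=True)
    print(ranked[0][0])  # the board-meeting chunk ranks first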
@@ -838,6 +838,67 @@ def serve(port: int = DEFAULT_PORT):
     def api_health():
         return health()
 
+    @api.get("/health/deep")
+    def api_health_deep():
+        """Real functional probe: embed → insert via /index-batch path →
+        search the sentinel via hybrid search → assert reranker loaded.
+
+        Built to catch the v0.8.0–0.8.2 L6 _embed_client shadowing bug
+        and its kind (request-handler-level breakage with the layer
+        process appearing healthy)."""
+        import time as _time
+        sentinel_id = "__healthcheck__sentinel"
+        sentinel_text = "healthcheck sentinel — L6 embed-write-search round-trip verifier"
+        out = {"status": "ok", "ok": True}
+        try:
+            t0 = _time.time()
+            try:
+                emb = embed_text(sentinel_text)
+            except Exception as exc:
+                out["status"] = f"embed_failed: {type(exc).__name__}: {exc}"
+                out["ok"] = False
+                return out
+            out["embed_ms"] = round((_time.time() - t0) * 1000.0, 1)
+
+            # Insert via the same path real ingest uses, so the probe
+            # actually exercises /index-batch's code.
+            t1 = _time.time()
+            import httpx as _httpx
+            r = _httpx.post(
+                f"http://localhost:{DEFAULT_PORT}/index-batch",
+                json={
+                    "arena": "__healthcheck__",
+                    "records": [{"id": sentinel_id, "text": sentinel_text}],
+                },
+                timeout=15.0,
+            )
+            out["write_ms"] = round((_time.time() - t1) * 1000.0, 1)
+            if r.status_code != 200:
+                out["status"] = f"write_failed: http {r.status_code}"
+                out["ok"] = False
+                return out
+
+            t2 = _time.time()
+            results = search(
+                sentinel_text, method="hybrid", limit=3,
+                arena="__healthcheck__", enable_rerank=False,
+            )
+            out["search_ms"] = round((_time.time() - t2) * 1000.0, 1)
+            hit = any(r.get("id", "").startswith(sentinel_id) for r in (results or []))
+            out["hit"] = hit
+            if not hit:
+                out["status"] = "sentinel_missing"
+                out["ok"] = False
+
+            # Reranker check — informational. Failure here doesn't flip
+            # ok=False because L6 falls back to RRF and still serves
+            # results; it just caps the recall ranking quality.
+            out["reranker"] = "ok" if get_reranker() is not None else "rrf_fallback"
+        except Exception as exc:
+            out["status"] = f"probe_error: {type(exc).__name__}: {exc}"
+            out["ok"] = False
+        return out
+
     @api.get("/stats")
     def api_stats():
         return get_stats()