@pentatonic-ai/ai-agent-sdk 0.6.0 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (94) hide show
  1. package/README.md +170 -69
  2. package/bin/__tests__/callback-server.test.js +4 -1
  3. package/bin/cli.js +41 -164
  4. package/bin/commands/config.js +251 -0
  5. package/package.json +2 -1
  6. package/packages/doctor/__tests__/detect.test.js +2 -6
  7. package/packages/doctor/src/checks/local-memory.js +164 -196
  8. package/packages/doctor/src/detect.js +11 -3
  9. package/packages/memory/src/corpus/adapters.js +104 -0
  10. package/packages/memory/src/corpus/cli.js +72 -7
  11. package/packages/memory/src/corpus/index.js +1 -1
  12. package/packages/memory-engine/.env.example +13 -0
  13. package/packages/memory-engine/README.md +131 -0
  14. package/packages/memory-engine/bench/README.md +99 -0
  15. package/packages/memory-engine/bench/scorecards-engine/agent-coding__pentatonic-baseline__20260427-142523.json +1115 -0
  16. package/packages/memory-engine/bench/scorecards-engine/chat-recall__pentatonic-baseline__20260427-142648.json +819 -0
  17. package/packages/memory-engine/bench/scorecards-engine/circular-economy__pentatonic-baseline__20260427-142757.json +1278 -0
  18. package/packages/memory-engine/bench/scorecards-engine/customer-support__pentatonic-baseline__20260427-142900.json +1018 -0
  19. package/packages/memory-engine/bench/scorecards-engine/marketplace-ops__pentatonic-baseline__20260427-142957.json +1038 -0
  20. package/packages/memory-engine/bench/scorecards-engine/product-catalogue__pentatonic-baseline__20260427-143122.json +961 -0
  21. package/packages/memory-engine/bench/scorecards-engine-via-docker/agent-coding__pentatonic-memory__20260427-161812.json +1115 -0
  22. package/packages/memory-engine/bench/scorecards-engine-via-docker/chat-recall__pentatonic-memory__20260427-161701.json +819 -0
  23. package/packages/memory-engine/bench/scorecards-engine-via-docker/circular-economy__pentatonic-memory__20260427-161713.json +1278 -0
  24. package/packages/memory-engine/bench/scorecards-engine-via-docker/customer-support__pentatonic-memory__20260427-161723.json +1018 -0
  25. package/packages/memory-engine/bench/scorecards-engine-via-docker/marketplace-ops__pentatonic-memory__20260427-161732.json +1038 -0
  26. package/packages/memory-engine/bench/scorecards-engine-via-docker/product-catalogue__pentatonic-memory__20260427-161741.json +937 -0
  27. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/agent-coding__pentatonic-memory__20260427-184718.json +1115 -0
  28. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/chat-recall__pentatonic-memory__20260427-184614.json +819 -0
  29. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/circular-economy__pentatonic-memory__20260427-184809.json +1278 -0
  30. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/customer-support__pentatonic-memory__20260427-184854.json +1018 -0
  31. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/marketplace-ops__pentatonic-memory__20260427-184929.json +1038 -0
  32. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/product-catalogue__pentatonic-memory__20260427-185015.json +961 -0
  33. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/agent-coding__pentatonic-memory__20260427-175252.json +1115 -0
  34. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/chat-recall__pentatonic-memory__20260427-175312.json +819 -0
  35. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/circular-economy__pentatonic-memory__20260427-175335.json +1278 -0
  36. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/customer-support__pentatonic-memory__20260427-175355.json +1018 -0
  37. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/marketplace-ops__pentatonic-memory__20260427-175413.json +1038 -0
  38. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/product-catalogue__pentatonic-memory__20260427-175430.json +883 -0
  39. package/packages/memory-engine/bench/scorecards-engine-via-shim/agent-coding__pentatonic-memory__20260427-155409.json +1115 -0
  40. package/packages/memory-engine/bench/scorecards-engine-via-shim/chat-recall__pentatonic-memory__20260427-155421.json +819 -0
  41. package/packages/memory-engine/bench/scorecards-engine-via-shim/circular-economy__pentatonic-memory__20260427-155433.json +1278 -0
  42. package/packages/memory-engine/bench/scorecards-engine-via-shim/customer-support__pentatonic-memory__20260427-155443.json +1018 -0
  43. package/packages/memory-engine/bench/scorecards-engine-via-shim/marketplace-ops__pentatonic-memory__20260427-155453.json +1038 -0
  44. package/packages/memory-engine/bench/scorecards-engine-via-shim/product-catalogue__pentatonic-memory__20260427-155503.json +937 -0
  45. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/agent-coding__pentatonic-memory-latest__20260427-145103.json +1115 -0
  46. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/agent-coding__pentatonic-memory__20260427-144909.json +1115 -0
  47. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/chat-recall__pentatonic-memory-latest__20260427-145153.json +819 -0
  48. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/chat-recall__pentatonic-memory__20260427-145120.json +542 -0
  49. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/circular-economy__pentatonic-memory-latest__20260427-145313.json +1278 -0
  50. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/circular-economy__pentatonic-memory__20260427-145207.json +894 -0
  51. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/customer-support__pentatonic-memory-latest__20260427-145412.json +1018 -0
  52. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/customer-support__pentatonic-memory__20260427-145327.json +680 -0
  53. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/marketplace-ops__pentatonic-memory-latest__20260427-145517.json +1038 -0
  54. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/marketplace-ops__pentatonic-memory__20260427-145422.json +693 -0
  55. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/product-catalogue__pentatonic-memory-latest__20260427-145616.json +961 -0
  56. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/product-catalogue__pentatonic-memory__20260427-145528.json +727 -0
  57. package/packages/memory-engine/compat/Dockerfile +11 -0
  58. package/packages/memory-engine/compat/server.py +680 -0
  59. package/packages/memory-engine/docker-compose.yml +243 -0
  60. package/packages/memory-engine/docs/MIGRATION.md +178 -0
  61. package/packages/memory-engine/docs/RUNBOOK-AWS.md +375 -0
  62. package/packages/memory-engine/docs/why-v05-underperforms.md +138 -0
  63. package/packages/memory-engine/engine/README.md +52 -0
  64. package/packages/memory-engine/engine/l2-hybridrag-proxy.py +1543 -0
  65. package/packages/memory-engine/engine/l5-comms-layer.py +663 -0
  66. package/packages/memory-engine/engine/l6-document-store.py +1018 -0
  67. package/packages/memory-engine/engine/services/l2/Dockerfile +41 -0
  68. package/packages/memory-engine/engine/services/l2/init_databases.py +81 -0
  69. package/packages/memory-engine/engine/services/l2/l2-hybridrag-proxy.py +1543 -0
  70. package/packages/memory-engine/engine/services/l4/Dockerfile +15 -0
  71. package/packages/memory-engine/engine/services/l4/server.py +235 -0
  72. package/packages/memory-engine/engine/services/l5/Dockerfile +9 -0
  73. package/packages/memory-engine/engine/services/l5/l5-comms-layer.py +678 -0
  74. package/packages/memory-engine/engine/services/l6/Dockerfile +11 -0
  75. package/packages/memory-engine/engine/services/l6/l6-document-store.py +1016 -0
  76. package/packages/memory-engine/engine/services/nv-embed/Dockerfile +28 -0
  77. package/packages/memory-engine/engine/services/nv-embed/server.py +152 -0
  78. package/packages/memory-engine/pme_memory/__init__.py +0 -0
  79. package/packages/memory-engine/pme_memory/__main__.py +129 -0
  80. package/packages/memory-engine/pme_memory/artifacts.py +95 -0
  81. package/packages/memory-engine/pme_memory/embed.py +74 -0
  82. package/packages/memory-engine/pme_memory/health.py +36 -0
  83. package/packages/memory-engine/pme_memory/hygiene.py +159 -0
  84. package/packages/memory-engine/pme_memory/indexer.py +200 -0
  85. package/packages/memory-engine/pme_memory/needs.py +55 -0
  86. package/packages/memory-engine/pme_memory/provenance.py +80 -0
  87. package/packages/memory-engine/pme_memory/scoring.py +168 -0
  88. package/packages/memory-engine/pme_memory/search.py +52 -0
  89. package/packages/memory-engine/pme_memory/store.py +86 -0
  90. package/packages/memory-engine/pme_memory/synthesis.py +114 -0
  91. package/packages/memory-engine/pyproject.toml +65 -0
  92. package/packages/memory-engine/scripts/kg-extractor.py +557 -0
  93. package/packages/memory-engine/scripts/kg-preflexor-v2.py +738 -0
  94. package/packages/memory-engine/tests/test_api_contract.sh +57 -0
@@ -0,0 +1,1018 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ L6 Document Store — HybridRAG for Document Retrieval
4
+
5
+ Features:
6
+ - Milvus Lite (vector) + SQLite FTS5 (BM25) + RRF fusion
7
+ - Cross-encoder reranker (ms-marco-MiniLM-L-6-v2)
8
+ - Ingest-time entity extraction via Ollama graph-preflexor
9
+ - Adaptive chunk sizing by doc_type
10
+ - Freshness-aware dedup (purge-and-replace on re-index)
11
+ - Confidence scoring (RRF + engine_count + reranker_score)
12
+
13
+ Port: 8037
14
+ """
15
+
16
+ import argparse
17
+ import hashlib
18
+ import json
19
+ import logging
20
+ import os
21
+ import re
22
+ import sqlite3
23
+ import time
24
+ from datetime import datetime, timezone
25
+ from pathlib import Path
26
+ from typing import Any, Dict, List, Optional, Tuple
27
+
28
+ import httpx
29
+ from pymilvus import MilvusClient, DataType, CollectionSchema, FieldSchema
30
+ from pymilvus.milvus_client.index import IndexParams
31
+
32
+ # ---------------------------------------------------------------------------
33
+ # Config
34
+ # ---------------------------------------------------------------------------
35
+
36
# Storage locations. L6_DATA_DIR overrides the default directory under the
# current user's home; both database files live inside it.
DATA_DIR = Path(os.environ.get("L6_DATA_DIR", str(Path.home() / "l6-document-store" / "data")))
MILVUS_DB = str(DATA_DIR / "documents.db")
FTS_DB = str(DATA_DIR / "documents_fts.db")

# Embedding backends: NV-Embed endpoint (primary) and Ollama (fallback).
OLLAMA_URL = os.environ.get("L6_OLLAMA_URL", "http://localhost:11434")
EMBED_MODEL = os.environ.get("L6_EMBED_MODEL", "nomic-embed-text")
NV_EMBED_URL = os.environ.get("L6_NV_EMBED_URL", "http://localhost:8041/v1/embeddings")
# Accept the usual truthy spellings, not just the literal "true", so that
# L6_NV_EMBED_ENABLED=1 does not silently disable the primary embedder.
NV_EMBED_ENABLED = os.environ.get("L6_NV_EMBED_ENABLED", "true").strip().lower() in (
    "1", "true", "yes", "on",
)
# Vector dimension used when the Milvus collection is first created.
# NOTE(review): the default fallback model (nomic-embed-text) presumably does
# not emit 4096-dim vectors — confirm L6_EMBED_DIM matches the model in use.
EMBED_DIM = int(os.environ.get("L6_EMBED_DIM", "4096"))
COLLECTION_NAME = "documents"
RRF_K = 60  # rank-smoothing constant in the 1 / (RRF_K + rank + 1) fusion term
DEFAULT_PORT = 8037

# Chunk sizes by doc_type (character counts; "general" is the fallback entry)
CHUNK_CONFIG = {
    "legal": {"max_chars": 2500, "overlap": 400},
    "financial": {"max_chars": 2500, "overlap": 400},
    "governance": {"max_chars": 2500, "overlap": 400},
    "technical": {"max_chars": 2000, "overlap": 300},
    "general": {"max_chars": 1500, "overlap": 200},
}

logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
log = logging.getLogger("l6-document-store")
59
+
60
+ # ---------------------------------------------------------------------------
61
+ # Embedding
62
+ # ---------------------------------------------------------------------------
63
+
64
_embed_client = httpx.Client(timeout=60)


def embed_text(text: str) -> List[float]:
    """Return an embedding vector for ``text``.

    When ``NV_EMBED_ENABLED`` is set, the NV-Embed endpoint is tried first
    with the first 4000 characters of the text. Any exception on that path is
    logged as a warning and the Ollama embeddings API is used instead, with
    the first 8000 characters. Errors from the Ollama request are not caught
    here and propagate to the caller.

    NOTE(review): nothing here checks that the two backends return vectors of
    the same dimension — confirm against ``EMBED_DIM``.
    """
    if NV_EMBED_ENABLED:
        try:
            nv_reply = _embed_client.post(NV_EMBED_URL, json={"input": text[:4000]})
            nv_reply.raise_for_status()
            return nv_reply.json()["data"][0]["embedding"]
        except Exception as e:
            log.warning(f"NV-Embed-v2 failed, falling back to Ollama: {e}")

    # Ollama fallback (also the only path when NV-Embed is disabled)
    payload = {"model": EMBED_MODEL, "prompt": text[:8000]}
    ollama_reply = _embed_client.post(f"{OLLAMA_URL}/api/embeddings", json=payload)
    ollama_reply.raise_for_status()
    return ollama_reply.json()["embedding"]
83
+
84
+
85
def embed_batch(texts: List[str]) -> List[List[float]]:
    """Embed several texts, returning one vector per input in input order.

    An empty input returns ``[]`` without any network call. When
    ``NV_EMBED_ENABLED`` is set, all texts (each cut to 4000 characters) are
    sent to the NV-Embed endpoint in a single request. If that request fails,
    or the response does not contain exactly one embedding per input, a
    warning is logged and each text is embedded individually via
    ``embed_text``.
    """
    if not texts:
        return []

    if NV_EMBED_ENABLED:
        try:
            resp = _embed_client.post(NV_EMBED_URL, json={"input": [t[:4000] for t in texts]})
            resp.raise_for_status()
            vectors = [d["embedding"] for d in resp.json()["data"]]
            # A short or long response would silently misalign vectors and texts.
            if len(vectors) != len(texts):
                raise ValueError(f"expected {len(texts)} embeddings, got {len(vectors)}")
            return vectors
        except Exception as e:
            log.warning(f"NV-Embed-v2 batch failed, falling back to sequential: {e}")

    return [embed_text(t) for t in texts]
96
+
97
+ # ---------------------------------------------------------------------------
98
+ # Cross-Encoder Reranker
99
+ # ---------------------------------------------------------------------------
100
+
101
_reranker = None
_reranker_loaded = False


def get_reranker():
    """Return the shared cross-encoder, loading it on the first call.

    The load is attempted once per process. If importing or constructing the
    model raises, a warning is logged and ``None`` is remembered, so later
    calls return ``None`` without retrying.
    """
    global _reranker, _reranker_loaded
    if _reranker_loaded:
        return _reranker

    try:
        from sentence_transformers import CrossEncoder
        _reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")
        log.info("Cross-encoder reranker loaded (ms-marco-MiniLM-L-6-v2)")
    except Exception as e:
        log.warning(f"Cross-encoder not available: {e}")
        _reranker = None
    # Mark as attempted whether or not the load succeeded.
    _reranker_loaded = True
    return _reranker
117
+
118
+
119
def rerank(query: str, results: List[Dict], top_k: int = 10) -> List[Dict]:
    """Reorder the leading results by cross-encoder relevance to ``query``.

    Only the first 20 results are scored, using the first 512 characters of
    each result's ``"text"``. Each scored dict gains a ``"reranker_score"``
    key and the first 20 slots of ``results`` are rewritten in descending
    score order, so the caller's list and dicts are modified in place.
    Returns the first ``top_k`` entries. When no reranker is available or
    ``results`` is empty, the input is returned unscored, cut to ``top_k``.
    """
    model = get_reranker()
    if not model or not results:
        return results[:top_k]

    window = results[:20]
    relevance = model.predict([(query, item["text"][:512]) for item in window])

    for position, item in enumerate(window):
        item["reranker_score"] = float(relevance[position])

    window.sort(key=lambda item: item.get("reranker_score", -999), reverse=True)
    results[:20] = window
    return results[:top_k]
133
+
134
+ # ---------------------------------------------------------------------------
135
+ # Entity Extraction (ingest-time)
136
+ # ---------------------------------------------------------------------------
137
+
138
def _first_json_array(raw: str) -> Optional[list]:
    """Return the first JSON array embedded in ``raw``, or None if there is none."""
    decoder = json.JSONDecoder()
    pos = raw.find("[")
    while pos != -1:
        try:
            # raw_decode parses one complete JSON value starting at pos, so a
            # "]" inside a string or a nested array cannot cut the match short.
            value, _ = decoder.raw_decode(raw, pos)
        except json.JSONDecodeError:
            value = None
        if isinstance(value, list):
            return value
        pos = raw.find("[", pos + 1)
    return None


def extract_entities(text: str) -> List[str]:
    """Extract entities from text using Ollama graph-preflexor.

    Sends the first 2000 characters of ``text`` to the Ollama generate API
    with a 15 second timeout and looks for a JSON array in the model's reply.
    Returns at most 20 stripped entity strings longer than one character.
    Nested lists and dicts in the array are ignored. Returns ``[]`` on a
    non-200 status, when no array is found, or on any exception (which is
    logged at debug level, not raised).
    """
    try:
        resp = _embed_client.post(
            f"{OLLAMA_URL}/api/generate",
            json={
                "model": "graph-preflexor",
                "prompt": f"Extract all named entities (people, companies, products, places, dates) from this text. Return ONLY a JSON array of strings, nothing else.\n\nText: {text[:2000]}",
                "stream": False,
            },
            timeout=15,
        )
        if resp.status_code == 200:
            raw = resp.json().get("response", "")
            entities = _first_json_array(raw)
            if entities is not None:
                cleaned = [
                    str(e).strip()
                    for e in entities
                    if e and not isinstance(e, (list, dict)) and len(str(e).strip()) > 1
                ]
                return cleaned[:20]
    except Exception as e:
        log.debug(f"Entity extraction failed: {e}")
    return []
160
+
161
+ # ---------------------------------------------------------------------------
162
+ # Document Processing
163
+ # ---------------------------------------------------------------------------
164
+
165
def detect_doc_type(path: str) -> str:
    """Detect document type from path patterns.

    The lower-cased path is checked against keyword groups in priority order:
    legal, financial, governance, technical. The first matching group wins,
    otherwise ``"general"`` is returned. Ordinary keywords match as
    substrings anywhere in the path. The short acronyms "nda" and "api" must
    appear as a whole path token (optionally plural), because as substrings
    they also match unrelated words such as "standards", "calendar" or
    "capital".
    """
    p = path.lower()
    # Tokens are maximal runs of letters/digits, so "-", "_", "/" and "." all
    # act as separators.
    tokens = set(re.split(r"[^a-z0-9]+", p))

    def matches(keywords, acronyms=()):
        if any(k in p for k in keywords):
            return True
        return any(a in tokens or f"{a}s" in tokens for a in acronyms)

    if matches(["legal", "contract", "agreement", "terms"], ["nda"]):
        return "legal"
    if matches(["finance", "financial", "investor", "revenue", "budget", "portfolio"]):
        return "financial"
    if matches(["governance", "policy", "compliance", "audit"]):
        return "governance"
    # "openapi" is listed explicitly because "api" no longer matches inside it.
    if matches(["technical", "architecture", "schema", "code", "openapi"], ["api"]):
        return "technical"
    return "general"
177
+
178
+
179
def detect_arena(path: str) -> str:
    """Map a file path to an arena label by substring match.

    The lower-cased path is tested against each rule in order and the first
    rule with a matching substring decides the arena. Paths matching no rule
    are labelled ``"general"``.
    """
    lowered = path.lower()
    # (arena, substrings) pairs; order is significant because the first hit wins.
    rules = (
        ("company", ("company", "internal")),
        ("project", ("project", "proj-")),
        ("sarai", ("sarai", "defence")),
        ("research", ("research",)),
        ("finance", ("finance", "portfolio", "stock")),
    )
    for arena_name, needles in rules:
        for needle in needles:
            if needle in lowered:
                return arena_name
    return "general"
193
+
194
+
195
def content_hash(text: str) -> str:
    """Return the first 16 hex characters of the SHA-256 digest of ``text``.

    The text is encoded with ``str.encode()`` defaults before hashing. The
    shortened digest is used as a dedup fingerprint.
    """
    digest = hashlib.sha256(text.encode())
    return digest.hexdigest()[:16]
198
+
199
+
200
def chunk_markdown(text: str, doc_type: str = "general") -> List[Dict]:
    """Split markdown text into heading-tagged chunks.

    The text is cut at level-2 and level-3 headings (``##`` / ``###``). Each
    section body goes through ``_split_section`` with the size and overlap
    configured for ``doc_type`` in ``CHUNK_CONFIG``; unknown types use the
    "general" entry. The heading line is not part of the chunk text; it is
    carried in each chunk's ``"heading"`` field with the ``#`` markers
    removed. Text before the first heading gets an empty heading. If every
    section body is blank but the text is not, the whole text is chunked
    with an empty heading.
    """
    sizing = CHUNK_CONFIG.get(doc_type, CHUNK_CONFIG["general"])
    limit = sizing["max_chars"]
    overlap = sizing["overlap"]

    # First pass: pair every body with the heading that precedes it.
    sections = []
    heading = ""
    body = ""
    for part in re.split(r'(^#{2,3}\s+.+$)', text, flags=re.MULTILINE):
        if re.match(r'^#{2,3}\s+', part):
            sections.append((heading, body))
            heading = part.strip().lstrip('#').strip()
            body = ""
        else:
            body += part
    sections.append((heading, body))

    # Second pass: chunk each non-blank body.
    chunks = []
    for section_heading, section_body in sections:
        trimmed = section_body.strip()
        if trimmed:
            chunks.extend(_split_section(trimmed, section_heading, limit, overlap))

    # Only reached with content when the text holds headings and nothing else.
    if not chunks and text.strip():
        chunks = _split_section(text.strip(), "", limit, overlap)

    return chunks
232
+
233
+
234
+ def _split_section(text: str, heading: str, max_chars: int, overlap: int) -> List[Dict]:
235
+ """Split a section into overlapping chunks."""
236
+ if len(text) <= max_chars:
237
+ return [{"text": text, "heading": heading}]
238
+
239
+ chunks = []
240
+ start = 0
241
+ while start < len(text):
242
+ end = start + max_chars
243
+
244
+ # Try to break at paragraph boundary
245
+ if end < len(text):
246
+ para_break = text.rfind('\n\n', start, end)
247
+ if para_break > start + max_chars // 2:
248
+ end = para_break
249
+
250
+ chunk_text = text[start:end].strip()
251
+ if chunk_text:
252
+ chunks.append({"text": chunk_text, "heading": heading})
253
+
254
+ start = end - overlap
255
+ if start >= len(text):
256
+ break
257
+
258
+ return chunks
259
+
260
+ # ---------------------------------------------------------------------------
261
+ # Milvus Operations
262
+ # ---------------------------------------------------------------------------
263
+
264
def get_milvus() -> MilvusClient:
    """Open the Milvus database file, creating the collection on first use.

    Ensures ``DATA_DIR`` exists and opens a client on ``MILVUS_DB``. If the
    ``COLLECTION_NAME`` collection is missing, it is created with the chunk
    schema below, given a cosine AUTOINDEX on the vector field, and loaded.
    A new client is constructed on every call.
    """
    DATA_DIR.mkdir(parents=True, exist_ok=True)
    client = MilvusClient(uri=MILVUS_DB)

    if COLLECTION_NAME in client.list_collections():
        return client

    def varchar(name: str, size: int) -> FieldSchema:
        return FieldSchema(name, DataType.VARCHAR, max_length=size)

    fields = [
        FieldSchema("id", DataType.VARCHAR, is_primary=True, max_length=64),
        FieldSchema("vector", DataType.FLOAT_VECTOR, dim=EMBED_DIM),
        varchar("text", 16000),
        varchar("source_file", 500),
        varchar("arena", 60),
        varchar("doc_type", 30),
        varchar("heading", 300),
        FieldSchema("chunk_index", DataType.INT64),
        varchar("content_hash", 20),
        varchar("entities_json", 2000),
        varchar("indexed_at", 30),
    ]
    client.create_collection(
        collection_name=COLLECTION_NAME,
        schema=CollectionSchema(fields=fields),
    )

    # Vector index
    index_params = IndexParams()
    index_params.add_index(field_name="vector", index_type="AUTOINDEX", metric_type="COSINE")
    client.create_index(collection_name=COLLECTION_NAME, index_params=index_params)
    client.load_collection(COLLECTION_NAME)
    log.info(f"Created Milvus collection '{COLLECTION_NAME}'")

    return client
295
+
296
+
297
def search_vector(client: MilvusClient, query_vec: List[float], limit: int = 20,
                  arena: Optional[str] = None) -> List[Dict]:
    """Vector similarity search.

    Runs ``query_vec`` against the ``COLLECTION_NAME`` collection and returns
    up to ``limit`` hits as plain dicts tagged ``"engine": "vector"``, with
    the hit distance in ``"score"``. When ``arena`` is given, only chunks
    whose ``arena`` field equals it are searched.
    """
    filter_expr = None
    if arena:
        # Escape backslashes and quotes so the value cannot terminate the
        # string literal and alter the filter expression.
        escaped_arena = arena.replace("\\", "\\\\").replace('"', '\\"')
        filter_expr = f'arena == "{escaped_arena}"'

    results = client.search(
        collection_name=COLLECTION_NAME,
        data=[query_vec],
        limit=limit,
        output_fields=["text", "source_file", "arena", "doc_type", "heading",
                       "chunk_index", "content_hash", "entities_json", "indexed_at"],
        filter=filter_expr,
    )
    out = []
    for hits in results:
        for hit in hits:
            entity = hit.get("entity", {})
            out.append({
                "text": entity.get("text", ""),
                "source_file": entity.get("source_file", ""),
                "arena": entity.get("arena", ""),
                "doc_type": entity.get("doc_type", ""),
                "heading": entity.get("heading", ""),
                "chunk_index": entity.get("chunk_index", 0),
                "content_hash": entity.get("content_hash", ""),
                "entities": _parse_entities_json(entity.get("entities_json", "[]")),
                "score": hit.get("distance", 0),
                "engine": "vector",
            })
    return out
326
+
327
+ # ---------------------------------------------------------------------------
328
+ # FTS5 Operations
329
+ # ---------------------------------------------------------------------------
330
+
331
def get_fts_db() -> sqlite3.Connection:
    """Get or create FTS5 database.

    Ensures ``DATA_DIR`` exists, opens ``FTS_DB`` in WAL mode and creates,
    if missing, the ``chunks`` content table, the ``chunks_fts`` FTS5 index
    over it (external-content mode), and the triggers that keep the index in
    step with inserts, updates and deletes on ``chunks``.
    """
    DATA_DIR.mkdir(parents=True, exist_ok=True)
    conn = sqlite3.connect(FTS_DB)
    conn.execute("PRAGMA journal_mode=WAL")
    # Rows are written with INSERT OR REPLACE. SQLite only fires DELETE
    # triggers for rows removed by REPLACE when recursive triggers are on;
    # without this the FTS index keeps entries for replaced rows.
    # NOTE(review): databases written before this pragma was set may already
    # hold stale index entries; an FTS5 'rebuild' would clear them.
    conn.execute("PRAGMA recursive_triggers=ON")

    # Create content table
    conn.execute("""
        CREATE TABLE IF NOT EXISTS chunks (
            id TEXT PRIMARY KEY,
            text TEXT,
            source_file TEXT,
            arena TEXT,
            doc_type TEXT,
            heading TEXT,
            chunk_index INTEGER,
            content_hash TEXT,
            entities_json TEXT,
            indexed_at TEXT
        )
    """)

    # Create FTS5 virtual table
    conn.execute("""
        CREATE VIRTUAL TABLE IF NOT EXISTS chunks_fts USING fts5(
            text, source_file, arena, heading, entities_json,
            content='chunks',
            content_rowid='rowid'
        )
    """)

    # Triggers for sync
    conn.execute("""
        CREATE TRIGGER IF NOT EXISTS chunks_ai AFTER INSERT ON chunks BEGIN
            INSERT INTO chunks_fts(rowid, text, source_file, arena, heading, entities_json)
            VALUES (new.rowid, new.text, new.source_file, new.arena, new.heading, new.entities_json);
        END
    """)
    conn.execute("""
        CREATE TRIGGER IF NOT EXISTS chunks_ad AFTER DELETE ON chunks BEGIN
            INSERT INTO chunks_fts(chunks_fts, rowid, text, source_file, arena, heading, entities_json)
            VALUES ('delete', old.rowid, old.text, old.source_file, old.arena, old.heading, old.entities_json);
        END
    """)
    # An in-place UPDATE removes the old index entry and adds the new one.
    conn.execute("""
        CREATE TRIGGER IF NOT EXISTS chunks_au AFTER UPDATE ON chunks BEGIN
            INSERT INTO chunks_fts(chunks_fts, rowid, text, source_file, arena, heading, entities_json)
            VALUES ('delete', old.rowid, old.text, old.source_file, old.arena, old.heading, old.entities_json);
            INSERT INTO chunks_fts(rowid, text, source_file, arena, heading, entities_json)
            VALUES (new.rowid, new.text, new.source_file, new.arena, new.heading, new.entities_json);
        END
    """)

    conn.commit()
    return conn
378
+
379
+
380
def search_fts(conn: sqlite3.Connection, query: str, limit: int = 20,
               arena: Optional[str] = None) -> List[Dict]:
    """BM25 keyword search via FTS5.

    Punctuation in ``query`` is replaced by spaces and every remaining word
    is passed to FTS5 as a quoted string, so words such as AND, OR, NOT and
    NEAR are matched as text and not parsed as query operators. All words
    must match (FTS5 implicit AND). When ``arena`` is given, only rows with
    that arena are returned.

    Returns up to ``limit`` dicts tagged ``"engine": "bm25"``, best match
    first. ``"score"`` is the negated bm25 value, so larger is better.
    Returns ``[]`` for a query with no searchable words or when SQLite
    rejects the query (logged as a warning).
    """
    # Strip FTS5 special chars, then quote each term so it is taken literally.
    terms = re.sub(r'[^\w\s]', ' ', query).split()
    if not terms:
        return []
    match_expr = " ".join(f'"{term}"' for term in terms)

    # Columns are named explicitly so the mapping below does not depend on
    # the physical column order of the chunks table.
    sql = """
        SELECT c.text, c.source_file, c.arena, c.doc_type, c.heading,
               c.chunk_index, c.content_hash, c.entities_json,
               bm25(chunks_fts) as rank
        FROM chunks_fts f
        JOIN chunks c ON c.rowid = f.rowid
        WHERE chunks_fts MATCH ?
    """
    params = [match_expr]
    if arena:
        sql += " AND c.arena = ?"
        params.append(arena)
    sql += " ORDER BY rank LIMIT ?"
    params.append(limit)

    try:
        rows = conn.execute(sql, params).fetchall()
    except sqlite3.OperationalError as e:
        log.warning(f"FTS query failed: {e}")
        return []

    out = []
    for (text, source_file, row_arena, doc_type, heading,
         chunk_index, c_hash, entities_json, rank) in rows:
        out.append({
            "text": text,
            "source_file": source_file,
            "arena": row_arena,
            "doc_type": doc_type,
            "heading": heading,
            "chunk_index": chunk_index,
            "content_hash": c_hash,
            "entities": _parse_entities_json(entities_json),
            "score": -rank,  # BM25 returns negative scores
            "engine": "bm25",
        })
    return out
425
+
426
+ # ---------------------------------------------------------------------------
427
+ # RRF Fusion
428
+ # ---------------------------------------------------------------------------
429
+
430
def rrf_fuse(vector_results: List[Dict], bm25_results: List[Dict]) -> List[Dict]:
    """Reciprocal Rank Fusion combining vector and BM25 results.

    Results are identified by ``(source_file, chunk_index)``. Each appearance
    at zero-based position ``rank`` in a list adds ``1 / (RRF_K + rank + 1)``
    to that chunk's score. The result dict kept for a chunk is the first one
    seen (vector list first) and it is annotated in place with
    ``"rrf_score"`` (rounded to 6 places), ``"engine_count"`` and
    ``"engines"`` (sorted, so the order is stable across runs). Returns the
    annotated dicts, highest fused score first.
    """
    scored = {}

    for engine, ranked in (("vector", vector_results), ("bm25", bm25_results)):
        for rank, r in enumerate(ranked):
            key = (r["source_file"], r["chunk_index"])
            entry = scored.setdefault(key, {"result": r, "rrf_score": 0, "engines": set()})
            entry["rrf_score"] += 1.0 / (RRF_K + rank + 1)
            entry["engines"].add(engine)

    # Sort by RRF score
    fused = sorted(scored.values(), key=lambda x: x["rrf_score"], reverse=True)

    out = []
    for item in fused:
        r = item["result"]
        r["rrf_score"] = round(item["rrf_score"], 6)
        r["engine_count"] = len(item["engines"])
        # Set iteration order varies between processes; sort for stable output.
        r["engines"] = sorted(item["engines"])
        out.append(r)

    return out
460
+
461
+ # ---------------------------------------------------------------------------
462
+ # Indexing
463
+ # ---------------------------------------------------------------------------
464
+
465
# File extensions accepted for indexing.
_SUPPORTED_EXTS = frozenset((
    '.md', '.txt', '.markdown', '.pdf',
    # enhanced_ingest formats
    '.csv', '.json', '.yaml', '.yml', '.toml',
    '.py', '.js', '.ts', '.go', '.rs', '.java', '.c', '.cpp', '.h',
    '.docx', '.doc', '.pptx', '.ppt', '.xlsx', '.xls',
    '.rtf', '.odt', '.epub', '.tex',
    '.html', '.htm', '.xml',
    '.ipynb',
))

# Formats that cannot be read as plain text; they are only indexed through
# enhanced_ingest and never through the basic text fallback.
_BINARY_EXTS = frozenset((
    '.pdf', '.docx', '.doc', '.pptx', '.ppt', '.xlsx', '.xls', '.odt', '.epub',
))


def _entities_to_json(entities: List[str], limit: int = 2000) -> str:
    """Serialize entities, dropping trailing ones until the JSON fits ``limit``."""
    kept = list(entities)
    encoded = json.dumps(kept)
    # Cutting the JSON string itself would leave it unparseable on read-back.
    while kept and len(encoded) > limit:
        kept.pop()
        encoded = json.dumps(kept)
    return encoded


def _build_record(chunk_text: str, idx: int, heading: Optional[str], source_file: str,
                  file_arena: str, file_doc_type: str, now: str,
                  extract_entities_flag: bool) -> Tuple[Dict, int]:
    """Embed one chunk and return its storage record plus its entity count."""
    c_hash = content_hash(chunk_text)

    # Extract entities (ingest-time)
    entities = []
    if extract_entities_flag and len(chunk_text) > 50:
        entities = extract_entities(chunk_text)

    record = {
        # NOTE(review): the id is not scoped by source_file, so identical
        # chunk text at the same index in two files shares one id — confirm
        # whether that is intended before changing the scheme.
        "id": f"{c_hash}_{idx}",
        "vector": embed_text(chunk_text),
        # NOTE(review): slices count characters; confirm how the Milvus
        # VARCHAR max_length limits count multi-byte text.
        "text": chunk_text[:15000],
        "source_file": source_file[:500],
        "arena": file_arena[:60],
        "doc_type": file_doc_type[:30],
        "heading": (heading or "")[:300],
        "chunk_index": idx,
        "content_hash": c_hash,
        "entities_json": _entities_to_json(entities),
        "indexed_at": now,
    }
    return record, len(entities)


def _store_records(milvus: MilvusClient, fts_conn: sqlite3.Connection,
                   source_file: str, records: List[Dict]) -> None:
    """Replace everything stored for ``source_file`` with ``records`` and commit."""
    # Purge old chunks for this file (freshness-aware dedup)
    _purge_file(milvus, fts_conn, source_file)
    fts_conn.executemany(
        "INSERT OR REPLACE INTO chunks VALUES (?,?,?,?,?,?,?,?,?,?)",
        [
            (r["id"], r["text"], r["source_file"], r["arena"], r["doc_type"],
             r["heading"], r["chunk_index"], r["content_hash"],
             r["entities_json"], r["indexed_at"])
            for r in records
        ],
    )
    if records:
        # Keyword arguments are required by newer pymilvus releases.
        milvus.upsert(collection_name=COLLECTION_NAME, data=records)
    fts_conn.commit()


def index_documents(paths: List[str], arena: Optional[str] = None,
                    doc_type: Optional[str] = None,
                    extract_entities_flag: bool = True,
                    use_enhanced_ingest: bool = True) -> Dict:
    """Index documents into both Milvus and FTS5.

    Each path is handled independently. Missing files count as errors and
    unsupported extensions as skipped. Binary formats (PDF, Office, ODT,
    EPUB) always go through ``enhanced_ingest``; other formats use it when
    ``use_enhanced_ingest`` is true. If enhanced ingest is unavailable or
    fails, text formats fall back to ``chunk_markdown`` and binary formats
    are counted as errors.

    All chunks of a file are embedded before its previous chunks are purged,
    so an embedding failure leaves the previously indexed version in place.
    FTS changes are committed per file and rolled back if that file fails.

    ``arena`` and ``doc_type`` override path-based detection when given.
    Returns a stats dict with the keys ``files``, ``chunks``,
    ``entities_extracted``, ``errors``, ``skipped``, ``tables`` and
    ``semantic_chunks``.
    """
    milvus = get_milvus()
    fts_conn = get_fts_db()

    stats = {"files": 0, "chunks": 0, "entities_extracted": 0, "errors": 0, "skipped": 0,
             "tables": 0, "semantic_chunks": 0}
    now = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

    try:
        for file_path in paths:
            p = Path(file_path)
            if not p.exists():
                log.warning(f"File not found: {file_path}")
                stats["errors"] += 1
                continue

            suffix = p.suffix.lower()
            if suffix not in _SUPPORTED_EXTS:
                log.debug(f"Skipping unsupported: {file_path}")
                stats["skipped"] += 1
                continue

            source_file = str(p)
            file_doc_type = doc_type or detect_doc_type(source_file)
            is_binary = suffix in _BINARY_EXTS

            if is_binary or use_enhanced_ingest:
                try:
                    from enhanced_ingest import ingest_document
                    result = ingest_document(source_file, arena or detect_arena(source_file))
                    file_arena = result["arena"]

                    records = []
                    entity_total = semantic_total = table_total = 0
                    for chunk_data in result["chunks"]:
                        # Track semantic vs table chunks
                        chunk_kind = chunk_data.get("metadata", {}).get("type")
                        if chunk_kind == "semantic":
                            semantic_total += 1
                        elif chunk_kind == "table":
                            table_total += 1

                        record, entity_count = _build_record(
                            chunk_data["text"], chunk_data["chunk_index"],
                            chunk_data.get("heading"), source_file, file_arena,
                            file_doc_type, now, extract_entities_flag,
                        )
                        records.append(record)
                        entity_total += entity_count

                    _store_records(milvus, fts_conn, source_file, records)

                    # Count only after the file has been stored successfully.
                    stats["files"] += 1
                    stats["chunks"] += len(result["chunks"])
                    stats["entities_extracted"] += entity_total
                    stats["semantic_chunks"] += semantic_total
                    stats["tables"] += table_total
                    log.info(f"Indexed (enhanced): {p.name} — {len(result['chunks'])} chunks, {len(result.get('tables', []))} tables")
                    continue

                except ImportError:
                    log.warning("enhanced_ingest not available, falling back to basic chunking")
                except Exception as e:
                    log.error(f"Enhanced ingest failed for {file_path}: {e}")

                # Discard any uncommitted FTS changes from the failed attempt.
                fts_conn.rollback()
                if is_binary:
                    # Reading a binary file as text would index garbage.
                    stats["errors"] += 1
                    continue
                # Fall through to basic chunking for text formats

            try:
                text = p.read_text(encoding="utf-8", errors="replace")
                if len(text.strip()) < 20:
                    stats["skipped"] += 1
                    continue

                file_arena = arena or detect_arena(source_file)

                # Chunk the document
                chunks = chunk_markdown(text, file_doc_type)

                records = []
                entity_total = 0
                for idx, chunk in enumerate(chunks):
                    record, entity_count = _build_record(
                        chunk["text"], idx, chunk.get("heading"), source_file,
                        file_arena, file_doc_type, now, extract_entities_flag,
                    )
                    records.append(record)
                    entity_total += entity_count

                _store_records(milvus, fts_conn, source_file, records)

                stats["files"] += 1
                stats["chunks"] += len(chunks)
                stats["entities_extracted"] += entity_total
                log.info(f"Indexed {p.name}: {len(chunks)} chunks, arena={file_arena}, type={file_doc_type}")

            except Exception as e:
                fts_conn.rollback()
                log.error(f"Error indexing {file_path}: {e}")
                stats["errors"] += 1
    finally:
        fts_conn.close()

    return stats
653
+
654
+
655
def _purge_file(milvus: MilvusClient, fts_conn: sqlite3.Connection, source_file: str):
    """Remove all chunks for a source file (freshness-aware re-index).

    Deletes the matching rows from the Milvus collection and from the
    ``chunks`` table. Both deletions are best-effort: a failure in either
    store is logged at debug level and never raised. The FTS delete is not
    committed here; the caller owns the transaction.

    Args:
        milvus: Client used to delete from ``COLLECTION_NAME``.
        fts_conn: Open connection to the database holding ``chunks``.
        source_file: Path the chunks were indexed under.
    """
    # Every writer stores source_file truncated to 500 characters, so match
    # on the same truncated key or long paths would never be purged.
    stored_key = source_file[:500]

    # Escape backslashes first, then double quotes, so the path is embedded
    # as a plain string literal and cannot terminate or alter the filter
    # expression (e.g. Windows paths, file names containing quotes).
    escaped = stored_key.replace("\\", "\\\\").replace('"', '\\"')

    try:
        # Purge from Milvus
        milvus.delete(
            collection_name=COLLECTION_NAME,
            filter=f'source_file == "{escaped}"',
        )
    except Exception as e:
        log.debug(f"Milvus purge (may be empty): {e}")

    try:
        # Purge from FTS (parameterized, so no escaping is needed here)
        fts_conn.execute("DELETE FROM chunks WHERE source_file = ?", (stored_key,))
    except Exception as e:
        log.debug(f"FTS purge: {e}")
671
+
672
+
673
+ def _parse_entities_json(s: str) -> List[str]:
674
+ """Safely parse entities JSON."""
675
+ try:
676
+ return json.loads(s) if s else []
677
+ except (json.JSONDecodeError, TypeError):
678
+ return []
679
+
680
+ # ---------------------------------------------------------------------------
681
+ # Search
682
+ # ---------------------------------------------------------------------------
683
+
684
def search(query: str, method: str = "hybrid", limit: int = 10,
           arena: Optional[str] = None, enable_rerank: bool = True) -> List[Dict]:
    """Search documents with the specified method.

    Args:
        query: Free-text query.
        method: ``"vector"`` for embedding search only, ``"bm25"`` for
            full-text search only; any other value runs both engines and
            fuses the two result lists with ``rrf_fuse``.
        limit: Maximum number of results to return.
        arena: Optional arena filter passed through to both engines.
        enable_rerank: When true and more than one result was found, the
            results are passed through ``rerank`` before truncation.

    Returns:
        At most ``limit`` result dicts; an empty list when ``limit`` is not
        positive.
    """
    # A negative limit would make the final slice drop items from the end
    # instead of returning nothing, so bail out before doing any work.
    if limit <= 0:
        return []

    if method == "vector":
        vec = embed_text(query)
        results = search_vector(get_milvus(), vec, limit=limit, arena=arena)
    elif method == "bm25":
        results = search_fts(get_fts_db(), query, limit=limit, arena=arena)
    else:
        # Hybrid: RRF fusion. Keep the historical pool of 20 candidates per
        # engine, but grow it when the caller asks for more than 20 results
        # so the fused list is not starved.
        candidate_limit = max(20, limit)
        vec = embed_text(query)
        vector_results = search_vector(get_milvus(), vec, limit=candidate_limit, arena=arena)
        bm25_results = search_fts(get_fts_db(), query, limit=candidate_limit, arena=arena)
        results = rrf_fuse(vector_results, bm25_results)

    # Rerank if enabled (pointless for zero or one result)
    if enable_rerank and len(results) > 1:
        results = rerank(query, results, top_k=limit)

    return results[:limit]
705
+
706
+ # ---------------------------------------------------------------------------
707
+ # Stats & Health
708
+ # ---------------------------------------------------------------------------
709
+
710
def get_stats() -> Dict:
    """Get index statistics.

    Returns:
        A dict with ``vector_chunks`` (Milvus row count), ``fts_chunks``
        (row count of the ``chunks`` table) and per-value counts under
        ``arenas`` and ``doc_types``. A store that cannot be queried leaves
        its fields at their zero/empty defaults; the failure is logged
        rather than raised.
    """
    stats = {"vector_chunks": 0, "fts_chunks": 0, "arenas": {}, "doc_types": {}}

    try:
        info = get_milvus().get_collection_stats(COLLECTION_NAME)
        stats["vector_chunks"] = info.get("row_count", 0)
    except Exception as e:
        # Best-effort: report zero but leave a trace of why.
        log.warning("Milvus stats unavailable: %s", e)

    try:
        conn = get_fts_db()
        try:
            row = conn.execute("SELECT COUNT(*) FROM chunks").fetchone()
            stats["fts_chunks"] = row[0] if row else 0

            for name, count in conn.execute(
                "SELECT arena, COUNT(*) FROM chunks GROUP BY arena"
            ).fetchall():
                stats["arenas"][name] = count

            for name, count in conn.execute(
                "SELECT doc_type, COUNT(*) FROM chunks GROUP BY doc_type"
            ).fetchall():
                stats["doc_types"][name] = count
        finally:
            # Close even when a query fails so the connection is not leaked.
            conn.close()
    except Exception as e:
        log.warning("FTS stats unavailable: %s", e)

    return stats
737
+
738
+
739
def health() -> Dict:
    """Health check.

    Probes Milvus, the FTS database, the Ollama HTTP endpoint and the
    reranker.

    Returns:
        A dict with one human-readable entry per component plus an overall
        ``status`` that is ``"ok"`` unless a Milvus, FTS or Ollama probe
        failed, in which case it is ``"degraded"``. The reranker entry is
        informational and never degrades the overall status.
    """
    status = {"status": "ok", "milvus": "unknown", "fts": "unknown", "ollama": "unknown", "reranker": "unknown"}

    # Milvus
    try:
        client = get_milvus()
        colls = client.list_collections()
        status["milvus"] = f"ok ({len(colls)} collections)"
    except Exception as e:
        status["milvus"] = f"error: {e}"
        status["status"] = "degraded"

    # FTS
    try:
        conn = get_fts_db()
        try:
            cnt = conn.execute("SELECT COUNT(*) FROM chunks").fetchone()[0]
        finally:
            # Close even when the query fails so the connection is not leaked.
            conn.close()
        status["fts"] = f"ok ({cnt} chunks)"
    except Exception as e:
        status["fts"] = f"error: {e}"
        status["status"] = "degraded"

    # Ollama
    try:
        resp = _embed_client.get(f"{OLLAMA_URL}/api/tags", timeout=5)
        if resp.status_code == 200:
            status["ollama"] = "ok"
        else:
            # A reachable but failing endpoint is not healthy either.
            status["ollama"] = f"http {resp.status_code}"
            status["status"] = "degraded"
    except Exception as e:
        status["ollama"] = f"error: {e}"
        status["status"] = "degraded"

    # Reranker
    reranker = get_reranker()
    status["reranker"] = "loaded" if reranker else "unavailable (CPU fallback to RRF)"

    return status
775
+
776
+ # ---------------------------------------------------------------------------
777
+ # FastAPI Server
778
+ # ---------------------------------------------------------------------------
779
+
780
def serve(port: int = DEFAULT_PORT):
    """Run as HTTP API server.

    Builds a FastAPI app exposing ``/health``, ``/stats``, ``/search``
    (GET and POST), ``/index``, ``/index-batch``, ``/purge`` and
    ``/rebuild-index``, then blocks in ``uvicorn.run``.

    Args:
        port: TCP port to listen on. The bind address comes from the
            ``HOST`` environment variable and defaults to ``127.0.0.1``.
    """
    from fastapi import FastAPI, Query as Q, HTTPException
    from pydantic import BaseModel
    import uvicorn

    api = FastAPI(title="L6 Document Store", version="1.0.0")

    class IndexRequest(BaseModel):
        paths: List[str]
        arena: Optional[str] = None
        doc_type: Optional[str] = None
        extract_entities: bool = True

    @api.get("/health")
    def api_health():
        return health()

    @api.get("/stats")
    def api_stats():
        return get_stats()

    @api.get("/search")
    def api_search(
        q: str = Q(..., description="Search query"),
        method: str = Q("hybrid", description="hybrid|vector|bm25"),
        limit: int = Q(10, ge=1, le=50),
        arena: Optional[str] = Q(None),
        rerank: bool = Q(True),
    ):
        results = search(q, method=method, limit=limit, arena=arena, enable_rerank=rerank)
        return {"query": q, "method": method, "results": results, "count": len(results)}

    @api.post("/search")
    def api_search_post(
        q: str,
        method: str = "hybrid",
        limit: int = 10,
        arena: Optional[str] = None,
        rerank: bool = True,
    ):
        """POST version of search for compatibility."""
        results = search(q, method=method, limit=limit, arena=arena, enable_rerank=rerank)
        return {"query": q, "method": method, "results": results, "count": len(results)}

    @api.post("/index")
    def api_index(req: IndexRequest):
        stats = index_documents(
            req.paths, arena=req.arena, doc_type=req.doc_type,
            extract_entities_flag=req.extract_entities,
        )
        return {"status": "ok", "stats": stats}

    @api.post("/index-batch")
    def api_index_batch(req: dict):
        """Index a batch of in-memory documents in a single batched
        NV-Embed call + a single milvus upsert + one FTS write.

        Roughly 30-50x faster than calling /index for the equivalent
        files because the legacy path does one embed roundtrip per
        chunk. This endpoint exists for tests, smoke runs and bench
        harnesses where small corpora need to land quickly.

        Request body::

            {
              "arena": "benchmark",
              "records": [
                {
                  "id": "doc1",               # becomes chunk id prefix; defaults to a hash of the text
                  "text": "…",                # required, indexed as one chunk
                  "source_file": "doc1.md",   # optional
                  "doc_type": "general",      # optional, default "general"
                  "heading": "…"              # optional
                }, …
              ]
            }

        Returns::

            {"status": "ok", "inserted": N, "embed_ms": float, "insert_ms": float}
        """
        import time as _time, hashlib as _hashlib, httpx as _httpx
        from datetime import datetime as _dt, timezone as _tz

        records = req.get("records") or []
        arena = req.get("arena") or "general"
        if not records:
            return {"status": "ok", "inserted": 0}
        # Reject malformed payloads up front instead of failing with an
        # unhandled AttributeError halfway through the batch.
        if not isinstance(records, list) or not all(isinstance(r, dict) for r in records):
            raise HTTPException(status_code=400, detail="records must be a list of objects")

        texts = [(r.get("text") or "")[:16000] for r in records]

        # Single batched NV-Embed call.
        t0 = _time.time()
        try:
            resp = _httpx.post(
                NV_EMBED_URL, json={"input": texts, "model": "nv-embed-v2"},
                timeout=120,
            )
            resp.raise_for_status()
            embs = [d["embedding"] for d in resp.json()["data"]]
        except Exception as exc:
            raise HTTPException(status_code=500, detail=f"embed failed: {exc}") from exc
        embed_ms = (_time.time() - t0) * 1000.0

        # zip() would silently drop the tail on a short response and leave
        # the vector store and FTS out of step, so fail loudly instead.
        if len(embs) != len(texts):
            raise HTTPException(
                status_code=500,
                detail=f"embed failed: expected {len(texts)} embeddings, got {len(embs)}",
            )

        # Derive the per-record fields once so both stores get identical
        # ids, hashes and metadata.
        now = _dt.now(_tz.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
        prepared = []
        for r, txt in zip(records, texts):
            digest = _hashlib.sha1(txt.encode("utf-8")).hexdigest()
            rid = r.get("id") or digest[:32]
            prepared.append({
                "id": f"l6:{rid}:0"[:63],
                "text": txt,
                "source_file": (r.get("source_file") or f"{rid}.md")[:500],
                "arena": arena[:60],
                "doc_type": (r.get("doc_type") or "general")[:30],
                "heading": (r.get("heading") or "")[:300],
                "chunk_index": 0,
                "content_hash": digest[:20],
                "entities_json": "[]",
                "indexed_at": now,
            })

        # Single milvus write. Records without an embedding are skipped for
        # the vector store only.
        milvus = get_milvus()
        rows = [
            dict(meta, vector=emb)
            for meta, emb in zip(prepared, embs)
            if emb is not None
        ]
        t1 = _time.time()
        if rows:
            # Upsert rather than insert: re-posting the same ids must
            # replace the previous rows, matching the INSERT OR REPLACE
            # used for FTS below.
            milvus.upsert(collection_name=COLLECTION_NAME, data=rows)
        insert_ms = (_time.time() - t1) * 1000.0

        # FTS5 is auto-populated by a trigger when we INSERT into the
        # `chunks` table. Inserting into chunks_fts directly bypasses the
        # trigger and leaves the FTS5 index empty (the bug we hit on
        # first /index-batch landing). Write into `chunks` instead; the
        # AFTER-INSERT trigger syncs to chunks_fts atomically.
        fts_rows = [
            (m["id"], m["text"][:15000], m["source_file"], m["arena"],
             m["doc_type"], m["heading"], 0, m["content_hash"], "[]", now)
            for m in prepared
        ]
        try:
            fts_conn = get_fts_db()
            try:
                fts_conn.executemany(
                    "INSERT OR REPLACE INTO chunks VALUES (?,?,?,?,?,?,?,?,?,?)",
                    fts_rows,
                )
                fts_conn.commit()
            finally:
                # Close even when the write fails so the connection is not leaked.
                fts_conn.close()
        except Exception as exc:
            log.warning("FTS write failed in /index-batch: %s", exc)

        return {
            "status": "ok",
            "inserted": len(rows),
            "embed_ms": round(embed_ms, 1),
            "insert_ms": round(insert_ms, 1),
        }

    @api.delete("/purge")
    def api_purge(source_file: str = Q(...)):
        """Remove all chunks for a source file."""
        milvus = get_milvus()
        fts_conn = get_fts_db()
        try:
            _purge_file(milvus, fts_conn, source_file)
            fts_conn.commit()
        finally:
            fts_conn.close()
        return {"status": "purged", "source_file": source_file}

    @api.post("/rebuild-index")
    def api_rebuild():
        """Force Milvus index rebuild."""
        # Implemented as a release followed by a load of the collection.
        milvus = get_milvus()
        milvus.release_collection(COLLECTION_NAME)
        milvus.load_collection(COLLECTION_NAME)
        return {"status": "rebuilt"}

    # Log the address actually bound; HOST may override the default.
    host = os.environ.get("HOST", "127.0.0.1")
    log.info(f"L6 Document Store — http://{host}:{port}")
    uvicorn.run(api, host=host, port=port, log_level="info")
963
+
964
+ # ---------------------------------------------------------------------------
965
+ # CLI
966
+ # ---------------------------------------------------------------------------
967
+
968
def main():
    """Command-line entry point.

    Parses the sub-command (``serve``, ``index``, ``search``, ``health`` or
    ``stats``) together with its options and dispatches to the matching
    module function, printing JSON or formatted search hits to stdout.
    """
    cli = argparse.ArgumentParser(description="L6 Document Store")
    cli.add_argument("command", choices=["serve", "index", "search", "health", "stats"])
    cli.add_argument("args", nargs="*")
    cli.add_argument("--port", "-p", type=int, default=DEFAULT_PORT)
    cli.add_argument("--arena", "-a", type=str, default=None)
    cli.add_argument("--doc-type", "-t", type=str, default=None)
    cli.add_argument("--method", "-m", type=str, default="hybrid")
    cli.add_argument("--limit", "-l", type=int, default=10)
    cli.add_argument("--no-entities", action="store_true")
    cli.add_argument("--no-rerank", action="store_true")

    opts = cli.parse_args()
    command = opts.command

    if command == "serve":
        serve(port=opts.port)
        return

    if command == "index":
        targets = opts.args
        if not targets:
            # Nothing to index: show usage instead.
            print("Usage: l6-document-store.py index <file1.md> [file2.md ...]")
            print(" l6-document-store.py index ~/memory/research/*.md")
            return
        summary = index_documents(
            targets,
            arena=opts.arena,
            doc_type=opts.doc_type,
            extract_entities_flag=not opts.no_entities,
        )
        print(json.dumps(summary, indent=2))
        return

    if command == "search":
        # Positional words are joined back into one query string.
        query = " ".join(opts.args)
        if not query:
            print("Usage: l6-document-store.py search 'your query'")
            return
        hits = search(
            query,
            method=opts.method,
            limit=opts.limit,
            arena=opts.arena,
            enable_rerank=not opts.no_rerank,
        )
        for rank, hit in enumerate(hits, 1):
            source = hit.get('source_file', '?')
            score = hit.get('rrf_score', 0)
            engines = hit.get('engines', '?')
            print(f"\n--- [{rank}] {source} (rrf={score:.4f}, engines={engines}) ---")
            heading = hit.get("heading")
            if heading:
                print(f"Heading: {heading}")
            entities = hit.get("entities")
            if entities:
                print(f"Entities: {', '.join(entities[:10])}")
            print(hit["text"][:300])
        return

    if command == "health":
        print(json.dumps(health(), indent=2))
        return

    if command == "stats":
        print(json.dumps(get_stats(), indent=2))
1015
+
1016
+
1017
# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()