@pentatonic-ai/ai-agent-sdk 0.6.0 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +170 -69
- package/bin/__tests__/callback-server.test.js +4 -1
- package/bin/cli.js +41 -164
- package/bin/commands/config.js +251 -0
- package/package.json +2 -1
- package/packages/doctor/__tests__/detect.test.js +2 -6
- package/packages/doctor/src/checks/local-memory.js +164 -196
- package/packages/doctor/src/detect.js +11 -3
- package/packages/memory/src/corpus/adapters.js +104 -0
- package/packages/memory/src/corpus/cli.js +72 -7
- package/packages/memory/src/corpus/index.js +1 -1
- package/packages/memory-engine/.env.example +13 -0
- package/packages/memory-engine/README.md +131 -0
- package/packages/memory-engine/bench/README.md +99 -0
- package/packages/memory-engine/bench/scorecards-engine/agent-coding__pentatonic-baseline__20260427-142523.json +1115 -0
- package/packages/memory-engine/bench/scorecards-engine/chat-recall__pentatonic-baseline__20260427-142648.json +819 -0
- package/packages/memory-engine/bench/scorecards-engine/circular-economy__pentatonic-baseline__20260427-142757.json +1278 -0
- package/packages/memory-engine/bench/scorecards-engine/customer-support__pentatonic-baseline__20260427-142900.json +1018 -0
- package/packages/memory-engine/bench/scorecards-engine/marketplace-ops__pentatonic-baseline__20260427-142957.json +1038 -0
- package/packages/memory-engine/bench/scorecards-engine/product-catalogue__pentatonic-baseline__20260427-143122.json +961 -0
- package/packages/memory-engine/bench/scorecards-engine-via-docker/agent-coding__pentatonic-memory__20260427-161812.json +1115 -0
- package/packages/memory-engine/bench/scorecards-engine-via-docker/chat-recall__pentatonic-memory__20260427-161701.json +819 -0
- package/packages/memory-engine/bench/scorecards-engine-via-docker/circular-economy__pentatonic-memory__20260427-161713.json +1278 -0
- package/packages/memory-engine/bench/scorecards-engine-via-docker/customer-support__pentatonic-memory__20260427-161723.json +1018 -0
- package/packages/memory-engine/bench/scorecards-engine-via-docker/marketplace-ops__pentatonic-memory__20260427-161732.json +1038 -0
- package/packages/memory-engine/bench/scorecards-engine-via-docker/product-catalogue__pentatonic-memory__20260427-161741.json +937 -0
- package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/agent-coding__pentatonic-memory__20260427-184718.json +1115 -0
- package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/chat-recall__pentatonic-memory__20260427-184614.json +819 -0
- package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/circular-economy__pentatonic-memory__20260427-184809.json +1278 -0
- package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/customer-support__pentatonic-memory__20260427-184854.json +1018 -0
- package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/marketplace-ops__pentatonic-memory__20260427-184929.json +1038 -0
- package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/product-catalogue__pentatonic-memory__20260427-185015.json +961 -0
- package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/agent-coding__pentatonic-memory__20260427-175252.json +1115 -0
- package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/chat-recall__pentatonic-memory__20260427-175312.json +819 -0
- package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/circular-economy__pentatonic-memory__20260427-175335.json +1278 -0
- package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/customer-support__pentatonic-memory__20260427-175355.json +1018 -0
- package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/marketplace-ops__pentatonic-memory__20260427-175413.json +1038 -0
- package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/product-catalogue__pentatonic-memory__20260427-175430.json +883 -0
- package/packages/memory-engine/bench/scorecards-engine-via-shim/agent-coding__pentatonic-memory__20260427-155409.json +1115 -0
- package/packages/memory-engine/bench/scorecards-engine-via-shim/chat-recall__pentatonic-memory__20260427-155421.json +819 -0
- package/packages/memory-engine/bench/scorecards-engine-via-shim/circular-economy__pentatonic-memory__20260427-155433.json +1278 -0
- package/packages/memory-engine/bench/scorecards-engine-via-shim/customer-support__pentatonic-memory__20260427-155443.json +1018 -0
- package/packages/memory-engine/bench/scorecards-engine-via-shim/marketplace-ops__pentatonic-memory__20260427-155453.json +1038 -0
- package/packages/memory-engine/bench/scorecards-engine-via-shim/product-catalogue__pentatonic-memory__20260427-155503.json +937 -0
- package/packages/memory-engine/bench/scorecards-pentatonic-baseline/agent-coding__pentatonic-memory-latest__20260427-145103.json +1115 -0
- package/packages/memory-engine/bench/scorecards-pentatonic-baseline/agent-coding__pentatonic-memory__20260427-144909.json +1115 -0
- package/packages/memory-engine/bench/scorecards-pentatonic-baseline/chat-recall__pentatonic-memory-latest__20260427-145153.json +819 -0
- package/packages/memory-engine/bench/scorecards-pentatonic-baseline/chat-recall__pentatonic-memory__20260427-145120.json +542 -0
- package/packages/memory-engine/bench/scorecards-pentatonic-baseline/circular-economy__pentatonic-memory-latest__20260427-145313.json +1278 -0
- package/packages/memory-engine/bench/scorecards-pentatonic-baseline/circular-economy__pentatonic-memory__20260427-145207.json +894 -0
- package/packages/memory-engine/bench/scorecards-pentatonic-baseline/customer-support__pentatonic-memory-latest__20260427-145412.json +1018 -0
- package/packages/memory-engine/bench/scorecards-pentatonic-baseline/customer-support__pentatonic-memory__20260427-145327.json +680 -0
- package/packages/memory-engine/bench/scorecards-pentatonic-baseline/marketplace-ops__pentatonic-memory-latest__20260427-145517.json +1038 -0
- package/packages/memory-engine/bench/scorecards-pentatonic-baseline/marketplace-ops__pentatonic-memory__20260427-145422.json +693 -0
- package/packages/memory-engine/bench/scorecards-pentatonic-baseline/product-catalogue__pentatonic-memory-latest__20260427-145616.json +961 -0
- package/packages/memory-engine/bench/scorecards-pentatonic-baseline/product-catalogue__pentatonic-memory__20260427-145528.json +727 -0
- package/packages/memory-engine/compat/Dockerfile +11 -0
- package/packages/memory-engine/compat/server.py +680 -0
- package/packages/memory-engine/docker-compose.yml +243 -0
- package/packages/memory-engine/docs/MIGRATION.md +178 -0
- package/packages/memory-engine/docs/RUNBOOK-AWS.md +375 -0
- package/packages/memory-engine/docs/why-v05-underperforms.md +138 -0
- package/packages/memory-engine/engine/README.md +52 -0
- package/packages/memory-engine/engine/l2-hybridrag-proxy.py +1543 -0
- package/packages/memory-engine/engine/l5-comms-layer.py +663 -0
- package/packages/memory-engine/engine/l6-document-store.py +1018 -0
- package/packages/memory-engine/engine/services/l2/Dockerfile +41 -0
- package/packages/memory-engine/engine/services/l2/init_databases.py +81 -0
- package/packages/memory-engine/engine/services/l2/l2-hybridrag-proxy.py +1543 -0
- package/packages/memory-engine/engine/services/l4/Dockerfile +15 -0
- package/packages/memory-engine/engine/services/l4/server.py +235 -0
- package/packages/memory-engine/engine/services/l5/Dockerfile +9 -0
- package/packages/memory-engine/engine/services/l5/l5-comms-layer.py +678 -0
- package/packages/memory-engine/engine/services/l6/Dockerfile +11 -0
- package/packages/memory-engine/engine/services/l6/l6-document-store.py +1016 -0
- package/packages/memory-engine/engine/services/nv-embed/Dockerfile +28 -0
- package/packages/memory-engine/engine/services/nv-embed/server.py +152 -0
- package/packages/memory-engine/pme_memory/__init__.py +0 -0
- package/packages/memory-engine/pme_memory/__main__.py +129 -0
- package/packages/memory-engine/pme_memory/artifacts.py +95 -0
- package/packages/memory-engine/pme_memory/embed.py +74 -0
- package/packages/memory-engine/pme_memory/health.py +36 -0
- package/packages/memory-engine/pme_memory/hygiene.py +159 -0
- package/packages/memory-engine/pme_memory/indexer.py +200 -0
- package/packages/memory-engine/pme_memory/needs.py +55 -0
- package/packages/memory-engine/pme_memory/provenance.py +80 -0
- package/packages/memory-engine/pme_memory/scoring.py +168 -0
- package/packages/memory-engine/pme_memory/search.py +52 -0
- package/packages/memory-engine/pme_memory/store.py +86 -0
- package/packages/memory-engine/pme_memory/synthesis.py +114 -0
- package/packages/memory-engine/pyproject.toml +65 -0
- package/packages/memory-engine/scripts/kg-extractor.py +557 -0
- package/packages/memory-engine/scripts/kg-preflexor-v2.py +738 -0
- package/packages/memory-engine/tests/test_api_contract.sh +57 -0
|
@@ -0,0 +1,1018 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
L6 Document Store — HybridRAG for Document Retrieval
|
|
4
|
+
|
|
5
|
+
Features:
|
|
6
|
+
- Milvus Lite (vector) + SQLite FTS5 (BM25) + RRF fusion
|
|
7
|
+
- Cross-encoder reranker (ms-marco-MiniLM-L-6-v2)
|
|
8
|
+
- Ingest-time entity extraction via Ollama graph-preflexor
|
|
9
|
+
- Adaptive chunk sizing by doc_type
|
|
10
|
+
- Freshness-aware dedup (purge-and-replace on re-index)
|
|
11
|
+
- Confidence scoring (RRF + engine_count + reranker_score)
|
|
12
|
+
|
|
13
|
+
Port: 8037
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
import argparse
|
|
17
|
+
import hashlib
|
|
18
|
+
import json
|
|
19
|
+
import logging
|
|
20
|
+
import os
|
|
21
|
+
import re
|
|
22
|
+
import sqlite3
|
|
23
|
+
import time
|
|
24
|
+
from datetime import datetime, timezone
|
|
25
|
+
from pathlib import Path
|
|
26
|
+
from typing import Any, Dict, List, Optional, Tuple
|
|
27
|
+
|
|
28
|
+
import httpx
|
|
29
|
+
from pymilvus import MilvusClient, DataType, CollectionSchema, FieldSchema
|
|
30
|
+
from pymilvus.milvus_client.index import IndexParams
|
|
31
|
+
|
|
32
|
+
# ---------------------------------------------------------------------------
|
|
33
|
+
# Config
|
|
34
|
+
# ---------------------------------------------------------------------------
|
|
35
|
+
|
|
36
|
+
# Root directory for all on-disk state; override with the L6_DATA_DIR env var.
DATA_DIR = Path(os.environ.get("L6_DATA_DIR", str(Path.home() / "l6-document-store" / "data")))
# Database file handed to MilvusClient (vector side of the hybrid index).
MILVUS_DB = str(DATA_DIR / "documents.db")
# SQLite file holding the chunk table and its FTS5 index (keyword side).
FTS_DB = str(DATA_DIR / "documents_fts.db")
# Ollama base URL; used for the embedding fallback and for entity extraction.
OLLAMA_URL = os.environ.get("L6_OLLAMA_URL", "http://localhost:11434")
# Model name sent to Ollama's /api/embeddings endpoint on the fallback path.
EMBED_MODEL = os.environ.get("L6_EMBED_MODEL", "nomic-embed-text")
# Primary embedding endpoint (NV-Embed-v2).
NV_EMBED_URL = os.environ.get("L6_NV_EMBED_URL", "http://localhost:8041/v1/embeddings")
# Only the literal string "true" (any case) enables the primary embedder.
NV_EMBED_ENABLED = os.environ.get("L6_NV_EMBED_ENABLED", "true").lower() == "true"
# Dimension of the Milvus "vector" field.
# NOTE(review): the Ollama fallback model may emit a different dimension — confirm.
EMBED_DIM = int(os.environ.get("L6_EMBED_DIM", "4096"))
# Name of the single Milvus collection used by this service.
COLLECTION_NAME = "documents"
# Constant k in the reciprocal-rank-fusion term 1 / (k + rank + 1).
RRF_K = 60
DEFAULT_PORT = 8037

# Chunk sizes by doc_type: maximum characters per chunk and the number of
# characters shared between consecutive chunks. Unknown doc types fall back
# to the "general" entry.
CHUNK_CONFIG = {
    "legal": {"max_chars": 2500, "overlap": 400},
    "financial": {"max_chars": 2500, "overlap": 400},
    "governance": {"max_chars": 2500, "overlap": 400},
    "technical": {"max_chars": 2000, "overlap": 300},
    "general": {"max_chars": 1500, "overlap": 200},
}

# Root logging is configured at import time; every function below logs via `log`.
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
log = logging.getLogger("l6-document-store")
|
|
59
|
+
|
|
60
|
+
# ---------------------------------------------------------------------------
|
|
61
|
+
# Embedding
|
|
62
|
+
# ---------------------------------------------------------------------------
|
|
63
|
+
|
|
64
|
+
# Shared HTTP client (60 s default timeout) for embedding and entity calls.
_embed_client = httpx.Client(timeout=60)


def embed_text(text: str) -> List[float]:
    """Return the embedding vector for one piece of text.

    When NV_EMBED_ENABLED is set, the NV-Embed endpoint is tried first with
    the text cut to 4000 characters. Any failure on that path is logged as a
    warning and the call falls through to Ollama's /api/embeddings endpoint,
    which receives the text cut to 8000 characters.

    Raises:
        Whatever the Ollama request raises (HTTP status errors, transport
        errors, missing "embedding" key); only the primary path is guarded.
    """
    if NV_EMBED_ENABLED:
        try:
            nv_response = _embed_client.post(NV_EMBED_URL, json={"input": text[:4000]})
            nv_response.raise_for_status()
            first_item = nv_response.json()["data"][0]
            return first_item["embedding"]
        except Exception as e:
            log.warning(f"NV-Embed-v2 failed, falling back to Ollama: {e}")

    # Ollama fallback
    ollama_payload = {"model": EMBED_MODEL, "prompt": text[:8000]}
    ollama_response = _embed_client.post(f"{OLLAMA_URL}/api/embeddings", json=ollama_payload)
    ollama_response.raise_for_status()
    body = ollama_response.json()
    return body["embedding"]
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def embed_batch(texts: List[str]) -> List[List[float]]:
    """Embed a list of texts, returning one vector per input in the same order.

    An empty input returns an empty list without any network traffic. When
    NV_EMBED_ENABLED is set, all texts (each cut to 4000 characters) are sent
    to the NV-Embed endpoint in a single request. If that request fails, or
    the response does not contain exactly one embedding per input, a warning
    is logged and every text is embedded individually via embed_text().

    Raises:
        Whatever embed_text() raises on the sequential path.
    """
    if not texts:
        return []

    if NV_EMBED_ENABLED:
        try:
            resp = _embed_client.post(NV_EMBED_URL, json={"input": [t[:4000] for t in texts]})
            resp.raise_for_status()
            vectors = [d["embedding"] for d in resp.json()["data"]]
            # A short or long response would silently misalign vectors and
            # texts downstream; treat it as a failure and fall back instead.
            if len(vectors) != len(texts):
                raise ValueError(f"expected {len(texts)} embeddings, got {len(vectors)}")
            return vectors
        except Exception as e:
            log.warning(f"NV-Embed-v2 batch failed, falling back to sequential: {e}")

    return [embed_text(t) for t in texts]
|
|
96
|
+
|
|
97
|
+
# ---------------------------------------------------------------------------
|
|
98
|
+
# Cross-Encoder Reranker
|
|
99
|
+
# ---------------------------------------------------------------------------
|
|
100
|
+
|
|
101
|
+
# Cached cross-encoder instance and a flag recording that a load was attempted
# (so a failed load is not retried on every call).
_reranker = None
_reranker_loaded = False


def get_reranker():
    """Return the cached cross-encoder, loading it on the first call.

    The first call imports sentence_transformers and builds the
    ms-marco-MiniLM-L-6-v2 CrossEncoder. If that raises, a warning is logged
    and None is cached instead. Later calls return the cached value without
    trying again.
    """
    global _reranker, _reranker_loaded
    if _reranker_loaded:
        return _reranker

    try:
        from sentence_transformers import CrossEncoder
        _reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")
        log.info("Cross-encoder reranker loaded (ms-marco-MiniLM-L-6-v2)")
    except Exception as e:
        log.warning(f"Cross-encoder not available: {e}")
        _reranker = None
    _reranker_loaded = True
    return _reranker
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def rerank(query: str, results: List[Dict], top_k: int = 10) -> List[Dict]:
    """Re-order the leading results with the cross-encoder and return the top_k.

    Only the first 20 results are scored; each scored dict gains a
    "reranker_score" float and the first 20 slots of `results` are re-sorted
    in place by that score (descending). Entries beyond the first 20 keep
    their original order. If there are no results, or no reranker is
    available, the input order is returned unchanged (cut to top_k).

    Args:
        query: The search query paired with each candidate text.
        results: Candidate dicts; each must have a "text" key. Mutated in place.
        top_k: Maximum number of results to return.
    """
    # Check for an empty result list first so the (heavy, lazily loaded)
    # cross-encoder is never loaded when there is nothing to rank.
    if not results:
        return results[:top_k]

    reranker = get_reranker()
    if not reranker:
        return results[:top_k]

    head = results[:20]
    # Candidate text is cut to 512 characters before scoring.
    scores = reranker.predict([(query, r["text"][:512]) for r in head])

    for r, score in zip(head, scores):
        r["reranker_score"] = float(score)

    results[:20] = sorted(head, key=lambda x: x.get("reranker_score", -999), reverse=True)
    return results[:top_k]
|
|
133
|
+
|
|
134
|
+
# ---------------------------------------------------------------------------
|
|
135
|
+
# Entity Extraction (ingest-time)
|
|
136
|
+
# ---------------------------------------------------------------------------
|
|
137
|
+
|
|
138
|
+
def extract_entities(text: str) -> List[str]:
    """Extract named entities from text via the Ollama graph-preflexor model.

    Sends the first 2000 characters of `text` to Ollama's /api/generate
    endpoint (15 s timeout) and parses the first JSON array found in the
    model's response. This is best-effort: any failure (transport error,
    non-200 status, unparseable output) is logged at debug level and yields
    an empty list.

    Returns:
        Up to 20 stripped entity strings, each longer than one character.
    """
    try:
        resp = _embed_client.post(
            f"{OLLAMA_URL}/api/generate",
            json={
                "model": "graph-preflexor",
                "prompt": f"Extract all named entities (people, companies, products, places, dates) from this text. Return ONLY a JSON array of strings, nothing else.\n\nText: {text[:2000]}",
                "stream": False,
            },
            timeout=15,
        )
        if resp.status_code == 200:
            raw = resp.json().get("response", "")
            # Decode a JSON array starting at each '[' in turn. A real JSON
            # decoder (rather than a non-greedy regex) handles entities that
            # themselves contain ']' and ignores any text after the array.
            decoder = json.JSONDecoder()
            pos = raw.find("[")
            while pos != -1:
                try:
                    entities, _ = decoder.raw_decode(raw, pos)
                except json.JSONDecodeError:
                    pos = raw.find("[", pos + 1)
                    continue
                return [str(e).strip() for e in entities if e and len(str(e).strip()) > 1][:20]
    except Exception as e:
        log.debug(f"Entity extraction failed: {e}")
    return []
|
|
160
|
+
|
|
161
|
+
# ---------------------------------------------------------------------------
|
|
162
|
+
# Document Processing
|
|
163
|
+
# ---------------------------------------------------------------------------
|
|
164
|
+
|
|
165
|
+
def detect_doc_type(path: str) -> str:
    """Classify a document by keywords appearing anywhere in its path.

    The path is lower-cased and tested for substring matches against each
    category's keywords, in the order legal, financial, governance,
    technical. The first category with a hit wins; "general" is returned
    when nothing matches.
    """
    lowered = path.lower()
    # Order matters: earlier categories take precedence over later ones.
    keyword_table = (
        ("legal", ("legal", "contract", "nda", "agreement", "terms")),
        ("financial", ("finance", "financial", "investor", "revenue", "budget", "portfolio")),
        ("governance", ("governance", "policy", "compliance", "audit")),
        ("technical", ("technical", "architecture", "api", "schema", "code")),
    )
    for label, keywords in keyword_table:
        for keyword in keywords:
            if keyword in lowered:
                return label
    return "general"
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
def detect_arena(path: str) -> str:
    """Derive the arena (domain) label from keywords in a path.

    The lower-cased path is checked for substrings in the order company,
    project, sarai, research, finance. The first arena with a matching
    marker is returned, otherwise "general".
    """
    lowered = path.lower()
    # Order matters: earlier arenas take precedence over later ones.
    arena_markers = (
        ("company", ("company", "internal")),
        ("project", ("project", "proj-")),
        ("sarai", ("sarai", "defence")),
        ("research", ("research",)),
        ("finance", ("finance", "portfolio", "stock")),
    )
    for arena_name, markers in arena_markers:
        if any(marker in lowered for marker in markers):
            return arena_name
    return "general"
|
|
193
|
+
|
|
194
|
+
|
|
195
|
+
def content_hash(text: str) -> str:
    """Return the first 16 hex characters of the SHA-256 digest of text.

    The text is encoded with str.encode()'s default encoding before hashing.
    The shortened digest serves as the dedup key for a chunk.
    """
    digest = hashlib.sha256()
    digest.update(text.encode())
    full_hex = digest.hexdigest()
    return full_hex[:16]
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
def chunk_markdown(text: str, doc_type: str = "general") -> List[Dict]:
    """Split markdown into heading-scoped chunks sized for the given doc_type.

    The text is cut at level-2 and level-3 headings ("##" / "###"). The body
    under each heading is passed to _split_section() together with the
    heading title (leading '#' characters and whitespace removed), using the
    max_chars / overlap pair configured for doc_type in CHUNK_CONFIG
    (unknown types use the "general" entry). Text before the first heading
    is chunked with an empty heading. If this produces nothing but the text
    is non-blank, the whole text is chunked as one section.

    Returns:
        A list of {"text": ..., "heading": ...} dicts in document order.
    """
    sizing = CHUNK_CONFIG.get(doc_type, CHUNK_CONFIG["general"])
    limit = sizing["max_chars"]
    carry = sizing["overlap"]

    pieces: List[Dict] = []
    active_heading = ""
    body_parts: List[str] = []

    # Split on ## or ### headings; the capturing group keeps the heading
    # lines in the result so they can be recognised below.
    for segment in re.split(r'(^#{2,3}\s+.+$)', text, flags=re.MULTILINE):
        if re.match(r'^#{2,3}\s+', segment) is None:
            body_parts.append(segment)
            continue

        # A new heading closes the section collected so far.
        finished = "".join(body_parts).strip()
        if finished:
            pieces.extend(_split_section(finished, active_heading, limit, carry))
        active_heading = segment.strip().lstrip('#').strip()
        body_parts = []

    # Flush the trailing section.
    trailing = "".join(body_parts).strip()
    if trailing:
        pieces.extend(_split_section(trailing, active_heading, limit, carry))

    # Nothing collected but the text is non-blank: chunk it as a whole.
    whole = text.strip()
    if not pieces and whole:
        pieces = _split_section(whole, "", limit, carry)

    return pieces
|
|
232
|
+
|
|
233
|
+
|
|
234
|
+
def _split_section(text: str, heading: str, max_chars: int, overlap: int) -> List[Dict]:
|
|
235
|
+
"""Split a section into overlapping chunks."""
|
|
236
|
+
if len(text) <= max_chars:
|
|
237
|
+
return [{"text": text, "heading": heading}]
|
|
238
|
+
|
|
239
|
+
chunks = []
|
|
240
|
+
start = 0
|
|
241
|
+
while start < len(text):
|
|
242
|
+
end = start + max_chars
|
|
243
|
+
|
|
244
|
+
# Try to break at paragraph boundary
|
|
245
|
+
if end < len(text):
|
|
246
|
+
para_break = text.rfind('\n\n', start, end)
|
|
247
|
+
if para_break > start + max_chars // 2:
|
|
248
|
+
end = para_break
|
|
249
|
+
|
|
250
|
+
chunk_text = text[start:end].strip()
|
|
251
|
+
if chunk_text:
|
|
252
|
+
chunks.append({"text": chunk_text, "heading": heading})
|
|
253
|
+
|
|
254
|
+
start = end - overlap
|
|
255
|
+
if start >= len(text):
|
|
256
|
+
break
|
|
257
|
+
|
|
258
|
+
return chunks
|
|
259
|
+
|
|
260
|
+
# ---------------------------------------------------------------------------
|
|
261
|
+
# Milvus Operations
|
|
262
|
+
# ---------------------------------------------------------------------------
|
|
263
|
+
|
|
264
|
+
def get_milvus() -> MilvusClient:
    """Open the Milvus database at MILVUS_DB, creating the collection if absent.

    Ensures DATA_DIR exists and connects a MilvusClient. Only when
    COLLECTION_NAME is not yet listed does it create the collection with the
    schema below, add a COSINE AUTOINDEX on the "vector" field, and load the
    collection.

    Returns:
        A MilvusClient connected to MILVUS_DB. A new client is constructed
        on every call.
    """
    DATA_DIR.mkdir(parents=True, exist_ok=True)
    client = MilvusClient(uri=MILVUS_DB)

    if COLLECTION_NAME not in client.list_collections():
        # The VARCHAR limits below are upper bounds on the stored values.
        schema = CollectionSchema(fields=[
            FieldSchema("id", DataType.VARCHAR, is_primary=True, max_length=64),
            FieldSchema("vector", DataType.FLOAT_VECTOR, dim=EMBED_DIM),
            FieldSchema("text", DataType.VARCHAR, max_length=16000),
            FieldSchema("source_file", DataType.VARCHAR, max_length=500),
            FieldSchema("arena", DataType.VARCHAR, max_length=60),
            FieldSchema("doc_type", DataType.VARCHAR, max_length=30),
            FieldSchema("heading", DataType.VARCHAR, max_length=300),
            FieldSchema("chunk_index", DataType.INT64),
            FieldSchema("content_hash", DataType.VARCHAR, max_length=20),
            FieldSchema("entities_json", DataType.VARCHAR, max_length=2000),
            FieldSchema("indexed_at", DataType.VARCHAR, max_length=30),
        ])
        client.create_collection(
            collection_name=COLLECTION_NAME,
            schema=schema,
        )
        # Create index
        idx = IndexParams()
        idx.add_index(field_name="vector", index_type="AUTOINDEX", metric_type="COSINE")
        client.create_index(collection_name=COLLECTION_NAME, index_params=idx)
        client.load_collection(COLLECTION_NAME)
        log.info(f"Created Milvus collection '{COLLECTION_NAME}'")

    return client
|
|
295
|
+
|
|
296
|
+
|
|
297
|
+
def search_vector(client: MilvusClient, query_vec: List[float], limit: int = 20,
                  arena: Optional[str] = None) -> List[Dict]:
    """Run a vector similarity search and normalise the hits.

    Args:
        client: Milvus client to query.
        query_vec: Query embedding.
        limit: Maximum number of hits requested.
        arena: When given (non-empty), hits are restricted to that arena
            via a filter expression; otherwise no filter is applied.

    Returns:
        One dict per hit with the stored chunk fields, the parsed entity
        list under "entities", the hit's "distance" under "score", and
        "engine" set to "vector".
    """
    # NOTE(review): arena is interpolated into the expression unescaped —
    # confirm callers never pass values containing double quotes.
    arena_filter = f'arena == "{arena}"' if arena else None
    wanted_fields = ["text", "source_file", "arena", "doc_type", "heading",
                     "chunk_index", "content_hash", "entities_json", "indexed_at"]

    raw_results = client.search(
        collection_name=COLLECTION_NAME,
        data=[query_vec],
        limit=limit,
        output_fields=wanted_fields,
        filter=arena_filter,
    )

    def to_record(hit) -> Dict:
        stored = hit.get("entity", {})
        return {
            "text": stored.get("text", ""),
            "source_file": stored.get("source_file", ""),
            "arena": stored.get("arena", ""),
            "doc_type": stored.get("doc_type", ""),
            "heading": stored.get("heading", ""),
            "chunk_index": stored.get("chunk_index", 0),
            "content_hash": stored.get("content_hash", ""),
            "entities": _parse_entities_json(stored.get("entities_json", "[]")),
            "score": hit.get("distance", 0),
            "engine": "vector",
        }

    return [to_record(hit) for hits in raw_results for hit in hits]
|
|
326
|
+
|
|
327
|
+
# ---------------------------------------------------------------------------
|
|
328
|
+
# FTS5 Operations
|
|
329
|
+
# ---------------------------------------------------------------------------
|
|
330
|
+
|
|
331
|
+
def get_fts_db() -> sqlite3.Connection:
    """Open the SQLite database at FTS_DB, creating the schema if absent.

    Ensures DATA_DIR exists, opens a connection in WAL journal mode, and
    creates (if missing) the `chunks` content table, the external-content
    FTS5 table `chunks_fts` over it, and the insert/delete triggers that
    keep the two in sync.

    Returns:
        An open sqlite3.Connection. A new connection is opened on every call.
    """
    DATA_DIR.mkdir(parents=True, exist_ok=True)
    conn = sqlite3.connect(FTS_DB)
    conn.execute("PRAGMA journal_mode=WAL")
    # Rows are written with INSERT OR REPLACE. SQLite fires DELETE triggers
    # for rows removed by REPLACE only when recursive triggers are enabled;
    # without this the FTS index keeps stale entries for replaced rows.
    # The pragma is per-connection, so it is set on every open.
    conn.execute("PRAGMA recursive_triggers=ON")

    # Create content table
    conn.execute("""
        CREATE TABLE IF NOT EXISTS chunks (
            id TEXT PRIMARY KEY,
            text TEXT,
            source_file TEXT,
            arena TEXT,
            doc_type TEXT,
            heading TEXT,
            chunk_index INTEGER,
            content_hash TEXT,
            entities_json TEXT,
            indexed_at TEXT
        )
    """)

    # Create FTS5 virtual table (external content: the text lives in `chunks`)
    conn.execute("""
        CREATE VIRTUAL TABLE IF NOT EXISTS chunks_fts USING fts5(
            text, source_file, arena, heading, entities_json,
            content='chunks',
            content_rowid='rowid'
        )
    """)

    # Triggers for sync
    conn.execute("""
        CREATE TRIGGER IF NOT EXISTS chunks_ai AFTER INSERT ON chunks BEGIN
            INSERT INTO chunks_fts(rowid, text, source_file, arena, heading, entities_json)
            VALUES (new.rowid, new.text, new.source_file, new.arena, new.heading, new.entities_json);
        END
    """)
    # External-content tables are told about removals with the special
    # 'delete' command row, which must carry the old column values.
    conn.execute("""
        CREATE TRIGGER IF NOT EXISTS chunks_ad AFTER DELETE ON chunks BEGIN
            INSERT INTO chunks_fts(chunks_fts, rowid, text, source_file, arena, heading, entities_json)
            VALUES ('delete', old.rowid, old.text, old.source_file, old.arena, old.heading, old.entities_json);
        END
    """)

    conn.commit()
    return conn
|
|
378
|
+
|
|
379
|
+
|
|
380
|
+
def search_fts(conn: sqlite3.Connection, query: str, limit: int = 20,
               arena: Optional[str] = None) -> List[Dict]:
    """Run a BM25 keyword search against the FTS5 index.

    Every character of the query that is neither a word character nor
    whitespace is replaced by a space before it is handed to MATCH. A query
    that is empty after this cleaning returns no results without touching
    the database. An sqlite3.OperationalError raised by the query is logged
    as a warning and also yields an empty list.

    Args:
        conn: Connection to the chunk / FTS5 database.
        query: Free-text query.
        limit: Maximum number of rows returned.
        arena: When given (non-empty), only chunks of that arena match.

    Returns:
        One dict per row with the chunk fields, the parsed entity list under
        "entities", the negated bm25() rank under "score", and "engine" set
        to "bm25".
    """
    # Escape FTS5 special chars
    safe_query = re.sub(r'[^\w\s]', ' ', query).strip()
    if not safe_query:
        return []

    if arena:
        arena_filter = "AND c.arena = ?"
        params = [safe_query, arena, limit]
    else:
        arena_filter = ""
        params = [safe_query, limit]

    sql = f"""
        SELECT c.*, bm25(chunks_fts) as rank
        FROM chunks_fts f
        JOIN chunks c ON c.rowid = f.rowid
        WHERE chunks_fts MATCH ?
        {arena_filter}
        ORDER BY rank
        LIMIT ?
    """

    try:
        rows = conn.execute(sql, params).fetchall()
    except sqlite3.OperationalError as e:
        log.warning(f"FTS query failed: {e}")
        return []

    # Column order of `c.*` followed by the computed rank.
    column_names = ("id", "text", "source_file", "arena", "doc_type", "heading",
                    "chunk_index", "content_hash", "entities_json", "indexed_at", "rank")

    def to_record(row) -> Dict:
        fields = dict(zip(column_names, row))
        return {
            "text": fields.get("text", ""),
            "source_file": fields.get("source_file", ""),
            "arena": fields.get("arena", ""),
            "doc_type": fields.get("doc_type", ""),
            "heading": fields.get("heading", ""),
            "chunk_index": fields.get("chunk_index", 0),
            "content_hash": fields.get("content_hash", ""),
            "entities": _parse_entities_json(fields.get("entities_json", "[]")),
            "score": -fields.get("rank", 0),  # BM25 returns negative scores
            "engine": "bm25",
        }

    return [to_record(row) for row in rows]
|
|
425
|
+
|
|
426
|
+
# ---------------------------------------------------------------------------
|
|
427
|
+
# RRF Fusion
|
|
428
|
+
# ---------------------------------------------------------------------------
|
|
429
|
+
|
|
430
|
+
def rrf_fuse(vector_results: List[Dict], bm25_results: List[Dict]) -> List[Dict]:
    """Merge vector and BM25 rankings with Reciprocal Rank Fusion.

    Results are identified by (source_file, chunk_index). Each appearance at
    zero-based position `rank` in a list contributes 1 / (RRF_K + rank + 1)
    to that chunk's fused score. When a chunk appears in both lists, the
    dict from the vector list is the one kept.

    Args:
        vector_results: Ranked hits from the vector engine.
        bm25_results: Ranked hits from the BM25 engine.

    Returns:
        The kept result dicts, sorted by fused score (descending). Each is
        annotated in place with "rrf_score" (rounded to 6 places),
        "engine_count", and "engines" (alphabetically sorted so the output
        is stable across runs).
    """
    scored: Dict[Tuple[str, int], Dict[str, Any]] = {}

    # Vector first, so its dict wins when both engines return the same chunk.
    for engine, ranked in (("vector", vector_results), ("bm25", bm25_results)):
        for rank, r in enumerate(ranked):
            key = (r["source_file"], r["chunk_index"])
            entry = scored.setdefault(key, {"result": r, "rrf_score": 0, "engines": set()})
            entry["rrf_score"] += 1.0 / (RRF_K + rank + 1)
            entry["engines"].add(engine)

    # Sort by RRF score
    fused = sorted(scored.values(), key=lambda x: x["rrf_score"], reverse=True)

    out = []
    for item in fused:
        r = item["result"]
        r["rrf_score"] = round(item["rrf_score"], 6)
        r["engine_count"] = len(item["engines"])
        # Set iteration order is not stable between runs; sort for determinism.
        r["engines"] = sorted(item["engines"])
        out.append(r)

    return out
|
|
460
|
+
|
|
461
|
+
# ---------------------------------------------------------------------------
|
|
462
|
+
# Indexing
|
|
463
|
+
# ---------------------------------------------------------------------------
|
|
464
|
+
|
|
465
|
+
# File extensions index_documents() accepts; anything else is skipped.
_SUPPORTED_EXTS = frozenset({
    '.md', '.txt', '.markdown', '.pdf',
    # enhanced_ingest formats
    '.csv', '.json', '.yaml', '.yml', '.toml',
    '.py', '.js', '.ts', '.go', '.rs', '.java', '.c', '.cpp', '.h',
    '.docx', '.doc', '.pptx', '.ppt', '.xlsx', '.xls',
    '.rtf', '.odt', '.epub', '.tex',
    '.html', '.htm', '.xml',
    '.ipynb',
})

# Container/binary formats. Reading these with read_text() yields garbage, so
# they are only ever indexed through enhanced_ingest and never through the
# plain-text fallback.
_BINARY_EXTS = frozenset({
    '.pdf', '.docx', '.doc', '.pptx', '.ppt', '.xlsx', '.xls', '.odt', '.epub',
})

# Positional column order used for the FTS `chunks` table inserts, expressed
# as the keys of the row dict that is also sent to Milvus.
_CHUNK_COLUMNS = (
    "id", "text", "source_file", "arena", "doc_type",
    "heading", "chunk_index", "content_hash", "entities_json", "indexed_at",
)


def _build_chunk_row(chunk_text: str, idx: int, heading: Optional[str],
                     source_file: str, file_arena: str, file_doc_type: str,
                     extract_entities_flag: bool, now: str):
    """Embed one chunk and return ``(row, entity_count)`` for both stores."""
    entities = []
    if extract_entities_flag and len(chunk_text) > 50:
        entities = extract_entities(chunk_text) or []

    c_hash = content_hash(chunk_text)
    row = {
        "id": f"{c_hash}_{idx}",
        "vector": embed_text(chunk_text),
        "text": chunk_text[:15000],
        "source_file": source_file[:500],
        "arena": file_arena[:60],
        "doc_type": file_doc_type[:30],
        # `or ""` guards against an explicit None heading.
        "heading": (heading or "")[:300],
        "chunk_index": idx,
        "content_hash": c_hash,
        "entities_json": json.dumps(entities)[:2000],
        "indexed_at": now,
    }
    return row, len(entities)


def _store_rows(milvus, fts_conn, source_file: str, rows: list) -> None:
    """Replace all chunks of ``source_file`` in Milvus and FTS with ``rows``."""
    try:
        _purge_file(milvus, fts_conn, source_file)
        fts_conn.executemany(
            "INSERT OR REPLACE INTO chunks VALUES (?,?,?,?,?,?,?,?,?,?)",
            [tuple(row[col] for col in _CHUNK_COLUMNS) for row in rows],
        )
        if rows:
            # Keyword arguments are required by pymilvus 2.6+. Upsert keeps
            # Milvus in line with the FTS "INSERT OR REPLACE" semantics when a
            # chunk id already exists.
            milvus.upsert(collection_name=COLLECTION_NAME, data=rows)
        fts_conn.commit()
    except Exception:
        # Drop this file's uncommitted FTS changes so a later commit cannot
        # persist a half-written file.
        fts_conn.rollback()
        raise


def index_documents(paths: List[str], arena: Optional[str] = None,
                    doc_type: Optional[str] = None,
                    extract_entities_flag: bool = True,
                    use_enhanced_ingest: bool = True) -> Dict:
    """Index documents into both Milvus and FTS5.

    Each file is chunked, embedded, and written to both stores, replacing any
    chunks previously stored for the same path. All chunks of a file are
    embedded *before* the old chunks are purged, so an embedding failure does
    not wipe the existing index for that file. FTS changes are committed per
    file.

    Args:
        paths: Files to index. Missing files count as errors; files with an
            unsupported extension count as skipped.
        arena: Arena label for every file; detected per file when omitted.
        doc_type: Document type for every file; detected per file when omitted.
        extract_entities_flag: Run entity extraction on chunks longer than
            50 characters.
        use_enhanced_ingest: Try ``enhanced_ingest`` first for every file.
            Binary formats (PDF, Office, ODT, EPUB) always use it, because
            they cannot be read as plain text.

    Returns:
        Counters: ``files``, ``chunks``, ``entities_extracted``, ``errors``,
        ``skipped``, ``tables`` and ``semantic_chunks``. Per-file counters are
        only added once the file has been stored successfully.
    """
    milvus = get_milvus()
    fts_conn = get_fts_db()

    stats = {"files": 0, "chunks": 0, "entities_extracted": 0, "errors": 0, "skipped": 0,
             "tables": 0, "semantic_chunks": 0}
    now = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

    try:
        for file_path in paths:
            p = Path(file_path)
            if not p.exists():
                log.warning(f"File not found: {file_path}")
                stats["errors"] += 1
                continue

            suffix = p.suffix.lower()
            if suffix not in _SUPPORTED_EXTS:
                log.debug(f"Skipping unsupported: {file_path}")
                stats["skipped"] += 1
                continue

            source_file = str(p)
            needs_enhanced = suffix in _BINARY_EXTS

            # Enhanced ingest: mandatory for binary formats, optional otherwise.
            if needs_enhanced or use_enhanced_ingest:
                try:
                    from enhanced_ingest import ingest_document
                    result = ingest_document(source_file, arena or detect_arena(source_file))

                    file_arena = result["arena"]
                    file_doc_type = doc_type or detect_doc_type(source_file)

                    rows = []
                    counts = {"entities_extracted": 0, "semantic_chunks": 0, "tables": 0}
                    for chunk_data in result["chunks"]:
                        # Track semantic vs table chunks.
                        chunk_kind = (chunk_data.get("metadata") or {}).get("type")
                        if chunk_kind == "semantic":
                            counts["semantic_chunks"] += 1
                        elif chunk_kind == "table":
                            counts["tables"] += 1

                        row, n_entities = _build_chunk_row(
                            chunk_data["text"], chunk_data["chunk_index"],
                            chunk_data.get("heading"), source_file, file_arena,
                            file_doc_type, extract_entities_flag, now,
                        )
                        rows.append(row)
                        counts["entities_extracted"] += n_entities

                    _store_rows(milvus, fts_conn, source_file, rows)

                    for key, value in counts.items():
                        stats[key] += value
                    stats["files"] += 1
                    stats["chunks"] += len(rows)
                    log.info(f"Indexed (enhanced): {p.name} — {len(result['chunks'])} chunks, {len(result.get('tables', []))} tables")
                    continue

                except ImportError:
                    log.warning("enhanced_ingest not available, falling back to basic chunking")
                except Exception as e:
                    log.error(f"Enhanced ingest failed for {file_path}: {e}")

                if needs_enhanced:
                    # No plain-text fallback exists for binary formats.
                    log.warning(f"Cannot index {file_path} without enhanced ingest")
                    stats["errors"] += 1
                    continue
                # Text-like formats fall through to basic chunking.

            try:
                text = p.read_text(encoding="utf-8", errors="replace")
                if len(text.strip()) < 20:
                    stats["skipped"] += 1
                    continue

                file_arena = arena or detect_arena(source_file)
                file_doc_type = doc_type or detect_doc_type(source_file)

                # Chunk the document, then embed every chunk before touching
                # the stores.
                chunks = chunk_markdown(text, file_doc_type)
                rows = []
                entity_total = 0
                for idx, chunk in enumerate(chunks):
                    row, n_entities = _build_chunk_row(
                        chunk["text"], idx, chunk.get("heading"), source_file,
                        file_arena, file_doc_type, extract_entities_flag, now,
                    )
                    rows.append(row)
                    entity_total += n_entities

                # Purge old chunks for this file and write the new ones.
                _store_rows(milvus, fts_conn, source_file, rows)

                stats["entities_extracted"] += entity_total
                stats["chunks"] += len(rows)
                stats["files"] += 1
                log.info(f"Indexed {p.name}: {len(chunks)} chunks, arena={file_arena}, type={file_doc_type}")

            except Exception as e:
                log.error(f"Error indexing {file_path}: {e}")
                stats["errors"] += 1
    finally:
        # Always release the SQLite handle, even if something unexpected raised.
        fts_conn.close()

    return stats
|
|
653
|
+
|
|
654
|
+
|
|
655
|
+
def _purge_file(milvus: MilvusClient, fts_conn: sqlite3.Connection, source_file: str):
    """Remove all chunks for a source file (freshness-aware re-index).

    Both deletes are best-effort: a failure in one store is logged at debug
    level and does not prevent the other delete. The FTS delete is not
    committed here; the caller owns the transaction.

    Args:
        milvus: Client used to delete from ``COLLECTION_NAME``.
        fts_conn: Open connection to the FTS database.
        source_file: Path the chunks were indexed under.
    """
    # Indexers store at most 500 characters of the path, so match on the same
    # truncated key; otherwise chunks of very long paths are never purged.
    stored_key = source_file[:500]

    # Escape backslashes first, then double quotes, so the value stays one
    # string literal inside the filter. The path can come from the /purge
    # query parameter and must not be able to alter the expression.
    escaped = stored_key.replace("\\", "\\\\").replace('"', '\\"')

    try:
        # Purge from Milvus
        milvus.delete(
            collection_name=COLLECTION_NAME,
            filter=f'source_file == "{escaped}"',
        )
    except Exception as e:
        log.debug(f"Milvus purge (may be empty): {e}")

    try:
        # Purge from FTS
        fts_conn.execute("DELETE FROM chunks WHERE source_file = ?", (stored_key,))
    except Exception as e:
        log.debug(f"FTS purge: {e}")
|
|
671
|
+
|
|
672
|
+
|
|
673
|
+
def _parse_entities_json(s: str) -> List[str]:
|
|
674
|
+
"""Safely parse entities JSON."""
|
|
675
|
+
try:
|
|
676
|
+
return json.loads(s) if s else []
|
|
677
|
+
except (json.JSONDecodeError, TypeError):
|
|
678
|
+
return []
|
|
679
|
+
|
|
680
|
+
# ---------------------------------------------------------------------------
|
|
681
|
+
# Search
|
|
682
|
+
# ---------------------------------------------------------------------------
|
|
683
|
+
|
|
684
|
+
def _search_fts_closing(query: str, limit: int, arena: Optional[str]) -> List[Dict]:
    """Run a BM25 search on a fresh FTS connection and always close it."""
    conn = get_fts_db()
    try:
        return search_fts(conn, query, limit=limit, arena=arena)
    finally:
        conn.close()


def search(query: str, method: str = "hybrid", limit: int = 10,
           arena: Optional[str] = None, enable_rerank: bool = True) -> List[Dict]:
    """Search documents with specified method.

    Args:
        query: Free-text query.
        method: ``"vector"`` for embedding search only, ``"bm25"`` for FTS
            only; any other value runs hybrid search (both engines fused
            with RRF).
        limit: Maximum number of results returned.
        arena: Optional arena filter passed to both engines.
        enable_rerank: Rerank the candidates when more than one was found.

    Returns:
        At most ``limit`` result dicts.
    """
    if method == "vector":
        vec = embed_text(query)
        results = search_vector(get_milvus(), vec, limit=limit, arena=arena)
    elif method == "bm25":
        results = _search_fts_closing(query, limit, arena)
    else:
        # Hybrid: RRF fusion. Fetch at least 20 candidates per engine, and
        # more when the caller asks for more, so limit > 20 can be satisfied.
        pool = max(20, limit)
        vec = embed_text(query)
        vector_results = search_vector(get_milvus(), vec, limit=pool, arena=arena)
        bm25_results = _search_fts_closing(query, pool, arena)
        results = rrf_fuse(vector_results, bm25_results)

    # Rerank if enabled
    if enable_rerank and len(results) > 1:
        results = rerank(query, results, top_k=limit)

    return results[:limit]
|
|
705
|
+
|
|
706
|
+
# ---------------------------------------------------------------------------
|
|
707
|
+
# Stats & Health
|
|
708
|
+
# ---------------------------------------------------------------------------
|
|
709
|
+
|
|
710
|
+
def get_stats() -> Dict:
    """Get index statistics.

    Returns a dict with ``vector_chunks`` (Milvus row count), ``fts_chunks``
    (rows in the FTS ``chunks`` table) and per-``arenas`` / per-``doc_types``
    chunk counts. Each store is queried independently; if one is unavailable
    its counters keep their zero/empty defaults and the failure is logged.
    """
    stats = {"vector_chunks": 0, "fts_chunks": 0, "arenas": {}, "doc_types": {}}

    try:
        info = get_milvus().get_collection_stats(COLLECTION_NAME)
        stats["vector_chunks"] = info.get("row_count", 0)
    except Exception as exc:
        log.warning("Milvus stats unavailable: %s", exc)

    conn = None
    try:
        conn = get_fts_db()
        row = conn.execute("SELECT COUNT(*) FROM chunks").fetchone()
        stats["fts_chunks"] = row[0] if row else 0

        for name, count in conn.execute("SELECT arena, COUNT(*) FROM chunks GROUP BY arena").fetchall():
            stats["arenas"][name] = count

        for name, count in conn.execute("SELECT doc_type, COUNT(*) FROM chunks GROUP BY doc_type").fetchall():
            stats["doc_types"][name] = count
    except Exception as exc:
        log.warning("FTS stats unavailable: %s", exc)
    finally:
        # Close the handle on the error path too.
        if conn is not None:
            conn.close()

    return stats
|
|
737
|
+
|
|
738
|
+
|
|
739
|
+
def health() -> Dict:
    """Health check.

    Probes Milvus, the FTS database, Ollama and the reranker. Overall
    ``status`` starts as ``"ok"`` and becomes ``"degraded"`` when Milvus, FTS
    or Ollama raises. The reranker is optional, so its state is reported
    without affecting ``status``. This function does not raise for a failing
    dependency.
    """
    status = {"status": "ok", "milvus": "unknown", "fts": "unknown", "ollama": "unknown", "reranker": "unknown"}

    # Milvus
    try:
        client = get_milvus()
        colls = client.list_collections()
        status["milvus"] = f"ok ({len(colls)} collections)"
    except Exception as e:
        status["milvus"] = f"error: {e}"
        status["status"] = "degraded"

    # FTS
    conn = None
    try:
        conn = get_fts_db()
        cnt = conn.execute("SELECT COUNT(*) FROM chunks").fetchone()[0]
        status["fts"] = f"ok ({cnt} chunks)"
    except Exception as e:
        status["fts"] = f"error: {e}"
        status["status"] = "degraded"
    finally:
        # Release the handle even when the probe query failed.
        if conn is not None:
            conn.close()

    # Ollama
    try:
        resp = _embed_client.get(f"{OLLAMA_URL}/api/tags", timeout=5)
        status["ollama"] = "ok" if resp.status_code == 200 else f"http {resp.status_code}"
    except Exception as e:
        status["ollama"] = f"error: {e}"
        status["status"] = "degraded"

    # Reranker (optional; search falls back to RRF ordering without it)
    try:
        reranker = get_reranker()
        status["reranker"] = "loaded" if reranker else "unavailable (CPU fallback to RRF)"
    except Exception as e:
        status["reranker"] = f"error: {e}"

    return status
|
|
775
|
+
|
|
776
|
+
# ---------------------------------------------------------------------------
|
|
777
|
+
# FastAPI Server
|
|
778
|
+
# ---------------------------------------------------------------------------
|
|
779
|
+
|
|
780
|
+
def serve(port: int = DEFAULT_PORT):
    """Run as HTTP API server.

    Builds the FastAPI app (health, stats, search, index, index-batch, purge,
    rebuild-index) and blocks in ``uvicorn.run``. The bind address comes from
    the ``HOST`` environment variable and defaults to ``127.0.0.1``.

    Args:
        port: TCP port to listen on.
    """
    from fastapi import FastAPI, Query as Q, HTTPException
    from pydantic import BaseModel
    import uvicorn

    api = FastAPI(title="L6 Document Store", version="1.0.0")

    class IndexRequest(BaseModel):
        paths: List[str]
        arena: Optional[str] = None
        doc_type: Optional[str] = None
        extract_entities: bool = True

    @api.get("/health")
    def api_health():
        return health()

    @api.get("/stats")
    def api_stats():
        return get_stats()

    @api.get("/search")
    def api_search(
        q: str = Q(..., description="Search query"),
        method: str = Q("hybrid", description="hybrid|vector|bm25"),
        limit: int = Q(10, ge=1, le=50),
        arena: Optional[str] = Q(None),
        rerank: bool = Q(True),
    ):
        results = search(q, method=method, limit=limit, arena=arena, enable_rerank=rerank)
        return {"query": q, "method": method, "results": results, "count": len(results)}

    @api.post("/search")
    def api_search_post(
        q: str,
        method: str = "hybrid",
        limit: int = 10,
        arena: Optional[str] = None,
        rerank: bool = True,
    ):
        """POST version of search for compatibility."""
        results = search(q, method=method, limit=limit, arena=arena, enable_rerank=rerank)
        return {"query": q, "method": method, "results": results, "count": len(results)}

    @api.post("/index")
    def api_index(req: IndexRequest):
        stats = index_documents(
            req.paths, arena=req.arena, doc_type=req.doc_type,
            extract_entities_flag=req.extract_entities,
        )
        return {"status": "ok", "stats": stats}

    @api.post("/index-batch")
    def api_index_batch(req: dict):
        """Index a batch of in-memory documents in a single batched
        NV-Embed call + a single milvus upsert + one FTS write.

        Roughly 30-50x faster than calling /index for the equivalent
        files because the legacy path does one embed roundtrip per
        chunk. This endpoint exists for tests, smoke runs and bench
        harnesses where small corpora need to land quickly.

        Request body::

        {
        "arena": "benchmark",
        "records": [
        {
        "id": "doc1", # required, becomes chunk id prefix
        "text": "…", # required, indexed as one chunk
        "source_file": "doc1.md", # optional
        "doc_type": "general", # optional, default "general"
        "heading": "…" # optional
        }, …
        ]
        }

        Returns::

        {"status": "ok", "inserted": N, "embed_ms": float, "insert_ms": float}

        Raises HTTP 400 when a record is not a JSON object and HTTP 500
        when the embedding call fails or returns the wrong number of vectors.
        """
        import hashlib as _hashlib
        import time as _time
        import httpx as _httpx
        from datetime import datetime as _dt, timezone as _tz

        records = req.get("records") or []
        arena = str(req.get("arena") or "general")
        if not records:
            # Same shape as the normal response so clients can read the
            # timing keys unconditionally.
            return {"status": "ok", "inserted": 0, "embed_ms": 0.0, "insert_ms": 0.0}
        if not all(isinstance(r, dict) for r in records):
            raise HTTPException(status_code=400, detail="records must be a list of objects")

        texts = [str(r.get("text") or "")[:16000] for r in records]

        # Single batched NV-Embed call.
        t0 = _time.perf_counter()
        try:
            resp = _httpx.post(
                NV_EMBED_URL, json={"input": texts, "model": "nv-embed-v2"},
                timeout=120,
            )
            resp.raise_for_status()
            embs = [d["embedding"] for d in resp.json()["data"]]
            # zip() below would silently drop records on a short response.
            if len(embs) != len(texts):
                raise ValueError(f"expected {len(texts)} embeddings, got {len(embs)}")
        except Exception as exc:
            raise HTTPException(status_code=500, detail=f"embed failed: {exc}") from exc
        embed_ms = (_time.perf_counter() - t0) * 1000.0

        # Build the per-record metadata once; Milvus and FTS both use it.
        now = _dt.now(_tz.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
        prepared = []
        for r, txt in zip(records, texts):
            digest = _hashlib.sha1(txt.encode("utf-8")).hexdigest()
            rid = r.get("id") or digest[:32]
            prepared.append({
                "id": f"l6:{rid}:0"[:63],
                # Same 15000-char cap as the file-based indexing path.
                "text": txt[:15000],
                "source_file": str(r.get("source_file") or f"{rid}.md")[:500],
                "arena": arena[:60],
                "doc_type": str(r.get("doc_type") or "general")[:30],
                "heading": str(r.get("heading") or "")[:300],
                "chunk_index": 0,
                "content_hash": digest[:20],
                "entities_json": "[]",
                "indexed_at": now,
            })

        # Single milvus write; records without an embedding are skipped.
        rows = [
            {**meta, "vector": emb}
            for meta, emb in zip(prepared, embs)
            if emb is not None
        ]
        milvus = get_milvus()
        t1 = _time.perf_counter()
        if rows:
            # Upsert rather than insert so re-posting the same ids replaces
            # rows, matching the FTS "INSERT OR REPLACE" below.
            milvus.upsert(collection_name=COLLECTION_NAME, data=rows)
        insert_ms = (_time.perf_counter() - t1) * 1000.0

        # FTS5 is auto-populated by a trigger when we INSERT into the
        # `chunks` table. Inserting into chunks_fts directly bypasses the
        # trigger and leaves the FTS5 index empty (the bug we hit on
        # first /index-batch landing). Write into `chunks` instead; the
        # AFTER-INSERT trigger syncs to chunks_fts atomically.
        fts_conn = None
        try:
            fts_conn = get_fts_db()
            fts_conn.executemany(
                "INSERT OR REPLACE INTO chunks VALUES (?,?,?,?,?,?,?,?,?,?)",
                [
                    (m["id"], m["text"], m["source_file"], m["arena"],
                     m["doc_type"], m["heading"], m["chunk_index"],
                     m["content_hash"], m["entities_json"], m["indexed_at"])
                    for m in prepared
                ],
            )
            fts_conn.commit()
        except Exception as exc:
            # Best-effort: vectors are already stored, so report and carry on.
            log.warning("FTS write failed in /index-batch: %s", exc)
        finally:
            if fts_conn is not None:
                fts_conn.close()

        return {
            "status": "ok",
            "inserted": len(rows),
            "embed_ms": round(embed_ms, 1),
            "insert_ms": round(insert_ms, 1),
        }

    @api.delete("/purge")
    def api_purge(source_file: str = Q(...)):
        """Remove all chunks for a source file."""
        milvus = get_milvus()
        fts_conn = get_fts_db()
        try:
            _purge_file(milvus, fts_conn, source_file)
            fts_conn.commit()
        finally:
            fts_conn.close()
        return {"status": "purged", "source_file": source_file}

    @api.post("/rebuild-index")
    def api_rebuild():
        """Force Milvus index rebuild."""
        milvus = get_milvus()
        milvus.release_collection(COLLECTION_NAME)
        milvus.load_collection(COLLECTION_NAME)
        return {"status": "rebuilt"}

    # Log the address actually bound, which HOST may override.
    host = os.environ.get("HOST", "127.0.0.1")
    log.info(f"L6 Document Store — http://{host}:{port}")
    uvicorn.run(api, host=host, port=port, log_level="info")
|
|
963
|
+
|
|
964
|
+
# ---------------------------------------------------------------------------
|
|
965
|
+
# CLI
|
|
966
|
+
# ---------------------------------------------------------------------------
|
|
967
|
+
|
|
968
|
+
def main():
    """Command-line entry point.

    Parses the sub-command (serve, index, search, health, stats) with its
    options and dispatches to the matching function. ``index`` and ``search``
    print a usage hint and return when no positional arguments are given.
    """
    cli = argparse.ArgumentParser(description="L6 Document Store")
    cli.add_argument("command", choices=["serve", "index", "search", "health", "stats"])
    cli.add_argument("args", nargs="*")
    cli.add_argument("--port", "-p", type=int, default=DEFAULT_PORT)
    cli.add_argument("--arena", "-a", type=str, default=None)
    cli.add_argument("--doc-type", "-t", type=str, default=None)
    cli.add_argument("--method", "-m", type=str, default="hybrid")
    cli.add_argument("--limit", "-l", type=int, default=10)
    cli.add_argument("--no-entities", action="store_true")
    cli.add_argument("--no-rerank", action="store_true")

    opts = cli.parse_args()
    command = opts.command

    if command == "serve":
        serve(port=opts.port)
        return

    if command == "health":
        print(json.dumps(health(), indent=2))
        return

    if command == "stats":
        print(json.dumps(get_stats(), indent=2))
        return

    if command == "index":
        targets = opts.args
        if not targets:
            print("Usage: l6-document-store.py index <file1.md> [file2.md ...]")
            print(" l6-document-store.py index ~/memory/research/*.md")
            return
        summary = index_documents(
            targets,
            arena=opts.arena,
            doc_type=opts.doc_type,
            extract_entities_flag=not opts.no_entities,
        )
        print(json.dumps(summary, indent=2))
        return

    if command == "search":
        # Joining an empty list yields "", which triggers the usage hint.
        query = " ".join(opts.args)
        if not query:
            print("Usage: l6-document-store.py search 'your query'")
            return
        hits = search(
            query,
            method=opts.method,
            limit=opts.limit,
            arena=opts.arena,
            enable_rerank=not opts.no_rerank,
        )
        for i, r in enumerate(hits, 1):
            print(f"\n--- [{i}] {r.get('source_file','?')} (rrf={r.get('rrf_score',0):.4f}, engines={r.get('engines','?')}) ---")
            if r.get("heading"):
                print(f"Heading: {r['heading']}")
            if r.get("entities"):
                print(f"Entities: {', '.join(r['entities'][:10])}")
            print(r["text"][:300])


# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|