@geravant/sinain 1.12.0 → 1.14.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env.example +4 -2
- package/config-shared.js +1 -0
- package/package.json +4 -1
- package/sinain-agent/run.sh +36 -4
- package/sinain-core/package-lock.json +963 -0
- package/sinain-core/package.json +1 -0
- package/sinain-core/src/buffers/feed-buffer.ts +34 -0
- package/sinain-core/src/embedding/service.ts +66 -0
- package/sinain-core/src/index.ts +65 -17
- package/sinain-core/src/learning/local-curation.ts +137 -7
- package/sinain-core/src/server.ts +31 -0
- package/sinain-memory/README.md +105 -0
- package/sinain-memory/embed_client.py +117 -0
- package/sinain-memory/graph_query.py +269 -18
- package/sinain-memory/knowledge_integrator.py +551 -74
- package/sinain-memory/memory-config.json +1 -1
- package/sinain-memory/session_distiller.py +43 -19
- package/sinain-memory/triplestore.py +60 -0
- package/sinain-memory/__pycache__/common.cpython-312.pyc +0 -0
- package/sinain-memory/__pycache__/graph_query.cpython-312.pyc +0 -0
- package/sinain-memory/__pycache__/knowledge_integrator.cpython-312.pyc +0 -0
- package/sinain-memory/__pycache__/session_distiller.cpython-312.pyc +0 -0
- package/sinain-memory/__pycache__/triplestore.cpython-312.pyc +0 -0
- package/sinain-memory/eval/__init__.py +0 -0
- package/sinain-memory/eval/__pycache__/__init__.cpython-312.pyc +0 -0
- package/sinain-memory/eval/assertions.py +0 -267
- package/sinain-memory/eval/benchmarks/__init__.py +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/__init__.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/base_adapter.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/config.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/evaluate.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/ingest.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/longmemeval_adapter.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/query.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/report.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/runner.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/base_adapter.py +0 -43
- package/sinain-memory/eval/benchmarks/config.py +0 -23
- package/sinain-memory/eval/benchmarks/evaluate.py +0 -146
- package/sinain-memory/eval/benchmarks/ingest.py +0 -152
- package/sinain-memory/eval/benchmarks/judges/__init__.py +0 -0
- package/sinain-memory/eval/benchmarks/judges/__pycache__/__init__.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/judges/__pycache__/qa_judge.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/judges/qa_judge.py +0 -81
- package/sinain-memory/eval/benchmarks/longmemeval_adapter.py +0 -177
- package/sinain-memory/eval/benchmarks/query.py +0 -172
- package/sinain-memory/eval/benchmarks/report.py +0 -87
- package/sinain-memory/eval/benchmarks/runner.py +0 -276
- package/sinain-memory/eval/judges/__init__.py +0 -0
- package/sinain-memory/eval/judges/base_judge.py +0 -61
- package/sinain-memory/eval/judges/curation_judge.py +0 -46
- package/sinain-memory/eval/judges/insight_judge.py +0 -48
- package/sinain-memory/eval/judges/mining_judge.py +0 -42
- package/sinain-memory/eval/judges/signal_judge.py +0 -45
- package/sinain-memory/eval/retrieval_benchmark.jsonl +0 -12
- package/sinain-memory/eval/retrieval_evaluator.py +0 -186
- package/sinain-memory/eval/schemas.py +0 -247
- package/sinain-memory/tests/__init__.py +0 -0
- package/sinain-memory/tests/conftest.py +0 -189
- package/sinain-memory/tests/test_curator_helpers.py +0 -94
- package/sinain-memory/tests/test_embedder.py +0 -210
- package/sinain-memory/tests/test_extract_json.py +0 -124
- package/sinain-memory/tests/test_feedback_computation.py +0 -121
- package/sinain-memory/tests/test_miner_helpers.py +0 -71
- package/sinain-memory/tests/test_module_management.py +0 -458
- package/sinain-memory/tests/test_parsers.py +0 -96
- package/sinain-memory/tests/test_tick_evaluator.py +0 -430
- package/sinain-memory/tests/test_triple_extractor.py +0 -255
- package/sinain-memory/tests/test_triple_ingest.py +0 -191
- package/sinain-memory/tests/test_triple_migrate.py +0 -138
- package/sinain-memory/tests/test_triplestore.py +0 -248
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
"""Embedding client — calls sinain-core's /embed endpoint for vector operations.
|
|
2
|
+
|
|
3
|
+
Provides semantic similarity for:
|
|
4
|
+
- Write path: dedup before asserting facts (knowledge_integrator.py)
|
|
5
|
+
- Read path: semantic retrieval (graph_query.py)
|
|
6
|
+
|
|
7
|
+
Falls back gracefully if sinain-core is not running or model not loaded.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
import base64
|
|
11
|
+
import json
|
|
12
|
+
import struct
|
|
13
|
+
import urllib.request
|
|
14
|
+
from functools import lru_cache
|
|
15
|
+
|
|
16
|
+
SINAIN_CORE_URL = "http://localhost:9500"
|
|
17
|
+
EMBED_TIMEOUT_S = 5
|
|
18
|
+
SIMILARITY_THRESHOLD = 0.78 # calibrated: catches rephrased facts, rejects different facts
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def embed(texts: list[str]) -> list[list[float]] | None:
|
|
22
|
+
"""Embed texts via sinain-core /embed endpoint. Returns None if unavailable."""
|
|
23
|
+
try:
|
|
24
|
+
data = json.dumps({"texts": texts}).encode()
|
|
25
|
+
req = urllib.request.Request(
|
|
26
|
+
f"{SINAIN_CORE_URL}/embed",
|
|
27
|
+
data=data,
|
|
28
|
+
headers={"Content-Type": "application/json"},
|
|
29
|
+
method="POST",
|
|
30
|
+
)
|
|
31
|
+
with urllib.request.urlopen(req, timeout=EMBED_TIMEOUT_S) as resp:
|
|
32
|
+
result = json.loads(resp.read())
|
|
33
|
+
# Decode base64 float32 arrays
|
|
34
|
+
embeddings = []
|
|
35
|
+
for b64 in result["embeddings"]:
|
|
36
|
+
raw = base64.b64decode(b64)
|
|
37
|
+
floats = list(struct.unpack(f"{len(raw)//4}f", raw))
|
|
38
|
+
embeddings.append(floats)
|
|
39
|
+
return embeddings
|
|
40
|
+
except Exception:
|
|
41
|
+
return None
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def cosine(a: list[float], b: list[float]) -> float:
    """Return the dot product of ``a`` and ``b``.

    This equals the cosine similarity only when both vectors are unit
    length; no normalisation is performed here. If the vectors differ in
    length, the extra trailing components of the longer one are ignored.
    """
    products = [left * right for left, right in zip(a, b)]
    return sum(products)  # vectors are pre-normalized by the model
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def find_duplicates_batch(
    new_texts: list[str],
    existing_texts: list[str],
    threshold: float = SIMILARITY_THRESHOLD,
) -> dict[int, int]:
    """Find duplicates for multiple new texts against existing texts in one batch.

    All texts are embedded with a single ``embed`` call (new texts first,
    then existing texts) to avoid per-fact round trips.

    Args:
        new_texts: Candidate texts to check.
        existing_texts: Texts already stored.
        threshold: Minimum similarity (inclusive) for a match.

    Returns:
        ``{new_index: existing_index}`` mapping each new text to its most
        similar existing text, for pairs with similarity >= ``threshold``.
        On ties the earliest existing text wins. Returns ``{}`` when either
        list is empty, when embeddings are unavailable, or when the number
        of returned embeddings does not match the number of texts.
    """
    if not existing_texts or not new_texts:
        return {}

    n_new = len(new_texts)
    embeddings = embed(new_texts + existing_texts)
    # Guard against a mis-sized response: positional indexing below would
    # otherwise raise or pair vectors with the wrong texts.
    if embeddings is None or len(embeddings) != n_new + len(existing_texts):
        return {}

    new_vecs = embeddings[:n_new]
    existing_vecs = embeddings[n_new:]

    result: dict[int, int] = {}
    for i, new_vec in enumerate(new_vecs):
        best_idx = None
        best_sim = None
        for j, existing_vec in enumerate(existing_vecs):
            sim = cosine(new_vec, existing_vec)
            # Inclusive threshold, as documented; strict ">" against the
            # current best keeps the first of equally similar candidates.
            if sim >= threshold and (best_sim is None or sim > best_sim):
                best_sim = sim
                best_idx = j
        if best_idx is not None:
            result[i] = best_idx

    return result
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def find_duplicate(
    new_text: str,
    existing_texts: list[str],
    threshold: float = SIMILARITY_THRESHOLD,
) -> int | None:
    """Return the index of the existing text best matching ``new_text``.

    Delegates to ``find_duplicates_batch`` with a one-element batch and
    returns ``None`` when that batch lookup reports no match.
    """
    # The single new text sits at batch index 0.
    return find_duplicates_batch([new_text], existing_texts, threshold).get(0)
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def rank_by_similarity(
|
|
96
|
+
query: str,
|
|
97
|
+
texts: list[str],
|
|
98
|
+
) -> list[tuple[int, float]] | None:
|
|
99
|
+
"""Rank texts by semantic similarity to query. Returns [(index, score), ...] descending.
|
|
100
|
+
|
|
101
|
+
Returns None if embedding service unavailable (caller should fall back to keyword).
|
|
102
|
+
"""
|
|
103
|
+
if not texts:
|
|
104
|
+
return []
|
|
105
|
+
|
|
106
|
+
all_texts = [query] + texts
|
|
107
|
+
embeddings = embed(all_texts)
|
|
108
|
+
if embeddings is None:
|
|
109
|
+
return None
|
|
110
|
+
|
|
111
|
+
query_emb = embeddings[0]
|
|
112
|
+
scored = []
|
|
113
|
+
for i, emb in enumerate(embeddings[1:]):
|
|
114
|
+
scored.append((i, cosine(query_emb, emb)))
|
|
115
|
+
|
|
116
|
+
scored.sort(key=lambda x: x[1], reverse=True)
|
|
117
|
+
return scored
|
|
@@ -129,32 +129,278 @@ def query_top_facts(db_path: str, limit: int = 30) -> list[dict]:
|
|
|
129
129
|
return []
|
|
130
130
|
|
|
131
131
|
|
|
132
|
+
def query_facts_fts(db_path: str, query: str, max_facts: int = 10) -> list[dict]:
    """Full-text search on fact values via the FTS5 index.

    Looks up entities whose non-retracted ``value`` triple matches
    ``query`` in ``triples_fts``. If that query raises for any reason
    (for example FTS5 is unavailable), falls back to a case-insensitive
    ``LIKE`` search for any whitespace-separated keyword longer than two
    characters.

    Args:
        db_path: Path to the triple-store database file.
        query: Search text.
        max_facts: Maximum number of facts to return.

    Returns:
        One dict per matched entity with ``entity_id``, a derived
        ``entity`` name, and every attribute except ``tag``
        (single-valued attributes are unwrapped from their list).
        Returns ``[]`` if the database file is missing, nothing matches,
        or any error occurs.
    """
    if not Path(db_path).exists():
        return []

    try:
        from triplestore import TripleStore
        store = TripleStore(db_path)
        try:
            # Try FTS5 first
            try:
                rows = store._conn.execute(
                    """SELECT DISTINCT t.entity_id
                       FROM triples_fts fts
                       JOIN triples t ON fts.rowid = t.id
                       WHERE triples_fts MATCH ?
                       AND t.attribute = 'value'
                       AND NOT t.retracted
                       LIMIT ?""",
                    (query, max_facts),
                ).fetchall()
            except Exception:
                # FTS5 not available — fall back to LIKE search
                keywords = [w.lower() for w in query.split() if len(w) > 2]
                if not keywords:
                    return []
                # Match any keyword in value
                conditions = " OR ".join(["LOWER(value) LIKE ?"] * len(keywords))
                params = [f"%{k}%" for k in keywords] + [max_facts]
                rows = store._conn.execute(
                    f"""SELECT DISTINCT entity_id
                        FROM triples
                        WHERE attribute = 'value'
                        AND NOT retracted
                        AND ({conditions})
                        LIMIT ?""",
                    params,
                ).fetchall()

            # Fetch full attributes for matched entities
            facts = []
            for row in rows:
                eid = row["entity_id"]
                attrs = store.entity(eid)
                # Derive a display name: drop the "<prefix>:" part and the
                # final "-<suffix>" segment of the id.
                fact = {
                    "entity_id": eid,
                    "entity": eid.split(":")[-1].rsplit("-", 1)[0] if ":" in eid else eid,
                }
                for attr, values in attrs.items():
                    if attr == "tag":
                        continue
                    fact[attr] = values[0] if len(values) == 1 else values
                facts.append(fact)

            return facts[:max_facts]
        finally:
            # Always release the store, including on error paths.
            store.close()
    except Exception:
        # Best-effort retrieval: any failure yields no results.
        return []
|
|
196
|
+
|
|
197
|
+
|
|
198
|
+
def query_facts_by_entity_graph(
    db_path: str,
    entity_name: str,
    max_facts: int = 10,
) -> list[dict]:
    """Find facts linked to an entity node via back-references.

    The entity node id is ``entity:<name>`` with the name lower-cased and
    spaces replaced by hyphens. Facts are collected from back-references
    on the ``about`` attribute first, then on ``mentions``; only ids
    starting with ``fact:`` that carry a ``value`` attribute are kept.

    Args:
        db_path: Path to the triple-store database file.
        entity_name: Human-readable entity name.
        max_facts: Maximum number of facts to return.

    Returns:
        Fact dicts with ``entity_id`` and every attribute except ``tag``
        (single-valued attributes are unwrapped from their list).
        Returns ``[]`` if the database file or entity node is missing, or
        on any error.
    """
    if not Path(db_path).exists():
        return []

    try:
        from triplestore import TripleStore
        store = TripleStore(db_path)
        try:
            entity_node_id = f"entity:{entity_name.lower().replace(' ', '-')}"
            if not store.entity(entity_node_id):
                return []

            # Get all facts linked to this entity via "about" ref edge
            fact_refs = store.backrefs(entity_node_id, attribute="about")
            # Also get facts that "mention" this entity
            mention_refs = store.backrefs(entity_node_id, attribute="mentions")

            # Load fact details, skipping repeats and non-fact nodes
            seen = set()
            facts = []
            for fact_eid, _ in fact_refs + mention_refs:
                if fact_eid in seen or not fact_eid.startswith("fact:"):
                    continue
                seen.add(fact_eid)
                attrs = store.entity(fact_eid)
                if attrs and "value" in attrs:
                    fact = {"entity_id": fact_eid}
                    for attr, values in attrs.items():
                        if attr == "tag":
                            continue
                        fact[attr] = values[0] if len(values) == 1 else values
                    facts.append(fact)

            return facts[:max_facts]
        finally:
            # Always release the store, including on error paths.
            store.close()
    except Exception:
        # Best-effort retrieval: any failure yields no results.
        return []
|
|
247
|
+
|
|
248
|
+
|
|
249
|
+
def query_facts_hybrid(
    db_path: str,
    query: str,
    max_facts: int = 10,
) -> list[dict]:
    """Hybrid retrieval with Reciprocal Rank Fusion (Graphiti pattern).

    Steps:
      1. Extract keywords from ``query`` (alphabetic-led tokens longer
         than two characters, lower-cased).
      2. Collect ids of facts linked to those keywords through
         ``query_facts_by_entity_graph``; these only boost scores.
      3. Run three retrieval methods (FTS, tag/entity match, top facts),
         each limited to ``max_facts * 3`` candidates.
      4. Fuse the three rankings with RRF, add the graph boost, and add a
         confidence-decay boost once per fact.
      5. If fewer than ``max_facts`` results remain, pad with 1-hop graph
         neighbours of the results.

    Embedding re-ranking is intentionally not done here; see the comment
    above the result selection.

    Args:
        db_path: Path to the triple-store database file.
        query: Free-text query.
        max_facts: Maximum number of facts to return.

    Returns:
        Up to ``max_facts`` fact dicts, best first.
    """
    import re
    keywords = [w.lower() for w in re.findall(r"[a-zA-Z][a-zA-Z0-9-]+", query) if len(w) > 2]

    # Entity graph pre-filter: find facts linked to mentioned entities via backrefs.
    # Used to BOOST relevant facts in RRF, not as a separate tier (avoids dilution).
    graph_fact_ids: set[str] = set()
    for kw in keywords:
        for f in query_facts_by_entity_graph(db_path, kw, max_facts=50):
            eid = f.get("entity_id", "")
            if eid:
                graph_fact_ids.add(eid)

    # Run three retrieval methods independently
    candidate_limit = max_facts * 3
    fts_results = query_facts_fts(db_path, query, max_facts=candidate_limit)
    tag_results = query_facts_by_entities(db_path, keywords, max_facts=candidate_limit) if keywords else []
    top_results = query_top_facts(db_path, limit=candidate_limit)
    candidate_lists = [fts_results, tag_results, top_results]

    # Reciprocal Rank Fusion: RRF(d) = Σ 1/(k + rank_i(d)), with 0-based ranks.
    K = 60  # standard RRF constant
    rrf_scores: dict[str, float] = {}
    # First-seen fact dict per entity id (priority: FTS, then tags, then top).
    fact_map: dict[str, dict] = {}
    for facts in candidate_lists:
        ranked_ids: list[str] = []
        seen_in_list: set[str] = set()
        for f in facts:
            eid = f.get("entity_id", "")
            if not eid:
                continue
            fact_map.setdefault(eid, f)
            # Within one list only the first occurrence of an id is ranked.
            if eid not in seen_in_list:
                seen_in_list.add(eid)
                ranked_ids.append(eid)
        for rank, eid in enumerate(ranked_ids):
            rrf_scores[eid] = rrf_scores.get(eid, 0.0) + 1.0 / (K + rank)

    # Graph boost: facts linked to mentioned entities via backrefs get priority
    if graph_fact_ids:
        for eid in rrf_scores:
            if eid in graph_fact_ids:
                rrf_scores[eid] += 0.02

    # Apply confidence decay as a secondary signal (fresh facts rank above
    # stale ones). Applied exactly once per fact, so appearing in several
    # candidate lists does not multiply the boost.
    from triplestore import decayed_confidence
    for eid, f in fact_map.items():
        try:
            conf = float(f.get("confidence", 0.5))
            created = str(f.get("first_seen", ""))
        except (ValueError, TypeError):
            # Unparseable confidence (e.g. multi-valued): no decay boost.
            continue
        if created:
            rrf_scores[eid] += decayed_confidence(conf, created) * 0.01

    # Sort by RRF score descending
    sorted_ids = sorted(rrf_scores, key=rrf_scores.get, reverse=True)

    # Return top RRF candidates. Embedding re-ranking is done by the caller
    # (sinain-core Node.js) to avoid deadlock — the Python subprocess can't call
    # back to sinain-core's /embed endpoint while sinain-core is blocked waiting
    # for the subprocess.
    results = [fact_map[eid] for eid in sorted_ids[:max_facts] if eid in fact_map]

    # Expand top results with 1-hop graph neighbors
    if results and len(results) < max_facts:
        seen_ids = {f.get("entity_id", "") for f in results}
        try:
            from triplestore import TripleStore
            store = TripleStore(db_path)
            try:
                # Iterate over a snapshot: results grows while expanding.
                for fact in list(results):
                    if len(results) >= max_facts:
                        break  # full; no need to query more neighbours
                    eid = fact.get("entity_id", "")
                    if not eid:
                        continue
                    neighbors = store.neighbors(eid, depth=1)
                    for nid, nattrs in neighbors.items():
                        if nid not in seen_ids and len(results) < max_facts:
                            seen_ids.add(nid)
                            nfact = {
                                "entity_id": nid,
                                "entity": nid.split(":")[-1].rsplit("-", 1)[0] if ":" in nid else nid,
                            }
                            for attr, values in nattrs.items():
                                if attr != "tag":
                                    nfact[attr] = values[0] if len(values) == 1 else values
                            results.append(nfact)
            finally:
                # Always release the store, including on error paths.
                store.close()
        except Exception:
            # Expansion is optional padding; keep the fused results on failure.
            pass

    return results[:max_facts]
|
|
363
|
+
|
|
364
|
+
|
|
132
365
|
def format_facts_text(facts: list[dict], max_chars: int = 500) -> str:
    """Format facts grouped by entity for better cross-fact reasoning.

    Facts sharing an entity name are emitted on consecutive lines, in the
    order each entity first appears, so the reader sees connected context
    together. Every line has the form
    ``- [<entity>] <value> (conf: <confidence>, <count>x)``.

    The entity name comes from the fact's ``entity`` field (first element
    if it is a list); when that is empty it is derived from ``entity_id``
    by dropping the ``<prefix>:`` part and the final ``-<suffix>`` segment.

    Args:
        facts: Fact dicts, e.g. as returned by the query functions.
        max_chars: Upper bound on the length of the returned string,
            including the newline separators between lines.

    Returns:
        Newline-joined lines; output stops before the first line that
        would exceed ``max_chars``. ``""`` when ``facts`` is empty.
    """
    if not facts:
        return ""

    # Group by entity name; a plain dict preserves first-seen order.
    groups: dict[str, list[dict]] = {}
    for f in facts:
        entity = f.get("entity", "")
        if isinstance(entity, list):
            entity = entity[0] if entity else ""
        if not entity:
            eid = str(f.get("entity_id", ""))
            entity = eid.split(":")[-1].rsplit("-", 1)[0] if ":" in eid else eid
        groups.setdefault(str(entity), []).append(f)

    lines: list[str] = []
    used = 0
    for entity, group_facts in groups.items():
        for f in group_facts:
            value = f.get("value", "")
            conf = f.get("confidence", "?")
            count = f.get("reinforce_count", "1")

            line = f"- [{entity}] {value} (conf: {conf}, {count}x)"
            # Every line after the first also costs one "\n" separator.
            needed = len(line) + (1 if lines else 0)
            if used + needed > max_chars:
                return "\n".join(lines)
            lines.append(line)
            used += needed

    return "\n".join(lines)
|
|
155
401
|
|
|
156
402
|
|
|
157
|
-
def format_facts_compact(facts: list[dict], max_chars: int =
|
|
403
|
+
def format_facts_compact(facts: list[dict], max_chars: int = 1200) -> str:
|
|
158
404
|
"""Encode facts for efficient escalation context injection.
|
|
159
405
|
|
|
160
406
|
Compact format: domain/entity: value (conf, Nx)
|
|
@@ -167,7 +413,7 @@ def format_facts_compact(facts: list[dict], max_chars: int = 400) -> str:
|
|
|
167
413
|
total = 0
|
|
168
414
|
for f in facts:
|
|
169
415
|
entity = f.get("entityId", "").split(":")[-1][:20]
|
|
170
|
-
value = f.get("value", "")
|
|
416
|
+
value = f.get("value", "")
|
|
171
417
|
conf = f.get("confidence", "?")
|
|
172
418
|
count = f.get("reinforce_count", "1")
|
|
173
419
|
domain = f.get("domain", "")
|
|
@@ -227,7 +473,12 @@ def main() -> None:
|
|
|
227
473
|
facts = query_top_facts(args.db, limit=args.top)
|
|
228
474
|
elif args.entities:
|
|
229
475
|
entities = json.loads(args.entities)
|
|
230
|
-
|
|
476
|
+
# Use hybrid retrieval (FTS5 + tags + entity graph + RRF) for best results
|
|
477
|
+
query_text = " ".join(entities)
|
|
478
|
+
facts = query_facts_hybrid(args.db, query_text, max_facts=args.max_facts)
|
|
479
|
+
# Fallback to tag-only if hybrid returns nothing
|
|
480
|
+
if not facts:
|
|
481
|
+
facts = query_facts_by_entities(args.db, entities, max_facts=args.max_facts)
|
|
231
482
|
else:
|
|
232
483
|
facts = query_top_facts(args.db, limit=args.max_facts)
|
|
233
484
|
|