@geravant/sinain 1.19.0 → 1.22.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/sinain-core/package-lock.json +439 -0
- package/sinain-core/package.json +2 -0
- package/sinain-core/src/index.ts +283 -0
- package/sinain-core/src/learning/local-curation.ts +3 -0
- package/sinain-core/src/server.ts +1570 -2
- package/sinain-core/src/web-db/schema.ts +122 -0
- package/sinain-core/src/web-db/store.ts +406 -0
- package/sinain-memory/concept_export.py +310 -0
- package/sinain-memory/concept_import.py +254 -0
- package/sinain-memory/graph_query.py +461 -4
- package/sinain-memory/knowledge_integrator.py +87 -10
- package/sinain-memory/page_renderer.py +447 -0
- package/sinain-memory/retract.py +236 -0
|
@@ -0,0 +1,447 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Page Renderer — synthesize a Confluence-style page for an entity.
|
|
3
|
+
|
|
4
|
+
Given an entity_id, loads top-K facts from the triplestore, calls an LLM to
|
|
5
|
+
group them into themed sections with a summary, and emits validated JSON.
|
|
6
|
+
|
|
7
|
+
Output schema:
|
|
8
|
+
{
|
|
9
|
+
"entity": "entity:citibank",
|
|
10
|
+
"tx_watermark": 14823,
|
|
11
|
+
"fact_count": 247,
|
|
12
|
+
"facts_used": 247,
|
|
13
|
+
"summary": "Citibank is …",
|
|
14
|
+
"sections": [
|
|
15
|
+
{
|
|
16
|
+
"heading": "Key People",
|
|
17
|
+
"bullets": [
|
|
18
|
+
{ "fact_id": "fact:citibank-cto-17yrs",
|
|
19
|
+
"text": "CTO has 17 yrs tenure",
|
|
20
|
+
"confidence": 0.92,
|
|
21
|
+
"domain": "people",
|
|
22
|
+
"first_seen": "2026-04-12" }
|
|
23
|
+
]
|
|
24
|
+
}
|
|
25
|
+
],
|
|
26
|
+
"stats": { "tokens_in": 18420, "tokens_out": 1380, "dropped_bullets": 0 }
|
|
27
|
+
}
|
|
28
|
+
|
|
29
|
+
Hallucinated fact_ids (in LLM output but not in input) are dropped with a
|
|
30
|
+
count in stats.dropped_bullets — same defensive pattern as
|
|
31
|
+
knowledge_integrator.py.
|
|
32
|
+
|
|
33
|
+
Usage:
|
|
34
|
+
python3 page_renderer.py --db memory/knowledge-graph.db \
|
|
35
|
+
--entity entity:citibank [--max-facts 1000]
|
|
36
|
+
"""
|
|
37
|
+
from __future__ import annotations
|
|
38
|
+
|
|
39
|
+
import argparse
|
|
40
|
+
import json
|
|
41
|
+
import sys
|
|
42
|
+
from pathlib import Path
|
|
43
|
+
|
|
44
|
+
from common import LLMError, call_llm_with_fallback, extract_json
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
SYSTEM_PROMPT = """\
|
|
48
|
+
You are organizing knowledge into a Confluence-style page about a specific entity.
|
|
49
|
+
INPUT: a list of facts each with a stable fact_id, value, confidence, domain, and first_seen.
|
|
50
|
+
OUTPUT: JSON matching the schema below.
|
|
51
|
+
|
|
52
|
+
RULES:
|
|
53
|
+
- Group facts into 2 to 8 themed sections.
|
|
54
|
+
- Section ordering preference: Overview → People → Projects/Work → Decisions → Open Questions → Recent Activity. Only include sections that fit the available facts.
|
|
55
|
+
- EVERY bullet MUST reference a real fact_id from the input list. Do not invent fact_ids.
|
|
56
|
+
- Each bullet's "text" is at most 140 characters, present-tense, plain English. Rewrite the fact for readability — do NOT just quote it verbatim if it can be tighter.
|
|
57
|
+
- The "summary" is 2 to 4 sentences synthesizing the entity at a glance. Cite no fact_ids in the summary.
|
|
58
|
+
- If facts contradict, prefer the higher-confidence fact and note the disagreement in the section's optional "notes" field.
|
|
59
|
+
- If the entity has very few facts (<5), produce one "Overview" section with all of them.
|
|
60
|
+
|
|
61
|
+
OUTPUT ONLY a JSON object, no other text:
|
|
62
|
+
{
|
|
63
|
+
"summary": "2-4 sentence summary",
|
|
64
|
+
"sections": [
|
|
65
|
+
{
|
|
66
|
+
"heading": "Section title",
|
|
67
|
+
"bullets": [
|
|
68
|
+
{ "fact_id": "fact:...", "text": "...", "confidence": 0.0 }
|
|
69
|
+
],
|
|
70
|
+
"notes": "optional: contradictions or caveats"
|
|
71
|
+
}
|
|
72
|
+
]
|
|
73
|
+
}
|
|
74
|
+
"""
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def _escape_like(fragment: str) -> str:
    """Escape LIKE metacharacters so *fragment* is matched literally.

    Must be paired with ``ESCAPE '\\'`` in the SQL text.
    """
    return (
        fragment.replace("\\", "\\\\")
        .replace("%", "\\%")
        .replace("_", "\\_")
    )


def _resolve_fact_ids(conn, entity_id: str, max_facts: int) -> list[str]:
    """Return the ids whose attribute sets make up the page for *entity_id*.

    *conn* is the triplestore's SQLite connection; rows are read by column
    name, so it must use a name-addressable row factory.
    """
    if entity_id.startswith("fact:"):
        # The fact IS the page.
        return [entity_id]

    if entity_id.startswith("entity:"):
        # Find ANY incoming ref (any attribute). Filtering to
        # attribute='entity' was too narrow — real data references entities
        # via many attribute names (related_to, parent_org, employed_by, ...).
        rows = conn.execute(
            """SELECT DISTINCT entity_id FROM triples
               WHERE value = ? AND value_type = 'ref' AND retracted = 0
               LIMIT ?""",
            (entity_id, max_facts),
        ).fetchall()
        fact_ids = [r["entity_id"] for r in rows]
        seen = set(fact_ids)

        # Legacy shape: the pointer is stored as a plain string slug under
        # the 'entity' attribute instead of a typed ref.
        slug_part = entity_id.split(":", 1)[1]
        legacy_rows = conn.execute(
            """SELECT DISTINCT entity_id FROM triples
               WHERE attribute = 'entity' AND value = ?
                 AND value_type = 'string' AND retracted = 0
               LIMIT ?""",
            (slug_part, max_facts),
        ).fetchall()
        for r in legacy_rows:
            if r["entity_id"] not in seen:
                seen.add(r["entity_id"])
                fact_ids.append(r["entity_id"])

        # Always lead with the entity's own attributes as a "self" fact.
        self_attrs_count = conn.execute(
            "SELECT COUNT(*) AS n FROM triples WHERE entity_id = ? AND retracted = 0",
            (entity_id,),
        ).fetchone()["n"]
        if self_attrs_count > 0:
            if entity_id in seen:
                # A self-referencing entity was already collected above;
                # move it to the front instead of listing it twice.
                fact_ids.remove(entity_id)
            fact_ids.insert(0, entity_id)
        return fact_ids[:max_facts]

    # Bare slug — try entity:<slug> first.
    eid = f"entity:{entity_id}"
    exists = conn.execute(
        "SELECT 1 FROM triples WHERE entity_id = ? AND retracted = 0 LIMIT 1",
        (eid,),
    ).fetchone()
    if exists:
        # Reuse the already-open connection rather than opening a second store.
        return _resolve_fact_ids(conn, eid, max_facts)

    # Then fact:<slug>* (prefix), then *<slug>* (substring) across both
    # prefixes. The slug is escaped so '_' and '%' in it match literally.
    needle = _escape_like(entity_id)
    rows = conn.execute(
        """SELECT DISTINCT entity_id FROM triples
           WHERE entity_id LIKE ? ESCAPE '\\' AND retracted = 0
           LIMIT ?""",
        (f"fact:{needle}%", max_facts),
    ).fetchall()
    if not rows:
        rows = conn.execute(
            """SELECT DISTINCT entity_id FROM triples
               WHERE (entity_id LIKE ? ESCAPE '\\' OR entity_id LIKE ? ESCAPE '\\')
                 AND retracted = 0
               LIMIT ?""",
            (f"fact:%{needle}%", f"entity:%{needle}%", max_facts),
        ).fetchall()
    return [r["entity_id"] for r in rows]


def load_entity_facts(db_path: str, entity_id: str, max_facts: int) -> tuple[list[dict], int]:
    """Load facts for an entity. Returns (facts, tx_watermark).

    Resolution strategy:
      1. ``fact:*`` — the fact's own attributes form a single-fact page.
      2. ``entity:*`` — every non-retracted entity that points at it through
         a ``ref`` value under any attribute, plus legacy rows that store the
         bare slug as a string under the ``entity`` attribute, plus the
         entity's own attributes (listed first).
      3. Bare slug — ``entity:<slug>`` if it exists, otherwise ``fact:<slug>*``
         by prefix, otherwise a substring match across both prefixes.

    Each fact is a dict with ``fact_id`` plus one key per attribute. A
    single-valued attribute is unwrapped to its value, a multi-valued one
    stays a list, ``tag`` is always exposed as the list ``tags``, and
    ``confidence`` is coerced to float (0.5 when missing or unparsable).

    ``tx_watermark`` is the highest ``tx_id`` among the non-retracted triples
    of the loaded facts, or 0 when nothing was loaded.

    The result is ordered by descending confidence, ties broken by ascending
    ``first_seen``, and truncated to ``max_facts``.
    """
    from triplestore import TripleStore

    store = TripleStore(db_path)
    try:
        fact_ids = _resolve_fact_ids(store._conn, entity_id, max_facts)

        # Load full attribute sets for each fact
        facts: list[dict] = []
        tx_watermark = 0
        for fid in fact_ids:
            attrs = store.entity(fid)
            if not attrs:
                continue
            # Fold this fact's newest live transaction into the watermark.
            max_tx_row = store._conn.execute(
                "SELECT MAX(tx_id) AS m FROM triples WHERE entity_id = ? AND retracted = 0",
                (fid,),
            ).fetchone()
            if max_tx_row and max_tx_row["m"] is not None:
                tx_watermark = max(tx_watermark, max_tx_row["m"])

            fact = {"fact_id": fid}
            for attr, values in attrs.items():
                if attr == "tag":
                    fact["tags"] = values
                else:
                    fact[attr] = values[0] if len(values) == 1 else values
            # Coerce confidence to float
            try:
                fact["confidence"] = float(fact.get("confidence", 0.5))
            except (ValueError, TypeError):
                fact["confidence"] = 0.5
            facts.append(fact)
    finally:
        # Release the store on every path, including errors.
        store.close()

    # Highest confidence first; ties fall back to first_seen ascending.
    # first_seen is stringified so a missing or multi-valued attribute cannot
    # make the tuple comparison raise.
    facts.sort(key=lambda f: (
        -f["confidence"],
        str(f.get("first_seen") or ""),
    ))
    return facts[:max_facts], tx_watermark
|
|
195
|
+
|
|
196
|
+
|
|
197
|
+
def _prompt_text(raw: object) -> str:
    """Flatten an attribute value (None, scalar, or multi-valued list) to text."""
    if raw is None:
        return ""
    if isinstance(raw, str):
        return raw
    if isinstance(raw, (list, tuple)):
        return "; ".join(str(item) for item in raw if item is not None)
    return str(raw)


def build_user_prompt(entity_id: str, facts: list[dict]) -> str:
    """Compact representation for the LLM. Strips noise, keeps essential fields.

    Emits a short header followed by one line per fact:
    ``- [fact_id] (conf=<c>, domain=… | first_seen=YYYY-MM-DD | tags=a,b): value``.
    Values are flattened to one line and capped at 300 characters, and at most
    five tags are listed.

    Attribute values may be missing, ``None``, or lists (multi-valued
    attributes); all of these are rendered as text rather than raising.
    """
    parts = [f"Entity: {entity_id}", f"Total facts: {len(facts)}", "", "Facts:"]
    for f in facts:
        fid = f.get("fact_id", "?")
        value = _prompt_text(f.get("value")).strip().replace("\n", " ")[:300]
        conf = f.get("confidence", "?")
        domain = _prompt_text(f.get("domain"))

        first_seen_raw = f.get("first_seen")
        if isinstance(first_seen_raw, (list, tuple)):
            # Multi-valued: keep the smallest string, which is the earliest
            # date assuming ISO-8601 timestamps — TODO confirm the format.
            candidates = [str(item) for item in first_seen_raw if item is not None]
            first_seen_raw = min(candidates) if candidates else ""
        first_seen = _prompt_text(first_seen_raw)[:10]  # YYYY-MM-DD

        tags = ",".join(str(tag) for tag in (f.get("tags") or [])[:5])

        meta_bits = []
        if domain:
            meta_bits.append(f"domain={domain}")
        if first_seen:
            meta_bits.append(f"first_seen={first_seen}")
        if tags:
            meta_bits.append(f"tags={tags}")
        meta = " | ".join(meta_bits)
        parts.append(f"- [{fid}] (conf={conf}{', ' + meta if meta else ''}): {value}")
    return "\n".join(parts)
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
def _page_text(raw: object) -> str:
    """Flatten an attribute value (None, scalar, or multi-valued list) to text."""
    if raw is None:
        return ""
    if isinstance(raw, str):
        return raw
    if isinstance(raw, (list, tuple)):
        return "; ".join(str(item) for item in raw if item is not None)
    return str(raw)


def render_page(db_path: str, entity_id: str, max_facts: int = 1000) -> dict:
    """Render the page dict for *entity_id*.

    Paths, in order:
      * no facts    -> empty page with a placeholder summary;
      * one fact    -> single "Overview" section, no LLM call;
      * otherwise   -> LLM groups the facts; its output is validated and any
        bullet citing a fact_id that was not in the input is dropped and
        counted in ``stats.dropped_bullets``.

    Any failure while building the prompt, calling the LLM, or parsing its
    reply yields the ungrouped fallback page instead of raising. Malformed
    fields in an otherwise valid reply (non-string heading or summary,
    non-list sections or bullets, non-string fact_id) are skipped.

    ``facts_used`` is the number of distinct input facts that ended up cited
    on the page.
    """
    facts, tx_watermark = load_entity_facts(db_path, entity_id, max_facts)
    fact_count_total = len(facts)

    if not facts:
        return {
            "entity": entity_id,
            "tx_watermark": tx_watermark,
            "fact_count": 0,
            "facts_used": 0,
            "summary": "No knowledge captured for this entity yet.",
            "sections": [],
            "stats": {"tokens_in": 0, "tokens_out": 0, "dropped_bullets": 0,
                      "from_cache": False},
        }

    # Single-fact short-circuit (no LLM needed)
    if fact_count_total == 1:
        f = facts[0]
        only_value = _page_text(f.get("value"))
        return {
            "entity": entity_id,
            "tx_watermark": tx_watermark,
            "fact_count": 1,
            "facts_used": 1,
            "summary": only_value[:200],
            "sections": [{
                "heading": "Overview",
                "bullets": [{
                    "fact_id": f["fact_id"],
                    "text": only_value[:140],
                    "confidence": f.get("confidence", 0.5),
                    "domain": f.get("domain"),
                    "first_seen": f.get("first_seen"),
                }],
            }],
            "stats": {"tokens_in": 0, "tokens_out": 0, "dropped_bullets": 0,
                      "from_cache": False},
        }

    # Multi-fact: LLM rendering
    try:
        user_prompt = build_user_prompt(entity_id, facts)
        raw = call_llm_with_fallback(
            SYSTEM_PROMPT, user_prompt, script="page_renderer",
            json_mode=True, retries=1,
        )
        parsed = extract_json(raw)
    except Exception as e:
        # Catch broad — covers LLMError, ValueError, missing API key
        # (RuntimeError), JSON parse errors, network errors, etc. The web UI
        # always wants a renderable response; degraded > broken.
        sys.stderr.write(f"page_renderer LLM failed: {e}\n")
        return _fallback_page(entity_id, facts, tx_watermark, error=str(e))

    if not isinstance(parsed, dict):
        return _fallback_page(entity_id, facts, tx_watermark, error="LLM did not return object")

    # Validate fact_ids — drop hallucinated ones
    fact_meta = {f["fact_id"]: f for f in facts}
    sections_in = parsed.get("sections")
    if not isinstance(sections_in, list):
        sections_in = []
    sections_out: list[dict] = []
    used_ids: set[str] = set()
    dropped = 0

    for sec in sections_in:
        if not isinstance(sec, dict):
            continue
        heading_raw = sec.get("heading")
        heading = heading_raw.strip()[:80] if isinstance(heading_raw, str) else ""
        if not heading:
            continue
        bullets_in = sec.get("bullets")
        if not isinstance(bullets_in, list):
            continue
        bullets_out: list[dict] = []
        for b in bullets_in:
            if not isinstance(b, dict):
                continue
            fid = b.get("fact_id")
            # The isinstance check also keeps unhashable ids (lists/dicts)
            # away from the dict lookup.
            if not isinstance(fid, str) or fid not in fact_meta:
                dropped += 1
                continue
            meta = fact_meta[fid]
            text = b.get("text")
            if not isinstance(text, str) or not text:
                # Fall back to the stored value when the LLM gave no text.
                text = _page_text(meta.get("value"))
            # Metadata always comes from the store, never from the LLM echo.
            bullets_out.append({
                "fact_id": fid,
                "text": text[:200],
                "confidence": meta.get("confidence", 0.5),
                "domain": meta.get("domain"),
                "first_seen": meta.get("first_seen"),
            })
            used_ids.add(fid)
        if bullets_out:
            sec_out = {"heading": heading, "bullets": bullets_out}
            if sec.get("notes"):
                sec_out["notes"] = str(sec["notes"])[:300]
            sections_out.append(sec_out)

    summary_raw = parsed.get("summary")
    summary = summary_raw.strip()[:1000] if isinstance(summary_raw, str) else ""
    if not summary:
        summary = f"Knowledge about {entity_id}: {fact_count_total} facts."

    return {
        "entity": entity_id,
        "tx_watermark": tx_watermark,
        "fact_count": fact_count_total,
        # Distinct input facts actually cited. Dropped bullets referenced ids
        # that were never in the input, so they must not be subtracted from
        # the input count.
        "facts_used": len(used_ids),
        "summary": summary,
        "sections": sections_out,
        "stats": {
            "tokens_in": 0,  # tracked in stderr; aggregating here would require parser
            "tokens_out": 0,
            "dropped_bullets": dropped,
            "from_cache": False,
        },
    }
|
|
325
|
+
|
|
326
|
+
|
|
327
|
+
def _fallback_page(entity_id: str, facts: list[dict], tx_watermark: int, error: str = "") -> dict:
    """Build the degraded page used when the LLM fails or returns garbage.

    Every fact becomes one bullet (value capped at 140 characters) inside a
    single "All Facts" section. ``stats`` carries ``fallback=True`` and the
    first 200 characters of *error*.
    """
    total = len(facts)

    all_bullets = []
    for fact in facts:
        all_bullets.append({
            "fact_id": fact["fact_id"],
            "text": (fact.get("value") or "")[:140],
            "confidence": fact.get("confidence", 0.5),
            "domain": fact.get("domain"),
            "first_seen": fact.get("first_seen"),
        })

    stats = {
        "tokens_in": 0,
        "tokens_out": 0,
        "dropped_bullets": 0,
        "from_cache": False,
        "fallback": True,
        "error": error[:200],
    }
    page = {
        "entity": entity_id,
        "tx_watermark": tx_watermark,
        "fact_count": total,
        "facts_used": total,
        "summary": f"Knowledge about {entity_id} ({total} facts). LLM rendering unavailable.",
        "sections": [{"heading": "All Facts", "bullets": all_bullets}],
        "stats": stats,
    }
    return page
|
|
346
|
+
|
|
347
|
+
|
|
348
|
+
def lookup_cache(web_db_path: str, entity_id: str, tx_watermark: int) -> dict | None:
|
|
349
|
+
"""Read page_cache row matching (entity, tx_watermark). Returns parsed JSON or None."""
|
|
350
|
+
import sqlite3
|
|
351
|
+
if not Path(web_db_path).exists():
|
|
352
|
+
return None
|
|
353
|
+
try:
|
|
354
|
+
conn = sqlite3.connect(web_db_path)
|
|
355
|
+
conn.row_factory = sqlite3.Row
|
|
356
|
+
row = conn.execute(
|
|
357
|
+
"SELECT page_json, generated_at FROM page_cache WHERE entity_id = ? AND tx_watermark = ?",
|
|
358
|
+
(entity_id, tx_watermark),
|
|
359
|
+
).fetchone()
|
|
360
|
+
conn.close()
|
|
361
|
+
if row:
|
|
362
|
+
page = json.loads(row["page_json"])
|
|
363
|
+
page.setdefault("stats", {})["from_cache"] = True
|
|
364
|
+
page["generated_at"] = row["generated_at"]
|
|
365
|
+
return page
|
|
366
|
+
except Exception as e:
|
|
367
|
+
sys.stderr.write(f"page_renderer cache lookup failed: {e}\n")
|
|
368
|
+
return None
|
|
369
|
+
|
|
370
|
+
|
|
371
|
+
def write_cache(web_db_path: str, entity_id: str, page: dict) -> None:
    """Persist a freshly-rendered page to web.db.page_cache.

    Stores the whole page as JSON together with its ``tx_watermark`` (0 when
    absent), the current time in epoch milliseconds, and the token/cost
    figures found in ``page["stats"]`` (NULL when absent). Uses
    INSERT OR REPLACE, so what counts as "the same row" is decided by the
    table's own uniqueness constraints.

    Best-effort: any failure is reported on stderr and swallowed.
    """
    import sqlite3
    import time
    from contextlib import closing

    try:
        stats = page.get("stats") or {}
        # closing() releases the handle even when the insert fails.
        with closing(sqlite3.connect(web_db_path)) as conn:
            conn.execute(
                """INSERT OR REPLACE INTO page_cache
                   (entity_id, tx_watermark, page_json, generated_at, tokens_in, tokens_out, cost_usd)
                   VALUES (?, ?, ?, ?, ?, ?, ?)""",
                (
                    entity_id,
                    page.get("tx_watermark", 0),
                    json.dumps(page, ensure_ascii=False),
                    int(time.time() * 1000),
                    stats.get("tokens_in"),
                    stats.get("tokens_out"),
                    stats.get("cost_usd"),
                ),
            )
            conn.commit()
    except Exception as e:
        sys.stderr.write(f"page_renderer cache write failed: {e}\n")
|
|
396
|
+
|
|
397
|
+
|
|
398
|
+
def main() -> None:
    """CLI entry point: print the rendered page for ``--entity`` as JSON.

    With ``--web-db`` the page cache is consulted first (unless ``--refresh``)
    and updated after a successful render. Exits with status 1 and a JSON
    error object when the knowledge-graph database does not exist.
    """
    parser = argparse.ArgumentParser(description="Page Renderer")
    parser.add_argument("--db", required=True, help="Path to knowledge-graph.db")
    parser.add_argument("--entity", required=True, help="Entity id (entity:* or fact:* or bare slug)")
    parser.add_argument("--max-facts", type=int, default=1000, help="Max facts to consider")
    parser.add_argument("--web-db", default=None,
                        help="Path to web.db for page cache (optional). If provided, hits and writes cache.")
    parser.add_argument("--refresh", action="store_true",
                        help="Bypass cache and always re-render via LLM.")
    args = parser.parse_args()

    if not Path(args.db).exists():
        print(json.dumps({"error": f"db not found: {args.db}"}))
        sys.exit(1)

    # Determine tx_watermark for the cache key before we commit to an LLM call.
    facts, tx_watermark = load_entity_facts(args.db, args.entity, args.max_facts)

    # Cache hit fast-path
    if args.web_db and not args.refresh:
        cached = lookup_cache(args.web_db, args.entity, tx_watermark)
        if cached:
            print(json.dumps(cached, ensure_ascii=False))
            return

    if not facts:
        page = {
            "entity": args.entity,
            "tx_watermark": tx_watermark,
            "fact_count": 0,
            "facts_used": 0,
            "summary": "No knowledge captured for this entity yet.",
            "sections": [],
            "stats": {"tokens_in": 0, "tokens_out": 0, "dropped_bullets": 0, "from_cache": False},
        }
    else:
        # render_page reloads the facts itself, so the load above is
        # duplicated; that keeps the two entry points independent instead of
        # threading already-loaded state through.
        page = render_page(args.db, args.entity, max_facts=args.max_facts)

    # Never cache a degraded page. A fallback produced during an LLM outage
    # would otherwise be served from the cache until the entity's watermark
    # changes, and the LLM would not be retried.
    stats = page.get("stats") or {}
    cacheable = page.get("fact_count", 0) > 0 and not stats.get("fallback")
    if args.web_db and cacheable:
        write_cache(args.web_db, args.entity, page)

    print(json.dumps(page, ensure_ascii=False))
|
|
444
|
+
|
|
445
|
+
|
|
446
|
+
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,236 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Fact retraction (soft-delete) for the web UI.
|
|
3
|
+
|
|
4
|
+
The triplestore already supports retraction via store.retract_triple() —
|
|
5
|
+
this script is the user-initiated equivalent of what knowledge_integrator
|
|
6
|
+
does automatically. The new ingredients are:
|
|
7
|
+
|
|
8
|
+
1. Audit triples (retracted_reason, retracted_by) so the WHY survives.
|
|
9
|
+
2. Pre-retraction snapshot saved to web.db.retraction_undo, single-use,
|
|
10
|
+
10-minute TTL — gives the UI a real "undo" button.
|
|
11
|
+
|
|
12
|
+
Soft delete: rows stay; retracted=1 + retracted_tx + valid_to are set.
|
|
13
|
+
Bi-temporal queries (entity_as_of) still see the fact at past tx_ids.
|
|
14
|
+
Physical removal only happens via gc_retracted_triples (off by default).
|
|
15
|
+
|
|
16
|
+
Usage:
|
|
17
|
+
python3 retract.py --retract --db <db> --web-db <web.db> \
|
|
18
|
+
--fact-id fact:foo [--reason "..."] [--actor "..."]
|
|
19
|
+
|
|
20
|
+
python3 retract.py --restore --db <db> --web-db <web.db> \
|
|
21
|
+
--fact-id fact:foo --undo-token <token>
|
|
22
|
+
"""
|
|
23
|
+
from __future__ import annotations
|
|
24
|
+
|
|
25
|
+
import argparse
|
|
26
|
+
import json
|
|
27
|
+
import secrets
|
|
28
|
+
import sqlite3
|
|
29
|
+
import sys
|
|
30
|
+
import time
|
|
31
|
+
from pathlib import Path
|
|
32
|
+
|
|
33
|
+
# Lifetime of an undo token, in milliseconds. retract_fact() adds this to the
# current epoch-millisecond time to compute the token's expires_at.
UNDO_TTL_MS = 10 * 60 * 1000  # 10 minutes
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def snapshot_triples(store, fact_id: str) -> list[dict]:
    """Return one dict per non-retracted triple of *fact_id*.

    Each entry records the attribute, value and value_type (so a restore can
    keep string-vs-ref semantics) together with the triple's original tx_id
    and created_at.
    """
    cursor = store._conn.execute(
        """SELECT attribute, value, value_type, tx_id, created_at
           FROM triples
           WHERE entity_id = ? AND retracted = 0""",
        (fact_id,),
    )
    captured: list[dict] = []
    for row in cursor.fetchall():
        entry = {
            "attribute": row["attribute"],
            "value": row["value"],
            "value_type": row["value_type"],
            "original_tx_id": row["tx_id"],
            "original_created_at": row["created_at"],
        }
        captured.append(entry)
    return captured
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def retract_fact(db_path: str, web_db_path: str, fact_id: str,
                 reason: str | None, actor: str | None,
                 source_entity: str | None = None) -> dict:
    """Retract all triples for a fact entity + persist undo snapshot.

    Steps:
      1. Snapshot the fact's live triples, ignoring the ``retracted_reason`` /
         ``retracted_by`` audit triples left by an earlier retraction.
      2. Retract each snapshotted triple in one ``web-ui-retract`` transaction
         and assert fresh audit triples for *reason* / *actor* when given.
      3. If web.db exists, store the snapshot under a new undo token and add
         a retraction_log row. This step is best-effort: failures go to
         stderr and do not undo the retraction.

    Returns ``{"ok": False, "error": ...}`` when there is nothing to retract.
    Otherwise returns the retraction summary, where ``undo_available`` says
    whether the undo token was actually written to web.db.
    """
    from contextlib import closing
    from triplestore import TripleStore

    audit_attrs = ("retracted_reason", "retracted_by")

    store = TripleStore(db_path)
    try:
        # Audit triples describe a past retraction, not the fact itself.
        # Counting them as content would make an already-retracted fact look
        # live and defeat the guard below.
        snapshot = [
            t for t in snapshot_triples(store, fact_id)
            if t["attribute"] not in audit_attrs
        ]
        if not snapshot:
            return {"ok": False, "error": "fact not found or already retracted",
                    "fact_id": fact_id}

        metadata = {"actor": actor, "reason": reason, "source": "web-ui"}
        tx_id = store.begin_tx(source="web-ui-retract",
                               metadata={k: v for k, v in metadata.items() if v})

        # Retract every active triple
        triples_retracted = 0
        for t in snapshot:
            triples_retracted += store.retract_triple(
                tx_id, fact_id, t["attribute"], t["value"],
            )

        # Audit triples — these are NEW assertions ABOUT the retraction event
        if reason:
            store.assert_triple(tx_id, fact_id, "retracted_reason", reason, "string")
        if actor:
            store.assert_triple(tx_id, fact_id, "retracted_by", actor, "string")
    finally:
        # Release the store on every path, including errors.
        # NOTE(review): assumes TripleStore.close() does not commit pending
        # work on its own — confirm against triplestore.py.
        store.close()

    # Persist undo snapshot
    token = secrets.token_hex(16)
    now_ms = int(time.time() * 1000)
    expires_at = now_ms + UNDO_TTL_MS

    undo_persisted = False
    if Path(web_db_path).exists():
        try:
            # closing() releases the handle even when an insert fails.
            with closing(sqlite3.connect(web_db_path)) as conn:
                conn.execute(
                    """INSERT INTO retraction_undo
                       (token, fact_id, snapshot_json, retracted_tx,
                        reason, actor, created_at, expires_at)
                       VALUES (?, ?, ?, ?, ?, ?, ?, ?)""",
                    (token, fact_id, json.dumps(snapshot),
                     tx_id, reason, actor, now_ms, expires_at),
                )
                conn.execute(
                    """INSERT INTO retraction_log
                       (ts, fact_id, reason, actor, source_entity)
                       VALUES (?, ?, ?, ?, ?)""",
                    (now_ms, fact_id, reason, actor, source_entity),
                )
                conn.commit()
            undo_persisted = True
        except Exception as e:
            sys.stderr.write(f"undo persist failed: {e}\n")

    return {
        "ok": True,
        "fact_id": fact_id,
        "retracted": True,
        "retracted_tx": tx_id,
        "triples_retracted": triples_retracted,
        "undo_token": token,
        "expires_at": expires_at,
        # False when web.db was missing or the insert failed, in which case
        # the token above cannot be redeemed.
        "undo_available": undo_persisted,
    }
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def restore_fact(db_path: str, web_db_path: str, fact_id: str,
                 undo_token: str) -> dict:
    """Re-activate a previously retracted fact using its undo token.

    The token must exist in web.db.retraction_undo for *fact_id*, be
    unconsumed and unexpired; otherwise ``{"ok": False, "error": ...}`` is
    returned and nothing is changed.

    On success, the triples closed by the original retraction transaction are
    flipped back to active (keeping their original tx_id and created_at), the
    audit triples written at retraction time are retracted, the token is
    marked consumed, and the newest open retraction_log row for the fact gets
    an ``undone_at`` timestamp.
    """
    from contextlib import closing

    if not Path(web_db_path).exists():
        return {"ok": False, "error": "web.db not available"}

    # closing() releases the web.db handle on every exit, including errors.
    with closing(sqlite3.connect(web_db_path)) as conn:
        conn.row_factory = sqlite3.Row
        row = conn.execute(
            "SELECT * FROM retraction_undo WHERE token = ? AND fact_id = ?",
            (undo_token, fact_id),
        ).fetchone()

        if not row:
            return {"ok": False, "error": "undo token not found"}
        if row["consumed_at"] is not None:
            return {"ok": False, "error": "undo token already consumed"}
        if row["expires_at"] < int(time.time() * 1000):
            return {"ok": False, "error": "undo token expired"}

        original_retracted_tx = row["retracted_tx"]

        # Imported only once the token is known to be valid; rejected
        # requests never need the knowledge graph.
        from triplestore import TripleStore

        store = TripleStore(db_path)
        try:
            tx_id = store.begin_tx(source="web-ui-restore",
                                   metadata={"undo_token": undo_token,
                                             "reverses_tx": original_retracted_tx})

            # Un-retract: flip retracted=0 on triples that were closed by the
            # original retraction tx. Avoids creating duplicate triples — the
            # originals come back with their original tx_ids and created_at
            # intact, preserving bi-temporal history.
            cur = store._conn.execute(
                """UPDATE triples SET retracted = 0, retracted_tx = NULL, valid_to = NULL
                   WHERE entity_id = ? AND retracted_tx = ?""",
                (fact_id, original_retracted_tx),
            )
            triples_restored = cur.rowcount

            # Also retract the audit triples we wrote during retraction so
            # they don't linger as active facts on the restored entity.
            store.retract_triple(tx_id, fact_id, "retracted_reason")
            store.retract_triple(tx_id, fact_id, "retracted_by")

            store._conn.commit()
        finally:
            # NOTE(review): assumes TripleStore.close() does not commit
            # pending work on its own — confirm against triplestore.py.
            store.close()

        # Mark consumed + log undo
        undone_ms = int(time.time() * 1000)
        conn.execute(
            "UPDATE retraction_undo SET consumed_at = ? WHERE token = ?",
            (undone_ms, undo_token),
        )
        conn.execute(
            """UPDATE retraction_log SET undone_at = ?
               WHERE rowid = (
                   SELECT rowid FROM retraction_log
                   WHERE fact_id = ? AND undone_at IS NULL
                   ORDER BY ts DESC LIMIT 1
               )""",
            (undone_ms, fact_id),
        )
        conn.commit()

    return {
        "ok": True,
        "fact_id": fact_id,
        "restored": True,
        "restored_tx": tx_id,
        "triples_restored": triples_restored,
    }
|
|
202
|
+
|
|
203
|
+
|
|
204
|
+
def main() -> None:
    """CLI entry point: retract or restore one fact and print the result as JSON.

    Exactly one of ``--retract`` / ``--restore`` is required. Exits with
    status 1 and a JSON error object when the knowledge-graph database is
    missing or ``--restore`` is given without ``--undo-token``.
    """
    cli = argparse.ArgumentParser(description="Fact retraction / restore")
    cli.add_argument("--db", required=True, help="Knowledge graph DB path")
    cli.add_argument("--web-db", required=True, help="Web metadata DB path")
    cli.add_argument("--fact-id", required=True, help="Fact entity id (e.g. fact:foo)")
    action = cli.add_mutually_exclusive_group(required=True)
    action.add_argument("--retract", action="store_true")
    action.add_argument("--restore", action="store_true")
    cli.add_argument("--reason", default=None)
    cli.add_argument("--actor", default=None)
    cli.add_argument("--source-entity", default=None,
                     help="Entity page user was on when retracting (telemetry)")
    cli.add_argument("--undo-token", default=None)
    opts = cli.parse_args()

    def bail(message: str) -> None:
        # Emit the error as JSON on stdout, then stop with a failing status.
        print(json.dumps({"ok": False, "error": message}))
        sys.exit(1)

    if not Path(opts.db).exists():
        bail(f"db not found: {opts.db}")

    # The two modes are mutually exclusive and one is required, so
    # "not --retract" always means --restore.
    if not opts.retract and not opts.undo_token:
        bail("--undo-token required for --restore")

    if opts.retract:
        result = retract_fact(opts.db, opts.web_db, opts.fact_id,
                              opts.reason, opts.actor, opts.source_entity)
    else:
        result = restore_fact(opts.db, opts.web_db, opts.fact_id, opts.undo_token)

    print(json.dumps(result, ensure_ascii=False))
|
|
233
|
+
|
|
234
|
+
|
|
235
|
+
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|