@geravant/sinain 1.18.3 → 1.20.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,100 @@
1
+ /**
2
+ * Schema for ~/.sinain/memory/web.db — UI metadata storage.
3
+ *
4
+ * Kept separate from triplestore (knowledge-graph.db) because triples are
5
+ * claims about the world (with confidence, retraction, bi-temporal validity),
6
+ * whereas this DB stores UI state, page caches, and undo tokens — metadata
7
+ * that should not be visible to the curator/distiller.
8
+ */
9
+
10
// Version stamp for the web.db layout. WebDb's constructor writes String(SCHEMA_VERSION) into schema_meta under key 'version' every time it opens the database.
+ export const SCHEMA_VERSION = 1;
11
+
12
// DDL for every table and index in web.db. WebDb passes this string to db.exec() on each open;
// every statement uses IF NOT EXISTS, so objects that already exist are left untouched.
+ export const SCHEMA_SQL = `
13
+ -- Schema version tracking (for future migrations)
14
+ CREATE TABLE IF NOT EXISTS schema_meta (
15
+ key TEXT PRIMARY KEY,
16
+ value TEXT NOT NULL
17
+ );
18
+
19
+ -- User bookmarks (favorite / archive / recent)
20
+ CREATE TABLE IF NOT EXISTS user_bookmarks (
21
+ entity_id TEXT PRIMARY KEY,
22
+ status TEXT NOT NULL CHECK (status IN ('favorite','archive','recent')),
23
+ note TEXT,
24
+ created_at INTEGER NOT NULL,
25
+ last_visited INTEGER NOT NULL
26
+ );
27
+ CREATE INDEX IF NOT EXISTS idx_bookmarks_status_visited
28
+ ON user_bookmarks(status, last_visited DESC);
29
+
30
+ -- LLM-rendered page cache. Key: (entity, max tx_id of facts that fed the render)
31
+ -- Implicit invalidation: new facts → tx advances → cache miss → regenerate.
32
+ -- Old entries kept for bi-temporal "view as of" support.
33
+ CREATE TABLE IF NOT EXISTS page_cache (
34
+ entity_id TEXT NOT NULL,
35
+ tx_watermark INTEGER NOT NULL,
36
+ page_json TEXT NOT NULL,
37
+ generated_at INTEGER NOT NULL,
38
+ tokens_in INTEGER,
39
+ tokens_out INTEGER,
40
+ cost_usd REAL,
41
+ PRIMARY KEY (entity_id, tx_watermark)
42
+ );
43
+ CREATE INDEX IF NOT EXISTS idx_page_cache_entity
44
+ ON page_cache(entity_id, generated_at DESC);
45
+
46
+ -- Undo snapshots for fact retraction. 10-minute server-side window.
47
+ -- Single-use: consumed_at set when restored; row not deleted (audit trail).
48
+ CREATE TABLE IF NOT EXISTS retraction_undo (
49
+ token TEXT PRIMARY KEY,
50
+ fact_id TEXT NOT NULL,
51
+ snapshot_json TEXT NOT NULL,
52
+ retracted_tx INTEGER NOT NULL,
53
+ reason TEXT,
54
+ actor TEXT,
55
+ created_at INTEGER NOT NULL,
56
+ expires_at INTEGER NOT NULL,
57
+ consumed_at INTEGER
58
+ );
59
+ CREATE INDEX IF NOT EXISTS idx_retraction_undo_expires
60
+ ON retraction_undo(expires_at);
61
+
62
+ -- Audit log of every retraction (kept forever, feeds eval reports).
63
+ CREATE TABLE IF NOT EXISTS retraction_log (
64
+ ts INTEGER NOT NULL,
65
+ fact_id TEXT NOT NULL,
66
+ reason TEXT,
67
+ actor TEXT,
68
+ undone_at INTEGER,
69
+ source_entity TEXT
70
+ );
71
+ CREATE INDEX IF NOT EXISTS idx_retraction_log_ts ON retraction_log(ts DESC);
72
+
73
+ -- Imported concepts — provenance + idempotency check via bundle_sha256.
74
+ CREATE TABLE IF NOT EXISTS concept_imports (
75
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
76
+ imported_at INTEGER NOT NULL,
77
+ root_entity TEXT NOT NULL,
78
+ source_tool TEXT,
79
+ source_version TEXT,
80
+ envelope_format TEXT NOT NULL,
81
+ bundle_sha256 TEXT NOT NULL,
82
+ conflict_mode TEXT NOT NULL,
83
+ triples_count INTEGER,
84
+ redactions_seen TEXT,
85
+ notes TEXT
86
+ );
87
+ CREATE INDEX IF NOT EXISTS idx_concept_imports_root
88
+ ON concept_imports(root_entity, imported_at DESC);
89
+ CREATE INDEX IF NOT EXISTS idx_concept_imports_sha
90
+ ON concept_imports(bundle_sha256);
91
+
92
+ -- Search log for telemetry / "what did the user search for that didn't resolve."
93
+ CREATE TABLE IF NOT EXISTS search_log (
94
+ ts INTEGER NOT NULL,
95
+ query TEXT NOT NULL,
96
+ resolved_to TEXT,
97
+ result_count INTEGER
98
+ );
99
+ CREATE INDEX IF NOT EXISTS idx_search_log_ts ON search_log(ts DESC);
100
+ `;
@@ -0,0 +1,279 @@
1
+ /**
2
+ * Typed accessor for ~/.sinain/memory/web.db.
3
+ *
4
+ * One module owns all SQL — keeps query strings out of HTTP handlers and
5
+ * lets us swap out better-sqlite3 later if needed.
6
+ */
7
+ import Database from "better-sqlite3";
8
+ import { existsSync, mkdirSync } from "node:fs";
9
+ import { dirname } from "node:path";
10
+ import { SCHEMA_SQL, SCHEMA_VERSION } from "./schema.js";
11
+ import { log } from "../log.js";
12
+
13
// First argument handed to log() for every message this module emits.
+ const TAG = "web-db";
14
+
15
/** Allowed bookmark states; the same three literals appear in the CHECK constraint on user_bookmarks.status. */
+ export type BookmarkStatus = "favorite" | "archive" | "recent";
16
+
17
/** Row shape of the `user_bookmarks` table, as returned by the bookmark accessors on WebDb. */
+ export interface Bookmark {
18
+ entity_id: string; // PRIMARY KEY of user_bookmarks — at most one bookmark row per entity
19
+ status: BookmarkStatus;
20
+ note: string | null; // null when no note has been stored
21
+ created_at: number; // Date.now() at first insert; the ON CONFLICT updates never rewrite it
22
+ last_visited: number; // Date.now() of the latest upsertBookmark / touchVisit call
23
+ }
24
+
25
/** Row shape of the `page_cache` table; (entity_id, tx_watermark) is the composite primary key. */
+ export interface PageCacheRow {
26
+ entity_id: string;
27
+ tx_watermark: number; // per the schema comment: max tx_id of the facts that fed the render
28
+ page_json: string; // stored and returned as text; WebDb never parses it
29
+ generated_at: number; // putPageCache fills in Date.now() when the caller omits it
30
+ tokens_in: number | null;
31
+ tokens_out: number | null;
32
+ cost_usd: number | null;
33
+ }
34
+
35
/** Row shape of the `retraction_undo` table: one undo snapshot per token. */
+ export interface RetractionUndoRow {
36
+ token: string; // PRIMARY KEY; the lookup key for getRetractionUndo / consumeRetractionUndo
37
+ fact_id: string;
38
+ snapshot_json: string; // stored as text; WebDb never parses it
39
+ retracted_tx: number;
40
+ reason: string | null;
41
+ actor: string | null;
42
+ created_at: number; // supplied by the caller of putRetractionUndo
43
+ expires_at: number; // pruneExpiredUndos compares this against Date.now()
44
+ consumed_at: number | null; // null until consumeRetractionUndo stamps it with Date.now()
45
+ }
46
+
47
/** Row shape of the `concept_imports` table, used both for inserts and for lookups by bundle hash. */
+ export interface ConceptImportRow {
48
+ id?: number; // not written by recordConceptImport; the column is INTEGER PRIMARY KEY AUTOINCREMENT
49
+ imported_at: number; // supplied by the caller
50
+ root_entity: string;
51
+ source_tool: string | null;
52
+ source_version: string | null;
53
+ envelope_format: string;
54
+ bundle_sha256: string; // lookup key for findImportBySha
55
+ conflict_mode: string;
56
+ triples_count: number | null;
57
+ redactions_seen: string | null;
58
+ notes: string | null;
59
+ }
60
+
61
// Maximum number of page_cache rows kept. After each putPageCache, pruneCache deletes the rows with the smallest generated_at until the count is back at this cap.
+ const PAGE_CACHE_LRU_CAP = 500;
62
+
63
/**
 * Typed accessor for web.db. Every SQL statement that touches the UI-metadata
 * database lives in this class, so HTTP handlers never build query strings.
 *
 * All timestamps written by this class come from `Date.now()` (epoch
 * milliseconds).
 */
export class WebDb {
  private db: Database.Database;

  /**
   * Opens the database at `dbPath`, creating the parent directory first if it
   * does not exist, then applies `SCHEMA_SQL` and records `SCHEMA_VERSION` in
   * `schema_meta` under key `version`.
   *
   * @param dbPath Filesystem path of the SQLite file.
   */
  constructor(dbPath: string) {
    const parentDir = dirname(dbPath);
    if (!existsSync(parentDir)) {
      mkdirSync(parentDir, { recursive: true });
    }
    this.db = new Database(dbPath);
    this.db.pragma("journal_mode = WAL");
    this.db.pragma("foreign_keys = ON");
    this.db.exec(SCHEMA_SQL);
    this.db
      .prepare(
        "INSERT INTO schema_meta(key, value) VALUES('version', ?) ON CONFLICT(key) DO UPDATE SET value=excluded.value",
      )
      .run(String(SCHEMA_VERSION));
    log(TAG, `web.db ready at ${dbPath} (schema v${SCHEMA_VERSION})`);
  }

  /** Closes the underlying database handle. */
  close(): void {
    this.db.close();
  }

  // ── Bookmarks ───────────────────────────────────────────

  /**
   * Lists bookmarks ordered by `last_visited`, newest first.
   *
   * @param status When given, only rows with this status are returned.
   * @param limit Maximum number of rows (default 100).
   */
  listBookmarks(status?: BookmarkStatus, limit = 100): Bookmark[] {
    const sql = status
      ? "SELECT * FROM user_bookmarks WHERE status = ? ORDER BY last_visited DESC LIMIT ?"
      : "SELECT * FROM user_bookmarks ORDER BY last_visited DESC LIMIT ?";
    const args = status ? [status, limit] : [limit];
    return this.db.prepare(sql).all(...args) as Bookmark[];
  }

  /**
   * Inserts a bookmark or updates the existing row for `entity_id`.
   *
   * On update, `status` and `last_visited` are overwritten; `note` is only
   * overwritten when a note is supplied (COALESCE keeps the stored one
   * otherwise); `created_at` is left as it was.
   *
   * @returns The row as stored after the write.
   */
  upsertBookmark(entity_id: string, status: BookmarkStatus, note?: string): Bookmark {
    const now = Date.now();
    this.db
      .prepare(
        `INSERT INTO user_bookmarks(entity_id, status, note, created_at, last_visited)
         VALUES(?, ?, ?, ?, ?)
         ON CONFLICT(entity_id) DO UPDATE SET
           status = excluded.status,
           note = COALESCE(excluded.note, user_bookmarks.note),
           last_visited = excluded.last_visited`,
      )
      .run(entity_id, status, note ?? null, now, now);
    return this.db
      .prepare("SELECT * FROM user_bookmarks WHERE entity_id = ?")
      .get(entity_id) as Bookmark;
  }

  /**
   * Deletes the bookmark for `entity_id`.
   *
   * @returns `true` when a row was removed, `false` when none existed.
   */
  deleteBookmark(entity_id: string): boolean {
    const result = this.db
      .prepare("DELETE FROM user_bookmarks WHERE entity_id = ?")
      .run(entity_id);
    return result.changes > 0;
  }

  /** Bump last_visited for a bookmark; if absent, insert as 'recent'. */
  touchVisit(entity_id: string): void {
    const now = Date.now();
    this.db
      .prepare(
        `INSERT INTO user_bookmarks(entity_id, status, note, created_at, last_visited)
         VALUES(?, 'recent', NULL, ?, ?)
         ON CONFLICT(entity_id) DO UPDATE SET last_visited = excluded.last_visited`,
      )
      .run(entity_id, now, now);
  }

  // ── Page cache ──────────────────────────────────────────

  /**
   * Looks up the cached page for an exact (entity_id, tx_watermark) pair.
   *
   * @returns The cached row, or `null` on a miss.
   */
  getPageCache(entity_id: string, tx_watermark: number): PageCacheRow | null {
    const row = this.db
      .prepare("SELECT * FROM page_cache WHERE entity_id = ? AND tx_watermark = ?")
      .get(entity_id, tx_watermark) as PageCacheRow | undefined;
    return row ?? null;
  }

  /**
   * Stores a rendered page, replacing any row with the same
   * (entity_id, tx_watermark), then prunes the cache back to its cap.
   *
   * @param row Cache row; `generated_at` defaults to `Date.now()`.
   */
  putPageCache(row: Omit<PageCacheRow, "generated_at"> & { generated_at?: number }): void {
    const generated_at = row.generated_at ?? Date.now();
    this.db
      .prepare(
        `INSERT OR REPLACE INTO page_cache
           (entity_id, tx_watermark, page_json, generated_at, tokens_in, tokens_out, cost_usd)
         VALUES (?, ?, ?, ?, ?, ?, ?)`,
      )
      .run(
        row.entity_id,
        row.tx_watermark,
        row.page_json,
        generated_at,
        row.tokens_in ?? null,
        row.tokens_out ?? null,
        row.cost_usd ?? null,
      );
    this.pruneCache();
  }

  /**
   * Keeps the newest PAGE_CACHE_LRU_CAP entries by `generated_at`.
   *
   * Reads through getPageCache never refresh `generated_at`, so eviction is
   * by render time (oldest render first) rather than by last access.
   */
  private pruneCache(): void {
    const { n: count } = this.db
      .prepare("SELECT COUNT(*) as n FROM page_cache")
      .get() as { n: number };
    if (count <= PAGE_CACHE_LRU_CAP) return;
    const overflow = count - PAGE_CACHE_LRU_CAP;
    this.db
      .prepare(
        `DELETE FROM page_cache WHERE rowid IN (
           SELECT rowid FROM page_cache ORDER BY generated_at ASC LIMIT ?
         )`,
      )
      .run(overflow);
  }

  // ── Retraction undo ─────────────────────────────────────

  /**
   * Stores an undo snapshot. `consumed_at` is not written, so the new row
   * starts out unconsumed (NULL).
   */
  putRetractionUndo(row: Omit<RetractionUndoRow, "consumed_at">): void {
    this.db
      .prepare(
        `INSERT INTO retraction_undo
           (token, fact_id, snapshot_json, retracted_tx, reason, actor, created_at, expires_at)
         VALUES (?, ?, ?, ?, ?, ?, ?, ?)`,
      )
      .run(
        row.token,
        row.fact_id,
        row.snapshot_json,
        row.retracted_tx,
        row.reason,
        row.actor,
        row.created_at,
        row.expires_at,
      );
  }

  /**
   * Fetches an undo snapshot by token.
   *
   * No expiry or consumed check is applied here; the row is returned as stored.
   *
   * @returns The row, or `null` when the token is unknown.
   */
  getRetractionUndo(token: string): RetractionUndoRow | null {
    const row = this.db
      .prepare("SELECT * FROM retraction_undo WHERE token = ?")
      .get(token) as RetractionUndoRow | undefined;
    return row ?? null;
  }

  /**
   * Marks an undo token as consumed by stamping `consumed_at`.
   *
   * Only an unconsumed row is updated: the schema treats tokens as single-use
   * and keeps the row as an audit trail, so a repeated call must not overwrite
   * the timestamp of the first consumption.
   */
  consumeRetractionUndo(token: string): void {
    this.db
      .prepare(
        "UPDATE retraction_undo SET consumed_at = ? WHERE token = ? AND consumed_at IS NULL",
      )
      .run(Date.now(), token);
  }

  /**
   * Deletes undo rows whose `expires_at` is in the past and that were never
   * consumed. Consumed rows are kept.
   *
   * @returns Number of rows deleted.
   */
  pruneExpiredUndos(): number {
    const result = this.db
      .prepare("DELETE FROM retraction_undo WHERE expires_at < ? AND consumed_at IS NULL")
      .run(Date.now());
    return result.changes;
  }

  /** Appends one entry to `retraction_log`, timestamped with `Date.now()`. */
  logRetraction(fact_id: string, reason: string | null, actor: string | null, source_entity: string | null): void {
    this.db
      .prepare(
        `INSERT INTO retraction_log(ts, fact_id, reason, actor, source_entity)
         VALUES (?, ?, ?, ?, ?)`,
      )
      .run(Date.now(), fact_id, reason, actor, source_entity);
  }

  /**
   * Stamps `undone_at` on the most recent log entry for `fact_id` that has
   * not been undone yet. Does nothing when no such entry exists.
   */
  markRetractionUndone(fact_id: string): void {
    this.db
      .prepare(
        `UPDATE retraction_log SET undone_at = ?
         WHERE rowid = (
           SELECT rowid FROM retraction_log
           WHERE fact_id = ? AND undone_at IS NULL
           ORDER BY ts DESC LIMIT 1
         )`,
      )
      .run(Date.now(), fact_id);
  }

  // ── Concept imports ─────────────────────────────────────

  /**
   * Records one concept import. `row.id` is ignored; SQLite assigns the id.
   *
   * @returns The id of the inserted row.
   */
  recordConceptImport(row: ConceptImportRow): number {
    const result = this.db
      .prepare(
        `INSERT INTO concept_imports
           (imported_at, root_entity, source_tool, source_version, envelope_format,
            bundle_sha256, conflict_mode, triples_count, redactions_seen, notes)
         VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`,
      )
      .run(
        row.imported_at,
        row.root_entity,
        row.source_tool,
        row.source_version,
        row.envelope_format,
        row.bundle_sha256,
        row.conflict_mode,
        row.triples_count,
        row.redactions_seen,
        row.notes,
      );
    return Number(result.lastInsertRowid);
  }

  /**
   * Finds the most recent import (by `imported_at`) of a bundle with the
   * given hash.
   *
   * @returns The row, or `null` when that bundle was never imported.
   */
  findImportBySha(bundle_sha256: string): ConceptImportRow | null {
    const row = this.db
      .prepare("SELECT * FROM concept_imports WHERE bundle_sha256 = ? ORDER BY imported_at DESC LIMIT 1")
      .get(bundle_sha256) as ConceptImportRow | undefined;
    return row ?? null;
  }

  // ── Search log ──────────────────────────────────────────

  /** Appends one entry to `search_log`, timestamped with `Date.now()`. */
  logSearch(query: string, resolved_to: string | null, result_count: number): void {
    this.db
      .prepare(
        "INSERT INTO search_log(ts, query, resolved_to, result_count) VALUES (?, ?, ?, ?)",
      )
      .run(Date.now(), query, resolved_to, result_count);
  }
}
@@ -0,0 +1,310 @@
1
+ #!/usr/bin/env python3
2
+ """Concept Export — package an entity + its neighborhood as a portable bundle.
3
+
4
+ Produces a sinain-concept/v1 envelope that can be transferred to another
5
+ machine and re-imported with concept_import.py to reconstruct the same
6
+ entity page (including the LLM-rendered view, if bundled).
7
+
8
+ The reproducibility invariant we honor:
9
+ "On a new machine: import the bundle → open the same URL → see the same page."
10
+
11
+ For that to hold:
12
+ 1. Entity IDs are content-addressed slugs → stable across machines.
13
+ 2. Triples are exported verbatim (created_at, retracted) for round-trip.
14
+ 3. Optionally bundle the rendered_page JSON so the receiver gets a
15
+ cache hit on first view (deterministic visual identity).
16
+
17
+ We do NOT bundle embeddings — same model on both ends → same vectors,
18
+ so receiver recomputes for ~1.5KB/fact saved.
19
+
20
+ Usage:
21
+ python3 concept_export.py --db <kg.db> --root entity:foo \\
22
+ [--depth 1] [--include-retracted] [--include-page] [--web-db <web.db>] \\
23
+ [--redact private,creditcard,apikey,...]
24
+ """
25
+ from __future__ import annotations
26
+
27
+ import argparse
28
+ import hashlib
29
+ import json
30
+ import re
31
+ import sys
32
+ import time
33
+ from pathlib import Path
34
+
35
+ # ---------------------------------------------------------------------------
36
+ # Redaction (MIRROR OF sense_client/privacy.py — keep patterns in sync until
37
+ # we extract a shared sinain-memory/redaction.py module).
38
+ # ---------------------------------------------------------------------------
39
# Label for the rule set below; export_concept copies it into the envelope's
# redactions.rules_version field.
# NOTE(review): the privkey rule here now covers the whole PEM block, which
# goes beyond the header-only pattern this file originally mirrored — apply the
# same change to sense_client/privacy.py and decide whether to bump this label.
REDACT_RULES_VERSION = "1.2"

_REDACT_PATTERNS: list[tuple[re.Pattern, str, str]] = [
    # (regex, replacement, rule-name)
    (re.compile(r"\b\d{4}[\s-]?\d{4}[\s-]?\d{4}[\s-]?\d{4}\b"), "[REDACTED:card]", "creditcard"),
    (re.compile(r"\b(?:sk-|pk-|api[_-]?key[=:]\s*)[A-Za-z0-9_\-]{20,}\b"), "[REDACTED:apikey]", "apikey"),
    (re.compile(r"Bearer\s+[A-Za-z0-9_\-\.]{20,}"), "[REDACTED:bearer]", "bearer"),
    (re.compile(r"\b(?:AKIA|ASIA)[A-Z0-9]{16}\b"), "[REDACTED:awskey]", "awskey"),
    (re.compile(r"(?:password|passwd|pwd)\s*[:=]\s*\S+", re.IGNORECASE), "[REDACTED:password]", "password"),
    (re.compile(r"\bghp_[A-Za-z0-9]{36}\b"), "[REDACTED:github_pat]", "github_pat"),
    (re.compile(r"\bghs_[A-Za-z0-9]{36}\b"), "[REDACTED:github_srv]", "github_srv"),
    (re.compile(r"\bxox[bpoa]-[0-9A-Za-z\-]+"), "[REDACTED:slack]", "slack"),
    (re.compile(r"\bya29\.[0-9A-Za-z\-_]+"), "[REDACTED:google_oauth]", "google_oauth"),
    (re.compile(r"\beyJ[A-Za-z0-9\-_]+\.[A-Za-z0-9\-_]+\.[A-Za-z0-9\-_]+"), "[REDACTED:jwt]", "jwt"),
    (re.compile(r"(?:secret|token|key)\s*[:=]\s*[A-Za-z0-9_\-\.]{10,}", re.IGNORECASE), "[REDACTED:secret]", "secret"),
    (re.compile(r"\b\d{3}-\d{2}-\d{4}\b"), "[REDACTED:ssn]", "ssn"),
    # Redact the whole PEM block, not just its header: the optional lazy group
    # runs through the first END marker when one follows, and the match falls
    # back to the header alone when the text has no END marker.
    (re.compile(
        r"-----BEGIN (?:RSA |EC |OPENSSH )?PRIVATE KEY-----"
        r"(?:.*?-----END (?:RSA |EC |OPENSSH )?PRIVATE KEY-----)?",
        re.DOTALL,
    ), "[REDACTED:privkey]", "privkey"),
]
# <private>…</private> spans, possibly multi-line (DOTALL), matched lazily so
# adjacent spans are redacted separately.
_PRIVATE_TAG = re.compile(r"<private>.*?</private>", re.DOTALL)


def apply_redactions(text: str, enabled_rules: set[str]) -> tuple[str, list[str]]:
    """Run the enabled redaction rules over *text*.

    The "private" rule (if enabled) runs first, then the entries of
    ``_REDACT_PATTERNS`` in list order; each rule sees the output of the
    previous one.

    Args:
        text: Input string to scrub.
        enabled_rules: Rule names to apply; names that match no rule are ignored.

    Returns:
        ``(redacted_text, applied)`` where ``applied`` lists, in application
        order, the rules that replaced at least one match.
    """
    applied: list[str] = []
    if "private" in enabled_rules:
        text, hits = _PRIVATE_TAG.subn("[REDACTED:private]", text)
        if hits:
            applied.append("private")
    for pattern, replacement, name in _REDACT_PATTERNS:
        if name not in enabled_rules:
            continue
        text, hits = pattern.subn(replacement, text)
        if hits:
            applied.append(name)
    return text, applied
75
+
76
+
77
+ # ---------------------------------------------------------------------------
78
+ # Export
79
+ # ---------------------------------------------------------------------------
80
+
81
def collect_neighborhood(store, root_entity: str, depth: int,
                         include_retracted: bool) -> list[str]:
    """Breadth-first walk from *root_entity* over 'ref' triples, in both directions.

    Outgoing: triples WHERE entity_id = X AND value_type = 'ref' → visit value.
    Incoming: triples WHERE value = X AND value_type = 'ref' → visit entity_id.

    Args:
        store: Object exposing ``_conn``, a connection whose rows can be
            indexed by column name.
        root_entity: Entity id to start from; always part of the result.
        depth: Maximum number of hops from the root; entities at that distance
            are included but not expanded further.
        include_retracted: When False, triples with ``retracted != 0`` are
            not followed.

    Returns:
        Sorted list of every entity id reached (sorted so that downstream
        checksums are stable).
    """
    from collections import deque

    retracted_filter = "" if include_retracted else "AND retracted = 0"
    # One (query, column-to-read) pair per traversal direction; outgoing is
    # queried first for each entity.
    directions = (
        (f"""SELECT DISTINCT value FROM triples
             WHERE entity_id = ? AND value_type = 'ref' {retracted_filter}""", "value"),
        (f"""SELECT DISTINCT entity_id FROM triples
             WHERE value = ? AND value_type = 'ref' {retracted_filter}""", "entity_id"),
    )

    visited: set[str] = {root_entity}
    # deque gives O(1) pops from the front; list.pop(0) is O(n) per pop.
    queue = deque([(root_entity, 0)])

    while queue:
        eid, d = queue.popleft()
        if d >= depth:
            continue
        for sql, column in directions:
            for row in store._conn.execute(sql, (eid,)).fetchall():
                ref = row[column]
                if ref and ref not in visited:
                    visited.add(ref)
                    queue.append((ref, d + 1))

    return sorted(visited)  # deterministic ordering for stable checksum
124
+
125
+
126
def serialize_entity(store, entity_id: str, include_retracted: bool,
                     redact_rules: set[str]) -> tuple[dict, list[str], int]:
    """Export every triple of *entity_id*, scrubbing string values on the way.

    Triples are read ordered by (tx_id, attribute, value). A value is passed
    through ``apply_redactions`` only when its value_type is "string", it is
    non-empty, and *redact_rules* is non-empty.

    Returns:
        ``(entity, rules_hit, n_redacted)`` where ``entity`` is
        ``{"id", "type", "triples"}`` ("type" is the text before the first
        ":" of the id, or "unknown" when the id has no ":"), ``rules_hit``
        concatenates the rule names applied per value (duplicates kept), and
        ``n_redacted`` counts the values that were changed.
    """
    retracted_clause = "AND retracted = 0" if not include_retracted else ""
    cursor = store._conn.execute(
        f"""SELECT attribute, value, value_type, tx_id, created_at, retracted, valid_to
            FROM triples WHERE entity_id = ? {retracted_clause}
            ORDER BY tx_id, attribute, value""",
        (entity_id,),
    )

    exported: list[dict] = []
    rules_hit: list[str] = []
    n_redacted = 0
    for row in cursor.fetchall():
        value = row["value"]
        needs_scrub = row["value_type"] == "string" and redact_rules and value
        if needs_scrub:
            scrubbed, hits = apply_redactions(value, redact_rules)
            if hits:
                rules_hit += hits
                n_redacted += 1
                value = scrubbed
        exported.append(dict(
            attribute=row["attribute"],
            value=value,
            value_type=row["value_type"],
            tx_id=row["tx_id"],
            created_at=row["created_at"],
            retracted=int(row["retracted"]),
            valid_to=row["valid_to"],
        ))

    head, colon, _rest = entity_id.partition(":")
    entity = {
        "id": entity_id,
        "type": head if colon else "unknown",
        "triples": exported,
    }
    return entity, rules_hit, n_redacted
164
+
165
+
166
def fetch_cached_page(web_db_path: str, root_entity: str,
                      redact_rules: set[str]) -> dict | None:
    """Return the most recently generated cached page for *root_entity*, if any.

    The page JSON is decoded and, when *redact_rules* is non-empty, its
    "summary" and every section bullet's "text" are passed through
    ``apply_redactions`` — the LLM may have woven sensitive content into its
    synthesis. ``generated_at`` and (when truthy) ``tokens_in`` /
    ``tokens_out`` from the cache row are then attached to the page.

    This is best-effort: a missing file, a missing row, or any error while
    reading or decoding yields ``None`` (errors are reported on stderr).
    """
    import sqlite3
    if not Path(web_db_path).exists():
        return None
    try:
        conn = sqlite3.connect(web_db_path)
        try:
            conn.row_factory = sqlite3.Row
            row = conn.execute(
                """SELECT page_json, generated_at, tokens_in, tokens_out, cost_usd
                   FROM page_cache WHERE entity_id = ?
                   ORDER BY generated_at DESC LIMIT 1""",
                (root_entity,),
            ).fetchone()
        finally:
            # Close even when the query raises (e.g. page_cache table missing),
            # so a failed lookup does not leave the database handle open.
            conn.close()
        if not row:
            return None
        page = json.loads(row["page_json"])
        # Redact rendered_page content.
        if redact_rules:
            if page.get("summary"):
                page["summary"], _ = apply_redactions(page["summary"], redact_rules)
            for sec in page.get("sections", []) or []:
                for b in sec.get("bullets", []) or []:
                    if b.get("text"):
                        b["text"], _ = apply_redactions(b["text"], redact_rules)
        page["generated_at"] = row["generated_at"]
        page.setdefault("rendered_with", {})
        if row["tokens_in"]:
            page["rendered_with"]["tokens_in"] = row["tokens_in"]
        if row["tokens_out"]:
            page["rendered_with"]["tokens_out"] = row["tokens_out"]
        return page
    except Exception as e:
        sys.stderr.write(f"fetch_cached_page failed: {e}\n")
        return None
206
+
207
+
208
def export_concept(db_path: str, root_entity: str, depth: int = 1,
                   include_retracted: bool = False,
                   include_page: bool = True,
                   web_db_path: str | None = None,
                   redact_rules: set[str] | None = None) -> dict:
    """Build a sinain-concept/v1 envelope for *root_entity* and its neighborhood.

    Args:
        db_path: Path handed to ``TripleStore``.
        root_entity: Entity id the export is rooted at.
        depth: Hop limit passed to ``collect_neighborhood``.
        include_retracted: Whether retracted triples are followed and exported.
        include_page: Bundle the cached rendered page; only takes effect when
            *web_db_path* is also given.
        web_db_path: Path to web.db for the page-cache lookup.
        redact_rules: Redaction rule names; ``None`` selects the default set
            (private, creditcard, apikey, bearer, awskey, password, secret).

    Returns:
        The envelope dict, including a ``checksum`` computed over the
        canonical JSON of ``root_entity`` + ``entities`` only (the rendered
        page and metadata are not covered by it).
    """
    from triplestore import TripleStore

    if redact_rules is None:
        redact_rules = {"private", "creditcard", "apikey", "bearer", "awskey",
                        "password", "secret"}

    entities = []
    all_applied: list[str] = []
    total_redacted = 0
    triple_count = 0
    fact_count = 0

    store = TripleStore(db_path)
    try:
        entity_ids = collect_neighborhood(store, root_entity, depth, include_retracted)
        for eid in entity_ids:
            entity_obj, applied, n_redacted = serialize_entity(
                store, eid, include_retracted, redact_rules,
            )
            entities.append(entity_obj)
            all_applied.extend(applied)
            total_redacted += n_redacted
            triple_count += len(entity_obj["triples"])
            if eid.startswith("fact:"):
                fact_count += 1
    finally:
        # Release the store even when collection or serialization raises.
        store.close()

    rendered_page = None
    if include_page and web_db_path:
        rendered_page = fetch_cached_page(web_db_path, root_entity, redact_rules)

    envelope = {
        "format": "sinain-concept/v1",
        "exported_at": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
        "exporter": {
            "tool": "sinain-core",
            # NOTE(review): version, schema and model labels are hard-coded
            # here — confirm they still match the shipping release.
            "tool_version": "1.14.0",
            "schema_version": "triplestore/v3",
            "embedding_model": "all-MiniLM-L6-v2",
        },
        "root_entity": root_entity,
        "depth": depth,
        "stats": {
            "entities": len(entity_ids),
            "facts": fact_count,
            "triples": triple_count,
        },
        "entities": entities,
        "rendered_page": rendered_page,
        "redactions": {
            "applied": sorted(set(all_applied)),
            "rules_version": REDACT_RULES_VERSION,
            "redacted_count": total_redacted,
        },
    }

    # Compute checksum over canonical JSON of (root_entity + entities) — this is
    # what the receiver should validate against on import.
    canonical = json.dumps(
        {"root_entity": root_entity, "entities": entities},
        sort_keys=True, ensure_ascii=False, separators=(",", ":"),
    )
    envelope["checksum"] = "sha256:" + hashlib.sha256(canonical.encode("utf-8")).hexdigest()
    return envelope
278
+
279
+
280
def main() -> None:
    """CLI entry point: parse arguments, run the export, print the envelope as JSON.

    When the --db path does not exist, a JSON object with an "error" key is
    printed to stdout and the process exits with status 1.
    """
    cli = argparse.ArgumentParser(description="Concept Export")
    cli.add_argument("--db", required=True)
    cli.add_argument("--root", required=True, help="Root entity id (e.g. entity:citibank)")
    cli.add_argument("--depth", type=int, default=1)
    cli.add_argument("--include-retracted", action="store_true")
    cli.add_argument("--include-page", action="store_true",
                     help="Bundle the cached rendered_page if available")
    cli.add_argument("--web-db", default=None,
                     help="Path to web.db for page cache lookup")
    cli.add_argument("--redact", default="private,creditcard,apikey,bearer,awskey,password,secret",
                     help="Comma-separated redaction rule names")
    opts = cli.parse_args()

    if not Path(opts.db).exists():
        print(json.dumps({"error": f"db not found: {opts.db}"}))
        sys.exit(1)

    # Split the comma-separated list, trimming whitespace and dropping empties.
    rule_names = set()
    for piece in opts.redact.split(","):
        piece = piece.strip()
        if piece:
            rule_names.add(piece)

    bundle = export_concept(
        opts.db,
        opts.root,
        depth=opts.depth,
        include_retracted=opts.include_retracted,
        include_page=opts.include_page,
        web_db_path=opts.web_db,
        redact_rules=rule_names,
    )
    print(json.dumps(bundle, ensure_ascii=False))
307
+
308
+
309
+ if __name__ == "__main__":
310
+ main()