@geravant/sinain 1.19.0 → 1.20.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/sinain-core/package-lock.json +439 -0
- package/sinain-core/package.json +2 -0
- package/sinain-core/src/index.ts +283 -0
- package/sinain-core/src/server.ts +999 -2
- package/sinain-core/src/web-db/schema.ts +100 -0
- package/sinain-core/src/web-db/store.ts +279 -0
- package/sinain-memory/concept_export.py +310 -0
- package/sinain-memory/concept_import.py +254 -0
- package/sinain-memory/graph_query.py +455 -0
- package/sinain-memory/page_renderer.py +447 -0
- package/sinain-memory/retract.py +236 -0
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
/**
 * Schema for ~/.sinain/memory/web.db — UI metadata storage.
 *
 * Kept separate from triplestore (knowledge-graph.db) because triples are
 * claims about the world (with confidence, retraction, bi-temporal validity),
 * whereas this DB stores UI state, page caches, and undo tokens — metadata
 * that should not be visible to the curator/distiller.
 *
 * Every statement below uses IF NOT EXISTS, so the whole script can be
 * executed on each open without a migration step.
 */

/** Schema version recorded under the 'version' key of schema_meta. */
export const SCHEMA_VERSION = 1;

/** Idempotent DDL for all web.db tables and indexes. */
export const SCHEMA_SQL = `
-- Schema version tracking (for future migrations)
CREATE TABLE IF NOT EXISTS schema_meta (
  key TEXT PRIMARY KEY,
  value TEXT NOT NULL
);

-- User bookmarks (favorite / archive / recent)
CREATE TABLE IF NOT EXISTS user_bookmarks (
  entity_id TEXT PRIMARY KEY,
  status TEXT NOT NULL CHECK (status IN ('favorite','archive','recent')),
  note TEXT,
  created_at INTEGER NOT NULL,
  last_visited INTEGER NOT NULL
);
CREATE INDEX IF NOT EXISTS idx_bookmarks_status_visited
  ON user_bookmarks(status, last_visited DESC);

-- LLM-rendered page cache. Key: (entity, max tx_id of facts that fed the render)
-- Implicit invalidation: new facts → tx advances → cache miss → regenerate.
-- Old entries kept for bi-temporal "view as of" support.
CREATE TABLE IF NOT EXISTS page_cache (
  entity_id TEXT NOT NULL,
  tx_watermark INTEGER NOT NULL,
  page_json TEXT NOT NULL,
  generated_at INTEGER NOT NULL,
  tokens_in INTEGER,
  tokens_out INTEGER,
  cost_usd REAL,
  PRIMARY KEY (entity_id, tx_watermark)
);
CREATE INDEX IF NOT EXISTS idx_page_cache_entity
  ON page_cache(entity_id, generated_at DESC);

-- Undo snapshots for fact retraction. 10-minute server-side window.
-- Single-use: consumed_at set when restored; row not deleted (audit trail).
CREATE TABLE IF NOT EXISTS retraction_undo (
  token TEXT PRIMARY KEY,
  fact_id TEXT NOT NULL,
  snapshot_json TEXT NOT NULL,
  retracted_tx INTEGER NOT NULL,
  reason TEXT,
  actor TEXT,
  created_at INTEGER NOT NULL,
  expires_at INTEGER NOT NULL,
  consumed_at INTEGER
);
CREATE INDEX IF NOT EXISTS idx_retraction_undo_expires
  ON retraction_undo(expires_at);

-- Audit log of every retraction (kept forever, feeds eval reports).
CREATE TABLE IF NOT EXISTS retraction_log (
  ts INTEGER NOT NULL,
  fact_id TEXT NOT NULL,
  reason TEXT,
  actor TEXT,
  undone_at INTEGER,
  source_entity TEXT
);
CREATE INDEX IF NOT EXISTS idx_retraction_log_ts ON retraction_log(ts DESC);
-- Marking a retraction undone looks up the newest row for one fact_id; since
-- this table only grows, index that lookup instead of scanning the whole log.
CREATE INDEX IF NOT EXISTS idx_retraction_log_fact
  ON retraction_log(fact_id, ts DESC);

-- Imported concepts — provenance + idempotency check via bundle_sha256.
CREATE TABLE IF NOT EXISTS concept_imports (
  id INTEGER PRIMARY KEY AUTOINCREMENT,
  imported_at INTEGER NOT NULL,
  root_entity TEXT NOT NULL,
  source_tool TEXT,
  source_version TEXT,
  envelope_format TEXT NOT NULL,
  bundle_sha256 TEXT NOT NULL,
  conflict_mode TEXT NOT NULL,
  triples_count INTEGER,
  redactions_seen TEXT,
  notes TEXT
);
CREATE INDEX IF NOT EXISTS idx_concept_imports_root
  ON concept_imports(root_entity, imported_at DESC);
CREATE INDEX IF NOT EXISTS idx_concept_imports_sha
  ON concept_imports(bundle_sha256);

-- Search log for telemetry / "what did the user search for that didn't resolve."
CREATE TABLE IF NOT EXISTS search_log (
  ts INTEGER NOT NULL,
  query TEXT NOT NULL,
  resolved_to TEXT,
  result_count INTEGER
);
CREATE INDEX IF NOT EXISTS idx_search_log_ts ON search_log(ts DESC);
`;
|
|
@@ -0,0 +1,279 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Typed accessor for ~/.sinain/memory/web.db.
|
|
3
|
+
*
|
|
4
|
+
* One module owns all SQL — keeps query strings out of HTTP handlers and
|
|
5
|
+
* lets us swap out better-sqlite3 later if needed.
|
|
6
|
+
*/
|
|
7
|
+
import Database from "better-sqlite3";
|
|
8
|
+
import { existsSync, mkdirSync } from "node:fs";
|
|
9
|
+
import { dirname } from "node:path";
|
|
10
|
+
import { SCHEMA_SQL, SCHEMA_VERSION } from "./schema.js";
|
|
11
|
+
import { log } from "../log.js";
|
|
12
|
+
|
|
13
|
+
const TAG = "web-db";
|
|
14
|
+
|
|
15
|
+
/** Allowed values of user_bookmarks.status (mirrors the table's CHECK constraint). */
export type BookmarkStatus = "favorite" | "archive" | "recent";

/** One row of the user_bookmarks table. */
export interface Bookmark {
  /** Primary key: the bookmarked entity. */
  entity_id: string;
  /** Bookmark category. */
  status: BookmarkStatus;
  /** Optional free-text note; null when none was stored. */
  note: string | null;
  /** Creation timestamp (WebDb writes Date.now(), i.e. epoch milliseconds). */
  created_at: number;
  /** Last-visit timestamp (WebDb writes Date.now(), i.e. epoch milliseconds). */
  last_visited: number;
}
|
|
24
|
+
|
|
25
|
+
/** One row of the page_cache table, keyed by (entity_id, tx_watermark). */
export interface PageCacheRow {
  /** Entity the cached page was rendered for. */
  entity_id: string;
  /** Max tx_id of the facts that fed the render (second half of the key). */
  tx_watermark: number;
  /** Serialized rendered page, stored as JSON text. */
  page_json: string;
  /** When the page was generated; WebDb defaults this to Date.now(). */
  generated_at: number;
  /** Input token count for the render, if recorded. */
  tokens_in: number | null;
  /** Output token count for the render, if recorded. */
  tokens_out: number | null;
  /** Render cost in USD, if recorded. */
  cost_usd: number | null;
}
|
|
34
|
+
|
|
35
|
+
/** One row of the retraction_undo table: a single-use undo snapshot. */
export interface RetractionUndoRow {
  /** Primary key handed out to the caller to perform the undo. */
  token: string;
  /** Fact the retraction applied to. */
  fact_id: string;
  /** Snapshot of the retracted fact, stored as JSON text. */
  snapshot_json: string;
  /** Transaction id recorded for the retraction. */
  retracted_tx: number;
  /** Optional reason supplied with the retraction. */
  reason: string | null;
  /** Optional identifier of who performed the retraction. */
  actor: string | null;
  /** Creation timestamp, supplied by the caller. */
  created_at: number;
  /** Expiry timestamp, supplied by the caller; compared against Date.now() when pruning. */
  expires_at: number;
  /** Set to Date.now() when the token is consumed; null while unused. */
  consumed_at: number | null;
}
|
|
46
|
+
|
|
47
|
+
/** One row of the concept_imports table: provenance of an imported bundle. */
export interface ConceptImportRow {
  /** AUTOINCREMENT primary key; absent before the row is inserted. */
  id?: number;
  /** Import timestamp, supplied by the caller. */
  imported_at: number;
  /** Root entity of the imported bundle. */
  root_entity: string;
  /** Name of the tool that produced the bundle, if known. */
  source_tool: string | null;
  /** Version of the producing tool, if known. */
  source_version: string | null;
  /** Envelope format identifier of the bundle. */
  envelope_format: string;
  /** SHA-256 of the bundle; used to look up earlier imports of the same bundle. */
  bundle_sha256: string;
  /** Conflict-resolution mode that was used for the import. */
  conflict_mode: string;
  /** Number of triples in the import, if recorded. */
  triples_count: number | null;
  /** Redaction information carried by the bundle, stored as text. */
  redactions_seen: string | null;
  /** Free-form notes. */
  notes: string | null;
}
|
|
60
|
+
|
|
61
|
+
const PAGE_CACHE_LRU_CAP = 500;
|
|
62
|
+
|
|
63
|
+
/**
 * Synchronous accessor over the web.db SQLite file (better-sqlite3).
 *
 * All SQL for bookmarks, the rendered-page cache, retraction undo/audit rows,
 * concept-import provenance and the search log lives in this class.
 */
export class WebDb {
  private db: Database.Database;

  /**
   * Open (creating if necessary) the database at `dbPath`, apply SCHEMA_SQL
   * and record SCHEMA_VERSION under schema_meta['version'].
   */
  constructor(dbPath: string) {
    if (!existsSync(dirname(dbPath))) {
      mkdirSync(dirname(dbPath), { recursive: true });
    }
    this.db = new Database(dbPath);
    this.db.pragma("journal_mode = WAL");
    this.db.pragma("foreign_keys = ON");
    this.db.exec(SCHEMA_SQL);
    this.db
      .prepare(
        "INSERT INTO schema_meta(key, value) VALUES('version', ?) ON CONFLICT(key) DO UPDATE SET value=excluded.value",
      )
      .run(String(SCHEMA_VERSION));
    log(TAG, `web.db ready at ${dbPath} (schema v${SCHEMA_VERSION})`);
  }

  /** Close the underlying database handle. */
  close(): void {
    this.db.close();
  }

  // ── Bookmarks ───────────────────────────────────────────

  /**
   * List bookmarks, most recently visited first.
   *
   * @param status optional filter; when omitted all statuses are returned
   * @param limit  maximum number of rows (default 100)
   */
  listBookmarks(status?: BookmarkStatus, limit = 100): Bookmark[] {
    const sql = status
      ? "SELECT * FROM user_bookmarks WHERE status = ? ORDER BY last_visited DESC LIMIT ?"
      : "SELECT * FROM user_bookmarks ORDER BY last_visited DESC LIMIT ?";
    const args = status ? [status, limit] : [limit];
    return this.db.prepare(sql).all(...args) as Bookmark[];
  }

  /**
   * Insert a bookmark or update an existing one, then return the stored row.
   *
   * On update, `created_at` is left untouched and an omitted `note` keeps the
   * previously stored note (COALESCE), so a note cannot be cleared via this call.
   */
  upsertBookmark(entity_id: string, status: BookmarkStatus, note?: string): Bookmark {
    const now = Date.now();
    this.db
      .prepare(
        `INSERT INTO user_bookmarks(entity_id, status, note, created_at, last_visited)
         VALUES(?, ?, ?, ?, ?)
         ON CONFLICT(entity_id) DO UPDATE SET
           status = excluded.status,
           note = COALESCE(excluded.note, user_bookmarks.note),
           last_visited = excluded.last_visited`,
      )
      .run(entity_id, status, note ?? null, now, now);
    return this.db
      .prepare("SELECT * FROM user_bookmarks WHERE entity_id = ?")
      .get(entity_id) as Bookmark;
  }

  /** Delete a bookmark; returns true when a row was actually removed. */
  deleteBookmark(entity_id: string): boolean {
    const r = this.db.prepare("DELETE FROM user_bookmarks WHERE entity_id = ?").run(entity_id);
    return r.changes > 0;
  }

  /** Bump last_visited for a bookmark; if absent, insert as 'recent'. */
  touchVisit(entity_id: string): void {
    const now = Date.now();
    this.db
      .prepare(
        `INSERT INTO user_bookmarks(entity_id, status, note, created_at, last_visited)
         VALUES(?, 'recent', NULL, ?, ?)
         ON CONFLICT(entity_id) DO UPDATE SET last_visited = excluded.last_visited`,
      )
      .run(entity_id, now, now);
  }

  // ── Page cache ──────────────────────────────────────────

  /** Exact-key cache lookup; returns null on a miss. */
  getPageCache(entity_id: string, tx_watermark: number): PageCacheRow | null {
    const row = this.db
      .prepare(
        "SELECT * FROM page_cache WHERE entity_id = ? AND tx_watermark = ?",
      )
      .get(entity_id, tx_watermark) as PageCacheRow | undefined;
    return row ?? null;
  }

  /**
   * Store (or replace) a cache entry, then prune the table back to the cap.
   * `generated_at` defaults to the current time when not supplied.
   */
  putPageCache(row: Omit<PageCacheRow, "generated_at"> & { generated_at?: number }): void {
    const generated_at = row.generated_at ?? Date.now();
    this.db
      .prepare(
        `INSERT OR REPLACE INTO page_cache
         (entity_id, tx_watermark, page_json, generated_at, tokens_in, tokens_out, cost_usd)
         VALUES (?, ?, ?, ?, ?, ?, ?)`,
      )
      .run(
        row.entity_id,
        row.tx_watermark,
        row.page_json,
        generated_at,
        row.tokens_in ?? null,
        row.tokens_out ?? null,
        row.cost_usd ?? null,
      );
    this.pruneCache();
  }

  /**
   * Size cap: keep the newest PAGE_CACHE_LRU_CAP entries by generated_at.
   *
   * Despite the constant's name this evicts by generation time, not by last
   * read — reads do not refresh an entry.
   */
  private pruneCache(): void {
    const count = (this.db.prepare("SELECT COUNT(*) as n FROM page_cache").get() as { n: number }).n;
    if (count <= PAGE_CACHE_LRU_CAP) return;
    const overflow = count - PAGE_CACHE_LRU_CAP;
    this.db
      .prepare(
        `DELETE FROM page_cache WHERE rowid IN (
           SELECT rowid FROM page_cache ORDER BY generated_at ASC LIMIT ?
         )`,
      )
      .run(overflow);
  }

  // ── Retraction undo ─────────────────────────────────────

  /** Insert a new undo snapshot; consumed_at starts out NULL. */
  putRetractionUndo(row: Omit<RetractionUndoRow, "consumed_at">): void {
    this.db
      .prepare(
        `INSERT INTO retraction_undo
         (token, fact_id, snapshot_json, retracted_tx, reason, actor, created_at, expires_at)
         VALUES (?, ?, ?, ?, ?, ?, ?, ?)`,
      )
      .run(
        row.token,
        row.fact_id,
        row.snapshot_json,
        row.retracted_tx,
        row.reason,
        row.actor,
        row.created_at,
        row.expires_at,
      );
  }

  /**
   * Fetch an undo snapshot by token, or null if unknown.
   * Expiry and consumed state are not checked here; callers inspect the row.
   */
  getRetractionUndo(token: string): RetractionUndoRow | null {
    const row = this.db
      .prepare("SELECT * FROM retraction_undo WHERE token = ?")
      .get(token) as RetractionUndoRow | undefined;
    return row ?? null;
  }

  /**
   * Mark an undo token as consumed.
   *
   * Only the first call stamps consumed_at: the row is an audit trail, so a
   * repeated call must not overwrite the time the token was actually used.
   */
  consumeRetractionUndo(token: string): void {
    this.db
      .prepare("UPDATE retraction_undo SET consumed_at = ? WHERE token = ? AND consumed_at IS NULL")
      .run(Date.now(), token);
  }

  /**
   * Delete undo rows that expired without being consumed.
   * Consumed rows are kept (audit trail). Returns the number of rows removed.
   */
  pruneExpiredUndos(): number {
    const r = this.db
      .prepare("DELETE FROM retraction_undo WHERE expires_at < ? AND consumed_at IS NULL")
      .run(Date.now());
    return r.changes;
  }

  /** Append one entry to the retraction audit log, timestamped now. */
  logRetraction(fact_id: string, reason: string | null, actor: string | null, source_entity: string | null): void {
    this.db
      .prepare(
        `INSERT INTO retraction_log(ts, fact_id, reason, actor, source_entity)
         VALUES (?, ?, ?, ?, ?)`,
      )
      .run(Date.now(), fact_id, reason, actor, source_entity);
  }

  /**
   * Stamp undone_at on the most recent not-yet-undone log entry for `fact_id`.
   * No-op when no such entry exists.
   */
  markRetractionUndone(fact_id: string): void {
    this.db
      .prepare(
        `UPDATE retraction_log SET undone_at = ?
         WHERE rowid = (
           SELECT rowid FROM retraction_log
           WHERE fact_id = ? AND undone_at IS NULL
           ORDER BY ts DESC LIMIT 1
         )`,
      )
      .run(Date.now(), fact_id);
  }

  // ── Concept imports ─────────────────────────────────────

  /** Insert an import provenance row and return its generated id. */
  recordConceptImport(row: ConceptImportRow): number {
    const r = this.db
      .prepare(
        `INSERT INTO concept_imports
         (imported_at, root_entity, source_tool, source_version, envelope_format,
          bundle_sha256, conflict_mode, triples_count, redactions_seen, notes)
         VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`,
      )
      .run(
        row.imported_at,
        row.root_entity,
        row.source_tool,
        row.source_version,
        row.envelope_format,
        row.bundle_sha256,
        row.conflict_mode,
        row.triples_count,
        row.redactions_seen,
        row.notes,
      );
    return Number(r.lastInsertRowid);
  }

  /** Most recent import with the given bundle hash, or null if never imported. */
  findImportBySha(bundle_sha256: string): ConceptImportRow | null {
    const row = this.db
      .prepare("SELECT * FROM concept_imports WHERE bundle_sha256 = ? ORDER BY imported_at DESC LIMIT 1")
      .get(bundle_sha256) as ConceptImportRow | undefined;
    return row ?? null;
  }

  // ── Search log ──────────────────────────────────────────

  /** Append one search event, timestamped now. */
  logSearch(query: string, resolved_to: string | null, result_count: number): void {
    this.db
      .prepare(
        "INSERT INTO search_log(ts, query, resolved_to, result_count) VALUES (?, ?, ?, ?)",
      )
      .run(Date.now(), query, resolved_to, result_count);
  }
}
|
|
@@ -0,0 +1,310 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Concept Export — package an entity + its neighborhood as a portable bundle.
|
|
3
|
+
|
|
4
|
+
Produces a sinain-concept/v1 envelope that can be transferred to another
|
|
5
|
+
machine and re-imported with concept_import.py to reconstruct the same
|
|
6
|
+
entity page (including the LLM-rendered view, if bundled).
|
|
7
|
+
|
|
8
|
+
The reproducibility invariant we honor:
|
|
9
|
+
"On a new machine: import the bundle → open the same URL → see the same page."
|
|
10
|
+
|
|
11
|
+
For that to hold:
|
|
12
|
+
1. Entity IDs are content-addressed slugs → stable across machines.
|
|
13
|
+
2. Triples are exported verbatim (created_at, retracted) for round-trip.
|
|
14
|
+
3. Optionally bundle the rendered_page JSON so the receiver gets a
|
|
15
|
+
cache hit on first view (deterministic visual identity).
|
|
16
|
+
|
|
17
|
+
We do NOT bundle embeddings — same model on both ends → same vectors,
|
|
18
|
+
so receiver recomputes for ~1.5KB/fact saved.
|
|
19
|
+
|
|
20
|
+
Usage:
|
|
21
|
+
python3 concept_export.py --db <kg.db> --root entity:foo \\
|
|
22
|
+
[--depth 1] [--include-retracted] [--include-page] [--web-db <web.db>] \\
|
|
23
|
+
[--redact private,creditcard,apikey,...]
|
|
24
|
+
"""
|
|
25
|
+
from __future__ import annotations
|
|
26
|
+
|
|
27
|
+
import argparse
|
|
28
|
+
import hashlib
|
|
29
|
+
import json
|
|
30
|
+
import re
|
|
31
|
+
import sys
|
|
32
|
+
import time
|
|
33
|
+
from pathlib import Path
|
|
34
|
+
|
|
35
|
+
# ---------------------------------------------------------------------------
|
|
36
|
+
# Redaction (MIRROR OF sense_client/privacy.py — keep patterns in sync until
|
|
37
|
+
# we extract a shared sinain-memory/redaction.py module).
|
|
38
|
+
# ---------------------------------------------------------------------------
|
|
39
|
+
REDACT_RULES_VERSION = "1.2"

_REDACT_PATTERNS: list[tuple[re.Pattern, str, str]] = [
    # (regex, replacement, rule-name)
    (re.compile(r"\b\d{4}[\s-]?\d{4}[\s-]?\d{4}[\s-]?\d{4}\b"), "[REDACTED:card]", "creditcard"),
    (re.compile(r"\b(?:sk-|pk-|api[_-]?key[=:]\s*)[A-Za-z0-9_\-]{20,}\b"), "[REDACTED:apikey]", "apikey"),
    (re.compile(r"Bearer\s+[A-Za-z0-9_\-\.]{20,}"), "[REDACTED:bearer]", "bearer"),
    (re.compile(r"\b(?:AKIA|ASIA)[A-Z0-9]{16}\b"), "[REDACTED:awskey]", "awskey"),
    (re.compile(r"(?:password|passwd|pwd)\s*[:=]\s*\S+", re.IGNORECASE), "[REDACTED:password]", "password"),
    (re.compile(r"\bghp_[A-Za-z0-9]{36}\b"), "[REDACTED:github_pat]", "github_pat"),
    (re.compile(r"\bghs_[A-Za-z0-9]{36}\b"), "[REDACTED:github_srv]", "github_srv"),
    (re.compile(r"\bxox[bpoa]-[0-9A-Za-z\-]+"), "[REDACTED:slack]", "slack"),
    (re.compile(r"\bya29\.[0-9A-Za-z\-_]+"), "[REDACTED:google_oauth]", "google_oauth"),
    (re.compile(r"\beyJ[A-Za-z0-9\-_]+\.[A-Za-z0-9\-_]+\.[A-Za-z0-9\-_]+"), "[REDACTED:jwt]", "jwt"),
    (re.compile(r"(?:secret|token|key)\s*[:=]\s*[A-Za-z0-9_\-\.]{10,}", re.IGNORECASE), "[REDACTED:secret]", "secret"),
    (re.compile(r"\b\d{3}-\d{2}-\d{4}\b"), "[REDACTED:ssn]", "ssn"),
    # Redact the whole PEM block, not just its BEGIN line: the key material is
    # the base64 body that follows. The END part is optional so a truncated
    # block still has at least its header replaced (the previous behaviour).
    # NOTE(review): the header comment says these patterns mirror
    # sense_client/privacy.py — apply the same change there and consider
    # bumping REDACT_RULES_VERSION.
    (re.compile(
        r"-----BEGIN (?:RSA |EC |OPENSSH )?PRIVATE KEY-----"
        r"(?:.*?-----END (?:RSA |EC |OPENSSH )?PRIVATE KEY-----)?",
        re.DOTALL,
    ), "[REDACTED:privkey]", "privkey"),
]
_PRIVATE_TAG = re.compile(r"<private>.*?</private>", re.DOTALL)


def apply_redactions(text: str, enabled_rules: set[str]) -> tuple[str, list[str]]:
    """Run the enabled redaction rules over *text*.

    The ``private`` rule (``<private>…</private>`` spans) runs first, followed
    by the entries of ``_REDACT_PATTERNS`` in list order; each rule sees the
    output of the previous one.

    Args:
        text: The string to scrub.
        enabled_rules: Rule names to apply; names that match no rule are ignored.

    Returns:
        ``(redacted_text, applied)`` where ``applied`` lists, in application
        order, the rules that actually changed the text.
    """
    applied: list[str] = []

    # Treat the <private> tag as just another rule, evaluated before the list.
    rules = [(_PRIVATE_TAG, "[REDACTED:private]", "private"), *_REDACT_PATTERNS]
    for pattern, replacement, name in rules:
        if name not in enabled_rules:
            continue
        scrubbed = pattern.sub(replacement, text)
        if scrubbed != text:
            applied.append(name)
            text = scrubbed
    return text, applied
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
# ---------------------------------------------------------------------------
|
|
78
|
+
# Export
|
|
79
|
+
# ---------------------------------------------------------------------------
|
|
80
|
+
|
|
81
|
+
def collect_neighborhood(store, root_entity: str, depth: int,
                         include_retracted: bool) -> list[str]:
    """BFS from root, following both incoming and outgoing refs.

    Outgoing: triples WHERE entity_id = X AND value_type = 'ref' → recurse to value.
    Incoming: triples WHERE value = X AND value_type = 'ref' → recurse to entity_id.

    Args:
        store: Object exposing ``_conn``, a connection whose rows can be
            indexed by column name.
        root_entity: Entity id to start from; always part of the result.
        depth: Maximum number of hops from the root; ``0`` yields only the root.
        include_retracted: When false, triples with ``retracted != 0`` are
            not followed.

    Returns deterministically-ordered list of entity_ids reachable within depth.
    """
    from collections import deque  # O(1) popleft instead of list.pop(0)

    visited: dict[str, int] = {root_entity: 0}
    queue = deque([(root_entity, 0)])
    # Constant SQL fragment chosen from a boolean — never user input.
    retracted_filter = "" if include_retracted else "AND retracted = 0"

    # (query, column holding the neighbour id); outgoing first, then incoming.
    edge_queries = (
        (f"""SELECT DISTINCT value FROM triples
             WHERE entity_id = ? AND value_type = 'ref' {retracted_filter}""", "value"),
        (f"""SELECT DISTINCT entity_id FROM triples
             WHERE value = ? AND value_type = 'ref' {retracted_filter}""", "entity_id"),
    )

    while queue:
        eid, d = queue.popleft()
        if d >= depth:
            continue  # at the frontier: keep the node, do not expand it
        for sql, column in edge_queries:
            for r in store._conn.execute(sql, (eid,)).fetchall():
                ref = r[column]
                if ref and ref not in visited:
                    visited[ref] = d + 1
                    queue.append((ref, d + 1))

    return sorted(visited.keys())  # deterministic ordering for stable checksum
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
def serialize_entity(store, entity_id: str, include_retracted: bool,
                     redact_rules: set[str]) -> tuple[dict, list[str], int]:
    """Load every triple of *entity_id* and scrub its string values.

    Only triples whose ``value_type`` is ``"string"`` are passed through
    ``apply_redactions``; other value types are exported untouched. Triples
    are ordered by ``tx_id, attribute, value``.

    Returns:
        ``({"id", "type", "triples"}, applied_rules, redacted_count)`` —
        ``type`` is the part of the id before the first ``:`` (``"unknown"``
        when there is no colon), ``applied_rules`` may contain repeats, and
        ``redacted_count`` counts triples whose value was changed.
    """
    retracted_clause = "AND retracted = 0" if not include_retracted else ""
    cursor = store._conn.execute(
        f"""SELECT attribute, value, value_type, tx_id, created_at, retracted, valid_to
            FROM triples WHERE entity_id = ? {retracted_clause}
            ORDER BY tx_id, attribute, value""",
        (entity_id,),
    )

    serialized: list[dict] = []
    rules_hit: list[str] = []
    changed = 0
    for record in cursor.fetchall():
        val = record["value"]
        scannable = record["value_type"] == "string" and redact_rules and val
        if scannable:
            scrubbed, hits = apply_redactions(val, redact_rules)
            if hits:
                rules_hit += hits
                changed += 1
                val = scrubbed
        serialized.append(dict(
            attribute=record["attribute"],
            value=val,
            value_type=record["value_type"],
            tx_id=record["tx_id"],
            created_at=record["created_at"],
            retracted=int(record["retracted"]),
            valid_to=record["valid_to"],
        ))

    prefix, colon, _rest = entity_id.partition(":")
    kind = prefix if colon else "unknown"
    payload = {"id": entity_id, "type": kind, "triples": serialized}
    return payload, rules_hit, changed
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
def fetch_cached_page(web_db_path: str, root_entity: str,
|
|
167
|
+
redact_rules: set[str]) -> dict | None:
|
|
168
|
+
"""Pull the most recent cached rendered_page for root_entity, if any.
|
|
169
|
+
Apply redactions over the page summary + bullet text too — the LLM may
|
|
170
|
+
have woven sensitive content into its synthesis.
|
|
171
|
+
"""
|
|
172
|
+
import sqlite3
|
|
173
|
+
if not Path(web_db_path).exists():
|
|
174
|
+
return None
|
|
175
|
+
try:
|
|
176
|
+
conn = sqlite3.connect(web_db_path)
|
|
177
|
+
conn.row_factory = sqlite3.Row
|
|
178
|
+
row = conn.execute(
|
|
179
|
+
"""SELECT page_json, generated_at, tokens_in, tokens_out, cost_usd
|
|
180
|
+
FROM page_cache WHERE entity_id = ?
|
|
181
|
+
ORDER BY generated_at DESC LIMIT 1""",
|
|
182
|
+
(root_entity,),
|
|
183
|
+
).fetchone()
|
|
184
|
+
conn.close()
|
|
185
|
+
if not row:
|
|
186
|
+
return None
|
|
187
|
+
page = json.loads(row["page_json"])
|
|
188
|
+
# Redact rendered_page content.
|
|
189
|
+
if redact_rules:
|
|
190
|
+
if page.get("summary"):
|
|
191
|
+
new_summary, _ = apply_redactions(page["summary"], redact_rules)
|
|
192
|
+
page["summary"] = new_summary
|
|
193
|
+
for sec in page.get("sections", []) or []:
|
|
194
|
+
for b in sec.get("bullets", []) or []:
|
|
195
|
+
if b.get("text"):
|
|
196
|
+
new_text, _ = apply_redactions(b["text"], redact_rules)
|
|
197
|
+
b["text"] = new_text
|
|
198
|
+
page["generated_at"] = row["generated_at"]
|
|
199
|
+
page.setdefault("rendered_with", {})
|
|
200
|
+
if row["tokens_in"]: page["rendered_with"]["tokens_in"] = row["tokens_in"]
|
|
201
|
+
if row["tokens_out"]: page["rendered_with"]["tokens_out"] = row["tokens_out"]
|
|
202
|
+
return page
|
|
203
|
+
except Exception as e:
|
|
204
|
+
sys.stderr.write(f"fetch_cached_page failed: {e}\n")
|
|
205
|
+
return None
|
|
206
|
+
|
|
207
|
+
|
|
208
|
+
def export_concept(db_path: str, root_entity: str, depth: int = 1,
                   include_retracted: bool = False,
                   include_page: bool = True,
                   web_db_path: str | None = None,
                   redact_rules: set[str] | None = None) -> dict:
    """Build a ``sinain-concept/v1`` envelope for *root_entity* and its neighborhood.

    Args:
        db_path: Path to the triplestore database.
        root_entity: Entity id the export is centred on.
        depth: Number of ref hops to follow from the root.
        include_retracted: Also follow and export retracted triples.
        include_page: Bundle the cached rendered page; only takes effect when
            ``web_db_path`` is given as well.
        web_db_path: Path to web.db for the page-cache lookup.
        redact_rules: Redaction rule names; ``None`` selects the default set
            below, an empty set disables redaction.

    Returns:
        The envelope dict. Its ``checksum`` is the SHA-256 of the canonical
        JSON of ``{"root_entity", "entities"}`` — the rendered page and the
        other metadata are not covered by it.
    """
    from triplestore import TripleStore

    if redact_rules is None:
        redact_rules = {"private", "creditcard", "apikey", "bearer", "awskey",
                        "password", "secret"}

    entities = []
    all_applied: list[str] = []
    total_redacted = 0
    triple_count = 0
    fact_count = 0

    store = TripleStore(db_path)
    try:
        entity_ids = collect_neighborhood(store, root_entity, depth, include_retracted)
        for eid in entity_ids:
            entity_obj, applied, n_redacted = serialize_entity(
                store, eid, include_retracted, redact_rules,
            )
            entities.append(entity_obj)
            all_applied.extend(applied)
            total_redacted += n_redacted
            triple_count += len(entity_obj["triples"])
            if eid.startswith("fact:"):
                fact_count += 1
    finally:
        # Close the store even if traversal or serialization raises.
        store.close()

    rendered_page = None
    if include_page and web_db_path:
        rendered_page = fetch_cached_page(web_db_path, root_entity, redact_rules)

    envelope = {
        "format": "sinain-concept/v1",
        "exported_at": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
        "exporter": {
            "tool": "sinain-core",
            # NOTE(review): hard-coded; confirm this is meant to track the
            # released package version rather than stay fixed.
            "tool_version": "1.14.0",
            "schema_version": "triplestore/v3",
            "embedding_model": "all-MiniLM-L6-v2",
        },
        "root_entity": root_entity,
        "depth": depth,
        "stats": {
            "entities": len(entity_ids),
            "facts": fact_count,
            "triples": triple_count,
        },
        "entities": entities,
        "rendered_page": rendered_page,
        "redactions": {
            "applied": sorted(set(all_applied)),
            "rules_version": REDACT_RULES_VERSION,
            "redacted_count": total_redacted,
        },
    }

    # Compute checksum over canonical JSON of (root_entity + entities) — this is
    # what the receiver should validate against on import.
    canonical = json.dumps(
        {"root_entity": root_entity, "entities": entities},
        sort_keys=True, ensure_ascii=False, separators=(",", ":"),
    )
    envelope["checksum"] = "sha256:" + hashlib.sha256(canonical.encode("utf-8")).hexdigest()
    return envelope
|
|
278
|
+
|
|
279
|
+
|
|
280
|
+
def main() -> None:
    """CLI entry point: parse arguments, run the export, print the envelope as JSON.

    When the ``--db`` path does not exist, a JSON object with an ``error`` key
    is printed to stdout and the process exits with status 1.
    """
    cli = argparse.ArgumentParser(description="Concept Export")
    cli.add_argument("--db", required=True)
    cli.add_argument("--root", required=True, help="Root entity id (e.g. entity:citibank)")
    cli.add_argument("--depth", type=int, default=1)
    cli.add_argument("--include-retracted", action="store_true")
    cli.add_argument("--include-page", action="store_true",
                     help="Bundle the cached rendered_page if available")
    cli.add_argument("--web-db", default=None,
                     help="Path to web.db for page cache lookup")
    cli.add_argument("--redact", default="private,creditcard,apikey,bearer,awskey,password,secret",
                     help="Comma-separated redaction rule names")
    opts = cli.parse_args()

    if not Path(opts.db).exists():
        print(json.dumps({"error": f"db not found: {opts.db}"}))
        sys.exit(1)

    # Split the comma-separated list, dropping blanks and surrounding spaces.
    rule_names: set[str] = set()
    for piece in opts.redact.split(","):
        trimmed = piece.strip()
        if trimmed:
            rule_names.add(trimmed)

    bundle = export_concept(
        opts.db,
        opts.root,
        depth=opts.depth,
        include_retracted=opts.include_retracted,
        include_page=opts.include_page,
        web_db_path=opts.web_db,
        redact_rules=rule_names,
    )
    print(json.dumps(bundle, ensure_ascii=False))
|
|
307
|
+
|
|
308
|
+
|
|
309
|
+
if __name__ == "__main__":
|
|
310
|
+
main()
|